- 追加された行はこの色です。
- 削除された行はこの色です。
*Python:apacheアクセスログをsqlite3にインポートしてみた [#f26d2fbb]
Pythonのcsvモジュールを使用すれば、apacheのアクセスログは簡単にsqlite3に入ります。
調べて作ったスクリプトをベタ貼り。
ちなみに、元情報はここです。「lolloo-htn」さん、すてきな記事をありがとうです!
-http://d.hatena.ne.jp/lolloo-htn/20090130/1233327457
して、そのままではなくて、ところどころ独自処理を入れてます。
-ログのフォーマットがデフォルトではなかったり。
-sqlite3ではlike分で正規表現使えないため、事前に加工していたり。
import csv, sqlite3, datetime, sys, re
class ApacheDialect(csv.Dialect):
delimiter = " "
doublequote = True
lineterminator="\r\n"
quotechar='"'
quoting=0
skipinitialspace=False
class ApacheLog:
_DETAILPATTERN_ = """^/[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}"""
_DETAILFMT_ = re.compile(_DETAILPATTERN_)
def __init__(self,line):
self.log={}
self.log['ip'] = line[0]
self.log['remotelog'] = line[1]
self.log['remoteuser'] = line[2]
self.log['time_hour'] = self._timeparser(line[3],1)
self.log['time_minute'] = self._timeparser(line[3],2)
self.log['time_second'] = self._timeparser(line[3],3)
self.log['method'] = line[5].split()[0]
self.log['url'] = line[5].split()[1]
self.log['location'] = line[5].split()[1].split('?')[0]
self.log['status'] = line[6]
self.log['bytes'] = line[7]
self.log['referer'] = line[8]
self.log['ua'] = line[9]
self.log['xforwardedproto'] = line[10]
self.log['remotehost'] = line[11]
self.log['duration'] = line[12]
self.log['isdetailpage'] = self._is_detail_page(self.log['url'])
def __getattr__(self, attr):
return self.log[attr]
def _timeparser(self,str,flg=3):
t = datetime.datetime.strptime(str ,"[%d/%b/%Y:%H:%M:%S")
if flg == 1:
return t.strftime("%Y%m%d%H")
elif flg == 2:
return t.strftime("%Y%m%d%H%M")
else:
return t.strftime("%Y%m%d%H%M%S")
def _is_detail_page(self, str):
if self._DETAILFMT_.search(str):
return True
return False
if __name__ == '__main__':
argvs = sys.argv
if len(argvs) != 3:
print "usage : python accesslog2sqlite.py [access_log] [sqlite_db_file]"
print "example : python accesslog2sqlite.py access_log db.sqlite3"
exit(255)
logfile = argvs[1]
dbfile = argvs[2]
ddl = ''' create table if not exists access_log (
ip text,
remotelog text,
remoteuser text,
time_hour integer,
time_minute integer,
time_second integer,
method text,
url text,
location text,
status text,
bytes integer,
referer text,
ua text,
xforwardedproto text,
remotehost teger,
duration integer,
isdetailpage text
)
'''
csv.register_dialect("apache",ApacheDialect)
reader = csv.reader(open(logfile),dialect="apache")
c = sqlite3.connect(dbfile)
c.execute(ddl)
for aline in reader:
if len(aline) != 13:
continue
l = ApacheLog(aline)
# print l.log
t = (l.ip, l.remotelog, l.remoteuser, l.time_hour,l.time_minute, l.time_second, l.method, l.url, l.location,l.status, l.bytes, l.referer, l.ua, l.xforwardedproto, l.remotehost, l.duration, l.isdetailpage)
c.execute("insert into access_log values (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)", t)
c.commit()
c.close()
***突っ込んだあとの処理 [#a444556d]
こんなSQLを作ってみました。
これでババっと集計できますw
■shuukei.sql
.separator \t
.output ./01_by_hour.txt
select time_hour, count(time_hour) from access_log group by time_hour;
.output ./02_by_minute.txt
select time_minute , count(time_minute) from access_log group by time_minute order by 2 desc limit 100;
.output ./03_by_source_ip.txt
select ip, count(ip) from access_log group by ip order by 2 desc limit 100;
.output ./04_by_useragent.txt
select "IE*" ,count(ip) from access_log where ua like '%IE%';
select "IE7" ,count(ip) from access_log where ua like '%IE%7%';
select "IE8" ,count(ip) from access_log where ua like '%IE%8%';
select "IE9" ,count(ip) from access_log where ua like '%IE%9%';
select "FireFox" ,count(ip) from access_log where ua like '%FireFox%';
select "Chrome" ,count(ip) from access_log where ua like '%Chrome%';
select "iPad" ,count(ip) from access_log where ua like '%iPad%';
select "iPhone" ,count(ip) from access_log where ua like '%iPhone%';
select "Android*" ,count(ip) from access_log where ua like '%android%';
select "Android2*" ,count(ip) from access_log where ua like '%android 2%';
select "Android3*" ,count(ip) from access_log where ua like '%android 3%';
select "Android4*" ,count(ip) from access_log where ua like '%android 4%';
.output ./05_by_external_referer.txt
select referer, count(referer) from access_log where referer not like 'http://kn.ndl.go.jp%' group by referer order by 2 desc limit 100;
***参考サイト [#n5a919ea]
apacheのアクセスログの公式サイト
-http://httpd.apache.org/docs/2.2/ja/mod/mod_log_config.html
python2.6のcsvモジュール
-http://docs.python.jp/2.6/library/csv.html