Python:apacheアクセスログをsqlite3にインポートしてみた
Last-modified: 2013-08-24 (土) 02:14:21 (3898d)
Top / Python:apacheアクセスログをsqlite3にインポートしてみた
Python:apacheアクセスログをsqlite3にインポートしてみた †
Pythonのcsvモジュールを使用すれば、apacheのアクセスログは簡単にsqlite3に入ります。
調べて作ったスクリプトをベタ貼り。
ちなみに、元情報はここです。「lolloo-htn」さん、すてきな記事をありがとうです!
して、そのままではなくて、ところどころ独自処理を入れてます。
- ログのフォーマットがデフォルトではなかったり。
- sqlite3ではlike分で正規表現使えないため、事前に加工していたり。
import csv, sqlite3, datetime, sys, re class ApacheDialect(csv.Dialect): delimiter = " " doublequote = True lineterminator="\r\n" quotechar='"' quoting=0 skipinitialspace=False class ApacheLog: _DETAILPATTERN_ = """^/[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}""" _DETAILFMT_ = re.compile(_DETAILPATTERN_) def __init__(self,line): self.log={} self.log['ip'] = line[0] self.log['remotelog'] = line[1] self.log['remoteuser'] = line[2] self.log['time_hour'] = self._timeparser(line[3],1) self.log['time_minute'] = self._timeparser(line[3],2) self.log['time_second'] = self._timeparser(line[3],3) self.log['method'] = line[5].split()[0] self.log['url'] = line[5].split()[1] self.log['location'] = line[5].split()[1].split('?')[0] self.log['status'] = line[6] self.log['bytes'] = line[7] self.log['referer'] = line[8] self.log['ua'] = line[9] self.log['xforwardedproto'] = line[10] self.log['remotehost'] = line[11] self.log['duration'] = line[12] self.log['isdetailpage'] = self._is_detail_page(self.log['url']) def __getattr__(self, attr): return self.log[attr] def _timeparser(self,str,flg=3): t = datetime.datetime.strptime(str ,"[%d/%b/%Y:%H:%M:%S") if flg == 1: return t.strftime("%Y%m%d%H") elif flg == 2: return t.strftime("%Y%m%d%H%M") else: return t.strftime("%Y%m%d%H%M%S") def _is_detail_page(self, str): if self._DETAILFMT_.search(str): return True return False if __name__ == '__main__': argvs = sys.argv if len(argvs) != 3: print "usage : python accesslog2sqlite.py [access_log] [sqlite_db_file]" print "example : python accesslog2sqlite.py access_log db.sqlite3" exit(255) logfile = argvs[1] dbfile = argvs[2] ddl = ''' create table if not exists access_log ( ip text, remotelog text, remoteuser text, time_hour integer, time_minute integer, time_second integer, method text, url text, location text, status text, bytes integer, referer text, ua text, xforwardedproto text, remotehost teger, duration integer, isdetailpage text ) ''' csv.register_dialect("apache",ApacheDialect) reader = csv.reader(open(logfile),dialect="apache") c = sqlite3.connect(dbfile) c.execute(ddl) for aline in reader: if len(aline) != 13: continue l = ApacheLog(aline) # print l.log t = (l.ip, l.remotelog, l.remoteuser, l.time_hour,l.time_minute, l.time_second, l.method, l.url, l.location,l.status, l.bytes, l.referer, l.ua, l.xforwardedproto, l.remotehost, l.duration, l.isdetailpage) c.execute("insert into access_log values (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)", t) c.commit() c.close()
突っ込んだあとの処理 †
こんなSQLを作ってみました。
これでババっと集計できますw
■shuukei.sql
.separator \t .output ./01_by_hour.txt select time_hour, count(time_hour) from access_log group by time_hour; .output ./02_by_minute.txt select time_minute , count(time_minute) from access_log group by time_minute order by 2 desc limit 100; .output ./03_by_source_ip.txt select ip, count(ip) from access_log group by ip order by 2 desc limit 100; .output ./04_by_useragent.txt select "IE*" ,count(ip) from access_log where ua like '%IE%'; select "IE7" ,count(ip) from access_log where ua like '%IE%7%'; select "IE8" ,count(ip) from access_log where ua like '%IE%8%'; select "IE9" ,count(ip) from access_log where ua like '%IE%9%'; select "FireFox" ,count(ip) from access_log where ua like '%FireFox%'; select "Chrome" ,count(ip) from access_log where ua like '%Chrome%'; select "iPad" ,count(ip) from access_log where ua like '%iPad%'; select "iPhone" ,count(ip) from access_log where ua like '%iPhone%'; select "Android*" ,count(ip) from access_log where ua like '%android%'; select "Android2*" ,count(ip) from access_log where ua like '%android 2%'; select "Android3*" ,count(ip) from access_log where ua like '%android 3%'; select "Android4*" ,count(ip) from access_log where ua like '%android 4%'; .output ./05_by_external_referer.txt select referer, count(referer) from access_log where referer not like 'http://kn.ndl.go.jp%' group by referer order by 2 desc limit 100;
参考サイト †
apacheのアクセスログの公式サイト
python2.6のcsvモジュール