トップ   編集 凍結 差分 バックアップ 添付 複製 名前変更 リロード   新規 一覧 単語検索 最終更新   ヘルプ   最終更新のRSS

Python:apacheアクセスログをsqlite3にインポートしてみた

Last-modified: 2013-08-24 (土) 02:14:21 (2982d)
Top / Python:apacheアクセスログをsqlite3にインポートしてみた

Python:apacheアクセスログをsqlite3にインポートしてみた

Pythonのcsvモジュールを使用すれば、apacheのアクセスログは簡単にsqlite3に入ります。

調べて作ったスクリプトをベタ貼り。

ちなみに、元情報はここです。「lolloo-htn」さん、すてきな記事をありがとうです!

して、そのままではなくて、ところどころ独自処理を入れてます。

  • ログのフォーマットがデフォルトではなかったり。
  • sqlite3ではlike分で正規表現使えないため、事前に加工していたり。
import csv, sqlite3, datetime, sys, re

class ApacheDialect(csv.Dialect):
    delimiter = " "
    doublequote = True
    lineterminator="\r\n"
    quotechar='"'
    quoting=0 
    skipinitialspace=False

class ApacheLog:
    _DETAILPATTERN_ = """^/[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}"""
    _DETAILFMT_ = re.compile(_DETAILPATTERN_)

    def __init__(self,line):
        self.log={}
        self.log['ip'] = line[0]
        self.log['remotelog'] = line[1]
        self.log['remoteuser'] = line[2]
        self.log['time_hour'] = self._timeparser(line[3],1)
        self.log['time_minute'] = self._timeparser(line[3],2)
        self.log['time_second'] = self._timeparser(line[3],3)
        self.log['method'] = line[5].split()[0]
        self.log['url'] = line[5].split()[1]
        self.log['location'] = line[5].split()[1].split('?')[0]
        self.log['status'] = line[6]
        self.log['bytes'] = line[7]
        self.log['referer'] = line[8]
        self.log['ua'] = line[9]
        self.log['xforwardedproto'] = line[10]
        self.log['remotehost'] = line[11]
        self.log['duration'] = line[12]
        self.log['isdetailpage'] = self._is_detail_page(self.log['url'])

    def __getattr__(self, attr):
        return self.log[attr]

    def _timeparser(self,str,flg=3):
        t = datetime.datetime.strptime(str ,"[%d/%b/%Y:%H:%M:%S")
        if flg == 1:
            return t.strftime("%Y%m%d%H")
        elif flg == 2:
            return t.strftime("%Y%m%d%H%M")
        else:
            return t.strftime("%Y%m%d%H%M%S")

    def _is_detail_page(self, str):
        if self._DETAILFMT_.search(str):
            return True
        return False

if __name__ == '__main__':
    argvs = sys.argv
    if len(argvs) != 3:
        print "usage : python accesslog2sqlite.py [access_log] [sqlite_db_file]"
        print "example : python accesslog2sqlite.py access_log db.sqlite3"
        exit(255)
    
    logfile = argvs[1]
    dbfile = argvs[2]
    
    ddl = ''' create table if not exists access_log (
    ip text,  
    remotelog text,
    remoteuser text,
    time_hour integer,
    time_minute integer,
    time_second integer,
    method text,
    url text,
    location text,
    status text,
    bytes integer,
    referer text,
    ua text,  
    xforwardedproto text,
    remotehost teger,
    duration integer,
    isdetailpage text
    )
    '''

    csv.register_dialect("apache",ApacheDialect)
    reader = csv.reader(open(logfile),dialect="apache")
    c = sqlite3.connect(dbfile)
    c.execute(ddl)

    for aline in reader:
        if len(aline) != 13:
            continue
        l = ApacheLog(aline)
#        print l.log
        t = (l.ip, l.remotelog, l.remoteuser, l.time_hour,l.time_minute, l.time_second, l.method, l.url, l.location,l.status, l.bytes, l.referer, l.ua, l.xforwardedproto, l.remotehost, l.duration, l.isdetailpage)
        c.execute("insert into access_log values (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)", t)
    c.commit()
    c.close() 

突っ込んだあとの処理

こんなSQLを作ってみました。

これでババっと集計できますw

■shuukei.sql

.separator \t

.output ./01_by_hour.txt
select time_hour, count(time_hour) from access_log group by time_hour; 

.output ./02_by_minute.txt
select time_minute , count(time_minute) from access_log group by time_minute order by 2 desc limit 100;

.output ./03_by_source_ip.txt
select ip, count(ip) from access_log group by ip order by 2 desc limit 100;

.output ./04_by_useragent.txt
select "IE*" ,count(ip) from access_log where ua like '%IE%';
select "IE7" ,count(ip) from access_log where ua like '%IE%7%';
select "IE8" ,count(ip) from access_log where ua like '%IE%8%';
select "IE9" ,count(ip) from access_log where ua like '%IE%9%';
select "FireFox" ,count(ip) from access_log where ua like '%FireFox%';
select "Chrome" ,count(ip) from access_log where ua like '%Chrome%';
select "iPad" ,count(ip) from access_log where ua like '%iPad%';
select "iPhone" ,count(ip) from access_log where ua like '%iPhone%';
select "Android*" ,count(ip) from access_log where ua like '%android%';
select "Android2*" ,count(ip) from access_log where ua like '%android 2%';
select "Android3*" ,count(ip) from access_log where ua like '%android 3%';
select "Android4*" ,count(ip) from access_log where ua like '%android 4%';

.output ./05_by_external_referer.txt
select referer, count(referer) from access_log where referer not like 'http://kn.ndl.go.jp%' group by referer order by 2 desc limit 100;

参考サイト

apacheのアクセスログの公式サイト

python2.6のcsvモジュール