Python分析网站日志数据教程

人生苦短,我用Python,从入门到放弃,再到学会装X,这里是一份装X指南,教你如何用Python分析网站日志数据。

 

在使用Python分析网站日志之前,请安装openpyxl模块

 

下面贴出分析网站日志的轮子:

# -*- coding: utf-8 -*-

from openpyxl import Workbook
import json
import re
import time

# Log line format (NGINX). Source: www.daxianseo.cn
# NOTE: raw strings are required here — escapes such as \[ and \d are not
# valid Python string escapes and trigger SyntaxWarning on Python 3.12+.
log_format = r'^\[(?P<time_local>.*?)\+[:\d]+\]\[(?P<status>\d+)\]\[(?P<remote_addr>.*?)\] '\
             r'"(?P<http_method>.*?) (?P<request_uri>.*?) (?:.*?)" ' \
             r'"(?P<http_referer>.*?)" "(?P<http_user_agent>.*?)"'

# Known search-engine spiders, keyed by engine display name.
# Group 1 of each pattern captures the spider's name/version token.
spider_user_agent = {
    '百度': re.compile(r'(Baiduspider(?:[-\w]+)?/[\d\.]+)', re.I),
    '搜狗': re.compile(r'(Sogou \w+ spider/[\d\.]+)', re.I),
    '神马': re.compile(r'(YisouSpider)', re.I),
    '360': re.compile(r'(360Spider)', re.I),
    '谷歌': re.compile(r'(Googlebot/[\d\.]+)', re.I),
    '头条': re.compile(r'(Bytespider)', re.I),
    '必应': re.compile(r'(bingbot/[\d\.]+)', re.I)
}

# Compiled log-line regex (case-insensitive), hoisted to module level so it
# is compiled once rather than per line.
log_re = re.compile(log_format, re.I)

def process_log_line(result_dict, log_line):
    """Parse one raw access-log line and accumulate it into ``result_dict``.

    Lines that do not match ``log_re``, or whose user agent matches none of
    the patterns in ``spider_user_agent``, are silently ignored.

    Args:
        result_dict: mutable dict of per-engine statistics, updated in place
            by ``process_spider_log``.
        log_line: a single raw log line (str).
    """
    match = log_re.search(log_line)
    if match is None:
        return
    # Hoist the invariant group lookup out of the loop: the user agent is
    # the same for every engine tested.
    user_agent = match.group('http_user_agent')
    for engine_name, spider_reg in spider_user_agent.items():
        has_spider = spider_reg.search(user_agent)
        if has_spider is None:
            continue
        # Normalize the access time to hour granularity (YYYY-MM-DD:HH);
        # the timezone offset was already stripped by the log regex.
        time_struct = time.strptime(match.group('time_local'),
                                    '%Y-%m-%dT%H:%M:%S')
        access_time = time.strftime('%Y-%m-%d:%H', time_struct)
        process_spider_log(result_dict,
                           engine_name,
                           has_spider.group(1),           # spider name token
                           match.group('remote_addr'),    # visitor IP
                           access_time,
                           match.group('request_uri'),
                           int(match.group('status')))
        # A line is attributed to at most one engine.
        break


def process_spider_log(result_dict, engine_name, spider_name, remote_addr,
                       access_time, request_uri, status):
    """Accumulate one spider hit into the nested statistics structure.

    ``result_dict`` maps engine name -> {
        'count':        total hits for the engine,
        'spider_names': {spider name -> hits},
        'spider_ips':   {remote IP -> hits},
        'access_times': {'YYYY-MM-DD:HH' -> hits},
        'status_codes': {int status -> hits},
        'request_uris': {uri -> {'count': hits,
                                 'spider_info': {spider name -> {ip -> hits}}}},
    }

    All sub-dicts are created lazily via ``setdefault``; the original
    get-then-branch blocks were duplicated seven times and relied on
    truthiness of the stored count, which this version avoids.

    Args:
        result_dict: mutable accumulator, updated in place.
        engine_name: engine display name (key into result_dict).
        spider_name: spider name/version token from the user agent.
        remote_addr: visitor IP string.
        access_time: hour-granular timestamp string.
        request_uri: requested URI string.
        status: HTTP status code (int).
    """
    engine = result_dict.setdefault(engine_name, {
        'count': 0,
        'spider_names': {},
        'spider_ips': {},
        'access_times': {},
        'status_codes': {},
        'request_uris': {},
    })
    engine['count'] += 1

    # One pass over the four flat per-engine counters.
    for counter_key, value in (('spider_names', spider_name),
                               ('spider_ips', remote_addr),
                               ('access_times', access_time),
                               ('status_codes', status)):
        counter = engine[counter_key]
        counter[value] = counter.get(value, 0) + 1

    # Per-URI statistics: total count plus per-spider, per-IP breakdown.
    uri_entry = engine['request_uris'].setdefault(
        request_uri, {'count': 0, 'spider_info': {}})
    uri_entry['count'] += 1
    ip_counts = uri_entry['spider_info'].setdefault(spider_name, {})
    ip_counts[remote_addr] = ip_counts.get(remote_addr, 0) + 1

 

分析后的效果:

Python分析网站日志数据

 

需要注意的是,不是每个NGINX日志格式都一样,可能你的日志和轮子的顺序有改变,那样跑出来的结果就会出错,需要自行调整顺序。

网站日志数据教程

 

关于网站日志数据分析教程,可以看这里:《网站日志数据分析教程》。

 

注:想要获取完整的Python分析网站日志代码,请关注公众号:大仙的小黑屋。

赞赏

微信赞赏支付宝赞赏

相关文章