专栏名称: 编程派
Python程序员都在看的公众号,跟着编程派一起学习Python,看最新国外教程和资源!
目录
相关文章推荐
51好读  ›  专栏  ›  编程派

实战 | Python脚本收集腾讯云CDN日志,并入ELK日志分析

编程派  · 公众号  · Python  · 2017-03-08 11:39

正文

请到「今天看啥」查看全文


原文: http://www.cnblogs.com/xiaoming279/p/6475832.html

作者:xiaoming279

全文约 10689 字,读完可能需要 16 分钟。

(我)负责搭建公司的日志分析系统,一直想把CDN日志也纳入日志分析,前些日子终于达成所愿,现在贴出具体做法:

1、收集日志

腾讯云CDN日志一般一小时刷新一次,也就是说当前只能下载一小时之前的日志数据。但据本人观察,有时前一小时的日志还下载不到,所以为了保险起见,可以下载两小时之前的日志数据。下载时可以先通过腾讯云的API获取日志列表,然后再下载。

腾讯云日志下载 API 链接:

日志采集脚本:get_cdn_log.py

  1. [root@BJVM-2-181 bin]# cat get_cdn_log.py

  2. #!/usr/bin/env python

  3. # coding=utf-8

  4. import hashlib

  5. import requests

  6. import hmac

  7. import random

  8. import time

  9. import base64

  10. import json

  11. import gzip

  12. import os

  13. import sys

  14. from datetime import datetime, timedelta

  15. class Sign(object):

  16.     def __init__(self, secretId, secretKey):

  17.        self.secretId = secretId

  18.        self.secretKey = secretKey

  19.    # 生成签名串

  20.    def make(self, requestHost, requestUri, params, method='GET'):

  21.        srcStr = method.upper() + requestHost + requestUri + '?' + "&".join(k.replace("_",".") + "=" + str(params[k]) for k in sorted(params.keys()))

  22.        hashed = hmac.new(self.secretKey , srcStr, hashlib.sha1)

  23.        return base64.b64encode(hashed.digest())

  24. class CdnHelper(object):

  25.    SecretId='AKIDLsldjflsdjflsdjflsdjfpGSO5XoGiY9'

  26.    SecretKey='SeaHjSDFLJSLDFJQIuFJ7rMiz0lGV'

  27.    requestHost='cdn.api.qcloud.com'

  28.    requestUri='/v2/index.php'

  29.     def __init__(self, host, startDate, endDate):

  30.        self.host = host

  31.        self.startDate = startDate

  32.        self.endDate = endDate

  33.        self.params = {

  34.            'Timestamp': int(time.time()),

  35.            'Action' : 'GetCdnLogList',

  36.            'SecretId': CdnHelper.SecretId,

  37.            'Nonce': random.randint(10000000,99999999),

  38.            'host': self.host,

  39.            'startDate': self.startDate,

  40.            'endDate': self.endDate

  41.        }

  42.        self .params['Signature'] =  Sign(CdnHelper.SecretId, CdnHelper.SecretKey).make(CdnHelper.requestHost, CdnHelper.requestUri, self.params)

  43.        self.url = 'https://%s%s' % (CdnHelper.requestHost, CdnHelper.requestUri)

  44.    def GetCdnLogList(self):

  45.        ret = requests.get(self.url, params=self.params)

  46.        return ret.json()

  47. class GZipTool(object):

  48.    """

  49.    压缩与解压gzip

  50.    """

  51.    def __init__(self, bufSize = 1024*8):

  52.        self.bufSize = bufSize

  53.        self .fin = None

  54.        self.fout = None

  55.    def compress(self, src, dst):

  56.        self.fin = open(src, 'rb')

  57.        self.fout = gzip.open(dst, 'wb')

  58.        self .__in2out()

  59.    def decompress(self, gzFile, dst):

  60.        self.fin = gzip.open(gzFile, 'rb')

  61.        self.fout = open(dst, 'wb')

  62.        self .__in2out()

  63.    def __in2out(self,):

  64.        while True:

  65.            buf = self.fin.read(self.bufSize)

  66.            if len(buf) < 1:

  67.                break

  68.            self.fout.write(buf)

  69.        self.fin.close()

  70.        self.fout.close()

  71. def download(link, name):

  72.    try:

  73.        r = requests.get(link)

  74.        with open(name, 'wb') as f:

  75.            f .write(r.content)

  76.        return True

  77.    except:

  78.        return False

  79. def writelog(src, dst):

  80.    # 保存为以天命名日志

  81.    dst = dst.split('-')[0][:-2] + '-' + dst.split('-')[1]

  82.    with open(src, 'r') as f1:

  83.       with open(dst, 'a+') as f2:

  84.         for line in f1:

  85.            f2.write(line)

  86. if __name__ == '__main__':

  87.     #startDate = "2017-02-23 12:00:00"

  88.    #endDate = "2017-02-23 12:00:00"

  89.    # 前一小时

  90.     # startDate = endDate = time.strftime('%Y-%m-%d ', time.localtime()) + str(time.localtime().tm_hour-1) + ":00:00"

  91.    tm = datetime.now() + timedelta(hours=-2)

  92.    startDate = endDate = tm.strftime("%Y-%m-%d %H:00:00")

  93.    #hosts = ['userface.51img1.com']

  94.    hosts = [

  95.        'pfcdn.xxx.com',

  96.        'pecdn.xxx.com',

  97.        'pdcdn.xxx.com',

  98.         'pccdn.xxx.com',

  99.        'pbcdn.xxx.com',

  100.        'pacdn.xxx.com',

  101.         'p9cdn.xxx.com',

  102.        'p8cdn.xxx.com',

  103.        'p7cdn.xxx.com',

  104.         ]

  105.    for host in hosts:

  106.         try:

  107.            obj = CdnHelper(host, startDate,endDate)

  108.            ret = obj.GetCdnLogList()

  109.            link = ret['data']['list'][0]['link']

  110.            name = ret['data']['list'][0]['name']

  111.            # 下载链接保存的文件名

  112.            gzip_name = '/data/logs/cdn/cdn_log_temp/' + name + '.gz'

  113.            # 解压后的文件名

  114.            local_name = '/data/logs/cdn/cdn_log_temp/' + name + '.log'

  115.             # 追加的文件名

  116.            real_path = '/data/logs/cdn/' + name + '.log'

  117.            print local_name , real_path

  118.            status = download(link, gzip_name)

  119.             if status:

  120.                try:

  121.                    GZipTool().decompress(gzip_name, local_name)

  122.                    writelog (local_name, real_path)

  123.    #                os.remove(gzip_name)

  124.                    os .remove(local_name)

  125.                except:

  126.                    continue

  127.         except Exception ,e:

  128.            print e

  129.             continue

放到定时任务,每小时执行一次。

  1. # cdn日志

  2. 30 */1 * * * /usr/bin/python /root/bin/get_cdn_log.py &> /dev/null

下图是解压后的日志:每个域名保存为一个文件,按天分割。

2、filebeat配置(具体含义查看官方文档)

  1. [root@BJ-2-11 bin]# cat /usr/local/app/filebeat-1.2.3-x86_64/nginx-php.yml

  2. filebeat:

  3.  prospectors:

  4.     -

  5.      paths:

  6.        - /data/logs/cdn/*.log

  7.      document_type: cdn-log

  8.      input_type: log

  9.       #tail_files: true

  10.      multiline:

  11.        negate: true

  12.        match : after

  13. output:

  14.  logstash :

  15.    hosts: ["10.80.2.181:5048", "10.80.2.182:5048"]

  16. shipper:

  17. logging :

  18.  files:

3、logstash配置

日志格式:

  1. 20170227152116 61.135.234.125 cdn.xxx.com /game/2017/201701/20170121/57037f7fc1a0dde9091d4fe6502a6c53.jpg 17769 22 26 200 http://www.xxx.com/ 5 "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; NetworkBench/7.0.0.282-5004888-124025)" "(null)" GET HTTP/1.1 hit

日志内容依次包括:请求时间、访问域名的客户端IP、被访问域名、文件请求路径、本次访问字节数大小、省份、运营商、http返回码、referer信息、request-time(毫秒)、User-Agent、range、HTTP Method、HTTP协议标识、缓存Hit/Miss。

配置文件

  1. # /usr/local/app/logstash-2.3.4/conf.d/logstash.conf

  2. input {

  3.    beats {

  4.        port => 5048

  5.        host => "0.0.0.0"

  6.     }

  7. }

  8. filter {

  9.    .....(省略)

  10.     else if [type] == "cdn-log" {

  11.        grok {

  12.            patterns_dir => ["./patterns"]

  13.            match => { "message" => "%{DATESTAMP_EVENTLOG:timestamp} %{IPORHOST:client_ip} %{IPORHOST:server_name} %{NOTSPACE:request} %{NUMBER:bytes} %{NUMBER:province} %{NUMBER:operator} %{NUMBER:status} (?:%{URI:referrer}|%{WORD:referrer}) %{NUMBER:request_time} %{QS:agent} \"\(%{WORD:range}\)\" %{WORD:method} HTTP/%{NUMBER:protocol} %{WORD:cache}" }

  14.         }

  15.        date {

  16.            match => [ "timestamp", "yyyyMMddHHmmss"]

  17.            target => "@timestamp"

  18.         }

  19.        alter {

  20.            condrewrite => [

  21.                "province", "22", "北京",

  22.                 "province", "86", "内蒙古",

  23.                "province", "146", "山西",

  24.                "province", "1069", "河北",

  25.                "province", "1077", "天津",

  26.                "province", "119", "宁夏",

  27.                 "province", "152", "陕西",

  28.                "province", "1208", "甘肃",

  29.                 "province", "1467", "青海",

  30.                "province", "1468", "新疆",

  31.                 "province", "145", "黑龙江",

  32.                "province", "1445", "吉林",

  33.                 "province", "1464", "辽宁",

  34.                "province", "2", "福建",

  35.                 "province", "120", "江苏",

  36.                "province", "121", "安徽",

  37.                 "province", "122", "山东",

  38.                "province", "1050", "上海",

  39.                 "province", "1442", "浙江",

  40.                "province", "182", "河南",

  41.                 "province", "1135", "湖北",

  42.                "province", "1465", "江西",

  43.                 "province", "1466", "湖南",

  44.                "province", "118", "贵州",

  45.                 "province", "153", "云南",

  46.                "province", "1051", "重庆",

  47.                 "province", "1068", "四川",

  48.                "province", "1155", "西藏",

  49.                "province", "4" , "广东",

  50.                "province", "173", "广西",

  51.                "province", "1441", "海南",

  52.                 "province", "0", "其他",

  53.                "province", "1", "港澳台",

  54.                 "province", "1", "海外",

  55.                "operator", "2", "中国电信",

  56.                 "operator", "26", "中国联通",

  57.                "operator", "38", "教育网",

  58.                 "operator", "43", "长城宽带",

  59.                "operator", "1046", "中国移动",

  60.                 "operator", "3947", "中国铁通",

  61.                "operator", "-1", "海外运营商",

  62.                 "operator", "0", "其他运营商"

  63.            ]

  64.         }

  65.    }

  66. } # filter

  67. output {

  68.    if "_grokparsefailure" in [tags] {

  69.        file { path => "/var/log/logstash/grokparsefailure-%{[type]}-%{+YYYY.MM.dd}.log" }

  70.     }

  71.    ......(省略)

  72.    else if [type] == "cdn-log"{

  73.        elasticsearch {

  74.            hosts => ["10.80.2.13:9200","10.80.2.14:9200","10.80.2.15:9200","10.80.2.16:9200"]

  75.            sniffing => true

  76.            manage_template => true

  77.            template_overwrite => true

  78.            template_name => "cdn"

  79.            template => "/usr/local/app/logstash-2.3.4/templates/cdn.json"

  80.            index => "%{[type]}-%{+YYYY.MM.dd}"

  81.            document_type => "%{[type]}"

  82.         }

  83.    }

  84.     ......(省略)

  85. } # output

4、效果图(一小时数据)

cdn使用量效果图

cdn访问情况统计

状态码统计


题图:pexels,CC0 授权。

点击 阅读原文 ,查看更多 Python 教程和资源。







请到「今天看啥」查看全文