之前在做CDN运维的时候,因为业务的特殊性(跨机房,跨ISP,跨区域),把日志集中传输到一个中心来做qos的分析不太现实,因此采用的方法是每5分钟对Nginx日志进行切割,然后通过Python程序计算http code的分布,并通过Zabbix来实现单台机器Nginx qos的监控,配合对Zabbix数据库的Lastvalue进行聚合,则可以监控整个CDN的流量,qos数据等,这样一般发现问题的延迟就在5分钟左右(cdn的qos敏感性不是很强),配合rsync+hadoop+hive来计算nginx的日志,也可以得到更加详细的各个维度的分析(离线数据分析),下面贴下Nginx日志使用的分析脚本:
先贴下zabbix聚合脚本:
#!/usr/bin/python
# -*- coding: utf8 -*-
# Aggregate web-CDN totals out of the Zabbix DB: sum the lastvalue of the
# per-host items matching the metric named by argv[1] and print the result.
import MySQLdb
import sys
import os


def get_total_value(sql):
    """Run an aggregate query against the Zabbix DB and return its scalar.

    Returns 0 when the query yields no usable row.  The connection and
    cursor are always closed, even if execute() raises (the original
    leaked them on query errors).
    """
    db = MySQLdb.connect(host='xxxx', user='xxxx', passwd='xxxx', db='xxxx')
    cursor = db.cursor()
    try:
        cursor.execute(sql)
        try:
            result = cursor.fetchone()[0]
        except (TypeError, IndexError):
            # fetchone() returned None (no rows) or an empty row -> report 0
            result = 0
    finally:
        cursor.close()
        db.close()
    return result


if __name__ == '__main__':
    sql = ''
    # NOTE(review): the metric keys below ("2xxand3xx", "4xxand5xx") are kept
    # exactly as the deployed Zabbix items pass them -- confirm before renaming.
    if sys.argv[1] == "network_traffic":
        sql = ("select round(sum(lastvalue)/(1024*1024),4) from hosts a, items b "
               "where key_ in ('net.if.out[eth1,bytes]','net.if.out[eth0,bytes]') "
               "and lower(host) like '%-cdn-cache%' and a.hostid=b.hostid")
    elif sys.argv[1] == "nginx_traffic":
        # NOTE(review): this pattern lacks the leading dash ('%cdn-cache%')
        # that every other branch uses -- verify whether that is intentional.
        sql = ("select sum(lastvalue) from hosts a, items b "
               "where key_='log_webcdn_getvalue[traffic]' "
               "and lower(host) like '%cdn-cache%' and a.hostid=b.hostid")
    elif sys.argv[1] == "2xxand3xx":
        sql = ("select sum(lastvalue) from hosts a, items b "
               "where key_ in ('log_webcdn_getvalue[200]','log_webcdn_getvalue[300]') "
               "and lower(host) like '%-cdn-cache%' and a.hostid=b.hostid")
    elif sys.argv[1] == "4xxand5xx":
        sql = ("select sum(lastvalue) from hosts a, items b "
               "where key_ in ('log_webcdn_getvalue[four]','log_webcdn_getvalue[five]') "
               "and lower(host) like '%-cdn-cache%' and a.hostid=b.hostid")
    elif sys.argv[1] == "network_ss":
        sql = ("select sum(lastvalue) from hosts a, items b "
               "where key_='network_conn' "
               "and lower(host) like '%-cdn-cache%' and a.hostid=b.hostid")
    else:
        sys.exit(0)
    value = get_total_value(sql)
    print(value)
然后是单台的分析脚本:
#!/usr/bin/python #coding=utf-8 from__future__importdivision importsubprocess,signal,string importcodecs importre importos importtime,datetime importsys defshow_usage(): print""" pythonnginx_log_wedcdn.pyresult_key result_keycouldbe: average_bodysize,response_time,sum_count,count_success,four,403,404,499,five,500,502,503,200,300,requests_second response_time_source,percentage_time_1,percentage_time_3,all """ defrunCmd(command,timeout=10): start=datetime.datetime.now() process=subprocess.Popen(command,stdout=subprocess.PIPE,stderr=subprocess.PIPE,shell=True) whileprocess.poll()isNone: time.sleep(0.2) now=datetime.datetime.now() if(now-start).seconds>timeout: os.kill(process.pid,signal.SIGKILL) os.waitpid(-1,os.WNOHANG) returnNone returnprocess.stdout.readlines() defget_old_filename(): t=datetime.datetime.now()+datetime.timedelta(minutes=-5) a=t.strftime('%Y-%m-%d-%H') b=t.strftime('%M') b=int(b)//5*5 ifb<10: c="0"+str(b) else: c=str(b) d="/log/nginx/old/"+a+"-%s.log.gz"%c #printd returnd defget_new_filename(): t=datetime.datetime.now()+datetime.timedelta(minutes=-5) a=t.strftime('%Y-%m-%d-%H') b=t.strftime('%M') b=int(b)//5*5 ifb<10: c="0"+str(b) else: c=str(b) d="/log/nginx/old/"+a+"-%s.log"%c #printd returnd defget_new2_filename(): t=datetime.datetime.now()+datetime.timedelta(minutes=-5) a=t.strftime('%Y-%m-%d-%H') b=t.strftime('%M') b=int(b)//5*5 ifb<10: c="0"+str(b) else: c=str(b) d="/log/nginx/new/"+a+"-%s.log"%c #printd returnd defaverage_flow(): flow=0 flow1=0 flow_ppsucai=0 flow_asimgs=0 flow_static9=0 traffic=0.0 traffic1=0.0 count=0 count_sucai=0 count_sucai_100=0 count_sucai_30_100=0 count_sucai_30=0 count_asimgs=0 count_asimgs_100=0 count_asimgs_30_100=0 count_asimgs_30=0 count_static9=0 count_static9_100=0 count_static9_30_100=0 count_static9_30=0 sum_time=0.0 sum_ppsucai_time=0.0 sum_asimgs_time=0.0 sum_static9_time=0.0 sum_time_source=0.0 count_200=0 count_300=0 count_success=0 count_200_backup=0 count_not_200_backup=0 id_list_200=[200,206] 
id_list_300=[300,301,302,303,304,305,306,307] id_list_success=[200,206,300,301,302,303,304,305,306,307] data_byte=0 elapsed=0.0 response_time=0.0 response_time_source=0.0 requests_second=0.0 requests_second_sucai=0.0 requests_second_asimgs=0.0 list_time_1=[] list_time_3=[] list_ip_403=[] list_ip_404=[] list_ip_415=[] list_ip_499=[] list_ip_500=[] list_ip_502=[] list_ip_503=[] server_list=['"127.0.0.1:8080"','"127.0.0.1:8081"','"-"'] file_name=get_old_filename() ifos.path.isfile("%s"%file_name): Writelog(file_name) i=os.popen("/bin/zcat%s"%file_name).readlines() #i=gzip.GzipFile("%s"%file_name).readlines() else: file_name=get_new_filename() ifos.path.isfile("%s"%file_name): Writelog(file_name) i=os.popen("/bin/cat%s"%file_name).readlines() else: #time.sleep(15) file_name=get_new2_filename() ifos.path.isfile("%s"%file_name): Writelog(file_name) i=os.popen("/bin/cat%s"%file_name).readlines() else: os.popen("rm-f/tmp/exist.txt") sys.exit(1) forlineini: count+=1 try: domain_name=line.split()[1] except: pass try: web_code=int(line.split()[8]) except: web_code=888 try: IP=str(line.split()[0]) except: pass try: data_byte=int(line.split()[9]) #print"data",data_byte except: data_byte=0.0001 try: elapsed=float(line.split()[-1].strip('"')) ifelapsed==0.000: elapsed=0.0001 except: elapsed=0.0001 try: time_source=float(line.split()[-4].strip('"')) except: time_source=0.0 try: backup_server=str(line.split()[-3]) except: pass flow1+=data_byte ifweb_codeinid_list_success: flow+=data_byte sum_time_source+=time_source ifdomain_name!="ppsucai.pptv.com": sum_time+=elapsed else: #printdomain_name sum_time+=0.000 ifweb_codeinid_list_200: #printweb_code count_200+=1 ifbackup_servernotinserver_list: #printweb_code,backup_server count_200_backup+=1 elifweb_code==200anddate_byte==0: #printline.split()[3].lstrip("[") WriteURLInfo(line.split()[3].lstrip("[")) WriteURLInfo("\t") WriteURLInfo(line.split()[10]) WriteURLInfo("\n") elifweb_codeinid_list_300: count_300+=1 
elifweb_code==403andIPnotinlist_ip_403: list_ip_403.append(IP) #print"thisisthesum403count:",IP,len(list_ip_403) elifweb_code==404andIPnotinlist_ip_404: list_ip_404.append(IP) #print"thisisthesum404count:",IP,len(list_ip_404) elifweb_code==415andIPnotinlist_ip_415: list_ip_415.append(IP) #print"thisisthesum415count:",IP,len(list_ip_415) elifweb_code==499andIPnotinlist_ip_499: list_ip_499.append(IP) #print"thisisthesum499count:",IP,len(list_ip_499) elifweb_code==500andIPnotinlist_ip_500: list_ip_500.append(IP) #print"thisisthesum500count:",IP,len(list_ip_500) elifweb_code==502andIPnotinlist_ip_502: list_ip_502.append(IP) #print"thisisthesum502count:",IP,len(list_ip_502) elifweb_code==503andIPnotinlist_ip_503: list_ip_503.append(IP) #print"thisisthesum503count:",IP,len(list_ip_503) ifweb_codenotinid_list_200andbackup_servernotinserver_list: #printweb_code,backup_server count_not_200_backup+=1 ifelapsed>1.0andweb_codeinid_list_successandIPnotinlist_time_1: list_time_1.append(IP) elifelapsed>3.0andweb_codeinid_list_successandIPnotinlist_time_3: list_time_3.append(IP) ifdomain_name=="ppsucai.pptv.com"andweb_codeinid_list_success: download_speed_sucai=round(data_byte/elapsed/1024,2) flow_ppsucai+=data_byte sum_ppsucai_time+=elapsed count_sucai+=1 ifdownload_speed_sucai>=100: count_sucai_100+=1 elifdownload_speed_sucai<100anddownload_speed_sucai>=30: count_sucai_30_100+=1 else: count_sucai_30+=1 elifdomain_name=="asimgs.pplive.cn"andweb_codeinid_list_success: download_speed_asimgs=round(data_byte/elapsed/1024,2) flow_asimgs+=data_byte sum_asimgs_time+=elapsed count_asimgs+=1 ifdownload_speed_asimgs>=100: count_asimgs_100+=1 elifdownload_speed_asimgs<100anddownload_speed_asimgs>=30: count_asimgs_30_100+=1 else: count_asimgs_30+=1 elifdomain_name=="static9.pplive.cn"andweb_codeinid_list_success: download_speed_static9=round(data_byte/elapsed/1024,2) flow_static9+=data_byte sum_static9_time+=elapsed count_static9+=1 ifdownload_speed_static9>=100: count_static9_100+=1 
elifdownload_speed_static9<100anddownload_speed_static9>=30: count_static9_30_100+=1 else: count_static9_30+=1 #else: #break try: traffic=round((flow*1.07*8)/300/1024/1024,2) #traffic1=round((flow1*1.07)/300/1024/1024,2) #printtraffic,traffic1 #traffic1=round(flow/sum_time/1024/1024,2) count_success=count_200+count_300 response_time=round(sum_time/count_success,2) response_time_source=round(sum_time_source/count_success,2) requests_second=round(count_success/300,2) ifsum_ppsucai_time==0.0: sum_ppsucai_time=0.0001 ifsum_asimgs_time==0.0: sum_asimgs_time=0.0001 #printsum_static9_time ifsum_static9_time==0.0: sum_static9_time=0.0001 traffic_ppsucai=round(flow_ppsucai/sum_ppsucai_time/1024,2) traffic_asimgs=round(flow_asimgs/sum_asimgs_time/1024,2) traffic_static9=round(flow_static9/sum_static9_time/1024,2) #print"flow_static:",flow_static9,"traffic_static9",traffic_static9 average_bodysize=round((flow/count_success)/1024,2) percentage_time_1=round(len(list_time_1)/count_success*100,2) percentage_time_3=round(len(list_time_3)/count_success*100,2) ifcount_sucai==0: count_sucai=0.0001 percentage_sucai_100=round(count_sucai_100/count_sucai*100,2) percentage_sucai_30_100=round(count_sucai_30_100/count_sucai*100,2) percentage_sucai_30=round(count_sucai_30/count_sucai*100,2) ifcount_asimgs==0: count_asimgs=0.0001 percentage_asimgs_100=round(count_asimgs_100/count_asimgs*100,2) percentage_asimgs_30_100=round(count_asimgs_30_100/count_asimgs*100,2) percentage_asimgs_30=round(count_asimgs_30/count_asimgs*100,2) #printcount_static9 ifcount_static9==0: count_static9=0.0001 percentage_static9_100=round(count_static9_100/count_static9*100,2) #printcount_static9_100,"100",percentage_static9_100 percentage_static9_30_100=round(count_static9_30_100/count_static9*100,2) #printcount_static9_30_100,"30-100",percentage_static9_30_100 percentage_static9_30=round(count_static9_30/count_static9*100,2) #printcount_static9_30,"30",percentage_static9_30 
requests_second_sucai=round(count_sucai/300,2) requests_second_asimgs=round(count_asimgs/300,2) requests_second_static9=round(count_static9/300,2) #printrequests_second_static9 #printcount,"thisisthecountof2xx_backup:",count_200_backup,"%",round(count_200_backup/count,4),"thisisthecountof!2xx_backup:",count_not_200_backup,round(count_not_200_backup/count,4) percentage_200_backup=round(count_200_backup/count*100,2) percentage_not_200_backup=round(count_not_200_backup/count*100,2) returnaverage_bodysize,response_time,count,count_success,len(list_ip_403),len(list_ip_404),len(list_ip_499),len(list_ip_500),len(list_ip_502),len(list_ip_503),count_200,count_300,requests_second,response_time_source,len(list_time_1),len(list_time_3),percentage_time_1,percentage_time_3,count_sucai,percentage_sucai_100,percentage_sucai_30_100,percentage_sucai_30,requests_second_sucai,count_asimgs,percentage_asimgs_100,percentage_asimgs_30_100,percentage_asimgs_30,requests_second_asimgs,traffic_ppsucai,traffic_asimgs,traffic,traffic_static9,count_static9,percentage_static9_100,percentage_static9_30_100,percentage_static9_30,requests_second_static9,percentage_200_backup,percentage_not_200_backup,len(list_ip_415) except: return0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 deflog_files(pwd): log_file_list=[] files=os.popen("ls%s"%pwd).readlines() forxinfiles: ifx.strip().endswith("log"): log_file_list.append(x.strip()) returnlog_file_list defresult_dic(): list=average_flow() #printlist #printlist result={} result['average_bodysize']=list[0] result['response_time']=list[1] result['sum_count']=list[2] result['count_success']=list[3] result['four']=list[4]+list[5]+list[6]+list[39] #print'four','=','%s'%list[4],'+','%s'%list[5],'+','%s'%list[6],'+','%s'%list[39],result['four'] result['403']=list[4] #print'403',result['403'] result['404']=list[5] #print'404',result['404'] result['499']=list[6] #print'499',result['499'] result['415']=list[39] 
#print'415',result['415'] result['five']=list[7]+list[8]+list[9] result['500']=list[7] result['502']=list[8] result['503']=list[9] result['200']=list[10] result['300']=list[11] result['requests_second']=list[12] result['response_time_source']=list[13] result['percentage_time_1']=list[16] result['percentage_time_3']=list[17] result['count_sucai']=list[18] result['percentage_sucai_100']=list[19] result['percentage_sucai_30_100']=list[20] result['percentage_sucai_30']=list[21] result['requests_second_sucai']=list[22] result['count_asimgs']=list[23] result['percentage_asimgs_100']=list[24] result['percentage_asimgs_30_100']=list[25] result['percentage_asimgs_30']=list[26] result['requests_second_asimgs']=list[27] result['traffic_ppsucai']=list[28] result['traffic_asimgs']=list[29] result['traffic']=list[30] result['traffic_static9']=list[31] result['count_static9']=list[32] result['percentage_static9_100']=list[33] result['percentage_static9_30_100']=list[34] result['percentage_static9_30']=list[35] result['requests_second_static9']=list[36] result['percentage_200_backup']=list[37] result['percentage_not_200_backup']=list[38] result['all']=list returnresult defWritelog(msg): o=open("/log/nginx/qos_result_new"+".log","aw") o.write(time.strftime("%Y-%m-%d%H:%M:%S",time.localtime())+":"+msg+"\n") o.close() defWriteTmpInfo(msg): o=open("/tmp/webcdnqos_result"+".txt","aw+") o.write(msg+"\n") o.close() defWriteURLInfo(msg): today=datetime.date.today() o=open("/tmp/webcdnqos_url_%s"%today.strftime('%Y-%m-%d')+".log","aw") #o.write(time.strftime("%Y-%m-%d%H:%M:%S",time.localtime())+""+msg+"\n") o.write(msg) o.close() if__name__=="__main__": iflen(sys.argv)<2: show_usage() os.popen("rm-f/tmp/exist.txt") sys.exit(1) else: ifos.path.isfile("/tmp/exist.txt"): sys.exit(1) else: os.popen("echo'hello'>/tmp/exist.txt") result_key=sys.argv[1] status=result_dic() os.popen(">/tmp/webcdnqos_result.txt") printstatus[result_key] Writelog(str(status[result_key])) foriinstatus.keys(): 
WriteTmpInfo(str(i)+"="+str(status[i])) os.popen("rm-f/tmp/exist.txt")
相关文章
标签:系统运维