My job requires CNNVD vulnerability data. I used to rely on tools such as 集客搜 and 八爪鱼 (Octoparse), but was unsatisfied with both their results and their speed. I recently started learning web scraping; as a beginner I still have a lot to refine, so this post records my first crawler. I also plan to improve it in the direction of multiprocessing and IP proxies.
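As a quick note on the IP-proxy direction mentioned above, the sketch below shows one way an HTTP proxy could be wired into the same urllib-based openers the crawler uses. The proxy address is a placeholder and this snippet is not part of the script itself.

import urllib.request

# Placeholder proxy address; swap in a working HTTP proxy before use.
proxy_opener = urllib.request.build_opener(
    urllib.request.ProxyHandler({'http': 'http://127.0.0.1:8080'})
)
# Requests opened through this opener are routed through the proxy,
# which is the basis for rotating IPs later on.
page = proxy_opener.open('http://cnnvd.org.cn/web/vulnerability/queryLds.tag')

A similar opener could later replace holes_url_opener and holes_data_opener, and multiprocessing.Pool could map holes_data over the collected links. The full script as it currently stands follows.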
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# by 默不知然
import urllib.request
from urllib import parse
from bs4 import BeautifulSoup
import http.cookiejar
import xlwt
import zlib
import re
import time
import xlsxwriter
import sys
import datetime
'''
Usage:
python holes_crawler 2017-10-01 2017-10-31 178
The first argument is the start date, the second is the end date, and the third is the total number of pages.
'''
#Get the list of vulnerability detail links
def holes_url_list(url, start_time, end_time):
    header = {
        'User-Agent': 'Mozilla/5.0 (Linux; Android 4.1.2; Nexus 7 Build/JZ054K) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.166 Safari/535.19',
        'Accept-Encoding': 'gzip, deflate',
        'Referer': 'http://cnnvd.org.cn/web/vulnerability/queryLds.tag'
    }
    data = {
        'qstartdate': start_time,  #start date
        'qenddate': end_time       #end date
    }
    data = parse.urlencode(data).encode('utf-8')
    holes_url_html = urllib.request.Request(url, headers=header, data=data)
    holes_url_cookie = http.cookiejar.CookieJar()
    holes_url_opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(holes_url_cookie))
    holes_url_html = holes_url_opener.open(holes_url_html)
    #Response is gzip-compressed (see Accept-Encoding), so decompress before decoding
    holes_url_html = zlib.decompress(holes_url_html.read(), 16 + zlib.MAX_WBITS)
    holes_url_html = holes_url_html.decode()
    #Extract the vulnerability detail links
    response = r'href="(.+?)" target="_blank" class="a_title2"'
    holes_link_list = re.compile(response).findall(holes_url_html)
    #Prepend the host to each relative link
    i = 0
    for link in holes_link_list:
        holes_lists.append('http://cnnvd.org.cn' + link)
        i += 1
        print("Collected vulnerability link %d" % i)
    time.sleep(0.2)
#Scrape the details of a single vulnerability
def holes_data(url):
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.221 Safari/537.36 SE 2.X MetaSr 1.0',
        'Accept-Encoding': 'gzip, deflate, sdch',
    }
    holes_data_html = urllib.request.Request(url, headers=header)
    holes_data_cookie = http.cookiejar.CookieJar()
    holes_data_opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(holes_data_cookie))
    holes_data_html = holes_data_opener.open(holes_data_html)
    holes_data_html = zlib.decompress(holes_data_html.read(), 16 + zlib.MAX_WBITS)
    holes_data_html = holes_data_html.decode()
    global holes_result_list
    holes_result_list = []  #list that collects the scraped fields
    #Parse the vulnerability detail block
    holes_detainled_soup1 = BeautifulSoup(holes_data_html, 'html.parser')
    holes_detainled_data = holes_detainled_soup1.find('div', attrs={'class': 'detail_xq w770'})  #soup of the detail block
    holes_detainled_data = holes_detainled_data.decode()
    holes_detainled_soup = BeautifulSoup(holes_detainled_data, 'html.parser')  #second-pass parse
    holes_detainled_data_list = holes_detainled_soup.find_all('li')  #all <li> entries in the detail block
    try:
        holes_name = holes_detainled_soup.h2.string  #vulnerability name
    except:
        holes_name = ''
    holes_result_list.append(holes_name)
    try:
        holes_cnnvd_num = holes_detainled_soup.span.string  #CNNVD ID
        holes_cnnvd_num = re.findall(r"\:([\s\S]*)", holes_cnnvd_num)[0]
    except:
        holes_cnnvd_num = ''
    holes_result_list.append(holes_cnnvd_num)