My job requires CNNVD vulnerability data. I used to rely on tools such as 集客搜 and 八爪鱼 (Octoparse), but was unsatisfied with both their results and their speed. I recently started learning web scraping; as a beginner I still have a lot to refine, so this post records my first crawler. I also plan to improve it in the direction of multiprocessing and IP proxies.
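As a quick note on the IP-proxy direction mentioned above, the sketch below shows one way an HTTP proxy could be wired into the same urllib-based openers the crawler uses. The proxy address is a placeholder and this snippet is not part of the script itself.

import urllib.request

# Placeholder proxy address; swap in a working HTTP proxy before use.
proxy_opener = urllib.request.build_opener(
    urllib.request.ProxyHandler({'http': 'http://127.0.0.1:8080'})
)
# Requests opened through this opener are routed through the proxy,
# which is the basis for rotating IPs later on.
page = proxy_opener.open('http://cnnvd.org.cn/web/vulnerability/queryLds.tag')

A similar opener could later replace holes_url_opener and holes_data_opener, and multiprocessing.Pool could map holes_data over the collected links. The full script as it currently stands follows.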
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# by 默不知然
import urllib.request
from urllib import parse
from bs4 import BeautifulSoup
import http.cookiejar
import xlwt
import zlib
import re
import time
import xlsxwriter
import sys
import datetime
'''
Usage:
python holes_crawler 2017-10-01 2017-10-31 178
The first argument is the start date, the second is the end date, and the third is the total number of pages.
'''
#Get the list of vulnerability detail links
def holes_url_list(url, start_time, end_time):
    header = {
        'User-Agent': 'Mozilla/5.0 (Linux; Android 4.1.2; Nexus 7 Build/JZ054K) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.166 Safari/535.19',
        'Accept-Encoding': 'gzip, deflate',
        'Referer': 'http://cnnvd.org.cn/web/vulnerability/queryLds.tag'
    }
    data = {
        'qstartdate': start_time,  #start date
        'qenddate': end_time       #end date
    }
    data = parse.urlencode(data).encode('utf-8')
    holes_url_html = urllib.request.Request(url, headers=header, data=data)
    holes_url_cookie = http.cookiejar.CookieJar()
    holes_url_opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(holes_url_cookie))
    holes_url_html = holes_url_opener.open(holes_url_html)
    #Response is gzip-compressed (see Accept-Encoding), so decompress before decoding
    holes_url_html = zlib.decompress(holes_url_html.read(), 16 + zlib.MAX_WBITS)
    holes_url_html = holes_url_html.decode()
    #Extract the vulnerability detail links
    response = r'href="(.+?)" target="_blank" class="a_title2"'
    holes_link_list = re.compile(response).findall(holes_url_html)
    #Prepend the host to each relative link
    i = 0
    for link in holes_link_list:
        holes_lists.append('http://cnnvd.org.cn' + link)
        i += 1
        print("Collected vulnerability link %d" % i)
    time.sleep(0.2)
#Scrape the details of a single vulnerability
def holes_data(url):
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.221 Safari/537.36 SE 2.X MetaSr 1.0',
        'Accept-Encoding': 'gzip, deflate, sdch',
    }
    holes_data_html = urllib.request.Request(url, headers=header)
    holes_data_cookie = http.cookiejar.CookieJar()
    holes_data_opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(holes_data_cookie))
    holes_data_html = holes_data_opener.open(holes_data_html)
    holes_data_html = zlib.decompress(holes_data_html.read(), 16 + zlib.MAX_WBITS)
    holes_data_html = holes_data_html.decode()
    global holes_result_list
    holes_result_list = []  #list that collects the scraped fields
    #Parse the vulnerability detail block
    holes_detainled_soup1 = BeautifulSoup(holes_data_html, 'html.parser')
    holes_detainled_data = holes_detainled_soup1.find('div', attrs={'class': 'detail_xq w770'})  #soup of the detail block
    holes_detainled_data = holes_detainled_data.decode()
    holes_detainled_soup = BeautifulSoup(holes_detainled_data, 'html.parser')  #second-pass parse
    holes_detainled_data_list = holes_detainled_soup.find_all('li')  #all <li> entries in the detail block
    try:
        holes_name = holes_detainled_soup.h2.string  #vulnerability name
    except:
        holes_name = ''
    holes_result_list.append(holes_name)
    try:
        holes_cnnvd_num = holes_detainled_soup.span.string  #CNNVD ID
        holes_cnnvd_num = re.findall(r"\:([\s\S]*)", holes_cnnvd_num)[0]
    except:
        holes_cnnvd_num = ''
    holes_result_list.append(holes_cnnvd_num)