When scraping data with a crawler, you often need a pool of IP proxies, so that no single IP makes requests frequently enough to get banned.
IP proxies can be collected from this site: http://www.xicidaili.com/nn/.
Below is a Python program that grabs IP proxies and saves them to a local file.
Python version: 3.6.3
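The whole script hinges on one requests feature: a proxies dict that maps a URL scheme to a proxy address. A minimal sketch of that mechanism (the proxy address and the httpbin.org echo service are placeholders of mine, not part of the original program):

import requests

# A placeholder proxy address -- substitute one from the grabbed pool.
proxies = {'http': 'http://183.148.152.1:9999'}

# httpbin.org/ip echoes back the IP it sees, so the response shows
# whether the request actually went through the proxy.
r = requests.get('http://httpbin.org/ip', proxies=proxies, timeout=3)
print(r.text)

The full program: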
#grab ip proxies from xicidaili
import sys, time, requests
from multiprocessing.dummy import Pool as ThreadPool
from lxml import etree

IP_POOL = 'ip_pool.py'
URL = 'http://www.xicidaili.com/nn/'    #IP proxies, high anonymity
#URL = 'http://www.xicidaili.com/wt/'   #IP proxies, http
RUN_TIME = time.strftime("%Y-%m-%d %H:%M", time.localtime())    #run time

#working proxies, keyed by scheme
alive_ip = {'http': [], 'https': []}
#thread pool for validating proxies concurrently
pool = ThreadPool(20)

#fetch a url and return its html text
def get_html(url):
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:55.0) Gecko/20100101 Firefox/55.0",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
        "Accept-Encoding": "gzip, deflate",
        "Referer": "https://www.xicidaili.com/",
        "Connection": "keep-alive",
        "Upgrade-Insecure-Requests": "1"
    }
    r = requests.get(url, headers=headers)
    r.encoding = 'utf-8'
    return r.text

#test whether a proxy is alive; the proxies dict must use the proxy's
#own scheme, otherwise requests bypasses the proxy entirely and every
#address looks alive
def test_alive(proxy):
    scheme = 'https' if proxy.startswith('https') else 'http'
    try:
        r = requests.get('%s://www.baidu.com' % scheme,
                         proxies={scheme: proxy}, timeout=3)
        if r.status_code == 200:
            alive_ip[scheme].append(proxy)
    except requests.RequestException:
        print('%s is dead!' % proxy)

#parse the html and collect candidate proxies
def get_alive_ip_address():
    iplist = []
    html = get_html(URL)
    selector = etree.HTML(html)
    table = selector.xpath('//table[@id="ip_list"]')[0]
    lines = table.xpath('./tr')[1:]    #skip the header row
    for line in lines:
        speed, connect_time = line.xpath('.//div/@title')
        data = line.xpath('./td')
        ip = data[1].xpath('./text()')[0]
        port = data[2].xpath('./text()')[0]
        anonymous = data[4].xpath('./text()')[0]
        ip_type = data[5].xpath('./text()')[0]
        #titles look like "0.5秒"; strip the trailing "秒", then drop
        #slow or non-high-anonymity ("高匿") proxies
        if float(speed[:-1]) > 1 or float(connect_time[:-1]) > 1 or anonymous != '高匿':
            continue
        iplist.append(ip_type.lower() + '://' + ip + ':' + port)
    pool.map(test_alive, iplist)

#write the working proxies to a local file
def write_txt(output_file):
    with open(output_file, 'w') as f:
        f.write('#create time: %s\n\n' % RUN_TIME)
        f.write('http_ip_pool = \\\n')
        f.write(str(alive_ip['http']).replace(',', ',\n'))
        f.write('\n\n')
        f.write('https_ip_pool = \\\n')
        f.write(str(alive_ip['https']).replace(',', ',\n'))
        f.write('\n')
    print('write successful: %s' % output_file)

def main(output_file):
    get_alive_ip_address()
    write_txt(output_file)

if __name__ == '__main__':
    try:
        output_file = sys.argv[1]    #first argument is the output filename
    except IndexError:
        output_file = IP_POOL
    main(output_file)
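For reference, the XPath calls in get_alive_ip_address() assume roughly the following row layout. The sample markup below is reconstructed from those expressions rather than captured from the live site, so treat its details as assumptions:

from lxml import etree

# Markup reconstructed from the XPath above: td[1]=ip, td[2]=port,
# td[4]=anonymity, td[5]=type, plus two divs whose title attributes
# hold the speed and connect time.
sample = '''
<table id="ip_list">
  <tr><th>header row, skipped by [1:]</th></tr>
  <tr>
    <td></td>
    <td>183.148.152.1</td>
    <td>9999</td>
    <td>location</td>
    <td>高匿</td>
    <td>HTTP</td>
    <td><div title="0.5秒"></div></td>
    <td><div title="0.1秒"></div></td>
  </tr>
</table>'''

row = etree.HTML(sample).xpath('//table[@id="ip_list"]/tr')[1]
speed, connect_time = row.xpath('.//div/@title')    # '0.5秒', '0.1秒'
tds = row.xpath('./td')
print(tds[1].xpath('./text()')[0],                  # 183.148.152.1
      tds[2].xpath('./text()')[0])                  # 9999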
Run the program:
root@c:test$ python get_ip_proxies.py
write successful: ip_pool.py
View the file:
root@c:test$ vim ip_pool.py
#create time: 2019-03-14 19:53

http_ip_pool = \
['http://183.148.152.1:9999',
 'http://112.85.165.234:9999',
 'http://112.8
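Since the pool is written out as a .py file, it can be imported like any module. A minimal sketch of consuming it with random per-request rotation (the rotation strategy and the httpbin.org test URL are my additions, not part of the original script):

import random
import requests

from ip_pool import http_ip_pool    # the file generated above

def fetch(url):
    # pick a different proxy for each request so that no single
    # IP hammers the target site
    proxy = random.choice(http_ip_pool)
    return requests.get(url, proxies={'http': proxy}, timeout=3)

print(fetch('http://httpbin.org/ip').text)

Writing the pool as an importable .py file keeps the consuming code trivial; the trade-off is that the grabber has to be re-run whenever the proxies go stale.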