设为首页 加入收藏

TOP

PYTHON爬取图片
2023-07-25 21:27:17 浏览:29
Tags:PYTHON
from threading import Thread
from concurrent.futures import ThreadPoolExecutor
from multiprocessing import Process, Queue
import requests
from lxml import etree
from urllib import parse

# 异常处理还未优化,后续补上
# 未解决问题1:这是爬取多个页面的当前所有图片,图片内部的还未处理
# 未解决问题2:当爬取页面过多时,会报错,原因还未找到,后续补上

# Headers attached to every HTTP request issued by this script.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/106.0.0.0 Mobile Safari/537.36"
    ),
    # Anti-hotlinking: Referer names the page the current request came from.
    "Referer": "https://xxx",
}


def _fetch_tree(url):
    """Fetch *url* and return its parsed HTML tree, or None if the request fails."""
    try:
        # A timeout keeps one stalled connection from hanging the producer.
        resp = requests.get(url, headers=headers, timeout=10)
    except requests.RequestException as exc:
        print(f"request failed, skipped: {url} ({exc})")
        return None
    resp.encoding = 'utf-8'
    return etree.HTML(resp.text)


def get_img_src(q):
    """Producer: collect image URLs from the listing pages and put them on *q*.

    Walks listing pages 1-4, follows every detail link found on each page,
    takes the first image URL from each detail page and puts it on the queue.
    Pages that cannot be fetched or that contain no matching image are
    skipped.

    Args:
        q: queue shared with the downloader; receives one image URL per
            detail page, then the end-of-stream marker "已经没了".
    """
    # The first listing page is index.html; later pages are numbered 2..4.
    page_urls = ["https://xxx/index.html"]
    page_urls.extend(f"https://xxx/{i}.html" for i in range(2, 5))

    try:
        for page_url in page_urls:
            list_tree = _fetch_tree(page_url)
            if list_tree is None:
                continue
            hrefs = list_tree.xpath("//div[@class='list-box-p']/ul/li/a/@href")
            for href in hrefs:
                # urljoin leaves absolute links untouched and resolves
                # relative ones against the listing page.
                child_tree = _fetch_tree(parse.urljoin(page_url, href))
                if child_tree is None:
                    continue
                # xpath returns a list; it is empty when the detail page has
                # no matching image, so guard before taking the first item.
                srcs = child_tree.xpath("//div[@class='img_box']/a/img/@src")
                if not srcs:
                    continue
                src = srcs[0]
                q.put(src)  # hand the URL over to the downloader
                print(f"---------------------------------------------------被塞进队列--------------------->{src}")
    finally:
        # Always send the end marker, even after an unexpected error;
        # otherwise the consumer would block on q.get() forever.
        q.put("已经没了")


def download(src):
    """Download the image at *src* into ./image/, named after the last URL segment.

    The response is fetched and checked before the file is opened, so a
    failed request does not leave an empty file behind. The target directory
    is created when missing. Request failures are reported and the function
    returns without writing anything.

    Args:
        src: URL of the image to download.
    """
    import os  # local import: the file's import header does not provide os

    print('开始下载------------>', src)
    try:
        resp = requests.get(src, headers=headers, timeout=10)
        # Treat HTTP error statuses as failures instead of saving the error
        # page as if it were an image.
        resp.raise_for_status()
    except requests.RequestException as exc:
        print('下载失败------------>', src, exc)
        return

    os.makedirs("./image", exist_ok=True)
    name = src.split('/')[-1]
    with open("./image/" + name, mode='wb') as f:
        f.write(resp.content)
    print('下载完毕------------>', src)


def download_img(q):
    """Consumer: take image URLs from *q* and download them on a 5-thread pool.

    Reads from the queue until the end-of-stream marker "已经没了" arrives.
    Leaving the ``with`` block waits for all submitted downloads to finish.

    Args:
        q: queue of image URLs terminated by the end-of-stream marker.
    """
    with ThreadPoolExecutor(5) as pool:
        # q.get() blocks until an item is available; iter() stops as soon as
        # the marker is received, without submitting it.
        for src in iter(q.get, "已经没了"):
            pool.submit(download, src)


if __name__ == '__main__':
    # One process discovers image URLs, a second one downloads them;
    # the queue carries URLs from the first to the second.
    q = Queue()
    p1 = Process(target=get_img_src, args=(q,))
    p2 = Process(target=download_img, args=(q,))
    p1.start()
    p2.start()
    # Wait explicitly for both workers so the script ends only when they do.
    p1.join()
    p2.join()
【打印繁体】【投稿】【收藏】【推荐】【举报】【评论】【关闭】【返回顶部】
上一篇(已失效)OpenGPT搭建QQ机器人 下一篇Python爬虫实战,requests+openpy..

最新文章

热门文章

Hot 文章

Python

C 语言

C++基础

大数据基础

linux编程基础

C/C++面试题目