目标数据: zol桌面壁纸,[风景] [1920*1080] 分类下19页每个图册的图片
items.py
import scrapy


class Zol2Item(scrapy.Item):
    """Item for one ZOL desktop wallpaper image.

    Fields:
        image_urls:  URL of the full-size image to download
                     (consumed by the images pipeline).
        images:      download results, filled in by the pipeline.
        image_title: album/picture title, used to build the saved
                     file name.
    """

    image_urls = scrapy.Field()
    images = scrapy.Field()
    image_title = scrapy.Field()
pipelines.py
from scrapy import Request
from scrapy.pipelines.images import ImagesPipeline


class ZolPipeline(ImagesPipeline):
    """Images pipeline that saves each wallpaper as desk/<title>.jpg."""

    def get_media_requests(self, item, info):
        """Yield one download request for the item's image URL.

        NOTE(review): ``item["image_urls"]`` is treated as a single URL
        string here, not the list the stock ImagesPipeline expects — the
        spider must populate it that way; confirm against the spider.
        """
        image_url = item["image_urls"]
        if image_url:
            # Carry the item along so file_path() can read its title.
            yield Request(url=image_url, meta={"item": item})

    def file_path(self, request, response=None, info=None):
        """Return the storage path for a downloaded image.

        The deprecation shim copied from old Scrapy internals was removed:
        it accessed ``self.file_key`` / ``self.image_key``, which no longer
        exist on ImagesPipeline, so the very first call raised
        AttributeError before reaching the return statement.

        NOTE(review): the title is used unsanitized — duplicate titles
        overwrite each other and characters illegal in file names will
        fail; TODO sanitize before production use.
        """
        return 'desk/{}.jpg'.format(request.meta["item"]["image_title"])
middlewares.py
1 from scrapy import signals 2 from zol2.useragents import agents 3 4 5 class Zol2SpiderMiddleware(object): 6 # Not all methods need to be defined. If a method is not defined, 7 # scrapy acts as if the spider middleware does not modify the 8 # passed objects. 9 10 @classmethod 11 def from_crawler(cls, crawler): 12 # This method is used by Scrapy to create your spiders. 13 s = cls() 14 crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 15 return s 16 17 def process_spider_input(self, response, spider): 18 # Called for each response that goes through the spider 19 # middleware and into the spider. 20 21 # Should return None or raise an exception. 22 return None 23 24 def process_spider_output(self, response, result, spider): 25 # Called with the results returned from the Spider, after 26 # it has processed the response. 27 28 # Must return an iterable of Request, dict or Item objects. 29 for i in result: 30 yield i 31 32 def process_spider_exception(self, response, exception, spider): 33 # Called when a spider or process_spider_input() method 34 # (from other spider middleware) raises an exception. 35 36 # Should return either None or an iterable of Response, dict 37 # or Item objects. 38 pass 39 40 def process_start_requests(self, start_requests, spider): 41 # Called with the start requests of the spider, and works 42 # similarly to the process_spider_output() method, except 43 # that it doesn’t have a response associated. 44 45 # Must return only requests (not items). 46 for r in start_requests: 47 yield r 48 49 def spider_opened(self, spider): 50 spider.logger.info('Spide