阳光热线问政平台
起始URL(末尾的 page 参数是翻页偏移量,爬虫每翻一页将其增加 30):http://wz.sun0769.com/index.php/question/questionType?type=4&page=
爬取字段:帖子的编号、投诉类型、帖子的标题、帖子的URL地址、部门、状态、网友、时间。
1.items.py
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html
import scrapy
class SunwzspiderItem(scrapy.Item):
    """Container for one complaint post scraped from the listing page.

    Holds the post number, complaint type, title, URL, department,
    status, poster and time of a single post.
    """

    post_id = scrapy.Field()     # post number
    post_type = scrapy.Field()   # complaint type
    post_title = scrapy.Field()  # post title
    post_url = scrapy.Field()    # URL of the post
    sector = scrapy.Field()      # department
    post_state = scrapy.Field()  # status
    net_friend = scrapy.Field()  # poster (site user)
    post_time = scrapy.Field()   # time
2.spiders/sunwz.py
# -*- coding: utf-8 -*-
import scrapy
from sunwzSpider.items import SunwzspiderItem
class SunwzSpider(scrapy.Spider):
    """Crawl the paginated complaint listing on wz.sun0769.com.

    Each listing page is fetched from ``url`` followed by an offset that is
    advanced by 30 per page. Every table row that has the expected layout is
    emitted as one ``SunwzspiderItem``.
    """

    name = 'sunwz'
    allowed_domains = ['wz.sun0769.com']
    url = "http://wz.sun0769.com/index.php/question/questionType?type=4&page="
    offset = 0
    start_urls = [url + str(offset)]

    def parse(self, response):
        """Yield one item per post row, then a request for the next page.

        The next page is requested only when this page produced at least one
        post, so the crawl stops at the first page with no parsable rows
        instead of paging forever.
        """
        tables = response.xpath("//table[@width='98%']")
        if not tables:
            # Missing listing table: nothing to scrape and no reason to go on.
            self.logger.warning("No listing table found on %s", response.url)
            return

        found_post = False
        for tr in tables[0].xpath("./tr"):
            item = self._parse_row(tr)
            if item is None:
                continue
            found_post = True
            # Yield outside any try/except so generator shutdown
            # (GeneratorExit) is never swallowed.
            yield item

        # Decide whether to keep crawling the next page.
        if found_post:
            self.offset += 30
            yield scrapy.Request(self.url + str(self.offset), callback=self.parse)

    def _parse_row(self, tr):
        """Build an item from one ``<tr>``; return None if the row has a different layout."""
        try:
            cells = tr.xpath("./td")
            # Text nodes that are direct children of the row's cells.
            cell_texts = tr.xpath("./td/text()").extract()
            # The second cell holds three links: type, title, department.
            link_texts = cells[1].xpath("./a/text()").extract()
            return SunwzspiderItem(
                post_id=cell_texts[0],
                post_type=link_texts[0],
                post_title=link_texts[1],
                post_url=cells[1].xpath("./a/@href").extract()[1],
                sector=link_texts[2],
                post_state=cells[2].xpath("./span/text()").extract()[0],
                net_friend=cell_texts[3],
                post_time=cell_texts[4],
            )
        except IndexError:
            # Rows lacking an expected cell, link or text node are skipped.
            return None
3.pipelines.py
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json
class SunwzspiderPipeline(object):
    """Write every scraped item into a single JSON array file.

    The output file is opened when the pipeline is constructed. Items are
    appended one per line, and the array is closed in ``close_spider``.
    """

    def __init__(self):
        self.file = open("阳光问政平台.json", "w", encoding="utf-8")
        # True until the first item has been written.
        self.first_flag = True

    def process_item(self, item, spider):
        """Append ``item`` to the JSON array and return it unchanged."""
        # The first item opens the array; later ones are comma-separated.
        prefix = "[\n" if self.first_flag else ",\n"
        self.first_flag = False
        self.file.write(prefix + json.dumps(dict(item), ensure_ascii=False))
        return item

    def close_spider(self, spider):
        """Terminate the JSON array and close the output file."""
        try:
            if self.first_flag:
                # No item was written: emit the opening bracket so the file
                # is still a valid (empty) JSON array.
                self.file.write("[")
            self.file.write("\n]")
        finally:
            self.file.close()
4.settings.py
# -*- coding: utf-8 -*-
# Scrapy settings for sunwzSpider project
#
# For simplicity, this file contains only settings considered important or
# commonly use