headers=self.headers).text
21 selector = etree.HTML(html)
22 divList = selector.xpath('//div[@id="content-left"]/div')
23 return divList
24
# Extract the wanted fields from each post node
def getHtmlItems(self, divList):
    """Turn each post node into a ``[name, content, love, num]`` record.

    Args:
        divList: Iterable of post nodes; each must provide an
            ``xpath(expr)`` method that returns a list (lxml-style).

    Returns:
        list: One four-element list of strings per node, in input order:
        author name, post text, vote count and comment count. When a
        lookup finds nothing, the name falls back to ``""`` and the
        counts to ``"0"`` instead of raising ``IndexError``.
    """
    items = []

    for div in divList:
        # Author name; guard against nodes that have no <h2> text.
        nameNodes = div.xpath('.//h2/text()')
        name = nameNodes[0].replace("\n", "") if nameNodes else ""

        # A "contentForAll" span marks a post whose full text has to be
        # read from the post's own page (the "read full text" link).
        truncated = div.xpath('.//div[@class="content"]/span[@class="contentForAll"]')
        hrefs = div.xpath('.//a[@class="contentHerf"]/@href') if truncated else []
        if hrefs:
            detailUrl = "https://www.qiushibaike.com" + hrefs[0]
            # Send the same headers the class uses for its other request,
            # so the detail page is fetched consistently.
            detailHtml = requests.get(url=detailUrl, headers=self.headers).text
            textParts = etree.HTML(detailHtml).xpath('//div[@class="content"]/text()')
        else:
            # Either the post is not truncated, or no link was found:
            # use the text that is present in the node itself.
            textParts = div.xpath('.//div[@class="content"]/span/text()')
        content = "".join(textParts).replace("\n", "")

        # Vote count.
        loveNodes = div.xpath('.//span[@class="stats-vote"]/i[@class="number"]/text()')
        love = loveNodes[0] if loveNodes else "0"

        # Comment count.
        numNodes = div.xpath('.//span[@class="stats-comments"]//i[@class="number"]/text()')
        num = numNodes[0] if numNodes else "0"

        items.append([name, content, love, num])

    return items
65
# Save the records to a text file
def saveItem(self, items, path='F:\\Pythontest1\\qiushi.txt'):
    """Append scraped records to a UTF-8 text file.

    Args:
        items: Iterable of records indexable as ``[name, content, love,
            num]``; all four fields must be strings.
        path: Destination file. Defaults to the location this method
            previously hard-coded, so existing callers are unaffected.
    """
    # ``with`` guarantees the handle is closed even if a write raises.
    with open(path, "a", encoding='UTF-8') as f:
        for item in items:
            name, content, love, num = item[0], item[1], item[2], item[3]
            # One record: author, text, then both counters on one line,
            # followed by a blank line as separator.
            f.write(
                "发布人:" + name + '\n'
                + "内容:" + content + '\n'
                + "点赞数:" + love + '\t'
                + "评论人数:" + num
                + '\n\n'
            )
84
# Check whether the output file already exists; create the directory if needed
def judgePath(self):
    """Make sure the output directory exists and remove any old output file."""
    # exist_ok=True replaces the exists()-then-mkdir() check, which could
    # fail if the directory appeared between the two calls.
    os.makedirs('F:\\Pythontest1', exist_ok=True)
    # Drop output left over from a previous run; a missing file is fine.
    try:
        os.remove("F:\\Pythontest1\\qiushi.txt")
    except FileNotFoundError:
        pass
91
def start(self):
    """Run the interactive loop: fetch a page, save it, ask whether to go on.

    After each saved page the user is prompted; entering exactly ``Q``
    ends the loop, any other input advances to the next page.
    """
    self.judgePath()
    print("正在读取糗事百科,按回车继续保存下一页,Q退出")
    self.enable = True
    while self.enable:
        pageNodes = self.getHtmlDivList(self.pageIndex)
        records = self.getHtmlItems(pageNodes)
        self.saveItem(records)
        print('已保存第%d页的内容' % self.pageIndex)
        answer = input('是否继续保存:')
        if answer == 'Q':
            # Quit requested: clear the flag so the while condition fails.
            print('程序运行结束!!')
            self.enable = False
            continue
        self.pageIndex += 1
        self.enable = True
108
109
# Script entry point: create the spider and run its interactive loop.
spider = QSBK()
spider.start()
View Code
|