'.//h2')[0].text
# 用户性别:匿名用户,匹配不到性别
article_gender = res.xpath('./div/div/@class')
if article_gender:
gender = article_gender[0].split()[-1].replace("Icon", "")
else:
gender = ""
# 用户年龄:匿名用户,匹配不到年龄
article_age = res.xpath('./div/div')
if article_age:
age = article_age[0].text
else:
age = 0
# 段子内容
content = res.xpath('.//div[@class="content"]/span')[0].text.strip()
# 点赞次数
stats_vote = res.xpath('.//span[@class="stats-vote"]//i[@class="number"]')
if stats_vote:
stats_vote = stats_vote[0].text.strip()
else:
stats_vote = "0"
# 评论次数
stats_comments = res.xpath('.//span[@class="stats-comments"]//i[@class="number"]')
if stats_comments:
stats_comments = stats_comments[0].text.strip()
else:
stats_comments = "0"
record = {
"head_url": head_url,
"home_url": home_url,
"user_name": user_name,
"gender": gender,
"age": age,
"content": content,
"stats_vote": stats_vote,
"stats_comments": stats_comments
}
with lock:
self.file_name.write(json.dumps(record, ensure_ascii = False) + ",")
except Exception as e:
print(e)
except Exception as e:
print(e)
def main():
    """
    Entry point: read a page range from stdin, then run crawler and parser
    threads that download pages and extract records into a local JSON file.

    :return: None
    """
    import time  # local import: only needed for the polling sleeps below

    # Output file for the scraped records; appended to, UTF-8 encoded.
    file_name = open("糗事百科.json", "a", encoding = "utf-8")
    try:
        # First page to crawl (inclusive).
        start_page = int(input("请输入起始页码:"))
        # Last page to crawl (inclusive).
        end_page = int(input("请输入终止页码:"))
        # Queue of page numbers for the crawl threads to consume.
        pageQueue = Queue()
        for page in range(start_page, end_page + 1):
            pageQueue.put(page)
        # Start the crawl (download) threads.
        crawl_threads = []
        crawl_list = ["采集线程1", "采集线程2", "采集线程3"]
        for thread_name in crawl_list:
            thread = ThreadCrawl(thread_name, pageQueue)
            thread.start()
            crawl_threads.append(thread)
        # Start the parser threads; each writes records to the shared file.
        parser_threads = []
        parser_list = ["解析线程1", "解析线程2", "解析线程3"]
        for thread_name in parser_list:
            thread = ThreadParser(thread_name, file_name)
            thread.start()
            parser_threads.append(thread)
        # Wait until every page number has been taken off the queue.
        # Sleep briefly instead of busy-spinning a CPU core.
        while not pageQueue.empty():
            time.sleep(0.1)
        # Wait for all crawl threads to finish.
        for thread in crawl_threads:
            thread.join()
        # Wait until the parser threads have drained the data queue.
        while not data_queue.empty():
            time.sleep(0.1)
        # Signal the parser threads to exit, then wait for them.
        global exitFlag_Parser
        exitFlag_Parser = True
        for thread in parser_threads:
            thread.join()
    finally:
        # Close the output file even if an exception occurred above;
        # hold the shared lock so no parser thread is mid-write.
        with lock:
            file_name.close()
if __name__ == '__main__':
    # Run the main entry point only when this module is executed as a script.
    main()