'.//h2')[0].text
# 用户性别:匿名用户,匹配不到性别
article_gender = res.xpath('./div/div/@class')
if article_gender:
gender = article_gender[0].split()[-1].replace("Icon", "")
else:
gender = ""
# 用户年龄:匿名用户,匹配不到年龄
article_age = res.xpath('./div/div')
if article_age:
age = article_age[0].text
else:
age = 0
# 段子内容
content = res.xpath('.//div[@class="content"]/span')[0].text.strip()
# 点赞次数
stats_vote = res.xpath('.//span[@class="stats-vote"]//i[@class="number"]')
if stats_vote:
stats_vote = stats_vote[0].text.strip()
else:
stats_vote = "0"
# 评论次数
stats_comments = res.xpath('.//span[@class="stats-comments"]//i[@class="number"]')
if stats_comments:
stats_comments = stats_comments[0].text.strip()
else:
stats_comments = "0"
record = {
"head_url": head_url,
"home_url": home_url,
"user_name": user_name,
"gender": gender,
"age": age,
"content": content,
"stats_vote": stats_vote,
"stats_comments": stats_comments
}
with lock:
self.file_name.write(json.dumps(record, ensure_ascii = False) + ",")
except Exception as e:
print(e)
except Exception as e:
print(e)
def main():
    """
    Entry point: read a page range from stdin, then run crawler and parser
    threads that download pages and extract records into a local JSON file.

    :return: None
    """
    import time  # local import: only needed for the polling sleeps below

    # Output file for the scraped records; appended to, UTF-8 encoded.
    file_name = open("糗事百科.json", "a", encoding = "utf-8")
    try:
        # First page to crawl (inclusive).
        start_page = int(input("请输入起始页码:"))
        # Last page to crawl (inclusive).
        end_page = int(input("请输入终止页码:"))
        # Queue of page numbers for the crawl threads to consume.
        pageQueue = Queue()
        for page in range(start_page, end_page + 1):
            pageQueue.put(page)
        # Start the crawl (download) threads.
        crawl_threads = []
        crawl_list = ["采集线程1", "采集线程2", "采集线程3"]
        for thread_name in crawl_list:
            thread = ThreadCrawl(thread_name, pageQueue)
            thread.start()
            crawl_threads.append(thread)
        # Start the parser threads; each writes records to the shared file.
        parser_threads = []
        parser_list = ["解析线程1", "解析线程2", "解析线程3"]
        for thread_name in parser_list:
            thread = ThreadParser(thread_name, file_name)
            thread.start()
            parser_threads.append(thread)
        # Wait until every page number has been taken off the queue.
        # Sleep briefly instead of busy-spinning a CPU core.
        while not pageQueue.empty():
            time.sleep(0.1)
        # Wait for all crawl threads to finish.
        for thread in crawl_threads:
            thread.join()
        # Wait until the parser threads have drained the data queue.
        while not data_queue.empty():
            time.sleep(0.1)
        # Signal the parser threads to exit, then wait for them.
        global exitFlag_Parser
        exitFlag_Parser = True
        for thread in parser_threads:
            thread.join()
    finally:
        # Close the output file even if an exception occurred above;
        # hold the shared lock so no parser thread is mid-write.
        with lock:
            file_name.close()
if __name__ == '__main__':
    # Run the main entry point only when this module is executed as a script.
    main()