设为首页 加入收藏

TOP

爬虫——多线程糗事百科案例(二)
2017-09-30 17:54:11 浏览:8395
Tags:爬虫 线程 百科 案例
'.//h2')[0].text # 用户性别:匿名用户,匹配不到性别 article_gender = res.xpath('./div/div/@class') if article_gender: gender = article_gender[0].split()[-1].replace("Icon", "") else: gender = "" # 用户年龄:匿名用户,匹配不到年龄 article_age = res.xpath('./div/div') if article_age: age = article_age[0].text else: age = 0 # 段子内容 content = res.xpath('.//div[@class="content"]/span')[0].text.strip() # 点赞次数 stats_vote = res.xpath('.//span[@class="stats-vote"]//i[@class="number"]') if stats_vote: stats_vote = stats_vote[0].text.strip() else: stats_vote = "0" # 评论次数 stats_comments = res.xpath('.//span[@class="stats-comments"]//i[@class="number"]') if stats_comments: stats_comments = stats_comments[0].text.strip() else: stats_comments = "0" record = { "head_url": head_url, "home_url": home_url, "user_name": user_name, "gender": gender, "age": age, "content": content, "stats_vote": stats_vote, "stats_comments": stats_comments } with lock: self.file_name.write(json.dumps(record, ensure_ascii = False) + ",") except Exception as e: print(e) except Exception as e: print(e) def main(): """ 主函数 :return: """ # 采集的数据存储在本地磁盘的文件名 file_name = open("糗事百科.json", "a", encoding = "utf-8") # 待采集的起始页码 start_page = int(input("请输入起始页码:")) # 待采集的终止页码 end_page = int(input("请输入终止页码:")) # 定义一个page队列 pageQueue = Queue() for page in range(start_page, end_page + 1): # 页码入队列 pageQueue.put(page) # 初始化采集线程 crawl_threads = [] crawl_list = ["采集线程1", "采集线程2", "采集线程3"] for thread_name in crawl_list: thread = ThreadCrawl(thread_name, pageQueue) thread.start() crawl_threads.append(thread) # 初始化解析线程 parser_threads = [] parser_list = ["解析线程1", "解析线程2", "解析线程3"] for thread_name in parser_list: thread = ThreadParser(thread_name, file_name) thread.start() parser_threads.append(thread) # 等待列队被清空 while not pageQueue.empty(): pass # 等待所有线程处理完成 for thread in crawl_threads: thread.join() # 等待队列被清空 while not data_queue.empty(): pass # 通知线程退出 global exitFlag_Parser exitFlag_Parser = True for thread in parser_threads: thread.join() with lock: 
file_name.close() if __name__ == '__main__': # 运行主函数 main()

  

首页 上一页 1 2 下一页 尾页 2/2/2
【打印繁体】【投稿】【收藏】【推荐】【举报】【评论】【关闭】【返回顶部】
上一篇Python装饰器与面向切面编程 下一篇python数据结构之链表

最新文章

热门文章

Hot 文章

Python

C 语言

C++基础

大数据基础

linux编程基础

C/C++面试题目