<h1>一条爬虫抓取一个小网站所有数据</h1>
<p>今天闲来无事，写一个爬虫来玩玩。在网上冲浪的时候发现了一个搞笑的段子网，发现里面的内容还是比较有意思的，于是心血来潮，就想着能不能写一个Python程序，抓取几条数据下来看看，一不小心就把这个网站的所有数据都拿到了。</p>
</p>
<p>这个网站主要的数据都是详情在HTML里面的，可以采用lxml模块的xpath对HTML标签的内容解析，获取到自己想要的数据，然后再保存在本地文件中，整个过程是一气呵成的。能够抓取到一页的数据之后，加一个循环就可以抓取到所有页的数据，下面的就是数据展示。</p>
</p>
<p>废话少说，直接上Python代码</p>
import csv
import time


class Page:
    """Crawler for the biedoul.com joke listing pages.

    Fetches one listing page at a time, parses each ``<dl>`` article node
    with XPath, and returns the extracted fields as plain dicts.

    NOTE(review): reconstructed from a garbled scrape of the original
    post. ``requests`` and ``lxml`` are imported lazily inside
    ``askHTML`` so the pure parsing helpers stay importable (and
    testable) without the scraping dependencies installed.
    """

    def __init__(self):
        # Site root; listing pages live under /index/<page>.
        self.pre_url = "https://www.biedoul.com"
        self.start_page = 1
        # Last listing page known at the time of writing.
        self.end_page = 15233

    def askHTML(self, current_page, opportunity):
        """Fetch listing page *current_page*; return its ``<dl>`` article nodes.

        ``opportunity`` is the number of retries left: on any request or
        parse error the method sleeps, then recurses with one fewer
        retry, and returns None once retries are exhausted.
        """
        # Deferred imports: only the network path needs them.
        import requests
        from lxml import etree

        print(
            "=============================== current page => " + str(current_page) + "===============================")
        try:
            page_url = self.pre_url + "/index/" + str(current_page)
            response = requests.get(url=page_url)
            html = etree.HTML(response.content)
            return html.xpath('/html/body/div/div/div/dl')
        except Exception:
            if opportunity > 0:
                # NOTE(review): 500 *seconds* between retries — the
                # original author may have meant milliseconds; confirm.
                time.sleep(500)
                print(
                    "=============================== retry => " + str(opportunity) + "===============================")
                return self.askHTML(current_page, opportunity - 1)
            return None

    def analyze(self, articles):
        """Extract link/title/content/pictures/vote fields from article nodes.

        Returns a list of dicts, one per article, in input order.
        """
        lines = []
        for article in articles:
            data = {
                "link": article.xpath("./span/dd/a/@href")[0],
                "title": article.xpath("./span/dd/a/strong/text()")[0],
                "content": self.analyze_content(article),
            }
            picture_links = article.xpath("./dd/img/@src")
            # xpath() yields a (possibly empty) list; normalise to [].
            data["picture_links"] = picture_links if picture_links else []
            data["good_zan"] = self.analyze_zan(article, "good")
            data["bad_bs"] = self.analyze_zan(article, "bad")
            lines.append(data)
        return lines

    # Parse the article body text.
    def analyze_content(self, article):
        """Return the article's text: direct <dd> text, else <font>, else <p>."""
        # 1. Direct text children of <dd>.
        content = article.xpath("./dd/text()")
        if content and not self.is_empty_list(content):
            return content

        content = []
        for dd in article.xpath("./dd"):
            # 2. Text inside <font> descendants of <dd>.
            if not content:
                self._collect_text(dd, ".//font", content)
            # 3. Fall back to text inside <p> descendants of <dd>.
            if not content:
                self._collect_text(dd, ".//p", content)
        return content

    def _collect_text(self, node, pattern, content):
        """Append the non-empty text() lists of *pattern* matches to *content*."""
        for match in node.xpath(pattern):
            text = match.xpath("./text()")
            if text:
                content.append(text)

    def is_empty_list(self, items):
        """True when every entry in *items* is blank or whitespace-only.

        NOTE(review): this helper was missing from the garbled source
        (called but never defined); reconstructed from its single call
        site in analyze_content.
        """
        return all(not str(item).strip() for item in items)

    def analyze_zan(self, article, type):
        """Return the 'good'/'bad' vote-count text for *article*, or "0".

        NOTE(review): the original source is truncated at ``return num[``;
        the obvious completion — first matched text node, "0" when the
        element is absent — is used here.
        """
        num = article.xpath(
            "./div/div/a[@class='pinattn " + type + "']/p/text()")
        if num:
            return num[0]
        return "0"
15:44:25","list":"1695455065","username":"admin","author":"","copyfrom":"","copyfromurl":"","titlecolor":"","fonttype":"0","titleicon":"0","picurl":"https:\/\/www.cppentry.com\/upload_files\/","ispic":"0","yz":"1","yzer":"","yztime":"0","levels":"0","levelstime":"0","keywords":"","jumpurl":"","iframeurl":"","style":"","template":"a:3:{s:4:\"head\";s:0:\"\";s:4:\"foot\";s:0:\"\";s:8:\"bencandy\";s:0:\"\";}","target":"0","ip":"112.94.1.100","lastfid":"0","money":"0","buyuser":"","passwd":"","allowdown":"","allowview":"","editer":"","edittime":"0","begintime":"0","endtime":"0","description":" \u4e00\u6761\u722c\u866b\u6293\u53d6\u4e00\u4e2a\u5c0f\u7f51\u7ad9\u6240\u6709\u6570\u636e","lastview":"1715493203","digg_num":"750","digg_time":"1715789013","forbidcomment":"0","ifvote":"0","heart":"","htmlname":"","city_id":"0"},"page":"1"}