<h1>一条爬虫抓取一个小网站所有数据</h1>
<p>今天闲来无事，写一个爬虫来玩玩。在网上冲浪的时候发现了一个搞笑的段子网，发现里面的内容还是比较有意思的，于是心血来潮，就想着能不能写一个Python程序，抓取几条数据下来看看，一不小心就把这个网站的所有数据都拿到了。</p>
</p>
<p>这个网站主要的数据都是详情在HTML里面的，可以采用lxml模块的xpath对HTML标签的内容解析，获取到自己想要的数据，然后再保存在本地文件中，整个过程是一气呵成的。能够抓取到一页的数据之后，加一个循环就可以抓取到所有页的数据，下面的就是数据展示。</p>
</p>
<p>废话少说，直接上Python代码</p>
import csv
import time


class Page:
    """Crawler for the biedoul.com joke listing pages.

    Fetches one listing page at a time, parses each ``<dl>`` article node
    with XPath, and returns the extracted fields as plain dicts.

    NOTE(review): reconstructed from a garbled scrape of the original
    post. ``requests`` and ``lxml`` are imported lazily inside
    ``askHTML`` so the pure parsing helpers stay importable (and
    testable) without the scraping dependencies installed.
    """

    def __init__(self):
        # Site root; listing pages live under /index/<page>.
        self.pre_url = "https://www.biedoul.com"
        self.start_page = 1
        # Last listing page known at the time of writing.
        self.end_page = 15233

    def askHTML(self, current_page, opportunity):
        """Fetch listing page *current_page*; return its ``<dl>`` article nodes.

        ``opportunity`` is the number of retries left: on any request or
        parse error the method sleeps, then recurses with one fewer
        retry, and returns None once retries are exhausted.
        """
        # Deferred imports: only the network path needs them.
        import requests
        from lxml import etree

        print(
            "=============================== current page => " + str(current_page) + "===============================")
        try:
            page_url = self.pre_url + "/index/" + str(current_page)
            response = requests.get(url=page_url)
            html = etree.HTML(response.content)
            return html.xpath('/html/body/div/div/div/dl')
        except Exception:
            if opportunity > 0:
                # NOTE(review): 500 *seconds* between retries — the
                # original author may have meant milliseconds; confirm.
                time.sleep(500)
                print(
                    "=============================== retry => " + str(opportunity) + "===============================")
                return self.askHTML(current_page, opportunity - 1)
            return None

    def analyze(self, articles):
        """Extract link/title/content/pictures/vote fields from article nodes.

        Returns a list of dicts, one per article, in input order.
        """
        lines = []
        for article in articles:
            data = {
                "link": article.xpath("./span/dd/a/@href")[0],
                "title": article.xpath("./span/dd/a/strong/text()")[0],
                "content": self.analyze_content(article),
            }
            picture_links = article.xpath("./dd/img/@src")
            # xpath() yields a (possibly empty) list; normalise to [].
            data["picture_links"] = picture_links if picture_links else []
            data["good_zan"] = self.analyze_zan(article, "good")
            data["bad_bs"] = self.analyze_zan(article, "bad")
            lines.append(data)
        return lines

    # Parse the article body text.
    def analyze_content(self, article):
        """Return the article's text: direct <dd> text, else <font>, else <p>."""
        # 1. Direct text children of <dd>.
        content = article.xpath("./dd/text()")
        if content and not self.is_empty_list(content):
            return content

        content = []
        for dd in article.xpath("./dd"):
            # 2. Text inside <font> descendants of <dd>.
            if not content:
                self._collect_text(dd, ".//font", content)
            # 3. Fall back to text inside <p> descendants of <dd>.
            if not content:
                self._collect_text(dd, ".//p", content)
        return content

    def _collect_text(self, node, pattern, content):
        """Append the non-empty text() lists of *pattern* matches to *content*."""
        for match in node.xpath(pattern):
            text = match.xpath("./text()")
            if text:
                content.append(text)

    def is_empty_list(self, items):
        """True when every entry in *items* is blank or whitespace-only.

        NOTE(review): this helper was missing from the garbled source
        (called but never defined); reconstructed from its single call
        site in analyze_content.
        """
        return all(not str(item).strip() for item in items)

    def analyze_zan(self, article, type):
        """Return the 'good'/'bad' vote-count text for *article*, or "0".

        NOTE(review): the original source is truncated at ``return num[``;
        the obvious completion — first matched text node, "0" when the
        element is absent — is used here.
        """
        num = article.xpath(
            "./div/div/a[@class='pinattn " + type + "']/p/text()")
        if num:
            return num[0]
        return "0"
15:44:25","list":"1695455065","username":"admin","author":"","copyfrom":"","copyfromurl":"","titlecolor":"","fonttype":"0","titleicon":"0","picurl":"https:\/\/www.cppentry.com\/upload_files\/","ispic":"0","yz":"1","yzer":"","yztime":"0","levels":"0","levelstime":"0","keywords":"","jumpurl":"","iframeurl":"","style":"","template":"a:3:{s:4:\"head\";s:0:\"\";s:4:\"foot\";s:0:\"\";s:8:\"bencandy\";s:0:\"\";}","target":"0","ip":"112.94.1.100","lastfid":"0","money":"0","buyuser":"","passwd":"","allowdown":"","allowview":"","editer":"","edittime":"0","begintime":"0","endtime":"0","description":" \u4e00\u6761\u722c\u866b\u6293\u53d6\u4e00\u4e2a\u5c0f\u7f51\u7ad9\u6240\u6709\u6570\u636e","lastview":"1715493203","digg_num":"750","digg_time":"1715789013","forbidcomment":"0","ifvote":"0","heart":"","htmlname":"","city_id":"0"},"page":"1"}