{"rsdb":{"rid":"397505","subhead":"","postdate":"0","aid":"273597","fid":"77","uid":"1","topic":"1","content":"
\n

\u4e00\u6761\u722c\u866b\u6293\u53d6\u4e00\u4e2a\u5c0f\u7f51\u7ad9\u6240\u6709\u6570\u636e<\/h1> \n

? \u4eca\u5929\u95f2\u6765\u65e0\u4e8b\uff0c\u5199\u4e00\u4e2a\u722c\u866b\u6765\u73a9\u73a9\u3002\u5728\u7f51\u4e0a\u51b2\u6d6a\u7684\u65f6\u5019\u53d1\u73b0\u4e86\u4e00\u4e2a\u641e\u7b11\u7684\u6bb5\u5b50\u7f51\uff0c\u53d1\u73b0\u91cc\u9762\u7684\u5185\u5bb9\u8fd8\u662f\u6bd4\u8f83\u6709\u610f\u601d\u7684\uff0c\u4e8e\u662f\u5fc3\u8840\u6765\u6f6e\uff0c\u5c31\u60f3\u7740\u80fd\u4e0d\u80fd\u5199\u4e00\u4e2aPython\u7a0b\u5e8f\uff0c\u6293\u53d6\u51e0\u6761\u6570\u636e\u4e0b\u6765\u770b\u770b\uff0c\u4e00\u4e0d\u5c0f\u5fc3\u5c31\u628a\u8fd9\u4e2a\u7f51\u7ad9\u7684\u6240\u6709\u6570\u636e\u90fd\u62ff\u5230\u4e86\u3002<\/p> \n

\"\u522b\u9017\u4e86(biedoul.com)\"<\/p> \n

? \u8fd9\u4e2a\u7f51\u7ad9\u4e3b\u8981\u7684\u6570\u636e\u90fd\u662f\u8be6\u60c5\u5728HTML\u91cc\u9762\u7684\uff0c\u53ef\u4ee5\u91c7\u7528lxml\u6a21\u5757\u7684xpath\u5bf9HTML\u6807\u7b7e\u7684\u5185\u5bb9\u89e3\u6790\uff0c\u83b7\u53d6\u5230\u81ea\u5df1\u60f3\u8981\u7684\u6570\u636e\uff0c\u7136\u540e\u518d\u4fdd\u5b58\u5728\u672c\u5730\u6587\u4ef6\u4e2d\uff0c\u6574\u4e2a\u8fc7\u7a0b\u662f\u4e00\u6c14\u5475\u6210\u7684\u3002\u80fd\u591f\u6293\u53d6\u5230\u4e00\u9875\u7684\u6570\u636e\u4e4b\u540e\uff0c\u52a0\u4e00\u4e2a\u5faa\u73af\u5c31\u53ef\u4ee5\u6293\u53d6\u5230\u6240\u6709\u9875\u7684\u6570\u636e\uff0c\u4e0b\u9762\u7684\u5c31\u662f\u6570\u636e\u5c55\u793a\u3002<\/p> \n

\"\u6570\u636e\u5c55\u793a\"<\/p> \n

\u5e9f\u8bdd\u5c11\u8bf4\uff0c\u76f4\u63a5\u4e0aPython\u4ee3\u7801<\/p> \n

import requests\nimport csv\nfrom lxml import etree\nimport time\n\n\nclass Page:\n\n    def __init__(self):\n        self.pre_url = "https:\/\/www.biedoul.com"\n        self.start_page = 1\n        self.end_page = 15233\n\n    def askHTML(self, current_page, opportunity):\n        print(\n            "=============================== current page => " + str(current_page) + "===============================")\n        try:\n\n            pre_url = self.pre_url + "\/index\/" + str(current_page)\n            page = requests.get(url=pre_url)\n            html = etree.HTML(page.content)\n            articles = html.xpath('\/html\/body\/div\/div\/div\/dl')\n            return articles\n        except Exception as e:\n            if opportunity > 0:\n                time.sleep(500)\n                print(\n                    "=============================== retry => " + str(opportunity) + "===============================")\n                return self.askHTML(current_page, opportunity - 1)\n            else:\n                return None\n\n    def analyze(self, articles):\n        lines = []\n        for article in articles:\n            data = {}\n            data["link"] = article.xpath(".\/span\/dd\/a\/@href")[0]\n            data["title"] = article.xpath(".\/span\/dd\/a\/strong\/text()")[0]\n            data["content"] = self.analyze_content(article)\n            picture_links = article.xpath(".\/dd\/img\/@src")\n            if (picture_links is not None and len(picture_links) > 0):\n                # print(picture_links)\n                data["picture_links"] = picture_links\n            else:\n                data["picture_links"] = []\n\n            # data["good_zan"] = article.xpath(".\/div\/div\/a[@class='pinattn good']\/p\/text()")[0]\n            # data["bad_bs"] = article.xpath(".\/div\/div\/a[@class='pinattn bad']\/p\/text()")[0]\n            data["good_zan"] = self.analyze_zan(article, "good")\n            # article.xpath(".\/div\/div\/a[@class='pinattn good']\/p\/text()")[0]\n            data["bad_bs"] = self.analyze_zan(article, "bad")\n            # article.xpath(".\/div\/div\/a[@class='pinattn bad']\/p\/text()")[0]\n            lines.append(data)\n        return lines\n\n    # \u89e3\u6790\u6587\u7ae0\u5185\u5bb9\n    def analyze_content(self, article):\n        # 1. \u5224\u65addd\u6807\u7b7e\u4e0b\u662f\u5426\u4e3a\u6587\u672c\u5185\u5bb9\n        content = article.xpath(".\/dd\/text()")\n        if content is not None and len(content) > 0 and not self.is_empty_list(content):\n            return content\n\n        content = []\n        p_list = article.xpath(".\/dd")\n        for p in p_list:\n            # 2. \u5224\u65addd\/...\/font\u6807\u7b7e\u4e0b\u662f\u5426\u4e3a\u6587\u672c\u5185\u5bb9\n            if len(content) <= 0 or content is None:\n                fonts = p.xpath(".\/\/font")\n                for font_html in fonts:\n                    font_content = font_html.xpath(".\/text()")\n                    if font_content is not None and len(font_content) > 0:\n                        content.append(font_content)\n\n            # 3. \u5224\u65addd\/...\/p\u6807\u7b7e\u4e0b\u662f\u5426\u4e3a\u6587\u672c\u5185\u5bb9\n            if len(content) <= 0 or content is None:\n                fonts = p.xpath(".\/\/p")\n                for font_html in fonts:\n                    font_content = font_html.xpath(".\/text()")\n                    if font_content is not None and len(font_content) > 0:\n                        content.append(font_content)\n\n        return content\n\n    def analyze_zan(self, article, type):\n        num = article.xpath(".\/div\/div\/a[@class='pinattn " + type + "']\/p\/text()")\n        if num is not None and len(num) > 0:\n            return num[","orderid":"0","title":" \u4e00\u6761\u722c\u866b\u6293\u53d6\u4e00\u4e2a\u5c0f\u7f51\u7ad9\u6240\u6709\u6570\u636e(\u4e00)","smalltitle":"","mid":"0","fname":"Python","special_id":"0","bak_id":"0","info":"0","hits":"651","pages":"2","comments":"0","posttime":"2023-09-23 15:44:25","list":"1695455065","username":"admin","author":"","copyfrom":"","copyfromurl":"","titlecolor":"","fonttype":"0","titleicon":"0","picurl":"https:\/\/www.cppentry.com\/upload_files\/","ispic":"0","yz":"1","yzer":"","yztime":"0","levels":"0","levelstime":"0","keywords":"","jumpurl":"","iframeurl":"","style":"","template":"a:3:{s:4:\"head\";s:0:\"\";s:4:\"foot\";s:0:\"\";s:8:\"bencandy\";s:0:\"\";}","target":"0","ip":"112.94.1.100","lastfid":"0","money":"0","buyuser":"","passwd":"","allowdown":"","allowview":"","editer":"","edittime":"0","begintime":"0","endtime":"0","description":" \u4e00\u6761\u722c\u866b\u6293\u53d6\u4e00\u4e2a\u5c0f\u7f51\u7ad9\u6240\u6709\u6570\u636e","lastview":"1715493203","digg_num":"750","digg_time":"1715789013","forbidcomment":"0","ifvote":"0","heart":"","htmlname":"","city_id":"0"},"page":"1"}