page-header h1')[0].text.strip()
14 nt = datetime.strptime(soup.select('.time-source')[0].contents[0].strip(),'%Y年%m月%d日%H:%M')
15 newsTime = datetime.strftime(nt,'%Y-%m-%d %H:%M')
16 newsArticle = getnewsArticle(soup.select('.article p'))
17 newsAuthor = newsArticle[-1]
18 return newsTitle,newsTime,newsArticle,newsAuthor
def getnewsArticle(news):
    """Return the stripped text of every paragraph element in *news*.

    Args:
        news: Iterable of objects exposing a ``text`` attribute (for
            example the elements returned by ``soup.select('.article p')``).

    Returns:
        A list of strings, one per element and in the same order, with
        leading and trailing whitespace removed.
    """
    return [p.text.strip() for p in news]
24
# Get the number of comments for a news article.

def getCommentCount(newsurl):
    """Return the total comment count reported for the article at *newsurl*.

    The article id is the part of the URL between ``doc-i`` and ``.shtml``;
    it is substituted into the comment-API URL, and the ``total`` field of
    the decoded response is returned.

    Args:
        newsurl: Article URL containing a ``doc-i<id>.shtml`` component.

    Returns:
        The value stored at ``['result']['count']['total']`` in the response.

    Raises:
        ValueError: If no article id can be extracted from *newsurl*, or if
            the response body is not valid JSON after the JavaScript
            assignment prefix is removed.
    """
    # Raw string with an escaped dot so that only a literal ".shtml" matches.
    m = re.search(r'doc-i(.+)\.shtml', newsurl)
    if m is None:
        raise ValueError('cannot extract news id from url: {!r}'.format(newsurl))
    newsid = m.group(1)
    commenturl = 'http://comment5.news.sina.com.cn/page/info?version=1&format=js&channel=gn&newsid=comos-{}&group=&compress=0&ie=utf-8&oe=utf-8&page=1&page_size=20'
    # The "{}" placeholder in the URL is filled with the article id; the
    # timeout keeps a stalled connection from blocking the scrape forever.
    comment = requests.get(commenturl.format(newsid), timeout=10)
    # The body is a JavaScript assignment ("var data={...}").  str.lstrip()
    # strips a *set of characters*, not a prefix, so drop everything up to
    # and including the first "=" explicitly instead.
    payload = comment.text.strip()
    if payload.startswith('var'):
        payload = payload.partition('=')[2]
    jd = json.loads(payload)
    return jd['result']['count']['total']
34
35
def getNewsLinkUrl(pageCount=9):
    """Collect the article URLs from the asynchronously loaded news list.

    The roll-news API is paginated; each page is requested in turn and the
    URLs it lists are appended to a single flat list.

    Args:
        pageCount: Number of pages to fetch, starting at page 1.  The
            default of 9 fetches pages 1 through 9.

    Returns:
        A list of article URLs, in page order.

    Raises:
        ValueError: If a response is not wrapped in ``callback( ... )`` or
            the wrapped payload is not valid JSON.
    """
    urlFormat = 'http://api.roll.news.sina.com.cn/zt_list?channel=news&cat_1=gnxw&cat_2==gdxw1||=gatxw||=zs-pl||=mtjj&level==1||=2&show_ext=1&show_all=1&show_num=22&tag=1&format=json&page={}&callback=newsloadercallback&_=1501000415111'
    url = []
    for i in range(1, pageCount + 1):
        # The timeout keeps a stalled connection from hanging the whole run.
        res = requests.get(urlFormat.format(i), timeout=10)
        # The response is JSONP: "newsloadercallback({...});".  Take what lies
        # between the first "(" and the last ")" rather than using
        # lstrip()/rstrip(), which strip character sets and not affixes.
        text = res.text
        jd = json.loads(text[text.index('(') + 1:text.rindex(')')])
        # extend() adds each URL individually; append() would nest the list.
        url.extend(getUrl(jd))
    return url
45
def getUrl(jd):
    """Return the article URLs listed on one page of the news feed.

    Args:
        jd: Decoded JSON of one result page; ``jd['result']['data']`` must
            be an iterable of mappings that each carry a ``'url'`` key.

    Returns:
        A list with the ``'url'`` value of every entry, in feed order.
    """
    return [item['url'] for item in jd['result']['data']]
52
# Gather the time, editor, body, title and comment count of every article
# and combine them in total_2.
def getNewsDetial():
    """Build column-oriented data for every article found by the crawler.

    For each URL returned by ``getNewsLinkUrl()`` the article details are
    looked up once with ``getNewsdetial(url)`` and the comment count with
    ``getCommentCount(url)``.

    Returns:
        A dict of equally long lists keyed ``'a_title'``, ``'b_article'``,
        ``'c_commentCount'``, ``'d_time'`` and ``'e_editor'``; position *i*
        in every list belongs to the same article.
    """
    title_all = []
    author_all = []
    commentCount_all = []
    article_all = []
    time_all = []
    for url in getNewsLinkUrl():
        # Look the article up once and reuse the result for all four fields,
        # instead of calling getNewsdetial() again for each one.
        # NOTE(review): getNewsdetial is defined outside this block; it is
        # indexed as (title, time, article, author) exactly as before.
        detail = getNewsdetial(url)
        title_all.append(detail[0])
        time_all.append(detail[1])
        article_all.append(detail[2])
        author_all.append(detail[3])
        commentCount_all.append(getCommentCount(url))
    total_2 = {'a_title':title_all,'b_article':article_all,'c_commentCount':commentCount_all,'d_time':time_all,'e_editor':author_all}
    return total_2
69
# (Entry point) Collect the details of every article, load them into a
# pandas DataFrame and export the table as an Excel workbook.

df = pandas.DataFrame(data=getNewsDetial())
df.to_excel(excel_writer='news2.xlsx')
存储的excel文档如下:
TIPS:
问题:在jupyter notebook导入pandas时可能会出现导入错误
解决:不要用命令行打开jupyter notebook,直接找到软件打开或者在Anaconda Navigator中打开
2017-07-29 21:49:37
|