Python Study Notes - Simple Web Page Crawling (Part 2)

# Imports this snippet relies on (BeautifulSoup here is bs4)
import time
import urllib2
from bs4 import BeautifulSoup

# Download the page and parse it
content = urllib2.urlopen(url).read()
html = BeautifulSoup(content)
# Save the fetched page: keep at most the first 9999 characters of <body>, mark it as fetched (status 1)
fetch_resource = Resource(url, None, str(html.find('body'))[0:9999], 1)
fetch_resource.updateContentAndStatus()
# Collect every <a> tag and queue its target URL as a not-yet-fetched resource (status 0)
aLinks = html.find_all('a')
print 'aLinks %s' % aLinks
for aLink in aLinks:
    href = aLink.get('href')
    a_text = CodeHelper.encodeContent(aLink.get_text())
    print 'href %s text %s' % (href, a_text)
    subResource = Resource(href, a_text, '', 0)
    subResource.insert()
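
The Resource class and the CodeHelper utility are defined in an earlier part of these notes and do not appear in this excerpt. As a rough sketch of what Resource needs to support for the code above to work, here is a minimal sqlite3-backed version; the DB_PATH file name, the table layout, and the column order (url, title, content, status) are assumptions, not the original implementation.

import sqlite3

DB_PATH = 'resources.db'  # hypothetical database file, not from the original notes

class Resource(object):
    """One row in the crawl queue: a URL, its anchor text, the fetched body,
    and a status flag (0 = pending, 1 = fetched)."""

    def __init__(self, url, title, content, status):
        self.url = url
        self.title = title
        self.content = content
        self.status = status

    def insert(self):
        # Queue the URL; ignore duplicates so the same link is stored only once
        conn = sqlite3.connect(DB_PATH)
        conn.execute('CREATE TABLE IF NOT EXISTS resource '
                     '(url TEXT PRIMARY KEY, title TEXT, content TEXT, status INTEGER)')
        conn.execute('INSERT OR IGNORE INTO resource VALUES (?, ?, ?, ?)',
                     (self.url, self.title, self.content, self.status))
        conn.commit()
        conn.close()

    def updateContentAndStatus(self):
        # Record the fetched body and flip the status to 1 (fetched)
        conn = sqlite3.connect(DB_PATH)
        conn.execute('UPDATE resource SET content = ?, status = ? WHERE url = ?',
                     (self.content, self.status, self.url))
        conn.commit()
        conn.close()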

def execute():
    # Seed the resource table with a few start URLs, all marked as not yet fetched (status 0)
    urls = ['http://www.kuwo.cn', 'http://www.1ting.com/', 'http://www.kugou.com/', 'http://y.**.com/']
    for url in urls:
        # empty content string so the call matches the four-argument Resource calls above
        resource = Resource(url, None, '', 0)
        resource.insert()

    # Let the ResourceManager drive the crawl and time the whole run
    start = time.time()
    resource_manager = ResourceManager(20, 4)
    resource_manager.wait_for_complete()
    end = time.time()
    print "total time cost: %s" % (end - start)


if __name__ == '__main__':
    execute()