TOP

Python爬虫框架(一)
2019-08-04 00:19:38 】 浏览:74
Tags:Python 爬虫 框架

本文章的源代码来源于https://github.com/Holit/Web-Crawler-Framwork

一、爬虫框架的代码

  1 import urllib.request
  2 from bs4 import BeautifulSoup
  3 import re
  4 import time
  5 import _thread
  6 
  7 # Input your Url here####################################
  8 BaseURL = '127.0.0.1/'
  9 #########################################################
 10 TaxURL = ".html"
 11 
 12 #Input your data-saving path ############################
 13 SavePath = ""
 14 #########################################################
 15 
 16 #Input your threads count ###############################
 17 thread_count = 1
 18 #########################################################
 19 
 20 #Set how many pages each spider will crawl ################
 21 thread_spy_count_ench = 5
 22 #########################################################
 23 def mkdir(path):
 24     # Create the directory
 25     import os
 26     path=path.strip()
 27     path=path.rstrip("\\")
 28     isExists=os.path.exists(path)
 29     if not isExists:
 30         os.makedirs(path)
 31         return True
 32     else:
 33         return False
 34 
 35 def download(start, count):
 36     #Spider main
 37     for i in range(start,start + count):
 38         try:
 39             #DEBUG##################################################
 40             #print("[INFO] Connecting to page #" + str(i) + "...")
 41             ########################################################
 42             
 43             #Used to record time
 44             time_start=time.time()
 45             
 46             #Construct url
 47             #This only works for URLs like
 48             # https://127.0.0.1/articles/00001.html
 49             # https://127.0.0.1/articles/00002.html
 50             # https://127.0.0.1/articles/00003.html
 51             TargetURL = BaseURL + str(i) + TaxURL
 52             
 53             
 54             #create Request object
 55             req = urllib.request.Request(TargetURL)
 56             #create headers using general header, you could find this by Fiddler(R) or by Chrome(R)
 57             req.add_header('Host','www.69aue.com')    #Your Host, usally set as url-base
 58             req.add_header('Referer',TargetURL)        #Your Referer, usally set as url
 59             req.add_header('User-Agent', 'Mozilla/5.0 (Linux; Android 4.1.1; Nexus 7 Build/JRO03D) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.166  Safari/535.19')
 60             #finishing create Request object
 61             
 62             #get information
 63             res = urllib.request.urlopen(req)
 64             #decode the html
 65             soup = BeautifulSoup(res,"html.parser")
 66             ##############################################################
 67             #add your functions  here....
 68             
 69             #operate_data(data)
 70             
 71             #use soup to find the div that holds the information.
 72             #soup can operate on html tags very easily, by using soup.find(...)
 73             ##############################################################
 74             
 75             #Change saving path here.
 76             savetarget = SavePath
 77             
 78             #try to save the file
 79             try:
 80                 #create the directory if it doesn't exist
 81                 mkdir(SavePath+"\\"+str(zone)+"\\")
 82                 #using open...
 83                 f = open(savetarget,'w')
 84                 
 85                 #edit this
 86                 f.write("data")
 87                 
 88             except Exception as e:
 89                 time_end=time.time()
 90                 print("  [Failed] - #" + str(i) + " Error : " + str(e))
 91             else:
 92             
 93                 time_end=time.time()
 94                 print("  [Succeed] - #" + str(i) + " has saved to path.("+str(time_end-time_start)+"s)")
 95 
 96             pass
 97         except Exception as e:
 98             print("  [Glob  
		
Python爬虫框架(一) https://www.cppentry.com/bencandy.php?fid=77&id=228374

首页 上一页 1 2 下一页 尾页 1/2/2
】【打印繁体】【投稿】【收藏】 【推荐】【举报】【评论】 【关闭】 【返回顶部
上一篇Python——继承 下一篇Windows环境下Python3安装Pyspider