本文章的源代码来源于https://github.com/Holit/Web-Crawler-Framwork
一、爬虫框架的代码
# -*- coding: utf-8 -*-
"""Minimal multi-threaded web-crawler skeleton.

Reconstructed from a copy-paste in which editor line numbers were fused
into the code text.  The crawler fetches pages of the form
``BaseURL + page_number + TaxURL``, parses them with BeautifulSoup, and
saves extracted data under ``SavePath``.

NOTE(review): the original source is truncated right after the global
exception handler of ``download()``.  The thread-spawning code that
presumably used ``thread_count`` / ``thread_spy_count_ench`` / ``_thread``
is not visible in this chunk and has NOT been reconstructed.
"""

import os
import re       # imported by the original; its use is not visible in this chunk
import time
import _thread  # imported by the original for the (truncated) thread-spawn code
import urllib.request

from bs4 import BeautifulSoup

# Input your Url here ####################################
BaseURL = '127.0.0.1/'
#########################################################
TaxURL = ".html"

# Input your data-saving path ############################
SavePath = ""
#########################################################

# Input your threads count ###############################
thread_count = 1
#########################################################

# Set how many pages each spider will crawl ##############
thread_spy_count_ench = 5
#########################################################


def mkdir(path):
    """Create directory *path* (including parents) if it does not exist.

    Returns:
        True if the directory was created, False if it already existed.
    """
    # Strip surrounding whitespace and a trailing backslash, as the
    # callers build Windows-style paths ending in "\\".
    path = path.strip().rstrip("\\")
    if not os.path.exists(path):
        os.makedirs(path)
        return True
    return False


def download(start, count):
    """Spider main loop: fetch and save pages #start .. #start+count-1.

    Args:
        start: first page number to fetch.
        count: how many consecutive pages to fetch.
    """
    for i in range(start, start + count):
        try:
            # Used to report per-page elapsed time on success.
            time_start = time.time()

            # Construct url.  This only works for flat numeric schemes like
            #   https://127.0.0.1/articles/00001.html
            TargetURL = BaseURL + str(i) + TaxURL

            # Build the request with a browser-like header set; suitable
            # values can be found with Fiddler(R) or Chrome(R) devtools.
            req = urllib.request.Request(TargetURL)
            req.add_header('Host', 'www.69aue.com')  # usually the url base
            req.add_header('Referer', TargetURL)     # usually the url itself
            req.add_header('User-Agent', 'Mozilla/5.0 (Linux; Android 4.1.1; Nexus 7 Build/JRO03D) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.166 Safari/535.19')

            # Fetch and parse the page.
            res = urllib.request.urlopen(req)
            soup = BeautifulSoup(res, "html.parser")
            ##############################################################
            # add your functions here....
            #
            # operate_data(data)
            #
            # soup can extract information from html tags very easily,
            # e.g. via soup.find(...)
            ##############################################################

            # Change saving path here.
            savetarget = SavePath

            # Trying to save the file.
            try:
                # Create the directory if it doesn't exist yet.
                # BUG FIX: the original referenced an undefined name
                # `zone` here, so every save raised NameError; the page
                # number is used instead.
                mkdir(SavePath + "\\" + str(i) + "\\")
                # BUG FIX: `with` guarantees the handle is closed; the
                # original leaked an open file on every iteration.
                with open(savetarget, 'w') as f:
                    # edit this
                    f.write("data")
            except Exception as e:
                print(" [Failed] - #" + str(i) + " Error : " + str(e))
            else:
                time_end = time.time()
                print(" [Succeed] - #" + str(i) + " has saved to path.(" + str(time_end - time_start) + "s)")
        except Exception as e:
            # NOTE(review): the source is truncated here ("[Glob...");
            # this handler is minimally reconstructed as a failure report.
            print(" [Global Failure] - #" + str(i) + " Error : " + str(e))