Building on the one-click script I wrote a few days ago that crawls all of a site's links, I derived two more scripts: one checks a site's friendly (reciprocal) links, the other hunts for dead links on a site. Both are rough first implementations with plenty left to optimize. For example, the friendly-link checker can't recognize the www and non-www forms of a domain as the same site, and the dead-link checker runs painfully slowly; that second problem is beyond my current ability to fix.
People like to joke that if you learn scraping too well, you'll eat your fill of "prison food". That's why every crawler tutorial urges readers to scrape gently, but my skills are still modest enough that I can tinker freely without doing real harm. If you'd like to learn together, add me on WeChat so we can discuss and study with each other. The source code of both scripts is shared below for your enjoyment. ^_^
1. Friendly link checker
import requests
from bs4 import BeautifulSoup
import time

url = input("Enter the main domain: ")
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'}

def shouye():
    # Fetch the homepage and collect every outbound link
    r = requests.get(url, headers=headers)
    soup = BeautifulSoup(r.content.decode('utf-8', 'ignore'), 'lxml')
    suoyoua = soup.find_all('a')
    alla = []
    for lia in suoyoua:
        dana = lia.get("href")
        try:
            # Keep only absolute links that point to other sites
            if dana.find('http') != -1 and dana.find(url) == -1:
                alla.append(dana)
        except:
            # <a> tag without an href (dana is None): skip it
            continue
    # Deduplicate while preserving order
    alla = sorted(set(alla), key=alla.index)
    fanhui(alla)

def fanhui(alla):
    # Visit each partner site and check whether it links back to us
    for duiurl in alla:
        try:
            r = requests.get(duiurl, headers=headers)
        except:
            print('Site unreachable:', duiurl)
            continue
        try:
            soup = BeautifulSoup(r.content.decode('utf-8', 'ignore'), 'lxml')
        except Exception as ex:
            print(duiurl, ex)
            continue
        suoyoua = soup.find_all('a')
        sya = []
        for lia in suoyoua:
            dana = lia.get("href")
            sya.append(dana)
        sya = str(sya)
        if sya.find(url) == -1:
            print('This site does not link back to us:', duiurl)

if __name__ == '__main__':
    startime = time.time()
    shouye()
    endtime = time.time()
    thetime = endtime - startime
    print(thetime)
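One way to soften the www problem mentioned at the top would be to compare normalized hostnames instead of raw substrings. Here is a minimal sketch of that idea, assuming the site is identified by its hostname with any leading "www." stripped; the helpers norm_host and same_site are my own names, not part of the script above:

from urllib.parse import urlparse

def norm_host(u):
    # 'http://www.example.com/page' and 'http://example.com/page'
    # both normalize to 'example.com'
    host = urlparse(u).netloc.lower()
    return host[4:] if host.startswith('www.') else host

def same_site(a, b):
    return norm_host(a) == norm_host(b)

# e.g. the check  sya.find(url) == -1  could become
# not any(same_site(h, url) for h in sya if h and h.startswith('http'))

This only covers the www prefix; matching other subdomains or https/http variants would need further rules.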
2. Dead link checker
import requests
from bs4 import BeautifulSoup
# Threads (not processes)
from threading import Thread
import time

bbb = []

def shouye():
    url = input("Enter the main domain: ")
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'}
    r = requests.get(url, headers=headers)
    soup = BeautifulSoup(r.content.decode('utf-8'), 'lxml')
    suoyoua = soup.find_all('a')
    alla = []
    for lia in suoyoua:
        dana = lia.get("href")
        if dana:  # skip <a> tags without an href
            alla.append(dana)
    # Deduplicate while preserving order
    alla = sorted(set(alla), key=alla.index)
    # Start one crawler thread per homepage link
    t_list = []
    for lianjie in alla:
        t = Thread(target=neiye, args=(lianjie, url))
        t_list.append(t)
        t.start()
    # Wait for all threads to finish
    for t in t_list:
        t.join()

def neiye(lianjie, url):
    global bbb
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'}
    # Turn relative links into absolute ones
    if lianjie.find(url) != -1:
        ciurl = lianjie
    elif lianjie.find('http') == -1 and lianjie.find('/') != -1:
        ciurl = url + lianjie
    else:
        ciurl = url + '/' + lianjie
    try:
        r = requests.get(ciurl, headers=headers)
        soup = BeautifulSoup(r.content.decode('utf-8'), 'lxml')
        suoyoua = soup.find_all('a')
    except:
        # Unreachable or unparseable page; the final loop re-checks every URL anyway
        return
    alla = []
    for lia in suoyoua:
        dana = lia.get("href")
        if dana:
            alla.append(dana)
    # Deduplicate while preserving order
    alla = sorted(set(alla), key=alla.index)
    # Recurse into every link we haven't seen yet
    for lian2 in alla:
        if lian2 in bbb:
            continue
        bbb.append(lian2)
        neiye(lian2, url)

if __name__ == '__main__':
    startime = time.time()
    shouye()
    bbb = sorted(set(bbb), key=bbb.index)
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'}
    num = 0
    for ads in bbb:
        if ads.find('http') == -1:
            ads = 'http://zhuxiaoedu.com' + ads
        print(num, ads)
        num += 1
        try:
            r = requests.get(ads, headers=headers)
        except Exception as e:
            print(e)
            continue
        print(r.status_code)
    endtime = time.time()
    thetime = endtime - startime
    print(thetime)
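On the speed problem I mentioned at the top: one easy win is the final status-check loop, which downloads every collected URL serially with GET. Below is a minimal sketch of checking them concurrently with a fixed-size thread pool and HEAD requests, which fetch only the status line and headers; check and check_all are hypothetical helper names of my own, and some servers reject HEAD, so a GET fallback may be needed:

from concurrent.futures import ThreadPoolExecutor
import requests

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'}

def check(ads):
    # Returns (url, status code) or (url, error message)
    try:
        r = requests.head(ads, headers=headers, timeout=10, allow_redirects=True)
        return ads, r.status_code
    except Exception as e:
        return ads, str(e)

def check_all(urls, workers=20):
    # A pool of worker threads checks many URLs at once
    with ThreadPoolExecutor(max_workers=workers) as pool:
        for ads, status in pool.map(check, urls):
            print(ads, status)

# e.g. after the crawl:  check_all(bbb)  instead of the serial loop above

This doesn't touch the crawl itself, whose recursive page-by-page fetching is the bigger cost, but it cuts the checking phase considerably.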