李毅吧是一个比较大的百度贴吧,里面的热门内容都是非常好的内容。所以,我就写了一个抓取里面热门帖子图片的脚本,现在分享出来给大家,希望大家一起学习进步:
import os
import random
import re
import threading  # unused here; kept because the original script imports it
import time
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
from lxml import etree  # unused here; kept because the original script imports it

# Characters that are path separators or are rejected in file names on common
# file systems, plus ASCII control characters.
_UNSAFE_PATH_CHARS = re.compile(r'[\\/:*?"<>|\x00-\x1f]')


def _safe_dirname(raw, fallback):
    """Return ``raw`` reduced to a usable directory name.

    Unsafe characters are replaced by ``_`` and surrounding spaces/dots are
    removed. When nothing usable is left, ``fallback`` is cleaned the same
    way; the last resort is the literal ``'untitled'``.
    """
    for candidate in (raw, fallback):
        cleaned = _UNSAFE_PATH_CHARS.sub('_', candidate or '').strip(' .')
        if cleaned:
            return cleaned
    return 'untitled'


class tieba():
    """Download the images posted in busy threads of one Baidu Tieba forum.

    ``shouye`` walks the forum's listing pages, ``shuaixuan`` filters the
    listed threads by reply count, ``ziye`` opens a selected thread and
    ``img_load`` saves the images found in it.

    Args:
        tiebaname: Forum name, inserted into the listing URL as ``kw``.
        pages: Number of listing pages to scan (the original script used 2).
        min_replies: A thread is downloaded only when its reply count is
            strictly greater than this value (the original used 100).
        save_root: Directory under which one sub-directory per thread is
            created (the original used ``/tiebaimgs/``).
    """

    BASE_URL = 'https://tieba.baidu.com'
    # NOTE(review): Tieba's ``pn`` query parameter is understood to be a
    # thread offset with 50 threads per listing page, not a page index.
    # Confirm against the live site.
    THREADS_PER_PAGE = 50
    # Seconds before an HTTP request is abandoned instead of hanging forever.
    REQUEST_TIMEOUT = 10

    def __init__(self, tiebaname, pages=2, min_replies=100,
                 save_root='/tiebaimgs/'):
        # Running index used to name the image files of the current thread.
        self.i = 0
        UserAgentlist = [
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko)',
            'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0) Gecko/20100101',
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:81.0) Gecko/20100101',
        ]
        # One User-Agent is picked per instance and reused for every request.
        self.headers = {'User-Agent': random.choice(UserAgentlist)}
        self.tiebaname = tiebaname
        self.pages = pages
        self.min_replies = min_replies
        self.save_root = save_root

    def res(self, url):
        """Fetch ``url`` and return the response body parsed by BeautifulSoup.

        Raises:
            requests.RequestException: on connection errors or timeouts.
        """
        response = requests.get(url, headers=self.headers,
                                timeout=self.REQUEST_TIMEOUT)
        return BeautifulSoup(response.text, 'lxml')

    def shouye(self):
        """Scan the forum listing pages and process every thread row found."""
        for page in range(self.pages):
            offset = page * self.THREADS_PER_PAGE
            url = (self.BASE_URL + '/f?kw=' + self.tiebaname
                   + '&ie=utf-8&pn=' + str(offset))
            try:
                soup = self.res(url)
            except requests.RequestException as exc:
                # One unreachable listing page must not stop the crawl.
                print('skip listing {}: {}'.format(url, exc))
                continue
            for row in soup.find_all('div', class_='t_con cleafix'):
                self.shuaixuan(str(row))

    def shuaixuan(self, title):
        """Decide whether one listing row is worth downloading.

        Args:
            title: HTML source of a single thread row, as a string.

        Rows containing an ``<i alt=...>`` badge are skipped (the original
        code calls these "zhiding", i.e. pinned). Rows without a parsable
        reply count or link are skipped instead of raising.
        """
        if re.search('<i alt=(.*?)</i>', title):
            return
        replies = re.findall('title="回复">(.*?)</span>', title)
        links = re.findall('href="(.*?)"', title)
        if not replies or not links:
            return
        try:
            reply_count = int(replies[0])
        except ValueError:
            return
        if reply_count <= self.min_replies:
            return
        # urljoin leaves absolute links alone and prefixes site-relative ones.
        url = urljoin(self.BASE_URL, links[0])
        print(url)
        self.ziye(url)

    def ziye(self, url):
        """Open one thread page and save the images found in its posts.

        Images are written to ``<save_root>/<thread title>/``; ``self.i`` is
        reset so numbering restarts at 0 for each thread.
        """
        try:
            soup = self.res(url)
        except requests.RequestException as exc:
            print('skip thread {}: {}'.format(url, exc))
            return

        heading = soup.find('h3')
        if heading is None:
            heading = soup.find('h1')

        raw_title = ''
        if heading is not None:
            # Prefer the heading's own title attribute, then the first
            # descendant carrying one, then the visible text.
            raw_title = heading.get('title') or ''
            if not raw_title:
                inner = heading.find(title=True)
                raw_title = inner['title'] if inner is not None else heading.get_text()

        # Fall back to the last URL segment so that threads without a usable
        # title do not all share one directory.
        title = _safe_dirname(raw_title, url.rstrip('/').rsplit('/', 1)[-1])
        print(title)

        lujing = os.path.join(self.save_root, title) + '/'
        try:
            os.makedirs(lujing, exist_ok=True)
        except OSError as exc:
            print('cannot create {}: {}'.format(lujing, exc))
            return

        self.i = 0
        for post in soup.find_all('div', class_='d_post_content j_d_post_content'):
            # Read the attribute from the parsed tree: values taken from
            # str(tag) would still be HTML-escaped (e.g. "&amp;").
            imgs = [tag['src'] for tag in post.find_all(src=True)]
            if imgs:
                self.img_load(imgs, lujing)

    def img_load(self, imgs, lujing):
        """Download every URL in ``imgs`` into the directory ``lujing``.

        Args:
            imgs: Iterable of image URLs (absolute, protocol-relative or
                site-relative).
            lujing: Path prefix the file name is appended to; it must end
                with a path separator.

        Files are named ``<self.i>.jpg``; ``self.i`` advances after each
        image that is actually written. URLs containing ``gsp0`` are skipped
        (presumably non-content assets such as emoticons; verify).
        """
        for imgurl in imgs:
            if not imgurl or 'gsp0' in imgurl:
                continue
            imgurl = urljoin(self.BASE_URL, imgurl)
            if not imgurl.startswith(('http://', 'https://')):
                # e.g. data: URIs, which requests cannot fetch.
                continue
            print(imgurl)
            print(self.i)
            try:
                response = requests.get(url=imgurl, headers=self.headers,
                                        timeout=self.REQUEST_TIMEOUT)
            except requests.RequestException as exc:
                print('skip image {}: {}'.format(imgurl, exc))
                continue
            if response.status_code != 200:
                # Do not store an error page under a .jpg name.
                print('skip image {}: HTTP {}'.format(imgurl, response.status_code))
                continue
            with open(lujing + str(self.i) + '.jpg', 'wb') as f:
                f.write(response.content)
            self.i += 1
            # Pause between downloads to keep the request rate low.
            time.sleep(1)


if __name__ == '__main__':
    liyi = tieba('李毅')
    liyi.shouye()
下一篇: python如何写一个zblog采集程序
上一篇:python如何实现百度贴吧自动回复
评论