
Python crawler multi-threading and multi-processing example

Posted: 2023-06-15 15:16:21

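The complete script below crawls an image-gallery site: searchurl() reads a listing page entered by the user and collects its gallery ("item") links, and __main__ then starts one worker per gallery. Two interchangeable worker classes are provided, My_searchThread0 (a threading.Thread subclass, used by default) and My_searchThread (a multiprocessing.Process subclass), so the same code demonstrates both the multi-threaded and the multi-process approach. Each worker resolves its gallery's pagination with Nextall_page() and downloads every page's images.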

# -*- coding: utf-8 -*-
'''Meitulu (美图录) gallery crawler'''
# -------------------------------------------------
import re
import os
import time
import datetime
import urllib.request

import requests
from bs4 import BeautifulSoup
from PIL import Image
from io import BytesIO
from threading import Thread
from multiprocessing import Process
# -------------------------------------------------


class Web_IMG_Crawler(object):
    # Initialisation (left disabled by the author)
    # def __init__(self, url, de_code):
    #     self.url = url
    #     self.de_code = de_code
    #     self.soup = soup
    #     self.file_path = file_path
    #     self.file_name = file_name
    #     self.count = count

    # Anti-anti-crawler: spoof the User-Agent and derive a Referer
    # from the site root so image requests look like page views.
    def ant_Crawler(self, url):
        header = {
            'User-Agent': 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)',
            'Referer': '',
        }
        FirstPAGE = re.findall(r"//(.+?)/", url)[0]
        header['Referer'] = 'http://' + FirstPAGE
        return header

    # Fetch a URL and parse it.
    def JIEXI_WEB(self, url, de_code):
        open_url = urllib.request.urlopen(url)       # request and open the URL
        # Decode with the page's charset -- open the target page in a
        # browser and use "view source" to check its encoding.
        html = open_url.read().decode(de_code)
        soup = BeautifulSoup(html, features="lxml")  # BeautifulSoup object, lxml parser
        return soup

    # Deduplicate a URL list, preserving order.
    def QU_CHONG(self, all_url_list):
        end_list = []
        for element in all_url_list:
            if element not in end_list:
                end_list.append(element)
        return end_list

    # -------------------------------------------------
    # Save a single image with urlretrieve (unused alternative saver).
    def SAVE_IMG(self, file_path, url_img, file_name):
        try:
            # Create the target directory if it does not exist yet.
            if not os.path.exists(file_path):
                os.makedirs(file_path)
            # Take the file extension from the image URL.
            file_suffix = os.path.splitext(url_img)[1]
            print(file_suffix)
            # Assemble the full file name, including the path.
            file_name = '{}{}{}{}'.format(file_path, os.sep, file_name, file_suffix)
            print(file_name)
            # Download the image into the folder.
            urllib.request.urlretrieve(url_img, filename=file_name)
        except IOError:
            print("IOError")
        except Exception:
            print("Exception")

    # Download every image in IMG_link_set.  The author's threaded
    # variant is kept (disabled) in the docstring, with two bugs fixed.
    def SAVE_IMAGE(self, IMG_link_set, url, file_path, file_name, sid):
        """
        def download(num):
            PAGE = 'page' + str(sid + 1) + '-img' + str(num)
            file_name0 = file_name + PAGE + '.jpg'
            html_IMG = requests.get(IMG_link_set[num],
                                    headers=Web_IMG_Crawler().ant_Crawler(url))
            try:
                image = Image.open(BytesIO(html_IMG.content))
                image.save(file_path + file_name0)
                print('Downloaded %s, file name: %s' % (PAGE, file_name0))
            except Exception:
                print("IOError")

        class My_imgThread(Thread):
            def __init__(self, num):
                Thread.__init__(self)
                self.num = num
            def run(self):
                download(self.num)

        threadsimg = []
        for num in range(len(IMG_link_set)):
            j = My_imgThread(num)   # fixed: the original passed the undefined name i
            threadsimg.append(j)
            j.start()
        for j in threadsimg:        # fixed: the original joined a list named threads
            j.join()
        """
        sum = 1
        for url_img in IMG_link_set:
            PAGE = 'page' + str(sid) + '-img' + str(sum)
            file_name0 = file_name + PAGE + '.jpg'
            sum = sum + 1
            html_IMG = requests.get(url_img, headers=Web_IMG_Crawler().ant_Crawler(url))
            try:
                image = Image.open(BytesIO(html_IMG.content))
                image.save(file_path + file_name0)
                print('Downloaded %s, file name: %s' % (PAGE, file_name0))
            except Exception:
                print("IOError")

    # Build the list of pagination URLs for a gallery.
    def Nextall_page(self, soup, url):
        url_list = []
        url_list0 = [url]
        stype_url = re.findall(r"item/(.+?).html", url)[0]
        for list_url in soup.find_all('a', href=re.compile(stype_url)):
            url_list.append(list_url.get('href'))
        url_list = Web_IMG_Crawler().QU_CHONG(url_list)
        # The last pagination link looks like ..._N.html, so N + 1
        # bounds the page numbers.
        web_url_len = int(re.findall(r"_(.+?).html", url_list[-1])[0]) + 1
        urlf = re.findall(r"(.+?).html", url)[0]
        for urllast in range(2, web_url_len):
            url_list0.append(urlf + '_' + str(urllast) + '.html')
        return url_list0

    # Collect the image URLs on one page and hand them to SAVE_IMAGE.
    def HQ_IMG_SQL(self, soup, url, file_path, file_name, sid):
        """
        Match image tags by a pattern taken from the input URL:
            stype_url = re.findall(r"item/(.+?).html", url)[0]
            src = re.compile(stype_url)
            src.get("src")
        """
        stype_url = re.findall(r"item/(.+?).html", url)[0]
        url_IMG_list = []
        for src in soup.find_all(src=re.compile(stype_url)):
            url_IMG_list.append(src.get("src"))   # extract the image URL
        IMG_link_set = Web_IMG_Crawler().QU_CHONG(url_IMG_list)
        # return IMG_link_set
        Web_IMG_Crawler().SAVE_IMAGE(IMG_link_set, url, file_path, file_name, sid)

    # Alternative: match image tags by the "jpg" marker in the source.
    def HQ_IMG_SQL2(self, soup):
        """
        Match image tags by the jpg pattern in the page source:
            originalsrc = re.compile("jpg")
            src.get("originalsrc")
        """
        url_IMG_list2 = []
        for src in soup.find_all(originalsrc=re.compile("jpg")):
            url_IMG_list2.append(src.get("originalsrc"))   # extract the image URL
        IMG_link_set2 = Web_IMG_Crawler().QU_CHONG(url_IMG_list2)
        return IMG_link_set2

    # -------------------------------------------------
    # Get the image name from the "title" attribute.
    def NAME_IMG(self, soup):
        name_img0 = None
        for title in soup.find_all(originalsrc=re.compile("jpg")):
            name_img0 = title.get("title")
        return name_img0

    # Get the image name from the "alt" attribute.
    def NAME_IMG2(self, soup):
        name_img2 = None
        for alt in soup.find_all(src=re.compile("jpg")):
            name_img2 = alt.get("alt")
        return name_img2


# Read a listing page entered by the user and return the deduplicated
# gallery ("item") links found on it.
def searchurl():
    search_url = input("Please enter the target URL: ")
    # search_url = r'URL removed here'
    de_code = r'utf-8'
    soup = Web_IMG_Crawler().JIEXI_WEB(search_url, de_code)
    searchurl_list = []
    for list_url in soup.find_all('a', href=re.compile('item')):
        searchurl_list.append(list_url.get('href'))
    return Web_IMG_Crawler().QU_CHONG(searchurl_list)


# Crawl one gallery: resolve its pagination, then download each page
# in a thread of its own.
def TOopen(url):
    def thread_Open(sid):
        soup = Web_IMG_Crawler().JIEXI_WEB(NEXTurl_list[sid], de_code)
        Web_IMG_Crawler().HQ_IMG_SQL(soup, url, file_path, file_name, sid)

    class My_Thread(Thread):
        def __init__(self, sid):
            Thread.__init__(self)
            self.sid = sid

        def run(self):
            thread_Open(self.sid)

    # url = r'URL removed here'
    # de_code = input("Please enter the page encoding: ")
    de_code = r'utf-8'
    soup = Web_IMG_Crawler().JIEXI_WEB(url, de_code)
    file_path = 'F:/图片/Saved Pictures/'
    # file_path = '../Download/'
    file_name = Web_IMG_Crawler().NAME_IMG2(soup)
    NEXTurl_list = Web_IMG_Crawler().Nextall_page(soup, url)
    threads = []
    for i in range(len(NEXTurl_list)):
        t = My_Thread(i)
        threads.append(t)
        t.start()
    for t in threads:
        t.join()


# One worker per gallery: the multiprocessing variant ...
class My_searchThread(Process):
    def __init__(self, url):
        Process.__init__(self)
        self.url = url

    def run(self):
        TOopen(self.url)


# ... and the threading variant used in __main__ below.
class My_searchThread0(Thread):
    def __init__(self, url):
        Thread.__init__(self)
        self.url = url

    def run(self):
        TOopen(self.url)


# -------------------------------------------------
if __name__ == "__main__":
    begintime = time.strftime("%Y-%m-%d %H:%M:%S")
    starttime = datetime.datetime.now()
    threadssearch = []
    for url in searchurl():
        y = My_searchThread0(url)   # swap in My_searchThread to use processes
        threadssearch.append(y)
        y.start()
    for y in threadssearch:
        y.join()
    endtime = time.strftime("%Y-%m-%d %H:%M:%S")
    finishtime = datetime.datetime.now()
    runtime = (finishtime - starttime).seconds
    timespan = '%f' % runtime
    print('Time Begin: ' + begintime)
    print('Time End: ' + endtime)
    print('Time Span: ' + timespan + ' s')
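The hand-rolled Thread and Process subclasses above make the fan-out explicit, but the standard library's concurrent.futures module expresses the same pattern with less code. The sketch below is a minimal illustration, not part of the original script: download_gallery is a hypothetical stand-in for TOopen(), and the example.com URLs are placeholders.

# A minimal sketch of the same one-worker-per-URL fan-out using
# concurrent.futures instead of hand-rolled Thread subclasses.
# download_gallery is a hypothetical stand-in for TOopen(); the
# URLs are placeholders.
from concurrent.futures import ThreadPoolExecutor, as_completed

import requests

def download_gallery(url):
    # Placeholder worker: fetch the page and report its size.
    resp = requests.get(url, timeout=10)
    return url, len(resp.content)

if __name__ == "__main__":
    urls = [
        "http://example.com/item/1.html",
        "http://example.com/item/2.html",
    ]
    with ThreadPoolExecutor(max_workers=8) as pool:
        futures = [pool.submit(download_gallery, u) for u in urls]
        for fut in as_completed(futures):
            url, size = fut.result()
            print(url, size, "bytes")

Because each worker spends most of its time blocked on network I/O, threads are usually the right default here despite the GIL; a process pool (or the My_searchThread Process class above) only pays off when per-item CPU work, such as image decoding, dominates.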
