
Single-Threaded, Async Pseudo-Concurrent, and Multithreaded Crawlers Compared (with Lessons Learned)

Posted: 2023-10-08 12:32:00


Key techniques:

1. Selecting elements with BeautifulSoup's select method

2. Selenium + PhantomJS for dynamically loaded, anti-scraping pages

3. Asynchronous crawling with asyncio + aiohttp

4. Multithreading with the threading library

5. Asynchronous concurrency with the gevent library

Scraping product listings from JD.com:

The listing runs to 20 pages; the timed runs below fetch only the first 4 (range(1, 5)).

Single-threaded crawler:

from bs4 import BeautifulSoup
import requests
import time
import json
import re
# import phantomjs

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36',
    'referer': '/'
}
cookiestr = 'shshshfpa=509a4836-a95f-a00b-da6f-a2cee3bdc012-1573951043; shshshfpb=xei61TmhyHUJmvGIu%2FBoS3w%3D%3D; __jdu=787291882; user-key=27687cb1-4237-49c1-be50-1389469ccb2a; cn=0; ipLoc-djd=1-72-4137-0; areaId=1; PCSYCityID=CN_330000_0_0; __jdc=122270672; 3AB9D23F7A4B3C9B=DCJUYZT25TVN4JGXFQIH5WNSLDSVHW4ZJE4YXXJEHTQW7CSAAWIXEJA5SY6KYZWKQQNRQEW5GIBRUEYWYHZTRPD5IU; _gcl_au=1.1.1358316623.1582685147; shshshfp=14c88871408acf96dfa7675a8c41baa8; __jda=122270672.787291882.1573690002.1582682640.1582851083.29; __jdv=122270672|direct|-|none|-|1582851083348; __jdb=122270672.3.787291882|29.1582851083'

cookies = {}  # build a cookies dict to get past anti-scraping checks
for i in cookiestr.split(';'):
    k, v = i.split('=', 1)      # split on the first '=' only; values may contain encoded '='
    cookies[k.strip()] = v

def get_price(skuid):  # fetch the original and promotional prices of a SKU
    # NOTE: the URL scheme and host were stripped in the original post; restore the
    # full endpoints before running
    url = '/prices/mgets?callback=jQuery7409665&ext=11101100&pin=&type=1&area=1_72_4137_0&skuIds=J_%s' % skuid
    html = requests.get(url, headers=headers, cookies=cookies)  # the cookies do not seem to be required here
    pattern = re.compile(r'{.+}')  # strip the JSONP callback wrapper
    originalprice = json.loads(re.findall(pattern, html.text)[0])['op']
    promotionprice = json.loads(re.findall(pattern, html.text)[0])['p']
    return originalprice, promotionprice  # (original price, promotional price) tuple

def get_comments(skuid):  # fetch the review summary of a SKU
    url = '/comment/productCommentSummaries.action?my=pinglun&referenceIds=%s' % skuid
    html = requests.get(url, headers=headers, cookies=cookies)
    dic = json.loads(html.text)
    return dic  # returned as a dict

def get_html(url):
    # b = webdriver.PhantomJS()  # when blocked, fall back to Selenium + PhantomJS
    # b.get(url)
    # if b.page_source:
    #     print('Page fetched successfully...')
    #     parse_html(b.page_source)
    # else:
    #     print('Error:', html.text)
    # return b.page_source
    html = requests.get(url, headers=headers)
    if html.status_code == 200:
        # print('Page fetched successfully...')
        # print(html.text)
        parse_html(html.text)
    else:
        print('Error:', html.text)
    return html.text

def parse_html(html):
    soup = BeautifulSoup(html, 'lxml')
    products = soup.select('#J_goodsList > ul > li')  # the '>' must not be dropped, or duplicates are matched; '#' for id, '.' for class
    n = 0
    for i in products:
        try:
            shopname = i.select_one('div > div.p-shop').get('data-shop_name')
            # shopname = i.find('div', class_='p-shop').get('data-shop_name')  # BS's find works too
            sku = i.get('data-sku')
            comments = get_comments(sku)
            price = get_price(sku)[1]
            productname = i.select_one('div > div.p-name > a > em').text.strip()
            productlink = 'http:' + i.select_one('div > div.p-img > a')['href']
            img = 'http:' + i.select_one('div > div.p-img > a > img')['src']
            # r.hmset(sku, {'name': productname, 'shop': shopname, 'link': productlink, 'image': img})
            n += 1
        except:
            # lazy-loaded images carry the address in data-lazy-img rather than src
            img = 'http:' + i.select_one('div > div.p-img > a > img')['data-lazy-img']
            n += 1
        finally:
            # the print assumes the fields before img were assigned successfully
            print(shopname, sku, productname, productlink, img, 'price:', price,
                  'reviews:', comments['CommentsCount'][0]['CommentCount'],
                  'good rating:', comments['CommentsCount'][0]['GoodRate'])
            print('Product No. %d' % n)
    print(len(products))
    print(n)

if __name__ == '__main__':
    time1 = time.time()
    url = ['/list.html?cat=9847,9850&page=%s' % str(i) for i in range(1, 5)]
    pageNum = 0
    for i in url:
        time.sleep(1)  # throttle a little between pages
        pageNum += 1
        print('Crawling Page No.:', pageNum)
        get_html(i)
    time2 = time.time()
    print('Time used:', time2 - time1)
    # Time used: 89.84880256652832, the time to crawl the 4 listing pages when the requests were not being blocked
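The get_price helper relies on the price endpoint answering in JSONP, a JSON body wrapped in the callback named in the query string; the regex strips that wrapper. A minimal sketch of just that step, with an illustrative payload (only the 'op' and 'p' field names come from the parsing code above; the rest of the shape is assumed):

import json
import re

# Illustrative JSONP response: the endpoint wraps a JSON array in the
# jQuery7409665(...) callback requested in the URL (payload shape assumed)
raw = 'jQuery7409665([{"id":"J_49901959277","op":"129.00","p":"99.00"}]);'
data = json.loads(re.search(r'{.+}', raw).group(0))  # keep only the {...} object
print(data['op'], data['p'])  # original price, promotional price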

Async pseudo-concurrency (asyncio + aiohttp):

Judging by the elapsed time, the speed advantage over the single-threaded version is clear. asyncio runs everything on one thread and merely interleaves the waits on network I/O, which is why it is called pseudo-concurrency here.

from bs4 import BeautifulSoup
import time
import asyncio
import aiohttp
import requests
import re
import json

# headers, cookies, get_price(), get_comments() and parse_html() are essentially
# the same as in the single-threaded version above and are omitted here.

async def get_html(url):
    async with aiohttp.ClientSession(headers=headers) as session:
        async with session.get(url) as resp:
            if resp.status == 200:
                print('Page fetched successfully...')
                parse_html(await resp.text())  # must call text(); it is a coroutine method, not an attribute
            else:
                print('ERROR:', resp.status)

if __name__ == '__main__':
    time1 = time.time()
    url = ['/list.html?cat=9847,9850&page=%s' % str(i) for i in range(1, 5)]
    tasks = []
    for i in url:
        tasks.append(get_html(i))
    loop = asyncio.get_event_loop()
    loop.run_until_complete(asyncio.wait(tasks))
    loop.close()
    time2 = time.time()
    print('Time used:', time2 - time1)
    # Time used: 35.787909746170044; this run can be blocked by anti-scraping and then yields no results
    # print(get_price(49901959277))
    # print(get_comments(49901959277))
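One more reason the version above is only pseudo-concurrent in practice: parse_html still calls the blocking requests.get inside get_price and get_comments, so while the listing pages are fetched concurrently, every per-product detail request stalls the event loop. A minimal sketch of keeping the fetches fully asynchronous, assuming the headers dict and URL list from the listings above:

import asyncio
import aiohttp

async def fetch(session, url):
    # one awaitable per page; the session is shared across requests
    async with session.get(url) as resp:
        return await resp.text()

async def crawl(urls):
    async with aiohttp.ClientSession(headers=headers) as session:
        # gather() schedules all fetches on the event loop and returns the pages in order
        return await asyncio.gather(*(fetch(session, u) for u in urls))

# pages = asyncio.run(crawl(url))  # then hand each page to parse_html()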

Asynchronous concurrency (gevent):

The elapsed time drops further; the advantage is obvious.

import gevent
from gevent import monkey
monkey.patch_all()  # patch blocking I/O first, so that requests yields to other greenlets

from bs4 import BeautifulSoup
import requests
import time
import json
import re

# headers, cookies, get_price(), get_comments(), get_html() and parse_html() are
# the same as in the single-threaded version above and are omitted here.

if __name__ == '__main__':
    time1 = time.time()
    urllist = ['/list.html?cat=9847,9850&page=%s' % str(i) for i in range(1, 5)]
    greenlets = [gevent.spawn(get_html, url) for url in urllist]  # build the task list
    gevent.joinall(greenlets)  # run all greenlets to completion
    time2 = time.time()
    print('Time used:', time2 - time1)
    # Time used: 13.96290135383606; this run can be blocked by anti-scraping and then yields no results
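gevent.spawn launches one greenlet per page with no cap, so once monkey.patch_all() has made requests cooperative, all four requests fire at almost the same instant, which is exactly the burst pattern that rate limiting catches. A small variation, assuming the get_html and urllist from the listing above, that bounds the concurrency with a gevent pool:

from gevent import monkey
monkey.patch_all()
from gevent.pool import Pool

pool = Pool(2)               # at most 2 greenlets in flight at once
pool.map(get_html, urllist)  # blocks until every page has been processed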

Finally, the multithreaded crawl:

The elapsed time is shorter still, close to the gevent asynchronous version.

from bs4 import BeautifulSoup
import requests, re
import time, json
from threading import Thread

# headers, cookies, get_html(), get_price(), get_comments() and parse_html() are
# the same as in the single-threaded version above and are omitted here.

if __name__ == '__main__':
    time1 = time.time()
    url = ['/list.html?cat=9847,9850&page=%s' % str(i) for i in range(1, 5)]
    threads = []
    for i in range(0, 4):
        t = Thread(target=get_html, args=(url[i],))  # one thread per listing page
        threads.append(t)
    for i in threads:
        i.start()
    for i in threads:
        i.join()  # wait for every page to finish
    time2 = time.time()
    print('Time used:', time2 - time1)
    # Time used: 13.300746202468872
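Creating Thread objects by hand works, but the standard library's thread pool expresses the same fan-out more compactly and handles the join for you. An equivalent sketch, assuming the get_html and url list from the listing above:

from concurrent.futures import ThreadPoolExecutor

with ThreadPoolExecutor(max_workers=4) as executor:
    # map() hands one listing-page URL to each worker thread;
    # leaving the with-block waits for all of them to finish
    list(executor.map(get_html, url))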

Lessons learned:

1. When the request returns the expected page, parsing with the CSS-selector select method is very convenient, since the selector can be copied straight from the page source; it is often handier than BeautifulSoup's findAll (i.e. find_all). Parsing with lxml's XPath is also a good choice. A small comparison follows below.
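A toy comparison of the three parsing styles just mentioned (the markup is made up for illustration; the id and class names echo the JD listings above):

from bs4 import BeautifulSoup
from lxml import etree

html = '<ul id="J_goodsList"><li class="gl-item">A</li><li class="gl-item">B</li></ul>'

soup = BeautifulSoup(html, 'lxml')
print(soup.select('#J_goodsList > li.gl-item'))   # CSS selector, copyable from the browser
print(soup.find_all('li', class_='gl-item'))      # the find_all equivalent
print(etree.HTML(html).xpath('//ul[@id="J_goodsList"]/li/text()'))  # lxml XPath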

2. When the expected results cannot be scraped, there are usually two causes:

a. Requesting too frequently, so that the target site refuses access. The main remedies are to rotate proxies (see the requests sketch after this list) or to throttle the request rate, as the time.sleep(1) in the single-threaded version does.

b. The site loads content dynamically, so a plain request returns HTML that differs from the source you see in the browser, especially for relatively sensitive fields such as prices, sales volume, and user counts, as with JD in this example. In that case, inspect the actual response first and only then write the parsing code, to make sure the fields you need are really present. For data that plain requests cannot reach at all, the last resort is brute-force rendering with Selenium + WebDriver (Chrome, Firefox, or PhantomJS); a minimal sketch follows below.
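For cause (a), requests accepts a proxies mapping directly, and rotating through a pool of such entries spreads the requests across IPs. A minimal sketch, with a placeholder proxy address (not a working endpoint) and assuming the headers dict and a listing-page url from the examples above:

import requests

proxies = {
    'http': 'http://127.0.0.1:8888',   # placeholder; substitute a real proxy address
    'https': 'http://127.0.0.1:8888',
}
resp = requests.get(url, headers=headers, proxies=proxies, timeout=10)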
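For cause (b), note that PhantomJS support was removed from recent Selenium releases, so headless Chrome is the usual stand-in today. A minimal sketch, assuming chromedriver is installed and url holds a listing-page URL from the examples above:

from selenium import webdriver
from selenium.webdriver.chrome.options import Options

options = Options()
options.add_argument('--headless')  # render without opening a browser window
driver = webdriver.Chrome(options=options)
driver.get(url)
html = driver.page_source           # the fully rendered DOM, dynamic fields included
driver.quit()
# html can then be handed to parse_html() just like a requests response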

Pointers and discussion are welcome!
