
Python Web Crawler (Part 2): Downloading a Novel with Multiple Threads

Posted: 2022-02-09 21:44:52


The novel scraped here comes from the site 笔趣阁 (Biquge); the book is 《一念永恒》.

(1) Preparation

1. Site URL: /1_1094/

2. Browser: Sogou

3. Open the URL and inspect the page with the browser's developer tools (F12). The div tag whose class is "listmain" contains the link to every chapter of the novel.

4. Click one of the chapter links. Each chapter's body text sits in the div tag with id "content" (class "showtxt"), which is what the parser below looks for.

With this understanding, we can start writing the code.
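Before wiring everything together, a quick sanity check along the following lines can confirm that the "listmain" structure is really there. This is only a sketch: the domain in base_url is a placeholder, since the original post omits it.

# -*- coding: utf-8 -*-
# Minimal structure check; fill in the real site domain before running.
from urllib import request
from bs4 import BeautifulSoup

base_url = 'http://example.com/1_1094/'   # placeholder domain + the /1_1094/ path above
html = request.urlopen(base_url).read().decode('gbk', 'ignore')
soup = BeautifulSoup(html, 'lxml')

# The chapter list lives in <div class="listmain">.
listmain = soup.find('div', class_='listmain')
print('listmain found:', listmain is not None)
print('first few links:', [a.get('href') for a in listmain.find_all('a')[:3]])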

(2) Code implementation

The whole project is organized as four files, each of which contains one class.

shenmu_main.py is the entry point of the program.
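Since shenmu_main.py imports the other three modules from a package called xiaoshuo, one layout that would make those imports work is the following (an assumed layout, not shown in the original post):

xiaoshuo/
    __init__.py
    shenmu_downloader.py
    shenmu_parse.py
    ProcessPoolDownload.py
shenmu_main.py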

shenmu_main.py:

# -*- coding: utf-8 -*-
"""Main program."""
from xiaoshuo import shenmu_parse
from xiaoshuo import ProcessPoolDownload
from xiaoshuo import shenmu_downloader


class SpiderMain(object):
    def __init__(self):
        self.Htmldownloader = shenmu_downloader.HtmlDownLoader()
        self.parse = shenmu_parse.HtmlParse()
        self.PoolDown = ProcessPoolDownload.PoolDownLoader()

    def TheStart(self, target_url):
        # target_url is the table-of-contents address of 《一念永恒》.
        target_html = self.Htmldownloader.downloader(target_url)
        new_urls = self.parse.parse(target_html)
        self.PoolDown.oneprocess(new_urls)
        # self.PoolDown.ThreadDown(new_urls)


if __name__ == "__main__":
    target_url = '/1_1094/'
    Spider = SpiderMain()
    Spider.TheStart(target_url)

shenmu_downloader.py downloads the novel's main index page, i.e., the page containing the links to all of its chapters.

# -*- coding: utf-8 -*-
"""Created on Fri Dec 29 18:12:11 @author: Administrator"""
from urllib import request, error


class HtmlDownLoader(object):
    def downloader(self, target_url, retry_count=3):
        if target_url is None:
            return None
        try:
            # Spoof a browser User-Agent so the site does not reject the request.
            head = {}
            head['User-Agent'] = 'Mozilla/5.0 (Linux; Android 4.1.1; Nexus 7 Build/JRO03D) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.166 Safari/535.19'
            target_req = request.Request(url=target_url, headers=head)
            target_response = request.urlopen(target_req)
            # The site serves GBK-encoded pages; ignore undecodable bytes.
            target_html = target_response.read().decode('gbk', 'ignore')
            # print("whole page: " + target_html)
        except error.URLError as e:
            print("HtmlDownLoader download error:", e.reason)
            target_html = None
            # Retry the download, but only on 5xx server errors.
            if retry_count > 0:
                if hasattr(e, 'code') and 500 <= e.code < 600:
                    return self.downloader(target_url, retry_count - 1)
        return target_html
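A minimal usage sketch of HtmlDownLoader: it returns None after the retries are exhausted, so the caller should check for that. The URL is again a placeholder, since the article omits the site's domain.

# Hypothetical usage; the full index URL must be supplied by the reader.
from xiaoshuo import shenmu_downloader

downloader = shenmu_downloader.HtmlDownLoader()
html = downloader.downloader('http://example.com/1_1094/')  # placeholder domain
if html is None:
    print("download failed after retries")
else:
    print("fetched", len(html), "characters")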

shenmu_parse.py parses the HTML handed over by shenmu_downloader and returns the link address of every chapter.

# -*- coding: utf-8 -*-
"""Uses BeautifulSoup to extract the link of every chapter of the novel."""
from bs4 import BeautifulSoup
import queue


class HtmlParse(object):
    def parse(self, target_html, html_encode="utf-8"):
        if target_html is None:
            return
        # print(target_html)
        # Build a BeautifulSoup object from the index page.
        listmain_soup = BeautifulSoup(target_html, 'lxml')
        # Search the document tree for the div tags whose class is "listmain".
        chapters = listmain_soup.find_all('div', class_='listmain')
        # Build a second BeautifulSoup object from the query result and keep parsing it.
        download_soup = BeautifulSoup(str(chapters), 'lxml')
        # print(download_soup)
        new_urls = self._get_new_urls(download_soup)
        return new_urls

    def _get_new_urls(self, download_soup):
        # print(download_soup)
        # new_urls holds the link of every chapter.
        new_urls = queue.Queue()
        # Flag marking the start of the main text volume, so that the
        # "latest chapters" links at the top of the list are skipped.
        begin_flag = False
        # Walk every child node under the dl tag.
        for child in download_soup.dl.children:
            # Skip bare newlines.
            if child != '\n':
                # Once the heading of the main text volume is reached, enable the flag.
                if child.string == u"《一念永恒》正文卷":
                    begin_flag = True
                # Collect the chapter links that follow.
                if begin_flag and child.a is not None:
                    download_url = "" + child.a.get('href')
                    new_urls.put(download_url)
        return new_urls
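Note that parse() returns the chapter links in a queue.Queue rather than a list. A small sketch of how a caller might peek at the result, assuming target_html already holds the index page HTML:

# Hypothetical check of the parser output.
from xiaoshuo import shenmu_parse

parser = shenmu_parse.HtmlParse()
chapter_urls = parser.parse(target_html)
print("chapters found:", chapter_urls.qsize())
print("first chapter link:", chapter_urls.get())  # get() removes the item from the queue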

ProcessPoolDownload.py downloads the body text of each chapter from its link and writes it into a txt file. Two download modes are provided: single-process and multi-threaded.

# -*- coding: utf-8 -*-
"""Downloads the content of every chapter."""
from urllib import request
from multiprocessing import Pool
from multiprocessing import Lock
from bs4 import BeautifulSoup
import threading
import datetime
import queue


class PoolDownLoader(object):
    def __init__(self):
        self.new_urls = queue.Queue()
        self.lock = threading.Lock()
        self.lock1 = threading.Lock()

    def WriteText(self, download_name, texts):
        # Append the chapter title and body to the output file.
        file = open('一念永恒1.txt', 'a', encoding='utf-8')
        soup_text = BeautifulSoup(str(texts), 'lxml')
        write_flag = True
        file.write(download_name + '\n\n')
        for each in soup_text.div.text.replace('\xa0', ''):
            # Stop copying once the trailing "http..." site link is reached.
            if each == 'h':
                write_flag = False
            if write_flag == True and each != ' ':
                file.write(each)
            if write_flag == True and each == '\r':
                file.write('\n')
        file.write('\n\n')
        file.close()
        # Print download progress.
        print("已下载:" + download_name + '\r')

    def funcDown(self, download_url):
        head = {}
        head['User-Agent'] = 'Mozilla/5.0 (Linux; Android 4.1.1; Nexus 7 Build/JRO03D) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.166 Safari/535.19'
        download_req = request.Request(url=download_url, headers=head)
        download_response = request.urlopen(download_req)
        download_html = download_response.read().decode('gbk', 'ignore')
        soup_texts = BeautifulSoup(download_html, 'lxml')
        # The h1 tag holds the chapter title; the "content"/"showtxt" div holds the body.
        h1 = soup_texts.h1.string
        texts = soup_texts.find_all(id='content', class_='showtxt')
        # print(h1)
        # Only one thread may write to the file at a time.
        self.lock.acquire()
        self.WriteText(h1, texts)
        self.lock.release()

    def PoolDown(self, new_urls):
        # Multi-process variant (not called from shenmu_main.py). Note that
        # apply_async must pickle self, whose threading.Lock members are not
        # picklable, so this path does not work as written.
        lock = Lock()
        p = Pool(processes=4, initargs=(lock,))
        for download_url in new_urls:
            # print(download_url)
            p.apply_async(self.funcDown, args=(download_url,))
        p.close()
        p.join()
        print("小说下载完成,已写入文档!\n")

    def ThreadDown(self, new_urls):
        # Multi-threaded variant: keep at most 8 worker threads alive.
        starttime = datetime.datetime.now()
        self.new_urls = new_urls
        threads = []
        while threads or not self.new_urls.empty():
            # Drop threads that have finished.
            threads = [thread for thread in threads if thread.is_alive()]
            while len(threads) < 8 and not self.new_urls.empty():
                new_url = self.new_urls.get()
                thread = threading.Thread(target=self.funcDown, args=(new_url,))
                thread.daemon = True
                thread.start()
                threads.append(thread)
            # thread.join()
        endtime = datetime.datetime.now()
        print("下载小说共用了:", (endtime - starttime).seconds)

    def oneprocess(self, new_urls):
        # Single-process variant: drain the queue and download chapters one by one.
        starttime = datetime.datetime.now()
        self.new_urls = new_urls
        while not self.new_urls.empty():
            new_url = new_urls.get()
            self.funcDown(new_url)
        endtime = datetime.datetime.now()
        print("下载小说共用了:", (endtime - starttime).seconds)

(3) Result analysis

Single-process result: the chapters are downloaded strictly in order, and the total download time is fairly short.

Multi-threaded result: the chapters are not downloaded in strict order, and here the total time is actually longer than the single-process run, because every thread must take the lock before writing to the file. For a larger job, where downloading dominates and writing is cheap, multi-threading would generally be more efficient than a single process.
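One way to reduce the lock contention described above (a sketch, not part of the original code) is to let the worker threads only fetch and parse, and let a single writer append to the file. Here fetch_chapter is a hypothetical helper that returns (title, body_text) for one chapter URL.

# Sketch: fetch chapters in a thread pool, write from the calling thread only.
from concurrent.futures import ThreadPoolExecutor

def download_all(chapter_urls, fetch_chapter, out_path='一念永恒1.txt'):
    with ThreadPoolExecutor(max_workers=8) as pool:
        # map() yields results in input order, so chapters land in the file
        # in order even though they are fetched concurrently.
        results = pool.map(fetch_chapter, chapter_urls)
        with open(out_path, 'a', encoding='utf-8') as f:
            for title, body in results:
                f.write(title + '\n\n' + body + '\n\n')

Because only one thread ever touches the file, no explicit lock is needed, and the chapter order in the output file is preserved.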

(4) Technical explanation

Placeholder for now; to be written later.
