100字范文,内容丰富有趣,生活中的好帮手!
100字范文 > Python多线程爬取小说网站小说

Python多线程爬取小说网站小说

时间:2020-10-15 17:53:24

相关推荐

Python多线程爬取小说网站小说

问题阐述

爬取网站小说,这里我们以努努书坊中爬取刘慈欣的小说球状闪电为例子!

技术支持

Python 多线程 + requests 抓取 + BeautifulSoup 解析网页

代码演示【直接粘贴即可运行】

import os
import threading
from queue import Empty, Queue

import requests
from bs4 import BeautifulSoup

# Site root for 努努书坊. The article stripped the scheme/host from the
# chapter URLs, which made every requests.get() raise MissingSchema.
# NOTE(review): presumably https://www.kanunu8.com — confirm before running.
BASE_URL = 'https://www.kanunu8.com'

# Shared request headers, hoisted out of the per-page loop.
# The original dict literal listed 'Accept-Encoding' twice; in a dict
# literal only the last value survives, so that effective value is kept.
HEADER = {
    'Accept': 'application/json, text/javascript, */*; q=0.01',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Connection': 'Keep-Alive',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                  'AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/87.0.4280.88 Safari/537.36',
    'sec-ch-ua': '"Google Chrome";v="87", " Not;A Brand";v="99", "Chromium";v="87"',
}

# Flag polled by the worker threads so main() can ask them to stop.
CRAWL_EXIT = False


class ThreadCrawl(threading.Thread):
    """Worker thread for the chapter crawl.

    Pulls page numbers off ``page_queue``, downloads the corresponding
    chapter page and pushes ``[chapter_title, chapter_text]`` onto
    ``data_queue``.  The book title is published on ``name_queue`` once.
    """

    def __init__(self, thread_name, page_queue, data_queue, name_queue):
        # Initialise the Thread machinery before storing our own state.
        super(ThreadCrawl, self).__init__()
        self.threadName = thread_name
        self.page_queue = page_queue
        self.data_queue = data_queue
        self.name_queue = name_queue

    def run(self):
        print(self.threadName + ' 启动************')
        while not CRAWL_EXIT:
            try:
                # Non-blocking get: an empty queue means every page has
                # been claimed, so this worker is done.  (The original
                # code let queue.Empty escape and kill the thread with a
                # traceback.)
                page = self.page_queue.get(block=False)
            except Empty:
                break

            if page == 35:
                # Page 35 is intentionally skipped (missing chapter on
                # the site, per the original code).
                continue
            # Chapter file names are zero-padded to six digits:
            # 116007..116009 for pages < 10, 116010..116039 otherwise.
            if page < 10:
                spider_url = BASE_URL + '/book3/6633/11600{}.html'.format(page)
            else:
                spider_url = BASE_URL + '/book3/6633/1160{}.html'.format(page)
            print(spider_url)

            # The site serves GBK-encoded HTML; decode before parsing.
            response = requests.get(spider_url, headers=HEADER).content.decode('gbk')
            soup = BeautifulSoup(response, 'lxml')

            # The book title lives in the 2nd <table>; publish it once.
            # name_queue has maxsize 1, so only the first worker wins.
            if self.name_queue.empty():
                self.name_queue.put(soup.find_all("table")[1].text[10:16])
            each_part_name = soup.find_all("table")[3].text  # 章节名 (chapter title)
            each_content = soup.find_all("table")[4].text    # 内容 (chapter body)
            self.data_queue.put([each_part_name, each_content])


def toFile(list, name):
    """Write each ``[title, text]`` pair to ``<script dir>/<name>/<title>``.

    Creates the book directory if needed.  The parameter name ``list``
    shadows the builtin; it is kept for backward compatibility with
    existing callers.
    """
    print(1)
    path = os.path.dirname(os.path.realpath(__file__))
    book_dir = path + '/{}'.format(name)
    if not os.path.exists(book_dir):
        os.makedirs(book_dir)
    for i in list:
        with open(path + '/{}/{}'.format(name, i[0]), 'w', encoding='UTF-8') as f:
            f.write(i[1])


def main():
    """Fan four crawler threads out over pages 7-39 and dump the book."""
    # Queue of page numbers to fetch; drained concurrently by workers.
    page_queue = Queue(maxsize=0)
    for j in range(7, 40):
        page_queue.put(j)

    data_queue = Queue(maxsize=0)  # collected [title, text] chapters
    name_queue = Queue(1)          # book title, set exactly once

    craw_list = ['采集线程1号', '采集线程2号', '采集线程3号', '采集线程4号']
    threads = []
    for thread_name in craw_list:
        c_thread = ThreadCrawl(thread_name, page_queue, data_queue, name_queue)
        c_thread.start()
        threads.append(c_thread)

    # Busy-wait until every page number has been claimed, then signal
    # the workers to stop and wait for their last downloads to finish.
    while not page_queue.empty():
        pass
    global CRAWL_EXIT
    CRAWL_EXIT = True
    for t in threads:
        t.join()

    name = name_queue.get()
    # 33 pages queued minus the skipped page 35 -> 32 chapters.
    result = []
    for i in range(7, 39):
        result.append(data_queue.get())
    print(result)
    print(name)
    toFile(result, name)


if __name__ == '__main__':
    main()

本内容不代表本网观点和政治立场,如有侵犯你的权益请联系我们处理。
网友评论
网友评论仅供其表达个人看法,并不表明网站立场。