
Crawling Web Page Image URLs with Python Multithreading

Published: 2022-01-17 14:25:12


mini-spider

Feature description: a multithreaded web crawler that collects image URLs from web pages (it can also extract URLs matching other patterns). The goal is to use Python to build a mini targeted crawler, mini_spider.py, that crawls breadth-first from a set of seed links and saves to disk the URLs whose form matches a given pattern. Run it with: python mini_spider.py -c spider.conf. Configuration file spider.conf:

[spider]

feedfile: ./urls                      # path to the seed file
result: ./result.data                 # file where crawl results are stored, one URL per line
max_depth: 6                          # maximum crawl depth (seeds are level 0)
crawl_interval: 1                     # crawl interval, in seconds
crawl_timeout: 2                      # crawl timeout, in seconds
thread_count: 8                       # number of crawler threads
filter_url: .*\.(gif|png|jpg|bmp)$    # URL pattern to match

Seed file: urls (a list of starting URLs; a hypothetical example appears after spider.conf below).

Crawl strategy:
Breadth-first page crawling, fetching with multiple threads.
Collect the links that match the pattern (for example, URLs with gif, png, jpg or bmp extensions) and store their absolute URLs in result.data, one per line; the images themselves can also be saved locally. A short check of the pattern follows this block.
Both relative and absolute paths are handled when extracting links from HTML.
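As a quick, standalone illustration (not part of the project code), the filter_url pattern only accepts URLs that end in one of the image extensions:

import re

# the pattern from spider.conf; the trailing $ anchors the match at the image extension
pattern = re.compile(r'.*\.(gif|png|jpg|bmp)$')

print(bool(pattern.match('http://www.example.com/images/logo.png')))       # True
print(bool(pattern.match('http://www.example.com/index.html')))            # False
print(bool(pattern.match('http://www.example.com/photo.jpg?size=large')))  # False: the query string breaks the match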

mini_spider.py

#!/usr/bin/env python
# Copyright (c) , Inc. All Rights Reserved
"""
This module is the main module
@Time   : /11/09
@File   : mini_spider.py
@Author : cenquanyu@
"""
import log
from worker.SpiderWorker import SpiderWorker
from worker.param_parser import parm_parser


def main():
    """Main method to run mini spider"""
    # get input params
    args = parm_parser.get_args()
    # init log config
    log.init_log('./log/mini_spider')
    if args:
        # read config file spider.conf
        conf_params = parm_parser.set_config_by_file(args.conf)
        # use config set up spider initial params
        spider = SpiderWorker(conf_params)
        # init result_path, make it complete
        spider.set_path()
        # init url queue
        spider.set_url_queue()
        # start to crawl url
        spider.start_crawl_work()
    return


if __name__ == '__main__':
    main()
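The log module imported at the top is referenced but not listed in this article (the other imports map to the worker package shown below). A minimal stand-in exposing the same init_log(path) call, written here purely as an assumption about its interface, could look like this:

# log.py -- hypothetical stand-in for the logging helper used by mini_spider.py
import os
import logging
import logging.handlers


def init_log(log_path, level=logging.INFO):
    """Write log records to <log_path>.log (rotated daily) and to the console."""
    log_dir = os.path.dirname(log_path)
    if log_dir and not os.path.exists(log_dir):
        os.makedirs(log_dir)

    formatter = logging.Formatter('%(levelname)s: %(asctime)s * %(thread)d %(message)s')
    logger = logging.getLogger()
    logger.setLevel(level)

    file_handler = logging.handlers.TimedRotatingFileHandler(log_path + '.log', when='D', backupCount=7)
    file_handler.setFormatter(formatter)
    logger.addHandler(file_handler)

    console_handler = logging.StreamHandler()
    console_handler.setFormatter(formatter)
    logger.addHandler(console_handler)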

spider.conf

[spider]
feedfile: ./urls
result: ./result.data
max_depth: 6
crawl_interval: 1
crawl_timeout: 2
thread_count: 8
filter_url: .*\.(gif|png|jpg|bmp)$
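The seed file urls is not reproduced in the article. Assuming one seed URL per line, which is how set_url_queue in SpiderWorker.py is written to read it here, a hypothetical example would be:

http://www.example.com/
http://www.example.com/gallery.html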

SpiderThread.py (threading module)

#!/usr/bin/env python
# Copyright (c) , Inc. All Rights Reserved
"""
This module is the threading module, used to enable multithreaded processing of requests
@Time   : /11/09
@File   : SpiderThread.py
@Author : cenquanyu@
"""
import logging
import re
import time
import threading

from worker.UrlHandler import UrlHandler


class SpiderThread(threading.Thread):
    """Provide multi thread for mini spider"""

    def __init__(self, urlqueue, result_path, max_depth, interval, timeout, filter_url, total_urlset):
        threading.Thread.__init__(self)
        self.urlqueue = urlqueue
        self.result_path = result_path
        self.max_depth = max_depth
        self.interval = interval
        self.timeout = timeout
        self.filter_url = filter_url
        self.total_urlset = total_urlset
        self.lock = threading.Lock()

    def can_download(self, url):
        """
        Judge whether the url can be downloaded. Write your download rules here.
        :param url: target url
        :return: True, False
        """
        if not UrlHandler.is_url(url):
            return False
        try:
            # regular expression matching image URLs
            pattern = re.compile(self.filter_url)
        except Exception as e:
            logging.error("the filter url %s is not a valid regex, re.compile failed: %s" % (self.filter_url, e))
            return False
        # if url length < 1 or url is not an image-type url
        if len(url.strip(' ')) < 1 or not pattern.match(url.strip(' ')):
            return False
        # if url is already in the total url set (avoid repeat downloads)
        if url in self.total_urlset:
            return False
        return True

    def run(self):
        """
        Run the crawling thread.
        Get a task from the queue and add sub urls into the queue; crawling strategy -- BFS.
        :return: no return
        """
        while True:
            try:
                # get url and the page level
                url, level = self.urlqueue.get(block=True, timeout=self.timeout)
            except Exception as e:
                logging.error('Can not finish the task. job done. %s' % e)
                break
            self.urlqueue.task_done()
            # sleep interval
            time.sleep(self.interval)
            # judge if url can be downloaded
            if self.can_download(url):
                UrlHandler.download_url(self.result_path, url)
                # lock while adding the url to the total url set
                self.lock.acquire()
                self.total_urlset.add(url)
                self.lock.release()
            # get the sub urls of this url
            suburls = UrlHandler.get_urls(url)
            suburl_level = level + 1
            # if sub url level is larger than max_depth, stop crawling deeper
            if suburl_level > self.max_depth:
                continue
            for suburl in suburls:
                self.urlqueue.put((suburl, suburl_level))
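The queue carries (url, level) tuples, and the level check in run() is what caps the breadth-first crawl at max_depth. The same pattern reduced to a toy in-memory graph, with no networking, just to show the traversal order (illustration only):

from queue import Queue

# toy link graph standing in for UrlHandler.get_urls()
graph = {
    'seed': ['a', 'b'],
    'a': ['a1.png', 'c'],
    'b': ['b1.jpg'],
}

max_depth = 1
queue = Queue()
queue.put(('seed', 0))
visited = []

while not queue.empty():
    url, level = queue.get()
    visited.append((url, level))
    if level + 1 > max_depth:
        continue                      # same cut-off as in SpiderThread.run()
    for suburl in graph.get(url, []):
        queue.put((suburl, level + 1))

print(visited)
# [('seed', 0), ('a', 1), ('b', 1)] -- nothing beyond max_depth is enqueued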

SpiderWorker.py (main worker module)

#!/usr/bin/env python
# Copyright (c) , Inc. All Rights Reserved
"""
This module is the main worker, the central module for crawling tasks
@Time   : /11/09
@File   : SpiderWorker.py
@Author : cenquanyu@
"""
import os
import logging
from queue import Queue

from worker.SpiderThread import SpiderThread


class SpiderWorker(object):

    def __init__(self, *args, **kwargs):
        params = args[0]
        self.urls = params[0]
        self.result_path = params[1]
        self.maxdepth = params[2]
        self.interval = params[3]
        self.timeout = params[4]
        self.thread_count = params[5]
        self.filter_url = params[6]
        self.total_urlset = set()
        self.urlqueue = Queue()

    def set_abs_dir(self, path):
        """
        Complete the result path and make sure its parent directory exists
        :param path: result file path from the config
        :return: absolute result output path
        """
        file_path = os.path.join(os.getcwd(), path)
        # create the parent directory only; creating the file path itself as a
        # directory would make the append in UrlHandler.download_url fail
        file_dir = os.path.dirname(file_path)
        if not os.path.exists(file_dir):
            try:
                os.makedirs(file_dir)
            except os.error as err:
                logging.error("mkdir result-saved dir error: %s." % err)
        return str(file_path)

    def set_path(self):
        """Complete the result path"""
        self.result_path = self.set_abs_dir(self.result_path)

    def set_url_queue(self):
        """
        Read the seed file and put every seed url into the queue at level 0
        :return: True or False
        """
        try:
            # the feedfile is assumed to hold one seed URL per line
            with open(self.urls) as f:
                for seed_url in f:
                    seed_url = seed_url.strip()
                    if seed_url:
                        self.urlqueue.put((seed_url, 0))
        except Exception as e:
            logging.error(e)
            return False
        return True

    def start_crawl_work(self):
        """
        Start to work
        :return: nothing
        """
        thread_list = []
        for i in range(self.thread_count):
            thread = SpiderThread(self.urlqueue, self.result_path, self.maxdepth, self.interval,
                                  self.timeout, self.filter_url, self.total_urlset)
            thread_list.append(thread)
            logging.info("%s start..." % thread.name)
            thread.start()
        for thread in thread_list:
            thread.join()
            logging.info("thread %s work is done" % thread.name)
        self.urlqueue.join()
        logging.info("queue is all done")
        return
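For a quick manual run without going through argparse, the worker can be driven directly with a parameter tuple in the same order that parm_parser.set_config_by_file returns (the values below just mirror spider.conf):

from worker.SpiderWorker import SpiderWorker

# (feedfile, result, max_depth, crawl_interval, crawl_timeout, thread_count, filter_url)
params = ('./urls', './result.data', 6, 1, 2, 8, r'.*\.(gif|png|jpg|bmp)$')

spider = SpiderWorker(params)
spider.set_path()
spider.set_url_queue()
spider.start_crawl_work()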

UrlHandler.py (URL handling and HTTP request module)

#!/usr/bin/env python
# Copyright (c) , Inc. All Rights Reserved
"""
This module is used to handle URLs and HTTP related requests
@Time   : /11/09
@File   : UrlHandler.py
@Author : cenquanyu@
"""
import os
import logging
from urllib import parse, request

import chardet
import requests
from bs4 import BeautifulSoup


class UrlHandler(object):
    """Public url tools for handling urls"""

    @staticmethod
    def is_url(url):
        """
        Ignore urls starting with javascript
        :param url:
        :return: True or False
        """
        if url.startswith("javascript"):
            return False
        return True

    @staticmethod
    def get_content(url, timeout=10):
        """
        Get html content
        :param url: the target url
        :param timeout: request timeout, default 10
        :return: content of the html page, or None when an error happens
        """
        try:
            response = requests.get(url, timeout=timeout)
        except requests.HTTPError as e:
            logging.error("url %s request error: %s" % (url, e))
            return None
        except Exception as e:
            logging.error(e)
            return None
        return UrlHandler.decode_html(response.content)

    @staticmethod
    def decode_html(content):
        """
        Decode html content
        :param content: original html content
        :return: decoded html content, or None on error
        """
        encoding = chardet.detect(content)['encoding']
        if encoding == 'GB2312':
            encoding = 'GBK'
        else:
            encoding = 'utf-8'
        try:
            content = content.decode(encoding, 'ignore')
        except Exception as err:
            logging.error("Decode error: %s.", err)
            return None
        return content

    @staticmethod
    def get_urls(url):
        """
        Get all sub urls of this url
        :param url: origin url
        :return: the set of sub urls
        """
        urlset = set()
        if not UrlHandler.is_url(url):
            return urlset
        content = UrlHandler.get_content(url)
        if content is None:
            return urlset
        tag_list = ['img', 'a', 'style', 'script']
        linklist = []
        soup = BeautifulSoup(content, 'html.parser')
        for tag in tag_list:
            linklist.extend(soup.find_all(tag))
        # collect urls from 'src' and 'href' attributes
        for link in linklist:
            if link.has_attr('src'):
                urlset.add(UrlHandler.parse_url(link['src'], url))
            if link.has_attr('href'):
                urlset.add(UrlHandler.parse_url(link['href'], url))
        return urlset

    @staticmethod
    def parse_url(url, base_url):
        """
        Parse a url to make it complete and standard
        :param url: the current url
        :param base_url: the base url
        :return: completed url
        """
        if url.startswith('http') or url.startswith('//'):
            url = parse.urlparse(url, scheme='http').geturl()
        else:
            url = parse.urljoin(base_url, url)
        return url

    @staticmethod
    def download_image_file(result_dir, url):
        """
        Download the image as a file and save it in the result dir
        :param result_dir: base path
        :param url: download url
        :return: True on success, False on failure
        """
        if not os.path.exists(result_dir):
            try:
                os.mkdir(result_dir)
            except os.error as err:
                logging.error("download to path, mkdir error: %s" % err)
        try:
            path = os.path.join(result_dir,
                                url.replace('/', '_').replace(':', '_').replace('?', '_').replace('\\', '_'))
            logging.info("download url..: %s" % url)
            request.urlretrieve(url, path, None)
        except Exception as e:
            logging.error("download url %s fail: %s" % (url, e))
            return False
        return True

    @staticmethod
    def download_url(result_file, url):
        """
        Save a URL that matches the pattern into the result file
        :param result_file: base path
        :param url: download url
        :return: True on success, False on failure
        """
        try:
            path = os.path.join(os.getcwd(), result_file)
            logging.info("download url..: %s" % url)
            with open(path, 'a') as f:
                f.write(url + '\n')
        except Exception as e:
            logging.error("download url %s fail: %s" % (url, e))
            return False
        return True
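parse_url is what lets the crawler follow relative, root-relative and protocol-relative links alike. Assuming the worker package is importable from the project root, its behaviour looks like this:

from worker.UrlHandler import UrlHandler

base = 'http://www.example.com/photos/index.html'

print(UrlHandler.parse_url('img/cat.jpg', base))
# http://www.example.com/photos/img/cat.jpg   (relative path joined onto the base url)

print(UrlHandler.parse_url('/static/dog.png', base))
# http://www.example.com/static/dog.png       (root-relative path)

print(UrlHandler.parse_url('//cdn.example.com/pic.gif', base))
# http://cdn.example.com/pic.gif              (protocol-relative url given a default http scheme)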

param_parser.py (parameter parsing module)

#!/usr/bin/env python
# Copyright (c) , Inc. All Rights Reserved
"""
This module is used to parse params
@Time   : /11/09
@File   : param_parser.py
@Author : cenquanyu@
"""
import argparse
import logging
import configparser


class parm_parser(object):

    @staticmethod
    def set_config_by_file(config_file):
        """
        Set SpiderWorker params from the config file
        :param config_file: config file path
        :return: tuple of config values
        """
        config = configparser.ConfigParser()
        config.read(config_file, encoding='utf-8')
        urls = config['spider']['feedfile']                  # feed file path
        result_path = config['spider']['result']             # result storage file
        max_depth = config['spider']['max_depth']            # max crawl depth
        crawl_interval = config['spider']['crawl_interval']  # crawl interval
        crawl_timeout = config['spider']['crawl_timeout']    # crawl timeout
        thread_count = config['spider']['thread_count']      # crawler thread count
        filter_url = config['spider']['filter_url']          # URL pattern
        return (urls, result_path, int(max_depth), int(crawl_interval),
                int(crawl_timeout), int(thread_count), filter_url)

    @staticmethod
    def get_args():
        """
        Get console args and parse them
        :return: parsed args, or None
        """
        try:
            parser = argparse.ArgumentParser(prog='other_mini_spider',
                                             usage='minispider using method',
                                             description='other_mini_spider is a multithreaded crawler')
            parser.add_argument('-c', '--conf', help='config_file')
            parser.add_argument('-v', '--version', help='version', action="store_true")
        except argparse.ArgumentError as e:
            logging.error("get option error: %s." % e)
            return
        args = parser.parse_args()
        if args.version:
            parm_parser.version()
        if args.conf:
            return args

    @staticmethod
    def version():
        """Print mini spider version"""
        print("other_mini_spider version 1.0.0")
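The config loader can also be exercised on its own, which is convenient when debugging parameter handling (a sketch that assumes spider.conf sits in the current working directory):

from worker.param_parser import parm_parser

conf_params = parm_parser.set_config_by_file('spider.conf')
print(conf_params)
# ('./urls', './result.data', 6, 1, 2, 8, '.*\\.(gif|png|jpg|bmp)$')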
