一、分析页码
在 /ent/12/995687.shtml 页面中点击“下一页”,可以发现页面 url 地址发生了变化:
为/ent/12/995687_2.shtml,多了一个下划线加页面数字。
尝试将2改为1:/ent/12/995687_1.shtml,无法打开。也就是说第一页的 url 不带页码后缀,从第二页开始才有 "_页码"。
二、分析xpath
使用xpath("//*[@class='picact']")即可获取当前图片标签,使用xpath("..")(或 getparent())获取图片标签的父节点,然后获取文本内容即可
三、代码
import requests
from lxml import html
import os
import re


def catch_images(url, timeout=10):
    """Collect the images of a multi-page article.

    Pages after the first use the article URL with ``_<n>`` inserted just
    before ``.shtml`` (``995687.shtml`` -> ``995687_2.shtml``); the first
    page carries no suffix. Pages are fetched in order until one contains
    no ``class="picact"`` element or a request/parse step fails.

    Args:
        url: URL of the first page of the article; must contain ``.shtml``.
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        A list of ``{'url': ..., 'txt': ...}`` dicts. ``url`` is the image
        ``src``; ``txt`` is the text content of the image's parent element,
        or the file name taken from ``src`` when that text is empty.

    Raises:
        ValueError: If ``url`` does not contain ``.shtml``.
    """
    index = url.index('.shtml')
    # Keep the two halves and join them per page. Going through a
    # "%s"-style template would fail on URLs that contain '%'
    # (e.g. percent-encoded characters).
    head, tail = url[:index], url[index:]

    image_list = []
    current_page = 1
    while True:
        page = '_%d' % current_page if current_page > 1 else ''
        try:
            respond = requests.get(url=head + page + tail, timeout=timeout)
            html_text = respond.content.decode("utf-8")
            document = html.fromstring(html_text)
            imgs = document.xpath("//*[@class='picact']")
            if not imgs:
                # No image on this page: we ran past the last page.
                break
            for img in imgs:
                parent = img.getparent()
                src = img.attrib['src'].strip()
                caption = parent.text_content().strip()
                # Fall back to the file name from the URL when the parent
                # element has no text.
                txt = caption if caption else src[src.rfind('/') + 1:]
                image_list.append({'url': src, 'txt': txt})
        except Exception as exc:
            # Best effort: keep what was collected so far, but report why
            # the crawl stopped instead of failing silently.
            print("catch_images stopped at page %d: %r" % (current_page, exc))
            break
        current_page += 1
    return image_list


if __name__ == '__main__':
    url = "/ent/12/995239.shtml"
    for image in catch_images(url):
        print(image)
可看到如下输出:
四、额外
# Images can be saved with the following code:

# One or more consecutive characters from: \ / : * ? " < > | CR LF
_INVALID_NAME_CHARS = re.compile(r'[\\/:*?"<>|\r\n]+')


def check_name_valid(name=None):
    """Return *name* made usable as a file name.

    Every run of the characters ``\\ / : * ? " < > |``, carriage return or
    line feed is replaced by a single ``_``.

    Args:
        name: The proposed file name.

    Returns:
        The sanitized name, or ``None`` (after printing a notice) when
        *name* is ``None``.
    """
    if name is None:
        print("name is None!")
        return None
    # A single substitution pass; replacing the matches one by one gives
    # results that depend on the order in which the runs appear.
    return _INVALID_NAME_CHARS.sub("_", name)


def save_image(_dir, _name, _url, timeout=30):
    """Download one image into ``images/<_dir>/``.

    Args:
        _dir: Sub-directory name below ``images``, e.g. ``"xxxxxx"``.
        _name: Image name, e.g. ``"xx.gif"``; sanitized with
            ``check_name_valid`` before use.
        _url: Network address of the image. Everything from its last ``.``
            onwards is appended to the name as the file suffix.
        timeout: Seconds to wait for the HTTP request.

    Returns:
        ``True`` if the file already existed or was written, ``False`` if
        the server answered with an error status.
    """
    # Target directory is <current dir>/images/<_dir>; create it if missing.
    dir_path = os.path.join("images", _dir)
    os.makedirs(dir_path, exist_ok=True)

    # Make the file name legal, then append the suffix taken from the URL.
    _name = check_name_valid(_name)
    ext = _url[_url.rfind('.'):]
    file_path = os.path.join(dir_path, _name + ext)
    if os.path.exists(file_path):
        return True

    # Download BEFORE opening the file: a failed request must not leave an
    # empty file behind that the exists() check above would later treat as
    # an already-downloaded image.
    respond = requests.get(_url, timeout=timeout)
    if not respond.ok:
        print("image download failed (%s): %s" % (respond.status_code, _url))
        return False

    with open(file_path, 'wb') as file:
        # .content is the raw binary body of the response.
        file.write(respond.content)
    print("image save in %s" % file_path)
    return True
至此,只需要将上一节中抓取的所有文章链接传递给catch_images 然后将获取到的图片url进行下载即可:
最后一步的代码:
import gamesky_fun as gf  # this module contains only the three functions shown on this page
import xlrd
import xlwt
import os
from xlutils3 import copy

DOWN_LIST_FILE = 'gamesky_fun_down_list.xls'
LIST_FILE = 'gamesky_fun_list.xls'
SHEET_NAME = "list"


def down():
    """Download the images of every article listed in ``LIST_FILE``.

    Rows of the ``SHEET_NAME`` sheet are processed from the last one down to
    row 1 (row 0 is skipped -- presumably a header row; confirm against the
    code that writes ``LIST_FILE``). Column 0 holds the article URL and
    column 1 its title. Each processed row is recorded in ``DOWN_LIST_FILE``
    (url, title, row index) and the workbook is saved after every row, so an
    interrupted run can be resumed without re-downloading finished articles.
    """
    if os.path.exists(DOWN_LIST_FILE):
        # Continue from the existing progress workbook.
        read_book = xlrd.open_workbook(DOWN_LIST_FILE)
        mod_book = copy.copy(read_book)
        mod_sheet = mod_book.get_sheet(SHEET_NAME)
    else:
        mod_book = xlwt.Workbook(encoding='utf-8')
        mod_sheet = mod_book.add_sheet(SHEET_NAME, cell_overwrite_ok=True)

    list_book = xlrd.open_workbook(LIST_FILE)
    table = list_book.sheet_by_name(SHEET_NAME)
    table_len = table.nrows
    try:
        for i in range(table_len - 1, 0, -1):
            url, title = check_down(i)
            if url is None:
                # Not downloaded yet: fetch every image of this article.
                url = table.cell(i, 0).value
                title = table.cell(i, 1).value
                print(url, title)
                print("Catching Image Url")
                images = gf.catch_images(url)
                _dir = title
                for img in images:
                    print(img)
                    ret = gf.save_image(_dir, img['txt'], img['url'])
                    print("save:%s" % (ret,))
            else:
                print("continue")
            # Record the row as done and persist immediately.
            mod_sheet.write(i, 0, url)
            mod_sheet.write(i, 1, title)
            mod_sheet.write(i, 2, i)
            mod_book.save(DOWN_LIST_FILE)
            print("ok :%d" % (i,))
    finally:
        mod_book.save(DOWN_LIST_FILE)  # save the file


def check_down(_id):
    """Look up row ``_id`` in the progress workbook.

    Args:
        _id: Row index of the article in ``LIST_FILE``; compared against
            column 2 of ``DOWN_LIST_FILE``.

    Returns:
        ``(url, title)`` of the recorded article, or ``(None, None)`` when
        the article has not been recorded or ``DOWN_LIST_FILE`` does not
        exist yet.
    """
    if not os.path.exists(DOWN_LIST_FILE):
        # Always return a pair: callers unpack the result, and a bare
        # ``False`` here made the very first run fail with a TypeError.
        return None, None
    wordbookrd = xlrd.open_workbook(DOWN_LIST_FILE)
    table = wordbookrd.sheet_by_name(SHEET_NAME)
    for i in range(table.nrows):
        if table.cell(i, 2).value == _id:
            return table.cell(i, 0).value, table.cell(i, 1).value
    return None, None


if __name__ == "__main__":
    down()