
Writing an automatic crawler with Python's Scrapy framework to scrape JD product information and write it to a database


Goal: from a JD product detail page, collect the product title, shop name, shop link, price, positive-review rate, and comment count.

Open any JD product detail page and view the page source: the product title, shop name, and shop link are right there in the HTML and can be extracted directly. The price, however, is not in the source, which means it is loaded separately, so we need to capture and analyze the network requests.

The product's link:

/100003395443.html

Capturing the traffic yields the following two links:

/prices/mgets?callback=jQuery8092423&type=1&area=18_1522_29460_31350&pdtk=&pduid=1094136628&pdpin=jd_66b27ab550846&pin=jd_66b27ab550846&pdbp=0&skuIds=J_100003395443%2C&ext=11100000&source=item-pc

/comment/productCommentSummaries.action?referenceIds=100003395443&callback=jQuery2313070&_=1564707435635

The first link contains the price. Analyzing this URL shows that 100003395443 is the product ID; substituting a different ID gives the price URL for another product. The second link contains the positive-review rate and the comment count.
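Before wiring this into Scrapy, here is a minimal standalone sketch of the idea. The hosts of the two endpoints are omitted here just as they are in the URLs above, so they are passed in as placeholder parameters; the field names "p", GoodRateShow and CommentCountStr are the ones visible in the captured responses:

from urllib import request
import re

def fetch_price_and_comments(sku_id, price_base, comment_base):
    # price_base / comment_base are placeholders for the two captured hosts,
    # which are omitted here as they are in the URLs above
    price_url = (price_base + "/prices/mgets?callback=jQuery8092423&type=1"
                 "&area=18_1522_29460_31350&pdtk=&pduid=1094136628"
                 "&pdpin=jd_66b27ab550846&pin=jd_66b27ab550846&pdbp=0"
                 "&skuIds=J_" + str(sku_id) + "%2C&ext=11100000&source=item-pc")
    comment_url = (comment_base + "/comment/productCommentSummaries.action?referenceIds="
                   + str(sku_id) + "&callback=jQuery2313070&_=1564707435635")
    price_data = request.urlopen(price_url).read().decode("utf-8", "ignore")
    comment_data = request.urlopen(comment_url).read().decode("utf-8", "ignore")
    price = re.findall(r'"p":"(.*?)"', price_data)[0]                      # price
    good_rate = re.findall(r'GoodRateShow":(.*?),', comment_data)[0]       # positive-review rate
    comments = re.findall(r'CommentCountStr":"(.*?)"', comment_data)[0]    # comment count
    return price, good_rate, comments

The same extraction is used again inside the spider below. The Scrapy project code follows.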

items.py:

import scrapy

class JingdongItem(scrapy.Item):
    title = scrapy.Field()
    shop = scrapy.Field()
    shoplink = scrapy.Field()
    price = scrapy.Field()
    GoodRateShow = scrapy.Field()
    CommentCountStr = scrapy.Field()

Write the spider:

First, write a start function:

def start_requests(self):
    headers = {"Cookie": "your cookie value"}
    url = "/"
    yield Request(url=url, headers=headers)

rules = (
    # To reach every product's detail page, don't restrict allow; just follow every URL.
    Rule(LinkExtractor(allow=r''), callback='parse_item', follow=True),
)

Each product detail page has a URL of the form: /100003395443.html

The number in the URL is the product ID, so we let the crawler follow links automatically, enter every page, and check whether the URL matches the detail-page pattern.
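As a quick standalone check that the pattern picks out the product ID (the URL below is the example shown above):

import re

pat = "/(.*?).html"            # the same pattern used in the spider below
url = "/100003395443.html"     # example product detail URL
if re.search(pat, url):
    sku_id = re.compile(pat).findall(url)[0]
    print(sku_id)              # prints 100003395443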

Write the page-parsing callback:

def parse_item(self, response):
    try:
        item = JingdongItem()
        thisurl = response.url  # URL of the current page
        pat = "/(.*?).html"  # pattern a product detail page URL must match
        x = re.search(pat, thisurl)  # check whether this URL is a detail page
        if x:  # it matches the detail-page pattern, so proceed
            thisid = re.compile(pat).findall(thisurl)[0]  # the product ID, i.e. the number in the detail URL
            item["title"] = response.xpath("//html/head/title/text()").extract()  # title
            item["shop"] = response.xpath("//div[@class='name']/a[@target='_blank']/text()").extract()  # shop name
            item["shoplink"] = response.xpath("//div[@class='name']/a/@href").extract()  # shop link
            priceurl = "/prices/mgets?callback=jQuery8092423&type=1&area=18_1522_29460_31350&pdtk=&pduid=1094136628&pdpin=jd_66b27ab550846&pin=jd_66b27ab550846&pdbp=0&skuIds=J_" + str(thisid) + "%2C&ext=11100000&source=item-pc"
            pricedata = request.urlopen(priceurl).read().decode("utf-8", "ignore")
            item["price"] = re.findall(r'"p":"(.*?)"', pricedata)[0]  # price
            Data_url = "/comment/productCommentSummaries.action?referenceIds=" + str(thisid) + "&callback=jQuery2313070&_=1564707435635"
            GoodRateShowData = request.urlopen(Data_url).read().decode("utf-8", "ignore")
            item["GoodRateShow"] = re.findall(r'GoodRateShow":(.*?),', GoodRateShowData)[0]  # positive-review rate
            item["CommentCountStr"] = re.findall(r'CommentCountStr":"(.*?)"', GoodRateShowData)[0]  # comment count
            if item["price"] == "-1.00":
                # some price URLs fail and return -1.00, so skip those items
                pass
            else:
                return item
        else:
            pass
    except:
        pass

After the values are extracted, the item is returned and handed to the pipeline for processing. The full spider code is:

# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from jingdong.items import JingdongItem
from scrapy.http import Request
from urllib import request
import re


class JdSpider(CrawlSpider):
    name = 'jd'
    allowed_domains = ['']
    """start_urls = ['/']"""

    def start_requests(self):
        headers = {"Cookie": "your cookie value"}
        url = ""
        yield Request(url=url, headers=headers)

    rules = (
        # To reach every product's detail page, don't restrict allow; just follow every URL.
        Rule(LinkExtractor(allow=r''), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        try:
            item = JingdongItem()
            thisurl = response.url  # URL of the current page
            pat = "/(.*?).html"  # pattern a product detail page URL must match
            x = re.search(pat, thisurl)  # check whether this URL is a detail page
            if x:  # it matches the detail-page pattern, so proceed
                thisid = re.compile(pat).findall(thisurl)[0]  # the product ID, i.e. the number in the detail URL
                item["title"] = response.xpath("//html/head/title/text()").extract()  # title
                item["shop"] = response.xpath("//div[@class='name']/a[@target='_blank']/text()").extract()  # shop name
                item["shoplink"] = response.xpath("//div[@class='name']/a/@href").extract()  # shop link
                priceurl = "/prices/mgets?callback=jQuery8092423&type=1&area=18_1522_29460_31350&pdtk=&pduid=1094136628&pdpin=jd_66b27ab550846&pin=jd_66b27ab550846&pdbp=0&skuIds=J_" + str(thisid) + "%2C&ext=11100000&source=item-pc"
                pricedata = request.urlopen(priceurl).read().decode("utf-8", "ignore")
                item["price"] = re.findall(r'"p":"(.*?)"', pricedata)[0]  # price
                Data_url = "/comment/productCommentSummaries.action?referenceIds=" + str(thisid) + "&callback=jQuery2313070&_=1564707435635"
                GoodRateShowData = request.urlopen(Data_url).read().decode("utf-8", "ignore")
                item["GoodRateShow"] = re.findall(r'GoodRateShow":(.*?),', GoodRateShowData)[0]  # positive-review rate
                item["CommentCountStr"] = re.findall(r'CommentCountStr":"(.*?)"', GoodRateShowData)[0]  # comment count
                if item["price"] == "-1.00":
                    # some price URLs fail and return -1.00, so skip those items
                    pass
                else:
                    return item
            else:
                pass
        except:
            pass

In pipelines.py, simply write the item straight into the database:

import pymysql

class JingdongPipeline(object):
    def process_item(self, item, spider):
        db = pymysql.connect(host="127.0.0.1", port=3306, user="username", passwd="password", db="database name")
        cursor = db.cursor()
        title = item["title"][0]
        shop = item["shop"][0]
        shoplink = "http:" + item["shoplink"][0]
        price = item["price"]
        GoodRateShow = item["GoodRateShow"]
        CommentCountStr = item["CommentCountStr"]
        print(title)
        print(shop)
        print(shoplink)
        print(price)
        print(GoodRateShow)
        print(CommentCountStr)
        print("-" * 20)
        sql = "insert into shangping(title,shop,shoplink,price,GoodRateShow,CommentCountStr) values('" + title + "','" + shop + "','" + shoplink + "','" + price + "','" + GoodRateShow + "','" + CommentCountStr + "')"
        cursor.execute(sql)
        db.commit()
        cursor.close()
        db.close()
        return item
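The pipeline assumes a table named shangping already exists. A minimal sketch of creating a matching table is shown below; the column types are an assumption, since the article only names the columns:

import pymysql

# Column types are an assumption; the article does not give the schema.
db = pymysql.connect(host="127.0.0.1", port=3306, user="username", passwd="password", db="database name")
cursor = db.cursor()
cursor.execute("""
    CREATE TABLE IF NOT EXISTS shangping (
        title VARCHAR(255),
        shop VARCHAR(255),
        shoplink VARCHAR(255),
        price VARCHAR(32),
        GoodRateShow VARCHAR(32),
        CommentCountStr VARCHAR(64)
    )
""")
db.commit()
cursor.close()
db.close()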

Note: remember to configure the relevant options in settings.py. For example, ROBOTSTXT_OBEY should be changed to False.
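A minimal sketch of the relevant settings.py entries, assuming the project is named jingdong as in the imports above; ITEM_PIPELINES also needs to be enabled so that returned items actually reach the MySQL pipeline:

# settings.py (sketch)
ROBOTSTXT_OBEY = False   # do not obey robots.txt, as noted above

ITEM_PIPELINES = {
    'jingdong.pipelines.JingdongPipeline': 300,   # enable the MySQL pipeline
}

After that, the spider can be run from the project directory with: scrapy crawl jd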
