前言:
下面给大家介绍将爬取到的数据存入 MySQL 数据库的方法,数据来源是 Mooc 的课程列表。
代码实现:
items.py
from scrapy import Item, Field


class MoocspiderItem(Item):
    """Container for one course card scraped from the Mooc course list."""

    # define the fields for your item here like:
    # name = scrapy.Field()
    Url = Field()           # course page link (base_url + href in the spider)
    Title = Field()         # course title text
    Image_Url = Field()     # cover image src
    Student = Field()       # presumably the enrolled-student count text — verify in spider
    Introduction = Field()  # short course description
settings.py
# Scrapy settings for the MoocSpider project.

BOT_NAME = 'MoocSpider'

SPIDER_MODULES = ['MoocSpider.spiders']
NEWSPIDER_MODULE = 'MoocSpider.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
# USER_AGENT = 'MoocSpider (+)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = True

# MySQL connection parameters read by the pipelines in pipelines.py.
MYSQL_DB_NAME = 'python_data'
MYSQL_HOST = 'localhost'
MYSQL_USER = 'root'
MYSQL_PASSWORD = 'root'

# Route scraped items through the MySQL pipeline (priority 10).
ITEM_PIPELINES = {
    'MoocSpider.pipelines.MysqlPipeline': 10,
}
pipelines.py
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html

import json

# Twisted's adbapi provides a DB connection pool that plays nicely
# with Scrapy's asynchronous reactor.
from twisted.enterprise import adbapi

# NOTE: the MySQLdb driver (MySQL-python / mysqlclient) must be
# installed first; it is the library Python uses to talk to MySQL.
import MySQLdb
#使用了连接池的插入数据库函数
class MysqlPipeline(object):
    """Insert scraped items into MySQL through a twisted.enterprise.adbapi
    connection pool, so inserts do not block Scrapy's reactor."""

    def open_spider(self, spider):
        """Build the connection pool when the spider starts."""
        db = spider.settings.get('MYSQL_DB_NAME', 'python_data')
        host = spider.settings.get('MYSQL_HOST', 'localhost')
        # BUG FIX: settings.py defines MYSQL_USER / MYSQL_PASSWORD, not
        # MYSQL_DB_USER / MYSQL_DB_PASSWORD — the old keys never matched,
        # so the configured credentials were silently ignored.
        user = spider.settings.get('MYSQL_USER', 'root')
        passwd = spider.settings.get('MYSQL_PASSWORD', 'root')
        self.dbpool = adbapi.ConnectionPool(
            'MySQLdb',
            host=host,
            db=db,
            user=user,
            passwd=passwd,
            charset='utf8',
        )

    def close_spider(self, spider):
        """Release all pooled connections when the spider finishes."""
        self.dbpool.close()

    def process_item(self, item, spider):
        """Schedule an asynchronous insert and hand the item onward.

        BUG FIX: the original returned None, which drops the item for any
        later pipeline stage; Scrapy expects the item (or a Deferred) back.
        """
        self.dbpool.runInteraction(self.insert_db, item)
        return item

    def insert_db(self, tx, item):
        """Run inside a pool transaction: insert one row into `books`."""
        values = (
            item['Url'],
            item['Title'],
            item['Image_Url'],
            item['Student'],
            item['Introduction'],
        )
        # Parameterized query — the driver escapes the values.
        sql = 'INSERT INTO books VALUES (%s, %s, %s, %s, %s)'
        tx.execute(sql, values)
#普通的连接函数
class MysqlPipeline1(object):
    """Insert scraped items into MySQL over a single blocking MySQLdb
    connection (the plain, non-pooled variant of MysqlPipeline)."""

    def open_spider(self, spider):
        """Open the connection and cursor when the spider starts."""
        db = spider.settings.get('MYSQL_DB_NAME', 'python_data')
        host = spider.settings.get('MYSQL_HOST', 'localhost')
        port = 3306
        # BUG FIX: read MYSQL_USER / MYSQL_PASSWORD — the keys that
        # settings.py actually defines (not MYSQL_DB_USER/MYSQL_DB_PASSWORD).
        user = spider.settings.get('MYSQL_USER', 'root')
        passwd = spider.settings.get('MYSQL_PASSWORD', 'root')
        self.db_conn = MySQLdb.connect(
            host=host, port=port, db=db,
            user=user, passwd=passwd, charset='utf8',
        )
        self.db_cur = self.db_conn.cursor()

    def close_spider(self, spider):
        """BUG FIX: the original never closed the connection, leaking it
        for the life of the process; release cursor and connection here."""
        self.db_cur.close()
        self.db_conn.close()

    def process_item(self, item, spider):
        """Insert the item and commit; always pass the item onward."""
        try:
            self.insert_db(item)
            # BUG FIX: the garbled `mit()` was self.db_conn.commit() —
            # without the commit no INSERT is ever persisted.
            self.db_conn.commit()
        except Exception as error:
            # Best effort: report the failure but keep the crawl running.
            print(error)
        return item

    def insert_db(self, item):
        """Insert one row into `books` via the shared cursor."""
        values = (
            item['Url'],
            item['Title'],
            item['Image_Url'],
            item['Student'],
            item['Introduction'],
        )
        # Parameterized query — the driver escapes the values.
        sql = 'INSERT INTO books VALUES (%s, %s, %s, %s, %s)'
        self.db_cur.execute(sql, values)
MoocSpider.py
import scrapy
from scrapy.selector import Selector

from MoocSpider.items import MoocspiderItem


class MoocSpider(scrapy.Spider):
    """Crawl the Mooc course-list page and yield one MoocspiderItem per
    course card found on it."""

    name = 'MoocSpider'
    # NOTE(review): the domain and URLs were stripped out of the original
    # post — fill in the real host (e.g. the Mooc site) before running.
    allowed_domains = ['']
    start_urls = ['/course/list']

    def parse(self, response):
        """Extract one item per course card on the listing page."""
        # TODO: prefix for the relative course links — value lost in the source.
        base_url = ''
        selector = Selector(response)
        # FIX: the original built a throwaway MoocspiderItem before the loop
        # that was immediately shadowed — removed the unused instantiation.
        for card in selector.xpath('//div[@class="course-card-container"]'):
            item = MoocspiderItem()
            item['Url'] = base_url + card.xpath('.//a/@href').extract_first()
            item['Title'] = card.xpath('.//a/div/h3/text()').extract_first()
            item['Image_Url'] = card.xpath('.//a/div/img/@src').extract_first()
            # Second <span> under the card — presumably the student count;
            # verify against the page markup (raises IndexError if absent).
            item['Student'] = card.xpath('.//a/div/div/div/span/text()').extract()[1]
            item['Introduction'] = card.xpath('.//a/div/div/p/text()').extract_first()
            yield item
        # Follow the next-page link (left disabled in the original post):
        # url = response.xpath('//a[contains(text(), "下一页")]/@href').extract()
        # if url:
        #     yield scrapy.Request(base_url + url[0], callback=self.parse)
参考资料:《精通Scrapy网络爬虫》
转载请注明来自: