前言:
下面给大家介绍将爬取到的数据存入 MySQL 数据库的方法,数据来源是 Mooc 的课程列表。
代码实现:
items.py
from scrapy import Item, Field


class MoocspiderItem(Item):
    """Container for one course card scraped from the Mooc course list."""

    # define the fields for your item here like:
    # name = scrapy.Field()
    Url = Field()           # course page link (base_url + href in the spider)
    Title = Field()         # course title text
    Image_Url = Field()     # cover image src
    Student = Field()       # presumably the enrolled-student count text — verify in spider
    Introduction = Field()  # short course description
settings.py
# Scrapy settings for the MoocSpider project.

BOT_NAME = 'MoocSpider'

SPIDER_MODULES = ['MoocSpider.spiders']
NEWSPIDER_MODULE = 'MoocSpider.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
# USER_AGENT = 'MoocSpider (+)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = True

# MySQL connection parameters read by the pipelines in pipelines.py.
MYSQL_DB_NAME = 'python_data'
MYSQL_HOST = 'localhost'
MYSQL_USER = 'root'
MYSQL_PASSWORD = 'root'

# Route scraped items through the MySQL pipeline (priority 10).
ITEM_PIPELINES = {
    'MoocSpider.pipelines.MysqlPipeline': 10,
}
pipelines.py
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html

import json

# Twisted's adbapi provides a DB connection pool that plays nicely
# with Scrapy's asynchronous reactor.
from twisted.enterprise import adbapi

# NOTE: the MySQLdb driver (MySQL-python / mysqlclient) must be
# installed first; it is the library Python uses to talk to MySQL.
import MySQLdb
#使用了连接池的插入数据库函数
class MysqlPipeline(object):
    """Insert scraped items into MySQL through a twisted.enterprise.adbapi
    connection pool, so inserts do not block Scrapy's reactor."""

    def open_spider(self, spider):
        """Build the connection pool when the spider starts."""
        db = spider.settings.get('MYSQL_DB_NAME', 'python_data')
        host = spider.settings.get('MYSQL_HOST', 'localhost')
        # BUG FIX: settings.py defines MYSQL_USER / MYSQL_PASSWORD, not
        # MYSQL_DB_USER / MYSQL_DB_PASSWORD — the old keys never matched,
        # so the configured credentials were silently ignored.
        user = spider.settings.get('MYSQL_USER', 'root')
        passwd = spider.settings.get('MYSQL_PASSWORD', 'root')
        self.dbpool = adbapi.ConnectionPool(
            'MySQLdb',
            host=host,
            db=db,
            user=user,
            passwd=passwd,
            charset='utf8',
        )

    def close_spider(self, spider):
        """Release all pooled connections when the spider finishes."""
        self.dbpool.close()

    def process_item(self, item, spider):
        """Schedule an asynchronous insert and hand the item onward.

        BUG FIX: the original returned None, which drops the item for any
        later pipeline stage; Scrapy expects the item (or a Deferred) back.
        """
        self.dbpool.runInteraction(self.insert_db, item)
        return item

    def insert_db(self, tx, item):
        """Run inside a pool transaction: insert one row into `books`."""
        values = (
            item['Url'],
            item['Title'],
            item['Image_Url'],
            item['Student'],
            item['Introduction'],
        )
        # Parameterized query — the driver escapes the values.
        sql = 'INSERT INTO books VALUES (%s, %s, %s, %s, %s)'
        tx.execute(sql, values)
#普通的连接函数
class MysqlPipeline1(object):
    """Insert scraped items into MySQL over a single blocking MySQLdb
    connection (the plain, non-pooled variant of MysqlPipeline)."""

    def open_spider(self, spider):
        """Open the connection and cursor when the spider starts."""
        db = spider.settings.get('MYSQL_DB_NAME', 'python_data')
        host = spider.settings.get('MYSQL_HOST', 'localhost')
        port = 3306
        # BUG FIX: read MYSQL_USER / MYSQL_PASSWORD — the keys that
        # settings.py actually defines (not MYSQL_DB_USER/MYSQL_DB_PASSWORD).
        user = spider.settings.get('MYSQL_USER', 'root')
        passwd = spider.settings.get('MYSQL_PASSWORD', 'root')
        self.db_conn = MySQLdb.connect(
            host=host, port=port, db=db,
            user=user, passwd=passwd, charset='utf8',
        )
        self.db_cur = self.db_conn.cursor()

    def close_spider(self, spider):
        """BUG FIX: the original never closed the connection, leaking it
        for the life of the process; release cursor and connection here."""
        self.db_cur.close()
        self.db_conn.close()

    def process_item(self, item, spider):
        """Insert the item and commit; always pass the item onward."""
        try:
            self.insert_db(item)
            # BUG FIX: the garbled `mit()` was self.db_conn.commit() —
            # without the commit no INSERT is ever persisted.
            self.db_conn.commit()
        except Exception as error:
            # Best effort: report the failure but keep the crawl running.
            print(error)
        return item

    def insert_db(self, item):
        """Insert one row into `books` via the shared cursor."""
        values = (
            item['Url'],
            item['Title'],
            item['Image_Url'],
            item['Student'],
            item['Introduction'],
        )
        # Parameterized query — the driver escapes the values.
        sql = 'INSERT INTO books VALUES (%s, %s, %s, %s, %s)'
        self.db_cur.execute(sql, values)
MoocSpider.py
import scrapy
from scrapy.selector import Selector

from MoocSpider.items import MoocspiderItem


class MoocSpider(scrapy.Spider):
    """Crawl the Mooc course-list page and yield one MoocspiderItem per
    course card found on it."""

    name = 'MoocSpider'
    # NOTE(review): the domain and URLs were stripped out of the original
    # post — fill in the real host (e.g. the Mooc site) before running.
    allowed_domains = ['']
    start_urls = ['/course/list']

    def parse(self, response):
        """Extract one item per course card on the listing page."""
        # TODO: prefix for the relative course links — value lost in the source.
        base_url = ''
        selector = Selector(response)
        # FIX: the original built a throwaway MoocspiderItem before the loop
        # that was immediately shadowed — removed the unused instantiation.
        for card in selector.xpath('//div[@class="course-card-container"]'):
            item = MoocspiderItem()
            item['Url'] = base_url + card.xpath('.//a/@href').extract_first()
            item['Title'] = card.xpath('.//a/div/h3/text()').extract_first()
            item['Image_Url'] = card.xpath('.//a/div/img/@src').extract_first()
            # Second <span> under the card — presumably the student count;
            # verify against the page markup (raises IndexError if absent).
            item['Student'] = card.xpath('.//a/div/div/div/span/text()').extract()[1]
            item['Introduction'] = card.xpath('.//a/div/div/p/text()').extract_first()
            yield item
        # Follow the next-page link (left disabled in the original post):
        # url = response.xpath('//a[contains(text(), "下一页")]/@href').extract()
        # if url:
        #     yield scrapy.Request(base_url + url[0], callback=self.parse)
参考资料:《精通Scrapy网络爬虫》
转载请注明来自: