100字范文 > python爬虫—豆瓣电影海报（按类别）

python爬虫—豆瓣电影海报（按类别）

时间：2019-01-20 23:57:40

原文地址：//04/06/getdouban/

python爬虫—豆瓣电影海报

目标：通过python爬虫在豆瓣电影上按类别对电影海报等数据进行抓取，可以通过更改参数控制抓取数据的数量。

库：python3.7+selenium+BeautifulSoup

ChromeDriver（百度网盘）：https://pan.baidu.com/s/1MBG_AAx-gY5Z5Vc0Qv383A 提取码：69eh

由于豆瓣电影的类别网页是动态加载的，无法通过requests直接抓取并分析网页，因此先用selenium在浏览器中动态加载网页，再获取加载后的网页源码进行分析。

由于豆瓣存在反爬虫策略，如果频繁地向豆瓣发送请求，豆瓣会封禁ip。为了避免被豆瓣侦测到，本人采用了一种较为笨拙的方式：在每次请求数据之间设置一定的延迟，避免过于频繁地获取数据。延迟设为2s即可，但这会导致数据获取速度较慢。

# Collect poster URL, Douban id and title for every movie card on the page.
for item in items:
    img = item.find('img')
    movie = {
        'img': img['src'],
        'id': item['data-id'],
        'title': img['alt'],
    }
    # Pause 2 s between movies so requests are not sent too frequently
    # (the article uses this delay to avoid Douban's IP ban).
    time.sleep(2)

最终获取的数据包含：

1、电影海报，命名格式为“电影id.图片扩展名”（例如 id.jpg）

2、info.csv（电影id、电影名称、上映年份、评分、类型、导演、编剧、主演、国家）

可自行修改网页分析部分获取更多的数据

代码如下：

# -*- coding:utf-8 -*-
"""Crawl Douban movie posters and basic metadata, one genre at a time.

A Chrome window driven by Selenium loads the genre overview page, clicks the
"load more" link repeatedly, then visits each movie's detail page in a second
tab.  Posters are saved under ``poster_img/`` and one CSV row per movie is
appended to ``poster_img/info.csv``.
"""
import csv
import glob
import os
import re
import time

import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import (
    ElementNotInteractableException,
    ElementNotVisibleException,
    NoSuchElementException,
)
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait

# Host of the Douban movie site; the overview and detail URLs are built on it.
BASE_URL = 'https://movie.douban.com'
# Directory that receives the posters and info.csv.
POSTER_DIR = 'poster_img'
# Upper bound on "load more" clicks per genre page.
MAX_LOAD_MORE_CLICKS = 150
# Seconds to wait for a poster download before giving up.
REQUEST_TIMEOUT = 10
# Matches the "country/region" entry both in raw HTML (label followed by a
# closing </span>) and in plain text; captures up to the next tag or newline.
COUNTRY_PATTERN = re.compile(r'制片国家/地区:(?:</span>)?\s*([^<\n]+)')


class DBDY:
    """Selenium-based crawler for Douban movie posters and metadata."""

    def __init__(self):
        """Start Chrome and open a second tab used for movie detail pages."""
        self.driver = webdriver.Chrome()
        self.wait = WebDriverWait(self.driver, 20)
        # Tab 0 shows the genre overview, tab 1 the individual movie pages.
        self.driver.execute_script('window.open("");')

    def open_page(self, type):
        """Open the overview page for one genre and crawl it.

        Args:
            type: Genre tag appended to the overview URL (e.g. '犯罪').
                The parameter name shadows the builtin but is kept so
                existing keyword callers continue to work.
        """
        url = BASE_URL + '/tag/#/?sort=U&range=0,10&tags=电影,' + type
        driver = self.driver
        driver.switch_to.window(driver.window_handles[0])  # overview tab
        driver.get(url)
        driver.refresh()
        time.sleep(5)  # give the dynamically loaded list time to render
        self.get_response(type)

    def _load_more(self, max_clicks=MAX_LOAD_MORE_CLICKS):
        """Click the 'more' link until it is gone, unusable, or the cap is hit."""
        driver = self.driver
        for count in range(max_clicks):
            try:
                print('page: ', count)
                driver.find_element(By.CLASS_NAME, 'more').click()
            except (NoSuchElementException,
                    ElementNotVisibleException,
                    ElementNotInteractableException):
                # No further pages can be loaded.
                break
            time.sleep(3)  # wait for the next batch of movies to load

    def get_response(self, type):
        """Expand the overview list, then download poster and info per movie.

        Args:
            type: Genre tag of the page currently shown in the overview tab.
        """
        driver = self.driver
        self._load_more()

        soup = BeautifulSoup(driver.page_source, 'html.parser')
        items = soup.find_all('div', class_='cover-wp')  # one per movie card
        print('打印网页信息')
        for item in items:
            img = item.find('img')
            if img is None:
                # A card without an image carries no poster URL or title.
                continue
            movie = {
                'img': img['src'],
                'id': item['data-id'],
                'title': img['alt'],
            }
            # Throttle: the article uses a 2 s delay to avoid Douban's IP ban.
            time.sleep(2)
            already_saved = self.download_poster_img(
                movie['img'], type, movie['id'])
            if not already_saved:
                # Only movies not seen in an earlier run get a CSV row.
                self.get_info(movie['title'], type, movie['id'],
                              BASE_URL + '/subject/')

    def get_info(self, title, movie_type, movie_id, url):
        """Read one movie's detail page and append it to the CSV file.

        If an expected element is missing, a message is printed, the movie's
        poster is deleted and nothing is recorded.

        Args:
            title: Movie title taken from the overview page.
            movie_type: Genre tag, passed through as the ``tag`` query value.
            movie_id: Douban subject id (string).
            url: Prefix of the detail URL, ending in '/subject/'.
        """
        driver = self.driver
        driver.switch_to.window(driver.window_handles[1])  # detail tab
        driver.get(url + movie_id + '/?tag=' + movie_type + '&from=gaia')
        try:
            pagesource = driver.page_source
            info = driver.find_element(By.ID, 'info').find_elements(
                By.TAG_NAME, 'span')
            rating = driver.find_element(By.TAG_NAME, 'strong').text
            year = driver.find_element(By.CLASS_NAME, 'year').text
            year = year.replace('(', '').replace(')', '')

            # Keep only the names after the label and drop the trailing
            # "more..." link text.
            actor = driver.find_element(By.CLASS_NAME, 'actor').text
            actor = actor.split(':')[-1].replace(' ', '')
            actor = actor.replace('/更多...', '')

            genre_spans = driver.find_elements(
                By.XPATH, ".//span[@property='v:genre']")
            # Every genre is followed by '/', including the last one, to keep
            # the CSV format of earlier runs.
            genres = ''.join(span.text + '/' for span in genre_spans)

            # The country is not wrapped in its own element, so it is pulled
            # out of the page source with a regular expression.
            country_match = COUNTRY_PATTERN.search(pagesource)
            if country_match is None:
                raise LookupError('country/region not found')
            country = country_match.group(1).replace(' ', '')

            movie = {
                'id': movie_id,
                'title': title,
                'year': year,
                'rating': rating,
                'type': genres,
                # Assumes spans 2 and 5 of #info hold the director and
                # screenwriter names -- TODO confirm against the live page.
                'director': info[2].text,
                'screenwriter': info[5].text,
                'actor': actor,
                'country': country,
            }
        except (NoSuchElementException, LookupError):
            # LookupError also covers IndexError from info[2] / info[5].
            print('该电影信息不完善')
            self._discard_poster(movie_id)
            return
        print(movie)
        self.record_info(movie)

    @staticmethod
    def _discard_poster(movie_id):
        """Delete the saved poster of *movie_id*, whatever its extension."""
        pattern = os.path.join(POSTER_DIR, glob.escape(movie_id) + '.*')
        for path in glob.glob(pattern):
            try:
                os.remove(path)
            except FileNotFoundError:
                pass  # already gone; nothing to clean up

    def download_poster_img(self, url, type, movie_id):
        """Save the poster at *url* unless a file for it already exists.

        Args:
            url: Poster image URL; its extension is reused for the file.
            type: Genre tag (unused, kept for interface compatibility).
            movie_id: Douban subject id used as the file name.

        Returns:
            True if the poster file already existed before this call,
            otherwise False (also when the download failed).
        """
        img_type = url.split('/')[-1].split('.')[-1]
        file_path = os.path.join(POSTER_DIR, movie_id + '.' + img_type)
        if os.path.exists(file_path):
            return True

        os.makedirs(POSTER_DIR, exist_ok=True)
        print('download img file_path = ', file_path)
        try:
            res = requests.get(url, timeout=REQUEST_TIMEOUT)
            res.raise_for_status()
        except requests.RequestException as err:
            # Do not write an error page to disk as if it were an image.
            print('download failed: ', err)
            return False
        with open(file_path, 'wb') as f:
            f.write(res.content)
        return False

    def record_info(self, movie):
        """Append one movie as a row to ``poster_img/info.csv``.

        Args:
            movie: Dict with the keys id, title, year, rating, type,
                director, screenwriter, actor and country.
        """
        os.makedirs(POSTER_DIR, exist_ok=True)
        row = [
            movie['id'],
            movie['title'],
            movie['year'],
            movie['rating'],
            movie['type'],
            movie['director'].replace(' / ', '/'),
            movie['screenwriter'].replace(' / ', '/'),
            movie['actor'].replace(' / ', '/'),
            movie['country'],
        ]
        csv_path = os.path.join(POSTER_DIR, 'info.csv')
        # utf-8-sig writes a BOM so Excel detects the encoding.
        with open(csv_path, 'a', newline='', encoding='utf-8-sig') as file:
            csv.writer(file, dialect='excel').writerow(row)


def main():
    """Crawl every configured genre, then close the browser."""
    # Full genre list: ['剧情', '喜剧', '动作', '爱情', '科幻', '动画',
    #                   '悬疑', '惊悚', '恐怖', '犯罪', '战争']
    movie_type = ['犯罪', '战争']
    db = DBDY()
    try:
        for category in movie_type:
            print('正在爬取' + category + '电影')
            db.open_page(category)
    finally:
        # Close the browser even if the crawl is interrupted by an error.
        db.driver.quit()


if __name__ == '__main__':
    main()

本内容不代表本网观点和政治立场，如有侵犯你的权益请联系我们处理。

网友评论

网友评论仅供其表达个人看法，并不表明网站立场。