CFVideoScraper/CinemScraper/spiders/grabVideoData.py

27 lines
1.1 KiB
Python
Raw Normal View History

2018-05-10 17:31:59 +00:00
# -*- coding: utf-8 -*-
import scrapy
class GrabvideodataSpider(scrapy.Spider):
    """Spider that follows "video" links from the start page and scrapes each target.

    The crawl begins at ``start_urls``; ``parse`` schedules one request per
    matching link, and ``parse_dir_content`` turns every ``div#content``
    element of the fetched page into one item (a plain ``dict``).
    """

    name = 'grabVideoData'
    allowed_domains = ['cinematheque.fr']
    start_urls = ['http://www.cinematheque.fr/decouvrir.html']

    def parse(self, response):
        """Yield a request for every link on the page whose href contains "video".

        Args:
            response: The downloaded start page.

        Yields:
            scrapy.Request: One request per matched link container, handled
            by ``parse_dir_content``.
        """
        # The XPath matches href attributes containing "video", then climbs
        # two levels: attribute -> its <a> element -> that element's parent.
        for lien in response.xpath('//a/@href[contains(.,"video")]/../..'):
            # NOTE(review): this takes the first <a> href found under the
            # parent container, which is not necessarily the "video" link
            # itself when the container holds several anchors -- confirm
            # against the page markup.
            url = response.urljoin(lien.css('a::attr(href)').extract_first())
            yield scrapy.Request(url, callback=self.parse_dir_content)

    def parse_dir_content(self, response):
        """Extract one item per ``div#content`` element of a video page.

        Args:
            response: The downloaded page reached from ``parse``.

        Yields:
            dict: Keys ``titre``, ``sous-titre``, ``description``,
            ``biographies``, ``videoSrcUrl``, ``articleUrl`` and ``tags``.
            ``titre``, ``sous-titre`` and ``videoSrcUrl`` are ``None`` when
            the corresponding element is absent.
        """
        for page in response.css("div#content"):
            # extract_first() returns None when there is no <h1> text; guard
            # before stripping so a page without a title does not raise
            # AttributeError and lose the whole item.
            titre = page.css('h1::text').extract_first()
            if titre is not None:
                titre = titre.strip()

            # Collect the tag labels themselves. The previous pattern
            # r'[\n]' matched only newline characters, so the field never
            # contained any tag text. Whitespace-only text nodes are dropped.
            tags = [
                texte.strip()
                for texte in page.css('.tag::text').extract()
                if texte.strip()
            ]

            yield {
                'titre': titre,
                'sous-titre': page.css('h1 span::text').extract_first(),
                # Raw HTML of each matching <p>, not just its text.
                'description': page.css('.description p').extract(),
                'biographies': page.css('.biographies p').extract(),
                # First run of word characters, dots and slashes that starts
                # with a word character; this skips leading slashes in the
                # iframe src and stops at any other character (':', '-',
                # '?', ...).
                # NOTE(review): presumably meant for protocol-relative
                # sources ("//host/path") -- confirm against real pages.
                'videoSrcUrl': page.css('iframe::attr(src)').re_first(r'\w[\w\.\/]+'),
                'articleUrl': response.url,
                'tags': tags,
            }