# CFVideoScraper/CinemScraper/spiders/grabVideoData.py

# -*- coding: utf-8 -*-
import scrapy


class GrabvideodataSpider(scrapy.Spider):
    """Spider that scrapes video pages linked from the start page.

    ``parse`` handles the start URL and schedules a request for every link
    whose ``href`` contains ``"video"``; ``parse_dir_content`` turns each
    fetched page into one item per ``div#content`` element.
    """

    name = 'grabVideoData'
    allowed_domains = ['cinematheque.fr']
    start_urls = ['http://www.cinematheque.fr/decouvrir.html']

    def parse(self, response):
        """Yield a request for each link whose href contains "video".

        Args:
            response: the Scrapy response for one of ``start_urls``.

        Yields:
            scrapy.Request: one request per matching href, resolved against
            the response URL and handled by ``parse_dir_content``.
        """
        # Iterate over the matching href values themselves. Going through
        # the link's parent element and taking the first anchor found there
        # can follow a non-video link and skip the other video links that
        # share the same parent.
        video_hrefs = response.xpath('//a/@href[contains(., "video")]')
        for href in video_hrefs.extract():
            yield scrapy.Request(
                response.urljoin(href),
                callback=self.parse_dir_content,
            )

    def parse_dir_content(self, response):
        """Yield one item dict per ``div#content`` element of the page.

        Args:
            response: the Scrapy response for a page requested by ``parse``.

        Yields:
            dict: keys ``titre``, ``sous-titre``, ``description``,
            ``videoSrcUrl``, ``articleUrl`` and ``tags``. Values taken with
            ``extract_first()`` are single strings; those taken with
            ``extract()`` are lists of strings.
        """
        for page in response.css('div#content'):
            yield {
                'titre': page.css('h1::text').extract_first(),
                'sous-titre': page.css('h1 span::text').extract_first(),
                # No ::text here: the whole <p> elements are extracted,
                # markup included.
                'description': page.css('.biographies p').extract(),
                'videoSrcUrl': page.css('iframe::attr(src)').extract_first(),
                'articleUrl': response.url,
                'tags': page.css('.tag::text').extract(),
            }