# -*- coding: utf-8 -*-
import scrapy
from scrapy.loader import ItemLoader
from scrapy.loader.processors import Join, MapCompose, TakeFirst
from w3lib.html import remove_tags
from CinemScraper.items import video


class GrabvideodataSpider(scrapy.Spider):
    """Crawl the cinematheque.fr discovery page and scrape its video pages.

    ``parse`` follows every link on the start page whose ``href`` contains
    "video"; ``parse_dir_content`` then builds one ``video`` item for each
    node matched by ``content_xpath`` on the linked page.
    """

    name = 'grabVideoData'
    allowed_domains = ['cinematheque.fr']
    start_urls = ['http://www.cinematheque.fr/decouvrir.html']

    # Item field name -> XPath used to populate it.
    # NOTE(review): these expressions start with `//`, so they are evaluated
    # against the whole document even though the loader is bound to the
    # content node. Switch to `.//` if the fields must be scoped to it.
    item_fields = {
        'title': '//h1/text()',
        'secondary_title': '//h1/span[@class="sub"]/text()',
        'date_event': '//p[@class="date"]/text()',
        'urlVideo': '//iframe/@src',
        'description': '//div[@class="description"]/p',
        'biographies': '//div[@class="biographies"]',
        'tags': '//span[contains(@class, "tag")]/text()',
    }

    # Node(s) of a video page from which one item each is built.
    content_xpath = '//div[@id="content"]'

    def parse(self, response):
        """Yield a request for every link whose ``href`` contains "video".

        :param response: response for one of ``start_urls``.
        :returns: generator of ``scrapy.Request`` objects handled by
            ``parse_dir_content``.
        """
        # Read the matching href values directly. Climbing to the anchor's
        # parent and re-selecting its first <a> would skip sibling video
        # links and could pick up an unrelated first link.
        video_hrefs = response.xpath('//a/@href[contains(.,"video")]').extract()
        for href in video_hrefs:
            yield scrapy.Request(
                response.urljoin(href),
                callback=self.parse_dir_content,
            )

    def parse_dir_content(self, response):
        """Build one ``video`` item per content node of a video page.

        :param response: response for a video page requested by ``parse``.
        :returns: generator of loaded ``video`` items.
        """
        for content_node in response.xpath(self.content_xpath):
            loader = ItemLoader(item=video(), selector=content_node)

            # Input processors would go here. Stripping markup with
            # MapCompose(remove_tags) as the default input processor is
            # currently disabled, so element-valued fields keep their HTML.
            # Collapse each field's list of matches into a single string.
            loader.default_output_processor = Join()

            # Populate every declared field of the video item.
            for field, xpath in self.item_fields.items():
                loader.add_xpath(field, xpath)

            # Record the page the item was scraped from.
            loader.add_value('urlCF', response.url)
            yield loader.load_item()