diff --git a/CinemScraper/items.py b/CinemScraper/items.py index d08884d..f0339de 100644 --- a/CinemScraper/items.py +++ b/CinemScraper/items.py @@ -9,11 +9,12 @@ import scrapy class video(scrapy.Item): - titre = scrapy.Field() - sousTitre = scrapy.Field() + title = scrapy.Field() + secondary_title = scrapy.Field() description = scrapy.Field() urlVideo = scrapy.Field() urlCF = scrapy.Field() - dateUpload = scrapy.Field() + date_event = scrapy.Field() tags = scrapy.Field() + biographies = scrapy.Field() pass diff --git a/CinemScraper/spiders/grabVideoData.py b/CinemScraper/spiders/grabVideoData.py index cd470b5..05f9924 100644 --- a/CinemScraper/spiders/grabVideoData.py +++ b/CinemScraper/spiders/grabVideoData.py @@ -1,26 +1,45 @@ # -*- coding: utf-8 -*- import scrapy +from scrapy.loader import ItemLoader +from scrapy.loader.processors import Join, MapCompose +from CinemScraper.items import video class GrabvideodataSpider(scrapy.Spider): name = 'grabVideoData' allowed_domains = ['cinematheque.fr'] start_urls = ['http://www.cinematheque.fr/decouvrir.html'] + item_fields = { + 'title' : '//h1/text()', + 'secondary_title' : '//h1/span[@class="sub"]/text()', + 'date_event' : '//p[@class="date"]/text()', + 'urlVideo' : '//iframe/@src', + #'urlCF' : 'response.url', + 'description' : '//div[@class="description"]', + 'biographies' : '//div[@class="biographies"]' + } + + content_xpath = '//div[@id="content"]' + def parse(self, response): for lien in response.xpath('//a/@href[contains(.,"video")]/../..'): url = response.urljoin(lien.css('a::attr(href)').extract_first()) yield scrapy.Request(url, callback = self.parse_dir_content) def parse_dir_content(self, response): - for page in response.css("div#content"): - yield { - 'titre' : page.css('h1::text').extract_first().strip(), - 'sous-titre' : page.css('h1 span::text').extract_first(), - 'description' : page.css('.description p').extract(), - 'biographies' : page.css('.biographies p').extract(), - 'videoSrcUrl' : 
page.css('iframe::attr(src)').re_first(r'\w[\w\.\/]+'), - 'articleUrl' : response.url, - 'tags' : page.css('.tag::text').re(r'[\n]') - } + hxs = scrapy.Selector(response) + + for page in hxs.xpath(self.content_xpath): + loader = ItemLoader(item=video(), selector=page) + # input processors would go here +# loader.default_input_processor = MapCompose(unicode.strip) + loader.default_output_processor = Join() + + # add one XPath rule per field of the video item + for field, xpath in self.item_fields.items(): + loader.add_xpath(field, xpath) + loader.add_value('urlCF', response.url) + yield loader.load_item() +