Mise en place et utilisation de l'item "vidéo"

master
Yohann Dedy 2019-01-27 21:39:33 +01:00
parent 880cb67572
commit 898f3b83ed
2 changed files with 33 additions and 13 deletions

View File

@@ -9,11 +9,12 @@ import scrapy
class video(scrapy.Item): class video(scrapy.Item):
titre = scrapy.Field() title = scrapy.Field()
sousTitre = scrapy.Field() secondary_title = scrapy.Field()
description = scrapy.Field() description = scrapy.Field()
urlVideo = scrapy.Field() urlVideo = scrapy.Field()
urlCF = scrapy.Field() urlCF = scrapy.Field()
dateUpload = scrapy.Field() date_event = scrapy.Field()
tags = scrapy.Field() tags = scrapy.Field()
biographies = scrapy.Field()
pass pass

View File

@@ -1,26 +1,45 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
import scrapy import scrapy
from scrapy.loader import ItemLoader
from scrapy.loader.processors import Join, MapCompose
from CinemScraper.items import video
class GrabvideodataSpider(scrapy.Spider): class GrabvideodataSpider(scrapy.Spider):
name = 'grabVideoData' name = 'grabVideoData'
allowed_domains = ['cinematheque.fr'] allowed_domains = ['cinematheque.fr']
start_urls = ['http://www.cinematheque.fr/decouvrir.html'] start_urls = ['http://www.cinematheque.fr/decouvrir.html']
item_fields = {
'title' : '//h1/text()',
'secondary_title' : '//h1/span[@class="sub"]/text()',
'date_event' : '//p[@class="date"]/text()',
'urlVideo' : '//iframe/@src',
#'urlCF' : 'response.url',
'description' : '//div[@class="description"]',
'biographies' : '//div[@class="biographies"]'
}
content_xpath = '//div[@id="content"]'
def parse(self, response): def parse(self, response):
for lien in response.xpath('//a/@href[contains(.,"video")]/../..'): for lien in response.xpath('//a/@href[contains(.,"video")]/../..'):
url = response.urljoin(lien.css('a::attr(href)').extract_first()) url = response.urljoin(lien.css('a::attr(href)').extract_first())
yield scrapy.Request(url, callback = self.parse_dir_content) yield scrapy.Request(url, callback = self.parse_dir_content)
def parse_dir_content(self, response): def parse_dir_content(self, response):
for page in response.css("div#content"): hxs = scrapy.Selector(response)
yield {
'titre' : page.css('h1::text').extract_first().strip(), for page in hxs.xpath(self.content_xpath):
'sous-titre' : page.css('h1 span::text').extract_first(), loader = ItemLoader(item=video(), selector=page)
'description' : page.css('.description p').extract(), # mettre des processeurs d'entrée ici
'biographies' : page.css('.biographies p').extract(), # loader.default_input_processor = MapCompose(unicode.strip)
'videoSrcUrl' : page.css('iframe::attr(src)').re_first(r'\w[\w\.\/]+'), loader.default_output_processor = Join()
'articleUrl' : response.url,
'tags' : page.css('.tag::text').re(r'[\n]') # iteration des champs de l'item video
} for field, xpath in self.item_fields.items():
loader.add_xpath(field, xpath)
loader.add_value('urlCF', response.url)
yield loader.load_item()