Setting up and using the "video" item
parent 880cb67572
commit 898f3b83ed
@@ -9,11 +9,12 @@ import scrapy

class video(scrapy.Item):
    titre = scrapy.Field()
    sousTitre = scrapy.Field()
    title = scrapy.Field()
    secondary_title = scrapy.Field()
    description = scrapy.Field()
    urlVideo = scrapy.Field()
    urlCF = scrapy.Field()
    dateUpload = scrapy.Field()
    date_event = scrapy.Field()
    tags = scrapy.Field()
    biographies = scrapy.Field()
    pass
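For reference (not part of the commit), a scrapy.Item behaves like a dict that only accepts its declared Fields; a minimal sketch of populating the new video item, with purely hypothetical values:

from CinemScraper.items import video

item = video()
item['title'] = 'Conference example'                          # hypothetical value
item['urlCF'] = 'http://www.cinematheque.fr/decouvrir.html'   # hypothetical value
item['tags'] = ['cinema', 'conference']                       # hypothetical values

print(dict(item))
# Assigning a key that was not declared as a Field raises KeyError,
# which is what catches typos between the spider output and the item schema.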
@@ -1,26 +1,45 @@
# -*- coding: utf-8 -*-
import scrapy
from scrapy.loader import ItemLoader
from scrapy.loader.processors import Join, MapCompose

from CinemScraper.items import video


class GrabvideodataSpider(scrapy.Spider):
    name = 'grabVideoData'
    allowed_domains = ['cinematheque.fr']
    start_urls = ['http://www.cinematheque.fr/decouvrir.html']

    item_fields = {
        'title': '//h1/text()',
        'secondary_title': '//h1/span[@class="sub"]/text()',
        'date_event': '//p[@class="date"]/text()',
        'urlVideo': '//iframe/@src',
        # 'urlCF': 'response.url',
        'description': '//div[@class="description"]',
        'biographies': '//div[@class="biographies"]'
    }

    content_xpath = '//div[@id="content"]'

    def parse(self, response):
        for lien in response.xpath('//a/@href[contains(.,"video")]/../..'):
            url = response.urljoin(lien.css('a::attr(href)').extract_first())
            yield scrapy.Request(url, callback=self.parse_dir_content)

    def parse_dir_content(self, response):
        for page in response.css("div#content"):
            yield {
                'titre': page.css('h1::text').extract_first().strip(),
                'sous-titre': page.css('h1 span::text').extract_first(),
                'description': page.css('.description p').extract(),
                'biographies': page.css('.biographies p').extract(),
                'videoSrcUrl': page.css('iframe::attr(src)').re_first(r'\w[\w\.\/]+'),
                'articleUrl': response.url,
                'tags': page.css('.tag::text').re(r'[\n]')
            }
        hxs = scrapy.Selector(response)

        for page in hxs.xpath(self.content_xpath):
            loader = ItemLoader(item=video(), selector=page)
            # input processors go here
            # loader.default_input_processor = MapCompose(unicode.strip)
            loader.default_output_processor = Join()

            # iterate over the video item's fields
            for field, xpath in self.item_fields.items():
                loader.add_xpath(field, xpath)
            loader.add_value('urlCF', response.url)
            yield loader.load_item()
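As a usage note, the spider can be started from the project root with the standard Scrapy command, for example scrapy crawl grabVideoData -o videos.json. On Python 3 the commented-out input processor would fail because the unicode built-in no longer exists; a minimal sketch of the equivalent cleaning, assuming Python 3 and the same imports already used in this file:

from scrapy.loader import ItemLoader
from scrapy.loader.processors import Join, MapCompose

from CinemScraper.items import video

# Sketch only, not part of the commit: str.strip replaces unicode.strip,
# and Join() keeps the default single-space separator.
loader = ItemLoader(item=video())
loader.default_input_processor = MapCompose(str.strip)
loader.default_output_processor = Join()

# Hypothetical values standing in for what loader.add_xpath() would extract.
loader.add_value('title', '  Conference example  ')
loader.add_value('tags', ['  cinema ', ' histoire '])
print(loader.load_item())
# Expected output (key order may vary): {'tags': 'cinema histoire', 'title': 'Conference example'}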