2018-05-10 17:31:59 +00:00
|
|
|
# -*- coding: utf-8 -*-
|
|
|
|
|
import scrapy
|
2019-08-28 01:24:20 +00:00
|
|
|
import re
|
2019-01-27 20:39:33 +00:00
|
|
|
from scrapy.loader import ItemLoader
|
2019-01-28 00:07:58 +00:00
|
|
|
from scrapy.loader.processors import Join, MapCompose, TakeFirst
|
|
|
|
|
from w3lib.html import remove_tags
|
2018-05-10 17:31:59 +00:00
|
|
|
|
2019-01-27 20:39:33 +00:00
|
|
|
from CinemScraper.items import video
|
2018-05-10 17:31:59 +00:00
|
|
|
|
|
|
|
|
class GrabvideodataSpider(scrapy.Spider):
    """Spider that starts at the cinematheque.fr "decouvrir" page and scrapes
    the pages it links to into ``video`` items.

    ``parse`` schedules a request for links found on the start page, and
    ``parse_dir_content`` turns each fetched page into ``video`` items.
    """

    name = 'grabVideoData'
    allowed_domains = ['cinematheque.fr']
    start_urls = ['http://www.cinematheque.fr/decouvrir.html']

    # Item field name -> XPath handed to ``ItemLoader.add_xpath``.
    # 'description' and 'biographies' select whole <p> elements rather than
    # text nodes, so their values keep the HTML markup.
    item_fields = {
        'title': '//h1/text()',
        'secondary_title': '//h1/span[@class="sub"]/text()',
        'date_event': '//p[@class="date"]/text()',
        'urlVideo': '//iframe/@src',
        'description': '//div[@class="description"]/p',
        'biographies': '//div[@class="biographies"]/p',
        'tags': '//span[contains(@class, "tag")]/text()',
    }

    # Container element(s) for which one item is produced per page.
    content_xpath = '//div[@id="content"]'

    # Captures the digits that directly precede ".html" in a URL path.
    # Compiled once here instead of on every loop iteration; the dot is
    # escaped so that only a literal ".html" suffix matches.
    video_id_re = re.compile(r'/(\d+)\.html')

    def parse(self, response):
        """Yield a request for each link block whose href contains "video".

        Args:
            response: the downloaded start page.

        Yields:
            scrapy.Request objects handled by ``parse_dir_content``.
        """
        # The XPath selects the grandparent node of every matching @href
        # attribute, i.e. the element that encloses the <a>.
        # NOTE(review): the CSS query below takes the first <a href> found
        # under that element, which is not necessarily the link that
        # contained "video" -- confirm this is intended.
        for lien in response.xpath('//a/@href[contains(.,"video")]/../..'):
            url = response.urljoin(lien.css('a::attr(href)').extract_first())
            yield scrapy.Request(url, callback=self.parse_dir_content)

    def parse_dir_content(self, response):
        """Load one ``video`` item per content container of the page.

        Besides the XPath-driven fields of ``item_fields``, each item gets
        ``urlCF`` (the page URL) and ``id_video`` (the digits preceding
        ".html" in that URL).

        Args:
            response: the downloaded page.

        Yields:
            The loaded ``video`` items. Nothing is yielded when the URL
            carries no numeric id; a warning is logged instead of letting an
            IndexError escape from the callback.
        """
        # The id depends only on the URL, so look it up once per response.
        id_match = self.video_id_re.search(response.url)

        for page in response.xpath(self.content_xpath):
            if id_match is None:
                self.logger.warning(
                    'No video id found in URL %s; page skipped', response.url)
                return

            loader = ItemLoader(item=video(), selector=page)
            # Input processors would be configured here. Tag stripping is
            # currently disabled:
            # loader.default_input_processor = MapCompose(remove_tags)
            loader.default_output_processor = Join()

            # Populate every XPath-backed field of the video item.
            for field, xpath in self.item_fields.items():
                loader.add_xpath(field, xpath)

            loader.add_value('urlCF', response.url)
            loader.add_value('id_video', id_match.group(1))
            yield loader.load_item()
|
|
|
|
|
|
2018-05-11 23:21:56 +00:00
|
|
|
|