Compare commits

..

No commits in common. "90c1404ca55b8063827868bae31201c46990ba76" and "880cb67572a5565c50cd459ca86b2d28cb72603f" have entirely different histories.

2 changed files with 14 additions and 68 deletions

View File

@ -6,48 +6,14 @@
# https://doc.scrapy.org/en/latest/topics/items.html # https://doc.scrapy.org/en/latest/topics/items.html
import scrapy import scrapy
from scrapy.loader.processors import Join, MapCompose, TakeFirst
# Lowercase French month names mapped to their two-digit month number.
# Built once at import time instead of on every call.
_FRENCH_MONTHS = {
    'janvier': '01',
    'février': '02',
    'mars': '03',
    'avril': '04',
    'mai': '05',
    'juin': '06',
    'juillet': '07',
    'août': '08',
    'septembre': '09',
    'octobre': '10',
    'novembre': '11',
    'décembre': '12',
}


def format_date(value):
    """Turn a French textual date into ``day-MM-year``.

    This is a generator: it yields exactly one string per call, so it can
    be chained as a processor step that expects an iterable result.

    Args:
        value: Text such as ``'12 mars 2018'`` (day, French month name,
            year, separated by whitespace). Tokens after the third one
            are ignored.

    Yields:
        ``'<day>-<MM>-<year>'`` when the second token is a known French
        month name (matched case-insensitively); otherwise ``value``
        unchanged, so unexpected input is kept instead of raising
        ``KeyError``/``IndexError``.
    """
    # split() without an argument collapses runs of whitespace and ignores
    # leading/trailing blanks, so doubled spaces no longer produce empty
    # tokens that shift the day/month/year positions.
    tokens = value.split()

    month_number = None
    if len(tokens) >= 3:
        month_number = _FRENCH_MONTHS.get(tokens[1].lower())

    if month_number is None:
        # Not a recognisable "day month year" date: pass it through as-is.
        yield value
        return

    yield '-'.join((tokens[0], month_number, tokens[2]))
def clean_text(value):
    """Strip a scraped text fragment and drop it if nothing is left.

    This is a generator: it yields the cleaned string, or nothing at all
    when the cleaned string is empty. Skipping empty results keeps
    whitespace-only fragments from reaching later processing steps as
    ``''`` values.

    Args:
        value: The raw text fragment.

    Yields:
        ``value`` with every backslash + ``n`` sequence removed and
        surrounding whitespace stripped, if the result is non-empty.
    """
    # '\\n' is the two-character sequence backslash + 'n', not a newline
    # character; real newlines are only removed at the ends, by strip().
    # NOTE(review): confirm that removing the literal sequence (rather
    # than actual line breaks) is what is intended here.
    cleaned = value.replace('\\n', '').strip()
    if cleaned:
        yield cleaned
def clean_url(value):
    """Yield ``value`` cut off just before its first ``?``.

    This is a generator producing exactly one string: everything that
    precedes the first question mark, or the whole of ``value`` when it
    contains none.
    """
    before_query, _separator, _query = value.partition('?')
    yield before_query
class video(scrapy.Item): class video(scrapy.Item):
title = scrapy.Field( titre = scrapy.Field()
input_processor = MapCompose(clean_text), sousTitre = scrapy.Field()
output_processor = Join()
)
secondary_title = scrapy.Field()
description = scrapy.Field() description = scrapy.Field()
urlVideo = scrapy.Field( urlVideo = scrapy.Field()
input_processor = MapCompose(clean_url)
)
urlCF = scrapy.Field() urlCF = scrapy.Field()
date_event = scrapy.Field( dateUpload = scrapy.Field()
input_processor = MapCompose(clean_text, format_date)
)
tags = scrapy.Field() tags = scrapy.Field()
biographies = scrapy.Field()
pass pass

View File

@ -1,46 +1,26 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
import scrapy import scrapy
from scrapy.loader import ItemLoader
from scrapy.loader.processors import Join, MapCompose, TakeFirst
from w3lib.html import remove_tags
from CinemScraper.items import video
class GrabvideodataSpider(scrapy.Spider): class GrabvideodataSpider(scrapy.Spider):
name = 'grabVideoData' name = 'grabVideoData'
allowed_domains = ['cinematheque.fr'] allowed_domains = ['cinematheque.fr']
start_urls = ['http://www.cinematheque.fr/decouvrir.html'] start_urls = ['http://www.cinematheque.fr/decouvrir.html']
item_fields = {
'title' : '//h1/text()',
'secondary_title' : '//h1/span[@class="sub"]/text()',
'date_event' : '//p[@class="date"]/text()',
'urlVideo' : '//iframe/@src',
'description' : '//div[@class="description"]/p',
'biographies' : '//div[@class="biographies"]',
'tags' : '//span[contains(@class, "tag")]/text()'
}
content_xpath = '//div[@id="content"]'
def parse(self, response): def parse(self, response):
for lien in response.xpath('//a/@href[contains(.,"video")]/../..'): for lien in response.xpath('//a/@href[contains(.,"video")]/../..'):
url = response.urljoin(lien.css('a::attr(href)').extract_first()) url = response.urljoin(lien.css('a::attr(href)').extract_first())
yield scrapy.Request(url, callback = self.parse_dir_content) yield scrapy.Request(url, callback = self.parse_dir_content)
def parse_dir_content(self, response): def parse_dir_content(self, response):
hxs = scrapy.Selector(response) for page in response.css("div#content"):
yield {
for page in hxs.xpath(self.content_xpath): 'titre' : page.css('h1::text').extract_first().strip(),
loader = ItemLoader(item=video(), selector=page) 'sous-titre' : page.css('h1 span::text').extract_first(),
# mettre des processeurs d'entrée ici 'description' : page.css('.description p').extract(),
# loader.default_input_processor = MapCompose(remove_tags) 'biographies' : page.css('.biographies p').extract(),
loader.default_output_processor = Join() 'videoSrcUrl' : page.css('iframe::attr(src)').re_first(r'\w[\w\.\/]+'),
'articleUrl' : response.url,
# iteration des champs de l'item video 'tags' : page.css('.tag::text').re(r'[\n]')
for field, xpath in self.item_fields.items(): }
loader.add_xpath(field, xpath)
loader.add_value('urlCF', response.url)
yield loader.load_item()