Compare commits
3 Commits
880cb67572
...
90c1404ca5
| Author | SHA1 | Date |
|---|---|---|
|
|
90c1404ca5 | |
|
|
90d40871a6 | |
|
|
898f3b83ed |
|
|
@ -6,14 +6,48 @@
|
||||||
# https://doc.scrapy.org/en/latest/topics/items.html
|
# https://doc.scrapy.org/en/latest/topics/items.html
|
||||||
|
|
||||||
import scrapy
|
import scrapy
|
||||||
|
from scrapy.loader.processors import Join, MapCompose, TakeFirst
|
||||||
|
|
||||||
|
# French month names (lower-case) mapped to their two-digit month number.
_FRENCH_MONTHS = {
    'janvier': '01',
    'février': '02',
    'mars': '03',
    'avril': '04',
    'mai': '05',
    'juin': '06',
    'juillet': '07',
    'août': '08',
    'septembre': '09',
    'octobre': '10',
    'novembre': '11',
    'décembre': '12',
}


def format_date(value):
    """Convert a French textual date into a ``day-month-year`` string.

    The input is expected to look like ``'12 janvier 2018'``: a day, a
    French month name and a year separated by whitespace.  The result for
    that example is ``'12-01-2018'``.  The day and year are emitted exactly
    as they appear in the input (no zero padding is applied).

    This is a generator that yields a single value.

    Args:
        value: The textual date to convert.

    Yields:
        The date formatted as ``<day>-<two-digit month>-<year>``.

    Raises:
        IndexError: If ``value`` holds fewer than three whitespace-separated
            parts.
        KeyError: If the second part is not a known French month name.
    """
    # Split on any run of whitespace so repeated spaces, tabs or stray
    # newlines do not produce empty parts.
    parts = value.split()
    # The month lookup is case-insensitive ("Août" and "août" both work).
    yield '-'.join((parts[0], _FRENCH_MONTHS[parts[1].lower()], parts[2]))
|
||||||
|
|
||||||
|
|
||||||
|
def clean_text(value):
    """Yield ``value`` without backslash-n sequences and outer whitespace.

    Every occurrence of the two-character sequence backslash + "n" is
    removed, then leading and trailing whitespace is stripped.  This is a
    generator that yields a single cleaned string.
    """
    # NOTE(review): the pattern below is a literal backslash followed by
    # "n", not a newline character, so real line breaks inside the text are
    # kept (only stripped at the ends) -- confirm this is intended.
    without_escapes = ''.join(value.split('\\n'))
    yield without_escapes.strip()
|
||||||
|
|
||||||
|
def clean_url(value):
    """Yield ``value`` truncated just before its first ``?``.

    Everything from the first question mark onwards (the query string) is
    dropped; a value without ``?`` is yielded unchanged.  This is a
    generator that yields a single string.
    """
    base, _separator, _query = value.partition('?')
    yield base
|
||||||
|
|
||||||
class video(scrapy.Item):
|
class video(scrapy.Item):
|
||||||
titre = scrapy.Field()
|
title = scrapy.Field(
|
||||||
sousTitre = scrapy.Field()
|
input_processor = MapCompose(clean_text),
|
||||||
|
output_processor = Join()
|
||||||
|
)
|
||||||
|
secondary_title = scrapy.Field()
|
||||||
description = scrapy.Field()
|
description = scrapy.Field()
|
||||||
urlVideo = scrapy.Field()
|
urlVideo = scrapy.Field(
|
||||||
|
input_processor = MapCompose(clean_url)
|
||||||
|
)
|
||||||
urlCF = scrapy.Field()
|
urlCF = scrapy.Field()
|
||||||
dateUpload = scrapy.Field()
|
date_event = scrapy.Field(
|
||||||
|
input_processor = MapCompose(clean_text, format_date)
|
||||||
|
)
|
||||||
tags = scrapy.Field()
|
tags = scrapy.Field()
|
||||||
|
biographies = scrapy.Field()
|
||||||
pass
|
pass
|
||||||
|
|
|
||||||
|
|
@ -1,26 +1,46 @@
|
||||||
# -*- coding: utf-8 -*-
|
# -*- coding: utf-8 -*-
|
||||||
import scrapy
|
import scrapy
|
||||||
|
from scrapy.loader import ItemLoader
|
||||||
|
from scrapy.loader.processors import Join, MapCompose, TakeFirst
|
||||||
|
from w3lib.html import remove_tags
|
||||||
|
|
||||||
|
from CinemScraper.items import video
|
||||||
|
|
||||||
class GrabvideodataSpider(scrapy.Spider):
|
class GrabvideodataSpider(scrapy.Spider):
|
||||||
name = 'grabVideoData'
|
name = 'grabVideoData'
|
||||||
allowed_domains = ['cinematheque.fr']
|
allowed_domains = ['cinematheque.fr']
|
||||||
start_urls = ['http://www.cinematheque.fr/decouvrir.html']
|
start_urls = ['http://www.cinematheque.fr/decouvrir.html']
|
||||||
|
|
||||||
|
item_fields = {
|
||||||
|
'title' : '//h1/text()',
|
||||||
|
'secondary_title' : '//h1/span[@class="sub"]/text()',
|
||||||
|
'date_event' : '//p[@class="date"]/text()',
|
||||||
|
'urlVideo' : '//iframe/@src',
|
||||||
|
'description' : '//div[@class="description"]/p',
|
||||||
|
'biographies' : '//div[@class="biographies"]',
|
||||||
|
'tags' : '//span[contains(@class, "tag")]/text()'
|
||||||
|
}
|
||||||
|
|
||||||
|
content_xpath = '//div[@id="content"]'
|
||||||
|
|
||||||
def parse(self, response):
|
def parse(self, response):
|
||||||
for lien in response.xpath('//a/@href[contains(.,"video")]/../..'):
|
for lien in response.xpath('//a/@href[contains(.,"video")]/../..'):
|
||||||
url = response.urljoin(lien.css('a::attr(href)').extract_first())
|
url = response.urljoin(lien.css('a::attr(href)').extract_first())
|
||||||
yield scrapy.Request(url, callback = self.parse_dir_content)
|
yield scrapy.Request(url, callback = self.parse_dir_content)
|
||||||
|
|
||||||
def parse_dir_content(self, response):
|
def parse_dir_content(self, response):
|
||||||
for page in response.css("div#content"):
|
hxs = scrapy.Selector(response)
|
||||||
yield {
|
|
||||||
'titre' : page.css('h1::text').extract_first().strip(),
|
for page in hxs.xpath(self.content_xpath):
|
||||||
'sous-titre' : page.css('h1 span::text').extract_first(),
|
loader = ItemLoader(item=video(), selector=page)
|
||||||
'description' : page.css('.description p').extract(),
|
# put input processors here
|
||||||
'biographies' : page.css('.biographies p').extract(),
|
# loader.default_input_processor = MapCompose(remove_tags)
|
||||||
'videoSrcUrl' : page.css('iframe::attr(src)').re_first(r'\w[\w\.\/]+'),
|
loader.default_output_processor = Join()
|
||||||
'articleUrl' : response.url,
|
|
||||||
'tags' : page.css('.tag::text').re(r'[\n]')
|
# iterate over the fields of the video item
|
||||||
}
|
for field, xpath in self.item_fields.items():
|
||||||
|
loader.add_xpath(field, xpath)
|
||||||
|
loader.add_value('urlCF', response.url)
|
||||||
|
yield loader.load_item()
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue