# CFVideoScraper/CinemScraper/items.py
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html
import scrapy
from scrapy.loader.processors import Join, MapCompose, TakeFirst
2018-05-10 17:31:59 +00:00
def format_date(value):
    """Convert a French long-form date into a ``day-month-year`` string.

    This is a generator: it yields exactly one converted string, or nothing
    at all when ``value`` is empty. For example ``'10 mai 2018'`` yields
    ``'10-05-2018'``.

    Args:
        value: Text of the form ``'<day> <month name> <year>'`` where the
            month is a French month name (matched case-insensitively).
            Tokens after the third one are ignored.

    Yields:
        The day, the two-digit month number and the year joined by ``-``.
        Day and year are passed through unchanged (no zero padding).

    Raises:
        IndexError: If ``value`` has fewer than three whitespace-separated
            parts.
        KeyError: If the second part is not a known French month name.
    """
    if not value:
        # Nothing to convert: finish the generator without yielding.
        return
    months = {
        'janvier': '01',
        'février': '02',
        'mars': '03',
        'avril': '04',
        'mai': '05',
        'juin': '06',
        'juillet': '07',
        'août': '08',
        'septembre': '09',
        'octobre': '10',
        'novembre': '11',
        'décembre': '12',
    }
    # split() without an argument splits on any run of whitespace, so
    # repeated spaces no longer produce empty tokens that break the lookup.
    parts = value.split()
    day, month_name, year = parts[0], parts[1], parts[2]
    # Lower-case the month so that e.g. 'Mai' or 'MAI' is accepted as well.
    yield day + '-' + months[month_name.lower()] + '-' + year
def clean_text(value):
    """Yield ``value`` without escaped newlines and surrounding whitespace.

    Every literal two-character sequence made of a backslash followed by
    ``n`` is removed (real newline characters inside the text are left
    alone), then leading and trailing whitespace is stripped.

    NOTE(review): the pattern is the escaped sequence, not an actual newline
    character; presumably the scraped text carries escaped newlines, confirm
    against the spider output.
    """
    without_escapes = value.replace('\\n', '')
    cleaned = without_escapes.strip()
    yield cleaned
def tags_to_array(value):
    """Yield ``value`` with leading and trailing whitespace removed.

    Despite the name, no array is built here: a single stripped string is
    yielded for each call.
    """
    yield value.strip()
def line_breaks(value):
    """Yield ``value`` with each ``</p><p>`` boundary turned into a marker.

    The marker is the literal two-character sequence made of a backslash
    followed by ``n`` (not an actual newline character). Opening and closing
    tags that are not directly adjacent are left untouched.
    """
    paragraphs = value.split('</p><p>')
    # Re-joining the pieces with the marker is equivalent to replacing every
    # occurrence of the boundary.
    yield '\\n'.join(paragraphs)
def clean_p_tags(value):
text = value.replace('<p>','')
text = text.replace('</p>','')
yield text
def clean_url(value):
    """Yield the part of ``value`` that precedes the first ``?``.

    When ``value`` contains no ``?`` it is yielded unchanged.
    """
    base, _sep, _query = value.partition('?')
    yield base
2018-05-10 17:31:59 +00:00
2018-05-11 22:46:01 +00:00
class video(scrapy.Item):
    """Scrapy item holding the data scraped for one video.

    Fields that declare an ``input_processor`` pass each incoming value
    through the listed helper functions; fields that declare an
    ``output_processor`` join the collected values into one string.
    Fields without processors are plain ``scrapy.Field()`` declarations.
    """

    # Cleaned title fragments, joined with Join()'s default separator.
    title = scrapy.Field(
        input_processor=MapCompose(clean_text),
        output_processor=Join(),
    )

    # Cleaned secondary title; no output processor is declared here.
    secondary_title = scrapy.Field(input_processor=MapCompose(clean_text))

    # Paragraph boundaries are converted first, then remaining <p> tags are
    # removed; the resulting pieces are joined with a newline.
    description = scrapy.Field(
        input_processor=MapCompose(line_breaks, clean_p_tags),
        output_processor=Join('\n'),
    )

    # Video URL with everything from the first '?' onwards dropped.
    urlVideo = scrapy.Field(input_processor=MapCompose(clean_url))

    # Stored as provided (no processors declared).
    urlCF = scrapy.Field()
    id_video = scrapy.Field()

    # Event date: cleaned, then converted from a French long-form date.
    date_event = scrapy.Field(
        input_processor=MapCompose(clean_text, format_date),
    )

    # Stripped tags, joined with a newline.
    tags = scrapy.Field(
        input_processor=MapCompose(tags_to_array),
        output_processor=Join('\n'),
    )

    # Same processing chain as ``description``.
    biographies = scrapy.Field(
        input_processor=MapCompose(line_breaks, clean_p_tags),
        output_processor=Join('\n'),
    )