81 lines
2.1 KiB
Python
81 lines
2.1 KiB
Python
# -*- coding: utf-8 -*-
|
|
|
|
# Define here the models for your scraped items
|
|
#
|
|
# See documentation in:
|
|
# https://doc.scrapy.org/en/latest/topics/items.html
|
|
|
|
import scrapy
|
|
from scrapy.loader.processors import Join, MapCompose, TakeFirst
|
|
|
|
# French month names (lower-case) mapped to their two-digit month number.
# Kept at module level so the table is built once rather than on every call.
_FRENCH_MONTHS = {
    'janvier': '01',
    'février': '02',
    'mars': '03',
    'avril': '04',
    'mai': '05',
    'juin': '06',
    'juillet': '07',
    'août': '08',
    'septembre': '09',
    'octobre': '10',
    'novembre': '11',
    'décembre': '12',
}


def format_date(value):
    """Yield a French textual date rewritten as ``day-month-year``.

    The input is expected to hold a day, a French month name and a year
    separated by whitespace, e.g. ``"12 mars 2019"``. The month name is
    replaced by its two-digit number while the day and year tokens are
    emitted unchanged, giving ``"12-03-2019"``. Tokens after the third one
    are ignored.

    Args:
        value: The date text, or a falsy value (``None``, ``""``).

    Yields:
        A single ``"<day>-<MM>-<year>"`` string. Nothing is yielded when
        ``value`` is falsy or contains only whitespace.

    Raises:
        IndexError: If fewer than three tokens are present.
        KeyError: If the second token is not a known French month name.
    """
    if not value:
        return

    # split() without a separator collapses runs of whitespace and also
    # splits on non-breaking spaces, unlike split(' ').
    parts = value.split()
    if not parts:
        return

    # Month names are matched case-insensitively ("Mars" == "mars").
    month_number = _FRENCH_MONTHS[parts[1].lower()]
    yield '-'.join((parts[0], month_number, parts[2]))
|
|
|
|
|
|
def clean_text(value):
    """Yield ``value`` without escaped-newline markers or outer whitespace.

    Every occurrence of the two-character sequence "backslash followed by
    n" (not an actual newline character) is removed, then leading and
    trailing whitespace is stripped. Exactly one string is yielded.
    """
    yield value.replace('\\n', '').strip()
|
|
|
|
def tags_to_array(value):
    """Yield ``value`` with leading and trailing whitespace removed.

    Despite the name, no list is built here: one stripped string is
    yielded per call.
    """
    yield value.strip()
|
|
|
|
def line_breaks(value):
    """Yield ``value`` with each ``</p><p>`` boundary replaced by a marker.

    The marker is the two-character sequence "backslash followed by n",
    not a real newline character.
    """
    paragraph_boundary = '</p><p>'
    marker = '\\n'
    yield value.replace(paragraph_boundary, marker)
|
|
|
|
def clean_p_tags(value):
    """Yield ``value`` with every ``<p>`` and ``</p>`` substring removed.

    Only these two exact substrings are removed; opening tags that carry
    attributes are left as they are.
    """
    stripped = value
    for tag in ('<p>', '</p>'):
        stripped = stripped.replace(tag, '')
    yield stripped
|
|
|
|
def clean_url(value):
    """Yield the part of ``value`` that precedes its first ``?``.

    When ``value`` contains no ``?`` it is yielded unchanged.
    """
    base, _sep, _query = value.partition('?')
    yield base
|
|
|
|
class video(scrapy.Item):
    """Scraped item describing one video.

    Each field declares the processors applied to the values collected for
    it; fields declared without processors carry no processing of their own.
    """

    # Cleaned with clean_text, then combined by Join() using its default
    # separator.
    title = scrapy.Field(
        input_processor=MapCompose(clean_text),
        output_processor=Join(),
    )

    # Cleaned with clean_text; no output processor is declared here.
    secondary_title = scrapy.Field(
        input_processor=MapCompose(clean_text),
    )

    # Paragraph boundaries are rewritten by line_breaks, remaining <p> tags
    # are removed by clean_p_tags, then values are joined with a newline.
    description = scrapy.Field(
        input_processor=MapCompose(line_breaks, clean_p_tags),
        output_processor=Join('\n'),
    )

    # Everything from the first '?' onwards is dropped by clean_url.
    urlVideo = scrapy.Field(
        input_processor=MapCompose(clean_url),
    )

    urlCF = scrapy.Field()

    id_video = scrapy.Field()

    # Cleaned with clean_text, then rewritten by format_date.
    date_event = scrapy.Field(
        input_processor=MapCompose(clean_text, format_date),
    )

    # Each value is stripped by tags_to_array, then values are joined with a
    # newline.
    tags = scrapy.Field(
        input_processor=MapCompose(tags_to_array),
        output_processor=Join('\n'),
    )

    # Same processing chain as ``description``.
    biographies = scrapy.Field(
        input_processor=MapCompose(line_breaks, clean_p_tags),
        output_processor=Join('\n'),
    )
|