Amélioration du formatage des données extraites

Les tags, descriptions et biographies sont séparés par des sauts de
ligne pour faciliter leur import dans des tables dédiées de la base de
données.
master
Yohann Dedy 2019-10-19 20:58:39 +02:00
parent 6a8061a75f
commit befa141eb2
1 changed files with 30 additions and 4 deletions

View File

@ -9,6 +9,8 @@ import scrapy
from scrapy.loader.processors import Join, MapCompose, TakeFirst from scrapy.loader.processors import Join, MapCompose, TakeFirst
def format_date(value): def format_date(value):
if not value:
return None
month = { month = {
'janvier' : '01', 'janvier' : '01',
'février' : '02', 'février' : '02',
@ -31,6 +33,19 @@ def clean_text(value):
text = value.replace('\\n', '') text = value.replace('\\n', '')
yield text.strip() yield text.strip()
def tags_to_array(value):
    """Yield *value* with leading and trailing whitespace removed.

    This is a generator: each call produces exactly one stripped string.
    Despite the name, no list is built here.
    """
    yield value.strip()
def line_breaks(value):
    """Yield *value* with every ``</p><p>`` boundary replaced by a separator.

    The separator is the two-character sequence "backslash, letter n", not
    an actual newline character.
    """
    # NOTE(review): the field output processors in this file join values with
    # a real newline; confirm that an escaped separator is intended here.
    yield value.replace('</p><p>', '\\n')
def clean_p_tags(value):
    """Yield *value* with all ``<p>`` and ``</p>`` markers removed.

    Only these two exact strings are removed; any other markup in *value*
    is left untouched.
    """
    stripped = value
    # Opening tags are removed first, then closing tags.
    for tag in ('<p>', '</p>'):
        stripped = stripped.replace(tag, '')
    yield stripped
def clean_url(value): def clean_url(value):
yield value.split('?')[0] yield value.split('?')[0]
@ -39,8 +54,13 @@ class video(scrapy.Item):
input_processor = MapCompose(clean_text), input_processor = MapCompose(clean_text),
output_processor = Join() output_processor = Join()
) )
secondary_title = scrapy.Field() secondary_title = scrapy.Field(
description = scrapy.Field() input_processor = MapCompose(clean_text)
)
description = scrapy.Field(
input_processor = MapCompose(line_breaks, clean_p_tags),
output_processor = Join('\n')
)
urlVideo = scrapy.Field( urlVideo = scrapy.Field(
input_processor = MapCompose(clean_url) input_processor = MapCompose(clean_url)
) )
@ -49,6 +69,12 @@ class video(scrapy.Item):
date_event = scrapy.Field( date_event = scrapy.Field(
input_processor = MapCompose(clean_text, format_date) input_processor = MapCompose(clean_text, format_date)
) )
tags = scrapy.Field() tags = scrapy.Field(
biographies = scrapy.Field() input_processor = MapCompose(tags_to_array),
output_processor = Join('\n')
)
biographies = scrapy.Field(
input_processor = MapCompose(line_breaks, clean_p_tags),
output_processor = Join('\n')
)
pass pass