From befa141eb26f9d672ef4f1c8adad4dbc97581489 Mon Sep 17 00:00:00 2001 From: Yohann Dedy Date: Sat, 19 Oct 2019 20:58:39 +0200 Subject: [PATCH] =?UTF-8?q?Am=C3=A9lioration=20du=20formatage=20des=20donn?= =?UTF-8?q?=C3=A9es=20extraites?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Les tags, descriptions et biographies sont séparés par des sauts de lignes pour faciliter leur import dans des tables dédiées de la base de données. --- CinemScraper/items.py | 34 ++++++++++++++++++++++++++++++---- 1 file changed, 30 insertions(+), 4 deletions(-) diff --git a/CinemScraper/items.py b/CinemScraper/items.py index 3e17f89..908920a 100644 --- a/CinemScraper/items.py +++ b/CinemScraper/items.py @@ -9,6 +9,8 @@ import scrapy from scrapy.loader.processors import Join, MapCompose, TakeFirst def format_date(value): + if not value: + return None month = { 'janvier' : '01', 'février' : '02', @@ -31,6 +33,19 @@ def clean_text(value): text = value.replace('\\n', '') yield text.strip() +def tags_to_array(value): + array = value.strip() + yield array + +def line_breaks(value): + res = value.replace('

','\\n') + yield res + +def clean_p_tags(value): + text = value.replace('

','') + text = text.replace('

','') + yield text + def clean_url(value): yield value.split('?')[0] @@ -39,8 +54,13 @@ class video(scrapy.Item): input_processor = MapCompose(clean_text), output_processor = Join() ) - secondary_title = scrapy.Field() - description = scrapy.Field() + secondary_title = scrapy.Field( + input_processor = MapCompose(clean_text) + ) + description = scrapy.Field( + input_processor = MapCompose(line_breaks, clean_p_tags), + output_processor = Join('\n') + ) urlVideo = scrapy.Field( input_processor = MapCompose(clean_url) ) @@ -49,6 +69,12 @@ class video(scrapy.Item): date_event = scrapy.Field( input_processor = MapCompose(clean_text, format_date) ) - tags = scrapy.Field() - biographies = scrapy.Field() + tags = scrapy.Field( + input_processor = MapCompose(tags_to_array), + output_processor = Join('\n') + ) + biographies = scrapy.Field( + input_processor = MapCompose(line_breaks, clean_p_tags), + output_processor = Join('\n') + ) pass