From 280ad9842b1b17d01afb5b3f8ec67a9499b76f29 Mon Sep 17 00:00:00 2001
From: Yohann Dedy
Date: Thu, 28 Nov 2019 23:59:01 +0100
Subject: [PATCH] =?UTF-8?q?Ajout=20de=20pr=C3=A9processeurs=20pour=20les?=
 =?UTF-8?q?=20items?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
Reviewer note: clean_html_script now tolerates a value the pattern does not
match; the blob ids on the "index" line are those of the original commit.

 FMScraper/items.py | 31 ++++++++++++++++++++++++++++---
 1 file changed, 28 insertions(+), 3 deletions(-)

diff --git a/FMScraper/items.py b/FMScraper/items.py
index 400fa03..f169811 100644
--- a/FMScraper/items.py
+++ b/FMScraper/items.py
@@ -6,13 +6,32 @@
 # https://docs.scrapy.org/en/latest/topics/items.html
 
 import scrapy
-
+from scrapy.loader.processors import MapCompose
+import re
 
 class FmscraperItem(scrapy.Item):
     # define the fields for your item here like:
     # name = scrapy.Field()
     pass
 
+def clean_text(value):
+    """Yield ``value`` with leading and trailing whitespace removed."""
+    yield value.strip()
+
+def clean_html_script(value):
+    """Yield ``value`` rebuilt from capture groups 1 and 3 of the pattern.
+
+    When the pattern does not match (e.g. an empty string), ``value`` is
+    yielded unchanged instead of failing on a ``None`` match object.
+    """
+    # NOTE(review): the middle group is empty as written, so nothing is
+    # removed; presumably it should match the embedded script markup - confirm.
+    description_re = re.match(r"([\w\W]+)()([\w\W]+)", value)
+    if description_re is None:
+        yield value
+        return
+    yield description_re.group(1) + description_re.group(3)
+
 class show_Item(scrapy.Item):
     name = scrapy.Field()
     url_page = scrapy.Field()
@@ -21,8 +40,14 @@ class show_Item(scrapy.Item):
 
 class episode_Item(scrapy.Item):
     title = scrapy.Field()
-    description_lead = scrapy.Field()
-    description_full = scrapy.Field()
+    description_lead = scrapy.Field(
+        default = 'null',
+        input_processor = MapCompose(clean_text)
+    )
+    description_full = scrapy.Field(
+        default = 'null',
+        input_processor = MapCompose(clean_html_script)
+    )
     url_file = scrapy.Field()
     url_page = scrapy.Field()
     date_diffusion = scrapy.Field()