diff --git a/FMScraper/items.py b/FMScraper/items.py
index 400fa03..f169811 100644
--- a/FMScraper/items.py
+++ b/FMScraper/items.py
@@ -6,13 +6,34 @@
 # https://docs.scrapy.org/en/latest/topics/items.html
 
 import scrapy
-
+from scrapy.loader.processors import MapCompose
+import re
 
 class FmscraperItem(scrapy.Item):
     # define the fields for your item here like:
     # name = scrapy.Field()
     pass
 
+def clean_text(value):
+    """Yield ``value`` with leading and trailing whitespace stripped."""
+    yield value.strip()
+
+def clean_html_script(value):
+    """Yield regex groups 1 and 3 of ``value`` concatenated.
+
+    If the pattern does not match (e.g. ``value`` is shorter than two
+    characters), ``value`` is yielded unchanged rather than raising
+    ``AttributeError`` on a ``None`` match object.
+    """
+    # Raw string keeps ``\w``/``\W`` as regex classes, not string escapes.
+    # NOTE(review): the empty middle group ``()`` captures nothing -- confirm
+    # the intended pattern for the part that should be dropped.
+    description_re = re.match(r"([\w\W]+)()([\w\W]+)", value)
+    if description_re is None:
+        yield value
+        return
+    yield description_re.group(1) + description_re.group(3)
+
 class show_Item(scrapy.Item):
     name = scrapy.Field()
     url_page = scrapy.Field()
@@ -21,8 +42,16 @@ class show_Item(scrapy.Item):
 
 class episode_Item(scrapy.Item):
     title = scrapy.Field()
-    description_lead = scrapy.Field()
-    description_full = scrapy.Field()
+    # NOTE(review): ``default`` is stored as plain field metadata here;
+    # confirm that something downstream actually reads it.
+    description_lead = scrapy.Field(
+        default='null',
+        input_processor=MapCompose(clean_text),
+    )
+    description_full = scrapy.Field(
+        default='null',
+        input_processor=MapCompose(clean_html_script),
+    )
     url_file = scrapy.Field()
     url_page = scrapy.Field()
     date_diffusion = scrapy.Field()