diff --git a/FMScraper/items.py b/FMScraper/items.py
index a824ae0..bef70c6 100644
--- a/FMScraper/items.py
+++ b/FMScraper/items.py
@@ -18,7 +18,7 @@ def clean_text(value):
     yield value.strip()
 
 def clean_html_script(value):
-    description_re = re.match("([\w\W]+)()([\w\W]+)",value)
+    description_re = re.match("([\w\W]+)()?([\w\W]+)",value)
     description_full_post = description_re.group(1) + description_re.group(3)
     yield description_full_post
 
@@ -26,6 +26,17 @@ def clean_file_url(value):
     url = re.match("(.+\.mp3)", value)
     yield url.group(1)
 
+def clean_emission_url(value):
+    """Yield value without its query string (the first "?" onwards)."""
+    # A greedy "(.+)" followed by an optional group always swallows the query
+    # string, so split on the first "?" instead of using a regex.
+    yield value.partition("?")[0]
+
+def extract_id_episode(value):
+    """Yield the run of digits at the end of value that follows a "-"."""
+    id_episode = re.search(r"-(\d+)$", value)
+    yield id_episode.group(1)
+
 class show_Item(scrapy.Item):
     name = scrapy.Field()
     url_page = scrapy.Field()
@@ -33,19 +44,31 @@ class show_Item(scrapy.Item):
     tags = scrapy.Field()
 
 class episode_Item(scrapy.Item):
+    url_emission = scrapy.Field(
+        default = 'null',
+        input_processor = MapCompose(clean_emission_url)
+    )
     title = scrapy.Field(
         default = 'null',
         input_processor = MapCompose(clean_text)
     )
+    id_episode = scrapy.Field(
+        default = 'null',
+        input_processor = MapCompose(extract_id_episode)
+    )
     description_lead = scrapy.Field(
         default = 'null',
         input_processor = MapCompose(clean_text)
     )
     description_full = scrapy.Field(
+        default = 'null',
+        input_processor = MapCompose(clean_html_script)
+    )
     url_file = scrapy.Field(
         default = 'null',
         input_processor = MapCompose(clean_file_url)
     )
+    file_info = scrapy.Field()
     url_page = scrapy.Field()
     date_diffusion = scrapy.Field()
 