Meilleur nettoyage des données récupérées

master
Yohann Dedy 2019-12-02 01:56:17 +01:00
parent 951e4da065
commit 52a53285d8
1 changed file with 21 additions and 1 deletion

View File

@ -18,7 +18,7 @@ def clean_text(value):
yield value.strip()
def clean_html_script(value):
    """Yield *value* with every ``<script>...</script>`` element removed.

    The previous pattern made the script group optional while keeping a
    greedy leading group, so the leading group swallowed the whole input
    and the script was never stripped; it also raised ``AttributeError``
    on inputs too short to match. Substituting with a non-greedy pattern
    removes each script element independently and leaves script-free
    (or empty) input untouched.

    Args:
        value: HTML fragment as a string.

    Yields:
        The fragment without any ``<script>`` elements.
    """
    # Non-greedy so that text between two separate script elements is kept;
    # DOTALL because script bodies usually span several lines.
    yield re.sub(
        r"<script\b.*?</script\s*>",
        "",
        value,
        flags=re.DOTALL | re.IGNORECASE,
    )
@ -26,6 +26,14 @@ def clean_file_url(value):
url = re.match("(.+\.mp3)", value)
yield url.group(1)
def clean_emission_url(value):
    """Yield *value* with its query string (``?`` and what follows) removed.

    The previous regex used a greedy ``(.+)`` ahead of an optional query
    group, so the query string was always captured as part of the URL and
    never removed; it also raised ``AttributeError`` on an empty string.

    Args:
        value: URL string, with or without a query string.

    Yields:
        The part of the URL before the first ``?``. Nothing is yielded when
        that part is empty.
    """
    base_url = value.partition("?")[0]
    if base_url:
        yield base_url
def extract_id_episode(value):
    """Yield the numeric id that terminates *value* after a hyphen.

    Args:
        value: String expected to end in ``-<digits>``.

    Yields:
        The trailing digits as a string. Nothing is yielded when *value*
        does not end in ``-<digits>``, instead of raising
        ``AttributeError`` on the missing match.
    """
    # Raw string: "\d" in a plain literal is an invalid escape sequence.
    found = re.search(r"-(\d+)$", value)
    if found is not None:
        yield found.group(1)
class show_Item(scrapy.Item):
name = scrapy.Field()
url_page = scrapy.Field()
@ -33,19 +41,31 @@ class show_Item(scrapy.Item):
tags = scrapy.Field()
class episode_Item(scrapy.Item):
    """Item describing one episode.

    Several fields carry two pieces of field metadata: a ``default`` of
    ``'null'`` and an ``input_processor`` wrapping one of this module's
    cleaning generators.
    NOTE(review): the metadata is presumably consumed by an item loader;
    confirm against the spider that populates this item.
    """

    # Processed by clean_emission_url.
    url_emission = scrapy.Field(
        default='null',
        input_processor=MapCompose(clean_emission_url),
    )

    # Processed by clean_text.
    title = scrapy.Field(
        default='null',
        input_processor=MapCompose(clean_text),
    )

    # Processed by extract_id_episode.
    id_episode = scrapy.Field(
        default='null',
        input_processor=MapCompose(extract_id_episode),
    )

    # Processed by clean_text.
    description_lead = scrapy.Field(
        default='null',
        input_processor=MapCompose(clean_text),
    )

    # Processed by clean_html_script.
    description_full = scrapy.Field(
        default='null',
        input_processor=MapCompose(clean_html_script),
    )

    # Processed by clean_file_url.
    url_file = scrapy.Field(
        default='null',
        input_processor=MapCompose(clean_file_url),
    )

    # Plain fields: no default and no input processor declared.
    file_info = scrapy.Field()
    url_page = scrapy.Field()
    date_diffusion = scrapy.Field()