Patch for CinemScraper/items.py: guard format_date against empty input, add
the tags_to_array / line_breaks / clean_p_tags processors, and attach
input/output processors to the secondary_title, description, tags and
biographies fields.
NOTE(review): line structure and indentation were reconstructed from the hunk
counts. The '<br>', '<p>' and '</p>' literals are presumed (the tags appear to
have been stripped from this copy) -- confirm against the original commit.

diff --git a/CinemScraper/items.py b/CinemScraper/items.py
index 3e17f89..908920a 100644
--- a/CinemScraper/items.py
+++ b/CinemScraper/items.py
@@ -9,6 +9,8 @@ import scrapy
 from scrapy.loader.processors import Join, MapCompose, TakeFirst
 
 def format_date(value):
+    if not value:
+        return None
     month = {
         'janvier' : '01',
         'février' : '02',
@@ -31,6 +33,19 @@ def clean_text(value):
     text = value.replace('\\n', '')
     yield text.strip()
 
+def tags_to_array(value):
+    array = value.strip()
+    yield array
+
+def line_breaks(value):
+    res = value.replace('<br>','\\n')
+    yield res
+
+def clean_p_tags(value):
+    text = value.replace('<p>','')
+    text = text.replace('</p>','')
+    yield text
+
 def clean_url(value):
     yield value.split('?')[0]
 
@@ -39,8 +54,13 @@ class video(scrapy.Item):
         input_processor = MapCompose(clean_text),
         output_processor = Join()
     )
-    secondary_title = scrapy.Field()
-    description = scrapy.Field()
+    secondary_title = scrapy.Field(
+        input_processor = MapCompose(clean_text)
+    )
+    description = scrapy.Field(
+        input_processor = MapCompose(line_breaks, clean_p_tags),
+        output_processor = Join('\n')
+    )
     urlVideo = scrapy.Field(
         input_processor = MapCompose(clean_url)
     )
@@ -49,6 +69,12 @@ class video(scrapy.Item):
     date_event = scrapy.Field(
         input_processor = MapCompose(clean_text, format_date)
     )
-    tags = scrapy.Field()
-    biographies = scrapy.Field()
+    tags = scrapy.Field(
+        input_processor = MapCompose(tags_to_array),
+        output_processor = Join('\n')
+    )
+    biographies = scrapy.Field(
+        input_processor = MapCompose(line_breaks, clean_p_tags),
+        output_processor = Join('\n')
+    )
     pass