# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import logging

import scrapy
# NOTE(review): newer Scrapy releases expose these processors from
# `itemloaders.processors` -- confirm against the pinned Scrapy version.
from scrapy.loader.processors import Join, MapCompose, TakeFirst

logger = logging.getLogger(__name__)

# French month names (lower case) mapped to their two-digit month number.
_FRENCH_MONTHS = {
    'janvier': '01',
    'février': '02',
    'mars': '03',
    'avril': '04',
    'mai': '05',
    'juin': '06',
    'juillet': '07',
    'août': '08',
    'septembre': '09',
    'octobre': '10',
    'novembre': '11',
    'décembre': '12',
}

# NOTE(review): the three tag literals below were reconstructed from the
# function names (`line_breaks`, `clean_p_tags`); the text between the quotes
# had been lost and replaced by raw line breaks, which is not valid inside a
# single-quoted string. Confirm the exact spelling against the scraped markup.
_BR_TAG = '<br>'
_P_OPEN_TAG = '<p>'
_P_CLOSE_TAG = '</p>'


def format_date(value):
    """Yield a French "<day> <month name> <year>" date as "<day>-<mm>-<year>".

    The day and year are passed through untouched; only the month name is
    translated to its two-digit number. Nothing is yielded when `value` is
    empty or cannot be parsed (a warning is logged in the latter case), so
    the field simply stays unset instead of aborting the whole item load.
    """
    if not value:
        return

    # split() without argument tolerates repeated or non-space whitespace.
    parts = value.split()
    try:
        day, month_name, year = parts[0], parts[1], parts[2]
        month_number = _FRENCH_MONTHS[month_name.lower()]
    except (IndexError, KeyError):
        logger.warning('Unrecognised date format: %r', value)
        return

    yield day + '-' + month_number + '-' + year


def clean_text(value):
    """Yield `value` without escaped line-break markers, stripped of
    surrounding whitespace.

    NOTE(review): '\\n' is the two-character sequence backslash + "n", not a
    newline character -- confirm this is what the scraped text contains.
    """
    text = value.replace('\\n', '')
    yield text.strip()


def tags_to_array(value):
    """Yield `value` stripped of surrounding whitespace.

    Despite its name this does not build a list itself; the values are
    collected by the item loader that applies this processor.
    """
    yield value.strip()


def line_breaks(value):
    """Yield `value` with every line-break tag replaced by the two-character
    marker backslash + "n"."""
    yield value.replace(_BR_TAG, '\\n')


def clean_p_tags(value):
    """Yield `value` with opening and closing paragraph tags removed."""
    text = value.replace(_P_OPEN_TAG, '')
    text = text.replace(_P_CLOSE_TAG, '')
    yield text


def clean_url(value):
    """Yield `value` truncated at the first "?" (i.e. without query string)."""
    yield value.split('?')[0]


class video(scrapy.Item):
    """Scraped video item; each field declares its own loader processors."""

    # Cleaned text fragments joined with a single space (Join default).
    title = scrapy.Field(
        input_processor=MapCompose(clean_text),
        output_processor=Join()
    )
    secondary_title = scrapy.Field(
        input_processor=MapCompose(clean_text)
    )
    # Line-break tags become markers, paragraph tags are dropped, and the
    # resulting fragments are joined with a newline.
    description = scrapy.Field(
        input_processor=MapCompose(line_breaks, clean_p_tags),
        output_processor=Join('\n')
    )
    # URL without its query string.
    urlVideo = scrapy.Field(
        input_processor=MapCompose(clean_url)
    )
    urlCF = scrapy.Field()
    id_video = scrapy.Field()
    # Text is cleaned first, then converted by format_date.
    date_event = scrapy.Field(
        input_processor=MapCompose(clean_text, format_date)
    )
    # Stripped tag strings joined with a newline.
    tags = scrapy.Field(
        input_processor=MapCompose(tags_to_array),
        output_processor=Join('\n')
    )
    # Same processing as `description`.
    biographies = scrapy.Field(
        input_processor=MapCompose(line_breaks, clean_p_tags),
        output_processor=Join('\n')
    )