Amélioration du formatage des données extraites
Les tags, descriptions et biographies sont séparés par des sauts de ligne pour faciliter leur import dans des tables dédiées de la base de données.
master
parent
6a8061a75f
commit
befa141eb2
|
|
@ -9,6 +9,8 @@ import scrapy
|
||||||
from scrapy.loader.processors import Join, MapCompose, TakeFirst
|
from scrapy.loader.processors import Join, MapCompose, TakeFirst
|
||||||
|
|
||||||
def format_date(value):
|
def format_date(value):
|
||||||
|
if not value:
|
||||||
|
return None
|
||||||
month = {
|
month = {
|
||||||
'janvier' : '01',
|
'janvier' : '01',
|
||||||
'février' : '02',
|
'février' : '02',
|
||||||
|
|
@ -31,6 +33,19 @@ def clean_text(value):
|
||||||
text = value.replace('\\n', '')
|
text = value.replace('\\n', '')
|
||||||
yield text.strip()
|
yield text.strip()
|
||||||
|
|
||||||
|
def tags_to_array(value):
|
||||||
|
array = value.strip()
|
||||||
|
yield array
|
||||||
|
|
||||||
|
def line_breaks(value):
|
||||||
|
res = value.replace('</p><p>','\\n')
|
||||||
|
yield res
|
||||||
|
|
||||||
|
def clean_p_tags(value):
|
||||||
|
text = value.replace('<p>','')
|
||||||
|
text = text.replace('</p>','')
|
||||||
|
yield text
|
||||||
|
|
||||||
def clean_url(value):
|
def clean_url(value):
|
||||||
yield value.split('?')[0]
|
yield value.split('?')[0]
|
||||||
|
|
||||||
|
|
@ -39,8 +54,13 @@ class video(scrapy.Item):
|
||||||
input_processor = MapCompose(clean_text),
|
input_processor = MapCompose(clean_text),
|
||||||
output_processor = Join()
|
output_processor = Join()
|
||||||
)
|
)
|
||||||
secondary_title = scrapy.Field()
|
secondary_title = scrapy.Field(
|
||||||
description = scrapy.Field()
|
input_processor = MapCompose(clean_text)
|
||||||
|
)
|
||||||
|
description = scrapy.Field(
|
||||||
|
input_processor = MapCompose(line_breaks, clean_p_tags),
|
||||||
|
output_processor = Join('\n')
|
||||||
|
)
|
||||||
urlVideo = scrapy.Field(
|
urlVideo = scrapy.Field(
|
||||||
input_processor = MapCompose(clean_url)
|
input_processor = MapCompose(clean_url)
|
||||||
)
|
)
|
||||||
|
|
@ -49,6 +69,12 @@ class video(scrapy.Item):
|
||||||
date_event = scrapy.Field(
|
date_event = scrapy.Field(
|
||||||
input_processor = MapCompose(clean_text, format_date)
|
input_processor = MapCompose(clean_text, format_date)
|
||||||
)
|
)
|
||||||
tags = scrapy.Field()
|
tags = scrapy.Field(
|
||||||
biographies = scrapy.Field()
|
input_processor = MapCompose(tags_to_array),
|
||||||
|
output_processor = Join('\n')
|
||||||
|
)
|
||||||
|
biographies = scrapy.Field(
|
||||||
|
input_processor = MapCompose(line_breaks, clean_p_tags),
|
||||||
|
output_processor = Join('\n')
|
||||||
|
)
|
||||||
pass
|
pass
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue