CFVideoScraper/CinemScraper/items.py

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy
from scrapy.loader.processors import Join, MapCompose, TakeFirst

# Lowercase French month names mapped to their two-digit month numbers.
# Built once at import time instead of on every call.
_FRENCH_MONTHS = {
    'janvier': '01',
    'février': '02',
    'mars': '03',
    'avril': '04',
    'mai': '05',
    'juin': '06',
    'juillet': '07',
    'août': '08',
    'septembre': '09',
    'octobre': '10',
    'novembre': '11',
    'décembre': '12',
}


def format_date(value):
    """Turn a French ``'<day> <month> <year>'`` date into ``'<day>-<MM>-<year>'``.

    This is a generator: it yields exactly one string for a non-empty
    ``value`` and yields nothing when ``value`` is empty or ``None``.

    The month name is matched case-insensitively and the parts may be
    separated by any run of whitespace, so ``'3 MARS  2021'`` is accepted.
    The day and year parts are passed through unchanged.

    Args:
        value: Date text such as ``'12 janvier 2019'``.

    Yields:
        The reformatted date, e.g. ``'12-01-2019'``.

    Raises:
        IndexError: On iteration, if ``value`` has fewer than three parts.
        KeyError: On iteration, if the second part is not a French month name.
    """
    if not value:
        return
    # split() without arguments collapses repeated whitespace and also
    # splits on non-breaking spaces, unlike split(' ').
    parts = value.split()
    # The tuple is evaluated left to right: day, month lookup, then year.
    yield '-'.join((parts[0], _FRENCH_MONTHS[parts[1].lower()], parts[2]))


def clean_text(value):
    """Yield ``value`` without literal backslash-n sequences and edge whitespace.

    The sequence removed is the two characters ``\\`` and ``n``, not a
    newline character. Leading and trailing whitespace is stripped after
    that removal.
    """
    yield value.replace('\\n', '').strip()

def tags_to_array(value):
    """Yield ``value`` with leading and trailing whitespace removed."""
    yield value.strip()

def line_breaks(value):
    """Yield ``value`` with each ``'</p><p>'`` boundary replaced by ``\\n``.

    The replacement is the two-character sequence backslash + ``n``, not a
    newline character.
    """
    paragraphs = value.split('</p><p>')
    yield '\\n'.join(paragraphs)

def clean_p_tags(value):
    """Yield ``value`` with every ``<p>`` and ``</p>`` tag removed."""
    # Opening tags are removed first, then closing tags.
    for tag in ('<p>', '</p>'):
        value = value.replace(tag, '')
    yield value

def clean_url(value):
    """Yield the part of ``value`` that precedes the first ``'?'``.

    If ``value`` contains no ``'?'`` it is yielded unchanged.
    """
    base, _, _ = value.partition('?')
    yield base

class video(scrapy.Item):
    """Scrapy item declaring the fields of a video record and their processors."""

    # Each title value goes through clean_text; the results are merged by
    # Join() with its default separator.
    title = scrapy.Field(
        input_processor=MapCompose(clean_text),
        output_processor=Join(),
    )
    # Cleaned like the title, but no output processor is declared here.
    secondary_title = scrapy.Field(
        input_processor=MapCompose(clean_text),
    )
    # Paragraph boundaries are rewritten by line_breaks, then the remaining
    # <p> tags are removed; the values are joined with a newline.
    description = scrapy.Field(
        input_processor=MapCompose(line_breaks, clean_p_tags),
        output_processor=Join('\n'),
    )
    # clean_url keeps only what precedes the first '?'.
    urlVideo = scrapy.Field(
        input_processor=MapCompose(clean_url),
    )
    # Stored without any field-level processors.
    urlCF = scrapy.Field()
    id_video = scrapy.Field()
    # Cleaned text is then reformatted by format_date.
    date_event = scrapy.Field(
        input_processor=MapCompose(clean_text, format_date),
    )
    # Each tag is stripped by tags_to_array; the tags are joined with a newline.
    tags = scrapy.Field(
        input_processor=MapCompose(tags_to_array),
        output_processor=Join('\n'),
    )
    # Same processing chain as description.
    biographies = scrapy.Field(
        input_processor=MapCompose(line_breaks, clean_p_tags),
        output_processor=Join('\n'),
    )