From 6a8061a75f543eff5645bc4fae800f2af1981a76 Mon Sep 17 00:00:00 2001 From: Yohann Dedy Date: Wed, 28 Aug 2019 03:24:20 +0200 Subject: [PATCH] Ajout du champ id_video --- CinemScraper/items.py | 1 + CinemScraper/spiders/grabVideoData.py | 5 ++++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/CinemScraper/items.py b/CinemScraper/items.py index f28b7aa..3e17f89 100644 --- a/CinemScraper/items.py +++ b/CinemScraper/items.py @@ -45,6 +45,7 @@ class video(scrapy.Item): input_processor = MapCompose(clean_url) ) urlCF = scrapy.Field() + id_video = scrapy.Field() date_event = scrapy.Field( input_processor = MapCompose(clean_text, format_date) ) diff --git a/CinemScraper/spiders/grabVideoData.py b/CinemScraper/spiders/grabVideoData.py index f8afb55..6c61853 100644 --- a/CinemScraper/spiders/grabVideoData.py +++ b/CinemScraper/spiders/grabVideoData.py @@ -1,5 +1,6 @@ # -*- coding: utf-8 -*- import scrapy +import re from scrapy.loader import ItemLoader from scrapy.loader.processors import Join, MapCompose, TakeFirst from w3lib.html import remove_tags @@ -17,7 +18,7 @@ class GrabvideodataSpider(scrapy.Spider): 'date_event' : '//p[@class="date"]/text()', 'urlVideo' : '//iframe/@src', 'description' : '//div[@class="description"]/p', - 'biographies' : '//div[@class="biographies"]', + 'biographies' : '//div[@class="biographies"]/p', 'tags' : '//span[contains(@class, "tag")]/text()' } @@ -41,6 +42,8 @@ class GrabvideodataSpider(scrapy.Spider): for field, xpath in self.item_fields.items(): loader.add_xpath(field, xpath) loader.add_value('urlCF', response.url) + extract_vid_id = re.compile(r'/(\d+).html') + loader.add_value('id_video', extract_vid_id.findall(response.url)[0]) yield loader.load_item()