From 880cb67572a5565c50cd459ca86b2d28cb72603f Mon Sep 17 00:00:00 2001 From: Yohann Dedy Date: Mon, 21 May 2018 05:15:17 +0200 Subject: [PATCH] =?UTF-8?q?Suppression=20des=20saut=20de=20ligne=20et=20es?= =?UTF-8?q?paces=20en=20d=C3=A9but=20et=20fin=20de=20cha=C3=AEnes=20de=20c?= =?UTF-8?q?aract=C3=A8re?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- CinemScraper/spiders/grabVideoData.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/CinemScraper/spiders/grabVideoData.py b/CinemScraper/spiders/grabVideoData.py index c42884e..cd470b5 100644 --- a/CinemScraper/spiders/grabVideoData.py +++ b/CinemScraper/spiders/grabVideoData.py @@ -15,11 +15,12 @@ class GrabvideodataSpider(scrapy.Spider): def parse_dir_content(self, response): for page in response.css("div#content"): yield { - 'titre' : page.css('h1::text').extract_first(), - 'sous-titre' : page.css('h1 span::text').extract_first(), - 'description' : page.css('.biographies p').extract(), - 'videoSrcUrl' : page.css('iframe::attr(src)').extract_first(), + 'titre' : page.css('h1::text').extract_first().strip(), + 'sous-titre' : page.css('h1 span::text').extract_first(), + 'description' : page.css('.description p').extract(), + 'biographies' : page.css('.biographies p').extract(), + 'videoSrcUrl' : page.css('iframe::attr(src)').re_first(r'\w[\w\.\/]+'), 'articleUrl' : response.url, - 'tags' : page.css('.tag::text').extract() + 'tags' : page.css('.tag::text').re(r'[\n]') }