From b0f57b9a378a1ba84093552adfe5a93ae43488f7 Mon Sep 17 00:00:00 2001 From: Yohann Dedy Date: Sat, 12 May 2018 01:21:56 +0200 Subject: [PATCH] =?UTF-8?q?Scraping=20des=20liens=20et=20r=C3=A9cup=C3=A9r?= =?UTF-8?q?ation=20des=20informations=20dans=20le=20m=C3=AAme=20spider?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- CinemScraper/spiders/grabVideoData.py | 26 +++++++++++++++----------- CinemScraper/spiders/listeVideos.py | 18 ------------------ CinemScraper/spiders/listeVideos.pyc | Bin 929 -> 0 bytes 3 files changed, 15 insertions(+), 29 deletions(-) delete mode 100644 CinemScraper/spiders/listeVideos.py delete mode 100644 CinemScraper/spiders/listeVideos.pyc diff --git a/CinemScraper/spiders/grabVideoData.py b/CinemScraper/spiders/grabVideoData.py index 05a1a3c..95a2c27 100644 --- a/CinemScraper/spiders/grabVideoData.py +++ b/CinemScraper/spiders/grabVideoData.py @@ -4,17 +4,21 @@ import scrapy class GrabvideodataSpider(scrapy.Spider): name = 'grabVideoData' - allowed_domains = ['http://www.cinematheque.fr/'] - start_urls = ['http://www.cinematheque.fr/video/1219.html'] + allowed_domains = ['cinematheque.fr'] + start_urls = ['http://www.cinematheque.fr/decouvrir.html'] def parse(self, response): + for lien in response.xpath('//a/@href[contains(.,"video")]/../..'): + url = response.urljoin(lien.css('a::attr(href)').extract_first()) + yield scrapy.Request(url, callback = self.parse_dir_content) + + def parse_dir_content(self, response): for page in response.css("div#content"): - yield { - 'titre' : page.css('h1::text').extract_first(), - 'sous-titre' : page.css('h1 span::text').extract_first(), - 'description' : page.css('.biographies p').extract(), - 'videoSrcUrl' : page.css('iframe::attr(src)').extract_first(), - 'tags' : page.css('.tag::text').extract() - } - - + yield { + 'titre' : page.css('h1::text').extract_first(), + 'sous-titre' : page.css('h1 span::text').extract_first(), + 'description' : page.css('.biographies p').extract(), + 'videoSrcUrl' : page.css('iframe::attr(src)').extract_first(), + 'tags' : page.css('.tag::text').extract() + } + diff --git a/CinemScraper/spiders/listeVideos.py b/CinemScraper/spiders/listeVideos.py deleted file mode 100644 index e25581b..0000000 --- a/CinemScraper/spiders/listeVideos.py +++ /dev/null @@ -1,18 +0,0 @@ -# -*- coding: utf-8 -*- -import scrapy - - -class ListevideosSpider(scrapy.Spider): - name = 'listeVideos' - allowed_domains = ['http://www.cinematheque.fr/'] - start_urls = ['http://www.cinematheque.fr/decouvrir.html'] - - def parse(self, response): - for lien in response.xpath('//a/@href[contains(.,"video")]/../..'): - yield { - 'adresse' : lien.css('a::attr(href)').extract(), - 'dateUpload' : lien.css('::attr(data-sort)').extract(), - - } - - diff --git a/CinemScraper/spiders/listeVideos.pyc b/CinemScraper/spiders/listeVideos.pyc deleted file mode 100644 index ca00c6ae0b2e8cb194058c8b16ab2e82e23edeaa..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 929 zcmcIi!EVz)5S_Kt0EJ3$=^0fV90|O)B1H(b5F`$$NHGLqR9P!~LmWA_yEASQshrAR z@mX+(nRNu^4y*Osnc4B%H?tdlKiL1a{gLLd8zHYJ{Nae9;HjVjNZAO$0xT8G6%-+S z1-J(Imerp(6ZX+A4RLpg}a z(*aDANYv0>hDr!$p3o~>5Vd2i*Xwnhm$h9PTG)%u#&f3&n>XF1D_y*xRh0+)x6p_} zQGst6Hm~7I!P1v^tzdHg6utyd25=Qfi7bJmOGeHJ;dmDv+=%YM@l0LZhYzeiiH6L1 zhL0GvHu_}Y?EF*S)MUyUqxkWWZ^hB*T*om_Jm5q#9vdPTN$!YlNu+QV(V{ePw^x9x3Z`wr|(YnI~p4%INY6ZCx( dAEoXsiP^tmpZ