Scraping des liens et récupération des informations dans le même spider

2018-05-12 01:21:56 +02:00 · 2018-05-12 01:21:56 +02:00 · b0f57b9a37
parent b27272d3a8
commit b0f57b9a37
3 changed files with 15 additions and 29 deletions
--- a/CinemScraper/spiders/grabVideoData.py
+++ b/CinemScraper/spiders/grabVideoData.py
@@ -4,10 +4,15 @@ import scrapy

 class GrabvideodataSpider(scrapy.Spider):
    name = 'grabVideoData'
-    allowed_domains = ['http://www.cinematheque.fr/']
-    start_urls = ['http://www.cinematheque.fr/video/1219.html']
+    allowed_domains = ['cinematheque.fr']
+    start_urls = ['http://www.cinematheque.fr/decouvrir.html']

    def parse(self, response):
+        for lien in response.xpath('//a/@href[contains(.,"video")]/../..'):
+            url = response.urljoin(lien.css('a::attr(href)').extract_first())
+            yield scrapy.Request(url, callback = self.parse_dir_content)
+
+    def parse_dir_content(self, response):
    	for page in response.css("div#content"):
            yield {
    		'titre' : page.css('h1::text').extract_first(),
@@ -17,4 +22,3 @@ class GrabvideodataSpider(scrapy.Spider):
    		'tags'		  : page.css('.tag::text').extract()
    	    }

-        
--- a/CinemScraper/spiders/listeVideos.py
+++ b/CinemScraper/spiders/listeVideos.py
@@ -1,18 +0,0 @@
-# -*- coding: utf-8 -*-
-import scrapy
-
-
-class ListevideosSpider(scrapy.Spider):
-    name = 'listeVideos'
-    allowed_domains = ['http://www.cinematheque.fr/']
-    start_urls = ['http://www.cinematheque.fr/decouvrir.html']
-
-    def parse(self, response):
-    	for lien in response.xpath('//a/@href[contains(.,"video")]/../..'):
-    		yield {
-    			'adresse' : lien.css('a::attr(href)').extract(),
-    			'dateUpload' : lien.css('::attr(data-sort)').extract(),
-
-    		}
-    	
-        
--- a/CinemScraper/spiders/listeVideos.pyc
+++ b/CinemScraper/spiders/listeVideos.pyc