# -*- coding: utf-8 -*-
"""Scrapy spider collecting per-show metadata from francemusique.fr."""
import scrapy
from scrapy.loader import ItemLoader
from FMScraper.items import show_Item


class GetEmissionsSpider(scrapy.Spider):
    """Follow every show link on the start page and scrape each show page.

    The crawl begins at the URL in ``start_urls``. ``parse`` schedules one
    request per show link found there, and ``parse_emission`` turns each
    fetched show page into a plain ``dict`` record.
    """

    name = 'get_emissions'
    allowed_domains = ['francemusique.fr']
    start_urls = ['http://francemusique.fr/emissions']

    def parse(self, response):
        """Yield a request for each show linked from the listing page.

        Links are taken from the ``href`` of anchors nested in
        ``<h2 class="emission-title">`` elements and resolved against the
        URL of ``response`` before being requested.
        """
        hrefs = response.xpath(
            '//h2[@class="emission-title"]/a/@href'
        ).extract()
        for href in hrefs:
            yield scrapy.Request(
                response.urljoin(href),
                callback=self.parse_emission,
            )

    def parse_emission(self, response):
        """Yield one dict describing the show rendered in ``response``.

        Keys of the yielded dict:
            url_emission: URL of the response itself.
            nom_emission: first text node of the show title ``<h1>``.
            url_rss: first ``href`` found in the RSS podcast container.
            genre_emission: first text node of the genre ``<span>``.
            producteurs_emission: list of ``title`` attributes of the
                links inside the producers ``<div>``.
        """
        page = scrapy.Selector(response)

        def first_match(query):
            # Single-valued fields keep only the first XPath result.
            return page.xpath(query).extract_first()

        record = {
            'url_emission': response.url,
            'nom_emission': first_match(
                '//h1[@class="cover-emission-content-link-title"]/text()'
            ),
            'url_rss': first_match(
                '//div[@class="podcast-container rss"]/a/@href'
            ),
            'genre_emission': first_match(
                '//span[@class="cover-emission-content-information-wrapper-more-genre"]/text()'
            ),
            'producteurs_emission': page.xpath(
                '//div[@class="cover-emission-content-information-wrapper-producers"]/a/@title'
            ).extract(),
        }
        yield record