Compare commits

..

1 Commits

Author SHA1 Message Date
Yohann Dedy 20f0f0d3be Spider fonctionnelle pour émissions
Script spécifique au site de France Musique
2020-01-28 00:51:39 +01:00
1 changed files with 33 additions and 0 deletions

View File

@@ -0,0 +1,33 @@
# -*- coding: utf-8 -*-
import scrapy
from scrapy.loader import ItemLoader
from FMScraper.items import show_Item
class GetEmissionsSpider(scrapy.Spider):
    """Spider that collects show ("emission") metadata from France Musique.

    The crawl starts on the shows index page, requests every show page
    linked from it, and yields one plain dict per show page.
    """

    name = 'get_emissions'
    allowed_domains = ['francemusique.fr']
    start_urls = ['http://francemusique.fr/emissions']

    def parse(self, response):
        """Schedule a request for each show linked from the index page.

        Yields one ``scrapy.Request`` per link found under an
        ``h2.emission-title`` heading; each is handled by
        ``parse_emission``.
        """
        for sel in response.xpath('//h2[@class="emission-title"]/a/@href'):
            # hrefs may be relative, so resolve them against the page URL.
            url_emission = response.urljoin(sel.extract())
            yield scrapy.Request(url_emission, callback=self.parse_emission)

    def parse_emission(self, response):
        """Extract the metadata of a single show page.

        Yields a dict with the keys ``url_emission``, ``nom_emission``,
        ``url_rss``, ``genre_emission`` and ``producteurs_emission``.
        The producers value is the list of every matching ``title``
        attribute; the other scraped values are the first XPath match.
        """
        # Query the response directly: wrapping it in a new
        # scrapy.Selector would build a second selector for the same page.
        nom_emission = response.xpath(
            '//h1[@class="cover-emission-content-link-title"]/text()'
        ).extract_first()
        url_rss = response.xpath(
            '//div[@class="podcast-container rss"]/a/@href'
        ).extract_first()
        genre = response.xpath(
            '//span[@class="cover-emission-content-information-wrapper-more-genre"]/text()'
        ).extract_first()
        producteurs = response.xpath(
            '//div[@class="cover-emission-content-information-wrapper-producers"]/a/@title'
        ).extract()

        yield {
            'url_emission': response.url,
            'nom_emission': nom_emission,
            'url_rss': url_rss,
            'genre_emission': genre,
            'producteurs_emission': producteurs,
        }