Spider fonctionnelle pour émissions
Script spécifique au site de France Musique
recuperation-emissions
parent
ad75bd9d8d
commit
20f0f0d3be
|
|
@ -0,0 +1,33 @@
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
import scrapy
|
||||||
|
from scrapy.loader import ItemLoader
|
||||||
|
from FMScraper.items import show_Item
|
||||||
|
|
||||||
|
|
||||||
|
class GetEmissionsSpider(scrapy.Spider):
    """Spider that walks the francemusique.fr "emissions" listing page.

    The listing page is fetched from ``start_urls``; every link found there
    is followed and the resulting page is scraped by ``parse_emission``,
    which yields one plain dict per page.
    """

    name = 'get_emissions'
    allowed_domains = ['francemusique.fr']
    start_urls = ['http://francemusique.fr/emissions']

    def parse(self, response):
        """Yield one request per link found in the listing page.

        Links are taken from the ``href`` of anchors located directly under
        ``h2`` elements whose class attribute is ``emission-title``. Each
        href is resolved against the response URL before being requested.
        """
        hrefs = response.xpath('//h2[@class="emission-title"]/a/@href').extract()
        for href in hrefs:
            target = response.urljoin(href)
            yield scrapy.Request(target, callback=self.parse_emission)

    def parse_emission(self, response):
        """Scrape a single page and yield its fields as a dict.

        The yielded dict has the keys ``url_emission``, ``nom_emission``,
        ``url_rss``, ``genre_emission`` and ``producteurs_emission``.
        Single-valued fields are ``None`` when the XPath matches nothing;
        ``producteurs_emission`` is always a list (possibly empty).
        """
        selector = scrapy.Selector(response)

        def first_match(query):
            # First extracted result of the XPath query, or None if no match.
            return selector.xpath(query).extract_first()

        title = first_match('//h1[@class="cover-emission-content-link-title"]/text()')
        rss_link = first_match('//div[@class="podcast-container rss"]/a/@href')
        genre_label = first_match(
            '//span[@class="cover-emission-content-information-wrapper-more-genre"]/text()'
        )
        producer_names = selector.xpath(
            '//div[@class="cover-emission-content-information-wrapper-producers"]/a/@title'
        ).extract()

        record = {}
        record['url_emission'] = response.url
        record['nom_emission'] = title
        record['url_rss'] = rss_link
        record['genre_emission'] = genre_label
        record['producteurs_emission'] = producer_names
        yield record
|
||||||
|
|
||||||
|
|
||||||
Loading…
Reference in New Issue