diff --git a/FMScraper/items.py b/FMScraper/items.py index bef70c6..1042730 100644 --- a/FMScraper/items.py +++ b/FMScraper/items.py @@ -65,7 +65,7 @@ class episode_Item(scrapy.Item): default = 'null', input_processor = MapCompose(clean_file_url) ) - file_info = scrapy.Field() url_page = scrapy.Field() date_diffusion = scrapy.Field() + file_size= scrapy.Field() diff --git a/FMScraper/spiders/get_episodes.py b/FMScraper/spiders/get_episodes.py index 2227a7d..e523ef8 100644 --- a/FMScraper/spiders/get_episodes.py +++ b/FMScraper/spiders/get_episodes.py @@ -3,6 +3,7 @@ import scrapy from scrapy.loader import ItemLoader from scrapy.loader.processors import Join, MapCompose, TakeFirst from FMScraper.items import show_Item, episode_Item +import urllib.request class GetEpisodesSpider(scrapy.Spider): name = 'get_episodes' @@ -36,14 +37,15 @@ class GetEpisodesSpider(scrapy.Spider): def parse_episodes(self, response): for sel in response.xpath('//section[@class="emission-diffusions-list"]//a[@class="preview-list-element-link"]/@href'): + url_emission = response.url url_episode = response.urljoin(sel.extract()) next_page = response.xpath('//link[@rel="next"]/@href') - yield scrapy.Request(url_episode, callback = self.parse_episode) + yield scrapy.Request(url_episode, callback = self.parse_episode, cb_kwargs=dict(url_emission=url_emission)) if next_page: next_url = response.urljoin(next_page.extract_first()) yield scrapy.Request(url=next_url,callback = self.parse_episodes) - def parse_episode(self, response): + def parse_episode(self, response, url_emission): page_episode = scrapy.Selector(response) for page in page_episode.xpath(self.episode_xpath): loader = ItemLoader(item=episode_Item(), selector=page) @@ -51,7 +53,12 @@ class GetEpisodesSpider(scrapy.Spider): for field, xpath in self.episode_fields.items(): loader.add_xpath(field, xpath) + if field == 'url_file': + file_info = urllib.request.urlopen(response.xpath(xpath).extract_first()) + loader.add_value('file_size', file_info.headers['content-length'] ) loader.add_value('url_page', response.url) + loader.add_value('url_emission', url_emission) + loader.add_value('id_episode', response.url) yield loader.load_item()