From ebae9e4b120c09ee715d4700e8a42f3af36bd5d9 Mon Sep 17 00:00:00 2001
From: Yohann Dedy
Date: Mon, 2 Dec 2019 02:36:32 +0100
Subject: [PATCH] Simplified retrieval of the file size
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The show's URL is now passed along with the Episode item, in case an
Emission <-> Episode association turns out to be necessary when the
episode is saved to the database.
---
 FMScraper/items.py                |  2 +-
 FMScraper/spiders/get_episodes.py | 11 +++++++++--
 2 files changed, 10 insertions(+), 3 deletions(-)

diff --git a/FMScraper/items.py b/FMScraper/items.py
index bef70c6..1042730 100644
--- a/FMScraper/items.py
+++ b/FMScraper/items.py
@@ -65,7 +65,7 @@ class episode_Item(scrapy.Item):
         default = 'null',
         input_processor = MapCompose(clean_file_url)
     )
-    file_info = scrapy.Field()
     url_page = scrapy.Field()
     date_diffusion = scrapy.Field()
+    file_size = scrapy.Field()
 
diff --git a/FMScraper/spiders/get_episodes.py b/FMScraper/spiders/get_episodes.py
index 2227a7d..e523ef8 100644
--- a/FMScraper/spiders/get_episodes.py
+++ b/FMScraper/spiders/get_episodes.py
@@ -3,6 +3,7 @@ import scrapy
 from scrapy.loader import ItemLoader
 from scrapy.loader.processors import Join, MapCompose, TakeFirst
 from FMScraper.items import show_Item, episode_Item
+import urllib.request
 
 class GetEpisodesSpider(scrapy.Spider):
     name = 'get_episodes'
@@ -36,14 +37,15 @@
 
     def parse_episodes(self, response):
         for sel in response.xpath('//section[@class="emission-diffusions-list"]//a[@class="preview-list-element-link"]/@href'):
+            url_emission = response.url
             url_episode = response.urljoin(sel.extract())
             next_page = response.xpath('//link[@rel="next"]/@href')
-            yield scrapy.Request(url_episode, callback = self.parse_episode)
+            yield scrapy.Request(url_episode, callback = self.parse_episode, cb_kwargs=dict(url_emission=url_emission))
             if next_page:
                 next_url = response.urljoin(next_page.extract_first())
                 yield scrapy.Request(url=next_url,callback = self.parse_episodes)
 
-    def parse_episode(self, response):
+    def parse_episode(self, response, url_emission):
         page_episode = scrapy.Selector(response)
         for page in page_episode.xpath(self.episode_xpath):
             loader = ItemLoader(item=episode_Item(), selector=page)
@@ -51,7 +53,12 @@
 
             for field, xpath in self.episode_fields.items():
                 loader.add_xpath(field, xpath)
+                if field == 'url_file':
+                    with urllib.request.urlopen(response.xpath(xpath).extract_first()) as file_info:
+                        loader.add_value('file_size', file_info.headers['content-length'])
 
             loader.add_value('url_page', response.url)
+            loader.add_value('url_emission', url_emission)
+            loader.add_value('id_episode', response.url)
 
             yield loader.load_item()
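
A note on the file-size lookup added in parse_episode: urllib.request.urlopen()
issues a synchronous GET from inside Scrapy's asynchronous engine, so the whole
crawl stalls while each audio URL is contacted, and it assumes extract_first()
actually returned a URL (it yields None when the XPath matches nothing). A
non-blocking alternative is to let Scrapy fetch the headers itself with a HEAD
request and fill in file_size from a second callback. The sketch below shows
one way to do that, assuming the file server answers HEAD requests; the
GetEpisodesHeadSpider subclass and the parse_file_size helper are illustrative
names, not code from this repository.

import scrapy
from scrapy.loader import ItemLoader
from FMScraper.items import episode_Item
from FMScraper.spiders.get_episodes import GetEpisodesSpider

class GetEpisodesHeadSpider(GetEpisodesSpider):
    # Sketch: same spider, but the file size comes from a HEAD request
    # scheduled through Scrapy's downloader instead of a blocking urlopen().
    name = 'get_episodes_head'

    def parse_episode(self, response, url_emission):
        for page in scrapy.Selector(response).xpath(self.episode_xpath):
            loader = ItemLoader(item=episode_Item(), selector=page)
            for field, xpath in self.episode_fields.items():
                loader.add_xpath(field, xpath)
            loader.add_value('url_page', response.url)
            loader.add_value('url_emission', url_emission)
            loader.add_value('id_episode', response.url)
            item = loader.load_item()

            url_file = item.get('url_file')
            if url_file:
                # Only the headers cross the wire, and the request waits in
                # Scrapy's normal queue instead of blocking the reactor.
                yield scrapy.Request(url_file, method='HEAD',
                                     callback=self.parse_file_size,
                                     cb_kwargs=dict(item=item))
            else:
                yield item

    def parse_file_size(self, response, item):
        # Scrapy header values are bytes; decode before storing.
        size = response.headers.get(b'Content-Length')
        item['file_size'] = size.decode() if size else None
        yield item

Splitting the work this way also means a dead file URL surfaces as a failed
HEAD request in the crawl log rather than an exception inside the page
callback.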
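
Two smaller points. First, cb_kwargs, used above to hand url_emission to
parse_episode, only exists in Scrapy 1.7 and later; on older versions the
same hand-off conventionally goes through Request.meta. A minimal equivalent,
using only the stock Request/Response API:

# Scrapy < 1.7: carry the show URL in Request.meta instead of cb_kwargs.
# In parse_episodes():
yield scrapy.Request(url_episode, callback=self.parse_episode,
                     meta={'url_emission': url_emission})

# ...and in the callback, read it back off the response:
def parse_episode(self, response):
    url_emission = response.meta['url_emission']

Second, loader.add_value('url_emission', ...) and
loader.add_value('id_episode', ...) require episode_Item to declare
url_emission and id_episode fields; if those are not already declared
elsewhere in items.py, load_item() will raise a KeyError for the first
undeclared one.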