diff --git a/FMScraper/items.py b/FMScraper/items.py
index 1042730..87535f9 100644
--- a/FMScraper/items.py
+++ b/FMScraper/items.py
@@ -68,4 +68,5 @@ class episode_Item(scrapy.Item):
     url_page = scrapy.Field()
     date_diffusion = scrapy.Field()
     file_size= scrapy.Field()
+    file_last_modified = scrapy.Field()
 
diff --git a/FMScraper/pipelines.py b/FMScraper/pipelines.py
index e6caca2..7586878 100644
--- a/FMScraper/pipelines.py
+++ b/FMScraper/pipelines.py
@@ -9,6 +9,24 @@ import psycopg2
 from config_db import DATABASE as DB
 from FMScraper.items import show_Item, episode_Item
 
+def show_exists(self, url):
+    """Return the (id,) row of the emission whose url_page is `url`, or None."""
+    with self.connection.cursor() as cur:
+        cur.execute('SELECT id FROM emission WHERE url_page = %s',(url,))
+        return cur.fetchone()
+
+def episode_exists(self, id_site):
+    """Return the (id,) row of the episode whose id_episode_site is `id_site`, or None."""
+    with self.connection.cursor() as cur:
+        cur.execute('SELECT id FROM episode WHERE id_episode_site = %s',(id_site,))
+        return cur.fetchone()
+
+def media_exists(self, date_modif, url_file):
+    """Return the (id,) row of the media matching both `date_modif` and `url_file`, or None."""
+    with self.connection.cursor() as cur:
+        cur.execute('SELECT id FROM media WHERE date_modif = %s AND url_file = %s',(date_modif, url_file))
+        return cur.fetchone()
+
 class FmscraperPipeline(object):
 
     def open_spider(self, spider):
@@ -28,27 +46,25 @@ class FmscraperPipeline(object):
 
     def process_item(self, item, spider):
         if isinstance(item, show_Item):
-# Ajouter l'émission à la table Emission
-# Vérifier existence de l'émission via URL_page
-# champs nom, urlpage, urlrss, genre, producteur
-            self.cur.execute("INSERT INTO emission (nom, url_page, url_rss) VALUES (%s, %s, %s) RETURNING id", (item['name'], item['url_page'], item['url_feed']))
-            self.emission_id = self.cur.fetchone()[0]
-#            self.connection.commit()
+            row = show_exists(self, url=item['url_page'])
+            if row is None:
+                self.cur.execute("INSERT INTO emission (nom, url_page, url_rss) VALUES (%s, %s, %s) RETURNING id", (item['name'], item['url_page'], item['url_feed']))
+                row = self.cur.fetchone()
+            self.emission_id = row[0]
+            # Commit even when nothing was inserted so the lookup's transaction is closed.
+            self.connection.commit()
 
         if isinstance(item, episode_Item):
-# Vérifier l'existence de l'episode via ID_episode
-# Ajouter l'épisode à la table Episode (avec l'émission associée en clé étrangère)
-            self.cur.execute("INSERT INTO episode (titre, description_lead, description_full, id_episode_site, emission_id) VALUES (%s, %s, %s, %s, %s) RETURNING id", (item['title'], item['description_lead'], item['description_full'], item['id_episode'], self.emission_id))
-            episode_id = self.cur.fetchone()[0]
-            self.connection.commit()
-
-# Faire le lien episode -> emission via le nom de l'emission
-# Récupérer l'ID de l'entrée emission généré
-
-# Ajouter les infos de fichier dans la table Media
-            self.cur.execute("INSERT INTO media (url_file, filename_orig, size, episode_id) VALUES (%s, %s, %s, %s) RETURNING id", (item['url_file'], item['url_file'], item['file_size'], episode_id))
-            self.connection.commit()
-# Vérifier l'existence du fichier via url et taille du fichier, passer si correspondance existante
+            row = episode_exists(self, item['id_episode'])
+            if row is None:
+                self.cur.execute("INSERT INTO episode (titre, description_lead, description_full, id_episode_site, emission_id) VALUES (%s, %s, %s, %s, %s) RETURNING id", (item['title'], item['description_lead'], item['description_full'], item['id_episode'], self.emission_id))
+                row = self.cur.fetchone()
+            self.episode_id = row[0]
+            self.connection.commit()
+            # A media row is skipped only when both its URL and Last-Modified value are already stored.
+            if media_exists(self, item['file_last_modified'], item['url_file']) is None:
+                self.cur.execute("INSERT INTO media (url_file, filename_orig, size, date_modif, episode_id) VALUES (%s, %s, %s, %s, %s) RETURNING id", (item['url_file'], item['url_file'], item['file_size'], item['file_last_modified'], self.episode_id))
+            self.connection.commit()
 
         return item
 
diff --git a/FMScraper/spiders/get_episodes.py b/FMScraper/spiders/get_episodes.py
index e523ef8..c1d1912 100644
--- a/FMScraper/spiders/get_episodes.py
+++ b/FMScraper/spiders/get_episodes.py
@@ -56,6 +56,7 @@ class GetEpisodesSpider(scrapy.Spider):
             if field == 'url_file':
                 file_info = urllib.request.urlopen(response.xpath(xpath).extract_first())
                 loader.add_value('file_size', file_info.headers['content-length'] )
+                loader.add_value('file_last_modified', file_info.headers['last-modified'])
 
         loader.add_value('url_page', response.url)
         loader.add_value('url_emission', url_emission)
diff --git a/README.md b/README.md
index fba4cad..626db75 100644
--- a/README.md
+++ b/README.md
@@ -46,7 +46,7 @@
     filename_orig text NOT NULL,
     filename_local text,
     size integer NOT NULL,
-    date_modif date NOT NULL,
+    date_modif timestamp,
     md5 text,
     duration real,
     episode_id INTEGER REFERENCES episode(id)