Vérification de l'existence des éléments avant INSERT

Le pipeline contient de nouvelles fonctions permettant de vérifier si
les émissions/épisodes/médias sont déjà dans la base de données pour
éviter les doublons.

Le champ date_modif de la table media a été amélioré pour garder une
trace d'éventuelles mises à jour/modifications de médias.
master
Yohann Dedy 2019-12-26 01:10:13 +01:00
parent ca2470bb32
commit f47fe47eec
4 changed files with 33 additions and 20 deletions

View File

@ -68,4 +68,5 @@ class episode_Item(scrapy.Item):
url_page = scrapy.Field() url_page = scrapy.Field()
date_diffusion = scrapy.Field() date_diffusion = scrapy.Field()
file_size= scrapy.Field() file_size= scrapy.Field()
file_last_modified = scrapy.Field()

View File

@ -9,6 +9,21 @@ import psycopg2
from config_db import DATABASE as DB from config_db import DATABASE as DB
from FMScraper.items import show_Item, episode_Item from FMScraper.items import show_Item, episode_Item
def show_exists(self, url):
    """Return the (id,) row of the emission whose url_page equals *url*,
    or None when no such emission is stored.

    *self* is the pipeline instance carrying the open psycopg2
    connection (self.connection).
    """
    # Fix: the original leaked the cursor (never closed). The context
    # manager closes it even if execute() raises.
    with self.connection.cursor() as cur:
        cur.execute('SELECT id FROM emission WHERE url_page = %s', (url,))
        return cur.fetchone()
def episode_exists(self, id_site):
    """Return the (id,) row of the episode whose id_episode_site equals
    *id_site*, or None when the episode is not yet stored.

    *self* is the pipeline instance carrying the open psycopg2
    connection (self.connection).
    """
    # Fix: the original leaked the cursor (never closed). The context
    # manager closes it even if execute() raises.
    with self.connection.cursor() as cur:
        cur.execute('SELECT id FROM episode WHERE id_episode_site = %s', (id_site,))
        return cur.fetchone()
def media_exists(self, date_modif, url_file):
    """Return the (id,) row of the media matching both *date_modif* and
    *url_file*, or None when no such media row exists.

    Matching on the pair means a file whose last-modified timestamp
    changed is treated as new (and will be re-inserted by the caller).
    """
    # Fix: the original leaked the cursor (never closed). The context
    # manager closes it even if execute() raises.
    with self.connection.cursor() as cur:
        cur.execute('SELECT id FROM media WHERE date_modif = %s AND url_file = %s', (date_modif, url_file))
        return cur.fetchone()
class FmscraperPipeline(object): class FmscraperPipeline(object):
def open_spider(self, spider): def open_spider(self, spider):
@ -28,27 +43,23 @@ class FmscraperPipeline(object):
def process_item(self, item, spider):
    """Persist scraped items, skipping rows already present in the DB.

    - show_Item: reuse the existing emission id (looked up by url_page)
      or insert a new emission row and remember its id.
    - episode_Item: reuse the existing episode id (looked up by
      id_episode_site) or insert a new episode row, then insert the
      media row unless one with the same (date_modif, url_file) pair
      already exists.

    Returns the item unchanged so downstream pipelines still see it.
    """
    if isinstance(item, show_Item):
        # Fix: look the emission up once instead of issuing the same
        # SELECT twice (the original called show_exists in both the
        # condition and the assignment).
        existing_show = show_exists(self, url=item['url_page'])
        if existing_show is not None:
            self.emission_id = existing_show[0]
        else:
            self.cur.execute(
                "INSERT INTO emission (nom, url_page, url_rss) VALUES (%s, %s, %s) RETURNING id",
                (item['name'], item['url_page'], item['url_feed']))
            self.emission_id = self.cur.fetchone()[0]
            self.connection.commit()
    if isinstance(item, episode_Item):
        # Same single-lookup fix as above for the episode.
        existing_episode = episode_exists(self, item['id_episode'])
        if existing_episode is not None:
            self.episode_id = existing_episode[0]
        else:
            self.cur.execute(
                "INSERT INTO episode (titre, description_lead, description_full, id_episode_site, emission_id) VALUES (%s, %s, %s, %s, %s) RETURNING id",
                (item['title'], item['description_lead'], item['description_full'], item['id_episode'], self.emission_id))
            self.episode_id = self.cur.fetchone()[0]
            self.connection.commit()
        # Only insert the media row when no row with the same
        # last-modified timestamp and file URL is stored: duplicates are
        # skipped, while a changed date_modif yields a new row.
        if media_exists(self, item['file_last_modified'], item['url_file']) is None:
            self.cur.execute(
                "INSERT INTO media (url_file, filename_orig, size, date_modif, episode_id) VALUES (%s, %s, %s, %s, %s) RETURNING id",
                (item['url_file'], item['url_file'], item['file_size'], item['file_last_modified'], self.episode_id))
            self.connection.commit()
    return item

View File

@ -56,6 +56,7 @@ class GetEpisodesSpider(scrapy.Spider):
if field == 'url_file': if field == 'url_file':
file_info = urllib.request.urlopen(response.xpath(xpath).extract_first()) file_info = urllib.request.urlopen(response.xpath(xpath).extract_first())
loader.add_value('file_size', file_info.headers['content-length'] ) loader.add_value('file_size', file_info.headers['content-length'] )
loader.add_value('file_last_modified', file_info.headers['last-modified'])
loader.add_value('url_page', response.url) loader.add_value('url_page', response.url)
loader.add_value('url_emission', url_emission) loader.add_value('url_emission', url_emission)

View File

@ -46,7 +46,7 @@
filename_orig text NOT NULL, filename_orig text NOT NULL,
filename_local text, filename_local text,
size integer NOT NULL, size integer NOT NULL,
date_modif date NOT NULL, date_modif timestamp,
md5 text, md5 text,
duration real, duration real,
episode_id INTEGER REFERENCES episode(id) episode_id INTEGER REFERENCES episode(id)