Vérification si éléments existants avant INSERT
Le pipeline contient de nouvelles fonctions permettant de vérifier si les émissions/épisodes/médias sont déjà dans la base de données pour éviter les doublons. Le champ date_modif de la table media a été amélioré pour garder une trace d'éventuelles mises à jour/modifications de médias.
master
parent
ca2470bb32
commit
f47fe47eec
|
|
@ -68,4 +68,5 @@ class episode_Item(scrapy.Item):
|
||||||
# Fields scraped for one episode page.
url_page = scrapy.Field()
date_diffusion = scrapy.Field()
file_size = scrapy.Field()
# HTTP Last-Modified header of the media file; lets the pipeline detect
# that an already-known media was re-uploaded/modified.
file_last_modified = scrapy.Field()
@ -9,6 +9,21 @@ import psycopg2
|
||||||
from config_db import DATABASE as DB
|
from config_db import DATABASE as DB
|
||||||
from FMScraper.items import show_Item, episode_Item
|
from FMScraper.items import show_Item, episode_Item
|
||||||
|
|
||||||
|
def show_exists(self, url):
|
||||||
|
cur = self.connection.cursor()
|
||||||
|
cur.execute('SELECT id FROM emission WHERE url_page = %s',(url,))
|
||||||
|
return cur.fetchone()
|
||||||
|
|
||||||
|
def episode_exists(self, id_site):
|
||||||
|
cur = self.connection.cursor()
|
||||||
|
cur.execute('SELECT id FROM episode WHERE id_episode_site = %s',(id_site,))
|
||||||
|
return cur.fetchone()
|
||||||
|
|
||||||
|
def media_exists(self, date_modif, url_file):
|
||||||
|
cur = self.connection.cursor()
|
||||||
|
cur.execute('SELECT id FROM media WHERE date_modif = %s AND url_file = %s',(date_modif, url_file))
|
||||||
|
return cur.fetchone()
|
||||||
|
|
||||||
class FmscraperPipeline(object):
|
class FmscraperPipeline(object):
|
||||||
def open_spider(self, spider):
|
def open_spider(self, spider):
|
||||||
|
|
||||||
|
|
@ -28,27 +43,23 @@ class FmscraperPipeline(object):
|
||||||
def process_item(self, item, spider):
    """Insert scraped shows, episodes and media rows into PostgreSQL,
    skipping anything already present so the crawl can be re-run without
    creating duplicates.

    Returns the item unchanged so later pipeline stages still see it.
    """
    if isinstance(item, show_Item):
        # Look the show up ONCE and reuse the row — the original called
        # show_exists() twice (existence test + id read), issuing two
        # identical SELECTs per item.
        row = show_exists(self, url=item['url_page'])
        if row is not None:
            self.emission_id = row[0]
        else:
            self.cur.execute(
                "INSERT INTO emission (nom, url_page, url_rss) VALUES (%s, %s, %s) RETURNING id",
                (item['name'], item['url_page'], item['url_feed']))
            self.emission_id = self.cur.fetchone()[0]
            self.connection.commit()

    if isinstance(item, episode_Item):
        # Same single-lookup fix as above for the episode.
        row = episode_exists(self, item['id_episode'])
        if row is not None:
            self.episode_id = row[0]
        else:
            self.cur.execute(
                "INSERT INTO episode (titre, description_lead, description_full, id_episode_site, emission_id) VALUES (%s, %s, %s, %s, %s) RETURNING id",
                (item['title'], item['description_lead'], item['description_full'], item['id_episode'], self.emission_id))
            self.episode_id = self.cur.fetchone()[0]
            self.connection.commit()

        # Only insert the media row when no entry matches both the file URL
        # and its Last-Modified timestamp: a changed timestamp means the
        # file was updated and deserves a fresh row.
        if media_exists(self, item['file_last_modified'], item['url_file']) is None:
            self.cur.execute(
                "INSERT INTO media (url_file, filename_orig, size, date_modif, episode_id) VALUES (%s, %s, %s, %s, %s) RETURNING id",
                (item['url_file'], item['url_file'], item['file_size'], item['file_last_modified'], self.episode_id))
            self.connection.commit()

    return item
|
||||||
|
|
|
||||||
|
|
@ -56,6 +56,7 @@ class GetEpisodesSpider(scrapy.Spider):
|
||||||
if field == 'url_file':
|
if field == 'url_file':
|
||||||
file_info = urllib.request.urlopen(response.xpath(xpath).extract_first())
|
file_info = urllib.request.urlopen(response.xpath(xpath).extract_first())
|
||||||
loader.add_value('file_size', file_info.headers['content-length'] )
|
loader.add_value('file_size', file_info.headers['content-length'] )
|
||||||
|
loader.add_value('file_last_modified', file_info.headers['last-modified'])
|
||||||
|
|
||||||
loader.add_value('url_page', response.url)
|
loader.add_value('url_page', response.url)
|
||||||
loader.add_value('url_emission', url_emission)
|
loader.add_value('url_emission', url_emission)
|
||||||
|
|
|
||||||
|
|
@ -46,7 +46,7 @@
|
||||||
filename_orig text NOT NULL,
|
filename_orig text NOT NULL,
|
||||||
filename_local text,
|
filename_local text,
|
||||||
size integer NOT NULL,
|
size integer NOT NULL,
|
||||||
date_modif date NOT NULL,
|
date_modif timestamp,
|
||||||
md5 text,
|
md5 text,
|
||||||
duration real,
|
duration real,
|
||||||
episode_id INTEGER REFERENCES episode(id)
|
episode_id INTEGER REFERENCES episode(id)
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue