Transfert sommaire vers base de données fonctionnel

master
Yohann Dedy 2019-12-02 02:44:36 +01:00
parent ebae9e4b12
commit ca2470bb32
3 changed files with 50 additions and 0 deletions

View File

@ -5,7 +5,50 @@
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
import psycopg2
from config_db import DATABASE as DB
from FMScraper.items import show_Item, episode_Item
class FmscraperPipeline(object):
    """Item pipeline that writes scraped shows and episodes to PostgreSQL.

    A ``show_Item`` becomes a row in ``emission``; an ``episode_Item`` becomes
    a row in ``episode`` (linked to the most recently stored show) plus a row
    in ``media`` describing its file.
    """

    # Primary key of the most recently stored show. Defined at class level so
    # that an episode arriving before any show is detected explicitly instead
    # of failing with an AttributeError.
    emission_id = None

    def open_spider(self, spider):
        """Open the database connection and cursor when the spider starts."""
        self.connection = psycopg2.connect(
            host=DB['HOST'],
            user=DB['USER'],
            password=DB['PASSWORD'],
            dbname=DB['DB_NAME'],
        )
        self.cur = self.connection.cursor()
        self.emission_id = None

    def close_spider(self, spider):
        """Close the cursor and the database connection when the spider stops."""
        try:
            self.cur.close()
        finally:
            # Release the connection even if closing the cursor failed.
            self.connection.close()

    def process_item(self, item, spider):
        """Persist ``item`` and return it unchanged for later pipelines.

        Any exception raised while storing the item is re-raised after the
        current transaction has been rolled back.
        """
        try:
            if isinstance(item, show_Item):
                self._store_show(item)
            if isinstance(item, episode_Item):
                self._store_episode(item)
        except Exception:
            # Without a rollback the connection stays in an aborted
            # transaction and every following statement would fail too.
            self.connection.rollback()
            raise
        return item

    def _store_show(self, item):
        """Insert a show into ``emission`` and remember its generated id."""
        # TODO: check whether the show already exists (via url_page) first.
        # TODO: the genre and producer fields are not stored yet.
        self.cur.execute(
            "INSERT INTO emission (nom, url_page, url_rss) "
            "VALUES (%s, %s, %s) RETURNING id",
            (item['name'], item['url_page'], item['url_feed']),
        )
        new_id = self.cur.fetchone()[0]
        # Commit right away so the show survives a later episode failure
        # (and its rollback) and is not lost if no episode follows.
        self.connection.commit()
        self.emission_id = new_id

    def _store_episode(self, item):
        """Insert an episode and its media file in a single transaction."""
        if self.emission_id is None:
            raise RuntimeError(
                "episode item received before any show item was stored")
        # TODO: check whether the episode already exists (via id_episode).
        self.cur.execute(
            "INSERT INTO episode (titre, description_lead, description_full, "
            "id_episode_site, emission_id) "
            "VALUES (%s, %s, %s, %s, %s) RETURNING id",
            (item['title'], item['description_lead'],
             item['description_full'], item['id_episode'], self.emission_id),
        )
        episode_id = self.cur.fetchone()[0]
        # TODO: skip the media insert when a row with the same URL and file
        # size already exists.
        # NOTE(review): filename_orig is filled with the file URL, as in the
        # original code - confirm this is intended.
        self.cur.execute(
            "INSERT INTO media (url_file, filename_orig, size, episode_id) "
            "VALUES (%s, %s, %s, %s) RETURNING id",
            (item['url_file'], item['url_file'], item['file_size'],
             episode_id),
        )
        # One commit for both rows: an episode is never stored without its
        # media entry.
        self.connection.commit()

View File

@ -16,6 +16,7 @@
## Tables de la base RFScraper
-- station: each row has a generated id, a unique name and a unique URL.
CREATE TABLE station(
    id serial PRIMARY KEY,
    nom text UNIQUE NOT NULL,
    url text UNIQUE NOT NULL
);

6
config_db.py Normal file
View File

@ -0,0 +1,6 @@
# Database connection settings, keyed by HOST, USER, PASSWORD and DB_NAME.
# NOTE(review): the values look like placeholders - replace them with the
# real connection parameters before running.
DATABASE = dict(
    HOST='hostname',
    USER='user',
    PASSWORD='password',
    DB_NAME='database name',
)