Compare commits

...

3 Commits

Author SHA1 Message Date
Yohann Dedy ca2470bb32 Basic transfer to the database is working 2019-12-02 02:53:11 +01:00
Yohann Dedy ebae9e4b12 Simplified retrieval of the file size
The show's URL is passed along with the Episode item, in case an
Emission <-> Episode association turns out to be needed when the episode
is saved to the database
2019-12-02 02:36:32 +01:00
Yohann Dedy 52a53285d8 Better cleaning of the scraped data 2019-12-02 01:56:17 +01:00
5 changed files with 80 additions and 3 deletions

View File

@@ -18,7 +18,7 @@ def clean_text(value):
     yield value.strip()
 
 def clean_html_script(value):
-    description_re = re.match("([\w\W]+)(<script[\w\W]*</script>)([\w\W]+)",value)
+    description_re = re.match("([\w\W]+)(<script[\w\W]*</script>)?([\w\W]+)",value)
     description_full_post = description_re.group(1) + description_re.group(3)
     yield description_full_post
 
@@ -26,6 +26,14 @@ def clean_file_url(value):
     url = re.match("(.+\.mp3)", value)
     yield url.group(1)
 
+def clean_emission_url(value):
+    url = re.match("(.+)(\?.+$)?", value)
+    yield url.group(1)
+
+def extract_id_episode(value):
+    id_episode = re.search("-([\d]+$)", value)
+    yield id_episode.group(1)
+
 class show_Item(scrapy.Item):
     name = scrapy.Field()
     url_page = scrapy.Field()
@@ -33,19 +41,31 @@ class show_Item(scrapy.Item):
     tags = scrapy.Field()
 
 class episode_Item(scrapy.Item):
+    url_emission = scrapy.Field(
+        default = 'null',
+        input_processor = MapCompose(clean_emission_url)
+    )
     title = scrapy.Field(
         default = 'null',
         input_processor = MapCompose(clean_text)
     )
+    id_episode = scrapy.Field(
+        default = 'null',
+        input_processor = MapCompose(extract_id_episode)
+    )
     description_lead = scrapy.Field(
         default = 'null',
         input_processor = MapCompose(clean_text)
     )
     description_full = scrapy.Field(
+        default = 'null',
+        input_processor = MapCompose(clean_html_script)
+    )
     url_file = scrapy.Field(
         default = 'null',
         input_processor = MapCompose(clean_file_url)
     )
     url_page = scrapy.Field()
     date_diffusion = scrapy.Field()
+    file_size= scrapy.Field()
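
A quick illustration of what the new input processors yield, using made-up URLs (the inputs below are examples for this note, not values taken from the scraper):

# Hypothetical inputs, shown only to illustrate the helpers added above
list(extract_id_episode("https://example.org/emissions/le-concert-du-soir-77859"))
# -> ['77859']
list(clean_file_url("https://example.org/audio/episode-77859.mp3?origin=rss"))
# -> ['https://example.org/audio/episode-77859.mp3']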

View File

@@ -5,7 +5,50 @@
 # Don't forget to add your pipeline to the ITEM_PIPELINES setting
 # See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
+import psycopg2
+
+from config_db import DATABASE as DB
+from FMScraper.items import show_Item, episode_Item
 
 class FmscraperPipeline(object):
+
+    def open_spider(self, spider):
+        # Database connection
+        hostname = DB['HOST']
+        username = DB['USER']
+        password = DB['PASSWORD']
+        database = DB['DB_NAME']
+        self.connection = psycopg2.connect(host=hostname, user=username, password=password, dbname=database)
+        self.cur = self.connection.cursor()
+
+    def close_spider(self, spider):
+        # Close the database connection
+        self.cur.close()
+        self.connection.close()
+
     def process_item(self, item, spider):
+        if isinstance(item, show_Item):
+            # Add the show to the Emission table
+            # Check whether the show already exists via URL_page
+            # fields: nom, urlpage, urlrss, genre, producteur
+            self.cur.execute("INSERT INTO emission (nom, url_page, url_rss) VALUES (%s, %s, %s) RETURNING id", (item['name'], item['url_page'], item['url_feed']))
+            self.emission_id = self.cur.fetchone()[0]
+            # self.connection.commit()
+        if isinstance(item, episode_Item):
+            # Check whether the episode already exists via ID_episode
+            # Add the episode to the Episode table (with the associated show as a foreign key)
+            self.cur.execute("INSERT INTO episode (titre, description_lead, description_full, id_episode_site, emission_id) VALUES (%s, %s, %s, %s, %s) RETURNING id", (item['title'], item['description_lead'], item['description_full'], item['id_episode'], self.emission_id))
+            episode_id = self.cur.fetchone()[0]
+            self.connection.commit()
+            # Link the episode to its show via the show's name
+            # Retrieve the generated ID of the emission row
+            # Add the file info to the Media table
+            self.cur.execute("INSERT INTO media (url_file, filename_orig, size, episode_id) VALUES (%s, %s, %s, %s) RETURNING id", (item['url_file'], item['url_file'], item['file_size'], episode_id))
+            self.connection.commit()
+            # Check whether the file already exists via URL and file size; skip if a match is found
         return item
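
The comments above leave the existence checks as TODOs. A minimal sketch of one way the show check could look inside process_item, assuming a lookup on emission.url_page is acceptable; this is an illustration, not part of the commit:

# Hypothetical variant of the show_Item branch: reuse an existing emission row
# whose url_page already matches, insert a new one otherwise.
self.cur.execute("SELECT id FROM emission WHERE url_page = %s", (item['url_page'],))
row = self.cur.fetchone()
if row:
    self.emission_id = row[0]
else:
    self.cur.execute(
        "INSERT INTO emission (nom, url_page, url_rss) VALUES (%s, %s, %s) RETURNING id",
        (item['name'], item['url_page'], item['url_feed']))
    self.emission_id = self.cur.fetchone()[0]
    self.connection.commit()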

View File

@@ -3,6 +3,7 @@ import scrapy
 from scrapy.loader import ItemLoader
 from scrapy.loader.processors import Join, MapCompose, TakeFirst
 from FMScraper.items import show_Item, episode_Item
+import urllib.request
 
 class GetEpisodesSpider(scrapy.Spider):
     name = 'get_episodes'
@@ -36,14 +37,15 @@ class GetEpisodesSpider(scrapy.Spider):
     def parse_episodes(self, response):
         for sel in response.xpath('//section[@class="emission-diffusions-list"]//a[@class="preview-list-element-link"]/@href'):
+            url_emission = response.url
             url_episode = response.urljoin(sel.extract())
             next_page = response.xpath('//link[@rel="next"]/@href')
-            yield scrapy.Request(url_episode, callback = self.parse_episode)
+            yield scrapy.Request(url_episode, callback = self.parse_episode, cb_kwargs=dict(url_emission=url_emission))
             if next_page:
                 next_url = response.urljoin(next_page.extract_first())
                 yield scrapy.Request(url=next_url,callback = self.parse_episodes)
 
-    def parse_episode(self, response):
+    def parse_episode(self, response, url_emission):
         page_episode = scrapy.Selector(response)
         for page in page_episode.xpath(self.episode_xpath):
             loader = ItemLoader(item=episode_Item(), selector=page)
@@ -51,7 +53,12 @@ class GetEpisodesSpider(scrapy.Spider):
             for field, xpath in self.episode_fields.items():
                 loader.add_xpath(field, xpath)
+                if field == 'url_file':
+                    file_info = urllib.request.urlopen(response.xpath(xpath).extract_first())
+                    loader.add_value('file_size', file_info.headers['content-length'])
             loader.add_value('url_page', response.url)
+            loader.add_value('url_emission', url_emission)
+            loader.add_value('id_episode', response.url)
             yield loader.load_item()
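
The commit obtains the file size by opening the MP3 URL with urllib.request.urlopen and reading the content-length header. One possible refinement, sketched here as a suggestion rather than as what the code does, is to send a HEAD request so only the headers are transferred:

import urllib.request

def remote_file_size(url):
    # Ask the server for headers only; no audio data is downloaded
    request = urllib.request.Request(url, method='HEAD')
    with urllib.request.urlopen(request) as response:
        return response.headers['Content-Length']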

View File

@@ -16,6 +16,7 @@
 ## RFScraper database tables
 CREATE TABLE station(
     id serial PRIMARY KEY,
+    nom text UNIQUE NOT NULL,
     url text UNIQUE NOT NULL
 );

config_db.py Normal file (6 additions)
View File

@@ -0,0 +1,6 @@
+DATABASE = {
+    'HOST' : 'hostname',
+    'USER' : 'user',
+    'PASSWORD' : 'password',
+    'DB_NAME' : 'database name'
+}