Compare commits


No commits in common. "ca2470bb32d527b0967015fc319b51a54cbc03b1" and "951e4da065533339f8e4724305489276f7f4026f" have entirely different histories.

5 changed files with 3 additions and 80 deletions

FMScraper/items.py

@@ -18,7 +18,7 @@ def clean_text(value):
     yield value.strip()
 
 def clean_html_script(value):
-    description_re = re.match("([\w\W]+)(<script[\w\W]*</script>)?([\w\W]+)", value)
+    description_re = re.match("([\w\W]+)(<script[\w\W]*</script>)([\w\W]+)", value)
     description_full_post = description_re.group(1) + description_re.group(3)
     yield description_full_post
@@ -26,14 +26,6 @@ def clean_file_url(value):
     url = re.match("(.+\.mp3)", value)
     yield url.group(1)
 
-def clean_emission_url(value):
-    url = re.match("(.+)(\?.+$)?", value)
-    yield url.group(1)
-
-def extract_id_episode(value):
-    id_episode = re.search("-([\d]+$)", value)
-    yield id_episode.group(1)
-
 class show_Item(scrapy.Item):
     name = scrapy.Field()
     url_page = scrapy.Field()
@@ -41,31 +33,19 @@ class show_Item(scrapy.Item):
     tags = scrapy.Field()
 
 class episode_Item(scrapy.Item):
-    url_emission = scrapy.Field(
-        default = 'null',
-        input_processor = MapCompose(clean_emission_url)
-    )
     title = scrapy.Field(
         default = 'null',
         input_processor = MapCompose(clean_text)
     )
-    id_episode = scrapy.Field(
-        default = 'null',
-        input_processor = MapCompose(extract_id_episode)
-    )
     description_lead = scrapy.Field(
         default = 'null',
         input_processor = MapCompose(clean_text)
     )
     description_full = scrapy.Field(
-        default = 'null',
-        input_processor = MapCompose(clean_html_script)
-    )
     url_file = scrapy.Field(
         default = 'null',
         input_processor = MapCompose(clean_file_url)
     )
     url_page = scrapy.Field()
     date_diffusion = scrapy.Field()
-    file_size = scrapy.Field()
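Note on the regex change in clean_html_script: because group 1 is greedy and the script group was optional, the old pattern never actually captured a <script> block, so nothing was stripped; making the group mandatory fixes that, at the cost of failing on descriptions that contain no script at all. A quick standalone check (illustrative only, not part of the repo):

    import re

    OLD = r"([\w\W]+)(<script[\w\W]*</script>)?([\w\W]+)"
    NEW = r"([\w\W]+)(<script[\w\W]*</script>)([\w\W]+)"
    html = "intro<script>var x;</script>outro"

    # Old pattern: the optional group is skipped, so the script survives.
    m = re.match(OLD, html)
    print(m.group(1) + m.group(3))  # intro<script>var x;</script>outro

    # New pattern: the script block is captured by group 2 and dropped.
    m = re.match(NEW, html)
    print(m.group(1) + m.group(3))  # introoutro

    # Caveat: with no <script> in the input the new pattern does not match,
    # so clean_html_script would raise AttributeError on m.group(1).
    print(re.match(NEW, "plain description"))  # None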

FMScraper/pipelines.py

@@ -5,50 +5,7 @@
 # Don't forget to add your pipeline to the ITEM_PIPELINES setting
 # See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
 
-import psycopg2
-from config_db import DATABASE as DB
-from FMScraper.items import show_Item, episode_Item
-
 class FmscraperPipeline(object):
-    def open_spider(self, spider):
-        # Database connection
-        hostname = DB['HOST']
-        username = DB['USER']
-        password = DB['PASSWORD']
-        database = DB['DB_NAME']
-        self.connection = psycopg2.connect(host=hostname, user=username, password=password, dbname=database)
-        self.cur = self.connection.cursor()
-
-    def close_spider(self, spider):
-        # Close the database connection
-        self.cur.close()
-        self.connection.close()
-
     def process_item(self, item, spider):
-        if isinstance(item, show_Item):
-            # Add the show to the Emission table
-            # Check whether the show already exists via URL_page
-            # fields: nom, urlpage, urlrss, genre, producteur
-            self.cur.execute("INSERT INTO emission (nom, url_page, url_rss) VALUES (%s, %s, %s) RETURNING id", (item['name'], item['url_page'], item['url_feed']))
-            self.emission_id = self.cur.fetchone()[0]
-            # self.connection.commit()
-        if isinstance(item, episode_Item):
-            # Check whether the episode already exists via ID_episode
-            # Add the episode to the Episode table (with the parent show as a foreign key)
-            self.cur.execute("INSERT INTO episode (titre, description_lead, description_full, id_episode_site, emission_id) VALUES (%s, %s, %s, %s, %s) RETURNING id", (item['title'], item['description_lead'], item['description_full'], item['id_episode'], self.emission_id))
-            episode_id = self.cur.fetchone()[0]
-            self.connection.commit()
-            # Link episode -> emission via the show name
-            # Retrieve the generated id of the emission row
-            # Add the file info to the Media table
-            self.cur.execute("INSERT INTO media (url_file, filename_orig, size, episode_id) VALUES (%s, %s, %s, %s) RETURNING id", (item['url_file'], item['url_file'], item['file_size'], episode_id))
-            self.connection.commit()
-            # Check whether the file already exists via URL and file size, skip if there is an existing match
         return item
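For reference, the deleted pipeline relied on psycopg2's INSERT ... RETURNING id to fetch the generated primary key in the same round trip, so the episode insert could reference the emission row. A minimal standalone sketch of that pattern (credentials, dbname, and row values are placeholders):

    import psycopg2

    # Placeholder credentials, mirroring what config_db.py used to provide.
    conn = psycopg2.connect(host="localhost", user="user",
                            password="password", dbname="rfscraper")
    cur = conn.cursor()

    # RETURNING id hands back the generated key without a second query.
    cur.execute(
        "INSERT INTO emission (nom, url_page, url_rss) VALUES (%s, %s, %s) RETURNING id",
        ("Example show", "https://example.org/show", "https://example.org/feed.xml"),
    )
    emission_id = cur.fetchone()[0]
    conn.commit()
    cur.close()
    conn.close()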

FMScraper/spiders/get_episodes.py

@@ -3,7 +3,6 @@ import scrapy
 from scrapy.loader import ItemLoader
 from scrapy.loader.processors import Join, MapCompose, TakeFirst
 from FMScraper.items import show_Item, episode_Item
-import urllib.request
 
 class GetEpisodesSpider(scrapy.Spider):
     name = 'get_episodes'
@@ -37,15 +36,14 @@ class GetEpisodesSpider(scrapy.Spider):
     def parse_episodes(self, response):
         for sel in response.xpath('//section[@class="emission-diffusions-list"]//a[@class="preview-list-element-link"]/@href'):
-            url_emission = response.url
             url_episode = response.urljoin(sel.extract())
             next_page = response.xpath('//link[@rel="next"]/@href')
-            yield scrapy.Request(url_episode, callback = self.parse_episode, cb_kwargs=dict(url_emission=url_emission))
+            yield scrapy.Request(url_episode, callback = self.parse_episode)
         if next_page:
             next_url = response.urljoin(next_page.extract_first())
             yield scrapy.Request(url=next_url, callback = self.parse_episodes)
 
-    def parse_episode(self, response, url_emission):
+    def parse_episode(self, response):
         page_episode = scrapy.Selector(response)
         for page in page_episode.xpath(self.episode_xpath):
             loader = ItemLoader(item=episode_Item(), selector=page)
@@ -53,12 +51,7 @@
             for field, xpath in self.episode_fields.items():
                 loader.add_xpath(field, xpath)
-                if field == 'url_file':
-                    file_info = urllib.request.urlopen(response.xpath(xpath).extract_first())
-                    loader.add_value('file_size', file_info.headers['content-length'])
             loader.add_value('url_page', response.url)
-            loader.add_value('url_emission', url_emission)
-            loader.add_value('id_episode', response.url)
             yield loader.load_item()
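The removed file_size logic fetched each MP3 with urllib.request.urlopen, a full GET, just to read its Content-Length header. If that feature returns, a HEAD request gets the same header without downloading the body; a small sketch under that assumption (and assuming the server answers HEAD at all, which is not guaranteed):

    import urllib.request

    def head_content_length(url):
        # HEAD returns only the headers, so the MP3 body is never transferred.
        req = urllib.request.Request(url, method="HEAD")
        with urllib.request.urlopen(req) as resp:
            return resp.headers["Content-Length"]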

Database schema (SQL)

@@ -16,7 +16,6 @@
 ## RFScraper database tables
 
 CREATE TABLE station(
     id serial PRIMARY KEY,
-    nom text UNIQUE NOT NULL,
     url text UNIQUE NOT NULL
 );

config_db.py

@@ -1,6 +0,0 @@
-DATABASE = {
-    'HOST' : 'hostname',
-    'USER' : 'user',
-    'PASSWORD' : 'password',
-    'DB_NAME' : 'database name'
-}