# OlympIA / plugins / scansite.py
# johannoriel — commit f34a6fd: "Initial release. Tested. Working"
from app import Plugin
import streamlit as st
import sqlite3
import requests
from bs4 import BeautifulSoup
from datetime import datetime
import ollama
from global_vars import t, translations
# Register the English UI strings that are specific to this plugin.
translations["en"].update(dict(
    scansite_title="News Aggregator",
    total_links="Total number of links",
    annotated_links="Number of annotated links",
    known_tags="Known tags",
    reset_database="Reset database",
    database_reset_success="Database reset successfully",
    launch_scan="Launch scan",
    scan_complete="Scan complete",
    no_articles="No articles to display.",
    page="Page",
    previous_page="Previous page",
    next_page="Next page",
    new_articles="New Articles",
    rated_articles="Rated Articles",
    clicked_not_rated="Clicked but not rated Articles",
    tagged_articles="Tagged Articles",
    ignored_articles="Ignored Articles",
    excluded_articles="Excluded Articles",
    rating="Rating",
    tags="Tags",
    exclude="Exclude",
    sources="Sources",
    update="Update",
    delete="Delete",
    add_new_source="Add a new source (URL)",
    add_source="Add source",
    new_tag="New tag",
    new_tag_description="New tag description",
    add_tag="Add tag",
    work_directory="Work Directory",
))
# Register the French UI strings that are specific to this plugin.
translations["fr"].update(dict(
    scansite_title="Agrégateur de Nouvelles",
    total_links="Nombre total de liens",
    annotated_links="Nombre de liens annotés",
    known_tags="Tags connus",
    reset_database="Réinitialiser la base de données",
    database_reset_success="Base de données réinitialisée",
    launch_scan="Lancer le scan",
    scan_complete="Scan terminé",
    no_articles="Aucun article à afficher.",
    page="Page",
    previous_page="Page précédente",
    next_page="Page suivante",
    new_articles="Nouveaux Articles",
    rated_articles="Articles Notés",
    clicked_not_rated="Articles Cliqués non notés",
    tagged_articles="Articles Tagués",
    ignored_articles="Articles Ignorés",
    excluded_articles="Articles Exclus",
    rating="Note",
    tags="Tags",
    exclude="Exclure",
    sources="Sources",
    update="Mettre à jour",
    delete="Supprimer",
    add_new_source="Ajouter une nouvelle source (URL)",
    add_source="Ajouter source",
    new_tag="Nouveau tag",
    new_tag_description="Description du nouveau tag",
    add_tag="Ajouter tag",
    work_directory="Répertoire de travail",
))
class ScansitePlugin(Plugin):
    """Streamlit plugin that scans source pages for article links.

    Articles, sources, tags and user actions (click / rate / tag) are stored
    in a local SQLite database. A single connection and cursor are created in
    ``__init__`` and shared by every method.

    Rows returned by ``SELECT * FROM articles`` are plain tuples laid out as
    ``(id, source_id, url, title, date, is_new, is_excluded)``.
    """

    # SQLite file used by the plugin (relative to the working directory).
    DB_PATH = 'news_app.db'
    # Seconds to wait for any HTTP request before giving up, so that one
    # unresponsive site cannot hang a whole scan.
    REQUEST_TIMEOUT = 10

    def __init__(self, name, plugin_manager):
        """Open the database and make sure the schema exists."""
        super().__init__(name, plugin_manager)
        self.conn = self.get_connection()
        self.c = self.conn.cursor()
        self.init_db()

    def get_connection(self):
        """Return a new SQLite connection usable from any thread."""
        return sqlite3.connect(self.DB_PATH, check_same_thread=False)

    def init_db(self):
        """Create or upgrade the schema according to the stored version."""
        current_version = self.get_db_version()
        if current_version < 1:
            self.c.execute('''CREATE TABLE IF NOT EXISTS sources
                (id INTEGER PRIMARY KEY, url TEXT, title TEXT)''')
            self.c.execute('''CREATE TABLE IF NOT EXISTS articles
                (id INTEGER PRIMARY KEY, source_id INTEGER, url TEXT UNIQUE, title TEXT, date TEXT,
                is_new INTEGER, is_excluded INTEGER DEFAULT 0)''')
            self.c.execute('''CREATE TABLE IF NOT EXISTS user_actions
                (id INTEGER PRIMARY KEY, article_id INTEGER, action TEXT, rating INTEGER, tags TEXT, timestamp TEXT)''')
            self.c.execute('''CREATE TABLE IF NOT EXISTS tags
                (id INTEGER PRIMARY KEY, name TEXT UNIQUE, description TEXT)''')
            self.set_db_version(1)
        # Future schema upgrades go here, one ``if current_version < N`` step
        # per version, each ending with ``self.set_db_version(N)``.
        self.conn.commit()

    def get_db_version(self):
        """Return the stored schema version, or 0 when none is recorded."""
        self.c.execute('''CREATE TABLE IF NOT EXISTS db_version (version INTEGER)''')
        self.c.execute('SELECT version FROM db_version')
        result = self.c.fetchone()
        return result[0] if result else 0

    def set_db_version(self, version):
        """Persist ``version`` as the single row of ``db_version``."""
        self.c.execute('INSERT OR REPLACE INTO db_version (rowid, version) VALUES (1, ?)', (version,))
        self.conn.commit()

    def get_tabs(self):
        """Return the tab descriptor(s) contributed by this plugin."""
        return [{"name": t("scansite_title"), "plugin": "scansite"}]

    def run(self, config):
        """Render the plugin page: stats, reset/scan buttons and article tabs."""
        st.title(t("scansite_title"))
        total_links, annotated_links = self.get_stats()
        st.write(f"{t('total_links')} : {total_links}")
        st.write(f"{t('annotated_links')} : {annotated_links}")
        all_tags = self.get_all_tags()
        st.write(f"{t('known_tags')} :", ", ".join(all_tags))
        if st.button(t("reset_database")):
            self.reset_database()
            st.success(t("database_reset_success"))
        if st.button(t("launch_scan")):
            self.launch_scan()
            st.success(t("scan_complete"))
        self.display_tabs()

    def get_stats(self):
        """Return ``(non-excluded article count, annotated article count)``."""
        total_links = self.c.execute(
            "SELECT COUNT(*) FROM articles WHERE is_excluded = 0"
        ).fetchone()[0]
        annotated_links = self.c.execute("""
            SELECT COUNT(DISTINCT article_id) FROM user_actions
            WHERE action IN ('click', 'rate', 'tag')
        """).fetchone()[0]
        return total_links, annotated_links

    def get_all_tags(self):
        """Return the names of all known tags."""
        return [row[0] for row in self.c.execute("SELECT name FROM tags").fetchall()]

    def reset_database(self):
        """Drop every table and recreate an empty schema."""
        for table in ("sources", "articles", "user_actions", "tags", "db_version"):
            # db_version must be dropped too: otherwise init_db() still sees
            # version 1 and skips the CREATE TABLE statements, leaving the
            # database without any of its tables.
            self.c.execute(f"DROP TABLE IF EXISTS {table}")
        self.conn.commit()
        self.init_db()

    def launch_scan(self):
        """Scan every source and store the newly discovered article links."""
        sources = self.c.execute("SELECT id, url FROM sources").fetchall()
        for source_id, source_url in sources:
            # Everything already known for this source stops being "new"
            # before the fresh links are inserted with is_new = 1.
            self.mark_not_new(source_id)
            links = self.scan_new_links(source_id, source_url)
            today = datetime.now().strftime('%Y-%m-%d')
            self.c.executemany("""
                INSERT OR IGNORE INTO articles (source_id, url, title, date, is_new, is_excluded)
                VALUES (?, ?, ?, ?, 1, 0)
            """, [(source_id, link, title, today) for link, title in links])
        self.conn.commit()

    def display_tabs(self):
        """Render one tab per article category."""
        # (translation key, article fetcher, prefix used for widget keys)
        sections = [
            ("new_articles", self.get_new_articles, "nouveaux"),
            ("rated_articles", self.get_rated_articles, "notes"),
            ("clicked_not_rated", self.get_clicked_not_rated_articles, "cliques"),
            ("tagged_articles", self.get_tagged_articles, "tagues"),
            ("ignored_articles", self.get_ignored_articles, "ignores"),
            ("excluded_articles", self.get_excluded_articles, "exclus"),
        ]
        tabs = st.tabs([t(label) for label, _, _ in sections])
        all_tags = self.get_all_tags()
        for tab, (label, fetch_articles, tab_name) in zip(tabs, sections):
            with tab:
                st.header(t(label))
                self.display_paginated_articles(fetch_articles(), all_tags, tab_name)

    def display_paginated_articles(self, articles, all_tags, tab_name, items_per_page=20):
        """Render ``articles`` page by page with previous/next controls.

        ``tab_name`` prefixes every widget/session key so that the same
        article can appear in several tabs without key collisions.
        """
        if not articles:
            st.write(t("no_articles"))
            return
        total_pages = (len(articles) - 1) // items_per_page + 1
        page_key = f"{tab_name}_page"
        if page_key not in st.session_state:
            st.session_state[page_key] = 1
        # The list may have shrunk since the page was stored (e.g. after
        # excluding articles); clamp so the value never exceeds max_value.
        st.session_state[page_key] = min(max(st.session_state[page_key], 1), total_pages)
        page = st.number_input(t("page"), min_value=1, max_value=total_pages,
                               value=st.session_state[page_key],
                               key=f"{tab_name}_number_input")
        st.session_state[page_key] = page
        start_idx = (page - 1) * items_per_page
        end_idx = start_idx + items_per_page
        for article in articles[start_idx:end_idx]:
            self.display_article(article, all_tags, tab_name)
        col1, col2, col3 = st.columns(3)
        with col1:
            if page > 1:
                if st.button(t("previous_page"), key=f"{tab_name}_prev"):
                    st.session_state[page_key] = page - 1
                    st.rerun()
        with col3:
            if page < total_pages:
                if st.button(t("next_page"), key=f"{tab_name}_next"):
                    st.session_state[page_key] = page + 1
                    st.rerun()
        with col2:
            st.write(f"{t('page')} {page}/{total_pages}")

    def display_article(self, article, all_tags, tab_name):
        """Render one article row: title/summary, link, rating, tags, exclude."""
        article_id, article_url, article_title = article[0], article[2], article[3]
        col1, col2, col3, col4, col5 = st.columns([3, 0.5, 1, 2, 1])
        with col1:
            summary_key = f"{tab_name}_summary_{article_id}"
            if summary_key not in st.session_state:
                st.session_state[summary_key] = None
            if st.button(article_title, key=f"{tab_name}_article_{article_id}"):
                # Clicking the title generates a summary and records a click.
                st.session_state[summary_key] = self.get_article_summary(article_url)
                self.c.execute("INSERT INTO user_actions (article_id, action, timestamp) VALUES (?, ?, ?)",
                               (article_id, 'click', datetime.now().strftime('%Y-%m-%d %H:%M:%S')))
                self.c.execute("UPDATE articles SET is_new = 0 WHERE id = ?", (article_id,))
                self.conn.commit()
            if st.session_state[summary_key]:
                st.write(st.session_state[summary_key])
        with col2:
            st.markdown(f"[🔗]({article_url})")
        with col3:
            rating_key = f"{tab_name}_rating_{article_id}"
            current_rating = self.get_article_rating(article_id)
            rating = st.slider(t("rating"), 0, 5, current_rating, key=rating_key)
            if rating != current_rating:
                self.c.execute("INSERT INTO user_actions (article_id, action, rating, timestamp) VALUES (?, ?, ?, ?)",
                               (article_id, 'rate', rating, datetime.now().strftime('%Y-%m-%d %H:%M:%S')))
                self.conn.commit()
        with col4:
            tags_key = f"{tab_name}_tags_{article_id}"
            # A stored tag may have been deleted from the tag list since it
            # was applied; Streamlit rejects defaults that are not among the
            # options, so only still-known tags are pre-selected.
            current_tags = [tag for tag in self.get_article_tags(article_id) if tag in all_tags]
            selected_tags = st.multiselect(t("tags"), all_tags, default=current_tags, key=tags_key)
            if set(selected_tags) != set(current_tags):
                tags_str = ','.join(selected_tags)
                self.c.execute("INSERT INTO user_actions (article_id, action, tags, timestamp) VALUES (?, ?, ?, ?)",
                               (article_id, 'tag', tags_str, datetime.now().strftime('%Y-%m-%d %H:%M:%S')))
                self.conn.commit()
        with col5:
            exclude_key = f"{tab_name}_exclude_{article_id}"
            if st.button(t("exclude"), key=exclude_key):
                self.c.execute("UPDATE articles SET is_excluded = 1 WHERE id = ?", (article_id,))
                self.conn.commit()
                st.rerun()

    def get_config_ui(self, config):
        """Render the source/tag management UI and return the shown values.

        Returns a dict with the keys ``sources``, ``new_source_url``,
        ``tags``, ``new_tag`` and ``new_tag_description``.
        """
        updated_config = {}
        st.header(t("sources"))
        sources = self.c.execute("SELECT id, url, title FROM sources").fetchall()
        for source_id, source_url, source_title in sources:
            col1, col2, col3 = st.columns([3, 1, 1])
            with col1:
                new_title = st.text_input(f"{t('update')} {source_url}", value=source_title,
                                          key=f"source_title_{source_id}")
            with col2:
                if st.button(t("update"), key=f"update_source_{source_id}"):
                    self.c.execute("UPDATE sources SET title = ? WHERE id = ?", (new_title, source_id))
                    self.conn.commit()
            with col3:
                if st.button(t("delete"), key=f"delete_source_{source_id}"):
                    self.c.execute("DELETE FROM sources WHERE id = ?", (source_id,))
                    self.conn.commit()
        new_url = st.text_input(t("add_new_source"))
        # Ignore the button when no URL was typed, to avoid storing an empty
        # source that every later scan would fail on.
        if st.button(t("add_source")) and new_url.strip():
            source_url = new_url.strip()
            title = self.fetch_page_title(source_url)
            self.c.execute("INSERT INTO sources (url, title) VALUES (?, ?)", (source_url, title))
            self.conn.commit()

        st.header(t("tags"))
        tags = self.get_all_tags_with_descriptions()
        for tag, description in tags:
            col1, col2, col3, col4 = st.columns([2, 3, 1, 1])
            with col1:
                st.text(tag)
            with col2:
                new_description = st.text_input(f"{t('update')} {tag}", value=description,
                                                key=f"tag_desc_{tag}")
            with col3:
                if st.button(t("update"), key=f"update_tag_{tag}"):
                    self.add_or_update_tag(tag, new_description)
            with col4:
                if st.button(t("delete"), key=f"delete_tag_{tag}"):
                    self.delete_tag(tag)
        new_tag = st.text_input(t("new_tag"))
        new_tag_description = st.text_input(t("new_tag_description"))
        # Same guard as for sources: an empty tag name is never stored.
        if st.button(t("add_tag")) and new_tag.strip():
            self.add_or_update_tag(new_tag.strip(), new_tag_description)

        # Report the values currently shown in the UI back to the caller.
        updated_config["sources"] = sources
        updated_config["new_source_url"] = new_url
        updated_config["tags"] = tags
        updated_config["new_tag"] = new_tag
        updated_config["new_tag_description"] = new_tag_description
        return updated_config

    def fetch_page_title(self, url):
        """Return the ``<title>`` text of ``url``, or ``url`` itself on failure.

        Best effort: network errors, pages without a title and empty titles
        all fall back to the URL.
        """
        try:
            response = requests.get(url, timeout=self.REQUEST_TIMEOUT)
            soup = BeautifulSoup(response.text, 'html.parser')
        except Exception:
            return url
        title = soup.title.string if soup.title else None
        return title or url

    def mark_not_new(self, source_id):
        """Clear the ``is_new`` flag on every article of ``source_id``."""
        self.c.execute("UPDATE articles SET is_new = 0 WHERE source_id = ?", (source_id,))
        self.conn.commit()

    def scan_new_links(self, source_id, url):
        """Return the ``(link, title)`` pairs found at ``url`` not yet stored."""
        unseen = []
        for link, title in self.scan_links(url):
            self.c.execute("SELECT id FROM articles WHERE url = ?", (link,))
            if self.c.fetchone() is None:
                unseen.append((link, title))
        return unseen

    def scan_links(self, url):
        """Return ``(href, title)`` pairs for links on ``url`` that look like articles.

        Only absolute ``http(s)`` links are considered. A link is kept when
        the page it points to contains an ``<article>`` element. Failures on
        the source page are reported through ``st.error``; failures on
        individual links are skipped.
        """
        links = set()
        # One verdict per href, so a link repeated on the page is fetched once.
        article_verdicts = {}
        try:
            response = requests.get(url, timeout=self.REQUEST_TIMEOUT)
            soup = BeautifulSoup(response.text, 'html.parser')
            for anchor in soup.find_all('a'):
                href = anchor.get('href')
                if not href or not href.startswith('http'):
                    continue
                title = anchor.text.strip() or href
                if href not in article_verdicts:
                    article_verdicts[href] = self._is_article_page(href)
                if article_verdicts[href]:
                    links.add((href, title))
        except Exception:
            st.error(f"Erreur lors du scan de {url}")
        return list(links)

    def _is_article_page(self, url):
        """Return True when ``url`` can be fetched and contains an ``<article>`` tag."""
        try:
            response = requests.get(url, timeout=self.REQUEST_TIMEOUT)
            page = BeautifulSoup(response.text, 'html.parser')
        except Exception:
            # Deliberate best effort: an unreachable or unparsable link is
            # simply not treated as an article.
            return False
        return page.find('article') is not None

    def get_article_summary(self, url, model="qwen2"):
        """Ask the Ollama ``model`` for a short summary and return its text.

        NOTE(review): only the URL is sent in the prompt, not the page
        content — confirm the model is expected to work from the URL alone.
        """
        prompt = f"Résumez brièvement l'article à cette URL : {url}"
        response = ollama.generate(model=model, prompt=prompt)
        return response['response']

    def get_new_articles(self):
        """Return non-excluded articles flagged new with no user action."""
        return self.c.execute("""
            SELECT * FROM articles
            WHERE is_new = 1
            AND is_excluded = 0
            AND id NOT IN (
                SELECT DISTINCT article_id
                FROM user_actions
                WHERE action IN ('click', 'rate', 'tag')
            )
            ORDER BY date DESC
        """).fetchall()

    def get_rated_articles(self):
        """Return non-excluded articles that have at least one rating."""
        return self.c.execute("""
            SELECT DISTINCT a.*
            FROM articles a
            JOIN user_actions ua ON a.id = ua.article_id
            WHERE ua.action = 'rate'
            AND a.is_excluded = 0
            ORDER BY ua.timestamp DESC
        """).fetchall()

    def get_clicked_not_rated_articles(self):
        """Return non-excluded articles that were clicked but never rated or tagged."""
        return self.c.execute("""
            SELECT DISTINCT a.*
            FROM articles a
            JOIN user_actions ua ON a.id = ua.article_id
            WHERE ua.action = 'click'
            AND a.is_excluded = 0
            AND a.id NOT IN (
                SELECT article_id
                FROM user_actions
                WHERE action IN ('rate', 'tag')
            )
            ORDER BY ua.timestamp DESC
        """).fetchall()

    def get_tagged_articles(self):
        """Return non-excluded articles that were tagged but never rated or clicked."""
        return self.c.execute("""
            SELECT DISTINCT a.*
            FROM articles a
            JOIN user_actions ua ON a.id = ua.article_id
            WHERE ua.action = 'tag'
            AND a.is_excluded = 0
            AND a.id NOT IN (
                SELECT article_id
                FROM user_actions
                WHERE action IN ('rate', 'click')
            )
            ORDER BY ua.timestamp DESC
        """).fetchall()

    def get_ignored_articles(self):
        """Return non-excluded articles no longer new and without any user action."""
        return self.c.execute("""
            SELECT * FROM articles
            WHERE is_new = 0
            AND is_excluded = 0
            AND id NOT IN (
                SELECT DISTINCT article_id
                FROM user_actions
                WHERE action IN ('click', 'rate', 'tag')
            )
            ORDER BY date DESC
        """).fetchall()

    def get_excluded_articles(self):
        """Return every article the user has excluded."""
        return self.c.execute("""
            SELECT * FROM articles
            WHERE is_excluded = 1
            ORDER BY date DESC
        """).fetchall()

    def get_article_rating(self, article_id):
        """Return the most recent rating of ``article_id``, or 0 when unrated."""
        # Timestamps have one-second resolution; ``id DESC`` breaks ties so
        # the last inserted action wins.
        self.c.execute(
            "SELECT rating FROM user_actions WHERE article_id = ? AND action = 'rate' "
            "ORDER BY timestamp DESC, id DESC LIMIT 1",
            (article_id,),
        )
        result = self.c.fetchone()
        return result[0] if result else 0

    def get_article_tags(self, article_id):
        """Return the most recent tag list of ``article_id`` (possibly empty)."""
        self.c.execute(
            "SELECT tags FROM user_actions WHERE article_id = ? AND action = 'tag' "
            "ORDER BY timestamp DESC, id DESC LIMIT 1",
            (article_id,),
        )
        result = self.c.fetchone()
        # Tags are stored as one comma-separated string.
        return result[0].split(',') if result and result[0] else []

    def get_all_tags_with_descriptions(self):
        """Return ``(name, description)`` for every known tag."""
        return self.c.execute("SELECT name, description FROM tags").fetchall()

    def add_or_update_tag(self, name, description):
        """Insert tag ``name`` or replace its description if it already exists."""
        self.c.execute("INSERT OR REPLACE INTO tags (name, description) VALUES (?, ?)", (name, description))
        self.conn.commit()

    def delete_tag(self, name):
        """Remove tag ``name`` from the list of known tags."""
        self.c.execute("DELETE FROM tags WHERE name = ?", (name,))
        self.conn.commit()

    def get_reference_data(self):
        """Split non-excluded articles into rated and unrated reference sets.

        Returns:
            ``(valid, rejected)`` where ``valid`` is a list of
            ``(url, title, rating)`` for articles whose latest rating is
            above 0, and ``rejected`` a list of ``(url, title)`` for the rest.
        """
        # Join each article with its latest 'rate' action, selected
        # explicitly (newest timestamp, then highest id) instead of relying
        # on GROUP BY with bare columns.
        self.c.execute("""
            SELECT a.id, a.url, a.title, COALESCE(ua.rating, 0) AS rating
            FROM articles a
            LEFT JOIN user_actions ua ON ua.id = (
                SELECT latest.id
                FROM user_actions latest
                WHERE latest.article_id = a.id
                AND latest.action = 'rate'
                ORDER BY latest.timestamp DESC, latest.id DESC
                LIMIT 1
            )
            WHERE a.is_excluded = 0
            ORDER BY rating DESC, a.date DESC
        """)
        articles = self.c.fetchall()
        # Rated articles are the "valid" references, unrated ones the "rejected".
        reference_data_valid = [(url, title, rating) for _, url, title, rating in articles if rating > 0]
        reference_data_rejected = [(url, title) for _, url, title, rating in articles if rating == 0]
        return reference_data_valid, reference_data_rejected