Spaces:
Sleeping
Sleeping
| from app import Plugin | |
| import streamlit as st | |
| import sqlite3 | |
| import requests | |
| from bs4 import BeautifulSoup | |
| from datetime import datetime | |
| import ollama | |
| from global_vars import t, translations | |
# Plugin-specific UI strings (English), merged into the shared translation table.
_SCANSITE_EN = dict(
    scansite_title="News Aggregator",
    total_links="Total number of links",
    annotated_links="Number of annotated links",
    known_tags="Known tags",
    database_reset_success="Database reset successfully",
    launch_scan="Launch scan",
    scan_complete="Scan complete",
    no_articles="No articles to display.",
    page="Page",
    previous_page="Previous page",
    next_page="Next page",
    new_articles="New Articles",
    rated_articles="Rated Articles",
    clicked_not_rated="Clicked but not rated Articles",
    tagged_articles="Tagged Articles",
    ignored_articles="Ignored Articles",
    excluded_articles="Excluded Articles",
    rating="Rating",
    tags="Tags",
    exclude="Exclude",
    sources="Sources",
    update="Update",
    delete="Delete",
    add_new_source="Add a new source (URL)",
    add_source="Add source",
    new_tag="New tag",
    new_tag_description="New tag description",
    add_tag="Add tag",
    work_directory="Work Directory",
)
translations["en"].update(_SCANSITE_EN)
# Plugin-specific UI strings (French), merged into the shared translation table.
_SCANSITE_FR = dict(
    scansite_title="Agrégateur de Nouvelles",
    total_links="Nombre total de liens",
    annotated_links="Nombre de liens annotés",
    known_tags="Tags connus",
    database_reset_success="Base de données réinitialisée",
    launch_scan="Lancer le scan",
    scan_complete="Scan terminé",
    no_articles="Aucun article à afficher.",
    page="Page",
    previous_page="Page précédente",
    next_page="Page suivante",
    new_articles="Nouveaux Articles",
    rated_articles="Articles Notés",
    clicked_not_rated="Articles Cliqués non notés",
    tagged_articles="Articles Tagués",
    ignored_articles="Articles Ignorés",
    excluded_articles="Articles Exclus",
    rating="Note",
    tags="Tags",
    exclude="Exclure",
    sources="Sources",
    update="Mettre à jour",
    delete="Supprimer",
    add_new_source="Ajouter une nouvelle source (URL)",
    add_source="Ajouter source",
    new_tag="Nouveau tag",
    new_tag_description="Description du nouveau tag",
    add_tag="Ajouter tag",
    work_directory="Répertoire de travail",
)
translations["fr"].update(_SCANSITE_FR)
| class ScansitePlugin(Plugin): | |
def __init__(self, name, plugin_manager):
    """Set up the plugin: open the database, keep a cursor, ensure the schema.

    Args:
        name: Plugin name, forwarded to the base ``Plugin`` initialiser.
        plugin_manager: Plugin manager, forwarded to the base initialiser.
    """
    super().__init__(name, plugin_manager)
    connection = self.get_connection()
    self.conn = connection
    # One cursor is shared by every query method of the plugin.
    self.c = connection.cursor()
    self.init_db()
def get_connection(self, db_path='news_app.db'):
    """Open the SQLite database used by the plugin.

    Args:
        db_path: Path of the database file. Defaults to ``'news_app.db'``
            (the previously hard-coded location); pass ``':memory:'`` for
            a throw-away database.

    Returns:
        A ``sqlite3.Connection`` opened with ``check_same_thread=False`` so
        the connection may be used from a thread other than its creator.
    """
    return sqlite3.connect(db_path, check_same_thread=False)
def init_db(self):
    """Create the version-1 schema when the stored schema version is below 1.

    Reads the current version through ``get_db_version``; when it is lower
    than 1 the four plugin tables are created and the version is recorded
    as 1. The connection is committed in every case.
    """
    schema_v1 = (
        '''CREATE TABLE IF NOT EXISTS sources
        (id INTEGER PRIMARY KEY, url TEXT, title TEXT)''',
        '''CREATE TABLE IF NOT EXISTS articles
        (id INTEGER PRIMARY KEY, source_id INTEGER, url TEXT UNIQUE, title TEXT, date TEXT,
        is_new INTEGER, is_excluded INTEGER DEFAULT 0)''',
        '''CREATE TABLE IF NOT EXISTS user_actions
        (id INTEGER PRIMARY KEY, article_id INTEGER, action TEXT, rating INTEGER, tags TEXT, timestamp TEXT)''',
        '''CREATE TABLE IF NOT EXISTS tags
        (id INTEGER PRIMARY KEY, name TEXT UNIQUE, description TEXT)''',
    )
    if self.get_db_version() < 1:
        for statement in schema_v1:
            self.c.execute(statement)
        self.set_db_version(1)
    # Future schema migrations go here, one guarded step per version, e.g.:
    # if current_version < 2:
    #     self.c.execute('''ALTER TABLE articles ADD COLUMN new_column TEXT''')
    #     self.set_db_version(2)
    self.conn.commit()
def get_db_version(self):
    """Return the stored schema version, or 0 when none has been recorded.

    The ``db_version`` table is created on the fly if it does not exist.
    """
    cursor = self.c
    cursor.execute('''CREATE TABLE IF NOT EXISTS db_version (version INTEGER)''')
    row = cursor.execute('SELECT version FROM db_version').fetchone()
    if row:
        return row[0]
    return 0
def set_db_version(self, version):
    """Record ``version`` as the schema version and commit.

    The value always lives in the row with ``rowid`` 1, so repeated calls
    overwrite it instead of adding rows.
    """
    upsert = 'INSERT OR REPLACE INTO db_version (rowid, version) VALUES (1, ?)'
    params = (version,)
    self.c.execute(upsert, params)
    self.conn.commit()
def get_tabs(self):
    """Return the single tab descriptor contributed by this plugin."""
    tab = {"name": t("scansite_title"), "plugin": "scansite"}
    return [tab]
def run(self, config):
    """Render the plugin page: statistics, known tags, scan button, article tabs.

    Args:
        config: Plugin configuration; not read by this method.
    """
    st.title(t("scansite_title"))

    # Headline counters, in the order returned by get_stats().
    stats = self.get_stats()
    total_links, annotated_links = stats
    for label_key, value in (("total_links", total_links), ("annotated_links", annotated_links)):
        st.write(f"{t(label_key)} : {value}")

    known = ", ".join(self.get_all_tags())
    st.write(f"{t('known_tags')} :", known)

    scan_requested = st.button(t("launch_scan"))
    if scan_requested:
        self.launch_scan()
        st.success(t("scan_complete"))

    self.display_tabs()
def get_stats(self):
    """Return ``(total_links, annotated_links)``.

    ``total_links`` counts the articles that are not excluded;
    ``annotated_links`` counts the distinct articles that have at least one
    'click', 'rate' or 'tag' entry in ``user_actions``.
    """
    cursor = self.c
    (total_links,) = cursor.execute(
        "SELECT COUNT(*) FROM articles WHERE is_excluded = 0"
    ).fetchone()
    (annotated_links,) = cursor.execute("""
        SELECT COUNT(DISTINCT article_id) FROM user_actions
        WHERE action IN ('click', 'rate', 'tag')
        """).fetchone()
    return total_links, annotated_links
def get_all_tags(self):
    """Return the names of all rows of the ``tags`` table as a list."""
    rows = self.c.execute("SELECT name FROM tags").fetchall()
    names = []
    for (name,) in rows:
        names.append(name)
    return names
def reset_database(self):
    """Drop every plugin table and rebuild an empty schema.

    The ``db_version`` table is dropped together with the data tables.
    ``init_db`` only creates tables when the stored version is below 1, so
    leaving the old version in place would make it skip table creation and
    leave the database without any of the tables that were just dropped.
    """
    for table in ("sources", "articles", "user_actions", "tags", "db_version"):
        self.c.execute(f"DROP TABLE IF EXISTS {table}")
    self.conn.commit()
    self.init_db()
def launch_scan(self):
    """Scan every registered source and store the links not seen before.

    For each source, its existing articles are first marked as not new,
    then every link returned by ``scan_new_links`` is inserted as a new,
    non-excluded article dated today. URLs already present are ignored by
    the ``INSERT OR IGNORE``.
    """
    insert_sql = """
        INSERT OR IGNORE INTO articles (source_id, url, title, date, is_new, is_excluded)
        VALUES (?, ?, ?, ?, 1, 0)
        """
    for source in self.c.execute("SELECT * FROM sources").fetchall():
        source_id, source_url = source[0], source[1]
        self.mark_not_new(source_id)
        found = self.scan_new_links(source_id, source_url)
        # Generator: the date is still evaluated once per inserted row.
        self.c.executemany(
            insert_sql,
            (
                (source_id, link, title, datetime.now().strftime('%Y-%m-%d'))
                for link, title in found
            ),
        )
    self.conn.commit()
def display_tabs(self):
    """Render the six article tabs, each with its own paginated listing."""
    # (translation key of the label, article query, prefix of the widget keys)
    sections = (
        ("new_articles", self.get_new_articles, "nouveaux"),
        ("rated_articles", self.get_rated_articles, "notes"),
        ("clicked_not_rated", self.get_clicked_not_rated_articles, "cliques"),
        ("tagged_articles", self.get_tagged_articles, "tagues"),
        ("ignored_articles", self.get_ignored_articles, "ignores"),
        ("excluded_articles", self.get_excluded_articles, "exclus"),
    )
    tabs = st.tabs([t(label_key) for label_key, _, _ in sections])
    all_tags = self.get_all_tags()
    for tab, (label_key, fetch_articles, tab_name) in zip(tabs, sections):
        with tab:
            st.header(t(label_key))
            self.display_paginated_articles(fetch_articles(), all_tags, tab_name)
def display_paginated_articles(self, articles, all_tags, tab_name, items_per_page=20):
    """Render one page of ``articles`` together with paging controls.

    Args:
        articles: Sequence of article rows to paginate.
        all_tags: Known tag names, forwarded to ``display_article``.
        tab_name: Prefix that makes the widget/session keys unique per tab.
        items_per_page: Number of articles shown per page.

    The keyed number input is the single source of truth for the current
    page. A keyed Streamlit widget keeps its own state and ignores
    ``value=`` after its first render, so the previous/next buttons change
    the page by writing the widget's state from an ``on_click`` callback
    (callbacks run before the next script run, when this is still allowed).
    """
    if not articles:
        st.write(t("no_articles"))
        return

    total_pages = (len(articles) - 1) // items_per_page + 1
    page_key = f"{tab_name}_page"
    input_key = f"{tab_name}_number_input"

    # Seed the widget state (falling back to the legacy page key) and clamp
    # it: the article list may have shrunk since the last run, and a stored
    # page above max_value would make number_input raise.
    stored_page = st.session_state.get(input_key, st.session_state.get(page_key, 1))
    st.session_state[input_key] = min(max(int(stored_page), 1), total_pages)

    page = st.number_input(t("page"), min_value=1, max_value=total_pages, key=input_key)
    # Mirror the page under the historical key for any code that reads it.
    st.session_state[page_key] = page

    first = (page - 1) * items_per_page
    for article in articles[first:first + items_per_page]:
        self.display_article(article, all_tags, tab_name)

    def go_to(target_page):
        # Runs as a button callback, i.e. before the widget is re-created.
        st.session_state[input_key] = target_page

    col1, col2, col3 = st.columns(3)
    with col1:
        if page > 1:
            st.button(t("previous_page"), key=f"{tab_name}_prev",
                      on_click=go_to, args=(page - 1,))
    with col3:
        if page < total_pages:
            st.button(t("next_page"), key=f"{tab_name}_next",
                      on_click=go_to, args=(page + 1,))
    with col2:
        st.write(f"{t('page')} {page}/{total_pages}")
def display_article(self, article, all_tags, tab_name):
    """Render one article row: title/summary, link, rating, tags, exclude.

    Args:
        article: Row of the ``articles`` table; index 0 is the id, 2 the
            URL and 3 the title.
        all_tags: Tag names offered in the tag selector.
        tab_name: Prefix that keeps widget keys unique across tabs.

    Clicking the title stores a generated summary and records a 'click'
    action; changing the slider records a 'rate' action; changing the tag
    selection records a 'tag' action; the exclude button flags the article
    as excluded and reruns the script.
    """
    article_id, url, title = article[0], article[2], article[3]

    def timestamp():
        return datetime.now().strftime('%Y-%m-%d %H:%M:%S')

    title_col, link_col, rating_col, tags_col, exclude_col = st.columns([3, 0.5, 1, 2, 1])

    with title_col:
        summary_key = f"{tab_name}_summary_{article_id}"
        if summary_key not in st.session_state:
            st.session_state[summary_key] = None
        if st.button(title, key=f"{tab_name}_article_{article_id}"):
            st.session_state[summary_key] = self.get_article_summary(url)
            self.c.execute(
                "INSERT INTO user_actions (article_id, action, timestamp) VALUES (?, ?, ?)",
                (article_id, 'click', timestamp()),
            )
            self.c.execute("UPDATE articles SET is_new = 0 WHERE id = ?", (article_id,))
            self.conn.commit()
        if st.session_state[summary_key]:
            st.write(st.session_state[summary_key])

    with link_col:
        st.markdown(f"[🔗]({url})")

    with rating_col:
        previous_rating = self.get_article_rating(article_id)
        rating = st.slider(t("rating"), 0, 5, previous_rating,
                           key=f"{tab_name}_rating_{article_id}")
        if rating != previous_rating:
            self.c.execute(
                "INSERT INTO user_actions (article_id, action, rating, timestamp) VALUES (?, ?, ?, ?)",
                (article_id, 'rate', rating, timestamp()),
            )
            self.conn.commit()

    with tags_col:
        previous_tags = self.get_article_tags(article_id)
        selected_tags = st.multiselect(t("tags"), all_tags, default=previous_tags,
                                       key=f"{tab_name}_tags_{article_id}")
        if set(selected_tags) != set(previous_tags):
            self.c.execute(
                "INSERT INTO user_actions (article_id, action, tags, timestamp) VALUES (?, ?, ?, ?)",
                (article_id, 'tag', ','.join(selected_tags), timestamp()),
            )
            self.conn.commit()

    with exclude_col:
        if st.button(t("exclude"), key=f"{tab_name}_exclude_{article_id}"):
            self.c.execute("UPDATE articles SET is_excluded = 1 WHERE id = ?", (article_id,))
            self.conn.commit()
            st.rerun()
def get_config_ui(self, config):
    """Render the configuration UI for sources and tags.

    Args:
        config: Current plugin configuration; not read by this method.

    Returns:
        A dict with the keys ``sources`` (source rows as read at the start
        of the call), ``new_source_url``, ``tags`` (``(name, description)``
        rows), ``new_tag`` and ``new_tag_description``.

    Blank source URLs and blank tag names are not written to the database.
    """
    st.header(t("sources"))
    sources = self.c.execute("SELECT * FROM sources").fetchall()
    for source in sources:
        col1, col2, col3 = st.columns([3, 1, 1])
        with col1:
            new_title = st.text_input(f"{t('update')} {source[1]}", value=source[2],
                                      key=f"source_title_{source[0]}")
        with col2:
            if st.button(t("update"), key=f"update_source_{source[0]}"):
                self.c.execute("UPDATE sources SET title = ? WHERE id = ?", (new_title, source[0]))
                self.conn.commit()
        with col3:
            if st.button(t("delete"), key=f"delete_source_{source[0]}"):
                self.c.execute("DELETE FROM sources WHERE id = ?", (source[0],))
                self.conn.commit()

    new_url = st.text_input(t("add_new_source"))
    # The button is always rendered; the insert only happens for a non-blank URL.
    if st.button(t("add_source")) and new_url and new_url.strip():
        title = self.fetch_page_title(new_url)
        self.c.execute("INSERT INTO sources (url, title) VALUES (?, ?)", (new_url, title))
        self.conn.commit()

    st.header(t("tags"))
    tags = self.get_all_tags_with_descriptions()
    for tag, description in tags:
        col1, col2, col3, col4 = st.columns([2, 3, 1, 1])
        with col1:
            st.text(tag)
        with col2:
            new_description = st.text_input(f"{t('update')} {tag}", value=description,
                                            key=f"tag_desc_{tag}")
        with col3:
            if st.button(t("update"), key=f"update_tag_{tag}"):
                self.add_or_update_tag(tag, new_description)
        with col4:
            if st.button(t("delete"), key=f"delete_tag_{tag}"):
                self.delete_tag(tag)

    new_tag = st.text_input(t("new_tag"))
    new_tag_description = st.text_input(t("new_tag_description"))
    # Same guard as for sources: never store a tag without a name.
    if st.button(t("add_tag")) and new_tag and new_tag.strip():
        self.add_or_update_tag(new_tag, new_tag_description)

    # Report the values shown in / entered through the UI.
    return {
        "sources": sources,
        "new_source_url": new_url,
        "tags": tags,
        "new_tag": new_tag,
        "new_tag_description": new_tag_description,
    }
def fetch_page_title(self, url, timeout=10):
    """Return the ``<title>`` text of the page at ``url``, or ``url`` itself.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up, so an
            unresponsive host cannot block the UI indefinitely.

    Returns:
        The stripped title text. ``url`` is returned when the request
        fails, the page has no ``<title>`` element, or the title is empty.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException:
        return url
    title_tag = BeautifulSoup(response.text, 'html.parser').title
    if title_tag is None:
        return url
    # get_text() also copes with titles containing nested markup, where
    # ``.string`` would be None.
    return title_tag.get_text(strip=True) or url
def mark_not_new(self, source_id):
    """Clear the ``is_new`` flag of every article of ``source_id`` and commit."""
    statement = "UPDATE articles SET is_new = 0 WHERE source_id = ?"
    params = (source_id,)
    self.c.execute(statement, params)
    self.conn.commit()
def scan_new_links(self, source_id, url):
    """Return the ``(link, title)`` pairs found at ``url`` that are not stored yet.

    Args:
        source_id: Id of the source being scanned; not used by the lookup.
        url: Page to scan through ``scan_links``.

    Returns:
        The scanned pairs whose link has no row in ``articles``.
    """
    def already_stored(link):
        self.c.execute("SELECT id, is_excluded FROM articles WHERE url = ?", (link,))
        return self.c.fetchone() is not None

    return [
        (link, title)
        for link, title in self.scan_links(url)
        if not already_stored(link)
    ]
def scan_links(self, url, timeout=10):
    """Collect the absolute links on ``url`` that point to article pages.

    Args:
        url: Page whose anchors are inspected.
        timeout: Seconds to wait for each HTTP request, so a slow host
            cannot stall the whole scan.

    Returns:
        A list of ``(href, title)`` tuples in document order, one per
        distinct ``href``. A link is kept when it starts with ``http`` and
        the page it points to contains an ``<article>`` element. ``title``
        is the anchor text, or the href when the anchor has no text.

    A failure to download ``url`` is reported with ``st.error`` and yields
    an empty list; a failure on an individual link only skips that link.
    """
    links = []
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException:
        st.error(f"Erreur lors du scan de {url}")
        return links

    checked = set()  # hrefs already probed; avoids downloading a page twice
    for anchor in BeautifulSoup(response.text, 'html.parser').find_all('a'):
        href = anchor.get('href')
        if not href or not href.startswith('http') or href in checked:
            continue
        checked.add(href)
        try:
            article_response = requests.get(href, timeout=timeout)
        except requests.RequestException:
            # Best effort: an unreachable link is simply not an article.
            continue
        if BeautifulSoup(article_response.text, 'html.parser').find('article'):
            links.append((href, anchor.text.strip() or href))
    return links
def get_article_summary(self, url, model="qwen2"):
    """Ask the Ollama model ``model`` for a short summary and return its text.

    Only the URL is placed in the prompt; this method does not download
    the article itself.
    """
    answer = ollama.generate(
        model=model,
        prompt=f"Résumez brièvement l'article à cette URL : {url}",
    )
    return answer['response']
def get_new_articles(self):
    """Return new, non-excluded articles without any click/rate/tag action.

    Rows are full ``articles`` rows, newest ``date`` first.
    """
    query = """
        SELECT * FROM articles
        WHERE is_new = 1
        AND is_excluded = 0
        AND id NOT IN (
        SELECT DISTINCT article_id
        FROM user_actions
        WHERE action IN ('click', 'rate', 'tag')
        )
        ORDER BY date DESC
        """
    rows = self.c.execute(query)
    return rows.fetchall()
def get_rated_articles(self):
    """Return the non-excluded articles that have at least one 'rate' action.

    Rows are distinct full ``articles`` rows, ordered by action timestamp
    (descending).
    """
    query = """
        SELECT DISTINCT a.*
        FROM articles a
        JOIN user_actions ua ON a.id = ua.article_id
        WHERE ua.action = 'rate'
        AND a.is_excluded = 0
        ORDER BY ua.timestamp DESC
        """
    rows = self.c.execute(query)
    return rows.fetchall()
def get_clicked_not_rated_articles(self):
    """Return non-excluded articles that were clicked but never rated or tagged.

    Rows are distinct full ``articles`` rows, ordered by action timestamp
    (descending).
    """
    query = """
        SELECT DISTINCT a.*
        FROM articles a
        JOIN user_actions ua ON a.id = ua.article_id
        WHERE ua.action = 'click'
        AND a.is_excluded = 0
        AND a.id NOT IN (
        SELECT article_id
        FROM user_actions
        WHERE action IN ('rate', 'tag')
        )
        ORDER BY ua.timestamp DESC
        """
    rows = self.c.execute(query)
    return rows.fetchall()
def get_tagged_articles(self):
    """Return non-excluded articles that were tagged but never rated or clicked.

    Rows are distinct full ``articles`` rows, ordered by action timestamp
    (descending).
    """
    query = """
        SELECT DISTINCT a.*
        FROM articles a
        JOIN user_actions ua ON a.id = ua.article_id
        WHERE ua.action = 'tag'
        AND a.is_excluded = 0
        AND a.id NOT IN (
        SELECT article_id
        FROM user_actions
        WHERE action IN ('rate', 'click')
        )
        ORDER BY ua.timestamp DESC
        """
    rows = self.c.execute(query)
    return rows.fetchall()
def get_ignored_articles(self):
    """Return non-new, non-excluded articles without any click/rate/tag action.

    Rows are full ``articles`` rows, newest ``date`` first.
    """
    query = """
        SELECT * FROM articles
        WHERE is_new = 0
        AND is_excluded = 0
        AND id NOT IN (
        SELECT DISTINCT article_id
        FROM user_actions
        WHERE action IN ('click', 'rate', 'tag')
        )
        ORDER BY date DESC
        """
    rows = self.c.execute(query)
    return rows.fetchall()
def get_excluded_articles(self):
    """Return every excluded article as a full row, newest ``date`` first."""
    query = """
        SELECT * FROM articles
        WHERE is_excluded = 1
        ORDER BY date DESC
        """
    rows = self.c.execute(query)
    return rows.fetchall()
def get_article_rating(self, article_id):
    """Return the rating of the most recent 'rate' action, or 0 if there is none."""
    latest_rating_sql = "SELECT rating FROM user_actions WHERE article_id = ? AND action = 'rate' ORDER BY timestamp DESC LIMIT 1"
    row = self.c.execute(latest_rating_sql, (article_id,)).fetchone()
    if not row:
        return 0
    return row[0]
def get_article_tags(self, article_id):
    """Return the tag list of the most recent 'tag' action, or ``[]``.

    Tags are stored as one comma-separated string; an empty or missing
    value yields an empty list.
    """
    latest_tags_sql = "SELECT tags FROM user_actions WHERE article_id = ? AND action = 'tag' ORDER BY timestamp DESC LIMIT 1"
    row = self.c.execute(latest_tags_sql, (article_id,)).fetchone()
    if not row or not row[0]:
        return []
    return row[0].split(',')
def get_all_tags_with_descriptions(self):
    """Return every tag as a ``(name, description)`` row."""
    rows = self.c.execute("SELECT name, description FROM tags")
    return rows.fetchall()
def add_or_update_tag(self, name, description):
    """Insert the tag ``name``, or replace its row if the name exists; commit."""
    upsert = "INSERT OR REPLACE INTO tags (name, description) VALUES (?, ?)"
    params = (name, description)
    self.c.execute(upsert, params)
    self.conn.commit()
def delete_tag(self, name):
    """Delete the tag called ``name`` (a no-op when it does not exist); commit."""
    statement = "DELETE FROM tags WHERE name = ?"
    params = (name,)
    self.c.execute(statement, params)
    self.conn.commit()
def get_reference_data(self):
    """Split the non-excluded articles into rated and unrated reference sets.

    Returns:
        A pair ``(reference_data_valid, reference_data_rejected)``:

        * ``reference_data_valid``: ``(url, title, rating)`` for every
          article whose latest rating is greater than 0;
        * ``reference_data_rejected``: ``(url, title)`` for every article
          whose latest rating is 0 or that was never rated.

        Both follow the query order: rating descending, then article date
        descending.

    The latest rating is picked explicitly with a correlated sub-query
    ordered by timestamp (newest first, highest action id as tie-break).
    ``GROUP BY article_id HAVING MAX(timestamp)`` only filters on the
    truthiness of the maximum and does not express "take the newest row".
    """
    self.c.execute("""
        SELECT a.id, a.url, a.title,
               COALESCE((
                   SELECT ua.rating
                   FROM user_actions ua
                   WHERE ua.article_id = a.id
                     AND ua.action = 'rate'
                   ORDER BY ua.timestamp DESC, ua.id DESC
                   LIMIT 1
               ), 0) AS rating
        FROM articles a
        WHERE a.is_excluded = 0
        ORDER BY rating DESC, a.date DESC
        """)
    articles = self.c.fetchall()

    # Rated articles are the positive examples, everything else the negative ones.
    reference_data_valid = [
        (url, title, rating) for _, url, title, rating in articles if rating > 0
    ]
    reference_data_rejected = [
        (url, title) for _, url, title, rating in articles if rating == 0
    ]
    return reference_data_valid, reference_data_rejected