Spaces:

Iker
/

ClickbaitFighter

Running on Zero

App Files Files Community

Iker committed on Jan 11, 2024

Commit

d4974c7

1 Parent(s): 5df7c11

First Version

Browse files

Files changed (6) hide show

README.md +14 -5
app.py +159 -0
cache_system.py +51 -0
download_url.py +71 -0
logo2.png +0 -0
requirements.txt +6 -0

README.md CHANGED Viewed

@@ -1,13 +1,22 @@
 ---
 title: ClickbaitFighter
-emoji: 👁
-colorFrom: purple
-colorTo: green
 sdk: gradio
-sdk_version: 4.14.0
-app_file: app.py
 pinned: false
 license: cc-by-nc-sa-4.0
 ---
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
 title: ClickbaitFighter
+emoji: ⚔️
+colorFrom: yellow
+colorTo: blue
 sdk: gradio
 pinned: false
 license: cc-by-nc-sa-4.0
+suggested_hardware: t4-small
+suggested_storage: small
+app_file: app.py
+fullWidth: true
+models:
+   - Iker/ClickbaitFighter-10B
+datasets:
+   - Iker/Clickbait-News
+tags:
+   - summarization
+   - clickbait
 ---
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

app.py ADDED Viewed

	@@ -0,0 +1,159 @@

+import os
+import gradio as gr
+from download_url import download_text_and_title
+from cache_system import CacheHandler
+from gradio_client import Client
+# Log how many CPU cores are visible to this process at startup.
+print(f"CPU cores: {os.cpu_count()}.")
+# The backend address and the Hub token are read from the environment.
+# NOTE(review): when SERVER is unset this falls back to True, and that value is
+# then handed to Client() below. It looks like a copy of the TOKEN fallback;
+# confirm that Client accepts a non-string source.
+server = os.environ.get("SERVER") or True
+auth_token = os.environ.get("TOKEN") or True
+# Client for the remote Gradio app whose "/predict" endpoint generate_text calls.
+client = Client(server)
+def finish_generation(text: str) -> str:
+    return f"{text}\n\n⬇️ Ayuda a mejorar la herramienta marcando si el resumen es correcto o no.⬇️"
+def generate_text(
+    url: str, mode: int, progress=gr.Progress(track_tqdm=False)
+) -> (str, str):
+    global cache_handler
+    global run_log
+    # 1) Download the article
+    progress(0, desc="🤖 Accediendo a la noticia")
+    # First, check if the URL is in the cache
+    title, text, temp = cache_handler.get_from_cache(url, mode)
+    if title is not None and text is not None and temp is not None:
+        temp = finish_generation(temp)
+        yield title, temp, text
+    else:
+        try:
+            title, text = download_text_and_title(url)
+        except Exception as e:
+            title = None
+            text = None
+        if title is None or text is None:
+            yield (
+                "🤖 No he podido acceder a la notica, asegurate que la URL es correcta y que es posible acceder a la noticia desde un navegador.",
+                "❌❌❌ Inténtalo de nuevo ❌❌❌",
+                "Error",
+            )
+            return (
+                "🤖 No he podido acceder a la notica, asegurate que la URL es correcta y que es posible acceder a la noticia desde un navegador.",
+                "❌❌❌ Inténtalo de nuevo ❌❌❌",
+                "Error",
+            )
+        progress(0.5, desc="🤖 Leyendo noticia")
+        try:
+            temp = client.predict(
+                url,  # str  in '🌐 URL de la noticia' Textbox component
+                title,  # str  in '🌐 Título de la noticia' Textbox component
+                text,  # str  in '📰 Cuerpo de la noticia' Textbox component
+                mode,  # float (numeric value between 0 and 100) in '🎚️ Nivel de resumen' Slider component
+                api_name="/predict",
+            )
+            for o in temp:
+                yield title, temp, text
+        except Exception as e:
+            yield (
+                "🤖 El servidor no se encuentra disponible.",
+                "❌❌❌ Inténtalo de nuevo más tarde ❌❌❌",
+                "Error",
+            )
+            return (
+                "🤖 El servidor no se encuentra disponible.",
+                "❌❌❌ Inténtalo de nuevo más tarde ❌❌❌",
+                "Error",
+            )
+        cache_handler.add_to_cache(
+            url=url, title=title, text=text, summary_type=mode, summary=temp
+        )
+        temp = finish_generation(temp)
+        yield title, temp, text
+        run_log.flag()
+    return title, temp, text
+# Process-wide cache of downloaded articles and their summaries (at most 1000 URLs).
+cache_handler = CacheHandler(max_cache_size=1000)
+# Flagging callback given to the interface below; presumably stores the 👍/👎
+# feedback in the "Iker/Clickbait-News" dataset on the Hub (confirm against the
+# Gradio documentation).
+feedback_log = gr.HuggingFaceDatasetSaver(auth_token, "Iker/Clickbait-News")
+# User interface: two inputs (article URL and summary level) and three outputs
+# that receive the (title, summary, text) tuples yielded by generate_text.
+demo = gr.Interface(
+    generate_text,
+    inputs=[
+        gr.Textbox(
+            label="🌐 URL de la noticia",
+            info="Introduce la URL de la noticia que deseas resumir.",
+            value="https://www.heraldo.es/noticias/salud/2024/01/08/atun-alimento-grasa-muscular-ayuda-combatir-colesterol-1702116.html",
+            interactive=True,
+        ),
+        # step=50 restricts the slider to the three values 0, 50 and 100.
+        gr.Slider(
+            minimum=0,
+            maximum=100,
+            step=50,
+            value=50,
+            label="🎚️ Nivel de resumen",
+            info="""¿Hasta qué punto quieres resumir la noticia?
+Si solo deseas un resumen, selecciona 0.
+Si buscas un resumen y desmontar el clickbait, elige 50.
+Para obtener solo la respuesta al clickbait, selecciona 100""",
+            interactive=True,
+        ),
+    ],
+    outputs=[
+        gr.Textbox(
+            label="📰 Titular de la noticia",
+            interactive=False,
+            placeholder="Aquí aparecerá el título de la noticia",
+        ),
+        gr.Textbox(
+            label="🗒️ Resumen",
+            interactive=False,
+            placeholder="Aquí aparecerá el resumen de la noticia.",
+        ),
+        # Hidden, non-rendered output for the full article text (third yielded value).
+        # NOTE(review): the placeholder repeats the summary placeholder; it looks
+        # like a copy-paste, but it is never shown.
+        gr.Textbox(
+            label="Noticia completa",
+            visible=False,
+            render=False,
+            interactive=False,
+            placeholder="Aquí aparecerá el resumen de la noticia.",
+        ),
+    ],
+    title="⚔️ Clickbait Fighter! ⚔️",
+    thumbnail="logo2.png",
+    theme="JohnSmith9982/small_and_pretty",
+    description="""Esta Inteligencia Artificial es capaz de generar un resumen de una sola frase que revela la verdad detrás de un titular sensacionalista o clickbait. Solo tienes que introducir la URL de la noticia. La IA accederá a la noticia, la leerá y en cuestión de segundos generará un resumen de una sola frase que revele la verdad detrás del titular.
+   🎚 Ajusta el nivel de resumen con el control deslizante. Cuanto maś alto, más corto será el resumen.
+   🗒 La IA no es capaz de acceder a todas las webs, por ejemplo, si introduces un enlace a una noticia que requiere suscripción, la IA no podrá acceder a ella. Algunas webs pueden tener tecnologías para bloquear bots.
+   ⌚ La IA se encuentra corriendo en un hardware bastante modesto, debería tardar menos de 10 segundos en generar el resumen, pero si muchos usuarios usan la app a la vez, tendrás que esperar tu turno.
+   💸 Este es un projecto sin ánimo de lucro, no se genera ningún tipo de ingreso con esta app. Los datos, la IA y el código se publicarán para su uso en la investigación académica. No puedes usar esta app para ningún uso comercial.
+   🧪 El modelo se encuentra en fase de desarrollo, si quieres ayudar a mejorarlo puedes usar los botones 👍 y 👎 para valorar el resumen. ¡Gracias por tu ayuda!""",
+    article="Esta Inteligencia Artificial ha sido generada por Iker García-Ferrero. Puedes saber más sobre mi trabajo en mi [página web](https://ikergarcia1996.github.io/Iker-Garcia-Ferrero/) o mi perfil de [X](https://twitter.com/iker_garciaf). Puedes ponerte en contacto conmigo a través de correo electrónico (ver web) y X.",
+    cache_examples=False,
+    concurrency_limit=1,
+    allow_flagging="manual",
+    flagging_options=[("👍", "correct"), ("👎", "incorrect")],
+    flagging_callback=feedback_log,
+)
+# Enable the request queue with no size limit, then start the app without a public share link.
+demo.queue(max_size=None)
+demo.launch(share=False)

cache_system.py ADDED Viewed

	@@ -0,0 +1,51 @@

+from collections import OrderedDict
+from datetime import datetime
+from typing import Optional
+class CacheHandler:
+    def __init__(self, max_cache_size: int = 1000):
+        # Using OrderedDict to maintain the order of insertion for efficient removal of oldest items
+        self.cache = OrderedDict()
+        self.max_cache_size = max_cache_size
+        self.misses = 0
+        self.hits = 0
+    def add_to_cache(
+        self, url: str, title: str, text: str, summary_type: int, summary: str
+    ):
+        # If URL already exists, update it and move it to the end to mark it as the most recently used
+        if url in self.cache:
+            self.cache.move_to_end(url)
+            self.cache[url][f"summary_{summary_type}"] = summary
+            self.cache[url]["date"] = datetime.now()
+        else:
+            # Add new entry to the cache
+            self.cache[url] = {
+                "title": title,
+                "text": text,
+                "date": datetime.now(),
+                "summary_0": summary if summary_type == 0 else None,
+                "summary_50": summary if summary_type == 50 else None,
+                "summary_100": summary if summary_type == 100 else None,
+            }
+            # Remove the oldest item if cache exceeds max size
+            if len(self.cache) > self.max_cache_size:
+                self.cache.popitem(last=False)  # pop the oldest item
+    def get_from_cache(self, url: str, summary_type: int) -> Optional[tuple]:
+        if url in self.cache and self.cache[url][f"summary_{summary_type}"] is not None:
+            # Move the accessed item to the end to mark it as recently used
+            self.cache.move_to_end(url)
+            self.hits += 1
+            return (
+                self.cache[url]["title"],
+                self.cache[url]["text"],
+                self.cache[url][f"summary_{summary_type}"],
+            )
+        else:
+            self.misses += 1
+            return None, None, None
+    def get_cache_stats(self):
+        return self.hits, self.misses, len(self.cache)

download_url.py ADDED Viewed

	@@ -0,0 +1,71 @@

+import requests
+from bs4 import BeautifulSoup
+def download_text_and_title(url):
+    try:
+        # Remove the query string from the URL
+        url = url.split("?")[0]
+        # Remove emojis and other special characters
+        url = url.encode("ascii", "ignore").decode("ascii")
+        # Send a GET request to the URL
+        headers = {
+            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) "
+            "AppleWebKit/537.36 (KHTML, like Gecko) "
+            "Chrome/50.0.2661.102 Safari/537.36"
+        }
+        response = requests.get(url, headers=headers, allow_redirects=True)
+        # While response is a redirect, follow it
+        soup = BeautifulSoup(response.text, "html.parser")
+        title = soup.title.string if soup.title else "No Title Found"
+        while title.startswith("http:/") or title.startswith("https:/"):
+            url = title
+            response = requests.get(url, headers=headers, allow_redirects=True)
+            soup = BeautifulSoup(response.text, "html.parser")
+            title = soup.title.string if soup.title else "No Title Found"
+        # Check if the request was successful
+        if response.status_code == 200:
+            # Parse the HTML content using BeautifulSoup
+            soup = BeautifulSoup(response.text, "html.parser")
+            # Extract the title
+            title = soup.title.string if soup.title else "No Title Found"
+            # Extract all the text from the webpage
+            text = [p.get_text() for p in soup.find_all("p")]
+            text = [
+                p.replace("\n", " ").replace("\r", " ").replace("\t", " ") for p in text
+            ]
+            text = [" ".join(p.strip().split()) for p in text]
+            text = [p for p in text if len(p) > 0 and len(p.split()) > 5]
+            # Clean text
+            text = "\n".join(text)
+            title = title.replace("\n", " ").replace("\r", " ").replace("\t", " ")
+            title = " ".join(title.strip().split())
+            return title, text
+        else:
+            print("Failed to retrieve the web page. Status code:", response.status_code)
+            print("URL:", url)
+            return None, None
+    except Exception as e:
+        print("An error occurred:", str(e))
+        print("URL:", url)
+        return None, None
+# Example usage
+if __name__ == "__main__":
+    url = "https://www.huffingtonpost.es/sociedad/esta-palabra-mas-prescindible-espanol-cambia-entiende.html"  # Replace with the URL you want to scrape
+    title, text = download_text_and_title(url)
+    if title and text:
+        print("Title:", title)
+        print("Text:", text)
+    else:
+        print("Unable to retrieve text and title.")

logo2.png ADDED Viewed

requirements.txt ADDED Viewed

	@@ -0,0 +1,6 @@

+cmake
+setuptools
+gradio
+hf_transfer
+beautifulsoup4
+numpy