First Version
- README.md +14 -5
- app.py +159 -0
- cache_system.py +51 -0
- download_url.py +71 -0
- logo2.png +0 -0
- requirements.txt +6 -0
README.md
CHANGED

@@ -1,13 +1,22 @@
 ---
 title: ClickbaitFighter
-emoji:
-colorFrom:
-colorTo:
+emoji: ⚔️
+colorFrom: yellow
+colorTo: blue
 sdk: gradio
-sdk_version: 4.14.0
-app_file: app.py
 pinned: false
 license: cc-by-nc-sa-4.0
+suggested_hardware: t4-small
+suggested_storage: small
+app_file: app.py
+fullWidth: true
+models:
+- Iker/ClickbaitFighter-10B
+datasets:
+- Iker/Clickbait-News
+tags:
+- summarization
+- clickbait
 ---

 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py
ADDED

@@ -0,0 +1,159 @@

import os
from typing import Iterator, Tuple

import gradio as gr

from cache_system import CacheHandler
from download_url import download_text_and_title
from gradio_client import Client

print(f"CPU cores: {os.cpu_count()}.")

# The backend address and the HF token come from the environment; the
# `or True` fallback only keeps local runs from failing at import time.
server = os.environ.get("SERVER") or True
auth_token = os.environ.get("TOKEN") or True

client = Client(server)


def finish_generation(text: str) -> str:
    return f"{text}\n\n⬇️ Ayuda a mejorar la herramienta marcando si el resumen es correcto o no. ⬇️"


def generate_text(
    url: str, mode: int, progress=gr.Progress(track_tqdm=False)
) -> Iterator[Tuple[str, str, str]]:
    global cache_handler
    global run_log  # assumed to be provided elsewhere; it is not defined in this file

    # 1) Download the article
    progress(0, desc="🤖 Accediendo a la noticia")

    # First, check if the URL is in the cache
    title, text, temp = cache_handler.get_from_cache(url, mode)
    if title is not None and text is not None and temp is not None:
        temp = finish_generation(temp)
        yield title, temp, text
    else:
        try:
            title, text = download_text_and_title(url)
        except Exception:
            title = None
            text = None

        if title is None or text is None:
            yield (
                "🤖 No he podido acceder a la noticia, asegúrate de que la URL es correcta y que es posible acceder a la noticia desde un navegador.",
                "❌❌❌ Inténtalo de nuevo ❌❌❌",
                "Error",
            )
            return

        progress(0.5, desc="🤖 Leyendo noticia")

        try:
            temp = client.predict(
                url,  # str in '🌐 URL de la noticia' Textbox component
                title,  # str in '🌐 Título de la noticia' Textbox component
                text,  # str in '📰 Cuerpo de la noticia' Textbox component
                mode,  # float (numeric value between 0 and 100) in '🎚️ Nivel de resumen' Slider component
                api_name="/predict",
            )

            # Emit the summary as soon as the server returns it; the final,
            # annotated version is yielded below.
            for o in temp:
                yield title, temp, text
        except Exception:
            yield (
                "🤖 El servidor no se encuentra disponible.",
                "❌❌❌ Inténtalo de nuevo más tarde ❌❌❌",
                "Error",
            )
            return

        cache_handler.add_to_cache(
            url=url, title=title, text=text, summary_type=mode, summary=temp
        )
        temp = finish_generation(temp)
        yield title, temp, text

    run_log.flag()
    return


cache_handler = CacheHandler(max_cache_size=1000)
feedback_log = gr.HuggingFaceDatasetSaver(auth_token, "Iker/Clickbait-News")

demo = gr.Interface(
    generate_text,
    inputs=[
        gr.Textbox(
            label="🌐 URL de la noticia",
            info="Introduce la URL de la noticia que deseas resumir.",
            value="https://www.heraldo.es/noticias/salud/2024/01/08/atun-alimento-grasa-muscular-ayuda-combatir-colesterol-1702116.html",
            interactive=True,
        ),
        gr.Slider(
            minimum=0,
            maximum=100,
            step=50,
            value=50,
            label="🎚️ Nivel de resumen",
            info="""¿Hasta qué punto quieres resumir la noticia?

Si solo deseas un resumen, selecciona 0.

Si buscas un resumen y desmontar el clickbait, elige 50.

Para obtener solo la respuesta al clickbait, selecciona 100""",
            interactive=True,
        ),
    ],
    outputs=[
        gr.Textbox(
            label="📰 Titular de la noticia",
            interactive=False,
            placeholder="Aquí aparecerá el título de la noticia",
        ),
        gr.Textbox(
            label="🗒️ Resumen",
            interactive=False,
            placeholder="Aquí aparecerá el resumen de la noticia.",
        ),
        gr.Textbox(
            label="Noticia completa",
            visible=False,
            render=False,
            interactive=False,
            placeholder="Aquí aparecerá la noticia completa.",
        ),
    ],
    title="⚔️ Clickbait Fighter! ⚔️",
    thumbnail="logo2.png",
    theme="JohnSmith9982/small_and_pretty",
    description="""Esta Inteligencia Artificial es capaz de generar un resumen de una sola frase que revela la verdad detrás de un titular sensacionalista o clickbait. Solo tienes que introducir la URL de la noticia. La IA accederá a la noticia, la leerá y en cuestión de segundos generará un resumen de una sola frase que revele la verdad detrás del titular.

🎚 Ajusta el nivel de resumen con el control deslizante. Cuanto más alto, más corto será el resumen.

🗒 La IA no es capaz de acceder a todas las webs, por ejemplo, si introduces un enlace a una noticia que requiere suscripción, la IA no podrá acceder a ella. Algunas webs pueden tener tecnologías para bloquear bots.

⌚ La IA se encuentra corriendo en un hardware bastante modesto, debería tardar menos de 10 segundos en generar el resumen, pero si muchos usuarios usan la app a la vez, tendrás que esperar tu turno.

💸 Este es un proyecto sin ánimo de lucro, no se genera ningún tipo de ingreso con esta app. Los datos, la IA y el código se publicarán para su uso en la investigación académica. No puedes usar esta app para ningún uso comercial.

🧪 El modelo se encuentra en fase de desarrollo, si quieres ayudar a mejorarlo puedes usar los botones 👍 y 👎 para valorar el resumen. ¡Gracias por tu ayuda!""",
    article="Esta Inteligencia Artificial ha sido generada por Iker García-Ferrero. Puedes saber más sobre mi trabajo en mi [página web](https://ikergarcia1996.github.io/Iker-Garcia-Ferrero/) o mi perfil de [X](https://twitter.com/iker_garciaf). Puedes ponerte en contacto conmigo a través de correo electrónico (ver web) y X.",
    cache_examples=False,
    concurrency_limit=1,
    allow_flagging="manual",
    flagging_options=[("👍", "correct"), ("👎", "incorrect")],
    flagging_callback=feedback_log,
)

demo.queue(max_size=None)
demo.launch(share=False)
cache_system.py
ADDED

@@ -0,0 +1,51 @@

from collections import OrderedDict
from datetime import datetime
from typing import Optional, Tuple


class CacheHandler:
    def __init__(self, max_cache_size: int = 1000):
        # Using OrderedDict to maintain the order of insertion for efficient removal of oldest items
        self.cache = OrderedDict()
        self.max_cache_size = max_cache_size
        self.misses = 0
        self.hits = 0

    def add_to_cache(
        self, url: str, title: str, text: str, summary_type: int, summary: str
    ):
        # If URL already exists, update it and move it to the end to mark it as the most recently used
        if url in self.cache:
            self.cache.move_to_end(url)
            self.cache[url][f"summary_{summary_type}"] = summary
            self.cache[url]["date"] = datetime.now()
        else:
            # Add new entry to the cache
            self.cache[url] = {
                "title": title,
                "text": text,
                "date": datetime.now(),
                "summary_0": summary if summary_type == 0 else None,
                "summary_50": summary if summary_type == 50 else None,
                "summary_100": summary if summary_type == 100 else None,
            }
            # Remove the oldest item if cache exceeds max size
            if len(self.cache) > self.max_cache_size:
                self.cache.popitem(last=False)  # pop the oldest item

    def get_from_cache(
        self, url: str, summary_type: int
    ) -> Tuple[Optional[str], Optional[str], Optional[str]]:
        if url in self.cache and self.cache[url][f"summary_{summary_type}"] is not None:
            # Move the accessed item to the end to mark it as recently used
            self.cache.move_to_end(url)
            self.hits += 1
            return (
                self.cache[url]["title"],
                self.cache[url]["text"],
                self.cache[url][f"summary_{summary_type}"],
            )
        else:
            self.misses += 1
            return None, None, None

    def get_cache_stats(self):
        return self.hits, self.misses, len(self.cache)
download_url.py
ADDED

@@ -0,0 +1,71 @@

import requests
from bs4 import BeautifulSoup


def download_text_and_title(url):
    try:
        # Remove the query string from the URL
        url = url.split("?")[0]
        # Remove emojis and other special characters
        url = url.encode("ascii", "ignore").decode("ascii")

        # Send a GET request to the URL
        headers = {
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/50.0.2661.102 Safari/537.36"
        }

        response = requests.get(url, headers=headers, allow_redirects=True)
        # While the page title is itself a redirect URL, follow it
        soup = BeautifulSoup(response.text, "html.parser")
        title = soup.title.string if soup.title else "No Title Found"
        while title.startswith("http:/") or title.startswith("https:/"):
            url = title
            response = requests.get(url, headers=headers, allow_redirects=True)
            soup = BeautifulSoup(response.text, "html.parser")
            title = soup.title.string if soup.title else "No Title Found"

        # Check if the request was successful
        if response.status_code == 200:
            # Parse the HTML content using BeautifulSoup
            soup = BeautifulSoup(response.text, "html.parser")

            # Extract the title
            title = soup.title.string if soup.title else "No Title Found"

            # Extract all the text from the webpage
            text = [p.get_text() for p in soup.find_all("p")]
            text = [
                p.replace("\n", " ").replace("\r", " ").replace("\t", " ") for p in text
            ]
            text = [" ".join(p.strip().split()) for p in text]
            text = [p for p in text if len(p) > 0 and len(p.split()) > 5]

            # Clean text
            text = "\n".join(text)

            title = title.replace("\n", " ").replace("\r", " ").replace("\t", " ")
            title = " ".join(title.strip().split())

            return title, text
        else:
            print("Failed to retrieve the web page. Status code:", response.status_code)
            print("URL:", url)
            return None, None
    except Exception as e:
        print("An error occurred:", str(e))
        print("URL:", url)
        return None, None


# Example usage
if __name__ == "__main__":
    url = "https://www.huffingtonpost.es/sociedad/esta-palabra-mas-prescindible-espanol-cambia-entiende.html"  # Replace with the URL you want to scrape
    title, text = download_text_and_title(url)

    if title and text:
        print("Title:", title)
        print("Text:", text)
    else:
        print("Unable to retrieve text and title.")
logo2.png
ADDED
requirements.txt
ADDED
@@ -0,0 +1,6 @@

cmake
setuptools
gradio
hf_transfer
beautifulsoup4
numpy