import json
import os
import random
import string
import gradio as gr
import huggingface_hub
from datasets import load_dataset
from evaluate import load
from guidelines import guidelines
# Relative path of the JSONL file holding the second annotator's summaries.
# The same path is used locally (read/append) and as the path inside the
# Space repository when the file is downloaded or uploaded.
human2_annotation_file = "human2/test-human2.jsonl"
# Hugging Face repository id; it is always passed with repo_type="space".
space = "Iker/ClickbaitAnnotation"
def clean_text(text: str) -> str:
    """Normalize *text* for scoring.

    ASCII punctuation (``string.punctuation``) is deleted, every run of
    whitespace (including newlines) is collapsed to a single space, leading
    and trailing whitespace is dropped, and the result is lowercased.
    """
    # Mapping a code point to None makes str.translate delete it.
    drop_punctuation = dict.fromkeys(map(ord, string.punctuation))
    words = text.translate(drop_punctuation).split()
    return " ".join(words).lower()
def html_progress_bar(completed_steps, total_steps):
    """Return the markup displayed in the progress widget.

    The returned markup is static: it does not interpolate either argument.
    The previous implementation computed a completion percentage that was
    never used, and that dead division raised ``ZeroDivisionError`` whenever
    ``total_steps`` was 0 (for example with an empty dataset). The unused
    computation has been removed, so the function is now safe for any input.

    Args:
        completed_steps: Number of steps already done (currently unused).
        total_steps: Total number of steps (currently unused).

    Returns:
        The progress-bar markup as a string.
    """
    return "\nProgress Bar\n"
class AnnotationManager:
    """Serve dataset examples one at a time and persist the annotator's summaries.

    On construction the ``test`` split of ``Iker/NoticIA`` is loaded and, when
    a previous annotation file can be downloaded from the Space, the already
    annotated prefix is skipped so the session resumes where it stopped.

    Attributes:
        dataset: Examples that have not been served yet (consumed from the front).
        total: Number of examples in the full split.
        predictions: Cleaned annotator summaries collected so far.
        references: Cleaned reference summaries, one single-item list per prediction.
        current: The example currently shown, or ``None`` when nothing is
            being annotated (not started yet, or everything is done).
    """

    _FINISHED = "🎉 Anotación Finalizada 🎉"
    _PLACEHOLDER = "Pulsa ▶️"

    def __init__(self):
        self.dataset = list(
            load_dataset("Iker/NoticIA", token=self._token(), split="test")
        )
        self.total = len(self.dataset)
        self.predictions = []
        self.references = []
        self.current = None
        print(f"Total examples: {self.total}")
        self._restore_previous_annotations()

    @staticmethod
    def _token():
        """Return the ``TOKEN`` environment variable, or ``True`` when it is unset/empty."""
        return os.environ.get("TOKEN") or True

    def _restore_previous_annotations(self):
        """Best-effort resume: load earlier annotations and skip those examples.

        Everything is parsed into local lists first and only committed once
        the whole file has been read, so a failure halfway through cannot
        leave the scores and the remaining dataset out of sync.
        """
        try:
            huggingface_hub.hf_hub_download(
                repo_id=space,
                repo_type="space",
                token=self._token(),
                filename=human2_annotation_file,
                local_dir=os.getcwd(),
            )
            # The file is written as UTF-8, so it must be read as UTF-8 too.
            with open(human2_annotation_file, "r", encoding="utf8") as f:
                annotations = [json.loads(line) for line in f if line.strip()]
            predictions = [clean_text(a["summary2"]) for a in annotations]
            references = [[clean_text(a["summary"])] for a in annotations]
        except Exception:
            # Deliberately broad: any failure simply means a fresh start.
            print("Unable to download annotations. Starting from the beginning.")
            return

        self.predictions.extend(predictions)
        self.references.extend(references)
        # Saved annotations correspond to the first len(annotations) examples.
        self.dataset = self.dataset[len(annotations):]

    def get_next(self):
        """Advance to the next example.

        Returns:
            ``(headline, text)`` of the new current example, or the
            "finished" message twice when no examples remain. In the latter
            case ``current`` is reset to ``None`` so the last example cannot
            be saved a second time.
        """
        if not self.dataset:
            self.current = None
            return self._FINISHED, self._FINISHED
        self.current = self.dataset.pop(0)
        return self.current["web_headline"], self.current["web_text"]

    def _current_view(self):
        """Return the UI tuple for the example currently being annotated."""
        return (
            self.current["web_headline"],
            self.current["web_text"],
            self.get_rouge(),
            self.progress(),
            "",
        )

    def save_annotation(self, annotation):
        """Persist *annotation* for the current example and move on.

        Args:
            annotation: Summary typed by the annotator.

        Returns:
            A 5-tuple ``(headline, text, rouge, progress, summary_box)``.
            ``summary_box`` is always the empty string.

            * Nothing is being annotated: placeholders if examples remain
              (annotation not started), otherwise the "finished" message.
            * Empty annotation: the current example is shown again.
            * Otherwise the annotation is appended to the local JSONL file,
              uploaded to the Space, and the next example is returned.
        """
        if self.current is None:
            # Guard against saving before starting or after finishing; there
            # is no example to attach the annotation to.
            if self.dataset:
                return (
                    self._PLACEHOLDER,
                    self._PLACEHOLDER,
                    self._PLACEHOLDER,
                    self.progress(),
                    "",
                )
            return (
                self._FINISHED,
                self._FINISHED,
                self.get_rouge(),
                self.progress(),
                "",
            )

        if not annotation:
            return self._current_view()

        example = {
            "web_url": self.current["web_url"],
            "web_headline": self.current["web_headline"],
            "summary": self.current["summary"],
            "summary2": annotation,
            "web_text": self.current["web_text"],
            "clean_web_text": self.current["clean_web_text"],
        }

        # Append mode creates the file when it is missing, so one code path
        # covers both the first and every later annotation.
        directory = os.path.dirname(human2_annotation_file)
        if directory:
            os.makedirs(directory, exist_ok=True)
        with open(human2_annotation_file, "a", encoding="utf8") as f:
            print(json.dumps(example, ensure_ascii=False), file=f)

        self.predictions.append(clean_text(annotation))
        self.references.append([clean_text(example["summary"])])

        huggingface_hub.upload_file(
            repo_id=space,
            repo_type="space",
            token=self._token(),
            path_in_repo=human2_annotation_file,
            path_or_fileobj=human2_annotation_file,
        )

        next_headline, next_text = self.get_next()
        return (
            next_headline,
            next_text,
            self.get_rouge(),
            self.progress(),
            "",
        )

    def get_rouge(self):
        """Return the ROUGE-1 score of the predictions against the references.

        Returns:
            The ``rouge1`` value computed by the ``rouge`` metric, or the
            string ``"N/A"`` when loading or computing the metric fails.
        """
        try:
            # A fresh random experiment id is generated for every metric load.
            experiment_id = "".join(
                random.choice(string.ascii_uppercase + string.digits) for _ in range(6)
            )
            rouge = load("rouge", experiment_id=experiment_id)
            return rouge.compute(
                predictions=self.predictions,
                references=self.references,
                use_aggregator=True,
                rouge_types=["rouge1"],
            )["rouge1"]
        except Exception:
            return "N/A"

    def progress(self):
        """Return the progress markup: examples already served out of the total."""
        return html_progress_bar(self.total - len(self.dataset), self.total)

    def gr_start(self):
        """Return the UI tuple for the start button.

        Shows the current example when one is active; otherwise advances to
        the next one (or to the "finished" message when none remain).
        """
        if self.current is None:
            headline, text = self.get_next()
            return headline, text, self.get_rouge(), self.progress(), ""
        return self._current_view()
# Soft theme with emerald/red hues, small text and spacing, the Poppins
# Google font in every slot of the font list, and custom block backgrounds.
theme = gr.themes.Soft(
    primary_hue="emerald",
    secondary_hue="red",
    text_size="sm",
    spacing_size="sm",
    font=[gr.themes.GoogleFont("Poppins") for _ in range(4)],
).set(
    block_background_fill="*neutral_50",
    block_background_fill_dark="*neutral_950",
)
# Single manager instance; its bound methods are the callbacks of both buttons.
manager = AnnotationManager()
with gr.Blocks(
    theme=theme, title="🖱️ Resumen de noticias Clickbait 🖱️", analytics_enabled=False,
) as demo:
    # First tab: the annotation guidelines rendered as Markdown.
    with gr.Tab("Guidelines") as tab_guidelines:
        gr.Markdown(guidelines)
    # Second tab: the annotation form. Components are created in the order
    # they appear in this block.
    with gr.Tab("Anotación") as tab_annotation:
        gr_play = gr.Button("▶️ Empieza a anotar")
        # Initial progress is computed once, when the UI is built.
        gr_progress = gr.HTML(value=manager.progress(), label="Progreso")
        # Read-only fields start with a placeholder until a callback fills them.
        gr_rouge = gr.Textbox(
            value="Pulsa ▶️",
            label="Rouge-1",
            info="Rouge Score actual entre las anotaciones y los resúmenes de referencia.",
            lines=1,
            interactive=False,
        )
        gr_headline = gr.Textbox(
            value="Pulsa ▶️",
            label="Titular",
            info="El titular del artículo.",
            lines=2,
            interactive=False,
        )
        gr_body = gr.Textbox(
            value="Pulsa ▶️",
            label="Artículo",
            info="El cuerpo del artículo/noticia.",
            lines=10,
            interactive=False,
        )
        # The only editable field: the annotator's summary.
        gr_summary = gr.Textbox(
            value="",
            label="Resumen",
            info="Escribe aquí el resumen del artículo. Recuerda leer las guidelines antes de empezar.",
            lines=2,
            interactive=True,
        )
        save = gr.Button(
            "💾 Guardar",
        )
        # Both callbacks return a 5-tuple that maps, in order, onto
        # headline, body, rouge, progress and the summary box.
        save.click(
            fn=manager.save_annotation,
            inputs=[gr_summary],
            outputs=[gr_headline, gr_body, gr_rouge, gr_progress, gr_summary],
            concurrency_limit=None,
        )
        gr_play.click(
            fn=manager.gr_start,
            inputs=None,
            outputs=[gr_headline, gr_body, gr_rouge, gr_progress, gr_summary],
            concurrency_limit=None,
        )
demo.queue(default_concurrency_limit=None)
# Both elements of the auth tuple are read from the same "pass" environment variable.
demo.launch(auth=(os.environ.get("pass"), os.environ.get("pass")))