# Hugging Face Spaces page header captured with the source (status: Sleeping).
import json | |
import os | |
import random | |
import string | |
import gradio as gr | |
import huggingface_hub | |
from datasets import load_dataset | |
from evaluate import load | |
from guidelines import guidelines | |
# Relative path of the JSONL file holding the second annotator's summaries.
# The same value is used as the local file path and as the path inside the
# Space repository when downloading and uploading.
human2_annotation_file: str = "human2/test-human2.jsonl"

# Identifier of the Hugging Face Space repository the annotation file is
# downloaded from and uploaded to.
space: str = "Iker/ClickbaitAnnotation"
def clean_text(text: str) -> str:
    """Normalize *text* before it is scored.

    Every character listed in ``string.punctuation`` is deleted, all runs of
    whitespace (newlines included) collapse to a single space, leading and
    trailing whitespace is dropped, and the result is lowercased.

    Args:
        text: The raw string to normalize.

    Returns:
        The normalized string.
    """
    # Deleting (rather than replacing) punctuation means "a,b" becomes "ab".
    punctuation_table = str.maketrans("", "", string.punctuation)
    words = text.translate(punctuation_table).split()
    # split() with no argument already discards newlines and edge whitespace.
    return " ".join(words).lower()
def html_progress_bar(completed_steps, total_steps):
    """Render a progress bar as a standalone HTML document string.

    Args:
        completed_steps: Number of steps already done.
        total_steps: Total number of steps. When it is zero or negative the
            bar is rendered at 0% instead of raising ``ZeroDivisionError``.

    Returns:
        An HTML string whose bar width and label show the completed
        percentage, clamped to the 0-100 range.
    """
    if total_steps > 0:
        percentage = (completed_steps / total_steps) * 100
        # Clamp so an out-of-range count can never produce a negative width
        # or a bar wider than its container.
        percentage = max(0.0, min(100.0, percentage))
    else:
        # Nothing to measure against: show an empty bar.
        percentage = 0.0
    return f"""
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Progress Bar</title>
<style>
.progress-container {{
width: 100%;
background-color: #ffffff;
}}
.progress-bar {{
width: {percentage}%;
height: 30px;
background-color: #d1fae5;
text-align: center;
line-height: 30px;
color: white;
}}
</style>
</head>
<body>
<div class="progress-container">
<div class="progress-bar">{percentage:.0f}%</div>
</div>
</body>
</html>
"""
class AnnotationManager:
    """Hand out dataset examples one by one and persist a second summary for each.

    The manager loads the ``test`` split of ``Iker/NoticIA``, tries to resume
    from a previously uploaded annotation file, and keeps two parallel lists
    (``predictions`` and ``references``) that are used to compute ROUGE-1
    between the new summaries and the dataset's reference summaries.
    """

    # Text shown in the headline and body boxes once no examples remain.
    FINISHED_MESSAGE = "🎉 Anotación Finalizada 🎉"
    # Text shown while no example has been requested yet.
    PLACEHOLDER = "Pulsa ▶️"

    def __init__(self):
        """Load the dataset and skip the examples that were already annotated."""
        self.dataset = list(
            load_dataset(
                "Iker/NoticIA", token=os.environ.get("TOKEN") or True, split="test"
            )
        )
        self.total = len(self.dataset)
        self.predictions = []
        self.references = []
        # Example currently shown to the annotator; None when nothing is pending.
        self.current = None
        print(f"Total examples: {self.total}")
        self._resume_from_saved_annotations()

    def _resume_from_saved_annotations(self):
        """Download earlier annotations and drop the matching dataset prefix.

        Best effort: any failure (download, read, or parse) leaves the manager
        untouched so annotation starts from the first example.
        """
        try:
            huggingface_hub.hf_hub_download(
                repo_id=space,
                repo_type="space",
                token=os.environ.get("TOKEN") or True,
                filename=human2_annotation_file,
                local_dir=os.getcwd(),
            )
            # The file is written as UTF-8 by save_annotation, so read it the
            # same way; blank lines are skipped instead of breaking json.loads.
            with open(human2_annotation_file, "r", encoding="utf8") as f:
                annotations = [json.loads(line) for line in f if line.strip()]
            # Build into locals first so a failure half-way through cannot
            # leave predictions/references out of step with the dataset.
            predictions = [clean_text(a["summary2"]) for a in annotations]
            references = [[clean_text(a["summary"])] for a in annotations]
        except Exception as error:
            print("Unable to download annotations. Starting from the beginning.")
            print(f"Reason: {error!r}")
            return

        self.predictions.extend(predictions)
        self.references.extend(references)
        # Skips as many examples as there are saved annotations.
        self.dataset = self.dataset[len(annotations) :]

    def get_next(self):
        """Advance to the next example.

        Returns:
            A ``(headline, text)`` tuple for the next example, or the finished
            message twice when no examples remain. In the latter case
            ``self.current`` is reset to ``None`` so the last example cannot
            be saved a second time.
        """
        if not self.dataset:
            self.current = None
            return self.FINISHED_MESSAGE, self.FINISHED_MESSAGE
        self.current = self.dataset.pop(0)
        return self.current["web_headline"], self.current["web_text"]

    def save_annotation(self, annotation):
        """Store *annotation* for the current example and move to the next one.

        The annotation is only saved when it contains non-whitespace text and
        an example is currently pending. Otherwise the current state is
        returned unchanged.

        Args:
            annotation: Summary text typed by the annotator.

        Returns:
            A 5-tuple ``(headline, text, rouge, progress_html, summary_box)``
            matching the outputs wired to the save button; ``summary_box`` is
            always the empty string.
        """
        has_text = bool(annotation and annotation.strip())
        if has_text and self.current is not None:
            example = {
                "web_url": self.current["web_url"],
                "web_headline": self.current["web_headline"],
                "summary": self.current["summary"],
                "summary2": annotation,
                "web_text": self.current["web_text"],
                "clean_web_text": self.current["clean_web_text"],
            }

            directory = os.path.dirname(human2_annotation_file)
            if directory:
                os.makedirs(directory, exist_ok=True)
            # Append mode creates the file on first use, so a single branch
            # covers both the "new file" and "existing file" cases.
            with open(human2_annotation_file, "a", encoding="utf8") as f:
                print(json.dumps(example, ensure_ascii=False), file=f)

            self.predictions.append(clean_text(annotation))
            self.references.append([clean_text(example["summary"])])

            # NOTE: if the upload raises, the annotation has already been
            # written locally and counted, while self.current is unchanged.
            huggingface_hub.upload_file(
                repo_id=space,
                repo_type="space",
                token=os.environ.get("TOKEN") or True,
                path_in_repo=human2_annotation_file,
                path_or_fileobj=human2_annotation_file,
            )

            next_headline, next_text = self.get_next()
            return (
                next_headline,
                next_text,
                self.get_rouge(),
                self.progress(),
                "",
            )

        if self.current is not None:
            # Nothing to save: keep showing the pending example.
            return (
                self.current["web_headline"],
                self.current["web_text"],
                self.get_rouge(),
                self.progress(),
                "",
            )

        if not self.dataset:
            # No pending example and nothing left to hand out.
            return (
                self.FINISHED_MESSAGE,
                self.FINISHED_MESSAGE,
                self.get_rouge(),
                self.progress(),
                "",
            )

        # Annotation has not been started yet.
        return (
            self.PLACEHOLDER,
            self.PLACEHOLDER,
            self.PLACEHOLDER,
            self.progress(),
            "",
        )

    def get_rouge(self):
        """Compute ROUGE-1 between the saved annotations and the references.

        Returns:
            The aggregated ``rouge1`` value, or the string ``"N/A"`` if
            loading or computing the metric raises.
        """
        try:
            # A random experiment id is passed to every metric load.
            experiment_id = "".join(
                random.choice(string.ascii_uppercase + string.digits) for _ in range(6)
            )
            rouge = load("rouge", experiment_id=experiment_id)
            return rouge.compute(
                predictions=self.predictions,
                references=self.references,
                use_aggregator=True,
                rouge_types=["rouge1"],
            )["rouge1"]
        except Exception:
            return "N/A"

    def progress(self):
        """Return the HTML progress bar for examples handed out so far."""
        # Completed = total minus what is still queued (the pending example
        # counts as completed because it has left the queue).
        return html_progress_bar(self.total - len(self.dataset), self.total)

    def gr_start(self):
        """Handler for the start button.

        Returns:
            The same 5-tuple shape as ``save_annotation``. If an example is
            already pending it is shown again; otherwise the next example is
            fetched.
        """
        if self.current is not None:
            return (
                self.current["web_headline"],
                self.current["web_text"],
                self.get_rouge(),
                self.progress(),
                "",
            )
        headline, text = self.get_next()
        return headline, text, self.get_rouge(), self.progress(), ""
# Visual theme shared by the whole app: Gradio's Soft theme with emerald/red
# hues, small text and spacing, and Poppins supplied for each of the four
# font entries.
_poppins_fonts = [gr.themes.GoogleFont("Poppins") for _ in range(4)]
_soft_theme = gr.themes.Soft(
    primary_hue="emerald",
    secondary_hue="red",
    text_size="sm",
    spacing_size="sm",
    font=_poppins_fonts,
)
# Override the block background for light and dark mode.
theme = _soft_theme.set(
    block_background_fill="*neutral_50",
    block_background_fill_dark="*neutral_950",
)
# Module-level manager whose bound methods back both buttons below.
manager = AnnotationManager()

# Text shown in the read-only boxes until the start button is pressed.
_placeholder = "Pulsa ▶️"

with gr.Blocks(
    title="🖱️ Resumen de noticias Clickbait 🖱️",
    theme=theme,
    analytics_enabled=False,
) as demo:
    # First tab: the annotation guidelines rendered as Markdown.
    with gr.Tab("Guidelines") as tab_guidelines:
        gr.Markdown(guidelines)

    # Second tab: the annotation form itself.
    with gr.Tab("Anotación") as tab_annotation:
        gr_play = gr.Button("▶️ Empieza a anotar")
        gr_progress = gr.HTML(label="Progreso", value=manager.progress())

        gr_rouge = gr.Textbox(
            label="Rouge-1",
            info="Rouge Score actual entre las anotaciones y los resúmenes de referencia.",
            value=_placeholder,
            lines=1,
            interactive=False,
        )
        gr_headline = gr.Textbox(
            label="Titular",
            info="El titular del artículo.",
            value=_placeholder,
            lines=2,
            interactive=False,
        )
        gr_body = gr.Textbox(
            label="Artículo",
            info="El cuerpo del artículo/noticia.",
            value=_placeholder,
            lines=10,
            interactive=False,
        )
        # The only editable field: the annotator's summary.
        gr_summary = gr.Textbox(
            label="Resumen",
            info="Escribe aquí el resumen del artículo. Recuerda leer las guidelines antes de empezar.",
            value="",
            lines=2,
            interactive=True,
        )
        save = gr.Button("💾 Guardar")

        # Both handlers return the same 5-tuple, so they refresh the same
        # components in the same order.
        _refreshed = (gr_headline, gr_body, gr_rouge, gr_progress, gr_summary)
        save.click(
            fn=manager.save_annotation,
            inputs=[gr_summary],
            outputs=list(_refreshed),
        )
        gr_play.click(
            fn=manager.gr_start,
            inputs=None,
            outputs=list(_refreshed),
        )

# The "pass" environment variable is used as both username and password.
_credential = os.environ.get("pass")
demo.launch(auth=(_credential, _credential))