"""Gradio app for collecting a second human summary of clickbait news articles.

The app loads the test split of the ``Iker/NoticIA`` dataset, shows one
headline/article at a time, stores the summary typed by the annotator in a
JSONL file, uploads that file to a Hugging Face Space and displays the
Rouge-1 score between the new summaries and the dataset's reference ones.
"""

import json
import os
import random
import string

import gradio as gr
import huggingface_hub
from datasets import load_dataset
from evaluate import load

from guidelines import guidelines

human2_annotation_file = "human2/test-human2.jsonl"
space = "Iker/ClickbaitAnnotation"

# User-facing placeholder texts (kept in Spanish: they are shown in the UI).
FINISHED_MESSAGE = "🎉 Anotación Finalizada 🎉"
PRESS_PLAY_MESSAGE = "Pulsa ▶️"

# Translation table that deletes every character in ``string.punctuation``.
# Built once instead of on every ``clean_text`` call.
_PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)


def _hf_token():
    """Return the value of the ``TOKEN`` environment variable, or ``True``.

    ``True`` is passed through when the variable is unset or empty;
    presumably huggingface_hub/datasets then fall back to locally stored
    credentials (see the ``token`` parameter in their documentation).
    """
    return os.environ.get("TOKEN") or True


def clean_text(text: str) -> str:
    """Normalize *text* before computing Rouge.

    Removes ASCII punctuation, collapses every run of whitespace (newlines
    included) into a single space, trims both ends and lowercases.
    """
    without_punctuation = text.translate(_PUNCTUATION_TABLE)
    # ``str.split()`` with no argument splits on any whitespace and drops
    # leading/trailing blanks, so no separate replace/strip is needed.
    return " ".join(without_punctuation.split()).lower()


def html_progress_bar(completed_steps, total_steps):
    """Return the progress snippet displayed in the ``gr.HTML`` component.

    Args:
        completed_steps: Number of examples already served.
        total_steps: Total number of examples.

    Returns:
        A string containing the completion percentage, rounded to an integer.
        When ``total_steps`` is 0 there is nothing left to do, so 100% is
        reported instead of raising ``ZeroDivisionError``.
    """
    if total_steps:
        percentage = (completed_steps / total_steps) * 100
    else:
        percentage = 100.0
    # NOTE(review): the template below contains no HTML tags in this view of
    # the file; confirm the original markup was not lost.
    return f""" Progress Bar
{percentage:.0f}%
"""


class AnnotationManager:
    """Keep the annotation state and implement the Gradio callbacks.

    Attributes are plain Python containers:
        dataset: examples still waiting to be annotated (consumed from the front).
        total: number of examples in the full test split.
        predictions: cleaned summaries written by the annotator.
        references: cleaned reference summaries, one single-item list per example.
        current: example currently shown in the UI, or ``None``.
    """

    def __init__(self):
        """Load the dataset and resume from previously uploaded annotations."""
        self.dataset = list(
            load_dataset("Iker/NoticIA", token=_hf_token(), split="test")
        )
        self.total = len(self.dataset)
        self.predictions = []
        self.references = []
        self.current = None
        print(f"Total examples: {self.total}")

        try:
            self._restore_previous_annotations()
        except Exception as exc:
            # Best effort: any failure (file not uploaded yet, no network,
            # malformed record, ...) simply means starting from scratch.
            print("Unable to download annotations. Starting from the beginning.")
            print(f"Reason: {exc!r}")

    def _restore_previous_annotations(self):
        """Download the annotation file from the Space and skip finished examples.

        State is only modified after the whole file has been parsed, so a
        malformed record cannot leave ``predictions``/``references`` partially
        filled while ``dataset`` is left untouched.
        """
        huggingface_hub.hf_hub_download(
            repo_id=space,
            repo_type="space",
            token=_hf_token(),
            filename=human2_annotation_file,
            local_dir=os.getcwd(),
        )
        # The file is written as UTF-8 with ``ensure_ascii=False``, so it must
        # be read back as UTF-8 regardless of the platform's default encoding.
        with open(human2_annotation_file, "r", encoding="utf8") as f:
            annotations = [json.loads(line) for line in f if line.strip()]

        predictions = [clean_text(a["summary2"]) for a in annotations]
        references = [[clean_text(a["summary"])] for a in annotations]

        self.predictions.extend(predictions)
        self.references.extend(references)
        # Annotations are stored in dataset order, so the first
        # ``len(annotations)`` examples are the ones already done.
        self.dataset = self.dataset[len(annotations) :]

    def get_next(self):
        """Advance to the next example.

        Returns:
            ``(headline, text)`` of the new current example, or the
            "finished" message twice when no examples are left. In the
            latter case ``self.current`` is reset to ``None`` so that the
            last example cannot be saved a second time.
        """
        if not self.dataset:
            self.current = None
            return FINISHED_MESSAGE, FINISHED_MESSAGE

        self.current = self.dataset.pop(0)
        return self.current["web_headline"], self.current["web_text"]

    def _current_view(self):
        """Return the 5-tuple of UI outputs describing ``self.current``."""
        return (
            self.current["web_headline"],
            self.current["web_text"],
            self.get_rouge(),
            self.progress(),
            "",
        )

    def save_annotation(self, annotation):
        """Store *annotation* for the current example and move to the next one.

        Args:
            annotation: Summary typed by the annotator.

        Returns:
            A 5-tuple ``(headline, text, rouge, progress_html, summary_box)``
            matching the outputs wired to the save button. The summary box
            is always cleared.

            * No current example and examples remain: the "press play"
              placeholders are returned and nothing is saved.
            * No current example and none remain: the "finished" message
              is returned and nothing is saved.
            * Empty annotation: the current example is shown again.
            * Otherwise the annotation is appended to the JSONL file, the
              file is uploaded to the Space and the next example is shown.
        """
        if self.current is None:
            # Nothing is on screen, so there is nothing to attach the text
            # to (previously a non-empty annotation raised ``TypeError``).
            if self.dataset:
                return (
                    PRESS_PLAY_MESSAGE,
                    PRESS_PLAY_MESSAGE,
                    PRESS_PLAY_MESSAGE,
                    self.progress(),
                    "",
                )
            return (
                FINISHED_MESSAGE,
                FINISHED_MESSAGE,
                self.get_rouge(),
                self.progress(),
                "",
            )

        if not annotation:
            return self._current_view()

        example = {
            "web_url": self.current["web_url"],
            "web_headline": self.current["web_headline"],
            "summary": self.current["summary"],
            "summary2": annotation,
            "web_text": self.current["web_text"],
            "clean_web_text": self.current["clean_web_text"],
        }

        # Append mode creates the file when it does not exist yet, so a
        # single code path covers both the first and the following records.
        os.makedirs(os.path.dirname(human2_annotation_file), exist_ok=True)
        with open(human2_annotation_file, "a", encoding="utf8") as f:
            print(json.dumps(example, ensure_ascii=False), file=f)

        self.predictions.append(clean_text(annotation))
        self.references.append([clean_text(example["summary"])])

        huggingface_hub.upload_file(
            repo_id=space,
            repo_type="space",
            token=_hf_token(),
            path_in_repo=human2_annotation_file,
            path_or_fileobj=human2_annotation_file,
        )

        next_headline, next_text = self.get_next()
        return next_headline, next_text, self.get_rouge(), self.progress(), ""

    def get_rouge(self):
        """Return the aggregated Rouge-1 score, or ``"N/A"`` if it cannot be computed."""
        try:
            # A fresh random id per call; presumably this keeps concurrent
            # metric instances from sharing cache files (TODO confirm).
            experiment_id = "".join(
                random.choices(string.ascii_uppercase + string.digits, k=6)
            )
            rouge = load("rouge", experiment_id=experiment_id)
            scores = rouge.compute(
                predictions=self.predictions,
                references=self.references,
                use_aggregator=True,
                rouge_types=["rouge1"],
            )
            return scores["rouge1"]
        except Exception:
            # UI boundary: show a placeholder rather than break the callback.
            return "N/A"

    def progress(self):
        """Return the progress snippet: examples served so far over the total."""
        completed = self.total - len(self.dataset)
        return html_progress_bar(completed, self.total)

    def gr_start(self):
        """Callback of the play button.

        Returns:
            The same 5-tuple as ``save_annotation``. If an example is already
            on screen it is shown again; otherwise the next one is fetched.
        """
        if self.current is not None:
            return self._current_view()

        headline, text = self.get_next()
        return headline, text, self.get_rouge(), self.progress(), ""


theme = gr.themes.Soft(
    primary_hue="emerald",
    secondary_hue="red",
    text_size="sm",
    spacing_size="sm",
    font=[
        gr.themes.GoogleFont("Poppins"),
        gr.themes.GoogleFont("Poppins"),
        gr.themes.GoogleFont("Poppins"),
        gr.themes.GoogleFont("Poppins"),
    ],
).set(block_background_fill="*neutral_50", block_background_fill_dark="*neutral_950")

manager = AnnotationManager()

with gr.Blocks(
    theme=theme,
    title="🖱️ Resumen de noticias Clickbait 🖱️",
    analytics_enabled=False,
) as demo:
    with gr.Tab("Guidelines") as tab_guidelines:
        gr.Markdown(guidelines)

    with gr.Tab("Anotación") as tab_annotation:
        gr_play = gr.Button("▶️ Empieza a anotar")
        gr_progress = gr.HTML(value=manager.progress(), label="Progreso")
        gr_rouge = gr.Textbox(
            value="Pulsa ▶️",
            label="Rouge-1",
            info="Rouge Score actual entre las anotaciones y los resúmenes de referencia.",
            lines=1,
            interactive=False,
        )
        gr_headline = gr.Textbox(
            value="Pulsa ▶️",
            label="Titular",
            info="El titular del artículo.",
            lines=2,
            interactive=False,
        )
        gr_body = gr.Textbox(
            value="Pulsa ▶️",
            label="Artículo",
            info="El cuerpo del artículo/noticia.",
            lines=10,
            interactive=False,
        )
        gr_summary = gr.Textbox(
            value="",
            label="Resumen",
            info="Escribe aquí el resumen del artículo. Recuerda leer las guidelines antes de empezar.",
            lines=2,
            interactive=True,
        )
        save = gr.Button(
            "💾 Guardar",
        )

        # Both buttons refresh the same five components, in this order.
        save.click(
            fn=manager.save_annotation,
            inputs=[gr_summary],
            outputs=[gr_headline, gr_body, gr_rouge, gr_progress, gr_summary],
            concurrency_limit=None,
        )
        gr_play.click(
            fn=manager.gr_start,
            inputs=None,
            outputs=[gr_headline, gr_body, gr_rouge, gr_progress, gr_summary],
            concurrency_limit=None,
        )

demo.queue(default_concurrency_limit=None)
# Username and password are both read from the same "pass" environment variable.
demo.launch(auth=(os.environ.get("pass"), os.environ.get("pass")))