Spaces:

Iker
/

ClickbaitAnnotation

Sleeping

File size: 8,194 Bytes

import json
import os
import random
import string

import gradio as gr
import huggingface_hub
from datasets import load_dataset
from evaluate import load

from guidelines import guidelines

# Local JSONL file holding the second annotator's summaries (one JSON object
# per line). AnnotationManager downloads it from the hub dataset repo on start
# and re-uploads it as "test.jsonl" after every saved annotation.
human2_annotation_file = "test.jsonl"


def clean_text(text: str) -> str:
    """Normalize *text* so two summaries can be compared token by token.

    ASCII punctuation (``string.punctuation``) is deleted, every run of
    whitespace -- newlines included -- collapses to a single space, leading
    and trailing whitespace is dropped, and the result is lowercased.
    """
    # Mapping a code point to None makes ``str.translate`` delete it.
    drop_punctuation = dict.fromkeys(map(ord, string.punctuation))
    words = text.translate(drop_punctuation).split()
    return " ".join(words).lower()


def html_progress_bar(completed_steps, total_steps):
    """Return a standalone HTML document that renders a progress bar.

    Args:
        completed_steps: Number of steps already done.
        total_steps: Total number of steps. When it is zero or negative the
            bar is rendered at 0% instead of raising ``ZeroDivisionError``.

    Returns:
        An HTML string whose bar width and label show the completion
        percentage, clamped to the 0-100 range.
    """
    if total_steps > 0:
        percentage = (completed_steps / total_steps) * 100
        # Keep the CSS width valid even if the caller's counters drift.
        percentage = min(max(percentage, 0.0), 100.0)
    else:
        percentage = 0.0
    return f"""
    <!DOCTYPE html>
    <html lang="en">
    <head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Progress Bar</title>
    <style>
        .progress-container {{
            width: 100%;
            background-color: #ffffff;
        }}

        .progress-bar {{
            width: {percentage}%;
            height: 30px;
            background-color: #d1fae5;
            text-align: center;
            line-height: 30px;
            color: white;
        }}
    </style>
    </head>
    <body>
    
    <div class="progress-container">
        <div class="progress-bar">{percentage:.0f}%</div>
    </div>

    </body>
    </html>
    """


class AnnotationManager:
    """Hand out test examples one by one and persist second-annotator summaries.

    The manager loads the ``test`` split of ``Iker/NoticIA``, skips as many
    examples as there are annotations already stored in the
    ``Iker/NoticIA_Human_Validation`` dataset repo, and keeps two parallel
    lists (``predictions`` / ``references``) of cleaned summaries used to
    compute ROUGE-1 between the new annotations and the reference summaries.

    Attributes are plain instance state:
        dataset: examples still waiting to be shown (consumed from the front).
        total: size of the full test split.
        predictions: cleaned second-annotator summaries.
        references: cleaned reference summaries, each wrapped in a list.
        current: the example on screen, or ``None`` before the first
            ``get_next`` call and after the queue is exhausted.
    """

    _FINISHED = "🎉 Anotación Finalizada 🎉"
    _PRESS_PLAY = "Pulsa ▶️"

    def __init__(self):
        """Load the dataset and resume after the annotations found on the hub."""
        self.dataset = list(
            load_dataset("Iker/NoticIA", token=self._token(), split="test")
        )

        self.total = len(self.dataset)
        self.predictions = []
        self.references = []
        self.current = None

        print(f"Total examples: {self.total}")

        try:
            # Drop any stale local copy so the hub version is the one we read.
            if os.path.exists(human2_annotation_file):
                os.remove(human2_annotation_file)

            huggingface_hub.hf_hub_download(
                repo_id="Iker/NoticIA_Human_Validation",
                repo_type="dataset",
                token=self._token(),
                filename="test.jsonl",
                local_dir=os.getcwd(),
            )

            # The file is written as utf8 below, so read it the same way.
            # Blank lines are skipped so a stray newline cannot abort resume.
            with open(human2_annotation_file, "r", encoding="utf8") as f:
                annotations = [json.loads(line) for line in f if line.strip()]

            # Build into locals first: a malformed record must not leave the
            # instance with half-filled prediction/reference lists.
            predictions = [clean_text(a["summary2"]) for a in annotations]
            references = [[clean_text(a["summary"])] for a in annotations]
        except Exception as err:
            # Best effort: without previous annotations, start from scratch.
            print("Unable to download annotations. Starting from the beginning.")
            print(f"Reason: {err!r}")
        else:
            self.predictions = predictions
            self.references = references
            self.dataset = self.dataset[len(annotations) :]

    @staticmethod
    def _token():
        """Return the hub token from the ``TOKEN`` env var, or ``True`` if unset."""
        return os.environ.get("TOKEN") or True

    def get_next(self):
        """Advance to the next example.

        Returns:
            ``(headline, body)`` of the new current example, or the
            "finished" message twice when no examples are left. In the latter
            case ``current`` is reset to ``None`` so that a later save cannot
            store the last example a second time.
        """
        if not self.dataset:
            self.current = None
            return self._FINISHED, self._FINISHED
        self.current = self.dataset.pop(0)
        return self.current["web_headline"], self.current["web_text"]

    def save_annotation(self, annotation: str):
        """Store *annotation* for the current example and move on.

        A non-blank annotation is appended to the local JSONL file, uploaded
        to the hub, added to the ROUGE lists, and the next example is loaded.
        A blank annotation (empty, whitespace-only or ``None``), or a save
        attempted while no example is on screen, stores nothing.

        Returns:
            A 5-tuple ``(headline, body, rouge, progress_html, summary_box)``
            matching the outputs wired to the save button; ``summary_box`` is
            always the empty string.
        """
        has_text = bool(annotation and annotation.strip())

        if has_text and self.current is not None:
            example = {
                "web_url": self.current["web_url"],
                "web_headline": self.current["web_headline"],
                "summary": self.current["summary"],
                "summary2": annotation,
                "web_text": self.current["web_text"],
                "clean_web_text": self.current["clean_web_text"],
            }

            # dirname("test.jsonl") is "" and os.makedirs("") raises
            # FileNotFoundError, so only create a parent when there is one.
            parent_dir = os.path.dirname(human2_annotation_file)
            if parent_dir:
                os.makedirs(parent_dir, exist_ok=True)
            # Append mode also creates the file when it does not exist yet.
            with open(human2_annotation_file, "a", encoding="utf8") as f:
                print(json.dumps(example, ensure_ascii=False), file=f)

            self.predictions.append(clean_text(annotation))
            self.references.append([clean_text(example["summary"])])

            huggingface_hub.upload_file(
                repo_id="Iker/NoticIA_Human_Validation",
                repo_type="dataset",
                token=self._token(),
                path_in_repo="test.jsonl",
                path_or_fileobj=human2_annotation_file,
            )

            next_headline, next_text = self.get_next()
            return (
                next_headline,
                next_text,
                self.get_rouge(),
                self.progress(),
                "",
            )

        if self.current is not None:
            # Nothing to save: keep showing the same example.
            return (
                self.current["web_headline"],
                self.current["web_text"],
                self.get_rouge(),
                self.progress(),
                "",
            )

        if self.dataset:
            # Annotation has not been started yet.
            return (
                self._PRESS_PLAY,
                self._PRESS_PLAY,
                self._PRESS_PLAY,
                self.progress(),
                "",
            )

        # No current example and nothing left in the queue.
        return (
            self._FINISHED,
            self._FINISHED,
            self.get_rouge(),
            self.progress(),
            "",
        )

    def get_rouge(self):
        """Return the aggregated ROUGE-1 score, or ``"N/A"`` if it cannot be computed."""
        try:
            # A fresh random experiment_id is passed on every call.
            # NOTE(review): presumably this avoids clashes between concurrent
            # metric instances -- confirm against the `evaluate` docs.
            experiment_id = "".join(
                random.choice(string.ascii_uppercase + string.digits) for _ in range(6)
            )
            rouge = load("rouge", experiment_id=experiment_id)

            return rouge.compute(
                predictions=self.predictions,
                references=self.references,
                use_aggregator=True,
                rouge_types=["rouge1"],
            )["rouge1"]
        except Exception:
            return "N/A"

    def progress(self) -> str:
        """Return the progress-bar HTML for the annotations completed so far."""
        # The example on screen has been popped from the queue but is not
        # annotated yet, so it must not count as completed.
        in_flight = 1 if self.current is not None else 0
        completed = self.total - len(self.dataset) - in_flight
        return html_progress_bar(completed, self.total)

    def gr_start(self):
        """Reload all state from the hub and show the first pending example.

        Returns:
            The same 5-tuple shape as ``save_annotation``.
        """
        # Re-running __init__ re-downloads the dataset and stored annotations.
        self.__init__()
        headline, text = self.get_next()
        return headline, text, self.get_rouge(), self.progress(), ""


# Visual theme for the app: Soft base theme with emerald/red hues, small text
# and spacing, Poppins in every font slot, and custom block backgrounds.
theme = gr.themes.Soft(
    primary_hue="emerald",
    secondary_hue="red",
    text_size="sm",
    spacing_size="sm",
    font=[
        gr.themes.GoogleFont("Poppins"),
        gr.themes.GoogleFont("Poppins"),
        gr.themes.GoogleFont("Poppins"),
        gr.themes.GoogleFont("Poppins"),
    ],
).set(block_background_fill="*neutral_50", block_background_fill_dark="*neutral_950")

# Built at import time: the constructor loads the dataset and tries to download
# the stored annotations. This one instance backs every event handler below.
manager = AnnotationManager()


# UI layout: a guidelines tab plus the annotation tab. Components are created
# in display order, so the statement order here defines the page layout.
with gr.Blocks(
    theme=theme, title="🖱️ Resumen de noticias Clickbait 🖱️", analytics_enabled=False,
) as demo:
    with gr.Tab("Guidelines") as tab_guidelines:
        gr.Markdown(guidelines)

    with gr.Tab("Anotación") as tab_annotation:
        gr_play = gr.Button("▶️ Empieza a anotar")

        # Initial progress reflects the annotations found when `manager` was built.
        gr_progress = gr.HTML(value=manager.progress(), label="Progreso")

        # Read-only fields below show a "press play" placeholder until the
        # play button fills them in.
        gr_rouge = gr.Textbox(
            value="Pulsa ▶️",
            label="Rouge-1",
            info="Rouge Score actual entre las anotaciones y los resúmenes de referencia.",
            lines=1,
            interactive=False,
        )

        gr_headline = gr.Textbox(
            value="Pulsa ▶️",
            label="Titular",
            info="El titular del artículo.",
            lines=2,
            interactive=False,
        )

        gr_body = gr.Textbox(
            value="Pulsa ▶️",
            label="Artículo",
            info="El cuerpo del artículo/noticia.",
            lines=10,
            interactive=False,
        )

        # The only editable field: the annotator's summary.
        gr_summary = gr.Textbox(
            value="",
            label="Resumen",
            info="Escribe aquí el resumen del artículo. Recuerda leer las guidelines antes de empezar.",
            lines=2,
            interactive=True,
        )

        save = gr.Button(
            "💾 Guardar",
        )

        # Both handlers return a 5-tuple that maps, in order, onto
        # headline, body, rouge, progress and the (cleared) summary box.
        # NOTE(review): concurrency_limit=None presumably lifts the per-event
        # concurrency cap -- confirm against the Gradio docs, since both
        # handlers mutate the single shared `manager`.
        save.click(
            fn=manager.save_annotation,
            inputs=[gr_summary],
            outputs=[gr_headline, gr_body, gr_rouge, gr_progress, gr_summary],
            concurrency_limit=None,
        )

        # Play re-initializes the manager and shows the first pending example.
        gr_play.click(
            fn=manager.gr_start,
            inputs=None,
            outputs=[gr_headline, gr_body, gr_rouge, gr_progress, gr_summary],
            concurrency_limit=None,
        )

    demo.queue(default_concurrency_limit=None)


# The "pass" environment variable is used as both the login username and the
# password. NOTE(review): if it is unset this becomes auth=(None, None) --
# confirm how Gradio treats that.
demo.launch(auth=(os.environ.get("pass"), os.environ.get("pass")))