Spaces:

HuggingFaceGECLM
/

dataset_explorer

Runtime error

File size: 6,297 Bytes

import html
import os
import tempfile
import uuid
from datetime import datetime
from pprint import pprint

import gradio as gr
import jsonlines
from huggingface_hub import HfApi


# Names of the datasets offered in the UI dropdown. Each name is also used to
# look up that dataset's example generator in ``line_generators``.
datasets = [
    "gutenberg_raw",
    "stackexchange2",
    "bigcode_python_code",
    "bigcode_python_github_issues",
    "bigcode_python_jupyter_scripts_dedup_filtered",
    "books3",
    "c4",
    "s2orc_raw",
    "reddit_threaded",
    "cc_filtered_text",
]


def line_generator(dataset):
    """Lazily yield the example records stored for ``dataset``.

    Records are read from ``data/<dataset>_examples_with_stats.json``, which
    is parsed with ``jsonlines`` (one JSON document per line).

    Args:
        dataset: Name of the dataset; expected to be one of ``datasets``.

    Yields:
        Each record of the examples file, in file order. Nothing is yielded
        for a name that is not listed in ``datasets``.

    Raises:
        Whatever ``jsonlines.open`` raises when the examples file cannot be
        opened; the error surfaces on the first ``next()`` call because the
        file is only opened once iteration starts.
    """
    # Unknown names yield an empty generator rather than trying to open a
    # file that was never shipped.
    if dataset not in datasets:
        return

    # Deriving the path from the name keeps every dataset on its own file.
    # The previous hand-written branches made "cc_filtered_text" read the
    # "reddit_threaded" examples by mistake.
    path = "data/{}_examples_with_stats.json".format(dataset)
    with jsonlines.open(path, "r") as reader:
        yield from reader


# One generator per dataset, built once when the module is loaded. The UI
# callbacks advance these with next(), so each generator is module-level
# shared state and can only be consumed once.
line_generators = {name: line_generator(name) for name in datasets}


def send_report(sample, dataset, reason, annotator, campaign):
    """Upload a flagging report for ``sample`` to the feedback dataset repo.

    The report is written as a single JSON Lines record and uploaded to the
    ``HuggingFaceGECLM/data_feedback`` dataset repository under a unique
    ``report-<uuid4>.jsonl`` name.

    Args:
        sample: Mapping describing the flagged example. Must contain a
            ``"text"`` key; every other key is stored as report metadata.
        dataset: Name of the dataset the sample came from.
        reason: Free-text reason given for flagging the sample.
        annotator: Name entered by the annotator (may be empty).
        campaign: Annotation campaign name (may be empty).

    Raises:
        KeyError: If ``sample`` has no ``"text"`` entry.
    """
    text = sample["text"]
    # Build the metadata from a copy: ``sample`` belongs to the caller (it is
    # the UI's current-sample state) and must not lose its "text" entry.
    metadata = {key: value for key, value in sample.items() if key != "text"}

    # Prefer an explicit id, fall back to the title, otherwise leave it blank.
    if "id" in sample:
        sample_id = sample["id"]
    elif "title" in sample:
        sample_id = sample["title"]
    else:
        sample_id = ""

    record = {
        "dataset": dataset,
        "docid": sample_id,
        "text": text,
        "metadata": metadata,
        "reason": reason,
        "annotator": annotator,
        "campaign": campaign,
        "timestamp": str(datetime.now()),
    }

    # Write to a per-call temporary directory instead of a fixed
    # "report.jsonl" in the working directory, so that two reports sent at
    # the same time cannot overwrite each other before being uploaded.
    with tempfile.TemporaryDirectory() as tmp_dir:
        report_path = os.path.join(tmp_dir, "report.jsonl")
        with jsonlines.open(report_path, "w") as writer:
            writer.write(record)

        api = HfApi()
        api.upload_file(
            path_or_fileobj=report_path,
            path_in_repo="report-{}.jsonl".format(uuid.uuid4()),
            repo_id="HuggingFaceGECLM/data_feedback",
            repo_type="dataset",
            token=os.environ.get("geclm_token"),
        )


# Markdown blurb passed to ``gr.Markdown`` when the UI is built in the
# ``__main__`` block below.
description = """
GecLM annotations. All annotations are recorded in the [data_feedback](https://huggingface.co/datasets/HuggingFaceGECLM/data_feedback) dataset.
"""


if __name__ == "__main__":
    # Placeholder of the flagging-reason box; reused when the box is reset
    # after a report has been sent.
    REASON_PLACEHOLDER = (
        "Provide the reason for flagging if you think the sample is bad."
    )
    # Shown instead of a sample once a dataset's generator has run dry.
    EXHAUSTED_MESSAGE = (
        "No more examples left in this dataset. Please pick another dataset."
    )

    demo = gr.Blocks()

    with demo:
        # Holds the record currently displayed, so it can be reported.
        current_sample_state = gr.State(dict())

        gr.Markdown(value=description)
        with gr.Row():
            annotator = gr.Textbox(
                lines=1,
                max_lines=1,
                placeholder="Optionally provide your name here if you'd like it to be recorded.",
                label="Annotator",
            )
            campaign = gr.Textbox(
                lines=1,
                max_lines=1,
                placeholder="Optionally provide the name of the annotation campaign for ease of filtering the reports.",
                label="Annotation campaign",
            )
        with gr.Row():
            dataset = gr.Dropdown(
                choices=datasets, value="Pick a dataset below", label="Dataset",
            )
        # The flagging controls and the sample view stay hidden until a
        # dataset has been picked and a first sample is available.
        with gr.Row():
            reason_txt = gr.Textbox(
                label="Flagging reason",
                placeholder=REASON_PLACEHOLDER,
                visible=False,
            )
        with gr.Row():
            bad_btn = gr.Button("Bad ❌", visible=False)
            good_btn = gr.Button("Next ✅", visible=False)
        with gr.Row():
            text = gr.Markdown(visible=False)

        def render_sample(sample):
            """Return the sample's text wrapped in an HTML ``<pre>`` block."""
            # Escape the text so that markup inside a sample (HTML pages,
            # source code with "<" / "&", a literal "</pre>") is displayed
            # verbatim instead of being interpreted by the Markdown view.
            return "<pre>" + html.escape(sample["text"], quote=False) + "</pre>"

        def advance(dataset_name, reset_reason):
            """Fetch the next sample and build the updates for the UI.

            Returns the updates in the order
            ``[text, current_sample_state, reason_txt, good_btn, bad_btn]``.
            When the dataset has no samples left, a notice is shown and the
            flagging controls are hidden instead of raising StopIteration.
            """
            sample = next(line_generators[dataset_name], None)
            has_sample = sample is not None

            if has_sample:
                shown_text = render_sample(sample)
            else:
                shown_text = EXHAUSTED_MESSAGE

            reason_update = {"visible": has_sample}
            if reset_reason:
                reason_update["value"] = ""
                reason_update["placeholder"] = REASON_PLACEHOLDER

            return [
                gr.update(value=shown_text, visible=True),
                sample if has_sample else dict(),
                gr.update(**reason_update),
                gr.update(visible=has_sample),
                gr.update(visible=has_sample),
            ]

        def next_line(dataset_name):
            """Show the next sample of the selected dataset."""
            return advance(dataset_name, reset_reason=False)

        def bad_line(current_sample, dataset_name, reason, annotator_name, campaign_name):
            """Report the current sample as bad, then show the next one."""
            send_report(
                current_sample, dataset_name, reason, annotator_name, campaign_name
            )
            return advance(dataset_name, reset_reason=True)

        # All three events refresh the same set of components.
        sample_outputs = [text, current_sample_state, reason_txt, good_btn, bad_btn]

        good_btn.click(next_line, inputs=dataset, outputs=sample_outputs)
        dataset.change(next_line, inputs=dataset, outputs=sample_outputs)
        bad_btn.click(
            bad_line,
            inputs=[current_sample_state, dataset, reason_txt, annotator, campaign],
            outputs=sample_outputs,
        )

    # NOTE(review): newer Gradio releases may no longer accept ``enable_queue``
    # in ``launch()`` — confirm against the Gradio version pinned for this app.
    demo.launch(enable_queue=False, debug=True)