# Page banner captured together with this file (not part of the program):
# Spaces: Runtime error
import html
import os
import uuid
from datetime import datetime
from pprint import pprint

import gradio as gr
import jsonlines
from huggingface_hub import HfApi
# Names of the datasets offered for annotation. Each name is used as a
# dropdown choice in the UI and as a key into ``line_generators``.
datasets = [
    "gutenberg_raw",
    "stackexchange2",
    "bigcode_python_code",
    "bigcode_python_github_issues",
    "bigcode_python_jupyter_scripts_dedup_filtered",
    "books3",
    "c4",
    "s2orc_raw",
    "reddit_threaded",
    "cc_filtered_text",
]
def line_generator(dataset):
    """Yield the example records of ``dataset``, one decoded JSON line at a time.

    The examples of a dataset are read from
    ``data/<dataset>_examples_with_stats.json`` with ``jsonlines``. The file
    is only opened when the first item is requested, and it is closed once
    the generator is exhausted or closed.

    Args:
        dataset: One of the names listed in ``datasets``.

    Yields:
        Each record of the examples file, in file order.

    A name that is not in ``datasets`` yields nothing.
    """
    if dataset not in datasets:
        return

    # NOTE(review): "cc_filtered_text" used to open the reddit_threaded file,
    # which looked like a copy-paste slip. Confirm that
    # data/cc_filtered_text_examples_with_stats.json ships with the app.
    path = "data/{}_examples_with_stats.json".format(dataset)
    with jsonlines.open(path, "r") as reader:
        yield from reader
# One generator per dataset, created once at import time. The generators are
# module-level, so every call to ``next()`` on an entry advances the same
# iterator no matter which caller triggered it.
line_generators = {name: line_generator(name) for name in datasets}
def send_report(sample, dataset, reason, annotator, campaign):
    """Record a flagged sample and upload it to the feedback dataset repo.

    The report is printed, written to ``report.jsonl`` in the working
    directory, and then uploaded as ``report-<uuid4>.jsonl`` to the
    ``HuggingFaceGECLM/data_feedback`` dataset repository, using the token
    from the ``geclm_token`` environment variable.

    Args:
        sample: Record being flagged. Must contain ``"text"``; every other
            key is stored as the report's metadata. The dict is not modified.
        dataset: Name of the dataset the sample came from.
        reason: Free-text reason given by the annotator.
        annotator: Annotator name (may be empty).
        campaign: Annotation campaign name (may be empty).

    Raises:
        KeyError: If ``sample`` has no ``"text"`` key.
    """
    # Work on a copy so the caller's dict keeps its "text" entry.
    metadata = dict(sample)
    text = metadata.pop("text")

    # Prefer an explicit "id"; fall back to "title"; otherwise leave it empty.
    sample_id = metadata.get("id", metadata.get("title", ""))

    # Build the report once so the printed and uploaded copies are identical,
    # timestamp included.
    report = {
        "dataset": dataset,
        "docid": sample_id,
        "text": text,
        "metadata": metadata,
        "reason": reason,
        "annotator": annotator,
        "campaign": campaign,
        "timestamp": str(datetime.now()),
    }

    print("submitting")
    pprint(report)

    # NOTE: the file name is fixed, so concurrent calls overwrite each other's
    # local copy; the uploaded name below is unique per call.
    with jsonlines.open("report.jsonl", "w") as f:
        f.write(report)

    # The token is deliberately not printed: it is a secret and would end up
    # in the application logs.
    api = HfApi()
    api.upload_file(
        path_or_fileobj="report.jsonl",
        path_in_repo="report-{}.jsonl".format(uuid.uuid4()),
        repo_id="HuggingFaceGECLM/data_feedback",
        repo_type="dataset",
        token=os.environ.get("geclm_token"),
    )
# Markdown shown at the top of the annotation page.
description = """
GecLM annotations. All annotations are recorded in the [data_feedback](https://huggingface.co/datasets/HuggingFaceGECLM/data_feedback) dataset.
"""
if __name__ == "__main__":
    # Build and launch the annotation UI: pick a dataset, read one sample at a
    # time, and either move on ("Next") or flag it with a reason ("Bad").
    demo = gr.Blocks()

    with demo:
        flag_placeholder = (
            "Provide the reason for flagging if you think the sample is bad."
        )

        # Full record of the sample currently displayed, so that "Bad" can
        # report it together with its metadata.
        current_sample_state = gr.State(dict())
        gr.Markdown(value=description)

        with gr.Row():
            annotator = gr.Textbox(
                lines=1,
                max_lines=1,
                placeholder="Optionally provide your name here if you'd like it to be recorded.",
                label="Annotator",
            )
            campaign = gr.Textbox(
                lines=1,
                max_lines=1,
                placeholder="Optionally provide the name of the annotation campaign for ease of filtering the reports.",
                label="Annotation campaign",
            )
        with gr.Row():
            dataset = gr.Dropdown(
                choices=datasets,
                value="Pick a dataset below",
                label="Dataset",
            )
        # The controls below stay hidden until a first sample is displayed.
        with gr.Row():
            reason_txt = gr.Textbox(
                label="Flagging reason",
                placeholder=flag_placeholder,
                visible=False,
            )
        with gr.Row():
            # NOTE(review): the trailing glyph in both labels looks like a
            # mis-decoded emoji in this copy of the file; restore it from the
            # original source if available.
            bad_btn = gr.Button("Bad β", visible=False)
            good_btn = gr.Button("Next β ", visible=False)
        with gr.Row():
            text = gr.Markdown(visible=False)

        def _next_sample(dataset):
            """Return the next record of ``dataset``, starting over when exhausted.

            Raises StopIteration only if the dataset has no records at all.
            """
            try:
                return next(line_generators[dataset])
            except StopIteration:
                # The shared generator ran out: restart from the first record
                # instead of failing the UI callback.
                line_generators[dataset] = line_generator(dataset)
                return next(line_generators[dataset])

        def _render(sample):
            """Return the sample text as a ``<pre>`` block.

            The text is HTML-escaped so markup inside the data is displayed
            literally instead of being interpreted by the page.
            """
            return "<pre>" + html.escape(sample["text"]) + "</pre>"

        def next_line(dataset):
            """Display the next sample and reveal the annotation controls.

            Returns updates for, in order: the text view, the sample state,
            the reason box, the "Next" button and the "Bad" button.
            """
            sample = _next_sample(dataset)
            return [
                gr.update(value=_render(sample), visible=True),
                sample,
                gr.update(visible=True),
                gr.update(visible=True),
                gr.update(visible=True),
            ]

        def bad_line(current_sample, dataset, reason, annotator, campaign):
            """Report the displayed sample as bad, then show the next one.

            Returns updates for, in order: the text view, the reason box
            (cleared) and the sample state.
            """
            send_report(current_sample, dataset, reason, annotator, campaign)
            sample = _next_sample(dataset)
            return [
                _render(sample),
                gr.update(value="", placeholder=flag_placeholder),
                sample,
            ]

        # "Next" and a dataset change both load a fresh sample.
        good_btn.click(
            next_line,
            inputs=dataset,
            outputs=[text, current_sample_state, reason_txt, good_btn, bad_btn],
        )
        dataset.change(
            next_line,
            inputs=dataset,
            outputs=[text, current_sample_state, reason_txt, good_btn, bad_btn],
        )
        bad_btn.click(
            bad_line,
            inputs=[current_sample_state, dataset, reason_txt, annotator, campaign],
            outputs=[text, reason_txt, current_sample_state],
        )

    # NOTE(review): ``enable_queue`` is a deprecated ``launch()`` argument in
    # newer Gradio releases and may be rejected outright; confirm against the
    # Gradio version this app is pinned to.
    demo.launch(enable_queue=False, debug=True)