ola13's picture
Init space
de3513e
raw
history blame
6.09 kB
import gradio as gr
import jsonlines
import os
import uuid
from datetime import datetime
from huggingface_hub import HfApi
from pprint import pprint
# Names of the datasets known to the app. The list feeds the "Dataset"
# dropdown and is used to build one sample generator per dataset.
datasets = [
    "gutenberg_raw",
    "stackexchange2",
    "bigcode_python_code",
    "bigcode_python_github_issues",
    "bigcode_python_jupyter_scripts_dedup_filtered",
    "books3",
    "c4",
    "s2orc_raw",
    "reddit_threaded",
    "cc_filtered_text",
]
# Dataset names that have an examples file under data/. A tuple (not a set)
# so membership is decided with ==, which also works for unhashable input.
_EXAMPLE_DATASETS = (
    "gutenberg_raw",
    "stackexchange2",
    "bigcode_python_code",
    "bigcode_python_github_issues",
    "bigcode_python_jupyter_scripts_dedup_filtered",
    "books3",
    "c4",
    "s2orc_raw",
    "reddit_threaded",
    "cc_filtered_text",
)


def line_generator(dataset):
    """Yield the records of the examples file belonging to `dataset`.

    The file is ``data/<dataset>_examples_with_stats.json`` and is opened
    with ``jsonlines.open``. Every dataset reads its own file, including
    ``cc_filtered_text``.

    Args:
        dataset: Name of the dataset whose examples should be read.

    Yields:
        Each object read from the file, in file order. Nothing is yielded
        when `dataset` is not one of the known dataset names.
    """
    if dataset not in _EXAMPLE_DATASETS:
        # Unknown name: behave as an empty generator instead of guessing a path.
        return
    path = "data/{}_examples_with_stats.json".format(dataset)
    with jsonlines.open(path, "r") as f:
        yield from f
# One generator per dataset name, created once at import time; each call to
# next() on an entry continues from where the previous call stopped.
line_generators = dict((name, line_generator(name)) for name in datasets)
def send_report(sample, dataset, reason, annotator):
    """Record a feedback report for `sample` and upload it to the Hub.

    The report is printed, written as a single entry to ``report.jsonl``
    (opened in "w" mode, so earlier content is replaced), and that file is
    uploaded to the ``HuggingFaceGECLM/data_feedback`` dataset repository as
    ``report-<uuid4>.jsonl``. The upload token is read from the
    ``geclm_token`` environment variable.

    Args:
        sample: Mapping describing the flagged example. It must contain a
            ``"text"`` key; all remaining keys are stored as the report's
            metadata. The mapping itself is left unmodified.
        dataset: Name of the dataset the sample came from.
        reason: Reason given for flagging the sample.
        annotator: Name entered by the person flagging the sample.

    Raises:
        KeyError: If `sample` has no ``"text"`` key.
    """
    text = sample["text"]
    # Build the metadata from a copy so the caller's dict keeps its "text".
    metadata = {key: value for key, value in sample.items() if key != "text"}

    # Prefer an explicit id, fall back to the title, otherwise leave it empty.
    if "id" in metadata:
        sample_id = metadata["id"]
    else:
        sample_id = metadata.get("title", "")

    # Built once so the printed and the uploaded report are identical,
    # including the timestamp.
    report = {
        "dataset": dataset,
        "docid": sample_id,
        "text": text,
        "metadata": metadata,
        "reason": reason,
        "annotator": annotator,
        "timestamp": str(datetime.now()),
    }

    print("submitting")
    pprint(report)
    with jsonlines.open("report.jsonl", "w") as f:
        f.write(report)

    token = os.environ.get("geclm_token")
    # Never echo the secret itself into the logs; only report its presence.
    print("geclm_token", "set" if token else "missing")
    api = HfApi()
    api.upload_file(
        path_or_fileobj="report.jsonl",
        path_in_repo="report-{}.jsonl".format(uuid.uuid4()),
        repo_id="HuggingFaceGECLM/data_feedback",
        repo_type="dataset",
        token=token,
    )
if __name__ == "__main__":
    # Builds and launches the Gradio UI: pick a dataset, page through its
    # samples with "Next", or flag the current one with "Bad" plus a reason.

    REASON_PLACEHOLDER = (
        "Provide the reason for flagging if you think the sample is bad."
    )

    def _render(sample):
        """Return the sample's text wrapped in <pre> tags for display."""
        # NOTE(review): the text is inserted without HTML escaping - confirm
        # that this is intended for samples containing markup.
        return "<pre>" + sample["text"] + "</pre>"

    def _next_sample(dataset_name):
        """Return the next sample of `dataset_name` from its generator."""
        try:
            return next(line_generators[dataset_name])
        except StopIteration:
            # A bare StopIteration escaping a callback gives the user no
            # usable feedback; surface a readable error instead.
            raise gr.Error(
                "No more samples left in the '{}' dataset.".format(dataset_name)
            ) from None

    demo = gr.Blocks()
    with demo:
        # Holds the sample currently on screen so "Bad" can report it.
        current_sample_state = gr.State(dict())
        with gr.Row():
            annotator = gr.Textbox(
                lines=1,
                max_lines=1,
                placeholder="Type your name here if you'd like it to be recorded.",
                label="Annotator",
            )
        with gr.Row():
            dataset = gr.Dropdown(
                choices=datasets,
                value="Pick a dataset below",
                label="Dataset",
            )
        # The controls below start hidden and are revealed by next_line once
        # a first sample has been loaded.
        with gr.Row():
            reason_txt = gr.Textbox(
                label="Flagging reason",
                placeholder=REASON_PLACEHOLDER,
                visible=False,
            )
        with gr.Row():
            bad_btn = gr.Button("Bad", visible=False)
            good_btn = gr.Button("Next", visible=False)
        with gr.Row():
            text = gr.Markdown(visible=False)

        def next_line(dataset):
            """Load the next sample of `dataset` and reveal the controls.

            Returns values for: text, current_sample_state, reason_txt,
            good_btn, bad_btn (in that order).
            """
            sample = _next_sample(dataset)
            return [
                gr.update(value=_render(sample), visible=True),
                sample,
                gr.update(visible=True),
                gr.update(visible=True),
                gr.update(visible=True),
            ]

        def bad_line(current_sample, dataset, reason, annotator):
            """Report the current sample, then advance to the next one.

            Returns values for: text, reason_txt, current_sample_state
            (in that order); the reason box is cleared for the next sample.
            """
            # Hand over a copy so the stored state is never modified by
            # the reporting step.
            send_report(dict(current_sample), dataset, reason, annotator)
            sample = _next_sample(dataset)
            return [
                _render(sample),
                gr.update(value="", placeholder=REASON_PLACEHOLDER),
                sample,
            ]

        # "Next" and changing the dataset both simply load the next sample.
        good_btn.click(
            next_line,
            inputs=dataset,
            outputs=[text, current_sample_state, reason_txt, good_btn, bad_btn],
        )
        dataset.change(
            next_line,
            inputs=dataset,
            outputs=[text, current_sample_state, reason_txt, good_btn, bad_btn],
        )
        bad_btn.click(
            bad_line,
            inputs=[current_sample_state, dataset, reason_txt, annotator],
            outputs=[text, reason_txt, current_sample_state],
        )

    demo.launch(enable_queue=False, debug=True)