# Streamlit app for reviewing dataset examples and flagging bad ones.
# (Scraped page header: last commit 0f43f50 "flagging to datasets" by ola13;
# file size 3.26 kB.)
import json
import math
import os
import uuid
from functools import partial
import jsonlines
import streamlit as st
import streamlit.components.v1 as components
from huggingface_hub import HfApi
# Directory where the per-dataset "<dataset>_bad_examples.jsonl" files are kept.
BAD_EXAMPLES_PATH = "bad_examples"
# Directory holding the per-dataset "<dataset>_examples_with_stats.json" inputs.
DATA_PATH = "data"
def report_result_dataset(dataset, docid, text, metadata, reason, annotator):
    """Upload a one-record JSON Lines report to the feedback dataset repo.

    The record is written to a ``report.jsonl`` file and uploaded to the
    ``HuggingFaceGECLM/data_feedback`` dataset repository under a unique
    ``report-<uuid4>.jsonl`` name. The upload token is read from the
    ``geclm_token`` environment variable.

    Args:
        dataset: Name of the dataset the flagged document belongs to.
        docid: Identifier of the flagged document.
        text: Text of the flagged document.
        metadata: Extra information about the document (stored as given).
        reason: Why the document was flagged.
        annotator: Who flagged the document.
    """
    # Function-scope import so this block does not depend on the file's
    # top-level import list.
    import tempfile

    record = {
        "dataset": dataset,
        "docid": docid,
        "text": text,
        "metadata": metadata,
        "reason": reason,
        "annotator": annotator,
    }

    # Use a private temporary directory per call: a fixed "report.jsonl" in the
    # working directory could be overwritten by a concurrent session between
    # the write and the upload.
    with tempfile.TemporaryDirectory() as tmp_dir:
        report_path = os.path.join(tmp_dir, "report.jsonl")
        with jsonlines.open(report_path, "w") as writer:
            writer.write(record)

        api = HfApi()
        api.upload_file(
            path_or_fileobj=report_path,
            path_in_repo="report-{}.jsonl".format(uuid.uuid4()),
            repo_id="HuggingFaceGECLM/data_feedback",
            repo_type="dataset",
            token=os.environ.get("geclm_token"),
        )
def load_jsonl(file_path):
    """Read a JSON Lines file and return its records as a list.

    Args:
        file_path: Path of the file to read; each non-blank line must hold one
            JSON document.

    Returns:
        A list with one decoded object per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # JSON text is UTF-8 by specification, so do not rely on the platform
    # default encoding.
    with open(file_path, "r", encoding="utf-8") as f:
        # Blank lines (e.g. a trailing empty line) carry no record; skipping
        # them avoids a spurious JSONDecodeError.
        return [json.loads(line) for line in f if line.strip()]
# Begin at the first example when this session has no stored position yet.
if not ("idx" in st.session_state):
    st.session_state["idx"] = 0
def get_next_item():
    """Move the session's example index forward by one."""
    following = st.session_state.idx + 1
    st.session_state.idx = following
def save_flag_and_get_next_item(sample, issue):
    """Record ``sample`` as a bad example, report it, and advance the index.

    The sample, with an added ``"issue"`` field, is appended as one JSON line
    to the bad-examples file of the currently selected ``dataset``. It is then
    passed to ``report_result_dataset`` and ``get_next_item`` is called.

    Args:
        sample: Dict describing the flagged example; must contain ``"text"``.
            The dict is left unmodified.
        issue: Description of the problem. ``None`` or ``""`` is stored as the
            string ``"None"``.

    Raises:
        KeyError: If ``sample`` has no ``"text"`` key.
    """
    if issue is None or issue == "":
        issue = "None"

    # Build new dicts instead of adding/popping keys on the caller's sample, so
    # the object handed in by the caller is not altered as a side effect.
    flagged = {**sample, "issue": issue}
    with open(f"{BAD_EXAMPLES_PATH}/{dataset}_bad_examples.jsonl", "a") as f:
        f.write(json.dumps(flagged) + "\n")

    text = sample["text"]
    # Everything except the text and the issue is reported as metadata.
    metadata = {
        key: value
        for key, value in sample.items()
        if key not in ("text", "issue")
    }

    # Prefer "id" as the document identifier, fall back to "title", and use an
    # empty string when neither is present.
    if "id" in sample:
        sample_id = sample["id"]
    else:
        sample_id = sample.get("title", "")

    report_result_dataset(dataset, sample_id, text, str(metadata), issue, "")
    get_next_item()
# Datasets offered in the sidebar; each name is used to build the input path
# "<DATA_PATH>/<name>_examples_with_stats.json".
datasets = [
    "gutenberg_raw",
    "stackexchange2",
    "bigcode_python_code",
    "bigcode_python_github_issues",
    "bigcode_python_jupyter_scripts_dedup_filtered",
    "books3",
    "c4",
    "s2orc_raw",
    "reddit_threaded",
    "cc_filtered_text",
]
dataset = st.sidebar.selectbox("Dataset", datasets)
data = load_jsonl(f"{DATA_PATH}/{dataset}_examples_with_stats.json")

# Create the bad-examples file if it does not exist. The directory is created
# first because opening in append mode fails when the directory is missing.
os.makedirs(BAD_EXAMPLES_PATH, exist_ok=True)
with open(f"{BAD_EXAMPLES_PATH}/{dataset}_bad_examples.jsonl", "a") as f:
    pass

# Deleting the key makes the session-state initialisation at the top of the
# script start again from the first example on the next run.
st.sidebar.button("Reset Index", on_click=lambda: st.session_state.__delitem__("idx"))

with open(f"{BAD_EXAMPLES_PATH}/{dataset}_bad_examples.jsonl", "r+") as f:
    st.sidebar.download_button(
        "Download bad example JSON file", f, file_name=f"{dataset}_bad_examples.jsonl"
    )

# Opening in "w" mode truncates the file, which empties it.
st.sidebar.button(
    "Clear bad examples file",
    on_click=lambda: open(
        f"{BAD_EXAMPLES_PATH}/{dataset}_bad_examples.jsonl", "w"
    ).close(),
)


def _save_flag_with_submitted_issue(sample):
    """Flag ``sample`` using the issue text submitted with the form.

    Callback arguments are fixed when the button is rendered, i.e. before the
    user has typed anything, so the text is read from session state (by widget
    key) when the callback actually runs.
    """
    save_flag_and_get_next_item(sample, st.session_state.get("issue_text", ""))


if st.session_state.idx >= len(data):
    # The index can run past the end after the last example, or when switching
    # to a dataset with fewer examples; indexing ``data`` would raise here.
    st.info(
        "No more examples to review in this dataset. "
        "Use 'Reset Index' in the sidebar to start over."
    )
else:
    with st.form(key="bad_form", clear_on_submit=True):
        sample = data[st.session_state.idx]
        text = sample["text"]
        st.text_area(f"text id: {st.session_state.idx}", text, height=500)
        issue = st.text_input(
            "What's wrong with this example? (leave blank if example is fine)",
            key="issue_text",
        )
        good = st.form_submit_button(
            "GOOD",
            on_click=get_next_item,
        )
        bad = st.form_submit_button(
            "BAD",
            on_click=_save_flag_with_submitted_issue,
            args=(sample,),
        )