Spaces:

HuggingFaceGECLM
/

random_dataset_exploration

Sleeping

App Files Files Community

random_dataset_exploration / app.py

ola13

flagging to datasets

0f43f50 over 2 years ago

raw

history blame contribute delete

3.26 kB

	import json
	import math
	import os
	import uuid
	from functools import partial

	import jsonlines
	import streamlit as st
	import streamlit.components.v1 as components
	from huggingface_hub import HfApi

	# Directory holding the per-dataset "<dataset>_bad_examples.jsonl" files of flagged samples.
	BAD_EXAMPLES_PATH = "bad_examples"
	# Directory holding the per-dataset "<dataset>_examples_with_stats.json" input files.
	DATA_PATH = "data"


	def report_result_dataset(dataset, docid, text, metadata, reason, annotator):
	    """Publish one flagged example to the ``HuggingFaceGECLM/data_feedback`` dataset repo.

	    The report is serialised as a single JSON line into the local file
	    ``report.jsonl`` (overwritten on every call). That file is then uploaded
	    under a fresh ``report-<uuid4>.jsonl`` name, using the token read from the
	    ``geclm_token`` environment variable.
	    """
	    report_path = "report.jsonl"
	    record = {
	        "dataset": dataset,
	        "docid": docid,
	        "text": text,
	        "metadata": metadata,
	        "reason": reason,
	        "annotator": annotator,
	    }

	    # The writer must be closed before the upload so the line is on disk.
	    with jsonlines.open(report_path, "w") as writer:
	        writer.write(record)

	    # A random name per report keeps uploads from overwriting each other remotely.
	    remote_name = f"report-{uuid.uuid4()}.jsonl"
	    HfApi().upload_file(
	        path_or_fileobj=report_path,
	        path_in_repo=remote_name,
	        repo_id="HuggingFaceGECLM/data_feedback",
	        repo_type="dataset",
	        token=os.environ.get("geclm_token"),
	    )


	def load_jsonl(file_path):
	    """Read a JSON Lines file and return its records as a list.

	    Args:
	        file_path: Path of the file to read; each non-blank line must hold one
	            JSON document.

	    Returns:
	        A list with one decoded object per non-blank line, in file order.

	    Raises:
	        OSError: If the file cannot be opened.
	        json.JSONDecodeError: If a non-blank line is not valid JSON.
	    """
	    # Decode explicitly as UTF-8 so the result does not depend on the host locale,
	    # and skip blank lines (e.g. a trailing newline) instead of crashing on them.
	    with open(file_path, "r", encoding="utf-8") as f:
	        return [json.loads(line) for line in f if line.strip()]


	# First run of a session: start browsing at the first example.
	if "idx" not in st.session_state:
	    st.session_state["idx"] = 0


	def get_next_item():
	    """Advance the session's example index by one."""
	    st.session_state["idx"] = st.session_state["idx"] + 1


	def save_flag_and_get_next_item(sample, issue):
	    """Record ``sample`` as a bad example, report it upstream, and move on.

	    The sample, with the reviewer's ``issue`` attached, is appended to the
	    local ``<dataset>_bad_examples.jsonl`` file for the currently selected
	    ``dataset`` (a module-level name). It is then forwarded through
	    ``report_result_dataset`` and the session index is advanced.

	    Args:
	        sample: Mapping describing the example; must contain a ``"text"`` key.
	            It is not modified.
	        issue: Reviewer's description of the problem. ``None`` or an empty
	            string is recorded as the literal string ``"None"``.

	    Raises:
	        KeyError: If ``sample`` has no ``"text"`` key.
	    """
	    if issue is None or issue == "":
	        issue = "None"

	    # Work on copies so the caller's dict keeps its "text" and gains no "issue".
	    flagged = dict(sample)
	    flagged["issue"] = issue
	    with open(f"{BAD_EXAMPLES_PATH}/{dataset}_bad_examples.jsonl", "a") as f:
	        f.write(json.dumps(flagged) + "\n")

	    text = sample["text"]
	    # Everything except the text and the issue travels as stringified metadata.
	    metadata = {k: v for k, v in sample.items() if k not in ("text", "issue")}
	    # Prefer the explicit id, fall back to the title, else leave it empty.
	    sample_id = metadata.get("id", metadata.get("title", ""))

	    report_result_dataset(dataset, sample_id, text, str(metadata), issue, "")

	    get_next_item()


	# Names of the datasets that can be browsed; each maps to one examples file.
	datasets = [
	    "gutenberg_raw",
	    "stackexchange2",
	    "bigcode_python_code",
	    "bigcode_python_github_issues",
	    "bigcode_python_jupyter_scripts_dedup_filtered",
	    "books3",
	    "c4",
	    "s2orc_raw",
	    "reddit_threaded",
	    "cc_filtered_text",
	]
	# The sidebar choice decides which examples file is loaded for this run.
	dataset = st.sidebar.selectbox(label="Dataset", options=datasets)
	examples_path = f"{DATA_PATH}/{dataset}_examples_with_stats.json"
	data = load_jsonl(examples_path)

	# Per-dataset file collecting the flagged examples.
	bad_examples_file = f"{BAD_EXAMPLES_PATH}/{dataset}_bad_examples.jsonl"

	# Opening in append mode creates the file if it does not exist yet and leaves
	# any existing content untouched.
	open(bad_examples_file, "a").close()


	def _reset_index():
	    """Drop ``idx`` from the session state."""
	    del st.session_state["idx"]


	def _clear_bad_examples():
	    """Truncate the bad-examples file of the selected dataset."""
	    open(bad_examples_file, "w").close()


	st.sidebar.button("Reset Index", on_click=_reset_index)

	with open(bad_examples_file, "r+") as f:
	    st.sidebar.download_button(
	        "Download bad example JSON file",
	        f,
	        file_name=f"{dataset}_bad_examples.jsonl",
	    )

	st.sidebar.button("Clear bad examples file", on_click=_clear_bad_examples)

	def _flag_displayed_sample(sample):
	    """Flag ``sample`` using the issue text submitted with the form.

	    Callback arguments are bound when the button is rendered, i.e. before the
	    reviewer has typed anything. The text is therefore read from the session
	    state at click time instead of being passed through ``args``.
	    """
	    save_flag_and_get_next_item(sample, st.session_state.get("issue_text", ""))


	if st.session_state.idx >= len(data):
	    # Every example has been reviewed; indexing ``data`` would raise IndexError.
	    st.info("No more examples in this dataset. Use 'Reset Index' to start over.")
	else:
	    with st.form(key="bad_form", clear_on_submit=True):
	        sample = data[st.session_state.idx]
	        text = sample["text"]
	        st.text_area(f"text id: {st.session_state.idx}", text, height=500)

	        # The key exposes the submitted text to the BAD callback via session state.
	        issue = st.text_input(
	            "What's wrong with this example? (leave blank if example is fine)",
	            key="issue_text",
	        )

	        good = st.form_submit_button(
	            "GOOD",
	            on_click=get_next_item,
	        )
	        bad = st.form_submit_button(
	            "BAD",
	            on_click=_flag_displayed_sample,
	            args=(sample,),
	        )