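# NOTE: the four lines below appear to be web-page header text captured with this
# file (author, commit message, commit hash); they are not part of the program.
# davanstrien's picture
# davanstrien HF staff
# add card data
# 4b1895a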
from io import StringIO
import gradio as gr
import pandas as pd
from datasets import ClassLabel, Dataset, Image
from httpx import Client
from huggingface_hub import DatasetCard
# HTTP client created once at import time and reused for the Label Studio
# export request made in push_annotations_to_hub().
client = Client()
# Connection details entered in the UI. update_user_data() writes the keys
# "api_key", "space_url", "hub_api_key" and "hub_dataset_id" into this dict.
# NOTE(review): this is module-level state, so it is presumably shared by
# every visitor of the running app — confirm that this is intended.
USER_DATA = {}
def update_user_data(api_key, space_url, hub_api_key, hub_dataset_id):
    """Record the submitted connection details in the module-level USER_DATA.

    Args:
        api_key: Label Studio API token.
        space_url: Base URL of the Label Studio instance.
        hub_api_key: Hugging Face Hub token.
        hub_dataset_id: Hub repository id the dataset is pushed to.

    Returns:
        None. Existing entries under the same keys are overwritten.
    """
    USER_DATA.update(
        api_key=api_key,
        space_url=space_url,
        hub_api_key=hub_api_key,
        hub_dataset_id=hub_dataset_id,
    )
def check_user_data():
    """Return True when a non-empty API key and Space URL are both stored.

    Only the Label Studio details are checked; the Hub entries in USER_DATA
    are not inspected here.
    """
    required_keys = ("api_key", "space_url")
    return all(USER_DATA.get(key) for key in required_keys)
# def list_projects():
# headers = {"Authorization": f'Token {USER_DATA["api_key"]}'}
# resp = client.get(
# "https://davanstrien-label-studio.hf.space/api/projects/", headers=headers
# )
# return resp.json()
# def get_column_names():
# headers = {"Authorization": f'Token {USER_DATA["api_key"]}'}
# print(headers)
# # resp = client.get(
# # "http://davanstrien-label-studio.hf.space/api/projects/1/export?exportType=CSV",
# # headers=headers,
# # )
# resp = requests.get(
# "http://davanstrien-label-studio.hf.space/api/projects/1/export?exportType=CSV",
# headers=headers,
# )
# return pd.read_csv(StringIO(resp.text)).columns.tolist()
def convert_value(value: int) -> str:
    """Translate a count into its size-bucket label.

    Each bucket is selected by a strict ``value < upper_bound`` comparison, so
    a value sitting exactly on a boundary (e.g. 1_000) belongs to the next
    bucket up. Anything at or above one trillion maps to ``"n>1T"``.

    Args:
        value: The count to classify.

    Returns:
        The bucket label, e.g. ``"n<1K"`` or ``"10K<n<100K"``.
    """
    # (exclusive upper bound, label) pairs in ascending order.
    buckets = (
        (1_000, "n<1K"),
        (10_000, "1K<n<10K"),
        (100_000, "10K<n<100K"),
        (1_000_000, "100K<n<1M"),
        (10_000_000, "1M<n<10M"),
        (100_000_000, "10M<n<100M"),
        (1_000_000_000, "100M<n<1B"),
        (10_000_000_000, "1B<n<10B"),
        (100_000_000_000, "10B<n<100B"),
        (1_000_000_000_000, "100B<n<1T"),
    )
    for upper_bound, label in buckets:
        if value < upper_bound:
            return label
    return "n>1T"
def push_annotations_to_hub(project_id, input_column, input_column_type, label_column):
    """Export a Label Studio project as CSV and push it to the Hub as a dataset.

    The CSV export is fetched from the configured Label Studio instance, loaded
    into a ``datasets.Dataset``, the label column is cast to ``ClassLabel`` and
    (for image inputs) the input column to ``Image``. The dataset is pushed to
    the configured Hub repository and its dataset card is tagged and given a
    size category.

    Args:
        project_id: Label Studio project id; converted with ``int()``.
        input_column: Name of the CSV column holding the input data.
        input_column_type: ``"image"`` casts ``input_column`` to ``Image``;
            any other value leaves the column as exported.
        label_column: Name of the CSV column holding the annotation label.

    Returns:
        A pandas DataFrame with the first five rows of the pushed dataset.

    Raises:
        gr.Error: If the Label Studio or Hub details have not been submitted.
        httpx.HTTPStatusError: If the export request returns an error status.
    """
    # Surface missing credentials as a readable message instead of a KeyError.
    if not check_user_data() or not USER_DATA.get("hub_dataset_id"):
        raise gr.Error(
            "Please fill in and submit the Label Studio and Hub details first."
        )

    # Strip trailing slashes so a URL copied with one does not give "//api/...".
    base_url = USER_DATA["space_url"].rstrip("/")
    headers = {"Authorization": f'Token {USER_DATA["api_key"]}'}
    resp = client.get(
        f"{base_url}/api/projects/{int(project_id)}/export?exportType=CSV",
        headers=headers,
    )
    # Fail loudly on HTTP errors rather than parsing an error page as CSV.
    resp.raise_for_status()
    df = pd.read_csv(StringIO(resp.text))

    # pandas reads empty cells as NaN; NaN is not a usable class name, so it is
    # kept out of the label list (the rows themselves are left in the dataset).
    labels = df[label_column].dropna().unique().tolist()
    ds = Dataset.from_pandas(df)
    ds = ds.cast_column(label_column, ClassLabel(names=labels))
    if input_column_type == "image":
        ds = ds.cast_column(input_column, Image())

    repo_id = USER_DATA["hub_dataset_id"]
    # An empty token field becomes None rather than being sent as "".
    token = USER_DATA.get("hub_api_key") or None
    ds.push_to_hub(repo_id, token=token)

    # Use the same token for the card as for the dataset push; without it the
    # card load/push relies on whatever credentials the host happens to have.
    card = DatasetCard.load(repo_id, token=token)
    card.data.tags = ["label-studio-exported"]
    card.data.size_categories = [convert_value(len(ds))]
    card.push_to_hub(repo_id, token=token, repo_type="dataset")
    return ds.to_pandas().head(5)
# Gradio UI: one form for the Label Studio / Hub connection details, one for
# the export options, and a preview table filled by push_annotations_to_hub.
with gr.Blocks() as demo:
    gr.Markdown("# Push label studio datasets to the hub")
    gr.Markdown(
        "This is a proof of concept app which provides a GUI for exporting data from"
        " Label Studio and pushing the loaded dataset to the Hugging Face Hub"
    )
    with gr.Row():
        with gr.Column():
            with gr.Row():
                gr.Markdown("## Label Studio details")
            with gr.Row():
                gr.Markdown(
                    "Enter your Label Studio API key, you can find this under settings."
                )
            with gr.Row():
                API_KEY = gr.Textbox(
                    type="password",
                    label="Label Studio API Key",
                )
            with gr.Row():
                with gr.Row():
                    gr.Markdown(
                        "Space URL, this can be found by clicking on the three dots"
                        " button on your space and copying the URL shown after clicking"
                        " the Embed Space button"
                    )
            with gr.Row():
                # The example is a placeholder, not the initial value: the first
                # positional argument of gr.Textbox is `value`, which would
                # pre-fill the field with "e.g. ..." and submit it as the URL.
                SPACE_URL = gr.Textbox(
                    label="Space URL",
                    placeholder="e.g. https://davanstrien-label-studio.hf.space",
                )
        with gr.Column():
            gr.Markdown("## Hub Dataset info")
            gr.Markdown(
                """Enter a Hub [API key](https://huggingface.co/settings/tokens) with write access and the name you would like to use for your dataset"""
            )
            HUB_API_KEY = gr.Textbox(
                type="password",
                label="Hub API Key",
            )
            with gr.Row():
                gr.Markdown("Name of the dataset you would like to create")
            with gr.Row():
                # Same fix as for the Space URL; the placeholder now shows a
                # dataset id instead of a copy-pasted URL.
                HUB_DATASET_ID = gr.Textbox(
                    label="Dataset name",
                    placeholder="e.g. davanstrien/dataset_name",
                )
    # Stores the four values in USER_DATA; no output component is updated.
    button = gr.Button("Submit details")
    button.click(update_user_data, [API_KEY, SPACE_URL, HUB_API_KEY, HUB_DATASET_ID])
    with gr.Row():
        project_id = gr.Number(1, label="Project ID")
        input_column = gr.Textbox("text", type="text", label="Input column")
        input_column_type = gr.Dropdown(
            choices=["text", "image"], label="Input column type", value="text"
        )
        label_column = gr.Textbox("choice", type="text", label="Label column")
    # `button` is deliberately rebound: the first button's click handler is
    # already registered above.
    button = gr.Button("Push annotations to Hub")
    with gr.Row():
        gr.Markdown("## Preview of your dataset")
    with gr.Row():
        preview = gr.DataFrame()
    button.click(
        push_annotations_to_hub,
        [
            project_id,
            input_column,
            input_column_type,
            label_column,
        ],
        preview,
    )
demo.launch(debug=True)