"""Gradio app that exports a Label Studio project and pushes it to the Hub.

The app downloads a project's annotations as CSV from a Label Studio instance,
turns them into a ``datasets.Dataset`` and pushes that dataset (plus a tagged
dataset card) to the Hugging Face Hub.
"""

from io import StringIO

import gradio as gr
import pandas as pd
from datasets import ClassLabel, Dataset, Image
from httpx import Client, HTTPError
from huggingface_hub import DatasetCard

client = Client()

# NOTE(review): this dict is module-global, so every visitor of the running app
# reads and overwrites the same credentials. Moving it to per-session state
# (e.g. ``gr.State``) would change the callback signatures, so it is only
# flagged here.
USER_DATA = {}

# Upper bounds (exclusive) paired with the ``size_categories`` value used in
# dataset card metadata. Anything at or above the last bound is "n>1T".
_SIZE_CATEGORIES = (
    (1_000, "n<1K"),
    (10_000, "1K<n<10K"),
    (100_000, "10K<n<100K"),
    (1_000_000, "100K<n<1M"),
    (10_000_000, "1M<n<10M"),
    (100_000_000, "10M<n<100M"),
    (1_000_000_000, "100M<n<1B"),
    (10_000_000_000, "1B<n<10B"),
    (100_000_000_000, "10B<n<100B"),
    (1_000_000_000_000, "100B<n<1T"),
)


def update_user_data(api_key, space_url, hub_api_key, hub_dataset_id):
    """Store the connection details entered in the UI in ``USER_DATA``.

    Args:
        api_key: Label Studio API token.
        space_url: Base URL of the Label Studio instance.
        hub_api_key: Hugging Face Hub token used for pushing.
        hub_dataset_id: Hub repo id the dataset is pushed to.
    """
    USER_DATA["api_key"] = api_key
    USER_DATA["space_url"] = space_url
    USER_DATA["hub_api_key"] = hub_api_key
    USER_DATA["hub_dataset_id"] = hub_dataset_id


def check_user_data():
    """Return True when both a Label Studio API key and Space URL are stored."""
    return bool(USER_DATA.get("api_key") and USER_DATA.get("space_url"))


# Disabled helpers kept from the original file for reference. They use a
# hard-coded Space URL and ``requests``, which this module does not import.
#
# def list_projects():
#     headers = {"Authorization": f'Token {USER_DATA["api_key"]}'}
#     resp = client.get(
#         "https://davanstrien-label-studio.hf.space/api/projects/", headers=headers
#     )
#     return resp.json()
#
# def get_column_names():
#     headers = {"Authorization": f'Token {USER_DATA["api_key"]}'}
#     print(headers)
#     # resp = client.get(
#     #     "http://davanstrien-label-studio.hf.space/api/projects/1/export?exportType=CSV",
#     #     headers=headers,
#     # )
#     resp = requests.get(
#         "http://davanstrien-label-studio.hf.space/api/projects/1/export?exportType=CSV",
#         headers=headers,
#     )
#     return pd.read_csv(StringIO(resp.text)).columns.tolist()


def convert_value(value: int) -> str:
    """Map a row count to its dataset card ``size_categories`` bucket.

    Args:
        value: Number of examples in the dataset.

    Returns:
        The bucket label, e.g. ``"n<1K"`` for 999 or ``"1K<n<10K"`` for 1000.
        Counts of one trillion or more map to ``"n>1T"``.
    """
    for upper_bound, category in _SIZE_CATEGORIES:
        if value < upper_bound:
            return category
    return "n>1T"


def push_annotations_to_hub(project_id, input_column, input_column_type, label_column):
    """Export a Label Studio project as CSV and push it to the Hub as a dataset.

    Args:
        project_id: Numeric id of the Label Studio project (converted with ``int``).
        input_column: Name of the CSV column holding the inputs.
        input_column_type: ``"text"`` or ``"image"``; image columns are cast to
            the ``Image`` feature.
        label_column: Name of the CSV column holding the labels; it is cast to
            a ``ClassLabel`` feature.

    Returns:
        A pandas DataFrame with the first five rows of the pushed dataset.

    Raises:
        gr.Error: If connection details are missing, the export request fails,
            or a requested column is not present in the exported CSV.
    """
    if not check_user_data():
        raise gr.Error("Please submit your Label Studio API key and Space URL first")
    if not (USER_DATA.get("hub_api_key") and USER_DATA.get("hub_dataset_id")):
        raise gr.Error("Please submit your Hub API key and dataset name first")

    # Drop a trailing slash so the joined URL does not contain "//api/...".
    base_url = USER_DATA["space_url"].strip().rstrip("/")
    headers = {"Authorization": f'Token {USER_DATA["api_key"]}'}
    try:
        resp = client.get(
            f"{base_url}/api/projects/{int(project_id)}/export?exportType=CSV",
            headers=headers,
        )
        # Without this an error page would be parsed as if it were the CSV.
        resp.raise_for_status()
    except HTTPError as err:
        raise gr.Error(f"Could not export the Label Studio project: {err}") from err

    df = pd.read_csv(StringIO(resp.text))
    print(df.head(1))

    if label_column not in df.columns:
        raise gr.Error(f"Label column {label_column!r} not found in the export")
    if input_column_type == "image" and input_column not in df.columns:
        raise gr.Error(f"Input column {input_column!r} not found in the export")

    # Unlabelled rows show up as NaN, which is not a valid class name.
    labels = df[label_column].dropna().unique().tolist()

    ds = Dataset.from_pandas(df)
    ds = ds.cast_column(label_column, ClassLabel(names=labels))
    if input_column_type == "image":
        ds = ds.cast_column(input_column, Image())

    hub_dataset_id = USER_DATA["hub_dataset_id"]
    hub_token = USER_DATA["hub_api_key"]
    ds.push_to_hub(hub_dataset_id, token=hub_token)

    # Use the token the user supplied for the card as well, rather than
    # whatever credentials happen to be configured on the host.
    card = DatasetCard.load(hub_dataset_id, token=hub_token)
    card.data.tags = ["label-studio-exported"]
    card.data.size_categories = [convert_value(len(ds))]
    card.push_to_hub(hub_dataset_id, token=hub_token, repo_type="dataset")

    return ds.to_pandas().head(5)


with gr.Blocks() as demo:
    gr.Markdown("# Push label studio datasets to the hub")
    gr.Markdown(
        "This is a proof of concept app which provides a GUI for exporting data from"
        " Label Studio and pushing the loaded dataset to the Hugging Face Hub"
    )
    with gr.Row():
        with gr.Column():
            with gr.Row():
                gr.Markdown("## Label Studio details")
            with gr.Row():
                gr.Markdown(
                    "Enter your Label Studio API key, you can find this under settings."
                )
            with gr.Row():
                API_KEY = gr.Textbox(
                    type="password",
                    label="Label Studio API Key",
                )
            with gr.Row():
                with gr.Row():
                    gr.Markdown(
                        "Space URL, this can be found by clicking on the three dots"
                        " button on your space and copying the URL shown after clicking"
                        " the Embed Space button"
                    )
                with gr.Row():
                    # The example is a placeholder, not a pre-filled value, so
                    # it can never be submitted as the actual URL.
                    SPACE_URL = gr.Textbox(
                        label="Space URL",
                        placeholder="e.g. https://davanstrien-label-studio.hf.space/",
                    )
        with gr.Column():
            gr.Markdown("## Hub Dataset info")
            gr.Markdown(
                "Enter a Hub [API key](https://huggingface.co/settings/tokens)"
                " with write access and the name you would like to use for your dataset"
            )
            HUB_API_KEY = gr.Textbox(
                type="password",
                label="Hub API Key",
            )
            with gr.Row():
                gr.Markdown("Name of the dataset you would like to create")
            with gr.Row():
                HUB_DATASET_ID = gr.Textbox(
                    label="Dataset name",
                    placeholder="e.g. davanstrien/dataset_name",
                )
    button = gr.Button("Submit details")
    button.click(update_user_data, [API_KEY, SPACE_URL, HUB_API_KEY, HUB_DATASET_ID])
    with gr.Row():
        project_id = gr.Number(1, label="Project ID")
        input_column = gr.Textbox("text", type="text", label="Input column")
        input_column_type = gr.Dropdown(
            choices=["text", "image"], label="Input column type", value="text"
        )
        label_column = gr.Textbox("choice", type="text", label="Label column")
    button = gr.Button("Push annotations to Hub")
    with gr.Row():
        gr.Markdown("## Preview of your dataset")
    with gr.Row():
        preview = gr.DataFrame()
    button.click(
        push_annotations_to_hub,
        [
            project_id,
            input_column,
            input_column_type,
            label_column,
        ],
        preview,
    )

demo.launch(debug=True)