Spaces:
Sleeping
Sleeping
File size: 5,843 Bytes
85fb714 a2bb2cd 85fb714 4b1895a a2bb2cd 4b1895a a2bb2cd 4b1895a a2bb2cd 85fb714 a2bb2cd 17b58fd ec81263 a2bb2cd 85fb714 a2bb2cd ec81263 17b58fd a2bb2cd |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 |
from io import StringIO
import gradio as gr
import pandas as pd
from datasets import ClassLabel, Dataset, Image
from httpx import Client
from huggingface_hub import DatasetCard
# Shared HTTP client; used below to call the Label Studio export endpoint.
client = Client()
# Module-level store for the details submitted through the UI. It is filled
# by update_user_data() and read by the functions that talk to Label Studio
# and the Hub.
USER_DATA = {}
def update_user_data(api_key, space_url, hub_api_key, hub_dataset_id):
    """Record the submitted Label Studio and Hub details in ``USER_DATA``.

    Any previously stored values for these four keys are overwritten.
    Nothing is returned.
    """
    USER_DATA.update(
        api_key=api_key,
        space_url=space_url,
        hub_api_key=hub_api_key,
        hub_dataset_id=hub_dataset_id,
    )
def check_user_data():
    """Return True when both the Label Studio API key and Space URL are set.

    A key counts as set only if it is present in ``USER_DATA`` with a truthy
    value (so an empty string does not count).
    """
    required_keys = ("api_key", "space_url")
    return all(USER_DATA.get(key) for key in required_keys)
# def list_projects():
# headers = {"Authorization": f'Token {USER_DATA["api_key"]}'}
# resp = client.get(
# "https://davanstrien-label-studio.hf.space/api/projects/", headers=headers
# )
# return resp.json()
# def get_column_names():
# headers = {"Authorization": f'Token {USER_DATA["api_key"]}'}
# print(headers)
# # resp = client.get(
# # "http://davanstrien-label-studio.hf.space/api/projects/1/export?exportType=CSV",
# # headers=headers,
# # )
# resp = requests.get(
# "http://davanstrien-label-studio.hf.space/api/projects/1/export?exportType=CSV",
# headers=headers,
# )
# return pd.read_csv(StringIO(resp.text)).columns.tolist()
# (exclusive upper bound, label) pairs, ordered from smallest to largest bound.
_SIZE_BUCKETS = (
    (1_000, "n<1K"),
    (10_000, "1K<n<10K"),
    (100_000, "10K<n<100K"),
    (1_000_000, "100K<n<1M"),
    (10_000_000, "1M<n<10M"),
    (100_000_000, "10M<n<100M"),
    (1_000_000_000, "100M<n<1B"),
    (10_000_000_000, "1B<n<10B"),
    (100_000_000_000, "10B<n<100B"),
    (1_000_000_000_000, "100B<n<1T"),
)


def convert_value(value: int) -> str:
    """Return the size-category label for a count of ``value`` items.

    The label of the first bucket whose upper bound is strictly greater than
    ``value`` is returned; anything at or above one trillion maps to
    ``"n>1T"``.
    """
    for upper_bound, label in _SIZE_BUCKETS:
        if value < upper_bound:
            return label
    return "n>1T"
def push_annotations_to_hub(project_id, input_column, input_column_type, label_column):
    """Export a Label Studio project as CSV and push it to the Hub as a dataset.

    Args:
        project_id: Numeric ID of the Label Studio project (converted with
            ``int``, so a float such as ``1.0`` is accepted).
        input_column: Name of the column holding the inputs. It is only used
            when ``input_column_type`` is ``"image"``.
        input_column_type: ``"image"`` casts ``input_column`` to an ``Image``
            feature; any other value leaves the column as exported.
        label_column: Name of the column holding the annotations; it is cast
            to a ``ClassLabel`` feature built from its distinct values.

    Returns:
        A pandas DataFrame with the first five rows of the pushed dataset.

    Raises:
        gr.Error: If the user details have not been submitted, the export
            request does not succeed, a requested column is missing from the
            export, or the label column holds no annotations.
    """
    # Fail with a readable message instead of a bare KeyError when the
    # details have not been submitted yet.
    required_keys = ("api_key", "space_url", "hub_api_key", "hub_dataset_id")
    missing_keys = [key for key in required_keys if not USER_DATA.get(key)]
    if missing_keys:
        raise gr.Error(
            "Please submit your details first; missing: " + ", ".join(missing_keys)
        )

    hub_token = USER_DATA["hub_api_key"]
    dataset_id = USER_DATA["hub_dataset_id"]
    # A trailing slash on the Space URL would otherwise yield "//api/...".
    base_url = USER_DATA["space_url"].strip().rstrip("/")

    headers = {"Authorization": f'Token {USER_DATA["api_key"]}'}
    resp = client.get(
        f"{base_url}/api/projects/{int(project_id)}/export?exportType=CSV",
        headers=headers,
    )
    # Do not try to parse an error page (or a redirect body) as CSV.
    if not 200 <= resp.status_code < 300:
        raise gr.Error(
            f"Label Studio export failed with HTTP status {resp.status_code}"
        )

    df = pd.read_csv(StringIO(resp.text))
    print(df.head(1))

    needed_columns = [label_column]
    if input_column_type == "image":
        needed_columns.append(input_column)
    absent_columns = [name for name in needed_columns if name not in df.columns]
    if absent_columns:
        raise gr.Error(
            "Column(s) not found in the Label Studio export: "
            + ", ".join(map(str, absent_columns))
        )

    # Rows without an annotation show up as NaN; NaN is not a class name.
    labels = df[label_column].dropna().unique().tolist()
    if not labels:
        raise gr.Error(f"No annotations found in column '{label_column}'")

    ds = Dataset.from_pandas(df)
    ds = ds.cast_column(label_column, ClassLabel(names=labels))
    if input_column_type == "image":
        ds = ds.cast_column(input_column, Image())
    ds.push_to_hub(dataset_id, token=hub_token)

    # Use the same token for the card as for the dataset push, so the card
    # update does not depend on ambient credentials.
    card = DatasetCard.load(dataset_id, token=hub_token)
    card.data.tags = ["label-studio-exported"]
    card.data.size_categories = [convert_value(len(ds))]
    card.push_to_hub(dataset_id, token=hub_token, repo_type="dataset")
    return ds.to_pandas().head(5)
# Gradio UI: collects the Label Studio / Hub details, then exports a project
# and shows a preview of the pushed dataset.
with gr.Blocks() as demo:
    gr.Markdown("# Push label studio datasets to the hub")
    gr.Markdown(
        "This is a proof of concept app which provides a GUI for exporting data from"
        " Label Studio and pushing the loaded dataset to the Hugging Face Hub"
    )
    with gr.Row():
        # Left column: Label Studio connection details.
        with gr.Column():
            with gr.Row():
                gr.Markdown("## Label Studio details")
            with gr.Row():
                gr.Markdown(
                    "Enter your Label Studio API key, you can find this under settings."
                )
            with gr.Row():
                API_KEY = gr.Textbox(
                    type="password",
                    label="Label Studio API Key",
                )
            with gr.Row():
                with gr.Row():
                    gr.Markdown(
                        "Space URL, this can be found by clicking on the three dots"
                        " button on your space and copying the URL shown after clicking"
                        " the Embed Space button"
                    )
                with gr.Row():
                    # The example is shown as a placeholder, not as the initial
                    # value, so an untouched field is empty rather than
                    # containing the literal "e.g. ..." text.
                    SPACE_URL = gr.Textbox(
                        label="Space URL",
                        placeholder="e.g. https://davanstrien-label-studio.hf.space/",
                    )
        # Right column: Hub credentials and target dataset.
        with gr.Column():
            gr.Markdown("## Hub Dataset info")
            gr.Markdown(
                """Enter a Hub [API key](https://huggingface.co/settings/tokens) with write access and the name you would like to use for your dataset"""
            )
            HUB_API_KEY = gr.Textbox(
                type="password",
                label="Hub API Key",
            )
            with gr.Row():
                gr.Markdown("Name of the dataset you would like to create")
            with gr.Row():
                # Placeholder shows a dataset id example instead of a URL.
                HUB_DATASET_ID = gr.Textbox(
                    label="Dataset name",
                    placeholder="e.g. davanstrien/dataset_name",
                )
    button = gr.Button("Submit details")
    button.click(update_user_data, [API_KEY, SPACE_URL, HUB_API_KEY, HUB_DATASET_ID])
    with gr.Row():
        project_id = gr.Number(1, label="Project ID")
        input_column = gr.Textbox("text", type="text", label="Input column")
        input_column_type = gr.Dropdown(
            choices=["text", "image"], label="Input column type", value="text"
        )
        label_column = gr.Textbox("choice", type="text", label="Label column")
    # Rebinding ``button`` is safe: the first button's click handler is
    # already registered above.
    button = gr.Button("Push annotations to Hub")
    with gr.Row():
        gr.Markdown("## Preview of your dataset")
    with gr.Row():
        preview = gr.DataFrame()
    button.click(
        push_annotations_to_hub,
        [
            project_id,
            input_column,
            input_column_type,
            label_column,
        ],
        preview,
    )
demo.launch(debug=True)
|