davanstrien (HF staff) committed
Commit
a2bb2cd
Parent: 6af3d28
Files changed (3)
  1. app.py +132 -0
  2. requirements.in +4 -0
  3. requirements.txt +226 -0
app.py ADDED
@@ -0,0 +1,132 @@
+ from io import StringIO
+
+ import gradio as gr
+ import pandas as pd
+ from datasets import ClassLabel, Dataset, Image
+ from httpx import Client
+
+ client = Client()
+
+
+ USER_DATA = {}
+
+
+ def update_user_data(api_key, space_url, hub_api_key, hub_dataset_id):
+     USER_DATA["api_key"] = api_key
+     USER_DATA["space_url"] = space_url
+     USER_DATA["hub_api_key"] = hub_api_key
+     USER_DATA["hub_dataset_id"] = hub_dataset_id
+
+
+ def check_user_data():
+     return bool(USER_DATA.get("api_key") and USER_DATA.get("space_url"))
+
+
+ # def list_projects():
+ #     headers = {"Authorization": f'Token {USER_DATA["api_key"]}'}
+ #     resp = client.get(
+ #         "https://davanstrien-label-studio.hf.space/api/projects/", headers=headers
+ #     )
+ #     return resp.json()
+
+
+ # def get_column_names():
+ #     headers = {"Authorization": f'Token {USER_DATA["api_key"]}'}
+ #     print(headers)
+ #     # resp = client.get(
+ #     #     "http://davanstrien-label-studio.hf.space/api/projects/1/export?exportType=CSV",
+ #     #     headers=headers,
+ #     # )
+ #     resp = requests.get(
+ #         "http://davanstrien-label-studio.hf.space/api/projects/1/export?exportType=CSV",
+ #         headers=headers,
+ #     )
+ #     return pd.read_csv(StringIO(resp.text)).columns.tolist()
+
+
+ def push_annotations_to_hub(project_id, input_column, input_column_type, label_column):
+     headers = {"Authorization": f'Token {USER_DATA["api_key"]}'}
+     resp = client.get(
+         f"{USER_DATA['space_url']}/api/projects/{int(project_id)}/export?exportType=CSV",
+         headers=headers,
+     )
+     df = pd.read_csv(StringIO(resp.text))
+     print(df.head(1))
+     labels = df[label_column].unique().tolist()
+     ds = Dataset.from_pandas(df)
+     ds = ds.cast_column(label_column, ClassLabel(names=labels))
+     if input_column_type == "image":
+         ds = ds.cast_column(input_column, Image())
+     ds.push_to_hub(USER_DATA["hub_dataset_id"], token=USER_DATA["hub_api_key"])
+     return ds.to_pandas().head(5)
+
+
+ with gr.Blocks() as demo:
+     gr.Markdown("# Push Label Studio datasets to the Hub")
+     gr.Markdown(
+         "This is a proof-of-concept app that provides a GUI for exporting data from Label Studio and pushing the resulting dataset to the Hugging Face Hub."
+     )
+     with gr.Row():
+         with gr.Column():
+             with gr.Row():
+                 gr.Markdown("## Label Studio details")
+             with gr.Row():
+                 API_KEY = gr.Textbox(
+                     type="password",
+                     label="Label Studio API Key",
+                 )
+             with gr.Row():
+                 with gr.Row():
+                     gr.Markdown(
+                         "Space URL: this can be found by clicking the three-dots button on your Space and copying the URL shown after clicking the Embed Space button"
+                     )
+                 with gr.Row():
+                     SPACE_URL = gr.Textbox(
+                         "e.g. https://davanstrien-label-studio.hf.space/",
+                         label="Space URL",
+                         placeholder="https://space.example.com",
+                     )
+         with gr.Column():
+             gr.Markdown("## Hub Dataset info")
+             gr.Markdown(
+                 """Enter a Hub API key with write access and the name you would like to use for your dataset"""
+             )
+             HUB_API_KEY = gr.Textbox(
+                 type="password",
+                 label="Hub API Key",
+             )
+             with gr.Row():
+                 gr.Markdown("Name of the dataset you would like to create")
+             with gr.Row():
+                 HUB_DATASET_ID = gr.Textbox(
+                     "e.g. davanstrien/dataset_name",
+                     label="Dataset name",
+                     placeholder="username/dataset_name",
+                 )
+
+     button = gr.Button("Submit details")
+     button.click(update_user_data, [API_KEY, SPACE_URL, HUB_API_KEY, HUB_DATASET_ID])
+     with gr.Row():
+         project_id = gr.Number(1, label="Project ID")
+         input_column = gr.Textbox("text", type="text", label="Input column")
+         input_column_type = gr.Dropdown(
+             choices=["text", "image"], label="Input column type", value="text"
+         )
+         label_column = gr.Textbox("choice", type="text", label="Label column")
+     button = gr.Button("Push annotations to Hub")
+     with gr.Row():
+         gr.Markdown("## Preview of your dataset")
+     with gr.Row():
+         preview = gr.DataFrame()
+     button.click(
+         push_annotations_to_hub,
+         [
+             project_id,
+             input_column,
+             input_column_type,
+             label_column,
+         ],
+         preview,
+     )
+
+ demo.launch(debug=True)
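
Once the app has pushed a project, the annotations are a standard `datasets` dataset on the Hub. A minimal sketch of loading it back, assuming the example repo id `davanstrien/dataset_name` and the default `choice` label column from the form above (swap in your own values; a private repo also needs a Hub token):

```python
from datasets import load_dataset

# Hypothetical repo id and column name, matching the example values in the form above.
ds = load_dataset("davanstrien/dataset_name", split="train")

# The label column was cast to ClassLabel by push_annotations_to_hub,
# so the label names survive the round trip.
print(ds.features["choice"])
print(ds[0])
```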
requirements.in ADDED
@@ -0,0 +1,4 @@
+ httpx
+ pandas
+ datasets
+ gradio
requirements.txt ADDED
@@ -0,0 +1,226 @@
+ #
+ # This file is autogenerated by pip-compile with Python 3.11
+ # by the following command:
+ #
+ #    pip-compile --resolver=backtracking
+ #
+ aiofiles==23.1.0
+     # via gradio
+ aiohttp==3.8.4
+     # via
+     #   datasets
+     #   fsspec
+     #   gradio
+ aiosignal==1.3.1
+     # via aiohttp
+ altair==5.0.1
+     # via gradio
+ anyio==3.7.0
+     # via
+     #   httpcore
+     #   starlette
+ async-timeout==4.0.2
+     # via aiohttp
+ attrs==23.1.0
+     # via
+     #   aiohttp
+     #   jsonschema
+ certifi==2023.5.7
+     # via
+     #   httpcore
+     #   httpx
+     #   requests
+ charset-normalizer==3.1.0
+     # via
+     #   aiohttp
+     #   requests
+ click==8.1.3
+     # via uvicorn
+ contourpy==1.1.0
+     # via matplotlib
+ cycler==0.11.0
+     # via matplotlib
+ datasets==2.12.0
+     # via -r requirements.in
+ dill==0.3.6
+     # via
+     #   datasets
+     #   multiprocess
+ fastapi==0.99.1
+     # via gradio
+ ffmpy==0.3.0
+     # via gradio
+ filelock==3.12.0
+     # via huggingface-hub
+ fonttools==4.40.0
+     # via matplotlib
+ frozenlist==1.3.3
+     # via
+     #   aiohttp
+     #   aiosignal
+ fsspec[http]==2023.5.0
+     # via
+     #   datasets
+     #   gradio-client
+     #   huggingface-hub
+ gradio==3.35.2
+     # via -r requirements.in
+ gradio-client==0.2.7
+     # via gradio
+ h11==0.14.0
+     # via
+     #   httpcore
+     #   uvicorn
+ httpcore==0.17.2
+     # via httpx
+ httpx==0.24.1
+     # via
+     #   -r requirements.in
+     #   gradio
+     #   gradio-client
+ huggingface-hub==0.14.1
+     # via
+     #   datasets
+     #   gradio
+     #   gradio-client
+ idna==3.4
+     # via
+     #   anyio
+     #   httpx
+     #   requests
+     #   yarl
+ jinja2==3.1.2
+     # via
+     #   altair
+     #   gradio
+ jsonschema==4.17.3
+     # via altair
+ kiwisolver==1.4.4
+     # via matplotlib
+ linkify-it-py==2.0.2
+     # via markdown-it-py
+ markdown-it-py[linkify]==2.2.0
+     # via
+     #   gradio
+     #   mdit-py-plugins
+ markupsafe==2.1.3
+     # via
+     #   gradio
+     #   jinja2
+ matplotlib==3.7.1
+     # via gradio
+ mdit-py-plugins==0.3.3
+     # via gradio
+ mdurl==0.1.2
+     # via markdown-it-py
+ multidict==6.0.4
+     # via
+     #   aiohttp
+     #   yarl
+ multiprocess==0.70.14
+     # via datasets
+ numpy==1.24.3
+     # via
+     #   altair
+     #   contourpy
+     #   datasets
+     #   gradio
+     #   matplotlib
+     #   pandas
+     #   pyarrow
+ orjson==3.9.1
+     # via gradio
+ packaging==23.1
+     # via
+     #   datasets
+     #   gradio-client
+     #   huggingface-hub
+     #   matplotlib
+ pandas==2.0.2
+     # via
+     #   -r requirements.in
+     #   altair
+     #   datasets
+     #   gradio
+ pillow==10.0.0
+     # via
+     #   gradio
+     #   matplotlib
+ pyarrow==12.0.0
+     # via datasets
+ pydantic==1.10.11
+     # via
+     #   fastapi
+     #   gradio
+ pydub==0.25.1
+     # via gradio
+ pygments==2.15.1
+     # via gradio
+ pyparsing==3.1.0
+     # via matplotlib
+ pyrsistent==0.19.3
+     # via jsonschema
+ python-dateutil==2.8.2
+     # via
+     #   matplotlib
+     #   pandas
+ python-multipart==0.0.6
+     # via gradio
+ pytz==2023.3
+     # via pandas
+ pyyaml==6.0
+     # via
+     #   datasets
+     #   gradio
+     #   huggingface-hub
+ requests==2.31.0
+     # via
+     #   datasets
+     #   fsspec
+     #   gradio
+     #   gradio-client
+     #   huggingface-hub
+     #   responses
+ responses==0.18.0
+     # via datasets
+ semantic-version==2.10.0
+     # via gradio
+ six==1.16.0
+     # via python-dateutil
+ sniffio==1.3.0
+     # via
+     #   anyio
+     #   httpcore
+     #   httpx
+ starlette==0.27.0
+     # via fastapi
+ toolz==0.12.0
+     # via altair
+ tqdm==4.65.0
+     # via
+     #   datasets
+     #   huggingface-hub
+ typing-extensions==4.6.2
+     # via
+     #   fastapi
+     #   gradio-client
+     #   huggingface-hub
+     #   pydantic
+ tzdata==2023.3
+     # via pandas
+ uc-micro-py==1.0.2
+     # via linkify-it-py
+ urllib3==2.0.2
+     # via
+     #   requests
+     #   responses
+ uvicorn==0.22.0
+     # via gradio
+ websockets==11.0.3
+     # via
+     #   gradio
+     #   gradio-client
+ xxhash==3.2.0
+     # via datasets
+ yarl==1.9.2
+     # via aiohttp