Spaces:
Runtime error
Runtime error
Merge pull request #6 from huggingface/add-metadata
Browse files
app.py
CHANGED
@@ -8,7 +8,8 @@ from datasets import get_dataset_config_names
|
|
8 |
from dotenv import load_dotenv
|
9 |
from huggingface_hub import list_datasets
|
10 |
|
11 |
-
from utils import get_compatible_models, get_metadata, http_get,
|
|
|
12 |
|
13 |
if Path(".env").is_file():
|
14 |
load_dotenv(".env")
|
@@ -29,6 +30,9 @@ TASK_TO_ID = {
|
|
29 |
"summarization": 8,
|
30 |
}
|
31 |
|
|
|
|
|
|
|
32 |
###########
|
33 |
### APP ###
|
34 |
###########
|
@@ -61,7 +65,11 @@ if metadata is None:
|
|
61 |
|
62 |
with st.expander("Advanced configuration"):
|
63 |
## Select task
|
64 |
-
selected_task = st.selectbox(
|
|
|
|
|
|
|
|
|
65 |
### Select config
|
66 |
configs = get_dataset_config_names(selected_dataset)
|
67 |
selected_config = st.selectbox("Select a config", configs)
|
@@ -75,29 +83,25 @@ with st.expander("Advanced configuration"):
|
|
75 |
if split["config"] == selected_config:
|
76 |
split_names.append(split["split"])
|
77 |
|
78 |
-
selected_split = st.selectbox(
|
|
|
|
|
|
|
|
|
79 |
|
80 |
-
##
|
81 |
rows_resp = http_get(
|
82 |
path="/rows",
|
83 |
domain="https://datasets-preview.huggingface.tech",
|
84 |
params={"dataset": selected_dataset, "config": selected_config, "split": selected_split},
|
85 |
).json()
|
86 |
col_names = list(pd.json_normalize(rows_resp["rows"][0]["row"]).columns)
|
87 |
-
# splits = metadata[0]["splits"]
|
88 |
-
# split_names = list(splits.values())
|
89 |
-
# eval_split = splits.get("eval_split", split_names[0])
|
90 |
-
|
91 |
-
# selected_split = st.selectbox("Select a split", split_names, index=split_names.index(eval_split))
|
92 |
-
|
93 |
-
# TODO: add a function to handle the mapping task <--> column mapping
|
94 |
-
# col_mapping = metadata[0]["col_mapping"]
|
95 |
-
# col_names = list(col_mapping.keys())
|
96 |
|
97 |
st.markdown("**Map your data columns**")
|
98 |
col1, col2 = st.columns(2)
|
99 |
|
100 |
# TODO: find a better way to layout these items
|
|
|
101 |
col_mapping = {}
|
102 |
if selected_task in ["binary_classification", "multi_class_classification"]:
|
103 |
with col1:
|
@@ -108,9 +112,15 @@ with st.expander("Advanced configuration"):
|
|
108 |
st.text("")
|
109 |
st.markdown("`target` column")
|
110 |
with col2:
|
111 |
-
text_col = st.selectbox(
|
|
|
|
|
|
|
|
|
112 |
target_col = st.selectbox(
|
113 |
-
"This column should contain the labels you want to assign to the text",
|
|
|
|
|
114 |
)
|
115 |
col_mapping[text_col] = "text"
|
116 |
col_mapping[target_col] = "target"
|
@@ -127,9 +137,12 @@ with st.expander("Advanced configuration"):
|
|
127 |
tokens_col = st.selectbox(
|
128 |
"This column should contain the parts of the text (as an array of tokens) you want to assign labels to",
|
129 |
col_names,
|
|
|
130 |
)
|
131 |
tags_col = st.selectbox(
|
132 |
-
"This column should contain the labels to associate to each part of the text",
|
|
|
|
|
133 |
)
|
134 |
col_mapping[tokens_col] = "tokens"
|
135 |
col_mapping[tags_col] = "tags"
|
@@ -143,9 +156,15 @@ with st.expander("Advanced configuration"):
|
|
143 |
st.text("")
|
144 |
st.markdown("`target` column")
|
145 |
with col2:
|
146 |
-
text_col = st.selectbox(
|
|
|
|
|
|
|
|
|
147 |
target_col = st.selectbox(
|
148 |
-
"This column should contain an example translation of the source text",
|
|
|
|
|
149 |
)
|
150 |
col_mapping[text_col] = "source"
|
151 |
col_mapping[target_col] = "target"
|
@@ -159,8 +178,16 @@ with st.expander("Advanced configuration"):
|
|
159 |
st.text("")
|
160 |
st.markdown("`target` column")
|
161 |
with col2:
|
162 |
-
text_col = st.selectbox(
|
163 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
164 |
col_mapping[text_col] = "text"
|
165 |
col_mapping[target_col] = "target"
|
166 |
|
@@ -183,16 +210,29 @@ with st.expander("Advanced configuration"):
|
|
183 |
st.text("")
|
184 |
st.markdown("`answers.answer_start` column")
|
185 |
with col2:
|
186 |
-
context_col = st.selectbox(
|
|
|
|
|
|
|
|
|
187 |
question_col = st.selectbox(
|
188 |
-
"This column should contain the question to be answered, given the context",
|
|
|
|
|
189 |
)
|
190 |
answers_text_col = st.selectbox(
|
191 |
-
"This column should contain example answers to the question, extracted from the context",
|
|
|
|
|
|
|
|
|
192 |
)
|
193 |
answers_start_col = st.selectbox(
|
194 |
"This column should contain the indices in the context of the first character of each answers.text",
|
195 |
col_names,
|
|
|
|
|
|
|
196 |
)
|
197 |
col_mapping[context_col] = "context"
|
198 |
col_mapping[question_col] = "question"
|
@@ -203,9 +243,8 @@ with st.form(key="form"):
|
|
203 |
|
204 |
compatible_models = get_compatible_models(selected_task, selected_dataset)
|
205 |
|
206 |
-
selected_models = st.multiselect(
|
207 |
-
|
208 |
-
)
|
209 |
submit_button = st.form_submit_button("Make submission")
|
210 |
|
211 |
if submit_button:
|
|
|
8 |
from dotenv import load_dotenv
|
9 |
from huggingface_hub import list_datasets
|
10 |
|
11 |
+
from utils import (get_compatible_models, get_key, get_metadata, http_get,
|
12 |
+
http_post)
|
13 |
|
14 |
if Path(".env").is_file():
|
15 |
load_dotenv(".env")
|
|
|
30 |
"summarization": 8,
|
31 |
}
|
32 |
|
33 |
+
supported_tasks = list(TASK_TO_ID.keys())
|
34 |
+
|
35 |
+
|
36 |
###########
|
37 |
### APP ###
|
38 |
###########
|
|
|
65 |
|
66 |
with st.expander("Advanced configuration"):
|
67 |
## Select task
|
68 |
+
selected_task = st.selectbox(
|
69 |
+
"Select a task",
|
70 |
+
supported_tasks,
|
71 |
+
index=supported_tasks.index(metadata[0]["task_id"]) if metadata is not None else 0,
|
72 |
+
)
|
73 |
### Select config
|
74 |
configs = get_dataset_config_names(selected_dataset)
|
75 |
selected_config = st.selectbox("Select a config", configs)
|
|
|
83 |
if split["config"] == selected_config:
|
84 |
split_names.append(split["split"])
|
85 |
|
86 |
+
selected_split = st.selectbox(
|
87 |
+
"Select a split",
|
88 |
+
split_names,
|
89 |
+
index=split_names.index(metadata[0]["splits"]["eval_split"]) if metadata is not None else 0,
|
90 |
+
)
|
91 |
|
92 |
+
## Select columns
|
93 |
rows_resp = http_get(
|
94 |
path="/rows",
|
95 |
domain="https://datasets-preview.huggingface.tech",
|
96 |
params={"dataset": selected_dataset, "config": selected_config, "split": selected_split},
|
97 |
).json()
|
98 |
col_names = list(pd.json_normalize(rows_resp["rows"][0]["row"]).columns)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
99 |
|
100 |
st.markdown("**Map your data columns**")
|
101 |
col1, col2 = st.columns(2)
|
102 |
|
103 |
# TODO: find a better way to layout these items
|
104 |
+
# TODO: need graceful way of handling dataset <--> task mismatch for datasets with metadata
|
105 |
col_mapping = {}
|
106 |
if selected_task in ["binary_classification", "multi_class_classification"]:
|
107 |
with col1:
|
|
|
112 |
st.text("")
|
113 |
st.markdown("`target` column")
|
114 |
with col2:
|
115 |
+
text_col = st.selectbox(
|
116 |
+
"This column should contain the text you want to classify",
|
117 |
+
col_names,
|
118 |
+
index=col_names.index(get_key(metadata[0]["col_mapping"], "text")) if metadata is not None else 0,
|
119 |
+
)
|
120 |
target_col = st.selectbox(
|
121 |
+
"This column should contain the labels you want to assign to the text",
|
122 |
+
col_names,
|
123 |
+
index=col_names.index(get_key(metadata[0]["col_mapping"], "target")) if metadata is not None else 0,
|
124 |
)
|
125 |
col_mapping[text_col] = "text"
|
126 |
col_mapping[target_col] = "target"
|
|
|
137 |
tokens_col = st.selectbox(
|
138 |
"This column should contain the parts of the text (as an array of tokens) you want to assign labels to",
|
139 |
col_names,
|
140 |
+
index=col_names.index(get_key(metadata[0]["col_mapping"], "tokens")) if metadata is not None else 0,
|
141 |
)
|
142 |
tags_col = st.selectbox(
|
143 |
+
"This column should contain the labels to associate to each part of the text",
|
144 |
+
col_names,
|
145 |
+
index=col_names.index(get_key(metadata[0]["col_mapping"], "tags")) if metadata is not None else 0,
|
146 |
)
|
147 |
col_mapping[tokens_col] = "tokens"
|
148 |
col_mapping[tags_col] = "tags"
|
|
|
156 |
st.text("")
|
157 |
st.markdown("`target` column")
|
158 |
with col2:
|
159 |
+
text_col = st.selectbox(
|
160 |
+
"This column should contain the text you want to translate",
|
161 |
+
col_names,
|
162 |
+
index=col_names.index(get_key(metadata[0]["col_mapping"], "source")) if metadata is not None else 0,
|
163 |
+
)
|
164 |
target_col = st.selectbox(
|
165 |
+
"This column should contain an example translation of the source text",
|
166 |
+
col_names,
|
167 |
+
index=col_names.index(get_key(metadata[0]["col_mapping"], "target")) if metadata is not None else 0,
|
168 |
)
|
169 |
col_mapping[text_col] = "source"
|
170 |
col_mapping[target_col] = "target"
|
|
|
178 |
st.text("")
|
179 |
st.markdown("`target` column")
|
180 |
with col2:
|
181 |
+
text_col = st.selectbox(
|
182 |
+
"This column should contain the text you want to summarize",
|
183 |
+
col_names,
|
184 |
+
index=col_names.index(get_key(metadata[0]["col_mapping"], "text")) if metadata is not None else 0,
|
185 |
+
)
|
186 |
+
target_col = st.selectbox(
|
187 |
+
"This column should contain an example summarization of the text",
|
188 |
+
col_names,
|
189 |
+
index=col_names.index(get_key(metadata[0]["col_mapping"], "target")) if metadata is not None else 0,
|
190 |
+
)
|
191 |
col_mapping[text_col] = "text"
|
192 |
col_mapping[target_col] = "target"
|
193 |
|
|
|
210 |
st.text("")
|
211 |
st.markdown("`answers.answer_start` column")
|
212 |
with col2:
|
213 |
+
context_col = st.selectbox(
|
214 |
+
"This column should contain the question's context",
|
215 |
+
col_names,
|
216 |
+
index=col_names.index(get_key(metadata[0]["col_mapping"], "context")) if metadata is not None else 0,
|
217 |
+
)
|
218 |
question_col = st.selectbox(
|
219 |
+
"This column should contain the question to be answered, given the context",
|
220 |
+
col_names,
|
221 |
+
index=col_names.index(get_key(metadata[0]["col_mapping"], "question")) if metadata is not None else 0,
|
222 |
)
|
223 |
answers_text_col = st.selectbox(
|
224 |
+
"This column should contain example answers to the question, extracted from the context",
|
225 |
+
col_names,
|
226 |
+
index=col_names.index(get_key(metadata[0]["col_mapping"], "answers.text"))
|
227 |
+
if metadata is not None
|
228 |
+
else 0,
|
229 |
)
|
230 |
answers_start_col = st.selectbox(
|
231 |
"This column should contain the indices in the context of the first character of each answers.text",
|
232 |
col_names,
|
233 |
+
index=col_names.index(get_key(metadata[0]["col_mapping"], "answers.answer_start"))
|
234 |
+
if metadata is not None
|
235 |
+
else 0,
|
236 |
)
|
237 |
col_mapping[context_col] = "context"
|
238 |
col_mapping[question_col] = "question"
|
|
|
243 |
|
244 |
compatible_models = get_compatible_models(selected_task, selected_dataset)
|
245 |
|
246 |
+
selected_models = st.multiselect("Select the models you wish to evaluate", compatible_models)
|
247 |
+
print("Selected models:", selected_models)
|
|
|
248 |
submit_button = st.form_submit_button("Make submission")
|
249 |
|
250 |
if submit_button:
|
utils.py
CHANGED
@@ -48,10 +48,9 @@ def http_get(path: str, domain: str, token: str = None, params: dict = None) ->
|
|
48 |
|
49 |
|
50 |
def get_metadata(dataset_name: str) -> Union[Dict, None]:
|
51 |
-
|
52 |
-
data
|
53 |
-
|
54 |
-
return data[0].cardData["train-eval-index"]
|
55 |
else:
|
56 |
return None
|
57 |
|
@@ -63,3 +62,11 @@ def get_compatible_models(task, dataset_name):
|
|
63 |
)
|
64 |
compatible_models = api.list_models(filter=filt)
|
65 |
return [model.modelId for model in compatible_models]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
48 |
|
49 |
|
50 |
def get_metadata(dataset_name: str) -> Union[Dict, None]:
    """Fetch the ``train-eval-index`` entry of a dataset card from the Hub API.

    Args:
        dataset_name: Dataset identifier, interpolated into the
            ``https://huggingface.co/api/datasets/`` URL.

    Returns:
        The value stored under ``cardData["train-eval-index"]`` in the JSON
        response, or ``None`` when the response has no card data or the card
        data has no such entry.
    """
    data = requests.get(f"https://huggingface.co/api/datasets/{dataset_name}").json()
    # Use .get() so a response without a "cardData" key yields None rather
    # than a KeyError (presumably the case for unknown datasets - TODO confirm).
    card_data = data.get("cardData")
    if card_data is None:
        return None
    return card_data.get("train-eval-index")
|
56 |
|
|
|
62 |
)
|
63 |
compatible_models = api.list_models(filter=filt)
|
64 |
return [model.modelId for model in compatible_models]
|
65 |
+
|
66 |
+
|
67 |
+
def get_key(col_mapping, val, default="key doesn't exist"):
    """Return the first key in ``col_mapping`` whose value equals ``val``.

    Args:
        col_mapping: Mapping to search; entries are examined in iteration order.
        val: Value to look for.
        default: Value returned when nothing matches. Defaults to the string
            ``"key doesn't exist"`` so existing two-argument callers see the
            same result as before.

    Returns:
        The first key whose value equals ``val``, or ``default`` if there is
        no such key.
    """
    return next((key for key, value in col_mapping.items() if val == value), default)
|