Thomas Wolf committed
Commit 1e4b28a
Parents: c264dfc, febdd8b

Merge pull request #5 from lhoestq/load-local-only-by-default

Files changed (1): tagging_app.py (+12 -7)
tagging_app.py CHANGED
@@ -3,12 +3,18 @@ import datasets
 import json
 import os
 import streamlit as st
+import sys
 import yaml
-
 from dataclasses import asdict
+from pathlib import Path
+from typing import Dict
+
 from glob import glob
 from os.path import join as pjoin
 
+
+load_remote_datasets = "--load_remote_datasets" in sys.argv[1:]
+
 st.set_page_config(
     page_title="HF Dataset Tagging App",
     page_icon="https://huggingface.co/front/assets/huggingface_logo.svg",
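A note on the new flag: it is read with a plain membership test on sys.argv, with no argparse involved. Assuming the app is launched through streamlit run, arguments only reach the script's sys.argv after a bare -- separator, so remote loading would be enabled with:

streamlit run tagging_app.py -- --load_remote_datasets

Without the flag, the app now starts in local-only mode.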
@@ -132,7 +138,7 @@ def load_all_dataset_infos(dataset_list):
 def load_existing_tags():
     has_tags = {}
     for fname in glob("saved_tags/*/*/tags.json"):
-        _, did, cid, _ = fname.split('/')
+        _, did, cid, _ = fname.split(os.sep)
         has_tags[did] = has_tags.get(did, {})
         has_tags[did][cid] = fname
     return has_tags
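The switch to split(os.sep) is a portability fix: glob joins expanded path components with the platform separator, so on Windows a match typically comes back as saved_tags\squad\plain_text\tags.json and splitting on a hard-coded '/' fails to unpack. A minimal sketch of the new behavior (the squad / plain_text names are illustrative, not taken from this repo):

import os

# Build a path the way glob would return it on the current platform,
# then unpack it the way load_existing_tags() now does.
fname = os.sep.join(["saved_tags", "squad", "plain_text", "tags.json"])
_, did, cid, _ = fname.split(os.sep)
assert (did, cid) == ("squad", "plain_text")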
@@ -160,9 +166,9 @@ to pre-load the tag sets from another dataset or configuration to avoid too much
 The tag sets are saved in JSON format, but you can print a YAML version in the right-most column to copy-paste to the config README.md
 """
 
-all_dataset_ids = copy.deepcopy(get_dataset_list())
 existing_tag_sets = load_existing_tags()
-all_dataset_infos = load_all_dataset_infos(all_dataset_ids)
+all_dataset_ids = list(existing_tag_sets.keys()) if not load_remote_datasets else copy.deepcopy(get_dataset_list())
+all_dataset_infos = {} if not load_remote_datasets else load_all_dataset_infos(all_dataset_ids)
 
 st.sidebar.markdown(app_desc)
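This hunk is the heart of the pull request: by default the app now lists only datasets that already have tags saved under saved_tags/, and fetches no dataset infos. A minimal sketch of the gating with stand-in data (get_dataset_list below is a stub for the app's real listing function):

import copy

load_remote_datasets = False  # the new default when no flag is passed
existing_tag_sets = {"squad": {"plain_text": "saved_tags/squad/plain_text/tags.json"}}

def get_dataset_list():  # stub; the real function enumerates remote datasets
    return ["squad", "glue", "imdb"]

all_dataset_ids = list(existing_tag_sets.keys()) if not load_remote_datasets else copy.deepcopy(get_dataset_list())
print(all_dataset_ids)  # ['squad'] -- only locally tagged datasets appear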
 
@@ -181,6 +187,7 @@ dataset_id = st.sidebar.selectbox(
     index=0,
 )
 
+all_info_dicts = {}
 if dataset_id == "local dataset":
     path_to_info = st.sidebar.text_input("Please enter the path to the folder where the dataset_infos.json file was generated", "/path/to/dataset/")
     if path_to_info not in ["/path/to/dataset/", ""]:
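Initializing all_info_dicts before the branch presumably ensures the name is always defined downstream, since in local-only mode the remote infos that previously populated it are never loaded.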
@@ -249,8 +256,6 @@ c2.markdown(f"### Writing tags for: {dataset_id} / {config_id}")
 ##########
 c2.markdown("#### Pre-loading an existing tag set")
 
-existing_tag_sets = load_existing_tags()
-
 pre_loaded = {
     "task_categories": [],
     "task_ids": [],
@@ -442,7 +447,7 @@ with c3.beta_expander("Show JSON output for the current config"):
 
 with c3.beta_expander("Show YAML output aggregating the tags saved for all configs"):
     task_saved_configs = dict([
-        (fname.split('/')[-2], json.load(open(fname)))
+        (Path(fname).parent.name, json.load(open(fname)))
         for fname in glob(f"saved_tags/{dataset_id}/*/tags.json")
     ])
     aggregate_config = {}
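The pathlib rewrite makes the YAML aggregation robust to separators as well: Path(fname).parent.name returns the config directory name for either platform's native paths. A quick illustration (made-up path):

from pathlib import Path, PureWindowsPath

# parent.name picks out the config folder, independent of separator style
assert Path("saved_tags/squad/plain_text/tags.json").parent.name == "plain_text"
assert PureWindowsPath(r"saved_tags\squad\plain_text\tags.json").parent.name == "plain_text"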
 