Spaces:

huggingface
/

data-measurements-tool

Running

App Files Files Community

Yacine Jernite committed on Dec 13, 2021

Commit

f4b8e6e

•

1 Parent(s): c500e3c

can only select available splits

Browse files

Files changed (2) hide show

cache_dir/has_cache.json +3 -0
data_measurements/streamlit_utils.py +13 -6

cache_dir/has_cache.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9e7d89146f736ca9852dd82abaa7d29225499d53ca16f7714cfa576915e0a7d7
+size 3584

data_measurements/streamlit_utils.py CHANGED Viewed

@@ -14,6 +14,7 @@
 import statistics
 import pandas as pd
 import seaborn as sns
 import streamlit as st
@@ -22,6 +23,8 @@ from st_aggrid import AgGrid, GridOptionsBuilder
 from .dataset_utils import HF_DESC_FIELD, HF_FEATURE_FIELD, HF_LABEL_FIELD
 st.set_option('deprecation.showPyplotGlobalUse', False)
 def sidebar_header():
     st.sidebar.markdown(
         """
@@ -29,16 +32,17 @@ def sidebar_header():
     Right now this has a few pre-loaded datasets for which you can:
     - view some general statistics about the text vocabulary, lengths, labels
     - explore some distributional statistics to assess properties of the language
-    - view some comparison statistics and overview of the text distribution
-    The tool is in development, and will keep growing in utility and functionality 🤗🚧
     """,
         unsafe_allow_html=True,
     )
 def sidebar_selection(ds_name_to_dict, column_id):
-    ds_names = list(ds_name_to_dict.keys())
     with st.sidebar.expander(f"Choose dataset and field {column_id}", expanded=True):
         # choose a dataset to analyze
         ds_name = st.selectbox(
@@ -52,6 +56,7 @@ def sidebar_selection(ds_name_to_dict, column_id):
             config_names = ['en','en.noblocklist','realnewslike']
         else:
             config_names = list(ds_configs.keys())
         config_name = st.selectbox(
             f"Choose configuration{column_id}:",
             config_names,
@@ -60,7 +65,8 @@ def sidebar_selection(ds_name_to_dict, column_id):
         # choose a subset of num_examples
         # TODO: Handling for multiple text features
         ds_config = ds_configs[config_name]
-        text_features = ds_config[HF_FEATURE_FIELD]["string"]
         # TODO @yacine: Explain what this is doing and why eg tp[0] could = "id"
         text_field = st.selectbox(
             f"Which text feature from the{column_id} dataset would you like to analyze?",
@@ -69,7 +75,8 @@ def sidebar_selection(ds_name_to_dict, column_id):
             else [tp for tp in text_features if tp[0] != "id"],
         )
         # Choose a split and dataset size
-        avail_splits = list(ds_config["splits"].keys())
         # 12.Nov note: Removing "test" because those should not be examined
         # without discussion of pros and cons, which we haven't done yet.
         if "test" in avail_splits:

 import statistics
+import json
 import pandas as pd
 import seaborn as sns
 import streamlit as st
 from .dataset_utils import HF_DESC_FIELD, HF_FEATURE_FIELD, HF_LABEL_FIELD
 st.set_option('deprecation.showPyplotGlobalUse', False)
+_HAS_CACHE = json.load(open("cache_dir/has_cache.json"))
 def sidebar_header():
     st.sidebar.markdown(
         """
     Right now this has a few pre-loaded datasets for which you can:
     - view some general statistics about the text vocabulary, lengths, labels
     - explore some distributional statistics to assess properties of the language
+    - view some comparison statistics and overview of the text distribution
+    The tool is in development, and will keep growing in utility and functionality 🤗🚧
     """,
         unsafe_allow_html=True,
     )
 def sidebar_selection(ds_name_to_dict, column_id):
+    # ds_names = list(ds_name_to_dict.keys())
+    ds_names = list(_HAS_CACHE.keys())
     with st.sidebar.expander(f"Choose dataset and field {column_id}", expanded=True):
         # choose a dataset to analyze
         ds_name = st.selectbox(
             config_names = ['en','en.noblocklist','realnewslike']
         else:
             config_names = list(ds_configs.keys())
+        config_names = list(_HAS_CACHE[ds_name].keys())
         config_name = st.selectbox(
             f"Choose configuration{column_id}:",
             config_names,
         # choose a subset of num_examples
         # TODO: Handling for multiple text features
         ds_config = ds_configs[config_name]
+        # text_features = ds_config[HF_FEATURE_FIELD]["string"]
+        text_features = [tuple(text_field.split('-')) for text_field in _HAS_CACHE[ds_name][config_name]]
         # TODO @yacine: Explain what this is doing and why eg tp[0] could = "id"
         text_field = st.selectbox(
             f"Which text feature from the{column_id} dataset would you like to analyze?",
             else [tp for tp in text_features if tp[0] != "id"],
         )
         # Choose a split and dataset size
+        # avail_splits = list(ds_config["splits"].keys())
+        avail_splits = list(_HAS_CACHE[ds_name][config_name]['-'.join(text_field)].keys())
         # 12.Nov note: Removing "test" because those should not be examined
         # without discussion of pros and cons, which we haven't done yet.
         if "test" in avail_splits: