# Spaces: Build error
# (Status text picked up from the hosting page; it is not part of the program.)
import glob
import json
import os
import sys
import textwrap
import time
from dataclasses import asdict

import datasets
import numpy as np
import pandas as pd
import streamlit as st
import tornado
import yaml
from git import Repo
from PIL import Image
# Upstream repository whose dataset scripts are browsed, and the local
# directory it is cloned into the first time the app starts.
DATASETS_REPO_URL = "https://github.com/huggingface/datasets.git"
DATASETS_CLONE_DIR = "datasets_clone"

if not os.path.exists(DATASETS_CLONE_DIR):
    # Only the checked-out files are read, so a shallow clone avoids
    # downloading the repository's whole history.
    Repo.clone_from(DATASETS_REPO_URL, DATASETS_CLONE_DIR, depth=1)

# Upper bound, in bytes, on a builder's reported `size_in_bytes` for it to be
# downloaded and prepared by `get`.
MAX_SIZE = 40_000_000_000

# Prefix of the per-dataset folders inside the clone. Setting this to None
# makes the script list and load datasets through the `datasets` library
# instead of the local clone.
path_to_datasets = "datasets_clone/datasets/"
## Hack to extend the width of the main pane.
def _max_width_(max_width_px=1000):
    """Inject a ``<style>`` block that sets the main pane width and table look.

    The CSS is pushed through ``st.markdown`` with ``unsafe_allow_html=True``.
    It sets ``max-width`` on ``.reportview-container .main .block-container``,
    left-aligns and enlarges table headers, and highlights the hovered row.

    Args:
        max_width_px: ``max-width`` of the main block container, in pixels.
            Defaults to 1000, the width that used to be hard-coded.
    """
    # The CSS lines are kept flush-left on purpose: the string is rendered as
    # markdown, where indented lines could be treated as a code block.
    st.markdown(
        f"""
<style>
.reportview-container .main .block-container{{
max-width: {max_width_px}px;
}}
th {{
text-align: left;
font-size: 110%;
}}
tr:hover {{
background-color: #ffff99;
}}
</style>
""",
        unsafe_allow_html=True,
    )


_max_width_()
def render_features(features):
    """Convert a ``datasets`` feature specification into plain display data.

    Args:
        features: A feature object, a dict mapping names to feature specs, or
            a list/tuple of feature specs.

    Returns:
        * dict -> dict with every value rendered recursively;
        * list / tuple -> list with every item rendered recursively;
        * ``ClassLabel`` -> its ``names``;
        * ``Value`` -> its ``dtype``;
        * ``Sequence`` -> ``{"[]": <rendered inner feature>}``;
        * anything else -> returned unchanged.
    """
    if isinstance(features, dict):
        return {name: render_features(spec) for name, spec in features.items()}
    # Plain lists/tuples of feature specs used to fall through to the final
    # `return features`, leaving raw feature objects inside the output.
    if isinstance(features, (list, tuple)):
        return [render_features(spec) for spec in features]
    if isinstance(features, datasets.features.ClassLabel):
        return features.names
    if isinstance(features, datasets.features.Value):
        return features.dtype
    if isinstance(features, datasets.features.Sequence):
        return {"[]": render_features(features.feature)}
    return features
# Query parameters of the current URL; they carry the previous dataset/config
# selection so that a shared link reopens the same view.
app_state = st.experimental_get_query_params()

# `start` is tested before the sidebar decoration is drawn; it is always True
# by the time that test runs.
start = True
loaded = True  # NOTE(review): not read anywhere in the visible script.

# Dataset preselected when the URL does not name one.
DEFAULT_DATASET = "glue"
INITIAL_SELECTION = ""


def _first_query_value(value):
    """Return the first item of a list-valued query parameter, else ``value``."""
    if isinstance(value, (list, tuple)):
        return value[0] if value else None
    return value


# Collapse the list-valued "dataset" and "config" parameters to plain
# strings, dropping them when they are missing or empty.
for _param in ("dataset", "config"):
    _value = _first_query_value(app_state.get(_param))
    if _value:
        app_state[_param] = _value
    else:
        app_state.pop(_param, None)

# The default used to be stored as a bare string that the single-element
# length check never matched, so it was never used as the initial selection.
app_state.setdefault("dataset", DEFAULT_DATASET)
INITIAL_SELECTION = app_state["dataset"]
print(INITIAL_SELECTION)
if start:
    ## Logo and sidebar decoration.
    # The HTML strings are kept flush-left: they are rendered as markdown.
    st.sidebar.markdown(
        """<center>
<a href="https://github.com/huggingface/datasets">
</a>
</center>""",
        unsafe_allow_html=True,
    )
    st.sidebar.image("datasets_logo_name.png", width=300)
    # The closing tags were previously mis-nested (`</h2></a>`); they now
    # close in the reverse order of opening.
    st.sidebar.markdown(
        "<center><h2><a href='https://github.com/huggingface/datasets'>github/huggingface/datasets</a></h2></center>",
        unsafe_allow_html=True,
    )
    st.sidebar.markdown(
        """
<center>
<a target="_blank" href="https://huggingface.co/docs/datasets/">Docs</a> |
<a target="_blank" href="https://huggingface.co/datasets">Browse</a>
| <a href="https://huggingface.co/new-dataset" target="_blank">Add Dataset</a>
</center>""",
        unsafe_allow_html=True,
    )
    # Empty subheader, drawn after the links block.
    st.sidebar.subheader("")
## Interaction with the datasets library.
# @st.cache
def get_confs(opt):
    """Return the selectable builder configs of dataset ``opt``.

    The dataset script is resolved with ``datasets.load.prepare_module`` and
    its builder class is looked up with ``datasets.load.import_main_class``.
    When ``path_to_datasets`` is set (and ``opt`` is not None) the script is
    looked up under that local prefix, otherwise ``opt`` is passed as is.

    Returns:
        The builder class's ``BUILDER_CONFIGS`` when it holds more than one
        entry, otherwise an empty list.
    """
    use_local_path = path_to_datasets is not None and opt is not None
    path = path_to_datasets + opt if use_local_path else opt
    module_path = datasets.load.prepare_module(path, dataset=True)
    # Only the builder class is needed here; no builder is instantiated.
    builder_cls = datasets.load.import_main_class(module_path[0], dataset=True)
    available = builder_cls.BUILDER_CONFIGS
    if not available or len(available) <= 1:
        return []
    return available
# @st.cache(allow_output_mutation=True)
def get(opt, conf=None):
    """Load dataset ``opt`` (optionally config ``conf``) for browsing.

    Args:
        opt: Dataset name; appended to ``path_to_datasets`` when that is set.
        conf: Name of the builder config to load, or None.

    Returns:
        A ``(dataset, fail)`` tuple. When ``fail`` is False, ``dataset`` is
        the loaded dataset. When ``fail`` is True, ``dataset`` is the builder
        instance itself, so the caller can still report its
        ``info.size_in_bytes`` and ``manual_download_instructions``.
    """
    path = path_to_datasets + opt if path_to_datasets is not None else opt
    module_path = datasets.load.prepare_module(path, dataset=True)
    builder_cls = datasets.load.import_main_class(module_path[0], dataset=True)

    if path_to_datasets is not None:
        # Load the requested config. Previously `conf` was ignored here and
        # the first declared config was always loaded, whatever subset had
        # been selected. The first config remains the fallback.
        config_name = conf
        if not config_name and builder_cls.BUILDER_CONFIGS:
            config_name = builder_cls.BUILDER_CONFIGS[0].name
        return datasets.load_dataset(path, name=config_name), False

    # A builder instance is only needed on this path, so it is created here
    # rather than up front.
    builder_instance = builder_cls(name=conf) if conf else builder_cls()
    browsable = (
        builder_instance.manual_download_instructions is None
        and builder_instance.info.size_in_bytes is not None
        and builder_instance.info.size_in_bytes < MAX_SIZE
    )
    if not browsable:
        return builder_instance, True
    builder_instance.download_and_prepare()
    return builder_instance.as_dataset(), False
# Dataset select box.
if path_to_datasets is None:
    list_of_datasets = datasets.list_datasets(with_community_datasets=False)
else:
    # Keep directories only: `get_confs` and `get` append the chosen name to
    # `path_to_datasets`, so a stray file cannot be a selectable dataset.
    list_of_datasets = sorted(
        entry
        for entry in glob.glob(path_to_datasets + "*")
        if os.path.isdir(entry)
    )
print(list_of_datasets)

# Show only the last path component of each entry.
dataset_names = [entry.split("/")[-1] for entry in list_of_datasets]

# Preselect the dataset named in the URL when it is listed; otherwise fall
# back to the first entry (index 0 is also the selectbox default).
selection = (
    dataset_names.index(INITIAL_SELECTION)
    if INITIAL_SELECTION in dataset_names
    else 0
)
option = st.sidebar.selectbox("Dataset", dataset_names, index=selection)
print(option)
app_state["dataset"] = option
# Written before the configs are resolved so the URL already reflects the
# chosen dataset if that lookup raises.
st.experimental_set_query_params(**app_state)

# Side bar Configurations.
configs = get_confs(option)
conf_avail = len(configs) > 0
conf_option = None
if conf_avail:
    requested_config = app_state.get("config", None)
    # A separate name is used for the index: this used to reuse `start` and
    # so overwrote the module-level `start` flag.
    conf_index = next(
        (i for i, conf in enumerate(configs) if conf.name == requested_config),
        0,
    )
    conf_option = st.sidebar.selectbox(
        "Subset", configs, index=conf_index, format_func=lambda a: a.name
    )
    app_state["config"] = conf_option.name
else:
    # No selectable config: make sure a stale one does not stay in the URL.
    app_state.pop("config", None)
st.experimental_set_query_params(**app_state)

dts, fail = get(str(option), str(conf_option.name) if conf_option else None)
# Main panel setup.
if fail:
    # `dts` is the builder instance here (see `get`), not a loaded dataset.
    st.markdown(
        "Dataset is too large to browse or requires manual download. Check it out in the datasets library! \n\n Size: "
        + str(dts.info.size_in_bytes)
        + "\n\n Instructions: "
        + str(dts.manual_download_instructions)
    )
else:
    # Split selection, defaulting to "train" when the dataset has one.
    split_names = list(dts.keys())
    default_split = split_names.index("train") if "train" in split_names else 0
    split = st.sidebar.selectbox("Split", split_names, index=default_split)
    d = dts[split]
    # Column names come from the dataset itself rather than from its first
    # row, so an empty split does not raise.
    keys = list(d.column_names)

    st.header(
        "Dataset: "
        + option
        + " "
        + (("/ " + conf_option.name) if conf_option else "")
    )
    st.markdown(
        "*Homepage*: "
        + (d.info.homepage or "")
        + "\n\n*Dataset*: https://huggingface.co/datasets/%s" % (option)
    )
    description = (d.info.description or "").replace("\\", "") if option else ""
    st.markdown("\n%s\n" % description)

    # Paging: `step` rows are shown starting at `offset`. Both bounds are
    # clamped so splits with fewer than `step` rows neither produce a
    # negative `max_value` nor index past the end.
    step = 50
    offset = st.sidebar.number_input(
        "Offset (Size: %d)" % len(d),
        min_value=0,
        max_value=max(0, len(d) - step),
        value=0,
        step=step,
    )
    stop = min(offset + step, len(d))

    # The gallery option is only offered for image-classification datasets.
    image_classification = any(
        task_template.task == 'image-classification'
        for task_template in (d.info.task_templates or [])
    )
    gallery = False
    if image_classification:
        st.sidebar.markdown('\n---\n')
        gallery = st.sidebar.checkbox("Show Image Gallery", False)

    # The labels used to end in mis-encoded bytes (garbled emoji); those
    # unreadable suffixes were removed.
    citation = st.sidebar.checkbox("Show Citations", False)
    # For image-classification datasets the list-view checkbox is never
    # drawn (short-circuit) and the table view is used.
    table = image_classification or not st.sidebar.checkbox("Show List View", False)
    show_features = st.sidebar.checkbox("Show Features", True)

    if citation:
        citation_text = (
            (d.info.citation or "")
            .replace("\\", "")
            .replace("}", " }")
            .replace("{", "{ ")
        )
        st.markdown("\n```\n%s\n```\n" % citation_text)

    # Columns to display: everything unless the user narrows it down.
    on_keys = keys
    if show_features:
        if not gallery:
            on_keys = st.multiselect("Features", keys, keys)
        st.write(render_features(d.features))

    if gallery:
        # Image Gallery View (`gallery` can only be set for
        # image-classification datasets).
        d = d.prepare_for_task('image-classification')
        n_cols = 5
        label_feature = d.features['labels']
        images = []
        labels = []
        for item in range(offset, stop):
            row = d[item]
            # `convert` returns a new image, so the file can be closed as
            # soon as the conversion is done.
            with Image.open(row['image_file_path']) as image_file:
                images.append(image_file.convert("RGB"))
            labels.append(f"#{item} | {label_feature.int2str(row['labels'])}")
        # Ceiling division: exactly as many rows as the images need.
        n_rows = -(-len(images) // n_cols)
        # `st.columns` replaced `st.beta_columns`; use whichever the
        # installed Streamlit provides.
        make_columns = getattr(st, "columns", None) or st.beta_columns
        cols = [column for _ in range(n_rows) for column in make_columns(n_cols)]
        for column, image, label in zip(cols, images, labels):
            column.image(image, caption=label)
    elif not table:
        # Full view.
        for item in range(offset, stop):
            row = d[item]  # fetch the row once, not once per column
            st.text(" ")
            st.text(" ---- #" + str(item))
            st.text(" ")
            # Use st to write out.
            for k in on_keys:
                v = row[k]
                st.subheader(k)
                if isinstance(v, str):
                    st.text(textwrap.fill(v, width=120))
                elif isinstance(v, (bool, int, float)):
                    st.text(v)
                else:
                    st.write(v)
    else:
        # Table view. Use Pandas.
        records = []
        for item in range(offset, stop):
            row = d[item]  # fetch the row once, not once per column
            df_item = {"_number": item}
            for k in on_keys:
                v = row[k]
                if isinstance(v, str):
                    df_item[k] = textwrap.fill(v, width=50)
                elif isinstance(v, (bool, int, float)):
                    df_item[k] = v
                else:
                    # Display only: values JSON cannot encode are shown via
                    # `str` instead of aborting the whole table.
                    df_item[k] = json.dumps(
                        v, indent=2, sort_keys=True, default=str
                    )
            records.append(df_item)
        # Explicit columns keep `set_index` working when there are no rows.
        df = pd.DataFrame(records, columns=["_number", *on_keys]).set_index(
            "_number"
        )

        # Table view. Use pandas styling. A single `set_table_styles` call is
        # enough: the earlier extra call was replaced by this one anyway.
        styles = [
            dict(selector="tr:hover", props=[("background-color", "#ffff99")]),
            dict(
                selector="th",
                props=[("font-size", "150%"), ("text-align", "center")],
            ),
            dict(selector="caption", props=[("caption-side", "bottom")]),
        ]
        style = df.style.set_properties(
            **{"text-align": "left", "white-space": "pre"}
        ).set_table_styles(styles)
        st.table(style)
# Sidebar snippet showing how to install `datasets` and load the selection.
# The config name is added as a second argument only when a subset is chosen.
config_argument = f", '{conf_option.name}'" if conf_option else ""
md = f"""
### Code
```python
!pip install datasets
from datasets import load_dataset
dataset = load_dataset(
'{option}'{config_argument})
```
"""
st.sidebar.markdown(md)