lhoestq HF staff committed on
Commit
2c1a98c
·
1 Parent(s): 976e9da

initial app

Browse files
Files changed (2) hide show
  1. README.md +4 -1
  2. app.py +123 -0
README.md CHANGED
@@ -1,12 +1,15 @@
1
  ---
2
  title: Dataset Spreadsheets
3
- emoji: 🌍
4
  colorFrom: purple
5
  colorTo: indigo
6
  sdk: gradio
7
  sdk_version: 5.6.0
8
  app_file: app.py
9
  pinned: false
 
 
 
10
  ---
11
 
12
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
  title: Dataset Spreadsheets
3
+ emoji: 🤗📝
4
  colorFrom: purple
5
  colorTo: indigo
6
  sdk: gradio
7
  sdk_version: 5.6.0
8
  app_file: app.py
9
  pinned: false
10
+ hf_oauth: true
11
+ hf_oauth_scopes:
12
+ - read-repos
13
  ---
14
 
15
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,123 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from functools import lru_cache
2
+
3
+ import duckdb
4
+ import gradio as gr
5
+ import pandas as pd
6
+ import requests
7
+ from duckdb import DuckDBPyRelation
8
+ from duckdb.typing import DuckDBPyType
9
+ from huggingface_hub import HfApi
10
+
11
+ Table = DuckDBPyRelation
12
+ Dtype = DuckDBPyType
13
+ READ_PARQUET_FUNCTIONS = ("dd.read_parquet", "pd.read_parquet")
14
+ EMPTY_TABLE = duckdb.sql("SELECT null as col_1, null as col_2, null as col_3, null as col_4 FROM range(10)")
15
+ PAGE_SIZE = 100
16
+ NUM_TRENDING_DATASETS = 10
17
+ NUM_USER_DATASETS = 10
18
+ css = """
19
+ .transparent-dropdown, .transparent-dropdown .container .wrap, .transparent-accordion {
20
+ background: var(--body-background-fill);
21
+ }
22
+ .gradio-container {
23
+ padding: var(--size-4) 0 !important;
24
+ max-width: 98% !important;
25
+ }
26
+ """
27
+
28
@lru_cache(maxsize=3)
def cached_duckdb_sql(query: str) -> Table:
    """Execute a DuckDB SQL query, memoizing the three most recent results.

    Keyed on the exact query string, so repeated page loads of the same
    dataset/subset/split reuse the already-fetched relation.
    """
    result = duckdb.sql(query)
    return result
31
+
32
def to_json_df(tbl: Table) -> pd.DataFrame:
    """Render every cell of *tbl* as its JSON string form in a pandas DataFrame.

    Each column is wrapped in a one-element list, cast to JSON, and the first
    element extracted as VARCHAR; SQL NULLs (serialized as the string 'null')
    are mapped back to NULL via nullif. This gives a purely-textual, editable
    representation of arbitrarily-typed data.

    FIX: column names are now double-quoted (with embedded quotes doubled) so
    names containing spaces, punctuation or SQL keywords no longer break the
    generated query.
    """
    cols = ['"' + col.replace('"', '""') + '"' for col in tbl.columns]
    query = ", ".join(
        "nullif(([" + col + "]::JSON)[0]::VARCHAR, 'null') as " + col
        for col in cols
    )
    # DuckDB resolves the local variable `tbl` by name (replacement scan).
    return duckdb.sql(f"SELECT {query} FROM tbl").df()
35
+
36
def from_json_df(df: pd.DataFrame, dtypes: list[Dtype]) -> Table:
    """Inverse of to_json_df: parse each JSON-string column of *df* back to *dtypes*.

    NULL cells are replaced by the string 'null' so the JSON cast yields a
    JSON null, which then casts to a SQL NULL of the target dtype.

    FIX: column names are now double-quoted (with embedded quotes doubled) so
    names containing spaces, punctuation or SQL keywords no longer break the
    generated query; the dtype is converted explicitly with str().
    """
    cols = ['"' + str(col).replace('"', '""') + '"' for col in df.columns]
    query = ", ".join(
        "(ifnull(" + col + ", 'null')::JSON)::" + str(dtype) + " as " + col
        for col, dtype in zip(cols, dtypes)
    )
    # DuckDB resolves the local variable `df` by name (replacement scan).
    return duckdb.sql(f"SELECT {query} FROM df")
39
+
40
# NOTE(review): the scrape lost the original indentation; the component
# nesting below is reconstructed from the layout containers — confirm
# against the deployed Space.
with gr.Blocks(css=css) as demo:
    # Hidden state: the datasets-server "loading codes" for the current dataset.
    loading_codes_json = gr.JSON(visible=False)
    with gr.Row():
        with gr.Column():
            gr.Markdown("# <p style='text-align:center;'>πŸ€— (WIP) Hugging Face Dataset Spreadsheets πŸ“</p>\n\n<p style='text-align:center;'>Edit any dataset on Hugging Face (full list <a href='https://huggingface.co/datasets' target='_blank'>here</a>)")
            with gr.Group():
                with gr.Row():
                    # Main dataset picker; subset/split pickers stay hidden
                    # until a dataset has more than one of them.
                    dataset_dropdown = gr.Dropdown(label="Dataset", allow_custom_value=True, scale=10)
                    subset_dropdown = gr.Dropdown(info="Subset", allow_custom_value=True, show_label=False, visible=False, elem_classes="transparent-dropdown")
                    split_dropdown = gr.Dropdown(info="Split", allow_custom_value=True, show_label=False, visible=False, elem_classes="transparent-dropdown")
                gr.LoginButton()
    # Editable spreadsheet view, initialized with the placeholder table.
    dataframe = gr.DataFrame(to_json_df(EMPTY_TABLE), interactive=True, wrap=True)
52
+
53
def show_subset_dropdown(dataset: str):
    """Fetch the parquet-readable subsets (configs) of *dataset*.

    Queries the datasets-server compatible-libraries endpoint and keeps the
    loading codes of the first library that reads parquet.

    Returns:
        A pair ``(dropdown_kwargs, loading_codes)`` — kwargs for the subset
        gr.Dropdown, plus the raw loading codes for later lookups.
    """
    if dataset and "/" not in dataset.strip().strip("/"):
        # Not a valid "namespace/name" repo id.
        # BUG FIX: callers unpack two values; the original returned a bare []
        # here, which raised ValueError on unpacking.
        return dict(choices=[], value="", visible=False, key=None), []
    resp = requests.get(f"https://datasets-server.huggingface.co/compatible-libraries?dataset={dataset}", timeout=3).json()
    loading_codes = ([lib["loading_codes"] for lib in resp.get("libraries", []) if lib["function"] in READ_PARQUET_FUNCTIONS] or [[]])[0] or []
    subsets = [loading_code["config_name"] for loading_code in loading_codes]
    subset = (subsets or [""])[0]
    # key= forces gradio to rebuild the component when the loading codes change.
    return dict(choices=subsets, value=subset, visible=len(subsets) > 1, key=hash(str(loading_codes))), loading_codes
61
+
62
def show_split_dropdown(subset: str, loading_codes: list[dict]):
    """Build kwargs for the split gr.Dropdown of the chosen *subset*.

    Looks up the subset's split names in *loading_codes*; falls back to an
    empty choice list (and empty value) when the subset is unknown.
    """
    matching = [
        list(code["arguments"]["splits"])
        for code in loading_codes
        if code["config_name"] == subset
    ]
    splits = matching[0] if matching else []
    split = splits[0] if splits else ""
    # key= forces gradio to rebuild the component when subset/codes change.
    return dict(
        choices=splits,
        value=split,
        visible=len(splits) > 1,
        key=hash(str(loading_codes) + subset),
    )
66
+
67
def show_input_dataframe(dataset: str, subset: str, split: str, loading_codes: list[dict]):
    """Return gr.DataFrame kwargs with the first PAGE_SIZE rows of the split.

    Resolves the split's parquet glob pattern from *loading_codes*; when any
    of dataset/subset/split/pattern is missing, shows the placeholder table.
    """
    candidates = [
        code["arguments"]["splits"][split]
        for code in loading_codes
        if code["config_name"] == subset
    ]
    pattern = candidates[0] if candidates else None
    if dataset and subset and split and pattern:
        tbl = cached_duckdb_sql(f"SELECT * FROM 'hf://datasets/{dataset}/{pattern}' LIMIT {PAGE_SIZE}")
    else:
        tbl = EMPTY_TABLE
    return dict(value=to_json_df(tbl))
74
+
75
@demo.load(outputs=[dataset_dropdown, loading_codes_json, subset_dropdown, split_dropdown, dataframe])
def _fetch_datasets(request: gr.Request, oauth_token: gr.OAuthToken | None):
    """On page load: populate the dataset dropdown and show the first table.

    Lists trending parquet-readable datasets, adds the logged-in user's own
    datasets when an OAuth token is present, then pre-selects either the
    ``?dataset=`` query parameter or the first listed dataset.
    """
    api = HfApi(token=oauth_token.token if oauth_token else None)
    datasets = list(api.list_datasets(limit=NUM_TRENDING_DATASETS, sort="trendingScore", direction=-1, filter=["format:parquet"]))
    if oauth_token and (user := api.whoami().get("name")):
        datasets += list(api.list_datasets(limit=NUM_USER_DATASETS, sort="trendingScore", direction=-1, filter=["format:parquet"], author=user))
    # NOTE(review): raises IndexError if the Hub returns no datasets — presumably
    # acceptable here; confirm.
    dataset = request.query_params.get("dataset") or datasets[0].id
    subsets, loading_codes = show_subset_dropdown(dataset)
    splits = show_split_dropdown(subsets["value"], loading_codes)
    input_dataframe = show_input_dataframe(dataset, subsets["value"], splits["value"], loading_codes)
    # Map each output component to its update.
    return {
        dataset_dropdown: gr.Dropdown(choices=[dataset.id for dataset in datasets], value=dataset),
        loading_codes_json: loading_codes,
        subset_dropdown: gr.Dropdown(**subsets),
        split_dropdown: gr.Dropdown(**splits),
        dataframe: gr.DataFrame(**input_dataframe),
    }
92
+
93
@dataset_dropdown.select(inputs=dataset_dropdown, outputs=[loading_codes_json, subset_dropdown, split_dropdown, dataframe])
def _show_subset_dropdown(dataset: str):
    """Handler: refresh subset/split dropdowns and the table for a newly selected dataset."""
    subset_kwargs, loading_codes = show_subset_dropdown(dataset)
    split_kwargs = show_split_dropdown(subset_kwargs["value"], loading_codes)
    dataframe_kwargs = show_input_dataframe(dataset, subset_kwargs["value"], split_kwargs["value"], loading_codes)
    updates = {
        loading_codes_json: loading_codes,
        subset_dropdown: gr.Dropdown(**subset_kwargs),
        split_dropdown: gr.Dropdown(**split_kwargs),
        dataframe: gr.DataFrame(**dataframe_kwargs),
    }
    return updates
104
+
105
@subset_dropdown.select(inputs=[dataset_dropdown, subset_dropdown, loading_codes_json], outputs=[split_dropdown, dataframe])
def _show_split_dropdown(dataset: str, subset: str, loading_codes: list[dict]):
    """Handler: refresh the split dropdown and the table after a subset is selected."""
    split_kwargs = show_split_dropdown(subset, loading_codes)
    dataframe_kwargs = show_input_dataframe(dataset, subset, split_kwargs["value"], loading_codes)
    updates = {
        split_dropdown: gr.Dropdown(**split_kwargs),
        dataframe: gr.DataFrame(**dataframe_kwargs),
    }
    return updates
113
+
114
@split_dropdown.select(inputs=[dataset_dropdown, subset_dropdown, split_dropdown, loading_codes_json], outputs=[dataframe])
def _show_input_dataframe(dataset: str, subset: str, split: str, loading_codes: list[dict]) -> dict:
    """Handler: reload the table after a split is selected.

    Note: the original annotation said pd.DataFrame, but the function
    returns a {component: update} dict — annotation corrected.
    """
    input_dataframe = show_input_dataframe(dataset, subset, split, loading_codes)
    return {
        dataframe: gr.DataFrame(**input_dataframe),
    }
120
+
121
+
122
if __name__ == "__main__":
    # Launch the Gradio app when run as a script (not when imported).
    demo.launch()