lhoestq HF staff committed on
Commit
36212af
·
1 Parent(s): bd73664
Files changed (1) hide show
  1. app.py +61 -16
app.py CHANGED
@@ -35,6 +35,9 @@ css = """
35
  .cell-menu-button {
36
  z-index: -1;
37
  }
 
 
 
38
  """
39
 
40
  def to_json_df(con: Connection, tbl: Table) -> pd.DataFrame:
@@ -70,7 +73,7 @@ with gr.Blocks(css=css) as demo:
70
  loading_codes_json = gr.JSON([], visible=False)
71
  with gr.Row():
72
  with gr.Column():
73
- gr.Markdown("# <p style='text-align:center;'>🤗 (WIP) Hugging Face Dataset Spreadsheets 📝</p>\n\n<p style='text-align:center;'>Edit any dataset on Hugging Face (full list <a href='https://huggingface.co/datasets' target='_blank'>here</a>)")
74
  with gr.Group():
75
  with gr.Tab("Select Dataset"):
76
  with gr.Row():
@@ -82,6 +85,11 @@ with gr.Blocks(css=css) as demo:
82
  with gr.Tab("Use Locally"):
83
  use_locally_markdown = gr.Markdown()
84
  dataframe = gr.DataFrame(to_json_df(memory_con, empty_tbl), interactive=True, wrap=True)
 
 
 
 
 
85
 
86
  def show_subset_dropdown(dataset: str):
87
  if dataset and "/" not in dataset.strip().strip("/"):
@@ -97,7 +105,7 @@ with gr.Blocks(css=css) as demo:
97
  split = (splits or [""])[0]
98
  return dict(choices=splits, value=split, visible=len(splits) > 1, key=hash(str(loading_codes) + subset))
99
 
100
- def show_input_dataframe(dataset: str, subset: str, split: str, loading_codes: list[dict], session: str):
101
  pattern = ([loading_code["arguments"]["splits"][split] for loading_code in loading_codes if loading_code["config_name"] == subset] or [None])[0]
102
  if session and dataset and subset and split and pattern:
103
  duckdb_file = session + ".duckdb"
@@ -105,13 +113,13 @@ with gr.Blocks(css=css) as demo:
105
  setup_edits(con, dataset, pattern)
106
  # Uncomment to have one edit for testing
107
  # con.sql("INSERT OR REPLACE INTO edits SELECT 2 AS rowid, * FROM dataset LIMIT 1")
108
- tbl = con.sql(f"SELECT * FROM edited_dataset LIMIT {PAGE_SIZE}")
109
  return dict(value=to_json_df(con, tbl))
110
  else:
111
  return dict(value=to_json_df(memory_con, empty_tbl))
112
 
113
 
114
- @demo.load(inputs=session_state, outputs=[dataset_dropdown, loading_codes_json, subset_dropdown, split_dropdown, dataframe, session_state, share_link_textbox, use_locally_markdown])
115
  def _fetch_datasets(session: str | None, request: gr.Request):
116
  datasets = list(HfApi().list_datasets(limit=NUM_TRENDING_DATASETS, sort="trendingScore", direction=-1, filter=["format:parquet"]))
117
  session = request.query_params.get(SESSIONS_DIR) or session
@@ -128,7 +136,8 @@ with gr.Blocks(css=css) as demo:
128
  splits = show_split_dropdown(subsets["value"], loading_codes)
129
  splits["value"] = split if session else splits["value"]
130
  session = session if isinstance(session, str) else f"{dataset.replace('/', '--')}--{subsets['value']}--{splits['value']}--{uuid4()}"
131
- input_dataframe = show_input_dataframe(dataset, subsets["value"], splits["value"], loading_codes, session)
 
132
  return {
133
  dataset_dropdown: gr.Dropdown(choices=[dataset.id for dataset in datasets], value=dataset),
134
  loading_codes_json: loading_codes,
@@ -140,45 +149,80 @@ with gr.Blocks(css=css) as demo:
140
  use_locally_markdown: (
141
  f"""In DuckDB:\n\n```sql\nATTACH '{HOST_URL}/gradio_api/file={SESSIONS_DIR}/{session}.duckdb AS db';\nUSE db;\nSELECT * FROM edited_dataset LIMIT 5;\n```\n\n"""
142
  f"""In Python:\n\n```python\nimport duckdb\n\nduckdb.sql("ATTACH '{HOST_URL}/gradio_api/file={SESSIONS_DIR}/{session}.duckdb' AS db")\nduckdb.sql("USE db")\ndf = duckdb.sql("SELECT * FROM edited_dataset LIMIT 5").df()\n```"""
143
- )
 
 
 
144
  }
145
 
146
- @dataset_dropdown.select(inputs=[session_state, dataset_dropdown], outputs=[session_state, loading_codes_json, subset_dropdown, split_dropdown, dataframe])
147
  def _show_subset_dropdown(session: str | None, dataset: str):
148
  subsets, loading_codes = show_subset_dropdown(dataset)
149
  splits = show_split_dropdown(subsets["value"], loading_codes)
150
  session = f"{dataset.replace('/', '--')}--{subsets['value']}--{splits['value']}--{uuid4()}"
151
- input_dataframe = show_input_dataframe(dataset, subsets["value"], splits["value"], loading_codes, session)
 
152
  return {
153
  loading_codes_json: loading_codes,
154
  subset_dropdown: gr.Dropdown(**subsets),
155
  split_dropdown: gr.Dropdown(**splits),
156
  session_state: session,
157
  dataframe: gr.DataFrame(**input_dataframe),
 
 
 
158
  }
159
 
160
- @subset_dropdown.select(inputs=[dataset_dropdown, subset_dropdown, loading_codes_json], outputs=[session_state, split_dropdown, dataframe])
161
  def _show_split_dropdown(dataset: str, subset: str, loading_codes: list[dict]):
162
  splits = show_split_dropdown(subset, loading_codes)
163
  session = f"{dataset.replace('/', '--')}--{subset}--{splits['value']}--{uuid4()}"
164
- input_dataframe = show_input_dataframe(dataset, subset, splits["value"], loading_codes, session)
 
165
  return {
166
  split_dropdown: gr.Dropdown(**splits),
167
  session_state: session,
168
  dataframe: gr.DataFrame(**input_dataframe),
 
 
 
169
  }
170
 
171
- @split_dropdown.select(inputs=[dataset_dropdown, subset_dropdown, split_dropdown, loading_codes_json], outputs=[session_state, dataframe])
172
  def _show_input_dataframe(dataset: str, subset: str, split: str, loading_codes: list[dict]) -> pd.DataFrame:
173
  session = f"{dataset.replace('/', '--')}--{subset}--{split}--{uuid4()}"
174
- input_dataframe = show_input_dataframe(dataset, subset, split, loading_codes, session)
 
175
  return {
176
  session_state: session,
177
  dataframe: gr.DataFrame(**input_dataframe),
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
178
  }
179
 
180
- @dataframe.input(inputs=[dataframe, session_state, dataset_dropdown, subset_dropdown, split_dropdown, loading_codes_json])
181
- def _dataframe_input(df: pd.DataFrame, session: str | None, dataset: str, subset: str, split: str, loading_codes: list[dict]):
182
  pattern = ([loading_code["arguments"]["splits"][split] for loading_code in loading_codes if loading_code["config_name"] == subset] or [None])[0]
183
  if session and dataset and subset and split and pattern:
184
  duckdb_file = session + ".duckdb"
@@ -188,11 +232,12 @@ with gr.Blocks(css=css) as demo:
188
  columns = empty_dataset_tbl.columns
189
  dtypes = empty_dataset_tbl.dtypes
190
  tbl = from_json_df(con, df, columns=columns, dtypes=dtypes)
 
191
  # TODO add edits for page > 1
192
  # Note: Here we don't use INSERT OR REPLACE because of Not implemented Error: List Update is not supported.
193
- con.sql(f"DELETE FROM edits WHERE rowid IN range({len(df)})")
194
  try:
195
- con.sql(f"INSERT INTO edits SELECT * FROM (SELECT unnest(range({len(df)})) AS rowid) POSITIONAL JOIN tbl")
196
  except duckdb.ConversionException as e:
197
  raise gr.Error(str(e).split('\n')[0], title="duckdb.ConversionException")
198
  print(f"Saved {dataset} edits")
 
35
  .cell-menu-button {
36
  z-index: -1;
37
  }
38
+ .centered {
39
+ text-align: center;
40
+ }
41
  """
42
 
43
  def to_json_df(con: Connection, tbl: Table) -> pd.DataFrame:
 
73
  loading_codes_json = gr.JSON([], visible=False)
74
  with gr.Row():
75
  with gr.Column():
76
+ gr.Markdown("# 🤗 (WIP) Hugging Face Dataset Spreadsheets 📝\n\nEdit any dataset on Hugging Face (full list <a href='https://huggingface.co/datasets' target='_blank'>here</a>)", elem_classes="centered")
77
  with gr.Group():
78
  with gr.Tab("Select Dataset"):
79
  with gr.Row():
 
85
  with gr.Tab("Use Locally"):
86
  use_locally_markdown = gr.Markdown()
87
  dataframe = gr.DataFrame(to_json_df(memory_con, empty_tbl), interactive=True, wrap=True)
88
+ with gr.Row():
89
+ prev_button = gr.Button("< Previous", min_width=140, interactive=False)
90
+ with gr.Column(scale=9, min_width=0):
91
+ page_html = gr.HTML("Page 1", elem_classes="centered")
92
+ next_button = gr.Button("Next >", min_width=140)
93
 
94
  def show_subset_dropdown(dataset: str):
95
  if dataset and "/" not in dataset.strip().strip("/"):
 
105
  split = (splits or [""])[0]
106
  return dict(choices=splits, value=split, visible=len(splits) > 1, key=hash(str(loading_codes) + subset))
107
 
108
+ def show_input_dataframe(dataset: str, subset: str, split: str, loading_codes: list[dict], session: str, page: int):
109
  pattern = ([loading_code["arguments"]["splits"][split] for loading_code in loading_codes if loading_code["config_name"] == subset] or [None])[0]
110
  if session and dataset and subset and split and pattern:
111
  duckdb_file = session + ".duckdb"
 
113
  setup_edits(con, dataset, pattern)
114
  # Uncomment to have one edit for testing
115
  # con.sql("INSERT OR REPLACE INTO edits SELECT 2 AS rowid, * FROM dataset LIMIT 1")
116
+ tbl = con.sql(f"SELECT * FROM edited_dataset LIMIT {PAGE_SIZE} OFFSET {(page - 1) * PAGE_SIZE}")
117
  return dict(value=to_json_df(con, tbl))
118
  else:
119
  return dict(value=to_json_df(memory_con, empty_tbl))
120
 
121
 
122
+ @demo.load(inputs=session_state, outputs=[dataset_dropdown, loading_codes_json, subset_dropdown, split_dropdown, dataframe, session_state, share_link_textbox, use_locally_markdown, prev_button, next_button, page_html])
123
  def _fetch_datasets(session: str | None, request: gr.Request):
124
  datasets = list(HfApi().list_datasets(limit=NUM_TRENDING_DATASETS, sort="trendingScore", direction=-1, filter=["format:parquet"]))
125
  session = request.query_params.get(SESSIONS_DIR) or session
 
136
  splits = show_split_dropdown(subsets["value"], loading_codes)
137
  splits["value"] = split if session else splits["value"]
138
  session = session if isinstance(session, str) else f"{dataset.replace('/', '--')}--{subsets['value']}--{splits['value']}--{uuid4()}"
139
+ page = 1
140
+ input_dataframe = show_input_dataframe(dataset, subsets["value"], splits["value"], loading_codes, session, page)
141
  return {
142
  dataset_dropdown: gr.Dropdown(choices=[dataset.id for dataset in datasets], value=dataset),
143
  loading_codes_json: loading_codes,
 
149
  use_locally_markdown: (
150
  f"""In DuckDB:\n\n```sql\nATTACH '{HOST_URL}/gradio_api/file={SESSIONS_DIR}/{session}.duckdb AS db';\nUSE db;\nSELECT * FROM edited_dataset LIMIT 5;\n```\n\n"""
151
  f"""In Python:\n\n```python\nimport duckdb\n\nduckdb.sql("ATTACH '{HOST_URL}/gradio_api/file={SESSIONS_DIR}/{session}.duckdb' AS db")\nduckdb.sql("USE db")\ndf = duckdb.sql("SELECT * FROM edited_dataset LIMIT 5").df()\n```"""
152
+ ),
153
+ prev_button: gr.Button(interactive=False),
154
+ next_button: gr.Button(elem_classes="", interactive=True) if len(input_dataframe["value"]) >= PAGE_SIZE else gr.Button(interactive=False),
155
+ page_html: f"Page {page}",
156
  }
157
 
158
+ @dataset_dropdown.select(inputs=[session_state, dataset_dropdown], outputs=[session_state, loading_codes_json, subset_dropdown, split_dropdown, dataframe, prev_button, next_button, page_html])
159
  def _show_subset_dropdown(session: str | None, dataset: str):
160
  subsets, loading_codes = show_subset_dropdown(dataset)
161
  splits = show_split_dropdown(subsets["value"], loading_codes)
162
  session = f"{dataset.replace('/', '--')}--{subsets['value']}--{splits['value']}--{uuid4()}"
163
+ page = 1
164
+ input_dataframe = show_input_dataframe(dataset, subsets["value"], splits["value"], loading_codes, session, page)
165
  return {
166
  loading_codes_json: loading_codes,
167
  subset_dropdown: gr.Dropdown(**subsets),
168
  split_dropdown: gr.Dropdown(**splits),
169
  session_state: session,
170
  dataframe: gr.DataFrame(**input_dataframe),
171
+ prev_button: gr.Button(interactive=False),
172
+ next_button: gr.Button(elem_classes="", interactive=True) if len(input_dataframe["value"]) >= PAGE_SIZE else gr.Button(interactive=False),
173
+ page_html: f"Page {page}",
174
  }
175
 
176
+ @subset_dropdown.select(inputs=[dataset_dropdown, subset_dropdown, loading_codes_json], outputs=[session_state, split_dropdown, dataframe, prev_button, next_button, page_html])
177
  def _show_split_dropdown(dataset: str, subset: str, loading_codes: list[dict]):
178
  splits = show_split_dropdown(subset, loading_codes)
179
  session = f"{dataset.replace('/', '--')}--{subset}--{splits['value']}--{uuid4()}"
180
+ page = 1
181
+ input_dataframe = show_input_dataframe(dataset, subset, splits["value"], loading_codes, session, page)
182
  return {
183
  split_dropdown: gr.Dropdown(**splits),
184
  session_state: session,
185
  dataframe: gr.DataFrame(**input_dataframe),
186
+ prev_button: gr.Button(interactive=False),
187
+ next_button: gr.Button(elem_classes="", interactive=True) if len(input_dataframe["value"]) >= PAGE_SIZE else gr.Button(interactive=False),
188
+ page_html: f"Page {page}",
189
  }
190
 
191
+ @split_dropdown.select(inputs=[dataset_dropdown, subset_dropdown, split_dropdown, loading_codes_json], outputs=[session_state, dataframe, prev_button, next_button, page_html])
192
  def _show_input_dataframe(dataset: str, subset: str, split: str, loading_codes: list[dict]) -> pd.DataFrame:
193
  session = f"{dataset.replace('/', '--')}--{subset}--{split}--{uuid4()}"
194
+ page = 1
195
+ input_dataframe = show_input_dataframe(dataset, subset, split, loading_codes, session, page)
196
  return {
197
  session_state: session,
198
  dataframe: gr.DataFrame(**input_dataframe),
199
+ prev_button: gr.Button(interactive=False),
200
+ next_button: gr.Button(elem_classes="", interactive=True) if len(input_dataframe["value"]) >= PAGE_SIZE else gr.Button(interactive=False),
201
+ page_html: f"Page {page}",
202
+ }
203
+
204
+ @next_button.click(inputs=[dataset_dropdown, subset_dropdown, split_dropdown, loading_codes_json, session_state, page_html], outputs=[dataframe, prev_button, next_button, page_html])
205
+ def _show_next_page(dataset: str, subset: str, split: str, loading_codes: list[dict], session: str, page_str: str) -> pd.DataFrame:
206
+ page = int(page_str.split(" ")[-1]) + 1
207
+ input_dataframe = show_input_dataframe(dataset, subset, split, loading_codes, session, page)
208
+ return {
209
+ dataframe: gr.DataFrame(**input_dataframe),
210
+ prev_button: gr.Button(elem_classes="", interactive=True),
211
+ page_html: f"Page {page}",
212
+ }
213
+
214
+ @prev_button.click(inputs=[dataset_dropdown, subset_dropdown, split_dropdown, loading_codes_json, session_state, page_html], outputs=[dataframe, prev_button, next_button, page_html])
215
+ def _show_prev_page(dataset: str, subset: str, split: str, loading_codes: list[dict], session: str, page_str: str) -> pd.DataFrame:
216
+ page = int(page_str.split(" ")[-1]) - 1
217
+ input_dataframe = show_input_dataframe(dataset, subset, split, loading_codes, session, page)
218
+ return {
219
+ dataframe: gr.DataFrame(**input_dataframe),
220
+ prev_button: gr.Button(interactive=False) if page == 1 else gr.Button(elem_classes="", interactive=True),
221
+ page_html: f"Page {page}",
222
  }
223
 
224
+ @dataframe.input(inputs=[dataframe, session_state, dataset_dropdown, subset_dropdown, split_dropdown, loading_codes_json, page_html])
225
+ def _dataframe_input(df: pd.DataFrame, session: str | None, dataset: str, subset: str, split: str, loading_codes: list[dict], page_str: str):
226
  pattern = ([loading_code["arguments"]["splits"][split] for loading_code in loading_codes if loading_code["config_name"] == subset] or [None])[0]
227
  if session and dataset and subset and split and pattern:
228
  duckdb_file = session + ".duckdb"
 
232
  columns = empty_dataset_tbl.columns
233
  dtypes = empty_dataset_tbl.dtypes
234
  tbl = from_json_df(con, df, columns=columns, dtypes=dtypes)
235
+ page = int(page_str.split(" ")[-1])
236
  # TODO add edits for page > 1
237
  # Note: Here we don't use INSERT OR REPLACE because of Not implemented Error: List Update is not supported.
238
+ con.sql(f"DELETE FROM edits WHERE rowid IN range({(page - 1) * PAGE_SIZE}, {page * PAGE_SIZE})")
239
  try:
240
+ con.sql(f"INSERT INTO edits SELECT * FROM (SELECT unnest(range({(page - 1) * PAGE_SIZE}, {page * PAGE_SIZE})) AS rowid) POSITIONAL JOIN tbl")
241
  except duckdb.ConversionException as e:
242
  raise gr.Error(str(e).split('\n')[0], title="duckdb.ConversionException")
243
  print(f"Saved {dataset} edits")