hynky HF staff committed on
Commit
638184c
·
1 Parent(s): e808f6a
src/logic/data_fetching.py CHANGED
@@ -1,4 +1,4 @@
1
- from functools import partial
2
  import os
3
  import json
4
  import re
@@ -17,19 +17,17 @@ def find_folders(base_folder: str, path: str) -> List[str]:
17
  base_folder_df = get_datafolder(base_folder)
18
  if not base_folder_df.exists(path):
19
  return []
20
- return sorted(
21
- [
22
  folder
23
  for folder,info in base_folder_df.find(path, maxdepth=1, withdirs=True, detail=True).items()
24
  if info["type"] == "directory" and not (folder.rstrip("/") == path.rstrip("/"))
25
  ]
26
- )
27
 
28
- def fetch_datasets(base_folder: str):
29
- datasets = sorted(find_folders(base_folder, ""))
30
  if len(datasets) == 0:
31
  raise ValueError("No datasets found")
32
- return datasets
33
 
34
  def fetch_groups(base_folder: str, datasets: List[str], old_groups: str, type: str = "intersection"):
35
  if not datasets:
 
1
+ from functools import lru_cache, partial
2
  import os
3
  import json
4
  import re
 
17
  base_folder_df = get_datafolder(base_folder)
18
  if not base_folder_df.exists(path):
19
  return []
20
+ return [
 
21
  folder
22
  for folder,info in base_folder_df.find(path, maxdepth=1, withdirs=True, detail=True).items()
23
  if info["type"] == "directory" and not (folder.rstrip("/") == path.rstrip("/"))
24
  ]
 
25
 
26
+ def fetch_datasets(base_folder: str, progress=gr.Progress()):
27
+ datasets = sorted(progress.tqdm(find_folders(base_folder, "")))
28
  if len(datasets) == 0:
29
  raise ValueError("No datasets found")
30
+ return datasets, None
31
 
32
  def fetch_groups(base_folder: str, datasets: List[str], old_groups: str, type: str = "intersection"):
33
  if not datasets:
src/view/help_tab.py CHANGED
@@ -7,10 +7,10 @@ def create_help_tab():
7
 
8
  # Dataset Metrics Explorer
9
  ## Features:
10
- - View metrics for various datasets you computed using datatrove
11
- - Search for metrics across datasets
12
 
13
- ## View metrics Usage:
14
  1) Specify Metrics location (Stats block `output_folder`) and click "Fetch Datasets"
15
  2) Select datasets you are interested in using the dropdown or regex filter
16
  3) Specify Grouping (histogram/summary/fqdn/suffix) and Metric name
@@ -27,7 +27,7 @@ def create_help_tab():
27
  - **summary**: Shows the average value of given metric for every dataset
28
  * show_stds: Show the standard deviation from mean for every dataset
29
 
30
- ## Reverse search Usage:
31
  To search for datasets containing a grouping and certain metric, use the Reverse search section.
32
  Specify the search parameters and click "Search". This will show you found datasets in the "Found datasets" textbox. You can modify the selection after search by removing unwanted lines and clicking "Add to selection".
33
 
 
7
 
8
  # Dataset Metrics Explorer
9
  ## Features:
10
+ - Inspect datasets through various metrics computed using datatrove
11
+ - Search for datasets containing certain metrics
12
 
13
+ ## Metrics View Usage:
14
  1) Specify Metrics location (Stats block `output_folder`) and click "Fetch Datasets"
15
  2) Select datasets you are interested in using the dropdown or regex filter
16
  3) Specify Grouping (histogram/summary/fqdn/suffix) and Metric name
 
27
  - **summary**: Shows the average value of given metric for every dataset
28
  * show_stds: Show the standard deviation from mean for every dataset
29
 
30
+ ## Reverse Metrics Search Usage:
31
  To search for datasets containing a grouping and certain metric, use the Reverse search section.
32
  Specify the search parameters and click "Search". This will show you found datasets in the "Found datasets" textbox. You can modify the selection after search by removing unwanted lines and clicking "Add to selection".
33
 
src/view/metric_view_tab.py CHANGED
@@ -6,12 +6,12 @@ from functools import partial
6
  import re
7
  import json
8
 
9
- from src.logic.data_fetching import fetch_datasets, fetch_graph_data, fetch_groups, fetch_metrics, update_datasets_with_regex
10
  from src.logic.data_processing import export_data
11
  from src.logic.graph_settings import update_graph_options
12
  from src.logic.plotting import plot_data
13
 
14
- def create_metric_view_tab(METRICS_LOCATION_DEFAULT: str, available_datasets: gr.State, selected_datasets: gr.State):
15
  metric_data = gr.State([])
16
 
17
  with gr.Row():
@@ -120,53 +120,40 @@ def create_metric_view_tab(METRICS_LOCATION_DEFAULT: str, available_datasets: gr
120
 
121
 
122
 
123
- def update_selected_datasets_dropdown(available_datasets, selected_datasets):
124
- return gr.Dropdown(choices=available_datasets, value=sorted(selected_datasets))
 
 
125
 
126
 
127
  datasets_fetch.click(
128
  fn=fetch_datasets,
129
  inputs=[base_folder],
130
- outputs=[available_datasets],
131
  )
132
 
133
  available_datasets.change(
134
  fn=update_selected_datasets_dropdown,
135
- inputs=[available_datasets, selected_datasets],
136
  outputs=selected_datasets_dropdown,
137
  )
138
 
139
  regex_button.click(
140
  fn=update_datasets_with_regex,
141
- inputs=[regex_select, selected_datasets, available_datasets],
142
- outputs=selected_datasets,
143
  )
144
 
145
- def update_selected_datasets(selected_datasets_dropdown):
146
- return selected_datasets_dropdown
147
 
148
  selected_datasets_dropdown.change(
149
- fn=update_selected_datasets,
150
- inputs=[selected_datasets_dropdown],
151
- outputs=selected_datasets,
152
- )
153
-
154
- selected_datasets.change(
155
- fn=update_selected_datasets_dropdown,
156
- inputs=[available_datasets, selected_datasets],
157
- outputs=selected_datasets_dropdown,
158
- )
159
-
160
-
161
- selected_datasets.change(
162
  fn=fetch_groups,
163
- inputs=[base_folder, selected_datasets, grouping_dropdown],
164
  outputs=grouping_dropdown,
165
  )
166
 
167
  grouping_dropdown.change(
168
  fn=fetch_metrics,
169
- inputs=[base_folder, selected_datasets, grouping_dropdown, metric_name_dropdown],
170
  outputs=metric_name_dropdown,
171
  )
172
 
@@ -174,7 +161,7 @@ def create_metric_view_tab(METRICS_LOCATION_DEFAULT: str, available_datasets: gr
174
  fn=fetch_graph_data,
175
  inputs=[
176
  base_folder,
177
- selected_datasets,
178
  metric_name_dropdown,
179
  grouping_dropdown,
180
  ],
@@ -219,4 +206,4 @@ def create_metric_view_tab(METRICS_LOCATION_DEFAULT: str, available_datasets: gr
219
  outputs=[export_data_json],
220
  )
221
 
222
- return base_folder
 
6
  import re
7
  import json
8
 
9
+ from src.logic.data_fetching import fetch_datasets, fetch_graph_data, fetch_groups, fetch_metrics, update_datasets_with_regex, update_datasets_with_regex
10
  from src.logic.data_processing import export_data
11
  from src.logic.graph_settings import update_graph_options
12
  from src.logic.plotting import plot_data
13
 
14
+ def create_metric_view_tab(METRICS_LOCATION_DEFAULT: str, available_datasets: gr.State):
15
  metric_data = gr.State([])
16
 
17
  with gr.Row():
 
120
 
121
 
122
 
123
+ def update_selected_datasets_dropdown(available_datasets, selected_datasets_dropdown):
124
+ selected_datasets = selected_datasets_dropdown or []
125
+ selected_datasets = set(selected_datasets) & set(available_datasets)
126
+ return gr.Dropdown(choices=available_datasets, value=sorted(list(selected_datasets)))
127
 
128
 
129
  datasets_fetch.click(
130
  fn=fetch_datasets,
131
  inputs=[base_folder],
132
+ outputs=[available_datasets, selected_datasets_dropdown],
133
  )
134
 
135
  available_datasets.change(
136
  fn=update_selected_datasets_dropdown,
137
+ inputs=[available_datasets, selected_datasets_dropdown],
138
  outputs=selected_datasets_dropdown,
139
  )
140
 
141
  regex_button.click(
142
  fn=update_datasets_with_regex,
143
+ inputs=[regex_select, selected_datasets_dropdown, available_datasets],
144
+ outputs=selected_datasets_dropdown,
145
  )
146
 
 
 
147
 
148
  selected_datasets_dropdown.change(
 
 
 
 
 
 
 
 
 
 
 
 
 
149
  fn=fetch_groups,
150
+ inputs=[base_folder, selected_datasets_dropdown, grouping_dropdown],
151
  outputs=grouping_dropdown,
152
  )
153
 
154
  grouping_dropdown.change(
155
  fn=fetch_metrics,
156
+ inputs=[base_folder, selected_datasets_dropdown, grouping_dropdown, metric_name_dropdown],
157
  outputs=metric_name_dropdown,
158
  )
159
 
 
161
  fn=fetch_graph_data,
162
  inputs=[
163
  base_folder,
164
+ selected_datasets_dropdown,
165
  metric_name_dropdown,
166
  grouping_dropdown,
167
  ],
 
206
  outputs=[export_data_json],
207
  )
208
 
209
+ return base_folder, selected_datasets_dropdown
src/view/reverse_search_tab.py CHANGED
@@ -3,7 +3,7 @@ import gradio as gr
3
 
4
  from src.logic.data_fetching import fetch_groups, fetch_metrics, reverse_search, reverse_search_add
5
 
6
- def create_reverse_search_tab(base_folder: gr.Textbox, datasets_available: gr.State, datasets_selected: gr.State):
7
  reverse_search_headline = gr.Markdown(value="# Reverse Metrics Search")
8
 
9
  with gr.Row():
 
3
 
4
  from src.logic.data_fetching import fetch_groups, fetch_metrics, reverse_search, reverse_search_add
5
 
6
+ def create_reverse_search_tab(base_folder: gr.Textbox, datasets_available: gr.State, datasets_selected: gr.Dropdown):
7
  reverse_search_headline = gr.Markdown(value="# Reverse Metrics Search")
8
 
9
  with gr.Row():
src/view/view.py CHANGED
@@ -11,16 +11,15 @@ METRICS_LOCATION_DEFAULT = os.getenv("METRICS_LOCATION_DEFAULT", "hf://datasets/
11
 
12
  def create_interface():
13
  with gr.Blocks() as demo:
14
- metrics_headline = gr.Markdown(value="# Metrics Exploration")
15
  available_datasets = gr.State([])
16
- selected_datasets = gr.State([])
17
 
18
  with gr.Tabs():
19
  with gr.Tab("Help"):
20
  create_help_tab()
21
 
22
  with gr.TabItem("Metric View"):
23
- base_folder = create_metric_view_tab(METRICS_LOCATION_DEFAULT, available_datasets, selected_datasets)
24
 
25
  with gr.TabItem("Reverse Metrics Search"):
26
  create_reverse_search_tab(base_folder, available_datasets, selected_datasets)
 
11
 
12
  def create_interface():
13
  with gr.Blocks() as demo:
14
+ metrics_headline = gr.Markdown(value="# Datasets Metrics Explorer")
15
  available_datasets = gr.State([])
 
16
 
17
  with gr.Tabs():
18
  with gr.Tab("Help"):
19
  create_help_tab()
20
 
21
  with gr.TabItem("Metric View"):
22
+ base_folder, selected_datasets = create_metric_view_tab(METRICS_LOCATION_DEFAULT, available_datasets)
23
 
24
  with gr.TabItem("Reverse Metrics Search"):
25
  create_reverse_search_tab(base_folder, available_datasets, selected_datasets)