Santosh committed
Commit 2ccb279 · 1 Parent(s): 50f4e89

updated ryan science tags

all_rich_dataset_cards.parquet DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:94ac600eb5100aa7acaeeec3d05becbee7ac11eba9595a0f9e38286879285349
-size 5475858
app.py CHANGED
@@ -1,17 +1,122 @@
+# import gradio as gr
+# import polars as pl
+
+# # Paths or HF Hub URLs for Parquet files
+# RICH_PARQUET_PATH = "all_rich_dataset_cards.parquet"
+# MISSING_PARQUET_PATH = "all_minimal_dataset_cards.parquet"
+
+# ROWS_PER_PAGE = 50
+
+# # Lazy load datasets
+# lazy_rich = pl.scan_parquet(RICH_PARQUET_PATH)
+# lazy_missing = pl.scan_parquet(MISSING_PARQUET_PATH)
+
+# current_lazy_df = lazy_missing  # Default dataset
+
+# # Helper function to fetch a page
+# def get_page(lazy_df: pl.LazyFrame, page: int, column: str = None, query: str = ""):
+#     filtered_df = lazy_df
+#     if column and query:
+#         query_lower = query.lower().strip()
+#         # Case-insensitive search
+#         filtered_df = filtered_df.with_columns([
+#             pl.col(column).cast(pl.Utf8).str.to_lowercase().alias(column)
+#         ]).filter(pl.col(column).str.contains(query_lower, literal=False))
+#     start = page * ROWS_PER_PAGE
+#     page_df = filtered_df.slice(start, ROWS_PER_PAGE).collect().to_pandas()
+#     total_rows = filtered_df.collect().height
+#     total_pages = (total_rows - 1) // ROWS_PER_PAGE + 1
+#     return page_df, total_pages
+
+# # Initialize first page
+# initial_df, total_pages = get_page(current_lazy_df, 0)
+# columns = list(initial_df.columns)
+
+# with gr.Blocks() as demo:
+#     gr.Markdown("## Dataset Insight Portal")
+
+#     # Dataset selection
+#     dataset_select = gr.Dropdown(
+#         choices=["DatasetCards rich in information", "DatasetCards missing information"],
+#         value="DatasetCards missing information",
+#         label="Select Dataset"
+#     )
+
+#     # Pagination controls
+#     with gr.Row():
+#         prev_btn = gr.Button("Previous", elem_id="small-btn")
+#         next_btn = gr.Button("Next", elem_id="small-btn")
+#         page_number = gr.Number(value=0, label="Page", precision=0)
+#         total_pages_display = gr.Label(value=f"Total Pages: {total_pages}")
+
+#     # Data table
+#     data_table = gr.Dataframe(
+#         value=initial_df, headers=columns, datatype="str",
+#         interactive=False, row_count=ROWS_PER_PAGE
+#     )
+
+#     # Column search
+#     with gr.Row():
+#         col_dropdown = gr.Dropdown(choices=columns, label="Column")
+#         search_text = gr.Textbox(label="Search")
+#         search_btn = gr.Button("Search", elem_id="small-btn")
+#         reset_btn = gr.Button("Reset", elem_id="small-btn")
+
+#     # --- Functions ---
+#     def load_dataset(dataset_choice):
+#         global current_lazy_df
+#         current_lazy_df = lazy_rich if dataset_choice == "DatasetCards rich in information" else lazy_missing
+#         initial_df, total_pages = get_page(current_lazy_df, 0)
+#         columns = list(initial_df.columns)
+#         return (
+#             gr.update(value=initial_df, headers=columns),
+#             f"Total Pages: {total_pages}",
+#             0,
+#             gr.update(choices=columns, value=columns[0])
+#         )
+
+#     def next_page_func(page, column, query):
+#         page += 1
+#         page_df, total_pages = get_page(current_lazy_df, page, column, query)
+#         if page >= total_pages:
+#             page = total_pages - 1
+#             page_df, total_pages = get_page(current_lazy_df, page, column, query)
+#         return page_df, f"Total Pages: {total_pages}", page
+
+#     def prev_page_func(page, column, query):
+#         page -= 1
+#         page = max(0, page)
+#         page_df, total_pages = get_page(current_lazy_df, page, column, query)
+#         return page_df, f"Total Pages: {total_pages}", page
+
+#     def search_func(column, query):
+#         page_df, total_pages = get_page(current_lazy_df, 0, column, query)
+#         return page_df, f"Total Pages: {total_pages}", 0
+
+#     def reset_func():
+#         page_df, total_pages = get_page(current_lazy_df, 0)
+#         return page_df, f"Total Pages: {total_pages}", 0
+
+#     # --- Event Listeners ---
+#     dataset_select.change(load_dataset, dataset_select, [data_table, total_pages_display, page_number, col_dropdown])
+#     next_btn.click(next_page_func, [page_number, col_dropdown, search_text], [data_table, total_pages_display, page_number])
+#     prev_btn.click(prev_page_func, [page_number, col_dropdown, search_text], [data_table, total_pages_display, page_number])
+#     search_btn.click(search_func, [col_dropdown, search_text], [data_table, total_pages_display, page_number])
+#     reset_btn.click(reset_func, [], [data_table, total_pages_display, page_number])
+
+# demo.launch()
+
+
 import gradio as gr
 import polars as pl
 
-# Paths or HF Hub URLs for Parquet files
-RICH_PARQUET_PATH = "all_rich_dataset_cards.parquet"
-MISSING_PARQUET_PATH = "all_minimal_dataset_cards.parquet"
+# Path for the combined Parquet file
+COMBINED_PARQUET_PATH = "datasetcards.parquet"
 
 ROWS_PER_PAGE = 50
 
-# Lazy load datasets
-lazy_rich = pl.scan_parquet(RICH_PARQUET_PATH)
-lazy_missing = pl.scan_parquet(MISSING_PARQUET_PATH)
-
-current_lazy_df = lazy_missing  # Default dataset
+# Lazy load dataset
+lazy_df = pl.scan_parquet(COMBINED_PARQUET_PATH)
 
 # Helper function to fetch a page
 def get_page(lazy_df: pl.LazyFrame, page: int, column: str = None, query: str = ""):
@@ -29,18 +134,13 @@ def get_page(lazy_df: pl.LazyFrame, page: int, column: str = None, query: str =
     return page_df, total_pages
 
 # Initialize first page
-initial_df, total_pages = get_page(current_lazy_df, 0)
+initial_df, total_pages = get_page(lazy_df, 0)
 columns = list(initial_df.columns)
 
 with gr.Blocks() as demo:
     gr.Markdown("## Dataset Insight Portal")
-
-    # Dataset selection
-    dataset_select = gr.Dropdown(
-        choices=["DatasetCards rich in information", "DatasetCards missing information"],
-        value="DatasetCards missing information",
-        label="Select Dataset"
-    )
+    gr.Markdown("This space allows you to explore the combined dataset of DatasetCards. "
+                "You can navigate pages, search within columns, and inspect the dataset easily.")
 
     # Pagination controls
     with gr.Row():
@@ -63,17 +163,7 @@ with gr.Blocks() as demo:
         reset_btn = gr.Button("Reset", elem_id="small-btn")
 
     # --- Functions ---
-    def load_dataset(dataset_choice):
-        global current_lazy_df
-        current_lazy_df = lazy_rich if dataset_choice == "DatasetCards rich in information" else lazy_missing
-        initial_df, total_pages = get_page(current_lazy_df, 0)
-        columns = list(initial_df.columns)
-        return (
-            gr.update(value=initial_df, headers=columns),
-            f"Total Pages: {total_pages}",
-            0,
-            gr.update(choices=columns, value=columns[0])
-        )
+    current_lazy_df = lazy_df  # single dataset
 
     def next_page_func(page, column, query):
         page += 1
@@ -98,7 +188,6 @@ with gr.Blocks() as demo:
         return page_df, f"Total Pages: {total_pages}", 0
 
     # --- Event Listeners ---
-    dataset_select.change(load_dataset, dataset_select, [data_table, total_pages_display, page_number, col_dropdown])
     next_btn.click(next_page_func, [page_number, col_dropdown, search_text], [data_table, total_pages_display, page_number])
     prev_btn.click(prev_page_func, [page_number, col_dropdown, search_text], [data_table, total_pages_display, page_number])
     search_btn.click(search_func, [col_dropdown, search_text], [data_table, total_pages_display, page_number])
all_minimal_dataset_cards.parquet → datasetcards.parquet RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:5adb59f94fb6f08f5c0859e21e55ed56ec40f40d9cde349427bf24065e775d60
-size 17318878
+oid sha256:c248074b63bc77b236e8096e3423779f3a5bf4cbe24a2683ea63da31a1c4c154
+size 35038132
ds_missing_sci_data_4k.csv DELETED
The diff for this file is too large to render. See raw diff
 
heuristic_approach.py ADDED
@@ -0,0 +1,277 @@
+# from huggingface_hub import list_datasets, DatasetCard
+# import re
+# import pandas as pd
+# import os
+# import time
+# import random
+# from concurrent.futures import ThreadPoolExecutor, as_completed
+# from requests.exceptions import HTTPError
+
+
+# # ---------- Retry helper ----------
+# def retry_load_card(dataset_id, retries=5, base_wait=60):
+#     """
+#     Try to load a dataset card with retries if 429 (rate limit) occurs.
+#     Uses Retry-After header if available, otherwise exponential backoff.
+#     """
+#     for attempt in range(retries):
+#         try:
+#             return DatasetCard.load(dataset_id)
+#         except HTTPError as e:
+#             if e.response is not None and e.response.status_code == 429:
+#                 wait_time = e.response.headers.get("Retry-After")
+#                 if wait_time is not None:
+#                     wait_time = int(wait_time)
+#                 else:
+#                     wait_time = base_wait * (2 ** attempt) + random.randint(0, 10)
+#                 print(f"[429] Rate limit hit for {dataset_id}. Sleeping {wait_time}s (attempt {attempt+1}/{retries})...")
+#                 time.sleep(wait_time)
+#                 continue
+#             else:
+#                 raise  # don't retry for other HTTP errors
+#         except Exception as e:
+#             print(f"[ERROR] {dataset_id}: {e}")
+#             raise
+#     raise RuntimeError(f"Failed to load {dataset_id} after {retries} retries.")
+
+
+# # ---------- Heuristic functions with reasons ----------
+# def check_card_quality(card_text, metadata, dataset_url):
+#     reasons = []
+#     length = len(card_text)
+#     word_count = len(card_text.split())
+
+#     if metadata is None or len(metadata) == 0:
+#         print(length, word_count, dataset_url)
+#         if length < 200:
+#             reasons.append("No metadata and no description")
+#             return "minimal", reasons, word_count
+#         else:
+#             reasons.append("No metadata but has description")
+#             return "minimal", reasons, word_count
+#     else:
+#         if length < 200:
+#             reasons.append(f"Short description (char count={length}, words={word_count})")
+#             return "minimal", reasons, word_count
+#         else:
+#             return "rich", reasons, word_count
+
+# # ---------- Worker function for one dataset ----------
+# def process_dataset(ds, save_dir):
+#     try:
+#         card = retry_load_card(ds.id)
+#         card_text = card.text or ""
+#         metadata = card.data.to_dict() if card.data else {}
+#         dataset_url = f"https://huggingface.co/datasets/{ds.id}"
+
+#         # Save README locally
+#         readme_path = os.path.join(save_dir, f"{ds.id.replace('/', '__')}_README.md")
+#         with open(readme_path, "w", encoding="utf-8") as f:
+#             f.write(card_text)
+
+#         category, reasons, word_count = check_card_quality(card_text, metadata, dataset_url)
+#         row = {
+#             "dataset_id": ds.id,
+#             "dataset_url": dataset_url,
+#             "downloads": getattr(ds, "downloads", None),
+#             "reason": "; ".join(reasons),
+#             "readme_path": readme_path,
+#             "word_count": word_count,
+#             "category": category,
+#         }
+#         return row
+#     except Exception as e:
+#         return {
+#             "dataset_id": ds.id,
+#             "dataset_url": f"https://huggingface.co/datasets/{ds.id}",
+#             "downloads": getattr(ds, "downloads", None),
+#             "reason": "Failed to load card",
+#             "readme_path": None,
+#             "word_count": 0,
+#             "category": "minimal",
+#         }
+
+
+# # ---------- Main ----------
+# def collect_dataset_ids(limit=1000, save_dir="dataset_readmes", max_workers=16):
+#     minimal_results = []
+#     rich_results = []
+
+#     os.makedirs(save_dir, exist_ok=True)
+
+#     print(f"Fetching up to {limit} datasets (sorted by downloads)...")
+#     datasets = list_datasets()
+
+#     with ThreadPoolExecutor(max_workers=max_workers) as executor:
+#         futures = [executor.submit(process_dataset, ds, save_dir) for ds in datasets]
+#         for i, f in enumerate(as_completed(futures), 1):
+#             row = f.result()
+#             if row["category"] == "minimal":
+#                 minimal_results.append(row)
+#             else:
+#                 rich_results.append(row)
+#     return minimal_results, rich_results
+
+
+# if __name__ == "__main__":
+#     minimal, rich = collect_dataset_ids(limit=1000, max_workers=16)
+
+#     # Save separate CSV files
+#     if minimal:
+#         pd.DataFrame(minimal).to_csv("all_minimal_dataset_cards.csv", index=False)
+#     if rich:
+#         pd.DataFrame(rich).to_csv("all_rich_dataset_cards.csv", index=False)
+
+#     print("\nSaved results to:")
+#     if minimal:
+#         print(" - minimal_dataset_cards.csv")
+#     if rich:
+#         print(" - rich_dataset_cards.csv")
+#     print(" - README files in ./dataset_readmes/")
+
+#     print("\nSummary:")
+#     print(f"Minimal: {len(minimal)}")
+#     print(f"Rich: {len(rich)}")
+
+
+from huggingface_hub import list_datasets, DatasetCard
+import re
+import pandas as pd
+import os
+import time
+import random
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from requests.exceptions import HTTPError
+
+
+# # ---------- Retry helper ----------
+# def retry_load_card(dataset_id, retries=5, base_wait=60):
+#     for attempt in range(retries):
+#         try:
+#             return DatasetCard.load(dataset_id)
+#         except HTTPError as e:
+#             if e.response is not None and e.response.status_code == 429:
+#                 wait_time = e.response.headers.get("Retry-After")
+#                 if wait_time is not None:
+#                     wait_time = int(wait_time)
+#                 else:
+#                     wait_time = base_wait * (2 ** attempt) + random.randint(0, 10)
+#                 print(f"[429] Rate limit hit for {dataset_id}. Sleeping {wait_time}s (attempt {attempt+1}/{retries})...")
+#                 time.sleep(wait_time)
+#                 continue
+#             else:
+#                 raise
+#         except Exception as e:
+#             print(f"[ERROR] {dataset_id}: {e}")
+#             raise
+#     raise RuntimeError(f"Failed to load {dataset_id} after {retries} retries.")
+
+
+# ---------- Heuristic functions with reasons ----------
+def check_card_quality(card_text, metadata, dataset_url):
+    reasons = []
+    length = len(card_text)
+    word_count = len(card_text.split())
+
+    if metadata is None or len(metadata) == 0:
+        print(length, word_count, dataset_url)
+        if length < 200:
+            reasons.append("No metadata and no description")
+            return "minimal", reasons, word_count
+        else:
+            reasons.append("No metadata but has description")
+            return "minimal", reasons, word_count
+    else:
+        if length < 200:
+            reasons.append(f"Short description (char count={length}, words={word_count})")
+            return "minimal", reasons, word_count
+        else:
+            return "rich", reasons, word_count
+
+
+# ---------- Worker function for one dataset ----------
+def process_dataset(ds, save_dir):
+    try:
+        card = DatasetCard.load(ds.id)
+        card_text = card.text or ""
+        metadata = card.data.to_dict() if card.data else {}
+        dataset_url = f"https://huggingface.co/datasets/{ds.id}"
+
+        # Save README locally
+        readme_path = os.path.join(save_dir, f"{ds.id.replace('/', '__')}_README.md")
+        with open(readme_path, "w", encoding="utf-8") as f:
+            f.write(card_text)
+
+        category, reasons, word_count = check_card_quality(card_text, metadata, dataset_url)
+
+        row = {
+            "dataset_id": ds.id,
+            "dataset_url": dataset_url,
+            "downloads": getattr(ds, "downloads", None),
+            "author": metadata.get("author", None),
+            "license": metadata.get("license", None),
+            "tags": ", ".join(metadata.get("tags", [])) if metadata.get("tags") else None,
+            "task_categories": ", ".join(metadata.get("task_categories", [])) if metadata.get("task_categories") else None,
+            "last_modified": getattr(ds, "lastModified", None),
+            "reason": "; ".join(reasons),
+            "readme_path": readme_path,
+            "word_count": word_count,
+            "category": category,
+        }
+        return row
+    except Exception as e:
+        return {
+            "dataset_id": ds.id,
+            "dataset_url": f"https://huggingface.co/datasets/{ds.id}",
+            "downloads": getattr(ds, "downloads", None),
+            "author": None,
+            "license": None,
+            "tags": None,
+            "task_categories": None,
+            "last_modified": None,
+            "reason": "Failed to load card",
+            "readme_path": None,
+            "word_count": 0,
+            "category": "minimal",
+        }
+
+
+# ---------- Main ----------
+def collect_dataset_ids(save_dir="dataset_readmes", max_workers=16):
+    minimal_results = []
+    rich_results = []
+
+    os.makedirs(save_dir, exist_ok=True)
+
+    datasets = list_datasets()
+
+    with ThreadPoolExecutor(max_workers=max_workers) as executor:
+        futures = [executor.submit(process_dataset, ds, save_dir) for ds in datasets]
+        for i, f in enumerate(as_completed(futures), 1):
+            row = f.result()
+            if row["category"] == "minimal":
+                minimal_results.append(row)
+            else:
+                rich_results.append(row)
+    return minimal_results, rich_results
+
+
+if __name__ == "__main__":
+    # collect_dataset_ids no longer takes a limit argument
+    minimal, rich = collect_dataset_ids(max_workers=16)
+
+    # Save separate CSV files
+    if minimal:
+        pd.DataFrame(minimal).to_csv("all_minimal_dataset_cards.csv", index=False)
+    if rich:
+        pd.DataFrame(rich).to_csv("all_rich_dataset_cards.csv", index=False)
+
+    print("\nSaved results to:")
+    if minimal:
+        print(" - minimal_dataset_cards.csv")
+    if rich:
+        print(" - rich_dataset_cards.csv")
+    print(" - README files in ./dataset_readmes/")
+
+    print("\nSummary:")
+    print(f"Minimal: {len(minimal)}")
+    print(f"Rich: {len(rich)}")
preprocessing.ipynb ADDED
@@ -0,0 +1,2098 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "id": "4e64d318",
7
+ "metadata": {},
8
+ "outputs": [
9
+ {
10
+ "name": "stdout",
11
+ "output_type": "stream",
12
+ "text": [
13
+ " dataset_id \\\n",
14
+ "0 akjadhav/leandojo-lean4-formal-informal-strings \n",
15
+ "1 aemska/stuhl \n",
16
+ "2 Pogpotatofarmer/memes \n",
17
+ "3 Splend1dchan/NMSQA_sew-d-tiny-100k-ft-ls100h \n",
18
+ "4 chamisfum/brain_tumor_3_classes \n",
19
+ "\n",
20
+ " dataset_url downloads author \\\n",
21
+ "0 https://huggingface.co/datasets/akjadhav/leand... 22 None \n",
22
+ "1 https://huggingface.co/datasets/aemska/stuhl 11 None \n",
23
+ "2 https://huggingface.co/datasets/Pogpotatofarme... 15 None \n",
24
+ "3 https://huggingface.co/datasets/Splend1dchan/N... 11 None \n",
25
+ "4 https://huggingface.co/datasets/chamisfum/brai... 8 None \n",
26
+ "\n",
27
+ " license tags task_categories last_modified \\\n",
28
+ "0 None None None 2024-01-30 07:40:02+00:00 \n",
29
+ "1 openrail None None 2022-11-11 14:12:36+00:00 \n",
30
+ "2 cc None None 2022-07-15 21:11:34+00:00 \n",
31
+ "3 None None None None \n",
32
+ "4 None None None None \n",
33
+ "\n",
34
+ " reason \\\n",
35
+ "0 No metadata and no description \n",
36
+ "1 Short description (char count=0, words=0) \n",
37
+ "2 Short description (char count=0, words=0) \n",
38
+ "3 Failed to load card \n",
39
+ "4 Failed to load card \n",
40
+ "\n",
41
+ " readme_path word_count category \n",
42
+ "0 dataset_readmes/akjadhav__leandojo-lean4-forma... 0 minimal \n",
43
+ "1 dataset_readmes/aemska__stuhl_README.md 0 minimal \n",
44
+ "2 dataset_readmes/Pogpotatofarmer__memes_README.md 0 minimal \n",
45
+ "3 None 0 minimal \n",
46
+ "4 None 0 minimal \n",
47
+ " dataset_id \\\n",
48
+ "0 autoevaluate/autoeval-staging-eval-launch__gov... \n",
49
+ "1 autoevaluate/autoeval-eval-emotion-default-fe1... \n",
50
+ "2 LTCB/enwik8 \n",
51
+ "3 boltuix/emotions-dataset \n",
52
+ "4 yixuantt/MultiHopRAG \n",
53
+ "\n",
54
+ " dataset_url downloads author \\\n",
55
+ "0 https://huggingface.co/datasets/autoevaluate/a... 8 None \n",
56
+ "1 https://huggingface.co/datasets/autoevaluate/a... 8 None \n",
57
+ "2 https://huggingface.co/datasets/LTCB/enwik8 154 None \n",
58
+ "3 https://huggingface.co/datasets/boltuix/emotio... 754 None \n",
59
+ "4 https://huggingface.co/datasets/yixuantt/Multi... 7050 None \n",
60
+ "\n",
61
+ " license tags \\\n",
62
+ "0 None autotrain, evaluation \n",
63
+ "1 None autotrain, evaluation \n",
64
+ "2 ['mit'] None \n",
65
+ "3 mit emotions, nlp, sentiment-analysis, emotion-cla... \n",
66
+ "4 odc-by None \n",
67
+ "\n",
68
+ " task_categories last_modified reason \\\n",
69
+ "0 None 2022-09-09 07:44:04+00:00 None \n",
70
+ "1 None 2022-09-16 20:22:59+00:00 None \n",
71
+ "2 fill-mask, text-generation 2024-01-18 11:19:13+00:00 None \n",
72
+ "3 None 2025-05-25 15:41:59+00:00 None \n",
73
+ "4 question-answering, feature-extraction 2024-01-30 02:49:29+00:00 None \n",
74
+ "\n",
75
+ " readme_path word_count category \n",
76
+ "0 dataset_readmes/autoevaluate__autoeval-staging... 55 rich \n",
77
+ "1 dataset_readmes/autoevaluate__autoeval-eval-em... 57 rich \n",
78
+ "2 dataset_readmes/LTCB__enwik8_README.md 427 rich \n",
79
+ "3 dataset_readmes/boltuix__emotions-dataset_READ... 1643 rich \n",
80
+ "4 dataset_readmes/yixuantt__MultiHopRAG_README.md 111 rich \n"
81
+ ]
82
+ }
83
+ ],
84
+ "source": [
85
+ "import pandas as pd\n",
86
+ "\n",
87
+ "# Read parquet files\n",
88
+ "df1 = pd.read_parquet(\"/home/santosh/Repositories/personal/huggingface/dataset-insight-portal/all_minimal_dataset_cards.parquet\")\n",
89
+ "df2 = pd.read_parquet(\"/home/santosh/Repositories/personal/huggingface/dataset-insight-portal/all_rich_dataset_cards.parquet\")\n",
90
+ "\n",
91
+ "# Display first few rows\n",
92
+ "print(df1.head())\n",
93
+ "print(df2.head())"
94
+ ]
95
+ },
96
+ {
97
+ "cell_type": "code",
98
+ "execution_count": 2,
99
+ "id": "e9a20931",
100
+ "metadata": {},
101
+ "outputs": [
102
+ {
103
+ "data": {
104
+ "text/html": [
105
+ "<div>\n",
106
+ "<style scoped>\n",
107
+ " .dataframe tbody tr th:only-of-type {\n",
108
+ " vertical-align: middle;\n",
109
+ " }\n",
110
+ "\n",
111
+ " .dataframe tbody tr th {\n",
112
+ " vertical-align: top;\n",
113
+ " }\n",
114
+ "\n",
115
+ " .dataframe thead th {\n",
116
+ " text-align: right;\n",
117
+ " }\n",
118
+ "</style>\n",
119
+ "<table border=\"1\" class=\"dataframe\">\n",
120
+ " <thead>\n",
121
+ " <tr style=\"text-align: right;\">\n",
122
+ " <th></th>\n",
123
+ " <th>dataset_id</th>\n",
124
+ " <th>dataset_url</th>\n",
125
+ " <th>downloads</th>\n",
126
+ " <th>author</th>\n",
127
+ " <th>license</th>\n",
128
+ " <th>tags</th>\n",
129
+ " <th>task_categories</th>\n",
130
+ " <th>last_modified</th>\n",
131
+ " <th>reason</th>\n",
132
+ " <th>readme_path</th>\n",
133
+ " <th>word_count</th>\n",
134
+ " <th>category</th>\n",
135
+ " </tr>\n",
136
+ " </thead>\n",
137
+ " <tbody>\n",
138
+ " <tr>\n",
139
+ " <th>0</th>\n",
140
+ " <td>akjadhav/leandojo-lean4-formal-informal-strings</td>\n",
141
+ " <td>https://huggingface.co/datasets/akjadhav/leand...</td>\n",
142
+ " <td>22</td>\n",
143
+ " <td>None</td>\n",
144
+ " <td>None</td>\n",
145
+ " <td>None</td>\n",
146
+ " <td>None</td>\n",
147
+ " <td>2024-01-30 07:40:02+00:00</td>\n",
148
+ " <td>No metadata and no description</td>\n",
149
+ " <td>dataset_readmes/akjadhav__leandojo-lean4-forma...</td>\n",
150
+ " <td>0</td>\n",
151
+ " <td>minimal</td>\n",
152
+ " </tr>\n",
153
+ " <tr>\n",
154
+ " <th>1</th>\n",
155
+ " <td>aemska/stuhl</td>\n",
156
+ " <td>https://huggingface.co/datasets/aemska/stuhl</td>\n",
157
+ " <td>11</td>\n",
158
+ " <td>None</td>\n",
159
+ " <td>openrail</td>\n",
160
+ " <td>None</td>\n",
161
+ " <td>None</td>\n",
162
+ " <td>2022-11-11 14:12:36+00:00</td>\n",
163
+ " <td>Short description (char count=0, words=0)</td>\n",
164
+ " <td>dataset_readmes/aemska__stuhl_README.md</td>\n",
165
+ " <td>0</td>\n",
166
+ " <td>minimal</td>\n",
167
+ " </tr>\n",
168
+ " <tr>\n",
169
+ " <th>2</th>\n",
170
+ " <td>Pogpotatofarmer/memes</td>\n",
171
+ " <td>https://huggingface.co/datasets/Pogpotatofarme...</td>\n",
172
+ " <td>15</td>\n",
173
+ " <td>None</td>\n",
174
+ " <td>cc</td>\n",
175
+ " <td>None</td>\n",
176
+ " <td>None</td>\n",
177
+ " <td>2022-07-15 21:11:34+00:00</td>\n",
178
+ " <td>Short description (char count=0, words=0)</td>\n",
179
+ " <td>dataset_readmes/Pogpotatofarmer__memes_README.md</td>\n",
180
+ " <td>0</td>\n",
181
+ " <td>minimal</td>\n",
182
+ " </tr>\n",
183
+ " <tr>\n",
184
+ " <th>3</th>\n",
185
+ " <td>Splend1dchan/NMSQA_sew-d-tiny-100k-ft-ls100h</td>\n",
186
+ " <td>https://huggingface.co/datasets/Splend1dchan/N...</td>\n",
187
+ " <td>11</td>\n",
188
+ " <td>None</td>\n",
189
+ " <td>None</td>\n",
190
+ " <td>None</td>\n",
191
+ " <td>None</td>\n",
192
+ " <td>None</td>\n",
193
+ " <td>Failed to load card</td>\n",
194
+ " <td>None</td>\n",
195
+ " <td>0</td>\n",
196
+ " <td>minimal</td>\n",
197
+ " </tr>\n",
198
+ " <tr>\n",
199
+ " <th>4</th>\n",
200
+ " <td>chamisfum/brain_tumor_3_classes</td>\n",
201
+ " <td>https://huggingface.co/datasets/chamisfum/brai...</td>\n",
202
+ " <td>8</td>\n",
203
+ " <td>None</td>\n",
204
+ " <td>None</td>\n",
205
+ " <td>None</td>\n",
206
+ " <td>None</td>\n",
207
+ " <td>None</td>\n",
208
+ " <td>Failed to load card</td>\n",
209
+ " <td>None</td>\n",
210
+ " <td>0</td>\n",
211
+ " <td>minimal</td>\n",
212
+ " </tr>\n",
213
+ " <tr>\n",
214
+ " <th>...</th>\n",
215
+ " <td>...</td>\n",
216
+ " <td>...</td>\n",
217
+ " <td>...</td>\n",
218
+ " <td>...</td>\n",
219
+ " <td>...</td>\n",
220
+ " <td>...</td>\n",
221
+ " <td>...</td>\n",
222
+ " <td>...</td>\n",
223
+ " <td>...</td>\n",
224
+ " <td>...</td>\n",
225
+ " <td>...</td>\n",
226
+ " <td>...</td>\n",
227
+ " </tr>\n",
228
+ " <tr>\n",
229
+ " <th>400292</th>\n",
230
+ " <td>TAUR-dev/D-EVAL__standard_eval_v3__RC_BF_ab-bo...</td>\n",
231
+ " <td>https://huggingface.co/datasets/TAUR-dev/D-EVA...</td>\n",
232
+ " <td>0</td>\n",
233
+ " <td>None</td>\n",
234
+ " <td>None</td>\n",
235
+ " <td>None</td>\n",
236
+ " <td>None</td>\n",
237
+ " <td>2025-09-19 06:27:52+00:00</td>\n",
238
+ " <td>Short description (char count=0, words=0)</td>\n",
239
+ " <td>dataset_readmes/TAUR-dev__D-EVAL__standard_eva...</td>\n",
240
+ " <td>0</td>\n",
241
+ " <td>minimal</td>\n",
242
+ " </tr>\n",
243
+ " <tr>\n",
244
+ " <th>400293</th>\n",
245
+ " <td>TAUR-dev/D-EVAL__standard_eval_v3__RC_BF_ab-bo...</td>\n",
246
+ " <td>https://huggingface.co/datasets/TAUR-dev/D-EVA...</td>\n",
247
+ " <td>0</td>\n",
248
+ " <td>None</td>\n",
249
+ " <td>None</td>\n",
250
+ " <td>None</td>\n",
251
+ " <td>None</td>\n",
252
+ " <td>2025-09-19 06:28:16+00:00</td>\n",
253
+ " <td>Short description (char count=0, words=0)</td>\n",
254
+ " <td>dataset_readmes/TAUR-dev__D-EVAL__standard_eva...</td>\n",
255
+ " <td>0</td>\n",
256
+ " <td>minimal</td>\n",
257
+ " </tr>\n",
258
+ " <tr>\n",
259
+ " <th>400294</th>\n",
260
+ " <td>haru101/Minecraft-Knowledge-Dataset</td>\n",
261
+ " <td>https://huggingface.co/datasets/haru101/Minecr...</td>\n",
262
+ " <td>0</td>\n",
263
+ " <td>None</td>\n",
264
+ " <td>apache-2.0</td>\n",
265
+ " <td>None</td>\n",
266
+ " <td>question-answering</td>\n",
267
+ " <td>2025-09-19 06:33:33+00:00</td>\n",
268
+ " <td>Short description (char count=0, words=0)</td>\n",
269
+ " <td>dataset_readmes/haru101__Minecraft-Knowledge-D...</td>\n",
270
+ " <td>0</td>\n",
271
+ " <td>minimal</td>\n",
272
+ " </tr>\n",
273
+ " <tr>\n",
274
+ " <th>400295</th>\n",
275
+ " <td>sxj1215/mmimdb_sorted_with_label_2</td>\n",
276
+ " <td>https://huggingface.co/datasets/sxj1215/mmimdb...</td>\n",
277
+ " <td>0</td>\n",
278
+ " <td>None</td>\n",
279
+ " <td>None</td>\n",
280
+ " <td>None</td>\n",
281
+ " <td>None</td>\n",
282
+ " <td>2025-09-19 06:35:25+00:00</td>\n",
283
+ " <td>Short description (char count=0, words=0)</td>\n",
284
+ " <td>dataset_readmes/sxj1215__mmimdb_sorted_with_la...</td>\n",
285
+ " <td>0</td>\n",
286
+ " <td>minimal</td>\n",
287
+ " </tr>\n",
288
+ " <tr>\n",
289
+ " <th>400296</th>\n",
290
+ " <td>Vikir2411CS19/Multimodal_Complaint</td>\n",
291
+ " <td>https://huggingface.co/datasets/Vikir2411CS19/...</td>\n",
292
+ " <td>0</td>\n",
293
+ " <td>None</td>\n",
294
+ " <td>None</td>\n",
295
+ " <td>None</td>\n",
296
+ " <td>None</td>\n",
297
+ " <td>2025-09-19 06:35:01+00:00</td>\n",
298
+ " <td>Short description (char count=0, words=0)</td>\n",
299
+ " <td>dataset_readmes/Vikir2411CS19__Multimodal_Comp...</td>\n",
300
+ " <td>0</td>\n",
301
+ " <td>minimal</td>\n",
302
+ " </tr>\n",
303
+ " </tbody>\n",
304
+ "</table>\n",
305
+ "<p>400297 rows × 12 columns</p>\n",
306
+ "</div>"
307
+ ],
308
+ "text/plain": [
309
+ " dataset_id \\\n",
310
+ "0 akjadhav/leandojo-lean4-formal-informal-strings \n",
311
+ "1 aemska/stuhl \n",
312
+ "2 Pogpotatofarmer/memes \n",
313
+ "3 Splend1dchan/NMSQA_sew-d-tiny-100k-ft-ls100h \n",
314
+ "4 chamisfum/brain_tumor_3_classes \n",
315
+ "... ... \n",
316
+ "400292 TAUR-dev/D-EVAL__standard_eval_v3__RC_BF_ab-bo... \n",
317
+ "400293 TAUR-dev/D-EVAL__standard_eval_v3__RC_BF_ab-bo... \n",
318
+ "400294 haru101/Minecraft-Knowledge-Dataset \n",
319
+ "400295 sxj1215/mmimdb_sorted_with_label_2 \n",
320
+ "400296 Vikir2411CS19/Multimodal_Complaint \n",
321
+ "\n",
322
+ " dataset_url downloads author \\\n",
323
+ "0 https://huggingface.co/datasets/akjadhav/leand... 22 None \n",
324
+ "1 https://huggingface.co/datasets/aemska/stuhl 11 None \n",
325
+ "2 https://huggingface.co/datasets/Pogpotatofarme... 15 None \n",
326
+ "3 https://huggingface.co/datasets/Splend1dchan/N... 11 None \n",
327
+ "4 https://huggingface.co/datasets/chamisfum/brai... 8 None \n",
328
+ "... ... ... ... \n",
329
+ "400292 https://huggingface.co/datasets/TAUR-dev/D-EVA... 0 None \n",
330
+ "400293 https://huggingface.co/datasets/TAUR-dev/D-EVA... 0 None \n",
331
+ "400294 https://huggingface.co/datasets/haru101/Minecr... 0 None \n",
332
+ "400295 https://huggingface.co/datasets/sxj1215/mmimdb... 0 None \n",
333
+ "400296 https://huggingface.co/datasets/Vikir2411CS19/... 0 None \n",
334
+ "\n",
335
+ " license tags task_categories last_modified \\\n",
336
+ "0 None None None 2024-01-30 07:40:02+00:00 \n",
337
+ "1 openrail None None 2022-11-11 14:12:36+00:00 \n",
338
+ "2 cc None None 2022-07-15 21:11:34+00:00 \n",
339
+ "3 None None None None \n",
340
+ "4 None None None None \n",
341
+ "... ... ... ... ... \n",
342
+ "400292 None None None 2025-09-19 06:27:52+00:00 \n",
343
+ "400293 None None None 2025-09-19 06:28:16+00:00 \n",
344
+ "400294 apache-2.0 None question-answering 2025-09-19 06:33:33+00:00 \n",
345
+ "400295 None None None 2025-09-19 06:35:25+00:00 \n",
346
+ "400296 None None None 2025-09-19 06:35:01+00:00 \n",
347
+ "\n",
348
+ " reason \\\n",
349
+ "0 No metadata and no description \n",
350
+ "1 Short description (char count=0, words=0) \n",
351
+ "2 Short description (char count=0, words=0) \n",
352
+ "3 Failed to load card \n",
353
+ "4 Failed to load card \n",
354
+ "... ... \n",
355
+ "400292 Short description (char count=0, words=0) \n",
356
+ "400293 Short description (char count=0, words=0) \n",
357
+ "400294 Short description (char count=0, words=0) \n",
358
+ "400295 Short description (char count=0, words=0) \n",
359
+ "400296 Short description (char count=0, words=0) \n",
360
+ "\n",
361
+ " readme_path word_count category \n",
362
+ "0 dataset_readmes/akjadhav__leandojo-lean4-forma... 0 minimal \n",
363
+ "1 dataset_readmes/aemska__stuhl_README.md 0 minimal \n",
364
+ "2 dataset_readmes/Pogpotatofarmer__memes_README.md 0 minimal \n",
365
+ "3 None 0 minimal \n",
366
+ "4 None 0 minimal \n",
367
+ "... ... ... ... \n",
368
+ "400292 dataset_readmes/TAUR-dev__D-EVAL__standard_eva... 0 minimal \n",
369
+ "400293 dataset_readmes/TAUR-dev__D-EVAL__standard_eva... 0 minimal \n",
370
+ "400294 dataset_readmes/haru101__Minecraft-Knowledge-D... 0 minimal \n",
371
+ "400295 dataset_readmes/sxj1215__mmimdb_sorted_with_la... 0 minimal \n",
372
+ "400296 dataset_readmes/Vikir2411CS19__Multimodal_Comp... 0 minimal \n",
373
+ "\n",
374
+ "[400297 rows x 12 columns]"
375
+ ]
376
+ },
377
+ "execution_count": 2,
378
+ "metadata": {},
379
+ "output_type": "execute_result"
380
+ }
381
+ ],
382
+ "source": [
383
+ "df1"
384
+ ]
385
+ },
386
+ {
387
+ "cell_type": "code",
388
+ "execution_count": 4,
389
+ "id": "b5582c36",
390
+ "metadata": {},
391
+ "outputs": [
392
+ {
393
+ "data": {
394
+ "text/html": [
395
+ "<div>\n",
396
+ "<style scoped>\n",
397
+ " .dataframe tbody tr th:only-of-type {\n",
398
+ " vertical-align: middle;\n",
399
+ " }\n",
400
+ "\n",
401
+ " .dataframe tbody tr th {\n",
402
+ " vertical-align: top;\n",
403
+ " }\n",
404
+ "\n",
405
+ " .dataframe thead th {\n",
406
+ " text-align: right;\n",
407
+ " }\n",
408
+ "</style>\n",
409
+ "<table border=\"1\" class=\"dataframe\">\n",
410
+ " <thead>\n",
411
+ " <tr style=\"text-align: right;\">\n",
412
+ " <th></th>\n",
413
+ " <th>id</th>\n",
414
+ " <th>url</th>\n",
415
+ " <th>field</th>\n",
416
+ " <th>keyword</th>\n",
417
+ " <th>missing_readme</th>\n",
418
+ " <th>missing_card</th>\n",
419
+ " </tr>\n",
420
+ " </thead>\n",
421
+ " <tbody>\n",
422
+ " <tr>\n",
423
+ " <th>0</th>\n",
424
+ " <td>solomonk/reddit_mental_health_posts</td>\n",
425
+ " <td>https://huggingface.co/datasets/solomonk/reddi...</td>\n",
426
+ " <td>life_sciences</td>\n",
427
+ " <td>health</td>\n",
428
+ " <td>False</td>\n",
429
+ " <td>True</td>\n",
430
+ " </tr>\n",
431
+ " <tr>\n",
432
+ " <th>1</th>\n",
433
+ " <td>Kira-Asimov/gender_clinical_trial</td>\n",
434
+ " <td>https://huggingface.co/datasets/Kira-Asimov/ge...</td>\n",
435
+ " <td>life_sciences</td>\n",
436
+ " <td>clinical</td>\n",
437
+ " <td>False</td>\n",
438
+ " <td>True</td>\n",
439
+ " </tr>\n",
440
+ " <tr>\n",
441
+ " <th>2</th>\n",
442
+ " <td>samhog/psychology-6k</td>\n",
443
+ " <td>https://huggingface.co/datasets/samhog/psychol...</td>\n",
444
+ " <td>life_sciences</td>\n",
445
+ " <td>psychology</td>\n",
446
+ " <td>True</td>\n",
447
+ " <td>True</td>\n",
448
+ " </tr>\n",
449
+ " <tr>\n",
450
+ " <th>3</th>\n",
451
+ " <td>TCMLM/real_clinical_cases_of_Famous_Old_TCM_Do...</td>\n",
452
+ " <td>https://huggingface.co/datasets/TCMLM/real_cli...</td>\n",
453
+ " <td>life_sciences</td>\n",
454
+ " <td>clinical</td>\n",
455
+ " <td>False</td>\n",
456
+ " <td>True</td>\n",
457
+ " </tr>\n",
458
+ " <tr>\n",
459
+ " <th>4</th>\n",
460
+ " <td>jibrand/plant-dataset-JSONL</td>\n",
461
+ " <td>https://huggingface.co/datasets/jibrand/plant-...</td>\n",
462
+ " <td>agriculture_and_biology</td>\n",
463
+ " <td>plant</td>\n",
464
+ " <td>True</td>\n",
465
+ " <td>True</td>\n",
466
+ " </tr>\n",
467
+ " <tr>\n",
468
+ " <th>...</th>\n",
469
+ " <td>...</td>\n",
470
+ " <td>...</td>\n",
471
+ " <td>...</td>\n",
472
+ " <td>...</td>\n",
473
+ " <td>...</td>\n",
474
+ " <td>...</td>\n",
475
+ " </tr>\n",
476
+ " <tr>\n",
477
+ " <th>4035</th>\n",
478
+ " <td>AshwinManohar/medicine_normalizer_alpaca</td>\n",
479
+ " <td>https://huggingface.co/datasets/AshwinManohar/...</td>\n",
480
+ " <td>life_sciences</td>\n",
481
+ " <td>medicine</td>\n",
482
+ " <td>True</td>\n",
483
+ " <td>True</td>\n",
484
+ " </tr>\n",
485
+ " <tr>\n",
486
+ " <th>4036</th>\n",
487
+ " <td>AshwinManohar/medicine_parser_alpaca</td>\n",
488
+ " <td>https://huggingface.co/datasets/AshwinManohar/...</td>\n",
489
+ " <td>life_sciences</td>\n",
490
+ " <td>medicine</td>\n",
491
+ " <td>True</td>\n",
492
+ " <td>True</td>\n",
493
+ " </tr>\n",
494
+ " <tr>\n",
495
+ " <th>4037</th>\n",
496
+ " <td>AshwinManohar/medicine_normalizer_alpaca_20k</td>\n",
497
+ " <td>https://huggingface.co/datasets/AshwinManohar/...</td>\n",
498
+ " <td>life_sciences</td>\n",
499
+ " <td>medicine</td>\n",
500
+ " <td>True</td>\n",
501
+ " <td>True</td>\n",
502
+ " </tr>\n",
503
+ " <tr>\n",
504
+ " <th>4038</th>\n",
505
+ " <td>Adithyaaaa/plant_leaf_classification</td>\n",
506
+ " <td>https://huggingface.co/datasets/Adithyaaaa/pla...</td>\n",
507
+ " <td>agriculture_and_biology</td>\n",
508
+ " <td>plant</td>\n",
509
+ " <td>True</td>\n",
510
+ " <td>True</td>\n",
511
+ " </tr>\n",
512
+ " <tr>\n",
513
+ " <th>4039</th>\n",
514
+ " <td>benali-ai-24/drug-data-public</td>\n",
515
+ " <td>https://huggingface.co/datasets/benali-ai-24/d...</td>\n",
516
+ " <td>life_sciences</td>\n",
517
+ " <td>drug</td>\n",
518
+ " <td>True</td>\n",
519
+ " <td>True</td>\n",
520
+ " </tr>\n",
521
+ " </tbody>\n",
522
+ "</table>\n",
523
+ "<p>4040 rows × 6 columns</p>\n",
524
+ "</div>"
525
+ ],
526
+ "text/plain": [
527
+ " id \\\n",
528
+ "0 solomonk/reddit_mental_health_posts \n",
529
+ "1 Kira-Asimov/gender_clinical_trial \n",
530
+ "2 samhog/psychology-6k \n",
531
+ "3 TCMLM/real_clinical_cases_of_Famous_Old_TCM_Do... \n",
532
+ "4 jibrand/plant-dataset-JSONL \n",
533
+ "... ... \n",
534
+ "4035 AshwinManohar/medicine_normalizer_alpaca \n",
535
+ "4036 AshwinManohar/medicine_parser_alpaca \n",
536
+ "4037 AshwinManohar/medicine_normalizer_alpaca_20k \n",
537
+ "4038 Adithyaaaa/plant_leaf_classification \n",
538
+ "4039 benali-ai-24/drug-data-public \n",
539
+ "\n",
540
+ " url \\\n",
541
+ "0 https://huggingface.co/datasets/solomonk/reddi... \n",
542
+ "1 https://huggingface.co/datasets/Kira-Asimov/ge... \n",
543
+ "2 https://huggingface.co/datasets/samhog/psychol... \n",
544
+ "3 https://huggingface.co/datasets/TCMLM/real_cli... \n",
545
+ "4 https://huggingface.co/datasets/jibrand/plant-... \n",
546
+ "... ... \n",
547
+ "4035 https://huggingface.co/datasets/AshwinManohar/... \n",
548
+ "4036 https://huggingface.co/datasets/AshwinManohar/... \n",
549
+ "4037 https://huggingface.co/datasets/AshwinManohar/... \n",
550
+ "4038 https://huggingface.co/datasets/Adithyaaaa/pla... \n",
551
+ "4039 https://huggingface.co/datasets/benali-ai-24/d... \n",
552
+ "\n",
553
+ " field keyword missing_readme missing_card \n",
554
+ "0 life_sciences health False True \n",
555
+ "1 life_sciences clinical False True \n",
556
+ "2 life_sciences psychology True True \n",
557
+ "3 life_sciences clinical False True \n",
558
+ "4 agriculture_and_biology plant True True \n",
559
+ "... ... ... ... ... \n",
560
+ "4035 life_sciences medicine True True \n",
561
+ "4036 life_sciences medicine True True \n",
562
+ "4037 life_sciences medicine True True \n",
563
+ "4038 agriculture_and_biology plant True True \n",
564
+ "4039 life_sciences drug True True \n",
565
+ "\n",
566
+ "[4040 rows x 6 columns]"
567
+ ]
568
+ },
569
+ "execution_count": 4,
570
+ "metadata": {},
571
+ "output_type": "execute_result"
572
+ }
573
+ ],
574
+ "source": [
575
+ "csv_df = pd.read_csv(\"/home/santosh/Repositories/personal/huggingface/dataset-insight-portal/ds_missing_sci_data_4k.csv\")\n",
576
+ "csv_df"
577
+ ]
578
+ },
579
+ {
580
+ "cell_type": "code",
581
+ "execution_count": 6,
582
+ "id": "a061659a",
583
+ "metadata": {},
584
+ "outputs": [
585
+ {
586
+ "data": {
587
+ "text/html": [
588
+ "<div>\n",
589
+ "<style scoped>\n",
590
+ " .dataframe tbody tr th:only-of-type {\n",
591
+ " vertical-align: middle;\n",
592
+ " }\n",
593
+ "\n",
594
+ " .dataframe tbody tr th {\n",
595
+ " vertical-align: top;\n",
596
+ " }\n",
597
+ "\n",
598
+ " .dataframe thead th {\n",
599
+ " text-align: right;\n",
600
+ " }\n",
601
+ "</style>\n",
602
+ "<table border=\"1\" class=\"dataframe\">\n",
603
+ " <thead>\n",
604
+ " <tr style=\"text-align: right;\">\n",
605
+ " <th></th>\n",
606
+ " <th>dataset_id</th>\n",
607
+ " <th>dataset_url</th>\n",
608
+ " <th>downloads</th>\n",
609
+ " <th>author</th>\n",
610
+ " <th>license</th>\n",
611
+ " <th>tags</th>\n",
612
+ " <th>task_categories</th>\n",
613
+ " <th>last_modified</th>\n",
614
+ " <th>reason</th>\n",
615
+ " <th>readme_path</th>\n",
616
+ " <th>word_count</th>\n",
617
+ " <th>category</th>\n",
618
+ " </tr>\n",
619
+ " </thead>\n",
620
+ " <tbody>\n",
621
+ " <tr>\n",
622
+ " <th>0</th>\n",
623
+ " <td>akjadhav/leandojo-lean4-formal-informal-strings</td>\n",
624
+ " <td>https://huggingface.co/datasets/akjadhav/leand...</td>\n",
625
+ " <td>22</td>\n",
626
+ " <td>None</td>\n",
627
+ " <td>None</td>\n",
628
+ " <td>None</td>\n",
629
+ " <td>None</td>\n",
630
+ " <td>2024-01-30 07:40:02+00:00</td>\n",
631
+ " <td>No metadata and no description</td>\n",
632
+ " <td>dataset_readmes/akjadhav__leandojo-lean4-forma...</td>\n",
633
+ " <td>0</td>\n",
634
+ " <td>minimal</td>\n",
635
+ " </tr>\n",
636
+ " <tr>\n",
637
+ " <th>1</th>\n",
638
+ " <td>aemska/stuhl</td>\n",
639
+ " <td>https://huggingface.co/datasets/aemska/stuhl</td>\n",
640
+ " <td>11</td>\n",
641
+ " <td>None</td>\n",
642
+ " <td>openrail</td>\n",
643
+ " <td>None</td>\n",
644
+ " <td>None</td>\n",
645
+ " <td>2022-11-11 14:12:36+00:00</td>\n",
646
+ " <td>Short description (char count=0, words=0)</td>\n",
647
+ " <td>dataset_readmes/aemska__stuhl_README.md</td>\n",
648
+ " <td>0</td>\n",
649
+ " <td>minimal</td>\n",
650
+ " </tr>\n",
651
+ " <tr>\n",
652
+ " <th>2</th>\n",
653
+ " <td>Pogpotatofarmer/memes</td>\n",
654
+ " <td>https://huggingface.co/datasets/Pogpotatofarme...</td>\n",
655
+ " <td>15</td>\n",
656
+ " <td>None</td>\n",
657
+ " <td>cc</td>\n",
658
+ " <td>None</td>\n",
659
+ " <td>None</td>\n",
660
+ " <td>2022-07-15 21:11:34+00:00</td>\n",
661
+ " <td>Short description (char count=0, words=0)</td>\n",
662
+ " <td>dataset_readmes/Pogpotatofarmer__memes_README.md</td>\n",
663
+ " <td>0</td>\n",
664
+ " <td>minimal</td>\n",
665
+ " </tr>\n",
666
+ " <tr>\n",
667
+ " <th>3</th>\n",
668
+ " <td>Splend1dchan/NMSQA_sew-d-tiny-100k-ft-ls100h</td>\n",
669
+ " <td>https://huggingface.co/datasets/Splend1dchan/N...</td>\n",
670
+ " <td>11</td>\n",
671
+ " <td>None</td>\n",
672
+ " <td>None</td>\n",
673
+ " <td>None</td>\n",
674
+ " <td>None</td>\n",
675
+ " <td>None</td>\n",
676
+ " <td>Failed to load card</td>\n",
677
+ " <td>None</td>\n",
678
+ " <td>0</td>\n",
679
+ " <td>minimal</td>\n",
680
+ " </tr>\n",
681
+ " <tr>\n",
682
+ " <th>4</th>\n",
683
+ " <td>chamisfum/brain_tumor_3_classes</td>\n",
684
+ " <td>https://huggingface.co/datasets/chamisfum/brai...</td>\n",
685
+ " <td>8</td>\n",
686
+ " <td>None</td>\n",
687
+ " <td>None</td>\n",
688
+ " <td>None</td>\n",
689
+ " <td>None</td>\n",
690
+ " <td>None</td>\n",
691
+ " <td>Failed to load card</td>\n",
692
+ " <td>None</td>\n",
693
+ " <td>0</td>\n",
694
+ " <td>minimal</td>\n",
695
+ " </tr>\n",
696
+ " <tr>\n",
697
+ " <th>...</th>\n",
698
+ " <td>...</td>\n",
699
+ " <td>...</td>\n",
700
+ " <td>...</td>\n",
701
+ " <td>...</td>\n",
702
+ " <td>...</td>\n",
703
+ " <td>...</td>\n",
704
+ " <td>...</td>\n",
705
+ " <td>...</td>\n",
706
+ " <td>...</td>\n",
707
+ " <td>...</td>\n",
708
+ " <td>...</td>\n",
709
+ " <td>...</td>\n",
710
+ " </tr>\n",
711
+ " <tr>\n",
712
+ " <th>503185</th>\n",
713
+ " <td>ROBOTIS/ffw_bg2_rev4_pick_coffee_bottle_env5_14</td>\n",
714
+ " <td>https://huggingface.co/datasets/ROBOTIS/ffw_bg...</td>\n",
715
+ " <td>0</td>\n",
716
+ " <td>None</td>\n",
717
+ " <td>apache-2.0</td>\n",
718
+ " <td>LeRobot, ffw_bg2_rev4_custom, robotis</td>\n",
719
+ " <td>robotics</td>\n",
720
+ " <td>2025-09-19 06:28:15+00:00</td>\n",
721
+ " <td>None</td>\n",
722
+ " <td>dataset_readmes/ROBOTIS__ffw_bg2_rev4_pick_cof...</td>\n",
723
+ " <td>299</td>\n",
724
+ " <td>rich</td>\n",
725
+ " </tr>\n",
726
+ " <tr>\n",
727
+ " <th>503186</th>\n",
728
+ " <td>ROBOTIS/ffw_bg2_rev4_pick_coffee_bottle_env5_15</td>\n",
729
+ " <td>https://huggingface.co/datasets/ROBOTIS/ffw_bg...</td>\n",
730
+ " <td>0</td>\n",
731
+ " <td>None</td>\n",
732
+ " <td>apache-2.0</td>\n",
733
+ " <td>LeRobot, ffw_bg2_rev4_custom, robotis</td>\n",
734
+ " <td>robotics</td>\n",
735
+ " <td>2025-09-19 06:29:40+00:00</td>\n",
736
+ " <td>None</td>\n",
737
+ " <td>dataset_readmes/ROBOTIS__ffw_bg2_rev4_pick_cof...</td>\n",
738
+ " <td>299</td>\n",
739
+ " <td>rich</td>\n",
740
+ " </tr>\n",
741
+ " <tr>\n",
742
+ " <th>503187</th>\n",
743
+ " <td>Dongkkka/ffw_bg2_rev4_custom_0919_5</td>\n",
744
+ " <td>https://huggingface.co/datasets/Dongkkka/ffw_b...</td>\n",
745
+ " <td>0</td>\n",
746
+ " <td>None</td>\n",
747
+ " <td>apache-2.0</td>\n",
748
+ " <td>LeRobot, ffw_bg2_rev4_custom, robotis</td>\n",
749
+ " <td>robotics</td>\n",
750
+ " <td>2025-09-19 06:30:53+00:00</td>\n",
751
+ " <td>None</td>\n",
752
+ " <td>dataset_readmes/Dongkkka__ffw_bg2_rev4_custom_...</td>\n",
753
+ " <td>299</td>\n",
754
+ " <td>rich</td>\n",
755
+ " </tr>\n",
756
+ " <tr>\n",
757
+ " <th>503188</th>\n",
758
+ " <td>chenxing1234567890/eval_testZ1.2.1</td>\n",
759
+ " <td>https://huggingface.co/datasets/chenxing123456...</td>\n",
760
+ " <td>0</td>\n",
761
+ " <td>None</td>\n",
762
+ " <td>apache-2.0</td>\n",
763
+ " <td>LeRobot, tutorial</td>\n",
764
+ " <td>robotics</td>\n",
765
+ " <td>2025-09-19 06:34:11+00:00</td>\n",
766
+ " <td>None</td>\n",
767
+ " <td>dataset_readmes/chenxing1234567890__eval_testZ...</td>\n",
768
+ " <td>231</td>\n",
769
+ " <td>rich</td>\n",
770
+ " </tr>\n",
771
+ " <tr>\n",
772
+ " <th>503189</th>\n",
773
+ " <td>Dongkkka/ffw_bg2_rev4_custom_0919_6</td>\n",
774
+ " <td>https://huggingface.co/datasets/Dongkkka/ffw_b...</td>\n",
775
+ " <td>0</td>\n",
776
+ " <td>None</td>\n",
777
+ " <td>apache-2.0</td>\n",
778
+ " <td>LeRobot, ffw_bg2_rev4_custom, robotis</td>\n",
779
+ " <td>robotics</td>\n",
780
+ " <td>2025-09-19 06:34:09+00:00</td>\n",
781
+ " <td>None</td>\n",
782
+ " <td>dataset_readmes/Dongkkka__ffw_bg2_rev4_custom_...</td>\n",
783
+ " <td>299</td>\n",
784
+ " <td>rich</td>\n",
785
+ " </tr>\n",
786
+ " </tbody>\n",
787
+ "</table>\n",
788
+ "<p>503190 rows × 12 columns</p>\n",
789
+ "</div>"
790
+ ],
791
+ "text/plain": [
792
+ " dataset_id \\\n",
793
+ "0 akjadhav/leandojo-lean4-formal-informal-strings \n",
794
+ "1 aemska/stuhl \n",
795
+ "2 Pogpotatofarmer/memes \n",
796
+ "3 Splend1dchan/NMSQA_sew-d-tiny-100k-ft-ls100h \n",
797
+ "4 chamisfum/brain_tumor_3_classes \n",
798
+ "... ... \n",
799
+ "503185 ROBOTIS/ffw_bg2_rev4_pick_coffee_bottle_env5_14 \n",
800
+ "503186 ROBOTIS/ffw_bg2_rev4_pick_coffee_bottle_env5_15 \n",
801
+ "503187 Dongkkka/ffw_bg2_rev4_custom_0919_5 \n",
802
+ "503188 chenxing1234567890/eval_testZ1.2.1 \n",
803
+ "503189 Dongkkka/ffw_bg2_rev4_custom_0919_6 \n",
804
+ "\n",
805
+ " dataset_url downloads author \\\n",
806
+ "0 https://huggingface.co/datasets/akjadhav/leand... 22 None \n",
807
+ "1 https://huggingface.co/datasets/aemska/stuhl 11 None \n",
808
+ "2 https://huggingface.co/datasets/Pogpotatofarme... 15 None \n",
809
+ "3 https://huggingface.co/datasets/Splend1dchan/N... 11 None \n",
810
+ "4 https://huggingface.co/datasets/chamisfum/brai... 8 None \n",
811
+ "... ... ... ... \n",
812
+ "503185 https://huggingface.co/datasets/ROBOTIS/ffw_bg... 0 None \n",
813
+ "503186 https://huggingface.co/datasets/ROBOTIS/ffw_bg... 0 None \n",
814
+ "503187 https://huggingface.co/datasets/Dongkkka/ffw_b... 0 None \n",
815
+ "503188 https://huggingface.co/datasets/chenxing123456... 0 None \n",
816
+ "503189 https://huggingface.co/datasets/Dongkkka/ffw_b... 0 None \n",
817
+ "\n",
818
+ " license tags task_categories \\\n",
819
+ "0 None None None \n",
820
+ "1 openrail None None \n",
821
+ "2 cc None None \n",
822
+ "3 None None None \n",
823
+ "4 None None None \n",
824
+ "... ... ... ... \n",
825
+ "503185 apache-2.0 LeRobot, ffw_bg2_rev4_custom, robotis robotics \n",
826
+ "503186 apache-2.0 LeRobot, ffw_bg2_rev4_custom, robotis robotics \n",
827
+ "503187 apache-2.0 LeRobot, ffw_bg2_rev4_custom, robotis robotics \n",
828
+ "503188 apache-2.0 LeRobot, tutorial robotics \n",
829
+ "503189 apache-2.0 LeRobot, ffw_bg2_rev4_custom, robotis robotics \n",
830
+ "\n",
831
+ " last_modified reason \\\n",
832
+ "0 2024-01-30 07:40:02+00:00 No metadata and no description \n",
833
+ "1 2022-11-11 14:12:36+00:00 Short description (char count=0, words=0) \n",
834
+ "2 2022-07-15 21:11:34+00:00 Short description (char count=0, words=0) \n",
835
+ "3 None Failed to load card \n",
836
+ "4 None Failed to load card \n",
837
+ "... ... ... \n",
838
+ "503185 2025-09-19 06:28:15+00:00 None \n",
839
+ "503186 2025-09-19 06:29:40+00:00 None \n",
840
+ "503187 2025-09-19 06:30:53+00:00 None \n",
841
+ "503188 2025-09-19 06:34:11+00:00 None \n",
842
+ "503189 2025-09-19 06:34:09+00:00 None \n",
843
+ "\n",
844
+ " readme_path word_count category \n",
845
+ "0 dataset_readmes/akjadhav__leandojo-lean4-forma... 0 minimal \n",
846
+ "1 dataset_readmes/aemska__stuhl_README.md 0 minimal \n",
847
+ "2 dataset_readmes/Pogpotatofarmer__memes_README.md 0 minimal \n",
848
+ "3 None 0 minimal \n",
849
+ "4 None 0 minimal \n",
850
+ "... ... ... ... \n",
851
+ "503185 dataset_readmes/ROBOTIS__ffw_bg2_rev4_pick_cof... 299 rich \n",
852
+ "503186 dataset_readmes/ROBOTIS__ffw_bg2_rev4_pick_cof... 299 rich \n",
853
+ "503187 dataset_readmes/Dongkkka__ffw_bg2_rev4_custom_... 299 rich \n",
854
+ "503188 dataset_readmes/chenxing1234567890__eval_testZ... 231 rich \n",
855
+ "503189 dataset_readmes/Dongkkka__ffw_bg2_rev4_custom_... 299 rich \n",
856
+ "\n",
857
+ "[503190 rows x 12 columns]"
858
+ ]
859
+ },
860
+ "execution_count": 6,
861
+ "metadata": {},
862
+ "output_type": "execute_result"
863
+ }
864
+ ],
865
+ "source": [
866
+ "merged_df = pd.concat([df1, df2], ignore_index=True)\n",
867
+ "merged_df"
868
+ ]
869
+ },
870
+ {
+ "cell_type": "code",
+ "execution_count": 21,
+ "id": "e0623157",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "(11, 7)\n",
+ " id \\\n",
+ "623 introspector/unimath \n",
+ "766 ekim15/bone_marrow_cell_dataset \n",
+ "1645 fabriciojm/ecg-examples \n",
+ "3280 ahork/record-test-6 \n",
+ "3281 RickRain/SecondTrySimData3 \n",
+ "\n",
+ " url \\\n",
+ "623 https://huggingface.co/datasets/introspector/u... \n",
+ "766 https://huggingface.co/datasets/ekim15/bone_ma... \n",
+ "1645 https://huggingface.co/datasets/fabriciojm/ecg... \n",
+ "3280 https://huggingface.co/datasets/ahork/record-t... \n",
+ "3281 https://huggingface.co/datasets/RickRain/Secon... \n",
+ "\n",
+ " field keyword missing_readme missing_card \\\n",
+ "623 mathematics_and_statistics math False True \n",
+ "766 life_sciences biology True False \n",
+ "1645 life_sciences medical True False \n",
+ "3280 engineering_and_technology robotics True False \n",
+ "3281 engineering_and_technology robotics True False \n",
+ "\n",
+ " _id_lower \n",
+ "623 introspector/unimath \n",
+ "766 ekim15/bone_marrow_cell_dataset \n",
+ "1645 fabriciojm/ecg-examples \n",
+ "3280 ahork/record-test-6 \n",
+ "3281 rickrain/secondtrysimdata3 \n"
+ ]
+ }
+ ],
+ "source": [
+ "# Create lowercase helper columns\n",
+ "df1[\"_dataset_id_lower\"] = df1[\"dataset_id\"].str.lower()\n",
+ "csv_df[\"_id_lower\"] = csv_df[\"id\"].str.lower()\n",
+ "\n",
+ "# Get the rows from csv_df whose id is NOT in df1 (case-insensitive anti-join)\n",
+ "df3_missed = csv_df[~csv_df[\"_id_lower\"].isin(df1[\"_dataset_id_lower\"])]\n",
+ "\n",
+ "print(df3_missed.shape)\n",
+ "print(df3_missed.head())\n"
+ ]
+ },
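+ {
+ "cell_type": "markdown",
+ "id": "sketch-antijoin-note",
+ "metadata": {},
+ "source": [
+ "*Illustrative sketch (not part of the original run; assumes `df1` and `csv_df` with the lowercase helper columns created above):* the same case-insensitive anti-join can be written with `merge(..., indicator=True)`, which also exposes each row's match status.\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "sketch-antijoin-code",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Sketch of an alternative anti-join via the merge indicator (names as above)\n",
+ "probe = csv_df.merge(\n",
+ "    df1[[\"_dataset_id_lower\"]].drop_duplicates(),\n",
+ "    left_on=\"_id_lower\",\n",
+ "    right_on=\"_dataset_id_lower\",\n",
+ "    how=\"left\",\n",
+ "    indicator=True,\n",
+ ")\n",
+ "anti = probe[probe[\"_merge\"] == \"left_only\"].drop(columns=[\"_dataset_id_lower\", \"_merge\"])\n",
+ "print(anti.shape)  # expected to match df3_missed.shape, i.e. (11, 7)\n"
+ ]
+ },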
923
+ {
+ "cell_type": "code",
+ "execution_count": 25,
+ "id": "b6dbce79",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "array([['introspector/unimath',\n",
+ " 'https://huggingface.co/datasets/introspector/unimath',\n",
+ " 'mathematics_and_statistics', 'math', False, True,\n",
+ " 'introspector/unimath'],\n",
+ " ['ekim15/bone_marrow_cell_dataset',\n",
+ " 'https://huggingface.co/datasets/ekim15/bone_marrow_cell_dataset',\n",
+ " 'life_sciences', 'biology', True, False,\n",
+ " 'ekim15/bone_marrow_cell_dataset'],\n",
+ " ['fabriciojm/ecg-examples',\n",
+ " 'https://huggingface.co/datasets/fabriciojm/ecg-examples',\n",
+ " 'life_sciences', 'medical', True, False,\n",
+ " 'fabriciojm/ecg-examples'],\n",
+ " ['ahork/record-test-6',\n",
+ " 'https://huggingface.co/datasets/ahork/record-test-6',\n",
+ " 'engineering_and_technology', 'robotics', True, False,\n",
+ " 'ahork/record-test-6'],\n",
+ " ['RickRain/SecondTrySimData3',\n",
+ " 'https://huggingface.co/datasets/RickRain/SecondTrySimData3',\n",
+ " 'engineering_and_technology', 'robotics', True, False,\n",
+ " 'rickrain/secondtrysimdata3'],\n",
+ " ['MulixBF/record-cube-pick-2cam-black-2',\n",
+ " 'https://huggingface.co/datasets/MulixBF/record-cube-pick-2cam-black-2',\n",
+ " 'engineering_and_technology', 'robotics', True, False,\n",
+ " 'mulixbf/record-cube-pick-2cam-black-2'],\n",
+ " ['ricdigi/1two-camera3-test2345',\n",
+ " 'https://huggingface.co/datasets/ricdigi/1two-camera3-test2345',\n",
+ " 'engineering_and_technology', 'robotics', True, False,\n",
+ " 'ricdigi/1two-camera3-test2345'],\n",
+ " ['Ninkofu/sushi_put',\n",
+ " 'https://huggingface.co/datasets/Ninkofu/sushi_put',\n",
+ " 'engineering_and_technology', 'robotics', True, False,\n",
+ " 'ninkofu/sushi_put'],\n",
+ " ['jokla89/record-test-temp1',\n",
+ " 'https://huggingface.co/datasets/jokla89/record-test-temp1',\n",
+ " 'engineering_and_technology', 'robotics', True, False,\n",
+ " 'jokla89/record-test-temp1'],\n",
+ " ['LeRobot-worldwide-hackathon/325-casino-dealer-dice-set',\n",
+ " 'https://huggingface.co/datasets/LeRobot-worldwide-hackathon/325-casino-dealer-dice-set',\n",
+ " 'engineering_and_technology', 'robotics', True, False,\n",
+ " 'lerobot-worldwide-hackathon/325-casino-dealer-dice-set'],\n",
+ " ['jackvial/koch_screwdriver_attach_orange_panel_e125',\n",
+ " 'https://huggingface.co/datasets/jackvial/koch_screwdriver_attach_orange_panel_e125',\n",
+ " 'engineering_and_technology', 'robotics', True, False,\n",
+ " 'jackvial/koch_screwdriver_attach_orange_panel_e125']],\n",
+ " dtype=object)"
+ ]
+ },
+ "execution_count": 25,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df3_missed.values"
+ ]
+ },
988
+ {
+ "cell_type": "code",
+ "execution_count": 26,
+ "id": "0cec2023",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
996
+ "text/html": [
997
+ "<div>\n",
998
+ "<style scoped>\n",
999
+ " .dataframe tbody tr th:only-of-type {\n",
1000
+ " vertical-align: middle;\n",
1001
+ " }\n",
1002
+ "\n",
1003
+ " .dataframe tbody tr th {\n",
1004
+ " vertical-align: top;\n",
1005
+ " }\n",
1006
+ "\n",
1007
+ " .dataframe thead th {\n",
1008
+ " text-align: right;\n",
1009
+ " }\n",
1010
+ "</style>\n",
1011
+ "<table border=\"1\" class=\"dataframe\">\n",
1012
+ " <thead>\n",
1013
+ " <tr style=\"text-align: right;\">\n",
1014
+ " <th></th>\n",
1015
+ " <th>dataset_id</th>\n",
1016
+ " <th>dataset_url</th>\n",
1017
+ " <th>downloads</th>\n",
1018
+ " <th>author</th>\n",
1019
+ " <th>license</th>\n",
1020
+ " <th>tags</th>\n",
1021
+ " <th>task_categories</th>\n",
1022
+ " <th>last_modified</th>\n",
1023
+ " <th>reason</th>\n",
1024
+ " <th>readme_path</th>\n",
1025
+ " <th>word_count</th>\n",
1026
+ " <th>category</th>\n",
1027
+ " <th>_dataset_id_lower</th>\n",
1028
+ " </tr>\n",
1029
+ " </thead>\n",
1030
+ " <tbody>\n",
1031
+ " <tr>\n",
1032
+ " <th>0</th>\n",
1033
+ " <td>akjadhav/leandojo-lean4-formal-informal-strings</td>\n",
1034
+ " <td>https://huggingface.co/datasets/akjadhav/leand...</td>\n",
1035
+ " <td>22</td>\n",
1036
+ " <td>None</td>\n",
1037
+ " <td>None</td>\n",
1038
+ " <td>None</td>\n",
1039
+ " <td>None</td>\n",
1040
+ " <td>2024-01-30 07:40:02+00:00</td>\n",
1041
+ " <td>No metadata and no description</td>\n",
1042
+ " <td>dataset_readmes/akjadhav__leandojo-lean4-forma...</td>\n",
1043
+ " <td>0</td>\n",
1044
+ " <td>minimal</td>\n",
1045
+ " <td>akjadhav/leandojo-lean4-formal-informal-strings</td>\n",
1046
+ " </tr>\n",
1047
+ " <tr>\n",
1048
+ " <th>1</th>\n",
1049
+ " <td>aemska/stuhl</td>\n",
1050
+ " <td>https://huggingface.co/datasets/aemska/stuhl</td>\n",
1051
+ " <td>11</td>\n",
1052
+ " <td>None</td>\n",
1053
+ " <td>openrail</td>\n",
1054
+ " <td>None</td>\n",
1055
+ " <td>None</td>\n",
1056
+ " <td>2022-11-11 14:12:36+00:00</td>\n",
1057
+ " <td>Short description (char count=0, words=0)</td>\n",
1058
+ " <td>dataset_readmes/aemska__stuhl_README.md</td>\n",
1059
+ " <td>0</td>\n",
1060
+ " <td>minimal</td>\n",
1061
+ " <td>aemska/stuhl</td>\n",
1062
+ " </tr>\n",
1063
+ " <tr>\n",
1064
+ " <th>2</th>\n",
1065
+ " <td>Pogpotatofarmer/memes</td>\n",
1066
+ " <td>https://huggingface.co/datasets/Pogpotatofarme...</td>\n",
1067
+ " <td>15</td>\n",
1068
+ " <td>None</td>\n",
1069
+ " <td>cc</td>\n",
1070
+ " <td>None</td>\n",
1071
+ " <td>None</td>\n",
1072
+ " <td>2022-07-15 21:11:34+00:00</td>\n",
1073
+ " <td>Short description (char count=0, words=0)</td>\n",
1074
+ " <td>dataset_readmes/Pogpotatofarmer__memes_README.md</td>\n",
1075
+ " <td>0</td>\n",
1076
+ " <td>minimal</td>\n",
1077
+ " <td>pogpotatofarmer/memes</td>\n",
1078
+ " </tr>\n",
1079
+ " <tr>\n",
1080
+ " <th>3</th>\n",
1081
+ " <td>Splend1dchan/NMSQA_sew-d-tiny-100k-ft-ls100h</td>\n",
1082
+ " <td>https://huggingface.co/datasets/Splend1dchan/N...</td>\n",
1083
+ " <td>11</td>\n",
1084
+ " <td>None</td>\n",
1085
+ " <td>None</td>\n",
1086
+ " <td>None</td>\n",
1087
+ " <td>None</td>\n",
1088
+ " <td>None</td>\n",
1089
+ " <td>Failed to load card</td>\n",
1090
+ " <td>None</td>\n",
1091
+ " <td>0</td>\n",
1092
+ " <td>minimal</td>\n",
1093
+ " <td>splend1dchan/nmsqa_sew-d-tiny-100k-ft-ls100h</td>\n",
1094
+ " </tr>\n",
1095
+ " <tr>\n",
1096
+ " <th>4</th>\n",
1097
+ " <td>chamisfum/brain_tumor_3_classes</td>\n",
1098
+ " <td>https://huggingface.co/datasets/chamisfum/brai...</td>\n",
1099
+ " <td>8</td>\n",
1100
+ " <td>None</td>\n",
1101
+ " <td>None</td>\n",
1102
+ " <td>None</td>\n",
1103
+ " <td>None</td>\n",
1104
+ " <td>None</td>\n",
1105
+ " <td>Failed to load card</td>\n",
1106
+ " <td>None</td>\n",
1107
+ " <td>0</td>\n",
1108
+ " <td>minimal</td>\n",
1109
+ " <td>chamisfum/brain_tumor_3_classes</td>\n",
1110
+ " </tr>\n",
1111
+ " <tr>\n",
1112
+ " <th>...</th>\n",
1113
+ " <td>...</td>\n",
1114
+ " <td>...</td>\n",
1115
+ " <td>...</td>\n",
1116
+ " <td>...</td>\n",
1117
+ " <td>...</td>\n",
1118
+ " <td>...</td>\n",
1119
+ " <td>...</td>\n",
1120
+ " <td>...</td>\n",
1121
+ " <td>...</td>\n",
1122
+ " <td>...</td>\n",
1123
+ " <td>...</td>\n",
1124
+ " <td>...</td>\n",
1125
+ " <td>...</td>\n",
1126
+ " </tr>\n",
1127
+ " <tr>\n",
1128
+ " <th>503185</th>\n",
1129
+ " <td>ROBOTIS/ffw_bg2_rev4_pick_coffee_bottle_env5_14</td>\n",
1130
+ " <td>https://huggingface.co/datasets/ROBOTIS/ffw_bg...</td>\n",
1131
+ " <td>0</td>\n",
1132
+ " <td>None</td>\n",
1133
+ " <td>apache-2.0</td>\n",
1134
+ " <td>LeRobot, ffw_bg2_rev4_custom, robotis</td>\n",
1135
+ " <td>robotics</td>\n",
1136
+ " <td>2025-09-19 06:28:15+00:00</td>\n",
1137
+ " <td>None</td>\n",
1138
+ " <td>dataset_readmes/ROBOTIS__ffw_bg2_rev4_pick_cof...</td>\n",
1139
+ " <td>299</td>\n",
1140
+ " <td>rich</td>\n",
1141
+ " <td>robotis/ffw_bg2_rev4_pick_coffee_bottle_env5_14</td>\n",
1142
+ " </tr>\n",
1143
+ " <tr>\n",
1144
+ " <th>503186</th>\n",
1145
+ " <td>ROBOTIS/ffw_bg2_rev4_pick_coffee_bottle_env5_15</td>\n",
1146
+ " <td>https://huggingface.co/datasets/ROBOTIS/ffw_bg...</td>\n",
1147
+ " <td>0</td>\n",
1148
+ " <td>None</td>\n",
1149
+ " <td>apache-2.0</td>\n",
1150
+ " <td>LeRobot, ffw_bg2_rev4_custom, robotis</td>\n",
1151
+ " <td>robotics</td>\n",
1152
+ " <td>2025-09-19 06:29:40+00:00</td>\n",
1153
+ " <td>None</td>\n",
1154
+ " <td>dataset_readmes/ROBOTIS__ffw_bg2_rev4_pick_cof...</td>\n",
1155
+ " <td>299</td>\n",
1156
+ " <td>rich</td>\n",
1157
+ " <td>robotis/ffw_bg2_rev4_pick_coffee_bottle_env5_15</td>\n",
1158
+ " </tr>\n",
1159
+ " <tr>\n",
1160
+ " <th>503187</th>\n",
1161
+ " <td>Dongkkka/ffw_bg2_rev4_custom_0919_5</td>\n",
1162
+ " <td>https://huggingface.co/datasets/Dongkkka/ffw_b...</td>\n",
1163
+ " <td>0</td>\n",
1164
+ " <td>None</td>\n",
1165
+ " <td>apache-2.0</td>\n",
1166
+ " <td>LeRobot, ffw_bg2_rev4_custom, robotis</td>\n",
1167
+ " <td>robotics</td>\n",
1168
+ " <td>2025-09-19 06:30:53+00:00</td>\n",
1169
+ " <td>None</td>\n",
1170
+ " <td>dataset_readmes/Dongkkka__ffw_bg2_rev4_custom_...</td>\n",
1171
+ " <td>299</td>\n",
1172
+ " <td>rich</td>\n",
1173
+ " <td>dongkkka/ffw_bg2_rev4_custom_0919_5</td>\n",
1174
+ " </tr>\n",
1175
+ " <tr>\n",
1176
+ " <th>503188</th>\n",
1177
+ " <td>chenxing1234567890/eval_testZ1.2.1</td>\n",
1178
+ " <td>https://huggingface.co/datasets/chenxing123456...</td>\n",
1179
+ " <td>0</td>\n",
1180
+ " <td>None</td>\n",
1181
+ " <td>apache-2.0</td>\n",
1182
+ " <td>LeRobot, tutorial</td>\n",
1183
+ " <td>robotics</td>\n",
1184
+ " <td>2025-09-19 06:34:11+00:00</td>\n",
1185
+ " <td>None</td>\n",
1186
+ " <td>dataset_readmes/chenxing1234567890__eval_testZ...</td>\n",
1187
+ " <td>231</td>\n",
1188
+ " <td>rich</td>\n",
1189
+ " <td>chenxing1234567890/eval_testz1.2.1</td>\n",
1190
+ " </tr>\n",
1191
+ " <tr>\n",
1192
+ " <th>503189</th>\n",
1193
+ " <td>Dongkkka/ffw_bg2_rev4_custom_0919_6</td>\n",
1194
+ " <td>https://huggingface.co/datasets/Dongkkka/ffw_b...</td>\n",
1195
+ " <td>0</td>\n",
1196
+ " <td>None</td>\n",
1197
+ " <td>apache-2.0</td>\n",
1198
+ " <td>LeRobot, ffw_bg2_rev4_custom, robotis</td>\n",
1199
+ " <td>robotics</td>\n",
1200
+ " <td>2025-09-19 06:34:09+00:00</td>\n",
1201
+ " <td>None</td>\n",
1202
+ " <td>dataset_readmes/Dongkkka__ffw_bg2_rev4_custom_...</td>\n",
1203
+ " <td>299</td>\n",
1204
+ " <td>rich</td>\n",
1205
+ " <td>dongkkka/ffw_bg2_rev4_custom_0919_6</td>\n",
1206
+ " </tr>\n",
1207
+ " </tbody>\n",
1208
+ "</table>\n",
1209
+ "<p>503190 rows × 13 columns</p>\n",
1210
+ "</div>"
1211
+ ],
1212
+ "text/plain": [
1213
+ " dataset_id \\\n",
1214
+ "0 akjadhav/leandojo-lean4-formal-informal-strings \n",
1215
+ "1 aemska/stuhl \n",
1216
+ "2 Pogpotatofarmer/memes \n",
1217
+ "3 Splend1dchan/NMSQA_sew-d-tiny-100k-ft-ls100h \n",
1218
+ "4 chamisfum/brain_tumor_3_classes \n",
1219
+ "... ... \n",
1220
+ "503185 ROBOTIS/ffw_bg2_rev4_pick_coffee_bottle_env5_14 \n",
1221
+ "503186 ROBOTIS/ffw_bg2_rev4_pick_coffee_bottle_env5_15 \n",
1222
+ "503187 Dongkkka/ffw_bg2_rev4_custom_0919_5 \n",
1223
+ "503188 chenxing1234567890/eval_testZ1.2.1 \n",
1224
+ "503189 Dongkkka/ffw_bg2_rev4_custom_0919_6 \n",
1225
+ "\n",
1226
+ " dataset_url downloads author \\\n",
1227
+ "0 https://huggingface.co/datasets/akjadhav/leand... 22 None \n",
1228
+ "1 https://huggingface.co/datasets/aemska/stuhl 11 None \n",
1229
+ "2 https://huggingface.co/datasets/Pogpotatofarme... 15 None \n",
1230
+ "3 https://huggingface.co/datasets/Splend1dchan/N... 11 None \n",
1231
+ "4 https://huggingface.co/datasets/chamisfum/brai... 8 None \n",
1232
+ "... ... ... ... \n",
1233
+ "503185 https://huggingface.co/datasets/ROBOTIS/ffw_bg... 0 None \n",
1234
+ "503186 https://huggingface.co/datasets/ROBOTIS/ffw_bg... 0 None \n",
1235
+ "503187 https://huggingface.co/datasets/Dongkkka/ffw_b... 0 None \n",
1236
+ "503188 https://huggingface.co/datasets/chenxing123456... 0 None \n",
1237
+ "503189 https://huggingface.co/datasets/Dongkkka/ffw_b... 0 None \n",
1238
+ "\n",
1239
+ " license tags task_categories \\\n",
1240
+ "0 None None None \n",
1241
+ "1 openrail None None \n",
1242
+ "2 cc None None \n",
1243
+ "3 None None None \n",
1244
+ "4 None None None \n",
1245
+ "... ... ... ... \n",
1246
+ "503185 apache-2.0 LeRobot, ffw_bg2_rev4_custom, robotis robotics \n",
1247
+ "503186 apache-2.0 LeRobot, ffw_bg2_rev4_custom, robotis robotics \n",
1248
+ "503187 apache-2.0 LeRobot, ffw_bg2_rev4_custom, robotis robotics \n",
1249
+ "503188 apache-2.0 LeRobot, tutorial robotics \n",
1250
+ "503189 apache-2.0 LeRobot, ffw_bg2_rev4_custom, robotis robotics \n",
1251
+ "\n",
1252
+ " last_modified reason \\\n",
1253
+ "0 2024-01-30 07:40:02+00:00 No metadata and no description \n",
1254
+ "1 2022-11-11 14:12:36+00:00 Short description (char count=0, words=0) \n",
1255
+ "2 2022-07-15 21:11:34+00:00 Short description (char count=0, words=0) \n",
1256
+ "3 None Failed to load card \n",
1257
+ "4 None Failed to load card \n",
1258
+ "... ... ... \n",
1259
+ "503185 2025-09-19 06:28:15+00:00 None \n",
1260
+ "503186 2025-09-19 06:29:40+00:00 None \n",
1261
+ "503187 2025-09-19 06:30:53+00:00 None \n",
1262
+ "503188 2025-09-19 06:34:11+00:00 None \n",
1263
+ "503189 2025-09-19 06:34:09+00:00 None \n",
1264
+ "\n",
1265
+ " readme_path word_count \\\n",
1266
+ "0 dataset_readmes/akjadhav__leandojo-lean4-forma... 0 \n",
1267
+ "1 dataset_readmes/aemska__stuhl_README.md 0 \n",
1268
+ "2 dataset_readmes/Pogpotatofarmer__memes_README.md 0 \n",
1269
+ "3 None 0 \n",
1270
+ "4 None 0 \n",
1271
+ "... ... ... \n",
1272
+ "503185 dataset_readmes/ROBOTIS__ffw_bg2_rev4_pick_cof... 299 \n",
1273
+ "503186 dataset_readmes/ROBOTIS__ffw_bg2_rev4_pick_cof... 299 \n",
1274
+ "503187 dataset_readmes/Dongkkka__ffw_bg2_rev4_custom_... 299 \n",
1275
+ "503188 dataset_readmes/chenxing1234567890__eval_testZ... 231 \n",
1276
+ "503189 dataset_readmes/Dongkkka__ffw_bg2_rev4_custom_... 299 \n",
1277
+ "\n",
1278
+ " category _dataset_id_lower \n",
1279
+ "0 minimal akjadhav/leandojo-lean4-formal-informal-strings \n",
1280
+ "1 minimal aemska/stuhl \n",
1281
+ "2 minimal pogpotatofarmer/memes \n",
1282
+ "3 minimal splend1dchan/nmsqa_sew-d-tiny-100k-ft-ls100h \n",
1283
+ "4 minimal chamisfum/brain_tumor_3_classes \n",
1284
+ "... ... ... \n",
1285
+ "503185 rich robotis/ffw_bg2_rev4_pick_coffee_bottle_env5_14 \n",
1286
+ "503186 rich robotis/ffw_bg2_rev4_pick_coffee_bottle_env5_15 \n",
1287
+ "503187 rich dongkkka/ffw_bg2_rev4_custom_0919_5 \n",
1288
+ "503188 rich chenxing1234567890/eval_testz1.2.1 \n",
1289
+ "503189 rich dongkkka/ffw_bg2_rev4_custom_0919_6 \n",
1290
+ "\n",
1291
+ "[503190 rows x 13 columns]"
1292
+ ]
1293
+ },
1294
+ "execution_count": 26,
1295
+ "metadata": {},
1296
+ "output_type": "execute_result"
1297
+ }
1298
+ ],
1299
+ "source": [
1300
+ "merged_df"
1301
+ ]
1302
+ },
1303
+ {
+ "cell_type": "code",
+ "execution_count": 27,
+ "id": "2bc30fa7",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "(503190, 14)\n",
+ " dataset_id \\\n",
+ "0 akjadhav/leandojo-lean4-formal-informal-strings \n",
+ "1 aemska/stuhl \n",
+ "2 Pogpotatofarmer/memes \n",
+ "3 Splend1dchan/NMSQA_sew-d-tiny-100k-ft-ls100h \n",
+ "4 chamisfum/brain_tumor_3_classes \n",
+ "\n",
+ " dataset_url downloads author \\\n",
+ "0 https://huggingface.co/datasets/akjadhav/leand... 22 None \n",
+ "1 https://huggingface.co/datasets/aemska/stuhl 11 None \n",
+ "2 https://huggingface.co/datasets/Pogpotatofarme... 15 None \n",
+ "3 https://huggingface.co/datasets/Splend1dchan/N... 11 None \n",
+ "4 https://huggingface.co/datasets/chamisfum/brai... 8 None \n",
+ "\n",
+ " license tags task_categories last_modified \\\n",
+ "0 None None None 2024-01-30 07:40:02+00:00 \n",
+ "1 openrail None None 2022-11-11 14:12:36+00:00 \n",
+ "2 cc None None 2022-07-15 21:11:34+00:00 \n",
+ "3 None None None None \n",
+ "4 None None None None \n",
+ "\n",
+ " reason \\\n",
+ "0 No metadata and no description \n",
+ "1 Short description (char count=0, words=0) \n",
+ "2 Short description (char count=0, words=0) \n",
+ "3 Failed to load card \n",
+ "4 Failed to load card \n",
+ "\n",
+ " readme_path word_count category \\\n",
+ "0 dataset_readmes/akjadhav__leandojo-lean4-forma... 0 minimal \n",
+ "1 dataset_readmes/aemska__stuhl_README.md 0 minimal \n",
+ "2 dataset_readmes/Pogpotatofarmer__memes_README.md 0 minimal \n",
+ "3 None 0 minimal \n",
+ "4 None 0 minimal \n",
+ "\n",
+ " field keyword \n",
+ "0 NaN NaN \n",
+ "1 NaN NaN \n",
+ "2 NaN NaN \n",
+ "3 NaN NaN \n",
+ "4 life_sciences brain \n"
+ ]
+ }
+ ],
+ "source": [
+ "# Merge on lowercase columns to bring 'field' and 'keyword' from csv_df\n",
+ "merged_df = merged_df.merge(\n",
+ "    csv_df[[\"_id_lower\", \"field\", \"keyword\"]],\n",
+ "    left_on=\"_dataset_id_lower\",\n",
+ "    right_on=\"_id_lower\",\n",
+ "    how=\"left\"\n",
+ ")\n",
+ "\n",
+ "# Drop the helper columns\n",
+ "merged_df = merged_df.drop(columns=[\"_dataset_id_lower\", \"_id_lower\"])\n",
+ "\n",
+ "# Quick check\n",
+ "print(merged_df.shape)\n",
+ "print(merged_df.head())\n"
+ ]
+ },
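+ {
+ "cell_type": "markdown",
+ "id": "sketch-merge-validate-note",
+ "metadata": {},
+ "source": [
+ "*Illustrative sketch (not part of the original run):* a left merge keeps the row count unchanged only when the right-hand key is unique; pandas can enforce this with `validate=\"m:1\"`, which raises `MergeError` on duplicate keys instead of silently inflating the frame.\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "sketch-merge-validate-code",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Sketch of a guard for the merge above: fail loudly on duplicate join keys\n",
+ "assert csv_df[\"_id_lower\"].is_unique, \"duplicate join keys in csv_df\"\n",
+ "# equivalently, pass validate=\"m:1\" to the .merge(...) call in the previous cell\n",
+ "assert len(merged_df) == 503190  # row count unchanged by the left merge\n"
+ ]
+ },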
1375
+ {
+ "cell_type": "code",
+ "execution_count": 28,
+ "id": "4b104aef",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
1383
+ "text/html": [
1384
+ "<div>\n",
1385
+ "<style scoped>\n",
1386
+ " .dataframe tbody tr th:only-of-type {\n",
1387
+ " vertical-align: middle;\n",
1388
+ " }\n",
1389
+ "\n",
1390
+ " .dataframe tbody tr th {\n",
1391
+ " vertical-align: top;\n",
1392
+ " }\n",
1393
+ "\n",
1394
+ " .dataframe thead th {\n",
1395
+ " text-align: right;\n",
1396
+ " }\n",
1397
+ "</style>\n",
1398
+ "<table border=\"1\" class=\"dataframe\">\n",
1399
+ " <thead>\n",
1400
+ " <tr style=\"text-align: right;\">\n",
1401
+ " <th></th>\n",
1402
+ " <th>dataset_id</th>\n",
1403
+ " <th>dataset_url</th>\n",
1404
+ " <th>downloads</th>\n",
1405
+ " <th>author</th>\n",
1406
+ " <th>license</th>\n",
1407
+ " <th>tags</th>\n",
1408
+ " <th>task_categories</th>\n",
1409
+ " <th>last_modified</th>\n",
1410
+ " <th>reason</th>\n",
1411
+ " <th>readme_path</th>\n",
1412
+ " <th>word_count</th>\n",
1413
+ " <th>category</th>\n",
1414
+ " <th>field</th>\n",
1415
+ " <th>keyword</th>\n",
1416
+ " </tr>\n",
1417
+ " </thead>\n",
1418
+ " <tbody>\n",
1419
+ " <tr>\n",
1420
+ " <th>0</th>\n",
1421
+ " <td>akjadhav/leandojo-lean4-formal-informal-strings</td>\n",
1422
+ " <td>https://huggingface.co/datasets/akjadhav/leand...</td>\n",
1423
+ " <td>22</td>\n",
1424
+ " <td>None</td>\n",
1425
+ " <td>None</td>\n",
1426
+ " <td>None</td>\n",
1427
+ " <td>None</td>\n",
1428
+ " <td>2024-01-30 07:40:02+00:00</td>\n",
1429
+ " <td>No metadata and no description</td>\n",
1430
+ " <td>dataset_readmes/akjadhav__leandojo-lean4-forma...</td>\n",
1431
+ " <td>0</td>\n",
1432
+ " <td>minimal</td>\n",
1433
+ " <td>NaN</td>\n",
1434
+ " <td>NaN</td>\n",
1435
+ " </tr>\n",
1436
+ " <tr>\n",
1437
+ " <th>1</th>\n",
1438
+ " <td>aemska/stuhl</td>\n",
1439
+ " <td>https://huggingface.co/datasets/aemska/stuhl</td>\n",
1440
+ " <td>11</td>\n",
1441
+ " <td>None</td>\n",
1442
+ " <td>openrail</td>\n",
1443
+ " <td>None</td>\n",
1444
+ " <td>None</td>\n",
1445
+ " <td>2022-11-11 14:12:36+00:00</td>\n",
1446
+ " <td>Short description (char count=0, words=0)</td>\n",
1447
+ " <td>dataset_readmes/aemska__stuhl_README.md</td>\n",
1448
+ " <td>0</td>\n",
1449
+ " <td>minimal</td>\n",
1450
+ " <td>NaN</td>\n",
1451
+ " <td>NaN</td>\n",
1452
+ " </tr>\n",
1453
+ " <tr>\n",
1454
+ " <th>2</th>\n",
1455
+ " <td>Pogpotatofarmer/memes</td>\n",
1456
+ " <td>https://huggingface.co/datasets/Pogpotatofarme...</td>\n",
1457
+ " <td>15</td>\n",
1458
+ " <td>None</td>\n",
1459
+ " <td>cc</td>\n",
1460
+ " <td>None</td>\n",
1461
+ " <td>None</td>\n",
1462
+ " <td>2022-07-15 21:11:34+00:00</td>\n",
1463
+ " <td>Short description (char count=0, words=0)</td>\n",
1464
+ " <td>dataset_readmes/Pogpotatofarmer__memes_README.md</td>\n",
1465
+ " <td>0</td>\n",
1466
+ " <td>minimal</td>\n",
1467
+ " <td>NaN</td>\n",
1468
+ " <td>NaN</td>\n",
1469
+ " </tr>\n",
1470
+ " <tr>\n",
1471
+ " <th>3</th>\n",
1472
+ " <td>Splend1dchan/NMSQA_sew-d-tiny-100k-ft-ls100h</td>\n",
1473
+ " <td>https://huggingface.co/datasets/Splend1dchan/N...</td>\n",
1474
+ " <td>11</td>\n",
1475
+ " <td>None</td>\n",
1476
+ " <td>None</td>\n",
1477
+ " <td>None</td>\n",
1478
+ " <td>None</td>\n",
1479
+ " <td>None</td>\n",
1480
+ " <td>Failed to load card</td>\n",
1481
+ " <td>None</td>\n",
1482
+ " <td>0</td>\n",
1483
+ " <td>minimal</td>\n",
1484
+ " <td>NaN</td>\n",
1485
+ " <td>NaN</td>\n",
1486
+ " </tr>\n",
1487
+ " <tr>\n",
1488
+ " <th>4</th>\n",
1489
+ " <td>chamisfum/brain_tumor_3_classes</td>\n",
1490
+ " <td>https://huggingface.co/datasets/chamisfum/brai...</td>\n",
1491
+ " <td>8</td>\n",
1492
+ " <td>None</td>\n",
1493
+ " <td>None</td>\n",
1494
+ " <td>None</td>\n",
1495
+ " <td>None</td>\n",
1496
+ " <td>None</td>\n",
1497
+ " <td>Failed to load card</td>\n",
1498
+ " <td>None</td>\n",
1499
+ " <td>0</td>\n",
1500
+ " <td>minimal</td>\n",
1501
+ " <td>life_sciences</td>\n",
1502
+ " <td>brain</td>\n",
1503
+ " </tr>\n",
1504
+ " <tr>\n",
1505
+ " <th>...</th>\n",
1506
+ " <td>...</td>\n",
1507
+ " <td>...</td>\n",
1508
+ " <td>...</td>\n",
1509
+ " <td>...</td>\n",
1510
+ " <td>...</td>\n",
1511
+ " <td>...</td>\n",
1512
+ " <td>...</td>\n",
1513
+ " <td>...</td>\n",
1514
+ " <td>...</td>\n",
1515
+ " <td>...</td>\n",
1516
+ " <td>...</td>\n",
1517
+ " <td>...</td>\n",
1518
+ " <td>...</td>\n",
1519
+ " <td>...</td>\n",
1520
+ " </tr>\n",
1521
+ " <tr>\n",
1522
+ " <th>503185</th>\n",
1523
+ " <td>ROBOTIS/ffw_bg2_rev4_pick_coffee_bottle_env5_14</td>\n",
1524
+ " <td>https://huggingface.co/datasets/ROBOTIS/ffw_bg...</td>\n",
1525
+ " <td>0</td>\n",
1526
+ " <td>None</td>\n",
1527
+ " <td>apache-2.0</td>\n",
1528
+ " <td>LeRobot, ffw_bg2_rev4_custom, robotis</td>\n",
1529
+ " <td>robotics</td>\n",
1530
+ " <td>2025-09-19 06:28:15+00:00</td>\n",
1531
+ " <td>None</td>\n",
1532
+ " <td>dataset_readmes/ROBOTIS__ffw_bg2_rev4_pick_cof...</td>\n",
1533
+ " <td>299</td>\n",
1534
+ " <td>rich</td>\n",
1535
+ " <td>NaN</td>\n",
1536
+ " <td>NaN</td>\n",
1537
+ " </tr>\n",
1538
+ " <tr>\n",
1539
+ " <th>503186</th>\n",
1540
+ " <td>ROBOTIS/ffw_bg2_rev4_pick_coffee_bottle_env5_15</td>\n",
1541
+ " <td>https://huggingface.co/datasets/ROBOTIS/ffw_bg...</td>\n",
1542
+ " <td>0</td>\n",
1543
+ " <td>None</td>\n",
1544
+ " <td>apache-2.0</td>\n",
1545
+ " <td>LeRobot, ffw_bg2_rev4_custom, robotis</td>\n",
1546
+ " <td>robotics</td>\n",
1547
+ " <td>2025-09-19 06:29:40+00:00</td>\n",
1548
+ " <td>None</td>\n",
1549
+ " <td>dataset_readmes/ROBOTIS__ffw_bg2_rev4_pick_cof...</td>\n",
1550
+ " <td>299</td>\n",
1551
+ " <td>rich</td>\n",
1552
+ " <td>NaN</td>\n",
1553
+ " <td>NaN</td>\n",
1554
+ " </tr>\n",
1555
+ " <tr>\n",
1556
+ " <th>503187</th>\n",
1557
+ " <td>Dongkkka/ffw_bg2_rev4_custom_0919_5</td>\n",
1558
+ " <td>https://huggingface.co/datasets/Dongkkka/ffw_b...</td>\n",
1559
+ " <td>0</td>\n",
1560
+ " <td>None</td>\n",
1561
+ " <td>apache-2.0</td>\n",
1562
+ " <td>LeRobot, ffw_bg2_rev4_custom, robotis</td>\n",
1563
+ " <td>robotics</td>\n",
1564
+ " <td>2025-09-19 06:30:53+00:00</td>\n",
1565
+ " <td>None</td>\n",
1566
+ " <td>dataset_readmes/Dongkkka__ffw_bg2_rev4_custom_...</td>\n",
1567
+ " <td>299</td>\n",
1568
+ " <td>rich</td>\n",
1569
+ " <td>NaN</td>\n",
1570
+ " <td>NaN</td>\n",
1571
+ " </tr>\n",
1572
+ " <tr>\n",
1573
+ " <th>503188</th>\n",
1574
+ " <td>chenxing1234567890/eval_testZ1.2.1</td>\n",
1575
+ " <td>https://huggingface.co/datasets/chenxing123456...</td>\n",
1576
+ " <td>0</td>\n",
1577
+ " <td>None</td>\n",
1578
+ " <td>apache-2.0</td>\n",
1579
+ " <td>LeRobot, tutorial</td>\n",
1580
+ " <td>robotics</td>\n",
1581
+ " <td>2025-09-19 06:34:11+00:00</td>\n",
1582
+ " <td>None</td>\n",
1583
+ " <td>dataset_readmes/chenxing1234567890__eval_testZ...</td>\n",
1584
+ " <td>231</td>\n",
1585
+ " <td>rich</td>\n",
1586
+ " <td>NaN</td>\n",
1587
+ " <td>NaN</td>\n",
1588
+ " </tr>\n",
1589
+ " <tr>\n",
1590
+ " <th>503189</th>\n",
1591
+ " <td>Dongkkka/ffw_bg2_rev4_custom_0919_6</td>\n",
1592
+ " <td>https://huggingface.co/datasets/Dongkkka/ffw_b...</td>\n",
1593
+ " <td>0</td>\n",
1594
+ " <td>None</td>\n",
1595
+ " <td>apache-2.0</td>\n",
1596
+ " <td>LeRobot, ffw_bg2_rev4_custom, robotis</td>\n",
1597
+ " <td>robotics</td>\n",
1598
+ " <td>2025-09-19 06:34:09+00:00</td>\n",
1599
+ " <td>None</td>\n",
1600
+ " <td>dataset_readmes/Dongkkka__ffw_bg2_rev4_custom_...</td>\n",
1601
+ " <td>299</td>\n",
1602
+ " <td>rich</td>\n",
1603
+ " <td>NaN</td>\n",
1604
+ " <td>NaN</td>\n",
1605
+ " </tr>\n",
1606
+ " </tbody>\n",
1607
+ "</table>\n",
1608
+ "<p>503190 rows × 14 columns</p>\n",
1609
+ "</div>"
1610
+ ],
1611
+ "text/plain": [
1612
+ " dataset_id \\\n",
1613
+ "0 akjadhav/leandojo-lean4-formal-informal-strings \n",
1614
+ "1 aemska/stuhl \n",
1615
+ "2 Pogpotatofarmer/memes \n",
1616
+ "3 Splend1dchan/NMSQA_sew-d-tiny-100k-ft-ls100h \n",
1617
+ "4 chamisfum/brain_tumor_3_classes \n",
1618
+ "... ... \n",
1619
+ "503185 ROBOTIS/ffw_bg2_rev4_pick_coffee_bottle_env5_14 \n",
1620
+ "503186 ROBOTIS/ffw_bg2_rev4_pick_coffee_bottle_env5_15 \n",
1621
+ "503187 Dongkkka/ffw_bg2_rev4_custom_0919_5 \n",
1622
+ "503188 chenxing1234567890/eval_testZ1.2.1 \n",
1623
+ "503189 Dongkkka/ffw_bg2_rev4_custom_0919_6 \n",
1624
+ "\n",
1625
+ " dataset_url downloads author \\\n",
1626
+ "0 https://huggingface.co/datasets/akjadhav/leand... 22 None \n",
1627
+ "1 https://huggingface.co/datasets/aemska/stuhl 11 None \n",
1628
+ "2 https://huggingface.co/datasets/Pogpotatofarme... 15 None \n",
1629
+ "3 https://huggingface.co/datasets/Splend1dchan/N... 11 None \n",
1630
+ "4 https://huggingface.co/datasets/chamisfum/brai... 8 None \n",
1631
+ "... ... ... ... \n",
1632
+ "503185 https://huggingface.co/datasets/ROBOTIS/ffw_bg... 0 None \n",
1633
+ "503186 https://huggingface.co/datasets/ROBOTIS/ffw_bg... 0 None \n",
1634
+ "503187 https://huggingface.co/datasets/Dongkkka/ffw_b... 0 None \n",
1635
+ "503188 https://huggingface.co/datasets/chenxing123456... 0 None \n",
1636
+ "503189 https://huggingface.co/datasets/Dongkkka/ffw_b... 0 None \n",
1637
+ "\n",
1638
+ " license tags task_categories \\\n",
1639
+ "0 None None None \n",
1640
+ "1 openrail None None \n",
1641
+ "2 cc None None \n",
1642
+ "3 None None None \n",
1643
+ "4 None None None \n",
1644
+ "... ... ... ... \n",
1645
+ "503185 apache-2.0 LeRobot, ffw_bg2_rev4_custom, robotis robotics \n",
1646
+ "503186 apache-2.0 LeRobot, ffw_bg2_rev4_custom, robotis robotics \n",
1647
+ "503187 apache-2.0 LeRobot, ffw_bg2_rev4_custom, robotis robotics \n",
1648
+ "503188 apache-2.0 LeRobot, tutorial robotics \n",
1649
+ "503189 apache-2.0 LeRobot, ffw_bg2_rev4_custom, robotis robotics \n",
1650
+ "\n",
1651
+ " last_modified reason \\\n",
1652
+ "0 2024-01-30 07:40:02+00:00 No metadata and no description \n",
1653
+ "1 2022-11-11 14:12:36+00:00 Short description (char count=0, words=0) \n",
1654
+ "2 2022-07-15 21:11:34+00:00 Short description (char count=0, words=0) \n",
1655
+ "3 None Failed to load card \n",
1656
+ "4 None Failed to load card \n",
1657
+ "... ... ... \n",
1658
+ "503185 2025-09-19 06:28:15+00:00 None \n",
1659
+ "503186 2025-09-19 06:29:40+00:00 None \n",
1660
+ "503187 2025-09-19 06:30:53+00:00 None \n",
1661
+ "503188 2025-09-19 06:34:11+00:00 None \n",
1662
+ "503189 2025-09-19 06:34:09+00:00 None \n",
1663
+ "\n",
1664
+ " readme_path word_count \\\n",
1665
+ "0 dataset_readmes/akjadhav__leandojo-lean4-forma... 0 \n",
1666
+ "1 dataset_readmes/aemska__stuhl_README.md 0 \n",
1667
+ "2 dataset_readmes/Pogpotatofarmer__memes_README.md 0 \n",
1668
+ "3 None 0 \n",
1669
+ "4 None 0 \n",
1670
+ "... ... ... \n",
1671
+ "503185 dataset_readmes/ROBOTIS__ffw_bg2_rev4_pick_cof... 299 \n",
1672
+ "503186 dataset_readmes/ROBOTIS__ffw_bg2_rev4_pick_cof... 299 \n",
1673
+ "503187 dataset_readmes/Dongkkka__ffw_bg2_rev4_custom_... 299 \n",
1674
+ "503188 dataset_readmes/chenxing1234567890__eval_testZ... 231 \n",
1675
+ "503189 dataset_readmes/Dongkkka__ffw_bg2_rev4_custom_... 299 \n",
1676
+ "\n",
1677
+ " category field keyword \n",
1678
+ "0 minimal NaN NaN \n",
1679
+ "1 minimal NaN NaN \n",
1680
+ "2 minimal NaN NaN \n",
1681
+ "3 minimal NaN NaN \n",
1682
+ "4 minimal life_sciences brain \n",
1683
+ "... ... ... ... \n",
1684
+ "503185 rich NaN NaN \n",
1685
+ "503186 rich NaN NaN \n",
1686
+ "503187 rich NaN NaN \n",
1687
+ "503188 rich NaN NaN \n",
1688
+ "503189 rich NaN NaN \n",
1689
+ "\n",
1690
+ "[503190 rows x 14 columns]"
1691
+ ]
1692
+ },
1693
+ "execution_count": 28,
1694
+ "metadata": {},
1695
+ "output_type": "execute_result"
1696
+ }
1697
+ ],
1698
+ "source": [
1699
+ "merged_df"
1700
+ ]
1701
+ },
1702
+ {
+ "cell_type": "code",
+ "execution_count": 30,
+ "id": "69ec9289",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Number of rows with a value in 'field' column: 4040\n"
+ ]
+ }
+ ],
+ "source": [
+ "import numpy as np\n",
+ "\n",
+ "# Replace all None with np.nan\n",
+ "merged_df = merged_df.replace({None: np.nan})\n",
+ "\n",
+ "# Count rows where the 'field' column has a value (not NaN)\n",
+ "field_count = merged_df[\"field\"].notna().sum()\n",
+ "\n",
+ "print(f\"Number of rows with a value in 'field' column: {field_count}\")\n"
+ ]
+ },
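+ {
+ "cell_type": "markdown",
+ "id": "sketch-missing-counts-note",
+ "metadata": {},
+ "source": [
+ "*Illustrative sketch (not part of the original run; assumes `merged_df` as above):* after normalising `None` to `NaN`, the same coverage check extends naturally to both enrichment columns at once.\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "sketch-missing-counts-code",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Sketch: non-null counts per enrichment column\n",
+ "print(merged_df[[\"field\", \"keyword\"]].notna().sum())\n"
+ ]
+ },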
1728
+ {
+ "cell_type": "code",
+ "execution_count": 31,
+ "id": "b0d58ceb",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
1736
+ "text/html": [
1737
+ "<div>\n",
1738
+ "<style scoped>\n",
1739
+ " .dataframe tbody tr th:only-of-type {\n",
1740
+ " vertical-align: middle;\n",
1741
+ " }\n",
1742
+ "\n",
1743
+ " .dataframe tbody tr th {\n",
1744
+ " vertical-align: top;\n",
1745
+ " }\n",
1746
+ "\n",
1747
+ " .dataframe thead th {\n",
1748
+ " text-align: right;\n",
1749
+ " }\n",
1750
+ "</style>\n",
1751
+ "<table border=\"1\" class=\"dataframe\">\n",
1752
+ " <thead>\n",
1753
+ " <tr style=\"text-align: right;\">\n",
1754
+ " <th></th>\n",
1755
+ " <th>dataset_id</th>\n",
1756
+ " <th>dataset_url</th>\n",
1757
+ " <th>downloads</th>\n",
1758
+ " <th>author</th>\n",
1759
+ " <th>license</th>\n",
1760
+ " <th>tags</th>\n",
1761
+ " <th>task_categories</th>\n",
1762
+ " <th>last_modified</th>\n",
1763
+ " <th>reason</th>\n",
1764
+ " <th>readme_path</th>\n",
1765
+ " <th>word_count</th>\n",
1766
+ " <th>category</th>\n",
1767
+ " <th>field</th>\n",
1768
+ " <th>keyword</th>\n",
1769
+ " </tr>\n",
1770
+ " </thead>\n",
1771
+ " <tbody>\n",
1772
+ " <tr>\n",
1773
+ " <th>0</th>\n",
1774
+ " <td>akjadhav/leandojo-lean4-formal-informal-strings</td>\n",
1775
+ " <td>https://huggingface.co/datasets/akjadhav/leand...</td>\n",
1776
+ " <td>22</td>\n",
1777
+ " <td>NaN</td>\n",
1778
+ " <td>NaN</td>\n",
1779
+ " <td>NaN</td>\n",
1780
+ " <td>NaN</td>\n",
1781
+ " <td>2024-01-30 07:40:02+00:00</td>\n",
1782
+ " <td>No metadata and no description</td>\n",
1783
+ " <td>dataset_readmes/akjadhav__leandojo-lean4-forma...</td>\n",
1784
+ " <td>0</td>\n",
1785
+ " <td>minimal</td>\n",
1786
+ " <td>NaN</td>\n",
1787
+ " <td>NaN</td>\n",
1788
+ " </tr>\n",
1789
+ " <tr>\n",
1790
+ " <th>1</th>\n",
1791
+ " <td>aemska/stuhl</td>\n",
1792
+ " <td>https://huggingface.co/datasets/aemska/stuhl</td>\n",
1793
+ " <td>11</td>\n",
1794
+ " <td>NaN</td>\n",
1795
+ " <td>openrail</td>\n",
1796
+ " <td>NaN</td>\n",
1797
+ " <td>NaN</td>\n",
1798
+ " <td>2022-11-11 14:12:36+00:00</td>\n",
1799
+ " <td>Short description (char count=0, words=0)</td>\n",
1800
+ " <td>dataset_readmes/aemska__stuhl_README.md</td>\n",
1801
+ " <td>0</td>\n",
1802
+ " <td>minimal</td>\n",
1803
+ " <td>NaN</td>\n",
1804
+ " <td>NaN</td>\n",
1805
+ " </tr>\n",
1806
+ " <tr>\n",
1807
+ " <th>2</th>\n",
1808
+ " <td>Pogpotatofarmer/memes</td>\n",
1809
+ " <td>https://huggingface.co/datasets/Pogpotatofarme...</td>\n",
1810
+ " <td>15</td>\n",
1811
+ " <td>NaN</td>\n",
1812
+ " <td>cc</td>\n",
1813
+ " <td>NaN</td>\n",
1814
+ " <td>NaN</td>\n",
1815
+ " <td>2022-07-15 21:11:34+00:00</td>\n",
1816
+ " <td>Short description (char count=0, words=0)</td>\n",
1817
+ " <td>dataset_readmes/Pogpotatofarmer__memes_README.md</td>\n",
1818
+ " <td>0</td>\n",
1819
+ " <td>minimal</td>\n",
1820
+ " <td>NaN</td>\n",
1821
+ " <td>NaN</td>\n",
1822
+ " </tr>\n",
1823
+ " <tr>\n",
1824
+ " <th>3</th>\n",
1825
+ " <td>Splend1dchan/NMSQA_sew-d-tiny-100k-ft-ls100h</td>\n",
1826
+ " <td>https://huggingface.co/datasets/Splend1dchan/N...</td>\n",
1827
+ " <td>11</td>\n",
1828
+ " <td>NaN</td>\n",
1829
+ " <td>NaN</td>\n",
1830
+ " <td>NaN</td>\n",
1831
+ " <td>NaN</td>\n",
1832
+ " <td>NaN</td>\n",
1833
+ " <td>Failed to load card</td>\n",
1834
+ " <td>NaN</td>\n",
1835
+ " <td>0</td>\n",
1836
+ " <td>minimal</td>\n",
1837
+ " <td>NaN</td>\n",
1838
+ " <td>NaN</td>\n",
1839
+ " </tr>\n",
1840
+ " <tr>\n",
1841
+ " <th>4</th>\n",
1842
+ " <td>chamisfum/brain_tumor_3_classes</td>\n",
1843
+ " <td>https://huggingface.co/datasets/chamisfum/brai...</td>\n",
1844
+ " <td>8</td>\n",
1845
+ " <td>NaN</td>\n",
1846
+ " <td>NaN</td>\n",
1847
+ " <td>NaN</td>\n",
1848
+ " <td>NaN</td>\n",
1849
+ " <td>NaN</td>\n",
1850
+ " <td>Failed to load card</td>\n",
1851
+ " <td>NaN</td>\n",
1852
+ " <td>0</td>\n",
1853
+ " <td>minimal</td>\n",
1854
+ " <td>life_sciences</td>\n",
1855
+ " <td>brain</td>\n",
1856
+ " </tr>\n",
1857
+ " <tr>\n",
1858
+ " <th>...</th>\n",
1859
+ " <td>...</td>\n",
1860
+ " <td>...</td>\n",
1861
+ " <td>...</td>\n",
1862
+ " <td>...</td>\n",
1863
+ " <td>...</td>\n",
1864
+ " <td>...</td>\n",
1865
+ " <td>...</td>\n",
1866
+ " <td>...</td>\n",
1867
+ " <td>...</td>\n",
1868
+ " <td>...</td>\n",
1869
+ " <td>...</td>\n",
1870
+ " <td>...</td>\n",
1871
+ " <td>...</td>\n",
1872
+ " <td>...</td>\n",
1873
+ " </tr>\n",
1874
+ " <tr>\n",
1875
+ " <th>503185</th>\n",
1876
+ " <td>ROBOTIS/ffw_bg2_rev4_pick_coffee_bottle_env5_14</td>\n",
1877
+ " <td>https://huggingface.co/datasets/ROBOTIS/ffw_bg...</td>\n",
1878
+ " <td>0</td>\n",
1879
+ " <td>NaN</td>\n",
1880
+ " <td>apache-2.0</td>\n",
1881
+ " <td>LeRobot, ffw_bg2_rev4_custom, robotis</td>\n",
1882
+ " <td>robotics</td>\n",
1883
+ " <td>2025-09-19 06:28:15+00:00</td>\n",
1884
+ " <td>NaN</td>\n",
1885
+ " <td>dataset_readmes/ROBOTIS__ffw_bg2_rev4_pick_cof...</td>\n",
1886
+ " <td>299</td>\n",
1887
+ " <td>rich</td>\n",
1888
+ " <td>NaN</td>\n",
1889
+ " <td>NaN</td>\n",
1890
+ " </tr>\n",
1891
+ " <tr>\n",
1892
+ " <th>503186</th>\n",
1893
+ " <td>ROBOTIS/ffw_bg2_rev4_pick_coffee_bottle_env5_15</td>\n",
1894
+ " <td>https://huggingface.co/datasets/ROBOTIS/ffw_bg...</td>\n",
1895
+ " <td>0</td>\n",
1896
+ " <td>NaN</td>\n",
1897
+ " <td>apache-2.0</td>\n",
1898
+ " <td>LeRobot, ffw_bg2_rev4_custom, robotis</td>\n",
1899
+ " <td>robotics</td>\n",
1900
+ " <td>2025-09-19 06:29:40+00:00</td>\n",
1901
+ " <td>NaN</td>\n",
1902
+ " <td>dataset_readmes/ROBOTIS__ffw_bg2_rev4_pick_cof...</td>\n",
1903
+ " <td>299</td>\n",
1904
+ " <td>rich</td>\n",
1905
+ " <td>NaN</td>\n",
1906
+ " <td>NaN</td>\n",
1907
+ " </tr>\n",
1908
+ " <tr>\n",
1909
+ " <th>503187</th>\n",
1910
+ " <td>Dongkkka/ffw_bg2_rev4_custom_0919_5</td>\n",
1911
+ " <td>https://huggingface.co/datasets/Dongkkka/ffw_b...</td>\n",
1912
+ " <td>0</td>\n",
1913
+ " <td>NaN</td>\n",
1914
+ " <td>apache-2.0</td>\n",
1915
+ " <td>LeRobot, ffw_bg2_rev4_custom, robotis</td>\n",
1916
+ " <td>robotics</td>\n",
1917
+ " <td>2025-09-19 06:30:53+00:00</td>\n",
1918
+ " <td>NaN</td>\n",
1919
+ " <td>dataset_readmes/Dongkkka__ffw_bg2_rev4_custom_...</td>\n",
1920
+ " <td>299</td>\n",
1921
+ " <td>rich</td>\n",
1922
+ " <td>NaN</td>\n",
1923
+ " <td>NaN</td>\n",
1924
+ " </tr>\n",
1925
+ " <tr>\n",
1926
+ " <th>503188</th>\n",
1927
+ " <td>chenxing1234567890/eval_testZ1.2.1</td>\n",
1928
+ " <td>https://huggingface.co/datasets/chenxing123456...</td>\n",
1929
+ " <td>0</td>\n",
1930
+ " <td>NaN</td>\n",
1931
+ " <td>apache-2.0</td>\n",
1932
+ " <td>LeRobot, tutorial</td>\n",
1933
+ " <td>robotics</td>\n",
1934
+ " <td>2025-09-19 06:34:11+00:00</td>\n",
1935
+ " <td>NaN</td>\n",
1936
+ " <td>dataset_readmes/chenxing1234567890__eval_testZ...</td>\n",
1937
+ " <td>231</td>\n",
1938
+ " <td>rich</td>\n",
1939
+ " <td>NaN</td>\n",
1940
+ " <td>NaN</td>\n",
1941
+ " </tr>\n",
1942
+ " <tr>\n",
1943
+ " <th>503189</th>\n",
1944
+ " <td>Dongkkka/ffw_bg2_rev4_custom_0919_6</td>\n",
1945
+ " <td>https://huggingface.co/datasets/Dongkkka/ffw_b...</td>\n",
1946
+ " <td>0</td>\n",
1947
+ " <td>NaN</td>\n",
1948
+ " <td>apache-2.0</td>\n",
1949
+ " <td>LeRobot, ffw_bg2_rev4_custom, robotis</td>\n",
1950
+ " <td>robotics</td>\n",
1951
+ " <td>2025-09-19 06:34:09+00:00</td>\n",
1952
+ " <td>NaN</td>\n",
1953
+ " <td>dataset_readmes/Dongkkka__ffw_bg2_rev4_custom_...</td>\n",
1954
+ " <td>299</td>\n",
1955
+ " <td>rich</td>\n",
1956
+ " <td>NaN</td>\n",
1957
+ " <td>NaN</td>\n",
1958
+ " </tr>\n",
1959
+ " </tbody>\n",
1960
+ "</table>\n",
1961
+ "<p>503190 rows × 14 columns</p>\n",
1962
+ "</div>"
1963
+ ],
1964
+ "text/plain": [
1965
+ " dataset_id \\\n",
1966
+ "0 akjadhav/leandojo-lean4-formal-informal-strings \n",
1967
+ "1 aemska/stuhl \n",
1968
+ "2 Pogpotatofarmer/memes \n",
1969
+ "3 Splend1dchan/NMSQA_sew-d-tiny-100k-ft-ls100h \n",
1970
+ "4 chamisfum/brain_tumor_3_classes \n",
1971
+ "... ... \n",
1972
+ "503185 ROBOTIS/ffw_bg2_rev4_pick_coffee_bottle_env5_14 \n",
1973
+ "503186 ROBOTIS/ffw_bg2_rev4_pick_coffee_bottle_env5_15 \n",
1974
+ "503187 Dongkkka/ffw_bg2_rev4_custom_0919_5 \n",
1975
+ "503188 chenxing1234567890/eval_testZ1.2.1 \n",
1976
+ "503189 Dongkkka/ffw_bg2_rev4_custom_0919_6 \n",
1977
+ "\n",
1978
+ " dataset_url downloads author \\\n",
1979
+ "0 https://huggingface.co/datasets/akjadhav/leand... 22 NaN \n",
1980
+ "1 https://huggingface.co/datasets/aemska/stuhl 11 NaN \n",
1981
+ "2 https://huggingface.co/datasets/Pogpotatofarme... 15 NaN \n",
1982
+ "3 https://huggingface.co/datasets/Splend1dchan/N... 11 NaN \n",
1983
+ "4 https://huggingface.co/datasets/chamisfum/brai... 8 NaN \n",
1984
+ "... ... ... ... \n",
1985
+ "503185 https://huggingface.co/datasets/ROBOTIS/ffw_bg... 0 NaN \n",
1986
+ "503186 https://huggingface.co/datasets/ROBOTIS/ffw_bg... 0 NaN \n",
1987
+ "503187 https://huggingface.co/datasets/Dongkkka/ffw_b... 0 NaN \n",
1988
+ "503188 https://huggingface.co/datasets/chenxing123456... 0 NaN \n",
1989
+ "503189 https://huggingface.co/datasets/Dongkkka/ffw_b... 0 NaN \n",
1990
+ "\n",
1991
+ " license tags task_categories \\\n",
1992
+ "0 NaN NaN NaN \n",
1993
+ "1 openrail NaN NaN \n",
1994
+ "2 cc NaN NaN \n",
1995
+ "3 NaN NaN NaN \n",
1996
+ "4 NaN NaN NaN \n",
1997
+ "... ... ... ... \n",
1998
+ "503185 apache-2.0 LeRobot, ffw_bg2_rev4_custom, robotis robotics \n",
1999
+ "503186 apache-2.0 LeRobot, ffw_bg2_rev4_custom, robotis robotics \n",
2000
+ "503187 apache-2.0 LeRobot, ffw_bg2_rev4_custom, robotis robotics \n",
2001
+ "503188 apache-2.0 LeRobot, tutorial robotics \n",
2002
+ "503189 apache-2.0 LeRobot, ffw_bg2_rev4_custom, robotis robotics \n",
2003
+ "\n",
2004
+ " last_modified reason \\\n",
2005
+ "0 2024-01-30 07:40:02+00:00 No metadata and no description \n",
2006
+ "1 2022-11-11 14:12:36+00:00 Short description (char count=0, words=0) \n",
2007
+ "2 2022-07-15 21:11:34+00:00 Short description (char count=0, words=0) \n",
2008
+ "3 NaN Failed to load card \n",
2009
+ "4 NaN Failed to load card \n",
2010
+ "... ... ... \n",
2011
+ "503185 2025-09-19 06:28:15+00:00 NaN \n",
2012
+ "503186 2025-09-19 06:29:40+00:00 NaN \n",
2013
+ "503187 2025-09-19 06:30:53+00:00 NaN \n",
2014
+ "503188 2025-09-19 06:34:11+00:00 NaN \n",
2015
+ "503189 2025-09-19 06:34:09+00:00 NaN \n",
2016
+ "\n",
2017
+ " readme_path word_count \\\n",
2018
+ "0 dataset_readmes/akjadhav__leandojo-lean4-forma... 0 \n",
2019
+ "1 dataset_readmes/aemska__stuhl_README.md 0 \n",
2020
+ "2 dataset_readmes/Pogpotatofarmer__memes_README.md 0 \n",
2021
+ "3 NaN 0 \n",
2022
+ "4 NaN 0 \n",
2023
+ "... ... ... \n",
2024
+ "503185 dataset_readmes/ROBOTIS__ffw_bg2_rev4_pick_cof... 299 \n",
2025
+ "503186 dataset_readmes/ROBOTIS__ffw_bg2_rev4_pick_cof... 299 \n",
2026
+ "503187 dataset_readmes/Dongkkka__ffw_bg2_rev4_custom_... 299 \n",
2027
+ "503188 dataset_readmes/chenxing1234567890__eval_testZ... 231 \n",
2028
+ "503189 dataset_readmes/Dongkkka__ffw_bg2_rev4_custom_... 299 \n",
2029
+ "\n",
2030
+ " category field keyword \n",
2031
+ "0 minimal NaN NaN \n",
2032
+ "1 minimal NaN NaN \n",
2033
+ "2 minimal NaN NaN \n",
2034
+ "3 minimal NaN NaN \n",
2035
+ "4 minimal life_sciences brain \n",
2036
+ "... ... ... ... \n",
2037
+ "503185 rich NaN NaN \n",
2038
+ "503186 rich NaN NaN \n",
2039
+ "503187 rich NaN NaN \n",
2040
+ "503188 rich NaN NaN \n",
2041
+ "503189 rich NaN NaN \n",
2042
+ "\n",
2043
+ "[503190 rows x 14 columns]"
2044
+ ]
2045
+ },
2046
+ "execution_count": 31,
2047
+ "metadata": {},
2048
+ "output_type": "execute_result"
2049
+ }
2050
+ ],
2051
+ "source": [
2052
+ "merged_df"
2053
+ ]
2054
+ },
2055
+ {
+ "cell_type": "code",
+ "execution_count": 32,
+ "id": "d8d61dc6",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "merged_df saved to 'datasetcards.parquet'\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Save to parquet\n",
+ "merged_df.to_parquet(\"datasetcards.parquet\", engine=\"pyarrow\", index=False)\n",
+ "\n",
+ "print(\"merged_df saved to 'datasetcards.parquet'\")\n"
+ ]
+ },
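+ {
+ "cell_type": "markdown",
+ "id": "sketch-readback-note",
+ "metadata": {},
+ "source": [
+ "*Illustrative sketch (not part of the original run):* a quick read-back of the freshly written file catches write or schema problems immediately; this assumes `datasetcards.parquet` was produced by the cell above and that `pd` is still in scope.\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "sketch-readback-code",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Sketch: reload the parquet and confirm the shape survived the round trip\n",
+ "check_df = pd.read_parquet(\"datasetcards.parquet\")\n",
+ "print(check_df.shape)  # expected: (503190, 14)\n",
+ "print(list(check_df.columns))\n"
+ ]
+ }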
2076
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "hftest",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.10.18"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+ }