Santosh committed on
Commit
c7a5270
·
1 Parent(s): a405ce7

made changes

Browse files
Files changed (3) hide show
  1. app.py +1 -1
  2. datasetcards_new.parquet +2 -2
  3. preprocessing.ipynb +0 -2098
app.py CHANGED
@@ -308,7 +308,7 @@ with gr.Blocks() as demo:
308
  - Track progress using `status`.
309
 
310
  ## Why the table?
311
- The table gives a structured view of all datasets, making it easy to sort, filter, and update information for each dataset.
312
 
313
  ## What does the table contain?
314
  Each row represents a dataset. Columns include:
 
308
  - Track progress using `status`.
309
 
310
  ## Why the table?
311
+ The table gives a structured view of all datasets, making it easy to sort, filter, and update information for each dataset. It consists of all datasets until 20-09-2025.
312
 
313
  ## What does the table contain?
314
  Each row represents a dataset. Columns include:
datasetcards_new.parquet CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b0d3770a3024eaf459d5c12d2c4a9d0d5a5043660d0a15c062a387595602eacf
3
- size 38347730
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c248074b63bc77b236e8096e3423779f3a5bf4cbe24a2683ea63da31a1c4c154
3
+ size 35038132
preprocessing.ipynb DELETED
@@ -1,2098 +0,0 @@
1
- {
2
- "cells": [
3
- {
4
- "cell_type": "code",
5
- "execution_count": 1,
6
- "id": "4e64d318",
7
- "metadata": {},
8
- "outputs": [
9
- {
10
- "name": "stdout",
11
- "output_type": "stream",
12
- "text": [
13
- " dataset_id \\\n",
14
- "0 akjadhav/leandojo-lean4-formal-informal-strings \n",
15
- "1 aemska/stuhl \n",
16
- "2 Pogpotatofarmer/memes \n",
17
- "3 Splend1dchan/NMSQA_sew-d-tiny-100k-ft-ls100h \n",
18
- "4 chamisfum/brain_tumor_3_classes \n",
19
- "\n",
20
- " dataset_url downloads author \\\n",
21
- "0 https://huggingface.co/datasets/akjadhav/leand... 22 None \n",
22
- "1 https://huggingface.co/datasets/aemska/stuhl 11 None \n",
23
- "2 https://huggingface.co/datasets/Pogpotatofarme... 15 None \n",
24
- "3 https://huggingface.co/datasets/Splend1dchan/N... 11 None \n",
25
- "4 https://huggingface.co/datasets/chamisfum/brai... 8 None \n",
26
- "\n",
27
- " license tags task_categories last_modified \\\n",
28
- "0 None None None 2024-01-30 07:40:02+00:00 \n",
29
- "1 openrail None None 2022-11-11 14:12:36+00:00 \n",
30
- "2 cc None None 2022-07-15 21:11:34+00:00 \n",
31
- "3 None None None None \n",
32
- "4 None None None None \n",
33
- "\n",
34
- " reason \\\n",
35
- "0 No metadata and no description \n",
36
- "1 Short description (char count=0, words=0) \n",
37
- "2 Short description (char count=0, words=0) \n",
38
- "3 Failed to load card \n",
39
- "4 Failed to load card \n",
40
- "\n",
41
- " readme_path word_count category \n",
42
- "0 dataset_readmes/akjadhav__leandojo-lean4-forma... 0 minimal \n",
43
- "1 dataset_readmes/aemska__stuhl_README.md 0 minimal \n",
44
- "2 dataset_readmes/Pogpotatofarmer__memes_README.md 0 minimal \n",
45
- "3 None 0 minimal \n",
46
- "4 None 0 minimal \n",
47
- " dataset_id \\\n",
48
- "0 autoevaluate/autoeval-staging-eval-launch__gov... \n",
49
- "1 autoevaluate/autoeval-eval-emotion-default-fe1... \n",
50
- "2 LTCB/enwik8 \n",
51
- "3 boltuix/emotions-dataset \n",
52
- "4 yixuantt/MultiHopRAG \n",
53
- "\n",
54
- " dataset_url downloads author \\\n",
55
- "0 https://huggingface.co/datasets/autoevaluate/a... 8 None \n",
56
- "1 https://huggingface.co/datasets/autoevaluate/a... 8 None \n",
57
- "2 https://huggingface.co/datasets/LTCB/enwik8 154 None \n",
58
- "3 https://huggingface.co/datasets/boltuix/emotio... 754 None \n",
59
- "4 https://huggingface.co/datasets/yixuantt/Multi... 7050 None \n",
60
- "\n",
61
- " license tags \\\n",
62
- "0 None autotrain, evaluation \n",
63
- "1 None autotrain, evaluation \n",
64
- "2 ['mit'] None \n",
65
- "3 mit emotions, nlp, sentiment-analysis, emotion-cla... \n",
66
- "4 odc-by None \n",
67
- "\n",
68
- " task_categories last_modified reason \\\n",
69
- "0 None 2022-09-09 07:44:04+00:00 None \n",
70
- "1 None 2022-09-16 20:22:59+00:00 None \n",
71
- "2 fill-mask, text-generation 2024-01-18 11:19:13+00:00 None \n",
72
- "3 None 2025-05-25 15:41:59+00:00 None \n",
73
- "4 question-answering, feature-extraction 2024-01-30 02:49:29+00:00 None \n",
74
- "\n",
75
- " readme_path word_count category \n",
76
- "0 dataset_readmes/autoevaluate__autoeval-staging... 55 rich \n",
77
- "1 dataset_readmes/autoevaluate__autoeval-eval-em... 57 rich \n",
78
- "2 dataset_readmes/LTCB__enwik8_README.md 427 rich \n",
79
- "3 dataset_readmes/boltuix__emotions-dataset_READ... 1643 rich \n",
80
- "4 dataset_readmes/yixuantt__MultiHopRAG_README.md 111 rich \n"
81
- ]
82
- }
83
- ],
84
- "source": [
85
- "import pandas as pd\n",
86
- "\n",
87
- "# Read parquet files\n",
88
- "df1 = pd.read_parquet(\"/home/santosh/Repositories/personal/huggingface/dataset-insight-portal/all_minimal_dataset_cards.parquet\")\n",
89
- "df2 = pd.read_parquet(\"/home/santosh/Repositories/personal/huggingface/dataset-insight-portal/all_rich_dataset_cards.parquet\")\n",
90
- "\n",
91
- "# Display first few rows\n",
92
- "print(df1.head())\n",
93
- "print(df2.head())"
94
- ]
95
- },
96
- {
97
- "cell_type": "code",
98
- "execution_count": 2,
99
- "id": "e9a20931",
100
- "metadata": {},
101
- "outputs": [
102
- {
103
- "data": {
104
- "text/html": [
105
- "<div>\n",
106
- "<style scoped>\n",
107
- " .dataframe tbody tr th:only-of-type {\n",
108
- " vertical-align: middle;\n",
109
- " }\n",
110
- "\n",
111
- " .dataframe tbody tr th {\n",
112
- " vertical-align: top;\n",
113
- " }\n",
114
- "\n",
115
- " .dataframe thead th {\n",
116
- " text-align: right;\n",
117
- " }\n",
118
- "</style>\n",
119
- "<table border=\"1\" class=\"dataframe\">\n",
120
- " <thead>\n",
121
- " <tr style=\"text-align: right;\">\n",
122
- " <th></th>\n",
123
- " <th>dataset_id</th>\n",
124
- " <th>dataset_url</th>\n",
125
- " <th>downloads</th>\n",
126
- " <th>author</th>\n",
127
- " <th>license</th>\n",
128
- " <th>tags</th>\n",
129
- " <th>task_categories</th>\n",
130
- " <th>last_modified</th>\n",
131
- " <th>reason</th>\n",
132
- " <th>readme_path</th>\n",
133
- " <th>word_count</th>\n",
134
- " <th>category</th>\n",
135
- " </tr>\n",
136
- " </thead>\n",
137
- " <tbody>\n",
138
- " <tr>\n",
139
- " <th>0</th>\n",
140
- " <td>akjadhav/leandojo-lean4-formal-informal-strings</td>\n",
141
- " <td>https://huggingface.co/datasets/akjadhav/leand...</td>\n",
142
- " <td>22</td>\n",
143
- " <td>None</td>\n",
144
- " <td>None</td>\n",
145
- " <td>None</td>\n",
146
- " <td>None</td>\n",
147
- " <td>2024-01-30 07:40:02+00:00</td>\n",
148
- " <td>No metadata and no description</td>\n",
149
- " <td>dataset_readmes/akjadhav__leandojo-lean4-forma...</td>\n",
150
- " <td>0</td>\n",
151
- " <td>minimal</td>\n",
152
- " </tr>\n",
153
- " <tr>\n",
154
- " <th>1</th>\n",
155
- " <td>aemska/stuhl</td>\n",
156
- " <td>https://huggingface.co/datasets/aemska/stuhl</td>\n",
157
- " <td>11</td>\n",
158
- " <td>None</td>\n",
159
- " <td>openrail</td>\n",
160
- " <td>None</td>\n",
161
- " <td>None</td>\n",
162
- " <td>2022-11-11 14:12:36+00:00</td>\n",
163
- " <td>Short description (char count=0, words=0)</td>\n",
164
- " <td>dataset_readmes/aemska__stuhl_README.md</td>\n",
165
- " <td>0</td>\n",
166
- " <td>minimal</td>\n",
167
- " </tr>\n",
168
- " <tr>\n",
169
- " <th>2</th>\n",
170
- " <td>Pogpotatofarmer/memes</td>\n",
171
- " <td>https://huggingface.co/datasets/Pogpotatofarme...</td>\n",
172
- " <td>15</td>\n",
173
- " <td>None</td>\n",
174
- " <td>cc</td>\n",
175
- " <td>None</td>\n",
176
- " <td>None</td>\n",
177
- " <td>2022-07-15 21:11:34+00:00</td>\n",
178
- " <td>Short description (char count=0, words=0)</td>\n",
179
- " <td>dataset_readmes/Pogpotatofarmer__memes_README.md</td>\n",
180
- " <td>0</td>\n",
181
- " <td>minimal</td>\n",
182
- " </tr>\n",
183
- " <tr>\n",
184
- " <th>3</th>\n",
185
- " <td>Splend1dchan/NMSQA_sew-d-tiny-100k-ft-ls100h</td>\n",
186
- " <td>https://huggingface.co/datasets/Splend1dchan/N...</td>\n",
187
- " <td>11</td>\n",
188
- " <td>None</td>\n",
189
- " <td>None</td>\n",
190
- " <td>None</td>\n",
191
- " <td>None</td>\n",
192
- " <td>None</td>\n",
193
- " <td>Failed to load card</td>\n",
194
- " <td>None</td>\n",
195
- " <td>0</td>\n",
196
- " <td>minimal</td>\n",
197
- " </tr>\n",
198
- " <tr>\n",
199
- " <th>4</th>\n",
200
- " <td>chamisfum/brain_tumor_3_classes</td>\n",
201
- " <td>https://huggingface.co/datasets/chamisfum/brai...</td>\n",
202
- " <td>8</td>\n",
203
- " <td>None</td>\n",
204
- " <td>None</td>\n",
205
- " <td>None</td>\n",
206
- " <td>None</td>\n",
207
- " <td>None</td>\n",
208
- " <td>Failed to load card</td>\n",
209
- " <td>None</td>\n",
210
- " <td>0</td>\n",
211
- " <td>minimal</td>\n",
212
- " </tr>\n",
213
- " <tr>\n",
214
- " <th>...</th>\n",
215
- " <td>...</td>\n",
216
- " <td>...</td>\n",
217
- " <td>...</td>\n",
218
- " <td>...</td>\n",
219
- " <td>...</td>\n",
220
- " <td>...</td>\n",
221
- " <td>...</td>\n",
222
- " <td>...</td>\n",
223
- " <td>...</td>\n",
224
- " <td>...</td>\n",
225
- " <td>...</td>\n",
226
- " <td>...</td>\n",
227
- " </tr>\n",
228
- " <tr>\n",
229
- " <th>400292</th>\n",
230
- " <td>TAUR-dev/D-EVAL__standard_eval_v3__RC_BF_ab-bo...</td>\n",
231
- " <td>https://huggingface.co/datasets/TAUR-dev/D-EVA...</td>\n",
232
- " <td>0</td>\n",
233
- " <td>None</td>\n",
234
- " <td>None</td>\n",
235
- " <td>None</td>\n",
236
- " <td>None</td>\n",
237
- " <td>2025-09-19 06:27:52+00:00</td>\n",
238
- " <td>Short description (char count=0, words=0)</td>\n",
239
- " <td>dataset_readmes/TAUR-dev__D-EVAL__standard_eva...</td>\n",
240
- " <td>0</td>\n",
241
- " <td>minimal</td>\n",
242
- " </tr>\n",
243
- " <tr>\n",
244
- " <th>400293</th>\n",
245
- " <td>TAUR-dev/D-EVAL__standard_eval_v3__RC_BF_ab-bo...</td>\n",
246
- " <td>https://huggingface.co/datasets/TAUR-dev/D-EVA...</td>\n",
247
- " <td>0</td>\n",
248
- " <td>None</td>\n",
249
- " <td>None</td>\n",
250
- " <td>None</td>\n",
251
- " <td>None</td>\n",
252
- " <td>2025-09-19 06:28:16+00:00</td>\n",
253
- " <td>Short description (char count=0, words=0)</td>\n",
254
- " <td>dataset_readmes/TAUR-dev__D-EVAL__standard_eva...</td>\n",
255
- " <td>0</td>\n",
256
- " <td>minimal</td>\n",
257
- " </tr>\n",
258
- " <tr>\n",
259
- " <th>400294</th>\n",
260
- " <td>haru101/Minecraft-Knowledge-Dataset</td>\n",
261
- " <td>https://huggingface.co/datasets/haru101/Minecr...</td>\n",
262
- " <td>0</td>\n",
263
- " <td>None</td>\n",
264
- " <td>apache-2.0</td>\n",
265
- " <td>None</td>\n",
266
- " <td>question-answering</td>\n",
267
- " <td>2025-09-19 06:33:33+00:00</td>\n",
268
- " <td>Short description (char count=0, words=0)</td>\n",
269
- " <td>dataset_readmes/haru101__Minecraft-Knowledge-D...</td>\n",
270
- " <td>0</td>\n",
271
- " <td>minimal</td>\n",
272
- " </tr>\n",
273
- " <tr>\n",
274
- " <th>400295</th>\n",
275
- " <td>sxj1215/mmimdb_sorted_with_label_2</td>\n",
276
- " <td>https://huggingface.co/datasets/sxj1215/mmimdb...</td>\n",
277
- " <td>0</td>\n",
278
- " <td>None</td>\n",
279
- " <td>None</td>\n",
280
- " <td>None</td>\n",
281
- " <td>None</td>\n",
282
- " <td>2025-09-19 06:35:25+00:00</td>\n",
283
- " <td>Short description (char count=0, words=0)</td>\n",
284
- " <td>dataset_readmes/sxj1215__mmimdb_sorted_with_la...</td>\n",
285
- " <td>0</td>\n",
286
- " <td>minimal</td>\n",
287
- " </tr>\n",
288
- " <tr>\n",
289
- " <th>400296</th>\n",
290
- " <td>Vikir2411CS19/Multimodal_Complaint</td>\n",
291
- " <td>https://huggingface.co/datasets/Vikir2411CS19/...</td>\n",
292
- " <td>0</td>\n",
293
- " <td>None</td>\n",
294
- " <td>None</td>\n",
295
- " <td>None</td>\n",
296
- " <td>None</td>\n",
297
- " <td>2025-09-19 06:35:01+00:00</td>\n",
298
- " <td>Short description (char count=0, words=0)</td>\n",
299
- " <td>dataset_readmes/Vikir2411CS19__Multimodal_Comp...</td>\n",
300
- " <td>0</td>\n",
301
- " <td>minimal</td>\n",
302
- " </tr>\n",
303
- " </tbody>\n",
304
- "</table>\n",
305
- "<p>400297 rows × 12 columns</p>\n",
306
- "</div>"
307
- ],
308
- "text/plain": [
309
- " dataset_id \\\n",
310
- "0 akjadhav/leandojo-lean4-formal-informal-strings \n",
311
- "1 aemska/stuhl \n",
312
- "2 Pogpotatofarmer/memes \n",
313
- "3 Splend1dchan/NMSQA_sew-d-tiny-100k-ft-ls100h \n",
314
- "4 chamisfum/brain_tumor_3_classes \n",
315
- "... ... \n",
316
- "400292 TAUR-dev/D-EVAL__standard_eval_v3__RC_BF_ab-bo... \n",
317
- "400293 TAUR-dev/D-EVAL__standard_eval_v3__RC_BF_ab-bo... \n",
318
- "400294 haru101/Minecraft-Knowledge-Dataset \n",
319
- "400295 sxj1215/mmimdb_sorted_with_label_2 \n",
320
- "400296 Vikir2411CS19/Multimodal_Complaint \n",
321
- "\n",
322
- " dataset_url downloads author \\\n",
323
- "0 https://huggingface.co/datasets/akjadhav/leand... 22 None \n",
324
- "1 https://huggingface.co/datasets/aemska/stuhl 11 None \n",
325
- "2 https://huggingface.co/datasets/Pogpotatofarme... 15 None \n",
326
- "3 https://huggingface.co/datasets/Splend1dchan/N... 11 None \n",
327
- "4 https://huggingface.co/datasets/chamisfum/brai... 8 None \n",
328
- "... ... ... ... \n",
329
- "400292 https://huggingface.co/datasets/TAUR-dev/D-EVA... 0 None \n",
330
- "400293 https://huggingface.co/datasets/TAUR-dev/D-EVA... 0 None \n",
331
- "400294 https://huggingface.co/datasets/haru101/Minecr... 0 None \n",
332
- "400295 https://huggingface.co/datasets/sxj1215/mmimdb... 0 None \n",
333
- "400296 https://huggingface.co/datasets/Vikir2411CS19/... 0 None \n",
334
- "\n",
335
- " license tags task_categories last_modified \\\n",
336
- "0 None None None 2024-01-30 07:40:02+00:00 \n",
337
- "1 openrail None None 2022-11-11 14:12:36+00:00 \n",
338
- "2 cc None None 2022-07-15 21:11:34+00:00 \n",
339
- "3 None None None None \n",
340
- "4 None None None None \n",
341
- "... ... ... ... ... \n",
342
- "400292 None None None 2025-09-19 06:27:52+00:00 \n",
343
- "400293 None None None 2025-09-19 06:28:16+00:00 \n",
344
- "400294 apache-2.0 None question-answering 2025-09-19 06:33:33+00:00 \n",
345
- "400295 None None None 2025-09-19 06:35:25+00:00 \n",
346
- "400296 None None None 2025-09-19 06:35:01+00:00 \n",
347
- "\n",
348
- " reason \\\n",
349
- "0 No metadata and no description \n",
350
- "1 Short description (char count=0, words=0) \n",
351
- "2 Short description (char count=0, words=0) \n",
352
- "3 Failed to load card \n",
353
- "4 Failed to load card \n",
354
- "... ... \n",
355
- "400292 Short description (char count=0, words=0) \n",
356
- "400293 Short description (char count=0, words=0) \n",
357
- "400294 Short description (char count=0, words=0) \n",
358
- "400295 Short description (char count=0, words=0) \n",
359
- "400296 Short description (char count=0, words=0) \n",
360
- "\n",
361
- " readme_path word_count category \n",
362
- "0 dataset_readmes/akjadhav__leandojo-lean4-forma... 0 minimal \n",
363
- "1 dataset_readmes/aemska__stuhl_README.md 0 minimal \n",
364
- "2 dataset_readmes/Pogpotatofarmer__memes_README.md 0 minimal \n",
365
- "3 None 0 minimal \n",
366
- "4 None 0 minimal \n",
367
- "... ... ... ... \n",
368
- "400292 dataset_readmes/TAUR-dev__D-EVAL__standard_eva... 0 minimal \n",
369
- "400293 dataset_readmes/TAUR-dev__D-EVAL__standard_eva... 0 minimal \n",
370
- "400294 dataset_readmes/haru101__Minecraft-Knowledge-D... 0 minimal \n",
371
- "400295 dataset_readmes/sxj1215__mmimdb_sorted_with_la... 0 minimal \n",
372
- "400296 dataset_readmes/Vikir2411CS19__Multimodal_Comp... 0 minimal \n",
373
- "\n",
374
- "[400297 rows x 12 columns]"
375
- ]
376
- },
377
- "execution_count": 2,
378
- "metadata": {},
379
- "output_type": "execute_result"
380
- }
381
- ],
382
- "source": [
383
- "df1"
384
- ]
385
- },
386
- {
387
- "cell_type": "code",
388
- "execution_count": 4,
389
- "id": "b5582c36",
390
- "metadata": {},
391
- "outputs": [
392
- {
393
- "data": {
394
- "text/html": [
395
- "<div>\n",
396
- "<style scoped>\n",
397
- " .dataframe tbody tr th:only-of-type {\n",
398
- " vertical-align: middle;\n",
399
- " }\n",
400
- "\n",
401
- " .dataframe tbody tr th {\n",
402
- " vertical-align: top;\n",
403
- " }\n",
404
- "\n",
405
- " .dataframe thead th {\n",
406
- " text-align: right;\n",
407
- " }\n",
408
- "</style>\n",
409
- "<table border=\"1\" class=\"dataframe\">\n",
410
- " <thead>\n",
411
- " <tr style=\"text-align: right;\">\n",
412
- " <th></th>\n",
413
- " <th>id</th>\n",
414
- " <th>url</th>\n",
415
- " <th>field</th>\n",
416
- " <th>keyword</th>\n",
417
- " <th>missing_readme</th>\n",
418
- " <th>missing_card</th>\n",
419
- " </tr>\n",
420
- " </thead>\n",
421
- " <tbody>\n",
422
- " <tr>\n",
423
- " <th>0</th>\n",
424
- " <td>solomonk/reddit_mental_health_posts</td>\n",
425
- " <td>https://huggingface.co/datasets/solomonk/reddi...</td>\n",
426
- " <td>life_sciences</td>\n",
427
- " <td>health</td>\n",
428
- " <td>False</td>\n",
429
- " <td>True</td>\n",
430
- " </tr>\n",
431
- " <tr>\n",
432
- " <th>1</th>\n",
433
- " <td>Kira-Asimov/gender_clinical_trial</td>\n",
434
- " <td>https://huggingface.co/datasets/Kira-Asimov/ge...</td>\n",
435
- " <td>life_sciences</td>\n",
436
- " <td>clinical</td>\n",
437
- " <td>False</td>\n",
438
- " <td>True</td>\n",
439
- " </tr>\n",
440
- " <tr>\n",
441
- " <th>2</th>\n",
442
- " <td>samhog/psychology-6k</td>\n",
443
- " <td>https://huggingface.co/datasets/samhog/psychol...</td>\n",
444
- " <td>life_sciences</td>\n",
445
- " <td>psychology</td>\n",
446
- " <td>True</td>\n",
447
- " <td>True</td>\n",
448
- " </tr>\n",
449
- " <tr>\n",
450
- " <th>3</th>\n",
451
- " <td>TCMLM/real_clinical_cases_of_Famous_Old_TCM_Do...</td>\n",
452
- " <td>https://huggingface.co/datasets/TCMLM/real_cli...</td>\n",
453
- " <td>life_sciences</td>\n",
454
- " <td>clinical</td>\n",
455
- " <td>False</td>\n",
456
- " <td>True</td>\n",
457
- " </tr>\n",
458
- " <tr>\n",
459
- " <th>4</th>\n",
460
- " <td>jibrand/plant-dataset-JSONL</td>\n",
461
- " <td>https://huggingface.co/datasets/jibrand/plant-...</td>\n",
462
- " <td>agriculture_and_biology</td>\n",
463
- " <td>plant</td>\n",
464
- " <td>True</td>\n",
465
- " <td>True</td>\n",
466
- " </tr>\n",
467
- " <tr>\n",
468
- " <th>...</th>\n",
469
- " <td>...</td>\n",
470
- " <td>...</td>\n",
471
- " <td>...</td>\n",
472
- " <td>...</td>\n",
473
- " <td>...</td>\n",
474
- " <td>...</td>\n",
475
- " </tr>\n",
476
- " <tr>\n",
477
- " <th>4035</th>\n",
478
- " <td>AshwinManohar/medicine_normalizer_alpaca</td>\n",
479
- " <td>https://huggingface.co/datasets/AshwinManohar/...</td>\n",
480
- " <td>life_sciences</td>\n",
481
- " <td>medicine</td>\n",
482
- " <td>True</td>\n",
483
- " <td>True</td>\n",
484
- " </tr>\n",
485
- " <tr>\n",
486
- " <th>4036</th>\n",
487
- " <td>AshwinManohar/medicine_parser_alpaca</td>\n",
488
- " <td>https://huggingface.co/datasets/AshwinManohar/...</td>\n",
489
- " <td>life_sciences</td>\n",
490
- " <td>medicine</td>\n",
491
- " <td>True</td>\n",
492
- " <td>True</td>\n",
493
- " </tr>\n",
494
- " <tr>\n",
495
- " <th>4037</th>\n",
496
- " <td>AshwinManohar/medicine_normalizer_alpaca_20k</td>\n",
497
- " <td>https://huggingface.co/datasets/AshwinManohar/...</td>\n",
498
- " <td>life_sciences</td>\n",
499
- " <td>medicine</td>\n",
500
- " <td>True</td>\n",
501
- " <td>True</td>\n",
502
- " </tr>\n",
503
- " <tr>\n",
504
- " <th>4038</th>\n",
505
- " <td>Adithyaaaa/plant_leaf_classification</td>\n",
506
- " <td>https://huggingface.co/datasets/Adithyaaaa/pla...</td>\n",
507
- " <td>agriculture_and_biology</td>\n",
508
- " <td>plant</td>\n",
509
- " <td>True</td>\n",
510
- " <td>True</td>\n",
511
- " </tr>\n",
512
- " <tr>\n",
513
- " <th>4039</th>\n",
514
- " <td>benali-ai-24/drug-data-public</td>\n",
515
- " <td>https://huggingface.co/datasets/benali-ai-24/d...</td>\n",
516
- " <td>life_sciences</td>\n",
517
- " <td>drug</td>\n",
518
- " <td>True</td>\n",
519
- " <td>True</td>\n",
520
- " </tr>\n",
521
- " </tbody>\n",
522
- "</table>\n",
523
- "<p>4040 rows × 6 columns</p>\n",
524
- "</div>"
525
- ],
526
- "text/plain": [
527
- " id \\\n",
528
- "0 solomonk/reddit_mental_health_posts \n",
529
- "1 Kira-Asimov/gender_clinical_trial \n",
530
- "2 samhog/psychology-6k \n",
531
- "3 TCMLM/real_clinical_cases_of_Famous_Old_TCM_Do... \n",
532
- "4 jibrand/plant-dataset-JSONL \n",
533
- "... ... \n",
534
- "4035 AshwinManohar/medicine_normalizer_alpaca \n",
535
- "4036 AshwinManohar/medicine_parser_alpaca \n",
536
- "4037 AshwinManohar/medicine_normalizer_alpaca_20k \n",
537
- "4038 Adithyaaaa/plant_leaf_classification \n",
538
- "4039 benali-ai-24/drug-data-public \n",
539
- "\n",
540
- " url \\\n",
541
- "0 https://huggingface.co/datasets/solomonk/reddi... \n",
542
- "1 https://huggingface.co/datasets/Kira-Asimov/ge... \n",
543
- "2 https://huggingface.co/datasets/samhog/psychol... \n",
544
- "3 https://huggingface.co/datasets/TCMLM/real_cli... \n",
545
- "4 https://huggingface.co/datasets/jibrand/plant-... \n",
546
- "... ... \n",
547
- "4035 https://huggingface.co/datasets/AshwinManohar/... \n",
548
- "4036 https://huggingface.co/datasets/AshwinManohar/... \n",
549
- "4037 https://huggingface.co/datasets/AshwinManohar/... \n",
550
- "4038 https://huggingface.co/datasets/Adithyaaaa/pla... \n",
551
- "4039 https://huggingface.co/datasets/benali-ai-24/d... \n",
552
- "\n",
553
- " field keyword missing_readme missing_card \n",
554
- "0 life_sciences health False True \n",
555
- "1 life_sciences clinical False True \n",
556
- "2 life_sciences psychology True True \n",
557
- "3 life_sciences clinical False True \n",
558
- "4 agriculture_and_biology plant True True \n",
559
- "... ... ... ... ... \n",
560
- "4035 life_sciences medicine True True \n",
561
- "4036 life_sciences medicine True True \n",
562
- "4037 life_sciences medicine True True \n",
563
- "4038 agriculture_and_biology plant True True \n",
564
- "4039 life_sciences drug True True \n",
565
- "\n",
566
- "[4040 rows x 6 columns]"
567
- ]
568
- },
569
- "execution_count": 4,
570
- "metadata": {},
571
- "output_type": "execute_result"
572
- }
573
- ],
574
- "source": [
575
- "csv_df = pd.read_csv(\"/home/santosh/Repositories/personal/huggingface/dataset-insight-portal/ds_missing_sci_data_4k.csv\")\n",
576
- "csv_df"
577
- ]
578
- },
579
- {
580
- "cell_type": "code",
581
- "execution_count": 6,
582
- "id": "a061659a",
583
- "metadata": {},
584
- "outputs": [
585
- {
586
- "data": {
587
- "text/html": [
588
- "<div>\n",
589
- "<style scoped>\n",
590
- " .dataframe tbody tr th:only-of-type {\n",
591
- " vertical-align: middle;\n",
592
- " }\n",
593
- "\n",
594
- " .dataframe tbody tr th {\n",
595
- " vertical-align: top;\n",
596
- " }\n",
597
- "\n",
598
- " .dataframe thead th {\n",
599
- " text-align: right;\n",
600
- " }\n",
601
- "</style>\n",
602
- "<table border=\"1\" class=\"dataframe\">\n",
603
- " <thead>\n",
604
- " <tr style=\"text-align: right;\">\n",
605
- " <th></th>\n",
606
- " <th>dataset_id</th>\n",
607
- " <th>dataset_url</th>\n",
608
- " <th>downloads</th>\n",
609
- " <th>author</th>\n",
610
- " <th>license</th>\n",
611
- " <th>tags</th>\n",
612
- " <th>task_categories</th>\n",
613
- " <th>last_modified</th>\n",
614
- " <th>reason</th>\n",
615
- " <th>readme_path</th>\n",
616
- " <th>word_count</th>\n",
617
- " <th>category</th>\n",
618
- " </tr>\n",
619
- " </thead>\n",
620
- " <tbody>\n",
621
- " <tr>\n",
622
- " <th>0</th>\n",
623
- " <td>akjadhav/leandojo-lean4-formal-informal-strings</td>\n",
624
- " <td>https://huggingface.co/datasets/akjadhav/leand...</td>\n",
625
- " <td>22</td>\n",
626
- " <td>None</td>\n",
627
- " <td>None</td>\n",
628
- " <td>None</td>\n",
629
- " <td>None</td>\n",
630
- " <td>2024-01-30 07:40:02+00:00</td>\n",
631
- " <td>No metadata and no description</td>\n",
632
- " <td>dataset_readmes/akjadhav__leandojo-lean4-forma...</td>\n",
633
- " <td>0</td>\n",
634
- " <td>minimal</td>\n",
635
- " </tr>\n",
636
- " <tr>\n",
637
- " <th>1</th>\n",
638
- " <td>aemska/stuhl</td>\n",
639
- " <td>https://huggingface.co/datasets/aemska/stuhl</td>\n",
640
- " <td>11</td>\n",
641
- " <td>None</td>\n",
642
- " <td>openrail</td>\n",
643
- " <td>None</td>\n",
644
- " <td>None</td>\n",
645
- " <td>2022-11-11 14:12:36+00:00</td>\n",
646
- " <td>Short description (char count=0, words=0)</td>\n",
647
- " <td>dataset_readmes/aemska__stuhl_README.md</td>\n",
648
- " <td>0</td>\n",
649
- " <td>minimal</td>\n",
650
- " </tr>\n",
651
- " <tr>\n",
652
- " <th>2</th>\n",
653
- " <td>Pogpotatofarmer/memes</td>\n",
654
- " <td>https://huggingface.co/datasets/Pogpotatofarme...</td>\n",
655
- " <td>15</td>\n",
656
- " <td>None</td>\n",
657
- " <td>cc</td>\n",
658
- " <td>None</td>\n",
659
- " <td>None</td>\n",
660
- " <td>2022-07-15 21:11:34+00:00</td>\n",
661
- " <td>Short description (char count=0, words=0)</td>\n",
662
- " <td>dataset_readmes/Pogpotatofarmer__memes_README.md</td>\n",
663
- " <td>0</td>\n",
664
- " <td>minimal</td>\n",
665
- " </tr>\n",
666
- " <tr>\n",
667
- " <th>3</th>\n",
668
- " <td>Splend1dchan/NMSQA_sew-d-tiny-100k-ft-ls100h</td>\n",
669
- " <td>https://huggingface.co/datasets/Splend1dchan/N...</td>\n",
670
- " <td>11</td>\n",
671
- " <td>None</td>\n",
672
- " <td>None</td>\n",
673
- " <td>None</td>\n",
674
- " <td>None</td>\n",
675
- " <td>None</td>\n",
676
- " <td>Failed to load card</td>\n",
677
- " <td>None</td>\n",
678
- " <td>0</td>\n",
679
- " <td>minimal</td>\n",
680
- " </tr>\n",
681
- " <tr>\n",
682
- " <th>4</th>\n",
683
- " <td>chamisfum/brain_tumor_3_classes</td>\n",
684
- " <td>https://huggingface.co/datasets/chamisfum/brai...</td>\n",
685
- " <td>8</td>\n",
686
- " <td>None</td>\n",
687
- " <td>None</td>\n",
688
- " <td>None</td>\n",
689
- " <td>None</td>\n",
690
- " <td>None</td>\n",
691
- " <td>Failed to load card</td>\n",
692
- " <td>None</td>\n",
693
- " <td>0</td>\n",
694
- " <td>minimal</td>\n",
695
- " </tr>\n",
696
- " <tr>\n",
697
- " <th>...</th>\n",
698
- " <td>...</td>\n",
699
- " <td>...</td>\n",
700
- " <td>...</td>\n",
701
- " <td>...</td>\n",
702
- " <td>...</td>\n",
703
- " <td>...</td>\n",
704
- " <td>...</td>\n",
705
- " <td>...</td>\n",
706
- " <td>...</td>\n",
707
- " <td>...</td>\n",
708
- " <td>...</td>\n",
709
- " <td>...</td>\n",
710
- " </tr>\n",
711
- " <tr>\n",
712
- " <th>503185</th>\n",
713
- " <td>ROBOTIS/ffw_bg2_rev4_pick_coffee_bottle_env5_14</td>\n",
714
- " <td>https://huggingface.co/datasets/ROBOTIS/ffw_bg...</td>\n",
715
- " <td>0</td>\n",
716
- " <td>None</td>\n",
717
- " <td>apache-2.0</td>\n",
718
- " <td>LeRobot, ffw_bg2_rev4_custom, robotis</td>\n",
719
- " <td>robotics</td>\n",
720
- " <td>2025-09-19 06:28:15+00:00</td>\n",
721
- " <td>None</td>\n",
722
- " <td>dataset_readmes/ROBOTIS__ffw_bg2_rev4_pick_cof...</td>\n",
723
- " <td>299</td>\n",
724
- " <td>rich</td>\n",
725
- " </tr>\n",
726
- " <tr>\n",
727
- " <th>503186</th>\n",
728
- " <td>ROBOTIS/ffw_bg2_rev4_pick_coffee_bottle_env5_15</td>\n",
729
- " <td>https://huggingface.co/datasets/ROBOTIS/ffw_bg...</td>\n",
730
- " <td>0</td>\n",
731
- " <td>None</td>\n",
732
- " <td>apache-2.0</td>\n",
733
- " <td>LeRobot, ffw_bg2_rev4_custom, robotis</td>\n",
734
- " <td>robotics</td>\n",
735
- " <td>2025-09-19 06:29:40+00:00</td>\n",
736
- " <td>None</td>\n",
737
- " <td>dataset_readmes/ROBOTIS__ffw_bg2_rev4_pick_cof...</td>\n",
738
- " <td>299</td>\n",
739
- " <td>rich</td>\n",
740
- " </tr>\n",
741
- " <tr>\n",
742
- " <th>503187</th>\n",
743
- " <td>Dongkkka/ffw_bg2_rev4_custom_0919_5</td>\n",
744
- " <td>https://huggingface.co/datasets/Dongkkka/ffw_b...</td>\n",
745
- " <td>0</td>\n",
746
- " <td>None</td>\n",
747
- " <td>apache-2.0</td>\n",
748
- " <td>LeRobot, ffw_bg2_rev4_custom, robotis</td>\n",
749
- " <td>robotics</td>\n",
750
- " <td>2025-09-19 06:30:53+00:00</td>\n",
751
- " <td>None</td>\n",
752
- " <td>dataset_readmes/Dongkkka__ffw_bg2_rev4_custom_...</td>\n",
753
- " <td>299</td>\n",
754
- " <td>rich</td>\n",
755
- " </tr>\n",
756
- " <tr>\n",
757
- " <th>503188</th>\n",
758
- " <td>chenxing1234567890/eval_testZ1.2.1</td>\n",
759
- " <td>https://huggingface.co/datasets/chenxing123456...</td>\n",
760
- " <td>0</td>\n",
761
- " <td>None</td>\n",
762
- " <td>apache-2.0</td>\n",
763
- " <td>LeRobot, tutorial</td>\n",
764
- " <td>robotics</td>\n",
765
- " <td>2025-09-19 06:34:11+00:00</td>\n",
766
- " <td>None</td>\n",
767
- " <td>dataset_readmes/chenxing1234567890__eval_testZ...</td>\n",
768
- " <td>231</td>\n",
769
- " <td>rich</td>\n",
770
- " </tr>\n",
771
- " <tr>\n",
772
- " <th>503189</th>\n",
773
- " <td>Dongkkka/ffw_bg2_rev4_custom_0919_6</td>\n",
774
- " <td>https://huggingface.co/datasets/Dongkkka/ffw_b...</td>\n",
775
- " <td>0</td>\n",
776
- " <td>None</td>\n",
777
- " <td>apache-2.0</td>\n",
778
- " <td>LeRobot, ffw_bg2_rev4_custom, robotis</td>\n",
779
- " <td>robotics</td>\n",
780
- " <td>2025-09-19 06:34:09+00:00</td>\n",
781
- " <td>None</td>\n",
782
- " <td>dataset_readmes/Dongkkka__ffw_bg2_rev4_custom_...</td>\n",
783
- " <td>299</td>\n",
784
- " <td>rich</td>\n",
785
- " </tr>\n",
786
- " </tbody>\n",
787
- "</table>\n",
788
- "<p>503190 rows × 12 columns</p>\n",
789
- "</div>"
790
- ],
791
- "text/plain": [
792
- " dataset_id \\\n",
793
- "0 akjadhav/leandojo-lean4-formal-informal-strings \n",
794
- "1 aemska/stuhl \n",
795
- "2 Pogpotatofarmer/memes \n",
796
- "3 Splend1dchan/NMSQA_sew-d-tiny-100k-ft-ls100h \n",
797
- "4 chamisfum/brain_tumor_3_classes \n",
798
- "... ... \n",
799
- "503185 ROBOTIS/ffw_bg2_rev4_pick_coffee_bottle_env5_14 \n",
800
- "503186 ROBOTIS/ffw_bg2_rev4_pick_coffee_bottle_env5_15 \n",
801
- "503187 Dongkkka/ffw_bg2_rev4_custom_0919_5 \n",
802
- "503188 chenxing1234567890/eval_testZ1.2.1 \n",
803
- "503189 Dongkkka/ffw_bg2_rev4_custom_0919_6 \n",
804
- "\n",
805
- " dataset_url downloads author \\\n",
806
- "0 https://huggingface.co/datasets/akjadhav/leand... 22 None \n",
807
- "1 https://huggingface.co/datasets/aemska/stuhl 11 None \n",
808
- "2 https://huggingface.co/datasets/Pogpotatofarme... 15 None \n",
809
- "3 https://huggingface.co/datasets/Splend1dchan/N... 11 None \n",
810
- "4 https://huggingface.co/datasets/chamisfum/brai... 8 None \n",
811
- "... ... ... ... \n",
812
- "503185 https://huggingface.co/datasets/ROBOTIS/ffw_bg... 0 None \n",
813
- "503186 https://huggingface.co/datasets/ROBOTIS/ffw_bg... 0 None \n",
814
- "503187 https://huggingface.co/datasets/Dongkkka/ffw_b... 0 None \n",
815
- "503188 https://huggingface.co/datasets/chenxing123456... 0 None \n",
816
- "503189 https://huggingface.co/datasets/Dongkkka/ffw_b... 0 None \n",
817
- "\n",
818
- " license tags task_categories \\\n",
819
- "0 None None None \n",
820
- "1 openrail None None \n",
821
- "2 cc None None \n",
822
- "3 None None None \n",
823
- "4 None None None \n",
824
- "... ... ... ... \n",
825
- "503185 apache-2.0 LeRobot, ffw_bg2_rev4_custom, robotis robotics \n",
826
- "503186 apache-2.0 LeRobot, ffw_bg2_rev4_custom, robotis robotics \n",
827
- "503187 apache-2.0 LeRobot, ffw_bg2_rev4_custom, robotis robotics \n",
828
- "503188 apache-2.0 LeRobot, tutorial robotics \n",
829
- "503189 apache-2.0 LeRobot, ffw_bg2_rev4_custom, robotis robotics \n",
830
- "\n",
831
- " last_modified reason \\\n",
832
- "0 2024-01-30 07:40:02+00:00 No metadata and no description \n",
833
- "1 2022-11-11 14:12:36+00:00 Short description (char count=0, words=0) \n",
834
- "2 2022-07-15 21:11:34+00:00 Short description (char count=0, words=0) \n",
835
- "3 None Failed to load card \n",
836
- "4 None Failed to load card \n",
837
- "... ... ... \n",
838
- "503185 2025-09-19 06:28:15+00:00 None \n",
839
- "503186 2025-09-19 06:29:40+00:00 None \n",
840
- "503187 2025-09-19 06:30:53+00:00 None \n",
841
- "503188 2025-09-19 06:34:11+00:00 None \n",
842
- "503189 2025-09-19 06:34:09+00:00 None \n",
843
- "\n",
844
- " readme_path word_count category \n",
845
- "0 dataset_readmes/akjadhav__leandojo-lean4-forma... 0 minimal \n",
846
- "1 dataset_readmes/aemska__stuhl_README.md 0 minimal \n",
847
- "2 dataset_readmes/Pogpotatofarmer__memes_README.md 0 minimal \n",
848
- "3 None 0 minimal \n",
849
- "4 None 0 minimal \n",
850
- "... ... ... ... \n",
851
- "503185 dataset_readmes/ROBOTIS__ffw_bg2_rev4_pick_cof... 299 rich \n",
852
- "503186 dataset_readmes/ROBOTIS__ffw_bg2_rev4_pick_cof... 299 rich \n",
853
- "503187 dataset_readmes/Dongkkka__ffw_bg2_rev4_custom_... 299 rich \n",
854
- "503188 dataset_readmes/chenxing1234567890__eval_testZ... 231 rich \n",
855
- "503189 dataset_readmes/Dongkkka__ffw_bg2_rev4_custom_... 299 rich \n",
856
- "\n",
857
- "[503190 rows x 12 columns]"
858
- ]
859
- },
860
- "execution_count": 6,
861
- "metadata": {},
862
- "output_type": "execute_result"
863
- }
864
- ],
865
- "source": [
866
- "merged_df = pd.concat([df1, df2], ignore_index=True)\n",
867
- "merged_df"
868
- ]
869
- },
870
- {
871
- "cell_type": "code",
872
- "execution_count": 21,
873
- "id": "e0623157",
874
- "metadata": {},
875
- "outputs": [
876
- {
877
- "name": "stdout",
878
- "output_type": "stream",
879
- "text": [
880
- "(11, 7)\n",
881
- " id \\\n",
882
- "623 introspector/unimath \n",
883
- "766 ekim15/bone_marrow_cell_dataset \n",
884
- "1645 fabriciojm/ecg-examples \n",
885
- "3280 ahork/record-test-6 \n",
886
- "3281 RickRain/SecondTrySimData3 \n",
887
- "\n",
888
- " url \\\n",
889
- "623 https://huggingface.co/datasets/introspector/u... \n",
890
- "766 https://huggingface.co/datasets/ekim15/bone_ma... \n",
891
- "1645 https://huggingface.co/datasets/fabriciojm/ecg... \n",
892
- "3280 https://huggingface.co/datasets/ahork/record-t... \n",
893
- "3281 https://huggingface.co/datasets/RickRain/Secon... \n",
894
- "\n",
895
- " field keyword missing_readme missing_card \\\n",
896
- "623 mathematics_and_statistics math False True \n",
897
- "766 life_sciences biology True False \n",
898
- "1645 life_sciences medical True False \n",
899
- "3280 engineering_and_technology robotics True False \n",
900
- "3281 engineering_and_technology robotics True False \n",
901
- "\n",
902
- " _id_lower \n",
903
- "623 introspector/unimath \n",
904
- "766 ekim15/bone_marrow_cell_dataset \n",
905
- "1645 fabriciojm/ecg-examples \n",
906
- "3280 ahork/record-test-6 \n",
907
- "3281 rickrain/secondtrysimdata3 \n"
908
- ]
909
- }
910
- ],
911
- "source": [
912
- "# Create lowercase helper columns\n",
913
- "df1[\"_dataset_id_lower\"] = df1[\"dataset_id\"].str.lower()\n",
914
- "csv_df[\"_id_lower\"] = csv_df[\"id\"].str.lower()\n",
915
- "\n",
916
- "# Get the rows from df3 where id is NOT in df1\n",
917
- "df3_missed = csv_df[~csv_df[\"_id_lower\"].isin(df1[\"_dataset_id_lower\"])]\n",
918
- "\n",
919
- "print(df3_missed.shape)\n",
920
- "print(df3_missed.head())\n"
921
- ]
922
- },
923
- {
924
- "cell_type": "code",
925
- "execution_count": 25,
926
- "id": "b6dbce79",
927
- "metadata": {},
928
- "outputs": [
929
- {
930
- "data": {
931
- "text/plain": [
932
- "array([['introspector/unimath',\n",
933
- " 'https://huggingface.co/datasets/introspector/unimath',\n",
934
- " 'mathematics_and_statistics', 'math', False, True,\n",
935
- " 'introspector/unimath'],\n",
936
- " ['ekim15/bone_marrow_cell_dataset',\n",
937
- " 'https://huggingface.co/datasets/ekim15/bone_marrow_cell_dataset',\n",
938
- " 'life_sciences', 'biology', True, False,\n",
939
- " 'ekim15/bone_marrow_cell_dataset'],\n",
940
- " ['fabriciojm/ecg-examples',\n",
941
- " 'https://huggingface.co/datasets/fabriciojm/ecg-examples',\n",
942
- " 'life_sciences', 'medical', True, False,\n",
943
- " 'fabriciojm/ecg-examples'],\n",
944
- " ['ahork/record-test-6',\n",
945
- " 'https://huggingface.co/datasets/ahork/record-test-6',\n",
946
- " 'engineering_and_technology', 'robotics', True, False,\n",
947
- " 'ahork/record-test-6'],\n",
948
- " ['RickRain/SecondTrySimData3',\n",
949
- " 'https://huggingface.co/datasets/RickRain/SecondTrySimData3',\n",
950
- " 'engineering_and_technology', 'robotics', True, False,\n",
951
- " 'rickrain/secondtrysimdata3'],\n",
952
- " ['MulixBF/record-cube-pick-2cam-black-2',\n",
953
- " 'https://huggingface.co/datasets/MulixBF/record-cube-pick-2cam-black-2',\n",
954
- " 'engineering_and_technology', 'robotics', True, False,\n",
955
- " 'mulixbf/record-cube-pick-2cam-black-2'],\n",
956
- " ['ricdigi/1two-camera3-test2345',\n",
957
- " 'https://huggingface.co/datasets/ricdigi/1two-camera3-test2345',\n",
958
- " 'engineering_and_technology', 'robotics', True, False,\n",
959
- " 'ricdigi/1two-camera3-test2345'],\n",
960
- " ['Ninkofu/sushi_put',\n",
961
- " 'https://huggingface.co/datasets/Ninkofu/sushi_put',\n",
962
- " 'engineering_and_technology', 'robotics', True, False,\n",
963
- " 'ninkofu/sushi_put'],\n",
964
- " ['jokla89/record-test-temp1',\n",
965
- " 'https://huggingface.co/datasets/jokla89/record-test-temp1',\n",
966
- " 'engineering_and_technology', 'robotics', True, False,\n",
967
- " 'jokla89/record-test-temp1'],\n",
968
- " ['LeRobot-worldwide-hackathon/325-casino-dealer-dice-set',\n",
969
- " 'https://huggingface.co/datasets/LeRobot-worldwide-hackathon/325-casino-dealer-dice-set',\n",
970
- " 'engineering_and_technology', 'robotics', True, False,\n",
971
- " 'lerobot-worldwide-hackathon/325-casino-dealer-dice-set'],\n",
972
- " ['jackvial/koch_screwdriver_attach_orange_panel_e125',\n",
973
- " 'https://huggingface.co/datasets/jackvial/koch_screwdriver_attach_orange_panel_e125',\n",
974
- " 'engineering_and_technology', 'robotics', True, False,\n",
975
- " 'jackvial/koch_screwdriver_attach_orange_panel_e125']],\n",
976
- " dtype=object)"
977
- ]
978
- },
979
- "execution_count": 25,
980
- "metadata": {},
981
- "output_type": "execute_result"
982
- }
983
- ],
984
- "source": [
985
- "df3_missed.values"
986
- ]
987
- },
988
- {
989
- "cell_type": "code",
990
- "execution_count": 26,
991
- "id": "0cec2023",
992
- "metadata": {},
993
- "outputs": [
994
- {
995
- "data": {
996
- "text/html": [
997
- "<div>\n",
998
- "<style scoped>\n",
999
- " .dataframe tbody tr th:only-of-type {\n",
1000
- " vertical-align: middle;\n",
1001
- " }\n",
1002
- "\n",
1003
- " .dataframe tbody tr th {\n",
1004
- " vertical-align: top;\n",
1005
- " }\n",
1006
- "\n",
1007
- " .dataframe thead th {\n",
1008
- " text-align: right;\n",
1009
- " }\n",
1010
- "</style>\n",
1011
- "<table border=\"1\" class=\"dataframe\">\n",
1012
- " <thead>\n",
1013
- " <tr style=\"text-align: right;\">\n",
1014
- " <th></th>\n",
1015
- " <th>dataset_id</th>\n",
1016
- " <th>dataset_url</th>\n",
1017
- " <th>downloads</th>\n",
1018
- " <th>author</th>\n",
1019
- " <th>license</th>\n",
1020
- " <th>tags</th>\n",
1021
- " <th>task_categories</th>\n",
1022
- " <th>last_modified</th>\n",
1023
- " <th>reason</th>\n",
1024
- " <th>readme_path</th>\n",
1025
- " <th>word_count</th>\n",
1026
- " <th>category</th>\n",
1027
- " <th>_dataset_id_lower</th>\n",
1028
- " </tr>\n",
1029
- " </thead>\n",
1030
- " <tbody>\n",
1031
- " <tr>\n",
1032
- " <th>0</th>\n",
1033
- " <td>akjadhav/leandojo-lean4-formal-informal-strings</td>\n",
1034
- " <td>https://huggingface.co/datasets/akjadhav/leand...</td>\n",
1035
- " <td>22</td>\n",
1036
- " <td>None</td>\n",
1037
- " <td>None</td>\n",
1038
- " <td>None</td>\n",
1039
- " <td>None</td>\n",
1040
- " <td>2024-01-30 07:40:02+00:00</td>\n",
1041
- " <td>No metadata and no description</td>\n",
1042
- " <td>dataset_readmes/akjadhav__leandojo-lean4-forma...</td>\n",
1043
- " <td>0</td>\n",
1044
- " <td>minimal</td>\n",
1045
- " <td>akjadhav/leandojo-lean4-formal-informal-strings</td>\n",
1046
- " </tr>\n",
1047
- " <tr>\n",
1048
- " <th>1</th>\n",
1049
- " <td>aemska/stuhl</td>\n",
1050
- " <td>https://huggingface.co/datasets/aemska/stuhl</td>\n",
1051
- " <td>11</td>\n",
1052
- " <td>None</td>\n",
1053
- " <td>openrail</td>\n",
1054
- " <td>None</td>\n",
1055
- " <td>None</td>\n",
1056
- " <td>2022-11-11 14:12:36+00:00</td>\n",
1057
- " <td>Short description (char count=0, words=0)</td>\n",
1058
- " <td>dataset_readmes/aemska__stuhl_README.md</td>\n",
1059
- " <td>0</td>\n",
1060
- " <td>minimal</td>\n",
1061
- " <td>aemska/stuhl</td>\n",
1062
- " </tr>\n",
1063
- " <tr>\n",
1064
- " <th>2</th>\n",
1065
- " <td>Pogpotatofarmer/memes</td>\n",
1066
- " <td>https://huggingface.co/datasets/Pogpotatofarme...</td>\n",
1067
- " <td>15</td>\n",
1068
- " <td>None</td>\n",
1069
- " <td>cc</td>\n",
1070
- " <td>None</td>\n",
1071
- " <td>None</td>\n",
1072
- " <td>2022-07-15 21:11:34+00:00</td>\n",
1073
- " <td>Short description (char count=0, words=0)</td>\n",
1074
- " <td>dataset_readmes/Pogpotatofarmer__memes_README.md</td>\n",
1075
- " <td>0</td>\n",
1076
- " <td>minimal</td>\n",
1077
- " <td>pogpotatofarmer/memes</td>\n",
1078
- " </tr>\n",
1079
- " <tr>\n",
1080
- " <th>3</th>\n",
1081
- " <td>Splend1dchan/NMSQA_sew-d-tiny-100k-ft-ls100h</td>\n",
1082
- " <td>https://huggingface.co/datasets/Splend1dchan/N...</td>\n",
1083
- " <td>11</td>\n",
1084
- " <td>None</td>\n",
1085
- " <td>None</td>\n",
1086
- " <td>None</td>\n",
1087
- " <td>None</td>\n",
1088
- " <td>None</td>\n",
1089
- " <td>Failed to load card</td>\n",
1090
- " <td>None</td>\n",
1091
- " <td>0</td>\n",
1092
- " <td>minimal</td>\n",
1093
- " <td>splend1dchan/nmsqa_sew-d-tiny-100k-ft-ls100h</td>\n",
1094
- " </tr>\n",
1095
- " <tr>\n",
1096
- " <th>4</th>\n",
1097
- " <td>chamisfum/brain_tumor_3_classes</td>\n",
1098
- " <td>https://huggingface.co/datasets/chamisfum/brai...</td>\n",
1099
- " <td>8</td>\n",
1100
- " <td>None</td>\n",
1101
- " <td>None</td>\n",
1102
- " <td>None</td>\n",
1103
- " <td>None</td>\n",
1104
- " <td>None</td>\n",
1105
- " <td>Failed to load card</td>\n",
1106
- " <td>None</td>\n",
1107
- " <td>0</td>\n",
1108
- " <td>minimal</td>\n",
1109
- " <td>chamisfum/brain_tumor_3_classes</td>\n",
1110
- " </tr>\n",
1111
- " <tr>\n",
1112
- " <th>...</th>\n",
1113
- " <td>...</td>\n",
1114
- " <td>...</td>\n",
1115
- " <td>...</td>\n",
1116
- " <td>...</td>\n",
1117
- " <td>...</td>\n",
1118
- " <td>...</td>\n",
1119
- " <td>...</td>\n",
1120
- " <td>...</td>\n",
1121
- " <td>...</td>\n",
1122
- " <td>...</td>\n",
1123
- " <td>...</td>\n",
1124
- " <td>...</td>\n",
1125
- " <td>...</td>\n",
1126
- " </tr>\n",
1127
- " <tr>\n",
1128
- " <th>503185</th>\n",
1129
- " <td>ROBOTIS/ffw_bg2_rev4_pick_coffee_bottle_env5_14</td>\n",
1130
- " <td>https://huggingface.co/datasets/ROBOTIS/ffw_bg...</td>\n",
1131
- " <td>0</td>\n",
1132
- " <td>None</td>\n",
1133
- " <td>apache-2.0</td>\n",
1134
- " <td>LeRobot, ffw_bg2_rev4_custom, robotis</td>\n",
1135
- " <td>robotics</td>\n",
1136
- " <td>2025-09-19 06:28:15+00:00</td>\n",
1137
- " <td>None</td>\n",
1138
- " <td>dataset_readmes/ROBOTIS__ffw_bg2_rev4_pick_cof...</td>\n",
1139
- " <td>299</td>\n",
1140
- " <td>rich</td>\n",
1141
- " <td>robotis/ffw_bg2_rev4_pick_coffee_bottle_env5_14</td>\n",
1142
- " </tr>\n",
1143
- " <tr>\n",
1144
- " <th>503186</th>\n",
1145
- " <td>ROBOTIS/ffw_bg2_rev4_pick_coffee_bottle_env5_15</td>\n",
1146
- " <td>https://huggingface.co/datasets/ROBOTIS/ffw_bg...</td>\n",
1147
- " <td>0</td>\n",
1148
- " <td>None</td>\n",
1149
- " <td>apache-2.0</td>\n",
1150
- " <td>LeRobot, ffw_bg2_rev4_custom, robotis</td>\n",
1151
- " <td>robotics</td>\n",
1152
- " <td>2025-09-19 06:29:40+00:00</td>\n",
1153
- " <td>None</td>\n",
1154
- " <td>dataset_readmes/ROBOTIS__ffw_bg2_rev4_pick_cof...</td>\n",
1155
- " <td>299</td>\n",
1156
- " <td>rich</td>\n",
1157
- " <td>robotis/ffw_bg2_rev4_pick_coffee_bottle_env5_15</td>\n",
1158
- " </tr>\n",
1159
- " <tr>\n",
1160
- " <th>503187</th>\n",
1161
- " <td>Dongkkka/ffw_bg2_rev4_custom_0919_5</td>\n",
1162
- " <td>https://huggingface.co/datasets/Dongkkka/ffw_b...</td>\n",
1163
- " <td>0</td>\n",
1164
- " <td>None</td>\n",
1165
- " <td>apache-2.0</td>\n",
1166
- " <td>LeRobot, ffw_bg2_rev4_custom, robotis</td>\n",
1167
- " <td>robotics</td>\n",
1168
- " <td>2025-09-19 06:30:53+00:00</td>\n",
1169
- " <td>None</td>\n",
1170
- " <td>dataset_readmes/Dongkkka__ffw_bg2_rev4_custom_...</td>\n",
1171
- " <td>299</td>\n",
1172
- " <td>rich</td>\n",
1173
- " <td>dongkkka/ffw_bg2_rev4_custom_0919_5</td>\n",
1174
- " </tr>\n",
1175
- " <tr>\n",
1176
- " <th>503188</th>\n",
1177
- " <td>chenxing1234567890/eval_testZ1.2.1</td>\n",
1178
- " <td>https://huggingface.co/datasets/chenxing123456...</td>\n",
1179
- " <td>0</td>\n",
1180
- " <td>None</td>\n",
1181
- " <td>apache-2.0</td>\n",
1182
- " <td>LeRobot, tutorial</td>\n",
1183
- " <td>robotics</td>\n",
1184
- " <td>2025-09-19 06:34:11+00:00</td>\n",
1185
- " <td>None</td>\n",
1186
- " <td>dataset_readmes/chenxing1234567890__eval_testZ...</td>\n",
1187
- " <td>231</td>\n",
1188
- " <td>rich</td>\n",
1189
- " <td>chenxing1234567890/eval_testz1.2.1</td>\n",
1190
- " </tr>\n",
1191
- " <tr>\n",
1192
- " <th>503189</th>\n",
1193
- " <td>Dongkkka/ffw_bg2_rev4_custom_0919_6</td>\n",
1194
- " <td>https://huggingface.co/datasets/Dongkkka/ffw_b...</td>\n",
1195
- " <td>0</td>\n",
1196
- " <td>None</td>\n",
1197
- " <td>apache-2.0</td>\n",
1198
- " <td>LeRobot, ffw_bg2_rev4_custom, robotis</td>\n",
1199
- " <td>robotics</td>\n",
1200
- " <td>2025-09-19 06:34:09+00:00</td>\n",
1201
- " <td>None</td>\n",
1202
- " <td>dataset_readmes/Dongkkka__ffw_bg2_rev4_custom_...</td>\n",
1203
- " <td>299</td>\n",
1204
- " <td>rich</td>\n",
1205
- " <td>dongkkka/ffw_bg2_rev4_custom_0919_6</td>\n",
1206
- " </tr>\n",
1207
- " </tbody>\n",
1208
- "</table>\n",
1209
- "<p>503190 rows × 13 columns</p>\n",
1210
- "</div>"
1211
- ],
1212
- "text/plain": [
1213
- " dataset_id \\\n",
1214
- "0 akjadhav/leandojo-lean4-formal-informal-strings \n",
1215
- "1 aemska/stuhl \n",
1216
- "2 Pogpotatofarmer/memes \n",
1217
- "3 Splend1dchan/NMSQA_sew-d-tiny-100k-ft-ls100h \n",
1218
- "4 chamisfum/brain_tumor_3_classes \n",
1219
- "... ... \n",
1220
- "503185 ROBOTIS/ffw_bg2_rev4_pick_coffee_bottle_env5_14 \n",
1221
- "503186 ROBOTIS/ffw_bg2_rev4_pick_coffee_bottle_env5_15 \n",
1222
- "503187 Dongkkka/ffw_bg2_rev4_custom_0919_5 \n",
1223
- "503188 chenxing1234567890/eval_testZ1.2.1 \n",
1224
- "503189 Dongkkka/ffw_bg2_rev4_custom_0919_6 \n",
1225
- "\n",
1226
- " dataset_url downloads author \\\n",
1227
- "0 https://huggingface.co/datasets/akjadhav/leand... 22 None \n",
1228
- "1 https://huggingface.co/datasets/aemska/stuhl 11 None \n",
1229
- "2 https://huggingface.co/datasets/Pogpotatofarme... 15 None \n",
1230
- "3 https://huggingface.co/datasets/Splend1dchan/N... 11 None \n",
1231
- "4 https://huggingface.co/datasets/chamisfum/brai... 8 None \n",
1232
- "... ... ... ... \n",
1233
- "503185 https://huggingface.co/datasets/ROBOTIS/ffw_bg... 0 None \n",
1234
- "503186 https://huggingface.co/datasets/ROBOTIS/ffw_bg... 0 None \n",
1235
- "503187 https://huggingface.co/datasets/Dongkkka/ffw_b... 0 None \n",
1236
- "503188 https://huggingface.co/datasets/chenxing123456... 0 None \n",
1237
- "503189 https://huggingface.co/datasets/Dongkkka/ffw_b... 0 None \n",
1238
- "\n",
1239
- " license tags task_categories \\\n",
1240
- "0 None None None \n",
1241
- "1 openrail None None \n",
1242
- "2 cc None None \n",
1243
- "3 None None None \n",
1244
- "4 None None None \n",
1245
- "... ... ... ... \n",
1246
- "503185 apache-2.0 LeRobot, ffw_bg2_rev4_custom, robotis robotics \n",
1247
- "503186 apache-2.0 LeRobot, ffw_bg2_rev4_custom, robotis robotics \n",
1248
- "503187 apache-2.0 LeRobot, ffw_bg2_rev4_custom, robotis robotics \n",
1249
- "503188 apache-2.0 LeRobot, tutorial robotics \n",
1250
- "503189 apache-2.0 LeRobot, ffw_bg2_rev4_custom, robotis robotics \n",
1251
- "\n",
1252
- " last_modified reason \\\n",
1253
- "0 2024-01-30 07:40:02+00:00 No metadata and no description \n",
1254
- "1 2022-11-11 14:12:36+00:00 Short description (char count=0, words=0) \n",
1255
- "2 2022-07-15 21:11:34+00:00 Short description (char count=0, words=0) \n",
1256
- "3 None Failed to load card \n",
1257
- "4 None Failed to load card \n",
1258
- "... ... ... \n",
1259
- "503185 2025-09-19 06:28:15+00:00 None \n",
1260
- "503186 2025-09-19 06:29:40+00:00 None \n",
1261
- "503187 2025-09-19 06:30:53+00:00 None \n",
1262
- "503188 2025-09-19 06:34:11+00:00 None \n",
1263
- "503189 2025-09-19 06:34:09+00:00 None \n",
1264
- "\n",
1265
- " readme_path word_count \\\n",
1266
- "0 dataset_readmes/akjadhav__leandojo-lean4-forma... 0 \n",
1267
- "1 dataset_readmes/aemska__stuhl_README.md 0 \n",
1268
- "2 dataset_readmes/Pogpotatofarmer__memes_README.md 0 \n",
1269
- "3 None 0 \n",
1270
- "4 None 0 \n",
1271
- "... ... ... \n",
1272
- "503185 dataset_readmes/ROBOTIS__ffw_bg2_rev4_pick_cof... 299 \n",
1273
- "503186 dataset_readmes/ROBOTIS__ffw_bg2_rev4_pick_cof... 299 \n",
1274
- "503187 dataset_readmes/Dongkkka__ffw_bg2_rev4_custom_... 299 \n",
1275
- "503188 dataset_readmes/chenxing1234567890__eval_testZ... 231 \n",
1276
- "503189 dataset_readmes/Dongkkka__ffw_bg2_rev4_custom_... 299 \n",
1277
- "\n",
1278
- " category _dataset_id_lower \n",
1279
- "0 minimal akjadhav/leandojo-lean4-formal-informal-strings \n",
1280
- "1 minimal aemska/stuhl \n",
1281
- "2 minimal pogpotatofarmer/memes \n",
1282
- "3 minimal splend1dchan/nmsqa_sew-d-tiny-100k-ft-ls100h \n",
1283
- "4 minimal chamisfum/brain_tumor_3_classes \n",
1284
- "... ... ... \n",
1285
- "503185 rich robotis/ffw_bg2_rev4_pick_coffee_bottle_env5_14 \n",
1286
- "503186 rich robotis/ffw_bg2_rev4_pick_coffee_bottle_env5_15 \n",
1287
- "503187 rich dongkkka/ffw_bg2_rev4_custom_0919_5 \n",
1288
- "503188 rich chenxing1234567890/eval_testz1.2.1 \n",
1289
- "503189 rich dongkkka/ffw_bg2_rev4_custom_0919_6 \n",
1290
- "\n",
1291
- "[503190 rows x 13 columns]"
1292
- ]
1293
- },
1294
- "execution_count": 26,
1295
- "metadata": {},
1296
- "output_type": "execute_result"
1297
- }
1298
- ],
1299
- "source": [
1300
- "merged_df"
1301
- ]
1302
- },
1303
- {
1304
- "cell_type": "code",
1305
- "execution_count": 27,
1306
- "id": "2bc30fa7",
1307
- "metadata": {},
1308
- "outputs": [
1309
- {
1310
- "name": "stdout",
1311
- "output_type": "stream",
1312
- "text": [
1313
- "(503190, 14)\n",
1314
- " dataset_id \\\n",
1315
- "0 akjadhav/leandojo-lean4-formal-informal-strings \n",
1316
- "1 aemska/stuhl \n",
1317
- "2 Pogpotatofarmer/memes \n",
1318
- "3 Splend1dchan/NMSQA_sew-d-tiny-100k-ft-ls100h \n",
1319
- "4 chamisfum/brain_tumor_3_classes \n",
1320
- "\n",
1321
- " dataset_url downloads author \\\n",
1322
- "0 https://huggingface.co/datasets/akjadhav/leand... 22 None \n",
1323
- "1 https://huggingface.co/datasets/aemska/stuhl 11 None \n",
1324
- "2 https://huggingface.co/datasets/Pogpotatofarme... 15 None \n",
1325
- "3 https://huggingface.co/datasets/Splend1dchan/N... 11 None \n",
1326
- "4 https://huggingface.co/datasets/chamisfum/brai... 8 None \n",
1327
- "\n",
1328
- " license tags task_categories last_modified \\\n",
1329
- "0 None None None 2024-01-30 07:40:02+00:00 \n",
1330
- "1 openrail None None 2022-11-11 14:12:36+00:00 \n",
1331
- "2 cc None None 2022-07-15 21:11:34+00:00 \n",
1332
- "3 None None None None \n",
1333
- "4 None None None None \n",
1334
- "\n",
1335
- " reason \\\n",
1336
- "0 No metadata and no description \n",
1337
- "1 Short description (char count=0, words=0) \n",
1338
- "2 Short description (char count=0, words=0) \n",
1339
- "3 Failed to load card \n",
1340
- "4 Failed to load card \n",
1341
- "\n",
1342
- " readme_path word_count category \\\n",
1343
- "0 dataset_readmes/akjadhav__leandojo-lean4-forma... 0 minimal \n",
1344
- "1 dataset_readmes/aemska__stuhl_README.md 0 minimal \n",
1345
- "2 dataset_readmes/Pogpotatofarmer__memes_README.md 0 minimal \n",
1346
- "3 None 0 minimal \n",
1347
- "4 None 0 minimal \n",
1348
- "\n",
1349
- " field keyword \n",
1350
- "0 NaN NaN \n",
1351
- "1 NaN NaN \n",
1352
- "2 NaN NaN \n",
1353
- "3 NaN NaN \n",
1354
- "4 life_sciences brain \n"
1355
- ]
1356
- }
1357
- ],
1358
- "source": [
1359
- "# Merge on lowercase columns to bring 'field' and 'keyword' from csv_df\n",
1360
- "merged_df = merged_df.merge(\n",
1361
- " csv_df[[\"_id_lower\", \"field\", \"keyword\"]],\n",
1362
- " left_on=\"_dataset_id_lower\",\n",
1363
- " right_on=\"_id_lower\",\n",
1364
- " how=\"left\"\n",
1365
- ")\n",
1366
- "\n",
1367
- "# Drop the helper columns\n",
1368
- "merged_df = merged_df.drop(columns=[\"_dataset_id_lower\", \"_id_lower\"])\n",
1369
- "\n",
1370
- "# Quick check\n",
1371
- "print(merged_df.shape)\n",
1372
- "print(merged_df.head())\n"
1373
- ]
1374
- },
1375
- {
1376
- "cell_type": "code",
1377
- "execution_count": 28,
1378
- "id": "4b104aef",
1379
- "metadata": {},
1380
- "outputs": [
1381
- {
1382
- "data": {
1383
- "text/html": [
1384
- "<div>\n",
1385
- "<style scoped>\n",
1386
- " .dataframe tbody tr th:only-of-type {\n",
1387
- " vertical-align: middle;\n",
1388
- " }\n",
1389
- "\n",
1390
- " .dataframe tbody tr th {\n",
1391
- " vertical-align: top;\n",
1392
- " }\n",
1393
- "\n",
1394
- " .dataframe thead th {\n",
1395
- " text-align: right;\n",
1396
- " }\n",
1397
- "</style>\n",
1398
- "<table border=\"1\" class=\"dataframe\">\n",
1399
- " <thead>\n",
1400
- " <tr style=\"text-align: right;\">\n",
1401
- " <th></th>\n",
1402
- " <th>dataset_id</th>\n",
1403
- " <th>dataset_url</th>\n",
1404
- " <th>downloads</th>\n",
1405
- " <th>author</th>\n",
1406
- " <th>license</th>\n",
1407
- " <th>tags</th>\n",
1408
- " <th>task_categories</th>\n",
1409
- " <th>last_modified</th>\n",
1410
- " <th>reason</th>\n",
1411
- " <th>readme_path</th>\n",
1412
- " <th>word_count</th>\n",
1413
- " <th>category</th>\n",
1414
- " <th>field</th>\n",
1415
- " <th>keyword</th>\n",
1416
- " </tr>\n",
1417
- " </thead>\n",
1418
- " <tbody>\n",
1419
- " <tr>\n",
1420
- " <th>0</th>\n",
1421
- " <td>akjadhav/leandojo-lean4-formal-informal-strings</td>\n",
1422
- " <td>https://huggingface.co/datasets/akjadhav/leand...</td>\n",
1423
- " <td>22</td>\n",
1424
- " <td>None</td>\n",
1425
- " <td>None</td>\n",
1426
- " <td>None</td>\n",
1427
- " <td>None</td>\n",
1428
- " <td>2024-01-30 07:40:02+00:00</td>\n",
1429
- " <td>No metadata and no description</td>\n",
1430
- " <td>dataset_readmes/akjadhav__leandojo-lean4-forma...</td>\n",
1431
- " <td>0</td>\n",
1432
- " <td>minimal</td>\n",
1433
- " <td>NaN</td>\n",
1434
- " <td>NaN</td>\n",
1435
- " </tr>\n",
1436
- " <tr>\n",
1437
- " <th>1</th>\n",
1438
- " <td>aemska/stuhl</td>\n",
1439
- " <td>https://huggingface.co/datasets/aemska/stuhl</td>\n",
1440
- " <td>11</td>\n",
1441
- " <td>None</td>\n",
1442
- " <td>openrail</td>\n",
1443
- " <td>None</td>\n",
1444
- " <td>None</td>\n",
1445
- " <td>2022-11-11 14:12:36+00:00</td>\n",
1446
- " <td>Short description (char count=0, words=0)</td>\n",
1447
- " <td>dataset_readmes/aemska__stuhl_README.md</td>\n",
1448
- " <td>0</td>\n",
1449
- " <td>minimal</td>\n",
1450
- " <td>NaN</td>\n",
1451
- " <td>NaN</td>\n",
1452
- " </tr>\n",
1453
- " <tr>\n",
1454
- " <th>2</th>\n",
1455
- " <td>Pogpotatofarmer/memes</td>\n",
1456
- " <td>https://huggingface.co/datasets/Pogpotatofarme...</td>\n",
1457
- " <td>15</td>\n",
1458
- " <td>None</td>\n",
1459
- " <td>cc</td>\n",
1460
- " <td>None</td>\n",
1461
- " <td>None</td>\n",
1462
- " <td>2022-07-15 21:11:34+00:00</td>\n",
1463
- " <td>Short description (char count=0, words=0)</td>\n",
1464
- " <td>dataset_readmes/Pogpotatofarmer__memes_README.md</td>\n",
1465
- " <td>0</td>\n",
1466
- " <td>minimal</td>\n",
1467
- " <td>NaN</td>\n",
1468
- " <td>NaN</td>\n",
1469
- " </tr>\n",
1470
- " <tr>\n",
1471
- " <th>3</th>\n",
1472
- " <td>Splend1dchan/NMSQA_sew-d-tiny-100k-ft-ls100h</td>\n",
1473
- " <td>https://huggingface.co/datasets/Splend1dchan/N...</td>\n",
1474
- " <td>11</td>\n",
1475
- " <td>None</td>\n",
1476
- " <td>None</td>\n",
1477
- " <td>None</td>\n",
1478
- " <td>None</td>\n",
1479
- " <td>None</td>\n",
1480
- " <td>Failed to load card</td>\n",
1481
- " <td>None</td>\n",
1482
- " <td>0</td>\n",
1483
- " <td>minimal</td>\n",
1484
- " <td>NaN</td>\n",
1485
- " <td>NaN</td>\n",
1486
- " </tr>\n",
1487
- " <tr>\n",
1488
- " <th>4</th>\n",
1489
- " <td>chamisfum/brain_tumor_3_classes</td>\n",
1490
- " <td>https://huggingface.co/datasets/chamisfum/brai...</td>\n",
1491
- " <td>8</td>\n",
1492
- " <td>None</td>\n",
1493
- " <td>None</td>\n",
1494
- " <td>None</td>\n",
1495
- " <td>None</td>\n",
1496
- " <td>None</td>\n",
1497
- " <td>Failed to load card</td>\n",
1498
- " <td>None</td>\n",
1499
- " <td>0</td>\n",
1500
- " <td>minimal</td>\n",
1501
- " <td>life_sciences</td>\n",
1502
- " <td>brain</td>\n",
1503
- " </tr>\n",
1504
- " <tr>\n",
1505
- " <th>...</th>\n",
1506
- " <td>...</td>\n",
1507
- " <td>...</td>\n",
1508
- " <td>...</td>\n",
1509
- " <td>...</td>\n",
1510
- " <td>...</td>\n",
1511
- " <td>...</td>\n",
1512
- " <td>...</td>\n",
1513
- " <td>...</td>\n",
1514
- " <td>...</td>\n",
1515
- " <td>...</td>\n",
1516
- " <td>...</td>\n",
1517
- " <td>...</td>\n",
1518
- " <td>...</td>\n",
1519
- " <td>...</td>\n",
1520
- " </tr>\n",
1521
- " <tr>\n",
1522
- " <th>503185</th>\n",
1523
- " <td>ROBOTIS/ffw_bg2_rev4_pick_coffee_bottle_env5_14</td>\n",
1524
- " <td>https://huggingface.co/datasets/ROBOTIS/ffw_bg...</td>\n",
1525
- " <td>0</td>\n",
1526
- " <td>None</td>\n",
1527
- " <td>apache-2.0</td>\n",
1528
- " <td>LeRobot, ffw_bg2_rev4_custom, robotis</td>\n",
1529
- " <td>robotics</td>\n",
1530
- " <td>2025-09-19 06:28:15+00:00</td>\n",
1531
- " <td>None</td>\n",
1532
- " <td>dataset_readmes/ROBOTIS__ffw_bg2_rev4_pick_cof...</td>\n",
1533
- " <td>299</td>\n",
1534
- " <td>rich</td>\n",
1535
- " <td>NaN</td>\n",
1536
- " <td>NaN</td>\n",
1537
- " </tr>\n",
1538
- " <tr>\n",
1539
- " <th>503186</th>\n",
1540
- " <td>ROBOTIS/ffw_bg2_rev4_pick_coffee_bottle_env5_15</td>\n",
1541
- " <td>https://huggingface.co/datasets/ROBOTIS/ffw_bg...</td>\n",
1542
- " <td>0</td>\n",
1543
- " <td>None</td>\n",
1544
- " <td>apache-2.0</td>\n",
1545
- " <td>LeRobot, ffw_bg2_rev4_custom, robotis</td>\n",
1546
- " <td>robotics</td>\n",
1547
- " <td>2025-09-19 06:29:40+00:00</td>\n",
1548
- " <td>None</td>\n",
1549
- " <td>dataset_readmes/ROBOTIS__ffw_bg2_rev4_pick_cof...</td>\n",
1550
- " <td>299</td>\n",
1551
- " <td>rich</td>\n",
1552
- " <td>NaN</td>\n",
1553
- " <td>NaN</td>\n",
1554
- " </tr>\n",
1555
- " <tr>\n",
1556
- " <th>503187</th>\n",
1557
- " <td>Dongkkka/ffw_bg2_rev4_custom_0919_5</td>\n",
1558
- " <td>https://huggingface.co/datasets/Dongkkka/ffw_b...</td>\n",
1559
- " <td>0</td>\n",
1560
- " <td>None</td>\n",
1561
- " <td>apache-2.0</td>\n",
1562
- " <td>LeRobot, ffw_bg2_rev4_custom, robotis</td>\n",
1563
- " <td>robotics</td>\n",
1564
- " <td>2025-09-19 06:30:53+00:00</td>\n",
1565
- " <td>None</td>\n",
1566
- " <td>dataset_readmes/Dongkkka__ffw_bg2_rev4_custom_...</td>\n",
1567
- " <td>299</td>\n",
1568
- " <td>rich</td>\n",
1569
- " <td>NaN</td>\n",
1570
- " <td>NaN</td>\n",
1571
- " </tr>\n",
1572
- " <tr>\n",
1573
- " <th>503188</th>\n",
1574
- " <td>chenxing1234567890/eval_testZ1.2.1</td>\n",
1575
- " <td>https://huggingface.co/datasets/chenxing123456...</td>\n",
1576
- " <td>0</td>\n",
1577
- " <td>None</td>\n",
1578
- " <td>apache-2.0</td>\n",
1579
- " <td>LeRobot, tutorial</td>\n",
1580
- " <td>robotics</td>\n",
1581
- " <td>2025-09-19 06:34:11+00:00</td>\n",
1582
- " <td>None</td>\n",
1583
- " <td>dataset_readmes/chenxing1234567890__eval_testZ...</td>\n",
1584
- " <td>231</td>\n",
1585
- " <td>rich</td>\n",
1586
- " <td>NaN</td>\n",
1587
- " <td>NaN</td>\n",
1588
- " </tr>\n",
1589
- " <tr>\n",
1590
- " <th>503189</th>\n",
1591
- " <td>Dongkkka/ffw_bg2_rev4_custom_0919_6</td>\n",
1592
- " <td>https://huggingface.co/datasets/Dongkkka/ffw_b...</td>\n",
1593
- " <td>0</td>\n",
1594
- " <td>None</td>\n",
1595
- " <td>apache-2.0</td>\n",
1596
- " <td>LeRobot, ffw_bg2_rev4_custom, robotis</td>\n",
1597
- " <td>robotics</td>\n",
1598
- " <td>2025-09-19 06:34:09+00:00</td>\n",
1599
- " <td>None</td>\n",
1600
- " <td>dataset_readmes/Dongkkka__ffw_bg2_rev4_custom_...</td>\n",
1601
- " <td>299</td>\n",
1602
- " <td>rich</td>\n",
1603
- " <td>NaN</td>\n",
1604
- " <td>NaN</td>\n",
1605
- " </tr>\n",
1606
- " </tbody>\n",
1607
- "</table>\n",
1608
- "<p>503190 rows × 14 columns</p>\n",
1609
- "</div>"
1610
- ],
1611
- "text/plain": [
1612
- " dataset_id \\\n",
1613
- "0 akjadhav/leandojo-lean4-formal-informal-strings \n",
1614
- "1 aemska/stuhl \n",
1615
- "2 Pogpotatofarmer/memes \n",
1616
- "3 Splend1dchan/NMSQA_sew-d-tiny-100k-ft-ls100h \n",
1617
- "4 chamisfum/brain_tumor_3_classes \n",
1618
- "... ... \n",
1619
- "503185 ROBOTIS/ffw_bg2_rev4_pick_coffee_bottle_env5_14 \n",
1620
- "503186 ROBOTIS/ffw_bg2_rev4_pick_coffee_bottle_env5_15 \n",
1621
- "503187 Dongkkka/ffw_bg2_rev4_custom_0919_5 \n",
1622
- "503188 chenxing1234567890/eval_testZ1.2.1 \n",
1623
- "503189 Dongkkka/ffw_bg2_rev4_custom_0919_6 \n",
1624
- "\n",
1625
- " dataset_url downloads author \\\n",
1626
- "0 https://huggingface.co/datasets/akjadhav/leand... 22 None \n",
1627
- "1 https://huggingface.co/datasets/aemska/stuhl 11 None \n",
1628
- "2 https://huggingface.co/datasets/Pogpotatofarme... 15 None \n",
1629
- "3 https://huggingface.co/datasets/Splend1dchan/N... 11 None \n",
1630
- "4 https://huggingface.co/datasets/chamisfum/brai... 8 None \n",
1631
- "... ... ... ... \n",
1632
- "503185 https://huggingface.co/datasets/ROBOTIS/ffw_bg... 0 None \n",
1633
- "503186 https://huggingface.co/datasets/ROBOTIS/ffw_bg... 0 None \n",
1634
- "503187 https://huggingface.co/datasets/Dongkkka/ffw_b... 0 None \n",
1635
- "503188 https://huggingface.co/datasets/chenxing123456... 0 None \n",
1636
- "503189 https://huggingface.co/datasets/Dongkkka/ffw_b... 0 None \n",
1637
- "\n",
1638
- " license tags task_categories \\\n",
1639
- "0 None None None \n",
1640
- "1 openrail None None \n",
1641
- "2 cc None None \n",
1642
- "3 None None None \n",
1643
- "4 None None None \n",
1644
- "... ... ... ... \n",
1645
- "503185 apache-2.0 LeRobot, ffw_bg2_rev4_custom, robotis robotics \n",
1646
- "503186 apache-2.0 LeRobot, ffw_bg2_rev4_custom, robotis robotics \n",
1647
- "503187 apache-2.0 LeRobot, ffw_bg2_rev4_custom, robotis robotics \n",
1648
- "503188 apache-2.0 LeRobot, tutorial robotics \n",
1649
- "503189 apache-2.0 LeRobot, ffw_bg2_rev4_custom, robotis robotics \n",
1650
- "\n",
1651
- " last_modified reason \\\n",
1652
- "0 2024-01-30 07:40:02+00:00 No metadata and no description \n",
1653
- "1 2022-11-11 14:12:36+00:00 Short description (char count=0, words=0) \n",
1654
- "2 2022-07-15 21:11:34+00:00 Short description (char count=0, words=0) \n",
1655
- "3 None Failed to load card \n",
1656
- "4 None Failed to load card \n",
1657
- "... ... ... \n",
1658
- "503185 2025-09-19 06:28:15+00:00 None \n",
1659
- "503186 2025-09-19 06:29:40+00:00 None \n",
1660
- "503187 2025-09-19 06:30:53+00:00 None \n",
1661
- "503188 2025-09-19 06:34:11+00:00 None \n",
1662
- "503189 2025-09-19 06:34:09+00:00 None \n",
1663
- "\n",
1664
- " readme_path word_count \\\n",
1665
- "0 dataset_readmes/akjadhav__leandojo-lean4-forma... 0 \n",
1666
- "1 dataset_readmes/aemska__stuhl_README.md 0 \n",
1667
- "2 dataset_readmes/Pogpotatofarmer__memes_README.md 0 \n",
1668
- "3 None 0 \n",
1669
- "4 None 0 \n",
1670
- "... ... ... \n",
1671
- "503185 dataset_readmes/ROBOTIS__ffw_bg2_rev4_pick_cof... 299 \n",
1672
- "503186 dataset_readmes/ROBOTIS__ffw_bg2_rev4_pick_cof... 299 \n",
1673
- "503187 dataset_readmes/Dongkkka__ffw_bg2_rev4_custom_... 299 \n",
1674
- "503188 dataset_readmes/chenxing1234567890__eval_testZ... 231 \n",
1675
- "503189 dataset_readmes/Dongkkka__ffw_bg2_rev4_custom_... 299 \n",
1676
- "\n",
1677
- " category field keyword \n",
1678
- "0 minimal NaN NaN \n",
1679
- "1 minimal NaN NaN \n",
1680
- "2 minimal NaN NaN \n",
1681
- "3 minimal NaN NaN \n",
1682
- "4 minimal life_sciences brain \n",
1683
- "... ... ... ... \n",
1684
- "503185 rich NaN NaN \n",
1685
- "503186 rich NaN NaN \n",
1686
- "503187 rich NaN NaN \n",
1687
- "503188 rich NaN NaN \n",
1688
- "503189 rich NaN NaN \n",
1689
- "\n",
1690
- "[503190 rows x 14 columns]"
1691
- ]
1692
- },
1693
- "execution_count": 28,
1694
- "metadata": {},
1695
- "output_type": "execute_result"
1696
- }
1697
- ],
1698
- "source": [
1699
- "merged_df"
1700
- ]
1701
- },
1702
- {
1703
- "cell_type": "code",
1704
- "execution_count": 30,
1705
- "id": "69ec9289",
1706
- "metadata": {},
1707
- "outputs": [
1708
- {
1709
- "name": "stdout",
1710
- "output_type": "stream",
1711
- "text": [
1712
- "Number of rows with a value in 'science' column: 4040\n"
1713
- ]
1714
- }
1715
- ],
1716
- "source": [
1717
- "import numpy as np\n",
1718
- "\n",
1719
- "# Replace all None with np.nan\n",
1720
- "merged_df = merged_df.replace({None: np.nan})\n",
1721
- "\n",
1722
- "# Count rows where 'field' column has a value (not NaN)\n",
1723
- "science_count = merged_df[\"field\"].notna().sum()\n",
1724
- "\n",
1725
- "print(f\"Number of rows with a value in 'science' column: {science_count}\")\n"
1726
- ]
1727
- },
1728
- {
1729
- "cell_type": "code",
1730
- "execution_count": 31,
1731
- "id": "b0d58ceb",
1732
- "metadata": {},
1733
- "outputs": [
1734
- {
1735
- "data": {
1736
- "text/html": [
1737
- "<div>\n",
1738
- "<style scoped>\n",
1739
- " .dataframe tbody tr th:only-of-type {\n",
1740
- " vertical-align: middle;\n",
1741
- " }\n",
1742
- "\n",
1743
- " .dataframe tbody tr th {\n",
1744
- " vertical-align: top;\n",
1745
- " }\n",
1746
- "\n",
1747
- " .dataframe thead th {\n",
1748
- " text-align: right;\n",
1749
- " }\n",
1750
- "</style>\n",
1751
- "<table border=\"1\" class=\"dataframe\">\n",
1752
- " <thead>\n",
1753
- " <tr style=\"text-align: right;\">\n",
1754
- " <th></th>\n",
1755
- " <th>dataset_id</th>\n",
1756
- " <th>dataset_url</th>\n",
1757
- " <th>downloads</th>\n",
1758
- " <th>author</th>\n",
1759
- " <th>license</th>\n",
1760
- " <th>tags</th>\n",
1761
- " <th>task_categories</th>\n",
1762
- " <th>last_modified</th>\n",
1763
- " <th>reason</th>\n",
1764
- " <th>readme_path</th>\n",
1765
- " <th>word_count</th>\n",
1766
- " <th>category</th>\n",
1767
- " <th>field</th>\n",
1768
- " <th>keyword</th>\n",
1769
- " </tr>\n",
1770
- " </thead>\n",
1771
- " <tbody>\n",
1772
- " <tr>\n",
1773
- " <th>0</th>\n",
1774
- " <td>akjadhav/leandojo-lean4-formal-informal-strings</td>\n",
1775
- " <td>https://huggingface.co/datasets/akjadhav/leand...</td>\n",
1776
- " <td>22</td>\n",
1777
- " <td>NaN</td>\n",
1778
- " <td>NaN</td>\n",
1779
- " <td>NaN</td>\n",
1780
- " <td>NaN</td>\n",
1781
- " <td>2024-01-30 07:40:02+00:00</td>\n",
1782
- " <td>No metadata and no description</td>\n",
1783
- " <td>dataset_readmes/akjadhav__leandojo-lean4-forma...</td>\n",
1784
- " <td>0</td>\n",
1785
- " <td>minimal</td>\n",
1786
- " <td>NaN</td>\n",
1787
- " <td>NaN</td>\n",
1788
- " </tr>\n",
1789
- " <tr>\n",
1790
- " <th>1</th>\n",
1791
- " <td>aemska/stuhl</td>\n",
1792
- " <td>https://huggingface.co/datasets/aemska/stuhl</td>\n",
1793
- " <td>11</td>\n",
1794
- " <td>NaN</td>\n",
1795
- " <td>openrail</td>\n",
1796
- " <td>NaN</td>\n",
1797
- " <td>NaN</td>\n",
1798
- " <td>2022-11-11 14:12:36+00:00</td>\n",
1799
- " <td>Short description (char count=0, words=0)</td>\n",
1800
- " <td>dataset_readmes/aemska__stuhl_README.md</td>\n",
1801
- " <td>0</td>\n",
1802
- " <td>minimal</td>\n",
1803
- " <td>NaN</td>\n",
1804
- " <td>NaN</td>\n",
1805
- " </tr>\n",
1806
- " <tr>\n",
1807
- " <th>2</th>\n",
1808
- " <td>Pogpotatofarmer/memes</td>\n",
1809
- " <td>https://huggingface.co/datasets/Pogpotatofarme...</td>\n",
1810
- " <td>15</td>\n",
1811
- " <td>NaN</td>\n",
1812
- " <td>cc</td>\n",
1813
- " <td>NaN</td>\n",
1814
- " <td>NaN</td>\n",
1815
- " <td>2022-07-15 21:11:34+00:00</td>\n",
1816
- " <td>Short description (char count=0, words=0)</td>\n",
1817
- " <td>dataset_readmes/Pogpotatofarmer__memes_README.md</td>\n",
1818
- " <td>0</td>\n",
1819
- " <td>minimal</td>\n",
1820
- " <td>NaN</td>\n",
1821
- " <td>NaN</td>\n",
1822
- " </tr>\n",
1823
- " <tr>\n",
1824
- " <th>3</th>\n",
1825
- " <td>Splend1dchan/NMSQA_sew-d-tiny-100k-ft-ls100h</td>\n",
1826
- " <td>https://huggingface.co/datasets/Splend1dchan/N...</td>\n",
1827
- " <td>11</td>\n",
1828
- " <td>NaN</td>\n",
1829
- " <td>NaN</td>\n",
1830
- " <td>NaN</td>\n",
1831
- " <td>NaN</td>\n",
1832
- " <td>NaN</td>\n",
1833
- " <td>Failed to load card</td>\n",
1834
- " <td>NaN</td>\n",
1835
- " <td>0</td>\n",
1836
- " <td>minimal</td>\n",
1837
- " <td>NaN</td>\n",
1838
- " <td>NaN</td>\n",
1839
- " </tr>\n",
1840
- " <tr>\n",
1841
- " <th>4</th>\n",
1842
- " <td>chamisfum/brain_tumor_3_classes</td>\n",
1843
- " <td>https://huggingface.co/datasets/chamisfum/brai...</td>\n",
1844
- " <td>8</td>\n",
1845
- " <td>NaN</td>\n",
1846
- " <td>NaN</td>\n",
1847
- " <td>NaN</td>\n",
1848
- " <td>NaN</td>\n",
1849
- " <td>NaN</td>\n",
1850
- " <td>Failed to load card</td>\n",
1851
- " <td>NaN</td>\n",
1852
- " <td>0</td>\n",
1853
- " <td>minimal</td>\n",
1854
- " <td>life_sciences</td>\n",
1855
- " <td>brain</td>\n",
1856
- " </tr>\n",
1857
- " <tr>\n",
1858
- " <th>...</th>\n",
1859
- " <td>...</td>\n",
1860
- " <td>...</td>\n",
1861
- " <td>...</td>\n",
1862
- " <td>...</td>\n",
1863
- " <td>...</td>\n",
1864
- " <td>...</td>\n",
1865
- " <td>...</td>\n",
1866
- " <td>...</td>\n",
1867
- " <td>...</td>\n",
1868
- " <td>...</td>\n",
1869
- " <td>...</td>\n",
1870
- " <td>...</td>\n",
1871
- " <td>...</td>\n",
1872
- " <td>...</td>\n",
1873
- " </tr>\n",
1874
- " <tr>\n",
1875
- " <th>503185</th>\n",
1876
- " <td>ROBOTIS/ffw_bg2_rev4_pick_coffee_bottle_env5_14</td>\n",
1877
- " <td>https://huggingface.co/datasets/ROBOTIS/ffw_bg...</td>\n",
1878
- " <td>0</td>\n",
1879
- " <td>NaN</td>\n",
1880
- " <td>apache-2.0</td>\n",
1881
- " <td>LeRobot, ffw_bg2_rev4_custom, robotis</td>\n",
1882
- " <td>robotics</td>\n",
1883
- " <td>2025-09-19 06:28:15+00:00</td>\n",
1884
- " <td>NaN</td>\n",
1885
- " <td>dataset_readmes/ROBOTIS__ffw_bg2_rev4_pick_cof...</td>\n",
1886
- " <td>299</td>\n",
1887
- " <td>rich</td>\n",
1888
- " <td>NaN</td>\n",
1889
- " <td>NaN</td>\n",
1890
- " </tr>\n",
1891
- " <tr>\n",
1892
- " <th>503186</th>\n",
1893
- " <td>ROBOTIS/ffw_bg2_rev4_pick_coffee_bottle_env5_15</td>\n",
1894
- " <td>https://huggingface.co/datasets/ROBOTIS/ffw_bg...</td>\n",
1895
- " <td>0</td>\n",
1896
- " <td>NaN</td>\n",
1897
- " <td>apache-2.0</td>\n",
1898
- " <td>LeRobot, ffw_bg2_rev4_custom, robotis</td>\n",
1899
- " <td>robotics</td>\n",
1900
- " <td>2025-09-19 06:29:40+00:00</td>\n",
1901
- " <td>NaN</td>\n",
1902
- " <td>dataset_readmes/ROBOTIS__ffw_bg2_rev4_pick_cof...</td>\n",
1903
- " <td>299</td>\n",
1904
- " <td>rich</td>\n",
1905
- " <td>NaN</td>\n",
1906
- " <td>NaN</td>\n",
1907
- " </tr>\n",
1908
- " <tr>\n",
1909
- " <th>503187</th>\n",
1910
- " <td>Dongkkka/ffw_bg2_rev4_custom_0919_5</td>\n",
1911
- " <td>https://huggingface.co/datasets/Dongkkka/ffw_b...</td>\n",
1912
- " <td>0</td>\n",
1913
- " <td>NaN</td>\n",
1914
- " <td>apache-2.0</td>\n",
1915
- " <td>LeRobot, ffw_bg2_rev4_custom, robotis</td>\n",
1916
- " <td>robotics</td>\n",
1917
- " <td>2025-09-19 06:30:53+00:00</td>\n",
1918
- " <td>NaN</td>\n",
1919
- " <td>dataset_readmes/Dongkkka__ffw_bg2_rev4_custom_...</td>\n",
1920
- " <td>299</td>\n",
1921
- " <td>rich</td>\n",
1922
- " <td>NaN</td>\n",
1923
- " <td>NaN</td>\n",
1924
- " </tr>\n",
1925
- " <tr>\n",
1926
- " <th>503188</th>\n",
1927
- " <td>chenxing1234567890/eval_testZ1.2.1</td>\n",
1928
- " <td>https://huggingface.co/datasets/chenxing123456...</td>\n",
1929
- " <td>0</td>\n",
1930
- " <td>NaN</td>\n",
1931
- " <td>apache-2.0</td>\n",
1932
- " <td>LeRobot, tutorial</td>\n",
1933
- " <td>robotics</td>\n",
1934
- " <td>2025-09-19 06:34:11+00:00</td>\n",
1935
- " <td>NaN</td>\n",
1936
- " <td>dataset_readmes/chenxing1234567890__eval_testZ...</td>\n",
1937
- " <td>231</td>\n",
1938
- " <td>rich</td>\n",
1939
- " <td>NaN</td>\n",
1940
- " <td>NaN</td>\n",
1941
- " </tr>\n",
1942
- " <tr>\n",
1943
- " <th>503189</th>\n",
1944
- " <td>Dongkkka/ffw_bg2_rev4_custom_0919_6</td>\n",
1945
- " <td>https://huggingface.co/datasets/Dongkkka/ffw_b...</td>\n",
1946
- " <td>0</td>\n",
1947
- " <td>NaN</td>\n",
1948
- " <td>apache-2.0</td>\n",
1949
- " <td>LeRobot, ffw_bg2_rev4_custom, robotis</td>\n",
1950
- " <td>robotics</td>\n",
1951
- " <td>2025-09-19 06:34:09+00:00</td>\n",
1952
- " <td>NaN</td>\n",
1953
- " <td>dataset_readmes/Dongkkka__ffw_bg2_rev4_custom_...</td>\n",
1954
- " <td>299</td>\n",
1955
- " <td>rich</td>\n",
1956
- " <td>NaN</td>\n",
1957
- " <td>NaN</td>\n",
1958
- " </tr>\n",
1959
- " </tbody>\n",
1960
- "</table>\n",
1961
- "<p>503190 rows × 14 columns</p>\n",
1962
- "</div>"
1963
- ],
1964
- "text/plain": [
1965
- " dataset_id \\\n",
1966
- "0 akjadhav/leandojo-lean4-formal-informal-strings \n",
1967
- "1 aemska/stuhl \n",
1968
- "2 Pogpotatofarmer/memes \n",
1969
- "3 Splend1dchan/NMSQA_sew-d-tiny-100k-ft-ls100h \n",
1970
- "4 chamisfum/brain_tumor_3_classes \n",
1971
- "... ... \n",
1972
- "503185 ROBOTIS/ffw_bg2_rev4_pick_coffee_bottle_env5_14 \n",
1973
- "503186 ROBOTIS/ffw_bg2_rev4_pick_coffee_bottle_env5_15 \n",
1974
- "503187 Dongkkka/ffw_bg2_rev4_custom_0919_5 \n",
1975
- "503188 chenxing1234567890/eval_testZ1.2.1 \n",
1976
- "503189 Dongkkka/ffw_bg2_rev4_custom_0919_6 \n",
1977
- "\n",
1978
- " dataset_url downloads author \\\n",
1979
- "0 https://huggingface.co/datasets/akjadhav/leand... 22 NaN \n",
1980
- "1 https://huggingface.co/datasets/aemska/stuhl 11 NaN \n",
1981
- "2 https://huggingface.co/datasets/Pogpotatofarme... 15 NaN \n",
1982
- "3 https://huggingface.co/datasets/Splend1dchan/N... 11 NaN \n",
1983
- "4 https://huggingface.co/datasets/chamisfum/brai... 8 NaN \n",
1984
- "... ... ... ... \n",
1985
- "503185 https://huggingface.co/datasets/ROBOTIS/ffw_bg... 0 NaN \n",
1986
- "503186 https://huggingface.co/datasets/ROBOTIS/ffw_bg... 0 NaN \n",
1987
- "503187 https://huggingface.co/datasets/Dongkkka/ffw_b... 0 NaN \n",
1988
- "503188 https://huggingface.co/datasets/chenxing123456... 0 NaN \n",
1989
- "503189 https://huggingface.co/datasets/Dongkkka/ffw_b... 0 NaN \n",
1990
- "\n",
1991
- " license tags task_categories \\\n",
1992
- "0 NaN NaN NaN \n",
1993
- "1 openrail NaN NaN \n",
1994
- "2 cc NaN NaN \n",
1995
- "3 NaN NaN NaN \n",
1996
- "4 NaN NaN NaN \n",
1997
- "... ... ... ... \n",
1998
- "503185 apache-2.0 LeRobot, ffw_bg2_rev4_custom, robotis robotics \n",
1999
- "503186 apache-2.0 LeRobot, ffw_bg2_rev4_custom, robotis robotics \n",
2000
- "503187 apache-2.0 LeRobot, ffw_bg2_rev4_custom, robotis robotics \n",
2001
- "503188 apache-2.0 LeRobot, tutorial robotics \n",
2002
- "503189 apache-2.0 LeRobot, ffw_bg2_rev4_custom, robotis robotics \n",
2003
- "\n",
2004
- " last_modified reason \\\n",
2005
- "0 2024-01-30 07:40:02+00:00 No metadata and no description \n",
2006
- "1 2022-11-11 14:12:36+00:00 Short description (char count=0, words=0) \n",
2007
- "2 2022-07-15 21:11:34+00:00 Short description (char count=0, words=0) \n",
2008
- "3 NaN Failed to load card \n",
2009
- "4 NaN Failed to load card \n",
2010
- "... ... ... \n",
2011
- "503185 2025-09-19 06:28:15+00:00 NaN \n",
2012
- "503186 2025-09-19 06:29:40+00:00 NaN \n",
2013
- "503187 2025-09-19 06:30:53+00:00 NaN \n",
2014
- "503188 2025-09-19 06:34:11+00:00 NaN \n",
2015
- "503189 2025-09-19 06:34:09+00:00 NaN \n",
2016
- "\n",
2017
- " readme_path word_count \\\n",
2018
- "0 dataset_readmes/akjadhav__leandojo-lean4-forma... 0 \n",
2019
- "1 dataset_readmes/aemska__stuhl_README.md 0 \n",
2020
- "2 dataset_readmes/Pogpotatofarmer__memes_README.md 0 \n",
2021
- "3 NaN 0 \n",
2022
- "4 NaN 0 \n",
2023
- "... ... ... \n",
2024
- "503185 dataset_readmes/ROBOTIS__ffw_bg2_rev4_pick_cof... 299 \n",
2025
- "503186 dataset_readmes/ROBOTIS__ffw_bg2_rev4_pick_cof... 299 \n",
2026
- "503187 dataset_readmes/Dongkkka__ffw_bg2_rev4_custom_... 299 \n",
2027
- "503188 dataset_readmes/chenxing1234567890__eval_testZ... 231 \n",
2028
- "503189 dataset_readmes/Dongkkka__ffw_bg2_rev4_custom_... 299 \n",
2029
- "\n",
2030
- " category field keyword \n",
2031
- "0 minimal NaN NaN \n",
2032
- "1 minimal NaN NaN \n",
2033
- "2 minimal NaN NaN \n",
2034
- "3 minimal NaN NaN \n",
2035
- "4 minimal life_sciences brain \n",
2036
- "... ... ... ... \n",
2037
- "503185 rich NaN NaN \n",
2038
- "503186 rich NaN NaN \n",
2039
- "503187 rich NaN NaN \n",
2040
- "503188 rich NaN NaN \n",
2041
- "503189 rich NaN NaN \n",
2042
- "\n",
2043
- "[503190 rows x 14 columns]"
2044
- ]
2045
- },
2046
- "execution_count": 31,
2047
- "metadata": {},
2048
- "output_type": "execute_result"
2049
- }
2050
- ],
2051
- "source": [
2052
- "merged_df"
2053
- ]
2054
- },
2055
- {
2056
- "cell_type": "code",
2057
- "execution_count": 32,
2058
- "id": "d8d61dc6",
2059
- "metadata": {},
2060
- "outputs": [
2061
- {
2062
- "name": "stdout",
2063
- "output_type": "stream",
2064
- "text": [
2065
- "merged_df saved to 'datasetcards.parquet'\n"
2066
- ]
2067
- }
2068
- ],
2069
- "source": [
2070
- "# Save to parquet\n",
2071
- "merged_df.to_parquet(\"datasetcards.parquet\", engine=\"pyarrow\", index=False)\n",
2072
- "\n",
2073
- "print(\"merged_df saved to 'datasetcards.parquet'\")\n"
2074
- ]
2075
- }
2076
- ],
2077
- "metadata": {
2078
- "kernelspec": {
2079
- "display_name": "hftest",
2080
- "language": "python",
2081
- "name": "python3"
2082
- },
2083
- "language_info": {
2084
- "codemirror_mode": {
2085
- "name": "ipython",
2086
- "version": 3
2087
- },
2088
- "file_extension": ".py",
2089
- "mimetype": "text/x-python",
2090
- "name": "python",
2091
- "nbconvert_exporter": "python",
2092
- "pygments_lexer": "ipython3",
2093
- "version": "3.10.18"
2094
- }
2095
- },
2096
- "nbformat": 4,
2097
- "nbformat_minor": 5
2098
- }