victormiller committed on
Commit
4254834
1 Parent(s): 4a437aa

Update curated.py

Browse files
Files changed (1) hide show
  1. curated.py +15 -23
curated.py CHANGED
@@ -78,7 +78,7 @@ wikipedia_filter = pd.DataFrame(
78
  "",
79
  ],
80
  "Total Percentage Remaining": [
81
- "98.14%",
82
  ],
83
  }
84
  )
@@ -107,7 +107,7 @@ freelaw_filter = pd.DataFrame(
107
  "",
108
  ],
109
  "Total Percentage Remaining": [
110
- "98.14%",
111
  ],
112
  }
113
  )
@@ -136,7 +136,7 @@ dmm_filter = pd.DataFrame(
136
  "",
137
  ],
138
  "Total Percentage Remaining": [
139
- "98.14%",
140
  ],
141
  }
142
  )
@@ -166,7 +166,7 @@ uspto_filter = pd.DataFrame(
166
  "",
167
  ],
168
  "Total Percentage Remaining": [
169
- "98.14%",
170
  ],
171
  }
172
  )
@@ -195,7 +195,7 @@ pg19_filter = pd.DataFrame(
195
  "",
196
  ],
197
  "Total Percentage Remaining": [
198
- "98.14%",
199
  ],
200
  }
201
  )
@@ -225,7 +225,7 @@ hn_filter = pd.DataFrame(
225
  "",
226
  ],
227
  "Total Percentage Remaining": [
228
- "98.14%",
229
  ],
230
  }
231
  )
@@ -255,7 +255,7 @@ uirc_filter = pd.DataFrame(
255
  "",
256
  ],
257
  "Total Percentage Remaining": [
258
- "98.14%",
259
  ],
260
  }
261
  )
@@ -284,7 +284,7 @@ up_filter = pd.DataFrame(
284
  "",
285
  ],
286
  "Total Percentage Remaining": [
287
- "98.14%",
288
  ],
289
  }
290
  )
@@ -313,7 +313,7 @@ se_filter = pd.DataFrame(
313
  "",
314
  ],
315
  "Total Percentage Remaining": [
316
- "98.14%",
317
  ],
318
  }
319
  )
@@ -342,7 +342,7 @@ arx_filter = pd.DataFrame(
342
  "",
343
  ],
344
  "Total Percentage Remaining": [
345
- "98.14%",
346
  ],
347
  }
348
  )
@@ -371,7 +371,7 @@ s2o_filter = pd.DataFrame(
371
  "",
372
  ],
373
  "Total Percentage Remaining": [
374
- "98.14%",
375
  ],
376
  }
377
  )
@@ -400,7 +400,7 @@ med_filter = pd.DataFrame(
400
  "",
401
  ],
402
  "Total Percentage Remaining": [
403
- "98.14%",
404
  ],
405
  }
406
  )
@@ -429,7 +429,7 @@ phil_filter = pd.DataFrame(
429
  "",
430
  ],
431
  "Total Percentage Remaining": [
432
- "98.14%",
433
  ],
434
  }
435
  )
@@ -445,8 +445,8 @@ filtering_process = Div(
445
  H3("Wikipedia"),
446
  H4("Download and Extraction"),
447
  Ol(
448
- Li("Downloaded from Wikimedia official dump of wikipedia on huggingface https://huggingface.co/datasets/wikimedia/wikipedia/tree/main"),
449
- Li("Data is originally in parqet format so we use huggingface dataset.to_json function to convert it to the jsonl format"),
450
  ),
451
  H4("Filtering"),
452
  Ol(
@@ -456,10 +456,6 @@ filtering_process = Div(
456
  Ol(
457
  Li("Whole wikipedia was deduped using minhash generation following Slim pajama code"),
458
  ),
459
- H4("Global Deduplication Process"),
460
- Ol(
461
- Li("After local dedup, remaining wikipedia was deduped again with all the datasets combined"),
462
- ),
463
  table_div_wikipedia,
464
 
465
  ),
@@ -485,10 +481,6 @@ filtering_process = Div(
485
  Ol(
486
  Li("Local dedup was done with all papers combined."),
487
  ),
488
- H4("Global Deduplication Process"),
489
- Ol(
490
- Li("This data was part of paper domain which are combined together and minhash was generated and deduped together with all the datasets after doing local dedup."),
491
- ),
492
  table_div_arx,
493
  ),
494
  Section(
 
78
  "",
79
  ],
80
  "Total Percentage Remaining": [
81
+ "",
82
  ],
83
  }
84
  )
 
107
  "",
108
  ],
109
  "Total Percentage Remaining": [
110
+ "%",
111
  ],
112
  }
113
  )
 
136
  "",
137
  ],
138
  "Total Percentage Remaining": [
139
+ "%",
140
  ],
141
  }
142
  )
 
166
  "",
167
  ],
168
  "Total Percentage Remaining": [
169
+ "%",
170
  ],
171
  }
172
  )
 
195
  "",
196
  ],
197
  "Total Percentage Remaining": [
198
+ "%",
199
  ],
200
  }
201
  )
 
225
  "",
226
  ],
227
  "Total Percentage Remaining": [
228
+ "%",
229
  ],
230
  }
231
  )
 
255
  "",
256
  ],
257
  "Total Percentage Remaining": [
258
+ "%",
259
  ],
260
  }
261
  )
 
284
  "",
285
  ],
286
  "Total Percentage Remaining": [
287
+ "%",
288
  ],
289
  }
290
  )
 
313
  "",
314
  ],
315
  "Total Percentage Remaining": [
316
+ "%",
317
  ],
318
  }
319
  )
 
342
  "",
343
  ],
344
  "Total Percentage Remaining": [
345
+ "%",
346
  ],
347
  }
348
  )
 
371
  "",
372
  ],
373
  "Total Percentage Remaining": [
374
+ "%",
375
  ],
376
  }
377
  )
 
400
  "",
401
  ],
402
  "Total Percentage Remaining": [
403
+ "%",
404
  ],
405
  }
406
  )
 
429
  "",
430
  ],
431
  "Total Percentage Remaining": [
432
+ "%",
433
  ],
434
  }
435
  )
 
445
  H3("Wikipedia"),
446
  H4("Download and Extraction"),
447
  Ol(
448
+ Li("The Wikimedia dataset was downloaded from the official snapshot on Huggingface", A("https://huggingface.co/datasets/wikimedia/wikipedia/tree/main", href="https://huggingface.co/datasets/wikimedia/wikipedia/tree/main")),
449
+ Li("Data is originally in parqet format so we used the", D_code("huggingface dataset.to_json"), " function to convert the data to the jsonl format"),
450
  ),
451
  H4("Filtering"),
452
  Ol(
 
456
  Ol(
457
  Li("Whole wikipedia was deduped using minhash generation following Slim pajama code"),
458
  ),
 
 
 
 
459
  table_div_wikipedia,
460
 
461
  ),
 
481
  Ol(
482
  Li("Local dedup was done with all papers combined."),
483
  ),
 
 
 
 
484
  table_div_arx,
485
  ),
486
  Section(