hynky HF staff commited on
Commit
43ec909
1 Parent(s): 462133f

add webpack

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitignore +4 -0
  2. assets/data/clustering/data.csv +0 -0
  3. assets/data/clustering/info.csv +106 -0
  4. assets/data/plots/all_dumps_bad/agg_score.json +1 -0
  5. assets/data/plots/all_dumps_bad/arc_acc_norm.json +1 -0
  6. assets/data/plots/all_dumps_bad/commonsense_qa_acc_norm.json +1 -0
  7. assets/data/plots/all_dumps_bad/hellaswag_acc_norm.json +1 -0
  8. assets/data/plots/all_dumps_bad/index.json +1 -0
  9. assets/data/plots/all_dumps_bad/mmlu_acc_norm.json +1 -0
  10. assets/data/plots/all_dumps_bad/openbookqa_acc_norm.json +1 -0
  11. assets/data/plots/all_dumps_bad/piqa_acc_norm.json +1 -0
  12. assets/data/plots/all_dumps_bad/siqa_acc_norm.json +1 -0
  13. assets/data/plots/all_dumps_bad/winogrande_acc_norm.json +1 -0
  14. assets/data/plots/all_filtering_steps/agg_score.json +1 -0
  15. assets/data/plots/all_filtering_steps/arc_acc_norm.json +1 -0
  16. assets/data/plots/all_filtering_steps/commonsense_qa_acc_norm.json +1 -0
  17. assets/data/plots/all_filtering_steps/hellaswag_acc_norm.json +1 -0
  18. assets/data/plots/all_filtering_steps/index.json +1 -0
  19. assets/data/plots/all_filtering_steps/mmlu_acc_norm.json +1 -0
  20. assets/data/plots/all_filtering_steps/openbookqa_acc_norm.json +1 -0
  21. assets/data/plots/all_filtering_steps/piqa_acc_norm.json +1 -0
  22. assets/data/plots/all_filtering_steps/siqa_acc_norm.json +1 -0
  23. assets/data/plots/all_filtering_steps/winogrande_acc_norm.json +1 -0
  24. assets/data/plots/c4_filters_hellaswag/agg_score.json +1 -0
  25. assets/data/plots/c4_filters_hellaswag/arc_acc_norm.json +1 -0
  26. assets/data/plots/c4_filters_hellaswag/commonsense_qa_acc_norm.json +1 -0
  27. assets/data/plots/c4_filters_hellaswag/hellaswag_acc_norm.json +1 -0
  28. assets/data/plots/c4_filters_hellaswag/index.json +1 -0
  29. assets/data/plots/c4_filters_hellaswag/mmlu_acc_norm.json +1 -0
  30. assets/data/plots/c4_filters_hellaswag/openbookqa_acc_norm.json +1 -0
  31. assets/data/plots/c4_filters_hellaswag/piqa_acc_norm.json +1 -0
  32. assets/data/plots/c4_filters_hellaswag/siqa_acc_norm.json +1 -0
  33. assets/data/plots/c4_filters_hellaswag/winogrande_acc_norm.json +1 -0
  34. assets/data/plots/cross_ind_unfiltered_comparison/agg_score.json +1 -0
  35. assets/data/plots/cross_ind_unfiltered_comparison/commonsense_qa_acc_norm.json +1 -0
  36. assets/data/plots/cross_ind_unfiltered_comparison/hellaswag_acc_norm.json +1 -0
  37. assets/data/plots/cross_ind_unfiltered_comparison/index.json +1 -0
  38. assets/data/plots/cross_ind_unfiltered_comparison/mmlu_acc_norm.json +1 -0
  39. assets/data/plots/cross_ind_unfiltered_comparison/openbookqa_acc_norm.json +1 -0
  40. assets/data/plots/cross_ind_unfiltered_comparison/piqa_acc_norm.json +1 -0
  41. assets/data/plots/cross_ind_unfiltered_comparison/winogrande_acc_norm.json +1 -0
  42. assets/data/plots/custom_filters/agg_score.json +1 -0
  43. assets/data/plots/custom_filters/arc_acc_norm.json +1 -0
  44. assets/data/plots/custom_filters/commonsense_qa_acc_norm.json +1 -0
  45. assets/data/plots/custom_filters/hellaswag_acc_norm.json +1 -0
  46. assets/data/plots/custom_filters/index.json +1 -0
  47. assets/data/plots/custom_filters/mmlu_acc_norm.json +1 -0
  48. assets/data/plots/custom_filters/openbookqa_acc_norm.json +1 -0
  49. assets/data/plots/custom_filters/piqa_acc_norm.json +1 -0
  50. assets/data/plots/custom_filters/siqa_acc_norm.json +1 -0
.gitignore ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ node_modules/
2
+ *.log
3
+ *.env
4
+ *.cache
assets/data/clustering/data.csv ADDED
The diff for this file is too large to render. See raw diff
 
assets/data/clustering/info.csv ADDED
@@ -0,0 +1,106 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ,cluster_id,cluster_summaries,cluster_position_x,cluster_position_y
2
+ 0,-1,None,9.926462,4.7121987
3
+ 1,0,Philosophical/Spiritual Introspection,10.312462,1.2666532
4
+ 2,1,"Scholarships,",8.167274,4.8995786
5
+ 3,2,Politics,8.81142,2.4859838
6
+ 4,3,Theology,9.615214,0.3783942
7
+ 5,4,Dating,4.985182,1.8439052
8
+ 6,5,Accommodation,11.457769,5.080919
9
+ 7,6,Football,6.6154537,-1.6859366
10
+ 8,7,Film Festival,6.9734483,1.4548192
11
+ 9,8,Culinary,13.426296,4.5412893
12
+ 10,9,Music,6.0653744,0.7536916
13
+ 11,10,Gambling,3.124241,3.2533677
14
+ 12,11,Baseball,7.133596,-2.4256644
15
+ 13,12,Technology,6.4929094,6.768577
16
+ 14,13,Website Policies,4.873843,5.771508
17
+ 15,14,Weddings,11.815845,3.7894728
18
+ 16,15,Gaming,5.529167,2.9530518
19
+ 17,16,Commodities/Services Provision,10.453564,5.8489122
20
+ 18,17,Crafts,13.287651,6.4237967
21
+ 19,18,Automobiles,9.9531145,8.840178
22
+ 20,19,Watches,13.893139,9.859185
23
+ 21,20,Dogs,12.595798,3.5351615
24
+ 22,21,Photography,10.7942295,3.5504062
25
+ 23,22,Legalities,8.942016,4.72733
26
+ 24,23,Consumer Electronics,7.078649,8.338984
27
+ 25,24,Insulation,10.520957,7.914946
28
+ 26,25,Cannabis,14.317424,3.2114828
29
+ 27,26,Footwear,15.052116,7.6956415
30
+ 28,27,Real Estate,9.536316,6.103533
31
+ 29,28,Relocation,10.205071,7.1883316
32
+ 30,29,Sports betting,3.2779586,2.443366
33
+ 31,30,Narratives,7.613535,1.8300554
34
+ 32,31,Dating,4.788838,2.1900373
35
+ 33,32,Apparel/Clothing,14.394226,7.3073387
36
+ 34,33,User Authentication,5.265638,6.4014487
37
+ 35,34,Academicwriting,6.9187264,3.4357684
38
+ 36,35,Sports,7.4969172,-2.086585
39
+ 37,36,Fashion/Lifestyle Products,13.821669,7.7150764
40
+ 38,37,Diverse events,9.437052,2.2438836
41
+ 39,38,Blockchain/Cryptocurrency,7.7586045,6.9439344
42
+ 40,39,Online Businesses/Marketing,6.522259,5.219268
43
+ 41,40,Healthcare,11.425277,2.3801014
44
+ 42,41,Home Decor,12.878046,7.2632184
45
+ 43,42,Biomedicine,12.789575,2.3376262
46
+ 44,43,Jewelry,14.259997,8.653363
47
+ 45,44,Addiction,11.561383,1.3774762
48
+ 46,45,Products,11.711758,8.423251
49
+ 47,46,Multi-purposefulness,11.080702,7.4574013
50
+ 48,47,"Mass transit,",9.910158,5.4402313
51
+ 49,48,Ethernet,6.9763823,7.7909245
52
+ 50,49,Legal,9.516912,4.636553
53
+ 51,50,E-commerce,13.263438,8.6548195
54
+ 52,51,Audio,7.717162,8.903019
55
+ 53,52,Infrastructure,10.52904,5.369669
56
+ 54,53,Firearms,11.062812,9.268473
57
+ 55,54,Freight/Logistics,9.551044,7.0336204
58
+ 56,55,Products,12.073747,7.645973
59
+ 57,56,Vaccinations,11.9387045,2.7824683
60
+ 58,57,Artwork,11.019163,4.1677165
61
+ 59,58,Viticulture,14.223523,5.0761614
62
+ 60,59,WordPress,5.9597983,5.824579
63
+ 61,60,Cosmetics/Dermatology,15.093273,3.4669027
64
+ 62,61,Software,6.375921,6.4298844
65
+ 63,62,Dentistry,14.76626,1.1620314
66
+ 64,63,Pest Control,13.201735,3.6806118
67
+ 65,64,SEO,5.720493,5.238112
68
+ 66,65,Lottery,1.7142816,2.9782674
69
+ 67,66,Narratives,8.460977,1.0804662
70
+ 68,67,Waste Reduction & Recycling,10.634534,6.959523
71
+ 69,68,Communication,6.438943,5.9467845
72
+ 70,69,Orthopedics,13.005415,1.1908791
73
+ 71,70,Home Decor & Furniture,12.732457,7.876862
74
+ 72,71,Education,7.6568975,3.4944353
75
+ 73,72,Sports,7.295141,-0.7343214
76
+ 74,73,Social Media Advertising,6.133886,4.8547883
77
+ 75,74,Privacy,4.756733,6.3598356
78
+ 76,75,Website design,6.1168823,5.465095
79
+ 77,76,Roofing,11.389448,8.080609
80
+ 78,77,Nutrition/Supplements,13.631578,2.5334294
81
+ 79,78,Haircare/Hairstyling,15.544645,4.54254
82
+ 80,79,Cookies,4.341592,6.819268
83
+ 81,80,International Trade,8.993828,6.4757586
84
+ 82,81,Entrepreneurial Resources,9.435777,5.3340797
85
+ 83,82,Cricket,6.5171986,-1.245905
86
+ 84,83,Crafts,13.852216,7.049825
87
+ 85,84,Floristry,13.407425,5.8741536
88
+ 86,85,Genealogy,9.530803,1.6548243
89
+ 87,86,Mental Health,11.074349,1.6069281
90
+ 88,87,Volunteerism,10.145443,3.6734574
91
+ 89,88,Lighting,11.385381,8.93693
92
+ 90,89,Artificial Intelligence,6.5306387,6.2178063
93
+ 91,90,Business,7.471462,6.4142885
94
+ 92,91,E-commerce,13.638669,6.5098934
95
+ 93,92,Urbanization/Over-tourism,10.221115,6.100654
96
+ 94,93,Events,10.8449,3.9822264
97
+ 95,94,Pharmaceuticals/Biotechnology,12.318266,2.4331784
98
+ 96,95,Professional Wrestling,6.856304,-0.65598303
99
+ 97,96,Various,9.3211975,3.4894605
100
+ 98,97,Medicine,13.17882,2.1281319
101
+ 99,98,Community Engagement,9.848856,3.5187004
102
+ 100,99,Fitness,12.504849,0.9134393
103
+ 101,100,Bathroom Design & Toilet Engineering,11.779076,7.2920136
104
+ 102,101,Business Development,7.328447,5.659843
105
+ 103,102,Sports,7.6370654,-1.0701839
106
+ 104,103,Sexuality,13.817207,1.6510898
assets/data/plots/all_dumps_bad/agg_score.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"data": {"big-run-refinedweb": {"x": [0.0, 2.0971520000000003, 4.194304000000001, 6.291456, 8.388608000000001, 10.48576, 12.582912, 14.680064000000002, 16.777216000000003, 18.874368, 20.97152, 23.068672000000003, 25.165824, 27.262976000000002, 28.311552000000002, 29.360128000000003, 31.45728, 33.554432000000006, 35.651584, 37.748736, 39.845888, 41.94304, 44.040192000000005, 46.137344000000006, 48.234496, 50.331648, 52.4288, 54.525952000000004, 56.623104000000005, 58.720256000000006, 60.817408, 62.91456, 65.011712, 67.10886400000001, 69.206016, 71.303168, 73.40032000000001, 75.497472, 77.59462400000001, 79.691776, 81.788928, 83.88608, 85.983232, 88.08038400000001, 90.177536, 92.27468800000001, 94.37184, 96.468992, 98.56614400000001, 100.663296, 102.76044800000001, 104.8576, 106.95475200000001, 109.05190400000001, 111.149056, 113.24620800000001, 115.34336, 117.44051200000001, 119.537664, 121.634816, 123.73196800000001, 125.82912, 127.92627200000001, 130.023424, 132.120576, 134.21772800000002, 136.31488000000002, 138.412032, 140.509184, 142.606336, 144.70348800000002, 146.80064000000002, 148.897792, 150.994944, 153.092096, 155.18924800000002, 157.28640000000001, 159.383552, 161.480704, 163.577856, 165.67500800000002, 167.77216, 169.869312, 171.966464, 174.06361600000002, 176.16076800000002, 178.25792, 180.355072, 182.452224, 184.54937600000002, 186.64652800000002, 188.74368, 190.840832, 192.937984, 195.03513600000002, 197.13228800000002, 199.22944, 201.326592, 203.423744, 205.52089600000002, 207.61804800000002, 209.7152, 211.812352, 213.90950400000003, 216.00665600000002, 218.10380800000001, 220.20096, 222.298112, 224.39526400000003, 226.49241600000002, 228.589568, 230.68672, 232.783872, 234.88102400000002, 236.97817600000002, 239.075328, 241.17248, 243.269632, 245.36678400000002, 247.46393600000002, 249.561088, 251.65824, 253.75539200000003, 255.85254400000002, 257.949696, 260.046848, 262.144, 264.241152, 266.338304, 268.43545600000004, 270.53260800000004, 272.62976000000003, 274.726912, 276.824064, 278.921216, 281.018368, 283.11552, 285.212672, 287.309824, 289.40697600000004, 291.50412800000004, 293.60128000000003, 295.698432, 297.795584, 299.892736, 301.989888, 304.08704, 306.184192, 308.28134400000005, 310.37849600000004, 312.47564800000004, 314.57280000000003, 316.669952, 318.767104, 320.864256, 322.961408, 325.05856, 327.155712, 329.25286400000005, 331.35001600000004, 333.44716800000003, 335.54432, 337.641472, 339.738624, 341.835776, 343.932928, 346.03008, 348.12723200000005, 350.22438400000004], "y": [0.3308933284133672, 0.3534814938902855, 0.3764607086777687, 0.38782499730587, 0.3981050960719585, 0.4028486795723438, 0.4125883243978023, 0.4117814563214779, 0.414029736071825, 0.4197172522544861, 0.4211113378405571, 0.4279881417751312, 0.4280137903988361, 0.4280424378812313, 0.4291964024305343, 0.4326301179826259, 0.4371833503246307, 0.4346669465303421, 0.4336562640964985, 0.4432648755609989, 0.4401291646063328, 0.4394684173166752, 0.4476612061262131, 0.4465444348752498, 0.4472153298556804, 0.4433343075215816, 0.4510187618434429, 0.4459567815065384, 0.4460812956094742, 0.4498684890568256, 0.4529943652451038, 0.4528274349868297, 0.4551213420927524, 0.4549156539142132, 0.4564928151667118, 0.4576693661510944, 0.4557182416319847, 0.4536240361630916, 0.457439012825489, 0.4570476822555065, 0.4589823484420776, 0.462024375796318, 0.4540738053619861, 0.4550252184271812, 0.4576593860983848, 0.4573238864541054, 0.4575810581445694, 0.4622134491801262, 0.4592566937208175, 0.4614734016358852, 0.4637473002076149, 0.4625372551381588, 0.4613912180066108, 0.4597448222339153, 0.4594792164862156, 0.4662549719214439, 0.4634026065468788, 0.4633508697152138, 0.4635734222829342, 0.4628961533308029, 0.4670135043561458, 0.4639505892992019, 0.4631133340299129, 0.4665167145431041, 0.4672448337078094, 0.4693268723785877, 0.4630668573081493, 0.4676454700529575, 0.4646359197795391, 0.4621579721570015, 0.4692446552217006, 0.4704835228621959, 0.4663223996758461, 0.4680556617677212, 0.466339822858572, 0.4682099223136902, 0.4711195565760135, 0.4722655527293682, 0.4727961830794811, 0.4676857478916645, 0.4719390422105789, 0.4713102728128433, 0.4712141714990139, 0.4721613004803657, 0.4713456854224205, 0.4682970903813839, 0.4679934531450271, 0.4685162976384163, 0.4679946713149547, 0.4681242071092129, 0.4702276065945625, 0.472664151340723, 0.4730790853500366, 0.4731674715876579, 0.4718914777040481, 0.4719801284372806, 0.4761029370129108, 0.4735167175531387, 0.4730370938777923, 0.4730173237621784, 0.4735377207398414, 0.4777223989367485, 0.4796326830983162, 0.4734170883893966, 0.4739485755562782, 0.4748299159109592, 0.4765299335122108, 0.4745025858283043, 0.4754423759877682, 0.4784592799842357, 0.4761341325938701, 0.4760282784700393, 0.4769757278263569, 0.47154351323843, 0.4786738082766533, 0.4804279990494251, 0.4777076803147793, 0.4798569902777672, 0.4759011939167976, 0.4784621745347976, 0.479673832654953, 0.4780617095530033, 0.48076206818223, 0.47995800152421, 0.4790860973298549, 0.4817167408764362, 0.4811586998403072, 0.482547752559185, 0.4816697351634502, 0.4809327870607376, 0.4816545359790325, 0.4804601892828941, 0.4776877984404564, 0.4813711903989315, 0.4844604581594467, 0.4819537848234176, 0.4820829331874847, 0.4778126627206802, 0.482935007661581, 0.48230691999197, 0.4826001971960068, 0.4823969900608063, 0.4811219945549965, 0.4789146520197391, 0.484035175293684, 0.4848698377609253, 0.4855728335678577, 0.4825376532971859, 0.485215101391077, 0.4824351668357849, 0.4835342466831207, 0.4822137206792831, 0.4838785007596016, 0.4837255179882049, 0.4853012599050998, 0.4857851006090641, 0.4863366298377514, 0.4856646582484245, 0.4842503517866134, 0.4838776960968971, 0.4846346862614155, 0.4837041422724724, 0.4813097268342972, 0.4873070046305656, 0.4841253720223903, 0.4837464913725853, 0.483069509267807, 0.4851242564618587, 0.4861010462045669], "label": "RefinedWeb"}, "big-run-sampled_full_filtered_no_dedup": {"x": [0.0, 2.0971520000000003, 4.194304000000001, 6.291456, 8.388608000000001, 10.48576, 12.582912, 14.680064000000002, 16.777216000000003, 18.874368, 20.97152, 23.068672000000003, 25.165824, 27.262976000000002, 28.311552000000002, 29.360128000000003, 31.45728, 33.554432000000006, 35.651584, 37.748736, 39.845888, 41.94304, 44.040192000000005, 46.137344000000006, 48.234496, 50.331648, 52.4288, 54.525952000000004, 56.623104000000005, 58.720256000000006, 60.817408, 62.91456, 65.011712, 67.10886400000001, 69.206016, 71.303168, 73.40032000000001, 75.497472, 77.59462400000001, 79.691776, 81.788928, 83.88608, 85.983232, 88.08038400000001, 90.177536, 92.27468800000001, 94.37184, 96.468992, 98.56614400000001, 100.663296, 102.76044800000001, 104.8576, 106.95475200000001, 109.05190400000001, 111.149056, 113.24620800000001, 115.34336, 117.44051200000001, 119.537664, 121.634816, 123.73196800000001, 125.82912, 127.92627200000001, 130.023424, 132.120576, 134.21772800000002, 136.31488000000002, 138.412032, 140.509184, 142.606336, 144.70348800000002, 146.80064000000002, 148.897792, 150.994944, 153.092096, 155.18924800000002, 157.28640000000001, 159.383552, 161.480704, 163.577856, 165.67500800000002, 167.77216, 169.869312, 171.966464, 174.06361600000002, 176.16076800000002, 178.25792, 180.355072, 182.452224, 184.54937600000002, 186.64652800000002, 188.74368, 190.840832, 192.937984, 195.03513600000002, 197.13228800000002, 199.22944, 201.326592, 203.423744, 205.52089600000002, 207.61804800000002, 209.7152, 211.812352, 213.90950400000003, 216.00665600000002, 218.10380800000001, 220.20096, 222.298112, 224.39526400000003, 226.49241600000002, 228.589568, 230.68672, 232.783872, 234.88102400000002, 236.97817600000002, 239.075328, 241.17248, 243.269632, 245.36678400000002, 247.46393600000002, 249.561088, 251.65824, 253.75539200000003, 255.85254400000002, 257.949696, 260.046848, 262.144, 264.241152, 266.338304, 268.43545600000004, 270.53260800000004, 272.62976000000003, 274.726912, 276.824064, 278.921216, 281.018368, 283.11552, 285.212672, 287.309824, 289.40697600000004, 291.50412800000004, 293.60128000000003, 295.698432, 297.795584, 299.892736, 301.989888, 304.08704, 306.184192, 308.28134400000005, 310.37849600000004, 312.47564800000004, 314.57280000000003, 316.669952, 318.767104, 320.864256, 322.961408, 325.05856, 327.155712, 329.25286400000005, 331.35001600000004, 333.44716800000003, 335.54432, 337.641472, 339.738624, 341.835776, 343.932928, 346.03008, 348.12723200000005, 350.22438400000004], "y": [0.3308933284133672, 0.3605199865996837, 0.3733148723840713, 0.3882005847990513, 0.3934122696518898, 0.3947227671742439, 0.4042885974049568, 0.3974800482392311, 0.4055779427289963, 0.4133470430970192, 0.4117913842201233, 0.4113653488457203, 0.4149517640471458, 0.4187851920723915, 0.4252083078026771, 0.4206527359783649, 0.4240428246557712, 0.422003373503685, 0.4280910938978195, 0.4244147576391697, 0.4316282644867897, 0.4295645765960216, 0.4310102686285972, 0.4360743537545204, 0.4313482865691185, 0.4350991360843181, 0.4378576353192329, 0.4335876516997814, 0.4347924515604973, 0.4348904751241207, 0.436600212007761, 0.430036511272192, 0.4350974671542644, 0.4399556629359722, 0.4371416717767715, 0.4363861419260502, 0.4376698136329651, 0.4405004419386387, 0.4373639523983001, 0.4379038028419018, 0.4371281825006008, 0.4393439553678036, 0.440426729619503, 0.4401675276458263, 0.4429537951946258, 0.4449137263000011, 0.4434786736965179, 0.4450470842421055, 0.4454202279448509, 0.4394537284970283, 0.442185215651989, 0.4461225643754005, 0.4427758157253265, 0.4430646039545536, 0.4476901069283485, 0.4478763341903686, 0.4493869319558143, 0.4448477327823639, 0.450044184923172, 0.4498609118163585, 0.4457665979862213, 0.4506924152374267, 0.449855338782072, 0.448790930211544, 0.4474099352955818, 0.4546772800385952, 0.4529431238770485, 0.452015146613121, 0.4502020999789238, 0.4493804536759853, 0.4523266032338142, 0.4551868587732315, 0.4501944817602634, 0.4493303671479225, 0.4526805207133293, 0.4533850513398647, 0.4518048763275146, 0.4518973492085933, 0.4531301632523536, 0.4518006071448326, 0.4553494565188885, 0.4528752230107784, 0.4536322727799415, 0.4561733976006508, 0.4549491256475448, 0.4574789106845855, 0.4577847123146057, 0.4563642293214798, 0.4578686729073524, 0.4561499990522861, 0.4537816494703293, 0.4542164430022239, 0.4559455662965774, 0.4554723873734474, 0.4575514122843742, 0.4575202167034149, 0.4592722058296203, 0.4585275091230869, 0.4580587856471538, 0.456934317946434, 0.4577495418488979, 0.4540119916200638, 0.4570806957781315, 0.4608120545744896, 0.4588425755500793, 0.4578334167599678, 0.4610816091299057, 0.4598177038133144, 0.461849745362997, 0.4631866924464702, 0.4601576402783394, 0.4646804705262184, 0.4632389545440674, 0.4604574106633663, 0.4602976888418197, 0.4581312239170074, 0.4654182009398937, 0.4655338563024997, 0.4616620391607284, 0.461054053157568, 0.4613021649420261, 0.4658613465726375, 0.4633531905710697, 0.4613638147711754, 0.4643996246159076, 0.462500050663948, 0.4650798961520195, 0.4648764543235302, 0.4639869071543216, 0.4634246975183487, 0.46585888043046, 0.4639799632132053, 0.4630857892334461, 0.4644265696406364, 0.4642998576164245, 0.4686848931014538, 0.4687492996454239, 0.4650243632495403, 0.4627032242715359, 0.4665953740477562, 0.4660026729106903, 0.4664581045508384, 0.4676475040614605, 0.4657339677214622, 0.4664678275585174, 0.4673498086631298, 0.4676674827933311, 0.4680955372750759, 0.4681585058569908, 0.4659864418208599, 0.4686457589268684, 0.4661462865769863, 0.4658931568264961, 0.4674226939678192, 0.46805215254426, 0.4682257212698459, 0.4689070098102093, 0.4699570722877979, 0.4655096270143986, 0.4688013233244419, 0.4707522802054882, 0.4661469310522079, 0.4688841328024864, 0.4671329781413078, 0.4662554152309894, 0.4697433896362781, 0.4698473587632179, 0.4676505327224731, 0.4696521013975143], "label": "FineWeb filtered only"}, "big-run-fineweb-cross-dedup-fixed": {"x": [0.0, 2.0971520000000003, 4.194304000000001, 6.291456, 8.388608000000001, 10.48576, 12.582912, 14.680064000000002, 16.777216000000003, 18.874368, 20.97152, 23.068672000000003, 25.165824, 27.262976000000002, 29.360128000000003, 31.45728, 33.554432000000006, 35.651584, 37.748736, 39.845888, 41.94304, 44.040192000000005, 46.137344000000006, 48.234496, 50.331648, 52.4288, 54.525952000000004, 56.623104000000005, 58.720256000000006, 60.817408, 62.91456, 65.011712, 67.10886400000001, 69.206016, 71.303168, 73.40032000000001, 75.497472, 77.59462400000001, 79.691776, 81.788928, 83.88608, 85.983232, 88.08038400000001, 90.177536, 92.27468800000001, 94.37184, 96.468992, 98.56614400000001, 100.663296, 102.76044800000001, 104.8576, 106.95475200000001, 109.05190400000001, 111.149056, 113.24620800000001, 115.34336, 117.44051200000001, 119.537664, 121.634816, 123.73196800000001, 125.82912, 127.92627200000001, 130.023424, 132.120576, 134.21772800000002, 136.31488000000002, 138.412032, 140.509184, 142.606336, 144.70348800000002, 146.80064000000002, 148.897792, 150.994944, 153.092096, 155.18924800000002, 157.28640000000001, 159.383552, 161.480704, 163.577856, 165.67500800000002, 167.77216, 169.869312, 171.966464, 174.06361600000002, 176.16076800000002, 178.25792, 180.355072, 182.452224, 184.54937600000002, 186.64652800000002, 188.74368, 190.840832, 192.937984, 195.03513600000002, 197.13228800000002, 199.22944, 201.326592, 203.423744, 205.52089600000002, 207.61804800000002, 209.7152, 211.812352, 213.90950400000003, 216.00665600000002, 218.10380800000001, 220.20096, 222.298112, 224.39526400000003, 226.49241600000002, 228.589568, 230.68672, 232.783872, 234.88102400000002, 236.97817600000002, 239.075328, 241.17248, 243.269632, 245.36678400000002, 247.46393600000002, 249.561088, 251.65824, 253.75539200000003, 255.85254400000002, 257.949696, 260.046848, 262.144, 264.241152, 266.338304, 268.43545600000004, 270.53260800000004, 272.62976000000003, 274.726912, 276.824064, 278.921216, 281.018368, 283.11552, 285.212672, 287.309824, 289.40697600000004, 291.50412800000004, 293.60128000000003, 295.698432, 297.795584, 299.892736, 301.989888, 304.08704, 306.184192, 308.28134400000005, 310.37849600000004, 312.47564800000004, 314.57280000000003, 316.669952, 318.767104, 320.864256, 322.961408, 325.05856, 327.155712, 329.25286400000005, 331.35001600000004, 333.44716800000003, 335.54432, 337.641472, 339.738624, 341.835776, 343.932928, 346.03008, 348.12723200000005, 350.22438400000004], "y": [0.3308933284133672, 0.3551952373236418, 0.3736435137689113, 0.3814037963747978, 0.3948809280991554, 0.3996850810945034, 0.4089604057371616, 0.4100853353738785, 0.4119834117591381, 0.4168377220630646, 0.4186493046581745, 0.4169826358556747, 0.4234288297593593, 0.4229162000119686, 0.4273439794778824, 0.4290364980697632, 0.4291782416403293, 0.4296907968819141, 0.4311576783657074, 0.4326641112565994, 0.430318683385849, 0.430436260998249, 0.4339037239551544, 0.4363459683954716, 0.4357402548193931, 0.4342963136732578, 0.4366712383925915, 0.4363959729671478, 0.436981026083231, 0.4447868093848228, 0.4411709941923618, 0.4406092017889023, 0.4424176625907421, 0.4423875361680984, 0.4422253370285034, 0.4410557933151722, 0.4447037056088447, 0.4454837813973427, 0.4435960277915001, 0.4468514993786812, 0.4479999616742134, 0.4428562931716442, 0.445764634758234, 0.4456562362611294, 0.4488007053732872, 0.4475954286754131, 0.4468922987580299, 0.4548408314585686, 0.4511027485132217, 0.4530330970883369, 0.4483681954443455, 0.4531726539134979, 0.45334542542696, 0.4544384703040123, 0.4530758671462536, 0.4540613554418087, 0.4510113634169101, 0.4538320265710354, 0.4518541917204857, 0.4536847211420536, 0.4532708041369915, 0.4552236869931221, 0.455034039914608, 0.4562875479459762, 0.4532428197562694, 0.4574853852391243, 0.4517738744616508, 0.4579889141023159, 0.4538268558681011, 0.456730306148529, 0.4526018649339676, 0.4562746733427048, 0.4560015797615051, 0.4555426277220249, 0.4561501257121563, 0.4524396173655987, 0.4557023830711841, 0.4589769169688225, 0.4581078588962555, 0.4620813727378845, 0.4586601965129375, 0.4568093195557594, 0.4569808952510357, 0.4567535072565079, 0.4575250148773193, 0.4606908001005649, 0.4603964723646641, 0.4622848592698574, 0.4594669193029403, 0.4640629850327968, 0.4604269936680794, 0.4634841009974479, 0.4644578285515308, 0.4642514958977699, 0.4666304066777229, 0.4616626128554344, 0.4588956907391548, 0.4620226770639419, 0.4628621749579906, 0.4595407098531723, 0.4635516740381717, 0.46005355194211, 0.4601523540914058, 0.4644204638898372, 0.4620639197528362, 0.46614545956254, 0.4636696502566337, 0.4610077403485775, 0.4640897810459137, 0.4636163525283336, 0.4630545899271965, 0.466012816876173, 0.4650349207222461, 0.4613720141351223, 0.4644323363900184, 0.4647249802947044, 0.4656480401754379, 0.4651664271950722, 0.4622530452907085, 0.4655019529163837, 0.4650313258171081, 0.466718140989542, 0.4661559611558914, 0.4661237150430679, 0.4664223715662956, 0.4640601389110088, 0.4642657749354839, 0.4633881188929081, 0.4629989042878151, 0.4685831367969513, 0.4675870984792709, 0.467183344066143, 0.4678030684590339, 0.4660939238965511, 0.4691914953291416, 0.4670972637832165, 0.468262892216444, 0.4672016054391861, 0.4676182121038437, 0.4698677137494087, 0.4658828042447567, 0.4701816700398922, 0.4684622809290886, 0.466015312820673, 0.4675401039421558, 0.4693200923502445, 0.4702670983970165, 0.4679145030677318, 0.4676233418285846, 0.4674933589994907, 0.4678357951343059, 0.4669915996491909, 0.4657857678830623, 0.4666901864111423, 0.4669371582567692, 0.4672787226736545, 0.4684535376727581, 0.4685697965323925, 0.4694835692644119, 0.4683254994451999, 0.4712230190634727, 0.4683987610042095, 0.4707653746008873, 0.4663059376180172, 0.4683133698999882, 0.4686385430395603, 0.4657671600580215, 0.4692615270614624], "label": "FineWeb full MinHash"}}, "layout": {"title": {"text": "Dedup across all dumps does not improve performance"}}}
assets/data/plots/all_dumps_bad/arc_acc_norm.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"data": {"big-run-refinedweb": {"x": [0.0, 2.0971520000000003, 4.194304000000001, 6.291456, 8.388608000000001, 10.48576, 12.582912, 14.680064000000002, 16.777216000000003, 18.874368, 20.97152, 23.068672000000003, 25.165824, 27.262976000000002, 28.311552000000002, 29.360128000000003, 31.45728, 33.554432000000006, 35.651584, 37.748736, 39.845888, 41.94304, 44.040192000000005, 46.137344000000006, 48.234496, 50.331648, 52.4288, 54.525952000000004, 56.623104000000005, 58.720256000000006, 60.817408, 62.91456, 65.011712, 67.10886400000001, 69.206016, 71.303168, 73.40032000000001, 75.497472, 77.59462400000001, 79.691776, 81.788928, 83.88608, 85.983232, 88.08038400000001, 90.177536, 92.27468800000001, 94.37184, 96.468992, 98.56614400000001, 100.663296, 102.76044800000001, 104.8576, 106.95475200000001, 109.05190400000001, 111.149056, 113.24620800000001, 115.34336, 117.44051200000001, 119.537664, 121.634816, 123.73196800000001, 125.82912, 127.92627200000001, 130.023424, 132.120576, 134.21772800000002, 136.31488000000002, 138.412032, 140.509184, 142.606336, 144.70348800000002, 146.80064000000002, 148.897792, 150.994944, 153.092096, 155.18924800000002, 157.28640000000001, 159.383552, 161.480704, 163.577856, 165.67500800000002, 167.77216, 169.869312, 171.966464, 174.06361600000002, 176.16076800000002, 178.25792, 180.355072, 182.452224, 184.54937600000002, 186.64652800000002, 188.74368, 190.840832, 192.937984, 195.03513600000002, 197.13228800000002, 199.22944, 201.326592, 203.423744, 205.52089600000002, 207.61804800000002, 209.7152, 211.812352, 213.90950400000003, 216.00665600000002, 218.10380800000001, 220.20096, 222.298112, 224.39526400000003, 226.49241600000002, 228.589568, 230.68672, 232.783872, 234.88102400000002, 236.97817600000002, 239.075328, 241.17248, 243.269632, 245.36678400000002, 247.46393600000002, 249.561088, 251.65824, 253.75539200000003, 255.85254400000002, 257.949696, 260.046848, 262.144, 264.241152, 266.338304, 268.43545600000004, 270.53260800000004, 272.62976000000003, 274.726912, 276.824064, 278.921216, 281.018368, 283.11552, 285.212672, 287.309824, 289.40697600000004, 291.50412800000004, 293.60128000000003, 295.698432, 297.795584, 299.892736, 301.989888, 304.08704, 306.184192, 308.28134400000005, 310.37849600000004, 312.47564800000004, 314.57280000000003, 316.669952, 318.767104, 320.864256, 322.961408, 325.05856, 327.155712, 329.25286400000005, 331.35001600000004, 333.44716800000003, 335.54432, 337.641472, 339.738624, 341.835776, 343.932928, 346.03008, 348.12723200000005, 350.22438400000004], "y": [0.2509999871253967, 0.2899999916553497, 0.31700000166893, 0.3409999907016754, 0.3425000011920929, 0.3485000133514404, 0.3555000126361847, 0.3574999868869781, 0.3585000038146972, 0.363999992609024, 0.3619999885559082, 0.3675000071525574, 0.3865000009536743, 0.3810000121593475, 0.3740000128746032, 0.3810000121593475, 0.3810000121593475, 0.3860000073909759, 0.3810000121593475, 0.3894999921321869, 0.3849999904632568, 0.3855000138282776, 0.3989999890327453, 0.3980000019073486, 0.3995000123977661, 0.395000010728836, 0.4084999859333038, 0.4040000140666961, 0.4004999995231628, 0.3955000042915344, 0.4135000109672546, 0.4070000052452087, 0.4104999899864197, 0.4014999866485595, 0.4099999964237213, 0.4199999868869781, 0.414000004529953, 0.402999997138977, 0.4214999973773956, 0.4095000028610229, 0.4059999883174896, 0.4090000092983246, 0.4074999988079071, 0.4120000004768371, 0.4154999852180481, 0.4189999997615814, 0.4149999916553497, 0.429500013589859, 0.4154999852180481, 0.4214999973773956, 0.4244999885559082, 0.4205000102519989, 0.4269999861717224, 0.4214999973773956, 0.4180000126361847, 0.4415000081062317, 0.4320000112056732, 0.4350000023841858, 0.4259999990463257, 0.4300000071525574, 0.4259999990463257, 0.4189999997615814, 0.4269999861717224, 0.4199999868869781, 0.426499992609024, 0.4350000023841858, 0.4289999902248382, 0.4345000088214874, 0.4259999990463257, 0.426499992609024, 0.4395000040531158, 0.4395000040531158, 0.4359999895095825, 0.4280000030994415, 0.4370000064373016, 0.4329999983310699, 0.4309999942779541, 0.4490000009536743, 0.4399999976158142, 0.4339999854564667, 0.4399999976158142, 0.4345000088214874, 0.429500013589859, 0.4370000064373016, 0.4379999935626983, 0.4284999966621399, 0.4309999942779541, 0.4350000023841858, 0.4399999976158142, 0.4314999878406524, 0.4300000071525574, 0.4410000145435333, 0.4345000088214874, 0.4410000145435333, 0.4345000088214874, 0.4339999854564667, 0.4460000097751617, 0.4410000145435333, 0.4469999969005584, 0.4480000138282776, 0.4435000121593475, 0.4375, 0.4519999921321869, 0.4480000138282776, 0.4429999887943268, 0.4519999921321869, 0.4435000121593475, 0.4334999918937683, 0.4460000097751617, 0.4564999938011169, 0.4469999969005584, 0.453000009059906, 0.4485000073909759, 0.4410000145435333, 0.4444999992847442, 0.4485000073909759, 0.457500010728836, 0.4469999969005584, 0.4535000026226043, 0.4535000026226043, 0.4485000073909759, 0.4490000009536743, 0.4505000114440918, 0.4595000147819519, 0.4544999897480011, 0.453000009059906, 0.4605000019073486, 0.4620000123977661, 0.457500010728836, 0.453000009059906, 0.4550000131130218, 0.460999995470047, 0.4449999928474426, 0.4474999904632568, 0.457500010728836, 0.4584999978542328, 0.4494999945163727, 0.4474999904632568, 0.4625000059604645, 0.4639999866485595, 0.4555000066757202, 0.4469999969005584, 0.4600000083446502, 0.453000009059906, 0.4629999995231628, 0.4589999914169311, 0.4614999890327453, 0.4555000066757202, 0.4560000002384186, 0.4580000042915344, 0.4584999978542328, 0.4560000002384186, 0.4605000019073486, 0.4595000147819519, 0.4639999866485595, 0.4614999890327453, 0.4564999938011169, 0.4634999930858612, 0.4625000059604645, 0.4614999890327453, 0.4679999947547912, 0.4584999978542328, 0.4595000147819519, 0.4505000114440918, 0.4544999897480011, 0.4595000147819519, 0.4620000123977661, 0.4670000076293945, 0.4555000066757202], "label": "RefinedWeb"}, "big-run-sampled_full_filtered_no_dedup": {"x": [0.0, 2.0971520000000003, 4.194304000000001, 6.291456, 8.388608000000001, 10.48576, 12.582912, 14.680064000000002, 16.777216000000003, 18.874368, 20.97152, 23.068672000000003, 25.165824, 27.262976000000002, 28.311552000000002, 29.360128000000003, 31.45728, 33.554432000000006, 35.651584, 37.748736, 39.845888, 41.94304, 44.040192000000005, 46.137344000000006, 48.234496, 50.331648, 52.4288, 54.525952000000004, 56.623104000000005, 58.720256000000006, 60.817408, 62.91456, 65.011712, 67.10886400000001, 69.206016, 71.303168, 73.40032000000001, 75.497472, 77.59462400000001, 79.691776, 81.788928, 83.88608, 85.983232, 88.08038400000001, 90.177536, 92.27468800000001, 94.37184, 96.468992, 98.56614400000001, 100.663296, 102.76044800000001, 104.8576, 106.95475200000001, 109.05190400000001, 111.149056, 113.24620800000001, 115.34336, 117.44051200000001, 119.537664, 121.634816, 123.73196800000001, 125.82912, 127.92627200000001, 130.023424, 132.120576, 134.21772800000002, 136.31488000000002, 138.412032, 140.509184, 142.606336, 144.70348800000002, 146.80064000000002, 148.897792, 150.994944, 153.092096, 155.18924800000002, 157.28640000000001, 159.383552, 161.480704, 163.577856, 165.67500800000002, 167.77216, 169.869312, 171.966464, 174.06361600000002, 176.16076800000002, 178.25792, 180.355072, 182.452224, 184.54937600000002, 186.64652800000002, 188.74368, 190.840832, 192.937984, 195.03513600000002, 197.13228800000002, 199.22944, 201.326592, 203.423744, 205.52089600000002, 207.61804800000002, 209.7152, 211.812352, 213.90950400000003, 216.00665600000002, 218.10380800000001, 220.20096, 222.298112, 224.39526400000003, 226.49241600000002, 228.589568, 230.68672, 232.783872, 234.88102400000002, 236.97817600000002, 239.075328, 241.17248, 243.269632, 245.36678400000002, 247.46393600000002, 249.561088, 251.65824, 253.75539200000003, 255.85254400000002, 257.949696, 260.046848, 262.144, 264.241152, 266.338304, 268.43545600000004, 270.53260800000004, 272.62976000000003, 274.726912, 276.824064, 278.921216, 281.018368, 283.11552, 285.212672, 287.309824, 289.40697600000004, 291.50412800000004, 293.60128000000003, 295.698432, 297.795584, 299.892736, 301.989888, 304.08704, 306.184192, 308.28134400000005, 310.37849600000004, 312.47564800000004, 314.57280000000003, 316.669952, 318.767104, 320.864256, 322.961408, 325.05856, 327.155712, 329.25286400000005, 331.35001600000004, 333.44716800000003, 335.54432, 337.641472, 339.738624, 341.835776, 343.932928, 346.03008, 348.12723200000005, 350.22438400000004], "y": [0.2509999871253967, 0.2894999980926513, 0.3235000073909759, 0.3389999866485595, 0.3384999930858612, 0.3459999859333038, 0.359499990940094, 0.3429999947547912, 0.3619999885559082, 0.3564999997615814, 0.3625000119209289, 0.363999992609024, 0.3680000007152557, 0.3680000007152557, 0.3785000145435333, 0.3684999942779541, 0.375, 0.3734999895095825, 0.3849999904632568, 0.3944999873638153, 0.3865000009536743, 0.395000010728836, 0.3935000002384186, 0.3980000019073486, 0.3910000026226043, 0.3885000050067901, 0.3914999961853027, 0.3815000057220459, 0.395000010728836, 0.3894999921321869, 0.395000010728836, 0.3935000002384186, 0.4034999907016754, 0.4004999995231628, 0.3970000147819519, 0.3975000083446502, 0.3995000123977661, 0.3980000019073486, 0.4034999907016754, 0.3959999978542328, 0.3989999890327453, 0.402999997138977, 0.3880000114440918, 0.3980000019073486, 0.4040000140666961, 0.3989999890327453, 0.3970000147819519, 0.3925000131130218, 0.4120000004768371, 0.3935000002384186, 0.395000010728836, 0.4070000052452087, 0.3935000002384186, 0.4034999907016754, 0.4189999997615814, 0.4129999876022339, 0.4160000085830688, 0.4149999916553497, 0.418500006198883, 0.4225000143051147, 0.4174999892711639, 0.4210000038146972, 0.4045000076293945, 0.4079999923706054, 0.4124999940395355, 0.4144999980926513, 0.4169999957084656, 0.4194999933242798, 0.4154999852180481, 0.4169999957084656, 0.4225000143051147, 0.4225000143051147, 0.4230000078678131, 0.4160000085830688, 0.4325000047683716, 0.4325000047683716, 0.4199999868869781, 0.4199999868869781, 0.4189999997615814, 0.4269999861717224, 0.4259999990463257, 0.4230000078678131, 0.4144999980926513, 0.4329999983310699, 0.4275000095367431, 0.4305000007152557, 0.4289999902248382, 0.4235000014305115, 0.4235000014305115, 0.4325000047683716, 0.4244999885559082, 0.4314999878406524, 0.4194999933242798, 0.4350000023841858, 0.4269999861717224, 0.4235000014305115, 0.4300000071525574, 0.4284999966621399, 0.4255000054836273, 0.4280000030994415, 0.4345000088214874, 0.4225000143051147, 0.4334999918937683, 0.4300000071525574, 0.4350000023841858, 0.429500013589859, 0.4325000047683716, 0.4384999871253967, 0.4345000088214874, 0.4354999959468841, 0.4359999895095825, 0.4354999959468841, 0.4424999952316284, 0.4424999952316284, 0.4320000112056732, 0.4280000030994415, 0.4390000104904175, 0.4480000138282776, 0.4415000081062317, 0.4384999871253967, 0.4390000104904175, 0.4494999945163727, 0.4449999928474426, 0.4384999871253967, 0.4424999952316284, 0.4359999895095825, 0.445499986410141, 0.4399999976158142, 0.4375, 0.4410000145435333, 0.4384999871253967, 0.4375, 0.4329999983310699, 0.4370000064373016, 0.4354999959468841, 0.4440000057220459, 0.4384999871253967, 0.4384999871253967, 0.4390000104904175, 0.4424999952316284, 0.4379999935626983, 0.4345000088214874, 0.4354999959468841, 0.4440000057220459, 0.4395000040531158, 0.4465000033378601, 0.4404999911785126, 0.4505000114440918, 0.4480000138282776, 0.4449999928474426, 0.445499986410141, 0.4410000145435333, 0.4485000073909759, 0.4460000097751617, 0.4480000138282776, 0.4465000033378601, 0.4460000097751617, 0.4460000097751617, 0.4395000040531158, 0.4474999904632568, 0.4469999969005584, 0.4404999911785126, 0.4440000057220459, 0.4435000121593475, 0.4435000121593475, 0.4514999985694885, 0.4474999904632568, 0.4474999904632568, 0.445499986410141], "label": "FineWeb filtered only"}, "big-run-fineweb-cross-dedup-fixed": {"x": [0.0, 2.0971520000000003, 4.194304000000001, 6.291456, 8.388608000000001, 10.48576, 12.582912, 14.680064000000002, 16.777216000000003, 18.874368, 20.97152, 23.068672000000003, 25.165824, 27.262976000000002, 29.360128000000003, 31.45728, 33.554432000000006, 35.651584, 37.748736, 39.845888, 41.94304, 44.040192000000005, 46.137344000000006, 48.234496, 50.331648, 52.4288, 54.525952000000004, 56.623104000000005, 58.720256000000006, 60.817408, 62.91456, 65.011712, 67.10886400000001, 69.206016, 71.303168, 73.40032000000001, 75.497472, 77.59462400000001, 79.691776, 81.788928, 83.88608, 85.983232, 88.08038400000001, 90.177536, 92.27468800000001, 94.37184, 96.468992, 98.56614400000001, 100.663296, 102.76044800000001, 104.8576, 106.95475200000001, 109.05190400000001, 111.149056, 113.24620800000001, 115.34336, 117.44051200000001, 119.537664, 121.634816, 123.73196800000001, 125.82912, 127.92627200000001, 130.023424, 132.120576, 134.21772800000002, 136.31488000000002, 138.412032, 140.509184, 142.606336, 144.70348800000002, 146.80064000000002, 148.897792, 150.994944, 153.092096, 155.18924800000002, 157.28640000000001, 159.383552, 161.480704, 163.577856, 165.67500800000002, 167.77216, 169.869312, 171.966464, 174.06361600000002, 176.16076800000002, 178.25792, 180.355072, 182.452224, 184.54937600000002, 186.64652800000002, 188.74368, 190.840832, 192.937984, 195.03513600000002, 197.13228800000002, 199.22944, 201.326592, 203.423744, 205.52089600000002, 207.61804800000002, 209.7152, 211.812352, 213.90950400000003, 216.00665600000002, 218.10380800000001, 220.20096, 222.298112, 224.39526400000003, 226.49241600000002, 228.589568, 230.68672, 232.783872, 234.88102400000002, 236.97817600000002, 239.075328, 241.17248, 243.269632, 245.36678400000002, 247.46393600000002, 249.561088, 251.65824, 253.75539200000003, 255.85254400000002, 257.949696, 260.046848, 262.144, 264.241152, 266.338304, 268.43545600000004, 270.53260800000004, 272.62976000000003, 274.726912, 276.824064, 278.921216, 281.018368, 283.11552, 285.212672, 287.309824, 289.40697600000004, 291.50412800000004, 293.60128000000003, 295.698432, 297.795584, 299.892736, 301.989888, 304.08704, 306.184192, 308.28134400000005, 310.37849600000004, 312.47564800000004, 314.57280000000003, 316.669952, 318.767104, 320.864256, 322.961408, 325.05856, 327.155712, 329.25286400000005, 331.35001600000004, 333.44716800000003, 335.54432, 337.641472, 339.738624, 341.835776, 343.932928, 346.03008, 348.12723200000005, 350.22438400000004], "y": [0.2509999871253967, 0.2904999852180481, 0.3289999961853027, 0.3379999995231628, 0.3400000035762787, 0.3535000085830688, 0.3700000047683716, 0.3619999885559082, 0.3695000112056732, 0.3625000119209289, 0.3745000064373016, 0.3804999887943268, 0.3835000097751617, 0.3810000121593475, 0.3785000145435333, 0.3799999952316284, 0.3885000050067901, 0.3919999897480011, 0.3899999856948852, 0.3939999938011169, 0.4004999995231628, 0.3889999985694885, 0.4000000059604645, 0.3930000066757202, 0.4025000035762787, 0.398499995470047, 0.3939999938011169, 0.3989999890327453, 0.4020000100135803, 0.4079999923706054, 0.4129999876022339, 0.4014999866485595, 0.4129999876022339, 0.4079999923706054, 0.4115000069141388, 0.4070000052452087, 0.4095000028610229, 0.4199999868869781, 0.4165000021457672, 0.4239999949932098, 0.4129999876022339, 0.4034999907016754, 0.4050000011920929, 0.4135000109672546, 0.4189999997615814, 0.418500006198883, 0.4199999868869781, 0.4365000128746032, 0.4320000112056732, 0.4255000054836273, 0.4259999990463257, 0.4244999885559082, 0.4275000095367431, 0.4259999990463257, 0.4210000038146972, 0.421999990940094, 0.4099999964237213, 0.4305000007152557, 0.4239999949932098, 0.4194999933242798, 0.4205000102519989, 0.4255000054836273, 0.414000004529953, 0.4210000038146972, 0.4180000126361847, 0.4429999887943268, 0.429500013589859, 0.4165000021457672, 0.4239999949932098, 0.4255000054836273, 0.4180000126361847, 0.4325000047683716, 0.4305000007152557, 0.4329999983310699, 0.4325000047683716, 0.4320000112056732, 0.4375, 0.4410000145435333, 0.4395000040531158, 0.4379999935626983, 0.4280000030994415, 0.4365000128746032, 0.4205000102519989, 0.426499992609024, 0.4280000030994415, 0.4354999959468841, 0.4314999878406524, 0.429500013589859, 0.421999990940094, 0.4345000088214874, 0.429500013589859, 0.4354999959468841, 0.4314999878406524, 0.4404999911785126, 0.4384999871253967, 0.4359999895095825, 0.4345000088214874, 0.4320000112056732, 0.4345000088214874, 0.4375, 0.4410000145435333, 0.4280000030994415, 0.4320000112056732, 0.44200000166893, 0.4460000097751617, 0.4390000104904175, 0.4314999878406524, 0.4339999854564667, 0.4390000104904175, 0.4460000097751617, 0.4309999942779541, 0.4444999992847442, 0.44200000166893, 0.4404999911785126, 0.4395000040531158, 0.4370000064373016, 0.4519999921321869, 0.4429999887943268, 0.4395000040531158, 0.4415000081062317, 0.4384999871253967, 0.4494999945163727, 0.4469999969005584, 0.4375, 0.4395000040531158, 0.4345000088214874, 0.4390000104904175, 0.4375, 0.4309999942779541, 0.4320000112056732, 0.4415000081062317, 0.4354999959468841, 0.445499986410141, 0.4404999911785126, 0.4429999887943268, 0.4395000040531158, 0.4354999959468841, 0.4429999887943268, 0.4410000145435333, 0.4494999945163727, 0.4429999887943268, 0.4460000097751617, 0.445499986410141, 0.4429999887943268, 0.4429999887943268, 0.4350000023841858, 0.4474999904632568, 0.4415000081062317, 0.4424999952316284, 0.4375, 0.4444999992847442, 0.4424999952316284, 0.4354999959468841, 0.445499986410141, 0.4379999935626983, 0.4449999928474426, 0.4365000128746032, 0.4474999904632568, 0.4440000057220459, 0.4465000033378601, 0.445499986410141, 0.4474999904632568, 0.4494999945163727, 0.4449999928474426, 0.4444999992847442, 0.44200000166893, 0.4345000088214874, 0.4404999911785126], "label": "FineWeb full MinHash"}}, "layout": {"title": {"text": "Dedup across all dumps does not improve performance"}}}
assets/data/plots/all_dumps_bad/commonsense_qa_acc_norm.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"data": {"big-run-refinedweb": {"x": [0.0, 2.0971520000000003, 4.194304000000001, 6.291456, 8.388608000000001, 10.48576, 12.582912, 14.680064000000002, 16.777216000000003, 18.874368, 20.97152, 23.068672000000003, 25.165824, 27.262976000000002, 28.311552000000002, 29.360128000000003, 31.45728, 33.554432000000006, 35.651584, 37.748736, 39.845888, 41.94304, 44.040192000000005, 46.137344000000006, 48.234496, 50.331648, 52.4288, 54.525952000000004, 56.623104000000005, 58.720256000000006, 60.817408, 62.91456, 65.011712, 67.10886400000001, 69.206016, 71.303168, 73.40032000000001, 75.497472, 77.59462400000001, 79.691776, 81.788928, 83.88608, 85.983232, 88.08038400000001, 90.177536, 92.27468800000001, 94.37184, 96.468992, 98.56614400000001, 100.663296, 102.76044800000001, 104.8576, 106.95475200000001, 109.05190400000001, 111.149056, 113.24620800000001, 115.34336, 117.44051200000001, 119.537664, 121.634816, 123.73196800000001, 125.82912, 127.92627200000001, 130.023424, 132.120576, 134.21772800000002, 136.31488000000002, 138.412032, 140.509184, 142.606336, 144.70348800000002, 146.80064000000002, 148.897792, 150.994944, 153.092096, 155.18924800000002, 157.28640000000001, 159.383552, 161.480704, 163.577856, 165.67500800000002, 167.77216, 169.869312, 171.966464, 174.06361600000002, 176.16076800000002, 178.25792, 180.355072, 182.452224, 184.54937600000002, 186.64652800000002, 188.74368, 190.840832, 192.937984, 195.03513600000002, 197.13228800000002, 199.22944, 201.326592, 203.423744, 205.52089600000002, 207.61804800000002, 209.7152, 211.812352, 213.90950400000003, 216.00665600000002, 218.10380800000001, 220.20096, 222.298112, 224.39526400000003, 226.49241600000002, 228.589568, 230.68672, 232.783872, 234.88102400000002, 236.97817600000002, 239.075328, 241.17248, 243.269632, 245.36678400000002, 247.46393600000002, 249.561088, 251.65824, 253.75539200000003, 255.85254400000002, 257.949696, 260.046848, 262.144, 264.241152, 266.338304, 268.43545600000004, 270.53260800000004, 272.62976000000003, 274.726912, 276.824064, 278.921216, 281.018368, 283.11552, 285.212672, 287.309824, 289.40697600000004, 291.50412800000004, 293.60128000000003, 295.698432, 297.795584, 299.892736, 301.989888, 304.08704, 306.184192, 308.28134400000005, 310.37849600000004, 312.47564800000004, 314.57280000000003, 316.669952, 318.767104, 320.864256, 322.961408, 325.05856, 327.155712, 329.25286400000005, 331.35001600000004, 333.44716800000003, 335.54432, 337.641472, 339.738624, 341.835776, 343.932928, 346.03008, 348.12723200000005, 350.22438400000004], "y": [0.2329999953508377, 0.2529999911785126, 0.2800000011920929, 0.2870000004768371, 0.3179999887943268, 0.3129999935626983, 0.3210000097751617, 0.3160000145435333, 0.3210000097751617, 0.31700000166893, 0.3330000042915344, 0.3389999866485595, 0.3289999961853027, 0.3429999947547912, 0.3370000123977661, 0.3379999995231628, 0.3459999859333038, 0.3490000069141388, 0.3470000028610229, 0.3600000143051147, 0.3569999933242798, 0.3449999988079071, 0.3650000095367431, 0.3499999940395355, 0.3540000021457672, 0.3569999933242798, 0.3619999885559082, 0.3619999885559082, 0.3580000102519989, 0.3740000128746032, 0.3709999918937683, 0.3720000088214874, 0.3759999871253967, 0.3720000088214874, 0.3659999966621399, 0.3790000081062317, 0.3610000014305115, 0.3650000095367431, 0.3650000095367431, 0.3720000088214874, 0.3729999959468841, 0.3790000081062317, 0.3680000007152557, 0.3659999966621399, 0.3680000007152557, 0.3619999885559082, 0.3619999885559082, 0.3729999959468841, 0.3720000088214874, 0.3650000095367431, 0.3759999871253967, 0.367000013589859, 0.3650000095367431, 0.3680000007152557, 0.3580000102519989, 0.3589999973773956, 0.3700000047683716, 0.3680000007152557, 0.367000013589859, 0.3709999918937683, 0.3880000114440918, 0.3810000121593475, 0.375, 0.4040000140666961, 0.3860000073909759, 0.3840000033378601, 0.3779999911785126, 0.3729999959468841, 0.3720000088214874, 0.3799999952316284, 0.3799999952316284, 0.3779999911785126, 0.3689999878406524, 0.3770000040531158, 0.3740000128746032, 0.3819999992847442, 0.3899999856948852, 0.3799999952316284, 0.3919999897480011, 0.3720000088214874, 0.3770000040531158, 0.3930000066757202, 0.3849999904632568, 0.3899999856948852, 0.3740000128746032, 0.3740000128746032, 0.3799999952316284, 0.3779999911785126, 0.3880000114440918, 0.3709999918937683, 0.3810000121593475, 0.3880000114440918, 0.3980000019073486, 0.3819999992847442, 0.3849999904632568, 0.3810000121593475, 0.3819999992847442, 0.3889999985694885, 0.3840000033378601, 0.3910000026226043, 0.3899999856948852, 0.3959999978542328, 0.3880000114440918, 0.3869999945163727, 0.3779999911785126, 0.3819999992847442, 0.3919999897480011, 0.3849999904632568, 0.3860000073909759, 0.3919999897480011, 0.3819999992847442, 0.3819999992847442, 0.3889999985694885, 0.3889999985694885, 0.3860000073909759, 0.3880000114440918, 0.3889999985694885, 0.3939999938011169, 0.3899999856948852, 0.3869999945163727, 0.3910000026226043, 0.3910000026226043, 0.3910000026226043, 0.3970000147819519, 0.3970000147819519, 0.3970000147819519, 0.3970000147819519, 0.3939999938011169, 0.4000000059604645, 0.3970000147819519, 0.402999997138977, 0.3959999978542328, 0.3959999978542328, 0.4000000059604645, 0.4040000140666961, 0.4020000100135803, 0.3989999890327453, 0.3919999897480011, 0.3930000066757202, 0.3930000066757202, 0.3980000019073486, 0.4000000059604645, 0.395000010728836, 0.3899999856948852, 0.4059999883174896, 0.4020000100135803, 0.4020000100135803, 0.4059999883174896, 0.3970000147819519, 0.4110000133514404, 0.4050000011920929, 0.4000000059604645, 0.4090000092983246, 0.3989999890327453, 0.402999997138977, 0.4009999930858612, 0.3980000019073486, 0.4090000092983246, 0.4079999923706054, 0.4079999923706054, 0.4020000100135803, 0.402999997138977, 0.402999997138977, 0.4059999883174896, 0.4040000140666961, 0.4059999883174896, 0.3989999890327453, 0.4070000052452087, 0.4059999883174896], "label": "RefinedWeb"}, "big-run-fineweb-cross-dedup-fixed": {"x": [0.0, 2.0971520000000003, 4.194304000000001, 6.291456, 8.388608000000001, 10.48576, 12.582912, 14.680064000000002, 16.777216000000003, 18.874368, 20.97152, 23.068672000000003, 25.165824, 27.262976000000002, 29.360128000000003, 31.45728, 33.554432000000006, 35.651584, 37.748736, 39.845888, 41.94304, 44.040192000000005, 46.137344000000006, 48.234496, 50.331648, 52.4288, 54.525952000000004, 56.623104000000005, 58.720256000000006, 60.817408, 62.91456, 65.011712, 67.10886400000001, 69.206016, 71.303168, 73.40032000000001, 75.497472, 77.59462400000001, 79.691776, 81.788928, 83.88608, 85.983232, 88.08038400000001, 90.177536, 92.27468800000001, 94.37184, 96.468992, 98.56614400000001, 100.663296, 102.76044800000001, 104.8576, 106.95475200000001, 109.05190400000001, 111.149056, 113.24620800000001, 115.34336, 117.44051200000001, 119.537664, 121.634816, 123.73196800000001, 125.82912, 127.92627200000001, 130.023424, 132.120576, 134.21772800000002, 136.31488000000002, 138.412032, 140.509184, 142.606336, 144.70348800000002, 146.80064000000002, 148.897792, 150.994944, 153.092096, 155.18924800000002, 157.28640000000001, 159.383552, 161.480704, 163.577856, 165.67500800000002, 167.77216, 169.869312, 171.966464, 174.06361600000002, 176.16076800000002, 178.25792, 180.355072, 182.452224, 184.54937600000002, 186.64652800000002, 188.74368, 190.840832, 192.937984, 195.03513600000002, 197.13228800000002, 199.22944, 201.326592, 203.423744, 205.52089600000002, 207.61804800000002, 209.7152, 211.812352, 213.90950400000003, 216.00665600000002, 218.10380800000001, 220.20096, 222.298112, 224.39526400000003, 226.49241600000002, 228.589568, 230.68672, 232.783872, 234.88102400000002, 236.97817600000002, 239.075328, 241.17248, 243.269632, 245.36678400000002, 247.46393600000002, 249.561088, 251.65824, 253.75539200000003, 255.85254400000002, 257.949696, 260.046848, 262.144, 264.241152, 266.338304, 268.43545600000004, 270.53260800000004, 272.62976000000003, 274.726912, 276.824064, 278.921216, 281.018368, 283.11552, 285.212672, 287.309824, 289.40697600000004, 291.50412800000004, 293.60128000000003, 295.698432, 297.795584, 299.892736, 301.989888, 304.08704, 306.184192, 308.28134400000005, 310.37849600000004, 312.47564800000004, 314.57280000000003, 316.669952, 318.767104, 320.864256, 322.961408, 325.05856, 327.155712, 329.25286400000005, 331.35001600000004, 333.44716800000003, 335.54432, 337.641472, 339.738624, 341.835776, 343.932928, 346.03008, 348.12723200000005, 350.22438400000004], "y": [0.2329999953508377, 0.2540000081062317, 0.2870000004768371, 0.2829999923706054, 0.3210000097751617, 0.3079999983310699, 0.3230000138282776, 0.3179999887943268, 0.3160000145435333, 0.3289999961853027, 0.3199999928474426, 0.324999988079071, 0.3310000002384186, 0.3260000050067901, 0.335999995470047, 0.335999995470047, 0.3310000002384186, 0.335999995470047, 0.3339999914169311, 0.3459999859333038, 0.3330000042915344, 0.3449999988079071, 0.3429999947547912, 0.3479999899864197, 0.3420000076293945, 0.3479999899864197, 0.3459999859333038, 0.3339999914169311, 0.3350000083446502, 0.3519999980926513, 0.3440000116825104, 0.3490000069141388, 0.3379999995231628, 0.3420000076293945, 0.3610000014305115, 0.3409999907016754, 0.356000006198883, 0.3630000054836273, 0.3519999980926513, 0.3510000109672546, 0.3619999885559082, 0.3569999933242798, 0.3479999899864197, 0.3529999852180481, 0.3569999933242798, 0.3529999852180481, 0.3519999980926513, 0.3549999892711639, 0.356000006198883, 0.3499999940395355, 0.3479999899864197, 0.3619999885559082, 0.3459999859333038, 0.3519999980926513, 0.3529999852180481, 0.3680000007152557, 0.3519999980926513, 0.3580000102519989, 0.3549999892711639, 0.3490000069141388, 0.3499999940395355, 0.3600000143051147, 0.3709999918937683, 0.3659999966621399, 0.3569999933242798, 0.3510000109672546, 0.3600000143051147, 0.367000013589859, 0.3529999852180481, 0.363999992609024, 0.3630000054836273, 0.3619999885559082, 0.356000006198883, 0.367000013589859, 0.3600000143051147, 0.3540000021457672, 0.3589999973773956, 0.3610000014305115, 0.356000006198883, 0.3680000007152557, 0.3519999980926513, 0.3549999892711639, 0.3479999899864197, 0.3549999892711639, 0.3519999980926513, 0.367000013589859, 0.3600000143051147, 0.3600000143051147, 0.3680000007152557, 0.356000006198883, 0.3610000014305115, 0.3689999878406524, 0.367000013589859, 0.3689999878406524, 0.3720000088214874, 0.3680000007152557, 0.3569999933242798, 0.3650000095367431, 0.363999992609024, 0.3610000014305115, 0.3709999918937683, 0.3569999933242798, 0.3540000021457672, 0.3619999885559082, 0.3549999892711639, 0.3650000095367431, 0.3680000007152557, 0.3589999973773956, 0.356000006198883, 0.3610000014305115, 0.3619999885559082, 0.3740000128746032, 0.3700000047683716, 0.3650000095367431, 0.3819999992847442, 0.3770000040531158, 0.3810000121593475, 0.3729999959468841, 0.3680000007152557, 0.3689999878406524, 0.3740000128746032, 0.3779999911785126, 0.3720000088214874, 0.3740000128746032, 0.367000013589859, 0.363999992609024, 0.367000013589859, 0.3689999878406524, 0.3709999918937683, 0.3709999918937683, 0.375, 0.3680000007152557, 0.375, 0.3630000054836273, 0.3720000088214874, 0.3819999992847442, 0.3729999959468841, 0.3689999878406524, 0.363999992609024, 0.3709999918937683, 0.3659999966621399, 0.3700000047683716, 0.367000013589859, 0.3709999918937683, 0.3759999871253967, 0.3759999871253967, 0.3729999959468841, 0.3729999959468841, 0.3729999959468841, 0.3779999911785126, 0.375, 0.3700000047683716, 0.3659999966621399, 0.3759999871253967, 0.3779999911785126, 0.3709999918937683, 0.3840000033378601, 0.3720000088214874, 0.375, 0.367000013589859, 0.3770000040531158, 0.3709999918937683, 0.375, 0.3709999918937683, 0.3740000128746032, 0.3740000128746032, 0.375, 0.3770000040531158], "label": "FineWeb full MinHash"}, "big-run-sampled_full_filtered_no_dedup": {"x": [0.0, 2.0971520000000003, 4.194304000000001, 6.291456, 8.388608000000001, 10.48576, 12.582912, 14.680064000000002, 16.777216000000003, 18.874368, 20.97152, 23.068672000000003, 25.165824, 27.262976000000002, 28.311552000000002, 29.360128000000003, 31.45728, 33.554432000000006, 35.651584, 37.748736, 39.845888, 41.94304, 44.040192000000005, 46.137344000000006, 48.234496, 50.331648, 52.4288, 54.525952000000004, 56.623104000000005, 58.720256000000006, 60.817408, 62.91456, 65.011712, 67.10886400000001, 69.206016, 71.303168, 73.40032000000001, 75.497472, 77.59462400000001, 79.691776, 81.788928, 83.88608, 85.983232, 88.08038400000001, 90.177536, 92.27468800000001, 94.37184, 96.468992, 98.56614400000001, 100.663296, 102.76044800000001, 104.8576, 106.95475200000001, 109.05190400000001, 111.149056, 113.24620800000001, 115.34336, 117.44051200000001, 119.537664, 121.634816, 123.73196800000001, 125.82912, 127.92627200000001, 130.023424, 132.120576, 134.21772800000002, 136.31488000000002, 138.412032, 140.509184, 142.606336, 144.70348800000002, 146.80064000000002, 148.897792, 150.994944, 153.092096, 155.18924800000002, 157.28640000000001, 159.383552, 161.480704, 163.577856, 165.67500800000002, 167.77216, 169.869312, 171.966464, 174.06361600000002, 176.16076800000002, 178.25792, 180.355072, 182.452224, 184.54937600000002, 186.64652800000002, 188.74368, 190.840832, 192.937984, 195.03513600000002, 197.13228800000002, 199.22944, 201.326592, 203.423744, 205.52089600000002, 207.61804800000002, 209.7152, 211.812352, 213.90950400000003, 216.00665600000002, 218.10380800000001, 220.20096, 222.298112, 224.39526400000003, 226.49241600000002, 228.589568, 230.68672, 232.783872, 234.88102400000002, 236.97817600000002, 239.075328, 241.17248, 243.269632, 245.36678400000002, 247.46393600000002, 249.561088, 251.65824, 253.75539200000003, 255.85254400000002, 257.949696, 260.046848, 262.144, 264.241152, 266.338304, 268.43545600000004, 270.53260800000004, 272.62976000000003, 274.726912, 276.824064, 278.921216, 281.018368, 283.11552, 285.212672, 287.309824, 289.40697600000004, 291.50412800000004, 293.60128000000003, 295.698432, 297.795584, 299.892736, 301.989888, 304.08704, 306.184192, 308.28134400000005, 310.37849600000004, 312.47564800000004, 314.57280000000003, 316.669952, 318.767104, 320.864256, 322.961408, 325.05856, 327.155712, 329.25286400000005, 331.35001600000004, 333.44716800000003, 335.54432, 337.641472, 339.738624, 341.835776, 343.932928, 346.03008, 348.12723200000005, 350.22438400000004], "y": [0.2329999953508377, 0.2599999904632568, 0.277999997138977, 0.2910000085830688, 0.3070000112056732, 0.3140000104904175, 0.3019999861717224, 0.3059999942779541, 0.3210000097751617, 0.3230000138282776, 0.324999988079071, 0.3149999976158142, 0.3109999895095825, 0.3339999914169311, 0.3289999961853027, 0.3319999873638153, 0.3319999873638153, 0.3300000131130218, 0.3370000123977661, 0.3219999969005584, 0.3370000123977661, 0.328000009059906, 0.3339999914169311, 0.3420000076293945, 0.3400000035762787, 0.3440000116825104, 0.3510000109672546, 0.3409999907016754, 0.3449999988079071, 0.3339999914169311, 0.3540000021457672, 0.3339999914169311, 0.3470000028610229, 0.3470000028610229, 0.3440000116825104, 0.3589999973773956, 0.3569999933242798, 0.3630000054836273, 0.3549999892711639, 0.3589999973773956, 0.3449999988079071, 0.3549999892711639, 0.3449999988079071, 0.3389999866485595, 0.3499999940395355, 0.3610000014305115, 0.3619999885559082, 0.3600000143051147, 0.3519999980926513, 0.3479999899864197, 0.356000006198883, 0.3519999980926513, 0.3440000116825104, 0.3490000069141388, 0.3519999980926513, 0.3470000028610229, 0.3589999973773956, 0.3449999988079071, 0.3490000069141388, 0.356000006198883, 0.3619999885559082, 0.3569999933242798, 0.3659999966621399, 0.3610000014305115, 0.3549999892711639, 0.3700000047683716, 0.363999992609024, 0.3600000143051147, 0.3580000102519989, 0.3549999892711639, 0.3619999885559082, 0.3689999878406524, 0.3630000054836273, 0.363999992609024, 0.3700000047683716, 0.367000013589859, 0.3630000054836273, 0.3630000054836273, 0.3700000047683716, 0.3589999973773956, 0.3540000021457672, 0.3540000021457672, 0.3659999966621399, 0.3619999885559082, 0.3589999973773956, 0.3650000095367431, 0.3709999918937683, 0.3680000007152557, 0.3689999878406524, 0.3650000095367431, 0.3729999959468841, 0.3619999885559082, 0.3689999878406524, 0.3569999933242798, 0.3510000109672546, 0.3680000007152557, 0.363999992609024, 0.3700000047683716, 0.3659999966621399, 0.3659999966621399, 0.363999992609024, 0.3619999885559082, 0.3659999966621399, 0.3680000007152557, 0.3610000014305115, 0.3720000088214874, 0.3729999959468841, 0.3810000121593475, 0.3630000054836273, 0.3689999878406524, 0.3709999918937683, 0.3759999871253967, 0.382999986410141, 0.3729999959468841, 0.3720000088214874, 0.3680000007152557, 0.3659999966621399, 0.3650000095367431, 0.363999992609024, 0.3589999973773956, 0.356000006198883, 0.3650000095367431, 0.3659999966621399, 0.367000013589859, 0.3729999959468841, 0.3720000088214874, 0.375, 0.3740000128746032, 0.3700000047683716, 0.3569999933242798, 0.3759999871253967, 0.3740000128746032, 0.367000013589859, 0.3770000040531158, 0.3759999871253967, 0.3709999918937683, 0.3779999911785126, 0.3709999918937683, 0.3689999878406524, 0.3799999952316284, 0.3630000054836273, 0.375, 0.3700000047683716, 0.3700000047683716, 0.3729999959468841, 0.3720000088214874, 0.3790000081062317, 0.375, 0.3729999959468841, 0.3770000040531158, 0.3799999952316284, 0.3779999911785126, 0.3720000088214874, 0.3799999952316284, 0.3759999871253967, 0.3799999952316284, 0.3790000081062317, 0.375, 0.3740000128746032, 0.3729999959468841, 0.3840000033378601, 0.3659999966621399, 0.3759999871253967, 0.3720000088214874, 0.3720000088214874, 0.3759999871253967, 0.375, 0.3650000095367431, 0.3729999959468841], "label": "FineWeb filtered only"}}, "layout": {"title": {"text": "Dedup across all dumps does not improve performance"}}}
assets/data/plots/all_dumps_bad/hellaswag_acc_norm.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"data": {"big-run-refinedweb": {"x": [0.0, 2.0971520000000003, 4.194304000000001, 6.291456, 8.388608000000001, 10.48576, 12.582912, 14.680064000000002, 16.777216000000003, 18.874368, 20.97152, 23.068672000000003, 25.165824, 27.262976000000002, 28.311552000000002, 29.360128000000003, 31.45728, 33.554432000000006, 35.651584, 37.748736, 39.845888, 41.94304, 44.040192000000005, 46.137344000000006, 48.234496, 50.331648, 52.4288, 54.525952000000004, 56.623104000000005, 58.720256000000006, 60.817408, 62.91456, 65.011712, 67.10886400000001, 69.206016, 71.303168, 73.40032000000001, 75.497472, 77.59462400000001, 79.691776, 81.788928, 83.88608, 85.983232, 88.08038400000001, 90.177536, 92.27468800000001, 94.37184, 96.468992, 98.56614400000001, 100.663296, 102.76044800000001, 104.8576, 106.95475200000001, 109.05190400000001, 111.149056, 113.24620800000001, 115.34336, 117.44051200000001, 119.537664, 121.634816, 123.73196800000001, 125.82912, 127.92627200000001, 130.023424, 132.120576, 134.21772800000002, 136.31488000000002, 138.412032, 140.509184, 142.606336, 144.70348800000002, 146.80064000000002, 148.897792, 150.994944, 153.092096, 155.18924800000002, 157.28640000000001, 159.383552, 161.480704, 163.577856, 165.67500800000002, 167.77216, 169.869312, 171.966464, 174.06361600000002, 176.16076800000002, 178.25792, 180.355072, 182.452224, 184.54937600000002, 186.64652800000002, 188.74368, 190.840832, 192.937984, 195.03513600000002, 197.13228800000002, 199.22944, 201.326592, 203.423744, 205.52089600000002, 207.61804800000002, 209.7152, 211.812352, 213.90950400000003, 216.00665600000002, 218.10380800000001, 220.20096, 222.298112, 224.39526400000003, 226.49241600000002, 228.589568, 230.68672, 232.783872, 234.88102400000002, 236.97817600000002, 239.075328, 241.17248, 243.269632, 245.36678400000002, 247.46393600000002, 249.561088, 251.65824, 253.75539200000003, 255.85254400000002, 257.949696, 260.046848, 262.144, 264.241152, 266.338304, 268.43545600000004, 270.53260800000004, 272.62976000000003, 274.726912, 276.824064, 278.921216, 281.018368, 283.11552, 285.212672, 287.309824, 289.40697600000004, 291.50412800000004, 293.60128000000003, 295.698432, 297.795584, 299.892736, 301.989888, 304.08704, 306.184192, 308.28134400000005, 310.37849600000004, 312.47564800000004, 314.57280000000003, 316.669952, 318.767104, 320.864256, 322.961408, 325.05856, 327.155712, 329.25286400000005, 331.35001600000004, 333.44716800000003, 335.54432, 337.641472, 339.738624, 341.835776, 343.932928, 346.03008, 348.12723200000005, 350.22438400000004], "y": [0.257999986410141, 0.2759999930858612, 0.328000009059906, 0.3499999940395355, 0.3889999985694885, 0.3910000026226043, 0.402999997138977, 0.4210000038146972, 0.4280000030994415, 0.4359999895095825, 0.4469999969005584, 0.4440000057220459, 0.4600000083446502, 0.4690000116825104, 0.4600000083446502, 0.4679999947547912, 0.4729999899864197, 0.4760000109672546, 0.4839999973773956, 0.4939999878406524, 0.488999992609024, 0.4990000128746032, 0.4979999959468841, 0.4979999959468841, 0.5009999871253967, 0.5, 0.5090000033378601, 0.5070000290870667, 0.5180000066757202, 0.5199999809265137, 0.5109999775886536, 0.5130000114440918, 0.5249999761581421, 0.5149999856948853, 0.5299999713897705, 0.5339999794960022, 0.5189999938011169, 0.5289999842643738, 0.5249999761581421, 0.5320000052452087, 0.5460000038146973, 0.5419999957084656, 0.5260000228881836, 0.5289999842643738, 0.546999990940094, 0.5419999957084656, 0.5419999957084656, 0.5460000038146973, 0.5419999957084656, 0.5389999747276306, 0.5440000295639038, 0.5569999814033508, 0.5450000166893005, 0.5329999923706055, 0.5580000281333923, 0.5339999794960022, 0.5540000200271606, 0.5460000038146973, 0.5479999780654907, 0.5529999732971191, 0.5540000200271606, 0.5619999766349792, 0.5490000247955322, 0.5410000085830688, 0.5490000247955322, 0.5569999814033508, 0.550000011920929, 0.5479999780654907, 0.5630000233650208, 0.546999990940094, 0.5559999942779541, 0.5600000023841858, 0.5509999990463257, 0.5569999814033508, 0.5569999814033508, 0.5580000281333923, 0.5619999766349792, 0.5580000281333923, 0.5669999718666077, 0.5569999814033508, 0.5709999799728394, 0.5529999732971191, 0.5649999976158142, 0.5659999847412109, 0.5659999847412109, 0.5690000057220459, 0.5600000023841858, 0.5580000281333923, 0.5540000200271606, 0.5640000104904175, 0.5680000185966492, 0.5709999799728394, 0.5649999976158142, 0.5680000185966492, 0.5730000138282776, 0.5640000104904175, 0.5799999833106995, 0.5699999928474426, 0.5669999718666077, 0.5680000185966492, 0.5770000219345093, 0.5709999799728394, 0.5759999752044678, 0.5690000057220459, 0.5789999961853027, 0.5740000009536743, 0.5709999799728394, 0.5789999961853027, 0.5709999799728394, 0.5770000219345093, 0.5770000219345093, 0.5730000138282776, 0.5809999704360962, 0.5720000267028809, 0.5849999785423279, 0.5820000171661377, 0.5799999833106995, 0.5830000042915344, 0.5759999752044678, 0.5730000138282776, 0.5799999833106995, 0.5830000042915344, 0.5860000252723694, 0.5789999961853027, 0.5789999961853027, 0.5860000252723694, 0.5979999899864197, 0.5920000076293945, 0.5820000171661377, 0.5870000123977661, 0.5889999866485596, 0.5839999914169312, 0.5849999785423279, 0.5899999737739563, 0.5920000076293945, 0.593999981880188, 0.597000002861023, 0.5889999866485596, 0.5889999866485596, 0.5849999785423279, 0.5899999737739563, 0.5989999771118164, 0.5899999737739563, 0.5839999914169312, 0.5910000205039978, 0.5910000205039978, 0.5929999947547913, 0.5920000076293945, 0.5929999947547913, 0.5889999866485596, 0.5899999737739563, 0.593999981880188, 0.5910000205039978, 0.5960000157356262, 0.5920000076293945, 0.5889999866485596, 0.593999981880188, 0.5879999995231628, 0.5960000157356262, 0.5920000076293945, 0.5960000157356262, 0.5960000157356262, 0.5920000076293945, 0.6010000109672546, 0.5920000076293945, 0.5899999737739563, 0.5889999866485596, 0.5920000076293945, 0.6019999980926514], "label": "RefinedWeb"}, "big-run-sampled_full_filtered_no_dedup": {"x": [0.0, 2.0971520000000003, 4.194304000000001, 6.291456, 8.388608000000001, 10.48576, 12.582912, 14.680064000000002, 16.777216000000003, 18.874368, 20.97152, 23.068672000000003, 25.165824, 27.262976000000002, 28.311552000000002, 29.360128000000003, 31.45728, 33.554432000000006, 35.651584, 37.748736, 39.845888, 41.94304, 44.040192000000005, 46.137344000000006, 48.234496, 50.331648, 52.4288, 54.525952000000004, 56.623104000000005, 58.720256000000006, 60.817408, 62.91456, 65.011712, 67.10886400000001, 69.206016, 71.303168, 73.40032000000001, 75.497472, 77.59462400000001, 79.691776, 81.788928, 83.88608, 85.983232, 88.08038400000001, 90.177536, 92.27468800000001, 94.37184, 96.468992, 98.56614400000001, 100.663296, 102.76044800000001, 104.8576, 106.95475200000001, 109.05190400000001, 111.149056, 113.24620800000001, 115.34336, 117.44051200000001, 119.537664, 121.634816, 123.73196800000001, 125.82912, 127.92627200000001, 130.023424, 132.120576, 134.21772800000002, 136.31488000000002, 138.412032, 140.509184, 142.606336, 144.70348800000002, 146.80064000000002, 148.897792, 150.994944, 153.092096, 155.18924800000002, 157.28640000000001, 159.383552, 161.480704, 163.577856, 165.67500800000002, 167.77216, 169.869312, 171.966464, 174.06361600000002, 176.16076800000002, 178.25792, 180.355072, 182.452224, 184.54937600000002, 186.64652800000002, 188.74368, 190.840832, 192.937984, 195.03513600000002, 197.13228800000002, 199.22944, 201.326592, 203.423744, 205.52089600000002, 207.61804800000002, 209.7152, 211.812352, 213.90950400000003, 216.00665600000002, 218.10380800000001, 220.20096, 222.298112, 224.39526400000003, 226.49241600000002, 228.589568, 230.68672, 232.783872, 234.88102400000002, 236.97817600000002, 239.075328, 241.17248, 243.269632, 245.36678400000002, 247.46393600000002, 249.561088, 251.65824, 253.75539200000003, 255.85254400000002, 257.949696, 260.046848, 262.144, 264.241152, 266.338304, 268.43545600000004, 270.53260800000004, 272.62976000000003, 274.726912, 276.824064, 278.921216, 281.018368, 283.11552, 285.212672, 287.309824, 289.40697600000004, 291.50412800000004, 293.60128000000003, 295.698432, 297.795584, 299.892736, 301.989888, 304.08704, 306.184192, 308.28134400000005, 310.37849600000004, 312.47564800000004, 314.57280000000003, 316.669952, 318.767104, 320.864256, 322.961408, 325.05856, 327.155712, 329.25286400000005, 331.35001600000004, 333.44716800000003, 335.54432, 337.641472, 339.738624, 341.835776, 343.932928, 346.03008, 348.12723200000005, 350.22438400000004], "y": [0.257999986410141, 0.2809999883174896, 0.3230000138282776, 0.3409999907016754, 0.3600000143051147, 0.3569999933242798, 0.3889999985694885, 0.395000010728836, 0.4199999868869781, 0.4180000126361847, 0.421999990940094, 0.4289999902248382, 0.4350000023841858, 0.4359999895095825, 0.4469999969005584, 0.4350000023841858, 0.4480000138282776, 0.4480000138282776, 0.453000009059906, 0.4550000131130218, 0.4589999914169311, 0.4639999866485595, 0.4600000083446502, 0.460999995470047, 0.4589999914169311, 0.481000006198883, 0.4769999980926513, 0.4709999859333038, 0.4740000069141388, 0.4679999947547912, 0.4790000021457672, 0.4729999899864197, 0.4819999933242798, 0.4850000143051147, 0.4819999933242798, 0.4819999933242798, 0.4880000054836273, 0.4869999885559082, 0.4959999918937683, 0.4850000143051147, 0.4959999918937683, 0.492000013589859, 0.503000020980835, 0.4930000007152557, 0.5099999904632568, 0.5040000081062317, 0.5009999871253967, 0.4970000088214874, 0.4979999959468841, 0.5059999823570251, 0.5070000290870667, 0.5040000081062317, 0.5059999823570251, 0.5049999952316284, 0.5080000162124634, 0.5049999952316284, 0.5019999742507935, 0.5120000243186951, 0.5170000195503235, 0.5170000195503235, 0.5090000033378601, 0.5239999890327454, 0.527999997138977, 0.5230000019073486, 0.5210000276565552, 0.5149999856948853, 0.5189999938011169, 0.5270000100135803, 0.5149999856948853, 0.5099999904632568, 0.5299999713897705, 0.5199999809265137, 0.5230000019073486, 0.5260000228881836, 0.5249999761581421, 0.5239999890327454, 0.5329999923706055, 0.5210000276565552, 0.5260000228881836, 0.5170000195503235, 0.531000018119812, 0.5289999842643738, 0.531000018119812, 0.5270000100135803, 0.5299999713897705, 0.5370000004768372, 0.5379999876022339, 0.5419999957084656, 0.5329999923706055, 0.5360000133514404, 0.5299999713897705, 0.5360000133514404, 0.5270000100135803, 0.5450000166893005, 0.5410000085830688, 0.546999990940094, 0.5329999923706055, 0.5329999923706055, 0.5379999876022339, 0.5299999713897705, 0.5429999828338623, 0.5360000133514404, 0.5339999794960022, 0.5419999957084656, 0.5410000085830688, 0.5370000004768372, 0.5389999747276306, 0.527999997138977, 0.5400000214576721, 0.5400000214576721, 0.531000018119812, 0.5440000295639038, 0.5460000038146973, 0.5479999780654907, 0.5460000038146973, 0.5410000085830688, 0.5509999990463257, 0.5479999780654907, 0.5410000085830688, 0.5389999747276306, 0.550000011920929, 0.5569999814033508, 0.550000011920929, 0.5490000247955322, 0.5490000247955322, 0.5569999814033508, 0.5519999861717224, 0.5479999780654907, 0.5559999942779541, 0.5550000071525574, 0.5460000038146973, 0.5540000200271606, 0.5460000038146973, 0.5460000038146973, 0.5509999990463257, 0.5460000038146973, 0.5550000071525574, 0.5479999780654907, 0.5479999780654907, 0.5540000200271606, 0.5550000071525574, 0.5529999732971191, 0.5529999732971191, 0.5509999990463257, 0.5509999990463257, 0.5419999957084656, 0.546999990940094, 0.5509999990463257, 0.5559999942779541, 0.5490000247955322, 0.5509999990463257, 0.5529999732971191, 0.550000011920929, 0.5540000200271606, 0.5550000071525574, 0.5580000281333923, 0.550000011920929, 0.5569999814033508, 0.5490000247955322, 0.5519999861717224, 0.5519999861717224, 0.5559999942779541, 0.5569999814033508, 0.5559999942779541, 0.5550000071525574, 0.5559999942779541, 0.5490000247955322, 0.5550000071525574, 0.5600000023841858], "label": "FineWeb filtered only"}, "big-run-fineweb-cross-dedup-fixed": {"x": [0.0, 2.0971520000000003, 4.194304000000001, 6.291456, 8.388608000000001, 10.48576, 12.582912, 14.680064000000002, 16.777216000000003, 18.874368, 20.97152, 23.068672000000003, 25.165824, 27.262976000000002, 29.360128000000003, 31.45728, 33.554432000000006, 35.651584, 37.748736, 39.845888, 41.94304, 44.040192000000005, 46.137344000000006, 48.234496, 50.331648, 52.4288, 54.525952000000004, 56.623104000000005, 58.720256000000006, 60.817408, 62.91456, 65.011712, 67.10886400000001, 69.206016, 71.303168, 73.40032000000001, 75.497472, 77.59462400000001, 79.691776, 81.788928, 83.88608, 85.983232, 88.08038400000001, 90.177536, 92.27468800000001, 94.37184, 96.468992, 98.56614400000001, 100.663296, 102.76044800000001, 104.8576, 106.95475200000001, 109.05190400000001, 111.149056, 113.24620800000001, 115.34336, 117.44051200000001, 119.537664, 121.634816, 123.73196800000001, 125.82912, 127.92627200000001, 130.023424, 132.120576, 134.21772800000002, 136.31488000000002, 138.412032, 140.509184, 142.606336, 144.70348800000002, 146.80064000000002, 148.897792, 150.994944, 153.092096, 155.18924800000002, 157.28640000000001, 159.383552, 161.480704, 163.577856, 165.67500800000002, 167.77216, 169.869312, 171.966464, 174.06361600000002, 176.16076800000002, 178.25792, 180.355072, 182.452224, 184.54937600000002, 186.64652800000002, 188.74368, 190.840832, 192.937984, 195.03513600000002, 197.13228800000002, 199.22944, 201.326592, 203.423744, 205.52089600000002, 207.61804800000002, 209.7152, 211.812352, 213.90950400000003, 216.00665600000002, 218.10380800000001, 220.20096, 222.298112, 224.39526400000003, 226.49241600000002, 228.589568, 230.68672, 232.783872, 234.88102400000002, 236.97817600000002, 239.075328, 241.17248, 243.269632, 245.36678400000002, 247.46393600000002, 249.561088, 251.65824, 253.75539200000003, 255.85254400000002, 257.949696, 260.046848, 262.144, 264.241152, 266.338304, 268.43545600000004, 270.53260800000004, 272.62976000000003, 274.726912, 276.824064, 278.921216, 281.018368, 283.11552, 285.212672, 287.309824, 289.40697600000004, 291.50412800000004, 293.60128000000003, 295.698432, 297.795584, 299.892736, 301.989888, 304.08704, 306.184192, 308.28134400000005, 310.37849600000004, 312.47564800000004, 314.57280000000003, 316.669952, 318.767104, 320.864256, 322.961408, 325.05856, 327.155712, 329.25286400000005, 331.35001600000004, 333.44716800000003, 335.54432, 337.641472, 339.738624, 341.835776, 343.932928, 346.03008, 348.12723200000005, 350.22438400000004], "y": [0.257999986410141, 0.3009999990463257, 0.3149999976158142, 0.3400000035762787, 0.3610000014305115, 0.3680000007152557, 0.3799999952316284, 0.4020000100135803, 0.4180000126361847, 0.4129999876022339, 0.4259999990463257, 0.4239999949932098, 0.4440000057220459, 0.44200000166893, 0.4440000057220459, 0.4580000042915344, 0.4510000050067901, 0.4560000002384186, 0.4650000035762787, 0.4569999873638153, 0.460999995470047, 0.4659999907016754, 0.4679999947547912, 0.4779999852180481, 0.4740000069141388, 0.4600000083446502, 0.4860000014305115, 0.4790000021457672, 0.4880000054836273, 0.4930000007152557, 0.4860000014305115, 0.4850000143051147, 0.4900000095367431, 0.4850000143051147, 0.4900000095367431, 0.4959999918937683, 0.492000013589859, 0.4850000143051147, 0.4970000088214874, 0.4900000095367431, 0.4979999959468841, 0.503000020980835, 0.5040000081062317, 0.4990000128746032, 0.4979999959468841, 0.5080000162124634, 0.5019999742507935, 0.4970000088214874, 0.4939999878406524, 0.5120000243186951, 0.5070000290870667, 0.503000020980835, 0.5070000290870667, 0.503000020980835, 0.5109999775886536, 0.5080000162124634, 0.5009999871253967, 0.5090000033378601, 0.5, 0.5149999856948853, 0.5109999775886536, 0.5099999904632568, 0.5130000114440918, 0.5080000162124634, 0.5080000162124634, 0.5109999775886536, 0.5099999904632568, 0.5239999890327454, 0.5180000066757202, 0.5130000114440918, 0.5120000243186951, 0.5180000066757202, 0.515999972820282, 0.5260000228881836, 0.5199999809265137, 0.5239999890327454, 0.5220000147819519, 0.527999997138977, 0.5249999761581421, 0.5270000100135803, 0.5249999761581421, 0.5189999938011169, 0.5230000019073486, 0.5249999761581421, 0.5199999809265137, 0.5230000019073486, 0.5299999713897705, 0.5350000262260437, 0.5339999794960022, 0.5329999923706055, 0.5249999761581421, 0.5299999713897705, 0.5360000133514404, 0.5329999923706055, 0.5410000085830688, 0.5249999761581421, 0.5289999842643738, 0.5360000133514404, 0.5360000133514404, 0.5370000004768372, 0.5389999747276306, 0.5289999842643738, 0.5299999713897705, 0.5410000085830688, 0.5329999923706055, 0.5419999957084656, 0.5410000085830688, 0.527999997138977, 0.5370000004768372, 0.5429999828338623, 0.5419999957084656, 0.5389999747276306, 0.5320000052452087, 0.5350000262260437, 0.5419999957084656, 0.5410000085830688, 0.5339999794960022, 0.5440000295639038, 0.5329999923706055, 0.5429999828338623, 0.5460000038146973, 0.5400000214576721, 0.5429999828338623, 0.5479999780654907, 0.550000011920929, 0.5490000247955322, 0.5410000085830688, 0.5450000166893005, 0.5429999828338623, 0.550000011920929, 0.5529999732971191, 0.5490000247955322, 0.5450000166893005, 0.5450000166893005, 0.5519999861717224, 0.5569999814033508, 0.5460000038146973, 0.546999990940094, 0.5509999990463257, 0.5509999990463257, 0.5450000166893005, 0.5440000295639038, 0.5440000295639038, 0.546999990940094, 0.5479999780654907, 0.546999990940094, 0.5460000038146973, 0.546999990940094, 0.5479999780654907, 0.5460000038146973, 0.5460000038146973, 0.5440000295639038, 0.5410000085830688, 0.5440000295639038, 0.5389999747276306, 0.5410000085830688, 0.546999990940094, 0.546999990940094, 0.5479999780654907, 0.546999990940094, 0.550000011920929, 0.546999990940094, 0.5460000038146973, 0.546999990940094, 0.5479999780654907, 0.5479999780654907, 0.5519999861717224, 0.550000011920929], "label": "FineWeb full MinHash"}}, "layout": {"title": {"text": "Dedup across all dumps does not improve performance"}}}
assets/data/plots/all_dumps_bad/index.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"files": {"agg_score": {"file": "agg_score.json"}, "commonsense_qa/acc_norm": {"file": "commonsense_qa_acc_norm.json"}, "hellaswag/acc_norm": {"file": "hellaswag_acc_norm.json"}, "openbookqa/acc_norm": {"file": "openbookqa_acc_norm.json"}, "piqa/acc_norm": {"file": "piqa_acc_norm.json"}, "siqa/acc_norm": {"file": "siqa_acc_norm.json"}, "winogrande/acc_norm": {"file": "winogrande_acc_norm.json"}, "arc/acc_norm": {"file": "arc_acc_norm.json"}, "mmlu/acc_norm": {"file": "mmlu_acc_norm.json"}}, "settings": {"defaultMetric": "agg_score", "slider": {"min": 0, "max": 30, "default": 5}}}
assets/data/plots/all_dumps_bad/mmlu_acc_norm.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"data": {"big-run-refinedweb": {"x": [0.0, 2.0971520000000003, 4.194304000000001, 6.291456, 8.388608000000001, 10.48576, 12.582912, 14.680064000000002, 16.777216000000003, 18.874368, 20.97152, 23.068672000000003, 25.165824, 27.262976000000002, 28.311552000000002, 29.360128000000003, 31.45728, 33.554432000000006, 35.651584, 37.748736, 39.845888, 41.94304, 44.040192000000005, 46.137344000000006, 48.234496, 50.331648, 52.4288, 54.525952000000004, 56.623104000000005, 58.720256000000006, 60.817408, 62.91456, 65.011712, 67.10886400000001, 69.206016, 71.303168, 73.40032000000001, 75.497472, 77.59462400000001, 79.691776, 81.788928, 83.88608, 85.983232, 88.08038400000001, 90.177536, 92.27468800000001, 94.37184, 96.468992, 98.56614400000001, 100.663296, 102.76044800000001, 104.8576, 106.95475200000001, 109.05190400000001, 111.149056, 113.24620800000001, 115.34336, 117.44051200000001, 119.537664, 121.634816, 123.73196800000001, 125.82912, 127.92627200000001, 130.023424, 132.120576, 134.21772800000002, 136.31488000000002, 138.412032, 140.509184, 142.606336, 144.70348800000002, 146.80064000000002, 148.897792, 150.994944, 153.092096, 155.18924800000002, 157.28640000000001, 159.383552, 161.480704, 163.577856, 165.67500800000002, 167.77216, 169.869312, 171.966464, 174.06361600000002, 176.16076800000002, 178.25792, 180.355072, 182.452224, 184.54937600000002, 186.64652800000002, 188.74368, 190.840832, 192.937984, 195.03513600000002, 197.13228800000002, 199.22944, 201.326592, 203.423744, 205.52089600000002, 207.61804800000002, 209.7152, 211.812352, 213.90950400000003, 216.00665600000002, 218.10380800000001, 220.20096, 222.298112, 224.39526400000003, 226.49241600000002, 228.589568, 230.68672, 232.783872, 234.88102400000002, 236.97817600000002, 239.075328, 241.17248, 243.269632, 245.36678400000002, 247.46393600000002, 249.561088, 251.65824, 253.75539200000003, 255.85254400000002, 257.949696, 260.046848, 262.144, 264.241152, 266.338304, 268.43545600000004, 270.53260800000004, 272.62976000000003, 274.726912, 276.824064, 278.921216, 281.018368, 283.11552, 285.212672, 287.309824, 289.40697600000004, 291.50412800000004, 293.60128000000003, 295.698432, 297.795584, 299.892736, 301.989888, 304.08704, 306.184192, 308.28134400000005, 310.37849600000004, 312.47564800000004, 314.57280000000003, 316.669952, 318.767104, 320.864256, 322.961408, 325.05856, 327.155712, 329.25286400000005, 331.35001600000004, 333.44716800000003, 335.54432, 337.641472, 339.738624, 341.835776, 343.932928, 346.03008, 348.12723200000005, 350.22438400000004], "y": [0.2501466572284698, 0.2528519630432129, 0.2616856694221496, 0.2665999829769134, 0.2683407664299011, 0.2742894291877746, 0.2762066125869751, 0.2807516455650329, 0.2767378389835357, 0.2807380557060241, 0.2788906991481781, 0.2844051718711853, 0.2856102883815765, 0.2883394360542297, 0.2875711619853973, 0.2890409529209137, 0.2894668281078338, 0.2883355319499969, 0.2872501015663147, 0.291619062423706, 0.2900333702564239, 0.2962473034858703, 0.2962896525859833, 0.297355443239212, 0.2932226359844208, 0.2886744439601898, 0.29665008187294, 0.2976542115211487, 0.2991503179073334, 0.3004479110240936, 0.3044549524784088, 0.2976194322109222, 0.3014707863330841, 0.3048252463340759, 0.3039425611495971, 0.303354948759079, 0.3027459383010864, 0.2999922931194305, 0.3050121665000915, 0.2998814284801483, 0.2978588044643402, 0.3041949570178985, 0.3010904192924499, 0.3022017180919647, 0.2997751235961914, 0.3015910983085632, 0.3096485137939453, 0.3012076020240783, 0.3065535724163055, 0.3042872548103332, 0.3104783594608307, 0.2997980415821075, 0.3051296770572662, 0.303458571434021, 0.3088337182998657, 0.3145398199558258, 0.3032208085060119, 0.310806930065155, 0.3075874149799347, 0.3101692199707031, 0.310107946395874, 0.3066047430038452, 0.3109066784381866, 0.3081336915493011, 0.3084586262702942, 0.3086149394512176, 0.3085348606109619, 0.3136637806892395, 0.3110873103141784, 0.31076380610466, 0.3084572553634643, 0.3133681714534759, 0.3125792145729065, 0.3124453127384186, 0.3097185790538788, 0.3106793165206909, 0.3089564740657806, 0.3111244142055511, 0.3123694658279419, 0.3144859969615936, 0.3135123550891876, 0.311982125043869, 0.3142133951187134, 0.3122903704643249, 0.3147654831409454, 0.3078767359256744, 0.314947634935379, 0.3171303570270538, 0.3129573762416839, 0.3154936134815216, 0.3158208429813385, 0.3153132200241089, 0.3141326904296875, 0.3163397014141083, 0.3166318237781524, 0.3168410360813141, 0.3198235332965851, 0.3201336860656738, 0.3212967813014984, 0.3191385567188263, 0.3178017139434814, 0.3192791938781738, 0.323061466217041, 0.320336639881134, 0.3165886104106903, 0.3206393420696258, 0.3167395293712616, 0.3135207295417785, 0.315539002418518, 0.3191742599010467, 0.321073055267334, 0.3222262561321258, 0.3193058371543884, 0.3213480710983276, 0.3198905289173126, 0.3219239711761474, 0.3211614489555359, 0.318855881690979, 0.3177095353603363, 0.324197381734848, 0.3208906352519989, 0.3264936804771423, 0.3245965242385864, 0.3231639564037323, 0.3221887946128845, 0.3277338445186615, 0.3227696120738983, 0.3263820111751556, 0.3258577883243561, 0.3264622390270233, 0.3222362995147705, 0.3286814987659454, 0.3235024213790893, 0.32446950674057, 0.3311836123466491, 0.328130304813385, 0.3271634578704834, 0.3250012993812561, 0.3309800624847412, 0.3274554014205932, 0.3273015916347503, 0.3261759579181671, 0.32697594165802, 0.3303172886371612, 0.3282814025878906, 0.3289586305618286, 0.3260826468467712, 0.3258011937141418, 0.3297208249568939, 0.3254813551902771, 0.3287739753723144, 0.3287097811698913, 0.3275279700756073, 0.3293041586875915, 0.3314100801944732, 0.3287808299064636, 0.3251930773258209, 0.3288172781467438, 0.3265027701854706, 0.3275215625762939, 0.3290774822235107, 0.3261331617832184, 0.3299777805805206, 0.331955999135971, 0.3305029273033142, 0.3274719417095184, 0.3235560953617096, 0.3269940316677093, 0.3323083519935608], "label": "RefinedWeb"}, "big-run-fineweb-cross-dedup-fixed": {"x": [0.0, 2.0971520000000003, 4.194304000000001, 6.291456, 8.388608000000001, 10.48576, 12.582912, 14.680064000000002, 16.777216000000003, 18.874368, 20.97152, 23.068672000000003, 25.165824, 27.262976000000002, 29.360128000000003, 31.45728, 33.554432000000006, 35.651584, 37.748736, 39.845888, 41.94304, 44.040192000000005, 46.137344000000006, 48.234496, 50.331648, 52.4288, 54.525952000000004, 56.623104000000005, 58.720256000000006, 60.817408, 62.91456, 65.011712, 67.10886400000001, 69.206016, 71.303168, 73.40032000000001, 75.497472, 77.59462400000001, 79.691776, 81.788928, 83.88608, 85.983232, 88.08038400000001, 90.177536, 92.27468800000001, 94.37184, 96.468992, 98.56614400000001, 100.663296, 102.76044800000001, 104.8576, 106.95475200000001, 109.05190400000001, 111.149056, 113.24620800000001, 115.34336, 117.44051200000001, 119.537664, 121.634816, 123.73196800000001, 125.82912, 127.92627200000001, 130.023424, 132.120576, 134.21772800000002, 136.31488000000002, 138.412032, 140.509184, 142.606336, 144.70348800000002, 146.80064000000002, 148.897792, 150.994944, 153.092096, 155.18924800000002, 157.28640000000001, 159.383552, 161.480704, 163.577856, 165.67500800000002, 167.77216, 169.869312, 171.966464, 174.06361600000002, 176.16076800000002, 178.25792, 180.355072, 182.452224, 184.54937600000002, 186.64652800000002, 188.74368, 190.840832, 192.937984, 195.03513600000002, 197.13228800000002, 199.22944, 201.326592, 203.423744, 205.52089600000002, 207.61804800000002, 209.7152, 211.812352, 213.90950400000003, 216.00665600000002, 218.10380800000001, 220.20096, 222.298112, 224.39526400000003, 226.49241600000002, 228.589568, 230.68672, 232.783872, 234.88102400000002, 236.97817600000002, 239.075328, 241.17248, 243.269632, 245.36678400000002, 247.46393600000002, 249.561088, 251.65824, 253.75539200000003, 255.85254400000002, 257.949696, 260.046848, 262.144, 264.241152, 266.338304, 268.43545600000004, 270.53260800000004, 272.62976000000003, 274.726912, 276.824064, 278.921216, 281.018368, 283.11552, 285.212672, 287.309824, 289.40697600000004, 291.50412800000004, 293.60128000000003, 295.698432, 297.795584, 299.892736, 301.989888, 304.08704, 306.184192, 308.28134400000005, 310.37849600000004, 312.47564800000004, 314.57280000000003, 316.669952, 318.767104, 320.864256, 322.961408, 325.05856, 327.155712, 329.25286400000005, 331.35001600000004, 333.44716800000003, 335.54432, 337.641472, 339.738624, 341.835776, 343.932928, 346.03008, 348.12723200000005, 350.22438400000004], "y": [0.2501466572284698, 0.2510619163513183, 0.2621481418609619, 0.2632303833961487, 0.2720474302768707, 0.2719806432723999, 0.2726832032203674, 0.2786827087402344, 0.2823672890663147, 0.276201844215393, 0.2816944718360901, 0.280361145734787, 0.2819306254386902, 0.2823295891284942, 0.2892518043518066, 0.2872919738292694, 0.2859259247779846, 0.2885263860225677, 0.2862614393234253, 0.2933129370212555, 0.2930494546890259, 0.2884900867938995, 0.2942298054695129, 0.2927677929401397, 0.2954220175743103, 0.2918704748153686, 0.2943699061870575, 0.2891678512096405, 0.291848212480545, 0.2942944765090942, 0.2973679602146148, 0.2953736186027527, 0.2963412702083587, 0.297100305557251, 0.2963026762008667, 0.2944463491439819, 0.2971296310424804, 0.293870210647583, 0.2982682287693023, 0.2978119254112243, 0.2989997565746307, 0.2993503510951996, 0.298117071390152, 0.2977498769760132, 0.3004056811332702, 0.3012634217739105, 0.3001384139060974, 0.3052266240119934, 0.3038219809532165, 0.3037647306919098, 0.3009455502033233, 0.3038812279701233, 0.303263396024704, 0.3025077581405639, 0.3056069612503052, 0.3024908602237701, 0.3050909340381622, 0.3001562356948852, 0.303833544254303, 0.3019777834415436, 0.3036664128303528, 0.3022894859313965, 0.3042722940444946, 0.3023003339767456, 0.3069425821304321, 0.307883083820343, 0.3026910126209259, 0.3054113090038299, 0.3046148121356964, 0.305342435836792, 0.3048149049282074, 0.3066973984241485, 0.3055126965045929, 0.3063409924507141, 0.307701051235199, 0.3075169324874878, 0.3091190159320831, 0.3098153173923492, 0.31436288356781, 0.3096509575843811, 0.3022815883159637, 0.3119745552539825, 0.3083471357822418, 0.3085280954837799, 0.3082001209259033, 0.3080264329910278, 0.3116717934608459, 0.3097788393497467, 0.3117353916168213, 0.3170038759708404, 0.3099159002304077, 0.3133728504180908, 0.3161626160144806, 0.3095119595527649, 0.3135432302951813, 0.3103009164333343, 0.3126655519008636, 0.3121814131736755, 0.3123973608016968, 0.3148256838321686, 0.3144133985042572, 0.3124284744262695, 0.3102188408374786, 0.3123636841773987, 0.3115113973617553, 0.3151636719703674, 0.3148572146892547, 0.315061867237091, 0.3127182424068451, 0.3139308094978332, 0.3134367167949676, 0.3136025071144104, 0.3172793388366699, 0.3134761154651642, 0.3109587132930755, 0.3127998411655426, 0.3161843717098236, 0.3163313865661621, 0.3145243525505066, 0.3155156075954437, 0.3127505779266357, 0.3182451128959656, 0.3162476718425751, 0.3124897480010986, 0.3128789663314819, 0.3119811117649078, 0.314126193523407, 0.3136049509048462, 0.3149912655353546, 0.3146650791168213, 0.3151968121528625, 0.3179666996002197, 0.3169245719909668, 0.3202513754367828, 0.3185319602489471, 0.3202781081199646, 0.3186031281948089, 0.3166128396987915, 0.3199457228183746, 0.3194417059421539, 0.3170624077320099, 0.3184532523155212, 0.3191981911659241, 0.3191225528717041, 0.3173209130764007, 0.3195607960224151, 0.3166368305683136, 0.3188160359859466, 0.3174867630004883, 0.3184468746185303, 0.3211863338947296, 0.3184327483177185, 0.3177861273288727, 0.3180214762687683, 0.3194973170757293, 0.3212297558784485, 0.3211282789707184, 0.3200584352016449, 0.3168685734272003, 0.3211040198802948, 0.3222841620445251, 0.3196901082992553, 0.3236229419708252, 0.3204475045204162, 0.3210069537162781, 0.3191083669662475, 0.31863734126091, 0.3195922076702118], "label": "FineWeb full MinHash"}, "big-run-sampled_full_filtered_no_dedup": {"x": [0.0, 2.0971520000000003, 4.194304000000001, 6.291456, 8.388608000000001, 10.48576, 12.582912, 14.680064000000002, 16.777216000000003, 18.874368, 20.97152, 23.068672000000003, 25.165824, 27.262976000000002, 28.311552000000002, 29.360128000000003, 31.45728, 33.554432000000006, 35.651584, 37.748736, 39.845888, 41.94304, 44.040192000000005, 46.137344000000006, 48.234496, 50.331648, 52.4288, 54.525952000000004, 56.623104000000005, 58.720256000000006, 60.817408, 62.91456, 65.011712, 67.10886400000001, 69.206016, 71.303168, 73.40032000000001, 75.497472, 77.59462400000001, 79.691776, 81.788928, 83.88608, 85.983232, 88.08038400000001, 90.177536, 92.27468800000001, 94.37184, 96.468992, 98.56614400000001, 100.663296, 102.76044800000001, 104.8576, 106.95475200000001, 109.05190400000001, 111.149056, 113.24620800000001, 115.34336, 117.44051200000001, 119.537664, 121.634816, 123.73196800000001, 125.82912, 127.92627200000001, 130.023424, 132.120576, 134.21772800000002, 136.31488000000002, 138.412032, 140.509184, 142.606336, 144.70348800000002, 146.80064000000002, 148.897792, 150.994944, 153.092096, 155.18924800000002, 157.28640000000001, 159.383552, 161.480704, 163.577856, 165.67500800000002, 167.77216, 169.869312, 171.966464, 174.06361600000002, 176.16076800000002, 178.25792, 180.355072, 182.452224, 184.54937600000002, 186.64652800000002, 188.74368, 190.840832, 192.937984, 195.03513600000002, 197.13228800000002, 199.22944, 201.326592, 203.423744, 205.52089600000002, 207.61804800000002, 209.7152, 211.812352, 213.90950400000003, 216.00665600000002, 218.10380800000001, 220.20096, 222.298112, 224.39526400000003, 226.49241600000002, 228.589568, 230.68672, 232.783872, 234.88102400000002, 236.97817600000002, 239.075328, 241.17248, 243.269632, 245.36678400000002, 247.46393600000002, 249.561088, 251.65824, 253.75539200000003, 255.85254400000002, 257.949696, 260.046848, 262.144, 264.241152, 266.338304, 268.43545600000004, 270.53260800000004, 272.62976000000003, 274.726912, 276.824064, 278.921216, 281.018368, 283.11552, 285.212672, 287.309824, 289.40697600000004, 291.50412800000004, 293.60128000000003, 295.698432, 297.795584, 299.892736, 301.989888, 304.08704, 306.184192, 308.28134400000005, 310.37849600000004, 312.47564800000004, 314.57280000000003, 316.669952, 318.767104, 320.864256, 322.961408, 325.05856, 327.155712, 329.25286400000005, 331.35001600000004, 333.44716800000003, 335.54432, 337.641472, 339.738624, 341.835776, 343.932928, 346.03008, 348.12723200000005, 350.22438400000004], "y": [0.2501466572284698, 0.2516599297523498, 0.2610189318656921, 0.2666046619415283, 0.2667981088161468, 0.2667821645736694, 0.2708088159561157, 0.2738403379917145, 0.2726235687732696, 0.2762763500213623, 0.2768311202526092, 0.2809228301048279, 0.2836140990257263, 0.2822815179824829, 0.2831664383411407, 0.2797218561172485, 0.286342591047287, 0.2855269610881805, 0.2847287058830261, 0.2888180613517761, 0.286526083946228, 0.2865165770053863, 0.294582188129425, 0.2925947606563568, 0.2947863042354584, 0.2892930805683136, 0.2903610467910766, 0.288201242685318, 0.2873396277427673, 0.2916238009929657, 0.2908017039299011, 0.2907920777797699, 0.2952797412872314, 0.2941452264785766, 0.2921333611011505, 0.2925891280174255, 0.2968584895133972, 0.2980035543441772, 0.2964116632938385, 0.2962304651737213, 0.2950254380702972, 0.2977516651153564, 0.2944138348102569, 0.3003402054309845, 0.2976303696632385, 0.3013098239898681, 0.302829384803772, 0.3018766045570373, 0.305361807346344, 0.2971298694610595, 0.3014816343784332, 0.3019805550575256, 0.3037064969539642, 0.2970167994499206, 0.2995208501815796, 0.2970106601715088, 0.2990955114364624, 0.3027818500995636, 0.3048534691333771, 0.2993872463703155, 0.2986327707767486, 0.3015393316745758, 0.3003426790237427, 0.3003274798393249, 0.3017795085906982, 0.3019182682037353, 0.3015450537204742, 0.3046211004257202, 0.3031167984008789, 0.3020436763763428, 0.3011128306388855, 0.3029948472976684, 0.3045558631420135, 0.301642894744873, 0.3029441833496094, 0.3035804331302643, 0.3004390001296997, 0.3021787703037262, 0.306041270494461, 0.3064048886299133, 0.3087956011295318, 0.3070018291473388, 0.3065581619739532, 0.3093871772289276, 0.3060930073261261, 0.3033313155174255, 0.3072777390480041, 0.306413859128952, 0.3104493916034698, 0.3056999444961548, 0.3077532052993774, 0.309231549501419, 0.3070645034313202, 0.3117790520191192, 0.3114112913608551, 0.312661737203598, 0.3181777000427246, 0.3117201030254364, 0.3099702894687652, 0.3074746131896972, 0.3064963519573211, 0.3105958700180053, 0.3111456036567688, 0.3084964454174042, 0.3087405860424042, 0.3121673166751861, 0.3121528625488281, 0.3100416660308838, 0.3142979145050049, 0.3129935264587402, 0.3112611472606659, 0.3119436800479889, 0.3154115974903106, 0.3091593086719513, 0.3103814721107483, 0.3130497634410858, 0.3133455514907837, 0.3152708411216736, 0.3137963414192199, 0.3099324703216553, 0.3164172768592834, 0.3133907914161682, 0.3128255009651184, 0.3134104907512665, 0.3106969892978668, 0.3130004107952118, 0.3131391704082489, 0.3130116462707519, 0.3143952488899231, 0.3143975436687469, 0.3143710494041443, 0.3163396418094635, 0.3166862726211548, 0.3184126019477844, 0.3178988993167877, 0.317479133605957, 0.3184944093227386, 0.316694974899292, 0.3176258206367492, 0.3182629346847534, 0.3200214207172394, 0.3181648552417755, 0.320680022239685, 0.3178716897964477, 0.3182425796985626, 0.3182984292507171, 0.3158398568630218, 0.3152642548084259, 0.3132680356502533, 0.3178914785385132, 0.3156660795211792, 0.3161703050136566, 0.3176451921463012, 0.3173815906047821, 0.3194171786308288, 0.3193057179450989, 0.3172560334205627, 0.317656546831131, 0.3155770003795624, 0.3199106156826019, 0.3170182108879089, 0.3156754970550537, 0.3180731236934662, 0.3205638229846954, 0.3175432682037353, 0.3184471428394317, 0.3192788958549499, 0.3197042346000671, 0.3177168369293213], "label": "FineWeb filtered only"}}, "layout": {"title": {"text": "Dedup across all dumps does not improve performance"}}}
assets/data/plots/all_dumps_bad/openbookqa_acc_norm.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"data": {"big-run-refinedweb": {"x": [0.0, 2.0971520000000003, 4.194304000000001, 6.291456, 8.388608000000001, 10.48576, 12.582912, 14.680064000000002, 16.777216000000003, 18.874368, 20.97152, 23.068672000000003, 25.165824, 27.262976000000002, 28.311552000000002, 29.360128000000003, 31.45728, 33.554432000000006, 35.651584, 37.748736, 39.845888, 41.94304, 44.040192000000005, 46.137344000000006, 48.234496, 50.331648, 52.4288, 54.525952000000004, 56.623104000000005, 58.720256000000006, 60.817408, 62.91456, 65.011712, 67.10886400000001, 69.206016, 71.303168, 73.40032000000001, 75.497472, 77.59462400000001, 79.691776, 81.788928, 83.88608, 85.983232, 88.08038400000001, 90.177536, 92.27468800000001, 94.37184, 96.468992, 98.56614400000001, 100.663296, 102.76044800000001, 104.8576, 106.95475200000001, 109.05190400000001, 111.149056, 113.24620800000001, 115.34336, 117.44051200000001, 119.537664, 121.634816, 123.73196800000001, 125.82912, 127.92627200000001, 130.023424, 132.120576, 134.21772800000002, 136.31488000000002, 138.412032, 140.509184, 142.606336, 144.70348800000002, 146.80064000000002, 148.897792, 150.994944, 153.092096, 155.18924800000002, 157.28640000000001, 159.383552, 161.480704, 163.577856, 165.67500800000002, 167.77216, 169.869312, 171.966464, 174.06361600000002, 176.16076800000002, 178.25792, 180.355072, 182.452224, 184.54937600000002, 186.64652800000002, 188.74368, 190.840832, 192.937984, 195.03513600000002, 197.13228800000002, 199.22944, 201.326592, 203.423744, 205.52089600000002, 207.61804800000002, 209.7152, 211.812352, 213.90950400000003, 216.00665600000002, 218.10380800000001, 220.20096, 222.298112, 224.39526400000003, 226.49241600000002, 228.589568, 230.68672, 232.783872, 234.88102400000002, 236.97817600000002, 239.075328, 241.17248, 243.269632, 245.36678400000002, 247.46393600000002, 249.561088, 251.65824, 253.75539200000003, 255.85254400000002, 257.949696, 260.046848, 262.144, 264.241152, 266.338304, 268.43545600000004, 270.53260800000004, 272.62976000000003, 274.726912, 276.824064, 278.921216, 281.018368, 283.11552, 285.212672, 287.309824, 289.40697600000004, 291.50412800000004, 293.60128000000003, 295.698432, 297.795584, 299.892736, 301.989888, 304.08704, 306.184192, 308.28134400000005, 310.37849600000004, 312.47564800000004, 314.57280000000003, 316.669952, 318.767104, 320.864256, 322.961408, 325.05856, 327.155712, 329.25286400000005, 331.35001600000004, 333.44716800000003, 335.54432, 337.641472, 339.738624, 341.835776, 343.932928, 346.03008, 348.12723200000005, 350.22438400000004], "y": [0.2860000133514404, 0.2560000121593475, 0.2840000092983246, 0.3059999942779541, 0.3059999942779541, 0.2980000078678131, 0.3240000009536743, 0.3100000023841858, 0.3000000119209289, 0.3160000145435333, 0.3140000104904175, 0.3260000050067901, 0.3199999928474426, 0.2980000078678131, 0.3179999887943268, 0.3179999887943268, 0.3319999873638153, 0.3019999861717224, 0.2939999997615814, 0.3319999873638153, 0.3319999873638153, 0.3219999969005584, 0.3379999995231628, 0.3379999995231628, 0.3339999914169311, 0.3240000009536743, 0.3479999899864197, 0.3300000131130218, 0.3240000009536743, 0.3300000131130218, 0.3400000035762787, 0.3459999859333038, 0.3319999873638153, 0.3379999995231628, 0.356000006198883, 0.3339999914169311, 0.3459999859333038, 0.3440000116825104, 0.3519999980926513, 0.3479999899864197, 0.3339999914169311, 0.3400000035762787, 0.3479999899864197, 0.3379999995231628, 0.3479999899864197, 0.3499999940395355, 0.3400000035762787, 0.3499999940395355, 0.3420000076293945, 0.3659999966621399, 0.3400000035762787, 0.3459999859333038, 0.3499999940395355, 0.356000006198883, 0.3400000035762787, 0.356000006198883, 0.3339999914169311, 0.3339999914169311, 0.3479999899864197, 0.3420000076293945, 0.3580000102519989, 0.3339999914169311, 0.3440000116825104, 0.3400000035762787, 0.3499999940395355, 0.3540000021457672, 0.3479999899864197, 0.3499999940395355, 0.3420000076293945, 0.3379999995231628, 0.335999995470047, 0.356000006198883, 0.3459999859333038, 0.3499999940395355, 0.3400000035762787, 0.3440000116825104, 0.356000006198883, 0.3519999980926513, 0.3400000035762787, 0.3440000116825104, 0.356000006198883, 0.3400000035762787, 0.356000006198883, 0.3600000143051147, 0.3540000021457672, 0.3479999899864197, 0.3379999995231628, 0.3440000116825104, 0.3300000131130218, 0.3400000035762787, 0.3459999859333038, 0.3339999914169311, 0.3499999940395355, 0.3600000143051147, 0.3440000116825104, 0.3499999940395355, 0.356000006198883, 0.3420000076293945, 0.3479999899864197, 0.3379999995231628, 0.3379999995231628, 0.3459999859333038, 0.356000006198883, 0.328000009059906, 0.3459999859333038, 0.3519999980926513, 0.3499999940395355, 0.3519999980926513, 0.3420000076293945, 0.3499999940395355, 0.3420000076293945, 0.3339999914169311, 0.335999995470047, 0.3379999995231628, 0.3379999995231628, 0.3540000021457672, 0.356000006198883, 0.356000006198883, 0.335999995470047, 0.363999992609024, 0.363999992609024, 0.3499999940395355, 0.356000006198883, 0.3519999980926513, 0.3519999980926513, 0.3540000021457672, 0.3459999859333038, 0.3479999899864197, 0.3519999980926513, 0.3519999980926513, 0.3420000076293945, 0.3440000116825104, 0.3379999995231628, 0.3519999980926513, 0.356000006198883, 0.3420000076293945, 0.3580000102519989, 0.3499999940395355, 0.3619999885559082, 0.3519999980926513, 0.3600000143051147, 0.3459999859333038, 0.3519999980926513, 0.3519999980926513, 0.3499999940395355, 0.3580000102519989, 0.356000006198883, 0.3580000102519989, 0.3600000143051147, 0.3440000116825104, 0.3600000143051147, 0.3440000116825104, 0.3479999899864197, 0.3479999899864197, 0.3580000102519989, 0.3600000143051147, 0.3580000102519989, 0.3540000021457672, 0.3519999980926513, 0.3459999859333038, 0.3459999859333038, 0.3540000021457672, 0.335999995470047, 0.3540000021457672, 0.3540000021457672, 0.3519999980926513, 0.356000006198883, 0.3499999940395355, 0.356000006198883], "label": "RefinedWeb"}, "big-run-sampled_full_filtered_no_dedup": {"x": [0.0, 2.0971520000000003, 4.194304000000001, 6.291456, 8.388608000000001, 10.48576, 12.582912, 14.680064000000002, 16.777216000000003, 18.874368, 20.97152, 23.068672000000003, 25.165824, 27.262976000000002, 28.311552000000002, 29.360128000000003, 31.45728, 33.554432000000006, 35.651584, 37.748736, 39.845888, 41.94304, 44.040192000000005, 46.137344000000006, 48.234496, 50.331648, 52.4288, 54.525952000000004, 56.623104000000005, 58.720256000000006, 60.817408, 62.91456, 65.011712, 67.10886400000001, 69.206016, 71.303168, 73.40032000000001, 75.497472, 77.59462400000001, 79.691776, 81.788928, 83.88608, 85.983232, 88.08038400000001, 90.177536, 92.27468800000001, 94.37184, 96.468992, 98.56614400000001, 100.663296, 102.76044800000001, 104.8576, 106.95475200000001, 109.05190400000001, 111.149056, 113.24620800000001, 115.34336, 117.44051200000001, 119.537664, 121.634816, 123.73196800000001, 125.82912, 127.92627200000001, 130.023424, 132.120576, 134.21772800000002, 136.31488000000002, 138.412032, 140.509184, 142.606336, 144.70348800000002, 146.80064000000002, 148.897792, 150.994944, 153.092096, 155.18924800000002, 157.28640000000001, 159.383552, 161.480704, 163.577856, 165.67500800000002, 167.77216, 169.869312, 171.966464, 174.06361600000002, 176.16076800000002, 178.25792, 180.355072, 182.452224, 184.54937600000002, 186.64652800000002, 188.74368, 190.840832, 192.937984, 195.03513600000002, 197.13228800000002, 199.22944, 201.326592, 203.423744, 205.52089600000002, 207.61804800000002, 209.7152, 211.812352, 213.90950400000003, 216.00665600000002, 218.10380800000001, 220.20096, 222.298112, 224.39526400000003, 226.49241600000002, 228.589568, 230.68672, 232.783872, 234.88102400000002, 236.97817600000002, 239.075328, 241.17248, 243.269632, 245.36678400000002, 247.46393600000002, 249.561088, 251.65824, 253.75539200000003, 255.85254400000002, 257.949696, 260.046848, 262.144, 264.241152, 266.338304, 268.43545600000004, 270.53260800000004, 272.62976000000003, 274.726912, 276.824064, 278.921216, 281.018368, 283.11552, 285.212672, 287.309824, 289.40697600000004, 291.50412800000004, 293.60128000000003, 295.698432, 297.795584, 299.892736, 301.989888, 304.08704, 306.184192, 308.28134400000005, 310.37849600000004, 312.47564800000004, 314.57280000000003, 316.669952, 318.767104, 320.864256, 322.961408, 325.05856, 327.155712, 329.25286400000005, 331.35001600000004, 333.44716800000003, 335.54432, 337.641472, 339.738624, 341.835776, 343.932928, 346.03008, 348.12723200000005, 350.22438400000004], "y": [0.2860000133514404, 0.2560000121593475, 0.2720000147819519, 0.2980000078678131, 0.2840000092983246, 0.2879999876022339, 0.3039999902248382, 0.2860000133514404, 0.2899999916553497, 0.3019999861717224, 0.2960000038146972, 0.3039999902248382, 0.3100000023841858, 0.3160000145435333, 0.3260000050067901, 0.3160000145435333, 0.3260000050067901, 0.3179999887943268, 0.3420000076293945, 0.3219999969005584, 0.328000009059906, 0.3240000009536743, 0.3300000131130218, 0.328000009059906, 0.3199999928474426, 0.3379999995231628, 0.3400000035762787, 0.3240000009536743, 0.3120000064373016, 0.3319999873638153, 0.3260000050067901, 0.3120000064373016, 0.3160000145435333, 0.3140000104904175, 0.3179999887943268, 0.3160000145435333, 0.3199999928474426, 0.3240000009536743, 0.3260000050067901, 0.3179999887943268, 0.3300000131130218, 0.3179999887943268, 0.328000009059906, 0.3240000009536743, 0.328000009059906, 0.3260000050067901, 0.3199999928474426, 0.3400000035762787, 0.3339999914169311, 0.328000009059906, 0.328000009059906, 0.3339999914169311, 0.328000009059906, 0.328000009059906, 0.335999995470047, 0.3580000102519989, 0.3499999940395355, 0.3260000050067901, 0.3499999940395355, 0.3420000076293945, 0.3160000145435333, 0.3339999914169311, 0.335999995470047, 0.3400000035762787, 0.3240000009536743, 0.3319999873638153, 0.3379999995231628, 0.3400000035762787, 0.3379999995231628, 0.3319999873638153, 0.3319999873638153, 0.3440000116825104, 0.3300000131130218, 0.3219999969005584, 0.3260000050067901, 0.3219999969005584, 0.3339999914169311, 0.328000009059906, 0.3300000131130218, 0.3219999969005584, 0.3379999995231628, 0.3400000035762787, 0.3319999873638153, 0.328000009059906, 0.3440000116825104, 0.3339999914169311, 0.328000009059906, 0.3379999995231628, 0.3499999940395355, 0.3339999914169311, 0.3300000131130218, 0.328000009059906, 0.335999995470047, 0.3240000009536743, 0.335999995470047, 0.3240000009536743, 0.3400000035762787, 0.3400000035762787, 0.3420000076293945, 0.3319999873638153, 0.3339999914169311, 0.3300000131130218, 0.3400000035762787, 0.3459999859333038, 0.3400000035762787, 0.3379999995231628, 0.3459999859333038, 0.3379999995231628, 0.3300000131130218, 0.3519999980926513, 0.3379999995231628, 0.356000006198883, 0.335999995470047, 0.3420000076293945, 0.3400000035762787, 0.328000009059906, 0.3540000021457672, 0.3499999940395355, 0.3479999899864197, 0.3440000116825104, 0.3519999980926513, 0.356000006198883, 0.3540000021457672, 0.3440000116825104, 0.3499999940395355, 0.356000006198883, 0.356000006198883, 0.356000006198883, 0.363999992609024, 0.3600000143051147, 0.356000006198883, 0.3479999899864197, 0.356000006198883, 0.3459999859333038, 0.3479999899864197, 0.3619999885559082, 0.363999992609024, 0.3499999940395355, 0.3379999995231628, 0.3479999899864197, 0.3499999940395355, 0.356000006198883, 0.3519999980926513, 0.3540000021457672, 0.3619999885559082, 0.3580000102519989, 0.3540000021457672, 0.356000006198883, 0.3479999899864197, 0.3519999980926513, 0.356000006198883, 0.3499999940395355, 0.3379999995231628, 0.3479999899864197, 0.3499999940395355, 0.3440000116825104, 0.3580000102519989, 0.356000006198883, 0.3499999940395355, 0.3479999899864197, 0.3580000102519989, 0.3519999980926513, 0.3540000021457672, 0.3519999980926513, 0.3540000021457672, 0.356000006198883, 0.363999992609024, 0.356000006198883, 0.356000006198883], "label": "FineWeb filtered only"}, "big-run-fineweb-cross-dedup-fixed": {"x": [0.0, 2.0971520000000003, 4.194304000000001, 6.291456, 8.388608000000001, 10.48576, 12.582912, 14.680064000000002, 16.777216000000003, 18.874368, 20.97152, 23.068672000000003, 25.165824, 27.262976000000002, 29.360128000000003, 31.45728, 33.554432000000006, 35.651584, 37.748736, 39.845888, 41.94304, 44.040192000000005, 46.137344000000006, 48.234496, 50.331648, 52.4288, 54.525952000000004, 56.623104000000005, 58.720256000000006, 60.817408, 62.91456, 65.011712, 67.10886400000001, 69.206016, 71.303168, 73.40032000000001, 75.497472, 77.59462400000001, 79.691776, 81.788928, 83.88608, 85.983232, 88.08038400000001, 90.177536, 92.27468800000001, 94.37184, 96.468992, 98.56614400000001, 100.663296, 102.76044800000001, 104.8576, 106.95475200000001, 109.05190400000001, 111.149056, 113.24620800000001, 115.34336, 117.44051200000001, 119.537664, 121.634816, 123.73196800000001, 125.82912, 127.92627200000001, 130.023424, 132.120576, 134.21772800000002, 136.31488000000002, 138.412032, 140.509184, 142.606336, 144.70348800000002, 146.80064000000002, 148.897792, 150.994944, 153.092096, 155.18924800000002, 157.28640000000001, 159.383552, 161.480704, 163.577856, 165.67500800000002, 167.77216, 169.869312, 171.966464, 174.06361600000002, 176.16076800000002, 178.25792, 180.355072, 182.452224, 184.54937600000002, 186.64652800000002, 188.74368, 190.840832, 192.937984, 195.03513600000002, 197.13228800000002, 199.22944, 201.326592, 203.423744, 205.52089600000002, 207.61804800000002, 209.7152, 211.812352, 213.90950400000003, 216.00665600000002, 218.10380800000001, 220.20096, 222.298112, 224.39526400000003, 226.49241600000002, 228.589568, 230.68672, 232.783872, 234.88102400000002, 236.97817600000002, 239.075328, 241.17248, 243.269632, 245.36678400000002, 247.46393600000002, 249.561088, 251.65824, 253.75539200000003, 255.85254400000002, 257.949696, 260.046848, 262.144, 264.241152, 266.338304, 268.43545600000004, 270.53260800000004, 272.62976000000003, 274.726912, 276.824064, 278.921216, 281.018368, 283.11552, 285.212672, 287.309824, 289.40697600000004, 291.50412800000004, 293.60128000000003, 295.698432, 297.795584, 299.892736, 301.989888, 304.08704, 306.184192, 308.28134400000005, 310.37849600000004, 312.47564800000004, 314.57280000000003, 316.669952, 318.767104, 320.864256, 322.961408, 325.05856, 327.155712, 329.25286400000005, 331.35001600000004, 333.44716800000003, 335.54432, 337.641472, 339.738624, 341.835776, 343.932928, 346.03008, 348.12723200000005, 350.22438400000004], "y": [0.2860000133514404, 0.2460000067949295, 0.2720000147819519, 0.270000010728836, 0.2939999997615814, 0.2960000038146972, 0.3240000009536743, 0.3019999861717224, 0.2879999876022339, 0.3179999887943268, 0.3059999942779541, 0.2899999916553497, 0.3100000023841858, 0.3179999887943268, 0.3219999969005584, 0.3219999969005584, 0.3300000131130218, 0.3140000104904175, 0.3240000009536743, 0.3079999983310699, 0.3260000050067901, 0.3120000064373016, 0.3160000145435333, 0.3179999887943268, 0.3260000050067901, 0.3260000050067901, 0.3240000009536743, 0.3379999995231628, 0.3219999969005584, 0.3319999873638153, 0.3379999995231628, 0.3339999914169311, 0.328000009059906, 0.3319999873638153, 0.3199999928474426, 0.3000000119209289, 0.3260000050067901, 0.3240000009536743, 0.328000009059906, 0.3240000009536743, 0.328000009059906, 0.3260000050067901, 0.3440000116825104, 0.3199999928474426, 0.3319999873638153, 0.3219999969005584, 0.335999995470047, 0.3519999980926513, 0.3379999995231628, 0.328000009059906, 0.3300000131130218, 0.335999995470047, 0.3479999899864197, 0.3459999859333038, 0.3479999899864197, 0.3540000021457672, 0.3479999899864197, 0.3300000131130218, 0.356000006198883, 0.3479999899864197, 0.356000006198883, 0.335999995470047, 0.335999995470047, 0.3479999899864197, 0.3339999914169311, 0.3540000021457672, 0.3300000131130218, 0.3479999899864197, 0.3499999940395355, 0.3400000035762787, 0.3459999859333038, 0.3339999914169311, 0.3479999899864197, 0.335999995470047, 0.3400000035762787, 0.3179999887943268, 0.335999995470047, 0.328000009059906, 0.328000009059906, 0.3540000021457672, 0.3479999899864197, 0.3420000076293945, 0.3580000102519989, 0.3459999859333038, 0.3420000076293945, 0.3459999859333038, 0.3440000116825104, 0.3499999940395355, 0.335999995470047, 0.3540000021457672, 0.356000006198883, 0.3400000035762787, 0.3600000143051147, 0.3580000102519989, 0.3519999980926513, 0.3499999940395355, 0.3540000021457672, 0.3519999980926513, 0.3499999940395355, 0.3440000116825104, 0.356000006198883, 0.3479999899864197, 0.3479999899864197, 0.3440000116825104, 0.3499999940395355, 0.3440000116825104, 0.3519999980926513, 0.3440000116825104, 0.356000006198883, 0.3459999859333038, 0.3580000102519989, 0.356000006198883, 0.3519999980926513, 0.3420000076293945, 0.3379999995231628, 0.3479999899864197, 0.3459999859333038, 0.3499999940395355, 0.3400000035762787, 0.3440000116825104, 0.3420000076293945, 0.3420000076293945, 0.3499999940395355, 0.3459999859333038, 0.3420000076293945, 0.3459999859333038, 0.3459999859333038, 0.3479999899864197, 0.3440000116825104, 0.3720000088214874, 0.3619999885559082, 0.356000006198883, 0.3519999980926513, 0.3459999859333038, 0.3440000116825104, 0.3420000076293945, 0.3580000102519989, 0.3600000143051147, 0.3519999980926513, 0.3600000143051147, 0.3440000116825104, 0.3600000143051147, 0.3619999885559082, 0.3499999940395355, 0.3499999940395355, 0.363999992609024, 0.3580000102519989, 0.3499999940395355, 0.3479999899864197, 0.3479999899864197, 0.3580000102519989, 0.3540000021457672, 0.3600000143051147, 0.3420000076293945, 0.3519999980926513, 0.3440000116825104, 0.3519999980926513, 0.3540000021457672, 0.356000006198883, 0.3459999859333038, 0.3499999940395355, 0.3519999980926513, 0.3580000102519989, 0.3440000116825104, 0.3499999940395355, 0.3580000102519989, 0.3479999899864197, 0.3479999899864197], "label": "FineWeb full MinHash"}}, "layout": {"title": {"text": "Dedup across all dumps does not improve performance"}}}
assets/data/plots/all_dumps_bad/piqa_acc_norm.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"data": {"big-run-refinedweb": {"x": [0.0, 2.0971520000000003, 4.194304000000001, 6.291456, 8.388608000000001, 10.48576, 12.582912, 14.680064000000002, 16.777216000000003, 18.874368, 20.97152, 23.068672000000003, 25.165824, 27.262976000000002, 28.311552000000002, 29.360128000000003, 31.45728, 33.554432000000006, 35.651584, 37.748736, 39.845888, 41.94304, 44.040192000000005, 46.137344000000006, 48.234496, 50.331648, 52.4288, 54.525952000000004, 56.623104000000005, 58.720256000000006, 60.817408, 62.91456, 65.011712, 67.10886400000001, 69.206016, 71.303168, 73.40032000000001, 75.497472, 77.59462400000001, 79.691776, 81.788928, 83.88608, 85.983232, 88.08038400000001, 90.177536, 92.27468800000001, 94.37184, 96.468992, 98.56614400000001, 100.663296, 102.76044800000001, 104.8576, 106.95475200000001, 109.05190400000001, 111.149056, 113.24620800000001, 115.34336, 117.44051200000001, 119.537664, 121.634816, 123.73196800000001, 125.82912, 127.92627200000001, 130.023424, 132.120576, 134.21772800000002, 136.31488000000002, 138.412032, 140.509184, 142.606336, 144.70348800000002, 146.80064000000002, 148.897792, 150.994944, 153.092096, 155.18924800000002, 157.28640000000001, 159.383552, 161.480704, 163.577856, 165.67500800000002, 167.77216, 169.869312, 171.966464, 174.06361600000002, 176.16076800000002, 178.25792, 180.355072, 182.452224, 184.54937600000002, 186.64652800000002, 188.74368, 190.840832, 192.937984, 195.03513600000002, 197.13228800000002, 199.22944, 201.326592, 203.423744, 205.52089600000002, 207.61804800000002, 209.7152, 211.812352, 213.90950400000003, 216.00665600000002, 218.10380800000001, 220.20096, 222.298112, 224.39526400000003, 226.49241600000002, 228.589568, 230.68672, 232.783872, 234.88102400000002, 236.97817600000002, 239.075328, 241.17248, 243.269632, 245.36678400000002, 247.46393600000002, 249.561088, 251.65824, 253.75539200000003, 255.85254400000002, 257.949696, 260.046848, 262.144, 264.241152, 266.338304, 268.43545600000004, 270.53260800000004, 272.62976000000003, 274.726912, 276.824064, 278.921216, 281.018368, 283.11552, 285.212672, 287.309824, 289.40697600000004, 291.50412800000004, 293.60128000000003, 295.698432, 297.795584, 299.892736, 301.989888, 304.08704, 306.184192, 308.28134400000005, 310.37849600000004, 312.47564800000004, 314.57280000000003, 316.669952, 318.767104, 320.864256, 322.961408, 325.05856, 327.155712, 329.25286400000005, 331.35001600000004, 333.44716800000003, 335.54432, 337.641472, 339.738624, 341.835776, 343.932928, 346.03008, 348.12723200000005, 350.22438400000004], "y": [0.5099999904632568, 0.6019999980926514, 0.652999997138977, 0.6710000038146973, 0.6740000247955322, 0.6899999976158142, 0.6919999718666077, 0.6909999847412109, 0.7070000171661377, 0.7089999914169312, 0.7129999995231628, 0.7229999899864197, 0.7120000123977661, 0.7200000286102295, 0.7300000190734863, 0.7279999852180481, 0.7369999885559082, 0.7390000224113464, 0.7350000143051147, 0.7319999933242798, 0.7279999852180481, 0.7269999980926514, 0.7459999918937683, 0.7400000095367432, 0.7390000224113464, 0.7319999933242798, 0.7390000224113464, 0.7379999756813049, 0.7390000224113464, 0.7360000014305115, 0.7440000176429749, 0.7400000095367432, 0.7360000014305115, 0.7480000257492065, 0.7360000014305115, 0.7440000176429749, 0.7459999918937683, 0.7409999966621399, 0.746999979019165, 0.7440000176429749, 0.7450000047683716, 0.753000020980835, 0.7390000224113464, 0.7490000128746033, 0.7419999837875366, 0.7390000224113464, 0.7559999823570251, 0.7519999742507935, 0.7549999952316284, 0.7419999837875366, 0.7490000128746033, 0.7540000081062317, 0.7480000257492065, 0.7450000047683716, 0.7429999709129333, 0.7509999871253967, 0.7549999952316284, 0.7490000128746033, 0.7490000128746033, 0.7400000095367432, 0.753000020980835, 0.75, 0.7509999871253967, 0.7570000290870667, 0.7590000033378601, 0.7570000290870667, 0.7329999804496765, 0.7540000081062317, 0.746999979019165, 0.7409999966621399, 0.7590000033378601, 0.7509999871253967, 0.7570000290870667, 0.75, 0.7540000081062317, 0.7480000257492065, 0.7580000162124634, 0.7639999985694885, 0.7630000114440918, 0.7590000033378601, 0.7549999952316284, 0.7480000257492065, 0.7509999871253967, 0.7570000290870667, 0.75, 0.7540000081062317, 0.7480000257492065, 0.7549999952316284, 0.7559999823570251, 0.7580000162124634, 0.7580000162124634, 0.753000020980835, 0.7490000128746033, 0.7540000081062317, 0.7639999985694885, 0.7580000162124634, 0.7519999742507935, 0.7590000033378601, 0.75, 0.7570000290870667, 0.7620000243186951, 0.7710000276565552, 0.7739999890327454, 0.7620000243186951, 0.7549999952316284, 0.7599999904632568, 0.765999972820282, 0.7680000066757202, 0.7639999985694885, 0.7540000081062317, 0.7649999856948853, 0.7649999856948853, 0.7609999775886536, 0.7549999952316284, 0.765999972820282, 0.7639999985694885, 0.7580000162124634, 0.7710000276565552, 0.7570000290870667, 0.7630000114440918, 0.7580000162124634, 0.7599999904632568, 0.7649999856948853, 0.7670000195503235, 0.7699999809265137, 0.7710000276565552, 0.7559999823570251, 0.7609999775886536, 0.7620000243186951, 0.7620000243186951, 0.7609999775886536, 0.753000020980835, 0.7570000290870667, 0.7620000243186951, 0.7609999775886536, 0.7609999775886536, 0.7559999823570251, 0.7540000081062317, 0.7570000290870667, 0.7639999985694885, 0.7590000033378601, 0.7680000066757202, 0.7680000066757202, 0.765999972820282, 0.765999972820282, 0.7670000195503235, 0.7739999890327454, 0.7649999856948853, 0.7749999761581421, 0.7699999809265137, 0.7639999985694885, 0.7680000066757202, 0.7630000114440918, 0.7680000066757202, 0.7699999809265137, 0.7739999890327454, 0.7749999761581421, 0.765999972820282, 0.7680000066757202, 0.7710000276565552, 0.7680000066757202, 0.765999972820282, 0.7689999938011169, 0.7760000228881836, 0.7710000276565552, 0.7680000066757202, 0.7649999856948853, 0.7720000147819519, 0.7730000019073486], "label": "RefinedWeb"}, "big-run-fineweb-cross-dedup-fixed": {"x": [0.0, 2.0971520000000003, 4.194304000000001, 6.291456, 8.388608000000001, 10.48576, 12.582912, 14.680064000000002, 16.777216000000003, 18.874368, 20.97152, 23.068672000000003, 25.165824, 27.262976000000002, 29.360128000000003, 31.45728, 33.554432000000006, 35.651584, 37.748736, 39.845888, 41.94304, 44.040192000000005, 46.137344000000006, 48.234496, 50.331648, 52.4288, 54.525952000000004, 56.623104000000005, 58.720256000000006, 60.817408, 62.91456, 65.011712, 67.10886400000001, 69.206016, 71.303168, 73.40032000000001, 75.497472, 77.59462400000001, 79.691776, 81.788928, 83.88608, 85.983232, 88.08038400000001, 90.177536, 92.27468800000001, 94.37184, 96.468992, 98.56614400000001, 100.663296, 102.76044800000001, 104.8576, 106.95475200000001, 109.05190400000001, 111.149056, 113.24620800000001, 115.34336, 117.44051200000001, 119.537664, 121.634816, 123.73196800000001, 125.82912, 127.92627200000001, 130.023424, 132.120576, 134.21772800000002, 136.31488000000002, 138.412032, 140.509184, 142.606336, 144.70348800000002, 146.80064000000002, 148.897792, 150.994944, 153.092096, 155.18924800000002, 157.28640000000001, 159.383552, 161.480704, 163.577856, 165.67500800000002, 167.77216, 169.869312, 171.966464, 174.06361600000002, 176.16076800000002, 178.25792, 180.355072, 182.452224, 184.54937600000002, 186.64652800000002, 188.74368, 190.840832, 192.937984, 195.03513600000002, 197.13228800000002, 199.22944, 201.326592, 203.423744, 205.52089600000002, 207.61804800000002, 209.7152, 211.812352, 213.90950400000003, 216.00665600000002, 218.10380800000001, 220.20096, 222.298112, 224.39526400000003, 226.49241600000002, 228.589568, 230.68672, 232.783872, 234.88102400000002, 236.97817600000002, 239.075328, 241.17248, 243.269632, 245.36678400000002, 247.46393600000002, 249.561088, 251.65824, 253.75539200000003, 255.85254400000002, 257.949696, 260.046848, 262.144, 264.241152, 266.338304, 268.43545600000004, 270.53260800000004, 272.62976000000003, 274.726912, 276.824064, 278.921216, 281.018368, 283.11552, 285.212672, 287.309824, 289.40697600000004, 291.50412800000004, 293.60128000000003, 295.698432, 297.795584, 299.892736, 301.989888, 304.08704, 306.184192, 308.28134400000005, 310.37849600000004, 312.47564800000004, 314.57280000000003, 316.669952, 318.767104, 320.864256, 322.961408, 325.05856, 327.155712, 329.25286400000005, 331.35001600000004, 333.44716800000003, 335.54432, 337.641472, 339.738624, 341.835776, 343.932928, 346.03008, 348.12723200000005, 350.22438400000004], "y": [0.5099999904632568, 0.6169999837875366, 0.6359999775886536, 0.6769999861717224, 0.6769999861717224, 0.6970000267028809, 0.6990000009536743, 0.6970000267028809, 0.6959999799728394, 0.7049999833106995, 0.7089999914169312, 0.7179999947547913, 0.7099999785423279, 0.7160000205039978, 0.7260000109672546, 0.7229999899864197, 0.7179999947547913, 0.7210000157356262, 0.7200000286102295, 0.734000027179718, 0.7089999914169312, 0.7229999899864197, 0.7239999771118164, 0.7310000061988831, 0.7300000190734863, 0.7260000109672546, 0.7250000238418579, 0.7239999771118164, 0.7289999723434448, 0.7390000224113464, 0.7229999899864197, 0.7310000061988831, 0.7350000143051147, 0.7289999723434448, 0.734000027179718, 0.7289999723434448, 0.7329999804496765, 0.7300000190734863, 0.7319999933242798, 0.7440000176429749, 0.746999979019165, 0.7310000061988831, 0.7329999804496765, 0.7480000257492065, 0.7429999709129333, 0.7369999885559082, 0.7269999980926514, 0.7269999980926514, 0.7379999756813049, 0.75, 0.7360000014305115, 0.746999979019165, 0.7409999966621399, 0.7369999885559082, 0.7459999918937683, 0.7400000095367432, 0.7409999966621399, 0.746999979019165, 0.7360000014305115, 0.7459999918937683, 0.7400000095367432, 0.7429999709129333, 0.7350000143051147, 0.7390000224113464, 0.7379999756813049, 0.7480000257492065, 0.7329999804496765, 0.734000027179718, 0.7390000224113464, 0.7459999918937683, 0.7360000014305115, 0.7419999837875366, 0.7429999709129333, 0.7400000095367432, 0.7379999756813049, 0.7310000061988831, 0.7360000014305115, 0.7390000224113464, 0.75, 0.7369999885559082, 0.7570000290870667, 0.7409999966621399, 0.7459999918937683, 0.7350000143051147, 0.7459999918937683, 0.7509999871253967, 0.7429999709129333, 0.7419999837875366, 0.7419999837875366, 0.75, 0.7440000176429749, 0.7450000047683716, 0.75, 0.7409999966621399, 0.7490000128746033, 0.7409999966621399, 0.7419999837875366, 0.7429999709129333, 0.7490000128746033, 0.7419999837875366, 0.7419999837875366, 0.75, 0.753000020980835, 0.75, 0.746999979019165, 0.7519999742507935, 0.746999979019165, 0.7570000290870667, 0.7549999952316284, 0.75, 0.7540000081062317, 0.7480000257492065, 0.7490000128746033, 0.7419999837875366, 0.7419999837875366, 0.746999979019165, 0.746999979019165, 0.75, 0.7519999742507935, 0.7580000162124634, 0.7549999952316284, 0.7490000128746033, 0.7480000257492065, 0.7519999742507935, 0.7590000033378601, 0.7450000047683716, 0.75, 0.7440000176429749, 0.7419999837875366, 0.7519999742507935, 0.7450000047683716, 0.753000020980835, 0.7450000047683716, 0.7440000176429749, 0.7559999823570251, 0.7509999871253967, 0.7540000081062317, 0.7440000176429749, 0.7509999871253967, 0.753000020980835, 0.7490000128746033, 0.7570000290870667, 0.7490000128746033, 0.746999979019165, 0.746999979019165, 0.7509999871253967, 0.7509999871253967, 0.7519999742507935, 0.7570000290870667, 0.7540000081062317, 0.7440000176429749, 0.7480000257492065, 0.7509999871253967, 0.7509999871253967, 0.7509999871253967, 0.7549999952316284, 0.75, 0.7559999823570251, 0.746999979019165, 0.7609999775886536, 0.7549999952316284, 0.746999979019165, 0.7490000128746033, 0.753000020980835, 0.753000020980835, 0.7609999775886536, 0.746999979019165, 0.7580000162124634], "label": "FineWeb full MinHash"}, "big-run-sampled_full_filtered_no_dedup": {"x": [0.0, 2.0971520000000003, 4.194304000000001, 6.291456, 8.388608000000001, 10.48576, 12.582912, 14.680064000000002, 16.777216000000003, 18.874368, 20.97152, 23.068672000000003, 25.165824, 27.262976000000002, 28.311552000000002, 29.360128000000003, 31.45728, 33.554432000000006, 35.651584, 37.748736, 39.845888, 41.94304, 44.040192000000005, 46.137344000000006, 48.234496, 50.331648, 52.4288, 54.525952000000004, 56.623104000000005, 58.720256000000006, 60.817408, 62.91456, 65.011712, 67.10886400000001, 69.206016, 71.303168, 73.40032000000001, 75.497472, 77.59462400000001, 79.691776, 81.788928, 83.88608, 85.983232, 88.08038400000001, 90.177536, 92.27468800000001, 94.37184, 96.468992, 98.56614400000001, 100.663296, 102.76044800000001, 104.8576, 106.95475200000001, 109.05190400000001, 111.149056, 113.24620800000001, 115.34336, 117.44051200000001, 119.537664, 121.634816, 123.73196800000001, 125.82912, 127.92627200000001, 130.023424, 132.120576, 134.21772800000002, 136.31488000000002, 138.412032, 140.509184, 142.606336, 144.70348800000002, 146.80064000000002, 148.897792, 150.994944, 153.092096, 155.18924800000002, 157.28640000000001, 159.383552, 161.480704, 163.577856, 165.67500800000002, 167.77216, 169.869312, 171.966464, 174.06361600000002, 176.16076800000002, 178.25792, 180.355072, 182.452224, 184.54937600000002, 186.64652800000002, 188.74368, 190.840832, 192.937984, 195.03513600000002, 197.13228800000002, 199.22944, 201.326592, 203.423744, 205.52089600000002, 207.61804800000002, 209.7152, 211.812352, 213.90950400000003, 216.00665600000002, 218.10380800000001, 220.20096, 222.298112, 224.39526400000003, 226.49241600000002, 228.589568, 230.68672, 232.783872, 234.88102400000002, 236.97817600000002, 239.075328, 241.17248, 243.269632, 245.36678400000002, 247.46393600000002, 249.561088, 251.65824, 253.75539200000003, 255.85254400000002, 257.949696, 260.046848, 262.144, 264.241152, 266.338304, 268.43545600000004, 270.53260800000004, 272.62976000000003, 274.726912, 276.824064, 278.921216, 281.018368, 283.11552, 285.212672, 287.309824, 289.40697600000004, 291.50412800000004, 293.60128000000003, 295.698432, 297.795584, 299.892736, 301.989888, 304.08704, 306.184192, 308.28134400000005, 310.37849600000004, 312.47564800000004, 314.57280000000003, 316.669952, 318.767104, 320.864256, 322.961408, 325.05856, 327.155712, 329.25286400000005, 331.35001600000004, 333.44716800000003, 335.54432, 337.641472, 339.738624, 341.835776, 343.932928, 346.03008, 348.12723200000005, 350.22438400000004], "y": [0.5099999904632568, 0.621999979019165, 0.6439999938011169, 0.6700000166893005, 0.6790000200271606, 0.6869999766349792, 0.6959999799728394, 0.6790000200271606, 0.6880000233650208, 0.7049999833106995, 0.699999988079071, 0.6990000009536743, 0.6940000057220459, 0.7110000252723694, 0.7089999914169312, 0.7120000123977661, 0.7070000171661377, 0.7070000171661377, 0.6990000009536743, 0.7009999752044678, 0.7160000205039978, 0.7200000286102295, 0.7149999737739563, 0.7250000238418579, 0.7210000157356262, 0.722000002861023, 0.7310000061988831, 0.7289999723434448, 0.7319999933242798, 0.7250000238418579, 0.722000002861023, 0.7210000157356262, 0.7170000076293945, 0.7260000109672546, 0.7250000238418579, 0.7210000157356262, 0.7200000286102295, 0.7379999756813049, 0.7239999771118164, 0.7239999771118164, 0.7080000042915344, 0.7289999723434448, 0.7289999723434448, 0.7300000190734863, 0.7329999804496765, 0.7319999933242798, 0.7350000143051147, 0.7390000224113464, 0.7350000143051147, 0.7289999723434448, 0.734000027179718, 0.7329999804496765, 0.7400000095367432, 0.7409999966621399, 0.7310000061988831, 0.7350000143051147, 0.7360000014305115, 0.7360000014305115, 0.7409999966621399, 0.7319999933242798, 0.7409999966621399, 0.7400000095367432, 0.7390000224113464, 0.7329999804496765, 0.7459999918937683, 0.753000020980835, 0.746999979019165, 0.734000027179718, 0.7369999885559082, 0.7419999837875366, 0.734000027179718, 0.7419999837875366, 0.7289999723434448, 0.7350000143051147, 0.7300000190734863, 0.7519999742507935, 0.7390000224113464, 0.7400000095367432, 0.7409999966621399, 0.7429999709129333, 0.7450000047683716, 0.7329999804496765, 0.7260000109672546, 0.7570000290870667, 0.7360000014305115, 0.7519999742507935, 0.7419999837875366, 0.7379999756813049, 0.7390000224113464, 0.7490000128746033, 0.734000027179718, 0.7360000014305115, 0.7390000224113464, 0.7440000176429749, 0.7450000047683716, 0.7319999933242798, 0.7429999709129333, 0.7519999742507935, 0.7540000081062317, 0.7519999742507935, 0.753000020980835, 0.7480000257492065, 0.7440000176429749, 0.7459999918937683, 0.7369999885559082, 0.7419999837875366, 0.7480000257492065, 0.7419999837875366, 0.765999972820282, 0.746999979019165, 0.7459999918937683, 0.7570000290870667, 0.7390000224113464, 0.7409999966621399, 0.7459999918937683, 0.75, 0.7570000290870667, 0.753000020980835, 0.7549999952316284, 0.7519999742507935, 0.7490000128746033, 0.746999979019165, 0.7459999918937683, 0.7459999918937683, 0.746999979019165, 0.7409999966621399, 0.7419999837875366, 0.7459999918937683, 0.7440000176429749, 0.7459999918937683, 0.7490000128746033, 0.7450000047683716, 0.7409999966621399, 0.7419999837875366, 0.7490000128746033, 0.7590000033378601, 0.7549999952316284, 0.7549999952316284, 0.746999979019165, 0.753000020980835, 0.7549999952316284, 0.746999979019165, 0.7580000162124634, 0.7490000128746033, 0.753000020980835, 0.75, 0.75, 0.7540000081062317, 0.7540000081062317, 0.7490000128746033, 0.7570000290870667, 0.7570000290870667, 0.7590000033378601, 0.7559999823570251, 0.7620000243186951, 0.7590000033378601, 0.7509999871253967, 0.7639999985694885, 0.7580000162124634, 0.7599999904632568, 0.7620000243186951, 0.7590000033378601, 0.7609999775886536, 0.7559999823570251, 0.75, 0.7509999871253967, 0.7549999952316284, 0.7540000081062317, 0.7540000081062317], "label": "FineWeb filtered only"}}, "layout": {"title": {"text": "Dedup across all dumps does not improve performance"}}}
assets/data/plots/all_dumps_bad/siqa_acc_norm.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"data": {"big-run-refinedweb": {"x": [0.0, 2.0971520000000003, 4.194304000000001, 6.291456, 8.388608000000001, 10.48576, 12.582912, 14.680064000000002, 16.777216000000003, 18.874368, 20.97152, 23.068672000000003, 25.165824, 27.262976000000002, 28.311552000000002, 29.360128000000003, 31.45728, 33.554432000000006, 35.651584, 37.748736, 39.845888, 41.94304, 44.040192000000005, 46.137344000000006, 48.234496, 50.331648, 52.4288, 54.525952000000004, 56.623104000000005, 58.720256000000006, 60.817408, 62.91456, 65.011712, 67.10886400000001, 69.206016, 71.303168, 73.40032000000001, 75.497472, 77.59462400000001, 79.691776, 81.788928, 83.88608, 85.983232, 88.08038400000001, 90.177536, 92.27468800000001, 94.37184, 96.468992, 98.56614400000001, 100.663296, 102.76044800000001, 104.8576, 106.95475200000001, 109.05190400000001, 111.149056, 113.24620800000001, 115.34336, 117.44051200000001, 119.537664, 121.634816, 123.73196800000001, 125.82912, 127.92627200000001, 130.023424, 132.120576, 134.21772800000002, 136.31488000000002, 138.412032, 140.509184, 142.606336, 144.70348800000002, 146.80064000000002, 148.897792, 150.994944, 153.092096, 155.18924800000002, 157.28640000000001, 159.383552, 161.480704, 163.577856, 165.67500800000002, 167.77216, 169.869312, 171.966464, 174.06361600000002, 176.16076800000002, 178.25792, 180.355072, 182.452224, 184.54937600000002, 186.64652800000002, 188.74368, 190.840832, 192.937984, 195.03513600000002, 197.13228800000002, 199.22944, 201.326592, 203.423744, 205.52089600000002, 207.61804800000002, 209.7152, 211.812352, 213.90950400000003, 216.00665600000002, 218.10380800000001, 220.20096, 222.298112, 224.39526400000003, 226.49241600000002, 228.589568, 230.68672, 232.783872, 234.88102400000002, 236.97817600000002, 239.075328, 241.17248, 243.269632, 245.36678400000002, 247.46393600000002, 249.561088, 251.65824, 253.75539200000003, 255.85254400000002, 257.949696, 260.046848, 262.144, 264.241152, 266.338304, 268.43545600000004, 270.53260800000004, 272.62976000000003, 274.726912, 276.824064, 278.921216, 281.018368, 283.11552, 285.212672, 287.309824, 289.40697600000004, 291.50412800000004, 293.60128000000003, 295.698432, 297.795584, 299.892736, 301.989888, 304.08704, 306.184192, 308.28134400000005, 310.37849600000004, 312.47564800000004, 314.57280000000003, 316.669952, 318.767104, 320.864256, 322.961408, 325.05856, 327.155712, 329.25286400000005, 331.35001600000004, 333.44716800000003, 335.54432, 337.641472, 339.738624, 341.835776, 343.932928, 346.03008, 348.12723200000005, 350.22438400000004], "y": [0.3619999885559082, 0.3980000019073486, 0.3899999856948852, 0.3860000073909759, 0.3919999897480011, 0.402999997138977, 0.3959999978542328, 0.3959999978542328, 0.4070000052452087, 0.4009999930858612, 0.4079999923706054, 0.4009999930858612, 0.3910000026226043, 0.3980000019073486, 0.395000010728836, 0.4129999876022339, 0.4020000100135803, 0.4090000092983246, 0.4120000004768371, 0.4129999876022339, 0.4129999876022339, 0.4099999964237213, 0.4110000133514404, 0.4110000133514404, 0.4090000092983246, 0.4000000059604645, 0.4050000011920929, 0.3939999938011169, 0.3889999985694885, 0.4050000011920929, 0.4099999964237213, 0.3980000019073486, 0.4090000092983246, 0.4079999923706054, 0.4070000052452087, 0.4040000140666961, 0.4129999876022339, 0.4090000092983246, 0.4059999883174896, 0.4090000092983246, 0.4090000092983246, 0.4149999916553497, 0.4059999883174896, 0.4000000059604645, 0.4000000059604645, 0.4070000052452087, 0.402999997138977, 0.4040000140666961, 0.3989999890327453, 0.4020000100135803, 0.4160000085830688, 0.4050000011920929, 0.4110000133514404, 0.4059999883174896, 0.3989999890327453, 0.4169999957084656, 0.4040000140666961, 0.4050000011920929, 0.4149999916553497, 0.4020000100135803, 0.402999997138977, 0.4129999876022339, 0.4009999930858612, 0.4059999883174896, 0.4040000140666961, 0.4099999964237213, 0.414000004529953, 0.4210000038146972, 0.4110000133514404, 0.4070000052452087, 0.4099999964237213, 0.4169999957084656, 0.4070000052452087, 0.4199999868869781, 0.4079999923706054, 0.4180000126361847, 0.4110000133514404, 0.4110000133514404, 0.4189999997615814, 0.414000004529953, 0.4129999876022339, 0.4180000126361847, 0.4070000052452087, 0.4059999883174896, 0.4059999883174896, 0.4129999876022339, 0.4149999916553497, 0.4099999964237213, 0.4009999930858612, 0.4020000100135803, 0.4099999964237213, 0.4169999957084656, 0.4129999876022339, 0.414000004529953, 0.4099999964237213, 0.4189999997615814, 0.4210000038146972, 0.4090000092983246, 0.4079999923706054, 0.4099999964237213, 0.4099999964237213, 0.4129999876022339, 0.4099999964237213, 0.4099999964237213, 0.4110000133514404, 0.4020000100135803, 0.4079999923706054, 0.4079999923706054, 0.414000004529953, 0.4129999876022339, 0.4189999997615814, 0.4129999876022339, 0.4180000126361847, 0.4050000011920929, 0.4230000078678131, 0.4180000126361847, 0.4120000004768371, 0.4149999916553497, 0.4189999997615814, 0.4110000133514404, 0.4160000085830688, 0.4059999883174896, 0.4110000133514404, 0.4110000133514404, 0.4110000133514404, 0.4040000140666961, 0.4149999916553497, 0.414000004529953, 0.4160000085830688, 0.414000004529953, 0.4129999876022339, 0.4120000004768371, 0.4149999916553497, 0.4169999957084656, 0.4110000133514404, 0.414000004529953, 0.4160000085830688, 0.4110000133514404, 0.4120000004768371, 0.4110000133514404, 0.4149999916553497, 0.4129999876022339, 0.4110000133514404, 0.4129999876022339, 0.4099999964237213, 0.4180000126361847, 0.414000004529953, 0.4040000140666961, 0.4099999964237213, 0.4099999964237213, 0.4120000004768371, 0.4149999916553497, 0.4129999876022339, 0.4079999923706054, 0.4040000140666961, 0.4129999876022339, 0.4149999916553497, 0.4120000004768371, 0.402999997138977, 0.4090000092983246, 0.4110000133514404, 0.4090000092983246, 0.4070000052452087, 0.4149999916553497, 0.4070000052452087, 0.4120000004768371, 0.4059999883174896, 0.4059999883174896, 0.4099999964237213], "label": "RefinedWeb"}, "big-run-fineweb-cross-dedup-fixed": {"x": [0.0, 2.0971520000000003, 4.194304000000001, 6.291456, 8.388608000000001, 10.48576, 12.582912, 14.680064000000002, 16.777216000000003, 18.874368, 20.97152, 23.068672000000003, 25.165824, 27.262976000000002, 29.360128000000003, 31.45728, 33.554432000000006, 35.651584, 37.748736, 39.845888, 41.94304, 44.040192000000005, 46.137344000000006, 48.234496, 50.331648, 52.4288, 54.525952000000004, 56.623104000000005, 58.720256000000006, 60.817408, 62.91456, 65.011712, 67.10886400000001, 69.206016, 71.303168, 73.40032000000001, 75.497472, 77.59462400000001, 79.691776, 81.788928, 83.88608, 85.983232, 88.08038400000001, 90.177536, 92.27468800000001, 94.37184, 96.468992, 98.56614400000001, 100.663296, 102.76044800000001, 104.8576, 106.95475200000001, 109.05190400000001, 111.149056, 113.24620800000001, 115.34336, 117.44051200000001, 119.537664, 121.634816, 123.73196800000001, 125.82912, 127.92627200000001, 130.023424, 132.120576, 134.21772800000002, 136.31488000000002, 138.412032, 140.509184, 142.606336, 144.70348800000002, 146.80064000000002, 148.897792, 150.994944, 153.092096, 155.18924800000002, 157.28640000000001, 159.383552, 161.480704, 163.577856, 165.67500800000002, 167.77216, 169.869312, 171.966464, 174.06361600000002, 176.16076800000002, 178.25792, 180.355072, 182.452224, 184.54937600000002, 186.64652800000002, 188.74368, 190.840832, 192.937984, 195.03513600000002, 197.13228800000002, 199.22944, 201.326592, 203.423744, 205.52089600000002, 207.61804800000002, 209.7152, 211.812352, 213.90950400000003, 216.00665600000002, 218.10380800000001, 220.20096, 222.298112, 224.39526400000003, 226.49241600000002, 228.589568, 230.68672, 232.783872, 234.88102400000002, 236.97817600000002, 239.075328, 241.17248, 243.269632, 245.36678400000002, 247.46393600000002, 249.561088, 251.65824, 253.75539200000003, 255.85254400000002, 257.949696, 260.046848, 262.144, 264.241152, 266.338304, 268.43545600000004, 270.53260800000004, 272.62976000000003, 274.726912, 276.824064, 278.921216, 281.018368, 283.11552, 285.212672, 287.309824, 289.40697600000004, 291.50412800000004, 293.60128000000003, 295.698432, 297.795584, 299.892736, 301.989888, 304.08704, 306.184192, 308.28134400000005, 310.37849600000004, 312.47564800000004, 314.57280000000003, 316.669952, 318.767104, 320.864256, 322.961408, 325.05856, 327.155712, 329.25286400000005, 331.35001600000004, 333.44716800000003, 335.54432, 337.641472, 339.738624, 341.835776, 343.932928, 346.03008, 348.12723200000005, 350.22438400000004], "y": [0.3619999885559082, 0.395000010728836, 0.3919999897480011, 0.3819999992847442, 0.3840000033378601, 0.3869999945163727, 0.395000010728836, 0.3959999978542328, 0.4020000100135803, 0.4009999930858612, 0.4079999923706054, 0.402999997138977, 0.4000000059604645, 0.3930000066757202, 0.4050000011920929, 0.4040000140666961, 0.3959999978542328, 0.4009999930858612, 0.4059999883174896, 0.3989999890327453, 0.3970000147819519, 0.4070000052452087, 0.4079999923706054, 0.4000000059604645, 0.3959999978542328, 0.3970000147819519, 0.4009999930858612, 0.3980000019073486, 0.3959999978542328, 0.3970000147819519, 0.4000000059604645, 0.3910000026226043, 0.4110000133514404, 0.4040000140666961, 0.3919999897480011, 0.4160000085830688, 0.4120000004768371, 0.4070000052452087, 0.4000000059604645, 0.4040000140666961, 0.4120000004768371, 0.3939999938011169, 0.4020000100135803, 0.4000000059604645, 0.4090000092983246, 0.4059999883174896, 0.3980000019073486, 0.4210000038146972, 0.402999997138977, 0.4149999916553497, 0.4009999930858612, 0.414000004529953, 0.4129999876022339, 0.4199999868869781, 0.4090000092983246, 0.3989999890327453, 0.4040000140666961, 0.402999997138977, 0.402999997138977, 0.4059999883174896, 0.4050000011920929, 0.4160000085830688, 0.4169999957084656, 0.4079999923706054, 0.402999997138977, 0.4020000100135803, 0.3959999978542328, 0.4169999957084656, 0.3970000147819519, 0.4099999964237213, 0.402999997138977, 0.4059999883174896, 0.402999997138977, 0.3939999938011169, 0.3939999938011169, 0.4020000100135803, 0.3970000147819519, 0.4120000004768371, 0.4040000140666961, 0.4040000140666961, 0.4090000092983246, 0.3980000019073486, 0.4079999923706054, 0.4070000052452087, 0.4099999964237213, 0.3989999890327453, 0.4000000059604645, 0.4070000052452087, 0.3980000019073486, 0.402999997138977, 0.4090000092983246, 0.4040000140666961, 0.3889999985694885, 0.4000000059604645, 0.402999997138977, 0.4050000011920929, 0.395000010728836, 0.4009999930858612, 0.3989999890327453, 0.3970000147819519, 0.4009999930858612, 0.3989999890327453, 0.3970000147819519, 0.4099999964237213, 0.3989999890327453, 0.4070000052452087, 0.4009999930858612, 0.3880000114440918, 0.3959999978542328, 0.3910000026226043, 0.3930000066757202, 0.3980000019073486, 0.402999997138977, 0.4009999930858612, 0.4000000059604645, 0.3919999897480011, 0.3980000019073486, 0.395000010728836, 0.4020000100135803, 0.3989999890327453, 0.4020000100135803, 0.4040000140666961, 0.4070000052452087, 0.4090000092983246, 0.4079999923706054, 0.4099999964237213, 0.4040000140666961, 0.3889999985694885, 0.3989999890327453, 0.4020000100135803, 0.3989999890327453, 0.3970000147819519, 0.4009999930858612, 0.4090000092983246, 0.414000004529953, 0.395000010728836, 0.4009999930858612, 0.4020000100135803, 0.4009999930858612, 0.3980000019073486, 0.402999997138977, 0.3980000019073486, 0.402999997138977, 0.395000010728836, 0.4020000100135803, 0.395000010728836, 0.3989999890327453, 0.3970000147819519, 0.3980000019073486, 0.3980000019073486, 0.3970000147819519, 0.3939999938011169, 0.395000010728836, 0.3989999890327453, 0.3970000147819519, 0.4020000100135803, 0.3930000066757202, 0.3989999890327453, 0.4050000011920929, 0.3930000066757202, 0.4040000140666961, 0.4000000059604645, 0.4020000100135803, 0.3880000114440918, 0.395000010728836, 0.3910000026226043, 0.3980000019073486, 0.4009999930858612], "label": "FineWeb full MinHash"}, "big-run-sampled_full_filtered_no_dedup": {"x": [0.0, 2.0971520000000003, 4.194304000000001, 6.291456, 8.388608000000001, 10.48576, 12.582912, 14.680064000000002, 16.777216000000003, 18.874368, 20.97152, 23.068672000000003, 25.165824, 27.262976000000002, 28.311552000000002, 29.360128000000003, 31.45728, 33.554432000000006, 35.651584, 37.748736, 39.845888, 41.94304, 44.040192000000005, 46.137344000000006, 48.234496, 50.331648, 52.4288, 54.525952000000004, 56.623104000000005, 58.720256000000006, 60.817408, 62.91456, 65.011712, 67.10886400000001, 69.206016, 71.303168, 73.40032000000001, 75.497472, 77.59462400000001, 79.691776, 81.788928, 83.88608, 85.983232, 88.08038400000001, 90.177536, 92.27468800000001, 94.37184, 96.468992, 98.56614400000001, 100.663296, 102.76044800000001, 104.8576, 106.95475200000001, 109.05190400000001, 111.149056, 113.24620800000001, 115.34336, 117.44051200000001, 119.537664, 121.634816, 123.73196800000001, 125.82912, 127.92627200000001, 130.023424, 132.120576, 134.21772800000002, 136.31488000000002, 138.412032, 140.509184, 142.606336, 144.70348800000002, 146.80064000000002, 148.897792, 150.994944, 153.092096, 155.18924800000002, 157.28640000000001, 159.383552, 161.480704, 163.577856, 165.67500800000002, 167.77216, 169.869312, 171.966464, 174.06361600000002, 176.16076800000002, 178.25792, 180.355072, 182.452224, 184.54937600000002, 186.64652800000002, 188.74368, 190.840832, 192.937984, 195.03513600000002, 197.13228800000002, 199.22944, 201.326592, 203.423744, 205.52089600000002, 207.61804800000002, 209.7152, 211.812352, 213.90950400000003, 216.00665600000002, 218.10380800000001, 220.20096, 222.298112, 224.39526400000003, 226.49241600000002, 228.589568, 230.68672, 232.783872, 234.88102400000002, 236.97817600000002, 239.075328, 241.17248, 243.269632, 245.36678400000002, 247.46393600000002, 249.561088, 251.65824, 253.75539200000003, 255.85254400000002, 257.949696, 260.046848, 262.144, 264.241152, 266.338304, 268.43545600000004, 270.53260800000004, 272.62976000000003, 274.726912, 276.824064, 278.921216, 281.018368, 283.11552, 285.212672, 287.309824, 289.40697600000004, 291.50412800000004, 293.60128000000003, 295.698432, 297.795584, 299.892736, 301.989888, 304.08704, 306.184192, 308.28134400000005, 310.37849600000004, 312.47564800000004, 314.57280000000003, 316.669952, 318.767104, 320.864256, 322.961408, 325.05856, 327.155712, 329.25286400000005, 331.35001600000004, 333.44716800000003, 335.54432, 337.641472, 339.738624, 341.835776, 343.932928, 346.03008, 348.12723200000005, 350.22438400000004], "y": [0.3619999885559082, 0.4000000059604645, 0.395000010728836, 0.3959999978542328, 0.4020000100135803, 0.4000000059604645, 0.3959999978542328, 0.3930000066757202, 0.3899999856948852, 0.402999997138977, 0.4009999930858612, 0.3930000066757202, 0.4050000011920929, 0.3939999938011169, 0.4110000133514404, 0.4000000059604645, 0.3989999890327453, 0.3959999978542328, 0.4020000100135803, 0.4000000059604645, 0.3939999938011169, 0.395000010728836, 0.3919999897480011, 0.3980000019073486, 0.3910000026226043, 0.3880000114440918, 0.3959999978542328, 0.3980000019073486, 0.3989999890327453, 0.402999997138977, 0.3959999978542328, 0.3980000019073486, 0.395000010728836, 0.4090000092983246, 0.4090000092983246, 0.3889999985694885, 0.3959999978542328, 0.3880000114440918, 0.3840000033378601, 0.3959999978542328, 0.3880000114440918, 0.3939999938011169, 0.3970000147819519, 0.3910000026226043, 0.3939999938011169, 0.4020000100135803, 0.3980000019073486, 0.3970000147819519, 0.4009999930858612, 0.3919999897480011, 0.3899999856948852, 0.3989999890327453, 0.3860000073909759, 0.3860000073909759, 0.3970000147819519, 0.3959999978542328, 0.3939999938011169, 0.3840000033378601, 0.3869999945163727, 0.402999997138977, 0.4050000011920929, 0.395000010728836, 0.3880000114440918, 0.3869999945163727, 0.3939999938011169, 0.402999997138977, 0.3899999856948852, 0.3910000026226043, 0.3910000026226043, 0.4009999930858612, 0.3919999897480011, 0.3970000147819519, 0.3919999897480011, 0.3930000066757202, 0.3869999945163727, 0.3880000114440918, 0.3849999904632568, 0.3930000066757202, 0.395000010728836, 0.3889999985694885, 0.3959999978542328, 0.3989999890327453, 0.402999997138977, 0.3939999938011169, 0.4000000059604645, 0.4000000059604645, 0.4050000011920929, 0.3989999890327453, 0.3869999945163727, 0.3910000026226043, 0.3889999985694885, 0.3889999985694885, 0.4000000059604645, 0.3910000026226043, 0.3970000147819519, 0.3989999890327453, 0.3989999890327453, 0.3959999978542328, 0.3910000026226043, 0.3880000114440918, 0.3939999938011169, 0.382999986410141, 0.3849999904632568, 0.3959999978542328, 0.3989999890327453, 0.3959999978542328, 0.3880000114440918, 0.3840000033378601, 0.3980000019073486, 0.4000000059604645, 0.4000000059604645, 0.4020000100135803, 0.395000010728836, 0.3910000026226043, 0.3919999897480011, 0.4040000140666961, 0.3989999890327453, 0.4020000100135803, 0.3910000026226043, 0.4009999930858612, 0.3959999978542328, 0.3939999938011169, 0.3930000066757202, 0.3910000026226043, 0.3970000147819519, 0.3880000114440918, 0.3970000147819519, 0.3959999978542328, 0.3889999985694885, 0.3970000147819519, 0.4009999930858612, 0.3970000147819519, 0.3959999978542328, 0.3959999978542328, 0.3989999890327453, 0.4040000140666961, 0.3959999978542328, 0.3980000019073486, 0.3970000147819519, 0.3970000147819519, 0.3989999890327453, 0.4020000100135803, 0.3980000019073486, 0.4000000059604645, 0.4000000059604645, 0.402999997138977, 0.4090000092983246, 0.3970000147819519, 0.4020000100135803, 0.3970000147819519, 0.4009999930858612, 0.3959999978542328, 0.3970000147819519, 0.3989999890327453, 0.3939999938011169, 0.3989999890327453, 0.4000000059604645, 0.4000000059604645, 0.3989999890327453, 0.4050000011920929, 0.4059999883174896, 0.4009999930858612, 0.3989999890327453, 0.3959999978542328, 0.3939999938011169, 0.3970000147819519, 0.4009999930858612, 0.3989999890327453, 0.3939999938011169], "label": "FineWeb filtered only"}}, "layout": {"title": {"text": "Dedup across all dumps does not improve performance"}}}
assets/data/plots/all_dumps_bad/winogrande_acc_norm.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"data": {"big-run-fineweb-cross-dedup-fixed": {"x": [0.0, 2.0971520000000003, 4.194304000000001, 6.291456, 8.388608000000001, 10.48576, 12.582912, 14.680064000000002, 16.777216000000003, 18.874368, 20.97152, 23.068672000000003, 25.165824, 27.262976000000002, 29.360128000000003, 31.45728, 33.554432000000006, 35.651584, 37.748736, 39.845888, 41.94304, 44.040192000000005, 46.137344000000006, 48.234496, 50.331648, 52.4288, 54.525952000000004, 56.623104000000005, 58.720256000000006, 60.817408, 62.91456, 65.011712, 67.10886400000001, 69.206016, 71.303168, 73.40032000000001, 75.497472, 77.59462400000001, 79.691776, 81.788928, 83.88608, 85.983232, 88.08038400000001, 90.177536, 92.27468800000001, 94.37184, 96.468992, 98.56614400000001, 100.663296, 102.76044800000001, 104.8576, 106.95475200000001, 109.05190400000001, 111.149056, 113.24620800000001, 115.34336, 117.44051200000001, 119.537664, 121.634816, 123.73196800000001, 125.82912, 127.92627200000001, 130.023424, 132.120576, 134.21772800000002, 136.31488000000002, 138.412032, 140.509184, 142.606336, 144.70348800000002, 146.80064000000002, 148.897792, 150.994944, 153.092096, 155.18924800000002, 157.28640000000001, 159.383552, 161.480704, 163.577856, 165.67500800000002, 167.77216, 169.869312, 171.966464, 174.06361600000002, 176.16076800000002, 178.25792, 180.355072, 182.452224, 184.54937600000002, 186.64652800000002, 188.74368, 190.840832, 192.937984, 195.03513600000002, 197.13228800000002, 199.22944, 201.326592, 203.423744, 205.52089600000002, 207.61804800000002, 209.7152, 211.812352, 213.90950400000003, 216.00665600000002, 218.10380800000001, 220.20096, 222.298112, 224.39526400000003, 226.49241600000002, 228.589568, 230.68672, 232.783872, 234.88102400000002, 236.97817600000002, 239.075328, 241.17248, 243.269632, 245.36678400000002, 247.46393600000002, 249.561088, 251.65824, 253.75539200000003, 255.85254400000002, 257.949696, 260.046848, 262.144, 264.241152, 266.338304, 268.43545600000004, 270.53260800000004, 272.62976000000003, 274.726912, 276.824064, 278.921216, 281.018368, 283.11552, 285.212672, 287.309824, 289.40697600000004, 291.50412800000004, 293.60128000000003, 295.698432, 297.795584, 299.892736, 301.989888, 304.08704, 306.184192, 308.28134400000005, 310.37849600000004, 312.47564800000004, 314.57280000000003, 316.669952, 318.767104, 320.864256, 322.961408, 325.05856, 327.155712, 329.25286400000005, 331.35001600000004, 333.44716800000003, 335.54432, 337.641472, 339.738624, 341.835776, 343.932928, 346.03008, 348.12723200000005, 350.22438400000004], "y": [0.4970000088214874, 0.4869999885559082, 0.4959999918937683, 0.4979999959468841, 0.5099999904632568, 0.515999972820282, 0.5080000162124634, 0.5249999761581421, 0.5239999890327454, 0.5299999713897705, 0.5239999890327454, 0.5149999856948853, 0.5270000100135803, 0.5249999761581421, 0.5180000066757202, 0.5220000147819519, 0.5329999923706055, 0.5289999842643738, 0.5239999890327454, 0.5299999713897705, 0.5230000019073486, 0.5130000114440918, 0.5180000066757202, 0.5299999713897705, 0.5199999809265137, 0.5270000100135803, 0.5230000019073486, 0.5299999713897705, 0.5320000052452087, 0.5429999828338623, 0.527999997138977, 0.5379999876022339, 0.527999997138977, 0.5419999957084656, 0.5329999923706055, 0.5450000166893005, 0.5320000052452087, 0.5410000085830688, 0.5249999761581421, 0.5400000214576721, 0.5249999761581421, 0.5289999842643738, 0.5320000052452087, 0.5339999794960022, 0.5320000052452087, 0.5350000262260437, 0.5400000214576721, 0.5450000166893005, 0.5440000295639038, 0.5400000214576721, 0.5379999876022339, 0.5350000262260437, 0.5410000085830688, 0.5490000247955322, 0.531000018119812, 0.5389999747276306, 0.546999990940094, 0.5529999732971191, 0.5370000004768372, 0.5440000295639038, 0.5400000214576721, 0.5490000247955322, 0.550000011920929, 0.5580000281333923, 0.5609999895095825, 0.5429999828338623, 0.5529999732971191, 0.5519999861717224, 0.5450000166893005, 0.550000011920929, 0.5379999876022339, 0.5490000247955322, 0.5460000038146973, 0.5419999957084656, 0.5569999814033508, 0.5509999990463257, 0.5490000247955322, 0.5529999732971191, 0.5479999780654907, 0.5590000152587891, 0.5479999780654907, 0.5509999990463257, 0.5440000295639038, 0.5509999990463257, 0.5540000200271606, 0.5559999942779541, 0.5630000233650208, 0.5649999976158142, 0.5640000104904175, 0.5649999976158142, 0.5490000247955322, 0.5709999799728394, 0.5659999847412109, 0.5630000233650208, 0.5640000104904175, 0.5580000281333923, 0.546999990940094, 0.5550000071525574, 0.5580000281333923, 0.5429999828338623, 0.5440000295639038, 0.5569999814033508, 0.5569999814033508, 0.5540000200271606, 0.5550000071525574, 0.5649999976158142, 0.5540000200271606, 0.5630000233650208, 0.5609999895095825, 0.5580000281333923, 0.5509999990463257, 0.5550000071525574, 0.5550000071525574, 0.5519999861717224, 0.5609999895095825, 0.5630000233650208, 0.5509999990463257, 0.550000011920929, 0.5490000247955322, 0.5540000200271606, 0.550000011920929, 0.5529999732971191, 0.5460000038146973, 0.550000011920929, 0.5529999732971191, 0.5519999861717224, 0.5529999732971191, 0.5609999895095825, 0.5590000152587891, 0.5550000071525574, 0.550000011920929, 0.5609999895095825, 0.5619999766349792, 0.5609999895095825, 0.5540000200271606, 0.550000011920929, 0.5600000023841858, 0.5559999942779541, 0.5609999895095825, 0.5569999814033508, 0.5600000023841858, 0.5680000185966492, 0.5580000281333923, 0.5559999942779541, 0.5569999814033508, 0.5669999718666077, 0.5709999799728394, 0.5640000104904175, 0.5569999814033508, 0.5600000023841858, 0.5569999814033508, 0.5649999976158142, 0.5600000023841858, 0.5580000281333923, 0.5609999895095825, 0.5590000152587891, 0.5640000104904175, 0.5529999732971191, 0.5640000104904175, 0.5649999976158142, 0.5659999847412109, 0.5630000233650208, 0.5630000233650208, 0.5619999766349792, 0.5609999895095825, 0.5559999942779541, 0.5529999732971191, 0.5600000023841858], "label": "FineWeb full MinHash"}, "big-run-sampled_full_filtered_no_dedup": {"x": [0.0, 2.0971520000000003, 4.194304000000001, 6.291456, 8.388608000000001, 10.48576, 12.582912, 14.680064000000002, 16.777216000000003, 18.874368, 20.97152, 23.068672000000003, 25.165824, 27.262976000000002, 28.311552000000002, 29.360128000000003, 31.45728, 33.554432000000006, 35.651584, 37.748736, 39.845888, 41.94304, 44.040192000000005, 46.137344000000006, 48.234496, 50.331648, 52.4288, 54.525952000000004, 56.623104000000005, 58.720256000000006, 60.817408, 62.91456, 65.011712, 67.10886400000001, 69.206016, 71.303168, 73.40032000000001, 75.497472, 77.59462400000001, 79.691776, 81.788928, 83.88608, 85.983232, 88.08038400000001, 90.177536, 92.27468800000001, 94.37184, 96.468992, 98.56614400000001, 100.663296, 102.76044800000001, 104.8576, 106.95475200000001, 109.05190400000001, 111.149056, 113.24620800000001, 115.34336, 117.44051200000001, 119.537664, 121.634816, 123.73196800000001, 125.82912, 127.92627200000001, 130.023424, 132.120576, 134.21772800000002, 136.31488000000002, 138.412032, 140.509184, 142.606336, 144.70348800000002, 146.80064000000002, 148.897792, 150.994944, 153.092096, 155.18924800000002, 157.28640000000001, 159.383552, 161.480704, 163.577856, 165.67500800000002, 167.77216, 169.869312, 171.966464, 174.06361600000002, 176.16076800000002, 178.25792, 180.355072, 182.452224, 184.54937600000002, 186.64652800000002, 188.74368, 190.840832, 192.937984, 195.03513600000002, 197.13228800000002, 199.22944, 201.326592, 203.423744, 205.52089600000002, 207.61804800000002, 209.7152, 211.812352, 213.90950400000003, 216.00665600000002, 218.10380800000001, 220.20096, 222.298112, 224.39526400000003, 226.49241600000002, 228.589568, 230.68672, 232.783872, 234.88102400000002, 236.97817600000002, 239.075328, 241.17248, 243.269632, 245.36678400000002, 247.46393600000002, 249.561088, 251.65824, 253.75539200000003, 255.85254400000002, 257.949696, 260.046848, 262.144, 264.241152, 266.338304, 268.43545600000004, 270.53260800000004, 272.62976000000003, 274.726912, 276.824064, 278.921216, 281.018368, 283.11552, 285.212672, 287.309824, 289.40697600000004, 291.50412800000004, 293.60128000000003, 295.698432, 297.795584, 299.892736, 301.989888, 304.08704, 306.184192, 308.28134400000005, 310.37849600000004, 312.47564800000004, 314.57280000000003, 316.669952, 318.767104, 320.864256, 322.961408, 325.05856, 327.155712, 329.25286400000005, 331.35001600000004, 333.44716800000003, 335.54432, 337.641472, 339.738624, 341.835776, 343.932928, 346.03008, 348.12723200000005, 350.22438400000004], "y": [0.4970000088214874, 0.5239999890327454, 0.4900000095367431, 0.5040000081062317, 0.5099999904632568, 0.4990000128746032, 0.5170000195503235, 0.5040000081062317, 0.5009999871253967, 0.5230000019073486, 0.5109999775886536, 0.5059999823570251, 0.5130000114440918, 0.5090000033378601, 0.5180000066757202, 0.5220000147819519, 0.5189999938011169, 0.5180000066757202, 0.5220000147819519, 0.5120000243186951, 0.5460000038146973, 0.5239999890327454, 0.5289999842643738, 0.5440000295639038, 0.5339999794960022, 0.5299999713897705, 0.5260000228881836, 0.5360000133514404, 0.5339999794960022, 0.5360000133514404, 0.5299999713897705, 0.5180000066757202, 0.5249999761581421, 0.5440000295639038, 0.5299999713897705, 0.5339999794960022, 0.5239999890327454, 0.527999997138977, 0.5139999985694885, 0.5289999842643738, 0.5360000133514404, 0.5260000228881836, 0.5389999747276306, 0.5460000038146973, 0.5270000100135803, 0.5339999794960022, 0.5320000052452087, 0.5329999923706055, 0.5260000228881836, 0.5220000147819519, 0.5260000228881836, 0.5379999876022339, 0.5410000085830688, 0.5350000262260437, 0.5389999747276306, 0.5320000052452087, 0.5389999747276306, 0.5379999876022339, 0.5329999923706055, 0.5270000100135803, 0.5170000195503235, 0.5329999923706055, 0.5370000004768372, 0.5379999876022339, 0.5249999761581421, 0.5479999780654907, 0.546999990940094, 0.5400000214576721, 0.5440000295639038, 0.5360000133514404, 0.5450000166893005, 0.5440000295639038, 0.5370000004768372, 0.5370000004768372, 0.5479999780654907, 0.5379999876022339, 0.5400000214576721, 0.5479999780654907, 0.5379999876022339, 0.5509999990463257, 0.5440000295639038, 0.5379999876022339, 0.550000011920929, 0.5389999747276306, 0.5370000004768372, 0.5379999876022339, 0.5419999957084656, 0.5360000133514404, 0.5509999990463257, 0.5360000133514404, 0.5419999957084656, 0.5419999957084656, 0.550000011920929, 0.5360000133514404, 0.5519999861717224, 0.5540000200271606, 0.546999990940094, 0.5370000004768372, 0.5379999876022339, 0.5519999861717224, 0.5329999923706055, 0.5400000214576721, 0.5429999828338623, 0.550000011920929, 0.5490000247955322, 0.5360000133514404, 0.550000011920929, 0.5569999814033508, 0.5490000247955322, 0.5490000247955322, 0.5479999780654907, 0.5350000262260437, 0.5490000247955322, 0.5370000004768372, 0.5440000295639038, 0.5329999923706055, 0.5440000295639038, 0.5429999828338623, 0.5389999747276306, 0.5450000166893005, 0.5320000052452087, 0.5450000166893005, 0.5400000214576721, 0.5419999957084656, 0.5460000038146973, 0.5370000004768372, 0.5400000214576721, 0.5460000038146973, 0.5370000004768372, 0.5370000004768372, 0.5460000038146973, 0.5400000214576721, 0.5490000247955322, 0.5529999732971191, 0.5379999876022339, 0.5460000038146973, 0.5450000166893005, 0.5429999828338623, 0.5460000038146973, 0.5400000214576721, 0.5479999780654907, 0.5460000038146973, 0.5540000200271606, 0.5400000214576721, 0.5350000262260437, 0.5490000247955322, 0.5460000038146973, 0.5460000038146973, 0.5509999990463257, 0.5410000085830688, 0.5429999828338623, 0.5379999876022339, 0.5450000166893005, 0.5389999747276306, 0.5400000214576721, 0.5400000214576721, 0.550000011920929, 0.5440000295639038, 0.5389999747276306, 0.5450000166893005, 0.5400000214576721, 0.5389999747276306, 0.5419999957084656, 0.5410000085830688, 0.5440000295639038, 0.5519999861717224, 0.5479999780654907, 0.5450000166893005, 0.5569999814033508], "label": "FineWeb filtered only"}, "big-run-refinedweb": {"x": [0.0, 2.0971520000000003, 4.194304000000001, 6.291456, 8.388608000000001, 10.48576, 12.582912, 14.680064000000002, 16.777216000000003, 18.874368, 20.97152, 23.068672000000003, 25.165824, 27.262976000000002, 28.311552000000002, 29.360128000000003, 31.45728, 33.554432000000006, 35.651584, 37.748736, 39.845888, 41.94304, 44.040192000000005, 46.137344000000006, 48.234496, 50.331648, 52.4288, 54.525952000000004, 56.623104000000005, 58.720256000000006, 60.817408, 62.91456, 65.011712, 67.10886400000001, 69.206016, 71.303168, 73.40032000000001, 75.497472, 77.59462400000001, 79.691776, 81.788928, 83.88608, 85.983232, 88.08038400000001, 90.177536, 92.27468800000001, 94.37184, 96.468992, 98.56614400000001, 100.663296, 102.76044800000001, 104.8576, 106.95475200000001, 109.05190400000001, 111.149056, 113.24620800000001, 115.34336, 117.44051200000001, 119.537664, 121.634816, 123.73196800000001, 125.82912, 127.92627200000001, 130.023424, 132.120576, 134.21772800000002, 136.31488000000002, 138.412032, 140.509184, 142.606336, 144.70348800000002, 146.80064000000002, 148.897792, 150.994944, 153.092096, 155.18924800000002, 157.28640000000001, 159.383552, 161.480704, 163.577856, 165.67500800000002, 167.77216, 169.869312, 171.966464, 174.06361600000002, 176.16076800000002, 178.25792, 180.355072, 182.452224, 184.54937600000002, 186.64652800000002, 188.74368, 190.840832, 192.937984, 195.03513600000002, 197.13228800000002, 199.22944, 201.326592, 203.423744, 205.52089600000002, 207.61804800000002, 209.7152, 211.812352, 213.90950400000003, 216.00665600000002, 218.10380800000001, 220.20096, 222.298112, 224.39526400000003, 226.49241600000002, 228.589568, 230.68672, 232.783872, 234.88102400000002, 236.97817600000002, 239.075328, 241.17248, 243.269632, 245.36678400000002, 247.46393600000002, 249.561088, 251.65824, 253.75539200000003, 255.85254400000002, 257.949696, 260.046848, 262.144, 264.241152, 266.338304, 268.43545600000004, 270.53260800000004, 272.62976000000003, 274.726912, 276.824064, 278.921216, 281.018368, 283.11552, 285.212672, 287.309824, 289.40697600000004, 291.50412800000004, 293.60128000000003, 295.698432, 297.795584, 299.892736, 301.989888, 304.08704, 306.184192, 308.28134400000005, 310.37849600000004, 312.47564800000004, 314.57280000000003, 316.669952, 318.767104, 320.864256, 322.961408, 325.05856, 327.155712, 329.25286400000005, 331.35001600000004, 333.44716800000003, 335.54432, 337.641472, 339.738624, 341.835776, 343.932928, 346.03008, 348.12723200000005, 350.22438400000004], "y": [0.4970000088214874, 0.5, 0.4979999959468841, 0.4950000047683716, 0.4950000047683716, 0.5049999952316284, 0.5329999923706055, 0.5220000147819519, 0.5139999985694885, 0.5339999794960022, 0.5130000114440918, 0.5389999747276306, 0.5400000214576721, 0.5270000100135803, 0.5320000052452087, 0.5260000228881836, 0.5370000004768372, 0.527999997138977, 0.5289999842643738, 0.5339999794960022, 0.5270000100135803, 0.531000018119812, 0.527999997138977, 0.5400000214576721, 0.5479999780654907, 0.550000011920929, 0.5400000214576721, 0.5350000262260437, 0.5410000085830688, 0.5379999876022339, 0.5299999713897705, 0.5490000247955322, 0.5509999990463257, 0.5519999861717224, 0.5429999828338623, 0.5429999828338623, 0.5440000295639038, 0.5379999876022339, 0.5379999876022339, 0.5419999957084656, 0.5609999895095825, 0.5540000200271606, 0.5370000004768372, 0.5440000295639038, 0.5410000085830688, 0.5379999876022339, 0.5329999923706055, 0.5419999957084656, 0.5419999957084656, 0.5519999861717224, 0.550000011920929, 0.5509999990463257, 0.5400000214576721, 0.5450000166893005, 0.5509999990463257, 0.5569999814033508, 0.5550000071525574, 0.5590000152587891, 0.5479999780654907, 0.5550000071525574, 0.5440000295639038, 0.5460000038146973, 0.546999990940094, 0.5559999942779541, 0.5550000071525574, 0.5490000247955322, 0.5440000295639038, 0.546999990940094, 0.5450000166893005, 0.546999990940094, 0.5649999976158142, 0.5490000247955322, 0.5519999861717224, 0.550000011920929, 0.5509999990463257, 0.5519999861717224, 0.5519999861717224, 0.5529999732971191, 0.5490000247955322, 0.546999990940094, 0.550000011920929, 0.5720000267028809, 0.5619999766349792, 0.5490000247955322, 0.5680000185966492, 0.5519999861717224, 0.5569999814033508, 0.5509999990463257, 0.5619999766349792, 0.5630000233650208, 0.5529999732971191, 0.5619999766349792, 0.5609999895095825, 0.550000011920929, 0.5479999780654907, 0.5529999732971191, 0.5519999861717224, 0.5580000281333923, 0.5590000152587891, 0.5529999732971191, 0.550000011920929, 0.5680000185966492, 0.5580000281333923, 0.5630000233650208, 0.5630000233650208, 0.5559999942779541, 0.5649999976158142, 0.5569999814033508, 0.5649999976158142, 0.5659999847412109, 0.5559999942779541, 0.5659999847412109, 0.5630000233650208, 0.5509999990463257, 0.5669999718666077, 0.5669999718666077, 0.5479999780654907, 0.5540000200271606, 0.5580000281333923, 0.5519999861717224, 0.5590000152587891, 0.5590000152587891, 0.5619999766349792, 0.5509999990463257, 0.546999990940094, 0.5609999895095825, 0.5540000200271606, 0.5630000233650208, 0.5580000281333923, 0.5559999942779541, 0.5680000185966492, 0.5649999976158142, 0.5619999766349792, 0.5580000281333923, 0.5630000233650208, 0.5559999942779541, 0.5540000200271606, 0.5540000200271606, 0.5569999814033508, 0.5619999766349792, 0.5559999942779541, 0.5600000023841858, 0.5460000038146973, 0.5429999828338623, 0.5580000281333923, 0.5550000071525574, 0.5580000281333923, 0.5540000200271606, 0.5609999895095825, 0.5519999861717224, 0.550000011920929, 0.5519999861717224, 0.5590000152587891, 0.5619999766349792, 0.5600000023841858, 0.5590000152587891, 0.5690000057220459, 0.5640000104904175, 0.5580000281333923, 0.5559999942779541, 0.5569999814033508, 0.5569999814033508, 0.5540000200271606, 0.5640000104904175, 0.5600000023841858, 0.5550000071525574, 0.5640000104904175, 0.5600000023841858, 0.5540000200271606], "label": "RefinedWeb"}}, "layout": {"title": {"text": "Dedup across all dumps does not improve performance"}}}
assets/data/plots/all_filtering_steps/agg_score.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"data": {"big-run-fineweb-v1-all-dumps": {"x": [0.0, 2.0971520000000003, 4.194304000000001, 6.291456, 8.388608000000001, 10.48576, 12.582912, 14.680064000000002, 16.777216000000003, 18.874368, 20.97152, 23.068672000000003, 25.165824, 27.262976000000002, 29.360128000000003, 31.45728, 33.554432000000006, 35.651584, 37.748736, 39.845888, 41.94304, 44.040192000000005, 46.137344000000006, 48.234496, 50.331648, 52.4288, 54.525952000000004, 56.623104000000005, 58.720256000000006, 60.817408, 62.91456, 65.011712, 67.10886400000001, 69.206016, 71.303168, 73.40032000000001, 75.497472, 77.59462400000001, 79.691776, 81.788928, 83.88608, 85.983232, 88.08038400000001, 90.177536, 92.27468800000001, 94.37184, 96.468992, 98.56614400000001, 100.663296, 102.76044800000001, 104.8576, 106.95475200000001, 109.05190400000001, 111.149056, 113.24620800000001, 115.34336, 117.44051200000001, 119.537664, 121.634816, 123.73196800000001, 125.82912, 127.92627200000001, 130.023424, 132.120576, 134.21772800000002, 136.31488000000002, 138.412032, 140.509184, 142.606336, 144.70348800000002, 146.80064000000002, 148.897792, 150.994944, 153.092096, 155.18924800000002, 157.28640000000001, 159.383552, 161.480704, 163.577856, 165.67500800000002, 167.77216, 169.869312, 171.966464, 174.06361600000002, 176.16076800000002, 178.25792, 180.355072, 182.452224, 184.54937600000002, 186.64652800000002, 188.74368, 190.840832, 192.937984, 195.03513600000002, 197.13228800000002, 199.22944, 201.326592, 203.423744, 205.52089600000002, 207.61804800000002, 209.7152, 211.812352, 213.90950400000003, 216.00665600000002, 218.10380800000001, 220.20096, 222.298112, 224.39526400000003, 226.49241600000002, 228.589568, 230.68672, 232.783872, 234.88102400000002, 236.97817600000002, 239.075328, 241.17248, 243.269632, 245.36678400000002, 247.46393600000002, 249.561088, 251.65824, 253.75539200000003, 255.85254400000002, 257.949696, 260.046848, 262.144, 264.241152, 266.338304, 268.43545600000004, 270.53260800000004, 272.62976000000003, 274.726912, 276.824064, 278.921216, 281.018368, 283.11552, 285.212672, 287.309824, 289.40697600000004, 291.50412800000004, 293.60128000000003, 295.698432, 297.795584, 299.892736, 301.989888, 304.08704, 306.184192, 308.28134400000005, 310.37849600000004, 312.47564800000004, 314.57280000000003, 316.669952, 318.767104, 320.864256, 322.961408, 325.05856, 327.155712, 329.25286400000005, 331.35001600000004, 333.44716800000003, 335.54432, 337.641472, 339.738624, 341.835776, 343.932928, 346.03008, 348.12723200000005, 350.22438400000004], "y": [0.3308933284133672, 0.3552836012095213, 0.3781493119895458, 0.3866849727928638, 0.4050675220787525, 0.4032807648181915, 0.4174600429832935, 0.4206059761345386, 0.427497424185276, 0.4316632784903049, 0.4385909177362919, 0.4334069043397903, 0.4360812865197658, 0.4404293224215507, 0.4385774843394756, 0.4407080821692943, 0.4467254020273685, 0.4470436163246631, 0.4486658610403538, 0.4459679573774338, 0.4454015754163265, 0.4515932314097881, 0.4482216536998749, 0.4484201297163963, 0.455057855695486, 0.4526158757507801, 0.453176885843277, 0.450159091502428, 0.4516039006412029, 0.4549933448433876, 0.4555377587676048, 0.4575010798871517, 0.4577344059944153, 0.4540543705224991, 0.4537974074482918, 0.4611785635352134, 0.4586966186761856, 0.4594406597316265, 0.4598931074142456, 0.457538403570652, 0.4591932781040668, 0.4636382386088371, 0.4582749158143997, 0.4625946804881096, 0.4633439630270004, 0.4666871763765812, 0.4649887941777706, 0.4671247974038124, 0.4665776938199997, 0.4672530107200145, 0.4666078947484493, 0.4666155055165291, 0.4727727174758911, 0.467480719089508, 0.4681386984884739, 0.4651658721268177, 0.4668439887464046, 0.4671731516718864, 0.4719251021742821, 0.4699816256761551, 0.4723306186497211, 0.4686817973852157, 0.468911949545145, 0.4714248068630695, 0.4724191203713417, 0.4700912088155746, 0.4685601107776165, 0.4716645181179046, 0.4724556542932987, 0.4670086726546287, 0.4703365340828895, 0.4698334187269211, 0.471625205129385, 0.4688323326408863, 0.4735309742391109, 0.4729253277182579, 0.4747676998376846, 0.4723741039633751, 0.4764323942363262, 0.4737579710781574, 0.4758132360875606, 0.4755662642419338, 0.4730159305036068, 0.4787128046154976, 0.4740134924650192, 0.4785312972962856, 0.4783577285706997, 0.4752367511391639, 0.474204134196043, 0.4737414345145225, 0.4780189953744411, 0.477523285895586, 0.4751617163419723, 0.4776186011731624, 0.4769949465990066, 0.4790891669690609, 0.479917362332344, 0.4771673306822777, 0.4825278185307979, 0.4811677671968937, 0.4787211790680885, 0.4817796200513839, 0.4819813556969166, 0.4802381917834282, 0.4810985140502453, 0.481117732822895, 0.4791575670242309, 0.4798801243305206, 0.4829155020415783, 0.4822122864425182, 0.4827562272548675, 0.4839778505265713, 0.4820474348962307, 0.4858015961945057, 0.4826803356409073, 0.4831027314066887, 0.4827458150684833, 0.4819435514509678, 0.4836879819631576, 0.4835174195468426, 0.4855972006917, 0.4871680215001106, 0.4840429238975048, 0.4827739149332046, 0.4881435632705688, 0.4871019721031189, 0.486987367272377, 0.4836358055472374, 0.4867987409234047, 0.4869474284350872, 0.4886575266718864, 0.4855775311589241, 0.4863000251352787, 0.4841057248413563, 0.488163661211729, 0.4904011823236942, 0.4870587214827537, 0.4884037151932716, 0.4873756393790245, 0.4925794936716556, 0.4874482750892639, 0.4898910224437713, 0.4893574342131614, 0.4888269044458866, 0.4887814335525036, 0.4876748844981193, 0.4853886738419533, 0.4878034777939319, 0.4911742769181728, 0.4905468784272671, 0.4896938055753708, 0.4875142201781273, 0.4900367334485054, 0.4900274313986301, 0.4905461706221103, 0.4891181476414203, 0.4881824217736721, 0.4902780950069427, 0.4895042479038238, 0.4890727028250694, 0.4897591508924961, 0.4879062548279762, 0.4897833876311779, 0.4902243539690971, 0.4884885586798191, 0.4880276583135128, 0.4927133433520794, 0.4899616949260235], "label": "FineWeb: id mh + C4 + custom filters"}, "big-run-sampled-fineweb-c4-filters": {"x": [0.0, 2.0971520000000003, 4.194304000000001, 6.291456, 8.388608000000001, 10.48576, 12.582912, 14.680064000000002, 16.777216000000003, 18.874368, 20.97152, 23.068672000000003, 25.165824, 27.262976000000002, 29.360128000000003, 31.45728, 33.554432000000006, 35.651584, 37.748736, 39.845888, 41.94304, 44.040192000000005, 46.137344000000006, 48.234496, 50.331648, 52.4288, 54.525952000000004, 56.623104000000005, 58.720256000000006, 60.817408, 62.91456, 65.011712, 67.10886400000001, 69.206016, 71.303168, 73.40032000000001, 75.497472, 77.59462400000001, 79.691776, 81.788928, 83.88608, 85.983232, 88.08038400000001, 90.177536, 92.27468800000001, 94.37184, 96.468992, 98.56614400000001, 100.663296, 102.76044800000001, 104.8576, 106.95475200000001, 109.05190400000001, 111.149056, 113.24620800000001, 115.34336, 117.44051200000001, 119.537664, 121.634816, 123.73196800000001, 125.82912, 127.92627200000001, 130.023424, 132.120576, 134.21772800000002, 136.31488000000002, 138.412032, 140.509184, 142.606336, 144.70348800000002, 146.80064000000002, 148.897792, 150.994944, 153.092096, 155.18924800000002, 157.28640000000001, 159.383552, 161.480704, 163.577856, 165.67500800000002, 167.77216, 169.869312, 171.966464, 174.06361600000002, 176.16076800000002, 178.25792, 180.355072, 182.452224, 184.54937600000002, 186.64652800000002, 188.74368, 190.840832, 192.937984, 195.03513600000002, 197.13228800000002, 199.22944, 201.326592, 203.423744, 205.52089600000002, 207.61804800000002, 209.7152, 211.812352, 213.90950400000003, 216.00665600000002, 218.10380800000001, 220.20096, 222.298112, 224.39526400000003, 226.49241600000002, 228.589568, 230.68672, 232.783872, 234.88102400000002, 236.97817600000002, 239.075328, 241.17248, 243.269632, 245.36678400000002, 247.46393600000002, 249.561088, 251.65824, 253.75539200000003, 255.85254400000002, 257.949696, 260.046848, 262.144, 264.241152, 266.338304, 268.43545600000004, 270.53260800000004, 272.62976000000003, 274.726912, 276.824064, 278.921216, 281.018368, 283.11552, 285.212672, 287.309824, 289.40697600000004, 291.50412800000004, 293.60128000000003, 295.698432, 297.795584, 299.892736, 301.989888, 304.08704, 306.184192, 308.28134400000005, 310.37849600000004, 312.47564800000004, 314.57280000000003, 316.669952, 318.767104, 320.864256, 322.961408, 325.05856, 327.155712, 329.25286400000005, 331.35001600000004, 333.44716800000003, 335.54432, 337.641472, 339.738624, 341.835776, 343.932928, 346.03008, 348.12723200000005, 350.22438400000004], "y": [0.3308933284133672, 0.3593025095760822, 0.3753932043910026, 0.3896549865603447, 0.4011945575475693, 0.4079862833023071, 0.4100634902715683, 0.4188448339700699, 0.4182912856340408, 0.4209799654781818, 0.426167830824852, 0.4270535074174404, 0.4293412938714027, 0.4376098960638046, 0.4369498938322067, 0.4447805918753147, 0.4420784451067447, 0.4401859976351261, 0.4450364373624325, 0.4467439614236355, 0.4494622647762298, 0.4474291987717151, 0.4474774301052093, 0.4496959559619427, 0.4504862427711487, 0.4483809620141983, 0.4500409476459026, 0.4506221041083336, 0.4519891515374183, 0.4511651210486889, 0.4493776857852936, 0.4546159133315086, 0.4542211070656776, 0.4540864638984203, 0.4535767734050751, 0.4580400213599205, 0.451940905302763, 0.4536588154733181, 0.4593464843928814, 0.4576366357505321, 0.4563389606773853, 0.4556163437664509, 0.4611873291432857, 0.4606512449681759, 0.4602674432098865, 0.4573654346168041, 0.4579697586596012, 0.4577618762850761, 0.465243399143219, 0.4626524560153484, 0.4652697443962097, 0.4616814218461513, 0.4664025083184242, 0.4648593515157699, 0.4665380977094173, 0.4670920372009277, 0.4651120826601982, 0.4648002386093139, 0.4674604535102844, 0.4694998189806938, 0.4647957049310207, 0.4655059054493904, 0.4694474637508392, 0.4685290567576885, 0.4678448662161827, 0.4666110426187515, 0.466820664703846, 0.4703560136258602, 0.4655868485569954, 0.4657375514507293, 0.4673589915037155, 0.4694744572043419, 0.4697113968431949, 0.4663790501654148, 0.4678909480571747, 0.4731503240764141, 0.4703953340649605, 0.4711540788412094, 0.4689725339412689, 0.4709760397672653, 0.4721849896013737, 0.4684626050293445, 0.4728966951370239, 0.4708623439073562, 0.4755619578063488, 0.4722185768187046, 0.4752251170575619, 0.4724387377500534, 0.4767676629126072, 0.4720797315239906, 0.476152952760458, 0.4784524105489254, 0.472656887024641, 0.4761070720851421, 0.4791567139327526, 0.4773554690182209, 0.4749615713953972, 0.4786102436482906, 0.4776762872934341, 0.4759960658848285, 0.4783963784575462, 0.4794723503291607, 0.4783952049911022, 0.4814380966126919, 0.476895060390234, 0.479157205671072, 0.4783024378120899, 0.4772652834653854, 0.4805076755583286, 0.4786335416138172, 0.4829660281538963, 0.4798073060810566, 0.4846024662256241, 0.4791539534926414, 0.4836216196417808, 0.482492484152317, 0.4832956567406654, 0.4811016321182251, 0.480607770383358, 0.4813096337020397, 0.4819207563996315, 0.482705220580101, 0.4817859195172786, 0.4817019775509834, 0.4848218411207199, 0.4850655570626259, 0.4847046621143818, 0.4811170361936092, 0.4863272421061992, 0.484540831297636, 0.4826735481619835, 0.4844910651445389, 0.4825031049549579, 0.4849743507802486, 0.484294731169939, 0.4857852198183536, 0.4881704896688461, 0.4850401543080807, 0.4885894693434238, 0.4855906665325165, 0.4871751256287098, 0.48358104377985, 0.4859574064612388, 0.4833582155406475, 0.4867088869214058, 0.4869902320206165, 0.4876262210309505, 0.4864178374409675, 0.4864541031420231, 0.4867057502269745, 0.4884936697781086, 0.4854058027267456, 0.4880223199725151, 0.4881350100040436, 0.4871640801429748, 0.4859121330082416, 0.4894774369895458, 0.4890438541769981, 0.489189263433218, 0.4893344156444073, 0.4886334165930748, 0.4900187514722347, 0.4877792187035084, 0.4887096807360649, 0.4900767691433429, 0.4877709597349167, 0.48653694242239, 0.4897000454366207], "label": "FineWeb: id mh + C4 filters"}, "big-run-sampled_full_ind_minhash": {"x": [0.0, 2.0971520000000003, 4.194304000000001, 6.291456, 8.388608000000001, 10.48576, 12.582912, 14.680064000000002, 16.777216000000003, 18.874368, 20.97152, 23.068672000000003, 25.165824, 27.262976000000002, 29.360128000000003, 31.45728, 33.554432000000006, 35.651584, 37.748736, 39.845888, 41.94304, 44.040192000000005, 46.137344000000006, 48.234496, 50.331648, 52.4288, 54.525952000000004, 56.623104000000005, 58.720256000000006, 60.817408, 62.91456, 65.011712, 67.10886400000001, 69.206016, 71.303168, 73.40032000000001, 75.497472, 77.59462400000001, 79.691776, 81.788928, 83.88608, 85.983232, 88.08038400000001, 90.177536, 92.27468800000001, 94.37184, 96.468992, 98.56614400000001, 100.663296, 102.76044800000001, 104.8576, 106.95475200000001, 109.05190400000001, 111.149056, 113.24620800000001, 115.34336, 117.44051200000001, 119.537664, 121.634816, 123.73196800000001, 125.82912, 127.92627200000001, 130.023424, 132.120576, 134.21772800000002, 136.31488000000002, 138.412032, 140.509184, 142.606336, 144.70348800000002, 146.80064000000002, 148.897792, 150.994944, 153.092096, 155.18924800000002, 157.28640000000001, 159.383552, 161.480704, 163.577856, 165.67500800000002, 167.77216, 169.869312, 171.966464, 174.06361600000002, 176.16076800000002, 178.25792, 180.355072, 182.452224, 184.54937600000002, 186.64652800000002, 188.74368, 190.840832, 192.937984, 195.03513600000002, 197.13228800000002, 199.22944, 201.326592, 203.423744, 205.52089600000002, 207.61804800000002, 209.7152, 211.812352, 213.90950400000003, 216.00665600000002, 218.10380800000001, 220.20096, 222.298112, 224.39526400000003, 226.49241600000002, 228.589568, 230.68672, 232.783872, 234.88102400000002, 236.97817600000002, 239.075328, 241.17248, 243.269632, 245.36678400000002, 247.46393600000002, 249.561088, 251.65824, 253.75539200000003, 255.85254400000002, 257.949696, 260.046848, 262.144, 264.241152, 266.338304, 268.43545600000004, 270.53260800000004, 272.62976000000003, 274.726912, 276.824064, 278.921216, 281.018368, 283.11552, 285.212672, 287.309824, 289.40697600000004, 291.50412800000004, 293.60128000000003, 295.698432, 297.795584, 299.892736, 301.989888, 304.08704, 306.184192, 308.28134400000005, 310.37849600000004, 312.47564800000004, 314.57280000000003, 316.669952, 318.767104, 320.864256, 322.961408, 325.05856, 327.155712, 329.25286400000005, 331.35001600000004, 333.44716800000003, 335.54432, 337.641472, 339.738624, 341.835776, 343.932928, 346.03008, 348.12723200000005, 350.22438400000004], "y": [0.3308933284133672, 0.3608616776764393, 0.3745453506708145, 0.3862277194857597, 0.3989979773759842, 0.406296543776989, 0.4094927236437797, 0.4138859286904335, 0.4177777022123337, 0.4208802655339241, 0.4254550077021122, 0.4283009432256222, 0.429458349943161, 0.4330311268568039, 0.4349483698606491, 0.4348161295056343, 0.438955657184124, 0.4389265701174736, 0.4393925778567791, 0.4383306242525577, 0.4436748661100864, 0.4423373565077781, 0.4460027255117893, 0.4440812170505523, 0.4476902261376381, 0.4465879611670971, 0.4497823156416416, 0.4513350501656532, 0.4518667235970497, 0.45149727165699, 0.4513994492590427, 0.4521937072277069, 0.4520382955670357, 0.4530793912708759, 0.4516105614602566, 0.4530563354492187, 0.4495660625398159, 0.4520940892398357, 0.4561133235692978, 0.4522969461977482, 0.4575686641037464, 0.4589144177734852, 0.4582882039248943, 0.457970168441534, 0.4554797261953354, 0.4622044861316681, 0.4596928395330906, 0.4624353349208832, 0.4619148448109627, 0.461100060492754, 0.458431463688612, 0.4620467089116573, 0.4562215581536293, 0.4620163068175316, 0.4631462283432483, 0.4600549824535846, 0.4620365314185619, 0.458735141903162, 0.461642112582922, 0.461245734244585, 0.4645131677389145, 0.4629777930676937, 0.4651660025119781, 0.4653937108814716, 0.4676259346306324, 0.4667201824486255, 0.4650012850761413, 0.4676916748285293, 0.4708514772355556, 0.4673572592437267, 0.4689626581966877, 0.4678038358688354, 0.4667215310037136, 0.4646228328347206, 0.4662510119378567, 0.4674677737057209, 0.4690804108977318, 0.4634581170976162, 0.4701276533305645, 0.4676450751721859, 0.4672758504748344, 0.4674397967755794, 0.4656238108873367, 0.4690065123140812, 0.4677213467657566, 0.4678985886275768, 0.4735414572060108, 0.4705612398684025, 0.4703374318778515, 0.4704933613538742, 0.4688010476529598, 0.4699571952223778, 0.4674785658717155, 0.4701188169419765, 0.4682065695524215, 0.4729971997439861, 0.4748715870082378, 0.4745333231985569, 0.4737020246684551, 0.4747246317565441, 0.4771635122597217, 0.4740425907075405, 0.475264236330986, 0.4744705818593502, 0.474684040993452, 0.4721556939184665, 0.475641455501318, 0.476833701133728, 0.4746401384472847, 0.4742486327886581, 0.4730467088520527, 0.4773029200732708, 0.4760043211281299, 0.4770320989191532, 0.4742161482572555, 0.4780259765684604, 0.4806670732796192, 0.4784667380154133, 0.4788618609309196, 0.4762138128280639, 0.4777246937155723, 0.4796081893146038, 0.4798486456274986, 0.475479181855917, 0.4779988899827003, 0.4765858314931392, 0.4772914499044418, 0.47843898832798, 0.4799034222960472, 0.4803600236773491, 0.4751846008002758, 0.4777872562408447, 0.4779460839927196, 0.4787487275898456, 0.4808406494557857, 0.4810357913374901, 0.4797308407723903, 0.4800078608095646, 0.4806460626423359, 0.4810502976179123, 0.4797912389039993, 0.477332629263401, 0.4818884879350662, 0.482621606439352, 0.4833096489310264, 0.4821632876992225, 0.4831674285233021, 0.4830279909074306, 0.4849893450736999, 0.4845218025147915, 0.4825541749596596, 0.4833571836352348, 0.4853803217411041, 0.483093187212944, 0.4850797094404697, 0.485261783003807, 0.4837660938501358, 0.4835929833352566, 0.4855643883347511, 0.4832059442996979, 0.484714712947607, 0.4839249886572361, 0.4829078912734985, 0.4818423055112362, 0.482727088034153, 0.4824129492044449, 0.4820138849318027, 0.4865870922803879], "label": "FineWeb: independent MinHash (id mh)"}, "big-run-sampled_full_filtered_no_dedup": {"x": [0.0, 2.0971520000000003, 4.194304000000001, 6.291456, 8.388608000000001, 10.48576, 12.582912, 14.680064000000002, 16.777216000000003, 18.874368, 20.97152, 23.068672000000003, 25.165824, 27.262976000000002, 29.360128000000003, 31.45728, 33.554432000000006, 35.651584, 37.748736, 39.845888, 41.94304, 44.040192000000005, 46.137344000000006, 48.234496, 50.331648, 52.4288, 54.525952000000004, 56.623104000000005, 58.720256000000006, 60.817408, 62.91456, 65.011712, 67.10886400000001, 69.206016, 71.303168, 73.40032000000001, 75.497472, 77.59462400000001, 79.691776, 81.788928, 83.88608, 85.983232, 88.08038400000001, 90.177536, 92.27468800000001, 94.37184, 96.468992, 98.56614400000001, 100.663296, 102.76044800000001, 104.8576, 106.95475200000001, 109.05190400000001, 111.149056, 113.24620800000001, 115.34336, 117.44051200000001, 119.537664, 121.634816, 123.73196800000001, 125.82912, 127.92627200000001, 130.023424, 132.120576, 134.21772800000002, 136.31488000000002, 138.412032, 140.509184, 142.606336, 144.70348800000002, 146.80064000000002, 148.897792, 150.994944, 153.092096, 155.18924800000002, 157.28640000000001, 159.383552, 161.480704, 163.577856, 165.67500800000002, 167.77216, 169.869312, 171.966464, 174.06361600000002, 176.16076800000002, 178.25792, 180.355072, 182.452224, 184.54937600000002, 186.64652800000002, 188.74368, 190.840832, 192.937984, 195.03513600000002, 197.13228800000002, 199.22944, 201.326592, 203.423744, 205.52089600000002, 207.61804800000002, 209.7152, 211.812352, 213.90950400000003, 216.00665600000002, 218.10380800000001, 220.20096, 222.298112, 224.39526400000003, 226.49241600000002, 228.589568, 230.68672, 232.783872, 234.88102400000002, 236.97817600000002, 239.075328, 241.17248, 243.269632, 245.36678400000002, 247.46393600000002, 249.561088, 251.65824, 253.75539200000003, 255.85254400000002, 257.949696, 260.046848, 262.144, 264.241152, 266.338304, 268.43545600000004, 270.53260800000004, 272.62976000000003, 274.726912, 276.824064, 278.921216, 281.018368, 283.11552, 285.212672, 287.309824, 289.40697600000004, 291.50412800000004, 293.60128000000003, 295.698432, 297.795584, 299.892736, 301.989888, 304.08704, 306.184192, 308.28134400000005, 310.37849600000004, 312.47564800000004, 314.57280000000003, 316.669952, 318.767104, 320.864256, 322.961408, 325.05856, 327.155712, 329.25286400000005, 331.35001600000004, 333.44716800000003, 335.54432, 337.641472, 339.738624, 341.835776, 343.932928, 346.03008, 348.12723200000005, 350.22438400000004], "y": [0.3308933284133672, 0.3605199865996837, 0.3733148723840713, 0.3882005847990513, 0.3934122696518898, 0.3947227671742439, 0.4042885974049568, 0.3974800482392311, 0.4055779427289963, 0.4133470430970192, 0.4117913842201233, 0.4113653488457203, 0.4149517640471458, 0.4187851920723915, 0.4206527359783649, 0.4240428246557712, 0.422003373503685, 0.4280910938978195, 0.4244147576391697, 0.4316282644867897, 0.4295645765960216, 0.4310102686285972, 0.4360743537545204, 0.4313482865691185, 0.4350991360843181, 0.4378576353192329, 0.4335876516997814, 0.4347924515604973, 0.4348904751241207, 0.436600212007761, 0.430036511272192, 0.4350974671542644, 0.4399556629359722, 0.4371416717767715, 0.4363861419260502, 0.4376698136329651, 0.4405004419386387, 0.4373639523983001, 0.4379038028419018, 0.4371281825006008, 0.4393439553678036, 0.440426729619503, 0.4401675276458263, 0.4429537951946258, 0.4449137263000011, 0.4434786736965179, 0.4450470842421055, 0.4454202279448509, 0.4394537284970283, 0.442185215651989, 0.4461225643754005, 0.4427758157253265, 0.4430646039545536, 0.4476901069283485, 0.4478763341903686, 0.4493869319558143, 0.4448477327823639, 0.450044184923172, 0.4498609118163585, 0.4457665979862213, 0.4506924152374267, 0.449855338782072, 0.448790930211544, 0.4474099352955818, 0.4546772800385952, 0.4529431238770485, 0.452015146613121, 0.4502020999789238, 0.4493804536759853, 0.4523266032338142, 0.4551868587732315, 0.4501944817602634, 0.4493303671479225, 0.4526805207133293, 0.4533850513398647, 0.4518048763275146, 0.4518973492085933, 0.4531301632523536, 0.4518006071448326, 0.4553494565188885, 0.4528752230107784, 0.4536322727799415, 0.4561733976006508, 0.4549491256475448, 0.4574789106845855, 0.4577847123146057, 0.4563642293214798, 0.4578686729073524, 0.4561499990522861, 0.4537816494703293, 0.4542164430022239, 0.4559455662965774, 0.4554723873734474, 0.4575514122843742, 0.4575202167034149, 0.4592722058296203, 0.4585275091230869, 0.4580587856471538, 0.456934317946434, 0.4577495418488979, 0.4540119916200638, 0.4570806957781315, 0.4608120545744896, 0.4588425755500793, 0.4578334167599678, 0.4610816091299057, 0.4598177038133144, 0.461849745362997, 0.4631866924464702, 0.4601576402783394, 0.4646804705262184, 0.4632389545440674, 0.4604574106633663, 0.4602976888418197, 0.4581312239170074, 0.4654182009398937, 0.4655338563024997, 0.4616620391607284, 0.461054053157568, 0.4613021649420261, 0.4658613465726375, 0.4633531905710697, 0.4613638147711754, 0.4643996246159076, 0.462500050663948, 0.4650798961520195, 0.4648764543235302, 0.4639869071543216, 0.4634246975183487, 0.46585888043046, 0.4639799632132053, 0.4630857892334461, 0.4644265696406364, 0.4642998576164245, 0.4686848931014538, 0.4687492996454239, 0.4650243632495403, 0.4627032242715359, 0.4665953740477562, 0.4660026729106903, 0.4664581045508384, 0.4676475040614605, 0.4657339677214622, 0.4664678275585174, 0.4673498086631298, 0.4676674827933311, 0.4680955372750759, 0.4681585058569908, 0.4659864418208599, 0.4686457589268684, 0.4661462865769863, 0.4658931568264961, 0.4674226939678192, 0.46805215254426, 0.4682257212698459, 0.4689070098102093, 0.4699570722877979, 0.4655096270143986, 0.4688013233244419, 0.4707522802054882, 0.4661469310522079, 0.4688841328024864, 0.4671329781413078, 0.4662554152309894, 0.4697433896362781, 0.4698473587632179, 0.4676505327224731, 0.4696521013975143], "label": "FineWeb: base filtering only"}}, "layout": {"title": {"text": "The different FineWeb processing steps"}}}
assets/data/plots/all_filtering_steps/arc_acc_norm.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"data": {"big-run-sampled_full_ind_minhash": {"x": [0.0, 2.0971520000000003, 4.194304000000001, 6.291456, 8.388608000000001, 10.48576, 12.582912, 14.680064000000002, 16.777216000000003, 18.874368, 20.97152, 23.068672000000003, 25.165824, 27.262976000000002, 29.360128000000003, 31.45728, 33.554432000000006, 35.651584, 37.748736, 39.845888, 41.94304, 44.040192000000005, 46.137344000000006, 48.234496, 50.331648, 52.4288, 54.525952000000004, 56.623104000000005, 58.720256000000006, 60.817408, 62.91456, 65.011712, 67.10886400000001, 69.206016, 71.303168, 73.40032000000001, 75.497472, 77.59462400000001, 79.691776, 81.788928, 83.88608, 85.983232, 88.08038400000001, 90.177536, 92.27468800000001, 94.37184, 96.468992, 98.56614400000001, 100.663296, 102.76044800000001, 104.8576, 106.95475200000001, 109.05190400000001, 111.149056, 113.24620800000001, 115.34336, 117.44051200000001, 119.537664, 121.634816, 123.73196800000001, 125.82912, 127.92627200000001, 130.023424, 132.120576, 134.21772800000002, 136.31488000000002, 138.412032, 140.509184, 142.606336, 144.70348800000002, 146.80064000000002, 148.897792, 150.994944, 153.092096, 155.18924800000002, 157.28640000000001, 159.383552, 161.480704, 163.577856, 165.67500800000002, 167.77216, 169.869312, 171.966464, 174.06361600000002, 176.16076800000002, 178.25792, 180.355072, 182.452224, 184.54937600000002, 186.64652800000002, 188.74368, 190.840832, 192.937984, 195.03513600000002, 197.13228800000002, 199.22944, 201.326592, 203.423744, 205.52089600000002, 207.61804800000002, 209.7152, 211.812352, 213.90950400000003, 216.00665600000002, 218.10380800000001, 220.20096, 222.298112, 224.39526400000003, 226.49241600000002, 228.589568, 230.68672, 232.783872, 234.88102400000002, 236.97817600000002, 239.075328, 241.17248, 243.269632, 245.36678400000002, 247.46393600000002, 249.561088, 251.65824, 253.75539200000003, 255.85254400000002, 257.949696, 260.046848, 262.144, 264.241152, 266.338304, 268.43545600000004, 270.53260800000004, 272.62976000000003, 274.726912, 276.824064, 278.921216, 281.018368, 283.11552, 285.212672, 287.309824, 289.40697600000004, 291.50412800000004, 293.60128000000003, 295.698432, 297.795584, 299.892736, 301.989888, 304.08704, 306.184192, 308.28134400000005, 310.37849600000004, 312.47564800000004, 314.57280000000003, 316.669952, 318.767104, 320.864256, 322.961408, 325.05856, 327.155712, 329.25286400000005, 331.35001600000004, 333.44716800000003, 335.54432, 337.641472, 339.738624, 341.835776, 343.932928, 346.03008, 348.12723200000005, 350.22438400000004], "y": [0.2509999871253967, 0.2939999997615814, 0.3174999952316284, 0.3294999897480011, 0.3510000109672546, 0.3485000133514404, 0.3634999990463257, 0.3700000047683716, 0.3524999916553497, 0.375, 0.3804999887943268, 0.37950000166893, 0.3824999928474426, 0.3799999952316284, 0.395000010728836, 0.3844999969005584, 0.3894999921321869, 0.3855000138282776, 0.3955000042915344, 0.3995000123977661, 0.4009999930858612, 0.3939999938011169, 0.3970000147819519, 0.3955000042915344, 0.3955000042915344, 0.4079999923706054, 0.3959999978542328, 0.4090000092983246, 0.4045000076293945, 0.3930000066757202, 0.4099999964237213, 0.4054999947547912, 0.4124999940395355, 0.4160000085830688, 0.4149999916553497, 0.4070000052452087, 0.4110000133514404, 0.4144999980926513, 0.4120000004768371, 0.4050000011920929, 0.4165000021457672, 0.4180000126361847, 0.4050000011920929, 0.4120000004768371, 0.4135000109672546, 0.4320000112056732, 0.4284999966621399, 0.4269999861717224, 0.414000004529953, 0.4255000054836273, 0.4165000021457672, 0.4144999980926513, 0.4079999923706054, 0.4205000102519989, 0.4180000126361847, 0.4244999885559082, 0.4235000014305115, 0.4244999885559082, 0.4300000071525574, 0.4160000085830688, 0.4205000102519989, 0.4329999983310699, 0.4280000030994415, 0.4244999885559082, 0.4375, 0.4244999885559082, 0.4365000128746032, 0.4329999983310699, 0.4424999952316284, 0.4390000104904175, 0.4449999928474426, 0.445499986410141, 0.4320000112056732, 0.4365000128746032, 0.4244999885559082, 0.429500013589859, 0.4395000040531158, 0.4284999966621399, 0.44200000166893, 0.4370000064373016, 0.4399999976158142, 0.4334999918937683, 0.4429999887943268, 0.44200000166893, 0.4334999918937683, 0.4384999871253967, 0.4365000128746032, 0.4390000104904175, 0.4354999959468841, 0.44200000166893, 0.4350000023841858, 0.4390000104904175, 0.4404999911785126, 0.4410000145435333, 0.4305000007152557, 0.4490000009536743, 0.4510000050067901, 0.4605000019073486, 0.4490000009536743, 0.449999988079071, 0.4595000147819519, 0.4514999985694885, 0.4490000009536743, 0.4474999904632568, 0.4444999992847442, 0.4524999856948852, 0.4465000033378601, 0.4519999921321869, 0.4550000131130218, 0.4524999856948852, 0.4429999887943268, 0.4550000131130218, 0.4510000050067901, 0.4560000002384186, 0.4465000033378601, 0.4485000073909759, 0.4524999856948852, 0.4440000057220459, 0.457500010728836, 0.4544999897480011, 0.4480000138282776, 0.4584999978542328, 0.4544999897480011, 0.4569999873638153, 0.4584999978542328, 0.4444999992847442, 0.4629999995231628, 0.457500010728836, 0.4555000066757202, 0.4569999873638153, 0.4474999904632568, 0.4564999938011169, 0.4595000147819519, 0.4634999930858612, 0.4555000066757202, 0.453000009059906, 0.457500010728836, 0.4614999890327453, 0.460999995470047, 0.4539999961853027, 0.4595000147819519, 0.4629999995231628, 0.4670000076293945, 0.4580000042915344, 0.4639999866485595, 0.457500010728836, 0.4595000147819519, 0.4665000140666961, 0.4584999978542328, 0.4629999995231628, 0.4595000147819519, 0.4659999907016754, 0.4645000100135803, 0.4675000011920929, 0.4690000116825104, 0.4715000092983246, 0.4634999930858612, 0.4634999930858612, 0.4639999866485595, 0.465499997138977, 0.4675000011920929, 0.4670000076293945, 0.4600000083446502, 0.4595000147819519, 0.4625000059604645, 0.4600000083446502, 0.4645000100135803, 0.4715000092983246], "label": "FineWeb: independent MinHash (id mh)"}, "big-run-sampled-fineweb-c4-filters": {"x": [0.0, 2.0971520000000003, 4.194304000000001, 6.291456, 8.388608000000001, 10.48576, 12.582912, 14.680064000000002, 16.777216000000003, 18.874368, 20.97152, 23.068672000000003, 25.165824, 27.262976000000002, 29.360128000000003, 31.45728, 33.554432000000006, 35.651584, 37.748736, 39.845888, 41.94304, 44.040192000000005, 46.137344000000006, 48.234496, 50.331648, 52.4288, 54.525952000000004, 56.623104000000005, 58.720256000000006, 60.817408, 62.91456, 65.011712, 67.10886400000001, 69.206016, 71.303168, 73.40032000000001, 75.497472, 77.59462400000001, 79.691776, 81.788928, 83.88608, 85.983232, 88.08038400000001, 90.177536, 92.27468800000001, 94.37184, 96.468992, 98.56614400000001, 100.663296, 102.76044800000001, 104.8576, 106.95475200000001, 109.05190400000001, 111.149056, 113.24620800000001, 115.34336, 117.44051200000001, 119.537664, 121.634816, 123.73196800000001, 125.82912, 127.92627200000001, 130.023424, 132.120576, 134.21772800000002, 136.31488000000002, 138.412032, 140.509184, 142.606336, 144.70348800000002, 146.80064000000002, 148.897792, 150.994944, 153.092096, 155.18924800000002, 157.28640000000001, 159.383552, 161.480704, 163.577856, 165.67500800000002, 167.77216, 169.869312, 171.966464, 174.06361600000002, 176.16076800000002, 178.25792, 180.355072, 182.452224, 184.54937600000002, 186.64652800000002, 188.74368, 190.840832, 192.937984, 195.03513600000002, 197.13228800000002, 199.22944, 201.326592, 203.423744, 205.52089600000002, 207.61804800000002, 209.7152, 211.812352, 213.90950400000003, 216.00665600000002, 218.10380800000001, 220.20096, 222.298112, 224.39526400000003, 226.49241600000002, 228.589568, 230.68672, 232.783872, 234.88102400000002, 236.97817600000002, 239.075328, 241.17248, 243.269632, 245.36678400000002, 247.46393600000002, 249.561088, 251.65824, 253.75539200000003, 255.85254400000002, 257.949696, 260.046848, 262.144, 264.241152, 266.338304, 268.43545600000004, 270.53260800000004, 272.62976000000003, 274.726912, 276.824064, 278.921216, 281.018368, 283.11552, 285.212672, 287.309824, 289.40697600000004, 291.50412800000004, 293.60128000000003, 295.698432, 297.795584, 299.892736, 301.989888, 304.08704, 306.184192, 308.28134400000005, 310.37849600000004, 312.47564800000004, 314.57280000000003, 316.669952, 318.767104, 320.864256, 322.961408, 325.05856, 327.155712, 329.25286400000005, 331.35001600000004, 333.44716800000003, 335.54432, 337.641472, 339.738624, 341.835776, 343.932928, 346.03008, 348.12723200000005, 350.22438400000004], "y": [0.2509999871253967, 0.2985000014305115, 0.3269999921321869, 0.340499997138977, 0.3495000004768371, 0.3535000085830688, 0.3519999980926513, 0.3625000119209289, 0.3569999933242798, 0.3659999966621399, 0.3619999885559082, 0.3759999871253967, 0.3779999911785126, 0.3919999897480011, 0.3835000097751617, 0.402999997138977, 0.3899999856948852, 0.3869999945163727, 0.3885000050067901, 0.3989999890327453, 0.390500009059906, 0.4054999947547912, 0.398499995470047, 0.3989999890327453, 0.398499995470047, 0.4014999866485595, 0.398499995470047, 0.4135000109672546, 0.4045000076293945, 0.4144999980926513, 0.4079999923706054, 0.4124999940395355, 0.4169999957084656, 0.4074999988079071, 0.4205000102519989, 0.4135000109672546, 0.4160000085830688, 0.4124999940395355, 0.4225000143051147, 0.4214999973773956, 0.418500006198883, 0.4115000069141388, 0.4165000021457672, 0.4199999868869781, 0.418500006198883, 0.414000004529953, 0.4194999933242798, 0.4095000028610229, 0.4214999973773956, 0.4149999916553497, 0.426499992609024, 0.4160000085830688, 0.4169999957084656, 0.4314999878406524, 0.4404999911785126, 0.4325000047683716, 0.4305000007152557, 0.4275000095367431, 0.4250000119209289, 0.4230000078678131, 0.4214999973773956, 0.4275000095367431, 0.4354999959468841, 0.4235000014305115, 0.4244999885559082, 0.4199999868869781, 0.4235000014305115, 0.4275000095367431, 0.4205000102519989, 0.4244999885559082, 0.4230000078678131, 0.4235000014305115, 0.4280000030994415, 0.4305000007152557, 0.4305000007152557, 0.4359999895095825, 0.4345000088214874, 0.4395000040531158, 0.4280000030994415, 0.4350000023841858, 0.4365000128746032, 0.4255000054836273, 0.4339999854564667, 0.4314999878406524, 0.4329999983310699, 0.4345000088214874, 0.4395000040531158, 0.4350000023841858, 0.4535000026226043, 0.4449999928474426, 0.445499986410141, 0.4404999911785126, 0.4424999952316284, 0.4505000114440918, 0.4440000057220459, 0.4519999921321869, 0.4449999928474426, 0.4474999904632568, 0.4494999945163727, 0.4494999945163727, 0.445499986410141, 0.4510000050067901, 0.4524999856948852, 0.4395000040531158, 0.4444999992847442, 0.4469999969005584, 0.4460000097751617, 0.4539999961853027, 0.4494999945163727, 0.4465000033378601, 0.4544999897480011, 0.4474999904632568, 0.4550000131130218, 0.4510000050067901, 0.4555000066757202, 0.4480000138282776, 0.4589999914169311, 0.4550000131130218, 0.4510000050067901, 0.4519999921321869, 0.4514999985694885, 0.4539999961853027, 0.4535000026226043, 0.4569999873638153, 0.4620000123977661, 0.4634999930858612, 0.4555000066757202, 0.4465000033378601, 0.4550000131130218, 0.4485000073909759, 0.4435000121593475, 0.4480000138282776, 0.4555000066757202, 0.4469999969005584, 0.4535000026226043, 0.4555000066757202, 0.4519999921321869, 0.4485000073909759, 0.4639999866485595, 0.4584999978542328, 0.4490000009536743, 0.4524999856948852, 0.453000009059906, 0.4535000026226043, 0.460999995470047, 0.4589999914169311, 0.4544999897480011, 0.4589999914169311, 0.4569999873638153, 0.4544999897480011, 0.4625000059604645, 0.4474999904632568, 0.4510000050067901, 0.4480000138282776, 0.453000009059906, 0.4460000097751617, 0.460999995470047, 0.4634999930858612, 0.4679999947547912, 0.4639999866485595, 0.4720000028610229, 0.4659999907016754, 0.4650000035762787, 0.4620000123977661, 0.4659999907016754, 0.465499997138977, 0.4595000147819519, 0.4620000123977661], "label": "FineWeb: id mh + C4 filters"}, "big-run-fineweb-v1-all-dumps": {"x": [0.0, 2.0971520000000003, 4.194304000000001, 6.291456, 8.388608000000001, 10.48576, 12.582912, 14.680064000000002, 16.777216000000003, 18.874368, 20.97152, 23.068672000000003, 25.165824, 27.262976000000002, 29.360128000000003, 31.45728, 33.554432000000006, 35.651584, 37.748736, 39.845888, 41.94304, 44.040192000000005, 46.137344000000006, 48.234496, 50.331648, 52.4288, 54.525952000000004, 56.623104000000005, 58.720256000000006, 60.817408, 62.91456, 65.011712, 67.10886400000001, 69.206016, 71.303168, 73.40032000000001, 75.497472, 77.59462400000001, 79.691776, 81.788928, 83.88608, 85.983232, 88.08038400000001, 90.177536, 92.27468800000001, 94.37184, 96.468992, 98.56614400000001, 100.663296, 102.76044800000001, 104.8576, 106.95475200000001, 109.05190400000001, 111.149056, 113.24620800000001, 115.34336, 117.44051200000001, 119.537664, 121.634816, 123.73196800000001, 125.82912, 127.92627200000001, 130.023424, 132.120576, 134.21772800000002, 136.31488000000002, 138.412032, 140.509184, 142.606336, 144.70348800000002, 146.80064000000002, 148.897792, 150.994944, 153.092096, 155.18924800000002, 157.28640000000001, 159.383552, 161.480704, 163.577856, 165.67500800000002, 167.77216, 169.869312, 171.966464, 174.06361600000002, 176.16076800000002, 178.25792, 180.355072, 182.452224, 184.54937600000002, 186.64652800000002, 188.74368, 190.840832, 192.937984, 195.03513600000002, 197.13228800000002, 199.22944, 201.326592, 203.423744, 205.52089600000002, 207.61804800000002, 209.7152, 211.812352, 213.90950400000003, 216.00665600000002, 218.10380800000001, 220.20096, 222.298112, 224.39526400000003, 226.49241600000002, 228.589568, 230.68672, 232.783872, 234.88102400000002, 236.97817600000002, 239.075328, 241.17248, 243.269632, 245.36678400000002, 247.46393600000002, 249.561088, 251.65824, 253.75539200000003, 255.85254400000002, 257.949696, 260.046848, 262.144, 264.241152, 266.338304, 268.43545600000004, 270.53260800000004, 272.62976000000003, 274.726912, 276.824064, 278.921216, 281.018368, 283.11552, 285.212672, 287.309824, 289.40697600000004, 291.50412800000004, 293.60128000000003, 295.698432, 297.795584, 299.892736, 301.989888, 304.08704, 306.184192, 308.28134400000005, 310.37849600000004, 312.47564800000004, 314.57280000000003, 316.669952, 318.767104, 320.864256, 322.961408, 325.05856, 327.155712, 329.25286400000005, 331.35001600000004, 333.44716800000003, 335.54432, 337.641472, 339.738624, 341.835776, 343.932928, 346.03008, 348.12723200000005, 350.22438400000004], "y": [0.2509999871253967, 0.296999990940094, 0.3219999969005584, 0.3305000066757202, 0.3555000126361847, 0.351500004529953, 0.3600000143051147, 0.363999992609024, 0.3680000007152557, 0.3785000145435333, 0.3765000104904175, 0.382999986410141, 0.3785000145435333, 0.3835000097751617, 0.3819999992847442, 0.3935000002384186, 0.387499988079071, 0.3935000002384186, 0.3959999978542328, 0.3860000073909759, 0.3935000002384186, 0.3885000050067901, 0.3810000121593475, 0.3880000114440918, 0.3964999914169311, 0.4054999947547912, 0.3935000002384186, 0.3944999873638153, 0.3989999890327453, 0.3980000019073486, 0.4050000011920929, 0.4054999947547912, 0.4009999930858612, 0.4110000133514404, 0.4054999947547912, 0.4180000126361847, 0.4110000133514404, 0.4050000011920929, 0.4079999923706054, 0.4120000004768371, 0.402999997138977, 0.4205000102519989, 0.4129999876022339, 0.4120000004768371, 0.4169999957084656, 0.4269999861717224, 0.4230000078678131, 0.4225000143051147, 0.4300000071525574, 0.4180000126361847, 0.4284999966621399, 0.4165000021457672, 0.4325000047683716, 0.4235000014305115, 0.4210000038146972, 0.4239999949932098, 0.4235000014305115, 0.421999990940094, 0.4280000030994415, 0.4300000071525574, 0.4275000095367431, 0.4305000007152557, 0.4244999885559082, 0.4314999878406524, 0.4325000047683716, 0.4395000040531158, 0.4325000047683716, 0.4300000071525574, 0.4399999976158142, 0.4320000112056732, 0.4370000064373016, 0.4280000030994415, 0.4309999942779541, 0.4314999878406524, 0.4370000064373016, 0.4280000030994415, 0.4325000047683716, 0.4300000071525574, 0.4334999918937683, 0.4334999918937683, 0.4379999935626983, 0.4399999976158142, 0.4350000023841858, 0.4395000040531158, 0.4375, 0.4390000104904175, 0.4365000128746032, 0.4435000121593475, 0.4365000128746032, 0.445499986410141, 0.4440000057220459, 0.4460000097751617, 0.4415000081062317, 0.4415000081062317, 0.4339999854564667, 0.4429999887943268, 0.4399999976158142, 0.4359999895095825, 0.4370000064373016, 0.4469999969005584, 0.4404999911785126, 0.4435000121593475, 0.445499986410141, 0.4424999952316284, 0.4480000138282776, 0.4370000064373016, 0.4444999992847442, 0.4465000033378601, 0.4309999942779541, 0.4440000057220459, 0.4469999969005584, 0.4539999961853027, 0.4440000057220459, 0.4555000066757202, 0.4519999921321869, 0.4510000050067901, 0.4519999921321869, 0.4544999897480011, 0.4494999945163727, 0.4584999978542328, 0.4580000042915344, 0.4544999897480011, 0.4514999985694885, 0.4550000131130218, 0.4560000002384186, 0.4600000083446502, 0.4589999914169311, 0.4560000002384186, 0.457500010728836, 0.4679999947547912, 0.4494999945163727, 0.4505000114440918, 0.4440000057220459, 0.4539999961853027, 0.4535000026226043, 0.4514999985694885, 0.457500010728836, 0.4620000123977661, 0.4564999938011169, 0.4595000147819519, 0.4564999938011169, 0.4550000131130218, 0.4539999961853027, 0.4544999897480011, 0.4569999873638153, 0.457500010728836, 0.4539999961853027, 0.4595000147819519, 0.4665000140666961, 0.465499997138977, 0.4625000059604645, 0.4629999995231628, 0.4580000042915344, 0.4569999873638153, 0.4620000123977661, 0.457500010728836, 0.4550000131130218, 0.4645000100135803, 0.4629999995231628, 0.4584999978542328, 0.465499997138977, 0.460999995470047, 0.4634999930858612, 0.4605000019073486, 0.4584999978542328, 0.4550000131130218, 0.4564999938011169, 0.4600000083446502], "label": "FineWeb: id mh + C4 + custom filters"}, "big-run-sampled_full_filtered_no_dedup": {"x": [0.0, 2.0971520000000003, 4.194304000000001, 6.291456, 8.388608000000001, 10.48576, 12.582912, 14.680064000000002, 16.777216000000003, 18.874368, 20.97152, 23.068672000000003, 25.165824, 27.262976000000002, 29.360128000000003, 31.45728, 33.554432000000006, 35.651584, 37.748736, 39.845888, 41.94304, 44.040192000000005, 46.137344000000006, 48.234496, 50.331648, 52.4288, 54.525952000000004, 56.623104000000005, 58.720256000000006, 60.817408, 62.91456, 65.011712, 67.10886400000001, 69.206016, 71.303168, 73.40032000000001, 75.497472, 77.59462400000001, 79.691776, 81.788928, 83.88608, 85.983232, 88.08038400000001, 90.177536, 92.27468800000001, 94.37184, 96.468992, 98.56614400000001, 100.663296, 102.76044800000001, 104.8576, 106.95475200000001, 109.05190400000001, 111.149056, 113.24620800000001, 115.34336, 117.44051200000001, 119.537664, 121.634816, 123.73196800000001, 125.82912, 127.92627200000001, 130.023424, 132.120576, 134.21772800000002, 136.31488000000002, 138.412032, 140.509184, 142.606336, 144.70348800000002, 146.80064000000002, 148.897792, 150.994944, 153.092096, 155.18924800000002, 157.28640000000001, 159.383552, 161.480704, 163.577856, 165.67500800000002, 167.77216, 169.869312, 171.966464, 174.06361600000002, 176.16076800000002, 178.25792, 180.355072, 182.452224, 184.54937600000002, 186.64652800000002, 188.74368, 190.840832, 192.937984, 195.03513600000002, 197.13228800000002, 199.22944, 201.326592, 203.423744, 205.52089600000002, 207.61804800000002, 209.7152, 211.812352, 213.90950400000003, 216.00665600000002, 218.10380800000001, 220.20096, 222.298112, 224.39526400000003, 226.49241600000002, 228.589568, 230.68672, 232.783872, 234.88102400000002, 236.97817600000002, 239.075328, 241.17248, 243.269632, 245.36678400000002, 247.46393600000002, 249.561088, 251.65824, 253.75539200000003, 255.85254400000002, 257.949696, 260.046848, 262.144, 264.241152, 266.338304, 268.43545600000004, 270.53260800000004, 272.62976000000003, 274.726912, 276.824064, 278.921216, 281.018368, 283.11552, 285.212672, 287.309824, 289.40697600000004, 291.50412800000004, 293.60128000000003, 295.698432, 297.795584, 299.892736, 301.989888, 304.08704, 306.184192, 308.28134400000005, 310.37849600000004, 312.47564800000004, 314.57280000000003, 316.669952, 318.767104, 320.864256, 322.961408, 325.05856, 327.155712, 329.25286400000005, 331.35001600000004, 333.44716800000003, 335.54432, 337.641472, 339.738624, 341.835776, 343.932928, 346.03008, 348.12723200000005, 350.22438400000004], "y": [0.2509999871253967, 0.2894999980926513, 0.3235000073909759, 0.3389999866485595, 0.3384999930858612, 0.3459999859333038, 0.359499990940094, 0.3429999947547912, 0.3619999885559082, 0.3564999997615814, 0.3625000119209289, 0.363999992609024, 0.3680000007152557, 0.3680000007152557, 0.3684999942779541, 0.375, 0.3734999895095825, 0.3849999904632568, 0.3944999873638153, 0.3865000009536743, 0.395000010728836, 0.3935000002384186, 0.3980000019073486, 0.3910000026226043, 0.3885000050067901, 0.3914999961853027, 0.3815000057220459, 0.395000010728836, 0.3894999921321869, 0.395000010728836, 0.3935000002384186, 0.4034999907016754, 0.4004999995231628, 0.3970000147819519, 0.3975000083446502, 0.3995000123977661, 0.3980000019073486, 0.4034999907016754, 0.3959999978542328, 0.3989999890327453, 0.402999997138977, 0.3880000114440918, 0.3980000019073486, 0.4040000140666961, 0.3989999890327453, 0.3970000147819519, 0.3925000131130218, 0.4120000004768371, 0.3935000002384186, 0.395000010728836, 0.4070000052452087, 0.3935000002384186, 0.4034999907016754, 0.4189999997615814, 0.4129999876022339, 0.4160000085830688, 0.4149999916553497, 0.418500006198883, 0.4225000143051147, 0.4174999892711639, 0.4210000038146972, 0.4045000076293945, 0.4079999923706054, 0.4124999940395355, 0.4144999980926513, 0.4169999957084656, 0.4194999933242798, 0.4154999852180481, 0.4169999957084656, 0.4225000143051147, 0.4225000143051147, 0.4230000078678131, 0.4160000085830688, 0.4325000047683716, 0.4325000047683716, 0.4199999868869781, 0.4199999868869781, 0.4189999997615814, 0.4269999861717224, 0.4259999990463257, 0.4230000078678131, 0.4144999980926513, 0.4329999983310699, 0.4275000095367431, 0.4305000007152557, 0.4289999902248382, 0.4235000014305115, 0.4235000014305115, 0.4325000047683716, 0.4244999885559082, 0.4314999878406524, 0.4194999933242798, 0.4350000023841858, 0.4269999861717224, 0.4235000014305115, 0.4300000071525574, 0.4284999966621399, 0.4255000054836273, 0.4280000030994415, 0.4345000088214874, 0.4225000143051147, 0.4334999918937683, 0.4300000071525574, 0.4350000023841858, 0.429500013589859, 0.4325000047683716, 0.4384999871253967, 0.4345000088214874, 0.4354999959468841, 0.4359999895095825, 0.4354999959468841, 0.4424999952316284, 0.4424999952316284, 0.4320000112056732, 0.4280000030994415, 0.4390000104904175, 0.4480000138282776, 0.4415000081062317, 0.4384999871253967, 0.4390000104904175, 0.4494999945163727, 0.4449999928474426, 0.4384999871253967, 0.4424999952316284, 0.4359999895095825, 0.445499986410141, 0.4399999976158142, 0.4375, 0.4410000145435333, 0.4384999871253967, 0.4375, 0.4329999983310699, 0.4370000064373016, 0.4354999959468841, 0.4440000057220459, 0.4384999871253967, 0.4384999871253967, 0.4390000104904175, 0.4424999952316284, 0.4379999935626983, 0.4345000088214874, 0.4354999959468841, 0.4440000057220459, 0.4395000040531158, 0.4465000033378601, 0.4404999911785126, 0.4505000114440918, 0.4480000138282776, 0.4449999928474426, 0.445499986410141, 0.4410000145435333, 0.4485000073909759, 0.4460000097751617, 0.4480000138282776, 0.4465000033378601, 0.4460000097751617, 0.4460000097751617, 0.4395000040531158, 0.4474999904632568, 0.4469999969005584, 0.4404999911785126, 0.4440000057220459, 0.4435000121593475, 0.4435000121593475, 0.4514999985694885, 0.4474999904632568, 0.4474999904632568, 0.445499986410141], "label": "FineWeb: base filtering only"}}, "layout": {"title": {"text": "The different FineWeb processing steps"}}}
assets/data/plots/all_filtering_steps/commonsense_qa_acc_norm.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"data": {"big-run-sampled_full_ind_minhash": {"x": [0.0, 2.0971520000000003, 4.194304000000001, 6.291456, 8.388608000000001, 10.48576, 12.582912, 14.680064000000002, 16.777216000000003, 18.874368, 20.97152, 23.068672000000003, 25.165824, 27.262976000000002, 29.360128000000003, 31.45728, 33.554432000000006, 35.651584, 37.748736, 39.845888, 41.94304, 44.040192000000005, 46.137344000000006, 48.234496, 50.331648, 52.4288, 54.525952000000004, 56.623104000000005, 58.720256000000006, 60.817408, 62.91456, 65.011712, 67.10886400000001, 69.206016, 71.303168, 73.40032000000001, 75.497472, 77.59462400000001, 79.691776, 81.788928, 83.88608, 85.983232, 88.08038400000001, 90.177536, 92.27468800000001, 94.37184, 96.468992, 98.56614400000001, 100.663296, 102.76044800000001, 104.8576, 106.95475200000001, 109.05190400000001, 111.149056, 113.24620800000001, 115.34336, 117.44051200000001, 119.537664, 121.634816, 123.73196800000001, 125.82912, 127.92627200000001, 130.023424, 132.120576, 134.21772800000002, 136.31488000000002, 138.412032, 140.509184, 142.606336, 144.70348800000002, 146.80064000000002, 148.897792, 150.994944, 153.092096, 155.18924800000002, 157.28640000000001, 159.383552, 161.480704, 163.577856, 165.67500800000002, 167.77216, 169.869312, 171.966464, 174.06361600000002, 176.16076800000002, 178.25792, 180.355072, 182.452224, 184.54937600000002, 186.64652800000002, 188.74368, 190.840832, 192.937984, 195.03513600000002, 197.13228800000002, 199.22944, 201.326592, 203.423744, 205.52089600000002, 207.61804800000002, 209.7152, 211.812352, 213.90950400000003, 216.00665600000002, 218.10380800000001, 220.20096, 222.298112, 224.39526400000003, 226.49241600000002, 228.589568, 230.68672, 232.783872, 234.88102400000002, 236.97817600000002, 239.075328, 241.17248, 243.269632, 245.36678400000002, 247.46393600000002, 249.561088, 251.65824, 253.75539200000003, 255.85254400000002, 257.949696, 260.046848, 262.144, 264.241152, 266.338304, 268.43545600000004, 270.53260800000004, 272.62976000000003, 274.726912, 276.824064, 278.921216, 281.018368, 283.11552, 285.212672, 287.309824, 289.40697600000004, 291.50412800000004, 293.60128000000003, 295.698432, 297.795584, 299.892736, 301.989888, 304.08704, 306.184192, 308.28134400000005, 310.37849600000004, 312.47564800000004, 314.57280000000003, 316.669952, 318.767104, 320.864256, 322.961408, 325.05856, 327.155712, 329.25286400000005, 331.35001600000004, 333.44716800000003, 335.54432, 337.641472, 339.738624, 341.835776, 343.932928, 346.03008, 348.12723200000005, 350.22438400000004], "y": [0.2329999953508377, 0.2639999985694885, 0.2790000140666961, 0.296999990940094, 0.3109999895095825, 0.3240000009536743, 0.3070000112056732, 0.3210000097751617, 0.31700000166893, 0.3339999914169311, 0.324999988079071, 0.3260000050067901, 0.3330000042915344, 0.3409999907016754, 0.3400000035762787, 0.3529999852180481, 0.3400000035762787, 0.3490000069141388, 0.3529999852180481, 0.3499999940395355, 0.3459999859333038, 0.3370000123977661, 0.356000006198883, 0.3490000069141388, 0.3429999947547912, 0.3490000069141388, 0.3610000014305115, 0.3499999940395355, 0.3569999933242798, 0.3610000014305115, 0.3619999885559082, 0.3449999988079071, 0.3409999907016754, 0.3420000076293945, 0.3449999988079071, 0.3409999907016754, 0.3379999995231628, 0.3420000076293945, 0.3569999933242798, 0.3529999852180481, 0.3610000014305115, 0.363999992609024, 0.3600000143051147, 0.3540000021457672, 0.3499999940395355, 0.3689999878406524, 0.367000013589859, 0.3569999933242798, 0.3610000014305115, 0.3680000007152557, 0.3630000054836273, 0.3709999918937683, 0.3540000021457672, 0.3580000102519989, 0.367000013589859, 0.3529999852180481, 0.356000006198883, 0.3569999933242798, 0.3610000014305115, 0.3700000047683716, 0.375, 0.3709999918937683, 0.3819999992847442, 0.3709999918937683, 0.3650000095367431, 0.3709999918937683, 0.3650000095367431, 0.3709999918937683, 0.3840000033378601, 0.3740000128746032, 0.375, 0.356000006198883, 0.3689999878406524, 0.3700000047683716, 0.3819999992847442, 0.3799999952316284, 0.3779999911785126, 0.3729999959468841, 0.3709999918937683, 0.3759999871253967, 0.3709999918937683, 0.3759999871253967, 0.3779999911785126, 0.3779999911785126, 0.3689999878406524, 0.3840000033378601, 0.3860000073909759, 0.3849999904632568, 0.3790000081062317, 0.375, 0.3849999904632568, 0.3720000088214874, 0.3770000040531158, 0.3799999952316284, 0.3810000121593475, 0.382999986410141, 0.3650000095367431, 0.3740000128746032, 0.382999986410141, 0.3689999878406524, 0.3759999871253967, 0.3869999945163727, 0.3889999985694885, 0.3860000073909759, 0.3819999992847442, 0.3689999878406524, 0.3860000073909759, 0.3810000121593475, 0.382999986410141, 0.3819999992847442, 0.3840000033378601, 0.3889999985694885, 0.3880000114440918, 0.3849999904632568, 0.3799999952316284, 0.3910000026226043, 0.3989999890327453, 0.3880000114440918, 0.3880000114440918, 0.3840000033378601, 0.3880000114440918, 0.3860000073909759, 0.3919999897480011, 0.3880000114440918, 0.3939999938011169, 0.3869999945163727, 0.3919999897480011, 0.3910000026226043, 0.382999986410141, 0.3930000066757202, 0.3840000033378601, 0.3880000114440918, 0.3840000033378601, 0.3819999992847442, 0.382999986410141, 0.3880000114440918, 0.3860000073909759, 0.3860000073909759, 0.3869999945163727, 0.3860000073909759, 0.3899999856948852, 0.3819999992847442, 0.3860000073909759, 0.3889999985694885, 0.3840000033378601, 0.395000010728836, 0.3899999856948852, 0.3899999856948852, 0.3910000026226043, 0.3959999978542328, 0.3959999978542328, 0.3919999897480011, 0.3980000019073486, 0.3880000114440918, 0.3930000066757202, 0.4000000059604645, 0.3919999897480011, 0.3919999897480011, 0.4040000140666961, 0.3930000066757202, 0.3970000147819519, 0.3889999985694885, 0.3959999978542328, 0.3930000066757202, 0.3939999938011169, 0.3970000147819519, 0.3910000026226043, 0.4020000100135803], "label": "FineWeb: independent MinHash (id mh)"}, "big-run-sampled-fineweb-c4-filters": {"x": [0.0, 2.0971520000000003, 4.194304000000001, 6.291456, 8.388608000000001, 10.48576, 12.582912, 14.680064000000002, 16.777216000000003, 18.874368, 20.97152, 23.068672000000003, 25.165824, 27.262976000000002, 29.360128000000003, 31.45728, 33.554432000000006, 35.651584, 37.748736, 39.845888, 41.94304, 44.040192000000005, 46.137344000000006, 48.234496, 50.331648, 52.4288, 54.525952000000004, 56.623104000000005, 58.720256000000006, 60.817408, 62.91456, 65.011712, 67.10886400000001, 69.206016, 71.303168, 73.40032000000001, 75.497472, 77.59462400000001, 79.691776, 81.788928, 83.88608, 85.983232, 88.08038400000001, 90.177536, 92.27468800000001, 94.37184, 96.468992, 98.56614400000001, 100.663296, 102.76044800000001, 104.8576, 106.95475200000001, 109.05190400000001, 111.149056, 113.24620800000001, 115.34336, 117.44051200000001, 119.537664, 121.634816, 123.73196800000001, 125.82912, 127.92627200000001, 130.023424, 132.120576, 134.21772800000002, 136.31488000000002, 138.412032, 140.509184, 142.606336, 144.70348800000002, 146.80064000000002, 148.897792, 150.994944, 153.092096, 155.18924800000002, 157.28640000000001, 159.383552, 161.480704, 163.577856, 165.67500800000002, 167.77216, 169.869312, 171.966464, 174.06361600000002, 176.16076800000002, 178.25792, 180.355072, 182.452224, 184.54937600000002, 186.64652800000002, 188.74368, 190.840832, 192.937984, 195.03513600000002, 197.13228800000002, 199.22944, 201.326592, 203.423744, 205.52089600000002, 207.61804800000002, 209.7152, 211.812352, 213.90950400000003, 216.00665600000002, 218.10380800000001, 220.20096, 222.298112, 224.39526400000003, 226.49241600000002, 228.589568, 230.68672, 232.783872, 234.88102400000002, 236.97817600000002, 239.075328, 241.17248, 243.269632, 245.36678400000002, 247.46393600000002, 249.561088, 251.65824, 253.75539200000003, 255.85254400000002, 257.949696, 260.046848, 262.144, 264.241152, 266.338304, 268.43545600000004, 270.53260800000004, 272.62976000000003, 274.726912, 276.824064, 278.921216, 281.018368, 283.11552, 285.212672, 287.309824, 289.40697600000004, 291.50412800000004, 293.60128000000003, 295.698432, 297.795584, 299.892736, 301.989888, 304.08704, 306.184192, 308.28134400000005, 310.37849600000004, 312.47564800000004, 314.57280000000003, 316.669952, 318.767104, 320.864256, 322.961408, 325.05856, 327.155712, 329.25286400000005, 331.35001600000004, 333.44716800000003, 335.54432, 337.641472, 339.738624, 341.835776, 343.932928, 346.03008, 348.12723200000005, 350.22438400000004], "y": [0.2329999953508377, 0.2630000114440918, 0.2770000100135803, 0.3050000071525574, 0.3100000023841858, 0.3149999976158142, 0.3190000057220459, 0.3350000083446502, 0.3210000097751617, 0.3310000002384186, 0.3389999866485595, 0.3289999961853027, 0.3379999995231628, 0.3420000076293945, 0.3409999907016754, 0.3510000109672546, 0.3479999899864197, 0.3440000116825104, 0.3569999933242798, 0.3529999852180481, 0.3680000007152557, 0.3549999892711639, 0.3499999940395355, 0.3589999973773956, 0.3529999852180481, 0.3459999859333038, 0.3529999852180481, 0.3630000054836273, 0.3600000143051147, 0.3490000069141388, 0.3540000021457672, 0.3600000143051147, 0.356000006198883, 0.3470000028610229, 0.3470000028610229, 0.3549999892711639, 0.3440000116825104, 0.3529999852180481, 0.3630000054836273, 0.3449999988079071, 0.3479999899864197, 0.3490000069141388, 0.3519999980926513, 0.367000013589859, 0.356000006198883, 0.356000006198883, 0.3519999980926513, 0.3580000102519989, 0.3569999933242798, 0.3659999966621399, 0.3759999871253967, 0.3689999878406524, 0.3779999911785126, 0.3549999892711639, 0.3610000014305115, 0.3650000095367431, 0.3610000014305115, 0.3580000102519989, 0.3729999959468841, 0.367000013589859, 0.3689999878406524, 0.3540000021457672, 0.363999992609024, 0.3700000047683716, 0.3650000095367431, 0.3529999852180481, 0.3709999918937683, 0.3740000128746032, 0.3680000007152557, 0.3689999878406524, 0.3580000102519989, 0.3650000095367431, 0.3619999885559082, 0.3619999885559082, 0.3630000054836273, 0.3610000014305115, 0.3659999966621399, 0.375, 0.375, 0.3700000047683716, 0.3840000033378601, 0.3779999911785126, 0.382999986410141, 0.367000013589859, 0.3860000073909759, 0.3770000040531158, 0.3790000081062317, 0.3880000114440918, 0.3659999966621399, 0.3630000054836273, 0.3770000040531158, 0.3779999911785126, 0.3680000007152557, 0.3779999911785126, 0.375, 0.3819999992847442, 0.3720000088214874, 0.3799999952316284, 0.382999986410141, 0.375, 0.367000013589859, 0.3869999945163727, 0.3810000121593475, 0.382999986410141, 0.3709999918937683, 0.3720000088214874, 0.3689999878406524, 0.367000013589859, 0.3819999992847442, 0.3720000088214874, 0.3849999904632568, 0.3709999918937683, 0.3740000128746032, 0.3709999918937683, 0.3799999952316284, 0.3799999952316284, 0.3869999945163727, 0.375, 0.3680000007152557, 0.3779999911785126, 0.3799999952316284, 0.3720000088214874, 0.3799999952316284, 0.3759999871253967, 0.3819999992847442, 0.3770000040531158, 0.3810000121593475, 0.3720000088214874, 0.3860000073909759, 0.3810000121593475, 0.3790000081062317, 0.3860000073909759, 0.3759999871253967, 0.3860000073909759, 0.3810000121593475, 0.3790000081062317, 0.3799999952316284, 0.3840000033378601, 0.3810000121593475, 0.3810000121593475, 0.3849999904632568, 0.3869999945163727, 0.3819999992847442, 0.3740000128746032, 0.3779999911785126, 0.3860000073909759, 0.3889999985694885, 0.3849999904632568, 0.3889999985694885, 0.3810000121593475, 0.3849999904632568, 0.3840000033378601, 0.3860000073909759, 0.3889999985694885, 0.382999986410141, 0.3849999904632568, 0.3840000033378601, 0.3880000114440918, 0.3810000121593475, 0.3849999904632568, 0.3790000081062317, 0.3799999952316284, 0.3819999992847442, 0.382999986410141, 0.3790000081062317, 0.3810000121593475, 0.3779999911785126, 0.3889999985694885], "label": "FineWeb: id mh + C4 filters"}, "big-run-sampled_full_filtered_no_dedup": {"x": [0.0, 2.0971520000000003, 4.194304000000001, 6.291456, 8.388608000000001, 10.48576, 12.582912, 14.680064000000002, 16.777216000000003, 18.874368, 20.97152, 23.068672000000003, 25.165824, 27.262976000000002, 29.360128000000003, 31.45728, 33.554432000000006, 35.651584, 37.748736, 39.845888, 41.94304, 44.040192000000005, 46.137344000000006, 48.234496, 50.331648, 52.4288, 54.525952000000004, 56.623104000000005, 58.720256000000006, 60.817408, 62.91456, 65.011712, 67.10886400000001, 69.206016, 71.303168, 73.40032000000001, 75.497472, 77.59462400000001, 79.691776, 81.788928, 83.88608, 85.983232, 88.08038400000001, 90.177536, 92.27468800000001, 94.37184, 96.468992, 98.56614400000001, 100.663296, 102.76044800000001, 104.8576, 106.95475200000001, 109.05190400000001, 111.149056, 113.24620800000001, 115.34336, 117.44051200000001, 119.537664, 121.634816, 123.73196800000001, 125.82912, 127.92627200000001, 130.023424, 132.120576, 134.21772800000002, 136.31488000000002, 138.412032, 140.509184, 142.606336, 144.70348800000002, 146.80064000000002, 148.897792, 150.994944, 153.092096, 155.18924800000002, 157.28640000000001, 159.383552, 161.480704, 163.577856, 165.67500800000002, 167.77216, 169.869312, 171.966464, 174.06361600000002, 176.16076800000002, 178.25792, 180.355072, 182.452224, 184.54937600000002, 186.64652800000002, 188.74368, 190.840832, 192.937984, 195.03513600000002, 197.13228800000002, 199.22944, 201.326592, 203.423744, 205.52089600000002, 207.61804800000002, 209.7152, 211.812352, 213.90950400000003, 216.00665600000002, 218.10380800000001, 220.20096, 222.298112, 224.39526400000003, 226.49241600000002, 228.589568, 230.68672, 232.783872, 234.88102400000002, 236.97817600000002, 239.075328, 241.17248, 243.269632, 245.36678400000002, 247.46393600000002, 249.561088, 251.65824, 253.75539200000003, 255.85254400000002, 257.949696, 260.046848, 262.144, 264.241152, 266.338304, 268.43545600000004, 270.53260800000004, 272.62976000000003, 274.726912, 276.824064, 278.921216, 281.018368, 283.11552, 285.212672, 287.309824, 289.40697600000004, 291.50412800000004, 293.60128000000003, 295.698432, 297.795584, 299.892736, 301.989888, 304.08704, 306.184192, 308.28134400000005, 310.37849600000004, 312.47564800000004, 314.57280000000003, 316.669952, 318.767104, 320.864256, 322.961408, 325.05856, 327.155712, 329.25286400000005, 331.35001600000004, 333.44716800000003, 335.54432, 337.641472, 339.738624, 341.835776, 343.932928, 346.03008, 348.12723200000005, 350.22438400000004], "y": [0.2329999953508377, 0.2599999904632568, 0.277999997138977, 0.2910000085830688, 0.3070000112056732, 0.3140000104904175, 0.3019999861717224, 0.3059999942779541, 0.3210000097751617, 0.3230000138282776, 0.324999988079071, 0.3149999976158142, 0.3109999895095825, 0.3339999914169311, 0.3319999873638153, 0.3319999873638153, 0.3300000131130218, 0.3370000123977661, 0.3219999969005584, 0.3370000123977661, 0.328000009059906, 0.3339999914169311, 0.3420000076293945, 0.3400000035762787, 0.3440000116825104, 0.3510000109672546, 0.3409999907016754, 0.3449999988079071, 0.3339999914169311, 0.3540000021457672, 0.3339999914169311, 0.3470000028610229, 0.3470000028610229, 0.3440000116825104, 0.3589999973773956, 0.3569999933242798, 0.3630000054836273, 0.3549999892711639, 0.3589999973773956, 0.3449999988079071, 0.3549999892711639, 0.3449999988079071, 0.3389999866485595, 0.3499999940395355, 0.3610000014305115, 0.3619999885559082, 0.3600000143051147, 0.3519999980926513, 0.3479999899864197, 0.356000006198883, 0.3519999980926513, 0.3440000116825104, 0.3490000069141388, 0.3519999980926513, 0.3470000028610229, 0.3589999973773956, 0.3449999988079071, 0.3490000069141388, 0.356000006198883, 0.3619999885559082, 0.3569999933242798, 0.3659999966621399, 0.3610000014305115, 0.3549999892711639, 0.3700000047683716, 0.363999992609024, 0.3600000143051147, 0.3580000102519989, 0.3549999892711639, 0.3619999885559082, 0.3689999878406524, 0.3630000054836273, 0.363999992609024, 0.3700000047683716, 0.367000013589859, 0.3630000054836273, 0.3630000054836273, 0.3700000047683716, 0.3589999973773956, 0.3540000021457672, 0.3540000021457672, 0.3659999966621399, 0.3619999885559082, 0.3589999973773956, 0.3650000095367431, 0.3709999918937683, 0.3680000007152557, 0.3689999878406524, 0.3650000095367431, 0.3729999959468841, 0.3619999885559082, 0.3689999878406524, 0.3569999933242798, 0.3510000109672546, 0.3680000007152557, 0.363999992609024, 0.3700000047683716, 0.3659999966621399, 0.3659999966621399, 0.363999992609024, 0.3619999885559082, 0.3659999966621399, 0.3680000007152557, 0.3610000014305115, 0.3720000088214874, 0.3729999959468841, 0.3810000121593475, 0.3630000054836273, 0.3689999878406524, 0.3709999918937683, 0.3759999871253967, 0.382999986410141, 0.3729999959468841, 0.3720000088214874, 0.3680000007152557, 0.3659999966621399, 0.3650000095367431, 0.363999992609024, 0.3589999973773956, 0.356000006198883, 0.3650000095367431, 0.3659999966621399, 0.367000013589859, 0.3729999959468841, 0.3720000088214874, 0.375, 0.3740000128746032, 0.3700000047683716, 0.3569999933242798, 0.3759999871253967, 0.3740000128746032, 0.367000013589859, 0.3770000040531158, 0.3759999871253967, 0.3709999918937683, 0.3779999911785126, 0.3709999918937683, 0.3689999878406524, 0.3799999952316284, 0.3630000054836273, 0.375, 0.3700000047683716, 0.3700000047683716, 0.3729999959468841, 0.3720000088214874, 0.3790000081062317, 0.375, 0.3729999959468841, 0.3770000040531158, 0.3799999952316284, 0.3779999911785126, 0.3720000088214874, 0.3799999952316284, 0.3759999871253967, 0.3799999952316284, 0.3790000081062317, 0.375, 0.3740000128746032, 0.3729999959468841, 0.3840000033378601, 0.3659999966621399, 0.3759999871253967, 0.3720000088214874, 0.3720000088214874, 0.3759999871253967, 0.375, 0.3650000095367431, 0.3729999959468841], "label": "FineWeb: base filtering only"}, "big-run-fineweb-v1-all-dumps": {"x": [0.0, 2.0971520000000003, 4.194304000000001, 6.291456, 8.388608000000001, 10.48576, 12.582912, 14.680064000000002, 16.777216000000003, 18.874368, 20.97152, 23.068672000000003, 25.165824, 27.262976000000002, 29.360128000000003, 31.45728, 33.554432000000006, 35.651584, 37.748736, 39.845888, 41.94304, 44.040192000000005, 46.137344000000006, 48.234496, 50.331648, 52.4288, 54.525952000000004, 56.623104000000005, 58.720256000000006, 60.817408, 62.91456, 65.011712, 67.10886400000001, 69.206016, 71.303168, 73.40032000000001, 75.497472, 77.59462400000001, 79.691776, 81.788928, 83.88608, 85.983232, 88.08038400000001, 90.177536, 92.27468800000001, 94.37184, 96.468992, 98.56614400000001, 100.663296, 102.76044800000001, 104.8576, 106.95475200000001, 109.05190400000001, 111.149056, 113.24620800000001, 115.34336, 117.44051200000001, 119.537664, 121.634816, 123.73196800000001, 125.82912, 127.92627200000001, 130.023424, 132.120576, 134.21772800000002, 136.31488000000002, 138.412032, 140.509184, 142.606336, 144.70348800000002, 146.80064000000002, 148.897792, 150.994944, 153.092096, 155.18924800000002, 157.28640000000001, 159.383552, 161.480704, 163.577856, 165.67500800000002, 167.77216, 169.869312, 171.966464, 174.06361600000002, 176.16076800000002, 178.25792, 180.355072, 182.452224, 184.54937600000002, 186.64652800000002, 188.74368, 190.840832, 192.937984, 195.03513600000002, 197.13228800000002, 199.22944, 201.326592, 203.423744, 205.52089600000002, 207.61804800000002, 209.7152, 211.812352, 213.90950400000003, 216.00665600000002, 218.10380800000001, 220.20096, 222.298112, 224.39526400000003, 226.49241600000002, 228.589568, 230.68672, 232.783872, 234.88102400000002, 236.97817600000002, 239.075328, 241.17248, 243.269632, 245.36678400000002, 247.46393600000002, 249.561088, 251.65824, 253.75539200000003, 255.85254400000002, 257.949696, 260.046848, 262.144, 264.241152, 266.338304, 268.43545600000004, 270.53260800000004, 272.62976000000003, 274.726912, 276.824064, 278.921216, 281.018368, 283.11552, 285.212672, 287.309824, 289.40697600000004, 291.50412800000004, 293.60128000000003, 295.698432, 297.795584, 299.892736, 301.989888, 304.08704, 306.184192, 308.28134400000005, 310.37849600000004, 312.47564800000004, 314.57280000000003, 316.669952, 318.767104, 320.864256, 322.961408, 325.05856, 327.155712, 329.25286400000005, 331.35001600000004, 333.44716800000003, 335.54432, 337.641472, 339.738624, 341.835776, 343.932928, 346.03008, 348.12723200000005, 350.22438400000004], "y": [0.2329999953508377, 0.2630000114440918, 0.2879999876022339, 0.296999990940094, 0.2960000038146972, 0.3039999902248382, 0.3129999935626983, 0.3149999976158142, 0.3300000131130218, 0.3300000131130218, 0.3350000083446502, 0.3379999995231628, 0.3370000123977661, 0.3330000042915344, 0.3370000123977661, 0.3389999866485595, 0.3429999947547912, 0.3659999966621399, 0.3459999859333038, 0.3479999899864197, 0.3440000116825104, 0.3470000028610229, 0.3569999933242798, 0.3510000109672546, 0.3680000007152557, 0.3529999852180481, 0.3680000007152557, 0.3549999892711639, 0.3540000021457672, 0.3529999852180481, 0.3499999940395355, 0.3569999933242798, 0.3529999852180481, 0.3499999940395355, 0.3540000021457672, 0.3659999966621399, 0.3600000143051147, 0.3680000007152557, 0.3659999966621399, 0.3600000143051147, 0.3659999966621399, 0.3540000021457672, 0.3580000102519989, 0.367000013589859, 0.3549999892711639, 0.3729999959468841, 0.3580000102519989, 0.3619999885559082, 0.3659999966621399, 0.3680000007152557, 0.3650000095367431, 0.3619999885559082, 0.3759999871253967, 0.3689999878406524, 0.3689999878406524, 0.3619999885559082, 0.3630000054836273, 0.3650000095367431, 0.3799999952316284, 0.3729999959468841, 0.3740000128746032, 0.367000013589859, 0.3720000088214874, 0.3600000143051147, 0.3650000095367431, 0.3729999959468841, 0.3589999973773956, 0.3799999952316284, 0.3589999973773956, 0.3799999952316284, 0.3680000007152557, 0.367000013589859, 0.367000013589859, 0.3700000047683716, 0.3790000081062317, 0.3729999959468841, 0.3770000040531158, 0.3709999918937683, 0.3759999871253967, 0.3759999871253967, 0.3700000047683716, 0.3720000088214874, 0.3840000033378601, 0.3770000040531158, 0.3770000040531158, 0.3790000081062317, 0.3860000073909759, 0.3759999871253967, 0.3650000095367431, 0.3700000047683716, 0.3819999992847442, 0.3819999992847442, 0.3630000054836273, 0.3689999878406524, 0.3759999871253967, 0.3759999871253967, 0.3779999911785126, 0.3740000128746032, 0.3860000073909759, 0.3619999885559082, 0.3740000128746032, 0.3799999952316284, 0.3819999992847442, 0.3740000128746032, 0.3770000040531158, 0.375, 0.3810000121593475, 0.3729999959468841, 0.3880000114440918, 0.3840000033378601, 0.3840000033378601, 0.3770000040531158, 0.3740000128746032, 0.382999986410141, 0.3840000033378601, 0.3770000040531158, 0.3869999945163727, 0.3729999959468841, 0.3770000040531158, 0.3759999871253967, 0.3840000033378601, 0.3880000114440918, 0.3759999871253967, 0.3740000128746032, 0.3720000088214874, 0.3790000081062317, 0.3740000128746032, 0.3630000054836273, 0.3810000121593475, 0.3720000088214874, 0.3729999959468841, 0.3720000088214874, 0.3840000033378601, 0.3759999871253967, 0.3840000033378601, 0.3790000081062317, 0.3819999992847442, 0.3689999878406524, 0.3700000047683716, 0.3790000081062317, 0.3729999959468841, 0.3799999952316284, 0.3799999952316284, 0.3740000128746032, 0.3689999878406524, 0.3810000121593475, 0.3720000088214874, 0.382999986410141, 0.3819999992847442, 0.3720000088214874, 0.3799999952316284, 0.3740000128746032, 0.3729999959468841, 0.3790000081062317, 0.3720000088214874, 0.3680000007152557, 0.3779999911785126, 0.3799999952316284, 0.3729999959468841, 0.3740000128746032, 0.3729999959468841, 0.3759999871253967, 0.3790000081062317, 0.3689999878406524, 0.3680000007152557, 0.3659999966621399, 0.3729999959468841, 0.3680000007152557], "label": "FineWeb: id mh + C4 + custom filters"}}, "layout": {"title": {"text": "The different FineWeb processing steps"}}}
assets/data/plots/all_filtering_steps/hellaswag_acc_norm.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"data": {"big-run-fineweb-v1-all-dumps": {"x": [0.0, 2.0971520000000003, 4.194304000000001, 6.291456, 8.388608000000001, 10.48576, 12.582912, 14.680064000000002, 16.777216000000003, 18.874368, 20.97152, 23.068672000000003, 25.165824, 27.262976000000002, 29.360128000000003, 31.45728, 33.554432000000006, 35.651584, 37.748736, 39.845888, 41.94304, 44.040192000000005, 46.137344000000006, 48.234496, 50.331648, 52.4288, 54.525952000000004, 56.623104000000005, 58.720256000000006, 60.817408, 62.91456, 65.011712, 67.10886400000001, 69.206016, 71.303168, 73.40032000000001, 75.497472, 77.59462400000001, 79.691776, 81.788928, 83.88608, 85.983232, 88.08038400000001, 90.177536, 92.27468800000001, 94.37184, 96.468992, 98.56614400000001, 100.663296, 102.76044800000001, 104.8576, 106.95475200000001, 109.05190400000001, 111.149056, 113.24620800000001, 115.34336, 117.44051200000001, 119.537664, 121.634816, 123.73196800000001, 125.82912, 127.92627200000001, 130.023424, 132.120576, 134.21772800000002, 136.31488000000002, 138.412032, 140.509184, 142.606336, 144.70348800000002, 146.80064000000002, 148.897792, 150.994944, 153.092096, 155.18924800000002, 157.28640000000001, 159.383552, 161.480704, 163.577856, 165.67500800000002, 167.77216, 169.869312, 171.966464, 174.06361600000002, 176.16076800000002, 178.25792, 180.355072, 182.452224, 184.54937600000002, 186.64652800000002, 188.74368, 190.840832, 192.937984, 195.03513600000002, 197.13228800000002, 199.22944, 201.326592, 203.423744, 205.52089600000002, 207.61804800000002, 209.7152, 211.812352, 213.90950400000003, 216.00665600000002, 218.10380800000001, 220.20096, 222.298112, 224.39526400000003, 226.49241600000002, 228.589568, 230.68672, 232.783872, 234.88102400000002, 236.97817600000002, 239.075328, 241.17248, 243.269632, 245.36678400000002, 247.46393600000002, 249.561088, 251.65824, 253.75539200000003, 255.85254400000002, 257.949696, 260.046848, 262.144, 264.241152, 266.338304, 268.43545600000004, 270.53260800000004, 272.62976000000003, 274.726912, 276.824064, 278.921216, 281.018368, 283.11552, 285.212672, 287.309824, 289.40697600000004, 291.50412800000004, 293.60128000000003, 295.698432, 297.795584, 299.892736, 301.989888, 304.08704, 306.184192, 308.28134400000005, 310.37849600000004, 312.47564800000004, 314.57280000000003, 316.669952, 318.767104, 320.864256, 322.961408, 325.05856, 327.155712, 329.25286400000005, 331.35001600000004, 333.44716800000003, 335.54432, 337.641472, 339.738624, 341.835776, 343.932928, 346.03008, 348.12723200000005, 350.22438400000004], "y": [0.257999986410141, 0.2919999957084656, 0.3310000002384186, 0.3549999892711639, 0.3939999938011169, 0.4149999916553497, 0.4329999983310699, 0.4460000097751617, 0.4589999914169311, 0.4819999933242798, 0.4769999980926513, 0.4830000102519989, 0.4909999966621399, 0.5059999823570251, 0.5059999823570251, 0.503000020980835, 0.5170000195503235, 0.5049999952316284, 0.5210000276565552, 0.5130000114440918, 0.5189999938011169, 0.5360000133514404, 0.5320000052452087, 0.5460000038146973, 0.5400000214576721, 0.5379999876022339, 0.531000018119812, 0.5460000038146973, 0.5509999990463257, 0.5519999861717224, 0.5559999942779541, 0.5609999895095825, 0.5559999942779541, 0.5580000281333923, 0.5450000166893005, 0.5509999990463257, 0.5590000152587891, 0.5649999976158142, 0.5619999766349792, 0.5680000185966492, 0.5669999718666077, 0.5709999799728394, 0.5569999814033508, 0.5640000104904175, 0.5690000057220459, 0.5720000267028809, 0.5759999752044678, 0.5839999914169312, 0.5699999928474426, 0.5740000009536743, 0.5830000042915344, 0.5839999914169312, 0.5799999833106995, 0.5830000042915344, 0.574999988079071, 0.5910000205039978, 0.5799999833106995, 0.5879999995231628, 0.6039999723434448, 0.578000009059906, 0.5849999785423279, 0.5889999866485596, 0.5849999785423279, 0.6019999980926514, 0.5929999947547913, 0.5820000171661377, 0.5860000252723694, 0.5910000205039978, 0.5849999785423279, 0.5849999785423279, 0.5839999914169312, 0.5860000252723694, 0.5979999899864197, 0.5849999785423279, 0.597000002861023, 0.5960000157356262, 0.6019999980926514, 0.6060000061988831, 0.5989999771118164, 0.5889999866485596, 0.5920000076293945, 0.5960000157356262, 0.5950000286102295, 0.6060000061988831, 0.5960000157356262, 0.6000000238418579, 0.6069999933242798, 0.6039999723434448, 0.6069999933242798, 0.6010000109672546, 0.6060000061988831, 0.6129999756813049, 0.5989999771118164, 0.6200000047683716, 0.5979999899864197, 0.609000027179718, 0.6029999852180481, 0.609000027179718, 0.6179999709129333, 0.6150000095367432, 0.6060000061988831, 0.6069999933242798, 0.6119999885559082, 0.6190000176429749, 0.6079999804496765, 0.6150000095367432, 0.6079999804496765, 0.6190000176429749, 0.6079999804496765, 0.609000027179718, 0.6079999804496765, 0.6179999709129333, 0.6140000224113464, 0.6200000047683716, 0.621999979019165, 0.6129999756813049, 0.6200000047683716, 0.6129999756813049, 0.6110000014305115, 0.6069999933242798, 0.609000027179718, 0.6159999966621399, 0.6169999837875366, 0.6129999756813049, 0.6169999837875366, 0.6159999966621399, 0.6200000047683716, 0.6150000095367432, 0.6240000128746033, 0.6179999709129333, 0.6179999709129333, 0.6129999756813049, 0.6179999709129333, 0.6110000014305115, 0.6190000176429749, 0.6200000047683716, 0.6150000095367432, 0.6159999966621399, 0.621999979019165, 0.6209999918937683, 0.6230000257492065, 0.6200000047683716, 0.6240000128746033, 0.6159999966621399, 0.6200000047683716, 0.6159999966621399, 0.6179999709129333, 0.6119999885559082, 0.6269999742507935, 0.6230000257492065, 0.6200000047683716, 0.6240000128746033, 0.6190000176429749, 0.6169999837875366, 0.6299999952316284, 0.625, 0.6179999709129333, 0.6150000095367432, 0.6259999871253967, 0.621999979019165, 0.625, 0.6190000176429749, 0.6259999871253967, 0.6340000033378601, 0.628000020980835, 0.6290000081062317, 0.628000020980835, 0.6269999742507935], "label": "FineWeb: id mh + C4 + custom filters"}, "big-run-sampled-fineweb-c4-filters": {"x": [0.0, 2.0971520000000003, 4.194304000000001, 6.291456, 8.388608000000001, 10.48576, 12.582912, 14.680064000000002, 16.777216000000003, 18.874368, 20.97152, 23.068672000000003, 25.165824, 27.262976000000002, 29.360128000000003, 31.45728, 33.554432000000006, 35.651584, 37.748736, 39.845888, 41.94304, 44.040192000000005, 46.137344000000006, 48.234496, 50.331648, 52.4288, 54.525952000000004, 56.623104000000005, 58.720256000000006, 60.817408, 62.91456, 65.011712, 67.10886400000001, 69.206016, 71.303168, 73.40032000000001, 75.497472, 77.59462400000001, 79.691776, 81.788928, 83.88608, 85.983232, 88.08038400000001, 90.177536, 92.27468800000001, 94.37184, 96.468992, 98.56614400000001, 100.663296, 102.76044800000001, 104.8576, 106.95475200000001, 109.05190400000001, 111.149056, 113.24620800000001, 115.34336, 117.44051200000001, 119.537664, 121.634816, 123.73196800000001, 125.82912, 127.92627200000001, 130.023424, 132.120576, 134.21772800000002, 136.31488000000002, 138.412032, 140.509184, 142.606336, 144.70348800000002, 146.80064000000002, 148.897792, 150.994944, 153.092096, 155.18924800000002, 157.28640000000001, 159.383552, 161.480704, 163.577856, 165.67500800000002, 167.77216, 169.869312, 171.966464, 174.06361600000002, 176.16076800000002, 178.25792, 180.355072, 182.452224, 184.54937600000002, 186.64652800000002, 188.74368, 190.840832, 192.937984, 195.03513600000002, 197.13228800000002, 199.22944, 201.326592, 203.423744, 205.52089600000002, 207.61804800000002, 209.7152, 211.812352, 213.90950400000003, 216.00665600000002, 218.10380800000001, 220.20096, 222.298112, 224.39526400000003, 226.49241600000002, 228.589568, 230.68672, 232.783872, 234.88102400000002, 236.97817600000002, 239.075328, 241.17248, 243.269632, 245.36678400000002, 247.46393600000002, 249.561088, 251.65824, 253.75539200000003, 255.85254400000002, 257.949696, 260.046848, 262.144, 264.241152, 266.338304, 268.43545600000004, 270.53260800000004, 272.62976000000003, 274.726912, 276.824064, 278.921216, 281.018368, 283.11552, 285.212672, 287.309824, 289.40697600000004, 291.50412800000004, 293.60128000000003, 295.698432, 297.795584, 299.892736, 301.989888, 304.08704, 306.184192, 308.28134400000005, 310.37849600000004, 312.47564800000004, 314.57280000000003, 316.669952, 318.767104, 320.864256, 322.961408, 325.05856, 327.155712, 329.25286400000005, 331.35001600000004, 333.44716800000003, 335.54432, 337.641472, 339.738624, 341.835776, 343.932928, 346.03008, 348.12723200000005, 350.22438400000004], "y": [0.257999986410141, 0.2849999964237213, 0.3240000009536743, 0.3580000102519989, 0.3930000066757202, 0.395000010728836, 0.4309999942779541, 0.44200000166893, 0.4399999976158142, 0.453000009059906, 0.453000009059906, 0.4650000035762787, 0.4699999988079071, 0.481000006198883, 0.4839999973773956, 0.4970000088214874, 0.5059999823570251, 0.4909999966621399, 0.5120000243186951, 0.5139999985694885, 0.5170000195503235, 0.5199999809265137, 0.5170000195503235, 0.5249999761581421, 0.5220000147819519, 0.5289999842643738, 0.5350000262260437, 0.531000018119812, 0.5289999842643738, 0.5339999794960022, 0.527999997138977, 0.5260000228881836, 0.5429999828338623, 0.5370000004768372, 0.5329999923706055, 0.5460000038146973, 0.5429999828338623, 0.5490000247955322, 0.546999990940094, 0.546999990940094, 0.5490000247955322, 0.5460000038146973, 0.5559999942779541, 0.5619999766349792, 0.5569999814033508, 0.5509999990463257, 0.5550000071525574, 0.5649999976158142, 0.5690000057220459, 0.5619999766349792, 0.5529999732971191, 0.5649999976158142, 0.5730000138282776, 0.5669999718666077, 0.5740000009536743, 0.5690000057220459, 0.5699999928474426, 0.574999988079071, 0.5640000104904175, 0.5789999961853027, 0.5720000267028809, 0.5640000104904175, 0.574999988079071, 0.5770000219345093, 0.5740000009536743, 0.5770000219345093, 0.5740000009536743, 0.5740000009536743, 0.578000009059906, 0.5759999752044678, 0.5789999961853027, 0.5799999833106995, 0.578000009059906, 0.5860000252723694, 0.5809999704360962, 0.5770000219345093, 0.5849999785423279, 0.5849999785423279, 0.5799999833106995, 0.578000009059906, 0.5809999704360962, 0.5870000123977661, 0.5830000042915344, 0.5720000267028809, 0.5879999995231628, 0.5830000042915344, 0.5929999947547913, 0.578000009059906, 0.5889999866485596, 0.5809999704360962, 0.5789999961853027, 0.593999981880188, 0.5820000171661377, 0.5910000205039978, 0.5830000042915344, 0.5879999995231628, 0.5879999995231628, 0.5889999866485596, 0.5879999995231628, 0.5899999737739563, 0.5960000157356262, 0.5899999737739563, 0.5879999995231628, 0.5870000123977661, 0.5910000205039978, 0.593999981880188, 0.597000002861023, 0.593999981880188, 0.5979999899864197, 0.593999981880188, 0.5989999771118164, 0.5929999947547913, 0.597000002861023, 0.6019999980926514, 0.5989999771118164, 0.6019999980926514, 0.597000002861023, 0.6000000238418579, 0.6019999980926514, 0.6039999723434448, 0.597000002861023, 0.6019999980926514, 0.5950000286102295, 0.6019999980926514, 0.6079999804496765, 0.6039999723434448, 0.6100000143051147, 0.6039999723434448, 0.6029999852180481, 0.6069999933242798, 0.6060000061988831, 0.6069999933242798, 0.6000000238418579, 0.6100000143051147, 0.6100000143051147, 0.6129999756813049, 0.609000027179718, 0.6010000109672546, 0.6000000238418579, 0.6110000014305115, 0.609000027179718, 0.6069999933242798, 0.6119999885559082, 0.6050000190734863, 0.6110000014305115, 0.6190000176429749, 0.6169999837875366, 0.6140000224113464, 0.6100000143051147, 0.6200000047683716, 0.6200000047683716, 0.6110000014305115, 0.6150000095367432, 0.6129999756813049, 0.6079999804496765, 0.6179999709129333, 0.6200000047683716, 0.6129999756813049, 0.6190000176429749, 0.6150000095367432, 0.6240000128746033, 0.6240000128746033, 0.609000027179718, 0.609000027179718, 0.6159999966621399, 0.6110000014305115, 0.6110000014305115, 0.6190000176429749], "label": "FineWeb: id mh + C4 filters"}, "big-run-sampled_full_ind_minhash": {"x": [0.0, 2.0971520000000003, 4.194304000000001, 6.291456, 8.388608000000001, 10.48576, 12.582912, 14.680064000000002, 16.777216000000003, 18.874368, 20.97152, 23.068672000000003, 25.165824, 27.262976000000002, 29.360128000000003, 31.45728, 33.554432000000006, 35.651584, 37.748736, 39.845888, 41.94304, 44.040192000000005, 46.137344000000006, 48.234496, 50.331648, 52.4288, 54.525952000000004, 56.623104000000005, 58.720256000000006, 60.817408, 62.91456, 65.011712, 67.10886400000001, 69.206016, 71.303168, 73.40032000000001, 75.497472, 77.59462400000001, 79.691776, 81.788928, 83.88608, 85.983232, 88.08038400000001, 90.177536, 92.27468800000001, 94.37184, 96.468992, 98.56614400000001, 100.663296, 102.76044800000001, 104.8576, 106.95475200000001, 109.05190400000001, 111.149056, 113.24620800000001, 115.34336, 117.44051200000001, 119.537664, 121.634816, 123.73196800000001, 125.82912, 127.92627200000001, 130.023424, 132.120576, 134.21772800000002, 136.31488000000002, 138.412032, 140.509184, 142.606336, 144.70348800000002, 146.80064000000002, 148.897792, 150.994944, 153.092096, 155.18924800000002, 157.28640000000001, 159.383552, 161.480704, 163.577856, 165.67500800000002, 167.77216, 169.869312, 171.966464, 174.06361600000002, 176.16076800000002, 178.25792, 180.355072, 182.452224, 184.54937600000002, 186.64652800000002, 188.74368, 190.840832, 192.937984, 195.03513600000002, 197.13228800000002, 199.22944, 201.326592, 203.423744, 205.52089600000002, 207.61804800000002, 209.7152, 211.812352, 213.90950400000003, 216.00665600000002, 218.10380800000001, 220.20096, 222.298112, 224.39526400000003, 226.49241600000002, 228.589568, 230.68672, 232.783872, 234.88102400000002, 236.97817600000002, 239.075328, 241.17248, 243.269632, 245.36678400000002, 247.46393600000002, 249.561088, 251.65824, 253.75539200000003, 255.85254400000002, 257.949696, 260.046848, 262.144, 264.241152, 266.338304, 268.43545600000004, 270.53260800000004, 272.62976000000003, 274.726912, 276.824064, 278.921216, 281.018368, 283.11552, 285.212672, 287.309824, 289.40697600000004, 291.50412800000004, 293.60128000000003, 295.698432, 297.795584, 299.892736, 301.989888, 304.08704, 306.184192, 308.28134400000005, 310.37849600000004, 312.47564800000004, 314.57280000000003, 316.669952, 318.767104, 320.864256, 322.961408, 325.05856, 327.155712, 329.25286400000005, 331.35001600000004, 333.44716800000003, 335.54432, 337.641472, 339.738624, 341.835776, 343.932928, 346.03008, 348.12723200000005, 350.22438400000004], "y": [0.257999986410141, 0.3019999861717224, 0.3059999942779541, 0.335999995470047, 0.3610000014305115, 0.3819999992847442, 0.4009999930858612, 0.4020000100135803, 0.4250000119209289, 0.4309999942779541, 0.4469999969005584, 0.4519999921321869, 0.453000009059906, 0.4580000042915344, 0.4749999940395355, 0.4699999988079071, 0.4799999892711639, 0.4749999940395355, 0.4769999980926513, 0.481000006198883, 0.4839999973773956, 0.4959999918937683, 0.5040000081062317, 0.4970000088214874, 0.4979999959468841, 0.5070000290870667, 0.5049999952316284, 0.5109999775886536, 0.515999972820282, 0.5120000243186951, 0.5120000243186951, 0.515999972820282, 0.5120000243186951, 0.5249999761581421, 0.5170000195503235, 0.5199999809265137, 0.5270000100135803, 0.5170000195503235, 0.5220000147819519, 0.5260000228881836, 0.5360000133514404, 0.5339999794960022, 0.5370000004768372, 0.5339999794960022, 0.5329999923706055, 0.531000018119812, 0.5329999923706055, 0.5400000214576721, 0.5429999828338623, 0.5389999747276306, 0.5419999957084656, 0.5429999828338623, 0.5360000133514404, 0.5299999713897705, 0.546999990940094, 0.5360000133514404, 0.5450000166893005, 0.5440000295639038, 0.5350000262260437, 0.5339999794960022, 0.5419999957084656, 0.5450000166893005, 0.5460000038146973, 0.5370000004768372, 0.5490000247955322, 0.5440000295639038, 0.550000011920929, 0.5490000247955322, 0.5450000166893005, 0.5490000247955322, 0.5559999942779541, 0.5559999942779541, 0.5410000085830688, 0.5419999957084656, 0.5529999732971191, 0.5460000038146973, 0.5540000200271606, 0.5379999876022339, 0.5509999990463257, 0.5540000200271606, 0.5419999957084656, 0.546999990940094, 0.5479999780654907, 0.5460000038146973, 0.5460000038146973, 0.5519999861717224, 0.5600000023841858, 0.5540000200271606, 0.5509999990463257, 0.5609999895095825, 0.5619999766349792, 0.5590000152587891, 0.5559999942779541, 0.5580000281333923, 0.5640000104904175, 0.5649999976158142, 0.5590000152587891, 0.5550000071525574, 0.5630000233650208, 0.5630000233650208, 0.5609999895095825, 0.5559999942779541, 0.5609999895095825, 0.5630000233650208, 0.5680000185966492, 0.5630000233650208, 0.5690000057220459, 0.5609999895095825, 0.5590000152587891, 0.5640000104904175, 0.5690000057220459, 0.5640000104904175, 0.5630000233650208, 0.574999988079071, 0.5630000233650208, 0.5619999766349792, 0.5690000057220459, 0.5770000219345093, 0.5690000057220459, 0.5609999895095825, 0.5649999976158142, 0.5680000185966492, 0.5590000152587891, 0.5600000023841858, 0.5619999766349792, 0.5799999833106995, 0.5619999766349792, 0.5699999928474426, 0.5709999799728394, 0.5669999718666077, 0.5680000185966492, 0.5609999895095825, 0.5649999976158142, 0.5680000185966492, 0.5730000138282776, 0.5720000267028809, 0.5709999799728394, 0.5770000219345093, 0.574999988079071, 0.5730000138282776, 0.5690000057220459, 0.5740000009536743, 0.578000009059906, 0.574999988079071, 0.5820000171661377, 0.5730000138282776, 0.5740000009536743, 0.574999988079071, 0.5770000219345093, 0.5789999961853027, 0.5759999752044678, 0.5720000267028809, 0.5770000219345093, 0.5759999752044678, 0.5789999961853027, 0.5789999961853027, 0.5730000138282776, 0.5789999961853027, 0.5759999752044678, 0.5690000057220459, 0.5849999785423279, 0.5759999752044678, 0.5699999928474426, 0.5789999961853027, 0.5820000171661377, 0.5730000138282776, 0.5730000138282776, 0.5789999961853027], "label": "FineWeb: independent MinHash (id mh)"}, "big-run-sampled_full_filtered_no_dedup": {"x": [0.0, 2.0971520000000003, 4.194304000000001, 6.291456, 8.388608000000001, 10.48576, 12.582912, 14.680064000000002, 16.777216000000003, 18.874368, 20.97152, 23.068672000000003, 25.165824, 27.262976000000002, 29.360128000000003, 31.45728, 33.554432000000006, 35.651584, 37.748736, 39.845888, 41.94304, 44.040192000000005, 46.137344000000006, 48.234496, 50.331648, 52.4288, 54.525952000000004, 56.623104000000005, 58.720256000000006, 60.817408, 62.91456, 65.011712, 67.10886400000001, 69.206016, 71.303168, 73.40032000000001, 75.497472, 77.59462400000001, 79.691776, 81.788928, 83.88608, 85.983232, 88.08038400000001, 90.177536, 92.27468800000001, 94.37184, 96.468992, 98.56614400000001, 100.663296, 102.76044800000001, 104.8576, 106.95475200000001, 109.05190400000001, 111.149056, 113.24620800000001, 115.34336, 117.44051200000001, 119.537664, 121.634816, 123.73196800000001, 125.82912, 127.92627200000001, 130.023424, 132.120576, 134.21772800000002, 136.31488000000002, 138.412032, 140.509184, 142.606336, 144.70348800000002, 146.80064000000002, 148.897792, 150.994944, 153.092096, 155.18924800000002, 157.28640000000001, 159.383552, 161.480704, 163.577856, 165.67500800000002, 167.77216, 169.869312, 171.966464, 174.06361600000002, 176.16076800000002, 178.25792, 180.355072, 182.452224, 184.54937600000002, 186.64652800000002, 188.74368, 190.840832, 192.937984, 195.03513600000002, 197.13228800000002, 199.22944, 201.326592, 203.423744, 205.52089600000002, 207.61804800000002, 209.7152, 211.812352, 213.90950400000003, 216.00665600000002, 218.10380800000001, 220.20096, 222.298112, 224.39526400000003, 226.49241600000002, 228.589568, 230.68672, 232.783872, 234.88102400000002, 236.97817600000002, 239.075328, 241.17248, 243.269632, 245.36678400000002, 247.46393600000002, 249.561088, 251.65824, 253.75539200000003, 255.85254400000002, 257.949696, 260.046848, 262.144, 264.241152, 266.338304, 268.43545600000004, 270.53260800000004, 272.62976000000003, 274.726912, 276.824064, 278.921216, 281.018368, 283.11552, 285.212672, 287.309824, 289.40697600000004, 291.50412800000004, 293.60128000000003, 295.698432, 297.795584, 299.892736, 301.989888, 304.08704, 306.184192, 308.28134400000005, 310.37849600000004, 312.47564800000004, 314.57280000000003, 316.669952, 318.767104, 320.864256, 322.961408, 325.05856, 327.155712, 329.25286400000005, 331.35001600000004, 333.44716800000003, 335.54432, 337.641472, 339.738624, 341.835776, 343.932928, 346.03008, 348.12723200000005, 350.22438400000004], "y": [0.257999986410141, 0.2809999883174896, 0.3230000138282776, 0.3409999907016754, 0.3600000143051147, 0.3569999933242798, 0.3889999985694885, 0.395000010728836, 0.4199999868869781, 0.4180000126361847, 0.421999990940094, 0.4289999902248382, 0.4350000023841858, 0.4359999895095825, 0.4350000023841858, 0.4480000138282776, 0.4480000138282776, 0.453000009059906, 0.4550000131130218, 0.4589999914169311, 0.4639999866485595, 0.4600000083446502, 0.460999995470047, 0.4589999914169311, 0.481000006198883, 0.4769999980926513, 0.4709999859333038, 0.4740000069141388, 0.4679999947547912, 0.4790000021457672, 0.4729999899864197, 0.4819999933242798, 0.4850000143051147, 0.4819999933242798, 0.4819999933242798, 0.4880000054836273, 0.4869999885559082, 0.4959999918937683, 0.4850000143051147, 0.4959999918937683, 0.492000013589859, 0.503000020980835, 0.4930000007152557, 0.5099999904632568, 0.5040000081062317, 0.5009999871253967, 0.4970000088214874, 0.4979999959468841, 0.5059999823570251, 0.5070000290870667, 0.5040000081062317, 0.5059999823570251, 0.5049999952316284, 0.5080000162124634, 0.5049999952316284, 0.5019999742507935, 0.5120000243186951, 0.5170000195503235, 0.5170000195503235, 0.5090000033378601, 0.5239999890327454, 0.527999997138977, 0.5230000019073486, 0.5210000276565552, 0.5149999856948853, 0.5189999938011169, 0.5270000100135803, 0.5149999856948853, 0.5099999904632568, 0.5299999713897705, 0.5199999809265137, 0.5230000019073486, 0.5260000228881836, 0.5249999761581421, 0.5239999890327454, 0.5329999923706055, 0.5210000276565552, 0.5260000228881836, 0.5170000195503235, 0.531000018119812, 0.5289999842643738, 0.531000018119812, 0.5270000100135803, 0.5299999713897705, 0.5370000004768372, 0.5379999876022339, 0.5419999957084656, 0.5329999923706055, 0.5360000133514404, 0.5299999713897705, 0.5360000133514404, 0.5270000100135803, 0.5450000166893005, 0.5410000085830688, 0.546999990940094, 0.5329999923706055, 0.5329999923706055, 0.5379999876022339, 0.5299999713897705, 0.5429999828338623, 0.5360000133514404, 0.5339999794960022, 0.5419999957084656, 0.5410000085830688, 0.5370000004768372, 0.5389999747276306, 0.527999997138977, 0.5400000214576721, 0.5400000214576721, 0.531000018119812, 0.5440000295639038, 0.5460000038146973, 0.5479999780654907, 0.5460000038146973, 0.5410000085830688, 0.5509999990463257, 0.5479999780654907, 0.5410000085830688, 0.5389999747276306, 0.550000011920929, 0.5569999814033508, 0.550000011920929, 0.5490000247955322, 0.5490000247955322, 0.5569999814033508, 0.5519999861717224, 0.5479999780654907, 0.5559999942779541, 0.5550000071525574, 0.5460000038146973, 0.5540000200271606, 0.5460000038146973, 0.5460000038146973, 0.5509999990463257, 0.5460000038146973, 0.5550000071525574, 0.5479999780654907, 0.5479999780654907, 0.5540000200271606, 0.5550000071525574, 0.5529999732971191, 0.5529999732971191, 0.5509999990463257, 0.5509999990463257, 0.5419999957084656, 0.546999990940094, 0.5509999990463257, 0.5559999942779541, 0.5490000247955322, 0.5509999990463257, 0.5529999732971191, 0.550000011920929, 0.5540000200271606, 0.5550000071525574, 0.5580000281333923, 0.550000011920929, 0.5569999814033508, 0.5490000247955322, 0.5519999861717224, 0.5519999861717224, 0.5559999942779541, 0.5569999814033508, 0.5559999942779541, 0.5550000071525574, 0.5559999942779541, 0.5490000247955322, 0.5550000071525574, 0.5600000023841858], "label": "FineWeb: base filtering only"}}, "layout": {"title": {"text": "The different FineWeb processing steps"}}}
assets/data/plots/all_filtering_steps/index.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"files": {"agg_score": {"file": "agg_score.json"}, "commonsense_qa/acc_norm": {"file": "commonsense_qa_acc_norm.json"}, "hellaswag/acc_norm": {"file": "hellaswag_acc_norm.json"}, "openbookqa/acc_norm": {"file": "openbookqa_acc_norm.json"}, "piqa/acc_norm": {"file": "piqa_acc_norm.json"}, "siqa/acc_norm": {"file": "siqa_acc_norm.json"}, "winogrande/acc_norm": {"file": "winogrande_acc_norm.json"}, "arc/acc_norm": {"file": "arc_acc_norm.json"}, "mmlu/acc_norm": {"file": "mmlu_acc_norm.json"}}, "settings": {"defaultMetric": "agg_score", "slider": {"min": 0, "max": 30, "default": 5}}}
assets/data/plots/all_filtering_steps/mmlu_acc_norm.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"data": {"big-run-sampled_full_ind_minhash": {"x": [0.0, 2.0971520000000003, 4.194304000000001, 6.291456, 8.388608000000001, 10.48576, 12.582912, 14.680064000000002, 16.777216000000003, 18.874368, 20.97152, 23.068672000000003, 25.165824, 27.262976000000002, 29.360128000000003, 31.45728, 33.554432000000006, 35.651584, 37.748736, 39.845888, 41.94304, 44.040192000000005, 46.137344000000006, 48.234496, 50.331648, 52.4288, 54.525952000000004, 56.623104000000005, 58.720256000000006, 60.817408, 62.91456, 65.011712, 67.10886400000001, 69.206016, 71.303168, 73.40032000000001, 75.497472, 77.59462400000001, 79.691776, 81.788928, 83.88608, 85.983232, 88.08038400000001, 90.177536, 92.27468800000001, 94.37184, 96.468992, 98.56614400000001, 100.663296, 102.76044800000001, 104.8576, 106.95475200000001, 109.05190400000001, 111.149056, 113.24620800000001, 115.34336, 117.44051200000001, 119.537664, 121.634816, 123.73196800000001, 125.82912, 127.92627200000001, 130.023424, 132.120576, 134.21772800000002, 136.31488000000002, 138.412032, 140.509184, 142.606336, 144.70348800000002, 146.80064000000002, 148.897792, 150.994944, 153.092096, 155.18924800000002, 157.28640000000001, 159.383552, 161.480704, 163.577856, 165.67500800000002, 167.77216, 169.869312, 171.966464, 174.06361600000002, 176.16076800000002, 178.25792, 180.355072, 182.452224, 184.54937600000002, 186.64652800000002, 188.74368, 190.840832, 192.937984, 195.03513600000002, 197.13228800000002, 199.22944, 201.326592, 203.423744, 205.52089600000002, 207.61804800000002, 209.7152, 211.812352, 213.90950400000003, 216.00665600000002, 218.10380800000001, 220.20096, 222.298112, 224.39526400000003, 226.49241600000002, 228.589568, 230.68672, 232.783872, 234.88102400000002, 236.97817600000002, 239.075328, 241.17248, 243.269632, 245.36678400000002, 247.46393600000002, 249.561088, 251.65824, 253.75539200000003, 255.85254400000002, 257.949696, 260.046848, 262.144, 264.241152, 266.338304, 268.43545600000004, 270.53260800000004, 272.62976000000003, 274.726912, 276.824064, 278.921216, 281.018368, 283.11552, 285.212672, 287.309824, 289.40697600000004, 291.50412800000004, 293.60128000000003, 295.698432, 297.795584, 299.892736, 301.989888, 304.08704, 306.184192, 308.28134400000005, 310.37849600000004, 312.47564800000004, 314.57280000000003, 316.669952, 318.767104, 320.864256, 322.961408, 325.05856, 327.155712, 329.25286400000005, 331.35001600000004, 333.44716800000003, 335.54432, 337.641472, 339.738624, 341.835776, 343.932928, 346.03008, 348.12723200000005, 350.22438400000004], "y": [0.2501466572284698, 0.2558934390544891, 0.2618628144264221, 0.2683217823505401, 0.2699837982654571, 0.2738722860813141, 0.2744417488574981, 0.2740873992443084, 0.2807216048240661, 0.2820421457290649, 0.2891400754451751, 0.2879075407981872, 0.2881667613983154, 0.2892490327358246, 0.2935869693756103, 0.2870290875434875, 0.2911452651023865, 0.2949125170707702, 0.2916406095027923, 0.2981449663639068, 0.2953989207744598, 0.2946988642215729, 0.297021746635437, 0.3001497685909271, 0.3010218441486358, 0.2977036237716675, 0.2992585003376007, 0.2986803948879242, 0.2994338274002075, 0.2989781498908996, 0.3041955828666687, 0.3030496537685394, 0.303806334733963, 0.3036351203918457, 0.3058845102787018, 0.300450712442398, 0.3025284707546234, 0.3072526752948761, 0.3039065897464752, 0.3073755502700805, 0.3070493042469024, 0.3083153367042541, 0.3123056292533874, 0.307761400938034, 0.3053378164768219, 0.3116358816623688, 0.3080427348613739, 0.308482676744461, 0.307318776845932, 0.3083004653453827, 0.3089516758918762, 0.3088736236095428, 0.3077724277973175, 0.3126304149627685, 0.3101697862148285, 0.3159398734569549, 0.314792275428772, 0.3103811144828796, 0.3111368715763092, 0.3129658997058868, 0.311605304479599, 0.3118223249912262, 0.3133279979228973, 0.3146496713161468, 0.3195074200630188, 0.3142614662647247, 0.3125102519989013, 0.3115333616733551, 0.3183117806911468, 0.3168580532073974, 0.3187012672424316, 0.3179306983947754, 0.3157722651958465, 0.3214826583862304, 0.3145081698894501, 0.3172421753406524, 0.3151432573795318, 0.3181649446487427, 0.3180212080478668, 0.3171605765819549, 0.3212067782878876, 0.3180184066295624, 0.3209905624389648, 0.319052129983902, 0.3212707936763763, 0.3196887373924255, 0.3188316226005554, 0.3164899051189422, 0.3241994678974151, 0.3179469406604767, 0.3214083909988403, 0.3206575512886047, 0.3263285160064697, 0.3219505250453949, 0.3181525468826294, 0.3219776451587677, 0.3259726762771606, 0.3197665512561798, 0.3236161768436432, 0.3177970349788666, 0.3258080780506134, 0.3208407461643219, 0.3251138925552368, 0.3242645859718323, 0.3229723274707794, 0.3227455914020538, 0.3206316232681274, 0.3256695866584778, 0.3241210877895355, 0.3224890530109405, 0.3263737261295318, 0.3214233517646789, 0.3240345120429992, 0.3222567737102508, 0.3242291808128357, 0.3257078528404236, 0.3278365731239319, 0.3277338743209839, 0.3253948092460632, 0.3232105076313019, 0.3267974853515625, 0.3263654410839081, 0.3262891769409179, 0.3238334357738495, 0.3294911682605743, 0.3261866867542267, 0.3243315815925598, 0.3250119090080261, 0.326727420091629, 0.3268802464008331, 0.3269768059253692, 0.3257980346679687, 0.3280686736106872, 0.3274897634983063, 0.3282252252101898, 0.3272863030433655, 0.328346699476242, 0.325562834739685, 0.3301684856414795, 0.3284023404121399, 0.3268299400806427, 0.3286610245704651, 0.3291078805923462, 0.324972927570343, 0.3314772248268127, 0.3278062343597412, 0.326839417219162, 0.3277239501476288, 0.330414742231369, 0.3271744549274444, 0.3279334008693695, 0.3288575112819671, 0.3285425007343292, 0.3282454907894134, 0.3296376466751098, 0.3305942714214325, 0.3276287615299225, 0.3292438983917236, 0.329515129327774, 0.3281475007534027, 0.3282177448272705, 0.3333999514579773, 0.3302631080150604, 0.330238401889801, 0.3323166668415069, 0.3313035368919372, 0.32961106300354, 0.3321967124938965], "label": "FineWeb: independent MinHash (id mh)"}, "big-run-fineweb-v1-all-dumps": {"x": [0.0, 2.0971520000000003, 4.194304000000001, 6.291456, 8.388608000000001, 10.48576, 12.582912, 14.680064000000002, 16.777216000000003, 18.874368, 20.97152, 23.068672000000003, 25.165824, 27.262976000000002, 29.360128000000003, 31.45728, 33.554432000000006, 35.651584, 37.748736, 39.845888, 41.94304, 44.040192000000005, 46.137344000000006, 48.234496, 50.331648, 52.4288, 54.525952000000004, 56.623104000000005, 58.720256000000006, 60.817408, 62.91456, 65.011712, 67.10886400000001, 69.206016, 71.303168, 73.40032000000001, 75.497472, 77.59462400000001, 79.691776, 81.788928, 83.88608, 85.983232, 88.08038400000001, 90.177536, 92.27468800000001, 94.37184, 96.468992, 98.56614400000001, 100.663296, 102.76044800000001, 104.8576, 106.95475200000001, 109.05190400000001, 111.149056, 113.24620800000001, 115.34336, 117.44051200000001, 119.537664, 121.634816, 123.73196800000001, 125.82912, 127.92627200000001, 130.023424, 132.120576, 134.21772800000002, 136.31488000000002, 138.412032, 140.509184, 142.606336, 144.70348800000002, 146.80064000000002, 148.897792, 150.994944, 153.092096, 155.18924800000002, 157.28640000000001, 159.383552, 161.480704, 163.577856, 165.67500800000002, 167.77216, 169.869312, 171.966464, 174.06361600000002, 176.16076800000002, 178.25792, 180.355072, 182.452224, 184.54937600000002, 186.64652800000002, 188.74368, 190.840832, 192.937984, 195.03513600000002, 197.13228800000002, 199.22944, 201.326592, 203.423744, 205.52089600000002, 207.61804800000002, 209.7152, 211.812352, 213.90950400000003, 216.00665600000002, 218.10380800000001, 220.20096, 222.298112, 224.39526400000003, 226.49241600000002, 228.589568, 230.68672, 232.783872, 234.88102400000002, 236.97817600000002, 239.075328, 241.17248, 243.269632, 245.36678400000002, 247.46393600000002, 249.561088, 251.65824, 253.75539200000003, 255.85254400000002, 257.949696, 260.046848, 262.144, 264.241152, 266.338304, 268.43545600000004, 270.53260800000004, 272.62976000000003, 274.726912, 276.824064, 278.921216, 281.018368, 283.11552, 285.212672, 287.309824, 289.40697600000004, 291.50412800000004, 293.60128000000003, 295.698432, 297.795584, 299.892736, 301.989888, 304.08704, 306.184192, 308.28134400000005, 310.37849600000004, 312.47564800000004, 314.57280000000003, 316.669952, 318.767104, 320.864256, 322.961408, 325.05856, 327.155712, 329.25286400000005, 331.35001600000004, 333.44716800000003, 335.54432, 337.641472, 339.738624, 341.835776, 343.932928, 346.03008, 348.12723200000005, 350.22438400000004], "y": [0.2501466572284698, 0.2562687695026397, 0.264194518327713, 0.2659797668457031, 0.2690401375293731, 0.2707462012767792, 0.2736803293228149, 0.2808477580547333, 0.2819793820381164, 0.2818062305450439, 0.2852273285388946, 0.2852552533149719, 0.293150246143341, 0.2869345247745514, 0.2926198840141296, 0.2911646664142608, 0.2883031964302063, 0.2938489317893982, 0.2923268675804138, 0.2927436530590057, 0.2957125902175903, 0.2942458391189575, 0.2957732379436493, 0.2933609783649444, 0.2939628064632416, 0.2984270751476288, 0.2989151179790497, 0.3007727265357971, 0.2968312501907348, 0.2969468235969543, 0.3013020753860473, 0.3045085966587066, 0.3018752634525299, 0.3014349043369293, 0.2988792657852173, 0.3034284710884094, 0.3015728890895843, 0.3065252900123596, 0.3021449446678161, 0.3043071627616882, 0.303546279668808, 0.3056059181690216, 0.2971993386745453, 0.3057574033737182, 0.3057517111301422, 0.3124973773956299, 0.3139103651046753, 0.3144983947277069, 0.3126215636730194, 0.3140240907669067, 0.3113631308078766, 0.3124240636825561, 0.3126817643642425, 0.3123457431793213, 0.3111095428466797, 0.3113269805908203, 0.3142518699169159, 0.3163851797580719, 0.3134008049964905, 0.3138530254364013, 0.3171449303627014, 0.3119543790817261, 0.3147956132888794, 0.3138984441757202, 0.3178529143333435, 0.3162296414375305, 0.315980851650238, 0.3123161196708679, 0.3166452944278717, 0.3140694200992584, 0.3176922798156738, 0.3176673054695129, 0.3150016367435455, 0.3161586821079254, 0.3222477436065674, 0.3194025754928589, 0.3176416158676147, 0.3159928619861603, 0.3169592320919037, 0.3135637938976288, 0.3155058920383453, 0.3215300440788269, 0.3201274275779724, 0.3192023932933807, 0.3156079053878784, 0.3212503492832184, 0.3163617849349975, 0.3223940432071686, 0.3191330432891845, 0.3194314539432525, 0.3221519589424133, 0.3211863040924072, 0.3197937309741974, 0.3174488544464111, 0.3159596025943756, 0.3157133460044861, 0.3193388879299164, 0.3163386285305023, 0.3202225565910339, 0.3163421154022217, 0.3212694227695465, 0.3187369704246521, 0.3203508555889129, 0.3224054872989654, 0.3207881152629852, 0.3219418525695801, 0.3197605609893799, 0.3255409598350525, 0.3253240585327148, 0.319698303937912, 0.3250498473644256, 0.3228228390216827, 0.3213794529438019, 0.3219127357006073, 0.3214426934719085, 0.3238218128681183, 0.3229665458202362, 0.3220484256744385, 0.3240038454532623, 0.3246393501758575, 0.3237775564193725, 0.3258441984653473, 0.322843462228775, 0.3241913020610809, 0.324148565530777, 0.3238157927989959, 0.3248989582061767, 0.3280864655971527, 0.3288898766040802, 0.3265794515609741, 0.3277602791786194, 0.3231202363967895, 0.3224002718925476, 0.323845773935318, 0.3278093039989471, 0.3247094452381134, 0.3289697468280792, 0.3272296786308288, 0.3275051414966583, 0.3271359801292419, 0.3280861675739288, 0.3281281590461731, 0.327859491109848, 0.3281152546405792, 0.3282515406608581, 0.3258990049362182, 0.3271094560623169, 0.3259278535842895, 0.3258941769599914, 0.3278749883174896, 0.3300504386425018, 0.326113760471344, 0.3242938220500946, 0.3262194991111755, 0.3263693153858185, 0.3274452090263366, 0.3254594206809997, 0.3287247717380523, 0.3250340223312378, 0.3270816206932068, 0.3275731801986694, 0.3282500207424164, 0.3257671594619751, 0.3272948265075683, 0.3274084031581878, 0.3302212655544281, 0.3322067260742187, 0.3296935856342315], "label": "FineWeb: id mh + C4 + custom filters"}, "big-run-sampled-fineweb-c4-filters": {"x": [0.0, 2.0971520000000003, 4.194304000000001, 6.291456, 8.388608000000001, 10.48576, 12.582912, 14.680064000000002, 16.777216000000003, 18.874368, 20.97152, 23.068672000000003, 25.165824, 27.262976000000002, 29.360128000000003, 31.45728, 33.554432000000006, 35.651584, 37.748736, 39.845888, 41.94304, 44.040192000000005, 46.137344000000006, 48.234496, 50.331648, 52.4288, 54.525952000000004, 56.623104000000005, 58.720256000000006, 60.817408, 62.91456, 65.011712, 67.10886400000001, 69.206016, 71.303168, 73.40032000000001, 75.497472, 77.59462400000001, 79.691776, 81.788928, 83.88608, 85.983232, 88.08038400000001, 90.177536, 92.27468800000001, 94.37184, 96.468992, 98.56614400000001, 100.663296, 102.76044800000001, 104.8576, 106.95475200000001, 109.05190400000001, 111.149056, 113.24620800000001, 115.34336, 117.44051200000001, 119.537664, 121.634816, 123.73196800000001, 125.82912, 127.92627200000001, 130.023424, 132.120576, 134.21772800000002, 136.31488000000002, 138.412032, 140.509184, 142.606336, 144.70348800000002, 146.80064000000002, 148.897792, 150.994944, 153.092096, 155.18924800000002, 157.28640000000001, 159.383552, 161.480704, 163.577856, 165.67500800000002, 167.77216, 169.869312, 171.966464, 174.06361600000002, 176.16076800000002, 178.25792, 180.355072, 182.452224, 184.54937600000002, 186.64652800000002, 188.74368, 190.840832, 192.937984, 195.03513600000002, 197.13228800000002, 199.22944, 201.326592, 203.423744, 205.52089600000002, 207.61804800000002, 209.7152, 211.812352, 213.90950400000003, 216.00665600000002, 218.10380800000001, 220.20096, 222.298112, 224.39526400000003, 226.49241600000002, 228.589568, 230.68672, 232.783872, 234.88102400000002, 236.97817600000002, 239.075328, 241.17248, 243.269632, 245.36678400000002, 247.46393600000002, 249.561088, 251.65824, 253.75539200000003, 255.85254400000002, 257.949696, 260.046848, 262.144, 264.241152, 266.338304, 268.43545600000004, 270.53260800000004, 272.62976000000003, 274.726912, 276.824064, 278.921216, 281.018368, 283.11552, 285.212672, 287.309824, 289.40697600000004, 291.50412800000004, 293.60128000000003, 295.698432, 297.795584, 299.892736, 301.989888, 304.08704, 306.184192, 308.28134400000005, 310.37849600000004, 312.47564800000004, 314.57280000000003, 316.669952, 318.767104, 320.864256, 322.961408, 325.05856, 327.155712, 329.25286400000005, 331.35001600000004, 333.44716800000003, 335.54432, 337.641472, 339.738624, 341.835776, 343.932928, 346.03008, 348.12723200000005, 350.22438400000004], "y": [0.2501466572284698, 0.251920074224472, 0.2591456174850464, 0.2687398791313171, 0.269056499004364, 0.2683902382850647, 0.2725079655647278, 0.2752586305141449, 0.2753303050994873, 0.2848396897315979, 0.2833426892757416, 0.2844280302524566, 0.2847303748130798, 0.294879138469696, 0.2900991439819336, 0.2932447791099548, 0.2926276624202728, 0.2924879789352417, 0.2937914729118347, 0.2919517457485199, 0.2991980910301208, 0.2929336428642273, 0.3003193736076355, 0.2955676615238189, 0.2993899285793304, 0.2975476682186126, 0.2978275716304779, 0.2994768321514129, 0.2984132170677185, 0.2998209595680237, 0.3030214607715606, 0.2984272837638855, 0.2997688949108124, 0.3041917085647583, 0.3071142137050628, 0.3038201630115509, 0.3035272359848022, 0.3047704994678497, 0.3072718679904938, 0.3085931539535522, 0.3052116930484772, 0.3084307312965393, 0.3089986145496368, 0.3102100193500519, 0.3066395819187164, 0.3109234273433685, 0.3082580268383026, 0.3055950105190277, 0.3064471781253814, 0.3052197098731994, 0.3076579868793487, 0.3114514350891113, 0.3092200756072998, 0.3083749115467071, 0.3078047931194305, 0.3102362751960754, 0.3083966672420501, 0.3149019181728363, 0.3096835613250732, 0.3129985630512237, 0.3098655939102173, 0.3105471730232239, 0.3110797703266144, 0.3097324073314667, 0.3102588951587677, 0.3108883202075958, 0.3140653371810913, 0.3143481016159057, 0.3121947944164276, 0.3064004778861999, 0.3148718774318695, 0.3152956068515777, 0.3166911900043487, 0.3115324079990387, 0.311627596616745, 0.3122025728225708, 0.3186626732349396, 0.3177326321601867, 0.3107803463935852, 0.3128083050251007, 0.3109799027442932, 0.3142008483409881, 0.3121736049652099, 0.3163987696170807, 0.3134956955909729, 0.3152486085891723, 0.3163009285926819, 0.3165099024772644, 0.3186413049697876, 0.315637856721878, 0.3207236230373382, 0.3161193430423736, 0.3157550990581512, 0.320356547832489, 0.3172537386417389, 0.318843811750412, 0.3146925568580627, 0.3193819522857666, 0.3169102966785431, 0.3154685497283935, 0.3166710138320923, 0.3187788426876068, 0.3196616470813751, 0.3190047442913055, 0.3186605274677276, 0.3162576556205749, 0.3164195120334625, 0.3191222250461578, 0.3135613799095154, 0.3175683617591858, 0.3212282657623291, 0.3279584646224975, 0.3228197395801544, 0.3242316544055938, 0.3254729807376861, 0.3239398598670959, 0.3223652243614197, 0.3198129832744598, 0.3218621611595154, 0.3264771103858948, 0.323866069316864, 0.32564178109169, 0.32478728890419, 0.3236158192157745, 0.3245747685432434, 0.3280244767665863, 0.3271372020244598, 0.3254362642765045, 0.3266178965568542, 0.3218266665935516, 0.3268883228302002, 0.321928471326828, 0.324524849653244, 0.3237947523593902, 0.3238577842712402, 0.3237817287445068, 0.3233639299869537, 0.325821191072464, 0.3257157802581787, 0.3272253274917602, 0.3244009912014007, 0.3231483995914459, 0.3226592242717743, 0.3233656585216522, 0.3266710937023163, 0.3259218335151672, 0.3275097906589508, 0.3273427188396454, 0.3276328444480896, 0.3251460194587707, 0.3274493515491485, 0.3227463960647583, 0.3261785507202148, 0.32408007979393, 0.3253126442432403, 0.3242971301078796, 0.326819509267807, 0.3268508613109588, 0.3265140950679779, 0.3266753256320953, 0.3250673115253448, 0.3271500170230865, 0.3292337656021118, 0.3286773562431335, 0.3286141455173492, 0.3296676576137543, 0.3257955610752105, 0.3266003727912903], "label": "FineWeb: id mh + C4 filters"}, "big-run-sampled_full_filtered_no_dedup": {"x": [0.0, 2.0971520000000003, 4.194304000000001, 6.291456, 8.388608000000001, 10.48576, 12.582912, 14.680064000000002, 16.777216000000003, 18.874368, 20.97152, 23.068672000000003, 25.165824, 27.262976000000002, 29.360128000000003, 31.45728, 33.554432000000006, 35.651584, 37.748736, 39.845888, 41.94304, 44.040192000000005, 46.137344000000006, 48.234496, 50.331648, 52.4288, 54.525952000000004, 56.623104000000005, 58.720256000000006, 60.817408, 62.91456, 65.011712, 67.10886400000001, 69.206016, 71.303168, 73.40032000000001, 75.497472, 77.59462400000001, 79.691776, 81.788928, 83.88608, 85.983232, 88.08038400000001, 90.177536, 92.27468800000001, 94.37184, 96.468992, 98.56614400000001, 100.663296, 102.76044800000001, 104.8576, 106.95475200000001, 109.05190400000001, 111.149056, 113.24620800000001, 115.34336, 117.44051200000001, 119.537664, 121.634816, 123.73196800000001, 125.82912, 127.92627200000001, 130.023424, 132.120576, 134.21772800000002, 136.31488000000002, 138.412032, 140.509184, 142.606336, 144.70348800000002, 146.80064000000002, 148.897792, 150.994944, 153.092096, 155.18924800000002, 157.28640000000001, 159.383552, 161.480704, 163.577856, 165.67500800000002, 167.77216, 169.869312, 171.966464, 174.06361600000002, 176.16076800000002, 178.25792, 180.355072, 182.452224, 184.54937600000002, 186.64652800000002, 188.74368, 190.840832, 192.937984, 195.03513600000002, 197.13228800000002, 199.22944, 201.326592, 203.423744, 205.52089600000002, 207.61804800000002, 209.7152, 211.812352, 213.90950400000003, 216.00665600000002, 218.10380800000001, 220.20096, 222.298112, 224.39526400000003, 226.49241600000002, 228.589568, 230.68672, 232.783872, 234.88102400000002, 236.97817600000002, 239.075328, 241.17248, 243.269632, 245.36678400000002, 247.46393600000002, 249.561088, 251.65824, 253.75539200000003, 255.85254400000002, 257.949696, 260.046848, 262.144, 264.241152, 266.338304, 268.43545600000004, 270.53260800000004, 272.62976000000003, 274.726912, 276.824064, 278.921216, 281.018368, 283.11552, 285.212672, 287.309824, 289.40697600000004, 291.50412800000004, 293.60128000000003, 295.698432, 297.795584, 299.892736, 301.989888, 304.08704, 306.184192, 308.28134400000005, 310.37849600000004, 312.47564800000004, 314.57280000000003, 316.669952, 318.767104, 320.864256, 322.961408, 325.05856, 327.155712, 329.25286400000005, 331.35001600000004, 333.44716800000003, 335.54432, 337.641472, 339.738624, 341.835776, 343.932928, 346.03008, 348.12723200000005, 350.22438400000004], "y": [0.2501466572284698, 0.2516599297523498, 0.2610189318656921, 0.2666046619415283, 0.2667981088161468, 0.2667821645736694, 0.2708088159561157, 0.2738403379917145, 0.2726235687732696, 0.2762763500213623, 0.2768311202526092, 0.2809228301048279, 0.2836140990257263, 0.2822815179824829, 0.2797218561172485, 0.286342591047287, 0.2855269610881805, 0.2847287058830261, 0.2888180613517761, 0.286526083946228, 0.2865165770053863, 0.294582188129425, 0.2925947606563568, 0.2947863042354584, 0.2892930805683136, 0.2903610467910766, 0.288201242685318, 0.2873396277427673, 0.2916238009929657, 0.2908017039299011, 0.2907920777797699, 0.2952797412872314, 0.2941452264785766, 0.2921333611011505, 0.2925891280174255, 0.2968584895133972, 0.2980035543441772, 0.2964116632938385, 0.2962304651737213, 0.2950254380702972, 0.2977516651153564, 0.2944138348102569, 0.3003402054309845, 0.2976303696632385, 0.3013098239898681, 0.302829384803772, 0.3018766045570373, 0.305361807346344, 0.2971298694610595, 0.3014816343784332, 0.3019805550575256, 0.3037064969539642, 0.2970167994499206, 0.2995208501815796, 0.2970106601715088, 0.2990955114364624, 0.3027818500995636, 0.3048534691333771, 0.2993872463703155, 0.2986327707767486, 0.3015393316745758, 0.3003426790237427, 0.3003274798393249, 0.3017795085906982, 0.3019182682037353, 0.3015450537204742, 0.3046211004257202, 0.3031167984008789, 0.3020436763763428, 0.3011128306388855, 0.3029948472976684, 0.3045558631420135, 0.301642894744873, 0.3029441833496094, 0.3035804331302643, 0.3004390001296997, 0.3021787703037262, 0.306041270494461, 0.3064048886299133, 0.3087956011295318, 0.3070018291473388, 0.3065581619739532, 0.3093871772289276, 0.3060930073261261, 0.3033313155174255, 0.3072777390480041, 0.306413859128952, 0.3104493916034698, 0.3056999444961548, 0.3077532052993774, 0.309231549501419, 0.3070645034313202, 0.3117790520191192, 0.3114112913608551, 0.312661737203598, 0.3181777000427246, 0.3117201030254364, 0.3099702894687652, 0.3074746131896972, 0.3064963519573211, 0.3105958700180053, 0.3111456036567688, 0.3084964454174042, 0.3087405860424042, 0.3121673166751861, 0.3121528625488281, 0.3100416660308838, 0.3142979145050049, 0.3129935264587402, 0.3112611472606659, 0.3119436800479889, 0.3154115974903106, 0.3091593086719513, 0.3103814721107483, 0.3130497634410858, 0.3133455514907837, 0.3152708411216736, 0.3137963414192199, 0.3099324703216553, 0.3164172768592834, 0.3133907914161682, 0.3128255009651184, 0.3134104907512665, 0.3106969892978668, 0.3130004107952118, 0.3131391704082489, 0.3130116462707519, 0.3143952488899231, 0.3143975436687469, 0.3143710494041443, 0.3163396418094635, 0.3166862726211548, 0.3184126019477844, 0.3178988993167877, 0.317479133605957, 0.3184944093227386, 0.316694974899292, 0.3176258206367492, 0.3182629346847534, 0.3200214207172394, 0.3181648552417755, 0.320680022239685, 0.3178716897964477, 0.3182425796985626, 0.3182984292507171, 0.3158398568630218, 0.3152642548084259, 0.3132680356502533, 0.3178914785385132, 0.3156660795211792, 0.3161703050136566, 0.3176451921463012, 0.3173815906047821, 0.3194171786308288, 0.3193057179450989, 0.3172560334205627, 0.317656546831131, 0.3155770003795624, 0.3199106156826019, 0.3170182108879089, 0.3156754970550537, 0.3180731236934662, 0.3205638229846954, 0.3175432682037353, 0.3184471428394317, 0.3192788958549499, 0.3197042346000671, 0.3177168369293213], "label": "FineWeb: base filtering only"}}, "layout": {"title": {"text": "The different FineWeb processing steps"}}}
assets/data/plots/all_filtering_steps/openbookqa_acc_norm.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"data": {"big-run-fineweb-v1-all-dumps": {"x": [0.0, 2.0971520000000003, 4.194304000000001, 6.291456, 8.388608000000001, 10.48576, 12.582912, 14.680064000000002, 16.777216000000003, 18.874368, 20.97152, 23.068672000000003, 25.165824, 27.262976000000002, 29.360128000000003, 31.45728, 33.554432000000006, 35.651584, 37.748736, 39.845888, 41.94304, 44.040192000000005, 46.137344000000006, 48.234496, 50.331648, 52.4288, 54.525952000000004, 56.623104000000005, 58.720256000000006, 60.817408, 62.91456, 65.011712, 67.10886400000001, 69.206016, 71.303168, 73.40032000000001, 75.497472, 77.59462400000001, 79.691776, 81.788928, 83.88608, 85.983232, 88.08038400000001, 90.177536, 92.27468800000001, 94.37184, 96.468992, 98.56614400000001, 100.663296, 102.76044800000001, 104.8576, 106.95475200000001, 109.05190400000001, 111.149056, 113.24620800000001, 115.34336, 117.44051200000001, 119.537664, 121.634816, 123.73196800000001, 125.82912, 127.92627200000001, 130.023424, 132.120576, 134.21772800000002, 136.31488000000002, 138.412032, 140.509184, 142.606336, 144.70348800000002, 146.80064000000002, 148.897792, 150.994944, 153.092096, 155.18924800000002, 157.28640000000001, 159.383552, 161.480704, 163.577856, 165.67500800000002, 167.77216, 169.869312, 171.966464, 174.06361600000002, 176.16076800000002, 178.25792, 180.355072, 182.452224, 184.54937600000002, 186.64652800000002, 188.74368, 190.840832, 192.937984, 195.03513600000002, 197.13228800000002, 199.22944, 201.326592, 203.423744, 205.52089600000002, 207.61804800000002, 209.7152, 211.812352, 213.90950400000003, 216.00665600000002, 218.10380800000001, 220.20096, 222.298112, 224.39526400000003, 226.49241600000002, 228.589568, 230.68672, 232.783872, 234.88102400000002, 236.97817600000002, 239.075328, 241.17248, 243.269632, 245.36678400000002, 247.46393600000002, 249.561088, 251.65824, 253.75539200000003, 255.85254400000002, 257.949696, 260.046848, 262.144, 264.241152, 266.338304, 268.43545600000004, 270.53260800000004, 272.62976000000003, 274.726912, 276.824064, 278.921216, 281.018368, 283.11552, 285.212672, 287.309824, 289.40697600000004, 291.50412800000004, 293.60128000000003, 295.698432, 297.795584, 299.892736, 301.989888, 304.08704, 306.184192, 308.28134400000005, 310.37849600000004, 312.47564800000004, 314.57280000000003, 316.669952, 318.767104, 320.864256, 322.961408, 325.05856, 327.155712, 329.25286400000005, 331.35001600000004, 333.44716800000003, 335.54432, 337.641472, 339.738624, 341.835776, 343.932928, 346.03008, 348.12723200000005, 350.22438400000004], "y": [0.2860000133514404, 0.2440000027418136, 0.270000010728836, 0.2720000147819519, 0.3000000119209289, 0.2919999957084656, 0.3160000145435333, 0.3160000145435333, 0.3179999887943268, 0.3199999928474426, 0.3440000116825104, 0.3179999887943268, 0.3240000009536743, 0.3300000131130218, 0.3240000009536743, 0.3199999928474426, 0.335999995470047, 0.3339999914169311, 0.3440000116825104, 0.3459999859333038, 0.3400000035762787, 0.3440000116825104, 0.335999995470047, 0.3379999995231628, 0.3519999980926513, 0.3379999995231628, 0.3420000076293945, 0.3319999873638153, 0.3479999899864197, 0.3459999859333038, 0.3339999914169311, 0.3440000116825104, 0.3420000076293945, 0.3219999969005584, 0.3319999873638153, 0.3479999899864197, 0.3379999995231628, 0.335999995470047, 0.3499999940395355, 0.3420000076293945, 0.3319999873638153, 0.3400000035762787, 0.3400000035762787, 0.3519999980926513, 0.3479999899864197, 0.3379999995231628, 0.335999995470047, 0.3400000035762787, 0.3319999873638153, 0.3580000102519989, 0.3499999940395355, 0.3700000047683716, 0.3680000007152557, 0.335999995470047, 0.3600000143051147, 0.3499999940395355, 0.356000006198883, 0.3499999940395355, 0.356000006198883, 0.3619999885559082, 0.363999992609024, 0.3519999980926513, 0.3540000021457672, 0.3600000143051147, 0.3600000143051147, 0.3540000021457672, 0.356000006198883, 0.363999992609024, 0.363999992609024, 0.3499999940395355, 0.3659999966621399, 0.356000006198883, 0.363999992609024, 0.3540000021457672, 0.3540000021457672, 0.3619999885559082, 0.3740000128746032, 0.3519999980926513, 0.3659999966621399, 0.3680000007152557, 0.3700000047683716, 0.3580000102519989, 0.3499999940395355, 0.3740000128746032, 0.3659999966621399, 0.3659999966621399, 0.3580000102519989, 0.3479999899864197, 0.363999992609024, 0.3519999980926513, 0.3580000102519989, 0.356000006198883, 0.3740000128746032, 0.363999992609024, 0.3700000047683716, 0.363999992609024, 0.3700000047683716, 0.363999992609024, 0.3799999952316284, 0.3860000073909759, 0.3680000007152557, 0.3779999911785126, 0.3740000128746032, 0.3600000143051147, 0.3659999966621399, 0.3680000007152557, 0.3619999885559082, 0.3700000047683716, 0.3759999871253967, 0.363999992609024, 0.3740000128746032, 0.3799999952316284, 0.3779999911785126, 0.3659999966621399, 0.3600000143051147, 0.3740000128746032, 0.3600000143051147, 0.363999992609024, 0.363999992609024, 0.363999992609024, 0.3779999911785126, 0.3700000047683716, 0.3799999952316284, 0.3720000088214874, 0.3819999992847442, 0.3759999871253967, 0.3799999952316284, 0.3740000128746032, 0.3860000073909759, 0.3779999911785126, 0.3959999978542328, 0.3880000114440918, 0.3799999952316284, 0.3860000073909759, 0.3759999871253967, 0.3939999938011169, 0.3779999911785126, 0.3959999978542328, 0.3779999911785126, 0.3899999856948852, 0.3860000073909759, 0.3959999978542328, 0.3759999871253967, 0.3720000088214874, 0.3799999952316284, 0.3740000128746032, 0.3759999871253967, 0.3799999952316284, 0.3819999992847442, 0.3840000033378601, 0.3720000088214874, 0.363999992609024, 0.3840000033378601, 0.3919999897480011, 0.3819999992847442, 0.3819999992847442, 0.3779999911785126, 0.3799999952316284, 0.3840000033378601, 0.3819999992847442, 0.3899999856948852, 0.3860000073909759, 0.3819999992847442, 0.3840000033378601, 0.3720000088214874, 0.3799999952316284, 0.3819999992847442, 0.3959999978542328], "label": "FineWeb: id mh + C4 + custom filters"}, "big-run-sampled-fineweb-c4-filters": {"x": [0.0, 2.0971520000000003, 4.194304000000001, 6.291456, 8.388608000000001, 10.48576, 12.582912, 14.680064000000002, 16.777216000000003, 18.874368, 20.97152, 23.068672000000003, 25.165824, 27.262976000000002, 29.360128000000003, 31.45728, 33.554432000000006, 35.651584, 37.748736, 39.845888, 41.94304, 44.040192000000005, 46.137344000000006, 48.234496, 50.331648, 52.4288, 54.525952000000004, 56.623104000000005, 58.720256000000006, 60.817408, 62.91456, 65.011712, 67.10886400000001, 69.206016, 71.303168, 73.40032000000001, 75.497472, 77.59462400000001, 79.691776, 81.788928, 83.88608, 85.983232, 88.08038400000001, 90.177536, 92.27468800000001, 94.37184, 96.468992, 98.56614400000001, 100.663296, 102.76044800000001, 104.8576, 106.95475200000001, 109.05190400000001, 111.149056, 113.24620800000001, 115.34336, 117.44051200000001, 119.537664, 121.634816, 123.73196800000001, 125.82912, 127.92627200000001, 130.023424, 132.120576, 134.21772800000002, 136.31488000000002, 138.412032, 140.509184, 142.606336, 144.70348800000002, 146.80064000000002, 148.897792, 150.994944, 153.092096, 155.18924800000002, 157.28640000000001, 159.383552, 161.480704, 163.577856, 165.67500800000002, 167.77216, 169.869312, 171.966464, 174.06361600000002, 176.16076800000002, 178.25792, 180.355072, 182.452224, 184.54937600000002, 186.64652800000002, 188.74368, 190.840832, 192.937984, 195.03513600000002, 197.13228800000002, 199.22944, 201.326592, 203.423744, 205.52089600000002, 207.61804800000002, 209.7152, 211.812352, 213.90950400000003, 216.00665600000002, 218.10380800000001, 220.20096, 222.298112, 224.39526400000003, 226.49241600000002, 228.589568, 230.68672, 232.783872, 234.88102400000002, 236.97817600000002, 239.075328, 241.17248, 243.269632, 245.36678400000002, 247.46393600000002, 249.561088, 251.65824, 253.75539200000003, 255.85254400000002, 257.949696, 260.046848, 262.144, 264.241152, 266.338304, 268.43545600000004, 270.53260800000004, 272.62976000000003, 274.726912, 276.824064, 278.921216, 281.018368, 283.11552, 285.212672, 287.309824, 289.40697600000004, 291.50412800000004, 293.60128000000003, 295.698432, 297.795584, 299.892736, 301.989888, 304.08704, 306.184192, 308.28134400000005, 310.37849600000004, 312.47564800000004, 314.57280000000003, 316.669952, 318.767104, 320.864256, 322.961408, 325.05856, 327.155712, 329.25286400000005, 331.35001600000004, 333.44716800000003, 335.54432, 337.641472, 339.738624, 341.835776, 343.932928, 346.03008, 348.12723200000005, 350.22438400000004], "y": [0.2860000133514404, 0.2759999930858612, 0.2739999890327453, 0.2800000011920929, 0.2879999876022339, 0.3179999887943268, 0.3019999861717224, 0.3000000119209289, 0.3240000009536743, 0.3100000023841858, 0.3140000104904175, 0.3260000050067901, 0.3260000050067901, 0.3339999914169311, 0.328000009059906, 0.335999995470047, 0.3319999873638153, 0.3379999995231628, 0.3519999980926513, 0.3420000076293945, 0.3440000116825104, 0.3379999995231628, 0.3420000076293945, 0.3499999940395355, 0.3420000076293945, 0.3420000076293945, 0.3499999940395355, 0.3300000131130218, 0.3459999859333038, 0.3379999995231628, 0.3400000035762787, 0.3440000116825104, 0.3319999873638153, 0.3339999914169311, 0.3459999859333038, 0.3459999859333038, 0.335999995470047, 0.3379999995231628, 0.3479999899864197, 0.3540000021457672, 0.3479999899864197, 0.3420000076293945, 0.3600000143051147, 0.3499999940395355, 0.3459999859333038, 0.3600000143051147, 0.335999995470047, 0.3400000035762787, 0.3619999885559082, 0.3619999885559082, 0.3580000102519989, 0.3459999859333038, 0.363999992609024, 0.3479999899864197, 0.356000006198883, 0.3420000076293945, 0.3619999885559082, 0.3479999899864197, 0.356000006198883, 0.363999992609024, 0.356000006198883, 0.3700000047683716, 0.356000006198883, 0.356000006198883, 0.3600000143051147, 0.3680000007152557, 0.3519999980926513, 0.363999992609024, 0.3540000021457672, 0.3600000143051147, 0.3600000143051147, 0.3580000102519989, 0.3680000007152557, 0.3459999859333038, 0.356000006198883, 0.3740000128746032, 0.356000006198883, 0.3580000102519989, 0.3479999899864197, 0.3540000021457672, 0.3499999940395355, 0.3580000102519989, 0.3540000021457672, 0.3659999966621399, 0.3680000007152557, 0.356000006198883, 0.3600000143051147, 0.3519999980926513, 0.363999992609024, 0.3440000116825104, 0.3659999966621399, 0.3580000102519989, 0.363999992609024, 0.3600000143051147, 0.3759999871253967, 0.363999992609024, 0.3700000047683716, 0.3619999885559082, 0.3600000143051147, 0.3720000088214874, 0.3779999911785126, 0.3680000007152557, 0.3759999871253967, 0.3819999992847442, 0.363999992609024, 0.3740000128746032, 0.363999992609024, 0.3619999885559082, 0.3700000047683716, 0.3680000007152557, 0.3659999966621399, 0.3700000047683716, 0.3759999871253967, 0.363999992609024, 0.3720000088214874, 0.3700000047683716, 0.3619999885559082, 0.3680000007152557, 0.3799999952316284, 0.3659999966621399, 0.3740000128746032, 0.3740000128746032, 0.3740000128746032, 0.3659999966621399, 0.3700000047683716, 0.3779999911785126, 0.3720000088214874, 0.3700000047683716, 0.3860000073909759, 0.3759999871253967, 0.3659999966621399, 0.3680000007152557, 0.3680000007152557, 0.3700000047683716, 0.3700000047683716, 0.3799999952316284, 0.3860000073909759, 0.3840000033378601, 0.3899999856948852, 0.3720000088214874, 0.3939999938011169, 0.3700000047683716, 0.3779999911785126, 0.3860000073909759, 0.3720000088214874, 0.3700000047683716, 0.3759999871253967, 0.3799999952316284, 0.3840000033378601, 0.3759999871253967, 0.3720000088214874, 0.3759999871253967, 0.3779999911785126, 0.3880000114440918, 0.3799999952316284, 0.3759999871253967, 0.3840000033378601, 0.3759999871253967, 0.3720000088214874, 0.3779999911785126, 0.3700000047683716, 0.3799999952316284, 0.3799999952316284, 0.3860000073909759, 0.3799999952316284, 0.3779999911785126, 0.3740000128746032, 0.3779999911785126], "label": "FineWeb: id mh + C4 filters"}, "big-run-sampled_full_ind_minhash": {"x": [0.0, 2.0971520000000003, 4.194304000000001, 6.291456, 8.388608000000001, 10.48576, 12.582912, 14.680064000000002, 16.777216000000003, 18.874368, 20.97152, 23.068672000000003, 25.165824, 27.262976000000002, 29.360128000000003, 31.45728, 33.554432000000006, 35.651584, 37.748736, 39.845888, 41.94304, 44.040192000000005, 46.137344000000006, 48.234496, 50.331648, 52.4288, 54.525952000000004, 56.623104000000005, 58.720256000000006, 60.817408, 62.91456, 65.011712, 67.10886400000001, 69.206016, 71.303168, 73.40032000000001, 75.497472, 77.59462400000001, 79.691776, 81.788928, 83.88608, 85.983232, 88.08038400000001, 90.177536, 92.27468800000001, 94.37184, 96.468992, 98.56614400000001, 100.663296, 102.76044800000001, 104.8576, 106.95475200000001, 109.05190400000001, 111.149056, 113.24620800000001, 115.34336, 117.44051200000001, 119.537664, 121.634816, 123.73196800000001, 125.82912, 127.92627200000001, 130.023424, 132.120576, 134.21772800000002, 136.31488000000002, 138.412032, 140.509184, 142.606336, 144.70348800000002, 146.80064000000002, 148.897792, 150.994944, 153.092096, 155.18924800000002, 157.28640000000001, 159.383552, 161.480704, 163.577856, 165.67500800000002, 167.77216, 169.869312, 171.966464, 174.06361600000002, 176.16076800000002, 178.25792, 180.355072, 182.452224, 184.54937600000002, 186.64652800000002, 188.74368, 190.840832, 192.937984, 195.03513600000002, 197.13228800000002, 199.22944, 201.326592, 203.423744, 205.52089600000002, 207.61804800000002, 209.7152, 211.812352, 213.90950400000003, 216.00665600000002, 218.10380800000001, 220.20096, 222.298112, 224.39526400000003, 226.49241600000002, 228.589568, 230.68672, 232.783872, 234.88102400000002, 236.97817600000002, 239.075328, 241.17248, 243.269632, 245.36678400000002, 247.46393600000002, 249.561088, 251.65824, 253.75539200000003, 255.85254400000002, 257.949696, 260.046848, 262.144, 264.241152, 266.338304, 268.43545600000004, 270.53260800000004, 272.62976000000003, 274.726912, 276.824064, 278.921216, 281.018368, 283.11552, 285.212672, 287.309824, 289.40697600000004, 291.50412800000004, 293.60128000000003, 295.698432, 297.795584, 299.892736, 301.989888, 304.08704, 306.184192, 308.28134400000005, 310.37849600000004, 312.47564800000004, 314.57280000000003, 316.669952, 318.767104, 320.864256, 322.961408, 325.05856, 327.155712, 329.25286400000005, 331.35001600000004, 333.44716800000003, 335.54432, 337.641472, 339.738624, 341.835776, 343.932928, 346.03008, 348.12723200000005, 350.22438400000004], "y": [0.2860000133514404, 0.2660000026226043, 0.277999997138977, 0.2820000052452087, 0.3079999983310699, 0.3140000104904175, 0.3260000050067901, 0.3039999902248382, 0.3319999873638153, 0.3240000009536743, 0.3199999928474426, 0.3379999995231628, 0.3339999914169311, 0.3319999873638153, 0.3319999873638153, 0.3219999969005584, 0.3319999873638153, 0.3379999995231628, 0.3199999928474426, 0.3179999887943268, 0.3400000035762787, 0.3219999969005584, 0.335999995470047, 0.3339999914169311, 0.3420000076293945, 0.3240000009536743, 0.3440000116825104, 0.3420000076293945, 0.3379999995231628, 0.3459999859333038, 0.328000009059906, 0.3420000076293945, 0.3459999859333038, 0.3479999899864197, 0.3379999995231628, 0.356000006198883, 0.3379999995231628, 0.3440000116825104, 0.3400000035762787, 0.3379999995231628, 0.3499999940395355, 0.3540000021457672, 0.3479999899864197, 0.3479999899864197, 0.3440000116825104, 0.3459999859333038, 0.3440000116825104, 0.3519999980926513, 0.356000006198883, 0.3600000143051147, 0.3379999995231628, 0.356000006198883, 0.3400000035762787, 0.3519999980926513, 0.3479999899864197, 0.3479999899864197, 0.3400000035762787, 0.3459999859333038, 0.3519999980926513, 0.3440000116825104, 0.3400000035762787, 0.356000006198883, 0.3420000076293945, 0.356000006198883, 0.3540000021457672, 0.3600000143051147, 0.3339999914169311, 0.3499999940395355, 0.3580000102519989, 0.3440000116825104, 0.3479999899864197, 0.3580000102519989, 0.3519999980926513, 0.3339999914169311, 0.3540000021457672, 0.3459999859333038, 0.3459999859333038, 0.3400000035762787, 0.356000006198883, 0.356000006198883, 0.3420000076293945, 0.3420000076293945, 0.3400000035762787, 0.3479999899864197, 0.3519999980926513, 0.3319999873638153, 0.3580000102519989, 0.356000006198883, 0.356000006198883, 0.3499999940395355, 0.3479999899864197, 0.3400000035762787, 0.3440000116825104, 0.3339999914169311, 0.3379999995231628, 0.3479999899864197, 0.3680000007152557, 0.3619999885559082, 0.3440000116825104, 0.3619999885559082, 0.3580000102519989, 0.356000006198883, 0.3600000143051147, 0.3519999980926513, 0.3519999980926513, 0.3459999859333038, 0.3540000021457672, 0.3600000143051147, 0.356000006198883, 0.3540000021457672, 0.3519999980926513, 0.356000006198883, 0.3600000143051147, 0.3540000021457672, 0.3540000021457672, 0.363999992609024, 0.3580000102519989, 0.3680000007152557, 0.3580000102519989, 0.356000006198883, 0.3519999980926513, 0.3519999980926513, 0.3519999980926513, 0.3459999859333038, 0.3499999940395355, 0.356000006198883, 0.3540000021457672, 0.3540000021457672, 0.3659999966621399, 0.3619999885559082, 0.3420000076293945, 0.363999992609024, 0.3580000102519989, 0.3619999885559082, 0.3759999871253967, 0.3740000128746032, 0.363999992609024, 0.3580000102519989, 0.3700000047683716, 0.3700000047683716, 0.363999992609024, 0.3440000116825104, 0.3580000102519989, 0.3680000007152557, 0.3700000047683716, 0.3740000128746032, 0.3619999885559082, 0.3619999885559082, 0.3700000047683716, 0.363999992609024, 0.363999992609024, 0.363999992609024, 0.3700000047683716, 0.3600000143051147, 0.3680000007152557, 0.363999992609024, 0.3659999966621399, 0.363999992609024, 0.3680000007152557, 0.3580000102519989, 0.363999992609024, 0.3659999966621399, 0.363999992609024, 0.3580000102519989, 0.3600000143051147, 0.3600000143051147, 0.3580000102519989, 0.3600000143051147], "label": "FineWeb: independent MinHash (id mh)"}, "big-run-sampled_full_filtered_no_dedup": {"x": [0.0, 2.0971520000000003, 4.194304000000001, 6.291456, 8.388608000000001, 10.48576, 12.582912, 14.680064000000002, 16.777216000000003, 18.874368, 20.97152, 23.068672000000003, 25.165824, 27.262976000000002, 29.360128000000003, 31.45728, 33.554432000000006, 35.651584, 37.748736, 39.845888, 41.94304, 44.040192000000005, 46.137344000000006, 48.234496, 50.331648, 52.4288, 54.525952000000004, 56.623104000000005, 58.720256000000006, 60.817408, 62.91456, 65.011712, 67.10886400000001, 69.206016, 71.303168, 73.40032000000001, 75.497472, 77.59462400000001, 79.691776, 81.788928, 83.88608, 85.983232, 88.08038400000001, 90.177536, 92.27468800000001, 94.37184, 96.468992, 98.56614400000001, 100.663296, 102.76044800000001, 104.8576, 106.95475200000001, 109.05190400000001, 111.149056, 113.24620800000001, 115.34336, 117.44051200000001, 119.537664, 121.634816, 123.73196800000001, 125.82912, 127.92627200000001, 130.023424, 132.120576, 134.21772800000002, 136.31488000000002, 138.412032, 140.509184, 142.606336, 144.70348800000002, 146.80064000000002, 148.897792, 150.994944, 153.092096, 155.18924800000002, 157.28640000000001, 159.383552, 161.480704, 163.577856, 165.67500800000002, 167.77216, 169.869312, 171.966464, 174.06361600000002, 176.16076800000002, 178.25792, 180.355072, 182.452224, 184.54937600000002, 186.64652800000002, 188.74368, 190.840832, 192.937984, 195.03513600000002, 197.13228800000002, 199.22944, 201.326592, 203.423744, 205.52089600000002, 207.61804800000002, 209.7152, 211.812352, 213.90950400000003, 216.00665600000002, 218.10380800000001, 220.20096, 222.298112, 224.39526400000003, 226.49241600000002, 228.589568, 230.68672, 232.783872, 234.88102400000002, 236.97817600000002, 239.075328, 241.17248, 243.269632, 245.36678400000002, 247.46393600000002, 249.561088, 251.65824, 253.75539200000003, 255.85254400000002, 257.949696, 260.046848, 262.144, 264.241152, 266.338304, 268.43545600000004, 270.53260800000004, 272.62976000000003, 274.726912, 276.824064, 278.921216, 281.018368, 283.11552, 285.212672, 287.309824, 289.40697600000004, 291.50412800000004, 293.60128000000003, 295.698432, 297.795584, 299.892736, 301.989888, 304.08704, 306.184192, 308.28134400000005, 310.37849600000004, 312.47564800000004, 314.57280000000003, 316.669952, 318.767104, 320.864256, 322.961408, 325.05856, 327.155712, 329.25286400000005, 331.35001600000004, 333.44716800000003, 335.54432, 337.641472, 339.738624, 341.835776, 343.932928, 346.03008, 348.12723200000005, 350.22438400000004], "y": [0.2860000133514404, 0.2560000121593475, 0.2720000147819519, 0.2980000078678131, 0.2840000092983246, 0.2879999876022339, 0.3039999902248382, 0.2860000133514404, 0.2899999916553497, 0.3019999861717224, 0.2960000038146972, 0.3039999902248382, 0.3100000023841858, 0.3160000145435333, 0.3160000145435333, 0.3260000050067901, 0.3179999887943268, 0.3420000076293945, 0.3219999969005584, 0.328000009059906, 0.3240000009536743, 0.3300000131130218, 0.328000009059906, 0.3199999928474426, 0.3379999995231628, 0.3400000035762787, 0.3240000009536743, 0.3120000064373016, 0.3319999873638153, 0.3260000050067901, 0.3120000064373016, 0.3160000145435333, 0.3140000104904175, 0.3179999887943268, 0.3160000145435333, 0.3199999928474426, 0.3240000009536743, 0.3260000050067901, 0.3179999887943268, 0.3300000131130218, 0.3179999887943268, 0.328000009059906, 0.3240000009536743, 0.328000009059906, 0.3260000050067901, 0.3199999928474426, 0.3400000035762787, 0.3339999914169311, 0.328000009059906, 0.328000009059906, 0.3339999914169311, 0.328000009059906, 0.328000009059906, 0.335999995470047, 0.3580000102519989, 0.3499999940395355, 0.3260000050067901, 0.3499999940395355, 0.3420000076293945, 0.3160000145435333, 0.3339999914169311, 0.335999995470047, 0.3400000035762787, 0.3240000009536743, 0.3319999873638153, 0.3379999995231628, 0.3400000035762787, 0.3379999995231628, 0.3319999873638153, 0.3319999873638153, 0.3440000116825104, 0.3300000131130218, 0.3219999969005584, 0.3260000050067901, 0.3219999969005584, 0.3339999914169311, 0.328000009059906, 0.3300000131130218, 0.3219999969005584, 0.3379999995231628, 0.3400000035762787, 0.3319999873638153, 0.328000009059906, 0.3440000116825104, 0.3339999914169311, 0.328000009059906, 0.3379999995231628, 0.3499999940395355, 0.3339999914169311, 0.3300000131130218, 0.328000009059906, 0.335999995470047, 0.3240000009536743, 0.335999995470047, 0.3240000009536743, 0.3400000035762787, 0.3400000035762787, 0.3420000076293945, 0.3319999873638153, 0.3339999914169311, 0.3300000131130218, 0.3400000035762787, 0.3459999859333038, 0.3400000035762787, 0.3379999995231628, 0.3459999859333038, 0.3379999995231628, 0.3300000131130218, 0.3519999980926513, 0.3379999995231628, 0.356000006198883, 0.335999995470047, 0.3420000076293945, 0.3400000035762787, 0.328000009059906, 0.3540000021457672, 0.3499999940395355, 0.3479999899864197, 0.3440000116825104, 0.3519999980926513, 0.356000006198883, 0.3540000021457672, 0.3440000116825104, 0.3499999940395355, 0.356000006198883, 0.356000006198883, 0.356000006198883, 0.363999992609024, 0.3600000143051147, 0.356000006198883, 0.3479999899864197, 0.356000006198883, 0.3459999859333038, 0.3479999899864197, 0.3619999885559082, 0.363999992609024, 0.3499999940395355, 0.3379999995231628, 0.3479999899864197, 0.3499999940395355, 0.356000006198883, 0.3519999980926513, 0.3540000021457672, 0.3619999885559082, 0.3580000102519989, 0.3540000021457672, 0.356000006198883, 0.3479999899864197, 0.3519999980926513, 0.356000006198883, 0.3499999940395355, 0.3379999995231628, 0.3479999899864197, 0.3499999940395355, 0.3440000116825104, 0.3580000102519989, 0.356000006198883, 0.3499999940395355, 0.3479999899864197, 0.3580000102519989, 0.3519999980926513, 0.3540000021457672, 0.3519999980926513, 0.3540000021457672, 0.356000006198883, 0.363999992609024, 0.356000006198883, 0.356000006198883], "label": "FineWeb: base filtering only"}}, "layout": {"title": {"text": "The different FineWeb processing steps"}}}
assets/data/plots/all_filtering_steps/piqa_acc_norm.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"data": {"big-run-sampled-fineweb-c4-filters": {"x": [0.0, 2.0971520000000003, 4.194304000000001, 6.291456, 8.388608000000001, 10.48576, 12.582912, 14.680064000000002, 16.777216000000003, 18.874368, 20.97152, 23.068672000000003, 25.165824, 27.262976000000002, 29.360128000000003, 31.45728, 33.554432000000006, 35.651584, 37.748736, 39.845888, 41.94304, 44.040192000000005, 46.137344000000006, 48.234496, 50.331648, 52.4288, 54.525952000000004, 56.623104000000005, 58.720256000000006, 60.817408, 62.91456, 65.011712, 67.10886400000001, 69.206016, 71.303168, 73.40032000000001, 75.497472, 77.59462400000001, 79.691776, 81.788928, 83.88608, 85.983232, 88.08038400000001, 90.177536, 92.27468800000001, 94.37184, 96.468992, 98.56614400000001, 100.663296, 102.76044800000001, 104.8576, 106.95475200000001, 109.05190400000001, 111.149056, 113.24620800000001, 115.34336, 117.44051200000001, 119.537664, 121.634816, 123.73196800000001, 125.82912, 127.92627200000001, 130.023424, 132.120576, 134.21772800000002, 136.31488000000002, 138.412032, 140.509184, 142.606336, 144.70348800000002, 146.80064000000002, 148.897792, 150.994944, 153.092096, 155.18924800000002, 157.28640000000001, 159.383552, 161.480704, 163.577856, 165.67500800000002, 167.77216, 169.869312, 171.966464, 174.06361600000002, 176.16076800000002, 178.25792, 180.355072, 182.452224, 184.54937600000002, 186.64652800000002, 188.74368, 190.840832, 192.937984, 195.03513600000002, 197.13228800000002, 199.22944, 201.326592, 203.423744, 205.52089600000002, 207.61804800000002, 209.7152, 211.812352, 213.90950400000003, 216.00665600000002, 218.10380800000001, 220.20096, 222.298112, 224.39526400000003, 226.49241600000002, 228.589568, 230.68672, 232.783872, 234.88102400000002, 236.97817600000002, 239.075328, 241.17248, 243.269632, 245.36678400000002, 247.46393600000002, 249.561088, 251.65824, 253.75539200000003, 255.85254400000002, 257.949696, 260.046848, 262.144, 264.241152, 266.338304, 268.43545600000004, 270.53260800000004, 272.62976000000003, 274.726912, 276.824064, 278.921216, 281.018368, 283.11552, 285.212672, 287.309824, 289.40697600000004, 291.50412800000004, 293.60128000000003, 295.698432, 297.795584, 299.892736, 301.989888, 304.08704, 306.184192, 308.28134400000005, 310.37849600000004, 312.47564800000004, 314.57280000000003, 316.669952, 318.767104, 320.864256, 322.961408, 325.05856, 327.155712, 329.25286400000005, 331.35001600000004, 333.44716800000003, 335.54432, 337.641472, 339.738624, 341.835776, 343.932928, 346.03008, 348.12723200000005, 350.22438400000004], "y": [0.5099999904632568, 0.6200000047683716, 0.6470000147819519, 0.6700000166893005, 0.6869999766349792, 0.6990000009536743, 0.7059999704360962, 0.7120000123977661, 0.7139999866485596, 0.7129999995231628, 0.7289999723434448, 0.7200000286102295, 0.7139999866485596, 0.7260000109672546, 0.7329999804496765, 0.7289999723434448, 0.7369999885559082, 0.7319999933242798, 0.7260000109672546, 0.7360000014305115, 0.7369999885559082, 0.7369999885559082, 0.7300000190734863, 0.7300000190734863, 0.7350000143051147, 0.734000027179718, 0.7409999966621399, 0.7429999709129333, 0.7400000095367432, 0.7440000176429749, 0.7440000176429749, 0.7400000095367432, 0.7400000095367432, 0.7480000257492065, 0.7440000176429749, 0.7570000290870667, 0.7360000014305115, 0.734000027179718, 0.7419999837875366, 0.7429999709129333, 0.7519999742507935, 0.746999979019165, 0.7459999918937683, 0.75, 0.75, 0.7400000095367432, 0.7440000176429749, 0.7450000047683716, 0.7549999952316284, 0.7580000162124634, 0.7540000081062317, 0.7419999837875366, 0.7580000162124634, 0.746999979019165, 0.7540000081062317, 0.765999972820282, 0.7549999952316284, 0.7580000162124634, 0.753000020980835, 0.7549999952316284, 0.753000020980835, 0.7490000128746033, 0.7519999742507935, 0.7630000114440918, 0.7509999871253967, 0.7570000290870667, 0.7609999775886536, 0.7609999775886536, 0.7559999823570251, 0.75, 0.7540000081062317, 0.7480000257492065, 0.7590000033378601, 0.7509999871253967, 0.75, 0.7559999823570251, 0.7509999871253967, 0.7480000257492065, 0.7519999742507935, 0.765999972820282, 0.7590000033378601, 0.7549999952316284, 0.7609999775886536, 0.7559999823570251, 0.7599999904632568, 0.765999972820282, 0.7549999952316284, 0.7549999952316284, 0.7580000162124634, 0.7699999809265137, 0.7590000033378601, 0.7699999809265137, 0.7609999775886536, 0.7590000033378601, 0.765999972820282, 0.765999972820282, 0.7639999985694885, 0.7710000276565552, 0.7649999856948853, 0.7519999742507935, 0.7609999775886536, 0.7549999952316284, 0.7580000162124634, 0.7770000100135803, 0.7699999809265137, 0.7749999761581421, 0.777999997138977, 0.7710000276565552, 0.7680000066757202, 0.7749999761581421, 0.7730000019073486, 0.7699999809265137, 0.7799999713897705, 0.7649999856948853, 0.7689999938011169, 0.7739999890327454, 0.7710000276565552, 0.7760000228881836, 0.7739999890327454, 0.7699999809265137, 0.7749999761581421, 0.7730000019073486, 0.7770000100135803, 0.7720000147819519, 0.7699999809265137, 0.7739999890327454, 0.7710000276565552, 0.7710000276565552, 0.777999997138977, 0.7789999842643738, 0.7770000100135803, 0.7720000147819519, 0.7730000019073486, 0.7730000019073486, 0.7760000228881836, 0.7710000276565552, 0.7789999842643738, 0.781000018119812, 0.7749999761581421, 0.777999997138977, 0.7789999842643738, 0.7770000100135803, 0.7760000228881836, 0.7730000019073486, 0.7789999842643738, 0.7749999761581421, 0.7770000100135803, 0.7749999761581421, 0.7689999938011169, 0.7749999761581421, 0.777999997138977, 0.7760000228881836, 0.7749999761581421, 0.7789999842643738, 0.7820000052452087, 0.7739999890327454, 0.7799999713897705, 0.781000018119812, 0.7870000004768372, 0.781000018119812, 0.7789999842643738, 0.781000018119812, 0.777999997138977, 0.7760000228881836, 0.777999997138977, 0.7720000147819519, 0.7749999761581421, 0.7739999890327454], "label": "FineWeb: id mh + C4 filters"}, "big-run-sampled_full_ind_minhash": {"x": [0.0, 2.0971520000000003, 4.194304000000001, 6.291456, 8.388608000000001, 10.48576, 12.582912, 14.680064000000002, 16.777216000000003, 18.874368, 20.97152, 23.068672000000003, 25.165824, 27.262976000000002, 29.360128000000003, 31.45728, 33.554432000000006, 35.651584, 37.748736, 39.845888, 41.94304, 44.040192000000005, 46.137344000000006, 48.234496, 50.331648, 52.4288, 54.525952000000004, 56.623104000000005, 58.720256000000006, 60.817408, 62.91456, 65.011712, 67.10886400000001, 69.206016, 71.303168, 73.40032000000001, 75.497472, 77.59462400000001, 79.691776, 81.788928, 83.88608, 85.983232, 88.08038400000001, 90.177536, 92.27468800000001, 94.37184, 96.468992, 98.56614400000001, 100.663296, 102.76044800000001, 104.8576, 106.95475200000001, 109.05190400000001, 111.149056, 113.24620800000001, 115.34336, 117.44051200000001, 119.537664, 121.634816, 123.73196800000001, 125.82912, 127.92627200000001, 130.023424, 132.120576, 134.21772800000002, 136.31488000000002, 138.412032, 140.509184, 142.606336, 144.70348800000002, 146.80064000000002, 148.897792, 150.994944, 153.092096, 155.18924800000002, 157.28640000000001, 159.383552, 161.480704, 163.577856, 165.67500800000002, 167.77216, 169.869312, 171.966464, 174.06361600000002, 176.16076800000002, 178.25792, 180.355072, 182.452224, 184.54937600000002, 186.64652800000002, 188.74368, 190.840832, 192.937984, 195.03513600000002, 197.13228800000002, 199.22944, 201.326592, 203.423744, 205.52089600000002, 207.61804800000002, 209.7152, 211.812352, 213.90950400000003, 216.00665600000002, 218.10380800000001, 220.20096, 222.298112, 224.39526400000003, 226.49241600000002, 228.589568, 230.68672, 232.783872, 234.88102400000002, 236.97817600000002, 239.075328, 241.17248, 243.269632, 245.36678400000002, 247.46393600000002, 249.561088, 251.65824, 253.75539200000003, 255.85254400000002, 257.949696, 260.046848, 262.144, 264.241152, 266.338304, 268.43545600000004, 270.53260800000004, 272.62976000000003, 274.726912, 276.824064, 278.921216, 281.018368, 283.11552, 285.212672, 287.309824, 289.40697600000004, 291.50412800000004, 293.60128000000003, 295.698432, 297.795584, 299.892736, 301.989888, 304.08704, 306.184192, 308.28134400000005, 310.37849600000004, 312.47564800000004, 314.57280000000003, 316.669952, 318.767104, 320.864256, 322.961408, 325.05856, 327.155712, 329.25286400000005, 331.35001600000004, 333.44716800000003, 335.54432, 337.641472, 339.738624, 341.835776, 343.932928, 346.03008, 348.12723200000005, 350.22438400000004], "y": [0.5099999904632568, 0.6209999918937683, 0.6549999713897705, 0.6800000071525574, 0.6830000281333923, 0.703000009059906, 0.7020000219345093, 0.7110000252723694, 0.7160000205039978, 0.7129999995231628, 0.7210000157356262, 0.7250000238418579, 0.7210000157356262, 0.7310000061988831, 0.7269999980926514, 0.7329999804496765, 0.7459999918937683, 0.734000027179718, 0.7409999966621399, 0.7390000224113464, 0.7350000143051147, 0.7509999871253967, 0.7440000176429749, 0.7379999756813049, 0.7599999904632568, 0.7400000095367432, 0.7409999966621399, 0.7590000033378601, 0.7409999966621399, 0.7440000176429749, 0.7400000095367432, 0.7450000047683716, 0.75, 0.7440000176429749, 0.7409999966621399, 0.7429999709129333, 0.7440000176429749, 0.7440000176429749, 0.7559999823570251, 0.7459999918937683, 0.7559999823570251, 0.7540000081062317, 0.7599999904632568, 0.7559999823570251, 0.7490000128746033, 0.7490000128746033, 0.7429999709129333, 0.7609999775886536, 0.7519999742507935, 0.7480000257492065, 0.7490000128746033, 0.7620000243186951, 0.7580000162124634, 0.7580000162124634, 0.7540000081062317, 0.7509999871253967, 0.7519999742507935, 0.7440000176429749, 0.7459999918937683, 0.7559999823570251, 0.7620000243186951, 0.746999979019165, 0.7570000290870667, 0.7620000243186951, 0.7570000290870667, 0.7540000081062317, 0.7540000081062317, 0.7570000290870667, 0.7590000033378601, 0.7519999742507935, 0.75, 0.7559999823570251, 0.7590000033378601, 0.7559999823570251, 0.7519999742507935, 0.7639999985694885, 0.7620000243186951, 0.7549999952316284, 0.7490000128746033, 0.7559999823570251, 0.7639999985694885, 0.7609999775886536, 0.7609999775886536, 0.7519999742507935, 0.7549999952316284, 0.7570000290870667, 0.7620000243186951, 0.7599999904632568, 0.7639999985694885, 0.7559999823570251, 0.753000020980835, 0.7649999856948853, 0.753000020980835, 0.7549999952316284, 0.7609999775886536, 0.7599999904632568, 0.7680000066757202, 0.7540000081062317, 0.7559999823570251, 0.7590000033378601, 0.7590000033378601, 0.7649999856948853, 0.7639999985694885, 0.7710000276565552, 0.7699999809265137, 0.7609999775886536, 0.765999972820282, 0.7670000195503235, 0.7720000147819519, 0.7639999985694885, 0.7609999775886536, 0.7549999952316284, 0.7630000114440918, 0.7670000195503235, 0.7599999904632568, 0.765999972820282, 0.7670000195503235, 0.7670000195503235, 0.7670000195503235, 0.7720000147819519, 0.7760000228881836, 0.7710000276565552, 0.7829999923706055, 0.7630000114440918, 0.7720000147819519, 0.7649999856948853, 0.7630000114440918, 0.7699999809265137, 0.7720000147819519, 0.7720000147819519, 0.7689999938011169, 0.777999997138977, 0.7689999938011169, 0.7760000228881836, 0.7730000019073486, 0.7799999713897705, 0.7720000147819519, 0.7760000228881836, 0.7710000276565552, 0.7770000100135803, 0.777999997138977, 0.7670000195503235, 0.7789999842643738, 0.7799999713897705, 0.7749999761581421, 0.7730000019073486, 0.777999997138977, 0.777999997138977, 0.7799999713897705, 0.7770000100135803, 0.7770000100135803, 0.7789999842643738, 0.7760000228881836, 0.7770000100135803, 0.7770000100135803, 0.7770000100135803, 0.7739999890327454, 0.7689999938011169, 0.7760000228881836, 0.777999997138977, 0.7699999809265137, 0.7739999890327454, 0.7670000195503235, 0.7699999809265137, 0.7710000276565552, 0.7730000019073486, 0.7739999890327454, 0.7680000066757202], "label": "FineWeb: independent MinHash (id mh)"}, "big-run-fineweb-v1-all-dumps": {"x": [0.0, 2.0971520000000003, 4.194304000000001, 6.291456, 8.388608000000001, 10.48576, 12.582912, 14.680064000000002, 16.777216000000003, 18.874368, 20.97152, 23.068672000000003, 25.165824, 27.262976000000002, 29.360128000000003, 31.45728, 33.554432000000006, 35.651584, 37.748736, 39.845888, 41.94304, 44.040192000000005, 46.137344000000006, 48.234496, 50.331648, 52.4288, 54.525952000000004, 56.623104000000005, 58.720256000000006, 60.817408, 62.91456, 65.011712, 67.10886400000001, 69.206016, 71.303168, 73.40032000000001, 75.497472, 77.59462400000001, 79.691776, 81.788928, 83.88608, 85.983232, 88.08038400000001, 90.177536, 92.27468800000001, 94.37184, 96.468992, 98.56614400000001, 100.663296, 102.76044800000001, 104.8576, 106.95475200000001, 109.05190400000001, 111.149056, 113.24620800000001, 115.34336, 117.44051200000001, 119.537664, 121.634816, 123.73196800000001, 125.82912, 127.92627200000001, 130.023424, 132.120576, 134.21772800000002, 136.31488000000002, 138.412032, 140.509184, 142.606336, 144.70348800000002, 146.80064000000002, 148.897792, 150.994944, 153.092096, 155.18924800000002, 157.28640000000001, 159.383552, 161.480704, 163.577856, 165.67500800000002, 167.77216, 169.869312, 171.966464, 174.06361600000002, 176.16076800000002, 178.25792, 180.355072, 182.452224, 184.54937600000002, 186.64652800000002, 188.74368, 190.840832, 192.937984, 195.03513600000002, 197.13228800000002, 199.22944, 201.326592, 203.423744, 205.52089600000002, 207.61804800000002, 209.7152, 211.812352, 213.90950400000003, 216.00665600000002, 218.10380800000001, 220.20096, 222.298112, 224.39526400000003, 226.49241600000002, 228.589568, 230.68672, 232.783872, 234.88102400000002, 236.97817600000002, 239.075328, 241.17248, 243.269632, 245.36678400000002, 247.46393600000002, 249.561088, 251.65824, 253.75539200000003, 255.85254400000002, 257.949696, 260.046848, 262.144, 264.241152, 266.338304, 268.43545600000004, 270.53260800000004, 272.62976000000003, 274.726912, 276.824064, 278.921216, 281.018368, 283.11552, 285.212672, 287.309824, 289.40697600000004, 291.50412800000004, 293.60128000000003, 295.698432, 297.795584, 299.892736, 301.989888, 304.08704, 306.184192, 308.28134400000005, 310.37849600000004, 312.47564800000004, 314.57280000000003, 316.669952, 318.767104, 320.864256, 322.961408, 325.05856, 327.155712, 329.25286400000005, 331.35001600000004, 333.44716800000003, 335.54432, 337.641472, 339.738624, 341.835776, 343.932928, 346.03008, 348.12723200000005, 350.22438400000004], "y": [0.5099999904632568, 0.6190000176429749, 0.6549999713897705, 0.6769999861717224, 0.6899999976158142, 0.6869999766349792, 0.7149999737739563, 0.7179999947547913, 0.7179999947547913, 0.7319999933242798, 0.7390000224113464, 0.7350000143051147, 0.7480000257492065, 0.7440000176429749, 0.7409999966621399, 0.7440000176429749, 0.7580000162124634, 0.7419999837875366, 0.7440000176429749, 0.75, 0.734000027179718, 0.746999979019165, 0.7459999918937683, 0.7390000224113464, 0.7490000128746033, 0.7379999756813049, 0.7429999709129333, 0.7390000224113464, 0.7360000014305115, 0.7419999837875366, 0.7480000257492065, 0.7480000257492065, 0.7490000128746033, 0.7440000176429749, 0.75, 0.7540000081062317, 0.7490000128746033, 0.7549999952316284, 0.7429999709129333, 0.7540000081062317, 0.753000020980835, 0.7540000081062317, 0.7440000176429749, 0.7570000290870667, 0.7400000095367432, 0.7490000128746033, 0.7549999952316284, 0.7559999823570251, 0.7580000162124634, 0.7609999775886536, 0.7480000257492065, 0.7490000128746033, 0.7599999904632568, 0.7609999775886536, 0.7540000081062317, 0.753000020980835, 0.7490000128746033, 0.7480000257492065, 0.7440000176429749, 0.7549999952316284, 0.7540000081062317, 0.7559999823570251, 0.7490000128746033, 0.7409999966621399, 0.7580000162124634, 0.75, 0.746999979019165, 0.7400000095367432, 0.7559999823570251, 0.7490000128746033, 0.7429999709129333, 0.7519999742507935, 0.7549999952316284, 0.7559999823570251, 0.753000020980835, 0.753000020980835, 0.746999979019165, 0.746999979019165, 0.7559999823570251, 0.7549999952316284, 0.7549999952316284, 0.7570000290870667, 0.7599999904632568, 0.7599999904632568, 0.7549999952316284, 0.765999972820282, 0.7649999856948853, 0.7630000114440918, 0.7580000162124634, 0.7599999904632568, 0.7559999823570251, 0.7490000128746033, 0.7620000243186951, 0.7519999742507935, 0.7580000162124634, 0.7559999823570251, 0.7580000162124634, 0.7670000195503235, 0.7599999904632568, 0.7559999823570251, 0.7580000162124634, 0.7570000290870667, 0.7649999856948853, 0.7590000033378601, 0.7649999856948853, 0.7649999856948853, 0.7609999775886536, 0.7519999742507935, 0.7639999985694885, 0.7699999809265137, 0.7689999938011169, 0.7609999775886536, 0.765999972820282, 0.7710000276565552, 0.7590000033378601, 0.7710000276565552, 0.7639999985694885, 0.7710000276565552, 0.7730000019073486, 0.7680000066757202, 0.7590000033378601, 0.7639999985694885, 0.7609999775886536, 0.7559999823570251, 0.7749999761581421, 0.7680000066757202, 0.7599999904632568, 0.7609999775886536, 0.7599999904632568, 0.7580000162124634, 0.7599999904632568, 0.7649999856948853, 0.765999972820282, 0.7580000162124634, 0.7739999890327454, 0.7739999890327454, 0.7739999890327454, 0.7620000243186951, 0.7749999761581421, 0.7699999809265137, 0.7670000195503235, 0.7720000147819519, 0.7739999890327454, 0.7739999890327454, 0.7649999856948853, 0.7710000276565552, 0.7649999856948853, 0.7699999809265137, 0.7760000228881836, 0.7730000019073486, 0.7699999809265137, 0.7739999890327454, 0.7720000147819519, 0.7670000195503235, 0.7720000147819519, 0.7749999761581421, 0.7699999809265137, 0.7689999938011169, 0.7639999985694885, 0.7760000228881836, 0.7670000195503235, 0.7670000195503235, 0.7689999938011169, 0.7760000228881836, 0.7670000195503235, 0.7649999856948853, 0.7720000147819519, 0.7609999775886536], "label": "FineWeb: id mh + C4 + custom filters"}, "big-run-sampled_full_filtered_no_dedup": {"x": [0.0, 2.0971520000000003, 4.194304000000001, 6.291456, 8.388608000000001, 10.48576, 12.582912, 14.680064000000002, 16.777216000000003, 18.874368, 20.97152, 23.068672000000003, 25.165824, 27.262976000000002, 29.360128000000003, 31.45728, 33.554432000000006, 35.651584, 37.748736, 39.845888, 41.94304, 44.040192000000005, 46.137344000000006, 48.234496, 50.331648, 52.4288, 54.525952000000004, 56.623104000000005, 58.720256000000006, 60.817408, 62.91456, 65.011712, 67.10886400000001, 69.206016, 71.303168, 73.40032000000001, 75.497472, 77.59462400000001, 79.691776, 81.788928, 83.88608, 85.983232, 88.08038400000001, 90.177536, 92.27468800000001, 94.37184, 96.468992, 98.56614400000001, 100.663296, 102.76044800000001, 104.8576, 106.95475200000001, 109.05190400000001, 111.149056, 113.24620800000001, 115.34336, 117.44051200000001, 119.537664, 121.634816, 123.73196800000001, 125.82912, 127.92627200000001, 130.023424, 132.120576, 134.21772800000002, 136.31488000000002, 138.412032, 140.509184, 142.606336, 144.70348800000002, 146.80064000000002, 148.897792, 150.994944, 153.092096, 155.18924800000002, 157.28640000000001, 159.383552, 161.480704, 163.577856, 165.67500800000002, 167.77216, 169.869312, 171.966464, 174.06361600000002, 176.16076800000002, 178.25792, 180.355072, 182.452224, 184.54937600000002, 186.64652800000002, 188.74368, 190.840832, 192.937984, 195.03513600000002, 197.13228800000002, 199.22944, 201.326592, 203.423744, 205.52089600000002, 207.61804800000002, 209.7152, 211.812352, 213.90950400000003, 216.00665600000002, 218.10380800000001, 220.20096, 222.298112, 224.39526400000003, 226.49241600000002, 228.589568, 230.68672, 232.783872, 234.88102400000002, 236.97817600000002, 239.075328, 241.17248, 243.269632, 245.36678400000002, 247.46393600000002, 249.561088, 251.65824, 253.75539200000003, 255.85254400000002, 257.949696, 260.046848, 262.144, 264.241152, 266.338304, 268.43545600000004, 270.53260800000004, 272.62976000000003, 274.726912, 276.824064, 278.921216, 281.018368, 283.11552, 285.212672, 287.309824, 289.40697600000004, 291.50412800000004, 293.60128000000003, 295.698432, 297.795584, 299.892736, 301.989888, 304.08704, 306.184192, 308.28134400000005, 310.37849600000004, 312.47564800000004, 314.57280000000003, 316.669952, 318.767104, 320.864256, 322.961408, 325.05856, 327.155712, 329.25286400000005, 331.35001600000004, 333.44716800000003, 335.54432, 337.641472, 339.738624, 341.835776, 343.932928, 346.03008, 348.12723200000005, 350.22438400000004], "y": [0.5099999904632568, 0.621999979019165, 0.6439999938011169, 0.6700000166893005, 0.6790000200271606, 0.6869999766349792, 0.6959999799728394, 0.6790000200271606, 0.6880000233650208, 0.7049999833106995, 0.699999988079071, 0.6990000009536743, 0.6940000057220459, 0.7110000252723694, 0.7120000123977661, 0.7070000171661377, 0.7070000171661377, 0.6990000009536743, 0.7009999752044678, 0.7160000205039978, 0.7200000286102295, 0.7149999737739563, 0.7250000238418579, 0.7210000157356262, 0.722000002861023, 0.7310000061988831, 0.7289999723434448, 0.7319999933242798, 0.7250000238418579, 0.722000002861023, 0.7210000157356262, 0.7170000076293945, 0.7260000109672546, 0.7250000238418579, 0.7210000157356262, 0.7200000286102295, 0.7379999756813049, 0.7239999771118164, 0.7239999771118164, 0.7080000042915344, 0.7289999723434448, 0.7289999723434448, 0.7300000190734863, 0.7329999804496765, 0.7319999933242798, 0.7350000143051147, 0.7390000224113464, 0.7350000143051147, 0.7289999723434448, 0.734000027179718, 0.7329999804496765, 0.7400000095367432, 0.7409999966621399, 0.7310000061988831, 0.7350000143051147, 0.7360000014305115, 0.7360000014305115, 0.7409999966621399, 0.7319999933242798, 0.7409999966621399, 0.7400000095367432, 0.7390000224113464, 0.7329999804496765, 0.7459999918937683, 0.753000020980835, 0.746999979019165, 0.734000027179718, 0.7369999885559082, 0.7419999837875366, 0.734000027179718, 0.7419999837875366, 0.7289999723434448, 0.7350000143051147, 0.7300000190734863, 0.7519999742507935, 0.7390000224113464, 0.7400000095367432, 0.7409999966621399, 0.7429999709129333, 0.7450000047683716, 0.7329999804496765, 0.7260000109672546, 0.7570000290870667, 0.7360000014305115, 0.7519999742507935, 0.7419999837875366, 0.7379999756813049, 0.7390000224113464, 0.7490000128746033, 0.734000027179718, 0.7360000014305115, 0.7390000224113464, 0.7440000176429749, 0.7450000047683716, 0.7319999933242798, 0.7429999709129333, 0.7519999742507935, 0.7540000081062317, 0.7519999742507935, 0.753000020980835, 0.7480000257492065, 0.7440000176429749, 0.7459999918937683, 0.7369999885559082, 0.7419999837875366, 0.7480000257492065, 0.7419999837875366, 0.765999972820282, 0.746999979019165, 0.7459999918937683, 0.7570000290870667, 0.7390000224113464, 0.7409999966621399, 0.7459999918937683, 0.75, 0.7570000290870667, 0.753000020980835, 0.7549999952316284, 0.7519999742507935, 0.7490000128746033, 0.746999979019165, 0.7459999918937683, 0.7459999918937683, 0.746999979019165, 0.7409999966621399, 0.7419999837875366, 0.7459999918937683, 0.7440000176429749, 0.7459999918937683, 0.7490000128746033, 0.7450000047683716, 0.7409999966621399, 0.7419999837875366, 0.7490000128746033, 0.7590000033378601, 0.7549999952316284, 0.7549999952316284, 0.746999979019165, 0.753000020980835, 0.7549999952316284, 0.746999979019165, 0.7580000162124634, 0.7490000128746033, 0.753000020980835, 0.75, 0.75, 0.7540000081062317, 0.7540000081062317, 0.7490000128746033, 0.7570000290870667, 0.7570000290870667, 0.7590000033378601, 0.7559999823570251, 0.7620000243186951, 0.7590000033378601, 0.7509999871253967, 0.7639999985694885, 0.7580000162124634, 0.7599999904632568, 0.7620000243186951, 0.7590000033378601, 0.7609999775886536, 0.7559999823570251, 0.75, 0.7509999871253967, 0.7549999952316284, 0.7540000081062317, 0.7540000081062317], "label": "FineWeb: base filtering only"}}, "layout": {"title": {"text": "The different FineWeb processing steps"}}}
assets/data/plots/all_filtering_steps/siqa_acc_norm.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"data": {"big-run-fineweb-v1-all-dumps": {"x": [0.0, 2.0971520000000003, 4.194304000000001, 6.291456, 8.388608000000001, 10.48576, 12.582912, 14.680064000000002, 16.777216000000003, 18.874368, 20.97152, 23.068672000000003, 25.165824, 27.262976000000002, 29.360128000000003, 31.45728, 33.554432000000006, 35.651584, 37.748736, 39.845888, 41.94304, 44.040192000000005, 46.137344000000006, 48.234496, 50.331648, 52.4288, 54.525952000000004, 56.623104000000005, 58.720256000000006, 60.817408, 62.91456, 65.011712, 67.10886400000001, 69.206016, 71.303168, 73.40032000000001, 75.497472, 77.59462400000001, 79.691776, 81.788928, 83.88608, 85.983232, 88.08038400000001, 90.177536, 92.27468800000001, 94.37184, 96.468992, 98.56614400000001, 100.663296, 102.76044800000001, 104.8576, 106.95475200000001, 109.05190400000001, 111.149056, 113.24620800000001, 115.34336, 117.44051200000001, 119.537664, 121.634816, 123.73196800000001, 125.82912, 127.92627200000001, 130.023424, 132.120576, 134.21772800000002, 136.31488000000002, 138.412032, 140.509184, 142.606336, 144.70348800000002, 146.80064000000002, 148.897792, 150.994944, 153.092096, 155.18924800000002, 157.28640000000001, 159.383552, 161.480704, 163.577856, 165.67500800000002, 167.77216, 169.869312, 171.966464, 174.06361600000002, 176.16076800000002, 178.25792, 180.355072, 182.452224, 184.54937600000002, 186.64652800000002, 188.74368, 190.840832, 192.937984, 195.03513600000002, 197.13228800000002, 199.22944, 201.326592, 203.423744, 205.52089600000002, 207.61804800000002, 209.7152, 211.812352, 213.90950400000003, 216.00665600000002, 218.10380800000001, 220.20096, 222.298112, 224.39526400000003, 226.49241600000002, 228.589568, 230.68672, 232.783872, 234.88102400000002, 236.97817600000002, 239.075328, 241.17248, 243.269632, 245.36678400000002, 247.46393600000002, 249.561088, 251.65824, 253.75539200000003, 255.85254400000002, 257.949696, 260.046848, 262.144, 264.241152, 266.338304, 268.43545600000004, 270.53260800000004, 272.62976000000003, 274.726912, 276.824064, 278.921216, 281.018368, 283.11552, 285.212672, 287.309824, 289.40697600000004, 291.50412800000004, 293.60128000000003, 295.698432, 297.795584, 299.892736, 301.989888, 304.08704, 306.184192, 308.28134400000005, 310.37849600000004, 312.47564800000004, 314.57280000000003, 316.669952, 318.767104, 320.864256, 322.961408, 325.05856, 327.155712, 329.25286400000005, 331.35001600000004, 333.44716800000003, 335.54432, 337.641472, 339.738624, 341.835776, 343.932928, 346.03008, 348.12723200000005, 350.22438400000004], "y": [0.3619999885559082, 0.395000010728836, 0.3970000147819519, 0.3930000066757202, 0.4050000011920929, 0.3899999856948852, 0.4070000052452087, 0.4040000140666961, 0.4189999997615814, 0.4000000059604645, 0.4269999861717224, 0.4009999930858612, 0.3980000019073486, 0.414000004529953, 0.4120000004768371, 0.4050000011920929, 0.4070000052452087, 0.4070000052452087, 0.4129999876022339, 0.4009999930858612, 0.4070000052452087, 0.4059999883174896, 0.4050000011920929, 0.4059999883174896, 0.4090000092983246, 0.4160000085830688, 0.4059999883174896, 0.3899999856948852, 0.3899999856948852, 0.4009999930858612, 0.3970000147819519, 0.3959999978542328, 0.4110000133514404, 0.4040000140666961, 0.4110000133514404, 0.4050000011920929, 0.4090000092983246, 0.402999997138977, 0.4189999997615814, 0.3980000019073486, 0.4059999883174896, 0.4120000004768371, 0.4149999916553497, 0.4059999883174896, 0.4250000119209289, 0.4110000133514404, 0.4070000052452087, 0.4120000004768371, 0.4120000004768371, 0.4020000100135803, 0.4050000011920929, 0.4009999930858612, 0.4079999923706054, 0.4110000133514404, 0.4110000133514404, 0.4059999883174896, 0.4040000140666961, 0.3980000019073486, 0.395000010728836, 0.4050000011920929, 0.402999997138977, 0.4020000100135803, 0.4090000092983246, 0.4079999923706054, 0.4020000100135803, 0.4000000059604645, 0.4020000100135803, 0.4090000092983246, 0.4160000085830688, 0.3959999978542328, 0.3970000147819519, 0.3970000147819519, 0.3989999890327453, 0.3970000147819519, 0.4009999930858612, 0.4020000100135803, 0.4009999930858612, 0.4050000011920929, 0.4110000133514404, 0.4050000011920929, 0.414000004529953, 0.4059999883174896, 0.4020000100135803, 0.395000010728836, 0.4009999930858612, 0.402999997138977, 0.4040000140666961, 0.4020000100135803, 0.3989999890327453, 0.3980000019073486, 0.4040000140666961, 0.4059999883174896, 0.3899999856948852, 0.4009999930858612, 0.3980000019073486, 0.4040000140666961, 0.4110000133514404, 0.3939999938011169, 0.4040000140666961, 0.4040000140666961, 0.4059999883174896, 0.402999997138977, 0.4020000100135803, 0.4040000140666961, 0.4059999883174896, 0.3970000147819519, 0.3989999890327453, 0.4040000140666961, 0.4090000092983246, 0.4059999883174896, 0.4020000100135803, 0.4099999964237213, 0.4050000011920929, 0.4099999964237213, 0.402999997138977, 0.4040000140666961, 0.3989999890327453, 0.4070000052452087, 0.4129999876022339, 0.4120000004768371, 0.4149999916553497, 0.4169999957084656, 0.3989999890327453, 0.414000004529953, 0.4169999957084656, 0.4079999923706054, 0.4020000100135803, 0.4009999930858612, 0.4020000100135803, 0.4090000092983246, 0.4189999997615814, 0.4050000011920929, 0.4090000092983246, 0.4040000140666961, 0.4090000092983246, 0.4160000085830688, 0.402999997138977, 0.4169999957084656, 0.4160000085830688, 0.4149999916553497, 0.4059999883174896, 0.4169999957084656, 0.4099999964237213, 0.4210000038146972, 0.4239999949932098, 0.4160000085830688, 0.4090000092983246, 0.4120000004768371, 0.4110000133514404, 0.4189999997615814, 0.414000004529953, 0.4059999883174896, 0.4110000133514404, 0.4149999916553497, 0.4110000133514404, 0.4129999876022339, 0.4120000004768371, 0.4149999916553497, 0.414000004529953, 0.4079999923706054, 0.4070000052452087, 0.4099999964237213, 0.4059999883174896, 0.4040000140666961, 0.4149999916553497, 0.4099999964237213, 0.4149999916553497, 0.414000004529953], "label": "FineWeb: id mh + C4 + custom filters"}, "big-run-sampled-fineweb-c4-filters": {"x": [0.0, 2.0971520000000003, 4.194304000000001, 6.291456, 8.388608000000001, 10.48576, 12.582912, 14.680064000000002, 16.777216000000003, 18.874368, 20.97152, 23.068672000000003, 25.165824, 27.262976000000002, 29.360128000000003, 31.45728, 33.554432000000006, 35.651584, 37.748736, 39.845888, 41.94304, 44.040192000000005, 46.137344000000006, 48.234496, 50.331648, 52.4288, 54.525952000000004, 56.623104000000005, 58.720256000000006, 60.817408, 62.91456, 65.011712, 67.10886400000001, 69.206016, 71.303168, 73.40032000000001, 75.497472, 77.59462400000001, 79.691776, 81.788928, 83.88608, 85.983232, 88.08038400000001, 90.177536, 92.27468800000001, 94.37184, 96.468992, 98.56614400000001, 100.663296, 102.76044800000001, 104.8576, 106.95475200000001, 109.05190400000001, 111.149056, 113.24620800000001, 115.34336, 117.44051200000001, 119.537664, 121.634816, 123.73196800000001, 125.82912, 127.92627200000001, 130.023424, 132.120576, 134.21772800000002, 136.31488000000002, 138.412032, 140.509184, 142.606336, 144.70348800000002, 146.80064000000002, 148.897792, 150.994944, 153.092096, 155.18924800000002, 157.28640000000001, 159.383552, 161.480704, 163.577856, 165.67500800000002, 167.77216, 169.869312, 171.966464, 174.06361600000002, 176.16076800000002, 178.25792, 180.355072, 182.452224, 184.54937600000002, 186.64652800000002, 188.74368, 190.840832, 192.937984, 195.03513600000002, 197.13228800000002, 199.22944, 201.326592, 203.423744, 205.52089600000002, 207.61804800000002, 209.7152, 211.812352, 213.90950400000003, 216.00665600000002, 218.10380800000001, 220.20096, 222.298112, 224.39526400000003, 226.49241600000002, 228.589568, 230.68672, 232.783872, 234.88102400000002, 236.97817600000002, 239.075328, 241.17248, 243.269632, 245.36678400000002, 247.46393600000002, 249.561088, 251.65824, 253.75539200000003, 255.85254400000002, 257.949696, 260.046848, 262.144, 264.241152, 266.338304, 268.43545600000004, 270.53260800000004, 272.62976000000003, 274.726912, 276.824064, 278.921216, 281.018368, 283.11552, 285.212672, 287.309824, 289.40697600000004, 291.50412800000004, 293.60128000000003, 295.698432, 297.795584, 299.892736, 301.989888, 304.08704, 306.184192, 308.28134400000005, 310.37849600000004, 312.47564800000004, 314.57280000000003, 316.669952, 318.767104, 320.864256, 322.961408, 325.05856, 327.155712, 329.25286400000005, 331.35001600000004, 333.44716800000003, 335.54432, 337.641472, 339.738624, 341.835776, 343.932928, 346.03008, 348.12723200000005, 350.22438400000004], "y": [0.3619999885559082, 0.4009999930858612, 0.4110000133514404, 0.3889999985694885, 0.4020000100135803, 0.4079999923706054, 0.3880000114440918, 0.4000000059604645, 0.3910000026226043, 0.3980000019073486, 0.395000010728836, 0.3939999938011169, 0.4050000011920929, 0.4099999964237213, 0.4099999964237213, 0.4099999964237213, 0.4059999883174896, 0.4059999883174896, 0.402999997138977, 0.4079999923706054, 0.4169999957084656, 0.4020000100135803, 0.3970000147819519, 0.3970000147819519, 0.4210000038146972, 0.3970000147819519, 0.3980000019073486, 0.395000010728836, 0.4000000059604645, 0.3989999890327453, 0.4009999930858612, 0.4129999876022339, 0.4120000004768371, 0.4120000004768371, 0.3919999897480011, 0.414000004529953, 0.4009999930858612, 0.4090000092983246, 0.4099999964237213, 0.4079999923706054, 0.4079999923706054, 0.4009999930858612, 0.3959999978542328, 0.3959999978542328, 0.4099999964237213, 0.3959999978542328, 0.4050000011920929, 0.4059999883174896, 0.4110000133514404, 0.402999997138977, 0.4040000140666961, 0.414000004529953, 0.3989999890327453, 0.4199999868869781, 0.4050000011920929, 0.4070000052452087, 0.4079999923706054, 0.3989999890327453, 0.4099999964237213, 0.4020000100135803, 0.4090000092983246, 0.414000004529953, 0.4189999997615814, 0.4079999923706054, 0.4180000126361847, 0.4059999883174896, 0.4070000052452087, 0.4120000004768371, 0.402999997138977, 0.4059999883174896, 0.4090000092983246, 0.4110000133514404, 0.3989999890327453, 0.4079999923706054, 0.4040000140666961, 0.414000004529953, 0.4079999923706054, 0.402999997138977, 0.4099999964237213, 0.402999997138977, 0.4120000004768371, 0.4000000059604645, 0.4090000092983246, 0.4050000011920929, 0.4129999876022339, 0.4040000140666961, 0.4129999876022339, 0.4079999923706054, 0.4180000126361847, 0.414000004529953, 0.4160000085830688, 0.4199999868869781, 0.4020000100135803, 0.4020000100135803, 0.4079999923706054, 0.4009999930858612, 0.4040000140666961, 0.4099999964237213, 0.4050000011920929, 0.4040000140666961, 0.414000004529953, 0.4180000126361847, 0.4050000011920929, 0.414000004529953, 0.4079999923706054, 0.4050000011920929, 0.4050000011920929, 0.4070000052452087, 0.4050000011920929, 0.4059999883174896, 0.4059999883174896, 0.4000000059604645, 0.4120000004768371, 0.4059999883174896, 0.4070000052452087, 0.4120000004768371, 0.4050000011920929, 0.4059999883174896, 0.3989999890327453, 0.3959999978542328, 0.4020000100135803, 0.3989999890327453, 0.3959999978542328, 0.3989999890327453, 0.4059999883174896, 0.4070000052452087, 0.4120000004768371, 0.4009999930858612, 0.4120000004768371, 0.4129999876022339, 0.4090000092983246, 0.414000004529953, 0.4099999964237213, 0.4160000085830688, 0.4040000140666961, 0.4099999964237213, 0.414000004529953, 0.4050000011920929, 0.402999997138977, 0.4040000140666961, 0.4079999923706054, 0.3989999890327453, 0.4059999883174896, 0.3980000019073486, 0.4070000052452087, 0.4020000100135803, 0.4009999930858612, 0.4000000059604645, 0.4079999923706054, 0.4070000052452087, 0.402999997138977, 0.4079999923706054, 0.4050000011920929, 0.4040000140666961, 0.4070000052452087, 0.4020000100135803, 0.3959999978542328, 0.402999997138977, 0.402999997138977, 0.4099999964237213, 0.4090000092983246, 0.4009999930858612, 0.4059999883174896, 0.4020000100135803, 0.4040000140666961, 0.4009999930858612, 0.4070000052452087, 0.4070000052452087], "label": "FineWeb: id mh + C4 filters"}, "big-run-sampled_full_ind_minhash": {"x": [0.0, 2.0971520000000003, 4.194304000000001, 6.291456, 8.388608000000001, 10.48576, 12.582912, 14.680064000000002, 16.777216000000003, 18.874368, 20.97152, 23.068672000000003, 25.165824, 27.262976000000002, 29.360128000000003, 31.45728, 33.554432000000006, 35.651584, 37.748736, 39.845888, 41.94304, 44.040192000000005, 46.137344000000006, 48.234496, 50.331648, 52.4288, 54.525952000000004, 56.623104000000005, 58.720256000000006, 60.817408, 62.91456, 65.011712, 67.10886400000001, 69.206016, 71.303168, 73.40032000000001, 75.497472, 77.59462400000001, 79.691776, 81.788928, 83.88608, 85.983232, 88.08038400000001, 90.177536, 92.27468800000001, 94.37184, 96.468992, 98.56614400000001, 100.663296, 102.76044800000001, 104.8576, 106.95475200000001, 109.05190400000001, 111.149056, 113.24620800000001, 115.34336, 117.44051200000001, 119.537664, 121.634816, 123.73196800000001, 125.82912, 127.92627200000001, 130.023424, 132.120576, 134.21772800000002, 136.31488000000002, 138.412032, 140.509184, 142.606336, 144.70348800000002, 146.80064000000002, 148.897792, 150.994944, 153.092096, 155.18924800000002, 157.28640000000001, 159.383552, 161.480704, 163.577856, 165.67500800000002, 167.77216, 169.869312, 171.966464, 174.06361600000002, 176.16076800000002, 178.25792, 180.355072, 182.452224, 184.54937600000002, 186.64652800000002, 188.74368, 190.840832, 192.937984, 195.03513600000002, 197.13228800000002, 199.22944, 201.326592, 203.423744, 205.52089600000002, 207.61804800000002, 209.7152, 211.812352, 213.90950400000003, 216.00665600000002, 218.10380800000001, 220.20096, 222.298112, 224.39526400000003, 226.49241600000002, 228.589568, 230.68672, 232.783872, 234.88102400000002, 236.97817600000002, 239.075328, 241.17248, 243.269632, 245.36678400000002, 247.46393600000002, 249.561088, 251.65824, 253.75539200000003, 255.85254400000002, 257.949696, 260.046848, 262.144, 264.241152, 266.338304, 268.43545600000004, 270.53260800000004, 272.62976000000003, 274.726912, 276.824064, 278.921216, 281.018368, 283.11552, 285.212672, 287.309824, 289.40697600000004, 291.50412800000004, 293.60128000000003, 295.698432, 297.795584, 299.892736, 301.989888, 304.08704, 306.184192, 308.28134400000005, 310.37849600000004, 312.47564800000004, 314.57280000000003, 316.669952, 318.767104, 320.864256, 322.961408, 325.05856, 327.155712, 329.25286400000005, 331.35001600000004, 333.44716800000003, 335.54432, 337.641472, 339.738624, 341.835776, 343.932928, 346.03008, 348.12723200000005, 350.22438400000004], "y": [0.3619999885559082, 0.3959999978542328, 0.4070000052452087, 0.3910000026226043, 0.3939999938011169, 0.3980000019073486, 0.3930000066757202, 0.4059999883174896, 0.4000000059604645, 0.3889999985694885, 0.3989999890327453, 0.402999997138977, 0.3980000019073486, 0.4000000059604645, 0.3989999890327453, 0.4000000059604645, 0.3930000066757202, 0.3939999938011169, 0.3930000066757202, 0.3880000114440918, 0.3980000019073486, 0.4020000100135803, 0.3980000019073486, 0.3959999978542328, 0.3989999890327453, 0.4020000100135803, 0.402999997138977, 0.4009999930858612, 0.4079999923706054, 0.4009999930858612, 0.4070000052452087, 0.4070000052452087, 0.4020000100135803, 0.4059999883174896, 0.4079999923706054, 0.4110000133514404, 0.3989999890327453, 0.4000000059604645, 0.402999997138977, 0.3939999938011169, 0.3939999938011169, 0.3980000019073486, 0.3980000019073486, 0.4050000011920929, 0.4009999930858612, 0.4120000004768371, 0.402999997138977, 0.4090000092983246, 0.402999997138977, 0.3980000019073486, 0.3959999978542328, 0.3970000147819519, 0.4009999930858612, 0.4070000052452087, 0.4070000052452087, 0.4079999923706054, 0.402999997138977, 0.3989999890327453, 0.3980000019073486, 0.4009999930858612, 0.4050000011920929, 0.4000000059604645, 0.3959999978542328, 0.3980000019073486, 0.3970000147819519, 0.4129999876022339, 0.402999997138977, 0.4090000092983246, 0.4050000011920929, 0.402999997138977, 0.4009999930858612, 0.3980000019073486, 0.402999997138977, 0.4020000100135803, 0.3980000019073486, 0.3970000147819519, 0.402999997138977, 0.4000000059604645, 0.4149999916553497, 0.3959999978542328, 0.4000000059604645, 0.4020000100135803, 0.3919999897480011, 0.4110000133514404, 0.4090000092983246, 0.4070000052452087, 0.4059999883174896, 0.4020000100135803, 0.3959999978542328, 0.4050000011920929, 0.395000010728836, 0.4020000100135803, 0.3959999978542328, 0.4090000092983246, 0.4070000052452087, 0.4040000140666961, 0.4000000059604645, 0.4020000100135803, 0.402999997138977, 0.4050000011920929, 0.414000004529953, 0.4009999930858612, 0.402999997138977, 0.4020000100135803, 0.3980000019073486, 0.4020000100135803, 0.4000000059604645, 0.402999997138977, 0.395000010728836, 0.4009999930858612, 0.3959999978542328, 0.4120000004768371, 0.3989999890327453, 0.3980000019073486, 0.4040000140666961, 0.4070000052452087, 0.3989999890327453, 0.3989999890327453, 0.3970000147819519, 0.3970000147819519, 0.3980000019073486, 0.3970000147819519, 0.3989999890327453, 0.4110000133514404, 0.4050000011920929, 0.3939999938011169, 0.3970000147819519, 0.4009999930858612, 0.3989999890327453, 0.3980000019073486, 0.402999997138977, 0.3860000073909759, 0.402999997138977, 0.4020000100135803, 0.3959999978542328, 0.3989999890327453, 0.3959999978542328, 0.4000000059604645, 0.395000010728836, 0.395000010728836, 0.3939999938011169, 0.3980000019073486, 0.4020000100135803, 0.3989999890327453, 0.3989999890327453, 0.3880000114440918, 0.3980000019073486, 0.3959999978542328, 0.4009999930858612, 0.4079999923706054, 0.395000010728836, 0.3980000019073486, 0.4009999930858612, 0.3980000019073486, 0.4009999930858612, 0.3989999890327453, 0.4000000059604645, 0.402999997138977, 0.4000000059604645, 0.4020000100135803, 0.3959999978542328, 0.395000010728836, 0.4020000100135803, 0.3970000147819519, 0.3980000019073486, 0.3959999978542328, 0.4000000059604645, 0.4050000011920929], "label": "FineWeb: independent MinHash (id mh)"}, "big-run-sampled_full_filtered_no_dedup": {"x": [0.0, 2.0971520000000003, 4.194304000000001, 6.291456, 8.388608000000001, 10.48576, 12.582912, 14.680064000000002, 16.777216000000003, 18.874368, 20.97152, 23.068672000000003, 25.165824, 27.262976000000002, 29.360128000000003, 31.45728, 33.554432000000006, 35.651584, 37.748736, 39.845888, 41.94304, 44.040192000000005, 46.137344000000006, 48.234496, 50.331648, 52.4288, 54.525952000000004, 56.623104000000005, 58.720256000000006, 60.817408, 62.91456, 65.011712, 67.10886400000001, 69.206016, 71.303168, 73.40032000000001, 75.497472, 77.59462400000001, 79.691776, 81.788928, 83.88608, 85.983232, 88.08038400000001, 90.177536, 92.27468800000001, 94.37184, 96.468992, 98.56614400000001, 100.663296, 102.76044800000001, 104.8576, 106.95475200000001, 109.05190400000001, 111.149056, 113.24620800000001, 115.34336, 117.44051200000001, 119.537664, 121.634816, 123.73196800000001, 125.82912, 127.92627200000001, 130.023424, 132.120576, 134.21772800000002, 136.31488000000002, 138.412032, 140.509184, 142.606336, 144.70348800000002, 146.80064000000002, 148.897792, 150.994944, 153.092096, 155.18924800000002, 157.28640000000001, 159.383552, 161.480704, 163.577856, 165.67500800000002, 167.77216, 169.869312, 171.966464, 174.06361600000002, 176.16076800000002, 178.25792, 180.355072, 182.452224, 184.54937600000002, 186.64652800000002, 188.74368, 190.840832, 192.937984, 195.03513600000002, 197.13228800000002, 199.22944, 201.326592, 203.423744, 205.52089600000002, 207.61804800000002, 209.7152, 211.812352, 213.90950400000003, 216.00665600000002, 218.10380800000001, 220.20096, 222.298112, 224.39526400000003, 226.49241600000002, 228.589568, 230.68672, 232.783872, 234.88102400000002, 236.97817600000002, 239.075328, 241.17248, 243.269632, 245.36678400000002, 247.46393600000002, 249.561088, 251.65824, 253.75539200000003, 255.85254400000002, 257.949696, 260.046848, 262.144, 264.241152, 266.338304, 268.43545600000004, 270.53260800000004, 272.62976000000003, 274.726912, 276.824064, 278.921216, 281.018368, 283.11552, 285.212672, 287.309824, 289.40697600000004, 291.50412800000004, 293.60128000000003, 295.698432, 297.795584, 299.892736, 301.989888, 304.08704, 306.184192, 308.28134400000005, 310.37849600000004, 312.47564800000004, 314.57280000000003, 316.669952, 318.767104, 320.864256, 322.961408, 325.05856, 327.155712, 329.25286400000005, 331.35001600000004, 333.44716800000003, 335.54432, 337.641472, 339.738624, 341.835776, 343.932928, 346.03008, 348.12723200000005, 350.22438400000004], "y": [0.3619999885559082, 0.4000000059604645, 0.395000010728836, 0.3959999978542328, 0.4020000100135803, 0.4000000059604645, 0.3959999978542328, 0.3930000066757202, 0.3899999856948852, 0.402999997138977, 0.4009999930858612, 0.3930000066757202, 0.4050000011920929, 0.3939999938011169, 0.4000000059604645, 0.3989999890327453, 0.3959999978542328, 0.4020000100135803, 0.4000000059604645, 0.3939999938011169, 0.395000010728836, 0.3919999897480011, 0.3980000019073486, 0.3910000026226043, 0.3880000114440918, 0.3959999978542328, 0.3980000019073486, 0.3989999890327453, 0.402999997138977, 0.3959999978542328, 0.3980000019073486, 0.395000010728836, 0.4090000092983246, 0.4090000092983246, 0.3889999985694885, 0.3959999978542328, 0.3880000114440918, 0.3840000033378601, 0.3959999978542328, 0.3880000114440918, 0.3939999938011169, 0.3970000147819519, 0.3910000026226043, 0.3939999938011169, 0.4020000100135803, 0.3980000019073486, 0.3970000147819519, 0.4009999930858612, 0.3919999897480011, 0.3899999856948852, 0.3989999890327453, 0.3860000073909759, 0.3860000073909759, 0.3970000147819519, 0.3959999978542328, 0.3939999938011169, 0.3840000033378601, 0.3869999945163727, 0.402999997138977, 0.4050000011920929, 0.395000010728836, 0.3880000114440918, 0.3869999945163727, 0.3939999938011169, 0.402999997138977, 0.3899999856948852, 0.3910000026226043, 0.3910000026226043, 0.4009999930858612, 0.3919999897480011, 0.3970000147819519, 0.3919999897480011, 0.3930000066757202, 0.3869999945163727, 0.3880000114440918, 0.3849999904632568, 0.3930000066757202, 0.395000010728836, 0.3889999985694885, 0.3959999978542328, 0.3989999890327453, 0.402999997138977, 0.3939999938011169, 0.4000000059604645, 0.4000000059604645, 0.4050000011920929, 0.3989999890327453, 0.3869999945163727, 0.3910000026226043, 0.3889999985694885, 0.3889999985694885, 0.4000000059604645, 0.3910000026226043, 0.3970000147819519, 0.3989999890327453, 0.3989999890327453, 0.3959999978542328, 0.3910000026226043, 0.3880000114440918, 0.3939999938011169, 0.382999986410141, 0.3849999904632568, 0.3959999978542328, 0.3989999890327453, 0.3959999978542328, 0.3880000114440918, 0.3840000033378601, 0.3980000019073486, 0.4000000059604645, 0.4000000059604645, 0.4020000100135803, 0.395000010728836, 0.3910000026226043, 0.3919999897480011, 0.4040000140666961, 0.3989999890327453, 0.4020000100135803, 0.3910000026226043, 0.4009999930858612, 0.3959999978542328, 0.3939999938011169, 0.3930000066757202, 0.3910000026226043, 0.3970000147819519, 0.3880000114440918, 0.3970000147819519, 0.3959999978542328, 0.3889999985694885, 0.3970000147819519, 0.4009999930858612, 0.3970000147819519, 0.3959999978542328, 0.3959999978542328, 0.3989999890327453, 0.4040000140666961, 0.3959999978542328, 0.3980000019073486, 0.3970000147819519, 0.3970000147819519, 0.3989999890327453, 0.4020000100135803, 0.3980000019073486, 0.4000000059604645, 0.4000000059604645, 0.402999997138977, 0.4090000092983246, 0.3970000147819519, 0.4020000100135803, 0.3970000147819519, 0.4009999930858612, 0.3959999978542328, 0.3970000147819519, 0.3989999890327453, 0.3939999938011169, 0.3989999890327453, 0.4000000059604645, 0.4000000059604645, 0.3989999890327453, 0.4050000011920929, 0.4059999883174896, 0.4009999930858612, 0.3989999890327453, 0.3959999978542328, 0.3939999938011169, 0.3970000147819519, 0.4009999930858612, 0.3989999890327453, 0.3939999938011169], "label": "FineWeb: base filtering only"}}, "layout": {"title": {"text": "The different FineWeb processing steps"}}}
assets/data/plots/all_filtering_steps/winogrande_acc_norm.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"data": {"big-run-sampled_full_ind_minhash": {"x": [0.0, 2.0971520000000003, 4.194304000000001, 6.291456, 8.388608000000001, 10.48576, 12.582912, 14.680064000000002, 16.777216000000003, 18.874368, 20.97152, 23.068672000000003, 25.165824, 27.262976000000002, 29.360128000000003, 31.45728, 33.554432000000006, 35.651584, 37.748736, 39.845888, 41.94304, 44.040192000000005, 46.137344000000006, 48.234496, 50.331648, 52.4288, 54.525952000000004, 56.623104000000005, 58.720256000000006, 60.817408, 62.91456, 65.011712, 67.10886400000001, 69.206016, 71.303168, 73.40032000000001, 75.497472, 77.59462400000001, 79.691776, 81.788928, 83.88608, 85.983232, 88.08038400000001, 90.177536, 92.27468800000001, 94.37184, 96.468992, 98.56614400000001, 100.663296, 102.76044800000001, 104.8576, 106.95475200000001, 109.05190400000001, 111.149056, 113.24620800000001, 115.34336, 117.44051200000001, 119.537664, 121.634816, 123.73196800000001, 125.82912, 127.92627200000001, 130.023424, 132.120576, 134.21772800000002, 136.31488000000002, 138.412032, 140.509184, 142.606336, 144.70348800000002, 146.80064000000002, 148.897792, 150.994944, 153.092096, 155.18924800000002, 157.28640000000001, 159.383552, 161.480704, 163.577856, 165.67500800000002, 167.77216, 169.869312, 171.966464, 174.06361600000002, 176.16076800000002, 178.25792, 180.355072, 182.452224, 184.54937600000002, 186.64652800000002, 188.74368, 190.840832, 192.937984, 195.03513600000002, 197.13228800000002, 199.22944, 201.326592, 203.423744, 205.52089600000002, 207.61804800000002, 209.7152, 211.812352, 213.90950400000003, 216.00665600000002, 218.10380800000001, 220.20096, 222.298112, 224.39526400000003, 226.49241600000002, 228.589568, 230.68672, 232.783872, 234.88102400000002, 236.97817600000002, 239.075328, 241.17248, 243.269632, 245.36678400000002, 247.46393600000002, 249.561088, 251.65824, 253.75539200000003, 255.85254400000002, 257.949696, 260.046848, 262.144, 264.241152, 266.338304, 268.43545600000004, 270.53260800000004, 272.62976000000003, 274.726912, 276.824064, 278.921216, 281.018368, 283.11552, 285.212672, 287.309824, 289.40697600000004, 291.50412800000004, 293.60128000000003, 295.698432, 297.795584, 299.892736, 301.989888, 304.08704, 306.184192, 308.28134400000005, 310.37849600000004, 312.47564800000004, 314.57280000000003, 316.669952, 318.767104, 320.864256, 322.961408, 325.05856, 327.155712, 329.25286400000005, 331.35001600000004, 333.44716800000003, 335.54432, 337.641472, 339.738624, 341.835776, 343.932928, 346.03008, 348.12723200000005, 350.22438400000004], "y": [0.4970000088214874, 0.4880000054836273, 0.492000013589859, 0.5059999823570251, 0.5139999985694885, 0.5070000290870667, 0.5090000033378601, 0.5230000019073486, 0.5189999938011169, 0.5189999938011169, 0.5220000147819519, 0.5149999856948853, 0.5260000228881836, 0.5329999923706055, 0.5180000066757202, 0.5289999842643738, 0.5400000214576721, 0.5410000085830688, 0.5440000295639038, 0.5329999923706055, 0.550000011920929, 0.5419999957084656, 0.5360000133514404, 0.5429999828338623, 0.5429999828338623, 0.5450000166893005, 0.5490000247955322, 0.5400000214576721, 0.5509999990463257, 0.5559999942779541, 0.5479999780654907, 0.5540000200271606, 0.5490000247955322, 0.5400000214576721, 0.5429999828338623, 0.5460000038146973, 0.5370000004768372, 0.5479999780654907, 0.5550000071525574, 0.5490000247955322, 0.5400000214576721, 0.5410000085830688, 0.5460000038146973, 0.546999990940094, 0.5479999780654907, 0.546999990940094, 0.5509999990463257, 0.5450000166893005, 0.5590000152587891, 0.5419999957084656, 0.5540000200271606, 0.5440000295639038, 0.5450000166893005, 0.5580000281333923, 0.5540000200271606, 0.5440000295639038, 0.5619999766349792, 0.5450000166893005, 0.5600000023841858, 0.5559999942779541, 0.5600000023841858, 0.5400000214576721, 0.5569999814033508, 0.5600000023841858, 0.5619999766349792, 0.5529999732971191, 0.5649999976158142, 0.5609999895095825, 0.5550000071525574, 0.5609999895095825, 0.5580000281333923, 0.5550000071525574, 0.5619999766349792, 0.5550000071525574, 0.5519999861717224, 0.5600000023841858, 0.5550000071525574, 0.5550000071525574, 0.5590000152587891, 0.5490000247955322, 0.5580000281333923, 0.5600000023841858, 0.5419999957084656, 0.5559999942779541, 0.5559999942779541, 0.5529999732971191, 0.5609999895095825, 0.5519999861717224, 0.5569999814033508, 0.5569999814033508, 0.5509999990463257, 0.5619999766349792, 0.546999990940094, 0.5619999766349792, 0.5460000038146973, 0.5529999732971191, 0.5619999766349792, 0.5690000057220459, 0.5680000185966492, 0.5720000267028809, 0.5640000104904175, 0.5550000071525574, 0.5509999990463257, 0.550000011920929, 0.5600000023841858, 0.5609999895095825, 0.5630000233650208, 0.5649999976158142, 0.5529999732971191, 0.5540000200271606, 0.5529999732971191, 0.5659999847412109, 0.5600000023841858, 0.5590000152587891, 0.5619999766349792, 0.5600000023841858, 0.5730000138282776, 0.5569999814033508, 0.5690000057220459, 0.5619999766349792, 0.5680000185966492, 0.578000009059906, 0.5730000138282776, 0.5550000071525574, 0.5529999732971191, 0.5600000023841858, 0.5630000233650208, 0.5590000152587891, 0.5659999847412109, 0.5669999718666077, 0.5609999895095825, 0.5630000233650208, 0.5569999814033508, 0.5490000247955322, 0.5619999766349792, 0.5550000071525574, 0.5630000233650208, 0.5559999942779541, 0.5559999942779541, 0.5649999976158142, 0.5569999814033508, 0.5619999766349792, 0.5559999942779541, 0.5669999718666077, 0.5609999895095825, 0.5690000057220459, 0.5770000219345093, 0.5690000057220459, 0.5720000267028809, 0.5619999766349792, 0.5649999976158142, 0.5669999718666077, 0.5680000185966492, 0.5699999928474426, 0.5640000104904175, 0.5609999895095825, 0.5740000009536743, 0.5690000057220459, 0.5669999718666077, 0.5720000267028809, 0.5699999928474426, 0.5709999799728394, 0.5740000009536743, 0.5680000185966492, 0.5619999766349792, 0.5690000057220459, 0.5659999847412109, 0.574999988079071], "label": "FineWeb: independent MinHash (id mh)"}, "big-run-fineweb-v1-all-dumps": {"x": [0.0, 2.0971520000000003, 4.194304000000001, 6.291456, 8.388608000000001, 10.48576, 12.582912, 14.680064000000002, 16.777216000000003, 18.874368, 20.97152, 23.068672000000003, 25.165824, 27.262976000000002, 29.360128000000003, 31.45728, 33.554432000000006, 35.651584, 37.748736, 39.845888, 41.94304, 44.040192000000005, 46.137344000000006, 48.234496, 50.331648, 52.4288, 54.525952000000004, 56.623104000000005, 58.720256000000006, 60.817408, 62.91456, 65.011712, 67.10886400000001, 69.206016, 71.303168, 73.40032000000001, 75.497472, 77.59462400000001, 79.691776, 81.788928, 83.88608, 85.983232, 88.08038400000001, 90.177536, 92.27468800000001, 94.37184, 96.468992, 98.56614400000001, 100.663296, 102.76044800000001, 104.8576, 106.95475200000001, 109.05190400000001, 111.149056, 113.24620800000001, 115.34336, 117.44051200000001, 119.537664, 121.634816, 123.73196800000001, 125.82912, 127.92627200000001, 130.023424, 132.120576, 134.21772800000002, 136.31488000000002, 138.412032, 140.509184, 142.606336, 144.70348800000002, 146.80064000000002, 148.897792, 150.994944, 153.092096, 155.18924800000002, 157.28640000000001, 159.383552, 161.480704, 163.577856, 165.67500800000002, 167.77216, 169.869312, 171.966464, 174.06361600000002, 176.16076800000002, 178.25792, 180.355072, 182.452224, 184.54937600000002, 186.64652800000002, 188.74368, 190.840832, 192.937984, 195.03513600000002, 197.13228800000002, 199.22944, 201.326592, 203.423744, 205.52089600000002, 207.61804800000002, 209.7152, 211.812352, 213.90950400000003, 216.00665600000002, 218.10380800000001, 220.20096, 222.298112, 224.39526400000003, 226.49241600000002, 228.589568, 230.68672, 232.783872, 234.88102400000002, 236.97817600000002, 239.075328, 241.17248, 243.269632, 245.36678400000002, 247.46393600000002, 249.561088, 251.65824, 253.75539200000003, 255.85254400000002, 257.949696, 260.046848, 262.144, 264.241152, 266.338304, 268.43545600000004, 270.53260800000004, 272.62976000000003, 274.726912, 276.824064, 278.921216, 281.018368, 283.11552, 285.212672, 287.309824, 289.40697600000004, 291.50412800000004, 293.60128000000003, 295.698432, 297.795584, 299.892736, 301.989888, 304.08704, 306.184192, 308.28134400000005, 310.37849600000004, 312.47564800000004, 314.57280000000003, 316.669952, 318.767104, 320.864256, 322.961408, 325.05856, 327.155712, 329.25286400000005, 331.35001600000004, 333.44716800000003, 335.54432, 337.641472, 339.738624, 341.835776, 343.932928, 346.03008, 348.12723200000005, 350.22438400000004], "y": [0.4970000088214874, 0.4760000109672546, 0.4979999959468841, 0.503000020980835, 0.531000018119812, 0.515999972820282, 0.5220000147819519, 0.5210000276565552, 0.5260000228881836, 0.5289999842643738, 0.5249999761581421, 0.5239999890327454, 0.5189999938011169, 0.5260000228881836, 0.5139999985694885, 0.5299999713897705, 0.5370000004768372, 0.5350000262260437, 0.5329999923706055, 0.531000018119812, 0.5299999713897705, 0.550000011920929, 0.5329999923706055, 0.5260000228881836, 0.5320000052452087, 0.5339999794960022, 0.5429999828338623, 0.5440000295639038, 0.5379999876022339, 0.5509999990463257, 0.5529999732971191, 0.5440000295639038, 0.5479999780654907, 0.5419999957084656, 0.5339999794960022, 0.5440000295639038, 0.5419999957084656, 0.5370000004768372, 0.5289999842643738, 0.5220000147819519, 0.5429999828338623, 0.5519999861717224, 0.5419999957084656, 0.5370000004768372, 0.546999990940094, 0.5509999990463257, 0.5509999990463257, 0.5460000038146973, 0.5519999861717224, 0.5429999828338623, 0.5419999957084656, 0.5379999876022339, 0.5450000166893005, 0.5440000295639038, 0.5440000295639038, 0.5239999890327454, 0.5450000166893005, 0.550000011920929, 0.5550000071525574, 0.5429999828338623, 0.5540000200271606, 0.5410000085830688, 0.5429999828338623, 0.5550000071525574, 0.5509999990463257, 0.5460000038146973, 0.550000011920929, 0.546999990940094, 0.5429999828338623, 0.5299999713897705, 0.550000011920929, 0.5550000071525574, 0.5440000295639038, 0.5410000085830688, 0.5450000166893005, 0.550000011920929, 0.546999990940094, 0.5519999861717224, 0.5529999732971191, 0.550000011920929, 0.5519999861717224, 0.5540000200271606, 0.5379999876022339, 0.5590000152587891, 0.5440000295639038, 0.5540000200271606, 0.5540000200271606, 0.5429999828338623, 0.5450000166893005, 0.5440000295639038, 0.5519999861717224, 0.546999990940094, 0.5519999861717224, 0.5559999942779541, 0.5659999847412109, 0.5649999976158142, 0.5600000023841858, 0.5569999814033508, 0.5550000071525574, 0.5630000233650208, 0.5559999942779541, 0.5669999718666077, 0.5550000071525574, 0.5609999895095825, 0.5580000281333923, 0.5699999928474426, 0.5580000281333923, 0.5490000247955322, 0.5619999766349792, 0.5609999895095825, 0.5529999732971191, 0.5490000247955322, 0.5540000200271606, 0.5590000152587891, 0.5600000023841858, 0.5509999990463257, 0.5569999814033508, 0.5509999990463257, 0.5580000281333923, 0.5580000281333923, 0.5580000281333923, 0.5619999766349792, 0.5649999976158142, 0.5540000200271606, 0.5619999766349792, 0.5659999847412109, 0.5759999752044678, 0.5709999799728394, 0.5550000071525574, 0.5659999847412109, 0.5659999847412109, 0.5680000185966492, 0.5669999718666077, 0.5600000023841858, 0.5619999766349792, 0.5640000104904175, 0.5580000281333923, 0.5580000281333923, 0.5540000200271606, 0.5789999961853027, 0.5600000023841858, 0.5509999990463257, 0.5690000057220459, 0.5709999799728394, 0.5669999718666077, 0.5600000023841858, 0.5619999766349792, 0.5600000023841858, 0.5590000152587891, 0.5600000023841858, 0.5690000057220459, 0.5690000057220459, 0.5789999961853027, 0.5669999718666077, 0.5690000057220459, 0.5649999976158142, 0.5690000057220459, 0.5699999928474426, 0.5669999718666077, 0.5649999976158142, 0.5630000233650208, 0.5559999942779541, 0.5669999718666077, 0.5669999718666077, 0.5720000267028809, 0.5690000057220459, 0.5830000042915344, 0.5640000104904175], "label": "FineWeb: id mh + C4 + custom filters"}, "big-run-sampled-fineweb-c4-filters": {"x": [0.0, 2.0971520000000003, 4.194304000000001, 6.291456, 8.388608000000001, 10.48576, 12.582912, 14.680064000000002, 16.777216000000003, 18.874368, 20.97152, 23.068672000000003, 25.165824, 27.262976000000002, 29.360128000000003, 31.45728, 33.554432000000006, 35.651584, 37.748736, 39.845888, 41.94304, 44.040192000000005, 46.137344000000006, 48.234496, 50.331648, 52.4288, 54.525952000000004, 56.623104000000005, 58.720256000000006, 60.817408, 62.91456, 65.011712, 67.10886400000001, 69.206016, 71.303168, 73.40032000000001, 75.497472, 77.59462400000001, 79.691776, 81.788928, 83.88608, 85.983232, 88.08038400000001, 90.177536, 92.27468800000001, 94.37184, 96.468992, 98.56614400000001, 100.663296, 102.76044800000001, 104.8576, 106.95475200000001, 109.05190400000001, 111.149056, 113.24620800000001, 115.34336, 117.44051200000001, 119.537664, 121.634816, 123.73196800000001, 125.82912, 127.92627200000001, 130.023424, 132.120576, 134.21772800000002, 136.31488000000002, 138.412032, 140.509184, 142.606336, 144.70348800000002, 146.80064000000002, 148.897792, 150.994944, 153.092096, 155.18924800000002, 157.28640000000001, 159.383552, 161.480704, 163.577856, 165.67500800000002, 167.77216, 169.869312, 171.966464, 174.06361600000002, 176.16076800000002, 178.25792, 180.355072, 182.452224, 184.54937600000002, 186.64652800000002, 188.74368, 190.840832, 192.937984, 195.03513600000002, 197.13228800000002, 199.22944, 201.326592, 203.423744, 205.52089600000002, 207.61804800000002, 209.7152, 211.812352, 213.90950400000003, 216.00665600000002, 218.10380800000001, 220.20096, 222.298112, 224.39526400000003, 226.49241600000002, 228.589568, 230.68672, 232.783872, 234.88102400000002, 236.97817600000002, 239.075328, 241.17248, 243.269632, 245.36678400000002, 247.46393600000002, 249.561088, 251.65824, 253.75539200000003, 255.85254400000002, 257.949696, 260.046848, 262.144, 264.241152, 266.338304, 268.43545600000004, 270.53260800000004, 272.62976000000003, 274.726912, 276.824064, 278.921216, 281.018368, 283.11552, 285.212672, 287.309824, 289.40697600000004, 291.50412800000004, 293.60128000000003, 295.698432, 297.795584, 299.892736, 301.989888, 304.08704, 306.184192, 308.28134400000005, 310.37849600000004, 312.47564800000004, 314.57280000000003, 316.669952, 318.767104, 320.864256, 322.961408, 325.05856, 327.155712, 329.25286400000005, 331.35001600000004, 333.44716800000003, 335.54432, 337.641472, 339.738624, 341.835776, 343.932928, 346.03008, 348.12723200000005, 350.22438400000004], "y": [0.4970000088214874, 0.4790000021457672, 0.4839999973773956, 0.5059999823570251, 0.5109999775886536, 0.5070000290870667, 0.5099999904632568, 0.5239999890327454, 0.5239999890327454, 0.5120000243186951, 0.5339999794960022, 0.5220000147819519, 0.5189999938011169, 0.5210000276565552, 0.5260000228881836, 0.5389999747276306, 0.5249999761581421, 0.531000018119812, 0.527999997138977, 0.5299999713897705, 0.5230000019073486, 0.5289999842643738, 0.5450000166893005, 0.5419999957084656, 0.5329999923706055, 0.5400000214576721, 0.5270000100135803, 0.5299999713897705, 0.5379999876022339, 0.531000018119812, 0.5170000195503235, 0.5429999828338623, 0.5339999794960022, 0.5429999828338623, 0.5389999747276306, 0.5289999842643738, 0.5360000133514404, 0.5289999842643738, 0.5350000262260437, 0.5339999794960022, 0.5220000147819519, 0.5400000214576721, 0.5540000200271606, 0.5299999713897705, 0.5379999876022339, 0.531000018119812, 0.5440000295639038, 0.5329999923706055, 0.5400000214576721, 0.5299999713897705, 0.5429999828338623, 0.5299999713897705, 0.5329999923706055, 0.5419999957084656, 0.5339999794960022, 0.5450000166893005, 0.5260000228881836, 0.5379999876022339, 0.5490000247955322, 0.5529999732971191, 0.527999997138977, 0.5350000262260437, 0.5429999828338623, 0.5410000085830688, 0.5400000214576721, 0.5410000085830688, 0.5320000052452087, 0.5360000133514404, 0.5329999923706055, 0.5339999794960022, 0.5410000085830688, 0.5550000071525574, 0.546999990940094, 0.5360000133514404, 0.546999990940094, 0.5550000071525574, 0.5440000295639038, 0.5429999828338623, 0.5479999780654907, 0.5490000247955322, 0.5440000295639038, 0.5299999713897705, 0.546999990940094, 0.5529999732971191, 0.5429999828338623, 0.5419999957084656, 0.5460000038146973, 0.546999990940094, 0.546999990940094, 0.5440000295639038, 0.5460000038146973, 0.5509999990463257, 0.5460000038146973, 0.5479999780654907, 0.5640000104904175, 0.546999990940094, 0.5419999957084656, 0.550000011920929, 0.5540000200271606, 0.550000011920929, 0.5490000247955322, 0.5479999780654907, 0.546999990940094, 0.550000011920929, 0.5479999780654907, 0.550000011920929, 0.5509999990463257, 0.5440000295639038, 0.5580000281333923, 0.550000011920929, 0.5590000152587891, 0.5590000152587891, 0.5600000023841858, 0.550000011920929, 0.5609999895095825, 0.550000011920929, 0.5630000233650208, 0.5490000247955322, 0.5490000247955322, 0.5580000281333923, 0.5519999861717224, 0.5619999766349792, 0.5540000200271606, 0.5580000281333923, 0.5559999942779541, 0.5490000247955322, 0.5490000247955322, 0.5590000152587891, 0.5440000295639038, 0.550000011920929, 0.5540000200271606, 0.5590000152587891, 0.5529999732971191, 0.5540000200271606, 0.5559999942779541, 0.5540000200271606, 0.5619999766349792, 0.5509999990463257, 0.5699999928474426, 0.5529999732971191, 0.5490000247955322, 0.5529999732971191, 0.5580000281333923, 0.5540000200271606, 0.5590000152587891, 0.5590000152587891, 0.5590000152587891, 0.5509999990463257, 0.546999990940094, 0.5550000071525574, 0.5600000023841858, 0.5580000281333923, 0.5680000185966492, 0.5600000023841858, 0.5590000152587891, 0.5619999766349792, 0.5640000104904175, 0.5609999895095825, 0.5569999814033508, 0.5550000071525574, 0.5509999990463257, 0.5609999895095825, 0.5529999732971191, 0.5630000233650208, 0.5690000057220459, 0.5640000104904175, 0.5619999766349792, 0.5619999766349792], "label": "FineWeb: id mh + C4 filters"}, "big-run-sampled_full_filtered_no_dedup": {"x": [0.0, 2.0971520000000003, 4.194304000000001, 6.291456, 8.388608000000001, 10.48576, 12.582912, 14.680064000000002, 16.777216000000003, 18.874368, 20.97152, 23.068672000000003, 25.165824, 27.262976000000002, 29.360128000000003, 31.45728, 33.554432000000006, 35.651584, 37.748736, 39.845888, 41.94304, 44.040192000000005, 46.137344000000006, 48.234496, 50.331648, 52.4288, 54.525952000000004, 56.623104000000005, 58.720256000000006, 60.817408, 62.91456, 65.011712, 67.10886400000001, 69.206016, 71.303168, 73.40032000000001, 75.497472, 77.59462400000001, 79.691776, 81.788928, 83.88608, 85.983232, 88.08038400000001, 90.177536, 92.27468800000001, 94.37184, 96.468992, 98.56614400000001, 100.663296, 102.76044800000001, 104.8576, 106.95475200000001, 109.05190400000001, 111.149056, 113.24620800000001, 115.34336, 117.44051200000001, 119.537664, 121.634816, 123.73196800000001, 125.82912, 127.92627200000001, 130.023424, 132.120576, 134.21772800000002, 136.31488000000002, 138.412032, 140.509184, 142.606336, 144.70348800000002, 146.80064000000002, 148.897792, 150.994944, 153.092096, 155.18924800000002, 157.28640000000001, 159.383552, 161.480704, 163.577856, 165.67500800000002, 167.77216, 169.869312, 171.966464, 174.06361600000002, 176.16076800000002, 178.25792, 180.355072, 182.452224, 184.54937600000002, 186.64652800000002, 188.74368, 190.840832, 192.937984, 195.03513600000002, 197.13228800000002, 199.22944, 201.326592, 203.423744, 205.52089600000002, 207.61804800000002, 209.7152, 211.812352, 213.90950400000003, 216.00665600000002, 218.10380800000001, 220.20096, 222.298112, 224.39526400000003, 226.49241600000002, 228.589568, 230.68672, 232.783872, 234.88102400000002, 236.97817600000002, 239.075328, 241.17248, 243.269632, 245.36678400000002, 247.46393600000002, 249.561088, 251.65824, 253.75539200000003, 255.85254400000002, 257.949696, 260.046848, 262.144, 264.241152, 266.338304, 268.43545600000004, 270.53260800000004, 272.62976000000003, 274.726912, 276.824064, 278.921216, 281.018368, 283.11552, 285.212672, 287.309824, 289.40697600000004, 291.50412800000004, 293.60128000000003, 295.698432, 297.795584, 299.892736, 301.989888, 304.08704, 306.184192, 308.28134400000005, 310.37849600000004, 312.47564800000004, 314.57280000000003, 316.669952, 318.767104, 320.864256, 322.961408, 325.05856, 327.155712, 329.25286400000005, 331.35001600000004, 333.44716800000003, 335.54432, 337.641472, 339.738624, 341.835776, 343.932928, 346.03008, 348.12723200000005, 350.22438400000004], "y": [0.4970000088214874, 0.5239999890327454, 0.4900000095367431, 0.5040000081062317, 0.5099999904632568, 0.4990000128746032, 0.5170000195503235, 0.5040000081062317, 0.5009999871253967, 0.5230000019073486, 0.5109999775886536, 0.5059999823570251, 0.5130000114440918, 0.5090000033378601, 0.5220000147819519, 0.5189999938011169, 0.5180000066757202, 0.5220000147819519, 0.5120000243186951, 0.5460000038146973, 0.5239999890327454, 0.5289999842643738, 0.5440000295639038, 0.5339999794960022, 0.5299999713897705, 0.5260000228881836, 0.5360000133514404, 0.5339999794960022, 0.5360000133514404, 0.5299999713897705, 0.5180000066757202, 0.5249999761581421, 0.5440000295639038, 0.5299999713897705, 0.5339999794960022, 0.5239999890327454, 0.527999997138977, 0.5139999985694885, 0.5289999842643738, 0.5360000133514404, 0.5260000228881836, 0.5389999747276306, 0.5460000038146973, 0.5270000100135803, 0.5339999794960022, 0.5320000052452087, 0.5329999923706055, 0.5260000228881836, 0.5220000147819519, 0.5260000228881836, 0.5379999876022339, 0.5410000085830688, 0.5350000262260437, 0.5389999747276306, 0.5320000052452087, 0.5389999747276306, 0.5379999876022339, 0.5329999923706055, 0.5270000100135803, 0.5170000195503235, 0.5329999923706055, 0.5370000004768372, 0.5379999876022339, 0.5249999761581421, 0.5479999780654907, 0.546999990940094, 0.5400000214576721, 0.5440000295639038, 0.5360000133514404, 0.5450000166893005, 0.5440000295639038, 0.5370000004768372, 0.5370000004768372, 0.5479999780654907, 0.5379999876022339, 0.5400000214576721, 0.5479999780654907, 0.5379999876022339, 0.5509999990463257, 0.5440000295639038, 0.5379999876022339, 0.550000011920929, 0.5389999747276306, 0.5370000004768372, 0.5379999876022339, 0.5419999957084656, 0.5360000133514404, 0.5509999990463257, 0.5360000133514404, 0.5419999957084656, 0.5419999957084656, 0.550000011920929, 0.5360000133514404, 0.5519999861717224, 0.5540000200271606, 0.546999990940094, 0.5370000004768372, 0.5379999876022339, 0.5519999861717224, 0.5329999923706055, 0.5400000214576721, 0.5429999828338623, 0.550000011920929, 0.5490000247955322, 0.5360000133514404, 0.550000011920929, 0.5569999814033508, 0.5490000247955322, 0.5490000247955322, 0.5479999780654907, 0.5350000262260437, 0.5490000247955322, 0.5370000004768372, 0.5440000295639038, 0.5329999923706055, 0.5440000295639038, 0.5429999828338623, 0.5389999747276306, 0.5450000166893005, 0.5320000052452087, 0.5450000166893005, 0.5400000214576721, 0.5419999957084656, 0.5460000038146973, 0.5370000004768372, 0.5400000214576721, 0.5460000038146973, 0.5370000004768372, 0.5370000004768372, 0.5460000038146973, 0.5400000214576721, 0.5490000247955322, 0.5529999732971191, 0.5379999876022339, 0.5460000038146973, 0.5450000166893005, 0.5429999828338623, 0.5460000038146973, 0.5400000214576721, 0.5479999780654907, 0.5460000038146973, 0.5540000200271606, 0.5400000214576721, 0.5350000262260437, 0.5490000247955322, 0.5460000038146973, 0.5460000038146973, 0.5509999990463257, 0.5410000085830688, 0.5429999828338623, 0.5379999876022339, 0.5450000166893005, 0.5389999747276306, 0.5400000214576721, 0.5400000214576721, 0.550000011920929, 0.5440000295639038, 0.5389999747276306, 0.5450000166893005, 0.5400000214576721, 0.5389999747276306, 0.5419999957084656, 0.5410000085830688, 0.5440000295639038, 0.5519999861717224, 0.5479999780654907, 0.5450000166893005, 0.5569999814033508], "label": "FineWeb: base filtering only"}}, "layout": {"title": {"text": "The different FineWeb processing steps"}}}
assets/data/plots/c4_filters_hellaswag/agg_score.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"data": {"sm-baseline-c4": {"x": [0.0, 2.0971520000000003, 4.194304000000001, 6.291456, 8.388608000000001, 10.48576, 12.582912, 14.680064000000002, 16.777216000000003, 18.874368, 20.97152, 23.068672000000003, 25.165824, 27.262976000000002, 28.311552000000002], "y": [0.3308933284133672, 0.3580685469011466, 0.3740996705989043, 0.39048008372386295, 0.39857714250683784, 0.40837346265713376, 0.4111154315372308, 0.41773712386687595, 0.4196594481666882, 0.42379963273803395, 0.4276047808428605, 0.42980752388636273, 0.43098293244838715, 0.43155378103256226, 0.4327609067161878], "label": "C4"}, "filtering-c4-all": {"x": [0.0, 2.0971520000000003, 4.194304000000001, 6.291456, 8.388608000000001, 10.48576, 12.582912, 14.680064000000002, 16.777216000000003, 18.874368, 20.97152, 23.068672000000003, 25.165824, 27.262976000000002, 28.311552000000002], "y": [0.3308933284133672, 0.36066408455371857, 0.3812380563467741, 0.394003426656127, 0.40062618628144264, 0.4117735456675291, 0.4165923688560724, 0.4175422675907612, 0.42100309208035464, 0.42246321588754654, 0.42360376194119453, 0.42823668196797365, 0.4299001637846231, 0.4302353039383888, 0.4310380257666111], "label": "All filters"}, "filtering-c4-all-except-terminal_punct": {"x": [0.0, 2.0971520000000003, 4.194304000000001, 6.291456, 8.388608000000001, 10.48576, 12.582912, 14.680064000000002, 16.777216000000003, 18.874368, 20.97152, 23.068672000000003, 25.165824, 27.262976000000002, 28.311552000000002], "y": [0.330924579873681, 0.35825083684176207, 0.37912008538842196, 0.38942993618547916, 0.3983491826802492, 0.4053049590438604, 0.4079726096242666, 0.4135104585438967, 0.41717425361275673, 0.41904263757169247, 0.4211529679596424, 0.4212619122117758, 0.42373160831630224, 0.42435371689498425, 0.4279126934707165], "label": "All filters except terminal_punct"}, "filtering-c4-terminal_punct": {"x": [0.0, 2.0971520000000003, 4.194304000000001, 6.291456, 8.388608000000001, 10.48576, 12.582912, 14.680064000000002, 16.777216000000003, 18.874368, 20.97152, 23.068672000000003, 25.165824, 27.262976000000002, 28.311552000000002], "y": [0.3308933284133672, 0.36182260885834694, 0.3764855917543173, 0.3928546328097582, 0.3978128544986248, 0.4073755294084549, 0.4112890623509884, 0.41486112400889397, 0.4196756165474653, 0.4235504809767008, 0.42218128964304924, 0.4228535555303097, 0.4249562546610832, 0.42740595713257784, 0.42711055465042586], "label": "terminal_punct filter"}, "filtering-c4-curly_bracket": {"x": [0.0, 2.0971520000000003, 4.194304000000001, 6.291456, 8.388608000000001, 10.48576, 12.582912, 14.680064000000002, 16.777216000000003, 18.874368, 20.97152, 23.068672000000003, 25.165824, 27.262976000000002, 28.311552000000002], "y": [0.3308933284133672, 0.3583905678242445, 0.38119001872837543, 0.3873079549521208, 0.39723034016788, 0.4043100867420435, 0.40908974781632423, 0.4140731003135443, 0.41894380562007427, 0.41736695170402527, 0.4232212919741869, 0.4229240976274013, 0.4236308634281158, 0.42750727012753487, 0.4268195778131485], "label": "curly_bracket filter"}, "filtering-c4-word_lengths": {"x": [0.0, 2.0971520000000003, 4.194304000000001, 6.291456, 8.388608000000001, 10.48576, 12.582912, 14.680064000000002, 16.777216000000003, 18.874368, 20.97152, 23.068672000000003, 25.165824, 27.262976000000002, 28.311552000000002], "y": [0.3308933284133672, 0.36000680737197394, 0.37551611103117466, 0.38802069239318365, 0.3933942876756191, 0.4043118376284838, 0.40780537389218807, 0.4112964067608118, 0.4137573726475239, 0.41791345551609993, 0.4173779133707285, 0.42117033526301384, 0.42073468305170536, 0.42412591539323324, 0.4260616712272167], "label": "word_lengths filter"}, "filtering-baseline-2019-18-60gt": {"x": [0.0, 2.0971520000000003, 4.194304000000001, 6.291456, 8.388608000000001, 10.48576, 12.582912, 14.680064000000002, 16.777216000000003, 18.874368, 20.97152, 23.068672000000003, 25.165824, 27.262976000000002, 28.311552000000002], "y": [0.3308296035975218, 0.35613923892378807, 0.3746252153068781, 0.38806260935962195, 0.39690930768847466, 0.4043668694794178, 0.40220927633345127, 0.41070565767586226, 0.41399387270212173, 0.4170555509626865, 0.42098715901374817, 0.4210818205028772, 0.42051274701952934, 0.424176013097167, 0.4225243702530861], "label": "baseline"}}, "layout": {"title": {"text": "C4 filtering effect on HellaSwag"}}}
assets/data/plots/c4_filters_hellaswag/arc_acc_norm.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"data": {"sm-baseline-c4": {"x": [0.0, 2.0971520000000003, 4.194304000000001, 6.291456, 8.388608000000001, 10.48576, 12.582912, 14.680064000000002, 16.777216000000003, 18.874368, 20.97152, 23.068672000000003, 25.165824, 27.262976000000002, 28.311552000000002], "y": [0.2509999871253967, 0.2928333381811778, 0.3191666702429453, 0.3451666633288066, 0.342166672150294, 0.35983332991600037, 0.35483332475026447, 0.3643333315849304, 0.3631666700045268, 0.3698333303133647, 0.3696666657924652, 0.37433333198229474, 0.3805000086625417, 0.3800000051657359, 0.3798333406448364], "label": "C4"}, "filtering-c4-curly_bracket": {"x": [0.0, 2.0971520000000003, 4.194304000000001, 6.291456, 8.388608000000001, 10.48576, 12.582912, 14.680064000000002, 16.777216000000003, 18.874368, 20.97152, 23.068672000000003, 25.165824, 27.262976000000002, 28.311552000000002], "y": [0.2509999871253967, 0.29250000417232513, 0.3184999972581863, 0.3297500014305115, 0.34450000524520874, 0.3512499928474426, 0.35724999010562897, 0.36375001072883606, 0.3665000051259994, 0.3684999942779541, 0.3712499886751175, 0.37375000119209284, 0.37800000607967377, 0.3840000033378601, 0.37950000166893005], "label": "curly_bracket filter"}, "filtering-baseline-2019-18-60gt": {"x": [0.0, 2.0971520000000003, 4.194304000000001, 6.291456, 8.388608000000001, 10.48576, 12.582912, 14.680064000000002, 16.777216000000003, 18.874368, 20.97152, 23.068672000000003, 25.165824, 27.262976000000002, 28.311552000000002], "y": [0.2509999871253967, 0.2905000001192093, 0.32549999654293055, 0.3307500034570694, 0.3467499911785126, 0.3500000089406967, 0.3452499955892563, 0.3622500002384185, 0.35999999940395355, 0.37024998664855957, 0.3684999942779541, 0.3675000071525574, 0.37249998748302454, 0.37675000727176666, 0.3760000020265579], "label": "baseline"}, "filtering-c4-word_lengths": {"x": [0.0, 2.0971520000000003, 4.194304000000001, 6.291456, 8.388608000000001, 10.48576, 12.582912, 14.680064000000002, 16.777216000000003, 18.874368, 20.97152, 23.068672000000003, 25.165824, 27.262976000000002, 28.311552000000002], "y": [0.2509999871253967, 0.28949999809265137, 0.3187499940395355, 0.33825001120567316, 0.35074999928474426, 0.3604999929666519, 0.36274999380111694, 0.3634999990463257, 0.3645000010728836, 0.3644999861717224, 0.3669999986886978, 0.3642500042915344, 0.3722499907016754, 0.37499999999999994, 0.37549999356269836], "label": "word_lengths filter"}, "filtering-c4-all-except-terminal_punct": {"x": [0.0, 2.0971520000000003, 4.194304000000001, 6.291456, 8.388608000000001, 10.48576, 12.582912, 14.680064000000002, 16.777216000000003, 18.874368, 20.97152, 23.068672000000003, 25.165824, 27.262976000000002, 28.311552000000002], "y": [0.2512499988079071, 0.294500008225441, 0.32725000381469727, 0.3352499902248382, 0.3504999876022339, 0.3487499952316284, 0.3557500094175339, 0.35324999690055847, 0.36374999582767487, 0.36474999785423273, 0.372749999165535, 0.36775000393390656, 0.3707500100135803, 0.3734999895095825, 0.375], "label": "All filters except terminal_punct"}, "filtering-c4-all": {"x": [0.0, 2.0971520000000003, 4.194304000000001, 6.291456, 8.388608000000001, 10.48576, 12.582912, 14.680064000000002, 16.777216000000003, 18.874368, 20.97152, 23.068672000000003, 25.165824, 27.262976000000002, 28.311552000000002], "y": [0.2509999871253967, 0.30024999380111694, 0.32724998891353607, 0.33374999463558197, 0.34574998915195465, 0.351749986410141, 0.36124999821186066, 0.3527500033378601, 0.3582500070333481, 0.35850000381469727, 0.36075000464916224, 0.364750012755394, 0.37049999833106995, 0.3729999959468841, 0.36974999308586115], "label": "All filters"}, "filtering-c4-terminal_punct": {"x": [0.0, 2.0971520000000003, 4.194304000000001, 6.291456, 8.388608000000001, 10.48576, 12.582912, 14.680064000000002, 16.777216000000003, 18.874368, 20.97152, 23.068672000000003, 25.165824, 27.262976000000002, 28.311552000000002], "y": [0.2509999871253967, 0.2947500050067901, 0.31974999606609344, 0.3344999998807907, 0.3445000052452087, 0.351500004529953, 0.35199999809265137, 0.35925000905990595, 0.3634999990463257, 0.36374999582767487, 0.36550000309944153, 0.36775000393390656, 0.3677499890327453, 0.36900000274181366, 0.36650000512599945], "label": "terminal_punct filter"}}, "layout": {"title": {"text": "C4 filtering effect on HellaSwag"}}}
assets/data/plots/c4_filters_hellaswag/commonsense_qa_acc_norm.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"data": {"filtering-c4-word_lengths": {"x": [0.0, 2.0971520000000003, 4.194304000000001, 6.291456, 8.388608000000001, 10.48576, 12.582912, 14.680064000000002, 16.777216000000003, 18.874368, 20.97152, 23.068672000000003, 25.165824, 27.262976000000002, 28.311552000000002], "y": [0.2329999953508377, 0.26349999010562897, 0.2824999988079071, 0.2985000014305115, 0.3050000071525574, 0.3119999915361404, 0.3110000044107437, 0.3164999932050705, 0.32199999690055847, 0.3279999941587448, 0.3365000039339065, 0.3375000059604645, 0.3384999930858612, 0.340499997138977, 0.341499999165535], "label": "word_lengths filter"}, "filtering-c4-curly_bracket": {"x": [0.0, 2.0971520000000003, 4.194304000000001, 6.291456, 8.388608000000001, 10.48576, 12.582912, 14.680064000000002, 16.777216000000003, 18.874368, 20.97152, 23.068672000000003, 25.165824, 27.262976000000002, 28.311552000000002], "y": [0.2329999953508377, 0.2619999945163727, 0.288000002503395, 0.29749999940395355, 0.30399999022483826, 0.3149999976158142, 0.3245000094175339, 0.3230000138282776, 0.3240000009536743, 0.3245000094175339, 0.33550000190734863, 0.335999995470047, 0.32999999821186066, 0.3375000059604645, 0.34049999713897705], "label": "curly_bracket filter"}, "filtering-c4-all": {"x": [0.0, 2.0971520000000003, 4.194304000000001, 6.291456, 8.388608000000001, 10.48576, 12.582912, 14.680064000000002, 16.777216000000003, 18.874368, 20.97152, 23.068672000000003, 25.165824, 27.262976000000002, 28.311552000000002], "y": [0.2329999953508377, 0.26299999654293055, 0.2864999920129776, 0.2944999933242798, 0.2985000014305115, 0.3165000081062317, 0.3194999992847442, 0.318000003695488, 0.32500000298023224, 0.32899999618530273, 0.3254999965429306, 0.33150000870227814, 0.3330000042915344, 0.33200000226497645, 0.3330000042915344], "label": "All filters"}, "filtering-c4-terminal_punct": {"x": [0.0, 2.0971520000000003, 4.194304000000001, 6.291456, 8.388608000000001, 10.48576, 12.582912, 14.680064000000002, 16.777216000000003, 18.874368, 20.97152, 23.068672000000003, 25.165824, 27.262976000000002, 28.311552000000002], "y": [0.2329999953508377, 0.2650000005960464, 0.28599999845027924, 0.3110000044107437, 0.2944999933242798, 0.3085000067949295, 0.32199999690055847, 0.31949999928474426, 0.3240000009536743, 0.32500000298023224, 0.3245000094175339, 0.32199999690055847, 0.3265000134706497, 0.3295000046491623, 0.32999999821186066], "label": "terminal_punct filter"}, "filtering-baseline-2019-18-60gt": {"x": [0.0, 2.0971520000000003, 4.194304000000001, 6.291456, 8.388608000000001, 10.48576, 12.582912, 14.680064000000002, 16.777216000000003, 18.874368, 20.97152, 23.068672000000003, 25.165824, 27.262976000000002, 28.311552000000002], "y": [0.2329999953508377, 0.2584999948740005, 0.2850000113248825, 0.30850000679492945, 0.30149999260902405, 0.31049999594688416, 0.3079999983310699, 0.3150000125169754, 0.32199999690055847, 0.3244999945163727, 0.3205000013113022, 0.3244999945163727, 0.3279999941587448, 0.33149999380111694, 0.32850000262260437], "label": "baseline"}, "sm-baseline-c4": {"x": [0.0, 2.0971520000000003, 4.194304000000001, 6.291456, 8.388608000000001, 10.48576, 12.582912, 14.680064000000002, 16.777216000000003, 18.874368, 20.97152, 23.068672000000003, 25.165824, 27.262976000000002, 28.311552000000002], "y": [0.2329999953508377, 0.25700000921885174, 0.2786666651566823, 0.2960000038146972, 0.3049999972184499, 0.3053333262602488, 0.3120000064373016, 0.31733333071072894, 0.3163333336512248, 0.3186666667461395, 0.3226666748523712, 0.3286666671435038, 0.3240000009536743, 0.32900000611941016, 0.3283333381017049], "label": "C4"}, "filtering-c4-all-except-terminal_punct": {"x": [0.0, 2.0971520000000003, 4.194304000000001, 6.291456, 8.388608000000001, 10.48576, 12.582912, 14.680064000000002, 16.777216000000003, 18.874368, 20.97152, 23.068672000000003, 25.165824, 27.262976000000002, 28.311552000000002], "y": [0.2329999953508377, 0.25800000131130213, 0.2849999964237213, 0.29200001060962677, 0.289000004529953, 0.30349999666213984, 0.30400000512599945, 0.3139999955892563, 0.3139999955892563, 0.318000003695488, 0.32299999892711634, 0.3174999952316284, 0.3215000033378601, 0.32250000536441803, 0.32549999654293055], "label": "All filters except terminal_punct"}}, "layout": {"title": {"text": "C4 filtering effect on HellaSwag"}}}
assets/data/plots/c4_filters_hellaswag/hellaswag_acc_norm.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"data": {"filtering-c4-all": {"x": [0.0, 2.0971520000000003, 4.194304000000001, 6.291456, 8.388608000000001, 10.48576, 12.582912, 14.680064000000002, 16.777216000000003, 18.874368, 20.97152, 23.068672000000003, 25.165824, 27.262976000000002, 28.311552000000002], "y": [0.257999986410141, 0.29950000345706934, 0.33799999952316284, 0.3789999932050705, 0.3970000147819519, 0.42149999737739563, 0.431999996304512, 0.4440000057220459, 0.4490000009536743, 0.45949999988079065, 0.4714999943971634, 0.48000000417232513, 0.47749999165534973, 0.48100000619888306, 0.48950000107288355], "label": "All filters"}, "sm-baseline-c4": {"x": [0.0, 2.0971520000000003, 4.194304000000001, 6.291456, 8.388608000000001, 10.48576, 12.582912, 14.680064000000002, 16.777216000000003, 18.874368, 20.97152, 23.068672000000003, 25.165824, 27.262976000000002, 28.311552000000002], "y": [0.257999986410141, 0.29699999094009394, 0.3369999925295512, 0.3699999948342641, 0.3930000066757202, 0.41233333945274353, 0.42733333508173627, 0.43799999356269836, 0.4506666660308838, 0.454666664203008, 0.47166667381922406, 0.47766666611035663, 0.476666659116745, 0.48366666833559663, 0.4853333334128062], "label": "C4"}, "filtering-c4-all-except-terminal_punct": {"x": [0.0, 2.0971520000000003, 4.194304000000001, 6.291456, 8.388608000000001, 10.48576, 12.582912, 14.680064000000002, 16.777216000000003, 18.874368, 20.97152, 23.068672000000003, 25.165824, 27.262976000000002, 28.311552000000002], "y": [0.257999986410141, 0.2955000102519989, 0.3385000079870224, 0.36800000071525574, 0.40099999308586115, 0.4099999964237213, 0.41700001060962677, 0.42400000989437103, 0.4389999955892563, 0.4414999932050705, 0.4484999924898147, 0.455499991774559, 0.45799998939037323, 0.4660000056028366, 0.471000000834465], "label": "All filters except terminal_punct"}, "filtering-c4-terminal_punct": {"x": [0.0, 2.0971520000000003, 4.194304000000001, 6.291456, 8.388608000000001, 10.48576, 12.582912, 14.680064000000002, 16.777216000000003, 18.874368, 20.97152, 23.068672000000003, 25.165824, 27.262976000000002, 28.311552000000002], "y": [0.257999986410141, 0.2939999997615814, 0.3295000046491623, 0.3684999942779541, 0.38449999690055847, 0.398499995470047, 0.3959999978542328, 0.4204999953508377, 0.4335000067949295, 0.445499986410141, 0.443000003695488, 0.455499991774559, 0.45250000059604645, 0.4529999941587448, 0.4545000046491623], "label": "terminal_punct filter"}, "filtering-c4-word_lengths": {"x": [0.0, 2.0971520000000003, 4.194304000000001, 6.291456, 8.388608000000001, 10.48576, 12.582912, 14.680064000000002, 16.777216000000003, 18.874368, 20.97152, 23.068672000000003, 25.165824, 27.262976000000002, 28.311552000000002], "y": [0.257999986410141, 0.29100000858306885, 0.32400000095367426, 0.3439999967813492, 0.3575000017881393, 0.3800000101327896, 0.40049999952316284, 0.4134999960660934, 0.42099998891353607, 0.4204999953508377, 0.4280000030994415, 0.44099999964237213, 0.43799999356269836, 0.44200000166893005, 0.44600000977516174], "label": "word_lengths filter"}, "filtering-c4-curly_bracket": {"x": [0.0, 2.0971520000000003, 4.194304000000001, 6.291456, 8.388608000000001, 10.48576, 12.582912, 14.680064000000002, 16.777216000000003, 18.874368, 20.97152, 23.068672000000003, 25.165824, 27.262976000000002, 28.311552000000002], "y": [0.257999986410141, 0.29749999940395355, 0.3240000009536743, 0.34849999845027924, 0.3725000023841858, 0.3895000070333481, 0.39800000190734863, 0.41000001132488245, 0.4214999973773956, 0.42149999737739563, 0.42499999701976776, 0.42750000953674316, 0.4364999979734421, 0.4354999959468841, 0.4385000020265579], "label": "curly_bracket filter"}, "filtering-baseline-2019-18-60gt": {"x": [0.0, 2.0971520000000003, 4.194304000000001, 6.291456, 8.388608000000001, 10.48576, 12.582912, 14.680064000000002, 16.777216000000003, 18.874368, 20.97152, 23.068672000000003, 25.165824, 27.262976000000002, 28.311552000000002], "y": [0.257999986410141, 0.28949999809265137, 0.32599999010562897, 0.34450000524520874, 0.3725000023841858, 0.38500000536441803, 0.39499999582767487, 0.408500000834465, 0.41700001060962677, 0.4174999892711639, 0.4284999966621399, 0.42849999666213984, 0.43150000274181366, 0.4399999976158142, 0.4375], "label": "baseline"}}, "layout": {"title": {"text": "C4 filtering effect on HellaSwag"}}}
assets/data/plots/c4_filters_hellaswag/index.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"files": {"agg_score": {"file": "agg_score.json"}, "commonsense_qa/acc_norm": {"file": "commonsense_qa_acc_norm.json"}, "hellaswag/acc_norm": {"file": "hellaswag_acc_norm.json"}, "openbookqa/acc_norm": {"file": "openbookqa_acc_norm.json"}, "piqa/acc_norm": {"file": "piqa_acc_norm.json"}, "siqa/acc_norm": {"file": "siqa_acc_norm.json"}, "winogrande/acc_norm": {"file": "winogrande_acc_norm.json"}, "arc/acc_norm": {"file": "arc_acc_norm.json"}, "mmlu/acc_norm": {"file": "mmlu_acc_norm.json"}}, "settings": {"defaultMetric": "hellaswag/acc_norm", "slider": {"min": 0, "max": 10, "default": 3}}}
assets/data/plots/c4_filters_hellaswag/mmlu_acc_norm.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"data": {"filtering-baseline-2019-18-60gt": {"x": [0.0, 2.0971520000000003, 4.194304000000001, 6.291456, 8.388608000000001, 10.48576, 12.582912, 14.680064000000002, 16.777216000000003, 18.874368, 20.97152, 23.068672000000003, 25.165824, 27.262976000000002, 28.311552000000002], "y": [0.25013685226440424, 0.25661391019821167, 0.2620016932487488, 0.2657508552074432, 0.2710244506597519, 0.2744349539279938, 0.27642421424388885, 0.2818952649831772, 0.2794509679079056, 0.2831944525241852, 0.28439727425575256, 0.2866545617580414, 0.2866020053625107, 0.28615814447402954, 0.2871949374675751], "label": "baseline"}, "filtering-c4-word_lengths": {"x": [0.0, 2.0971520000000003, 4.194304000000001, 6.291456, 8.388608000000001, 10.48576, 12.582912, 14.680064000000002, 16.777216000000003, 18.874368, 20.97152, 23.068672000000003, 25.165824, 27.262976000000002, 28.311552000000002], "y": [0.2501466572284698, 0.25205445289611816, 0.2613788843154907, 0.26891554892063135, 0.2724043130874634, 0.27449470758438105, 0.27719296514987946, 0.27587129175662994, 0.2815589904785156, 0.2833077013492584, 0.2830233126878738, 0.28461267054080963, 0.2871275246143341, 0.28650729358196253, 0.2869933694601059], "label": "word_lengths filter"}, "filtering-c4-all-except-terminal_punct": {"x": [0.0, 2.0971520000000003, 4.194304000000001, 6.291456, 8.388608000000001, 10.48576, 12.582912, 14.680064000000002, 16.777216000000003, 18.874368, 20.97152, 23.068672000000003, 25.165824, 27.262976000000002, 28.311552000000002], "y": [0.2501466572284698, 0.25500668585300446, 0.26221066713333124, 0.26368947327136993, 0.2702934741973877, 0.27218967676162714, 0.27553085982799524, 0.27833363413810724, 0.2786440253257751, 0.2810910940170288, 0.2834737300872803, 0.2833452969789505, 0.2836028486490249, 0.28682972490787506, 0.2868015915155411], "label": "All filters except terminal_punct"}, "filtering-c4-curly_bracket": {"x": [0.0, 2.0971520000000003, 4.194304000000001, 6.291456, 8.388608000000001, 10.48576, 12.582912, 14.680064000000002, 16.777216000000003, 18.874368, 20.97152, 23.068672000000003, 25.165824, 27.262976000000002, 28.311552000000002], "y": [0.2501466572284698, 0.25762456655502314, 0.2630201578140259, 0.2672136425971985, 0.27234274148941034, 0.2702306807041168, 0.27446796000003815, 0.27583475410938263, 0.2770504504442215, 0.2794356495141983, 0.28302033245563507, 0.28214274346828455, 0.2855468988418579, 0.2840581685304642, 0.28505663573741913], "label": "curly_bracket filter"}, "sm-baseline-c4": {"x": [0.0, 2.0971520000000003, 4.194304000000001, 6.291456, 8.388608000000001, 10.48576, 12.582912, 14.680064000000002, 16.777216000000003, 18.874368, 20.97152, 23.068672000000003, 25.165824, 27.262976000000002, 28.311552000000002], "y": [0.2501466572284698, 0.2557150324185689, 0.25763070583343506, 0.2643406589825948, 0.26745049158732087, 0.2721543808778127, 0.2737567722797394, 0.2732303539911906, 0.27877557277679443, 0.27923040588696796, 0.2798382341861725, 0.2831268608570099, 0.28203009565671283, 0.2810969154040019, 0.28292057911554974], "label": "C4"}, "filtering-c4-terminal_punct": {"x": [0.0, 2.0971520000000003, 4.194304000000001, 6.291456, 8.388608000000001, 10.48576, 12.582912, 14.680064000000002, 16.777216000000003, 18.874368, 20.97152, 23.068672000000003, 25.165824, 27.262976000000002, 28.311552000000002], "y": [0.2501466572284698, 0.2583308666944504, 0.2611347585916519, 0.26333703100681305, 0.2685028165578842, 0.2725042402744293, 0.27531248331069946, 0.27463899552822113, 0.2784048914909363, 0.27915388345718384, 0.27945026755332947, 0.28207844495773315, 0.281900018453598, 0.2822476774454117, 0.28188446164131165], "label": "terminal_punct filter"}, "filtering-c4-all": {"x": [0.0, 2.0971520000000003, 4.194304000000001, 6.291456, 8.388608000000001, 10.48576, 12.582912, 14.680064000000002, 16.777216000000003, 18.874368, 20.97152, 23.068672000000003, 25.165824, 27.262976000000002, 28.311552000000002], "y": [0.2501466572284698, 0.25806266069412226, 0.26165445148944855, 0.26727744936943054, 0.2677594721317291, 0.2689383774995804, 0.2724889665842056, 0.27308812737464905, 0.27327476441860193, 0.27370570600032806, 0.277080088853836, 0.27814342081546783, 0.2782013118267059, 0.27888238430023193, 0.2795541882514953], "label": "All filters"}}, "layout": {"title": {"text": "C4 filtering effect on HellaSwag"}}}
assets/data/plots/c4_filters_hellaswag/openbookqa_acc_norm.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"data": {"filtering-baseline-2019-18-60gt": {"x": [0.0, 2.0971520000000003, 4.194304000000001, 6.291456, 8.388608000000001, 10.48576, 12.582912, 14.680064000000002, 16.777216000000003, 18.874368, 20.97152, 23.068672000000003, 25.165824, 27.262976000000002, 28.311552000000002], "y": [0.2860000133514404, 0.2559999972581863, 0.27699999511241913, 0.288000002503395, 0.2980000078678131, 0.31199999153614044, 0.29500000178813934, 0.3139999955892563, 0.31199999153614044, 0.31200000643730164, 0.3369999974966049, 0.32899999618530273, 0.3200000077486038, 0.3310000002384186, 0.3330000042915344], "label": "baseline"}, "filtering-c4-all": {"x": [0.0, 2.0971520000000003, 4.194304000000001, 6.291456, 8.388608000000001, 10.48576, 12.582912, 14.680064000000002, 16.777216000000003, 18.874368, 20.97152, 23.068672000000003, 25.165824, 27.262976000000002, 28.311552000000002], "y": [0.2860000133514404, 0.2590000033378601, 0.278999999165535, 0.2979999929666519, 0.29899999499320984, 0.3270000070333481, 0.32800000905990595, 0.32899999618530273, 0.3369999974966049, 0.33200000226497645, 0.3260000050067901, 0.33599999547004694, 0.335999995470047, 0.33500000834465027, 0.3330000042915344], "label": "All filters"}, "filtering-c4-word_lengths": {"x": [0.0, 2.0971520000000003, 4.194304000000001, 6.291456, 8.388608000000001, 10.48576, 12.582912, 14.680064000000002, 16.777216000000003, 18.874368, 20.97152, 23.068672000000003, 25.165824, 27.262976000000002, 28.311552000000002], "y": [0.2860000133514404, 0.2690000087022781, 0.27300000190734863, 0.28599999845027924, 0.28299999237060547, 0.3050000071525574, 0.30900000035762787, 0.31199999153614044, 0.3200000077486038, 0.33200000226497645, 0.31200000643730164, 0.3230000138282776, 0.32299999892711634, 0.32899999618530273, 0.3320000022649765], "label": "word_lengths filter"}, "sm-baseline-c4": {"x": [0.0, 2.0971520000000003, 4.194304000000001, 6.291456, 8.388608000000001, 10.48576, 12.582912, 14.680064000000002, 16.777216000000003, 18.874368, 20.97152, 23.068672000000003, 25.165824, 27.262976000000002, 28.311552000000002], "y": [0.2860000133514404, 0.2526666720708211, 0.26533332467079157, 0.26600000262260437, 0.29333333174387616, 0.3059999942779541, 0.30933333436648053, 0.31600000460942584, 0.31466667850812274, 0.32933333516120905, 0.3346666693687439, 0.3366666634877522, 0.3386666675408681, 0.33799999952316284, 0.33066666126251215], "label": "C4"}, "filtering-c4-terminal_punct": {"x": [0.0, 2.0971520000000003, 4.194304000000001, 6.291456, 8.388608000000001, 10.48576, 12.582912, 14.680064000000002, 16.777216000000003, 18.874368, 20.97152, 23.068672000000003, 25.165824, 27.262976000000002, 28.311552000000002], "y": [0.2860000133514404, 0.26900000870227814, 0.27400000393390656, 0.2929999977350235, 0.29600000381469727, 0.306999996304512, 0.3199999928474426, 0.3190000057220459, 0.31299999356269836, 0.3229999989271164, 0.3210000097751617, 0.3270000070333481, 0.3230000138282776, 0.33399999141693115, 0.3260000050067901], "label": "terminal_punct filter"}, "filtering-c4-curly_bracket": {"x": [0.0, 2.0971520000000003, 4.194304000000001, 6.291456, 8.388608000000001, 10.48576, 12.582912, 14.680064000000002, 16.777216000000003, 18.874368, 20.97152, 23.068672000000003, 25.165824, 27.262976000000002, 28.311552000000002], "y": [0.2860000133514404, 0.25800000131130213, 0.29899999499320984, 0.27900001406669617, 0.296999990940094, 0.2980000078678131, 0.3149999976158142, 0.3179999887943268, 0.32500000298023224, 0.3079999983310699, 0.32900001108646393, 0.32599999010562897, 0.3190000057220459, 0.3279999941587448, 0.3229999989271164], "label": "curly_bracket filter"}, "filtering-c4-all-except-terminal_punct": {"x": [0.0, 2.0971520000000003, 4.194304000000001, 6.291456, 8.388608000000001, 10.48576, 12.582912, 14.680064000000002, 16.777216000000003, 18.874368, 20.97152, 23.068672000000003, 25.165824, 27.262976000000002, 28.311552000000002], "y": [0.2860000133514404, 0.2500000074505806, 0.2759999930858612, 0.2800000011920929, 0.29099999368190765, 0.3070000112056732, 0.3070000112056732, 0.3229999989271164, 0.3240000009536743, 0.31700000166893005, 0.3100000023841858, 0.31300000846385956, 0.31700000166893005, 0.3100000023841858, 0.3189999908208847], "label": "All filters except terminal_punct"}}, "layout": {"title": {"text": "C4 filtering effect on HellaSwag"}}}
assets/data/plots/c4_filters_hellaswag/piqa_acc_norm.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"data": {"sm-baseline-c4": {"x": [0.0, 2.0971520000000003, 4.194304000000001, 6.291456, 8.388608000000001, 10.48576, 12.582912, 14.680064000000002, 16.777216000000003, 18.874368, 20.97152, 23.068672000000003, 25.165824, 27.262976000000002, 28.311552000000002], "y": [0.5099999904632568, 0.6196666558583578, 0.6583333412806193, 0.6833333373069763, 0.6829999883969625, 0.6983333230018616, 0.702999989191691, 0.7056666612625122, 0.7076666553815206, 0.7139999866485596, 0.7209999958674113, 0.7179999947547913, 0.7273333470026652, 0.7209999958674113, 0.7273333271344503], "label": "C4"}, "filtering-c4-all-except-terminal_punct": {"x": [0.0, 2.0971520000000003, 4.194304000000001, 6.291456, 8.388608000000001, 10.48576, 12.582912, 14.680064000000002, 16.777216000000003, 18.874368, 20.97152, 23.068672000000003, 25.165824, 27.262976000000002, 28.311552000000002], "y": [0.5099999904632568, 0.621999979019165, 0.6520000100135803, 0.6800000071525574, 0.6895000040531158, 0.6949999928474426, 0.6990000009536743, 0.7045000195503235, 0.7114999890327454, 0.710999995470047, 0.7159999907016754, 0.7199999988079071, 0.7199999988079071, 0.7204999923706055, 0.7254999876022339], "label": "All filters except terminal_punct"}, "filtering-c4-curly_bracket": {"x": [0.0, 2.0971520000000003, 4.194304000000001, 6.291456, 8.388608000000001, 10.48576, 12.582912, 14.680064000000002, 16.777216000000003, 18.874368, 20.97152, 23.068672000000003, 25.165824, 27.262976000000002, 28.311552000000002], "y": [0.5099999904632568, 0.6149999797344208, 0.6520000100135803, 0.6789999902248383, 0.69200000166893, 0.6949999928474426, 0.6955000162124634, 0.7055000066757202, 0.7150000035762787, 0.7169999778270721, 0.7184999883174896, 0.7235000133514404, 0.7240000069141388, 0.723499983549118, 0.7249999940395355], "label": "curly_bracket filter"}, "filtering-c4-terminal_punct": {"x": [0.0, 2.0971520000000003, 4.194304000000001, 6.291456, 8.388608000000001, 10.48576, 12.582912, 14.680064000000002, 16.777216000000003, 18.874368, 20.97152, 23.068672000000003, 25.165824, 27.262976000000002, 28.311552000000002], "y": [0.5099999904632568, 0.621999979019165, 0.6549999713897705, 0.6695000231266022, 0.6860000193119049, 0.6994999945163727, 0.6980000138282776, 0.7084999978542328, 0.7120000123977661, 0.7124999761581421, 0.7160000205039978, 0.7179999947547913, 0.7195000052452087, 0.7229999899864197, 0.723499983549118], "label": "terminal_punct filter"}, "filtering-c4-all": {"x": [0.0, 2.0971520000000003, 4.194304000000001, 6.291456, 8.388608000000001, 10.48576, 12.582912, 14.680064000000002, 16.777216000000003, 18.874368, 20.97152, 23.068672000000003, 25.165824, 27.262976000000002, 28.311552000000002], "y": [0.5099999904632568, 0.6215000152587891, 0.6580000221729279, 0.6784999966621399, 0.69200000166893, 0.703499972820282, 0.7029999792575836, 0.710999995470047, 0.7139999866485596, 0.7179999947547913, 0.7150000035762787, 0.715499997138977, 0.7184999883174896, 0.7160000205039978, 0.7224999964237213], "label": "All filters"}, "filtering-c4-word_lengths": {"x": [0.0, 2.0971520000000003, 4.194304000000001, 6.291456, 8.388608000000001, 10.48576, 12.582912, 14.680064000000002, 16.777216000000003, 18.874368, 20.97152, 23.068672000000003, 25.165824, 27.262976000000002, 28.311552000000002], "y": [0.5099999904632568, 0.6229999959468842, 0.6590000092983246, 0.6714999973773956, 0.6820000112056732, 0.6949999928474426, 0.6940000057220459, 0.7064999938011169, 0.7005000114440918, 0.6989999711513519, 0.7084999978542328, 0.7060000002384186, 0.7099999785423279, 0.7160000205039978, 0.7150000035762787], "label": "word_lengths filter"}, "filtering-baseline-2019-18-60gt": {"x": [0.0, 2.0971520000000003, 4.194304000000001, 6.291456, 8.388608000000001, 10.48576, 12.582912, 14.680064000000002, 16.777216000000003, 18.874368, 20.97152, 23.068672000000003, 25.165824, 27.262976000000002, 28.311552000000002], "y": [0.5099999904632568, 0.6105000078678131, 0.6350000202655792, 0.6620000004768372, 0.675000011920929, 0.6940000057220459, 0.6974999904632568, 0.7054999768733978, 0.7060000002384186, 0.7059999704360962, 0.7084999978542328, 0.7060000002384186, 0.7084999978542328, 0.7144999802112579, 0.7134999930858612], "label": "baseline"}}, "layout": {"title": {"text": "C4 filtering effect on HellaSwag"}}}
assets/data/plots/c4_filters_hellaswag/siqa_acc_norm.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"data": {"sm-baseline-c4": {"x": [0.0, 2.0971520000000003, 4.194304000000001, 6.291456, 8.388608000000001, 10.48576, 12.582912, 14.680064000000002, 16.777216000000003, 18.874368, 20.97152, 23.068672000000003, 25.165824, 27.262976000000002, 28.311552000000002], "y": [0.3619999885559082, 0.39633333683013916, 0.3893333276112874, 0.3933333357175191, 0.39800000190734863, 0.4013333320617676, 0.4010000030199687, 0.4059999982515971, 0.41100000341733295, 0.4063333372275035, 0.40433333317438763, 0.404666672150294, 0.3993333379427592, 0.4053333302338918, 0.40800000230471295], "label": "C4"}, "filtering-c4-terminal_punct": {"x": [0.0, 2.0971520000000003, 4.194304000000001, 6.291456, 8.388608000000001, 10.48576, 12.582912, 14.680064000000002, 16.777216000000003, 18.874368, 20.97152, 23.068672000000003, 25.165824, 27.262976000000002, 28.311552000000002], "y": [0.3619999885559082, 0.4010000079870224, 0.3974999934434891, 0.39499999582767487, 0.403999999165535, 0.40299999713897705, 0.4095000028610229, 0.4074999988079071, 0.4065000116825104, 0.4074999988079071, 0.4050000011920929, 0.3999999910593033, 0.40700000524520874, 0.4050000011920929, 0.40799999237060547], "label": "terminal_punct filter"}, "filtering-c4-all": {"x": [0.0, 2.0971520000000003, 4.194304000000001, 6.291456, 8.388608000000001, 10.48576, 12.582912, 14.680064000000002, 16.777216000000003, 18.874368, 20.97152, 23.068672000000003, 25.165824, 27.262976000000002, 28.311552000000002], "y": [0.3619999885559082, 0.3955000042915344, 0.40049999952316284, 0.3945000022649765, 0.40700000524520874, 0.4010000079870224, 0.4025000035762787, 0.39650000631809235, 0.4004999995231628, 0.4020000100135803, 0.40150000154972076, 0.40950000286102295, 0.4080000072717666, 0.40600000321865076, 0.40750001370906824], "label": "All filters"}, "filtering-c4-all-except-terminal_punct": {"x": [0.0, 2.0971520000000003, 4.194304000000001, 6.291456, 8.388608000000001, 10.48576, 12.582912, 14.680064000000002, 16.777216000000003, 18.874368, 20.97152, 23.068672000000003, 25.165824, 27.262976000000002, 28.311552000000002], "y": [0.3619999885559082, 0.3994999974966049, 0.392999991774559, 0.40350000560283655, 0.3939999938011169, 0.39650000631809235, 0.39450000226497645, 0.4025000035762787, 0.39900000393390656, 0.3985000103712082, 0.4030000120401382, 0.3969999998807907, 0.40150000154972076, 0.40049999952316284, 0.4025000035762787], "label": "All filters except terminal_punct"}, "filtering-c4-curly_bracket": {"x": [0.0, 2.0971520000000003, 4.194304000000001, 6.291456, 8.388608000000001, 10.48576, 12.582912, 14.680064000000002, 16.777216000000003, 18.874368, 20.97152, 23.068672000000003, 25.165824, 27.262976000000002, 28.311552000000002], "y": [0.3619999885559082, 0.4009999930858612, 0.4024999886751175, 0.3935000002384186, 0.3904999941587448, 0.4040000140666961, 0.4035000056028366, 0.40449999272823334, 0.4079999923706054, 0.40049999952316284, 0.3985000103712082, 0.39750000834465027, 0.39799998700618744, 0.3995000123977661, 0.39699999988079065], "label": "curly_bracket filter"}, "filtering-c4-word_lengths": {"x": [0.0, 2.0971520000000003, 4.194304000000001, 6.291456, 8.388608000000001, 10.48576, 12.582912, 14.680064000000002, 16.777216000000003, 18.874368, 20.97152, 23.068672000000003, 25.165824, 27.262976000000002, 28.311552000000002], "y": [0.3619999885559082, 0.40150000154972076, 0.39549998939037323, 0.3969999998807907, 0.3974999934434891, 0.3959999978542328, 0.39750000834465027, 0.39549998939037323, 0.3895000070333481, 0.3994999974966049, 0.3980000019073486, 0.4000000059604645, 0.39100000262260437, 0.39250001311302185, 0.39499999582767487], "label": "word_lengths filter"}, "filtering-baseline-2019-18-60gt": {"x": [0.0, 2.0971520000000003, 4.194304000000001, 6.291456, 8.388608000000001, 10.48576, 12.582912, 14.680064000000002, 16.777216000000003, 18.874368, 20.97152, 23.068672000000003, 25.165824, 27.262976000000002, 28.311552000000002], "y": [0.36149999499320984, 0.39800000190734863, 0.3970000147819519, 0.4000000059604645, 0.39799998700618744, 0.408500000834465, 0.39400000870227814, 0.392999991774559, 0.40450000762939453, 0.4070000052452087, 0.39950001239776606, 0.3994999974966049, 0.3949999958276748, 0.398499995470047, 0.3920000046491623], "label": "baseline"}}, "layout": {"title": {"text": "C4 filtering effect on HellaSwag"}}}
assets/data/plots/c4_filters_hellaswag/winogrande_acc_norm.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"data": {"filtering-c4-terminal_punct": {"x": [0.0, 2.0971520000000003, 4.194304000000001, 6.291456, 8.388608000000001, 10.48576, 12.582912, 14.680064000000002, 16.777216000000003, 18.874368, 20.97152, 23.068672000000003, 25.165824, 27.262976000000002, 28.311552000000002], "y": [0.4970000088214874, 0.49050000309944153, 0.48900000751018524, 0.5080000162124634, 0.50450000166893, 0.5185000002384186, 0.5175000131130219, 0.5099999904632568, 0.526500016450882, 0.5320000052452087, 0.5230000019073486, 0.5105000138282776, 0.5214999914169312, 0.523499995470047, 0.5264999866485596], "label": "terminal_punct filter"}, "filtering-c4-curly_bracket": {"x": [0.0, 2.0971520000000003, 4.194304000000001, 6.291456, 8.388608000000001, 10.48576, 12.582912, 14.680064000000002, 16.777216000000003, 18.874368, 20.97152, 23.068672000000003, 25.165824, 27.262976000000002, 28.311552000000002], "y": [0.4970000088214874, 0.48350000381469727, 0.5024999976158142, 0.5039999932050705, 0.5049999952316284, 0.5115000009536743, 0.50450000166893, 0.5120000243186951, 0.5144999921321869, 0.5194999873638153, 0.5250000059604645, 0.5170000195503235, 0.5180000066757202, 0.527999997138977, 0.5259999930858612], "label": "curly_bracket filter"}, "sm-baseline-c4": {"x": [0.0, 2.0971520000000003, 4.194304000000001, 6.291456, 8.388608000000001, 10.48576, 12.582912, 14.680064000000002, 16.777216000000003, 18.874368, 20.97152, 23.068672000000003, 25.165824, 27.262976000000002, 28.311552000000002], "y": [0.4970000088214874, 0.4933333396911621, 0.48733333746592206, 0.5056666731834412, 0.5066666503747305, 0.5116666754086813, 0.5076666871706644, 0.5213333169619242, 0.5150000055631002, 0.5183333357175192, 0.5169999996821085, 0.515333334604899, 0.5193333427111307, 0.5143333276112875, 0.5196666717529297], "label": "C4"}, "filtering-c4-all-except-terminal_punct": {"x": [0.0, 2.0971520000000003, 4.194304000000001, 6.291456, 8.388608000000001, 10.48576, 12.582912, 14.680064000000002, 16.777216000000003, 18.874368, 20.97152, 23.068672000000003, 25.165824, 27.262976000000002, 28.311552000000002], "y": [0.4970000088214874, 0.49150000512599945, 0.49900001287460327, 0.49300000071525574, 0.5015000104904175, 0.5094999969005585, 0.5109999775886536, 0.5085000097751617, 0.507500022649765, 0.5205000042915344, 0.5125000178813934, 0.5160000026226044, 0.5175000131130219, 0.5150000154972076, 0.5179999768733978], "label": "All filters except terminal_punct"}, "filtering-c4-word_lengths": {"x": [0.0, 2.0971520000000003, 4.194304000000001, 6.291456, 8.388608000000001, 10.48576, 12.582912, 14.680064000000002, 16.777216000000003, 18.874368, 20.97152, 23.068672000000003, 25.165824, 27.262976000000002, 28.311552000000002], "y": [0.4970000088214874, 0.49050000309944153, 0.49000000953674316, 0.4999999850988388, 0.4989999830722809, 0.5115000009536743, 0.5105000138282776, 0.5069999992847443, 0.5109999775886536, 0.5164999961853027, 0.5059999823570251, 0.5129999816417694, 0.5059999823570251, 0.5115000009536743, 0.5164999961853027], "label": "word_lengths filter"}, "filtering-c4-all": {"x": [0.0, 2.0971520000000003, 4.194304000000001, 6.291456, 8.388608000000001, 10.48576, 12.582912, 14.680064000000002, 16.777216000000003, 18.874368, 20.97152, 23.068672000000003, 25.165824, 27.262976000000002, 28.311552000000002], "y": [0.4970000088214874, 0.4884999990463257, 0.4989999979734421, 0.5064999908208847, 0.49800001084804535, 0.5040000081062317, 0.5139999985694885, 0.5160000026226044, 0.5109999775886536, 0.5070000141859055, 0.5115000009536743, 0.5105000138282776, 0.5175000131130219, 0.5200000107288361, 0.5135000050067902], "label": "All filters"}, "filtering-baseline-2019-18-60gt": {"x": [0.0, 2.0971520000000003, 4.194304000000001, 6.291456, 8.388608000000001, 10.48576, 12.582912, 14.680064000000002, 16.777216000000003, 18.874368, 20.97152, 23.068672000000003, 25.165824, 27.262976000000002, 28.311552000000002], "y": [0.4970000088214874, 0.48950000107288355, 0.48950000107288355, 0.5049999952316284, 0.5125000178813934, 0.5004999935626984, 0.5065000057220459, 0.5055000185966492, 0.511000007390976, 0.5160000026226044, 0.5209999978542328, 0.5270000100135803, 0.5219999849796295, 0.5149999856948853, 0.5125000178813934], "label": "baseline"}}, "layout": {"title": {"text": "C4 filtering effect on HellaSwag"}}}
assets/data/plots/cross_ind_unfiltered_comparison/agg_score.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"data":{"big-run-refinedweb":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002,29.360128000000003,31.45728,33.554432000000006,35.651584,37.748736,39.845888,41.94304,44.040192000000005,46.137344000000006,48.234496,50.331648,52.4288,54.525952000000004,56.623104000000005,58.720256000000006,60.817408,62.91456,65.011712,67.10886400000001,69.206016,71.303168,73.40032000000001,75.497472,77.59462400000001,79.691776,81.788928,83.88608,85.983232,88.08038400000001,90.177536,92.27468800000001,94.37184,96.468992,98.56614400000001,100.663296,102.76044800000001,104.8576,106.95475200000001,109.05190400000001,111.149056,113.24620800000001,115.34336,117.44051200000001,119.537664,121.634816,123.73196800000001,125.82912,127.92627200000001,130.023424,132.120576,134.21772800000002,136.31488000000002,138.412032,140.509184,142.606336,144.70348800000002,146.80064000000002,148.897792,150.994944,153.092096,155.18924800000002,157.28640000000001,159.383552,161.480704,163.577856,165.67500800000002,167.77216,169.869312,171.966464,174.06361600000002,176.16076800000002,178.25792,180.355072,182.452224,184.54937600000002,186.64652800000002,188.74368,190.840832,192.937984,195.03513600000002,197.13228800000002,199.22944,201.326592,203.423744,205.52089600000002,207.61804800000002,209.7152,211.812352,213.90950400000003,216.00665600000002,218.10380800000001,220.20096,222.298112,224.39526400000003,226.49241600000002,228.589568,230.68672,232.783872,234.88102400000002,236.97817600000002,239.075328,241.17248,243.269632,245.36678400000002,247.46393600000002,249.561088,251.65824,253.75539200000003,255.85254400000002,257.949696,260.046848,262.144,264.241152,266.338304,268.43545600000004,270.53260800000004,272.62976000000003,274.726912,276.824064,278.921216,281.018368,283.11552,285.212672,287.309824,289.40697600000004,291.50412800000004,293.60128000000003,295.698432,297.795584,299.892736,301.989888,304.08704,306.184192,308.28134400000005,310.37849600000004,312.47564800000004,314.57280000000003,316.669952,318.767104,320.864256,322.961408,325.05856,327.155712,329.25286400000005,331.35001600000004,333.44716800000003,335.54432,337.641472,339.738624,341.835776,343.932928,346.03008,348.12723200000005,350.22438400000004],"y":[0.3308933284133672,0.3534814938902855,0.3764607086777687,0.38782499730587,0.3981050960719585,0.4028486795723438,0.4125883243978023,0.4117814563214779,0.414029736071825,0.4197172522544861,0.4211113378405571,0.4279881417751312,0.4280137903988361,0.4280424378812313,0.4291964024305343,0.4326301179826259,0.4371833503246307,0.4346669465303421,0.4336562640964985,0.4432648755609989,0.4401291646063328,0.4394684173166752,0.4476612061262131,0.4465444348752498,0.4472153298556804,0.4433343075215816,0.4510187618434429,0.4459567815065384,0.4460812956094742,0.4498684890568256,0.4529943652451038,0.4528274349868297,0.4551213420927524,0.4549156539142132,0.4564928151667118,0.4576693661510944,0.4557182416319847,0.4536240361630916,0.457439012825489,0.4570476822555065,0.4589823484420776,0.462024375796318,0.4540738053619861,0.4550252184271812,0.4576593860983848,0.4573238864541054,0.4575810581445694,0.4622134491801262,0.4592566937208175,0.4614734016358852,0.4637473002076149,0.4625372551381588,0.4613912180066108,0.4597448222339153,0.4594792164862156,0.4662549719214439,0.4634026065468788,0.4633508697152138,0.4635734222829342,0.4628961533308029,0.4670135043561458,0.4639505892992019,0.4631133340299129,0.4665167145431041,0.4672448337078094,0.4693268723785877,0.4630668573081493,0.4676454700529575,0.4646359197795391,0.4621579721570015,0.4692446552217006,0.4704835228621959,0.4663223996758461,0.4680556617677212,0.466339822858572,0.4682099223136902,0.4711195565760135,0.4722655527293682,0.4727961830794811,0.4676857478916645,0.4719390422105789,0.4713102728128433,0.4712141714990139,0.4721613004803657,0.4713456854224205,0.4682970903813839,0.4679934531450271,0.4685162976384163,0.4679946713149547,0.4681242071092129,0.4702276065945625,0.472664151340723,0.4730790853500366,0.4731674715876579,0.4718914777040481,0.4719801284372806,0.4761029370129108,0.4735167175531387,0.4730370938777923,0.4730173237621784,0.4735377207398414,0.4777223989367485,0.4796326830983162,0.4734170883893966,0.4739485755562782,0.4748299159109592,0.4765299335122108,0.4745025858283043,0.4754423759877682,0.4784592799842357,0.4761341325938701,0.4760282784700393,0.4769757278263569,0.47154351323843,0.4786738082766533,0.4804279990494251,0.4777076803147793,0.4798569902777672,0.4759011939167976,0.4784621745347976,0.479673832654953,0.4780617095530033,0.48076206818223,0.47995800152421,0.4790860973298549,0.4817167408764362,0.4811586998403072,0.482547752559185,0.4816697351634502,0.4809327870607376,0.4816545359790325,0.4804601892828941,0.4776877984404564,0.4813711903989315,0.4844604581594467,0.4819537848234176,0.4820829331874847,0.4778126627206802,0.482935007661581,0.48230691999197,0.4826001971960068,0.4823969900608063,0.4811219945549965,0.4789146520197391,0.484035175293684,0.4848698377609253,0.4855728335678577,0.4825376532971859,0.485215101391077,0.4824351668357849,0.4835342466831207,0.4822137206792831,0.4838785007596016,0.4837255179882049,0.4853012599050998,0.4857851006090641,0.4863366298377514,0.4856646582484245,0.4842503517866134,0.4838776960968971,0.4846346862614155,0.4837041422724724,0.4813097268342972,0.4873070046305656,0.4841253720223903,0.4837464913725853,0.483069509267807,0.4851242564618587,0.4861010462045669],"label":"RefinedWeb"},"big-run-fineweb-cross-dedup-fixed":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,29.360128000000003,31.45728,33.554432000000006,35.651584,37.748736,39.845888,41.94304,44.040192000000005,46.137344000000006,48.234496,50.331648,52.4288,54.525952000000004,56.623104000000005,58.720256000000006,60.817408,62.91456,65.011712,67.10886400000001,69.206016,71.303168,73.40032000000001,75.497472,77.59462400000001,79.691776,81.788928,83.88608,85.983232,88.08038400000001,90.177536,92.27468800000001,94.37184,96.468992,98.56614400000001,100.663296,102.76044800000001,104.8576,106.95475200000001,109.05190400000001,111.149056,113.24620800000001,115.34336,117.44051200000001,119.537664,121.634816,123.73196800000001,125.82912,127.92627200000001,130.023424,132.120576,134.21772800000002,136.31488000000002,138.412032,140.509184,142.606336,144.70348800000002,146.80064000000002,148.897792,150.994944,153.092096,155.18924800000002,157.28640000000001,159.383552,161.480704,163.577856,165.67500800000002,167.77216,169.869312,171.966464,174.06361600000002,176.16076800000002,178.25792,180.355072,182.452224,184.54937600000002,186.64652800000002,188.74368,190.840832,192.937984,195.03513600000002,197.13228800000002,199.22944,201.326592,203.423744,205.52089600000002,207.61804800000002,209.7152,211.812352,213.90950400000003,216.00665600000002,218.10380800000001,220.20096,222.298112,224.39526400000003,226.49241600000002,228.589568,230.68672,232.783872,234.88102400000002,236.97817600000002,239.075328,241.17248,243.269632,245.36678400000002,247.46393600000002,249.561088,251.65824,253.75539200000003,255.85254400000002,257.949696,260.046848,262.144,264.241152,266.338304,268.43545600000004,270.53260800000004,272.62976000000003,274.726912,276.824064,278.921216,281.018368,283.11552,285.212672,287.309824,289.40697600000004,291.50412800000004,293.60128000000003,295.698432,297.795584,299.892736,301.989888,304.08704,306.184192,308.28134400000005,310.37849600000004,312.47564800000004,314.57280000000003,316.669952,318.767104,320.864256,322.961408,325.05856,327.155712,329.25286400000005,331.35001600000004,333.44716800000003,335.54432,337.641472,339.738624,341.835776,343.932928,346.03008,348.12723200000005,350.22438400000004],"y":[0.3308933284133672,0.3551952373236418,0.3736435137689113,0.3814037963747978,0.3948809280991554,0.3996850810945034,0.4089604057371616,0.4100853353738785,0.4119834117591381,0.4168377220630646,0.4186493046581745,0.4169826358556747,0.4234288297593593,0.4229162000119686,0.4273439794778824,0.4290364980697632,0.4291782416403293,0.4296907968819141,0.4311576783657074,0.4326641112565994,0.430318683385849,0.430436260998249,0.4339037239551544,0.4363459683954716,0.4357402548193931,0.4342963136732578,0.4366712383925915,0.4363959729671478,0.436981026083231,0.4447868093848228,0.4411709941923618,0.4406092017889023,0.4424176625907421,0.4423875361680984,0.4422253370285034,0.4410557933151722,0.4447037056088447,0.4454837813973427,0.4435960277915001,0.4468514993786812,0.4479999616742134,0.4428562931716442,0.445764634758234,0.4456562362611294,0.4488007053732872,0.4475954286754131,0.4468922987580299,0.4548408314585686,0.4511027485132217,0.4530330970883369,0.4483681954443455,0.4531726539134979,0.45334542542696,0.4544384703040123,0.4530758671462536,0.4540613554418087,0.4510113634169101,0.4538320265710354,0.4518541917204857,0.4536847211420536,0.4532708041369915,0.4552236869931221,0.455034039914608,0.4562875479459762,0.4532428197562694,0.4574853852391243,0.4517738744616508,0.4579889141023159,0.4538268558681011,0.456730306148529,0.4526018649339676,0.4562746733427048,0.4560015797615051,0.4555426277220249,0.4561501257121563,0.4524396173655987,0.4557023830711841,0.4589769169688225,0.4581078588962555,0.4620813727378845,0.4586601965129375,0.4568093195557594,0.4569808952510357,0.4567535072565079,0.4575250148773193,0.4606908001005649,0.4603964723646641,0.4622848592698574,0.4594669193029403,0.4640629850327968,0.4604269936680794,0.4634841009974479,0.4644578285515308,0.4642514958977699,0.4666304066777229,0.4616626128554344,0.4588956907391548,0.4620226770639419,0.4628621749579906,0.4595407098531723,0.4635516740381717,0.46005355194211,0.4601523540914058,0.4644204638898372,0.4620639197528362,0.46614545956254,0.4636696502566337,0.4610077403485775,0.4640897810459137,0.4636163525283336,0.4630545899271965,0.466012816876173,0.4650349207222461,0.4613720141351223,0.4644323363900184,0.4647249802947044,0.4656480401754379,0.4651664271950722,0.4622530452907085,0.4655019529163837,0.4650313258171081,0.466718140989542,0.4661559611558914,0.4661237150430679,0.4664223715662956,0.4640601389110088,0.4642657749354839,0.4633881188929081,0.4629989042878151,0.4685831367969513,0.4675870984792709,0.467183344066143,0.4678030684590339,0.4660939238965511,0.4691914953291416,0.4670972637832165,0.468262892216444,0.4672016054391861,0.4676182121038437,0.4698677137494087,0.4658828042447567,0.4701816700398922,0.4684622809290886,0.466015312820673,0.4675401039421558,0.4693200923502445,0.4702670983970165,0.4679145030677318,0.4676233418285846,0.4674933589994907,0.4678357951343059,0.4669915996491909,0.4657857678830623,0.4666901864111423,0.4669371582567692,0.4672787226736545,0.4684535376727581,0.4685697965323925,0.4694835692644119,0.4683254994451999,0.4712230190634727,0.4683987610042095,0.4707653746008873,0.4663059376180172,0.4683133698999882,0.4686385430395603,0.4657671600580215,0.4692615270614624],"label":"FineWeb full MinHash"},"big-run-sampled_full_filtered_no_dedup":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002,29.360128000000003,31.45728,33.554432000000006,35.651584,37.748736,39.845888,41.94304,44.040192000000005,46.137344000000006,48.234496,50.331648,52.4288,54.525952000000004,56.623104000000005,58.720256000000006,60.817408,62.91456,65.011712,67.10886400000001,69.206016,71.303168,73.40032000000001,75.497472,77.59462400000001,79.691776,81.788928,83.88608,85.983232,88.08038400000001,90.177536,92.27468800000001,94.37184,96.468992,98.56614400000001,100.663296,102.76044800000001,104.8576,106.95475200000001,109.05190400000001,111.149056,113.24620800000001,115.34336,117.44051200000001,119.537664,121.634816,123.73196800000001,125.82912,127.92627200000001,130.023424,132.120576,134.21772800000002,136.31488000000002,138.412032,140.509184,142.606336,144.70348800000002,146.80064000000002,148.897792,150.994944,153.092096,155.18924800000002,157.28640000000001,159.383552,161.480704,163.577856,165.67500800000002,167.77216,169.869312,171.966464,174.06361600000002,176.16076800000002,178.25792,180.355072,182.452224,184.54937600000002,186.64652800000002,188.74368,190.840832,192.937984,195.03513600000002,197.13228800000002,199.22944,201.326592,203.423744,205.52089600000002,207.61804800000002,209.7152,211.812352,213.90950400000003,216.00665600000002,218.10380800000001,220.20096,222.298112,224.39526400000003,226.49241600000002,228.589568,230.68672,232.783872,234.88102400000002,236.97817600000002,239.075328,241.17248,243.269632,245.36678400000002,247.46393600000002,249.561088,251.65824,253.75539200000003,255.85254400000002,257.949696,260.046848,262.144,264.241152,266.338304,268.43545600000004,270.53260800000004,272.62976000000003,274.726912,276.824064,278.921216,281.018368,283.11552,285.212672,287.309824,289.40697600000004,291.50412800000004,293.60128000000003,295.698432,297.795584,299.892736,301.989888,304.08704,306.184192,308.28134400000005,310.37849600000004,312.47564800000004,314.57280000000003,316.669952,318.767104,320.864256,322.961408,325.05856,327.155712,329.25286400000005,331.35001600000004,333.44716800000003,335.54432,337.641472,339.738624,341.835776,343.932928,346.03008,348.12723200000005,350.22438400000004],"y":[0.3308933284133672,0.3605199865996837,0.3733148723840713,0.3882005847990513,0.3934122696518898,0.3947227671742439,0.4042885974049568,0.3974800482392311,0.4055779427289963,0.4133470430970192,0.4117913842201233,0.4113653488457203,0.4149517640471458,0.4187851920723915,0.4252083078026771,0.4206527359783649,0.4240428246557712,0.422003373503685,0.4280910938978195,0.4244147576391697,0.4316282644867897,0.4295645765960216,0.4310102686285972,0.4360743537545204,0.4313482865691185,0.4350991360843181,0.4378576353192329,0.4335876516997814,0.4347924515604973,0.4348904751241207,0.436600212007761,0.430036511272192,0.4350974671542644,0.4399556629359722,0.4371416717767715,0.4363861419260502,0.4376698136329651,0.4405004419386387,0.4373639523983001,0.4379038028419018,0.4371281825006008,0.4393439553678036,0.440426729619503,0.4401675276458263,0.4429537951946258,0.4449137263000011,0.4434786736965179,0.4450470842421055,0.4454202279448509,0.4394537284970283,0.442185215651989,0.4461225643754005,0.4427758157253265,0.4430646039545536,0.4476901069283485,0.4478763341903686,0.4493869319558143,0.4448477327823639,0.450044184923172,0.4498609118163585,0.4457665979862213,0.4506924152374267,0.449855338782072,0.448790930211544,0.4474099352955818,0.4546772800385952,0.4529431238770485,0.452015146613121,0.4502020999789238,0.4493804536759853,0.4523266032338142,0.4551868587732315,0.4501944817602634,0.4493303671479225,0.4526805207133293,0.4533850513398647,0.4518048763275146,0.4518973492085933,0.4531301632523536,0.4518006071448326,0.4553494565188885,0.4528752230107784,0.4536322727799415,0.4561733976006508,0.4549491256475448,0.4574789106845855,0.4577847123146057,0.4563642293214798,0.4578686729073524,0.4561499990522861,0.4537816494703293,0.4542164430022239,0.4559455662965774,0.4554723873734474,0.4575514122843742,0.4575202167034149,0.4592722058296203,0.4585275091230869,0.4580587856471538,0.456934317946434,0.4577495418488979,0.4540119916200638,0.4570806957781315,0.4608120545744896,0.4588425755500793,0.4578334167599678,0.4610816091299057,0.4598177038133144,0.461849745362997,0.4631866924464702,0.4601576402783394,0.4646804705262184,0.4632389545440674,0.4604574106633663,0.4602976888418197,0.4581312239170074,0.4654182009398937,0.4655338563024997,0.4616620391607284,0.461054053157568,0.4613021649420261,0.4658613465726375,0.4633531905710697,0.4613638147711754,0.4643996246159076,0.462500050663948,0.4650798961520195,0.4648764543235302,0.4639869071543216,0.4634246975183487,0.46585888043046,0.4639799632132053,0.4630857892334461,0.4644265696406364,0.4642998576164245,0.4686848931014538,0.4687492996454239,0.4650243632495403,0.4627032242715359,0.4665953740477562,0.4660026729106903,0.4664581045508384,0.4676475040614605,0.4657339677214622,0.4664678275585174,0.4673498086631298,0.4676674827933311,0.4680955372750759,0.4681585058569908,0.4659864418208599,0.4686457589268684,0.4661462865769863,0.4658931568264961,0.4674226939678192,0.46805215254426,0.4682257212698459,0.4689070098102093,0.4699570722877979,0.4655096270143986,0.4688013233244419,0.4707522802054882,0.4661469310522079,0.4688841328024864,0.4671329781413078,0.4662554152309894,0.4697433896362781,0.4698473587632179,0.4676505327224731,0.4696521013975143],"label":"FineWeb filtered only"},"big-run-sampled_full_ind_minhash":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002,29.360128000000003,31.45728,33.554432000000006,35.651584,37.748736,39.845888,41.94304,44.040192000000005,46.137344000000006,48.234496,50.331648,52.4288,54.525952000000004,56.623104000000005,58.720256000000006,60.817408,62.91456,65.011712,67.10886400000001,69.206016,71.303168,73.40032000000001,75.497472,77.59462400000001,79.691776,81.788928,83.88608,85.983232,88.08038400000001,90.177536,92.27468800000001,94.37184,96.468992,98.56614400000001,100.663296,102.76044800000001,104.8576,106.95475200000001,109.05190400000001,111.149056,113.24620800000001,115.34336,117.44051200000001,119.537664,121.634816,123.73196800000001,125.82912,127.92627200000001,130.023424,132.120576,134.21772800000002,136.31488000000002,138.412032,140.509184,142.606336,144.70348800000002,146.80064000000002,148.897792,150.994944,153.092096,155.18924800000002,157.28640000000001,159.383552,161.480704,163.577856,165.67500800000002,167.77216,169.869312,171.966464,174.06361600000002,176.16076800000002,178.25792,180.355072,182.452224,184.54937600000002,186.64652800000002,188.74368,190.840832,192.937984,195.03513600000002,197.13228800000002,199.22944,201.326592,203.423744,205.52089600000002,207.61804800000002,209.7152,211.812352,213.90950400000003,216.00665600000002,218.10380800000001,220.20096,222.298112,224.39526400000003,226.49241600000002,228.589568,230.68672,232.783872,234.88102400000002,236.97817600000002,239.075328,241.17248,243.269632,245.36678400000002,247.46393600000002,249.561088,251.65824,253.75539200000003,255.85254400000002,257.949696,260.046848,262.144,264.241152,266.338304,268.43545600000004,270.53260800000004,272.62976000000003,274.726912,276.824064,278.921216,281.018368,283.11552,285.212672,287.309824,289.40697600000004,291.50412800000004,293.60128000000003,295.698432,297.795584,299.892736,301.989888,304.08704,306.184192,308.28134400000005,310.37849600000004,312.47564800000004,314.57280000000003,316.669952,318.767104,320.864256,322.961408,325.05856,327.155712,329.25286400000005,331.35001600000004,333.44716800000003,335.54432,337.641472,339.738624,341.835776,343.932928,346.03008,348.12723200000005,350.22438400000004],"y":[0.3308933284133672,0.3608616776764393,0.3745453506708145,0.3862277194857597,0.3989979773759842,0.406296543776989,0.4094927236437797,0.4138859286904335,0.4177777022123337,0.4208802655339241,0.4254550077021122,0.4283009432256222,0.429458349943161,0.4330311268568039,0.4303463362157345,0.4349483698606491,0.4348161295056343,0.438955657184124,0.4389265701174736,0.4393925778567791,0.4383306242525577,0.4436748661100864,0.4423373565077781,0.4460027255117893,0.4440812170505523,0.4476902261376381,0.4465879611670971,0.4497823156416416,0.4513350501656532,0.4518667235970497,0.45149727165699,0.4513994492590427,0.4521937072277069,0.4520382955670357,0.4530793912708759,0.4516105614602566,0.4530563354492187,0.4495660625398159,0.4520940892398357,0.4561133235692978,0.4522969461977482,0.4575686641037464,0.4589144177734852,0.4582882039248943,0.457970168441534,0.4554797261953354,0.4622044861316681,0.4596928395330906,0.4624353349208832,0.4619148448109627,0.461100060492754,0.458431463688612,0.4620467089116573,0.4562215581536293,0.4620163068175316,0.4631462283432483,0.4600549824535846,0.4620365314185619,0.458735141903162,0.461642112582922,0.461245734244585,0.4645131677389145,0.4629777930676937,0.4651660025119781,0.4653937108814716,0.4676259346306324,0.4667201824486255,0.4650012850761413,0.4676916748285293,0.4708514772355556,0.4673572592437267,0.4689626581966877,0.4678038358688354,0.4667215310037136,0.4646228328347206,0.4662510119378567,0.4674677737057209,0.4690804108977318,0.4634581170976162,0.4701276533305645,0.4676450751721859,0.4672758504748344,0.4674397967755794,0.4656238108873367,0.4690065123140812,0.4677213467657566,0.4678985886275768,0.4735414572060108,0.4705612398684025,0.4703374318778515,0.4704933613538742,0.4688010476529598,0.4699571952223778,0.4674785658717155,0.4701188169419765,0.4682065695524215,0.4729971997439861,0.4748715870082378,0.4745333231985569,0.4737020246684551,0.4747246317565441,0.4771635122597217,0.4740425907075405,0.475264236330986,0.4744705818593502,0.474684040993452,0.4721556939184665,0.475641455501318,0.476833701133728,0.4746401384472847,0.4742486327886581,0.4730467088520527,0.4773029200732708,0.4760043211281299,0.4770320989191532,0.4742161482572555,0.4780259765684604,0.4806670732796192,0.4784667380154133,0.4788618609309196,0.4762138128280639,0.4777246937155723,0.4796081893146038,0.4798486456274986,0.475479181855917,0.4779988899827003,0.4765858314931392,0.4772914499044418,0.47843898832798,0.4799034222960472,0.4803600236773491,0.4751846008002758,0.4777872562408447,0.4779460839927196,0.4787487275898456,0.4808406494557857,0.4810357913374901,0.4797308407723903,0.4800078608095646,0.4806460626423359,0.4810502976179123,0.4797912389039993,0.477332629263401,0.4818884879350662,0.482621606439352,0.4833096489310264,0.4821632876992225,0.4831674285233021,0.4830279909074306,0.4849893450736999,0.4845218025147915,0.4825541749596596,0.4833571836352348,0.4853803217411041,0.483093187212944,0.4850797094404697,0.485261783003807,0.4837660938501358,0.4835929833352566,0.4855643883347511,0.4832059442996979,0.484714712947607,0.4839249886572361,0.4829078912734985,0.4818423055112362,0.482727088034153,0.4824129492044449,0.4820138849318027,0.4865870922803879],"label":"FineWeb independent MinHash"}},"layout":{"xaxis":{"title":{"text":"Training tokens (billions)"}},"yaxis":{"title":{"text":"Agg Score"}},"title":{"text":"Independent dedup outperforms dedup across dumps"}}}
assets/data/plots/cross_ind_unfiltered_comparison/commonsense_qa_acc_norm.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"data":{"big-run-refinedweb":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002,29.360128000000003,31.45728,33.554432000000006,35.651584,37.748736,39.845888,41.94304,44.040192000000005,46.137344000000006,48.234496,50.331648,52.4288,54.525952000000004,56.623104000000005,58.720256000000006,60.817408,62.91456,65.011712,67.10886400000001,69.206016,71.303168,73.40032000000001,75.497472,77.59462400000001,79.691776,81.788928,83.88608,85.983232,88.08038400000001,90.177536,92.27468800000001,94.37184,96.468992,98.56614400000001,100.663296,102.76044800000001,104.8576,106.95475200000001,109.05190400000001,111.149056,113.24620800000001,115.34336,117.44051200000001,119.537664,121.634816,123.73196800000001,125.82912,127.92627200000001,130.023424,132.120576,134.21772800000002,136.31488000000002,138.412032,140.509184,142.606336,144.70348800000002,146.80064000000002,148.897792,150.994944,153.092096,155.18924800000002,157.28640000000001,159.383552,161.480704,163.577856,165.67500800000002,167.77216,169.869312,171.966464,174.06361600000002,176.16076800000002,178.25792,180.355072,182.452224,184.54937600000002,186.64652800000002,188.74368,190.840832,192.937984,195.03513600000002,197.13228800000002,199.22944,201.326592,203.423744,205.52089600000002,207.61804800000002,209.7152,211.812352,213.90950400000003,216.00665600000002,218.10380800000001,220.20096,222.298112,224.39526400000003,226.49241600000002,228.589568,230.68672,232.783872,234.88102400000002,236.97817600000002,239.075328,241.17248,243.269632,245.36678400000002,247.46393600000002,249.561088,251.65824,253.75539200000003,255.85254400000002,257.949696,260.046848,262.144,264.241152,266.338304,268.43545600000004,270.53260800000004,272.62976000000003,274.726912,276.824064,278.921216,281.018368,283.11552,285.212672,287.309824,289.40697600000004,291.50412800000004,293.60128000000003,295.698432,297.795584,299.892736,301.989888,304.08704,306.184192,308.28134400000005,310.37849600000004,312.47564800000004,314.57280000000003,316.669952,318.767104,320.864256,322.961408,325.05856,327.155712,329.25286400000005,331.35001600000004,333.44716800000003,335.54432,337.641472,339.738624,341.835776,343.932928,346.03008,348.12723200000005,350.22438400000004],"y":[0.2329999953508377,0.2529999911785126,0.2800000011920929,0.2870000004768371,0.3179999887943268,0.3129999935626983,0.3210000097751617,0.3160000145435333,0.3210000097751617,0.31700000166893,0.3330000042915344,0.3389999866485595,0.3289999961853027,0.3429999947547912,0.3370000123977661,0.3379999995231628,0.3459999859333038,0.3490000069141388,0.3470000028610229,0.3600000143051147,0.3569999933242798,0.3449999988079071,0.3650000095367431,0.3499999940395355,0.3540000021457672,0.3569999933242798,0.3619999885559082,0.3619999885559082,0.3580000102519989,0.3740000128746032,0.3709999918937683,0.3720000088214874,0.3759999871253967,0.3720000088214874,0.3659999966621399,0.3790000081062317,0.3610000014305115,0.3650000095367431,0.3650000095367431,0.3720000088214874,0.3729999959468841,0.3790000081062317,0.3680000007152557,0.3659999966621399,0.3680000007152557,0.3619999885559082,0.3619999885559082,0.3729999959468841,0.3720000088214874,0.3650000095367431,0.3759999871253967,0.367000013589859,0.3650000095367431,0.3680000007152557,0.3580000102519989,0.3589999973773956,0.3700000047683716,0.3680000007152557,0.367000013589859,0.3709999918937683,0.3880000114440918,0.3810000121593475,0.375,0.4040000140666961,0.3860000073909759,0.3840000033378601,0.3779999911785126,0.3729999959468841,0.3720000088214874,0.3799999952316284,0.3799999952316284,0.3779999911785126,0.3689999878406524,0.3770000040531158,0.3740000128746032,0.3819999992847442,0.3899999856948852,0.3799999952316284,0.3919999897480011,0.3720000088214874,0.3770000040531158,0.3930000066757202,0.3849999904632568,0.3899999856948852,0.3740000128746032,0.3740000128746032,0.3799999952316284,0.3779999911785126,0.3880000114440918,0.3709999918937683,0.3810000121593475,0.3880000114440918,0.3980000019073486,0.3819999992847442,0.3849999904632568,0.3810000121593475,0.3819999992847442,0.3889999985694885,0.3840000033378601,0.3910000026226043,0.3899999856948852,0.3959999978542328,0.3880000114440918,0.3869999945163727,0.3779999911785126,0.3819999992847442,0.3919999897480011,0.3849999904632568,0.3860000073909759,0.3919999897480011,0.3819999992847442,0.3819999992847442,0.3889999985694885,0.3889999985694885,0.3860000073909759,0.3880000114440918,0.3889999985694885,0.3939999938011169,0.3899999856948852,0.3869999945163727,0.3910000026226043,0.3910000026226043,0.3910000026226043,0.3970000147819519,0.3970000147819519,0.3970000147819519,0.3970000147819519,0.3939999938011169,0.4000000059604645,0.3970000147819519,0.402999997138977,0.3959999978542328,0.3959999978542328,0.4000000059604645,0.4040000140666961,0.4020000100135803,0.3989999890327453,0.3919999897480011,0.3930000066757202,0.3930000066757202,0.3980000019073486,0.4000000059604645,0.395000010728836,0.3899999856948852,0.4059999883174896,0.4020000100135803,0.4020000100135803,0.4059999883174896,0.3970000147819519,0.4110000133514404,0.4050000011920929,0.4000000059604645,0.4090000092983246,0.3989999890327453,0.402999997138977,0.4009999930858612,0.3980000019073486,0.4090000092983246,0.4079999923706054,0.4079999923706054,0.4020000100135803,0.402999997138977,0.402999997138977,0.4059999883174896,0.4040000140666961,0.4059999883174896,0.3989999890327453,0.4070000052452087,0.4059999883174896],"label":"RefinedWeb"},"big-run-fineweb-cross-dedup-fixed":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,29.360128000000003,31.45728,33.554432000000006,35.651584,37.748736,39.845888,41.94304,44.040192000000005,46.137344000000006,48.234496,50.331648,52.4288,54.525952000000004,56.623104000000005,58.720256000000006,60.817408,62.91456,65.011712,67.10886400000001,69.206016,71.303168,73.40032000000001,75.497472,77.59462400000001,79.691776,81.788928,83.88608,85.983232,88.08038400000001,90.177536,92.27468800000001,94.37184,96.468992,98.56614400000001,100.663296,102.76044800000001,104.8576,106.95475200000001,109.05190400000001,111.149056,113.24620800000001,115.34336,117.44051200000001,119.537664,121.634816,123.73196800000001,125.82912,127.92627200000001,130.023424,132.120576,134.21772800000002,136.31488000000002,138.412032,140.509184,142.606336,144.70348800000002,146.80064000000002,148.897792,150.994944,153.092096,155.18924800000002,157.28640000000001,159.383552,161.480704,163.577856,165.67500800000002,167.77216,169.869312,171.966464,174.06361600000002,176.16076800000002,178.25792,180.355072,182.452224,184.54937600000002,186.64652800000002,188.74368,190.840832,192.937984,195.03513600000002,197.13228800000002,199.22944,201.326592,203.423744,205.52089600000002,207.61804800000002,209.7152,211.812352,213.90950400000003,216.00665600000002,218.10380800000001,220.20096,222.298112,224.39526400000003,226.49241600000002,228.589568,230.68672,232.783872,234.88102400000002,236.97817600000002,239.075328,241.17248,243.269632,245.36678400000002,247.46393600000002,249.561088,251.65824,253.75539200000003,255.85254400000002,257.949696,260.046848,262.144,264.241152,266.338304,268.43545600000004,270.53260800000004,272.62976000000003,274.726912,276.824064,278.921216,281.018368,283.11552,285.212672,287.309824,289.40697600000004,291.50412800000004,293.60128000000003,295.698432,297.795584,299.892736,301.989888,304.08704,306.184192,308.28134400000005,310.37849600000004,312.47564800000004,314.57280000000003,316.669952,318.767104,320.864256,322.961408,325.05856,327.155712,329.25286400000005,331.35001600000004,333.44716800000003,335.54432,337.641472,339.738624,341.835776,343.932928,346.03008,348.12723200000005,350.22438400000004],"y":[0.2329999953508377,0.2540000081062317,0.2870000004768371,0.2829999923706054,0.3210000097751617,0.3079999983310699,0.3230000138282776,0.3179999887943268,0.3160000145435333,0.3289999961853027,0.3199999928474426,0.324999988079071,0.3310000002384186,0.3260000050067901,0.335999995470047,0.335999995470047,0.3310000002384186,0.335999995470047,0.3339999914169311,0.3459999859333038,0.3330000042915344,0.3449999988079071,0.3429999947547912,0.3479999899864197,0.3420000076293945,0.3479999899864197,0.3459999859333038,0.3339999914169311,0.3350000083446502,0.3519999980926513,0.3440000116825104,0.3490000069141388,0.3379999995231628,0.3420000076293945,0.3610000014305115,0.3409999907016754,0.356000006198883,0.3630000054836273,0.3519999980926513,0.3510000109672546,0.3619999885559082,0.3569999933242798,0.3479999899864197,0.3529999852180481,0.3569999933242798,0.3529999852180481,0.3519999980926513,0.3549999892711639,0.356000006198883,0.3499999940395355,0.3479999899864197,0.3619999885559082,0.3459999859333038,0.3519999980926513,0.3529999852180481,0.3680000007152557,0.3519999980926513,0.3580000102519989,0.3549999892711639,0.3490000069141388,0.3499999940395355,0.3600000143051147,0.3709999918937683,0.3659999966621399,0.3569999933242798,0.3510000109672546,0.3600000143051147,0.367000013589859,0.3529999852180481,0.363999992609024,0.3630000054836273,0.3619999885559082,0.356000006198883,0.367000013589859,0.3600000143051147,0.3540000021457672,0.3589999973773956,0.3610000014305115,0.356000006198883,0.3680000007152557,0.3519999980926513,0.3549999892711639,0.3479999899864197,0.3549999892711639,0.3519999980926513,0.367000013589859,0.3600000143051147,0.3600000143051147,0.3680000007152557,0.356000006198883,0.3610000014305115,0.3689999878406524,0.367000013589859,0.3689999878406524,0.3720000088214874,0.3680000007152557,0.3569999933242798,0.3650000095367431,0.363999992609024,0.3610000014305115,0.3709999918937683,0.3569999933242798,0.3540000021457672,0.3619999885559082,0.3549999892711639,0.3650000095367431,0.3680000007152557,0.3589999973773956,0.356000006198883,0.3610000014305115,0.3619999885559082,0.3740000128746032,0.3700000047683716,0.3650000095367431,0.3819999992847442,0.3770000040531158,0.3810000121593475,0.3729999959468841,0.3680000007152557,0.3689999878406524,0.3740000128746032,0.3779999911785126,0.3720000088214874,0.3740000128746032,0.367000013589859,0.363999992609024,0.367000013589859,0.3689999878406524,0.3709999918937683,0.3709999918937683,0.375,0.3680000007152557,0.375,0.3630000054836273,0.3720000088214874,0.3819999992847442,0.3729999959468841,0.3689999878406524,0.363999992609024,0.3709999918937683,0.3659999966621399,0.3700000047683716,0.367000013589859,0.3709999918937683,0.3759999871253967,0.3759999871253967,0.3729999959468841,0.3729999959468841,0.3729999959468841,0.3779999911785126,0.375,0.3700000047683716,0.3659999966621399,0.3759999871253967,0.3779999911785126,0.3709999918937683,0.3840000033378601,0.3720000088214874,0.375,0.367000013589859,0.3770000040531158,0.3709999918937683,0.375,0.3709999918937683,0.3740000128746032,0.3740000128746032,0.375,0.3770000040531158],"label":"FineWeb full MinHash"},"big-run-sampled_full_filtered_no_dedup":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002,29.360128000000003,31.45728,33.554432000000006,35.651584,37.748736,39.845888,41.94304,44.040192000000005,46.137344000000006,48.234496,50.331648,52.4288,54.525952000000004,56.623104000000005,58.720256000000006,60.817408,62.91456,65.011712,67.10886400000001,69.206016,71.303168,73.40032000000001,75.497472,77.59462400000001,79.691776,81.788928,83.88608,85.983232,88.08038400000001,90.177536,92.27468800000001,94.37184,96.468992,98.56614400000001,100.663296,102.76044800000001,104.8576,106.95475200000001,109.05190400000001,111.149056,113.24620800000001,115.34336,117.44051200000001,119.537664,121.634816,123.73196800000001,125.82912,127.92627200000001,130.023424,132.120576,134.21772800000002,136.31488000000002,138.412032,140.509184,142.606336,144.70348800000002,146.80064000000002,148.897792,150.994944,153.092096,155.18924800000002,157.28640000000001,159.383552,161.480704,163.577856,165.67500800000002,167.77216,169.869312,171.966464,174.06361600000002,176.16076800000002,178.25792,180.355072,182.452224,184.54937600000002,186.64652800000002,188.74368,190.840832,192.937984,195.03513600000002,197.13228800000002,199.22944,201.326592,203.423744,205.52089600000002,207.61804800000002,209.7152,211.812352,213.90950400000003,216.00665600000002,218.10380800000001,220.20096,222.298112,224.39526400000003,226.49241600000002,228.589568,230.68672,232.783872,234.88102400000002,236.97817600000002,239.075328,241.17248,243.269632,245.36678400000002,247.46393600000002,249.561088,251.65824,253.75539200000003,255.85254400000002,257.949696,260.046848,262.144,264.241152,266.338304,268.43545600000004,270.53260800000004,272.62976000000003,274.726912,276.824064,278.921216,281.018368,283.11552,285.212672,287.309824,289.40697600000004,291.50412800000004,293.60128000000003,295.698432,297.795584,299.892736,301.989888,304.08704,306.184192,308.28134400000005,310.37849600000004,312.47564800000004,314.57280000000003,316.669952,318.767104,320.864256,322.961408,325.05856,327.155712,329.25286400000005,331.35001600000004,333.44716800000003,335.54432,337.641472,339.738624,341.835776,343.932928,346.03008,348.12723200000005,350.22438400000004],"y":[0.2329999953508377,0.2599999904632568,0.277999997138977,0.2910000085830688,0.3070000112056732,0.3140000104904175,0.3019999861717224,0.3059999942779541,0.3210000097751617,0.3230000138282776,0.324999988079071,0.3149999976158142,0.3109999895095825,0.3339999914169311,0.3289999961853027,0.3319999873638153,0.3319999873638153,0.3300000131130218,0.3370000123977661,0.3219999969005584,0.3370000123977661,0.328000009059906,0.3339999914169311,0.3420000076293945,0.3400000035762787,0.3440000116825104,0.3510000109672546,0.3409999907016754,0.3449999988079071,0.3339999914169311,0.3540000021457672,0.3339999914169311,0.3470000028610229,0.3470000028610229,0.3440000116825104,0.3589999973773956,0.3569999933242798,0.3630000054836273,0.3549999892711639,0.3589999973773956,0.3449999988079071,0.3549999892711639,0.3449999988079071,0.3389999866485595,0.3499999940395355,0.3610000014305115,0.3619999885559082,0.3600000143051147,0.3519999980926513,0.3479999899864197,0.356000006198883,0.3519999980926513,0.3440000116825104,0.3490000069141388,0.3519999980926513,0.3470000028610229,0.3589999973773956,0.3449999988079071,0.3490000069141388,0.356000006198883,0.3619999885559082,0.3569999933242798,0.3659999966621399,0.3610000014305115,0.3549999892711639,0.3700000047683716,0.363999992609024,0.3600000143051147,0.3580000102519989,0.3549999892711639,0.3619999885559082,0.3689999878406524,0.3630000054836273,0.363999992609024,0.3700000047683716,0.367000013589859,0.3630000054836273,0.3630000054836273,0.3700000047683716,0.3589999973773956,0.3540000021457672,0.3540000021457672,0.3659999966621399,0.3619999885559082,0.3589999973773956,0.3650000095367431,0.3709999918937683,0.3680000007152557,0.3689999878406524,0.3650000095367431,0.3729999959468841,0.3619999885559082,0.3689999878406524,0.3569999933242798,0.3510000109672546,0.3680000007152557,0.363999992609024,0.3700000047683716,0.3659999966621399,0.3659999966621399,0.363999992609024,0.3619999885559082,0.3659999966621399,0.3680000007152557,0.3610000014305115,0.3720000088214874,0.3729999959468841,0.3810000121593475,0.3630000054836273,0.3689999878406524,0.3709999918937683,0.3759999871253967,0.382999986410141,0.3729999959468841,0.3720000088214874,0.3680000007152557,0.3659999966621399,0.3650000095367431,0.363999992609024,0.3589999973773956,0.356000006198883,0.3650000095367431,0.3659999966621399,0.367000013589859,0.3729999959468841,0.3720000088214874,0.375,0.3740000128746032,0.3700000047683716,0.3569999933242798,0.3759999871253967,0.3740000128746032,0.367000013589859,0.3770000040531158,0.3759999871253967,0.3709999918937683,0.3779999911785126,0.3709999918937683,0.3689999878406524,0.3799999952316284,0.3630000054836273,0.375,0.3700000047683716,0.3700000047683716,0.3729999959468841,0.3720000088214874,0.3790000081062317,0.375,0.3729999959468841,0.3770000040531158,0.3799999952316284,0.3779999911785126,0.3720000088214874,0.3799999952316284,0.3759999871253967,0.3799999952316284,0.3790000081062317,0.375,0.3740000128746032,0.3729999959468841,0.3840000033378601,0.3659999966621399,0.3759999871253967,0.3720000088214874,0.3720000088214874,0.3759999871253967,0.375,0.3650000095367431,0.3729999959468841],"label":"FineWeb filtered only"},"big-run-sampled_full_ind_minhash":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002,29.360128000000003,31.45728,33.554432000000006,35.651584,37.748736,39.845888,41.94304,44.040192000000005,46.137344000000006,48.234496,50.331648,52.4288,54.525952000000004,56.623104000000005,58.720256000000006,60.817408,62.91456,65.011712,67.10886400000001,69.206016,71.303168,73.40032000000001,75.497472,77.59462400000001,79.691776,81.788928,83.88608,85.983232,88.08038400000001,90.177536,92.27468800000001,94.37184,96.468992,98.56614400000001,100.663296,102.76044800000001,104.8576,106.95475200000001,109.05190400000001,111.149056,113.24620800000001,115.34336,117.44051200000001,119.537664,121.634816,123.73196800000001,125.82912,127.92627200000001,130.023424,132.120576,134.21772800000002,136.31488000000002,138.412032,140.509184,142.606336,144.70348800000002,146.80064000000002,148.897792,150.994944,153.092096,155.18924800000002,157.28640000000001,159.383552,161.480704,163.577856,165.67500800000002,167.77216,169.869312,171.966464,174.06361600000002,176.16076800000002,178.25792,180.355072,182.452224,184.54937600000002,186.64652800000002,188.74368,190.840832,192.937984,195.03513600000002,197.13228800000002,199.22944,201.326592,203.423744,205.52089600000002,207.61804800000002,209.7152,211.812352,213.90950400000003,216.00665600000002,218.10380800000001,220.20096,222.298112,224.39526400000003,226.49241600000002,228.589568,230.68672,232.783872,234.88102400000002,236.97817600000002,239.075328,241.17248,243.269632,245.36678400000002,247.46393600000002,249.561088,251.65824,253.75539200000003,255.85254400000002,257.949696,260.046848,262.144,264.241152,266.338304,268.43545600000004,270.53260800000004,272.62976000000003,274.726912,276.824064,278.921216,281.018368,283.11552,285.212672,287.309824,289.40697600000004,291.50412800000004,293.60128000000003,295.698432,297.795584,299.892736,301.989888,304.08704,306.184192,308.28134400000005,310.37849600000004,312.47564800000004,314.57280000000003,316.669952,318.767104,320.864256,322.961408,325.05856,327.155712,329.25286400000005,331.35001600000004,333.44716800000003,335.54432,337.641472,339.738624,341.835776,343.932928,346.03008,348.12723200000005,350.22438400000004],"y":[0.2329999953508377,0.2639999985694885,0.2790000140666961,0.296999990940094,0.3109999895095825,0.3240000009536743,0.3070000112056732,0.3210000097751617,0.31700000166893,0.3339999914169311,0.324999988079071,0.3260000050067901,0.3330000042915344,0.3409999907016754,0.3350000083446502,0.3400000035762787,0.3529999852180481,0.3400000035762787,0.3490000069141388,0.3529999852180481,0.3499999940395355,0.3459999859333038,0.3370000123977661,0.356000006198883,0.3490000069141388,0.3429999947547912,0.3490000069141388,0.3610000014305115,0.3499999940395355,0.3569999933242798,0.3610000014305115,0.3619999885559082,0.3449999988079071,0.3409999907016754,0.3420000076293945,0.3449999988079071,0.3409999907016754,0.3379999995231628,0.3420000076293945,0.3569999933242798,0.3529999852180481,0.3610000014305115,0.363999992609024,0.3600000143051147,0.3540000021457672,0.3499999940395355,0.3689999878406524,0.367000013589859,0.3569999933242798,0.3610000014305115,0.3680000007152557,0.3630000054836273,0.3709999918937683,0.3540000021457672,0.3580000102519989,0.367000013589859,0.3529999852180481,0.356000006198883,0.3569999933242798,0.3610000014305115,0.3700000047683716,0.375,0.3709999918937683,0.3819999992847442,0.3709999918937683,0.3650000095367431,0.3709999918937683,0.3650000095367431,0.3709999918937683,0.3840000033378601,0.3740000128746032,0.375,0.356000006198883,0.3689999878406524,0.3700000047683716,0.3819999992847442,0.3799999952316284,0.3779999911785126,0.3729999959468841,0.3709999918937683,0.3759999871253967,0.3709999918937683,0.3759999871253967,0.3779999911785126,0.3779999911785126,0.3689999878406524,0.3840000033378601,0.3860000073909759,0.3849999904632568,0.3790000081062317,0.375,0.3849999904632568,0.3720000088214874,0.3770000040531158,0.3799999952316284,0.3810000121593475,0.382999986410141,0.3650000095367431,0.3740000128746032,0.382999986410141,0.3689999878406524,0.3759999871253967,0.3869999945163727,0.3889999985694885,0.3860000073909759,0.3819999992847442,0.3689999878406524,0.3860000073909759,0.3810000121593475,0.382999986410141,0.3819999992847442,0.3840000033378601,0.3889999985694885,0.3880000114440918,0.3849999904632568,0.3799999952316284,0.3910000026226043,0.3989999890327453,0.3880000114440918,0.3880000114440918,0.3840000033378601,0.3880000114440918,0.3860000073909759,0.3919999897480011,0.3880000114440918,0.3939999938011169,0.3869999945163727,0.3919999897480011,0.3910000026226043,0.382999986410141,0.3930000066757202,0.3840000033378601,0.3880000114440918,0.3840000033378601,0.3819999992847442,0.382999986410141,0.3880000114440918,0.3860000073909759,0.3860000073909759,0.3869999945163727,0.3860000073909759,0.3899999856948852,0.3819999992847442,0.3860000073909759,0.3889999985694885,0.3840000033378601,0.395000010728836,0.3899999856948852,0.3899999856948852,0.3910000026226043,0.3959999978542328,0.3959999978542328,0.3919999897480011,0.3980000019073486,0.3880000114440918,0.3930000066757202,0.4000000059604645,0.3919999897480011,0.3919999897480011,0.4040000140666961,0.3930000066757202,0.3970000147819519,0.3889999985694885,0.3959999978542328,0.3930000066757202,0.3939999938011169,0.3970000147819519,0.3910000026226043,0.4020000100135803],"label":"FineWeb independent MinHash"}},"layout":{"xaxis":{"title":{"text":"Training tokens (billions)"}},"yaxis":{"title":{"text":"Agg Score"}},"title":{"text":"Independent dedup outperforms dedup across dumps"}}}
assets/data/plots/cross_ind_unfiltered_comparison/hellaswag_acc_norm.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"data":{"big-run-refinedweb":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002,29.360128000000003,31.45728,33.554432000000006,35.651584,37.748736,39.845888,41.94304,44.040192000000005,46.137344000000006,48.234496,50.331648,52.4288,54.525952000000004,56.623104000000005,58.720256000000006,60.817408,62.91456,65.011712,67.10886400000001,69.206016,71.303168,73.40032000000001,75.497472,77.59462400000001,79.691776,81.788928,83.88608,85.983232,88.08038400000001,90.177536,92.27468800000001,94.37184,96.468992,98.56614400000001,100.663296,102.76044800000001,104.8576,106.95475200000001,109.05190400000001,111.149056,113.24620800000001,115.34336,117.44051200000001,119.537664,121.634816,123.73196800000001,125.82912,127.92627200000001,130.023424,132.120576,134.21772800000002,136.31488000000002,138.412032,140.509184,142.606336,144.70348800000002,146.80064000000002,148.897792,150.994944,153.092096,155.18924800000002,157.28640000000001,159.383552,161.480704,163.577856,165.67500800000002,167.77216,169.869312,171.966464,174.06361600000002,176.16076800000002,178.25792,180.355072,182.452224,184.54937600000002,186.64652800000002,188.74368,190.840832,192.937984,195.03513600000002,197.13228800000002,199.22944,201.326592,203.423744,205.52089600000002,207.61804800000002,209.7152,211.812352,213.90950400000003,216.00665600000002,218.10380800000001,220.20096,222.298112,224.39526400000003,226.49241600000002,228.589568,230.68672,232.783872,234.88102400000002,236.97817600000002,239.075328,241.17248,243.269632,245.36678400000002,247.46393600000002,249.561088,251.65824,253.75539200000003,255.85254400000002,257.949696,260.046848,262.144,264.241152,266.338304,268.43545600000004,270.53260800000004,272.62976000000003,274.726912,276.824064,278.921216,281.018368,283.11552,285.212672,287.309824,289.40697600000004,291.50412800000004,293.60128000000003,295.698432,297.795584,299.892736,301.989888,304.08704,306.184192,308.28134400000005,310.37849600000004,312.47564800000004,314.57280000000003,316.669952,318.767104,320.864256,322.961408,325.05856,327.155712,329.25286400000005,331.35001600000004,333.44716800000003,335.54432,337.641472,339.738624,341.835776,343.932928,346.03008,348.12723200000005,350.22438400000004],"y":[0.257999986410141,0.2759999930858612,0.328000009059906,0.3499999940395355,0.3889999985694885,0.3910000026226043,0.402999997138977,0.4210000038146972,0.4280000030994415,0.4359999895095825,0.4469999969005584,0.4440000057220459,0.4600000083446502,0.4690000116825104,0.4600000083446502,0.4679999947547912,0.4729999899864197,0.4760000109672546,0.4839999973773956,0.4939999878406524,0.488999992609024,0.4990000128746032,0.4979999959468841,0.4979999959468841,0.5009999871253967,0.5,0.5090000033378601,0.5070000290870667,0.5180000066757202,0.5199999809265137,0.5109999775886536,0.5130000114440918,0.5249999761581421,0.5149999856948853,0.5299999713897705,0.5339999794960022,0.5189999938011169,0.5289999842643738,0.5249999761581421,0.5320000052452087,0.5460000038146973,0.5419999957084656,0.5260000228881836,0.5289999842643738,0.546999990940094,0.5419999957084656,0.5419999957084656,0.5460000038146973,0.5419999957084656,0.5389999747276306,0.5440000295639038,0.5569999814033508,0.5450000166893005,0.5329999923706055,0.5580000281333923,0.5339999794960022,0.5540000200271606,0.5460000038146973,0.5479999780654907,0.5529999732971191,0.5540000200271606,0.5619999766349792,0.5490000247955322,0.5410000085830688,0.5490000247955322,0.5569999814033508,0.550000011920929,0.5479999780654907,0.5630000233650208,0.546999990940094,0.5559999942779541,0.5600000023841858,0.5509999990463257,0.5569999814033508,0.5569999814033508,0.5580000281333923,0.5619999766349792,0.5580000281333923,0.5669999718666077,0.5569999814033508,0.5709999799728394,0.5529999732971191,0.5649999976158142,0.5659999847412109,0.5659999847412109,0.5690000057220459,0.5600000023841858,0.5580000281333923,0.5540000200271606,0.5640000104904175,0.5680000185966492,0.5709999799728394,0.5649999976158142,0.5680000185966492,0.5730000138282776,0.5640000104904175,0.5799999833106995,0.5699999928474426,0.5669999718666077,0.5680000185966492,0.5770000219345093,0.5709999799728394,0.5759999752044678,0.5690000057220459,0.5789999961853027,0.5740000009536743,0.5709999799728394,0.5789999961853027,0.5709999799728394,0.5770000219345093,0.5770000219345093,0.5730000138282776,0.5809999704360962,0.5720000267028809,0.5849999785423279,0.5820000171661377,0.5799999833106995,0.5830000042915344,0.5759999752044678,0.5730000138282776,0.5799999833106995,0.5830000042915344,0.5860000252723694,0.5789999961853027,0.5789999961853027,0.5860000252723694,0.5979999899864197,0.5920000076293945,0.5820000171661377,0.5870000123977661,0.5889999866485596,0.5839999914169312,0.5849999785423279,0.5899999737739563,0.5920000076293945,0.593999981880188,0.597000002861023,0.5889999866485596,0.5889999866485596,0.5849999785423279,0.5899999737739563,0.5989999771118164,0.5899999737739563,0.5839999914169312,0.5910000205039978,0.5910000205039978,0.5929999947547913,0.5920000076293945,0.5929999947547913,0.5889999866485596,0.5899999737739563,0.593999981880188,0.5910000205039978,0.5960000157356262,0.5920000076293945,0.5889999866485596,0.593999981880188,0.5879999995231628,0.5960000157356262,0.5920000076293945,0.5960000157356262,0.5960000157356262,0.5920000076293945,0.6010000109672546,0.5920000076293945,0.5899999737739563,0.5889999866485596,0.5920000076293945,0.6019999980926514],"label":"RefinedWeb"},"big-run-fineweb-cross-dedup-fixed":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,29.360128000000003,31.45728,33.554432000000006,35.651584,37.748736,39.845888,41.94304,44.040192000000005,46.137344000000006,48.234496,50.331648,52.4288,54.525952000000004,56.623104000000005,58.720256000000006,60.817408,62.91456,65.011712,67.10886400000001,69.206016,71.303168,73.40032000000001,75.497472,77.59462400000001,79.691776,81.788928,83.88608,85.983232,88.08038400000001,90.177536,92.27468800000001,94.37184,96.468992,98.56614400000001,100.663296,102.76044800000001,104.8576,106.95475200000001,109.05190400000001,111.149056,113.24620800000001,115.34336,117.44051200000001,119.537664,121.634816,123.73196800000001,125.82912,127.92627200000001,130.023424,132.120576,134.21772800000002,136.31488000000002,138.412032,140.509184,142.606336,144.70348800000002,146.80064000000002,148.897792,150.994944,153.092096,155.18924800000002,157.28640000000001,159.383552,161.480704,163.577856,165.67500800000002,167.77216,169.869312,171.966464,174.06361600000002,176.16076800000002,178.25792,180.355072,182.452224,184.54937600000002,186.64652800000002,188.74368,190.840832,192.937984,195.03513600000002,197.13228800000002,199.22944,201.326592,203.423744,205.52089600000002,207.61804800000002,209.7152,211.812352,213.90950400000003,216.00665600000002,218.10380800000001,220.20096,222.298112,224.39526400000003,226.49241600000002,228.589568,230.68672,232.783872,234.88102400000002,236.97817600000002,239.075328,241.17248,243.269632,245.36678400000002,247.46393600000002,249.561088,251.65824,253.75539200000003,255.85254400000002,257.949696,260.046848,262.144,264.241152,266.338304,268.43545600000004,270.53260800000004,272.62976000000003,274.726912,276.824064,278.921216,281.018368,283.11552,285.212672,287.309824,289.40697600000004,291.50412800000004,293.60128000000003,295.698432,297.795584,299.892736,301.989888,304.08704,306.184192,308.28134400000005,310.37849600000004,312.47564800000004,314.57280000000003,316.669952,318.767104,320.864256,322.961408,325.05856,327.155712,329.25286400000005,331.35001600000004,333.44716800000003,335.54432,337.641472,339.738624,341.835776,343.932928,346.03008,348.12723200000005,350.22438400000004],"y":[0.257999986410141,0.3009999990463257,0.3149999976158142,0.3400000035762787,0.3610000014305115,0.3680000007152557,0.3799999952316284,0.4020000100135803,0.4180000126361847,0.4129999876022339,0.4259999990463257,0.4239999949932098,0.4440000057220459,0.44200000166893,0.4440000057220459,0.4580000042915344,0.4510000050067901,0.4560000002384186,0.4650000035762787,0.4569999873638153,0.460999995470047,0.4659999907016754,0.4679999947547912,0.4779999852180481,0.4740000069141388,0.4600000083446502,0.4860000014305115,0.4790000021457672,0.4880000054836273,0.4930000007152557,0.4860000014305115,0.4850000143051147,0.4900000095367431,0.4850000143051147,0.4900000095367431,0.4959999918937683,0.492000013589859,0.4850000143051147,0.4970000088214874,0.4900000095367431,0.4979999959468841,0.503000020980835,0.5040000081062317,0.4990000128746032,0.4979999959468841,0.5080000162124634,0.5019999742507935,0.4970000088214874,0.4939999878406524,0.5120000243186951,0.5070000290870667,0.503000020980835,0.5070000290870667,0.503000020980835,0.5109999775886536,0.5080000162124634,0.5009999871253967,0.5090000033378601,0.5,0.5149999856948853,0.5109999775886536,0.5099999904632568,0.5130000114440918,0.5080000162124634,0.5080000162124634,0.5109999775886536,0.5099999904632568,0.5239999890327454,0.5180000066757202,0.5130000114440918,0.5120000243186951,0.5180000066757202,0.515999972820282,0.5260000228881836,0.5199999809265137,0.5239999890327454,0.5220000147819519,0.527999997138977,0.5249999761581421,0.5270000100135803,0.5249999761581421,0.5189999938011169,0.5230000019073486,0.5249999761581421,0.5199999809265137,0.5230000019073486,0.5299999713897705,0.5350000262260437,0.5339999794960022,0.5329999923706055,0.5249999761581421,0.5299999713897705,0.5360000133514404,0.5329999923706055,0.5410000085830688,0.5249999761581421,0.5289999842643738,0.5360000133514404,0.5360000133514404,0.5370000004768372,0.5389999747276306,0.5289999842643738,0.5299999713897705,0.5410000085830688,0.5329999923706055,0.5419999957084656,0.5410000085830688,0.527999997138977,0.5370000004768372,0.5429999828338623,0.5419999957084656,0.5389999747276306,0.5320000052452087,0.5350000262260437,0.5419999957084656,0.5410000085830688,0.5339999794960022,0.5440000295639038,0.5329999923706055,0.5429999828338623,0.5460000038146973,0.5400000214576721,0.5429999828338623,0.5479999780654907,0.550000011920929,0.5490000247955322,0.5410000085830688,0.5450000166893005,0.5429999828338623,0.550000011920929,0.5529999732971191,0.5490000247955322,0.5450000166893005,0.5450000166893005,0.5519999861717224,0.5569999814033508,0.5460000038146973,0.546999990940094,0.5509999990463257,0.5509999990463257,0.5450000166893005,0.5440000295639038,0.5440000295639038,0.546999990940094,0.5479999780654907,0.546999990940094,0.5460000038146973,0.546999990940094,0.5479999780654907,0.5460000038146973,0.5460000038146973,0.5440000295639038,0.5410000085830688,0.5440000295639038,0.5389999747276306,0.5410000085830688,0.546999990940094,0.546999990940094,0.5479999780654907,0.546999990940094,0.550000011920929,0.546999990940094,0.5460000038146973,0.546999990940094,0.5479999780654907,0.5479999780654907,0.5519999861717224,0.550000011920929],"label":"FineWeb full MinHash"},"big-run-sampled_full_filtered_no_dedup":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002,29.360128000000003,31.45728,33.554432000000006,35.651584,37.748736,39.845888,41.94304,44.040192000000005,46.137344000000006,48.234496,50.331648,52.4288,54.525952000000004,56.623104000000005,58.720256000000006,60.817408,62.91456,65.011712,67.10886400000001,69.206016,71.303168,73.40032000000001,75.497472,77.59462400000001,79.691776,81.788928,83.88608,85.983232,88.08038400000001,90.177536,92.27468800000001,94.37184,96.468992,98.56614400000001,100.663296,102.76044800000001,104.8576,106.95475200000001,109.05190400000001,111.149056,113.24620800000001,115.34336,117.44051200000001,119.537664,121.634816,123.73196800000001,125.82912,127.92627200000001,130.023424,132.120576,134.21772800000002,136.31488000000002,138.412032,140.509184,142.606336,144.70348800000002,146.80064000000002,148.897792,150.994944,153.092096,155.18924800000002,157.28640000000001,159.383552,161.480704,163.577856,165.67500800000002,167.77216,169.869312,171.966464,174.06361600000002,176.16076800000002,178.25792,180.355072,182.452224,184.54937600000002,186.64652800000002,188.74368,190.840832,192.937984,195.03513600000002,197.13228800000002,199.22944,201.326592,203.423744,205.52089600000002,207.61804800000002,209.7152,211.812352,213.90950400000003,216.00665600000002,218.10380800000001,220.20096,222.298112,224.39526400000003,226.49241600000002,228.589568,230.68672,232.783872,234.88102400000002,236.97817600000002,239.075328,241.17248,243.269632,245.36678400000002,247.46393600000002,249.561088,251.65824,253.75539200000003,255.85254400000002,257.949696,260.046848,262.144,264.241152,266.338304,268.43545600000004,270.53260800000004,272.62976000000003,274.726912,276.824064,278.921216,281.018368,283.11552,285.212672,287.309824,289.40697600000004,291.50412800000004,293.60128000000003,295.698432,297.795584,299.892736,301.989888,304.08704,306.184192,308.28134400000005,310.37849600000004,312.47564800000004,314.57280000000003,316.669952,318.767104,320.864256,322.961408,325.05856,327.155712,329.25286400000005,331.35001600000004,333.44716800000003,335.54432,337.641472,339.738624,341.835776,343.932928,346.03008,348.12723200000005,350.22438400000004],"y":[0.257999986410141,0.2809999883174896,0.3230000138282776,0.3409999907016754,0.3600000143051147,0.3569999933242798,0.3889999985694885,0.395000010728836,0.4199999868869781,0.4180000126361847,0.421999990940094,0.4289999902248382,0.4350000023841858,0.4359999895095825,0.4469999969005584,0.4350000023841858,0.4480000138282776,0.4480000138282776,0.453000009059906,0.4550000131130218,0.4589999914169311,0.4639999866485595,0.4600000083446502,0.460999995470047,0.4589999914169311,0.481000006198883,0.4769999980926513,0.4709999859333038,0.4740000069141388,0.4679999947547912,0.4790000021457672,0.4729999899864197,0.4819999933242798,0.4850000143051147,0.4819999933242798,0.4819999933242798,0.4880000054836273,0.4869999885559082,0.4959999918937683,0.4850000143051147,0.4959999918937683,0.492000013589859,0.503000020980835,0.4930000007152557,0.5099999904632568,0.5040000081062317,0.5009999871253967,0.4970000088214874,0.4979999959468841,0.5059999823570251,0.5070000290870667,0.5040000081062317,0.5059999823570251,0.5049999952316284,0.5080000162124634,0.5049999952316284,0.5019999742507935,0.5120000243186951,0.5170000195503235,0.5170000195503235,0.5090000033378601,0.5239999890327454,0.527999997138977,0.5230000019073486,0.5210000276565552,0.5149999856948853,0.5189999938011169,0.5270000100135803,0.5149999856948853,0.5099999904632568,0.5299999713897705,0.5199999809265137,0.5230000019073486,0.5260000228881836,0.5249999761581421,0.5239999890327454,0.5329999923706055,0.5210000276565552,0.5260000228881836,0.5170000195503235,0.531000018119812,0.5289999842643738,0.531000018119812,0.5270000100135803,0.5299999713897705,0.5370000004768372,0.5379999876022339,0.5419999957084656,0.5329999923706055,0.5360000133514404,0.5299999713897705,0.5360000133514404,0.5270000100135803,0.5450000166893005,0.5410000085830688,0.546999990940094,0.5329999923706055,0.5329999923706055,0.5379999876022339,0.5299999713897705,0.5429999828338623,0.5360000133514404,0.5339999794960022,0.5419999957084656,0.5410000085830688,0.5370000004768372,0.5389999747276306,0.527999997138977,0.5400000214576721,0.5400000214576721,0.531000018119812,0.5440000295639038,0.5460000038146973,0.5479999780654907,0.5460000038146973,0.5410000085830688,0.5509999990463257,0.5479999780654907,0.5410000085830688,0.5389999747276306,0.550000011920929,0.5569999814033508,0.550000011920929,0.5490000247955322,0.5490000247955322,0.5569999814033508,0.5519999861717224,0.5479999780654907,0.5559999942779541,0.5550000071525574,0.5460000038146973,0.5540000200271606,0.5460000038146973,0.5460000038146973,0.5509999990463257,0.5460000038146973,0.5550000071525574,0.5479999780654907,0.5479999780654907,0.5540000200271606,0.5550000071525574,0.5529999732971191,0.5529999732971191,0.5509999990463257,0.5509999990463257,0.5419999957084656,0.546999990940094,0.5509999990463257,0.5559999942779541,0.5490000247955322,0.5509999990463257,0.5529999732971191,0.550000011920929,0.5540000200271606,0.5550000071525574,0.5580000281333923,0.550000011920929,0.5569999814033508,0.5490000247955322,0.5519999861717224,0.5519999861717224,0.5559999942779541,0.5569999814033508,0.5559999942779541,0.5550000071525574,0.5559999942779541,0.5490000247955322,0.5550000071525574,0.5600000023841858],"label":"FineWeb filtered only"},"big-run-sampled_full_ind_minhash":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002,29.360128000000003,31.45728,33.554432000000006,35.651584,37.748736,39.845888,41.94304,44.040192000000005,46.137344000000006,48.234496,50.331648,52.4288,54.525952000000004,56.623104000000005,58.720256000000006,60.817408,62.91456,65.011712,67.10886400000001,69.206016,71.303168,73.40032000000001,75.497472,77.59462400000001,79.691776,81.788928,83.88608,85.983232,88.08038400000001,90.177536,92.27468800000001,94.37184,96.468992,98.56614400000001,100.663296,102.76044800000001,104.8576,106.95475200000001,109.05190400000001,111.149056,113.24620800000001,115.34336,117.44051200000001,119.537664,121.634816,123.73196800000001,125.82912,127.92627200000001,130.023424,132.120576,134.21772800000002,136.31488000000002,138.412032,140.509184,142.606336,144.70348800000002,146.80064000000002,148.897792,150.994944,153.092096,155.18924800000002,157.28640000000001,159.383552,161.480704,163.577856,165.67500800000002,167.77216,169.869312,171.966464,174.06361600000002,176.16076800000002,178.25792,180.355072,182.452224,184.54937600000002,186.64652800000002,188.74368,190.840832,192.937984,195.03513600000002,197.13228800000002,199.22944,201.326592,203.423744,205.52089600000002,207.61804800000002,209.7152,211.812352,213.90950400000003,216.00665600000002,218.10380800000001,220.20096,222.298112,224.39526400000003,226.49241600000002,228.589568,230.68672,232.783872,234.88102400000002,236.97817600000002,239.075328,241.17248,243.269632,245.36678400000002,247.46393600000002,249.561088,251.65824,253.75539200000003,255.85254400000002,257.949696,260.046848,262.144,264.241152,266.338304,268.43545600000004,270.53260800000004,272.62976000000003,274.726912,276.824064,278.921216,281.018368,283.11552,285.212672,287.309824,289.40697600000004,291.50412800000004,293.60128000000003,295.698432,297.795584,299.892736,301.989888,304.08704,306.184192,308.28134400000005,310.37849600000004,312.47564800000004,314.57280000000003,316.669952,318.767104,320.864256,322.961408,325.05856,327.155712,329.25286400000005,331.35001600000004,333.44716800000003,335.54432,337.641472,339.738624,341.835776,343.932928,346.03008,348.12723200000005,350.22438400000004],"y":[0.257999986410141,0.3019999861717224,0.3059999942779541,0.335999995470047,0.3610000014305115,0.3819999992847442,0.4009999930858612,0.4020000100135803,0.4250000119209289,0.4309999942779541,0.4469999969005584,0.4519999921321869,0.453000009059906,0.4580000042915344,0.4729999899864197,0.4749999940395355,0.4699999988079071,0.4799999892711639,0.4749999940395355,0.4769999980926513,0.481000006198883,0.4839999973773956,0.4959999918937683,0.5040000081062317,0.4970000088214874,0.4979999959468841,0.5070000290870667,0.5049999952316284,0.5109999775886536,0.515999972820282,0.5120000243186951,0.5120000243186951,0.515999972820282,0.5120000243186951,0.5249999761581421,0.5170000195503235,0.5199999809265137,0.5270000100135803,0.5170000195503235,0.5220000147819519,0.5260000228881836,0.5360000133514404,0.5339999794960022,0.5370000004768372,0.5339999794960022,0.5329999923706055,0.531000018119812,0.5329999923706055,0.5400000214576721,0.5429999828338623,0.5389999747276306,0.5419999957084656,0.5429999828338623,0.5360000133514404,0.5299999713897705,0.546999990940094,0.5360000133514404,0.5450000166893005,0.5440000295639038,0.5350000262260437,0.5339999794960022,0.5419999957084656,0.5450000166893005,0.5460000038146973,0.5370000004768372,0.5490000247955322,0.5440000295639038,0.550000011920929,0.5490000247955322,0.5450000166893005,0.5490000247955322,0.5559999942779541,0.5559999942779541,0.5410000085830688,0.5419999957084656,0.5529999732971191,0.5460000038146973,0.5540000200271606,0.5379999876022339,0.5509999990463257,0.5540000200271606,0.5419999957084656,0.546999990940094,0.5479999780654907,0.5460000038146973,0.5460000038146973,0.5519999861717224,0.5600000023841858,0.5540000200271606,0.5509999990463257,0.5609999895095825,0.5619999766349792,0.5590000152587891,0.5559999942779541,0.5580000281333923,0.5640000104904175,0.5649999976158142,0.5590000152587891,0.5550000071525574,0.5630000233650208,0.5630000233650208,0.5609999895095825,0.5559999942779541,0.5609999895095825,0.5630000233650208,0.5680000185966492,0.5630000233650208,0.5690000057220459,0.5609999895095825,0.5590000152587891,0.5640000104904175,0.5690000057220459,0.5640000104904175,0.5630000233650208,0.574999988079071,0.5630000233650208,0.5619999766349792,0.5690000057220459,0.5770000219345093,0.5690000057220459,0.5609999895095825,0.5649999976158142,0.5680000185966492,0.5590000152587891,0.5600000023841858,0.5619999766349792,0.5799999833106995,0.5619999766349792,0.5699999928474426,0.5709999799728394,0.5669999718666077,0.5680000185966492,0.5609999895095825,0.5649999976158142,0.5680000185966492,0.5730000138282776,0.5720000267028809,0.5709999799728394,0.5770000219345093,0.574999988079071,0.5730000138282776,0.5690000057220459,0.5740000009536743,0.578000009059906,0.574999988079071,0.5820000171661377,0.5730000138282776,0.5740000009536743,0.574999988079071,0.5770000219345093,0.5789999961853027,0.5759999752044678,0.5720000267028809,0.5770000219345093,0.5759999752044678,0.5789999961853027,0.5789999961853027,0.5730000138282776,0.5789999961853027,0.5759999752044678,0.5690000057220459,0.5849999785423279,0.5759999752044678,0.5699999928474426,0.5789999961853027,0.5820000171661377,0.5730000138282776,0.5730000138282776,0.5789999961853027],"label":"FineWeb independent MinHash"}},"layout":{"xaxis":{"title":{"text":"Training tokens (billions)"}},"yaxis":{"title":{"text":"Agg Score"}},"title":{"text":"Independent dedup outperforms dedup across dumps"}}}
assets/data/plots/cross_ind_unfiltered_comparison/index.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"files":{"agg_score":{"file":"agg_score.json"},"commonsense_qa/acc_norm":{"file":"commonsense_qa_acc_norm.json"},"hellaswag/acc_norm":{"file":"hellaswag_acc_norm.json"},"openbookqa/acc_norm":{"file":"openbookqa_acc_norm.json"},"piqa/acc_norm":{"file":"piqa_acc_norm.json"},"winogrande/acc_norm":{"file":"winogrande_acc_norm.json"},"arc/acc_norm":{"file":"arc_acc_norm.json"},"mmlu/acc_norm":{"file":"mmlu_acc_norm.json"}},"settings":{"slider":{"min":0,"max":30,"default":5}}}
assets/data/plots/cross_ind_unfiltered_comparison/mmlu_acc_norm.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"data":{"big-run-refinedweb":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002,29.360128000000003,31.45728,33.554432000000006,35.651584,37.748736,39.845888,41.94304,44.040192000000005,46.137344000000006,48.234496,50.331648,52.4288,54.525952000000004,56.623104000000005,58.720256000000006,60.817408,62.91456,65.011712,67.10886400000001,69.206016,71.303168,73.40032000000001,75.497472,77.59462400000001,79.691776,81.788928,83.88608,85.983232,88.08038400000001,90.177536,92.27468800000001,94.37184,96.468992,98.56614400000001,100.663296,102.76044800000001,104.8576,106.95475200000001,109.05190400000001,111.149056,113.24620800000001,115.34336,117.44051200000001,119.537664,121.634816,123.73196800000001,125.82912,127.92627200000001,130.023424,132.120576,134.21772800000002,136.31488000000002,138.412032,140.509184,142.606336,144.70348800000002,146.80064000000002,148.897792,150.994944,153.092096,155.18924800000002,157.28640000000001,159.383552,161.480704,163.577856,165.67500800000002,167.77216,169.869312,171.966464,174.06361600000002,176.16076800000002,178.25792,180.355072,182.452224,184.54937600000002,186.64652800000002,188.74368,190.840832,192.937984,195.03513600000002,197.13228800000002,199.22944,201.326592,203.423744,205.52089600000002,207.61804800000002,209.7152,211.812352,213.90950400000003,216.00665600000002,218.10380800000001,220.20096,222.298112,224.39526400000003,226.49241600000002,228.589568,230.68672,232.783872,234.88102400000002,236.97817600000002,239.075328,241.17248,243.269632,245.36678400000002,247.46393600000002,249.561088,251.65824,253.75539200000003,255.85254400000002,257.949696,260.046848,262.144,264.241152,266.338304,268.43545600000004,270.53260800000004,272.62976000000003,274.726912,276.824064,278.921216,281.018368,283.11552,285.212672,287.309824,289.40697600000004,291.50412800000004,293.60128000000003,295.698432,297.795584,299.892736,301.989888,304.08704,306.184192,308.28134400000005,310.37849600000004,312.47564800000004,314.57280000000003,316.669952,318.767104,320.864256,322.961408,325.05856,327.155712,329.25286400000005,331.35001600000004,333.44716800000003,335.54432,337.641472,339.738624,341.835776,343.932928,346.03008,348.12723200000005,350.22438400000004],"y":[0.2501466572284698,0.2528519630432129,0.2616856694221496,0.2665999829769134,0.2683407664299011,0.2742894291877746,0.2762066125869751,0.2807516455650329,0.2767378389835357,0.2807380557060241,0.2788906991481781,0.2844051718711853,0.2856102883815765,0.2883394360542297,0.2875711619853973,0.2890409529209137,0.2894668281078338,0.2883355319499969,0.2872501015663147,0.291619062423706,0.2900333702564239,0.2962473034858703,0.2962896525859833,0.297355443239212,0.2932226359844208,0.2886744439601898,0.29665008187294,0.2976542115211487,0.2991503179073334,0.3004479110240936,0.3044549524784088,0.2976194322109222,0.3014707863330841,0.3048252463340759,0.3039425611495971,0.303354948759079,0.3027459383010864,0.2999922931194305,0.3050121665000915,0.2998814284801483,0.2978588044643402,0.3041949570178985,0.3010904192924499,0.3022017180919647,0.2997751235961914,0.3015910983085632,0.3096485137939453,0.3012076020240783,0.3065535724163055,0.3042872548103332,0.3104783594608307,0.2997980415821075,0.3051296770572662,0.303458571434021,0.3088337182998657,0.3145398199558258,0.3032208085060119,0.310806930065155,0.3075874149799347,0.3101692199707031,0.310107946395874,0.3066047430038452,0.3109066784381866,0.3081336915493011,0.3084586262702942,0.3086149394512176,0.3085348606109619,0.3136637806892395,0.3110873103141784,0.31076380610466,0.3084572553634643,0.3133681714534759,0.3125792145729065,0.3124453127384186,0.3097185790538788,0.3106793165206909,0.3089564740657806,0.3111244142055511,0.3123694658279419,0.3144859969615936,0.3135123550891876,0.311982125043869,0.3142133951187134,0.3122903704643249,0.3147654831409454,0.3078767359256744,0.314947634935379,0.3171303570270538,0.3129573762416839,0.3154936134815216,0.3158208429813385,0.3153132200241089,0.3141326904296875,0.3163397014141083,0.3166318237781524,0.3168410360813141,0.3198235332965851,0.3201336860656738,0.3212967813014984,0.3191385567188263,0.3178017139434814,0.3192791938781738,0.323061466217041,0.320336639881134,0.3165886104106903,0.3206393420696258,0.3167395293712616,0.3135207295417785,0.315539002418518,0.3191742599010467,0.321073055267334,0.3222262561321258,0.3193058371543884,0.3213480710983276,0.3198905289173126,0.3219239711761474,0.3211614489555359,0.318855881690979,0.3177095353603363,0.324197381734848,0.3208906352519989,0.3264936804771423,0.3245965242385864,0.3231639564037323,0.3221887946128845,0.3277338445186615,0.3227696120738983,0.3263820111751556,0.3258577883243561,0.3264622390270233,0.3222362995147705,0.3286814987659454,0.3235024213790893,0.32446950674057,0.3311836123466491,0.328130304813385,0.3271634578704834,0.3250012993812561,0.3309800624847412,0.3274554014205932,0.3273015916347503,0.3261759579181671,0.32697594165802,0.3303172886371612,0.3282814025878906,0.3289586305618286,0.3260826468467712,0.3258011937141418,0.3297208249568939,0.3254813551902771,0.3287739753723144,0.3287097811698913,0.3275279700756073,0.3293041586875915,0.3314100801944732,0.3287808299064636,0.3251930773258209,0.3288172781467438,0.3265027701854706,0.3275215625762939,0.3290774822235107,0.3261331617832184,0.3299777805805206,0.331955999135971,0.3305029273033142,0.3274719417095184,0.3235560953617096,0.3269940316677093,0.3323083519935608],"label":"RefinedWeb"},"big-run-fineweb-cross-dedup-fixed":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,29.360128000000003,31.45728,33.554432000000006,35.651584,37.748736,39.845888,41.94304,44.040192000000005,46.137344000000006,48.234496,50.331648,52.4288,54.525952000000004,56.623104000000005,58.720256000000006,60.817408,62.91456,65.011712,67.10886400000001,69.206016,71.303168,73.40032000000001,75.497472,77.59462400000001,79.691776,81.788928,83.88608,85.983232,88.08038400000001,90.177536,92.27468800000001,94.37184,96.468992,98.56614400000001,100.663296,102.76044800000001,104.8576,106.95475200000001,109.05190400000001,111.149056,113.24620800000001,115.34336,117.44051200000001,119.537664,121.634816,123.73196800000001,125.82912,127.92627200000001,130.023424,132.120576,134.21772800000002,136.31488000000002,138.412032,140.509184,142.606336,144.70348800000002,146.80064000000002,148.897792,150.994944,153.092096,155.18924800000002,157.28640000000001,159.383552,161.480704,163.577856,165.67500800000002,167.77216,169.869312,171.966464,174.06361600000002,176.16076800000002,178.25792,180.355072,182.452224,184.54937600000002,186.64652800000002,188.74368,190.840832,192.937984,195.03513600000002,197.13228800000002,199.22944,201.326592,203.423744,205.52089600000002,207.61804800000002,209.7152,211.812352,213.90950400000003,216.00665600000002,218.10380800000001,220.20096,222.298112,224.39526400000003,226.49241600000002,228.589568,230.68672,232.783872,234.88102400000002,236.97817600000002,239.075328,241.17248,243.269632,245.36678400000002,247.46393600000002,249.561088,251.65824,253.75539200000003,255.85254400000002,257.949696,260.046848,262.144,264.241152,266.338304,268.43545600000004,270.53260800000004,272.62976000000003,274.726912,276.824064,278.921216,281.018368,283.11552,285.212672,287.309824,289.40697600000004,291.50412800000004,293.60128000000003,295.698432,297.795584,299.892736,301.989888,304.08704,306.184192,308.28134400000005,310.37849600000004,312.47564800000004,314.57280000000003,316.669952,318.767104,320.864256,322.961408,325.05856,327.155712,329.25286400000005,331.35001600000004,333.44716800000003,335.54432,337.641472,339.738624,341.835776,343.932928,346.03008,348.12723200000005,350.22438400000004],"y":[0.2501466572284698,0.2510619163513183,0.2621481418609619,0.2632303833961487,0.2720474302768707,0.2719806432723999,0.2726832032203674,0.2786827087402344,0.2823672890663147,0.276201844215393,0.2816944718360901,0.280361145734787,0.2819306254386902,0.2823295891284942,0.2892518043518066,0.2872919738292694,0.2859259247779846,0.2885263860225677,0.2862614393234253,0.2933129370212555,0.2930494546890259,0.2884900867938995,0.2942298054695129,0.2927677929401397,0.2954220175743103,0.2918704748153686,0.2943699061870575,0.2891678512096405,0.291848212480545,0.2942944765090942,0.2973679602146148,0.2953736186027527,0.2963412702083587,0.297100305557251,0.2963026762008667,0.2944463491439819,0.2971296310424804,0.293870210647583,0.2982682287693023,0.2978119254112243,0.2989997565746307,0.2993503510951996,0.298117071390152,0.2977498769760132,0.3004056811332702,0.3012634217739105,0.3001384139060974,0.3052266240119934,0.3038219809532165,0.3037647306919098,0.3009455502033233,0.3038812279701233,0.303263396024704,0.3025077581405639,0.3056069612503052,0.3024908602237701,0.3050909340381622,0.3001562356948852,0.303833544254303,0.3019777834415436,0.3036664128303528,0.3022894859313965,0.3042722940444946,0.3023003339767456,0.3069425821304321,0.307883083820343,0.3026910126209259,0.3054113090038299,0.3046148121356964,0.305342435836792,0.3048149049282074,0.3066973984241485,0.3055126965045929,0.3063409924507141,0.307701051235199,0.3075169324874878,0.3091190159320831,0.3098153173923492,0.31436288356781,0.3096509575843811,0.3022815883159637,0.3119745552539825,0.3083471357822418,0.3085280954837799,0.3082001209259033,0.3080264329910278,0.3116717934608459,0.3097788393497467,0.3117353916168213,0.3170038759708404,0.3099159002304077,0.3133728504180908,0.3161626160144806,0.3095119595527649,0.3135432302951813,0.3103009164333343,0.3126655519008636,0.3121814131736755,0.3123973608016968,0.3148256838321686,0.3144133985042572,0.3124284744262695,0.3102188408374786,0.3123636841773987,0.3115113973617553,0.3151636719703674,0.3148572146892547,0.315061867237091,0.3127182424068451,0.3139308094978332,0.3134367167949676,0.3136025071144104,0.3172793388366699,0.3134761154651642,0.3109587132930755,0.3127998411655426,0.3161843717098236,0.3163313865661621,0.3145243525505066,0.3155156075954437,0.3127505779266357,0.3182451128959656,0.3162476718425751,0.3124897480010986,0.3128789663314819,0.3119811117649078,0.314126193523407,0.3136049509048462,0.3149912655353546,0.3146650791168213,0.3151968121528625,0.3179666996002197,0.3169245719909668,0.3202513754367828,0.3185319602489471,0.3202781081199646,0.3186031281948089,0.3166128396987915,0.3199457228183746,0.3194417059421539,0.3170624077320099,0.3184532523155212,0.3191981911659241,0.3191225528717041,0.3173209130764007,0.3195607960224151,0.3166368305683136,0.3188160359859466,0.3174867630004883,0.3184468746185303,0.3211863338947296,0.3184327483177185,0.3177861273288727,0.3180214762687683,0.3194973170757293,0.3212297558784485,0.3211282789707184,0.3200584352016449,0.3168685734272003,0.3211040198802948,0.3222841620445251,0.3196901082992553,0.3236229419708252,0.3204475045204162,0.3210069537162781,0.3191083669662475,0.31863734126091,0.3195922076702118],"label":"FineWeb full MinHash"},"big-run-sampled_full_filtered_no_dedup":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002,29.360128000000003,31.45728,33.554432000000006,35.651584,37.748736,39.845888,41.94304,44.040192000000005,46.137344000000006,48.234496,50.331648,52.4288,54.525952000000004,56.623104000000005,58.720256000000006,60.817408,62.91456,65.011712,67.10886400000001,69.206016,71.303168,73.40032000000001,75.497472,77.59462400000001,79.691776,81.788928,83.88608,85.983232,88.08038400000001,90.177536,92.27468800000001,94.37184,96.468992,98.56614400000001,100.663296,102.76044800000001,104.8576,106.95475200000001,109.05190400000001,111.149056,113.24620800000001,115.34336,117.44051200000001,119.537664,121.634816,123.73196800000001,125.82912,127.92627200000001,130.023424,132.120576,134.21772800000002,136.31488000000002,138.412032,140.509184,142.606336,144.70348800000002,146.80064000000002,148.897792,150.994944,153.092096,155.18924800000002,157.28640000000001,159.383552,161.480704,163.577856,165.67500800000002,167.77216,169.869312,171.966464,174.06361600000002,176.16076800000002,178.25792,180.355072,182.452224,184.54937600000002,186.64652800000002,188.74368,190.840832,192.937984,195.03513600000002,197.13228800000002,199.22944,201.326592,203.423744,205.52089600000002,207.61804800000002,209.7152,211.812352,213.90950400000003,216.00665600000002,218.10380800000001,220.20096,222.298112,224.39526400000003,226.49241600000002,228.589568,230.68672,232.783872,234.88102400000002,236.97817600000002,239.075328,241.17248,243.269632,245.36678400000002,247.46393600000002,249.561088,251.65824,253.75539200000003,255.85254400000002,257.949696,260.046848,262.144,264.241152,266.338304,268.43545600000004,270.53260800000004,272.62976000000003,274.726912,276.824064,278.921216,281.018368,283.11552,285.212672,287.309824,289.40697600000004,291.50412800000004,293.60128000000003,295.698432,297.795584,299.892736,301.989888,304.08704,306.184192,308.28134400000005,310.37849600000004,312.47564800000004,314.57280000000003,316.669952,318.767104,320.864256,322.961408,325.05856,327.155712,329.25286400000005,331.35001600000004,333.44716800000003,335.54432,337.641472,339.738624,341.835776,343.932928,346.03008,348.12723200000005,350.22438400000004],"y":[0.2501466572284698,0.2516599297523498,0.2610189318656921,0.2666046619415283,0.2667981088161468,0.2667821645736694,0.2708088159561157,0.2738403379917145,0.2726235687732696,0.2762763500213623,0.2768311202526092,0.2809228301048279,0.2836140990257263,0.2822815179824829,0.2831664383411407,0.2797218561172485,0.286342591047287,0.2855269610881805,0.2847287058830261,0.2888180613517761,0.286526083946228,0.2865165770053863,0.294582188129425,0.2925947606563568,0.2947863042354584,0.2892930805683136,0.2903610467910766,0.288201242685318,0.2873396277427673,0.2916238009929657,0.2908017039299011,0.2907920777797699,0.2952797412872314,0.2941452264785766,0.2921333611011505,0.2925891280174255,0.2968584895133972,0.2980035543441772,0.2964116632938385,0.2962304651737213,0.2950254380702972,0.2977516651153564,0.2944138348102569,0.3003402054309845,0.2976303696632385,0.3013098239898681,0.302829384803772,0.3018766045570373,0.305361807346344,0.2971298694610595,0.3014816343784332,0.3019805550575256,0.3037064969539642,0.2970167994499206,0.2995208501815796,0.2970106601715088,0.2990955114364624,0.3027818500995636,0.3048534691333771,0.2993872463703155,0.2986327707767486,0.3015393316745758,0.3003426790237427,0.3003274798393249,0.3017795085906982,0.3019182682037353,0.3015450537204742,0.3046211004257202,0.3031167984008789,0.3020436763763428,0.3011128306388855,0.3029948472976684,0.3045558631420135,0.301642894744873,0.3029441833496094,0.3035804331302643,0.3004390001296997,0.3021787703037262,0.306041270494461,0.3064048886299133,0.3087956011295318,0.3070018291473388,0.3065581619739532,0.3093871772289276,0.3060930073261261,0.3033313155174255,0.3072777390480041,0.306413859128952,0.3104493916034698,0.3056999444961548,0.3077532052993774,0.309231549501419,0.3070645034313202,0.3117790520191192,0.3114112913608551,0.312661737203598,0.3181777000427246,0.3117201030254364,0.3099702894687652,0.3074746131896972,0.3064963519573211,0.3105958700180053,0.3111456036567688,0.3084964454174042,0.3087405860424042,0.3121673166751861,0.3121528625488281,0.3100416660308838,0.3142979145050049,0.3129935264587402,0.3112611472606659,0.3119436800479889,0.3154115974903106,0.3091593086719513,0.3103814721107483,0.3130497634410858,0.3133455514907837,0.3152708411216736,0.3137963414192199,0.3099324703216553,0.3164172768592834,0.3133907914161682,0.3128255009651184,0.3134104907512665,0.3106969892978668,0.3130004107952118,0.3131391704082489,0.3130116462707519,0.3143952488899231,0.3143975436687469,0.3143710494041443,0.3163396418094635,0.3166862726211548,0.3184126019477844,0.3178988993167877,0.317479133605957,0.3184944093227386,0.316694974899292,0.3176258206367492,0.3182629346847534,0.3200214207172394,0.3181648552417755,0.320680022239685,0.3178716897964477,0.3182425796985626,0.3182984292507171,0.3158398568630218,0.3152642548084259,0.3132680356502533,0.3178914785385132,0.3156660795211792,0.3161703050136566,0.3176451921463012,0.3173815906047821,0.3194171786308288,0.3193057179450989,0.3172560334205627,0.317656546831131,0.3155770003795624,0.3199106156826019,0.3170182108879089,0.3156754970550537,0.3180731236934662,0.3205638229846954,0.3175432682037353,0.3184471428394317,0.3192788958549499,0.3197042346000671,0.3177168369293213],"label":"FineWeb filtered only"},"big-run-sampled_full_ind_minhash":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002,29.360128000000003,31.45728,33.554432000000006,35.651584,37.748736,39.845888,41.94304,44.040192000000005,46.137344000000006,48.234496,50.331648,52.4288,54.525952000000004,56.623104000000005,58.720256000000006,60.817408,62.91456,65.011712,67.10886400000001,69.206016,71.303168,73.40032000000001,75.497472,77.59462400000001,79.691776,81.788928,83.88608,85.983232,88.08038400000001,90.177536,92.27468800000001,94.37184,96.468992,98.56614400000001,100.663296,102.76044800000001,104.8576,106.95475200000001,109.05190400000001,111.149056,113.24620800000001,115.34336,117.44051200000001,119.537664,121.634816,123.73196800000001,125.82912,127.92627200000001,130.023424,132.120576,134.21772800000002,136.31488000000002,138.412032,140.509184,142.606336,144.70348800000002,146.80064000000002,148.897792,150.994944,153.092096,155.18924800000002,157.28640000000001,159.383552,161.480704,163.577856,165.67500800000002,167.77216,169.869312,171.966464,174.06361600000002,176.16076800000002,178.25792,180.355072,182.452224,184.54937600000002,186.64652800000002,188.74368,190.840832,192.937984,195.03513600000002,197.13228800000002,199.22944,201.326592,203.423744,205.52089600000002,207.61804800000002,209.7152,211.812352,213.90950400000003,216.00665600000002,218.10380800000001,220.20096,222.298112,224.39526400000003,226.49241600000002,228.589568,230.68672,232.783872,234.88102400000002,236.97817600000002,239.075328,241.17248,243.269632,245.36678400000002,247.46393600000002,249.561088,251.65824,253.75539200000003,255.85254400000002,257.949696,260.046848,262.144,264.241152,266.338304,268.43545600000004,270.53260800000004,272.62976000000003,274.726912,276.824064,278.921216,281.018368,283.11552,285.212672,287.309824,289.40697600000004,291.50412800000004,293.60128000000003,295.698432,297.795584,299.892736,301.989888,304.08704,306.184192,308.28134400000005,310.37849600000004,312.47564800000004,314.57280000000003,316.669952,318.767104,320.864256,322.961408,325.05856,327.155712,329.25286400000005,331.35001600000004,333.44716800000003,335.54432,337.641472,339.738624,341.835776,343.932928,346.03008,348.12723200000005,350.22438400000004],"y":[0.2501466572284698,0.2558934390544891,0.2618628144264221,0.2683217823505401,0.2699837982654571,0.2738722860813141,0.2744417488574981,0.2740873992443084,0.2807216048240661,0.2820421457290649,0.2891400754451751,0.2879075407981872,0.2881667613983154,0.2892490327358246,0.2882707118988037,0.2935869693756103,0.2870290875434875,0.2911452651023865,0.2949125170707702,0.2916406095027923,0.2981449663639068,0.2953989207744598,0.2946988642215729,0.297021746635437,0.3001497685909271,0.3010218441486358,0.2977036237716675,0.2992585003376007,0.2986803948879242,0.2994338274002075,0.2989781498908996,0.3041955828666687,0.3030496537685394,0.303806334733963,0.3036351203918457,0.3058845102787018,0.300450712442398,0.3025284707546234,0.3072526752948761,0.3039065897464752,0.3073755502700805,0.3070493042469024,0.3083153367042541,0.3123056292533874,0.307761400938034,0.3053378164768219,0.3116358816623688,0.3080427348613739,0.308482676744461,0.307318776845932,0.3083004653453827,0.3089516758918762,0.3088736236095428,0.3077724277973175,0.3126304149627685,0.3101697862148285,0.3159398734569549,0.314792275428772,0.3103811144828796,0.3111368715763092,0.3129658997058868,0.311605304479599,0.3118223249912262,0.3133279979228973,0.3146496713161468,0.3195074200630188,0.3142614662647247,0.3125102519989013,0.3115333616733551,0.3183117806911468,0.3168580532073974,0.3187012672424316,0.3179306983947754,0.3157722651958465,0.3214826583862304,0.3145081698894501,0.3172421753406524,0.3151432573795318,0.3181649446487427,0.3180212080478668,0.3171605765819549,0.3212067782878876,0.3180184066295624,0.3209905624389648,0.319052129983902,0.3212707936763763,0.3196887373924255,0.3188316226005554,0.3164899051189422,0.3241994678974151,0.3179469406604767,0.3214083909988403,0.3206575512886047,0.3263285160064697,0.3219505250453949,0.3181525468826294,0.3219776451587677,0.3259726762771606,0.3197665512561798,0.3236161768436432,0.3177970349788666,0.3258080780506134,0.3208407461643219,0.3251138925552368,0.3242645859718323,0.3229723274707794,0.3227455914020538,0.3206316232681274,0.3256695866584778,0.3241210877895355,0.3224890530109405,0.3263737261295318,0.3214233517646789,0.3240345120429992,0.3222567737102508,0.3242291808128357,0.3257078528404236,0.3278365731239319,0.3277338743209839,0.3253948092460632,0.3232105076313019,0.3267974853515625,0.3263654410839081,0.3262891769409179,0.3238334357738495,0.3294911682605743,0.3261866867542267,0.3243315815925598,0.3250119090080261,0.326727420091629,0.3268802464008331,0.3269768059253692,0.3257980346679687,0.3280686736106872,0.3274897634983063,0.3282252252101898,0.3272863030433655,0.328346699476242,0.325562834739685,0.3301684856414795,0.3284023404121399,0.3268299400806427,0.3286610245704651,0.3291078805923462,0.324972927570343,0.3314772248268127,0.3278062343597412,0.326839417219162,0.3277239501476288,0.330414742231369,0.3271744549274444,0.3279334008693695,0.3288575112819671,0.3285425007343292,0.3282454907894134,0.3296376466751098,0.3305942714214325,0.3276287615299225,0.3292438983917236,0.329515129327774,0.3281475007534027,0.3282177448272705,0.3333999514579773,0.3302631080150604,0.330238401889801,0.3323166668415069,0.3313035368919372,0.32961106300354,0.3321967124938965],"label":"FineWeb independent MinHash"}},"layout":{"xaxis":{"title":{"text":"Training tokens (billions)"}},"yaxis":{"title":{"text":"Agg Score"}},"title":{"text":"Independent dedup outperforms dedup across dumps"}}}
assets/data/plots/cross_ind_unfiltered_comparison/openbookqa_acc_norm.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"data":{"big-run-refinedweb":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002,29.360128000000003,31.45728,33.554432000000006,35.651584,37.748736,39.845888,41.94304,44.040192000000005,46.137344000000006,48.234496,50.331648,52.4288,54.525952000000004,56.623104000000005,58.720256000000006,60.817408,62.91456,65.011712,67.10886400000001,69.206016,71.303168,73.40032000000001,75.497472,77.59462400000001,79.691776,81.788928,83.88608,85.983232,88.08038400000001,90.177536,92.27468800000001,94.37184,96.468992,98.56614400000001,100.663296,102.76044800000001,104.8576,106.95475200000001,109.05190400000001,111.149056,113.24620800000001,115.34336,117.44051200000001,119.537664,121.634816,123.73196800000001,125.82912,127.92627200000001,130.023424,132.120576,134.21772800000002,136.31488000000002,138.412032,140.509184,142.606336,144.70348800000002,146.80064000000002,148.897792,150.994944,153.092096,155.18924800000002,157.28640000000001,159.383552,161.480704,163.577856,165.67500800000002,167.77216,169.869312,171.966464,174.06361600000002,176.16076800000002,178.25792,180.355072,182.452224,184.54937600000002,186.64652800000002,188.74368,190.840832,192.937984,195.03513600000002,197.13228800000002,199.22944,201.326592,203.423744,205.52089600000002,207.61804800000002,209.7152,211.812352,213.90950400000003,216.00665600000002,218.10380800000001,220.20096,222.298112,224.39526400000003,226.49241600000002,228.589568,230.68672,232.783872,234.88102400000002,236.97817600000002,239.075328,241.17248,243.269632,245.36678400000002,247.46393600000002,249.561088,251.65824,253.75539200000003,255.85254400000002,257.949696,260.046848,262.144,264.241152,266.338304,268.43545600000004,270.53260800000004,272.62976000000003,274.726912,276.824064,278.921216,281.018368,283.11552,285.212672,287.309824,289.40697600000004,291.50412800000004,293.60128000000003,295.698432,297.795584,299.892736,301.989888,304.08704,306.184192,308.28134400000005,310.37849600000004,312.47564800000004,314.57280000000003,316.669952,318.767104,320.864256,322.961408,325.05856,327.155712,329.25286400000005,331.35001600000004,333.44716800000003,335.54432,337.641472,339.738624,341.835776,343.932928,346.03008,348.12723200000005,350.22438400000004],"y":[0.2860000133514404,0.2560000121593475,0.2840000092983246,0.3059999942779541,0.3059999942779541,0.2980000078678131,0.3240000009536743,0.3100000023841858,0.3000000119209289,0.3160000145435333,0.3140000104904175,0.3260000050067901,0.3199999928474426,0.2980000078678131,0.3179999887943268,0.3179999887943268,0.3319999873638153,0.3019999861717224,0.2939999997615814,0.3319999873638153,0.3319999873638153,0.3219999969005584,0.3379999995231628,0.3379999995231628,0.3339999914169311,0.3240000009536743,0.3479999899864197,0.3300000131130218,0.3240000009536743,0.3300000131130218,0.3400000035762787,0.3459999859333038,0.3319999873638153,0.3379999995231628,0.356000006198883,0.3339999914169311,0.3459999859333038,0.3440000116825104,0.3519999980926513,0.3479999899864197,0.3339999914169311,0.3400000035762787,0.3479999899864197,0.3379999995231628,0.3479999899864197,0.3499999940395355,0.3400000035762787,0.3499999940395355,0.3420000076293945,0.3659999966621399,0.3400000035762787,0.3459999859333038,0.3499999940395355,0.356000006198883,0.3400000035762787,0.356000006198883,0.3339999914169311,0.3339999914169311,0.3479999899864197,0.3420000076293945,0.3580000102519989,0.3339999914169311,0.3440000116825104,0.3400000035762787,0.3499999940395355,0.3540000021457672,0.3479999899864197,0.3499999940395355,0.3420000076293945,0.3379999995231628,0.335999995470047,0.356000006198883,0.3459999859333038,0.3499999940395355,0.3400000035762787,0.3440000116825104,0.356000006198883,0.3519999980926513,0.3400000035762787,0.3440000116825104,0.356000006198883,0.3400000035762787,0.356000006198883,0.3600000143051147,0.3540000021457672,0.3479999899864197,0.3379999995231628,0.3440000116825104,0.3300000131130218,0.3400000035762787,0.3459999859333038,0.3339999914169311,0.3499999940395355,0.3600000143051147,0.3440000116825104,0.3499999940395355,0.356000006198883,0.3420000076293945,0.3479999899864197,0.3379999995231628,0.3379999995231628,0.3459999859333038,0.356000006198883,0.328000009059906,0.3459999859333038,0.3519999980926513,0.3499999940395355,0.3519999980926513,0.3420000076293945,0.3499999940395355,0.3420000076293945,0.3339999914169311,0.335999995470047,0.3379999995231628,0.3379999995231628,0.3540000021457672,0.356000006198883,0.356000006198883,0.335999995470047,0.363999992609024,0.363999992609024,0.3499999940395355,0.356000006198883,0.3519999980926513,0.3519999980926513,0.3540000021457672,0.3459999859333038,0.3479999899864197,0.3519999980926513,0.3519999980926513,0.3420000076293945,0.3440000116825104,0.3379999995231628,0.3519999980926513,0.356000006198883,0.3420000076293945,0.3580000102519989,0.3499999940395355,0.3619999885559082,0.3519999980926513,0.3600000143051147,0.3459999859333038,0.3519999980926513,0.3519999980926513,0.3499999940395355,0.3580000102519989,0.356000006198883,0.3580000102519989,0.3600000143051147,0.3440000116825104,0.3600000143051147,0.3440000116825104,0.3479999899864197,0.3479999899864197,0.3580000102519989,0.3600000143051147,0.3580000102519989,0.3540000021457672,0.3519999980926513,0.3459999859333038,0.3459999859333038,0.3540000021457672,0.335999995470047,0.3540000021457672,0.3540000021457672,0.3519999980926513,0.356000006198883,0.3499999940395355,0.356000006198883],"label":"RefinedWeb"},"big-run-fineweb-cross-dedup-fixed":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,29.360128000000003,31.45728,33.554432000000006,35.651584,37.748736,39.845888,41.94304,44.040192000000005,46.137344000000006,48.234496,50.331648,52.4288,54.525952000000004,56.623104000000005,58.720256000000006,60.817408,62.91456,65.011712,67.10886400000001,69.206016,71.303168,73.40032000000001,75.497472,77.59462400000001,79.691776,81.788928,83.88608,85.983232,88.08038400000001,90.177536,92.27468800000001,94.37184,96.468992,98.56614400000001,100.663296,102.76044800000001,104.8576,106.95475200000001,109.05190400000001,111.149056,113.24620800000001,115.34336,117.44051200000001,119.537664,121.634816,123.73196800000001,125.82912,127.92627200000001,130.023424,132.120576,134.21772800000002,136.31488000000002,138.412032,140.509184,142.606336,144.70348800000002,146.80064000000002,148.897792,150.994944,153.092096,155.18924800000002,157.28640000000001,159.383552,161.480704,163.577856,165.67500800000002,167.77216,169.869312,171.966464,174.06361600000002,176.16076800000002,178.25792,180.355072,182.452224,184.54937600000002,186.64652800000002,188.74368,190.840832,192.937984,195.03513600000002,197.13228800000002,199.22944,201.326592,203.423744,205.52089600000002,207.61804800000002,209.7152,211.812352,213.90950400000003,216.00665600000002,218.10380800000001,220.20096,222.298112,224.39526400000003,226.49241600000002,228.589568,230.68672,232.783872,234.88102400000002,236.97817600000002,239.075328,241.17248,243.269632,245.36678400000002,247.46393600000002,249.561088,251.65824,253.75539200000003,255.85254400000002,257.949696,260.046848,262.144,264.241152,266.338304,268.43545600000004,270.53260800000004,272.62976000000003,274.726912,276.824064,278.921216,281.018368,283.11552,285.212672,287.309824,289.40697600000004,291.50412800000004,293.60128000000003,295.698432,297.795584,299.892736,301.989888,304.08704,306.184192,308.28134400000005,310.37849600000004,312.47564800000004,314.57280000000003,316.669952,318.767104,320.864256,322.961408,325.05856,327.155712,329.25286400000005,331.35001600000004,333.44716800000003,335.54432,337.641472,339.738624,341.835776,343.932928,346.03008,348.12723200000005,350.22438400000004],"y":[0.2860000133514404,0.2460000067949295,0.2720000147819519,0.270000010728836,0.2939999997615814,0.2960000038146972,0.3240000009536743,0.3019999861717224,0.2879999876022339,0.3179999887943268,0.3059999942779541,0.2899999916553497,0.3100000023841858,0.3179999887943268,0.3219999969005584,0.3219999969005584,0.3300000131130218,0.3140000104904175,0.3240000009536743,0.3079999983310699,0.3260000050067901,0.3120000064373016,0.3160000145435333,0.3179999887943268,0.3260000050067901,0.3260000050067901,0.3240000009536743,0.3379999995231628,0.3219999969005584,0.3319999873638153,0.3379999995231628,0.3339999914169311,0.328000009059906,0.3319999873638153,0.3199999928474426,0.3000000119209289,0.3260000050067901,0.3240000009536743,0.328000009059906,0.3240000009536743,0.328000009059906,0.3260000050067901,0.3440000116825104,0.3199999928474426,0.3319999873638153,0.3219999969005584,0.335999995470047,0.3519999980926513,0.3379999995231628,0.328000009059906,0.3300000131130218,0.335999995470047,0.3479999899864197,0.3459999859333038,0.3479999899864197,0.3540000021457672,0.3479999899864197,0.3300000131130218,0.356000006198883,0.3479999899864197,0.356000006198883,0.335999995470047,0.335999995470047,0.3479999899864197,0.3339999914169311,0.3540000021457672,0.3300000131130218,0.3479999899864197,0.3499999940395355,0.3400000035762787,0.3459999859333038,0.3339999914169311,0.3479999899864197,0.335999995470047,0.3400000035762787,0.3179999887943268,0.335999995470047,0.328000009059906,0.328000009059906,0.3540000021457672,0.3479999899864197,0.3420000076293945,0.3580000102519989,0.3459999859333038,0.3420000076293945,0.3459999859333038,0.3440000116825104,0.3499999940395355,0.335999995470047,0.3540000021457672,0.356000006198883,0.3400000035762787,0.3600000143051147,0.3580000102519989,0.3519999980926513,0.3499999940395355,0.3540000021457672,0.3519999980926513,0.3499999940395355,0.3440000116825104,0.356000006198883,0.3479999899864197,0.3479999899864197,0.3440000116825104,0.3499999940395355,0.3440000116825104,0.3519999980926513,0.3440000116825104,0.356000006198883,0.3459999859333038,0.3580000102519989,0.356000006198883,0.3519999980926513,0.3420000076293945,0.3379999995231628,0.3479999899864197,0.3459999859333038,0.3499999940395355,0.3400000035762787,0.3440000116825104,0.3420000076293945,0.3420000076293945,0.3499999940395355,0.3459999859333038,0.3420000076293945,0.3459999859333038,0.3459999859333038,0.3479999899864197,0.3440000116825104,0.3720000088214874,0.3619999885559082,0.356000006198883,0.3519999980926513,0.3459999859333038,0.3440000116825104,0.3420000076293945,0.3580000102519989,0.3600000143051147,0.3519999980926513,0.3600000143051147,0.3440000116825104,0.3600000143051147,0.3619999885559082,0.3499999940395355,0.3499999940395355,0.363999992609024,0.3580000102519989,0.3499999940395355,0.3479999899864197,0.3479999899864197,0.3580000102519989,0.3540000021457672,0.3600000143051147,0.3420000076293945,0.3519999980926513,0.3440000116825104,0.3519999980926513,0.3540000021457672,0.356000006198883,0.3459999859333038,0.3499999940395355,0.3519999980926513,0.3580000102519989,0.3440000116825104,0.3499999940395355,0.3580000102519989,0.3479999899864197,0.3479999899864197],"label":"FineWeb full MinHash"},"big-run-sampled_full_filtered_no_dedup":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002,29.360128000000003,31.45728,33.554432000000006,35.651584,37.748736,39.845888,41.94304,44.040192000000005,46.137344000000006,48.234496,50.331648,52.4288,54.525952000000004,56.623104000000005,58.720256000000006,60.817408,62.91456,65.011712,67.10886400000001,69.206016,71.303168,73.40032000000001,75.497472,77.59462400000001,79.691776,81.788928,83.88608,85.983232,88.08038400000001,90.177536,92.27468800000001,94.37184,96.468992,98.56614400000001,100.663296,102.76044800000001,104.8576,106.95475200000001,109.05190400000001,111.149056,113.24620800000001,115.34336,117.44051200000001,119.537664,121.634816,123.73196800000001,125.82912,127.92627200000001,130.023424,132.120576,134.21772800000002,136.31488000000002,138.412032,140.509184,142.606336,144.70348800000002,146.80064000000002,148.897792,150.994944,153.092096,155.18924800000002,157.28640000000001,159.383552,161.480704,163.577856,165.67500800000002,167.77216,169.869312,171.966464,174.06361600000002,176.16076800000002,178.25792,180.355072,182.452224,184.54937600000002,186.64652800000002,188.74368,190.840832,192.937984,195.03513600000002,197.13228800000002,199.22944,201.326592,203.423744,205.52089600000002,207.61804800000002,209.7152,211.812352,213.90950400000003,216.00665600000002,218.10380800000001,220.20096,222.298112,224.39526400000003,226.49241600000002,228.589568,230.68672,232.783872,234.88102400000002,236.97817600000002,239.075328,241.17248,243.269632,245.36678400000002,247.46393600000002,249.561088,251.65824,253.75539200000003,255.85254400000002,257.949696,260.046848,262.144,264.241152,266.338304,268.43545600000004,270.53260800000004,272.62976000000003,274.726912,276.824064,278.921216,281.018368,283.11552,285.212672,287.309824,289.40697600000004,291.50412800000004,293.60128000000003,295.698432,297.795584,299.892736,301.989888,304.08704,306.184192,308.28134400000005,310.37849600000004,312.47564800000004,314.57280000000003,316.669952,318.767104,320.864256,322.961408,325.05856,327.155712,329.25286400000005,331.35001600000004,333.44716800000003,335.54432,337.641472,339.738624,341.835776,343.932928,346.03008,348.12723200000005,350.22438400000004],"y":[0.2860000133514404,0.2560000121593475,0.2720000147819519,0.2980000078678131,0.2840000092983246,0.2879999876022339,0.3039999902248382,0.2860000133514404,0.2899999916553497,0.3019999861717224,0.2960000038146972,0.3039999902248382,0.3100000023841858,0.3160000145435333,0.3260000050067901,0.3160000145435333,0.3260000050067901,0.3179999887943268,0.3420000076293945,0.3219999969005584,0.328000009059906,0.3240000009536743,0.3300000131130218,0.328000009059906,0.3199999928474426,0.3379999995231628,0.3400000035762787,0.3240000009536743,0.3120000064373016,0.3319999873638153,0.3260000050067901,0.3120000064373016,0.3160000145435333,0.3140000104904175,0.3179999887943268,0.3160000145435333,0.3199999928474426,0.3240000009536743,0.3260000050067901,0.3179999887943268,0.3300000131130218,0.3179999887943268,0.328000009059906,0.3240000009536743,0.328000009059906,0.3260000050067901,0.3199999928474426,0.3400000035762787,0.3339999914169311,0.328000009059906,0.328000009059906,0.3339999914169311,0.328000009059906,0.328000009059906,0.335999995470047,0.3580000102519989,0.3499999940395355,0.3260000050067901,0.3499999940395355,0.3420000076293945,0.3160000145435333,0.3339999914169311,0.335999995470047,0.3400000035762787,0.3240000009536743,0.3319999873638153,0.3379999995231628,0.3400000035762787,0.3379999995231628,0.3319999873638153,0.3319999873638153,0.3440000116825104,0.3300000131130218,0.3219999969005584,0.3260000050067901,0.3219999969005584,0.3339999914169311,0.328000009059906,0.3300000131130218,0.3219999969005584,0.3379999995231628,0.3400000035762787,0.3319999873638153,0.328000009059906,0.3440000116825104,0.3339999914169311,0.328000009059906,0.3379999995231628,0.3499999940395355,0.3339999914169311,0.3300000131130218,0.328000009059906,0.335999995470047,0.3240000009536743,0.335999995470047,0.3240000009536743,0.3400000035762787,0.3400000035762787,0.3420000076293945,0.3319999873638153,0.3339999914169311,0.3300000131130218,0.3400000035762787,0.3459999859333038,0.3400000035762787,0.3379999995231628,0.3459999859333038,0.3379999995231628,0.3300000131130218,0.3519999980926513,0.3379999995231628,0.356000006198883,0.335999995470047,0.3420000076293945,0.3400000035762787,0.328000009059906,0.3540000021457672,0.3499999940395355,0.3479999899864197,0.3440000116825104,0.3519999980926513,0.356000006198883,0.3540000021457672,0.3440000116825104,0.3499999940395355,0.356000006198883,0.356000006198883,0.356000006198883,0.363999992609024,0.3600000143051147,0.356000006198883,0.3479999899864197,0.356000006198883,0.3459999859333038,0.3479999899864197,0.3619999885559082,0.363999992609024,0.3499999940395355,0.3379999995231628,0.3479999899864197,0.3499999940395355,0.356000006198883,0.3519999980926513,0.3540000021457672,0.3619999885559082,0.3580000102519989,0.3540000021457672,0.356000006198883,0.3479999899864197,0.3519999980926513,0.356000006198883,0.3499999940395355,0.3379999995231628,0.3479999899864197,0.3499999940395355,0.3440000116825104,0.3580000102519989,0.356000006198883,0.3499999940395355,0.3479999899864197,0.3580000102519989,0.3519999980926513,0.3540000021457672,0.3519999980926513,0.3540000021457672,0.356000006198883,0.363999992609024,0.356000006198883,0.356000006198883],"label":"FineWeb filtered only"},"big-run-sampled_full_ind_minhash":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002,29.360128000000003,31.45728,33.554432000000006,35.651584,37.748736,39.845888,41.94304,44.040192000000005,46.137344000000006,48.234496,50.331648,52.4288,54.525952000000004,56.623104000000005,58.720256000000006,60.817408,62.91456,65.011712,67.10886400000001,69.206016,71.303168,73.40032000000001,75.497472,77.59462400000001,79.691776,81.788928,83.88608,85.983232,88.08038400000001,90.177536,92.27468800000001,94.37184,96.468992,98.56614400000001,100.663296,102.76044800000001,104.8576,106.95475200000001,109.05190400000001,111.149056,113.24620800000001,115.34336,117.44051200000001,119.537664,121.634816,123.73196800000001,125.82912,127.92627200000001,130.023424,132.120576,134.21772800000002,136.31488000000002,138.412032,140.509184,142.606336,144.70348800000002,146.80064000000002,148.897792,150.994944,153.092096,155.18924800000002,157.28640000000001,159.383552,161.480704,163.577856,165.67500800000002,167.77216,169.869312,171.966464,174.06361600000002,176.16076800000002,178.25792,180.355072,182.452224,184.54937600000002,186.64652800000002,188.74368,190.840832,192.937984,195.03513600000002,197.13228800000002,199.22944,201.326592,203.423744,205.52089600000002,207.61804800000002,209.7152,211.812352,213.90950400000003,216.00665600000002,218.10380800000001,220.20096,222.298112,224.39526400000003,226.49241600000002,228.589568,230.68672,232.783872,234.88102400000002,236.97817600000002,239.075328,241.17248,243.269632,245.36678400000002,247.46393600000002,249.561088,251.65824,253.75539200000003,255.85254400000002,257.949696,260.046848,262.144,264.241152,266.338304,268.43545600000004,270.53260800000004,272.62976000000003,274.726912,276.824064,278.921216,281.018368,283.11552,285.212672,287.309824,289.40697600000004,291.50412800000004,293.60128000000003,295.698432,297.795584,299.892736,301.989888,304.08704,306.184192,308.28134400000005,310.37849600000004,312.47564800000004,314.57280000000003,316.669952,318.767104,320.864256,322.961408,325.05856,327.155712,329.25286400000005,331.35001600000004,333.44716800000003,335.54432,337.641472,339.738624,341.835776,343.932928,346.03008,348.12723200000005,350.22438400000004],"y":[0.2860000133514404,0.2660000026226043,0.277999997138977,0.2820000052452087,0.3079999983310699,0.3140000104904175,0.3260000050067901,0.3039999902248382,0.3319999873638153,0.3240000009536743,0.3199999928474426,0.3379999995231628,0.3339999914169311,0.3319999873638153,0.3179999887943268,0.3319999873638153,0.3219999969005584,0.3319999873638153,0.3379999995231628,0.3199999928474426,0.3179999887943268,0.3400000035762787,0.3219999969005584,0.335999995470047,0.3339999914169311,0.3420000076293945,0.3240000009536743,0.3440000116825104,0.3420000076293945,0.3379999995231628,0.3459999859333038,0.328000009059906,0.3420000076293945,0.3459999859333038,0.3479999899864197,0.3379999995231628,0.356000006198883,0.3379999995231628,0.3440000116825104,0.3400000035762787,0.3379999995231628,0.3499999940395355,0.3540000021457672,0.3479999899864197,0.3479999899864197,0.3440000116825104,0.3459999859333038,0.3440000116825104,0.3519999980926513,0.356000006198883,0.3600000143051147,0.3379999995231628,0.356000006198883,0.3400000035762787,0.3519999980926513,0.3479999899864197,0.3479999899864197,0.3400000035762787,0.3459999859333038,0.3519999980926513,0.3440000116825104,0.3400000035762787,0.356000006198883,0.3420000076293945,0.356000006198883,0.3540000021457672,0.3600000143051147,0.3339999914169311,0.3499999940395355,0.3580000102519989,0.3440000116825104,0.3479999899864197,0.3580000102519989,0.3519999980926513,0.3339999914169311,0.3540000021457672,0.3459999859333038,0.3459999859333038,0.3400000035762787,0.356000006198883,0.356000006198883,0.3420000076293945,0.3420000076293945,0.3400000035762787,0.3479999899864197,0.3519999980926513,0.3319999873638153,0.3580000102519989,0.356000006198883,0.356000006198883,0.3499999940395355,0.3479999899864197,0.3400000035762787,0.3440000116825104,0.3339999914169311,0.3379999995231628,0.3479999899864197,0.3680000007152557,0.3619999885559082,0.3440000116825104,0.3619999885559082,0.3580000102519989,0.356000006198883,0.3600000143051147,0.3519999980926513,0.3519999980926513,0.3459999859333038,0.3540000021457672,0.3600000143051147,0.356000006198883,0.3540000021457672,0.3519999980926513,0.356000006198883,0.3600000143051147,0.3540000021457672,0.3540000021457672,0.363999992609024,0.3580000102519989,0.3680000007152557,0.3580000102519989,0.356000006198883,0.3519999980926513,0.3519999980926513,0.3519999980926513,0.3459999859333038,0.3499999940395355,0.356000006198883,0.3540000021457672,0.3540000021457672,0.3659999966621399,0.3619999885559082,0.3420000076293945,0.363999992609024,0.3580000102519989,0.3619999885559082,0.3759999871253967,0.3740000128746032,0.363999992609024,0.3580000102519989,0.3700000047683716,0.3700000047683716,0.363999992609024,0.3440000116825104,0.3580000102519989,0.3680000007152557,0.3700000047683716,0.3740000128746032,0.3619999885559082,0.3619999885559082,0.3700000047683716,0.363999992609024,0.363999992609024,0.363999992609024,0.3700000047683716,0.3600000143051147,0.3680000007152557,0.363999992609024,0.3659999966621399,0.363999992609024,0.3680000007152557,0.3580000102519989,0.363999992609024,0.3659999966621399,0.363999992609024,0.3580000102519989,0.3600000143051147,0.3600000143051147,0.3580000102519989,0.3600000143051147],"label":"FineWeb independent MinHash"}},"layout":{"xaxis":{"title":{"text":"Training tokens (billions)"}},"yaxis":{"title":{"text":"Agg Score"}},"title":{"text":"Independent dedup outperforms dedup across dumps"}}}
assets/data/plots/cross_ind_unfiltered_comparison/piqa_acc_norm.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"data":{"big-run-refinedweb":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002,29.360128000000003,31.45728,33.554432000000006,35.651584,37.748736,39.845888,41.94304,44.040192000000005,46.137344000000006,48.234496,50.331648,52.4288,54.525952000000004,56.623104000000005,58.720256000000006,60.817408,62.91456,65.011712,67.10886400000001,69.206016,71.303168,73.40032000000001,75.497472,77.59462400000001,79.691776,81.788928,83.88608,85.983232,88.08038400000001,90.177536,92.27468800000001,94.37184,96.468992,98.56614400000001,100.663296,102.76044800000001,104.8576,106.95475200000001,109.05190400000001,111.149056,113.24620800000001,115.34336,117.44051200000001,119.537664,121.634816,123.73196800000001,125.82912,127.92627200000001,130.023424,132.120576,134.21772800000002,136.31488000000002,138.412032,140.509184,142.606336,144.70348800000002,146.80064000000002,148.897792,150.994944,153.092096,155.18924800000002,157.28640000000001,159.383552,161.480704,163.577856,165.67500800000002,167.77216,169.869312,171.966464,174.06361600000002,176.16076800000002,178.25792,180.355072,182.452224,184.54937600000002,186.64652800000002,188.74368,190.840832,192.937984,195.03513600000002,197.13228800000002,199.22944,201.326592,203.423744,205.52089600000002,207.61804800000002,209.7152,211.812352,213.90950400000003,216.00665600000002,218.10380800000001,220.20096,222.298112,224.39526400000003,226.49241600000002,228.589568,230.68672,232.783872,234.88102400000002,236.97817600000002,239.075328,241.17248,243.269632,245.36678400000002,247.46393600000002,249.561088,251.65824,253.75539200000003,255.85254400000002,257.949696,260.046848,262.144,264.241152,266.338304,268.43545600000004,270.53260800000004,272.62976000000003,274.726912,276.824064,278.921216,281.018368,283.11552,285.212672,287.309824,289.40697600000004,291.50412800000004,293.60128000000003,295.698432,297.795584,299.892736,301.989888,304.08704,306.184192,308.28134400000005,310.37849600000004,312.47564800000004,314.57280000000003,316.669952,318.767104,320.864256,322.961408,325.05856,327.155712,329.25286400000005,331.35001600000004,333.44716800000003,335.54432,337.641472,339.738624,341.835776,343.932928,346.03008,348.12723200000005,350.22438400000004],"y":[0.5099999904632568,0.6019999980926514,0.652999997138977,0.6710000038146973,0.6740000247955322,0.6899999976158142,0.6919999718666077,0.6909999847412109,0.7070000171661377,0.7089999914169312,0.7129999995231628,0.7229999899864197,0.7120000123977661,0.7200000286102295,0.7300000190734863,0.7279999852180481,0.7369999885559082,0.7390000224113464,0.7350000143051147,0.7319999933242798,0.7279999852180481,0.7269999980926514,0.7459999918937683,0.7400000095367432,0.7390000224113464,0.7319999933242798,0.7390000224113464,0.7379999756813049,0.7390000224113464,0.7360000014305115,0.7440000176429749,0.7400000095367432,0.7360000014305115,0.7480000257492065,0.7360000014305115,0.7440000176429749,0.7459999918937683,0.7409999966621399,0.746999979019165,0.7440000176429749,0.7450000047683716,0.753000020980835,0.7390000224113464,0.7490000128746033,0.7419999837875366,0.7390000224113464,0.7559999823570251,0.7519999742507935,0.7549999952316284,0.7419999837875366,0.7490000128746033,0.7540000081062317,0.7480000257492065,0.7450000047683716,0.7429999709129333,0.7509999871253967,0.7549999952316284,0.7490000128746033,0.7490000128746033,0.7400000095367432,0.753000020980835,0.75,0.7509999871253967,0.7570000290870667,0.7590000033378601,0.7570000290870667,0.7329999804496765,0.7540000081062317,0.746999979019165,0.7409999966621399,0.7590000033378601,0.7509999871253967,0.7570000290870667,0.75,0.7540000081062317,0.7480000257492065,0.7580000162124634,0.7639999985694885,0.7630000114440918,0.7590000033378601,0.7549999952316284,0.7480000257492065,0.7509999871253967,0.7570000290870667,0.75,0.7540000081062317,0.7480000257492065,0.7549999952316284,0.7559999823570251,0.7580000162124634,0.7580000162124634,0.753000020980835,0.7490000128746033,0.7540000081062317,0.7639999985694885,0.7580000162124634,0.7519999742507935,0.7590000033378601,0.75,0.7570000290870667,0.7620000243186951,0.7710000276565552,0.7739999890327454,0.7620000243186951,0.7549999952316284,0.7599999904632568,0.765999972820282,0.7680000066757202,0.7639999985694885,0.7540000081062317,0.7649999856948853,0.7649999856948853,0.7609999775886536,0.7549999952316284,0.765999972820282,0.7639999985694885,0.7580000162124634,0.7710000276565552,0.7570000290870667,0.7630000114440918,0.7580000162124634,0.7599999904632568,0.7649999856948853,0.7670000195503235,0.7699999809265137,0.7710000276565552,0.7559999823570251,0.7609999775886536,0.7620000243186951,0.7620000243186951,0.7609999775886536,0.753000020980835,0.7570000290870667,0.7620000243186951,0.7609999775886536,0.7609999775886536,0.7559999823570251,0.7540000081062317,0.7570000290870667,0.7639999985694885,0.7590000033378601,0.7680000066757202,0.7680000066757202,0.765999972820282,0.765999972820282,0.7670000195503235,0.7739999890327454,0.7649999856948853,0.7749999761581421,0.7699999809265137,0.7639999985694885,0.7680000066757202,0.7630000114440918,0.7680000066757202,0.7699999809265137,0.7739999890327454,0.7749999761581421,0.765999972820282,0.7680000066757202,0.7710000276565552,0.7680000066757202,0.765999972820282,0.7689999938011169,0.7760000228881836,0.7710000276565552,0.7680000066757202,0.7649999856948853,0.7720000147819519,0.7730000019073486],"label":"RefinedWeb"},"big-run-fineweb-cross-dedup-fixed":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,29.360128000000003,31.45728,33.554432000000006,35.651584,37.748736,39.845888,41.94304,44.040192000000005,46.137344000000006,48.234496,50.331648,52.4288,54.525952000000004,56.623104000000005,58.720256000000006,60.817408,62.91456,65.011712,67.10886400000001,69.206016,71.303168,73.40032000000001,75.497472,77.59462400000001,79.691776,81.788928,83.88608,85.983232,88.08038400000001,90.177536,92.27468800000001,94.37184,96.468992,98.56614400000001,100.663296,102.76044800000001,104.8576,106.95475200000001,109.05190400000001,111.149056,113.24620800000001,115.34336,117.44051200000001,119.537664,121.634816,123.73196800000001,125.82912,127.92627200000001,130.023424,132.120576,134.21772800000002,136.31488000000002,138.412032,140.509184,142.606336,144.70348800000002,146.80064000000002,148.897792,150.994944,153.092096,155.18924800000002,157.28640000000001,159.383552,161.480704,163.577856,165.67500800000002,167.77216,169.869312,171.966464,174.06361600000002,176.16076800000002,178.25792,180.355072,182.452224,184.54937600000002,186.64652800000002,188.74368,190.840832,192.937984,195.03513600000002,197.13228800000002,199.22944,201.326592,203.423744,205.52089600000002,207.61804800000002,209.7152,211.812352,213.90950400000003,216.00665600000002,218.10380800000001,220.20096,222.298112,224.39526400000003,226.49241600000002,228.589568,230.68672,232.783872,234.88102400000002,236.97817600000002,239.075328,241.17248,243.269632,245.36678400000002,247.46393600000002,249.561088,251.65824,253.75539200000003,255.85254400000002,257.949696,260.046848,262.144,264.241152,266.338304,268.43545600000004,270.53260800000004,272.62976000000003,274.726912,276.824064,278.921216,281.018368,283.11552,285.212672,287.309824,289.40697600000004,291.50412800000004,293.60128000000003,295.698432,297.795584,299.892736,301.989888,304.08704,306.184192,308.28134400000005,310.37849600000004,312.47564800000004,314.57280000000003,316.669952,318.767104,320.864256,322.961408,325.05856,327.155712,329.25286400000005,331.35001600000004,333.44716800000003,335.54432,337.641472,339.738624,341.835776,343.932928,346.03008,348.12723200000005,350.22438400000004],"y":[0.5099999904632568,0.6169999837875366,0.6359999775886536,0.6769999861717224,0.6769999861717224,0.6970000267028809,0.6990000009536743,0.6970000267028809,0.6959999799728394,0.7049999833106995,0.7089999914169312,0.7179999947547913,0.7099999785423279,0.7160000205039978,0.7260000109672546,0.7229999899864197,0.7179999947547913,0.7210000157356262,0.7200000286102295,0.734000027179718,0.7089999914169312,0.7229999899864197,0.7239999771118164,0.7310000061988831,0.7300000190734863,0.7260000109672546,0.7250000238418579,0.7239999771118164,0.7289999723434448,0.7390000224113464,0.7229999899864197,0.7310000061988831,0.7350000143051147,0.7289999723434448,0.734000027179718,0.7289999723434448,0.7329999804496765,0.7300000190734863,0.7319999933242798,0.7440000176429749,0.746999979019165,0.7310000061988831,0.7329999804496765,0.7480000257492065,0.7429999709129333,0.7369999885559082,0.7269999980926514,0.7269999980926514,0.7379999756813049,0.75,0.7360000014305115,0.746999979019165,0.7409999966621399,0.7369999885559082,0.7459999918937683,0.7400000095367432,0.7409999966621399,0.746999979019165,0.7360000014305115,0.7459999918937683,0.7400000095367432,0.7429999709129333,0.7350000143051147,0.7390000224113464,0.7379999756813049,0.7480000257492065,0.7329999804496765,0.734000027179718,0.7390000224113464,0.7459999918937683,0.7360000014305115,0.7419999837875366,0.7429999709129333,0.7400000095367432,0.7379999756813049,0.7310000061988831,0.7360000014305115,0.7390000224113464,0.75,0.7369999885559082,0.7570000290870667,0.7409999966621399,0.7459999918937683,0.7350000143051147,0.7459999918937683,0.7509999871253967,0.7429999709129333,0.7419999837875366,0.7419999837875366,0.75,0.7440000176429749,0.7450000047683716,0.75,0.7409999966621399,0.7490000128746033,0.7409999966621399,0.7419999837875366,0.7429999709129333,0.7490000128746033,0.7419999837875366,0.7419999837875366,0.75,0.753000020980835,0.75,0.746999979019165,0.7519999742507935,0.746999979019165,0.7570000290870667,0.7549999952316284,0.75,0.7540000081062317,0.7480000257492065,0.7490000128746033,0.7419999837875366,0.7419999837875366,0.746999979019165,0.746999979019165,0.75,0.7519999742507935,0.7580000162124634,0.7549999952316284,0.7490000128746033,0.7480000257492065,0.7519999742507935,0.7590000033378601,0.7450000047683716,0.75,0.7440000176429749,0.7419999837875366,0.7519999742507935,0.7450000047683716,0.753000020980835,0.7450000047683716,0.7440000176429749,0.7559999823570251,0.7509999871253967,0.7540000081062317,0.7440000176429749,0.7509999871253967,0.753000020980835,0.7490000128746033,0.7570000290870667,0.7490000128746033,0.746999979019165,0.746999979019165,0.7509999871253967,0.7509999871253967,0.7519999742507935,0.7570000290870667,0.7540000081062317,0.7440000176429749,0.7480000257492065,0.7509999871253967,0.7509999871253967,0.7509999871253967,0.7549999952316284,0.75,0.7559999823570251,0.746999979019165,0.7609999775886536,0.7549999952316284,0.746999979019165,0.7490000128746033,0.753000020980835,0.753000020980835,0.7609999775886536,0.746999979019165,0.7580000162124634],"label":"FineWeb full MinHash"},"big-run-sampled_full_filtered_no_dedup":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002,29.360128000000003,31.45728,33.554432000000006,35.651584,37.748736,39.845888,41.94304,44.040192000000005,46.137344000000006,48.234496,50.331648,52.4288,54.525952000000004,56.623104000000005,58.720256000000006,60.817408,62.91456,65.011712,67.10886400000001,69.206016,71.303168,73.40032000000001,75.497472,77.59462400000001,79.691776,81.788928,83.88608,85.983232,88.08038400000001,90.177536,92.27468800000001,94.37184,96.468992,98.56614400000001,100.663296,102.76044800000001,104.8576,106.95475200000001,109.05190400000001,111.149056,113.24620800000001,115.34336,117.44051200000001,119.537664,121.634816,123.73196800000001,125.82912,127.92627200000001,130.023424,132.120576,134.21772800000002,136.31488000000002,138.412032,140.509184,142.606336,144.70348800000002,146.80064000000002,148.897792,150.994944,153.092096,155.18924800000002,157.28640000000001,159.383552,161.480704,163.577856,165.67500800000002,167.77216,169.869312,171.966464,174.06361600000002,176.16076800000002,178.25792,180.355072,182.452224,184.54937600000002,186.64652800000002,188.74368,190.840832,192.937984,195.03513600000002,197.13228800000002,199.22944,201.326592,203.423744,205.52089600000002,207.61804800000002,209.7152,211.812352,213.90950400000003,216.00665600000002,218.10380800000001,220.20096,222.298112,224.39526400000003,226.49241600000002,228.589568,230.68672,232.783872,234.88102400000002,236.97817600000002,239.075328,241.17248,243.269632,245.36678400000002,247.46393600000002,249.561088,251.65824,253.75539200000003,255.85254400000002,257.949696,260.046848,262.144,264.241152,266.338304,268.43545600000004,270.53260800000004,272.62976000000003,274.726912,276.824064,278.921216,281.018368,283.11552,285.212672,287.309824,289.40697600000004,291.50412800000004,293.60128000000003,295.698432,297.795584,299.892736,301.989888,304.08704,306.184192,308.28134400000005,310.37849600000004,312.47564800000004,314.57280000000003,316.669952,318.767104,320.864256,322.961408,325.05856,327.155712,329.25286400000005,331.35001600000004,333.44716800000003,335.54432,337.641472,339.738624,341.835776,343.932928,346.03008,348.12723200000005,350.22438400000004],"y":[0.5099999904632568,0.621999979019165,0.6439999938011169,0.6700000166893005,0.6790000200271606,0.6869999766349792,0.6959999799728394,0.6790000200271606,0.6880000233650208,0.7049999833106995,0.699999988079071,0.6990000009536743,0.6940000057220459,0.7110000252723694,0.7089999914169312,0.7120000123977661,0.7070000171661377,0.7070000171661377,0.6990000009536743,0.7009999752044678,0.7160000205039978,0.7200000286102295,0.7149999737739563,0.7250000238418579,0.7210000157356262,0.722000002861023,0.7310000061988831,0.7289999723434448,0.7319999933242798,0.7250000238418579,0.722000002861023,0.7210000157356262,0.7170000076293945,0.7260000109672546,0.7250000238418579,0.7210000157356262,0.7200000286102295,0.7379999756813049,0.7239999771118164,0.7239999771118164,0.7080000042915344,0.7289999723434448,0.7289999723434448,0.7300000190734863,0.7329999804496765,0.7319999933242798,0.7350000143051147,0.7390000224113464,0.7350000143051147,0.7289999723434448,0.734000027179718,0.7329999804496765,0.7400000095367432,0.7409999966621399,0.7310000061988831,0.7350000143051147,0.7360000014305115,0.7360000014305115,0.7409999966621399,0.7319999933242798,0.7409999966621399,0.7400000095367432,0.7390000224113464,0.7329999804496765,0.7459999918937683,0.753000020980835,0.746999979019165,0.734000027179718,0.7369999885559082,0.7419999837875366,0.734000027179718,0.7419999837875366,0.7289999723434448,0.7350000143051147,0.7300000190734863,0.7519999742507935,0.7390000224113464,0.7400000095367432,0.7409999966621399,0.7429999709129333,0.7450000047683716,0.7329999804496765,0.7260000109672546,0.7570000290870667,0.7360000014305115,0.7519999742507935,0.7419999837875366,0.7379999756813049,0.7390000224113464,0.7490000128746033,0.734000027179718,0.7360000014305115,0.7390000224113464,0.7440000176429749,0.7450000047683716,0.7319999933242798,0.7429999709129333,0.7519999742507935,0.7540000081062317,0.7519999742507935,0.753000020980835,0.7480000257492065,0.7440000176429749,0.7459999918937683,0.7369999885559082,0.7419999837875366,0.7480000257492065,0.7419999837875366,0.765999972820282,0.746999979019165,0.7459999918937683,0.7570000290870667,0.7390000224113464,0.7409999966621399,0.7459999918937683,0.75,0.7570000290870667,0.753000020980835,0.7549999952316284,0.7519999742507935,0.7490000128746033,0.746999979019165,0.7459999918937683,0.7459999918937683,0.746999979019165,0.7409999966621399,0.7419999837875366,0.7459999918937683,0.7440000176429749,0.7459999918937683,0.7490000128746033,0.7450000047683716,0.7409999966621399,0.7419999837875366,0.7490000128746033,0.7590000033378601,0.7549999952316284,0.7549999952316284,0.746999979019165,0.753000020980835,0.7549999952316284,0.746999979019165,0.7580000162124634,0.7490000128746033,0.753000020980835,0.75,0.75,0.7540000081062317,0.7540000081062317,0.7490000128746033,0.7570000290870667,0.7570000290870667,0.7590000033378601,0.7559999823570251,0.7620000243186951,0.7590000033378601,0.7509999871253967,0.7639999985694885,0.7580000162124634,0.7599999904632568,0.7620000243186951,0.7590000033378601,0.7609999775886536,0.7559999823570251,0.75,0.7509999871253967,0.7549999952316284,0.7540000081062317,0.7540000081062317],"label":"FineWeb filtered only"},"big-run-sampled_full_ind_minhash":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002,29.360128000000003,31.45728,33.554432000000006,35.651584,37.748736,39.845888,41.94304,44.040192000000005,46.137344000000006,48.234496,50.331648,52.4288,54.525952000000004,56.623104000000005,58.720256000000006,60.817408,62.91456,65.011712,67.10886400000001,69.206016,71.303168,73.40032000000001,75.497472,77.59462400000001,79.691776,81.788928,83.88608,85.983232,88.08038400000001,90.177536,92.27468800000001,94.37184,96.468992,98.56614400000001,100.663296,102.76044800000001,104.8576,106.95475200000001,109.05190400000001,111.149056,113.24620800000001,115.34336,117.44051200000001,119.537664,121.634816,123.73196800000001,125.82912,127.92627200000001,130.023424,132.120576,134.21772800000002,136.31488000000002,138.412032,140.509184,142.606336,144.70348800000002,146.80064000000002,148.897792,150.994944,153.092096,155.18924800000002,157.28640000000001,159.383552,161.480704,163.577856,165.67500800000002,167.77216,169.869312,171.966464,174.06361600000002,176.16076800000002,178.25792,180.355072,182.452224,184.54937600000002,186.64652800000002,188.74368,190.840832,192.937984,195.03513600000002,197.13228800000002,199.22944,201.326592,203.423744,205.52089600000002,207.61804800000002,209.7152,211.812352,213.90950400000003,216.00665600000002,218.10380800000001,220.20096,222.298112,224.39526400000003,226.49241600000002,228.589568,230.68672,232.783872,234.88102400000002,236.97817600000002,239.075328,241.17248,243.269632,245.36678400000002,247.46393600000002,249.561088,251.65824,253.75539200000003,255.85254400000002,257.949696,260.046848,262.144,264.241152,266.338304,268.43545600000004,270.53260800000004,272.62976000000003,274.726912,276.824064,278.921216,281.018368,283.11552,285.212672,287.309824,289.40697600000004,291.50412800000004,293.60128000000003,295.698432,297.795584,299.892736,301.989888,304.08704,306.184192,308.28134400000005,310.37849600000004,312.47564800000004,314.57280000000003,316.669952,318.767104,320.864256,322.961408,325.05856,327.155712,329.25286400000005,331.35001600000004,333.44716800000003,335.54432,337.641472,339.738624,341.835776,343.932928,346.03008,348.12723200000005,350.22438400000004],"y":[0.5099999904632568,0.6209999918937683,0.6549999713897705,0.6800000071525574,0.6830000281333923,0.703000009059906,0.7020000219345093,0.7110000252723694,0.7160000205039978,0.7129999995231628,0.7210000157356262,0.7250000238418579,0.7210000157356262,0.7310000061988831,0.7269999980926514,0.7269999980926514,0.7329999804496765,0.7459999918937683,0.734000027179718,0.7409999966621399,0.7390000224113464,0.7350000143051147,0.7509999871253967,0.7440000176429749,0.7379999756813049,0.7599999904632568,0.7400000095367432,0.7409999966621399,0.7590000033378601,0.7409999966621399,0.7440000176429749,0.7400000095367432,0.7450000047683716,0.75,0.7440000176429749,0.7409999966621399,0.7429999709129333,0.7440000176429749,0.7440000176429749,0.7559999823570251,0.7459999918937683,0.7559999823570251,0.7540000081062317,0.7599999904632568,0.7559999823570251,0.7490000128746033,0.7490000128746033,0.7429999709129333,0.7609999775886536,0.7519999742507935,0.7480000257492065,0.7490000128746033,0.7620000243186951,0.7580000162124634,0.7580000162124634,0.7540000081062317,0.7509999871253967,0.7519999742507935,0.7440000176429749,0.7459999918937683,0.7559999823570251,0.7620000243186951,0.746999979019165,0.7570000290870667,0.7620000243186951,0.7570000290870667,0.7540000081062317,0.7540000081062317,0.7570000290870667,0.7590000033378601,0.7519999742507935,0.75,0.7559999823570251,0.7590000033378601,0.7559999823570251,0.7519999742507935,0.7639999985694885,0.7620000243186951,0.7549999952316284,0.7490000128746033,0.7559999823570251,0.7639999985694885,0.7609999775886536,0.7609999775886536,0.7519999742507935,0.7549999952316284,0.7570000290870667,0.7620000243186951,0.7599999904632568,0.7639999985694885,0.7559999823570251,0.753000020980835,0.7649999856948853,0.753000020980835,0.7549999952316284,0.7609999775886536,0.7599999904632568,0.7680000066757202,0.7540000081062317,0.7559999823570251,0.7590000033378601,0.7590000033378601,0.7649999856948853,0.7639999985694885,0.7710000276565552,0.7699999809265137,0.7609999775886536,0.765999972820282,0.7670000195503235,0.7720000147819519,0.7639999985694885,0.7609999775886536,0.7549999952316284,0.7630000114440918,0.7670000195503235,0.7599999904632568,0.765999972820282,0.7670000195503235,0.7670000195503235,0.7670000195503235,0.7720000147819519,0.7760000228881836,0.7710000276565552,0.7829999923706055,0.7630000114440918,0.7720000147819519,0.7649999856948853,0.7630000114440918,0.7699999809265137,0.7720000147819519,0.7720000147819519,0.7689999938011169,0.777999997138977,0.7689999938011169,0.7760000228881836,0.7730000019073486,0.7799999713897705,0.7720000147819519,0.7760000228881836,0.7710000276565552,0.7770000100135803,0.777999997138977,0.7670000195503235,0.7789999842643738,0.7799999713897705,0.7749999761581421,0.7730000019073486,0.777999997138977,0.777999997138977,0.7799999713897705,0.7770000100135803,0.7770000100135803,0.7789999842643738,0.7760000228881836,0.7770000100135803,0.7770000100135803,0.7770000100135803,0.7739999890327454,0.7689999938011169,0.7760000228881836,0.777999997138977,0.7699999809265137,0.7739999890327454,0.7670000195503235,0.7699999809265137,0.7710000276565552,0.7730000019073486,0.7739999890327454,0.7680000066757202],"label":"FineWeb independent MinHash"}},"layout":{"xaxis":{"title":{"text":"Training tokens (billions)"}},"yaxis":{"title":{"text":"Agg Score"}},"title":{"text":"Independent dedup outperforms dedup across dumps"}}}
assets/data/plots/cross_ind_unfiltered_comparison/winogrande_acc_norm.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"data":{"big-run-refinedweb":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002,29.360128000000003,31.45728,33.554432000000006,35.651584,37.748736,39.845888,41.94304,44.040192000000005,46.137344000000006,48.234496,50.331648,52.4288,54.525952000000004,56.623104000000005,58.720256000000006,60.817408,62.91456,65.011712,67.10886400000001,69.206016,71.303168,73.40032000000001,75.497472,77.59462400000001,79.691776,81.788928,83.88608,85.983232,88.08038400000001,90.177536,92.27468800000001,94.37184,96.468992,98.56614400000001,100.663296,102.76044800000001,104.8576,106.95475200000001,109.05190400000001,111.149056,113.24620800000001,115.34336,117.44051200000001,119.537664,121.634816,123.73196800000001,125.82912,127.92627200000001,130.023424,132.120576,134.21772800000002,136.31488000000002,138.412032,140.509184,142.606336,144.70348800000002,146.80064000000002,148.897792,150.994944,153.092096,155.18924800000002,157.28640000000001,159.383552,161.480704,163.577856,165.67500800000002,167.77216,169.869312,171.966464,174.06361600000002,176.16076800000002,178.25792,180.355072,182.452224,184.54937600000002,186.64652800000002,188.74368,190.840832,192.937984,195.03513600000002,197.13228800000002,199.22944,201.326592,203.423744,205.52089600000002,207.61804800000002,209.7152,211.812352,213.90950400000003,216.00665600000002,218.10380800000001,220.20096,222.298112,224.39526400000003,226.49241600000002,228.589568,230.68672,232.783872,234.88102400000002,236.97817600000002,239.075328,241.17248,243.269632,245.36678400000002,247.46393600000002,249.561088,251.65824,253.75539200000003,255.85254400000002,257.949696,260.046848,262.144,264.241152,266.338304,268.43545600000004,270.53260800000004,272.62976000000003,274.726912,276.824064,278.921216,281.018368,283.11552,285.212672,287.309824,289.40697600000004,291.50412800000004,293.60128000000003,295.698432,297.795584,299.892736,301.989888,304.08704,306.184192,308.28134400000005,310.37849600000004,312.47564800000004,314.57280000000003,316.669952,318.767104,320.864256,322.961408,325.05856,327.155712,329.25286400000005,331.35001600000004,333.44716800000003,335.54432,337.641472,339.738624,341.835776,343.932928,346.03008,348.12723200000005,350.22438400000004],"y":[0.4970000088214874,0.5,0.4979999959468841,0.4950000047683716,0.4950000047683716,0.5049999952316284,0.5329999923706055,0.5220000147819519,0.5139999985694885,0.5339999794960022,0.5130000114440918,0.5389999747276306,0.5400000214576721,0.5270000100135803,0.5320000052452087,0.5260000228881836,0.5370000004768372,0.527999997138977,0.5289999842643738,0.5339999794960022,0.5270000100135803,0.531000018119812,0.527999997138977,0.5400000214576721,0.5479999780654907,0.550000011920929,0.5400000214576721,0.5350000262260437,0.5410000085830688,0.5379999876022339,0.5299999713897705,0.5490000247955322,0.5509999990463257,0.5519999861717224,0.5429999828338623,0.5429999828338623,0.5440000295639038,0.5379999876022339,0.5379999876022339,0.5419999957084656,0.5609999895095825,0.5540000200271606,0.5370000004768372,0.5440000295639038,0.5410000085830688,0.5379999876022339,0.5329999923706055,0.5419999957084656,0.5419999957084656,0.5519999861717224,0.550000011920929,0.5509999990463257,0.5400000214576721,0.5450000166893005,0.5509999990463257,0.5569999814033508,0.5550000071525574,0.5590000152587891,0.5479999780654907,0.5550000071525574,0.5440000295639038,0.5460000038146973,0.546999990940094,0.5559999942779541,0.5550000071525574,0.5490000247955322,0.5440000295639038,0.546999990940094,0.5450000166893005,0.546999990940094,0.5649999976158142,0.5490000247955322,0.5519999861717224,0.550000011920929,0.5509999990463257,0.5519999861717224,0.5519999861717224,0.5529999732971191,0.5490000247955322,0.546999990940094,0.550000011920929,0.5720000267028809,0.5619999766349792,0.5490000247955322,0.5680000185966492,0.5519999861717224,0.5569999814033508,0.5509999990463257,0.5619999766349792,0.5630000233650208,0.5529999732971191,0.5619999766349792,0.5609999895095825,0.550000011920929,0.5479999780654907,0.5529999732971191,0.5519999861717224,0.5580000281333923,0.5590000152587891,0.5529999732971191,0.550000011920929,0.5680000185966492,0.5580000281333923,0.5630000233650208,0.5630000233650208,0.5559999942779541,0.5649999976158142,0.5569999814033508,0.5649999976158142,0.5659999847412109,0.5559999942779541,0.5659999847412109,0.5630000233650208,0.5509999990463257,0.5669999718666077,0.5669999718666077,0.5479999780654907,0.5540000200271606,0.5580000281333923,0.5519999861717224,0.5590000152587891,0.5590000152587891,0.5619999766349792,0.5509999990463257,0.546999990940094,0.5609999895095825,0.5540000200271606,0.5630000233650208,0.5580000281333923,0.5559999942779541,0.5680000185966492,0.5649999976158142,0.5619999766349792,0.5580000281333923,0.5630000233650208,0.5559999942779541,0.5540000200271606,0.5540000200271606,0.5569999814033508,0.5619999766349792,0.5559999942779541,0.5600000023841858,0.5460000038146973,0.5429999828338623,0.5580000281333923,0.5550000071525574,0.5580000281333923,0.5540000200271606,0.5609999895095825,0.5519999861717224,0.550000011920929,0.5519999861717224,0.5590000152587891,0.5619999766349792,0.5600000023841858,0.5590000152587891,0.5690000057220459,0.5640000104904175,0.5580000281333923,0.5559999942779541,0.5569999814033508,0.5569999814033508,0.5540000200271606,0.5640000104904175,0.5600000023841858,0.5550000071525574,0.5640000104904175,0.5600000023841858,0.5540000200271606],"label":"RefinedWeb"},"big-run-fineweb-cross-dedup-fixed":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,29.360128000000003,31.45728,33.554432000000006,35.651584,37.748736,39.845888,41.94304,44.040192000000005,46.137344000000006,48.234496,50.331648,52.4288,54.525952000000004,56.623104000000005,58.720256000000006,60.817408,62.91456,65.011712,67.10886400000001,69.206016,71.303168,73.40032000000001,75.497472,77.59462400000001,79.691776,81.788928,83.88608,85.983232,88.08038400000001,90.177536,92.27468800000001,94.37184,96.468992,98.56614400000001,100.663296,102.76044800000001,104.8576,106.95475200000001,109.05190400000001,111.149056,113.24620800000001,115.34336,117.44051200000001,119.537664,121.634816,123.73196800000001,125.82912,127.92627200000001,130.023424,132.120576,134.21772800000002,136.31488000000002,138.412032,140.509184,142.606336,144.70348800000002,146.80064000000002,148.897792,150.994944,153.092096,155.18924800000002,157.28640000000001,159.383552,161.480704,163.577856,165.67500800000002,167.77216,169.869312,171.966464,174.06361600000002,176.16076800000002,178.25792,180.355072,182.452224,184.54937600000002,186.64652800000002,188.74368,190.840832,192.937984,195.03513600000002,197.13228800000002,199.22944,201.326592,203.423744,205.52089600000002,207.61804800000002,209.7152,211.812352,213.90950400000003,216.00665600000002,218.10380800000001,220.20096,222.298112,224.39526400000003,226.49241600000002,228.589568,230.68672,232.783872,234.88102400000002,236.97817600000002,239.075328,241.17248,243.269632,245.36678400000002,247.46393600000002,249.561088,251.65824,253.75539200000003,255.85254400000002,257.949696,260.046848,262.144,264.241152,266.338304,268.43545600000004,270.53260800000004,272.62976000000003,274.726912,276.824064,278.921216,281.018368,283.11552,285.212672,287.309824,289.40697600000004,291.50412800000004,293.60128000000003,295.698432,297.795584,299.892736,301.989888,304.08704,306.184192,308.28134400000005,310.37849600000004,312.47564800000004,314.57280000000003,316.669952,318.767104,320.864256,322.961408,325.05856,327.155712,329.25286400000005,331.35001600000004,333.44716800000003,335.54432,337.641472,339.738624,341.835776,343.932928,346.03008,348.12723200000005,350.22438400000004],"y":[0.4970000088214874,0.4869999885559082,0.4959999918937683,0.4979999959468841,0.5099999904632568,0.515999972820282,0.5080000162124634,0.5249999761581421,0.5239999890327454,0.5299999713897705,0.5239999890327454,0.5149999856948853,0.5270000100135803,0.5249999761581421,0.5180000066757202,0.5220000147819519,0.5329999923706055,0.5289999842643738,0.5239999890327454,0.5299999713897705,0.5230000019073486,0.5130000114440918,0.5180000066757202,0.5299999713897705,0.5199999809265137,0.5270000100135803,0.5230000019073486,0.5299999713897705,0.5320000052452087,0.5429999828338623,0.527999997138977,0.5379999876022339,0.527999997138977,0.5419999957084656,0.5329999923706055,0.5450000166893005,0.5320000052452087,0.5410000085830688,0.5249999761581421,0.5400000214576721,0.5249999761581421,0.5289999842643738,0.5320000052452087,0.5339999794960022,0.5320000052452087,0.5350000262260437,0.5400000214576721,0.5450000166893005,0.5440000295639038,0.5400000214576721,0.5379999876022339,0.5350000262260437,0.5410000085830688,0.5490000247955322,0.531000018119812,0.5389999747276306,0.546999990940094,0.5529999732971191,0.5370000004768372,0.5440000295639038,0.5400000214576721,0.5490000247955322,0.550000011920929,0.5580000281333923,0.5609999895095825,0.5429999828338623,0.5529999732971191,0.5519999861717224,0.5450000166893005,0.550000011920929,0.5379999876022339,0.5490000247955322,0.5460000038146973,0.5419999957084656,0.5569999814033508,0.5509999990463257,0.5490000247955322,0.5529999732971191,0.5479999780654907,0.5590000152587891,0.5479999780654907,0.5509999990463257,0.5440000295639038,0.5509999990463257,0.5540000200271606,0.5559999942779541,0.5630000233650208,0.5649999976158142,0.5640000104904175,0.5649999976158142,0.5490000247955322,0.5709999799728394,0.5659999847412109,0.5630000233650208,0.5640000104904175,0.5580000281333923,0.546999990940094,0.5550000071525574,0.5580000281333923,0.5429999828338623,0.5440000295639038,0.5569999814033508,0.5569999814033508,0.5540000200271606,0.5550000071525574,0.5649999976158142,0.5540000200271606,0.5630000233650208,0.5609999895095825,0.5580000281333923,0.5509999990463257,0.5550000071525574,0.5550000071525574,0.5519999861717224,0.5609999895095825,0.5630000233650208,0.5509999990463257,0.550000011920929,0.5490000247955322,0.5540000200271606,0.550000011920929,0.5529999732971191,0.5460000038146973,0.550000011920929,0.5529999732971191,0.5519999861717224,0.5529999732971191,0.5609999895095825,0.5590000152587891,0.5550000071525574,0.550000011920929,0.5609999895095825,0.5619999766349792,0.5609999895095825,0.5540000200271606,0.550000011920929,0.5600000023841858,0.5559999942779541,0.5609999895095825,0.5569999814033508,0.5600000023841858,0.5680000185966492,0.5580000281333923,0.5559999942779541,0.5569999814033508,0.5669999718666077,0.5709999799728394,0.5640000104904175,0.5569999814033508,0.5600000023841858,0.5569999814033508,0.5649999976158142,0.5600000023841858,0.5580000281333923,0.5609999895095825,0.5590000152587891,0.5640000104904175,0.5529999732971191,0.5640000104904175,0.5649999976158142,0.5659999847412109,0.5630000233650208,0.5630000233650208,0.5619999766349792,0.5609999895095825,0.5559999942779541,0.5529999732971191,0.5600000023841858],"label":"FineWeb full MinHash"},"big-run-sampled_full_filtered_no_dedup":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002,29.360128000000003,31.45728,33.554432000000006,35.651584,37.748736,39.845888,41.94304,44.040192000000005,46.137344000000006,48.234496,50.331648,52.4288,54.525952000000004,56.623104000000005,58.720256000000006,60.817408,62.91456,65.011712,67.10886400000001,69.206016,71.303168,73.40032000000001,75.497472,77.59462400000001,79.691776,81.788928,83.88608,85.983232,88.08038400000001,90.177536,92.27468800000001,94.37184,96.468992,98.56614400000001,100.663296,102.76044800000001,104.8576,106.95475200000001,109.05190400000001,111.149056,113.24620800000001,115.34336,117.44051200000001,119.537664,121.634816,123.73196800000001,125.82912,127.92627200000001,130.023424,132.120576,134.21772800000002,136.31488000000002,138.412032,140.509184,142.606336,144.70348800000002,146.80064000000002,148.897792,150.994944,153.092096,155.18924800000002,157.28640000000001,159.383552,161.480704,163.577856,165.67500800000002,167.77216,169.869312,171.966464,174.06361600000002,176.16076800000002,178.25792,180.355072,182.452224,184.54937600000002,186.64652800000002,188.74368,190.840832,192.937984,195.03513600000002,197.13228800000002,199.22944,201.326592,203.423744,205.52089600000002,207.61804800000002,209.7152,211.812352,213.90950400000003,216.00665600000002,218.10380800000001,220.20096,222.298112,224.39526400000003,226.49241600000002,228.589568,230.68672,232.783872,234.88102400000002,236.97817600000002,239.075328,241.17248,243.269632,245.36678400000002,247.46393600000002,249.561088,251.65824,253.75539200000003,255.85254400000002,257.949696,260.046848,262.144,264.241152,266.338304,268.43545600000004,270.53260800000004,272.62976000000003,274.726912,276.824064,278.921216,281.018368,283.11552,285.212672,287.309824,289.40697600000004,291.50412800000004,293.60128000000003,295.698432,297.795584,299.892736,301.989888,304.08704,306.184192,308.28134400000005,310.37849600000004,312.47564800000004,314.57280000000003,316.669952,318.767104,320.864256,322.961408,325.05856,327.155712,329.25286400000005,331.35001600000004,333.44716800000003,335.54432,337.641472,339.738624,341.835776,343.932928,346.03008,348.12723200000005,350.22438400000004],"y":[0.4970000088214874,0.5239999890327454,0.4900000095367431,0.5040000081062317,0.5099999904632568,0.4990000128746032,0.5170000195503235,0.5040000081062317,0.5009999871253967,0.5230000019073486,0.5109999775886536,0.5059999823570251,0.5130000114440918,0.5090000033378601,0.5180000066757202,0.5220000147819519,0.5189999938011169,0.5180000066757202,0.5220000147819519,0.5120000243186951,0.5460000038146973,0.5239999890327454,0.5289999842643738,0.5440000295639038,0.5339999794960022,0.5299999713897705,0.5260000228881836,0.5360000133514404,0.5339999794960022,0.5360000133514404,0.5299999713897705,0.5180000066757202,0.5249999761581421,0.5440000295639038,0.5299999713897705,0.5339999794960022,0.5239999890327454,0.527999997138977,0.5139999985694885,0.5289999842643738,0.5360000133514404,0.5260000228881836,0.5389999747276306,0.5460000038146973,0.5270000100135803,0.5339999794960022,0.5320000052452087,0.5329999923706055,0.5260000228881836,0.5220000147819519,0.5260000228881836,0.5379999876022339,0.5410000085830688,0.5350000262260437,0.5389999747276306,0.5320000052452087,0.5389999747276306,0.5379999876022339,0.5329999923706055,0.5270000100135803,0.5170000195503235,0.5329999923706055,0.5370000004768372,0.5379999876022339,0.5249999761581421,0.5479999780654907,0.546999990940094,0.5400000214576721,0.5440000295639038,0.5360000133514404,0.5450000166893005,0.5440000295639038,0.5370000004768372,0.5370000004768372,0.5479999780654907,0.5379999876022339,0.5400000214576721,0.5479999780654907,0.5379999876022339,0.5509999990463257,0.5440000295639038,0.5379999876022339,0.550000011920929,0.5389999747276306,0.5370000004768372,0.5379999876022339,0.5419999957084656,0.5360000133514404,0.5509999990463257,0.5360000133514404,0.5419999957084656,0.5419999957084656,0.550000011920929,0.5360000133514404,0.5519999861717224,0.5540000200271606,0.546999990940094,0.5370000004768372,0.5379999876022339,0.5519999861717224,0.5329999923706055,0.5400000214576721,0.5429999828338623,0.550000011920929,0.5490000247955322,0.5360000133514404,0.550000011920929,0.5569999814033508,0.5490000247955322,0.5490000247955322,0.5479999780654907,0.5350000262260437,0.5490000247955322,0.5370000004768372,0.5440000295639038,0.5329999923706055,0.5440000295639038,0.5429999828338623,0.5389999747276306,0.5450000166893005,0.5320000052452087,0.5450000166893005,0.5400000214576721,0.5419999957084656,0.5460000038146973,0.5370000004768372,0.5400000214576721,0.5460000038146973,0.5370000004768372,0.5370000004768372,0.5460000038146973,0.5400000214576721,0.5490000247955322,0.5529999732971191,0.5379999876022339,0.5460000038146973,0.5450000166893005,0.5429999828338623,0.5460000038146973,0.5400000214576721,0.5479999780654907,0.5460000038146973,0.5540000200271606,0.5400000214576721,0.5350000262260437,0.5490000247955322,0.5460000038146973,0.5460000038146973,0.5509999990463257,0.5410000085830688,0.5429999828338623,0.5379999876022339,0.5450000166893005,0.5389999747276306,0.5400000214576721,0.5400000214576721,0.550000011920929,0.5440000295639038,0.5389999747276306,0.5450000166893005,0.5400000214576721,0.5389999747276306,0.5419999957084656,0.5410000085830688,0.5440000295639038,0.5519999861717224,0.5479999780654907,0.5450000166893005,0.5569999814033508],"label":"FineWeb filtered only"},"big-run-sampled_full_ind_minhash":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002,29.360128000000003,31.45728,33.554432000000006,35.651584,37.748736,39.845888,41.94304,44.040192000000005,46.137344000000006,48.234496,50.331648,52.4288,54.525952000000004,56.623104000000005,58.720256000000006,60.817408,62.91456,65.011712,67.10886400000001,69.206016,71.303168,73.40032000000001,75.497472,77.59462400000001,79.691776,81.788928,83.88608,85.983232,88.08038400000001,90.177536,92.27468800000001,94.37184,96.468992,98.56614400000001,100.663296,102.76044800000001,104.8576,106.95475200000001,109.05190400000001,111.149056,113.24620800000001,115.34336,117.44051200000001,119.537664,121.634816,123.73196800000001,125.82912,127.92627200000001,130.023424,132.120576,134.21772800000002,136.31488000000002,138.412032,140.509184,142.606336,144.70348800000002,146.80064000000002,148.897792,150.994944,153.092096,155.18924800000002,157.28640000000001,159.383552,161.480704,163.577856,165.67500800000002,167.77216,169.869312,171.966464,174.06361600000002,176.16076800000002,178.25792,180.355072,182.452224,184.54937600000002,186.64652800000002,188.74368,190.840832,192.937984,195.03513600000002,197.13228800000002,199.22944,201.326592,203.423744,205.52089600000002,207.61804800000002,209.7152,211.812352,213.90950400000003,216.00665600000002,218.10380800000001,220.20096,222.298112,224.39526400000003,226.49241600000002,228.589568,230.68672,232.783872,234.88102400000002,236.97817600000002,239.075328,241.17248,243.269632,245.36678400000002,247.46393600000002,249.561088,251.65824,253.75539200000003,255.85254400000002,257.949696,260.046848,262.144,264.241152,266.338304,268.43545600000004,270.53260800000004,272.62976000000003,274.726912,276.824064,278.921216,281.018368,283.11552,285.212672,287.309824,289.40697600000004,291.50412800000004,293.60128000000003,295.698432,297.795584,299.892736,301.989888,304.08704,306.184192,308.28134400000005,310.37849600000004,312.47564800000004,314.57280000000003,316.669952,318.767104,320.864256,322.961408,325.05856,327.155712,329.25286400000005,331.35001600000004,333.44716800000003,335.54432,337.641472,339.738624,341.835776,343.932928,346.03008,348.12723200000005,350.22438400000004],"y":[0.4970000088214874,0.4880000054836273,0.492000013589859,0.5059999823570251,0.5139999985694885,0.5070000290870667,0.5090000033378601,0.5230000019073486,0.5189999938011169,0.5189999938011169,0.5220000147819519,0.5149999856948853,0.5260000228881836,0.5329999923706055,0.5230000019073486,0.5180000066757202,0.5289999842643738,0.5400000214576721,0.5410000085830688,0.5440000295639038,0.5329999923706055,0.550000011920929,0.5419999957084656,0.5360000133514404,0.5429999828338623,0.5429999828338623,0.5450000166893005,0.5490000247955322,0.5400000214576721,0.5509999990463257,0.5559999942779541,0.5479999780654907,0.5540000200271606,0.5490000247955322,0.5400000214576721,0.5429999828338623,0.5460000038146973,0.5370000004768372,0.5479999780654907,0.5550000071525574,0.5490000247955322,0.5400000214576721,0.5410000085830688,0.5460000038146973,0.546999990940094,0.5479999780654907,0.546999990940094,0.5509999990463257,0.5450000166893005,0.5590000152587891,0.5419999957084656,0.5540000200271606,0.5440000295639038,0.5450000166893005,0.5580000281333923,0.5540000200271606,0.5440000295639038,0.5619999766349792,0.5450000166893005,0.5600000023841858,0.5559999942779541,0.5600000023841858,0.5400000214576721,0.5569999814033508,0.5600000023841858,0.5619999766349792,0.5529999732971191,0.5649999976158142,0.5609999895095825,0.5550000071525574,0.5609999895095825,0.5580000281333923,0.5550000071525574,0.5619999766349792,0.5550000071525574,0.5519999861717224,0.5600000023841858,0.5550000071525574,0.5550000071525574,0.5590000152587891,0.5490000247955322,0.5580000281333923,0.5600000023841858,0.5419999957084656,0.5559999942779541,0.5559999942779541,0.5529999732971191,0.5609999895095825,0.5519999861717224,0.5569999814033508,0.5569999814033508,0.5509999990463257,0.5619999766349792,0.546999990940094,0.5619999766349792,0.5460000038146973,0.5529999732971191,0.5619999766349792,0.5690000057220459,0.5680000185966492,0.5720000267028809,0.5640000104904175,0.5550000071525574,0.5509999990463257,0.550000011920929,0.5600000023841858,0.5609999895095825,0.5630000233650208,0.5649999976158142,0.5529999732971191,0.5540000200271606,0.5529999732971191,0.5659999847412109,0.5600000023841858,0.5590000152587891,0.5619999766349792,0.5600000023841858,0.5730000138282776,0.5569999814033508,0.5690000057220459,0.5619999766349792,0.5680000185966492,0.578000009059906,0.5730000138282776,0.5550000071525574,0.5529999732971191,0.5600000023841858,0.5630000233650208,0.5590000152587891,0.5659999847412109,0.5669999718666077,0.5609999895095825,0.5630000233650208,0.5569999814033508,0.5490000247955322,0.5619999766349792,0.5550000071525574,0.5630000233650208,0.5559999942779541,0.5559999942779541,0.5649999976158142,0.5569999814033508,0.5619999766349792,0.5559999942779541,0.5669999718666077,0.5609999895095825,0.5690000057220459,0.5770000219345093,0.5690000057220459,0.5720000267028809,0.5619999766349792,0.5649999976158142,0.5669999718666077,0.5680000185966492,0.5699999928474426,0.5640000104904175,0.5609999895095825,0.5740000009536743,0.5690000057220459,0.5669999718666077,0.5720000267028809,0.5699999928474426,0.5709999799728394,0.5740000009536743,0.5680000185966492,0.5619999766349792,0.5690000057220459,0.5659999847412109,0.574999988079071],"label":"FineWeb independent MinHash"}},"layout":{"xaxis":{"title":{"text":"Training tokens (billions)"}},"yaxis":{"title":{"text":"Agg Score"}},"title":{"text":"Independent dedup outperforms dedup across dumps"}}}
assets/data/plots/custom_filters/agg_score.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"data": {"filtering-custom-punc0.12-short-lines0.67-line_char_dup0.1": {"x": [0.0, 2.0971520000000003, 4.194304000000001, 6.291456, 8.388608000000001, 10.48576, 12.582912, 14.680064000000002, 16.777216000000003, 18.874368, 20.97152, 23.068672000000003, 25.165824, 27.262976000000002, 28.311552000000002], "y": [0.3308933284133672, 0.35955795273184776, 0.3757704347372055, 0.3934198468923569, 0.398214865475893, 0.4062729831784963, 0.41363069601356983, 0.41463132016360754, 0.41851891577243805, 0.4239445272833109, 0.42439557053148746, 0.4273625332862139, 0.4289980959147215, 0.4327357914298773, 0.43017333932220936], "label": "Filters combined"}, "filtering-custom-lines-punc-0.12": {"x": [0.0, 2.0971520000000003, 4.194304000000001, 6.291456, 8.388608000000001, 10.48576, 12.582912, 14.680064000000002, 16.777216000000003, 18.874368, 20.97152, 23.068672000000003, 25.165824, 27.262976000000002, 28.311552000000002], "y": [0.3308933284133672, 0.35735468938946724, 0.3787423223257065, 0.391122592613101, 0.3976811040192842, 0.4041402228176594, 0.4110417179763317, 0.4150725454092026, 0.42221225984394545, 0.4235249478369951, 0.42567262239754194, 0.42764298990368843, 0.4280493911355734, 0.42981273680925364, 0.42845905013382435], "label": "Punctuation filter"}, "filtering-custom-line-char-duplicated-v2-0.01": {"x": [0.0, 2.0971520000000003, 4.194304000000001, 6.291456, 8.388608000000001, 10.48576, 12.582912, 14.680064000000002, 16.777216000000003, 18.874368, 20.97152, 23.068672000000003, 25.165824, 27.262976000000002, 28.311552000000002], "y": [0.3308979943394661, 0.35727922804653645, 0.3758955802768469, 0.39312327839434147, 0.3984657619148493, 0.4037223849445581, 0.40907647646963596, 0.41408527828752995, 0.42114910110831255, 0.42039695382118225, 0.4248786196112633, 0.42590542137622833, 0.4263712782412767, 0.42797840014100075, 0.4277621991932392], "label": "Line duplicates filter"}, "filtering-custom-short-line-ratio-0.67": {"x": [0.0, 2.0971520000000003, 4.194304000000001, 6.291456, 8.388608000000001, 10.48576, 12.582912, 14.680064000000002, 16.777216000000003, 18.874368, 20.97152, 23.068672000000003, 25.165824, 27.262976000000002, 28.311552000000002], "y": [0.33087017294019455, 0.35839469730854034, 0.379800958558917, 0.3909519836306572, 0.3985003251582384, 0.4028578344732523, 0.4080309104174375, 0.411550747230649, 0.4152813777327537, 0.41849316097795963, 0.42109199613332743, 0.4223319999873638, 0.42558939941227436, 0.42717534117400646, 0.426479609683156], "label": "Short lines filter"}, "filtering-baseline-2019-18-40gt": {"x": [0.0, 2.0971520000000003, 4.194304000000001, 6.291456, 8.388608000000001, 10.48576, 12.582912, 14.680064000000002, 16.777216000000003, 18.874368, 20.97152, 23.068672000000003, 25.165824, 27.262976000000002, 28.311552000000002], "y": [0.3309533800929785, 0.3574739173054695, 0.3774360120296478, 0.3879939243197441, 0.3961103111505508, 0.4038164801895618, 0.4059260934591293, 0.4138728193938732, 0.414092980325222, 0.4190553873777389, 0.4232541136443615, 0.4207314290106296, 0.4239514805376529, 0.425716370344162, 0.4249534271657467], "label": "Baseline"}}, "layout": {"title": {"text": "Custom filters Performance"}}}
assets/data/plots/custom_filters/arc_acc_norm.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"data": {"filtering-custom-line-char-duplicated-v2-0.01": {"x": [0.0, 2.0971520000000003, 4.194304000000001, 6.291456, 8.388608000000001, 10.48576, 12.582912, 14.680064000000002, 16.777216000000003, 18.874368, 20.97152, 23.068672000000003, 25.165824, 27.262976000000002, 28.311552000000002], "y": [0.2509999871253967, 0.29474999010562897, 0.3184999972581863, 0.3392500132322311, 0.35074999928474426, 0.35300000011920923, 0.35750000178813934, 0.3684999942779541, 0.3817500025033951, 0.37800000607967377, 0.38199999928474426, 0.38600000739097595, 0.38525000214576716, 0.39000000059604645, 0.38850000500679016], "label": "Line duplicates filter"}, "filtering-custom-punc0.12-short-lines0.67-line_char_dup0.1": {"x": [0.0, 2.0971520000000003, 4.194304000000001, 6.291456, 8.388608000000001, 10.48576, 12.582912, 14.680064000000002, 16.777216000000003, 18.874368, 20.97152, 23.068672000000003, 25.165824, 27.262976000000002, 28.311552000000002], "y": [0.2509999871253967, 0.2905000001192093, 0.3199999928474426, 0.3397499918937683, 0.3467499911785126, 0.3540000021457672, 0.3662499934434891, 0.36374999582767487, 0.3647499978542328, 0.3675000071525574, 0.371749997138977, 0.37074999511241913, 0.375, 0.3787499964237213, 0.38099999725818634], "label": "Filters combined"}, "filtering-custom-short-line-ratio-0.67": {"x": [0.0, 2.0971520000000003, 4.194304000000001, 6.291456, 8.388608000000001, 10.48576, 12.582912, 14.680064000000002, 16.777216000000003, 18.874368, 20.97152, 23.068672000000003, 25.165824, 27.262976000000002, 28.311552000000002], "y": [0.2509999871253967, 0.2892500013113022, 0.3190000057220459, 0.3385000079870224, 0.3449999988079071, 0.3495000004768371, 0.36374999582767487, 0.3604999929666519, 0.36549998819828033, 0.37074999511241913, 0.37150000035762787, 0.3722500056028366, 0.37774999439716334, 0.3774999976158142, 0.37899999320507044], "label": "Short lines filter"}, "filtering-baseline-2019-18-40gt": {"x": [0.0, 2.0971520000000003, 4.194304000000001, 6.291456, 8.388608000000001, 10.48576, 12.582912, 14.680064000000002, 16.777216000000003, 18.874368, 20.97152, 23.068672000000003, 25.165824, 27.262976000000002, 28.311552000000002], "y": [0.2515000104904175, 0.2854999899864197, 0.3215000033378601, 0.3384999930858612, 0.3445000052452087, 0.3540000021457672, 0.3544999957084656, 0.3650000095367431, 0.3714999854564667, 0.3695000112056732, 0.3700000047683716, 0.3720000088214874, 0.3770000040531158, 0.3770000040531158, 0.3774999976158142], "label": "Baseline"}, "filtering-custom-lines-punc-0.12": {"x": [0.0, 2.0971520000000003, 4.194304000000001, 6.291456, 8.388608000000001, 10.48576, 12.582912, 14.680064000000002, 16.777216000000003, 18.874368, 20.97152, 23.068672000000003, 25.165824, 27.262976000000002, 28.311552000000002], "y": [0.2509999871253967, 0.29100000858306885, 0.31949999928474426, 0.33675000071525574, 0.34524999558925623, 0.35850000381469727, 0.3557499945163727, 0.36124999821186066, 0.3599999994039535, 0.36800000071525574, 0.36775000393390656, 0.3770000040531158, 0.37025000154972076, 0.37424999475479126, 0.37299999594688416], "label": "Punctuation filter"}}, "layout": {"title": {"text": "Custom filters Performance"}}}
assets/data/plots/custom_filters/commonsense_qa_acc_norm.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"data": {"filtering-custom-short-line-ratio-0.67": {"x": [0.0, 2.0971520000000003, 4.194304000000001, 6.291456, 8.388608000000001, 10.48576, 12.582912, 14.680064000000002, 16.777216000000003, 18.874368, 20.97152, 23.068672000000003, 25.165824, 27.262976000000002, 28.311552000000002], "y": [0.2329999953508377, 0.2620000094175339, 0.28949999809265137, 0.2974999994039535, 0.30550000071525574, 0.30900000035762787, 0.31200000643730164, 0.3190000057220459, 0.32999999821186066, 0.3254999965429306, 0.3344999998807907, 0.3320000022649765, 0.3374999910593033, 0.3369999974966049, 0.33949999511241913], "label": "Short lines filter"}, "filtering-custom-line-char-duplicated-v2-0.01": {"x": [0.0, 2.0971520000000003, 4.194304000000001, 6.291456, 8.388608000000001, 10.48576, 12.582912, 14.680064000000002, 16.777216000000003, 18.874368, 20.97152, 23.068672000000003, 25.165824, 27.262976000000002, 28.311552000000002], "y": [0.2329999953508377, 0.2644999921321869, 0.2750000059604645, 0.2989999949932098, 0.2974999994039535, 0.31599999964237213, 0.3149999976158142, 0.3199999928474426, 0.3244999945163727, 0.3269999921321869, 0.33550000190734863, 0.3275000005960464, 0.33599999547004694, 0.3349999934434891, 0.33849999308586115], "label": "Line duplicates filter"}, "filtering-baseline-2019-18-40gt": {"x": [0.0, 2.0971520000000003, 4.194304000000001, 6.291456, 8.388608000000001, 10.48576, 12.582912, 14.680064000000002, 16.777216000000003, 18.874368, 20.97152, 23.068672000000003, 25.165824, 27.262976000000002, 28.311552000000002], "y": [0.2329999953508377, 0.2709999978542328, 0.2840000092983246, 0.2910000085830688, 0.3149999976158142, 0.3079999983310699, 0.3269999921321869, 0.3269999921321869, 0.3179999887943268, 0.3260000050067901, 0.328000009059906, 0.3350000083446502, 0.3330000042915344, 0.3409999907016754, 0.335999995470047], "label": "Baseline"}, "filtering-custom-punc0.12-short-lines0.67-line_char_dup0.1": {"x": [0.0, 2.0971520000000003, 4.194304000000001, 6.291456, 8.388608000000001, 10.48576, 12.582912, 14.680064000000002, 16.777216000000003, 18.874368, 20.97152, 23.068672000000003, 25.165824, 27.262976000000002, 28.311552000000002], "y": [0.2329999953508377, 0.2649999856948852, 0.2790000140666961, 0.29649999737739563, 0.3135000020265579, 0.3164999932050705, 0.32099999487400055, 0.3210000097751617, 0.3305000066757202, 0.3205000013113022, 0.32549999654293055, 0.3295000046491623, 0.33050000667572016, 0.335999995470047, 0.33200000226497645], "label": "Filters combined"}, "filtering-custom-lines-punc-0.12": {"x": [0.0, 2.0971520000000003, 4.194304000000001, 6.291456, 8.388608000000001, 10.48576, 12.582912, 14.680064000000002, 16.777216000000003, 18.874368, 20.97152, 23.068672000000003, 25.165824, 27.262976000000002, 28.311552000000002], "y": [0.2329999953508377, 0.26349999010562897, 0.28849999606609344, 0.29600000381469727, 0.30650000274181366, 0.31900000572204584, 0.3229999989271164, 0.3150000125169754, 0.3244999945163727, 0.3310000002384186, 0.3310000002384186, 0.32999999821186066, 0.3334999978542328, 0.3344999998807907, 0.32999999821186066], "label": "Punctuation filter"}}, "layout": {"title": {"text": "Custom filters Performance"}}}
assets/data/plots/custom_filters/hellaswag_acc_norm.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"data": {"filtering-custom-punc0.12-short-lines0.67-line_char_dup0.1": {"x": [0.0, 2.0971520000000003, 4.194304000000001, 6.291456, 8.388608000000001, 10.48576, 12.582912, 14.680064000000002, 16.777216000000003, 18.874368, 20.97152, 23.068672000000003, 25.165824, 27.262976000000002, 28.311552000000002], "y": [0.257999986410141, 0.29349999129772186, 0.3210000097751617, 0.36150000989437103, 0.3734999895095825, 0.39599999785423273, 0.4125000089406967, 0.4234999865293503, 0.42749999463558197, 0.44699999690055847, 0.4549999982118606, 0.4660000056028366, 0.46600000560283655, 0.47050000727176666, 0.4675000011920929], "label": "Filters combined"}, "filtering-custom-lines-punc-0.12": {"x": [0.0, 2.0971520000000003, 4.194304000000001, 6.291456, 8.388608000000001, 10.48576, 12.582912, 14.680064000000002, 16.777216000000003, 18.874368, 20.97152, 23.068672000000003, 25.165824, 27.262976000000002, 28.311552000000002], "y": [0.257999986410141, 0.29449999332427973, 0.33550000190734863, 0.34800000488758087, 0.3764999955892563, 0.3824999928474426, 0.3955000042915344, 0.41799999773502344, 0.4270000010728836, 0.43400000035762787, 0.44450001418590546, 0.45049999654293055, 0.45450000464916224, 0.45449998974800104, 0.4550000131130218], "label": "Punctuation filter"}, "filtering-custom-line-char-duplicated-v2-0.01": {"x": [0.0, 2.0971520000000003, 4.194304000000001, 6.291456, 8.388608000000001, 10.48576, 12.582912, 14.680064000000002, 16.777216000000003, 18.874368, 20.97152, 23.068672000000003, 25.165824, 27.262976000000002, 28.311552000000002], "y": [0.257999986410141, 0.28900000452995295, 0.3310000002384186, 0.3505000025033951, 0.3790000081062317, 0.39250001311302185, 0.40549999475479126, 0.4224999994039535, 0.4284999966621399, 0.43050000071525574, 0.43799999356269836, 0.4459999948740005, 0.4495000094175339, 0.4564999938011169, 0.4529999941587448], "label": "Line duplicates filter"}, "filtering-custom-short-line-ratio-0.67": {"x": [0.0, 2.0971520000000003, 4.194304000000001, 6.291456, 8.388608000000001, 10.48576, 12.582912, 14.680064000000002, 16.777216000000003, 18.874368, 20.97152, 23.068672000000003, 25.165824, 27.262976000000002, 28.311552000000002], "y": [0.257999986410141, 0.3020000010728836, 0.3310000002384186, 0.357000008225441, 0.37899999320507044, 0.38850000500679016, 0.3994999974966049, 0.40349999070167536, 0.4175000041723251, 0.42400000989437103, 0.4245000034570694, 0.4335000067949295, 0.4360000044107437, 0.44750000536441803, 0.44200000166893], "label": "Short lines filter"}, "filtering-baseline-2019-18-40gt": {"x": [0.0, 2.0971520000000003, 4.194304000000001, 6.291456, 8.388608000000001, 10.48576, 12.582912, 14.680064000000002, 16.777216000000003, 18.874368, 20.97152, 23.068672000000003, 25.165824, 27.262976000000002, 28.311552000000002], "y": [0.257999986410141, 0.2870000004768371, 0.3319999873638153, 0.3589999973773956, 0.3659999966621399, 0.3889999985694885, 0.402999997138977, 0.4180000126361847, 0.421999990940094, 0.421999990940094, 0.4289999902248382, 0.4309999942779541, 0.4320000112056732, 0.4370000064373016, 0.4350000023841858], "label": "Baseline"}}, "layout": {"title": {"text": "Custom filters Performance"}}}
assets/data/plots/custom_filters/index.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"files": {"agg_score": {"file": "agg_score.json"}, "commonsense_qa/acc_norm": {"file": "commonsense_qa_acc_norm.json"}, "hellaswag/acc_norm": {"file": "hellaswag_acc_norm.json"}, "openbookqa/acc_norm": {"file": "openbookqa_acc_norm.json"}, "piqa/acc_norm": {"file": "piqa_acc_norm.json"}, "siqa/acc_norm": {"file": "siqa_acc_norm.json"}, "winogrande/acc_norm": {"file": "winogrande_acc_norm.json"}, "arc/acc_norm": {"file": "arc_acc_norm.json"}, "mmlu/acc_norm": {"file": "mmlu_acc_norm.json"}}, "settings": {"defaultMetric": "agg_score", "slider": {"min": 0, "max": 10, "default": 3}}}
assets/data/plots/custom_filters/mmlu_acc_norm.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"data": {"filtering-custom-punc0.12-short-lines0.67-line_char_dup0.1": {"x": [0.0, 2.0971520000000003, 4.194304000000001, 6.291456, 8.388608000000001, 10.48576, 12.582912, 14.680064000000002, 16.777216000000003, 18.874368, 20.97152, 23.068672000000003, 25.165824, 27.262976000000002, 28.311552000000002], "y": [0.2501466572284698, 0.2534636557102203, 0.2621634304523468, 0.2661087810993194, 0.2704689502716064, 0.27318383753299713, 0.2757955640554428, 0.2758005559444427, 0.28340134024620056, 0.2835562080144882, 0.28641459345817566, 0.28565025329589844, 0.28998473286628723, 0.29013633728027344, 0.2888867110013962], "label": "Filters combined"}, "filtering-custom-lines-punc-0.12": {"x": [0.0, 2.0971520000000003, 4.194304000000001, 6.291456, 8.388608000000001, 10.48576, 12.582912, 14.680064000000002, 16.777216000000003, 18.874368, 20.97152, 23.068672000000003, 25.165824, 27.262976000000002, 28.311552000000002], "y": [0.2501466572284698, 0.2563375234603882, 0.26243858039379114, 0.26873072981834406, 0.27219884097576136, 0.27462176978588104, 0.27908372879028315, 0.2813303619623184, 0.28369809687137604, 0.28319956362247467, 0.28563097119331354, 0.28614395856857294, 0.28564512729644775, 0.2862519174814224, 0.2876724004745483], "label": "Punctuation filter"}, "filtering-baseline-2019-18-40gt": {"x": [0.0, 2.0971520000000003, 4.194304000000001, 6.291456, 8.388608000000001, 10.48576, 12.582912, 14.680064000000002, 16.777216000000003, 18.874368, 20.97152, 23.068672000000003, 25.165824, 27.262976000000002, 28.311552000000002], "y": [0.2501270473003387, 0.253291368484497, 0.2609881162643432, 0.2644513845443725, 0.2703824639320373, 0.2735317945480346, 0.2759087681770324, 0.2779825627803802, 0.2812439203262329, 0.2799430787563324, 0.286032885313034, 0.2868514060974121, 0.2856118083000183, 0.2887309193611145, 0.2871274054050445], "label": "Baseline"}, "filtering-custom-line-char-duplicated-v2-0.01": {"x": [0.0, 2.0971520000000003, 4.194304000000001, 6.291456, 8.388608000000001, 10.48576, 12.582912, 14.680064000000002, 16.777216000000003, 18.874368, 20.97152, 23.068672000000003, 25.165824, 27.262976000000002, 28.311552000000002], "y": [0.25018398463726044, 0.2544838488101959, 0.2611646503210068, 0.2652362138032913, 0.2704761028289795, 0.2737790495157242, 0.276611790060997, 0.2786822021007538, 0.281442791223526, 0.2816756069660187, 0.2860289514064789, 0.28624334931373596, 0.2867202013731003, 0.28732720017433167, 0.28609761595726013], "label": "Line duplicates filter"}, "filtering-custom-short-line-ratio-0.67": {"x": [0.0, 2.0971520000000003, 4.194304000000001, 6.291456, 8.388608000000001, 10.48576, 12.582912, 14.680064000000002, 16.777216000000003, 18.874368, 20.97152, 23.068672000000003, 25.165824, 27.262976000000002, 28.311552000000002], "y": [0.24996141344308848, 0.25390757620334625, 0.26540763676166534, 0.27061584591865534, 0.27150256931781763, 0.2718626409769058, 0.27449728548526764, 0.2784059643745422, 0.28175103664398193, 0.28019529581069946, 0.2827359586954117, 0.2814059555530548, 0.2844651788473129, 0.28390273451805115, 0.2838368713855743], "label": "Short lines filter"}}, "layout": {"title": {"text": "Custom filters Performance"}}}
assets/data/plots/custom_filters/openbookqa_acc_norm.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"data": {"filtering-custom-lines-punc-0.12": {"x": [0.0, 2.0971520000000003, 4.194304000000001, 6.291456, 8.388608000000001, 10.48576, 12.582912, 14.680064000000002, 16.777216000000003, 18.874368, 20.97152, 23.068672000000003, 25.165824, 27.262976000000002, 28.311552000000002], "y": [0.2860000133514404, 0.2559999972581863, 0.2849999964237213, 0.3110000044107437, 0.2979999929666519, 0.3009999990463257, 0.318000003695488, 0.3140000104904175, 0.32899999618530273, 0.32899999618530273, 0.3369999974966049, 0.33599999547004694, 0.32900001108646393, 0.3299999982118606, 0.3330000042915344], "label": "Punctuation filter"}, "filtering-custom-punc0.12-short-lines0.67-line_char_dup0.1": {"x": [0.0, 2.0971520000000003, 4.194304000000001, 6.291456, 8.388608000000001, 10.48576, 12.582912, 14.680064000000002, 16.777216000000003, 18.874368, 20.97152, 23.068672000000003, 25.165824, 27.262976000000002, 28.311552000000002], "y": [0.2860000133514404, 0.2719999998807907, 0.277999997138977, 0.3039999902248382, 0.28199999034404755, 0.30200000107288355, 0.3050000071525574, 0.31299999356269836, 0.32099999487400055, 0.3269999921321869, 0.31599999964237213, 0.3260000050067901, 0.32600000500679016, 0.3299999982118606, 0.32500000298023224], "label": "Filters combined"}, "filtering-custom-short-line-ratio-0.67": {"x": [0.0, 2.0971520000000003, 4.194304000000001, 6.291456, 8.388608000000001, 10.48576, 12.582912, 14.680064000000002, 16.777216000000003, 18.874368, 20.97152, 23.068672000000003, 25.165824, 27.262976000000002, 28.311552000000002], "y": [0.2860000133514404, 0.25999999046325684, 0.28200000524520874, 0.28599999845027924, 0.289000004529953, 0.29999999701976776, 0.31300000846385956, 0.31900000572204584, 0.3149999976158142, 0.32099999487400055, 0.3139999955892563, 0.3190000057220459, 0.32200001180171967, 0.3229999989271164, 0.3240000009536743], "label": "Short lines filter"}, "filtering-custom-line-char-duplicated-v2-0.01": {"x": [0.0, 2.0971520000000003, 4.194304000000001, 6.291456, 8.388608000000001, 10.48576, 12.582912, 14.680064000000002, 16.777216000000003, 18.874368, 20.97152, 23.068672000000003, 25.165824, 27.262976000000002, 28.311552000000002], "y": [0.2860000133514404, 0.26900000870227814, 0.2670000046491623, 0.306999996304512, 0.2939999997615814, 0.2999999970197677, 0.306999996304512, 0.31200000643730164, 0.31299999356269836, 0.3200000077486038, 0.3229999989271164, 0.32099999487400055, 0.32500000298023224, 0.3240000009536743, 0.3219999969005584], "label": "Line duplicates filter"}, "filtering-baseline-2019-18-40gt": {"x": [0.0, 2.0971520000000003, 4.194304000000001, 6.291456, 8.388608000000001, 10.48576, 12.582912, 14.680064000000002, 16.777216000000003, 18.874368, 20.97152, 23.068672000000003, 25.165824, 27.262976000000002, 28.311552000000002], "y": [0.2860000133514404, 0.2599999904632568, 0.2680000066757202, 0.2800000011920929, 0.2860000133514404, 0.2960000038146972, 0.2980000078678131, 0.3039999902248382, 0.3059999942779541, 0.3179999887943268, 0.3319999873638153, 0.3140000104904175, 0.3199999928474426, 0.3140000104904175, 0.3160000145435333], "label": "Baseline"}}, "layout": {"title": {"text": "Custom filters Performance"}}}
assets/data/plots/custom_filters/piqa_acc_norm.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"data": {"filtering-custom-line-char-duplicated-v2-0.01": {"x": [0.0, 2.0971520000000003, 4.194304000000001, 6.291456, 8.388608000000001, 10.48576, 12.582912, 14.680064000000002, 16.777216000000003, 18.874368, 20.97152, 23.068672000000003, 25.165824, 27.262976000000002, 28.311552000000002], "y": [0.5099999904632568, 0.609499990940094, 0.652999997138977, 0.6744999885559082, 0.68299999833107, 0.6809999942779541, 0.6965000033378601, 0.6995000243186951, 0.7145000100135803, 0.7100000083446503, 0.7105000019073486, 0.7134999930858612, 0.7159999907016754, 0.7170000076293945, 0.7199999988079071], "label": "Line duplicates filter"}, "filtering-custom-punc0.12-short-lines0.67-line_char_dup0.1": {"x": [0.0, 2.0971520000000003, 4.194304000000001, 6.291456, 8.388608000000001, 10.48576, 12.582912, 14.680064000000002, 16.777216000000003, 18.874368, 20.97152, 23.068672000000003, 25.165824, 27.262976000000002, 28.311552000000002], "y": [0.5099999904632568, 0.6274999976158142, 0.656000018119812, 0.6665000021457672, 0.6854999959468842, 0.6895000040531158, 0.7035000026226044, 0.7060000002384186, 0.7100000083446503, 0.7195000052452087, 0.7159999907016754, 0.715499997138977, 0.7170000076293945, 0.7274999916553497, 0.7199999988079071], "label": "Filters combined"}, "filtering-custom-short-line-ratio-0.67": {"x": [0.0, 2.0971520000000003, 4.194304000000001, 6.291456, 8.388608000000001, 10.48576, 12.582912, 14.680064000000002, 16.777216000000003, 18.874368, 20.97152, 23.068672000000003, 25.165824, 27.262976000000002, 28.311552000000002], "y": [0.5099999904632568, 0.6254999935626984, 0.6530000269412994, 0.6665000021457672, 0.6860000193119049, 0.6980000138282776, 0.695499986410141, 0.7084999978542328, 0.7080000042915344, 0.7064999938011169, 0.7095000147819519, 0.7129999995231628, 0.7159999907016754, 0.7179999947547913, 0.718500018119812], "label": "Short lines filter"}, "filtering-custom-lines-punc-0.12": {"x": [0.0, 2.0971520000000003, 4.194304000000001, 6.291456, 8.388608000000001, 10.48576, 12.582912, 14.680064000000002, 16.777216000000003, 18.874368, 20.97152, 23.068672000000003, 25.165824, 27.262976000000002, 28.311552000000002], "y": [0.5099999904632568, 0.6155000030994415, 0.648499995470047, 0.6649999916553497, 0.6865000128746033, 0.690500020980835, 0.6965000033378601, 0.7029999792575836, 0.7139999866485596, 0.7105000019073486, 0.7089999914169312, 0.7139999866485596, 0.7144999802112579, 0.7229999899864197, 0.7175000011920929], "label": "Punctuation filter"}, "filtering-baseline-2019-18-40gt": {"x": [0.0, 2.0971520000000003, 4.194304000000001, 6.291456, 8.388608000000001, 10.48576, 12.582912, 14.680064000000002, 16.777216000000003, 18.874368, 20.97152, 23.068672000000003, 25.165824, 27.262976000000002, 28.311552000000002], "y": [0.5099999904632568, 0.6209999918937683, 0.6520000100135803, 0.6639999747276306, 0.6880000233650208, 0.6890000104904175, 0.699999988079071, 0.6980000138282776, 0.7049999833106995, 0.7080000042915344, 0.7110000252723694, 0.7070000171661377, 0.7179999947547913, 0.7120000123977661, 0.7160000205039978], "label": "Baseline"}}, "layout": {"title": {"text": "Custom filters Performance"}}}
assets/data/plots/custom_filters/siqa_acc_norm.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"data": {"filtering-custom-line-char-duplicated-v2-0.01": {"x": [0.0, 2.0971520000000003, 4.194304000000001, 6.291456, 8.388608000000001, 10.48576, 12.582912, 14.680064000000002, 16.777216000000003, 18.874368, 20.97152, 23.068672000000003, 25.165824, 27.262976000000002, 28.311552000000002], "y": [0.3619999885559082, 0.3894999921321869, 0.3989999890327453, 0.4060000032186508, 0.40299999713897705, 0.4055000096559524, 0.4095000028610229, 0.40450000762939453, 0.40750001370906824, 0.4074999988079071, 0.408500000834465, 0.41050000488758087, 0.40450000762939453, 0.40500000119209284, 0.4035000056028366], "label": "Line duplicates filter"}, "filtering-custom-lines-punc-0.12": {"x": [0.0, 2.0971520000000003, 4.194304000000001, 6.291456, 8.388608000000001, 10.48576, 12.582912, 14.680064000000002, 16.777216000000003, 18.874368, 20.97152, 23.068672000000003, 25.165824, 27.262976000000002, 28.311552000000002], "y": [0.3619999885559082, 0.3930000066757202, 0.39750000834465027, 0.40049999952316284, 0.39849999547004694, 0.40449999272823334, 0.4054999947547912, 0.4020000100135803, 0.4115000069141388, 0.40800000727176666, 0.402999997138977, 0.4074999988079071, 0.40700000524520874, 0.4060000032186508, 0.40250000357627863], "label": "Punctuation filter"}, "filtering-baseline-2019-18-40gt": {"x": [0.0, 2.0971520000000003, 4.194304000000001, 6.291456, 8.388608000000001, 10.48576, 12.582912, 14.680064000000002, 16.777216000000003, 18.874368, 20.97152, 23.068672000000003, 25.165824, 27.262976000000002, 28.311552000000002], "y": [0.3619999885559082, 0.3959999978542328, 0.3989999890327453, 0.4040000140666961, 0.3989999890327453, 0.4000000059604645, 0.3880000114440918, 0.4050000011920929, 0.4079999923706054, 0.4169999957084656, 0.4110000133514404, 0.4059999883174896, 0.414000004529953, 0.4099999964237213, 0.4020000100135803], "label": "Baseline"}, "filtering-custom-punc0.12-short-lines0.67-line_char_dup0.1": {"x": [0.0, 2.0971520000000003, 4.194304000000001, 6.291456, 8.388608000000001, 10.48576, 12.582912, 14.680064000000002, 16.777216000000003, 18.874368, 20.97152, 23.068672000000003, 25.165824, 27.262976000000002, 28.311552000000002], "y": [0.3619999885559082, 0.3889999985694885, 0.4040000140666961, 0.4035000056028366, 0.4050000011920929, 0.3995000123977661, 0.4064999967813492, 0.4050000011920929, 0.4025000035762787, 0.4055000096559524, 0.40799999237060547, 0.4000000059604645, 0.4025000035762787, 0.403999999165535, 0.40150000154972076], "label": "Filters combined"}, "filtering-custom-short-line-ratio-0.67": {"x": [0.0, 2.0971520000000003, 4.194304000000001, 6.291456, 8.388608000000001, 10.48576, 12.582912, 14.680064000000002, 16.777216000000003, 18.874368, 20.97152, 23.068672000000003, 25.165824, 27.262976000000002, 28.311552000000002], "y": [0.3619999885559082, 0.40350000560283655, 0.403999999165535, 0.4004999995231628, 0.4010000079870224, 0.39899998903274536, 0.4015000015497207, 0.39750000834465027, 0.3969999998807907, 0.4030000120401382, 0.4055000096559524, 0.4010000079870224, 0.4020000100135803, 0.40299999713897705, 0.3990000039339065], "label": "Short lines filter"}}, "layout": {"title": {"text": "Custom filters Performance"}}}