hynky HF staff committed on
Commit
443873d
1 Parent(s): bc3665e

actually commit the build

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. dist/assets/data/clustering/data.csv +0 -0
  2. dist/assets/data/clustering/info.csv +106 -0
  3. dist/assets/data/plots/all_dumps_bad/agg_score.json +1 -0
  4. dist/assets/data/plots/all_dumps_bad/arc_acc_norm.json +1 -0
  5. dist/assets/data/plots/all_dumps_bad/commonsense_qa_acc_norm.json +1 -0
  6. dist/assets/data/plots/all_dumps_bad/hellaswag_acc_norm.json +1 -0
  7. dist/assets/data/plots/all_dumps_bad/index.json +1 -0
  8. dist/assets/data/plots/all_dumps_bad/mmlu_acc_norm.json +1 -0
  9. dist/assets/data/plots/all_dumps_bad/openbookqa_acc_norm.json +1 -0
  10. dist/assets/data/plots/all_dumps_bad/piqa_acc_norm.json +1 -0
  11. dist/assets/data/plots/all_dumps_bad/siqa_acc_norm.json +1 -0
  12. dist/assets/data/plots/all_dumps_bad/winogrande_acc_norm.json +1 -0
  13. dist/assets/data/plots/all_filtering_steps/agg_score.json +1 -0
  14. dist/assets/data/plots/all_filtering_steps/arc_acc_norm.json +1 -0
  15. dist/assets/data/plots/all_filtering_steps/commonsense_qa_acc_norm.json +1 -0
  16. dist/assets/data/plots/all_filtering_steps/hellaswag_acc_norm.json +1 -0
  17. dist/assets/data/plots/all_filtering_steps/index.json +1 -0
  18. dist/assets/data/plots/all_filtering_steps/mmlu_acc_norm.json +1 -0
  19. dist/assets/data/plots/all_filtering_steps/openbookqa_acc_norm.json +1 -0
  20. dist/assets/data/plots/all_filtering_steps/piqa_acc_norm.json +1 -0
  21. dist/assets/data/plots/all_filtering_steps/siqa_acc_norm.json +1 -0
  22. dist/assets/data/plots/all_filtering_steps/winogrande_acc_norm.json +1 -0
  23. dist/assets/data/plots/c4_filters_hellaswag/agg_score.json +1 -0
  24. dist/assets/data/plots/c4_filters_hellaswag/arc_acc_norm.json +1 -0
  25. dist/assets/data/plots/c4_filters_hellaswag/commonsense_qa_acc_norm.json +1 -0
  26. dist/assets/data/plots/c4_filters_hellaswag/hellaswag_acc_norm.json +1 -0
  27. dist/assets/data/plots/c4_filters_hellaswag/index.json +1 -0
  28. dist/assets/data/plots/c4_filters_hellaswag/mmlu_acc_norm.json +1 -0
  29. dist/assets/data/plots/c4_filters_hellaswag/openbookqa_acc_norm.json +1 -0
  30. dist/assets/data/plots/c4_filters_hellaswag/piqa_acc_norm.json +1 -0
  31. dist/assets/data/plots/c4_filters_hellaswag/siqa_acc_norm.json +1 -0
  32. dist/assets/data/plots/c4_filters_hellaswag/winogrande_acc_norm.json +1 -0
  33. dist/assets/data/plots/cross_ind_unfiltered_comparison/agg_score.json +1 -0
  34. dist/assets/data/plots/cross_ind_unfiltered_comparison/commonsense_qa_acc_norm.json +1 -0
  35. dist/assets/data/plots/cross_ind_unfiltered_comparison/hellaswag_acc_norm.json +1 -0
  36. dist/assets/data/plots/cross_ind_unfiltered_comparison/index.json +1 -0
  37. dist/assets/data/plots/cross_ind_unfiltered_comparison/mmlu_acc_norm.json +1 -0
  38. dist/assets/data/plots/cross_ind_unfiltered_comparison/openbookqa_acc_norm.json +1 -0
  39. dist/assets/data/plots/cross_ind_unfiltered_comparison/piqa_acc_norm.json +1 -0
  40. dist/assets/data/plots/cross_ind_unfiltered_comparison/winogrande_acc_norm.json +1 -0
  41. dist/assets/data/plots/custom_filters/agg_score.json +1 -0
  42. dist/assets/data/plots/custom_filters/arc_acc_norm.json +1 -0
  43. dist/assets/data/plots/custom_filters/commonsense_qa_acc_norm.json +1 -0
  44. dist/assets/data/plots/custom_filters/hellaswag_acc_norm.json +1 -0
  45. dist/assets/data/plots/custom_filters/index.json +1 -0
  46. dist/assets/data/plots/custom_filters/mmlu_acc_norm.json +1 -0
  47. dist/assets/data/plots/custom_filters/openbookqa_acc_norm.json +1 -0
  48. dist/assets/data/plots/custom_filters/piqa_acc_norm.json +1 -0
  49. dist/assets/data/plots/custom_filters/siqa_acc_norm.json +1 -0
  50. dist/assets/data/plots/custom_filters/winogrande_acc_norm.json +1 -0
dist/assets/data/clustering/data.csv ADDED
The diff for this file is too large to render. See raw diff
 
dist/assets/data/clustering/info.csv ADDED
@@ -0,0 +1,106 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ,cluster_id,cluster_summaries,cluster_position_x,cluster_position_y
2
+ 0,-1,None,9.926462,4.7121987
3
+ 1,0,Philosophical/Spiritual Introspection,10.312462,1.2666532
4
+ 2,1,"Scholarships,",8.167274,4.8995786
5
+ 3,2,Politics,8.81142,2.4859838
6
+ 4,3,Theology,9.615214,0.3783942
7
+ 5,4,Dating,4.985182,1.8439052
8
+ 6,5,Accommodation,11.457769,5.080919
9
+ 7,6,Football,6.6154537,-1.6859366
10
+ 8,7,Film Festival,6.9734483,1.4548192
11
+ 9,8,Culinary,13.426296,4.5412893
12
+ 10,9,Music,6.0653744,0.7536916
13
+ 11,10,Gambling,3.124241,3.2533677
14
+ 12,11,Baseball,7.133596,-2.4256644
15
+ 13,12,Technology,6.4929094,6.768577
16
+ 14,13,Website Policies,4.873843,5.771508
17
+ 15,14,Weddings,11.815845,3.7894728
18
+ 16,15,Gaming,5.529167,2.9530518
19
+ 17,16,Commodities/Services Provision,10.453564,5.8489122
20
+ 18,17,Crafts,13.287651,6.4237967
21
+ 19,18,Automobiles,9.9531145,8.840178
22
+ 20,19,Watches,13.893139,9.859185
23
+ 21,20,Dogs,12.595798,3.5351615
24
+ 22,21,Photography,10.7942295,3.5504062
25
+ 23,22,Legalities,8.942016,4.72733
26
+ 24,23,Consumer Electronics,7.078649,8.338984
27
+ 25,24,Insulation,10.520957,7.914946
28
+ 26,25,Cannabis,14.317424,3.2114828
29
+ 27,26,Footwear,15.052116,7.6956415
30
+ 28,27,Real Estate,9.536316,6.103533
31
+ 29,28,Relocation,10.205071,7.1883316
32
+ 30,29,Sports betting,3.2779586,2.443366
33
+ 31,30,Narratives,7.613535,1.8300554
34
+ 32,31,Dating,4.788838,2.1900373
35
+ 33,32,Apparel/Clothing,14.394226,7.3073387
36
+ 34,33,User Authentication,5.265638,6.4014487
37
+ 35,34,Academic writing,6.9187264,3.4357684
38
+ 36,35,Sports,7.4969172,-2.086585
39
+ 37,36,Fashion/Lifestyle Products,13.821669,7.7150764
40
+ 38,37,Diverse events,9.437052,2.2438836
41
+ 39,38,Blockchain/Cryptocurrency,7.7586045,6.9439344
42
+ 40,39,Online Businesses/Marketing,6.522259,5.219268
43
+ 41,40,Healthcare,11.425277,2.3801014
44
+ 42,41,Home Decor,12.878046,7.2632184
45
+ 43,42,Biomedicine,12.789575,2.3376262
46
+ 44,43,Jewelry,14.259997,8.653363
47
+ 45,44,Addiction,11.561383,1.3774762
48
+ 46,45,Products,11.711758,8.423251
49
+ 47,46,Multi-purposefulness,11.080702,7.4574013
50
+ 48,47,"Mass transit,",9.910158,5.4402313
51
+ 49,48,Ethernet,6.9763823,7.7909245
52
+ 50,49,Legal,9.516912,4.636553
53
+ 51,50,E-commerce,13.263438,8.6548195
54
+ 52,51,Audio,7.717162,8.903019
55
+ 53,52,Infrastructure,10.52904,5.369669
56
+ 54,53,Firearms,11.062812,9.268473
57
+ 55,54,Freight/Logistics,9.551044,7.0336204
58
+ 56,55,Products,12.073747,7.645973
59
+ 57,56,Vaccinations,11.9387045,2.7824683
60
+ 58,57,Artwork,11.019163,4.1677165
61
+ 59,58,Viticulture,14.223523,5.0761614
62
+ 60,59,WordPress,5.9597983,5.824579
63
+ 61,60,Cosmetics/Dermatology,15.093273,3.4669027
64
+ 62,61,Software,6.375921,6.4298844
65
+ 63,62,Dentistry,14.76626,1.1620314
66
+ 64,63,Pest Control,13.201735,3.6806118
67
+ 65,64,SEO,5.720493,5.238112
68
+ 66,65,Lottery,1.7142816,2.9782674
69
+ 67,66,Narratives,8.460977,1.0804662
70
+ 68,67,Waste Reduction & Recycling,10.634534,6.959523
71
+ 69,68,Communication,6.438943,5.9467845
72
+ 70,69,Orthopedics,13.005415,1.1908791
73
+ 71,70,Home Decor & Furniture,12.732457,7.876862
74
+ 72,71,Education,7.6568975,3.4944353
75
+ 73,72,Sports,7.295141,-0.7343214
76
+ 74,73,Social Media Advertising,6.133886,4.8547883
77
+ 75,74,Privacy,4.756733,6.3598356
78
+ 76,75,Website design,6.1168823,5.465095
79
+ 77,76,Roofing,11.389448,8.080609
80
+ 78,77,Nutrition/Supplements,13.631578,2.5334294
81
+ 79,78,Haircare/Hairstyling,15.544645,4.54254
82
+ 80,79,Cookies,4.341592,6.819268
83
+ 81,80,International Trade,8.993828,6.4757586
84
+ 82,81,Entrepreneurial Resources,9.435777,5.3340797
85
+ 83,82,Cricket,6.5171986,-1.245905
86
+ 84,83,Crafts,13.852216,7.049825
87
+ 85,84,Floristry,13.407425,5.8741536
88
+ 86,85,Genealogy,9.530803,1.6548243
89
+ 87,86,Mental Health,11.074349,1.6069281
90
+ 88,87,Volunteerism,10.145443,3.6734574
91
+ 89,88,Lighting,11.385381,8.93693
92
+ 90,89,Artificial Intelligence,6.5306387,6.2178063
93
+ 91,90,Business,7.471462,6.4142885
94
+ 92,91,E-commerce,13.638669,6.5098934
95
+ 93,92,Urbanization/Over-tourism,10.221115,6.100654
96
+ 94,93,Events,10.8449,3.9822264
97
+ 95,94,Pharmaceuticals/Biotechnology,12.318266,2.4331784
98
+ 96,95,Professional Wrestling,6.856304,-0.65598303
99
+ 97,96,Various,9.3211975,3.4894605
100
+ 98,97,Medicine,13.17882,2.1281319
101
+ 99,98,Community Engagement,9.848856,3.5187004
102
+ 100,99,Fitness,12.504849,0.9134393
103
+ 101,100,Bathroom Design & Toilet Engineering,11.779076,7.2920136
104
+ 102,101,Business Development,7.328447,5.659843
105
+ 103,102,Sports,7.6370654,-1.0701839
106
+ 104,103,Sexuality,13.817207,1.6510898
dist/assets/data/plots/all_dumps_bad/agg_score.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"data":{"big-run-refinedweb":{"x":[0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002,29.360128000000003,31.45728,33.554432000000006,35.651584,37.748736,39.845888,41.94304,44.040192000000005,46.137344000000006,48.234496,50.331648,52.4288,54.525952000000004,56.623104000000005,58.720256000000006,60.817408,62.91456,65.011712,67.10886400000001,69.206016,71.303168,73.40032000000001,75.497472,77.59462400000001,79.691776,81.788928,83.88608,85.983232,88.08038400000001,90.177536,92.27468800000001,94.37184,96.468992,98.56614400000001,100.663296,102.76044800000001,104.8576,106.95475200000001,109.05190400000001,111.149056,113.24620800000001,115.34336,117.44051200000001,119.537664,121.634816,123.73196800000001,125.82912,127.92627200000001,130.023424,132.120576,134.21772800000002,136.31488000000002,138.412032,140.509184,142.606336,144.70348800000002,146.80064000000002,148.897792,150.994944,153.092096,155.18924800000002,157.28640000000001,159.383552,161.480704,163.577856,165.67500800000002,167.77216,169.869312,171.966464,174.06361600000002,176.16076800000002,178.25792,180.355072,182.452224,184.54937600000002,186.64652800000002,188.74368,190.840832,192.937984,195.03513600000002,197.13228800000002,199.22944,201.326592,203.423744,205.52089600000002,207.61804800000002,209.7152,211.812352,213.90950400000003,216.00665600000002,218.10380800000001,220.20096,222.298112,224.39526400000003,226.49241600000002,228.589568,230.68672,232.783872,234.88102400000002,236.97817600000002,239.075328,241.17248,243.269632,245.36678400000002,247.46393600000002,249.561088,251.65824,253.75539200000003,255.85254400000002,257.949696,260.046848,262.144,264.241152,266.338304,268.43545600000004,270.53260800000004,272.62976000000003,274.726912,276.824064,278.921216,281.018368,283.11552,285.212672,287.309824,289.40697600000004,291.50412800000004,293.6012800
0000003,295.698432,297.795584,299.892736,301.989888,304.08704,306.184192,308.28134400000005,310.37849600000004,312.47564800000004,314.57280000000003,316.669952,318.767104,320.864256,322.961408,325.05856,327.155712,329.25286400000005,331.35001600000004,333.44716800000003,335.54432,337.641472,339.738624,341.835776,343.932928,346.03008,348.12723200000005,350.22438400000004],"y":[0.3308933284133672,0.3534814938902855,0.3764607086777687,0.38782499730587,0.3981050960719585,0.4028486795723438,0.4125883243978023,0.4117814563214779,0.414029736071825,0.4197172522544861,0.4211113378405571,0.4279881417751312,0.4280137903988361,0.4280424378812313,0.4291964024305343,0.4326301179826259,0.4371833503246307,0.4346669465303421,0.4336562640964985,0.4432648755609989,0.4401291646063328,0.4394684173166752,0.4476612061262131,0.4465444348752498,0.4472153298556804,0.4433343075215816,0.4510187618434429,0.4459567815065384,0.4460812956094742,0.4498684890568256,0.4529943652451038,0.4528274349868297,0.4551213420927524,0.4549156539142132,0.4564928151667118,0.4576693661510944,0.4557182416319847,0.4536240361630916,0.457439012825489,0.4570476822555065,0.4589823484420776,0.462024375796318,0.4540738053619861,0.4550252184271812,0.4576593860983848,0.4573238864541054,0.4575810581445694,0.4622134491801262,0.4592566937208175,0.4614734016358852,0.4637473002076149,0.4625372551381588,0.4613912180066108,0.4597448222339153,0.4594792164862156,0.4662549719214439,0.4634026065468788,0.4633508697152138,0.4635734222829342,0.4628961533308029,0.4670135043561458,0.4639505892992019,0.4631133340299129,0.4665167145431041,0.4672448337078094,0.4693268723785877,0.4630668573081493,0.4676454700529575,0.4646359197795391,0.4621579721570015,0.4692446552217006,0.4704835228621959,0.4663223996758461,0.4680556617677212,0.466339822858572,0.4682099223136902,0.4711195565760135,0.4722655527293682,0.4727961830794811,0.4676857478916645,0.4719390422105789,0.4713102728128433,0.4712141714990139,0.4721613004803657,0.4713456854224205,0.4682970903
813839,0.4679934531450271,0.4685162976384163,0.4679946713149547,0.4681242071092129,0.4702276065945625,0.472664151340723,0.4730790853500366,0.4731674715876579,0.4718914777040481,0.4719801284372806,0.4761029370129108,0.4735167175531387,0.4730370938777923,0.4730173237621784,0.4735377207398414,0.4777223989367485,0.4796326830983162,0.4734170883893966,0.4739485755562782,0.4748299159109592,0.4765299335122108,0.4745025858283043,0.4754423759877682,0.4784592799842357,0.4761341325938701,0.4760282784700393,0.4769757278263569,0.47154351323843,0.4786738082766533,0.4804279990494251,0.4777076803147793,0.4798569902777672,0.4759011939167976,0.4784621745347976,0.479673832654953,0.4780617095530033,0.48076206818223,0.47995800152421,0.4790860973298549,0.4817167408764362,0.4811586998403072,0.482547752559185,0.4816697351634502,0.4809327870607376,0.4816545359790325,0.4804601892828941,0.4776877984404564,0.4813711903989315,0.4844604581594467,0.4819537848234176,0.4820829331874847,0.4778126627206802,0.482935007661581,0.48230691999197,0.4826001971960068,0.4823969900608063,0.4811219945549965,0.4789146520197391,0.484035175293684,0.4848698377609253,0.4855728335678577,0.4825376532971859,0.485215101391077,0.4824351668357849,0.4835342466831207,0.4822137206792831,0.4838785007596016,0.4837255179882049,0.4853012599050998,0.4857851006090641,0.4863366298377514,0.4856646582484245,0.4842503517866134,0.4838776960968971,0.4846346862614155,0.4837041422724724,0.4813097268342972,0.4873070046305656,0.4841253720223903,0.4837464913725853,0.483069509267807,0.4851242564618587,0.4861010462045669],"label":"RefinedWeb"},"big-run-sampled_full_filtered_no_dedup":{"x":[0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002,29.360128000000003,31.45728,33.554432000000006,35.651584,37.748736,39.845888,41.94304,44.040192000000005,46.137344000000006,48.234496,50.331648,52.4288,
54.525952000000004,56.623104000000005,58.720256000000006,60.817408,62.91456,65.011712,67.10886400000001,69.206016,71.303168,73.40032000000001,75.497472,77.59462400000001,79.691776,81.788928,83.88608,85.983232,88.08038400000001,90.177536,92.27468800000001,94.37184,96.468992,98.56614400000001,100.663296,102.76044800000001,104.8576,106.95475200000001,109.05190400000001,111.149056,113.24620800000001,115.34336,117.44051200000001,119.537664,121.634816,123.73196800000001,125.82912,127.92627200000001,130.023424,132.120576,134.21772800000002,136.31488000000002,138.412032,140.509184,142.606336,144.70348800000002,146.80064000000002,148.897792,150.994944,153.092096,155.18924800000002,157.28640000000001,159.383552,161.480704,163.577856,165.67500800000002,167.77216,169.869312,171.966464,174.06361600000002,176.16076800000002,178.25792,180.355072,182.452224,184.54937600000002,186.64652800000002,188.74368,190.840832,192.937984,195.03513600000002,197.13228800000002,199.22944,201.326592,203.423744,205.52089600000002,207.61804800000002,209.7152,211.812352,213.90950400000003,216.00665600000002,218.10380800000001,220.20096,222.298112,224.39526400000003,226.49241600000002,228.589568,230.68672,232.783872,234.88102400000002,236.97817600000002,239.075328,241.17248,243.269632,245.36678400000002,247.46393600000002,249.561088,251.65824,253.75539200000003,255.85254400000002,257.949696,260.046848,262.144,264.241152,266.338304,268.43545600000004,270.53260800000004,272.62976000000003,274.726912,276.824064,278.921216,281.018368,283.11552,285.212672,287.309824,289.40697600000004,291.50412800000004,293.60128000000003,295.698432,297.795584,299.892736,301.989888,304.08704,306.184192,308.28134400000005,310.37849600000004,312.47564800000004,314.57280000000003,316.669952,318.767104,320.864256,322.961408,325.05856,327.155712,329.25286400000005,331.35001600000004,333.44716800000003,335.54432,337.641472,339.738624,341.835776,343.932928,346.03008,348.12723200000005,350.22438400000004],"y":[0.3308933284133672,0
.3605199865996837,0.3733148723840713,0.3882005847990513,0.3934122696518898,0.3947227671742439,0.4042885974049568,0.3974800482392311,0.4055779427289963,0.4133470430970192,0.4117913842201233,0.4113653488457203,0.4149517640471458,0.4187851920723915,0.4252083078026771,0.4206527359783649,0.4240428246557712,0.422003373503685,0.4280910938978195,0.4244147576391697,0.4316282644867897,0.4295645765960216,0.4310102686285972,0.4360743537545204,0.4313482865691185,0.4350991360843181,0.4378576353192329,0.4335876516997814,0.4347924515604973,0.4348904751241207,0.436600212007761,0.430036511272192,0.4350974671542644,0.4399556629359722,0.4371416717767715,0.4363861419260502,0.4376698136329651,0.4405004419386387,0.4373639523983001,0.4379038028419018,0.4371281825006008,0.4393439553678036,0.440426729619503,0.4401675276458263,0.4429537951946258,0.4449137263000011,0.4434786736965179,0.4450470842421055,0.4454202279448509,0.4394537284970283,0.442185215651989,0.4461225643754005,0.4427758157253265,0.4430646039545536,0.4476901069283485,0.4478763341903686,0.4493869319558143,0.4448477327823639,0.450044184923172,0.4498609118163585,0.4457665979862213,0.4506924152374267,0.449855338782072,0.448790930211544,0.4474099352955818,0.4546772800385952,0.4529431238770485,0.452015146613121,0.4502020999789238,0.4493804536759853,0.4523266032338142,0.4551868587732315,0.4501944817602634,0.4493303671479225,0.4526805207133293,0.4533850513398647,0.4518048763275146,0.4518973492085933,0.4531301632523536,0.4518006071448326,0.4553494565188885,0.4528752230107784,0.4536322727799415,0.4561733976006508,0.4549491256475448,0.4574789106845855,0.4577847123146057,0.4563642293214798,0.4578686729073524,0.4561499990522861,0.4537816494703293,0.4542164430022239,0.4559455662965774,0.4554723873734474,0.4575514122843742,0.4575202167034149,0.4592722058296203,0.4585275091230869,0.4580587856471538,0.456934317946434,0.4577495418488979,0.4540119916200638,0.4570806957781315,0.4608120545744896,0.4588425755500793,0.4578334167599678,0.46108160912990
57,0.4598177038133144,0.461849745362997,0.4631866924464702,0.4601576402783394,0.4646804705262184,0.4632389545440674,0.4604574106633663,0.4602976888418197,0.4581312239170074,0.4654182009398937,0.4655338563024997,0.4616620391607284,0.461054053157568,0.4613021649420261,0.4658613465726375,0.4633531905710697,0.4613638147711754,0.4643996246159076,0.462500050663948,0.4650798961520195,0.4648764543235302,0.4639869071543216,0.4634246975183487,0.46585888043046,0.4639799632132053,0.4630857892334461,0.4644265696406364,0.4642998576164245,0.4686848931014538,0.4687492996454239,0.4650243632495403,0.4627032242715359,0.4665953740477562,0.4660026729106903,0.4664581045508384,0.4676475040614605,0.4657339677214622,0.4664678275585174,0.4673498086631298,0.4676674827933311,0.4680955372750759,0.4681585058569908,0.4659864418208599,0.4686457589268684,0.4661462865769863,0.4658931568264961,0.4674226939678192,0.46805215254426,0.4682257212698459,0.4689070098102093,0.4699570722877979,0.4655096270143986,0.4688013233244419,0.4707522802054882,0.4661469310522079,0.4688841328024864,0.4671329781413078,0.4662554152309894,0.4697433896362781,0.4698473587632179,0.4676505327224731,0.4696521013975143],"label":"FineWeb filtered 
only"},"big-run-fineweb-cross-dedup-fixed":{"x":[0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,29.360128000000003,31.45728,33.554432000000006,35.651584,37.748736,39.845888,41.94304,44.040192000000005,46.137344000000006,48.234496,50.331648,52.4288,54.525952000000004,56.623104000000005,58.720256000000006,60.817408,62.91456,65.011712,67.10886400000001,69.206016,71.303168,73.40032000000001,75.497472,77.59462400000001,79.691776,81.788928,83.88608,85.983232,88.08038400000001,90.177536,92.27468800000001,94.37184,96.468992,98.56614400000001,100.663296,102.76044800000001,104.8576,106.95475200000001,109.05190400000001,111.149056,113.24620800000001,115.34336,117.44051200000001,119.537664,121.634816,123.73196800000001,125.82912,127.92627200000001,130.023424,132.120576,134.21772800000002,136.31488000000002,138.412032,140.509184,142.606336,144.70348800000002,146.80064000000002,148.897792,150.994944,153.092096,155.18924800000002,157.28640000000001,159.383552,161.480704,163.577856,165.67500800000002,167.77216,169.869312,171.966464,174.06361600000002,176.16076800000002,178.25792,180.355072,182.452224,184.54937600000002,186.64652800000002,188.74368,190.840832,192.937984,195.03513600000002,197.13228800000002,199.22944,201.326592,203.423744,205.52089600000002,207.61804800000002,209.7152,211.812352,213.90950400000003,216.00665600000002,218.10380800000001,220.20096,222.298112,224.39526400000003,226.49241600000002,228.589568,230.68672,232.783872,234.88102400000002,236.97817600000002,239.075328,241.17248,243.269632,245.36678400000002,247.46393600000002,249.561088,251.65824,253.75539200000003,255.85254400000002,257.949696,260.046848,262.144,264.241152,266.338304,268.43545600000004,270.53260800000004,272.62976000000003,274.726912,276.824064,278.921216,281.018368,283.11552,285.212672,287.309824,289.40697600000004,291.50412800000004,293.60128000000003,
295.698432,297.795584,299.892736,301.989888,304.08704,306.184192,308.28134400000005,310.37849600000004,312.47564800000004,314.57280000000003,316.669952,318.767104,320.864256,322.961408,325.05856,327.155712,329.25286400000005,331.35001600000004,333.44716800000003,335.54432,337.641472,339.738624,341.835776,343.932928,346.03008,348.12723200000005,350.22438400000004],"y":[0.3308933284133672,0.3551952373236418,0.3736435137689113,0.3814037963747978,0.3948809280991554,0.3996850810945034,0.4089604057371616,0.4100853353738785,0.4119834117591381,0.4168377220630646,0.4186493046581745,0.4169826358556747,0.4234288297593593,0.4229162000119686,0.4273439794778824,0.4290364980697632,0.4291782416403293,0.4296907968819141,0.4311576783657074,0.4326641112565994,0.430318683385849,0.430436260998249,0.4339037239551544,0.4363459683954716,0.4357402548193931,0.4342963136732578,0.4366712383925915,0.4363959729671478,0.436981026083231,0.4447868093848228,0.4411709941923618,0.4406092017889023,0.4424176625907421,0.4423875361680984,0.4422253370285034,0.4410557933151722,0.4447037056088447,0.4454837813973427,0.4435960277915001,0.4468514993786812,0.4479999616742134,0.4428562931716442,0.445764634758234,0.4456562362611294,0.4488007053732872,0.4475954286754131,0.4468922987580299,0.4548408314585686,0.4511027485132217,0.4530330970883369,0.4483681954443455,0.4531726539134979,0.45334542542696,0.4544384703040123,0.4530758671462536,0.4540613554418087,0.4510113634169101,0.4538320265710354,0.4518541917204857,0.4536847211420536,0.4532708041369915,0.4552236869931221,0.455034039914608,0.4562875479459762,0.4532428197562694,0.4574853852391243,0.4517738744616508,0.4579889141023159,0.4538268558681011,0.456730306148529,0.4526018649339676,0.4562746733427048,0.4560015797615051,0.4555426277220249,0.4561501257121563,0.4524396173655987,0.4557023830711841,0.4589769169688225,0.4581078588962555,0.4620813727378845,0.4586601965129375,0.4568093195557594,0.4569808952510357,0.4567535072565079,0.4575250148773193,0.4606908001005649,0.4
603964723646641,0.4622848592698574,0.4594669193029403,0.4640629850327968,0.4604269936680794,0.4634841009974479,0.4644578285515308,0.4642514958977699,0.4666304066777229,0.4616626128554344,0.4588956907391548,0.4620226770639419,0.4628621749579906,0.4595407098531723,0.4635516740381717,0.46005355194211,0.4601523540914058,0.4644204638898372,0.4620639197528362,0.46614545956254,0.4636696502566337,0.4610077403485775,0.4640897810459137,0.4636163525283336,0.4630545899271965,0.466012816876173,0.4650349207222461,0.4613720141351223,0.4644323363900184,0.4647249802947044,0.4656480401754379,0.4651664271950722,0.4622530452907085,0.4655019529163837,0.4650313258171081,0.466718140989542,0.4661559611558914,0.4661237150430679,0.4664223715662956,0.4640601389110088,0.4642657749354839,0.4633881188929081,0.4629989042878151,0.4685831367969513,0.4675870984792709,0.467183344066143,0.4678030684590339,0.4660939238965511,0.4691914953291416,0.4670972637832165,0.468262892216444,0.4672016054391861,0.4676182121038437,0.4698677137494087,0.4658828042447567,0.4701816700398922,0.4684622809290886,0.466015312820673,0.4675401039421558,0.4693200923502445,0.4702670983970165,0.4679145030677318,0.4676233418285846,0.4674933589994907,0.4678357951343059,0.4669915996491909,0.4657857678830623,0.4666901864111423,0.4669371582567692,0.4672787226736545,0.4684535376727581,0.4685697965323925,0.4694835692644119,0.4683254994451999,0.4712230190634727,0.4683987610042095,0.4707653746008873,0.4663059376180172,0.4683133698999882,0.4686385430395603,0.4657671600580215,0.4692615270614624],"label":"FineWeb full MinHash"}},"layout":{"title":{"text":"Dedup across all dumps does not improve performance"}}}
dist/assets/data/plots/all_dumps_bad/arc_acc_norm.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"data":{"big-run-refinedweb":{"x":[0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002,29.360128000000003,31.45728,33.554432000000006,35.651584,37.748736,39.845888,41.94304,44.040192000000005,46.137344000000006,48.234496,50.331648,52.4288,54.525952000000004,56.623104000000005,58.720256000000006,60.817408,62.91456,65.011712,67.10886400000001,69.206016,71.303168,73.40032000000001,75.497472,77.59462400000001,79.691776,81.788928,83.88608,85.983232,88.08038400000001,90.177536,92.27468800000001,94.37184,96.468992,98.56614400000001,100.663296,102.76044800000001,104.8576,106.95475200000001,109.05190400000001,111.149056,113.24620800000001,115.34336,117.44051200000001,119.537664,121.634816,123.73196800000001,125.82912,127.92627200000001,130.023424,132.120576,134.21772800000002,136.31488000000002,138.412032,140.509184,142.606336,144.70348800000002,146.80064000000002,148.897792,150.994944,153.092096,155.18924800000002,157.28640000000001,159.383552,161.480704,163.577856,165.67500800000002,167.77216,169.869312,171.966464,174.06361600000002,176.16076800000002,178.25792,180.355072,182.452224,184.54937600000002,186.64652800000002,188.74368,190.840832,192.937984,195.03513600000002,197.13228800000002,199.22944,201.326592,203.423744,205.52089600000002,207.61804800000002,209.7152,211.812352,213.90950400000003,216.00665600000002,218.10380800000001,220.20096,222.298112,224.39526400000003,226.49241600000002,228.589568,230.68672,232.783872,234.88102400000002,236.97817600000002,239.075328,241.17248,243.269632,245.36678400000002,247.46393600000002,249.561088,251.65824,253.75539200000003,255.85254400000002,257.949696,260.046848,262.144,264.241152,266.338304,268.43545600000004,270.53260800000004,272.62976000000003,274.726912,276.824064,278.921216,281.018368,283.11552,285.212672,287.309824,289.40697600000004,291.50412800000004,293.6012800
0000003,295.698432,297.795584,299.892736,301.989888,304.08704,306.184192,308.28134400000005,310.37849600000004,312.47564800000004,314.57280000000003,316.669952,318.767104,320.864256,322.961408,325.05856,327.155712,329.25286400000005,331.35001600000004,333.44716800000003,335.54432,337.641472,339.738624,341.835776,343.932928,346.03008,348.12723200000005,350.22438400000004],"y":[0.2509999871253967,0.2899999916553497,0.31700000166893,0.3409999907016754,0.3425000011920929,0.3485000133514404,0.3555000126361847,0.3574999868869781,0.3585000038146972,0.363999992609024,0.3619999885559082,0.3675000071525574,0.3865000009536743,0.3810000121593475,0.3740000128746032,0.3810000121593475,0.3810000121593475,0.3860000073909759,0.3810000121593475,0.3894999921321869,0.3849999904632568,0.3855000138282776,0.3989999890327453,0.3980000019073486,0.3995000123977661,0.395000010728836,0.4084999859333038,0.4040000140666961,0.4004999995231628,0.3955000042915344,0.4135000109672546,0.4070000052452087,0.4104999899864197,0.4014999866485595,0.4099999964237213,0.4199999868869781,0.414000004529953,0.402999997138977,0.4214999973773956,0.4095000028610229,0.4059999883174896,0.4090000092983246,0.4074999988079071,0.4120000004768371,0.4154999852180481,0.4189999997615814,0.4149999916553497,0.429500013589859,0.4154999852180481,0.4214999973773956,0.4244999885559082,0.4205000102519989,0.4269999861717224,0.4214999973773956,0.4180000126361847,0.4415000081062317,0.4320000112056732,0.4350000023841858,0.4259999990463257,0.4300000071525574,0.4259999990463257,0.4189999997615814,0.4269999861717224,0.4199999868869781,0.426499992609024,0.4350000023841858,0.4289999902248382,0.4345000088214874,0.4259999990463257,0.426499992609024,0.4395000040531158,0.4395000040531158,0.4359999895095825,0.4280000030994415,0.4370000064373016,0.4329999983310699,0.4309999942779541,0.4490000009536743,0.4399999976158142,0.4339999854564667,0.4399999976158142,0.4345000088214874,0.429500013589859,0.4370000064373016,0.4379999935626983,0.42849999666213
99,0.4309999942779541,0.4350000023841858,0.4399999976158142,0.4314999878406524,0.4300000071525574,0.4410000145435333,0.4345000088214874,0.4410000145435333,0.4345000088214874,0.4339999854564667,0.4460000097751617,0.4410000145435333,0.4469999969005584,0.4480000138282776,0.4435000121593475,0.4375,0.4519999921321869,0.4480000138282776,0.4429999887943268,0.4519999921321869,0.4435000121593475,0.4334999918937683,0.4460000097751617,0.4564999938011169,0.4469999969005584,0.453000009059906,0.4485000073909759,0.4410000145435333,0.4444999992847442,0.4485000073909759,0.457500010728836,0.4469999969005584,0.4535000026226043,0.4535000026226043,0.4485000073909759,0.4490000009536743,0.4505000114440918,0.4595000147819519,0.4544999897480011,0.453000009059906,0.4605000019073486,0.4620000123977661,0.457500010728836,0.453000009059906,0.4550000131130218,0.460999995470047,0.4449999928474426,0.4474999904632568,0.457500010728836,0.4584999978542328,0.4494999945163727,0.4474999904632568,0.4625000059604645,0.4639999866485595,0.4555000066757202,0.4469999969005584,0.4600000083446502,0.453000009059906,0.4629999995231628,0.4589999914169311,0.4614999890327453,0.4555000066757202,0.4560000002384186,0.4580000042915344,0.4584999978542328,0.4560000002384186,0.4605000019073486,0.4595000147819519,0.4639999866485595,0.4614999890327453,0.4564999938011169,0.4634999930858612,0.4625000059604645,0.4614999890327453,0.4679999947547912,0.4584999978542328,0.4595000147819519,0.4505000114440918,0.4544999897480011,0.4595000147819519,0.4620000123977661,0.4670000076293945,0.4555000066757202],"label":"RefinedWeb"},"big-run-sampled_full_filtered_no_dedup":{"x":[0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002,29.360128000000003,31.45728,33.554432000000006,35.651584,37.748736,39.845888,41.94304,44.040192000000005,46.137344000000006,48.234496,50.331648,52.4288,54.525952
000000004,56.623104000000005,58.720256000000006,60.817408,62.91456,65.011712,67.10886400000001,69.206016,71.303168,73.40032000000001,75.497472,77.59462400000001,79.691776,81.788928,83.88608,85.983232,88.08038400000001,90.177536,92.27468800000001,94.37184,96.468992,98.56614400000001,100.663296,102.76044800000001,104.8576,106.95475200000001,109.05190400000001,111.149056,113.24620800000001,115.34336,117.44051200000001,119.537664,121.634816,123.73196800000001,125.82912,127.92627200000001,130.023424,132.120576,134.21772800000002,136.31488000000002,138.412032,140.509184,142.606336,144.70348800000002,146.80064000000002,148.897792,150.994944,153.092096,155.18924800000002,157.28640000000001,159.383552,161.480704,163.577856,165.67500800000002,167.77216,169.869312,171.966464,174.06361600000002,176.16076800000002,178.25792,180.355072,182.452224,184.54937600000002,186.64652800000002,188.74368,190.840832,192.937984,195.03513600000002,197.13228800000002,199.22944,201.326592,203.423744,205.52089600000002,207.61804800000002,209.7152,211.812352,213.90950400000003,216.00665600000002,218.10380800000001,220.20096,222.298112,224.39526400000003,226.49241600000002,228.589568,230.68672,232.783872,234.88102400000002,236.97817600000002,239.075328,241.17248,243.269632,245.36678400000002,247.46393600000002,249.561088,251.65824,253.75539200000003,255.85254400000002,257.949696,260.046848,262.144,264.241152,266.338304,268.43545600000004,270.53260800000004,272.62976000000003,274.726912,276.824064,278.921216,281.018368,283.11552,285.212672,287.309824,289.40697600000004,291.50412800000004,293.60128000000003,295.698432,297.795584,299.892736,301.989888,304.08704,306.184192,308.28134400000005,310.37849600000004,312.47564800000004,314.57280000000003,316.669952,318.767104,320.864256,322.961408,325.05856,327.155712,329.25286400000005,331.35001600000004,333.44716800000003,335.54432,337.641472,339.738624,341.835776,343.932928,346.03008,348.12723200000005,350.22438400000004],"y":[0.2509999871253967,0.28949999
80926513,0.3235000073909759,0.3389999866485595,0.3384999930858612,0.3459999859333038,0.359499990940094,0.3429999947547912,0.3619999885559082,0.3564999997615814,0.3625000119209289,0.363999992609024,0.3680000007152557,0.3680000007152557,0.3785000145435333,0.3684999942779541,0.375,0.3734999895095825,0.3849999904632568,0.3944999873638153,0.3865000009536743,0.395000010728836,0.3935000002384186,0.3980000019073486,0.3910000026226043,0.3885000050067901,0.3914999961853027,0.3815000057220459,0.395000010728836,0.3894999921321869,0.395000010728836,0.3935000002384186,0.4034999907016754,0.4004999995231628,0.3970000147819519,0.3975000083446502,0.3995000123977661,0.3980000019073486,0.4034999907016754,0.3959999978542328,0.3989999890327453,0.402999997138977,0.3880000114440918,0.3980000019073486,0.4040000140666961,0.3989999890327453,0.3970000147819519,0.3925000131130218,0.4120000004768371,0.3935000002384186,0.395000010728836,0.4070000052452087,0.3935000002384186,0.4034999907016754,0.4189999997615814,0.4129999876022339,0.4160000085830688,0.4149999916553497,0.418500006198883,0.4225000143051147,0.4174999892711639,0.4210000038146972,0.4045000076293945,0.4079999923706054,0.4124999940395355,0.4144999980926513,0.4169999957084656,0.4194999933242798,0.4154999852180481,0.4169999957084656,0.4225000143051147,0.4225000143051147,0.4230000078678131,0.4160000085830688,0.4325000047683716,0.4325000047683716,0.4199999868869781,0.4199999868869781,0.4189999997615814,0.4269999861717224,0.4259999990463257,0.4230000078678131,0.4144999980926513,0.4329999983310699,0.4275000095367431,0.4305000007152557,0.4289999902248382,0.4235000014305115,0.4235000014305115,0.4325000047683716,0.4244999885559082,0.4314999878406524,0.4194999933242798,0.4350000023841858,0.4269999861717224,0.4235000014305115,0.4300000071525574,0.4284999966621399,0.4255000054836273,0.4280000030994415,0.4345000088214874,0.4225000143051147,0.4334999918937683,0.4300000071525574,0.4350000023841858,0.429500013589859,0.4325000047683716,0.4384999871253967
,0.4345000088214874,0.4354999959468841,0.4359999895095825,0.4354999959468841,0.4424999952316284,0.4424999952316284,0.4320000112056732,0.4280000030994415,0.4390000104904175,0.4480000138282776,0.4415000081062317,0.4384999871253967,0.4390000104904175,0.4494999945163727,0.4449999928474426,0.4384999871253967,0.4424999952316284,0.4359999895095825,0.445499986410141,0.4399999976158142,0.4375,0.4410000145435333,0.4384999871253967,0.4375,0.4329999983310699,0.4370000064373016,0.4354999959468841,0.4440000057220459,0.4384999871253967,0.4384999871253967,0.4390000104904175,0.4424999952316284,0.4379999935626983,0.4345000088214874,0.4354999959468841,0.4440000057220459,0.4395000040531158,0.4465000033378601,0.4404999911785126,0.4505000114440918,0.4480000138282776,0.4449999928474426,0.445499986410141,0.4410000145435333,0.4485000073909759,0.4460000097751617,0.4480000138282776,0.4465000033378601,0.4460000097751617,0.4460000097751617,0.4395000040531158,0.4474999904632568,0.4469999969005584,0.4404999911785126,0.4440000057220459,0.4435000121593475,0.4435000121593475,0.4514999985694885,0.4474999904632568,0.4474999904632568,0.445499986410141],"label":"FineWeb filtered 
only"},"big-run-fineweb-cross-dedup-fixed":{"x":[0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,29.360128000000003,31.45728,33.554432000000006,35.651584,37.748736,39.845888,41.94304,44.040192000000005,46.137344000000006,48.234496,50.331648,52.4288,54.525952000000004,56.623104000000005,58.720256000000006,60.817408,62.91456,65.011712,67.10886400000001,69.206016,71.303168,73.40032000000001,75.497472,77.59462400000001,79.691776,81.788928,83.88608,85.983232,88.08038400000001,90.177536,92.27468800000001,94.37184,96.468992,98.56614400000001,100.663296,102.76044800000001,104.8576,106.95475200000001,109.05190400000001,111.149056,113.24620800000001,115.34336,117.44051200000001,119.537664,121.634816,123.73196800000001,125.82912,127.92627200000001,130.023424,132.120576,134.21772800000002,136.31488000000002,138.412032,140.509184,142.606336,144.70348800000002,146.80064000000002,148.897792,150.994944,153.092096,155.18924800000002,157.28640000000001,159.383552,161.480704,163.577856,165.67500800000002,167.77216,169.869312,171.966464,174.06361600000002,176.16076800000002,178.25792,180.355072,182.452224,184.54937600000002,186.64652800000002,188.74368,190.840832,192.937984,195.03513600000002,197.13228800000002,199.22944,201.326592,203.423744,205.52089600000002,207.61804800000002,209.7152,211.812352,213.90950400000003,216.00665600000002,218.10380800000001,220.20096,222.298112,224.39526400000003,226.49241600000002,228.589568,230.68672,232.783872,234.88102400000002,236.97817600000002,239.075328,241.17248,243.269632,245.36678400000002,247.46393600000002,249.561088,251.65824,253.75539200000003,255.85254400000002,257.949696,260.046848,262.144,264.241152,266.338304,268.43545600000004,270.53260800000004,272.62976000000003,274.726912,276.824064,278.921216,281.018368,283.11552,285.212672,287.309824,289.40697600000004,291.50412800000004,293.60128000000003,
295.698432,297.795584,299.892736,301.989888,304.08704,306.184192,308.28134400000005,310.37849600000004,312.47564800000004,314.57280000000003,316.669952,318.767104,320.864256,322.961408,325.05856,327.155712,329.25286400000005,331.35001600000004,333.44716800000003,335.54432,337.641472,339.738624,341.835776,343.932928,346.03008,348.12723200000005,350.22438400000004],"y":[0.2509999871253967,0.2904999852180481,0.3289999961853027,0.3379999995231628,0.3400000035762787,0.3535000085830688,0.3700000047683716,0.3619999885559082,0.3695000112056732,0.3625000119209289,0.3745000064373016,0.3804999887943268,0.3835000097751617,0.3810000121593475,0.3785000145435333,0.3799999952316284,0.3885000050067901,0.3919999897480011,0.3899999856948852,0.3939999938011169,0.4004999995231628,0.3889999985694885,0.4000000059604645,0.3930000066757202,0.4025000035762787,0.398499995470047,0.3939999938011169,0.3989999890327453,0.4020000100135803,0.4079999923706054,0.4129999876022339,0.4014999866485595,0.4129999876022339,0.4079999923706054,0.4115000069141388,0.4070000052452087,0.4095000028610229,0.4199999868869781,0.4165000021457672,0.4239999949932098,0.4129999876022339,0.4034999907016754,0.4050000011920929,0.4135000109672546,0.4189999997615814,0.418500006198883,0.4199999868869781,0.4365000128746032,0.4320000112056732,0.4255000054836273,0.4259999990463257,0.4244999885559082,0.4275000095367431,0.4259999990463257,0.4210000038146972,0.421999990940094,0.4099999964237213,0.4305000007152557,0.4239999949932098,0.4194999933242798,0.4205000102519989,0.4255000054836273,0.414000004529953,0.4210000038146972,0.4180000126361847,0.4429999887943268,0.429500013589859,0.4165000021457672,0.4239999949932098,0.4255000054836273,0.4180000126361847,0.4325000047683716,0.4305000007152557,0.4329999983310699,0.4325000047683716,0.4320000112056732,0.4375,0.4410000145435333,0.4395000040531158,0.4379999935626983,0.4280000030994415,0.4365000128746032,0.4205000102519989,0.426499992609024,0.4280000030994415,0.4354999959468841,0.43149998784
06524,0.429500013589859,0.421999990940094,0.4345000088214874,0.429500013589859,0.4354999959468841,0.4314999878406524,0.4404999911785126,0.4384999871253967,0.4359999895095825,0.4345000088214874,0.4320000112056732,0.4345000088214874,0.4375,0.4410000145435333,0.4280000030994415,0.4320000112056732,0.44200000166893,0.4460000097751617,0.4390000104904175,0.4314999878406524,0.4339999854564667,0.4390000104904175,0.4460000097751617,0.4309999942779541,0.4444999992847442,0.44200000166893,0.4404999911785126,0.4395000040531158,0.4370000064373016,0.4519999921321869,0.4429999887943268,0.4395000040531158,0.4415000081062317,0.4384999871253967,0.4494999945163727,0.4469999969005584,0.4375,0.4395000040531158,0.4345000088214874,0.4390000104904175,0.4375,0.4309999942779541,0.4320000112056732,0.4415000081062317,0.4354999959468841,0.445499986410141,0.4404999911785126,0.4429999887943268,0.4395000040531158,0.4354999959468841,0.4429999887943268,0.4410000145435333,0.4494999945163727,0.4429999887943268,0.4460000097751617,0.445499986410141,0.4429999887943268,0.4429999887943268,0.4350000023841858,0.4474999904632568,0.4415000081062317,0.4424999952316284,0.4375,0.4444999992847442,0.4424999952316284,0.4354999959468841,0.445499986410141,0.4379999935626983,0.4449999928474426,0.4365000128746032,0.4474999904632568,0.4440000057220459,0.4465000033378601,0.445499986410141,0.4474999904632568,0.4494999945163727,0.4449999928474426,0.4444999992847442,0.44200000166893,0.4345000088214874,0.4404999911785126],"label":"FineWeb full MinHash"}},"layout":{"title":{"text":"Dedup across all dumps does not improve performance"}}}
dist/assets/data/plots/all_dumps_bad/commonsense_qa_acc_norm.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"data":{"big-run-refinedweb":{"x":[0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002,29.360128000000003,31.45728,33.554432000000006,35.651584,37.748736,39.845888,41.94304,44.040192000000005,46.137344000000006,48.234496,50.331648,52.4288,54.525952000000004,56.623104000000005,58.720256000000006,60.817408,62.91456,65.011712,67.10886400000001,69.206016,71.303168,73.40032000000001,75.497472,77.59462400000001,79.691776,81.788928,83.88608,85.983232,88.08038400000001,90.177536,92.27468800000001,94.37184,96.468992,98.56614400000001,100.663296,102.76044800000001,104.8576,106.95475200000001,109.05190400000001,111.149056,113.24620800000001,115.34336,117.44051200000001,119.537664,121.634816,123.73196800000001,125.82912,127.92627200000001,130.023424,132.120576,134.21772800000002,136.31488000000002,138.412032,140.509184,142.606336,144.70348800000002,146.80064000000002,148.897792,150.994944,153.092096,155.18924800000002,157.28640000000001,159.383552,161.480704,163.577856,165.67500800000002,167.77216,169.869312,171.966464,174.06361600000002,176.16076800000002,178.25792,180.355072,182.452224,184.54937600000002,186.64652800000002,188.74368,190.840832,192.937984,195.03513600000002,197.13228800000002,199.22944,201.326592,203.423744,205.52089600000002,207.61804800000002,209.7152,211.812352,213.90950400000003,216.00665600000002,218.10380800000001,220.20096,222.298112,224.39526400000003,226.49241600000002,228.589568,230.68672,232.783872,234.88102400000002,236.97817600000002,239.075328,241.17248,243.269632,245.36678400000002,247.46393600000002,249.561088,251.65824,253.75539200000003,255.85254400000002,257.949696,260.046848,262.144,264.241152,266.338304,268.43545600000004,270.53260800000004,272.62976000000003,274.726912,276.824064,278.921216,281.018368,283.11552,285.212672,287.309824,289.40697600000004,291.50412800000004,293.6012800
0000003,295.698432,297.795584,299.892736,301.989888,304.08704,306.184192,308.28134400000005,310.37849600000004,312.47564800000004,314.57280000000003,316.669952,318.767104,320.864256,322.961408,325.05856,327.155712,329.25286400000005,331.35001600000004,333.44716800000003,335.54432,337.641472,339.738624,341.835776,343.932928,346.03008,348.12723200000005,350.22438400000004],"y":[0.2329999953508377,0.2529999911785126,0.2800000011920929,0.2870000004768371,0.3179999887943268,0.3129999935626983,0.3210000097751617,0.3160000145435333,0.3210000097751617,0.31700000166893,0.3330000042915344,0.3389999866485595,0.3289999961853027,0.3429999947547912,0.3370000123977661,0.3379999995231628,0.3459999859333038,0.3490000069141388,0.3470000028610229,0.3600000143051147,0.3569999933242798,0.3449999988079071,0.3650000095367431,0.3499999940395355,0.3540000021457672,0.3569999933242798,0.3619999885559082,0.3619999885559082,0.3580000102519989,0.3740000128746032,0.3709999918937683,0.3720000088214874,0.3759999871253967,0.3720000088214874,0.3659999966621399,0.3790000081062317,0.3610000014305115,0.3650000095367431,0.3650000095367431,0.3720000088214874,0.3729999959468841,0.3790000081062317,0.3680000007152557,0.3659999966621399,0.3680000007152557,0.3619999885559082,0.3619999885559082,0.3729999959468841,0.3720000088214874,0.3650000095367431,0.3759999871253967,0.367000013589859,0.3650000095367431,0.3680000007152557,0.3580000102519989,0.3589999973773956,0.3700000047683716,0.3680000007152557,0.367000013589859,0.3709999918937683,0.3880000114440918,0.3810000121593475,0.375,0.4040000140666961,0.3860000073909759,0.3840000033378601,0.3779999911785126,0.3729999959468841,0.3720000088214874,0.3799999952316284,0.3799999952316284,0.3779999911785126,0.3689999878406524,0.3770000040531158,0.3740000128746032,0.3819999992847442,0.3899999856948852,0.3799999952316284,0.3919999897480011,0.3720000088214874,0.3770000040531158,0.3930000066757202,0.3849999904632568,0.3899999856948852,0.3740000128746032,0.3740000128746032,0.37
99999952316284,0.3779999911785126,0.3880000114440918,0.3709999918937683,0.3810000121593475,0.3880000114440918,0.3980000019073486,0.3819999992847442,0.3849999904632568,0.3810000121593475,0.3819999992847442,0.3889999985694885,0.3840000033378601,0.3910000026226043,0.3899999856948852,0.3959999978542328,0.3880000114440918,0.3869999945163727,0.3779999911785126,0.3819999992847442,0.3919999897480011,0.3849999904632568,0.3860000073909759,0.3919999897480011,0.3819999992847442,0.3819999992847442,0.3889999985694885,0.3889999985694885,0.3860000073909759,0.3880000114440918,0.3889999985694885,0.3939999938011169,0.3899999856948852,0.3869999945163727,0.3910000026226043,0.3910000026226043,0.3910000026226043,0.3970000147819519,0.3970000147819519,0.3970000147819519,0.3970000147819519,0.3939999938011169,0.4000000059604645,0.3970000147819519,0.402999997138977,0.3959999978542328,0.3959999978542328,0.4000000059604645,0.4040000140666961,0.4020000100135803,0.3989999890327453,0.3919999897480011,0.3930000066757202,0.3930000066757202,0.3980000019073486,0.4000000059604645,0.395000010728836,0.3899999856948852,0.4059999883174896,0.4020000100135803,0.4020000100135803,0.4059999883174896,0.3970000147819519,0.4110000133514404,0.4050000011920929,0.4000000059604645,0.4090000092983246,0.3989999890327453,0.402999997138977,0.4009999930858612,0.3980000019073486,0.4090000092983246,0.4079999923706054,0.4079999923706054,0.4020000100135803,0.402999997138977,0.402999997138977,0.4059999883174896,0.4040000140666961,0.4059999883174896,0.3989999890327453,0.4070000052452087,0.4059999883174896],"label":"RefinedWeb"},"big-run-fineweb-cross-dedup-fixed":{"x":[0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,29.360128000000003,31.45728,33.554432000000006,35.651584,37.748736,39.845888,41.94304,44.040192000000005,46.137344000000006,48.234496,50.331648,52.4288,54.525952000000004,56.623
104000000005,58.720256000000006,60.817408,62.91456,65.011712,67.10886400000001,69.206016,71.303168,73.40032000000001,75.497472,77.59462400000001,79.691776,81.788928,83.88608,85.983232,88.08038400000001,90.177536,92.27468800000001,94.37184,96.468992,98.56614400000001,100.663296,102.76044800000001,104.8576,106.95475200000001,109.05190400000001,111.149056,113.24620800000001,115.34336,117.44051200000001,119.537664,121.634816,123.73196800000001,125.82912,127.92627200000001,130.023424,132.120576,134.21772800000002,136.31488000000002,138.412032,140.509184,142.606336,144.70348800000002,146.80064000000002,148.897792,150.994944,153.092096,155.18924800000002,157.28640000000001,159.383552,161.480704,163.577856,165.67500800000002,167.77216,169.869312,171.966464,174.06361600000002,176.16076800000002,178.25792,180.355072,182.452224,184.54937600000002,186.64652800000002,188.74368,190.840832,192.937984,195.03513600000002,197.13228800000002,199.22944,201.326592,203.423744,205.52089600000002,207.61804800000002,209.7152,211.812352,213.90950400000003,216.00665600000002,218.10380800000001,220.20096,222.298112,224.39526400000003,226.49241600000002,228.589568,230.68672,232.783872,234.88102400000002,236.97817600000002,239.075328,241.17248,243.269632,245.36678400000002,247.46393600000002,249.561088,251.65824,253.75539200000003,255.85254400000002,257.949696,260.046848,262.144,264.241152,266.338304,268.43545600000004,270.53260800000004,272.62976000000003,274.726912,276.824064,278.921216,281.018368,283.11552,285.212672,287.309824,289.40697600000004,291.50412800000004,293.60128000000003,295.698432,297.795584,299.892736,301.989888,304.08704,306.184192,308.28134400000005,310.37849600000004,312.47564800000004,314.57280000000003,316.669952,318.767104,320.864256,322.961408,325.05856,327.155712,329.25286400000005,331.35001600000004,333.44716800000003,335.54432,337.641472,339.738624,341.835776,343.932928,346.03008,348.12723200000005,350.22438400000004],"y":[0.2329999953508377,0.2540000081062317,0.28700
00004768371,0.2829999923706054,0.3210000097751617,0.3079999983310699,0.3230000138282776,0.3179999887943268,0.3160000145435333,0.3289999961853027,0.3199999928474426,0.324999988079071,0.3310000002384186,0.3260000050067901,0.335999995470047,0.335999995470047,0.3310000002384186,0.335999995470047,0.3339999914169311,0.3459999859333038,0.3330000042915344,0.3449999988079071,0.3429999947547912,0.3479999899864197,0.3420000076293945,0.3479999899864197,0.3459999859333038,0.3339999914169311,0.3350000083446502,0.3519999980926513,0.3440000116825104,0.3490000069141388,0.3379999995231628,0.3420000076293945,0.3610000014305115,0.3409999907016754,0.356000006198883,0.3630000054836273,0.3519999980926513,0.3510000109672546,0.3619999885559082,0.3569999933242798,0.3479999899864197,0.3529999852180481,0.3569999933242798,0.3529999852180481,0.3519999980926513,0.3549999892711639,0.356000006198883,0.3499999940395355,0.3479999899864197,0.3619999885559082,0.3459999859333038,0.3519999980926513,0.3529999852180481,0.3680000007152557,0.3519999980926513,0.3580000102519989,0.3549999892711639,0.3490000069141388,0.3499999940395355,0.3600000143051147,0.3709999918937683,0.3659999966621399,0.3569999933242798,0.3510000109672546,0.3600000143051147,0.367000013589859,0.3529999852180481,0.363999992609024,0.3630000054836273,0.3619999885559082,0.356000006198883,0.367000013589859,0.3600000143051147,0.3540000021457672,0.3589999973773956,0.3610000014305115,0.356000006198883,0.3680000007152557,0.3519999980926513,0.3549999892711639,0.3479999899864197,0.3549999892711639,0.3519999980926513,0.367000013589859,0.3600000143051147,0.3600000143051147,0.3680000007152557,0.356000006198883,0.3610000014305115,0.3689999878406524,0.367000013589859,0.3689999878406524,0.3720000088214874,0.3680000007152557,0.3569999933242798,0.3650000095367431,0.363999992609024,0.3610000014305115,0.3709999918937683,0.3569999933242798,0.3540000021457672,0.3619999885559082,0.3549999892711639,0.3650000095367431,0.3680000007152557,0.3589999973773956,0.356000
006198883,0.3610000014305115,0.3619999885559082,0.3740000128746032,0.3700000047683716,0.3650000095367431,0.3819999992847442,0.3770000040531158,0.3810000121593475,0.3729999959468841,0.3680000007152557,0.3689999878406524,0.3740000128746032,0.3779999911785126,0.3720000088214874,0.3740000128746032,0.367000013589859,0.363999992609024,0.367000013589859,0.3689999878406524,0.3709999918937683,0.3709999918937683,0.375,0.3680000007152557,0.375,0.3630000054836273,0.3720000088214874,0.3819999992847442,0.3729999959468841,0.3689999878406524,0.363999992609024,0.3709999918937683,0.3659999966621399,0.3700000047683716,0.367000013589859,0.3709999918937683,0.3759999871253967,0.3759999871253967,0.3729999959468841,0.3729999959468841,0.3729999959468841,0.3779999911785126,0.375,0.3700000047683716,0.3659999966621399,0.3759999871253967,0.3779999911785126,0.3709999918937683,0.3840000033378601,0.3720000088214874,0.375,0.367000013589859,0.3770000040531158,0.3709999918937683,0.375,0.3709999918937683,0.3740000128746032,0.3740000128746032,0.375,0.3770000040531158],"label":"FineWeb full 
MinHash"},"big-run-sampled_full_filtered_no_dedup":{"x":[0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002,29.360128000000003,31.45728,33.554432000000006,35.651584,37.748736,39.845888,41.94304,44.040192000000005,46.137344000000006,48.234496,50.331648,52.4288,54.525952000000004,56.623104000000005,58.720256000000006,60.817408,62.91456,65.011712,67.10886400000001,69.206016,71.303168,73.40032000000001,75.497472,77.59462400000001,79.691776,81.788928,83.88608,85.983232,88.08038400000001,90.177536,92.27468800000001,94.37184,96.468992,98.56614400000001,100.663296,102.76044800000001,104.8576,106.95475200000001,109.05190400000001,111.149056,113.24620800000001,115.34336,117.44051200000001,119.537664,121.634816,123.73196800000001,125.82912,127.92627200000001,130.023424,132.120576,134.21772800000002,136.31488000000002,138.412032,140.509184,142.606336,144.70348800000002,146.80064000000002,148.897792,150.994944,153.092096,155.18924800000002,157.28640000000001,159.383552,161.480704,163.577856,165.67500800000002,167.77216,169.869312,171.966464,174.06361600000002,176.16076800000002,178.25792,180.355072,182.452224,184.54937600000002,186.64652800000002,188.74368,190.840832,192.937984,195.03513600000002,197.13228800000002,199.22944,201.326592,203.423744,205.52089600000002,207.61804800000002,209.7152,211.812352,213.90950400000003,216.00665600000002,218.10380800000001,220.20096,222.298112,224.39526400000003,226.49241600000002,228.589568,230.68672,232.783872,234.88102400000002,236.97817600000002,239.075328,241.17248,243.269632,245.36678400000002,247.46393600000002,249.561088,251.65824,253.75539200000003,255.85254400000002,257.949696,260.046848,262.144,264.241152,266.338304,268.43545600000004,270.53260800000004,272.62976000000003,274.726912,276.824064,278.921216,281.018368,283.11552,285.212672,287.309824,289.40697600000004,291.5041280
0000004,293.60128000000003,295.698432,297.795584,299.892736,301.989888,304.08704,306.184192,308.28134400000005,310.37849600000004,312.47564800000004,314.57280000000003,316.669952,318.767104,320.864256,322.961408,325.05856,327.155712,329.25286400000005,331.35001600000004,333.44716800000003,335.54432,337.641472,339.738624,341.835776,343.932928,346.03008,348.12723200000005,350.22438400000004],"y":[0.2329999953508377,0.2599999904632568,0.277999997138977,0.2910000085830688,0.3070000112056732,0.3140000104904175,0.3019999861717224,0.3059999942779541,0.3210000097751617,0.3230000138282776,0.324999988079071,0.3149999976158142,0.3109999895095825,0.3339999914169311,0.3289999961853027,0.3319999873638153,0.3319999873638153,0.3300000131130218,0.3370000123977661,0.3219999969005584,0.3370000123977661,0.328000009059906,0.3339999914169311,0.3420000076293945,0.3400000035762787,0.3440000116825104,0.3510000109672546,0.3409999907016754,0.3449999988079071,0.3339999914169311,0.3540000021457672,0.3339999914169311,0.3470000028610229,0.3470000028610229,0.3440000116825104,0.3589999973773956,0.3569999933242798,0.3630000054836273,0.3549999892711639,0.3589999973773956,0.3449999988079071,0.3549999892711639,0.3449999988079071,0.3389999866485595,0.3499999940395355,0.3610000014305115,0.3619999885559082,0.3600000143051147,0.3519999980926513,0.3479999899864197,0.356000006198883,0.3519999980926513,0.3440000116825104,0.3490000069141388,0.3519999980926513,0.3470000028610229,0.3589999973773956,0.3449999988079071,0.3490000069141388,0.356000006198883,0.3619999885559082,0.3569999933242798,0.3659999966621399,0.3610000014305115,0.3549999892711639,0.3700000047683716,0.363999992609024,0.3600000143051147,0.3580000102519989,0.3549999892711639,0.3619999885559082,0.3689999878406524,0.3630000054836273,0.363999992609024,0.3700000047683716,0.367000013589859,0.3630000054836273,0.3630000054836273,0.3700000047683716,0.3589999973773956,0.3540000021457672,0.3540000021457672,0.3659999966621399,0.3619999885559082,0.358999997377
3956,0.3650000095367431,0.3709999918937683,0.3680000007152557,0.3689999878406524,0.3650000095367431,0.3729999959468841,0.3619999885559082,0.3689999878406524,0.3569999933242798,0.3510000109672546,0.3680000007152557,0.363999992609024,0.3700000047683716,0.3659999966621399,0.3659999966621399,0.363999992609024,0.3619999885559082,0.3659999966621399,0.3680000007152557,0.3610000014305115,0.3720000088214874,0.3729999959468841,0.3810000121593475,0.3630000054836273,0.3689999878406524,0.3709999918937683,0.3759999871253967,0.382999986410141,0.3729999959468841,0.3720000088214874,0.3680000007152557,0.3659999966621399,0.3650000095367431,0.363999992609024,0.3589999973773956,0.356000006198883,0.3650000095367431,0.3659999966621399,0.367000013589859,0.3729999959468841,0.3720000088214874,0.375,0.3740000128746032,0.3700000047683716,0.3569999933242798,0.3759999871253967,0.3740000128746032,0.367000013589859,0.3770000040531158,0.3759999871253967,0.3709999918937683,0.3779999911785126,0.3709999918937683,0.3689999878406524,0.3799999952316284,0.3630000054836273,0.375,0.3700000047683716,0.3700000047683716,0.3729999959468841,0.3720000088214874,0.3790000081062317,0.375,0.3729999959468841,0.3770000040531158,0.3799999952316284,0.3779999911785126,0.3720000088214874,0.3799999952316284,0.3759999871253967,0.3799999952316284,0.3790000081062317,0.375,0.3740000128746032,0.3729999959468841,0.3840000033378601,0.3659999966621399,0.3759999871253967,0.3720000088214874,0.3720000088214874,0.3759999871253967,0.375,0.3650000095367431,0.3729999959468841],"label":"FineWeb filtered only"}},"layout":{"title":{"text":"Dedup across all dumps does not improve performance"}}}
dist/assets/data/plots/all_dumps_bad/hellaswag_acc_norm.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"data":{"big-run-refinedweb":{"x":[0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002,29.360128000000003,31.45728,33.554432000000006,35.651584,37.748736,39.845888,41.94304,44.040192000000005,46.137344000000006,48.234496,50.331648,52.4288,54.525952000000004,56.623104000000005,58.720256000000006,60.817408,62.91456,65.011712,67.10886400000001,69.206016,71.303168,73.40032000000001,75.497472,77.59462400000001,79.691776,81.788928,83.88608,85.983232,88.08038400000001,90.177536,92.27468800000001,94.37184,96.468992,98.56614400000001,100.663296,102.76044800000001,104.8576,106.95475200000001,109.05190400000001,111.149056,113.24620800000001,115.34336,117.44051200000001,119.537664,121.634816,123.73196800000001,125.82912,127.92627200000001,130.023424,132.120576,134.21772800000002,136.31488000000002,138.412032,140.509184,142.606336,144.70348800000002,146.80064000000002,148.897792,150.994944,153.092096,155.18924800000002,157.28640000000001,159.383552,161.480704,163.577856,165.67500800000002,167.77216,169.869312,171.966464,174.06361600000002,176.16076800000002,178.25792,180.355072,182.452224,184.54937600000002,186.64652800000002,188.74368,190.840832,192.937984,195.03513600000002,197.13228800000002,199.22944,201.326592,203.423744,205.52089600000002,207.61804800000002,209.7152,211.812352,213.90950400000003,216.00665600000002,218.10380800000001,220.20096,222.298112,224.39526400000003,226.49241600000002,228.589568,230.68672,232.783872,234.88102400000002,236.97817600000002,239.075328,241.17248,243.269632,245.36678400000002,247.46393600000002,249.561088,251.65824,253.75539200000003,255.85254400000002,257.949696,260.046848,262.144,264.241152,266.338304,268.43545600000004,270.53260800000004,272.62976000000003,274.726912,276.824064,278.921216,281.018368,283.11552,285.212672,287.309824,289.40697600000004,291.50412800000004,293.6012800
0000003,295.698432,297.795584,299.892736,301.989888,304.08704,306.184192,308.28134400000005,310.37849600000004,312.47564800000004,314.57280000000003,316.669952,318.767104,320.864256,322.961408,325.05856,327.155712,329.25286400000005,331.35001600000004,333.44716800000003,335.54432,337.641472,339.738624,341.835776,343.932928,346.03008,348.12723200000005,350.22438400000004],"y":[0.257999986410141,0.2759999930858612,0.328000009059906,0.3499999940395355,0.3889999985694885,0.3910000026226043,0.402999997138977,0.4210000038146972,0.4280000030994415,0.4359999895095825,0.4469999969005584,0.4440000057220459,0.4600000083446502,0.4690000116825104,0.4600000083446502,0.4679999947547912,0.4729999899864197,0.4760000109672546,0.4839999973773956,0.4939999878406524,0.488999992609024,0.4990000128746032,0.4979999959468841,0.4979999959468841,0.5009999871253967,0.5,0.5090000033378601,0.5070000290870667,0.5180000066757202,0.5199999809265137,0.5109999775886536,0.5130000114440918,0.5249999761581421,0.5149999856948853,0.5299999713897705,0.5339999794960022,0.5189999938011169,0.5289999842643738,0.5249999761581421,0.5320000052452087,0.5460000038146973,0.5419999957084656,0.5260000228881836,0.5289999842643738,0.546999990940094,0.5419999957084656,0.5419999957084656,0.5460000038146973,0.5419999957084656,0.5389999747276306,0.5440000295639038,0.5569999814033508,0.5450000166893005,0.5329999923706055,0.5580000281333923,0.5339999794960022,0.5540000200271606,0.5460000038146973,0.5479999780654907,0.5529999732971191,0.5540000200271606,0.5619999766349792,0.5490000247955322,0.5410000085830688,0.5490000247955322,0.5569999814033508,0.550000011920929,0.5479999780654907,0.5630000233650208,0.546999990940094,0.5559999942779541,0.5600000023841858,0.5509999990463257,0.5569999814033508,0.5569999814033508,0.5580000281333923,0.5619999766349792,0.5580000281333923,0.5669999718666077,0.5569999814033508,0.5709999799728394,0.5529999732971191,0.5649999976158142,0.5659999847412109,0.5659999847412109,0.5690000057220459,0.5600000
023841858,0.5580000281333923,0.5540000200271606,0.5640000104904175,0.5680000185966492,0.5709999799728394,0.5649999976158142,0.5680000185966492,0.5730000138282776,0.5640000104904175,0.5799999833106995,0.5699999928474426,0.5669999718666077,0.5680000185966492,0.5770000219345093,0.5709999799728394,0.5759999752044678,0.5690000057220459,0.5789999961853027,0.5740000009536743,0.5709999799728394,0.5789999961853027,0.5709999799728394,0.5770000219345093,0.5770000219345093,0.5730000138282776,0.5809999704360962,0.5720000267028809,0.5849999785423279,0.5820000171661377,0.5799999833106995,0.5830000042915344,0.5759999752044678,0.5730000138282776,0.5799999833106995,0.5830000042915344,0.5860000252723694,0.5789999961853027,0.5789999961853027,0.5860000252723694,0.5979999899864197,0.5920000076293945,0.5820000171661377,0.5870000123977661,0.5889999866485596,0.5839999914169312,0.5849999785423279,0.5899999737739563,0.5920000076293945,0.593999981880188,0.597000002861023,0.5889999866485596,0.5889999866485596,0.5849999785423279,0.5899999737739563,0.5989999771118164,0.5899999737739563,0.5839999914169312,0.5910000205039978,0.5910000205039978,0.5929999947547913,0.5920000076293945,0.5929999947547913,0.5889999866485596,0.5899999737739563,0.593999981880188,0.5910000205039978,0.5960000157356262,0.5920000076293945,0.5889999866485596,0.593999981880188,0.5879999995231628,0.5960000157356262,0.5920000076293945,0.5960000157356262,0.5960000157356262,0.5920000076293945,0.6010000109672546,0.5920000076293945,0.5899999737739563,0.5889999866485596,0.5920000076293945,0.6019999980926514],"label":"RefinedWeb"},"big-run-sampled_full_filtered_no_dedup":{"x":[0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002,29.360128000000003,31.45728,33.554432000000006,35.651584,37.748736,39.845888,41.94304,44.040192000000005,46.137344000000006,48.234496,50.331648,52.4288,54.52
5952000000004,56.623104000000005,58.720256000000006,60.817408,62.91456,65.011712,67.10886400000001,69.206016,71.303168,73.40032000000001,75.497472,77.59462400000001,79.691776,81.788928,83.88608,85.983232,88.08038400000001,90.177536,92.27468800000001,94.37184,96.468992,98.56614400000001,100.663296,102.76044800000001,104.8576,106.95475200000001,109.05190400000001,111.149056,113.24620800000001,115.34336,117.44051200000001,119.537664,121.634816,123.73196800000001,125.82912,127.92627200000001,130.023424,132.120576,134.21772800000002,136.31488000000002,138.412032,140.509184,142.606336,144.70348800000002,146.80064000000002,148.897792,150.994944,153.092096,155.18924800000002,157.28640000000001,159.383552,161.480704,163.577856,165.67500800000002,167.77216,169.869312,171.966464,174.06361600000002,176.16076800000002,178.25792,180.355072,182.452224,184.54937600000002,186.64652800000002,188.74368,190.840832,192.937984,195.03513600000002,197.13228800000002,199.22944,201.326592,203.423744,205.52089600000002,207.61804800000002,209.7152,211.812352,213.90950400000003,216.00665600000002,218.10380800000001,220.20096,222.298112,224.39526400000003,226.49241600000002,228.589568,230.68672,232.783872,234.88102400000002,236.97817600000002,239.075328,241.17248,243.269632,245.36678400000002,247.46393600000002,249.561088,251.65824,253.75539200000003,255.85254400000002,257.949696,260.046848,262.144,264.241152,266.338304,268.43545600000004,270.53260800000004,272.62976000000003,274.726912,276.824064,278.921216,281.018368,283.11552,285.212672,287.309824,289.40697600000004,291.50412800000004,293.60128000000003,295.698432,297.795584,299.892736,301.989888,304.08704,306.184192,308.28134400000005,310.37849600000004,312.47564800000004,314.57280000000003,316.669952,318.767104,320.864256,322.961408,325.05856,327.155712,329.25286400000005,331.35001600000004,333.44716800000003,335.54432,337.641472,339.738624,341.835776,343.932928,346.03008,348.12723200000005,350.22438400000004],"y":[0.257999986410141,0.28099
99883174896,0.3230000138282776,0.3409999907016754,0.3600000143051147,0.3569999933242798,0.3889999985694885,0.395000010728836,0.4199999868869781,0.4180000126361847,0.421999990940094,0.4289999902248382,0.4350000023841858,0.4359999895095825,0.4469999969005584,0.4350000023841858,0.4480000138282776,0.4480000138282776,0.453000009059906,0.4550000131130218,0.4589999914169311,0.4639999866485595,0.4600000083446502,0.460999995470047,0.4589999914169311,0.481000006198883,0.4769999980926513,0.4709999859333038,0.4740000069141388,0.4679999947547912,0.4790000021457672,0.4729999899864197,0.4819999933242798,0.4850000143051147,0.4819999933242798,0.4819999933242798,0.4880000054836273,0.4869999885559082,0.4959999918937683,0.4850000143051147,0.4959999918937683,0.492000013589859,0.503000020980835,0.4930000007152557,0.5099999904632568,0.5040000081062317,0.5009999871253967,0.4970000088214874,0.4979999959468841,0.5059999823570251,0.5070000290870667,0.5040000081062317,0.5059999823570251,0.5049999952316284,0.5080000162124634,0.5049999952316284,0.5019999742507935,0.5120000243186951,0.5170000195503235,0.5170000195503235,0.5090000033378601,0.5239999890327454,0.527999997138977,0.5230000019073486,0.5210000276565552,0.5149999856948853,0.5189999938011169,0.5270000100135803,0.5149999856948853,0.5099999904632568,0.5299999713897705,0.5199999809265137,0.5230000019073486,0.5260000228881836,0.5249999761581421,0.5239999890327454,0.5329999923706055,0.5210000276565552,0.5260000228881836,0.5170000195503235,0.531000018119812,0.5289999842643738,0.531000018119812,0.5270000100135803,0.5299999713897705,0.5370000004768372,0.5379999876022339,0.5419999957084656,0.5329999923706055,0.5360000133514404,0.5299999713897705,0.5360000133514404,0.5270000100135803,0.5450000166893005,0.5410000085830688,0.546999990940094,0.5329999923706055,0.5329999923706055,0.5379999876022339,0.5299999713897705,0.5429999828338623,0.5360000133514404,0.5339999794960022,0.5419999957084656,0.5410000085830688,0.5370000004768372,0.5389999747276306,0.52
7999997138977,0.5400000214576721,0.5400000214576721,0.531000018119812,0.5440000295639038,0.5460000038146973,0.5479999780654907,0.5460000038146973,0.5410000085830688,0.5509999990463257,0.5479999780654907,0.5410000085830688,0.5389999747276306,0.550000011920929,0.5569999814033508,0.550000011920929,0.5490000247955322,0.5490000247955322,0.5569999814033508,0.5519999861717224,0.5479999780654907,0.5559999942779541,0.5550000071525574,0.5460000038146973,0.5540000200271606,0.5460000038146973,0.5460000038146973,0.5509999990463257,0.5460000038146973,0.5550000071525574,0.5479999780654907,0.5479999780654907,0.5540000200271606,0.5550000071525574,0.5529999732971191,0.5529999732971191,0.5509999990463257,0.5509999990463257,0.5419999957084656,0.546999990940094,0.5509999990463257,0.5559999942779541,0.5490000247955322,0.5509999990463257,0.5529999732971191,0.550000011920929,0.5540000200271606,0.5550000071525574,0.5580000281333923,0.550000011920929,0.5569999814033508,0.5490000247955322,0.5519999861717224,0.5519999861717224,0.5559999942779541,0.5569999814033508,0.5559999942779541,0.5550000071525574,0.5559999942779541,0.5490000247955322,0.5550000071525574,0.5600000023841858],"label":"FineWeb filtered 
only"},"big-run-fineweb-cross-dedup-fixed":{"x":[0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,29.360128000000003,31.45728,33.554432000000006,35.651584,37.748736,39.845888,41.94304,44.040192000000005,46.137344000000006,48.234496,50.331648,52.4288,54.525952000000004,56.623104000000005,58.720256000000006,60.817408,62.91456,65.011712,67.10886400000001,69.206016,71.303168,73.40032000000001,75.497472,77.59462400000001,79.691776,81.788928,83.88608,85.983232,88.08038400000001,90.177536,92.27468800000001,94.37184,96.468992,98.56614400000001,100.663296,102.76044800000001,104.8576,106.95475200000001,109.05190400000001,111.149056,113.24620800000001,115.34336,117.44051200000001,119.537664,121.634816,123.73196800000001,125.82912,127.92627200000001,130.023424,132.120576,134.21772800000002,136.31488000000002,138.412032,140.509184,142.606336,144.70348800000002,146.80064000000002,148.897792,150.994944,153.092096,155.18924800000002,157.28640000000001,159.383552,161.480704,163.577856,165.67500800000002,167.77216,169.869312,171.966464,174.06361600000002,176.16076800000002,178.25792,180.355072,182.452224,184.54937600000002,186.64652800000002,188.74368,190.840832,192.937984,195.03513600000002,197.13228800000002,199.22944,201.326592,203.423744,205.52089600000002,207.61804800000002,209.7152,211.812352,213.90950400000003,216.00665600000002,218.10380800000001,220.20096,222.298112,224.39526400000003,226.49241600000002,228.589568,230.68672,232.783872,234.88102400000002,236.97817600000002,239.075328,241.17248,243.269632,245.36678400000002,247.46393600000002,249.561088,251.65824,253.75539200000003,255.85254400000002,257.949696,260.046848,262.144,264.241152,266.338304,268.43545600000004,270.53260800000004,272.62976000000003,274.726912,276.824064,278.921216,281.018368,283.11552,285.212672,287.309824,289.40697600000004,291.50412800000004,293.60128000000003,
295.698432,297.795584,299.892736,301.989888,304.08704,306.184192,308.28134400000005,310.37849600000004,312.47564800000004,314.57280000000003,316.669952,318.767104,320.864256,322.961408,325.05856,327.155712,329.25286400000005,331.35001600000004,333.44716800000003,335.54432,337.641472,339.738624,341.835776,343.932928,346.03008,348.12723200000005,350.22438400000004],"y":[0.257999986410141,0.3009999990463257,0.3149999976158142,0.3400000035762787,0.3610000014305115,0.3680000007152557,0.3799999952316284,0.4020000100135803,0.4180000126361847,0.4129999876022339,0.4259999990463257,0.4239999949932098,0.4440000057220459,0.44200000166893,0.4440000057220459,0.4580000042915344,0.4510000050067901,0.4560000002384186,0.4650000035762787,0.4569999873638153,0.460999995470047,0.4659999907016754,0.4679999947547912,0.4779999852180481,0.4740000069141388,0.4600000083446502,0.4860000014305115,0.4790000021457672,0.4880000054836273,0.4930000007152557,0.4860000014305115,0.4850000143051147,0.4900000095367431,0.4850000143051147,0.4900000095367431,0.4959999918937683,0.492000013589859,0.4850000143051147,0.4970000088214874,0.4900000095367431,0.4979999959468841,0.503000020980835,0.5040000081062317,0.4990000128746032,0.4979999959468841,0.5080000162124634,0.5019999742507935,0.4970000088214874,0.4939999878406524,0.5120000243186951,0.5070000290870667,0.503000020980835,0.5070000290870667,0.503000020980835,0.5109999775886536,0.5080000162124634,0.5009999871253967,0.5090000033378601,0.5,0.5149999856948853,0.5109999775886536,0.5099999904632568,0.5130000114440918,0.5080000162124634,0.5080000162124634,0.5109999775886536,0.5099999904632568,0.5239999890327454,0.5180000066757202,0.5130000114440918,0.5120000243186951,0.5180000066757202,0.515999972820282,0.5260000228881836,0.5199999809265137,0.5239999890327454,0.5220000147819519,0.527999997138977,0.5249999761581421,0.5270000100135803,0.5249999761581421,0.5189999938011169,0.5230000019073486,0.5249999761581421,0.5199999809265137,0.5230000019073486,0.5299999713897705,0
.5350000262260437,0.5339999794960022,0.5329999923706055,0.5249999761581421,0.5299999713897705,0.5360000133514404,0.5329999923706055,0.5410000085830688,0.5249999761581421,0.5289999842643738,0.5360000133514404,0.5360000133514404,0.5370000004768372,0.5389999747276306,0.5289999842643738,0.5299999713897705,0.5410000085830688,0.5329999923706055,0.5419999957084656,0.5410000085830688,0.527999997138977,0.5370000004768372,0.5429999828338623,0.5419999957084656,0.5389999747276306,0.5320000052452087,0.5350000262260437,0.5419999957084656,0.5410000085830688,0.5339999794960022,0.5440000295639038,0.5329999923706055,0.5429999828338623,0.5460000038146973,0.5400000214576721,0.5429999828338623,0.5479999780654907,0.550000011920929,0.5490000247955322,0.5410000085830688,0.5450000166893005,0.5429999828338623,0.550000011920929,0.5529999732971191,0.5490000247955322,0.5450000166893005,0.5450000166893005,0.5519999861717224,0.5569999814033508,0.5460000038146973,0.546999990940094,0.5509999990463257,0.5509999990463257,0.5450000166893005,0.5440000295639038,0.5440000295639038,0.546999990940094,0.5479999780654907,0.546999990940094,0.5460000038146973,0.546999990940094,0.5479999780654907,0.5460000038146973,0.5460000038146973,0.5440000295639038,0.5410000085830688,0.5440000295639038,0.5389999747276306,0.5410000085830688,0.546999990940094,0.546999990940094,0.5479999780654907,0.546999990940094,0.550000011920929,0.546999990940094,0.5460000038146973,0.546999990940094,0.5479999780654907,0.5479999780654907,0.5519999861717224,0.550000011920929],"label":"FineWeb full MinHash"}},"layout":{"title":{"text":"Dedup across all dumps does not improve performance"}}}
dist/assets/data/plots/all_dumps_bad/index.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"files":{"agg_score":{"file":"agg_score.json"},"commonsense_qa/acc_norm":{"file":"commonsense_qa_acc_norm.json"},"hellaswag/acc_norm":{"file":"hellaswag_acc_norm.json"},"openbookqa/acc_norm":{"file":"openbookqa_acc_norm.json"},"piqa/acc_norm":{"file":"piqa_acc_norm.json"},"siqa/acc_norm":{"file":"siqa_acc_norm.json"},"winogrande/acc_norm":{"file":"winogrande_acc_norm.json"},"arc/acc_norm":{"file":"arc_acc_norm.json"},"mmlu/acc_norm":{"file":"mmlu_acc_norm.json"}},"settings":{"defaultMetric":"agg_score","slider":{"min":0,"max":30,"default":5}}}
dist/assets/data/plots/all_dumps_bad/mmlu_acc_norm.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"data":{"big-run-refinedweb":{"x":[0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002,29.360128000000003,31.45728,33.554432000000006,35.651584,37.748736,39.845888,41.94304,44.040192000000005,46.137344000000006,48.234496,50.331648,52.4288,54.525952000000004,56.623104000000005,58.720256000000006,60.817408,62.91456,65.011712,67.10886400000001,69.206016,71.303168,73.40032000000001,75.497472,77.59462400000001,79.691776,81.788928,83.88608,85.983232,88.08038400000001,90.177536,92.27468800000001,94.37184,96.468992,98.56614400000001,100.663296,102.76044800000001,104.8576,106.95475200000001,109.05190400000001,111.149056,113.24620800000001,115.34336,117.44051200000001,119.537664,121.634816,123.73196800000001,125.82912,127.92627200000001,130.023424,132.120576,134.21772800000002,136.31488000000002,138.412032,140.509184,142.606336,144.70348800000002,146.80064000000002,148.897792,150.994944,153.092096,155.18924800000002,157.28640000000001,159.383552,161.480704,163.577856,165.67500800000002,167.77216,169.869312,171.966464,174.06361600000002,176.16076800000002,178.25792,180.355072,182.452224,184.54937600000002,186.64652800000002,188.74368,190.840832,192.937984,195.03513600000002,197.13228800000002,199.22944,201.326592,203.423744,205.52089600000002,207.61804800000002,209.7152,211.812352,213.90950400000003,216.00665600000002,218.10380800000001,220.20096,222.298112,224.39526400000003,226.49241600000002,228.589568,230.68672,232.783872,234.88102400000002,236.97817600000002,239.075328,241.17248,243.269632,245.36678400000002,247.46393600000002,249.561088,251.65824,253.75539200000003,255.85254400000002,257.949696,260.046848,262.144,264.241152,266.338304,268.43545600000004,270.53260800000004,272.62976000000003,274.726912,276.824064,278.921216,281.018368,283.11552,285.212672,287.309824,289.40697600000004,291.50412800000004,293.6012800
0000003,295.698432,297.795584,299.892736,301.989888,304.08704,306.184192,308.28134400000005,310.37849600000004,312.47564800000004,314.57280000000003,316.669952,318.767104,320.864256,322.961408,325.05856,327.155712,329.25286400000005,331.35001600000004,333.44716800000003,335.54432,337.641472,339.738624,341.835776,343.932928,346.03008,348.12723200000005,350.22438400000004],"y":[0.2501466572284698,0.2528519630432129,0.2616856694221496,0.2665999829769134,0.2683407664299011,0.2742894291877746,0.2762066125869751,0.2807516455650329,0.2767378389835357,0.2807380557060241,0.2788906991481781,0.2844051718711853,0.2856102883815765,0.2883394360542297,0.2875711619853973,0.2890409529209137,0.2894668281078338,0.2883355319499969,0.2872501015663147,0.291619062423706,0.2900333702564239,0.2962473034858703,0.2962896525859833,0.297355443239212,0.2932226359844208,0.2886744439601898,0.29665008187294,0.2976542115211487,0.2991503179073334,0.3004479110240936,0.3044549524784088,0.2976194322109222,0.3014707863330841,0.3048252463340759,0.3039425611495971,0.303354948759079,0.3027459383010864,0.2999922931194305,0.3050121665000915,0.2998814284801483,0.2978588044643402,0.3041949570178985,0.3010904192924499,0.3022017180919647,0.2997751235961914,0.3015910983085632,0.3096485137939453,0.3012076020240783,0.3065535724163055,0.3042872548103332,0.3104783594608307,0.2997980415821075,0.3051296770572662,0.303458571434021,0.3088337182998657,0.3145398199558258,0.3032208085060119,0.310806930065155,0.3075874149799347,0.3101692199707031,0.310107946395874,0.3066047430038452,0.3109066784381866,0.3081336915493011,0.3084586262702942,0.3086149394512176,0.3085348606109619,0.3136637806892395,0.3110873103141784,0.31076380610466,0.3084572553634643,0.3133681714534759,0.3125792145729065,0.3124453127384186,0.3097185790538788,0.3106793165206909,0.3089564740657806,0.3111244142055511,0.3123694658279419,0.3144859969615936,0.3135123550891876,0.311982125043869,0.3142133951187134,0.3122903704643249,0.3147654831409454,0.307876735925674
4,0.314947634935379,0.3171303570270538,0.3129573762416839,0.3154936134815216,0.3158208429813385,0.3153132200241089,0.3141326904296875,0.3163397014141083,0.3166318237781524,0.3168410360813141,0.3198235332965851,0.3201336860656738,0.3212967813014984,0.3191385567188263,0.3178017139434814,0.3192791938781738,0.323061466217041,0.320336639881134,0.3165886104106903,0.3206393420696258,0.3167395293712616,0.3135207295417785,0.315539002418518,0.3191742599010467,0.321073055267334,0.3222262561321258,0.3193058371543884,0.3213480710983276,0.3198905289173126,0.3219239711761474,0.3211614489555359,0.318855881690979,0.3177095353603363,0.324197381734848,0.3208906352519989,0.3264936804771423,0.3245965242385864,0.3231639564037323,0.3221887946128845,0.3277338445186615,0.3227696120738983,0.3263820111751556,0.3258577883243561,0.3264622390270233,0.3222362995147705,0.3286814987659454,0.3235024213790893,0.32446950674057,0.3311836123466491,0.328130304813385,0.3271634578704834,0.3250012993812561,0.3309800624847412,0.3274554014205932,0.3273015916347503,0.3261759579181671,0.32697594165802,0.3303172886371612,0.3282814025878906,0.3289586305618286,0.3260826468467712,0.3258011937141418,0.3297208249568939,0.3254813551902771,0.3287739753723144,0.3287097811698913,0.3275279700756073,0.3293041586875915,0.3314100801944732,0.3287808299064636,0.3251930773258209,0.3288172781467438,0.3265027701854706,0.3275215625762939,0.3290774822235107,0.3261331617832184,0.3299777805805206,0.331955999135971,0.3305029273033142,0.3274719417095184,0.3235560953617096,0.3269940316677093,0.3323083519935608],"label":"RefinedWeb"},"big-run-fineweb-cross-dedup-fixed":{"x":[0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,29.360128000000003,31.45728,33.554432000000006,35.651584,37.748736,39.845888,41.94304,44.040192000000005,46.137344000000006,48.234496,50.331648,52.4288,54.525952000000004,56.62310
4000000005,58.720256000000006,60.817408,62.91456,65.011712,67.10886400000001,69.206016,71.303168,73.40032000000001,75.497472,77.59462400000001,79.691776,81.788928,83.88608,85.983232,88.08038400000001,90.177536,92.27468800000001,94.37184,96.468992,98.56614400000001,100.663296,102.76044800000001,104.8576,106.95475200000001,109.05190400000001,111.149056,113.24620800000001,115.34336,117.44051200000001,119.537664,121.634816,123.73196800000001,125.82912,127.92627200000001,130.023424,132.120576,134.21772800000002,136.31488000000002,138.412032,140.509184,142.606336,144.70348800000002,146.80064000000002,148.897792,150.994944,153.092096,155.18924800000002,157.28640000000001,159.383552,161.480704,163.577856,165.67500800000002,167.77216,169.869312,171.966464,174.06361600000002,176.16076800000002,178.25792,180.355072,182.452224,184.54937600000002,186.64652800000002,188.74368,190.840832,192.937984,195.03513600000002,197.13228800000002,199.22944,201.326592,203.423744,205.52089600000002,207.61804800000002,209.7152,211.812352,213.90950400000003,216.00665600000002,218.10380800000001,220.20096,222.298112,224.39526400000003,226.49241600000002,228.589568,230.68672,232.783872,234.88102400000002,236.97817600000002,239.075328,241.17248,243.269632,245.36678400000002,247.46393600000002,249.561088,251.65824,253.75539200000003,255.85254400000002,257.949696,260.046848,262.144,264.241152,266.338304,268.43545600000004,270.53260800000004,272.62976000000003,274.726912,276.824064,278.921216,281.018368,283.11552,285.212672,287.309824,289.40697600000004,291.50412800000004,293.60128000000003,295.698432,297.795584,299.892736,301.989888,304.08704,306.184192,308.28134400000005,310.37849600000004,312.47564800000004,314.57280000000003,316.669952,318.767104,320.864256,322.961408,325.05856,327.155712,329.25286400000005,331.35001600000004,333.44716800000003,335.54432,337.641472,339.738624,341.835776,343.932928,346.03008,348.12723200000005,350.22438400000004],"y":[0.2501466572284698,0.2510619163513183,0.2621481
418609619,0.2632303833961487,0.2720474302768707,0.2719806432723999,0.2726832032203674,0.2786827087402344,0.2823672890663147,0.276201844215393,0.2816944718360901,0.280361145734787,0.2819306254386902,0.2823295891284942,0.2892518043518066,0.2872919738292694,0.2859259247779846,0.2885263860225677,0.2862614393234253,0.2933129370212555,0.2930494546890259,0.2884900867938995,0.2942298054695129,0.2927677929401397,0.2954220175743103,0.2918704748153686,0.2943699061870575,0.2891678512096405,0.291848212480545,0.2942944765090942,0.2973679602146148,0.2953736186027527,0.2963412702083587,0.297100305557251,0.2963026762008667,0.2944463491439819,0.2971296310424804,0.293870210647583,0.2982682287693023,0.2978119254112243,0.2989997565746307,0.2993503510951996,0.298117071390152,0.2977498769760132,0.3004056811332702,0.3012634217739105,0.3001384139060974,0.3052266240119934,0.3038219809532165,0.3037647306919098,0.3009455502033233,0.3038812279701233,0.303263396024704,0.3025077581405639,0.3056069612503052,0.3024908602237701,0.3050909340381622,0.3001562356948852,0.303833544254303,0.3019777834415436,0.3036664128303528,0.3022894859313965,0.3042722940444946,0.3023003339767456,0.3069425821304321,0.307883083820343,0.3026910126209259,0.3054113090038299,0.3046148121356964,0.305342435836792,0.3048149049282074,0.3066973984241485,0.3055126965045929,0.3063409924507141,0.307701051235199,0.3075169324874878,0.3091190159320831,0.3098153173923492,0.31436288356781,0.3096509575843811,0.3022815883159637,0.3119745552539825,0.3083471357822418,0.3085280954837799,0.3082001209259033,0.3080264329910278,0.3116717934608459,0.3097788393497467,0.3117353916168213,0.3170038759708404,0.3099159002304077,0.3133728504180908,0.3161626160144806,0.3095119595527649,0.3135432302951813,0.3103009164333343,0.3126655519008636,0.3121814131736755,0.3123973608016968,0.3148256838321686,0.3144133985042572,0.3124284744262695,0.3102188408374786,0.3123636841773987,0.3115113973617553,0.3151636719703674,0.3148572146892547,0.315061867237091,0.3127182
424068451,0.3139308094978332,0.3134367167949676,0.3136025071144104,0.3172793388366699,0.3134761154651642,0.3109587132930755,0.3127998411655426,0.3161843717098236,0.3163313865661621,0.3145243525505066,0.3155156075954437,0.3127505779266357,0.3182451128959656,0.3162476718425751,0.3124897480010986,0.3128789663314819,0.3119811117649078,0.314126193523407,0.3136049509048462,0.3149912655353546,0.3146650791168213,0.3151968121528625,0.3179666996002197,0.3169245719909668,0.3202513754367828,0.3185319602489471,0.3202781081199646,0.3186031281948089,0.3166128396987915,0.3199457228183746,0.3194417059421539,0.3170624077320099,0.3184532523155212,0.3191981911659241,0.3191225528717041,0.3173209130764007,0.3195607960224151,0.3166368305683136,0.3188160359859466,0.3174867630004883,0.3184468746185303,0.3211863338947296,0.3184327483177185,0.3177861273288727,0.3180214762687683,0.3194973170757293,0.3212297558784485,0.3211282789707184,0.3200584352016449,0.3168685734272003,0.3211040198802948,0.3222841620445251,0.3196901082992553,0.3236229419708252,0.3204475045204162,0.3210069537162781,0.3191083669662475,0.31863734126091,0.3195922076702118],"label":"FineWeb full 
MinHash"},"big-run-sampled_full_filtered_no_dedup":{"x":[0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002,29.360128000000003,31.45728,33.554432000000006,35.651584,37.748736,39.845888,41.94304,44.040192000000005,46.137344000000006,48.234496,50.331648,52.4288,54.525952000000004,56.623104000000005,58.720256000000006,60.817408,62.91456,65.011712,67.10886400000001,69.206016,71.303168,73.40032000000001,75.497472,77.59462400000001,79.691776,81.788928,83.88608,85.983232,88.08038400000001,90.177536,92.27468800000001,94.37184,96.468992,98.56614400000001,100.663296,102.76044800000001,104.8576,106.95475200000001,109.05190400000001,111.149056,113.24620800000001,115.34336,117.44051200000001,119.537664,121.634816,123.73196800000001,125.82912,127.92627200000001,130.023424,132.120576,134.21772800000002,136.31488000000002,138.412032,140.509184,142.606336,144.70348800000002,146.80064000000002,148.897792,150.994944,153.092096,155.18924800000002,157.28640000000001,159.383552,161.480704,163.577856,165.67500800000002,167.77216,169.869312,171.966464,174.06361600000002,176.16076800000002,178.25792,180.355072,182.452224,184.54937600000002,186.64652800000002,188.74368,190.840832,192.937984,195.03513600000002,197.13228800000002,199.22944,201.326592,203.423744,205.52089600000002,207.61804800000002,209.7152,211.812352,213.90950400000003,216.00665600000002,218.10380800000001,220.20096,222.298112,224.39526400000003,226.49241600000002,228.589568,230.68672,232.783872,234.88102400000002,236.97817600000002,239.075328,241.17248,243.269632,245.36678400000002,247.46393600000002,249.561088,251.65824,253.75539200000003,255.85254400000002,257.949696,260.046848,262.144,264.241152,266.338304,268.43545600000004,270.53260800000004,272.62976000000003,274.726912,276.824064,278.921216,281.018368,283.11552,285.212672,287.309824,289.40697600000004,291.5041280
0000004,293.60128000000003,295.698432,297.795584,299.892736,301.989888,304.08704,306.184192,308.28134400000005,310.37849600000004,312.47564800000004,314.57280000000003,316.669952,318.767104,320.864256,322.961408,325.05856,327.155712,329.25286400000005,331.35001600000004,333.44716800000003,335.54432,337.641472,339.738624,341.835776,343.932928,346.03008,348.12723200000005,350.22438400000004],"y":[0.2501466572284698,0.2516599297523498,0.2610189318656921,0.2666046619415283,0.2667981088161468,0.2667821645736694,0.2708088159561157,0.2738403379917145,0.2726235687732696,0.2762763500213623,0.2768311202526092,0.2809228301048279,0.2836140990257263,0.2822815179824829,0.2831664383411407,0.2797218561172485,0.286342591047287,0.2855269610881805,0.2847287058830261,0.2888180613517761,0.286526083946228,0.2865165770053863,0.294582188129425,0.2925947606563568,0.2947863042354584,0.2892930805683136,0.2903610467910766,0.288201242685318,0.2873396277427673,0.2916238009929657,0.2908017039299011,0.2907920777797699,0.2952797412872314,0.2941452264785766,0.2921333611011505,0.2925891280174255,0.2968584895133972,0.2980035543441772,0.2964116632938385,0.2962304651737213,0.2950254380702972,0.2977516651153564,0.2944138348102569,0.3003402054309845,0.2976303696632385,0.3013098239898681,0.302829384803772,0.3018766045570373,0.305361807346344,0.2971298694610595,0.3014816343784332,0.3019805550575256,0.3037064969539642,0.2970167994499206,0.2995208501815796,0.2970106601715088,0.2990955114364624,0.3027818500995636,0.3048534691333771,0.2993872463703155,0.2986327707767486,0.3015393316745758,0.3003426790237427,0.3003274798393249,0.3017795085906982,0.3019182682037353,0.3015450537204742,0.3046211004257202,0.3031167984008789,0.3020436763763428,0.3011128306388855,0.3029948472976684,0.3045558631420135,0.301642894744873,0.3029441833496094,0.3035804331302643,0.3004390001296997,0.3021787703037262,0.306041270494461,0.3064048886299133,0.3087956011295318,0.3070018291473388,0.3065581619739532,0.3093871772289276,0.306093007326
1261,0.3033313155174255,0.3072777390480041,0.306413859128952,0.3104493916034698,0.3056999444961548,0.3077532052993774,0.309231549501419,0.3070645034313202,0.3117790520191192,0.3114112913608551,0.312661737203598,0.3181777000427246,0.3117201030254364,0.3099702894687652,0.3074746131896972,0.3064963519573211,0.3105958700180053,0.3111456036567688,0.3084964454174042,0.3087405860424042,0.3121673166751861,0.3121528625488281,0.3100416660308838,0.3142979145050049,0.3129935264587402,0.3112611472606659,0.3119436800479889,0.3154115974903106,0.3091593086719513,0.3103814721107483,0.3130497634410858,0.3133455514907837,0.3152708411216736,0.3137963414192199,0.3099324703216553,0.3164172768592834,0.3133907914161682,0.3128255009651184,0.3134104907512665,0.3106969892978668,0.3130004107952118,0.3131391704082489,0.3130116462707519,0.3143952488899231,0.3143975436687469,0.3143710494041443,0.3163396418094635,0.3166862726211548,0.3184126019477844,0.3178988993167877,0.317479133605957,0.3184944093227386,0.316694974899292,0.3176258206367492,0.3182629346847534,0.3200214207172394,0.3181648552417755,0.320680022239685,0.3178716897964477,0.3182425796985626,0.3182984292507171,0.3158398568630218,0.3152642548084259,0.3132680356502533,0.3178914785385132,0.3156660795211792,0.3161703050136566,0.3176451921463012,0.3173815906047821,0.3194171786308288,0.3193057179450989,0.3172560334205627,0.317656546831131,0.3155770003795624,0.3199106156826019,0.3170182108879089,0.3156754970550537,0.3180731236934662,0.3205638229846954,0.3175432682037353,0.3184471428394317,0.3192788958549499,0.3197042346000671,0.3177168369293213],"label":"FineWeb filtered only"}},"layout":{"title":{"text":"Dedup across all dumps does not improve performance"}}}
dist/assets/data/plots/all_dumps_bad/openbookqa_acc_norm.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"data":{"big-run-refinedweb":{"x":[0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002,29.360128000000003,31.45728,33.554432000000006,35.651584,37.748736,39.845888,41.94304,44.040192000000005,46.137344000000006,48.234496,50.331648,52.4288,54.525952000000004,56.623104000000005,58.720256000000006,60.817408,62.91456,65.011712,67.10886400000001,69.206016,71.303168,73.40032000000001,75.497472,77.59462400000001,79.691776,81.788928,83.88608,85.983232,88.08038400000001,90.177536,92.27468800000001,94.37184,96.468992,98.56614400000001,100.663296,102.76044800000001,104.8576,106.95475200000001,109.05190400000001,111.149056,113.24620800000001,115.34336,117.44051200000001,119.537664,121.634816,123.73196800000001,125.82912,127.92627200000001,130.023424,132.120576,134.21772800000002,136.31488000000002,138.412032,140.509184,142.606336,144.70348800000002,146.80064000000002,148.897792,150.994944,153.092096,155.18924800000002,157.28640000000001,159.383552,161.480704,163.577856,165.67500800000002,167.77216,169.869312,171.966464,174.06361600000002,176.16076800000002,178.25792,180.355072,182.452224,184.54937600000002,186.64652800000002,188.74368,190.840832,192.937984,195.03513600000002,197.13228800000002,199.22944,201.326592,203.423744,205.52089600000002,207.61804800000002,209.7152,211.812352,213.90950400000003,216.00665600000002,218.10380800000001,220.20096,222.298112,224.39526400000003,226.49241600000002,228.589568,230.68672,232.783872,234.88102400000002,236.97817600000002,239.075328,241.17248,243.269632,245.36678400000002,247.46393600000002,249.561088,251.65824,253.75539200000003,255.85254400000002,257.949696,260.046848,262.144,264.241152,266.338304,268.43545600000004,270.53260800000004,272.62976000000003,274.726912,276.824064,278.921216,281.018368,283.11552,285.212672,287.309824,289.40697600000004,291.50412800000004,293.6012800
0000003,295.698432,297.795584,299.892736,301.989888,304.08704,306.184192,308.28134400000005,310.37849600000004,312.47564800000004,314.57280000000003,316.669952,318.767104,320.864256,322.961408,325.05856,327.155712,329.25286400000005,331.35001600000004,333.44716800000003,335.54432,337.641472,339.738624,341.835776,343.932928,346.03008,348.12723200000005,350.22438400000004],"y":[0.2860000133514404,0.2560000121593475,0.2840000092983246,0.3059999942779541,0.3059999942779541,0.2980000078678131,0.3240000009536743,0.3100000023841858,0.3000000119209289,0.3160000145435333,0.3140000104904175,0.3260000050067901,0.3199999928474426,0.2980000078678131,0.3179999887943268,0.3179999887943268,0.3319999873638153,0.3019999861717224,0.2939999997615814,0.3319999873638153,0.3319999873638153,0.3219999969005584,0.3379999995231628,0.3379999995231628,0.3339999914169311,0.3240000009536743,0.3479999899864197,0.3300000131130218,0.3240000009536743,0.3300000131130218,0.3400000035762787,0.3459999859333038,0.3319999873638153,0.3379999995231628,0.356000006198883,0.3339999914169311,0.3459999859333038,0.3440000116825104,0.3519999980926513,0.3479999899864197,0.3339999914169311,0.3400000035762787,0.3479999899864197,0.3379999995231628,0.3479999899864197,0.3499999940395355,0.3400000035762787,0.3499999940395355,0.3420000076293945,0.3659999966621399,0.3400000035762787,0.3459999859333038,0.3499999940395355,0.356000006198883,0.3400000035762787,0.356000006198883,0.3339999914169311,0.3339999914169311,0.3479999899864197,0.3420000076293945,0.3580000102519989,0.3339999914169311,0.3440000116825104,0.3400000035762787,0.3499999940395355,0.3540000021457672,0.3479999899864197,0.3499999940395355,0.3420000076293945,0.3379999995231628,0.335999995470047,0.356000006198883,0.3459999859333038,0.3499999940395355,0.3400000035762787,0.3440000116825104,0.356000006198883,0.3519999980926513,0.3400000035762787,0.3440000116825104,0.356000006198883,0.3400000035762787,0.356000006198883,0.3600000143051147,0.3540000021457672,0.347999989986
4197,0.3379999995231628,0.3440000116825104,0.3300000131130218,0.3400000035762787,0.3459999859333038,0.3339999914169311,0.3499999940395355,0.3600000143051147,0.3440000116825104,0.3499999940395355,0.356000006198883,0.3420000076293945,0.3479999899864197,0.3379999995231628,0.3379999995231628,0.3459999859333038,0.356000006198883,0.328000009059906,0.3459999859333038,0.3519999980926513,0.3499999940395355,0.3519999980926513,0.3420000076293945,0.3499999940395355,0.3420000076293945,0.3339999914169311,0.335999995470047,0.3379999995231628,0.3379999995231628,0.3540000021457672,0.356000006198883,0.356000006198883,0.335999995470047,0.363999992609024,0.363999992609024,0.3499999940395355,0.356000006198883,0.3519999980926513,0.3519999980926513,0.3540000021457672,0.3459999859333038,0.3479999899864197,0.3519999980926513,0.3519999980926513,0.3420000076293945,0.3440000116825104,0.3379999995231628,0.3519999980926513,0.356000006198883,0.3420000076293945,0.3580000102519989,0.3499999940395355,0.3619999885559082,0.3519999980926513,0.3600000143051147,0.3459999859333038,0.3519999980926513,0.3519999980926513,0.3499999940395355,0.3580000102519989,0.356000006198883,0.3580000102519989,0.3600000143051147,0.3440000116825104,0.3600000143051147,0.3440000116825104,0.3479999899864197,0.3479999899864197,0.3580000102519989,0.3600000143051147,0.3580000102519989,0.3540000021457672,0.3519999980926513,0.3459999859333038,0.3459999859333038,0.3540000021457672,0.335999995470047,0.3540000021457672,0.3540000021457672,0.3519999980926513,0.356000006198883,0.3499999940395355,0.356000006198883],"label":"RefinedWeb"},"big-run-sampled_full_filtered_no_dedup":{"x":[0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002,29.360128000000003,31.45728,33.554432000000006,35.651584,37.748736,39.845888,41.94304,44.040192000000005,46.137344000000006,48.234496,50.331648,52.4288,54
.525952000000004,56.623104000000005,58.720256000000006,60.817408,62.91456,65.011712,67.10886400000001,69.206016,71.303168,73.40032000000001,75.497472,77.59462400000001,79.691776,81.788928,83.88608,85.983232,88.08038400000001,90.177536,92.27468800000001,94.37184,96.468992,98.56614400000001,100.663296,102.76044800000001,104.8576,106.95475200000001,109.05190400000001,111.149056,113.24620800000001,115.34336,117.44051200000001,119.537664,121.634816,123.73196800000001,125.82912,127.92627200000001,130.023424,132.120576,134.21772800000002,136.31488000000002,138.412032,140.509184,142.606336,144.70348800000002,146.80064000000002,148.897792,150.994944,153.092096,155.18924800000002,157.28640000000001,159.383552,161.480704,163.577856,165.67500800000002,167.77216,169.869312,171.966464,174.06361600000002,176.16076800000002,178.25792,180.355072,182.452224,184.54937600000002,186.64652800000002,188.74368,190.840832,192.937984,195.03513600000002,197.13228800000002,199.22944,201.326592,203.423744,205.52089600000002,207.61804800000002,209.7152,211.812352,213.90950400000003,216.00665600000002,218.10380800000001,220.20096,222.298112,224.39526400000003,226.49241600000002,228.589568,230.68672,232.783872,234.88102400000002,236.97817600000002,239.075328,241.17248,243.269632,245.36678400000002,247.46393600000002,249.561088,251.65824,253.75539200000003,255.85254400000002,257.949696,260.046848,262.144,264.241152,266.338304,268.43545600000004,270.53260800000004,272.62976000000003,274.726912,276.824064,278.921216,281.018368,283.11552,285.212672,287.309824,289.40697600000004,291.50412800000004,293.60128000000003,295.698432,297.795584,299.892736,301.989888,304.08704,306.184192,308.28134400000005,310.37849600000004,312.47564800000004,314.57280000000003,316.669952,318.767104,320.864256,322.961408,325.05856,327.155712,329.25286400000005,331.35001600000004,333.44716800000003,335.54432,337.641472,339.738624,341.835776,343.932928,346.03008,348.12723200000005,350.22438400000004],"y":[0.2860000133514404,0.2
560000121593475,0.2720000147819519,0.2980000078678131,0.2840000092983246,0.2879999876022339,0.3039999902248382,0.2860000133514404,0.2899999916553497,0.3019999861717224,0.2960000038146972,0.3039999902248382,0.3100000023841858,0.3160000145435333,0.3260000050067901,0.3160000145435333,0.3260000050067901,0.3179999887943268,0.3420000076293945,0.3219999969005584,0.328000009059906,0.3240000009536743,0.3300000131130218,0.328000009059906,0.3199999928474426,0.3379999995231628,0.3400000035762787,0.3240000009536743,0.3120000064373016,0.3319999873638153,0.3260000050067901,0.3120000064373016,0.3160000145435333,0.3140000104904175,0.3179999887943268,0.3160000145435333,0.3199999928474426,0.3240000009536743,0.3260000050067901,0.3179999887943268,0.3300000131130218,0.3179999887943268,0.328000009059906,0.3240000009536743,0.328000009059906,0.3260000050067901,0.3199999928474426,0.3400000035762787,0.3339999914169311,0.328000009059906,0.328000009059906,0.3339999914169311,0.328000009059906,0.328000009059906,0.335999995470047,0.3580000102519989,0.3499999940395355,0.3260000050067901,0.3499999940395355,0.3420000076293945,0.3160000145435333,0.3339999914169311,0.335999995470047,0.3400000035762787,0.3240000009536743,0.3319999873638153,0.3379999995231628,0.3400000035762787,0.3379999995231628,0.3319999873638153,0.3319999873638153,0.3440000116825104,0.3300000131130218,0.3219999969005584,0.3260000050067901,0.3219999969005584,0.3339999914169311,0.328000009059906,0.3300000131130218,0.3219999969005584,0.3379999995231628,0.3400000035762787,0.3319999873638153,0.328000009059906,0.3440000116825104,0.3339999914169311,0.328000009059906,0.3379999995231628,0.3499999940395355,0.3339999914169311,0.3300000131130218,0.328000009059906,0.335999995470047,0.3240000009536743,0.335999995470047,0.3240000009536743,0.3400000035762787,0.3400000035762787,0.3420000076293945,0.3319999873638153,0.3339999914169311,0.3300000131130218,0.3400000035762787,0.3459999859333038,0.3400000035762787,0.3379999995231628,0.3459999859333038,0.337
9999995231628,0.3300000131130218,0.3519999980926513,0.3379999995231628,0.356000006198883,0.335999995470047,0.3420000076293945,0.3400000035762787,0.328000009059906,0.3540000021457672,0.3499999940395355,0.3479999899864197,0.3440000116825104,0.3519999980926513,0.356000006198883,0.3540000021457672,0.3440000116825104,0.3499999940395355,0.356000006198883,0.356000006198883,0.356000006198883,0.363999992609024,0.3600000143051147,0.356000006198883,0.3479999899864197,0.356000006198883,0.3459999859333038,0.3479999899864197,0.3619999885559082,0.363999992609024,0.3499999940395355,0.3379999995231628,0.3479999899864197,0.3499999940395355,0.356000006198883,0.3519999980926513,0.3540000021457672,0.3619999885559082,0.3580000102519989,0.3540000021457672,0.356000006198883,0.3479999899864197,0.3519999980926513,0.356000006198883,0.3499999940395355,0.3379999995231628,0.3479999899864197,0.3499999940395355,0.3440000116825104,0.3580000102519989,0.356000006198883,0.3499999940395355,0.3479999899864197,0.3580000102519989,0.3519999980926513,0.3540000021457672,0.3519999980926513,0.3540000021457672,0.356000006198883,0.363999992609024,0.356000006198883,0.356000006198883],"label":"FineWeb filtered 
only"},"big-run-fineweb-cross-dedup-fixed":{"x":[0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,29.360128000000003,31.45728,33.554432000000006,35.651584,37.748736,39.845888,41.94304,44.040192000000005,46.137344000000006,48.234496,50.331648,52.4288,54.525952000000004,56.623104000000005,58.720256000000006,60.817408,62.91456,65.011712,67.10886400000001,69.206016,71.303168,73.40032000000001,75.497472,77.59462400000001,79.691776,81.788928,83.88608,85.983232,88.08038400000001,90.177536,92.27468800000001,94.37184,96.468992,98.56614400000001,100.663296,102.76044800000001,104.8576,106.95475200000001,109.05190400000001,111.149056,113.24620800000001,115.34336,117.44051200000001,119.537664,121.634816,123.73196800000001,125.82912,127.92627200000001,130.023424,132.120576,134.21772800000002,136.31488000000002,138.412032,140.509184,142.606336,144.70348800000002,146.80064000000002,148.897792,150.994944,153.092096,155.18924800000002,157.28640000000001,159.383552,161.480704,163.577856,165.67500800000002,167.77216,169.869312,171.966464,174.06361600000002,176.16076800000002,178.25792,180.355072,182.452224,184.54937600000002,186.64652800000002,188.74368,190.840832,192.937984,195.03513600000002,197.13228800000002,199.22944,201.326592,203.423744,205.52089600000002,207.61804800000002,209.7152,211.812352,213.90950400000003,216.00665600000002,218.10380800000001,220.20096,222.298112,224.39526400000003,226.49241600000002,228.589568,230.68672,232.783872,234.88102400000002,236.97817600000002,239.075328,241.17248,243.269632,245.36678400000002,247.46393600000002,249.561088,251.65824,253.75539200000003,255.85254400000002,257.949696,260.046848,262.144,264.241152,266.338304,268.43545600000004,270.53260800000004,272.62976000000003,274.726912,276.824064,278.921216,281.018368,283.11552,285.212672,287.309824,289.40697600000004,291.50412800000004,293.60128000000003,
295.698432,297.795584,299.892736,301.989888,304.08704,306.184192,308.28134400000005,310.37849600000004,312.47564800000004,314.57280000000003,316.669952,318.767104,320.864256,322.961408,325.05856,327.155712,329.25286400000005,331.35001600000004,333.44716800000003,335.54432,337.641472,339.738624,341.835776,343.932928,346.03008,348.12723200000005,350.22438400000004],"y":[0.2860000133514404,0.2460000067949295,0.2720000147819519,0.270000010728836,0.2939999997615814,0.2960000038146972,0.3240000009536743,0.3019999861717224,0.2879999876022339,0.3179999887943268,0.3059999942779541,0.2899999916553497,0.3100000023841858,0.3179999887943268,0.3219999969005584,0.3219999969005584,0.3300000131130218,0.3140000104904175,0.3240000009536743,0.3079999983310699,0.3260000050067901,0.3120000064373016,0.3160000145435333,0.3179999887943268,0.3260000050067901,0.3260000050067901,0.3240000009536743,0.3379999995231628,0.3219999969005584,0.3319999873638153,0.3379999995231628,0.3339999914169311,0.328000009059906,0.3319999873638153,0.3199999928474426,0.3000000119209289,0.3260000050067901,0.3240000009536743,0.328000009059906,0.3240000009536743,0.328000009059906,0.3260000050067901,0.3440000116825104,0.3199999928474426,0.3319999873638153,0.3219999969005584,0.335999995470047,0.3519999980926513,0.3379999995231628,0.328000009059906,0.3300000131130218,0.335999995470047,0.3479999899864197,0.3459999859333038,0.3479999899864197,0.3540000021457672,0.3479999899864197,0.3300000131130218,0.356000006198883,0.3479999899864197,0.356000006198883,0.335999995470047,0.335999995470047,0.3479999899864197,0.3339999914169311,0.3540000021457672,0.3300000131130218,0.3479999899864197,0.3499999940395355,0.3400000035762787,0.3459999859333038,0.3339999914169311,0.3479999899864197,0.335999995470047,0.3400000035762787,0.3179999887943268,0.335999995470047,0.328000009059906,0.328000009059906,0.3540000021457672,0.3479999899864197,0.3420000076293945,0.3580000102519989,0.3459999859333038,0.3420000076293945,0.3459999859333038,0.34400001
16825104,0.3499999940395355,0.335999995470047,0.3540000021457672,0.356000006198883,0.3400000035762787,0.3600000143051147,0.3580000102519989,0.3519999980926513,0.3499999940395355,0.3540000021457672,0.3519999980926513,0.3499999940395355,0.3440000116825104,0.356000006198883,0.3479999899864197,0.3479999899864197,0.3440000116825104,0.3499999940395355,0.3440000116825104,0.3519999980926513,0.3440000116825104,0.356000006198883,0.3459999859333038,0.3580000102519989,0.356000006198883,0.3519999980926513,0.3420000076293945,0.3379999995231628,0.3479999899864197,0.3459999859333038,0.3499999940395355,0.3400000035762787,0.3440000116825104,0.3420000076293945,0.3420000076293945,0.3499999940395355,0.3459999859333038,0.3420000076293945,0.3459999859333038,0.3459999859333038,0.3479999899864197,0.3440000116825104,0.3720000088214874,0.3619999885559082,0.356000006198883,0.3519999980926513,0.3459999859333038,0.3440000116825104,0.3420000076293945,0.3580000102519989,0.3600000143051147,0.3519999980926513,0.3600000143051147,0.3440000116825104,0.3600000143051147,0.3619999885559082,0.3499999940395355,0.3499999940395355,0.363999992609024,0.3580000102519989,0.3499999940395355,0.3479999899864197,0.3479999899864197,0.3580000102519989,0.3540000021457672,0.3600000143051147,0.3420000076293945,0.3519999980926513,0.3440000116825104,0.3519999980926513,0.3540000021457672,0.356000006198883,0.3459999859333038,0.3499999940395355,0.3519999980926513,0.3580000102519989,0.3440000116825104,0.3499999940395355,0.3580000102519989,0.3479999899864197,0.3479999899864197],"label":"FineWeb full MinHash"}},"layout":{"title":{"text":"Dedup across all dumps does not improve performance"}}}
dist/assets/data/plots/all_dumps_bad/piqa_acc_norm.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"data":{"big-run-refinedweb":{"x":[0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002,29.360128000000003,31.45728,33.554432000000006,35.651584,37.748736,39.845888,41.94304,44.040192000000005,46.137344000000006,48.234496,50.331648,52.4288,54.525952000000004,56.623104000000005,58.720256000000006,60.817408,62.91456,65.011712,67.10886400000001,69.206016,71.303168,73.40032000000001,75.497472,77.59462400000001,79.691776,81.788928,83.88608,85.983232,88.08038400000001,90.177536,92.27468800000001,94.37184,96.468992,98.56614400000001,100.663296,102.76044800000001,104.8576,106.95475200000001,109.05190400000001,111.149056,113.24620800000001,115.34336,117.44051200000001,119.537664,121.634816,123.73196800000001,125.82912,127.92627200000001,130.023424,132.120576,134.21772800000002,136.31488000000002,138.412032,140.509184,142.606336,144.70348800000002,146.80064000000002,148.897792,150.994944,153.092096,155.18924800000002,157.28640000000001,159.383552,161.480704,163.577856,165.67500800000002,167.77216,169.869312,171.966464,174.06361600000002,176.16076800000002,178.25792,180.355072,182.452224,184.54937600000002,186.64652800000002,188.74368,190.840832,192.937984,195.03513600000002,197.13228800000002,199.22944,201.326592,203.423744,205.52089600000002,207.61804800000002,209.7152,211.812352,213.90950400000003,216.00665600000002,218.10380800000001,220.20096,222.298112,224.39526400000003,226.49241600000002,228.589568,230.68672,232.783872,234.88102400000002,236.97817600000002,239.075328,241.17248,243.269632,245.36678400000002,247.46393600000002,249.561088,251.65824,253.75539200000003,255.85254400000002,257.949696,260.046848,262.144,264.241152,266.338304,268.43545600000004,270.53260800000004,272.62976000000003,274.726912,276.824064,278.921216,281.018368,283.11552,285.212672,287.309824,289.40697600000004,291.50412800000004,293.6012800
0000003,295.698432,297.795584,299.892736,301.989888,304.08704,306.184192,308.28134400000005,310.37849600000004,312.47564800000004,314.57280000000003,316.669952,318.767104,320.864256,322.961408,325.05856,327.155712,329.25286400000005,331.35001600000004,333.44716800000003,335.54432,337.641472,339.738624,341.835776,343.932928,346.03008,348.12723200000005,350.22438400000004],"y":[0.5099999904632568,0.6019999980926514,0.652999997138977,0.6710000038146973,0.6740000247955322,0.6899999976158142,0.6919999718666077,0.6909999847412109,0.7070000171661377,0.7089999914169312,0.7129999995231628,0.7229999899864197,0.7120000123977661,0.7200000286102295,0.7300000190734863,0.7279999852180481,0.7369999885559082,0.7390000224113464,0.7350000143051147,0.7319999933242798,0.7279999852180481,0.7269999980926514,0.7459999918937683,0.7400000095367432,0.7390000224113464,0.7319999933242798,0.7390000224113464,0.7379999756813049,0.7390000224113464,0.7360000014305115,0.7440000176429749,0.7400000095367432,0.7360000014305115,0.7480000257492065,0.7360000014305115,0.7440000176429749,0.7459999918937683,0.7409999966621399,0.746999979019165,0.7440000176429749,0.7450000047683716,0.753000020980835,0.7390000224113464,0.7490000128746033,0.7419999837875366,0.7390000224113464,0.7559999823570251,0.7519999742507935,0.7549999952316284,0.7419999837875366,0.7490000128746033,0.7540000081062317,0.7480000257492065,0.7450000047683716,0.7429999709129333,0.7509999871253967,0.7549999952316284,0.7490000128746033,0.7490000128746033,0.7400000095367432,0.753000020980835,0.75,0.7509999871253967,0.7570000290870667,0.7590000033378601,0.7570000290870667,0.7329999804496765,0.7540000081062317,0.746999979019165,0.7409999966621399,0.7590000033378601,0.7509999871253967,0.7570000290870667,0.75,0.7540000081062317,0.7480000257492065,0.7580000162124634,0.7639999985694885,0.7630000114440918,0.7590000033378601,0.7549999952316284,0.7480000257492065,0.7509999871253967,0.7570000290870667,0.75,0.7540000081062317,0.7480000257492065,0.7549999952316
284,0.7559999823570251,0.7580000162124634,0.7580000162124634,0.753000020980835,0.7490000128746033,0.7540000081062317,0.7639999985694885,0.7580000162124634,0.7519999742507935,0.7590000033378601,0.75,0.7570000290870667,0.7620000243186951,0.7710000276565552,0.7739999890327454,0.7620000243186951,0.7549999952316284,0.7599999904632568,0.765999972820282,0.7680000066757202,0.7639999985694885,0.7540000081062317,0.7649999856948853,0.7649999856948853,0.7609999775886536,0.7549999952316284,0.765999972820282,0.7639999985694885,0.7580000162124634,0.7710000276565552,0.7570000290870667,0.7630000114440918,0.7580000162124634,0.7599999904632568,0.7649999856948853,0.7670000195503235,0.7699999809265137,0.7710000276565552,0.7559999823570251,0.7609999775886536,0.7620000243186951,0.7620000243186951,0.7609999775886536,0.753000020980835,0.7570000290870667,0.7620000243186951,0.7609999775886536,0.7609999775886536,0.7559999823570251,0.7540000081062317,0.7570000290870667,0.7639999985694885,0.7590000033378601,0.7680000066757202,0.7680000066757202,0.765999972820282,0.765999972820282,0.7670000195503235,0.7739999890327454,0.7649999856948853,0.7749999761581421,0.7699999809265137,0.7639999985694885,0.7680000066757202,0.7630000114440918,0.7680000066757202,0.7699999809265137,0.7739999890327454,0.7749999761581421,0.765999972820282,0.7680000066757202,0.7710000276565552,0.7680000066757202,0.765999972820282,0.7689999938011169,0.7760000228881836,0.7710000276565552,0.7680000066757202,0.7649999856948853,0.7720000147819519,0.7730000019073486],"label":"RefinedWeb"},"big-run-fineweb-cross-dedup-fixed":{"x":[0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,29.360128000000003,31.45728,33.554432000000006,35.651584,37.748736,39.845888,41.94304,44.040192000000005,46.137344000000006,48.234496,50.331648,52.4288,54.525952000000004,56.623104000000005,58.720256000000006,60.817408,62.91
456,65.011712,67.10886400000001,69.206016,71.303168,73.40032000000001,75.497472,77.59462400000001,79.691776,81.788928,83.88608,85.983232,88.08038400000001,90.177536,92.27468800000001,94.37184,96.468992,98.56614400000001,100.663296,102.76044800000001,104.8576,106.95475200000001,109.05190400000001,111.149056,113.24620800000001,115.34336,117.44051200000001,119.537664,121.634816,123.73196800000001,125.82912,127.92627200000001,130.023424,132.120576,134.21772800000002,136.31488000000002,138.412032,140.509184,142.606336,144.70348800000002,146.80064000000002,148.897792,150.994944,153.092096,155.18924800000002,157.28640000000001,159.383552,161.480704,163.577856,165.67500800000002,167.77216,169.869312,171.966464,174.06361600000002,176.16076800000002,178.25792,180.355072,182.452224,184.54937600000002,186.64652800000002,188.74368,190.840832,192.937984,195.03513600000002,197.13228800000002,199.22944,201.326592,203.423744,205.52089600000002,207.61804800000002,209.7152,211.812352,213.90950400000003,216.00665600000002,218.10380800000001,220.20096,222.298112,224.39526400000003,226.49241600000002,228.589568,230.68672,232.783872,234.88102400000002,236.97817600000002,239.075328,241.17248,243.269632,245.36678400000002,247.46393600000002,249.561088,251.65824,253.75539200000003,255.85254400000002,257.949696,260.046848,262.144,264.241152,266.338304,268.43545600000004,270.53260800000004,272.62976000000003,274.726912,276.824064,278.921216,281.018368,283.11552,285.212672,287.309824,289.40697600000004,291.50412800000004,293.60128000000003,295.698432,297.795584,299.892736,301.989888,304.08704,306.184192,308.28134400000005,310.37849600000004,312.47564800000004,314.57280000000003,316.669952,318.767104,320.864256,322.961408,325.05856,327.155712,329.25286400000005,331.35001600000004,333.44716800000003,335.54432,337.641472,339.738624,341.835776,343.932928,346.03008,348.12723200000005,350.22438400000004],"y":[0.5099999904632568,0.6169999837875366,0.6359999775886536,0.6769999861717224,0.67699998617172
24,0.6970000267028809,0.6990000009536743,0.6970000267028809,0.6959999799728394,0.7049999833106995,0.7089999914169312,0.7179999947547913,0.7099999785423279,0.7160000205039978,0.7260000109672546,0.7229999899864197,0.7179999947547913,0.7210000157356262,0.7200000286102295,0.734000027179718,0.7089999914169312,0.7229999899864197,0.7239999771118164,0.7310000061988831,0.7300000190734863,0.7260000109672546,0.7250000238418579,0.7239999771118164,0.7289999723434448,0.7390000224113464,0.7229999899864197,0.7310000061988831,0.7350000143051147,0.7289999723434448,0.734000027179718,0.7289999723434448,0.7329999804496765,0.7300000190734863,0.7319999933242798,0.7440000176429749,0.746999979019165,0.7310000061988831,0.7329999804496765,0.7480000257492065,0.7429999709129333,0.7369999885559082,0.7269999980926514,0.7269999980926514,0.7379999756813049,0.75,0.7360000014305115,0.746999979019165,0.7409999966621399,0.7369999885559082,0.7459999918937683,0.7400000095367432,0.7409999966621399,0.746999979019165,0.7360000014305115,0.7459999918937683,0.7400000095367432,0.7429999709129333,0.7350000143051147,0.7390000224113464,0.7379999756813049,0.7480000257492065,0.7329999804496765,0.734000027179718,0.7390000224113464,0.7459999918937683,0.7360000014305115,0.7419999837875366,0.7429999709129333,0.7400000095367432,0.7379999756813049,0.7310000061988831,0.7360000014305115,0.7390000224113464,0.75,0.7369999885559082,0.7570000290870667,0.7409999966621399,0.7459999918937683,0.7350000143051147,0.7459999918937683,0.7509999871253967,0.7429999709129333,0.7419999837875366,0.7419999837875366,0.75,0.7440000176429749,0.7450000047683716,0.75,0.7409999966621399,0.7490000128746033,0.7409999966621399,0.7419999837875366,0.7429999709129333,0.7490000128746033,0.7419999837875366,0.7419999837875366,0.75,0.753000020980835,0.75,0.746999979019165,0.7519999742507935,0.746999979019165,0.7570000290870667,0.7549999952316284,0.75,0.7540000081062317,0.7480000257492065,0.7490000128746033,0.7419999837875366,0.7419999837875366,0.746999979019
165,0.746999979019165,0.75,0.7519999742507935,0.7580000162124634,0.7549999952316284,0.7490000128746033,0.7480000257492065,0.7519999742507935,0.7590000033378601,0.7450000047683716,0.75,0.7440000176429749,0.7419999837875366,0.7519999742507935,0.7450000047683716,0.753000020980835,0.7450000047683716,0.7440000176429749,0.7559999823570251,0.7509999871253967,0.7540000081062317,0.7440000176429749,0.7509999871253967,0.753000020980835,0.7490000128746033,0.7570000290870667,0.7490000128746033,0.746999979019165,0.746999979019165,0.7509999871253967,0.7509999871253967,0.7519999742507935,0.7570000290870667,0.7540000081062317,0.7440000176429749,0.7480000257492065,0.7509999871253967,0.7509999871253967,0.7509999871253967,0.7549999952316284,0.75,0.7559999823570251,0.746999979019165,0.7609999775886536,0.7549999952316284,0.746999979019165,0.7490000128746033,0.753000020980835,0.753000020980835,0.7609999775886536,0.746999979019165,0.7580000162124634],"label":"FineWeb full MinHash"},"big-run-sampled_full_filtered_no_dedup":{"x":[0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002,29.360128000000003,31.45728,33.554432000000006,35.651584,37.748736,39.845888,41.94304,44.040192000000005,46.137344000000006,48.234496,50.331648,52.4288,54.525952000000004,56.623104000000005,58.720256000000006,60.817408,62.91456,65.011712,67.10886400000001,69.206016,71.303168,73.40032000000001,75.497472,77.59462400000001,79.691776,81.788928,83.88608,85.983232,88.08038400000001,90.177536,92.27468800000001,94.37184,96.468992,98.56614400000001,100.663296,102.76044800000001,104.8576,106.95475200000001,109.05190400000001,111.149056,113.24620800000001,115.34336,117.44051200000001,119.537664,121.634816,123.73196800000001,125.82912,127.92627200000001,130.023424,132.120576,134.21772800000002,136.31488000000002,138.412032,140.509184,142.606336,144.70348800000002,146.80064
000000002,148.897792,150.994944,153.092096,155.18924800000002,157.28640000000001,159.383552,161.480704,163.577856,165.67500800000002,167.77216,169.869312,171.966464,174.06361600000002,176.16076800000002,178.25792,180.355072,182.452224,184.54937600000002,186.64652800000002,188.74368,190.840832,192.937984,195.03513600000002,197.13228800000002,199.22944,201.326592,203.423744,205.52089600000002,207.61804800000002,209.7152,211.812352,213.90950400000003,216.00665600000002,218.10380800000001,220.20096,222.298112,224.39526400000003,226.49241600000002,228.589568,230.68672,232.783872,234.88102400000002,236.97817600000002,239.075328,241.17248,243.269632,245.36678400000002,247.46393600000002,249.561088,251.65824,253.75539200000003,255.85254400000002,257.949696,260.046848,262.144,264.241152,266.338304,268.43545600000004,270.53260800000004,272.62976000000003,274.726912,276.824064,278.921216,281.018368,283.11552,285.212672,287.309824,289.40697600000004,291.50412800000004,293.60128000000003,295.698432,297.795584,299.892736,301.989888,304.08704,306.184192,308.28134400000005,310.37849600000004,312.47564800000004,314.57280000000003,316.669952,318.767104,320.864256,322.961408,325.05856,327.155712,329.25286400000005,331.35001600000004,333.44716800000003,335.54432,337.641472,339.738624,341.835776,343.932928,346.03008,348.12723200000005,350.22438400000004],"y":[0.5099999904632568,0.621999979019165,0.6439999938011169,0.6700000166893005,0.6790000200271606,0.6869999766349792,0.6959999799728394,0.6790000200271606,0.6880000233650208,0.7049999833106995,0.699999988079071,0.6990000009536743,0.6940000057220459,0.7110000252723694,0.7089999914169312,0.7120000123977661,0.7070000171661377,0.7070000171661377,0.6990000009536743,0.7009999752044678,0.7160000205039978,0.7200000286102295,0.7149999737739563,0.7250000238418579,0.7210000157356262,0.722000002861023,0.7310000061988831,0.7289999723434448,0.7319999933242798,0.7250000238418579,0.722000002861023,0.7210000157356262,0.7170000076293945,0.72600001096725
46,0.7250000238418579,0.7210000157356262,0.7200000286102295,0.7379999756813049,0.7239999771118164,0.7239999771118164,0.7080000042915344,0.7289999723434448,0.7289999723434448,0.7300000190734863,0.7329999804496765,0.7319999933242798,0.7350000143051147,0.7390000224113464,0.7350000143051147,0.7289999723434448,0.734000027179718,0.7329999804496765,0.7400000095367432,0.7409999966621399,0.7310000061988831,0.7350000143051147,0.7360000014305115,0.7360000014305115,0.7409999966621399,0.7319999933242798,0.7409999966621399,0.7400000095367432,0.7390000224113464,0.7329999804496765,0.7459999918937683,0.753000020980835,0.746999979019165,0.734000027179718,0.7369999885559082,0.7419999837875366,0.734000027179718,0.7419999837875366,0.7289999723434448,0.7350000143051147,0.7300000190734863,0.7519999742507935,0.7390000224113464,0.7400000095367432,0.7409999966621399,0.7429999709129333,0.7450000047683716,0.7329999804496765,0.7260000109672546,0.7570000290870667,0.7360000014305115,0.7519999742507935,0.7419999837875366,0.7379999756813049,0.7390000224113464,0.7490000128746033,0.734000027179718,0.7360000014305115,0.7390000224113464,0.7440000176429749,0.7450000047683716,0.7319999933242798,0.7429999709129333,0.7519999742507935,0.7540000081062317,0.7519999742507935,0.753000020980835,0.7480000257492065,0.7440000176429749,0.7459999918937683,0.7369999885559082,0.7419999837875366,0.7480000257492065,0.7419999837875366,0.765999972820282,0.746999979019165,0.7459999918937683,0.7570000290870667,0.7390000224113464,0.7409999966621399,0.7459999918937683,0.75,0.7570000290870667,0.753000020980835,0.7549999952316284,0.7519999742507935,0.7490000128746033,0.746999979019165,0.7459999918937683,0.7459999918937683,0.746999979019165,0.7409999966621399,0.7419999837875366,0.7459999918937683,0.7440000176429749,0.7459999918937683,0.7490000128746033,0.7450000047683716,0.7409999966621399,0.7419999837875366,0.7490000128746033,0.7590000033378601,0.7549999952316284,0.7549999952316284,0.746999979019165,0.753000020980835,0.754999995
2316284,0.746999979019165,0.7580000162124634,0.7490000128746033,0.753000020980835,0.75,0.75,0.7540000081062317,0.7540000081062317,0.7490000128746033,0.7570000290870667,0.7570000290870667,0.7590000033378601,0.7559999823570251,0.7620000243186951,0.7590000033378601,0.7509999871253967,0.7639999985694885,0.7580000162124634,0.7599999904632568,0.7620000243186951,0.7590000033378601,0.7609999775886536,0.7559999823570251,0.75,0.7509999871253967,0.7549999952316284,0.7540000081062317,0.7540000081062317],"label":"FineWeb filtered only"}},"layout":{"title":{"text":"Dedup across all dumps does not improve performance"}}}
dist/assets/data/plots/all_dumps_bad/siqa_acc_norm.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"data":{"big-run-refinedweb":{"x":[0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002,29.360128000000003,31.45728,33.554432000000006,35.651584,37.748736,39.845888,41.94304,44.040192000000005,46.137344000000006,48.234496,50.331648,52.4288,54.525952000000004,56.623104000000005,58.720256000000006,60.817408,62.91456,65.011712,67.10886400000001,69.206016,71.303168,73.40032000000001,75.497472,77.59462400000001,79.691776,81.788928,83.88608,85.983232,88.08038400000001,90.177536,92.27468800000001,94.37184,96.468992,98.56614400000001,100.663296,102.76044800000001,104.8576,106.95475200000001,109.05190400000001,111.149056,113.24620800000001,115.34336,117.44051200000001,119.537664,121.634816,123.73196800000001,125.82912,127.92627200000001,130.023424,132.120576,134.21772800000002,136.31488000000002,138.412032,140.509184,142.606336,144.70348800000002,146.80064000000002,148.897792,150.994944,153.092096,155.18924800000002,157.28640000000001,159.383552,161.480704,163.577856,165.67500800000002,167.77216,169.869312,171.966464,174.06361600000002,176.16076800000002,178.25792,180.355072,182.452224,184.54937600000002,186.64652800000002,188.74368,190.840832,192.937984,195.03513600000002,197.13228800000002,199.22944,201.326592,203.423744,205.52089600000002,207.61804800000002,209.7152,211.812352,213.90950400000003,216.00665600000002,218.10380800000001,220.20096,222.298112,224.39526400000003,226.49241600000002,228.589568,230.68672,232.783872,234.88102400000002,236.97817600000002,239.075328,241.17248,243.269632,245.36678400000002,247.46393600000002,249.561088,251.65824,253.75539200000003,255.85254400000002,257.949696,260.046848,262.144,264.241152,266.338304,268.43545600000004,270.53260800000004,272.62976000000003,274.726912,276.824064,278.921216,281.018368,283.11552,285.212672,287.309824,289.40697600000004,291.50412800000004,293.6012800
0000003,295.698432,297.795584,299.892736,301.989888,304.08704,306.184192,308.28134400000005,310.37849600000004,312.47564800000004,314.57280000000003,316.669952,318.767104,320.864256,322.961408,325.05856,327.155712,329.25286400000005,331.35001600000004,333.44716800000003,335.54432,337.641472,339.738624,341.835776,343.932928,346.03008,348.12723200000005,350.22438400000004],"y":[0.3619999885559082,0.3980000019073486,0.3899999856948852,0.3860000073909759,0.3919999897480011,0.402999997138977,0.3959999978542328,0.3959999978542328,0.4070000052452087,0.4009999930858612,0.4079999923706054,0.4009999930858612,0.3910000026226043,0.3980000019073486,0.395000010728836,0.4129999876022339,0.4020000100135803,0.4090000092983246,0.4120000004768371,0.4129999876022339,0.4129999876022339,0.4099999964237213,0.4110000133514404,0.4110000133514404,0.4090000092983246,0.4000000059604645,0.4050000011920929,0.3939999938011169,0.3889999985694885,0.4050000011920929,0.4099999964237213,0.3980000019073486,0.4090000092983246,0.4079999923706054,0.4070000052452087,0.4040000140666961,0.4129999876022339,0.4090000092983246,0.4059999883174896,0.4090000092983246,0.4090000092983246,0.4149999916553497,0.4059999883174896,0.4000000059604645,0.4000000059604645,0.4070000052452087,0.402999997138977,0.4040000140666961,0.3989999890327453,0.4020000100135803,0.4160000085830688,0.4050000011920929,0.4110000133514404,0.4059999883174896,0.3989999890327453,0.4169999957084656,0.4040000140666961,0.4050000011920929,0.4149999916553497,0.4020000100135803,0.402999997138977,0.4129999876022339,0.4009999930858612,0.4059999883174896,0.4040000140666961,0.4099999964237213,0.414000004529953,0.4210000038146972,0.4110000133514404,0.4070000052452087,0.4099999964237213,0.4169999957084656,0.4070000052452087,0.4199999868869781,0.4079999923706054,0.4180000126361847,0.4110000133514404,0.4110000133514404,0.4189999997615814,0.414000004529953,0.4129999876022339,0.4180000126361847,0.4070000052452087,0.4059999883174896,0.4059999883174896,0.4129999876
022339,0.4149999916553497,0.4099999964237213,0.4009999930858612,0.4020000100135803,0.4099999964237213,0.4169999957084656,0.4129999876022339,0.414000004529953,0.4099999964237213,0.4189999997615814,0.4210000038146972,0.4090000092983246,0.4079999923706054,0.4099999964237213,0.4099999964237213,0.4129999876022339,0.4099999964237213,0.4099999964237213,0.4110000133514404,0.4020000100135803,0.4079999923706054,0.4079999923706054,0.414000004529953,0.4129999876022339,0.4189999997615814,0.4129999876022339,0.4180000126361847,0.4050000011920929,0.4230000078678131,0.4180000126361847,0.4120000004768371,0.4149999916553497,0.4189999997615814,0.4110000133514404,0.4160000085830688,0.4059999883174896,0.4110000133514404,0.4110000133514404,0.4110000133514404,0.4040000140666961,0.4149999916553497,0.414000004529953,0.4160000085830688,0.414000004529953,0.4129999876022339,0.4120000004768371,0.4149999916553497,0.4169999957084656,0.4110000133514404,0.414000004529953,0.4160000085830688,0.4110000133514404,0.4120000004768371,0.4110000133514404,0.4149999916553497,0.4129999876022339,0.4110000133514404,0.4129999876022339,0.4099999964237213,0.4180000126361847,0.414000004529953,0.4040000140666961,0.4099999964237213,0.4099999964237213,0.4120000004768371,0.4149999916553497,0.4129999876022339,0.4079999923706054,0.4040000140666961,0.4129999876022339,0.4149999916553497,0.4120000004768371,0.402999997138977,0.4090000092983246,0.4110000133514404,0.4090000092983246,0.4070000052452087,0.4149999916553497,0.4070000052452087,0.4120000004768371,0.4059999883174896,0.4059999883174896,0.4099999964237213],"label":"RefinedWeb"},"big-run-fineweb-cross-dedup-fixed":{"x":[0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,29.360128000000003,31.45728,33.554432000000006,35.651584,37.748736,39.845888,41.94304,44.040192000000005,46.137344000000006,48.234496,50.331648,52.4288,54.5259520000000
04,56.623104000000005,58.720256000000006,60.817408,62.91456,65.011712,67.10886400000001,69.206016,71.303168,73.40032000000001,75.497472,77.59462400000001,79.691776,81.788928,83.88608,85.983232,88.08038400000001,90.177536,92.27468800000001,94.37184,96.468992,98.56614400000001,100.663296,102.76044800000001,104.8576,106.95475200000001,109.05190400000001,111.149056,113.24620800000001,115.34336,117.44051200000001,119.537664,121.634816,123.73196800000001,125.82912,127.92627200000001,130.023424,132.120576,134.21772800000002,136.31488000000002,138.412032,140.509184,142.606336,144.70348800000002,146.80064000000002,148.897792,150.994944,153.092096,155.18924800000002,157.28640000000001,159.383552,161.480704,163.577856,165.67500800000002,167.77216,169.869312,171.966464,174.06361600000002,176.16076800000002,178.25792,180.355072,182.452224,184.54937600000002,186.64652800000002,188.74368,190.840832,192.937984,195.03513600000002,197.13228800000002,199.22944,201.326592,203.423744,205.52089600000002,207.61804800000002,209.7152,211.812352,213.90950400000003,216.00665600000002,218.10380800000001,220.20096,222.298112,224.39526400000003,226.49241600000002,228.589568,230.68672,232.783872,234.88102400000002,236.97817600000002,239.075328,241.17248,243.269632,245.36678400000002,247.46393600000002,249.561088,251.65824,253.75539200000003,255.85254400000002,257.949696,260.046848,262.144,264.241152,266.338304,268.43545600000004,270.53260800000004,272.62976000000003,274.726912,276.824064,278.921216,281.018368,283.11552,285.212672,287.309824,289.40697600000004,291.50412800000004,293.60128000000003,295.698432,297.795584,299.892736,301.989888,304.08704,306.184192,308.28134400000005,310.37849600000004,312.47564800000004,314.57280000000003,316.669952,318.767104,320.864256,322.961408,325.05856,327.155712,329.25286400000005,331.35001600000004,333.44716800000003,335.54432,337.641472,339.738624,341.835776,343.932928,346.03008,348.12723200000005,350.22438400000004],"y":[0.3619999885559082,0.395000010728836
,0.3919999897480011,0.3819999992847442,0.3840000033378601,0.3869999945163727,0.395000010728836,0.3959999978542328,0.4020000100135803,0.4009999930858612,0.4079999923706054,0.402999997138977,0.4000000059604645,0.3930000066757202,0.4050000011920929,0.4040000140666961,0.3959999978542328,0.4009999930858612,0.4059999883174896,0.3989999890327453,0.3970000147819519,0.4070000052452087,0.4079999923706054,0.4000000059604645,0.3959999978542328,0.3970000147819519,0.4009999930858612,0.3980000019073486,0.3959999978542328,0.3970000147819519,0.4000000059604645,0.3910000026226043,0.4110000133514404,0.4040000140666961,0.3919999897480011,0.4160000085830688,0.4120000004768371,0.4070000052452087,0.4000000059604645,0.4040000140666961,0.4120000004768371,0.3939999938011169,0.4020000100135803,0.4000000059604645,0.4090000092983246,0.4059999883174896,0.3980000019073486,0.4210000038146972,0.402999997138977,0.4149999916553497,0.4009999930858612,0.414000004529953,0.4129999876022339,0.4199999868869781,0.4090000092983246,0.3989999890327453,0.4040000140666961,0.402999997138977,0.402999997138977,0.4059999883174896,0.4050000011920929,0.4160000085830688,0.4169999957084656,0.4079999923706054,0.402999997138977,0.4020000100135803,0.3959999978542328,0.4169999957084656,0.3970000147819519,0.4099999964237213,0.402999997138977,0.4059999883174896,0.402999997138977,0.3939999938011169,0.3939999938011169,0.4020000100135803,0.3970000147819519,0.4120000004768371,0.4040000140666961,0.4040000140666961,0.4090000092983246,0.3980000019073486,0.4079999923706054,0.4070000052452087,0.4099999964237213,0.3989999890327453,0.4000000059604645,0.4070000052452087,0.3980000019073486,0.402999997138977,0.4090000092983246,0.4040000140666961,0.3889999985694885,0.4000000059604645,0.402999997138977,0.4050000011920929,0.395000010728836,0.4009999930858612,0.3989999890327453,0.3970000147819519,0.4009999930858612,0.3989999890327453,0.3970000147819519,0.4099999964237213,0.3989999890327453,0.4070000052452087,0.4009999930858612,0.38800001144409
18,0.3959999978542328,0.3910000026226043,0.3930000066757202,0.3980000019073486,0.402999997138977,0.4009999930858612,0.4000000059604645,0.3919999897480011,0.3980000019073486,0.395000010728836,0.4020000100135803,0.3989999890327453,0.4020000100135803,0.4040000140666961,0.4070000052452087,0.4090000092983246,0.4079999923706054,0.4099999964237213,0.4040000140666961,0.3889999985694885,0.3989999890327453,0.4020000100135803,0.3989999890327453,0.3970000147819519,0.4009999930858612,0.4090000092983246,0.414000004529953,0.395000010728836,0.4009999930858612,0.4020000100135803,0.4009999930858612,0.3980000019073486,0.402999997138977,0.3980000019073486,0.402999997138977,0.395000010728836,0.4020000100135803,0.395000010728836,0.3989999890327453,0.3970000147819519,0.3980000019073486,0.3980000019073486,0.3970000147819519,0.3939999938011169,0.395000010728836,0.3989999890327453,0.3970000147819519,0.4020000100135803,0.3930000066757202,0.3989999890327453,0.4050000011920929,0.3930000066757202,0.4040000140666961,0.4000000059604645,0.4020000100135803,0.3880000114440918,0.395000010728836,0.3910000026226043,0.3980000019073486,0.4009999930858612],"label":"FineWeb full 
MinHash"},"big-run-sampled_full_filtered_no_dedup":{"x":[0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002,29.360128000000003,31.45728,33.554432000000006,35.651584,37.748736,39.845888,41.94304,44.040192000000005,46.137344000000006,48.234496,50.331648,52.4288,54.525952000000004,56.623104000000005,58.720256000000006,60.817408,62.91456,65.011712,67.10886400000001,69.206016,71.303168,73.40032000000001,75.497472,77.59462400000001,79.691776,81.788928,83.88608,85.983232,88.08038400000001,90.177536,92.27468800000001,94.37184,96.468992,98.56614400000001,100.663296,102.76044800000001,104.8576,106.95475200000001,109.05190400000001,111.149056,113.24620800000001,115.34336,117.44051200000001,119.537664,121.634816,123.73196800000001,125.82912,127.92627200000001,130.023424,132.120576,134.21772800000002,136.31488000000002,138.412032,140.509184,142.606336,144.70348800000002,146.80064000000002,148.897792,150.994944,153.092096,155.18924800000002,157.28640000000001,159.383552,161.480704,163.577856,165.67500800000002,167.77216,169.869312,171.966464,174.06361600000002,176.16076800000002,178.25792,180.355072,182.452224,184.54937600000002,186.64652800000002,188.74368,190.840832,192.937984,195.03513600000002,197.13228800000002,199.22944,201.326592,203.423744,205.52089600000002,207.61804800000002,209.7152,211.812352,213.90950400000003,216.00665600000002,218.10380800000001,220.20096,222.298112,224.39526400000003,226.49241600000002,228.589568,230.68672,232.783872,234.88102400000002,236.97817600000002,239.075328,241.17248,243.269632,245.36678400000002,247.46393600000002,249.561088,251.65824,253.75539200000003,255.85254400000002,257.949696,260.046848,262.144,264.241152,266.338304,268.43545600000004,270.53260800000004,272.62976000000003,274.726912,276.824064,278.921216,281.018368,283.11552,285.212672,287.309824,289.40697600000004,291.5041280
0000004,293.60128000000003,295.698432,297.795584,299.892736,301.989888,304.08704,306.184192,308.28134400000005,310.37849600000004,312.47564800000004,314.57280000000003,316.669952,318.767104,320.864256,322.961408,325.05856,327.155712,329.25286400000005,331.35001600000004,333.44716800000003,335.54432,337.641472,339.738624,341.835776,343.932928,346.03008,348.12723200000005,350.22438400000004],"y":[0.3619999885559082,0.4000000059604645,0.395000010728836,0.3959999978542328,0.4020000100135803,0.4000000059604645,0.3959999978542328,0.3930000066757202,0.3899999856948852,0.402999997138977,0.4009999930858612,0.3930000066757202,0.4050000011920929,0.3939999938011169,0.4110000133514404,0.4000000059604645,0.3989999890327453,0.3959999978542328,0.4020000100135803,0.4000000059604645,0.3939999938011169,0.395000010728836,0.3919999897480011,0.3980000019073486,0.3910000026226043,0.3880000114440918,0.3959999978542328,0.3980000019073486,0.3989999890327453,0.402999997138977,0.3959999978542328,0.3980000019073486,0.395000010728836,0.4090000092983246,0.4090000092983246,0.3889999985694885,0.3959999978542328,0.3880000114440918,0.3840000033378601,0.3959999978542328,0.3880000114440918,0.3939999938011169,0.3970000147819519,0.3910000026226043,0.3939999938011169,0.4020000100135803,0.3980000019073486,0.3970000147819519,0.4009999930858612,0.3919999897480011,0.3899999856948852,0.3989999890327453,0.3860000073909759,0.3860000073909759,0.3970000147819519,0.3959999978542328,0.3939999938011169,0.3840000033378601,0.3869999945163727,0.402999997138977,0.4050000011920929,0.395000010728836,0.3880000114440918,0.3869999945163727,0.3939999938011169,0.402999997138977,0.3899999856948852,0.3910000026226043,0.3910000026226043,0.4009999930858612,0.3919999897480011,0.3970000147819519,0.3919999897480011,0.3930000066757202,0.3869999945163727,0.3880000114440918,0.3849999904632568,0.3930000066757202,0.395000010728836,0.3889999985694885,0.3959999978542328,0.3989999890327453,0.402999997138977,0.3939999938011169,0.40000000596046
45,0.4000000059604645,0.4050000011920929,0.3989999890327453,0.3869999945163727,0.3910000026226043,0.3889999985694885,0.3889999985694885,0.4000000059604645,0.3910000026226043,0.3970000147819519,0.3989999890327453,0.3989999890327453,0.3959999978542328,0.3910000026226043,0.3880000114440918,0.3939999938011169,0.382999986410141,0.3849999904632568,0.3959999978542328,0.3989999890327453,0.3959999978542328,0.3880000114440918,0.3840000033378601,0.3980000019073486,0.4000000059604645,0.4000000059604645,0.4020000100135803,0.395000010728836,0.3910000026226043,0.3919999897480011,0.4040000140666961,0.3989999890327453,0.4020000100135803,0.3910000026226043,0.4009999930858612,0.3959999978542328,0.3939999938011169,0.3930000066757202,0.3910000026226043,0.3970000147819519,0.3880000114440918,0.3970000147819519,0.3959999978542328,0.3889999985694885,0.3970000147819519,0.4009999930858612,0.3970000147819519,0.3959999978542328,0.3959999978542328,0.3989999890327453,0.4040000140666961,0.3959999978542328,0.3980000019073486,0.3970000147819519,0.3970000147819519,0.3989999890327453,0.4020000100135803,0.3980000019073486,0.4000000059604645,0.4000000059604645,0.402999997138977,0.4090000092983246,0.3970000147819519,0.4020000100135803,0.3970000147819519,0.4009999930858612,0.3959999978542328,0.3970000147819519,0.3989999890327453,0.3939999938011169,0.3989999890327453,0.4000000059604645,0.4000000059604645,0.3989999890327453,0.4050000011920929,0.4059999883174896,0.4009999930858612,0.3989999890327453,0.3959999978542328,0.3939999938011169,0.3970000147819519,0.4009999930858612,0.3989999890327453,0.3939999938011169],"label":"FineWeb filtered only"}},"layout":{"title":{"text":"Dedup across all dumps does not improve performance"}}}
dist/assets/data/plots/all_dumps_bad/winogrande_acc_norm.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"data":{"big-run-fineweb-cross-dedup-fixed":{"x":[0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,29.360128000000003,31.45728,33.554432000000006,35.651584,37.748736,39.845888,41.94304,44.040192000000005,46.137344000000006,48.234496,50.331648,52.4288,54.525952000000004,56.623104000000005,58.720256000000006,60.817408,62.91456,65.011712,67.10886400000001,69.206016,71.303168,73.40032000000001,75.497472,77.59462400000001,79.691776,81.788928,83.88608,85.983232,88.08038400000001,90.177536,92.27468800000001,94.37184,96.468992,98.56614400000001,100.663296,102.76044800000001,104.8576,106.95475200000001,109.05190400000001,111.149056,113.24620800000001,115.34336,117.44051200000001,119.537664,121.634816,123.73196800000001,125.82912,127.92627200000001,130.023424,132.120576,134.21772800000002,136.31488000000002,138.412032,140.509184,142.606336,144.70348800000002,146.80064000000002,148.897792,150.994944,153.092096,155.18924800000002,157.28640000000001,159.383552,161.480704,163.577856,165.67500800000002,167.77216,169.869312,171.966464,174.06361600000002,176.16076800000002,178.25792,180.355072,182.452224,184.54937600000002,186.64652800000002,188.74368,190.840832,192.937984,195.03513600000002,197.13228800000002,199.22944,201.326592,203.423744,205.52089600000002,207.61804800000002,209.7152,211.812352,213.90950400000003,216.00665600000002,218.10380800000001,220.20096,222.298112,224.39526400000003,226.49241600000002,228.589568,230.68672,232.783872,234.88102400000002,236.97817600000002,239.075328,241.17248,243.269632,245.36678400000002,247.46393600000002,249.561088,251.65824,253.75539200000003,255.85254400000002,257.949696,260.046848,262.144,264.241152,266.338304,268.43545600000004,270.53260800000004,272.62976000000003,274.726912,276.824064,278.921216,281.018368,283.11552,285.212672,287.309824,289.40697600000004,291.50412800000004,293.60128000000
003,295.698432,297.795584,299.892736,301.989888,304.08704,306.184192,308.28134400000005,310.37849600000004,312.47564800000004,314.57280000000003,316.669952,318.767104,320.864256,322.961408,325.05856,327.155712,329.25286400000005,331.35001600000004,333.44716800000003,335.54432,337.641472,339.738624,341.835776,343.932928,346.03008,348.12723200000005,350.22438400000004],"y":[0.4970000088214874,0.4869999885559082,0.4959999918937683,0.4979999959468841,0.5099999904632568,0.515999972820282,0.5080000162124634,0.5249999761581421,0.5239999890327454,0.5299999713897705,0.5239999890327454,0.5149999856948853,0.5270000100135803,0.5249999761581421,0.5180000066757202,0.5220000147819519,0.5329999923706055,0.5289999842643738,0.5239999890327454,0.5299999713897705,0.5230000019073486,0.5130000114440918,0.5180000066757202,0.5299999713897705,0.5199999809265137,0.5270000100135803,0.5230000019073486,0.5299999713897705,0.5320000052452087,0.5429999828338623,0.527999997138977,0.5379999876022339,0.527999997138977,0.5419999957084656,0.5329999923706055,0.5450000166893005,0.5320000052452087,0.5410000085830688,0.5249999761581421,0.5400000214576721,0.5249999761581421,0.5289999842643738,0.5320000052452087,0.5339999794960022,0.5320000052452087,0.5350000262260437,0.5400000214576721,0.5450000166893005,0.5440000295639038,0.5400000214576721,0.5379999876022339,0.5350000262260437,0.5410000085830688,0.5490000247955322,0.531000018119812,0.5389999747276306,0.546999990940094,0.5529999732971191,0.5370000004768372,0.5440000295639038,0.5400000214576721,0.5490000247955322,0.550000011920929,0.5580000281333923,0.5609999895095825,0.5429999828338623,0.5529999732971191,0.5519999861717224,0.5450000166893005,0.550000011920929,0.5379999876022339,0.5490000247955322,0.5460000038146973,0.5419999957084656,0.5569999814033508,0.5509999990463257,0.5490000247955322,0.5529999732971191,0.5479999780654907,0.5590000152587891,0.5479999780654907,0.5509999990463257,0.5440000295639038,0.5509999990463257,0.5540000200271606,0.555999994277954
1,0.5630000233650208,0.5649999976158142,0.5640000104904175,0.5649999976158142,0.5490000247955322,0.5709999799728394,0.5659999847412109,0.5630000233650208,0.5640000104904175,0.5580000281333923,0.546999990940094,0.5550000071525574,0.5580000281333923,0.5429999828338623,0.5440000295639038,0.5569999814033508,0.5569999814033508,0.5540000200271606,0.5550000071525574,0.5649999976158142,0.5540000200271606,0.5630000233650208,0.5609999895095825,0.5580000281333923,0.5509999990463257,0.5550000071525574,0.5550000071525574,0.5519999861717224,0.5609999895095825,0.5630000233650208,0.5509999990463257,0.550000011920929,0.5490000247955322,0.5540000200271606,0.550000011920929,0.5529999732971191,0.5460000038146973,0.550000011920929,0.5529999732971191,0.5519999861717224,0.5529999732971191,0.5609999895095825,0.5590000152587891,0.5550000071525574,0.550000011920929,0.5609999895095825,0.5619999766349792,0.5609999895095825,0.5540000200271606,0.550000011920929,0.5600000023841858,0.5559999942779541,0.5609999895095825,0.5569999814033508,0.5600000023841858,0.5680000185966492,0.5580000281333923,0.5559999942779541,0.5569999814033508,0.5669999718666077,0.5709999799728394,0.5640000104904175,0.5569999814033508,0.5600000023841858,0.5569999814033508,0.5649999976158142,0.5600000023841858,0.5580000281333923,0.5609999895095825,0.5590000152587891,0.5640000104904175,0.5529999732971191,0.5640000104904175,0.5649999976158142,0.5659999847412109,0.5630000233650208,0.5630000233650208,0.5619999766349792,0.5609999895095825,0.5559999942779541,0.5529999732971191,0.5600000023841858],"label":"FineWeb full 
MinHash"},"big-run-sampled_full_filtered_no_dedup":{"x":[0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002,29.360128000000003,31.45728,33.554432000000006,35.651584,37.748736,39.845888,41.94304,44.040192000000005,46.137344000000006,48.234496,50.331648,52.4288,54.525952000000004,56.623104000000005,58.720256000000006,60.817408,62.91456,65.011712,67.10886400000001,69.206016,71.303168,73.40032000000001,75.497472,77.59462400000001,79.691776,81.788928,83.88608,85.983232,88.08038400000001,90.177536,92.27468800000001,94.37184,96.468992,98.56614400000001,100.663296,102.76044800000001,104.8576,106.95475200000001,109.05190400000001,111.149056,113.24620800000001,115.34336,117.44051200000001,119.537664,121.634816,123.73196800000001,125.82912,127.92627200000001,130.023424,132.120576,134.21772800000002,136.31488000000002,138.412032,140.509184,142.606336,144.70348800000002,146.80064000000002,148.897792,150.994944,153.092096,155.18924800000002,157.28640000000001,159.383552,161.480704,163.577856,165.67500800000002,167.77216,169.869312,171.966464,174.06361600000002,176.16076800000002,178.25792,180.355072,182.452224,184.54937600000002,186.64652800000002,188.74368,190.840832,192.937984,195.03513600000002,197.13228800000002,199.22944,201.326592,203.423744,205.52089600000002,207.61804800000002,209.7152,211.812352,213.90950400000003,216.00665600000002,218.10380800000001,220.20096,222.298112,224.39526400000003,226.49241600000002,228.589568,230.68672,232.783872,234.88102400000002,236.97817600000002,239.075328,241.17248,243.269632,245.36678400000002,247.46393600000002,249.561088,251.65824,253.75539200000003,255.85254400000002,257.949696,260.046848,262.144,264.241152,266.338304,268.43545600000004,270.53260800000004,272.62976000000003,274.726912,276.824064,278.921216,281.018368,283.11552,285.212672,287.309824,289.40697600000004,291.5041280
0000004,293.60128000000003,295.698432,297.795584,299.892736,301.989888,304.08704,306.184192,308.28134400000005,310.37849600000004,312.47564800000004,314.57280000000003,316.669952,318.767104,320.864256,322.961408,325.05856,327.155712,329.25286400000005,331.35001600000004,333.44716800000003,335.54432,337.641472,339.738624,341.835776,343.932928,346.03008,348.12723200000005,350.22438400000004],"y":[0.4970000088214874,0.5239999890327454,0.4900000095367431,0.5040000081062317,0.5099999904632568,0.4990000128746032,0.5170000195503235,0.5040000081062317,0.5009999871253967,0.5230000019073486,0.5109999775886536,0.5059999823570251,0.5130000114440918,0.5090000033378601,0.5180000066757202,0.5220000147819519,0.5189999938011169,0.5180000066757202,0.5220000147819519,0.5120000243186951,0.5460000038146973,0.5239999890327454,0.5289999842643738,0.5440000295639038,0.5339999794960022,0.5299999713897705,0.5260000228881836,0.5360000133514404,0.5339999794960022,0.5360000133514404,0.5299999713897705,0.5180000066757202,0.5249999761581421,0.5440000295639038,0.5299999713897705,0.5339999794960022,0.5239999890327454,0.527999997138977,0.5139999985694885,0.5289999842643738,0.5360000133514404,0.5260000228881836,0.5389999747276306,0.5460000038146973,0.5270000100135803,0.5339999794960022,0.5320000052452087,0.5329999923706055,0.5260000228881836,0.5220000147819519,0.5260000228881836,0.5379999876022339,0.5410000085830688,0.5350000262260437,0.5389999747276306,0.5320000052452087,0.5389999747276306,0.5379999876022339,0.5329999923706055,0.5270000100135803,0.5170000195503235,0.5329999923706055,0.5370000004768372,0.5379999876022339,0.5249999761581421,0.5479999780654907,0.546999990940094,0.5400000214576721,0.5440000295639038,0.5360000133514404,0.5450000166893005,0.5440000295639038,0.5370000004768372,0.5370000004768372,0.5479999780654907,0.5379999876022339,0.5400000214576721,0.5479999780654907,0.5379999876022339,0.5509999990463257,0.5440000295639038,0.5379999876022339,0.550000011920929,0.5389999747276306,0.5370000
004768372,0.5379999876022339,0.5419999957084656,0.5360000133514404,0.5509999990463257,0.5360000133514404,0.5419999957084656,0.5419999957084656,0.550000011920929,0.5360000133514404,0.5519999861717224,0.5540000200271606,0.546999990940094,0.5370000004768372,0.5379999876022339,0.5519999861717224,0.5329999923706055,0.5400000214576721,0.5429999828338623,0.550000011920929,0.5490000247955322,0.5360000133514404,0.550000011920929,0.5569999814033508,0.5490000247955322,0.5490000247955322,0.5479999780654907,0.5350000262260437,0.5490000247955322,0.5370000004768372,0.5440000295639038,0.5329999923706055,0.5440000295639038,0.5429999828338623,0.5389999747276306,0.5450000166893005,0.5320000052452087,0.5450000166893005,0.5400000214576721,0.5419999957084656,0.5460000038146973,0.5370000004768372,0.5400000214576721,0.5460000038146973,0.5370000004768372,0.5370000004768372,0.5460000038146973,0.5400000214576721,0.5490000247955322,0.5529999732971191,0.5379999876022339,0.5460000038146973,0.5450000166893005,0.5429999828338623,0.5460000038146973,0.5400000214576721,0.5479999780654907,0.5460000038146973,0.5540000200271606,0.5400000214576721,0.5350000262260437,0.5490000247955322,0.5460000038146973,0.5460000038146973,0.5509999990463257,0.5410000085830688,0.5429999828338623,0.5379999876022339,0.5450000166893005,0.5389999747276306,0.5400000214576721,0.5400000214576721,0.550000011920929,0.5440000295639038,0.5389999747276306,0.5450000166893005,0.5400000214576721,0.5389999747276306,0.5419999957084656,0.5410000085830688,0.5440000295639038,0.5519999861717224,0.5479999780654907,0.5450000166893005,0.5569999814033508],"label":"FineWeb filtered 
only"},"big-run-refinedweb":{"x":[0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002,29.360128000000003,31.45728,33.554432000000006,35.651584,37.748736,39.845888,41.94304,44.040192000000005,46.137344000000006,48.234496,50.331648,52.4288,54.525952000000004,56.623104000000005,58.720256000000006,60.817408,62.91456,65.011712,67.10886400000001,69.206016,71.303168,73.40032000000001,75.497472,77.59462400000001,79.691776,81.788928,83.88608,85.983232,88.08038400000001,90.177536,92.27468800000001,94.37184,96.468992,98.56614400000001,100.663296,102.76044800000001,104.8576,106.95475200000001,109.05190400000001,111.149056,113.24620800000001,115.34336,117.44051200000001,119.537664,121.634816,123.73196800000001,125.82912,127.92627200000001,130.023424,132.120576,134.21772800000002,136.31488000000002,138.412032,140.509184,142.606336,144.70348800000002,146.80064000000002,148.897792,150.994944,153.092096,155.18924800000002,157.28640000000001,159.383552,161.480704,163.577856,165.67500800000002,167.77216,169.869312,171.966464,174.06361600000002,176.16076800000002,178.25792,180.355072,182.452224,184.54937600000002,186.64652800000002,188.74368,190.840832,192.937984,195.03513600000002,197.13228800000002,199.22944,201.326592,203.423744,205.52089600000002,207.61804800000002,209.7152,211.812352,213.90950400000003,216.00665600000002,218.10380800000001,220.20096,222.298112,224.39526400000003,226.49241600000002,228.589568,230.68672,232.783872,234.88102400000002,236.97817600000002,239.075328,241.17248,243.269632,245.36678400000002,247.46393600000002,249.561088,251.65824,253.75539200000003,255.85254400000002,257.949696,260.046848,262.144,264.241152,266.338304,268.43545600000004,270.53260800000004,272.62976000000003,274.726912,276.824064,278.921216,281.018368,283.11552,285.212672,287.309824,289.40697600000004,291.50412800000004,293.60128000000
003,295.698432,297.795584,299.892736,301.989888,304.08704,306.184192,308.28134400000005,310.37849600000004,312.47564800000004,314.57280000000003,316.669952,318.767104,320.864256,322.961408,325.05856,327.155712,329.25286400000005,331.35001600000004,333.44716800000003,335.54432,337.641472,339.738624,341.835776,343.932928,346.03008,348.12723200000005,350.22438400000004],"y":[0.4970000088214874,0.5,0.4979999959468841,0.4950000047683716,0.4950000047683716,0.5049999952316284,0.5329999923706055,0.5220000147819519,0.5139999985694885,0.5339999794960022,0.5130000114440918,0.5389999747276306,0.5400000214576721,0.5270000100135803,0.5320000052452087,0.5260000228881836,0.5370000004768372,0.527999997138977,0.5289999842643738,0.5339999794960022,0.5270000100135803,0.531000018119812,0.527999997138977,0.5400000214576721,0.5479999780654907,0.550000011920929,0.5400000214576721,0.5350000262260437,0.5410000085830688,0.5379999876022339,0.5299999713897705,0.5490000247955322,0.5509999990463257,0.5519999861717224,0.5429999828338623,0.5429999828338623,0.5440000295639038,0.5379999876022339,0.5379999876022339,0.5419999957084656,0.5609999895095825,0.5540000200271606,0.5370000004768372,0.5440000295639038,0.5410000085830688,0.5379999876022339,0.5329999923706055,0.5419999957084656,0.5419999957084656,0.5519999861717224,0.550000011920929,0.5509999990463257,0.5400000214576721,0.5450000166893005,0.5509999990463257,0.5569999814033508,0.5550000071525574,0.5590000152587891,0.5479999780654907,0.5550000071525574,0.5440000295639038,0.5460000038146973,0.546999990940094,0.5559999942779541,0.5550000071525574,0.5490000247955322,0.5440000295639038,0.546999990940094,0.5450000166893005,0.546999990940094,0.5649999976158142,0.5490000247955322,0.5519999861717224,0.550000011920929,0.5509999990463257,0.5519999861717224,0.5519999861717224,0.5529999732971191,0.5490000247955322,0.546999990940094,0.550000011920929,0.5720000267028809,0.5619999766349792,0.5490000247955322,0.5680000185966492,0.5519999861717224,0.556999981403350
8,0.5509999990463257,0.5619999766349792,0.5630000233650208,0.5529999732971191,0.5619999766349792,0.5609999895095825,0.550000011920929,0.5479999780654907,0.5529999732971191,0.5519999861717224,0.5580000281333923,0.5590000152587891,0.5529999732971191,0.550000011920929,0.5680000185966492,0.5580000281333923,0.5630000233650208,0.5630000233650208,0.5559999942779541,0.5649999976158142,0.5569999814033508,0.5649999976158142,0.5659999847412109,0.5559999942779541,0.5659999847412109,0.5630000233650208,0.5509999990463257,0.5669999718666077,0.5669999718666077,0.5479999780654907,0.5540000200271606,0.5580000281333923,0.5519999861717224,0.5590000152587891,0.5590000152587891,0.5619999766349792,0.5509999990463257,0.546999990940094,0.5609999895095825,0.5540000200271606,0.5630000233650208,0.5580000281333923,0.5559999942779541,0.5680000185966492,0.5649999976158142,0.5619999766349792,0.5580000281333923,0.5630000233650208,0.5559999942779541,0.5540000200271606,0.5540000200271606,0.5569999814033508,0.5619999766349792,0.5559999942779541,0.5600000023841858,0.5460000038146973,0.5429999828338623,0.5580000281333923,0.5550000071525574,0.5580000281333923,0.5540000200271606,0.5609999895095825,0.5519999861717224,0.550000011920929,0.5519999861717224,0.5590000152587891,0.5619999766349792,0.5600000023841858,0.5590000152587891,0.5690000057220459,0.5640000104904175,0.5580000281333923,0.5559999942779541,0.5569999814033508,0.5569999814033508,0.5540000200271606,0.5640000104904175,0.5600000023841858,0.5550000071525574,0.5640000104904175,0.5600000023841858,0.5540000200271606],"label":"RefinedWeb"}},"layout":{"title":{"text":"Dedup across all dumps does not improve performance"}}}
dist/assets/data/plots/all_filtering_steps/agg_score.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"data":{"big-run-fineweb-v1-all-dumps":{"x":[0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,29.360128000000003,31.45728,33.554432000000006,35.651584,37.748736,39.845888,41.94304,44.040192000000005,46.137344000000006,48.234496,50.331648,52.4288,54.525952000000004,56.623104000000005,58.720256000000006,60.817408,62.91456,65.011712,67.10886400000001,69.206016,71.303168,73.40032000000001,75.497472,77.59462400000001,79.691776,81.788928,83.88608,85.983232,88.08038400000001,90.177536,92.27468800000001,94.37184,96.468992,98.56614400000001,100.663296,102.76044800000001,104.8576,106.95475200000001,109.05190400000001,111.149056,113.24620800000001,115.34336,117.44051200000001,119.537664,121.634816,123.73196800000001,125.82912,127.92627200000001,130.023424,132.120576,134.21772800000002,136.31488000000002,138.412032,140.509184,142.606336,144.70348800000002,146.80064000000002,148.897792,150.994944,153.092096,155.18924800000002,157.28640000000001,159.383552,161.480704,163.577856,165.67500800000002,167.77216,169.869312,171.966464,174.06361600000002,176.16076800000002,178.25792,180.355072,182.452224,184.54937600000002,186.64652800000002,188.74368,190.840832,192.937984,195.03513600000002,197.13228800000002,199.22944,201.326592,203.423744,205.52089600000002,207.61804800000002,209.7152,211.812352,213.90950400000003,216.00665600000002,218.10380800000001,220.20096,222.298112,224.39526400000003,226.49241600000002,228.589568,230.68672,232.783872,234.88102400000002,236.97817600000002,239.075328,241.17248,243.269632,245.36678400000002,247.46393600000002,249.561088,251.65824,253.75539200000003,255.85254400000002,257.949696,260.046848,262.144,264.241152,266.338304,268.43545600000004,270.53260800000004,272.62976000000003,274.726912,276.824064,278.921216,281.018368,283.11552,285.212672,287.309824,289.40697600000004,291.50412800000004,293.60128000000003,2
95.698432,297.795584,299.892736,301.989888,304.08704,306.184192,308.28134400000005,310.37849600000004,312.47564800000004,314.57280000000003,316.669952,318.767104,320.864256,322.961408,325.05856,327.155712,329.25286400000005,331.35001600000004,333.44716800000003,335.54432,337.641472,339.738624,341.835776,343.932928,346.03008,348.12723200000005,350.22438400000004],"y":[0.3308933284133672,0.3552836012095213,0.3781493119895458,0.3866849727928638,0.4050675220787525,0.4032807648181915,0.4174600429832935,0.4206059761345386,0.427497424185276,0.4316632784903049,0.4385909177362919,0.4334069043397903,0.4360812865197658,0.4404293224215507,0.4385774843394756,0.4407080821692943,0.4467254020273685,0.4470436163246631,0.4486658610403538,0.4459679573774338,0.4454015754163265,0.4515932314097881,0.4482216536998749,0.4484201297163963,0.455057855695486,0.4526158757507801,0.453176885843277,0.450159091502428,0.4516039006412029,0.4549933448433876,0.4555377587676048,0.4575010798871517,0.4577344059944153,0.4540543705224991,0.4537974074482918,0.4611785635352134,0.4586966186761856,0.4594406597316265,0.4598931074142456,0.457538403570652,0.4591932781040668,0.4636382386088371,0.4582749158143997,0.4625946804881096,0.4633439630270004,0.4666871763765812,0.4649887941777706,0.4671247974038124,0.4665776938199997,0.4672530107200145,0.4666078947484493,0.4666155055165291,0.4727727174758911,0.467480719089508,0.4681386984884739,0.4651658721268177,0.4668439887464046,0.4671731516718864,0.4719251021742821,0.4699816256761551,0.4723306186497211,0.4686817973852157,0.468911949545145,0.4714248068630695,0.4724191203713417,0.4700912088155746,0.4685601107776165,0.4716645181179046,0.4724556542932987,0.4670086726546287,0.4703365340828895,0.4698334187269211,0.471625205129385,0.4688323326408863,0.4735309742391109,0.4729253277182579,0.4747676998376846,0.4723741039633751,0.4764323942363262,0.4737579710781574,0.4758132360875606,0.4755662642419338,0.4730159305036068,0.4787128046154976,0.4740134924650192,0.4785312972962856,0.47
83577285706997,0.4752367511391639,0.474204134196043,0.4737414345145225,0.4780189953744411,0.477523285895586,0.4751617163419723,0.4776186011731624,0.4769949465990066,0.4790891669690609,0.479917362332344,0.4771673306822777,0.4825278185307979,0.4811677671968937,0.4787211790680885,0.4817796200513839,0.4819813556969166,0.4802381917834282,0.4810985140502453,0.481117732822895,0.4791575670242309,0.4798801243305206,0.4829155020415783,0.4822122864425182,0.4827562272548675,0.4839778505265713,0.4820474348962307,0.4858015961945057,0.4826803356409073,0.4831027314066887,0.4827458150684833,0.4819435514509678,0.4836879819631576,0.4835174195468426,0.4855972006917,0.4871680215001106,0.4840429238975048,0.4827739149332046,0.4881435632705688,0.4871019721031189,0.486987367272377,0.4836358055472374,0.4867987409234047,0.4869474284350872,0.4886575266718864,0.4855775311589241,0.4863000251352787,0.4841057248413563,0.488163661211729,0.4904011823236942,0.4870587214827537,0.4884037151932716,0.4873756393790245,0.4925794936716556,0.4874482750892639,0.4898910224437713,0.4893574342131614,0.4888269044458866,0.4887814335525036,0.4876748844981193,0.4853886738419533,0.4878034777939319,0.4911742769181728,0.4905468784272671,0.4896938055753708,0.4875142201781273,0.4900367334485054,0.4900274313986301,0.4905461706221103,0.4891181476414203,0.4881824217736721,0.4902780950069427,0.4895042479038238,0.4890727028250694,0.4897591508924961,0.4879062548279762,0.4897833876311779,0.4902243539690971,0.4884885586798191,0.4880276583135128,0.4927133433520794,0.4899616949260235],"label":"FineWeb: id mh + C4 + custom 
filters"},"big-run-sampled-fineweb-c4-filters":{"x":[0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,29.360128000000003,31.45728,33.554432000000006,35.651584,37.748736,39.845888,41.94304,44.040192000000005,46.137344000000006,48.234496,50.331648,52.4288,54.525952000000004,56.623104000000005,58.720256000000006,60.817408,62.91456,65.011712,67.10886400000001,69.206016,71.303168,73.40032000000001,75.497472,77.59462400000001,79.691776,81.788928,83.88608,85.983232,88.08038400000001,90.177536,92.27468800000001,94.37184,96.468992,98.56614400000001,100.663296,102.76044800000001,104.8576,106.95475200000001,109.05190400000001,111.149056,113.24620800000001,115.34336,117.44051200000001,119.537664,121.634816,123.73196800000001,125.82912,127.92627200000001,130.023424,132.120576,134.21772800000002,136.31488000000002,138.412032,140.509184,142.606336,144.70348800000002,146.80064000000002,148.897792,150.994944,153.092096,155.18924800000002,157.28640000000001,159.383552,161.480704,163.577856,165.67500800000002,167.77216,169.869312,171.966464,174.06361600000002,176.16076800000002,178.25792,180.355072,182.452224,184.54937600000002,186.64652800000002,188.74368,190.840832,192.937984,195.03513600000002,197.13228800000002,199.22944,201.326592,203.423744,205.52089600000002,207.61804800000002,209.7152,211.812352,213.90950400000003,216.00665600000002,218.10380800000001,220.20096,222.298112,224.39526400000003,226.49241600000002,228.589568,230.68672,232.783872,234.88102400000002,236.97817600000002,239.075328,241.17248,243.269632,245.36678400000002,247.46393600000002,249.561088,251.65824,253.75539200000003,255.85254400000002,257.949696,260.046848,262.144,264.241152,266.338304,268.43545600000004,270.53260800000004,272.62976000000003,274.726912,276.824064,278.921216,281.018368,283.11552,285.212672,287.309824,289.40697600000004,291.50412800000004,293.60128000000
003,295.698432,297.795584,299.892736,301.989888,304.08704,306.184192,308.28134400000005,310.37849600000004,312.47564800000004,314.57280000000003,316.669952,318.767104,320.864256,322.961408,325.05856,327.155712,329.25286400000005,331.35001600000004,333.44716800000003,335.54432,337.641472,339.738624,341.835776,343.932928,346.03008,348.12723200000005,350.22438400000004],"y":[0.3308933284133672,0.3593025095760822,0.3753932043910026,0.3896549865603447,0.4011945575475693,0.4079862833023071,0.4100634902715683,0.4188448339700699,0.4182912856340408,0.4209799654781818,0.426167830824852,0.4270535074174404,0.4293412938714027,0.4376098960638046,0.4369498938322067,0.4447805918753147,0.4420784451067447,0.4401859976351261,0.4450364373624325,0.4467439614236355,0.4494622647762298,0.4474291987717151,0.4474774301052093,0.4496959559619427,0.4504862427711487,0.4483809620141983,0.4500409476459026,0.4506221041083336,0.4519891515374183,0.4511651210486889,0.4493776857852936,0.4546159133315086,0.4542211070656776,0.4540864638984203,0.4535767734050751,0.4580400213599205,0.451940905302763,0.4536588154733181,0.4593464843928814,0.4576366357505321,0.4563389606773853,0.4556163437664509,0.4611873291432857,0.4606512449681759,0.4602674432098865,0.4573654346168041,0.4579697586596012,0.4577618762850761,0.465243399143219,0.4626524560153484,0.4652697443962097,0.4616814218461513,0.4664025083184242,0.4648593515157699,0.4665380977094173,0.4670920372009277,0.4651120826601982,0.4648002386093139,0.4674604535102844,0.4694998189806938,0.4647957049310207,0.4655059054493904,0.4694474637508392,0.4685290567576885,0.4678448662161827,0.4666110426187515,0.466820664703846,0.4703560136258602,0.4655868485569954,0.4657375514507293,0.4673589915037155,0.4694744572043419,0.4697113968431949,0.4663790501654148,0.4678909480571747,0.4731503240764141,0.4703953340649605,0.4711540788412094,0.4689725339412689,0.4709760397672653,0.4721849896013737,0.4684626050293445,0.4728966951370239,0.4708623439073562,0.4755619578063488,0.472218576818
7046,0.4752251170575619,0.4724387377500534,0.4767676629126072,0.4720797315239906,0.476152952760458,0.4784524105489254,0.472656887024641,0.4761070720851421,0.4791567139327526,0.4773554690182209,0.4749615713953972,0.4786102436482906,0.4776762872934341,0.4759960658848285,0.4783963784575462,0.4794723503291607,0.4783952049911022,0.4814380966126919,0.476895060390234,0.479157205671072,0.4783024378120899,0.4772652834653854,0.4805076755583286,0.4786335416138172,0.4829660281538963,0.4798073060810566,0.4846024662256241,0.4791539534926414,0.4836216196417808,0.482492484152317,0.4832956567406654,0.4811016321182251,0.480607770383358,0.4813096337020397,0.4819207563996315,0.482705220580101,0.4817859195172786,0.4817019775509834,0.4848218411207199,0.4850655570626259,0.4847046621143818,0.4811170361936092,0.4863272421061992,0.484540831297636,0.4826735481619835,0.4844910651445389,0.4825031049549579,0.4849743507802486,0.484294731169939,0.4857852198183536,0.4881704896688461,0.4850401543080807,0.4885894693434238,0.4855906665325165,0.4871751256287098,0.48358104377985,0.4859574064612388,0.4833582155406475,0.4867088869214058,0.4869902320206165,0.4876262210309505,0.4864178374409675,0.4864541031420231,0.4867057502269745,0.4884936697781086,0.4854058027267456,0.4880223199725151,0.4881350100040436,0.4871640801429748,0.4859121330082416,0.4894774369895458,0.4890438541769981,0.489189263433218,0.4893344156444073,0.4886334165930748,0.4900187514722347,0.4877792187035084,0.4887096807360649,0.4900767691433429,0.4877709597349167,0.48653694242239,0.4897000454366207],"label":"FineWeb: id mh + C4 
filters"},"big-run-sampled_full_ind_minhash":{"x":[0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,29.360128000000003,31.45728,33.554432000000006,35.651584,37.748736,39.845888,41.94304,44.040192000000005,46.137344000000006,48.234496,50.331648,52.4288,54.525952000000004,56.623104000000005,58.720256000000006,60.817408,62.91456,65.011712,67.10886400000001,69.206016,71.303168,73.40032000000001,75.497472,77.59462400000001,79.691776,81.788928,83.88608,85.983232,88.08038400000001,90.177536,92.27468800000001,94.37184,96.468992,98.56614400000001,100.663296,102.76044800000001,104.8576,106.95475200000001,109.05190400000001,111.149056,113.24620800000001,115.34336,117.44051200000001,119.537664,121.634816,123.73196800000001,125.82912,127.92627200000001,130.023424,132.120576,134.21772800000002,136.31488000000002,138.412032,140.509184,142.606336,144.70348800000002,146.80064000000002,148.897792,150.994944,153.092096,155.18924800000002,157.28640000000001,159.383552,161.480704,163.577856,165.67500800000002,167.77216,169.869312,171.966464,174.06361600000002,176.16076800000002,178.25792,180.355072,182.452224,184.54937600000002,186.64652800000002,188.74368,190.840832,192.937984,195.03513600000002,197.13228800000002,199.22944,201.326592,203.423744,205.52089600000002,207.61804800000002,209.7152,211.812352,213.90950400000003,216.00665600000002,218.10380800000001,220.20096,222.298112,224.39526400000003,226.49241600000002,228.589568,230.68672,232.783872,234.88102400000002,236.97817600000002,239.075328,241.17248,243.269632,245.36678400000002,247.46393600000002,249.561088,251.65824,253.75539200000003,255.85254400000002,257.949696,260.046848,262.144,264.241152,266.338304,268.43545600000004,270.53260800000004,272.62976000000003,274.726912,276.824064,278.921216,281.018368,283.11552,285.212672,287.309824,289.40697600000004,291.50412800000004,293.6012800000000
3,295.698432,297.795584,299.892736,301.989888,304.08704,306.184192,308.28134400000005,310.37849600000004,312.47564800000004,314.57280000000003,316.669952,318.767104,320.864256,322.961408,325.05856,327.155712,329.25286400000005,331.35001600000004,333.44716800000003,335.54432,337.641472,339.738624,341.835776,343.932928,346.03008,348.12723200000005,350.22438400000004],"y":[0.3308933284133672,0.3608616776764393,0.3745453506708145,0.3862277194857597,0.3989979773759842,0.406296543776989,0.4094927236437797,0.4138859286904335,0.4177777022123337,0.4208802655339241,0.4254550077021122,0.4283009432256222,0.429458349943161,0.4330311268568039,0.4349483698606491,0.4348161295056343,0.438955657184124,0.4389265701174736,0.4393925778567791,0.4383306242525577,0.4436748661100864,0.4423373565077781,0.4460027255117893,0.4440812170505523,0.4476902261376381,0.4465879611670971,0.4497823156416416,0.4513350501656532,0.4518667235970497,0.45149727165699,0.4513994492590427,0.4521937072277069,0.4520382955670357,0.4530793912708759,0.4516105614602566,0.4530563354492187,0.4495660625398159,0.4520940892398357,0.4561133235692978,0.4522969461977482,0.4575686641037464,0.4589144177734852,0.4582882039248943,0.457970168441534,0.4554797261953354,0.4622044861316681,0.4596928395330906,0.4624353349208832,0.4619148448109627,0.461100060492754,0.458431463688612,0.4620467089116573,0.4562215581536293,0.4620163068175316,0.4631462283432483,0.4600549824535846,0.4620365314185619,0.458735141903162,0.461642112582922,0.461245734244585,0.4645131677389145,0.4629777930676937,0.4651660025119781,0.4653937108814716,0.4676259346306324,0.4667201824486255,0.4650012850761413,0.4676916748285293,0.4708514772355556,0.4673572592437267,0.4689626581966877,0.4678038358688354,0.4667215310037136,0.4646228328347206,0.4662510119378567,0.4674677737057209,0.4690804108977318,0.4634581170976162,0.4701276533305645,0.4676450751721859,0.4672758504748344,0.4674397967755794,0.4656238108873367,0.4690065123140812,0.4677213467657566,0.4678985886275768,0.47
35414572060108,0.4705612398684025,0.4703374318778515,0.4704933613538742,0.4688010476529598,0.4699571952223778,0.4674785658717155,0.4701188169419765,0.4682065695524215,0.4729971997439861,0.4748715870082378,0.4745333231985569,0.4737020246684551,0.4747246317565441,0.4771635122597217,0.4740425907075405,0.475264236330986,0.4744705818593502,0.474684040993452,0.4721556939184665,0.475641455501318,0.476833701133728,0.4746401384472847,0.4742486327886581,0.4730467088520527,0.4773029200732708,0.4760043211281299,0.4770320989191532,0.4742161482572555,0.4780259765684604,0.4806670732796192,0.4784667380154133,0.4788618609309196,0.4762138128280639,0.4777246937155723,0.4796081893146038,0.4798486456274986,0.475479181855917,0.4779988899827003,0.4765858314931392,0.4772914499044418,0.47843898832798,0.4799034222960472,0.4803600236773491,0.4751846008002758,0.4777872562408447,0.4779460839927196,0.4787487275898456,0.4808406494557857,0.4810357913374901,0.4797308407723903,0.4800078608095646,0.4806460626423359,0.4810502976179123,0.4797912389039993,0.477332629263401,0.4818884879350662,0.482621606439352,0.4833096489310264,0.4821632876992225,0.4831674285233021,0.4830279909074306,0.4849893450736999,0.4845218025147915,0.4825541749596596,0.4833571836352348,0.4853803217411041,0.483093187212944,0.4850797094404697,0.485261783003807,0.4837660938501358,0.4835929833352566,0.4855643883347511,0.4832059442996979,0.484714712947607,0.4839249886572361,0.4829078912734985,0.4818423055112362,0.482727088034153,0.4824129492044449,0.4820138849318027,0.4865870922803879],"label":"FineWeb: independent MinHash (id 
mh)"},"big-run-sampled_full_filtered_no_dedup":{"x":[0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,29.360128000000003,31.45728,33.554432000000006,35.651584,37.748736,39.845888,41.94304,44.040192000000005,46.137344000000006,48.234496,50.331648,52.4288,54.525952000000004,56.623104000000005,58.720256000000006,60.817408,62.91456,65.011712,67.10886400000001,69.206016,71.303168,73.40032000000001,75.497472,77.59462400000001,79.691776,81.788928,83.88608,85.983232,88.08038400000001,90.177536,92.27468800000001,94.37184,96.468992,98.56614400000001,100.663296,102.76044800000001,104.8576,106.95475200000001,109.05190400000001,111.149056,113.24620800000001,115.34336,117.44051200000001,119.537664,121.634816,123.73196800000001,125.82912,127.92627200000001,130.023424,132.120576,134.21772800000002,136.31488000000002,138.412032,140.509184,142.606336,144.70348800000002,146.80064000000002,148.897792,150.994944,153.092096,155.18924800000002,157.28640000000001,159.383552,161.480704,163.577856,165.67500800000002,167.77216,169.869312,171.966464,174.06361600000002,176.16076800000002,178.25792,180.355072,182.452224,184.54937600000002,186.64652800000002,188.74368,190.840832,192.937984,195.03513600000002,197.13228800000002,199.22944,201.326592,203.423744,205.52089600000002,207.61804800000002,209.7152,211.812352,213.90950400000003,216.00665600000002,218.10380800000001,220.20096,222.298112,224.39526400000003,226.49241600000002,228.589568,230.68672,232.783872,234.88102400000002,236.97817600000002,239.075328,241.17248,243.269632,245.36678400000002,247.46393600000002,249.561088,251.65824,253.75539200000003,255.85254400000002,257.949696,260.046848,262.144,264.241152,266.338304,268.43545600000004,270.53260800000004,272.62976000000003,274.726912,276.824064,278.921216,281.018368,283.11552,285.212672,287.309824,289.40697600000004,291.50412800000004,293.60128000000
003,295.698432,297.795584,299.892736,301.989888,304.08704,306.184192,308.28134400000005,310.37849600000004,312.47564800000004,314.57280000000003,316.669952,318.767104,320.864256,322.961408,325.05856,327.155712,329.25286400000005,331.35001600000004,333.44716800000003,335.54432,337.641472,339.738624,341.835776,343.932928,346.03008,348.12723200000005,350.22438400000004],"y":[0.3308933284133672,0.3605199865996837,0.3733148723840713,0.3882005847990513,0.3934122696518898,0.3947227671742439,0.4042885974049568,0.3974800482392311,0.4055779427289963,0.4133470430970192,0.4117913842201233,0.4113653488457203,0.4149517640471458,0.4187851920723915,0.4206527359783649,0.4240428246557712,0.422003373503685,0.4280910938978195,0.4244147576391697,0.4316282644867897,0.4295645765960216,0.4310102686285972,0.4360743537545204,0.4313482865691185,0.4350991360843181,0.4378576353192329,0.4335876516997814,0.4347924515604973,0.4348904751241207,0.436600212007761,0.430036511272192,0.4350974671542644,0.4399556629359722,0.4371416717767715,0.4363861419260502,0.4376698136329651,0.4405004419386387,0.4373639523983001,0.4379038028419018,0.4371281825006008,0.4393439553678036,0.440426729619503,0.4401675276458263,0.4429537951946258,0.4449137263000011,0.4434786736965179,0.4450470842421055,0.4454202279448509,0.4394537284970283,0.442185215651989,0.4461225643754005,0.4427758157253265,0.4430646039545536,0.4476901069283485,0.4478763341903686,0.4493869319558143,0.4448477327823639,0.450044184923172,0.4498609118163585,0.4457665979862213,0.4506924152374267,0.449855338782072,0.448790930211544,0.4474099352955818,0.4546772800385952,0.4529431238770485,0.452015146613121,0.4502020999789238,0.4493804536759853,0.4523266032338142,0.4551868587732315,0.4501944817602634,0.4493303671479225,0.4526805207133293,0.4533850513398647,0.4518048763275146,0.4518973492085933,0.4531301632523536,0.4518006071448326,0.4553494565188885,0.4528752230107784,0.4536322727799415,0.4561733976006508,0.4549491256475448,0.4574789106845855,0.4577847123146057,
0.4563642293214798,0.4578686729073524,0.4561499990522861,0.4537816494703293,0.4542164430022239,0.4559455662965774,0.4554723873734474,0.4575514122843742,0.4575202167034149,0.4592722058296203,0.4585275091230869,0.4580587856471538,0.456934317946434,0.4577495418488979,0.4540119916200638,0.4570806957781315,0.4608120545744896,0.4588425755500793,0.4578334167599678,0.4610816091299057,0.4598177038133144,0.461849745362997,0.4631866924464702,0.4601576402783394,0.4646804705262184,0.4632389545440674,0.4604574106633663,0.4602976888418197,0.4581312239170074,0.4654182009398937,0.4655338563024997,0.4616620391607284,0.461054053157568,0.4613021649420261,0.4658613465726375,0.4633531905710697,0.4613638147711754,0.4643996246159076,0.462500050663948,0.4650798961520195,0.4648764543235302,0.4639869071543216,0.4634246975183487,0.46585888043046,0.4639799632132053,0.4630857892334461,0.4644265696406364,0.4642998576164245,0.4686848931014538,0.4687492996454239,0.4650243632495403,0.4627032242715359,0.4665953740477562,0.4660026729106903,0.4664581045508384,0.4676475040614605,0.4657339677214622,0.4664678275585174,0.4673498086631298,0.4676674827933311,0.4680955372750759,0.4681585058569908,0.4659864418208599,0.4686457589268684,0.4661462865769863,0.4658931568264961,0.4674226939678192,0.46805215254426,0.4682257212698459,0.4689070098102093,0.4699570722877979,0.4655096270143986,0.4688013233244419,0.4707522802054882,0.4661469310522079,0.4688841328024864,0.4671329781413078,0.4662554152309894,0.4697433896362781,0.4698473587632179,0.4676505327224731,0.4696521013975143],"label":"FineWeb: base filtering only"}},"layout":{"title":{"text":"The different FineWeb processing steps"}}}
dist/assets/data/plots/all_filtering_steps/arc_acc_norm.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"data":{"big-run-sampled_full_ind_minhash":{"x":[0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,29.360128000000003,31.45728,33.554432000000006,35.651584,37.748736,39.845888,41.94304,44.040192000000005,46.137344000000006,48.234496,50.331648,52.4288,54.525952000000004,56.623104000000005,58.720256000000006,60.817408,62.91456,65.011712,67.10886400000001,69.206016,71.303168,73.40032000000001,75.497472,77.59462400000001,79.691776,81.788928,83.88608,85.983232,88.08038400000001,90.177536,92.27468800000001,94.37184,96.468992,98.56614400000001,100.663296,102.76044800000001,104.8576,106.95475200000001,109.05190400000001,111.149056,113.24620800000001,115.34336,117.44051200000001,119.537664,121.634816,123.73196800000001,125.82912,127.92627200000001,130.023424,132.120576,134.21772800000002,136.31488000000002,138.412032,140.509184,142.606336,144.70348800000002,146.80064000000002,148.897792,150.994944,153.092096,155.18924800000002,157.28640000000001,159.383552,161.480704,163.577856,165.67500800000002,167.77216,169.869312,171.966464,174.06361600000002,176.16076800000002,178.25792,180.355072,182.452224,184.54937600000002,186.64652800000002,188.74368,190.840832,192.937984,195.03513600000002,197.13228800000002,199.22944,201.326592,203.423744,205.52089600000002,207.61804800000002,209.7152,211.812352,213.90950400000003,216.00665600000002,218.10380800000001,220.20096,222.298112,224.39526400000003,226.49241600000002,228.589568,230.68672,232.783872,234.88102400000002,236.97817600000002,239.075328,241.17248,243.269632,245.36678400000002,247.46393600000002,249.561088,251.65824,253.75539200000003,255.85254400000002,257.949696,260.046848,262.144,264.241152,266.338304,268.43545600000004,270.53260800000004,272.62976000000003,274.726912,276.824064,278.921216,281.018368,283.11552,285.212672,287.309824,289.40697600000004,291.50412800000004,293.601280000000
03,295.698432,297.795584,299.892736,301.989888,304.08704,306.184192,308.28134400000005,310.37849600000004,312.47564800000004,314.57280000000003,316.669952,318.767104,320.864256,322.961408,325.05856,327.155712,329.25286400000005,331.35001600000004,333.44716800000003,335.54432,337.641472,339.738624,341.835776,343.932928,346.03008,348.12723200000005,350.22438400000004],"y":[0.2509999871253967,0.2939999997615814,0.3174999952316284,0.3294999897480011,0.3510000109672546,0.3485000133514404,0.3634999990463257,0.3700000047683716,0.3524999916553497,0.375,0.3804999887943268,0.37950000166893,0.3824999928474426,0.3799999952316284,0.395000010728836,0.3844999969005584,0.3894999921321869,0.3855000138282776,0.3955000042915344,0.3995000123977661,0.4009999930858612,0.3939999938011169,0.3970000147819519,0.3955000042915344,0.3955000042915344,0.4079999923706054,0.3959999978542328,0.4090000092983246,0.4045000076293945,0.3930000066757202,0.4099999964237213,0.4054999947547912,0.4124999940395355,0.4160000085830688,0.4149999916553497,0.4070000052452087,0.4110000133514404,0.4144999980926513,0.4120000004768371,0.4050000011920929,0.4165000021457672,0.4180000126361847,0.4050000011920929,0.4120000004768371,0.4135000109672546,0.4320000112056732,0.4284999966621399,0.4269999861717224,0.414000004529953,0.4255000054836273,0.4165000021457672,0.4144999980926513,0.4079999923706054,0.4205000102519989,0.4180000126361847,0.4244999885559082,0.4235000014305115,0.4244999885559082,0.4300000071525574,0.4160000085830688,0.4205000102519989,0.4329999983310699,0.4280000030994415,0.4244999885559082,0.4375,0.4244999885559082,0.4365000128746032,0.4329999983310699,0.4424999952316284,0.4390000104904175,0.4449999928474426,0.445499986410141,0.4320000112056732,0.4365000128746032,0.4244999885559082,0.429500013589859,0.4395000040531158,0.4284999966621399,0.44200000166893,0.4370000064373016,0.4399999976158142,0.4334999918937683,0.4429999887943268,0.44200000166893,0.4334999918937683,0.4384999871253967,0.4365000128746032,0.439000
0104904175,0.4354999959468841,0.44200000166893,0.4350000023841858,0.4390000104904175,0.4404999911785126,0.4410000145435333,0.4305000007152557,0.4490000009536743,0.4510000050067901,0.4605000019073486,0.4490000009536743,0.449999988079071,0.4595000147819519,0.4514999985694885,0.4490000009536743,0.4474999904632568,0.4444999992847442,0.4524999856948852,0.4465000033378601,0.4519999921321869,0.4550000131130218,0.4524999856948852,0.4429999887943268,0.4550000131130218,0.4510000050067901,0.4560000002384186,0.4465000033378601,0.4485000073909759,0.4524999856948852,0.4440000057220459,0.457500010728836,0.4544999897480011,0.4480000138282776,0.4584999978542328,0.4544999897480011,0.4569999873638153,0.4584999978542328,0.4444999992847442,0.4629999995231628,0.457500010728836,0.4555000066757202,0.4569999873638153,0.4474999904632568,0.4564999938011169,0.4595000147819519,0.4634999930858612,0.4555000066757202,0.453000009059906,0.457500010728836,0.4614999890327453,0.460999995470047,0.4539999961853027,0.4595000147819519,0.4629999995231628,0.4670000076293945,0.4580000042915344,0.4639999866485595,0.457500010728836,0.4595000147819519,0.4665000140666961,0.4584999978542328,0.4629999995231628,0.4595000147819519,0.4659999907016754,0.4645000100135803,0.4675000011920929,0.4690000116825104,0.4715000092983246,0.4634999930858612,0.4634999930858612,0.4639999866485595,0.465499997138977,0.4675000011920929,0.4670000076293945,0.4600000083446502,0.4595000147819519,0.4625000059604645,0.4600000083446502,0.4645000100135803,0.4715000092983246],"label":"FineWeb: independent MinHash (id 
mh)"},"big-run-sampled-fineweb-c4-filters":{"x":[0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,29.360128000000003,31.45728,33.554432000000006,35.651584,37.748736,39.845888,41.94304,44.040192000000005,46.137344000000006,48.234496,50.331648,52.4288,54.525952000000004,56.623104000000005,58.720256000000006,60.817408,62.91456,65.011712,67.10886400000001,69.206016,71.303168,73.40032000000001,75.497472,77.59462400000001,79.691776,81.788928,83.88608,85.983232,88.08038400000001,90.177536,92.27468800000001,94.37184,96.468992,98.56614400000001,100.663296,102.76044800000001,104.8576,106.95475200000001,109.05190400000001,111.149056,113.24620800000001,115.34336,117.44051200000001,119.537664,121.634816,123.73196800000001,125.82912,127.92627200000001,130.023424,132.120576,134.21772800000002,136.31488000000002,138.412032,140.509184,142.606336,144.70348800000002,146.80064000000002,148.897792,150.994944,153.092096,155.18924800000002,157.28640000000001,159.383552,161.480704,163.577856,165.67500800000002,167.77216,169.869312,171.966464,174.06361600000002,176.16076800000002,178.25792,180.355072,182.452224,184.54937600000002,186.64652800000002,188.74368,190.840832,192.937984,195.03513600000002,197.13228800000002,199.22944,201.326592,203.423744,205.52089600000002,207.61804800000002,209.7152,211.812352,213.90950400000003,216.00665600000002,218.10380800000001,220.20096,222.298112,224.39526400000003,226.49241600000002,228.589568,230.68672,232.783872,234.88102400000002,236.97817600000002,239.075328,241.17248,243.269632,245.36678400000002,247.46393600000002,249.561088,251.65824,253.75539200000003,255.85254400000002,257.949696,260.046848,262.144,264.241152,266.338304,268.43545600000004,270.53260800000004,272.62976000000003,274.726912,276.824064,278.921216,281.018368,283.11552,285.212672,287.309824,289.40697600000004,291.50412800000004,293.60128000000003,
295.698432,297.795584,299.892736,301.989888,304.08704,306.184192,308.28134400000005,310.37849600000004,312.47564800000004,314.57280000000003,316.669952,318.767104,320.864256,322.961408,325.05856,327.155712,329.25286400000005,331.35001600000004,333.44716800000003,335.54432,337.641472,339.738624,341.835776,343.932928,346.03008,348.12723200000005,350.22438400000004],"y":[0.2509999871253967,0.2985000014305115,0.3269999921321869,0.340499997138977,0.3495000004768371,0.3535000085830688,0.3519999980926513,0.3625000119209289,0.3569999933242798,0.3659999966621399,0.3619999885559082,0.3759999871253967,0.3779999911785126,0.3919999897480011,0.3835000097751617,0.402999997138977,0.3899999856948852,0.3869999945163727,0.3885000050067901,0.3989999890327453,0.390500009059906,0.4054999947547912,0.398499995470047,0.3989999890327453,0.398499995470047,0.4014999866485595,0.398499995470047,0.4135000109672546,0.4045000076293945,0.4144999980926513,0.4079999923706054,0.4124999940395355,0.4169999957084656,0.4074999988079071,0.4205000102519989,0.4135000109672546,0.4160000085830688,0.4124999940395355,0.4225000143051147,0.4214999973773956,0.418500006198883,0.4115000069141388,0.4165000021457672,0.4199999868869781,0.418500006198883,0.414000004529953,0.4194999933242798,0.4095000028610229,0.4214999973773956,0.4149999916553497,0.426499992609024,0.4160000085830688,0.4169999957084656,0.4314999878406524,0.4404999911785126,0.4325000047683716,0.4305000007152557,0.4275000095367431,0.4250000119209289,0.4230000078678131,0.4214999973773956,0.4275000095367431,0.4354999959468841,0.4235000014305115,0.4244999885559082,0.4199999868869781,0.4235000014305115,0.4275000095367431,0.4205000102519989,0.4244999885559082,0.4230000078678131,0.4235000014305115,0.4280000030994415,0.4305000007152557,0.4305000007152557,0.4359999895095825,0.4345000088214874,0.4395000040531158,0.4280000030994415,0.4350000023841858,0.4365000128746032,0.4255000054836273,0.4339999854564667,0.4314999878406524,0.4329999983310699,0.4345000088214874,0.439
5000040531158,0.4350000023841858,0.4535000026226043,0.4449999928474426,0.445499986410141,0.4404999911785126,0.4424999952316284,0.4505000114440918,0.4440000057220459,0.4519999921321869,0.4449999928474426,0.4474999904632568,0.4494999945163727,0.4494999945163727,0.445499986410141,0.4510000050067901,0.4524999856948852,0.4395000040531158,0.4444999992847442,0.4469999969005584,0.4460000097751617,0.4539999961853027,0.4494999945163727,0.4465000033378601,0.4544999897480011,0.4474999904632568,0.4550000131130218,0.4510000050067901,0.4555000066757202,0.4480000138282776,0.4589999914169311,0.4550000131130218,0.4510000050067901,0.4519999921321869,0.4514999985694885,0.4539999961853027,0.4535000026226043,0.4569999873638153,0.4620000123977661,0.4634999930858612,0.4555000066757202,0.4465000033378601,0.4550000131130218,0.4485000073909759,0.4435000121593475,0.4480000138282776,0.4555000066757202,0.4469999969005584,0.4535000026226043,0.4555000066757202,0.4519999921321869,0.4485000073909759,0.4639999866485595,0.4584999978542328,0.4490000009536743,0.4524999856948852,0.453000009059906,0.4535000026226043,0.460999995470047,0.4589999914169311,0.4544999897480011,0.4589999914169311,0.4569999873638153,0.4544999897480011,0.4625000059604645,0.4474999904632568,0.4510000050067901,0.4480000138282776,0.453000009059906,0.4460000097751617,0.460999995470047,0.4634999930858612,0.4679999947547912,0.4639999866485595,0.4720000028610229,0.4659999907016754,0.4650000035762787,0.4620000123977661,0.4659999907016754,0.465499997138977,0.4595000147819519,0.4620000123977661],"label":"FineWeb: id mh + C4 
filters"},"big-run-fineweb-v1-all-dumps":{"x":[0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,29.360128000000003,31.45728,33.554432000000006,35.651584,37.748736,39.845888,41.94304,44.040192000000005,46.137344000000006,48.234496,50.331648,52.4288,54.525952000000004,56.623104000000005,58.720256000000006,60.817408,62.91456,65.011712,67.10886400000001,69.206016,71.303168,73.40032000000001,75.497472,77.59462400000001,79.691776,81.788928,83.88608,85.983232,88.08038400000001,90.177536,92.27468800000001,94.37184,96.468992,98.56614400000001,100.663296,102.76044800000001,104.8576,106.95475200000001,109.05190400000001,111.149056,113.24620800000001,115.34336,117.44051200000001,119.537664,121.634816,123.73196800000001,125.82912,127.92627200000001,130.023424,132.120576,134.21772800000002,136.31488000000002,138.412032,140.509184,142.606336,144.70348800000002,146.80064000000002,148.897792,150.994944,153.092096,155.18924800000002,157.28640000000001,159.383552,161.480704,163.577856,165.67500800000002,167.77216,169.869312,171.966464,174.06361600000002,176.16076800000002,178.25792,180.355072,182.452224,184.54937600000002,186.64652800000002,188.74368,190.840832,192.937984,195.03513600000002,197.13228800000002,199.22944,201.326592,203.423744,205.52089600000002,207.61804800000002,209.7152,211.812352,213.90950400000003,216.00665600000002,218.10380800000001,220.20096,222.298112,224.39526400000003,226.49241600000002,228.589568,230.68672,232.783872,234.88102400000002,236.97817600000002,239.075328,241.17248,243.269632,245.36678400000002,247.46393600000002,249.561088,251.65824,253.75539200000003,255.85254400000002,257.949696,260.046848,262.144,264.241152,266.338304,268.43545600000004,270.53260800000004,272.62976000000003,274.726912,276.824064,278.921216,281.018368,283.11552,285.212672,287.309824,289.40697600000004,291.50412800000004,293.60128000000003,29
5.698432,297.795584,299.892736,301.989888,304.08704,306.184192,308.28134400000005,310.37849600000004,312.47564800000004,314.57280000000003,316.669952,318.767104,320.864256,322.961408,325.05856,327.155712,329.25286400000005,331.35001600000004,333.44716800000003,335.54432,337.641472,339.738624,341.835776,343.932928,346.03008,348.12723200000005,350.22438400000004],"y":[0.2509999871253967,0.296999990940094,0.3219999969005584,0.3305000066757202,0.3555000126361847,0.351500004529953,0.3600000143051147,0.363999992609024,0.3680000007152557,0.3785000145435333,0.3765000104904175,0.382999986410141,0.3785000145435333,0.3835000097751617,0.3819999992847442,0.3935000002384186,0.387499988079071,0.3935000002384186,0.3959999978542328,0.3860000073909759,0.3935000002384186,0.3885000050067901,0.3810000121593475,0.3880000114440918,0.3964999914169311,0.4054999947547912,0.3935000002384186,0.3944999873638153,0.3989999890327453,0.3980000019073486,0.4050000011920929,0.4054999947547912,0.4009999930858612,0.4110000133514404,0.4054999947547912,0.4180000126361847,0.4110000133514404,0.4050000011920929,0.4079999923706054,0.4120000004768371,0.402999997138977,0.4205000102519989,0.4129999876022339,0.4120000004768371,0.4169999957084656,0.4269999861717224,0.4230000078678131,0.4225000143051147,0.4300000071525574,0.4180000126361847,0.4284999966621399,0.4165000021457672,0.4325000047683716,0.4235000014305115,0.4210000038146972,0.4239999949932098,0.4235000014305115,0.421999990940094,0.4280000030994415,0.4300000071525574,0.4275000095367431,0.4305000007152557,0.4244999885559082,0.4314999878406524,0.4325000047683716,0.4395000040531158,0.4325000047683716,0.4300000071525574,0.4399999976158142,0.4320000112056732,0.4370000064373016,0.4280000030994415,0.4309999942779541,0.4314999878406524,0.4370000064373016,0.4280000030994415,0.4325000047683716,0.4300000071525574,0.4334999918937683,0.4334999918937683,0.4379999935626983,0.4399999976158142,0.4350000023841858,0.4395000040531158,0.4375,0.4390000104904175,0.43650001287460
32,0.4435000121593475,0.4365000128746032,0.445499986410141,0.4440000057220459,0.4460000097751617,0.4415000081062317,0.4415000081062317,0.4339999854564667,0.4429999887943268,0.4399999976158142,0.4359999895095825,0.4370000064373016,0.4469999969005584,0.4404999911785126,0.4435000121593475,0.445499986410141,0.4424999952316284,0.4480000138282776,0.4370000064373016,0.4444999992847442,0.4465000033378601,0.4309999942779541,0.4440000057220459,0.4469999969005584,0.4539999961853027,0.4440000057220459,0.4555000066757202,0.4519999921321869,0.4510000050067901,0.4519999921321869,0.4544999897480011,0.4494999945163727,0.4584999978542328,0.4580000042915344,0.4544999897480011,0.4514999985694885,0.4550000131130218,0.4560000002384186,0.4600000083446502,0.4589999914169311,0.4560000002384186,0.457500010728836,0.4679999947547912,0.4494999945163727,0.4505000114440918,0.4440000057220459,0.4539999961853027,0.4535000026226043,0.4514999985694885,0.457500010728836,0.4620000123977661,0.4564999938011169,0.4595000147819519,0.4564999938011169,0.4550000131130218,0.4539999961853027,0.4544999897480011,0.4569999873638153,0.457500010728836,0.4539999961853027,0.4595000147819519,0.4665000140666961,0.465499997138977,0.4625000059604645,0.4629999995231628,0.4580000042915344,0.4569999873638153,0.4620000123977661,0.457500010728836,0.4550000131130218,0.4645000100135803,0.4629999995231628,0.4584999978542328,0.465499997138977,0.460999995470047,0.4634999930858612,0.4605000019073486,0.4584999978542328,0.4550000131130218,0.4564999938011169,0.4600000083446502],"label":"FineWeb: id mh + C4 + custom 
filters"},"big-run-sampled_full_filtered_no_dedup":{"x":[0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,29.360128000000003,31.45728,33.554432000000006,35.651584,37.748736,39.845888,41.94304,44.040192000000005,46.137344000000006,48.234496,50.331648,52.4288,54.525952000000004,56.623104000000005,58.720256000000006,60.817408,62.91456,65.011712,67.10886400000001,69.206016,71.303168,73.40032000000001,75.497472,77.59462400000001,79.691776,81.788928,83.88608,85.983232,88.08038400000001,90.177536,92.27468800000001,94.37184,96.468992,98.56614400000001,100.663296,102.76044800000001,104.8576,106.95475200000001,109.05190400000001,111.149056,113.24620800000001,115.34336,117.44051200000001,119.537664,121.634816,123.73196800000001,125.82912,127.92627200000001,130.023424,132.120576,134.21772800000002,136.31488000000002,138.412032,140.509184,142.606336,144.70348800000002,146.80064000000002,148.897792,150.994944,153.092096,155.18924800000002,157.28640000000001,159.383552,161.480704,163.577856,165.67500800000002,167.77216,169.869312,171.966464,174.06361600000002,176.16076800000002,178.25792,180.355072,182.452224,184.54937600000002,186.64652800000002,188.74368,190.840832,192.937984,195.03513600000002,197.13228800000002,199.22944,201.326592,203.423744,205.52089600000002,207.61804800000002,209.7152,211.812352,213.90950400000003,216.00665600000002,218.10380800000001,220.20096,222.298112,224.39526400000003,226.49241600000002,228.589568,230.68672,232.783872,234.88102400000002,236.97817600000002,239.075328,241.17248,243.269632,245.36678400000002,247.46393600000002,249.561088,251.65824,253.75539200000003,255.85254400000002,257.949696,260.046848,262.144,264.241152,266.338304,268.43545600000004,270.53260800000004,272.62976000000003,274.726912,276.824064,278.921216,281.018368,283.11552,285.212672,287.309824,289.40697600000004,291.50412800000004,293.6012800
0000003,295.698432,297.795584,299.892736,301.989888,304.08704,306.184192,308.28134400000005,310.37849600000004,312.47564800000004,314.57280000000003,316.669952,318.767104,320.864256,322.961408,325.05856,327.155712,329.25286400000005,331.35001600000004,333.44716800000003,335.54432,337.641472,339.738624,341.835776,343.932928,346.03008,348.12723200000005,350.22438400000004],"y":[0.2509999871253967,0.2894999980926513,0.3235000073909759,0.3389999866485595,0.3384999930858612,0.3459999859333038,0.359499990940094,0.3429999947547912,0.3619999885559082,0.3564999997615814,0.3625000119209289,0.363999992609024,0.3680000007152557,0.3680000007152557,0.3684999942779541,0.375,0.3734999895095825,0.3849999904632568,0.3944999873638153,0.3865000009536743,0.395000010728836,0.3935000002384186,0.3980000019073486,0.3910000026226043,0.3885000050067901,0.3914999961853027,0.3815000057220459,0.395000010728836,0.3894999921321869,0.395000010728836,0.3935000002384186,0.4034999907016754,0.4004999995231628,0.3970000147819519,0.3975000083446502,0.3995000123977661,0.3980000019073486,0.4034999907016754,0.3959999978542328,0.3989999890327453,0.402999997138977,0.3880000114440918,0.3980000019073486,0.4040000140666961,0.3989999890327453,0.3970000147819519,0.3925000131130218,0.4120000004768371,0.3935000002384186,0.395000010728836,0.4070000052452087,0.3935000002384186,0.4034999907016754,0.4189999997615814,0.4129999876022339,0.4160000085830688,0.4149999916553497,0.418500006198883,0.4225000143051147,0.4174999892711639,0.4210000038146972,0.4045000076293945,0.4079999923706054,0.4124999940395355,0.4144999980926513,0.4169999957084656,0.4194999933242798,0.4154999852180481,0.4169999957084656,0.4225000143051147,0.4225000143051147,0.4230000078678131,0.4160000085830688,0.4325000047683716,0.4325000047683716,0.4199999868869781,0.4199999868869781,0.4189999997615814,0.4269999861717224,0.4259999990463257,0.4230000078678131,0.4144999980926513,0.4329999983310699,0.4275000095367431,0.4305000007152557,0.4289999902248382,0.423500
0014305115,0.4235000014305115,0.4325000047683716,0.4244999885559082,0.4314999878406524,0.4194999933242798,0.4350000023841858,0.4269999861717224,0.4235000014305115,0.4300000071525574,0.4284999966621399,0.4255000054836273,0.4280000030994415,0.4345000088214874,0.4225000143051147,0.4334999918937683,0.4300000071525574,0.4350000023841858,0.429500013589859,0.4325000047683716,0.4384999871253967,0.4345000088214874,0.4354999959468841,0.4359999895095825,0.4354999959468841,0.4424999952316284,0.4424999952316284,0.4320000112056732,0.4280000030994415,0.4390000104904175,0.4480000138282776,0.4415000081062317,0.4384999871253967,0.4390000104904175,0.4494999945163727,0.4449999928474426,0.4384999871253967,0.4424999952316284,0.4359999895095825,0.445499986410141,0.4399999976158142,0.4375,0.4410000145435333,0.4384999871253967,0.4375,0.4329999983310699,0.4370000064373016,0.4354999959468841,0.4440000057220459,0.4384999871253967,0.4384999871253967,0.4390000104904175,0.4424999952316284,0.4379999935626983,0.4345000088214874,0.4354999959468841,0.4440000057220459,0.4395000040531158,0.4465000033378601,0.4404999911785126,0.4505000114440918,0.4480000138282776,0.4449999928474426,0.445499986410141,0.4410000145435333,0.4485000073909759,0.4460000097751617,0.4480000138282776,0.4465000033378601,0.4460000097751617,0.4460000097751617,0.4395000040531158,0.4474999904632568,0.4469999969005584,0.4404999911785126,0.4440000057220459,0.4435000121593475,0.4435000121593475,0.4514999985694885,0.4474999904632568,0.4474999904632568,0.445499986410141],"label":"FineWeb: base filtering only"}},"layout":{"title":{"text":"The different FineWeb processing steps"}}}
dist/assets/data/plots/all_filtering_steps/commonsense_qa_acc_norm.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"data":{"big-run-sampled_full_ind_minhash":{"x":[0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,29.360128000000003,31.45728,33.554432000000006,35.651584,37.748736,39.845888,41.94304,44.040192000000005,46.137344000000006,48.234496,50.331648,52.4288,54.525952000000004,56.623104000000005,58.720256000000006,60.817408,62.91456,65.011712,67.10886400000001,69.206016,71.303168,73.40032000000001,75.497472,77.59462400000001,79.691776,81.788928,83.88608,85.983232,88.08038400000001,90.177536,92.27468800000001,94.37184,96.468992,98.56614400000001,100.663296,102.76044800000001,104.8576,106.95475200000001,109.05190400000001,111.149056,113.24620800000001,115.34336,117.44051200000001,119.537664,121.634816,123.73196800000001,125.82912,127.92627200000001,130.023424,132.120576,134.21772800000002,136.31488000000002,138.412032,140.509184,142.606336,144.70348800000002,146.80064000000002,148.897792,150.994944,153.092096,155.18924800000002,157.28640000000001,159.383552,161.480704,163.577856,165.67500800000002,167.77216,169.869312,171.966464,174.06361600000002,176.16076800000002,178.25792,180.355072,182.452224,184.54937600000002,186.64652800000002,188.74368,190.840832,192.937984,195.03513600000002,197.13228800000002,199.22944,201.326592,203.423744,205.52089600000002,207.61804800000002,209.7152,211.812352,213.90950400000003,216.00665600000002,218.10380800000001,220.20096,222.298112,224.39526400000003,226.49241600000002,228.589568,230.68672,232.783872,234.88102400000002,236.97817600000002,239.075328,241.17248,243.269632,245.36678400000002,247.46393600000002,249.561088,251.65824,253.75539200000003,255.85254400000002,257.949696,260.046848,262.144,264.241152,266.338304,268.43545600000004,270.53260800000004,272.62976000000003,274.726912,276.824064,278.921216,281.018368,283.11552,285.212672,287.309824,289.40697600000004,291.50412800000004,293.601280000000
03,295.698432,297.795584,299.892736,301.989888,304.08704,306.184192,308.28134400000005,310.37849600000004,312.47564800000004,314.57280000000003,316.669952,318.767104,320.864256,322.961408,325.05856,327.155712,329.25286400000005,331.35001600000004,333.44716800000003,335.54432,337.641472,339.738624,341.835776,343.932928,346.03008,348.12723200000005,350.22438400000004],"y":[0.2329999953508377,0.2639999985694885,0.2790000140666961,0.296999990940094,0.3109999895095825,0.3240000009536743,0.3070000112056732,0.3210000097751617,0.31700000166893,0.3339999914169311,0.324999988079071,0.3260000050067901,0.3330000042915344,0.3409999907016754,0.3400000035762787,0.3529999852180481,0.3400000035762787,0.3490000069141388,0.3529999852180481,0.3499999940395355,0.3459999859333038,0.3370000123977661,0.356000006198883,0.3490000069141388,0.3429999947547912,0.3490000069141388,0.3610000014305115,0.3499999940395355,0.3569999933242798,0.3610000014305115,0.3619999885559082,0.3449999988079071,0.3409999907016754,0.3420000076293945,0.3449999988079071,0.3409999907016754,0.3379999995231628,0.3420000076293945,0.3569999933242798,0.3529999852180481,0.3610000014305115,0.363999992609024,0.3600000143051147,0.3540000021457672,0.3499999940395355,0.3689999878406524,0.367000013589859,0.3569999933242798,0.3610000014305115,0.3680000007152557,0.3630000054836273,0.3709999918937683,0.3540000021457672,0.3580000102519989,0.367000013589859,0.3529999852180481,0.356000006198883,0.3569999933242798,0.3610000014305115,0.3700000047683716,0.375,0.3709999918937683,0.3819999992847442,0.3709999918937683,0.3650000095367431,0.3709999918937683,0.3650000095367431,0.3709999918937683,0.3840000033378601,0.3740000128746032,0.375,0.356000006198883,0.3689999878406524,0.3700000047683716,0.3819999992847442,0.3799999952316284,0.3779999911785126,0.3729999959468841,0.3709999918937683,0.3759999871253967,0.3709999918937683,0.3759999871253967,0.3779999911785126,0.3779999911785126,0.3689999878406524,0.3840000033378601,0.3860000073909759,0.3849999
904632568,0.3790000081062317,0.375,0.3849999904632568,0.3720000088214874,0.3770000040531158,0.3799999952316284,0.3810000121593475,0.382999986410141,0.3650000095367431,0.3740000128746032,0.382999986410141,0.3689999878406524,0.3759999871253967,0.3869999945163727,0.3889999985694885,0.3860000073909759,0.3819999992847442,0.3689999878406524,0.3860000073909759,0.3810000121593475,0.382999986410141,0.3819999992847442,0.3840000033378601,0.3889999985694885,0.3880000114440918,0.3849999904632568,0.3799999952316284,0.3910000026226043,0.3989999890327453,0.3880000114440918,0.3880000114440918,0.3840000033378601,0.3880000114440918,0.3860000073909759,0.3919999897480011,0.3880000114440918,0.3939999938011169,0.3869999945163727,0.3919999897480011,0.3910000026226043,0.382999986410141,0.3930000066757202,0.3840000033378601,0.3880000114440918,0.3840000033378601,0.3819999992847442,0.382999986410141,0.3880000114440918,0.3860000073909759,0.3860000073909759,0.3869999945163727,0.3860000073909759,0.3899999856948852,0.3819999992847442,0.3860000073909759,0.3889999985694885,0.3840000033378601,0.395000010728836,0.3899999856948852,0.3899999856948852,0.3910000026226043,0.3959999978542328,0.3959999978542328,0.3919999897480011,0.3980000019073486,0.3880000114440918,0.3930000066757202,0.4000000059604645,0.3919999897480011,0.3919999897480011,0.4040000140666961,0.3930000066757202,0.3970000147819519,0.3889999985694885,0.3959999978542328,0.3930000066757202,0.3939999938011169,0.3970000147819519,0.3910000026226043,0.4020000100135803],"label":"FineWeb: independent MinHash (id 
mh)"},"big-run-sampled-fineweb-c4-filters":{"x":[0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,29.360128000000003,31.45728,33.554432000000006,35.651584,37.748736,39.845888,41.94304,44.040192000000005,46.137344000000006,48.234496,50.331648,52.4288,54.525952000000004,56.623104000000005,58.720256000000006,60.817408,62.91456,65.011712,67.10886400000001,69.206016,71.303168,73.40032000000001,75.497472,77.59462400000001,79.691776,81.788928,83.88608,85.983232,88.08038400000001,90.177536,92.27468800000001,94.37184,96.468992,98.56614400000001,100.663296,102.76044800000001,104.8576,106.95475200000001,109.05190400000001,111.149056,113.24620800000001,115.34336,117.44051200000001,119.537664,121.634816,123.73196800000001,125.82912,127.92627200000001,130.023424,132.120576,134.21772800000002,136.31488000000002,138.412032,140.509184,142.606336,144.70348800000002,146.80064000000002,148.897792,150.994944,153.092096,155.18924800000002,157.28640000000001,159.383552,161.480704,163.577856,165.67500800000002,167.77216,169.869312,171.966464,174.06361600000002,176.16076800000002,178.25792,180.355072,182.452224,184.54937600000002,186.64652800000002,188.74368,190.840832,192.937984,195.03513600000002,197.13228800000002,199.22944,201.326592,203.423744,205.52089600000002,207.61804800000002,209.7152,211.812352,213.90950400000003,216.00665600000002,218.10380800000001,220.20096,222.298112,224.39526400000003,226.49241600000002,228.589568,230.68672,232.783872,234.88102400000002,236.97817600000002,239.075328,241.17248,243.269632,245.36678400000002,247.46393600000002,249.561088,251.65824,253.75539200000003,255.85254400000002,257.949696,260.046848,262.144,264.241152,266.338304,268.43545600000004,270.53260800000004,272.62976000000003,274.726912,276.824064,278.921216,281.018368,283.11552,285.212672,287.309824,289.40697600000004,291.50412800000004,293.60128000000003,
295.698432,297.795584,299.892736,301.989888,304.08704,306.184192,308.28134400000005,310.37849600000004,312.47564800000004,314.57280000000003,316.669952,318.767104,320.864256,322.961408,325.05856,327.155712,329.25286400000005,331.35001600000004,333.44716800000003,335.54432,337.641472,339.738624,341.835776,343.932928,346.03008,348.12723200000005,350.22438400000004],"y":[0.2329999953508377,0.2630000114440918,0.2770000100135803,0.3050000071525574,0.3100000023841858,0.3149999976158142,0.3190000057220459,0.3350000083446502,0.3210000097751617,0.3310000002384186,0.3389999866485595,0.3289999961853027,0.3379999995231628,0.3420000076293945,0.3409999907016754,0.3510000109672546,0.3479999899864197,0.3440000116825104,0.3569999933242798,0.3529999852180481,0.3680000007152557,0.3549999892711639,0.3499999940395355,0.3589999973773956,0.3529999852180481,0.3459999859333038,0.3529999852180481,0.3630000054836273,0.3600000143051147,0.3490000069141388,0.3540000021457672,0.3600000143051147,0.356000006198883,0.3470000028610229,0.3470000028610229,0.3549999892711639,0.3440000116825104,0.3529999852180481,0.3630000054836273,0.3449999988079071,0.3479999899864197,0.3490000069141388,0.3519999980926513,0.367000013589859,0.356000006198883,0.356000006198883,0.3519999980926513,0.3580000102519989,0.3569999933242798,0.3659999966621399,0.3759999871253967,0.3689999878406524,0.3779999911785126,0.3549999892711639,0.3610000014305115,0.3650000095367431,0.3610000014305115,0.3580000102519989,0.3729999959468841,0.367000013589859,0.3689999878406524,0.3540000021457672,0.363999992609024,0.3700000047683716,0.3650000095367431,0.3529999852180481,0.3709999918937683,0.3740000128746032,0.3680000007152557,0.3689999878406524,0.3580000102519989,0.3650000095367431,0.3619999885559082,0.3619999885559082,0.3630000054836273,0.3610000014305115,0.3659999966621399,0.375,0.375,0.3700000047683716,0.3840000033378601,0.3779999911785126,0.382999986410141,0.367000013589859,0.3860000073909759,0.3770000040531158,0.3790000081062317,0.38800001
14440918,0.3659999966621399,0.3630000054836273,0.3770000040531158,0.3779999911785126,0.3680000007152557,0.3779999911785126,0.375,0.3819999992847442,0.3720000088214874,0.3799999952316284,0.382999986410141,0.375,0.367000013589859,0.3869999945163727,0.3810000121593475,0.382999986410141,0.3709999918937683,0.3720000088214874,0.3689999878406524,0.367000013589859,0.3819999992847442,0.3720000088214874,0.3849999904632568,0.3709999918937683,0.3740000128746032,0.3709999918937683,0.3799999952316284,0.3799999952316284,0.3869999945163727,0.375,0.3680000007152557,0.3779999911785126,0.3799999952316284,0.3720000088214874,0.3799999952316284,0.3759999871253967,0.3819999992847442,0.3770000040531158,0.3810000121593475,0.3720000088214874,0.3860000073909759,0.3810000121593475,0.3790000081062317,0.3860000073909759,0.3759999871253967,0.3860000073909759,0.3810000121593475,0.3790000081062317,0.3799999952316284,0.3840000033378601,0.3810000121593475,0.3810000121593475,0.3849999904632568,0.3869999945163727,0.3819999992847442,0.3740000128746032,0.3779999911785126,0.3860000073909759,0.3889999985694885,0.3849999904632568,0.3889999985694885,0.3810000121593475,0.3849999904632568,0.3840000033378601,0.3860000073909759,0.3889999985694885,0.382999986410141,0.3849999904632568,0.3840000033378601,0.3880000114440918,0.3810000121593475,0.3849999904632568,0.3790000081062317,0.3799999952316284,0.3819999992847442,0.382999986410141,0.3790000081062317,0.3810000121593475,0.3779999911785126,0.3889999985694885],"label":"FineWeb: id mh + C4 
filters"},"big-run-sampled_full_filtered_no_dedup":{"x":[0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,29.360128000000003,31.45728,33.554432000000006,35.651584,37.748736,39.845888,41.94304,44.040192000000005,46.137344000000006,48.234496,50.331648,52.4288,54.525952000000004,56.623104000000005,58.720256000000006,60.817408,62.91456,65.011712,67.10886400000001,69.206016,71.303168,73.40032000000001,75.497472,77.59462400000001,79.691776,81.788928,83.88608,85.983232,88.08038400000001,90.177536,92.27468800000001,94.37184,96.468992,98.56614400000001,100.663296,102.76044800000001,104.8576,106.95475200000001,109.05190400000001,111.149056,113.24620800000001,115.34336,117.44051200000001,119.537664,121.634816,123.73196800000001,125.82912,127.92627200000001,130.023424,132.120576,134.21772800000002,136.31488000000002,138.412032,140.509184,142.606336,144.70348800000002,146.80064000000002,148.897792,150.994944,153.092096,155.18924800000002,157.28640000000001,159.383552,161.480704,163.577856,165.67500800000002,167.77216,169.869312,171.966464,174.06361600000002,176.16076800000002,178.25792,180.355072,182.452224,184.54937600000002,186.64652800000002,188.74368,190.840832,192.937984,195.03513600000002,197.13228800000002,199.22944,201.326592,203.423744,205.52089600000002,207.61804800000002,209.7152,211.812352,213.90950400000003,216.00665600000002,218.10380800000001,220.20096,222.298112,224.39526400000003,226.49241600000002,228.589568,230.68672,232.783872,234.88102400000002,236.97817600000002,239.075328,241.17248,243.269632,245.36678400000002,247.46393600000002,249.561088,251.65824,253.75539200000003,255.85254400000002,257.949696,260.046848,262.144,264.241152,266.338304,268.43545600000004,270.53260800000004,272.62976000000003,274.726912,276.824064,278.921216,281.018368,283.11552,285.212672,287.309824,289.40697600000004,291.50412800000004,293.6012800
0000003,295.698432,297.795584,299.892736,301.989888,304.08704,306.184192,308.28134400000005,310.37849600000004,312.47564800000004,314.57280000000003,316.669952,318.767104,320.864256,322.961408,325.05856,327.155712,329.25286400000005,331.35001600000004,333.44716800000003,335.54432,337.641472,339.738624,341.835776,343.932928,346.03008,348.12723200000005,350.22438400000004],"y":[0.2329999953508377,0.2599999904632568,0.277999997138977,0.2910000085830688,0.3070000112056732,0.3140000104904175,0.3019999861717224,0.3059999942779541,0.3210000097751617,0.3230000138282776,0.324999988079071,0.3149999976158142,0.3109999895095825,0.3339999914169311,0.3319999873638153,0.3319999873638153,0.3300000131130218,0.3370000123977661,0.3219999969005584,0.3370000123977661,0.328000009059906,0.3339999914169311,0.3420000076293945,0.3400000035762787,0.3440000116825104,0.3510000109672546,0.3409999907016754,0.3449999988079071,0.3339999914169311,0.3540000021457672,0.3339999914169311,0.3470000028610229,0.3470000028610229,0.3440000116825104,0.3589999973773956,0.3569999933242798,0.3630000054836273,0.3549999892711639,0.3589999973773956,0.3449999988079071,0.3549999892711639,0.3449999988079071,0.3389999866485595,0.3499999940395355,0.3610000014305115,0.3619999885559082,0.3600000143051147,0.3519999980926513,0.3479999899864197,0.356000006198883,0.3519999980926513,0.3440000116825104,0.3490000069141388,0.3519999980926513,0.3470000028610229,0.3589999973773956,0.3449999988079071,0.3490000069141388,0.356000006198883,0.3619999885559082,0.3569999933242798,0.3659999966621399,0.3610000014305115,0.3549999892711639,0.3700000047683716,0.363999992609024,0.3600000143051147,0.3580000102519989,0.3549999892711639,0.3619999885559082,0.3689999878406524,0.3630000054836273,0.363999992609024,0.3700000047683716,0.367000013589859,0.3630000054836273,0.3630000054836273,0.3700000047683716,0.3589999973773956,0.3540000021457672,0.3540000021457672,0.3659999966621399,0.3619999885559082,0.3589999973773956,0.3650000095367431,0.370999991893
7683,0.3680000007152557,0.3689999878406524,0.3650000095367431,0.3729999959468841,0.3619999885559082,0.3689999878406524,0.3569999933242798,0.3510000109672546,0.3680000007152557,0.363999992609024,0.3700000047683716,0.3659999966621399,0.3659999966621399,0.363999992609024,0.3619999885559082,0.3659999966621399,0.3680000007152557,0.3610000014305115,0.3720000088214874,0.3729999959468841,0.3810000121593475,0.3630000054836273,0.3689999878406524,0.3709999918937683,0.3759999871253967,0.382999986410141,0.3729999959468841,0.3720000088214874,0.3680000007152557,0.3659999966621399,0.3650000095367431,0.363999992609024,0.3589999973773956,0.356000006198883,0.3650000095367431,0.3659999966621399,0.367000013589859,0.3729999959468841,0.3720000088214874,0.375,0.3740000128746032,0.3700000047683716,0.3569999933242798,0.3759999871253967,0.3740000128746032,0.367000013589859,0.3770000040531158,0.3759999871253967,0.3709999918937683,0.3779999911785126,0.3709999918937683,0.3689999878406524,0.3799999952316284,0.3630000054836273,0.375,0.3700000047683716,0.3700000047683716,0.3729999959468841,0.3720000088214874,0.3790000081062317,0.375,0.3729999959468841,0.3770000040531158,0.3799999952316284,0.3779999911785126,0.3720000088214874,0.3799999952316284,0.3759999871253967,0.3799999952316284,0.3790000081062317,0.375,0.3740000128746032,0.3729999959468841,0.3840000033378601,0.3659999966621399,0.3759999871253967,0.3720000088214874,0.3720000088214874,0.3759999871253967,0.375,0.3650000095367431,0.3729999959468841],"label":"FineWeb: base filtering 
only"},"big-run-fineweb-v1-all-dumps":{"x":[0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,29.360128000000003,31.45728,33.554432000000006,35.651584,37.748736,39.845888,41.94304,44.040192000000005,46.137344000000006,48.234496,50.331648,52.4288,54.525952000000004,56.623104000000005,58.720256000000006,60.817408,62.91456,65.011712,67.10886400000001,69.206016,71.303168,73.40032000000001,75.497472,77.59462400000001,79.691776,81.788928,83.88608,85.983232,88.08038400000001,90.177536,92.27468800000001,94.37184,96.468992,98.56614400000001,100.663296,102.76044800000001,104.8576,106.95475200000001,109.05190400000001,111.149056,113.24620800000001,115.34336,117.44051200000001,119.537664,121.634816,123.73196800000001,125.82912,127.92627200000001,130.023424,132.120576,134.21772800000002,136.31488000000002,138.412032,140.509184,142.606336,144.70348800000002,146.80064000000002,148.897792,150.994944,153.092096,155.18924800000002,157.28640000000001,159.383552,161.480704,163.577856,165.67500800000002,167.77216,169.869312,171.966464,174.06361600000002,176.16076800000002,178.25792,180.355072,182.452224,184.54937600000002,186.64652800000002,188.74368,190.840832,192.937984,195.03513600000002,197.13228800000002,199.22944,201.326592,203.423744,205.52089600000002,207.61804800000002,209.7152,211.812352,213.90950400000003,216.00665600000002,218.10380800000001,220.20096,222.298112,224.39526400000003,226.49241600000002,228.589568,230.68672,232.783872,234.88102400000002,236.97817600000002,239.075328,241.17248,243.269632,245.36678400000002,247.46393600000002,249.561088,251.65824,253.75539200000003,255.85254400000002,257.949696,260.046848,262.144,264.241152,266.338304,268.43545600000004,270.53260800000004,272.62976000000003,274.726912,276.824064,278.921216,281.018368,283.11552,285.212672,287.309824,289.40697600000004,291.50412800000004,293.60128000000003,295.6
98432,297.795584,299.892736,301.989888,304.08704,306.184192,308.28134400000005,310.37849600000004,312.47564800000004,314.57280000000003,316.669952,318.767104,320.864256,322.961408,325.05856,327.155712,329.25286400000005,331.35001600000004,333.44716800000003,335.54432,337.641472,339.738624,341.835776,343.932928,346.03008,348.12723200000005,350.22438400000004],"y":[0.2329999953508377,0.2630000114440918,0.2879999876022339,0.296999990940094,0.2960000038146972,0.3039999902248382,0.3129999935626983,0.3149999976158142,0.3300000131130218,0.3300000131130218,0.3350000083446502,0.3379999995231628,0.3370000123977661,0.3330000042915344,0.3370000123977661,0.3389999866485595,0.3429999947547912,0.3659999966621399,0.3459999859333038,0.3479999899864197,0.3440000116825104,0.3470000028610229,0.3569999933242798,0.3510000109672546,0.3680000007152557,0.3529999852180481,0.3680000007152557,0.3549999892711639,0.3540000021457672,0.3529999852180481,0.3499999940395355,0.3569999933242798,0.3529999852180481,0.3499999940395355,0.3540000021457672,0.3659999966621399,0.3600000143051147,0.3680000007152557,0.3659999966621399,0.3600000143051147,0.3659999966621399,0.3540000021457672,0.3580000102519989,0.367000013589859,0.3549999892711639,0.3729999959468841,0.3580000102519989,0.3619999885559082,0.3659999966621399,0.3680000007152557,0.3650000095367431,0.3619999885559082,0.3759999871253967,0.3689999878406524,0.3689999878406524,0.3619999885559082,0.3630000054836273,0.3650000095367431,0.3799999952316284,0.3729999959468841,0.3740000128746032,0.367000013589859,0.3720000088214874,0.3600000143051147,0.3650000095367431,0.3729999959468841,0.3589999973773956,0.3799999952316284,0.3589999973773956,0.3799999952316284,0.3680000007152557,0.367000013589859,0.367000013589859,0.3700000047683716,0.3790000081062317,0.3729999959468841,0.3770000040531158,0.3709999918937683,0.3759999871253967,0.3759999871253967,0.3700000047683716,0.3720000088214874,0.3840000033378601,0.3770000040531158,0.3770000040531158,0.3790000081062317,0.386
0000073909759,0.3759999871253967,0.3650000095367431,0.3700000047683716,0.3819999992847442,0.3819999992847442,0.3630000054836273,0.3689999878406524,0.3759999871253967,0.3759999871253967,0.3779999911785126,0.3740000128746032,0.3860000073909759,0.3619999885559082,0.3740000128746032,0.3799999952316284,0.3819999992847442,0.3740000128746032,0.3770000040531158,0.375,0.3810000121593475,0.3729999959468841,0.3880000114440918,0.3840000033378601,0.3840000033378601,0.3770000040531158,0.3740000128746032,0.382999986410141,0.3840000033378601,0.3770000040531158,0.3869999945163727,0.3729999959468841,0.3770000040531158,0.3759999871253967,0.3840000033378601,0.3880000114440918,0.3759999871253967,0.3740000128746032,0.3720000088214874,0.3790000081062317,0.3740000128746032,0.3630000054836273,0.3810000121593475,0.3720000088214874,0.3729999959468841,0.3720000088214874,0.3840000033378601,0.3759999871253967,0.3840000033378601,0.3790000081062317,0.3819999992847442,0.3689999878406524,0.3700000047683716,0.3790000081062317,0.3729999959468841,0.3799999952316284,0.3799999952316284,0.3740000128746032,0.3689999878406524,0.3810000121593475,0.3720000088214874,0.382999986410141,0.3819999992847442,0.3720000088214874,0.3799999952316284,0.3740000128746032,0.3729999959468841,0.3790000081062317,0.3720000088214874,0.3680000007152557,0.3779999911785126,0.3799999952316284,0.3729999959468841,0.3740000128746032,0.3729999959468841,0.3759999871253967,0.3790000081062317,0.3689999878406524,0.3680000007152557,0.3659999966621399,0.3729999959468841,0.3680000007152557],"label":"FineWeb: id mh + C4 + custom filters"}},"layout":{"title":{"text":"The different FineWeb processing steps"}}}
dist/assets/data/plots/all_filtering_steps/hellaswag_acc_norm.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"data":{"big-run-fineweb-v1-all-dumps":{"x":[0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,29.360128000000003,31.45728,33.554432000000006,35.651584,37.748736,39.845888,41.94304,44.040192000000005,46.137344000000006,48.234496,50.331648,52.4288,54.525952000000004,56.623104000000005,58.720256000000006,60.817408,62.91456,65.011712,67.10886400000001,69.206016,71.303168,73.40032000000001,75.497472,77.59462400000001,79.691776,81.788928,83.88608,85.983232,88.08038400000001,90.177536,92.27468800000001,94.37184,96.468992,98.56614400000001,100.663296,102.76044800000001,104.8576,106.95475200000001,109.05190400000001,111.149056,113.24620800000001,115.34336,117.44051200000001,119.537664,121.634816,123.73196800000001,125.82912,127.92627200000001,130.023424,132.120576,134.21772800000002,136.31488000000002,138.412032,140.509184,142.606336,144.70348800000002,146.80064000000002,148.897792,150.994944,153.092096,155.18924800000002,157.28640000000001,159.383552,161.480704,163.577856,165.67500800000002,167.77216,169.869312,171.966464,174.06361600000002,176.16076800000002,178.25792,180.355072,182.452224,184.54937600000002,186.64652800000002,188.74368,190.840832,192.937984,195.03513600000002,197.13228800000002,199.22944,201.326592,203.423744,205.52089600000002,207.61804800000002,209.7152,211.812352,213.90950400000003,216.00665600000002,218.10380800000001,220.20096,222.298112,224.39526400000003,226.49241600000002,228.589568,230.68672,232.783872,234.88102400000002,236.97817600000002,239.075328,241.17248,243.269632,245.36678400000002,247.46393600000002,249.561088,251.65824,253.75539200000003,255.85254400000002,257.949696,260.046848,262.144,264.241152,266.338304,268.43545600000004,270.53260800000004,272.62976000000003,274.726912,276.824064,278.921216,281.018368,283.11552,285.212672,287.309824,289.40697600000004,291.50412800000004,293.60128000000003,2
95.698432,297.795584,299.892736,301.989888,304.08704,306.184192,308.28134400000005,310.37849600000004,312.47564800000004,314.57280000000003,316.669952,318.767104,320.864256,322.961408,325.05856,327.155712,329.25286400000005,331.35001600000004,333.44716800000003,335.54432,337.641472,339.738624,341.835776,343.932928,346.03008,348.12723200000005,350.22438400000004],"y":[0.257999986410141,0.2919999957084656,0.3310000002384186,0.3549999892711639,0.3939999938011169,0.4149999916553497,0.4329999983310699,0.4460000097751617,0.4589999914169311,0.4819999933242798,0.4769999980926513,0.4830000102519989,0.4909999966621399,0.5059999823570251,0.5059999823570251,0.503000020980835,0.5170000195503235,0.5049999952316284,0.5210000276565552,0.5130000114440918,0.5189999938011169,0.5360000133514404,0.5320000052452087,0.5460000038146973,0.5400000214576721,0.5379999876022339,0.531000018119812,0.5460000038146973,0.5509999990463257,0.5519999861717224,0.5559999942779541,0.5609999895095825,0.5559999942779541,0.5580000281333923,0.5450000166893005,0.5509999990463257,0.5590000152587891,0.5649999976158142,0.5619999766349792,0.5680000185966492,0.5669999718666077,0.5709999799728394,0.5569999814033508,0.5640000104904175,0.5690000057220459,0.5720000267028809,0.5759999752044678,0.5839999914169312,0.5699999928474426,0.5740000009536743,0.5830000042915344,0.5839999914169312,0.5799999833106995,0.5830000042915344,0.574999988079071,0.5910000205039978,0.5799999833106995,0.5879999995231628,0.6039999723434448,0.578000009059906,0.5849999785423279,0.5889999866485596,0.5849999785423279,0.6019999980926514,0.5929999947547913,0.5820000171661377,0.5860000252723694,0.5910000205039978,0.5849999785423279,0.5849999785423279,0.5839999914169312,0.5860000252723694,0.5979999899864197,0.5849999785423279,0.597000002861023,0.5960000157356262,0.6019999980926514,0.6060000061988831,0.5989999771118164,0.5889999866485596,0.5920000076293945,0.5960000157356262,0.5950000286102295,0.6060000061988831,0.5960000157356262,0.6000000238418579,0.
6069999933242798,0.6039999723434448,0.6069999933242798,0.6010000109672546,0.6060000061988831,0.6129999756813049,0.5989999771118164,0.6200000047683716,0.5979999899864197,0.609000027179718,0.6029999852180481,0.609000027179718,0.6179999709129333,0.6150000095367432,0.6060000061988831,0.6069999933242798,0.6119999885559082,0.6190000176429749,0.6079999804496765,0.6150000095367432,0.6079999804496765,0.6190000176429749,0.6079999804496765,0.609000027179718,0.6079999804496765,0.6179999709129333,0.6140000224113464,0.6200000047683716,0.621999979019165,0.6129999756813049,0.6200000047683716,0.6129999756813049,0.6110000014305115,0.6069999933242798,0.609000027179718,0.6159999966621399,0.6169999837875366,0.6129999756813049,0.6169999837875366,0.6159999966621399,0.6200000047683716,0.6150000095367432,0.6240000128746033,0.6179999709129333,0.6179999709129333,0.6129999756813049,0.6179999709129333,0.6110000014305115,0.6190000176429749,0.6200000047683716,0.6150000095367432,0.6159999966621399,0.621999979019165,0.6209999918937683,0.6230000257492065,0.6200000047683716,0.6240000128746033,0.6159999966621399,0.6200000047683716,0.6159999966621399,0.6179999709129333,0.6119999885559082,0.6269999742507935,0.6230000257492065,0.6200000047683716,0.6240000128746033,0.6190000176429749,0.6169999837875366,0.6299999952316284,0.625,0.6179999709129333,0.6150000095367432,0.6259999871253967,0.621999979019165,0.625,0.6190000176429749,0.6259999871253967,0.6340000033378601,0.628000020980835,0.6290000081062317,0.628000020980835,0.6269999742507935],"label":"FineWeb: id mh + C4 + custom 
filters"},"big-run-sampled-fineweb-c4-filters":{"x":[0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,29.360128000000003,31.45728,33.554432000000006,35.651584,37.748736,39.845888,41.94304,44.040192000000005,46.137344000000006,48.234496,50.331648,52.4288,54.525952000000004,56.623104000000005,58.720256000000006,60.817408,62.91456,65.011712,67.10886400000001,69.206016,71.303168,73.40032000000001,75.497472,77.59462400000001,79.691776,81.788928,83.88608,85.983232,88.08038400000001,90.177536,92.27468800000001,94.37184,96.468992,98.56614400000001,100.663296,102.76044800000001,104.8576,106.95475200000001,109.05190400000001,111.149056,113.24620800000001,115.34336,117.44051200000001,119.537664,121.634816,123.73196800000001,125.82912,127.92627200000001,130.023424,132.120576,134.21772800000002,136.31488000000002,138.412032,140.509184,142.606336,144.70348800000002,146.80064000000002,148.897792,150.994944,153.092096,155.18924800000002,157.28640000000001,159.383552,161.480704,163.577856,165.67500800000002,167.77216,169.869312,171.966464,174.06361600000002,176.16076800000002,178.25792,180.355072,182.452224,184.54937600000002,186.64652800000002,188.74368,190.840832,192.937984,195.03513600000002,197.13228800000002,199.22944,201.326592,203.423744,205.52089600000002,207.61804800000002,209.7152,211.812352,213.90950400000003,216.00665600000002,218.10380800000001,220.20096,222.298112,224.39526400000003,226.49241600000002,228.589568,230.68672,232.783872,234.88102400000002,236.97817600000002,239.075328,241.17248,243.269632,245.36678400000002,247.46393600000002,249.561088,251.65824,253.75539200000003,255.85254400000002,257.949696,260.046848,262.144,264.241152,266.338304,268.43545600000004,270.53260800000004,272.62976000000003,274.726912,276.824064,278.921216,281.018368,283.11552,285.212672,287.309824,289.40697600000004,291.50412800000004,293.60128000000
003,295.698432,297.795584,299.892736,301.989888,304.08704,306.184192,308.28134400000005,310.37849600000004,312.47564800000004,314.57280000000003,316.669952,318.767104,320.864256,322.961408,325.05856,327.155712,329.25286400000005,331.35001600000004,333.44716800000003,335.54432,337.641472,339.738624,341.835776,343.932928,346.03008,348.12723200000005,350.22438400000004],"y":[0.257999986410141,0.2849999964237213,0.3240000009536743,0.3580000102519989,0.3930000066757202,0.395000010728836,0.4309999942779541,0.44200000166893,0.4399999976158142,0.453000009059906,0.453000009059906,0.4650000035762787,0.4699999988079071,0.481000006198883,0.4839999973773956,0.4970000088214874,0.5059999823570251,0.4909999966621399,0.5120000243186951,0.5139999985694885,0.5170000195503235,0.5199999809265137,0.5170000195503235,0.5249999761581421,0.5220000147819519,0.5289999842643738,0.5350000262260437,0.531000018119812,0.5289999842643738,0.5339999794960022,0.527999997138977,0.5260000228881836,0.5429999828338623,0.5370000004768372,0.5329999923706055,0.5460000038146973,0.5429999828338623,0.5490000247955322,0.546999990940094,0.546999990940094,0.5490000247955322,0.5460000038146973,0.5559999942779541,0.5619999766349792,0.5569999814033508,0.5509999990463257,0.5550000071525574,0.5649999976158142,0.5690000057220459,0.5619999766349792,0.5529999732971191,0.5649999976158142,0.5730000138282776,0.5669999718666077,0.5740000009536743,0.5690000057220459,0.5699999928474426,0.574999988079071,0.5640000104904175,0.5789999961853027,0.5720000267028809,0.5640000104904175,0.574999988079071,0.5770000219345093,0.5740000009536743,0.5770000219345093,0.5740000009536743,0.5740000009536743,0.578000009059906,0.5759999752044678,0.5789999961853027,0.5799999833106995,0.578000009059906,0.5860000252723694,0.5809999704360962,0.5770000219345093,0.5849999785423279,0.5849999785423279,0.5799999833106995,0.578000009059906,0.5809999704360962,0.5870000123977661,0.5830000042915344,0.5720000267028809,0.5879999995231628,0.5830000042915344,0.59299
99947547913,0.578000009059906,0.5889999866485596,0.5809999704360962,0.5789999961853027,0.593999981880188,0.5820000171661377,0.5910000205039978,0.5830000042915344,0.5879999995231628,0.5879999995231628,0.5889999866485596,0.5879999995231628,0.5899999737739563,0.5960000157356262,0.5899999737739563,0.5879999995231628,0.5870000123977661,0.5910000205039978,0.593999981880188,0.597000002861023,0.593999981880188,0.5979999899864197,0.593999981880188,0.5989999771118164,0.5929999947547913,0.597000002861023,0.6019999980926514,0.5989999771118164,0.6019999980926514,0.597000002861023,0.6000000238418579,0.6019999980926514,0.6039999723434448,0.597000002861023,0.6019999980926514,0.5950000286102295,0.6019999980926514,0.6079999804496765,0.6039999723434448,0.6100000143051147,0.6039999723434448,0.6029999852180481,0.6069999933242798,0.6060000061988831,0.6069999933242798,0.6000000238418579,0.6100000143051147,0.6100000143051147,0.6129999756813049,0.609000027179718,0.6010000109672546,0.6000000238418579,0.6110000014305115,0.609000027179718,0.6069999933242798,0.6119999885559082,0.6050000190734863,0.6110000014305115,0.6190000176429749,0.6169999837875366,0.6140000224113464,0.6100000143051147,0.6200000047683716,0.6200000047683716,0.6110000014305115,0.6150000095367432,0.6129999756813049,0.6079999804496765,0.6179999709129333,0.6200000047683716,0.6129999756813049,0.6190000176429749,0.6150000095367432,0.6240000128746033,0.6240000128746033,0.609000027179718,0.609000027179718,0.6159999966621399,0.6110000014305115,0.6110000014305115,0.6190000176429749],"label":"FineWeb: id mh + C4 
filters"},"big-run-sampled_full_ind_minhash":{"x":[0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,29.360128000000003,31.45728,33.554432000000006,35.651584,37.748736,39.845888,41.94304,44.040192000000005,46.137344000000006,48.234496,50.331648,52.4288,54.525952000000004,56.623104000000005,58.720256000000006,60.817408,62.91456,65.011712,67.10886400000001,69.206016,71.303168,73.40032000000001,75.497472,77.59462400000001,79.691776,81.788928,83.88608,85.983232,88.08038400000001,90.177536,92.27468800000001,94.37184,96.468992,98.56614400000001,100.663296,102.76044800000001,104.8576,106.95475200000001,109.05190400000001,111.149056,113.24620800000001,115.34336,117.44051200000001,119.537664,121.634816,123.73196800000001,125.82912,127.92627200000001,130.023424,132.120576,134.21772800000002,136.31488000000002,138.412032,140.509184,142.606336,144.70348800000002,146.80064000000002,148.897792,150.994944,153.092096,155.18924800000002,157.28640000000001,159.383552,161.480704,163.577856,165.67500800000002,167.77216,169.869312,171.966464,174.06361600000002,176.16076800000002,178.25792,180.355072,182.452224,184.54937600000002,186.64652800000002,188.74368,190.840832,192.937984,195.03513600000002,197.13228800000002,199.22944,201.326592,203.423744,205.52089600000002,207.61804800000002,209.7152,211.812352,213.90950400000003,216.00665600000002,218.10380800000001,220.20096,222.298112,224.39526400000003,226.49241600000002,228.589568,230.68672,232.783872,234.88102400000002,236.97817600000002,239.075328,241.17248,243.269632,245.36678400000002,247.46393600000002,249.561088,251.65824,253.75539200000003,255.85254400000002,257.949696,260.046848,262.144,264.241152,266.338304,268.43545600000004,270.53260800000004,272.62976000000003,274.726912,276.824064,278.921216,281.018368,283.11552,285.212672,287.309824,289.40697600000004,291.50412800000004,293.6012800000000
3,295.698432,297.795584,299.892736,301.989888,304.08704,306.184192,308.28134400000005,310.37849600000004,312.47564800000004,314.57280000000003,316.669952,318.767104,320.864256,322.961408,325.05856,327.155712,329.25286400000005,331.35001600000004,333.44716800000003,335.54432,337.641472,339.738624,341.835776,343.932928,346.03008,348.12723200000005,350.22438400000004],"y":[0.257999986410141,0.3019999861717224,0.3059999942779541,0.335999995470047,0.3610000014305115,0.3819999992847442,0.4009999930858612,0.4020000100135803,0.4250000119209289,0.4309999942779541,0.4469999969005584,0.4519999921321869,0.453000009059906,0.4580000042915344,0.4749999940395355,0.4699999988079071,0.4799999892711639,0.4749999940395355,0.4769999980926513,0.481000006198883,0.4839999973773956,0.4959999918937683,0.5040000081062317,0.4970000088214874,0.4979999959468841,0.5070000290870667,0.5049999952316284,0.5109999775886536,0.515999972820282,0.5120000243186951,0.5120000243186951,0.515999972820282,0.5120000243186951,0.5249999761581421,0.5170000195503235,0.5199999809265137,0.5270000100135803,0.5170000195503235,0.5220000147819519,0.5260000228881836,0.5360000133514404,0.5339999794960022,0.5370000004768372,0.5339999794960022,0.5329999923706055,0.531000018119812,0.5329999923706055,0.5400000214576721,0.5429999828338623,0.5389999747276306,0.5419999957084656,0.5429999828338623,0.5360000133514404,0.5299999713897705,0.546999990940094,0.5360000133514404,0.5450000166893005,0.5440000295639038,0.5350000262260437,0.5339999794960022,0.5419999957084656,0.5450000166893005,0.5460000038146973,0.5370000004768372,0.5490000247955322,0.5440000295639038,0.550000011920929,0.5490000247955322,0.5450000166893005,0.5490000247955322,0.5559999942779541,0.5559999942779541,0.5410000085830688,0.5419999957084656,0.5529999732971191,0.5460000038146973,0.5540000200271606,0.5379999876022339,0.5509999990463257,0.5540000200271606,0.5419999957084656,0.546999990940094,0.5479999780654907,0.5460000038146973,0.5460000038146973,0.5519999861717224,0.5
600000023841858,0.5540000200271606,0.5509999990463257,0.5609999895095825,0.5619999766349792,0.5590000152587891,0.5559999942779541,0.5580000281333923,0.5640000104904175,0.5649999976158142,0.5590000152587891,0.5550000071525574,0.5630000233650208,0.5630000233650208,0.5609999895095825,0.5559999942779541,0.5609999895095825,0.5630000233650208,0.5680000185966492,0.5630000233650208,0.5690000057220459,0.5609999895095825,0.5590000152587891,0.5640000104904175,0.5690000057220459,0.5640000104904175,0.5630000233650208,0.574999988079071,0.5630000233650208,0.5619999766349792,0.5690000057220459,0.5770000219345093,0.5690000057220459,0.5609999895095825,0.5649999976158142,0.5680000185966492,0.5590000152587891,0.5600000023841858,0.5619999766349792,0.5799999833106995,0.5619999766349792,0.5699999928474426,0.5709999799728394,0.5669999718666077,0.5680000185966492,0.5609999895095825,0.5649999976158142,0.5680000185966492,0.5730000138282776,0.5720000267028809,0.5709999799728394,0.5770000219345093,0.574999988079071,0.5730000138282776,0.5690000057220459,0.5740000009536743,0.578000009059906,0.574999988079071,0.5820000171661377,0.5730000138282776,0.5740000009536743,0.574999988079071,0.5770000219345093,0.5789999961853027,0.5759999752044678,0.5720000267028809,0.5770000219345093,0.5759999752044678,0.5789999961853027,0.5789999961853027,0.5730000138282776,0.5789999961853027,0.5759999752044678,0.5690000057220459,0.5849999785423279,0.5759999752044678,0.5699999928474426,0.5789999961853027,0.5820000171661377,0.5730000138282776,0.5730000138282776,0.5789999961853027],"label":"FineWeb: independent MinHash (id 
mh)"},"big-run-sampled_full_filtered_no_dedup":{"x":[0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,29.360128000000003,31.45728,33.554432000000006,35.651584,37.748736,39.845888,41.94304,44.040192000000005,46.137344000000006,48.234496,50.331648,52.4288,54.525952000000004,56.623104000000005,58.720256000000006,60.817408,62.91456,65.011712,67.10886400000001,69.206016,71.303168,73.40032000000001,75.497472,77.59462400000001,79.691776,81.788928,83.88608,85.983232,88.08038400000001,90.177536,92.27468800000001,94.37184,96.468992,98.56614400000001,100.663296,102.76044800000001,104.8576,106.95475200000001,109.05190400000001,111.149056,113.24620800000001,115.34336,117.44051200000001,119.537664,121.634816,123.73196800000001,125.82912,127.92627200000001,130.023424,132.120576,134.21772800000002,136.31488000000002,138.412032,140.509184,142.606336,144.70348800000002,146.80064000000002,148.897792,150.994944,153.092096,155.18924800000002,157.28640000000001,159.383552,161.480704,163.577856,165.67500800000002,167.77216,169.869312,171.966464,174.06361600000002,176.16076800000002,178.25792,180.355072,182.452224,184.54937600000002,186.64652800000002,188.74368,190.840832,192.937984,195.03513600000002,197.13228800000002,199.22944,201.326592,203.423744,205.52089600000002,207.61804800000002,209.7152,211.812352,213.90950400000003,216.00665600000002,218.10380800000001,220.20096,222.298112,224.39526400000003,226.49241600000002,228.589568,230.68672,232.783872,234.88102400000002,236.97817600000002,239.075328,241.17248,243.269632,245.36678400000002,247.46393600000002,249.561088,251.65824,253.75539200000003,255.85254400000002,257.949696,260.046848,262.144,264.241152,266.338304,268.43545600000004,270.53260800000004,272.62976000000003,274.726912,276.824064,278.921216,281.018368,283.11552,285.212672,287.309824,289.40697600000004,291.50412800000004,293.60128000000
003,295.698432,297.795584,299.892736,301.989888,304.08704,306.184192,308.28134400000005,310.37849600000004,312.47564800000004,314.57280000000003,316.669952,318.767104,320.864256,322.961408,325.05856,327.155712,329.25286400000005,331.35001600000004,333.44716800000003,335.54432,337.641472,339.738624,341.835776,343.932928,346.03008,348.12723200000005,350.22438400000004],"y":[0.257999986410141,0.2809999883174896,0.3230000138282776,0.3409999907016754,0.3600000143051147,0.3569999933242798,0.3889999985694885,0.395000010728836,0.4199999868869781,0.4180000126361847,0.421999990940094,0.4289999902248382,0.4350000023841858,0.4359999895095825,0.4350000023841858,0.4480000138282776,0.4480000138282776,0.453000009059906,0.4550000131130218,0.4589999914169311,0.4639999866485595,0.4600000083446502,0.460999995470047,0.4589999914169311,0.481000006198883,0.4769999980926513,0.4709999859333038,0.4740000069141388,0.4679999947547912,0.4790000021457672,0.4729999899864197,0.4819999933242798,0.4850000143051147,0.4819999933242798,0.4819999933242798,0.4880000054836273,0.4869999885559082,0.4959999918937683,0.4850000143051147,0.4959999918937683,0.492000013589859,0.503000020980835,0.4930000007152557,0.5099999904632568,0.5040000081062317,0.5009999871253967,0.4970000088214874,0.4979999959468841,0.5059999823570251,0.5070000290870667,0.5040000081062317,0.5059999823570251,0.5049999952316284,0.5080000162124634,0.5049999952316284,0.5019999742507935,0.5120000243186951,0.5170000195503235,0.5170000195503235,0.5090000033378601,0.5239999890327454,0.527999997138977,0.5230000019073486,0.5210000276565552,0.5149999856948853,0.5189999938011169,0.5270000100135803,0.5149999856948853,0.5099999904632568,0.5299999713897705,0.5199999809265137,0.5230000019073486,0.5260000228881836,0.5249999761581421,0.5239999890327454,0.5329999923706055,0.5210000276565552,0.5260000228881836,0.5170000195503235,0.531000018119812,0.5289999842643738,0.531000018119812,0.5270000100135803,0.5299999713897705,0.5370000004768372,0.5379999876022339,0.
5419999957084656,0.5329999923706055,0.5360000133514404,0.5299999713897705,0.5360000133514404,0.5270000100135803,0.5450000166893005,0.5410000085830688,0.546999990940094,0.5329999923706055,0.5329999923706055,0.5379999876022339,0.5299999713897705,0.5429999828338623,0.5360000133514404,0.5339999794960022,0.5419999957084656,0.5410000085830688,0.5370000004768372,0.5389999747276306,0.527999997138977,0.5400000214576721,0.5400000214576721,0.531000018119812,0.5440000295639038,0.5460000038146973,0.5479999780654907,0.5460000038146973,0.5410000085830688,0.5509999990463257,0.5479999780654907,0.5410000085830688,0.5389999747276306,0.550000011920929,0.5569999814033508,0.550000011920929,0.5490000247955322,0.5490000247955322,0.5569999814033508,0.5519999861717224,0.5479999780654907,0.5559999942779541,0.5550000071525574,0.5460000038146973,0.5540000200271606,0.5460000038146973,0.5460000038146973,0.5509999990463257,0.5460000038146973,0.5550000071525574,0.5479999780654907,0.5479999780654907,0.5540000200271606,0.5550000071525574,0.5529999732971191,0.5529999732971191,0.5509999990463257,0.5509999990463257,0.5419999957084656,0.546999990940094,0.5509999990463257,0.5559999942779541,0.5490000247955322,0.5509999990463257,0.5529999732971191,0.550000011920929,0.5540000200271606,0.5550000071525574,0.5580000281333923,0.550000011920929,0.5569999814033508,0.5490000247955322,0.5519999861717224,0.5519999861717224,0.5559999942779541,0.5569999814033508,0.5559999942779541,0.5550000071525574,0.5559999942779541,0.5490000247955322,0.5550000071525574,0.5600000023841858],"label":"FineWeb: base filtering only"}},"layout":{"title":{"text":"The different FineWeb processing steps"}}}
dist/assets/data/plots/all_filtering_steps/index.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"files":{"agg_score":{"file":"agg_score.json"},"commonsense_qa/acc_norm":{"file":"commonsense_qa_acc_norm.json"},"hellaswag/acc_norm":{"file":"hellaswag_acc_norm.json"},"openbookqa/acc_norm":{"file":"openbookqa_acc_norm.json"},"piqa/acc_norm":{"file":"piqa_acc_norm.json"},"siqa/acc_norm":{"file":"siqa_acc_norm.json"},"winogrande/acc_norm":{"file":"winogrande_acc_norm.json"},"arc/acc_norm":{"file":"arc_acc_norm.json"},"mmlu/acc_norm":{"file":"mmlu_acc_norm.json"}},"settings":{"defaultMetric":"agg_score","slider":{"min":0,"max":30,"default":5}}}
dist/assets/data/plots/all_filtering_steps/mmlu_acc_norm.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"data":{"big-run-sampled_full_ind_minhash":{"x":[0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,29.360128000000003,31.45728,33.554432000000006,35.651584,37.748736,39.845888,41.94304,44.040192000000005,46.137344000000006,48.234496,50.331648,52.4288,54.525952000000004,56.623104000000005,58.720256000000006,60.817408,62.91456,65.011712,67.10886400000001,69.206016,71.303168,73.40032000000001,75.497472,77.59462400000001,79.691776,81.788928,83.88608,85.983232,88.08038400000001,90.177536,92.27468800000001,94.37184,96.468992,98.56614400000001,100.663296,102.76044800000001,104.8576,106.95475200000001,109.05190400000001,111.149056,113.24620800000001,115.34336,117.44051200000001,119.537664,121.634816,123.73196800000001,125.82912,127.92627200000001,130.023424,132.120576,134.21772800000002,136.31488000000002,138.412032,140.509184,142.606336,144.70348800000002,146.80064000000002,148.897792,150.994944,153.092096,155.18924800000002,157.28640000000001,159.383552,161.480704,163.577856,165.67500800000002,167.77216,169.869312,171.966464,174.06361600000002,176.16076800000002,178.25792,180.355072,182.452224,184.54937600000002,186.64652800000002,188.74368,190.840832,192.937984,195.03513600000002,197.13228800000002,199.22944,201.326592,203.423744,205.52089600000002,207.61804800000002,209.7152,211.812352,213.90950400000003,216.00665600000002,218.10380800000001,220.20096,222.298112,224.39526400000003,226.49241600000002,228.589568,230.68672,232.783872,234.88102400000002,236.97817600000002,239.075328,241.17248,243.269632,245.36678400000002,247.46393600000002,249.561088,251.65824,253.75539200000003,255.85254400000002,257.949696,260.046848,262.144,264.241152,266.338304,268.43545600000004,270.53260800000004,272.62976000000003,274.726912,276.824064,278.921216,281.018368,283.11552,285.212672,287.309824,289.40697600000004,291.50412800000004,293.601280000000
03,295.698432,297.795584,299.892736,301.989888,304.08704,306.184192,308.28134400000005,310.37849600000004,312.47564800000004,314.57280000000003,316.669952,318.767104,320.864256,322.961408,325.05856,327.155712,329.25286400000005,331.35001600000004,333.44716800000003,335.54432,337.641472,339.738624,341.835776,343.932928,346.03008,348.12723200000005,350.22438400000004],"y":[0.2501466572284698,0.2558934390544891,0.2618628144264221,0.2683217823505401,0.2699837982654571,0.2738722860813141,0.2744417488574981,0.2740873992443084,0.2807216048240661,0.2820421457290649,0.2891400754451751,0.2879075407981872,0.2881667613983154,0.2892490327358246,0.2935869693756103,0.2870290875434875,0.2911452651023865,0.2949125170707702,0.2916406095027923,0.2981449663639068,0.2953989207744598,0.2946988642215729,0.297021746635437,0.3001497685909271,0.3010218441486358,0.2977036237716675,0.2992585003376007,0.2986803948879242,0.2994338274002075,0.2989781498908996,0.3041955828666687,0.3030496537685394,0.303806334733963,0.3036351203918457,0.3058845102787018,0.300450712442398,0.3025284707546234,0.3072526752948761,0.3039065897464752,0.3073755502700805,0.3070493042469024,0.3083153367042541,0.3123056292533874,0.307761400938034,0.3053378164768219,0.3116358816623688,0.3080427348613739,0.308482676744461,0.307318776845932,0.3083004653453827,0.3089516758918762,0.3088736236095428,0.3077724277973175,0.3126304149627685,0.3101697862148285,0.3159398734569549,0.314792275428772,0.3103811144828796,0.3111368715763092,0.3129658997058868,0.311605304479599,0.3118223249912262,0.3133279979228973,0.3146496713161468,0.3195074200630188,0.3142614662647247,0.3125102519989013,0.3115333616733551,0.3183117806911468,0.3168580532073974,0.3187012672424316,0.3179306983947754,0.3157722651958465,0.3214826583862304,0.3145081698894501,0.3172421753406524,0.3151432573795318,0.3181649446487427,0.3180212080478668,0.3171605765819549,0.3212067782878876,0.3180184066295624,0.3209905624389648,0.319052129983902,0.3212707936763763,0.3196887373924255,0
.3188316226005554,0.3164899051189422,0.3241994678974151,0.3179469406604767,0.3214083909988403,0.3206575512886047,0.3263285160064697,0.3219505250453949,0.3181525468826294,0.3219776451587677,0.3259726762771606,0.3197665512561798,0.3236161768436432,0.3177970349788666,0.3258080780506134,0.3208407461643219,0.3251138925552368,0.3242645859718323,0.3229723274707794,0.3227455914020538,0.3206316232681274,0.3256695866584778,0.3241210877895355,0.3224890530109405,0.3263737261295318,0.3214233517646789,0.3240345120429992,0.3222567737102508,0.3242291808128357,0.3257078528404236,0.3278365731239319,0.3277338743209839,0.3253948092460632,0.3232105076313019,0.3267974853515625,0.3263654410839081,0.3262891769409179,0.3238334357738495,0.3294911682605743,0.3261866867542267,0.3243315815925598,0.3250119090080261,0.326727420091629,0.3268802464008331,0.3269768059253692,0.3257980346679687,0.3280686736106872,0.3274897634983063,0.3282252252101898,0.3272863030433655,0.328346699476242,0.325562834739685,0.3301684856414795,0.3284023404121399,0.3268299400806427,0.3286610245704651,0.3291078805923462,0.324972927570343,0.3314772248268127,0.3278062343597412,0.326839417219162,0.3277239501476288,0.330414742231369,0.3271744549274444,0.3279334008693695,0.3288575112819671,0.3285425007343292,0.3282454907894134,0.3296376466751098,0.3305942714214325,0.3276287615299225,0.3292438983917236,0.329515129327774,0.3281475007534027,0.3282177448272705,0.3333999514579773,0.3302631080150604,0.330238401889801,0.3323166668415069,0.3313035368919372,0.32961106300354,0.3321967124938965],"label":"FineWeb: independent MinHash (id 
mh)"},"big-run-fineweb-v1-all-dumps":{"x":[0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,29.360128000000003,31.45728,33.554432000000006,35.651584,37.748736,39.845888,41.94304,44.040192000000005,46.137344000000006,48.234496,50.331648,52.4288,54.525952000000004,56.623104000000005,58.720256000000006,60.817408,62.91456,65.011712,67.10886400000001,69.206016,71.303168,73.40032000000001,75.497472,77.59462400000001,79.691776,81.788928,83.88608,85.983232,88.08038400000001,90.177536,92.27468800000001,94.37184,96.468992,98.56614400000001,100.663296,102.76044800000001,104.8576,106.95475200000001,109.05190400000001,111.149056,113.24620800000001,115.34336,117.44051200000001,119.537664,121.634816,123.73196800000001,125.82912,127.92627200000001,130.023424,132.120576,134.21772800000002,136.31488000000002,138.412032,140.509184,142.606336,144.70348800000002,146.80064000000002,148.897792,150.994944,153.092096,155.18924800000002,157.28640000000001,159.383552,161.480704,163.577856,165.67500800000002,167.77216,169.869312,171.966464,174.06361600000002,176.16076800000002,178.25792,180.355072,182.452224,184.54937600000002,186.64652800000002,188.74368,190.840832,192.937984,195.03513600000002,197.13228800000002,199.22944,201.326592,203.423744,205.52089600000002,207.61804800000002,209.7152,211.812352,213.90950400000003,216.00665600000002,218.10380800000001,220.20096,222.298112,224.39526400000003,226.49241600000002,228.589568,230.68672,232.783872,234.88102400000002,236.97817600000002,239.075328,241.17248,243.269632,245.36678400000002,247.46393600000002,249.561088,251.65824,253.75539200000003,255.85254400000002,257.949696,260.046848,262.144,264.241152,266.338304,268.43545600000004,270.53260800000004,272.62976000000003,274.726912,276.824064,278.921216,281.018368,283.11552,285.212672,287.309824,289.40697600000004,291.50412800000004,293.60128000000003,295.69
8432,297.795584,299.892736,301.989888,304.08704,306.184192,308.28134400000005,310.37849600000004,312.47564800000004,314.57280000000003,316.669952,318.767104,320.864256,322.961408,325.05856,327.155712,329.25286400000005,331.35001600000004,333.44716800000003,335.54432,337.641472,339.738624,341.835776,343.932928,346.03008,348.12723200000005,350.22438400000004],"y":[0.2501466572284698,0.2562687695026397,0.264194518327713,0.2659797668457031,0.2690401375293731,0.2707462012767792,0.2736803293228149,0.2808477580547333,0.2819793820381164,0.2818062305450439,0.2852273285388946,0.2852552533149719,0.293150246143341,0.2869345247745514,0.2926198840141296,0.2911646664142608,0.2883031964302063,0.2938489317893982,0.2923268675804138,0.2927436530590057,0.2957125902175903,0.2942458391189575,0.2957732379436493,0.2933609783649444,0.2939628064632416,0.2984270751476288,0.2989151179790497,0.3007727265357971,0.2968312501907348,0.2969468235969543,0.3013020753860473,0.3045085966587066,0.3018752634525299,0.3014349043369293,0.2988792657852173,0.3034284710884094,0.3015728890895843,0.3065252900123596,0.3021449446678161,0.3043071627616882,0.303546279668808,0.3056059181690216,0.2971993386745453,0.3057574033737182,0.3057517111301422,0.3124973773956299,0.3139103651046753,0.3144983947277069,0.3126215636730194,0.3140240907669067,0.3113631308078766,0.3124240636825561,0.3126817643642425,0.3123457431793213,0.3111095428466797,0.3113269805908203,0.3142518699169159,0.3163851797580719,0.3134008049964905,0.3138530254364013,0.3171449303627014,0.3119543790817261,0.3147956132888794,0.3138984441757202,0.3178529143333435,0.3162296414375305,0.315980851650238,0.3123161196708679,0.3166452944278717,0.3140694200992584,0.3176922798156738,0.3176673054695129,0.3150016367435455,0.3161586821079254,0.3222477436065674,0.3194025754928589,0.3176416158676147,0.3159928619861603,0.3169592320919037,0.3135637938976288,0.3155058920383453,0.3215300440788269,0.3201274275779724,0.3192023932933807,0.3156079053878784,0.3212503492832184,0.316
3617849349975,0.3223940432071686,0.3191330432891845,0.3194314539432525,0.3221519589424133,0.3211863040924072,0.3197937309741974,0.3174488544464111,0.3159596025943756,0.3157133460044861,0.3193388879299164,0.3163386285305023,0.3202225565910339,0.3163421154022217,0.3212694227695465,0.3187369704246521,0.3203508555889129,0.3224054872989654,0.3207881152629852,0.3219418525695801,0.3197605609893799,0.3255409598350525,0.3253240585327148,0.319698303937912,0.3250498473644256,0.3228228390216827,0.3213794529438019,0.3219127357006073,0.3214426934719085,0.3238218128681183,0.3229665458202362,0.3220484256744385,0.3240038454532623,0.3246393501758575,0.3237775564193725,0.3258441984653473,0.322843462228775,0.3241913020610809,0.324148565530777,0.3238157927989959,0.3248989582061767,0.3280864655971527,0.3288898766040802,0.3265794515609741,0.3277602791786194,0.3231202363967895,0.3224002718925476,0.323845773935318,0.3278093039989471,0.3247094452381134,0.3289697468280792,0.3272296786308288,0.3275051414966583,0.3271359801292419,0.3280861675739288,0.3281281590461731,0.327859491109848,0.3281152546405792,0.3282515406608581,0.3258990049362182,0.3271094560623169,0.3259278535842895,0.3258941769599914,0.3278749883174896,0.3300504386425018,0.326113760471344,0.3242938220500946,0.3262194991111755,0.3263693153858185,0.3274452090263366,0.3254594206809997,0.3287247717380523,0.3250340223312378,0.3270816206932068,0.3275731801986694,0.3282500207424164,0.3257671594619751,0.3272948265075683,0.3274084031581878,0.3302212655544281,0.3322067260742187,0.3296935856342315],"label":"FineWeb: id mh + C4 + custom 
filters"},"big-run-sampled-fineweb-c4-filters":{"x":[0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,29.360128000000003,31.45728,33.554432000000006,35.651584,37.748736,39.845888,41.94304,44.040192000000005,46.137344000000006,48.234496,50.331648,52.4288,54.525952000000004,56.623104000000005,58.720256000000006,60.817408,62.91456,65.011712,67.10886400000001,69.206016,71.303168,73.40032000000001,75.497472,77.59462400000001,79.691776,81.788928,83.88608,85.983232,88.08038400000001,90.177536,92.27468800000001,94.37184,96.468992,98.56614400000001,100.663296,102.76044800000001,104.8576,106.95475200000001,109.05190400000001,111.149056,113.24620800000001,115.34336,117.44051200000001,119.537664,121.634816,123.73196800000001,125.82912,127.92627200000001,130.023424,132.120576,134.21772800000002,136.31488000000002,138.412032,140.509184,142.606336,144.70348800000002,146.80064000000002,148.897792,150.994944,153.092096,155.18924800000002,157.28640000000001,159.383552,161.480704,163.577856,165.67500800000002,167.77216,169.869312,171.966464,174.06361600000002,176.16076800000002,178.25792,180.355072,182.452224,184.54937600000002,186.64652800000002,188.74368,190.840832,192.937984,195.03513600000002,197.13228800000002,199.22944,201.326592,203.423744,205.52089600000002,207.61804800000002,209.7152,211.812352,213.90950400000003,216.00665600000002,218.10380800000001,220.20096,222.298112,224.39526400000003,226.49241600000002,228.589568,230.68672,232.783872,234.88102400000002,236.97817600000002,239.075328,241.17248,243.269632,245.36678400000002,247.46393600000002,249.561088,251.65824,253.75539200000003,255.85254400000002,257.949696,260.046848,262.144,264.241152,266.338304,268.43545600000004,270.53260800000004,272.62976000000003,274.726912,276.824064,278.921216,281.018368,283.11552,285.212672,287.309824,289.40697600000004,291.50412800000004,293.60128000000
003,295.698432,297.795584,299.892736,301.989888,304.08704,306.184192,308.28134400000005,310.37849600000004,312.47564800000004,314.57280000000003,316.669952,318.767104,320.864256,322.961408,325.05856,327.155712,329.25286400000005,331.35001600000004,333.44716800000003,335.54432,337.641472,339.738624,341.835776,343.932928,346.03008,348.12723200000005,350.22438400000004],"y":[0.2501466572284698,0.251920074224472,0.2591456174850464,0.2687398791313171,0.269056499004364,0.2683902382850647,0.2725079655647278,0.2752586305141449,0.2753303050994873,0.2848396897315979,0.2833426892757416,0.2844280302524566,0.2847303748130798,0.294879138469696,0.2900991439819336,0.2932447791099548,0.2926276624202728,0.2924879789352417,0.2937914729118347,0.2919517457485199,0.2991980910301208,0.2929336428642273,0.3003193736076355,0.2955676615238189,0.2993899285793304,0.2975476682186126,0.2978275716304779,0.2994768321514129,0.2984132170677185,0.2998209595680237,0.3030214607715606,0.2984272837638855,0.2997688949108124,0.3041917085647583,0.3071142137050628,0.3038201630115509,0.3035272359848022,0.3047704994678497,0.3072718679904938,0.3085931539535522,0.3052116930484772,0.3084307312965393,0.3089986145496368,0.3102100193500519,0.3066395819187164,0.3109234273433685,0.3082580268383026,0.3055950105190277,0.3064471781253814,0.3052197098731994,0.3076579868793487,0.3114514350891113,0.3092200756072998,0.3083749115467071,0.3078047931194305,0.3102362751960754,0.3083966672420501,0.3149019181728363,0.3096835613250732,0.3129985630512237,0.3098655939102173,0.3105471730232239,0.3110797703266144,0.3097324073314667,0.3102588951587677,0.3108883202075958,0.3140653371810913,0.3143481016159057,0.3121947944164276,0.3064004778861999,0.3148718774318695,0.3152956068515777,0.3166911900043487,0.3115324079990387,0.311627596616745,0.3122025728225708,0.3186626732349396,0.3177326321601867,0.3107803463935852,0.3128083050251007,0.3109799027442932,0.3142008483409881,0.3121736049652099,0.3163987696170807,0.3134956955909729,0.315248608589
1723,0.3163009285926819,0.3165099024772644,0.3186413049697876,0.315637856721878,0.3207236230373382,0.3161193430423736,0.3157550990581512,0.320356547832489,0.3172537386417389,0.318843811750412,0.3146925568580627,0.3193819522857666,0.3169102966785431,0.3154685497283935,0.3166710138320923,0.3187788426876068,0.3196616470813751,0.3190047442913055,0.3186605274677276,0.3162576556205749,0.3164195120334625,0.3191222250461578,0.3135613799095154,0.3175683617591858,0.3212282657623291,0.3279584646224975,0.3228197395801544,0.3242316544055938,0.3254729807376861,0.3239398598670959,0.3223652243614197,0.3198129832744598,0.3218621611595154,0.3264771103858948,0.323866069316864,0.32564178109169,0.32478728890419,0.3236158192157745,0.3245747685432434,0.3280244767665863,0.3271372020244598,0.3254362642765045,0.3266178965568542,0.3218266665935516,0.3268883228302002,0.321928471326828,0.324524849653244,0.3237947523593902,0.3238577842712402,0.3237817287445068,0.3233639299869537,0.325821191072464,0.3257157802581787,0.3272253274917602,0.3244009912014007,0.3231483995914459,0.3226592242717743,0.3233656585216522,0.3266710937023163,0.3259218335151672,0.3275097906589508,0.3273427188396454,0.3276328444480896,0.3251460194587707,0.3274493515491485,0.3227463960647583,0.3261785507202148,0.32408007979393,0.3253126442432403,0.3242971301078796,0.326819509267807,0.3268508613109588,0.3265140950679779,0.3266753256320953,0.3250673115253448,0.3271500170230865,0.3292337656021118,0.3286773562431335,0.3286141455173492,0.3296676576137543,0.3257955610752105,0.3266003727912903],"label":"FineWeb: id mh + C4 
filters"},"big-run-sampled_full_filtered_no_dedup":{"x":[0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,29.360128000000003,31.45728,33.554432000000006,35.651584,37.748736,39.845888,41.94304,44.040192000000005,46.137344000000006,48.234496,50.331648,52.4288,54.525952000000004,56.623104000000005,58.720256000000006,60.817408,62.91456,65.011712,67.10886400000001,69.206016,71.303168,73.40032000000001,75.497472,77.59462400000001,79.691776,81.788928,83.88608,85.983232,88.08038400000001,90.177536,92.27468800000001,94.37184,96.468992,98.56614400000001,100.663296,102.76044800000001,104.8576,106.95475200000001,109.05190400000001,111.149056,113.24620800000001,115.34336,117.44051200000001,119.537664,121.634816,123.73196800000001,125.82912,127.92627200000001,130.023424,132.120576,134.21772800000002,136.31488000000002,138.412032,140.509184,142.606336,144.70348800000002,146.80064000000002,148.897792,150.994944,153.092096,155.18924800000002,157.28640000000001,159.383552,161.480704,163.577856,165.67500800000002,167.77216,169.869312,171.966464,174.06361600000002,176.16076800000002,178.25792,180.355072,182.452224,184.54937600000002,186.64652800000002,188.74368,190.840832,192.937984,195.03513600000002,197.13228800000002,199.22944,201.326592,203.423744,205.52089600000002,207.61804800000002,209.7152,211.812352,213.90950400000003,216.00665600000002,218.10380800000001,220.20096,222.298112,224.39526400000003,226.49241600000002,228.589568,230.68672,232.783872,234.88102400000002,236.97817600000002,239.075328,241.17248,243.269632,245.36678400000002,247.46393600000002,249.561088,251.65824,253.75539200000003,255.85254400000002,257.949696,260.046848,262.144,264.241152,266.338304,268.43545600000004,270.53260800000004,272.62976000000003,274.726912,276.824064,278.921216,281.018368,283.11552,285.212672,287.309824,289.40697600000004,291.50412800000004,293.6012800
0000003,295.698432,297.795584,299.892736,301.989888,304.08704,306.184192,308.28134400000005,310.37849600000004,312.47564800000004,314.57280000000003,316.669952,318.767104,320.864256,322.961408,325.05856,327.155712,329.25286400000005,331.35001600000004,333.44716800000003,335.54432,337.641472,339.738624,341.835776,343.932928,346.03008,348.12723200000005,350.22438400000004],"y":[0.2501466572284698,0.2516599297523498,0.2610189318656921,0.2666046619415283,0.2667981088161468,0.2667821645736694,0.2708088159561157,0.2738403379917145,0.2726235687732696,0.2762763500213623,0.2768311202526092,0.2809228301048279,0.2836140990257263,0.2822815179824829,0.2797218561172485,0.286342591047287,0.2855269610881805,0.2847287058830261,0.2888180613517761,0.286526083946228,0.2865165770053863,0.294582188129425,0.2925947606563568,0.2947863042354584,0.2892930805683136,0.2903610467910766,0.288201242685318,0.2873396277427673,0.2916238009929657,0.2908017039299011,0.2907920777797699,0.2952797412872314,0.2941452264785766,0.2921333611011505,0.2925891280174255,0.2968584895133972,0.2980035543441772,0.2964116632938385,0.2962304651737213,0.2950254380702972,0.2977516651153564,0.2944138348102569,0.3003402054309845,0.2976303696632385,0.3013098239898681,0.302829384803772,0.3018766045570373,0.305361807346344,0.2971298694610595,0.3014816343784332,0.3019805550575256,0.3037064969539642,0.2970167994499206,0.2995208501815796,0.2970106601715088,0.2990955114364624,0.3027818500995636,0.3048534691333771,0.2993872463703155,0.2986327707767486,0.3015393316745758,0.3003426790237427,0.3003274798393249,0.3017795085906982,0.3019182682037353,0.3015450537204742,0.3046211004257202,0.3031167984008789,0.3020436763763428,0.3011128306388855,0.3029948472976684,0.3045558631420135,0.301642894744873,0.3029441833496094,0.3035804331302643,0.3004390001296997,0.3021787703037262,0.306041270494461,0.3064048886299133,0.3087956011295318,0.3070018291473388,0.3065581619739532,0.3093871772289276,0.3060930073261261,0.3033313155174255,0.307277739048
0041,0.306413859128952,0.3104493916034698,0.3056999444961548,0.3077532052993774,0.309231549501419,0.3070645034313202,0.3117790520191192,0.3114112913608551,0.312661737203598,0.3181777000427246,0.3117201030254364,0.3099702894687652,0.3074746131896972,0.3064963519573211,0.3105958700180053,0.3111456036567688,0.3084964454174042,0.3087405860424042,0.3121673166751861,0.3121528625488281,0.3100416660308838,0.3142979145050049,0.3129935264587402,0.3112611472606659,0.3119436800479889,0.3154115974903106,0.3091593086719513,0.3103814721107483,0.3130497634410858,0.3133455514907837,0.3152708411216736,0.3137963414192199,0.3099324703216553,0.3164172768592834,0.3133907914161682,0.3128255009651184,0.3134104907512665,0.3106969892978668,0.3130004107952118,0.3131391704082489,0.3130116462707519,0.3143952488899231,0.3143975436687469,0.3143710494041443,0.3163396418094635,0.3166862726211548,0.3184126019477844,0.3178988993167877,0.317479133605957,0.3184944093227386,0.316694974899292,0.3176258206367492,0.3182629346847534,0.3200214207172394,0.3181648552417755,0.320680022239685,0.3178716897964477,0.3182425796985626,0.3182984292507171,0.3158398568630218,0.3152642548084259,0.3132680356502533,0.3178914785385132,0.3156660795211792,0.3161703050136566,0.3176451921463012,0.3173815906047821,0.3194171786308288,0.3193057179450989,0.3172560334205627,0.317656546831131,0.3155770003795624,0.3199106156826019,0.3170182108879089,0.3156754970550537,0.3180731236934662,0.3205638229846954,0.3175432682037353,0.3184471428394317,0.3192788958549499,0.3197042346000671,0.3177168369293213],"label":"FineWeb: base filtering only"}},"layout":{"title":{"text":"The different FineWeb processing steps"}}}
dist/assets/data/plots/all_filtering_steps/openbookqa_acc_norm.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"data":{"big-run-fineweb-v1-all-dumps":{"x":[0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,29.360128000000003,31.45728,33.554432000000006,35.651584,37.748736,39.845888,41.94304,44.040192000000005,46.137344000000006,48.234496,50.331648,52.4288,54.525952000000004,56.623104000000005,58.720256000000006,60.817408,62.91456,65.011712,67.10886400000001,69.206016,71.303168,73.40032000000001,75.497472,77.59462400000001,79.691776,81.788928,83.88608,85.983232,88.08038400000001,90.177536,92.27468800000001,94.37184,96.468992,98.56614400000001,100.663296,102.76044800000001,104.8576,106.95475200000001,109.05190400000001,111.149056,113.24620800000001,115.34336,117.44051200000001,119.537664,121.634816,123.73196800000001,125.82912,127.92627200000001,130.023424,132.120576,134.21772800000002,136.31488000000002,138.412032,140.509184,142.606336,144.70348800000002,146.80064000000002,148.897792,150.994944,153.092096,155.18924800000002,157.28640000000001,159.383552,161.480704,163.577856,165.67500800000002,167.77216,169.869312,171.966464,174.06361600000002,176.16076800000002,178.25792,180.355072,182.452224,184.54937600000002,186.64652800000002,188.74368,190.840832,192.937984,195.03513600000002,197.13228800000002,199.22944,201.326592,203.423744,205.52089600000002,207.61804800000002,209.7152,211.812352,213.90950400000003,216.00665600000002,218.10380800000001,220.20096,222.298112,224.39526400000003,226.49241600000002,228.589568,230.68672,232.783872,234.88102400000002,236.97817600000002,239.075328,241.17248,243.269632,245.36678400000002,247.46393600000002,249.561088,251.65824,253.75539200000003,255.85254400000002,257.949696,260.046848,262.144,264.241152,266.338304,268.43545600000004,270.53260800000004,272.62976000000003,274.726912,276.824064,278.921216,281.018368,283.11552,285.212672,287.309824,289.40697600000004,291.50412800000004,293.60128000000003,2
95.698432,297.795584,299.892736,301.989888,304.08704,306.184192,308.28134400000005,310.37849600000004,312.47564800000004,314.57280000000003,316.669952,318.767104,320.864256,322.961408,325.05856,327.155712,329.25286400000005,331.35001600000004,333.44716800000003,335.54432,337.641472,339.738624,341.835776,343.932928,346.03008,348.12723200000005,350.22438400000004],"y":[0.2860000133514404,0.2440000027418136,0.270000010728836,0.2720000147819519,0.3000000119209289,0.2919999957084656,0.3160000145435333,0.3160000145435333,0.3179999887943268,0.3199999928474426,0.3440000116825104,0.3179999887943268,0.3240000009536743,0.3300000131130218,0.3240000009536743,0.3199999928474426,0.335999995470047,0.3339999914169311,0.3440000116825104,0.3459999859333038,0.3400000035762787,0.3440000116825104,0.335999995470047,0.3379999995231628,0.3519999980926513,0.3379999995231628,0.3420000076293945,0.3319999873638153,0.3479999899864197,0.3459999859333038,0.3339999914169311,0.3440000116825104,0.3420000076293945,0.3219999969005584,0.3319999873638153,0.3479999899864197,0.3379999995231628,0.335999995470047,0.3499999940395355,0.3420000076293945,0.3319999873638153,0.3400000035762787,0.3400000035762787,0.3519999980926513,0.3479999899864197,0.3379999995231628,0.335999995470047,0.3400000035762787,0.3319999873638153,0.3580000102519989,0.3499999940395355,0.3700000047683716,0.3680000007152557,0.335999995470047,0.3600000143051147,0.3499999940395355,0.356000006198883,0.3499999940395355,0.356000006198883,0.3619999885559082,0.363999992609024,0.3519999980926513,0.3540000021457672,0.3600000143051147,0.3600000143051147,0.3540000021457672,0.356000006198883,0.363999992609024,0.363999992609024,0.3499999940395355,0.3659999966621399,0.356000006198883,0.363999992609024,0.3540000021457672,0.3540000021457672,0.3619999885559082,0.3740000128746032,0.3519999980926513,0.3659999966621399,0.3680000007152557,0.3700000047683716,0.3580000102519989,0.3499999940395355,0.3740000128746032,0.3659999966621399,0.3659999966621399,0.35800001
02519989,0.3479999899864197,0.363999992609024,0.3519999980926513,0.3580000102519989,0.356000006198883,0.3740000128746032,0.363999992609024,0.3700000047683716,0.363999992609024,0.3700000047683716,0.363999992609024,0.3799999952316284,0.3860000073909759,0.3680000007152557,0.3779999911785126,0.3740000128746032,0.3600000143051147,0.3659999966621399,0.3680000007152557,0.3619999885559082,0.3700000047683716,0.3759999871253967,0.363999992609024,0.3740000128746032,0.3799999952316284,0.3779999911785126,0.3659999966621399,0.3600000143051147,0.3740000128746032,0.3600000143051147,0.363999992609024,0.363999992609024,0.363999992609024,0.3779999911785126,0.3700000047683716,0.3799999952316284,0.3720000088214874,0.3819999992847442,0.3759999871253967,0.3799999952316284,0.3740000128746032,0.3860000073909759,0.3779999911785126,0.3959999978542328,0.3880000114440918,0.3799999952316284,0.3860000073909759,0.3759999871253967,0.3939999938011169,0.3779999911785126,0.3959999978542328,0.3779999911785126,0.3899999856948852,0.3860000073909759,0.3959999978542328,0.3759999871253967,0.3720000088214874,0.3799999952316284,0.3740000128746032,0.3759999871253967,0.3799999952316284,0.3819999992847442,0.3840000033378601,0.3720000088214874,0.363999992609024,0.3840000033378601,0.3919999897480011,0.3819999992847442,0.3819999992847442,0.3779999911785126,0.3799999952316284,0.3840000033378601,0.3819999992847442,0.3899999856948852,0.3860000073909759,0.3819999992847442,0.3840000033378601,0.3720000088214874,0.3799999952316284,0.3819999992847442,0.3959999978542328],"label":"FineWeb: id mh + C4 + custom 
filters"},"big-run-sampled-fineweb-c4-filters":{"x":[0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,29.360128000000003,31.45728,33.554432000000006,35.651584,37.748736,39.845888,41.94304,44.040192000000005,46.137344000000006,48.234496,50.331648,52.4288,54.525952000000004,56.623104000000005,58.720256000000006,60.817408,62.91456,65.011712,67.10886400000001,69.206016,71.303168,73.40032000000001,75.497472,77.59462400000001,79.691776,81.788928,83.88608,85.983232,88.08038400000001,90.177536,92.27468800000001,94.37184,96.468992,98.56614400000001,100.663296,102.76044800000001,104.8576,106.95475200000001,109.05190400000001,111.149056,113.24620800000001,115.34336,117.44051200000001,119.537664,121.634816,123.73196800000001,125.82912,127.92627200000001,130.023424,132.120576,134.21772800000002,136.31488000000002,138.412032,140.509184,142.606336,144.70348800000002,146.80064000000002,148.897792,150.994944,153.092096,155.18924800000002,157.28640000000001,159.383552,161.480704,163.577856,165.67500800000002,167.77216,169.869312,171.966464,174.06361600000002,176.16076800000002,178.25792,180.355072,182.452224,184.54937600000002,186.64652800000002,188.74368,190.840832,192.937984,195.03513600000002,197.13228800000002,199.22944,201.326592,203.423744,205.52089600000002,207.61804800000002,209.7152,211.812352,213.90950400000003,216.00665600000002,218.10380800000001,220.20096,222.298112,224.39526400000003,226.49241600000002,228.589568,230.68672,232.783872,234.88102400000002,236.97817600000002,239.075328,241.17248,243.269632,245.36678400000002,247.46393600000002,249.561088,251.65824,253.75539200000003,255.85254400000002,257.949696,260.046848,262.144,264.241152,266.338304,268.43545600000004,270.53260800000004,272.62976000000003,274.726912,276.824064,278.921216,281.018368,283.11552,285.212672,287.309824,289.40697600000004,291.50412800000004,293.60128000000
003,295.698432,297.795584,299.892736,301.989888,304.08704,306.184192,308.28134400000005,310.37849600000004,312.47564800000004,314.57280000000003,316.669952,318.767104,320.864256,322.961408,325.05856,327.155712,329.25286400000005,331.35001600000004,333.44716800000003,335.54432,337.641472,339.738624,341.835776,343.932928,346.03008,348.12723200000005,350.22438400000004],"y":[0.2860000133514404,0.2759999930858612,0.2739999890327453,0.2800000011920929,0.2879999876022339,0.3179999887943268,0.3019999861717224,0.3000000119209289,0.3240000009536743,0.3100000023841858,0.3140000104904175,0.3260000050067901,0.3260000050067901,0.3339999914169311,0.328000009059906,0.335999995470047,0.3319999873638153,0.3379999995231628,0.3519999980926513,0.3420000076293945,0.3440000116825104,0.3379999995231628,0.3420000076293945,0.3499999940395355,0.3420000076293945,0.3420000076293945,0.3499999940395355,0.3300000131130218,0.3459999859333038,0.3379999995231628,0.3400000035762787,0.3440000116825104,0.3319999873638153,0.3339999914169311,0.3459999859333038,0.3459999859333038,0.335999995470047,0.3379999995231628,0.3479999899864197,0.3540000021457672,0.3479999899864197,0.3420000076293945,0.3600000143051147,0.3499999940395355,0.3459999859333038,0.3600000143051147,0.335999995470047,0.3400000035762787,0.3619999885559082,0.3619999885559082,0.3580000102519989,0.3459999859333038,0.363999992609024,0.3479999899864197,0.356000006198883,0.3420000076293945,0.3619999885559082,0.3479999899864197,0.356000006198883,0.363999992609024,0.356000006198883,0.3700000047683716,0.356000006198883,0.356000006198883,0.3600000143051147,0.3680000007152557,0.3519999980926513,0.363999992609024,0.3540000021457672,0.3600000143051147,0.3600000143051147,0.3580000102519989,0.3680000007152557,0.3459999859333038,0.356000006198883,0.3740000128746032,0.356000006198883,0.3580000102519989,0.3479999899864197,0.3540000021457672,0.3499999940395355,0.3580000102519989,0.3540000021457672,0.3659999966621399,0.3680000007152557,0.356000006198883,0.3600
000143051147,0.3519999980926513,0.363999992609024,0.3440000116825104,0.3659999966621399,0.3580000102519989,0.363999992609024,0.3600000143051147,0.3759999871253967,0.363999992609024,0.3700000047683716,0.3619999885559082,0.3600000143051147,0.3720000088214874,0.3779999911785126,0.3680000007152557,0.3759999871253967,0.3819999992847442,0.363999992609024,0.3740000128746032,0.363999992609024,0.3619999885559082,0.3700000047683716,0.3680000007152557,0.3659999966621399,0.3700000047683716,0.3759999871253967,0.363999992609024,0.3720000088214874,0.3700000047683716,0.3619999885559082,0.3680000007152557,0.3799999952316284,0.3659999966621399,0.3740000128746032,0.3740000128746032,0.3740000128746032,0.3659999966621399,0.3700000047683716,0.3779999911785126,0.3720000088214874,0.3700000047683716,0.3860000073909759,0.3759999871253967,0.3659999966621399,0.3680000007152557,0.3680000007152557,0.3700000047683716,0.3700000047683716,0.3799999952316284,0.3860000073909759,0.3840000033378601,0.3899999856948852,0.3720000088214874,0.3939999938011169,0.3700000047683716,0.3779999911785126,0.3860000073909759,0.3720000088214874,0.3700000047683716,0.3759999871253967,0.3799999952316284,0.3840000033378601,0.3759999871253967,0.3720000088214874,0.3759999871253967,0.3779999911785126,0.3880000114440918,0.3799999952316284,0.3759999871253967,0.3840000033378601,0.3759999871253967,0.3720000088214874,0.3779999911785126,0.3700000047683716,0.3799999952316284,0.3799999952316284,0.3860000073909759,0.3799999952316284,0.3779999911785126,0.3740000128746032,0.3779999911785126],"label":"FineWeb: id mh + C4 
filters"},"big-run-sampled_full_ind_minhash":{"x":[0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,29.360128000000003,31.45728,33.554432000000006,35.651584,37.748736,39.845888,41.94304,44.040192000000005,46.137344000000006,48.234496,50.331648,52.4288,54.525952000000004,56.623104000000005,58.720256000000006,60.817408,62.91456,65.011712,67.10886400000001,69.206016,71.303168,73.40032000000001,75.497472,77.59462400000001,79.691776,81.788928,83.88608,85.983232,88.08038400000001,90.177536,92.27468800000001,94.37184,96.468992,98.56614400000001,100.663296,102.76044800000001,104.8576,106.95475200000001,109.05190400000001,111.149056,113.24620800000001,115.34336,117.44051200000001,119.537664,121.634816,123.73196800000001,125.82912,127.92627200000001,130.023424,132.120576,134.21772800000002,136.31488000000002,138.412032,140.509184,142.606336,144.70348800000002,146.80064000000002,148.897792,150.994944,153.092096,155.18924800000002,157.28640000000001,159.383552,161.480704,163.577856,165.67500800000002,167.77216,169.869312,171.966464,174.06361600000002,176.16076800000002,178.25792,180.355072,182.452224,184.54937600000002,186.64652800000002,188.74368,190.840832,192.937984,195.03513600000002,197.13228800000002,199.22944,201.326592,203.423744,205.52089600000002,207.61804800000002,209.7152,211.812352,213.90950400000003,216.00665600000002,218.10380800000001,220.20096,222.298112,224.39526400000003,226.49241600000002,228.589568,230.68672,232.783872,234.88102400000002,236.97817600000002,239.075328,241.17248,243.269632,245.36678400000002,247.46393600000002,249.561088,251.65824,253.75539200000003,255.85254400000002,257.949696,260.046848,262.144,264.241152,266.338304,268.43545600000004,270.53260800000004,272.62976000000003,274.726912,276.824064,278.921216,281.018368,283.11552,285.212672,287.309824,289.40697600000004,291.50412800000004,293.6012800000000
3,295.698432,297.795584,299.892736,301.989888,304.08704,306.184192,308.28134400000005,310.37849600000004,312.47564800000004,314.57280000000003,316.669952,318.767104,320.864256,322.961408,325.05856,327.155712,329.25286400000005,331.35001600000004,333.44716800000003,335.54432,337.641472,339.738624,341.835776,343.932928,346.03008,348.12723200000005,350.22438400000004],"y":[0.2860000133514404,0.2660000026226043,0.277999997138977,0.2820000052452087,0.3079999983310699,0.3140000104904175,0.3260000050067901,0.3039999902248382,0.3319999873638153,0.3240000009536743,0.3199999928474426,0.3379999995231628,0.3339999914169311,0.3319999873638153,0.3319999873638153,0.3219999969005584,0.3319999873638153,0.3379999995231628,0.3199999928474426,0.3179999887943268,0.3400000035762787,0.3219999969005584,0.335999995470047,0.3339999914169311,0.3420000076293945,0.3240000009536743,0.3440000116825104,0.3420000076293945,0.3379999995231628,0.3459999859333038,0.328000009059906,0.3420000076293945,0.3459999859333038,0.3479999899864197,0.3379999995231628,0.356000006198883,0.3379999995231628,0.3440000116825104,0.3400000035762787,0.3379999995231628,0.3499999940395355,0.3540000021457672,0.3479999899864197,0.3479999899864197,0.3440000116825104,0.3459999859333038,0.3440000116825104,0.3519999980926513,0.356000006198883,0.3600000143051147,0.3379999995231628,0.356000006198883,0.3400000035762787,0.3519999980926513,0.3479999899864197,0.3479999899864197,0.3400000035762787,0.3459999859333038,0.3519999980926513,0.3440000116825104,0.3400000035762787,0.356000006198883,0.3420000076293945,0.356000006198883,0.3540000021457672,0.3600000143051147,0.3339999914169311,0.3499999940395355,0.3580000102519989,0.3440000116825104,0.3479999899864197,0.3580000102519989,0.3519999980926513,0.3339999914169311,0.3540000021457672,0.3459999859333038,0.3459999859333038,0.3400000035762787,0.356000006198883,0.356000006198883,0.3420000076293945,0.3420000076293945,0.3400000035762787,0.3479999899864197,0.3519999980926513,0.3319999873638153,0.3
580000102519989,0.356000006198883,0.356000006198883,0.3499999940395355,0.3479999899864197,0.3400000035762787,0.3440000116825104,0.3339999914169311,0.3379999995231628,0.3479999899864197,0.3680000007152557,0.3619999885559082,0.3440000116825104,0.3619999885559082,0.3580000102519989,0.356000006198883,0.3600000143051147,0.3519999980926513,0.3519999980926513,0.3459999859333038,0.3540000021457672,0.3600000143051147,0.356000006198883,0.3540000021457672,0.3519999980926513,0.356000006198883,0.3600000143051147,0.3540000021457672,0.3540000021457672,0.363999992609024,0.3580000102519989,0.3680000007152557,0.3580000102519989,0.356000006198883,0.3519999980926513,0.3519999980926513,0.3519999980926513,0.3459999859333038,0.3499999940395355,0.356000006198883,0.3540000021457672,0.3540000021457672,0.3659999966621399,0.3619999885559082,0.3420000076293945,0.363999992609024,0.3580000102519989,0.3619999885559082,0.3759999871253967,0.3740000128746032,0.363999992609024,0.3580000102519989,0.3700000047683716,0.3700000047683716,0.363999992609024,0.3440000116825104,0.3580000102519989,0.3680000007152557,0.3700000047683716,0.3740000128746032,0.3619999885559082,0.3619999885559082,0.3700000047683716,0.363999992609024,0.363999992609024,0.363999992609024,0.3700000047683716,0.3600000143051147,0.3680000007152557,0.363999992609024,0.3659999966621399,0.363999992609024,0.3680000007152557,0.3580000102519989,0.363999992609024,0.3659999966621399,0.363999992609024,0.3580000102519989,0.3600000143051147,0.3600000143051147,0.3580000102519989,0.3600000143051147],"label":"FineWeb: independent MinHash (id 
mh)"},"big-run-sampled_full_filtered_no_dedup":{"x":[0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,29.360128000000003,31.45728,33.554432000000006,35.651584,37.748736,39.845888,41.94304,44.040192000000005,46.137344000000006,48.234496,50.331648,52.4288,54.525952000000004,56.623104000000005,58.720256000000006,60.817408,62.91456,65.011712,67.10886400000001,69.206016,71.303168,73.40032000000001,75.497472,77.59462400000001,79.691776,81.788928,83.88608,85.983232,88.08038400000001,90.177536,92.27468800000001,94.37184,96.468992,98.56614400000001,100.663296,102.76044800000001,104.8576,106.95475200000001,109.05190400000001,111.149056,113.24620800000001,115.34336,117.44051200000001,119.537664,121.634816,123.73196800000001,125.82912,127.92627200000001,130.023424,132.120576,134.21772800000002,136.31488000000002,138.412032,140.509184,142.606336,144.70348800000002,146.80064000000002,148.897792,150.994944,153.092096,155.18924800000002,157.28640000000001,159.383552,161.480704,163.577856,165.67500800000002,167.77216,169.869312,171.966464,174.06361600000002,176.16076800000002,178.25792,180.355072,182.452224,184.54937600000002,186.64652800000002,188.74368,190.840832,192.937984,195.03513600000002,197.13228800000002,199.22944,201.326592,203.423744,205.52089600000002,207.61804800000002,209.7152,211.812352,213.90950400000003,216.00665600000002,218.10380800000001,220.20096,222.298112,224.39526400000003,226.49241600000002,228.589568,230.68672,232.783872,234.88102400000002,236.97817600000002,239.075328,241.17248,243.269632,245.36678400000002,247.46393600000002,249.561088,251.65824,253.75539200000003,255.85254400000002,257.949696,260.046848,262.144,264.241152,266.338304,268.43545600000004,270.53260800000004,272.62976000000003,274.726912,276.824064,278.921216,281.018368,283.11552,285.212672,287.309824,289.40697600000004,291.50412800000004,293.60128000000
003,295.698432,297.795584,299.892736,301.989888,304.08704,306.184192,308.28134400000005,310.37849600000004,312.47564800000004,314.57280000000003,316.669952,318.767104,320.864256,322.961408,325.05856,327.155712,329.25286400000005,331.35001600000004,333.44716800000003,335.54432,337.641472,339.738624,341.835776,343.932928,346.03008,348.12723200000005,350.22438400000004],"y":[0.2860000133514404,0.2560000121593475,0.2720000147819519,0.2980000078678131,0.2840000092983246,0.2879999876022339,0.3039999902248382,0.2860000133514404,0.2899999916553497,0.3019999861717224,0.2960000038146972,0.3039999902248382,0.3100000023841858,0.3160000145435333,0.3160000145435333,0.3260000050067901,0.3179999887943268,0.3420000076293945,0.3219999969005584,0.328000009059906,0.3240000009536743,0.3300000131130218,0.328000009059906,0.3199999928474426,0.3379999995231628,0.3400000035762787,0.3240000009536743,0.3120000064373016,0.3319999873638153,0.3260000050067901,0.3120000064373016,0.3160000145435333,0.3140000104904175,0.3179999887943268,0.3160000145435333,0.3199999928474426,0.3240000009536743,0.3260000050067901,0.3179999887943268,0.3300000131130218,0.3179999887943268,0.328000009059906,0.3240000009536743,0.328000009059906,0.3260000050067901,0.3199999928474426,0.3400000035762787,0.3339999914169311,0.328000009059906,0.328000009059906,0.3339999914169311,0.328000009059906,0.328000009059906,0.335999995470047,0.3580000102519989,0.3499999940395355,0.3260000050067901,0.3499999940395355,0.3420000076293945,0.3160000145435333,0.3339999914169311,0.335999995470047,0.3400000035762787,0.3240000009536743,0.3319999873638153,0.3379999995231628,0.3400000035762787,0.3379999995231628,0.3319999873638153,0.3319999873638153,0.3440000116825104,0.3300000131130218,0.3219999969005584,0.3260000050067901,0.3219999969005584,0.3339999914169311,0.328000009059906,0.3300000131130218,0.3219999969005584,0.3379999995231628,0.3400000035762787,0.3319999873638153,0.328000009059906,0.3440000116825104,0.3339999914169311,0.328000009059906,0.33
79999995231628,0.3499999940395355,0.3339999914169311,0.3300000131130218,0.328000009059906,0.335999995470047,0.3240000009536743,0.335999995470047,0.3240000009536743,0.3400000035762787,0.3400000035762787,0.3420000076293945,0.3319999873638153,0.3339999914169311,0.3300000131130218,0.3400000035762787,0.3459999859333038,0.3400000035762787,0.3379999995231628,0.3459999859333038,0.3379999995231628,0.3300000131130218,0.3519999980926513,0.3379999995231628,0.356000006198883,0.335999995470047,0.3420000076293945,0.3400000035762787,0.328000009059906,0.3540000021457672,0.3499999940395355,0.3479999899864197,0.3440000116825104,0.3519999980926513,0.356000006198883,0.3540000021457672,0.3440000116825104,0.3499999940395355,0.356000006198883,0.356000006198883,0.356000006198883,0.363999992609024,0.3600000143051147,0.356000006198883,0.3479999899864197,0.356000006198883,0.3459999859333038,0.3479999899864197,0.3619999885559082,0.363999992609024,0.3499999940395355,0.3379999995231628,0.3479999899864197,0.3499999940395355,0.356000006198883,0.3519999980926513,0.3540000021457672,0.3619999885559082,0.3580000102519989,0.3540000021457672,0.356000006198883,0.3479999899864197,0.3519999980926513,0.356000006198883,0.3499999940395355,0.3379999995231628,0.3479999899864197,0.3499999940395355,0.3440000116825104,0.3580000102519989,0.356000006198883,0.3499999940395355,0.3479999899864197,0.3580000102519989,0.3519999980926513,0.3540000021457672,0.3519999980926513,0.3540000021457672,0.356000006198883,0.363999992609024,0.356000006198883,0.356000006198883],"label":"FineWeb: base filtering only"}},"layout":{"title":{"text":"The different FineWeb processing steps"}}}
dist/assets/data/plots/all_filtering_steps/piqa_acc_norm.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"data":{"big-run-sampled-fineweb-c4-filters":{"x":[0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,29.360128000000003,31.45728,33.554432000000006,35.651584,37.748736,39.845888,41.94304,44.040192000000005,46.137344000000006,48.234496,50.331648,52.4288,54.525952000000004,56.623104000000005,58.720256000000006,60.817408,62.91456,65.011712,67.10886400000001,69.206016,71.303168,73.40032000000001,75.497472,77.59462400000001,79.691776,81.788928,83.88608,85.983232,88.08038400000001,90.177536,92.27468800000001,94.37184,96.468992,98.56614400000001,100.663296,102.76044800000001,104.8576,106.95475200000001,109.05190400000001,111.149056,113.24620800000001,115.34336,117.44051200000001,119.537664,121.634816,123.73196800000001,125.82912,127.92627200000001,130.023424,132.120576,134.21772800000002,136.31488000000002,138.412032,140.509184,142.606336,144.70348800000002,146.80064000000002,148.897792,150.994944,153.092096,155.18924800000002,157.28640000000001,159.383552,161.480704,163.577856,165.67500800000002,167.77216,169.869312,171.966464,174.06361600000002,176.16076800000002,178.25792,180.355072,182.452224,184.54937600000002,186.64652800000002,188.74368,190.840832,192.937984,195.03513600000002,197.13228800000002,199.22944,201.326592,203.423744,205.52089600000002,207.61804800000002,209.7152,211.812352,213.90950400000003,216.00665600000002,218.10380800000001,220.20096,222.298112,224.39526400000003,226.49241600000002,228.589568,230.68672,232.783872,234.88102400000002,236.97817600000002,239.075328,241.17248,243.269632,245.36678400000002,247.46393600000002,249.561088,251.65824,253.75539200000003,255.85254400000002,257.949696,260.046848,262.144,264.241152,266.338304,268.43545600000004,270.53260800000004,272.62976000000003,274.726912,276.824064,278.921216,281.018368,283.11552,285.212672,287.309824,289.40697600000004,291.50412800000004,293.6012800000
0003,295.698432,297.795584,299.892736,301.989888,304.08704,306.184192,308.28134400000005,310.37849600000004,312.47564800000004,314.57280000000003,316.669952,318.767104,320.864256,322.961408,325.05856,327.155712,329.25286400000005,331.35001600000004,333.44716800000003,335.54432,337.641472,339.738624,341.835776,343.932928,346.03008,348.12723200000005,350.22438400000004],"y":[0.5099999904632568,0.6200000047683716,0.6470000147819519,0.6700000166893005,0.6869999766349792,0.6990000009536743,0.7059999704360962,0.7120000123977661,0.7139999866485596,0.7129999995231628,0.7289999723434448,0.7200000286102295,0.7139999866485596,0.7260000109672546,0.7329999804496765,0.7289999723434448,0.7369999885559082,0.7319999933242798,0.7260000109672546,0.7360000014305115,0.7369999885559082,0.7369999885559082,0.7300000190734863,0.7300000190734863,0.7350000143051147,0.734000027179718,0.7409999966621399,0.7429999709129333,0.7400000095367432,0.7440000176429749,0.7440000176429749,0.7400000095367432,0.7400000095367432,0.7480000257492065,0.7440000176429749,0.7570000290870667,0.7360000014305115,0.734000027179718,0.7419999837875366,0.7429999709129333,0.7519999742507935,0.746999979019165,0.7459999918937683,0.75,0.75,0.7400000095367432,0.7440000176429749,0.7450000047683716,0.7549999952316284,0.7580000162124634,0.7540000081062317,0.7419999837875366,0.7580000162124634,0.746999979019165,0.7540000081062317,0.765999972820282,0.7549999952316284,0.7580000162124634,0.753000020980835,0.7549999952316284,0.753000020980835,0.7490000128746033,0.7519999742507935,0.7630000114440918,0.7509999871253967,0.7570000290870667,0.7609999775886536,0.7609999775886536,0.7559999823570251,0.75,0.7540000081062317,0.7480000257492065,0.7590000033378601,0.7509999871253967,0.75,0.7559999823570251,0.7509999871253967,0.7480000257492065,0.7519999742507935,0.765999972820282,0.7590000033378601,0.7549999952316284,0.7609999775886536,0.7559999823570251,0.7599999904632568,0.765999972820282,0.7549999952316284,0.7549999952316284,0.758000016212463
4,0.7699999809265137,0.7590000033378601,0.7699999809265137,0.7609999775886536,0.7590000033378601,0.765999972820282,0.765999972820282,0.7639999985694885,0.7710000276565552,0.7649999856948853,0.7519999742507935,0.7609999775886536,0.7549999952316284,0.7580000162124634,0.7770000100135803,0.7699999809265137,0.7749999761581421,0.777999997138977,0.7710000276565552,0.7680000066757202,0.7749999761581421,0.7730000019073486,0.7699999809265137,0.7799999713897705,0.7649999856948853,0.7689999938011169,0.7739999890327454,0.7710000276565552,0.7760000228881836,0.7739999890327454,0.7699999809265137,0.7749999761581421,0.7730000019073486,0.7770000100135803,0.7720000147819519,0.7699999809265137,0.7739999890327454,0.7710000276565552,0.7710000276565552,0.777999997138977,0.7789999842643738,0.7770000100135803,0.7720000147819519,0.7730000019073486,0.7730000019073486,0.7760000228881836,0.7710000276565552,0.7789999842643738,0.781000018119812,0.7749999761581421,0.777999997138977,0.7789999842643738,0.7770000100135803,0.7760000228881836,0.7730000019073486,0.7789999842643738,0.7749999761581421,0.7770000100135803,0.7749999761581421,0.7689999938011169,0.7749999761581421,0.777999997138977,0.7760000228881836,0.7749999761581421,0.7789999842643738,0.7820000052452087,0.7739999890327454,0.7799999713897705,0.781000018119812,0.7870000004768372,0.781000018119812,0.7789999842643738,0.781000018119812,0.777999997138977,0.7760000228881836,0.777999997138977,0.7720000147819519,0.7749999761581421,0.7739999890327454],"label":"FineWeb: id mh + C4 
filters"},"big-run-sampled_full_ind_minhash":{"x":[0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,29.360128000000003,31.45728,33.554432000000006,35.651584,37.748736,39.845888,41.94304,44.040192000000005,46.137344000000006,48.234496,50.331648,52.4288,54.525952000000004,56.623104000000005,58.720256000000006,60.817408,62.91456,65.011712,67.10886400000001,69.206016,71.303168,73.40032000000001,75.497472,77.59462400000001,79.691776,81.788928,83.88608,85.983232,88.08038400000001,90.177536,92.27468800000001,94.37184,96.468992,98.56614400000001,100.663296,102.76044800000001,104.8576,106.95475200000001,109.05190400000001,111.149056,113.24620800000001,115.34336,117.44051200000001,119.537664,121.634816,123.73196800000001,125.82912,127.92627200000001,130.023424,132.120576,134.21772800000002,136.31488000000002,138.412032,140.509184,142.606336,144.70348800000002,146.80064000000002,148.897792,150.994944,153.092096,155.18924800000002,157.28640000000001,159.383552,161.480704,163.577856,165.67500800000002,167.77216,169.869312,171.966464,174.06361600000002,176.16076800000002,178.25792,180.355072,182.452224,184.54937600000002,186.64652800000002,188.74368,190.840832,192.937984,195.03513600000002,197.13228800000002,199.22944,201.326592,203.423744,205.52089600000002,207.61804800000002,209.7152,211.812352,213.90950400000003,216.00665600000002,218.10380800000001,220.20096,222.298112,224.39526400000003,226.49241600000002,228.589568,230.68672,232.783872,234.88102400000002,236.97817600000002,239.075328,241.17248,243.269632,245.36678400000002,247.46393600000002,249.561088,251.65824,253.75539200000003,255.85254400000002,257.949696,260.046848,262.144,264.241152,266.338304,268.43545600000004,270.53260800000004,272.62976000000003,274.726912,276.824064,278.921216,281.018368,283.11552,285.212672,287.309824,289.40697600000004,291.50412800000004,293.6012800000000
3,295.698432,297.795584,299.892736,301.989888,304.08704,306.184192,308.28134400000005,310.37849600000004,312.47564800000004,314.57280000000003,316.669952,318.767104,320.864256,322.961408,325.05856,327.155712,329.25286400000005,331.35001600000004,333.44716800000003,335.54432,337.641472,339.738624,341.835776,343.932928,346.03008,348.12723200000005,350.22438400000004],"y":[0.5099999904632568,0.6209999918937683,0.6549999713897705,0.6800000071525574,0.6830000281333923,0.703000009059906,0.7020000219345093,0.7110000252723694,0.7160000205039978,0.7129999995231628,0.7210000157356262,0.7250000238418579,0.7210000157356262,0.7310000061988831,0.7269999980926514,0.7329999804496765,0.7459999918937683,0.734000027179718,0.7409999966621399,0.7390000224113464,0.7350000143051147,0.7509999871253967,0.7440000176429749,0.7379999756813049,0.7599999904632568,0.7400000095367432,0.7409999966621399,0.7590000033378601,0.7409999966621399,0.7440000176429749,0.7400000095367432,0.7450000047683716,0.75,0.7440000176429749,0.7409999966621399,0.7429999709129333,0.7440000176429749,0.7440000176429749,0.7559999823570251,0.7459999918937683,0.7559999823570251,0.7540000081062317,0.7599999904632568,0.7559999823570251,0.7490000128746033,0.7490000128746033,0.7429999709129333,0.7609999775886536,0.7519999742507935,0.7480000257492065,0.7490000128746033,0.7620000243186951,0.7580000162124634,0.7580000162124634,0.7540000081062317,0.7509999871253967,0.7519999742507935,0.7440000176429749,0.7459999918937683,0.7559999823570251,0.7620000243186951,0.746999979019165,0.7570000290870667,0.7620000243186951,0.7570000290870667,0.7540000081062317,0.7540000081062317,0.7570000290870667,0.7590000033378601,0.7519999742507935,0.75,0.7559999823570251,0.7590000033378601,0.7559999823570251,0.7519999742507935,0.7639999985694885,0.7620000243186951,0.7549999952316284,0.7490000128746033,0.7559999823570251,0.7639999985694885,0.7609999775886536,0.7609999775886536,0.7519999742507935,0.7549999952316284,0.7570000290870667,0.7620000243186951,0.759
9999904632568,0.7639999985694885,0.7559999823570251,0.753000020980835,0.7649999856948853,0.753000020980835,0.7549999952316284,0.7609999775886536,0.7599999904632568,0.7680000066757202,0.7540000081062317,0.7559999823570251,0.7590000033378601,0.7590000033378601,0.7649999856948853,0.7639999985694885,0.7710000276565552,0.7699999809265137,0.7609999775886536,0.765999972820282,0.7670000195503235,0.7720000147819519,0.7639999985694885,0.7609999775886536,0.7549999952316284,0.7630000114440918,0.7670000195503235,0.7599999904632568,0.765999972820282,0.7670000195503235,0.7670000195503235,0.7670000195503235,0.7720000147819519,0.7760000228881836,0.7710000276565552,0.7829999923706055,0.7630000114440918,0.7720000147819519,0.7649999856948853,0.7630000114440918,0.7699999809265137,0.7720000147819519,0.7720000147819519,0.7689999938011169,0.777999997138977,0.7689999938011169,0.7760000228881836,0.7730000019073486,0.7799999713897705,0.7720000147819519,0.7760000228881836,0.7710000276565552,0.7770000100135803,0.777999997138977,0.7670000195503235,0.7789999842643738,0.7799999713897705,0.7749999761581421,0.7730000019073486,0.777999997138977,0.777999997138977,0.7799999713897705,0.7770000100135803,0.7770000100135803,0.7789999842643738,0.7760000228881836,0.7770000100135803,0.7770000100135803,0.7770000100135803,0.7739999890327454,0.7689999938011169,0.7760000228881836,0.777999997138977,0.7699999809265137,0.7739999890327454,0.7670000195503235,0.7699999809265137,0.7710000276565552,0.7730000019073486,0.7739999890327454,0.7680000066757202],"label":"FineWeb: independent MinHash (id 
mh)"},"big-run-fineweb-v1-all-dumps":{"x":[0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,29.360128000000003,31.45728,33.554432000000006,35.651584,37.748736,39.845888,41.94304,44.040192000000005,46.137344000000006,48.234496,50.331648,52.4288,54.525952000000004,56.623104000000005,58.720256000000006,60.817408,62.91456,65.011712,67.10886400000001,69.206016,71.303168,73.40032000000001,75.497472,77.59462400000001,79.691776,81.788928,83.88608,85.983232,88.08038400000001,90.177536,92.27468800000001,94.37184,96.468992,98.56614400000001,100.663296,102.76044800000001,104.8576,106.95475200000001,109.05190400000001,111.149056,113.24620800000001,115.34336,117.44051200000001,119.537664,121.634816,123.73196800000001,125.82912,127.92627200000001,130.023424,132.120576,134.21772800000002,136.31488000000002,138.412032,140.509184,142.606336,144.70348800000002,146.80064000000002,148.897792,150.994944,153.092096,155.18924800000002,157.28640000000001,159.383552,161.480704,163.577856,165.67500800000002,167.77216,169.869312,171.966464,174.06361600000002,176.16076800000002,178.25792,180.355072,182.452224,184.54937600000002,186.64652800000002,188.74368,190.840832,192.937984,195.03513600000002,197.13228800000002,199.22944,201.326592,203.423744,205.52089600000002,207.61804800000002,209.7152,211.812352,213.90950400000003,216.00665600000002,218.10380800000001,220.20096,222.298112,224.39526400000003,226.49241600000002,228.589568,230.68672,232.783872,234.88102400000002,236.97817600000002,239.075328,241.17248,243.269632,245.36678400000002,247.46393600000002,249.561088,251.65824,253.75539200000003,255.85254400000002,257.949696,260.046848,262.144,264.241152,266.338304,268.43545600000004,270.53260800000004,272.62976000000003,274.726912,276.824064,278.921216,281.018368,283.11552,285.212672,287.309824,289.40697600000004,291.50412800000004,293.60128000000003,295.69
8432,297.795584,299.892736,301.989888,304.08704,306.184192,308.28134400000005,310.37849600000004,312.47564800000004,314.57280000000003,316.669952,318.767104,320.864256,322.961408,325.05856,327.155712,329.25286400000005,331.35001600000004,333.44716800000003,335.54432,337.641472,339.738624,341.835776,343.932928,346.03008,348.12723200000005,350.22438400000004],"y":[0.5099999904632568,0.6190000176429749,0.6549999713897705,0.6769999861717224,0.6899999976158142,0.6869999766349792,0.7149999737739563,0.7179999947547913,0.7179999947547913,0.7319999933242798,0.7390000224113464,0.7350000143051147,0.7480000257492065,0.7440000176429749,0.7409999966621399,0.7440000176429749,0.7580000162124634,0.7419999837875366,0.7440000176429749,0.75,0.734000027179718,0.746999979019165,0.7459999918937683,0.7390000224113464,0.7490000128746033,0.7379999756813049,0.7429999709129333,0.7390000224113464,0.7360000014305115,0.7419999837875366,0.7480000257492065,0.7480000257492065,0.7490000128746033,0.7440000176429749,0.75,0.7540000081062317,0.7490000128746033,0.7549999952316284,0.7429999709129333,0.7540000081062317,0.753000020980835,0.7540000081062317,0.7440000176429749,0.7570000290870667,0.7400000095367432,0.7490000128746033,0.7549999952316284,0.7559999823570251,0.7580000162124634,0.7609999775886536,0.7480000257492065,0.7490000128746033,0.7599999904632568,0.7609999775886536,0.7540000081062317,0.753000020980835,0.7490000128746033,0.7480000257492065,0.7440000176429749,0.7549999952316284,0.7540000081062317,0.7559999823570251,0.7490000128746033,0.7409999966621399,0.7580000162124634,0.75,0.746999979019165,0.7400000095367432,0.7559999823570251,0.7490000128746033,0.7429999709129333,0.7519999742507935,0.7549999952316284,0.7559999823570251,0.753000020980835,0.753000020980835,0.746999979019165,0.746999979019165,0.7559999823570251,0.7549999952316284,0.7549999952316284,0.7570000290870667,0.7599999904632568,0.7599999904632568,0.7549999952316284,0.765999972820282,0.7649999856948853,0.7630000114440918,0.7580000162124
634,0.7599999904632568,0.7559999823570251,0.7490000128746033,0.7620000243186951,0.7519999742507935,0.7580000162124634,0.7559999823570251,0.7580000162124634,0.7670000195503235,0.7599999904632568,0.7559999823570251,0.7580000162124634,0.7570000290870667,0.7649999856948853,0.7590000033378601,0.7649999856948853,0.7649999856948853,0.7609999775886536,0.7519999742507935,0.7639999985694885,0.7699999809265137,0.7689999938011169,0.7609999775886536,0.765999972820282,0.7710000276565552,0.7590000033378601,0.7710000276565552,0.7639999985694885,0.7710000276565552,0.7730000019073486,0.7680000066757202,0.7590000033378601,0.7639999985694885,0.7609999775886536,0.7559999823570251,0.7749999761581421,0.7680000066757202,0.7599999904632568,0.7609999775886536,0.7599999904632568,0.7580000162124634,0.7599999904632568,0.7649999856948853,0.765999972820282,0.7580000162124634,0.7739999890327454,0.7739999890327454,0.7739999890327454,0.7620000243186951,0.7749999761581421,0.7699999809265137,0.7670000195503235,0.7720000147819519,0.7739999890327454,0.7739999890327454,0.7649999856948853,0.7710000276565552,0.7649999856948853,0.7699999809265137,0.7760000228881836,0.7730000019073486,0.7699999809265137,0.7739999890327454,0.7720000147819519,0.7670000195503235,0.7720000147819519,0.7749999761581421,0.7699999809265137,0.7689999938011169,0.7639999985694885,0.7760000228881836,0.7670000195503235,0.7670000195503235,0.7689999938011169,0.7760000228881836,0.7670000195503235,0.7649999856948853,0.7720000147819519,0.7609999775886536],"label":"FineWeb: id mh + C4 + custom 
filters"},"big-run-sampled_full_filtered_no_dedup":{"x":[0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,29.360128000000003,31.45728,33.554432000000006,35.651584,37.748736,39.845888,41.94304,44.040192000000005,46.137344000000006,48.234496,50.331648,52.4288,54.525952000000004,56.623104000000005,58.720256000000006,60.817408,62.91456,65.011712,67.10886400000001,69.206016,71.303168,73.40032000000001,75.497472,77.59462400000001,79.691776,81.788928,83.88608,85.983232,88.08038400000001,90.177536,92.27468800000001,94.37184,96.468992,98.56614400000001,100.663296,102.76044800000001,104.8576,106.95475200000001,109.05190400000001,111.149056,113.24620800000001,115.34336,117.44051200000001,119.537664,121.634816,123.73196800000001,125.82912,127.92627200000001,130.023424,132.120576,134.21772800000002,136.31488000000002,138.412032,140.509184,142.606336,144.70348800000002,146.80064000000002,148.897792,150.994944,153.092096,155.18924800000002,157.28640000000001,159.383552,161.480704,163.577856,165.67500800000002,167.77216,169.869312,171.966464,174.06361600000002,176.16076800000002,178.25792,180.355072,182.452224,184.54937600000002,186.64652800000002,188.74368,190.840832,192.937984,195.03513600000002,197.13228800000002,199.22944,201.326592,203.423744,205.52089600000002,207.61804800000002,209.7152,211.812352,213.90950400000003,216.00665600000002,218.10380800000001,220.20096,222.298112,224.39526400000003,226.49241600000002,228.589568,230.68672,232.783872,234.88102400000002,236.97817600000002,239.075328,241.17248,243.269632,245.36678400000002,247.46393600000002,249.561088,251.65824,253.75539200000003,255.85254400000002,257.949696,260.046848,262.144,264.241152,266.338304,268.43545600000004,270.53260800000004,272.62976000000003,274.726912,276.824064,278.921216,281.018368,283.11552,285.212672,287.309824,289.40697600000004,291.50412800000004,293.6012800
0000003,295.698432,297.795584,299.892736,301.989888,304.08704,306.184192,308.28134400000005,310.37849600000004,312.47564800000004,314.57280000000003,316.669952,318.767104,320.864256,322.961408,325.05856,327.155712,329.25286400000005,331.35001600000004,333.44716800000003,335.54432,337.641472,339.738624,341.835776,343.932928,346.03008,348.12723200000005,350.22438400000004],"y":[0.5099999904632568,0.621999979019165,0.6439999938011169,0.6700000166893005,0.6790000200271606,0.6869999766349792,0.6959999799728394,0.6790000200271606,0.6880000233650208,0.7049999833106995,0.699999988079071,0.6990000009536743,0.6940000057220459,0.7110000252723694,0.7120000123977661,0.7070000171661377,0.7070000171661377,0.6990000009536743,0.7009999752044678,0.7160000205039978,0.7200000286102295,0.7149999737739563,0.7250000238418579,0.7210000157356262,0.722000002861023,0.7310000061988831,0.7289999723434448,0.7319999933242798,0.7250000238418579,0.722000002861023,0.7210000157356262,0.7170000076293945,0.7260000109672546,0.7250000238418579,0.7210000157356262,0.7200000286102295,0.7379999756813049,0.7239999771118164,0.7239999771118164,0.7080000042915344,0.7289999723434448,0.7289999723434448,0.7300000190734863,0.7329999804496765,0.7319999933242798,0.7350000143051147,0.7390000224113464,0.7350000143051147,0.7289999723434448,0.734000027179718,0.7329999804496765,0.7400000095367432,0.7409999966621399,0.7310000061988831,0.7350000143051147,0.7360000014305115,0.7360000014305115,0.7409999966621399,0.7319999933242798,0.7409999966621399,0.7400000095367432,0.7390000224113464,0.7329999804496765,0.7459999918937683,0.753000020980835,0.746999979019165,0.734000027179718,0.7369999885559082,0.7419999837875366,0.734000027179718,0.7419999837875366,0.7289999723434448,0.7350000143051147,0.7300000190734863,0.7519999742507935,0.7390000224113464,0.7400000095367432,0.7409999966621399,0.7429999709129333,0.7450000047683716,0.7329999804496765,0.7260000109672546,0.7570000290870667,0.7360000014305115,0.7519999742507935,0.7419999837875
366,0.7379999756813049,0.7390000224113464,0.7490000128746033,0.734000027179718,0.7360000014305115,0.7390000224113464,0.7440000176429749,0.7450000047683716,0.7319999933242798,0.7429999709129333,0.7519999742507935,0.7540000081062317,0.7519999742507935,0.753000020980835,0.7480000257492065,0.7440000176429749,0.7459999918937683,0.7369999885559082,0.7419999837875366,0.7480000257492065,0.7419999837875366,0.765999972820282,0.746999979019165,0.7459999918937683,0.7570000290870667,0.7390000224113464,0.7409999966621399,0.7459999918937683,0.75,0.7570000290870667,0.753000020980835,0.7549999952316284,0.7519999742507935,0.7490000128746033,0.746999979019165,0.7459999918937683,0.7459999918937683,0.746999979019165,0.7409999966621399,0.7419999837875366,0.7459999918937683,0.7440000176429749,0.7459999918937683,0.7490000128746033,0.7450000047683716,0.7409999966621399,0.7419999837875366,0.7490000128746033,0.7590000033378601,0.7549999952316284,0.7549999952316284,0.746999979019165,0.753000020980835,0.7549999952316284,0.746999979019165,0.7580000162124634,0.7490000128746033,0.753000020980835,0.75,0.75,0.7540000081062317,0.7540000081062317,0.7490000128746033,0.7570000290870667,0.7570000290870667,0.7590000033378601,0.7559999823570251,0.7620000243186951,0.7590000033378601,0.7509999871253967,0.7639999985694885,0.7580000162124634,0.7599999904632568,0.7620000243186951,0.7590000033378601,0.7609999775886536,0.7559999823570251,0.75,0.7509999871253967,0.7549999952316284,0.7540000081062317,0.7540000081062317],"label":"FineWeb: base filtering only"}},"layout":{"title":{"text":"The different FineWeb processing steps"}}}
dist/assets/data/plots/all_filtering_steps/siqa_acc_norm.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"data":{"big-run-fineweb-v1-all-dumps":{"x":[0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,29.360128000000003,31.45728,33.554432000000006,35.651584,37.748736,39.845888,41.94304,44.040192000000005,46.137344000000006,48.234496,50.331648,52.4288,54.525952000000004,56.623104000000005,58.720256000000006,60.817408,62.91456,65.011712,67.10886400000001,69.206016,71.303168,73.40032000000001,75.497472,77.59462400000001,79.691776,81.788928,83.88608,85.983232,88.08038400000001,90.177536,92.27468800000001,94.37184,96.468992,98.56614400000001,100.663296,102.76044800000001,104.8576,106.95475200000001,109.05190400000001,111.149056,113.24620800000001,115.34336,117.44051200000001,119.537664,121.634816,123.73196800000001,125.82912,127.92627200000001,130.023424,132.120576,134.21772800000002,136.31488000000002,138.412032,140.509184,142.606336,144.70348800000002,146.80064000000002,148.897792,150.994944,153.092096,155.18924800000002,157.28640000000001,159.383552,161.480704,163.577856,165.67500800000002,167.77216,169.869312,171.966464,174.06361600000002,176.16076800000002,178.25792,180.355072,182.452224,184.54937600000002,186.64652800000002,188.74368,190.840832,192.937984,195.03513600000002,197.13228800000002,199.22944,201.326592,203.423744,205.52089600000002,207.61804800000002,209.7152,211.812352,213.90950400000003,216.00665600000002,218.10380800000001,220.20096,222.298112,224.39526400000003,226.49241600000002,228.589568,230.68672,232.783872,234.88102400000002,236.97817600000002,239.075328,241.17248,243.269632,245.36678400000002,247.46393600000002,249.561088,251.65824,253.75539200000003,255.85254400000002,257.949696,260.046848,262.144,264.241152,266.338304,268.43545600000004,270.53260800000004,272.62976000000003,274.726912,276.824064,278.921216,281.018368,283.11552,285.212672,287.309824,289.40697600000004,291.50412800000004,293.60128000000003,2
95.698432,297.795584,299.892736,301.989888,304.08704,306.184192,308.28134400000005,310.37849600000004,312.47564800000004,314.57280000000003,316.669952,318.767104,320.864256,322.961408,325.05856,327.155712,329.25286400000005,331.35001600000004,333.44716800000003,335.54432,337.641472,339.738624,341.835776,343.932928,346.03008,348.12723200000005,350.22438400000004],"y":[0.3619999885559082,0.395000010728836,0.3970000147819519,0.3930000066757202,0.4050000011920929,0.3899999856948852,0.4070000052452087,0.4040000140666961,0.4189999997615814,0.4000000059604645,0.4269999861717224,0.4009999930858612,0.3980000019073486,0.414000004529953,0.4120000004768371,0.4050000011920929,0.4070000052452087,0.4070000052452087,0.4129999876022339,0.4009999930858612,0.4070000052452087,0.4059999883174896,0.4050000011920929,0.4059999883174896,0.4090000092983246,0.4160000085830688,0.4059999883174896,0.3899999856948852,0.3899999856948852,0.4009999930858612,0.3970000147819519,0.3959999978542328,0.4110000133514404,0.4040000140666961,0.4110000133514404,0.4050000011920929,0.4090000092983246,0.402999997138977,0.4189999997615814,0.3980000019073486,0.4059999883174896,0.4120000004768371,0.4149999916553497,0.4059999883174896,0.4250000119209289,0.4110000133514404,0.4070000052452087,0.4120000004768371,0.4120000004768371,0.4020000100135803,0.4050000011920929,0.4009999930858612,0.4079999923706054,0.4110000133514404,0.4110000133514404,0.4059999883174896,0.4040000140666961,0.3980000019073486,0.395000010728836,0.4050000011920929,0.402999997138977,0.4020000100135803,0.4090000092983246,0.4079999923706054,0.4020000100135803,0.4000000059604645,0.4020000100135803,0.4090000092983246,0.4160000085830688,0.3959999978542328,0.3970000147819519,0.3970000147819519,0.3989999890327453,0.3970000147819519,0.4009999930858612,0.4020000100135803,0.4009999930858612,0.4050000011920929,0.4110000133514404,0.4050000011920929,0.414000004529953,0.4059999883174896,0.4020000100135803,0.395000010728836,0.4009999930858612,0.402999997138977,0.40
40000140666961,0.4020000100135803,0.3989999890327453,0.3980000019073486,0.4040000140666961,0.4059999883174896,0.3899999856948852,0.4009999930858612,0.3980000019073486,0.4040000140666961,0.4110000133514404,0.3939999938011169,0.4040000140666961,0.4040000140666961,0.4059999883174896,0.402999997138977,0.4020000100135803,0.4040000140666961,0.4059999883174896,0.3970000147819519,0.3989999890327453,0.4040000140666961,0.4090000092983246,0.4059999883174896,0.4020000100135803,0.4099999964237213,0.4050000011920929,0.4099999964237213,0.402999997138977,0.4040000140666961,0.3989999890327453,0.4070000052452087,0.4129999876022339,0.4120000004768371,0.4149999916553497,0.4169999957084656,0.3989999890327453,0.414000004529953,0.4169999957084656,0.4079999923706054,0.4020000100135803,0.4009999930858612,0.4020000100135803,0.4090000092983246,0.4189999997615814,0.4050000011920929,0.4090000092983246,0.4040000140666961,0.4090000092983246,0.4160000085830688,0.402999997138977,0.4169999957084656,0.4160000085830688,0.4149999916553497,0.4059999883174896,0.4169999957084656,0.4099999964237213,0.4210000038146972,0.4239999949932098,0.4160000085830688,0.4090000092983246,0.4120000004768371,0.4110000133514404,0.4189999997615814,0.414000004529953,0.4059999883174896,0.4110000133514404,0.4149999916553497,0.4110000133514404,0.4129999876022339,0.4120000004768371,0.4149999916553497,0.414000004529953,0.4079999923706054,0.4070000052452087,0.4099999964237213,0.4059999883174896,0.4040000140666961,0.4149999916553497,0.4099999964237213,0.4149999916553497,0.414000004529953],"label":"FineWeb: id mh + C4 + custom 
filters"},"big-run-sampled-fineweb-c4-filters":{"x":[0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,29.360128000000003,31.45728,33.554432000000006,35.651584,37.748736,39.845888,41.94304,44.040192000000005,46.137344000000006,48.234496,50.331648,52.4288,54.525952000000004,56.623104000000005,58.720256000000006,60.817408,62.91456,65.011712,67.10886400000001,69.206016,71.303168,73.40032000000001,75.497472,77.59462400000001,79.691776,81.788928,83.88608,85.983232,88.08038400000001,90.177536,92.27468800000001,94.37184,96.468992,98.56614400000001,100.663296,102.76044800000001,104.8576,106.95475200000001,109.05190400000001,111.149056,113.24620800000001,115.34336,117.44051200000001,119.537664,121.634816,123.73196800000001,125.82912,127.92627200000001,130.023424,132.120576,134.21772800000002,136.31488000000002,138.412032,140.509184,142.606336,144.70348800000002,146.80064000000002,148.897792,150.994944,153.092096,155.18924800000002,157.28640000000001,159.383552,161.480704,163.577856,165.67500800000002,167.77216,169.869312,171.966464,174.06361600000002,176.16076800000002,178.25792,180.355072,182.452224,184.54937600000002,186.64652800000002,188.74368,190.840832,192.937984,195.03513600000002,197.13228800000002,199.22944,201.326592,203.423744,205.52089600000002,207.61804800000002,209.7152,211.812352,213.90950400000003,216.00665600000002,218.10380800000001,220.20096,222.298112,224.39526400000003,226.49241600000002,228.589568,230.68672,232.783872,234.88102400000002,236.97817600000002,239.075328,241.17248,243.269632,245.36678400000002,247.46393600000002,249.561088,251.65824,253.75539200000003,255.85254400000002,257.949696,260.046848,262.144,264.241152,266.338304,268.43545600000004,270.53260800000004,272.62976000000003,274.726912,276.824064,278.921216,281.018368,283.11552,285.212672,287.309824,289.40697600000004,291.50412800000004,293.60128000000
003,295.698432,297.795584,299.892736,301.989888,304.08704,306.184192,308.28134400000005,310.37849600000004,312.47564800000004,314.57280000000003,316.669952,318.767104,320.864256,322.961408,325.05856,327.155712,329.25286400000005,331.35001600000004,333.44716800000003,335.54432,337.641472,339.738624,341.835776,343.932928,346.03008,348.12723200000005,350.22438400000004],"y":[0.3619999885559082,0.4009999930858612,0.4110000133514404,0.3889999985694885,0.4020000100135803,0.4079999923706054,0.3880000114440918,0.4000000059604645,0.3910000026226043,0.3980000019073486,0.395000010728836,0.3939999938011169,0.4050000011920929,0.4099999964237213,0.4099999964237213,0.4099999964237213,0.4059999883174896,0.4059999883174896,0.402999997138977,0.4079999923706054,0.4169999957084656,0.4020000100135803,0.3970000147819519,0.3970000147819519,0.4210000038146972,0.3970000147819519,0.3980000019073486,0.395000010728836,0.4000000059604645,0.3989999890327453,0.4009999930858612,0.4129999876022339,0.4120000004768371,0.4120000004768371,0.3919999897480011,0.414000004529953,0.4009999930858612,0.4090000092983246,0.4099999964237213,0.4079999923706054,0.4079999923706054,0.4009999930858612,0.3959999978542328,0.3959999978542328,0.4099999964237213,0.3959999978542328,0.4050000011920929,0.4059999883174896,0.4110000133514404,0.402999997138977,0.4040000140666961,0.414000004529953,0.3989999890327453,0.4199999868869781,0.4050000011920929,0.4070000052452087,0.4079999923706054,0.3989999890327453,0.4099999964237213,0.4020000100135803,0.4090000092983246,0.414000004529953,0.4189999997615814,0.4079999923706054,0.4180000126361847,0.4059999883174896,0.4070000052452087,0.4120000004768371,0.402999997138977,0.4059999883174896,0.4090000092983246,0.4110000133514404,0.3989999890327453,0.4079999923706054,0.4040000140666961,0.414000004529953,0.4079999923706054,0.402999997138977,0.4099999964237213,0.402999997138977,0.4120000004768371,0.4000000059604645,0.4090000092983246,0.4050000011920929,0.4129999876022339,0.4040000140666961,0.
4129999876022339,0.4079999923706054,0.4180000126361847,0.414000004529953,0.4160000085830688,0.4199999868869781,0.4020000100135803,0.4020000100135803,0.4079999923706054,0.4009999930858612,0.4040000140666961,0.4099999964237213,0.4050000011920929,0.4040000140666961,0.414000004529953,0.4180000126361847,0.4050000011920929,0.414000004529953,0.4079999923706054,0.4050000011920929,0.4050000011920929,0.4070000052452087,0.4050000011920929,0.4059999883174896,0.4059999883174896,0.4000000059604645,0.4120000004768371,0.4059999883174896,0.4070000052452087,0.4120000004768371,0.4050000011920929,0.4059999883174896,0.3989999890327453,0.3959999978542328,0.4020000100135803,0.3989999890327453,0.3959999978542328,0.3989999890327453,0.4059999883174896,0.4070000052452087,0.4120000004768371,0.4009999930858612,0.4120000004768371,0.4129999876022339,0.4090000092983246,0.414000004529953,0.4099999964237213,0.4160000085830688,0.4040000140666961,0.4099999964237213,0.414000004529953,0.4050000011920929,0.402999997138977,0.4040000140666961,0.4079999923706054,0.3989999890327453,0.4059999883174896,0.3980000019073486,0.4070000052452087,0.4020000100135803,0.4009999930858612,0.4000000059604645,0.4079999923706054,0.4070000052452087,0.402999997138977,0.4079999923706054,0.4050000011920929,0.4040000140666961,0.4070000052452087,0.4020000100135803,0.3959999978542328,0.402999997138977,0.402999997138977,0.4099999964237213,0.4090000092983246,0.4009999930858612,0.4059999883174896,0.4020000100135803,0.4040000140666961,0.4009999930858612,0.4070000052452087,0.4070000052452087],"label":"FineWeb: id mh + C4 
filters"},"big-run-sampled_full_ind_minhash":{"x":[0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,29.360128000000003,31.45728,33.554432000000006,35.651584,37.748736,39.845888,41.94304,44.040192000000005,46.137344000000006,48.234496,50.331648,52.4288,54.525952000000004,56.623104000000005,58.720256000000006,60.817408,62.91456,65.011712,67.10886400000001,69.206016,71.303168,73.40032000000001,75.497472,77.59462400000001,79.691776,81.788928,83.88608,85.983232,88.08038400000001,90.177536,92.27468800000001,94.37184,96.468992,98.56614400000001,100.663296,102.76044800000001,104.8576,106.95475200000001,109.05190400000001,111.149056,113.24620800000001,115.34336,117.44051200000001,119.537664,121.634816,123.73196800000001,125.82912,127.92627200000001,130.023424,132.120576,134.21772800000002,136.31488000000002,138.412032,140.509184,142.606336,144.70348800000002,146.80064000000002,148.897792,150.994944,153.092096,155.18924800000002,157.28640000000001,159.383552,161.480704,163.577856,165.67500800000002,167.77216,169.869312,171.966464,174.06361600000002,176.16076800000002,178.25792,180.355072,182.452224,184.54937600000002,186.64652800000002,188.74368,190.840832,192.937984,195.03513600000002,197.13228800000002,199.22944,201.326592,203.423744,205.52089600000002,207.61804800000002,209.7152,211.812352,213.90950400000003,216.00665600000002,218.10380800000001,220.20096,222.298112,224.39526400000003,226.49241600000002,228.589568,230.68672,232.783872,234.88102400000002,236.97817600000002,239.075328,241.17248,243.269632,245.36678400000002,247.46393600000002,249.561088,251.65824,253.75539200000003,255.85254400000002,257.949696,260.046848,262.144,264.241152,266.338304,268.43545600000004,270.53260800000004,272.62976000000003,274.726912,276.824064,278.921216,281.018368,283.11552,285.212672,287.309824,289.40697600000004,291.50412800000004,293.6012800000000
3,295.698432,297.795584,299.892736,301.989888,304.08704,306.184192,308.28134400000005,310.37849600000004,312.47564800000004,314.57280000000003,316.669952,318.767104,320.864256,322.961408,325.05856,327.155712,329.25286400000005,331.35001600000004,333.44716800000003,335.54432,337.641472,339.738624,341.835776,343.932928,346.03008,348.12723200000005,350.22438400000004],"y":[0.3619999885559082,0.3959999978542328,0.4070000052452087,0.3910000026226043,0.3939999938011169,0.3980000019073486,0.3930000066757202,0.4059999883174896,0.4000000059604645,0.3889999985694885,0.3989999890327453,0.402999997138977,0.3980000019073486,0.4000000059604645,0.3989999890327453,0.4000000059604645,0.3930000066757202,0.3939999938011169,0.3930000066757202,0.3880000114440918,0.3980000019073486,0.4020000100135803,0.3980000019073486,0.3959999978542328,0.3989999890327453,0.4020000100135803,0.402999997138977,0.4009999930858612,0.4079999923706054,0.4009999930858612,0.4070000052452087,0.4070000052452087,0.4020000100135803,0.4059999883174896,0.4079999923706054,0.4110000133514404,0.3989999890327453,0.4000000059604645,0.402999997138977,0.3939999938011169,0.3939999938011169,0.3980000019073486,0.3980000019073486,0.4050000011920929,0.4009999930858612,0.4120000004768371,0.402999997138977,0.4090000092983246,0.402999997138977,0.3980000019073486,0.3959999978542328,0.3970000147819519,0.4009999930858612,0.4070000052452087,0.4070000052452087,0.4079999923706054,0.402999997138977,0.3989999890327453,0.3980000019073486,0.4009999930858612,0.4050000011920929,0.4000000059604645,0.3959999978542328,0.3980000019073486,0.3970000147819519,0.4129999876022339,0.402999997138977,0.4090000092983246,0.4050000011920929,0.402999997138977,0.4009999930858612,0.3980000019073486,0.402999997138977,0.4020000100135803,0.3980000019073486,0.3970000147819519,0.402999997138977,0.4000000059604645,0.4149999916553497,0.3959999978542328,0.4000000059604645,0.4020000100135803,0.3919999897480011,0.4110000133514404,0.4090000092983246,0.4070000052452087,0.4
059999883174896,0.4020000100135803,0.3959999978542328,0.4050000011920929,0.395000010728836,0.4020000100135803,0.3959999978542328,0.4090000092983246,0.4070000052452087,0.4040000140666961,0.4000000059604645,0.4020000100135803,0.402999997138977,0.4050000011920929,0.414000004529953,0.4009999930858612,0.402999997138977,0.4020000100135803,0.3980000019073486,0.4020000100135803,0.4000000059604645,0.402999997138977,0.395000010728836,0.4009999930858612,0.3959999978542328,0.4120000004768371,0.3989999890327453,0.3980000019073486,0.4040000140666961,0.4070000052452087,0.3989999890327453,0.3989999890327453,0.3970000147819519,0.3970000147819519,0.3980000019073486,0.3970000147819519,0.3989999890327453,0.4110000133514404,0.4050000011920929,0.3939999938011169,0.3970000147819519,0.4009999930858612,0.3989999890327453,0.3980000019073486,0.402999997138977,0.3860000073909759,0.402999997138977,0.4020000100135803,0.3959999978542328,0.3989999890327453,0.3959999978542328,0.4000000059604645,0.395000010728836,0.395000010728836,0.3939999938011169,0.3980000019073486,0.4020000100135803,0.3989999890327453,0.3989999890327453,0.3880000114440918,0.3980000019073486,0.3959999978542328,0.4009999930858612,0.4079999923706054,0.395000010728836,0.3980000019073486,0.4009999930858612,0.3980000019073486,0.4009999930858612,0.3989999890327453,0.4000000059604645,0.402999997138977,0.4000000059604645,0.4020000100135803,0.3959999978542328,0.395000010728836,0.4020000100135803,0.3970000147819519,0.3980000019073486,0.3959999978542328,0.4000000059604645,0.4050000011920929],"label":"FineWeb: independent MinHash (id 
mh)"},"big-run-sampled_full_filtered_no_dedup":{"x":[0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,29.360128000000003,31.45728,33.554432000000006,35.651584,37.748736,39.845888,41.94304,44.040192000000005,46.137344000000006,48.234496,50.331648,52.4288,54.525952000000004,56.623104000000005,58.720256000000006,60.817408,62.91456,65.011712,67.10886400000001,69.206016,71.303168,73.40032000000001,75.497472,77.59462400000001,79.691776,81.788928,83.88608,85.983232,88.08038400000001,90.177536,92.27468800000001,94.37184,96.468992,98.56614400000001,100.663296,102.76044800000001,104.8576,106.95475200000001,109.05190400000001,111.149056,113.24620800000001,115.34336,117.44051200000001,119.537664,121.634816,123.73196800000001,125.82912,127.92627200000001,130.023424,132.120576,134.21772800000002,136.31488000000002,138.412032,140.509184,142.606336,144.70348800000002,146.80064000000002,148.897792,150.994944,153.092096,155.18924800000002,157.28640000000001,159.383552,161.480704,163.577856,165.67500800000002,167.77216,169.869312,171.966464,174.06361600000002,176.16076800000002,178.25792,180.355072,182.452224,184.54937600000002,186.64652800000002,188.74368,190.840832,192.937984,195.03513600000002,197.13228800000002,199.22944,201.326592,203.423744,205.52089600000002,207.61804800000002,209.7152,211.812352,213.90950400000003,216.00665600000002,218.10380800000001,220.20096,222.298112,224.39526400000003,226.49241600000002,228.589568,230.68672,232.783872,234.88102400000002,236.97817600000002,239.075328,241.17248,243.269632,245.36678400000002,247.46393600000002,249.561088,251.65824,253.75539200000003,255.85254400000002,257.949696,260.046848,262.144,264.241152,266.338304,268.43545600000004,270.53260800000004,272.62976000000003,274.726912,276.824064,278.921216,281.018368,283.11552,285.212672,287.309824,289.40697600000004,291.50412800000004,293.60128000000
003,295.698432,297.795584,299.892736,301.989888,304.08704,306.184192,308.28134400000005,310.37849600000004,312.47564800000004,314.57280000000003,316.669952,318.767104,320.864256,322.961408,325.05856,327.155712,329.25286400000005,331.35001600000004,333.44716800000003,335.54432,337.641472,339.738624,341.835776,343.932928,346.03008,348.12723200000005,350.22438400000004],"y":[0.3619999885559082,0.4000000059604645,0.395000010728836,0.3959999978542328,0.4020000100135803,0.4000000059604645,0.3959999978542328,0.3930000066757202,0.3899999856948852,0.402999997138977,0.4009999930858612,0.3930000066757202,0.4050000011920929,0.3939999938011169,0.4000000059604645,0.3989999890327453,0.3959999978542328,0.4020000100135803,0.4000000059604645,0.3939999938011169,0.395000010728836,0.3919999897480011,0.3980000019073486,0.3910000026226043,0.3880000114440918,0.3959999978542328,0.3980000019073486,0.3989999890327453,0.402999997138977,0.3959999978542328,0.3980000019073486,0.395000010728836,0.4090000092983246,0.4090000092983246,0.3889999985694885,0.3959999978542328,0.3880000114440918,0.3840000033378601,0.3959999978542328,0.3880000114440918,0.3939999938011169,0.3970000147819519,0.3910000026226043,0.3939999938011169,0.4020000100135803,0.3980000019073486,0.3970000147819519,0.4009999930858612,0.3919999897480011,0.3899999856948852,0.3989999890327453,0.3860000073909759,0.3860000073909759,0.3970000147819519,0.3959999978542328,0.3939999938011169,0.3840000033378601,0.3869999945163727,0.402999997138977,0.4050000011920929,0.395000010728836,0.3880000114440918,0.3869999945163727,0.3939999938011169,0.402999997138977,0.3899999856948852,0.3910000026226043,0.3910000026226043,0.4009999930858612,0.3919999897480011,0.3970000147819519,0.3919999897480011,0.3930000066757202,0.3869999945163727,0.3880000114440918,0.3849999904632568,0.3930000066757202,0.395000010728836,0.3889999985694885,0.3959999978542328,0.3989999890327453,0.402999997138977,0.3939999938011169,0.4000000059604645,0.4000000059604645,0.4050000011920929,0
.3989999890327453,0.3869999945163727,0.3910000026226043,0.3889999985694885,0.3889999985694885,0.4000000059604645,0.3910000026226043,0.3970000147819519,0.3989999890327453,0.3989999890327453,0.3959999978542328,0.3910000026226043,0.3880000114440918,0.3939999938011169,0.382999986410141,0.3849999904632568,0.3959999978542328,0.3989999890327453,0.3959999978542328,0.3880000114440918,0.3840000033378601,0.3980000019073486,0.4000000059604645,0.4000000059604645,0.4020000100135803,0.395000010728836,0.3910000026226043,0.3919999897480011,0.4040000140666961,0.3989999890327453,0.4020000100135803,0.3910000026226043,0.4009999930858612,0.3959999978542328,0.3939999938011169,0.3930000066757202,0.3910000026226043,0.3970000147819519,0.3880000114440918,0.3970000147819519,0.3959999978542328,0.3889999985694885,0.3970000147819519,0.4009999930858612,0.3970000147819519,0.3959999978542328,0.3959999978542328,0.3989999890327453,0.4040000140666961,0.3959999978542328,0.3980000019073486,0.3970000147819519,0.3970000147819519,0.3989999890327453,0.4020000100135803,0.3980000019073486,0.4000000059604645,0.4000000059604645,0.402999997138977,0.4090000092983246,0.3970000147819519,0.4020000100135803,0.3970000147819519,0.4009999930858612,0.3959999978542328,0.3970000147819519,0.3989999890327453,0.3939999938011169,0.3989999890327453,0.4000000059604645,0.4000000059604645,0.3989999890327453,0.4050000011920929,0.4059999883174896,0.4009999930858612,0.3989999890327453,0.3959999978542328,0.3939999938011169,0.3970000147819519,0.4009999930858612,0.3989999890327453,0.3939999938011169],"label":"FineWeb: base filtering only"}},"layout":{"title":{"text":"The different FineWeb processing steps"}}}
dist/assets/data/plots/all_filtering_steps/winogrande_acc_norm.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"data":{"big-run-sampled_full_ind_minhash":{"x":[0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,29.360128000000003,31.45728,33.554432000000006,35.651584,37.748736,39.845888,41.94304,44.040192000000005,46.137344000000006,48.234496,50.331648,52.4288,54.525952000000004,56.623104000000005,58.720256000000006,60.817408,62.91456,65.011712,67.10886400000001,69.206016,71.303168,73.40032000000001,75.497472,77.59462400000001,79.691776,81.788928,83.88608,85.983232,88.08038400000001,90.177536,92.27468800000001,94.37184,96.468992,98.56614400000001,100.663296,102.76044800000001,104.8576,106.95475200000001,109.05190400000001,111.149056,113.24620800000001,115.34336,117.44051200000001,119.537664,121.634816,123.73196800000001,125.82912,127.92627200000001,130.023424,132.120576,134.21772800000002,136.31488000000002,138.412032,140.509184,142.606336,144.70348800000002,146.80064000000002,148.897792,150.994944,153.092096,155.18924800000002,157.28640000000001,159.383552,161.480704,163.577856,165.67500800000002,167.77216,169.869312,171.966464,174.06361600000002,176.16076800000002,178.25792,180.355072,182.452224,184.54937600000002,186.64652800000002,188.74368,190.840832,192.937984,195.03513600000002,197.13228800000002,199.22944,201.326592,203.423744,205.52089600000002,207.61804800000002,209.7152,211.812352,213.90950400000003,216.00665600000002,218.10380800000001,220.20096,222.298112,224.39526400000003,226.49241600000002,228.589568,230.68672,232.783872,234.88102400000002,236.97817600000002,239.075328,241.17248,243.269632,245.36678400000002,247.46393600000002,249.561088,251.65824,253.75539200000003,255.85254400000002,257.949696,260.046848,262.144,264.241152,266.338304,268.43545600000004,270.53260800000004,272.62976000000003,274.726912,276.824064,278.921216,281.018368,283.11552,285.212672,287.309824,289.40697600000004,291.50412800000004,293.601280000000
03,295.698432,297.795584,299.892736,301.989888,304.08704,306.184192,308.28134400000005,310.37849600000004,312.47564800000004,314.57280000000003,316.669952,318.767104,320.864256,322.961408,325.05856,327.155712,329.25286400000005,331.35001600000004,333.44716800000003,335.54432,337.641472,339.738624,341.835776,343.932928,346.03008,348.12723200000005,350.22438400000004],"y":[0.4970000088214874,0.4880000054836273,0.492000013589859,0.5059999823570251,0.5139999985694885,0.5070000290870667,0.5090000033378601,0.5230000019073486,0.5189999938011169,0.5189999938011169,0.5220000147819519,0.5149999856948853,0.5260000228881836,0.5329999923706055,0.5180000066757202,0.5289999842643738,0.5400000214576721,0.5410000085830688,0.5440000295639038,0.5329999923706055,0.550000011920929,0.5419999957084656,0.5360000133514404,0.5429999828338623,0.5429999828338623,0.5450000166893005,0.5490000247955322,0.5400000214576721,0.5509999990463257,0.5559999942779541,0.5479999780654907,0.5540000200271606,0.5490000247955322,0.5400000214576721,0.5429999828338623,0.5460000038146973,0.5370000004768372,0.5479999780654907,0.5550000071525574,0.5490000247955322,0.5400000214576721,0.5410000085830688,0.5460000038146973,0.546999990940094,0.5479999780654907,0.546999990940094,0.5509999990463257,0.5450000166893005,0.5590000152587891,0.5419999957084656,0.5540000200271606,0.5440000295639038,0.5450000166893005,0.5580000281333923,0.5540000200271606,0.5440000295639038,0.5619999766349792,0.5450000166893005,0.5600000023841858,0.5559999942779541,0.5600000023841858,0.5400000214576721,0.5569999814033508,0.5600000023841858,0.5619999766349792,0.5529999732971191,0.5649999976158142,0.5609999895095825,0.5550000071525574,0.5609999895095825,0.5580000281333923,0.5550000071525574,0.5619999766349792,0.5550000071525574,0.5519999861717224,0.5600000023841858,0.5550000071525574,0.5550000071525574,0.5590000152587891,0.5490000247955322,0.5580000281333923,0.5600000023841858,0.5419999957084656,0.5559999942779541,0.5559999942779541,0.5529999732971
191,0.5609999895095825,0.5519999861717224,0.5569999814033508,0.5569999814033508,0.5509999990463257,0.5619999766349792,0.546999990940094,0.5619999766349792,0.5460000038146973,0.5529999732971191,0.5619999766349792,0.5690000057220459,0.5680000185966492,0.5720000267028809,0.5640000104904175,0.5550000071525574,0.5509999990463257,0.550000011920929,0.5600000023841858,0.5609999895095825,0.5630000233650208,0.5649999976158142,0.5529999732971191,0.5540000200271606,0.5529999732971191,0.5659999847412109,0.5600000023841858,0.5590000152587891,0.5619999766349792,0.5600000023841858,0.5730000138282776,0.5569999814033508,0.5690000057220459,0.5619999766349792,0.5680000185966492,0.578000009059906,0.5730000138282776,0.5550000071525574,0.5529999732971191,0.5600000023841858,0.5630000233650208,0.5590000152587891,0.5659999847412109,0.5669999718666077,0.5609999895095825,0.5630000233650208,0.5569999814033508,0.5490000247955322,0.5619999766349792,0.5550000071525574,0.5630000233650208,0.5559999942779541,0.5559999942779541,0.5649999976158142,0.5569999814033508,0.5619999766349792,0.5559999942779541,0.5669999718666077,0.5609999895095825,0.5690000057220459,0.5770000219345093,0.5690000057220459,0.5720000267028809,0.5619999766349792,0.5649999976158142,0.5669999718666077,0.5680000185966492,0.5699999928474426,0.5640000104904175,0.5609999895095825,0.5740000009536743,0.5690000057220459,0.5669999718666077,0.5720000267028809,0.5699999928474426,0.5709999799728394,0.5740000009536743,0.5680000185966492,0.5619999766349792,0.5690000057220459,0.5659999847412109,0.574999988079071],"label":"FineWeb: independent MinHash (id 
mh)"},"big-run-fineweb-v1-all-dumps":{"x":[0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,29.360128000000003,31.45728,33.554432000000006,35.651584,37.748736,39.845888,41.94304,44.040192000000005,46.137344000000006,48.234496,50.331648,52.4288,54.525952000000004,56.623104000000005,58.720256000000006,60.817408,62.91456,65.011712,67.10886400000001,69.206016,71.303168,73.40032000000001,75.497472,77.59462400000001,79.691776,81.788928,83.88608,85.983232,88.08038400000001,90.177536,92.27468800000001,94.37184,96.468992,98.56614400000001,100.663296,102.76044800000001,104.8576,106.95475200000001,109.05190400000001,111.149056,113.24620800000001,115.34336,117.44051200000001,119.537664,121.634816,123.73196800000001,125.82912,127.92627200000001,130.023424,132.120576,134.21772800000002,136.31488000000002,138.412032,140.509184,142.606336,144.70348800000002,146.80064000000002,148.897792,150.994944,153.092096,155.18924800000002,157.28640000000001,159.383552,161.480704,163.577856,165.67500800000002,167.77216,169.869312,171.966464,174.06361600000002,176.16076800000002,178.25792,180.355072,182.452224,184.54937600000002,186.64652800000002,188.74368,190.840832,192.937984,195.03513600000002,197.13228800000002,199.22944,201.326592,203.423744,205.52089600000002,207.61804800000002,209.7152,211.812352,213.90950400000003,216.00665600000002,218.10380800000001,220.20096,222.298112,224.39526400000003,226.49241600000002,228.589568,230.68672,232.783872,234.88102400000002,236.97817600000002,239.075328,241.17248,243.269632,245.36678400000002,247.46393600000002,249.561088,251.65824,253.75539200000003,255.85254400000002,257.949696,260.046848,262.144,264.241152,266.338304,268.43545600000004,270.53260800000004,272.62976000000003,274.726912,276.824064,278.921216,281.018368,283.11552,285.212672,287.309824,289.40697600000004,291.50412800000004,293.60128000000003,295.69
8432,297.795584,299.892736,301.989888,304.08704,306.184192,308.28134400000005,310.37849600000004,312.47564800000004,314.57280000000003,316.669952,318.767104,320.864256,322.961408,325.05856,327.155712,329.25286400000005,331.35001600000004,333.44716800000003,335.54432,337.641472,339.738624,341.835776,343.932928,346.03008,348.12723200000005,350.22438400000004],"y":[0.4970000088214874,0.4760000109672546,0.4979999959468841,0.503000020980835,0.531000018119812,0.515999972820282,0.5220000147819519,0.5210000276565552,0.5260000228881836,0.5289999842643738,0.5249999761581421,0.5239999890327454,0.5189999938011169,0.5260000228881836,0.5139999985694885,0.5299999713897705,0.5370000004768372,0.5350000262260437,0.5329999923706055,0.531000018119812,0.5299999713897705,0.550000011920929,0.5329999923706055,0.5260000228881836,0.5320000052452087,0.5339999794960022,0.5429999828338623,0.5440000295639038,0.5379999876022339,0.5509999990463257,0.5529999732971191,0.5440000295639038,0.5479999780654907,0.5419999957084656,0.5339999794960022,0.5440000295639038,0.5419999957084656,0.5370000004768372,0.5289999842643738,0.5220000147819519,0.5429999828338623,0.5519999861717224,0.5419999957084656,0.5370000004768372,0.546999990940094,0.5509999990463257,0.5509999990463257,0.5460000038146973,0.5519999861717224,0.5429999828338623,0.5419999957084656,0.5379999876022339,0.5450000166893005,0.5440000295639038,0.5440000295639038,0.5239999890327454,0.5450000166893005,0.550000011920929,0.5550000071525574,0.5429999828338623,0.5540000200271606,0.5410000085830688,0.5429999828338623,0.5550000071525574,0.5509999990463257,0.5460000038146973,0.550000011920929,0.546999990940094,0.5429999828338623,0.5299999713897705,0.550000011920929,0.5550000071525574,0.5440000295639038,0.5410000085830688,0.5450000166893005,0.550000011920929,0.546999990940094,0.5519999861717224,0.5529999732971191,0.550000011920929,0.5519999861717224,0.5540000200271606,0.5379999876022339,0.5590000152587891,0.5440000295639038,0.5540000200271606,0.554000020027
1606,0.5429999828338623,0.5450000166893005,0.5440000295639038,0.5519999861717224,0.546999990940094,0.5519999861717224,0.5559999942779541,0.5659999847412109,0.5649999976158142,0.5600000023841858,0.5569999814033508,0.5550000071525574,0.5630000233650208,0.5559999942779541,0.5669999718666077,0.5550000071525574,0.5609999895095825,0.5580000281333923,0.5699999928474426,0.5580000281333923,0.5490000247955322,0.5619999766349792,0.5609999895095825,0.5529999732971191,0.5490000247955322,0.5540000200271606,0.5590000152587891,0.5600000023841858,0.5509999990463257,0.5569999814033508,0.5509999990463257,0.5580000281333923,0.5580000281333923,0.5580000281333923,0.5619999766349792,0.5649999976158142,0.5540000200271606,0.5619999766349792,0.5659999847412109,0.5759999752044678,0.5709999799728394,0.5550000071525574,0.5659999847412109,0.5659999847412109,0.5680000185966492,0.5669999718666077,0.5600000023841858,0.5619999766349792,0.5640000104904175,0.5580000281333923,0.5580000281333923,0.5540000200271606,0.5789999961853027,0.5600000023841858,0.5509999990463257,0.5690000057220459,0.5709999799728394,0.5669999718666077,0.5600000023841858,0.5619999766349792,0.5600000023841858,0.5590000152587891,0.5600000023841858,0.5690000057220459,0.5690000057220459,0.5789999961853027,0.5669999718666077,0.5690000057220459,0.5649999976158142,0.5690000057220459,0.5699999928474426,0.5669999718666077,0.5649999976158142,0.5630000233650208,0.5559999942779541,0.5669999718666077,0.5669999718666077,0.5720000267028809,0.5690000057220459,0.5830000042915344,0.5640000104904175],"label":"FineWeb: id mh + C4 + custom 
filters"},"big-run-sampled-fineweb-c4-filters":{"x":[0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,29.360128000000003,31.45728,33.554432000000006,35.651584,37.748736,39.845888,41.94304,44.040192000000005,46.137344000000006,48.234496,50.331648,52.4288,54.525952000000004,56.623104000000005,58.720256000000006,60.817408,62.91456,65.011712,67.10886400000001,69.206016,71.303168,73.40032000000001,75.497472,77.59462400000001,79.691776,81.788928,83.88608,85.983232,88.08038400000001,90.177536,92.27468800000001,94.37184,96.468992,98.56614400000001,100.663296,102.76044800000001,104.8576,106.95475200000001,109.05190400000001,111.149056,113.24620800000001,115.34336,117.44051200000001,119.537664,121.634816,123.73196800000001,125.82912,127.92627200000001,130.023424,132.120576,134.21772800000002,136.31488000000002,138.412032,140.509184,142.606336,144.70348800000002,146.80064000000002,148.897792,150.994944,153.092096,155.18924800000002,157.28640000000001,159.383552,161.480704,163.577856,165.67500800000002,167.77216,169.869312,171.966464,174.06361600000002,176.16076800000002,178.25792,180.355072,182.452224,184.54937600000002,186.64652800000002,188.74368,190.840832,192.937984,195.03513600000002,197.13228800000002,199.22944,201.326592,203.423744,205.52089600000002,207.61804800000002,209.7152,211.812352,213.90950400000003,216.00665600000002,218.10380800000001,220.20096,222.298112,224.39526400000003,226.49241600000002,228.589568,230.68672,232.783872,234.88102400000002,236.97817600000002,239.075328,241.17248,243.269632,245.36678400000002,247.46393600000002,249.561088,251.65824,253.75539200000003,255.85254400000002,257.949696,260.046848,262.144,264.241152,266.338304,268.43545600000004,270.53260800000004,272.62976000000003,274.726912,276.824064,278.921216,281.018368,283.11552,285.212672,287.309824,289.40697600000004,291.50412800000004,293.60128000000
003,295.698432,297.795584,299.892736,301.989888,304.08704,306.184192,308.28134400000005,310.37849600000004,312.47564800000004,314.57280000000003,316.669952,318.767104,320.864256,322.961408,325.05856,327.155712,329.25286400000005,331.35001600000004,333.44716800000003,335.54432,337.641472,339.738624,341.835776,343.932928,346.03008,348.12723200000005,350.22438400000004],"y":[0.4970000088214874,0.4790000021457672,0.4839999973773956,0.5059999823570251,0.5109999775886536,0.5070000290870667,0.5099999904632568,0.5239999890327454,0.5239999890327454,0.5120000243186951,0.5339999794960022,0.5220000147819519,0.5189999938011169,0.5210000276565552,0.5260000228881836,0.5389999747276306,0.5249999761581421,0.531000018119812,0.527999997138977,0.5299999713897705,0.5230000019073486,0.5289999842643738,0.5450000166893005,0.5419999957084656,0.5329999923706055,0.5400000214576721,0.5270000100135803,0.5299999713897705,0.5379999876022339,0.531000018119812,0.5170000195503235,0.5429999828338623,0.5339999794960022,0.5429999828338623,0.5389999747276306,0.5289999842643738,0.5360000133514404,0.5289999842643738,0.5350000262260437,0.5339999794960022,0.5220000147819519,0.5400000214576721,0.5540000200271606,0.5299999713897705,0.5379999876022339,0.531000018119812,0.5440000295639038,0.5329999923706055,0.5400000214576721,0.5299999713897705,0.5429999828338623,0.5299999713897705,0.5329999923706055,0.5419999957084656,0.5339999794960022,0.5450000166893005,0.5260000228881836,0.5379999876022339,0.5490000247955322,0.5529999732971191,0.527999997138977,0.5350000262260437,0.5429999828338623,0.5410000085830688,0.5400000214576721,0.5410000085830688,0.5320000052452087,0.5360000133514404,0.5329999923706055,0.5339999794960022,0.5410000085830688,0.5550000071525574,0.546999990940094,0.5360000133514404,0.546999990940094,0.5550000071525574,0.5440000295639038,0.5429999828338623,0.5479999780654907,0.5490000247955322,0.5440000295639038,0.5299999713897705,0.546999990940094,0.5529999732971191,0.5429999828338623,0.5419999957084656
,0.5460000038146973,0.546999990940094,0.546999990940094,0.5440000295639038,0.5460000038146973,0.5509999990463257,0.5460000038146973,0.5479999780654907,0.5640000104904175,0.546999990940094,0.5419999957084656,0.550000011920929,0.5540000200271606,0.550000011920929,0.5490000247955322,0.5479999780654907,0.546999990940094,0.550000011920929,0.5479999780654907,0.550000011920929,0.5509999990463257,0.5440000295639038,0.5580000281333923,0.550000011920929,0.5590000152587891,0.5590000152587891,0.5600000023841858,0.550000011920929,0.5609999895095825,0.550000011920929,0.5630000233650208,0.5490000247955322,0.5490000247955322,0.5580000281333923,0.5519999861717224,0.5619999766349792,0.5540000200271606,0.5580000281333923,0.5559999942779541,0.5490000247955322,0.5490000247955322,0.5590000152587891,0.5440000295639038,0.550000011920929,0.5540000200271606,0.5590000152587891,0.5529999732971191,0.5540000200271606,0.5559999942779541,0.5540000200271606,0.5619999766349792,0.5509999990463257,0.5699999928474426,0.5529999732971191,0.5490000247955322,0.5529999732971191,0.5580000281333923,0.5540000200271606,0.5590000152587891,0.5590000152587891,0.5590000152587891,0.5509999990463257,0.546999990940094,0.5550000071525574,0.5600000023841858,0.5580000281333923,0.5680000185966492,0.5600000023841858,0.5590000152587891,0.5619999766349792,0.5640000104904175,0.5609999895095825,0.5569999814033508,0.5550000071525574,0.5509999990463257,0.5609999895095825,0.5529999732971191,0.5630000233650208,0.5690000057220459,0.5640000104904175,0.5619999766349792,0.5619999766349792],"label":"FineWeb: id mh + C4 
filters"},"big-run-sampled_full_filtered_no_dedup":{"x":[0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,29.360128000000003,31.45728,33.554432000000006,35.651584,37.748736,39.845888,41.94304,44.040192000000005,46.137344000000006,48.234496,50.331648,52.4288,54.525952000000004,56.623104000000005,58.720256000000006,60.817408,62.91456,65.011712,67.10886400000001,69.206016,71.303168,73.40032000000001,75.497472,77.59462400000001,79.691776,81.788928,83.88608,85.983232,88.08038400000001,90.177536,92.27468800000001,94.37184,96.468992,98.56614400000001,100.663296,102.76044800000001,104.8576,106.95475200000001,109.05190400000001,111.149056,113.24620800000001,115.34336,117.44051200000001,119.537664,121.634816,123.73196800000001,125.82912,127.92627200000001,130.023424,132.120576,134.21772800000002,136.31488000000002,138.412032,140.509184,142.606336,144.70348800000002,146.80064000000002,148.897792,150.994944,153.092096,155.18924800000002,157.28640000000001,159.383552,161.480704,163.577856,165.67500800000002,167.77216,169.869312,171.966464,174.06361600000002,176.16076800000002,178.25792,180.355072,182.452224,184.54937600000002,186.64652800000002,188.74368,190.840832,192.937984,195.03513600000002,197.13228800000002,199.22944,201.326592,203.423744,205.52089600000002,207.61804800000002,209.7152,211.812352,213.90950400000003,216.00665600000002,218.10380800000001,220.20096,222.298112,224.39526400000003,226.49241600000002,228.589568,230.68672,232.783872,234.88102400000002,236.97817600000002,239.075328,241.17248,243.269632,245.36678400000002,247.46393600000002,249.561088,251.65824,253.75539200000003,255.85254400000002,257.949696,260.046848,262.144,264.241152,266.338304,268.43545600000004,270.53260800000004,272.62976000000003,274.726912,276.824064,278.921216,281.018368,283.11552,285.212672,287.309824,289.40697600000004,291.50412800000004,293.6012800
0000003,295.698432,297.795584,299.892736,301.989888,304.08704,306.184192,308.28134400000005,310.37849600000004,312.47564800000004,314.57280000000003,316.669952,318.767104,320.864256,322.961408,325.05856,327.155712,329.25286400000005,331.35001600000004,333.44716800000003,335.54432,337.641472,339.738624,341.835776,343.932928,346.03008,348.12723200000005,350.22438400000004],"y":[0.4970000088214874,0.5239999890327454,0.4900000095367431,0.5040000081062317,0.5099999904632568,0.4990000128746032,0.5170000195503235,0.5040000081062317,0.5009999871253967,0.5230000019073486,0.5109999775886536,0.5059999823570251,0.5130000114440918,0.5090000033378601,0.5220000147819519,0.5189999938011169,0.5180000066757202,0.5220000147819519,0.5120000243186951,0.5460000038146973,0.5239999890327454,0.5289999842643738,0.5440000295639038,0.5339999794960022,0.5299999713897705,0.5260000228881836,0.5360000133514404,0.5339999794960022,0.5360000133514404,0.5299999713897705,0.5180000066757202,0.5249999761581421,0.5440000295639038,0.5299999713897705,0.5339999794960022,0.5239999890327454,0.527999997138977,0.5139999985694885,0.5289999842643738,0.5360000133514404,0.5260000228881836,0.5389999747276306,0.5460000038146973,0.5270000100135803,0.5339999794960022,0.5320000052452087,0.5329999923706055,0.5260000228881836,0.5220000147819519,0.5260000228881836,0.5379999876022339,0.5410000085830688,0.5350000262260437,0.5389999747276306,0.5320000052452087,0.5389999747276306,0.5379999876022339,0.5329999923706055,0.5270000100135803,0.5170000195503235,0.5329999923706055,0.5370000004768372,0.5379999876022339,0.5249999761581421,0.5479999780654907,0.546999990940094,0.5400000214576721,0.5440000295639038,0.5360000133514404,0.5450000166893005,0.5440000295639038,0.5370000004768372,0.5370000004768372,0.5479999780654907,0.5379999876022339,0.5400000214576721,0.5479999780654907,0.5379999876022339,0.5509999990463257,0.5440000295639038,0.5379999876022339,0.550000011920929,0.5389999747276306,0.5370000004768372,0.5379999876022339,0.5419999
957084656,0.5360000133514404,0.5509999990463257,0.5360000133514404,0.5419999957084656,0.5419999957084656,0.550000011920929,0.5360000133514404,0.5519999861717224,0.5540000200271606,0.546999990940094,0.5370000004768372,0.5379999876022339,0.5519999861717224,0.5329999923706055,0.5400000214576721,0.5429999828338623,0.550000011920929,0.5490000247955322,0.5360000133514404,0.550000011920929,0.5569999814033508,0.5490000247955322,0.5490000247955322,0.5479999780654907,0.5350000262260437,0.5490000247955322,0.5370000004768372,0.5440000295639038,0.5329999923706055,0.5440000295639038,0.5429999828338623,0.5389999747276306,0.5450000166893005,0.5320000052452087,0.5450000166893005,0.5400000214576721,0.5419999957084656,0.5460000038146973,0.5370000004768372,0.5400000214576721,0.5460000038146973,0.5370000004768372,0.5370000004768372,0.5460000038146973,0.5400000214576721,0.5490000247955322,0.5529999732971191,0.5379999876022339,0.5460000038146973,0.5450000166893005,0.5429999828338623,0.5460000038146973,0.5400000214576721,0.5479999780654907,0.5460000038146973,0.5540000200271606,0.5400000214576721,0.5350000262260437,0.5490000247955322,0.5460000038146973,0.5460000038146973,0.5509999990463257,0.5410000085830688,0.5429999828338623,0.5379999876022339,0.5450000166893005,0.5389999747276306,0.5400000214576721,0.5400000214576721,0.550000011920929,0.5440000295639038,0.5389999747276306,0.5450000166893005,0.5400000214576721,0.5389999747276306,0.5419999957084656,0.5410000085830688,0.5440000295639038,0.5519999861717224,0.5479999780654907,0.5450000166893005,0.5569999814033508],"label":"FineWeb: base filtering only"}},"layout":{"title":{"text":"The different FineWeb processing steps"}}}
dist/assets/data/plots/c4_filters_hellaswag/agg_score.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"data":{"sm-baseline-c4":{"x":[0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.3308933284133672,0.3580685469011466,0.3740996705989043,0.39048008372386295,0.39857714250683784,0.40837346265713376,0.4111154315372308,0.41773712386687595,0.4196594481666882,0.42379963273803395,0.4276047808428605,0.42980752388636273,0.43098293244838715,0.43155378103256226,0.4327609067161878],"label":"C4"},"filtering-c4-all":{"x":[0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.3308933284133672,0.36066408455371857,0.3812380563467741,0.394003426656127,0.40062618628144264,0.4117735456675291,0.4165923688560724,0.4175422675907612,0.42100309208035464,0.42246321588754654,0.42360376194119453,0.42823668196797365,0.4299001637846231,0.4302353039383888,0.4310380257666111],"label":"All filters"},"filtering-c4-all-except-terminal_punct":{"x":[0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.330924579873681,0.35825083684176207,0.37912008538842196,0.38942993618547916,0.3983491826802492,0.4053049590438604,0.4079726096242666,0.4135104585438967,0.41717425361275673,0.41904263757169247,0.4211529679596424,0.4212619122117758,0.42373160831630224,0.42435371689498425,0.4279126934707165],"label":"All filters except 
terminal_punct"},"filtering-c4-terminal_punct":{"x":[0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.3308933284133672,0.36182260885834694,0.3764855917543173,0.3928546328097582,0.3978128544986248,0.4073755294084549,0.4112890623509884,0.41486112400889397,0.4196756165474653,0.4235504809767008,0.42218128964304924,0.4228535555303097,0.4249562546610832,0.42740595713257784,0.42711055465042586],"label":"terminal_punct filter"},"filtering-c4-curly_bracket":{"x":[0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.3308933284133672,0.3583905678242445,0.38119001872837543,0.3873079549521208,0.39723034016788,0.4043100867420435,0.40908974781632423,0.4140731003135443,0.41894380562007427,0.41736695170402527,0.4232212919741869,0.4229240976274013,0.4236308634281158,0.42750727012753487,0.4268195778131485],"label":"curly_bracket filter"},"filtering-c4-word_lengths":{"x":[0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.3308933284133672,0.36000680737197394,0.37551611103117466,0.38802069239318365,0.3933942876756191,0.4043118376284838,0.40780537389218807,0.4112964067608118,0.4137573726475239,0.41791345551609993,0.4173779133707285,0.42117033526301384,0.42073468305170536,0.42412591539323324,0.4260616712272167],"label":"word_lengths 
filter"},"filtering-baseline-2019-18-60gt":{"x":[0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.3308296035975218,0.35613923892378807,0.3746252153068781,0.38806260935962195,0.39690930768847466,0.4043668694794178,0.40220927633345127,0.41070565767586226,0.41399387270212173,0.4170555509626865,0.42098715901374817,0.4210818205028772,0.42051274701952934,0.424176013097167,0.4225243702530861],"label":"baseline"}},"layout":{"title":{"text":"C4 filtering effect on HellaSwag"}}}
dist/assets/data/plots/c4_filters_hellaswag/arc_acc_norm.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"data":{"sm-baseline-c4":{"x":[0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2509999871253967,0.2928333381811778,0.3191666702429453,0.3451666633288066,0.342166672150294,0.35983332991600037,0.35483332475026447,0.3643333315849304,0.3631666700045268,0.3698333303133647,0.3696666657924652,0.37433333198229474,0.3805000086625417,0.3800000051657359,0.3798333406448364],"label":"C4"},"filtering-c4-curly_bracket":{"x":[0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2509999871253967,0.29250000417232513,0.3184999972581863,0.3297500014305115,0.34450000524520874,0.3512499928474426,0.35724999010562897,0.36375001072883606,0.3665000051259994,0.3684999942779541,0.3712499886751175,0.37375000119209284,0.37800000607967377,0.3840000033378601,0.37950000166893005],"label":"curly_bracket 
filter"},"filtering-baseline-2019-18-60gt":{"x":[0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2509999871253967,0.2905000001192093,0.32549999654293055,0.3307500034570694,0.3467499911785126,0.3500000089406967,0.3452499955892563,0.3622500002384185,0.35999999940395355,0.37024998664855957,0.3684999942779541,0.3675000071525574,0.37249998748302454,0.37675000727176666,0.3760000020265579],"label":"baseline"},"filtering-c4-word_lengths":{"x":[0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2509999871253967,0.28949999809265137,0.3187499940395355,0.33825001120567316,0.35074999928474426,0.3604999929666519,0.36274999380111694,0.3634999990463257,0.3645000010728836,0.3644999861717224,0.3669999986886978,0.3642500042915344,0.3722499907016754,0.37499999999999994,0.37549999356269836],"label":"word_lengths filter"},"filtering-c4-all-except-terminal_punct":{"x":[0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2512499988079071,0.294500008225441,0.32725000381469727,0.3352499902248382,0.3504999876022339,0.3487499952316284,0.3557500094175339,0.35324999690055847,0.36374999582767487,0.36474999785423273,0.372749999165535,0.36775000393390656,0.3707500100135803,0.3734999895095825,0.375],"label":"All filters except 
terminal_punct"},"filtering-c4-all":{"x":[0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2509999871253967,0.30024999380111694,0.32724998891353607,0.33374999463558197,0.34574998915195465,0.351749986410141,0.36124999821186066,0.3527500033378601,0.3582500070333481,0.35850000381469727,0.36075000464916224,0.364750012755394,0.37049999833106995,0.3729999959468841,0.36974999308586115],"label":"All filters"},"filtering-c4-terminal_punct":{"x":[0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2509999871253967,0.2947500050067901,0.31974999606609344,0.3344999998807907,0.3445000052452087,0.351500004529953,0.35199999809265137,0.35925000905990595,0.3634999990463257,0.36374999582767487,0.36550000309944153,0.36775000393390656,0.3677499890327453,0.36900000274181366,0.36650000512599945],"label":"terminal_punct filter"}},"layout":{"title":{"text":"C4 filtering effect on HellaSwag"}}}
dist/assets/data/plots/c4_filters_hellaswag/commonsense_qa_acc_norm.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"data":{"filtering-c4-word_lengths":{"x":[0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2329999953508377,0.26349999010562897,0.2824999988079071,0.2985000014305115,0.3050000071525574,0.3119999915361404,0.3110000044107437,0.3164999932050705,0.32199999690055847,0.3279999941587448,0.3365000039339065,0.3375000059604645,0.3384999930858612,0.340499997138977,0.341499999165535],"label":"word_lengths filter"},"filtering-c4-curly_bracket":{"x":[0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2329999953508377,0.2619999945163727,0.288000002503395,0.29749999940395355,0.30399999022483826,0.3149999976158142,0.3245000094175339,0.3230000138282776,0.3240000009536743,0.3245000094175339,0.33550000190734863,0.335999995470047,0.32999999821186066,0.3375000059604645,0.34049999713897705],"label":"curly_bracket filter"},"filtering-c4-all":{"x":[0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2329999953508377,0.26299999654293055,0.2864999920129776,0.2944999933242798,0.2985000014305115,0.3165000081062317,0.3194999992847442,0.318000003695488,0.32500000298023224,0.32899999618530273,0.3254999965429306,0.33150000870227814,0.3330000042915344,0.33200000226497645,0.3330000042915344],"label":"All 
filters"},"filtering-c4-terminal_punct":{"x":[0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2329999953508377,0.2650000005960464,0.28599999845027924,0.3110000044107437,0.2944999933242798,0.3085000067949295,0.32199999690055847,0.31949999928474426,0.3240000009536743,0.32500000298023224,0.3245000094175339,0.32199999690055847,0.3265000134706497,0.3295000046491623,0.32999999821186066],"label":"terminal_punct filter"},"filtering-baseline-2019-18-60gt":{"x":[0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2329999953508377,0.2584999948740005,0.2850000113248825,0.30850000679492945,0.30149999260902405,0.31049999594688416,0.3079999983310699,0.3150000125169754,0.32199999690055847,0.3244999945163727,0.3205000013113022,0.3244999945163727,0.3279999941587448,0.33149999380111694,0.32850000262260437],"label":"baseline"},"sm-baseline-c4":{"x":[0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2329999953508377,0.25700000921885174,0.2786666651566823,0.2960000038146972,0.3049999972184499,0.3053333262602488,0.3120000064373016,0.31733333071072894,0.3163333336512248,0.3186666667461395,0.3226666748523712,0.3286666671435038,0.3240000009536743,0.32900000611941016,0.3283333381017049],"label":"C4"},"filtering-c4-all-except-terminal_punct":{"x":[0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2329999953508377,0.25800000131130213,0.284
9999964237213,0.29200001060962677,0.289000004529953,0.30349999666213984,0.30400000512599945,0.3139999955892563,0.3139999955892563,0.318000003695488,0.32299999892711634,0.3174999952316284,0.3215000033378601,0.32250000536441803,0.32549999654293055],"label":"All filters except terminal_punct"}},"layout":{"title":{"text":"C4 filtering effect on HellaSwag"}}}
dist/assets/data/plots/c4_filters_hellaswag/hellaswag_acc_norm.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"data":{"filtering-c4-all":{"x":[0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.257999986410141,0.29950000345706934,0.33799999952316284,0.3789999932050705,0.3970000147819519,0.42149999737739563,0.431999996304512,0.4440000057220459,0.4490000009536743,0.45949999988079065,0.4714999943971634,0.48000000417232513,0.47749999165534973,0.48100000619888306,0.48950000107288355],"label":"All filters"},"sm-baseline-c4":{"x":[0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.257999986410141,0.29699999094009394,0.3369999925295512,0.3699999948342641,0.3930000066757202,0.41233333945274353,0.42733333508173627,0.43799999356269836,0.4506666660308838,0.454666664203008,0.47166667381922406,0.47766666611035663,0.476666659116745,0.48366666833559663,0.4853333334128062],"label":"C4"},"filtering-c4-all-except-terminal_punct":{"x":[0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.257999986410141,0.2955000102519989,0.3385000079870224,0.36800000071525574,0.40099999308586115,0.4099999964237213,0.41700001060962677,0.42400000989437103,0.4389999955892563,0.4414999932050705,0.4484999924898147,0.455499991774559,0.45799998939037323,0.4660000056028366,0.471000000834465],"label":"All filters except 
terminal_punct"},"filtering-c4-terminal_punct":{"x":[0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.257999986410141,0.2939999997615814,0.3295000046491623,0.3684999942779541,0.38449999690055847,0.398499995470047,0.3959999978542328,0.4204999953508377,0.4335000067949295,0.445499986410141,0.443000003695488,0.455499991774559,0.45250000059604645,0.4529999941587448,0.4545000046491623],"label":"terminal_punct filter"},"filtering-c4-word_lengths":{"x":[0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.257999986410141,0.29100000858306885,0.32400000095367426,0.3439999967813492,0.3575000017881393,0.3800000101327896,0.40049999952316284,0.4134999960660934,0.42099998891353607,0.4204999953508377,0.4280000030994415,0.44099999964237213,0.43799999356269836,0.44200000166893005,0.44600000977516174],"label":"word_lengths filter"},"filtering-c4-curly_bracket":{"x":[0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.257999986410141,0.29749999940395355,0.3240000009536743,0.34849999845027924,0.3725000023841858,0.3895000070333481,0.39800000190734863,0.41000001132488245,0.4214999973773956,0.42149999737739563,0.42499999701976776,0.42750000953674316,0.4364999979734421,0.4354999959468841,0.4385000020265579],"label":"curly_bracket 
filter"},"filtering-baseline-2019-18-60gt":{"x":[0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.257999986410141,0.28949999809265137,0.32599999010562897,0.34450000524520874,0.3725000023841858,0.38500000536441803,0.39499999582767487,0.408500000834465,0.41700001060962677,0.4174999892711639,0.4284999966621399,0.42849999666213984,0.43150000274181366,0.4399999976158142,0.4375],"label":"baseline"}},"layout":{"title":{"text":"C4 filtering effect on HellaSwag"}}}
dist/assets/data/plots/c4_filters_hellaswag/index.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"files":{"agg_score":{"file":"agg_score.json"},"commonsense_qa/acc_norm":{"file":"commonsense_qa_acc_norm.json"},"hellaswag/acc_norm":{"file":"hellaswag_acc_norm.json"},"openbookqa/acc_norm":{"file":"openbookqa_acc_norm.json"},"piqa/acc_norm":{"file":"piqa_acc_norm.json"},"siqa/acc_norm":{"file":"siqa_acc_norm.json"},"winogrande/acc_norm":{"file":"winogrande_acc_norm.json"},"arc/acc_norm":{"file":"arc_acc_norm.json"},"mmlu/acc_norm":{"file":"mmlu_acc_norm.json"}},"settings":{"defaultMetric":"hellaswag/acc_norm","slider":{"min":0,"max":10,"default":3}}}
dist/assets/data/plots/c4_filters_hellaswag/mmlu_acc_norm.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"data":{"filtering-baseline-2019-18-60gt":{"x":[0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.25013685226440424,0.25661391019821167,0.2620016932487488,0.2657508552074432,0.2710244506597519,0.2744349539279938,0.27642421424388885,0.2818952649831772,0.2794509679079056,0.2831944525241852,0.28439727425575256,0.2866545617580414,0.2866020053625107,0.28615814447402954,0.2871949374675751],"label":"baseline"},"filtering-c4-word_lengths":{"x":[0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2501466572284698,0.25205445289611816,0.2613788843154907,0.26891554892063135,0.2724043130874634,0.27449470758438105,0.27719296514987946,0.27587129175662994,0.2815589904785156,0.2833077013492584,0.2830233126878738,0.28461267054080963,0.2871275246143341,0.28650729358196253,0.2869933694601059],"label":"word_lengths filter"},"filtering-c4-all-except-terminal_punct":{"x":[0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2501466572284698,0.25500668585300446,0.26221066713333124,0.26368947327136993,0.2702934741973877,0.27218967676162714,0.27553085982799524,0.27833363413810724,0.2786440253257751,0.2810910940170288,0.2834737300872803,0.2833452969789505,0.2836028486490249,0.28682972490787506,0.2868015915155411],"label":"All filters except 
terminal_punct"},"filtering-c4-curly_bracket":{"x":[0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2501466572284698,0.25762456655502314,0.2630201578140259,0.2672136425971985,0.27234274148941034,0.2702306807041168,0.27446796000003815,0.27583475410938263,0.2770504504442215,0.2794356495141983,0.28302033245563507,0.28214274346828455,0.2855468988418579,0.2840581685304642,0.28505663573741913],"label":"curly_bracket filter"},"sm-baseline-c4":{"x":[0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2501466572284698,0.2557150324185689,0.25763070583343506,0.2643406589825948,0.26745049158732087,0.2721543808778127,0.2737567722797394,0.2732303539911906,0.27877557277679443,0.27923040588696796,0.2798382341861725,0.2831268608570099,0.28203009565671283,0.2810969154040019,0.28292057911554974],"label":"C4"},"filtering-c4-terminal_punct":{"x":[0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2501466572284698,0.2583308666944504,0.2611347585916519,0.26333703100681305,0.2685028165578842,0.2725042402744293,0.27531248331069946,0.27463899552822113,0.2784048914909363,0.27915388345718384,0.27945026755332947,0.28207844495773315,0.281900018453598,0.2822476774454117,0.28188446164131165],"label":"terminal_punct 
filter"},"filtering-c4-all":{"x":[0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2501466572284698,0.25806266069412226,0.26165445148944855,0.26727744936943054,0.2677594721317291,0.2689383774995804,0.2724889665842056,0.27308812737464905,0.27327476441860193,0.27370570600032806,0.277080088853836,0.27814342081546783,0.2782013118267059,0.27888238430023193,0.2795541882514953],"label":"All filters"}},"layout":{"title":{"text":"C4 filtering effect on HellaSwag"}}}
dist/assets/data/plots/c4_filters_hellaswag/openbookqa_acc_norm.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"data":{"filtering-baseline-2019-18-60gt":{"x":[0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2860000133514404,0.2559999972581863,0.27699999511241913,0.288000002503395,0.2980000078678131,0.31199999153614044,0.29500000178813934,0.3139999955892563,0.31199999153614044,0.31200000643730164,0.3369999974966049,0.32899999618530273,0.3200000077486038,0.3310000002384186,0.3330000042915344],"label":"baseline"},"filtering-c4-all":{"x":[0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2860000133514404,0.2590000033378601,0.278999999165535,0.2979999929666519,0.29899999499320984,0.3270000070333481,0.32800000905990595,0.32899999618530273,0.3369999974966049,0.33200000226497645,0.3260000050067901,0.33599999547004694,0.335999995470047,0.33500000834465027,0.3330000042915344],"label":"All filters"},"filtering-c4-word_lengths":{"x":[0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2860000133514404,0.2690000087022781,0.27300000190734863,0.28599999845027924,0.28299999237060547,0.3050000071525574,0.30900000035762787,0.31199999153614044,0.3200000077486038,0.33200000226497645,0.31200000643730164,0.3230000138282776,0.32299999892711634,0.32899999618530273,0.3320000022649765],"label":"word_lengths 
filter"},"sm-baseline-c4":{"x":[0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2860000133514404,0.2526666720708211,0.26533332467079157,0.26600000262260437,0.29333333174387616,0.3059999942779541,0.30933333436648053,0.31600000460942584,0.31466667850812274,0.32933333516120905,0.3346666693687439,0.3366666634877522,0.3386666675408681,0.33799999952316284,0.33066666126251215],"label":"C4"},"filtering-c4-terminal_punct":{"x":[0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2860000133514404,0.26900000870227814,0.27400000393390656,0.2929999977350235,0.29600000381469727,0.306999996304512,0.3199999928474426,0.3190000057220459,0.31299999356269836,0.3229999989271164,0.3210000097751617,0.3270000070333481,0.3230000138282776,0.33399999141693115,0.3260000050067901],"label":"terminal_punct filter"},"filtering-c4-curly_bracket":{"x":[0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2860000133514404,0.25800000131130213,0.29899999499320984,0.27900001406669617,0.296999990940094,0.2980000078678131,0.3149999976158142,0.3179999887943268,0.32500000298023224,0.3079999983310699,0.32900001108646393,0.32599999010562897,0.3190000057220459,0.3279999941587448,0.3229999989271164],"label":"curly_bracket 
filter"},"filtering-c4-all-except-terminal_punct":{"x":[0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2860000133514404,0.2500000074505806,0.2759999930858612,0.2800000011920929,0.29099999368190765,0.3070000112056732,0.3070000112056732,0.3229999989271164,0.3240000009536743,0.31700000166893005,0.3100000023841858,0.31300000846385956,0.31700000166893005,0.3100000023841858,0.3189999908208847],"label":"All filters except terminal_punct"}},"layout":{"title":{"text":"C4 filtering effect on HellaSwag"}}}
dist/assets/data/plots/c4_filters_hellaswag/piqa_acc_norm.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"data":{"sm-baseline-c4":{"x":[0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.5099999904632568,0.6196666558583578,0.6583333412806193,0.6833333373069763,0.6829999883969625,0.6983333230018616,0.702999989191691,0.7056666612625122,0.7076666553815206,0.7139999866485596,0.7209999958674113,0.7179999947547913,0.7273333470026652,0.7209999958674113,0.7273333271344503],"label":"C4"},"filtering-c4-all-except-terminal_punct":{"x":[0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.5099999904632568,0.621999979019165,0.6520000100135803,0.6800000071525574,0.6895000040531158,0.6949999928474426,0.6990000009536743,0.7045000195503235,0.7114999890327454,0.710999995470047,0.7159999907016754,0.7199999988079071,0.7199999988079071,0.7204999923706055,0.7254999876022339],"label":"All filters except terminal_punct"},"filtering-c4-curly_bracket":{"x":[0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.5099999904632568,0.6149999797344208,0.6520000100135803,0.6789999902248383,0.69200000166893,0.6949999928474426,0.6955000162124634,0.7055000066757202,0.7150000035762787,0.7169999778270721,0.7184999883174896,0.7235000133514404,0.7240000069141388,0.723499983549118,0.7249999940395355],"label":"curly_bracket 
filter"},"filtering-c4-terminal_punct":{"x":[0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.5099999904632568,0.621999979019165,0.6549999713897705,0.6695000231266022,0.6860000193119049,0.6994999945163727,0.6980000138282776,0.7084999978542328,0.7120000123977661,0.7124999761581421,0.7160000205039978,0.7179999947547913,0.7195000052452087,0.7229999899864197,0.723499983549118],"label":"terminal_punct filter"},"filtering-c4-all":{"x":[0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.5099999904632568,0.6215000152587891,0.6580000221729279,0.6784999966621399,0.69200000166893,0.703499972820282,0.7029999792575836,0.710999995470047,0.7139999866485596,0.7179999947547913,0.7150000035762787,0.715499997138977,0.7184999883174896,0.7160000205039978,0.7224999964237213],"label":"All filters"},"filtering-c4-word_lengths":{"x":[0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.5099999904632568,0.6229999959468842,0.6590000092983246,0.6714999973773956,0.6820000112056732,0.6949999928474426,0.6940000057220459,0.7064999938011169,0.7005000114440918,0.6989999711513519,0.7084999978542328,0.7060000002384186,0.7099999785423279,0.7160000205039978,0.7150000035762787],"label":"word_lengths 
filter"},"filtering-baseline-2019-18-60gt":{"x":[0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.5099999904632568,0.6105000078678131,0.6350000202655792,0.6620000004768372,0.675000011920929,0.6940000057220459,0.6974999904632568,0.7054999768733978,0.7060000002384186,0.7059999704360962,0.7084999978542328,0.7060000002384186,0.7084999978542328,0.7144999802112579,0.7134999930858612],"label":"baseline"}},"layout":{"title":{"text":"C4 filtering effect on HellaSwag"}}}
dist/assets/data/plots/c4_filters_hellaswag/siqa_acc_norm.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"data":{"sm-baseline-c4":{"x":[0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.3619999885559082,0.39633333683013916,0.3893333276112874,0.3933333357175191,0.39800000190734863,0.4013333320617676,0.4010000030199687,0.4059999982515971,0.41100000341733295,0.4063333372275035,0.40433333317438763,0.404666672150294,0.3993333379427592,0.4053333302338918,0.40800000230471295],"label":"C4"},"filtering-c4-terminal_punct":{"x":[0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.3619999885559082,0.4010000079870224,0.3974999934434891,0.39499999582767487,0.403999999165535,0.40299999713897705,0.4095000028610229,0.4074999988079071,0.4065000116825104,0.4074999988079071,0.4050000011920929,0.3999999910593033,0.40700000524520874,0.4050000011920929,0.40799999237060547],"label":"terminal_punct filter"},"filtering-c4-all":{"x":[0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.3619999885559082,0.3955000042915344,0.40049999952316284,0.3945000022649765,0.40700000524520874,0.4010000079870224,0.4025000035762787,0.39650000631809235,0.4004999995231628,0.4020000100135803,0.40150000154972076,0.40950000286102295,0.4080000072717666,0.40600000321865076,0.40750001370906824],"label":"All 
filters"},"filtering-c4-all-except-terminal_punct":{"x":[0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.3619999885559082,0.3994999974966049,0.392999991774559,0.40350000560283655,0.3939999938011169,0.39650000631809235,0.39450000226497645,0.4025000035762787,0.39900000393390656,0.3985000103712082,0.4030000120401382,0.3969999998807907,0.40150000154972076,0.40049999952316284,0.4025000035762787],"label":"All filters except terminal_punct"},"filtering-c4-curly_bracket":{"x":[0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.3619999885559082,0.4009999930858612,0.4024999886751175,0.3935000002384186,0.3904999941587448,0.4040000140666961,0.4035000056028366,0.40449999272823334,0.4079999923706054,0.40049999952316284,0.3985000103712082,0.39750000834465027,0.39799998700618744,0.3995000123977661,0.39699999988079065],"label":"curly_bracket filter"},"filtering-c4-word_lengths":{"x":[0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.3619999885559082,0.40150000154972076,0.39549998939037323,0.3969999998807907,0.3974999934434891,0.3959999978542328,0.39750000834465027,0.39549998939037323,0.3895000070333481,0.3994999974966049,0.3980000019073486,0.4000000059604645,0.39100000262260437,0.39250001311302185,0.39499999582767487],"label":"word_lengths 
filter"},"filtering-baseline-2019-18-60gt":{"x":[0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.36149999499320984,0.39800000190734863,0.3970000147819519,0.4000000059604645,0.39799998700618744,0.408500000834465,0.39400000870227814,0.392999991774559,0.40450000762939453,0.4070000052452087,0.39950001239776606,0.3994999974966049,0.3949999958276748,0.398499995470047,0.3920000046491623],"label":"baseline"}},"layout":{"title":{"text":"C4 filtering effect on HellaSwag"}}}
dist/assets/data/plots/c4_filters_hellaswag/winogrande_acc_norm.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"data":{"filtering-c4-terminal_punct":{"x":[0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.4970000088214874,0.49050000309944153,0.48900000751018524,0.5080000162124634,0.50450000166893,0.5185000002384186,0.5175000131130219,0.5099999904632568,0.526500016450882,0.5320000052452087,0.5230000019073486,0.5105000138282776,0.5214999914169312,0.523499995470047,0.5264999866485596],"label":"terminal_punct filter"},"filtering-c4-curly_bracket":{"x":[0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.4970000088214874,0.48350000381469727,0.5024999976158142,0.5039999932050705,0.5049999952316284,0.5115000009536743,0.50450000166893,0.5120000243186951,0.5144999921321869,0.5194999873638153,0.5250000059604645,0.5170000195503235,0.5180000066757202,0.527999997138977,0.5259999930858612],"label":"curly_bracket 
filter"},"sm-baseline-c4":{"x":[0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.4970000088214874,0.4933333396911621,0.48733333746592206,0.5056666731834412,0.5066666503747305,0.5116666754086813,0.5076666871706644,0.5213333169619242,0.5150000055631002,0.5183333357175192,0.5169999996821085,0.515333334604899,0.5193333427111307,0.5143333276112875,0.5196666717529297],"label":"C4"},"filtering-c4-all-except-terminal_punct":{"x":[0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.4970000088214874,0.49150000512599945,0.49900001287460327,0.49300000071525574,0.5015000104904175,0.5094999969005585,0.5109999775886536,0.5085000097751617,0.507500022649765,0.5205000042915344,0.5125000178813934,0.5160000026226044,0.5175000131130219,0.5150000154972076,0.5179999768733978],"label":"All filters except terminal_punct"},"filtering-c4-word_lengths":{"x":[0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.4970000088214874,0.49050000309944153,0.49000000953674316,0.4999999850988388,0.4989999830722809,0.5115000009536743,0.5105000138282776,0.5069999992847443,0.5109999775886536,0.5164999961853027,0.5059999823570251,0.5129999816417694,0.5059999823570251,0.5115000009536743,0.5164999961853027],"label":"word_lengths 
filter"},"filtering-c4-all":{"x":[0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.4970000088214874,0.4884999990463257,0.4989999979734421,0.5064999908208847,0.49800001084804535,0.5040000081062317,0.5139999985694885,0.5160000026226044,0.5109999775886536,0.5070000141859055,0.5115000009536743,0.5105000138282776,0.5175000131130219,0.5200000107288361,0.5135000050067902],"label":"All filters"},"filtering-baseline-2019-18-60gt":{"x":[0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.4970000088214874,0.48950000107288355,0.48950000107288355,0.5049999952316284,0.5125000178813934,0.5004999935626984,0.5065000057220459,0.5055000185966492,0.511000007390976,0.5160000026226044,0.5209999978542328,0.5270000100135803,0.5219999849796295,0.5149999856948853,0.5125000178813934],"label":"baseline"}},"layout":{"title":{"text":"C4 filtering effect on HellaSwag"}}}
dist/assets/data/plots/cross_ind_unfiltered_comparison/agg_score.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"data":{"big-run-refinedweb":{"x":[0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002,29.360128000000003,31.45728,33.554432000000006,35.651584,37.748736,39.845888,41.94304,44.040192000000005,46.137344000000006,48.234496,50.331648,52.4288,54.525952000000004,56.623104000000005,58.720256000000006,60.817408,62.91456,65.011712,67.10886400000001,69.206016,71.303168,73.40032000000001,75.497472,77.59462400000001,79.691776,81.788928,83.88608,85.983232,88.08038400000001,90.177536,92.27468800000001,94.37184,96.468992,98.56614400000001,100.663296,102.76044800000001,104.8576,106.95475200000001,109.05190400000001,111.149056,113.24620800000001,115.34336,117.44051200000001,119.537664,121.634816,123.73196800000001,125.82912,127.92627200000001,130.023424,132.120576,134.21772800000002,136.31488000000002,138.412032,140.509184,142.606336,144.70348800000002,146.80064000000002,148.897792,150.994944,153.092096,155.18924800000002,157.28640000000001,159.383552,161.480704,163.577856,165.67500800000002,167.77216,169.869312,171.966464,174.06361600000002,176.16076800000002,178.25792,180.355072,182.452224,184.54937600000002,186.64652800000002,188.74368,190.840832,192.937984,195.03513600000002,197.13228800000002,199.22944,201.326592,203.423744,205.52089600000002,207.61804800000002,209.7152,211.812352,213.90950400000003,216.00665600000002,218.10380800000001,220.20096,222.298112,224.39526400000003,226.49241600000002,228.589568,230.68672,232.783872,234.88102400000002,236.97817600000002,239.075328,241.17248,243.269632,245.36678400000002,247.46393600000002,249.561088,251.65824,253.75539200000003,255.85254400000002,257.949696,260.046848,262.144,264.241152,266.338304,268.43545600000004,270.53260800000004,272.62976000000003,274.726912,276.824064,278.921216,281.018368,283.11552,285.212672,287.309824,289.40697600000004,291.50412800000004,293.6012800
0000003,295.698432,297.795584,299.892736,301.989888,304.08704,306.184192,308.28134400000005,310.37849600000004,312.47564800000004,314.57280000000003,316.669952,318.767104,320.864256,322.961408,325.05856,327.155712,329.25286400000005,331.35001600000004,333.44716800000003,335.54432,337.641472,339.738624,341.835776,343.932928,346.03008,348.12723200000005,350.22438400000004],"y":[0.3308933284133672,0.3534814938902855,0.3764607086777687,0.38782499730587,0.3981050960719585,0.4028486795723438,0.4125883243978023,0.4117814563214779,0.414029736071825,0.4197172522544861,0.4211113378405571,0.4279881417751312,0.4280137903988361,0.4280424378812313,0.4291964024305343,0.4326301179826259,0.4371833503246307,0.4346669465303421,0.4336562640964985,0.4432648755609989,0.4401291646063328,0.4394684173166752,0.4476612061262131,0.4465444348752498,0.4472153298556804,0.4433343075215816,0.4510187618434429,0.4459567815065384,0.4460812956094742,0.4498684890568256,0.4529943652451038,0.4528274349868297,0.4551213420927524,0.4549156539142132,0.4564928151667118,0.4576693661510944,0.4557182416319847,0.4536240361630916,0.457439012825489,0.4570476822555065,0.4589823484420776,0.462024375796318,0.4540738053619861,0.4550252184271812,0.4576593860983848,0.4573238864541054,0.4575810581445694,0.4622134491801262,0.4592566937208175,0.4614734016358852,0.4637473002076149,0.4625372551381588,0.4613912180066108,0.4597448222339153,0.4594792164862156,0.4662549719214439,0.4634026065468788,0.4633508697152138,0.4635734222829342,0.4628961533308029,0.4670135043561458,0.4639505892992019,0.4631133340299129,0.4665167145431041,0.4672448337078094,0.4693268723785877,0.4630668573081493,0.4676454700529575,0.4646359197795391,0.4621579721570015,0.4692446552217006,0.4704835228621959,0.4663223996758461,0.4680556617677212,0.466339822858572,0.4682099223136902,0.4711195565760135,0.4722655527293682,0.4727961830794811,0.4676857478916645,0.4719390422105789,0.4713102728128433,0.4712141714990139,0.4721613004803657,0.4713456854224205,0.4682970903
813839,0.4679934531450271,0.4685162976384163,0.4679946713149547,0.4681242071092129,0.4702276065945625,0.472664151340723,0.4730790853500366,0.4731674715876579,0.4718914777040481,0.4719801284372806,0.4761029370129108,0.4735167175531387,0.4730370938777923,0.4730173237621784,0.4735377207398414,0.4777223989367485,0.4796326830983162,0.4734170883893966,0.4739485755562782,0.4748299159109592,0.4765299335122108,0.4745025858283043,0.4754423759877682,0.4784592799842357,0.4761341325938701,0.4760282784700393,0.4769757278263569,0.47154351323843,0.4786738082766533,0.4804279990494251,0.4777076803147793,0.4798569902777672,0.4759011939167976,0.4784621745347976,0.479673832654953,0.4780617095530033,0.48076206818223,0.47995800152421,0.4790860973298549,0.4817167408764362,0.4811586998403072,0.482547752559185,0.4816697351634502,0.4809327870607376,0.4816545359790325,0.4804601892828941,0.4776877984404564,0.4813711903989315,0.4844604581594467,0.4819537848234176,0.4820829331874847,0.4778126627206802,0.482935007661581,0.48230691999197,0.4826001971960068,0.4823969900608063,0.4811219945549965,0.4789146520197391,0.484035175293684,0.4848698377609253,0.4855728335678577,0.4825376532971859,0.485215101391077,0.4824351668357849,0.4835342466831207,0.4822137206792831,0.4838785007596016,0.4837255179882049,0.4853012599050998,0.4857851006090641,0.4863366298377514,0.4856646582484245,0.4842503517866134,0.4838776960968971,0.4846346862614155,0.4837041422724724,0.4813097268342972,0.4873070046305656,0.4841253720223903,0.4837464913725853,0.483069509267807,0.4851242564618587,0.4861010462045669],"label":"RefinedWeb"},"big-run-fineweb-cross-dedup-fixed":{"x":[0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,29.360128000000003,31.45728,33.554432000000006,35.651584,37.748736,39.845888,41.94304,44.040192000000005,46.137344000000006,48.234496,50.331648,52.4288,54.525952000000004,56.62
3104000000005,58.720256000000006,60.817408,62.91456,65.011712,67.10886400000001,69.206016,71.303168,73.40032000000001,75.497472,77.59462400000001,79.691776,81.788928,83.88608,85.983232,88.08038400000001,90.177536,92.27468800000001,94.37184,96.468992,98.56614400000001,100.663296,102.76044800000001,104.8576,106.95475200000001,109.05190400000001,111.149056,113.24620800000001,115.34336,117.44051200000001,119.537664,121.634816,123.73196800000001,125.82912,127.92627200000001,130.023424,132.120576,134.21772800000002,136.31488000000002,138.412032,140.509184,142.606336,144.70348800000002,146.80064000000002,148.897792,150.994944,153.092096,155.18924800000002,157.28640000000001,159.383552,161.480704,163.577856,165.67500800000002,167.77216,169.869312,171.966464,174.06361600000002,176.16076800000002,178.25792,180.355072,182.452224,184.54937600000002,186.64652800000002,188.74368,190.840832,192.937984,195.03513600000002,197.13228800000002,199.22944,201.326592,203.423744,205.52089600000002,207.61804800000002,209.7152,211.812352,213.90950400000003,216.00665600000002,218.10380800000001,220.20096,222.298112,224.39526400000003,226.49241600000002,228.589568,230.68672,232.783872,234.88102400000002,236.97817600000002,239.075328,241.17248,243.269632,245.36678400000002,247.46393600000002,249.561088,251.65824,253.75539200000003,255.85254400000002,257.949696,260.046848,262.144,264.241152,266.338304,268.43545600000004,270.53260800000004,272.62976000000003,274.726912,276.824064,278.921216,281.018368,283.11552,285.212672,287.309824,289.40697600000004,291.50412800000004,293.60128000000003,295.698432,297.795584,299.892736,301.989888,304.08704,306.184192,308.28134400000005,310.37849600000004,312.47564800000004,314.57280000000003,316.669952,318.767104,320.864256,322.961408,325.05856,327.155712,329.25286400000005,331.35001600000004,333.44716800000003,335.54432,337.641472,339.738624,341.835776,343.932928,346.03008,348.12723200000005,350.22438400000004],"y":[0.3308933284133672,0.3551952373236418,0.3736
435137689113,0.3814037963747978,0.3948809280991554,0.3996850810945034,0.4089604057371616,0.4100853353738785,0.4119834117591381,0.4168377220630646,0.4186493046581745,0.4169826358556747,0.4234288297593593,0.4229162000119686,0.4273439794778824,0.4290364980697632,0.4291782416403293,0.4296907968819141,0.4311576783657074,0.4326641112565994,0.430318683385849,0.430436260998249,0.4339037239551544,0.4363459683954716,0.4357402548193931,0.4342963136732578,0.4366712383925915,0.4363959729671478,0.436981026083231,0.4447868093848228,0.4411709941923618,0.4406092017889023,0.4424176625907421,0.4423875361680984,0.4422253370285034,0.4410557933151722,0.4447037056088447,0.4454837813973427,0.4435960277915001,0.4468514993786812,0.4479999616742134,0.4428562931716442,0.445764634758234,0.4456562362611294,0.4488007053732872,0.4475954286754131,0.4468922987580299,0.4548408314585686,0.4511027485132217,0.4530330970883369,0.4483681954443455,0.4531726539134979,0.45334542542696,0.4544384703040123,0.4530758671462536,0.4540613554418087,0.4510113634169101,0.4538320265710354,0.4518541917204857,0.4536847211420536,0.4532708041369915,0.4552236869931221,0.455034039914608,0.4562875479459762,0.4532428197562694,0.4574853852391243,0.4517738744616508,0.4579889141023159,0.4538268558681011,0.456730306148529,0.4526018649339676,0.4562746733427048,0.4560015797615051,0.4555426277220249,0.4561501257121563,0.4524396173655987,0.4557023830711841,0.4589769169688225,0.4581078588962555,0.4620813727378845,0.4586601965129375,0.4568093195557594,0.4569808952510357,0.4567535072565079,0.4575250148773193,0.4606908001005649,0.4603964723646641,0.4622848592698574,0.4594669193029403,0.4640629850327968,0.4604269936680794,0.4634841009974479,0.4644578285515308,0.4642514958977699,0.4666304066777229,0.4616626128554344,0.4588956907391548,0.4620226770639419,0.4628621749579906,0.4595407098531723,0.4635516740381717,0.46005355194211,0.4601523540914058,0.4644204638898372,0.4620639197528362,0.46614545956254,0.4636696502566337,0.4610077403485775,0.46
40897810459137,0.4636163525283336,0.4630545899271965,0.466012816876173,0.4650349207222461,0.4613720141351223,0.4644323363900184,0.4647249802947044,0.4656480401754379,0.4651664271950722,0.4622530452907085,0.4655019529163837,0.4650313258171081,0.466718140989542,0.4661559611558914,0.4661237150430679,0.4664223715662956,0.4640601389110088,0.4642657749354839,0.4633881188929081,0.4629989042878151,0.4685831367969513,0.4675870984792709,0.467183344066143,0.4678030684590339,0.4660939238965511,0.4691914953291416,0.4670972637832165,0.468262892216444,0.4672016054391861,0.4676182121038437,0.4698677137494087,0.4658828042447567,0.4701816700398922,0.4684622809290886,0.466015312820673,0.4675401039421558,0.4693200923502445,0.4702670983970165,0.4679145030677318,0.4676233418285846,0.4674933589994907,0.4678357951343059,0.4669915996491909,0.4657857678830623,0.4666901864111423,0.4669371582567692,0.4672787226736545,0.4684535376727581,0.4685697965323925,0.4694835692644119,0.4683254994451999,0.4712230190634727,0.4683987610042095,0.4707653746008873,0.4663059376180172,0.4683133698999882,0.4686385430395603,0.4657671600580215,0.4692615270614624],"label":"FineWeb full 
MinHash"},"big-run-sampled_full_filtered_no_dedup":{"x":[0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002,29.360128000000003,31.45728,33.554432000000006,35.651584,37.748736,39.845888,41.94304,44.040192000000005,46.137344000000006,48.234496,50.331648,52.4288,54.525952000000004,56.623104000000005,58.720256000000006,60.817408,62.91456,65.011712,67.10886400000001,69.206016,71.303168,73.40032000000001,75.497472,77.59462400000001,79.691776,81.788928,83.88608,85.983232,88.08038400000001,90.177536,92.27468800000001,94.37184,96.468992,98.56614400000001,100.663296,102.76044800000001,104.8576,106.95475200000001,109.05190400000001,111.149056,113.24620800000001,115.34336,117.44051200000001,119.537664,121.634816,123.73196800000001,125.82912,127.92627200000001,130.023424,132.120576,134.21772800000002,136.31488000000002,138.412032,140.509184,142.606336,144.70348800000002,146.80064000000002,148.897792,150.994944,153.092096,155.18924800000002,157.28640000000001,159.383552,161.480704,163.577856,165.67500800000002,167.77216,169.869312,171.966464,174.06361600000002,176.16076800000002,178.25792,180.355072,182.452224,184.54937600000002,186.64652800000002,188.74368,190.840832,192.937984,195.03513600000002,197.13228800000002,199.22944,201.326592,203.423744,205.52089600000002,207.61804800000002,209.7152,211.812352,213.90950400000003,216.00665600000002,218.10380800000001,220.20096,222.298112,224.39526400000003,226.49241600000002,228.589568,230.68672,232.783872,234.88102400000002,236.97817600000002,239.075328,241.17248,243.269632,245.36678400000002,247.46393600000002,249.561088,251.65824,253.75539200000003,255.85254400000002,257.949696,260.046848,262.144,264.241152,266.338304,268.43545600000004,270.53260800000004,272.62976000000003,274.726912,276.824064,278.921216,281.018368,283.11552,285.212672,287.309824,289.40697600000004,291.5041280
0000004,293.60128000000003,295.698432,297.795584,299.892736,301.989888,304.08704,306.184192,308.28134400000005,310.37849600000004,312.47564800000004,314.57280000000003,316.669952,318.767104,320.864256,322.961408,325.05856,327.155712,329.25286400000005,331.35001600000004,333.44716800000003,335.54432,337.641472,339.738624,341.835776,343.932928,346.03008,348.12723200000005,350.22438400000004],"y":[0.3308933284133672,0.3605199865996837,0.3733148723840713,0.3882005847990513,0.3934122696518898,0.3947227671742439,0.4042885974049568,0.3974800482392311,0.4055779427289963,0.4133470430970192,0.4117913842201233,0.4113653488457203,0.4149517640471458,0.4187851920723915,0.4252083078026771,0.4206527359783649,0.4240428246557712,0.422003373503685,0.4280910938978195,0.4244147576391697,0.4316282644867897,0.4295645765960216,0.4310102686285972,0.4360743537545204,0.4313482865691185,0.4350991360843181,0.4378576353192329,0.4335876516997814,0.4347924515604973,0.4348904751241207,0.436600212007761,0.430036511272192,0.4350974671542644,0.4399556629359722,0.4371416717767715,0.4363861419260502,0.4376698136329651,0.4405004419386387,0.4373639523983001,0.4379038028419018,0.4371281825006008,0.4393439553678036,0.440426729619503,0.4401675276458263,0.4429537951946258,0.4449137263000011,0.4434786736965179,0.4450470842421055,0.4454202279448509,0.4394537284970283,0.442185215651989,0.4461225643754005,0.4427758157253265,0.4430646039545536,0.4476901069283485,0.4478763341903686,0.4493869319558143,0.4448477327823639,0.450044184923172,0.4498609118163585,0.4457665979862213,0.4506924152374267,0.449855338782072,0.448790930211544,0.4474099352955818,0.4546772800385952,0.4529431238770485,0.452015146613121,0.4502020999789238,0.4493804536759853,0.4523266032338142,0.4551868587732315,0.4501944817602634,0.4493303671479225,0.4526805207133293,0.4533850513398647,0.4518048763275146,0.4518973492085933,0.4531301632523536,0.4518006071448326,0.4553494565188885,0.4528752230107784,0.4536322727799415,0.4561733976006508,0.4549491256475
448,0.4574789106845855,0.4577847123146057,0.4563642293214798,0.4578686729073524,0.4561499990522861,0.4537816494703293,0.4542164430022239,0.4559455662965774,0.4554723873734474,0.4575514122843742,0.4575202167034149,0.4592722058296203,0.4585275091230869,0.4580587856471538,0.456934317946434,0.4577495418488979,0.4540119916200638,0.4570806957781315,0.4608120545744896,0.4588425755500793,0.4578334167599678,0.4610816091299057,0.4598177038133144,0.461849745362997,0.4631866924464702,0.4601576402783394,0.4646804705262184,0.4632389545440674,0.4604574106633663,0.4602976888418197,0.4581312239170074,0.4654182009398937,0.4655338563024997,0.4616620391607284,0.461054053157568,0.4613021649420261,0.4658613465726375,0.4633531905710697,0.4613638147711754,0.4643996246159076,0.462500050663948,0.4650798961520195,0.4648764543235302,0.4639869071543216,0.4634246975183487,0.46585888043046,0.4639799632132053,0.4630857892334461,0.4644265696406364,0.4642998576164245,0.4686848931014538,0.4687492996454239,0.4650243632495403,0.4627032242715359,0.4665953740477562,0.4660026729106903,0.4664581045508384,0.4676475040614605,0.4657339677214622,0.4664678275585174,0.4673498086631298,0.4676674827933311,0.4680955372750759,0.4681585058569908,0.4659864418208599,0.4686457589268684,0.4661462865769863,0.4658931568264961,0.4674226939678192,0.46805215254426,0.4682257212698459,0.4689070098102093,0.4699570722877979,0.4655096270143986,0.4688013233244419,0.4707522802054882,0.4661469310522079,0.4688841328024864,0.4671329781413078,0.4662554152309894,0.4697433896362781,0.4698473587632179,0.4676505327224731,0.4696521013975143],"label":"FineWeb filtered 
only"},"big-run-sampled_full_ind_minhash":{"x":[0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002,29.360128000000003,31.45728,33.554432000000006,35.651584,37.748736,39.845888,41.94304,44.040192000000005,46.137344000000006,48.234496,50.331648,52.4288,54.525952000000004,56.623104000000005,58.720256000000006,60.817408,62.91456,65.011712,67.10886400000001,69.206016,71.303168,73.40032000000001,75.497472,77.59462400000001,79.691776,81.788928,83.88608,85.983232,88.08038400000001,90.177536,92.27468800000001,94.37184,96.468992,98.56614400000001,100.663296,102.76044800000001,104.8576,106.95475200000001,109.05190400000001,111.149056,113.24620800000001,115.34336,117.44051200000001,119.537664,121.634816,123.73196800000001,125.82912,127.92627200000001,130.023424,132.120576,134.21772800000002,136.31488000000002,138.412032,140.509184,142.606336,144.70348800000002,146.80064000000002,148.897792,150.994944,153.092096,155.18924800000002,157.28640000000001,159.383552,161.480704,163.577856,165.67500800000002,167.77216,169.869312,171.966464,174.06361600000002,176.16076800000002,178.25792,180.355072,182.452224,184.54937600000002,186.64652800000002,188.74368,190.840832,192.937984,195.03513600000002,197.13228800000002,199.22944,201.326592,203.423744,205.52089600000002,207.61804800000002,209.7152,211.812352,213.90950400000003,216.00665600000002,218.10380800000001,220.20096,222.298112,224.39526400000003,226.49241600000002,228.589568,230.68672,232.783872,234.88102400000002,236.97817600000002,239.075328,241.17248,243.269632,245.36678400000002,247.46393600000002,249.561088,251.65824,253.75539200000003,255.85254400000002,257.949696,260.046848,262.144,264.241152,266.338304,268.43545600000004,270.53260800000004,272.62976000000003,274.726912,276.824064,278.921216,281.018368,283.11552,285.212672,287.309824,289.40697600000004,291.50412800000004,2
93.60128000000003,295.698432,297.795584,299.892736,301.989888,304.08704,306.184192,308.28134400000005,310.37849600000004,312.47564800000004,314.57280000000003,316.669952,318.767104,320.864256,322.961408,325.05856,327.155712,329.25286400000005,331.35001600000004,333.44716800000003,335.54432,337.641472,339.738624,341.835776,343.932928,346.03008,348.12723200000005,350.22438400000004],"y":[0.3308933284133672,0.3608616776764393,0.3745453506708145,0.3862277194857597,0.3989979773759842,0.406296543776989,0.4094927236437797,0.4138859286904335,0.4177777022123337,0.4208802655339241,0.4254550077021122,0.4283009432256222,0.429458349943161,0.4330311268568039,0.4303463362157345,0.4349483698606491,0.4348161295056343,0.438955657184124,0.4389265701174736,0.4393925778567791,0.4383306242525577,0.4436748661100864,0.4423373565077781,0.4460027255117893,0.4440812170505523,0.4476902261376381,0.4465879611670971,0.4497823156416416,0.4513350501656532,0.4518667235970497,0.45149727165699,0.4513994492590427,0.4521937072277069,0.4520382955670357,0.4530793912708759,0.4516105614602566,0.4530563354492187,0.4495660625398159,0.4520940892398357,0.4561133235692978,0.4522969461977482,0.4575686641037464,0.4589144177734852,0.4582882039248943,0.457970168441534,0.4554797261953354,0.4622044861316681,0.4596928395330906,0.4624353349208832,0.4619148448109627,0.461100060492754,0.458431463688612,0.4620467089116573,0.4562215581536293,0.4620163068175316,0.4631462283432483,0.4600549824535846,0.4620365314185619,0.458735141903162,0.461642112582922,0.461245734244585,0.4645131677389145,0.4629777930676937,0.4651660025119781,0.4653937108814716,0.4676259346306324,0.4667201824486255,0.4650012850761413,0.4676916748285293,0.4708514772355556,0.4673572592437267,0.4689626581966877,0.4678038358688354,0.4667215310037136,0.4646228328347206,0.4662510119378567,0.4674677737057209,0.4690804108977318,0.4634581170976162,0.4701276533305645,0.4676450751721859,0.4672758504748344,0.4674397967755794,0.4656238108873367,0.4690065123140812,0.46772
13467657566,0.4678985886275768,0.4735414572060108,0.4705612398684025,0.4703374318778515,0.4704933613538742,0.4688010476529598,0.4699571952223778,0.4674785658717155,0.4701188169419765,0.4682065695524215,0.4729971997439861,0.4748715870082378,0.4745333231985569,0.4737020246684551,0.4747246317565441,0.4771635122597217,0.4740425907075405,0.475264236330986,0.4744705818593502,0.474684040993452,0.4721556939184665,0.475641455501318,0.476833701133728,0.4746401384472847,0.4742486327886581,0.4730467088520527,0.4773029200732708,0.4760043211281299,0.4770320989191532,0.4742161482572555,0.4780259765684604,0.4806670732796192,0.4784667380154133,0.4788618609309196,0.4762138128280639,0.4777246937155723,0.4796081893146038,0.4798486456274986,0.475479181855917,0.4779988899827003,0.4765858314931392,0.4772914499044418,0.47843898832798,0.4799034222960472,0.4803600236773491,0.4751846008002758,0.4777872562408447,0.4779460839927196,0.4787487275898456,0.4808406494557857,0.4810357913374901,0.4797308407723903,0.4800078608095646,0.4806460626423359,0.4810502976179123,0.4797912389039993,0.477332629263401,0.4818884879350662,0.482621606439352,0.4833096489310264,0.4821632876992225,0.4831674285233021,0.4830279909074306,0.4849893450736999,0.4845218025147915,0.4825541749596596,0.4833571836352348,0.4853803217411041,0.483093187212944,0.4850797094404697,0.485261783003807,0.4837660938501358,0.4835929833352566,0.4855643883347511,0.4832059442996979,0.484714712947607,0.4839249886572361,0.4829078912734985,0.4818423055112362,0.482727088034153,0.4824129492044449,0.4820138849318027,0.4865870922803879],"label":"FineWeb independent MinHash"}},"layout":{"xaxis":{"title":{"text":"Training tokens (billions)"}},"yaxis":{"title":{"text":"Agg Score"}},"title":{"text":"Independent dedup outperforms dedup across dumps"}}}
dist/assets/data/plots/cross_ind_unfiltered_comparison/commonsense_qa_acc_norm.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"data":{"big-run-refinedweb":{"x":[0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002,29.360128000000003,31.45728,33.554432000000006,35.651584,37.748736,39.845888,41.94304,44.040192000000005,46.137344000000006,48.234496,50.331648,52.4288,54.525952000000004,56.623104000000005,58.720256000000006,60.817408,62.91456,65.011712,67.10886400000001,69.206016,71.303168,73.40032000000001,75.497472,77.59462400000001,79.691776,81.788928,83.88608,85.983232,88.08038400000001,90.177536,92.27468800000001,94.37184,96.468992,98.56614400000001,100.663296,102.76044800000001,104.8576,106.95475200000001,109.05190400000001,111.149056,113.24620800000001,115.34336,117.44051200000001,119.537664,121.634816,123.73196800000001,125.82912,127.92627200000001,130.023424,132.120576,134.21772800000002,136.31488000000002,138.412032,140.509184,142.606336,144.70348800000002,146.80064000000002,148.897792,150.994944,153.092096,155.18924800000002,157.28640000000001,159.383552,161.480704,163.577856,165.67500800000002,167.77216,169.869312,171.966464,174.06361600000002,176.16076800000002,178.25792,180.355072,182.452224,184.54937600000002,186.64652800000002,188.74368,190.840832,192.937984,195.03513600000002,197.13228800000002,199.22944,201.326592,203.423744,205.52089600000002,207.61804800000002,209.7152,211.812352,213.90950400000003,216.00665600000002,218.10380800000001,220.20096,222.298112,224.39526400000003,226.49241600000002,228.589568,230.68672,232.783872,234.88102400000002,236.97817600000002,239.075328,241.17248,243.269632,245.36678400000002,247.46393600000002,249.561088,251.65824,253.75539200000003,255.85254400000002,257.949696,260.046848,262.144,264.241152,266.338304,268.43545600000004,270.53260800000004,272.62976000000003,274.726912,276.824064,278.921216,281.018368,283.11552,285.212672,287.309824,289.40697600000004,291.50412800000004,293.6012800
0000003,295.698432,297.795584,299.892736,301.989888,304.08704,306.184192,308.28134400000005,310.37849600000004,312.47564800000004,314.57280000000003,316.669952,318.767104,320.864256,322.961408,325.05856,327.155712,329.25286400000005,331.35001600000004,333.44716800000003,335.54432,337.641472,339.738624,341.835776,343.932928,346.03008,348.12723200000005,350.22438400000004],"y":[0.2329999953508377,0.2529999911785126,0.2800000011920929,0.2870000004768371,0.3179999887943268,0.3129999935626983,0.3210000097751617,0.3160000145435333,0.3210000097751617,0.31700000166893,0.3330000042915344,0.3389999866485595,0.3289999961853027,0.3429999947547912,0.3370000123977661,0.3379999995231628,0.3459999859333038,0.3490000069141388,0.3470000028610229,0.3600000143051147,0.3569999933242798,0.3449999988079071,0.3650000095367431,0.3499999940395355,0.3540000021457672,0.3569999933242798,0.3619999885559082,0.3619999885559082,0.3580000102519989,0.3740000128746032,0.3709999918937683,0.3720000088214874,0.3759999871253967,0.3720000088214874,0.3659999966621399,0.3790000081062317,0.3610000014305115,0.3650000095367431,0.3650000095367431,0.3720000088214874,0.3729999959468841,0.3790000081062317,0.3680000007152557,0.3659999966621399,0.3680000007152557,0.3619999885559082,0.3619999885559082,0.3729999959468841,0.3720000088214874,0.3650000095367431,0.3759999871253967,0.367000013589859,0.3650000095367431,0.3680000007152557,0.3580000102519989,0.3589999973773956,0.3700000047683716,0.3680000007152557,0.367000013589859,0.3709999918937683,0.3880000114440918,0.3810000121593475,0.375,0.4040000140666961,0.3860000073909759,0.3840000033378601,0.3779999911785126,0.3729999959468841,0.3720000088214874,0.3799999952316284,0.3799999952316284,0.3779999911785126,0.3689999878406524,0.3770000040531158,0.3740000128746032,0.3819999992847442,0.3899999856948852,0.3799999952316284,0.3919999897480011,0.3720000088214874,0.3770000040531158,0.3930000066757202,0.3849999904632568,0.3899999856948852,0.3740000128746032,0.3740000128746032,0.37
99999952316284,0.3779999911785126,0.3880000114440918,0.3709999918937683,0.3810000121593475,0.3880000114440918,0.3980000019073486,0.3819999992847442,0.3849999904632568,0.3810000121593475,0.3819999992847442,0.3889999985694885,0.3840000033378601,0.3910000026226043,0.3899999856948852,0.3959999978542328,0.3880000114440918,0.3869999945163727,0.3779999911785126,0.3819999992847442,0.3919999897480011,0.3849999904632568,0.3860000073909759,0.3919999897480011,0.3819999992847442,0.3819999992847442,0.3889999985694885,0.3889999985694885,0.3860000073909759,0.3880000114440918,0.3889999985694885,0.3939999938011169,0.3899999856948852,0.3869999945163727,0.3910000026226043,0.3910000026226043,0.3910000026226043,0.3970000147819519,0.3970000147819519,0.3970000147819519,0.3970000147819519,0.3939999938011169,0.4000000059604645,0.3970000147819519,0.402999997138977,0.3959999978542328,0.3959999978542328,0.4000000059604645,0.4040000140666961,0.4020000100135803,0.3989999890327453,0.3919999897480011,0.3930000066757202,0.3930000066757202,0.3980000019073486,0.4000000059604645,0.395000010728836,0.3899999856948852,0.4059999883174896,0.4020000100135803,0.4020000100135803,0.4059999883174896,0.3970000147819519,0.4110000133514404,0.4050000011920929,0.4000000059604645,0.4090000092983246,0.3989999890327453,0.402999997138977,0.4009999930858612,0.3980000019073486,0.4090000092983246,0.4079999923706054,0.4079999923706054,0.4020000100135803,0.402999997138977,0.402999997138977,0.4059999883174896,0.4040000140666961,0.4059999883174896,0.3989999890327453,0.4070000052452087,0.4059999883174896],"label":"RefinedWeb"},"big-run-fineweb-cross-dedup-fixed":{"x":[0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,29.360128000000003,31.45728,33.554432000000006,35.651584,37.748736,39.845888,41.94304,44.040192000000005,46.137344000000006,48.234496,50.331648,52.4288,54.525952000000004,56.623
104000000005,58.720256000000006,60.817408,62.91456,65.011712,67.10886400000001,69.206016,71.303168,73.40032000000001,75.497472,77.59462400000001,79.691776,81.788928,83.88608,85.983232,88.08038400000001,90.177536,92.27468800000001,94.37184,96.468992,98.56614400000001,100.663296,102.76044800000001,104.8576,106.95475200000001,109.05190400000001,111.149056,113.24620800000001,115.34336,117.44051200000001,119.537664,121.634816,123.73196800000001,125.82912,127.92627200000001,130.023424,132.120576,134.21772800000002,136.31488000000002,138.412032,140.509184,142.606336,144.70348800000002,146.80064000000002,148.897792,150.994944,153.092096,155.18924800000002,157.28640000000001,159.383552,161.480704,163.577856,165.67500800000002,167.77216,169.869312,171.966464,174.06361600000002,176.16076800000002,178.25792,180.355072,182.452224,184.54937600000002,186.64652800000002,188.74368,190.840832,192.937984,195.03513600000002,197.13228800000002,199.22944,201.326592,203.423744,205.52089600000002,207.61804800000002,209.7152,211.812352,213.90950400000003,216.00665600000002,218.10380800000001,220.20096,222.298112,224.39526400000003,226.49241600000002,228.589568,230.68672,232.783872,234.88102400000002,236.97817600000002,239.075328,241.17248,243.269632,245.36678400000002,247.46393600000002,249.561088,251.65824,253.75539200000003,255.85254400000002,257.949696,260.046848,262.144,264.241152,266.338304,268.43545600000004,270.53260800000004,272.62976000000003,274.726912,276.824064,278.921216,281.018368,283.11552,285.212672,287.309824,289.40697600000004,291.50412800000004,293.60128000000003,295.698432,297.795584,299.892736,301.989888,304.08704,306.184192,308.28134400000005,310.37849600000004,312.47564800000004,314.57280000000003,316.669952,318.767104,320.864256,322.961408,325.05856,327.155712,329.25286400000005,331.35001600000004,333.44716800000003,335.54432,337.641472,339.738624,341.835776,343.932928,346.03008,348.12723200000005,350.22438400000004],"y":[0.2329999953508377,0.2540000081062317,0.28700
00004768371,0.2829999923706054,0.3210000097751617,0.3079999983310699,0.3230000138282776,0.3179999887943268,0.3160000145435333,0.3289999961853027,0.3199999928474426,0.324999988079071,0.3310000002384186,0.3260000050067901,0.335999995470047,0.335999995470047,0.3310000002384186,0.335999995470047,0.3339999914169311,0.3459999859333038,0.3330000042915344,0.3449999988079071,0.3429999947547912,0.3479999899864197,0.3420000076293945,0.3479999899864197,0.3459999859333038,0.3339999914169311,0.3350000083446502,0.3519999980926513,0.3440000116825104,0.3490000069141388,0.3379999995231628,0.3420000076293945,0.3610000014305115,0.3409999907016754,0.356000006198883,0.3630000054836273,0.3519999980926513,0.3510000109672546,0.3619999885559082,0.3569999933242798,0.3479999899864197,0.3529999852180481,0.3569999933242798,0.3529999852180481,0.3519999980926513,0.3549999892711639,0.356000006198883,0.3499999940395355,0.3479999899864197,0.3619999885559082,0.3459999859333038,0.3519999980926513,0.3529999852180481,0.3680000007152557,0.3519999980926513,0.3580000102519989,0.3549999892711639,0.3490000069141388,0.3499999940395355,0.3600000143051147,0.3709999918937683,0.3659999966621399,0.3569999933242798,0.3510000109672546,0.3600000143051147,0.367000013589859,0.3529999852180481,0.363999992609024,0.3630000054836273,0.3619999885559082,0.356000006198883,0.367000013589859,0.3600000143051147,0.3540000021457672,0.3589999973773956,0.3610000014305115,0.356000006198883,0.3680000007152557,0.3519999980926513,0.3549999892711639,0.3479999899864197,0.3549999892711639,0.3519999980926513,0.367000013589859,0.3600000143051147,0.3600000143051147,0.3680000007152557,0.356000006198883,0.3610000014305115,0.3689999878406524,0.367000013589859,0.3689999878406524,0.3720000088214874,0.3680000007152557,0.3569999933242798,0.3650000095367431,0.363999992609024,0.3610000014305115,0.3709999918937683,0.3569999933242798,0.3540000021457672,0.3619999885559082,0.3549999892711639,0.3650000095367431,0.3680000007152557,0.3589999973773956,0.356000
006198883,0.3610000014305115,0.3619999885559082,0.3740000128746032,0.3700000047683716,0.3650000095367431,0.3819999992847442,0.3770000040531158,0.3810000121593475,0.3729999959468841,0.3680000007152557,0.3689999878406524,0.3740000128746032,0.3779999911785126,0.3720000088214874,0.3740000128746032,0.367000013589859,0.363999992609024,0.367000013589859,0.3689999878406524,0.3709999918937683,0.3709999918937683,0.375,0.3680000007152557,0.375,0.3630000054836273,0.3720000088214874,0.3819999992847442,0.3729999959468841,0.3689999878406524,0.363999992609024,0.3709999918937683,0.3659999966621399,0.3700000047683716,0.367000013589859,0.3709999918937683,0.3759999871253967,0.3759999871253967,0.3729999959468841,0.3729999959468841,0.3729999959468841,0.3779999911785126,0.375,0.3700000047683716,0.3659999966621399,0.3759999871253967,0.3779999911785126,0.3709999918937683,0.3840000033378601,0.3720000088214874,0.375,0.367000013589859,0.3770000040531158,0.3709999918937683,0.375,0.3709999918937683,0.3740000128746032,0.3740000128746032,0.375,0.3770000040531158],"label":"FineWeb full 
MinHash"},"big-run-sampled_full_filtered_no_dedup":{"x":[0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002,29.360128000000003,31.45728,33.554432000000006,35.651584,37.748736,39.845888,41.94304,44.040192000000005,46.137344000000006,48.234496,50.331648,52.4288,54.525952000000004,56.623104000000005,58.720256000000006,60.817408,62.91456,65.011712,67.10886400000001,69.206016,71.303168,73.40032000000001,75.497472,77.59462400000001,79.691776,81.788928,83.88608,85.983232,88.08038400000001,90.177536,92.27468800000001,94.37184,96.468992,98.56614400000001,100.663296,102.76044800000001,104.8576,106.95475200000001,109.05190400000001,111.149056,113.24620800000001,115.34336,117.44051200000001,119.537664,121.634816,123.73196800000001,125.82912,127.92627200000001,130.023424,132.120576,134.21772800000002,136.31488000000002,138.412032,140.509184,142.606336,144.70348800000002,146.80064000000002,148.897792,150.994944,153.092096,155.18924800000002,157.28640000000001,159.383552,161.480704,163.577856,165.67500800000002,167.77216,169.869312,171.966464,174.06361600000002,176.16076800000002,178.25792,180.355072,182.452224,184.54937600000002,186.64652800000002,188.74368,190.840832,192.937984,195.03513600000002,197.13228800000002,199.22944,201.326592,203.423744,205.52089600000002,207.61804800000002,209.7152,211.812352,213.90950400000003,216.00665600000002,218.10380800000001,220.20096,222.298112,224.39526400000003,226.49241600000002,228.589568,230.68672,232.783872,234.88102400000002,236.97817600000002,239.075328,241.17248,243.269632,245.36678400000002,247.46393600000002,249.561088,251.65824,253.75539200000003,255.85254400000002,257.949696,260.046848,262.144,264.241152,266.338304,268.43545600000004,270.53260800000004,272.62976000000003,274.726912,276.824064,278.921216,281.018368,283.11552,285.212672,287.309824,289.40697600000004,291.5041280
0000004,293.60128000000003,295.698432,297.795584,299.892736,301.989888,304.08704,306.184192,308.28134400000005,310.37849600000004,312.47564800000004,314.57280000000003,316.669952,318.767104,320.864256,322.961408,325.05856,327.155712,329.25286400000005,331.35001600000004,333.44716800000003,335.54432,337.641472,339.738624,341.835776,343.932928,346.03008,348.12723200000005,350.22438400000004],"y":[0.2329999953508377,0.2599999904632568,0.277999997138977,0.2910000085830688,0.3070000112056732,0.3140000104904175,0.3019999861717224,0.3059999942779541,0.3210000097751617,0.3230000138282776,0.324999988079071,0.3149999976158142,0.3109999895095825,0.3339999914169311,0.3289999961853027,0.3319999873638153,0.3319999873638153,0.3300000131130218,0.3370000123977661,0.3219999969005584,0.3370000123977661,0.328000009059906,0.3339999914169311,0.3420000076293945,0.3400000035762787,0.3440000116825104,0.3510000109672546,0.3409999907016754,0.3449999988079071,0.3339999914169311,0.3540000021457672,0.3339999914169311,0.3470000028610229,0.3470000028610229,0.3440000116825104,0.3589999973773956,0.3569999933242798,0.3630000054836273,0.3549999892711639,0.3589999973773956,0.3449999988079071,0.3549999892711639,0.3449999988079071,0.3389999866485595,0.3499999940395355,0.3610000014305115,0.3619999885559082,0.3600000143051147,0.3519999980926513,0.3479999899864197,0.356000006198883,0.3519999980926513,0.3440000116825104,0.3490000069141388,0.3519999980926513,0.3470000028610229,0.3589999973773956,0.3449999988079071,0.3490000069141388,0.356000006198883,0.3619999885559082,0.3569999933242798,0.3659999966621399,0.3610000014305115,0.3549999892711639,0.3700000047683716,0.363999992609024,0.3600000143051147,0.3580000102519989,0.3549999892711639,0.3619999885559082,0.3689999878406524,0.3630000054836273,0.363999992609024,0.3700000047683716,0.367000013589859,0.3630000054836273,0.3630000054836273,0.3700000047683716,0.3589999973773956,0.3540000021457672,0.3540000021457672,0.3659999966621399,0.3619999885559082,0.358999997377
3956,0.3650000095367431,0.3709999918937683,0.3680000007152557,0.3689999878406524,0.3650000095367431,0.3729999959468841,0.3619999885559082,0.3689999878406524,0.3569999933242798,0.3510000109672546,0.3680000007152557,0.363999992609024,0.3700000047683716,0.3659999966621399,0.3659999966621399,0.363999992609024,0.3619999885559082,0.3659999966621399,0.3680000007152557,0.3610000014305115,0.3720000088214874,0.3729999959468841,0.3810000121593475,0.3630000054836273,0.3689999878406524,0.3709999918937683,0.3759999871253967,0.382999986410141,0.3729999959468841,0.3720000088214874,0.3680000007152557,0.3659999966621399,0.3650000095367431,0.363999992609024,0.3589999973773956,0.356000006198883,0.3650000095367431,0.3659999966621399,0.367000013589859,0.3729999959468841,0.3720000088214874,0.375,0.3740000128746032,0.3700000047683716,0.3569999933242798,0.3759999871253967,0.3740000128746032,0.367000013589859,0.3770000040531158,0.3759999871253967,0.3709999918937683,0.3779999911785126,0.3709999918937683,0.3689999878406524,0.3799999952316284,0.3630000054836273,0.375,0.3700000047683716,0.3700000047683716,0.3729999959468841,0.3720000088214874,0.3790000081062317,0.375,0.3729999959468841,0.3770000040531158,0.3799999952316284,0.3779999911785126,0.3720000088214874,0.3799999952316284,0.3759999871253967,0.3799999952316284,0.3790000081062317,0.375,0.3740000128746032,0.3729999959468841,0.3840000033378601,0.3659999966621399,0.3759999871253967,0.3720000088214874,0.3720000088214874,0.3759999871253967,0.375,0.3650000095367431,0.3729999959468841],"label":"FineWeb filtered 
only"},"big-run-sampled_full_ind_minhash":{"x":[0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002,29.360128000000003,31.45728,33.554432000000006,35.651584,37.748736,39.845888,41.94304,44.040192000000005,46.137344000000006,48.234496,50.331648,52.4288,54.525952000000004,56.623104000000005,58.720256000000006,60.817408,62.91456,65.011712,67.10886400000001,69.206016,71.303168,73.40032000000001,75.497472,77.59462400000001,79.691776,81.788928,83.88608,85.983232,88.08038400000001,90.177536,92.27468800000001,94.37184,96.468992,98.56614400000001,100.663296,102.76044800000001,104.8576,106.95475200000001,109.05190400000001,111.149056,113.24620800000001,115.34336,117.44051200000001,119.537664,121.634816,123.73196800000001,125.82912,127.92627200000001,130.023424,132.120576,134.21772800000002,136.31488000000002,138.412032,140.509184,142.606336,144.70348800000002,146.80064000000002,148.897792,150.994944,153.092096,155.18924800000002,157.28640000000001,159.383552,161.480704,163.577856,165.67500800000002,167.77216,169.869312,171.966464,174.06361600000002,176.16076800000002,178.25792,180.355072,182.452224,184.54937600000002,186.64652800000002,188.74368,190.840832,192.937984,195.03513600000002,197.13228800000002,199.22944,201.326592,203.423744,205.52089600000002,207.61804800000002,209.7152,211.812352,213.90950400000003,216.00665600000002,218.10380800000001,220.20096,222.298112,224.39526400000003,226.49241600000002,228.589568,230.68672,232.783872,234.88102400000002,236.97817600000002,239.075328,241.17248,243.269632,245.36678400000002,247.46393600000002,249.561088,251.65824,253.75539200000003,255.85254400000002,257.949696,260.046848,262.144,264.241152,266.338304,268.43545600000004,270.53260800000004,272.62976000000003,274.726912,276.824064,278.921216,281.018368,283.11552,285.212672,287.309824,289.40697600000004,291.50412800000004,2
93.60128000000003,295.698432,297.795584,299.892736,301.989888,304.08704,306.184192,308.28134400000005,310.37849600000004,312.47564800000004,314.57280000000003,316.669952,318.767104,320.864256,322.961408,325.05856,327.155712,329.25286400000005,331.35001600000004,333.44716800000003,335.54432,337.641472,339.738624,341.835776,343.932928,346.03008,348.12723200000005,350.22438400000004],"y":[0.2329999953508377,0.2639999985694885,0.2790000140666961,0.296999990940094,0.3109999895095825,0.3240000009536743,0.3070000112056732,0.3210000097751617,0.31700000166893,0.3339999914169311,0.324999988079071,0.3260000050067901,0.3330000042915344,0.3409999907016754,0.3350000083446502,0.3400000035762787,0.3529999852180481,0.3400000035762787,0.3490000069141388,0.3529999852180481,0.3499999940395355,0.3459999859333038,0.3370000123977661,0.356000006198883,0.3490000069141388,0.3429999947547912,0.3490000069141388,0.3610000014305115,0.3499999940395355,0.3569999933242798,0.3610000014305115,0.3619999885559082,0.3449999988079071,0.3409999907016754,0.3420000076293945,0.3449999988079071,0.3409999907016754,0.3379999995231628,0.3420000076293945,0.3569999933242798,0.3529999852180481,0.3610000014305115,0.363999992609024,0.3600000143051147,0.3540000021457672,0.3499999940395355,0.3689999878406524,0.367000013589859,0.3569999933242798,0.3610000014305115,0.3680000007152557,0.3630000054836273,0.3709999918937683,0.3540000021457672,0.3580000102519989,0.367000013589859,0.3529999852180481,0.356000006198883,0.3569999933242798,0.3610000014305115,0.3700000047683716,0.375,0.3709999918937683,0.3819999992847442,0.3709999918937683,0.3650000095367431,0.3709999918937683,0.3650000095367431,0.3709999918937683,0.3840000033378601,0.3740000128746032,0.375,0.356000006198883,0.3689999878406524,0.3700000047683716,0.3819999992847442,0.3799999952316284,0.3779999911785126,0.3729999959468841,0.3709999918937683,0.3759999871253967,0.3709999918937683,0.3759999871253967,0.3779999911785126,0.3779999911785126,0.3689999878406524,0.38400000333
78601,0.3860000073909759,0.3849999904632568,0.3790000081062317,0.375,0.3849999904632568,0.3720000088214874,0.3770000040531158,0.3799999952316284,0.3810000121593475,0.382999986410141,0.3650000095367431,0.3740000128746032,0.382999986410141,0.3689999878406524,0.3759999871253967,0.3869999945163727,0.3889999985694885,0.3860000073909759,0.3819999992847442,0.3689999878406524,0.3860000073909759,0.3810000121593475,0.382999986410141,0.3819999992847442,0.3840000033378601,0.3889999985694885,0.3880000114440918,0.3849999904632568,0.3799999952316284,0.3910000026226043,0.3989999890327453,0.3880000114440918,0.3880000114440918,0.3840000033378601,0.3880000114440918,0.3860000073909759,0.3919999897480011,0.3880000114440918,0.3939999938011169,0.3869999945163727,0.3919999897480011,0.3910000026226043,0.382999986410141,0.3930000066757202,0.3840000033378601,0.3880000114440918,0.3840000033378601,0.3819999992847442,0.382999986410141,0.3880000114440918,0.3860000073909759,0.3860000073909759,0.3869999945163727,0.3860000073909759,0.3899999856948852,0.3819999992847442,0.3860000073909759,0.3889999985694885,0.3840000033378601,0.395000010728836,0.3899999856948852,0.3899999856948852,0.3910000026226043,0.3959999978542328,0.3959999978542328,0.3919999897480011,0.3980000019073486,0.3880000114440918,0.3930000066757202,0.4000000059604645,0.3919999897480011,0.3919999897480011,0.4040000140666961,0.3930000066757202,0.3970000147819519,0.3889999985694885,0.3959999978542328,0.3930000066757202,0.3939999938011169,0.3970000147819519,0.3910000026226043,0.4020000100135803],"label":"FineWeb independent MinHash"}},"layout":{"xaxis":{"title":{"text":"Training tokens (billions)"}},"yaxis":{"title":{"text":"Agg Score"}},"title":{"text":"Independent dedup outperforms dedup across dumps"}}}
dist/assets/data/plots/cross_ind_unfiltered_comparison/hellaswag_acc_norm.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"data":{"big-run-refinedweb":{"x":[0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002,29.360128000000003,31.45728,33.554432000000006,35.651584,37.748736,39.845888,41.94304,44.040192000000005,46.137344000000006,48.234496,50.331648,52.4288,54.525952000000004,56.623104000000005,58.720256000000006,60.817408,62.91456,65.011712,67.10886400000001,69.206016,71.303168,73.40032000000001,75.497472,77.59462400000001,79.691776,81.788928,83.88608,85.983232,88.08038400000001,90.177536,92.27468800000001,94.37184,96.468992,98.56614400000001,100.663296,102.76044800000001,104.8576,106.95475200000001,109.05190400000001,111.149056,113.24620800000001,115.34336,117.44051200000001,119.537664,121.634816,123.73196800000001,125.82912,127.92627200000001,130.023424,132.120576,134.21772800000002,136.31488000000002,138.412032,140.509184,142.606336,144.70348800000002,146.80064000000002,148.897792,150.994944,153.092096,155.18924800000002,157.28640000000001,159.383552,161.480704,163.577856,165.67500800000002,167.77216,169.869312,171.966464,174.06361600000002,176.16076800000002,178.25792,180.355072,182.452224,184.54937600000002,186.64652800000002,188.74368,190.840832,192.937984,195.03513600000002,197.13228800000002,199.22944,201.326592,203.423744,205.52089600000002,207.61804800000002,209.7152,211.812352,213.90950400000003,216.00665600000002,218.10380800000001,220.20096,222.298112,224.39526400000003,226.49241600000002,228.589568,230.68672,232.783872,234.88102400000002,236.97817600000002,239.075328,241.17248,243.269632,245.36678400000002,247.46393600000002,249.561088,251.65824,253.75539200000003,255.85254400000002,257.949696,260.046848,262.144,264.241152,266.338304,268.43545600000004,270.53260800000004,272.62976000000003,274.726912,276.824064,278.921216,281.018368,283.11552,285.212672,287.309824,289.40697600000004,291.50412800000004,293.6012800
0000003,295.698432,297.795584,299.892736,301.989888,304.08704,306.184192,308.28134400000005,310.37849600000004,312.47564800000004,314.57280000000003,316.669952,318.767104,320.864256,322.961408,325.05856,327.155712,329.25286400000005,331.35001600000004,333.44716800000003,335.54432,337.641472,339.738624,341.835776,343.932928,346.03008,348.12723200000005,350.22438400000004],"y":[0.257999986410141,0.2759999930858612,0.328000009059906,0.3499999940395355,0.3889999985694885,0.3910000026226043,0.402999997138977,0.4210000038146972,0.4280000030994415,0.4359999895095825,0.4469999969005584,0.4440000057220459,0.4600000083446502,0.4690000116825104,0.4600000083446502,0.4679999947547912,0.4729999899864197,0.4760000109672546,0.4839999973773956,0.4939999878406524,0.488999992609024,0.4990000128746032,0.4979999959468841,0.4979999959468841,0.5009999871253967,0.5,0.5090000033378601,0.5070000290870667,0.5180000066757202,0.5199999809265137,0.5109999775886536,0.5130000114440918,0.5249999761581421,0.5149999856948853,0.5299999713897705,0.5339999794960022,0.5189999938011169,0.5289999842643738,0.5249999761581421,0.5320000052452087,0.5460000038146973,0.5419999957084656,0.5260000228881836,0.5289999842643738,0.546999990940094,0.5419999957084656,0.5419999957084656,0.5460000038146973,0.5419999957084656,0.5389999747276306,0.5440000295639038,0.5569999814033508,0.5450000166893005,0.5329999923706055,0.5580000281333923,0.5339999794960022,0.5540000200271606,0.5460000038146973,0.5479999780654907,0.5529999732971191,0.5540000200271606,0.5619999766349792,0.5490000247955322,0.5410000085830688,0.5490000247955322,0.5569999814033508,0.550000011920929,0.5479999780654907,0.5630000233650208,0.546999990940094,0.5559999942779541,0.5600000023841858,0.5509999990463257,0.5569999814033508,0.5569999814033508,0.5580000281333923,0.5619999766349792,0.5580000281333923,0.5669999718666077,0.5569999814033508,0.5709999799728394,0.5529999732971191,0.5649999976158142,0.5659999847412109,0.5659999847412109,0.5690000057220459,0.5600000
023841858,0.5580000281333923,0.5540000200271606,0.5640000104904175,0.5680000185966492,0.5709999799728394,0.5649999976158142,0.5680000185966492,0.5730000138282776,0.5640000104904175,0.5799999833106995,0.5699999928474426,0.5669999718666077,0.5680000185966492,0.5770000219345093,0.5709999799728394,0.5759999752044678,0.5690000057220459,0.5789999961853027,0.5740000009536743,0.5709999799728394,0.5789999961853027,0.5709999799728394,0.5770000219345093,0.5770000219345093,0.5730000138282776,0.5809999704360962,0.5720000267028809,0.5849999785423279,0.5820000171661377,0.5799999833106995,0.5830000042915344,0.5759999752044678,0.5730000138282776,0.5799999833106995,0.5830000042915344,0.5860000252723694,0.5789999961853027,0.5789999961853027,0.5860000252723694,0.5979999899864197,0.5920000076293945,0.5820000171661377,0.5870000123977661,0.5889999866485596,0.5839999914169312,0.5849999785423279,0.5899999737739563,0.5920000076293945,0.593999981880188,0.597000002861023,0.5889999866485596,0.5889999866485596,0.5849999785423279,0.5899999737739563,0.5989999771118164,0.5899999737739563,0.5839999914169312,0.5910000205039978,0.5910000205039978,0.5929999947547913,0.5920000076293945,0.5929999947547913,0.5889999866485596,0.5899999737739563,0.593999981880188,0.5910000205039978,0.5960000157356262,0.5920000076293945,0.5889999866485596,0.593999981880188,0.5879999995231628,0.5960000157356262,0.5920000076293945,0.5960000157356262,0.5960000157356262,0.5920000076293945,0.6010000109672546,0.5920000076293945,0.5899999737739563,0.5889999866485596,0.5920000076293945,0.6019999980926514],"label":"RefinedWeb"},"big-run-fineweb-cross-dedup-fixed":{"x":[0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,29.360128000000003,31.45728,33.554432000000006,35.651584,37.748736,39.845888,41.94304,44.040192000000005,46.137344000000006,48.234496,50.331648,52.4288,54.525952000000004,56.6231040
00000005,58.720256000000006,60.817408,62.91456,65.011712,67.10886400000001,69.206016,71.303168,73.40032000000001,75.497472,77.59462400000001,79.691776,81.788928,83.88608,85.983232,88.08038400000001,90.177536,92.27468800000001,94.37184,96.468992,98.56614400000001,100.663296,102.76044800000001,104.8576,106.95475200000001,109.05190400000001,111.149056,113.24620800000001,115.34336,117.44051200000001,119.537664,121.634816,123.73196800000001,125.82912,127.92627200000001,130.023424,132.120576,134.21772800000002,136.31488000000002,138.412032,140.509184,142.606336,144.70348800000002,146.80064000000002,148.897792,150.994944,153.092096,155.18924800000002,157.28640000000001,159.383552,161.480704,163.577856,165.67500800000002,167.77216,169.869312,171.966464,174.06361600000002,176.16076800000002,178.25792,180.355072,182.452224,184.54937600000002,186.64652800000002,188.74368,190.840832,192.937984,195.03513600000002,197.13228800000002,199.22944,201.326592,203.423744,205.52089600000002,207.61804800000002,209.7152,211.812352,213.90950400000003,216.00665600000002,218.10380800000001,220.20096,222.298112,224.39526400000003,226.49241600000002,228.589568,230.68672,232.783872,234.88102400000002,236.97817600000002,239.075328,241.17248,243.269632,245.36678400000002,247.46393600000002,249.561088,251.65824,253.75539200000003,255.85254400000002,257.949696,260.046848,262.144,264.241152,266.338304,268.43545600000004,270.53260800000004,272.62976000000003,274.726912,276.824064,278.921216,281.018368,283.11552,285.212672,287.309824,289.40697600000004,291.50412800000004,293.60128000000003,295.698432,297.795584,299.892736,301.989888,304.08704,306.184192,308.28134400000005,310.37849600000004,312.47564800000004,314.57280000000003,316.669952,318.767104,320.864256,322.961408,325.05856,327.155712,329.25286400000005,331.35001600000004,333.44716800000003,335.54432,337.641472,339.738624,341.835776,343.932928,346.03008,348.12723200000005,350.22438400000004],"y":[0.257999986410141,0.3009999990463257,0.3149999976
158142,0.3400000035762787,0.3610000014305115,0.3680000007152557,0.3799999952316284,0.4020000100135803,0.4180000126361847,0.4129999876022339,0.4259999990463257,0.4239999949932098,0.4440000057220459,0.44200000166893,0.4440000057220459,0.4580000042915344,0.4510000050067901,0.4560000002384186,0.4650000035762787,0.4569999873638153,0.460999995470047,0.4659999907016754,0.4679999947547912,0.4779999852180481,0.4740000069141388,0.4600000083446502,0.4860000014305115,0.4790000021457672,0.4880000054836273,0.4930000007152557,0.4860000014305115,0.4850000143051147,0.4900000095367431,0.4850000143051147,0.4900000095367431,0.4959999918937683,0.492000013589859,0.4850000143051147,0.4970000088214874,0.4900000095367431,0.4979999959468841,0.503000020980835,0.5040000081062317,0.4990000128746032,0.4979999959468841,0.5080000162124634,0.5019999742507935,0.4970000088214874,0.4939999878406524,0.5120000243186951,0.5070000290870667,0.503000020980835,0.5070000290870667,0.503000020980835,0.5109999775886536,0.5080000162124634,0.5009999871253967,0.5090000033378601,0.5,0.5149999856948853,0.5109999775886536,0.5099999904632568,0.5130000114440918,0.5080000162124634,0.5080000162124634,0.5109999775886536,0.5099999904632568,0.5239999890327454,0.5180000066757202,0.5130000114440918,0.5120000243186951,0.5180000066757202,0.515999972820282,0.5260000228881836,0.5199999809265137,0.5239999890327454,0.5220000147819519,0.527999997138977,0.5249999761581421,0.5270000100135803,0.5249999761581421,0.5189999938011169,0.5230000019073486,0.5249999761581421,0.5199999809265137,0.5230000019073486,0.5299999713897705,0.5350000262260437,0.5339999794960022,0.5329999923706055,0.5249999761581421,0.5299999713897705,0.5360000133514404,0.5329999923706055,0.5410000085830688,0.5249999761581421,0.5289999842643738,0.5360000133514404,0.5360000133514404,0.5370000004768372,0.5389999747276306,0.5289999842643738,0.5299999713897705,0.5410000085830688,0.5329999923706055,0.5419999957084656,0.5410000085830688,0.527999997138977,0.5370000004768372,0.54
29999828338623,0.5419999957084656,0.5389999747276306,0.5320000052452087,0.5350000262260437,0.5419999957084656,0.5410000085830688,0.5339999794960022,0.5440000295639038,0.5329999923706055,0.5429999828338623,0.5460000038146973,0.5400000214576721,0.5429999828338623,0.5479999780654907,0.550000011920929,0.5490000247955322,0.5410000085830688,0.5450000166893005,0.5429999828338623,0.550000011920929,0.5529999732971191,0.5490000247955322,0.5450000166893005,0.5450000166893005,0.5519999861717224,0.5569999814033508,0.5460000038146973,0.546999990940094,0.5509999990463257,0.5509999990463257,0.5450000166893005,0.5440000295639038,0.5440000295639038,0.546999990940094,0.5479999780654907,0.546999990940094,0.5460000038146973,0.546999990940094,0.5479999780654907,0.5460000038146973,0.5460000038146973,0.5440000295639038,0.5410000085830688,0.5440000295639038,0.5389999747276306,0.5410000085830688,0.546999990940094,0.546999990940094,0.5479999780654907,0.546999990940094,0.550000011920929,0.546999990940094,0.5460000038146973,0.546999990940094,0.5479999780654907,0.5479999780654907,0.5519999861717224,0.550000011920929],"label":"FineWeb full 
MinHash"},"big-run-sampled_full_filtered_no_dedup":{"x":[0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002,29.360128000000003,31.45728,33.554432000000006,35.651584,37.748736,39.845888,41.94304,44.040192000000005,46.137344000000006,48.234496,50.331648,52.4288,54.525952000000004,56.623104000000005,58.720256000000006,60.817408,62.91456,65.011712,67.10886400000001,69.206016,71.303168,73.40032000000001,75.497472,77.59462400000001,79.691776,81.788928,83.88608,85.983232,88.08038400000001,90.177536,92.27468800000001,94.37184,96.468992,98.56614400000001,100.663296,102.76044800000001,104.8576,106.95475200000001,109.05190400000001,111.149056,113.24620800000001,115.34336,117.44051200000001,119.537664,121.634816,123.73196800000001,125.82912,127.92627200000001,130.023424,132.120576,134.21772800000002,136.31488000000002,138.412032,140.509184,142.606336,144.70348800000002,146.80064000000002,148.897792,150.994944,153.092096,155.18924800000002,157.28640000000001,159.383552,161.480704,163.577856,165.67500800000002,167.77216,169.869312,171.966464,174.06361600000002,176.16076800000002,178.25792,180.355072,182.452224,184.54937600000002,186.64652800000002,188.74368,190.840832,192.937984,195.03513600000002,197.13228800000002,199.22944,201.326592,203.423744,205.52089600000002,207.61804800000002,209.7152,211.812352,213.90950400000003,216.00665600000002,218.10380800000001,220.20096,222.298112,224.39526400000003,226.49241600000002,228.589568,230.68672,232.783872,234.88102400000002,236.97817600000002,239.075328,241.17248,243.269632,245.36678400000002,247.46393600000002,249.561088,251.65824,253.75539200000003,255.85254400000002,257.949696,260.046848,262.144,264.241152,266.338304,268.43545600000004,270.53260800000004,272.62976000000003,274.726912,276.824064,278.921216,281.018368,283.11552,285.212672,287.309824,289.40697600000004,291.5041280
0000004,293.60128000000003,295.698432,297.795584,299.892736,301.989888,304.08704,306.184192,308.28134400000005,310.37849600000004,312.47564800000004,314.57280000000003,316.669952,318.767104,320.864256,322.961408,325.05856,327.155712,329.25286400000005,331.35001600000004,333.44716800000003,335.54432,337.641472,339.738624,341.835776,343.932928,346.03008,348.12723200000005,350.22438400000004],"y":[0.257999986410141,0.2809999883174896,0.3230000138282776,0.3409999907016754,0.3600000143051147,0.3569999933242798,0.3889999985694885,0.395000010728836,0.4199999868869781,0.4180000126361847,0.421999990940094,0.4289999902248382,0.4350000023841858,0.4359999895095825,0.4469999969005584,0.4350000023841858,0.4480000138282776,0.4480000138282776,0.453000009059906,0.4550000131130218,0.4589999914169311,0.4639999866485595,0.4600000083446502,0.460999995470047,0.4589999914169311,0.481000006198883,0.4769999980926513,0.4709999859333038,0.4740000069141388,0.4679999947547912,0.4790000021457672,0.4729999899864197,0.4819999933242798,0.4850000143051147,0.4819999933242798,0.4819999933242798,0.4880000054836273,0.4869999885559082,0.4959999918937683,0.4850000143051147,0.4959999918937683,0.492000013589859,0.503000020980835,0.4930000007152557,0.5099999904632568,0.5040000081062317,0.5009999871253967,0.4970000088214874,0.4979999959468841,0.5059999823570251,0.5070000290870667,0.5040000081062317,0.5059999823570251,0.5049999952316284,0.5080000162124634,0.5049999952316284,0.5019999742507935,0.5120000243186951,0.5170000195503235,0.5170000195503235,0.5090000033378601,0.5239999890327454,0.527999997138977,0.5230000019073486,0.5210000276565552,0.5149999856948853,0.5189999938011169,0.5270000100135803,0.5149999856948853,0.5099999904632568,0.5299999713897705,0.5199999809265137,0.5230000019073486,0.5260000228881836,0.5249999761581421,0.5239999890327454,0.5329999923706055,0.5210000276565552,0.5260000228881836,0.5170000195503235,0.531000018119812,0.5289999842643738,0.531000018119812,0.5270000100135803,0.529999971389770
5,0.5370000004768372,0.5379999876022339,0.5419999957084656,0.5329999923706055,0.5360000133514404,0.5299999713897705,0.5360000133514404,0.5270000100135803,0.5450000166893005,0.5410000085830688,0.546999990940094,0.5329999923706055,0.5329999923706055,0.5379999876022339,0.5299999713897705,0.5429999828338623,0.5360000133514404,0.5339999794960022,0.5419999957084656,0.5410000085830688,0.5370000004768372,0.5389999747276306,0.527999997138977,0.5400000214576721,0.5400000214576721,0.531000018119812,0.5440000295639038,0.5460000038146973,0.5479999780654907,0.5460000038146973,0.5410000085830688,0.5509999990463257,0.5479999780654907,0.5410000085830688,0.5389999747276306,0.550000011920929,0.5569999814033508,0.550000011920929,0.5490000247955322,0.5490000247955322,0.5569999814033508,0.5519999861717224,0.5479999780654907,0.5559999942779541,0.5550000071525574,0.5460000038146973,0.5540000200271606,0.5460000038146973,0.5460000038146973,0.5509999990463257,0.5460000038146973,0.5550000071525574,0.5479999780654907,0.5479999780654907,0.5540000200271606,0.5550000071525574,0.5529999732971191,0.5529999732971191,0.5509999990463257,0.5509999990463257,0.5419999957084656,0.546999990940094,0.5509999990463257,0.5559999942779541,0.5490000247955322,0.5509999990463257,0.5529999732971191,0.550000011920929,0.5540000200271606,0.5550000071525574,0.5580000281333923,0.550000011920929,0.5569999814033508,0.5490000247955322,0.5519999861717224,0.5519999861717224,0.5559999942779541,0.5569999814033508,0.5559999942779541,0.5550000071525574,0.5559999942779541,0.5490000247955322,0.5550000071525574,0.5600000023841858],"label":"FineWeb filtered 
only"},"big-run-sampled_full_ind_minhash":{"x":[0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002,29.360128000000003,31.45728,33.554432000000006,35.651584,37.748736,39.845888,41.94304,44.040192000000005,46.137344000000006,48.234496,50.331648,52.4288,54.525952000000004,56.623104000000005,58.720256000000006,60.817408,62.91456,65.011712,67.10886400000001,69.206016,71.303168,73.40032000000001,75.497472,77.59462400000001,79.691776,81.788928,83.88608,85.983232,88.08038400000001,90.177536,92.27468800000001,94.37184,96.468992,98.56614400000001,100.663296,102.76044800000001,104.8576,106.95475200000001,109.05190400000001,111.149056,113.24620800000001,115.34336,117.44051200000001,119.537664,121.634816,123.73196800000001,125.82912,127.92627200000001,130.023424,132.120576,134.21772800000002,136.31488000000002,138.412032,140.509184,142.606336,144.70348800000002,146.80064000000002,148.897792,150.994944,153.092096,155.18924800000002,157.28640000000001,159.383552,161.480704,163.577856,165.67500800000002,167.77216,169.869312,171.966464,174.06361600000002,176.16076800000002,178.25792,180.355072,182.452224,184.54937600000002,186.64652800000002,188.74368,190.840832,192.937984,195.03513600000002,197.13228800000002,199.22944,201.326592,203.423744,205.52089600000002,207.61804800000002,209.7152,211.812352,213.90950400000003,216.00665600000002,218.10380800000001,220.20096,222.298112,224.39526400000003,226.49241600000002,228.589568,230.68672,232.783872,234.88102400000002,236.97817600000002,239.075328,241.17248,243.269632,245.36678400000002,247.46393600000002,249.561088,251.65824,253.75539200000003,255.85254400000002,257.949696,260.046848,262.144,264.241152,266.338304,268.43545600000004,270.53260800000004,272.62976000000003,274.726912,276.824064,278.921216,281.018368,283.11552,285.212672,287.309824,289.40697600000004,291.50412800000004,2
93.60128000000003,295.698432,297.795584,299.892736,301.989888,304.08704,306.184192,308.28134400000005,310.37849600000004,312.47564800000004,314.57280000000003,316.669952,318.767104,320.864256,322.961408,325.05856,327.155712,329.25286400000005,331.35001600000004,333.44716800000003,335.54432,337.641472,339.738624,341.835776,343.932928,346.03008,348.12723200000005,350.22438400000004],"y":[0.257999986410141,0.3019999861717224,0.3059999942779541,0.335999995470047,0.3610000014305115,0.3819999992847442,0.4009999930858612,0.4020000100135803,0.4250000119209289,0.4309999942779541,0.4469999969005584,0.4519999921321869,0.453000009059906,0.4580000042915344,0.4729999899864197,0.4749999940395355,0.4699999988079071,0.4799999892711639,0.4749999940395355,0.4769999980926513,0.481000006198883,0.4839999973773956,0.4959999918937683,0.5040000081062317,0.4970000088214874,0.4979999959468841,0.5070000290870667,0.5049999952316284,0.5109999775886536,0.515999972820282,0.5120000243186951,0.5120000243186951,0.515999972820282,0.5120000243186951,0.5249999761581421,0.5170000195503235,0.5199999809265137,0.5270000100135803,0.5170000195503235,0.5220000147819519,0.5260000228881836,0.5360000133514404,0.5339999794960022,0.5370000004768372,0.5339999794960022,0.5329999923706055,0.531000018119812,0.5329999923706055,0.5400000214576721,0.5429999828338623,0.5389999747276306,0.5419999957084656,0.5429999828338623,0.5360000133514404,0.5299999713897705,0.546999990940094,0.5360000133514404,0.5450000166893005,0.5440000295639038,0.5350000262260437,0.5339999794960022,0.5419999957084656,0.5450000166893005,0.5460000038146973,0.5370000004768372,0.5490000247955322,0.5440000295639038,0.550000011920929,0.5490000247955322,0.5450000166893005,0.5490000247955322,0.5559999942779541,0.5559999942779541,0.5410000085830688,0.5419999957084656,0.5529999732971191,0.5460000038146973,0.5540000200271606,0.5379999876022339,0.5509999990463257,0.5540000200271606,0.5419999957084656,0.546999990940094,0.5479999780654907,0.5460000038146973,0.5460
000038146973,0.5519999861717224,0.5600000023841858,0.5540000200271606,0.5509999990463257,0.5609999895095825,0.5619999766349792,0.5590000152587891,0.5559999942779541,0.5580000281333923,0.5640000104904175,0.5649999976158142,0.5590000152587891,0.5550000071525574,0.5630000233650208,0.5630000233650208,0.5609999895095825,0.5559999942779541,0.5609999895095825,0.5630000233650208,0.5680000185966492,0.5630000233650208,0.5690000057220459,0.5609999895095825,0.5590000152587891,0.5640000104904175,0.5690000057220459,0.5640000104904175,0.5630000233650208,0.574999988079071,0.5630000233650208,0.5619999766349792,0.5690000057220459,0.5770000219345093,0.5690000057220459,0.5609999895095825,0.5649999976158142,0.5680000185966492,0.5590000152587891,0.5600000023841858,0.5619999766349792,0.5799999833106995,0.5619999766349792,0.5699999928474426,0.5709999799728394,0.5669999718666077,0.5680000185966492,0.5609999895095825,0.5649999976158142,0.5680000185966492,0.5730000138282776,0.5720000267028809,0.5709999799728394,0.5770000219345093,0.574999988079071,0.5730000138282776,0.5690000057220459,0.5740000009536743,0.578000009059906,0.574999988079071,0.5820000171661377,0.5730000138282776,0.5740000009536743,0.574999988079071,0.5770000219345093,0.5789999961853027,0.5759999752044678,0.5720000267028809,0.5770000219345093,0.5759999752044678,0.5789999961853027,0.5789999961853027,0.5730000138282776,0.5789999961853027,0.5759999752044678,0.5690000057220459,0.5849999785423279,0.5759999752044678,0.5699999928474426,0.5789999961853027,0.5820000171661377,0.5730000138282776,0.5730000138282776,0.5789999961853027],"label":"FineWeb independent MinHash"}},"layout":{"xaxis":{"title":{"text":"Training tokens (billions)"}},"yaxis":{"title":{"text":"Agg Score"}},"title":{"text":"Independent dedup outperforms dedup across dumps"}}}
dist/assets/data/plots/cross_ind_unfiltered_comparison/index.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"files":{"agg_score":{"file":"agg_score.json"},"commonsense_qa/acc_norm":{"file":"commonsense_qa_acc_norm.json"},"hellaswag/acc_norm":{"file":"hellaswag_acc_norm.json"},"openbookqa/acc_norm":{"file":"openbookqa_acc_norm.json"},"piqa/acc_norm":{"file":"piqa_acc_norm.json"},"winogrande/acc_norm":{"file":"winogrande_acc_norm.json"},"arc/acc_norm":{"file":"arc_acc_norm.json"},"mmlu/acc_norm":{"file":"mmlu_acc_norm.json"}},"settings":{"slider":{"min":0,"max":30,"default":5}}}
dist/assets/data/plots/cross_ind_unfiltered_comparison/mmlu_acc_norm.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"data":{"big-run-refinedweb":{"x":[0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002,29.360128000000003,31.45728,33.554432000000006,35.651584,37.748736,39.845888,41.94304,44.040192000000005,46.137344000000006,48.234496,50.331648,52.4288,54.525952000000004,56.623104000000005,58.720256000000006,60.817408,62.91456,65.011712,67.10886400000001,69.206016,71.303168,73.40032000000001,75.497472,77.59462400000001,79.691776,81.788928,83.88608,85.983232,88.08038400000001,90.177536,92.27468800000001,94.37184,96.468992,98.56614400000001,100.663296,102.76044800000001,104.8576,106.95475200000001,109.05190400000001,111.149056,113.24620800000001,115.34336,117.44051200000001,119.537664,121.634816,123.73196800000001,125.82912,127.92627200000001,130.023424,132.120576,134.21772800000002,136.31488000000002,138.412032,140.509184,142.606336,144.70348800000002,146.80064000000002,148.897792,150.994944,153.092096,155.18924800000002,157.28640000000001,159.383552,161.480704,163.577856,165.67500800000002,167.77216,169.869312,171.966464,174.06361600000002,176.16076800000002,178.25792,180.355072,182.452224,184.54937600000002,186.64652800000002,188.74368,190.840832,192.937984,195.03513600000002,197.13228800000002,199.22944,201.326592,203.423744,205.52089600000002,207.61804800000002,209.7152,211.812352,213.90950400000003,216.00665600000002,218.10380800000001,220.20096,222.298112,224.39526400000003,226.49241600000002,228.589568,230.68672,232.783872,234.88102400000002,236.97817600000002,239.075328,241.17248,243.269632,245.36678400000002,247.46393600000002,249.561088,251.65824,253.75539200000003,255.85254400000002,257.949696,260.046848,262.144,264.241152,266.338304,268.43545600000004,270.53260800000004,272.62976000000003,274.726912,276.824064,278.921216,281.018368,283.11552,285.212672,287.309824,289.40697600000004,291.50412800000004,293.6012800
0000003,295.698432,297.795584,299.892736,301.989888,304.08704,306.184192,308.28134400000005,310.37849600000004,312.47564800000004,314.57280000000003,316.669952,318.767104,320.864256,322.961408,325.05856,327.155712,329.25286400000005,331.35001600000004,333.44716800000003,335.54432,337.641472,339.738624,341.835776,343.932928,346.03008,348.12723200000005,350.22438400000004],"y":[0.2501466572284698,0.2528519630432129,0.2616856694221496,0.2665999829769134,0.2683407664299011,0.2742894291877746,0.2762066125869751,0.2807516455650329,0.2767378389835357,0.2807380557060241,0.2788906991481781,0.2844051718711853,0.2856102883815765,0.2883394360542297,0.2875711619853973,0.2890409529209137,0.2894668281078338,0.2883355319499969,0.2872501015663147,0.291619062423706,0.2900333702564239,0.2962473034858703,0.2962896525859833,0.297355443239212,0.2932226359844208,0.2886744439601898,0.29665008187294,0.2976542115211487,0.2991503179073334,0.3004479110240936,0.3044549524784088,0.2976194322109222,0.3014707863330841,0.3048252463340759,0.3039425611495971,0.303354948759079,0.3027459383010864,0.2999922931194305,0.3050121665000915,0.2998814284801483,0.2978588044643402,0.3041949570178985,0.3010904192924499,0.3022017180919647,0.2997751235961914,0.3015910983085632,0.3096485137939453,0.3012076020240783,0.3065535724163055,0.3042872548103332,0.3104783594608307,0.2997980415821075,0.3051296770572662,0.303458571434021,0.3088337182998657,0.3145398199558258,0.3032208085060119,0.310806930065155,0.3075874149799347,0.3101692199707031,0.310107946395874,0.3066047430038452,0.3109066784381866,0.3081336915493011,0.3084586262702942,0.3086149394512176,0.3085348606109619,0.3136637806892395,0.3110873103141784,0.31076380610466,0.3084572553634643,0.3133681714534759,0.3125792145729065,0.3124453127384186,0.3097185790538788,0.3106793165206909,0.3089564740657806,0.3111244142055511,0.3123694658279419,0.3144859969615936,0.3135123550891876,0.311982125043869,0.3142133951187134,0.3122903704643249,0.3147654831409454,0.307876735925674
4,0.314947634935379,0.3171303570270538,0.3129573762416839,0.3154936134815216,0.3158208429813385,0.3153132200241089,0.3141326904296875,0.3163397014141083,0.3166318237781524,0.3168410360813141,0.3198235332965851,0.3201336860656738,0.3212967813014984,0.3191385567188263,0.3178017139434814,0.3192791938781738,0.323061466217041,0.320336639881134,0.3165886104106903,0.3206393420696258,0.3167395293712616,0.3135207295417785,0.315539002418518,0.3191742599010467,0.321073055267334,0.3222262561321258,0.3193058371543884,0.3213480710983276,0.3198905289173126,0.3219239711761474,0.3211614489555359,0.318855881690979,0.3177095353603363,0.324197381734848,0.3208906352519989,0.3264936804771423,0.3245965242385864,0.3231639564037323,0.3221887946128845,0.3277338445186615,0.3227696120738983,0.3263820111751556,0.3258577883243561,0.3264622390270233,0.3222362995147705,0.3286814987659454,0.3235024213790893,0.32446950674057,0.3311836123466491,0.328130304813385,0.3271634578704834,0.3250012993812561,0.3309800624847412,0.3274554014205932,0.3273015916347503,0.3261759579181671,0.32697594165802,0.3303172886371612,0.3282814025878906,0.3289586305618286,0.3260826468467712,0.3258011937141418,0.3297208249568939,0.3254813551902771,0.3287739753723144,0.3287097811698913,0.3275279700756073,0.3293041586875915,0.3314100801944732,0.3287808299064636,0.3251930773258209,0.3288172781467438,0.3265027701854706,0.3275215625762939,0.3290774822235107,0.3261331617832184,0.3299777805805206,0.331955999135971,0.3305029273033142,0.3274719417095184,0.3235560953617096,0.3269940316677093,0.3323083519935608],"label":"RefinedWeb"},"big-run-fineweb-cross-dedup-fixed":{"x":[0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,29.360128000000003,31.45728,33.554432000000006,35.651584,37.748736,39.845888,41.94304,44.040192000000005,46.137344000000006,48.234496,50.331648,52.4288,54.525952000000004,56.62310
4000000005,58.720256000000006,60.817408,62.91456,65.011712,67.10886400000001,69.206016,71.303168,73.40032000000001,75.497472,77.59462400000001,79.691776,81.788928,83.88608,85.983232,88.08038400000001,90.177536,92.27468800000001,94.37184,96.468992,98.56614400000001,100.663296,102.76044800000001,104.8576,106.95475200000001,109.05190400000001,111.149056,113.24620800000001,115.34336,117.44051200000001,119.537664,121.634816,123.73196800000001,125.82912,127.92627200000001,130.023424,132.120576,134.21772800000002,136.31488000000002,138.412032,140.509184,142.606336,144.70348800000002,146.80064000000002,148.897792,150.994944,153.092096,155.18924800000002,157.28640000000001,159.383552,161.480704,163.577856,165.67500800000002,167.77216,169.869312,171.966464,174.06361600000002,176.16076800000002,178.25792,180.355072,182.452224,184.54937600000002,186.64652800000002,188.74368,190.840832,192.937984,195.03513600000002,197.13228800000002,199.22944,201.326592,203.423744,205.52089600000002,207.61804800000002,209.7152,211.812352,213.90950400000003,216.00665600000002,218.10380800000001,220.20096,222.298112,224.39526400000003,226.49241600000002,228.589568,230.68672,232.783872,234.88102400000002,236.97817600000002,239.075328,241.17248,243.269632,245.36678400000002,247.46393600000002,249.561088,251.65824,253.75539200000003,255.85254400000002,257.949696,260.046848,262.144,264.241152,266.338304,268.43545600000004,270.53260800000004,272.62976000000003,274.726912,276.824064,278.921216,281.018368,283.11552,285.212672,287.309824,289.40697600000004,291.50412800000004,293.60128000000003,295.698432,297.795584,299.892736,301.989888,304.08704,306.184192,308.28134400000005,310.37849600000004,312.47564800000004,314.57280000000003,316.669952,318.767104,320.864256,322.961408,325.05856,327.155712,329.25286400000005,331.35001600000004,333.44716800000003,335.54432,337.641472,339.738624,341.835776,343.932928,346.03008,348.12723200000005,350.22438400000004],"y":[0.2501466572284698,0.2510619163513183,0.2621481
418609619,0.2632303833961487,0.2720474302768707,0.2719806432723999,0.2726832032203674,0.2786827087402344,0.2823672890663147,0.276201844215393,0.2816944718360901,0.280361145734787,0.2819306254386902,0.2823295891284942,0.2892518043518066,0.2872919738292694,0.2859259247779846,0.2885263860225677,0.2862614393234253,0.2933129370212555,0.2930494546890259,0.2884900867938995,0.2942298054695129,0.2927677929401397,0.2954220175743103,0.2918704748153686,0.2943699061870575,0.2891678512096405,0.291848212480545,0.2942944765090942,0.2973679602146148,0.2953736186027527,0.2963412702083587,0.297100305557251,0.2963026762008667,0.2944463491439819,0.2971296310424804,0.293870210647583,0.2982682287693023,0.2978119254112243,0.2989997565746307,0.2993503510951996,0.298117071390152,0.2977498769760132,0.3004056811332702,0.3012634217739105,0.3001384139060974,0.3052266240119934,0.3038219809532165,0.3037647306919098,0.3009455502033233,0.3038812279701233,0.303263396024704,0.3025077581405639,0.3056069612503052,0.3024908602237701,0.3050909340381622,0.3001562356948852,0.303833544254303,0.3019777834415436,0.3036664128303528,0.3022894859313965,0.3042722940444946,0.3023003339767456,0.3069425821304321,0.307883083820343,0.3026910126209259,0.3054113090038299,0.3046148121356964,0.305342435836792,0.3048149049282074,0.3066973984241485,0.3055126965045929,0.3063409924507141,0.307701051235199,0.3075169324874878,0.3091190159320831,0.3098153173923492,0.31436288356781,0.3096509575843811,0.3022815883159637,0.3119745552539825,0.3083471357822418,0.3085280954837799,0.3082001209259033,0.3080264329910278,0.3116717934608459,0.3097788393497467,0.3117353916168213,0.3170038759708404,0.3099159002304077,0.3133728504180908,0.3161626160144806,0.3095119595527649,0.3135432302951813,0.3103009164333343,0.3126655519008636,0.3121814131736755,0.3123973608016968,0.3148256838321686,0.3144133985042572,0.3124284744262695,0.3102188408374786,0.3123636841773987,0.3115113973617553,0.3151636719703674,0.3148572146892547,0.315061867237091,0.3127182
424068451,0.3139308094978332,0.3134367167949676,0.3136025071144104,0.3172793388366699,0.3134761154651642,0.3109587132930755,0.3127998411655426,0.3161843717098236,0.3163313865661621,0.3145243525505066,0.3155156075954437,0.3127505779266357,0.3182451128959656,0.3162476718425751,0.3124897480010986,0.3128789663314819,0.3119811117649078,0.314126193523407,0.3136049509048462,0.3149912655353546,0.3146650791168213,0.3151968121528625,0.3179666996002197,0.3169245719909668,0.3202513754367828,0.3185319602489471,0.3202781081199646,0.3186031281948089,0.3166128396987915,0.3199457228183746,0.3194417059421539,0.3170624077320099,0.3184532523155212,0.3191981911659241,0.3191225528717041,0.3173209130764007,0.3195607960224151,0.3166368305683136,0.3188160359859466,0.3174867630004883,0.3184468746185303,0.3211863338947296,0.3184327483177185,0.3177861273288727,0.3180214762687683,0.3194973170757293,0.3212297558784485,0.3211282789707184,0.3200584352016449,0.3168685734272003,0.3211040198802948,0.3222841620445251,0.3196901082992553,0.3236229419708252,0.3204475045204162,0.3210069537162781,0.3191083669662475,0.31863734126091,0.3195922076702118],"label":"FineWeb full 
MinHash"},"big-run-sampled_full_filtered_no_dedup":{"x":[0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002,29.360128000000003,31.45728,33.554432000000006,35.651584,37.748736,39.845888,41.94304,44.040192000000005,46.137344000000006,48.234496,50.331648,52.4288,54.525952000000004,56.623104000000005,58.720256000000006,60.817408,62.91456,65.011712,67.10886400000001,69.206016,71.303168,73.40032000000001,75.497472,77.59462400000001,79.691776,81.788928,83.88608,85.983232,88.08038400000001,90.177536,92.27468800000001,94.37184,96.468992,98.56614400000001,100.663296,102.76044800000001,104.8576,106.95475200000001,109.05190400000001,111.149056,113.24620800000001,115.34336,117.44051200000001,119.537664,121.634816,123.73196800000001,125.82912,127.92627200000001,130.023424,132.120576,134.21772800000002,136.31488000000002,138.412032,140.509184,142.606336,144.70348800000002,146.80064000000002,148.897792,150.994944,153.092096,155.18924800000002,157.28640000000001,159.383552,161.480704,163.577856,165.67500800000002,167.77216,169.869312,171.966464,174.06361600000002,176.16076800000002,178.25792,180.355072,182.452224,184.54937600000002,186.64652800000002,188.74368,190.840832,192.937984,195.03513600000002,197.13228800000002,199.22944,201.326592,203.423744,205.52089600000002,207.61804800000002,209.7152,211.812352,213.90950400000003,216.00665600000002,218.10380800000001,220.20096,222.298112,224.39526400000003,226.49241600000002,228.589568,230.68672,232.783872,234.88102400000002,236.97817600000002,239.075328,241.17248,243.269632,245.36678400000002,247.46393600000002,249.561088,251.65824,253.75539200000003,255.85254400000002,257.949696,260.046848,262.144,264.241152,266.338304,268.43545600000004,270.53260800000004,272.62976000000003,274.726912,276.824064,278.921216,281.018368,283.11552,285.212672,287.309824,289.40697600000004,291.5041280
0000004,293.60128000000003,295.698432,297.795584,299.892736,301.989888,304.08704,306.184192,308.28134400000005,310.37849600000004,312.47564800000004,314.57280000000003,316.669952,318.767104,320.864256,322.961408,325.05856,327.155712,329.25286400000005,331.35001600000004,333.44716800000003,335.54432,337.641472,339.738624,341.835776,343.932928,346.03008,348.12723200000005,350.22438400000004],"y":[0.2501466572284698,0.2516599297523498,0.2610189318656921,0.2666046619415283,0.2667981088161468,0.2667821645736694,0.2708088159561157,0.2738403379917145,0.2726235687732696,0.2762763500213623,0.2768311202526092,0.2809228301048279,0.2836140990257263,0.2822815179824829,0.2831664383411407,0.2797218561172485,0.286342591047287,0.2855269610881805,0.2847287058830261,0.2888180613517761,0.286526083946228,0.2865165770053863,0.294582188129425,0.2925947606563568,0.2947863042354584,0.2892930805683136,0.2903610467910766,0.288201242685318,0.2873396277427673,0.2916238009929657,0.2908017039299011,0.2907920777797699,0.2952797412872314,0.2941452264785766,0.2921333611011505,0.2925891280174255,0.2968584895133972,0.2980035543441772,0.2964116632938385,0.2962304651737213,0.2950254380702972,0.2977516651153564,0.2944138348102569,0.3003402054309845,0.2976303696632385,0.3013098239898681,0.302829384803772,0.3018766045570373,0.305361807346344,0.2971298694610595,0.3014816343784332,0.3019805550575256,0.3037064969539642,0.2970167994499206,0.2995208501815796,0.2970106601715088,0.2990955114364624,0.3027818500995636,0.3048534691333771,0.2993872463703155,0.2986327707767486,0.3015393316745758,0.3003426790237427,0.3003274798393249,0.3017795085906982,0.3019182682037353,0.3015450537204742,0.3046211004257202,0.3031167984008789,0.3020436763763428,0.3011128306388855,0.3029948472976684,0.3045558631420135,0.301642894744873,0.3029441833496094,0.3035804331302643,0.3004390001296997,0.3021787703037262,0.306041270494461,0.3064048886299133,0.3087956011295318,0.3070018291473388,0.3065581619739532,0.3093871772289276,0.306093007326
1261,0.3033313155174255,0.3072777390480041,0.306413859128952,0.3104493916034698,0.3056999444961548,0.3077532052993774,0.309231549501419,0.3070645034313202,0.3117790520191192,0.3114112913608551,0.312661737203598,0.3181777000427246,0.3117201030254364,0.3099702894687652,0.3074746131896972,0.3064963519573211,0.3105958700180053,0.3111456036567688,0.3084964454174042,0.3087405860424042,0.3121673166751861,0.3121528625488281,0.3100416660308838,0.3142979145050049,0.3129935264587402,0.3112611472606659,0.3119436800479889,0.3154115974903106,0.3091593086719513,0.3103814721107483,0.3130497634410858,0.3133455514907837,0.3152708411216736,0.3137963414192199,0.3099324703216553,0.3164172768592834,0.3133907914161682,0.3128255009651184,0.3134104907512665,0.3106969892978668,0.3130004107952118,0.3131391704082489,0.3130116462707519,0.3143952488899231,0.3143975436687469,0.3143710494041443,0.3163396418094635,0.3166862726211548,0.3184126019477844,0.3178988993167877,0.317479133605957,0.3184944093227386,0.316694974899292,0.3176258206367492,0.3182629346847534,0.3200214207172394,0.3181648552417755,0.320680022239685,0.3178716897964477,0.3182425796985626,0.3182984292507171,0.3158398568630218,0.3152642548084259,0.3132680356502533,0.3178914785385132,0.3156660795211792,0.3161703050136566,0.3176451921463012,0.3173815906047821,0.3194171786308288,0.3193057179450989,0.3172560334205627,0.317656546831131,0.3155770003795624,0.3199106156826019,0.3170182108879089,0.3156754970550537,0.3180731236934662,0.3205638229846954,0.3175432682037353,0.3184471428394317,0.3192788958549499,0.3197042346000671,0.3177168369293213],"label":"FineWeb filtered 
only"},"big-run-sampled_full_ind_minhash":{"x":[0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002,29.360128000000003,31.45728,33.554432000000006,35.651584,37.748736,39.845888,41.94304,44.040192000000005,46.137344000000006,48.234496,50.331648,52.4288,54.525952000000004,56.623104000000005,58.720256000000006,60.817408,62.91456,65.011712,67.10886400000001,69.206016,71.303168,73.40032000000001,75.497472,77.59462400000001,79.691776,81.788928,83.88608,85.983232,88.08038400000001,90.177536,92.27468800000001,94.37184,96.468992,98.56614400000001,100.663296,102.76044800000001,104.8576,106.95475200000001,109.05190400000001,111.149056,113.24620800000001,115.34336,117.44051200000001,119.537664,121.634816,123.73196800000001,125.82912,127.92627200000001,130.023424,132.120576,134.21772800000002,136.31488000000002,138.412032,140.509184,142.606336,144.70348800000002,146.80064000000002,148.897792,150.994944,153.092096,155.18924800000002,157.28640000000001,159.383552,161.480704,163.577856,165.67500800000002,167.77216,169.869312,171.966464,174.06361600000002,176.16076800000002,178.25792,180.355072,182.452224,184.54937600000002,186.64652800000002,188.74368,190.840832,192.937984,195.03513600000002,197.13228800000002,199.22944,201.326592,203.423744,205.52089600000002,207.61804800000002,209.7152,211.812352,213.90950400000003,216.00665600000002,218.10380800000001,220.20096,222.298112,224.39526400000003,226.49241600000002,228.589568,230.68672,232.783872,234.88102400000002,236.97817600000002,239.075328,241.17248,243.269632,245.36678400000002,247.46393600000002,249.561088,251.65824,253.75539200000003,255.85254400000002,257.949696,260.046848,262.144,264.241152,266.338304,268.43545600000004,270.53260800000004,272.62976000000003,274.726912,276.824064,278.921216,281.018368,283.11552,285.212672,287.309824,289.40697600000004,291.50412800000004,2
93.60128000000003,295.698432,297.795584,299.892736,301.989888,304.08704,306.184192,308.28134400000005,310.37849600000004,312.47564800000004,314.57280000000003,316.669952,318.767104,320.864256,322.961408,325.05856,327.155712,329.25286400000005,331.35001600000004,333.44716800000003,335.54432,337.641472,339.738624,341.835776,343.932928,346.03008,348.12723200000005,350.22438400000004],"y":[0.2501466572284698,0.2558934390544891,0.2618628144264221,0.2683217823505401,0.2699837982654571,0.2738722860813141,0.2744417488574981,0.2740873992443084,0.2807216048240661,0.2820421457290649,0.2891400754451751,0.2879075407981872,0.2881667613983154,0.2892490327358246,0.2882707118988037,0.2935869693756103,0.2870290875434875,0.2911452651023865,0.2949125170707702,0.2916406095027923,0.2981449663639068,0.2953989207744598,0.2946988642215729,0.297021746635437,0.3001497685909271,0.3010218441486358,0.2977036237716675,0.2992585003376007,0.2986803948879242,0.2994338274002075,0.2989781498908996,0.3041955828666687,0.3030496537685394,0.303806334733963,0.3036351203918457,0.3058845102787018,0.300450712442398,0.3025284707546234,0.3072526752948761,0.3039065897464752,0.3073755502700805,0.3070493042469024,0.3083153367042541,0.3123056292533874,0.307761400938034,0.3053378164768219,0.3116358816623688,0.3080427348613739,0.308482676744461,0.307318776845932,0.3083004653453827,0.3089516758918762,0.3088736236095428,0.3077724277973175,0.3126304149627685,0.3101697862148285,0.3159398734569549,0.314792275428772,0.3103811144828796,0.3111368715763092,0.3129658997058868,0.311605304479599,0.3118223249912262,0.3133279979228973,0.3146496713161468,0.3195074200630188,0.3142614662647247,0.3125102519989013,0.3115333616733551,0.3183117806911468,0.3168580532073974,0.3187012672424316,0.3179306983947754,0.3157722651958465,0.3214826583862304,0.3145081698894501,0.3172421753406524,0.3151432573795318,0.3181649446487427,0.3180212080478668,0.3171605765819549,0.3212067782878876,0.3180184066295624,0.3209905624389648,0.319052129983902,0.321
2707936763763,0.3196887373924255,0.3188316226005554,0.3164899051189422,0.3241994678974151,0.3179469406604767,0.3214083909988403,0.3206575512886047,0.3263285160064697,0.3219505250453949,0.3181525468826294,0.3219776451587677,0.3259726762771606,0.3197665512561798,0.3236161768436432,0.3177970349788666,0.3258080780506134,0.3208407461643219,0.3251138925552368,0.3242645859718323,0.3229723274707794,0.3227455914020538,0.3206316232681274,0.3256695866584778,0.3241210877895355,0.3224890530109405,0.3263737261295318,0.3214233517646789,0.3240345120429992,0.3222567737102508,0.3242291808128357,0.3257078528404236,0.3278365731239319,0.3277338743209839,0.3253948092460632,0.3232105076313019,0.3267974853515625,0.3263654410839081,0.3262891769409179,0.3238334357738495,0.3294911682605743,0.3261866867542267,0.3243315815925598,0.3250119090080261,0.326727420091629,0.3268802464008331,0.3269768059253692,0.3257980346679687,0.3280686736106872,0.3274897634983063,0.3282252252101898,0.3272863030433655,0.328346699476242,0.325562834739685,0.3301684856414795,0.3284023404121399,0.3268299400806427,0.3286610245704651,0.3291078805923462,0.324972927570343,0.3314772248268127,0.3278062343597412,0.326839417219162,0.3277239501476288,0.330414742231369,0.3271744549274444,0.3279334008693695,0.3288575112819671,0.3285425007343292,0.3282454907894134,0.3296376466751098,0.3305942714214325,0.3276287615299225,0.3292438983917236,0.329515129327774,0.3281475007534027,0.3282177448272705,0.3333999514579773,0.3302631080150604,0.330238401889801,0.3323166668415069,0.3313035368919372,0.32961106300354,0.3321967124938965],"label":"FineWeb independent MinHash"}},"layout":{"xaxis":{"title":{"text":"Training tokens (billions)"}},"yaxis":{"title":{"text":"Agg Score"}},"title":{"text":"Independent dedup outperforms dedup across dumps"}}}
dist/assets/data/plots/cross_ind_unfiltered_comparison/openbookqa_acc_norm.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"data":{"big-run-refinedweb":{"x":[0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002,29.360128000000003,31.45728,33.554432000000006,35.651584,37.748736,39.845888,41.94304,44.040192000000005,46.137344000000006,48.234496,50.331648,52.4288,54.525952000000004,56.623104000000005,58.720256000000006,60.817408,62.91456,65.011712,67.10886400000001,69.206016,71.303168,73.40032000000001,75.497472,77.59462400000001,79.691776,81.788928,83.88608,85.983232,88.08038400000001,90.177536,92.27468800000001,94.37184,96.468992,98.56614400000001,100.663296,102.76044800000001,104.8576,106.95475200000001,109.05190400000001,111.149056,113.24620800000001,115.34336,117.44051200000001,119.537664,121.634816,123.73196800000001,125.82912,127.92627200000001,130.023424,132.120576,134.21772800000002,136.31488000000002,138.412032,140.509184,142.606336,144.70348800000002,146.80064000000002,148.897792,150.994944,153.092096,155.18924800000002,157.28640000000001,159.383552,161.480704,163.577856,165.67500800000002,167.77216,169.869312,171.966464,174.06361600000002,176.16076800000002,178.25792,180.355072,182.452224,184.54937600000002,186.64652800000002,188.74368,190.840832,192.937984,195.03513600000002,197.13228800000002,199.22944,201.326592,203.423744,205.52089600000002,207.61804800000002,209.7152,211.812352,213.90950400000003,216.00665600000002,218.10380800000001,220.20096,222.298112,224.39526400000003,226.49241600000002,228.589568,230.68672,232.783872,234.88102400000002,236.97817600000002,239.075328,241.17248,243.269632,245.36678400000002,247.46393600000002,249.561088,251.65824,253.75539200000003,255.85254400000002,257.949696,260.046848,262.144,264.241152,266.338304,268.43545600000004,270.53260800000004,272.62976000000003,274.726912,276.824064,278.921216,281.018368,283.11552,285.212672,287.309824,289.40697600000004,291.50412800000004,293.6012800
0000003,295.698432,297.795584,299.892736,301.989888,304.08704,306.184192,308.28134400000005,310.37849600000004,312.47564800000004,314.57280000000003,316.669952,318.767104,320.864256,322.961408,325.05856,327.155712,329.25286400000005,331.35001600000004,333.44716800000003,335.54432,337.641472,339.738624,341.835776,343.932928,346.03008,348.12723200000005,350.22438400000004],"y":[0.2860000133514404,0.2560000121593475,0.2840000092983246,0.3059999942779541,0.3059999942779541,0.2980000078678131,0.3240000009536743,0.3100000023841858,0.3000000119209289,0.3160000145435333,0.3140000104904175,0.3260000050067901,0.3199999928474426,0.2980000078678131,0.3179999887943268,0.3179999887943268,0.3319999873638153,0.3019999861717224,0.2939999997615814,0.3319999873638153,0.3319999873638153,0.3219999969005584,0.3379999995231628,0.3379999995231628,0.3339999914169311,0.3240000009536743,0.3479999899864197,0.3300000131130218,0.3240000009536743,0.3300000131130218,0.3400000035762787,0.3459999859333038,0.3319999873638153,0.3379999995231628,0.356000006198883,0.3339999914169311,0.3459999859333038,0.3440000116825104,0.3519999980926513,0.3479999899864197,0.3339999914169311,0.3400000035762787,0.3479999899864197,0.3379999995231628,0.3479999899864197,0.3499999940395355,0.3400000035762787,0.3499999940395355,0.3420000076293945,0.3659999966621399,0.3400000035762787,0.3459999859333038,0.3499999940395355,0.356000006198883,0.3400000035762787,0.356000006198883,0.3339999914169311,0.3339999914169311,0.3479999899864197,0.3420000076293945,0.3580000102519989,0.3339999914169311,0.3440000116825104,0.3400000035762787,0.3499999940395355,0.3540000021457672,0.3479999899864197,0.3499999940395355,0.3420000076293945,0.3379999995231628,0.335999995470047,0.356000006198883,0.3459999859333038,0.3499999940395355,0.3400000035762787,0.3440000116825104,0.356000006198883,0.3519999980926513,0.3400000035762787,0.3440000116825104,0.356000006198883,0.3400000035762787,0.356000006198883,0.3600000143051147,0.3540000021457672,0.347999989986
4197,0.3379999995231628,0.3440000116825104,0.3300000131130218,0.3400000035762787,0.3459999859333038,0.3339999914169311,0.3499999940395355,0.3600000143051147,0.3440000116825104,0.3499999940395355,0.356000006198883,0.3420000076293945,0.3479999899864197,0.3379999995231628,0.3379999995231628,0.3459999859333038,0.356000006198883,0.328000009059906,0.3459999859333038,0.3519999980926513,0.3499999940395355,0.3519999980926513,0.3420000076293945,0.3499999940395355,0.3420000076293945,0.3339999914169311,0.335999995470047,0.3379999995231628,0.3379999995231628,0.3540000021457672,0.356000006198883,0.356000006198883,0.335999995470047,0.363999992609024,0.363999992609024,0.3499999940395355,0.356000006198883,0.3519999980926513,0.3519999980926513,0.3540000021457672,0.3459999859333038,0.3479999899864197,0.3519999980926513,0.3519999980926513,0.3420000076293945,0.3440000116825104,0.3379999995231628,0.3519999980926513,0.356000006198883,0.3420000076293945,0.3580000102519989,0.3499999940395355,0.3619999885559082,0.3519999980926513,0.3600000143051147,0.3459999859333038,0.3519999980926513,0.3519999980926513,0.3499999940395355,0.3580000102519989,0.356000006198883,0.3580000102519989,0.3600000143051147,0.3440000116825104,0.3600000143051147,0.3440000116825104,0.3479999899864197,0.3479999899864197,0.3580000102519989,0.3600000143051147,0.3580000102519989,0.3540000021457672,0.3519999980926513,0.3459999859333038,0.3459999859333038,0.3540000021457672,0.335999995470047,0.3540000021457672,0.3540000021457672,0.3519999980926513,0.356000006198883,0.3499999940395355,0.356000006198883],"label":"RefinedWeb"},"big-run-fineweb-cross-dedup-fixed":{"x":[0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,29.360128000000003,31.45728,33.554432000000006,35.651584,37.748736,39.845888,41.94304,44.040192000000005,46.137344000000006,48.234496,50.331648,52.4288,54.525952000000004,56.6231
04000000005,58.720256000000006,60.817408,62.91456,65.011712,67.10886400000001,69.206016,71.303168,73.40032000000001,75.497472,77.59462400000001,79.691776,81.788928,83.88608,85.983232,88.08038400000001,90.177536,92.27468800000001,94.37184,96.468992,98.56614400000001,100.663296,102.76044800000001,104.8576,106.95475200000001,109.05190400000001,111.149056,113.24620800000001,115.34336,117.44051200000001,119.537664,121.634816,123.73196800000001,125.82912,127.92627200000001,130.023424,132.120576,134.21772800000002,136.31488000000002,138.412032,140.509184,142.606336,144.70348800000002,146.80064000000002,148.897792,150.994944,153.092096,155.18924800000002,157.28640000000001,159.383552,161.480704,163.577856,165.67500800000002,167.77216,169.869312,171.966464,174.06361600000002,176.16076800000002,178.25792,180.355072,182.452224,184.54937600000002,186.64652800000002,188.74368,190.840832,192.937984,195.03513600000002,197.13228800000002,199.22944,201.326592,203.423744,205.52089600000002,207.61804800000002,209.7152,211.812352,213.90950400000003,216.00665600000002,218.10380800000001,220.20096,222.298112,224.39526400000003,226.49241600000002,228.589568,230.68672,232.783872,234.88102400000002,236.97817600000002,239.075328,241.17248,243.269632,245.36678400000002,247.46393600000002,249.561088,251.65824,253.75539200000003,255.85254400000002,257.949696,260.046848,262.144,264.241152,266.338304,268.43545600000004,270.53260800000004,272.62976000000003,274.726912,276.824064,278.921216,281.018368,283.11552,285.212672,287.309824,289.40697600000004,291.50412800000004,293.60128000000003,295.698432,297.795584,299.892736,301.989888,304.08704,306.184192,308.28134400000005,310.37849600000004,312.47564800000004,314.57280000000003,316.669952,318.767104,320.864256,322.961408,325.05856,327.155712,329.25286400000005,331.35001600000004,333.44716800000003,335.54432,337.641472,339.738624,341.835776,343.932928,346.03008,348.12723200000005,350.22438400000004],"y":[0.2860000133514404,0.2460000067949295,0.272000
0147819519,0.270000010728836,0.2939999997615814,0.2960000038146972,0.3240000009536743,0.3019999861717224,0.2879999876022339,0.3179999887943268,0.3059999942779541,0.2899999916553497,0.3100000023841858,0.3179999887943268,0.3219999969005584,0.3219999969005584,0.3300000131130218,0.3140000104904175,0.3240000009536743,0.3079999983310699,0.3260000050067901,0.3120000064373016,0.3160000145435333,0.3179999887943268,0.3260000050067901,0.3260000050067901,0.3240000009536743,0.3379999995231628,0.3219999969005584,0.3319999873638153,0.3379999995231628,0.3339999914169311,0.328000009059906,0.3319999873638153,0.3199999928474426,0.3000000119209289,0.3260000050067901,0.3240000009536743,0.328000009059906,0.3240000009536743,0.328000009059906,0.3260000050067901,0.3440000116825104,0.3199999928474426,0.3319999873638153,0.3219999969005584,0.335999995470047,0.3519999980926513,0.3379999995231628,0.328000009059906,0.3300000131130218,0.335999995470047,0.3479999899864197,0.3459999859333038,0.3479999899864197,0.3540000021457672,0.3479999899864197,0.3300000131130218,0.356000006198883,0.3479999899864197,0.356000006198883,0.335999995470047,0.335999995470047,0.3479999899864197,0.3339999914169311,0.3540000021457672,0.3300000131130218,0.3479999899864197,0.3499999940395355,0.3400000035762787,0.3459999859333038,0.3339999914169311,0.3479999899864197,0.335999995470047,0.3400000035762787,0.3179999887943268,0.335999995470047,0.328000009059906,0.328000009059906,0.3540000021457672,0.3479999899864197,0.3420000076293945,0.3580000102519989,0.3459999859333038,0.3420000076293945,0.3459999859333038,0.3440000116825104,0.3499999940395355,0.335999995470047,0.3540000021457672,0.356000006198883,0.3400000035762787,0.3600000143051147,0.3580000102519989,0.3519999980926513,0.3499999940395355,0.3540000021457672,0.3519999980926513,0.3499999940395355,0.3440000116825104,0.356000006198883,0.3479999899864197,0.3479999899864197,0.3440000116825104,0.3499999940395355,0.3440000116825104,0.3519999980926513,0.3440000116825104,0.3560000061
98883,0.3459999859333038,0.3580000102519989,0.356000006198883,0.3519999980926513,0.3420000076293945,0.3379999995231628,0.3479999899864197,0.3459999859333038,0.3499999940395355,0.3400000035762787,0.3440000116825104,0.3420000076293945,0.3420000076293945,0.3499999940395355,0.3459999859333038,0.3420000076293945,0.3459999859333038,0.3459999859333038,0.3479999899864197,0.3440000116825104,0.3720000088214874,0.3619999885559082,0.356000006198883,0.3519999980926513,0.3459999859333038,0.3440000116825104,0.3420000076293945,0.3580000102519989,0.3600000143051147,0.3519999980926513,0.3600000143051147,0.3440000116825104,0.3600000143051147,0.3619999885559082,0.3499999940395355,0.3499999940395355,0.363999992609024,0.3580000102519989,0.3499999940395355,0.3479999899864197,0.3479999899864197,0.3580000102519989,0.3540000021457672,0.3600000143051147,0.3420000076293945,0.3519999980926513,0.3440000116825104,0.3519999980926513,0.3540000021457672,0.356000006198883,0.3459999859333038,0.3499999940395355,0.3519999980926513,0.3580000102519989,0.3440000116825104,0.3499999940395355,0.3580000102519989,0.3479999899864197,0.3479999899864197],"label":"FineWeb full 
MinHash"},"big-run-sampled_full_filtered_no_dedup":{"x":[0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002,29.360128000000003,31.45728,33.554432000000006,35.651584,37.748736,39.845888,41.94304,44.040192000000005,46.137344000000006,48.234496,50.331648,52.4288,54.525952000000004,56.623104000000005,58.720256000000006,60.817408,62.91456,65.011712,67.10886400000001,69.206016,71.303168,73.40032000000001,75.497472,77.59462400000001,79.691776,81.788928,83.88608,85.983232,88.08038400000001,90.177536,92.27468800000001,94.37184,96.468992,98.56614400000001,100.663296,102.76044800000001,104.8576,106.95475200000001,109.05190400000001,111.149056,113.24620800000001,115.34336,117.44051200000001,119.537664,121.634816,123.73196800000001,125.82912,127.92627200000001,130.023424,132.120576,134.21772800000002,136.31488000000002,138.412032,140.509184,142.606336,144.70348800000002,146.80064000000002,148.897792,150.994944,153.092096,155.18924800000002,157.28640000000001,159.383552,161.480704,163.577856,165.67500800000002,167.77216,169.869312,171.966464,174.06361600000002,176.16076800000002,178.25792,180.355072,182.452224,184.54937600000002,186.64652800000002,188.74368,190.840832,192.937984,195.03513600000002,197.13228800000002,199.22944,201.326592,203.423744,205.52089600000002,207.61804800000002,209.7152,211.812352,213.90950400000003,216.00665600000002,218.10380800000001,220.20096,222.298112,224.39526400000003,226.49241600000002,228.589568,230.68672,232.783872,234.88102400000002,236.97817600000002,239.075328,241.17248,243.269632,245.36678400000002,247.46393600000002,249.561088,251.65824,253.75539200000003,255.85254400000002,257.949696,260.046848,262.144,264.241152,266.338304,268.43545600000004,270.53260800000004,272.62976000000003,274.726912,276.824064,278.921216,281.018368,283.11552,285.212672,287.309824,289.40697600000004,291.5041280
0000004,293.60128000000003,295.698432,297.795584,299.892736,301.989888,304.08704,306.184192,308.28134400000005,310.37849600000004,312.47564800000004,314.57280000000003,316.669952,318.767104,320.864256,322.961408,325.05856,327.155712,329.25286400000005,331.35001600000004,333.44716800000003,335.54432,337.641472,339.738624,341.835776,343.932928,346.03008,348.12723200000005,350.22438400000004],"y":[0.2860000133514404,0.2560000121593475,0.2720000147819519,0.2980000078678131,0.2840000092983246,0.2879999876022339,0.3039999902248382,0.2860000133514404,0.2899999916553497,0.3019999861717224,0.2960000038146972,0.3039999902248382,0.3100000023841858,0.3160000145435333,0.3260000050067901,0.3160000145435333,0.3260000050067901,0.3179999887943268,0.3420000076293945,0.3219999969005584,0.328000009059906,0.3240000009536743,0.3300000131130218,0.328000009059906,0.3199999928474426,0.3379999995231628,0.3400000035762787,0.3240000009536743,0.3120000064373016,0.3319999873638153,0.3260000050067901,0.3120000064373016,0.3160000145435333,0.3140000104904175,0.3179999887943268,0.3160000145435333,0.3199999928474426,0.3240000009536743,0.3260000050067901,0.3179999887943268,0.3300000131130218,0.3179999887943268,0.328000009059906,0.3240000009536743,0.328000009059906,0.3260000050067901,0.3199999928474426,0.3400000035762787,0.3339999914169311,0.328000009059906,0.328000009059906,0.3339999914169311,0.328000009059906,0.328000009059906,0.335999995470047,0.3580000102519989,0.3499999940395355,0.3260000050067901,0.3499999940395355,0.3420000076293945,0.3160000145435333,0.3339999914169311,0.335999995470047,0.3400000035762787,0.3240000009536743,0.3319999873638153,0.3379999995231628,0.3400000035762787,0.3379999995231628,0.3319999873638153,0.3319999873638153,0.3440000116825104,0.3300000131130218,0.3219999969005584,0.3260000050067901,0.3219999969005584,0.3339999914169311,0.328000009059906,0.3300000131130218,0.3219999969005584,0.3379999995231628,0.3400000035762787,0.3319999873638153,0.328000009059906,0.3440000116825104
,0.3339999914169311,0.328000009059906,0.3379999995231628,0.3499999940395355,0.3339999914169311,0.3300000131130218,0.328000009059906,0.335999995470047,0.3240000009536743,0.335999995470047,0.3240000009536743,0.3400000035762787,0.3400000035762787,0.3420000076293945,0.3319999873638153,0.3339999914169311,0.3300000131130218,0.3400000035762787,0.3459999859333038,0.3400000035762787,0.3379999995231628,0.3459999859333038,0.3379999995231628,0.3300000131130218,0.3519999980926513,0.3379999995231628,0.356000006198883,0.335999995470047,0.3420000076293945,0.3400000035762787,0.328000009059906,0.3540000021457672,0.3499999940395355,0.3479999899864197,0.3440000116825104,0.3519999980926513,0.356000006198883,0.3540000021457672,0.3440000116825104,0.3499999940395355,0.356000006198883,0.356000006198883,0.356000006198883,0.363999992609024,0.3600000143051147,0.356000006198883,0.3479999899864197,0.356000006198883,0.3459999859333038,0.3479999899864197,0.3619999885559082,0.363999992609024,0.3499999940395355,0.3379999995231628,0.3479999899864197,0.3499999940395355,0.356000006198883,0.3519999980926513,0.3540000021457672,0.3619999885559082,0.3580000102519989,0.3540000021457672,0.356000006198883,0.3479999899864197,0.3519999980926513,0.356000006198883,0.3499999940395355,0.3379999995231628,0.3479999899864197,0.3499999940395355,0.3440000116825104,0.3580000102519989,0.356000006198883,0.3499999940395355,0.3479999899864197,0.3580000102519989,0.3519999980926513,0.3540000021457672,0.3519999980926513,0.3540000021457672,0.356000006198883,0.363999992609024,0.356000006198883,0.356000006198883],"label":"FineWeb filtered 
only"},"big-run-sampled_full_ind_minhash":{"x":[0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002,29.360128000000003,31.45728,33.554432000000006,35.651584,37.748736,39.845888,41.94304,44.040192000000005,46.137344000000006,48.234496,50.331648,52.4288,54.525952000000004,56.623104000000005,58.720256000000006,60.817408,62.91456,65.011712,67.10886400000001,69.206016,71.303168,73.40032000000001,75.497472,77.59462400000001,79.691776,81.788928,83.88608,85.983232,88.08038400000001,90.177536,92.27468800000001,94.37184,96.468992,98.56614400000001,100.663296,102.76044800000001,104.8576,106.95475200000001,109.05190400000001,111.149056,113.24620800000001,115.34336,117.44051200000001,119.537664,121.634816,123.73196800000001,125.82912,127.92627200000001,130.023424,132.120576,134.21772800000002,136.31488000000002,138.412032,140.509184,142.606336,144.70348800000002,146.80064000000002,148.897792,150.994944,153.092096,155.18924800000002,157.28640000000001,159.383552,161.480704,163.577856,165.67500800000002,167.77216,169.869312,171.966464,174.06361600000002,176.16076800000002,178.25792,180.355072,182.452224,184.54937600000002,186.64652800000002,188.74368,190.840832,192.937984,195.03513600000002,197.13228800000002,199.22944,201.326592,203.423744,205.52089600000002,207.61804800000002,209.7152,211.812352,213.90950400000003,216.00665600000002,218.10380800000001,220.20096,222.298112,224.39526400000003,226.49241600000002,228.589568,230.68672,232.783872,234.88102400000002,236.97817600000002,239.075328,241.17248,243.269632,245.36678400000002,247.46393600000002,249.561088,251.65824,253.75539200000003,255.85254400000002,257.949696,260.046848,262.144,264.241152,266.338304,268.43545600000004,270.53260800000004,272.62976000000003,274.726912,276.824064,278.921216,281.018368,283.11552,285.212672,287.309824,289.40697600000004,291.50412800000004,2
93.60128000000003,295.698432,297.795584,299.892736,301.989888,304.08704,306.184192,308.28134400000005,310.37849600000004,312.47564800000004,314.57280000000003,316.669952,318.767104,320.864256,322.961408,325.05856,327.155712,329.25286400000005,331.35001600000004,333.44716800000003,335.54432,337.641472,339.738624,341.835776,343.932928,346.03008,348.12723200000005,350.22438400000004],"y":[0.2860000133514404,0.2660000026226043,0.277999997138977,0.2820000052452087,0.3079999983310699,0.3140000104904175,0.3260000050067901,0.3039999902248382,0.3319999873638153,0.3240000009536743,0.3199999928474426,0.3379999995231628,0.3339999914169311,0.3319999873638153,0.3179999887943268,0.3319999873638153,0.3219999969005584,0.3319999873638153,0.3379999995231628,0.3199999928474426,0.3179999887943268,0.3400000035762787,0.3219999969005584,0.335999995470047,0.3339999914169311,0.3420000076293945,0.3240000009536743,0.3440000116825104,0.3420000076293945,0.3379999995231628,0.3459999859333038,0.328000009059906,0.3420000076293945,0.3459999859333038,0.3479999899864197,0.3379999995231628,0.356000006198883,0.3379999995231628,0.3440000116825104,0.3400000035762787,0.3379999995231628,0.3499999940395355,0.3540000021457672,0.3479999899864197,0.3479999899864197,0.3440000116825104,0.3459999859333038,0.3440000116825104,0.3519999980926513,0.356000006198883,0.3600000143051147,0.3379999995231628,0.356000006198883,0.3400000035762787,0.3519999980926513,0.3479999899864197,0.3479999899864197,0.3400000035762787,0.3459999859333038,0.3519999980926513,0.3440000116825104,0.3400000035762787,0.356000006198883,0.3420000076293945,0.356000006198883,0.3540000021457672,0.3600000143051147,0.3339999914169311,0.3499999940395355,0.3580000102519989,0.3440000116825104,0.3479999899864197,0.3580000102519989,0.3519999980926513,0.3339999914169311,0.3540000021457672,0.3459999859333038,0.3459999859333038,0.3400000035762787,0.356000006198883,0.356000006198883,0.3420000076293945,0.3420000076293945,0.3400000035762787,0.3479999899864197,0.3519
999980926513,0.3319999873638153,0.3580000102519989,0.356000006198883,0.356000006198883,0.3499999940395355,0.3479999899864197,0.3400000035762787,0.3440000116825104,0.3339999914169311,0.3379999995231628,0.3479999899864197,0.3680000007152557,0.3619999885559082,0.3440000116825104,0.3619999885559082,0.3580000102519989,0.356000006198883,0.3600000143051147,0.3519999980926513,0.3519999980926513,0.3459999859333038,0.3540000021457672,0.3600000143051147,0.356000006198883,0.3540000021457672,0.3519999980926513,0.356000006198883,0.3600000143051147,0.3540000021457672,0.3540000021457672,0.363999992609024,0.3580000102519989,0.3680000007152557,0.3580000102519989,0.356000006198883,0.3519999980926513,0.3519999980926513,0.3519999980926513,0.3459999859333038,0.3499999940395355,0.356000006198883,0.3540000021457672,0.3540000021457672,0.3659999966621399,0.3619999885559082,0.3420000076293945,0.363999992609024,0.3580000102519989,0.3619999885559082,0.3759999871253967,0.3740000128746032,0.363999992609024,0.3580000102519989,0.3700000047683716,0.3700000047683716,0.363999992609024,0.3440000116825104,0.3580000102519989,0.3680000007152557,0.3700000047683716,0.3740000128746032,0.3619999885559082,0.3619999885559082,0.3700000047683716,0.363999992609024,0.363999992609024,0.363999992609024,0.3700000047683716,0.3600000143051147,0.3680000007152557,0.363999992609024,0.3659999966621399,0.363999992609024,0.3680000007152557,0.3580000102519989,0.363999992609024,0.3659999966621399,0.363999992609024,0.3580000102519989,0.3600000143051147,0.3600000143051147,0.3580000102519989,0.3600000143051147],"label":"FineWeb independent MinHash"}},"layout":{"xaxis":{"title":{"text":"Training tokens (billions)"}},"yaxis":{"title":{"text":"Agg Score"}},"title":{"text":"Independent dedup outperforms dedup across dumps"}}}
dist/assets/data/plots/cross_ind_unfiltered_comparison/piqa_acc_norm.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"data":{"big-run-refinedweb":{"x":[0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002,29.360128000000003,31.45728,33.554432000000006,35.651584,37.748736,39.845888,41.94304,44.040192000000005,46.137344000000006,48.234496,50.331648,52.4288,54.525952000000004,56.623104000000005,58.720256000000006,60.817408,62.91456,65.011712,67.10886400000001,69.206016,71.303168,73.40032000000001,75.497472,77.59462400000001,79.691776,81.788928,83.88608,85.983232,88.08038400000001,90.177536,92.27468800000001,94.37184,96.468992,98.56614400000001,100.663296,102.76044800000001,104.8576,106.95475200000001,109.05190400000001,111.149056,113.24620800000001,115.34336,117.44051200000001,119.537664,121.634816,123.73196800000001,125.82912,127.92627200000001,130.023424,132.120576,134.21772800000002,136.31488000000002,138.412032,140.509184,142.606336,144.70348800000002,146.80064000000002,148.897792,150.994944,153.092096,155.18924800000002,157.28640000000001,159.383552,161.480704,163.577856,165.67500800000002,167.77216,169.869312,171.966464,174.06361600000002,176.16076800000002,178.25792,180.355072,182.452224,184.54937600000002,186.64652800000002,188.74368,190.840832,192.937984,195.03513600000002,197.13228800000002,199.22944,201.326592,203.423744,205.52089600000002,207.61804800000002,209.7152,211.812352,213.90950400000003,216.00665600000002,218.10380800000001,220.20096,222.298112,224.39526400000003,226.49241600000002,228.589568,230.68672,232.783872,234.88102400000002,236.97817600000002,239.075328,241.17248,243.269632,245.36678400000002,247.46393600000002,249.561088,251.65824,253.75539200000003,255.85254400000002,257.949696,260.046848,262.144,264.241152,266.338304,268.43545600000004,270.53260800000004,272.62976000000003,274.726912,276.824064,278.921216,281.018368,283.11552,285.212672,287.309824,289.40697600000004,291.50412800000004,293.6012800
0000003,295.698432,297.795584,299.892736,301.989888,304.08704,306.184192,308.28134400000005,310.37849600000004,312.47564800000004,314.57280000000003,316.669952,318.767104,320.864256,322.961408,325.05856,327.155712,329.25286400000005,331.35001600000004,333.44716800000003,335.54432,337.641472,339.738624,341.835776,343.932928,346.03008,348.12723200000005,350.22438400000004],"y":[0.5099999904632568,0.6019999980926514,0.652999997138977,0.6710000038146973,0.6740000247955322,0.6899999976158142,0.6919999718666077,0.6909999847412109,0.7070000171661377,0.7089999914169312,0.7129999995231628,0.7229999899864197,0.7120000123977661,0.7200000286102295,0.7300000190734863,0.7279999852180481,0.7369999885559082,0.7390000224113464,0.7350000143051147,0.7319999933242798,0.7279999852180481,0.7269999980926514,0.7459999918937683,0.7400000095367432,0.7390000224113464,0.7319999933242798,0.7390000224113464,0.7379999756813049,0.7390000224113464,0.7360000014305115,0.7440000176429749,0.7400000095367432,0.7360000014305115,0.7480000257492065,0.7360000014305115,0.7440000176429749,0.7459999918937683,0.7409999966621399,0.746999979019165,0.7440000176429749,0.7450000047683716,0.753000020980835,0.7390000224113464,0.7490000128746033,0.7419999837875366,0.7390000224113464,0.7559999823570251,0.7519999742507935,0.7549999952316284,0.7419999837875366,0.7490000128746033,0.7540000081062317,0.7480000257492065,0.7450000047683716,0.7429999709129333,0.7509999871253967,0.7549999952316284,0.7490000128746033,0.7490000128746033,0.7400000095367432,0.753000020980835,0.75,0.7509999871253967,0.7570000290870667,0.7590000033378601,0.7570000290870667,0.7329999804496765,0.7540000081062317,0.746999979019165,0.7409999966621399,0.7590000033378601,0.7509999871253967,0.7570000290870667,0.75,0.7540000081062317,0.7480000257492065,0.7580000162124634,0.7639999985694885,0.7630000114440918,0.7590000033378601,0.7549999952316284,0.7480000257492065,0.7509999871253967,0.7570000290870667,0.75,0.7540000081062317,0.7480000257492065,0.7549999952316
284,0.7559999823570251,0.7580000162124634,0.7580000162124634,0.753000020980835,0.7490000128746033,0.7540000081062317,0.7639999985694885,0.7580000162124634,0.7519999742507935,0.7590000033378601,0.75,0.7570000290870667,0.7620000243186951,0.7710000276565552,0.7739999890327454,0.7620000243186951,0.7549999952316284,0.7599999904632568,0.765999972820282,0.7680000066757202,0.7639999985694885,0.7540000081062317,0.7649999856948853,0.7649999856948853,0.7609999775886536,0.7549999952316284,0.765999972820282,0.7639999985694885,0.7580000162124634,0.7710000276565552,0.7570000290870667,0.7630000114440918,0.7580000162124634,0.7599999904632568,0.7649999856948853,0.7670000195503235,0.7699999809265137,0.7710000276565552,0.7559999823570251,0.7609999775886536,0.7620000243186951,0.7620000243186951,0.7609999775886536,0.753000020980835,0.7570000290870667,0.7620000243186951,0.7609999775886536,0.7609999775886536,0.7559999823570251,0.7540000081062317,0.7570000290870667,0.7639999985694885,0.7590000033378601,0.7680000066757202,0.7680000066757202,0.765999972820282,0.765999972820282,0.7670000195503235,0.7739999890327454,0.7649999856948853,0.7749999761581421,0.7699999809265137,0.7639999985694885,0.7680000066757202,0.7630000114440918,0.7680000066757202,0.7699999809265137,0.7739999890327454,0.7749999761581421,0.765999972820282,0.7680000066757202,0.7710000276565552,0.7680000066757202,0.765999972820282,0.7689999938011169,0.7760000228881836,0.7710000276565552,0.7680000066757202,0.7649999856948853,0.7720000147819519,0.7730000019073486],"label":"RefinedWeb"},"big-run-fineweb-cross-dedup-fixed":{"x":[0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,29.360128000000003,31.45728,33.554432000000006,35.651584,37.748736,39.845888,41.94304,44.040192000000005,46.137344000000006,48.234496,50.331648,52.4288,54.525952000000004,56.623104000000005,58.720256000000006,60.817408,62.91
456,65.011712,67.10886400000001,69.206016,71.303168,73.40032000000001,75.497472,77.59462400000001,79.691776,81.788928,83.88608,85.983232,88.08038400000001,90.177536,92.27468800000001,94.37184,96.468992,98.56614400000001,100.663296,102.76044800000001,104.8576,106.95475200000001,109.05190400000001,111.149056,113.24620800000001,115.34336,117.44051200000001,119.537664,121.634816,123.73196800000001,125.82912,127.92627200000001,130.023424,132.120576,134.21772800000002,136.31488000000002,138.412032,140.509184,142.606336,144.70348800000002,146.80064000000002,148.897792,150.994944,153.092096,155.18924800000002,157.28640000000001,159.383552,161.480704,163.577856,165.67500800000002,167.77216,169.869312,171.966464,174.06361600000002,176.16076800000002,178.25792,180.355072,182.452224,184.54937600000002,186.64652800000002,188.74368,190.840832,192.937984,195.03513600000002,197.13228800000002,199.22944,201.326592,203.423744,205.52089600000002,207.61804800000002,209.7152,211.812352,213.90950400000003,216.00665600000002,218.10380800000001,220.20096,222.298112,224.39526400000003,226.49241600000002,228.589568,230.68672,232.783872,234.88102400000002,236.97817600000002,239.075328,241.17248,243.269632,245.36678400000002,247.46393600000002,249.561088,251.65824,253.75539200000003,255.85254400000002,257.949696,260.046848,262.144,264.241152,266.338304,268.43545600000004,270.53260800000004,272.62976000000003,274.726912,276.824064,278.921216,281.018368,283.11552,285.212672,287.309824,289.40697600000004,291.50412800000004,293.60128000000003,295.698432,297.795584,299.892736,301.989888,304.08704,306.184192,308.28134400000005,310.37849600000004,312.47564800000004,314.57280000000003,316.669952,318.767104,320.864256,322.961408,325.05856,327.155712,329.25286400000005,331.35001600000004,333.44716800000003,335.54432,337.641472,339.738624,341.835776,343.932928,346.03008,348.12723200000005,350.22438400000004],"y":[0.5099999904632568,0.6169999837875366,0.6359999775886536,0.6769999861717224,0.67699998617172
24,0.6970000267028809,0.6990000009536743,0.6970000267028809,0.6959999799728394,0.7049999833106995,0.7089999914169312,0.7179999947547913,0.7099999785423279,0.7160000205039978,0.7260000109672546,0.7229999899864197,0.7179999947547913,0.7210000157356262,0.7200000286102295,0.734000027179718,0.7089999914169312,0.7229999899864197,0.7239999771118164,0.7310000061988831,0.7300000190734863,0.7260000109672546,0.7250000238418579,0.7239999771118164,0.7289999723434448,0.7390000224113464,0.7229999899864197,0.7310000061988831,0.7350000143051147,0.7289999723434448,0.734000027179718,0.7289999723434448,0.7329999804496765,0.7300000190734863,0.7319999933242798,0.7440000176429749,0.746999979019165,0.7310000061988831,0.7329999804496765,0.7480000257492065,0.7429999709129333,0.7369999885559082,0.7269999980926514,0.7269999980926514,0.7379999756813049,0.75,0.7360000014305115,0.746999979019165,0.7409999966621399,0.7369999885559082,0.7459999918937683,0.7400000095367432,0.7409999966621399,0.746999979019165,0.7360000014305115,0.7459999918937683,0.7400000095367432,0.7429999709129333,0.7350000143051147,0.7390000224113464,0.7379999756813049,0.7480000257492065,0.7329999804496765,0.734000027179718,0.7390000224113464,0.7459999918937683,0.7360000014305115,0.7419999837875366,0.7429999709129333,0.7400000095367432,0.7379999756813049,0.7310000061988831,0.7360000014305115,0.7390000224113464,0.75,0.7369999885559082,0.7570000290870667,0.7409999966621399,0.7459999918937683,0.7350000143051147,0.7459999918937683,0.7509999871253967,0.7429999709129333,0.7419999837875366,0.7419999837875366,0.75,0.7440000176429749,0.7450000047683716,0.75,0.7409999966621399,0.7490000128746033,0.7409999966621399,0.7419999837875366,0.7429999709129333,0.7490000128746033,0.7419999837875366,0.7419999837875366,0.75,0.753000020980835,0.75,0.746999979019165,0.7519999742507935,0.746999979019165,0.7570000290870667,0.7549999952316284,0.75,0.7540000081062317,0.7480000257492065,0.7490000128746033,0.7419999837875366,0.7419999837875366,0.746999979019
165,0.746999979019165,0.75,0.7519999742507935,0.7580000162124634,0.7549999952316284,0.7490000128746033,0.7480000257492065,0.7519999742507935,0.7590000033378601,0.7450000047683716,0.75,0.7440000176429749,0.7419999837875366,0.7519999742507935,0.7450000047683716,0.753000020980835,0.7450000047683716,0.7440000176429749,0.7559999823570251,0.7509999871253967,0.7540000081062317,0.7440000176429749,0.7509999871253967,0.753000020980835,0.7490000128746033,0.7570000290870667,0.7490000128746033,0.746999979019165,0.746999979019165,0.7509999871253967,0.7509999871253967,0.7519999742507935,0.7570000290870667,0.7540000081062317,0.7440000176429749,0.7480000257492065,0.7509999871253967,0.7509999871253967,0.7509999871253967,0.7549999952316284,0.75,0.7559999823570251,0.746999979019165,0.7609999775886536,0.7549999952316284,0.746999979019165,0.7490000128746033,0.753000020980835,0.753000020980835,0.7609999775886536,0.746999979019165,0.7580000162124634],"label":"FineWeb full MinHash"},"big-run-sampled_full_filtered_no_dedup":{"x":[0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002,29.360128000000003,31.45728,33.554432000000006,35.651584,37.748736,39.845888,41.94304,44.040192000000005,46.137344000000006,48.234496,50.331648,52.4288,54.525952000000004,56.623104000000005,58.720256000000006,60.817408,62.91456,65.011712,67.10886400000001,69.206016,71.303168,73.40032000000001,75.497472,77.59462400000001,79.691776,81.788928,83.88608,85.983232,88.08038400000001,90.177536,92.27468800000001,94.37184,96.468992,98.56614400000001,100.663296,102.76044800000001,104.8576,106.95475200000001,109.05190400000001,111.149056,113.24620800000001,115.34336,117.44051200000001,119.537664,121.634816,123.73196800000001,125.82912,127.92627200000001,130.023424,132.120576,134.21772800000002,136.31488000000002,138.412032,140.509184,142.606336,144.70348800000002,146.80064
000000002,148.897792,150.994944,153.092096,155.18924800000002,157.28640000000001,159.383552,161.480704,163.577856,165.67500800000002,167.77216,169.869312,171.966464,174.06361600000002,176.16076800000002,178.25792,180.355072,182.452224,184.54937600000002,186.64652800000002,188.74368,190.840832,192.937984,195.03513600000002,197.13228800000002,199.22944,201.326592,203.423744,205.52089600000002,207.61804800000002,209.7152,211.812352,213.90950400000003,216.00665600000002,218.10380800000001,220.20096,222.298112,224.39526400000003,226.49241600000002,228.589568,230.68672,232.783872,234.88102400000002,236.97817600000002,239.075328,241.17248,243.269632,245.36678400000002,247.46393600000002,249.561088,251.65824,253.75539200000003,255.85254400000002,257.949696,260.046848,262.144,264.241152,266.338304,268.43545600000004,270.53260800000004,272.62976000000003,274.726912,276.824064,278.921216,281.018368,283.11552,285.212672,287.309824,289.40697600000004,291.50412800000004,293.60128000000003,295.698432,297.795584,299.892736,301.989888,304.08704,306.184192,308.28134400000005,310.37849600000004,312.47564800000004,314.57280000000003,316.669952,318.767104,320.864256,322.961408,325.05856,327.155712,329.25286400000005,331.35001600000004,333.44716800000003,335.54432,337.641472,339.738624,341.835776,343.932928,346.03008,348.12723200000005,350.22438400000004],"y":[0.5099999904632568,0.621999979019165,0.6439999938011169,0.6700000166893005,0.6790000200271606,0.6869999766349792,0.6959999799728394,0.6790000200271606,0.6880000233650208,0.7049999833106995,0.699999988079071,0.6990000009536743,0.6940000057220459,0.7110000252723694,0.7089999914169312,0.7120000123977661,0.7070000171661377,0.7070000171661377,0.6990000009536743,0.7009999752044678,0.7160000205039978,0.7200000286102295,0.7149999737739563,0.7250000238418579,0.7210000157356262,0.722000002861023,0.7310000061988831,0.7289999723434448,0.7319999933242798,0.7250000238418579,0.722000002861023,0.7210000157356262,0.7170000076293945,0.72600001096725
46,0.7250000238418579,0.7210000157356262,0.7200000286102295,0.7379999756813049,0.7239999771118164,0.7239999771118164,0.7080000042915344,0.7289999723434448,0.7289999723434448,0.7300000190734863,0.7329999804496765,0.7319999933242798,0.7350000143051147,0.7390000224113464,0.7350000143051147,0.7289999723434448,0.734000027179718,0.7329999804496765,0.7400000095367432,0.7409999966621399,0.7310000061988831,0.7350000143051147,0.7360000014305115,0.7360000014305115,0.7409999966621399,0.7319999933242798,0.7409999966621399,0.7400000095367432,0.7390000224113464,0.7329999804496765,0.7459999918937683,0.753000020980835,0.746999979019165,0.734000027179718,0.7369999885559082,0.7419999837875366,0.734000027179718,0.7419999837875366,0.7289999723434448,0.7350000143051147,0.7300000190734863,0.7519999742507935,0.7390000224113464,0.7400000095367432,0.7409999966621399,0.7429999709129333,0.7450000047683716,0.7329999804496765,0.7260000109672546,0.7570000290870667,0.7360000014305115,0.7519999742507935,0.7419999837875366,0.7379999756813049,0.7390000224113464,0.7490000128746033,0.734000027179718,0.7360000014305115,0.7390000224113464,0.7440000176429749,0.7450000047683716,0.7319999933242798,0.7429999709129333,0.7519999742507935,0.7540000081062317,0.7519999742507935,0.753000020980835,0.7480000257492065,0.7440000176429749,0.7459999918937683,0.7369999885559082,0.7419999837875366,0.7480000257492065,0.7419999837875366,0.765999972820282,0.746999979019165,0.7459999918937683,0.7570000290870667,0.7390000224113464,0.7409999966621399,0.7459999918937683,0.75,0.7570000290870667,0.753000020980835,0.7549999952316284,0.7519999742507935,0.7490000128746033,0.746999979019165,0.7459999918937683,0.7459999918937683,0.746999979019165,0.7409999966621399,0.7419999837875366,0.7459999918937683,0.7440000176429749,0.7459999918937683,0.7490000128746033,0.7450000047683716,0.7409999966621399,0.7419999837875366,0.7490000128746033,0.7590000033378601,0.7549999952316284,0.7549999952316284,0.746999979019165,0.753000020980835,0.754999995
2316284,0.746999979019165,0.7580000162124634,0.7490000128746033,0.753000020980835,0.75,0.75,0.7540000081062317,0.7540000081062317,0.7490000128746033,0.7570000290870667,0.7570000290870667,0.7590000033378601,0.7559999823570251,0.7620000243186951,0.7590000033378601,0.7509999871253967,0.7639999985694885,0.7580000162124634,0.7599999904632568,0.7620000243186951,0.7590000033378601,0.7609999775886536,0.7559999823570251,0.75,0.7509999871253967,0.7549999952316284,0.7540000081062317,0.7540000081062317],"label":"FineWeb filtered only"},"big-run-sampled_full_ind_minhash":{"x":[0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002,29.360128000000003,31.45728,33.554432000000006,35.651584,37.748736,39.845888,41.94304,44.040192000000005,46.137344000000006,48.234496,50.331648,52.4288,54.525952000000004,56.623104000000005,58.720256000000006,60.817408,62.91456,65.011712,67.10886400000001,69.206016,71.303168,73.40032000000001,75.497472,77.59462400000001,79.691776,81.788928,83.88608,85.983232,88.08038400000001,90.177536,92.27468800000001,94.37184,96.468992,98.56614400000001,100.663296,102.76044800000001,104.8576,106.95475200000001,109.05190400000001,111.149056,113.24620800000001,115.34336,117.44051200000001,119.537664,121.634816,123.73196800000001,125.82912,127.92627200000001,130.023424,132.120576,134.21772800000002,136.31488000000002,138.412032,140.509184,142.606336,144.70348800000002,146.80064000000002,148.897792,150.994944,153.092096,155.18924800000002,157.28640000000001,159.383552,161.480704,163.577856,165.67500800000002,167.77216,169.869312,171.966464,174.06361600000002,176.16076800000002,178.25792,180.355072,182.452224,184.54937600000002,186.64652800000002,188.74368,190.840832,192.937984,195.03513600000002,197.13228800000002,199.22944,201.326592,203.423744,205.52089600000002,207.61804800000002,209.7152,211.812352,213.909504000000
03,216.00665600000002,218.10380800000001,220.20096,222.298112,224.39526400000003,226.49241600000002,228.589568,230.68672,232.783872,234.88102400000002,236.97817600000002,239.075328,241.17248,243.269632,245.36678400000002,247.46393600000002,249.561088,251.65824,253.75539200000003,255.85254400000002,257.949696,260.046848,262.144,264.241152,266.338304,268.43545600000004,270.53260800000004,272.62976000000003,274.726912,276.824064,278.921216,281.018368,283.11552,285.212672,287.309824,289.40697600000004,291.50412800000004,293.60128000000003,295.698432,297.795584,299.892736,301.989888,304.08704,306.184192,308.28134400000005,310.37849600000004,312.47564800000004,314.57280000000003,316.669952,318.767104,320.864256,322.961408,325.05856,327.155712,329.25286400000005,331.35001600000004,333.44716800000003,335.54432,337.641472,339.738624,341.835776,343.932928,346.03008,348.12723200000005,350.22438400000004],"y":[0.5099999904632568,0.6209999918937683,0.6549999713897705,0.6800000071525574,0.6830000281333923,0.703000009059906,0.7020000219345093,0.7110000252723694,0.7160000205039978,0.7129999995231628,0.7210000157356262,0.7250000238418579,0.7210000157356262,0.7310000061988831,0.7269999980926514,0.7269999980926514,0.7329999804496765,0.7459999918937683,0.734000027179718,0.7409999966621399,0.7390000224113464,0.7350000143051147,0.7509999871253967,0.7440000176429749,0.7379999756813049,0.7599999904632568,0.7400000095367432,0.7409999966621399,0.7590000033378601,0.7409999966621399,0.7440000176429749,0.7400000095367432,0.7450000047683716,0.75,0.7440000176429749,0.7409999966621399,0.7429999709129333,0.7440000176429749,0.7440000176429749,0.7559999823570251,0.7459999918937683,0.7559999823570251,0.7540000081062317,0.7599999904632568,0.7559999823570251,0.7490000128746033,0.7490000128746033,0.7429999709129333,0.7609999775886536,0.7519999742507935,0.7480000257492065,0.7490000128746033,0.7620000243186951,0.7580000162124634,0.7580000162124634,0.7540000081062317,0.7509999871253967,0.7519999742507935,0.
7440000176429749,0.7459999918937683,0.7559999823570251,0.7620000243186951,0.746999979019165,0.7570000290870667,0.7620000243186951,0.7570000290870667,0.7540000081062317,0.7540000081062317,0.7570000290870667,0.7590000033378601,0.7519999742507935,0.75,0.7559999823570251,0.7590000033378601,0.7559999823570251,0.7519999742507935,0.7639999985694885,0.7620000243186951,0.7549999952316284,0.7490000128746033,0.7559999823570251,0.7639999985694885,0.7609999775886536,0.7609999775886536,0.7519999742507935,0.7549999952316284,0.7570000290870667,0.7620000243186951,0.7599999904632568,0.7639999985694885,0.7559999823570251,0.753000020980835,0.7649999856948853,0.753000020980835,0.7549999952316284,0.7609999775886536,0.7599999904632568,0.7680000066757202,0.7540000081062317,0.7559999823570251,0.7590000033378601,0.7590000033378601,0.7649999856948853,0.7639999985694885,0.7710000276565552,0.7699999809265137,0.7609999775886536,0.765999972820282,0.7670000195503235,0.7720000147819519,0.7639999985694885,0.7609999775886536,0.7549999952316284,0.7630000114440918,0.7670000195503235,0.7599999904632568,0.765999972820282,0.7670000195503235,0.7670000195503235,0.7670000195503235,0.7720000147819519,0.7760000228881836,0.7710000276565552,0.7829999923706055,0.7630000114440918,0.7720000147819519,0.7649999856948853,0.7630000114440918,0.7699999809265137,0.7720000147819519,0.7720000147819519,0.7689999938011169,0.777999997138977,0.7689999938011169,0.7760000228881836,0.7730000019073486,0.7799999713897705,0.7720000147819519,0.7760000228881836,0.7710000276565552,0.7770000100135803,0.777999997138977,0.7670000195503235,0.7789999842643738,0.7799999713897705,0.7749999761581421,0.7730000019073486,0.777999997138977,0.777999997138977,0.7799999713897705,0.7770000100135803,0.7770000100135803,0.7789999842643738,0.7760000228881836,0.7770000100135803,0.7770000100135803,0.7770000100135803,0.7739999890327454,0.7689999938011169,0.7760000228881836,0.777999997138977,0.7699999809265137,0.7739999890327454,0.7670000195503235,0.7699999809
265137,0.7710000276565552,0.7730000019073486,0.7739999890327454,0.7680000066757202],"label":"FineWeb independent MinHash"}},"layout":{"xaxis":{"title":{"text":"Training tokens (billions)"}},"yaxis":{"title":{"text":"Agg Score"}},"title":{"text":"Independent dedup outperforms dedup across dumps"}}}
dist/assets/data/plots/cross_ind_unfiltered_comparison/winogrande_acc_norm.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"data":{"big-run-refinedweb":{"x":[0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002,29.360128000000003,31.45728,33.554432000000006,35.651584,37.748736,39.845888,41.94304,44.040192000000005,46.137344000000006,48.234496,50.331648,52.4288,54.525952000000004,56.623104000000005,58.720256000000006,60.817408,62.91456,65.011712,67.10886400000001,69.206016,71.303168,73.40032000000001,75.497472,77.59462400000001,79.691776,81.788928,83.88608,85.983232,88.08038400000001,90.177536,92.27468800000001,94.37184,96.468992,98.56614400000001,100.663296,102.76044800000001,104.8576,106.95475200000001,109.05190400000001,111.149056,113.24620800000001,115.34336,117.44051200000001,119.537664,121.634816,123.73196800000001,125.82912,127.92627200000001,130.023424,132.120576,134.21772800000002,136.31488000000002,138.412032,140.509184,142.606336,144.70348800000002,146.80064000000002,148.897792,150.994944,153.092096,155.18924800000002,157.28640000000001,159.383552,161.480704,163.577856,165.67500800000002,167.77216,169.869312,171.966464,174.06361600000002,176.16076800000002,178.25792,180.355072,182.452224,184.54937600000002,186.64652800000002,188.74368,190.840832,192.937984,195.03513600000002,197.13228800000002,199.22944,201.326592,203.423744,205.52089600000002,207.61804800000002,209.7152,211.812352,213.90950400000003,216.00665600000002,218.10380800000001,220.20096,222.298112,224.39526400000003,226.49241600000002,228.589568,230.68672,232.783872,234.88102400000002,236.97817600000002,239.075328,241.17248,243.269632,245.36678400000002,247.46393600000002,249.561088,251.65824,253.75539200000003,255.85254400000002,257.949696,260.046848,262.144,264.241152,266.338304,268.43545600000004,270.53260800000004,272.62976000000003,274.726912,276.824064,278.921216,281.018368,283.11552,285.212672,287.309824,289.40697600000004,291.50412800000004,293.6012800
0000003,295.698432,297.795584,299.892736,301.989888,304.08704,306.184192,308.28134400000005,310.37849600000004,312.47564800000004,314.57280000000003,316.669952,318.767104,320.864256,322.961408,325.05856,327.155712,329.25286400000005,331.35001600000004,333.44716800000003,335.54432,337.641472,339.738624,341.835776,343.932928,346.03008,348.12723200000005,350.22438400000004],"y":[0.4970000088214874,0.5,0.4979999959468841,0.4950000047683716,0.4950000047683716,0.5049999952316284,0.5329999923706055,0.5220000147819519,0.5139999985694885,0.5339999794960022,0.5130000114440918,0.5389999747276306,0.5400000214576721,0.5270000100135803,0.5320000052452087,0.5260000228881836,0.5370000004768372,0.527999997138977,0.5289999842643738,0.5339999794960022,0.5270000100135803,0.531000018119812,0.527999997138977,0.5400000214576721,0.5479999780654907,0.550000011920929,0.5400000214576721,0.5350000262260437,0.5410000085830688,0.5379999876022339,0.5299999713897705,0.5490000247955322,0.5509999990463257,0.5519999861717224,0.5429999828338623,0.5429999828338623,0.5440000295639038,0.5379999876022339,0.5379999876022339,0.5419999957084656,0.5609999895095825,0.5540000200271606,0.5370000004768372,0.5440000295639038,0.5410000085830688,0.5379999876022339,0.5329999923706055,0.5419999957084656,0.5419999957084656,0.5519999861717224,0.550000011920929,0.5509999990463257,0.5400000214576721,0.5450000166893005,0.5509999990463257,0.5569999814033508,0.5550000071525574,0.5590000152587891,0.5479999780654907,0.5550000071525574,0.5440000295639038,0.5460000038146973,0.546999990940094,0.5559999942779541,0.5550000071525574,0.5490000247955322,0.5440000295639038,0.546999990940094,0.5450000166893005,0.546999990940094,0.5649999976158142,0.5490000247955322,0.5519999861717224,0.550000011920929,0.5509999990463257,0.5519999861717224,0.5519999861717224,0.5529999732971191,0.5490000247955322,0.546999990940094,0.550000011920929,0.5720000267028809,0.5619999766349792,0.5490000247955322,0.5680000185966492,0.5519999861717224,0.55699998140
33508,0.5509999990463257,0.5619999766349792,0.5630000233650208,0.5529999732971191,0.5619999766349792,0.5609999895095825,0.550000011920929,0.5479999780654907,0.5529999732971191,0.5519999861717224,0.5580000281333923,0.5590000152587891,0.5529999732971191,0.550000011920929,0.5680000185966492,0.5580000281333923,0.5630000233650208,0.5630000233650208,0.5559999942779541,0.5649999976158142,0.5569999814033508,0.5649999976158142,0.5659999847412109,0.5559999942779541,0.5659999847412109,0.5630000233650208,0.5509999990463257,0.5669999718666077,0.5669999718666077,0.5479999780654907,0.5540000200271606,0.5580000281333923,0.5519999861717224,0.5590000152587891,0.5590000152587891,0.5619999766349792,0.5509999990463257,0.546999990940094,0.5609999895095825,0.5540000200271606,0.5630000233650208,0.5580000281333923,0.5559999942779541,0.5680000185966492,0.5649999976158142,0.5619999766349792,0.5580000281333923,0.5630000233650208,0.5559999942779541,0.5540000200271606,0.5540000200271606,0.5569999814033508,0.5619999766349792,0.5559999942779541,0.5600000023841858,0.5460000038146973,0.5429999828338623,0.5580000281333923,0.5550000071525574,0.5580000281333923,0.5540000200271606,0.5609999895095825,0.5519999861717224,0.550000011920929,0.5519999861717224,0.5590000152587891,0.5619999766349792,0.5600000023841858,0.5590000152587891,0.5690000057220459,0.5640000104904175,0.5580000281333923,0.5559999942779541,0.5569999814033508,0.5569999814033508,0.5540000200271606,0.5640000104904175,0.5600000023841858,0.5550000071525574,0.5640000104904175,0.5600000023841858,0.5540000200271606],"label":"RefinedWeb"},"big-run-fineweb-cross-dedup-fixed":{"x":[0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,29.360128000000003,31.45728,33.554432000000006,35.651584,37.748736,39.845888,41.94304,44.040192000000005,46.137344000000006,48.234496,50.331648,52.4288,54.525952000000004,56.62310400000
0005,58.720256000000006,60.817408,62.91456,65.011712,67.10886400000001,69.206016,71.303168,73.40032000000001,75.497472,77.59462400000001,79.691776,81.788928,83.88608,85.983232,88.08038400000001,90.177536,92.27468800000001,94.37184,96.468992,98.56614400000001,100.663296,102.76044800000001,104.8576,106.95475200000001,109.05190400000001,111.149056,113.24620800000001,115.34336,117.44051200000001,119.537664,121.634816,123.73196800000001,125.82912,127.92627200000001,130.023424,132.120576,134.21772800000002,136.31488000000002,138.412032,140.509184,142.606336,144.70348800000002,146.80064000000002,148.897792,150.994944,153.092096,155.18924800000002,157.28640000000001,159.383552,161.480704,163.577856,165.67500800000002,167.77216,169.869312,171.966464,174.06361600000002,176.16076800000002,178.25792,180.355072,182.452224,184.54937600000002,186.64652800000002,188.74368,190.840832,192.937984,195.03513600000002,197.13228800000002,199.22944,201.326592,203.423744,205.52089600000002,207.61804800000002,209.7152,211.812352,213.90950400000003,216.00665600000002,218.10380800000001,220.20096,222.298112,224.39526400000003,226.49241600000002,228.589568,230.68672,232.783872,234.88102400000002,236.97817600000002,239.075328,241.17248,243.269632,245.36678400000002,247.46393600000002,249.561088,251.65824,253.75539200000003,255.85254400000002,257.949696,260.046848,262.144,264.241152,266.338304,268.43545600000004,270.53260800000004,272.62976000000003,274.726912,276.824064,278.921216,281.018368,283.11552,285.212672,287.309824,289.40697600000004,291.50412800000004,293.60128000000003,295.698432,297.795584,299.892736,301.989888,304.08704,306.184192,308.28134400000005,310.37849600000004,312.47564800000004,314.57280000000003,316.669952,318.767104,320.864256,322.961408,325.05856,327.155712,329.25286400000005,331.35001600000004,333.44716800000003,335.54432,337.641472,339.738624,341.835776,343.932928,346.03008,348.12723200000005,350.22438400000004],"y":[0.4970000088214874,0.4869999885559082,0.4959999918937
683,0.4979999959468841,0.5099999904632568,0.515999972820282,0.5080000162124634,0.5249999761581421,0.5239999890327454,0.5299999713897705,0.5239999890327454,0.5149999856948853,0.5270000100135803,0.5249999761581421,0.5180000066757202,0.5220000147819519,0.5329999923706055,0.5289999842643738,0.5239999890327454,0.5299999713897705,0.5230000019073486,0.5130000114440918,0.5180000066757202,0.5299999713897705,0.5199999809265137,0.5270000100135803,0.5230000019073486,0.5299999713897705,0.5320000052452087,0.5429999828338623,0.527999997138977,0.5379999876022339,0.527999997138977,0.5419999957084656,0.5329999923706055,0.5450000166893005,0.5320000052452087,0.5410000085830688,0.5249999761581421,0.5400000214576721,0.5249999761581421,0.5289999842643738,0.5320000052452087,0.5339999794960022,0.5320000052452087,0.5350000262260437,0.5400000214576721,0.5450000166893005,0.5440000295639038,0.5400000214576721,0.5379999876022339,0.5350000262260437,0.5410000085830688,0.5490000247955322,0.531000018119812,0.5389999747276306,0.546999990940094,0.5529999732971191,0.5370000004768372,0.5440000295639038,0.5400000214576721,0.5490000247955322,0.550000011920929,0.5580000281333923,0.5609999895095825,0.5429999828338623,0.5529999732971191,0.5519999861717224,0.5450000166893005,0.550000011920929,0.5379999876022339,0.5490000247955322,0.5460000038146973,0.5419999957084656,0.5569999814033508,0.5509999990463257,0.5490000247955322,0.5529999732971191,0.5479999780654907,0.5590000152587891,0.5479999780654907,0.5509999990463257,0.5440000295639038,0.5509999990463257,0.5540000200271606,0.5559999942779541,0.5630000233650208,0.5649999976158142,0.5640000104904175,0.5649999976158142,0.5490000247955322,0.5709999799728394,0.5659999847412109,0.5630000233650208,0.5640000104904175,0.5580000281333923,0.546999990940094,0.5550000071525574,0.5580000281333923,0.5429999828338623,0.5440000295639038,0.5569999814033508,0.5569999814033508,0.5540000200271606,0.5550000071525574,0.5649999976158142,0.5540000200271606,0.5630000233650208,0.5609999
895095825,0.5580000281333923,0.5509999990463257,0.5550000071525574,0.5550000071525574,0.5519999861717224,0.5609999895095825,0.5630000233650208,0.5509999990463257,0.550000011920929,0.5490000247955322,0.5540000200271606,0.550000011920929,0.5529999732971191,0.5460000038146973,0.550000011920929,0.5529999732971191,0.5519999861717224,0.5529999732971191,0.5609999895095825,0.5590000152587891,0.5550000071525574,0.550000011920929,0.5609999895095825,0.5619999766349792,0.5609999895095825,0.5540000200271606,0.550000011920929,0.5600000023841858,0.5559999942779541,0.5609999895095825,0.5569999814033508,0.5600000023841858,0.5680000185966492,0.5580000281333923,0.5559999942779541,0.5569999814033508,0.5669999718666077,0.5709999799728394,0.5640000104904175,0.5569999814033508,0.5600000023841858,0.5569999814033508,0.5649999976158142,0.5600000023841858,0.5580000281333923,0.5609999895095825,0.5590000152587891,0.5640000104904175,0.5529999732971191,0.5640000104904175,0.5649999976158142,0.5659999847412109,0.5630000233650208,0.5630000233650208,0.5619999766349792,0.5609999895095825,0.5559999942779541,0.5529999732971191,0.5600000023841858],"label":"FineWeb full 
MinHash"},"big-run-sampled_full_filtered_no_dedup":{"x":[0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002,29.360128000000003,31.45728,33.554432000000006,35.651584,37.748736,39.845888,41.94304,44.040192000000005,46.137344000000006,48.234496,50.331648,52.4288,54.525952000000004,56.623104000000005,58.720256000000006,60.817408,62.91456,65.011712,67.10886400000001,69.206016,71.303168,73.40032000000001,75.497472,77.59462400000001,79.691776,81.788928,83.88608,85.983232,88.08038400000001,90.177536,92.27468800000001,94.37184,96.468992,98.56614400000001,100.663296,102.76044800000001,104.8576,106.95475200000001,109.05190400000001,111.149056,113.24620800000001,115.34336,117.44051200000001,119.537664,121.634816,123.73196800000001,125.82912,127.92627200000001,130.023424,132.120576,134.21772800000002,136.31488000000002,138.412032,140.509184,142.606336,144.70348800000002,146.80064000000002,148.897792,150.994944,153.092096,155.18924800000002,157.28640000000001,159.383552,161.480704,163.577856,165.67500800000002,167.77216,169.869312,171.966464,174.06361600000002,176.16076800000002,178.25792,180.355072,182.452224,184.54937600000002,186.64652800000002,188.74368,190.840832,192.937984,195.03513600000002,197.13228800000002,199.22944,201.326592,203.423744,205.52089600000002,207.61804800000002,209.7152,211.812352,213.90950400000003,216.00665600000002,218.10380800000001,220.20096,222.298112,224.39526400000003,226.49241600000002,228.589568,230.68672,232.783872,234.88102400000002,236.97817600000002,239.075328,241.17248,243.269632,245.36678400000002,247.46393600000002,249.561088,251.65824,253.75539200000003,255.85254400000002,257.949696,260.046848,262.144,264.241152,266.338304,268.43545600000004,270.53260800000004,272.62976000000003,274.726912,276.824064,278.921216,281.018368,283.11552,285.212672,287.309824,289.40697600000004,291.5041280
0000004,293.60128000000003,295.698432,297.795584,299.892736,301.989888,304.08704,306.184192,308.28134400000005,310.37849600000004,312.47564800000004,314.57280000000003,316.669952,318.767104,320.864256,322.961408,325.05856,327.155712,329.25286400000005,331.35001600000004,333.44716800000003,335.54432,337.641472,339.738624,341.835776,343.932928,346.03008,348.12723200000005,350.22438400000004],"y":[0.4970000088214874,0.5239999890327454,0.4900000095367431,0.5040000081062317,0.5099999904632568,0.4990000128746032,0.5170000195503235,0.5040000081062317,0.5009999871253967,0.5230000019073486,0.5109999775886536,0.5059999823570251,0.5130000114440918,0.5090000033378601,0.5180000066757202,0.5220000147819519,0.5189999938011169,0.5180000066757202,0.5220000147819519,0.5120000243186951,0.5460000038146973,0.5239999890327454,0.5289999842643738,0.5440000295639038,0.5339999794960022,0.5299999713897705,0.5260000228881836,0.5360000133514404,0.5339999794960022,0.5360000133514404,0.5299999713897705,0.5180000066757202,0.5249999761581421,0.5440000295639038,0.5299999713897705,0.5339999794960022,0.5239999890327454,0.527999997138977,0.5139999985694885,0.5289999842643738,0.5360000133514404,0.5260000228881836,0.5389999747276306,0.5460000038146973,0.5270000100135803,0.5339999794960022,0.5320000052452087,0.5329999923706055,0.5260000228881836,0.5220000147819519,0.5260000228881836,0.5379999876022339,0.5410000085830688,0.5350000262260437,0.5389999747276306,0.5320000052452087,0.5389999747276306,0.5379999876022339,0.5329999923706055,0.5270000100135803,0.5170000195503235,0.5329999923706055,0.5370000004768372,0.5379999876022339,0.5249999761581421,0.5479999780654907,0.546999990940094,0.5400000214576721,0.5440000295639038,0.5360000133514404,0.5450000166893005,0.5440000295639038,0.5370000004768372,0.5370000004768372,0.5479999780654907,0.5379999876022339,0.5400000214576721,0.5479999780654907,0.5379999876022339,0.5509999990463257,0.5440000295639038,0.5379999876022339,0.550000011920929,0.5389999747276306,0.5370000
004768372,0.5379999876022339,0.5419999957084656,0.5360000133514404,0.5509999990463257,0.5360000133514404,0.5419999957084656,0.5419999957084656,0.550000011920929,0.5360000133514404,0.5519999861717224,0.5540000200271606,0.546999990940094,0.5370000004768372,0.5379999876022339,0.5519999861717224,0.5329999923706055,0.5400000214576721,0.5429999828338623,0.550000011920929,0.5490000247955322,0.5360000133514404,0.550000011920929,0.5569999814033508,0.5490000247955322,0.5490000247955322,0.5479999780654907,0.5350000262260437,0.5490000247955322,0.5370000004768372,0.5440000295639038,0.5329999923706055,0.5440000295639038,0.5429999828338623,0.5389999747276306,0.5450000166893005,0.5320000052452087,0.5450000166893005,0.5400000214576721,0.5419999957084656,0.5460000038146973,0.5370000004768372,0.5400000214576721,0.5460000038146973,0.5370000004768372,0.5370000004768372,0.5460000038146973,0.5400000214576721,0.5490000247955322,0.5529999732971191,0.5379999876022339,0.5460000038146973,0.5450000166893005,0.5429999828338623,0.5460000038146973,0.5400000214576721,0.5479999780654907,0.5460000038146973,0.5540000200271606,0.5400000214576721,0.5350000262260437,0.5490000247955322,0.5460000038146973,0.5460000038146973,0.5509999990463257,0.5410000085830688,0.5429999828338623,0.5379999876022339,0.5450000166893005,0.5389999747276306,0.5400000214576721,0.5400000214576721,0.550000011920929,0.5440000295639038,0.5389999747276306,0.5450000166893005,0.5400000214576721,0.5389999747276306,0.5419999957084656,0.5410000085830688,0.5440000295639038,0.5519999861717224,0.5479999780654907,0.5450000166893005,0.5569999814033508],"label":"FineWeb filtered 
only"},"big-run-sampled_full_ind_minhash":{"x":[0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002,29.360128000000003,31.45728,33.554432000000006,35.651584,37.748736,39.845888,41.94304,44.040192000000005,46.137344000000006,48.234496,50.331648,52.4288,54.525952000000004,56.623104000000005,58.720256000000006,60.817408,62.91456,65.011712,67.10886400000001,69.206016,71.303168,73.40032000000001,75.497472,77.59462400000001,79.691776,81.788928,83.88608,85.983232,88.08038400000001,90.177536,92.27468800000001,94.37184,96.468992,98.56614400000001,100.663296,102.76044800000001,104.8576,106.95475200000001,109.05190400000001,111.149056,113.24620800000001,115.34336,117.44051200000001,119.537664,121.634816,123.73196800000001,125.82912,127.92627200000001,130.023424,132.120576,134.21772800000002,136.31488000000002,138.412032,140.509184,142.606336,144.70348800000002,146.80064000000002,148.897792,150.994944,153.092096,155.18924800000002,157.28640000000001,159.383552,161.480704,163.577856,165.67500800000002,167.77216,169.869312,171.966464,174.06361600000002,176.16076800000002,178.25792,180.355072,182.452224,184.54937600000002,186.64652800000002,188.74368,190.840832,192.937984,195.03513600000002,197.13228800000002,199.22944,201.326592,203.423744,205.52089600000002,207.61804800000002,209.7152,211.812352,213.90950400000003,216.00665600000002,218.10380800000001,220.20096,222.298112,224.39526400000003,226.49241600000002,228.589568,230.68672,232.783872,234.88102400000002,236.97817600000002,239.075328,241.17248,243.269632,245.36678400000002,247.46393600000002,249.561088,251.65824,253.75539200000003,255.85254400000002,257.949696,260.046848,262.144,264.241152,266.338304,268.43545600000004,270.53260800000004,272.62976000000003,274.726912,276.824064,278.921216,281.018368,283.11552,285.212672,287.309824,289.40697600000004,291.50412800000004,2
93.60128000000003,295.698432,297.795584,299.892736,301.989888,304.08704,306.184192,308.28134400000005,310.37849600000004,312.47564800000004,314.57280000000003,316.669952,318.767104,320.864256,322.961408,325.05856,327.155712,329.25286400000005,331.35001600000004,333.44716800000003,335.54432,337.641472,339.738624,341.835776,343.932928,346.03008,348.12723200000005,350.22438400000004],"y":[0.4970000088214874,0.4880000054836273,0.492000013589859,0.5059999823570251,0.5139999985694885,0.5070000290870667,0.5090000033378601,0.5230000019073486,0.5189999938011169,0.5189999938011169,0.5220000147819519,0.5149999856948853,0.5260000228881836,0.5329999923706055,0.5230000019073486,0.5180000066757202,0.5289999842643738,0.5400000214576721,0.5410000085830688,0.5440000295639038,0.5329999923706055,0.550000011920929,0.5419999957084656,0.5360000133514404,0.5429999828338623,0.5429999828338623,0.5450000166893005,0.5490000247955322,0.5400000214576721,0.5509999990463257,0.5559999942779541,0.5479999780654907,0.5540000200271606,0.5490000247955322,0.5400000214576721,0.5429999828338623,0.5460000038146973,0.5370000004768372,0.5479999780654907,0.5550000071525574,0.5490000247955322,0.5400000214576721,0.5410000085830688,0.5460000038146973,0.546999990940094,0.5479999780654907,0.546999990940094,0.5509999990463257,0.5450000166893005,0.5590000152587891,0.5419999957084656,0.5540000200271606,0.5440000295639038,0.5450000166893005,0.5580000281333923,0.5540000200271606,0.5440000295639038,0.5619999766349792,0.5450000166893005,0.5600000023841858,0.5559999942779541,0.5600000023841858,0.5400000214576721,0.5569999814033508,0.5600000023841858,0.5619999766349792,0.5529999732971191,0.5649999976158142,0.5609999895095825,0.5550000071525574,0.5609999895095825,0.5580000281333923,0.5550000071525574,0.5619999766349792,0.5550000071525574,0.5519999861717224,0.5600000023841858,0.5550000071525574,0.5550000071525574,0.5590000152587891,0.5490000247955322,0.5580000281333923,0.5600000023841858,0.5419999957084656,0.5559999942779541,
0.5559999942779541,0.5529999732971191,0.5609999895095825,0.5519999861717224,0.5569999814033508,0.5569999814033508,0.5509999990463257,0.5619999766349792,0.546999990940094,0.5619999766349792,0.5460000038146973,0.5529999732971191,0.5619999766349792,0.5690000057220459,0.5680000185966492,0.5720000267028809,0.5640000104904175,0.5550000071525574,0.5509999990463257,0.550000011920929,0.5600000023841858,0.5609999895095825,0.5630000233650208,0.5649999976158142,0.5529999732971191,0.5540000200271606,0.5529999732971191,0.5659999847412109,0.5600000023841858,0.5590000152587891,0.5619999766349792,0.5600000023841858,0.5730000138282776,0.5569999814033508,0.5690000057220459,0.5619999766349792,0.5680000185966492,0.578000009059906,0.5730000138282776,0.5550000071525574,0.5529999732971191,0.5600000023841858,0.5630000233650208,0.5590000152587891,0.5659999847412109,0.5669999718666077,0.5609999895095825,0.5630000233650208,0.5569999814033508,0.5490000247955322,0.5619999766349792,0.5550000071525574,0.5630000233650208,0.5559999942779541,0.5559999942779541,0.5649999976158142,0.5569999814033508,0.5619999766349792,0.5559999942779541,0.5669999718666077,0.5609999895095825,0.5690000057220459,0.5770000219345093,0.5690000057220459,0.5720000267028809,0.5619999766349792,0.5649999976158142,0.5669999718666077,0.5680000185966492,0.5699999928474426,0.5640000104904175,0.5609999895095825,0.5740000009536743,0.5690000057220459,0.5669999718666077,0.5720000267028809,0.5699999928474426,0.5709999799728394,0.5740000009536743,0.5680000185966492,0.5619999766349792,0.5690000057220459,0.5659999847412109,0.574999988079071],"label":"FineWeb independent MinHash"}},"layout":{"xaxis":{"title":{"text":"Training tokens (billions)"}},"yaxis":{"title":{"text":"Agg Score"}},"title":{"text":"Independent dedup outperforms dedup across dumps"}}}
dist/assets/data/plots/custom_filters/agg_score.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"data":{"filtering-custom-punc0.12-short-lines0.67-line_char_dup0.1":{"x":[0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.3308933284133672,0.35955795273184776,0.3757704347372055,0.3934198468923569,0.398214865475893,0.4062729831784963,0.41363069601356983,0.41463132016360754,0.41851891577243805,0.4239445272833109,0.42439557053148746,0.4273625332862139,0.4289980959147215,0.4327357914298773,0.43017333932220936],"label":"Filters combined"},"filtering-custom-lines-punc-0.12":{"x":[0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.3308933284133672,0.35735468938946724,0.3787423223257065,0.391122592613101,0.3976811040192842,0.4041402228176594,0.4110417179763317,0.4150725454092026,0.42221225984394545,0.4235249478369951,0.42567262239754194,0.42764298990368843,0.4280493911355734,0.42981273680925364,0.42845905013382435],"label":"Punctuation filter"},"filtering-custom-line-char-duplicated-v2-0.01":{"x":[0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.3308979943394661,0.35727922804653645,0.3758955802768469,0.39312327839434147,0.3984657619148493,0.4037223849445581,0.40907647646963596,0.41408527828752995,0.42114910110831255,0.42039695382118225,0.4248786196112633,0.42590542137622833,0.4263712782412767,0.42797840014100075,0.4277621991932392],"label":"Line duplicates 
filter"},"filtering-custom-short-line-ratio-0.67":{"x":[0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.33087017294019455,0.35839469730854034,0.379800958558917,0.3909519836306572,0.3985003251582384,0.4028578344732523,0.4080309104174375,0.411550747230649,0.4152813777327537,0.41849316097795963,0.42109199613332743,0.4223319999873638,0.42558939941227436,0.42717534117400646,0.426479609683156],"label":"Short lines filter"},"filtering-baseline-2019-18-40gt":{"x":[0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.3309533800929785,0.3574739173054695,0.3774360120296478,0.3879939243197441,0.3961103111505508,0.4038164801895618,0.4059260934591293,0.4138728193938732,0.414092980325222,0.4190553873777389,0.4232541136443615,0.4207314290106296,0.4239514805376529,0.425716370344162,0.4249534271657467],"label":"Baseline"}},"layout":{"title":{"text":"Custom filters Performance"}}}
dist/assets/data/plots/custom_filters/arc_acc_norm.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"data":{"filtering-custom-line-char-duplicated-v2-0.01":{"x":[0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2509999871253967,0.29474999010562897,0.3184999972581863,0.3392500132322311,0.35074999928474426,0.35300000011920923,0.35750000178813934,0.3684999942779541,0.3817500025033951,0.37800000607967377,0.38199999928474426,0.38600000739097595,0.38525000214576716,0.39000000059604645,0.38850000500679016],"label":"Line duplicates filter"},"filtering-custom-punc0.12-short-lines0.67-line_char_dup0.1":{"x":[0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2509999871253967,0.2905000001192093,0.3199999928474426,0.3397499918937683,0.3467499911785126,0.3540000021457672,0.3662499934434891,0.36374999582767487,0.3647499978542328,0.3675000071525574,0.371749997138977,0.37074999511241913,0.375,0.3787499964237213,0.38099999725818634],"label":"Filters combined"},"filtering-custom-short-line-ratio-0.67":{"x":[0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2509999871253967,0.2892500013113022,0.3190000057220459,0.3385000079870224,0.3449999988079071,0.3495000004768371,0.36374999582767487,0.3604999929666519,0.36549998819828033,0.37074999511241913,0.37150000035762787,0.3722500056028366,0.37774999439716334,0.3774999976158142,0.37899999320507044],"label":"Short lines 
filter"},"filtering-baseline-2019-18-40gt":{"x":[0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2515000104904175,0.2854999899864197,0.3215000033378601,0.3384999930858612,0.3445000052452087,0.3540000021457672,0.3544999957084656,0.3650000095367431,0.3714999854564667,0.3695000112056732,0.3700000047683716,0.3720000088214874,0.3770000040531158,0.3770000040531158,0.3774999976158142],"label":"Baseline"},"filtering-custom-lines-punc-0.12":{"x":[0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2509999871253967,0.29100000858306885,0.31949999928474426,0.33675000071525574,0.34524999558925623,0.35850000381469727,0.3557499945163727,0.36124999821186066,0.3599999994039535,0.36800000071525574,0.36775000393390656,0.3770000040531158,0.37025000154972076,0.37424999475479126,0.37299999594688416],"label":"Punctuation filter"}},"layout":{"title":{"text":"Custom filters Performance"}}}
dist/assets/data/plots/custom_filters/commonsense_qa_acc_norm.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"data":{"filtering-custom-short-line-ratio-0.67":{"x":[0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2329999953508377,0.2620000094175339,0.28949999809265137,0.2974999994039535,0.30550000071525574,0.30900000035762787,0.31200000643730164,0.3190000057220459,0.32999999821186066,0.3254999965429306,0.3344999998807907,0.3320000022649765,0.3374999910593033,0.3369999974966049,0.33949999511241913],"label":"Short lines filter"},"filtering-custom-line-char-duplicated-v2-0.01":{"x":[0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2329999953508377,0.2644999921321869,0.2750000059604645,0.2989999949932098,0.2974999994039535,0.31599999964237213,0.3149999976158142,0.3199999928474426,0.3244999945163727,0.3269999921321869,0.33550000190734863,0.3275000005960464,0.33599999547004694,0.3349999934434891,0.33849999308586115],"label":"Line duplicates 
filter"},"filtering-baseline-2019-18-40gt":{"x":[0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2329999953508377,0.2709999978542328,0.2840000092983246,0.2910000085830688,0.3149999976158142,0.3079999983310699,0.3269999921321869,0.3269999921321869,0.3179999887943268,0.3260000050067901,0.328000009059906,0.3350000083446502,0.3330000042915344,0.3409999907016754,0.335999995470047],"label":"Baseline"},"filtering-custom-punc0.12-short-lines0.67-line_char_dup0.1":{"x":[0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2329999953508377,0.2649999856948852,0.2790000140666961,0.29649999737739563,0.3135000020265579,0.3164999932050705,0.32099999487400055,0.3210000097751617,0.3305000066757202,0.3205000013113022,0.32549999654293055,0.3295000046491623,0.33050000667572016,0.335999995470047,0.33200000226497645],"label":"Filters combined"},"filtering-custom-lines-punc-0.12":{"x":[0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2329999953508377,0.26349999010562897,0.28849999606609344,0.29600000381469727,0.30650000274181366,0.31900000572204584,0.3229999989271164,0.3150000125169754,0.3244999945163727,0.3310000002384186,0.3310000002384186,0.32999999821186066,0.3334999978542328,0.3344999998807907,0.32999999821186066],"label":"Punctuation filter"}},"layout":{"title":{"text":"Custom filters Performance"}}}
dist/assets/data/plots/custom_filters/hellaswag_acc_norm.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"data":{"filtering-custom-punc0.12-short-lines0.67-line_char_dup0.1":{"x":[0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.257999986410141,0.29349999129772186,0.3210000097751617,0.36150000989437103,0.3734999895095825,0.39599999785423273,0.4125000089406967,0.4234999865293503,0.42749999463558197,0.44699999690055847,0.4549999982118606,0.4660000056028366,0.46600000560283655,0.47050000727176666,0.4675000011920929],"label":"Filters combined"},"filtering-custom-lines-punc-0.12":{"x":[0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.257999986410141,0.29449999332427973,0.33550000190734863,0.34800000488758087,0.3764999955892563,0.3824999928474426,0.3955000042915344,0.41799999773502344,0.4270000010728836,0.43400000035762787,0.44450001418590546,0.45049999654293055,0.45450000464916224,0.45449998974800104,0.4550000131130218],"label":"Punctuation filter"},"filtering-custom-line-char-duplicated-v2-0.01":{"x":[0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.257999986410141,0.28900000452995295,0.3310000002384186,0.3505000025033951,0.3790000081062317,0.39250001311302185,0.40549999475479126,0.4224999994039535,0.4284999966621399,0.43050000071525574,0.43799999356269836,0.4459999948740005,0.4495000094175339,0.4564999938011169,0.4529999941587448],"label":"Line duplicates 
filter"},"filtering-custom-short-line-ratio-0.67":{"x":[0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.257999986410141,0.3020000010728836,0.3310000002384186,0.357000008225441,0.37899999320507044,0.38850000500679016,0.3994999974966049,0.40349999070167536,0.4175000041723251,0.42400000989437103,0.4245000034570694,0.4335000067949295,0.4360000044107437,0.44750000536441803,0.44200000166893],"label":"Short lines filter"},"filtering-baseline-2019-18-40gt":{"x":[0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.257999986410141,0.2870000004768371,0.3319999873638153,0.3589999973773956,0.3659999966621399,0.3889999985694885,0.402999997138977,0.4180000126361847,0.421999990940094,0.421999990940094,0.4289999902248382,0.4309999942779541,0.4320000112056732,0.4370000064373016,0.4350000023841858],"label":"Baseline"}},"layout":{"title":{"text":"Custom filters Performance"}}}
dist/assets/data/plots/custom_filters/index.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"files":{"agg_score":{"file":"agg_score.json"},"commonsense_qa/acc_norm":{"file":"commonsense_qa_acc_norm.json"},"hellaswag/acc_norm":{"file":"hellaswag_acc_norm.json"},"openbookqa/acc_norm":{"file":"openbookqa_acc_norm.json"},"piqa/acc_norm":{"file":"piqa_acc_norm.json"},"siqa/acc_norm":{"file":"siqa_acc_norm.json"},"winogrande/acc_norm":{"file":"winogrande_acc_norm.json"},"arc/acc_norm":{"file":"arc_acc_norm.json"},"mmlu/acc_norm":{"file":"mmlu_acc_norm.json"}},"settings":{"defaultMetric":"agg_score","slider":{"min":0,"max":10,"default":3}}}
dist/assets/data/plots/custom_filters/mmlu_acc_norm.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"data":{"filtering-custom-punc0.12-short-lines0.67-line_char_dup0.1":{"x":[0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2501466572284698,0.2534636557102203,0.2621634304523468,0.2661087810993194,0.2704689502716064,0.27318383753299713,0.2757955640554428,0.2758005559444427,0.28340134024620056,0.2835562080144882,0.28641459345817566,0.28565025329589844,0.28998473286628723,0.29013633728027344,0.2888867110013962],"label":"Filters combined"},"filtering-custom-lines-punc-0.12":{"x":[0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2501466572284698,0.2563375234603882,0.26243858039379114,0.26873072981834406,0.27219884097576136,0.27462176978588104,0.27908372879028315,0.2813303619623184,0.28369809687137604,0.28319956362247467,0.28563097119331354,0.28614395856857294,0.28564512729644775,0.2862519174814224,0.2876724004745483],"label":"Punctuation 
filter"},"filtering-baseline-2019-18-40gt":{"x":[0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2501270473003387,0.253291368484497,0.2609881162643432,0.2644513845443725,0.2703824639320373,0.2735317945480346,0.2759087681770324,0.2779825627803802,0.2812439203262329,0.2799430787563324,0.286032885313034,0.2868514060974121,0.2856118083000183,0.2887309193611145,0.2871274054050445],"label":"Baseline"},"filtering-custom-line-char-duplicated-v2-0.01":{"x":[0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.25018398463726044,0.2544838488101959,0.2611646503210068,0.2652362138032913,0.2704761028289795,0.2737790495157242,0.276611790060997,0.2786822021007538,0.281442791223526,0.2816756069660187,0.2860289514064789,0.28624334931373596,0.2867202013731003,0.28732720017433167,0.28609761595726013],"label":"Line duplicates filter"},"filtering-custom-short-line-ratio-0.67":{"x":[0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.24996141344308848,0.25390757620334625,0.26540763676166534,0.27061584591865534,0.27150256931781763,0.2718626409769058,0.27449728548526764,0.2784059643745422,0.28175103664398193,0.28019529581069946,0.2827359586954117,0.2814059555530548,0.2844651788473129,0.28390273451805115,0.2838368713855743],"label":"Short lines filter"}},"layout":{"title":{"text":"Custom filters Performance"}}}
dist/assets/data/plots/custom_filters/openbookqa_acc_norm.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"data":{"filtering-custom-lines-punc-0.12":{"x":[0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2860000133514404,0.2559999972581863,0.2849999964237213,0.3110000044107437,0.2979999929666519,0.3009999990463257,0.318000003695488,0.3140000104904175,0.32899999618530273,0.32899999618530273,0.3369999974966049,0.33599999547004694,0.32900001108646393,0.3299999982118606,0.3330000042915344],"label":"Punctuation filter"},"filtering-custom-punc0.12-short-lines0.67-line_char_dup0.1":{"x":[0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2860000133514404,0.2719999998807907,0.277999997138977,0.3039999902248382,0.28199999034404755,0.30200000107288355,0.3050000071525574,0.31299999356269836,0.32099999487400055,0.3269999921321869,0.31599999964237213,0.3260000050067901,0.32600000500679016,0.3299999982118606,0.32500000298023224],"label":"Filters combined"},"filtering-custom-short-line-ratio-0.67":{"x":[0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2860000133514404,0.25999999046325684,0.28200000524520874,0.28599999845027924,0.289000004529953,0.29999999701976776,0.31300000846385956,0.31900000572204584,0.3149999976158142,0.32099999487400055,0.3139999955892563,0.3190000057220459,0.32200001180171967,0.3229999989271164,0.3240000009536743],"label":"Short lines 
filter"},"filtering-custom-line-char-duplicated-v2-0.01":{"x":[0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2860000133514404,0.26900000870227814,0.2670000046491623,0.306999996304512,0.2939999997615814,0.2999999970197677,0.306999996304512,0.31200000643730164,0.31299999356269836,0.3200000077486038,0.3229999989271164,0.32099999487400055,0.32500000298023224,0.3240000009536743,0.3219999969005584],"label":"Line duplicates filter"},"filtering-baseline-2019-18-40gt":{"x":[0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2860000133514404,0.2599999904632568,0.2680000066757202,0.2800000011920929,0.2860000133514404,0.2960000038146972,0.2980000078678131,0.3039999902248382,0.3059999942779541,0.3179999887943268,0.3319999873638153,0.3140000104904175,0.3199999928474426,0.3140000104904175,0.3160000145435333],"label":"Baseline"}},"layout":{"title":{"text":"Custom filters Performance"}}}
dist/assets/data/plots/custom_filters/piqa_acc_norm.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"data":{"filtering-custom-line-char-duplicated-v2-0.01":{"x":[0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.5099999904632568,0.609499990940094,0.652999997138977,0.6744999885559082,0.68299999833107,0.6809999942779541,0.6965000033378601,0.6995000243186951,0.7145000100135803,0.7100000083446503,0.7105000019073486,0.7134999930858612,0.7159999907016754,0.7170000076293945,0.7199999988079071],"label":"Line duplicates filter"},"filtering-custom-punc0.12-short-lines0.67-line_char_dup0.1":{"x":[0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.5099999904632568,0.6274999976158142,0.656000018119812,0.6665000021457672,0.6854999959468842,0.6895000040531158,0.7035000026226044,0.7060000002384186,0.7100000083446503,0.7195000052452087,0.7159999907016754,0.715499997138977,0.7170000076293945,0.7274999916553497,0.7199999988079071],"label":"Filters combined"},"filtering-custom-short-line-ratio-0.67":{"x":[0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.5099999904632568,0.6254999935626984,0.6530000269412994,0.6665000021457672,0.6860000193119049,0.6980000138282776,0.695499986410141,0.7084999978542328,0.7080000042915344,0.7064999938011169,0.7095000147819519,0.7129999995231628,0.7159999907016754,0.7179999947547913,0.718500018119812],"label":"Short lines 
filter"},"filtering-custom-lines-punc-0.12":{"x":[0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.5099999904632568,0.6155000030994415,0.648499995470047,0.6649999916553497,0.6865000128746033,0.690500020980835,0.6965000033378601,0.7029999792575836,0.7139999866485596,0.7105000019073486,0.7089999914169312,0.7139999866485596,0.7144999802112579,0.7229999899864197,0.7175000011920929],"label":"Punctuation filter"},"filtering-baseline-2019-18-40gt":{"x":[0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.5099999904632568,0.6209999918937683,0.6520000100135803,0.6639999747276306,0.6880000233650208,0.6890000104904175,0.699999988079071,0.6980000138282776,0.7049999833106995,0.7080000042915344,0.7110000252723694,0.7070000171661377,0.7179999947547913,0.7120000123977661,0.7160000205039978],"label":"Baseline"}},"layout":{"title":{"text":"Custom filters Performance"}}}
dist/assets/data/plots/custom_filters/siqa_acc_norm.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"data":{"filtering-custom-line-char-duplicated-v2-0.01":{"x":[0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.3619999885559082,0.3894999921321869,0.3989999890327453,0.4060000032186508,0.40299999713897705,0.4055000096559524,0.4095000028610229,0.40450000762939453,0.40750001370906824,0.4074999988079071,0.408500000834465,0.41050000488758087,0.40450000762939453,0.40500000119209284,0.4035000056028366],"label":"Line duplicates filter"},"filtering-custom-lines-punc-0.12":{"x":[0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.3619999885559082,0.3930000066757202,0.39750000834465027,0.40049999952316284,0.39849999547004694,0.40449999272823334,0.4054999947547912,0.4020000100135803,0.4115000069141388,0.40800000727176666,0.402999997138977,0.4074999988079071,0.40700000524520874,0.4060000032186508,0.40250000357627863],"label":"Punctuation 
filter"},"filtering-baseline-2019-18-40gt":{"x":[0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.3619999885559082,0.3959999978542328,0.3989999890327453,0.4040000140666961,0.3989999890327453,0.4000000059604645,0.3880000114440918,0.4050000011920929,0.4079999923706054,0.4169999957084656,0.4110000133514404,0.4059999883174896,0.414000004529953,0.4099999964237213,0.4020000100135803],"label":"Baseline"},"filtering-custom-punc0.12-short-lines0.67-line_char_dup0.1":{"x":[0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.3619999885559082,0.3889999985694885,0.4040000140666961,0.4035000056028366,0.4050000011920929,0.3995000123977661,0.4064999967813492,0.4050000011920929,0.4025000035762787,0.4055000096559524,0.40799999237060547,0.4000000059604645,0.4025000035762787,0.403999999165535,0.40150000154972076],"label":"Filters combined"},"filtering-custom-short-line-ratio-0.67":{"x":[0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.3619999885559082,0.40350000560283655,0.403999999165535,0.4004999995231628,0.4010000079870224,0.39899998903274536,0.4015000015497207,0.39750000834465027,0.3969999998807907,0.4030000120401382,0.4055000096559524,0.4010000079870224,0.4020000100135803,0.40299999713897705,0.3990000039339065],"label":"Short lines filter"}},"layout":{"title":{"text":"Custom filters Performance"}}}
dist/assets/data/plots/custom_filters/winogrande_acc_norm.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"data":{"filtering-baseline-2019-18-40gt":{"x":[0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.4970000088214874,0.4860000014305115,0.5019999742507935,0.503000020980835,0.5,0.5210000276565552,0.5009999871253967,0.515999972820282,0.5009999871253967,0.5120000243186951,0.5189999938011169,0.5139999985694885,0.5120000243186951,0.5260000228881836,0.5299999713897705],"label":"Baseline"},"filtering-custom-lines-punc-0.12":{"x":[0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.4970000088214874,0.48899999260902405,0.49300000071525574,0.5030000060796738,0.49799999594688416,0.5024999976158142,0.5150000154972076,0.5259999930858612,0.527999997138977,0.5245000123977661,0.5275000035762787,0.5199999809265137,0.5300000011920929,0.5300000011920929,0.5289999842643738],"label":"Punctuation filter"},"filtering-custom-short-line-ratio-0.67":{"x":[0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.4970000088214874,0.471000000834465,0.494499996304512,0.511000007390976,0.511000007390976,0.5070000290870667,0.50450000166893,0.5060000121593475,0.5074999928474426,0.5169999897480011,0.5264999866485596,0.526500016450882,0.5290000140666962,0.5275000035762787,0.5259999930858612],"label":"Short lines 
filter"},"filtering-custom-punc0.12-short-lines0.67-line_char_dup0.1":{"x":[0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.4970000088214874,0.4854999929666519,0.4860000014305115,0.5094999969005585,0.5090000033378601,0.5195000171661377,0.5185000002384186,0.5090000182390213,0.5084999799728394,0.5209999978542328,0.5164999961853027,0.5254999995231628,0.5250000059604645,0.5250000059604645,0.5254999995231628],"label":"Filters combined"},"filtering-custom-line-char-duplicated-v2-0.01":{"x":[0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.4970000088214874,0.48749999701976776,0.5024999976158142,0.5035000145435333,0.5099999904632568,0.5080000162124634,0.5050000250339508,0.5069999992847443,0.5180000066757202,0.5085000097751617,0.515500009059906,0.5165000259876251,0.5080000162124634,0.5090000033378601,0.5104999840259552],"label":"Line duplicates filter"}},"layout":{"title":{"text":"Custom filters Performance"}}}