DerivedFunction1 committed on
Commit
5d38774
·
1 Parent(s): 930b96a
Files changed (2) hide show
  1. language.py +8 -571
  2. source_config.py +105 -110
language.py CHANGED
@@ -10,6 +10,9 @@ LANGUAGE_GROUPS = {group: cfg["langs"] for group, cfg in LANGUAGE_BUCKETS.items(
10
  LANGUAGE_GROUP_WEIGHTS = {group: float(cfg["weight"]) for group, cfg in LANGUAGE_BUCKETS.items()}
11
  LANGUAGE_GROUP_MIN_CHARS = {group: int(cfg["min_chars"]) for group, cfg in LANGUAGE_BUCKETS.items()}
12
  LATIN_GROUPS = {group for group, cfg in LANGUAGE_BUCKETS.items() if cfg.get("latin")}
 
 
 
13
 
14
  LANGS_JSON = Path(__file__).with_name("all_langs.json")
15
 
@@ -23,6 +26,10 @@ ALL_LANGS = list(LANG_ISO2_TO_ISO3.keys())
23
  LANG_TO_GROUP = {lang: group for group, langs in LANGUAGE_GROUPS.items() for lang in langs}
24
 
25
 
 
 
 
 
26
  def write_all_langs_json(path: str | os.PathLike[str] = LANGS_JSON) -> None:
27
  """Write the canonical ALL_LANGS list to JSON if it is missing."""
28
  path = Path(path)
@@ -41,574 +48,4 @@ def load_all_langs(path: str | os.PathLike[str] = LANGS_JSON) -> list[str]:
41
  if isinstance(langs, list) and all(isinstance(lang, str) for lang in langs):
42
  return langs
43
  write_all_langs_json(path)
44
- return ALL_LANGS[:]
45
-
46
- ENGLISH_STOP_WORDS = [
47
- "able",
48
- "about",
49
- "above",
50
- "abroad",
51
- "according",
52
- "accordingly",
53
- "across",
54
- "actually",
55
- "after",
56
- "afterwards",
57
- "again",
58
- "against",
59
- "ago",
60
- "ahead",
61
- "aint",
62
- "all",
63
- "allow",
64
- "almost",
65
- "alone",
66
- "along",
67
- "alongside",
68
- "already",
69
- "also",
70
- "although",
71
- "always",
72
- "am",
73
- "amid",
74
- "amidst",
75
- "among",
76
- "amongst",
77
- "an",
78
- "and",
79
- "another",
80
- "any",
81
- "anybody",
82
- "anyhow",
83
- "anyone",
84
- "anything",
85
- "anyway",
86
- "anyways",
87
- "anywhere",
88
- "apart",
89
- "appear",
90
- "appreciate",
91
- "appropriate",
92
- "app",
93
- "are",
94
- "arent",
95
- "aren",
96
- "around",
97
- "as",
98
- "aside",
99
- "ask",
100
- "asking",
101
- "associated",
102
- "at",
103
- "available",
104
- "away",
105
- "awfully",
106
- "back",
107
- "backward",
108
- "be",
109
- "became",
110
- "because",
111
- "become",
112
- "becoming",
113
- "been",
114
- "before",
115
- "beforehand",
116
- "begin",
117
- "behind",
118
- "being",
119
- "believe",
120
- "below",
121
- "beside",
122
- "best",
123
- "better",
124
- "between",
125
- "beyond",
126
- "both",
127
- "brief",
128
- "but",
129
- "by",
130
- "came",
131
- "can",
132
- "cannot",
133
- "cant",
134
- "caption",
135
- "cause",
136
- "certain",
137
- "certainly",
138
- "changes",
139
- "clearly",
140
- "cmon",
141
- "com",
142
- "come",
143
- "concerning",
144
- "consequently",
145
- "consider",
146
- "considering",
147
- "contain",
148
- "containing",
149
- "corresponding",
150
- "could",
151
- "couldnt",
152
- "course",
153
- "currently",
154
- "definitely",
155
- "described",
156
- "despite",
157
- "did",
158
- "didnt",
159
- "different",
160
- "directly",
161
- "do",
162
- "does",
163
- "doesnt",
164
- "doing",
165
- "done",
166
- "dont",
167
- "down",
168
- "downward",
169
- "download",
170
- "during",
171
- "each",
172
- "eight",
173
- "eighty",
174
- "either",
175
- "else",
176
- "elsewhere",
177
- "end",
178
- "ending",
179
- "enough",
180
- "entirely",
181
- "especially",
182
- "etc",
183
- "even",
184
- "ever",
185
- "evermore",
186
- "every",
187
- "everybody",
188
- "everyone",
189
- "everything",
190
- "everywhere",
191
- "exactly",
192
- "example",
193
- "except",
194
- "fairly",
195
- "far",
196
- "farther",
197
- "few",
198
- "fewer",
199
- "fifth",
200
- "first",
201
- "five",
202
- "followed",
203
- "following",
204
- "follows",
205
- "for",
206
- "forever",
207
- "former",
208
- "formerly",
209
- "forth",
210
- "forward",
211
- "found",
212
- "four",
213
- "from",
214
- "free",
215
- "further",
216
- "furthermore",
217
- "get",
218
- "gets",
219
- "getting",
220
- "given",
221
- "gives",
222
- "go",
223
- "goes",
224
- "going",
225
- "gone",
226
- "got",
227
- "gotten",
228
- "greetings",
229
- "had",
230
- "hadnt",
231
- "half",
232
- "happens",
233
- "hardly",
234
- "has",
235
- "hasnt",
236
- "have",
237
- "havent",
238
- "having",
239
- "he",
240
- "hed",
241
- "hell",
242
- "hello",
243
- "help",
244
- "hence",
245
- "her",
246
- "here",
247
- "hereafter",
248
- "hereby",
249
- "herein",
250
- "hereupon",
251
- "herself",
252
- "hi",
253
- "him",
254
- "himself",
255
- "his",
256
- "hither",
257
- "hopefully",
258
- "how",
259
- "howbeit",
260
- "however",
261
- "hundred",
262
- "id",
263
- "ie",
264
- "if",
265
- "ignored",
266
- "ill",
267
- "im",
268
- "immediate",
269
- "in",
270
- "inasmuch",
271
- "inc",
272
- "indeed",
273
- "indicate",
274
- "indicated",
275
- "inner",
276
- "inside",
277
- "insofar",
278
- "instead",
279
- "into",
280
- "inward",
281
- "is",
282
- "isnt",
283
- "it",
284
- "itd",
285
- "itll",
286
- "itself",
287
- "ive",
288
- "just",
289
- "keep",
290
- "keeps",
291
- "kept",
292
- "know",
293
- "known",
294
- "last",
295
- "lately",
296
- "later",
297
- "latter",
298
- "least",
299
- "less",
300
- "lest",
301
- "let",
302
- "like",
303
- "liked",
304
- "likely",
305
- "likewise",
306
- "little",
307
- "look",
308
- "looking",
309
- "low",
310
- "lower",
311
- "ltd",
312
- "made",
313
- "mainly",
314
- "make",
315
- "many",
316
- "may",
317
- "maybe",
318
- "maynt",
319
- "me",
320
- "mean",
321
- "meantime",
322
- "meanwhile",
323
- "merely",
324
- "might",
325
- "mightnt",
326
- "mine",
327
- "minus",
328
- "miss",
329
- "more",
330
- "moreover",
331
- "most",
332
- "mostly",
333
- "much",
334
- "must",
335
- "mustnt",
336
- "my",
337
- "myself",
338
- "name",
339
- "namely",
340
- "near",
341
- "nearly",
342
- "necessary",
343
- "need",
344
- "neednt",
345
- "neither",
346
- "never",
347
- "neverless",
348
- "nevertheless",
349
- "new",
350
- "next",
351
- "nine",
352
- "ninety",
353
- "no",
354
- "nobody",
355
- "non",
356
- "none",
357
- "nonetheless",
358
- "noone",
359
- "no-one",
360
- "nor",
361
- "normally",
362
- "not",
363
- "nothing",
364
- "notwithstanding",
365
- "novel",
366
- "now",
367
- "nowhere",
368
- "obviously",
369
- "of",
370
- "off",
371
- "often",
372
- "oh",
373
- "ok",
374
- "okay",
375
- "old",
376
- "on",
377
- "once",
378
- "one",
379
- "only",
380
- "onto",
381
- "opposite",
382
- "or",
383
- "other",
384
- "otherwise",
385
- "ought",
386
- "oughtnt",
387
- "our",
388
- "ourselves",
389
- "out",
390
- "outside",
391
- "over",
392
- "overall",
393
- "own",
394
- "particular",
395
- "particularly",
396
- "past",
397
- "per",
398
- "perhaps",
399
- "placed",
400
- "please",
401
- "plus",
402
- "possible",
403
- "presumably",
404
- "probably",
405
- "provided",
406
- "provide",
407
- "quite",
408
- "rather",
409
- "really",
410
- "reasonably",
411
- "recent",
412
- "recently",
413
- "regarding",
414
- "regardless",
415
- "regards",
416
- "relatively",
417
- "respectively",
418
- "right",
419
- "round",
420
- "said",
421
- "same",
422
- "saw",
423
- "say",
424
- "saying",
425
- "second",
426
- "secondly",
427
- "see",
428
- "seeing",
429
- "seem",
430
- "seemed",
431
- "seeming",
432
- "seems",
433
- "seen",
434
- "self",
435
- "sensible",
436
- "sent",
437
- "serious",
438
- "seriously",
439
- "seven",
440
- "several",
441
- "shall",
442
- "shant",
443
- "she",
444
- "shed",
445
- "shell",
446
- "should",
447
- "shouldnt",
448
- "since",
449
- "six",
450
- "so",
451
- "some",
452
- "somebody",
453
- "someday",
454
- "somehow",
455
- "someone",
456
- "something",
457
- "sometime",
458
- "somewhat",
459
- "somewhere",
460
- "soon",
461
- "sorry",
462
- "specified",
463
- "specify",
464
- "specifying",
465
- "still",
466
- "such",
467
- "sure",
468
- "take",
469
- "taken",
470
- "taking",
471
- "tell",
472
- "tends",
473
- "ten",
474
- "than",
475
- "thank",
476
- "that",
477
- "thatll",
478
- "thatve",
479
- "the",
480
- "their",
481
- "them",
482
- "themselves",
483
- "then",
484
- "thence",
485
- "there",
486
- "thereafter",
487
- "thereby",
488
- "thered",
489
- "therefore",
490
- "therein",
491
- "therell",
492
- "therere",
493
- "thereupon",
494
- "thereve",
495
- "these",
496
- "they",
497
- "theyd",
498
- "theyll",
499
- "theyre",
500
- "theyve",
501
- "thing",
502
- "think",
503
- "third",
504
- "thirty",
505
- "this",
506
- "thorough",
507
- "thoroughly",
508
- "those",
509
- "though",
510
- "three",
511
- "through",
512
- "throughout",
513
- "thru",
514
- "thus",
515
- "till",
516
- "to",
517
- "together",
518
- "too",
519
- "took",
520
- "toward",
521
- "tried",
522
- "tries",
523
- "truly",
524
- "try",
525
- "trying",
526
- "twice",
527
- "two",
528
- "under",
529
- "underneath",
530
- "undoing",
531
- "unfortunately",
532
- "unless",
533
- "unlike",
534
- "unlikely",
535
- "until",
536
- "unto",
537
- "up",
538
- "upon",
539
- "upwards",
540
- "use",
541
- "used",
542
- "useful",
543
- "using",
544
- "usually",
545
- "value",
546
- "various",
547
- "versus",
548
- "very",
549
- "via",
550
- "viz",
551
- "want",
552
- "was",
553
- "wasnt",
554
- "way",
555
- "we",
556
- "wed",
557
- "welcome",
558
- "well",
559
- "went",
560
- "were",
561
- "werent",
562
- "weve",
563
- "what",
564
- "whatever",
565
- "whatll",
566
- "whatve",
567
- "when",
568
- "whence",
569
- "whenever",
570
- "where",
571
- "whereafter",
572
- "whereas",
573
- "whereby",
574
- "wherein",
575
- "whereupon",
576
- "wherever",
577
- "whether",
578
- "which",
579
- "whichever",
580
- "while",
581
- "whilst",
582
- "whither",
583
- "who",
584
- "whod",
585
- "whoever",
586
- "whole",
587
- "wholl",
588
- "whom",
589
- "whomever",
590
- "whose",
591
- "why",
592
- "will",
593
- "willing",
594
- "wish",
595
- "with",
596
- "within",
597
- "without",
598
- "wonder",
599
- "wont",
600
- "would",
601
- "wouldnt",
602
- "website",
603
- "yes",
604
- "yet",
605
- "you",
606
- "youd",
607
- "youll",
608
- "your",
609
- "youre",
610
- "yourself",
611
- "yourselves",
612
- "youve",
613
- "zero",
614
- ]
 
10
  LANGUAGE_GROUP_WEIGHTS = {group: float(cfg["weight"]) for group, cfg in LANGUAGE_BUCKETS.items()}
11
  LANGUAGE_GROUP_MIN_CHARS = {group: int(cfg["min_chars"]) for group, cfg in LANGUAGE_BUCKETS.items()}
12
  LATIN_GROUPS = {group for group, cfg in LANGUAGE_BUCKETS.items() if cfg.get("latin")}
13
+ LANG_ALIASES = {
14
+ "nn": "no",
15
+ }
16
 
17
  LANGS_JSON = Path(__file__).with_name("all_langs.json")
18
 
 
26
  LANG_TO_GROUP = {lang: group for group, langs in LANGUAGE_GROUPS.items() for lang in langs}
27
 
28
 
29
+ def canonical_lang(lang: str) -> str:
30
+ return LANG_ALIASES.get(lang, lang)
31
+
32
+
33
  def write_all_langs_json(path: str | os.PathLike[str] = LANGS_JSON) -> None:
34
  """Write the canonical ALL_LANGS list to JSON if it is missing."""
35
  path = Path(path)
 
48
  if isinstance(langs, list) and all(isinstance(lang, str) for lang in langs):
49
  return langs
50
  write_all_langs_json(path)
51
+ return ALL_LANGS[:]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
source_config.py CHANGED
@@ -1,145 +1,174 @@
1
  from __future__ import annotations
2
-
3
  LANGUAGE_BUCKETS = {
4
  # ~41% of CC — intentionally capped to avoid crowding out other languages
5
  "English": {
6
  "langs": ["en"],
7
- "weight": 2.5,
8
  "min_chars": 2_000,
9
  "latin": True,
10
  },
11
  # ~6.3% of CC — was badly underweighted relative to German/French
12
  "Russian": {
13
  "langs": ["ru"],
14
- "weight": 1.8,
15
  "min_chars": 2_000,
16
  "latin": False,
17
  },
18
  # ~5.9% of CC
19
  "German": {
20
  "langs": ["de"],
21
- "weight": 1.8,
22
  "min_chars": 2_000,
23
  "latin": True,
24
  },
25
  # ~5.7% of CC — bumped up from 1.7 to match its actual footprint
26
  "Japanese": {
27
  "langs": ["ja"],
28
- "weight": 1.8,
29
  "min_chars": 1_200,
30
  "latin": False,
31
  },
32
  # ~5.0% of CC — CC likely undercounts due to Great Firewall
33
  "Chinese": {
34
  "langs": ["zh"],
35
- "weight": 1.8,
36
  "min_chars": 1_200,
37
  "latin": False,
38
  },
39
  # ~4.6% of CC
40
  "French": {
41
  "langs": ["fr"],
42
- "weight": 1.8,
43
  "min_chars": 2_000,
44
  "latin": True,
45
  },
46
  # ~4.6% of CC
47
  "Spanish": {
48
  "langs": ["es"],
49
- "weight": 1.8,
50
  "min_chars": 2_000,
51
  "latin": True,
52
  },
53
  # ~2.5% of CC
54
  "Portuguese": {
55
  "langs": ["pt"],
56
- "weight": 1.6,
57
  "min_chars": 2_000,
58
  "latin": True,
59
  },
60
  # ~2.4% of CC
61
  "Italian": {
62
  "langs": ["it"],
63
- "weight": 1.5,
64
  "min_chars": 2_000,
65
  "latin": True,
66
  },
67
  # ~2.0% of CC — split out from CentralEuropeanLatin; rivals Italian/Portuguese
68
  "Polish": {
69
  "langs": ["pl"],
70
- "weight": 1.5,
71
  "min_chars": 2_000,
72
  "latin": True,
73
  },
74
  # ~1.8% of CC — was significantly underweighted at 1.15
75
  "Dutch": {
76
  "langs": ["nl"],
77
- "weight": 1.5,
78
  "min_chars": 2_000,
79
  "latin": True,
80
  },
81
  # ~1.2% of CC — split out from CentralEuropeanLatin; large internet population
82
  "Turkish": {
83
  "langs": ["tr"],
84
- "weight": 1.4,
85
  "min_chars": 2_000,
86
  "latin": True,
87
  },
88
  # ind ~1.1%, vie ~1.05% of CC
89
  "SoutheastAsianLatin": {
90
  "langs": ["vi", "id", "ms", "sq", "la"],
91
- "weight": 1.4,
92
  "min_chars": 2_000,
93
  "latin": True,
94
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
95
  # ces ~1.14%, ron ~0.53%, hun ~0.52% of CC — smaller tier after splitting out pl/tr
96
  "CentralEuropeanLatin": {
97
  "langs": ["cs", "ro", "hu"],
98
- "weight": 1.2,
99
  "min_chars": 2_000,
100
  "latin": True,
101
  },
102
  # ~0.81% of CC — was overweighted at 1.7
103
  "Korean": {
104
  "langs": ["ko"],
105
- "weight": 1.3,
106
  "min_chars": 1_200,
107
  "latin": False,
108
  },
109
  # ukr ~0.70%, bel ~0.017% of CC
110
  "EastSlavicCyrillic": {
111
  "langs": ["uk", "be"],
112
- "weight": 1.15,
113
  "min_chars": 2_000,
114
  "latin": False,
115
  },
116
  # ~0.65% of CC — upweighted relative to CC share given speaker population
117
  "Arabic": {
118
  "langs": ["ar"],
119
- "weight": 1.35,
120
  "min_chars": 2_000,
121
  "latin": False,
122
  },
123
- # sv ~0.7%, dan ~0.51%, nor+nno ~0.33%, fin ~0.37%, isl ~0.04%, afr ~0.01%
 
 
 
 
 
 
124
  # combined ~2.0% of CC — was drastically overweighted at 6.0
125
  # note: Swedish Wikipedia is heavily bot-generated stubs, don't rely on article count
126
  "NordicCore": {
127
- "langs": ["sv", "da", "no", "is", "af", "fi"],
128
- "weight": 1.8,
129
  "min_chars": 2_000,
130
  "latin": True,
131
  },
132
  # bul ~0.27%, srp ~0.25%, mkd ~0.037% of CC
133
  "BalkanCyrillic": {
134
  "langs": ["bg", "sr", "mk"],
135
- "weight": 1.0,
136
  "min_chars": 2_000,
137
  "latin": False,
138
  },
139
  # fas ~0.20% of CC (ignore the one anomalous crawl spike)
140
  "ArabicOther": {
141
  "langs": ["fa", "ps", "sd", "ug"],
142
- "weight": 0.9,
143
  "min_chars": 2_000,
144
  "latin": False,
145
  },
@@ -153,104 +182,70 @@ LANGUAGE_BUCKETS = {
153
  },
154
  # combined ~0.27% of CC — upweighted for script diversity
155
  "IndicOther": {
156
- "langs": ["ur", "bn", "ta", "te", "mr", "gu", "kn", "ml", "pa", "as", "or"],
157
- "weight": 0.9,
 
 
 
 
 
 
 
 
 
 
 
 
 
158
  "min_chars": 2_000,
159
  "latin": False,
160
  },
161
  # kk ~0.038%, mn ~0.016% of CC — very thin corpus, weight is already a large relative boost
162
- "CentralAsianCyrillic": {
163
- "langs": ["kk", "mn"],
164
- "weight": 0.9,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
165
  "min_chars": 2_000,
166
  "latin": False,
167
  },
168
  "AfricanLatin": {
169
- "langs": ["sw", "tl", "eu"],
170
- "weight": 0.8,
 
 
 
 
 
 
171
  "min_chars": 1_500,
172
  "latin": True,
173
  },
174
- # el ~0.55%, he ~0.24%, th ~0.38%, hy ~0.033%, ka ~0.044% etc. — combined ~1%+
175
- # nudged up slightly from 0.8 given Greek and Thai have meaningful CC presence
176
- "OtherScripts": {
177
- "langs": ["el", "he", "hy", "ka", "am", "km", "lo", "my", "th", "si", "bo", "ti", "dv"],
178
- "weight": 0.9,
 
 
 
 
 
 
179
  "min_chars": 2_000,
180
  "latin": False,
181
  },
182
- }
183
-
184
- POOL = {
185
- "wiki": {
186
- "reserve": 0.60,
187
- "min": 4,
188
- "max": 120_000,
189
- },
190
- "smol": {
191
- "reserve": 0.95,
192
- "min": 1,
193
- "max": 1_000,
194
- },
195
- "ft": {
196
- "reserve": 0.60,
197
- "min": 1,
198
- "max": 30_000,
199
- },
200
- }
201
-
202
- DOC_MIX = {
203
- "pure": {
204
- "fraction": 0.60,
205
- "pool": "reserve",
206
- "min_sentences": 1,
207
- "max_sentences": 4,
208
- "strip_punct_prob": 0.10,
209
- },
210
- "homogeneous": {
211
- "fraction": 0.30,
212
- "pool": "main",
213
- "min_sentences": 2,
214
- "max_sentences": 6,
215
- "strip_punct_prob": 0.15,
216
- },
217
- "mixed": {
218
- "fraction": 0.10,
219
- "pool": "main",
220
- "min_segments": 2,
221
- "max_segments": 4,
222
- "strip_punct_prob": 0.25,
223
- "swap_prob": 0.06,
224
- "o_inject_prob": 0.06,
225
- "allow_repeated_langs": True,
226
- },
227
- }
228
-
229
- SMOL = {
230
- "use": True,
231
- "rebuild": False,
232
- }
233
-
234
- FT = {
235
- "use": True,
236
- "rebuild": False,
237
- "max_lang": 50_000,
238
- "overflow_lang": 75_000,
239
- "max_row": 50_000,
240
- "miss": 1_000,
241
- "include_en": True,
242
- "langs": {"en", "es", "fr", "pt", "it", "nl", "de", "sv", "da", "id", "ms"},
243
- }
244
- FT["every"] = len(FT["langs"])
245
-
246
- RUN = {
247
- "len": 512,
248
- "target": 2_500_000, # synthetic mixed-language training examples to generate
249
- "syn_cache": True,
250
- "syn_rebuild": False,
251
- "tok_cache": True,
252
- "tok_rebuild": False,
253
- "tok_skip_check": False,
254
- "retry": 8,
255
- "preview": 2_000,
256
- }
 
1
  from __future__ import annotations
 
2
  LANGUAGE_BUCKETS = {
3
  # ~41% of CC — intentionally capped to avoid crowding out other languages
4
  "English": {
5
  "langs": ["en"],
6
+ "weight": 2.9,
7
  "min_chars": 2_000,
8
  "latin": True,
9
  },
10
  # ~6.3% of CC — was badly underweighted relative to German/French
11
  "Russian": {
12
  "langs": ["ru"],
13
+ "weight": 1.95,
14
  "min_chars": 2_000,
15
  "latin": False,
16
  },
17
  # ~5.9% of CC
18
  "German": {
19
  "langs": ["de"],
20
+ "weight": 1.9,
21
  "min_chars": 2_000,
22
  "latin": True,
23
  },
24
  # ~5.7% of CC — bumped up from 1.7 to match its actual footprint
25
  "Japanese": {
26
  "langs": ["ja"],
27
+ "weight": 1.9,
28
  "min_chars": 1_200,
29
  "latin": False,
30
  },
31
  # ~5.0% of CC — CC likely undercounts due to Great Firewall
32
  "Chinese": {
33
  "langs": ["zh"],
34
+ "weight": 1.9,
35
  "min_chars": 1_200,
36
  "latin": False,
37
  },
38
  # ~4.6% of CC
39
  "French": {
40
  "langs": ["fr"],
41
+ "weight": 1.9,
42
  "min_chars": 2_000,
43
  "latin": True,
44
  },
45
  # ~4.6% of CC
46
  "Spanish": {
47
  "langs": ["es"],
48
+ "weight": 1.9,
49
  "min_chars": 2_000,
50
  "latin": True,
51
  },
52
  # ~2.5% of CC
53
  "Portuguese": {
54
  "langs": ["pt"],
55
+ "weight": 1.7,
56
  "min_chars": 2_000,
57
  "latin": True,
58
  },
59
  # ~2.4% of CC
60
  "Italian": {
61
  "langs": ["it"],
62
+ "weight": 1.6,
63
  "min_chars": 2_000,
64
  "latin": True,
65
  },
66
  # ~2.0% of CC — split out from CentralEuropeanLatin; rivals Italian/Portuguese
67
  "Polish": {
68
  "langs": ["pl"],
69
+ "weight": 1.55,
70
  "min_chars": 2_000,
71
  "latin": True,
72
  },
73
  # ~1.8% of CC — was significantly underweighted at 1.15
74
  "Dutch": {
75
  "langs": ["nl"],
76
+ "weight": 1.55,
77
  "min_chars": 2_000,
78
  "latin": True,
79
  },
80
  # ~1.2% of CC — split out from CentralEuropeanLatin; large internet population
81
  "Turkish": {
82
  "langs": ["tr"],
83
+ "weight": 1.45,
84
  "min_chars": 2_000,
85
  "latin": True,
86
  },
87
  # ind ~1.1%, vie ~1.05% of CC
88
  "SoutheastAsianLatin": {
89
  "langs": ["vi", "id", "ms", "sq", "la"],
90
+ "weight": 1.55,
91
  "min_chars": 2_000,
92
  "latin": True,
93
  },
94
+ "WesternLatin": {
95
+ "langs": ["ca", "gl", "oc"],
96
+ "weight": 1.2,
97
+ "min_chars": 1_500,
98
+ "latin": True,
99
+ },
100
+ "CelticLatin": {
101
+ "langs": ["br", "ga", "gd", "cy"],
102
+ "weight": 1.3,
103
+ "min_chars": 1_500,
104
+ "latin": True,
105
+ },
106
+ "AdriaticLatin": {
107
+ "langs": ["bs", "hr", "sl", "sk"],
108
+ "weight": 1.4,
109
+ "min_chars": 1_500,
110
+ "latin": True,
111
+ },
112
+ "BalticLatin": {
113
+ "langs": ["et", "lv", "lt"],
114
+ "weight": 1.2,
115
+ "min_chars": 1_500,
116
+ "latin": True,
117
+ },
118
  # ces ~1.14%, ron ~0.53%, hun ~0.52% of CC — smaller tier after splitting out pl/tr
119
  "CentralEuropeanLatin": {
120
  "langs": ["cs", "ro", "hu"],
121
+ "weight": 1.3,
122
  "min_chars": 2_000,
123
  "latin": True,
124
  },
125
  # ~0.81% of CC — was overweighted at 1.7
126
  "Korean": {
127
  "langs": ["ko"],
128
+ "weight": 1.35,
129
  "min_chars": 1_200,
130
  "latin": False,
131
  },
132
  # ukr ~0.70%, bel ~0.017% of CC
133
  "EastSlavicCyrillic": {
134
  "langs": ["uk", "be"],
135
+ "weight": 1.7,
136
  "min_chars": 2_000,
137
  "latin": False,
138
  },
139
  # ~0.65% of CC — upweighted relative to CC share given speaker population
140
  "Arabic": {
141
  "langs": ["ar"],
142
+ "weight": 1.4,
143
  "min_chars": 2_000,
144
  "latin": False,
145
  },
146
+ "Norwegian": {
147
+ "langs": ["no"],
148
+ "weight": 1.0,
149
+ "min_chars": 2_000,
150
+ "latin": True,
151
+ },
152
+ # sv ~0.7%, dan ~0.51%, fin ~0.37%, isl ~0.04%, afr ~0.01%
153
  # combined ~1.6% of CC (Norwegian now has its own bucket) — was drastically overweighted at 6.0
154
  # note: Swedish Wikipedia is heavily bot-generated stubs, don't rely on article count
155
  "NordicCore": {
156
+ "langs": ["sv", "da", "is", "af", "fi"],
157
+ "weight": 2.1,
158
  "min_chars": 2_000,
159
  "latin": True,
160
  },
161
  # bul ~0.27%, srp ~0.25%, mkd ~0.037% of CC
162
  "BalkanCyrillic": {
163
  "langs": ["bg", "sr", "mk"],
164
+ "weight": 1.05,
165
  "min_chars": 2_000,
166
  "latin": False,
167
  },
168
  # fas ~0.20% of CC (ignore the one anomalous crawl spike)
169
  "ArabicOther": {
170
  "langs": ["fa", "ps", "sd", "ug"],
171
+ "weight": 0.95,
172
  "min_chars": 2_000,
173
  "latin": False,
174
  },
 
182
  },
183
  # combined ~0.27% of CC — upweighted for script diversity
184
  "IndicOther": {
185
+ "langs": [
186
+ "ur",
187
+ "bn",
188
+ "ta",
189
+ "te",
190
+ "mr",
191
+ "gu",
192
+ "kn",
193
+ "ml",
194
+ "pa",
195
+ "as",
196
+ "or",
197
+ "ne",
198
+ ],
199
+ "weight": 0.95,
200
  "min_chars": 2_000,
201
  "latin": False,
202
  },
203
  # kk ~0.038%, mn ~0.016% of CC (shares for tt, ky, tg, ba, ce are not recorded here) — very thin corpus, weight is already a large relative boost
204
+ "CentralAsianCaucusCyrillic": {
205
+ "langs": ["kk", "mn", "tt", "ky", "tg", "ba", "ce"],
206
+ "weight": 1.1,
207
+ "min_chars": 2_000,
208
+ "latin": False,
209
+ },
210
+ # Kurdish is split by script/source:
211
+ # - ku: Wikipedia / Latin-script Kurdish
212
+ # - ckb: FineTranslations / Arabic-script Kurdish
213
+ "KurdishLatin": {
214
+ "langs": ["ku"],
215
+ "weight": 0.45,
216
+ "min_chars": 1_500,
217
+ "latin": True,
218
+ },
219
+ "KurdishArabic": {
220
+ "langs": ["ckb"],
221
+ "weight": 0.45,
222
  "min_chars": 2_000,
223
  "latin": False,
224
  },
225
  "AfricanLatin": {
226
+ "langs": ["sw", "tl", "eu", "yo", "zu", "ny"],
227
+ "weight": 1.0,
228
+ "min_chars": 1_500,
229
+ "latin": True,
230
+ },
231
+ "PeripheralLatin": {
232
+ "langs": ["eo", "jv", "lb", "mg", "mt", "om", "rm", "so", "su", "uz"],
233
+ "weight": 1.0,
234
  "min_chars": 1_500,
235
  "latin": True,
236
  },
237
+ # Split the remaining non-Latin scripts into two buckets to keep
238
+ # Greco-Semitic/Caucasus-style scripts separate from Brahmic/Tibetan ones.
239
+ "OtherScriptsWest": {
240
+ "langs": ["el", "he", "hy", "ka", "am", "ti", "dv", "hbo", "grc"],
241
+ "weight": 1.0,
242
+ "min_chars": 2_000,
243
+ "latin": False,
244
+ },
245
+ "OtherScriptsEast": {
246
+ "langs": ["km", "lo", "my", "th", "si", "bo"],
247
+ "weight": 1.0,
248
  "min_chars": 2_000,
249
  "latin": False,
250
  },
251
+ }