Commit ·
5d38774
1
Parent(s): 930b96a
update
Browse files- language.py +8 -571
- source_config.py +105 -110
language.py
CHANGED
|
@@ -10,6 +10,9 @@ LANGUAGE_GROUPS = {group: cfg["langs"] for group, cfg in LANGUAGE_BUCKETS.items(
|
|
| 10 |
LANGUAGE_GROUP_WEIGHTS = {group: float(cfg["weight"]) for group, cfg in LANGUAGE_BUCKETS.items()}
|
| 11 |
LANGUAGE_GROUP_MIN_CHARS = {group: int(cfg["min_chars"]) for group, cfg in LANGUAGE_BUCKETS.items()}
|
| 12 |
LATIN_GROUPS = {group for group, cfg in LANGUAGE_BUCKETS.items() if cfg.get("latin")}
|
|
|
|
|
|
|
|
|
|
| 13 |
|
| 14 |
LANGS_JSON = Path(__file__).with_name("all_langs.json")
|
| 15 |
|
|
@@ -23,6 +26,10 @@ ALL_LANGS = list(LANG_ISO2_TO_ISO3.keys())
|
|
| 23 |
LANG_TO_GROUP = {lang: group for group, langs in LANGUAGE_GROUPS.items() for lang in langs}
|
| 24 |
|
| 25 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 26 |
def write_all_langs_json(path: str | os.PathLike[str] = LANGS_JSON) -> None:
|
| 27 |
"""Write the canonical ALL_LANGS list to JSON if it is missing."""
|
| 28 |
path = Path(path)
|
|
@@ -41,574 +48,4 @@ def load_all_langs(path: str | os.PathLike[str] = LANGS_JSON) -> list[str]:
|
|
| 41 |
if isinstance(langs, list) and all(isinstance(lang, str) for lang in langs):
|
| 42 |
return langs
|
| 43 |
write_all_langs_json(path)
|
| 44 |
-
return ALL_LANGS[:]
|
| 45 |
-
|
| 46 |
-
ENGLISH_STOP_WORDS = [
|
| 47 |
-
"able",
|
| 48 |
-
"about",
|
| 49 |
-
"above",
|
| 50 |
-
"abroad",
|
| 51 |
-
"according",
|
| 52 |
-
"accordingly",
|
| 53 |
-
"across",
|
| 54 |
-
"actually",
|
| 55 |
-
"after",
|
| 56 |
-
"afterwards",
|
| 57 |
-
"again",
|
| 58 |
-
"against",
|
| 59 |
-
"ago",
|
| 60 |
-
"ahead",
|
| 61 |
-
"aint",
|
| 62 |
-
"all",
|
| 63 |
-
"allow",
|
| 64 |
-
"almost",
|
| 65 |
-
"alone",
|
| 66 |
-
"along",
|
| 67 |
-
"alongside",
|
| 68 |
-
"already",
|
| 69 |
-
"also",
|
| 70 |
-
"although",
|
| 71 |
-
"always",
|
| 72 |
-
"am",
|
| 73 |
-
"amid",
|
| 74 |
-
"amidst",
|
| 75 |
-
"among",
|
| 76 |
-
"amongst",
|
| 77 |
-
"an",
|
| 78 |
-
"and",
|
| 79 |
-
"another",
|
| 80 |
-
"any",
|
| 81 |
-
"anybody",
|
| 82 |
-
"anyhow",
|
| 83 |
-
"anyone",
|
| 84 |
-
"anything",
|
| 85 |
-
"anyway",
|
| 86 |
-
"anyways",
|
| 87 |
-
"anywhere",
|
| 88 |
-
"apart",
|
| 89 |
-
"appear",
|
| 90 |
-
"appreciate",
|
| 91 |
-
"appropriate",
|
| 92 |
-
"app",
|
| 93 |
-
"are",
|
| 94 |
-
"arent",
|
| 95 |
-
"aren",
|
| 96 |
-
"around",
|
| 97 |
-
"as",
|
| 98 |
-
"aside",
|
| 99 |
-
"ask",
|
| 100 |
-
"asking",
|
| 101 |
-
"associated",
|
| 102 |
-
"at",
|
| 103 |
-
"available",
|
| 104 |
-
"away",
|
| 105 |
-
"awfully",
|
| 106 |
-
"back",
|
| 107 |
-
"backward",
|
| 108 |
-
"be",
|
| 109 |
-
"became",
|
| 110 |
-
"because",
|
| 111 |
-
"become",
|
| 112 |
-
"becoming",
|
| 113 |
-
"been",
|
| 114 |
-
"before",
|
| 115 |
-
"beforehand",
|
| 116 |
-
"begin",
|
| 117 |
-
"behind",
|
| 118 |
-
"being",
|
| 119 |
-
"believe",
|
| 120 |
-
"below",
|
| 121 |
-
"beside",
|
| 122 |
-
"best",
|
| 123 |
-
"better",
|
| 124 |
-
"between",
|
| 125 |
-
"beyond",
|
| 126 |
-
"both",
|
| 127 |
-
"brief",
|
| 128 |
-
"but",
|
| 129 |
-
"by",
|
| 130 |
-
"came",
|
| 131 |
-
"can",
|
| 132 |
-
"cannot",
|
| 133 |
-
"cant",
|
| 134 |
-
"caption",
|
| 135 |
-
"cause",
|
| 136 |
-
"certain",
|
| 137 |
-
"certainly",
|
| 138 |
-
"changes",
|
| 139 |
-
"clearly",
|
| 140 |
-
"cmon",
|
| 141 |
-
"com",
|
| 142 |
-
"come",
|
| 143 |
-
"concerning",
|
| 144 |
-
"consequently",
|
| 145 |
-
"consider",
|
| 146 |
-
"considering",
|
| 147 |
-
"contain",
|
| 148 |
-
"containing",
|
| 149 |
-
"corresponding",
|
| 150 |
-
"could",
|
| 151 |
-
"couldnt",
|
| 152 |
-
"course",
|
| 153 |
-
"currently",
|
| 154 |
-
"definitely",
|
| 155 |
-
"described",
|
| 156 |
-
"despite",
|
| 157 |
-
"did",
|
| 158 |
-
"didnt",
|
| 159 |
-
"different",
|
| 160 |
-
"directly",
|
| 161 |
-
"do",
|
| 162 |
-
"does",
|
| 163 |
-
"doesnt",
|
| 164 |
-
"doing",
|
| 165 |
-
"done",
|
| 166 |
-
"dont",
|
| 167 |
-
"down",
|
| 168 |
-
"downward",
|
| 169 |
-
"download",
|
| 170 |
-
"during",
|
| 171 |
-
"each",
|
| 172 |
-
"eight",
|
| 173 |
-
"eighty",
|
| 174 |
-
"either",
|
| 175 |
-
"else",
|
| 176 |
-
"elsewhere",
|
| 177 |
-
"end",
|
| 178 |
-
"ending",
|
| 179 |
-
"enough",
|
| 180 |
-
"entirely",
|
| 181 |
-
"especially",
|
| 182 |
-
"etc",
|
| 183 |
-
"even",
|
| 184 |
-
"ever",
|
| 185 |
-
"evermore",
|
| 186 |
-
"every",
|
| 187 |
-
"everybody",
|
| 188 |
-
"everyone",
|
| 189 |
-
"everything",
|
| 190 |
-
"everywhere",
|
| 191 |
-
"exactly",
|
| 192 |
-
"example",
|
| 193 |
-
"except",
|
| 194 |
-
"fairly",
|
| 195 |
-
"far",
|
| 196 |
-
"farther",
|
| 197 |
-
"few",
|
| 198 |
-
"fewer",
|
| 199 |
-
"fifth",
|
| 200 |
-
"first",
|
| 201 |
-
"five",
|
| 202 |
-
"followed",
|
| 203 |
-
"following",
|
| 204 |
-
"follows",
|
| 205 |
-
"for",
|
| 206 |
-
"forever",
|
| 207 |
-
"former",
|
| 208 |
-
"formerly",
|
| 209 |
-
"forth",
|
| 210 |
-
"forward",
|
| 211 |
-
"found",
|
| 212 |
-
"four",
|
| 213 |
-
"from",
|
| 214 |
-
"free",
|
| 215 |
-
"further",
|
| 216 |
-
"furthermore",
|
| 217 |
-
"get",
|
| 218 |
-
"gets",
|
| 219 |
-
"getting",
|
| 220 |
-
"given",
|
| 221 |
-
"gives",
|
| 222 |
-
"go",
|
| 223 |
-
"goes",
|
| 224 |
-
"going",
|
| 225 |
-
"gone",
|
| 226 |
-
"got",
|
| 227 |
-
"gotten",
|
| 228 |
-
"greetings",
|
| 229 |
-
"had",
|
| 230 |
-
"hadnt",
|
| 231 |
-
"half",
|
| 232 |
-
"happens",
|
| 233 |
-
"hardly",
|
| 234 |
-
"has",
|
| 235 |
-
"hasnt",
|
| 236 |
-
"have",
|
| 237 |
-
"havent",
|
| 238 |
-
"having",
|
| 239 |
-
"he",
|
| 240 |
-
"hed",
|
| 241 |
-
"hell",
|
| 242 |
-
"hello",
|
| 243 |
-
"help",
|
| 244 |
-
"hence",
|
| 245 |
-
"her",
|
| 246 |
-
"here",
|
| 247 |
-
"hereafter",
|
| 248 |
-
"hereby",
|
| 249 |
-
"herein",
|
| 250 |
-
"hereupon",
|
| 251 |
-
"herself",
|
| 252 |
-
"hi",
|
| 253 |
-
"him",
|
| 254 |
-
"himself",
|
| 255 |
-
"his",
|
| 256 |
-
"hither",
|
| 257 |
-
"hopefully",
|
| 258 |
-
"how",
|
| 259 |
-
"howbeit",
|
| 260 |
-
"however",
|
| 261 |
-
"hundred",
|
| 262 |
-
"id",
|
| 263 |
-
"ie",
|
| 264 |
-
"if",
|
| 265 |
-
"ignored",
|
| 266 |
-
"ill",
|
| 267 |
-
"im",
|
| 268 |
-
"immediate",
|
| 269 |
-
"in",
|
| 270 |
-
"inasmuch",
|
| 271 |
-
"inc",
|
| 272 |
-
"indeed",
|
| 273 |
-
"indicate",
|
| 274 |
-
"indicated",
|
| 275 |
-
"inner",
|
| 276 |
-
"inside",
|
| 277 |
-
"insofar",
|
| 278 |
-
"instead",
|
| 279 |
-
"into",
|
| 280 |
-
"inward",
|
| 281 |
-
"is",
|
| 282 |
-
"isnt",
|
| 283 |
-
"it",
|
| 284 |
-
"itd",
|
| 285 |
-
"itll",
|
| 286 |
-
"itself",
|
| 287 |
-
"ive",
|
| 288 |
-
"just",
|
| 289 |
-
"keep",
|
| 290 |
-
"keeps",
|
| 291 |
-
"kept",
|
| 292 |
-
"know",
|
| 293 |
-
"known",
|
| 294 |
-
"last",
|
| 295 |
-
"lately",
|
| 296 |
-
"later",
|
| 297 |
-
"latter",
|
| 298 |
-
"least",
|
| 299 |
-
"less",
|
| 300 |
-
"lest",
|
| 301 |
-
"let",
|
| 302 |
-
"like",
|
| 303 |
-
"liked",
|
| 304 |
-
"likely",
|
| 305 |
-
"likewise",
|
| 306 |
-
"little",
|
| 307 |
-
"look",
|
| 308 |
-
"looking",
|
| 309 |
-
"low",
|
| 310 |
-
"lower",
|
| 311 |
-
"ltd",
|
| 312 |
-
"made",
|
| 313 |
-
"mainly",
|
| 314 |
-
"make",
|
| 315 |
-
"many",
|
| 316 |
-
"may",
|
| 317 |
-
"maybe",
|
| 318 |
-
"maynt",
|
| 319 |
-
"me",
|
| 320 |
-
"mean",
|
| 321 |
-
"meantime",
|
| 322 |
-
"meanwhile",
|
| 323 |
-
"merely",
|
| 324 |
-
"might",
|
| 325 |
-
"mightnt",
|
| 326 |
-
"mine",
|
| 327 |
-
"minus",
|
| 328 |
-
"miss",
|
| 329 |
-
"more",
|
| 330 |
-
"moreover",
|
| 331 |
-
"most",
|
| 332 |
-
"mostly",
|
| 333 |
-
"much",
|
| 334 |
-
"must",
|
| 335 |
-
"mustnt",
|
| 336 |
-
"my",
|
| 337 |
-
"myself",
|
| 338 |
-
"name",
|
| 339 |
-
"namely",
|
| 340 |
-
"near",
|
| 341 |
-
"nearly",
|
| 342 |
-
"necessary",
|
| 343 |
-
"need",
|
| 344 |
-
"neednt",
|
| 345 |
-
"neither",
|
| 346 |
-
"never",
|
| 347 |
-
"neverless",
|
| 348 |
-
"nevertheless",
|
| 349 |
-
"new",
|
| 350 |
-
"next",
|
| 351 |
-
"nine",
|
| 352 |
-
"ninety",
|
| 353 |
-
"no",
|
| 354 |
-
"nobody",
|
| 355 |
-
"non",
|
| 356 |
-
"none",
|
| 357 |
-
"nonetheless",
|
| 358 |
-
"noone",
|
| 359 |
-
"no-one",
|
| 360 |
-
"nor",
|
| 361 |
-
"normally",
|
| 362 |
-
"not",
|
| 363 |
-
"nothing",
|
| 364 |
-
"notwithstanding",
|
| 365 |
-
"novel",
|
| 366 |
-
"now",
|
| 367 |
-
"nowhere",
|
| 368 |
-
"obviously",
|
| 369 |
-
"of",
|
| 370 |
-
"off",
|
| 371 |
-
"often",
|
| 372 |
-
"oh",
|
| 373 |
-
"ok",
|
| 374 |
-
"okay",
|
| 375 |
-
"old",
|
| 376 |
-
"on",
|
| 377 |
-
"once",
|
| 378 |
-
"one",
|
| 379 |
-
"only",
|
| 380 |
-
"onto",
|
| 381 |
-
"opposite",
|
| 382 |
-
"or",
|
| 383 |
-
"other",
|
| 384 |
-
"otherwise",
|
| 385 |
-
"ought",
|
| 386 |
-
"oughtnt",
|
| 387 |
-
"our",
|
| 388 |
-
"ourselves",
|
| 389 |
-
"out",
|
| 390 |
-
"outside",
|
| 391 |
-
"over",
|
| 392 |
-
"overall",
|
| 393 |
-
"own",
|
| 394 |
-
"particular",
|
| 395 |
-
"particularly",
|
| 396 |
-
"past",
|
| 397 |
-
"per",
|
| 398 |
-
"perhaps",
|
| 399 |
-
"placed",
|
| 400 |
-
"please",
|
| 401 |
-
"plus",
|
| 402 |
-
"possible",
|
| 403 |
-
"presumably",
|
| 404 |
-
"probably",
|
| 405 |
-
"provided",
|
| 406 |
-
"provide",
|
| 407 |
-
"quite",
|
| 408 |
-
"rather",
|
| 409 |
-
"really",
|
| 410 |
-
"reasonably",
|
| 411 |
-
"recent",
|
| 412 |
-
"recently",
|
| 413 |
-
"regarding",
|
| 414 |
-
"regardless",
|
| 415 |
-
"regards",
|
| 416 |
-
"relatively",
|
| 417 |
-
"respectively",
|
| 418 |
-
"right",
|
| 419 |
-
"round",
|
| 420 |
-
"said",
|
| 421 |
-
"same",
|
| 422 |
-
"saw",
|
| 423 |
-
"say",
|
| 424 |
-
"saying",
|
| 425 |
-
"second",
|
| 426 |
-
"secondly",
|
| 427 |
-
"see",
|
| 428 |
-
"seeing",
|
| 429 |
-
"seem",
|
| 430 |
-
"seemed",
|
| 431 |
-
"seeming",
|
| 432 |
-
"seems",
|
| 433 |
-
"seen",
|
| 434 |
-
"self",
|
| 435 |
-
"sensible",
|
| 436 |
-
"sent",
|
| 437 |
-
"serious",
|
| 438 |
-
"seriously",
|
| 439 |
-
"seven",
|
| 440 |
-
"several",
|
| 441 |
-
"shall",
|
| 442 |
-
"shant",
|
| 443 |
-
"she",
|
| 444 |
-
"shed",
|
| 445 |
-
"shell",
|
| 446 |
-
"should",
|
| 447 |
-
"shouldnt",
|
| 448 |
-
"since",
|
| 449 |
-
"six",
|
| 450 |
-
"so",
|
| 451 |
-
"some",
|
| 452 |
-
"somebody",
|
| 453 |
-
"someday",
|
| 454 |
-
"somehow",
|
| 455 |
-
"someone",
|
| 456 |
-
"something",
|
| 457 |
-
"sometime",
|
| 458 |
-
"somewhat",
|
| 459 |
-
"somewhere",
|
| 460 |
-
"soon",
|
| 461 |
-
"sorry",
|
| 462 |
-
"specified",
|
| 463 |
-
"specify",
|
| 464 |
-
"specifying",
|
| 465 |
-
"still",
|
| 466 |
-
"such",
|
| 467 |
-
"sure",
|
| 468 |
-
"take",
|
| 469 |
-
"taken",
|
| 470 |
-
"taking",
|
| 471 |
-
"tell",
|
| 472 |
-
"tends",
|
| 473 |
-
"ten",
|
| 474 |
-
"than",
|
| 475 |
-
"thank",
|
| 476 |
-
"that",
|
| 477 |
-
"thatll",
|
| 478 |
-
"thatve",
|
| 479 |
-
"the",
|
| 480 |
-
"their",
|
| 481 |
-
"them",
|
| 482 |
-
"themselves",
|
| 483 |
-
"then",
|
| 484 |
-
"thence",
|
| 485 |
-
"there",
|
| 486 |
-
"thereafter",
|
| 487 |
-
"thereby",
|
| 488 |
-
"thered",
|
| 489 |
-
"therefore",
|
| 490 |
-
"therein",
|
| 491 |
-
"therell",
|
| 492 |
-
"therere",
|
| 493 |
-
"thereupon",
|
| 494 |
-
"thereve",
|
| 495 |
-
"these",
|
| 496 |
-
"they",
|
| 497 |
-
"theyd",
|
| 498 |
-
"theyll",
|
| 499 |
-
"theyre",
|
| 500 |
-
"theyve",
|
| 501 |
-
"thing",
|
| 502 |
-
"think",
|
| 503 |
-
"third",
|
| 504 |
-
"thirty",
|
| 505 |
-
"this",
|
| 506 |
-
"thorough",
|
| 507 |
-
"thoroughly",
|
| 508 |
-
"those",
|
| 509 |
-
"though",
|
| 510 |
-
"three",
|
| 511 |
-
"through",
|
| 512 |
-
"throughout",
|
| 513 |
-
"thru",
|
| 514 |
-
"thus",
|
| 515 |
-
"till",
|
| 516 |
-
"to",
|
| 517 |
-
"together",
|
| 518 |
-
"too",
|
| 519 |
-
"took",
|
| 520 |
-
"toward",
|
| 521 |
-
"tried",
|
| 522 |
-
"tries",
|
| 523 |
-
"truly",
|
| 524 |
-
"try",
|
| 525 |
-
"trying",
|
| 526 |
-
"twice",
|
| 527 |
-
"two",
|
| 528 |
-
"under",
|
| 529 |
-
"underneath",
|
| 530 |
-
"undoing",
|
| 531 |
-
"unfortunately",
|
| 532 |
-
"unless",
|
| 533 |
-
"unlike",
|
| 534 |
-
"unlikely",
|
| 535 |
-
"until",
|
| 536 |
-
"unto",
|
| 537 |
-
"up",
|
| 538 |
-
"upon",
|
| 539 |
-
"upwards",
|
| 540 |
-
"use",
|
| 541 |
-
"used",
|
| 542 |
-
"useful",
|
| 543 |
-
"using",
|
| 544 |
-
"usually",
|
| 545 |
-
"value",
|
| 546 |
-
"various",
|
| 547 |
-
"versus",
|
| 548 |
-
"very",
|
| 549 |
-
"via",
|
| 550 |
-
"viz",
|
| 551 |
-
"want",
|
| 552 |
-
"was",
|
| 553 |
-
"wasnt",
|
| 554 |
-
"way",
|
| 555 |
-
"we",
|
| 556 |
-
"wed",
|
| 557 |
-
"welcome",
|
| 558 |
-
"well",
|
| 559 |
-
"went",
|
| 560 |
-
"were",
|
| 561 |
-
"werent",
|
| 562 |
-
"weve",
|
| 563 |
-
"what",
|
| 564 |
-
"whatever",
|
| 565 |
-
"whatll",
|
| 566 |
-
"whatve",
|
| 567 |
-
"when",
|
| 568 |
-
"whence",
|
| 569 |
-
"whenever",
|
| 570 |
-
"where",
|
| 571 |
-
"whereafter",
|
| 572 |
-
"whereas",
|
| 573 |
-
"whereby",
|
| 574 |
-
"wherein",
|
| 575 |
-
"whereupon",
|
| 576 |
-
"wherever",
|
| 577 |
-
"whether",
|
| 578 |
-
"which",
|
| 579 |
-
"whichever",
|
| 580 |
-
"while",
|
| 581 |
-
"whilst",
|
| 582 |
-
"whither",
|
| 583 |
-
"who",
|
| 584 |
-
"whod",
|
| 585 |
-
"whoever",
|
| 586 |
-
"whole",
|
| 587 |
-
"wholl",
|
| 588 |
-
"whom",
|
| 589 |
-
"whomever",
|
| 590 |
-
"whose",
|
| 591 |
-
"why",
|
| 592 |
-
"will",
|
| 593 |
-
"willing",
|
| 594 |
-
"wish",
|
| 595 |
-
"with",
|
| 596 |
-
"within",
|
| 597 |
-
"without",
|
| 598 |
-
"wonder",
|
| 599 |
-
"wont",
|
| 600 |
-
"would",
|
| 601 |
-
"wouldnt",
|
| 602 |
-
"website",
|
| 603 |
-
"yes",
|
| 604 |
-
"yet",
|
| 605 |
-
"you",
|
| 606 |
-
"youd",
|
| 607 |
-
"youll",
|
| 608 |
-
"your",
|
| 609 |
-
"youre",
|
| 610 |
-
"yourself",
|
| 611 |
-
"yourselves",
|
| 612 |
-
"youve",
|
| 613 |
-
"zero",
|
| 614 |
-
]
|
|
|
|
| 10 |
LANGUAGE_GROUP_WEIGHTS = {group: float(cfg["weight"]) for group, cfg in LANGUAGE_BUCKETS.items()}
|
| 11 |
LANGUAGE_GROUP_MIN_CHARS = {group: int(cfg["min_chars"]) for group, cfg in LANGUAGE_BUCKETS.items()}
|
| 12 |
LATIN_GROUPS = {group for group, cfg in LANGUAGE_BUCKETS.items() if cfg.get("latin")}
|
| 13 |
+
LANG_ALIASES = {
|
| 14 |
+
"nn": "no",
|
| 15 |
+
}
|
| 16 |
|
| 17 |
LANGS_JSON = Path(__file__).with_name("all_langs.json")
|
| 18 |
|
|
|
|
| 26 |
LANG_TO_GROUP = {lang: group for group, langs in LANGUAGE_GROUPS.items() for lang in langs}
|
| 27 |
|
| 28 |
|
| 29 |
+
def canonical_lang(lang: str) -> str:
    """Return the canonical code for *lang*.

    A code that appears as a key in ``LANG_ALIASES`` is replaced by the
    value it maps to; any other code is returned unchanged.
    """
    return LANG_ALIASES[lang] if lang in LANG_ALIASES else lang
|
| 31 |
+
|
| 32 |
+
|
| 33 |
def write_all_langs_json(path: str | os.PathLike[str] = LANGS_JSON) -> None:
|
| 34 |
"""Write the canonical ALL_LANGS list to JSON if it is missing."""
|
| 35 |
path = Path(path)
|
|
|
|
| 48 |
if isinstance(langs, list) and all(isinstance(lang, str) for lang in langs):
|
| 49 |
return langs
|
| 50 |
write_all_langs_json(path)
|
| 51 |
+
return ALL_LANGS[:]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
source_config.py
CHANGED
|
@@ -1,145 +1,174 @@
|
|
| 1 |
from __future__ import annotations
|
| 2 |
-
|
| 3 |
LANGUAGE_BUCKETS = {
|
| 4 |
# ~41% of CC — intentionally capped to avoid crowding out other languages
|
| 5 |
"English": {
|
| 6 |
"langs": ["en"],
|
| 7 |
-
"weight": 2.
|
| 8 |
"min_chars": 2_000,
|
| 9 |
"latin": True,
|
| 10 |
},
|
| 11 |
# ~6.3% of CC — was badly underweighted relative to German/French
|
| 12 |
"Russian": {
|
| 13 |
"langs": ["ru"],
|
| 14 |
-
"weight": 1.
|
| 15 |
"min_chars": 2_000,
|
| 16 |
"latin": False,
|
| 17 |
},
|
| 18 |
# ~5.9% of CC
|
| 19 |
"German": {
|
| 20 |
"langs": ["de"],
|
| 21 |
-
"weight": 1.
|
| 22 |
"min_chars": 2_000,
|
| 23 |
"latin": True,
|
| 24 |
},
|
| 25 |
# ~5.7% of CC — bumped up from 1.7 to match its actual footprint
|
| 26 |
"Japanese": {
|
| 27 |
"langs": ["ja"],
|
| 28 |
-
"weight": 1.
|
| 29 |
"min_chars": 1_200,
|
| 30 |
"latin": False,
|
| 31 |
},
|
| 32 |
# ~5.0% of CC — CC likely undercounts due to Great Firewall
|
| 33 |
"Chinese": {
|
| 34 |
"langs": ["zh"],
|
| 35 |
-
"weight": 1.
|
| 36 |
"min_chars": 1_200,
|
| 37 |
"latin": False,
|
| 38 |
},
|
| 39 |
# ~4.6% of CC
|
| 40 |
"French": {
|
| 41 |
"langs": ["fr"],
|
| 42 |
-
"weight": 1.
|
| 43 |
"min_chars": 2_000,
|
| 44 |
"latin": True,
|
| 45 |
},
|
| 46 |
# ~4.6% of CC
|
| 47 |
"Spanish": {
|
| 48 |
"langs": ["es"],
|
| 49 |
-
"weight": 1.
|
| 50 |
"min_chars": 2_000,
|
| 51 |
"latin": True,
|
| 52 |
},
|
| 53 |
# ~2.5% of CC
|
| 54 |
"Portuguese": {
|
| 55 |
"langs": ["pt"],
|
| 56 |
-
"weight": 1.
|
| 57 |
"min_chars": 2_000,
|
| 58 |
"latin": True,
|
| 59 |
},
|
| 60 |
# ~2.4% of CC
|
| 61 |
"Italian": {
|
| 62 |
"langs": ["it"],
|
| 63 |
-
"weight": 1.
|
| 64 |
"min_chars": 2_000,
|
| 65 |
"latin": True,
|
| 66 |
},
|
| 67 |
# ~2.0% of CC — split out from CentralEuropeanLatin; rivals Italian/Portuguese
|
| 68 |
"Polish": {
|
| 69 |
"langs": ["pl"],
|
| 70 |
-
"weight": 1.
|
| 71 |
"min_chars": 2_000,
|
| 72 |
"latin": True,
|
| 73 |
},
|
| 74 |
# ~1.8% of CC — was significantly underweighted at 1.15
|
| 75 |
"Dutch": {
|
| 76 |
"langs": ["nl"],
|
| 77 |
-
"weight": 1.
|
| 78 |
"min_chars": 2_000,
|
| 79 |
"latin": True,
|
| 80 |
},
|
| 81 |
# ~1.2% of CC — split out from CentralEuropeanLatin; large internet population
|
| 82 |
"Turkish": {
|
| 83 |
"langs": ["tr"],
|
| 84 |
-
"weight": 1.
|
| 85 |
"min_chars": 2_000,
|
| 86 |
"latin": True,
|
| 87 |
},
|
| 88 |
# ind ~1.1%, vie ~1.05% of CC
|
| 89 |
"SoutheastAsianLatin": {
|
| 90 |
"langs": ["vi", "id", "ms", "sq", "la"],
|
| 91 |
-
"weight": 1.
|
| 92 |
"min_chars": 2_000,
|
| 93 |
"latin": True,
|
| 94 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 95 |
# ces ~1.14%, ron ~0.53%, hun ~0.52% of CC — smaller tier after splitting out pl/tr
|
| 96 |
"CentralEuropeanLatin": {
|
| 97 |
"langs": ["cs", "ro", "hu"],
|
| 98 |
-
"weight": 1.
|
| 99 |
"min_chars": 2_000,
|
| 100 |
"latin": True,
|
| 101 |
},
|
| 102 |
# ~0.81% of CC — was overweighted at 1.7
|
| 103 |
"Korean": {
|
| 104 |
"langs": ["ko"],
|
| 105 |
-
"weight": 1.
|
| 106 |
"min_chars": 1_200,
|
| 107 |
"latin": False,
|
| 108 |
},
|
| 109 |
# ukr ~0.70%, bel ~0.017% of CC
|
| 110 |
"EastSlavicCyrillic": {
|
| 111 |
"langs": ["uk", "be"],
|
| 112 |
-
"weight": 1.
|
| 113 |
"min_chars": 2_000,
|
| 114 |
"latin": False,
|
| 115 |
},
|
| 116 |
# ~0.65% of CC — upweighted relative to CC share given speaker population
|
| 117 |
"Arabic": {
|
| 118 |
"langs": ["ar"],
|
| 119 |
-
"weight": 1.
|
| 120 |
"min_chars": 2_000,
|
| 121 |
"latin": False,
|
| 122 |
},
|
| 123 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 124 |
# combined ~2.0% of CC — was drastically overweighted at 6.0
|
| 125 |
# note: Swedish Wikipedia is heavily bot-generated stubs, don't rely on article count
|
| 126 |
"NordicCore": {
|
| 127 |
-
"langs": ["sv", "da", "
|
| 128 |
-
"weight":
|
| 129 |
"min_chars": 2_000,
|
| 130 |
"latin": True,
|
| 131 |
},
|
| 132 |
# bul ~0.27%, srp ~0.25%, mkd ~0.037% of CC
|
| 133 |
"BalkanCyrillic": {
|
| 134 |
"langs": ["bg", "sr", "mk"],
|
| 135 |
-
"weight": 1.
|
| 136 |
"min_chars": 2_000,
|
| 137 |
"latin": False,
|
| 138 |
},
|
| 139 |
# fas ~0.20% of CC (ignore the one anomalous crawl spike)
|
| 140 |
"ArabicOther": {
|
| 141 |
"langs": ["fa", "ps", "sd", "ug"],
|
| 142 |
-
"weight": 0.
|
| 143 |
"min_chars": 2_000,
|
| 144 |
"latin": False,
|
| 145 |
},
|
|
@@ -153,104 +182,70 @@ LANGUAGE_BUCKETS = {
|
|
| 153 |
},
|
| 154 |
# combined ~0.27% of CC — upweighted for script diversity
|
| 155 |
"IndicOther": {
|
| 156 |
-
"langs": [
|
| 157 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 158 |
"min_chars": 2_000,
|
| 159 |
"latin": False,
|
| 160 |
},
|
| 161 |
# kk ~0.038%, mn ~0.016% of CC — very thin corpus, weight is already a large relative boost
|
| 162 |
-
"
|
| 163 |
-
"langs": ["kk", "mn"],
|
| 164 |
-
"weight":
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 165 |
"min_chars": 2_000,
|
| 166 |
"latin": False,
|
| 167 |
},
|
| 168 |
"AfricanLatin": {
|
| 169 |
-
"langs": ["sw", "tl", "eu"],
|
| 170 |
-
"weight":
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 171 |
"min_chars": 1_500,
|
| 172 |
"latin": True,
|
| 173 |
},
|
| 174 |
-
#
|
| 175 |
-
#
|
| 176 |
-
"
|
| 177 |
-
"langs": ["el", "he", "hy", "ka", "am", "
|
| 178 |
-
"weight":
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 179 |
"min_chars": 2_000,
|
| 180 |
"latin": False,
|
| 181 |
},
|
| 182 |
-
}
|
| 183 |
-
|
| 184 |
-
POOL = {
|
| 185 |
-
"wiki": {
|
| 186 |
-
"reserve": 0.60,
|
| 187 |
-
"min": 4,
|
| 188 |
-
"max": 120_000,
|
| 189 |
-
},
|
| 190 |
-
"smol": {
|
| 191 |
-
"reserve": 0.95,
|
| 192 |
-
"min": 1,
|
| 193 |
-
"max": 1_000,
|
| 194 |
-
},
|
| 195 |
-
"ft": {
|
| 196 |
-
"reserve": 0.60,
|
| 197 |
-
"min": 1,
|
| 198 |
-
"max": 30_000,
|
| 199 |
-
},
|
| 200 |
-
}
|
| 201 |
-
|
| 202 |
-
DOC_MIX = {
|
| 203 |
-
"pure": {
|
| 204 |
-
"fraction": 0.60,
|
| 205 |
-
"pool": "reserve",
|
| 206 |
-
"min_sentences": 1,
|
| 207 |
-
"max_sentences": 4,
|
| 208 |
-
"strip_punct_prob": 0.10,
|
| 209 |
-
},
|
| 210 |
-
"homogeneous": {
|
| 211 |
-
"fraction": 0.30,
|
| 212 |
-
"pool": "main",
|
| 213 |
-
"min_sentences": 2,
|
| 214 |
-
"max_sentences": 6,
|
| 215 |
-
"strip_punct_prob": 0.15,
|
| 216 |
-
},
|
| 217 |
-
"mixed": {
|
| 218 |
-
"fraction": 0.10,
|
| 219 |
-
"pool": "main",
|
| 220 |
-
"min_segments": 2,
|
| 221 |
-
"max_segments": 4,
|
| 222 |
-
"strip_punct_prob": 0.25,
|
| 223 |
-
"swap_prob": 0.06,
|
| 224 |
-
"o_inject_prob": 0.06,
|
| 225 |
-
"allow_repeated_langs": True,
|
| 226 |
-
},
|
| 227 |
-
}
|
| 228 |
-
|
| 229 |
-
SMOL = {
|
| 230 |
-
"use": True,
|
| 231 |
-
"rebuild": False,
|
| 232 |
-
}
|
| 233 |
-
|
| 234 |
-
FT = {
|
| 235 |
-
"use": True,
|
| 236 |
-
"rebuild": False,
|
| 237 |
-
"max_lang": 50_000,
|
| 238 |
-
"overflow_lang": 75_000,
|
| 239 |
-
"max_row": 50_000,
|
| 240 |
-
"miss": 1_000,
|
| 241 |
-
"include_en": True,
|
| 242 |
-
"langs": {"en", "es", "fr", "pt", "it", "nl", "de", "sv", "da", "id", "ms"},
|
| 243 |
-
}
|
| 244 |
-
FT["every"] = len(FT["langs"])
|
| 245 |
-
|
| 246 |
-
RUN = {
|
| 247 |
-
"len": 512,
|
| 248 |
-
"target": 2_500_000, # synthetic mixed-language training examples to generate
|
| 249 |
-
"syn_cache": True,
|
| 250 |
-
"syn_rebuild": False,
|
| 251 |
-
"tok_cache": True,
|
| 252 |
-
"tok_rebuild": False,
|
| 253 |
-
"tok_skip_check": False,
|
| 254 |
-
"retry": 8,
|
| 255 |
-
"preview": 2_000,
|
| 256 |
-
}
|
|
|
|
| 1 |
from __future__ import annotations
|
|
|
|
| 2 |
LANGUAGE_BUCKETS = {
|
| 3 |
# ~41% of CC — intentionally capped to avoid crowding out other languages
|
| 4 |
"English": {
|
| 5 |
"langs": ["en"],
|
| 6 |
+
"weight": 2.9,
|
| 7 |
"min_chars": 2_000,
|
| 8 |
"latin": True,
|
| 9 |
},
|
| 10 |
# ~6.3% of CC — was badly underweighted relative to German/French
|
| 11 |
"Russian": {
|
| 12 |
"langs": ["ru"],
|
| 13 |
+
"weight": 1.95,
|
| 14 |
"min_chars": 2_000,
|
| 15 |
"latin": False,
|
| 16 |
},
|
| 17 |
# ~5.9% of CC
|
| 18 |
"German": {
|
| 19 |
"langs": ["de"],
|
| 20 |
+
"weight": 1.9,
|
| 21 |
"min_chars": 2_000,
|
| 22 |
"latin": True,
|
| 23 |
},
|
| 24 |
# ~5.7% of CC — bumped up from 1.7 to match its actual footprint
|
| 25 |
"Japanese": {
|
| 26 |
"langs": ["ja"],
|
| 27 |
+
"weight": 1.9,
|
| 28 |
"min_chars": 1_200,
|
| 29 |
"latin": False,
|
| 30 |
},
|
| 31 |
# ~5.0% of CC — CC likely undercounts due to Great Firewall
|
| 32 |
"Chinese": {
|
| 33 |
"langs": ["zh"],
|
| 34 |
+
"weight": 1.9,
|
| 35 |
"min_chars": 1_200,
|
| 36 |
"latin": False,
|
| 37 |
},
|
| 38 |
# ~4.6% of CC
|
| 39 |
"French": {
|
| 40 |
"langs": ["fr"],
|
| 41 |
+
"weight": 1.9,
|
| 42 |
"min_chars": 2_000,
|
| 43 |
"latin": True,
|
| 44 |
},
|
| 45 |
# ~4.6% of CC
|
| 46 |
"Spanish": {
|
| 47 |
"langs": ["es"],
|
| 48 |
+
"weight": 1.9,
|
| 49 |
"min_chars": 2_000,
|
| 50 |
"latin": True,
|
| 51 |
},
|
| 52 |
# ~2.5% of CC
|
| 53 |
"Portuguese": {
|
| 54 |
"langs": ["pt"],
|
| 55 |
+
"weight": 1.7,
|
| 56 |
"min_chars": 2_000,
|
| 57 |
"latin": True,
|
| 58 |
},
|
| 59 |
# ~2.4% of CC
|
| 60 |
"Italian": {
|
| 61 |
"langs": ["it"],
|
| 62 |
+
"weight": 1.6,
|
| 63 |
"min_chars": 2_000,
|
| 64 |
"latin": True,
|
| 65 |
},
|
| 66 |
# ~2.0% of CC — split out from CentralEuropeanLatin; rivals Italian/Portuguese
|
| 67 |
"Polish": {
|
| 68 |
"langs": ["pl"],
|
| 69 |
+
"weight": 1.55,
|
| 70 |
"min_chars": 2_000,
|
| 71 |
"latin": True,
|
| 72 |
},
|
| 73 |
# ~1.8% of CC — was significantly underweighted at 1.15
|
| 74 |
"Dutch": {
|
| 75 |
"langs": ["nl"],
|
| 76 |
+
"weight": 1.55,
|
| 77 |
"min_chars": 2_000,
|
| 78 |
"latin": True,
|
| 79 |
},
|
| 80 |
# ~1.2% of CC — split out from CentralEuropeanLatin; large internet population
|
| 81 |
"Turkish": {
|
| 82 |
"langs": ["tr"],
|
| 83 |
+
"weight": 1.45,
|
| 84 |
"min_chars": 2_000,
|
| 85 |
"latin": True,
|
| 86 |
},
|
| 87 |
# ind ~1.1%, vie ~1.05% of CC
|
| 88 |
"SoutheastAsianLatin": {
|
| 89 |
"langs": ["vi", "id", "ms", "sq", "la"],
|
| 90 |
+
"weight": 1.55,
|
| 91 |
"min_chars": 2_000,
|
| 92 |
"latin": True,
|
| 93 |
},
|
| 94 |
+
"WesternLatin": {
|
| 95 |
+
"langs": ["ca", "gl", "oc"],
|
| 96 |
+
"weight": 1.2,
|
| 97 |
+
"min_chars": 1_500,
|
| 98 |
+
"latin": True,
|
| 99 |
+
},
|
| 100 |
+
"CelticLatin": {
|
| 101 |
+
"langs": ["br", "ga", "gd", "cy"],
|
| 102 |
+
"weight": 1.3,
|
| 103 |
+
"min_chars": 1_500,
|
| 104 |
+
"latin": True,
|
| 105 |
+
},
|
| 106 |
+
"AdriaticLatin": {
|
| 107 |
+
"langs": ["bs", "hr", "sl", "sk"],
|
| 108 |
+
"weight": 1.4,
|
| 109 |
+
"min_chars": 1_500,
|
| 110 |
+
"latin": True,
|
| 111 |
+
},
|
| 112 |
+
"BalticLatin": {
|
| 113 |
+
"langs": ["et", "lv", "lt"],
|
| 114 |
+
"weight": 1.2,
|
| 115 |
+
"min_chars": 1_500,
|
| 116 |
+
"latin": True,
|
| 117 |
+
},
|
| 118 |
# ces ~1.14%, ron ~0.53%, hun ~0.52% of CC — smaller tier after splitting out pl/tr
|
| 119 |
"CentralEuropeanLatin": {
|
| 120 |
"langs": ["cs", "ro", "hu"],
|
| 121 |
+
"weight": 1.3,
|
| 122 |
"min_chars": 2_000,
|
| 123 |
"latin": True,
|
| 124 |
},
|
| 125 |
# ~0.81% of CC — was overweighted at 1.7
|
| 126 |
"Korean": {
|
| 127 |
"langs": ["ko"],
|
| 128 |
+
"weight": 1.35,
|
| 129 |
"min_chars": 1_200,
|
| 130 |
"latin": False,
|
| 131 |
},
|
| 132 |
# ukr ~0.70%, bel ~0.017% of CC
|
| 133 |
"EastSlavicCyrillic": {
|
| 134 |
"langs": ["uk", "be"],
|
| 135 |
+
"weight": 1.7,
|
| 136 |
"min_chars": 2_000,
|
| 137 |
"latin": False,
|
| 138 |
},
|
| 139 |
# ~0.65% of CC — upweighted relative to CC share given speaker population
|
| 140 |
"Arabic": {
|
| 141 |
"langs": ["ar"],
|
| 142 |
+
"weight": 1.4,
|
| 143 |
"min_chars": 2_000,
|
| 144 |
"latin": False,
|
| 145 |
},
|
| 146 |
+
"Norwegian": {
|
| 147 |
+
"langs": ["no"],
|
| 148 |
+
"weight": 1.0,
|
| 149 |
+
"min_chars": 2_000,
|
| 150 |
+
"latin": True,
|
| 151 |
+
},
|
| 152 |
+
# swe ~0.7%, dan ~0.51%, fin ~0.37%, isl ~0.04%, afr ~0.01%
|
| 153 |
# combined ~2.0% of CC — was drastically overweighted at 6.0
|
| 154 |
# note: Swedish Wikipedia is heavily bot-generated stubs, don't rely on article count
|
| 155 |
"NordicCore": {
|
| 156 |
+
"langs": ["sv", "da", "is", "af", "fi"],
|
| 157 |
+
"weight": 2.1,
|
| 158 |
"min_chars": 2_000,
|
| 159 |
"latin": True,
|
| 160 |
},
|
| 161 |
# bul ~0.27%, srp ~0.25%, mkd ~0.037% of CC
|
| 162 |
"BalkanCyrillic": {
|
| 163 |
"langs": ["bg", "sr", "mk"],
|
| 164 |
+
"weight": 1.05,
|
| 165 |
"min_chars": 2_000,
|
| 166 |
"latin": False,
|
| 167 |
},
|
| 168 |
# fas ~0.20% of CC (ignore the one anomalous crawl spike)
|
| 169 |
"ArabicOther": {
|
| 170 |
"langs": ["fa", "ps", "sd", "ug"],
|
| 171 |
+
"weight": 0.95,
|
| 172 |
"min_chars": 2_000,
|
| 173 |
"latin": False,
|
| 174 |
},
|
|
|
|
| 182 |
},
|
| 183 |
# combined ~0.27% of CC — upweighted for script diversity
|
| 184 |
"IndicOther": {
|
| 185 |
+
"langs": [
|
| 186 |
+
"ur",
|
| 187 |
+
"bn",
|
| 188 |
+
"ta",
|
| 189 |
+
"te",
|
| 190 |
+
"mr",
|
| 191 |
+
"gu",
|
| 192 |
+
"kn",
|
| 193 |
+
"ml",
|
| 194 |
+
"pa",
|
| 195 |
+
"as",
|
| 196 |
+
"or",
|
| 197 |
+
"ne",
|
| 198 |
+
],
|
| 199 |
+
"weight": 0.95,
|
| 200 |
"min_chars": 2_000,
|
| 201 |
"latin": False,
|
| 202 |
},
|
| 203 |
# kk ~0.038%, mn ~0.016% of CC — very thin corpus, weight is already a large relative boost
|
| 204 |
+
"CentralAsianCaucusCyrillic": {
|
| 205 |
+
"langs": ["kk", "mn", "tt", "ky", "tg", "ba", "ce"],
|
| 206 |
+
"weight": 1.1,
|
| 207 |
+
"min_chars": 2_000,
|
| 208 |
+
"latin": False,
|
| 209 |
+
},
|
| 210 |
+
# Kurdish is split by script/source:
|
| 211 |
+
# - ku: Wikipedia / Latin-script Kurdish
|
| 212 |
+
# - ckb: FineTranslations / Arabic-script Kurdish
|
| 213 |
+
"KurdishLatin": {
|
| 214 |
+
"langs": ["ku"],
|
| 215 |
+
"weight": 0.45,
|
| 216 |
+
"min_chars": 1_500,
|
| 217 |
+
"latin": True,
|
| 218 |
+
},
|
| 219 |
+
"KurdishArabic": {
|
| 220 |
+
"langs": ["ckb"],
|
| 221 |
+
"weight": 0.45,
|
| 222 |
"min_chars": 2_000,
|
| 223 |
"latin": False,
|
| 224 |
},
|
| 225 |
"AfricanLatin": {
|
| 226 |
+
"langs": ["sw", "tl", "eu", "yo", "zu", "ny"],
|
| 227 |
+
"weight": 1.0,
|
| 228 |
+
"min_chars": 1_500,
|
| 229 |
+
"latin": True,
|
| 230 |
+
},
|
| 231 |
+
"PeripheralLatin": {
|
| 232 |
+
"langs": ["eo", "jv", "lb", "mg", "mt", "om", "rm", "so", "su", "uz"],
|
| 233 |
+
"weight": 1.0,
|
| 234 |
"min_chars": 1_500,
|
| 235 |
"latin": True,
|
| 236 |
},
|
| 237 |
+
# Split the remaining non-Latin scripts into two buckets to keep
|
| 238 |
+
# Greco-Semitic/Caucasus-style scripts separate from Brahmic/Tibetan ones.
|
| 239 |
+
"OtherScriptsWest": {
|
| 240 |
+
"langs": ["el", "he", "hy", "ka", "am", "ti", "dv", "hbo", "grc"],
|
| 241 |
+
"weight": 1.0,
|
| 242 |
+
"min_chars": 2_000,
|
| 243 |
+
"latin": False,
|
| 244 |
+
},
|
| 245 |
+
"OtherScriptsEast": {
|
| 246 |
+
"langs": ["km", "lo", "my", "th", "si", "bo"],
|
| 247 |
+
"weight": 1.0,
|
| 248 |
"min_chars": 2_000,
|
| 249 |
"latin": False,
|
| 250 |
},
|
| 251 |
+
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|