Spaces:

polyglot-tagger
/

language-extractor-demo

Running

App Files Community

DerivedFunction1 committed on 21 days ago

Commit

5d38774

1 Parent(s): 930b96a

update

Browse files

Files changed (2) hide show

language.py +8 -571
source_config.py +105 -110

language.py CHANGED Viewed

@@ -10,6 +10,9 @@ LANGUAGE_GROUPS = {group: cfg["langs"] for group, cfg in LANGUAGE_BUCKETS.items(
 LANGUAGE_GROUP_WEIGHTS = {group: float(cfg["weight"]) for group, cfg in LANGUAGE_BUCKETS.items()}
 LANGUAGE_GROUP_MIN_CHARS = {group: int(cfg["min_chars"]) for group, cfg in LANGUAGE_BUCKETS.items()}
 LATIN_GROUPS = {group for group, cfg in LANGUAGE_BUCKETS.items() if cfg.get("latin")}
 LANGS_JSON = Path(__file__).with_name("all_langs.json")
@@ -23,6 +26,10 @@ ALL_LANGS = list(LANG_ISO2_TO_ISO3.keys())
 LANG_TO_GROUP = {lang: group for group, langs in LANGUAGE_GROUPS.items() for lang in langs}
 def write_all_langs_json(path: str | os.PathLike[str] = LANGS_JSON) -> None:
     """Write the canonical ALL_LANGS list to JSON if it is missing."""
     path = Path(path)
@@ -41,574 +48,4 @@ def load_all_langs(path: str | os.PathLike[str] = LANGS_JSON) -> list[str]:
         if isinstance(langs, list) and all(isinstance(lang, str) for lang in langs):
             return langs
     write_all_langs_json(path)
-    return ALL_LANGS[:]
-ENGLISH_STOP_WORDS = [
-    "able",
-    "about",
-    "above",
-    "abroad",
-    "according",
-    "accordingly",
-    "across",
-    "actually",
-    "after",
-    "afterwards",
-    "again",
-    "against",
-    "ago",
-    "ahead",
-    "aint",
-    "all",
-    "allow",
-    "almost",
-    "alone",
-    "along",
-    "alongside",
-    "already",
-    "also",
-    "although",
-    "always",
-    "am",
-    "amid",
-    "amidst",
-    "among",
-    "amongst",
-    "an",
-    "and",
-    "another",
-    "any",
-    "anybody",
-    "anyhow",
-    "anyone",
-    "anything",
-    "anyway",
-    "anyways",
-    "anywhere",
-    "apart",
-    "appear",
-    "appreciate",
-    "appropriate",
-    "app",
-    "are",
-    "arent",
-    "aren",
-    "around",
-    "as",
-    "aside",
-    "ask",
-    "asking",
-    "associated",
-    "at",
-    "available",
-    "away",
-    "awfully",
-    "back",
-    "backward",
-    "be",
-    "became",
-    "because",
-    "become",
-    "becoming",
-    "been",
-    "before",
-    "beforehand",
-    "begin",
-    "behind",
-    "being",
-    "believe",
-    "below",
-    "beside",
-    "best",
-    "better",
-    "between",
-    "beyond",
-    "both",
-    "brief",
-    "but",
-    "by",
-    "came",
-    "can",
-    "cannot",
-    "cant",
-    "caption",
-    "cause",
-    "certain",
-    "certainly",
-    "changes",
-    "clearly",
-    "cmon",
-    "com",
-    "come",
-    "concerning",
-    "consequently",
-    "consider",
-    "considering",
-    "contain",
-    "containing",
-    "corresponding",
-    "could",
-    "couldnt",
-    "course",
-    "currently",
-    "definitely",
-    "described",
-    "despite",
-    "did",
-    "didnt",
-    "different",
-    "directly",
-    "do",
-    "does",
-    "doesnt",
-    "doing",
-    "done",
-    "dont",
-    "down",
-    "downward",
-    "download",
-    "during",
-    "each",
-    "eight",
-    "eighty",
-    "either",
-    "else",
-    "elsewhere",
-    "end",
-    "ending",
-    "enough",
-    "entirely",
-    "especially",
-    "etc",
-    "even",
-    "ever",
-    "evermore",
-    "every",
-    "everybody",
-    "everyone",
-    "everything",
-    "everywhere",
-    "exactly",
-    "example",
-    "except",
-    "fairly",
-    "far",
-    "farther",
-    "few",
-    "fewer",
-    "fifth",
-    "first",
-    "five",
-    "followed",
-    "following",
-    "follows",
-    "for",
-    "forever",
-    "former",
-    "formerly",
-    "forth",
-    "forward",
-    "found",
-    "four",
-    "from",
-    "free",
-    "further",
-    "furthermore",
-    "get",
-    "gets",
-    "getting",
-    "given",
-    "gives",
-    "go",
-    "goes",
-    "going",
-    "gone",
-    "got",
-    "gotten",
-    "greetings",
-    "had",
-    "hadnt",
-    "half",
-    "happens",
-    "hardly",
-    "has",
-    "hasnt",
-    "have",
-    "havent",
-    "having",
-    "he",
-    "hed",
-    "hell",
-    "hello",
-    "help",
-    "hence",
-    "her",
-    "here",
-    "hereafter",
-    "hereby",
-    "herein",
-    "hereupon",
-    "herself",
-    "hi",
-    "him",
-    "himself",
-    "his",
-    "hither",
-    "hopefully",
-    "how",
-    "howbeit",
-    "however",
-    "hundred",
-    "id",
-    "ie",
-    "if",
-    "ignored",
-    "ill",
-    "im",
-    "immediate",
-    "in",
-    "inasmuch",
-    "inc",
-    "indeed",
-    "indicate",
-    "indicated",
-    "inner",
-    "inside",
-    "insofar",
-    "instead",
-    "into",
-    "inward",
-    "is",
-    "isnt",
-    "it",
-    "itd",
-    "itll",
-    "itself",
-    "ive",
-    "just",
-    "keep",
-    "keeps",
-    "kept",
-    "know",
-    "known",
-    "last",
-    "lately",
-    "later",
-    "latter",
-    "least",
-    "less",
-    "lest",
-    "let",
-    "like",
-    "liked",
-    "likely",
-    "likewise",
-    "little",
-    "look",
-    "looking",
-    "low",
-    "lower",
-    "ltd",
-    "made",
-    "mainly",
-    "make",
-    "many",
-    "may",
-    "maybe",
-    "maynt",
-    "me",
-    "mean",
-    "meantime",
-    "meanwhile",
-    "merely",
-    "might",
-    "mightnt",
-    "mine",
-    "minus",
-    "miss",
-    "more",
-    "moreover",
-    "most",
-    "mostly",
-    "much",
-    "must",
-    "mustnt",
-    "my",
-    "myself",
-    "name",
-    "namely",
-    "near",
-    "nearly",
-    "necessary",
-    "need",
-    "neednt",
-    "neither",
-    "never",
-    "neverless",
-    "nevertheless",
-    "new",
-    "next",
-    "nine",
-    "ninety",
-    "no",
-    "nobody",
-    "non",
-    "none",
-    "nonetheless",
-    "noone",
-    "no-one",
-    "nor",
-    "normally",
-    "not",
-    "nothing",
-    "notwithstanding",
-    "novel",
-    "now",
-    "nowhere",
-    "obviously",
-    "of",
-    "off",
-    "often",
-    "oh",
-    "ok",
-    "okay",
-    "old",
-    "on",
-    "once",
-    "one",
-    "only",
-    "onto",
-    "opposite",
-    "or",
-    "other",
-    "otherwise",
-    "ought",
-    "oughtnt",
-    "our",
-    "ourselves",
-    "out",
-    "outside",
-    "over",
-    "overall",
-    "own",
-    "particular",
-    "particularly",
-    "past",
-    "per",
-    "perhaps",
-    "placed",
-    "please",
-    "plus",
-    "possible",
-    "presumably",
-    "probably",
-    "provided",
-    "provide",
-    "quite",
-    "rather",
-    "really",
-    "reasonably",
-    "recent",
-    "recently",
-    "regarding",
-    "regardless",
-    "regards",
-    "relatively",
-    "respectively",
-    "right",
-    "round",
-    "said",
-    "same",
-    "saw",
-    "say",
-    "saying",
-    "second",
-    "secondly",
-    "see",
-    "seeing",
-    "seem",
-    "seemed",
-    "seeming",
-    "seems",
-    "seen",
-    "self",
-    "sensible",
-    "sent",
-    "serious",
-    "seriously",
-    "seven",
-    "several",
-    "shall",
-    "shant",
-    "she",
-    "shed",
-    "shell",
-    "should",
-    "shouldnt",
-    "since",
-    "six",
-    "so",
-    "some",
-    "somebody",
-    "someday",
-    "somehow",
-    "someone",
-    "something",
-    "sometime",
-    "somewhat",
-    "somewhere",
-    "soon",
-    "sorry",
-    "specified",
-    "specify",
-    "specifying",
-    "still",
-    "such",
-    "sure",
-    "take",
-    "taken",
-    "taking",
-    "tell",
-    "tends",
-    "ten",
-    "than",
-    "thank",
-    "that",
-    "thatll",
-    "thatve",
-    "the",
-    "their",
-    "them",
-    "themselves",
-    "then",
-    "thence",
-    "there",
-    "thereafter",
-    "thereby",
-    "thered",
-    "therefore",
-    "therein",
-    "therell",
-    "therere",
-    "thereupon",
-    "thereve",
-    "these",
-    "they",
-    "theyd",
-    "theyll",
-    "theyre",
-    "theyve",
-    "thing",
-    "think",
-    "third",
-    "thirty",
-    "this",
-    "thorough",
-    "thoroughly",
-    "those",
-    "though",
-    "three",
-    "through",
-    "throughout",
-    "thru",
-    "thus",
-    "till",
-    "to",
-    "together",
-    "too",
-    "took",
-    "toward",
-    "tried",
-    "tries",
-    "truly",
-    "try",
-    "trying",
-    "twice",
-    "two",
-    "under",
-    "underneath",
-    "undoing",
-    "unfortunately",
-    "unless",
-    "unlike",
-    "unlikely",
-    "until",
-    "unto",
-    "up",
-    "upon",
-    "upwards",
-    "use",
-    "used",
-    "useful",
-    "using",
-    "usually",
-    "value",
-    "various",
-    "versus",
-    "very",
-    "via",
-    "viz",
-    "want",
-    "was",
-    "wasnt",
-    "way",
-    "we",
-    "wed",
-    "welcome",
-    "well",
-    "went",
-    "were",
-    "werent",
-    "weve",
-    "what",
-    "whatever",
-    "whatll",
-    "whatve",
-    "when",
-    "whence",
-    "whenever",
-    "where",
-    "whereafter",
-    "whereas",
-    "whereby",
-    "wherein",
-    "whereupon",
-    "wherever",
-    "whether",
-    "which",
-    "whichever",
-    "while",
-    "whilst",
-    "whither",
-    "who",
-    "whod",
-    "whoever",
-    "whole",
-    "wholl",
-    "whom",
-    "whomever",
-    "whose",
-    "why",
-    "will",
-    "willing",
-    "wish",
-    "with",
-    "within",
-    "without",
-    "wonder",
-    "wont",
-    "would",
-    "wouldnt",
-    "website",
-    "yes",
-    "yet",
-    "you",
-    "youd",
-    "youll",
-    "your",
-    "youre",
-    "yourself",
-    "yourselves",
-    "youve",
-    "zero",
-]

 LANGUAGE_GROUP_WEIGHTS = {group: float(cfg["weight"]) for group, cfg in LANGUAGE_BUCKETS.items()}
 LANGUAGE_GROUP_MIN_CHARS = {group: int(cfg["min_chars"]) for group, cfg in LANGUAGE_BUCKETS.items()}
 LATIN_GROUPS = {group for group, cfg in LANGUAGE_BUCKETS.items() if cfg.get("latin")}
+LANG_ALIASES = {
+    "nn": "no",
+}
 LANGS_JSON = Path(__file__).with_name("all_langs.json")
 LANG_TO_GROUP = {lang: group for group, langs in LANGUAGE_GROUPS.items() for lang in langs}
+def canonical_lang(lang: str) -> str:
+    return LANG_ALIASES.get(lang, lang)
 def write_all_langs_json(path: str | os.PathLike[str] = LANGS_JSON) -> None:
     """Write the canonical ALL_LANGS list to JSON if it is missing."""
     path = Path(path)
         if isinstance(langs, list) and all(isinstance(lang, str) for lang in langs):
             return langs
     write_all_langs_json(path)
+    return ALL_LANGS[:]

source_config.py CHANGED Viewed

@@ -1,145 +1,174 @@
 from __future__ import annotations
 LANGUAGE_BUCKETS = {
     # ~41% of CC — intentionally capped to avoid crowding out other languages
     "English": {
         "langs": ["en"],
-        "weight": 2.5,
         "min_chars": 2_000,
         "latin": True,
     },
     # ~6.3% of CC — was badly underweighted relative to German/French
     "Russian": {
         "langs": ["ru"],
-        "weight": 1.8,
         "min_chars": 2_000,
         "latin": False,
     },
     # ~5.9% of CC
     "German": {
         "langs": ["de"],
-        "weight": 1.8,
         "min_chars": 2_000,
         "latin": True,
     },
     # ~5.7% of CC — bumped up from 1.7 to match its actual footprint
     "Japanese": {
         "langs": ["ja"],
-        "weight": 1.8,
         "min_chars": 1_200,
         "latin": False,
     },
     # ~5.0% of CC — CC likely undercounts due to Great Firewall
     "Chinese": {
         "langs": ["zh"],
-        "weight": 1.8,
         "min_chars": 1_200,
         "latin": False,
     },
     # ~4.6% of CC
     "French": {
         "langs": ["fr"],
-        "weight": 1.8,
         "min_chars": 2_000,
         "latin": True,
     },
     # ~4.6% of CC
     "Spanish": {
         "langs": ["es"],
-        "weight": 1.8,
         "min_chars": 2_000,
         "latin": True,
     },
     # ~2.5% of CC
     "Portuguese": {
         "langs": ["pt"],
-        "weight": 1.6,
         "min_chars": 2_000,
         "latin": True,
     },
     # ~2.4% of CC
     "Italian": {
         "langs": ["it"],
-        "weight": 1.5,
         "min_chars": 2_000,
         "latin": True,
     },
     # ~2.0% of CC — split out from CentralEuropeanLatin; rivals Italian/Portuguese
     "Polish": {
         "langs": ["pl"],
-        "weight": 1.5,
         "min_chars": 2_000,
         "latin": True,
     },
     # ~1.8% of CC — was significantly underweighted at 1.15
     "Dutch": {
         "langs": ["nl"],
-        "weight": 1.5,
         "min_chars": 2_000,
         "latin": True,
     },
     # ~1.2% of CC — split out from CentralEuropeanLatin; large internet population
     "Turkish": {
         "langs": ["tr"],
-        "weight": 1.4,
         "min_chars": 2_000,
         "latin": True,
     },
     # ind ~1.1%, vie ~1.05% of CC
     "SoutheastAsianLatin": {
         "langs": ["vi", "id", "ms", "sq", "la"],
-        "weight": 1.4,
         "min_chars": 2_000,
         "latin": True,
     },
     # ces ~1.14%, ron ~0.53%, hun ~0.52% of CC — smaller tier after splitting out pl/tr
     "CentralEuropeanLatin": {
         "langs": ["cs", "ro", "hu"],
-        "weight": 1.2,
         "min_chars": 2_000,
         "latin": True,
     },
     # ~0.81% of CC — was overweighted at 1.7
     "Korean": {
         "langs": ["ko"],
-        "weight": 1.3,
         "min_chars": 1_200,
         "latin": False,
     },
     # ukr ~0.70%, bel ~0.017% of CC
     "EastSlavicCyrillic": {
         "langs": ["uk", "be"],
-        "weight": 1.15,
         "min_chars": 2_000,
         "latin": False,
     },
     # ~0.65% of CC — upweighted relative to CC share given speaker population
     "Arabic": {
         "langs": ["ar"],
-        "weight": 1.35,
         "min_chars": 2_000,
         "latin": False,
     },
-    # sv ~0.7%, dan ~0.51%, nor+nno ~0.33%, fin ~0.37%, isl ~0.04%, afr ~0.01%
     # combined ~2.0% of CC — was drastically overweighted at 6.0
     # note: Swedish Wikipedia is heavily bot-generated stubs, don't rely on article count
     "NordicCore": {
-        "langs": ["sv", "da", "no", "is", "af", "fi"],
-        "weight": 1.8,
         "min_chars": 2_000,
         "latin": True,
     },
     # bul ~0.27%, srp ~0.25%, mkd ~0.037% of CC
     "BalkanCyrillic": {
         "langs": ["bg", "sr", "mk"],
-        "weight": 1.0,
         "min_chars": 2_000,
         "latin": False,
     },
     # fas ~0.20% of CC (ignore the one anomalous crawl spike)
     "ArabicOther": {
         "langs": ["fa", "ps", "sd", "ug"],
-        "weight": 0.9,
         "min_chars": 2_000,
         "latin": False,
     },
@@ -153,104 +182,70 @@ LANGUAGE_BUCKETS = {
     },
     # combined ~0.27% of CC — upweighted for script diversity
     "IndicOther": {
-        "langs": ["ur", "bn", "ta", "te", "mr", "gu", "kn", "ml", "pa", "as", "or"],
-        "weight": 0.9,
         "min_chars": 2_000,
         "latin": False,
     },
     # kk ~0.038%, mn ~0.016% of CC — very thin corpus, weight is already a large relative boost
-    "CentralAsianCyrillic": {
-        "langs": ["kk", "mn"],
-        "weight": 0.9,
         "min_chars": 2_000,
         "latin": False,
     },
     "AfricanLatin": {
-        "langs": ["sw", "tl", "eu"],
-        "weight": 0.8,
         "min_chars": 1_500,
         "latin": True,
     },
-    # el ~0.55%, he ~0.24%, th ~0.38%, hy ~0.033%, ka ~0.044% etc. — combined ~1%+
-    # nudged up slightly from 0.8 given Greek and Thai have meaningful CC presence
-    "OtherScripts": {
-        "langs": ["el", "he", "hy", "ka", "am", "km", "lo", "my", "th", "si", "bo", "ti", "dv"],
-        "weight": 0.9,
         "min_chars": 2_000,
         "latin": False,
     },
-}
-POOL = {
-    "wiki": {
-        "reserve": 0.60,
-        "min": 4,
-        "max": 120_000,
-    },
-    "smol": {
-        "reserve": 0.95,
-        "min": 1,
-        "max": 1_000,
-    },
-    "ft": {
-        "reserve": 0.60,
-        "min": 1,
-        "max": 30_000,
-    },
-}
-DOC_MIX = {
-    "pure": {
-        "fraction": 0.60,
-        "pool": "reserve",
-        "min_sentences": 1,
-        "max_sentences": 4,
-        "strip_punct_prob": 0.10,
-    },
-    "homogeneous": {
-        "fraction": 0.30,
-        "pool": "main",
-        "min_sentences": 2,
-        "max_sentences": 6,
-        "strip_punct_prob": 0.15,
-    },
-    "mixed": {
-        "fraction": 0.10,
-        "pool": "main",
-        "min_segments": 2,
-        "max_segments": 4,
-        "strip_punct_prob": 0.25,
-        "swap_prob": 0.06,
-        "o_inject_prob": 0.06,
-        "allow_repeated_langs": True,
-    },
-}
-SMOL = {
-    "use": True,
-    "rebuild": False,
-}
-FT = {
-    "use": True,
-    "rebuild": False,
-    "max_lang": 50_000,
-    "overflow_lang": 75_000,
-    "max_row": 50_000,
-    "miss": 1_000,
-    "include_en": True,
-    "langs": {"en", "es", "fr", "pt", "it", "nl", "de", "sv", "da", "id", "ms"},
-}
-FT["every"] = len(FT["langs"])
-RUN = {
-    "len": 512,
-    "target": 2_500_000,  # synthetic mixed-language training examples to generate
-    "syn_cache": True,
-    "syn_rebuild": False,
-    "tok_cache": True,
-    "tok_rebuild": False,
-    "tok_skip_check": False,
-    "retry": 8,
-    "preview": 2_000,
-}

 from __future__ import annotations
 LANGUAGE_BUCKETS = {
     # ~41% of CC — intentionally capped to avoid crowding out other languages
     "English": {
         "langs": ["en"],
+        "weight": 2.9,
         "min_chars": 2_000,
         "latin": True,
     },
     # ~6.3% of CC — was badly underweighted relative to German/French
     "Russian": {
         "langs": ["ru"],
+        "weight": 1.95,
         "min_chars": 2_000,
         "latin": False,
     },
     # ~5.9% of CC
     "German": {
         "langs": ["de"],
+        "weight": 1.9,
         "min_chars": 2_000,
         "latin": True,
     },
     # ~5.7% of CC — bumped up from 1.7 to match its actual footprint
     "Japanese": {
         "langs": ["ja"],
+        "weight": 1.9,
         "min_chars": 1_200,
         "latin": False,
     },
     # ~5.0% of CC — CC likely undercounts due to Great Firewall
     "Chinese": {
         "langs": ["zh"],
+        "weight": 1.9,
         "min_chars": 1_200,
         "latin": False,
     },
     # ~4.6% of CC
     "French": {
         "langs": ["fr"],
+        "weight": 1.9,
         "min_chars": 2_000,
         "latin": True,
     },
     # ~4.6% of CC
     "Spanish": {
         "langs": ["es"],
+        "weight": 1.9,
         "min_chars": 2_000,
         "latin": True,
     },
     # ~2.5% of CC
     "Portuguese": {
         "langs": ["pt"],
+        "weight": 1.7,
         "min_chars": 2_000,
         "latin": True,
     },
     # ~2.4% of CC
     "Italian": {
         "langs": ["it"],
+        "weight": 1.6,
         "min_chars": 2_000,
         "latin": True,
     },
     # ~2.0% of CC — split out from CentralEuropeanLatin; rivals Italian/Portuguese
     "Polish": {
         "langs": ["pl"],
+        "weight": 1.55,
         "min_chars": 2_000,
         "latin": True,
     },
     # ~1.8% of CC — was significantly underweighted at 1.15
     "Dutch": {
         "langs": ["nl"],
+        "weight": 1.55,
         "min_chars": 2_000,
         "latin": True,
     },
     # ~1.2% of CC — split out from CentralEuropeanLatin; large internet population
     "Turkish": {
         "langs": ["tr"],
+        "weight": 1.45,
         "min_chars": 2_000,
         "latin": True,
     },
     # ind ~1.1%, vie ~1.05% of CC
     "SoutheastAsianLatin": {
         "langs": ["vi", "id", "ms", "sq", "la"],
+        "weight": 1.55,
         "min_chars": 2_000,
         "latin": True,
     },
+    "WesternLatin": {
+        "langs": ["ca", "gl", "oc"],
+        "weight": 1.2,
+        "min_chars": 1_500,
+        "latin": True,
+    },
+    "CelticLatin": {
+        "langs": ["br", "ga", "gd", "cy"],
+        "weight": 1.3,
+        "min_chars": 1_500,
+        "latin": True,
+    },
+    "AdriaticLatin": {
+        "langs": ["bs", "hr", "sl", "sk"],
+        "weight": 1.4,
+        "min_chars": 1_500,
+        "latin": True,
+    },
+    "BalticLatin": {
+        "langs": ["et", "lv", "lt"],
+        "weight": 1.2,
+        "min_chars": 1_500,
+        "latin": True,
+    },
     # ces ~1.14%, ron ~0.53%, hun ~0.52% of CC — smaller tier after splitting out pl/tr
     "CentralEuropeanLatin": {
         "langs": ["cs", "ro", "hu"],
+        "weight": 1.3,
         "min_chars": 2_000,
         "latin": True,
     },
     # ~0.81% of CC — was overweighted at 1.7
     "Korean": {
         "langs": ["ko"],
+        "weight": 1.35,
         "min_chars": 1_200,
         "latin": False,
     },
     # ukr ~0.70%, bel ~0.017% of CC
     "EastSlavicCyrillic": {
         "langs": ["uk", "be"],
+        "weight": 1.7,
         "min_chars": 2_000,
         "latin": False,
     },
     # ~0.65% of CC — upweighted relative to CC share given speaker population
     "Arabic": {
         "langs": ["ar"],
+        "weight": 1.4,
         "min_chars": 2_000,
         "latin": False,
     },
+    "Norwegian": {
+        "langs": ["no"],
+        "weight": 1.0,
+        "min_chars": 2_000,
+        "latin": True,
+    },
+    # sv ~0.7%, dan ~0.51%, fin ~0.37%, isl ~0.04%, afr ~0.01%
     # combined ~2.0% of CC — was drastically overweighted at 6.0
     # note: Swedish Wikipedia is heavily bot-generated stubs, don't rely on article count
     "NordicCore": {
+        "langs": ["sv", "da", "is", "af", "fi"],
+        "weight": 2.1,
         "min_chars": 2_000,
         "latin": True,
     },
     # bul ~0.27%, srp ~0.25%, mkd ~0.037% of CC
     "BalkanCyrillic": {
         "langs": ["bg", "sr", "mk"],
+        "weight": 1.05,
         "min_chars": 2_000,
         "latin": False,
     },
     # fas ~0.20% of CC (ignore the one anomalous crawl spike)
     "ArabicOther": {
         "langs": ["fa", "ps", "sd", "ug"],
+        "weight": 0.95,
         "min_chars": 2_000,
         "latin": False,
     },
     },
     # combined ~0.27% of CC — upweighted for script diversity
     "IndicOther": {
+        "langs": [
+            "ur",
+            "bn",
+            "ta",
+            "te",
+            "mr",
+            "gu",
+            "kn",
+            "ml",
+            "pa",
+            "as",
+            "or",
+            "ne",
+        ],
+        "weight": 0.95,
         "min_chars": 2_000,
         "latin": False,
     },
     # kk ~0.038%, mn ~0.016% of CC — very thin corpus, weight is already a large relative boost
+    "CentralAsianCaucusCyrillic": {
+        "langs": ["kk", "mn", "tt", "ky", "tg", "ba", "ce"],
+        "weight": 1.1,
+        "min_chars": 2_000,
+        "latin": False,
+    },
+    # Kurdish is split by script/source:
+    # - ku: Wikipedia / Latin-script Kurdish
+    # - ckb: FineTranslations / Arabic-script Kurdish
+    "KurdishLatin": {
+        "langs": ["ku"],
+        "weight": 0.45,
+        "min_chars": 1_500,
+        "latin": True,
+    },
+    "KurdishArabic": {
+        "langs": ["ckb"],
+        "weight": 0.45,
         "min_chars": 2_000,
         "latin": False,
     },
     "AfricanLatin": {
+        "langs": ["sw", "tl", "eu", "yo", "zu", "ny"],
+        "weight": 1.0,
+        "min_chars": 1_500,
+        "latin": True,
+    },
+    "PeripheralLatin": {
+        "langs": ["eo", "jv", "lb", "mg", "mt", "om", "rm", "so", "su", "uz"],
+        "weight": 1.0,
         "min_chars": 1_500,
         "latin": True,
     },
+    # Split the remaining non-Latin scripts into two buckets to keep
+    # Greco-Semitic/Caucasus-style scripts separate from Brahmic/Tibetan ones.
+    "OtherScriptsWest": {
+        "langs": ["el", "he", "hy", "ka", "am", "ti", "dv", "hbo", "grc"],
+        "weight": 1.0,
+        "min_chars": 2_000,
+        "latin": False,
+    },
+    "OtherScriptsEast": {
+        "langs": ["km", "lo", "my", "th", "si", "bo"],
+        "weight": 1.0,
         "min_chars": 2_000,
         "latin": False,
     },
+}