symphonym-v7 / training_stats /coverage_stats.json
docuracy's picture
Upload Symphonym v7 model, vocabularies, and evaluation results
4558539 verified
{
"total_toponyms": 66924548,
"in_training_namespaces": 57593810,
"with_ipa": 31113585,
"with_panphon_embedding": 31113585,
"panphon_coverage_pct": 54.02244616218305,
"from_db_cache": 31113562,
"from_precomputed": 2,
"from_epitran": 21,
"by_script": {
"CYRILLIC": 3614762,
"LATIN": 55617677,
"CJK": 2973525,
"ARABIC": 2098089,
"HEBREW": 151960,
"KATAKANA": 340555,
"MALAYALAM": 68176,
"HIRAGANA": 151980,
"OTHER": 342642,
"GEORGIAN": 105902,
"GREEK": 217997,
"DEVANAGARI": 166957,
"ARMENIAN": 153467,
"THAI": 251458,
"KANNADA": 43155,
"HANGUL": 393996,
"GUJARATI": 21428,
"BENGALI": 106896,
"TAMIL": 52486,
"TELUGU": 51440
},
"by_script_lang_ipa": {
"LATIN:en": 8039689,
"LATIN:fr": 2311837,
"LATIN:nl": 2292068,
"LATIN:de": 2063027,
"LATIN:sv": 1715947,
"LATIN:es": 1518527,
"CJK:zh": 1306961,
"LATIN:id": 931192,
"LATIN:tr": 843744,
"LATIN:it": 815130,
"CYRILLIC:ru": 803734,
"LATIN:pt": 711346,
"LATIN:pl": 655880,
"LATIN:cs": 593385,
"ARABIC:fa": 576743,
"LATIN:fi": 532010,
"CYRILLIC:uk": 435644,
"LATIN:no": 428240,
"ARABIC:ar": 412306,
"LATIN:ro": 375292,
"KATAKANA:ja": 310410,
"LATIN:da": 297142,
"LATIN:ms": 285578,
"LATIN:vi": 267432,
"LATIN:hu": 247134,
"CYRILLIC:bg": 235749,
"CYRILLIC:sr": 235582,
"HANGUL:ko": 228523,
"THAI:th": 210310,
"GREEK:el": 168827,
"ARMENIAN:hy": 143819,
"HEBREW:he": 127337,
"LATIN:sw": 113131,
"ARABIC:ur": 109688,
"GEORGIAN:ka": 86021,
"BENGALI:bn": 77935,
"LATIN:la": 77703,
"CYRILLIC:mk": 61607,
"DEVANAGARI:hi": 60800,
"MALAYALAM:ml": 53546,
"CJK:wuu": 48883,
"TAMIL:ta": 47700,
"TELUGU:te": 47617,
"HIRAGANA:ja": 47533,
"CJK:gan": 37097,
"CJK:yue": 31345,
"DEVANAGARI:mr": 24452,
"KANNADA:kn": 20962,
"GUJARATI:gu": 20329,
"LATIN:yue": 13719,
"DEVANAGARI:ne": 10249,
"CJK:ko": 2060,
"LATIN:wuu": 254,
"KATAKANA:zh": 69,
"LATIN:gan": 60,
"CYRILLIC:zh": 53,
"OTHER:zh": 51,
"OTHER:ko": 21,
"HIRAGANA:yue": 15,
"ARABIC:yue": 14,
"HANGUL:zh": 14,
"HIRAGANA:zh": 14,
"ARABIC:zh": 13,
"OTHER:he": 10,
"GREEK:zh": 9,
"THAI:yue": 7,
"THAI:zh": 7,
"CYRILLIC:yue": 7,
"OTHER:yue": 5,
"KATAKANA:ko": 4,
"HEBREW:yue": 4,
"THAI:wuu": 3,
"CYRILLIC:ko": 3,
"BENGALI:yue": 3,
"DEVANAGARI:zh": 3,
"KATAKANA:yue": 3,
"TAMIL:yue": 2,
"ARMENIAN:yue": 2,
"TELUGU:zh": 2,
"HANGUL:yue": 2,
"CYRILLIC:wuu": 2,
"GREEK:ko": 1,
"OTHER:wuu": 1,
"ARABIC:ko": 1,
"GEORGIAN:yue": 1,
"HEBREW:zh": 1,
"OTHER:gan": 1,
"TAMIL:zh": 1
},
"training_namespaces": [
"gn",
"wd",
"tgn"
],
"num_workers": 62,
"db_engine": "DuckDB",
"ipa_backends": [
"epitran",
"phonikud",
"charsiu_g2p"
]
}