File size: 13,808 Bytes
a7d24e3
 
 
 
 
 
 
 
 
 
 
1299535
a7d24e3
1299535
a7d24e3
1299535
 
 
 
 
 
 
 
 
 
 
 
 
 
a7d24e3
1299535
 
 
 
 
a7d24e3
1299535
a7d24e3
 
1299535
 
 
 
 
a7d24e3
1299535
 
a7d24e3
 
 
 
 
1299535
 
a7d24e3
 
 
1299535
 
 
a7d24e3
 
 
 
 
1299535
a7d24e3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1299535
a7d24e3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1299535
a7d24e3
 
 
 
 
1299535
a7d24e3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1299535
a7d24e3
 
1299535
 
 
 
 
a7d24e3
 
 
 
 
 
 
 
 
 
 
 
 
1299535
a7d24e3
1299535
a7d24e3
 
 
1299535
a7d24e3
 
 
 
 
 
 
 
 
 
1299535
a7d24e3
 
 
 
 
1299535
a7d24e3
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
"""
Language support service - provides information about supported languages
"""

from typing import Dict, List, Optional
from ..core.logging import get_logger

# Module-level logger obtained from the project's logging helper.
# NOTE(review): nothing in the visible part of this module writes to it yet.
logger = get_logger()

# Registry of supported languages, keyed by a "<language>_<Script>" code in the
# FLORES-200 style. Each value carries the English name, the native name, a
# coarse geographic region and the writing script.
#
# A code can appear only once: Python keeps the LAST value for a repeated key
# in a dict literal, so listing a language under two regions silently rewrites
# its region instead of adding a second entry. Each language is therefore
# filed under a single region.
#
# NOTE(review): a few keys (e.g. "cmn_Hans"/"cmn_Hant", "ara_Arab", "fas_Arab",
# "msa_Latn", "ori_Orya", "lav_Latn", "orm_Latn") do not look like the codes in
# the published FLORES-200 list (which appears to use zho_*, arb_Arab,
# pes_Arab, zsm_Latn, ory_Orya, lvs_Latn, gaz_Latn) - confirm against the
# translation model's vocabulary before relying on them.
SUPPORTED_LANGUAGES: Dict[str, Dict[str, str]] = {
    # African Languages
    "afr_Latn": {"name": "Afrikaans", "native_name": "Afrikaans", "region": "Africa", "script": "Latin"},
    "aka_Latn": {"name": "Akan", "native_name": "Akan", "region": "Africa", "script": "Latin"},
    "amh_Ethi": {"name": "Amharic", "native_name": "አማርኛ", "region": "Africa", "script": "Ethiopic"},
    "bam_Latn": {"name": "Bambara", "native_name": "Bamanankan", "region": "Africa", "script": "Latin"},
    "bem_Latn": {"name": "Bemba", "native_name": "Ichibemba", "region": "Africa", "script": "Latin"},
    "dik_Latn": {"name": "Dinka", "native_name": "Thuɔŋjäŋ", "region": "Africa", "script": "Latin"},
    "dyu_Latn": {"name": "Dyula", "native_name": "Jula", "region": "Africa", "script": "Latin"},
    "ewe_Latn": {"name": "Ewe", "native_name": "Eʋegbe", "region": "Africa", "script": "Latin"},
    "fon_Latn": {"name": "Fon", "native_name": "Fɔngbe", "region": "Africa", "script": "Latin"},
    "fuv_Latn": {"name": "Nigerian Fulfulde", "native_name": "Fulfulde", "region": "Africa", "script": "Latin"},
    "gaz_Latn": {"name": "West Central Oromo", "native_name": "Oromoo", "region": "Africa", "script": "Latin"},
    "hau_Latn": {"name": "Hausa", "native_name": "Harshen Hausa", "region": "Africa", "script": "Latin"},
    "ibo_Latn": {"name": "Igbo", "native_name": "Asụsụ Igbo", "region": "Africa", "script": "Latin"},
    "kab_Latn": {"name": "Kabyle", "native_name": "Taqbaylit", "region": "Africa", "script": "Latin"},
    "kam_Latn": {"name": "Kamba", "native_name": "Kikamba", "region": "Africa", "script": "Latin"},
    "kbp_Latn": {"name": "Kabiyè", "native_name": "Kabɩyɛ", "region": "Africa", "script": "Latin"},
    "kea_Latn": {"name": "Kabuverdianu", "native_name": "Kabuverdianu", "region": "Africa", "script": "Latin"},
    "kik_Latn": {"name": "Kikuyu", "native_name": "Gĩkũyũ", "region": "Africa", "script": "Latin"},
    "kin_Latn": {"name": "Kinyarwanda", "native_name": "Ikinyarwanda", "region": "Africa", "script": "Latin"},
    "kmb_Latn": {"name": "Kimbundu", "native_name": "Kimbundu", "region": "Africa", "script": "Latin"},
    "knc_Arab": {"name": "Central Kanuri", "native_name": "Kanuri", "region": "Africa", "script": "Arabic"},
    "knc_Latn": {"name": "Central Kanuri", "native_name": "Kanuri", "region": "Africa", "script": "Latin"},
    "kon_Latn": {"name": "Kikongo", "native_name": "Kikongo", "region": "Africa", "script": "Latin"},
    "lin_Latn": {"name": "Lingala", "native_name": "Lingála", "region": "Africa", "script": "Latin"},
    "lua_Latn": {"name": "Luba-Lulua", "native_name": "Tshiluba", "region": "Africa", "script": "Latin"},
    "lug_Latn": {"name": "Luganda", "native_name": "Luganda", "region": "Africa", "script": "Latin"},
    "luo_Latn": {"name": "Luo", "native_name": "Dholuo", "region": "Africa", "script": "Latin"},
    "mos_Latn": {"name": "Mossi", "native_name": "Mooré", "region": "Africa", "script": "Latin"},
    "nso_Latn": {"name": "Northern Sotho", "native_name": "Sesotho sa Leboa", "region": "Africa", "script": "Latin"},
    "nus_Latn": {"name": "Nuer", "native_name": "Thok Nath", "region": "Africa", "script": "Latin"},
    "nya_Latn": {"name": "Nyanja", "native_name": "Chinyanja", "region": "Africa", "script": "Latin"},
    "orm_Latn": {"name": "Oromo", "native_name": "Afaan Oromoo", "region": "Africa", "script": "Latin"},
    "run_Latn": {"name": "Rundi", "native_name": "Ikirundi", "region": "Africa", "script": "Latin"},
    "sag_Latn": {"name": "Sango", "native_name": "Sängö", "region": "Africa", "script": "Latin"},
    "sna_Latn": {"name": "Shona", "native_name": "ChiShona", "region": "Africa", "script": "Latin"},
    "som_Latn": {"name": "Somali", "native_name": "Soomaali", "region": "Africa", "script": "Latin"},
    "sot_Latn": {"name": "Southern Sotho", "native_name": "Sesotho", "region": "Africa", "script": "Latin"},
    "ssw_Latn": {"name": "Swati", "native_name": "SiSwati", "region": "Africa", "script": "Latin"},
    "swh_Latn": {"name": "Swahili", "native_name": "Kiswahili", "region": "Africa", "script": "Latin"},
    "taq_Latn": {"name": "Tamasheq", "native_name": "Tamasheq", "region": "Africa", "script": "Latin"},
    "taq_Tfng": {"name": "Tamasheq", "native_name": "ⵜⴰⵎⴰⵌⴰⵖ", "region": "Africa", "script": "Tifinagh"},
    "tir_Ethi": {"name": "Tigrinya", "native_name": "ትግርኛ", "region": "Africa", "script": "Ethiopic"},
    "tsn_Latn": {"name": "Tswana", "native_name": "Setswana", "region": "Africa", "script": "Latin"},
    "tso_Latn": {"name": "Tsonga", "native_name": "Xitsonga", "region": "Africa", "script": "Latin"},
    "tum_Latn": {"name": "Tumbuka", "native_name": "Chitumbuka", "region": "Africa", "script": "Latin"},
    "twi_Latn": {"name": "Twi", "native_name": "Twi", "region": "Africa", "script": "Latin"},
    "tzm_Tfng": {"name": "Central Atlas Tamazight", "native_name": "ⵜⴰⵎⴰⵣⵉⵖⵜ", "region": "Africa", "script": "Tifinagh"},
    "umb_Latn": {"name": "Umbundu", "native_name": "Umbundu", "region": "Africa", "script": "Latin"},
    "wol_Latn": {"name": "Wolof", "native_name": "Wolof", "region": "Africa", "script": "Latin"},
    "xho_Latn": {"name": "Xhosa", "native_name": "isiXhosa", "region": "Africa", "script": "Latin"},
    "yor_Latn": {"name": "Yoruba", "native_name": "Yorùbá", "region": "Africa", "script": "Latin"},
    "zul_Latn": {"name": "Zulu", "native_name": "isiZulu", "region": "Africa", "script": "Latin"},

    # European Languages
    # English, French, Spanish and Portuguese are also widely spoken in the
    # Americas, but a code can hold only one region, so they are filed here.
    "eng_Latn": {"name": "English", "native_name": "English", "region": "Europe", "script": "Latin"},
    "fra_Latn": {"name": "French", "native_name": "Français", "region": "Europe", "script": "Latin"},
    "deu_Latn": {"name": "German", "native_name": "Deutsch", "region": "Europe", "script": "Latin"},
    "spa_Latn": {"name": "Spanish", "native_name": "Español", "region": "Europe", "script": "Latin"},
    "ita_Latn": {"name": "Italian", "native_name": "Italiano", "region": "Europe", "script": "Latin"},
    "por_Latn": {"name": "Portuguese", "native_name": "Português", "region": "Europe", "script": "Latin"},
    "rus_Cyrl": {"name": "Russian", "native_name": "Русский", "region": "Europe", "script": "Cyrillic"},
    "nld_Latn": {"name": "Dutch", "native_name": "Nederlands", "region": "Europe", "script": "Latin"},
    "pol_Latn": {"name": "Polish", "native_name": "Polski", "region": "Europe", "script": "Latin"},
    "ces_Latn": {"name": "Czech", "native_name": "Čeština", "region": "Europe", "script": "Latin"},
    "hun_Latn": {"name": "Hungarian", "native_name": "Magyar", "region": "Europe", "script": "Latin"},
    "ron_Latn": {"name": "Romanian", "native_name": "Română", "region": "Europe", "script": "Latin"},
    "bul_Cyrl": {"name": "Bulgarian", "native_name": "Български", "region": "Europe", "script": "Cyrillic"},
    "hrv_Latn": {"name": "Croatian", "native_name": "Hrvatski", "region": "Europe", "script": "Latin"},
    "srp_Cyrl": {"name": "Serbian", "native_name": "Српски", "region": "Europe", "script": "Cyrillic"},
    "slk_Latn": {"name": "Slovak", "native_name": "Slovenčina", "region": "Europe", "script": "Latin"},
    "slv_Latn": {"name": "Slovenian", "native_name": "Slovenščina", "region": "Europe", "script": "Latin"},
    "est_Latn": {"name": "Estonian", "native_name": "Eesti", "region": "Europe", "script": "Latin"},
    "lav_Latn": {"name": "Latvian", "native_name": "Latviešu", "region": "Europe", "script": "Latin"},
    "lit_Latn": {"name": "Lithuanian", "native_name": "Lietuvių", "region": "Europe", "script": "Latin"},

    # Asian Languages
    "cmn_Hans": {"name": "Chinese (Simplified)", "native_name": "中文 (简体)", "region": "Asia", "script": "Han"},
    "cmn_Hant": {"name": "Chinese (Traditional)", "native_name": "中文 (繁體)", "region": "Asia", "script": "Han"},
    "jpn_Jpan": {"name": "Japanese", "native_name": "日本語", "region": "Asia", "script": "Japanese"},
    "kor_Hang": {"name": "Korean", "native_name": "한국어", "region": "Asia", "script": "Hangul"},
    "hin_Deva": {"name": "Hindi", "native_name": "हिन्दी", "region": "Asia", "script": "Devanagari"},
    "ben_Beng": {"name": "Bengali", "native_name": "বাংলা", "region": "Asia", "script": "Bengali"},
    "urd_Arab": {"name": "Urdu", "native_name": "اردو", "region": "Asia", "script": "Arabic"},
    "tam_Taml": {"name": "Tamil", "native_name": "தமிழ்", "region": "Asia", "script": "Tamil"},
    "tel_Telu": {"name": "Telugu", "native_name": "తెలుగు", "region": "Asia", "script": "Telugu"},
    "mar_Deva": {"name": "Marathi", "native_name": "मराठी", "region": "Asia", "script": "Devanagari"},
    "guj_Gujr": {"name": "Gujarati", "native_name": "ગુજરાતી", "region": "Asia", "script": "Gujarati"},
    "kan_Knda": {"name": "Kannada", "native_name": "ಕನ್ನಡ", "region": "Asia", "script": "Kannada"},
    "mal_Mlym": {"name": "Malayalam", "native_name": "മലയാളം", "region": "Asia", "script": "Malayalam"},
    "ori_Orya": {"name": "Odia", "native_name": "ଓଡ଼ିଆ", "region": "Asia", "script": "Odia"},
    "pan_Guru": {"name": "Punjabi", "native_name": "ਪੰਜਾਬੀ", "region": "Asia", "script": "Gurmukhi"},
    "tha_Thai": {"name": "Thai", "native_name": "ไทย", "region": "Asia", "script": "Thai"},
    "vie_Latn": {"name": "Vietnamese", "native_name": "Tiếng Việt", "region": "Asia", "script": "Latin"},
    "ind_Latn": {"name": "Indonesian", "native_name": "Bahasa Indonesia", "region": "Asia", "script": "Latin"},
    "msa_Latn": {"name": "Malay", "native_name": "Bahasa Melayu", "region": "Asia", "script": "Latin"},
    "tgl_Latn": {"name": "Tagalog", "native_name": "Tagalog", "region": "Asia", "script": "Latin"},
    # Mizo is spoken in north-east India, so it belongs with the Asian
    # languages rather than the African ones.
    "lus_Latn": {"name": "Mizo", "native_name": "Mizo ṭawng", "region": "Asia", "script": "Latin"},

    # Middle Eastern Languages
    "ara_Arab": {"name": "Arabic", "native_name": "العربية", "region": "Middle East", "script": "Arabic"},
    "heb_Hebr": {"name": "Hebrew", "native_name": "עברית", "region": "Middle East", "script": "Hebrew"},
    "fas_Arab": {"name": "Persian", "native_name": "فارسی", "region": "Middle East", "script": "Arabic"},
    "tur_Latn": {"name": "Turkish", "native_name": "Türkçe", "region": "Middle East", "script": "Latin"},

    # Americas: no entries of their own. Re-listing spa/por/eng/fra here would
    # repeat dict keys and overwrite the European entries above.
}


def get_all_languages() -> Dict[str, Dict[str, str]]:
    """Return the full language registry.

    Returns:
        The module-level ``SUPPORTED_LANGUAGES`` mapping itself (not a copy),
        keyed by language code. Mutating the result mutates the registry.
    """
    registry = SUPPORTED_LANGUAGES
    return registry


def get_languages_by_region(region: str) -> Dict[str, Dict[str, str]]:
    """Return the languages whose region matches ``region``.

    Args:
        region: Region name to filter on. It is compared case-insensitively
            against each entry's ``"region"`` value.

    Returns:
        A new dict mapping language code to its metadata for every matching
        entry. The dict is empty when nothing matches.
    """
    # Normalise the query once instead of once per registry entry.
    wanted = region.lower()
    return {
        code: info
        for code, info in SUPPORTED_LANGUAGES.items()
        if info["region"].lower() == wanted
    }


def get_language_info(language_code: str) -> Optional[Dict[str, str]]:
    """Look up the metadata for a single language code.

    Args:
        language_code: Registry key to look up; the match is exact.

    Returns:
        The metadata dict stored under ``language_code``, or ``None`` when
        the code is not in the registry.
    """
    if language_code not in SUPPORTED_LANGUAGES:
        return None
    return SUPPORTED_LANGUAGES[language_code]


def is_language_supported(language_code: str) -> bool:
    """Tell whether ``language_code`` is a key of the language registry.

    Args:
        language_code: Code to test; the match is exact and case-sensitive.

    Returns:
        ``True`` if the code is present in ``SUPPORTED_LANGUAGES``,
        otherwise ``False``.
    """
    known = language_code in SUPPORTED_LANGUAGES
    return known


def get_popular_languages() -> Dict[str, Dict[str, str]]:
    """Return the curated shortlist of languages from the registry.

    The shortlist is the hard-coded sequence of codes below: a group of
    global languages followed by a group of African languages.

    Returns:
        A new dict, in shortlist order, mapping each shortlisted code that
        exists in ``SUPPORTED_LANGUAGES`` to its metadata. Codes missing
        from the registry are skipped.
    """
    global_codes = (
        "eng_Latn", "spa_Latn", "fra_Latn", "deu_Latn", "ita_Latn", "por_Latn",
        "rus_Cyrl", "cmn_Hans", "jpn_Jpan", "kor_Hang", "ara_Arab", "hin_Deva",
    )
    african_codes = (
        "swh_Latn", "hau_Latn", "yor_Latn", "amh_Ethi", "som_Latn", "kik_Latn",
        "afr_Latn", "ibo_Latn", "orm_Latn", "aka_Latn", "bam_Latn", "fon_Latn",
        "lin_Latn", "lug_Latn", "nya_Latn", "sna_Latn", "tir_Ethi", "wol_Latn",
        "xho_Latn", "zul_Latn", "tsn_Latn", "sot_Latn",
    )

    selected = {}
    for lang_code in global_codes + african_codes:
        # Skip shortlist entries the registry does not know about.
        if lang_code in SUPPORTED_LANGUAGES:
            selected[lang_code] = SUPPORTED_LANGUAGES[lang_code]
    return selected


def get_african_languages() -> Dict[str, Dict[str, str]]:
    """Return the registry entries whose region is "Africa".

    Returns:
        The result of ``get_languages_by_region("Africa")``.
    """
    african = get_languages_by_region("Africa")
    return african


def search_languages(query: str) -> Dict[str, Dict[str, str]]:
    """Find languages whose name, native name or code contains ``query``.

    Matching is a case-insensitive substring test. An empty query matches
    every entry, because the empty string is a substring of any string.

    Args:
        query: Text to look for.

    Returns:
        A new dict, in registry order, mapping each matching language code
        to its metadata.
    """
    needle = query.lower()

    def _matches(lang_code, meta):
        # Checked in this order: English name, native name, then the code.
        candidates = (meta["name"], meta["native_name"], lang_code)
        return any(needle in text.lower() for text in candidates)

    return {
        lang_code: meta
        for lang_code, meta in SUPPORTED_LANGUAGES.items()
        if _matches(lang_code, meta)
    }


def get_language_statistics() -> Dict[str, object]:
    """Summarise the language registry.

    Returns:
        A dict with four keys:

        * ``"total_languages"``: number of entries in the registry.
        * ``"regions"``: number of distinct ``"region"`` values.
        * ``"scripts"``: number of distinct ``"script"`` values.
        * ``"by_region"``: dict mapping each region name to the number of
          entries in it, in order of first appearance in the registry.

        The values are not all integers (``"by_region"`` is a dict), hence
        the ``object`` value type in the annotation.
    """
    from collections import Counter

    # One pass over the registry gathers both the per-region tally and the
    # set of scripts.
    region_counts = Counter()
    scripts = set()
    for info in SUPPORTED_LANGUAGES.values():
        region_counts[info["region"]] += 1
        scripts.add(info["script"])

    return {
        "total_languages": len(SUPPORTED_LANGUAGES),
        "regions": len(region_counts),
        "scripts": len(scripts),
        # Return a plain dict so callers see the same type as before.
        "by_region": dict(region_counts),
    }