datasets-tagging / language_set.json
yacine
expanded tag sets
cca065e
raw
history blame
12.6 kB
{
"aa": "Afar",
"ab": "Abkhazian",
"ace": "Achinese",
"ach": "Acoli",
"ada": "Adangme",
"ady": "Adyghe, Adygei",
"ae": "Avestan",
"af": "Afrikaans",
"afa": "Afro-Asiatic languages",
"afh": "Afrihili",
"ain": "Ainu (Japan)",
"ak": "Akan",
"akk": "Akkadian",
"ale": "Aleut",
"alg": "Algonquian languages",
"alt": "Southern Altai",
"am": "Amharic",
"an": "Aragonese",
"ang": "Old English (ca. 450-1100)",
"apa": "Apache languages",
"ar": "Arabic",
"arc": "Official Aramaic (700-300 BCE), Imperial Aramaic (700-300 BCE)",
"arn": "Mapudungun, Mapuche",
"arp": "Arapaho",
"art": "Artificial languages",
"arw": "Arawak",
"as": "Assamese",
"ast": "Asturian, Asturleonese, Bable, Leonese",
"ath": "Athapascan languages",
"aus": "Australian languages",
"av": "Avaric",
"awa": "Awadhi",
"ay": "Aymara",
"az": "Azerbaijani",
"ba": "Bashkir",
"bad": "Banda languages",
"bai": "Bamileke languages",
"bal": "Baluchi",
"ban": "Balinese",
"bas": "Basa (Cameroon)",
"bat": "Baltic languages",
"be": "Belarusian",
"bej": "Beja, Bedawiyet",
"bem": "Bemba (Zambia)",
"ber": "Berber languages",
"bg": "Bulgarian",
"bh": "Bihari languages",
"bho": "Bhojpuri",
"bi": "Bislama",
"bik": "Bikol",
"bin": "Bini, Edo",
"bla": "Siksika",
"bm": "Bambara",
"bn": "Bengali, Bangla",
"bnt": "Bantu languages",
"bo": "Tibetan",
"br": "Breton",
"bra": "Braj",
"bs": "Bosnian",
"btk": "Batak languages",
"bua": "Buriat",
"bug": "Buginese",
"byn": "Bilin, Blin",
"ca": "Catalan, Valencian",
"cad": "Caddo",
"cai": "Central American Indian languages",
"car": "Galibi Carib",
"cau": "Caucasian languages",
"ce": "Chechen",
"ceb": "Cebuano",
"cel": "Celtic languages",
"ch": "Chamorro",
"chb": "Chibcha",
"chg": "Chagatai",
"chk": "Chuukese",
"chm": "Mari (Russia)",
"chn": "Chinook jargon",
"cho": "Choctaw",
"chp": "Chipewyan, Dene Suline",
"chr": "Cherokee",
"chy": "Cheyenne",
"cmc": "Chamic languages",
"co": "Corsican",
"cop": "Coptic",
"cpe": "English-based creoles and pidgins",
"cpf": "French-based creoles and pidgins",
"cpp": "Portuguese-based creoles and pidgins",
"cr": "Cree",
"crh": "Crimean Tatar, Crimean Turkish",
"crp": "Creoles and pidgins",
"cs": "Czech",
"csb": "Kashubian",
"cu": "Church Slavic, Church Slavonic, Old Bulgarian, Old Church Slavonic, Old Slavonic",
"cus": "Cushitic languages",
"cv": "Chuvash",
"cy": "Welsh",
"da": "Danish",
"dak": "Dakota",
"dar": "Dargwa",
"day": "Land Dayak languages",
"de": "German",
"del": "Delaware",
"den": "Slave (Athapascan)",
"dgr": "Dogrib, T\u0142\u0131\u0328ch\u01eb",
"din": "Dinka",
"doi": "Dogri (macrolanguage)",
"dra": "Dravidian languages",
"dsb": "Lower Sorbian",
"dua": "Duala",
"dum": "Middle Dutch (ca. 1050-1350)",
"dv": "Dhivehi, Divehi, Maldivian",
"dyu": "Dyula",
"dz": "Dzongkha",
"ee": "Ewe",
"efi": "Efik",
"egy": "Egyptian (Ancient)",
"eka": "Ekajuk",
"el": "Modern Greek (1453-)",
"elx": "Elamite",
"en": "English",
"enm": "Middle English (1100-1500)",
"eo": "Esperanto",
"es": "Spanish, Castilian",
"et": "Estonian",
"eu": "Basque",
"ewo": "Ewondo",
"fa": "Persian",
"fan": "Fang (Equatorial Guinea)",
"fat": "Fanti",
"ff": "Fulah",
"fi": "Finnish",
"fil": "Filipino, Pilipino",
"fiu": "Finno-Ugrian languages",
"fj": "Fijian",
"fo": "Faroese",
"fon": "Fon",
"fr": "French",
"frm": "Middle French (ca. 1400-1600)",
"fro": "Old French (842-ca. 1400)",
"fur": "Friulian",
"fy": "Western Frisian",
"ga": "Irish",
"gaa": "Ga",
"gay": "Gayo",
"gba": "Gbaya (Central African Republic)",
"gd": "Scottish Gaelic, Gaelic",
"gem": "Germanic languages",
"gez": "Geez",
"gil": "Gilbertese",
"gl": "Galician",
"gmh": "Middle High German (ca. 1050-1500)",
"gn": "Guarani",
"goh": "Old High German (ca. 750-1050)",
"gon": "Gondi",
"gor": "Gorontalo",
"got": "Gothic",
"grb": "Grebo",
"grc": "Ancient Greek (to 1453)",
"gu": "Gujarati",
"gv": "Manx",
"gwi": "Gwich\u02bcin",
"ha": "Hausa",
"hai": "Haida",
"haw": "Hawaiian",
"he": "Hebrew",
"hi": "Hindi",
"hil": "Hiligaynon",
"him": "Himachali languages, Western Pahari languages",
"hit": "Hittite",
"hmn": "Hmong, Mong",
"ho": "Hiri Motu",
"hr": "Croatian",
"hsb": "Upper Sorbian",
"ht": "Haitian, Haitian Creole",
"hu": "Hungarian",
"hup": "Hupa",
"hy": "Armenian",
"hz": "Herero",
"ia": "Interlingua (International Auxiliary Language Association)",
"iba": "Iban",
"id": "Indonesian",
"ie": "Interlingue, Occidental",
"ig": "Igbo",
"ii": "Sichuan Yi, Nuosu",
"ijo": "Ijo languages",
"ik": "Inupiaq",
"ilo": "Iloko",
"inc": "Indic languages",
"ine": "Indo-European languages",
"inh": "Ingush",
"io": "Ido",
"ira": "Iranian languages",
"iro": "Iroquoian languages",
"is": "Icelandic",
"it": "Italian",
"iu": "Inuktitut",
"ja": "Japanese",
"jbo": "Lojban",
"jpr": "Judeo-Persian",
"jrb": "Judeo-Arabic",
"jv": "Javanese",
"ka": "Georgian",
"kaa": "Kara-Kalpak, Karakalpak",
"kab": "Kabyle",
"kac": "Kachin, Jingpho",
"kam": "Kamba (Kenya)",
"kar": "Karen languages",
"kaw": "Kawi",
"kbd": "Kabardian",
"kg": "Kongo",
"kha": "Khasi",
"khi": "Khoisan languages",
"kho": "Khotanese, Sakan",
"ki": "Kikuyu, Gikuyu",
"kj": "Kuanyama, Kwanyama",
"kk": "Kazakh",
"kl": "Kalaallisut, Greenlandic",
"km": "Khmer, Central Khmer",
"kmb": "Kimbundu",
"kn": "Kannada",
"ko": "Korean",
"kok": "Konkani (macrolanguage)",
"kos": "Kosraean",
"kpe": "Kpelle",
"kr": "Kanuri",
"krc": "Karachay-Balkar",
"kro": "Kru languages",
"kru": "Kurukh",
"ks": "Kashmiri",
"ku": "Kurdish",
"kum": "Kumyk",
"kut": "Kutenai",
"kv": "Komi",
"kw": "Cornish",
"ky": "Kirghiz, Kyrgyz",
"la": "Latin",
"lad": "Ladino",
"lah": "Lahnda",
"lam": "Lamba",
"lb": "Luxembourgish, Letzeburgesch",
"lez": "Lezghian",
"lg": "Ganda, Luganda",
"li": "Limburgan, Limburger, Limburgish",
"ln": "Lingala",
"lo": "Lao",
"lol": "Mongo",
"loz": "Lozi",
"lt": "Lithuanian",
"lu": "Luba-Katanga",
"lua": "Luba-Lulua",
"lui": "Luiseno",
"lun": "Lunda",
"luo": "Luo (Kenya and Tanzania), Dholuo",
"lus": "Lushai",
"lv": "Latvian",
"mad": "Madurese",
"mag": "Magahi",
"mai": "Maithili",
"mak": "Makasar",
"man": "Mandingo, Manding",
"map": "Austronesian languages",
"mas": "Masai",
"mdf": "Moksha",
"mdr": "Mandar",
"men": "Mende (Sierra Leone)",
"mg": "Malagasy",
"mga": "Middle Irish (900-1200)",
"mh": "Marshallese",
"mi": "Maori",
"mic": "Mi'kmaq, Micmac",
"min": "Minangkabau",
"mis": "Uncoded languages",
"mk": "Macedonian",
"mkh": "Mon-Khmer languages",
"ml": "Malayalam",
"mn": "Mongolian",
"mnc": "Manchu",
"mni": "Manipuri",
"mno": "Manobo languages",
"moh": "Mohawk",
"mos": "Mossi",
"mr": "Marathi",
"ms": "Malay (macrolanguage)",
"mt": "Maltese",
"mul": "Multiple languages",
"mun": "Munda languages",
"mus": "Creek",
"mwl": "Mirandese",
"mwr": "Marwari",
"my": "Burmese",
"myn": "Mayan languages",
"myv": "Erzya",
"na": "Nauru",
"nah": "Nahuatl languages",
"nai": "North American Indian languages",
"nap": "Neapolitan",
"nb": "Norwegian Bokm\u00e5l",
"nd": "North Ndebele",
"nds": "Low German, Low Saxon",
"ne": "Nepali (macrolanguage)",
"new": "Newari, Nepal Bhasa",
"ng": "Ndonga",
"nia": "Nias",
"nic": "Niger-Kordofanian languages",
"niu": "Niuean",
"nl": "Dutch, Flemish",
"nn": "Norwegian Nynorsk",
"no": "Norwegian",
"nog": "Nogai",
"non": "Old Norse",
"nr": "South Ndebele",
"nso": "Pedi, Northern Sotho, Sepedi",
"nub": "Nubian languages",
"nv": "Navajo, Navaho",
"nwc": "Classical Newari, Classical Nepal Bhasa, Old Newari",
"ny": "Nyanja, Chewa, Chichewa",
"nym": "Nyamwezi",
"nyn": "Nyankole",
"nyo": "Nyoro",
"nzi": "Nzima",
"oc": "Occitan (post 1500)",
"oj": "Ojibwa",
"om": "Oromo",
"or": "Oriya (macrolanguage), Odia (macrolanguage)",
"os": "Ossetian, Ossetic",
"osa": "Osage",
"ota": "Ottoman Turkish (1500-1928)",
"oto": "Otomian languages",
"pa": "Panjabi, Punjabi",
"paa": "Papuan languages",
"pag": "Pangasinan",
"pal": "Pahlavi",
"pam": "Pampanga, Kapampangan",
"pap": "Papiamento",
"pau": "Palauan",
"peo": "Old Persian (ca. 600-400 B.C.)",
"phi": "Philippine languages",
"phn": "Phoenician",
"pi": "Pali",
"pl": "Polish",
"pon": "Pohnpeian",
"pra": "Prakrit languages",
"pro": "Old Proven\u00e7al (to 1500), Old Occitan (to 1500)",
"ps": "Pushto, Pashto",
"pt": "Portuguese",
"qaa..qtz": "Private use",
"qu": "Quechua",
"raj": "Rajasthani",
"rap": "Rapanui",
"rar": "Rarotongan, Cook Islands Maori",
"rm": "Romansh",
"rn": "Rundi",
"ro": "Romanian, Moldavian, Moldovan",
"roa": "Romance languages",
"rom": "Romany",
"ru": "Russian",
"rup": "Macedo-Romanian, Aromanian, Arumanian",
"rw": "Kinyarwanda",
"sa": "Sanskrit",
"sad": "Sandawe",
"sah": "Yakut",
"sai": "South American Indian languages",
"sal": "Salishan languages",
"sam": "Samaritan Aramaic",
"sas": "Sasak",
"sat": "Santali",
"sc": "Sardinian",
"scn": "Sicilian",
"sco": "Scots",
"sd": "Sindhi",
"se": "Northern Sami",
"sel": "Selkup",
"sem": "Semitic languages",
"sg": "Sango",
"sga": "Old Irish (to 900)",
"sgn": "Sign languages",
"sh": "Serbo-Croatian",
"shn": "Shan",
"si": "Sinhala, Sinhalese",
"sid": "Sidamo",
"sio": "Siouan languages",
"sit": "Sino-Tibetan languages",
"sk": "Slovak",
"sl": "Slovenian",
"sla": "Slavic languages",
"sm": "Samoan",
"sma": "Southern Sami",
"smi": "Sami languages",
"smj": "Lule Sami",
"smn": "Inari Sami",
"sms": "Skolt Sami",
"sn": "Shona",
"snk": "Soninke",
"so": "Somali",
"sog": "Sogdian",
"son": "Songhai languages",
"sq": "Albanian",
"sr": "Serbian",
"srn": "Sranan Tongo",
"srr": "Serer",
"ss": "Swati",
"ssa": "Nilo-Saharan languages",
"st": "Southern Sotho",
"su": "Sundanese",
"suk": "Sukuma",
"sus": "Susu",
"sux": "Sumerian",
"sv": "Swedish",
"sw": "Swahili (macrolanguage)",
"syr": "Syriac",
"ta": "Tamil",
"tai": "Tai languages",
"te": "Telugu",
"tem": "Timne",
"ter": "Tereno",
"tet": "Tetum",
"tg": "Tajik",
"th": "Thai",
"ti": "Tigrinya",
"tig": "Tigre",
"tiv": "Tiv",
"tk": "Turkmen",
"tkl": "Tokelau",
"tl": "Tagalog",
"tlh": "Klingon, tlhIngan Hol",
"tli": "Tlingit",
"tmh": "Tamashek",
"tn": "Tswana",
"to": "Tonga (Tonga Islands)",
"tog": "Tonga (Nyasa)",
"tpi": "Tok Pisin",
"tr": "Turkish",
"ts": "Tsonga",
"tsi": "Tsimshian",
"tt": "Tatar",
"tum": "Tumbuka",
"tup": "Tupi languages",
"tut": "Altaic languages",
"tvl": "Tuvalu",
"tw": "Twi",
"ty": "Tahitian",
"tyv": "Tuvinian",
"udm": "Udmurt",
"ug": "Uighur, Uyghur",
"uga": "Ugaritic",
"uk": "Ukrainian",
"umb": "Umbundu",
"und": "Undetermined",
"ur": "Urdu",
"uz": "Uzbek",
"vai": "Vai",
"ve": "Venda",
"vi": "Vietnamese",
"vo": "Volap\u00fck",
"vot": "Votic",
"wa": "Walloon",
"wak": "Wakashan languages",
"wal": "Wolaytta, Wolaitta",
"war": "Waray (Philippines)",
"was": "Washo",
"wen": "Sorbian languages",
"wo": "Wolof",
"xal": "Kalmyk, Oirat",
"xh": "Xhosa",
"yao": "Yao",
"yap": "Yapese",
"yi": "Yiddish",
"yo": "Yoruba",
"ypk": "Yupik languages",
"za": "Zhuang, Chuang",
"zap": "Zapotec",
"zen": "Zenaga",
"zh": "Chinese",
"znd": "Zande languages",
"zu": "Zulu",
"zun": "Zuni"
}