Spaces:
Sleeping
Sleeping
whisper-large-v3
/
venv
/lib
/python3.10
/site-packages
/pip
/_vendor
/chardet
/metadata
/languages.py
""" | |
Metadata about languages used by our model training code for our | |
SingleByteCharSetProbers. Could be used for other things in the future. | |
This code is based on the language metadata from the uchardet project. | |
""" | |
from string import ascii_letters | |
# TODO: Add Ukrainian (KOI8-U) | |
class Language:
    """Metadata about a language useful for training models.

    :ivar name: The human name for the language, in English.
    :type name: str
    :ivar iso_code: 2-letter ISO 639-1 if possible, 3-letter ISO code otherwise,
                    or use another catalog as a last resort.
    :type iso_code: str
    :ivar use_ascii: Whether or not ASCII letters should be included in trained
                     models.
    :type use_ascii: bool
    :ivar charsets: The charsets we want to support and create data for.
    :type charsets: list of str
    :ivar alphabet: The characters in the language's alphabet. If `use_ascii` is
                    `True`, you only need to add those not in the ASCII set.
                    Stored as a string of unique characters in sorted order.
    :type alphabet: str
    :ivar wiki_start_pages: The Wikipedia pages to start from if we're crawling
                            Wikipedia for training data.
    :type wiki_start_pages: list of str
    """

    def __init__(
        self,
        name=None,
        iso_code=None,
        use_ascii=True,
        charsets=None,
        alphabet=None,
        wiki_start_pages=None,
    ):
        """Store the metadata and normalise ``alphabet``.

        When ``use_ascii`` is true the ASCII letters are appended to whatever
        ``alphabet`` was supplied (or used on their own if none was given).
        The resulting characters are de-duplicated and sorted.

        :raises ValueError: if ``use_ascii`` is false and ``alphabet`` is
                            empty or ``None``.
        """
        super().__init__()
        self.name = name
        self.iso_code = iso_code
        self.use_ascii = use_ascii
        self.charsets = charsets
        if self.use_ascii:
            if alphabet:
                alphabet += ascii_letters
            else:
                alphabet = ascii_letters
        elif not alphabet:
            raise ValueError("Must supply alphabet if use_ascii is False")
        # ``alphabet`` is guaranteed to be non-empty at this point: either the
        # ASCII letters were added above or the ValueError was raised, so no
        # ``None`` fallback is needed.
        self.alphabet = "".join(sorted(set(alphabet)))
        self.wiki_start_pages = wiki_start_pages

    def __repr__(self):
        """Return ``ClassName(attr=value, ...)`` for all public attributes."""
        param_str = ", ".join(
            f"{k}={v!r}" for k, v in self.__dict__.items() if not k.startswith("_")
        )
        return f"{self.__class__.__name__}({param_str})"
# Registry of the languages we have metadata for, keyed by English name.
LANGUAGES = {
    "Arabic": Language(
        name="Arabic",
        iso_code="ar",
        use_ascii=False,
        # We only support encodings that use isolated
        # forms, because the current recommendation is
        # that the rendering system handles presentation
        # forms. This means we purposefully skip IBM864.
        # NOTE(review): "CP864" below appears to be the same code page as
        # IBM864, which contradicts the comment above -- confirm which one
        # is intended.
        charsets=["ISO-8859-6", "WINDOWS-1256", "CP720", "CP864"],
        alphabet="ءآأؤإئابةتثجحخدذرزسشصضطظعغػؼؽؾؿـفقكلمنهوىيًٌٍَُِّ",
        wiki_start_pages=["الصفحة_الرئيسية"],
    ),
    "Belarusian": Language(
        name="Belarusian",
        iso_code="be",
        use_ascii=False,
        charsets=["ISO-8859-5", "WINDOWS-1251", "IBM866", "MacCyrillic"],
        alphabet="АБВГДЕЁЖЗІЙКЛМНОПРСТУЎФХЦЧШЫЬЭЮЯабвгдеёжзійклмнопрстуўфхцчшыьэюяʼ",
        wiki_start_pages=["Галоўная_старонка"],
    ),
    "Bulgarian": Language(
        name="Bulgarian",
        iso_code="bg",
        use_ascii=False,
        charsets=["ISO-8859-5", "WINDOWS-1251", "IBM855"],
        alphabet="АБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЬЮЯабвгдежзийклмнопрстуфхцчшщъьюя",
        wiki_start_pages=["Начална_страница"],
    ),
    "Czech": Language(
        name="Czech",
        # The ISO 639-1 language code for Czech is "cs"; "cz" is the country
        # code, not a language code.
        iso_code="cs",
        use_ascii=True,
        charsets=["ISO-8859-2", "WINDOWS-1250"],
        alphabet="áčďéěíňóřšťúůýžÁČĎÉĚÍŇÓŘŠŤÚŮÝŽ",
        wiki_start_pages=["Hlavní_strana"],
    ),
    "Danish": Language(
        name="Danish",
        iso_code="da",
        use_ascii=True,
        charsets=["ISO-8859-1", "ISO-8859-15", "WINDOWS-1252"],
        alphabet="æøåÆØÅ",
        wiki_start_pages=["Forside"],
    ),
    "German": Language(
        name="German",
        iso_code="de",
        use_ascii=True,
        charsets=["ISO-8859-1", "WINDOWS-1252"],
        alphabet="äöüßÄÖÜ",
        wiki_start_pages=["Wikipedia:Hauptseite"],
    ),
    "Greek": Language(
        name="Greek",
        iso_code="el",
        use_ascii=False,
        charsets=["ISO-8859-7", "WINDOWS-1253"],
        alphabet="αβγδεζηθικλμνξοπρσςτυφχψωάέήίόύώΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΣΤΥΦΧΨΩΆΈΉΊΌΎΏ",
        wiki_start_pages=["Πύλη:Κύρια"],
    ),
    "English": Language(
        name="English",
        iso_code="en",
        use_ascii=True,
        charsets=["ISO-8859-1", "WINDOWS-1252"],
        wiki_start_pages=["Main_Page"],
    ),
    "Esperanto": Language(
        name="Esperanto",
        iso_code="eo",
        # Q, W, X, and Y not used at all
        use_ascii=False,
        charsets=["ISO-8859-3"],
        alphabet="abcĉdefgĝhĥijĵklmnoprsŝtuŭvzABCĈDEFGĜHĤIJĴKLMNOPRSŜTUŬVZ",
        wiki_start_pages=["Vikipedio:Ĉefpaĝo"],
    ),
    "Spanish": Language(
        name="Spanish",
        iso_code="es",
        use_ascii=True,
        charsets=["ISO-8859-1", "ISO-8859-15", "WINDOWS-1252"],
        alphabet="ñáéíóúüÑÁÉÍÓÚÜ",
        wiki_start_pages=["Wikipedia:Portada"],
    ),
    "Estonian": Language(
        name="Estonian",
        iso_code="et",
        use_ascii=False,
        charsets=["ISO-8859-4", "ISO-8859-13", "WINDOWS-1257"],
        # C, F, Š, Q, W, X, Y, Z, Ž are only for
        # loanwords
        alphabet="ABDEGHIJKLMNOPRSTUVÕÄÖÜabdeghijklmnoprstuvõäöü",
        wiki_start_pages=["Esileht"],
    ),
    "Finnish": Language(
        name="Finnish",
        iso_code="fi",
        use_ascii=True,
        charsets=["ISO-8859-1", "ISO-8859-15", "WINDOWS-1252"],
        alphabet="ÅÄÖŠŽåäöšž",
        wiki_start_pages=["Wikipedia:Etusivu"],
    ),
    "French": Language(
        name="French",
        iso_code="fr",
        use_ascii=True,
        charsets=["ISO-8859-1", "ISO-8859-15", "WINDOWS-1252"],
        alphabet="œàâçèéîïùûêŒÀÂÇÈÉÎÏÙÛÊ",
        wiki_start_pages=["Wikipédia:Accueil_principal", "Bœuf (animal)"],
    ),
    "Hebrew": Language(
        name="Hebrew",
        iso_code="he",
        use_ascii=False,
        charsets=["ISO-8859-8", "WINDOWS-1255"],
        alphabet="אבגדהוזחטיךכלםמןנסעףפץצקרשתװױײ",
        wiki_start_pages=["עמוד_ראשי"],
    ),
    "Croatian": Language(
        name="Croatian",
        iso_code="hr",
        # Q, W, X, Y are only used for foreign words.
        use_ascii=False,
        charsets=["ISO-8859-2", "WINDOWS-1250"],
        alphabet="abcčćdđefghijklmnoprsštuvzžABCČĆDĐEFGHIJKLMNOPRSŠTUVZŽ",
        wiki_start_pages=["Glavna_stranica"],
    ),
    "Hungarian": Language(
        name="Hungarian",
        iso_code="hu",
        # Q, W, X, Y are only used for foreign words.
        use_ascii=False,
        charsets=["ISO-8859-2", "WINDOWS-1250"],
        alphabet="abcdefghijklmnoprstuvzáéíóöőúüűABCDEFGHIJKLMNOPRSTUVZÁÉÍÓÖŐÚÜŰ",
        wiki_start_pages=["Kezdőlap"],
    ),
    "Italian": Language(
        name="Italian",
        iso_code="it",
        use_ascii=True,
        charsets=["ISO-8859-1", "ISO-8859-15", "WINDOWS-1252"],
        alphabet="ÀÈÉÌÒÓÙàèéìòóù",
        wiki_start_pages=["Pagina_principale"],
    ),
    "Lithuanian": Language(
        name="Lithuanian",
        iso_code="lt",
        use_ascii=False,
        charsets=["ISO-8859-13", "WINDOWS-1257", "ISO-8859-4"],
        # Q, W, and X not used at all
        alphabet="AĄBCČDEĘĖFGHIĮYJKLMNOPRSŠTUŲŪVZŽaąbcčdeęėfghiįyjklmnoprsštuųūvzž",
        wiki_start_pages=["Pagrindinis_puslapis"],
    ),
    "Latvian": Language(
        name="Latvian",
        iso_code="lv",
        use_ascii=False,
        charsets=["ISO-8859-13", "WINDOWS-1257", "ISO-8859-4"],
        # Q, W, X, Y are only for loanwords
        alphabet="AĀBCČDEĒFGĢHIĪJKĶLĻMNŅOPRSŠTUŪVZŽaābcčdeēfgģhiījkķlļmnņoprsštuūvzž",
        wiki_start_pages=["Sākumlapa"],
    ),
    "Macedonian": Language(
        name="Macedonian",
        iso_code="mk",
        use_ascii=False,
        charsets=["ISO-8859-5", "WINDOWS-1251", "MacCyrillic", "IBM855"],
        alphabet="АБВГДЃЕЖЗЅИЈКЛЉМНЊОПРСТЌУФХЦЧЏШабвгдѓежзѕијклљмнњопрстќуфхцчџш",
        wiki_start_pages=["Главна_страница"],
    ),
    "Dutch": Language(
        name="Dutch",
        iso_code="nl",
        use_ascii=True,
        charsets=["ISO-8859-1", "WINDOWS-1252"],
        wiki_start_pages=["Hoofdpagina"],
    ),
    "Polish": Language(
        name="Polish",
        iso_code="pl",
        # Q and X are only used for foreign words.
        use_ascii=False,
        charsets=["ISO-8859-2", "WINDOWS-1250"],
        alphabet="AĄBCĆDEĘFGHIJKLŁMNŃOÓPRSŚTUWYZŹŻaąbcćdeęfghijklłmnńoóprsśtuwyzźż",
        wiki_start_pages=["Wikipedia:Strona_główna"],
    ),
    "Portuguese": Language(
        name="Portuguese",
        iso_code="pt",
        use_ascii=True,
        charsets=["ISO-8859-1", "ISO-8859-15", "WINDOWS-1252"],
        alphabet="ÁÂÃÀÇÉÊÍÓÔÕÚáâãàçéêíóôõú",
        wiki_start_pages=["Wikipédia:Página_principal"],
    ),
    "Romanian": Language(
        name="Romanian",
        iso_code="ro",
        use_ascii=True,
        charsets=["ISO-8859-2", "WINDOWS-1250"],
        alphabet="ăâîșțĂÂÎȘȚ",
        wiki_start_pages=["Pagina_principală"],
    ),
    "Russian": Language(
        name="Russian",
        iso_code="ru",
        use_ascii=False,
        charsets=[
            "ISO-8859-5",
            "WINDOWS-1251",
            "KOI8-R",
            "MacCyrillic",
            "IBM866",
            "IBM855",
        ],
        alphabet="абвгдеёжзийклмнопрстуфхцчшщъыьэюяАБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ",
        wiki_start_pages=["Заглавная_страница"],
    ),
    "Slovak": Language(
        name="Slovak",
        iso_code="sk",
        use_ascii=True,
        charsets=["ISO-8859-2", "WINDOWS-1250"],
        alphabet="áäčďéíĺľňóôŕšťúýžÁÄČĎÉÍĹĽŇÓÔŔŠŤÚÝŽ",
        wiki_start_pages=["Hlavná_stránka"],
    ),
    "Slovene": Language(
        name="Slovene",
        iso_code="sl",
        # Q, W, X, Y are only used for foreign words.
        use_ascii=False,
        charsets=["ISO-8859-2", "WINDOWS-1250"],
        alphabet="abcčdefghijklmnoprsštuvzžABCČDEFGHIJKLMNOPRSŠTUVZŽ",
        wiki_start_pages=["Glavna_stran"],
    ),
    # Serbian can be written in both Latin and Cyrillic, but there's no
    # simple way to get the Latin alphabet pages from Wikipedia through
    # the API, so for now we just support Cyrillic.
    # NOTE(review): use_ascii is not passed here, so the ASCII letters are
    # added to this Cyrillic alphabet, unlike the other Cyrillic entries --
    # confirm this is intended before changing it.
    "Serbian": Language(
        name="Serbian",
        iso_code="sr",
        alphabet="АБВГДЂЕЖЗИЈКЛЉМНЊОПРСТЋУФХЦЧЏШабвгдђежзијклљмнњопрстћуфхцчџш",
        charsets=["ISO-8859-5", "WINDOWS-1251", "MacCyrillic", "IBM855"],
        wiki_start_pages=["Главна_страна"],
    ),
    "Thai": Language(
        name="Thai",
        iso_code="th",
        use_ascii=False,
        charsets=["ISO-8859-11", "TIS-620", "CP874"],
        alphabet="กขฃคฅฆงจฉชซฌญฎฏฐฑฒณดตถทธนบปผฝพฟภมยรฤลฦวศษสหฬอฮฯะัาำิีึืฺุู฿เแโใไๅๆ็่้๊๋์ํ๎๏๐๑๒๓๔๕๖๗๘๙๚๛",
        wiki_start_pages=["หน้าหลัก"],
    ),
    "Turkish": Language(
        name="Turkish",
        iso_code="tr",
        # Q, W, and X are not used by Turkish
        use_ascii=False,
        charsets=["ISO-8859-3", "ISO-8859-9", "WINDOWS-1254"],
        alphabet="abcçdefgğhıijklmnoöprsştuüvyzâîûABCÇDEFGĞHIİJKLMNOÖPRSŞTUÜVYZÂÎÛ",
        wiki_start_pages=["Ana_Sayfa"],
    ),
    "Vietnamese": Language(
        name="Vietnamese",
        iso_code="vi",
        use_ascii=False,
        # Windows-1258 is the only common 8-bit
        # Vietnamese encoding supported by Python.
        # From Wikipedia:
        # For systems that lack support for Unicode,
        # dozens of 8-bit Vietnamese code pages are
        # available.[1] The most common are VISCII
        # (TCVN 5712:1993), VPS, and Windows-1258.[3]
        # Where ASCII is required, such as when
        # ensuring readability in plain text e-mail,
        # Vietnamese letters are often encoded
        # according to Vietnamese Quoted-Readable
        # (VIQR) or VSCII Mnemonic (VSCII-MNEM),[4]
        # though usage of either variable-width
        # scheme has declined dramatically following
        # the adoption of Unicode on the World Wide
        # Web.
        charsets=["WINDOWS-1258"],
        alphabet="aăâbcdđeêghiklmnoôơpqrstuưvxyAĂÂBCDĐEÊGHIKLMNOÔƠPQRSTUƯVXY",
        wiki_start_pages=["Chữ_Quốc_ngữ"],
    ),
}