# prep_staged_models.py — model-card staging script from the mfa-models repository.
# (Scraped web-page header "MFA / scripts / prep_staged_models.py / niobures's picture /
# MFA / 2f6b10b verified" removed: it is page residue, not Python.)
import collections
import json
import os
import pathlib
import random
import re
import shutil
import typing
from datetime import datetime
import numpy as np
import sqlalchemy
from montreal_forced_aligner import config
config.TEMPORARY_DIRECTORY = pathlib.Path(os.path.dirname(os.path.abspath(__file__)))
config.USE_POSTGRES = False
import montreal_forced_aligner.utils
from montreal_forced_aligner.data import PhoneSetType, voiced_variants, voiceless_variants
from montreal_forced_aligner.db import Phone, PhoneType, Pronunciation, Word
from montreal_forced_aligner.dictionary.multispeaker import MultispeakerDictionary
from montreal_forced_aligner.models import MODEL_TYPES
rng = np.random.default_rng(1234)
random.seed(1234)
root_dir = pathlib.Path(__file__).resolve().parent
template_dir = root_dir.joinpath("templates")
CURRENT_MODEL_VERSION = "3.3.0"
# Get corpus information
current_corpora = {
"english": [
"Common Voice English v8_0",
"LibriSpeech English",
"Corpus of Regional African American Language v2021_07",
"Google Nigerian English",
"Google UK and Ireland English",
"NCHLT English",
"ARU English corpus",
"ICE-Nigeria",
"A Scripted Pakistani English Daily-use Speech Corpus",
"L2-ARCTIC",
],
"czech": [
"Common Voice Czech v9_0",
"GlobalPhone Czech v3_1",
"Large Corpus of Czech Parliament Plenary Hearings",
"Czech Parliament Meetings",
],
"hausa": ["Common Voice Hausa v9_0", "GlobalPhone Hausa v3_1"],
"swahili": ["Common Voice Swahili v9_0", "ALFFA Swahili", "GlobalPhone Swahili v3_1"],
"korean": [
"GlobalPhone Korean v3_1",
"Deeply Korean read speech corpus public sample",
"Pansori TEDxKR",
"Zeroth Korean",
"Seoul Corpus",
"ASR-KCSC: A Korean Conversational Speech Corpus",
"ASR-SKDuSC: A Scripted Korean Daily-use Speech Corpus",
"Korean Single Speaker Speech Dataset",
"Common Voice Korean v16_1",
],
"mandarin": [
"Common Voice Chinese (China) v9_0",
"Common Voice Chinese (Taiwan) v9_0",
"AI-DataTang Corpus",
"AISHELL-3",
"THCHS-30",
"GlobalPhone Chinese-Mandarin v3_1",
],
"japanese": [
"Common Voice Japanese v9_0",
"GlobalPhone Japanese v3_1",
"Microsoft Speech Language Translation Japanese",
"Japanese Versatile Speech",
"TEDxJP-10K v1_1",
],
"thai": ["Common Voice Thai v9_0", "GlobalPhone Thai v3_1"],
"vietnamese": ["Common Voice Vietnamese v9_0", "VIVOS", "GlobalPhone Vietnamese v3_1"],
}
model_corpus_mapping = {
"Abkhaz CV acoustic model v2_0_0": ["Common Voice Abkhaz v7_0"],
"Armenian CV acoustic model v2_0_0": ["Common Voice Armenian v7_0"],
"Bashkir CV acoustic model v2_0_0": ["Common Voice Bashkir v7_0"],
"Basque CV acoustic model v2_0_0": ["Common Voice Basque v7_0"],
"Belarusian CV acoustic model v2_0_0": ["Common Voice Belarusian v7_0"],
"Bulgarian CV acoustic model v2_0_0": ["Common Voice Bulgarian v7_0"],
"Chuvash CV acoustic model v2_0_0": ["Common Voice Chuvash v7_0"],
"Czech CV acoustic model v2_0_0": ["Common Voice Czech v7_0"],
"Dutch CV acoustic model v2_0_0": ["Common Voice Dutch v7_0"],
"Georgian CV acoustic model v2_0_0": ["Common Voice Georgian v7_0"],
"Greek CV acoustic model v2_0_0": ["Common Voice Greek v7_0"],
"Guarani CV acoustic model v2_0_0": ["Common Voice Guarani v7_0"],
"Hausa CV acoustic model v2_0_0": ["Common Voice Hausa v7_0"],
"Hungarian CV acoustic model v2_0_0": ["Common Voice Hungarian v7_0"],
"Italian CV acoustic model v2_0_0": ["Common Voice Italian v7_0"],
"Kazakh CV acoustic model v2_0_0": ["Common Voice Kazakh v7_0"],
"Kurmanji CV acoustic model v2_0_0": ["Common Voice Kurmanji v7_0"],
"Kyrgyz CV acoustic model v2_0_0": ["Common Voice Kyrgyz v7_0"],
"Polish CV acoustic model v2_0_0": ["Common Voice Polish v7_0"],
"Portuguese CV acoustic model v2_0_0": ["Common Voice Portuguese v7_0"],
"Romanian CV acoustic model v2_0_0": ["Common Voice Romanian v7_0"],
"Russian CV acoustic model v2_0_0": ["Common Voice Russian v7_0"],
"Sorbian (Upper) CV acoustic model v2_0_0": ["Common Voice Sorbian Upper v7_0"],
"Swedish CV acoustic model v2_0_0": ["Common Voice Swedish v7_0"],
"Tamil CV acoustic model v2_0_0": ["Common Voice Tamil v7_0"],
"Tatar CV acoustic model v2_0_0": ["Common Voice Tatar v7_0"],
"Thai CV acoustic model v2_0_0": ["Common Voice Thai v7_0"],
"Turkish CV acoustic model v2_0_0": ["Common Voice Turkish v7_0"],
"Ukrainian CV acoustic model v2_0_0": ["Common Voice Ukrainian v7_0"],
"Uyghur CV acoustic model v2_0_0": ["Common Voice Uyghur v7_0"],
"Uzbek CV acoustic model v2_0_0": ["Common Voice Uzbek v7_0"],
"Vietnamese CV acoustic model v2_0_0": ["Common Voice Vietnamese v7_0"],
"English (US) ARPA acoustic model v2_0_0": ["LibriSpeech English"],
"English (US) ARPA acoustic model v2_0_0a": ["LibriSpeech English"],
"English (US) ARPA acoustic model v3_0_0": ["LibriSpeech English"],
"English MFA acoustic model v2_0_0": [
"Common Voice English v8_0",
"LibriSpeech English",
"Corpus of Regional African American Language v2021_07",
"Google Nigerian English",
"Google UK and Ireland English",
"NCHLT English",
"ARU English corpus",
],
"English MFA acoustic model v2_0_0a": [
"Common Voice English v8_0",
"LibriSpeech English",
"Corpus of Regional African American Language v2021_07",
"Google Nigerian English",
"Google UK and Ireland English",
"NCHLT English",
"ARU English corpus",
],
"English MFA acoustic model v2_2_1": [
"Common Voice English v8_0",
"LibriSpeech English",
"Corpus of Regional African American Language v2021_07",
"Google Nigerian English",
"Google UK and Ireland English",
"NCHLT English",
"ARU English corpus",
"ICE-Nigeria",
"A Scripted Pakistani English Daily-use Speech Corpus",
"L2-ARCTIC",
],
"English MFA acoustic model v3_0_0": [
"Common Voice English v8_0",
"LibriSpeech English",
"Corpus of Regional African American Language v2021_07",
"Google Nigerian English",
"Google UK and Ireland English",
"NCHLT English",
"ARU English corpus",
"ICE-Nigeria",
"A Scripted Pakistani English Daily-use Speech Corpus",
"L2-ARCTIC",
],
"English MFA acoustic model v3_1_0": [
"Common Voice English v17_0",
"LibriSpeech English",
"Corpus of Regional African American Language v2021_07",
"Google Nigerian English",
"Google UK and Ireland English",
"NCHLT English",
"ARU English corpus",
"ICE-Nigeria",
"A Scripted Pakistani English Daily-use Speech Corpus",
"L2-ARCTIC",
],
"English MFA ivector extractor v2_1_0": current_corpora["english"],
"Multilingual MFA ivector extractor v2_1_0": [
x
for k in [
"english",
"czech",
"hausa",
"swahili",
"thai",
"vietnamese",
"japanese",
"mandarin",
]
for x in current_corpora[k]
],
"French MFA acoustic model v2_0_0": [
"Common Voice French v8_0",
"Multilingual LibriSpeech French",
"GlobalPhone French v3_1",
"African-accented French",
],
"French MFA acoustic model v2_0_0a": [
"Common Voice French v8_0",
"Multilingual LibriSpeech French",
"GlobalPhone French v3_1",
"African-accented French",
],
"French MFA acoustic model v3_0_0": [
"Common Voice French v16_1",
"GlobalPhone French v3_1",
"African-accented French",
],
"German MFA acoustic model v2_0_0": [
"Common Voice German v8_0",
"Multilingual LibriSpeech German",
"GlobalPhone German v3_1",
],
"German MFA acoustic model v3_0_0": ["Common Voice German v16_1", "GlobalPhone German v3_1"],
"German MFA acoustic model v2_0_0a": [
"Common Voice German v8_0",
"Multilingual LibriSpeech German",
"GlobalPhone German v3_1",
],
"Japanese MFA acoustic model v2_0_1a": [
"Common Voice Japanese v12_0",
"GlobalPhone Japanese v3_1",
"Microsoft Speech Language Translation Japanese",
"Japanese Versatile Speech",
"TEDxJP-10K v1_1",
],
"Japanese MFA acoustic model v3_0_0": [
"Common Voice Japanese v12_0",
"GlobalPhone Japanese v3_1",
"Microsoft Speech Language Translation Japanese",
"Japanese Versatile Speech",
"TEDxJP-10K v1_1",
],
"Hausa MFA acoustic model v2_0_0": ["Common Voice Hausa v8_0", "GlobalPhone Hausa v3_1"],
"Hausa MFA acoustic model v2_0_0a": ["Common Voice Hausa v9_0", "GlobalPhone Hausa v3_1"],
"Hausa MFA acoustic model v3_0_0": ["Common Voice Hausa v9_0", "GlobalPhone Hausa v3_1"],
"Mandarin MFA acoustic model v2_0_0": [
"Common Voice Chinese (China) v8_0",
"Common Voice Chinese (Taiwan) v8_0",
"AI-DataTang Corpus",
"AISHELL-3",
"THCHS-30",
],
"Mandarin MFA acoustic model v2_0_0a": [
"Common Voice Chinese (China) v9_0",
"Common Voice Chinese (Taiwan) v9_0",
"AI-DataTang Corpus",
"AISHELL-3",
"THCHS-30",
"GlobalPhone Chinese-Mandarin v3_1",
],
"Mandarin MFA acoustic model v3_0_0": [
"Common Voice Chinese (China) v16_1",
"Common Voice Chinese (Taiwan) v16_1",
"AI-DataTang Corpus",
"AISHELL-3",
"THCHS-30",
"GlobalPhone Chinese-Mandarin v3_1",
],
"Korean MFA acoustic model v2_0_0": [
"GlobalPhone Korean v3_1",
"Deeply Korean read speech corpus public sample",
"Pansori TEDxKR",
"Zeroth Korean",
"Seoul Corpus",
],
"Korean MFA acoustic model v2_0_0a": [
"GlobalPhone Korean v3_1",
"Deeply Korean read speech corpus public sample",
"Pansori TEDxKR",
"Zeroth Korean",
"Seoul Corpus",
],
"Korean MFA acoustic model v3_0_0": [
"GlobalPhone Korean v3_1",
"Deeply Korean read speech corpus public sample",
"Pansori TEDxKR",
"Zeroth Korean",
"ASR-KCSC A Korean Conversational Speech Corpus",
"ASR-SKDuSC A Scripted Korean Daily-use Speech Corpus",
"Korean Single Speaker Speech Dataset",
"Common Voice Korean v16_1",
],
"Polish MFA acoustic model v2_0_0": [
"Common Voice Polish v8_0",
"Multilingual LibriSpeech Polish",
"M-AILABS Polish",
"GlobalPhone Polish v3_1",
],
"Polish MFA acoustic model v2_0_0a": [
"Common Voice Polish v8_0",
"Multilingual LibriSpeech Polish",
"M-AILABS Polish",
"GlobalPhone Polish v3_1",
],
"Portuguese MFA acoustic model v2_0_0": [
"Common Voice Portuguese v8_0",
"Multilingual LibriSpeech Portuguese",
"GlobalPhone Portuguese (Brazilian) v3_1",
],
"Portuguese MFA acoustic model v2_0_0a": [
"Common Voice Portuguese v8_0",
"Multilingual LibriSpeech Portuguese",
"GlobalPhone Portuguese (Brazilian) v3_1",
],
"Russian MFA acoustic model v2_0_0": [
"Common Voice Russian v8_0",
"Russian LibriSpeech",
"M-AILABS Russian",
"GlobalPhone Russian v3_1",
],
"Russian MFA acoustic model v2_0_0a": [
"Common Voice Russian v9_0",
"Russian LibriSpeech",
"M-AILABS Russian",
"GlobalPhone Russian v3_1",
],
"Russian MFA acoustic model v3_1_0": [
"Common Voice Russian v17_0",
"Russian LibriSpeech",
"M-AILABS Russian",
"Multilingual TEDx Russian",
"GlobalPhone Russian v3_1",
],
"Spanish MFA acoustic model v2_0_0": [
"Common Voice Spanish v8_0",
"Multilingual LibriSpeech Spanish",
"Google i18n Chile",
"Google i18n Columbia",
"Google i18n Peru",
"Google i18n Puerto Rico",
"Google i18n Venezuela",
"M-AILABS Spanish",
"GlobalPhone Spanish (Latin American) v3_1",
],
"Spanish MFA acoustic model v2_0_0a": [
"Common Voice Spanish v8_0",
"Multilingual LibriSpeech Spanish",
"Google i18n Chile",
"Google i18n Columbia",
"Google i18n Peru",
"Google i18n Puerto Rico",
"Google i18n Venezuela",
"M-AILABS Spanish",
"GlobalPhone Spanish (Latin American) v3_1",
],
"Spanish MFA acoustic model v3_3_0": [
"Common Voice Spanish v8_0",
"Multilingual LibriSpeech Spanish",
"Google i18n Chile",
"Google i18n Columbia",
"Google i18n Peru",
"Google i18n Puerto Rico",
"Google i18n Venezuela",
"M-AILABS Spanish",
"GlobalPhone Spanish (Latin American) v3_1",
"Multilingual TEDx Spanish",
],
"Swahili MFA acoustic model v2_0_0": [
"Common Voice Swahili v8_0",
"ALFFA Swahili",
"GlobalPhone Swahili v3_1",
],
"Swahili MFA acoustic model v2_0_0a": [
"Common Voice Swahili v9_0",
"ALFFA Swahili",
"GlobalPhone Swahili v3_1",
],
"Swedish MFA acoustic model v2_0_0": [
"Common Voice Swedish v8_0",
"NST Swedish",
"GlobalPhone Swedish v3_1",
],
"Swedish MFA acoustic model v2_0_0a": [
"Common Voice Swedish v8_0",
"NST Swedish",
"GlobalPhone Swedish v3_1",
],
"Swedish MFA acoustic model v3_0_0": [
"Common Voice Swedish v8_0",
"NST Swedish",
"GlobalPhone Swedish v3_1",
],
"Thai MFA acoustic model v2_0_0": ["Common Voice Thai v8_0", "GlobalPhone Thai v3_1"],
"Thai MFA acoustic model v2_0_0a": ["Common Voice Thai v9_0", "GlobalPhone Thai v3_1"],
"Thai MFA acoustic model v3_0_0": [
"Common Voice Thai v16_1",
"GlobalPhone Thai v3_1",
"Lotus Corpus v1_0",
"Gowajee Corpus v0_9_3",
"Thai Elderly Speech dataset by Data Wow and VISAI v1_0_0",
],
"Bulgarian MFA acoustic model v2_0_0": [
"Common Voice Bulgarian v8_0",
"GlobalPhone Bulgarian v3_1",
],
"Bulgarian MFA acoustic model v2_0_0a": [
"Common Voice Bulgarian v9_0",
"GlobalPhone Bulgarian v3_1",
],
"Bulgarian MFA acoustic model v3_0_0": [
"Common Voice Bulgarian v16_1",
"GlobalPhone Bulgarian v3_1",
],
"Croatian MFA acoustic model v2_0_0": [
"Common Voice Serbian v8_0",
"GlobalPhone Croatian v3_1",
],
"Croatian MFA acoustic model v2_0_0a": [
"Common Voice Serbian v9_0",
"GlobalPhone Croatian v3_1",
],
"Croatian MFA acoustic model v3_3_0": [
"Common Voice Serbian v9_0",
"GlobalPhone Croatian v3_1",
],
"Czech MFA acoustic model v2_0_0": [
"Common Voice Czech v8_0",
"GlobalPhone Czech v3_1",
"Large Corpus of Czech Parliament Plenary Hearings",
"Czech Parliament Meetings",
],
"Czech MFA acoustic model v2_0_0a": [
"Common Voice Czech v9_0",
"GlobalPhone Czech v3_1",
"Large Corpus of Czech Parliament Plenary Hearings",
"Czech Parliament Meetings",
],
"Czech MFA acoustic model v3_3_0": [
"Common Voice Czech v9_0",
"GlobalPhone Czech v3_1",
"Large Corpus of Czech Parliament Plenary Hearings",
"Czech Parliament Meetings",
],
"Turkish MFA acoustic model v3_0_0": [
"Common Voice Turkish v16_1",
"GlobalPhone Turkish v3_1",
],
"Turkish MFA acoustic model v2_0_0": [
"Common Voice Turkish v8_0",
"MediaSpeech Turkish v1_1",
"GlobalPhone Turkish v3_1",
],
"Turkish MFA acoustic model v2_0_0a": [
"Common Voice Turkish v8_0",
"MediaSpeech Turkish v1_1",
"GlobalPhone Turkish v3_1",
],
"Ukrainian MFA acoustic model v2_0_0": [
"Common Voice Ukrainian v8_0",
"M-AILABS Ukrainian",
"GlobalPhone Ukrainian v3_1",
],
"Ukrainian MFA acoustic model v2_0_0a": [
"Common Voice Ukrainian v9_0",
"M-AILABS Ukrainian",
"GlobalPhone Ukrainian v3_1",
],
"Ukrainian MFA acoustic model v3_0_0": [
"Common Voice Ukrainian v16_1",
"M-AILABS Ukrainian",
"GlobalPhone Ukrainian v3_1",
],
"Vietnamese MFA acoustic model v2_0_0": [
"Common Voice Vietnamese v8_0",
"VIVOS",
"GlobalPhone Vietnamese v3_1",
],
"Vietnamese MFA acoustic model v2_0_0a": [
"Common Voice Vietnamese v9_0",
"VIVOS",
"GlobalPhone Vietnamese v3_1",
],
"Vietnamese MFA acoustic model v3_0_0": [
"Common Voice Vietnamese v17_0",
"VIVOS",
"GlobalPhone Vietnamese v3_1",
],
}
model_dictionary_mapping = {
"English MFA acoustic model v2_0_0": [
"English (US) MFA dictionary v2_0_0",
"English (UK) MFA dictionary v2_0_0",
"English (Nigeria) MFA dictionary v2_0_0",
],
"English MFA acoustic model v3_0_0": [
"English (US) MFA dictionary v3_0_0",
"English (UK) MFA dictionary v3_0_0",
"English (Nigeria) MFA dictionary v3_0_0",
"English (India) MFA dictionary v3_0_0",
],
"English MFA acoustic model v3_1_0": [
"English (US) MFA dictionary v3_1_0",
"English (UK) MFA dictionary v3_1_0",
"English (Nigeria) MFA dictionary v3_1_0",
"English (India) MFA dictionary v3_1_0",
],
"Vietnamese MFA acoustic model v2_0_0": [
"Vietnamese (Hanoi) MFA dictionary v2_0_0",
"Vietnamese (Ho Chi Minh City) MFA dictionary v2_0_0",
"Vietnamese (Hue) MFA dictionary v2_0_0",
"Vietnamese MFA dictionary v2_0_0",
],
"Spanish MFA acoustic model v2_0_0": [
"Spanish (Latin America) MFA dictionary v2_0_0",
"Spanish (Spain) MFA dictionary v2_0_0",
"Spanish MFA dictionary v2_0_0",
],
"Spanish MFA acoustic model v3_3_0": [
"Spanish (Latin America) MFA dictionary v3_3_0",
"Spanish (Spain) MFA dictionary v3_3_0",
],
"Portuguese MFA acoustic model v2_0_0": [
"Portuguese (Brazil) MFA dictionary v2_0_0",
"Portuguese (Portugal) MFA dictionary v2_0_0",
"Portuguese MFA dictionary v2_0_0",
],
"Mandarin MFA acoustic model v2_0_0": [
"Mandarin (China) MFA dictionary v2_0_0",
"Mandarin (Erhua) MFA dictionary v2_0_0",
"Mandarin (Taiwan) MFA dictionary v2_0_0",
],
}
def make_path_safe(string):
s = re.sub(r"[- .:()]+", "_", string.lower())
if s.endswith("_"):
s = s[:-1]
return s
def get_model_card_directory(model_type, meta_data):
model_directory = os.path.join(mfa_model_root, model_type)
if model_type == "language_model":
language, version = meta_data["language"], meta_data["version"]
directory = os.path.join(model_directory, language.lower(), "mfa", f"v{version}")
elif model_type in {"ivector", "tokenizer"}:
language, version = meta_data["language"], meta_data["version"]
directory = os.path.join(model_directory, language.lower(), f"v{version}")
elif model_type == "corpus":
language, name = meta_data["language"], meta_data["name"]
name = make_path_safe(name)
if "version" in meta_data:
version = meta_data["version"]
directory = os.path.join(model_directory, language.lower(), name, f"{version}")
else:
directory = os.path.join(model_directory, language.lower(), name)
else:
language, phone_set, dialect, version = (
meta_data["language"],
meta_data["phone_set"],
meta_data["dialect"],
meta_data["version"],
)
if dialect:
phoneset_folder = f"{dialect}_{phone_set}".replace(" ", "_").lower()
else:
phoneset_folder = phone_set.lower()
directory = os.path.join(model_directory, language.lower(), phoneset_folder, f"v{version}")
return directory
mfa_model_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
OVERWRITE_METADATA = False
OVERWRITE_MD = False
mfa_citation_template = (
"@techreport{{{id},\n\tauthor={{{extra_authors}McAuliffe, Michael and Sonderegger, Morgan}},"
"\n\ttitle={{{title}}},"
"\n\taddress={{\\url{{https://mfa-models.readthedocs.io/{model_type}/{language}/{link_safe_title}.html}}}},"
"\n\tyear={{{year}}},\n\tmonth={{{month}}},"
"\n}}"
)
cv_citation = (
"@misc{Ahn_Chodroff_2022,\n\tauthor={Ahn, Emily and Chodroff, Eleanor},"
"\n\ttitle={VoxCommunis Corpus},"
"\n\taddress={\\url{https://osf.io/t957v}},"
"\n\tpublisher={OSF},"
"\n\tyear={2022}, \n\tmonth={Jan}\n}"
)
prosodylab_citation = (
"@article{gorman2011prosodylab,\n\tauthor={Gorman, Kyle and Howell, Jonathan and Wagner, Michael},"
"\n\ttitle={Prosodylab-aligner: A tool for forced alignment of laboratory speech},"
"\n\tjournal={Canadian Acoustics},"
"\n\tvolume={39},\n\tnumber={3},\n\tpages={192--193},\n\tyear={2011}\n}"
)
language_link_template = "[{}]({})"
license_links = {
"CC-0": "https://creativecommons.org/publicdomain/zero/1.0/",
"CC BY 4.0": "https://creativecommons.org/licenses/by/4.0/",
"CC BY 3.0": "https://creativecommons.org/licenses/by/3.0/",
"CC BY-SA-NC 3.0": "https://creativecommons.org/licenses/by-nc-sa/3.0/",
"CC BY-NC-SA 4.0": "https://creativecommons.org/licenses/by-nc-sa/4.0/",
"CC BY-NC-SA 3.0": "https://creativecommons.org/licenses/by-nc-sa/3.0/",
"CC BY-NC 4.0": "https://creativecommons.org/licenses/by-nc/4.0/",
"CC BY-SA 4.0": "https://creativecommons.org/licenses/by-sa/4.0/",
"CC BY-NC-ND 4.0": "https://creativecommons.org/licenses/by-nc-nd/4.0/",
"CC BY-NC 2.0": "https://creativecommons.org/licenses/by-nc/2.0/",
"CC BY-NC-ND 3.0": "https://creativecommons.org/licenses/by-nc-nd/3.0/",
"Microsoft Research Data License": "https://msropendata-web-api.azurewebsites.net/licenses/2f933be3-284d-500b-7ea3-2aa2fd0f1bb2/view",
"Apache 2.0": "https://www.apache.org/licenses/LICENSE-2.0",
"O-UDA v1.0": "https://msropendata-web-api.azurewebsites.net/licenses/f1f352a6-243f-4905-8e00-389edbca9e83/view",
"MIT": "https://opensource.org/licenses/MIT",
"Public domain in the USA": "https://creativecommons.org/share-your-work/public-domain/cc0/",
"M-AILABS License": "https://www.caito.de/2019/01/the-m-ailabs-speech-dataset/",
"ELRA": "https://www.elra.info/en/services-around-lrs/distribution/licensing/",
"Buckeye License": "https://buckeyecorpus.osu.edu/php/registration.php",
"LDC License": "https://www.ldc.upenn.edu/data-management/using/licensing",
"LaboroTV Non-commercial": "https://laboro.ai/activity/column/engineer/eg-laboro-tv-corpus-jp/",
}
mfa_maintainer = "[Montreal Forced Aligner](https://montreal-forced-aligner.readthedocs.io/)"
cv_maintainer = "[Vox Communis](https://osf.io/t957v/)"
corpus_detail_template = """
* {link}:
* **Hours:** `{num_hours:.2f}`
* **Speakers:** `{num_speakers:,}`
* **Utterances:** `{num_utterances:,}`"""
g2p_training_detail_template = """
* **Words:** `{num_words:,}`
* **Phones:** `{num_phones:,}`
* **Graphemes:** `{num_graphemes:,}`"""
g2p_evaluation_detail_template = """
* **Words:** `{num_words:,}`
* **WER:** `{word_error_rate:.2f}%`
* **PER:** `{phone_error_rate:.2f}%`"""
tokenizer_training_detail_template = """
* **Utterances:** `{num_utterances:,}`
* **Graphemes:** `{num_graphemes:,}`"""
tokenizer_evaluation_detail_template = """
* **Utterances:** `{num_utterances:,}`
* **UER:** `{utterance_error_rate:.2f}%`
* **CER:** `{character_error_rate:.2f}%`"""
lm_training_detail_template = """
* **Words:** `{num_words:,}`
* **OOVs:** `{num_oovs:,}`"""
lm_evaluation_detail_template = """
* **Large model:** `{large_perplexity:.2f}`
* **Medium model:** `{medium_perplexity:.2f}`
* **Small model:** `{small_perplexity:.2f}`"""
link_template = "* {{ref}}`{}`"
see_also_template = """```{{admonition}} {model_type_name}
{links}
```"""
mfa_acoustic_model_card_template = template_dir.joinpath(
"mfa_acoustic_model_card_template.md"
).read_text("utf8")
ivector_card_template = template_dir.joinpath("ivector_card_template.md").read_text("utf8")
other_acoustic_model_card_template = template_dir.joinpath(
"other_acoustic_model_card_template.md"
).read_text("utf8")
g2p_model_card_template = template_dir.joinpath("g2p_model_card_template.md").read_text("utf8")
language_model_card_template = template_dir.joinpath("language_model_card_template.md").read_text(
"utf8"
)
mfa_dictionary_card_template = template_dir.joinpath("mfa_dictionary_card_template.md").read_text(
"utf8"
)
other_dictionary_card_template = template_dir.joinpath(
"other_dictionary_card_template.md"
).read_text("utf8")
corpus_card_template = template_dir.joinpath("corpus_card_template.md").read_text("utf8")
tokenizer_model_card_template = template_dir.joinpath(
"tokenizer_model_card_template.md"
).read_text("utf8")
corpus_docs_md_template = template_dir.joinpath("corpus_docs_md_template.md").read_text("utf8")
acoustic_docs_md_template = template_dir.joinpath("acoustic_docs_md_template.md").read_text("utf8")
ivector_docs_md_template = template_dir.joinpath("ivector_docs_md_template.md").read_text("utf8")
g2p_docs_md_template = template_dir.joinpath("g2p_docs_md_template.md").read_text("utf8")
lm_docs_md_template = template_dir.joinpath("lm_docs_md_template.md").read_text("utf8")
tokenizer_docs_md_template = template_dir.joinpath("tokenizer_docs_md_template.md").read_text(
"utf8"
)
mfa_dictionary_docs_md_template = template_dir.joinpath(
"mfa_dictionary_docs_md_template.md"
).read_text("utf8")
other_dictionary_docs_md_template = template_dir.joinpath(
"other_dictionary_docs_md_template.md"
).read_text("utf8")
language_links = {
"Abkhaz": ("Abkhaz", "https://en.wikipedia.org/wiki/Abkhaz_language"),
"Arabic": ("Arabic", "https://en.wikipedia.org/wiki/Arabic"),
"Armenian": ("Armenian", "https://en.wikipedia.org/wiki/Armenian_language"),
"Bashkir": ("Bashkir", "https://en.wikipedia.org/wiki/Bashkir_language"),
"Basque": ("Basque", "https://en.wikipedia.org/wiki/Basque_language"),
"Belarusian": ("Belarusian", "https://en.wikipedia.org/wiki/Belarusian_language"),
"Bulgarian": ("Bulgarian", "https://en.wikipedia.org/wiki/Bulgarian_language"),
"Chuvash": ("Chuvash", "https://en.wikipedia.org/wiki/Chuvash_language"),
"Croatian": ("Serbo-Croatian", "https://en.wikipedia.org/wiki/Serbo-Croatian"),
"Serbocroatian": ("Serbo-Croatian", "https://en.wikipedia.org/wiki/Serbo-Croatian"),
"Czech": ("Czech", "https://en.wikipedia.org/wiki/Czech_language"),
"Dutch": ("Dutch", "https://en.wikipedia.org/wiki/Dutch_language"),
"English": ("English", "https://en.wikipedia.org/wiki/English_language"),
("English", "US"): (
"General American English",
"https://en.wikipedia.org/wiki/General_American_English",
),
("English", "UK"): ("British English", "https://en.wikipedia.org/wiki/British_English"),
("English", "Nigeria"): ("Nigerian English", "https://en.wikipedia.org/wiki/Nigerian_English"),
("English", "India"): ("Indian English", "Japanese tokenizer v2_1_0.md"),
"French": ("French", "https://en.wikipedia.org/wiki/French_language"),
"Georgian": ("Georgian", "https://en.wikipedia.org/wiki/Georgian_language"),
"German": ("German", "https://en.wikipedia.org/wiki/German_language"),
"Greek": ("Greek", "https://en.wikipedia.org/wiki/Greek_language"),
"Guarani": ("Guarani", "https://en.wikipedia.org/wiki/Guarani_language"),
"Hungarian": ("Hungarian", "https://en.wikipedia.org/wiki/Hungarian_language"),
"Italian": ("Italian", "https://en.wikipedia.org/wiki/Italian_language"),
"Indonesian": ("Indonesian", "https://en.wikipedia.org/wiki/Indonesian_language"),
"Hausa": ("Hausa", "https://en.wikipedia.org/wiki/Hausa_language"),
"Kazakh": ("Kazakh", "https://en.wikipedia.org/wiki/Kazakh_language"),
"Kyrgyz": ("Kyrgyz", "https://en.wikipedia.org/wiki/Kyrgyz_language"),
"Kurmanji": ("Kurmanji", "https://en.wikipedia.org/wiki/Kurmanji"),
"Maltese": ("Maltese", "https://en.wikipedia.org/wiki/Maltese_language"),
"Uzbek": ("Uzbek", "https://en.wikipedia.org/wiki/Uzbek_language"),
"Uyghur": ("Uyghur", "https://en.wikipedia.org/wiki/Uyghur_language"),
"Punjabi": ("Punjabi", "https://en.wikipedia.org/wiki/Punjabi_language"),
"Hindi": ("Hindi", "https://en.wikipedia.org/wiki/Hindi_language"),
"Hindi-Urdu": ("Hindi-Urdu", "https://en.wikipedia.org/wiki/Hindustani_language"),
"Japanese": ("Japanese", "https://en.wikipedia.org/wiki/Japanese_language"),
"Korean": ("Korean", "https://en.wikipedia.org/wiki/Korean_language"),
"Polish": ("Polish", "https://en.wikipedia.org/wiki/Polish_language"),
"Portuguese": ("Portuguese", "https://en.wikipedia.org/wiki/Portuguese_language"),
("Portuguese", "Brazil"): (
"Brazilian Portuguese",
"https://en.wikipedia.org/wiki/Brazilian_Portuguese",
),
("Portuguese", "Portugal"): (
"European Portuguese",
"https://en.wikipedia.org/wiki/European_Portuguese",
),
"Romanian": ("Romanian", "https://en.wikipedia.org/wiki/Romanian_language"),
"Russian": ("Russian", "https://en.wikipedia.org/wiki/Russian_language"),
"Spanish": ("Spanish", "https://en.wikipedia.org/wiki/Spanish_language"),
("Spanish", "Latin America"): (
"Spanish in the Americas",
"https://en.wikipedia.org/wiki/Spanish_language_in_the_Americas",
),
("Spanish", "Spain"): (
"Peninsular Spanish",
"https://en.wikipedia.org/wiki/Peninsular_Spanish",
),
"Swahili": ("Swahili", "https://en.wikipedia.org/wiki/Swahili_language"),
"Swedish": ("Swedish", "https://en.wikipedia.org/wiki/Swedish_language"),
"Tamil": ("Tamil", "https://en.wikipedia.org/wiki/Tamil_language"),
"Tatar": ("Tatar", "https://en.wikipedia.org/wiki/Tatar_language"),
"Thai": ("Thai", "https://en.wikipedia.org/wiki/Thai_language"),
"Turkish": ("Turkish", "https://en.wikipedia.org/wiki/Turkish_language"),
"Ukrainian": ("Ukrainian", "https://en.wikipedia.org/wiki/Ukrainian_language"),
"Vietnamese": ("Vietnamese", "https://en.wikipedia.org/wiki/Vietnamese_language"),
("Vietnamese", "Ho Chi Minh City"): (
"Southern Vietnamese",
"https://en.wikipedia.org/wiki/Vietnamese_language#Language_variation",
),
("Vietnamese", "Hanoi"): (
"Northern Vietnamese",
"https://en.wikipedia.org/wiki/Vietnamese_language#Language_variation",
),
"Sorbian": ("Sorbian", "https://en.wikipedia.org/wiki/Sorbian_languages"),
("Sorbian", "Upper"): (
"Upper Sorbian",
"https://en.wikipedia.org/wiki/Upper_Sorbian_language",
),
"Mandarin": ("Mandarin Chinese", "https://en.wikipedia.org/wiki/Mandarin_Chinese"),
("Mandarin", "Taiwan"): (
"Taiwanese Mandarin",
"https://en.wikipedia.org/wiki/Taiwanese_Mandarin",
),
("Mandarin", "Erhua"): ("Beijing Mandarin", "https://en.wikipedia.org/wiki/Beijing_dialect"),
("Mandarin", "China"): (
"Standard Mandarin Chinese",
"https://en.wikipedia.org/wiki/Standard_Chinese",
),
"Urdu": ("Urdu", "https://en.wikipedia.org/wiki/Urdu"),
}
cv_phone_set_mapping = {
"abkhaz": "XPF",
"armenian": "XPF",
"bashkir": "XPF",
"basque": "XPF",
"belarusian": "XPF",
"bulgarian": "XPF",
"chuvash": "XPF",
"czech": "XPF",
"dutch": "Epitran",
"georgian": "XPF",
"greek": "XPF",
"guarani": "XPF",
"hausa": "Epitran",
"hindi": "Epitran",
"hungarian": "XPF",
"indonesian": "Epitran",
"italian": "Epitran",
"kazakh": "Epitran",
"kurmanji": "Epitran",
"kyrgyz": "Epitran",
"maltese": "Epitran",
"polish": "Epitran",
"punjabi": "Epitran",
"portuguese": "Epitran",
"romanian": "XPF",
"russian": "Epitran",
"sorbian_upper": "XPF",
"sorbian": "XPF",
"swedish": "XPF",
"tamil": "XPF",
"tatar": "Epitran",
"thai": "XPF",
"turkish": "XPF",
"ukrainian": "XPF",
"uyghur": "Epitran",
"uzbek": "Epitran",
"urdu": "Epitran",
"vietnamese": "XPF",
}
phone_set_templates = {
"Epitran": "[Epitran](https://github.com/dmort27/epitran)",
"XPF": "[XPF](https://github.com/CohenPr-XPF/XPF)",
"ARPA": "[ARPA](https://en.wikipedia.org/wiki/ARPABET)",
"PINYIN": "[PINYIN](https://en.wikipedia.org/wiki/Pinyin)",
"PROSODYLAB": "[PROSODYLAB](https://github.com/prosodylab/prosodylab.dictionaries)",
"MFA": "[MFA](https://mfa-models.readthedocs.io/en/refactor/mfa_phone_set.html#{language})",
}
model_id_templates = {
"acoustic": "{language}{dialect_title_string} {phone_set} acoustic model{version_string}",
"dictionary": "{language}{dialect_title_string} {phone_set} dictionary{version_string}",
"g2p": "{language}{dialect_title_string} {phone_set} G2P model{version_string}",
"language_model": "{language}{dialect_title_string} language model{version_string}",
"corpus": "{corpus_name}{version_string}",
"ivector": "{language} {phone_set} ivector extractor{version_string}",
"tokenizer": "{language} tokenizer{version_string}",
}
pronunciation_dictionaries = {}
def load_dict(dictionary_path, dict_name, phone_set_type) -> MultispeakerDictionary:
if dict_name not in pronunciation_dictionaries:
pronunciation_dictionaries[dict_name] = MultispeakerDictionary(
dictionary_path, phone_set_type=phone_set_type, position_dependent_phones=False
)
if os.path.exists(pronunciation_dictionaries[dict_name].output_directory):
shutil.rmtree(pronunciation_dictionaries[dict_name].output_directory)
pronunciation_dictionaries[dict_name].dictionary_setup()
return pronunciation_dictionaries[dict_name]
def generate_id(meta_data, model_type):
if "dialect" in meta_data and meta_data["dialect"]:
dialect_title_string = f' ({meta_data["dialect"]})'
else:
dialect_title_string = ""
if "version" in meta_data and meta_data["version"]:
version_string = f' v{meta_data["version"]}'
else:
version_string = ""
template = model_id_templates[model_type]
if model_type == "corpus":
fields = {"corpus_name": meta_data["name"], "version_string": version_string}
else:
fields = {
"language": meta_data["language"].title(),
"dialect_title_string": dialect_title_string,
"version_string": version_string,
}
if model_type not in {"language_model"}:
fields["phone_set"] = meta_data["phone_set"]
if model_type == "ivector":
fields["phone_set"] = "MFA"
return template.format(**fields).replace(".", "_")
def generate_meta_data(model, model_type, language, dialect, version, phone_set):
    """Assemble the metadata dict for one staged model, dispatched on ``model_type``.

    Supported types: ``acoustic``, ``dictionary``, ``g2p``, ``language_model``,
    ``tokenizer`` and ``ivector``; any other type returns an empty dict.
    The ``dictionary`` branch additionally exports the processed lexicon next
    to the dictionary's model card as a side effect.
    """
    # Fields shared by every citation template below.
    citation_details = {
        "model_name": model.name,
        "version": version,
        "extra_authors": "",
        "model_type": model_type,
        "language": language.title(),
        "phone_set": phone_set.upper(),
    }
    citation_template = mfa_citation_template
    if language in {"Arabic"}:
        citation_details["extra_authors"] = "Shmueli, Natalia and "
    maintainer = mfa_maintainer
    if dialect:
        # Dialect-specific models are staged under a "<dialect>_<phone set>" folder.
        phone_set_folder = f"{dialect}_{phone_set}".replace(" ", "_").lower()
        citation_details["dialect"] = dialect
    else:
        phone_set_folder = phone_set.lower()
    # Default licensing; Common Voice-derived models override these below.
    # NOTE(review): ``license`` shadows the builtin of the same name.
    license = "CC BY 4.0"
    license_link = f"[CC BY 4.0](https://github.com/MontrealCorpusTools/mfa-models/tree/main/{model_type}/{language.lower()}/{phone_set_folder}/v{version}/LICENSE)"
    if model_type == "acoustic":
        if model.source.name.endswith("_cv.zip"):
            # Common Voice-trained acoustic model: CV attribution and CC-0 license.
            citation = cv_citation
            maintainer = cv_maintainer
            license = "CC-0"
            license_link = "[CC-0](https://creativecommons.org/publicdomain/zero/1.0/)"
            # NOTE(review): this is a str, but ``train_date.year`` below requires a
            # date object — this branch looks like it would raise AttributeError; confirm.
            train_date = "02-11-2022"
        else:
            train_date = datetime.fromisoformat(model.meta["train_date"]).date()
        citation_details["year"] = train_date.year
        citation_details["month"] = train_date.strftime("%b")
        citation_details["title"] = generate_id(citation_details, model_type).replace("_", ".")
        citation_details["link_safe_title"] = generate_id(citation_details, model_type)
        citation_details["id"] = f'mfa_{model.name}_acoustic_{citation_details["year"]}'
        # NOTE(review): unconditionally overwrites the CV citation assigned above — confirm intended.
        citation = mfa_citation_template.format(**citation_details)
        features = "MFCC"
        if model.meta["features"].get("use_pitch", False):
            features += " + pitch"
        return {
            "name": model.name,
            "language": language.title(),
            "dialect": dialect,
            "phone_set": phone_set,
            "version": version,
            "maintainer": maintainer,
            "citation": citation,
            "license": license,
            "license_link": license_link,
            "architecture": model.meta["architecture"],
            "features": features,
            "evaluation": {},
            "decode": {},
            "train_date": str(train_date),
        }
    if model_type == "dictionary":
        # Dictionaries carry no stored training date; stamp with today's date.
        train_date = datetime.today().date()
        citation_details["model_type"] = "pronunciation dictionary"
        citation_details["year"] = train_date.year
        citation_details["month"] = train_date.strftime("%b")
        citation_details["link_safe_title"] = generate_id(citation_details, model_type)
        citation_details["id"] = f"mfa_{model.name}_dictionary_{train_date.year}"
        citation_details["title"] = generate_id(citation_details, model_type).replace("_", ".")
        citation = citation_template.format(**citation_details)
        phone_set = phone_set.upper()
        # The filename suffix identifies the dictionary's provenance / phone set.
        if model.path.name.endswith("_cv.dict"):
            # NOTE(review): updates the license *link* to CC-0 but leaves
            # ``license`` as "CC BY 4.0" — confirm the mismatch is intended.
            citation = cv_citation
            maintainer = cv_maintainer
            license_link = "[CC-0](https://creativecommons.org/publicdomain/zero/1.0/)"
            dictionary_phone_set = "IPA"
        elif model.path.name.endswith("_mfa.dict"):
            dictionary_phone_set = "IPA"
        else:
            if model.path.name.endswith("_prosodylab.dict") or model.path.name.endswith(
                "us_arpa.dict"
            ):
                citation = prosodylab_citation
            try:
                dictionary_phone_set = montreal_forced_aligner.data.PhoneSetType[phone_set].name
            except KeyError:
                # Not a named MFA phone set; fall back to a sentinel.
                dictionary_phone_set = "UNKNOWN"
        dictionary = load_dict(model.path, model.name, phone_set_type=dictionary_phone_set)
        word_count = len(dictionary.actual_words)
        data = {
            "name": model.name,
            "language": language.title(),
            "dialect": dialect,
            "maintainer": maintainer,
            "license_link": license_link,
            "license": license,
            "phone_set": phone_set,
            "phones": sorted(dictionary.non_silence_phones),
            "word_count": word_count,
            "train_date": str(train_date),
            "version": version,
            "citation": citation,
        }
        # Side effect: export the processed lexicon next to the model card.
        output_path = os.path.join(
            os.path.dirname(get_model_card_directory("dictionary", data)),
            dictionary.name + ".dict",
        )
        dictionary.export_lexicon(1, output_path)
        return data
    if model_type == "g2p":
        train_date = datetime.fromisoformat(model.meta["train_date"]).date()
        citation_details["model_type"] = "G2P model"
        citation_details["year"] = train_date.year
        citation_details["month"] = train_date.strftime("%b")
        citation_details["link_safe_title"] = generate_id(citation_details, model_type)
        citation_details["title"] = generate_id(citation_details, model_type).replace("_", ".")
        citation_details["id"] = f"mfa_{model.name}_g2p_{train_date.year}"
        return {
            "name": model.name,
            "language": language.title(),
            "dialect": dialect,
            "maintainer": maintainer,
            "license_link": license_link,
            "license": license,
            "architecture": model.meta["architecture"],
            "training": model.meta["training"],
            # Missing evaluation values default to 100.
            "evaluation": {
                k: v if v is not None else 100 for k, v in model.meta["evaluation"].items()
            },
            "phone_set": phone_set,
            "phones": sorted(model.meta["phones"]),
            "version": version,
            "train_date": str(train_date),
            "citation": citation_template.format(**citation_details),
        }
    if model_type == "language_model":
        train_date = datetime.fromisoformat(model.meta["train_date"]).date()
        citation_details["model_type"] = "language model"
        citation_details["year"] = train_date.year
        citation_details["title"] = generate_id(citation_details, model_type).replace("_", ".")
        citation_details["link_safe_title"] = generate_id(citation_details, model_type)
        citation_details["month"] = train_date.strftime("%b")
        citation_details["id"] = f"mfa_{model.name}_lm_{train_date.year}"
        return {
            "name": model.name,
            "language": language.title(),
            "dialect": dialect,
            "phone_set": "MFA",
            "maintainer": maintainer,
            "license_link": license_link,
            "license": license,
            "architecture": model.meta["architecture"],
            "version": version,
            "train_date": str(train_date),
            "training": {
                "num_words": model.meta["training"]["num_words"],
                "num_oovs": model.meta["training"]["num_oovs"],
            },
            # Perplexities measured against the model's own training data.
            "evaluation": {
                "large_perplexity": model.meta["evaluation_training"]["large_perplexity"],
                "medium_perplexity": model.meta["evaluation_training"]["medium_perplexity"],
                "small_perplexity": model.meta["evaluation_training"]["small_perplexity"],
            },
            "citation": citation_template.format(**citation_details),
        }
    if model_type == "tokenizer":
        train_date = datetime.fromisoformat(model.meta["train_date"]).date()
        citation_details["model_type"] = "tokenizer"
        citation_details["year"] = train_date.year
        citation_details["title"] = generate_id(citation_details, model_type).replace("_", ".")
        citation_details["link_safe_title"] = generate_id(citation_details, model_type)
        citation_details["month"] = train_date.strftime("%b")
        citation_details["id"] = f"mfa_{model.name}_tokenizer_{train_date.year}"
        return {
            "name": model.name,
            "language": language.title(),
            "dialect": dialect,
            "phone_set": "MFA",
            "maintainer": maintainer,
            "license_link": license_link,
            "license": license,
            "architecture": model.meta["architecture"],
            "version": version,
            "train_date": str(train_date),
            "training": {
                "num_utterances": model.meta["training"]["num_utterances"],
                "num_graphemes": model.meta["training"]["num_graphemes"],
            },
            # Missing evaluation values default to 100.
            "evaluation": {
                k: v if v is not None else 100 for k, v in model.meta["evaluation"].items()
            },
            "citation": citation_template.format(**citation_details),
        }
    if model_type == "ivector":
        print(model.meta)
        # Older ivector extractors may predate the train_date field.
        if "train_date" in model.meta:
            train_date = datetime.fromisoformat(model.meta["train_date"]).date()
        else:
            train_date = datetime.now().date()
        citation_details["model_type"] = "ivector"
        citation_details["year"] = train_date.year
        citation_details["title"] = generate_id(citation_details, model_type).replace("_", ".")
        citation_details["link_safe_title"] = generate_id(citation_details, model_type)
        citation_details["month"] = train_date.strftime("%b")
        citation_details["id"] = f"mfa_{model.name}_ivector_{train_date.year}"
        return {
            "name": model.name,
            "language": language.title(),
            "dialect": dialect,
            "phone_set": "MFA",
            "maintainer": maintainer,
            "license_link": license_link,
            "license": license,
            "version": version,
            "train_date": str(train_date),
            "citation": citation_template.format(**citation_details),
        }
    # Unknown model type: nothing to report.
    return {}
def extract_model_card_fields(meta_data, model_type):
    """Flatten a model's metadata dict into the fields its model-card (README)
    template needs, dispatched on ``model_type``.

    Handles ``acoustic``, ``ivector``, ``corpus``, ``dictionary``, ``g2p``,
    ``tokenizer`` and ``language_model``; returns None for any other type.
    Link fields are rendered as Markdown, relative to each card's location
    in the repository.
    """
    dialect_link = "N/A"
    if "dialect" in meta_data and meta_data["dialect"]:
        key = (meta_data["language"], meta_data["dialect"])
        if key in language_links:
            dialect_link = language_link_template.format(*language_links[key])
    if meta_data["language"] != "Multilingual":
        language_link = language_link_template.format(*language_links[meta_data["language"]])
    else:
        # No single language page to link for multilingual models.
        language_link = meta_data["language"]
    if "dialects" in meta_data and meta_data["dialects"]:
        # Corpora can span several dialects; link each one we know about.
        dialect_links = []
        for d in meta_data["dialects"]:
            key = (meta_data["language"], d)
            if key in language_links:
                dialect_links.append(language_link_template.format(*language_links[key]))
        dialect_link = ", ".join(dialect_links)
    if "phone_set" in meta_data:
        phone_set = meta_data["phone_set"]
        if phone_set == "CV":
            # Bug fix: previously read the module-level ``language`` variable;
            # use this model's own language (matches extract_doc_card_fields).
            phone_set = cv_phone_set_mapping[meta_data["language"].lower()]
        phone_set_link = phone_set_templates[phone_set]
        if phone_set == "MFA":
            phone_set_link = phone_set_link.format(language=meta_data["language"].lower())
    name = generate_id(meta_data, model_type)
    # GitHub discussion search titles cannot contain parentheses or underscores.
    discussion_title = name.replace(" ", "+").replace(")", "").replace("(", "").replace("_", ".")
    if model_type == "acoustic":
        # Render one bullet of details per training corpus, with a relative
        # link to the corpus README (four levels up from the acoustic card).
        corpora_details = ""
        if "corpus" in meta_data:
            for corpus in meta_data["corpus"]:
                if "version" in corpus and corpus["version"]:
                    corpus_link_template = "[{name}](../../../../corpus/{language}/{corpus_safe_name}/{version}/README.md)"
                    link = corpus_link_template.format(
                        name=corpus["name"],
                        language=make_path_safe(corpus["language"]),
                        corpus_safe_name=make_path_safe(corpus["name"]),
                        version=corpus["version"],
                    )
                else:
                    corpus_link_template = (
                        "[{name}](../../../../corpus/{language}/{corpus_safe_name}/README.md)"
                    )
                    link = corpus_link_template.format(
                        name=corpus["name"],
                        language=make_path_safe(corpus["language"]),
                        corpus_safe_name=make_path_safe(corpus["name"]),
                    )
                data = {
                    "name": corpus["name"],
                    "link": link,
                    "num_hours": corpus["num_hours"],
                    "num_speakers": corpus["num_speakers"],
                    "num_utterances": corpus["num_utterances"],
                }
                corpora_details += "\n" + corpus_detail_template.format(**data)
        return {
            "model_name": meta_data["name"],
            "title": name.replace("_", "."),
            "discussion_title": discussion_title,
            "language": meta_data["language"],
            "language_link": language_link,
            "dialect": meta_data["dialect"],
            "dialect_link": dialect_link,
            "version": meta_data["version"],
            "maintainer": meta_data["maintainer"],
            "features": meta_data["features"],
            "architecture": meta_data["architecture"],
            "mfa_version": CURRENT_MODEL_VERSION,
            "date": meta_data["train_date"],
            "citation": meta_data["citation"],
            "license_link": meta_data["license_link"],
            "corpora_details": corpora_details,
            "phone_set_link": phone_set_link,
        }
    if model_type == "ivector":
        # Same corpus rendering as acoustic, but ivector cards sit one
        # directory level higher (three levels up to the corpus tree).
        corpora_details = ""
        if "corpus" in meta_data:
            for corpus in meta_data["corpus"]:
                if "version" in corpus and corpus["version"]:
                    corpus_link_template = "[{name}](../../../corpus/{language}/{corpus_safe_name}/{version}/README.md)"
                    link = corpus_link_template.format(
                        name=corpus["name"],
                        language=make_path_safe(corpus["language"]),
                        corpus_safe_name=make_path_safe(corpus["name"]),
                        version=corpus["version"],
                    )
                else:
                    corpus_link_template = (
                        "[{name}](../../../corpus/{language}/{corpus_safe_name}/README.md)"
                    )
                    link = corpus_link_template.format(
                        name=corpus["name"],
                        language=make_path_safe(corpus["language"]),
                        corpus_safe_name=make_path_safe(corpus["name"]),
                    )
                data = {
                    "name": corpus["name"],
                    "link": link,
                    "num_hours": corpus["num_hours"],
                    "num_speakers": corpus["num_speakers"],
                    "num_utterances": corpus["num_utterances"],
                }
                corpora_details += "\n" + corpus_detail_template.format(**data)
        return {
            "model_name": meta_data["name"],
            "title": name.replace("_", "."),
            "discussion_title": discussion_title,
            "language": meta_data["language"],
            "language_link": language_link,
            "version": meta_data["version"],
            "maintainer": meta_data["maintainer"],
            "features": meta_data.get("features", "MFCC"),
            "architecture": meta_data.get("architecture", "ivector"),
            "mfa_version": CURRENT_MODEL_VERSION,
            "date": meta_data["train_date"],
            "citation": meta_data["citation"],
            "license_link": meta_data["license_link"],
            "corpora_details": corpora_details,
        }
    if model_type == "corpus":
        citation = meta_data.get("citation", "")
        version = meta_data.get("version", "")
        dialects = meta_data.get("dialects", [])
        if dialects:
            dialects = ", ".join(dialects)
        else:
            dialects = "N/A"
        if version:
            version = f"- **Version:** `{version}`"
        return {
            "corpus_name": meta_data["name"],
            "title": meta_data["id"].replace("_", "."),
            "corpus_id": meta_data["id"],
            "language": meta_data["language"],
            "language_link": language_link,
            "discussion_title": discussion_title,
            "corpus_link": f"[{meta_data['name']}]({meta_data['link']})",
            "dialects": dialects,
            "dialect_link": dialect_link,
            "num_hours": meta_data["num_hours"],
            "num_utterances": meta_data["num_utterances"],
            "num_speakers": meta_data["num_speakers"],
            "num_female": meta_data.get("num_female", 0),
            "num_male": meta_data.get("num_male", 0),
            # NOTE(review): defaults unclassified speakers to the full speaker
            # count even when num_female/num_male are present — confirm.
            "num_other": meta_data.get("num_other", meta_data["num_speakers"]),
            "license_link": meta_data["license_link"],
            "version": version,
            "citation": citation,
        }
    if model_type == "dictionary":
        data = {
            "model_name": meta_data["name"],
            "title": name.replace("_", "."),
            "language": meta_data["language"],
            "language_link": language_link,
            "dialect": meta_data["dialect"],
            "dialect_link": dialect_link,
            "discussion_title": discussion_title,
            "version": meta_data["version"],
            "maintainer": meta_data["maintainer"],
            "license_link": meta_data["license_link"],
            "mfa_version": CURRENT_MODEL_VERSION,
            "date": meta_data["train_date"],
            "citation": meta_data["citation"],
            "phone_set": meta_data["phone_set"],
            "phones": " ".join(sorted(meta_data["phones"])),
            "word_count": meta_data["word_count"],
            "phone_set_link": phone_set_link,
        }
        if meta_data["phone_set"] in {"MFA", "ARPA"}:
            # Bug fix: this f-string previously interpolated undefined module
            # globals (``language`` / ``model_name``); use the model's metadata.
            # NOTE(review): the "mfa" path segment is hard-coded even for ARPA
            # phone sets — confirm that is the intended repository layout.
            data["plain_link"] = (
                "https://raw.githubusercontent.com/MontrealCorpusTools/mfa-models/main/"
                f"dictionary/{meta_data['language'].lower()}/mfa/{meta_data['name']}.dict"
            )
        return data
    if model_type == "g2p":
        training_details = g2p_training_detail_template.format(**meta_data["training"])
        evaluation_details = g2p_evaluation_detail_template.format(**meta_data["evaluation"])
        return {
            "model_name": meta_data["name"],
            "title": name.replace("_", "."),
            "language": meta_data["language"],
            "language_link": language_link,
            "dialect": meta_data["dialect"],
            "dialect_link": dialect_link,
            "discussion_title": discussion_title,
            "architecture": meta_data["architecture"],
            "maintainer": meta_data["maintainer"],
            "version": meta_data["version"],
            "license_link": meta_data["license_link"],
            "mfa_version": CURRENT_MODEL_VERSION,
            "date": meta_data["train_date"],
            "citation": meta_data["citation"],
            "phone_set": meta_data["phone_set"],
            "phones": ", ".join(f"{{ipa_inline}}`{x}`" for x in meta_data["phones"]),
            "training_details": training_details,
            "evaluation_details": evaluation_details,
            "phone_set_link": phone_set_link,
        }
    if model_type == "tokenizer":
        training_details = tokenizer_training_detail_template.format(**meta_data["training"])
        evaluation_details = tokenizer_evaluation_detail_template.format(**meta_data["evaluation"])
        return {
            "model_name": meta_data["name"],
            "title": name.replace("_", "."),
            "language": meta_data["language"],
            "language_link": language_link,
            "discussion_title": discussion_title,
            "architecture": meta_data["architecture"],
            "maintainer": meta_data["maintainer"],
            "version": meta_data["version"],
            "license_link": meta_data["license_link"],
            "mfa_version": CURRENT_MODEL_VERSION,
            "date": meta_data["train_date"],
            "citation": meta_data["citation"],
            "training_details": training_details,
            "evaluation_details": evaluation_details,
        }
    if model_type == "language_model":
        training_details = lm_training_detail_template.format(**meta_data["training"])
        evaluation_details = lm_evaluation_detail_template.format(**meta_data["evaluation"])
        return {
            "model_name": meta_data["name"],
            "title": name.replace("_", "."),
            "language": meta_data["language"],
            "language_link": language_link,
            "dialect": meta_data["dialect"],
            "dialect_link": dialect_link,
            "discussion_title": discussion_title,
            "architecture": meta_data["architecture"],
            "maintainer": meta_data["maintainer"],
            "version": meta_data["version"],
            "license_link": meta_data["license_link"],
            "mfa_version": CURRENT_MODEL_VERSION,
            "date": meta_data["train_date"],
            "citation": meta_data["citation"],
            "training_details": training_details,
            "evaluation_details": evaluation_details,
        }
def extract_doc_card_fields(meta_data, model_type):
    """Flatten metadata into the fields used by the docs-site (MyST) card templates.

    Mirrors ``extract_model_card_fields`` but produces docs-specific extras:
    cross-reference anchors (``ref``), tag strings, the page layout type and
    "see also" link sections.  Dispatches on ``model_type``.
    """
    tags = [meta_data["language"]]
    if model_type not in {"language_model", "corpus"}:
        tags.append(meta_data["phone_set"].upper())
    see_also = ""
    links = []
    # Build one "see also" section per related-model list in the metadata.
    for k in ["corpus", "dictionary", "g2p", "acoustic", "language_model", "tokenizer"]:
        if k == "corpus" and model_type in {"acoustic", "ivector"}:
            # Training corpora are rendered in the card body below instead.
            continue
        if k in meta_data:
            if k == "corpus":
                # Corpus entries are dicts; link by their "id" field.
                links.append(
                    see_also_template.format(
                        links="\n".join(
                            link_template.format(x["id"].lower().replace(" ", "_"))
                            for x in meta_data[k]
                        ),
                        model_type_name=model_type_names[k],
                    )
                )
            else:
                # Other related-model lists are plain name strings.
                print(meta_data[k])
                links.append(
                    see_also_template.format(
                        links="\n".join(
                            link_template.format(x.lower().replace(" ", "_")) for x in meta_data[k]
                        ),
                        model_type_name=model_type_names[k],
                    )
                )
    if links:
        see_also = "\n\n".join(links)
    try:
        license_link = f"[{meta_data['license']}]({license_links[meta_data['license']]})"
    except KeyError:
        # Unknown license: fall back to the bare license name.
        license_link = meta_data["license"]
    # "mfa" layout pages get the full MFA phone-set treatment.
    layout_type = "not_mfa"
    if "phone_set" in meta_data:
        phone_set = meta_data["phone_set"]
        if phone_set == "CV":
            phone_set = cv_phone_set_mapping[meta_data["language"].lower()]
        elif phone_set in {"MFA", "ARPA", "PROSODYLAB"}:
            layout_type = "mfa"
        try:
            phone_set_link = phone_set_templates[phone_set]
            if phone_set == "MFA":
                phone_set_link = phone_set_link.format(language=meta_data["language"].lower())
        except KeyError:
            phone_set_link = phone_set
    # NOTE(review): both branches below assume "phone_set" is present in
    # meta_data — confirm all metadata dicts reaching here carry it.
    if "dialect" in meta_data and meta_data["dialect"]:
        language_sub_folder = f"{meta_data['dialect']}_{meta_data['phone_set']}".replace(
            " ", "_"
        ).lower()
        dialect_title_string = f" ({meta_data['dialect']})"
    else:
        language_sub_folder = meta_data["phone_set"].lower()
        dialect_title_string = ""
    name = generate_id(meta_data, model_type)
    if model_type == "acoustic":
        # Corpus details link by docs cross-reference rather than relative path.
        corpora_details = ""
        corpus_link_template = "{{ref}}`{corpus_id}`"
        dialects = []
        if "corpus" in meta_data:
            for corpus in meta_data["corpus"]:
                if "dialects" in corpus:
                    dialects.extend(corpus["dialects"])
                data = {
                    "name": corpus["name"],
                    "link": corpus_link_template.format(corpus_id=corpus["id"].replace(" ", "_")),
                    "num_hours": corpus["num_hours"],
                    "num_speakers": corpus["num_speakers"],
                    "num_utterances": corpus["num_utterances"],
                }
                corpora_details += "\n" + corpus_detail_template.format(**data)
        # Fall back to the model's own dialect when corpora list none.
        if not dialects and "dialect" in meta_data and meta_data["dialect"]:
            dialects = [meta_data["dialect"]]
        if meta_data["phone_set"] in {"CV", "MFA"}:
            tags.append("IPA")
        return {
            "model_name": name,
            "ref": name.replace(" ", "_"),
            "title": name.replace("_", "."),
            "model_type": model_type,
            "architecture": meta_data["architecture"],
            "version": meta_data["version"],
            "corpora_details": corpora_details,
            "see_also": see_also,
            "tags": ";".join(tags),
            "dialects": ";".join(sorted(set(dialects))) if dialects else "N/A",
            "language": meta_data["language"].lower(),
            "language_name": meta_data["language"].title(),
            "license": meta_data["license"],
            "phone_set": phone_set,
            "layout_type": layout_type,
            "license_link": license_link,
            "phone_set_link": phone_set_link,
            "dialect_title_string": dialect_title_string,
            "language_sub_folder": language_sub_folder,
            "phone_set_name": meta_data["phone_set"].upper(),
        }
    if model_type == "ivector":
        corpora_details = ""
        corpus_link_template = "{{ref}}`{corpus_id}`"
        if "corpus" in meta_data:
            for corpus in meta_data["corpus"]:
                data = {
                    "name": corpus["name"],
                    "link": corpus_link_template.format(corpus_id=corpus["id"].replace(" ", "_")),
                    "num_hours": corpus["num_hours"],
                    "num_speakers": corpus["num_speakers"],
                    "num_utterances": corpus["num_utterances"],
                }
                corpora_details += "\n" + corpus_detail_template.format(**data)
        return {
            "model_name": name,
            "ref": name.replace(" ", "_"),
            "title": name.replace("_", "."),
            "model_type": model_type,
            "architecture": meta_data.get("architecture", "ivector"),
            "version": meta_data["version"],
            "corpora_details": corpora_details,
            "see_also": see_also,
            "tags": ";".join(tags),
            "language": meta_data["language"].lower(),
            "language_name": meta_data["language"].title(),
            "license": meta_data["license"],
            "layout_type": layout_type,
            "license_link": license_link,
        }
    if model_type == "corpus":
        if "tags" in meta_data:
            tags.extend(meta_data["tags"])
        dialects = []
        if "dialects" in meta_data:
            dialects = meta_data["dialects"]
        # Versioned corpora live one directory deeper in the repository.
        version_subdirectory = meta_data.get("version", "")
        if version_subdirectory:
            version_subdirectory = f"/{version_subdirectory}"
        return {
            "corpus_id": name,
            "ref": name.replace(" ", "_"),
            "title": name.replace("_", "."),
            "corpus_name": meta_data["name"],
            "corpus_name_safe": make_path_safe(meta_data["name"]),
            "license": meta_data["license"],
            "see_also": see_also,
            "layout_type": layout_type,
            "tags": ";".join(tags),
            "version_subdirectory": version_subdirectory,
            "language": meta_data["language"].lower(),
            "dialects": ";".join(sorted(set(dialects))) if dialects else "N/A",
            "language_name": meta_data["language"].title(),
        }
    if model_type == "dictionary":
        # Tag the dictionary by provenance, inferred from its name suffix.
        if meta_data["name"].endswith("_cv") or meta_data["name"].endswith("_mfa"):
            tags.append("IPA")
        elif meta_data["name"].endswith("_prosodylab") or meta_data["name"].endswith("us_arpa"):
            tags.append("PROSODYLAB")
            tags.append("MFA")
        data = {
            "model_name": name,
            "ref": name.replace(" ", "_"),
            "title": name.replace("_", "."),
            "model_type": model_type,
            "version": meta_data["version"],
            "see_also": see_also,
            "tags": ";".join(tags),
            "layout_type": layout_type,
            "language": meta_data["language"].lower(),
            "license": meta_data["license"],
            "language_name": meta_data["language"].title(),
            "dialects": meta_data["dialect"] if meta_data["dialect"] else "N/A",
            "phone_set": phone_set,
            "language_sub_folder": language_sub_folder,
            "dialect_title_string": dialect_title_string,
            "phone_set_name": meta_data["phone_set"].upper(),
        }
        # When pre-rendered phone charts exist for this dictionary, splice
        # them into the card: consonants, then vowel subsections in order.
        if meta_data["name"] in phone_charts:
            charts = phone_charts[meta_data["name"]]
            data["consonant_chart"] = charts["consonant_chart"]
            data["vowel_section"] = charts["oral_vowel_chart"]
            if charts["nasal_vowel_chart"]:
                data["vowel_section"] = "#### Oral Vowels\n\n" + data["vowel_section"]
                data["vowel_section"] += "\n\n#### Nasal Vowels\n\n" + charts["nasal_vowel_chart"]
            if charts["diphthongs"]:
                data["vowel_section"] += "\n\n#### Diphthongs\n\n* " + "\n* ".join(
                    f"{{ipa_inline}}`{x}`" for x in sorted(charts["diphthongs"])
                )
            if charts["triphthongs"]:
                data["vowel_section"] += "\n\n#### Triphthongs\n\n* " + "\n* ".join(
                    f"{{ipa_inline}}`{x}`" for x in sorted(charts["triphthongs"])
                )
            if "tones" in charts and charts["tones"]:
                data["vowel_section"] += "\n\n#### Tones\n\n* " + "\n* ".join(
                    f"{{ipa_inline}}`{x}`" for x in sorted(charts["tones"])
                )
            if "stress" in charts and charts["stress"]:
                data["vowel_section"] += "\n\n#### Stress\n\n* " + "\n* ".join(
                    f"{{ipa_inline}}`{x}`" for x in sorted(charts["stress"])
                )
            if "other" in charts and charts["other"]:
                data["vowel_section"] += "\n\n### Other phones\n\n* " + "\n* ".join(
                    f"{{ipa_inline}}`{x}`" for x in sorted(charts["other"])
                )
        return data
    if model_type == "g2p":
        if meta_data["phone_set"] in {"CV", "MFA"}:
            tags.append("IPA")
        return {
            "model_name": name,
            "ref": name.replace(" ", "_"),
            "title": name.replace("_", "."),
            "model_type": model_type,
            "architecture": meta_data["architecture"],
            "version": meta_data["version"],
            "see_also": see_also,
            "layout_type": layout_type,
            "language_sub_folder": language_sub_folder,
            "dialect_title_string": dialect_title_string,
            "tags": ";".join(tags),
            "license": meta_data["license"],
            "language": meta_data["language"].lower(),
            "dialects": meta_data["dialect"] if meta_data["dialect"] else "N/A",
            "language_name": meta_data["language"].title(),
            "phone_set": phone_set,
            "phone_set_name": meta_data["phone_set"].upper(),
        }
    if model_type == "language_model":
        # Language models are always tagged as MFA-produced.
        tags = ["MFA"]
        return {
            "model_name": name,
            "ref": name.replace(" ", "_"),
            "title": name.replace("_", "."),
            "model_type": model_type,
            "layout_type": layout_type,
            "language": meta_data["language"].lower(),
            "dialects": meta_data["dialect"] if meta_data["dialect"] else "N/A",
            "architecture": meta_data["architecture"],
            "language_name": meta_data["language"].title(),
            "dialect_title_string": dialect_title_string,
            "license": meta_data["license"],
            "version": meta_data["version"],
            "source": "mfa",
            "see_also": see_also,
            "tags": ";".join(tags),
        }
    if model_type == "tokenizer":
        # Tokenizers are always tagged as MFA-produced.
        tags = ["MFA"]
        return {
            "model_name": name,
            "ref": name.replace(" ", "_"),
            "title": name.replace("_", "."),
            "model_type": model_type,
            "layout_type": layout_type,
            "language": meta_data["language"].lower(),
            "architecture": meta_data["architecture"],
            "language_name": meta_data["language"].title(),
            "license": meta_data["license"],
            "version": meta_data["version"],
            "source": "mfa",
            "see_also": see_also,
            "tags": ";".join(tags),
        }
# Model-card (repository README) templates per model type, split by whether
# the model follows MFA conventions ("mfa") or comes from elsewhere ("other").
model_card_templates = {
    "acoustic": {
        "mfa": mfa_acoustic_model_card_template,
        "other": other_acoustic_model_card_template,
    },
    "dictionary": {"mfa": mfa_dictionary_card_template, "other": other_dictionary_card_template},
    "g2p": {"mfa": g2p_model_card_template, "other": g2p_model_card_template},
    "language_model": {"mfa": language_model_card_template, "other": language_model_card_template},
    "corpus": {"mfa": corpus_card_template, "other": corpus_card_template},
    "ivector": {"mfa": ivector_card_template, "other": ivector_card_template},
    "tokenizer": {"mfa": tokenizer_model_card_template, "other": tokenizer_model_card_template},
}
# Docs-site (MyST markdown) card templates, keyed the same way as above.
docs_card_templates = {
    "acoustic": {"mfa": acoustic_docs_md_template, "other": acoustic_docs_md_template},
    "dictionary": {
        "mfa": mfa_dictionary_docs_md_template,
        "other": other_dictionary_docs_md_template,
    },
    "g2p": {"mfa": g2p_docs_md_template, "other": g2p_docs_md_template},
    "language_model": {"mfa": lm_docs_md_template, "other": lm_docs_md_template},
    "corpus": {"mfa": corpus_docs_md_template, "other": corpus_docs_md_template},
    "ivector": {"mfa": ivector_docs_md_template, "other": ivector_docs_md_template},
    "tokenizer": {"mfa": tokenizer_docs_md_template, "other": tokenizer_docs_md_template},
}
# Human-readable plural names for "see also" section headers.
model_type_names = {
    "acoustic": "Acoustic models",
    "dictionary": "Pronunciation dictionaries",
    "g2p": "G2P models",
    "language_model": "Language models",
    "corpus": "Corpora",
    "ivector": "Ivector extractors",
    "tokenizer": "Tokenizer models",
}
# Column headers for the docs index tables, one spec string per model type.
model_type_columns = {
    "acoustic": "ID;language;dialect;phoneset;license",
    "ivector": "ID;language;license",
    "dictionary": "ID;language;dialect;phoneset;license",
    "g2p": "ID;language;dialect;phoneset;license",
    "language_model": "ID;language;dialect;license",
    "corpus": "ID;language;dialect;license",
    "tokenizer": "ID;language;license",
}
# Percentage widths matching the column specs above.
model_type_column_widths = {
    "acoustic": "40;20;20;10;10",
    "dictionary": "40;20;20;10;10",
    "g2p": "40;20;20;10;10",
    "language_model": "50;20;20;10",
    "ivector": "50;25;25",
    "tokenizer": "50;25;25",
    "corpus": "40;20;25;15",
}
# Accumulates generated metadata dicts as models are processed.
meta_datas = {}
# MyST list-table skeleton for IPA charts; doubled braces survive .format()
# so only stub_column_count/type/header_data/row_data are substituted.
chart_template = """``````{{list-table}}
:header-rows: 1
:stub-columns: {stub_column_count}
:class: {type}_chart table-striped table-bordered
* - {header_data}
* - {row_data}
``````
"""
def generate_extra_data(dictionary, base_indent):
    """Render nested per-phone metadata as indented Markdown bullet lines.

    Nested dicts recurse with one extra space of indent; any sub-dict with
    more than four entries is randomly sampled down to four keys (via the
    module-level ``rng``) to keep the charts compact.
    """
    out = []
    for key, value in dictionary.items():
        if not isinstance(value, dict):
            out.append(f"{base_indent}* {key}: {value}")
            continue
        out.append(f"{base_indent}* {key}")
        if len(value) > 4:
            sampled = rng.choice(list(value.keys()), 4, replace=False)
            value = {k: value[k] for k in sampled}
        out.extend(generate_extra_data(value, base_indent=" " + base_indent))
    return out
def format_ipa_cell(
    phone_data: dict[str, list[str]],
    extra_data: typing.Optional[dict[str, dict[str, typing.Any]]] = None,
    base_indent: str = "",
) -> str:
    """Format one cell of an IPA chart as a MyST ``{ipa_cell}`` directive.

    :param phone_data: mapping of phone-class label to its phones; empty
        classes are skipped.
    :param extra_data: optional per-phone metadata rendered as nested
        bullets beneath the phone.
    :param base_indent: indentation prefix applied to every bullet line.
    :return: the complete directive text, lines joined with newlines.
    """
    if extra_data is None:
        # Bug fix: previously the None default flowed into ``phone in
        # extra_data`` below and raised TypeError when the arg was omitted.
        extra_data = {}
    cell_lines = ["```{ipa_cell}"]
    for phone_class, phones in phone_data.items():
        if not phones:
            continue
        cell_lines.append(f"{base_indent}* {phone_class}")
        for phone in phones:
            cell_lines.append(f"{base_indent} * {phone}")
            if phone in extra_data:
                cell_lines.extend(
                    generate_extra_data(extra_data[phone], base_indent=base_indent + "  ")
                )
    cell_lines.append(f"{base_indent}```")
    return "\n".join(cell_lines)
def check_phone(phone, feature_set, phone_set_type):
    """Test whether ``phone`` belongs to a feature class.

    ARPA phones are matched by exact set membership; for every other phone
    set, a match means any feature symbol appears somewhere in the phone
    string (IPA phones may carry diacritics around the base symbol).
    """
    if phone_set_type is not PhoneSetType.ARPA:
        return any(symbol in phone for symbol in feature_set)
    return phone in feature_set
def analyze_dictionary(dictionary_path, name, phone_set_type):
    """Build phone-chart data for a pronunciation dictionary.

    Loads the dictionary, classifies every non-silence phone by place,
    manner, voicing and vowel features, then renders MyST list-table chart
    strings (via ``chart_template`` / ``format_ipa_cell``) for consonants
    and for oral/nasal vowels.

    Parameters
    ----------
    dictionary_path
        Path to the pronunciation dictionary file.
    name
        Dictionary name, passed through to ``load_dict`` (defined elsewhere
        in this script).
    phone_set_type
        Phone set identifier for ``load_dict`` — callers pass "IPA" or
        "ARPA".

    Returns
    -------
    dict
        Keys: ``consonant_chart``, ``oral_vowel_chart``, ``nasal_vowel_chart``
        (``None`` when no nasal vowels exist), ``diphthongs``,
        ``triphthongs``, ``other``, plus any realized supra-segmental
        category (``stress`` for ARPA, ``tones`` otherwise).
    """
    d = load_dict(dictionary_path, name, phone_set_type=phone_set_type)
    # feature label -> set of phones from this dictionary realizing it
    dictionary_mapping = collections.defaultdict(set)
    if d.phone_set_type is PhoneSetType.ARPA:
        # ARPA marks vowel stress with trailing digits 0-2.
        super_segmentals = {"stress": re.compile(r"[0-2]+")}
        # Feature classes expressed over the ARPA symbol inventory; classes
        # ARPA cannot express (implosives, trills, ...) are left empty.
        ipa_mapping = {
            "stops": d.phone_set_type.stops,
            "voiced": d.phone_set_type.voiced_obstruents,
            "voiceless": d.phone_set_type.voiceless_obstruents,
            "fricative": d.phone_set_type.fricatives,
            "affricates": d.phone_set_type.affricates,
            "sibilant": d.phone_set_type.sibilants,
            "lateral": d.phone_set_type.laterals,
            "nasal": d.phone_set_type.nasals,
            "approximant": d.phone_set_type.approximants,
            "labial": d.phone_set_type.labials,
            "labiodental": d.phone_set_type.labiodental,
            "dental": d.phone_set_type.dental,
            "alveolar": d.phone_set_type.alveolar,
            "alveopalatal": d.phone_set_type.alveopalatal,
            "velar": d.phone_set_type.velar,
            "glottal": d.phone_set_type.glottal,
            "implosive": set(),
            "lateral_tap": set(),
            "tap": set(),
            "palatal": d.phone_set_type.palatal,
            "trill": set(),
            "pharyngeal": set(),
            "epiglottal": set(),
            "uvular": set(),
            "retroflex": set(),
            "lateral_fricative": set(),
            "close": d.phone_set_type.close_vowels,
            "close-mid": d.phone_set_type.close_mid_vowels,
            "open-mid": d.phone_set_type.open_mid_vowels,
            "open": d.phone_set_type.open_vowels,
            # IH/UH are plotted in the near-front/near-back columns, so they
            # are pulled out of the plain front/back classes.
            "front": d.phone_set_type.front_vowels - {"IH"},
            "near-front": {"IH"},
            "central": d.phone_set_type.central_vowels,
            "back": d.phone_set_type.back_vowels - {"UH"},
            "near-back": {"UH"},
            "rounded": d.phone_set_type.rounded_vowels,
            "unrounded": d.phone_set_type.unrounded_vowels,
            "lax": {"IH", "UH", "AH", "AE", "ER"},
            "other": set(),
        }
    else:
        # IPA feature classes, drawn from the phone set's symbol inventories.
        ipa_mapping = {
            "stops": d.phone_set_type.stops,
            "voiced": d.phone_set_type.voiced_obstruents,
            "voiceless": d.phone_set_type.voiceless_obstruents,
            "implosive": d.phone_set_type.implosive_obstruents,
            "fricative": d.phone_set_type.fricatives,
            "sibilant": d.phone_set_type.sibilants,
            "lateral": d.phone_set_type.laterals,
            "lateral_fricative": d.phone_set_type.lateral_fricatives,
            "nasal": d.phone_set_type.nasals,
            "nasal_approximants": d.phone_set_type.nasal_approximants,
            "trill": d.phone_set_type.trills,
            "tap": d.phone_set_type.taps,
            "lateral_tap": d.phone_set_type.lateral_taps,
            "approximant": d.phone_set_type.approximants - d.phone_set_type.nasal_approximants,
            "labial": d.phone_set_type.labials,
            "labiodental": d.phone_set_type.labiodental,
            "dental": d.phone_set_type.dental,
            "alveolar": d.phone_set_type.alveolar,
            "retroflex": d.phone_set_type.retroflex,
            "alveopalatal": d.phone_set_type.alveopalatal,
            "palatal": d.phone_set_type.palatal,
            "velar": d.phone_set_type.velar,
            "uvular": d.phone_set_type.uvular,
            "pharyngeal": d.phone_set_type.pharyngeal,
            "epiglottal": d.phone_set_type.epiglottal,
            "glottal": d.phone_set_type.glottal,
            "close": d.phone_set_type.close_vowels,
            "close-mid": d.phone_set_type.close_mid_vowels,
            "open-mid": d.phone_set_type.open_mid_vowels,
            "open": d.phone_set_type.open_vowels,
            # Lax close vowels are plotted in the near-front/near-back
            # columns, so they are pulled out of the plain front/back classes.
            "front": d.phone_set_type.front_vowels - {"ɪ", "ʏ", "ɛ̈", "ʏ̈"},
            "near-front": {"ɪ", "ʏ", "ɛ̈", "ʏ̈"},
            "central": d.phone_set_type.central_vowels,
            "back": d.phone_set_type.back_vowels - {"ʊ", "ɔ̈"},
            "near-back": {"ʊ", "ɔ̈"},
            "rounded": d.phone_set_type.rounded_vowels,
            "unrounded": d.phone_set_type.unrounded_vowels,
            "lax": {"ɪ", "ʏ", "ʊ", "ə", "ɐ", "æ", "ɚ"},
            "nasalized": {"ã", "õ", "ĩ", "ũ", "ẽ"},
            # Doubly-articulated / unclassifiable consonants.
            "other": {"kp", "ɧ", "ŋm"},
        }
        # Tone letters and the glottalization marker are supra-segmental.
        super_segmentals = {"tones": re.compile(r"[˩˨˧˦˥ˀ]+")}
    # Expand every feature class with its voiced/voiceless diacritic
    # variants so modified phones classify like their base symbols.
    # (Note: classes iterate in insertion order, so "voiceless" has already
    # been expanded by the time later classes consult it.)
    for k, v in ipa_mapping.items():
        voiceless = [x for x in v if x in ipa_mapping["voiceless"]]
        voiced = [x for x in v if x not in ipa_mapping["voiceless"]]
        mod_phones = set()
        for p in voiceless:
            mod_phones |= voiceless_variants(p)
        for p in voiced:
            mod_phones |= voiced_variants(p)
        ipa_mapping[k] = mod_phones | v
    # phone -> {"Occurrences": int, "Examples": {word: "[pronunciation]"}}
    extra_data = {}
    with d.session() as session:
        phones = session.query(Phone).filter(Phone.phone_type == PhoneType.non_silence)
        # Count phone occurrences across all pronunciation variants.
        phone_counts = collections.Counter()
        pronunciations = session.query(Pronunciation.pronunciation)
        for (p,) in pronunciations:
            p = p.split()
            phone_counts.update(p)
        total_phones = set()
        triphthongs = d.phone_set_type.triphthong_phones
        diphthongs = d.phone_set_type.diphthong_phones
        for phone in phones:
            # Up to 4 random short example words (3-5 characters, with
            # probability-annotated pronunciations) containing this phone.
            words = (
                session.query(Word.word, Pronunciation.pronunciation)
                .join(Word.pronunciations)
                .filter(
                    sqlalchemy.func.length(Word.word) > 2,
                    sqlalchemy.func.length(Word.word) < 6,
                    Pronunciation.probability != None,  # noqa
                    Pronunciation.pronunciation.regexp_match(rf"\b{phone.phone}(?=\s|$)"),
                )
                .distinct()
                .order_by(sqlalchemy.func.random())
                .limit(4)
            )
            for super_seg, pattern in super_segmentals.items():
                phone_m = pattern.search(phone.phone)
                if phone_m:
                    # Record the supra-segmental marker, then strip it so all
                    # variants of the bare phone pool their counts/examples.
                    dictionary_mapping[super_seg].add(phone_m.group(0))
                    counts = phone_counts[phone.phone]
                    examples = {}
                    for w, pron in words:
                        examples[w] = f"[{pron}]"
                    # From here on ``phone`` is rebound to a plain string.
                    phone = phone.phone.replace(phone_m.group(0), "")
                    if phone not in extra_data:
                        extra_data[phone] = {"Occurrences": 0, "Examples": {}}
                    if isinstance(extra_data[phone]["Occurrences"], str):
                        # Guard against counts previously serialized as strings.
                        try:
                            extra_data[phone]["Occurrences"] = int(extra_data[phone]["Occurrences"])
                        except ValueError:
                            extra_data[phone]["Occurrences"] = 0
                    extra_data[phone]["Occurrences"] += counts
                    extra_data[phone]["Examples"].update(examples)
                    break
            else:
                # No supra-segmental marker found: record the phone as-is.
                extra_data[phone.phone] = {"Occurrences": phone_counts[phone.phone], "Examples": {}}
                phone = phone.phone
                for w, pron in words:
                    extra_data[phone]["Examples"][w] = f"[{pron}]"
            base_phone = d.get_base_phone(phone)
            query_set = {phone, base_phone}
            if base_phone in ipa_mapping["other"]:
                dictionary_mapping["other"].add(phone)
                continue
            # Diacritic-driven secondary classifications.
            if "ʲ" in phone:
                dictionary_mapping["palatalized"].add(phone)
            if "ʷ" in phone:
                dictionary_mapping["labialized"].add(phone)
            if "̃" in phone:
                dictionary_mapping["nasalized"].add(phone)
                base_phone = base_phone.replace("̃", "")
            if "͈" in phone:
                # Tense (fortis) diacritic.
                dictionary_mapping["tense"].add(phone)
                dictionary_mapping["voiceless"].add(phone)
            if "̪" in phone:
                dictionary_mapping["dental"].add(phone)
            # Voicing / airstream classification — first matching branch wins.
            if any(x in phone for x in ["ⁿ", "ᵑ", "ᵐ"]):
                dictionary_mapping["prenasalized"].add(phone)
                dictionary_mapping["voiced"].add(phone)
            elif "ʱ" in phone or "̤" in phone:
                # Breathy voice is treated as voiced aspiration here.
                dictionary_mapping["aspirated"].add(phone)
                dictionary_mapping["voiced"].add(phone)
            elif check_phone(phone, ipa_mapping["voiced"], d.phone_set_type):
                dictionary_mapping["voiced"].add(phone)
            elif check_phone(phone, ipa_mapping["implosive"], d.phone_set_type):
                dictionary_mapping["implosive"].add(phone)
                dictionary_mapping["voiced"].add(phone)
            elif "ʰ" in phone:
                dictionary_mapping["aspirated"].add(phone)
                dictionary_mapping["voiceless"].add(phone)
            elif "ʼ" in phone:
                dictionary_mapping["ejective"].add(phone)
                dictionary_mapping["voiceless"].add(phone)
            elif check_phone(phone, ipa_mapping["voiceless"], d.phone_set_type):
                dictionary_mapping["voiceless"].add(phone)
            if "̚" in phone:
                dictionary_mapping["unreleased"].add(phone)
            # Vowel-cluster / complex-consonant classification.
            if any(x in diphthongs for x in query_set):
                dictionary_mapping["diphthong"].add(phone)
            elif any(x in triphthongs for x in query_set):
                dictionary_mapping["triphthong"].add(phone)
            elif any(x in d.phone_set_type.affricates for x in query_set):
                dictionary_mapping["affricate"].add(phone)
            elif any(x in d.phone_set_type.stops for x in query_set):
                dictionary_mapping["stop"].add(phone)
            # Feature-class membership, keyed off the base phone.
            for k, v in ipa_mapping.items():
                if base_phone in v:
                    dictionary_mapping[k].add(phone)
            total_phones.add(phone)
            # If the phone matched no category at all, file it under "other".
            for v in dictionary_mapping.values():
                if phone in v:
                    break
            else:
                dictionary_mapping["other"].add(phone)
    # Consonant chart columns: only places of articulation actually realized.
    places = [
        "labial",
        "labiodental",
        "dental",
        "alveolar",
        "alveopalatal",
        "retroflex",
        "palatal",
        "velar",
        "uvular",
        "pharyngeal",
        "epiglottal",
        "glottal",
    ]
    columns = []
    for p in places:
        if p in dictionary_mapping:
            columns.append(p)
    # Each realized sub-manner (e.g. aspirated stops) gets its own chart row.
    sub_manners = ["tense", "aspirated", "implosive", "ejective", "unreleased", "prenasalized"]
    rows = []
    # Phones that made it into some chart cell (used to compute leftovers).
    plotted = set()
    for manner in [
        "nasal",
        "stop",
        "affricate",
        "sibilant",
        "fricative",
        "approximant",
        "tap",
        "trill",
        "lateral_fricative",
        "lateral",
        "lateral_tap",
    ]:
        if manner not in dictionary_mapping:
            continue
        realized_submanner_rows = {}
        for x in sub_manners:
            if dictionary_mapping[manner] & dictionary_mapping[x]:
                realized_submanner_rows[x] = [f"{{submanner}}`{x.title()}`"]
        row_title = f"{{manner}}`{manner.replace('_',' ').title()}`"
        if realized_submanner_rows:
            row_title += " {submanner}`Plain`"
        row = [row_title]
        for place in columns:
            # NOTE(review): ``cell_set`` duplicates ``base_set`` and is never
            # read after the subtraction loop; it looks like leftover code.
            cell_set = dictionary_mapping[manner] & dictionary_mapping[place]
            base_set = dictionary_mapping[manner] & dictionary_mapping[place]
            # Plain row excludes phones that belong to any sub-manner.
            for x in sub_manners:
                cell_set -= dictionary_mapping[x]
                base_set -= dictionary_mapping[x]
            voiced_set = base_set & dictionary_mapping["voiced"]
            voiceless_set = base_set & dictionary_mapping["voiceless"]
            other_set = base_set - dictionary_mapping["voiceless"] - dictionary_mapping["voiced"]
            plotted.update(voiceless_set)
            plotted.update(voiced_set)
            plotted.update(other_set)
            cell_data = {
                "voiceless": sorted(voiceless_set),
                "voiced": sorted(voiced_set),
                "other": sorted(other_set),
            }
            cell_contents = format_ipa_cell(cell_data, extra_data, base_indent=" ")
            row.append(cell_contents)
        rows.append(row)
        if realized_submanner_rows:
            for place in columns:
                for sub_manner in realized_submanner_rows.keys():
                    cell_set = (
                        dictionary_mapping[manner]
                        & dictionary_mapping[place]
                        & dictionary_mapping[sub_manner]
                    )
                    # Keep each phone in exactly one sub-manner row.
                    for s in realized_submanner_rows.keys():
                        if s == sub_manner:
                            continue
                        cell_set -= dictionary_mapping[s]
                    voiced_set = cell_set & dictionary_mapping["voiced"]
                    voiceless_set = cell_set & dictionary_mapping["voiceless"]
                    other_set = (
                        cell_set - dictionary_mapping["voiceless"] - dictionary_mapping["voiced"]
                    )
                    plotted.update(voiceless_set)
                    plotted.update(voiced_set)
                    plotted.update(other_set)
                    cell_data = {
                        "voiceless": sorted(voiceless_set),
                        "voiced": sorted(voiced_set),
                        "other": sorted(other_set),
                    }
                    cell_contents = format_ipa_cell(cell_data, extra_data, base_indent=" ")
                    realized_submanner_rows[sub_manner].append(cell_contents)
            rows.extend(realized_submanner_rows.values())
    row_headers = ["Manner"]
    columns = row_headers + columns
    consonants = {"header": columns, "rows": rows}
    # Vowel charts: height x backness grids, split into oral and nasal charts.
    oral_rows = []
    nasal_rows = []
    headers = ["front", "near-front", "central", "near-back", "back"]
    has_nasal = False
    for height in ["close", "close-mid", "open-mid", "open"]:
        for on in ["nasalized", "oral"]:
            # "oral" is never a key in dictionary_mapping, so the membership
            # tests below effectively mean: on == "nasalized" and the
            # language actually has nasalized vowels.
            main_row = [height.title()]
            lax_row = [""]
            for column in headers:
                cell_set = dictionary_mapping[height] & dictionary_mapping[column]
                if on in dictionary_mapping:  # nasalized
                    cell_set &= dictionary_mapping["nasalized"]
                    if cell_set and not has_nasal:
                        has_nasal = True
                else:
                    cell_set -= dictionary_mapping["nasalized"]
                # The near-front/near-back columns hold only the lax
                # counterparts of close vowels; elsewhere tense and lax
                # share a column (split into main_row / lax_row).
                if height == "close" and column in {"front", "back"}:
                    lax_set = set()
                    tense_set = cell_set - dictionary_mapping["lax"]
                elif height == "close" and column in {"near-front", "near-back"}:
                    tense_set = set()
                    lax_set = cell_set & dictionary_mapping["lax"]
                else:
                    tense_set = cell_set - dictionary_mapping["lax"]
                    lax_set = cell_set & dictionary_mapping["lax"]
                tense_rounded = tense_set & dictionary_mapping["rounded"]
                tense_unrounded = tense_set & dictionary_mapping["unrounded"]
                cell_data = {
                    "unrounded": sorted(tense_unrounded),
                    "rounded": sorted(tense_rounded),
                }
                plotted.update(tense_unrounded)
                plotted.update(tense_rounded)
                tense_cell_contents = format_ipa_cell(cell_data, extra_data, base_indent=" ")
                lax_rounded = lax_set & dictionary_mapping["rounded"]
                lax_unrounded = lax_set & dictionary_mapping["unrounded"]
                plotted.update(lax_rounded)
                plotted.update(lax_unrounded)
                cell_data = {
                    "unrounded": sorted(lax_unrounded),
                    "rounded": sorted(lax_rounded),
                }
                lax_cell_contents = format_ipa_cell(cell_data, extra_data, base_indent=" ")
                main_row.append(tense_cell_contents)
                lax_row.append(lax_cell_contents)
            if on in dictionary_mapping:  # nasalized
                nasal_rows.append(main_row)
                if height != "open":
                    nasal_rows.append(lax_row)
            else:
                oral_rows.append(main_row)
                if height != "open":
                    oral_rows.append(lax_row)
    # Blank stub column header plus title-cased backness labels.
    headers = [""] + [x.title() for x in headers]
    if not has_nasal:
        nasal_rows = None
    # Render the consonant chart through the shared list-table template.
    header_row_string = "\n - ".join(x.title() for x in consonants["header"])
    row_strings = "\n* - ".join("\n - ".join(x) for x in consonants["rows"])
    stub_column_count = 1
    consonant_chart = chart_template.format(
        header_data=header_row_string,
        row_data=row_strings,
        type="consonant",
        stub_column_count=stub_column_count,
    )
    vowels = {
        "oral_rows": oral_rows,
        "nasal_rows": nasal_rows,
        "header": headers,
    }
    header_row_string = "\n - ".join(vowels["header"])
    row_strings = "\n* - ".join("\n - ".join(x) for x in vowels["oral_rows"])
    oral_chart = chart_template.format(
        header_data=header_row_string, row_data=row_strings, type="vowel", stub_column_count=1
    )
    nasal_chart = None
    if nasal_rows:
        header_row_string = "\n - ".join(vowels["header"])
        row_strings = "\n* - ".join("\n - ".join(x) for x in vowels["nasal_rows"])
        nasal_chart = chart_template.format(
            header_data=header_row_string, row_data=row_strings, type="vowel", stub_column_count=1
        )
    data = {
        "consonant_chart": consonant_chart,
        "oral_vowel_chart": oral_chart,
        "nasal_vowel_chart": nasal_chart,
        "diphthongs": dictionary_mapping["diphthong"],
        # Unclassified phones that never landed in any chart cell.
        "other": dictionary_mapping["other"] & (total_phones - plotted),
        "triphthongs": dictionary_mapping["triphthong"],
    }
    # Attach realized supra-segmental categories (stress/tones) if present.
    for k in super_segmentals.keys():
        if k in dictionary_mapping:
            data[k] = dictionary_mapping[k]
    return data
# ---------------------------------------------------------------------------
# Stage new models: move each file out of <model_type>/staging into a
# versioned per-language folder (generating metadata and licenses), then
# re-scan previously staged models so the id mappings cover the whole repo.
# mfa_model_root, OVERWRITE_METADATA, generate_meta_data and generate_id are
# defined elsewhere in this script.
# ---------------------------------------------------------------------------
phone_charts = {}  # dictionary name -> chart data from analyze_dictionary()
model_mappings = {}  # model_type -> lookup key -> [model ids]
for model_type, model_class in MODEL_TYPES.items():
    # if model_type != 'ivector':
    #     continue
    meta_datas[model_type] = {}
    model_mappings[model_type] = {}
    model_directory = os.path.join(mfa_model_root, model_type)
    staging_directory = os.path.join(model_directory, "staging")
    models_to_stage = os.listdir(staging_directory)
    for file_name in models_to_stage:
        if not os.path.isfile(os.path.join(staging_directory, file_name)):
            continue
        if model_type == "dictionary" and not file_name.endswith(".dict"):
            continue
        print(file_name)
        model = model_class(os.path.join(staging_directory, file_name))
        print(model.meta)
        # Model names look like <language>[_<dialect>...][_<phone_set>];
        # parse language / dialect / phone set out of the underscore parts.
        s = model.name.split("_")
        dialect = ""
        if model_type == "language_model":
            if "_mfa" in model.name:
                s = model.name.replace("_mfa", "").split("_")
            language = "_".join(s[:-1])
            dialect = " ".join(s[1:-1])
            phone_set = "MFA"
        elif model_type == "ivector":
            # ivector extractors carry no phone set or dialect.
            language = model.name.replace("_mfa", "")
            dialect = ""
            phone_set = ""
        elif model_type == "tokenizer":
            language = model.name.replace("_mfa", "")
            dialect = ""
            phone_set = ""
        elif len(s) == 1:
            language = s[0]
            phone_set = "Unknown"
            dialect = ""
        elif len(s) == 2:
            language, phone_set = s
            phone_set = phone_set.upper()
            dialect = ""
        else:
            language = s[0]
            phone_set = s[-1].upper()
            dialect = " ".join(s[1:-1])
        try:
            version = model.meta["version"]
        except KeyError:
            version = montreal_forced_aligner.utils.get_mfa_version()
        # Staged 2.x/3.x models are re-released under the current version.
        if version.startswith("2.") or version.startswith("3."):
            version = CURRENT_MODEL_VERSION
        language = language.title()
        # Two-letter dialect codes (e.g. "us", "uk") are upper-cased.
        if len(dialect) == 2:
            dialect = dialect.upper()
        else:
            dialect = dialect.title()
        print(model_directory, language, phone_set, version)
        if dialect:
            phone_set_folder = f"{dialect}_{phone_set}".replace(" ", "_").lower()
        else:
            phone_set_folder = phone_set.lower()
        if phone_set_folder:
            output_directory = os.path.join(
                model_directory, language.lower(), phone_set_folder, f"v{version}"
            )
        else:
            # ivector/tokenizer models get no phone set subfolder.
            output_directory = os.path.join(model_directory, language.lower(), f"v{version}")
        os.makedirs(output_directory, exist_ok=True)
        # Copy the repository license in, except for Common Voice ("CV")
        # derived models, which keep their own licensing.
        license_path = os.path.join(output_directory, "LICENSE")
        if phone_set != "CV" and not os.path.exists(license_path):
            shutil.copyfile(os.path.join(mfa_model_root, "LICENSE"), license_path)
        meta_path = os.path.join(output_directory, "meta.json")
        if OVERWRITE_METADATA or not os.path.exists(meta_path):
            meta_data = generate_meta_data(
                model, model_type, language, dialect, version, phone_set
            )
            with open(meta_path, "w", encoding="utf8") as f:
                json.dump(meta_data, f, indent=4, ensure_ascii=False)
        else:
            with open(meta_path, "r", encoding="utf8") as f:
                meta_data = json.load(f)
        meta_datas[model_type][generate_id(meta_data, model_type)] = meta_data
        # Register this model under every key it can be looked up by:
        # language, (language, dialect) and (language[, dialect], phone_set).
        keys = [language]
        if model_type in {"language_model", "ivector", "tokenizer"}:
            if dialect:
                keys.append((language, dialect))
                key = (language, dialect)
        else:
            if dialect:
                keys.append((language, dialect))
                keys.append((language, dialect, phone_set))
                key = (language, dialect, phone_set)
                dialect_key = (language, dialect)
            else:
                keys.append((language, phone_set))
        for key in keys:
            if key not in model_mappings[model_type]:
                model_mappings[model_type][key] = []
            model_mappings[model_type][key].append(generate_id(meta_data, model_type))
        # Build phone charts for newly staged dictionaries.
        if model_type == "dictionary" and phone_set in {"MFA", "CV", "ARPA"}:
            phone_set_type = "IPA"
            if phone_set == "ARPA":
                phone_set_type = "ARPA"
            phone_charts[meta_data["name"]] = analyze_dictionary(
                model.path, model.name, phone_set_type
            )
        # if language == 'hindi':
        #     err
    # Re-scan models staged by previous runs so mappings are complete.
    existing_models = []
    for language in os.listdir(model_directory):
        # Skip non-language working directories.
        if language in {"staging", "training", "filter_lists", "1.0"}:
            continue
        language_directory = os.path.join(model_directory, language)
        if not os.path.isdir(language_directory):
            continue
        language = language.title()
        if model_type in {"ivector", "tokenizer"}:
            # Layout: <model_type>/<language>/<version>/meta.json
            for version in os.listdir(language_directory):
                meta_path = os.path.join(language_directory, version, "meta.json")
                if not os.path.exists(meta_path):
                    continue
                with open(meta_path, "r", encoding="utf8") as f:
                    meta_data = json.load(f)
                meta_datas[model_type][generate_id(meta_data, model_type)] = meta_data
                keys = [language]
                for key in keys:
                    if key not in model_mappings[model_type]:
                        model_mappings[model_type][key] = []
                    model_mappings[model_type][key].append(generate_id(meta_data, model_type))
        else:
            # Layout: <model_type>/<language>/[<dialect>_]<phone_set>/<version>/meta.json
            for phone_set in os.listdir(language_directory):
                print(phone_set)
                phone_set_dir = os.path.join(language_directory, phone_set)
                if "_" in phone_set:
                    dialect, phone_set = phone_set.rsplit("_", maxsplit=1)
                else:
                    dialect = ""
                for version in os.listdir(phone_set_dir):
                    meta_path = os.path.join(phone_set_dir, version, "meta.json")
                    if not os.path.exists(meta_path):
                        continue
                    with open(meta_path, "r", encoding="utf8") as f:
                        meta_data = json.load(f)
                    meta_datas[model_type][generate_id(meta_data, model_type)] = meta_data
                    keys = [language]
                    if model_type == "language_model":
                        if dialect:
                            keys.append((language, dialect))
                            key = (language, dialect)
                    else:
                        if dialect:
                            keys.append((language, dialect))
                            keys.append((language, dialect, phone_set))
                            key = (language, dialect, phone_set)
                            dialect_key = (language, dialect)
                        else:
                            keys.append((language, phone_set))
                    for key in keys:
                        if key not in model_mappings[model_type]:
                            model_mappings[model_type][key] = []
                        model_mappings[model_type][key].append(generate_id(meta_data, model_type))
if "dictionary" in meta_datas:
for k in model_corpus_mapping.keys():
dict_id = k.replace("acoutic model", "dictionary")
if dict_id in meta_datas["dictionary"]:
model_dictionary_mapping[k] = [dict_id]
if "g2p" in meta_datas:
for v in model_corpus_mapping.values():
for d_id in v:
g2p_id = d_id.replace("dictionary", "G2P model")
if g2p_id in meta_datas["g2p"]:
model_dictionary_mapping[g2p_id] = [d_id]
if "language_model" in meta_datas:
for k, v in model_dictionary_mapping.items():
lm_id = k.replace("acoustic", "language")
if lm_id in meta_datas["language_model"]:
model_dictionary_mapping[lm_id] = v
if "tokenizer" in meta_datas:
for k, v in model_dictionary_mapping.items():
tokenizer_id = k.replace("tokenizer", "language")
if tokenizer_id in meta_datas["tokenizer"]:
model_dictionary_mapping[tokenizer_id] = v
# ---------------------------------------------------------------------------
# Load corpus metadata from staging and index corpora by language/dialect.
# license_links, make_path_safe and generate_id are defined elsewhere in
# this script.
# ---------------------------------------------------------------------------
corpora_metadata = {}  # corpus id -> corpus metadata dict
model_mappings["corpus"] = {}
corpus_metadata_file = os.path.join(mfa_model_root, "corpus", "staging", "corpus_data.json")
if os.path.exists(corpus_metadata_file):
    with open(corpus_metadata_file, "r", encoding="utf8") as f:
        data = json.load(f)
    for language, c_list in data.items():
        # Hindi-Urdu corpora are excluded from the generated indexes.
        if language == "Hindi-Urdu":
            continue
        for c in c_list:
            name = c["name"]
            if "version" in c:
                name += f'_{c["version"]}'
            # NOTE(review): ``id`` is computed but never used below (and
            # shadows the builtin); c["id"] comes from generate_id() instead.
            id = make_path_safe(name)
            c["language"] = language
            c["id"] = generate_id(c, "corpus")
            # Markdown link rendered into model cards / docs pages.
            c["license_link"] = f"[{c['license']}]({license_links[c['license']]})"
            if "dialects" not in c:
                c["dialects"] = []
            # Normalize dialect names: 2-letter codes upper-cased, the rest
            # title-cased (matches the model staging convention above).
            c["dialects"] = [x.title() if len(x) > 2 else x.upper() for x in c["dialects"]]
            corpora_metadata[c["id"]] = c
            print(c)
            print(generate_id(c, "corpus"))
            # Index by language, and additionally by (language, dialect).
            language_key = language
            if language_key not in model_mappings["corpus"]:
                model_mappings["corpus"][language_key] = []
            model_mappings["corpus"][language_key].append(c["id"])
            if c["dialects"]:
                for d in c["dialects"]:
                    key = (language, d)
                    if key not in model_mappings["corpus"]:
                        model_mappings["corpus"][key] = []
                    model_mappings["corpus"][key].append(c["id"])
meta_datas["corpus"] = corpora_metadata
# ---------------------------------------------------------------------------
# Add links: embed corpus metadata into each model's metadata and wire up
# model <-> dictionary <-> corpus id references in both directions.
# ---------------------------------------------------------------------------
for model_type, data in meta_datas.items():
    for model_name, meta_data in data.items():
        model_id = generate_id(meta_data, model_type)
        # Attach full corpus metadata to trainable model types, and record
        # the reverse link on each corpus entry.
        if model_type in {"acoustic", "language_model", "ivector", "tokenizer"}:
            print("HELLO!?", model_id, model_corpus_mapping.keys())
            if model_id in model_corpus_mapping:
                print(model_corpus_mapping[model_id])
                print(corpora_metadata.keys())
                meta_data["corpus"] = [corpora_metadata[x] for x in model_corpus_mapping[model_id]]
                for corpus_id in model_corpus_mapping[model_id]:
                    if model_type not in meta_datas["corpus"][corpus_id]:
                        meta_datas["corpus"][corpus_id][model_type] = []
                    meta_datas["corpus"][corpus_id][model_type].append(model_id)
        # Lookup key into model_mappings: phone-set-aware types include the
        # phone set; the others key on language (and dialect if present).
        if model_type in {"language_model", "corpus", "ivector"}:
            if "dialect" in meta_data and meta_data["dialect"]:
                key = (meta_data["language"], meta_data["dialect"])
            else:
                key = meta_data["language"]
        else:
            if "dialect" in meta_data and meta_data["dialect"]:
                key = (meta_data["language"], meta_data["dialect"], meta_data["phone_set"])
            else:
                key = (meta_data["language"], meta_data["phone_set"])
        if model_type in {"acoustic", "language_model", "g2p"}:
            print(meta_data["name"])
            print(key)
            print(model_mappings["dictionary"])
            # Link matching dictionaries by lookup key ...
            if key in model_mappings["dictionary"]:
                if "dictionary" not in meta_data:
                    meta_data["dictionary"] = []
                meta_data["dictionary"].extend(model_mappings["dictionary"][key])
            # ... and by explicit id mapping, skipping duplicates.
            if model_id in model_dictionary_mapping:
                if "dictionary" not in meta_data:
                    meta_data["dictionary"] = []
                meta_data["dictionary"].extend(
                    [
                        x
                        for x in model_dictionary_mapping[model_id]
                        if x not in meta_data["dictionary"]
                    ]
                )
        elif model_type == "dictionary":
            # Reverse links: everything that shares this dictionary's key.
            for t in ["acoustic", "g2p", "language_model", "corpus"]:
                if key in model_mappings[t]:
                    if t not in meta_data:
                        meta_data[t] = []
                    meta_data[t].extend(model_mappings[t][key])
        elif model_type == "corpus":
            # Corpora link dictionaries per dialect when dialects are listed,
            # otherwise fall back to all dictionaries for the language.
            meta_data["dictionary"] = []
            if "dialects" in meta_data and meta_data["dialects"]:
                for dialect in meta_data["dialects"]:
                    key = (meta_data["language"], dialect)
                    if key in model_mappings["dictionary"]:
                        meta_data["dictionary"].extend(model_mappings["dictionary"][key])
            else:
                print(
                    meta_data["language"],
                    model_mappings["dictionary"],
                    meta_data["language"] in model_mappings["dictionary"],
                )
                if meta_data["language"] in model_mappings["dictionary"]:
                    for dictionary_id in model_mappings["dictionary"][meta_data["language"]]:
                        m = meta_datas["dictionary"][dictionary_id]
                        meta_data["dictionary"].append(dictionary_id)
# ---------------------------------------------------------------------------
# Generate per-model README cards and Sphinx docs pages, plus index.rst
# files per model type and per language.  model_card_templates,
# docs_card_templates, get_model_card_directory, extract_model_card_fields,
# extract_doc_card_fields and OVERWRITE_MD are defined elsewhere in this
# script.  (RST indentation inside the f-strings below was reconstructed —
# the source paste had its whitespace stripped.)
# ---------------------------------------------------------------------------
for model_type, data in meta_datas.items():
    docs_dir = os.path.join(mfa_model_root, "docs", "source", model_type)
    os.makedirs(docs_dir, exist_ok=True)
    language_model_doc_mds = {}  # language -> [doc page filenames]
    for model_name, meta_data in data.items():
        print(model_name, meta_data)
        # Legacy phone sets get the "other" card template; CV models get the
        # "other" README but the standard docs page.
        if model_type not in {"language_model", "corpus"} and meta_data["phone_set"] in {
            "PROSODYLAB",
            "PINYIN",
        }:
            model_card_template = model_card_templates[model_type]["other"]
            docs_md_template = docs_card_templates[model_type]["other"]
        elif model_type not in {"language_model", "corpus"} and meta_data["phone_set"] in {"CV"}:
            model_card_template = model_card_templates[model_type]["other"]
            docs_md_template = docs_card_templates[model_type]["mfa"]
        else:
            model_card_template = model_card_templates[model_type]["mfa"]
            docs_md_template = docs_card_templates[model_type]["mfa"]
        if model_type == "language_model":
            language, version = meta_data["language"], meta_data["version"]
        elif model_type == "corpus":
            language, name = meta_data["language"], meta_data["name"]
            name = make_path_safe(name)
        else:
            language, phone_set, dialect, version = (
                meta_data["language"],
                meta_data["phone_set"],
                meta_data["dialect"],
                meta_data["version"],
            )
        output_directory = get_model_card_directory(model_type, meta_data)
        os.makedirs(output_directory, exist_ok=True)
        model_card_path = os.path.join(output_directory, "README.md")
        rst_path = model_name + ".md"
        docs_language_dir = os.path.join(docs_dir, language)
        if language not in language_model_doc_mds:
            language_model_doc_mds[language] = []
        os.makedirs(docs_language_dir, exist_ok=True)
        docs_card_path = os.path.join(docs_language_dir, rst_path)
        language_model_doc_mds[language].append(rst_path)
        # Write the model card (README.md next to the model file).
        if OVERWRITE_MD or not os.path.exists(model_card_path):
            with open(model_card_path, "w", encoding="utf8") as f:
                print(meta_data)
                fields = extract_model_card_fields(meta_data, model_type)
                f.write(model_card_template.format(**fields))
        # Write the docs page for the Sphinx site.
        if OVERWRITE_MD or not os.path.exists(docs_card_path):
            with open(docs_card_path, "w", encoding="utf8") as f:
                print(meta_data)
                fields = extract_doc_card_fields(meta_data, model_type)
                f.write(docs_md_template.format(**fields))
    # Top-level index for this model type: a needtable over all models of
    # the type plus a hidden toctree of the per-language indexes.
    index_path = os.path.join(docs_dir, "index.rst")
    rst_string = " " + "\n ".join(
        f"{x}/index.rst" for x in sorted(language_model_doc_mds.keys())
    )
    if model_type == "dictionary":
        rst_string = " ../mfa_phone_set.md\n" + rst_string
    model_type_name = model_type_names[model_type]
    columns = model_type_columns[model_type]
    widths = model_type_column_widths[model_type]
    with open(index_path, "w", encoding="utf8") as f:
        f.write(
            f"""
.. _{model_type}:

{model_type_name}
{'='* len(model_type_name)}

.. needtable::
   :types: {model_type}
   :style: datatable
   :columns: {columns}
   :class: table-striped
   :colwidths: {widths}

.. toctree::
   :hidden:

{rst_string}
"""
        )
    # Per-language index pages, filtered to this model type and language.
    for language, model_doc_mds in sorted(language_model_doc_mds.items()):
        index_path = os.path.join(docs_dir, language, "index.rst")
        rst_string = " " + "\n ".join(model_doc_mds)
        with open(index_path, "w", encoding="utf8") as f:
            f.write(
                f"""
.. _{model_type}_{language.lower()}:

{language.title()}
{'='* len(language)}

.. needtable::
   :types: {model_type}
   :filter: language == "{language.title()}"
   :style: datatable
   :columns: {columns}
   :class: table-striped
   :colwidths: {widths}

.. toctree::
   :hidden:

{rst_string}
"""
            )