|
|
import collections |
|
|
import json |
|
|
import os |
|
|
import pathlib |
|
|
import random |
|
|
import re |
|
|
import shutil |
|
|
import typing |
|
|
from datetime import datetime |
|
|
|
|
|
import numpy as np |
|
|
import sqlalchemy |
|
|
from montreal_forced_aligner import config |
|
|
|
|
|
config.TEMPORARY_DIRECTORY = pathlib.Path(__file__).resolve().parent
|
|
config.USE_POSTGRES = False |
|
|
import montreal_forced_aligner.utils |
|
|
from montreal_forced_aligner.data import PhoneSetType, voiced_variants, voiceless_variants |
|
|
from montreal_forced_aligner.db import Phone, PhoneType, Pronunciation, Word |
|
|
from montreal_forced_aligner.dictionary.multispeaker import MultispeakerDictionary |
|
|
from montreal_forced_aligner.models import MODEL_TYPES |
|
|
|
|
|
rng = np.random.default_rng(1234) |
|
|
random.seed(1234) |
|
|
|
|
|
root_dir = pathlib.Path(__file__).resolve().parent |
|
|
template_dir = root_dir.joinpath("templates") |
|
|
|
|
|
CURRENT_MODEL_VERSION = "3.3.0" |
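
# Corpora used to train the current generation of models, keyed by language.
# The release-specific mappings below pin the exact corpus versions used for
# each published model version.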
|
|
|
|
|
|
|
|
|
|
|
current_corpora = { |
|
|
"english": [ |
|
|
"Common Voice English v8_0", |
|
|
"LibriSpeech English", |
|
|
"Corpus of Regional African American Language v2021_07", |
|
|
"Google Nigerian English", |
|
|
"Google UK and Ireland English", |
|
|
"NCHLT English", |
|
|
"ARU English corpus", |
|
|
"ICE-Nigeria", |
|
|
"A Scripted Pakistani English Daily-use Speech Corpus", |
|
|
"L2-ARCTIC", |
|
|
], |
|
|
"czech": [ |
|
|
"Common Voice Czech v9_0", |
|
|
"GlobalPhone Czech v3_1", |
|
|
"Large Corpus of Czech Parliament Plenary Hearings", |
|
|
"Czech Parliament Meetings", |
|
|
], |
|
|
"hausa": ["Common Voice Hausa v9_0", "GlobalPhone Hausa v3_1"], |
|
|
"swahili": ["Common Voice Swahili v9_0", "ALFFA Swahili", "GlobalPhone Swahili v3_1"], |
|
|
"korean": [ |
|
|
"GlobalPhone Korean v3_1", |
|
|
"Deeply Korean read speech corpus public sample", |
|
|
"Pansori TEDxKR", |
|
|
"Zeroth Korean", |
|
|
"Seoul Corpus", |
|
|
"ASR-KCSC: A Korean Conversational Speech Corpus", |
|
|
"ASR-SKDuSC: A Scripted Korean Daily-use Speech Corpus", |
|
|
"Korean Single Speaker Speech Dataset", |
|
|
"Common Voice Korean v16_1", |
|
|
], |
|
|
"mandarin": [ |
|
|
"Common Voice Chinese (China) v9_0", |
|
|
"Common Voice Chinese (Taiwan) v9_0", |
|
|
"AI-DataTang Corpus", |
|
|
"AISHELL-3", |
|
|
"THCHS-30", |
|
|
"GlobalPhone Chinese-Mandarin v3_1", |
|
|
], |
|
|
"japanese": [ |
|
|
"Common Voice Japanese v9_0", |
|
|
"GlobalPhone Japanese v3_1", |
|
|
"Microsoft Speech Language Translation Japanese", |
|
|
"Japanese Versatile Speech", |
|
|
"TEDxJP-10K v1_1", |
|
|
], |
|
|
"thai": ["Common Voice Thai v9_0", "GlobalPhone Thai v3_1"], |
|
|
"vietnamese": ["Common Voice Vietnamese v9_0", "VIVOS", "GlobalPhone Vietnamese v3_1"], |
|
|
} |
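
# Exact corpora (and versions) used to train each released model, including
# historical releases whose corpus versions differ from `current_corpora`.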
|
|
|
|
|
model_corpus_mapping = { |
|
|
"Abkhaz CV acoustic model v2_0_0": ["Common Voice Abkhaz v7_0"], |
|
|
"Armenian CV acoustic model v2_0_0": ["Common Voice Armenian v7_0"], |
|
|
"Bashkir CV acoustic model v2_0_0": ["Common Voice Bashkir v7_0"], |
|
|
"Basque CV acoustic model v2_0_0": ["Common Voice Basque v7_0"], |
|
|
"Belarusian CV acoustic model v2_0_0": ["Common Voice Belarusian v7_0"], |
|
|
"Bulgarian CV acoustic model v2_0_0": ["Common Voice Bulgarian v7_0"], |
|
|
"Chuvash CV acoustic model v2_0_0": ["Common Voice Chuvash v7_0"], |
|
|
"Czech CV acoustic model v2_0_0": ["Common Voice Czech v7_0"], |
|
|
"Dutch CV acoustic model v2_0_0": ["Common Voice Dutch v7_0"], |
|
|
"Georgian CV acoustic model v2_0_0": ["Common Voice Georgian v7_0"], |
|
|
"Greek CV acoustic model v2_0_0": ["Common Voice Greek v7_0"], |
|
|
"Guarani CV acoustic model v2_0_0": ["Common Voice Guarani v7_0"], |
|
|
"Hausa CV acoustic model v2_0_0": ["Common Voice Hausa v7_0"], |
|
|
"Hungarian CV acoustic model v2_0_0": ["Common Voice Hungarian v7_0"], |
|
|
"Italian CV acoustic model v2_0_0": ["Common Voice Italian v7_0"], |
|
|
"Kazakh CV acoustic model v2_0_0": ["Common Voice Kazakh v7_0"], |
|
|
"Kurmanji CV acoustic model v2_0_0": ["Common Voice Kurmanji v7_0"], |
|
|
"Kyrgyz CV acoustic model v2_0_0": ["Common Voice Kyrgyz v7_0"], |
|
|
"Polish CV acoustic model v2_0_0": ["Common Voice Polish v7_0"], |
|
|
"Portuguese CV acoustic model v2_0_0": ["Common Voice Portuguese v7_0"], |
|
|
"Romanian CV acoustic model v2_0_0": ["Common Voice Romanian v7_0"], |
|
|
"Russian CV acoustic model v2_0_0": ["Common Voice Russian v7_0"], |
|
|
"Sorbian (Upper) CV acoustic model v2_0_0": ["Common Voice Sorbian Upper v7_0"], |
|
|
"Swedish CV acoustic model v2_0_0": ["Common Voice Swedish v7_0"], |
|
|
"Tamil CV acoustic model v2_0_0": ["Common Voice Tamil v7_0"], |
|
|
"Tatar CV acoustic model v2_0_0": ["Common Voice Tatar v7_0"], |
|
|
"Thai CV acoustic model v2_0_0": ["Common Voice Thai v7_0"], |
|
|
"Turkish CV acoustic model v2_0_0": ["Common Voice Turkish v7_0"], |
|
|
"Ukrainian CV acoustic model v2_0_0": ["Common Voice Ukrainian v7_0"], |
|
|
"Uyghur CV acoustic model v2_0_0": ["Common Voice Uyghur v7_0"], |
|
|
"Uzbek CV acoustic model v2_0_0": ["Common Voice Uzbek v7_0"], |
|
|
"Vietnamese CV acoustic model v2_0_0": ["Common Voice Vietnamese v7_0"], |
|
|
"English (US) ARPA acoustic model v2_0_0": ["LibriSpeech English"], |
|
|
"English (US) ARPA acoustic model v2_0_0a": ["LibriSpeech English"], |
|
|
"English (US) ARPA acoustic model v3_0_0": ["LibriSpeech English"], |
|
|
"English MFA acoustic model v2_0_0": [ |
|
|
"Common Voice English v8_0", |
|
|
"LibriSpeech English", |
|
|
"Corpus of Regional African American Language v2021_07", |
|
|
"Google Nigerian English", |
|
|
"Google UK and Ireland English", |
|
|
"NCHLT English", |
|
|
"ARU English corpus", |
|
|
], |
|
|
"English MFA acoustic model v2_0_0a": [ |
|
|
"Common Voice English v8_0", |
|
|
"LibriSpeech English", |
|
|
"Corpus of Regional African American Language v2021_07", |
|
|
"Google Nigerian English", |
|
|
"Google UK and Ireland English", |
|
|
"NCHLT English", |
|
|
"ARU English corpus", |
|
|
], |
|
|
"English MFA acoustic model v2_2_1": [ |
|
|
"Common Voice English v8_0", |
|
|
"LibriSpeech English", |
|
|
"Corpus of Regional African American Language v2021_07", |
|
|
"Google Nigerian English", |
|
|
"Google UK and Ireland English", |
|
|
"NCHLT English", |
|
|
"ARU English corpus", |
|
|
"ICE-Nigeria", |
|
|
"A Scripted Pakistani English Daily-use Speech Corpus", |
|
|
"L2-ARCTIC", |
|
|
], |
|
|
"English MFA acoustic model v3_0_0": [ |
|
|
"Common Voice English v8_0", |
|
|
"LibriSpeech English", |
|
|
"Corpus of Regional African American Language v2021_07", |
|
|
"Google Nigerian English", |
|
|
"Google UK and Ireland English", |
|
|
"NCHLT English", |
|
|
"ARU English corpus", |
|
|
"ICE-Nigeria", |
|
|
"A Scripted Pakistani English Daily-use Speech Corpus", |
|
|
"L2-ARCTIC", |
|
|
], |
|
|
"English MFA acoustic model v3_1_0": [ |
|
|
"Common Voice English v17_0", |
|
|
"LibriSpeech English", |
|
|
"Corpus of Regional African American Language v2021_07", |
|
|
"Google Nigerian English", |
|
|
"Google UK and Ireland English", |
|
|
"NCHLT English", |
|
|
"ARU English corpus", |
|
|
"ICE-Nigeria", |
|
|
"A Scripted Pakistani English Daily-use Speech Corpus", |
|
|
"L2-ARCTIC", |
|
|
], |
|
|
"English MFA ivector extractor v2_1_0": current_corpora["english"], |
|
|
"Multilingual MFA ivector extractor v2_1_0": [ |
|
|
x |
|
|
for k in [ |
|
|
"english", |
|
|
"czech", |
|
|
"hausa", |
|
|
"swahili", |
|
|
"thai", |
|
|
"vietnamese", |
|
|
"japanese", |
|
|
"mandarin", |
|
|
] |
|
|
for x in current_corpora[k] |
|
|
], |
|
|
"French MFA acoustic model v2_0_0": [ |
|
|
"Common Voice French v8_0", |
|
|
"Multilingual LibriSpeech French", |
|
|
"GlobalPhone French v3_1", |
|
|
"African-accented French", |
|
|
], |
|
|
"French MFA acoustic model v2_0_0a": [ |
|
|
"Common Voice French v8_0", |
|
|
"Multilingual LibriSpeech French", |
|
|
"GlobalPhone French v3_1", |
|
|
"African-accented French", |
|
|
], |
|
|
"French MFA acoustic model v3_0_0": [ |
|
|
"Common Voice French v16_1", |
|
|
"GlobalPhone French v3_1", |
|
|
"African-accented French", |
|
|
], |
|
|
"German MFA acoustic model v2_0_0": [ |
|
|
"Common Voice German v8_0", |
|
|
"Multilingual LibriSpeech German", |
|
|
"GlobalPhone German v3_1", |
|
|
], |
|
|
"German MFA acoustic model v3_0_0": ["Common Voice German v16_1", "GlobalPhone German v3_1"], |
|
|
"German MFA acoustic model v2_0_0a": [ |
|
|
"Common Voice German v8_0", |
|
|
"Multilingual LibriSpeech German", |
|
|
"GlobalPhone German v3_1", |
|
|
], |
|
|
"Japanese MFA acoustic model v2_0_1a": [ |
|
|
"Common Voice Japanese v12_0", |
|
|
"GlobalPhone Japanese v3_1", |
|
|
"Microsoft Speech Language Translation Japanese", |
|
|
"Japanese Versatile Speech", |
|
|
"TEDxJP-10K v1_1", |
|
|
], |
|
|
"Japanese MFA acoustic model v3_0_0": [ |
|
|
"Common Voice Japanese v12_0", |
|
|
"GlobalPhone Japanese v3_1", |
|
|
"Microsoft Speech Language Translation Japanese", |
|
|
"Japanese Versatile Speech", |
|
|
"TEDxJP-10K v1_1", |
|
|
], |
|
|
"Hausa MFA acoustic model v2_0_0": ["Common Voice Hausa v8_0", "GlobalPhone Hausa v3_1"], |
|
|
"Hausa MFA acoustic model v2_0_0a": ["Common Voice Hausa v9_0", "GlobalPhone Hausa v3_1"], |
|
|
"Hausa MFA acoustic model v3_0_0": ["Common Voice Hausa v9_0", "GlobalPhone Hausa v3_1"], |
|
|
"Mandarin MFA acoustic model v2_0_0": [ |
|
|
"Common Voice Chinese (China) v8_0", |
|
|
"Common Voice Chinese (Taiwan) v8_0", |
|
|
"AI-DataTang Corpus", |
|
|
"AISHELL-3", |
|
|
"THCHS-30", |
|
|
], |
|
|
"Mandarin MFA acoustic model v2_0_0a": [ |
|
|
"Common Voice Chinese (China) v9_0", |
|
|
"Common Voice Chinese (Taiwan) v9_0", |
|
|
"AI-DataTang Corpus", |
|
|
"AISHELL-3", |
|
|
"THCHS-30", |
|
|
"GlobalPhone Chinese-Mandarin v3_1", |
|
|
], |
|
|
"Mandarin MFA acoustic model v3_0_0": [ |
|
|
"Common Voice Chinese (China) v16_1", |
|
|
"Common Voice Chinese (Taiwan) v16_1", |
|
|
"AI-DataTang Corpus", |
|
|
"AISHELL-3", |
|
|
"THCHS-30", |
|
|
"GlobalPhone Chinese-Mandarin v3_1", |
|
|
], |
|
|
"Korean MFA acoustic model v2_0_0": [ |
|
|
"GlobalPhone Korean v3_1", |
|
|
"Deeply Korean read speech corpus public sample", |
|
|
"Pansori TEDxKR", |
|
|
"Zeroth Korean", |
|
|
"Seoul Corpus", |
|
|
], |
|
|
"Korean MFA acoustic model v2_0_0a": [ |
|
|
"GlobalPhone Korean v3_1", |
|
|
"Deeply Korean read speech corpus public sample", |
|
|
"Pansori TEDxKR", |
|
|
"Zeroth Korean", |
|
|
"Seoul Corpus", |
|
|
], |
|
|
"Korean MFA acoustic model v3_0_0": [ |
|
|
"GlobalPhone Korean v3_1", |
|
|
"Deeply Korean read speech corpus public sample", |
|
|
"Pansori TEDxKR", |
|
|
"Zeroth Korean", |
|
|
"ASR-KCSC A Korean Conversational Speech Corpus", |
|
|
"ASR-SKDuSC A Scripted Korean Daily-use Speech Corpus", |
|
|
"Korean Single Speaker Speech Dataset", |
|
|
"Common Voice Korean v16_1", |
|
|
], |
|
|
"Polish MFA acoustic model v2_0_0": [ |
|
|
"Common Voice Polish v8_0", |
|
|
"Multilingual LibriSpeech Polish", |
|
|
"M-AILABS Polish", |
|
|
"GlobalPhone Polish v3_1", |
|
|
], |
|
|
"Polish MFA acoustic model v2_0_0a": [ |
|
|
"Common Voice Polish v8_0", |
|
|
"Multilingual LibriSpeech Polish", |
|
|
"M-AILABS Polish", |
|
|
"GlobalPhone Polish v3_1", |
|
|
], |
|
|
"Portuguese MFA acoustic model v2_0_0": [ |
|
|
"Common Voice Portuguese v8_0", |
|
|
"Multilingual LibriSpeech Portuguese", |
|
|
"GlobalPhone Portuguese (Brazilian) v3_1", |
|
|
], |
|
|
"Portuguese MFA acoustic model v2_0_0a": [ |
|
|
"Common Voice Portuguese v8_0", |
|
|
"Multilingual LibriSpeech Portuguese", |
|
|
"GlobalPhone Portuguese (Brazilian) v3_1", |
|
|
], |
|
|
"Russian MFA acoustic model v2_0_0": [ |
|
|
"Common Voice Russian v8_0", |
|
|
"Russian LibriSpeech", |
|
|
"M-AILABS Russian", |
|
|
"GlobalPhone Russian v3_1", |
|
|
], |
|
|
"Russian MFA acoustic model v2_0_0a": [ |
|
|
"Common Voice Russian v9_0", |
|
|
"Russian LibriSpeech", |
|
|
"M-AILABS Russian", |
|
|
"GlobalPhone Russian v3_1", |
|
|
], |
|
|
"Russian MFA acoustic model v3_1_0": [ |
|
|
"Common Voice Russian v17_0", |
|
|
"Russian LibriSpeech", |
|
|
"M-AILABS Russian", |
|
|
"Multilingual TEDx Russian", |
|
|
"GlobalPhone Russian v3_1", |
|
|
], |
|
|
"Spanish MFA acoustic model v2_0_0": [ |
|
|
"Common Voice Spanish v8_0", |
|
|
"Multilingual LibriSpeech Spanish", |
|
|
"Google i18n Chile", |
|
|
"Google i18n Columbia", |
|
|
"Google i18n Peru", |
|
|
"Google i18n Puerto Rico", |
|
|
"Google i18n Venezuela", |
|
|
"M-AILABS Spanish", |
|
|
"GlobalPhone Spanish (Latin American) v3_1", |
|
|
], |
|
|
"Spanish MFA acoustic model v2_0_0a": [ |
|
|
"Common Voice Spanish v8_0", |
|
|
"Multilingual LibriSpeech Spanish", |
|
|
"Google i18n Chile", |
|
|
"Google i18n Columbia", |
|
|
"Google i18n Peru", |
|
|
"Google i18n Puerto Rico", |
|
|
"Google i18n Venezuela", |
|
|
"M-AILABS Spanish", |
|
|
"GlobalPhone Spanish (Latin American) v3_1", |
|
|
], |
|
|
"Spanish MFA acoustic model v3_3_0": [ |
|
|
"Common Voice Spanish v8_0", |
|
|
"Multilingual LibriSpeech Spanish", |
|
|
"Google i18n Chile", |
|
|
"Google i18n Columbia", |
|
|
"Google i18n Peru", |
|
|
"Google i18n Puerto Rico", |
|
|
"Google i18n Venezuela", |
|
|
"M-AILABS Spanish", |
|
|
"GlobalPhone Spanish (Latin American) v3_1", |
|
|
"Multilingual TEDx Spanish", |
|
|
], |
|
|
"Swahili MFA acoustic model v2_0_0": [ |
|
|
"Common Voice Swahili v8_0", |
|
|
"ALFFA Swahili", |
|
|
"GlobalPhone Swahili v3_1", |
|
|
], |
|
|
"Swahili MFA acoustic model v2_0_0a": [ |
|
|
"Common Voice Swahili v9_0", |
|
|
"ALFFA Swahili", |
|
|
"GlobalPhone Swahili v3_1", |
|
|
], |
|
|
"Swedish MFA acoustic model v2_0_0": [ |
|
|
"Common Voice Swedish v8_0", |
|
|
"NST Swedish", |
|
|
"GlobalPhone Swedish v3_1", |
|
|
], |
|
|
"Swedish MFA acoustic model v2_0_0a": [ |
|
|
"Common Voice Swedish v8_0", |
|
|
"NST Swedish", |
|
|
"GlobalPhone Swedish v3_1", |
|
|
], |
|
|
"Swedish MFA acoustic model v3_0_0": [ |
|
|
"Common Voice Swedish v8_0", |
|
|
"NST Swedish", |
|
|
"GlobalPhone Swedish v3_1", |
|
|
], |
|
|
"Thai MFA acoustic model v2_0_0": ["Common Voice Thai v8_0", "GlobalPhone Thai v3_1"], |
|
|
"Thai MFA acoustic model v2_0_0a": ["Common Voice Thai v9_0", "GlobalPhone Thai v3_1"], |
|
|
"Thai MFA acoustic model v3_0_0": [ |
|
|
"Common Voice Thai v16_1", |
|
|
"GlobalPhone Thai v3_1", |
|
|
"Lotus Corpus v1_0", |
|
|
"Gowajee Corpus v0_9_3", |
|
|
"Thai Elderly Speech dataset by Data Wow and VISAI v1_0_0", |
|
|
], |
|
|
"Bulgarian MFA acoustic model v2_0_0": [ |
|
|
"Common Voice Bulgarian v8_0", |
|
|
"GlobalPhone Bulgarian v3_1", |
|
|
], |
|
|
"Bulgarian MFA acoustic model v2_0_0a": [ |
|
|
"Common Voice Bulgarian v9_0", |
|
|
"GlobalPhone Bulgarian v3_1", |
|
|
], |
|
|
"Bulgarian MFA acoustic model v3_0_0": [ |
|
|
"Common Voice Bulgarian v16_1", |
|
|
"GlobalPhone Bulgarian v3_1", |
|
|
], |
|
|
"Croatian MFA acoustic model v2_0_0": [ |
|
|
"Common Voice Serbian v8_0", |
|
|
"GlobalPhone Croatian v3_1", |
|
|
], |
|
|
"Croatian MFA acoustic model v2_0_0a": [ |
|
|
"Common Voice Serbian v9_0", |
|
|
"GlobalPhone Croatian v3_1", |
|
|
], |
|
|
"Croatian MFA acoustic model v3_3_0": [ |
|
|
"Common Voice Serbian v9_0", |
|
|
"GlobalPhone Croatian v3_1", |
|
|
], |
|
|
"Czech MFA acoustic model v2_0_0": [ |
|
|
"Common Voice Czech v8_0", |
|
|
"GlobalPhone Czech v3_1", |
|
|
"Large Corpus of Czech Parliament Plenary Hearings", |
|
|
"Czech Parliament Meetings", |
|
|
], |
|
|
"Czech MFA acoustic model v2_0_0a": [ |
|
|
"Common Voice Czech v9_0", |
|
|
"GlobalPhone Czech v3_1", |
|
|
"Large Corpus of Czech Parliament Plenary Hearings", |
|
|
"Czech Parliament Meetings", |
|
|
], |
|
|
"Czech MFA acoustic model v3_3_0": [ |
|
|
"Common Voice Czech v9_0", |
|
|
"GlobalPhone Czech v3_1", |
|
|
"Large Corpus of Czech Parliament Plenary Hearings", |
|
|
"Czech Parliament Meetings", |
|
|
], |
|
|
"Turkish MFA acoustic model v3_0_0": [ |
|
|
"Common Voice Turkish v16_1", |
|
|
"GlobalPhone Turkish v3_1", |
|
|
], |
|
|
"Turkish MFA acoustic model v2_0_0": [ |
|
|
"Common Voice Turkish v8_0", |
|
|
"MediaSpeech Turkish v1_1", |
|
|
"GlobalPhone Turkish v3_1", |
|
|
], |
|
|
"Turkish MFA acoustic model v2_0_0a": [ |
|
|
"Common Voice Turkish v8_0", |
|
|
"MediaSpeech Turkish v1_1", |
|
|
"GlobalPhone Turkish v3_1", |
|
|
], |
|
|
"Ukrainian MFA acoustic model v2_0_0": [ |
|
|
"Common Voice Ukrainian v8_0", |
|
|
"M-AILABS Ukrainian", |
|
|
"GlobalPhone Ukrainian v3_1", |
|
|
], |
|
|
"Ukrainian MFA acoustic model v2_0_0a": [ |
|
|
"Common Voice Ukrainian v9_0", |
|
|
"M-AILABS Ukrainian", |
|
|
"GlobalPhone Ukrainian v3_1", |
|
|
], |
|
|
"Ukrainian MFA acoustic model v3_0_0": [ |
|
|
"Common Voice Ukrainian v16_1", |
|
|
"M-AILABS Ukrainian", |
|
|
"GlobalPhone Ukrainian v3_1", |
|
|
], |
|
|
"Vietnamese MFA acoustic model v2_0_0": [ |
|
|
"Common Voice Vietnamese v8_0", |
|
|
"VIVOS", |
|
|
"GlobalPhone Vietnamese v3_1", |
|
|
], |
|
|
"Vietnamese MFA acoustic model v2_0_0a": [ |
|
|
"Common Voice Vietnamese v9_0", |
|
|
"VIVOS", |
|
|
"GlobalPhone Vietnamese v3_1", |
|
|
], |
|
|
"Vietnamese MFA acoustic model v3_0_0": [ |
|
|
"Common Voice Vietnamese v17_0", |
|
|
"VIVOS", |
|
|
"GlobalPhone Vietnamese v3_1", |
|
|
], |
|
|
} |
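
# Pronunciation dictionaries distributed and evaluated alongside particular
# acoustic model releases.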
|
|
|
|
|
model_dictionary_mapping = { |
|
|
"English MFA acoustic model v2_0_0": [ |
|
|
"English (US) MFA dictionary v2_0_0", |
|
|
"English (UK) MFA dictionary v2_0_0", |
|
|
"English (Nigeria) MFA dictionary v2_0_0", |
|
|
], |
|
|
"English MFA acoustic model v3_0_0": [ |
|
|
"English (US) MFA dictionary v3_0_0", |
|
|
"English (UK) MFA dictionary v3_0_0", |
|
|
"English (Nigeria) MFA dictionary v3_0_0", |
|
|
"English (India) MFA dictionary v3_0_0", |
|
|
], |
|
|
"English MFA acoustic model v3_1_0": [ |
|
|
"English (US) MFA dictionary v3_1_0", |
|
|
"English (UK) MFA dictionary v3_1_0", |
|
|
"English (Nigeria) MFA dictionary v3_1_0", |
|
|
"English (India) MFA dictionary v3_1_0", |
|
|
], |
|
|
"Vietnamese MFA acoustic model v2_0_0": [ |
|
|
"Vietnamese (Hanoi) MFA dictionary v2_0_0", |
|
|
"Vietnamese (Ho Chi Minh City) MFA dictionary v2_0_0", |
|
|
"Vietnamese (Hue) MFA dictionary v2_0_0", |
|
|
"Vietnamese MFA dictionary v2_0_0", |
|
|
], |
|
|
"Spanish MFA acoustic model v2_0_0": [ |
|
|
"Spanish (Latin America) MFA dictionary v2_0_0", |
|
|
"Spanish (Spain) MFA dictionary v2_0_0", |
|
|
"Spanish MFA dictionary v2_0_0", |
|
|
], |
|
|
"Spanish MFA acoustic model v3_3_0": [ |
|
|
"Spanish (Latin America) MFA dictionary v3_3_0", |
|
|
"Spanish (Spain) MFA dictionary v3_3_0", |
|
|
], |
|
|
"Portuguese MFA acoustic model v2_0_0": [ |
|
|
"Portuguese (Brazil) MFA dictionary v2_0_0", |
|
|
"Portuguese (Portugal) MFA dictionary v2_0_0", |
|
|
"Portuguese MFA dictionary v2_0_0", |
|
|
], |
|
|
"Mandarin MFA acoustic model v2_0_0": [ |
|
|
"Mandarin (China) MFA dictionary v2_0_0", |
|
|
"Mandarin (Erhua) MFA dictionary v2_0_0", |
|
|
"Mandarin (Taiwan) MFA dictionary v2_0_0", |
|
|
], |
|
|
} |
|
|
|
|
|
|
|
|
def make_path_safe(string): |
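    """Lower-case a display name and collapse separators into underscores.

    Illustrative examples (doctest-style):

    >>> make_path_safe("Common Voice English v8_0")
    'common_voice_english_v8_0'
    >>> make_path_safe("GlobalPhone Chinese-Mandarin v3_1")
    'globalphone_chinese_mandarin_v3_1'
    """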
|
|
s = re.sub(r"[- .:()]+", "_", string.lower()) |
|
|
if s.endswith("_"): |
|
|
s = s[:-1] |
|
|
return s |
|
|
|
|
|
|
|
|
def get_model_card_directory(model_type, meta_data): |
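    """Resolve the on-disk directory for a model card.

    The layout varies by model type (paths shown relative to
    ``mfa_model_root``):

    * language_model: ``language_model/<language>/mfa/v<version>``
    * ivector/tokenizer: ``<type>/<language>/v<version>``
    * corpus: ``corpus/<language>/<safe_name>[/<version>]``
    * everything else: ``<type>/<language>/[<dialect>_]<phone_set>/v<version>``
    """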
|
|
model_directory = os.path.join(mfa_model_root, model_type) |
|
|
if model_type == "language_model": |
|
|
language, version = meta_data["language"], meta_data["version"] |
|
|
directory = os.path.join(model_directory, language.lower(), "mfa", f"v{version}") |
|
|
elif model_type in {"ivector", "tokenizer"}: |
|
|
language, version = meta_data["language"], meta_data["version"] |
|
|
directory = os.path.join(model_directory, language.lower(), f"v{version}") |
|
|
elif model_type == "corpus": |
|
|
language, name = meta_data["language"], meta_data["name"] |
|
|
name = make_path_safe(name) |
|
|
if "version" in meta_data: |
|
|
version = meta_data["version"] |
|
|
directory = os.path.join(model_directory, language.lower(), name, f"{version}") |
|
|
else: |
|
|
directory = os.path.join(model_directory, language.lower(), name) |
|
|
else: |
|
|
language, phone_set, dialect, version = ( |
|
|
meta_data["language"], |
|
|
meta_data["phone_set"], |
|
|
meta_data["dialect"], |
|
|
meta_data["version"], |
|
|
) |
|
|
if dialect: |
|
|
phoneset_folder = f"{dialect}_{phone_set}".replace(" ", "_").lower() |
|
|
else: |
|
|
phoneset_folder = phone_set.lower() |
|
|
directory = os.path.join(model_directory, language.lower(), phoneset_folder, f"v{version}") |
|
|
|
|
|
return directory |
|
|
|
|
|
|
|
|
mfa_model_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) |
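
# Regeneration flags: when True, existing meta.json / README.md outputs are
# presumably rebuilt rather than reused (consumed later in this script).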
|
|
|
|
|
OVERWRITE_METADATA = False |
|
|
OVERWRITE_MD = False |
|
|
|
|
|
mfa_citation_template = ( |
|
|
"@techreport{{{id},\n\tauthor={{{extra_authors}McAuliffe, Michael and Sonderegger, Morgan}}," |
|
|
"\n\ttitle={{{title}}}," |
|
|
"\n\taddress={{\\url{{https://mfa-models.readthedocs.io/{model_type}/{language}/{link_safe_title}.html}}}}," |
|
|
"\n\tyear={{{year}}},\n\tmonth={{{month}}}," |
|
|
"\n}}" |
|
|
) |
|
|
cv_citation = ( |
|
|
"@misc{Ahn_Chodroff_2022,\n\tauthor={Ahn, Emily and Chodroff, Eleanor}," |
|
|
"\n\ttitle={VoxCommunis Corpus}," |
|
|
"\n\taddress={\\url{https://osf.io/t957v}}," |
|
|
"\n\tpublisher={OSF}," |
|
|
"\n\tyear={2022}, \n\tmonth={Jan}\n}" |
|
|
) |
|
|
prosodylab_citation = ( |
|
|
"@article{gorman2011prosodylab,\n\tauthor={Gorman, Kyle and Howell, Jonathan and Wagner, Michael}," |
|
|
"\n\ttitle={Prosodylab-aligner: A tool for forced alignment of laboratory speech}," |
|
|
"\n\tjournal={Canadian Acoustics}," |
|
|
"\n\tvolume={39},\n\tnumber={3},\n\tpages={192--193},\n\tyear={2011}\n}" |
|
|
) |
|
|
|
|
|
language_link_template = "[{}]({})" |
|
|
|
|
|
license_links = { |
|
|
"CC-0": "https://creativecommons.org/publicdomain/zero/1.0/", |
|
|
"CC BY 4.0": "https://creativecommons.org/licenses/by/4.0/", |
|
|
"CC BY 3.0": "https://creativecommons.org/licenses/by/3.0/", |
|
|
"CC BY-SA-NC 3.0": "https://creativecommons.org/licenses/by-nc-sa/3.0/", |
|
|
"CC BY-NC-SA 4.0": "https://creativecommons.org/licenses/by-nc-sa/4.0/", |
|
|
"CC BY-NC-SA 3.0": "https://creativecommons.org/licenses/by-nc-sa/3.0/", |
|
|
"CC BY-NC 4.0": "https://creativecommons.org/licenses/by-nc/4.0/", |
|
|
"CC BY-SA 4.0": "https://creativecommons.org/licenses/by-sa/4.0/", |
|
|
"CC BY-NC-ND 4.0": "https://creativecommons.org/licenses/by-nc-nd/4.0/", |
|
|
"CC BY-NC 2.0": "https://creativecommons.org/licenses/by-nc/2.0/", |
|
|
"CC BY-NC-ND 3.0": "https://creativecommons.org/licenses/by-nc-nd/3.0/", |
|
|
"Microsoft Research Data License": "https://msropendata-web-api.azurewebsites.net/licenses/2f933be3-284d-500b-7ea3-2aa2fd0f1bb2/view", |
|
|
"Apache 2.0": "https://www.apache.org/licenses/LICENSE-2.0", |
|
|
"O-UDA v1.0": "https://msropendata-web-api.azurewebsites.net/licenses/f1f352a6-243f-4905-8e00-389edbca9e83/view", |
|
|
"MIT": "https://opensource.org/licenses/MIT", |
|
|
"Public domain in the USA": "https://creativecommons.org/share-your-work/public-domain/cc0/", |
|
|
"M-AILABS License": "https://www.caito.de/2019/01/the-m-ailabs-speech-dataset/", |
|
|
"ELRA": "https://www.elra.info/en/services-around-lrs/distribution/licensing/", |
|
|
"Buckeye License": "https://buckeyecorpus.osu.edu/php/registration.php", |
|
|
"LDC License": "https://www.ldc.upenn.edu/data-management/using/licensing", |
|
|
"LaboroTV Non-commercial": "https://laboro.ai/activity/column/engineer/eg-laboro-tv-corpus-jp/", |
|
|
} |
|
|
|
|
|
mfa_maintainer = "[Montreal Forced Aligner](https://montreal-forced-aligner.readthedocs.io/)" |
|
|
cv_maintainer = "[Vox Communis](https://osf.io/t957v/)" |
|
|
|
|
|
corpus_detail_template = """ |
|
|
* {link}: |
|
|
* **Hours:** `{num_hours:.2f}` |
|
|
* **Speakers:** `{num_speakers:,}` |
|
|
* **Utterances:** `{num_utterances:,}`""" |
|
|
|
|
|
g2p_training_detail_template = """ |
|
|
* **Words:** `{num_words:,}` |
|
|
* **Phones:** `{num_phones:,}` |
|
|
* **Graphemes:** `{num_graphemes:,}`""" |
|
|
|
|
|
g2p_evaluation_detail_template = """ |
|
|
* **Words:** `{num_words:,}` |
|
|
* **WER:** `{word_error_rate:.2f}%` |
|
|
* **PER:** `{phone_error_rate:.2f}%`""" |
|
|
|
|
|
tokenizer_training_detail_template = """ |
|
|
* **Utterances:** `{num_utterances:,}` |
|
|
* **Graphemes:** `{num_graphemes:,}`""" |
|
|
|
|
|
tokenizer_evaluation_detail_template = """ |
|
|
* **Utterances:** `{num_utterances:,}` |
|
|
* **UER:** `{utterance_error_rate:.2f}%` |
|
|
* **CER:** `{character_error_rate:.2f}%`""" |
|
|
|
|
|
lm_training_detail_template = """ |
|
|
* **Words:** `{num_words:,}` |
|
|
* **OOVs:** `{num_oovs:,}`""" |
|
|
|
|
|
lm_evaluation_detail_template = """ |
|
|
* **Large model:** `{large_perplexity:.2f}` |
|
|
* **Medium model:** `{medium_perplexity:.2f}` |
|
|
* **Small model:** `{small_perplexity:.2f}`""" |
|
|
|
|
|
link_template = "* {{ref}}`{}`" |
|
|
|
|
|
see_also_template = """```{{admonition}} {model_type_name} |
|
|
{links} |
|
|
```""" |
|
|
|
|
|
mfa_acoustic_model_card_template = template_dir.joinpath( |
|
|
"mfa_acoustic_model_card_template.md" |
|
|
).read_text("utf8") |
|
|
ivector_card_template = template_dir.joinpath("ivector_card_template.md").read_text("utf8") |
|
|
other_acoustic_model_card_template = template_dir.joinpath( |
|
|
"other_acoustic_model_card_template.md" |
|
|
).read_text("utf8") |
|
|
g2p_model_card_template = template_dir.joinpath("g2p_model_card_template.md").read_text("utf8") |
|
|
language_model_card_template = template_dir.joinpath("language_model_card_template.md").read_text( |
|
|
"utf8" |
|
|
) |
|
|
mfa_dictionary_card_template = template_dir.joinpath("mfa_dictionary_card_template.md").read_text( |
|
|
"utf8" |
|
|
) |
|
|
other_dictionary_card_template = template_dir.joinpath( |
|
|
"other_dictionary_card_template.md" |
|
|
).read_text("utf8") |
|
|
corpus_card_template = template_dir.joinpath("corpus_card_template.md").read_text("utf8") |
|
|
tokenizer_model_card_template = template_dir.joinpath( |
|
|
"tokenizer_model_card_template.md" |
|
|
).read_text("utf8") |
|
|
|
|
|
|
|
|
corpus_docs_md_template = template_dir.joinpath("corpus_docs_md_template.md").read_text("utf8") |
|
|
acoustic_docs_md_template = template_dir.joinpath("acoustic_docs_md_template.md").read_text("utf8") |
|
|
ivector_docs_md_template = template_dir.joinpath("ivector_docs_md_template.md").read_text("utf8") |
|
|
g2p_docs_md_template = template_dir.joinpath("g2p_docs_md_template.md").read_text("utf8") |
|
|
lm_docs_md_template = template_dir.joinpath("lm_docs_md_template.md").read_text("utf8") |
|
|
tokenizer_docs_md_template = template_dir.joinpath("tokenizer_docs_md_template.md").read_text( |
|
|
"utf8" |
|
|
) |
|
|
mfa_dictionary_docs_md_template = template_dir.joinpath( |
|
|
"mfa_dictionary_docs_md_template.md" |
|
|
).read_text("utf8") |
|
|
other_dictionary_docs_md_template = template_dir.joinpath( |
|
|
"other_dictionary_docs_md_template.md" |
|
|
).read_text("utf8") |
|
|
|
|
|
language_links = { |
|
|
"Abkhaz": ("Abkhaz", "https://en.wikipedia.org/wiki/Abkhaz_language"), |
|
|
"Arabic": ("Arabic", "https://en.wikipedia.org/wiki/Arabic"), |
|
|
"Armenian": ("Armenian", "https://en.wikipedia.org/wiki/Armenian_language"), |
|
|
"Bashkir": ("Bashkir", "https://en.wikipedia.org/wiki/Bashkir_language"), |
|
|
"Basque": ("Basque", "https://en.wikipedia.org/wiki/Basque_language"), |
|
|
"Belarusian": ("Belarusian", "https://en.wikipedia.org/wiki/Belarusian_language"), |
|
|
"Bulgarian": ("Bulgarian", "https://en.wikipedia.org/wiki/Bulgarian_language"), |
|
|
"Chuvash": ("Chuvash", "https://en.wikipedia.org/wiki/Chuvash_language"), |
|
|
"Croatian": ("Serbo-Croatian", "https://en.wikipedia.org/wiki/Serbo-Croatian"), |
|
|
"Serbocroatian": ("Serbo-Croatian", "https://en.wikipedia.org/wiki/Serbo-Croatian"), |
|
|
"Czech": ("Czech", "https://en.wikipedia.org/wiki/Czech_language"), |
|
|
"Dutch": ("Dutch", "https://en.wikipedia.org/wiki/Dutch_language"), |
|
|
"English": ("English", "https://en.wikipedia.org/wiki/English_language"), |
|
|
("English", "US"): ( |
|
|
"General American English", |
|
|
"https://en.wikipedia.org/wiki/General_American_English", |
|
|
), |
|
|
("English", "UK"): ("British English", "https://en.wikipedia.org/wiki/British_English"), |
|
|
("English", "Nigeria"): ("Nigerian English", "https://en.wikipedia.org/wiki/Nigerian_English"), |
|
|
("English", "India"): ("Indian English", "Japanese tokenizer v2_1_0.md"), |
|
|
"French": ("French", "https://en.wikipedia.org/wiki/French_language"), |
|
|
"Georgian": ("Georgian", "https://en.wikipedia.org/wiki/Georgian_language"), |
|
|
"German": ("German", "https://en.wikipedia.org/wiki/German_language"), |
|
|
"Greek": ("Greek", "https://en.wikipedia.org/wiki/Greek_language"), |
|
|
"Guarani": ("Guarani", "https://en.wikipedia.org/wiki/Guarani_language"), |
|
|
"Hungarian": ("Hungarian", "https://en.wikipedia.org/wiki/Hungarian_language"), |
|
|
"Italian": ("Italian", "https://en.wikipedia.org/wiki/Italian_language"), |
|
|
"Indonesian": ("Indonesian", "https://en.wikipedia.org/wiki/Indonesian_language"), |
|
|
"Hausa": ("Hausa", "https://en.wikipedia.org/wiki/Hausa_language"), |
|
|
"Kazakh": ("Kazakh", "https://en.wikipedia.org/wiki/Kazakh_language"), |
|
|
"Kyrgyz": ("Kyrgyz", "https://en.wikipedia.org/wiki/Kyrgyz_language"), |
|
|
"Kurmanji": ("Kurmanji", "https://en.wikipedia.org/wiki/Kurmanji"), |
|
|
"Maltese": ("Maltese", "https://en.wikipedia.org/wiki/Maltese_language"), |
|
|
"Uzbek": ("Uzbek", "https://en.wikipedia.org/wiki/Uzbek_language"), |
|
|
"Uyghur": ("Uyghur", "https://en.wikipedia.org/wiki/Uyghur_language"), |
|
|
"Punjabi": ("Punjabi", "https://en.wikipedia.org/wiki/Punjabi_language"), |
|
|
"Hindi": ("Hindi", "https://en.wikipedia.org/wiki/Hindi_language"), |
|
|
"Hindi-Urdu": ("Hindi-Urdu", "https://en.wikipedia.org/wiki/Hindustani_language"), |
|
|
"Japanese": ("Japanese", "https://en.wikipedia.org/wiki/Japanese_language"), |
|
|
"Korean": ("Korean", "https://en.wikipedia.org/wiki/Korean_language"), |
|
|
"Polish": ("Polish", "https://en.wikipedia.org/wiki/Polish_language"), |
|
|
"Portuguese": ("Portuguese", "https://en.wikipedia.org/wiki/Portuguese_language"), |
|
|
("Portuguese", "Brazil"): ( |
|
|
"Brazilian Portuguese", |
|
|
"https://en.wikipedia.org/wiki/Brazilian_Portuguese", |
|
|
), |
|
|
("Portuguese", "Portugal"): ( |
|
|
"European Portuguese", |
|
|
"https://en.wikipedia.org/wiki/European_Portuguese", |
|
|
), |
|
|
"Romanian": ("Romanian", "https://en.wikipedia.org/wiki/Romanian_language"), |
|
|
"Russian": ("Russian", "https://en.wikipedia.org/wiki/Russian_language"), |
|
|
"Spanish": ("Spanish", "https://en.wikipedia.org/wiki/Spanish_language"), |
|
|
("Spanish", "Latin America"): ( |
|
|
"Spanish in the Americas", |
|
|
"https://en.wikipedia.org/wiki/Spanish_language_in_the_Americas", |
|
|
), |
|
|
("Spanish", "Spain"): ( |
|
|
"Peninsular Spanish", |
|
|
"https://en.wikipedia.org/wiki/Peninsular_Spanish", |
|
|
), |
|
|
"Swahili": ("Swahili", "https://en.wikipedia.org/wiki/Swahili_language"), |
|
|
"Swedish": ("Swedish", "https://en.wikipedia.org/wiki/Swedish_language"), |
|
|
"Tamil": ("Tamil", "https://en.wikipedia.org/wiki/Tamil_language"), |
|
|
"Tatar": ("Tatar", "https://en.wikipedia.org/wiki/Tatar_language"), |
|
|
"Thai": ("Thai", "https://en.wikipedia.org/wiki/Thai_language"), |
|
|
"Turkish": ("Turkish", "https://en.wikipedia.org/wiki/Turkish_language"), |
|
|
"Ukrainian": ("Ukrainian", "https://en.wikipedia.org/wiki/Ukrainian_language"), |
|
|
"Vietnamese": ("Vietnamese", "https://en.wikipedia.org/wiki/Vietnamese_language"), |
|
|
("Vietnamese", "Ho Chi Minh City"): ( |
|
|
"Southern Vietnamese", |
|
|
"https://en.wikipedia.org/wiki/Vietnamese_language#Language_variation", |
|
|
), |
|
|
("Vietnamese", "Hanoi"): ( |
|
|
"Northern Vietnamese", |
|
|
"https://en.wikipedia.org/wiki/Vietnamese_language#Language_variation", |
|
|
), |
|
|
"Sorbian": ("Sorbian", "https://en.wikipedia.org/wiki/Sorbian_languages"), |
|
|
("Sorbian", "Upper"): ( |
|
|
"Upper Sorbian", |
|
|
"https://en.wikipedia.org/wiki/Upper_Sorbian_language", |
|
|
), |
|
|
"Mandarin": ("Mandarin Chinese", "https://en.wikipedia.org/wiki/Mandarin_Chinese"), |
|
|
("Mandarin", "Taiwan"): ( |
|
|
"Taiwanese Mandarin", |
|
|
"https://en.wikipedia.org/wiki/Taiwanese_Mandarin", |
|
|
), |
|
|
("Mandarin", "Erhua"): ("Beijing Mandarin", "https://en.wikipedia.org/wiki/Beijing_dialect"), |
|
|
("Mandarin", "China"): ( |
|
|
"Standard Mandarin Chinese", |
|
|
"https://en.wikipedia.org/wiki/Standard_Chinese", |
|
|
), |
|
|
"Urdu": ("Urdu", "https://en.wikipedia.org/wiki/Urdu"), |
|
|
} |
|
|
|
|
|
cv_phone_set_mapping = { |
|
|
"abkhaz": "XPF", |
|
|
"armenian": "XPF", |
|
|
"bashkir": "XPF", |
|
|
"basque": "XPF", |
|
|
"belarusian": "XPF", |
|
|
"bulgarian": "XPF", |
|
|
"chuvash": "XPF", |
|
|
"czech": "XPF", |
|
|
"dutch": "Epitran", |
|
|
"georgian": "XPF", |
|
|
"greek": "XPF", |
|
|
"guarani": "XPF", |
|
|
"hausa": "Epitran", |
|
|
"hindi": "Epitran", |
|
|
"hungarian": "XPF", |
|
|
"indonesian": "Epitran", |
|
|
"italian": "Epitran", |
|
|
"kazakh": "Epitran", |
|
|
"kurmanji": "Epitran", |
|
|
"kyrgyz": "Epitran", |
|
|
"maltese": "Epitran", |
|
|
"polish": "Epitran", |
|
|
"punjabi": "Epitran", |
|
|
"portuguese": "Epitran", |
|
|
"romanian": "XPF", |
|
|
"russian": "Epitran", |
|
|
"sorbian_upper": "XPF", |
|
|
"sorbian": "XPF", |
|
|
"swedish": "XPF", |
|
|
"tamil": "XPF", |
|
|
"tatar": "Epitran", |
|
|
"thai": "XPF", |
|
|
"turkish": "XPF", |
|
|
"ukrainian": "XPF", |
|
|
"uyghur": "Epitran", |
|
|
"uzbek": "Epitran", |
|
|
"urdu": "Epitran", |
|
|
"vietnamese": "XPF", |
|
|
} |
|
|
|
|
|
phone_set_templates = { |
|
|
"Epitran": "[Epitran](https://github.com/dmort27/epitran)", |
|
|
"XPF": "[XPF](https://github.com/CohenPr-XPF/XPF)", |
|
|
"ARPA": "[ARPA](https://en.wikipedia.org/wiki/ARPABET)", |
|
|
"PINYIN": "[PINYIN](https://en.wikipedia.org/wiki/Pinyin)", |
|
|
"PROSODYLAB": "[PROSODYLAB](https://github.com/prosodylab/prosodylab.dictionaries)", |
|
|
"MFA": "[MFA](https://mfa-models.readthedocs.io/en/refactor/mfa_phone_set.html#{language})", |
|
|
} |
|
|
|
|
|
model_id_templates = { |
|
|
"acoustic": "{language}{dialect_title_string} {phone_set} acoustic model{version_string}", |
|
|
"dictionary": "{language}{dialect_title_string} {phone_set} dictionary{version_string}", |
|
|
"g2p": "{language}{dialect_title_string} {phone_set} G2P model{version_string}", |
|
|
"language_model": "{language}{dialect_title_string} language model{version_string}", |
|
|
"corpus": "{corpus_name}{version_string}", |
|
|
"ivector": "{language} {phone_set} ivector extractor{version_string}", |
|
|
"tokenizer": "{language} tokenizer{version_string}", |
|
|
} |
|
|
|
|
|
pronunciation_dictionaries = {} |
|
|
|
|
|
|
|
|
def load_dict(dictionary_path, dict_name, phone_set_type) -> MultispeakerDictionary: |
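    """Load (and cache) a MultispeakerDictionary for ``dict_name``.

    On first load, any stale temporary output directory from a previous run
    is removed before ``dictionary_setup()`` runs; subsequent calls return
    the cached instance.
    """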
|
|
if dict_name not in pronunciation_dictionaries: |
|
|
pronunciation_dictionaries[dict_name] = MultispeakerDictionary( |
|
|
dictionary_path, phone_set_type=phone_set_type, position_dependent_phones=False |
|
|
) |
|
|
if os.path.exists(pronunciation_dictionaries[dict_name].output_directory): |
|
|
shutil.rmtree(pronunciation_dictionaries[dict_name].output_directory) |
|
|
pronunciation_dictionaries[dict_name].dictionary_setup() |
|
|
return pronunciation_dictionaries[dict_name] |
|
|
|
|
|
|
|
|
def generate_id(meta_data, model_type): |
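    """Build the human-readable model identifier, with ``.`` mapped to ``_``.

    Illustrative example (doctest-style):

    >>> generate_id(
    ...     {"language": "english", "dialect": "US", "version": "3.0.0", "phone_set": "MFA"},
    ...     "acoustic",
    ... )
    'English (US) MFA acoustic model v3_0_0'
    """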
|
|
if "dialect" in meta_data and meta_data["dialect"]: |
|
|
dialect_title_string = f' ({meta_data["dialect"]})' |
|
|
else: |
|
|
dialect_title_string = "" |
|
|
if "version" in meta_data and meta_data["version"]: |
|
|
version_string = f' v{meta_data["version"]}' |
|
|
else: |
|
|
version_string = "" |
|
|
template = model_id_templates[model_type] |
|
|
if model_type == "corpus": |
|
|
fields = {"corpus_name": meta_data["name"], "version_string": version_string} |
|
|
else: |
|
|
fields = { |
|
|
"language": meta_data["language"].title(), |
|
|
"dialect_title_string": dialect_title_string, |
|
|
"version_string": version_string, |
|
|
} |
|
|
        if model_type != "language_model":
|
|
fields["phone_set"] = meta_data["phone_set"] |
|
|
if model_type == "ivector": |
|
|
fields["phone_set"] = "MFA" |
|
|
return template.format(**fields).replace(".", "_") |
|
|
|
|
|
|
|
|
def generate_meta_data(model, model_type, language, dialect, version, phone_set): |
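    """Assemble the metadata dictionary for a model card.

    One branch per model type (acoustic, dictionary, g2p, language_model,
    tokenizer, ivector); unrecognized types return an empty dict. Common
    fields include name, language, dialect, version, maintainer, license,
    citation, and train_date.
    """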
|
|
citation_details = { |
|
|
"model_name": model.name, |
|
|
"version": version, |
|
|
"extra_authors": "", |
|
|
"model_type": model_type, |
|
|
"language": language.title(), |
|
|
"phone_set": phone_set.upper(), |
|
|
} |
|
|
citation_template = mfa_citation_template |
|
|
if language in {"Arabic"}: |
|
|
citation_details["extra_authors"] = "Shmueli, Natalia and " |
|
|
maintainer = mfa_maintainer |
|
|
if dialect: |
|
|
phone_set_folder = f"{dialect}_{phone_set}".replace(" ", "_").lower() |
|
|
citation_details["dialect"] = dialect |
|
|
else: |
|
|
phone_set_folder = phone_set.lower() |
|
|
license = "CC BY 4.0" |
|
|
license_link = f"[CC BY 4.0](https://github.com/MontrealCorpusTools/mfa-models/tree/main/{model_type}/{language.lower()}/{phone_set_folder}/v{version}/LICENSE)" |
|
|
if model_type == "acoustic": |
|
|
if model.source.name.endswith("_cv.zip"): |
|
|
citation = cv_citation |
|
|
maintainer = cv_maintainer |
|
|
license = "CC-0" |
|
|
license_link = "[CC-0](https://creativecommons.org/publicdomain/zero/1.0/)" |
|
|
train_date = "02-11-2022" |
|
|
else: |
|
|
train_date = datetime.fromisoformat(model.meta["train_date"]).date() |
|
|
citation_details["year"] = train_date.year |
|
|
citation_details["month"] = train_date.strftime("%b") |
|
|
citation_details["title"] = generate_id(citation_details, model_type).replace("_", ".") |
|
|
citation_details["link_safe_title"] = generate_id(citation_details, model_type) |
|
|
citation_details["id"] = f'mfa_{model.name}_acoustic_{citation_details["year"]}' |
|
|
citation = mfa_citation_template.format(**citation_details) |
|
|
features = "MFCC" |
|
|
if model.meta["features"].get("use_pitch", False): |
|
|
features += " + pitch" |
|
|
return { |
|
|
"name": model.name, |
|
|
"language": language.title(), |
|
|
"dialect": dialect, |
|
|
"phone_set": phone_set, |
|
|
"version": version, |
|
|
"maintainer": maintainer, |
|
|
"citation": citation, |
|
|
"license": license, |
|
|
"license_link": license_link, |
|
|
"architecture": model.meta["architecture"], |
|
|
"features": features, |
|
|
"evaluation": {}, |
|
|
"decode": {}, |
|
|
"train_date": str(train_date), |
|
|
} |
|
|
if model_type == "dictionary": |
|
|
train_date = datetime.today().date() |
|
|
citation_details["model_type"] = "pronunciation dictionary" |
|
|
citation_details["year"] = train_date.year |
|
|
citation_details["month"] = train_date.strftime("%b") |
|
|
citation_details["link_safe_title"] = generate_id(citation_details, model_type) |
|
|
citation_details["id"] = f"mfa_{model.name}_dictionary_{train_date.year}" |
|
|
citation_details["title"] = generate_id(citation_details, model_type).replace("_", ".") |
|
|
citation = citation_template.format(**citation_details) |
|
|
phone_set = phone_set.upper() |
|
|
        if model.path.name.endswith("_cv.dict"):
            citation = cv_citation
            maintainer = cv_maintainer
            license = "CC-0"
            license_link = "[CC-0](https://creativecommons.org/publicdomain/zero/1.0/)"
            dictionary_phone_set = "IPA"
|
|
elif model.path.name.endswith("_mfa.dict"): |
|
|
dictionary_phone_set = "IPA" |
|
|
else: |
|
|
if model.path.name.endswith("_prosodylab.dict") or model.path.name.endswith( |
|
|
"us_arpa.dict" |
|
|
): |
|
|
citation = prosodylab_citation |
|
|
try: |
|
|
                dictionary_phone_set = PhoneSetType[phone_set].name
|
|
except KeyError: |
|
|
dictionary_phone_set = "UNKNOWN" |
|
|
dictionary = load_dict(model.path, model.name, phone_set_type=dictionary_phone_set) |
|
|
word_count = len(dictionary.actual_words) |
|
|
data = { |
|
|
"name": model.name, |
|
|
"language": language.title(), |
|
|
"dialect": dialect, |
|
|
"maintainer": maintainer, |
|
|
"license_link": license_link, |
|
|
"license": license, |
|
|
"phone_set": phone_set, |
|
|
"phones": sorted(dictionary.non_silence_phones), |
|
|
"word_count": word_count, |
|
|
"train_date": str(train_date), |
|
|
"version": version, |
|
|
"citation": citation, |
|
|
} |
|
|
output_path = os.path.join( |
|
|
os.path.dirname(get_model_card_directory("dictionary", data)), |
|
|
dictionary.name + ".dict", |
|
|
) |
|
|
dictionary.export_lexicon(1, output_path) |
|
|
return data |
|
|
if model_type == "g2p": |
|
|
train_date = datetime.fromisoformat(model.meta["train_date"]).date() |
|
|
citation_details["model_type"] = "G2P model" |
|
|
citation_details["year"] = train_date.year |
|
|
citation_details["month"] = train_date.strftime("%b") |
|
|
citation_details["link_safe_title"] = generate_id(citation_details, model_type) |
|
|
citation_details["title"] = generate_id(citation_details, model_type).replace("_", ".") |
|
|
citation_details["id"] = f"mfa_{model.name}_g2p_{train_date.year}" |
|
|
return { |
|
|
"name": model.name, |
|
|
"language": language.title(), |
|
|
"dialect": dialect, |
|
|
"maintainer": maintainer, |
|
|
"license_link": license_link, |
|
|
"license": license, |
|
|
"architecture": model.meta["architecture"], |
|
|
"training": model.meta["training"], |
|
|
"evaluation": { |
|
|
k: v if v is not None else 100 for k, v in model.meta["evaluation"].items() |
|
|
}, |
|
|
"phone_set": phone_set, |
|
|
"phones": sorted(model.meta["phones"]), |
|
|
"version": version, |
|
|
"train_date": str(train_date), |
|
|
"citation": citation_template.format(**citation_details), |
|
|
} |
|
|
if model_type == "language_model": |
|
|
train_date = datetime.fromisoformat(model.meta["train_date"]).date() |
|
|
citation_details["model_type"] = "language model" |
|
|
citation_details["year"] = train_date.year |
|
|
citation_details["title"] = generate_id(citation_details, model_type).replace("_", ".") |
|
|
citation_details["link_safe_title"] = generate_id(citation_details, model_type) |
|
|
citation_details["month"] = train_date.strftime("%b") |
|
|
citation_details["id"] = f"mfa_{model.name}_lm_{train_date.year}" |
|
|
return { |
|
|
"name": model.name, |
|
|
"language": language.title(), |
|
|
"dialect": dialect, |
|
|
"phone_set": "MFA", |
|
|
"maintainer": maintainer, |
|
|
"license_link": license_link, |
|
|
"license": license, |
|
|
"architecture": model.meta["architecture"], |
|
|
"version": version, |
|
|
"train_date": str(train_date), |
|
|
"training": { |
|
|
"num_words": model.meta["training"]["num_words"], |
|
|
"num_oovs": model.meta["training"]["num_oovs"], |
|
|
}, |
|
|
"evaluation": { |
|
|
"large_perplexity": model.meta["evaluation_training"]["large_perplexity"], |
|
|
"medium_perplexity": model.meta["evaluation_training"]["medium_perplexity"], |
|
|
"small_perplexity": model.meta["evaluation_training"]["small_perplexity"], |
|
|
}, |
|
|
"citation": citation_template.format(**citation_details), |
|
|
} |
|
|
if model_type == "tokenizer": |
|
|
train_date = datetime.fromisoformat(model.meta["train_date"]).date() |
|
|
citation_details["model_type"] = "tokenizer" |
|
|
citation_details["year"] = train_date.year |
|
|
citation_details["title"] = generate_id(citation_details, model_type).replace("_", ".") |
|
|
citation_details["link_safe_title"] = generate_id(citation_details, model_type) |
|
|
citation_details["month"] = train_date.strftime("%b") |
|
|
citation_details["id"] = f"mfa_{model.name}_tokenizer_{train_date.year}" |
|
|
return { |
|
|
"name": model.name, |
|
|
"language": language.title(), |
|
|
"dialect": dialect, |
|
|
"phone_set": "MFA", |
|
|
"maintainer": maintainer, |
|
|
"license_link": license_link, |
|
|
"license": license, |
|
|
"architecture": model.meta["architecture"], |
|
|
"version": version, |
|
|
"train_date": str(train_date), |
|
|
"training": { |
|
|
"num_utterances": model.meta["training"]["num_utterances"], |
|
|
"num_graphemes": model.meta["training"]["num_graphemes"], |
|
|
}, |
|
|
"evaluation": { |
|
|
k: v if v is not None else 100 for k, v in model.meta["evaluation"].items() |
|
|
}, |
|
|
"citation": citation_template.format(**citation_details), |
|
|
} |
|
|
if model_type == "ivector": |
|
|
print(model.meta) |
|
|
if "train_date" in model.meta: |
|
|
train_date = datetime.fromisoformat(model.meta["train_date"]).date() |
|
|
else: |
|
|
train_date = datetime.now().date() |
|
|
citation_details["model_type"] = "ivector" |
|
|
citation_details["year"] = train_date.year |
|
|
citation_details["title"] = generate_id(citation_details, model_type).replace("_", ".") |
|
|
citation_details["link_safe_title"] = generate_id(citation_details, model_type) |
|
|
citation_details["month"] = train_date.strftime("%b") |
|
|
citation_details["id"] = f"mfa_{model.name}_ivector_{train_date.year}" |
|
|
return { |
|
|
"name": model.name, |
|
|
"language": language.title(), |
|
|
"dialect": dialect, |
|
|
"phone_set": "MFA", |
|
|
"maintainer": maintainer, |
|
|
"license_link": license_link, |
|
|
"license": license, |
|
|
"version": version, |
|
|
"train_date": str(train_date), |
|
|
"citation": citation_template.format(**citation_details), |
|
|
} |
|
|
return {} |
|
|
|
|
|
|
|
|
def extract_model_card_fields(meta_data, model_type): |
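    """Flatten model metadata into the fields consumed by the README card
    templates, resolving language/dialect/phone-set links and per-corpus
    detail blocks along the way.
    """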
|
|
dialect_link = "N/A" |
|
|
if "dialect" in meta_data and meta_data["dialect"]: |
|
|
key = (meta_data["language"], meta_data["dialect"]) |
|
|
if key in language_links: |
|
|
dialect_link = language_link_template.format(*language_links[key]) |
|
|
if meta_data["language"] != "Multilingual": |
|
|
language_link = language_link_template.format(*language_links[meta_data["language"]]) |
|
|
else: |
|
|
language_link = meta_data["language"] |
|
|
if "dialects" in meta_data and meta_data["dialects"]: |
|
|
dialect_links = [] |
|
|
for d in meta_data["dialects"]: |
|
|
key = (meta_data["language"], d) |
|
|
if key in language_links: |
|
|
dialect_links.append(language_link_template.format(*language_links[key])) |
|
|
dialect_link = ", ".join(dialect_links) |
|
|
if "phone_set" in meta_data: |
|
|
phone_set = meta_data["phone_set"] |
|
|
if phone_set == "CV": |
|
|
            phone_set = cv_phone_set_mapping[meta_data["language"].lower()]
|
|
phone_set_link = phone_set_templates[phone_set] |
|
|
if phone_set == "MFA": |
|
|
phone_set_link = phone_set_link.format(language=meta_data["language"].lower()) |
|
|
name = generate_id(meta_data, model_type) |
|
|
discussion_title = name.replace(" ", "+").replace(")", "").replace("(", "").replace("_", ".") |
|
|
if model_type == "acoustic": |
|
|
corpora_details = "" |
|
|
if "corpus" in meta_data: |
|
|
for corpus in meta_data["corpus"]: |
|
|
if "version" in corpus and corpus["version"]: |
|
|
corpus_link_template = "[{name}](../../../../corpus/{language}/{corpus_safe_name}/{version}/README.md)" |
|
|
link = corpus_link_template.format( |
|
|
name=corpus["name"], |
|
|
language=make_path_safe(corpus["language"]), |
|
|
corpus_safe_name=make_path_safe(corpus["name"]), |
|
|
version=corpus["version"], |
|
|
) |
|
|
else: |
|
|
corpus_link_template = ( |
|
|
"[{name}](../../../../corpus/{language}/{corpus_safe_name}/README.md)" |
|
|
) |
|
|
link = corpus_link_template.format( |
|
|
name=corpus["name"], |
|
|
language=make_path_safe(corpus["language"]), |
|
|
corpus_safe_name=make_path_safe(corpus["name"]), |
|
|
) |
|
|
|
|
|
data = { |
|
|
"name": corpus["name"], |
|
|
"link": link, |
|
|
"num_hours": corpus["num_hours"], |
|
|
"num_speakers": corpus["num_speakers"], |
|
|
"num_utterances": corpus["num_utterances"], |
|
|
} |
|
|
corpora_details += "\n" + corpus_detail_template.format(**data) |
|
|
return { |
|
|
"model_name": meta_data["name"], |
|
|
"title": name.replace("_", "."), |
|
|
"discussion_title": discussion_title, |
|
|
"language": meta_data["language"], |
|
|
"language_link": language_link, |
|
|
"dialect": meta_data["dialect"], |
|
|
"dialect_link": dialect_link, |
|
|
"version": meta_data["version"], |
|
|
"maintainer": meta_data["maintainer"], |
|
|
"features": meta_data["features"], |
|
|
"architecture": meta_data["architecture"], |
|
|
"mfa_version": CURRENT_MODEL_VERSION, |
|
|
"date": meta_data["train_date"], |
|
|
"citation": meta_data["citation"], |
|
|
"license_link": meta_data["license_link"], |
|
|
"corpora_details": corpora_details, |
|
|
"phone_set_link": phone_set_link, |
|
|
} |
|
|
if model_type == "ivector": |
|
|
corpora_details = "" |
|
|
if "corpus" in meta_data: |
|
|
for corpus in meta_data["corpus"]: |
|
|
if "version" in corpus and corpus["version"]: |
|
|
corpus_link_template = "[{name}](../../../corpus/{language}/{corpus_safe_name}/{version}/README.md)" |
|
|
link = corpus_link_template.format( |
|
|
name=corpus["name"], |
|
|
language=make_path_safe(corpus["language"]), |
|
|
corpus_safe_name=make_path_safe(corpus["name"]), |
|
|
version=corpus["version"], |
|
|
) |
|
|
else: |
|
|
corpus_link_template = ( |
|
|
"[{name}](../../../corpus/{language}/{corpus_safe_name}/README.md)" |
|
|
) |
|
|
link = corpus_link_template.format( |
|
|
name=corpus["name"], |
|
|
language=make_path_safe(corpus["language"]), |
|
|
corpus_safe_name=make_path_safe(corpus["name"]), |
|
|
) |
|
|
|
|
|
data = { |
|
|
"name": corpus["name"], |
|
|
"link": link, |
|
|
"num_hours": corpus["num_hours"], |
|
|
"num_speakers": corpus["num_speakers"], |
|
|
"num_utterances": corpus["num_utterances"], |
|
|
} |
|
|
corpora_details += "\n" + corpus_detail_template.format(**data) |
|
|
return { |
|
|
"model_name": meta_data["name"], |
|
|
"title": name.replace("_", "."), |
|
|
"discussion_title": discussion_title, |
|
|
"language": meta_data["language"], |
|
|
"language_link": language_link, |
|
|
"version": meta_data["version"], |
|
|
"maintainer": meta_data["maintainer"], |
|
|
"features": meta_data.get("features", "MFCC"), |
|
|
"architecture": meta_data.get("architecture", "ivector"), |
|
|
"mfa_version": CURRENT_MODEL_VERSION, |
|
|
"date": meta_data["train_date"], |
|
|
"citation": meta_data["citation"], |
|
|
"license_link": meta_data["license_link"], |
|
|
"corpora_details": corpora_details, |
|
|
} |
|
|
if model_type == "corpus": |
|
|
citation = meta_data.get("citation", "") |
|
|
version = meta_data.get("version", "") |
|
|
dialects = meta_data.get("dialects", []) |
|
|
if dialects: |
|
|
dialects = ", ".join(dialects) |
|
|
else: |
|
|
dialects = "N/A" |
|
|
if version: |
|
|
version = f"- **Version:** `{version}`" |
|
|
return { |
|
|
"corpus_name": meta_data["name"], |
|
|
"title": meta_data["id"].replace("_", "."), |
|
|
"corpus_id": meta_data["id"], |
|
|
"language": meta_data["language"], |
|
|
"language_link": language_link, |
|
|
"discussion_title": discussion_title, |
|
|
"corpus_link": f"[{meta_data['name']}]({meta_data['link']})", |
|
|
"dialects": dialects, |
|
|
"dialect_link": dialect_link, |
|
|
"num_hours": meta_data["num_hours"], |
|
|
"num_utterances": meta_data["num_utterances"], |
|
|
"num_speakers": meta_data["num_speakers"], |
|
|
"num_female": meta_data.get("num_female", 0), |
|
|
"num_male": meta_data.get("num_male", 0), |
|
|
"num_other": meta_data.get("num_other", meta_data["num_speakers"]), |
|
|
"license_link": meta_data["license_link"], |
|
|
"version": version, |
|
|
"citation": citation, |
|
|
} |
|
|
if model_type == "dictionary": |
|
|
data = { |
|
|
"model_name": meta_data["name"], |
|
|
"title": name.replace("_", "."), |
|
|
"language": meta_data["language"], |
|
|
"language_link": language_link, |
|
|
"dialect": meta_data["dialect"], |
|
|
"dialect_link": dialect_link, |
|
|
"discussion_title": discussion_title, |
|
|
"version": meta_data["version"], |
|
|
"maintainer": meta_data["maintainer"], |
|
|
"license_link": meta_data["license_link"], |
|
|
"mfa_version": CURRENT_MODEL_VERSION, |
|
|
"date": meta_data["train_date"], |
|
|
"citation": meta_data["citation"], |
|
|
"phone_set": meta_data["phone_set"], |
|
|
"phones": " ".join(sorted(meta_data["phones"])), |
|
|
"word_count": meta_data["word_count"], |
|
|
"phone_set_link": phone_set_link, |
|
|
} |
|
|
if meta_data["phone_set"] in {"MFA", "ARPA"}: |
|
|
data[ |
|
|
"plain_link" |
|
|
] = f"https://raw.githubusercontent.com/MontrealCorpusTools/mfa-models/main/dictionary/{language.lower()}/mfa/{model_name}.dict" |
|
|
return data |
|
|
if model_type == "g2p": |
|
|
training_details = g2p_training_detail_template.format(**meta_data["training"]) |
|
|
evaluation_details = g2p_evaluation_detail_template.format(**meta_data["evaluation"]) |
|
|
return { |
|
|
"model_name": meta_data["name"], |
|
|
"title": name.replace("_", "."), |
|
|
"language": meta_data["language"], |
|
|
"language_link": language_link, |
|
|
"dialect": meta_data["dialect"], |
|
|
"dialect_link": dialect_link, |
|
|
"discussion_title": discussion_title, |
|
|
"architecture": meta_data["architecture"], |
|
|
"maintainer": meta_data["maintainer"], |
|
|
"version": meta_data["version"], |
|
|
"license_link": meta_data["license_link"], |
|
|
"mfa_version": CURRENT_MODEL_VERSION, |
|
|
"date": meta_data["train_date"], |
|
|
"citation": meta_data["citation"], |
|
|
"phone_set": meta_data["phone_set"], |
|
|
"phones": ", ".join(f"{{ipa_inline}}`{x}`" for x in meta_data["phones"]), |
|
|
"training_details": training_details, |
|
|
"evaluation_details": evaluation_details, |
|
|
"phone_set_link": phone_set_link, |
|
|
} |
|
|
if model_type == "tokenizer": |
|
|
training_details = tokenizer_training_detail_template.format(**meta_data["training"]) |
|
|
evaluation_details = tokenizer_evaluation_detail_template.format(**meta_data["evaluation"]) |
|
|
return { |
|
|
"model_name": meta_data["name"], |
|
|
"title": name.replace("_", "."), |
|
|
"language": meta_data["language"], |
|
|
"language_link": language_link, |
|
|
"discussion_title": discussion_title, |
|
|
"architecture": meta_data["architecture"], |
|
|
"maintainer": meta_data["maintainer"], |
|
|
"version": meta_data["version"], |
|
|
"license_link": meta_data["license_link"], |
|
|
"mfa_version": CURRENT_MODEL_VERSION, |
|
|
"date": meta_data["train_date"], |
|
|
"citation": meta_data["citation"], |
|
|
"training_details": training_details, |
|
|
"evaluation_details": evaluation_details, |
|
|
} |
|
|
if model_type == "language_model": |
|
|
training_details = lm_training_detail_template.format(**meta_data["training"]) |
|
|
evaluation_details = lm_evaluation_detail_template.format(**meta_data["evaluation"]) |
|
|
return { |
|
|
"model_name": meta_data["name"], |
|
|
"title": name.replace("_", "."), |
|
|
"language": meta_data["language"], |
|
|
"language_link": language_link, |
|
|
"dialect": meta_data["dialect"], |
|
|
"dialect_link": dialect_link, |
|
|
"discussion_title": discussion_title, |
|
|
"architecture": meta_data["architecture"], |
|
|
"maintainer": meta_data["maintainer"], |
|
|
"version": meta_data["version"], |
|
|
"license_link": meta_data["license_link"], |
|
|
"mfa_version": CURRENT_MODEL_VERSION, |
|
|
"date": meta_data["train_date"], |
|
|
"citation": meta_data["citation"], |
|
|
"training_details": training_details, |
|
|
"evaluation_details": evaluation_details, |
|
|
} |
|
|
|
|
|
|
|
|
def extract_doc_card_fields(meta_data, model_type): |
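    """Flatten model metadata into the fields consumed by the documentation
    (Sphinx/MyST) page templates, including tags, ``{ref}`` cross-reference
    links, and "see also" admonitions.
    """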
|
|
tags = [meta_data["language"]] |
|
|
if model_type not in {"language_model", "corpus"}: |
|
|
tags.append(meta_data["phone_set"].upper()) |
|
|
|
|
|
see_also = "" |
|
|
links = [] |
|
|
for k in ["corpus", "dictionary", "g2p", "acoustic", "language_model", "tokenizer"]: |
|
|
if k == "corpus" and model_type in {"acoustic", "ivector"}: |
|
|
continue |
|
|
if k in meta_data: |
|
|
if k == "corpus": |
|
|
links.append( |
|
|
see_also_template.format( |
|
|
links="\n".join( |
|
|
link_template.format(x["id"].lower().replace(" ", "_")) |
|
|
for x in meta_data[k] |
|
|
), |
|
|
model_type_name=model_type_names[k], |
|
|
) |
|
|
) |
|
|
else: |
|
|
print(meta_data[k]) |
|
|
links.append( |
|
|
see_also_template.format( |
|
|
links="\n".join( |
|
|
link_template.format(x.lower().replace(" ", "_")) for x in meta_data[k] |
|
|
), |
|
|
model_type_name=model_type_names[k], |
|
|
) |
|
|
) |
|
|
|
|
|
if links: |
|
|
see_also = "\n\n".join(links) |
|
|
try: |
|
|
license_link = f"[{meta_data['license']}]({license_links[meta_data['license']]})" |
|
|
except KeyError: |
|
|
license_link = meta_data["license"] |
|
|
layout_type = "not_mfa" |
|
|
if "phone_set" in meta_data: |
|
|
phone_set = meta_data["phone_set"] |
|
|
if phone_set == "CV": |
|
|
phone_set = cv_phone_set_mapping[meta_data["language"].lower()] |
|
|
elif phone_set in {"MFA", "ARPA", "PROSODYLAB"}: |
|
|
layout_type = "mfa" |
|
|
try: |
|
|
phone_set_link = phone_set_templates[phone_set] |
|
|
if phone_set == "MFA": |
|
|
phone_set_link = phone_set_link.format(language=meta_data["language"].lower()) |
|
|
except KeyError: |
|
|
phone_set_link = phone_set |
|
|
if "dialect" in meta_data and meta_data["dialect"]: |
|
|
language_sub_folder = f"{meta_data['dialect']}_{meta_data['phone_set']}".replace( |
|
|
" ", "_" |
|
|
).lower() |
|
|
dialect_title_string = f" ({meta_data['dialect']})" |
|
|
else: |
|
|
language_sub_folder = meta_data["phone_set"].lower() |
|
|
dialect_title_string = "" |
|
|
name = generate_id(meta_data, model_type) |
|
|
if model_type == "acoustic": |
|
|
corpora_details = "" |
|
|
corpus_link_template = "{{ref}}`{corpus_id}`" |
|
|
dialects = [] |
|
|
if "corpus" in meta_data: |
|
|
for corpus in meta_data["corpus"]: |
|
|
if "dialects" in corpus: |
|
|
dialects.extend(corpus["dialects"]) |
|
|
data = { |
|
|
"name": corpus["name"], |
|
|
"link": corpus_link_template.format(corpus_id=corpus["id"].replace(" ", "_")), |
|
|
"num_hours": corpus["num_hours"], |
|
|
"num_speakers": corpus["num_speakers"], |
|
|
"num_utterances": corpus["num_utterances"], |
|
|
} |
|
|
corpora_details += "\n" + corpus_detail_template.format(**data) |
|
|
if not dialects and "dialect" in meta_data and meta_data["dialect"]: |
|
|
dialects = [meta_data["dialect"]] |
|
|
if meta_data["phone_set"] in {"CV", "MFA"}: |
|
|
tags.append("IPA") |
|
|
return { |
|
|
"model_name": name, |
|
|
"ref": name.replace(" ", "_"), |
|
|
"title": name.replace("_", "."), |
|
|
"model_type": model_type, |
|
|
"architecture": meta_data["architecture"], |
|
|
"version": meta_data["version"], |
|
|
"corpora_details": corpora_details, |
|
|
"see_also": see_also, |
|
|
"tags": ";".join(tags), |
|
|
"dialects": ";".join(sorted(set(dialects))) if dialects else "N/A", |
|
|
"language": meta_data["language"].lower(), |
|
|
"language_name": meta_data["language"].title(), |
|
|
"license": meta_data["license"], |
|
|
"phone_set": phone_set, |
|
|
"layout_type": layout_type, |
|
|
"license_link": license_link, |
|
|
"phone_set_link": phone_set_link, |
|
|
"dialect_title_string": dialect_title_string, |
|
|
"language_sub_folder": language_sub_folder, |
|
|
"phone_set_name": meta_data["phone_set"].upper(), |
|
|
} |
|
|
if model_type == "ivector": |
|
|
corpora_details = "" |
|
|
corpus_link_template = "{{ref}}`{corpus_id}`" |
|
|
if "corpus" in meta_data: |
|
|
for corpus in meta_data["corpus"]: |
|
|
data = { |
|
|
"name": corpus["name"], |
|
|
"link": corpus_link_template.format(corpus_id=corpus["id"].replace(" ", "_")), |
|
|
"num_hours": corpus["num_hours"], |
|
|
"num_speakers": corpus["num_speakers"], |
|
|
"num_utterances": corpus["num_utterances"], |
|
|
} |
|
|
corpora_details += "\n" + corpus_detail_template.format(**data) |
|
|
return { |
|
|
"model_name": name, |
|
|
"ref": name.replace(" ", "_"), |
|
|
"title": name.replace("_", "."), |
|
|
"model_type": model_type, |
|
|
"architecture": meta_data.get("architecture", "ivector"), |
|
|
"version": meta_data["version"], |
|
|
"corpora_details": corpora_details, |
|
|
"see_also": see_also, |
|
|
"tags": ";".join(tags), |
|
|
"language": meta_data["language"].lower(), |
|
|
"language_name": meta_data["language"].title(), |
|
|
"license": meta_data["license"], |
|
|
"layout_type": layout_type, |
|
|
"license_link": license_link, |
|
|
} |
|
|
if model_type == "corpus": |
|
|
if "tags" in meta_data: |
|
|
tags.extend(meta_data["tags"]) |
|
|
dialects = [] |
|
|
if "dialects" in meta_data: |
|
|
dialects = meta_data["dialects"] |
|
|
version_subdirectory = meta_data.get("version", "") |
|
|
if version_subdirectory: |
|
|
version_subdirectory = f"/{version_subdirectory}" |
|
|
return { |
|
|
"corpus_id": name, |
|
|
"ref": name.replace(" ", "_"), |
|
|
"title": name.replace("_", "."), |
|
|
"corpus_name": meta_data["name"], |
|
|
"corpus_name_safe": make_path_safe(meta_data["name"]), |
|
|
"license": meta_data["license"], |
|
|
"see_also": see_also, |
|
|
"layout_type": layout_type, |
|
|
"tags": ";".join(tags), |
|
|
"version_subdirectory": version_subdirectory, |
|
|
"language": meta_data["language"].lower(), |
|
|
"dialects": ";".join(sorted(set(dialects))) if dialects else "N/A", |
|
|
"language_name": meta_data["language"].title(), |
|
|
} |
|
|
if model_type == "dictionary": |
|
|
if meta_data["name"].endswith("_cv") or meta_data["name"].endswith("_mfa"): |
|
|
tags.append("IPA") |
|
|
elif meta_data["name"].endswith("_prosodylab") or meta_data["name"].endswith("us_arpa"): |
|
|
tags.append("PROSODYLAB") |
|
|
tags.append("MFA") |
|
|
data = { |
|
|
"model_name": name, |
|
|
"ref": name.replace(" ", "_"), |
|
|
"title": name.replace("_", "."), |
|
|
"model_type": model_type, |
|
|
"version": meta_data["version"], |
|
|
"see_also": see_also, |
|
|
"tags": ";".join(tags), |
|
|
"layout_type": layout_type, |
|
|
"language": meta_data["language"].lower(), |
|
|
"license": meta_data["license"], |
|
|
"language_name": meta_data["language"].title(), |
|
|
"dialects": meta_data["dialect"] if meta_data["dialect"] else "N/A", |
|
|
"phone_set": phone_set, |
|
|
"language_sub_folder": language_sub_folder, |
|
|
"dialect_title_string": dialect_title_string, |
|
|
"phone_set_name": meta_data["phone_set"].upper(), |
|
|
} |
|
|
if meta_data["name"] in phone_charts: |
|
|
charts = phone_charts[meta_data["name"]] |
|
|
data["consonant_chart"] = charts["consonant_chart"] |
|
|
data["vowel_section"] = charts["oral_vowel_chart"] |
|
|
if charts["nasal_vowel_chart"]: |
|
|
data["vowel_section"] = "#### Oral Vowels\n\n" + data["vowel_section"] |
|
|
data["vowel_section"] += "\n\n#### Nasal Vowels\n\n" + charts["nasal_vowel_chart"] |
|
|
if charts["diphthongs"]: |
|
|
data["vowel_section"] += "\n\n#### Diphthongs\n\n* " + "\n* ".join( |
|
|
f"{{ipa_inline}}`{x}`" for x in sorted(charts["diphthongs"]) |
|
|
) |
|
|
if charts["triphthongs"]: |
|
|
data["vowel_section"] += "\n\n#### Triphthongs\n\n* " + "\n* ".join( |
|
|
f"{{ipa_inline}}`{x}`" for x in sorted(charts["triphthongs"]) |
|
|
) |
|
|
if "tones" in charts and charts["tones"]: |
|
|
data["vowel_section"] += "\n\n#### Tones\n\n* " + "\n* ".join( |
|
|
f"{{ipa_inline}}`{x}`" for x in sorted(charts["tones"]) |
|
|
) |
|
|
if "stress" in charts and charts["stress"]: |
|
|
data["vowel_section"] += "\n\n#### Stress\n\n* " + "\n* ".join( |
|
|
f"{{ipa_inline}}`{x}`" for x in sorted(charts["stress"]) |
|
|
) |
|
|
if "other" in charts and charts["other"]: |
|
|
data["vowel_section"] += "\n\n### Other phones\n\n* " + "\n* ".join( |
|
|
f"{{ipa_inline}}`{x}`" for x in sorted(charts["other"]) |
|
|
) |
|
|
|
|
|
return data |
|
|
if model_type == "g2p": |
|
|
if meta_data["phone_set"] in {"CV", "MFA"}: |
|
|
tags.append("IPA") |
|
|
return { |
|
|
"model_name": name, |
|
|
"ref": name.replace(" ", "_"), |
|
|
"title": name.replace("_", "."), |
|
|
"model_type": model_type, |
|
|
"architecture": meta_data["architecture"], |
|
|
"version": meta_data["version"], |
|
|
"see_also": see_also, |
|
|
"layout_type": layout_type, |
|
|
"language_sub_folder": language_sub_folder, |
|
|
"dialect_title_string": dialect_title_string, |
|
|
"tags": ";".join(tags), |
|
|
"license": meta_data["license"], |
|
|
"language": meta_data["language"].lower(), |
|
|
"dialects": meta_data["dialect"] if meta_data["dialect"] else "N/A", |
|
|
"language_name": meta_data["language"].title(), |
|
|
"phone_set": phone_set, |
|
|
"phone_set_name": meta_data["phone_set"].upper(), |
|
|
} |
|
|
if model_type == "language_model": |
|
|
tags = ["MFA"] |
|
|
return { |
|
|
"model_name": name, |
|
|
"ref": name.replace(" ", "_"), |
|
|
"title": name.replace("_", "."), |
|
|
"model_type": model_type, |
|
|
"layout_type": layout_type, |
|
|
"language": meta_data["language"].lower(), |
|
|
"dialects": meta_data["dialect"] if meta_data["dialect"] else "N/A", |
|
|
"architecture": meta_data["architecture"], |
|
|
"language_name": meta_data["language"].title(), |
|
|
"dialect_title_string": dialect_title_string, |
|
|
"license": meta_data["license"], |
|
|
"version": meta_data["version"], |
|
|
"source": "mfa", |
|
|
"see_also": see_also, |
|
|
"tags": ";".join(tags), |
|
|
} |
|
|
if model_type == "tokenizer": |
|
|
tags = ["MFA"] |
|
|
return { |
|
|
"model_name": name, |
|
|
"ref": name.replace(" ", "_"), |
|
|
"title": name.replace("_", "."), |
|
|
"model_type": model_type, |
|
|
"layout_type": layout_type, |
|
|
"language": meta_data["language"].lower(), |
|
|
"architecture": meta_data["architecture"], |
|
|
"language_name": meta_data["language"].title(), |
|
|
"license": meta_data["license"], |
|
|
"version": meta_data["version"], |
|
|
"source": "mfa", |
|
|
"see_also": see_also, |
|
|
"tags": ";".join(tags), |
|
|
} |
|
|
|
|
|
|
|
|
model_card_templates = {
    "acoustic": {
        "mfa": mfa_acoustic_model_card_template,
        "other": other_acoustic_model_card_template,
    },
    "dictionary": {"mfa": mfa_dictionary_card_template, "other": other_dictionary_card_template},
    "g2p": {"mfa": g2p_model_card_template, "other": g2p_model_card_template},
    "language_model": {"mfa": language_model_card_template, "other": language_model_card_template},
    "corpus": {"mfa": corpus_card_template, "other": corpus_card_template},
    "ivector": {"mfa": ivector_card_template, "other": ivector_card_template},
    "tokenizer": {"mfa": tokenizer_model_card_template, "other": tokenizer_model_card_template},
}
docs_card_templates = {
    "acoustic": {"mfa": acoustic_docs_md_template, "other": acoustic_docs_md_template},
    "dictionary": {
        "mfa": mfa_dictionary_docs_md_template,
        "other": other_dictionary_docs_md_template,
    },
    "g2p": {"mfa": g2p_docs_md_template, "other": g2p_docs_md_template},
    "language_model": {"mfa": lm_docs_md_template, "other": lm_docs_md_template},
    "corpus": {"mfa": corpus_docs_md_template, "other": corpus_docs_md_template},
    "ivector": {"mfa": ivector_docs_md_template, "other": ivector_docs_md_template},
    "tokenizer": {"mfa": tokenizer_docs_md_template, "other": tokenizer_docs_md_template},
}
model_type_names = {
    "acoustic": "Acoustic models",
    "dictionary": "Pronunciation dictionaries",
    "g2p": "G2P models",
    "language_model": "Language models",
    "corpus": "Corpora",
    "ivector": "Ivector extractors",
    "tokenizer": "Tokenizer models",
}
model_type_columns = {
    "acoustic": "ID;language;dialect;phoneset;license",
    "ivector": "ID;language;license",
    "dictionary": "ID;language;dialect;phoneset;license",
    "g2p": "ID;language;dialect;phoneset;license",
    "language_model": "ID;language;dialect;license",
    "corpus": "ID;language;dialect;license",
    "tokenizer": "ID;language;license",
}
model_type_column_widths = {
    "acoustic": "40;20;20;10;10",
    "dictionary": "40;20;20;10;10",
    "g2p": "40;20;20;10;10",
    "language_model": "50;20;20;10",
    "ivector": "50;25;25",
    "tokenizer": "50;25;25",
    "corpus": "40;20;25;15",
}
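# The column and width strings above feed the ``:columns:`` and ``:colwidths:``
# options of the sphinx-needs ``needtable`` directives written at the bottom of
# this script, so each semicolon-separated list must have the same number of
# entries as its counterpart. For example, the "acoustic" entries yield:
#
#     .. needtable::
#        :types: acoustic
#        :columns: ID;language;dialect;phoneset;license
#        :colwidths: 40;20;20;10;10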
|
|
|
|
|
meta_datas = {}
|
|
|
|
|
chart_template = """``````{{list-table}} |
|
|
:header-rows: 1 |
|
|
:stub-columns: {stub_column_count} |
|
|
:class: {type}_chart table-striped table-bordered |
|
|
|
|
|
* - {header_data} |
|
|
* - {row_data} |
|
|
`````` |
|
|
""" |
|
|
|
|
|
|
|
|
def generate_extra_data(dictionary, base_indent):
    lines = []
    for key, value in dictionary.items():
        if isinstance(value, dict):
            lines.append(f"{base_indent}* {key}")
            if len(value) > 4:
                value = {k: value[k] for k in rng.choice(list(value.keys()), 4, replace=False)}
            lines.extend(generate_extra_data(value, base_indent="  " + base_indent))
        else:
            lines.append(f"{base_indent}* {key}: {value}")
    return lines
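# Example: generate_extra_data({"Occurrences": 12, "Examples": {"cat": "[k æ t]"}}, "  ")
# yields nested MyST bullet lines like the following (assuming the two-space
# nesting used here; nested dicts are sampled down to four keys via rng.choice):
#   * Occurrences: 12
#   * Examples
#     * cat: [k æ t]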
|
|
|
|
|
|
|
|
def format_ipa_cell(
    phone_data: dict[str, list[str]],
    extra_data: typing.Optional[dict[str, dict[str, typing.Any]]] = None,
    base_indent: str = "",
) -> str:
    if extra_data is None:  # guard so the membership check below is safe with the default
        extra_data = {}
    cell_lines = [f"```{{ipa_cell}}"]
    for phone_class, v in phone_data.items():
        if not v:
            continue
        cell_lines.append(f"{base_indent}* {phone_class}")
        for phone in v:
            cell_lines.append(f"{base_indent}  * {phone}")
            if phone in extra_data:
                cell_lines.extend(
                    generate_extra_data(extra_data[phone], base_indent=base_indent + "    ")
                )
    cell_lines.append(f"{base_indent}```")
    cell_content = "\n".join(cell_lines)
    return cell_content
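# Example (illustrative): format_ipa_cell({"voiced": ["b", "d"], "voiceless": []})
# returns a fenced {ipa_cell} block containing only the non-empty class:
#   ```{ipa_cell}
#   * voiced
#     * b
#     * d
#   ```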
|
|
|
|
|
|
|
|
def check_phone(phone, feature_set, phone_set_type):
    if phone_set_type is PhoneSetType.ARPA:
        return phone in feature_set
    else:
        return any(x in phone for x in feature_set)
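# ARPA phones are matched by exact membership ("B" in {"B", "D"}), while IPA
# phones are matched by substring so that modified phones still hit their base
# symbol: check_phone("tʷ", {"t"}, PhoneSetType.IPA) is True because "t" occurs
# inside "tʷ".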
|
|
|
|
|
|
|
|
def analyze_dictionary(dictionary_path, name, phone_set_type):
    d = load_dict(dictionary_path, name, phone_set_type=phone_set_type)
    dictionary_mapping = collections.defaultdict(set)
    if d.phone_set_type is PhoneSetType.ARPA:
        super_segmentals = {"stress": re.compile(r"[0-2]+")}
        ipa_mapping = {
            "stops": d.phone_set_type.stops,
            "voiced": d.phone_set_type.voiced_obstruents,
            "voiceless": d.phone_set_type.voiceless_obstruents,
            "fricative": d.phone_set_type.fricatives,
            "affricates": d.phone_set_type.affricates,
            "sibilant": d.phone_set_type.sibilants,
            "lateral": d.phone_set_type.laterals,
            "nasal": d.phone_set_type.nasals,
            "approximant": d.phone_set_type.approximants,
            "labial": d.phone_set_type.labials,
            "labiodental": d.phone_set_type.labiodental,
            "dental": d.phone_set_type.dental,
            "alveolar": d.phone_set_type.alveolar,
            "alveopalatal": d.phone_set_type.alveopalatal,
            "velar": d.phone_set_type.velar,
            "glottal": d.phone_set_type.glottal,
            "implosive": set(),
            "lateral_tap": set(),
            "tap": set(),
            "palatal": d.phone_set_type.palatal,
            "trill": set(),
            "pharyngeal": set(),
            "epiglottal": set(),
            "uvular": set(),
            "retroflex": set(),
            "lateral_fricative": set(),
            "close": d.phone_set_type.close_vowels,
            "close-mid": d.phone_set_type.close_mid_vowels,
            "open-mid": d.phone_set_type.open_mid_vowels,
            "open": d.phone_set_type.open_vowels,
            "front": d.phone_set_type.front_vowels - {"IH"},
            "near-front": {"IH"},
            "central": d.phone_set_type.central_vowels,
            "back": d.phone_set_type.back_vowels - {"UH"},
            "near-back": {"UH"},
            "rounded": d.phone_set_type.rounded_vowels,
            "unrounded": d.phone_set_type.unrounded_vowels,
            "lax": {"IH", "UH", "AH", "AE", "ER"},
            "other": set(),
        }
    else:
        ipa_mapping = {
            "stops": d.phone_set_type.stops,
            "voiced": d.phone_set_type.voiced_obstruents,
            "voiceless": d.phone_set_type.voiceless_obstruents,
            "implosive": d.phone_set_type.implosive_obstruents,
            "fricative": d.phone_set_type.fricatives,
            "sibilant": d.phone_set_type.sibilants,
            "lateral": d.phone_set_type.laterals,
            "lateral_fricative": d.phone_set_type.lateral_fricatives,
            "nasal": d.phone_set_type.nasals,
            "nasal_approximants": d.phone_set_type.nasal_approximants,
            "trill": d.phone_set_type.trills,
            "tap": d.phone_set_type.taps,
            "lateral_tap": d.phone_set_type.lateral_taps,
            "approximant": d.phone_set_type.approximants - d.phone_set_type.nasal_approximants,
            "labial": d.phone_set_type.labials,
            "labiodental": d.phone_set_type.labiodental,
            "dental": d.phone_set_type.dental,
            "alveolar": d.phone_set_type.alveolar,
            "retroflex": d.phone_set_type.retroflex,
            "alveopalatal": d.phone_set_type.alveopalatal,
            "palatal": d.phone_set_type.palatal,
            "velar": d.phone_set_type.velar,
            "uvular": d.phone_set_type.uvular,
            "pharyngeal": d.phone_set_type.pharyngeal,
            "epiglottal": d.phone_set_type.epiglottal,
            "glottal": d.phone_set_type.glottal,
            "close": d.phone_set_type.close_vowels,
            "close-mid": d.phone_set_type.close_mid_vowels,
            "open-mid": d.phone_set_type.open_mid_vowels,
            "open": d.phone_set_type.open_vowels,
            "front": d.phone_set_type.front_vowels - {"ɪ", "ʏ", "ɛ̈", "ʏ̈"},
            "near-front": {"ɪ", "ʏ", "ɛ̈", "ʏ̈"},
            "central": d.phone_set_type.central_vowels,
            "back": d.phone_set_type.back_vowels - {"ʊ", "ɔ̈"},
            "near-back": {"ʊ", "ɔ̈"},
            "rounded": d.phone_set_type.rounded_vowels,
            "unrounded": d.phone_set_type.unrounded_vowels,
            "lax": {"ɪ", "ʏ", "ʊ", "ə", "ɐ", "æ", "ɚ"},
            "nasalized": {"ã", "õ", "ĩ", "ũ", "ẽ"},
            "other": {"kp", "ɧ", "ŋm"},
        }
        super_segmentals = {"tones": re.compile(r"[˩˨˧˦˥ˀ]+")}
    for k, v in ipa_mapping.items():
        voiceless = [x for x in v if x in ipa_mapping["voiceless"]]
        voiced = [x for x in v if x not in ipa_mapping["voiceless"]]
        mod_phones = set()
        for p in voiceless:
            mod_phones |= voiceless_variants(p)
        for p in voiced:
            mod_phones |= voiced_variants(p)
        ipa_mapping[k] = mod_phones | v
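    # The loop above expands each feature class with the modified forms produced
    # by MFA's voiced_variants/voiceless_variants helpers (e.g. aspirated or
    # lengthened realizations of a base phone), so that a modified phone in the
    # dictionary still maps back to the class of its base symbol.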
|
|
    extra_data = {}
    with d.session() as session:
        phones = session.query(Phone).filter(Phone.phone_type == PhoneType.non_silence)
        phone_counts = collections.Counter()
        pronunciations = session.query(Pronunciation.pronunciation)
        for (p,) in pronunciations:
            p = p.split()
            phone_counts.update(p)

        total_phones = set()
        triphthongs = d.phone_set_type.triphthong_phones
        diphthongs = d.phone_set_type.diphthong_phones
        for phone in phones:
            words = (
                session.query(Word.word, Pronunciation.pronunciation)
                .join(Word.pronunciations)
                .filter(
                    sqlalchemy.func.length(Word.word) > 2,
                    sqlalchemy.func.length(Word.word) < 6,
                    Pronunciation.probability != None,  # noqa: E711 (SQL "IS NOT NULL")
                    Pronunciation.pronunciation.regexp_match(rf"\b{phone.phone}(?=\s|$)"),
                )
                .distinct()
                .order_by(sqlalchemy.func.random())
                .limit(4)
            )
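            # The query above samples up to four distinct three-to-five character
            # words whose pronunciation contains this phone as a whole token; they
            # become the "Examples" entries under each phone in the charts.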
|
|
            for super_seg, pattern in super_segmentals.items():
                phone_m = pattern.search(phone.phone)
                if phone_m:
                    dictionary_mapping[super_seg].add(phone_m.group(0))
                    counts = phone_counts[phone.phone]
                    examples = {}
                    for w, pron in words:
                        examples[w] = f"[{pron}]"
                    phone = phone.phone.replace(phone_m.group(0), "")
                    if phone not in extra_data:
                        extra_data[phone] = {"Occurrences": 0, "Examples": {}}
                    if isinstance(extra_data[phone]["Occurrences"], str):
                        try:
                            extra_data[phone]["Occurrences"] = int(extra_data[phone]["Occurrences"])
                        except ValueError:
                            extra_data[phone]["Occurrences"] = 0
                    extra_data[phone]["Occurrences"] += counts
                    extra_data[phone]["Examples"].update(examples)
                    break
            else:
                extra_data[phone.phone] = {"Occurrences": phone_counts[phone.phone], "Examples": {}}
                phone = phone.phone
                for w, pron in words:
                    extra_data[phone]["Examples"][w] = f"[{pron}]"
            base_phone = d.get_base_phone(phone)
            query_set = {phone, base_phone}
            if base_phone in ipa_mapping["other"]:
                dictionary_mapping["other"].add(phone)
                continue
            if "ʲ" in phone:
                dictionary_mapping["palatalized"].add(phone)
            if "ʷ" in phone:
                dictionary_mapping["labialized"].add(phone)
            if "̃" in phone:
                dictionary_mapping["nasalized"].add(phone)
                base_phone = base_phone.replace("̃", "")
            if "͈" in phone:
                dictionary_mapping["tense"].add(phone)
                dictionary_mapping["voiceless"].add(phone)
            if "̪" in phone:
                dictionary_mapping["dental"].add(phone)
            if any(x in phone for x in ["ⁿ", "ᵑ", "ᵐ"]):
                dictionary_mapping["prenasalized"].add(phone)
                dictionary_mapping["voiced"].add(phone)
            elif "ʱ" in phone or "̤" in phone:
                dictionary_mapping["aspirated"].add(phone)
                dictionary_mapping["voiced"].add(phone)
            elif check_phone(phone, ipa_mapping["voiced"], d.phone_set_type):
                dictionary_mapping["voiced"].add(phone)
            elif check_phone(phone, ipa_mapping["implosive"], d.phone_set_type):
                dictionary_mapping["implosive"].add(phone)
                dictionary_mapping["voiced"].add(phone)
            elif "ʰ" in phone:
                dictionary_mapping["aspirated"].add(phone)
                dictionary_mapping["voiceless"].add(phone)
            elif "ʼ" in phone:
                dictionary_mapping["ejective"].add(phone)
                dictionary_mapping["voiceless"].add(phone)
            elif check_phone(phone, ipa_mapping["voiceless"], d.phone_set_type):
                dictionary_mapping["voiceless"].add(phone)
            if "̚" in phone:
                dictionary_mapping["unreleased"].add(phone)
            if any(x in diphthongs for x in query_set):
                dictionary_mapping["diphthong"].add(phone)
            elif any(x in triphthongs for x in query_set):
                dictionary_mapping["triphthong"].add(phone)
            elif any(x in d.phone_set_type.affricates for x in query_set):
                dictionary_mapping["affricate"].add(phone)
            elif any(x in d.phone_set_type.stops for x in query_set):
                dictionary_mapping["stop"].add(phone)
            for k, v in ipa_mapping.items():
                if base_phone in v:
                    dictionary_mapping[k].add(phone)
            total_phones.add(phone)
            for v in dictionary_mapping.values():
                if phone in v:
                    break
            else:
                dictionary_mapping["other"].add(phone)
|
|
    places = [
        "labial",
        "labiodental",
        "dental",
        "alveolar",
        "alveopalatal",
        "retroflex",
        "palatal",
        "velar",
        "uvular",
        "pharyngeal",
        "epiglottal",
        "glottal",
    ]
    columns = []
    for p in places:
        if p in dictionary_mapping:
            columns.append(p)
    sub_manners = ["tense", "aspirated", "implosive", "ejective", "unreleased", "prenasalized"]

    rows = []
    plotted = set()
    for manner in [
        "nasal",
        "stop",
        "affricate",
        "sibilant",
        "fricative",
        "approximant",
        "tap",
        "trill",
        "lateral_fricative",
        "lateral",
        "lateral_tap",
    ]:
        if manner not in dictionary_mapping:
            continue
        realized_submanner_rows = {}
        for x in sub_manners:
            if dictionary_mapping[manner] & dictionary_mapping[x]:
                realized_submanner_rows[x] = [f"{{submanner}}`{x.title()}`"]
        row_title = f"{{manner}}`{manner.replace('_', ' ').title()}`"
        if realized_submanner_rows:
            row_title += " {submanner}`Plain`"
        row = [row_title]
        for place in columns:
            base_set = dictionary_mapping[manner] & dictionary_mapping[place]
            for x in sub_manners:
                base_set -= dictionary_mapping[x]
            voiced_set = base_set & dictionary_mapping["voiced"]
            voiceless_set = base_set & dictionary_mapping["voiceless"]
            other_set = base_set - dictionary_mapping["voiceless"] - dictionary_mapping["voiced"]
            plotted.update(voiceless_set)
            plotted.update(voiced_set)
            plotted.update(other_set)
            cell_data = {
                "voiceless": sorted(voiceless_set),
                "voiced": sorted(voiced_set),
                "other": sorted(other_set),
            }
            cell_contents = format_ipa_cell(cell_data, extra_data, base_indent="  ")
            row.append(cell_contents)
        rows.append(row)
|
|
        if realized_submanner_rows:
            for place in columns:
                for sub_manner in realized_submanner_rows.keys():
                    cell_set = (
                        dictionary_mapping[manner]
                        & dictionary_mapping[place]
                        & dictionary_mapping[sub_manner]
                    )
                    for s in realized_submanner_rows.keys():
                        if s == sub_manner:
                            continue
                        cell_set -= dictionary_mapping[s]
                    voiced_set = cell_set & dictionary_mapping["voiced"]
                    voiceless_set = cell_set & dictionary_mapping["voiceless"]
                    other_set = (
                        cell_set - dictionary_mapping["voiceless"] - dictionary_mapping["voiced"]
                    )
                    plotted.update(voiceless_set)
                    plotted.update(voiced_set)
                    plotted.update(other_set)
                    cell_data = {
                        "voiceless": sorted(voiceless_set),
                        "voiced": sorted(voiced_set),
                        "other": sorted(other_set),
                    }
                    cell_contents = format_ipa_cell(cell_data, extra_data, base_indent="  ")
                    realized_submanner_rows[sub_manner].append(cell_contents)
            rows.extend(realized_submanner_rows.values())
    row_headers = ["Manner"]
    columns = row_headers + columns
    consonants = {"header": columns, "rows": rows}
|
|
|
|
|
    oral_rows = []
    nasal_rows = []
    headers = ["front", "near-front", "central", "near-back", "back"]
    has_nasal = False
    for height in ["close", "close-mid", "open-mid", "open"]:
        for on in ["nasalized", "oral"]:
            main_row = [height.title()]
            lax_row = [""]
            for column in headers:
                cell_set = dictionary_mapping[height] & dictionary_mapping[column]
                if on in dictionary_mapping:
                    cell_set &= dictionary_mapping["nasalized"]
                    if cell_set and not has_nasal:
                        has_nasal = True
                else:
                    cell_set -= dictionary_mapping["nasalized"]
                if height == "close" and column in {"front", "back"}:
                    lax_set = set()
                    tense_set = cell_set - dictionary_mapping["lax"]
                elif height == "close" and column in {"near-front", "near-back"}:
                    tense_set = set()
                    lax_set = cell_set & dictionary_mapping["lax"]
                else:
                    tense_set = cell_set - dictionary_mapping["lax"]
                    lax_set = cell_set & dictionary_mapping["lax"]

                tense_rounded = tense_set & dictionary_mapping["rounded"]
                tense_unrounded = tense_set & dictionary_mapping["unrounded"]
                cell_data = {
                    "unrounded": sorted(tense_unrounded),
                    "rounded": sorted(tense_rounded),
                }
                plotted.update(tense_unrounded)
                plotted.update(tense_rounded)
                tense_cell_contents = format_ipa_cell(cell_data, extra_data, base_indent="  ")

                lax_rounded = lax_set & dictionary_mapping["rounded"]
                lax_unrounded = lax_set & dictionary_mapping["unrounded"]
                plotted.update(lax_rounded)
                plotted.update(lax_unrounded)
                cell_data = {
                    "unrounded": sorted(lax_unrounded),
                    "rounded": sorted(lax_rounded),
                }
                lax_cell_contents = format_ipa_cell(cell_data, extra_data, base_indent="  ")

                main_row.append(tense_cell_contents)
                lax_row.append(lax_cell_contents)
            if on in dictionary_mapping:
                nasal_rows.append(main_row)
                if height != "open":
                    nasal_rows.append(lax_row)
            else:
                oral_rows.append(main_row)
                if height != "open":
                    oral_rows.append(lax_row)

    headers = [""] + [x.title() for x in headers]
    if not has_nasal:
        nasal_rows = None
|
|
|
|
|
    header_row_string = "\n  - ".join(x.title() for x in consonants["header"])
    row_strings = "\n* - ".join("\n  - ".join(x) for x in consonants["rows"])
    stub_column_count = 1
    consonant_chart = chart_template.format(
        header_data=header_row_string,
        row_data=row_strings,
        type="consonant",
        stub_column_count=stub_column_count,
    )
    vowels = {
        "oral_rows": oral_rows,
        "nasal_rows": nasal_rows,
        "header": headers,
    }
    header_row_string = "\n  - ".join(vowels["header"])
    row_strings = "\n* - ".join("\n  - ".join(x) for x in vowels["oral_rows"])

    oral_chart = chart_template.format(
        header_data=header_row_string, row_data=row_strings, type="vowel", stub_column_count=1
    )
    nasal_chart = None
    if nasal_rows:
        header_row_string = "\n  - ".join(vowels["header"])
        row_strings = "\n* - ".join("\n  - ".join(x) for x in vowels["nasal_rows"])

        nasal_chart = chart_template.format(
            header_data=header_row_string, row_data=row_strings, type="vowel", stub_column_count=1
        )

    data = {
        "consonant_chart": consonant_chart,
        "oral_vowel_chart": oral_chart,
        "nasal_vowel_chart": nasal_chart,
        "diphthongs": dictionary_mapping["diphthong"],
        "other": dictionary_mapping["other"] & (total_phones - plotted),
        "triphthongs": dictionary_mapping["triphthong"],
    }
    for k in super_segmentals.keys():
        if k in dictionary_mapping:
            data[k] = dictionary_mapping[k]
    return data
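# analyze_dictionary returns the rendered consonant and vowel charts plus the
# phone sets (diphthongs, triphthongs, tones/stress, unplotted leftovers) that
# extract_doc_card_fields appends as extra sections on dictionary docs pages.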
|
|
|
|
|
|
|
|
phone_charts = {}
model_mappings = {}
for model_type, model_class in MODEL_TYPES.items():

    meta_datas[model_type] = {}
    model_mappings[model_type] = {}
    model_directory = os.path.join(mfa_model_root, model_type)
    staging_directory = os.path.join(model_directory, "staging")

    models_to_stage = os.listdir(staging_directory)
    for file_name in models_to_stage:
        if not os.path.isfile(os.path.join(staging_directory, file_name)):
            continue
        if model_type == "dictionary" and not file_name.endswith(".dict"):
            continue
        print(file_name)
        model = model_class(os.path.join(staging_directory, file_name))
        print(model.meta)
        s = model.name.split("_")
        dialect = ""
        if model_type == "language_model":
            if "_mfa" in model.name:
                s = model.name.replace("_mfa", "").split("_")
            language = "_".join(s[:-1])
            dialect = " ".join(s[1:-1])
            phone_set = "MFA"
        elif model_type == "ivector":
            language = model.name.replace("_mfa", "")
            dialect = ""
            phone_set = ""
        elif model_type == "tokenizer":
            language = model.name.replace("_mfa", "")
            dialect = ""
            phone_set = ""
        elif len(s) == 1:
            language = s[0]
            phone_set = "Unknown"
            dialect = ""
        elif len(s) == 2:
            language, phone_set = s
            phone_set = phone_set.upper()
            dialect = ""
        else:
            language = s[0]
            phone_set = s[-1].upper()
            dialect = " ".join(s[1:-1])
        try:
            version = model.meta["version"]
        except KeyError:
            version = montreal_forced_aligner.utils.get_mfa_version()
        if version.startswith("2.") or version.startswith("3."):
            version = CURRENT_MODEL_VERSION
        language = language.title()
        if len(dialect) == 2:
            dialect = dialect.upper()
        else:
            dialect = dialect.title()
        print(model_directory, language, phone_set, version)
        if dialect:
            phone_set_folder = f"{dialect}_{phone_set}".replace(" ", "_").lower()
        else:
            phone_set_folder = phone_set.lower()
        if phone_set_folder:
            output_directory = os.path.join(
                model_directory, language.lower(), phone_set_folder, f"v{version}"
            )
        else:
            output_directory = os.path.join(model_directory, language.lower(), f"v{version}")
        os.makedirs(output_directory, exist_ok=True)
        license_path = os.path.join(output_directory, "LICENSE")
        if phone_set != "CV" and not os.path.exists(license_path):
            shutil.copyfile(os.path.join(mfa_model_root, "LICENSE"), license_path)
        meta_path = os.path.join(output_directory, "meta.json")
        if OVERWRITE_METADATA or not os.path.exists(meta_path):
            meta_data = generate_meta_data(model, model_type, language, dialect, version, phone_set)
            with open(meta_path, "w", encoding="utf8") as f:
                json.dump(meta_data, f, indent=4, ensure_ascii=False)
        else:
            with open(meta_path, "r", encoding="utf8") as f:
                meta_data = json.load(f)
        meta_datas[model_type][generate_id(meta_data, model_type)] = meta_data
        keys = [language]
        if model_type in {"language_model", "ivector", "tokenizer"}:
            if dialect:
                keys.append((language, dialect))
                key = (language, dialect)
        else:
            if dialect:
                keys.append((language, dialect))
                keys.append((language, dialect, phone_set))
                key = (language, dialect, phone_set)
                dialect_key = (language, dialect)
            else:
                keys.append((language, phone_set))
        for key in keys:
            if key not in model_mappings[model_type]:
                model_mappings[model_type][key] = []
            model_mappings[model_type][key].append(generate_id(meta_data, model_type))
        if model_type == "dictionary" and phone_set in {"MFA", "CV", "ARPA"}:
            phone_set_type = "IPA"
            if phone_set == "ARPA":
                phone_set_type = "ARPA"
            phone_charts[meta_data["name"]] = analyze_dictionary(
                model.path, model.name, phone_set_type
            )
|
|
|
|
|
|
|
|
    existing_models = []
    for language in os.listdir(model_directory):
        if language in {"staging", "training", "filter_lists", "1.0"}:
            continue
        language_directory = os.path.join(model_directory, language)
        if not os.path.isdir(language_directory):
            continue
        language = language.title()
        if model_type in {"ivector", "tokenizer"}:
            for version in os.listdir(language_directory):
                meta_path = os.path.join(language_directory, version, "meta.json")
                if not os.path.exists(meta_path):
                    continue
                with open(meta_path, "r", encoding="utf8") as f:
                    meta_data = json.load(f)
                meta_datas[model_type][generate_id(meta_data, model_type)] = meta_data
                keys = [language]
                for key in keys:
                    if key not in model_mappings[model_type]:
                        model_mappings[model_type][key] = []
                    model_mappings[model_type][key].append(generate_id(meta_data, model_type))

        else:
            for phone_set in os.listdir(language_directory):
                print(phone_set)
                phone_set_dir = os.path.join(language_directory, phone_set)
                if "_" in phone_set:
                    dialect, phone_set = phone_set.rsplit("_", maxsplit=1)
                else:
                    dialect = ""
                for version in os.listdir(phone_set_dir):
                    meta_path = os.path.join(phone_set_dir, version, "meta.json")
                    if not os.path.exists(meta_path):
                        continue
                    with open(meta_path, "r", encoding="utf8") as f:
                        meta_data = json.load(f)
                    meta_datas[model_type][generate_id(meta_data, model_type)] = meta_data
                    keys = [language]
                    if model_type == "language_model":
                        if dialect:
                            keys.append((language, dialect))
                            key = (language, dialect)
                    else:
                        if dialect:
                            keys.append((language, dialect))
                            keys.append((language, dialect, phone_set))
                            key = (language, dialect, phone_set)
                            dialect_key = (language, dialect)
                        else:
                            keys.append((language, phone_set))
                    for key in keys:
                        if key not in model_mappings[model_type]:
                            model_mappings[model_type][key] = []
                        model_mappings[model_type][key].append(generate_id(meta_data, model_type))
|
|
|
|
|
|
|
|
if "dictionary" in meta_datas: |
|
|
for k in model_corpus_mapping.keys(): |
|
|
dict_id = k.replace("acoutic model", "dictionary") |
|
|
if dict_id in meta_datas["dictionary"]: |
|
|
model_dictionary_mapping[k] = [dict_id] |
|
|
|
|
|
if "g2p" in meta_datas: |
|
|
for v in model_corpus_mapping.values(): |
|
|
for d_id in v: |
|
|
g2p_id = d_id.replace("dictionary", "G2P model") |
|
|
if g2p_id in meta_datas["g2p"]: |
|
|
model_dictionary_mapping[g2p_id] = [d_id] |
|
|
|
|
|
|
|
|
if "language_model" in meta_datas: |
|
|
for k, v in model_dictionary_mapping.items(): |
|
|
lm_id = k.replace("acoustic", "language") |
|
|
if lm_id in meta_datas["language_model"]: |
|
|
model_dictionary_mapping[lm_id] = v |
|
|
|
|
|
|
|
|
if "tokenizer" in meta_datas: |
|
|
for k, v in model_dictionary_mapping.items(): |
|
|
tokenizer_id = k.replace("tokenizer", "language") |
|
|
if tokenizer_id in meta_datas["tokenizer"]: |
|
|
model_dictionary_mapping[tokenizer_id] = v |
|
|
|
|
|
|
|
|
corpora_metadata = {}
model_mappings["corpus"] = {}
corpus_metadata_file = os.path.join(mfa_model_root, "corpus", "staging", "corpus_data.json")
if os.path.exists(corpus_metadata_file):
    with open(corpus_metadata_file, "r", encoding="utf8") as f:
        data = json.load(f)
    for language, c_list in data.items():
        if language == "Hindi-Urdu":
            continue
        for c in c_list:
            name = c["name"]
            if "version" in c:
                name += f'_{c["version"]}'
            id = make_path_safe(name)
            c["language"] = language
            c["id"] = generate_id(c, "corpus")

            c["license_link"] = f"[{c['license']}]({license_links[c['license']]})"
            if "dialects" not in c:
                c["dialects"] = []
            c["dialects"] = [x.title() if len(x) > 2 else x.upper() for x in c["dialects"]]
            corpora_metadata[c["id"]] = c
            print(c)
            print(generate_id(c, "corpus"))
            language_key = language
            if language_key not in model_mappings["corpus"]:
                model_mappings["corpus"][language_key] = []
            model_mappings["corpus"][language_key].append(c["id"])
            if c["dialects"]:
                for d in c["dialects"]:
                    key = (language, d)
                    if key not in model_mappings["corpus"]:
                        model_mappings["corpus"][key] = []
                    model_mappings["corpus"][key].append(c["id"])

meta_datas["corpus"] = corpora_metadata
|
|
|
|
|
|
|
|
for model_type, data in meta_datas.items():
    for model_name, meta_data in data.items():
        model_id = generate_id(meta_data, model_type)
        if model_type in {"acoustic", "language_model", "ivector", "tokenizer"}:
            print("HELLO!?", model_id, model_corpus_mapping.keys())
            if model_id in model_corpus_mapping:
                print(model_corpus_mapping[model_id])
                print(corpora_metadata.keys())
                meta_data["corpus"] = [corpora_metadata[x] for x in model_corpus_mapping[model_id]]
                for corpus_id in model_corpus_mapping[model_id]:
                    if model_type not in meta_datas["corpus"][corpus_id]:
                        meta_datas["corpus"][corpus_id][model_type] = []
                    meta_datas["corpus"][corpus_id][model_type].append(model_id)
        if model_type in {"language_model", "corpus", "ivector"}:
            if "dialect" in meta_data and meta_data["dialect"]:
                key = (meta_data["language"], meta_data["dialect"])
            else:
                key = meta_data["language"]
        else:
            if "dialect" in meta_data and meta_data["dialect"]:
                key = (meta_data["language"], meta_data["dialect"], meta_data["phone_set"])
            else:
                key = (meta_data["language"], meta_data["phone_set"])
        if model_type in {"acoustic", "language_model", "g2p"}:
            print(meta_data["name"])
            print(key)
            print(model_mappings["dictionary"])
            if key in model_mappings["dictionary"]:
                if "dictionary" not in meta_data:
                    meta_data["dictionary"] = []
                meta_data["dictionary"].extend(model_mappings["dictionary"][key])
            if model_id in model_dictionary_mapping:
                if "dictionary" not in meta_data:
                    meta_data["dictionary"] = []
                meta_data["dictionary"].extend(
                    [
                        x
                        for x in model_dictionary_mapping[model_id]
                        if x not in meta_data["dictionary"]
                    ]
                )
        elif model_type == "dictionary":
            for t in ["acoustic", "g2p", "language_model", "corpus"]:
                if key in model_mappings[t]:
                    if t not in meta_data:
                        meta_data[t] = []
                    meta_data[t].extend(model_mappings[t][key])
        elif model_type == "corpus":
            meta_data["dictionary"] = []
            if "dialects" in meta_data and meta_data["dialects"]:
                for dialect in meta_data["dialects"]:
                    key = (meta_data["language"], dialect)
                    if key in model_mappings["dictionary"]:
                        meta_data["dictionary"].extend(model_mappings["dictionary"][key])
            else:
                print(
                    meta_data["language"],
                    model_mappings["dictionary"],
                    meta_data["language"] in model_mappings["dictionary"],
                )
                if meta_data["language"] in model_mappings["dictionary"]:
                    for dictionary_id in model_mappings["dictionary"][meta_data["language"]]:
                        m = meta_datas["dictionary"][dictionary_id]
                        meta_data["dictionary"].append(dictionary_id)
|
|
|
|
|
for model_type, data in meta_datas.items():
    docs_dir = os.path.join(mfa_model_root, "docs", "source", model_type)
    os.makedirs(docs_dir, exist_ok=True)
    language_model_doc_mds = {}
    for model_name, meta_data in data.items():
        print(model_name, meta_data)
        if model_type not in {"language_model", "corpus"} and meta_data["phone_set"] in {
            "PROSODYLAB",
            "PINYIN",
        }:
            model_card_template = model_card_templates[model_type]["other"]
            docs_md_template = docs_card_templates[model_type]["other"]
        elif model_type not in {"language_model", "corpus"} and meta_data["phone_set"] in {"CV"}:
            model_card_template = model_card_templates[model_type]["other"]
            docs_md_template = docs_card_templates[model_type]["mfa"]
        else:
            model_card_template = model_card_templates[model_type]["mfa"]
            docs_md_template = docs_card_templates[model_type]["mfa"]
        if model_type == "language_model":
            language, version = meta_data["language"], meta_data["version"]
        elif model_type == "corpus":
            language, name = meta_data["language"], meta_data["name"]
            name = make_path_safe(name)
        else:
            language, phone_set, dialect, version = (
                meta_data["language"],
                meta_data["phone_set"],
                meta_data["dialect"],
                meta_data["version"],
            )
        output_directory = get_model_card_directory(model_type, meta_data)
        os.makedirs(output_directory, exist_ok=True)
        model_card_path = os.path.join(output_directory, "README.md")
        rst_path = model_name + ".md"
        docs_language_dir = os.path.join(docs_dir, language)
        if language not in language_model_doc_mds:
            language_model_doc_mds[language] = []
        os.makedirs(docs_language_dir, exist_ok=True)
        docs_card_path = os.path.join(docs_language_dir, rst_path)
        language_model_doc_mds[language].append(rst_path)
        if OVERWRITE_MD or not os.path.exists(model_card_path):
            with open(model_card_path, "w", encoding="utf8") as f:
                print(meta_data)
                fields = extract_model_card_fields(meta_data, model_type)
                f.write(model_card_template.format(**fields))
        if OVERWRITE_MD or not os.path.exists(docs_card_path):
            with open(docs_card_path, "w", encoding="utf8") as f:
                print(meta_data)
                fields = extract_doc_card_fields(meta_data, model_type)
                f.write(docs_md_template.format(**fields))
    index_path = os.path.join(docs_dir, "index.rst")
    rst_string = "    " + "\n    ".join(
        f"{x}/index.rst" for x in sorted(language_model_doc_mds.keys())
    )
    if model_type == "dictionary":
        rst_string = "    ../mfa_phone_set.md\n" + rst_string
    model_type_name = model_type_names[model_type]
    columns = model_type_columns[model_type]
    widths = model_type_column_widths[model_type]
    with open(index_path, "w", encoding="utf8") as f:
        f.write(
            f"""
.. _{model_type}:

{model_type_name}
{'=' * len(model_type_name)}

.. needtable::
   :types: {model_type}
   :style: datatable
   :columns: {columns}
   :class: table-striped
   :colwidths: {widths}

.. toctree::
   :hidden:

{rst_string}
"""
        )
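    # For example, the generated docs/source/acoustic/index.rst contains a
    # needtable with :columns: ID;language;dialect;phoneset;license plus a hidden
    # toctree listing one <language>/index.rst per language that has models.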
|
|
    for language, model_doc_mds in sorted(language_model_doc_mds.items()):
        index_path = os.path.join(docs_dir, language, "index.rst")
        rst_string = "    " + "\n    ".join(model_doc_mds)
        with open(index_path, "w", encoding="utf8") as f:
            f.write(
                f"""
.. _{model_type}_{language.lower()}:

{language.title()}
{'=' * len(language)}

.. needtable::
   :types: {model_type}
   :filter: language == "{language.title()}"
   :style: datatable
   :columns: {columns}
   :class: table-striped
   :colwidths: {widths}

.. toctree::
   :hidden:

{rst_string}
"""
            )
|
|
|