import os

from trainer import Trainer, TrainerArgs

from TTS.tts.configs.shared_configs import BaseDatasetConfig, CharactersConfig
from TTS.tts.configs.vits_config import VitsConfig
from TTS.tts.datasets import load_tts_samples
from TTS.tts.models.vits import Vits, VitsAudioConfig
from TTS.tts.utils.text.tokenizer import TTSTokenizer
from TTS.utils.audio import AudioProcessor

output_path = os.path.dirname(os.path.abspath(__file__))

RESTORE_PATH = "/home/azureuser/BanglaTTS/nctb-vits-single-female-1/checkpoint.pth"
SPEAKER_ID = 9
SPEAKER_GENDER = "male"
meta_file = f"/home/azureuser/BanglaTTS/nctb-audiobook-no-numbers/{SPEAKER_GENDER}/SP_{SPEAKER_ID}/metadata.txt"
root_path = f"/home/azureuser/BanglaTTS/nctb-audiobook-no-numbers/{SPEAKER_GENDER}/SP_{SPEAKER_ID}"


def formatter(root_path, meta_file, **kwargs):  # pylint: disable=unused-argument
    """Normalize the pipe-separated metadata file (LJSpeech-style,
    https://keithito.com/LJ-Speech-Dataset/) to the TTS sample format."""
    items = []
    speaker_name = f"nctb_{SPEAKER_GENDER}_{SPEAKER_ID}"
    with open(meta_file, "r", encoding="utf-8") as ttf:
        for line in ttf:
            cols = line.split("|")
            if len(cols) < 2:
                # Skip malformed lines instead of reusing the previous text.
                print(f"Skipping malformed metadata line: {line.strip()!r}")
                continue
            wav_file = os.path.join(root_path, "audio", cols[0])
            text = cols[1].strip()
            items.append({"text": text, "audio_file": wav_file, "speaker_name": speaker_name, "root_path": root_path})
    return items


dataset_config = BaseDatasetConfig(
    meta_file_train=meta_file,
    path=os.path.join(root_path, ""),
)

characters_config = CharactersConfig(
    pad="",
    eos="",  # e.g. "।" to close every sequence on the Bangla full stop
    bos="",
    blank="",
    phonemes=None,
    characters="abcdefghijklmnopqrstuvwxyz0123456789+=/*√তট৫ভিঐঋখঊড়ইজমএেঘঙসীঢ়হঞ‘ঈকণ৬ঁৗশঢঠ\u200c১্২৮দৃঔগও—ছউংবৈঝাযফ\u200dচরষঅৌৎথড়৪ধ০ুূ৩আঃপয়’'”^নলো_…ৰ",
    # Alternative set including IPA symbols, for phoneme-based training:
    # characters="তট৫ভিঐঋখঊড়ইজমএেঘঙসীঢ়হঞ‘ঈকণ৬ঁৗশঢঠ\u200c১্২৮দৃঔগও—ছউংবৈঝাযফ\u200dচরষঅৌৎথড়৪ধ০ুূ৩আঃপয়’নলোˌamɾʃˈonbŋlitjʰɔdkpeɟːfɡuhrʈæsʒɖwəc",
    punctuations="-–:;!,|.?॥। “",
)

audio_config = VitsAudioConfig(
    sample_rate=16000,
    win_length=1024,
    hop_length=256,
    num_mels=80,
    mel_fmin=0,
    mel_fmax=None,
)

# VitsConfig: all model-related values for training, validation, and testing.
config = VitsConfig(
    audio=audio_config,
    run_name="vits-ft-nctb",
    batch_size=48,
    eval_batch_size=8,
    batch_group_size=5,
    num_loader_workers=8,
    num_eval_loader_workers=4,
    run_eval=True,
    test_delay_epochs=-1,
    epochs=35,  # short run for testing
    text_cleaner="multilingual_cleaners",  # alternatives: "phoneme_cleaners", "collapse_whitespace"
    use_phonemes=False,
    # To train on phonemes instead of raw characters:
    # use_phonemes=True,
    # phonemizer="bn_phonemizer",  # or multi_phonemizer
    # phoneme_language="bn",
    # phoneme_cache_path=os.path.join(output_path, "phoneme_cache"),
    compute_input_seq_cache=True,
    add_blank=True,
    use_language_weighted_sampler=True,
    print_step=500,
    print_eval=False,
    mixed_precision=True,
    output_path=output_path,
    datasets=[dataset_config],
    characters=characters_config,
    save_step=1000,
    cudnn_benchmark=True,
    # dashboard_logger="wandb",
    test_sentences=[
        ["আমরা বাংলায় ওয়েব ডেভেলপমেন্ট নিয়ে কাজ করতে গিয়ে প্রথম যে সমস্যাটার মুখোমুখি হই, সেটা হলো, বাংলা ডেমো টেক্সট"],
        ["আমি বাঙালি ভাষায় কথা বলতে পারি।"],
        ["আমরা প্রকৃতি কে ভালোবাসি।"],
        ["আপনি কেমন আছেন?"],
    ],
)
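# Optional pre-flight check (a sketch added for illustration, not part of the
# original pipeline): flag transcript characters that `characters_config` does
# not cover, since the tokenizer discards symbols it does not know and any
# gap would silently drop text at training time.
known_chars = set(characters_config.characters) | set(characters_config.punctuations)
unknown_chars = set()
with open(meta_file, "r", encoding="utf-8") as f:
    for line in f:
        cols = line.split("|")
        if len(cols) > 1:
            unknown_chars.update(ch for ch in cols[1].strip() if ch not in known_chars)
if unknown_chars:
    print(f"Characters missing from characters_config: {sorted(unknown_chars)}")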
# INITIALIZE THE AUDIO PROCESSOR
# The audio processor handles feature extraction and audio I/O.
# It mainly serves the dataloader and the training loggers.
ap = AudioProcessor.init_from_config(config)

# INITIALIZE THE TOKENIZER
# The tokenizer converts text to sequences of token IDs.
# config is updated with the default characters if they are not defined in the config.
tokenizer, config = TTSTokenizer.init_from_config(config)

# LOAD DATA SAMPLES
# Each sample is a list of ```[text, audio_file_path, speaker_name]```
# You can define a custom sample loader returning the list of samples,
# or define a custom formatter and pass it to `load_tts_samples`.
# Check `TTS.tts.datasets.load_tts_samples` for more details.
train_samples, eval_samples = load_tts_samples(
    dataset_config,
    formatter=formatter,
    eval_split=True,
    eval_split_max_size=config.eval_split_max_size,
    eval_split_size=config.eval_split_size,
)

# init model
model = Vits(config, ap, tokenizer, speaker_manager=None)

# init the trainer and 🚀
trainer = Trainer(
    TrainerArgs(restore_path=RESTORE_PATH),
    config,
    output_path,
    model=model,
    train_samples=train_samples,
    eval_samples=eval_samples,
)
trainer.fit()
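# Post-training usage (a minimal sketch, assuming Coqui's Synthesizer API; the
# checkpoint and config paths depend on the run folder the Trainer creates
# under `output_path`, so the values passed in are placeholders, not paths
# from the original script).
def synthesize_sample(checkpoint_path, config_path, text, out_path="sample.wav"):
    """Load a fine-tuned checkpoint and write one synthesized utterance to disk."""
    from TTS.utils.synthesizer import Synthesizer

    synthesizer = Synthesizer(tts_checkpoint=checkpoint_path, tts_config_path=config_path)
    wav = synthesizer.tts(text)
    synthesizer.save_wav(wav, out_path)

# Example call once training has produced a checkpoint, e.g.:
# synthesize_sample(
#     os.path.join(output_path, "vits-ft-nctb-<run date>", "best_model.pth"),
#     os.path.join(output_path, "vits-ft-nctb-<run date>", "config.json"),
#     "আপনি কেমন আছেন?",
# )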