{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Hindi text-to-speech with AI4Bharat Indic-TTS\n",
    "\n",
    "Clones [AI4Bharat/Indic-TTS](https://github.com/AI4Bharat/Indic-TTS), installs the modified `Trainer` and `TTS` forks, synthesizes one Hindi sentence with the FastPitch + HiFi-GAN checkpoints, and plays the result inline.\n",
    "\n",
    "**Prerequisite:** this notebook does not download any checkpoints. Place the Hindi models under `Indic-TTS/models/v1/hi/` before running the synthesis section.\n",
    "\n",
    "The notebook is meant to be run top to bottom (Restart Kernel → Run All)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import shlex\n",
    "import sys\n",
    "from pathlib import Path\n",
    "\n",
    "from IPython.display import Audio"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# No cell calls `%cd`: every path is absolute and each shell step does its own\n",
    "# `cd` inside the `!` command, so re-running cells never nests directories.\n",
    "WORKSPACE_DIR = Path.cwd()\n",
    "REPO_URL = \"https://github.com/AI4Bharat/Indic-TTS\"\n",
    "REPO_DIR = WORKSPACE_DIR / \"Indic-TTS\"\n",
    "\n",
    "# Modified coqui-ai forks; installed in this order (Trainer, then TTS).\n",
    "FORK_URLS = {\n",
    "    \"Trainer\": \"https://github.com/gokulkarthik/Trainer\",\n",
    "    \"TTS\": \"https://github.com/gokulkarthik/TTS\",\n",
    "}\n",
    "\n",
    "LANG_CODE = \"hi\"\n",
    "SPEAKER = \"female\"\n",
    "TEXT = \"बधाई हो! स्पीच जनरेशन मॉडल चलने लगा।\"\n",
    "\n",
    "# Checkpoint locations, relative to REPO_DIR (synthesis runs from there).\n",
    "FASTPITCH_DIR = f\"models/v1/{LANG_CODE}/fastpitch\"\n",
    "HIFIGAN_DIR = f\"models/v1/{LANG_CODE}/hifigan\"\n",
    "\n",
    "# The synthesizer writes WAV data, so name the file accordingly; an `.mp4`\n",
    "# suffix makes `IPython.display.Audio` guess the wrong MIME type.\n",
    "OUTPUT_WAV = REPO_DIR / \"output.wav\"\n",
    "\n",
    "# Shell-quoted values for safe interpolation into `!` commands.\n",
    "PYTHON = shlex.quote(sys.executable)  # run pip/TTS with the kernel's interpreter\n",
    "REPO_DIR_ARG = shlex.quote(str(REPO_DIR))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Setup\n",
    "\n",
    "Clone the repository and install its dependencies. Existing checkouts are reused, so this cell is safe to re-run."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "if not REPO_DIR.exists():\n",
    "    !git clone -q {REPO_URL} {REPO_DIR_ARG}\n",
    "print(\"[1/4] Cloned repository.\")\n",
    "\n",
    "# Install the modified coqui-ai/Trainer and coqui-ai/TTS forks in editable mode.\n",
    "for step, (fork_name, fork_url) in enumerate(FORK_URLS.items(), start=2):\n",
    "    fork_dir = REPO_DIR / fork_name\n",
    "    fork_dir_arg = shlex.quote(str(fork_dir))\n",
    "    if not fork_dir.exists():\n",
    "        !git clone -q {fork_url} {fork_dir_arg}\n",
    "    !cd {fork_dir_arg} && {PYTHON} -m pip install -q -e \".[all]\"\n",
    "    print(f\"[{step}/4] Installed {fork_name}.\")\n",
    "\n",
    "# Install the remaining dependencies listed by Indic-TTS itself.\n",
    "!cd {REPO_DIR_ARG} && {PYTHON} -m pip install -q -r requirements.txt\n",
    "print(\"[4/4] Installed requirements.\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Confirm both forks are visible to the interpreter that will run synthesis.\n",
    "!{PYTHON} -m pip list | grep -i -E \"^(tts|trainer) \""
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Synthesize speech"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Fail early with a readable message if the checkpoints were not put in place.\n",
    "required_files = [\n",
    "    REPO_DIR / FASTPITCH_DIR / \"best_model.pth\",\n",
    "    REPO_DIR / FASTPITCH_DIR / \"config.json\",\n",
    "    REPO_DIR / HIFIGAN_DIR / \"best_model.pth\",\n",
    "    REPO_DIR / HIFIGAN_DIR / \"config.json\",\n",
    "]\n",
    "missing_files = [str(path) for path in required_files if not path.is_file()]\n",
    "assert not missing_files, f\"Missing checkpoint files: {missing_files}\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "text_arg = shlex.quote(TEXT)\n",
    "speaker_arg = shlex.quote(SPEAKER)\n",
    "output_arg = shlex.quote(str(OUTPUT_WAV))\n",
    "\n",
    "# Run from REPO_DIR so the relative checkpoint paths resolve.\n",
    "!cd {REPO_DIR_ARG} && {PYTHON} -m TTS.bin.synthesize --text {text_arg} \\\n",
    "    --model_path {FASTPITCH_DIR}/best_model.pth \\\n",
    "    --config_path {FASTPITCH_DIR}/config.json \\\n",
    "    --vocoder_path {HIFIGAN_DIR}/best_model.pth \\\n",
    "    --vocoder_config_path {HIFIGAN_DIR}/config.json \\\n",
    "    --speaker_idx {speaker_arg} \\\n",
    "    --out_path {output_arg}"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Listen to the result"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "assert OUTPUT_WAV.is_file(), f\"Synthesis did not produce {OUTPUT_WAV}\"\n",
    "Audio(filename=str(OUTPUT_WAV))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "py37_env",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}