{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "#@title Choose English model { run: \"auto\" }\n",
    "lang = 'English'\n",
    "tag = 'training/espnet/egs2/ljspeech/tts1' #@param [\"kan-bayashi/ljspeech_tacotron2\", \"kan-bayashi/ljspeech_fastspeech\", \"kan-bayashi/ljspeech_fastspeech2\", \"kan-bayashi/ljspeech_conformer_fastspeech2\", \"kan-bayashi/ljspeech_joint_finetune_conformer_fastspeech2_hifigan\", \"kan-bayashi/ljspeech_joint_train_conformer_fastspeech2_hifigan\", \"kan-bayashi/ljspeech_vits\"] {type:\"string\"}\n",
    "vocoder_tag = \"none\" #@param [\"none\", \"parallel_wavegan/ljspeech_parallel_wavegan.v1\", \"parallel_wavegan/ljspeech_full_band_melgan.v2\", \"parallel_wavegan/ljspeech_multi_band_melgan.v2\", \"parallel_wavegan/ljspeech_hifigan.v1\", \"parallel_wavegan/ljspeech_style_melgan.v1\"] {type:\"string\"}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "ename": "FileNotFoundError",
     "evalue": "[Errno 2] No such file or directory: 'exp/tts_stats_raw_phn_tacotron_g2p_en_no_space/train/feats_stats.npz'",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mFileNotFoundError\u001b[0m                         Traceback (most recent call last)",
      "Cell \u001b[0;32mIn[7], line 4\u001b[0m\n\u001b[1;32m      1\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39mespnet2\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mbin\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mtts_inference\u001b[39;00m \u001b[39mimport\u001b[39;00m Text2Speech\n\u001b[1;32m      2\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39mespnet2\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mutils\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mtypes\u001b[39;00m \u001b[39mimport\u001b[39;00m str_or_none\n\u001b[0;32m----> 4\u001b[0m text2speech \u001b[39m=\u001b[39m Text2Speech(\n\u001b[1;32m      5\u001b[0m     train_config\u001b[39m=\u001b[39;49m\u001b[39m\"\u001b[39;49m\u001b[39m/home/robinhad/Projects/ukrainian-tts/training/espnet/egs2/ljspeech/tts1/exp/tts_train_raw_phn_tacotron_g2p_en_no_space/config.yaml\u001b[39;49m\u001b[39m\"\u001b[39;49m,\n\u001b[1;32m      6\u001b[0m     model_file\u001b[39m=\u001b[39;49m\u001b[39m\"\u001b[39;49m\u001b[39m/home/robinhad/Projects/ukrainian-tts/training/espnet/egs2/ljspeech/tts1/exp/tts_train_raw_phn_tacotron_g2p_en_no_space/checkpoint.pth\u001b[39;49m\u001b[39m\"\u001b[39;49m,\n\u001b[1;32m      7\u001b[0m     device\u001b[39m=\u001b[39;49m\u001b[39m\"\u001b[39;49m\u001b[39mcuda\u001b[39;49m\u001b[39m\"\u001b[39;49m,\n\u001b[1;32m      8\u001b[0m     \u001b[39m# Only for Tacotron 2 & Transformer\u001b[39;49;00m\n\u001b[1;32m      9\u001b[0m     threshold\u001b[39m=\u001b[39;49m\u001b[39m0.5\u001b[39;49m,\n\u001b[1;32m     10\u001b[0m     \u001b[39m# Only for Tacotron 2\u001b[39;49;00m\n\u001b[1;32m     11\u001b[0m     minlenratio\u001b[39m=\u001b[39;49m\u001b[39m0.0\u001b[39;49m,\n\u001b[1;32m     12\u001b[0m     maxlenratio\u001b[39m=\u001b[39;49m\u001b[39m10.0\u001b[39;49m,\n\u001b[1;32m     13\u001b[0m     use_att_constraint\u001b[39m=\u001b[39;49m\u001b[39mFalse\u001b[39;49;00m,\n\u001b[1;32m     14\u001b[0m     backward_window\u001b[39m=\u001b[39;49m\u001b[39m1\u001b[39;49m,\n\u001b[1;32m     15\u001b[0m     forward_window\u001b[39m=\u001b[39;49m\u001b[39m3\u001b[39;49m,\n\u001b[1;32m     16\u001b[0m     \u001b[39m# Only for FastSpeech & FastSpeech2 & VITS\u001b[39;49;00m\n\u001b[1;32m     17\u001b[0m     speed_control_alpha\u001b[39m=\u001b[39;49m\u001b[39m4\u001b[39;49m,\n\u001b[1;32m     18\u001b[0m     \u001b[39m# Only for VITS\u001b[39;49;00m\n\u001b[1;32m     19\u001b[0m     noise_scale\u001b[39m=\u001b[39;49m\u001b[39m0.333\u001b[39;49m,\n\u001b[1;32m     20\u001b[0m     noise_scale_dur\u001b[39m=\u001b[39;49m\u001b[39m0.333\u001b[39;49m,\n\u001b[1;32m     21\u001b[0m )\n",
      "File \u001b[0;32m~/Projects/ukrainian-tts/training/espnet/espnet2/bin/tts_inference.py:92\u001b[0m, in \u001b[0;36mText2Speech.__init__\u001b[0;34m(self, train_config, model_file, threshold, minlenratio, maxlenratio, use_teacher_forcing, use_att_constraint, backward_window, forward_window, speed_control_alpha, noise_scale, noise_scale_dur, vocoder_config, vocoder_file, dtype, device, seed, always_fix_seed, prefer_normalized_feats)\u001b[0m\n\u001b[1;32m     89\u001b[0m \u001b[39massert\u001b[39;00m check_argument_types()\n\u001b[1;32m     91\u001b[0m \u001b[39m# setup model\u001b[39;00m\n\u001b[0;32m---> 92\u001b[0m model, train_args \u001b[39m=\u001b[39m TTSTask\u001b[39m.\u001b[39;49mbuild_model_from_file(\n\u001b[1;32m     93\u001b[0m     train_config, model_file, device\n\u001b[1;32m     94\u001b[0m )\n\u001b[1;32m     95\u001b[0m model\u001b[39m.\u001b[39mto(dtype\u001b[39m=\u001b[39m\u001b[39mgetattr\u001b[39m(torch, dtype))\u001b[39m.\u001b[39meval()\n\u001b[1;32m     96\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mdevice \u001b[39m=\u001b[39m device\n",
      "File \u001b[0;32m~/Projects/ukrainian-tts/training/espnet/espnet2/tasks/abs_task.py:1822\u001b[0m, in \u001b[0;36mAbsTask.build_model_from_file\u001b[0;34m(cls, config_file, model_file, device)\u001b[0m\n\u001b[1;32m   1820\u001b[0m     args \u001b[39m=\u001b[39m yaml\u001b[39m.\u001b[39msafe_load(f)\n\u001b[1;32m   1821\u001b[0m args \u001b[39m=\u001b[39m argparse\u001b[39m.\u001b[39mNamespace(\u001b[39m*\u001b[39m\u001b[39m*\u001b[39margs)\n\u001b[0;32m-> 1822\u001b[0m model \u001b[39m=\u001b[39m \u001b[39mcls\u001b[39;49m\u001b[39m.\u001b[39;49mbuild_model(args)\n\u001b[1;32m   1823\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mnot\u001b[39;00m \u001b[39misinstance\u001b[39m(model, AbsESPnetModel):\n\u001b[1;32m   1824\u001b[0m     \u001b[39mraise\u001b[39;00m \u001b[39mRuntimeError\u001b[39;00m(\n\u001b[1;32m   1825\u001b[0m         \u001b[39mf\u001b[39m\u001b[39m\"\u001b[39m\u001b[39mmodel must inherit \u001b[39m\u001b[39m{\u001b[39;00mAbsESPnetModel\u001b[39m.\u001b[39m\u001b[39m__name__\u001b[39m\u001b[39m}\u001b[39;00m\u001b[39m, but got \u001b[39m\u001b[39m{\u001b[39;00m\u001b[39mtype\u001b[39m(model)\u001b[39m}\u001b[39;00m\u001b[39m\"\u001b[39m\n\u001b[1;32m   1826\u001b[0m     )\n",
      "File \u001b[0;32m~/Projects/ukrainian-tts/training/espnet/espnet2/tasks/tts.py:309\u001b[0m, in \u001b[0;36mTTSTask.build_model\u001b[0;34m(cls, args)\u001b[0m\n\u001b[1;32m    307\u001b[0m \u001b[39mif\u001b[39;00m args\u001b[39m.\u001b[39mnormalize \u001b[39mis\u001b[39;00m \u001b[39mnot\u001b[39;00m \u001b[39mNone\u001b[39;00m:\n\u001b[1;32m    308\u001b[0m     normalize_class \u001b[39m=\u001b[39m normalize_choices\u001b[39m.\u001b[39mget_class(args\u001b[39m.\u001b[39mnormalize)\n\u001b[0;32m--> 309\u001b[0m     normalize \u001b[39m=\u001b[39m normalize_class(\u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49margs\u001b[39m.\u001b[39;49mnormalize_conf)\n\u001b[1;32m    310\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[1;32m    311\u001b[0m     normalize \u001b[39m=\u001b[39m \u001b[39mNone\u001b[39;00m\n",
      "File \u001b[0;32m~/Projects/ukrainian-tts/training/espnet/espnet2/layers/global_mvn.py:40\u001b[0m, in \u001b[0;36mGlobalMVN.__init__\u001b[0;34m(self, stats_file, norm_means, norm_vars, eps)\u001b[0m\n\u001b[1;32m     37\u001b[0m stats_file \u001b[39m=\u001b[39m Path(stats_file)\n\u001b[1;32m     39\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mstats_file \u001b[39m=\u001b[39m stats_file\n\u001b[0;32m---> 40\u001b[0m stats \u001b[39m=\u001b[39m np\u001b[39m.\u001b[39;49mload(stats_file)\n\u001b[1;32m     41\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39misinstance\u001b[39m(stats, np\u001b[39m.\u001b[39mndarray):\n\u001b[1;32m     42\u001b[0m     \u001b[39m# Kaldi like stats\u001b[39;00m\n\u001b[1;32m     43\u001b[0m     count \u001b[39m=\u001b[39m stats[\u001b[39m0\u001b[39m]\u001b[39m.\u001b[39mflatten()[\u001b[39m-\u001b[39m\u001b[39m1\u001b[39m]\n",
      "File \u001b[0;32m~/.miniconda3/envs/espnet/lib/python3.8/site-packages/numpy/lib/npyio.py:390\u001b[0m, in \u001b[0;36mload\u001b[0;34m(file, mmap_mode, allow_pickle, fix_imports, encoding)\u001b[0m\n\u001b[1;32m    388\u001b[0m     own_fid \u001b[39m=\u001b[39m \u001b[39mFalse\u001b[39;00m\n\u001b[1;32m    389\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[0;32m--> 390\u001b[0m     fid \u001b[39m=\u001b[39m stack\u001b[39m.\u001b[39menter_context(\u001b[39mopen\u001b[39;49m(os_fspath(file), \u001b[39m\"\u001b[39;49m\u001b[39mrb\u001b[39;49m\u001b[39m\"\u001b[39;49m))\n\u001b[1;32m    391\u001b[0m     own_fid \u001b[39m=\u001b[39m \u001b[39mTrue\u001b[39;00m\n\u001b[1;32m    393\u001b[0m \u001b[39m# Code to distinguish from NumPy binary files and pickles.\u001b[39;00m\n",
      "\u001b[0;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: 'exp/tts_stats_raw_phn_tacotron_g2p_en_no_space/train/feats_stats.npz'"
     ]
    }
   ],
   "source": [
    "from espnet2.bin.tts_inference import Text2Speech\n",
    "from espnet2.utils.types import str_or_none\n",
    "\n",
    "text2speech = Text2Speech(\n",
    "    train_config=\"exp/tts_train_raw_phn_tacotron_g2p_en_no_space/config.yaml\",\n",
    "    model_file=\"exp/tts_train_raw_phn_tacotron_g2p_en_no_space/checkpoint.pth\",\n",
    "    device=\"cuda\",\n",
    "    # Only for Tacotron 2 & Transformer\n",
    "    threshold=0.5,\n",
    "    # Only for Tacotron 2\n",
    "    minlenratio=0.0,\n",
    "    maxlenratio=10.0,\n",
    "    use_att_constraint=False,\n",
    "    backward_window=1,\n",
    "    forward_window=3,\n",
    "    # Only for FastSpeech & FastSpeech2 & VITS\n",
    "    speed_control_alpha=4,\n",
    "    # Only for VITS\n",
    "    noise_scale=0.333,\n",
    "    noise_scale_dur=0.333,\n",
    ")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import time\n",
    "import torch\n",
    "\n",
    "# decide the input sentence by yourself\n",
    "print(f\"Input your favorite sentence in {lang}.\")\n",
    "x = input()\n",
    "\n",
    "# synthesis\n",
    "with torch.no_grad():\n",
    "    start = time.time()\n",
    "    wav = text2speech(x)[\"wav\"]\n",
    "rtf = (time.time() - start) / (len(wav) / text2speech.fs)\n",
    "print(f\"RTF = {rtf:5f}\")\n",
    "\n",
    "# let us listen to generated samples\n",
    "from IPython.display import display, Audio\n",
    "display(Audio(wav.view(-1).cpu().numpy(), rate=text2speech.fs))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3.8.15 ('espnet')",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.15"
  },
  "orig_nbformat": 4,
  "vscode": {
   "interpreter": {
    "hash": "baacc56cbf39183fce53815df8d7ef29797de9f36fbce345069f80337ea8dac3"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}