Yurii Paniv committed on
Commit 524d54e (1 parent: 96954c7)

Test model inference

Files changed (1)
  1. training/esp_test.ipynb +114 -0
training/esp_test.ipynb ADDED
@@ -0,0 +1,114 @@
+ {
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#@title Choose English model { run: \"auto\" }\n",
+ "lang = 'English'\n",
+ "tag = 'training/espnet/egs2/ljspeech/tts1' #@param [\"kan-bayashi/ljspeech_tacotron2\", \"kan-bayashi/ljspeech_fastspeech\", \"kan-bayashi/ljspeech_fastspeech2\", \"kan-bayashi/ljspeech_conformer_fastspeech2\", \"kan-bayashi/ljspeech_joint_finetune_conformer_fastspeech2_hifigan\", \"kan-bayashi/ljspeech_joint_train_conformer_fastspeech2_hifigan\", \"kan-bayashi/ljspeech_vits\"] {type:\"string\"}\n",
+ "vocoder_tag = \"none\" #@param [\"none\", \"parallel_wavegan/ljspeech_parallel_wavegan.v1\", \"parallel_wavegan/ljspeech_full_band_melgan.v2\", \"parallel_wavegan/ljspeech_multi_band_melgan.v2\", \"parallel_wavegan/ljspeech_hifigan.v1\", \"parallel_wavegan/ljspeech_style_melgan.v1\"] {type:\"string\"}"
+ ]
+ },
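For reference, the upstream ESPnet demo consumes these form values through the pretrained-model loader; here tag is repurposed as a local recipe path, so that loader is bypassed. A minimal sketch of the upstream pattern, assuming espnet_model_zoo is installed (the model_tag below is just one of the #@param choices above):

from espnet2.bin.tts_inference import Text2Speech
from espnet2.utils.types import str_or_none

# Upstream demo pattern, not taken in this notebook because `tag` points at
# a local recipe directory instead of a model-zoo tag.
text2speech = Text2Speech.from_pretrained(
    model_tag="kan-bayashi/ljspeech_vits",  # example #@param choice
    vocoder_tag=str_or_none("none"),        # str_or_none maps "none" to None
)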
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [
+ {
+ "ename": "FileNotFoundError",
+ "evalue": "[Errno 2] No such file or directory: 'exp/tts_stats_raw_phn_tacotron_g2p_en_no_space/train/feats_stats.npz'",
+ "output_type": "error",
+ "traceback": [
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+ "\u001b[0;31mFileNotFoundError\u001b[0m Traceback (most recent call last)",
+ "Cell \u001b[0;32mIn[7], line 4\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39mespnet2\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mbin\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mtts_inference\u001b[39;00m \u001b[39mimport\u001b[39;00m Text2Speech\n\u001b[1;32m 2\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39mespnet2\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mutils\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mtypes\u001b[39;00m \u001b[39mimport\u001b[39;00m str_or_none\n\u001b[0;32m----> 4\u001b[0m text2speech \u001b[39m=\u001b[39m Text2Speech(\n\u001b[1;32m 5\u001b[0m train_config\u001b[39m=\u001b[39;49m\u001b[39m\"\u001b[39;49m\u001b[39m/home/robinhad/Projects/ukrainian-tts/training/espnet/egs2/ljspeech/tts1/exp/tts_train_raw_phn_tacotron_g2p_en_no_space/config.yaml\u001b[39;49m\u001b[39m\"\u001b[39;49m,\n\u001b[1;32m 6\u001b[0m model_file\u001b[39m=\u001b[39;49m\u001b[39m\"\u001b[39;49m\u001b[39m/home/robinhad/Projects/ukrainian-tts/training/espnet/egs2/ljspeech/tts1/exp/tts_train_raw_phn_tacotron_g2p_en_no_space/checkpoint.pth\u001b[39;49m\u001b[39m\"\u001b[39;49m,\n\u001b[1;32m 7\u001b[0m device\u001b[39m=\u001b[39;49m\u001b[39m\"\u001b[39;49m\u001b[39mcuda\u001b[39;49m\u001b[39m\"\u001b[39;49m,\n\u001b[1;32m 8\u001b[0m \u001b[39m# Only for Tacotron 2 & Transformer\u001b[39;49;00m\n\u001b[1;32m 9\u001b[0m threshold\u001b[39m=\u001b[39;49m\u001b[39m0.5\u001b[39;49m,\n\u001b[1;32m 10\u001b[0m \u001b[39m# Only for Tacotron 2\u001b[39;49;00m\n\u001b[1;32m 11\u001b[0m minlenratio\u001b[39m=\u001b[39;49m\u001b[39m0.0\u001b[39;49m,\n\u001b[1;32m 12\u001b[0m maxlenratio\u001b[39m=\u001b[39;49m\u001b[39m10.0\u001b[39;49m,\n\u001b[1;32m 13\u001b[0m use_att_constraint\u001b[39m=\u001b[39;49m\u001b[39mFalse\u001b[39;49;00m,\n\u001b[1;32m 14\u001b[0m backward_window\u001b[39m=\u001b[39;49m\u001b[39m1\u001b[39;49m,\n\u001b[1;32m 15\u001b[0m forward_window\u001b[39m=\u001b[39;49m\u001b[39m3\u001b[39;49m,\n\u001b[1;32m 16\u001b[0m \u001b[39m# Only for FastSpeech & FastSpeech2 & VITS\u001b[39;49;00m\n\u001b[1;32m 17\u001b[0m speed_control_alpha\u001b[39m=\u001b[39;49m\u001b[39m4\u001b[39;49m,\n\u001b[1;32m 18\u001b[0m \u001b[39m# Only for VITS\u001b[39;49;00m\n\u001b[1;32m 19\u001b[0m noise_scale\u001b[39m=\u001b[39;49m\u001b[39m0.333\u001b[39;49m,\n\u001b[1;32m 20\u001b[0m noise_scale_dur\u001b[39m=\u001b[39;49m\u001b[39m0.333\u001b[39;49m,\n\u001b[1;32m 21\u001b[0m )\n",
+ "File \u001b[0;32m~/Projects/ukrainian-tts/training/espnet/espnet2/bin/tts_inference.py:92\u001b[0m, in \u001b[0;36mText2Speech.__init__\u001b[0;34m(self, train_config, model_file, threshold, minlenratio, maxlenratio, use_teacher_forcing, use_att_constraint, backward_window, forward_window, speed_control_alpha, noise_scale, noise_scale_dur, vocoder_config, vocoder_file, dtype, device, seed, always_fix_seed, prefer_normalized_feats)\u001b[0m\n\u001b[1;32m 89\u001b[0m \u001b[39massert\u001b[39;00m check_argument_types()\n\u001b[1;32m 91\u001b[0m \u001b[39m# setup model\u001b[39;00m\n\u001b[0;32m---> 92\u001b[0m model, train_args \u001b[39m=\u001b[39m TTSTask\u001b[39m.\u001b[39;49mbuild_model_from_file(\n\u001b[1;32m 93\u001b[0m train_config, model_file, device\n\u001b[1;32m 94\u001b[0m )\n\u001b[1;32m 95\u001b[0m model\u001b[39m.\u001b[39mto(dtype\u001b[39m=\u001b[39m\u001b[39mgetattr\u001b[39m(torch, dtype))\u001b[39m.\u001b[39meval()\n\u001b[1;32m 96\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mdevice \u001b[39m=\u001b[39m device\n",
+ "File \u001b[0;32m~/Projects/ukrainian-tts/training/espnet/espnet2/tasks/abs_task.py:1822\u001b[0m, in \u001b[0;36mAbsTask.build_model_from_file\u001b[0;34m(cls, config_file, model_file, device)\u001b[0m\n\u001b[1;32m 1820\u001b[0m args \u001b[39m=\u001b[39m yaml\u001b[39m.\u001b[39msafe_load(f)\n\u001b[1;32m 1821\u001b[0m args \u001b[39m=\u001b[39m argparse\u001b[39m.\u001b[39mNamespace(\u001b[39m*\u001b[39m\u001b[39m*\u001b[39margs)\n\u001b[0;32m-> 1822\u001b[0m model \u001b[39m=\u001b[39m \u001b[39mcls\u001b[39;49m\u001b[39m.\u001b[39;49mbuild_model(args)\n\u001b[1;32m 1823\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mnot\u001b[39;00m \u001b[39misinstance\u001b[39m(model, AbsESPnetModel):\n\u001b[1;32m 1824\u001b[0m \u001b[39mraise\u001b[39;00m \u001b[39mRuntimeError\u001b[39;00m(\n\u001b[1;32m 1825\u001b[0m \u001b[39mf\u001b[39m\u001b[39m\"\u001b[39m\u001b[39mmodel must inherit \u001b[39m\u001b[39m{\u001b[39;00mAbsESPnetModel\u001b[39m.\u001b[39m\u001b[39m__name__\u001b[39m\u001b[39m}\u001b[39;00m\u001b[39m, but got \u001b[39m\u001b[39m{\u001b[39;00m\u001b[39mtype\u001b[39m(model)\u001b[39m}\u001b[39;00m\u001b[39m\"\u001b[39m\n\u001b[1;32m 1826\u001b[0m )\n",
+ "File \u001b[0;32m~/Projects/ukrainian-tts/training/espnet/espnet2/tasks/tts.py:309\u001b[0m, in \u001b[0;36mTTSTask.build_model\u001b[0;34m(cls, args)\u001b[0m\n\u001b[1;32m 307\u001b[0m \u001b[39mif\u001b[39;00m args\u001b[39m.\u001b[39mnormalize \u001b[39mis\u001b[39;00m \u001b[39mnot\u001b[39;00m \u001b[39mNone\u001b[39;00m:\n\u001b[1;32m 308\u001b[0m normalize_class \u001b[39m=\u001b[39m normalize_choices\u001b[39m.\u001b[39mget_class(args\u001b[39m.\u001b[39mnormalize)\n\u001b[0;32m--> 309\u001b[0m normalize \u001b[39m=\u001b[39m normalize_class(\u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49margs\u001b[39m.\u001b[39;49mnormalize_conf)\n\u001b[1;32m 310\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[1;32m 311\u001b[0m normalize \u001b[39m=\u001b[39m \u001b[39mNone\u001b[39;00m\n",
+ "File \u001b[0;32m~/Projects/ukrainian-tts/training/espnet/espnet2/layers/global_mvn.py:40\u001b[0m, in \u001b[0;36mGlobalMVN.__init__\u001b[0;34m(self, stats_file, norm_means, norm_vars, eps)\u001b[0m\n\u001b[1;32m 37\u001b[0m stats_file \u001b[39m=\u001b[39m Path(stats_file)\n\u001b[1;32m 39\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mstats_file \u001b[39m=\u001b[39m stats_file\n\u001b[0;32m---> 40\u001b[0m stats \u001b[39m=\u001b[39m np\u001b[39m.\u001b[39;49mload(stats_file)\n\u001b[1;32m 41\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39misinstance\u001b[39m(stats, np\u001b[39m.\u001b[39mndarray):\n\u001b[1;32m 42\u001b[0m \u001b[39m# Kaldi like stats\u001b[39;00m\n\u001b[1;32m 43\u001b[0m count \u001b[39m=\u001b[39m stats[\u001b[39m0\u001b[39m]\u001b[39m.\u001b[39mflatten()[\u001b[39m-\u001b[39m\u001b[39m1\u001b[39m]\n",
+ "File \u001b[0;32m~/.miniconda3/envs/espnet/lib/python3.8/site-packages/numpy/lib/npyio.py:390\u001b[0m, in \u001b[0;36mload\u001b[0;34m(file, mmap_mode, allow_pickle, fix_imports, encoding)\u001b[0m\n\u001b[1;32m 388\u001b[0m own_fid \u001b[39m=\u001b[39m \u001b[39mFalse\u001b[39;00m\n\u001b[1;32m 389\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[0;32m--> 390\u001b[0m fid \u001b[39m=\u001b[39m stack\u001b[39m.\u001b[39menter_context(\u001b[39mopen\u001b[39;49m(os_fspath(file), \u001b[39m\"\u001b[39;49m\u001b[39mrb\u001b[39;49m\u001b[39m\"\u001b[39;49m))\n\u001b[1;32m 391\u001b[0m own_fid \u001b[39m=\u001b[39m \u001b[39mTrue\u001b[39;00m\n\u001b[1;32m 393\u001b[0m \u001b[39m# Code to distinguish from NumPy binary files and pickles.\u001b[39;00m\n",
+ "\u001b[0;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: 'exp/tts_stats_raw_phn_tacotron_g2p_en_no_space/train/feats_stats.npz'"
+ ]
+ }
+ ],
+ "source": [
+ "from espnet2.bin.tts_inference import Text2Speech\n",
+ "from espnet2.utils.types import str_or_none\n",
+ "\n",
+ "text2speech = Text2Speech(\n",
+ " train_config=\"exp/tts_train_raw_phn_tacotron_g2p_en_no_space/config.yaml\",\n",
+ " model_file=\"exp/tts_train_raw_phn_tacotron_g2p_en_no_space/checkpoint.pth\",\n",
+ " device=\"cuda\",\n",
+ " # Only for Tacotron 2 & Transformer\n",
+ " threshold=0.5,\n",
+ " # Only for Tacotron 2\n",
+ " minlenratio=0.0,\n",
+ " maxlenratio=10.0,\n",
+ " use_att_constraint=False,\n",
+ " backward_window=1,\n",
+ " forward_window=3,\n",
+ " # Only for FastSpeech & FastSpeech2 & VITS\n",
+ " speed_control_alpha=4,\n",
+ " # Only for VITS\n",
+ " noise_scale=0.333,\n",
+ " noise_scale_dur=0.333,\n",
+ ")\n"
+ ]
+ },
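Note on the failure in this cell: the traceback shows config.yaml referencing exp/tts_stats_raw_phn_tacotron_g2p_en_no_space/train/feats_stats.npz by a relative path (loaded in GlobalMVN via normalize_conf), so constructing Text2Speech from outside the recipe directory raises FileNotFoundError even when train_config and model_file themselves resolve. A minimal workaround sketch, assuming the recipe directory shown in the traceback:

import os

# feats_stats.npz is referenced relatively inside config.yaml, so make the
# recipe directory the working directory before building the model.
# (Path assumed from the traceback above.)
recipe_dir = "/home/robinhad/Projects/ukrainian-tts/training/espnet/egs2/ljspeech/tts1"
os.chdir(recipe_dir)

from espnet2.bin.tts_inference import Text2Speech

text2speech = Text2Speech(
    train_config="exp/tts_train_raw_phn_tacotron_g2p_en_no_space/config.yaml",
    model_file="exp/tts_train_raw_phn_tacotron_g2p_en_no_space/checkpoint.pth",
    device="cuda",
)

Rewriting the stats_file entry under normalize_conf in config.yaml to an absolute path would be an equivalent fix without changing the working directory.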
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import time\n",
+ "import torch\n",
+ "\n",
+ "# decide the input sentence by yourself\n",
+ "print(f\"Input your favorite sentence in {lang}.\")\n",
+ "x = input()\n",
+ "\n",
+ "# synthesis\n",
+ "with torch.no_grad():\n",
+ " start = time.time()\n",
+ " wav = text2speech(x)[\"wav\"]\n",
+ "rtf = (time.time() - start) / (len(wav) / text2speech.fs)\n",
+ "print(f\"RTF = {rtf:5f}\")\n",
+ "\n",
+ "# let us listen to generated samples\n",
+ "from IPython.display import display, Audio\n",
+ "display(Audio(wav.view(-1).cpu().numpy(), rate=text2speech.fs))"
+ ]
+ }
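The RTF printed by this cell is wall-clock synthesis time divided by the duration of the generated audio, where duration is recovered as sample count over text2speech.fs. A worked example with hypothetical numbers:

# RTF = elapsed / (n_samples / fs); the values here are illustrative only.
elapsed = 0.5        # seconds spent inside text2speech(x)
n_samples = 44100    # samples in wav, i.e. 2.0 s of audio at fs = 22050
fs = 22050           # text2speech.fs, the model's output sampling rate
rtf = elapsed / (n_samples / fs)
print(rtf)           # 0.25: synthesis runs 4x faster than real time

An RTF below 1.0 means audio is generated faster than it plays back.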
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3.8.15 ('espnet')",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.8.15"
+ },
+ "orig_nbformat": 4,
+ "vscode": {
+ "interpreter": {
+ "hash": "baacc56cbf39183fce53815df8d7ef29797de9f36fbce345069f80337ea8dac3"
+ }
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+ }