otioss committed
Commit 4cb60dd
1 Parent(s): 0c88548

Upload folder using huggingface_hub

This view is limited to 50 files because it contains too many changes. See raw diff
Files changed (50)
  1. .gitattributes +124 -0
  2. .ipynb_checkpoints/onefile-checkpoint.ipynb +232 -0
  3. .ipynb_checkpoints/voice_clone-checkpoint.ipynb +6 -0
  4. .vscode/settings.json +3 -0
  5. Colab/StyleTTS2_Demo_LJSpeech.ipynb +486 -0
  6. Colab/StyleTTS2_Demo_LibriTTS.ipynb +1218 -0
  7. Colab/StyleTTS2_Finetune_Demo.ipynb +480 -0
  8. Configs/config.yml +116 -0
  9. Configs/config_ft.yml +111 -0
  10. Configs/config_libritts.yml +113 -0
  11. Data/OOD_texts.txt +3 -0
  12. Data/train_list.txt +0 -0
  13. Data/val_list.txt +100 -0
  14. Demo/.ipynb_checkpoints/Inference_LJSpeech-checkpoint.ipynb +554 -0
  15. Demo/.ipynb_checkpoints/Inference_LibriTTS-checkpoint.ipynb +1155 -0
  16. Demo/Inference_LJSpeech.ipynb +3 -0
  17. Demo/Inference_LibriTTS.ipynb +3 -0
  18. Demo/reference_audio/1221-135767-0014.wav +0 -0
  19. Demo/reference_audio/1789_142896_000022_000005.wav +0 -0
  20. Demo/reference_audio/3.wav +0 -0
  21. Demo/reference_audio/4.wav +0 -0
  22. Demo/reference_audio/4077-13754-0000.wav +0 -0
  23. Demo/reference_audio/5.wav +0 -0
  24. Demo/reference_audio/5639-40744-0020.wav +0 -0
  25. Demo/reference_audio/696_92939_000016_000006.wav +0 -0
  26. Demo/reference_audio/908-157963-0027.wav +0 -0
  27. Demo/reference_audio/Gavin.wav +0 -0
  28. Demo/reference_audio/James.wav +3 -0
  29. Demo/reference_audio/James1.wav +3 -0
  30. Demo/reference_audio/James2.wav +3 -0
  31. Demo/reference_audio/Nima.wav +0 -0
  32. Demo/reference_audio/Vinay.wav +0 -0
  33. Demo/reference_audio/Yinghao.wav +0 -0
  34. Demo/reference_audio/amused.wav +0 -0
  35. Demo/reference_audio/anger.wav +0 -0
  36. Demo/reference_audio/disgusted.wav +0 -0
  37. Demo/reference_audio/sleepy.wav +0 -0
  38. LICENSE +21 -0
  39. Models/LJSpeech/config.yml +22 -0
  40. Models/LJSpeech/epoch_2nd_00100.pth +3 -0
  41. Models/LibriTTS/config.yml +21 -0
  42. Models/LibriTTS/epochs_2nd_00020.pth +3 -0
  43. Modules/__init__.py +1 -0
  44. Modules/__pycache__/__init__.cpython-311.pyc +0 -0
  45. Modules/__pycache__/discriminators.cpython-311.pyc +0 -0
  46. Modules/__pycache__/hifigan.cpython-311.pyc +0 -0
  47. Modules/__pycache__/istftnet.cpython-311.pyc +0 -0
  48. Modules/__pycache__/utils.cpython-311.pyc +0 -0
  49. Modules/diffusion/__init__.py +1 -0
  50. Modules/diffusion/__pycache__/__init__.cpython-311.pyc +0 -0
.gitattributes CHANGED
@@ -33,3 +33,127 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ Data/OOD_texts.txt filter=lfs diff=lfs merge=lfs -text
37
+ Demo/Inference_LJSpeech.ipynb filter=lfs diff=lfs merge=lfs -text
38
+ Demo/Inference_LibriTTS.ipynb filter=lfs diff=lfs merge=lfs -text
39
+ Demo/reference_audio/James.wav filter=lfs diff=lfs merge=lfs -text
40
+ Demo/reference_audio/James1.wav filter=lfs diff=lfs merge=lfs -text
41
+ Demo/reference_audio/James2.wav filter=lfs diff=lfs merge=lfs -text
42
+ Utils/JDC/bst.t7 filter=lfs diff=lfs merge=lfs -text
43
+ Utils/PLBERT/step_1000000.t7 filter=lfs diff=lfs merge=lfs -text
44
+ original_voice.wav filter=lfs diff=lfs merge=lfs -text
45
+ styleenv/lib/python3.11/site-packages/Pillow.libs/libfreetype-82733d78.so.6.20.1 filter=lfs diff=lfs merge=lfs -text
46
+ styleenv/lib/python3.11/site-packages/Pillow.libs/libharfbuzz-e3b74c67.so.0.60821.0 filter=lfs diff=lfs merge=lfs -text
47
+ styleenv/lib/python3.11/site-packages/_cffi_backend.cpython-311-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
48
+ styleenv/lib/python3.11/site-packages/_soundfile_data/libsndfile_x86_64.so filter=lfs diff=lfs merge=lfs -text
49
+ styleenv/lib/python3.11/site-packages/altair/vegalite/v5/schema/__pycache__/channels.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
50
+ styleenv/lib/python3.11/site-packages/altair/vegalite/v5/schema/__pycache__/core.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
51
+ styleenv/lib/python3.11/site-packages/debugpy/_vendored/pydevd/_pydevd_bundle/pydevd_cython.cpython-311-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
52
+ styleenv/lib/python3.11/site-packages/fontTools/cu2qu/cu2qu.cpython-311-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
53
+ styleenv/lib/python3.11/site-packages/fontTools/feaLib/lexer.cpython-311-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
54
+ styleenv/lib/python3.11/site-packages/fontTools/misc/bezierTools.cpython-311-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
55
+ styleenv/lib/python3.11/site-packages/fontTools/pens/momentsPen.cpython-311-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
56
+ styleenv/lib/python3.11/site-packages/fontTools/qu2cu/qu2cu.cpython-311-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
57
+ styleenv/lib/python3.11/site-packages/fontTools/varLib/iup.cpython-311-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
58
+ styleenv/lib/python3.11/site-packages/gradio/frpc_linux_amd64_v0.2 filter=lfs diff=lfs merge=lfs -text
59
+ styleenv/lib/python3.11/site-packages/gradio/templates/frontend/assets/Index-c6e4b94b.js.map filter=lfs diff=lfs merge=lfs -text
60
+ styleenv/lib/python3.11/site-packages/kiwisolver/_cext.cpython-311-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
61
+ styleenv/lib/python3.11/site-packages/llvmlite/binding/libllvmlite.so filter=lfs diff=lfs merge=lfs -text
62
+ styleenv/lib/python3.11/site-packages/lxml/etree.cpython-311-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
63
+ styleenv/lib/python3.11/site-packages/lxml/objectify.cpython-311-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
64
+ styleenv/lib/python3.11/site-packages/matplotlib/_image.cpython-311-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
65
+ styleenv/lib/python3.11/site-packages/matplotlib/_path.cpython-311-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
66
+ styleenv/lib/python3.11/site-packages/matplotlib/_qhull.cpython-311-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
67
+ styleenv/lib/python3.11/site-packages/matplotlib/backends/_backend_agg.cpython-311-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
68
+ styleenv/lib/python3.11/site-packages/matplotlib/ft2font.cpython-311-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
69
+ styleenv/lib/python3.11/site-packages/monotonic_align/core.cpython-311-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
70
+ styleenv/lib/python3.11/site-packages/monotonic_align/core1alt.cpython-311-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
71
+ styleenv/lib/python3.11/site-packages/monotonic_align/core2.cpython-311-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
72
+ styleenv/lib/python3.11/site-packages/monotonic_align/core2eps.cpython-311-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
73
+ styleenv/lib/python3.11/site-packages/msgpack/_cmsgpack.cpython-311-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
74
+ styleenv/lib/python3.11/site-packages/numba/np/ufunc/tbbpool.cpython-311-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
75
+ styleenv/lib/python3.11/site-packages/numpy/core/_multiarray_umath.cpython-311-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
76
+ styleenv/lib/python3.11/site-packages/numpy/core/_simd.cpython-311-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
77
+ styleenv/lib/python3.11/site-packages/numpy.libs/libgfortran-040039e1.so.5.0.0 filter=lfs diff=lfs merge=lfs -text
78
+ styleenv/lib/python3.11/site-packages/numpy.libs/libopenblas64_p-r0-0cf96a72.3.23.dev.so filter=lfs diff=lfs merge=lfs -text
79
+ styleenv/lib/python3.11/site-packages/nvfuser/_C.cpython-311-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
80
+ styleenv/lib/python3.11/site-packages/nvidia/cublas/lib/libcublas.so.12 filter=lfs diff=lfs merge=lfs -text
81
+ styleenv/lib/python3.11/site-packages/nvidia/cublas/lib/libcublasLt.so.12 filter=lfs diff=lfs merge=lfs -text
82
+ styleenv/lib/python3.11/site-packages/nvidia/cuda_cupti/lib/libcheckpoint.so filter=lfs diff=lfs merge=lfs -text
83
+ styleenv/lib/python3.11/site-packages/nvidia/cuda_cupti/lib/libcupti.so.12 filter=lfs diff=lfs merge=lfs -text
84
+ styleenv/lib/python3.11/site-packages/nvidia/cuda_cupti/lib/libnvperf_host.so filter=lfs diff=lfs merge=lfs -text
85
+ styleenv/lib/python3.11/site-packages/nvidia/cuda_cupti/lib/libnvperf_target.so filter=lfs diff=lfs merge=lfs -text
86
+ styleenv/lib/python3.11/site-packages/nvidia/cuda_nvrtc/lib/libnvrtc-builtins.so.12.1 filter=lfs diff=lfs merge=lfs -text
87
+ styleenv/lib/python3.11/site-packages/nvidia/cuda_nvrtc/lib/libnvrtc.so.12 filter=lfs diff=lfs merge=lfs -text
88
+ styleenv/lib/python3.11/site-packages/nvidia/cudnn/lib/libcudnn_adv_infer.so.8 filter=lfs diff=lfs merge=lfs -text
89
+ styleenv/lib/python3.11/site-packages/nvidia/cudnn/lib/libcudnn_adv_train.so.8 filter=lfs diff=lfs merge=lfs -text
90
+ styleenv/lib/python3.11/site-packages/nvidia/cudnn/lib/libcudnn_cnn_infer.so.8 filter=lfs diff=lfs merge=lfs -text
91
+ styleenv/lib/python3.11/site-packages/nvidia/cudnn/lib/libcudnn_cnn_train.so.8 filter=lfs diff=lfs merge=lfs -text
92
+ styleenv/lib/python3.11/site-packages/nvidia/cudnn/lib/libcudnn_ops_infer.so.8 filter=lfs diff=lfs merge=lfs -text
93
+ styleenv/lib/python3.11/site-packages/nvidia/cudnn/lib/libcudnn_ops_train.so.8 filter=lfs diff=lfs merge=lfs -text
94
+ styleenv/lib/python3.11/site-packages/nvidia/cufft/lib/libcufft.so.11 filter=lfs diff=lfs merge=lfs -text
95
+ styleenv/lib/python3.11/site-packages/nvidia/cufft/lib/libcufftw.so.11 filter=lfs diff=lfs merge=lfs -text
96
+ styleenv/lib/python3.11/site-packages/nvidia/curand/lib/libcurand.so.10 filter=lfs diff=lfs merge=lfs -text
97
+ styleenv/lib/python3.11/site-packages/nvidia/cusolver/lib/libcusolver.so.11 filter=lfs diff=lfs merge=lfs -text
98
+ styleenv/lib/python3.11/site-packages/nvidia/cusolver/lib/libcusolverMg.so.11 filter=lfs diff=lfs merge=lfs -text
99
+ styleenv/lib/python3.11/site-packages/nvidia/cusparse/lib/libcusparse.so.12 filter=lfs diff=lfs merge=lfs -text
100
+ styleenv/lib/python3.11/site-packages/nvidia/nccl/lib/libnccl.so.2 filter=lfs diff=lfs merge=lfs -text
101
+ styleenv/lib/python3.11/site-packages/nvidia/nvjitlink/lib/libnvJitLink.so.12 filter=lfs diff=lfs merge=lfs -text
102
+ styleenv/lib/python3.11/site-packages/pandas/_libs/algos.cpython-311-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
103
+ styleenv/lib/python3.11/site-packages/pandas/_libs/groupby.cpython-311-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
104
+ styleenv/lib/python3.11/site-packages/pandas/_libs/hashtable.cpython-311-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
105
+ styleenv/lib/python3.11/site-packages/pandas/_libs/interval.cpython-311-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
106
+ styleenv/lib/python3.11/site-packages/pandas/_libs/join.cpython-311-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
107
+ styleenv/lib/python3.11/site-packages/pandas/_libs/tslibs/offsets.cpython-311-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
108
+ styleenv/lib/python3.11/site-packages/pyarrow/_compute.cpython-311-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
109
+ styleenv/lib/python3.11/site-packages/pyarrow/_dataset.cpython-311-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
110
+ styleenv/lib/python3.11/site-packages/pyarrow/_flight.cpython-311-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
111
+ styleenv/lib/python3.11/site-packages/pyarrow/lib.cpython-311-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
112
+ styleenv/lib/python3.11/site-packages/pyarrow/libarrow.so.1400 filter=lfs diff=lfs merge=lfs -text
113
+ styleenv/lib/python3.11/site-packages/pyarrow/libarrow_acero.so.1400 filter=lfs diff=lfs merge=lfs -text
114
+ styleenv/lib/python3.11/site-packages/pyarrow/libarrow_dataset.so.1400 filter=lfs diff=lfs merge=lfs -text
115
+ styleenv/lib/python3.11/site-packages/pyarrow/libarrow_flight.so.1400 filter=lfs diff=lfs merge=lfs -text
116
+ styleenv/lib/python3.11/site-packages/pyarrow/libarrow_python.so filter=lfs diff=lfs merge=lfs -text
117
+ styleenv/lib/python3.11/site-packages/pyarrow/libarrow_substrait.so.1400 filter=lfs diff=lfs merge=lfs -text
118
+ styleenv/lib/python3.11/site-packages/pyarrow/libparquet.so.1400 filter=lfs diff=lfs merge=lfs -text
119
+ styleenv/lib/python3.11/site-packages/pydantic_core/_pydantic_core.cpython-311-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
120
+ styleenv/lib/python3.11/site-packages/pyzmq.libs/libsodium-cb25555f.so.23.3.0 filter=lfs diff=lfs merge=lfs -text
121
+ styleenv/lib/python3.11/site-packages/pyzmq.libs/libzmq-f468291a.so.5.2.4 filter=lfs diff=lfs merge=lfs -text
122
+ styleenv/lib/python3.11/site-packages/regex/_regex.cpython-311-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
123
+ styleenv/lib/python3.11/site-packages/rpds/rpds.cpython-311-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
124
+ styleenv/lib/python3.11/site-packages/safetensors/_safetensors_rust.cpython-311-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
125
+ styleenv/lib/python3.11/site-packages/scipy/fft/_pocketfft/pypocketfft.cpython-311-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
126
+ styleenv/lib/python3.11/site-packages/scipy/linalg/_flapack.cpython-311-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
127
+ styleenv/lib/python3.11/site-packages/scipy/misc/face.dat filter=lfs diff=lfs merge=lfs -text
128
+ styleenv/lib/python3.11/site-packages/scipy/optimize/_highs/_highs_wrapper.cpython-311-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
129
+ styleenv/lib/python3.11/site-packages/scipy/sparse/_sparsetools.cpython-311-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
130
+ styleenv/lib/python3.11/site-packages/scipy/spatial/_ckdtree.cpython-311-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
131
+ styleenv/lib/python3.11/site-packages/scipy/spatial/_qhull.cpython-311-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
132
+ styleenv/lib/python3.11/site-packages/scipy/special/_ufuncs.cpython-311-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
133
+ styleenv/lib/python3.11/site-packages/scipy/special/cython_special.cpython-311-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
134
+ styleenv/lib/python3.11/site-packages/scipy/stats/_unuran/unuran_wrapper.cpython-311-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
135
+ styleenv/lib/python3.11/site-packages/scipy.libs/libgfortran-040039e1.so.5.0.0 filter=lfs diff=lfs merge=lfs -text
136
+ styleenv/lib/python3.11/site-packages/scipy.libs/libopenblasp-r0-23e5df77.3.21.dev.so filter=lfs diff=lfs merge=lfs -text
137
+ styleenv/lib/python3.11/site-packages/sklearn/_loss/_loss.cpython-311-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
138
+ styleenv/lib/python3.11/site-packages/soxr/cysoxr.cpython-311-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
139
+ styleenv/lib/python3.11/site-packages/speech_recognition/flac-linux-x86 filter=lfs diff=lfs merge=lfs -text
140
+ styleenv/lib/python3.11/site-packages/speech_recognition/flac-linux-x86_64 filter=lfs diff=lfs merge=lfs -text
141
+ styleenv/lib/python3.11/site-packages/speech_recognition/pocketsphinx-data/en-US/acoustic-model/mdef filter=lfs diff=lfs merge=lfs -text
142
+ styleenv/lib/python3.11/site-packages/speech_recognition/pocketsphinx-data/en-US/acoustic-model/sendump filter=lfs diff=lfs merge=lfs -text
143
+ styleenv/lib/python3.11/site-packages/sympy/polys/benchmarks/__pycache__/bench_solvers.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
144
+ styleenv/lib/python3.11/site-packages/tiktoken/_tiktoken.cpython-311-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
145
+ styleenv/lib/python3.11/site-packages/tokenizers/tokenizers.cpython-311-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
146
+ styleenv/lib/python3.11/site-packages/torch/bin/nvfuser_tests filter=lfs diff=lfs merge=lfs -text
147
+ styleenv/lib/python3.11/site-packages/torch/bin/protoc filter=lfs diff=lfs merge=lfs -text
148
+ styleenv/lib/python3.11/site-packages/torch/bin/protoc-3.13.0.0 filter=lfs diff=lfs merge=lfs -text
149
+ styleenv/lib/python3.11/site-packages/torch/lib/libc10.so filter=lfs diff=lfs merge=lfs -text
150
+ styleenv/lib/python3.11/site-packages/torch/lib/libnvfuser_codegen.so filter=lfs diff=lfs merge=lfs -text
151
+ styleenv/lib/python3.11/site-packages/torch/lib/libtorch_cpu.so filter=lfs diff=lfs merge=lfs -text
152
+ styleenv/lib/python3.11/site-packages/torch/lib/libtorch_cuda.so filter=lfs diff=lfs merge=lfs -text
153
+ styleenv/lib/python3.11/site-packages/torch/lib/libtorch_cuda_linalg.so filter=lfs diff=lfs merge=lfs -text
154
+ styleenv/lib/python3.11/site-packages/torch/lib/libtorch_python.so filter=lfs diff=lfs merge=lfs -text
155
+ styleenv/lib/python3.11/site-packages/torchaudio/lib/libctc_prefix_decoder.so filter=lfs diff=lfs merge=lfs -text
156
+ styleenv/lib/python3.11/site-packages/torchaudio/lib/libtorchaudio.so filter=lfs diff=lfs merge=lfs -text
157
+ styleenv/lib/python3.11/site-packages/triton/_C/libtriton.so filter=lfs diff=lfs merge=lfs -text
158
+ styleenv/lib/python3.11/site-packages/triton/third_party/cuda/bin/ptxas filter=lfs diff=lfs merge=lfs -text
159
+ styleenv/lib/python3.11/site-packages/yaml/_yaml.cpython-311-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
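
Note: the lines added above route the large binaries in this upload (data files, reference audio, model checkpoints, and the bundled styleenv site-packages) through Git LFS. As a quick sanity check after cloning, a minimal Python sketch (not part of this commit) can list which patterns carry the filter=lfs attribute:

```python
# Sanity-check sketch (not part of this commit): list the patterns that the
# updated .gitattributes routes through Git LFS, i.e. entries carrying filter=lfs.
from pathlib import Path

def lfs_patterns(path=".gitattributes"):
    patterns = []
    for line in Path(path).read_text().splitlines():
        parts = line.split()
        if len(parts) > 1 and "filter=lfs" in parts[1:]:
            patterns.append(parts[0])
    return patterns

print("\n".join(lfs_patterns()))  # expect Data/OOD_texts.txt, Utils/JDC/bst.t7, ...
```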
.ipynb_checkpoints/onefile-checkpoint.ipynb ADDED
@@ -0,0 +1,232 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "id": "612692e6-fe5f-4787-86d9-c660bb9a21ec",
7
+ "metadata": {},
8
+ "outputs": [
9
+ {
10
+ "ename": "ModuleNotFoundError",
11
+ "evalue": "No module named 'models'",
12
+ "output_type": "error",
13
+ "traceback": [
14
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
15
+ "\u001b[0;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)",
16
+ "Cell \u001b[0;32mIn[1], line 27\u001b[0m\n\u001b[1;32m 24\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mlibrosa\u001b[39;00m\n\u001b[1;32m 25\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mnltk\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mtokenize\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m word_tokenize\n\u001b[0;32m---> 27\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mmodels\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;241m*\u001b[39m\n\u001b[1;32m 28\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mutils\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;241m*\u001b[39m\n\u001b[1;32m 29\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mtext_utils\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m TextCleaner\n",
17
+ "\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'models'"
18
+ ]
19
+ }
20
+ ],
21
+ "source": [
22
+ "import torch\n",
23
+ "torch.manual_seed(0)\n",
24
+ "torch.backends.cudnn.benchmark = False\n",
25
+ "torch.backends.cudnn.deterministic = True\n",
26
+ "\n",
27
+ "import random\n",
28
+ "random.seed(0)\n",
29
+ "\n",
30
+ "import numpy as np\n",
31
+ "np.random.seed(0)\n",
32
+ "\n",
33
+ "#%cd ..\n",
34
+ "\n",
35
+ "# load packages\n",
36
+ "import time\n",
37
+ "import random\n",
38
+ "import yaml\n",
39
+ "from munch import Munch\n",
40
+ "import numpy as np\n",
41
+ "import torch\n",
42
+ "from torch import nn\n",
43
+ "import torch.nn.functional as F\n",
44
+ "import torchaudio\n",
45
+ "import librosa\n",
46
+ "from nltk.tokenize import word_tokenize\n",
47
+ "\n",
48
+ "from models import *\n",
49
+ "from utils import *\n",
50
+ "from text_utils import TextCleaner\n",
51
+ "textclenaer = TextCleaner()\n",
52
+ "\n",
53
+ "%matplotlib inline\n",
54
+ "\n",
55
+ "device = 'cuda'\n",
56
+ "\n",
57
+ "to_mel = torchaudio.transforms.MelSpectrogram(\n",
58
+ " n_mels=80, n_fft=2048, win_length=1200, hop_length=300)\n",
59
+ "mean, std = -4, 4\n",
60
+ "\n",
61
+ "def length_to_mask(lengths):\n",
62
+ " mask = torch.arange(lengths.max()).unsqueeze(0).expand(lengths.shape[0], -1).type_as(lengths)\n",
63
+ " mask = torch.gt(mask+1, lengths.unsqueeze(1))\n",
64
+ " return mask\n",
65
+ "\n",
66
+ "def preprocess(wave):\n",
67
+ " wave_tensor = torch.from_numpy(wave).float()\n",
68
+ " mel_tensor = to_mel(wave_tensor)\n",
69
+ " mel_tensor = (torch.log(1e-5 + mel_tensor.unsqueeze(0)) - mean) / std\n",
70
+ " return mel_tensor\n",
71
+ "\n",
72
+ "def compute_style(ref_dicts):\n",
73
+ " reference_embeddings = {}\n",
74
+ " for key, path in ref_dicts.items():\n",
75
+ " wave, sr = librosa.load(path, sr=24000)\n",
76
+ " audio, index = librosa.effects.trim(wave, top_db=30)\n",
77
+ " if sr != 24000:\n",
78
+ " audio = librosa.resample(audio, sr, 24000)\n",
79
+ " mel_tensor = preprocess(audio).to(device)\n",
80
+ "\n",
81
+ " with torch.no_grad():\n",
82
+ " ref = model.style_encoder(mel_tensor.unsqueeze(1))\n",
83
+ " reference_embeddings[key] = (ref.squeeze(1), audio)\n",
84
+ " \n",
85
+ " return reference_embeddings\n",
86
+ "\n",
87
+ "# load phonemizer\n",
88
+ "import phonemizer\n",
89
+ "global_phonemizer = phonemizer.backend.EspeakBackend(language='en-us', preserve_punctuation=True, with_stress=True)\n",
90
+ "\n",
91
+ "config = yaml.safe_load(open(\"Models/LJSpeech/config.yml\"))\n",
92
+ "\n",
93
+ "# load pretrained ASR model\n",
94
+ "ASR_config = config.get('ASR_config', False)\n",
95
+ "ASR_path = config.get('ASR_path', False)\n",
96
+ "text_aligner = load_ASR_models(ASR_path, ASR_config)\n",
97
+ "\n",
98
+ "# load pretrained F0 model\n",
99
+ "F0_path = config.get('F0_path', False)\n",
100
+ "pitch_extractor = load_F0_models(F0_path)\n",
101
+ "\n",
102
+ "# load BERT model\n",
103
+ "from Utils.PLBERT.util import load_plbert\n",
104
+ "BERT_path = config.get('PLBERT_dir', False)\n",
105
+ "plbert = load_plbert(BERT_path)\n",
106
+ "\n",
107
+ "model = build_model(recursive_munch(config['model_params']), text_aligner, pitch_extractor, plbert)\n",
108
+ "_ = [model[key].eval() for key in model]\n",
109
+ "_ = [model[key].to(device) for key in model]\n",
110
+ "\n",
111
+ "params_whole = torch.load(\"Models/LJSpeech/epoch_2nd_00100.pth\", map_location='cpu')\n",
112
+ "params = params_whole['net']\n",
113
+ "\n",
114
+ "for key in model:\n",
115
+ " if key in params:\n",
116
+ " print('%s loaded' % key)\n",
117
+ " try:\n",
118
+ " model[key].load_state_dict(params[key])\n",
119
+ " except:\n",
120
+ " from collections import OrderedDict\n",
121
+ " state_dict = params[key]\n",
122
+ " new_state_dict = OrderedDict()\n",
123
+ " for k, v in state_dict.items():\n",
124
+ " name = k[7:] # remove `module.`\n",
125
+ " new_state_dict[name] = v\n",
126
+ " # load params\n",
127
+ " model[key].load_state_dict(new_state_dict, strict=False)\n",
128
+ "# except:\n",
129
+ "# _load(params[key], model[key])\n",
130
+ "_ = [model[key].eval() for key in model]\n",
131
+ "\n",
132
+ "from Modules.diffusion.sampler import DiffusionSampler, ADPM2Sampler, KarrasSchedule\n",
133
+ "\n",
134
+ "sampler = DiffusionSampler(\n",
135
+ " model.diffusion.diffusion,\n",
136
+ " sampler=ADPM2Sampler(),\n",
137
+ " sigma_schedule=KarrasSchedule(sigma_min=0.0001, sigma_max=3.0, rho=9.0), # empirical parameters\n",
138
+ " clamp=False\n",
139
+ ")\n",
140
+ "\n",
141
+ "def inference(text, noise, diffusion_steps=5, embedding_scale=1):\n",
142
+ " text = text.strip()\n",
143
+ " text = text.replace('\"', '')\n",
144
+ " ps = global_phonemizer.phonemize([text])\n",
145
+ " ps = word_tokenize(ps[0])\n",
146
+ " ps = ' '.join(ps)\n",
147
+ "\n",
148
+ " tokens = textclenaer(ps)\n",
149
+ " tokens.insert(0, 0)\n",
150
+ " tokens = torch.LongTensor(tokens).to(device).unsqueeze(0)\n",
151
+ " \n",
152
+ " with torch.no_grad():\n",
153
+ " input_lengths = torch.LongTensor([tokens.shape[-1]]).to(tokens.device)\n",
154
+ " text_mask = length_to_mask(input_lengths).to(tokens.device)\n",
155
+ "\n",
156
+ " t_en = model.text_encoder(tokens, input_lengths, text_mask)\n",
157
+ " bert_dur = model.bert(tokens, attention_mask=(~text_mask).int())\n",
158
+ " d_en = model.bert_encoder(bert_dur).transpose(-1, -2) \n",
159
+ "\n",
160
+ " s_pred = sampler(noise, \n",
161
+ " embedding=bert_dur[0].unsqueeze(0), num_steps=diffusion_steps,\n",
162
+ " embedding_scale=embedding_scale).squeeze(0)\n",
163
+ "\n",
164
+ " s = s_pred[:, 128:]\n",
165
+ " ref = s_pred[:, :128]\n",
166
+ "\n",
167
+ " d = model.predictor.text_encoder(d_en, s, input_lengths, text_mask)\n",
168
+ "\n",
169
+ " x, _ = model.predictor.lstm(d)\n",
170
+ " duration = model.predictor.duration_proj(x)\n",
171
+ " duration = torch.sigmoid(duration).sum(axis=-1)\n",
172
+ " pred_dur = torch.round(duration.squeeze()).clamp(min=1)\n",
173
+ "\n",
174
+ " pred_dur[-1] += 5\n",
175
+ "\n",
176
+ " pred_aln_trg = torch.zeros(input_lengths, int(pred_dur.sum().data))\n",
177
+ " c_frame = 0\n",
178
+ " for i in range(pred_aln_trg.size(0)):\n",
179
+ " pred_aln_trg[i, c_frame:c_frame + int(pred_dur[i].data)] = 1\n",
180
+ " c_frame += int(pred_dur[i].data)\n",
181
+ "\n",
182
+ " # encode prosody\n",
183
+ " en = (d.transpose(-1, -2) @ pred_aln_trg.unsqueeze(0).to(device))\n",
184
+ " F0_pred, N_pred = model.predictor.F0Ntrain(en, s)\n",
185
+ " out = model.decoder((t_en @ pred_aln_trg.unsqueeze(0).to(device)), \n",
186
+ " F0_pred, N_pred, ref.squeeze().unsqueeze(0))\n",
187
+ " \n",
188
+ " return out.squeeze().cpu().numpy()\n",
189
+ "\n",
190
+ "# synthesize a text\n",
191
+ "text = ''' Hi James! How are you? '''\n",
192
+ "\n",
193
+ "start = time.time()\n",
194
+ "noise = torch.randn(1,1,256).to(device)\n",
195
+ "wav = inference(text, noise, diffusion_steps=5, embedding_scale=1)\n",
196
+ "rtf = (time.time() - start) / (len(wav) / 24000)\n",
197
+ "print(f\"RTF = {rtf:5f}\")\n",
198
+ "import IPython.display as ipd\n",
199
+ "display(ipd.Audio(wav, rate=24000))"
200
+ ]
201
+ },
202
+ {
203
+ "cell_type": "code",
204
+ "execution_count": null,
205
+ "id": "e28c4e1a-ddb6-4914-8b26-a6053e90c272",
206
+ "metadata": {},
207
+ "outputs": [],
208
+ "source": []
209
+ }
210
+ ],
211
+ "metadata": {
212
+ "kernelspec": {
213
+ "display_name": "Python (styleenv)",
214
+ "language": "python",
215
+ "name": "styleenv"
216
+ },
217
+ "language_info": {
218
+ "codemirror_mode": {
219
+ "name": "ipython",
220
+ "version": 3
221
+ },
222
+ "file_extension": ".py",
223
+ "mimetype": "text/x-python",
224
+ "name": "python",
225
+ "nbconvert_exporter": "python",
226
+ "pygments_lexer": "ipython3",
227
+ "version": "3.11.6"
228
+ }
229
+ },
230
+ "nbformat": 4,
231
+ "nbformat_minor": 5
232
+ }
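
Note: the first cell recorded above fails with `ModuleNotFoundError: No module named 'models'`; together with the commented-out `#%cd ..`, this suggests the kernel was not running from the StyleTTS2 repository root, so `models.py`, `utils.py`, and `text_utils.py` are not importable. A minimal, hedged fix is to point the interpreter at the repo root before the imports; the path below is an assumption and should match wherever the repository was actually cloned:

```python
# Hedged sketch: make the StyleTTS2 repo root importable before `from models import *`.
# REPO_ROOT is a hypothetical location; adjust it to the actual clone path.
import os
import sys

REPO_ROOT = os.path.expanduser("~/StyleTTS2")
if REPO_ROOT not in sys.path:
    sys.path.insert(0, REPO_ROOT)
os.chdir(REPO_ROOT)  # so relative paths like "Models/LJSpeech/config.yml" resolve too

from models import *                 # defined in models.py at the repo root
from utils import *                  # load_ASR_models, load_F0_models, recursive_munch, ...
from text_utils import TextCleaner
```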
.ipynb_checkpoints/voice_clone-checkpoint.ipynb ADDED
@@ -0,0 +1,6 @@
1
+ {
2
+ "cells": [],
3
+ "metadata": {},
4
+ "nbformat": 4,
5
+ "nbformat_minor": 5
6
+ }
.vscode/settings.json ADDED
@@ -0,0 +1,3 @@
1
+ {
2
+ "git.ignoreLimitWarning": true
3
+ }
Colab/StyleTTS2_Demo_LJSpeech.ipynb ADDED
@@ -0,0 +1,486 @@
1
+ {
2
+ "nbformat": 4,
3
+ "nbformat_minor": 0,
4
+ "metadata": {
5
+ "colab": {
6
+ "provenance": [],
7
+ "gpuType": "T4",
8
+ "authorship_tag": "ABX9TyM1x2mx2VnkYNFVlD+DFzmy",
9
+ "include_colab_link": true
10
+ },
11
+ "kernelspec": {
12
+ "name": "python3",
13
+ "display_name": "Python 3"
14
+ },
15
+ "language_info": {
16
+ "name": "python"
17
+ },
18
+ "accelerator": "GPU"
19
+ },
20
+ "cells": [
21
+ {
22
+ "cell_type": "markdown",
23
+ "metadata": {
24
+ "id": "view-in-github",
25
+ "colab_type": "text"
26
+ },
27
+ "source": [
28
+ "<a href=\"https://colab.research.google.com/github/yl4579/StyleTTS2/blob/main/Colab/StyleTTS2_Demo_LJSpeech.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
29
+ ]
30
+ },
31
+ {
32
+ "cell_type": "markdown",
33
+ "source": [
34
+ "### Install packages and download models"
35
+ ],
36
+ "metadata": {
37
+ "id": "nm653VK4CG9F"
38
+ }
39
+ },
40
+ {
41
+ "cell_type": "code",
42
+ "source": [
43
+ "%%shell\n",
44
+ "git clone https://github.com/yl4579/StyleTTS2.git\n",
45
+ "cd StyleTTS2\n",
46
+ "pip install SoundFile torchaudio munch torch pydub pyyaml librosa nltk matplotlib accelerate transformers phonemizer einops einops-exts tqdm typing-extensions git+https://github.com/resemble-ai/monotonic_align.git\n",
47
+ "sudo apt-get install espeak-ng\n",
48
+ "git-lfs clone https://huggingface.co/yl4579/StyleTTS2-LJSpeech\n",
49
+ "mv StyleTTS2-LJSpeech/Models ."
50
+ ],
51
+ "metadata": {
52
+ "id": "gciBKMqCCLvT"
53
+ },
54
+ "execution_count": null,
55
+ "outputs": []
56
+ },
57
+ {
58
+ "cell_type": "markdown",
59
+ "source": [
60
+ "### Load models"
61
+ ],
62
+ "metadata": {
63
+ "id": "OAA8lx-XCQnM"
64
+ }
65
+ },
66
+ {
67
+ "cell_type": "code",
68
+ "source": [
69
+ "%cd StyleTTS2\n",
70
+ "\n",
71
+ "import torch\n",
72
+ "torch.manual_seed(0)\n",
73
+ "torch.backends.cudnn.benchmark = False\n",
74
+ "torch.backends.cudnn.deterministic = True\n",
75
+ "\n",
76
+ "import random\n",
77
+ "random.seed(0)\n",
78
+ "\n",
79
+ "import numpy as np\n",
80
+ "np.random.seed(0)\n",
81
+ "\n",
82
+ "import nltk\n",
83
+ "nltk.download('punkt')\n",
84
+ "\n",
85
+ "# load packages\n",
86
+ "import time\n",
87
+ "import random\n",
88
+ "import yaml\n",
89
+ "from munch import Munch\n",
90
+ "import numpy as np\n",
91
+ "import torch\n",
92
+ "from torch import nn\n",
93
+ "import torch.nn.functional as F\n",
94
+ "import torchaudio\n",
95
+ "import librosa\n",
96
+ "from nltk.tokenize import word_tokenize\n",
97
+ "\n",
98
+ "from models import *\n",
99
+ "from utils import *\n",
100
+ "from text_utils import TextCleaner\n",
101
+ "textclenaer = TextCleaner()\n",
102
+ "\n",
103
+ "%matplotlib inline\n",
104
+ "\n",
105
+ "device = 'cuda' if torch.cuda.is_available() else 'cpu'\n",
106
+ "\n",
107
+ "to_mel = torchaudio.transforms.MelSpectrogram(\n",
108
+ " n_mels=80, n_fft=2048, win_length=1200, hop_length=300)\n",
109
+ "mean, std = -4, 4\n",
110
+ "\n",
111
+ "def length_to_mask(lengths):\n",
112
+ " mask = torch.arange(lengths.max()).unsqueeze(0).expand(lengths.shape[0], -1).type_as(lengths)\n",
113
+ " mask = torch.gt(mask+1, lengths.unsqueeze(1))\n",
114
+ " return mask\n",
115
+ "\n",
116
+ "def preprocess(wave):\n",
117
+ " wave_tensor = torch.from_numpy(wave).float()\n",
118
+ " mel_tensor = to_mel(wave_tensor)\n",
119
+ " mel_tensor = (torch.log(1e-5 + mel_tensor.unsqueeze(0)) - mean) / std\n",
120
+ " return mel_tensor\n",
121
+ "\n",
122
+ "def compute_style(ref_dicts):\n",
123
+ " reference_embeddings = {}\n",
124
+ " for key, path in ref_dicts.items():\n",
125
+ " wave, sr = librosa.load(path, sr=24000)\n",
126
+ " audio, index = librosa.effects.trim(wave, top_db=30)\n",
127
+ " if sr != 24000:\n",
128
+ " audio = librosa.resample(audio, sr, 24000)\n",
129
+ " mel_tensor = preprocess(audio).to(device)\n",
130
+ "\n",
131
+ " with torch.no_grad():\n",
132
+ " ref = model.style_encoder(mel_tensor.unsqueeze(1))\n",
133
+ " reference_embeddings[key] = (ref.squeeze(1), audio)\n",
134
+ "\n",
135
+ " return reference_embeddings\n",
136
+ "\n",
137
+ "# load phonemizer\n",
138
+ "import phonemizer\n",
139
+ "global_phonemizer = phonemizer.backend.EspeakBackend(language='en-us', preserve_punctuation=True, with_stress=True, words_mismatch='ignore')\n",
140
+ "\n",
141
+ "config = yaml.safe_load(open(\"Models/LJSpeech/config.yml\"))\n",
142
+ "\n",
143
+ "# load pretrained ASR model\n",
144
+ "ASR_config = config.get('ASR_config', False)\n",
145
+ "ASR_path = config.get('ASR_path', False)\n",
146
+ "text_aligner = load_ASR_models(ASR_path, ASR_config)\n",
147
+ "\n",
148
+ "# load pretrained F0 model\n",
149
+ "F0_path = config.get('F0_path', False)\n",
150
+ "pitch_extractor = load_F0_models(F0_path)\n",
151
+ "\n",
152
+ "# load BERT model\n",
153
+ "from Utils.PLBERT.util import load_plbert\n",
154
+ "BERT_path = config.get('PLBERT_dir', False)\n",
155
+ "plbert = load_plbert(BERT_path)\n",
156
+ "\n",
157
+ "model = build_model(recursive_munch(config['model_params']), text_aligner, pitch_extractor, plbert)\n",
158
+ "_ = [model[key].eval() for key in model]\n",
159
+ "_ = [model[key].to(device) for key in model]\n",
160
+ "\n",
161
+ "params_whole = torch.load(\"Models/LJSpeech/epoch_2nd_00100.pth\", map_location='cpu')\n",
162
+ "params = params_whole['net']\n",
163
+ "\n",
164
+ "for key in model:\n",
165
+ " if key in params:\n",
166
+ " print('%s loaded' % key)\n",
167
+ " try:\n",
168
+ " model[key].load_state_dict(params[key])\n",
169
+ " except:\n",
170
+ " from collections import OrderedDict\n",
171
+ " state_dict = params[key]\n",
172
+ " new_state_dict = OrderedDict()\n",
173
+ " for k, v in state_dict.items():\n",
174
+ " name = k[7:] # remove `module.`\n",
175
+ " new_state_dict[name] = v\n",
176
+ " # load params\n",
177
+ " model[key].load_state_dict(new_state_dict, strict=False)\n",
178
+ "# except:\n",
179
+ "# _load(params[key], model[key])\n",
180
+ "_ = [model[key].eval() for key in model]\n",
181
+ "\n",
182
+ "from Modules.diffusion.sampler import DiffusionSampler, ADPM2Sampler, KarrasSchedule\n",
183
+ "\n",
184
+ "sampler = DiffusionSampler(\n",
185
+ " model.diffusion.diffusion,\n",
186
+ " sampler=ADPM2Sampler(),\n",
187
+ " sigma_schedule=KarrasSchedule(sigma_min=0.0001, sigma_max=3.0, rho=9.0), # empirical parameters\n",
188
+ " clamp=False\n",
189
+ ")\n",
190
+ "\n",
191
+ "def inference(text, noise, diffusion_steps=5, embedding_scale=1):\n",
192
+ " text = text.strip()\n",
193
+ " text = text.replace('\"', '')\n",
194
+ " ps = global_phonemizer.phonemize([text])\n",
195
+ " ps = word_tokenize(ps[0])\n",
196
+ " ps = ' '.join(ps)\n",
197
+ "\n",
198
+ " tokens = textclenaer(ps)\n",
199
+ " tokens.insert(0, 0)\n",
200
+ " tokens = torch.LongTensor(tokens).to(device).unsqueeze(0)\n",
201
+ "\n",
202
+ " with torch.no_grad():\n",
203
+ " input_lengths = torch.LongTensor([tokens.shape[-1]]).to(tokens.device)\n",
204
+ " text_mask = length_to_mask(input_lengths).to(tokens.device)\n",
205
+ "\n",
206
+ " t_en = model.text_encoder(tokens, input_lengths, text_mask)\n",
207
+ " bert_dur = model.bert(tokens, attention_mask=(~text_mask).int())\n",
208
+ " d_en = model.bert_encoder(bert_dur).transpose(-1, -2)\n",
209
+ "\n",
210
+ " s_pred = sampler(noise,\n",
211
+ " embedding=bert_dur[0].unsqueeze(0), num_steps=diffusion_steps,\n",
212
+ " embedding_scale=embedding_scale).squeeze(0)\n",
213
+ "\n",
214
+ " s = s_pred[:, 128:]\n",
215
+ " ref = s_pred[:, :128]\n",
216
+ "\n",
217
+ " d = model.predictor.text_encoder(d_en, s, input_lengths, text_mask)\n",
218
+ "\n",
219
+ " x, _ = model.predictor.lstm(d)\n",
220
+ " duration = model.predictor.duration_proj(x)\n",
221
+ " duration = torch.sigmoid(duration).sum(axis=-1)\n",
222
+ " pred_dur = torch.round(duration.squeeze()).clamp(min=1)\n",
223
+ "\n",
224
+ " pred_dur[-1] += 5\n",
225
+ "\n",
226
+ " pred_aln_trg = torch.zeros(input_lengths, int(pred_dur.sum().data))\n",
227
+ " c_frame = 0\n",
228
+ " for i in range(pred_aln_trg.size(0)):\n",
229
+ " pred_aln_trg[i, c_frame:c_frame + int(pred_dur[i].data)] = 1\n",
230
+ " c_frame += int(pred_dur[i].data)\n",
231
+ "\n",
232
+ " # encode prosody\n",
233
+ " en = (d.transpose(-1, -2) @ pred_aln_trg.unsqueeze(0).to(device))\n",
234
+ " F0_pred, N_pred = model.predictor.F0Ntrain(en, s)\n",
235
+ " out = model.decoder((t_en @ pred_aln_trg.unsqueeze(0).to(device)),\n",
236
+ " F0_pred, N_pred, ref.squeeze().unsqueeze(0))\n",
237
+ "\n",
238
+ " return out.squeeze().cpu().numpy()\n",
239
+ "\n",
240
+ "def LFinference(text, s_prev, noise, alpha=0.7, diffusion_steps=5, embedding_scale=1):\n",
241
+ " text = text.strip()\n",
242
+ " text = text.replace('\"', '')\n",
243
+ " ps = global_phonemizer.phonemize([text])\n",
244
+ " ps = word_tokenize(ps[0])\n",
245
+ " ps = ' '.join(ps)\n",
246
+ "\n",
247
+ " tokens = textclenaer(ps)\n",
248
+ " tokens.insert(0, 0)\n",
249
+ " tokens = torch.LongTensor(tokens).to(device).unsqueeze(0)\n",
250
+ "\n",
251
+ " with torch.no_grad():\n",
252
+ " input_lengths = torch.LongTensor([tokens.shape[-1]]).to(tokens.device)\n",
253
+ " text_mask = length_to_mask(input_lengths).to(tokens.device)\n",
254
+ "\n",
255
+ " t_en = model.text_encoder(tokens, input_lengths, text_mask)\n",
256
+ " bert_dur = model.bert(tokens, attention_mask=(~text_mask).int())\n",
257
+ " d_en = model.bert_encoder(bert_dur).transpose(-1, -2)\n",
258
+ "\n",
259
+ " s_pred = sampler(noise,\n",
260
+ " embedding=bert_dur[0].unsqueeze(0), num_steps=diffusion_steps,\n",
261
+ " embedding_scale=embedding_scale).squeeze(0)\n",
262
+ "\n",
263
+ " if s_prev is not None:\n",
264
+ " # convex combination of previous and current style\n",
265
+ " s_pred = alpha * s_prev + (1 - alpha) * s_pred\n",
266
+ "\n",
267
+ " s = s_pred[:, 128:]\n",
268
+ " ref = s_pred[:, :128]\n",
269
+ "\n",
270
+ " d = model.predictor.text_encoder(d_en, s, input_lengths, text_mask)\n",
271
+ "\n",
272
+ " x, _ = model.predictor.lstm(d)\n",
273
+ " duration = model.predictor.duration_proj(x)\n",
274
+ " duration = torch.sigmoid(duration).sum(axis=-1)\n",
275
+ " pred_dur = torch.round(duration.squeeze()).clamp(min=1)\n",
276
+ "\n",
277
+ " pred_aln_trg = torch.zeros(input_lengths, int(pred_dur.sum().data))\n",
278
+ " c_frame = 0\n",
279
+ " for i in range(pred_aln_trg.size(0)):\n",
280
+ " pred_aln_trg[i, c_frame:c_frame + int(pred_dur[i].data)] = 1\n",
281
+ " c_frame += int(pred_dur[i].data)\n",
282
+ "\n",
283
+ " # encode prosody\n",
284
+ " en = (d.transpose(-1, -2) @ pred_aln_trg.unsqueeze(0).to(device))\n",
285
+ " F0_pred, N_pred = model.predictor.F0Ntrain(en, s)\n",
286
+ " out = model.decoder((t_en @ pred_aln_trg.unsqueeze(0).to(device)),\n",
287
+ " F0_pred, N_pred, ref.squeeze().unsqueeze(0))\n",
288
+ "\n",
289
+ " return out.squeeze().cpu().numpy(), s_pred"
290
+ ],
291
+ "metadata": {
292
+ "id": "m0XRpbxSCSix"
293
+ },
294
+ "execution_count": null,
295
+ "outputs": []
296
+ },
297
+ {
298
+ "cell_type": "markdown",
299
+ "source": [
300
+ "### Synthesize speech"
301
+ ],
302
+ "metadata": {
303
+ "id": "vuCbS0gdArgJ"
304
+ }
305
+ },
306
+ {
307
+ "cell_type": "code",
308
+ "source": [
309
+ "# @title Input Text { display-mode: \"form\" }\n",
310
+ "# synthesize a text\n",
311
+ "text = \"StyleTTS 2 is a text-to-speech model that leverages style diffusion and adversarial training with large speech language models to achieve human-level text-to-speech synthesis.\" # @param {type:\"string\"}\n"
312
+ ],
313
+ "metadata": {
314
+ "id": "7Ud1Y-kbBPTw"
315
+ },
316
+ "execution_count": 3,
317
+ "outputs": []
318
+ },
319
+ {
320
+ "cell_type": "markdown",
321
+ "source": [
322
+ "#### Basic synthesis (5 diffusion steps)"
323
+ ],
324
+ "metadata": {
325
+ "id": "TM2NjuM7B6sz"
326
+ }
327
+ },
328
+ {
329
+ "cell_type": "code",
330
+ "source": [
331
+ "start = time.time()\n",
332
+ "noise = torch.randn(1,1,256).to(device)\n",
333
+ "wav = inference(text, noise, diffusion_steps=5, embedding_scale=1)\n",
334
+ "rtf = (time.time() - start) / (len(wav) / 24000)\n",
335
+ "print(f\"RTF = {rtf:5f}\")\n",
336
+ "import IPython.display as ipd\n",
337
+ "display(ipd.Audio(wav, rate=24000))"
338
+ ],
339
+ "metadata": {
340
+ "id": "KILqC-V-Ay5e"
341
+ },
342
+ "execution_count": null,
343
+ "outputs": []
344
+ },
345
+ {
346
+ "cell_type": "markdown",
347
+ "source": [
348
+ "#### With higher diffusion steps (more diverse)\n",
349
+ "Since the sampler is ancestral, the higher the stpes, the more diverse the samples are, with the cost of slower synthesis speed."
350
+ ],
351
+ "metadata": {
352
+ "id": "oZk9o-EzCBVx"
353
+ }
354
+ },
355
+ {
356
+ "cell_type": "code",
357
+ "source": [
358
+ "start = time.time()\n",
359
+ "noise = torch.randn(1,1,256).to(device)\n",
360
+ "wav = inference(text, noise, diffusion_steps=10, embedding_scale=1)\n",
361
+ "rtf = (time.time() - start) / (len(wav) / 24000)\n",
362
+ "print(f\"RTF = {rtf:5f}\")\n",
363
+ "import IPython.display as ipd\n",
364
+ "display(ipd.Audio(wav, rate=24000))"
365
+ ],
366
+ "metadata": {
367
+ "id": "9_OHtzMbB9gL"
368
+ },
369
+ "execution_count": null,
370
+ "outputs": []
371
+ },
372
+ {
373
+ "cell_type": "markdown",
374
+ "source": [
375
+ "### Speech expressiveness\n",
376
+ "The following section recreates the samples shown in [Section 6](https://styletts2.github.io/#emo) of the demo page."
377
+ ],
378
+ "metadata": {
379
+ "id": "NyDACd-0CaqL"
380
+ }
381
+ },
382
+ {
383
+ "cell_type": "markdown",
384
+ "source": [
385
+ "#### With embedding_scale=1\n",
386
+ "This is the classifier-free guidance scale. The higher the scale, the more conditional the style is to the input text and hence more emotional."
387
+ ],
388
+ "metadata": {
389
+ "id": "cRkS5VWxCck4"
390
+ }
391
+ },
392
+ {
393
+ "cell_type": "code",
394
+ "source": [
395
+ "texts = {}\n",
396
+ "texts['Happy'] = \"We are happy to invite you to join us on a journey to the past, where we will visit the most amazing monuments ever built by human hands.\"\n",
397
+ "texts['Sad'] = \"I am sorry to say that we have suffered a severe setback in our efforts to restore prosperity and confidence.\"\n",
398
+ "texts['Angry'] = \"The field of astronomy is a joke! Its theories are based on flawed observations and biased interpretations!\"\n",
399
+ "texts['Surprised'] = \"I can't believe it! You mean to tell me that you have discovered a new species of bacteria in this pond?\"\n",
400
+ "\n",
401
+ "for k,v in texts.items():\n",
402
+ " noise = torch.randn(1,1,256).to(device)\n",
403
+ " wav = inference(v, noise, diffusion_steps=10, embedding_scale=1)\n",
404
+ " print(k + \": \")\n",
405
+ " display(ipd.Audio(wav, rate=24000, normalize=False))"
406
+ ],
407
+ "metadata": {
408
+ "id": "H5g5RO-mCbZB"
409
+ },
410
+ "execution_count": null,
411
+ "outputs": []
412
+ },
413
+ {
414
+ "cell_type": "markdown",
415
+ "source": [
416
+ "#### With embedding_scale=2"
417
+ ],
418
+ "metadata": {
419
+ "id": "f4S8TXSpCgpA"
420
+ }
421
+ },
422
+ {
423
+ "cell_type": "code",
424
+ "source": [
425
+ "texts = {}\n",
426
+ "texts['Happy'] = \"We are happy to invite you to join us on a journey to the past, where we will visit the most amazing monuments ever built by human hands.\"\n",
427
+ "texts['Sad'] = \"I am sorry to say that we have suffered a severe setback in our efforts to restore prosperity and confidence.\"\n",
428
+ "texts['Angry'] = \"The field of astronomy is a joke! Its theories are based on flawed observations and biased interpretations!\"\n",
429
+ "texts['Surprised'] = \"I can't believe it! You mean to tell me that you have discovered a new species of bacteria in this pond?\"\n",
430
+ "\n",
431
+ "for k,v in texts.items():\n",
432
+ " noise = torch.randn(1,1,256).to(device)\n",
433
+ " wav = inference(v, noise, diffusion_steps=10, embedding_scale=2) # embedding_scale=2 for more pronounced emotion\n",
434
+ " print(k + \": \")\n",
435
+ " display(ipd.Audio(wav, rate=24000, normalize=False))"
436
+ ],
437
+ "metadata": {
438
+ "id": "xHHIdeNrCezC"
439
+ },
440
+ "execution_count": null,
441
+ "outputs": []
442
+ },
443
+ {
444
+ "cell_type": "markdown",
445
+ "source": [
446
+ "### Long-form generation\n",
447
+ "This section includes basic implementation of Algorithm 1 in the paper for consistent longform audio generation. The example passage is taken from [Section 5](https://styletts2.github.io/#long) of the demo page."
448
+ ],
449
+ "metadata": {
450
+ "id": "nAh7Tov4CkuH"
451
+ }
452
+ },
453
+ {
454
+ "cell_type": "code",
455
+ "source": [
456
+ "passage = '''If the supply of fruit is greater than the family needs, it may be made a source of income by sending the fresh fruit to the market if there is one near enough, or by preserving, canning, and making jelly for sale. To make such an enterprise a success the fruit and work must be first class. There is magic in the word \"Homemade,\" when the product appeals to the eye and the palate; but many careless and incompetent people have found to their sorrow that this word has not magic enough to float inferior goods on the market. As a rule large canning and preserving establishments are clean and have the best appliances, and they employ chemists and skilled labor. The home product must be very good to compete with the attractive goods that are sent out from such establishments. Yet for first-class homemade products there is a market in all large cities. All first-class grocers have customers who purchase such goods.''' # @param {type:\"string\"}"
457
+ ],
458
+ "metadata": {
459
+ "cellView": "form",
460
+ "id": "IJwUbgvACoDu"
461
+ },
462
+ "execution_count": 8,
463
+ "outputs": []
464
+ },
465
+ {
466
+ "cell_type": "code",
467
+ "source": [
468
+ "sentences = passage.split('.') # simple split by comma\n",
469
+ "wavs = []\n",
470
+ "s_prev = None\n",
471
+ "for text in sentences:\n",
472
+ " if text.strip() == \"\": continue\n",
473
+ " text += '.' # add it back\n",
474
+ " noise = torch.randn(1,1,256).to(device)\n",
475
+ " wav, s_prev = LFinference(text, s_prev, noise, alpha=0.7, diffusion_steps=10, embedding_scale=1.5)\n",
476
+ " wavs.append(wav)\n",
477
+ "display(ipd.Audio(np.concatenate(wavs), rate=24000, normalize=False))"
478
+ ],
479
+ "metadata": {
480
+ "id": "nP-7i2QAC0JT"
481
+ },
482
+ "execution_count": null,
483
+ "outputs": []
484
+ }
485
+ ]
486
+ }
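
Note: in the long-form generation cell above, the passage is segmented with `passage.split('.')`, which drops question marks and exclamation points and also splits on abbreviations. Since the notebook already downloads the punkt tokenizer, a hedged alternative (reusing the notebook's `LFinference`, `device`, and `passage` names, which are assumed to be defined in the session) is:

```python
# Alternative sentence splitting for the long-form demo, assuming the notebook's
# LFinference(), device, and passage are already defined in the session.
import numpy as np
import torch
from nltk.tokenize import sent_tokenize

wavs, s_prev = [], None
for sentence in sent_tokenize(passage):   # keeps '?' and '!' and handles abbreviations
    noise = torch.randn(1, 1, 256).to(device)
    wav, s_prev = LFinference(sentence, s_prev, noise,
                              alpha=0.7, diffusion_steps=10, embedding_scale=1.5)
    wavs.append(wav)

audio = np.concatenate(wavs)              # 24 kHz waveform, as in the original cell
```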
Colab/StyleTTS2_Demo_LibriTTS.ipynb ADDED
@@ -0,0 +1,1218 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "metadata": {
6
+ "id": "view-in-github",
7
+ "colab_type": "text"
8
+ },
9
+ "source": [
10
+ "<a href=\"https://colab.research.google.com/github/yl4579/StyleTTS2/blob/main/Colab/StyleTTS2_Demo_LibriTTS.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
11
+ ]
12
+ },
13
+ {
14
+ "cell_type": "markdown",
15
+ "metadata": {
16
+ "id": "aAGQPfgYIR23"
17
+ },
18
+ "source": [
19
+ "### Install packages and download models"
20
+ ]
21
+ },
22
+ {
23
+ "cell_type": "code",
24
+ "execution_count": null,
25
+ "metadata": {
26
+ "colab": {
27
+ "base_uri": "https://localhost:8080/"
28
+ },
29
+ "id": "zDPW5uSpISd2",
30
+ "outputId": "6463ff79-18d5-4071-c6ad-01947beeb368"
31
+ },
32
+ "outputs": [
33
+ {
34
+ "output_type": "stream",
35
+ "name": "stdout",
36
+ "text": [
37
+
38
+ ]
39
+ }
40
+ ],
41
+ "source": [
42
+ "%%shell\n",
43
+ "git clone https://github.com/yl4579/StyleTTS2.git\n",
44
+ "cd StyleTTS2\n",
45
+ "pip install SoundFile torchaudio munch torch pydub pyyaml librosa nltk matplotlib accelerate transformers phonemizer einops einops-exts tqdm typing-extensions git+https://github.com/resemble-ai/monotonic_align.git\n",
46
+ "sudo apt-get install espeak-ng\n",
47
+ "git-lfs clone https://huggingface.co/yl4579/StyleTTS2-LibriTTS\n",
48
+ "mv StyleTTS2-LibriTTS/Models .\n",
49
+ "mv StyleTTS2-LibriTTS/reference_audio.zip .\n",
50
+ "unzip reference_audio.zip\n",
51
+ "mv reference_audio Demo/reference_audio"
52
+ ]
53
+ },
54
+ {
55
+ "cell_type": "markdown",
56
+ "metadata": {
57
+ "id": "eJdB_nCOIVIN"
58
+ },
59
+ "source": [
60
+ "### Load models"
61
+ ]
62
+ },
63
+ {
64
+ "cell_type": "code",
65
+ "execution_count": null,
66
+ "metadata": {
67
+ "id": "cha8Tr2uJwN0"
68
+ },
69
+ "outputs": [],
70
+ "source": [
71
+ "import nltk\n",
72
+ "nltk.download('punkt')"
73
+ ]
74
+ },
75
+ {
76
+ "cell_type": "code",
77
+ "execution_count": null,
78
+ "metadata": {
79
+ "id": "Qoow8Wd8ITtm"
80
+ },
81
+ "outputs": [],
82
+ "source": [
83
+ "%cd StyleTTS2\n",
84
+ "\n",
85
+ "import torch\n",
86
+ "torch.manual_seed(0)\n",
87
+ "torch.backends.cudnn.benchmark = False\n",
88
+ "torch.backends.cudnn.deterministic = True\n",
89
+ "\n",
90
+ "import random\n",
91
+ "random.seed(0)\n",
92
+ "\n",
93
+ "import numpy as np\n",
94
+ "np.random.seed(0)\n",
95
+ "\n",
96
+ "# load packages\n",
97
+ "import time\n",
98
+ "import random\n",
99
+ "import yaml\n",
100
+ "from munch import Munch\n",
101
+ "import numpy as np\n",
102
+ "import torch\n",
103
+ "from torch import nn\n",
104
+ "import torch.nn.functional as F\n",
105
+ "import torchaudio\n",
106
+ "import librosa\n",
107
+ "from nltk.tokenize import word_tokenize\n",
108
+ "\n",
109
+ "from models import *\n",
110
+ "from utils import *\n",
111
+ "from text_utils import TextCleaner\n",
112
+ "textclenaer = TextCleaner()\n",
113
+ "\n",
114
+ "%matplotlib inline\n",
115
+ "\n",
116
+ "to_mel = torchaudio.transforms.MelSpectrogram(\n",
117
+ " n_mels=80, n_fft=2048, win_length=1200, hop_length=300)\n",
118
+ "mean, std = -4, 4\n",
119
+ "\n",
120
+ "def length_to_mask(lengths):\n",
121
+ " mask = torch.arange(lengths.max()).unsqueeze(0).expand(lengths.shape[0], -1).type_as(lengths)\n",
122
+ " mask = torch.gt(mask+1, lengths.unsqueeze(1))\n",
123
+ " return mask\n",
124
+ "\n",
125
+ "def preprocess(wave):\n",
126
+ " wave_tensor = torch.from_numpy(wave).float()\n",
127
+ " mel_tensor = to_mel(wave_tensor)\n",
128
+ " mel_tensor = (torch.log(1e-5 + mel_tensor.unsqueeze(0)) - mean) / std\n",
129
+ " return mel_tensor\n",
130
+ "\n",
131
+ "def compute_style(path):\n",
132
+ " wave, sr = librosa.load(path, sr=24000)\n",
133
+ " audio, index = librosa.effects.trim(wave, top_db=30)\n",
134
+ " if sr != 24000:\n",
135
+ " audio = librosa.resample(audio, sr, 24000)\n",
136
+ " mel_tensor = preprocess(audio).to(device)\n",
137
+ "\n",
138
+ " with torch.no_grad():\n",
139
+ " ref_s = model.style_encoder(mel_tensor.unsqueeze(1))\n",
140
+ " ref_p = model.predictor_encoder(mel_tensor.unsqueeze(1))\n",
141
+ "\n",
142
+ " return torch.cat([ref_s, ref_p], dim=1)\n",
143
+ "\n",
144
+ "device = 'cuda' if torch.cuda.is_available() else 'cpu'\n",
145
+ "\n",
146
+ "# load phonemizer\n",
147
+ "import phonemizer\n",
148
+ "global_phonemizer = phonemizer.backend.EspeakBackend(language='en-us', preserve_punctuation=True, with_stress=True)\n",
149
+ "\n",
150
+ "config = yaml.safe_load(open(\"Models/LibriTTS/config.yml\"))\n",
151
+ "\n",
152
+ "# load pretrained ASR model\n",
153
+ "ASR_config = config.get('ASR_config', False)\n",
154
+ "ASR_path = config.get('ASR_path', False)\n",
155
+ "text_aligner = load_ASR_models(ASR_path, ASR_config)\n",
156
+ "\n",
157
+ "# load pretrained F0 model\n",
158
+ "F0_path = config.get('F0_path', False)\n",
159
+ "pitch_extractor = load_F0_models(F0_path)\n",
160
+ "\n",
161
+ "# load BERT model\n",
162
+ "from Utils.PLBERT.util import load_plbert\n",
163
+ "BERT_path = config.get('PLBERT_dir', False)\n",
164
+ "plbert = load_plbert(BERT_path)\n",
165
+ "\n",
166
+ "model_params = recursive_munch(config['model_params'])\n",
167
+ "model = build_model(model_params, text_aligner, pitch_extractor, plbert)\n",
168
+ "_ = [model[key].eval() for key in model]\n",
169
+ "_ = [model[key].to(device) for key in model]\n",
170
+ "\n",
171
+ "params_whole = torch.load(\"Models/LibriTTS/epochs_2nd_00020.pth\", map_location='cpu')\n",
172
+ "params = params_whole['net']\n",
173
+ "\n",
174
+ "for key in model:\n",
175
+ " if key in params:\n",
176
+ " print('%s loaded' % key)\n",
177
+ " try:\n",
178
+ " model[key].load_state_dict(params[key])\n",
179
+ " except:\n",
180
+ " from collections import OrderedDict\n",
181
+ " state_dict = params[key]\n",
182
+ " new_state_dict = OrderedDict()\n",
183
+ " for k, v in state_dict.items():\n",
184
+ " name = k[7:] # remove `module.`\n",
185
+ " new_state_dict[name] = v\n",
186
+ " # load params\n",
187
+ " model[key].load_state_dict(new_state_dict, strict=False)\n",
188
+ "# except:\n",
189
+ "# _load(params[key], model[key])\n",
190
+ "_ = [model[key].eval() for key in model]\n",
191
+ "\n",
192
+ "from Modules.diffusion.sampler import DiffusionSampler, ADPM2Sampler, KarrasSchedule\n",
193
+ "\n",
194
+ "sampler = DiffusionSampler(\n",
195
+ " model.diffusion.diffusion,\n",
196
+ " sampler=ADPM2Sampler(),\n",
197
+ " sigma_schedule=KarrasSchedule(sigma_min=0.0001, sigma_max=3.0, rho=9.0), # empirical parameters\n",
198
+ " clamp=False\n",
199
+ ")\n",
200
+ "\n",
201
+ "def inference(text, ref_s, alpha = 0.3, beta = 0.7, diffusion_steps=5, embedding_scale=1):\n",
202
+ " text = text.strip()\n",
203
+ " ps = global_phonemizer.phonemize([text])\n",
204
+ " ps = word_tokenize(ps[0])\n",
205
+ " ps = ' '.join(ps)\n",
206
+ " tokens = textclenaer(ps)\n",
207
+ " tokens.insert(0, 0)\n",
208
+ " tokens = torch.LongTensor(tokens).to(device).unsqueeze(0)\n",
209
+ "\n",
210
+ " with torch.no_grad():\n",
211
+ " input_lengths = torch.LongTensor([tokens.shape[-1]]).to(device)\n",
212
+ " text_mask = length_to_mask(input_lengths).to(device)\n",
213
+ "\n",
214
+ " t_en = model.text_encoder(tokens, input_lengths, text_mask)\n",
215
+ " bert_dur = model.bert(tokens, attention_mask=(~text_mask).int())\n",
216
+ " d_en = model.bert_encoder(bert_dur).transpose(-1, -2)\n",
217
+ "\n",
218
+ " s_pred = sampler(noise = torch.randn((1, 256)).unsqueeze(1).to(device),\n",
219
+ " embedding=bert_dur,\n",
220
+ " embedding_scale=embedding_scale,\n",
221
+ " features=ref_s, # reference from the same speaker as the embedding\n",
222
+ " num_steps=diffusion_steps).squeeze(1)\n",
223
+ "\n",
224
+ "\n",
225
+ " s = s_pred[:, 128:]\n",
226
+ " ref = s_pred[:, :128]\n",
227
+ "\n",
228
+ " ref = alpha * ref + (1 - alpha) * ref_s[:, :128]\n",
229
+ " s = beta * s + (1 - beta) * ref_s[:, 128:]\n",
230
+ "\n",
231
+ " d = model.predictor.text_encoder(d_en,\n",
232
+ " s, input_lengths, text_mask)\n",
233
+ "\n",
234
+ " x, _ = model.predictor.lstm(d)\n",
235
+ " duration = model.predictor.duration_proj(x)\n",
236
+ "\n",
237
+ " duration = torch.sigmoid(duration).sum(axis=-1)\n",
238
+ " pred_dur = torch.round(duration.squeeze()).clamp(min=1)\n",
239
+ "\n",
240
+ "\n",
241
+ " pred_aln_trg = torch.zeros(input_lengths, int(pred_dur.sum().data))\n",
242
+ " c_frame = 0\n",
243
+ " for i in range(pred_aln_trg.size(0)):\n",
244
+ " pred_aln_trg[i, c_frame:c_frame + int(pred_dur[i].data)] = 1\n",
245
+ " c_frame += int(pred_dur[i].data)\n",
246
+ "\n",
247
+ " # encode prosody\n",
248
+ " en = (d.transpose(-1, -2) @ pred_aln_trg.unsqueeze(0).to(device))\n",
249
+ " if model_params.decoder.type == \"hifigan\":\n",
250
+ " asr_new = torch.zeros_like(en)\n",
251
+ " asr_new[:, :, 0] = en[:, :, 0]\n",
252
+ " asr_new[:, :, 1:] = en[:, :, 0:-1]\n",
253
+ " en = asr_new\n",
254
+ "\n",
255
+ " F0_pred, N_pred = model.predictor.F0Ntrain(en, s)\n",
256
+ "\n",
257
+ " asr = (t_en @ pred_aln_trg.unsqueeze(0).to(device))\n",
258
+ " if model_params.decoder.type == \"hifigan\":\n",
259
+ " asr_new = torch.zeros_like(asr)\n",
260
+ " asr_new[:, :, 0] = asr[:, :, 0]\n",
261
+ " asr_new[:, :, 1:] = asr[:, :, 0:-1]\n",
262
+ " asr = asr_new\n",
263
+ "\n",
264
+ " out = model.decoder(asr,\n",
265
+ " F0_pred, N_pred, ref.squeeze().unsqueeze(0))\n",
266
+ "\n",
267
+ "\n",
268
+ " return out.squeeze().cpu().numpy()[..., :-50] # weird pulse at the end of the model, need to be fixed later\n",
269
+ "\n",
270
+ "def LFinference(text, s_prev, ref_s, alpha = 0.3, beta = 0.7, t = 0.7, diffusion_steps=5, embedding_scale=1):\n",
271
+ " text = text.strip()\n",
272
+ " ps = global_phonemizer.phonemize([text])\n",
273
+ " ps = word_tokenize(ps[0])\n",
274
+ " ps = ' '.join(ps)\n",
275
+ " ps = ps.replace('``', '\"')\n",
276
+ " ps = ps.replace(\"''\", '\"')\n",
277
+ "\n",
278
+ " tokens = textclenaer(ps)\n",
279
+ " tokens.insert(0, 0)\n",
280
+ " tokens = torch.LongTensor(tokens).to(device).unsqueeze(0)\n",
281
+ "\n",
282
+ " with torch.no_grad():\n",
283
+ " input_lengths = torch.LongTensor([tokens.shape[-1]]).to(device)\n",
284
+ " text_mask = length_to_mask(input_lengths).to(device)\n",
285
+ "\n",
286
+ " t_en = model.text_encoder(tokens, input_lengths, text_mask)\n",
287
+ " bert_dur = model.bert(tokens, attention_mask=(~text_mask).int())\n",
288
+ " d_en = model.bert_encoder(bert_dur).transpose(-1, -2)\n",
289
+ "\n",
290
+ " s_pred = sampler(noise = torch.randn((1, 256)).unsqueeze(1).to(device),\n",
291
+ " embedding=bert_dur,\n",
292
+ " embedding_scale=embedding_scale,\n",
293
+ " features=ref_s, # reference from the same speaker as the embedding\n",
294
+ " num_steps=diffusion_steps).squeeze(1)\n",
295
+ "\n",
296
+ " if s_prev is not None:\n",
297
+ " # convex combination of previous and current style\n",
298
+ " s_pred = t * s_prev + (1 - t) * s_pred\n",
299
+ "\n",
300
+ " s = s_pred[:, 128:]\n",
301
+ " ref = s_pred[:, :128]\n",
302
+ "\n",
303
+ " ref = alpha * ref + (1 - alpha) * ref_s[:, :128]\n",
304
+ " s = beta * s + (1 - beta) * ref_s[:, 128:]\n",
305
+ "\n",
306
+ " s_pred = torch.cat([ref, s], dim=-1)\n",
307
+ "\n",
308
+ " d = model.predictor.text_encoder(d_en,\n",
309
+ " s, input_lengths, text_mask)\n",
310
+ "\n",
311
+ " x, _ = model.predictor.lstm(d)\n",
312
+ " duration = model.predictor.duration_proj(x)\n",
313
+ "\n",
314
+ " duration = torch.sigmoid(duration).sum(axis=-1)\n",
315
+ " pred_dur = torch.round(duration.squeeze()).clamp(min=1)\n",
316
+ "\n",
317
+ "\n",
318
+ " pred_aln_trg = torch.zeros(input_lengths, int(pred_dur.sum().data))\n",
319
+ " c_frame = 0\n",
320
+ " for i in range(pred_aln_trg.size(0)):\n",
321
+ " pred_aln_trg[i, c_frame:c_frame + int(pred_dur[i].data)] = 1\n",
322
+ " c_frame += int(pred_dur[i].data)\n",
323
+ "\n",
324
+ " # encode prosody\n",
325
+ " en = (d.transpose(-1, -2) @ pred_aln_trg.unsqueeze(0).to(device))\n",
326
+ " if model_params.decoder.type == \"hifigan\":\n",
327
+ " asr_new = torch.zeros_like(en)\n",
328
+ " asr_new[:, :, 0] = en[:, :, 0]\n",
329
+ " asr_new[:, :, 1:] = en[:, :, 0:-1]\n",
330
+ " en = asr_new\n",
331
+ "\n",
332
+ " F0_pred, N_pred = model.predictor.F0Ntrain(en, s)\n",
333
+ "\n",
334
+ " asr = (t_en @ pred_aln_trg.unsqueeze(0).to(device))\n",
335
+ " if model_params.decoder.type == \"hifigan\":\n",
336
+ " asr_new = torch.zeros_like(asr)\n",
337
+ " asr_new[:, :, 0] = asr[:, :, 0]\n",
338
+ " asr_new[:, :, 1:] = asr[:, :, 0:-1]\n",
339
+ " asr = asr_new\n",
340
+ "\n",
341
+ " out = model.decoder(asr,\n",
342
+ " F0_pred, N_pred, ref.squeeze().unsqueeze(0))\n",
343
+ "\n",
344
+ "\n",
345
+ " return out.squeeze().cpu().numpy()[..., :-100], s_pred # weird pulse at the end of the model, need to be fixed later\n",
346
+ "\n",
347
+ "def STinference(text, ref_s, ref_text, alpha = 0.3, beta = 0.7, diffusion_steps=5, embedding_scale=1):\n",
348
+ " text = text.strip()\n",
349
+ " ps = global_phonemizer.phonemize([text])\n",
350
+ " ps = word_tokenize(ps[0])\n",
351
+ " ps = ' '.join(ps)\n",
352
+ "\n",
353
+ " tokens = textclenaer(ps)\n",
354
+ " tokens.insert(0, 0)\n",
355
+ " tokens = torch.LongTensor(tokens).to(device).unsqueeze(0)\n",
356
+ "\n",
357
+ " ref_text = ref_text.strip()\n",
358
+ " ps = global_phonemizer.phonemize([ref_text])\n",
359
+ " ps = word_tokenize(ps[0])\n",
360
+ " ps = ' '.join(ps)\n",
361
+ "\n",
362
+ " ref_tokens = textclenaer(ps)\n",
363
+ " ref_tokens.insert(0, 0)\n",
364
+ " ref_tokens = torch.LongTensor(ref_tokens).to(device).unsqueeze(0)\n",
365
+ "\n",
366
+ "\n",
367
+ " with torch.no_grad():\n",
368
+ " input_lengths = torch.LongTensor([tokens.shape[-1]]).to(device)\n",
369
+ " text_mask = length_to_mask(input_lengths).to(device)\n",
370
+ "\n",
371
+ " t_en = model.text_encoder(tokens, input_lengths, text_mask)\n",
372
+ " bert_dur = model.bert(tokens, attention_mask=(~text_mask).int())\n",
373
+ " d_en = model.bert_encoder(bert_dur).transpose(-1, -2)\n",
374
+ "\n",
375
+ " ref_input_lengths = torch.LongTensor([ref_tokens.shape[-1]]).to(device)\n",
376
+ " ref_text_mask = length_to_mask(ref_input_lengths).to(device)\n",
377
+ " ref_bert_dur = model.bert(ref_tokens, attention_mask=(~ref_text_mask).int())\n",
378
+ " s_pred = sampler(noise = torch.randn((1, 256)).unsqueeze(1).to(device),\n",
379
+ " embedding=bert_dur,\n",
380
+ " embedding_scale=embedding_scale,\n",
381
+ " features=ref_s, # reference from the same speaker as the embedding\n",
382
+ " num_steps=diffusion_steps).squeeze(1)\n",
383
+ "\n",
384
+ "\n",
385
+ " s = s_pred[:, 128:]\n",
386
+ " ref = s_pred[:, :128]\n",
387
+ "\n",
388
+ " ref = alpha * ref + (1 - alpha) * ref_s[:, :128]\n",
389
+ " s = beta * s + (1 - beta) * ref_s[:, 128:]\n",
390
+ "\n",
391
+ " d = model.predictor.text_encoder(d_en,\n",
392
+ " s, input_lengths, text_mask)\n",
393
+ "\n",
394
+ " x, _ = model.predictor.lstm(d)\n",
395
+ " duration = model.predictor.duration_proj(x)\n",
396
+ "\n",
397
+ " duration = torch.sigmoid(duration).sum(axis=-1)\n",
398
+ " pred_dur = torch.round(duration.squeeze()).clamp(min=1)\n",
399
+ "\n",
400
+ "\n",
401
+ " pred_aln_trg = torch.zeros(input_lengths, int(pred_dur.sum().data))\n",
402
+ " c_frame = 0\n",
403
+ " for i in range(pred_aln_trg.size(0)):\n",
404
+ " pred_aln_trg[i, c_frame:c_frame + int(pred_dur[i].data)] = 1\n",
405
+ " c_frame += int(pred_dur[i].data)\n",
406
+ "\n",
407
+ " # encode prosody\n",
408
+ " en = (d.transpose(-1, -2) @ pred_aln_trg.unsqueeze(0).to(device))\n",
409
+ " if model_params.decoder.type == \"hifigan\":\n",
410
+ " asr_new = torch.zeros_like(en)\n",
411
+ " asr_new[:, :, 0] = en[:, :, 0]\n",
412
+ " asr_new[:, :, 1:] = en[:, :, 0:-1]\n",
413
+ " en = asr_new\n",
414
+ "\n",
415
+ " F0_pred, N_pred = model.predictor.F0Ntrain(en, s)\n",
416
+ "\n",
417
+ " asr = (t_en @ pred_aln_trg.unsqueeze(0).to(device))\n",
418
+ " if model_params.decoder.type == \"hifigan\":\n",
419
+ " asr_new = torch.zeros_like(asr)\n",
420
+ " asr_new[:, :, 0] = asr[:, :, 0]\n",
421
+ " asr_new[:, :, 1:] = asr[:, :, 0:-1]\n",
422
+ " asr = asr_new\n",
423
+ "\n",
424
+ " out = model.decoder(asr,\n",
425
+ " F0_pred, N_pred, ref.squeeze().unsqueeze(0))\n",
426
+ "\n",
427
+ "\n",
428
+ " return out.squeeze().cpu().numpy()[..., :-50] # weird pulse at the end of the model, need to be fixed later\n"
429
+ ]
430
+ },
431
+ {
432
+ "cell_type": "markdown",
433
+ "metadata": {
434
+ "id": "32S6U0LyJbCA"
435
+ },
436
+ "source": [
437
+ "### Synthesize speech"
438
+ ]
439
+ },
440
+ {
441
+ "cell_type": "markdown",
442
+ "metadata": {
443
+ "id": "ehK_0daMJdk_"
444
+ },
445
+ "source": [
446
+ "#### Basic synthesis (5 diffusion steps, seen speakers)"
447
+ ]
448
+ },
449
+ {
450
+ "cell_type": "code",
451
+ "execution_count": null,
452
+ "metadata": {
453
+ "id": "SJs2x41MJhM-"
454
+ },
455
+ "outputs": [],
456
+ "source": [
457
+ "text = ''' StyleTTS 2 is a text to speech model that leverages style diffusion and adversarial training with large speech language models to achieve human level text to speech synthesis. ''' # @param {type:\"string\"}\n"
458
+ ]
459
+ },
460
+ {
461
+ "cell_type": "code",
462
+ "execution_count": null,
463
+ "metadata": {
464
+ "id": "xuqIJe-IJb7A"
465
+ },
466
+ "outputs": [],
467
+ "source": [
468
+ "reference_dicts = {}\n",
469
+ "reference_dicts['696_92939'] = \"Demo/reference_audio/696_92939_000016_000006.wav\"\n",
470
+ "reference_dicts['1789_142896'] = \"Demo/reference_audio/1789_142896_000022_000005.wav\""
471
+ ]
472
+ },
473
+ {
474
+ "cell_type": "code",
475
+ "execution_count": null,
476
+ "metadata": {
477
+ "id": "H3ra3IxJJmF0"
478
+ },
479
+ "outputs": [],
480
+ "source": [
481
+ "noise = torch.randn(1,1,256).to(device)\n",
482
+ "for k, path in reference_dicts.items():\n",
483
+ " ref_s = compute_style(path)\n",
484
+ " start = time.time()\n",
485
+ " wav = inference(text, ref_s, alpha=0.3, beta=0.7, diffusion_steps=5, embedding_scale=1)\n",
486
+ " rtf = (time.time() - start) / (len(wav) / 24000)\n",
487
+ " print(f\"RTF = {rtf:5f}\")\n",
488
+ " import IPython.display as ipd\n",
489
+ " print(k + ' Synthesized:')\n",
490
+ " display(ipd.Audio(wav, rate=24000, normalize=False))\n",
491
+ " print('Reference:')\n",
492
+ " display(ipd.Audio(path, rate=24000, normalize=False))"
493
+ ]
494
+ },
495
+ {
496
+ "cell_type": "markdown",
497
+ "metadata": {
498
+ "id": "aB3wUz6yJ-P_"
499
+ },
500
+ "source": [
501
+ "#### With higher diffusion steps (more diverse)\n",
502
+ "\n",
503
+ "Since the sampler is ancestral, the higher the stpes, the more diverse the samples are, with the cost of slower synthesis speed."
504
+ ]
505
+ },
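A minimal sketch of how this speed/diversity trade-off can be measured, reusing the `inference`, `compute_style`, and `text` names defined above (the step counts and reference file below are arbitrary choices, not recommended values):

```python
# Sketch: time the same sentence at several diffusion step counts.
# Assumes the notebook's `inference`, `compute_style`, and `text` are in scope.
import time

ref_s = compute_style("Demo/reference_audio/696_92939_000016_000006.wav")
for steps in (3, 5, 10, 20):
    start = time.time()
    wav = inference(text, ref_s, alpha=0.3, beta=0.7,
                    diffusion_steps=steps, embedding_scale=1)
    rtf = (time.time() - start) / (len(wav) / 24000)  # real-time factor
    print(f"diffusion_steps={steps:>2d}  RTF={rtf:.3f}")
```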
506
+ {
507
+ "cell_type": "code",
508
+ "execution_count": null,
509
+ "metadata": {
510
+ "id": "lF27XUo4JrKk"
511
+ },
512
+ "outputs": [],
513
+ "source": [
514
+ "noise = torch.randn(1,1,256).to(device)\n",
515
+ "for k, path in reference_dicts.items():\n",
516
+ " ref_s = compute_style(path)\n",
517
+ " start = time.time()\n",
518
+ " wav = inference(text, ref_s, alpha=0.3, beta=0.7, diffusion_steps=10, embedding_scale=1)\n",
519
+ " rtf = (time.time() - start) / (len(wav) / 24000)\n",
520
+ " print(f\"RTF = {rtf:5f}\")\n",
521
+ " import IPython.display as ipd\n",
522
+ " print(k + ' Synthesized:')\n",
523
+ " display(ipd.Audio(wav, rate=24000, normalize=False))\n",
524
+ " print(k + ' Reference:')\n",
525
+ " display(ipd.Audio(path, rate=24000, normalize=False))"
526
+ ]
527
+ },
528
+ {
529
+ "cell_type": "markdown",
530
+ "metadata": {
531
+ "id": "pFT_vmJcKDs1"
532
+ },
533
+ "source": [
534
+ "#### Basic synthesis (5 diffusion steps, unseen speakers)\n",
535
+ "The following samples are to reproduce samples in [Section 4](https://styletts2.github.io/#libri) of the demo page. All spsakers are unseen during training. You can compare the generated samples to popular zero-shot TTS models like Vall-E and NaturalSpeech 2."
536
+ ]
537
+ },
538
+ {
539
+ "cell_type": "code",
540
+ "execution_count": null,
541
+ "metadata": {
542
+ "id": "HvNAeGPEKAWN"
543
+ },
544
+ "outputs": [],
545
+ "source": [
546
+ "reference_dicts = {}\n",
547
+ "# format: (path, text)\n",
548
+ "reference_dicts['1221-135767'] = (\"Demo/reference_audio/1221-135767-0014.wav\", \"Yea, his honourable worship is within, but he hath a godly minister or two with him, and likewise a leech.\")\n",
549
+ "reference_dicts['5639-40744'] = (\"Demo/reference_audio/5639-40744-0020.wav\", \"Thus did this humane and right minded father comfort his unhappy daughter, and her mother embracing her again, did all she could to soothe her feelings.\")\n",
550
+ "reference_dicts['908-157963'] = (\"Demo/reference_audio/908-157963-0027.wav\", \"And lay me down in my cold bed and leave my shining lot.\")\n",
551
+ "reference_dicts['4077-13754'] = (\"Demo/reference_audio/4077-13754-0000.wav\", \"The army found the people in poverty and left them in comparative wealth.\")"
552
+ ]
553
+ },
554
+ {
555
+ "cell_type": "code",
556
+ "execution_count": null,
557
+ "metadata": {
558
+ "id": "mFnyvYp5KAYN"
559
+ },
560
+ "outputs": [],
561
+ "source": [
562
+ "noise = torch.randn(1,1,256).to(device)\n",
563
+ "for k, v in reference_dicts.items():\n",
564
+ " path, text = v\n",
565
+ " ref_s = compute_style(path)\n",
566
+ " start = time.time()\n",
567
+ " wav = inference(text, ref_s, alpha=0.3, beta=0.7, diffusion_steps=5, embedding_scale=1)\n",
568
+ " rtf = (time.time() - start) / (len(wav) / 24000)\n",
569
+ " print(f\"RTF = {rtf:5f}\")\n",
570
+ " import IPython.display as ipd\n",
571
+ " print(k + ' Synthesized: ' + text)\n",
572
+ " display(ipd.Audio(wav, rate=24000, normalize=False))\n",
573
+ " print(k + ' Reference:')\n",
574
+ " display(ipd.Audio(path, rate=24000, normalize=False))"
575
+ ]
576
+ },
577
+ {
578
+ "cell_type": "markdown",
579
+ "metadata": {
580
+ "id": "QBZ53BQtKNQ6"
581
+ },
582
+ "source": [
583
+ "### Speech expressiveness\n",
584
+ "\n",
585
+ "The following section recreates the samples shown in [Section 6](https://styletts2.github.io/#emo) of the demo page. The speaker reference used is `1221-135767-0014.wav`, which is unseen during training.\n",
586
+ "\n",
587
+ "#### With `embedding_scale=1`\n",
588
+ "This is the classifier-free guidance scale. The higher the scale, the more conditional the style is to the input text and hence more emotional."
589
+ ]
590
+ },
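The scale is applied inside the diffusion sampler, so nothing needs to change in the calls below. Conceptually, classifier-free guidance blends an unconditional and a text-conditional prediction roughly as in this schematic sketch (an illustration of the idea, not the actual sampler code):

```python
import torch

def cfg_combine(pred_uncond: torch.Tensor,
                pred_cond: torch.Tensor,
                embedding_scale: float) -> torch.Tensor:
    # Schematic classifier-free guidance: embedding_scale = 1 keeps the
    # conditional prediction; larger values push further toward the text
    # condition (more expressive, potentially less natural).
    return pred_uncond + embedding_scale * (pred_cond - pred_uncond)
```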
591
+ {
592
+ "cell_type": "code",
593
+ "execution_count": null,
594
+ "metadata": {
595
+ "id": "5FwE9CefKQk6"
596
+ },
597
+ "outputs": [],
598
+ "source": [
599
+ "ref_s = compute_style(\"Demo/reference_audio/1221-135767-0014.wav\")"
600
+ ]
601
+ },
602
+ {
603
+ "cell_type": "code",
604
+ "execution_count": null,
605
+ "metadata": {
606
+ "id": "0CKMI0ZsKUDh"
607
+ },
608
+ "outputs": [],
609
+ "source": [
610
+ "texts = {}\n",
611
+ "texts['Happy'] = \"We are happy to invite you to join us on a journey to the past, where we will visit the most amazing monuments ever built by human hands.\"\n",
612
+ "texts['Sad'] = \"I am sorry to say that we have suffered a severe setback in our efforts to restore prosperity and confidence.\"\n",
613
+ "texts['Angry'] = \"The field of astronomy is a joke! Its theories are based on flawed observations and biased interpretations!\"\n",
614
+ "texts['Surprised'] = \"I can't believe it! You mean to tell me that you have discovered a new species of bacteria in this pond?\"\n",
615
+ "\n",
616
+ "for k,v in texts.items():\n",
617
+ " wav = inference(v, ref_s, diffusion_steps=10, alpha=0.3, beta=0.7, embedding_scale=1)\n",
618
+ " print(k + \": \")\n",
619
+ " display(ipd.Audio(wav, rate=24000, normalize=False))"
620
+ ]
621
+ },
622
+ {
623
+ "cell_type": "markdown",
624
+ "metadata": {
625
+ "id": "reemQKVEKWAZ"
626
+ },
627
+ "source": [
628
+ "#### With `embedding_scale=2`"
629
+ ]
630
+ },
631
+ {
632
+ "cell_type": "code",
633
+ "execution_count": null,
634
+ "metadata": {
635
+ "id": "npIAiAUvKYGv"
636
+ },
637
+ "outputs": [],
638
+ "source": [
639
+ "texts = {}\n",
640
+ "texts['Happy'] = \"We are happy to invite you to join us on a journey to the past, where we will visit the most amazing monuments ever built by human hands.\"\n",
641
+ "texts['Sad'] = \"I am sorry to say that we have suffered a severe setback in our efforts to restore prosperity and confidence.\"\n",
642
+ "texts['Angry'] = \"The field of astronomy is a joke! Its theories are based on flawed observations and biased interpretations!\"\n",
643
+ "texts['Surprised'] = \"I can't believe it! You mean to tell me that you have discovered a new species of bacteria in this pond?\"\n",
644
+ "\n",
645
+ "for k,v in texts.items():\n",
646
+ " noise = torch.randn(1,1,256).to(device)\n",
647
+ " wav = inference(v, ref_s, diffusion_steps=10, alpha=0.3, beta=0.7, embedding_scale=2)\n",
648
+ " print(k + \": \")\n",
649
+ " display(ipd.Audio(wav, rate=24000, normalize=False))"
650
+ ]
651
+ },
652
+ {
653
+ "cell_type": "markdown",
654
+ "metadata": {
655
+ "id": "lqKZaXeYKbrH"
656
+ },
657
+ "source": [
658
+ "#### With `embedding_scale=2, alpha = 0.5, beta = 0.9`\n",
659
+ "`alpha` and `beta` is the factor to determine much we use the style sampled based on the text instead of the reference. The higher the value of `alpha` and `beta`, the more suitable the style it is to the text but less similar to the reference. Using higher beta makes the synthesized speech more emotional, at the cost of lower similarity to the reference. `alpha` determines the timbre of the speaker while `beta` determines the prosody."
660
+ ]
661
+ },
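For reference, the mixing that `alpha` and `beta` perform inside `inference` can be written as a small helper. This simply restates the two lines already present in the function above; the 128/128 split into acoustic and prosodic style follows the model's `style_dim`:

```python
import torch

def mix_style(s_pred: torch.Tensor, ref_s: torch.Tensor,
              alpha: float, beta: float):
    # First 128 dims: acoustic (timbre) style; last 128 dims: prosodic style.
    ref = alpha * s_pred[:, :128] + (1 - alpha) * ref_s[:, :128]  # timbre
    s = beta * s_pred[:, 128:] + (1 - beta) * ref_s[:, 128:]      # prosody
    return ref, s
```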
662
+ {
663
+ "cell_type": "code",
664
+ "execution_count": null,
665
+ "metadata": {
666
+ "id": "VjXuRCCWKcdN"
667
+ },
668
+ "outputs": [],
669
+ "source": [
670
+ "texts = {}\n",
671
+ "texts['Happy'] = \"We are happy to invite you to join us on a journey to the past, where we will visit the most amazing monuments ever built by human hands.\"\n",
672
+ "texts['Sad'] = \"I am sorry to say that we have suffered a severe setback in our efforts to restore prosperity and confidence.\"\n",
673
+ "texts['Angry'] = \"The field of astronomy is a joke! Its theories are based on flawed observations and biased interpretations!\"\n",
674
+ "texts['Surprised'] = \"I can't believe it! You mean to tell me that you have discovered a new species of bacteria in this pond?\"\n",
675
+ "\n",
676
+ "for k,v in texts.items():\n",
677
+ " noise = torch.randn(1,1,256).to(device)\n",
678
+ " wav = inference(v, ref_s, diffusion_steps=10, alpha=0.5, beta=0.9, embedding_scale=2)\n",
679
+ " print(k + \": \")\n",
680
+ " display(ipd.Audio(wav, rate=24000, normalize=False))"
681
+ ]
682
+ },
683
+ {
684
+ "cell_type": "markdown",
685
+ "metadata": {
686
+ "id": "xrwYXGh0KiIW"
687
+ },
688
+ "source": [
689
+ "### Zero-shot speaker adaptation\n",
690
+ "This section recreates the \"Acoustic Environment Maintenance\" and \"Speaker’s Emotion Maintenance\" demo in [Section 4](https://styletts2.github.io/#libri) of the demo page. You can compare the generated samples to popular zero-shot TTS models like Vall-E. Note that the model was trained only on LibriTTS, which is about 250 times fewer data compared to those used to trian Vall-E with similar or better effect for these maintainance."
691
+ ]
692
+ },
693
+ {
694
+ "cell_type": "markdown",
695
+ "metadata": {
696
+ "id": "ETUywHHmKimE"
697
+ },
698
+ "source": [
699
+ "#### Acoustic Environment Maintenance\n",
700
+ "\n",
701
+ "Since we want to maintain the acoustic environment in the speaker (timbre), we set `alpha = 0` to make the speaker as close to the reference as possible while only changing the prosody according to the text. "
702
+ ]
703
+ },
704
+ {
705
+ "cell_type": "code",
706
+ "execution_count": null,
707
+ "metadata": {
708
+ "id": "yvjBK3syKnZL"
709
+ },
710
+ "outputs": [],
711
+ "source": [
712
+ "reference_dicts = {}\n",
713
+ "# format: (path, text)\n",
714
+ "reference_dicts['3'] = (\"Demo/reference_audio/3.wav\", \"As friends thing I definitely I've got more male friends.\")\n",
715
+ "reference_dicts['4'] = (\"Demo/reference_audio/4.wav\", \"Everything is run by computer but you got to know how to think before you can do a computer.\")\n",
716
+ "reference_dicts['5'] = (\"Demo/reference_audio/5.wav\", \"Then out in LA you guys got a whole another ball game within California to worry about.\")"
717
+ ]
718
+ },
719
+ {
720
+ "cell_type": "code",
721
+ "execution_count": null,
722
+ "metadata": {
723
+ "id": "jclowWp4KomJ"
724
+ },
725
+ "outputs": [],
726
+ "source": [
727
+ "noise = torch.randn(1,1,256).to(device)\n",
728
+ "for k, v in reference_dicts.items():\n",
729
+ " path, text = v\n",
730
+ " ref_s = compute_style(path)\n",
731
+ " start = time.time()\n",
732
+ " wav = inference(text, ref_s, alpha=0.0, beta=0.5, diffusion_steps=5, embedding_scale=1)\n",
733
+ " rtf = (time.time() - start) / (len(wav) / 24000)\n",
734
+ " print(f\"RTF = {rtf:5f}\")\n",
735
+ " import IPython.display as ipd\n",
736
+ " print('Synthesized: ' + text)\n",
737
+ " display(ipd.Audio(wav, rate=24000, normalize=False))\n",
738
+ " print('Reference:')\n",
739
+ " display(ipd.Audio(path, rate=24000, normalize=False))"
740
+ ]
741
+ },
742
+ {
743
+ "cell_type": "markdown",
744
+ "metadata": {
745
+ "id": "LgIm7M93KqVZ"
746
+ },
747
+ "source": [
748
+ "#### Speaker’s Emotion Maintenance\n",
749
+ "\n",
750
+ "Since we want to maintain the emotion in the speaker (prosody), we set `beta = 0.1` to make the speaker as closer to the reference as possible while having some diversity thruogh the slight timbre change."
751
+ ]
752
+ },
753
+ {
754
+ "cell_type": "code",
755
+ "execution_count": null,
756
+ "metadata": {
757
+ "id": "yzsNoP6oKulL"
758
+ },
759
+ "outputs": [],
760
+ "source": [
761
+ "reference_dicts = {}\n",
762
+ "# format: (path, text)\n",
763
+ "reference_dicts['Anger'] = (\"Demo/reference_audio/anger.wav\", \"We have to reduce the number of plastic bags.\")\n",
764
+ "reference_dicts['Sleepy'] = (\"Demo/reference_audio/sleepy.wav\", \"We have to reduce the number of plastic bags.\")\n",
765
+ "reference_dicts['Amused'] = (\"Demo/reference_audio/amused.wav\", \"We have to reduce the number of plastic bags.\")\n",
766
+ "reference_dicts['Disgusted'] = (\"Demo/reference_audio/disgusted.wav\", \"We have to reduce the number of plastic bags.\")"
767
+ ]
768
+ },
769
+ {
770
+ "cell_type": "code",
771
+ "execution_count": null,
772
+ "metadata": {
773
+ "id": "7h2-9cpfKwr4"
774
+ },
775
+ "outputs": [],
776
+ "source": [
777
+ "noise = torch.randn(1,1,256).to(device)\n",
778
+ "for k, v in reference_dicts.items():\n",
779
+ " path, text = v\n",
780
+ " ref_s = compute_style(path)\n",
781
+ " start = time.time()\n",
782
+ " wav = inference(text, ref_s, alpha=0.3, beta=0.1, diffusion_steps=10, embedding_scale=1)\n",
783
+ " rtf = (time.time() - start) / (len(wav) / 24000)\n",
784
+ " print(f\"RTF = {rtf:5f}\")\n",
785
+ " import IPython.display as ipd\n",
786
+ " print(k + ' Synthesized: ' + text)\n",
787
+ " display(ipd.Audio(wav, rate=24000, normalize=False))\n",
788
+ " print(k + ' Reference:')\n",
789
+ " display(ipd.Audio(path, rate=24000, normalize=False))"
790
+ ]
791
+ },
792
+ {
793
+ "cell_type": "markdown",
794
+ "metadata": {
795
+ "id": "aNS82PGwKzgg"
796
+ },
797
+ "source": [
798
+ "### Longform Narration\n",
799
+ "\n",
800
+ "This section includes basic implementation of Algorithm 1 in the paper for consistent longform audio generation. The example passage is taken from [Section 5](https://styletts2.github.io/#long) of the demo page."
801
+ ]
802
+ },
803
+ {
804
+ "cell_type": "code",
805
+ "execution_count": null,
806
+ "metadata": {
807
+ "cellView": "form",
808
+ "id": "qs97nL5HK5DH"
809
+ },
810
+ "outputs": [],
811
+ "source": [
812
+ "passage = passage = '''If the supply of fruit is greater than the family needs, it may be made a source of income by sending the fresh fruit to the market if there is one near enough, or by preserving, canning, and making jelly for sale. To make such an enterprise a success the fruit and work must be first class. There is magic in the word \"Homemade,\" when the product appeals to the eye and the palate; but many careless and incompetent people have found to their sorrow that this word has not magic enough to float inferior goods on the market. As a rule large canning and preserving establishments are clean and have the best appliances, and they employ chemists and skilled labor. The home product must be very good to compete with the attractive goods that are sent out from such establishments. Yet for first class home made products there is a market in all large cities. All first-class grocers have customers who purchase such goods.''' # @param {type:\"string\"}"
813
+ ]
814
+ },
815
+ {
816
+ "cell_type": "code",
817
+ "execution_count": null,
818
+ "metadata": {
819
+ "colab": {
820
+ "background_save": true
821
+ },
822
+ "id": "8Mu9whHYK_1b"
823
+ },
824
+ "outputs": [],
825
+ "source": [
826
+ "# seen speaker\n",
827
+ "path = \"Demo/reference_audio/696_92939_000016_000006.wav\"\n",
828
+ "s_ref = compute_style(path)\n",
829
+ "sentences = passage.split('.') # simple split by comma\n",
830
+ "wavs = []\n",
831
+ "s_prev = None\n",
832
+ "for text in sentences:\n",
833
+ " if text.strip() == \"\": continue\n",
834
+ " text += '.' # add it back\n",
835
+ "\n",
836
+ " wav, s_prev = LFinference(text,\n",
837
+ " s_prev,\n",
838
+ " s_ref,\n",
839
+ " alpha = 0.3,\n",
840
+ " beta = 0.9, # make it more suitable for the text\n",
841
+ " t = 0.7,\n",
842
+ " diffusion_steps=10, embedding_scale=1.5)\n",
843
+ " wavs.append(wav)\n",
844
+ "print('Synthesized: ')\n",
845
+ "display(ipd.Audio(np.concatenate(wavs), rate=24000, normalize=False))\n",
846
+ "print('Reference: ')\n",
847
+ "display(ipd.Audio(path, rate=24000, normalize=False))"
848
+ ]
849
+ },
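The bare `passage.split('.')` above breaks on abbreviations and decimal numbers. A slightly more robust variant of the same loop is sketched below, assuming NLTK's `punkt` model has been downloaded (the `word_tokenize` call earlier in this notebook already relies on NLTK) and that `LFinference`, `compute_style`, and `passage` are in scope:

```python
# Sketch: sentence splitting with NLTK instead of a bare split('.').
from nltk.tokenize import sent_tokenize  # requires nltk.download('punkt')
import numpy as np
import IPython.display as ipd

s_ref = compute_style("Demo/reference_audio/696_92939_000016_000006.wav")
wavs, s_prev = [], None
for sent in sent_tokenize(passage):
    wav, s_prev = LFinference(sent, s_prev, s_ref,
                              alpha=0.3, beta=0.9, t=0.7,
                              diffusion_steps=10, embedding_scale=1.5)
    wavs.append(wav)
display(ipd.Audio(np.concatenate(wavs), rate=24000, normalize=False))
```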
850
+ {
851
+ "cell_type": "markdown",
852
+ "metadata": {
853
+ "id": "81Rh-lgWLB2i"
854
+ },
855
+ "source": [
856
+ "### Style Transfer\n",
857
+ "\n",
858
+ "The following section demostrates the style transfer capacity for unseen speakers in [Section 6](https://styletts2.github.io/#emo) of the demo page. For this, we set `alpha=0.5, beta = 0.9` for the most pronounced effects (mostly using the sampled style)."
859
+ ]
860
+ },
861
+ {
862
+ "cell_type": "code",
863
+ "execution_count": null,
864
+ "metadata": {
865
+ "id": "CtIgr5kOLE9a"
866
+ },
867
+ "outputs": [],
868
+ "source": [
869
+ "# reference texts to sample styles\n",
870
+ "\n",
871
+ "ref_texts = {}\n",
872
+ "ref_texts['Happy'] = \"We are happy to invite you to join us on a journey to the past, where we will visit the most amazing monuments ever built by human hands.\"\n",
873
+ "ref_texts['Sad'] = \"I am sorry to say that we have suffered a severe setback in our efforts to restore prosperity and confidence.\"\n",
874
+ "ref_texts['Angry'] = \"The field of astronomy is a joke! Its theories are based on flawed observations and biased interpretations!\"\n",
875
+ "ref_texts['Surprised'] = \"I can't believe it! You mean to tell me that you have discovered a new species of bacteria in this pond?\""
876
+ ]
877
+ },
878
+ {
879
+ "cell_type": "code",
880
+ "execution_count": null,
881
+ "metadata": {
882
+ "id": "MlA1CbhzLIoI"
883
+ },
884
+ "outputs": [],
885
+ "source": [
886
+ "path = \"Demo/reference_audio/1221-135767-0014.wav\"\n",
887
+ "s_ref = compute_style(path)\n",
888
+ "\n",
889
+ "text = \"Yea, his honourable worship is within, but he hath a godly minister or two with him, and likewise a leech.\"\n",
890
+ "for k,v in texts.items():\n",
891
+ " wav = STinference(text, s_ref, v, diffusion_steps=10, alpha=0.5, beta=0.9, embedding_scale=1.5)\n",
892
+ " print(k + \": \")\n",
893
+ " display(ipd.Audio(wav, rate=24000, normalize=False))"
894
+ ]
895
+ },
896
+ {
897
+ "cell_type": "markdown",
898
+ "metadata": {
899
+ "id": "2M0iaXlkLJUQ"
900
+ },
901
+ "source": [
902
+ "### Speech diversity\n",
903
+ "\n",
904
+ "This section reproduces samples in [Section 7](https://styletts2.github.io/#var) of the demo page.\n",
905
+ "\n",
906
+ "`alpha` and `beta` determine the diversity of the synthesized speech. There are two extreme cases:\n",
907
+ "- If `alpha = 1` and `beta = 1`, the synthesized speech sounds the most dissimilar to the reference speaker, but it is also the most diverse (each time you synthesize a speech it will be totally different).\n",
908
+ "- If `alpha = 0` and `beta = 0`, the synthesized speech sounds the most siimlar to the reference speaker, but it is deterministic (i.e., the sampled style is not used for speech synthesis).\n"
909
+ ]
910
+ },
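As a rough numerical check of these extremes, the following sketch reuses the notebook's `inference` and `compute_style`; the spread of predicted durations is only a coarse proxy for diversity, and listening remains the real test:

```python
import numpy as np

ref_s = compute_style("Demo/reference_audio/1221-135767-0014.wav")
text = "How much variation is there?"

for alpha, beta in [(0.0, 0.0), (0.3, 0.7), (1.0, 1.0)]:
    durations = [len(inference(text, ref_s, diffusion_steps=10,
                               alpha=alpha, beta=beta, embedding_scale=1)) / 24000
                 for _ in range(3)]
    # Identical durations for (0, 0) reflect the deterministic setting.
    print(f"alpha={alpha}, beta={beta}: durations (s) = {np.round(durations, 2)}")
```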
911
+ {
912
+ "cell_type": "markdown",
913
+ "metadata": {
914
+ "id": "tSxZDvF2LNu4"
915
+ },
916
+ "source": [
917
+ "#### Default setting (`alpha = 0.3, beta=0.7`)\n",
918
+ "This setting uses 70% of the reference timbre and 30% of the reference prosody and use the diffusion model to sample them based on the text."
919
+ ]
920
+ },
921
+ {
922
+ "cell_type": "code",
923
+ "execution_count": null,
924
+ "metadata": {
925
+ "id": "AAomGCDZLIt5"
926
+ },
927
+ "outputs": [],
928
+ "source": [
929
+ "# unseen speaker\n",
930
+ "path = \"Demo/reference_audio/1221-135767-0014.wav\"\n",
931
+ "ref_s = compute_style(path)\n",
932
+ "\n",
933
+ "text = \"How much variation is there?\"\n",
934
+ "for _ in range(5):\n",
935
+ " wav = inference(text, ref_s, diffusion_steps=10, alpha=0.3, beta=0.7, embedding_scale=1)\n",
936
+ " display(ipd.Audio(wav, rate=24000, normalize=False))"
937
+ ]
938
+ },
939
+ {
940
+ "cell_type": "markdown",
941
+ "metadata": {
942
+ "id": "BKrSMdgcLQRP"
943
+ },
944
+ "source": [
945
+ "#### Less diverse setting (`alpha = 0.1, beta=0.3`)\n",
946
+ "This setting uses 90% of the reference timbre and 70% of the reference prosody. This makes it more similar to the reference speaker at cost of less diverse samples."
947
+ ]
948
+ },
949
+ {
950
+ "cell_type": "code",
951
+ "execution_count": null,
952
+ "metadata": {
953
+ "id": "Uo7gVmFoLRfm"
954
+ },
955
+ "outputs": [],
956
+ "source": [
957
+ "# unseen speaker\n",
958
+ "path = \"Demo/reference_audio/1221-135767-0014.wav\"\n",
959
+ "ref_s = compute_style(path)\n",
960
+ "\n",
961
+ "text = \"How much variation is there?\"\n",
962
+ "for _ in range(5):\n",
963
+ " wav = inference(text, ref_s, diffusion_steps=10, alpha=0.1, beta=0.3, embedding_scale=1)\n",
964
+ " display(ipd.Audio(wav, rate=24000, normalize=False))"
965
+ ]
966
+ },
967
+ {
968
+ "cell_type": "markdown",
969
+ "metadata": {
970
+ "id": "nfQ0Xrg9LStd"
971
+ },
972
+ "source": [
973
+ "#### More diverse setting (`alpha = 0.5, beta=0.95`)\n",
974
+ "This setting uses 50% of the reference timbre and 5% of the reference prosody (so it uses 100% of the sampled prosody, which makes it more diverse), but this makes it more dissimilar to the reference speaker. "
975
+ ]
976
+ },
977
+ {
978
+ "cell_type": "code",
979
+ "execution_count": null,
980
+ "metadata": {
981
+ "id": "cPHz4BzVLT_u"
982
+ },
983
+ "outputs": [],
984
+ "source": [
985
+ "# unseen speaker\n",
986
+ "path = \"Demo/reference_audio/1221-135767-0014.wav\"\n",
987
+ "ref_s = compute_style(path)\n",
988
+ "\n",
989
+ "text = \"How much variation is there?\"\n",
990
+ "for _ in range(5):\n",
991
+ " wav = inference(text, ref_s, diffusion_steps=10, alpha=0.5, beta=0.95, embedding_scale=1)\n",
992
+ " display(ipd.Audio(wav, rate=24000, normalize=False))"
993
+ ]
994
+ },
995
+ {
996
+ "cell_type": "markdown",
997
+ "source": [
998
+ "#### Extreme setting (`alpha = 1, beta=1`)\n",
999
+ "This setting uses 0% of the reference timbre and prosody and use the diffusion model to sample the entire style. This makes the speaker very dissimilar to the reference speaker."
1000
+ ],
1001
+ "metadata": {
1002
+ "id": "hPKg9eYpL00f"
1003
+ }
1004
+ },
1005
+ {
1006
+ "cell_type": "code",
1007
+ "source": [
1008
+ "# unseen speaker\n",
1009
+ "path = \"Demo/reference_audio/1221-135767-0014.wav\"\n",
1010
+ "ref_s = compute_style(path)\n",
1011
+ "\n",
1012
+ "text = \"How much variation is there?\"\n",
1013
+ "for _ in range(5):\n",
1014
+ " wav = inference(text, ref_s, diffusion_steps=10, alpha=1, beta=1, embedding_scale=1)\n",
1015
+ " display(ipd.Audio(wav, rate=24000, normalize=False))"
1016
+ ],
1017
+ "metadata": {
1018
+ "id": "Ei-7JOccL0bF"
1019
+ },
1020
+ "execution_count": null,
1021
+ "outputs": []
1022
+ },
1023
+ {
1024
+ "cell_type": "markdown",
1025
+ "source": [
1026
+ "#### No variation (`alpha = 0, beta=0`)\n",
1027
+ "This setting uses 100% of the reference timbre and prosody and do not use the diffusion model at all. This makes the speaker very similar to the reference speaker, but there is no variation."
1028
+ ],
1029
+ "metadata": {
1030
+ "id": "FVMPc3bhL3eL"
1031
+ }
1032
+ },
1033
+ {
1034
+ "cell_type": "code",
1035
+ "source": [
1036
+ "# unseen speaker\n",
1037
+ "path = \"Demo/reference_audio/1221-135767-0014.wav\"\n",
1038
+ "ref_s = compute_style(path)\n",
1039
+ "\n",
1040
+ "text = \"How much variation is there?\"\n",
1041
+ "for _ in range(5):\n",
1042
+ " wav = inference(text, ref_s, diffusion_steps=10, alpha=0, beta=0, embedding_scale=1)\n",
1043
+ " display(ipd.Audio(wav, rate=24000, normalize=False))"
1044
+ ],
1045
+ "metadata": {
1046
+ "id": "yh1QZ7uhL4wM"
1047
+ },
1048
+ "execution_count": null,
1049
+ "outputs": []
1050
+ },
1051
+ {
1052
+ "cell_type": "markdown",
1053
+ "source": [
1054
+ "### Extra fun!\n",
1055
+ "\n",
1056
+ "You can record your own voice and clone it using pre-trained StyleTTS 2 model here."
1057
+ ],
1058
+ "metadata": {
1059
+ "id": "T0EvkWrAMBDB"
1060
+ }
1061
+ },
1062
+ {
1063
+ "cell_type": "markdown",
1064
+ "source": [
1065
+ "#### Run the following cell to record your voice for 5 seconds. Please keep speaking to have the best effect."
1066
+ ],
1067
+ "metadata": {
1068
+ "id": "R985j5QONY8I"
1069
+ }
1070
+ },
1071
+ {
1072
+ "cell_type": "code",
1073
+ "source": [
1074
+ "# all imports\n",
1075
+ "from IPython.display import Javascript\n",
1076
+ "from google.colab import output\n",
1077
+ "from base64 import b64decode\n",
1078
+ "\n",
1079
+ "RECORD = \"\"\"\n",
1080
+ "const sleep = time => new Promise(resolve => setTimeout(resolve, time))\n",
1081
+ "const b2text = blob => new Promise(resolve => {\n",
1082
+ " const reader = new FileReader()\n",
1083
+ " reader.onloadend = e => resolve(e.srcElement.result)\n",
1084
+ " reader.readAsDataURL(blob)\n",
1085
+ "})\n",
1086
+ "var record = time => new Promise(async resolve => {\n",
1087
+ " stream = await navigator.mediaDevices.getUserMedia({ audio: true })\n",
1088
+ " recorder = new MediaRecorder(stream)\n",
1089
+ " chunks = []\n",
1090
+ " recorder.ondataavailable = e => chunks.push(e.data)\n",
1091
+ " recorder.start()\n",
1092
+ " await sleep(time)\n",
1093
+ " recorder.onstop = async ()=>{\n",
1094
+ " blob = new Blob(chunks)\n",
1095
+ " text = await b2text(blob)\n",
1096
+ " resolve(text)\n",
1097
+ " }\n",
1098
+ " recorder.stop()\n",
1099
+ "})\n",
1100
+ "\"\"\"\n",
1101
+ "\n",
1102
+ "def record(sec=3):\n",
1103
+ " display(Javascript(RECORD))\n",
1104
+ " s = output.eval_js('record(%d)' % (sec*1000))\n",
1105
+ " b = b64decode(s.split(',')[1])\n",
1106
+ " with open('audio.wav','wb') as f:\n",
1107
+ " f.write(b)\n",
1108
+ " return 'audio.wav' # or webm ?"
1109
+ ],
1110
+ "metadata": {
1111
+ "id": "MWrFs0KWMBpz"
1112
+ },
1113
+ "execution_count": null,
1114
+ "outputs": []
1115
+ },
1116
+ {
1117
+ "cell_type": "markdown",
1118
+ "source": [
1119
+ "#### Please run this cell and speak:"
1120
+ ],
1121
+ "metadata": {
1122
+ "id": "z35qXwM0Nhx1"
1123
+ }
1124
+ },
1125
+ {
1126
+ "cell_type": "code",
1127
+ "source": [
1128
+ "print('Speak now for 5 seconds.')\n",
1129
+ "audio = record(sec=5)\n",
1130
+ "import IPython.display as ipd\n",
1131
+ "display(ipd.Audio(audio, rate=24000, normalize=False))"
1132
+ ],
1133
+ "metadata": {
1134
+ "id": "KUEoFyQBMR-8"
1135
+ },
1136
+ "execution_count": null,
1137
+ "outputs": []
1138
+ },
1139
+ {
1140
+ "cell_type": "markdown",
1141
+ "source": [
1142
+ "#### Synthesize in your own voice"
1143
+ ],
1144
+ "metadata": {
1145
+ "id": "OQS_7IBpNmM1"
1146
+ }
1147
+ },
1148
+ {
1149
+ "cell_type": "code",
1150
+ "source": [
1151
+ "text = ''' StyleTTS 2 is a text to speech model that leverages style diffusion and adversarial training with large speech language models to achieve human level text to speech synthesis. ''' # @param {type:\"string\"}\n"
1152
+ ],
1153
+ "metadata": {
1154
+ "cellView": "form",
1155
+ "id": "c0I3LY7vM8Ta"
1156
+ },
1157
+ "execution_count": null,
1158
+ "outputs": []
1159
+ },
1160
+ {
1161
+ "cell_type": "code",
1162
+ "source": [
1163
+ "reference_dicts = {}\n",
1164
+ "reference_dicts['You'] = audio"
1165
+ ],
1166
+ "metadata": {
1167
+ "id": "80eW-pwxNCxu"
1168
+ },
1169
+ "execution_count": null,
1170
+ "outputs": []
1171
+ },
1172
+ {
1173
+ "cell_type": "code",
1174
+ "source": [
1175
+ "start = time.time()\n",
1176
+ "noise = torch.randn(1,1,256).to(device)\n",
1177
+ "for k, path in reference_dicts.items():\n",
1178
+ " ref_s = compute_style(path)\n",
1179
+ "\n",
1180
+ " wav = inference(text, ref_s, alpha=0.1, beta=0.5, diffusion_steps=5, embedding_scale=1)\n",
1181
+ " rtf = (time.time() - start) / (len(wav) / 24000)\n",
1182
+ " print('Speaker: ' + k)\n",
1183
+ " import IPython.display as ipd\n",
1184
+ " print('Synthesized:')\n",
1185
+ " display(ipd.Audio(wav, rate=24000, normalize=False))\n",
1186
+ " print('Reference:')\n",
1187
+ " display(ipd.Audio(path, rate=24000, normalize=False))"
1188
+ ],
1189
+ "metadata": {
1190
+ "id": "yIga6MTuNJaN"
1191
+ },
1192
+ "execution_count": null,
1193
+ "outputs": []
1194
+ }
1195
+ ],
1196
+ "metadata": {
1197
+ "accelerator": "GPU",
1198
+ "colab": {
1199
+ "provenance": [],
1200
+ "collapsed_sections": [
1201
+ "aAGQPfgYIR23",
1202
+ "eJdB_nCOIVIN",
1203
+ "R985j5QONY8I"
1204
+ ],
1205
+ "authorship_tag": "ABX9TyPQdFTqqVEknEG/ma/HMfU+",
1206
+ "include_colab_link": true
1207
+ },
1208
+ "kernelspec": {
1209
+ "display_name": "Python 3",
1210
+ "name": "python3"
1211
+ },
1212
+ "language_info": {
1213
+ "name": "python"
1214
+ }
1215
+ },
1216
+ "nbformat": 4,
1217
+ "nbformat_minor": 0
1218
+ }
Colab/StyleTTS2_Finetune_Demo.ipynb ADDED
@@ -0,0 +1,480 @@
1
+ {
2
+ "nbformat": 4,
3
+ "nbformat_minor": 0,
4
+ "metadata": {
5
+ "colab": {
6
+ "provenance": [],
7
+ "gpuType": "T4",
8
+ "authorship_tag": "ABX9TyNiDU9ykIeYxO86Lmuid+ph",
9
+ "include_colab_link": true
10
+ },
11
+ "kernelspec": {
12
+ "name": "python3",
13
+ "display_name": "Python 3"
14
+ },
15
+ "language_info": {
16
+ "name": "python"
17
+ },
18
+ "accelerator": "GPU"
19
+ },
20
+ "cells": [
21
+ {
22
+ "cell_type": "markdown",
23
+ "metadata": {
24
+ "id": "view-in-github",
25
+ "colab_type": "text"
26
+ },
27
+ "source": [
28
+ "<a href=\"https://colab.research.google.com/github/yl4579/StyleTTS2/blob/main/Colab/StyleTTS2_Finetune_Demo.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
29
+ ]
30
+ },
31
+ {
32
+ "cell_type": "markdown",
33
+ "source": [
34
+ "### Install packages and download models"
35
+ ],
36
+ "metadata": {
37
+ "id": "yLqBa4uYPrqE"
38
+ }
39
+ },
40
+ {
41
+ "cell_type": "code",
42
+ "source": [
43
+ "%%shell\n",
44
+ "git clone https://github.com/yl4579/StyleTTS2.git\n",
45
+ "cd StyleTTS2\n",
46
+ "pip install SoundFile torchaudio munch torch pydub pyyaml librosa nltk matplotlib accelerate transformers phonemizer einops einops-exts tqdm typing-extensions git+https://github.com/resemble-ai/monotonic_align.git\n",
47
+ "sudo apt-get install espeak-ng\n",
48
+ "git-lfs clone https://huggingface.co/yl4579/StyleTTS2-LibriTTS\n",
49
+ "mv StyleTTS2-LibriTTS/Models ."
50
+ ],
51
+ "metadata": {
52
+ "id": "H72WF06ZPrTF"
53
+ },
54
+ "execution_count": null,
55
+ "outputs": []
56
+ },
57
+ {
58
+ "cell_type": "markdown",
59
+ "source": [
60
+ "### Download dataset (LJSpeech, 200 samples, ~15 minutes of data)\n",
61
+ "\n",
62
+ "You can definitely do it with fewer samples. This is just a proof of concept with 200 smaples."
63
+ ],
64
+ "metadata": {
65
+ "id": "G398sL8wPzTB"
66
+ }
67
+ },
68
+ {
69
+ "cell_type": "code",
70
+ "source": [
71
+ "%cd StyleTTS2\n",
72
+ "!rm -rf Data"
73
+ ],
74
+ "metadata": {
75
+ "id": "kJuQUBrEPy5C"
76
+ },
77
+ "execution_count": null,
78
+ "outputs": []
79
+ },
80
+ {
81
+ "cell_type": "code",
82
+ "source": [
83
+ "!gdown --id 1vqz26D3yn7OXS2vbfYxfSnpLS6m6tOFP\n",
84
+ "!unzip Data.zip"
85
+ ],
86
+ "metadata": {
87
+ "id": "mDXW8ZZePuSb"
88
+ },
89
+ "execution_count": null,
90
+ "outputs": []
91
+ },
92
+ {
93
+ "cell_type": "markdown",
94
+ "source": [
95
+ "### Change the finetuning config\n",
96
+ "\n",
97
+ "Depending on the GPU you got, you may want to change the bacth size, max audio length, epiochs and so on."
98
+ ],
99
+ "metadata": {
100
+ "id": "_AlBQREWU8ud"
101
+ }
102
+ },
103
+ {
104
+ "cell_type": "code",
105
+ "source": [
106
+ "config_path = \"Configs/config_ft.yml\"\n",
107
+ "\n",
108
+ "import yaml\n",
109
+ "config = yaml.safe_load(open(config_path))"
110
+ ],
111
+ "metadata": {
112
+ "id": "7uEITi0hU4I2"
113
+ },
114
+ "execution_count": null,
115
+ "outputs": []
116
+ },
117
+ {
118
+ "cell_type": "code",
119
+ "source": [
120
+ "config['data_params']['root_path'] = \"Data/wavs\"\n",
121
+ "\n",
122
+ "config['batch_size'] = 2 # not enough RAM\n",
123
+ "config['max_len'] = 100 # not enough RAM\n",
124
+ "config['loss_params']['joint_epoch'] = 110 # we do not do SLM adversarial training due to not enough RAM\n",
125
+ "\n",
126
+ "with open(config_path, 'w') as outfile:\n",
127
+ " yaml.dump(config, outfile, default_flow_style=True)"
128
+ ],
129
+ "metadata": {
130
+ "id": "TPTRgOKSVT4K"
131
+ },
132
+ "execution_count": null,
133
+ "outputs": []
134
+ },
135
+ {
136
+ "cell_type": "markdown",
137
+ "source": [
138
+ "### Start finetuning\n"
139
+ ],
140
+ "metadata": {
141
+ "id": "uUuB_19NWj2Y"
142
+ }
143
+ },
144
+ {
145
+ "cell_type": "code",
146
+ "source": [
147
+ "!python train_finetune.py --config_path ./Configs/config_ft.yml"
148
+ ],
149
+ "metadata": {
150
+ "id": "HZVAD5GKWm-O"
151
+ },
152
+ "execution_count": null,
153
+ "outputs": []
154
+ },
155
+ {
156
+ "cell_type": "markdown",
157
+ "source": [
158
+ "### Test the model quality\n",
159
+ "\n",
160
+ "Note that this mainly serves as a proof of concept due to RAM limitation of free Colab instances. A lot of settings are suboptimal. In the future when DDP works for train_second.py, we will also add mixed precision finetuning to save time and RAM. You can also add SLM adversarial training run if you have paid Colab services (such as A100 with 40G of RAM)."
161
+ ],
162
+ "metadata": {
163
+ "id": "I0_7wsGkXGfc"
164
+ }
165
+ },
166
+ {
167
+ "cell_type": "code",
168
+ "source": [
169
+ "import nltk\n",
170
+ "nltk.download('punkt')"
171
+ ],
172
+ "metadata": {
173
+ "id": "OPLphjbncE7p"
174
+ },
175
+ "execution_count": null,
176
+ "outputs": []
177
+ },
178
+ {
179
+ "cell_type": "code",
180
+ "source": [
181
+ "import torch\n",
182
+ "torch.manual_seed(0)\n",
183
+ "torch.backends.cudnn.benchmark = False\n",
184
+ "torch.backends.cudnn.deterministic = True\n",
185
+ "\n",
186
+ "import random\n",
187
+ "random.seed(0)\n",
188
+ "\n",
189
+ "import numpy as np\n",
190
+ "np.random.seed(0)\n",
191
+ "\n",
192
+ "# load packages\n",
193
+ "import time\n",
194
+ "import random\n",
195
+ "import yaml\n",
196
+ "from munch import Munch\n",
197
+ "import numpy as np\n",
198
+ "import torch\n",
199
+ "from torch import nn\n",
200
+ "import torch.nn.functional as F\n",
201
+ "import torchaudio\n",
202
+ "import librosa\n",
203
+ "from nltk.tokenize import word_tokenize\n",
204
+ "\n",
205
+ "from models import *\n",
206
+ "from utils import *\n",
207
+ "from text_utils import TextCleaner\n",
208
+ "textclenaer = TextCleaner()\n",
209
+ "\n",
210
+ "%matplotlib inline\n",
211
+ "\n",
212
+ "to_mel = torchaudio.transforms.MelSpectrogram(\n",
213
+ " n_mels=80, n_fft=2048, win_length=1200, hop_length=300)\n",
214
+ "mean, std = -4, 4\n",
215
+ "\n",
216
+ "def length_to_mask(lengths):\n",
217
+ " mask = torch.arange(lengths.max()).unsqueeze(0).expand(lengths.shape[0], -1).type_as(lengths)\n",
218
+ " mask = torch.gt(mask+1, lengths.unsqueeze(1))\n",
219
+ " return mask\n",
220
+ "\n",
221
+ "def preprocess(wave):\n",
222
+ " wave_tensor = torch.from_numpy(wave).float()\n",
223
+ " mel_tensor = to_mel(wave_tensor)\n",
224
+ " mel_tensor = (torch.log(1e-5 + mel_tensor.unsqueeze(0)) - mean) / std\n",
225
+ " return mel_tensor\n",
226
+ "\n",
227
+ "def compute_style(path):\n",
228
+ " wave, sr = librosa.load(path, sr=24000)\n",
229
+ " audio, index = librosa.effects.trim(wave, top_db=30)\n",
230
+ " if sr != 24000:\n",
231
+ " audio = librosa.resample(audio, sr, 24000)\n",
232
+ " mel_tensor = preprocess(audio).to(device)\n",
233
+ "\n",
234
+ " with torch.no_grad():\n",
235
+ " ref_s = model.style_encoder(mel_tensor.unsqueeze(1))\n",
236
+ " ref_p = model.predictor_encoder(mel_tensor.unsqueeze(1))\n",
237
+ "\n",
238
+ " return torch.cat([ref_s, ref_p], dim=1)\n",
239
+ "\n",
240
+ "device = 'cuda' if torch.cuda.is_available() else 'cpu'\n",
241
+ "\n",
242
+ "# load phonemizer\n",
243
+ "import phonemizer\n",
244
+ "global_phonemizer = phonemizer.backend.EspeakBackend(language='en-us', preserve_punctuation=True, with_stress=True)\n",
245
+ "\n",
246
+ "config = yaml.safe_load(open(\"Models/LJSpeech/config_ft.yml\"))\n",
247
+ "\n",
248
+ "# load pretrained ASR model\n",
249
+ "ASR_config = config.get('ASR_config', False)\n",
250
+ "ASR_path = config.get('ASR_path', False)\n",
251
+ "text_aligner = load_ASR_models(ASR_path, ASR_config)\n",
252
+ "\n",
253
+ "# load pretrained F0 model\n",
254
+ "F0_path = config.get('F0_path', False)\n",
255
+ "pitch_extractor = load_F0_models(F0_path)\n",
256
+ "\n",
257
+ "# load BERT model\n",
258
+ "from Utils.PLBERT.util import load_plbert\n",
259
+ "BERT_path = config.get('PLBERT_dir', False)\n",
260
+ "plbert = load_plbert(BERT_path)\n",
261
+ "\n",
262
+ "model_params = recursive_munch(config['model_params'])\n",
263
+ "model = build_model(model_params, text_aligner, pitch_extractor, plbert)\n",
264
+ "_ = [model[key].eval() for key in model]\n",
265
+ "_ = [model[key].to(device) for key in model]"
266
+ ],
267
+ "metadata": {
268
+ "id": "jIIAoDACXJL0"
269
+ },
270
+ "execution_count": null,
271
+ "outputs": []
272
+ },
273
+ {
274
+ "cell_type": "code",
275
+ "source": [
276
+ "files = [f for f in os.listdir(\"Models/LJSpeech/\") if f.endswith('.pth')]\n",
277
+ "sorted_files = sorted(files, key=lambda x: int(x.split('_')[-1].split('.')[0]))"
278
+ ],
279
+ "metadata": {
280
+ "id": "eKXRAyyzcMpQ"
281
+ },
282
+ "execution_count": null,
283
+ "outputs": []
284
+ },
285
+ {
286
+ "cell_type": "code",
287
+ "source": [
288
+ "params_whole = torch.load(\"Models/LJSpeech/\" + sorted_files[-1], map_location='cpu')\n",
289
+ "params = params_whole['net']"
290
+ ],
291
+ "metadata": {
292
+ "id": "ULuU9-VDb9Pk"
293
+ },
294
+ "execution_count": null,
295
+ "outputs": []
296
+ },
297
+ {
298
+ "cell_type": "code",
299
+ "source": [
300
+ "for key in model:\n",
301
+ " if key in params:\n",
302
+ " print('%s loaded' % key)\n",
303
+ " try:\n",
304
+ " model[key].load_state_dict(params[key])\n",
305
+ " except:\n",
306
+ " from collections import OrderedDict\n",
307
+ " state_dict = params[key]\n",
308
+ " new_state_dict = OrderedDict()\n",
309
+ " for k, v in state_dict.items():\n",
310
+ " name = k[7:] # remove `module.`\n",
311
+ " new_state_dict[name] = v\n",
312
+ " # load params\n",
313
+ " model[key].load_state_dict(new_state_dict, strict=False)\n",
314
+ "# except:\n",
315
+ "# _load(params[key], model[key])\n",
316
+ "_ = [model[key].eval() for key in model]"
317
+ ],
318
+ "metadata": {
319
+ "id": "J-U29yIYc2ea"
320
+ },
321
+ "execution_count": null,
322
+ "outputs": []
323
+ },
324
+ {
325
+ "cell_type": "code",
326
+ "source": [
327
+ "from Modules.diffusion.sampler import DiffusionSampler, ADPM2Sampler, KarrasSchedule"
328
+ ],
329
+ "metadata": {
330
+ "id": "jrPQ_Yrwc3n6"
331
+ },
332
+ "execution_count": null,
333
+ "outputs": []
334
+ },
335
+ {
336
+ "cell_type": "code",
337
+ "source": [
338
+ "sampler = DiffusionSampler(\n",
339
+ " model.diffusion.diffusion,\n",
340
+ " sampler=ADPM2Sampler(),\n",
341
+ " sigma_schedule=KarrasSchedule(sigma_min=0.0001, sigma_max=3.0, rho=9.0), # empirical parameters\n",
342
+ " clamp=False\n",
343
+ ")"
344
+ ],
345
+ "metadata": {
346
+ "id": "n2CWYNoqc455"
347
+ },
348
+ "execution_count": null,
349
+ "outputs": []
350
+ },
351
+ {
352
+ "cell_type": "code",
353
+ "source": [
354
+ "def inference(text, ref_s, alpha = 0.3, beta = 0.7, diffusion_steps=5, embedding_scale=1):\n",
355
+ " text = text.strip()\n",
356
+ " ps = global_phonemizer.phonemize([text])\n",
357
+ " ps = word_tokenize(ps[0])\n",
358
+ " ps = ' '.join(ps)\n",
359
+ " tokens = textclenaer(ps)\n",
360
+ " tokens.insert(0, 0)\n",
361
+ " tokens = torch.LongTensor(tokens).to(device).unsqueeze(0)\n",
362
+ "\n",
363
+ " with torch.no_grad():\n",
364
+ " input_lengths = torch.LongTensor([tokens.shape[-1]]).to(device)\n",
365
+ " text_mask = length_to_mask(input_lengths).to(device)\n",
366
+ "\n",
367
+ " t_en = model.text_encoder(tokens, input_lengths, text_mask)\n",
368
+ " bert_dur = model.bert(tokens, attention_mask=(~text_mask).int())\n",
369
+ " d_en = model.bert_encoder(bert_dur).transpose(-1, -2)\n",
370
+ "\n",
371
+ " s_pred = sampler(noise = torch.randn((1, 256)).unsqueeze(1).to(device),\n",
372
+ " embedding=bert_dur,\n",
373
+ " embedding_scale=embedding_scale,\n",
374
+ " features=ref_s, # reference from the same speaker as the embedding\n",
375
+ " num_steps=diffusion_steps).squeeze(1)\n",
376
+ "\n",
377
+ "\n",
378
+ " s = s_pred[:, 128:]\n",
379
+ " ref = s_pred[:, :128]\n",
380
+ "\n",
381
+ " ref = alpha * ref + (1 - alpha) * ref_s[:, :128]\n",
382
+ " s = beta * s + (1 - beta) * ref_s[:, 128:]\n",
383
+ "\n",
384
+ " d = model.predictor.text_encoder(d_en,\n",
385
+ " s, input_lengths, text_mask)\n",
386
+ "\n",
387
+ " x, _ = model.predictor.lstm(d)\n",
388
+ " duration = model.predictor.duration_proj(x)\n",
389
+ "\n",
390
+ " duration = torch.sigmoid(duration).sum(axis=-1)\n",
391
+ " pred_dur = torch.round(duration.squeeze()).clamp(min=1)\n",
392
+ "\n",
393
+ " pred_aln_trg = torch.zeros(input_lengths, int(pred_dur.sum().data))\n",
394
+ " c_frame = 0\n",
395
+ " for i in range(pred_aln_trg.size(0)):\n",
396
+ " pred_aln_trg[i, c_frame:c_frame + int(pred_dur[i].data)] = 1\n",
397
+ " c_frame += int(pred_dur[i].data)\n",
398
+ "\n",
399
+ " # encode prosody\n",
400
+ " en = (d.transpose(-1, -2) @ pred_aln_trg.unsqueeze(0).to(device))\n",
401
+ " if model_params.decoder.type == \"hifigan\":\n",
402
+ " asr_new = torch.zeros_like(en)\n",
403
+ " asr_new[:, :, 0] = en[:, :, 0]\n",
404
+ " asr_new[:, :, 1:] = en[:, :, 0:-1]\n",
405
+ " en = asr_new\n",
406
+ "\n",
407
+ " F0_pred, N_pred = model.predictor.F0Ntrain(en, s)\n",
408
+ "\n",
409
+ " asr = (t_en @ pred_aln_trg.unsqueeze(0).to(device))\n",
410
+ " if model_params.decoder.type == \"hifigan\":\n",
411
+ " asr_new = torch.zeros_like(asr)\n",
412
+ " asr_new[:, :, 0] = asr[:, :, 0]\n",
413
+ " asr_new[:, :, 1:] = asr[:, :, 0:-1]\n",
414
+ " asr = asr_new\n",
415
+ "\n",
416
+ " out = model.decoder(asr,\n",
417
+ " F0_pred, N_pred, ref.squeeze().unsqueeze(0))\n",
418
+ "\n",
419
+ "\n",
420
+ " return out.squeeze().cpu().numpy()[..., :-50] # weird pulse at the end of the model, need to be fixed later"
421
+ ],
422
+ "metadata": {
423
+ "id": "2x5kVb3nc_eY"
424
+ },
425
+ "execution_count": null,
426
+ "outputs": []
427
+ },
428
+ {
429
+ "cell_type": "markdown",
430
+ "source": [
431
+ "### Synthesize speech"
432
+ ],
433
+ "metadata": {
434
+ "id": "O159JnwCc6CC"
435
+ }
436
+ },
437
+ {
438
+ "cell_type": "code",
439
+ "source": [
440
+ "text = '''Maltby and Company would issue warrants on them deliverable to the importer, and the goods were then passed to be stored in neighboring warehouses.\n",
441
+ "'''"
442
+ ],
443
+ "metadata": {
444
+ "id": "ThciXQ6rc9Eq"
445
+ },
446
+ "execution_count": null,
447
+ "outputs": []
448
+ },
449
+ {
450
+ "cell_type": "code",
451
+ "source": [
452
+ "# get a random reference in the training set, note that it doesn't matter which one you use\n",
453
+ "path = \"Data/wavs/LJ001-0110.wav\"\n",
454
+ "# this style vector ref_s can be saved as a parameter together with the model weights\n",
455
+ "ref_s = compute_style(path)"
456
+ ],
457
+ "metadata": {
458
+ "id": "jldPkJyCc83a"
459
+ },
460
+ "execution_count": null,
461
+ "outputs": []
462
+ },
463
+ {
464
+ "cell_type": "code",
465
+ "source": [
466
+ "start = time.time()\n",
467
+ "wav = inference(text, ref_s, alpha=0.9, beta=0.9, diffusion_steps=10, embedding_scale=1)\n",
468
+ "rtf = (time.time() - start) / (len(wav) / 24000)\n",
469
+ "print(f\"RTF = {rtf:5f}\")\n",
470
+ "import IPython.display as ipd\n",
471
+ "display(ipd.Audio(wav, rate=24000, normalize=False))"
472
+ ],
473
+ "metadata": {
474
+ "id": "_mIU0jqDdQ-c"
475
+ },
476
+ "execution_count": null,
477
+ "outputs": []
478
+ }
479
+ ]
480
+ }
Configs/config.yml ADDED
@@ -0,0 +1,116 @@
1
+ log_dir: "Models/LJSpeech"
2
+ first_stage_path: "first_stage.pth"
3
+ save_freq: 2
4
+ log_interval: 10
5
+ device: "cuda"
6
+ epochs_1st: 200 # number of epochs for first stage training (pre-training)
7
+ epochs_2nd: 100 # number of epochs for second stage training (joint training)
8
+ batch_size: 16
9
+ max_len: 400 # maximum number of frames
10
+ pretrained_model: ""
11
+ second_stage_load_pretrained: true # set to true if the pre-trained model is for 2nd stage
12
+ load_only_params: false # set to true if you do not want to load epoch numbers and optimizer parameters
13
+
14
+ F0_path: "Utils/JDC/bst.t7"
15
+ ASR_config: "Utils/ASR/config.yml"
16
+ ASR_path: "Utils/ASR/epoch_00080.pth"
17
+ PLBERT_dir: 'Utils/PLBERT/'
18
+
19
+ data_params:
20
+ train_data: "Data/train_list.txt"
21
+ val_data: "Data/val_list.txt"
22
+ root_path: "/local/LJSpeech-1.1/wavs"
23
+ OOD_data: "Data/OOD_texts.txt"
24
+ min_length: 50 # keep sampling OOD texts until one of at least this length is obtained
25
+
26
+ preprocess_params:
27
+ sr: 24000
28
+ spect_params:
29
+ n_fft: 2048
30
+ win_length: 1200
31
+ hop_length: 300
32
+
33
+ model_params:
34
+ multispeaker: false
35
+
36
+ dim_in: 64
37
+ hidden_dim: 512
38
+ max_conv_dim: 512
39
+ n_layer: 3
40
+ n_mels: 80
41
+
42
+ n_token: 178 # number of phoneme tokens
43
+ max_dur: 50 # maximum duration of a single phoneme
44
+ style_dim: 128 # style vector size
45
+
46
+ dropout: 0.2
47
+
48
+ # config for decoder
49
+ decoder:
50
+ type: 'istftnet' # either hifigan or istftnet
51
+ resblock_kernel_sizes: [3,7,11]
52
+ upsample_rates : [10, 6]
53
+ upsample_initial_channel: 512
54
+ resblock_dilation_sizes: [[1,3,5], [1,3,5], [1,3,5]]
55
+ upsample_kernel_sizes: [20, 12]
56
+ gen_istft_n_fft: 20
57
+ gen_istft_hop_size: 5
58
+
59
+ # speech language model config
60
+ slm:
61
+ model: 'microsoft/wavlm-base-plus'
62
+ sr: 16000 # sampling rate of SLM
63
+ hidden: 768 # hidden size of SLM
64
+ nlayers: 13 # number of layers of SLM
65
+ initial_channel: 64 # initial channels of SLM discriminator head
66
+
67
+ # style diffusion model config
68
+ diffusion:
69
+ embedding_mask_proba: 0.1
70
+ # transformer config
71
+ transformer:
72
+ num_layers: 3
73
+ num_heads: 8
74
+ head_features: 64
75
+ multiplier: 2
76
+
77
+ # diffusion distribution config
78
+ dist:
79
+ sigma_data: 0.2 # placeholder for estimate_sigma_data set to false
80
+ estimate_sigma_data: true # estimate sigma_data from the current batch if set to true
81
+ mean: -3.0
82
+ std: 1.0
83
+
84
+ loss_params:
85
+ lambda_mel: 5. # mel reconstruction loss
86
+ lambda_gen: 1. # generator loss
87
+ lambda_slm: 1. # slm feature matching loss
88
+
89
+ lambda_mono: 1. # monotonic alignment loss (1st stage, TMA)
90
+ lambda_s2s: 1. # sequence-to-sequence loss (1st stage, TMA)
91
+ TMA_epoch: 50 # TMA starting epoch (1st stage)
92
+
93
+ lambda_F0: 1. # F0 reconstruction loss (2nd stage)
94
+ lambda_norm: 1. # norm reconstruction loss (2nd stage)
95
+ lambda_dur: 1. # duration loss (2nd stage)
96
+ lambda_ce: 20. # duration predictor probability output CE loss (2nd stage)
97
+ lambda_sty: 1. # style reconstruction loss (2nd stage)
98
+ lambda_diff: 1. # score matching loss (2nd stage)
99
+
100
+ diff_epoch: 20 # style diffusion starting epoch (2nd stage)
101
+ joint_epoch: 50 # joint training starting epoch (2nd stage)
102
+
103
+ optimizer_params:
104
+ lr: 0.0001 # general learning rate
105
+ bert_lr: 0.00001 # learning rate for PLBERT
106
+ ft_lr: 0.00001 # learning rate for acoustic modules
107
+
108
+ slmadv_params:
109
+ min_len: 400 # minimum length of samples
110
+ max_len: 500 # maximum length of samples
111
+ batch_percentage: 0.5 # to prevent out of memory, only use half of the original batch size
112
+ iter: 10 # update the discriminator once every this many generator updates
113
+ thresh: 5 # gradient norm above which the gradient is scaled
114
+ scale: 0.01 # gradient scaling factor for predictors from SLM discriminators
115
+ sig: 1.5 # sigma for differentiable duration modeling
116
+
Configs/config_ft.yml ADDED
@@ -0,0 +1,111 @@
1
+ log_dir: "Models/LJSpeech"
2
+ save_freq: 5
3
+ log_interval: 10
4
+ device: "cuda"
5
+ epochs: 50 # number of finetuning epochs (1 hour of data)
6
+ batch_size: 8
7
+ max_len: 400 # maximum number of frames
8
+ pretrained_model: "Models/LibriTTS/epochs_2nd_00020.pth"
9
+ second_stage_load_pretrained: true # set to true if the pre-trained model is for 2nd stage
10
+ load_only_params: true # set to true if you do not want to load epoch numbers and optimizer parameters
11
+
12
+ F0_path: "Utils/JDC/bst.t7"
13
+ ASR_config: "Utils/ASR/config.yml"
14
+ ASR_path: "Utils/ASR/epoch_00080.pth"
15
+ PLBERT_dir: 'Utils/PLBERT/'
16
+
17
+ data_params:
18
+ train_data: "Data/train_list.txt"
19
+ val_data: "Data/val_list.txt"
20
+ root_path: "/local/LJSpeech-1.1/wavs"
21
+ OOD_data: "Data/OOD_texts.txt"
22
+ min_length: 50 # keep sampling until an OOD text of at least this length is obtained
23
+
24
+ preprocess_params:
25
+ sr: 24000
26
+ spect_params:
27
+ n_fft: 2048
28
+ win_length: 1200
29
+ hop_length: 300
30
+
31
+ model_params:
32
+ multispeaker: true
33
+
34
+ dim_in: 64
35
+ hidden_dim: 512
36
+ max_conv_dim: 512
37
+ n_layer: 3
38
+ n_mels: 80
39
+
40
+ n_token: 178 # number of phoneme tokens
41
+ max_dur: 50 # maximum duration of a single phoneme
42
+ style_dim: 128 # style vector size
43
+
44
+ dropout: 0.2
45
+
46
+ # config for decoder
47
+ decoder:
48
+ type: 'hifigan' # either hifigan or istftnet
49
+ resblock_kernel_sizes: [3,7,11]
50
+ upsample_rates : [10,5,3,2]
51
+ upsample_initial_channel: 512
52
+ resblock_dilation_sizes: [[1,3,5], [1,3,5], [1,3,5]]
53
+ upsample_kernel_sizes: [20,10,6,4]
54
+
55
+ # speech language model config
56
+ slm:
57
+ model: 'microsoft/wavlm-base-plus'
58
+ sr: 16000 # sampling rate of SLM
59
+ hidden: 768 # hidden size of SLM
60
+ nlayers: 13 # number of layers of SLM
61
+ initial_channel: 64 # initial channels of SLM discriminator head
62
+
63
+ # style diffusion model config
64
+ diffusion:
65
+ embedding_mask_proba: 0.1
66
+ # transformer config
67
+ transformer:
68
+ num_layers: 3
69
+ num_heads: 8
70
+ head_features: 64
71
+ multiplier: 2
72
+
73
+ # diffusion distribution config
74
+ dist:
75
+ sigma_data: 0.2 # placeholder for estimate_sigma_data set to false
76
+ estimate_sigma_data: true # estimate sigma_data from the current batch if set to true
77
+ mean: -3.0
78
+ std: 1.0
79
+
80
+ loss_params:
81
+ lambda_mel: 5. # mel reconstruction loss
82
+ lambda_gen: 1. # generator loss
83
+ lambda_slm: 1. # slm feature matching loss
84
+
85
+ lambda_mono: 1. # monotonic alignment loss (TMA)
86
+ lambda_s2s: 1. # sequence-to-sequence loss (TMA)
87
+
88
+ lambda_F0: 1. # F0 reconstruction loss
89
+ lambda_norm: 1. # norm reconstruction loss
90
+ lambda_dur: 1. # duration loss
91
+ lambda_ce: 20. # duration predictor probability output CE loss
92
+ lambda_sty: 1. # style reconstruction loss
93
+ lambda_diff: 1. # score matching loss
94
+
95
+ diff_epoch: 10 # style diffusion starting epoch
96
+ joint_epoch: 30 # joint training starting epoch
97
+
98
+ optimizer_params:
99
+ lr: 0.0001 # general learning rate
100
+ bert_lr: 0.00001 # learning rate for PLBERT
101
+ ft_lr: 0.0001 # learning rate for acoustic modules
102
+
103
+ slmadv_params:
104
+ min_len: 400 # minimum length of samples
105
+ max_len: 500 # maximum length of samples
106
+ batch_percentage: 0.5 # to prevent out of memory, only use half of the original batch size
107
+ iter: 10 # update the discriminator once every this many generator updates
108
+ thresh: 5 # gradient norm above which the gradient is scaled
109
+ scale: 0.01 # gradient scaling factor for predictors from SLM discriminators
110
+ sig: 1.5 # sigma for differentiable duration modeling
111
+
Configs/config_libritts.yml ADDED
@@ -0,0 +1,113 @@
1
+ log_dir: "Models/LibriTTS"
2
+ first_stage_path: "first_stage.pth"
3
+ save_freq: 1
4
+ log_interval: 10
5
+ device: "cuda"
6
+ epochs_1st: 50 # number of epochs for first stage training (pre-training)
7
+ epochs_2nd: 30 # number of epochs for second stage training (joint training)
8
+ batch_size: 16
9
+ max_len: 300 # maximum number of frames
10
+ pretrained_model: ""
11
+ second_stage_load_pretrained: true # set to true if the pre-trained model is for 2nd stage
12
+ load_only_params: false # set to true if you do not want to load epoch numbers and optimizer parameters
13
+
14
+ F0_path: "Utils/JDC/bst.t7"
15
+ ASR_config: "Utils/ASR/config.yml"
16
+ ASR_path: "Utils/ASR/epoch_00080.pth"
17
+ PLBERT_dir: 'Utils/PLBERT/'
18
+
19
+ data_params:
20
+ train_data: "Data/train_list.txt"
21
+ val_data: "Data/val_list.txt"
22
+ root_path: ""
23
+ OOD_data: "Data/OOD_texts.txt"
24
+ min_length: 50 # keep sampling until an OOD text of at least this length is obtained
25
+
26
+ preprocess_params:
27
+ sr: 24000
28
+ spect_params:
29
+ n_fft: 2048
30
+ win_length: 1200
31
+ hop_length: 300
32
+
33
+ model_params:
34
+ multispeaker: true
35
+
36
+ dim_in: 64
37
+ hidden_dim: 512
38
+ max_conv_dim: 512
39
+ n_layer: 3
40
+ n_mels: 80
41
+
42
+ n_token: 178 # number of phoneme tokens
43
+ max_dur: 50 # maximum duration of a single phoneme
44
+ style_dim: 128 # style vector size
45
+
46
+ dropout: 0.2
47
+
48
+ # config for decoder
49
+ decoder:
50
+ type: 'hifigan' # either hifigan or istftnet
51
+ resblock_kernel_sizes: [3,7,11]
52
+ upsample_rates : [10,5,3,2]
53
+ upsample_initial_channel: 512
54
+ resblock_dilation_sizes: [[1,3,5], [1,3,5], [1,3,5]]
55
+ upsample_kernel_sizes: [20,10,6,4]
56
+
57
+ # speech language model config
58
+ slm:
59
+ model: 'microsoft/wavlm-base-plus'
60
+ sr: 16000 # sampling rate of SLM
61
+ hidden: 768 # hidden size of SLM
62
+ nlayers: 13 # number of layers of SLM
63
+ initial_channel: 64 # initial channels of SLM discriminator head
64
+
65
+ # style diffusion model config
66
+ diffusion:
67
+ embedding_mask_proba: 0.1
68
+ # transformer config
69
+ transformer:
70
+ num_layers: 3
71
+ num_heads: 8
72
+ head_features: 64
73
+ multiplier: 2
74
+
75
+ # diffusion distribution config
76
+ dist:
77
+ sigma_data: 0.2 # placeholder for estimate_sigma_data set to false
78
+ estimate_sigma_data: true # estimate sigma_data from the current batch if set to true
79
+ mean: -3.0
80
+ std: 1.0
81
+
82
+ loss_params:
83
+ lambda_mel: 5. # mel reconstruction loss
84
+ lambda_gen: 1. # generator loss
85
+ lambda_slm: 1. # slm feature matching loss
86
+
87
+ lambda_mono: 1. # monotonic alignment loss (1st stage, TMA)
88
+ lambda_s2s: 1. # sequence-to-sequence loss (1st stage, TMA)
89
+ TMA_epoch: 5 # TMA starting epoch (1st stage)
90
+
91
+ lambda_F0: 1. # F0 reconstruction loss (2nd stage)
92
+ lambda_norm: 1. # norm reconstruction loss (2nd stage)
93
+ lambda_dur: 1. # duration loss (2nd stage)
94
+ lambda_ce: 20. # duration predictor probability output CE loss (2nd stage)
95
+ lambda_sty: 1. # style reconstruction loss (2nd stage)
96
+ lambda_diff: 1. # score matching loss (2nd stage)
97
+
98
+ diff_epoch: 10 # style diffusion starting epoch (2nd stage)
99
+ joint_epoch: 15 # joint training starting epoch (2nd stage)
100
+
101
+ optimizer_params:
102
+ lr: 0.0001 # general learning rate
103
+ bert_lr: 0.00001 # learning rate for PLBERT
104
+ ft_lr: 0.00001 # learning rate for acoustic modules
105
+
106
+ slmadv_params:
107
+ min_len: 400 # minimum length of samples
108
+ max_len: 500 # maximum length of samples
109
+ batch_percentage: 0.5 # to prevent out of memory, only use half of the original batch size
110
+ iter: 20 # update the discriminator once every this many generator updates
111
+ thresh: 5 # gradient norm above which the gradient is scaled
112
+ scale: 0.01 # gradient scaling factor for predictors from SLM discriminators
113
+ sig: 1.5 # sigma for differentiable duration modeling
Data/OOD_texts.txt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e0989ef6a9873b711befefcbe60660ced7a65532359277f766f4db504c558a72
3
+ size 31758898
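
Data/OOD_texts.txt (like several other large files in this commit) is stored via Git LFS, so the three lines above are only the pointer; the full text file is fetched separately (for example with git lfs pull). A small sketch for checking whether the actual content is present on disk:

def is_lfs_pointer(path):
    # LFS pointer files begin with the spec line shown in the diff above.
    with open(path, "rb") as f:
        return f.read(64).startswith(b"version https://git-lfs.github.com/spec/")

print(is_lfs_pointer("Data/OOD_texts.txt"))  # True means only the pointer was checked out
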
Data/train_list.txt ADDED
The diff for this file is too large to render. See raw diff
 
Data/val_list.txt ADDED
@@ -0,0 +1,100 @@
1
+ LJ022-0023.wav|ðɪ ˌoʊvɚwˈɛlmɪŋ mədʒˈɔːɹᵻɾi ʌv pˈiːpəl ɪn ðɪs kˈʌntɹi nˈoʊ hˌaʊ tə sˈɪft ðə wˈiːt fɹʌmðə tʃˈæf ɪn wʌt ðeɪ hˈɪɹ ænd wʌt ðeɪ ɹˈiːd .|0
2
+ LJ043-0030.wav|ɪf sˈʌmbɑːdi dˈɪd ðˈæt tə mˌiː , ɐ lˈaʊsi tɹˈɪk lˈaɪk ðˈæt , tə tˈeɪk maɪ wˈaɪf ɐwˈeɪ , ænd ˈɔːl ðə fˈɜːnɪtʃɚ , aɪ wʊd biː mˈæd æz hˈɛl , tˈuː .|0
3
+ LJ005-0201.wav|ˌæzˌɪz ʃˈoʊn baɪ ðə ɹᵻpˈoːɹt ʌvðə kəmˈɪʃənɚz tʊ ɪŋkwˈaɪɚɹ ˌɪntʊ ðə stˈeɪt ʌvðə mjuːnˈɪsɪpəl kˌɔːɹpɚɹˈeɪʃənz ɪn ˈeɪtiːn θˈɜːɾi fˈaɪv .|0
4
+ LJ001-0110.wav|ˈiːvən ðə kˈæslɑːn tˈaɪp wɛn ɛnlˈɑːɹdʒd ʃˈoʊz ɡɹˈeɪt ʃˈɔːɹtkʌmɪŋz ɪn ðɪs ɹᵻspˈɛkt :|0
5
+ LJ003-0345.wav|ˈɔːl ðə kəmˈɪɾi kʊd dˈuː ɪn ðɪs ɹᵻspˈɛkt wʌz tə θɹˈoʊ ðə ɹᵻspˌɑːnsəbˈɪlɪɾi ˌɔn ˈʌðɚz .|0
6
+ LJ007-0154.wav|ðiːz pˈʌndʒənt ænd wˈɛl ɡɹˈaʊndᵻd stɹˈɪktʃɚz ɐplˈaɪd wɪð stˈɪl ɡɹˈeɪɾɚ fˈoːɹs tə ðɪ ʌŋkənvˈɪktᵻd pɹˈɪzənɚ , ðə mˈæn hˌuː kˈeɪm tə ðə pɹˈɪzən ˈɪnəsənt , ænd stˈɪl ʌŋkəntˈæmᵻnˌeɪɾᵻd ,|0
7
+ LJ018-0098.wav|ænd ɹˈɛkəɡnˌaɪzd æz wˈʌn ʌvðə fɹˈiːkwɛntɚz ʌvðə bˈoʊɡəs lˈɔː stˈeɪʃənɚz . hɪz ɚɹˈɛst lˈɛd tə ðæt ʌv ˈʌðɚz .|0
8
+ LJ047-0044.wav|ˈɑːswəld wʌz , haʊˈɛvɚ , wˈɪlɪŋ tə dɪskˈʌs hɪz kˈɑːntækts wɪð sˈoʊviət ɐθˈɔːɹɪɾiz . hiː dᵻnˈaɪd hˌævɪŋ ˌɛni ɪnvˈɑːlvmənt wɪð sˈoʊviət ɪntˈɛlɪdʒəns ˈeɪdʒənsiz|0
9
+ LJ031-0038.wav|ðə fˈɜːst fɪzˈɪʃən tə sˈiː ðə pɹˈɛzɪdənt æt pˈɑːɹklənd hˈɑːspɪɾəl wʌz dˈɑːktɚ . tʃˈɑːɹlz dʒˈeɪ . kˈæɹɪkˌoʊ , ɐ ɹˈɛzᵻdənt ɪn dʒˈɛnɚɹəl sˈɜːdʒɚɹi .|0
10
+ LJ048-0194.wav|dˈʊɹɹɪŋ ðə mˈɔːɹnɪŋ ʌv noʊvˈɛmbɚ twˈɛnti tˈuː pɹˈaɪɚ tə ðə mˈoʊɾɚkˌeɪd .|0
11
+ LJ049-0026.wav|ˌɔn əkˈeɪʒən ðə sˈiːkɹᵻt sˈɜːvɪs hɐzbɪn pɚmˈɪɾᵻd tə hæv ɐn ˈeɪdʒənt ɹˈaɪdɪŋ ɪnðə pˈæsɪndʒɚ kəmpˈɑːɹtmənt wɪððə pɹˈɛzɪdənt .|0
12
+ LJ004-0152.wav|ɔːlðˈoʊ æt mˈɪstɚ . bˈʌkstənz vˈɪzɪt ɐ nˈuː dʒˈeɪl wʌz ɪn pɹˈɑːsɛs ʌv ɪɹˈɛkʃən , ðə fˈɜːst stˈɛp təwˈɔːɹdz ɹᵻfˈɔːɹm sˈɪns hˈaʊɚdz vˌɪzɪtˈeɪʃən ɪn sˈɛvəntˌiːn sˈɛvənti fˈoːɹ .|0
13
+ LJ008-0278.wav|ɔːɹ ðˈɛɹz mˌaɪt biː wˈʌn ʌv mˈɛni , ænd ɪt mˌaɪt biː kənsˈɪdɚd nˈɛsᵻsɚɹi tə dˈɑːlɚ mˌeɪk ɐn ɛɡzˈæmpəl.dˈɑːlɚ|0
14
+ LJ043-0002.wav|ðə wˈɔːɹəŋ kəmˈɪʃən ɹᵻpˈoːɹt . baɪ ðə pɹˈɛzɪdənts kəmˈɪʃən ɔnðɪ ɐsˌæsᵻnˈeɪʃən ʌv pɹˈɛzɪdənt kˈɛnədi . tʃˈæptɚ sˈɛvən . lˈiː hˈɑːɹvi ˈɑːswəld :|0
15
+ LJ009-0114.wav|mˈɪstɚ . wˈeɪkfiːld wˈaɪndz ˈʌp hɪz ɡɹˈæfɪk bˌʌt sˈʌmwʌt sɛnsˈeɪʃənəl ɐkˈaʊnt baɪ dᵻskɹˈaɪbɪŋ ɐnˈʌðɚ ɹᵻlˈɪdʒəs sˈɜːvɪs , wˌɪtʃ mˈeɪ ɐpɹˈoʊpɹɪˌeɪtli biː ɪnsˈɜːɾᵻd hˈɪɹ .|0
16
+ LJ028-0506.wav|ɐ mˈɑːdɚn ˈɑːɹɾɪst wʊdhɐv dˈɪfɪkˌʌlti ɪn dˌuːɪŋ sˈʌtʃ ˈækjʊɹət wˈɜːk .|0
17
+ LJ050-0168.wav|wɪððə pɚtˈɪkjʊlɚ pˈɜːpəsᵻz ʌvðɪ ˈeɪdʒənsi ɪnvˈɑːlvd . ðə kəmˈɪʃən ɹˈɛkəɡnˌaɪzᵻz ðæt ðɪs ɪz ɐ kˌɑːntɹəvˈɜːʃəl ˈɛɹiə|0
18
+ LJ039-0223.wav|ˈɑːswəldz mɚɹˈiːn tɹˈeɪnɪŋ ɪn mˈɑːɹksmənʃˌɪp , hɪz ˈʌðɚ ɹˈaɪfəl ɛkspˈiəɹɪəns ænd hɪz ɪstˈæblɪʃt fəmˌɪliˈæɹɪɾi wɪð ðɪs pɚtˈɪkjʊlɚ wˈɛpən|0
19
+ LJ029-0032.wav|ɐkˈoːɹdɪŋ tʊ oʊdˈɑːnəl , kwˈoʊt , wiː hæd ɐ mˈoʊɾɚkˌeɪd wɛɹˈɛvɚ kplˈʌsplʌs wˌɪtʃ hɐdbɪn bˌɪn hˈeɪstili sˈʌmənd fɚðə ðə pˈɜːpəs wiː wˈɛnt , ˈɛnd kwˈoʊt .|0
20
+ LJ031-0070.wav|dˈɑːktɚ . klˈɑːɹk , hˌuː mˈoʊst klˈoʊsli əbzˈɜːvd ðə hˈɛd wˈuːnd ,|0
21
+ LJ034-0198.wav|jˈuːɪnz , hˌuː wʌz ɔnðə saʊθwˈɛst kˈɔːɹnɚɹ ʌv ˈɛlm ænd hjˈuːstən stɹˈiːts tˈɛstᵻfˌaɪd ðæt hiː kʊd nˌɑːt dᵻskɹˈaɪb ðə mˈæn hiː sˈɔː ɪnðə wˈɪndoʊ .|0
22
+ LJ026-0068.wav|ˈɛnɚdʒi ˈɛntɚz ðə plˈænt , tʊ ɐ smˈɔːl ɛkstˈɛnt ,|0
23
+ LJ039-0075.wav|wˈʌns juː nˈoʊ ðæt juː mˈʌst pˌʊt ðə kɹˈɔshɛɹz ɔnðə tˈɑːɹɡɪt ænd ðæt ɪz ˈɔːl ðæt ɪz nˈɛsᵻsɚɹi .|0
24
+ LJ004-0096.wav|ðə fˈeɪɾəl kˈɑːnsɪkwənsᵻz wˈɛɹɑːf mˌaɪt biː pɹɪvˈɛntᵻd ɪf ðə dʒˈʌstɪsᵻz ʌvðə pˈiːs wɜː djˈuːli ˈɔːθɚɹˌaɪzd|0
25
+ LJ005-0014.wav|spˈiːkɪŋ ˌɔn ɐ dᵻbˈeɪt ˌɔn pɹˈɪzən mˈæɾɚz , hiː dᵻklˈɛɹd ðˈæt|0
26
+ LJ012-0161.wav|hiː wʌz ɹᵻpˈoːɹɾᵻd tə hæv fˈɔːlən ɐwˈeɪ tʊ ɐ ʃˈædoʊ .|0
27
+ LJ018-0239.wav|hɪz dˌɪsɐpˈɪɹəns ɡˈeɪv kˈʌlɚ ænd sˈʌbstəns tʊ ˈiːvəl ɹᵻpˈoːɹts ɔːlɹˌɛdi ɪn sˌɜːkjʊlˈeɪʃən ðætðə wɪl ænd kənvˈeɪəns əbˌʌv ɹᵻfˈɜːd tuː|0
28
+ LJ019-0257.wav|hˈɪɹ ðə tɹˈɛd wˈiːl wʌz ɪn jˈuːs , ðɛɹ sˈɛljʊlɚ kɹˈæŋks , ɔːɹ hˈɑːɹd lˈeɪbɚ məʃˈiːnz .|0
29
+ LJ028-0008.wav|juː tˈæp dʒˈɛntli wɪð jʊɹ hˈiːl əpˌɑːn ðə ʃˈoʊldɚɹ ʌvðə dɹˈoʊmdɚɹi tʊ ˈɜːdʒ hɜːɹ ˈɔn .|0
30
+ LJ024-0083.wav|ðɪs plˈæn ʌv mˈaɪn ɪz nˈoʊ ɐtˈæk ɔnðə kˈoːɹt ;|0
31
+ LJ042-0129.wav|nˈoʊ nˈaɪt klˈʌbz ɔːɹ bˈoʊlɪŋ ˈælɪz , nˈoʊ plˈeɪsᵻz ʌv ɹˌɛkɹiːˈeɪʃən ɛksˈɛpt ðə tɹˈeɪd jˈuːniən dˈænsᵻz . aɪ hæv hæd ɪnˈʌf .|0
32
+ LJ036-0103.wav|ðə pəlˈiːs ˈæskt hˌɪm wˈɛðɚ hiː kʊd pˈɪk ˈaʊt hɪz pˈæsɪndʒɚ fɹʌmðə lˈaɪnʌp .|0
33
+ LJ046-0058.wav|dˈʊɹɹɪŋ hɪz pɹˈɛzɪdənsi , fɹˈæŋklɪn dˈiː . ɹˈoʊzəvˌɛlt mˌeɪd ˈɔːlmoʊst fˈoːɹ hˈʌndɹɪd dʒˈɜːniz ænd tɹˈævəld mˈoːɹ ðɐn θɹˈiː hˈʌndɹɪd fˈɪfti θˈaʊzənd mˈaɪlz .|0
34
+ LJ014-0076.wav|hiː wʌz sˈiːn ˈæftɚwɚdz smˈoʊkɪŋ ænd tˈɔːkɪŋ wɪð hɪz hˈoʊsts ɪn ðɛɹ bˈæk pˈɑːɹlɚ , ænd nˈɛvɚ sˈiːn ɐɡˈɛn ɐlˈaɪv .|0
35
+ LJ002-0043.wav|lˈɔŋ nˈæɹoʊ ɹˈuːmz wˈʌn θˈɜːɾi sˈɪks fˈiːt , sˈɪks twˈɛnti θɹˈiː fˈiːt , ænd ðɪ ˈeɪtθ ˈeɪtiːn ,|0
36
+ LJ009-0076.wav|wiː kˈʌm tə ðə sˈɜːmən .|0
37
+ LJ017-0131.wav|ˈiːvən wɛn ðə hˈaɪ ʃˈɛɹɪf hæd tˈoʊld hˌɪm ðɛɹwˌʌz nˈoʊ pˌɑːsəbˈɪlɪɾi əvɚ ɹᵻpɹˈiːv , ænd wɪðˌɪn ɐ fjˈuː ˈaʊɚz ʌv ˌɛksɪkjˈuːʃən .|0
38
+ LJ046-0184.wav|bˌʌt ðɛɹ ɪz ɐ sˈɪstəm fɚðɪ ɪmˈiːdɪət nˌoʊɾɪfɪkˈeɪʃən ʌvðə sˈiːkɹᵻt sˈɜːvɪs baɪ ðə kənfˈaɪnɪŋ ˌɪnstɪtˈuːʃən wɛn ɐ sˈʌbdʒɛkt ɪz ɹᵻlˈiːst ɔːɹ ɛskˈeɪps .|0
39
+ LJ014-0263.wav|wˌɛn ˈʌðɚ plˈɛʒɚz pˈɔːld hiː tˈʊk ɐ θˈiəɾɚ , ænd pˈoʊzd æz ɐ mjuːnˈɪfɪsənt pˈeɪtɹən ʌvðə dɹəmˈæɾɪk ˈɑːɹt .|0
40
+ LJ042-0096.wav|ˈoʊld ɛkstʃˈeɪndʒ ɹˈeɪt ɪn ɐdˈɪʃən tə hɪz fˈæktɚɹi sˈælɚɹi ʌv ɐpɹˈɑːksɪmətli ˈiːkwəl ɐmˈaʊnt|0
41
+ LJ049-0050.wav|hˈɪl hæd bˈoʊθ fˈiːt ɔnðə kˈɑːɹ ænd wʌz klˈaɪmɪŋ ɐbˈoːɹd tʊ ɐsˈɪst pɹˈɛzɪdənt ænd mˈɪsɪz . kˈɛnədi .|0
42
+ LJ019-0186.wav|sˈiːɪŋ ðæt sˈɪns ðɪ ɪstˈæblɪʃmənt ʌvðə sˈɛntɹəl kɹˈɪmɪnəl kˈoːɹt , nˈuːɡeɪt ɹᵻsˈiːvd pɹˈɪzənɚz fɔːɹ tɹˈaɪəl fɹʌm sˈɛvɹəl kˈaʊntiz ,|0
43
+ LJ028-0307.wav|ðˈɛn lˈɛt twˈɛnti dˈeɪz pˈæs , ænd æt ðɪ ˈɛnd ʌv ðæt tˈaɪm stˈeɪʃən nˌɪɹ ðə tʃˈældæsəŋ ɡˈeɪts ɐ bˈɑːdi ʌv fˈoːɹ θˈaʊzənd .|0
44
+ LJ012-0235.wav|wˌaɪl ðeɪ wɜːɹ ɪn ɐ stˈeɪt ʌv ɪnsˌɛnsəbˈɪlɪɾi ðə mˈɜːdɚ wʌz kəmˈɪɾᵻd .|0
45
+ LJ034-0053.wav|ɹˈiːtʃt ðə sˈeɪm kəŋklˈuːʒən æz lætˈoʊnə ðætðə pɹˈɪnts fˈaʊnd ɔnðə kˈɑːɹtənz wɜː ðoʊz ʌv lˈiː hˈɑːɹvi ˈɑːswəld .|0
46
+ LJ014-0030.wav|ðiːz wɜː dˈæmnətˌoːɹi fˈækts wˌɪtʃ wˈɛl səpˈoːɹɾᵻd ðə pɹˌɑːsɪkjˈuːʃən .|0
47
+ LJ015-0203.wav|bˌʌt wɜː ðə pɹɪkˈɔːʃənz tˈuː mˈɪnɪt , ðə vˈɪdʒɪləns tˈuː klˈoʊs təbi ᵻlˈuːdᵻd ɔːɹ ˌoʊvɚkˈʌm ?|0
48
+ LJ028-0093.wav|bˌʌt hɪz skɹˈaɪb ɹˈoʊt ɪɾ ɪnðə mˈænɚ kˈʌstəmˌɛɹi fɚðə skɹˈaɪbz ʌv ðoʊz dˈeɪz tə ɹˈaɪt ʌv ðɛɹ ɹˈɔɪəl mˈæstɚz .|0
49
+ LJ002-0018.wav|ðɪ ɪnˈædɪkwəsi ʌvðə dʒˈeɪl wʌz nˈoʊɾɪst ænd ɹᵻpˈoːɹɾᵻd əpˌɑːn ɐɡˈɛn ænd ɐɡˈɛn baɪ ðə ɡɹˈænd dʒˈʊɹɹiz ʌvðə sˈɪɾi ʌv lˈʌndən ,|0
50
+ LJ028-0275.wav|æt lˈæst , ɪnðə twˈɛntiəθ mˈʌnθ ,|0
51
+ LJ012-0042.wav|wˌɪtʃ hiː kˈɛpt kənsˈiːld ɪn ɐ hˈaɪdɪŋ plˈeɪs wɪð ɐ tɹˈæp dˈoːɹ dʒˈʌst ˌʌndɚ hɪz bˈɛd .|0
52
+ LJ011-0096.wav|hiː mˈæɹid ɐ lˈeɪdi ˈɔːlsoʊ bᵻlˈɔŋɪŋ tə ðə səsˈaɪəɾi ʌv fɹˈɛndz , hˌuː bɹˈɔːt hˌɪm ɐ lˈɑːɹdʒ fˈɔːɹtʃʊn , wˈɪtʃ , ænd hɪz ˈoʊn mˈʌni , hiː pˌʊt ˌɪntʊ ɐ sˈɪɾi fˈɜːm ,|0
53
+ LJ036-0077.wav|ɹˈɑːdʒɚ dˈiː . kɹˈeɪɡ , ɐ dˈɛpjuːɾi ʃˈɛɹɪf ʌv dˈæləs kˈaʊnti ,|0
54
+ LJ016-0318.wav|ˈʌðɚɹ əfˈɪʃəlz , ɡɹˈeɪt lˈɔɪɚz , ɡˈʌvɚnɚz ʌv pɹˈɪzənz , ænd tʃˈæplɪnz səpˈoːɹɾᵻd ðɪs vjˈuː .|0
55
+ LJ013-0164.wav|hˌuː kˈeɪm fɹʌm hɪz ɹˈuːm ɹˈɛdi dɹˈɛst , ɐ səspˈɪʃəs sˈɜːkəmstˌæns , æz hiː wʌz ˈɔːlweɪz lˈeɪt ɪnðə mˈɔːɹnɪŋ .|0
56
+ LJ027-0141.wav|ɪz klˈoʊsli ɹᵻpɹədˈuːst ɪnðə lˈaɪf hˈɪstɚɹi ʌv ɛɡzˈɪstɪŋ dˈɪɹ . ɔːɹ , ɪn ˈʌðɚ wˈɜːdz ,|0
57
+ LJ028-0335.wav|ɐkˈoːɹdɪŋli ðeɪ kəmˈɪɾᵻd tə hˌɪm ðə kəmˈænd ʌv ðɛɹ hˈoʊl ˈɑːɹmi , ænd pˌʊt ðə kˈiːz ʌv ðɛɹ sˈɪɾi ˌɪntʊ hɪz hˈændz .|0
58
+ LJ031-0202.wav|mˈɪsɪz . kˈɛnədi tʃˈoʊz ðə hˈɑːspɪɾəl ɪn bəθˈɛzdə fɚðɪ ˈɔːtɑːpsi bɪkˈʌz ðə pɹˈɛzɪdənt hæd sˈɜːvd ɪnðə nˈeɪvi .|0
59
+ LJ021-0145.wav|fɹʌm ðoʊz wˈɪlɪŋ tə dʒˈɔɪn ɪn ɪstˈæblɪʃɪŋ ðɪs hˈoʊpt fɔːɹ pˈiəɹɪəd ʌv pˈiːs ,|0
60
+ LJ016-0288.wav|dˈɑːlɚ mˈuːlɚ , mˈuːlɚ , hiːz ðə mˈæn , dˈɑːlɚ tˈɪl ɐ daɪvˈɜːʒən wʌz kɹiːˈeɪɾᵻd baɪ ðɪ ɐpˈɪɹəns ʌvðə ɡˈæloʊz , wˌɪtʃ wʌz ɹᵻsˈiːvd wɪð kəntˈɪnjuːəs jˈɛlz .|0
61
+ LJ028-0081.wav|jˈɪɹz lˈeɪɾɚ , wˌɛn ðɪ ˌɑːɹkiːˈɑːlədʒˌɪsts kʊd ɹˈɛdili dɪstˈɪŋɡwɪʃ ðə fˈɔls fɹʌmðə tɹˈuː ,|0
62
+ LJ018-0081.wav|hɪz dᵻfˈɛns bˌiːɪŋ ðæt hiː hæd ɪntˈɛndᵻd tə kəmˈɪt sˈuːɪsˌaɪd , bˌʌt ðˈæt , ɔnðɪ ɐpˈɪɹəns ʌv ðɪs ˈɑːfɪsɚ hˌuː hæd ɹˈɔŋd hˌɪm ,|0
63
+ LJ021-0066.wav|təɡˌɛðɚ wɪð ɐ ɡɹˈeɪt ˈɪŋkɹiːs ɪnðə pˈeɪɹoʊlz , ðɛɹ hɐz kˈʌm ɐ səbstˈænʃəl ɹˈaɪz ɪnðə tˈoʊɾəl ʌv ɪndˈʌstɹɪəl pɹˈɑːfɪts|0
64
+ LJ009-0238.wav|ˈæftɚ ðɪs ðə ʃˈɛɹɪfs sˈɛnt fɔːɹ ɐnˈʌðɚ ɹˈoʊp , bˌʌt ðə spɛktˈeɪɾɚz ˌɪntəfˈɪɹd , ænd ðə mˈæn wʌz kˈæɹid bˈæk tə dʒˈeɪl .|0
65
+ LJ005-0079.wav|ænd ɪmpɹˈuːv ðə mˈɔːɹəlz ʌvðə pɹˈɪzənɚz , ænd ʃˌæl ɪnʃˈʊɹ ðə pɹˈɑːpɚ mˈɛʒɚɹ ʌv pˈʌnɪʃmənt tə kənvˈɪktᵻd əfˈɛndɚz .|0
66
+ LJ035-0019.wav|dɹˈoʊv tə ðə nɔːɹθwˈɛst kˈɔːɹnɚɹ ʌv ˈɛlm ænd hjˈuːstən , ænd pˈɑːɹkt ɐpɹˈɑːksɪmətli tˈɛn fˈiːt fɹʌmðə tɹˈæfɪk sˈɪɡnəl .|0
67
+ LJ036-0174.wav|ðɪs ɪz ðɪ ɐpɹˈɑːksɪmət tˈaɪm hiː ˈɛntɚd ðə ɹˈuːmɪŋhˌaʊs , ɐkˈoːɹdɪŋ tʊ ˈɜːliːn ɹˈɑːbɚts , ðə hˈaʊskiːpɚ ðˈɛɹ .|0
68
+ LJ046-0146.wav|ðə kɹaɪtˈiəɹɪə ɪn ɪfˈɛkt pɹˈaɪɚ tə noʊvˈɛmbɚ twˈɛnti tˈuː , nˈaɪntiːn sˈɪksti θɹˈiː , fɔːɹ dɪtˈɜːmɪnɪŋ wˈɛðɚ tʊ ɐksˈɛpt mətˈɪɹiəl fɚðə pˌiːˌɑːɹɹˈɛs dʒˈɛnɚɹəl fˈaɪlz|0
69
+ LJ017-0044.wav|ænd ðə dˈiːpɪst æŋzˈaɪəɾi wʌz fˈɛlt ðætðə kɹˈaɪm , ɪf kɹˈaɪm ðˈɛɹ hɐdbɪn , ʃˌʊd biː bɹˈɔːt hˈoʊm tʊ ɪts pˈɜːpɪtɹˌeɪɾɚ .|0
70
+ LJ017-0070.wav|bˌʌt hɪz spˈoːɹɾɪŋ ˌɑːpɚɹˈeɪʃənz dɪdnˌɑːt pɹˈɑːspɚ , ænd hiː bɪkˌeɪm ɐ nˈiːdi mˈæn , ˈɔːlweɪz dɹˈɪvən tə dˈɛspɚɹət stɹˈeɪts fɔːɹ kˈæʃ .|0
71
+ LJ014-0020.wav|hiː wʌz sˈuːn ˈæftɚwɚdz ɚɹˈɛstᵻd ˌɔn səspˈɪʃən , ænd ɐ sˈɜːtʃ ʌv hɪz lˈɑːdʒɪŋz bɹˈɔːt tə lˈaɪt sˈɛvɹəl ɡˈɑːɹmənts sˈætʃɚɹˌeɪɾᵻd wɪð blˈʌd ;|0
72
+ LJ016-0020.wav|hiː nˈɛvɚ ɹˈiːtʃt ðə sˈɪstɚn , bˌʌt fˈɛl bˈæk ˌɪntʊ ðə jˈɑːɹd , ˈɪndʒɚɹɪŋ hɪz lˈɛɡz sᵻvˈɪɹli .|0
73
+ LJ045-0230.wav|wˌɛn hiː wʌz fˈaɪnəli ˌæpɹihˈɛndᵻd ɪnðə tˈɛksəs θˈiəɾɚ . ɔːlðˈoʊ ɪɾ ɪz nˌɑːt fˈʊli kɚɹˈɑːbɚɹˌeɪɾᵻd baɪ ˈʌðɚz hˌuː wɜː pɹˈɛzənt ,|0
74
+ LJ035-0129.wav|ænd ʃiː mˈʌstɐv ɹˈʌn dˌaʊn ðə stˈɛɹz ɐhˈɛd ʌv ˈɑːswəld ænd wʊd pɹˈɑːbəbli hæv sˈiːn ɔːɹ hˈɜːd hˌɪm .|0
75
+ LJ008-0307.wav|ˈæftɚwɚdz ɛkspɹˈɛs ɐ wˈɪʃ tə mˈɜːdɚ ðə ɹᵻkˈoːɹdɚ fɔːɹ hˌævɪŋ kˈɛpt ðˌɛm sˌoʊ lˈɔŋ ɪn səspˈɛns .|0
76
+ LJ008-0294.wav|nˌɪɹli ɪndˈɛfɪnətli dᵻfˈɜːd .|0
77
+ LJ047-0148.wav|ˌɔn ɑːktˈoʊbɚ twˈɛnti fˈaɪv ,|0
78
+ LJ008-0111.wav|ðeɪ ˈɛntɚd ɐ dˈɑːlɚ stˈoʊŋ kˈoʊld ɹˈuːm , dˈɑːlɚɹ ænd wɜː pɹˈɛzəntli dʒˈɔɪnd baɪ ðə pɹˈɪzənɚ .|0
79
+ LJ034-0042.wav|ðæt hiː kʊd ˈoʊnli tˈɛstᵻfˌaɪ wɪð sˈɜːtənti ðætðə pɹˈɪnt wʌz lˈɛs ðɐn θɹˈiː dˈeɪz ˈoʊld .|0
80
+ LJ037-0234.wav|mˈɪsɪz . mˈɛɹi bɹˈɑːk , ðə wˈaɪf əvə mɪkˈænɪk hˌuː wˈɜːkt æt ðə stˈeɪʃən , wʌz ðɛɹ æt ðə tˈaɪm ænd ʃiː sˈɔː ɐ wˈaɪt mˈeɪl ,|0
81
+ LJ040-0002.wav|tʃˈæptɚ sˈɛvən . lˈiː hˈɑːɹvi ˈɑːswəld : bˈækɡɹaʊnd ænd pˈɑːsᵻbəl mˈoʊɾɪvz , pˈɑːɹt wˌʌn .|0
82
+ LJ045-0140.wav|ðɪ ˈɑːɹɡjuːmənts hiː jˈuːzd tə dʒˈʌstᵻfˌaɪ hɪz jˈuːs ʌvðɪ ˈeɪliəs sədʒˈɛst ðæt ˈɑːswəld mˌeɪhɐv kˈʌm tə θˈɪŋk ðætðə hˈoʊl wˈɜːld wʌz bᵻkˈʌmɪŋ ɪnvˈɑːlvd|0
83
+ LJ012-0035.wav|ðə nˈʌmbɚ ænd nˈeɪmz ˌɔn wˈɑːtʃᵻz , wɜː kˈɛɹfəli ɹᵻmˈuːvd ɔːɹ əblˈɪɾɚɹˌeɪɾᵻd ˈæftɚ ðə ɡˈʊdz pˈæst ˌaʊɾəv hɪz hˈændz .|0
84
+ LJ012-0250.wav|ɔnðə sˈɛvənθ dʒuːlˈaɪ , ˈeɪtiːn θˈɜːɾi sˈɛvən ,|0
85
+ LJ016-0179.wav|kəntɹˈæktᵻd wɪð ʃˈɛɹɪfs ænd kənvˈiːnɚz tə wˈɜːk baɪ ðə dʒˈɑːb .|0
86
+ LJ016-0138.wav|æɾə dˈɪstəns fɹʌmðə pɹˈɪzən .|0
87
+ LJ027-0052.wav|ðiːz pɹˈɪnsɪpəlz ʌv həmˈɑːlədʒi ɑːɹ ᵻsˈɛnʃəl tʊ ɐ kɚɹˈɛkt ɪntˌɜːpɹɪtˈeɪʃən ʌvðə fˈækts ʌv mɔːɹfˈɑːlədʒi .|0
88
+ LJ031-0134.wav|ˌɔn wˈʌn əkˈeɪʒən mˈɪsɪz . dʒˈɑːnsən , ɐkˈʌmpənid baɪ tˈuː sˈiːkɹᵻt sˈɜːvɪs ˈeɪdʒənts , lˈɛft ðə ɹˈuːm tə sˈiː mˈɪsɪz . kˈɛnədi ænd mˈɪsɪz . kˈɑːnæli .|0
89
+ LJ019-0273.wav|wˌɪtʃ sˌɜː dʒˈɑːʃjuːə dʒˈɛb tˈoʊld ðə kəmˈɪɾi hiː kənsˈɪdɚd ðə pɹˈɑːpɚɹ ˈɛlɪmənts ʌv pˈiːnəl dˈɪsɪplˌɪn .|0
90
+ LJ014-0110.wav|æt ðə fˈɜːst ðə bˈɑːksᵻz wɜːɹ ɪmpˈaʊndᵻd , ˈoʊpənd , ænd fˈaʊnd tə kəntˈeɪn mˈɛnɪəv oʊkˈɑːnɚz ɪfˈɛkts .|0
91
+ LJ034-0160.wav|ˌɔn bɹˈɛnənz sˈʌbsᵻkwənt sˈɜːʔn̩ aɪdˈɛntɪfɪkˈeɪʃən ʌv lˈiː hˈɑːɹvi ˈɑːswəld æz ðə mˈæn hiː sˈɔː fˈaɪɚ ðə ɹˈaɪfəl .|0
92
+ LJ038-0199.wav|ᵻlˈɛvən . ɪf aɪɐm ɐlˈaɪv ænd tˈeɪkən pɹˈɪzənɚ ,|0
93
+ LJ014-0010.wav|jˈɛt hiː kʊd nˌɑːt ˌoʊvɚkˈʌm ðə stɹˈeɪndʒ fˌæsᵻnˈeɪʃən ɪt hˈæd fɔːɹ hˌɪm , ænd ɹᵻmˈeɪnd baɪ ðə sˈaɪd ʌvðə kˈɔːɹps tˈɪl ðə stɹˈɛtʃɚ kˈeɪm .|0
94
+ LJ033-0047.wav|aɪ nˈoʊɾɪst wɛn aɪ wɛnt ˈaʊt ðætðə lˈaɪt wʌz ˈɔn , ˈɛnd kwˈoʊt ,|0
95
+ LJ040-0027.wav|hiː wʌz nˈɛvɚ sˈæɾɪsfˌaɪd wɪð ˈɛnɪθˌɪŋ .|0
96
+ LJ048-0228.wav|ænd ˈʌðɚz hˌuː wɜː pɹˈɛzənt sˈeɪ ðæt nˈoʊ ˈeɪdʒənt wʌz ɪnˈiːbɹɪˌeɪɾᵻd ɔːɹ ˈæktᵻd ɪmpɹˈɑːpɚli .|0
97
+ LJ003-0111.wav|hiː wʌz ɪŋ kˈɑːnsɪkwəns pˌʊt ˌaʊɾəv ðə pɹətˈɛkʃən ʌv ðɛɹ ɪntˈɜːnəl lˈɔː , ˈɛnd kwˈoʊt . ðɛɹ kˈoʊd wʌzɐ sˈʌbdʒɛkt ʌv sˌʌm kjˌʊɹɹɪˈɔsɪɾi .|0
98
+ LJ008-0258.wav|lˈɛt mˌiː ɹᵻtɹˈeɪs maɪ stˈɛps , ænd spˈiːk mˈoːɹ ɪn diːtˈeɪl ʌvðə tɹˈiːtmənt ʌvðə kəndˈɛmd ɪn ðoʊz blˈʌdθɜːsti ænd bɹˈuːɾəli ɪndˈɪfɹənt dˈeɪz ,|0
99
+ LJ029-0022.wav|ðɪ ɚɹˈɪdʒɪnəl plˈæŋ kˈɔːld fɚðə pɹˈɛzɪdənt tə spˈɛnd ˈoʊnli wˈʌn dˈeɪ ɪnðə stˈeɪt , mˌeɪkɪŋ wˈɜːlwɪnd vˈɪzɪts tə dˈæləs , fˈɔːɹt wˈɜːθ , sˌæn æntˈoʊnɪˌoʊ , ænd hjˈuːstən .|0
100
+ LJ004-0045.wav|mˈɪstɚ . stˈɜːdʒᵻz bˈoːɹn , sˌɜː dʒˈeɪmz mˈækɪntˌɑːʃ , sˌɜː dʒˈeɪmz skˈɑːɹlɪt , ænd wˈɪljəm wˈɪlbɚfˌoːɹs .|0
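
Each entry in Data/val_list.txt above (and presumably Data/train_list.txt, whose diff is too large to render) uses the same pipe-separated layout: wav filename, espeak-style phonemized text, and a numeric speaker id (always 0 in the single-speaker LJSpeech list above). A minimal parser sketch under that assumption:

def read_filelist(path):
    """Parse a <wav>|<phonemes>|<speaker id> filelist into a list of tuples."""
    entries = []
    with open(path, encoding="utf-8") as f:
        for line in f:
            line = line.rstrip("\n")
            if not line:
                continue
            wav, phonemes, speaker = line.split("|")
            entries.append((wav, phonemes, int(speaker)))
    return entries

val = read_filelist("Data/val_list.txt")
print(len(val), "validation utterances; first file:", val[0][0])
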
Demo/.ipynb_checkpoints/Inference_LJSpeech-checkpoint.ipynb ADDED
@@ -0,0 +1,554 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "id": "9adb7bd1",
6
+ "metadata": {},
7
+ "source": [
8
+ "# StyleTTS 2 Demo (LJSpeech)\n"
9
+ ]
10
+ },
11
+ {
12
+ "cell_type": "markdown",
13
+ "id": "6108384d",
14
+ "metadata": {},
15
+ "source": [
16
+ "### Utils"
17
+ ]
18
+ },
19
+ {
20
+ "cell_type": "code",
21
+ "execution_count": null,
22
+ "id": "96e173bf",
23
+ "metadata": {},
24
+ "outputs": [],
25
+ "source": [
26
+ "import torch\n",
27
+ "torch.manual_seed(0)\n",
28
+ "torch.backends.cudnn.benchmark = False\n",
29
+ "torch.backends.cudnn.deterministic = True\n",
30
+ "\n",
31
+ "import random\n",
32
+ "random.seed(0)\n",
33
+ "\n",
34
+ "import numpy as np\n",
35
+ "np.random.seed(0)"
36
+ ]
37
+ },
38
+ {
39
+ "cell_type": "code",
40
+ "execution_count": null,
41
+ "id": "da84c60f",
42
+ "metadata": {},
43
+ "outputs": [],
44
+ "source": [
45
+ "%cd .."
46
+ ]
47
+ },
48
+ {
49
+ "cell_type": "code",
50
+ "execution_count": null,
51
+ "id": "5a3ddcc8",
52
+ "metadata": {},
53
+ "outputs": [],
54
+ "source": [
55
+ "# load packages\n",
56
+ "import time\n",
57
+ "import random\n",
58
+ "import yaml\n",
59
+ "from munch import Munch\n",
60
+ "import numpy as np\n",
61
+ "import torch\n",
62
+ "from torch import nn\n",
63
+ "import torch.nn.functional as F\n",
64
+ "import torchaudio\n",
65
+ "import librosa\n",
66
+ "from nltk.tokenize import word_tokenize\n",
67
+ "\n",
68
+ "from models import *\n",
69
+ "from utils import *\n",
70
+ "from text_utils import TextCleaner\n",
71
+ "textclenaer = TextCleaner()\n",
72
+ "\n",
73
+ "%matplotlib inline"
74
+ ]
75
+ },
76
+ {
77
+ "cell_type": "code",
78
+ "execution_count": null,
79
+ "id": "bbdc04c0",
80
+ "metadata": {},
81
+ "outputs": [],
82
+ "source": [
83
+ "device = 'cuda' if torch.cuda.is_available() else 'cpu'"
84
+ ]
85
+ },
86
+ {
87
+ "cell_type": "code",
88
+ "execution_count": null,
89
+ "id": "00ee05e1",
90
+ "metadata": {},
91
+ "outputs": [],
92
+ "source": [
93
+ "to_mel = torchaudio.transforms.MelSpectrogram(\n",
94
+ " n_mels=80, n_fft=2048, win_length=1200, hop_length=300)\n",
95
+ "mean, std = -4, 4\n",
96
+ "\n",
97
+ "def length_to_mask(lengths):\n",
98
+ " mask = torch.arange(lengths.max()).unsqueeze(0).expand(lengths.shape[0], -1).type_as(lengths)\n",
99
+ " mask = torch.gt(mask+1, lengths.unsqueeze(1))\n",
100
+ " return mask\n",
101
+ "\n",
102
+ "def preprocess(wave):\n",
103
+ " wave_tensor = torch.from_numpy(wave).float()\n",
104
+ " mel_tensor = to_mel(wave_tensor)\n",
105
+ " mel_tensor = (torch.log(1e-5 + mel_tensor.unsqueeze(0)) - mean) / std\n",
106
+ " return mel_tensor\n",
107
+ "\n",
108
+ "def compute_style(ref_dicts):\n",
109
+ " reference_embeddings = {}\n",
110
+ " for key, path in ref_dicts.items():\n",
111
+ " wave, sr = librosa.load(path, sr=24000)\n",
112
+ " audio, index = librosa.effects.trim(wave, top_db=30)\n",
113
+ " if sr != 24000:\n",
114
+ " audio = librosa.resample(audio, sr, 24000)\n",
115
+ " mel_tensor = preprocess(audio).to(device)\n",
116
+ "\n",
117
+ " with torch.no_grad():\n",
118
+ " ref = model.style_encoder(mel_tensor.unsqueeze(1))\n",
119
+ " reference_embeddings[key] = (ref.squeeze(1), audio)\n",
120
+ " \n",
121
+ " return reference_embeddings"
122
+ ]
123
+ },
124
+ {
125
+ "cell_type": "markdown",
126
+ "id": "7b9cecbe",
127
+ "metadata": {},
128
+ "source": [
129
+ "### Load models"
130
+ ]
131
+ },
132
+ {
133
+ "cell_type": "code",
134
+ "execution_count": null,
135
+ "id": "64fc4c0f",
136
+ "metadata": {},
137
+ "outputs": [],
138
+ "source": [
139
+ "# load phonemizer\n",
140
+ "import phonemizer\n",
141
+ "global_phonemizer = phonemizer.backend.EspeakBackend(language='en-us', preserve_punctuation=True, with_stress=True)"
142
+ ]
143
+ },
144
+ {
145
+ "cell_type": "code",
146
+ "execution_count": null,
147
+ "id": "48e7b644",
148
+ "metadata": {},
149
+ "outputs": [],
150
+ "source": [
151
+ "config = yaml.safe_load(open(\"Models/LJSpeech/config.yml\"))\n",
152
+ "\n",
153
+ "# load pretrained ASR model\n",
154
+ "ASR_config = config.get('ASR_config', False)\n",
155
+ "ASR_path = config.get('ASR_path', False)\n",
156
+ "text_aligner = load_ASR_models(ASR_path, ASR_config)\n",
157
+ "\n",
158
+ "# load pretrained F0 model\n",
159
+ "F0_path = config.get('F0_path', False)\n",
160
+ "pitch_extractor = load_F0_models(F0_path)\n",
161
+ "\n",
162
+ "# load BERT model\n",
163
+ "from Utils.PLBERT.util import load_plbert\n",
164
+ "BERT_path = config.get('PLBERT_dir', False)\n",
165
+ "plbert = load_plbert(BERT_path)"
166
+ ]
167
+ },
168
+ {
169
+ "cell_type": "code",
170
+ "execution_count": null,
171
+ "id": "ffc18cf7",
172
+ "metadata": {},
173
+ "outputs": [],
174
+ "source": [
175
+ "model = build_model(recursive_munch(config['model_params']), text_aligner, pitch_extractor, plbert)\n",
176
+ "_ = [model[key].eval() for key in model]\n",
177
+ "_ = [model[key].to(device) for key in model]"
178
+ ]
179
+ },
180
+ {
181
+ "cell_type": "code",
182
+ "execution_count": null,
183
+ "id": "64529d5c",
184
+ "metadata": {},
185
+ "outputs": [],
186
+ "source": [
187
+ "params_whole = torch.load(\"Models/LJSpeech/epoch_2nd_00100.pth\", map_location='cpu')\n",
188
+ "params = params_whole['net']"
189
+ ]
190
+ },
191
+ {
192
+ "cell_type": "code",
193
+ "execution_count": null,
194
+ "id": "895d9706",
195
+ "metadata": {},
196
+ "outputs": [],
197
+ "source": [
198
+ "for key in model:\n",
199
+ " if key in params:\n",
200
+ " print('%s loaded' % key)\n",
201
+ " try:\n",
202
+ " model[key].load_state_dict(params[key])\n",
203
+ " except:\n",
204
+ " from collections import OrderedDict\n",
205
+ " state_dict = params[key]\n",
206
+ " new_state_dict = OrderedDict()\n",
207
+ " for k, v in state_dict.items():\n",
208
+ " name = k[7:] # remove `module.`\n",
209
+ " new_state_dict[name] = v\n",
210
+ " # load params\n",
211
+ " model[key].load_state_dict(new_state_dict, strict=False)\n",
212
+ "# except:\n",
213
+ "# _load(params[key], model[key])\n",
214
+ "_ = [model[key].eval() for key in model]"
215
+ ]
216
+ },
217
+ {
218
+ "cell_type": "code",
219
+ "execution_count": null,
220
+ "id": "c1a59db2",
221
+ "metadata": {},
222
+ "outputs": [],
223
+ "source": [
224
+ "from Modules.diffusion.sampler import DiffusionSampler, ADPM2Sampler, KarrasSchedule"
225
+ ]
226
+ },
227
+ {
228
+ "cell_type": "code",
229
+ "execution_count": null,
230
+ "id": "e30985ab",
231
+ "metadata": {},
232
+ "outputs": [],
233
+ "source": [
234
+ "sampler = DiffusionSampler(\n",
235
+ " model.diffusion.diffusion,\n",
236
+ " sampler=ADPM2Sampler(),\n",
237
+ " sigma_schedule=KarrasSchedule(sigma_min=0.0001, sigma_max=3.0, rho=9.0), # empirical parameters\n",
238
+ " clamp=False\n",
239
+ ")"
240
+ ]
241
+ },
242
+ {
243
+ "cell_type": "markdown",
244
+ "id": "b803110e",
245
+ "metadata": {},
246
+ "source": [
247
+ "### Synthesize speech"
248
+ ]
249
+ },
250
+ {
251
+ "cell_type": "code",
252
+ "execution_count": null,
253
+ "id": "24655f46",
254
+ "metadata": {},
255
+ "outputs": [],
256
+ "source": [
257
+ "# synthesize a text\n",
258
+ "text = ''' StyleTTS 2 is a text-to-speech model that leverages style diffusion and adversarial training with large speech language models to achieve human-level text-to-speech synthesis. '''"
259
+ ]
260
+ },
261
+ {
262
+ "cell_type": "code",
263
+ "execution_count": null,
264
+ "id": "ca57469c",
265
+ "metadata": {},
266
+ "outputs": [],
267
+ "source": [
268
+ "def inference(text, noise, diffusion_steps=5, embedding_scale=1):\n",
269
+ " text = text.strip()\n",
270
+ " text = text.replace('\"', '')\n",
271
+ " ps = global_phonemizer.phonemize([text])\n",
272
+ " ps = word_tokenize(ps[0])\n",
273
+ " ps = ' '.join(ps)\n",
274
+ "\n",
275
+ " tokens = textclenaer(ps)\n",
276
+ " tokens.insert(0, 0)\n",
277
+ " tokens = torch.LongTensor(tokens).to(device).unsqueeze(0)\n",
278
+ " \n",
279
+ " with torch.no_grad():\n",
280
+ " input_lengths = torch.LongTensor([tokens.shape[-1]]).to(tokens.device)\n",
281
+ " text_mask = length_to_mask(input_lengths).to(tokens.device)\n",
282
+ "\n",
283
+ " t_en = model.text_encoder(tokens, input_lengths, text_mask)\n",
284
+ " bert_dur = model.bert(tokens, attention_mask=(~text_mask).int())\n",
285
+ " d_en = model.bert_encoder(bert_dur).transpose(-1, -2) \n",
286
+ "\n",
287
+ " s_pred = sampler(noise, \n",
288
+ " embedding=bert_dur[0].unsqueeze(0), num_steps=diffusion_steps,\n",
289
+ " embedding_scale=embedding_scale).squeeze(0)\n",
290
+ "\n",
291
+ " s = s_pred[:, 128:]\n",
292
+ " ref = s_pred[:, :128]\n",
293
+ "\n",
294
+ " d = model.predictor.text_encoder(d_en, s, input_lengths, text_mask)\n",
295
+ "\n",
296
+ " x, _ = model.predictor.lstm(d)\n",
297
+ " duration = model.predictor.duration_proj(x)\n",
298
+ " duration = torch.sigmoid(duration).sum(axis=-1)\n",
299
+ " pred_dur = torch.round(duration.squeeze()).clamp(min=1)\n",
300
+ "\n",
301
+ " pred_dur[-1] += 5\n",
302
+ "\n",
303
+ " pred_aln_trg = torch.zeros(input_lengths, int(pred_dur.sum().data))\n",
304
+ " c_frame = 0\n",
305
+ " for i in range(pred_aln_trg.size(0)):\n",
306
+ " pred_aln_trg[i, c_frame:c_frame + int(pred_dur[i].data)] = 1\n",
307
+ " c_frame += int(pred_dur[i].data)\n",
308
+ "\n",
309
+ " # encode prosody\n",
310
+ " en = (d.transpose(-1, -2) @ pred_aln_trg.unsqueeze(0).to(device))\n",
311
+ " F0_pred, N_pred = model.predictor.F0Ntrain(en, s)\n",
312
+ " out = model.decoder((t_en @ pred_aln_trg.unsqueeze(0).to(device)), \n",
313
+ " F0_pred, N_pred, ref.squeeze().unsqueeze(0))\n",
314
+ " \n",
315
+ " return out.squeeze().cpu().numpy()"
316
+ ]
317
+ },
318
+ {
319
+ "cell_type": "markdown",
320
+ "id": "d438ef4f",
321
+ "metadata": {},
322
+ "source": [
323
+ "#### Basic synthesis (5 diffusion steps)"
324
+ ]
325
+ },
326
+ {
327
+ "cell_type": "code",
328
+ "execution_count": null,
329
+ "id": "d3d7f7d5",
330
+ "metadata": {
331
+ "scrolled": true
332
+ },
333
+ "outputs": [],
334
+ "source": [
335
+ "start = time.time()\n",
336
+ "noise = torch.randn(1,1,256).to(device)\n",
337
+ "wav = inference(text, noise, diffusion_steps=5, embedding_scale=1)\n",
338
+ "rtf = (time.time() - start) / (len(wav) / 24000)\n",
339
+ "print(f\"RTF = {rtf:5f}\")\n",
340
+ "import IPython.display as ipd\n",
341
+ "display(ipd.Audio(wav, rate=24000))"
342
+ ]
343
+ },
344
+ {
345
+ "cell_type": "markdown",
346
+ "id": "2d5d9df0",
347
+ "metadata": {},
348
+ "source": [
349
+ "#### With higher diffusion steps (more diverse)\n",
350
+ "Since the sampler is ancestral, the higher the stpes, the more diverse the samples are, with the cost of slower synthesis speed."
351
+ ]
352
+ },
353
+ {
354
+ "cell_type": "code",
355
+ "execution_count": null,
356
+ "id": "a10129fd",
357
+ "metadata": {},
358
+ "outputs": [],
359
+ "source": [
360
+ "start = time.time()\n",
361
+ "noise = torch.randn(1,1,256).to(device)\n",
362
+ "wav = inference(text, noise, diffusion_steps=10, embedding_scale=1)\n",
363
+ "rtf = (time.time() - start) / (len(wav) / 24000)\n",
364
+ "print(f\"RTF = {rtf:5f}\")\n",
365
+ "import IPython.display as ipd\n",
366
+ "display(ipd.Audio(wav, rate=24000))"
367
+ ]
368
+ },
369
+ {
370
+ "cell_type": "markdown",
371
+ "id": "1877ea15",
372
+ "metadata": {},
373
+ "source": [
374
+ "### Speech expressiveness\n",
375
+ "The following section recreates the samples shown in [Section 6](https://styletts2.github.io/#emo) of the demo page."
376
+ ]
377
+ },
378
+ {
379
+ "cell_type": "markdown",
380
+ "id": "4c4777b7",
381
+ "metadata": {},
382
+ "source": [
383
+ "#### With embedding_scale=1\n",
384
+ "This is the classifier-free guidance scale. The higher the scale, the more conditional the style is to the input text and hence more emotional. "
385
+ ]
386
+ },
387
+ {
388
+ "cell_type": "code",
389
+ "execution_count": null,
390
+ "id": "c29ea2f0",
391
+ "metadata": {},
392
+ "outputs": [],
393
+ "source": [
394
+ "texts = {}\n",
395
+ "texts['Happy'] = \"We are happy to invite you to join us on a journey to the past, where we will visit the most amazing monuments ever built by human hands.\"\n",
396
+ "texts['Sad'] = \"I am sorry to say that we have suffered a severe setback in our efforts to restore prosperity and confidence.\"\n",
397
+ "texts['Angry'] = \"The field of astronomy is a joke! Its theories are based on flawed observations and biased interpretations!\"\n",
398
+ "texts['Surprised'] = \"I can't believe it! You mean to tell me that you have discovered a new species of bacteria in this pond?\"\n",
399
+ "\n",
400
+ "for k,v in texts.items():\n",
401
+ " noise = torch.randn(1,1,256).to(device)\n",
402
+ " wav = inference(v, noise, diffusion_steps=10, embedding_scale=1)\n",
403
+ " print(k + \": \")\n",
404
+ " display(ipd.Audio(wav, rate=24000, normalize=False))"
405
+ ]
406
+ },
407
+ {
408
+ "cell_type": "markdown",
409
+ "id": "3c89499f",
410
+ "metadata": {},
411
+ "source": [
412
+ "#### With embedding_scale=2"
413
+ ]
414
+ },
415
+ {
416
+ "cell_type": "code",
417
+ "execution_count": null,
418
+ "id": "f73be3aa",
419
+ "metadata": {},
420
+ "outputs": [],
421
+ "source": [
422
+ "texts = {}\n",
423
+ "texts['Happy'] = \"We are happy to invite you to join us on a journey to the past, where we will visit the most amazing monuments ever built by human hands.\"\n",
424
+ "texts['Sad'] = \"I am sorry to say that we have suffered a severe setback in our efforts to restore prosperity and confidence.\"\n",
425
+ "texts['Angry'] = \"The field of astronomy is a joke! Its theories are based on flawed observations and biased interpretations!\"\n",
426
+ "texts['Surprised'] = \"I can't believe it! You mean to tell me that you have discovered a new species of bacteria in this pond?\"\n",
427
+ "\n",
428
+ "for k,v in texts.items():\n",
429
+ " noise = torch.randn(1,1,256).to(device)\n",
430
+ " wav = inference(v, noise, diffusion_steps=10, embedding_scale=2) # embedding_scale=2 for more pronounced emotion\n",
431
+ " print(k + \": \")\n",
432
+ " display(ipd.Audio(wav, rate=24000, normalize=False))"
433
+ ]
434
+ },
435
+ {
436
+ "cell_type": "markdown",
437
+ "id": "9320da63",
438
+ "metadata": {},
439
+ "source": [
440
+ "### Long-form generation\n",
441
+ "This section includes basic implementation of Algorithm 1 in the paper for consistent longform audio generation. The example passage is taken from [Section 5](https://styletts2.github.io/#long) of the demo page. "
442
+ ]
443
+ },
444
+ {
445
+ "cell_type": "code",
446
+ "execution_count": null,
447
+ "id": "cdd4db51",
448
+ "metadata": {},
449
+ "outputs": [],
450
+ "source": [
451
+ "passage = '''If the supply of fruit is greater than the family needs, it may be made a source of income by sending the fresh fruit to the market if there is one near enough, or by preserving, canning, and making jelly for sale. To make such an enterprise a success the fruit and work must be first class. There is magic in the word \"Homemade,\" when the product appeals to the eye and the palate; but many careless and incompetent people have found to their sorrow that this word has not magic enough to float inferior goods on the market. As a rule large canning and preserving establishments are clean and have the best appliances, and they employ chemists and skilled labor. The home product must be very good to compete with the attractive goods that are sent out from such establishments. Yet for first-class homemade products there is a market in all large cities. All first-class grocers have customers who purchase such goods.'''"
452
+ ]
453
+ },
454
+ {
455
+ "cell_type": "code",
456
+ "execution_count": null,
457
+ "id": "ebb941c8",
458
+ "metadata": {},
459
+ "outputs": [],
460
+ "source": [
461
+ "def LFinference(text, s_prev, noise, alpha=0.7, diffusion_steps=5, embedding_scale=1):\n",
462
+ " text = text.strip()\n",
463
+ " text = text.replace('\"', '')\n",
464
+ " ps = global_phonemizer.phonemize([text])\n",
465
+ " ps = word_tokenize(ps[0])\n",
466
+ " ps = ' '.join(ps)\n",
467
+ "\n",
468
+ " tokens = textclenaer(ps)\n",
469
+ " tokens.insert(0, 0)\n",
470
+ " tokens = torch.LongTensor(tokens).to(device).unsqueeze(0)\n",
471
+ " \n",
472
+ " with torch.no_grad():\n",
473
+ " input_lengths = torch.LongTensor([tokens.shape[-1]]).to(tokens.device)\n",
474
+ " text_mask = length_to_mask(input_lengths).to(tokens.device)\n",
475
+ "\n",
476
+ " t_en = model.text_encoder(tokens, input_lengths, text_mask)\n",
477
+ " bert_dur = model.bert(tokens, attention_mask=(~text_mask).int())\n",
478
+ " d_en = model.bert_encoder(bert_dur).transpose(-1, -2) \n",
479
+ "\n",
480
+ " s_pred = sampler(noise, \n",
481
+ " embedding=bert_dur[0].unsqueeze(0), num_steps=diffusion_steps,\n",
482
+ " embedding_scale=embedding_scale).squeeze(0)\n",
483
+ " \n",
484
+ " if s_prev is not None:\n",
485
+ " # convex combination of previous and current style\n",
486
+ " s_pred = alpha * s_prev + (1 - alpha) * s_pred\n",
487
+ " \n",
488
+ " s = s_pred[:, 128:]\n",
489
+ " ref = s_pred[:, :128]\n",
490
+ "\n",
491
+ " d = model.predictor.text_encoder(d_en, s, input_lengths, text_mask)\n",
492
+ "\n",
493
+ " x, _ = model.predictor.lstm(d)\n",
494
+ " duration = model.predictor.duration_proj(x)\n",
495
+ " duration = torch.sigmoid(duration).sum(axis=-1)\n",
496
+ " pred_dur = torch.round(duration.squeeze()).clamp(min=1)\n",
497
+ "\n",
498
+ " pred_aln_trg = torch.zeros(input_lengths, int(pred_dur.sum().data))\n",
499
+ " c_frame = 0\n",
500
+ " for i in range(pred_aln_trg.size(0)):\n",
501
+ " pred_aln_trg[i, c_frame:c_frame + int(pred_dur[i].data)] = 1\n",
502
+ " c_frame += int(pred_dur[i].data)\n",
503
+ "\n",
504
+ " # encode prosody\n",
505
+ " en = (d.transpose(-1, -2) @ pred_aln_trg.unsqueeze(0).to(device))\n",
506
+ " F0_pred, N_pred = model.predictor.F0Ntrain(en, s)\n",
507
+ " out = model.decoder((t_en @ pred_aln_trg.unsqueeze(0).to(device)), \n",
508
+ " F0_pred, N_pred, ref.squeeze().unsqueeze(0))\n",
509
+ " \n",
510
+ " return out.squeeze().cpu().numpy(), s_pred"
511
+ ]
512
+ },
513
+ {
514
+ "cell_type": "code",
515
+ "execution_count": null,
516
+ "id": "7ca0ef2e",
517
+ "metadata": {},
518
+ "outputs": [],
519
+ "source": [
520
+ "sentences = passage.split('.') # simple split by comma\n",
521
+ "wavs = []\n",
522
+ "s_prev = None\n",
523
+ "for text in sentences:\n",
524
+ " if text.strip() == \"\": continue\n",
525
+ " text += '.' # add it back\n",
526
+ " noise = torch.randn(1,1,256).to(device)\n",
527
+ " wav, s_prev = LFinference(text, s_prev, noise, alpha=0.7, diffusion_steps=10, embedding_scale=1.5)\n",
528
+ " wavs.append(wav)\n",
529
+ "display(ipd.Audio(np.concatenate(wavs), rate=24000, normalize=False))"
530
+ ]
531
+ }
532
+ ],
533
+ "metadata": {
534
+ "kernelspec": {
535
+ "display_name": "NLP",
536
+ "language": "python",
537
+ "name": "nlp"
538
+ },
539
+ "language_info": {
540
+ "codemirror_mode": {
541
+ "name": "ipython",
542
+ "version": 3
543
+ },
544
+ "file_extension": ".py",
545
+ "mimetype": "text/x-python",
546
+ "name": "python",
547
+ "nbconvert_exporter": "python",
548
+ "pygments_lexer": "ipython3",
549
+ "version": "3.9.7"
550
+ }
551
+ },
552
+ "nbformat": 4,
553
+ "nbformat_minor": 5
554
+ }
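
The convex combination s_pred = alpha * s_prev + (1 - alpha) * s_pred inside LFinference above is an exponential moving average over sentence-level styles, which is what keeps the long-form output consistent from sentence to sentence. A small illustration of how quickly older styles fade with the alpha = 0.7 used in the notebook (analytic weights only, no model needed):

alpha = 0.7

# Approximate weight that the style sampled k sentences ago still carries in
# the current sentence's style (exact for all but the very first sentence).
weights = [(1 - alpha) * alpha ** k for k in range(6)]
print([round(w, 3) for w in weights])  # [0.3, 0.21, 0.147, 0.103, 0.072, 0.05]
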
Demo/.ipynb_checkpoints/Inference_LibriTTS-checkpoint.ipynb ADDED
@@ -0,0 +1,1155 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "id": "9adb7bd1",
6
+ "metadata": {},
7
+ "source": [
8
+ "# StyleTTS 2 Demo (LibriTTS)\n",
9
+ "\n",
10
+ "Before you run the following cells, please make sure you have downloaded [reference_audio.zip](https://huggingface.co/yl4579/StyleTTS2-LibriTTS/resolve/main/reference_audio.zip) and unzipped it under the `demo` folder."
11
+ ]
12
+ },
13
+ {
14
+ "cell_type": "markdown",
15
+ "id": "6108384d",
16
+ "metadata": {},
17
+ "source": [
18
+ "### Utils"
19
+ ]
20
+ },
21
+ {
22
+ "cell_type": "code",
23
+ "execution_count": null,
24
+ "id": "96e173bf",
25
+ "metadata": {},
26
+ "outputs": [],
27
+ "source": [
28
+ "import torch\n",
29
+ "torch.manual_seed(0)\n",
30
+ "torch.backends.cudnn.benchmark = False\n",
31
+ "torch.backends.cudnn.deterministic = True\n",
32
+ "\n",
33
+ "import random\n",
34
+ "random.seed(0)\n",
35
+ "\n",
36
+ "import numpy as np\n",
37
+ "np.random.seed(0)"
38
+ ]
39
+ },
40
+ {
41
+ "cell_type": "code",
42
+ "execution_count": null,
43
+ "id": "da84c60f",
44
+ "metadata": {},
45
+ "outputs": [],
46
+ "source": [
47
+ "%cd .."
48
+ ]
49
+ },
50
+ {
51
+ "cell_type": "code",
52
+ "execution_count": null,
53
+ "id": "5a3ddcc8",
54
+ "metadata": {},
55
+ "outputs": [],
56
+ "source": [
57
+ "# load packages\n",
58
+ "import time\n",
59
+ "import random\n",
60
+ "import yaml\n",
61
+ "from munch import Munch\n",
62
+ "import numpy as np\n",
63
+ "import torch\n",
64
+ "from torch import nn\n",
65
+ "import torch.nn.functional as F\n",
66
+ "import torchaudio\n",
67
+ "import librosa\n",
68
+ "from nltk.tokenize import word_tokenize\n",
69
+ "\n",
70
+ "from models import *\n",
71
+ "from utils import *\n",
72
+ "from text_utils import TextCleaner\n",
73
+ "textclenaer = TextCleaner()\n",
74
+ "\n",
75
+ "%matplotlib inline"
76
+ ]
77
+ },
78
+ {
79
+ "cell_type": "code",
80
+ "execution_count": null,
81
+ "id": "00ee05e1",
82
+ "metadata": {},
83
+ "outputs": [],
84
+ "source": [
85
+ "to_mel = torchaudio.transforms.MelSpectrogram(\n",
86
+ " n_mels=80, n_fft=2048, win_length=1200, hop_length=300)\n",
87
+ "mean, std = -4, 4\n",
88
+ "\n",
89
+ "def length_to_mask(lengths):\n",
90
+ " mask = torch.arange(lengths.max()).unsqueeze(0).expand(lengths.shape[0], -1).type_as(lengths)\n",
91
+ " mask = torch.gt(mask+1, lengths.unsqueeze(1))\n",
92
+ " return mask\n",
93
+ "\n",
94
+ "def preprocess(wave):\n",
95
+ " wave_tensor = torch.from_numpy(wave).float()\n",
96
+ " mel_tensor = to_mel(wave_tensor)\n",
97
+ " mel_tensor = (torch.log(1e-5 + mel_tensor.unsqueeze(0)) - mean) / std\n",
98
+ " return mel_tensor\n",
99
+ "\n",
100
+ "def compute_style(path):\n",
101
+ " wave, sr = librosa.load(path, sr=24000)\n",
102
+ " audio, index = librosa.effects.trim(wave, top_db=30)\n",
103
+ " if sr != 24000:\n",
104
+ " audio = librosa.resample(audio, sr, 24000)\n",
105
+ " mel_tensor = preprocess(audio).to(device)\n",
106
+ "\n",
107
+ " with torch.no_grad():\n",
108
+ " ref_s = model.style_encoder(mel_tensor.unsqueeze(1))\n",
109
+ " ref_p = model.predictor_encoder(mel_tensor.unsqueeze(1))\n",
110
+ "\n",
111
+ " return torch.cat([ref_s, ref_p], dim=1)"
112
+ ]
113
+ },
114
+ {
115
+ "cell_type": "code",
116
+ "execution_count": null,
117
+ "id": "bbdc04c0",
118
+ "metadata": {},
119
+ "outputs": [],
120
+ "source": [
121
+ "device = 'cuda' if torch.cuda.is_available() else 'cpu'"
122
+ ]
123
+ },
124
+ {
125
+ "cell_type": "markdown",
126
+ "id": "7b9cecbe",
127
+ "metadata": {},
128
+ "source": [
129
+ "### Load models"
130
+ ]
131
+ },
132
+ {
133
+ "cell_type": "code",
134
+ "execution_count": null,
135
+ "id": "64fc4c0f",
136
+ "metadata": {},
137
+ "outputs": [],
138
+ "source": [
139
+ "# load phonemizer\n",
140
+ "import phonemizer\n",
141
+ "global_phonemizer = phonemizer.backend.EspeakBackend(language='en-us', preserve_punctuation=True, with_stress=True)"
142
+ ]
143
+ },
144
+ {
145
+ "cell_type": "code",
146
+ "execution_count": null,
147
+ "id": "48e7b644",
148
+ "metadata": {},
149
+ "outputs": [],
150
+ "source": [
151
+ "config = yaml.safe_load(open(\"Models/LibriTTS/config.yml\"))\n",
152
+ "\n",
153
+ "# load pretrained ASR model\n",
154
+ "ASR_config = config.get('ASR_config', False)\n",
155
+ "ASR_path = config.get('ASR_path', False)\n",
156
+ "text_aligner = load_ASR_models(ASR_path, ASR_config)\n",
157
+ "\n",
158
+ "# load pretrained F0 model\n",
159
+ "F0_path = config.get('F0_path', False)\n",
160
+ "pitch_extractor = load_F0_models(F0_path)\n",
161
+ "\n",
162
+ "# load BERT model\n",
163
+ "from Utils.PLBERT.util import load_plbert\n",
164
+ "BERT_path = config.get('PLBERT_dir', False)\n",
165
+ "plbert = load_plbert(BERT_path)"
166
+ ]
167
+ },
168
+ {
169
+ "cell_type": "code",
170
+ "execution_count": null,
171
+ "id": "ffc18cf7",
172
+ "metadata": {},
173
+ "outputs": [],
174
+ "source": [
175
+ "model_params = recursive_munch(config['model_params'])\n",
176
+ "model = build_model(model_params, text_aligner, pitch_extractor, plbert)\n",
177
+ "_ = [model[key].eval() for key in model]\n",
178
+ "_ = [model[key].to(device) for key in model]"
179
+ ]
180
+ },
181
+ {
182
+ "cell_type": "code",
183
+ "execution_count": null,
184
+ "id": "64529d5c",
185
+ "metadata": {},
186
+ "outputs": [],
187
+ "source": [
188
+ "params_whole = torch.load(\"Models/LibriTTS/epochs_2nd_00020.pth\", map_location='cpu')\n",
189
+ "params = params_whole['net']"
190
+ ]
191
+ },
192
+ {
193
+ "cell_type": "code",
194
+ "execution_count": null,
195
+ "id": "895d9706",
196
+ "metadata": {},
197
+ "outputs": [],
198
+ "source": [
199
+ "for key in model:\n",
200
+ " if key in params:\n",
201
+ " print('%s loaded' % key)\n",
202
+ " try:\n",
203
+ " model[key].load_state_dict(params[key])\n",
204
+ " except:\n",
205
+ " from collections import OrderedDict\n",
206
+ " state_dict = params[key]\n",
207
+ " new_state_dict = OrderedDict()\n",
208
+ " for k, v in state_dict.items():\n",
209
+ " name = k[7:] # remove `module.`\n",
210
+ " new_state_dict[name] = v\n",
211
+ " # load params\n",
212
+ " model[key].load_state_dict(new_state_dict, strict=False)\n",
213
+ "# except:\n",
214
+ "# _load(params[key], model[key])\n",
215
+ "_ = [model[key].eval() for key in model]"
216
+ ]
217
+ },
218
+ {
219
+ "cell_type": "code",
220
+ "execution_count": null,
221
+ "id": "c1a59db2",
222
+ "metadata": {},
223
+ "outputs": [],
224
+ "source": [
225
+ "from Modules.diffusion.sampler import DiffusionSampler, ADPM2Sampler, KarrasSchedule"
226
+ ]
227
+ },
228
+ {
229
+ "cell_type": "code",
230
+ "execution_count": null,
231
+ "id": "e30985ab",
232
+ "metadata": {},
233
+ "outputs": [],
234
+ "source": [
235
+ "sampler = DiffusionSampler(\n",
236
+ " model.diffusion.diffusion,\n",
237
+ " sampler=ADPM2Sampler(),\n",
238
+ " sigma_schedule=KarrasSchedule(sigma_min=0.0001, sigma_max=3.0, rho=9.0), # empirical parameters\n",
239
+ " clamp=False\n",
240
+ ")"
241
+ ]
242
+ },
243
+ {
244
+ "cell_type": "markdown",
245
+ "id": "b803110e",
246
+ "metadata": {},
247
+ "source": [
248
+ "### Synthesize speech"
249
+ ]
250
+ },
251
+ {
252
+ "cell_type": "code",
253
+ "execution_count": null,
254
+ "id": "ca57469c",
255
+ "metadata": {},
256
+ "outputs": [],
257
+ "source": [
258
+ "def inference(text, ref_s, alpha = 0.3, beta = 0.7, diffusion_steps=5, embedding_scale=1):\n",
259
+ " text = text.strip()\n",
260
+ " ps = global_phonemizer.phonemize([text])\n",
261
+ " ps = word_tokenize(ps[0])\n",
262
+ " ps = ' '.join(ps)\n",
263
+ " tokens = textclenaer(ps)\n",
264
+ " tokens.insert(0, 0)\n",
265
+ " tokens = torch.LongTensor(tokens).to(device).unsqueeze(0)\n",
266
+ " \n",
267
+ " with torch.no_grad():\n",
268
+ " input_lengths = torch.LongTensor([tokens.shape[-1]]).to(device)\n",
269
+ " text_mask = length_to_mask(input_lengths).to(device)\n",
270
+ "\n",
271
+ " t_en = model.text_encoder(tokens, input_lengths, text_mask)\n",
272
+ " bert_dur = model.bert(tokens, attention_mask=(~text_mask).int())\n",
273
+ " d_en = model.bert_encoder(bert_dur).transpose(-1, -2) \n",
274
+ "\n",
275
+ " s_pred = sampler(noise = torch.randn((1, 256)).unsqueeze(1).to(device), \n",
276
+ " embedding=bert_dur,\n",
277
+ " embedding_scale=embedding_scale,\n",
278
+ " features=ref_s, # reference from the same speaker as the embedding\n",
279
+ " num_steps=diffusion_steps).squeeze(1)\n",
280
+ "\n",
281
+ "\n",
282
+ " s = s_pred[:, 128:]\n",
283
+ " ref = s_pred[:, :128]\n",
284
+ "\n",
285
+ " ref = alpha * ref + (1 - alpha) * ref_s[:, :128]\n",
286
+ " s = beta * s + (1 - beta) * ref_s[:, 128:]\n",
287
+ "\n",
288
+ " d = model.predictor.text_encoder(d_en, \n",
289
+ " s, input_lengths, text_mask)\n",
290
+ "\n",
291
+ " x, _ = model.predictor.lstm(d)\n",
292
+ " duration = model.predictor.duration_proj(x)\n",
293
+ "\n",
294
+ " duration = torch.sigmoid(duration).sum(axis=-1)\n",
295
+ " pred_dur = torch.round(duration.squeeze()).clamp(min=1)\n",
296
+ "\n",
297
+ "\n",
298
+ " pred_aln_trg = torch.zeros(input_lengths, int(pred_dur.sum().data))\n",
299
+ " c_frame = 0\n",
300
+ " for i in range(pred_aln_trg.size(0)):\n",
301
+ " pred_aln_trg[i, c_frame:c_frame + int(pred_dur[i].data)] = 1\n",
302
+ " c_frame += int(pred_dur[i].data)\n",
303
+ "\n",
304
+ " # encode prosody\n",
305
+ " en = (d.transpose(-1, -2) @ pred_aln_trg.unsqueeze(0).to(device))\n",
306
+ " if model_params.decoder.type == \"hifigan\":\n",
307
+ " asr_new = torch.zeros_like(en)\n",
308
+ " asr_new[:, :, 0] = en[:, :, 0]\n",
309
+ " asr_new[:, :, 1:] = en[:, :, 0:-1]\n",
310
+ " en = asr_new\n",
311
+ "\n",
312
+ " F0_pred, N_pred = model.predictor.F0Ntrain(en, s)\n",
313
+ "\n",
314
+ " asr = (t_en @ pred_aln_trg.unsqueeze(0).to(device))\n",
315
+ " if model_params.decoder.type == \"hifigan\":\n",
316
+ " asr_new = torch.zeros_like(asr)\n",
317
+ " asr_new[:, :, 0] = asr[:, :, 0]\n",
318
+ " asr_new[:, :, 1:] = asr[:, :, 0:-1]\n",
319
+ " asr = asr_new\n",
320
+ "\n",
321
+ " out = model.decoder(asr, \n",
322
+ " F0_pred, N_pred, ref.squeeze().unsqueeze(0))\n",
323
+ " \n",
324
+ " \n",
325
+ " return out.squeeze().cpu().numpy()[..., :-50] # weird pulse at the end of the model, need to be fixed later"
326
+ ]
327
+ },
328
+ {
329
+ "cell_type": "markdown",
330
+ "id": "d438ef4f",
331
+ "metadata": {},
332
+ "source": [
333
+ "#### Basic synthesis (5 diffusion steps, seen speakers)"
334
+ ]
335
+ },
336
+ {
337
+ "cell_type": "code",
338
+ "execution_count": null,
339
+ "id": "cace9787",
340
+ "metadata": {},
341
+ "outputs": [],
342
+ "source": [
343
+ "text = ''' StyleTTS 2 is a text to speech model that leverages style diffusion and adversarial training with large speech language models to achieve human level text to speech synthesis. '''"
344
+ ]
345
+ },
346
+ {
347
+ "cell_type": "code",
348
+ "execution_count": null,
349
+ "id": "7c88f461",
350
+ "metadata": {},
351
+ "outputs": [],
352
+ "source": [
353
+ "reference_dicts = {}\n",
354
+ "reference_dicts['696_92939'] = \"Demo/reference_audio/696_92939_000016_000006.wav\"\n",
355
+ "reference_dicts['1789_142896'] = \"Demo/reference_audio/1789_142896_000022_000005.wav\""
356
+ ]
357
+ },
358
+ {
359
+ "cell_type": "code",
360
+ "execution_count": null,
361
+ "id": "16e8ac60",
362
+ "metadata": {},
363
+ "outputs": [],
364
+ "source": [
365
+ "start = time.time()\n",
366
+ "noise = torch.randn(1,1,256).to(device)\n",
367
+ "for k, path in reference_dicts.items():\n",
368
+ " ref_s = compute_style(path)\n",
369
+ " \n",
370
+ " wav = inference(text, ref_s, alpha=0.3, beta=0.7, diffusion_steps=5, embedding_scale=1)\n",
371
+ " rtf = (time.time() - start) / (len(wav) / 24000)\n",
372
+ " print(f\"RTF = {rtf:5f}\")\n",
373
+ " import IPython.display as ipd\n",
374
+ " print(k + ' Synthesized:')\n",
375
+ " display(ipd.Audio(wav, rate=24000, normalize=False))\n",
376
+ " print('Reference:')\n",
377
+ " display(ipd.Audio(path, rate=24000, normalize=False))"
378
+ ]
379
+ },
380
+ {
381
+ "cell_type": "markdown",
382
+ "id": "14838708",
383
+ "metadata": {},
384
+ "source": [
385
+ "#### With higher diffusion steps (more diverse)\n",
386
+ "\n",
387
+ "Since the sampler is ancestral, the higher the stpes, the more diverse the samples are, with the cost of slower synthesis speed."
388
+ ]
389
+ },
390
+ {
391
+ "cell_type": "code",
392
+ "execution_count": null,
393
+ "id": "6fbff03b",
394
+ "metadata": {},
395
+ "outputs": [],
396
+ "source": [
397
+ "noise = torch.randn(1,1,256).to(device)\n",
398
+ "for k, path in reference_dicts.items():\n",
399
+ " ref_s = compute_style(path)\n",
400
+ " start = time.time()\n",
401
+ " wav = inference(text, ref_s, alpha=0.3, beta=0.7, diffusion_steps=10, embedding_scale=1)\n",
402
+ " rtf = (time.time() - start) / (len(wav) / 24000)\n",
403
+ " print(f\"RTF = {rtf:5f}\")\n",
404
+ " import IPython.display as ipd\n",
405
+ " print(k + ' Synthesized:')\n",
406
+ " display(ipd.Audio(wav, rate=24000, normalize=False))\n",
407
+ " print(k + ' Reference:')\n",
408
+ " display(ipd.Audio(path, rate=24000, normalize=False))"
409
+ ]
410
+ },
411
+ {
412
+ "cell_type": "markdown",
413
+ "id": "7e6867fd",
414
+ "metadata": {},
415
+ "source": [
416
+ "#### Basic synthesis (5 diffusion steps, umseen speakers)\n",
417
+ "The following samples are to reproduce samples in [Section 4](https://styletts2.github.io/#libri) of the demo page. All spsakers are unseen during training. You can compare the generated samples to popular zero-shot TTS models like Vall-E and NaturalSpeech 2."
418
+ ]
419
+ },
420
+ {
421
+ "cell_type": "code",
422
+ "execution_count": null,
423
+ "id": "f4e8faa0",
424
+ "metadata": {},
425
+ "outputs": [],
426
+ "source": [
427
+ "reference_dicts = {}\n",
428
+ "# format: (path, text)\n",
429
+ "reference_dicts['1221-135767'] = (\"Demo/reference_audio/1221-135767-0014.wav\", \"Yea, his honourable worship is within, but he hath a godly minister or two with him, and likewise a leech.\")\n",
430
+ "reference_dicts['5639-40744'] = (\"Demo/reference_audio/5639-40744-0020.wav\", \"Thus did this humane and right minded father comfort his unhappy daughter, and her mother embracing her again, did all she could to soothe her feelings.\")\n",
431
+ "reference_dicts['908-157963'] = (\"Demo/reference_audio/908-157963-0027.wav\", \"And lay me down in my cold bed and leave my shining lot.\")\n",
432
+ "reference_dicts['4077-13754'] = (\"Demo/reference_audio/4077-13754-0000.wav\", \"The army found the people in poverty and left them in comparative wealth.\")"
433
+ ]
434
+ },
435
+ {
436
+ "cell_type": "code",
437
+ "execution_count": null,
438
+ "id": "653f1406",
439
+ "metadata": {},
440
+ "outputs": [],
441
+ "source": [
442
+ "noise = torch.randn(1,1,256).to(device)\n",
443
+ "for k, v in reference_dicts.items():\n",
444
+ " path, text = v\n",
445
+ " ref_s = compute_style(path)\n",
446
+ " start = time.time()\n",
447
+ " wav = inference(text, ref_s, alpha=0.3, beta=0.7, diffusion_steps=5, embedding_scale=1)\n",
448
+ " rtf = (time.time() - start) / (len(wav) / 24000)\n",
449
+ " print(f\"RTF = {rtf:5f}\")\n",
450
+ " import IPython.display as ipd\n",
451
+ " print(k + ' Synthesized: ' + text)\n",
452
+ " display(ipd.Audio(wav, rate=24000, normalize=False))\n",
453
+ " print(k + ' Reference:')\n",
454
+ " display(ipd.Audio(path, rate=24000, normalize=False))"
455
+ ]
456
+ },
457
+ {
458
+ "cell_type": "markdown",
459
+ "id": "141e91b3",
460
+ "metadata": {},
461
+ "source": [
462
+ "### Speech expressiveness\n",
463
+ "\n",
464
+ "The following section recreates the samples shown in [Section 6](https://styletts2.github.io/#emo) of the demo page. The speaker reference used is `1221-135767-0014.wav`, which is unseen during training. \n",
465
+ "\n",
466
+ "#### With `embedding_scale=1`\n",
467
+ "This is the classifier-free guidance scale. The higher the scale, the more conditional the style is to the input text and hence more emotional.\n",
468
+ "\n"
469
+ ]
470
+ },
471
+ {
472
+ "cell_type": "code",
473
+ "execution_count": null,
474
+ "id": "81addda4",
475
+ "metadata": {},
476
+ "outputs": [],
477
+ "source": [
478
+ "ref_s = compute_style(\"Demo/reference_audio/1221-135767-0014.wav\")"
479
+ ]
480
+ },
481
+ {
482
+ "cell_type": "code",
483
+ "execution_count": null,
484
+ "id": "be1b2a11",
485
+ "metadata": {},
486
+ "outputs": [],
487
+ "source": [
488
+ "texts = {}\n",
489
+ "texts['Happy'] = \"We are happy to invite you to join us on a journey to the past, where we will visit the most amazing monuments ever built by human hands.\"\n",
490
+ "texts['Sad'] = \"I am sorry to say that we have suffered a severe setback in our efforts to restore prosperity and confidence.\"\n",
491
+ "texts['Angry'] = \"The field of astronomy is a joke! Its theories are based on flawed observations and biased interpretations!\"\n",
492
+ "texts['Surprised'] = \"I can't believe it! You mean to tell me that you have discovered a new species of bacteria in this pond?\"\n",
493
+ "\n",
494
+ "for k,v in texts.items():\n",
495
+ " wav = inference(v, ref_s, diffusion_steps=10, alpha=0.3, beta=0.7, embedding_scale=1)\n",
496
+ " print(k + \": \")\n",
497
+ " display(ipd.Audio(wav, rate=24000, normalize=False))"
498
+ ]
499
+ },
500
+ {
501
+ "cell_type": "markdown",
502
+ "id": "96d262b8",
503
+ "metadata": {},
504
+ "source": [
505
+ "#### With `embedding_scale=2`"
506
+ ]
507
+ },
508
+ {
509
+ "cell_type": "code",
510
+ "execution_count": null,
511
+ "id": "3e7d40b4",
512
+ "metadata": {},
513
+ "outputs": [],
514
+ "source": [
515
+ "texts = {}\n",
516
+ "texts['Happy'] = \"We are happy to invite you to join us on a journey to the past, where we will visit the most amazing monuments ever built by human hands.\"\n",
517
+ "texts['Sad'] = \"I am sorry to say that we have suffered a severe setback in our efforts to restore prosperity and confidence.\"\n",
518
+ "texts['Angry'] = \"The field of astronomy is a joke! Its theories are based on flawed observations and biased interpretations!\"\n",
519
+ "texts['Surprised'] = \"I can't believe it! You mean to tell me that you have discovered a new species of bacteria in this pond?\"\n",
520
+ "\n",
521
+ "for k,v in texts.items():\n",
522
+ " noise = torch.randn(1,1,256).to(device)\n",
523
+ " wav = inference(v, ref_s, diffusion_steps=10, alpha=0.3, beta=0.7, embedding_scale=2)\n",
524
+ " print(k + \": \")\n",
525
+ " display(ipd.Audio(wav, rate=24000, normalize=False))"
526
+ ]
527
+ },
528
+ {
529
+ "cell_type": "markdown",
530
+ "id": "402b2bd6",
531
+ "metadata": {},
532
+ "source": [
533
+ "#### With `embedding_scale=2, alpha = 0.5, beta = 0.9`\n",
534
+ "`alpha` and `beta` is the factor to determine much we use the style sampled based on the text instead of the reference. The higher the value of `alpha` and `beta`, the more suitable the style it is to the text but less similar to the reference. Using higher beta makes the synthesized speech more emotional, at the cost of lower similarity to the reference. `alpha` determines the timbre of the speaker while `beta` determines the prosody. "
535
+ ]
536
+ },
537
+ {
538
+ "cell_type": "code",
539
+ "execution_count": null,
540
+ "id": "599de5d5",
541
+ "metadata": {},
542
+ "outputs": [],
543
+ "source": [
544
+ "texts = {}\n",
545
+ "texts['Happy'] = \"We are happy to invite you to join us on a journey to the past, where we will visit the most amazing monuments ever built by human hands.\"\n",
546
+ "texts['Sad'] = \"I am sorry to say that we have suffered a severe setback in our efforts to restore prosperity and confidence.\"\n",
547
+ "texts['Angry'] = \"The field of astronomy is a joke! Its theories are based on flawed observations and biased interpretations!\"\n",
548
+ "texts['Surprised'] = \"I can't believe it! You mean to tell me that you have discovered a new species of bacteria in this pond?\"\n",
549
+ "\n",
550
+ "for k,v in texts.items():\n",
551
+ " noise = torch.randn(1,1,256).to(device)\n",
552
+ " wav = inference(v, ref_s, diffusion_steps=10, alpha=0.5, beta=0.9, embedding_scale=2)\n",
553
+ " print(k + \": \")\n",
554
+ " display(ipd.Audio(wav, rate=24000, normalize=False))"
555
+ ]
556
+ },
557
+ {
558
+ "cell_type": "markdown",
559
+ "id": "48548866",
560
+ "metadata": {},
561
+ "source": [
562
+ "### Zero-shot speaker adaptation\n",
563
+ "This section recreates the \"Acoustic Environment Maintenance\" and \"Speaker’s Emotion Maintenance\" demo in [Section 4](https://styletts2.github.io/#libri) of the demo page. You can compare the generated samples to popular zero-shot TTS models like Vall-E. Note that the model was trained only on LibriTTS, which is about 250 times fewer data compared to those used to trian Vall-E with similar or better effect for these maintainance. "
564
+ ]
565
+ },
566
+ {
567
+ "cell_type": "markdown",
568
+ "id": "23e81572",
569
+ "metadata": {},
570
+ "source": [
571
+ "#### Acoustic Environment Maintenance\n",
572
+ "\n",
573
+ "Since we want to maintain the acoustic environment in the speaker (timbre), we set `alpha = 0` to make the speaker as closer to the reference as possible while only changing the prosody according to the text. "
574
+ ]
575
+ },
576
+ {
577
+ "cell_type": "code",
578
+ "execution_count": null,
579
+ "id": "8087bccb",
580
+ "metadata": {},
581
+ "outputs": [],
582
+ "source": [
583
+ "reference_dicts = {}\n",
584
+ "# format: (path, text)\n",
585
+ "reference_dicts['3'] = (\"Demo/reference_audio/3.wav\", \"As friends thing I definitely I've got more male friends.\")\n",
586
+ "reference_dicts['4'] = (\"Demo/reference_audio/4.wav\", \"Everything is run by computer but you got to know how to think before you can do a computer.\")\n",
587
+ "reference_dicts['5'] = (\"Demo/reference_audio/5.wav\", \"Then out in LA you guys got a whole another ball game within California to worry about.\")"
588
+ ]
589
+ },
590
+ {
591
+ "cell_type": "code",
592
+ "execution_count": null,
593
+ "id": "1e99c200",
594
+ "metadata": {},
595
+ "outputs": [],
596
+ "source": [
597
+ "noise = torch.randn(1,1,256).to(device)\n",
598
+ "for k, v in reference_dicts.items():\n",
599
+ " path, text = v\n",
600
+ " ref_s = compute_style(path)\n",
601
+ " start = time.time()\n",
602
+ " wav = inference(text, ref_s, alpha=0.0, beta=0.5, diffusion_steps=5, embedding_scale=1)\n",
603
+ " rtf = (time.time() - start) / (len(wav) / 24000)\n",
604
+ " print(f\"RTF = {rtf:5f}\")\n",
605
+ " import IPython.display as ipd\n",
606
+ " print('Synthesized: ' + text)\n",
607
+ " display(ipd.Audio(wav, rate=24000, normalize=False))\n",
608
+ " print('Reference:')\n",
609
+ " display(ipd.Audio(path, rate=24000, normalize=False))"
610
+ ]
611
+ },
612
+ {
613
+ "cell_type": "markdown",
614
+ "id": "7d56505d",
615
+ "metadata": {},
616
+ "source": [
617
+ "#### Speaker’s Emotion Maintenance\n",
618
+ "\n",
619
+ "Since we want to maintain the emotion in the speaker (prosody), we set `beta = 0.1` to make the speaker as closer to the reference as possible while having some diversity thruogh the slight timbre change."
620
+ ]
621
+ },
622
+ {
623
+ "cell_type": "code",
624
+ "execution_count": null,
625
+ "id": "f90179e7",
626
+ "metadata": {},
627
+ "outputs": [],
628
+ "source": [
629
+ "reference_dicts = {}\n",
630
+ "# format: (path, text)\n",
631
+ "reference_dicts['Anger'] = (\"Demo/reference_audio/anger.wav\", \"We have to reduce the number of plastic bags.\")\n",
632
+ "reference_dicts['Sleepy'] = (\"Demo/reference_audio/sleepy.wav\", \"We have to reduce the number of plastic bags.\")\n",
633
+ "reference_dicts['Amused'] = (\"Demo/reference_audio/amused.wav\", \"We have to reduce the number of plastic bags.\")\n",
634
+ "reference_dicts['Disgusted'] = (\"Demo/reference_audio/disgusted.wav\", \"We have to reduce the number of plastic bags.\")"
635
+ ]
636
+ },
637
+ {
638
+ "cell_type": "code",
639
+ "execution_count": null,
640
+ "id": "2e6bdfed",
641
+ "metadata": {},
642
+ "outputs": [],
643
+ "source": [
644
+ "noise = torch.randn(1,1,256).to(device)\n",
645
+ "for k, v in reference_dicts.items():\n",
646
+ " path, text = v\n",
647
+ " ref_s = compute_style(path)\n",
648
+ " start = time.time()\n",
649
+ " wav = inference(text, ref_s, alpha=0.3, beta=0.1, diffusion_steps=10, embedding_scale=1)\n",
650
+ " rtf = (time.time() - start) / (len(wav) / 24000)\n",
651
+ " print(f\"RTF = {rtf:5f}\")\n",
652
+ " import IPython.display as ipd\n",
653
+ " print(k + ' Synthesized: ' + text)\n",
654
+ " display(ipd.Audio(wav, rate=24000, normalize=False))\n",
655
+ " print(k + ' Reference:')\n",
656
+ " display(ipd.Audio(path, rate=24000, normalize=False))"
657
+ ]
658
+ },
659
+ {
660
+ "cell_type": "markdown",
661
+ "id": "37ae3963",
662
+ "metadata": {},
663
+ "source": [
664
+ "### Longform Narration\n",
665
+ "\n",
666
+ "This section includes basic implementation of Algorithm 1 in the paper for consistent longform audio generation. The example passage is taken from [Section 5](https://styletts2.github.io/#long) of the demo page."
667
+ ]
668
+ },
669
+ {
670
+ "cell_type": "code",
671
+ "execution_count": null,
672
+ "id": "f12a716b",
673
+ "metadata": {},
674
+ "outputs": [],
675
+ "source": [
676
+ "passage = '''If the supply of fruit is greater than the family needs, it may be made a source of income by sending the fresh fruit to the market if there is one near enough, or by preserving, canning, and making jelly for sale. To make such an enterprise a success the fruit and work must be first class. There is magic in the word \"Homemade,\" when the product appeals to the eye and the palate; but many careless and incompetent people have found to their sorrow that this word has not magic enough to float inferior goods on the market. As a rule large canning and preserving establishments are clean and have the best appliances, and they employ chemists and skilled labor. The home product must be very good to compete with the attractive goods that are sent out from such establishments. Yet for first class home made products there is a market in all large cities. All first-class grocers have customers who purchase such goods.'''"
677
+ ]
678
+ },
679
+ {
680
+ "cell_type": "code",
681
+ "execution_count": null,
682
+ "id": "a1a38079",
683
+ "metadata": {},
684
+ "outputs": [],
685
+ "source": [
686
+ "def LFinference(text, s_prev, ref_s, alpha = 0.3, beta = 0.7, t = 0.7, diffusion_steps=5, embedding_scale=1):\n",
687
+ " text = text.strip()\n",
688
+ " ps = global_phonemizer.phonemize([text])\n",
689
+ " ps = word_tokenize(ps[0])\n",
690
+ " ps = ' '.join(ps)\n",
691
+ " ps = ps.replace('``', '\"')\n",
692
+ " ps = ps.replace(\"''\", '\"')\n",
693
+ "\n",
694
+ " tokens = textclenaer(ps)\n",
695
+ " tokens.insert(0, 0)\n",
696
+ " tokens = torch.LongTensor(tokens).to(device).unsqueeze(0)\n",
697
+ " \n",
698
+ " with torch.no_grad():\n",
699
+ " input_lengths = torch.LongTensor([tokens.shape[-1]]).to(device)\n",
700
+ " text_mask = length_to_mask(input_lengths).to(device)\n",
701
+ "\n",
702
+ " t_en = model.text_encoder(tokens, input_lengths, text_mask)\n",
703
+ " bert_dur = model.bert(tokens, attention_mask=(~text_mask).int())\n",
704
+ " d_en = model.bert_encoder(bert_dur).transpose(-1, -2) \n",
705
+ "\n",
706
+ " s_pred = sampler(noise = torch.randn((1, 256)).unsqueeze(1).to(device), \n",
707
+ " embedding=bert_dur,\n",
708
+ " embedding_scale=embedding_scale,\n",
709
+ " features=ref_s, # reference from the same speaker as the embedding\n",
710
+ " num_steps=diffusion_steps).squeeze(1)\n",
711
+ " \n",
712
+ " if s_prev is not None:\n",
713
+ " # convex combination of previous and current style\n",
714
+ " s_pred = t * s_prev + (1 - t) * s_pred\n",
715
+ " \n",
716
+ " s = s_pred[:, 128:]\n",
717
+ " ref = s_pred[:, :128]\n",
718
+ " \n",
719
+ " ref = alpha * ref + (1 - alpha) * ref_s[:, :128]\n",
720
+ " s = beta * s + (1 - beta) * ref_s[:, 128:]\n",
721
+ "\n",
722
+ " s_pred = torch.cat([ref, s], dim=-1)\n",
723
+ "\n",
724
+ " d = model.predictor.text_encoder(d_en, \n",
725
+ " s, input_lengths, text_mask)\n",
726
+ "\n",
727
+ " x, _ = model.predictor.lstm(d)\n",
728
+ " duration = model.predictor.duration_proj(x)\n",
729
+ "\n",
730
+ " duration = torch.sigmoid(duration).sum(axis=-1)\n",
731
+ " pred_dur = torch.round(duration.squeeze()).clamp(min=1)\n",
732
+ "\n",
733
+ "\n",
734
+ " pred_aln_trg = torch.zeros(input_lengths, int(pred_dur.sum().data))\n",
735
+ " c_frame = 0\n",
736
+ " for i in range(pred_aln_trg.size(0)):\n",
737
+ " pred_aln_trg[i, c_frame:c_frame + int(pred_dur[i].data)] = 1\n",
738
+ " c_frame += int(pred_dur[i].data)\n",
739
+ "\n",
740
+ " # encode prosody\n",
741
+ " en = (d.transpose(-1, -2) @ pred_aln_trg.unsqueeze(0).to(device))\n",
742
+ " if model_params.decoder.type == \"hifigan\":\n",
743
+ " asr_new = torch.zeros_like(en)\n",
744
+ " asr_new[:, :, 0] = en[:, :, 0]\n",
745
+ " asr_new[:, :, 1:] = en[:, :, 0:-1]\n",
746
+ " en = asr_new\n",
747
+ "\n",
748
+ " F0_pred, N_pred = model.predictor.F0Ntrain(en, s)\n",
749
+ "\n",
750
+ " asr = (t_en @ pred_aln_trg.unsqueeze(0).to(device))\n",
751
+ " if model_params.decoder.type == \"hifigan\":\n",
752
+ " asr_new = torch.zeros_like(asr)\n",
753
+ " asr_new[:, :, 0] = asr[:, :, 0]\n",
754
+ " asr_new[:, :, 1:] = asr[:, :, 0:-1]\n",
755
+ " asr = asr_new\n",
756
+ "\n",
757
+ " out = model.decoder(asr, \n",
758
+ " F0_pred, N_pred, ref.squeeze().unsqueeze(0))\n",
759
+ " \n",
760
+ " \n",
761
+ " return out.squeeze().cpu().numpy()[..., :-100], s_pred # weird pulse at the end of the model, need to be fixed later"
762
+ ]
763
+ },
764
+ {
765
+ "cell_type": "code",
766
+ "execution_count": null,
767
+ "id": "e9088f7a",
768
+ "metadata": {},
769
+ "outputs": [],
770
+ "source": [
771
+ "# unseen speaker\n",
772
+ "path = \"Demo/reference_audio/1221-135767-0014.wav\"\n",
773
+ "s_ref = compute_style(path)\n",
774
+ "sentences = passage.split('.') # simple split by comma\n",
775
+ "wavs = []\n",
776
+ "s_prev = None\n",
777
+ "for text in sentences:\n",
778
+ " if text.strip() == \"\": continue\n",
779
+ " text += '.' # add it back\n",
780
+ " \n",
781
+ " wav, s_prev = LFinference(text, \n",
782
+ " s_prev, \n",
783
+ " s_ref, \n",
784
+ " alpha = 0.3, \n",
785
+ " beta = 0.9, # make it more suitable for the text\n",
786
+ " t = 0.7, \n",
787
+ " diffusion_steps=10, embedding_scale=1.5)\n",
788
+ " wavs.append(wav)\n",
789
+ "print('Synthesized: ')\n",
790
+ "display(ipd.Audio(np.concatenate(wavs), rate=24000, normalize=False))\n",
791
+ "print('Reference: ')\n",
792
+ "display(ipd.Audio(path, rate=24000, normalize=False))"
793
+ ]
794
+ },
795
+ {
796
+ "cell_type": "markdown",
797
+ "id": "7517b657",
798
+ "metadata": {},
799
+ "source": [
800
+ "### Style Transfer\n",
801
+ "\n",
802
+ "The following section demostrates the style transfer capacity for unseen speakers in [Section 6](https://styletts2.github.io/#emo) of the demo page. For this, we set `alpha=0.5, beta = 0.9` for the most pronounced effects (mostly using the sampled style). "
803
+ ]
804
+ },
805
+ {
806
+ "cell_type": "code",
807
+ "execution_count": null,
808
+ "id": "ed95d0f7",
809
+ "metadata": {},
810
+ "outputs": [],
811
+ "source": [
812
+ "def STinference(text, ref_s, ref_text, alpha = 0.3, beta = 0.7, diffusion_steps=5, embedding_scale=1):\n",
813
+ " text = text.strip()\n",
814
+ " ps = global_phonemizer.phonemize([text])\n",
815
+ " ps = word_tokenize(ps[0])\n",
816
+ " ps = ' '.join(ps)\n",
817
+ "\n",
818
+ " tokens = textclenaer(ps)\n",
819
+ " tokens.insert(0, 0)\n",
820
+ " tokens = torch.LongTensor(tokens).to(device).unsqueeze(0)\n",
821
+ " \n",
822
+ " ref_text = ref_text.strip()\n",
823
+ " ps = global_phonemizer.phonemize([ref_text])\n",
824
+ " ps = word_tokenize(ps[0])\n",
825
+ " ps = ' '.join(ps)\n",
826
+ "\n",
827
+ " ref_tokens = textclenaer(ps)\n",
828
+ " ref_tokens.insert(0, 0)\n",
829
+ " ref_tokens = torch.LongTensor(ref_tokens).to(device).unsqueeze(0)\n",
830
+ " \n",
831
+ " \n",
832
+ " with torch.no_grad():\n",
833
+ " input_lengths = torch.LongTensor([tokens.shape[-1]]).to(device)\n",
834
+ " text_mask = length_to_mask(input_lengths).to(device)\n",
835
+ "\n",
836
+ " t_en = model.text_encoder(tokens, input_lengths, text_mask)\n",
837
+ " bert_dur = model.bert(tokens, attention_mask=(~text_mask).int())\n",
838
+ " d_en = model.bert_encoder(bert_dur).transpose(-1, -2) \n",
839
+ " \n",
840
+ " ref_input_lengths = torch.LongTensor([ref_tokens.shape[-1]]).to(device)\n",
841
+ " ref_text_mask = length_to_mask(ref_input_lengths).to(device)\n",
842
+ " ref_bert_dur = model.bert(ref_tokens, attention_mask=(~ref_text_mask).int())\n",
843
+ " s_pred = sampler(noise = torch.randn((1, 256)).unsqueeze(1).to(device), \n",
844
+ " embedding=bert_dur,\n",
845
+ " embedding_scale=embedding_scale,\n",
846
+ " features=ref_s, # reference from the same speaker as the embedding\n",
847
+ " num_steps=diffusion_steps).squeeze(1)\n",
848
+ "\n",
849
+ "\n",
850
+ " s = s_pred[:, 128:]\n",
851
+ " ref = s_pred[:, :128]\n",
852
+ "\n",
853
+ " ref = alpha * ref + (1 - alpha) * ref_s[:, :128]\n",
854
+ " s = beta * s + (1 - beta) * ref_s[:, 128:]\n",
855
+ "\n",
856
+ " d = model.predictor.text_encoder(d_en, \n",
857
+ " s, input_lengths, text_mask)\n",
858
+ "\n",
859
+ " x, _ = model.predictor.lstm(d)\n",
860
+ " duration = model.predictor.duration_proj(x)\n",
861
+ "\n",
862
+ " duration = torch.sigmoid(duration).sum(axis=-1)\n",
863
+ " pred_dur = torch.round(duration.squeeze()).clamp(min=1)\n",
864
+ "\n",
865
+ "\n",
866
+ " pred_aln_trg = torch.zeros(input_lengths, int(pred_dur.sum().data))\n",
867
+ " c_frame = 0\n",
868
+ " for i in range(pred_aln_trg.size(0)):\n",
869
+ " pred_aln_trg[i, c_frame:c_frame + int(pred_dur[i].data)] = 1\n",
870
+ " c_frame += int(pred_dur[i].data)\n",
871
+ "\n",
872
+ " # encode prosody\n",
873
+ " en = (d.transpose(-1, -2) @ pred_aln_trg.unsqueeze(0).to(device))\n",
874
+ " if model_params.decoder.type == \"hifigan\":\n",
875
+ " asr_new = torch.zeros_like(en)\n",
876
+ " asr_new[:, :, 0] = en[:, :, 0]\n",
877
+ " asr_new[:, :, 1:] = en[:, :, 0:-1]\n",
878
+ " en = asr_new\n",
879
+ "\n",
880
+ " F0_pred, N_pred = model.predictor.F0Ntrain(en, s)\n",
881
+ "\n",
882
+ " asr = (t_en @ pred_aln_trg.unsqueeze(0).to(device))\n",
883
+ " if model_params.decoder.type == \"hifigan\":\n",
884
+ " asr_new = torch.zeros_like(asr)\n",
885
+ " asr_new[:, :, 0] = asr[:, :, 0]\n",
886
+ " asr_new[:, :, 1:] = asr[:, :, 0:-1]\n",
887
+ " asr = asr_new\n",
888
+ "\n",
889
+ " out = model.decoder(asr, \n",
890
+ " F0_pred, N_pred, ref.squeeze().unsqueeze(0))\n",
891
+ " \n",
892
+ " \n",
893
+ " return out.squeeze().cpu().numpy()[..., :-50] # weird pulse at the end of the model, need to be fixed later"
894
+ ]
895
+ },
896
+ {
897
+ "cell_type": "code",
898
+ "execution_count": null,
899
+ "id": "ec3f0da4",
900
+ "metadata": {},
901
+ "outputs": [],
902
+ "source": [
903
+ "# reference texts to sample styles\n",
904
+ "\n",
905
+ "ref_texts = {}\n",
906
+ "ref_texts['Happy'] = \"We are happy to invite you to join us on a journey to the past, where we will visit the most amazing monuments ever built by human hands.\"\n",
907
+ "ref_texts['Sad'] = \"I am sorry to say that we have suffered a severe setback in our efforts to restore prosperity and confidence.\"\n",
908
+ "ref_texts['Angry'] = \"The field of astronomy is a joke! Its theories are based on flawed observations and biased interpretations!\"\n",
909
+ "ref_texts['Surprised'] = \"I can't believe it! You mean to tell me that you have discovered a new species of bacteria in this pond?\""
910
+ ]
911
+ },
912
+ {
913
+ "cell_type": "code",
914
+ "execution_count": null,
915
+ "id": "6d0a3825",
916
+ "metadata": {
917
+ "scrolled": false
918
+ },
919
+ "outputs": [],
920
+ "source": [
921
+ "path = \"Demo/reference_audio/1221-135767-0014.wav\"\n",
922
+ "s_ref = compute_style(path)\n",
923
+ "\n",
924
+ "text = \"Yea, his honourable worship is within, but he hath a godly minister or two with him, and likewise a leech.\"\n",
925
+ "for k,v in texts.items():\n",
926
+ " wav = STinference(text, s_ref, v, diffusion_steps=10, alpha=0.5, beta=0.9, embedding_scale=1.5)\n",
927
+ " print(k + \": \")\n",
928
+ " display(ipd.Audio(wav, rate=24000, normalize=False))"
929
+ ]
930
+ },
931
+ {
932
+ "cell_type": "markdown",
933
+ "id": "6750aed9",
934
+ "metadata": {},
935
+ "source": [
936
+ "### Speech diversity\n",
937
+ "\n",
938
+ "This section reproduces samples in [Section 7](https://styletts2.github.io/#var) of the demo page. \n",
939
+ "\n",
940
+ "`alpha` and `beta` determine the diversity of the synthesized speech. There are two extreme cases:\n",
941
+ "- If `alpha = 1` and `beta = 1`, the synthesized speech sounds the most dissimilar to the reference speaker, but it is also the most diverse (each time you synthesize a speech it will be totally different). \n",
942
+ "- If `alpha = 0` and `beta = 0`, the synthesized speech sounds the most siimlar to the reference speaker, but it is deterministic (i.e., the sampled style is not used for speech synthesis). \n"
943
+ ]
944
+ },
945
+ {
946
+ "cell_type": "markdown",
947
+ "id": "f6ae0aa5",
948
+ "metadata": {},
949
+ "source": [
950
+ "#### Default setting (`alpha = 0.3, beta=0.7`)\n",
951
+ "This setting uses 70% of the reference timbre and 30% of the reference prosody and use the diffusion model to sample them based on the text. "
952
+ ]
953
+ },
954
+ {
955
+ "cell_type": "code",
956
+ "execution_count": null,
957
+ "id": "36dc0148",
958
+ "metadata": {},
959
+ "outputs": [],
960
+ "source": [
961
+ "# unseen speaker\n",
962
+ "path = \"Demo/reference_audio/1221-135767-0014.wav\"\n",
963
+ "ref_s = compute_style(path)\n",
964
+ "\n",
965
+ "text = \"How much variation is there?\"\n",
966
+ "for _ in range(5):\n",
967
+ " wav = inference(text, ref_s, diffusion_steps=10, alpha=0.3, beta=0.7, embedding_scale=1)\n",
968
+ " display(ipd.Audio(wav, rate=24000, normalize=False))"
969
+ ]
970
+ },
971
+ {
972
+ "cell_type": "markdown",
973
+ "id": "bf9ef421",
974
+ "metadata": {},
975
+ "source": [
976
+ "#### Less diverse setting (`alpha = 0.1, beta=0.3`)\n",
977
+ "This setting uses 90% of the reference timbre and 70% of the reference prosody. This makes it more similar to the reference speaker at cost of less diverse samples. "
978
+ ]
979
+ },
980
+ {
981
+ "cell_type": "code",
982
+ "execution_count": null,
983
+ "id": "9ba406bd",
984
+ "metadata": {},
985
+ "outputs": [],
986
+ "source": [
987
+ "# unseen speaker\n",
988
+ "path = \"Demo/reference_audio/1221-135767-0014.wav\"\n",
989
+ "ref_s = compute_style(path)\n",
990
+ "\n",
991
+ "text = \"How much variation is there?\"\n",
992
+ "for _ in range(5):\n",
993
+ " wav = inference(text, ref_s, diffusion_steps=10, alpha=0.1, beta=0.3, embedding_scale=1)\n",
994
+ " display(ipd.Audio(wav, rate=24000, normalize=False))"
995
+ ]
996
+ },
997
+ {
998
+ "cell_type": "markdown",
999
+ "id": "a38fe464",
1000
+ "metadata": {},
1001
+ "source": [
1002
+ "#### More diverse setting (`alpha = 0.5, beta=0.95`)\n",
1003
+ "This setting uses 50% of the reference timbre and 5% of the reference prosody (so it uses 100% of the sampled prosody, which makes it more diverse), but this makes it more dissimilar to the reference speaker. "
1004
+ ]
1005
+ },
1006
+ {
1007
+ "cell_type": "code",
1008
+ "execution_count": null,
1009
+ "id": "5f25bf94",
1010
+ "metadata": {},
1011
+ "outputs": [],
1012
+ "source": [
1013
+ "# unseen speaker\n",
1014
+ "path = \"Demo/reference_audio/1221-135767-0014.wav\"\n",
1015
+ "ref_s = compute_style(path)\n",
1016
+ "\n",
1017
+ "text = \"How much variation is there?\"\n",
1018
+ "for _ in range(5):\n",
1019
+ " wav = inference(text, ref_s, diffusion_steps=10, alpha=0.5, beta=0.95, embedding_scale=1)\n",
1020
+ " display(ipd.Audio(wav, rate=24000, normalize=False))"
1021
+ ]
1022
+ },
1023
+ {
1024
+ "cell_type": "markdown",
1025
+ "id": "21c3a071",
1026
+ "metadata": {},
1027
+ "source": [
1028
+ "#### Extreme setting (`alpha = 1, beta=1`)\n",
1029
+ "This setting uses 0% of the reference timbre and prosody and use the diffusion model to sample the entire style. This makes the speaker very dissimilar to the reference speaker. "
1030
+ ]
1031
+ },
1032
+ {
1033
+ "cell_type": "code",
1034
+ "execution_count": null,
1035
+ "id": "fff8bab1",
1036
+ "metadata": {},
1037
+ "outputs": [],
1038
+ "source": [
1039
+ "# unseen speaker\n",
1040
+ "path = \"Demo/reference_audio/1221-135767-0014.wav\"\n",
1041
+ "ref_s = compute_style(path)\n",
1042
+ "\n",
1043
+ "text = \"How much variation is there?\"\n",
1044
+ "for _ in range(5):\n",
1045
+ " wav = inference(text, ref_s, diffusion_steps=10, alpha=1, beta=1, embedding_scale=1)\n",
1046
+ " display(ipd.Audio(wav, rate=24000, normalize=False))"
1047
+ ]
1048
+ },
1049
+ {
1050
+ "cell_type": "markdown",
1051
+ "id": "a8741e5a",
1052
+ "metadata": {},
1053
+ "source": [
1054
+ "#### No variation (`alpha = 0, beta=0`)\n",
1055
+ "This setting uses 0% of the reference timbre and prosody and use the diffusion model to sample the entire style. This makes the speaker very similar to the reference speaker, but there is no variation. "
1056
+ ]
1057
+ },
1058
+ {
1059
+ "cell_type": "code",
1060
+ "execution_count": null,
1061
+ "id": "e55dd281",
1062
+ "metadata": {},
1063
+ "outputs": [],
1064
+ "source": [
1065
+ "# unseen speaker\n",
1066
+ "path = \"Demo/reference_audio/1221-135767-0014.wav\"\n",
1067
+ "ref_s = compute_style(path)\n",
1068
+ "\n",
1069
+ "text = \"How much variation is there?\"\n",
1070
+ "for _ in range(5):\n",
1071
+ " wav = inference(text, ref_s, diffusion_steps=10, alpha=0, beta=0, embedding_scale=1)\n",
1072
+ " display(ipd.Audio(wav, rate=24000, normalize=False))"
1073
+ ]
1074
+ },
1075
+ {
1076
+ "cell_type": "markdown",
1077
+ "id": "d5e86423",
1078
+ "metadata": {},
1079
+ "source": [
1080
+ "### Extra fun!\n",
1081
+ "\n",
1082
+ "Here we clone some of the authors' voice of the StyleTTS 2 papers with a few seconds of the recording in the wild. None of the voices is in the dataset and all authors agreed to have their voices cloned here."
1083
+ ]
1084
+ },
1085
+ {
1086
+ "cell_type": "code",
1087
+ "execution_count": null,
1088
+ "id": "6f558314",
1089
+ "metadata": {},
1090
+ "outputs": [],
1091
+ "source": [
1092
+ "text = ''' StyleTTS 2 is a text to speech model that leverages style diffusion and adversarial training with large speech language models to achieve human level text to speech synthesis. '''"
1093
+ ]
1094
+ },
1095
+ {
1096
+ "cell_type": "code",
1097
+ "execution_count": null,
1098
+ "id": "caa5747c",
1099
+ "metadata": {},
1100
+ "outputs": [],
1101
+ "source": [
1102
+ "reference_dicts = {}\n",
1103
+ "reference_dicts['Yinghao'] = \"Demo/reference_audio/Yinghao.wav\"\n",
1104
+ "reference_dicts['Gavin'] = \"Demo/reference_audio/Gavin.wav\"\n",
1105
+ "reference_dicts['Vinay'] = \"Demo/reference_audio/Vinay.wav\"\n",
1106
+ "reference_dicts['Nima'] = \"Demo/reference_audio/Nima.wav\""
1107
+ ]
1108
+ },
1109
+ {
1110
+ "cell_type": "code",
1111
+ "execution_count": null,
1112
+ "id": "44a4cea1",
1113
+ "metadata": {
1114
+ "scrolled": false
1115
+ },
1116
+ "outputs": [],
1117
+ "source": [
1118
+ "start = time.time()\n",
1119
+ "noise = torch.randn(1,1,256).to(device)\n",
1120
+ "for k, path in reference_dicts.items():\n",
1121
+ " ref_s = compute_style(path)\n",
1122
+ " \n",
1123
+ " wav = inference(text, ref_s, alpha=0.1, beta=0.5, diffusion_steps=5, embedding_scale=1)\n",
1124
+ " rtf = (time.time() - start) / (len(wav) / 24000)\n",
1125
+ " print('Speaker: ' + k)\n",
1126
+ " import IPython.display as ipd\n",
1127
+ " print('Synthesized:')\n",
1128
+ " display(ipd.Audio(wav, rate=24000, normalize=False))\n",
1129
+ " print('Reference:')\n",
1130
+ " display(ipd.Audio(path, rate=24000, normalize=False))"
1131
+ ]
1132
+ }
1133
+ ],
1134
+ "metadata": {
1135
+ "kernelspec": {
1136
+ "display_name": "NLP",
1137
+ "language": "python",
1138
+ "name": "nlp"
1139
+ },
1140
+ "language_info": {
1141
+ "codemirror_mode": {
1142
+ "name": "ipython",
1143
+ "version": 3
1144
+ },
1145
+ "file_extension": ".py",
1146
+ "mimetype": "text/x-python",
1147
+ "name": "python",
1148
+ "nbconvert_exporter": "python",
1149
+ "pygments_lexer": "ipython3",
1150
+ "version": "3.9.7"
1151
+ }
1152
+ },
1153
+ "nbformat": 4,
1154
+ "nbformat_minor": 5
1155
+ }
Demo/Inference_LJSpeech.ipynb ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ce816595cf645bece49b912627fa5f317d26f72f429f1a67566f0d2149a63213
3
+ size 10573186
Demo/Inference_LibriTTS.ipynb ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:65da24dc5388d4af6e65b698794d7daa1a21d15b329905b32a72b0e5070a336c
3
+ size 36102818
Demo/reference_audio/1221-135767-0014.wav ADDED
Binary file (96 kB). View file
 
Demo/reference_audio/1789_142896_000022_000005.wav ADDED
Binary file (150 kB). View file
 
Demo/reference_audio/3.wav ADDED
Binary file (96 kB). View file
 
Demo/reference_audio/4.wav ADDED
Binary file (96 kB). View file
 
Demo/reference_audio/4077-13754-0000.wav ADDED
Binary file (96 kB). View file
 
Demo/reference_audio/5.wav ADDED
Binary file (96 kB). View file
 
Demo/reference_audio/5639-40744-0020.wav ADDED
Binary file (96 kB). View file
 
Demo/reference_audio/696_92939_000016_000006.wav ADDED
Binary file (145 kB). View file
 
Demo/reference_audio/908-157963-0027.wav ADDED
Binary file (96 kB). View file
 
Demo/reference_audio/Gavin.wav ADDED
Binary file (999 kB). View file
 
Demo/reference_audio/James.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ace7afc8b1a55978c22e10f862374465e654f70ba72cea894e32751fdb4be6ec
3
+ size 4663340
Demo/reference_audio/James1.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f3cd888091538093ba760e9f8ceaa9f4d88bd861ab8c4dea5db7e79e6bc893d0
3
+ size 1419308
Demo/reference_audio/James2.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a4fca55c26fea4b130566738a8632b935592f2aa29042f5d4e4ba0180eddcf96
3
+ size 1757284
Demo/reference_audio/Nima.wav ADDED
Binary file (758 kB). View file
 
Demo/reference_audio/Vinay.wav ADDED
Binary file (694 kB). View file
 
Demo/reference_audio/Yinghao.wav ADDED
Binary file (405 kB). View file
 
Demo/reference_audio/amused.wav ADDED
Binary file (96 kB). View file
 
Demo/reference_audio/anger.wav ADDED
Binary file (96 kB). View file
 
Demo/reference_audio/disgusted.wav ADDED
Binary file (96 kB). View file
 
Demo/reference_audio/sleepy.wav ADDED
Binary file (96 kB). View file
 
LICENSE ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MIT License
2
+
3
+ Copyright (c) 2023 Aaron (Yinghao) Li
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
Models/LJSpeech/config.yml ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {ASR_config: Utils/ASR/config.yml, ASR_path: Utils/ASR/epoch_00080.pth, F0_path: Utils/JDC/bst.t7,
2
+ PLBERT_dir: Utils/PLBERT/, batch_size: 16, data_params: {OOD_data: Data/OOD_texts.txt,
3
+ min_length: 50, root_path: /local/LJSpeech-1.1/wavs, train_data: Data/train_list.txt,
4
+ val_data: Data/val_list.txt}, device: cuda, epochs_1st: 200, epochs_2nd: 100,
5
+ first_stage_path: first_stage.pth, load_only_params: false, log_dir: Models/LJSpeech,
6
+ log_interval: 10, loss_params: {TMA_epoch: 50, diff_epoch: 20, joint_epoch: 50,
7
+ lambda_F0: 1.0, lambda_ce: 20.0, lambda_diff: 1.0, lambda_dur: 1.0, lambda_gen: 1.0,
8
+ lambda_mel: 5.0, lambda_mono: 1.0, lambda_norm: 1.0, lambda_s2s: 1.0, lambda_slm: 1.0,
9
+ lambda_sty: 1.0}, max_len: 400, model_params: {decoder: {gen_istft_hop_size: 5,
10
+ gen_istft_n_fft: 20, resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3,
11
+ 5]], resblock_kernel_sizes: [3, 7, 11], type: istftnet, upsample_initial_channel: 512,
12
+ upsample_kernel_sizes: [20, 12], upsample_rates: [10, 6]}, diffusion: {dist: {
13
+ estimate_sigma_data: true, mean: -3.0, sigma_data: 0.45731624995853165, std: 1.0},
14
+ embedding_mask_proba: 0.1, transformer: {head_features: 64, multiplier: 2, num_heads: 8,
15
+ num_layers: 3}}, dim_in: 64, dropout: 0.2, hidden_dim: 512, max_conv_dim: 512,
16
+ max_dur: 50, multispeaker: false, n_layer: 3, n_mels: 80, n_token: 178, slm: {
17
+ hidden: 768, initial_channel: 64, model: microsoft/wavlm-base-plus, nlayers: 13,
18
+ sr: 16000}, style_dim: 128}, optimizer_params: {bert_lr: 1.0e-05, ft_lr: 1.0e-05,
19
+ lr: 0.0001}, preprocess_params: {spect_params: {hop_length: 300, n_fft: 2048,
20
+ win_length: 1200}, sr: 24000}, pretrained_model:,
21
+ save_freq: 2, second_stage_load_pretrained: true, slmadv_params: {batch_percentage: 0.5,
22
+ iter: 10, max_len: 500, min_len: 400, scale: 0.01, sig: 1.5, thresh: 5}}
Models/LJSpeech/epoch_2nd_00100.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fead81b4ccee69f296efc5e4ead0973a51cb645ac24195c995977d4a9ee8d780
3
+ size 749716474
Models/LibriTTS/config.yml ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {ASR_config: Utils/ASR/config.yml, ASR_path: Utils/ASR/epoch_00080.pth, F0_path: Utils/JDC/bst.t7,
2
+ PLBERT_dir: Utils/PLBERT/, batch_size: 8, data_params: {OOD_data: Data/OOD_texts.txt,
3
+ min_length: 50, root_path: '', train_data: Data/train_list.txt, val_data: Data/val_list.txt},
4
+ device: cuda, epochs_1st: 40, epochs_2nd: 25, first_stage_path: first_stage.pth,
5
+ load_only_params: false, log_dir: Models/LibriTTS, log_interval: 10, loss_params: {
6
+ TMA_epoch: 4, diff_epoch: 0, joint_epoch: 0, lambda_F0: 1.0, lambda_ce: 20.0,
7
+ lambda_diff: 1.0, lambda_dur: 1.0, lambda_gen: 1.0, lambda_mel: 5.0, lambda_mono: 1.0,
8
+ lambda_norm: 1.0, lambda_s2s: 1.0, lambda_slm: 1.0, lambda_sty: 1.0}, max_len: 300,
9
+ model_params: {decoder: {resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3,
10
+ 5]], resblock_kernel_sizes: [3, 7, 11], type: hifigan, upsample_initial_channel: 512,
11
+ upsample_kernel_sizes: [20, 10, 6, 4], upsample_rates: [10, 5, 3, 2]}, diffusion: {
12
+ dist: {estimate_sigma_data: true, mean: -3.0, sigma_data: 0.19926648961191362,
13
+ std: 1.0}, embedding_mask_proba: 0.1, transformer: {head_features: 64, multiplier: 2,
14
+ num_heads: 8, num_layers: 3}}, dim_in: 64, dropout: 0.2, hidden_dim: 512,
15
+ max_conv_dim: 512, max_dur: 50, multispeaker: true, n_layer: 3, n_mels: 80, n_token: 178,
16
+ slm: {hidden: 768, initial_channel: 64, model: microsoft/wavlm-base-plus, nlayers: 13,
17
+ sr: 16000}, style_dim: 128}, optimizer_params: {bert_lr: 1.0e-05, ft_lr: 1.0e-05,
18
+ lr: 0.0001}, preprocess_params: {spect_params: {hop_length: 300, n_fft: 2048,
19
+ win_length: 1200}, sr: 24000}, pretrained_model: Models/LibriTTS/epoch_2nd_00002.pth,
20
+ save_freq: 1, second_stage_load_pretrained: true, slmadv_params: {batch_percentage: 0.5,
21
+ iter: 20, max_len: 500, min_len: 400, scale: 0.01, sig: 1.5, thresh: 5}}
Models/LibriTTS/epochs_2nd_00020.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1164ffe19a17449d2c722234cecaf2836b35a698fb8ffd42562d2663657dca0a
3
+ size 771390526
Modules/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+
Modules/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (162 Bytes). View file
 
Modules/__pycache__/discriminators.cpython-311.pyc ADDED
Binary file (12.2 kB). View file
 
Modules/__pycache__/hifigan.cpython-311.pyc ADDED
Binary file (30 kB). View file
 
Modules/__pycache__/istftnet.cpython-311.pyc ADDED
Binary file (34.4 kB). View file
 
Modules/__pycache__/utils.cpython-311.pyc ADDED
Binary file (1.17 kB). View file
 
Modules/diffusion/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+
Modules/diffusion/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (172 Bytes). View file