mygyasir konverner committed on
Commit 13c43fe (0 parents)

Duplicate from konverner/deep-voice-cloning


Co-authored-by: Konstantin Verner <konverner@users.noreply.huggingface.co>

Files changed (42)
  1. .gitignore +169 -0
  2. Dockerfile +4 -0
  3. LICENSE +21 -0
  4. README.md +10 -0
  5. app.py +29 -0
  6. build/lib/deep_voice_cloning/__init__.py +0 -0
  7. build/lib/deep_voice_cloning/cloning/__init__.py +0 -0
  8. build/lib/deep_voice_cloning/cloning/config.json +7 -0
  9. build/lib/deep_voice_cloning/cloning/model.py +57 -0
  10. build/lib/deep_voice_cloning/data/__init__.py +0 -0
  11. build/lib/deep_voice_cloning/data/collator.py +45 -0
  12. build/lib/deep_voice_cloning/data/dataset.py +63 -0
  13. build/lib/deep_voice_cloning/transcriber/__init__.py +0 -0
  14. build/lib/deep_voice_cloning/transcriber/config.json +7 -0
  15. build/lib/deep_voice_cloning/transcriber/model.py +22 -0
  16. models/.gitkeep +0 -0
  17. notebooks/.gitkeep +0 -0
  18. notebooks/CLI_Example.ipynb +0 -0
  19. pretrained_models/EncoderClassifier-e3dcc8e5060144ec1668cd02c05772cd/classifier.ckpt +1 -0
  20. pretrained_models/EncoderClassifier-e3dcc8e5060144ec1668cd02c05772cd/embedding_model.ckpt +1 -0
  21. pretrained_models/EncoderClassifier-e3dcc8e5060144ec1668cd02c05772cd/hyperparams.yaml +1 -0
  22. pretrained_models/EncoderClassifier-e3dcc8e5060144ec1668cd02c05772cd/label_encoder.ckpt +1 -0
  23. pretrained_models/EncoderClassifier-e3dcc8e5060144ec1668cd02c05772cd/mean_var_norm_emb.ckpt +1 -0
  24. requirements.txt +64 -0
  25. scripts/cloning_inference.py +30 -0
  26. scripts/inference_config.json +7 -0
  27. scripts/input/hank.mp3 +0 -0
  28. scripts/input/homer.mp3 +0 -0
  29. scripts/output/.gitkeep +0 -0
  30. scripts/train.py +71 -0
  31. scripts/training_config.json +9 -0
  32. setup.py +106 -0
  33. src/deep_voice_cloning/__init__.py +0 -0
  34. src/deep_voice_cloning/cloning/__init__.py +0 -0
  35. src/deep_voice_cloning/cloning/config.json +7 -0
  36. src/deep_voice_cloning/cloning/model.py +57 -0
  37. src/deep_voice_cloning/data/__init__.py +0 -0
  38. src/deep_voice_cloning/data/collator.py +45 -0
  39. src/deep_voice_cloning/data/dataset.py +63 -0
  40. src/deep_voice_cloning/transcriber/__init__.py +0 -0
  41. src/deep_voice_cloning/transcriber/config.json +7 -0
  42. src/deep_voice_cloning/transcriber/model.py +22 -0
.gitignore ADDED
@@ -0,0 +1,169 @@
+ # Initially taken from Github's Python gitignore file
+
+ # Byte-compiled / optimized / DLL files
+ __pycache__/
+ *.py[cod]
+ *$py.class
+
+ # C extensions
+ *.so
+
+ # tests and logs
+ tests/fixtures/cached_*_text.txt
+ logs/
+ lightning_logs/
+ lang_code_data/
+
+ # Distribution / packaging
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+ MANIFEST
+
+ # PyInstaller
+ # Usually these files are written by a python script from a template
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
+ *.manifest
+ *.spec
+
+ # Installer logs
+ pip-log.txt
+ pip-delete-this-directory.txt
+
+ # Unit test / coverage reports
+ htmlcov/
+ .tox/
+ .nox/
+ .coverage
+ .coverage.*
+ .cache
+ nosetests.xml
+ coverage.xml
+ *.cover
+ .hypothesis/
+ .pytest_cache/
+
+ # Translations
+ *.mo
+ *.pot
+
+ # Django stuff:
+ *.log
+ local_settings.py
+ db.sqlite3
+
+ # Flask stuff:
+ instance/
+ .webassets-cache
+
+ # Scrapy stuff:
+ .scrapy
+
+ # Sphinx documentation
+ docs/_build/
+
+ # PyBuilder
+ target/
+
+ # Jupyter Notebook
+ .ipynb_checkpoints
+
+ # IPython
+ profile_default/
+ ipython_config.py
+
+ # pyenv
+ .python-version
+
+ # celery beat schedule file
+ celerybeat-schedule
+
+ # SageMath parsed files
+ *.sage.py
+
+ # Environments
+ .env
+ .venv
+ env/
+ venv/
+ ENV/
+ env.bak/
+ venv.bak/
+
+ # Spyder project settings
+ .spyderproject
+ .spyproject
+
+ # Rope project settings
+ .ropeproject
+
+ # mkdocs documentation
+ /site
+
+ # mypy
+ .mypy_cache/
+ .dmypy.json
+ dmypy.json
+
+ # Pyre type checker
+ .pyre/
+
+ # vscode
+ .vs
+ .vscode
+
+ # Pycharm
+ .idea
+
+ # TF code
+ tensorflow_code
+
+ # Models
+ proc_data
+
+ # examples
+ runs
+ /runs_old
+ /wandb
+ /examples/runs
+ /examples/**/*.args
+ /examples/rag/sweep
+
+ # data
+ /data
+ serialization_dir
+
+ # emacs
+ *.*~
+ debug.env
+
+ # vim
+ .*.swp
+
+ #ctags
+ tags
+
+ # pre-commit
+ .pre-commit*
+
+ # .lock
+ *.lock
+
+ # DS_Store (MacOS)
+ .DS_Store
+
+ # ruff
+ .ruff_cache
Dockerfile ADDED
@@ -0,0 +1,4 @@
+ FROM python:3.9
+ MAINTAINER Konstantin Verner <konst.verner@gmail.com>
+ COPY . .
+ RUN pip install .
LICENSE ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2023 Konstantin Verner
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
README.md ADDED
@@ -0,0 +1,10 @@
+ ---
+ license: openrail
+ title: Deep Voice Cloning
+ sdk: gradio
+ emoji: 🌖
+ colorFrom: yellow
+ colorTo: purple
+ pinned: true
+ duplicated_from: konverner/deep-voice-cloning
+ ---
app.py ADDED
@@ -0,0 +1,29 @@
+ import os
+ from pathlib import Path
+
+ import gradio as gr
+
+
+ os.system('pip install .')
+
+ def greet(text, audio_file_path, progress=gr.Progress()):
+     text = "%s" % text
+     audio_file_path = "%s" % audio_file_path
+     out_path = Path("scripts/output/audio.wav")
+     progress(0.2, desc="Training voice embedding... (aprx 20 mins)")
+     os.system(f'python scripts/train.py --audio_path {audio_file_path}\
+               --output_dir "models"')
+     progress(0.9, desc="Generating voice...")
+     os.system(f'python scripts/cloning_inference.py --model_path "models/microsoft_speecht5_tts_{Path(audio_file_path).stem}"\
+               --input_text "{text}" --output_path "{str(out_path)}"')
+     return out_path
+
+
+ demo = gr.Interface(
+     fn=greet,
+     inputs=[gr.Textbox(label='What would you like the voice to say? (max. 2000 characters per request)'),
+             gr.Audio(type="filepath", source="upload", label='Upload a voice to clone (max. 50mb)')],
+     outputs="audio",
+     title="Deep Voice Cloning Tool"
+ )
+ demo.launch()
build/lib/deep_voice_cloning/__init__.py ADDED
File without changes
build/lib/deep_voice_cloning/cloning/__init__.py ADDED
File without changes
build/lib/deep_voice_cloning/cloning/config.json ADDED
@@ -0,0 +1,7 @@
+ {
+   "en": {
+     "model_path": "microsoft/speecht5_tts",
+     "vocoder_name": "microsoft/speecht5_hifigan",
+     "speaker_model_name": "speechbrain/spkrec-xvect-voxceleb"
+   }
+ }
build/lib/deep_voice_cloning/cloning/model.py ADDED
@@ -0,0 +1,57 @@
+ import os
+ import json
+ from typing import Dict
+ from pathlib import Path
+
+ import numpy as np
+ import torch
+ from speechbrain.pretrained import EncoderClassifier
+ from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
+
+
+ class CloningModel:
+     def __init__(self, config: Dict[str, Dict[str, str]] = None, lang: str = 'en'):
+         super(CloningModel, self).__init__()
+         if config is None:
+             self.speaker_embedding = None
+             with open(os.path.join(os.path.dirname(__file__), 'config.json')) as f:
+                 self.config = json.load(f)[lang]
+         else:
+             self.config = config
+             self.speaker_embedding = torch.load(Path(self.config['model_path']) / "speaker_embedding.pt")[0]
+         self.processor = SpeechT5Processor.from_pretrained(self.config['model_path'])
+         self.model = SpeechT5ForTextToSpeech.from_pretrained(self.config['model_path'])
+         self.vocoder = SpeechT5HifiGan.from_pretrained(self.config['vocoder_name'])
+         self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+         self.speaker_model = EncoderClassifier.from_hparams(source=self.config['speaker_model_name'])
+         self.to(self.device)
+
+
+
+     def to(self, device: torch.device):
+         self.model = self.model.to(device)
+         self.vocoder = self.vocoder.to(device)
+
+     def save_pretrained(self, save_directory: str):
+         self.model.save_pretrained(save_directory)
+         self.processor.save_pretrained(save_directory)
+         torch.save(self.speaker_embedding, Path(save_directory) / "speaker_embedding.pt")
+
+     def forward(self, text: str) -> np.array:
+         # tokenize text
+         inputs = self.processor(text=text, return_tensors="pt")
+         # generate spectrogram using backbone model
+         spectrogram = self.model.generate_speech(inputs["input_ids"].to(self.device),
+                                                  self.speaker_embedding.to(self.device))
+         # decode spectrogram into waveform using vocoder
+         with torch.no_grad():
+             waveform_array = self.vocoder(spectrogram).detach().cpu().numpy()
+         return waveform_array
+
+     def create_speaker_embedding(self, waveform: torch.tensor) -> torch.tensor:
+         with torch.no_grad():
+             speaker_embeddings = self.speaker_model.encode_batch(waveform)
+             speaker_embeddings = torch.nn.functional.normalize(speaker_embeddings, dim=2)
+             self.speaker_embedding = speaker_embeddings
+             speaker_embeddings = speaker_embeddings.squeeze()
+         return speaker_embeddings
build/lib/deep_voice_cloning/data/__init__.py ADDED
File without changes
build/lib/deep_voice_cloning/data/collator.py ADDED
@@ -0,0 +1,45 @@
+ import torch
+ from typing import Any, Dict, List, Union
+
+
+ class TTSDataCollatorWithPadding:
+
+     def __init__(self, model, processor):
+         self.model = model
+         self.processor = processor
+
+     def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
+         input_ids = [{"input_ids": feature["input_ids"]} for feature in features]
+         label_features = [{"input_values": feature["labels"]} for feature in features]
+         speaker_features = [feature["speaker_embeddings"] for feature in features]
+
+         # collate the inputs and targets into a batch
+         batch = self.processor.pad(
+             input_ids=input_ids,
+             labels=label_features,
+             return_tensors="pt",
+         )
+
+         # replace padding with -100 to ignore loss correctly
+         batch["labels"] = batch["labels"].masked_fill(
+             batch.decoder_attention_mask.unsqueeze(-1).ne(1), -100
+         )
+
+         # not used during fine-tuning
+         del batch["decoder_attention_mask"]
+
+         # round down target lengths to multiple of reduction factor
+         if self.model.config.reduction_factor > 1:
+             target_lengths = torch.tensor([
+                 len(feature["input_values"]) for feature in label_features
+             ])
+             target_lengths = target_lengths.new([
+                 length - length % self.model.config.reduction_factor for length in target_lengths
+             ])
+             max_length = max(target_lengths)
+             batch["labels"] = batch["labels"][:, :max_length]
+
+         # add the speaker embeddings
+         batch["speaker_embeddings"] = torch.tensor(speaker_features)
+
+         return batch
build/lib/deep_voice_cloning/data/dataset.py ADDED
@@ -0,0 +1,63 @@
+ from typing import Dict, Any
+
+ import torch
+ import librosa
+ import numpy as np
+ from datasets import Dataset
+
+ from ..cloning.model import CloningModel
+ from ..transcriber.model import TranscriberModel
+
+
+ def prepare_dataset(example: Dict[str, Any], model: CloningModel) -> Dict[str, Any]:
+     """
+     Prepare a single example for training
+     """
+     # feature extraction and tokenization
+     processed_example = model.processor(
+         text=example["normalized_text"],
+         audio_target=example["audio"]["array"],
+         sampling_rate=16000,
+         return_attention_mask=False,
+     )
+
+     # strip off the batch dimension
+     if len(torch.tensor(processed_example['input_ids']).shape) > 1:
+         processed_example['input_ids'] = processed_example['input_ids'][0]
+
+     processed_example["labels"] = processed_example["labels"][0]
+
+     # use SpeechBrain to obtain x-vector
+     processed_example["speaker_embeddings"] = model.create_speaker_embedding(
+         torch.tensor(example["audio"]["array"])
+     ).numpy()
+
+     return processed_example
+
+
+ def get_cloning_dataset(input_audio_path: str,
+                         transcriber_model: TranscriberModel,
+                         cloning_model: CloningModel,
+                         sampling_rate: int = 16000,
+                         window_size_secs: int = 5) -> Dataset:
+     """
+     Create dataset by transcribing an audio file using a pretrained Wav2Vec2 model.
+     """
+     speech_array, _ = librosa.load(input_audio_path, sr=sampling_rate)
+
+     # split a waveform into splits of 5 secs each
+     speech_arrays = np.split(speech_array, range(0, len(speech_array), window_size_secs * sampling_rate))[1:]
+     texts = [transcriber_model.forward(speech_array, sampling_rate=sampling_rate)
+              for speech_array in speech_arrays]
+
+     dataset = Dataset.from_list([
+         {'audio': {'array': speech_arrays[i]}, 'normalized_text': texts[i]}
+         for i in range(len(speech_arrays))]
+     )
+
+     dataset = dataset.map(
+         prepare_dataset, fn_kwargs={'model': cloning_model},
+         remove_columns=dataset.column_names,
+     )
+
+     return dataset
build/lib/deep_voice_cloning/transcriber/__init__.py ADDED
File without changes
build/lib/deep_voice_cloning/transcriber/config.json ADDED
@@ -0,0 +1,7 @@
+ {
+   "language_model_names": {
+     "en": "jonatasgrosman/wav2vec2-large-xlsr-53-english",
+     "fr": "jonatasgrosman/wav2vec2-large-xlsr-53-french",
+     "de": "jonatasgrosman/wav2vec2-large-xlsr-53-german"
+   }
+ }
build/lib/deep_voice_cloning/transcriber/model.py ADDED
@@ -0,0 +1,22 @@
+ import os
+ import json
+
+ import numpy as np
+ import torch
+ from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
+
+
+ class TranscriberModel:
+     def __init__(self, lang: str = 'en'):
+         with open(os.path.join(os.path.dirname(__file__), 'config.json')) as f:
+             config = json.load(f)
+         self.processor = Wav2Vec2Processor.from_pretrained(config['language_model_names'][lang])
+         self.model = Wav2Vec2ForCTC.from_pretrained(config['language_model_names'][lang])
+         self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+     def forward(self, speech_array: np.array, sampling_rate: int = 16000) -> str:
+         model_input = self.processor(speech_array, sampling_rate=sampling_rate, return_tensors="pt", padding=True)
+         with torch.no_grad():
+             logits = self.model(model_input.input_values, attention_mask=model_input.attention_mask).logits
+         predicted_ids = torch.argmax(logits, dim=-1)
+         return self.processor.batch_decode(predicted_ids)
models/.gitkeep ADDED
File without changes
notebooks/.gitkeep ADDED
File without changes
notebooks/CLI_Example.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
pretrained_models/EncoderClassifier-e3dcc8e5060144ec1668cd02c05772cd/classifier.ckpt ADDED
@@ -0,0 +1 @@
+ C:/Users/konst/.cache/huggingface/hub/models--speechbrain--spkrec-xvect-voxceleb/snapshots/e2cc27f853f99bd5d539432f0cba3f124c059f71/classifier.ckpt
pretrained_models/EncoderClassifier-e3dcc8e5060144ec1668cd02c05772cd/embedding_model.ckpt ADDED
@@ -0,0 +1 @@
+ C:/Users/konst/.cache/huggingface/hub/models--speechbrain--spkrec-xvect-voxceleb/snapshots/e2cc27f853f99bd5d539432f0cba3f124c059f71/embedding_model.ckpt
pretrained_models/EncoderClassifier-e3dcc8e5060144ec1668cd02c05772cd/hyperparams.yaml ADDED
@@ -0,0 +1 @@
+ C:/Users/konst/.cache/huggingface/hub/models--speechbrain--spkrec-xvect-voxceleb/snapshots/e2cc27f853f99bd5d539432f0cba3f124c059f71/hyperparams.yaml
pretrained_models/EncoderClassifier-e3dcc8e5060144ec1668cd02c05772cd/label_encoder.ckpt ADDED
@@ -0,0 +1 @@
+ C:/Users/konst/.cache/huggingface/hub/models--speechbrain--spkrec-xvect-voxceleb/snapshots/e2cc27f853f99bd5d539432f0cba3f124c059f71/label_encoder.txt
pretrained_models/EncoderClassifier-e3dcc8e5060144ec1668cd02c05772cd/mean_var_norm_emb.ckpt ADDED
@@ -0,0 +1 @@
+ C:/Users/konst/.cache/huggingface/hub/models--speechbrain--spkrec-xvect-voxceleb/snapshots/e2cc27f853f99bd5d539432f0cba3f124c059f71/mean_var_norm_emb.ckpt
requirements.txt ADDED
@@ -0,0 +1,64 @@
+ accelerate==0.21.0
+ aiohttp==3.8.4
+ aiosignal==1.3.1
+ appdirs==1.4.4
+ async-timeout==4.0.2
+ attrs==23.1.0
+ audioread==3.0.0
+ certifi==2023.5.7
+ cffi==1.15.1
+ charset-normalizer==3.2.0
+ colorama==0.4.6
+ datasets==2.13.1
+ decorator>=4.0.2
+ dill==0.3.6
+ filelock==3.12.2
+ frozenlist==1.4.0
+ fsspec==2023.6.0
+ huggingface-hub==0.16.4
+ HyperPyYAML==1.2.1
+ idna==3.4
+ Jinja2==3.1.2
+ joblib==1.3.1
+ lazy_loader==0.3
+ librosa==0.10.0.post2
+ llvmlite==0.40.1
+ MarkupSafe==2.1.3
+ mpmath==1.3.0
+ msgpack==1.0.5
+ multidict==6.0.4
+ multiprocess==0.70.14
+ networkx==3.1
+ numba==0.57.1
+ numpy>=1.22
+ packaging==23.1
+ pandas>=1.5.3
+ pooch==1.6.0
+ psutil==5.9.5
+ pyarrow>=3.0.0
+ pycparser==2.21
+ python-dateutil==2.8.2
+ pytz==2023.3
+ PyYAML==6.0
+ ruamel.yaml==0.17.28
+ ruamel.yaml.clib==0.2.7
+ safetensors==0.3.1
+ scikit-learn==1.3.0
+ scipy==1.11.1
+ sentencepiece==0.1.99
+ six==1.16.0
+ soundfile==0.12.1
+ soxr==0.3.5
+ speechbrain==0.5.14
+ sympy==1.12
+ threadpoolctl==3.2.0
+ tokenizers==0.13.3
+ torch==2.0.1
+ torchaudio==2.0.2
+ tqdm==4.65.0
+ transformers==4.30.2
+ typing_extensions==4.7.1
+ tzdata==2023.3
+ urllib3==2.0.3
+ xxhash==3.2.0
+ yarl==1.9.2
scripts/cloning_inference.py ADDED
@@ -0,0 +1,30 @@
+ import argparse
+ import json
+ import os
+
+ import soundfile as sf
+
+ from deep_voice_cloning.cloning.model import CloningModel
+
+
+ if __name__ == "__main__":
+     parser = argparse.ArgumentParser()
+     parser.add_argument("--model_path", type=str, default=None, help="Path to model directory")
+     parser.add_argument("--input_text", type=str, default=None, help="Text to be synthesized")
+     parser.add_argument("--output_path", type=str, default=None, help="Path to output audio file")
+     args = parser.parse_args()
+
+     with open(os.path.join(os.path.dirname(__file__), "inference_config.json")) as f:
+         config = json.load(f)
+
+     if args.model_path is not None:
+         config['model_path'] = args.model_path
+     if args.input_text is not None:
+         config['input_text'] = args.input_text
+     if args.output_path is not None:
+         config['output_path'] = args.output_path
+
+     cloning_model = CloningModel(config)
+     waveform_array = cloning_model.forward(config["input_text"])
+
+     sf.write(config['output_path'], waveform_array, samplerate=16000)
scripts/inference_config.json ADDED
@@ -0,0 +1,7 @@
+ {
+   "model_path": "/content/deep-voice-cloning/models/microsoft_speecht5_tts_hank_hill",
+   "speaker_model_name": "speechbrain/spkrec-xvect-voxceleb",
+   "vocoder_name": "microsoft/speecht5_hifigan",
+   "input_text": "do the things, not because they are easy, but because they are hard",
+   "output_path": "/content/deep-voice-cloning/scripts/output/do_the_things.wav"
+ }
scripts/input/hank.mp3 ADDED
Binary file (526 kB).
 
scripts/input/homer.mp3 ADDED
Binary file (913 kB).
 
scripts/output/.gitkeep ADDED
File without changes
scripts/train.py ADDED
@@ -0,0 +1,71 @@
+ import argparse
+ import json
+ import os
+ from pathlib import Path
+
+ import torch
+ from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer
+
+ from deep_voice_cloning.cloning.model import CloningModel
+ from deep_voice_cloning.transcriber.model import TranscriberModel
+ from deep_voice_cloning.data.collator import TTSDataCollatorWithPadding
+ from deep_voice_cloning.data.dataset import get_cloning_dataset
+
+
+ if __name__ == "__main__":
+     parser = argparse.ArgumentParser()
+     parser.add_argument("--lang", type=str, default=None, help="Language of speech samples")
+     parser.add_argument("--audio_path", type=str, default=None, help="Path to training audio file")
+     parser.add_argument("--output_dir", type=str, default=None, help="Path to output directory for trained model")
+     args = parser.parse_args()
+
+     with open(os.path.join(os.path.dirname(__file__), "training_config.json")) as f:
+         training_config = json.load(f)
+
+     if args.lang is not None:
+         training_config['lang'] = args.lang
+     if args.audio_path is not None:
+         training_config['audio_path'] = Path(args.audio_path)
+     if args.output_dir is not None:
+         training_config['output_dir'] = Path(args.output_dir)
+
+     transcriber_model = TranscriberModel(lang=training_config['lang'])
+     cloning_model = CloningModel(lang=training_config['lang'])
+
+     dataset = get_cloning_dataset(training_config['audio_path'], transcriber_model, cloning_model)
+     data_collator = TTSDataCollatorWithPadding(processor=cloning_model.processor, model=cloning_model.model)
+
+     training_args = Seq2SeqTrainingArguments(
+         output_dir=training_config["output_dir"],
+         per_device_train_batch_size=training_config['batch_size'],
+         gradient_accumulation_steps=2,
+         overwrite_output_dir=True,
+         learning_rate=training_config['learning_rate'],
+         warmup_steps=training_config['warmup_steps'],
+         max_steps=training_config['max_steps'],
+         gradient_checkpointing=True,
+         fp16=transcriber_model.device == torch.device("cuda"),
+         evaluation_strategy="steps",
+         per_device_eval_batch_size=8,
+         save_strategy="no",
+         eval_steps=100,
+         logging_steps=20,
+         load_best_model_at_end=False,
+         greater_is_better=False,
+         label_names=["labels"],
+     )
+
+     trainer = Seq2SeqTrainer(
+         args=training_args,
+         model=cloning_model.model,
+         train_dataset=dataset,
+         eval_dataset=dataset,
+         data_collator=data_collator,
+         tokenizer=cloning_model.processor.tokenizer,
+     )
+
+     trainer.train()
+     cloning_model.save_pretrained(Path(training_config["output_dir"]) /
+                                   Path(cloning_model.config['model_path'].replace('/', '_')
+                                        + '_' + Path(training_config['audio_path']).stem)
+     )
scripts/training_config.json ADDED
@@ -0,0 +1,9 @@
+ {
+   "audio_path": "/content/deep-voice-cloning/scripts/input/hank_hill.mp3",
+   "output_dir": "/content/deep-voice-cloning/models",
+   "lang": "en",
+   "batch_size": 2,
+   "learning_rate": 1e-4,
+   "max_steps": 300,
+   "warmup_steps": 30
+ }
setup.py ADDED
@@ -0,0 +1,106 @@
+ from pathlib import Path
+
+ from setuptools import find_packages, setup
+
+ README_TEXT = (Path(__file__).parent / "README.md").read_text(encoding="utf-8")
+
+ MAINTAINER = "Konstantin Verner"
+ MAINTAINER_EMAIL = "konst.verner@gmail.com"
+ REQUIRED_PKGS = ["accelerate==0.21.0",
+                  "aiohttp==3.8.4",
+                  "aiosignal==1.3.1",
+                  "appdirs==1.4.4",
+                  "async-timeout==4.0.2",
+                  "attrs==23.1.0",
+                  "audioread==3.0.0",
+                  "certifi==2023.5.7",
+                  "cffi==1.15.1",
+                  "charset-normalizer==3.2.0",
+                  "colorama==0.4.6",
+                  "datasets==2.13.1",
+                  "decorator>=4.0.2",
+                  "dill==0.3.6",
+                  "filelock==3.12.2",
+                  "frozenlist==1.4.0",
+                  "fsspec==2023.6.0",
+                  "huggingface-hub==0.16.4",
+                  "HyperPyYAML==1.2.1",
+                  "idna==3.4",
+                  "Jinja2==3.1.2",
+                  "joblib==1.3.1",
+                  "lazy_loader==0.3",
+                  "librosa==0.10.0.post2",
+                  "llvmlite==0.40.1",
+                  "MarkupSafe==2.1.3",
+                  "mpmath==1.3.0",
+                  "msgpack==1.0.5",
+                  "multidict==6.0.4",
+                  "multiprocess==0.70.14",
+                  "networkx==3.1",
+                  "numba==0.57.1",
+                  "numpy>=1.22",
+                  "packaging==23.1",
+                  "pandas>=1.5.3",
+                  "pooch==1.6.0",
+                  "psutil==5.9.5",
+                  "pyarrow>=3.0.0",
+                  "pycparser==2.21",
+                  "python-dateutil==2.8.2",
+                  "pytz==2023.3",
+                  "PyYAML==6.0",
+                  "ruamel.yaml==0.17.28",
+                  "ruamel.yaml.clib==0.2.7",
+                  "safetensors==0.3.1",
+                  "scikit-learn==1.3.0",
+                  "scipy==1.11.1",
+                  "sentencepiece==0.1.99",
+                  "six==1.16.0",
+                  "soundfile==0.12.1",
+                  "soxr==0.3.5",
+                  "speechbrain==0.5.14",
+                  "sympy==1.12",
+                  "threadpoolctl==3.2.0",
+                  "tokenizers==0.13.3",
+                  "torch==2.0.1",
+                  "torchaudio==2.0.2",
+                  "tqdm==4.65.0",
+                  "transformers==4.30.2",
+                  "typing_extensions==4.7.1",
+                  "tzdata==2023.3",
+                  "urllib3==2.0.3",
+                  "xxhash==3.2.0",
+                  "yarl==1.9.2"]
+
+ print(find_packages("src"))
+
+ setup(
+     name="deep_voice_cloning",
+     version="0.1.0",
+     description="Few-Shot Voice Cloning",
+     long_description=README_TEXT,
+     long_description_content_type="text/markdown",
+     maintainer=MAINTAINER,
+     maintainer_email=MAINTAINER_EMAIL,
+     url="",
+     download_url="",
+     license="MIT",
+     package_dir={"": "src"},
+     packages=find_packages("src"),
+     include_package_data=True,
+     package_data={"": ["*.json"]},
+     install_requires=REQUIRED_PKGS,
+     classifiers=[
+         "Development Status :: 1 - Planning",
+         "Intended Audience :: Developers",
+         "Intended Audience :: Education",
+         "Intended Audience :: Science/Research",
+         "License :: OSI Approved :: MIT",
+         "Operating System :: OS Independent",
+         "Programming Language :: Python :: 3",
+         "Programming Language :: Python :: 3.8",
+         "Programming Language :: Python :: 3.9",
+         "Topic :: Scientific/Engineering :: Artificial Intelligence",
+     ],
+     keywords="asr, machine learning, fewshot learning, transformers",
+     zip_safe=False,  # Required for mypy to find the py.typed file
+ )
src/deep_voice_cloning/__init__.py ADDED
File without changes
src/deep_voice_cloning/cloning/__init__.py ADDED
File without changes
src/deep_voice_cloning/cloning/config.json ADDED
@@ -0,0 +1,7 @@
+ {
+   "en": {
+     "model_path": "microsoft/speecht5_tts",
+     "vocoder_name": "microsoft/speecht5_hifigan",
+     "speaker_model_name": "speechbrain/spkrec-xvect-voxceleb"
+   }
+ }
src/deep_voice_cloning/cloning/model.py ADDED
@@ -0,0 +1,57 @@
+ import os
+ import json
+ from typing import Dict
+ from pathlib import Path
+
+ import numpy as np
+ import torch
+ from speechbrain.pretrained import EncoderClassifier
+ from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
+
+
+ class CloningModel:
+     def __init__(self, config: Dict[str, Dict[str, str]] = None, lang: str = 'en'):
+         super(CloningModel, self).__init__()
+         if config is None:
+             self.speaker_embedding = None
+             with open(os.path.join(os.path.dirname(__file__), 'config.json')) as f:
+                 self.config = json.load(f)[lang]
+         else:
+             self.config = config
+             self.speaker_embedding = torch.load(Path(self.config['model_path']) / "speaker_embedding.pt")[0]
+         self.processor = SpeechT5Processor.from_pretrained(self.config['model_path'])
+         self.model = SpeechT5ForTextToSpeech.from_pretrained(self.config['model_path'])
+         self.vocoder = SpeechT5HifiGan.from_pretrained(self.config['vocoder_name'])
+         self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+         self.speaker_model = EncoderClassifier.from_hparams(source=self.config['speaker_model_name'])
+         self.to(self.device)
+
+
+
+     def to(self, device: torch.device):
+         self.model = self.model.to(device)
+         self.vocoder = self.vocoder.to(device)
+
+     def save_pretrained(self, save_directory: str):
+         self.model.save_pretrained(save_directory)
+         self.processor.save_pretrained(save_directory)
+         torch.save(self.speaker_embedding, Path(save_directory) / "speaker_embedding.pt")
+
+     def forward(self, text: str) -> np.array:
+         # tokenize text
+         inputs = self.processor(text=text, return_tensors="pt")
+         # generate spectrogram using backbone model
+         spectrogram = self.model.generate_speech(inputs["input_ids"].to(self.device),
+                                                  self.speaker_embedding.to(self.device))
+         # decode spectrogram into waveform using vocoder
+         with torch.no_grad():
+             waveform_array = self.vocoder(spectrogram).detach().cpu().numpy()
+         return waveform_array
+
+     def create_speaker_embedding(self, waveform: torch.tensor) -> torch.tensor:
+         with torch.no_grad():
+             speaker_embeddings = self.speaker_model.encode_batch(waveform)
+             speaker_embeddings = torch.nn.functional.normalize(speaker_embeddings, dim=2)
+             self.speaker_embedding = speaker_embeddings
+             speaker_embeddings = speaker_embeddings.squeeze()
+         return speaker_embeddings
src/deep_voice_cloning/data/__init__.py ADDED
File without changes
src/deep_voice_cloning/data/collator.py ADDED
@@ -0,0 +1,45 @@
+ import torch
+ from typing import Any, Dict, List, Union
+
+
+ class TTSDataCollatorWithPadding:
+
+     def __init__(self, model, processor):
+         self.model = model
+         self.processor = processor
+
+     def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
+         input_ids = [{"input_ids": feature["input_ids"]} for feature in features]
+         label_features = [{"input_values": feature["labels"]} for feature in features]
+         speaker_features = [feature["speaker_embeddings"] for feature in features]
+
+         # collate the inputs and targets into a batch
+         batch = self.processor.pad(
+             input_ids=input_ids,
+             labels=label_features,
+             return_tensors="pt",
+         )
+
+         # replace padding with -100 to ignore loss correctly
+         batch["labels"] = batch["labels"].masked_fill(
+             batch.decoder_attention_mask.unsqueeze(-1).ne(1), -100
+         )
+
+         # not used during fine-tuning
+         del batch["decoder_attention_mask"]
+
+         # round down target lengths to multiple of reduction factor
+         if self.model.config.reduction_factor > 1:
+             target_lengths = torch.tensor([
+                 len(feature["input_values"]) for feature in label_features
+             ])
+             target_lengths = target_lengths.new([
+                 length - length % self.model.config.reduction_factor for length in target_lengths
+             ])
+             max_length = max(target_lengths)
+             batch["labels"] = batch["labels"][:, :max_length]
+
+         # add the speaker embeddings
+         batch["speaker_embeddings"] = torch.tensor(speaker_features)
+
+         return batch
src/deep_voice_cloning/data/dataset.py ADDED
@@ -0,0 +1,63 @@
+ from typing import Dict, Any
+
+ import torch
+ import librosa
+ import numpy as np
+ from datasets import Dataset
+
+ from ..cloning.model import CloningModel
+ from ..transcriber.model import TranscriberModel
+
+
+ def prepare_dataset(example: Dict[str, Any], model: CloningModel) -> Dict[str, Any]:
+     """
+     Prepare a single example for training
+     """
+     # feature extraction and tokenization
+     processed_example = model.processor(
+         text=example["normalized_text"],
+         audio_target=example["audio"]["array"],
+         sampling_rate=16000,
+         return_attention_mask=False,
+     )
+
+     # strip off the batch dimension
+     if len(torch.tensor(processed_example['input_ids']).shape) > 1:
+         processed_example['input_ids'] = processed_example['input_ids'][0]
+
+     processed_example["labels"] = processed_example["labels"][0]
+
+     # use SpeechBrain to obtain x-vector
+     processed_example["speaker_embeddings"] = model.create_speaker_embedding(
+         torch.tensor(example["audio"]["array"])
+     ).numpy()
+
+     return processed_example
+
+
+ def get_cloning_dataset(input_audio_path: str,
+                         transcriber_model: TranscriberModel,
+                         cloning_model: CloningModel,
+                         sampling_rate: int = 16000,
+                         window_size_secs: int = 5) -> Dataset:
+     """
+     Create dataset by transcribing an audio file using a pretrained Wav2Vec2 model.
+     """
+     speech_array, _ = librosa.load(input_audio_path, sr=sampling_rate)
+
+     # split a waveform into splits of 5 secs each
+     speech_arrays = np.split(speech_array, range(0, len(speech_array), window_size_secs * sampling_rate))[1:]
+     texts = [transcriber_model.forward(speech_array, sampling_rate=sampling_rate)
+              for speech_array in speech_arrays]
+
+     dataset = Dataset.from_list([
+         {'audio': {'array': speech_arrays[i]}, 'normalized_text': texts[i]}
+         for i in range(len(speech_arrays))]
+     )
+
+     dataset = dataset.map(
+         prepare_dataset, fn_kwargs={'model': cloning_model},
+         remove_columns=dataset.column_names,
+     )
+
+     return dataset
src/deep_voice_cloning/transcriber/__init__.py ADDED
File without changes
src/deep_voice_cloning/transcriber/config.json ADDED
@@ -0,0 +1,7 @@
+ {
+   "language_model_names": {
+     "en": "jonatasgrosman/wav2vec2-large-xlsr-53-english",
+     "fr": "jonatasgrosman/wav2vec2-large-xlsr-53-french",
+     "de": "jonatasgrosman/wav2vec2-large-xlsr-53-german"
+   }
+ }
src/deep_voice_cloning/transcriber/model.py ADDED
@@ -0,0 +1,22 @@
+ import os
+ import json
+
+ import numpy as np
+ import torch
+ from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
+
+
+ class TranscriberModel:
+     def __init__(self, lang: str = 'en'):
+         with open(os.path.join(os.path.dirname(__file__), 'config.json')) as f:
+             config = json.load(f)
+         self.processor = Wav2Vec2Processor.from_pretrained(config['language_model_names'][lang])
+         self.model = Wav2Vec2ForCTC.from_pretrained(config['language_model_names'][lang])
+         self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+     def forward(self, speech_array: np.array, sampling_rate: int = 16000) -> str:
+         model_input = self.processor(speech_array, sampling_rate=sampling_rate, return_tensors="pt", padding=True)
+         with torch.no_grad():
+             logits = self.model(model_input.input_values, attention_mask=model_input.attention_mask).logits
+         predicted_ids = torch.argmax(logits, dim=-1)
+         return self.processor.batch_decode(predicted_ids)