gradio support updated
Files changed:
- README.md (+39 −1)
- app.py (+25 −0)
- scripts/train.py (+6 −5)
- scripts/training_config.json (+2 −2)
- src/deep_voice_cloning/cloning/model.py (+5 −2)
README.md
CHANGED

````diff
@@ -28,4 +28,42 @@ python scripts/cloning_inference.py --model_path "/content/deep-voice-cloning/mo
 --output_path "scripts/output/do_the_things.wav"
 ```
 
-Resulting audio file will be saved as `output_path` file.
+The resulting audio file will be saved at `output_path`.
+
+# Docker
+
+To build the Docker image:
+
+```
+docker build -t deep-voice-cloning .
+```
+
+To pull the Docker image from Docker Hub:
+
+```
+docker pull konverner/deep-voice-cloning:latest
+```
+
+To run the image in a container:
+
+```
+docker run -it --entrypoint=/bin/bash konverner/deep-voice-cloning
+```
+
+For example, to run training in a container:
+
+```
+python scripts/train.py --audio_path scripts/input/hank.mp3 --output_dir models
+```
+
+For example, to run inference in a container:
+
+```
+python scripts/cloning_inference.py --model_path models/microsoft_speecht5_tts_hank --input_text "do the things, not because they are easy, but because they are hard" --output_path scripts/output/do_the_things.wav
+```
+
+# Notebook Examples
+
+An example of using the CLI for training and inference can be found in this [notebook](https://github.com/konverner/deep-voice-cloning/blob/main/notebooks/CLI_Example.ipynb).
+
````
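A quick way to sanity-check the resulting file from Python, using only the standard library (the path matches the inference example above; this sketch is not part of the commit):

```python
import wave

# Inspect the WAV written by cloning_inference.py
with wave.open("scripts/output/do_the_things.wav", "rb") as wav:
    print(f"channels={wav.getnchannels()}, "
          f"rate={wav.getframerate()} Hz, "
          f"duration={wav.getnframes() / wav.getframerate():.2f} s")
```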
app.py
ADDED

````diff
@@ -0,0 +1,25 @@
+import os
+from pathlib import Path
+
+import gradio as gr
+
+
+def greet(text, audio_file_path):
+    text = "%s" % text
+    audio_file_path = "%s" % audio_file_path
+    out_path = Path("scripts/output/audio.wav")
+    os.system(f'python scripts/train.py --audio_path {audio_file_path} '
+              f'--output_dir "models"')
+    os.system(f'python scripts/cloning_inference.py --model_path "models/microsoft_speecht5_tts_{Path(audio_file_path).stem}" '
+              f'--input_text "{text}" --output_path "{str(out_path)}"')
+    return out_path
+
+
+demo = gr.Interface(
+    fn=greet,
+    inputs=[gr.Textbox(label='What would you like the voice to say? (max. 2000 characters per request)'),
+            gr.Audio(type="filepath", source="upload", label='Upload a voice to clone (max. 50mb)')],
+    outputs="audio",
+    title="Deep Voice Cloning Tool"
+)
+demo.launch()
````
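One note on the new app.py: interpolating user-provided text into an `os.system` command string is fragile, since quotes in the text break the command and the call is shell-injectable. A minimal sketch of the same two calls using `subprocess.run` with argument lists instead (not part of the commit):

```python
import subprocess
from pathlib import Path

def greet(text: str, audio_file_path: str) -> str:
    out_path = Path("scripts/output/audio.wav")
    # Argument lists avoid shell quoting issues with arbitrary user text.
    subprocess.run(["python", "scripts/train.py",
                    "--audio_path", audio_file_path,
                    "--output_dir", "models"], check=True)
    model_dir = f"models/microsoft_speecht5_tts_{Path(audio_file_path).stem}"
    subprocess.run(["python", "scripts/cloning_inference.py",
                    "--model_path", model_dir,
                    "--input_text", text,
                    "--output_path", str(out_path)], check=True)
    return str(out_path)
```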
scripts/train.py
CHANGED

````diff
@@ -1,6 +1,7 @@
 import argparse
 import json
 import os
+from pathlib import Path
 
 import torch
 from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer
@@ -24,9 +25,9 @@ if __name__ == "__main__":
     if args.lang is not None:
         training_config['lang'] = args.lang
     if args.audio_path is not None:
-        training_config['audio_path'] = args.audio_path
+        training_config['audio_path'] = Path(args.audio_path)
     if args.output_dir is not None:
-        training_config['output_dir'] = args.output_dir
+        training_config['output_dir'] = Path(args.output_dir)
 
     transcriber_model = TranscriberModel(lang=training_config['lang'])
     cloning_model = CloningModel(lang=training_config['lang'])
@@ -64,6 +65,6 @@ if __name__ == "__main__":
     )
 
     trainer.train()
-    cloning_model.save_pretrained(training_config["output_dir"] +
-                                  cloning_model.config['model_path'].replace('/', '_') +
-                                  '_' + training_config['audio_path'])
+    cloning_model.save_pretrained(Path(training_config["output_dir"]) /
+                                  (cloning_model.config['model_path'].replace('/', '_') +
+                                   '_' + Path(training_config['audio_path']).stem))
````
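The parenthesization in the last hunk matters: pathlib's `/` operator joins a `Path` with a string, but `+` is not defined between `Path` and `str`, so the filename pieces have to be concatenated as strings before the join. A short illustration (values taken from the training example above):

```python
from pathlib import Path

stem = Path("scripts/input/hank.mp3").stem         # "hank"
name = "microsoft/speecht5_tts".replace("/", "_")  # "microsoft_speecht5_tts"

# Join the directory first, then the concatenated filename.
save_dir = Path("models") / (name + "_" + stem)
print(save_dir)  # models/microsoft_speecht5_tts_hank

# (Path("models") / name) + "_" + stem would raise TypeError instead:
# Path objects do not support the + operator.
```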
scripts/training_config.json
CHANGED

````diff
@@ -4,6 +4,6 @@
   "lang": "en",
   "batch_size": 2,
   "learning_rate": 1e-4,
-  "max_steps":
-  "warmup_steps":
+  "max_steps": 15,
+  "warmup_steps": 2
 }
````
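These values are the defaults that scripts/train.py loads and then overrides with CLI flags, as the hunk above shows. A minimal sketch of that load-then-override pattern (a hypothetical standalone version; the flag shown here is illustrative):

```python
import argparse
import json

parser = argparse.ArgumentParser()
parser.add_argument("--max_steps", type=int, default=None)  # illustrative flag
args = parser.parse_args()

with open("scripts/training_config.json") as f:
    training_config = json.load(f)

# CLI arguments take precedence over the JSON defaults,
# mirroring the pattern in scripts/train.py.
if args.max_steps is not None:
    training_config["max_steps"] = args.max_steps
```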
src/deep_voice_cloning/cloning/model.py
CHANGED

````diff
@@ -1,6 +1,7 @@
 import os
 import json
 from typing import Dict
+from pathlib import Path
 
 import numpy as np
 import torch
@@ -17,7 +18,7 @@ class CloningModel:
             self.config = json.load(f)[lang]
         else:
             self.config = config
-        self.speaker_embedding = torch.load(self.config['model_path'])
+        self.speaker_embedding = torch.load(Path(self.config['model_path']) / "speaker_embedding.pt")[0]
         self.processor = SpeechT5Processor.from_pretrained(self.config['model_path'])
         self.model = SpeechT5ForTextToSpeech.from_pretrained(self.config['model_path'])
         self.vocoder = SpeechT5HifiGan.from_pretrained(self.config['vocoder_name'])
@@ -25,6 +26,8 @@ class CloningModel:
         self.speaker_model = EncoderClassifier.from_hparams(source=self.config['speaker_model_name'])
         self.to(self.device)
 
+
+
     def to(self, device: torch.device):
         self.model = self.model.to(device)
         self.vocoder = self.vocoder.to(device)
@@ -32,7 +35,7 @@ class CloningModel:
     def save_pretrained(self, save_directory: str):
         self.model.save_pretrained(save_directory)
         self.processor.save_pretrained(save_directory)
-        torch.save(self.speaker_embedding, save_directory)
+        torch.save(self.speaker_embedding, Path(save_directory) / "speaker_embedding.pt")
 
     def forward(self, text: str) -> np.array:
         # tokenize text
````
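With these changes, save_pretrained and __init__ agree on a fixed speaker_embedding.pt filename inside the model directory. A minimal round-trip sketch of that contract (not part of the commit; the embedding shape is an assumption based on the x-vector speaker model):

```python
from pathlib import Path
import torch

save_directory = Path("models/microsoft_speecht5_tts_hank")
save_directory.mkdir(parents=True, exist_ok=True)

# Save side: same path arithmetic as CloningModel.save_pretrained.
speaker_embedding = torch.randn(1, 512)  # placeholder tensor; 512-dim x-vector assumed
torch.save(speaker_embedding, save_directory / "speaker_embedding.pt")

# Load side: same expression as CloningModel.__init__, where [0]
# drops the leading batch dimension.
embedding = torch.load(save_directory / "speaker_embedding.pt")[0]
print(embedding.shape)  # torch.Size([512])
```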