konverner committed
Commit 899cf32 (0 parents)

Initial commit
.gitignore ADDED
@@ -0,0 +1,169 @@
+ # Initially taken from Github's Python gitignore file
+
+ # Byte-compiled / optimized / DLL files
+ __pycache__/
+ *.py[cod]
+ *$py.class
+
+ # C extensions
+ *.so
+
+ # tests and logs
+ tests/fixtures/cached_*_text.txt
+ logs/
+ lightning_logs/
+ lang_code_data/
+
+ # Distribution / packaging
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+ MANIFEST
+
+ # PyInstaller
+ # Usually these files are written by a python script from a template
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
+ *.manifest
+ *.spec
+
+ # Installer logs
+ pip-log.txt
+ pip-delete-this-directory.txt
+
+ # Unit test / coverage reports
+ htmlcov/
+ .tox/
+ .nox/
+ .coverage
+ .coverage.*
+ .cache
+ nosetests.xml
+ coverage.xml
+ *.cover
+ .hypothesis/
+ .pytest_cache/
+
+ # Translations
+ *.mo
+ *.pot
+
+ # Django stuff:
+ *.log
+ local_settings.py
+ db.sqlite3
+
+ # Flask stuff:
+ instance/
+ .webassets-cache
+
+ # Scrapy stuff:
+ .scrapy
+
+ # Sphinx documentation
+ docs/_build/
+
+ # PyBuilder
+ target/
+
+ # Jupyter Notebook
+ .ipynb_checkpoints
+
+ # IPython
+ profile_default/
+ ipython_config.py
+
+ # pyenv
+ .python-version
+
+ # celery beat schedule file
+ celerybeat-schedule
+
+ # SageMath parsed files
+ *.sage.py
+
+ # Environments
+ .env
+ .venv
+ env/
+ venv/
+ ENV/
+ env.bak/
+ venv.bak/
+
+ # Spyder project settings
+ .spyderproject
+ .spyproject
+
+ # Rope project settings
+ .ropeproject
+
+ # mkdocs documentation
+ /site
+
+ # mypy
+ .mypy_cache/
+ .dmypy.json
+ dmypy.json
+
+ # Pyre type checker
+ .pyre/
+
+ # vscode
+ .vs
+ .vscode
+
+ # Pycharm
+ .idea
+
+ # TF code
+ tensorflow_code
+
+ # Models
+ proc_data
+
+ # examples
+ runs
+ /runs_old
+ /wandb
+ /examples/runs
+ /examples/**/*.args
+ /examples/rag/sweep
+
+ # data
+ /data
+ serialization_dir
+
+ # emacs
+ *.*~
+ debug.env
+
+ # vim
+ .*.swp
+
+ #ctags
+ tags
+
+ # pre-commit
+ .pre-commit*
+
+ # .lock
+ *.lock
+
+ # DS_Store (MacOS)
+ .DS_Store
+
+ # ruff
+ .ruff_cache
LICENSE ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2023 Konstantin Verner
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
README.md ADDED
@@ -0,0 +1,31 @@
+ # Few-Shot Voice Cloning
+
+ This repository implements a pipeline for few-shot voice cloning based on the SpeechT5 architecture introduced in [SpeechT5: Unified-Modal Encoder-Decoder Pre-Training for Spoken Language Processing](https://arxiv.org/abs/2110.07205).
+ It can clone a voice from 15-30 seconds of English audio (other languages are planned).
+
+ # Getting Started
+
+ Clone the repository:
+ ```bash
+ git clone https://github.com/konverner/deep-voice-cloning.git
+ ```
+
+ Install the package:
+ ```bash
+ pip install .
+ ```
+
+ Run training, specifying arguments either in the config file `training_config.json` or on the command line, for example:
+ ```bash
+ python scripts/train.py --audio_path scripts/input/hank.mp3 --output_dir /content/deep-voice-cloning/models
+ ```
+ The resulting model is saved in the `output_dir` directory and is used in the next step.
+
+ Run inference, specifying arguments either in the config file `inference_config.json` or on the command line, for example:
+ ```bash
+ python scripts/cloning_inference.py --model_path "/content/deep-voice-cloning/models/microsoft_speecht5_tts_hank"\
+ --input_text 'do the things, not because they are easy, but because they are hard'\
+ --output_path "scripts/output/do_the_things.wav"
+ ```
+
+ The resulting audio file is saved at `output_path`.
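For readers who prefer to call the package directly rather than go through `scripts/cloning_inference.py`, here is a minimal sketch of the same inference flow in Python. It assumes the package is installed and that `model_path` points to a directory produced by `scripts/train.py`; the path and output filename below are illustrative.

```python
import soundfile as sf

from deep_voice_cloning.cloning.model import CloningModel

# Illustrative config: same keys as scripts/inference_config.json,
# with model_path pointing at a directory saved by scripts/train.py
config = {
    "model_path": "/content/deep-voice-cloning/models/microsoft_speecht5_tts_hank",
    "speaker_model_name": "speechbrain/spkrec-xvect-voxceleb",
    "vocoder_name": "microsoft/speecht5_hifigan",
}

# Loads the fine-tuned SpeechT5 model, the HiFi-GAN vocoder and the saved speaker embedding
model = CloningModel(config)
waveform = model.forward("do the things, not because they are easy, but because they are hard")

# SpeechT5 and the vocoder operate at 16 kHz
sf.write("do_the_things.wav", waveform, samplerate=16000)
```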
models/.gitkeep ADDED
File without changes
notebooks/.gitkeep ADDED
File without changes
scripts/cloning_inference.py ADDED
@@ -0,0 +1,30 @@
+ import argparse
+ import json
+ import os
+
+ import soundfile as sf
+
+ from deep_voice_cloning.cloning.model import CloningModel
+
+
+ if __name__ == "__main__":
+     parser = argparse.ArgumentParser()
+     parser.add_argument("--model_path", type=str, default=None, help="Path to model directory")
+     parser.add_argument("--input_text", type=str, default=None, help="Text to be synthesized")
+     parser.add_argument("--output_path", type=str, default=None, help="Path to output audio file")
+     args = parser.parse_args()
+
+     with open(os.path.join(os.path.dirname(__file__), "inference_config.json")) as f:
+         config = json.load(f)
+
+     if args.model_path is not None:
+         config['model_path'] = args.model_path
+     if args.input_text is not None:
+         config['input_text'] = args.input_text
+     if args.output_path is not None:
+         config['output_path'] = args.output_path
+
+     cloning_model = CloningModel(config)
+     waveform_array = cloning_model.forward(config["input_text"])
+
+     sf.write(config['output_path'], waveform_array, samplerate=16000)
scripts/inference_config.json ADDED
@@ -0,0 +1,7 @@
+ {
+     "model_path": "/content/deep-voice-cloning/models/microsoft_speecht5_tts_hank_hill",
+     "speaker_model_name": "speechbrain/spkrec-xvect-voxceleb",
+     "vocoder_name": "microsoft/speecht5_hifigan",
+     "input_text": "do the things, not because they are easy, but because they are hard",
+     "output_path": "/content/deep-voice-cloning/scripts/output/do_the_things.wav"
+ }
scripts/input/hank.mp3 ADDED
Binary file (526 kB)
scripts/input/homer.mp3 ADDED
Binary file (913 kB)
scripts/output/.gitkeep ADDED
File without changes
scripts/train.py ADDED
@@ -0,0 +1,69 @@
+ import argparse
+ import json
+ import os
+
+ import torch
+ from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer
+
+ from deep_voice_cloning.cloning.model import CloningModel
+ from deep_voice_cloning.transcriber.model import TranscriberModel
+ from deep_voice_cloning.data.collator import TTSDataCollatorWithPadding
+ from deep_voice_cloning.data.dataset import get_cloning_dataset
+
+
+ if __name__ == "__main__":
+     parser = argparse.ArgumentParser()
+     parser.add_argument("--lang", type=str, default=None, help="Language of speech samples")
+     parser.add_argument("--audio_path", type=str, default=None, help="Path to training audio file")
+     parser.add_argument("--output_dir", type=str, default=None, help="Path to output directory for trained model")
+     args = parser.parse_args()
+
+     with open(os.path.join(os.path.dirname(__file__), "training_config.json")) as f:
+         training_config = json.load(f)
+
+     if args.lang is not None:
+         training_config['lang'] = args.lang
+     if args.audio_path is not None:
+         training_config['audio_path'] = args.audio_path
+     if args.output_dir is not None:
+         training_config['output_dir'] = args.output_dir
+
+     transcriber_model = TranscriberModel(lang=training_config['lang'])
+     cloning_model = CloningModel(lang=training_config['lang'])
+
+     dataset = get_cloning_dataset(training_config['audio_path'], transcriber_model, cloning_model)
+     data_collator = TTSDataCollatorWithPadding(processor=cloning_model.processor, model=cloning_model.model)
+
+     training_args = Seq2SeqTrainingArguments(
+         output_dir=training_config["output_dir"],
+         per_device_train_batch_size=training_config['batch_size'],
+         gradient_accumulation_steps=2,
+         overwrite_output_dir=True,
+         learning_rate=training_config['learning_rate'],
+         warmup_steps=training_config['warmup_steps'],
+         max_steps=training_config['max_steps'],
+         gradient_checkpointing=True,
+         fp16=transcriber_model.device == torch.device("cuda"),
+         evaluation_strategy="steps",
+         per_device_eval_batch_size=8,
+         save_strategy="no",
+         eval_steps=100,
+         logging_steps=20,
+         load_best_model_at_end=False,
+         greater_is_better=False,
+         label_names=["labels"],
+     )
+
+     trainer = Seq2SeqTrainer(
+         args=training_args,
+         model=cloning_model.model,
+         train_dataset=dataset,
+         eval_dataset=dataset,
+         data_collator=data_collator,
+         tokenizer=cloning_model.processor.tokenizer,
+     )
+
+     trainer.train()
+     cloning_model.save_pretrained(training_config["output_dir"] +
+                                   '/' + cloning_model.config['model_path'].replace('/', '_') +
+                                   '_' + training_config['audio_path'].split('/')[-1].split('.')[0])
scripts/training_config.json ADDED
@@ -0,0 +1,9 @@
+ {
+     "audio_path": "/content/deep-voice-cloning/scripts/input/hank_hill.mp3",
+     "output_dir": "/content/deep-voice-cloning/models",
+     "lang": "en",
+     "batch_size": 2,
+     "learning_rate": 1e-4,
+     "max_steps": 1500,
+     "warmup_steps": 250
+ }
setup.py ADDED
@@ -0,0 +1,106 @@
+ from pathlib import Path
+
+ from setuptools import find_packages, setup
+
+ README_TEXT = (Path(__file__).parent / "README.md").read_text(encoding="utf-8")
+
+ MAINTAINER = "Konstantin Verner"
+ MAINTAINER_EMAIL = "konst.verner@gmail.com"
+ REQUIRED_PKGS = ["accelerate==0.21.0",
+                  "aiohttp==3.8.4",
+                  "aiosignal==1.3.1",
+                  "appdirs==1.4.4",
+                  "async-timeout==4.0.2",
+                  "attrs==23.1.0",
+                  "audioread==3.0.0",
+                  "certifi==2023.5.7",
+                  "cffi==1.15.1",
+                  "charset-normalizer==3.2.0",
+                  "colorama==0.4.6",
+                  "datasets==2.13.1",
+                  "decorator>=4.0.2",
+                  "dill==0.3.6",
+                  "filelock==3.12.2",
+                  "frozenlist==1.4.0",
+                  "fsspec==2023.6.0",
+                  "huggingface-hub==0.16.4",
+                  "HyperPyYAML==1.2.1",
+                  "idna==3.4",
+                  "Jinja2==3.1.2",
+                  "joblib==1.3.1",
+                  "lazy_loader==0.3",
+                  "librosa==0.10.0.post2",
+                  "llvmlite==0.40.1",
+                  "MarkupSafe==2.1.3",
+                  "mpmath==1.3.0",
+                  "msgpack==1.0.5",
+                  "multidict==6.0.4",
+                  "multiprocess==0.70.14",
+                  "networkx==3.1",
+                  "numba==0.57.1",
+                  "numpy>=1.22",
+                  "packaging==23.1",
+                  "pandas>=1.5.3",
+                  "pooch==1.6.0",
+                  "psutil==5.9.5",
+                  "pyarrow>=3.0.0",
+                  "pycparser==2.21",
+                  "python-dateutil==2.8.2",
+                  "pytz==2023.3",
+                  "PyYAML==6.0",
+                  "ruamel.yaml==0.17.28",
+                  "ruamel.yaml.clib==0.2.7",
+                  "safetensors==0.3.1",
+                  "scikit-learn==1.3.0",
+                  "scipy==1.11.1",
+                  "sentencepiece==0.1.99",
+                  "six==1.16.0",
+                  "soundfile==0.12.1",
+                  "soxr==0.3.5",
+                  "speechbrain==0.5.14",
+                  "sympy==1.12",
+                  "threadpoolctl==3.2.0",
+                  "tokenizers==0.13.3",
+                  "torch==2.0.1",
+                  "torchaudio==2.0.2",
+                  "tqdm==4.65.0",
+                  "transformers==4.30.2",
+                  "typing_extensions==4.7.1",
+                  "tzdata==2023.3",
+                  "urllib3==2.0.3",
+                  "xxhash==3.2.0",
+                  "yarl==1.9.2"]
+
+ print(find_packages("src"))
+
+ setup(
+     name="deep_voice_cloning",
+     version="0.1.0",
+     description="Few-Shot Voice Cloning",
+     long_description=README_TEXT,
+     long_description_content_type="text/markdown",
+     maintainer=MAINTAINER,
+     maintainer_email=MAINTAINER_EMAIL,
+     url="",
+     download_url="",
+     license="MIT",
+     package_dir={"": "src"},
+     packages=find_packages("src"),
+     include_package_data=True,
+     package_data={"": ["*.json"]},
+     install_requires=REQUIRED_PKGS,
+     classifiers=[
+         "Development Status :: 1 - Planning",
+         "Intended Audience :: Developers",
+         "Intended Audience :: Education",
+         "Intended Audience :: Science/Research",
+         "License :: OSI Approved :: MIT License",
+         "Operating System :: OS Independent",
+         "Programming Language :: Python :: 3",
+         "Programming Language :: Python :: 3.8",
+         "Programming Language :: Python :: 3.9",
+         "Topic :: Scientific/Engineering :: Artificial Intelligence",
+     ],
+     keywords="asr, machine learning, few-shot learning, transformers",
+     zip_safe=False,  # Required for mypy to find the py.typed file
+ )
src/deep_voice_cloning/__init__.py ADDED
File without changes
src/deep_voice_cloning/cloning/__init__.py ADDED
File without changes
src/deep_voice_cloning/cloning/config.json ADDED
@@ -0,0 +1,7 @@
+ {
+     "en": {
+         "model_path": "microsoft/speecht5_tts",
+         "vocoder_name": "microsoft/speecht5_hifigan",
+         "speaker_model_name": "speechbrain/spkrec-xvect-voxceleb"
+     }
+ }
src/deep_voice_cloning/cloning/model.py ADDED
@@ -0,0 +1,54 @@
+ import os
+ import json
+ from typing import Dict
+
+ import numpy as np
+ import torch
+ from speechbrain.pretrained import EncoderClassifier
+ from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
+
+
+ class CloningModel:
+     def __init__(self, config: Dict[str, str] = None, lang: str = 'en'):
+         super(CloningModel, self).__init__()
+         if config is None:
+             self.speaker_embedding = None
+             with open(os.path.join(os.path.dirname(__file__), 'config.json')) as f:
+                 self.config = json.load(f)[lang]
+         else:
+             self.config = config
+             self.speaker_embedding = torch.load(self.config['model_path'] + "/speaker_embedding.pt")[0]
+         self.processor = SpeechT5Processor.from_pretrained(self.config['model_path'])
+         self.model = SpeechT5ForTextToSpeech.from_pretrained(self.config['model_path'])
+         self.vocoder = SpeechT5HifiGan.from_pretrained(self.config['vocoder_name'])
+         self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+         self.speaker_model = EncoderClassifier.from_hparams(source=self.config['speaker_model_name'])
+         self.to(self.device)
+
+     def to(self, device: torch.device):
+         self.model = self.model.to(device)
+         self.vocoder = self.vocoder.to(device)
+
+     def save_pretrained(self, save_directory: str):
+         self.model.save_pretrained(save_directory)
+         self.processor.save_pretrained(save_directory)
+         torch.save(self.speaker_embedding, save_directory + "/speaker_embedding.pt")
+
+     def forward(self, text: str) -> np.ndarray:
+         # tokenize text
+         inputs = self.processor(text=text, return_tensors="pt")
+         # generate spectrogram using backbone model
+         spectrogram = self.model.generate_speech(inputs["input_ids"].to(self.device),
+                                                  self.speaker_embedding.to(self.device))
+         # decode spectrogram into waveform using vocoder
+         with torch.no_grad():
+             waveform_array = self.vocoder(spectrogram).detach().cpu().numpy()
+         return waveform_array
+
+     def create_speaker_embedding(self, waveform: torch.Tensor) -> torch.Tensor:
+         with torch.no_grad():
+             speaker_embeddings = self.speaker_model.encode_batch(waveform)
+             speaker_embeddings = torch.nn.functional.normalize(speaker_embeddings, dim=2)
+             self.speaker_embedding = speaker_embeddings
+             speaker_embeddings = speaker_embeddings.squeeze()
+         return speaker_embeddings
src/deep_voice_cloning/data/__init__.py ADDED
File without changes
src/deep_voice_cloning/data/collator.py ADDED
@@ -0,0 +1,45 @@
+ import torch
+ from typing import Any, Dict, List, Union
+
+
+ class TTSDataCollatorWithPadding:
+
+     def __init__(self, model, processor):
+         self.model = model
+         self.processor = processor
+
+     def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
+         input_ids = [{"input_ids": feature["input_ids"]} for feature in features]
+         label_features = [{"input_values": feature["labels"]} for feature in features]
+         speaker_features = [feature["speaker_embeddings"] for feature in features]
+
+         # collate the inputs and targets into a batch
+         batch = self.processor.pad(
+             input_ids=input_ids,
+             labels=label_features,
+             return_tensors="pt",
+         )
+
+         # replace padding with -100 to ignore loss correctly
+         batch["labels"] = batch["labels"].masked_fill(
+             batch.decoder_attention_mask.unsqueeze(-1).ne(1), -100
+         )
+
+         # not used during fine-tuning
+         del batch["decoder_attention_mask"]
+
+         # round down target lengths to multiple of reduction factor
+         if self.model.config.reduction_factor > 1:
+             target_lengths = torch.tensor([
+                 len(feature["input_values"]) for feature in label_features
+             ])
+             target_lengths = target_lengths.new([
+                 length - length % self.model.config.reduction_factor for length in target_lengths
+             ])
+             max_length = max(target_lengths)
+             batch["labels"] = batch["labels"][:, :max_length]
+
+         # add the speaker embeddings
+         batch["speaker_embeddings"] = torch.tensor(speaker_features)
+
+         return batch
src/deep_voice_cloning/data/dataset.py ADDED
@@ -0,0 +1,63 @@
+ from typing import Dict, Any
+
+ import torch
+ import librosa
+ import numpy as np
+ from datasets import Dataset
+
+ from ..cloning.model import CloningModel
+ from ..transcriber.model import TranscriberModel
+
+
+ def prepare_dataset(example: Dict[str, Any], model: CloningModel) -> Dict[str, Any]:
+     """
+     Prepare a single example for training
+     """
+     # feature extraction and tokenization
+     processed_example = model.processor(
+         text=example["normalized_text"],
+         audio_target=example["audio"]["array"],
+         sampling_rate=16000,
+         return_attention_mask=False,
+     )
+
+     # strip off the batch dimension
+     if len(torch.tensor(processed_example['input_ids']).shape) > 1:
+         processed_example['input_ids'] = processed_example['input_ids'][0]
+
+     processed_example["labels"] = processed_example["labels"][0]
+
+     # use SpeechBrain to obtain x-vector
+     processed_example["speaker_embeddings"] = model.create_speaker_embedding(
+         torch.tensor(example["audio"]["array"])
+     ).numpy()
+
+     return processed_example
+
+
+ def get_cloning_dataset(input_audio_path: str,
+                         transcriber_model: TranscriberModel,
+                         cloning_model: CloningModel,
+                         sampling_rate: int = 16000,
+                         window_size_secs: int = 5) -> Dataset:
+     """
+     Create dataset by transcribing an audio file using a pretrained Wav2Vec2 model.
+     """
+     speech_array, _ = librosa.load(input_audio_path, sr=sampling_rate)
+
+     # split a waveform into splits of 5 secs each
+     speech_arrays = np.split(speech_array, range(0, len(speech_array), window_size_secs * sampling_rate))[1:]
+     texts = [transcriber_model.forward(speech_array, sampling_rate=sampling_rate)
+              for speech_array in speech_arrays]
+
+     dataset = Dataset.from_list([
+         {'audio': {'array': speech_arrays[i]}, 'normalized_text': texts[i]}
+         for i in range(len(speech_arrays))]
+     )
+
+     dataset = dataset.map(
+         prepare_dataset, fn_kwargs={'model': cloning_model},
+         remove_columns=dataset.column_names,
+     )
+
+     return dataset
src/deep_voice_cloning/transcriber/__init__.py ADDED
File without changes
src/deep_voice_cloning/transcriber/config.json ADDED
@@ -0,0 +1,7 @@
+ {
+     "language_model_names": {
+         "en": "jonatasgrosman/wav2vec2-large-xlsr-53-english",
+         "fr": "jonatasgrosman/wav2vec2-large-xlsr-53-french",
+         "de": "jonatasgrosman/wav2vec2-large-xlsr-53-german"
+     }
+ }
src/deep_voice_cloning/transcriber/model.py ADDED
@@ -0,0 +1,22 @@
+ import os
+ import json
+
+ import numpy as np
+ import torch
+ from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
+
+
+ class TranscriberModel:
+     def __init__(self, lang: str = 'en'):
+         with open(os.path.join(os.path.dirname(__file__), 'config.json')) as f:
+             config = json.load(f)
+         self.processor = Wav2Vec2Processor.from_pretrained(config['language_model_names'][lang])
+         self.model = Wav2Vec2ForCTC.from_pretrained(config['language_model_names'][lang])
+         self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+     def forward(self, speech_array: np.ndarray, sampling_rate: int = 16000) -> str:
+         model_input = self.processor(speech_array, sampling_rate=sampling_rate, return_tensors="pt", padding=True)
+         with torch.no_grad():
+             logits = self.model(model_input.input_values, attention_mask=model_input.attention_mask).logits
+         predicted_ids = torch.argmax(logits, dim=-1)
+         return self.processor.batch_decode(predicted_ids)[0]  # take the single transcription to match the return type