Spaces:

internetsignal
/

Bark-w-voice-clone

Build error

App Files Files Community

internetsignal committed on May 23, 2023

Commit

1a1be03

•

0 Parent(s):

Duplicate from internetsignal/Bark-with-Voice-Cloning

Browse files

Files changed (12) hide show

.gitattributes +40 -0
Bark.wav +0 -0
README.md +14 -0
SE_checkpoint.pth.tar +3 -0
app.py +313 -0
best_model.pth.tar +3 -0
config.json +373 -0
config_se.json +119 -0
language_ids.json +5 -0
ref.wav +3 -0
requirements.txt +5 -0
speakers.json +0 -0

.gitattributes ADDED Viewed

	@@ -0,0 +1,40 @@

+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
+SE_checkpoint.pth.tar filter=lfs diff=lfs merge=lfs -text
+best_model.pth.tar filter=lfs diff=lfs merge=lfs -text
+nana_longest_vocal.wav filter=lfs diff=lfs merge=lfs -text
+test.wav filter=lfs diff=lfs merge=lfs -text
+reference.wav filter=lfs diff=lfs merge=lfs -text
+ref.wav filter=lfs diff=lfs merge=lfs -text

Bark.wav ADDED Viewed

Binary file (449 kB). View file

README.md ADDED Viewed

	@@ -0,0 +1,14 @@

+---
+title: Bark with Voice Cloning
+emoji: 📊
+colorFrom: purple
+colorTo: purple
+sdk: gradio
+sdk_version: 3.27.0
+app_file: app.py
+pinned: false
+license: mit
+duplicated_from: internetsignal/Bark-with-Voice-Cloning
+---
+Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

SE_checkpoint.pth.tar ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:8f96efb20cbeeefd81fd8336d7f0155bf8902f82f9474e58ccb19d9e12345172
+size 44610930

app.py ADDED Viewed

	@@ -0,0 +1,313 @@

+import os
+#os.system("pip install git+https://github.com/suno-ai/bark.git")
+from bark.generation import SUPPORTED_LANGS
+from bark import SAMPLE_RATE, generate_audio
+from scipy.io.wavfile import write as write_wav
+from datetime import datetime
+import shutil
+import gradio as gr
+import sys
+import string
+import time
+import argparse
+import json
+import numpy as np
+# import IPython
+# from IPython.display import Audio
+import torch
+from TTS.tts.utils.synthesis import synthesis
+from TTS.tts.utils.text.symbols import make_symbols, phonemes, symbols
+try:
+  from TTS.utils.audio import AudioProcessor
+except:
+  from TTS.utils.audio import AudioProcessor
+from TTS.tts.models import setup_model
+from TTS.config import load_config
+from TTS.tts.models.vits import *
+from TTS.tts.utils.speakers import SpeakerManager
+from pydub import AudioSegment
+# from google.colab import files
+import librosa
+from scipy.io.wavfile import write, read
+import subprocess
+'''
+from google.colab import drive
+drive.mount('/content/drive')
+src_path = os.path.join(os.path.join(os.path.join(os.path.join(os.getcwd(), 'drive'), 'MyDrive'), 'Colab Notebooks'), 'best_model_latest.pth.tar')
+dst_path = os.path.join(os.getcwd(), 'best_model.pth.tar')
+shutil.copy(src_path, dst_path)
+'''
+# ---------------------------------------------------------------------------
+# Module-level setup: builds the voice-conversion model and the speaker
+# encoder once at import time so the Gradio callbacks below can reuse them.
+# ---------------------------------------------------------------------------
+TTS_PATH = "TTS/"
+# add libraries into environment
+# NOTE(review): the TTS imports above have already executed by this point, so
+# this append cannot influence them; it only affects later imports.
+sys.path.append(TTS_PATH) # set this if TTS is not installed globally
+# Paths definition
+OUT_PATH = 'out/'
+# create output path
+os.makedirs(OUT_PATH, exist_ok=True)
+# model vars (all paths are relative to the current working directory)
+MODEL_PATH = 'best_model.pth.tar'
+CONFIG_PATH = 'config.json'
+TTS_LANGUAGES = "language_ids.json"
+TTS_SPEAKERS = "speakers.json"
+USE_CUDA = torch.cuda.is_available()
+# load the config
+C = load_config(CONFIG_PATH)
+# load the audio processor from the "audio" section of the config
+ap = AudioProcessor(**C.audio)
+speaker_embedding = None
+# Override two model_args values from config.json: point d_vector_file at the
+# local speakers.json, and turn use_speaker_encoder_as_loss off.
+C.model_args['d_vector_file'] = TTS_SPEAKERS
+C.model_args['use_speaker_encoder_as_loss'] = False
+model = setup_model(C)
+model.language_manager.set_language_ids_from_file(TTS_LANGUAGES)
+# print(model.language_manager.num_languages, model.embedded_language_dim)
+# print(model.emb_l)
+# Load the checkpoint onto the CPU first; the model is moved to CUDA below
+# when a GPU is available.
+cp = torch.load(MODEL_PATH, map_location=torch.device('cpu'))
+# remove speaker encoder: drop every checkpoint entry whose key contains
+# "speaker_encoder" before loading the state dict.
+# NOTE(review): presumably required because the model was built with
+# use_speaker_encoder_as_loss=False and so has no such parameters - confirm.
+model_weights = cp['model'].copy()
+for key in list(model_weights.keys()):
+  if "speaker_encoder" in key:
+    del model_weights[key]
+model.load_state_dict(model_weights)
+model.eval()
+if USE_CUDA:
+    model = model.cuda()
+# synthesize voice
+use_griffin_lim = False
+# Paths definition (speaker encoder config and checkpoint)
+CONFIG_SE_PATH = "config_se.json"
+CHECKPOINT_SE_PATH = "SE_checkpoint.pth.tar"
+# Load the Speaker encoder; voice_conversion uses it to compute d-vectors
+# from audio clips.
+SE_speaker_manager = SpeakerManager(encoder_model_path=CHECKPOINT_SE_PATH, encoder_config_path=CONFIG_SE_PATH, use_cuda=USE_CUDA)
+# Define helper function
+def compute_spec(ref_file):
+  y, sr = librosa.load(ref_file, sr=ap.sample_rate)
+  spec = ap.spectrogram(y)
+  spec = torch.FloatTensor(spec).unsqueeze(0)
+  return spec
+def voice_conversion(ta, ra, da):
+  target_audio = 'target.wav'
+  reference_audio = 'reference.wav'
+  driving_audio = 'driving.wav'
+  write(target_audio, ta[0], ta[1])
+  write(reference_audio, ra[0], ra[1])
+  write(driving_audio, da[0], da[1])
+  # !ffmpeg-normalize $target_audio -nt rms -t=-27 -o $target_audio -ar 16000 -f
+  # !ffmpeg-normalize $reference_audio -nt rms -t=-27 -o $reference_audio -ar 16000 -f
+  # !ffmpeg-normalize $driving_audio -nt rms -t=-27 -o $driving_audio -ar 16000 -f
+  files = [target_audio, reference_audio, driving_audio]
+  for file in files:
+      subprocess.run(["ffmpeg-normalize", file, "-nt", "rms", "-t=-27", "-o", file, "-ar", "16000", "-f"])
+  # ta_ = read(target_audio)
+  target_emb = SE_speaker_manager.compute_d_vector_from_clip([target_audio])
+  target_emb = torch.FloatTensor(target_emb).unsqueeze(0)
+  driving_emb = SE_speaker_manager.compute_d_vector_from_clip([reference_audio])
+  driving_emb = torch.FloatTensor(driving_emb).unsqueeze(0)
+  # Convert the voice
+  driving_spec = compute_spec(driving_audio)
+  y_lengths = torch.tensor([driving_spec.size(-1)])
+  if USE_CUDA:
+      ref_wav_voc, _, _ = model.voice_conversion(driving_spec.cuda(), y_lengths.cuda(), driving_emb.cuda(), target_emb.cuda())
+      ref_wav_voc = ref_wav_voc.squeeze().cpu().detach().numpy()
+  else:
+      ref_wav_voc, _, _ = model.voice_conversion(driving_spec, y_lengths, driving_emb, target_emb)
+      ref_wav_voc = ref_wav_voc.squeeze().detach().numpy()
+  # print("Reference Audio after decoder:")
+  # IPython.display.display(Audio(ref_wav_voc, rate=ap.sample_rate))
+  return (ap.sample_rate, ref_wav_voc)
+def generate_text_to_speech(text_prompt, selected_speaker, text_temp, waveform_temp):
+    audio_array = generate_audio(text_prompt, selected_speaker, text_temp, waveform_temp)
+    now = datetime.now()
+    date_str = now.strftime("%m-%d-%Y")
+    time_str = now.strftime("%H-%M-%S")
+    outputs_folder = os.path.join(os.getcwd(), "outputs")
+    if not os.path.exists(outputs_folder):
+        os.makedirs(outputs_folder)
+    sub_folder = os.path.join(outputs_folder, date_str)
+    if not os.path.exists(sub_folder):
+        os.makedirs(sub_folder)
+    file_name = f"audio_{time_str}.wav"
+    file_path = os.path.join(sub_folder, file_name)
+    write_wav(file_path, SAMPLE_RATE, audio_array)
+    return file_path
+speakers_list = []
+for lang, code in SUPPORTED_LANGS:
+    for n in range(10):
+        speakers_list.append(f"{code}_speaker_{n}")
+examples1 = [["ref.wav", "Bark.wav", "Bark.wav"]]
+# Two-stage Gradio UI: stage 1 generates speech from text with Bark,
+# stage 2 passes that speech and a user-supplied reference clip to
+# voice_conversion.
+with gr.Blocks() as demo:
+    # Header; the duplicate-Space link is built from the SPACE_ID
+    # environment variable.
+    gr.Markdown(
+            f""" # <center>🐶🎶🥳 - Bark with Voice Cloning</center>
+            ### <center>🤗 - Powered by [Bark](https://huggingface.co/spaces/suno/bark) and [YourTTS](https://github.com/Edresson/YourTTS). Inspired by [bark-webui](https://github.com/makawy7/bark-webui).</center>
+            1. You can duplicate and use it with a GPU: <a href="https://huggingface.co/spaces/{os.getenv('SPACE_ID')}?duplicate=true"><img style="display: inline; margin-top: 0em; margin-bottom: 0em" src="https://bit.ly/3gLdBN6" alt="Duplicate Space" /></a>
+            2. First use Bark to generate audio from text and then use YourTTS to get new audio in a custom voice you like. Easy to use!
+            3. For voice cloning, longer reference audio (~90s) will generally lead to better quality of the cloned speech. Also, please make sure the input audio generated by Bark is not too short.
+        """
+    )
+    # Stage 1 inputs: prompt text and the two temperatures.
+    with gr.Row().style(equal_height=True):
+        inp1 = gr.Textbox(label="Input Text", lines=4, placeholder="Enter text here...")
+        inp3 = gr.Slider(
+            0.1,
+            1.0,
+            value=0.7,
+            label="Generation Temperature",
+            info="1.0 more diverse, 0.1 more conservative",
+        )
+        inp4 = gr.Slider(
+            0.1, 1.0, value=0.7, label="Waveform Temperature", info="1.0 more diverse, 0.1 more conservative"
+        )
+    # Stage 1 speaker choice, trigger button and output player.
+    with gr.Row().style(equal_height=True):
+        inp2 = gr.Dropdown(speakers_list, value=speakers_list[1], label="Acoustic Prompt")
+        button = gr.Button("Generate using Bark")
+        out1 = gr.Audio(label="Generated Audio")
+    # The input order matches generate_text_to_speech's parameters:
+    # text, speaker, text temperature, waveform temperature.
+    button.click(generate_text_to_speech, [inp1, inp2, inp3, inp4], [out1])
+    # Stage 2: inp6 and inp7 are both aliases of out1, so the Bark output is
+    # passed to voice_conversion as both `ra` and `da`; inp5 is passed as `ta`.
+    with gr.Row().style(equal_height=True):
+        inp5 = gr.Audio(label="Upload Reference Audio for Voice Cloning Here")
+        inp6 = out1
+        inp7 = out1
+        btn = gr.Button("Generate using YourTTS")
+        out2 = gr.Audio(label="Generated Audio in a Custom Voice")
+    btn.click(voice_conversion, [inp5, inp6, inp7], [out2])
+    # Example row defined in examples1; cache_examples=True asks Gradio to
+    # precompute its output with voice_conversion.
+    gr.Examples(examples=examples1, fn=voice_conversion, inputs=[inp5, inp6, inp7],
+                outputs=[out2], cache_examples=True)
+    # Static usage notice.
+    gr.Markdown(
+            """ ### <center>NOTE: Please do not generate any audio that is potentially harmful to any person or organization❗</center>
+        """
+    )
+    # Static help text describing Bark prompting features.
+    gr.Markdown(
+            """
+### <center>😄 - You may also apply [VoiceFixer](https://huggingface.co/spaces/Kevin676/VoiceFixer) to the generated audio in order to enhance the speech.</center>
+## 🌎 Foreign Language
+Bark supports various languages out-of-the-box and automatically determines language from input text. \
+When prompted with code-switched text, Bark will even attempt to employ the native accent for the respective languages in the same voice.
+Try the prompt:
+```
+Buenos días Miguel. Tu colega piensa que tu alemán es extremadamente malo. But I suppose your english isn't terrible.
+```
+## 🤭 Non-Speech Sounds
+Below is a list of some known non-speech sounds, but we are finding more every day. \
+Please let us know if you find patterns that work particularly well on Discord!
+* [laughter]
+* [laughs]
+* [sighs]
+* [music]
+* [gasps]
+* [clears throat]
+* — or ... for hesitations
+* ♪ for song lyrics
+* capitalization for emphasis of a word
+* MAN/WOMAN: for bias towards speaker
+Try the prompt:
+```
+" [clears throat] Hello, my name is Suno. And, uh — and I like pizza. [laughs] But I also have other interests such as... ♪ singing ♪."
+```
+## 🎶 Music
+Bark can generate all types of audio, and, in principle, doesn't see a difference between speech and music. \
+Sometimes Bark chooses to generate text as music, but you can help it out by adding music notes around your lyrics.
+Try the prompt:
+```
+♪ In the jungle, the mighty jungle, the lion barks tonight ♪
+```
+## 🧬 Voice Cloning
+Bark has the capability to fully clone voices - including tone, pitch, emotion and prosody. \
+The model also attempts to preserve music, ambient noise, etc. from input audio. \
+However, to mitigate misuse of this technology, we limit the audio history prompts to a limited set of Suno-provided, fully synthetic options to choose from.
+## 👥 Speaker Prompts
+You can provide certain speaker prompts such as NARRATOR, MAN, WOMAN, etc. \
+Please note that these are not always respected, especially if a conflicting audio history prompt is given.
+Try the prompt:
+```
+WOMAN: I would like an oatmilk latte please.
+MAN: Wow, that's expensive!
+```
+## Details
+Bark model by [Suno](https://suno.ai/), including official [code](https://github.com/suno-ai/bark) and model weights. \
+Gradio demo supported by 🤗 Hugging Face. Bark is licensed under a non-commercial license: CC-BY 4.0 NC, see details on [GitHub](https://github.com/suno-ai/bark).
+        """
+    )
+    # Footer.
+    gr.HTML('''
+        <div class="footer">
+                    <p>🎶🖼️🎡 - It’s the intersection of technology and liberal arts that makes our hearts sing — Steve Jobs
+                    </p>
+        </div>
+    ''')
+# Enable request queuing and start the app; show_error=True is passed to
+# launch() so that errors raised in callbacks are shown in the UI.
+demo.queue().launch(show_error=True)

best_model.pth.tar ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:017bfd8907c80bb5857d65d0223f0e4e4b9d699ef52e2a853d9cc7eb7e308cf0
+size 379957289

config.json ADDED Viewed

	@@ -0,0 +1,373 @@

+{
+    "model": "vits",
+    "run_name": "vits_tts-portuguese",
+    "run_description": "",
+    "epochs": 1000,
+    "batch_size": 52,
+    "eval_batch_size": 52,
+    "mixed_precision": false,
+    "scheduler_after_epoch": true,
+    "run_eval": true,
+    "test_delay_epochs": -1,
+    "print_eval": true,
+    "dashboard_logger": "tensorboard",
+    "print_step": 25,
+    "plot_step": 100,
+    "model_param_stats": false,
+    "project_name": null,
+    "log_model_step": 10000,
+    "wandb_entity": null,
+    "save_step": 10000,
+    "checkpoint": true,
+    "keep_all_best": false,
+    "keep_after": 10000,
+    "num_loader_workers": 4,
+    "num_eval_loader_workers": 4,
+    "use_noise_augment": false,
+    "use_language_weighted_sampler": true,
+    "output_path": "../checkpoints/VITS-multilingual/VITS_fixes/new/new-SE/use_noise_aument_false/xlarge-ZS-PT-VCTK/pt-en+LibriTTS-fr/speaker_encoder_as_loss_9_alpha/mixed-p-false-bug-SDP-fixed/",
+    "distributed_backend": "nccl",
+    "distributed_url": "tcp://localhost:54321",
+    "audio": {
+        "fft_size": 1024,
+        "win_length": 1024,
+        "hop_length": 256,
+        "frame_shift_ms": null,
+        "frame_length_ms": null,
+        "stft_pad_mode": "reflect",
+        "sample_rate": 16000,
+        "resample": false,
+        "preemphasis": 0.0,
+        "ref_level_db": 20,
+        "do_sound_norm": false,
+        "log_func": "np.log",
+        "do_trim_silence": true,
+        "trim_db": 45,
+        "power": 1.5,
+        "griffin_lim_iters": 60,
+        "num_mels": 80,
+        "mel_fmin": 0.0,
+        "mel_fmax": null,
+        "spec_gain": 1,
+        "do_amp_to_db_linear": false,
+        "do_amp_to_db_mel": true,
+        "signal_norm": false,
+        "min_level_db": -100,
+        "symmetric_norm": true,
+        "max_norm": 4.0,
+        "clip_norm": true,
+        "stats_path": null
+    },
+    "use_phonemes": false,
+    "use_espeak_phonemes": false,
+    "phoneme_language": "pt-br",
+    "compute_input_seq_cache": false,
+    "text_cleaner": "multilingual_cleaners",
+    "enable_eos_bos_chars": false,
+    "test_sentences_file": "",
+    "phoneme_cache_path": null,
+    "characters": {
+        "pad": "_",
+        "eos": "&",
+        "bos": "*",
+        "characters": "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz\u00af\u00b7\u00df\u00e0\u00e1\u00e2\u00e3\u00e4\u00e6\u00e7\u00e8\u00e9\u00ea\u00eb\u00ec\u00ed\u00ee\u00ef\u00f1\u00f2\u00f3\u00f4\u00f5\u00f6\u00f9\u00fa\u00fb\u00fc\u00ff\u0101\u0105\u0107\u0113\u0119\u011b\u012b\u0131\u0142\u0144\u014d\u0151\u0153\u015b\u016b\u0171\u017a\u017c\u01ce\u01d0\u01d2\u01d4\u0430\u0431\u0432\u0433\u0434\u0435\u0436\u0437\u0438\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447\u0448\u0449\u044a\u044b\u044c\u044d\u044e\u044f\u0451\u0454\u0456\u0457\u0491\u2013!'(),-.:;? ",
+        "punctuations": "!'(),-.:;? ",
+        "phonemes": "iy\u0268\u0289\u026fu\u026a\u028f\u028ae\u00f8\u0258\u0259\u0275\u0264o\u025b\u0153\u025c\u025e\u028c\u0254\u00e6\u0250a\u0276\u0251\u0252\u1d7b\u0298\u0253\u01c0\u0257\u01c3\u0284\u01c2\u0260\u01c1\u029bpbtd\u0288\u0256c\u025fk\u0261q\u0262\u0294\u0274\u014b\u0272\u0273n\u0271m\u0299r\u0280\u2c71\u027e\u027d\u0278\u03b2fv\u03b8\u00f0sz\u0283\u0292\u0282\u0290\u00e7\u029dx\u0263\u03c7\u0281\u0127\u0295h\u0266\u026c\u026e\u028b\u0279\u027bj\u0270l\u026d\u028e\u029f\u02c8\u02cc\u02d0\u02d1\u028dw\u0265\u029c\u02a2\u02a1\u0255\u0291\u027a\u0267\u025a\u02de\u026b'\u0303' ",
+        "unique": true
+    },
+    "batch_group_size": 0,
+    "loss_masking": null,
+    "min_seq_len": 90,
+    "max_seq_len": 270,
+    "compute_f0": false,
+    "compute_linear_spec": true,
+    "add_blank": true,
+    "datasets": [
+        {
+            "name": "vctk",
+            "path": "../../datasets/VCTK-Corpus-removed-silence_16Khz/",
+            "meta_file_train": null,
+            "ununsed_speakers": [
+                "p225",
+                "p234",
+                "p238",
+                "p245",
+                "p248",
+                "p261",
+                "p294",
+                "p302",
+                "p326",
+                "p335",
+                "p347"
+            ],
+            "language": "en",
+            "meta_file_val": null,
+            "meta_file_attn_mask": ""
+        },
+        {
+            "name": "libri_tts",
+            "path": "../../datasets/LibriTTS/LibriTTS/dataset-preprocessed-clean-100-and-360/dataset-22k/",
+            "meta_file_train": "metadata_all.csv",
+            "ununsed_speakers": null,
+            "language": "en",
+            "meta_file_val": "dev-clean_500.csv",
+            "meta_file_attn_mask": ""
+        },
+        {
+            "name": "brspeech",
+            "path": "../../datasets/TTS-Portuguese-Corpus_16khz/",
+            "meta_file_train": "train_TTS-Portuguese_Corpus_metadata.csv",
+            "ununsed_speakers": null,
+            "language": "pt-br",
+            "meta_file_val": "eval_TTS-Portuguese_Corpus_metadata.csv",
+            "meta_file_attn_mask": ""
+        },
+        {
+            "name": "mailabs",
+            "path": "../../datasets/M-AILABS/fr_FR",
+            "meta_file_train": "",
+            "ununsed_speakers": null,
+            "language": "fr-fr",
+            "meta_file_val": null,
+            "meta_file_attn_mask": null
+        }
+    ],
+    "optimizer": "AdamW",
+    "optimizer_params": {
+        "betas": [
+            0.8,
+            0.99
+        ],
+        "eps": 1e-09,
+        "weight_decay": 0.01
+    },
+    "lr_scheduler": "",
+    "lr_scheduler_params": null,
+    "test_sentences": [
+        [
+            "It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.",
+            "VCTK_p225",
+            null,
+            "en"
+        ],
+        [
+            "It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.",
+            "ED",
+            null,
+            "en"
+        ],
+        [
+            "It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.",
+            "bernard",
+            null,
+            "en"
+        ],
+        [
+            "This cake is great. It's so delicious and moist.",
+            "VCTK_p234",
+            null,
+            "en"
+        ],
+        [
+            "This cake is great. It's so delicious and moist.",
+            "ED",
+            null,
+            "en"
+        ],
+        [
+            "This cake is great. It's so delicious and moist.",
+            "ezwa",
+            null,
+            "en"
+        ],
+        [
+            "Hoje \u00e9 fundamental encontrar a raz\u00e3o da exist\u00eancia humana.",
+            "ED",
+            null,
+            "pt-br"
+        ],
+        [
+            "Hoje \u00e9 fundamental encontrar a raz\u00e3o da exist\u00eancia humana.",
+            "VCTK_p238",
+            null,
+            "pt-br"
+        ],
+        [
+            "Hoje \u00e9 fundamental encontrar a raz\u00e3o da exist\u00eancia humana.",
+            "gilles_g_le_blanc",
+            null,
+            "pt-br"
+        ],
+        [
+            "Em muitas cidades a popula\u00e7\u00e3o est\u00e1 diminuindo.",
+            "ED",
+            null,
+            "pt-br"
+        ],
+        [
+            "Em muitas cidades a popula\u00e7\u00e3o est\u00e1 diminuindo.",
+            "VCTK_p245",
+            null,
+            "pt-br"
+        ],
+        [
+            "Em muitas cidades a popula\u00e7\u00e3o est\u00e1 diminuindo.",
+            "nadine_eckert_boulet",
+            null,
+            "pt-br"
+        ],
+        [
+            "Il m'a fallu beaucoup de temps pour d\u00e9velopper une voix, et maintenant que je l'ai, je ne vais pas me taire.",
+            "VCTK_p245",
+            null,
+            "fr-fr"
+        ],
+        [
+            "Il m'a fallu beaucoup de temps pour d\u00e9velopper une voix, et maintenant que je l'ai, je ne vais pas me taire.",
+            "ED",
+            null,
+            "fr-fr"
+        ],
+        [
+            "Il m'a fallu beaucoup de temps pour d\u00e9velopper une voix, et maintenant que je l'ai, je ne vais pas me taire.",
+            "ezwa",
+            null,
+            "fr-fr"
+        ],
+        [
+            "Il m'a fallu beaucoup de temps pour d\u00e9velopper une voix, et maintenant que je l'ai, je ne vais pas me taire.",
+            "bernard",
+            null,
+            "fr-fr"
+        ],
+        [
+            "Il m'a fallu beaucoup de temps pour d\u00e9velopper une voix, et maintenant que je l'ai, je ne vais pas me taire.",
+            "gilles_g_le_blanc",
+            null,
+            "fr-fr"
+        ],
+        [
+            "Il m'a fallu beaucoup de temps pour d\u00e9velopper une voix, et maintenant que je l'ai, je ne vais pas me taire.",
+            "nadine_eckert_boulet",
+            null,
+            "fr-fr"
+        ],
+        [
+            "Il m'a fallu beaucoup de temps pour d\u00e9velopper une voix, et maintenant que je l'ai, je ne vais pas me taire.",
+            "zeckou",
+            null,
+            "fr-fr"
+        ]
+    ],
+    "use_speaker_embedding": true,
+    "use_d_vector_file": true,
+    "d_vector_dim": 512,
+    "model_args": {
+        "num_chars": 165,
+        "out_channels": 513,
+        "spec_segment_size": 62,
+        "hidden_channels": 192,
+        "hidden_channels_ffn_text_encoder": 768,
+        "num_heads_text_encoder": 2,
+        "num_layers_text_encoder": 10,
+        "kernel_size_text_encoder": 3,
+        "dropout_p_text_encoder": 0.1,
+        "dropout_p_duration_predictor": 0.5,
+        "kernel_size_posterior_encoder": 5,
+        "dilation_rate_posterior_encoder": 1,
+        "num_layers_posterior_encoder": 16,
+        "kernel_size_flow": 5,
+        "dilation_rate_flow": 1,
+        "num_layers_flow": 4,
+        "resblock_type_decoder": 1,
+        "resblock_kernel_sizes_decoder": [
+            3,
+            7,
+            11
+        ],
+        "resblock_dilation_sizes_decoder": [
+            [
+                1,
+                3,
+                5
+            ],
+            [
+                1,
+                3,
+                5
+            ],
+            [
+                1,
+                3,
+                5
+            ]
+        ],
+        "upsample_rates_decoder": [
+            8,
+            8,
+            2,
+            2
+        ],
+        "upsample_initial_channel_decoder": 512,
+        "upsample_kernel_sizes_decoder": [
+            16,
+            16,
+            4,
+            4
+        ],
+        "use_sdp": true,
+        "noise_scale": 1.0,
+        "inference_noise_scale": 0.667,
+        "length_scale": 1,
+        "noise_scale_dp": 1.0,
+        "inference_noise_scale_dp": 0.8,
+        "max_inference_len": null,
+        "init_discriminator": true,
+        "use_spectral_norm_disriminator": false,
+        "use_speaker_embedding": true,
+        "num_speakers": 1244,
+        "speakers_file": null,
+        "d_vector_file": "../speaker_embeddings/new-SE/VCTK-LibriTTS+TTS-PT+MAILABS-FR/speakers.json",
+        "speaker_embedding_channels": 512,
+        "use_d_vector_file": true,
+        "d_vector_dim": 512,
+        "detach_dp_input": true,
+        "use_language_embedding": true,
+        "embedded_language_dim": 4,
+        "num_languages": 3,
+        "use_speaker_encoder_as_loss": true,
+        "speaker_encoder_config_path": "../checkpoints/Speaker_Encoder/Resnet-original-paper/config.json",
+        "speaker_encoder_model_path": "../checkpoints/Speaker_Encoder/Resnet-original-paper/converted_checkpoint.pth.tar",
+        "fine_tuning_mode": 0,
+        "freeze_encoder": false,
+        "freeze_DP": false,
+        "freeze_PE": false,
+        "freeze_flow_decoder": false,
+        "freeze_waveform_decoder": false
+    },
+    "grad_clip": [
+        5.0,
+        5.0
+    ],
+    "lr_gen": 0.0002,
+    "lr_disc": 0.0002,
+    "lr_scheduler_gen": "ExponentialLR",
+    "lr_scheduler_gen_params": {
+        "gamma": 0.999875,
+        "last_epoch": -1
+    },
+    "lr_scheduler_disc": "ExponentialLR",
+    "lr_scheduler_disc_params": {
+        "gamma": 0.999875,
+        "last_epoch": -1
+    },
+    "kl_loss_alpha": 1.0,
+    "disc_loss_alpha": 1.0,
+    "gen_loss_alpha": 1.0,
+    "feat_loss_alpha": 1.0,
+    "mel_loss_alpha": 45.0,
+    "dur_loss_alpha": 1.0,
+    "speaker_encoder_loss_alpha": 9.0,
+    "return_wav": true,
+    "r": 1
+}

config_se.json ADDED Viewed

	@@ -0,0 +1,119 @@

+{
+    "model": "speaker_encoder",
+    "run_name": "speaker_encoder",
+    "run_description": "resnet speaker encoder trained with commonvoice all languages dev and train, Voxceleb 1 dev and Voxceleb 2 dev",
+    "epochs": 100000,
+    "batch_size": null,
+    "eval_batch_size": null,
+    "mixed_precision": false,
+    "run_eval": true,
+    "test_delay_epochs": 0,
+    "print_eval": false,
+    "print_step": 50,
+    "tb_plot_step": 100,
+    "tb_model_param_stats": false,
+    "save_step": 1000,
+    "checkpoint": true,
+    "keep_all_best": false,
+    "keep_after": 10000,
+    "num_loader_workers": 8,
+    "num_val_loader_workers": 0,
+    "use_noise_augment": false,
+    "output_path": "../checkpoints/speaker_encoder/language_balanced/normalized/angleproto-4-samples-by-speakers/",
+    "distributed_backend": "nccl",
+    "distributed_url": "tcp://localhost:54321",
+    "audio": {
+        "fft_size": 512,
+        "win_length": 400,
+        "hop_length": 160,
+        "frame_shift_ms": null,
+        "frame_length_ms": null,
+        "stft_pad_mode": "reflect",
+        "sample_rate": 16000,
+        "resample": false,
+        "preemphasis": 0.97,
+        "ref_level_db": 20,
+        "do_sound_norm": false,
+        "do_trim_silence": false,
+        "trim_db": 60,
+        "power": 1.5,
+        "griffin_lim_iters": 60,
+        "num_mels": 64,
+        "mel_fmin": 0.0,
+        "mel_fmax": 8000.0,
+        "spec_gain": 20,
+        "signal_norm": false,
+        "min_level_db": -100,
+        "symmetric_norm": false,
+        "max_norm": 4.0,
+        "clip_norm": false,
+        "stats_path": null
+    },
+    "datasets": [
+        {
+            "name": "voxceleb2",
+            "path": "/workspace/scratch/ecasanova/datasets/VoxCeleb/vox2_dev_aac/",
+            "meta_file_train": null,
+            "ununsed_speakers": null,
+            "meta_file_val": null,
+            "meta_file_attn_mask": "",
+            "language": "voxceleb"
+        }
+    ],
+    "model_params": {
+        "model_name": "resnet",
+        "input_dim": 64,
+        "use_torch_spec": true,
+        "log_input": true,
+        "proj_dim": 512
+    },
+    "audio_augmentation": {
+        "p": 0.5,
+        "rir": {
+            "rir_path": "/workspace/store/ecasanova/ComParE/RIRS_NOISES/simulated_rirs/",
+            "conv_mode": "full"
+        },
+        "additive": {
+            "sounds_path": "/workspace/store/ecasanova/ComParE/musan/",
+            "speech": {
+                "min_snr_in_db": 13,
+                "max_snr_in_db": 20,
+                "min_num_noises": 1,
+                "max_num_noises": 1
+            },
+            "noise": {
+                "min_snr_in_db": 0,
+                "max_snr_in_db": 15,
+                "min_num_noises": 1,
+                "max_num_noises": 1
+            },
+            "music": {
+                "min_snr_in_db": 5,
+                "max_snr_in_db": 15,
+                "min_num_noises": 1,
+                "max_num_noises": 1
+            }
+        },
+        "gaussian": {
+            "p": 0.0,
+            "min_amplitude": 0.0,
+            "max_amplitude": 1e-05
+        }
+    },
+    "storage": {
+        "sample_from_storage_p": 0.5,
+        "storage_size": 40
+    },
+    "max_train_step": 1000000,
+    "loss": "angleproto",
+    "grad_clip": 3.0,
+    "lr": 0.0001,
+    "lr_decay": false,
+    "warmup_steps": 4000,
+    "wd": 1e-06,
+    "steps_plot_stats": 100,
+    "num_speakers_in_batch": 100,
+    "num_utters_per_speaker": 4,
+    "skip_speakers": true,
+    "voice_len": 2.0
+}

language_ids.json ADDED Viewed

	@@ -0,0 +1,5 @@

+{
+    "en": 0,
+    "fr-fr": 1,
+    "pt-br": 2
+}

ref.wav ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:056ae4d5aba4c623ff5815c7425d0431db80be276b0582432aa0aa8ded6f90f2
+size 15996512

requirements.txt ADDED Viewed

	@@ -0,0 +1,5 @@

+git+https://github.com/suno-ai/bark.git
+git+https://github.com/Edresson/Coqui-TTS@multilingual-torchaudio-SE
+torchaudio
+pydub
+ffmpeg-normalize==1.21.0

speakers.json ADDED Viewed

The diff for this file is too large to render. See raw diff