nateraw committed on
Commit 25ab393
Parent: b0324c8

Synced repo using 'sync_with_huggingface' Github Action

Files changed (2)
  1. app.py +58 -16
  2. training_so_vits_svc_fork.ipynb +540 -0
app.py CHANGED
@@ -14,12 +14,23 @@ from so_vits_svc_fork.hparams import HParams
 from so_vits_svc_fork.inference.core import Svc
 
 
-##########################################################
-# REPLACE THESE VALUES TO CHANGE THE MODEL REPO/CKPT NAME
-##########################################################
+###################################################################
+# REPLACE THESE VALUES TO CHANGE THE MODEL REPO/CKPT NAME/SETTINGS
+###################################################################
+# The Hugging Face Hub repo ID
 repo_id = "dog/kanye"
+# If None, Uses latest ckpt in the repo
 ckpt_name = None
-##########################################################
+# If None, Uses "kmeans.pt" if it exists in the repo
+cluster_model_name = None
+# Set the default f0 type to use - use the one it was trained on.
+# The default for so-vits-svc-fork is "dio".
+# Options: "crepe", "crepe-tiny", "parselmouth", "dio", "harvest"
+default_f0_method = "crepe"
+# The default ratio of cluster inference to SVC inference.
+# If cluster_model_name is not found in the repo, this is set to 0.
+default_cluster_infer_ratio = 0.5
+###################################################################
 
 # Figure out the latest generator by taking highest value one.
 # Ex. if the repo has: G_0.pth, G_100.pth, G_200.pth, we'd use G_200.pth
@@ -33,12 +44,21 @@ if ckpt_name is None:
     )[-1]
     ckpt_name = f"G_{latest_id}.pth"
 
+cluster_model_name = cluster_model_name or "kmeans.pt"
+if cluster_model_name in list_repo_files(repo_id):
+    print(f"Found Cluster model - Downloading {cluster_model_name} from {repo_id}")
+    cluster_model_path = hf_hub_download(repo_id, cluster_model_name)
+else:
+    print(f"Could not find {cluster_model_name} in {repo_id}. Using None")
+    cluster_model_path = None
+default_cluster_infer_ratio = default_cluster_infer_ratio if cluster_model_path else 0
+
 generator_path = hf_hub_download(repo_id, ckpt_name)
 config_path = hf_hub_download(repo_id, "config.json")
 hparams = HParams(**json.loads(Path(config_path).read_text()))
 speakers = list(hparams.spk.keys())
 device = "cuda" if torch.cuda.is_available() else "cpu"
-model = Svc(net_g_path=generator_path, config_path=config_path, device=device, cluster_model_path=None)
+model = Svc(net_g_path=generator_path, config_path=config_path, device=device, cluster_model_path=cluster_model_path)
 demucs_model = get_model(DEFAULT_MODEL)
 
 
@@ -133,7 +153,7 @@ def predict(
 
 
 def predict_song_from_yt(
-    ytid,
+    ytid_or_url,
     start,
     end,
     speaker=speakers[0],
@@ -147,7 +167,14 @@ def predict_song_from_yt(
     chunk_seconds: float = 0.5,
     absolute_thresh: bool = False,
 ):
-    original_track_filepath = download_youtube_clip(ytid, start, end, "track.wav", force=True)
+    original_track_filepath = download_youtube_clip(
+        ytid_or_url,
+        start,
+        end,
+        "track.wav",
+        force=True,
+        url_base="" if ytid_or_url.startswith("http") else "https://www.youtube.com/watch?v=",
+    )
     vox_wav, inst_wav = extract_vocal_demucs(demucs_model, original_track_filepath)
     if transpose != 0:
         inst_wav = librosa.effects.pitch_shift(inst_wav.T, sr=model.target_sample, n_steps=transpose).T
@@ -187,9 +214,13 @@ interface_mic = gr.Interface(
         gr.Audio(type="filepath", source="microphone", label="Source Audio"),
         gr.Slider(-12, 12, value=0, step=1, label="Transpose (Semitones)"),
         gr.Checkbox(False, label="Auto Predict F0"),
-        gr.Slider(0.0, 1.0, value=0.0, step=0.1, label="cluster infer ratio"),
+        gr.Slider(0.0, 1.0, value=default_cluster_infer_ratio, step=0.1, label="cluster infer ratio"),
         gr.Slider(0.0, 1.0, value=0.4, step=0.1, label="noise scale"),
-        gr.Dropdown(choices=["crepe", "crepe-tiny", "parselmouth", "dio", "harvest"], value="dio", label="f0 method"),
+        gr.Dropdown(
+            choices=["crepe", "crepe-tiny", "parselmouth", "dio", "harvest"],
+            value=default_f0_method,
+            label="f0 method",
+        ),
     ],
     outputs="audio",
     title="Voice Cloning",
@@ -203,9 +234,13 @@ interface_file = gr.Interface(
         gr.Audio(type="filepath", source="upload", label="Source Audio"),
         gr.Slider(-12, 12, value=0, step=1, label="Transpose (Semitones)"),
         gr.Checkbox(False, label="Auto Predict F0"),
-        gr.Slider(0.0, 1.0, value=0.0, step=0.1, label="cluster infer ratio"),
+        gr.Slider(0.0, 1.0, value=default_cluster_infer_ratio, step=0.1, label="cluster infer ratio"),
         gr.Slider(0.0, 1.0, value=0.4, step=0.1, label="noise scale"),
-        gr.Dropdown(choices=["crepe", "crepe-tiny", "parselmouth", "dio", "harvest"], value="dio", label="f0 method"),
+        gr.Dropdown(
+            choices=["crepe", "crepe-tiny", "parselmouth", "dio", "harvest"],
+            value=default_f0_method,
+            label="f0 method",
+        ),
     ],
     outputs="audio",
     title="Voice Cloning",
@@ -215,23 +250,30 @@ interface_file = gr.Interface(
 interface_yt = gr.Interface(
     predict_song_from_yt,
     inputs=[
-        "text",
+        gr.Textbox(
+            label="YouTube URL or ID", info="A YouTube URL (or ID) to a song on YouTube you want to clone from"
+        ),
         gr.Number(value=0, label="Start Time (seconds)"),
         gr.Number(value=15, label="End Time (seconds)"),
         gr.Dropdown(speakers, value=speakers[0], label="Target Speaker"),
         gr.Slider(-12, 12, value=0, step=1, label="Transpose (Semitones)"),
         gr.Checkbox(False, label="Auto Predict F0"),
-        gr.Slider(0.0, 1.0, value=0.0, step=0.1, label="cluster infer ratio"),
+        gr.Slider(0.0, 1.0, value=default_cluster_infer_ratio, step=0.1, label="cluster infer ratio"),
         gr.Slider(0.0, 1.0, value=0.4, step=0.1, label="noise scale"),
-        gr.Dropdown(choices=["crepe", "crepe-tiny", "parselmouth", "dio", "harvest"], value="dio", label="f0 method"),
+        gr.Dropdown(
+            choices=["crepe", "crepe-tiny", "parselmouth", "dio", "harvest"],
+            value=default_f0_method,
+            label="f0 method",
+        ),
     ],
     outputs=["audio", "audio"],
     title="Voice Cloning",
     description=description,
     article=article,
     examples=[
-        ["COz9lDCFHjw", 75, 90, speakers[0], 0, False, 0.0, 0.4, "dio"],
-        ["Wvm5GuDfAas", 15, 30, speakers[0], 0, False, 0.0, 0.4, "crepe"],
+        ["COz9lDCFHjw", 75, 90, speakers[0], 0, False, default_cluster_infer_ratio, 0.4, default_f0_method],
+        ["dQw4w9WgXcQ", 21, 35, speakers[0], 0, False, default_cluster_infer_ratio, 0.4, default_f0_method],
+        ["Wvm5GuDfAas", 15, 30, speakers[0], 0, False, default_cluster_infer_ratio, 0.4, default_f0_method],
     ],
 )
 interface = gr.TabbedInterface(
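Note: the hunks above elide the checkpoint-selection code that the "Figure out the latest generator" comment refers to (new lines 37-43). A minimal sketch of that logic, reconstructed from the visible context (`if ckpt_name is None:` ... `)[-1]` ... `ckpt_name = f"G_{latest_id}.pth"`) rather than copied from the commit:

# Hedged reconstruction, not the exact committed code: pick the generator
# checkpoint with the highest step number from the Hub repo.
# repo_id and ckpt_name are defined in the settings block above.
import re
from pathlib import Path
from huggingface_hub import list_repo_files

if ckpt_name is None:
    latest_id = sorted(
        int(Path(x).stem.split("_")[1])
        for x in list_repo_files(repo_id)
        if re.match(r"G_\d+\.pth$", x)
    )[-1]
    ckpt_name = f"G_{latest_id}.pth"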
training_so_vits_svc_fork.ipynb ADDED
@@ -0,0 +1,540 @@
+{
+"cells": [
+{
+"cell_type": "markdown",
+"metadata": {
+"id": "view-in-github",
+"colab_type": "text"
+},
+"source": [
+"<a href=\"https://colab.research.google.com/github/nateraw/voice-cloning/blob/main/training_so_vits_svc_fork.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {
+"id": "jIcNJ5QfDsV_"
+},
+"outputs": [],
+"source": [
+"# %%capture\n",
+"! pip install git+https://github.com/nateraw/so-vits-svc-fork@main\n",
+"! pip install openai-whisper yt-dlp huggingface_hub demucs"
+]
+},
+{
+"cell_type": "markdown",
+"metadata": {
+"id": "6uZAhUPOhFv9"
+},
+"source": [
+"---\n",
+"\n",
+"# Restart runtime\n",
+"\n",
+"After running the cell above, you'll need to restart the Colab runtime because we installed a different version of numpy.\n",
+"\n",
+"`Runtime -> Restart runtime`\n",
+"\n",
+"---"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {
+"id": "DROusQatF-wF"
+},
+"outputs": [],
+"source": [
+"from huggingface_hub import login\n",
+"\n",
+"login()"
+]
+},
+{
+"cell_type": "markdown",
+"source": [
+"## Settings"
+],
+"metadata": {
+"id": "yOM9WWmmRqTA"
+}
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {
+"id": "5oTDjDEKFz3W"
+},
+"outputs": [],
+"source": [
+"CHARACTER = \"kanye\"\n",
+"DO_EXTRACT_VOCALS = False\n",
+"MODEL_REPO_ID = \"dog/kanye\""
+]
+},
+{
+"cell_type": "markdown",
+"metadata": {
+"id": "BFd_ly1P_5Ht"
+},
+"source": [
+"## Data Preparation\n",
+"\n",
+"Prepare a data.csv file here with `ytid,start,end` as the first line (they're the expected column names). Then, prepare a training set given YouTube IDs and their start and end segment times in seconds. Try to pick segments that have dry vocal only, as that'll provide the best results.\n",
+"\n",
+"An example is given below for Kanye West."
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {
+"id": "rBrtgDtWmhRb"
+},
+"outputs": [],
+"source": [
+"%%writefile data.csv\n",
+"ytid,start,end\n",
+"lkK4de9nbzQ,0,137\n",
+"gXU9Am2Seo0,30,69\n",
+"gXU9Am2Seo0,94,135\n",
+"iVgrhWvQpqU,0,55\n",
+"iVgrhWvQpqU,58,110\n",
+"UIV-q-gneKA,85,99\n",
+"UIV-q-gneKA,110,125\n",
+"UIV-q-gneKA,127,141\n",
+"UIV-q-gneKA,173,183\n",
+"GmlyYCGE9ak,0,102\n",
+"x-7aWcPmJ60,25,43\n",
+"x-7aWcPmJ60,47,72\n",
+"x-7aWcPmJ60,98,113\n",
+"DK2LCIzIBrU,0,56\n",
+"DK2LCIzIBrU,80,166\n",
+"_W56nZk0fCI,184,224"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {
+"id": "cxxp4uYoC0aG"
+},
+"outputs": [],
+"source": [
+"import subprocess\n",
+"from pathlib import Path\n",
+"import librosa\n",
+"from scipy.io import wavfile\n",
+"import numpy as np\n",
+"from demucs.pretrained import get_model, DEFAULT_MODEL\n",
+"from demucs.apply import apply_model\n",
+"import torch\n",
+"import csv\n",
+"import whisper\n",
+"\n",
+"\n",
+"def download_youtube_clip(video_identifier, start_time, end_time, output_filename, num_attempts=5, url_base=\"https://www.youtube.com/watch?v=\"):\n",
+"    status = False\n",
+"\n",
+"    output_path = Path(output_filename)\n",
+"    if output_path.exists():\n",
+"        return True, \"Already Downloaded\"\n",
+"\n",
+"    command = f\"\"\"\n",
+"    yt-dlp --quiet --no-warnings -x --audio-format wav -f bestaudio -o \"{output_filename}\" --download-sections \"*{start_time}-{end_time}\" \"{url_base}{video_identifier}\"\n",
+"    \"\"\".strip()\n",
+"\n",
+"    attempts = 0\n",
+"    while True:\n",
+"        try:\n",
+"            output = subprocess.check_output(command, shell=True, stderr=subprocess.STDOUT)\n",
+"        except subprocess.CalledProcessError as err:\n",
+"            attempts += 1\n",
+"            if attempts == num_attempts:\n",
+"                return status, err.output\n",
+"        else:\n",
+"            break\n",
+"\n",
+"    status = output_path.exists()\n",
+"    return status, \"Downloaded\"\n",
+"\n",
+"\n",
+"def split_long_audio(model, filepaths, character_name, save_dir=\"data_dir\", out_sr=44100):\n",
+"    if isinstance(filepaths, str):\n",
+"        filepaths = [filepaths]\n",
+"\n",
+"    for file_idx, filepath in enumerate(filepaths):\n",
+"\n",
+"        save_path = Path(save_dir) / character_name\n",
+"        save_path.mkdir(exist_ok=True, parents=True)\n",
+"\n",
+"        print(f\"Transcribing file {file_idx}: '{filepath}' to segments...\")\n",
+"        result = model.transcribe(filepath, word_timestamps=True, task=\"transcribe\", beam_size=5, best_of=5)\n",
+"        segments = result['segments']\n",
+"\n",
+"        wav, sr = librosa.load(filepath, sr=None, offset=0, duration=None, mono=True)\n",
+"        wav, _ = librosa.effects.trim(wav, top_db=20)\n",
+"        peak = np.abs(wav).max()\n",
+"        if peak > 1.0:\n",
+"            wav = 0.98 * wav / peak\n",
+"        wav2 = librosa.resample(wav, orig_sr=sr, target_sr=out_sr)\n",
+"        wav2 /= max(wav2.max(), -wav2.min())\n",
+"\n",
+"        for i, seg in enumerate(segments):\n",
+"            start_time = seg['start']\n",
+"            end_time = seg['end']\n",
+"            wav_seg = wav2[int(start_time * out_sr):int(end_time * out_sr)]\n",
+"            wav_seg_name = f\"{character_name}_{file_idx}_{i}.wav\"\n",
+"            out_fpath = save_path / wav_seg_name\n",
+"            wavfile.write(out_fpath, rate=out_sr, data=(wav_seg * np.iinfo(np.int16).max).astype(np.int16))\n",
+"\n",
+"\n",
+"def extract_vocal_demucs(model, filename, out_filename, sr=44100, device=None, shifts=1, split=True, overlap=0.25, jobs=0):\n",
+"    wav, sr = librosa.load(filename, mono=False, sr=sr)\n",
+"    wav = torch.tensor(wav)\n",
+"    ref = wav.mean(0)\n",
+"    wav = (wav - ref.mean()) / ref.std()\n",
+"    sources = apply_model(\n",
+"        model,\n",
+"        wav[None],\n",
+"        device=device,\n",
+"        shifts=shifts,\n",
+"        split=split,\n",
+"        overlap=overlap,\n",
+"        progress=True,\n",
+"        num_workers=jobs\n",
+"    )[0]\n",
+"    sources = sources * ref.std() + ref.mean()\n",
+"\n",
+"    wav = sources[-1]\n",
+"    wav = wav / max(1.01 * wav.abs().max(), 1)\n",
+"    wavfile.write(out_filename, rate=sr, data=wav.numpy().T)\n",
+"    return out_filename\n",
+"\n",
+"\n",
+"def create_dataset(\n",
+"    clips_csv_filepath = \"data.csv\",\n",
+"    character = \"somebody\",\n",
+"    do_extract_vocals = False,\n",
+"    whisper_size = \"medium\",\n",
+"    # Where raw yt clips will be downloaded to\n",
+"    dl_dir = \"downloads\",\n",
+"    # Where actual data will be organized\n",
+"    data_dir = \"dataset_raw\",\n",
+"    **kwargs\n",
+"):\n",
+"    dl_path = Path(dl_dir) / character\n",
+"    dl_path.mkdir(exist_ok=True, parents=True)\n",
+"    if do_extract_vocals:\n",
+"        demucs_model = get_model(DEFAULT_MODEL)\n",
+"\n",
+"    with Path(clips_csv_filepath).open() as f:\n",
+"        reader = csv.DictReader(f)\n",
+"        for i, row in enumerate(reader):\n",
+"            outfile_path = dl_path / f\"{character}_{i:04d}.wav\"\n",
+"            download_youtube_clip(row['ytid'], row['start'], row['end'], outfile_path)\n",
+"            if do_extract_vocals:\n",
+"                extract_vocal_demucs(demucs_model, outfile_path, outfile_path)\n",
+"\n",
+"    filenames = sorted([str(x) for x in dl_path.glob(\"*.wav\")])\n",
+"    whisper_model = whisper.load_model(whisper_size)\n",
+"    split_long_audio(whisper_model, filenames, character, data_dir)"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {
+"id": "D9GrcDUKEGro"
+},
+"outputs": [],
+"source": [
+"\"\"\"\n",
+"Here, we override the generated training config: batch size, eval interval,\n",
+"dataloader workers, and automatic push of checkpoints to the Hub.\n",
+"\"\"\"\n",
+"\n",
+"import json\n",
+"from pathlib import Path\n",
+"import multiprocessing\n",
+"\n",
+"def update_config(config_file=\"configs/44k/config.json\"):\n",
+"    config_path = Path(config_file)\n",
+"    data = json.loads(config_path.read_text())\n",
+"    data['train']['batch_size'] = 32\n",
+"    data['train']['eval_interval'] = 500\n",
+"    data['train']['num_workers'] = multiprocessing.cpu_count()\n",
+"    data['train']['persistent_workers'] = True\n",
+"    data['train']['push_to_hub'] = True\n",
+"    data['train']['repo_id'] = MODEL_REPO_ID  # tuple(data['spk'])[0]\n",
+"    data['train']['private'] = True\n",
+"    config_path.write_text(json.dumps(data, indent=2, sort_keys=False))"
+]
+},
+{
+"cell_type": "markdown",
+"source": [
+"## Run all Preprocessing Steps"
+],
+"metadata": {
+"id": "aF6OZkTZRzhj"
+}
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {
+"id": "OAPnD3xKD_Gw"
+},
+"outputs": [],
+"source": [
+"create_dataset(character=CHARACTER, do_extract_vocals=DO_EXTRACT_VOCALS)\n",
+"! svc pre-resample\n",
+"! svc pre-config\n",
+"! svc pre-hubert -fm crepe\n",
+"update_config()"
+]
+},
+{
+"cell_type": "markdown",
+"source": [
+"## Training"
+],
+"metadata": {
+"id": "VpyGazF6R3CE"
+}
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {
+"colab": {
+"background_save": true
+},
+"id": "MByHpf_wEByg"
+},
+"outputs": [],
+"source": [
+"from __future__ import annotations\n",
+"\n",
+"import os\n",
+"import re\n",
+"import warnings\n",
+"from logging import getLogger\n",
+"from multiprocessing import cpu_count\n",
+"from pathlib import Path\n",
+"from typing import Any\n",
+"\n",
+"import lightning.pytorch as pl\n",
+"import torch\n",
+"from lightning.pytorch.accelerators import MPSAccelerator, TPUAccelerator\n",
+"from lightning.pytorch.loggers import TensorBoardLogger\n",
+"from lightning.pytorch.strategies.ddp import DDPStrategy\n",
+"from lightning.pytorch.tuner import Tuner\n",
+"from torch.cuda.amp import autocast\n",
+"from torch.nn import functional as F\n",
+"from torch.utils.data import DataLoader\n",
+"from torch.utils.tensorboard.writer import SummaryWriter\n",
+"\n",
+"import so_vits_svc_fork.f0\n",
+"import so_vits_svc_fork.modules.commons as commons\n",
+"import so_vits_svc_fork.utils\n",
+"\n",
+"from so_vits_svc_fork import utils\n",
+"from so_vits_svc_fork.dataset import TextAudioCollate, TextAudioDataset\n",
+"from so_vits_svc_fork.logger import is_notebook\n",
+"from so_vits_svc_fork.modules.descriminators import MultiPeriodDiscriminator\n",
+"from so_vits_svc_fork.modules.losses import discriminator_loss, feature_loss, generator_loss, kl_loss\n",
+"from so_vits_svc_fork.modules.mel_processing import mel_spectrogram_torch\n",
+"from so_vits_svc_fork.modules.synthesizers import SynthesizerTrn\n",
+"\n",
+"from so_vits_svc_fork.train import VitsLightning, VCDataModule\n",
+"\n",
+"LOG = getLogger(__name__)\n",
+"torch.set_float32_matmul_precision(\"high\")\n",
+"\n",
+"\n",
+"from pathlib import Path\n",
+"\n",
+"from huggingface_hub import create_repo, upload_folder, login, list_repo_files, delete_file\n",
+"\n",
+"# if os.environ.get(\"HF_TOKEN\"):\n",
+"#     login(os.environ.get(\"HF_TOKEN\"))\n",
+"\n",
+"\n",
+"class HuggingFacePushCallback(pl.Callback):\n",
+"    def __init__(self, repo_id, private=False, every=100):\n",
+"        self.repo_id = repo_id\n",
+"        self.private = private\n",
+"        self.every = every\n",
+"\n",
+"    def on_validation_epoch_end(self, trainer, pl_module):\n",
+"        self.repo_url = create_repo(\n",
+"            repo_id=self.repo_id,\n",
+"            exist_ok=True,\n",
+"            private=self.private\n",
+"        )\n",
+"        self.repo_id = self.repo_url.repo_id\n",
+"        if pl_module.global_step == 0:\n",
+"            return\n",
+"        print(f\"\\n🤗 Pushing to Hugging Face Hub: {self.repo_url}...\")\n",
+"        model_dir = pl_module.hparams.model_dir\n",
+"        upload_folder(\n",
+"            repo_id=self.repo_id,\n",
+"            folder_path=model_dir,\n",
+"            path_in_repo=\".\",\n",
+"            commit_message=\"🍻 cheers\",\n",
+"            ignore_patterns=[\"*.git*\", \"*README.md*\", \"*__pycache__*\"],\n",
+"        )\n",
+"        ckpt_pattern = r'^(D_|G_)\\d+\\.pth$'\n",
+"        todelete = []\n",
+"        repo_ckpts = [x for x in list_repo_files(self.repo_id) if re.match(ckpt_pattern, x) and x not in [\"G_0.pth\", \"D_0.pth\"]]\n",
+"        local_ckpts = [x.name for x in Path(model_dir).glob(\"*.pth\") if re.match(ckpt_pattern, x.name)]\n",
+"        to_delete = set(repo_ckpts) - set(local_ckpts)\n",
+"\n",
+"        for fname in to_delete:\n",
+"            print(f\"🗑 Deleting {fname} from repo\")\n",
+"            delete_file(fname, self.repo_id)\n",
+"\n",
+"\n",
+"def train(\n",
+"    config_path: Path | str, model_path: Path | str, reset_optimizer: bool = False\n",
+"):\n",
+"    config_path = Path(config_path)\n",
+"    model_path = Path(model_path)\n",
+"\n",
+"    hparams = utils.get_backup_hparams(config_path, model_path)\n",
+"    utils.ensure_pretrained_model(model_path, hparams.model.get(\"type_\", \"hifi-gan\"))\n",
+"\n",
+"    datamodule = VCDataModule(hparams)\n",
+"    strategy = (\n",
+"        (\n",
+"            \"ddp_find_unused_parameters_true\"\n",
+"            if os.name != \"nt\"\n",
+"            else DDPStrategy(find_unused_parameters=True, process_group_backend=\"gloo\")\n",
+"        )\n",
+"        if torch.cuda.device_count() > 1\n",
+"        else \"auto\"\n",
+"    )\n",
+"    LOG.info(f\"Using strategy: {strategy}\")\n",
+"\n",
+"    callbacks = []\n",
+"    if hparams.train.push_to_hub:\n",
+"        callbacks.append(HuggingFacePushCallback(hparams.train.repo_id, hparams.train.private))\n",
+"    if not is_notebook():\n",
+"        callbacks.append(pl.callbacks.RichProgressBar())\n",
+"    if callbacks == []:\n",
+"        callbacks = None\n",
+"\n",
+"    trainer = pl.Trainer(\n",
+"        logger=TensorBoardLogger(\n",
+"            model_path, \"lightning_logs\", hparams.train.get(\"log_version\", 0)\n",
+"        ),\n",
+"        # profiler=\"simple\",\n",
+"        val_check_interval=hparams.train.eval_interval,\n",
+"        max_epochs=hparams.train.epochs,\n",
+"        check_val_every_n_epoch=None,\n",
+"        precision=\"16-mixed\"\n",
+"        if hparams.train.fp16_run\n",
+"        else \"bf16-mixed\"\n",
+"        if hparams.train.get(\"bf16_run\", False)\n",
+"        else 32,\n",
+"        strategy=strategy,\n",
+"        callbacks=callbacks,\n",
+"        benchmark=True,\n",
+"        enable_checkpointing=False,\n",
+"    )\n",
+"    tuner = Tuner(trainer)\n",
+"    model = VitsLightning(reset_optimizer=reset_optimizer, **hparams)\n",
+"\n",
+"    # automatic batch size scaling\n",
+"    batch_size = hparams.train.batch_size\n",
+"    batch_split = str(batch_size).split(\"-\")\n",
+"    batch_size = batch_split[0]\n",
+"    init_val = 2 if len(batch_split) <= 1 else int(batch_split[1])\n",
+"    max_trials = 25 if len(batch_split) <= 2 else int(batch_split[2])\n",
+"    if batch_size == \"auto\":\n",
+"        batch_size = \"binsearch\"\n",
+"    if batch_size in [\"power\", \"binsearch\"]:\n",
+"        model.tuning = True\n",
+"        tuner.scale_batch_size(\n",
+"            model,\n",
+"            mode=batch_size,\n",
+"            datamodule=datamodule,\n",
+"            steps_per_trial=1,\n",
+"            init_val=init_val,\n",
+"            max_trials=max_trials,\n",
+"        )\n",
+"        model.tuning = False\n",
+"    else:\n",
+"        batch_size = int(batch_size)\n",
+"    # automatic learning rate scaling is not supported for multiple optimizers\n",
+"    \"\"\"if hparams.train.learning_rate == \"auto\":\n",
+"    lr_finder = tuner.lr_find(model)\n",
+"    LOG.info(lr_finder.results)\n",
+"    fig = lr_finder.plot(suggest=True)\n",
+"    fig.savefig(model_path / \"lr_finder.png\")\"\"\"\n",
+"\n",
+"    trainer.fit(model, datamodule=datamodule)\n",
+"\n",
+"if __name__ == '__main__':\n",
+"    train('configs/44k/config.json', 'logs/44k')"
+]
+},
+{
+"cell_type": "markdown",
+"source": [
+"## Train Cluster Model"
+],
+"metadata": {
+"id": "b2vNCDrSR8Xo"
+}
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {
+"id": "DBBEx-6Y1sOy"
+},
+"outputs": [],
+"source": [
+"! svc train-cluster"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {
+"id": "y_qYMuNY1tlm"
+},
+"outputs": [],
+"source": [
+"from huggingface_hub import upload_file\n",
+"\n",
+"upload_file(path_or_fileobj=\"/content/logs/44k/kmeans.pt\", repo_id=MODEL_REPO_ID, path_in_repo=\"kmeans.pt\")"
+]
+}
+],
+"metadata": {
+"accelerator": "GPU",
+"colab": {
+"machine_shape": "hm",
+"provenance": [],
+"authorship_tag": "ABX9TyOQeFSvxop9rlCaglNlNoXI",
+"include_colab_link": true
+},
+"gpuClass": "premium",
+"kernelspec": {
+"display_name": "Python 3",
+"name": "python3"
+},
+"language_info": {
+"name": "python"
+}
+},
+"nbformat": 4,
+"nbformat_minor": 0
+}
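For reference, the artifacts this notebook pushes to MODEL_REPO_ID (the G_<step>.pth checkpoints, config.json, and kmeans.pt) are exactly what the updated app.py above pulls down at startup. A minimal sketch of loading them outside the Space, using only calls that already appear in this commit (the repo ID is the default used here and the checkpoint name is a placeholder; adjust both for your own repo):

# Sketch: load a trained so-vits-svc-fork model from the Hub the way app.py does.
# Assumes the repo contains config.json, at least one G_<step>.pth checkpoint,
# and optionally kmeans.pt (pushed by the last cell of the notebook).
import json
from pathlib import Path

import torch
from huggingface_hub import hf_hub_download, list_repo_files
from so_vits_svc_fork.hparams import HParams
from so_vits_svc_fork.inference.core import Svc

repo_id = "dog/kanye"  # MODEL_REPO_ID from the settings cell
files = list_repo_files(repo_id)

ckpt_name = "G_200.pth"  # placeholder: pick a generator checkpoint present in the repo
generator_path = hf_hub_download(repo_id, ckpt_name)
config_path = hf_hub_download(repo_id, "config.json")
cluster_model_path = hf_hub_download(repo_id, "kmeans.pt") if "kmeans.pt" in files else None

hparams = HParams(**json.loads(Path(config_path).read_text()))
device = "cuda" if torch.cuda.is_available() else "cpu"
model = Svc(
    net_g_path=generator_path,
    config_path=config_path,
    device=device,
    cluster_model_path=cluster_model_path,
)
print("Target speakers:", list(hparams.spk.keys()))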