{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "provenance": [],
      "authorship_tag": "ABX9TyP+PQNqeOew0Ap+hzVH7i8r",
      "include_colab_link": true
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    },
    "language_info": {
      "name": "python"
    }
  },
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "view-in-github",
        "colab_type": "text"
      },
      "source": [
        "<a href=\"https://colab.research.google.com/github/KevinWang676/Bert-VITS2-quick-start/blob/main/Bert_VITS2_Guide.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
      ]
    },
    {
      "cell_type": "markdown",
      "source": [
        "### 0. 如果使用AutoDL，请运行下载packages的加速代码："
      ],
      "metadata": {
        "id": "CGg4SV4ObQaT"
      }
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "MgfAJzoHbK2-"
      },
      "outputs": [],
      "source": [
        "!source /etc/network_turbo\n",
        "!python -m pip install -i https://pypi.tuna.tsinghua.edu.cn/simple -r requirements.txt"
      ]
    },
    {
      "cell_type": "markdown",
      "source": [
        "### 1. 数据集重采样和标注"
      ],
      "metadata": {
        "id": "sloMn00-bgxY"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "import subprocess\n",
        "import random\n",
        "import os\n",
        "from pathlib import Path\n",
        "import librosa\n",
        "from scipy.io import wavfile\n",
        "import numpy as np\n",
        "import torch\n",
        "import csv\n",
        "import whisper\n",
        "\n",
        "a=\"linghua\" # 请在这里修改说话人的名字，目前只支持中文语音\n",
        "\n",
        "def split_long_audio(model, filepaths, save_dir=\"data_dir\", out_sr=44100):\n",
        "    if isinstance(filepaths, str):\n",
        "        filepaths = [filepaths]\n",
        "\n",
        "    for file_idx, filepath in enumerate(filepaths):\n",
        "\n",
        "        save_path = Path(save_dir)\n",
        "        save_path.mkdir(exist_ok=True, parents=True)\n",
        "\n",
        "        print(f\"Transcribing file {file_idx}: '{filepath}' to segments...\")\n",
        "        result = model.transcribe(filepath, word_timestamps=True, task=\"transcribe\", beam_size=5, best_of=5)\n",
        "        segments = result['segments']\n",
        "\n",
        "        wav, sr = librosa.load(filepath, sr=None, offset=0, duration=None, mono=True)\n",
        "        wav, _ = librosa.effects.trim(wav, top_db=20)\n",
        "        peak = np.abs(wav).max()\n",
        "        if peak > 1.0:\n",
        "            wav = 0.98 * wav / peak\n",
        "        wav2 = librosa.resample(wav, orig_sr=sr, target_sr=out_sr)\n",
        "        wav2 /= max(wav2.max(), -wav2.min())\n",
        "\n",
        "        for i, seg in enumerate(segments):\n",
        "            start_time = seg['start']\n",
        "            end_time = seg['end']\n",
        "            wav_seg = wav2[int(start_time * out_sr):int(end_time * out_sr)]\n",
        "            wav_seg_name = f\"{a}_{i}.wav\" # 在上方可修改名字\n",
        "            out_fpath = save_path / wav_seg_name\n",
        "            wavfile.write(out_fpath, rate=out_sr, data=(wav_seg * np.iinfo(np.int16).max).astype(np.int16))"
      ],
      "metadata": {
        "id": "LtLBGhGCbYYh"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "whisper_size = \"large\"\n",
        "whisper_model = whisper.load_model(whisper_size)"
      ],
      "metadata": {
        "id": "--wS7X95b--m"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "### 请将下方的**linghua.wav**修改成自己的.wav文件名，路径./custom_character_voice/**linghua**/也可以改为自己的角色名\n"
      ],
      "metadata": {
        "id": "0wAE5HRXcCQ_"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "split_long_audio(whisper_model, \"./linghua.wav\", \"./custom_character_voice/linghua/\")"
      ],
      "metadata": {
        "id": "3f7ljJhCcEbd"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "!python short_audio_transcribe.py --languages \"C\" --whisper_size large"
      ],
      "metadata": {
        "id": "rBJDPe3ccVrP"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "#### 处理完成后，可以打开\"./filelists/short_character_anno.list\"文件进行微调"
      ],
      "metadata": {
        "id": "4pesbcMjcikn"
      }
    },
    {
      "cell_type": "markdown",
      "source": [
        "### 2. 文本处理"
      ],
      "metadata": {
        "id": "9pxo4KL-ceGI"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "!python preprocess_text.py"
      ],
      "metadata": {
        "id": "_xfO2r_0cgCT"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "### 3. 运行bert_gen.py"
      ],
      "metadata": {
        "id": "DoDs7lL6cu01"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "!python bert_gen.py"
      ],
      "metadata": {
        "id": "jyiT28B3cxWX"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "### 4. 训练"
      ],
      "metadata": {
        "id": "dHQPDFdbc04g"
      }
    },
    {
      "cell_type": "markdown",
      "source": [
        "#### 可以在\"./configs/config.json\"更改训练参数，包括epoch,学习率等"
      ],
      "metadata": {
        "id": "gHNws-IUc6Sd"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "cd monotonic_align"
      ],
      "metadata": {
        "id": "S56s0emH8BqN"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "!python setup.py build_ext --inplace"
      ],
      "metadata": {
        "id": "6rLsyel-8KKc"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "cd .."
      ],
      "metadata": {
        "id": "mUgA6ho2-XAN"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "#### 若为首次训练，请运行："
      ],
      "metadata": {
        "id": "8vJ6VF__dCYW"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "!python train_ms.py -c ./configs/config.json"
      ],
      "metadata": {
        "id": "iwHCWVijc5h6"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "#### 若为继续训练，请运行："
      ],
      "metadata": {
        "id": "skAGULw2dKXW"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "!python train_ms.py -c ./configs/config.json --cont"
      ],
      "metadata": {
        "id": "Ru09Gmavc2t4"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "### 5. 推理"
      ],
      "metadata": {
        "id": "IinmucfadVLU"
      }
    },
    {
      "cell_type": "markdown",
      "source": [
        "#### 请将下方的**G_lastest.pth**修改为最新的模型文件，如**G_3400.pth**"
      ],
      "metadata": {
        "id": "psBRLH_TdZDb"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "!python inference_webui.py --model_dir ./logs/OUTPUT_MODEL/G_latest.pth"
      ],
      "metadata": {
        "id": "lOWVtUgMdUZa"
      },
      "execution_count": null,
      "outputs": []
    }
  ]
}