Edit model card
YAML Metadata Warning: empty or missing yaml metadata in repo card (https://huggingface.co/docs/hub/model-cards#model-card-metadata)

注意:本模型仅用于学习用途,任何非法使用造成的后果,与模型制作者、项目负责人无关!

准备环境

1.克隆项目

git clone https://github.com/svc-develop-team/so-vits-svc.git

2.阅读README.md,按步骤配置项目

(1)安装依赖

pip install -r requirements.txt

(2)下载编码器和nsf_hifigan

# vec256l9/vec768l12
wget https://ibm.box.com/s/z1wgl1stco8ffooyatzdwsqn2psd9lrr
# hubert-soft
wget https://github.com/bshall/hubert/releases/download/v0.1/hubert-soft-0d54a1f4.pt
# nsf_hifigan
wget -P pretrain/ https://github.com/openvpi/vocoders/releases/download/nsf-hifigan-v1/nsf_hifigan_20221211.zip
unzip -od pretrain/nsf_hifigan pretrain/nsf_hifigan_20221211.zip

(3)下载底模

链接:HuggingFace

(4)放置模型

准备数据集

此项目使用的是GenshinVoice公开数据集,地址:[GitHub](https://github.com/w4123/GenshinVoice)
整理文件为以下目录格式:
dataset_raw
├───speaker0
│   ├───xxx1-xxx1.wav
│   ├───...
│   └───Lxx-0xx8.wav
└───speaker1
    ├───xx2-0xxx2.wav
    ├───...
    └───xxx7-xxx007.wav
具体数据集目录结构如下:
GeshinImpactDataset
dataset_raw
├─ABeiDuo
├─AiErHaiSen
├─AiLiSi
├─AiLuoYi
├─AnBo
├─BaBaLa
├─BaChongShenZi
├─BaiShu
├─BanNiTe
├─BeiDou
├─ChongYun
├─DaDaLiYa
├─DaiInSiLeiBu
├─DiAoNa
├─DiLuKe
├─DiXiYa
├─DuoLi
├─DuoTuoLei
├─FaLuShan
├─FeiXieEr
├─FengYuanWanYe
├─GanYu
├─HuangYanYiDou
├─HuTao
├─JiuQiRen
├─JiuTiaoShaLou
├─KaiYa
├─KanDiSi
├─KaWei
├─KeLai
├─KeLi
├─KeQing
├─Kong
├─LaiLa
├─LeiDianJiangJun
├─LeiZe
├─LiSha
├─LiuLangZhe
├─LuoShaLiYa
├─LuYeYuanPingZang
├─MiKa
├─MoNa
├─NaXiDa
├─NiLu
├─NingGuang
├─NuoAiEr
├─OuLa
├─PaiMeng
├─Qin
├─QiQi
├─SaiNo
├─ShanHuGongXingHai
├─ShaTang
├─ShenHe
├─ShenLiLingHua
├─ShenLiLingRen
├─SiKaLaMuQi
├─TiNaLi
├─TuoMa
├─WeiBing
├─WenDi
├─WuLang
├─XiangLing
├─Xiao
├─XiaoGong
├─XingQiu
├─XiNuoLa
├─XinYan
├─YanFei
├─YaoYao
├─YeLan
├─Ying
├─YunJing
├─ZaoYou
└─ZhongLi

预处理与预训练

#重采样
python resample.py --skip_loudnorm
#选择编码器vec768l12,生成config.json、train.txt和val.txt
python preprocess_flist_config.py --speech_encoder vec768l12  --vol_aug
#选择f0预测器crepe,生成预训练文件
python preprocess_hubert_f0.py --f0_predictor crepe --num_processes 8

调整config

config.json
{
    "train": {
        "log_interval": 2000,
        "eval_interval": 2000,
        "seed": 1234,
        "epochs": 10000,
        "learning_rate": 0.0001,
        "betas": [
            0.8,
            0.99
        ],
        "eps": 1e-09,
        "batch_size": 1,
        "fp16_run": false,
        "lr_decay": 0.999875,
        "segment_size": 10240,
        "init_lr_ratio": 1,
        "warmup_epochs": 0,
        "c_mel": 45,
        "c_kl": 1.0,
        "use_sr": true,
        "max_speclen": 512,
        "port": "8001",
        "keep_ckpts": 0,
        "all_in_mem": false
    },
    "data": {
        "training_files": "filelists/train.txt",
        "validation_files": "filelists/val.txt",
        "max_wav_value": 32768.0,
        "sampling_rate": 44100,
        "filter_length": 2048,
        "hop_length": 512,
        "win_length": 2048,
        "n_mel_channels": 80,
        "mel_fmin": 0.0,
        "mel_fmax": 22050
    },
    "model": {
        "inter_channels": 192,
        "hidden_channels": 192,
        "filter_channels": 768,
        "n_heads": 2,
        "n_layers": 6,
        "kernel_size": 3,
        "p_dropout": 0.1,
        "resblock": "1",
        "resblock_kernel_sizes": [
            3,
            7,
            11
        ],
        "resblock_dilation_sizes": [
            [
                1,
                3,
                5
            ],
            [
                1,
                3,
                5
            ],
            [
                1,
                3,
                5
            ]
        ],
        "upsample_rates": [
            8,
            8,
            2,
            2,
            2
        ],
        "upsample_initial_channel": 512,
        "upsample_kernel_sizes": [
            16,
            16,
            4,
            4,
            4
        ],
        "n_layers_q": 3,
        "use_spectral_norm": false,
        "gin_channels": 768,
        "ssl_dim": 768,
        "n_speakers": 75,
        "speech_encoder": "vec768l12",
        "speaker_embedding": false
    },
    "spk": {
        "ABeiDuo": 0,
        "AiErHaiSen": 1,
        "AiLiSi": 2,
        "AiLuoYi": 3,
        "AnBo": 4,
        "BaBaLa": 5,
        "BaChongShenZi": 6,
        "BaiShu": 7,
        "BanNiTe": 8,
        "BeiDou": 9,
        "ChongYun": 10,
        "DaDaLiYa": 11,
        "DaiInSiLeiBu": 12,
        "DiAoNa": 13,
        "DiLuKe": 14,
        "DiXiYa": 15,
        "DuoLi": 16,
        "DuoTuoLei": 17,
        "FaLuShan": 18,
        "FeiXieEr": 19,
        "FengYuanWanYe": 20,
        "GanYu": 21,
        "HuangYanYiDou": 22,
        "HuTao": 23,
        "JiuQiRen": 24,
        "JiuTiaoShaLou": 25,
        "KaiYa": 26,
        "KanDiSi": 27,
        "KaWei": 28,
        "KeLai": 29,
        "KeLi": 30,
        "KeQing": 31,
        "Kong": 32,
        "LaiLa": 33,
        "LeiDianJiangJun": 34,
        "LeiZe": 35,
        "LiSha": 36,
        "LiuLangZhe": 37,
        "LuoShaLiYa": 38,
        "LuYeYuanPingZang": 39,
        "MiKa": 40,
        "MoNa": 41,
        "NaXiDa": 42,
        "NiLu": 43,
        "NingGuang": 44,
        "NuoAiEr": 45,
        "OuLa": 46,
        "PaiMeng": 47,
        "Qin": 48,
        "QiQi": 49,
        "SaiNo": 50,
        "ShanHuGongXingHai": 51,
        "ShaTang": 52,
        "ShenHe": 53,
        "ShenLiLingHua": 54,
        "ShenLiLingRen": 55,
        "SiKaLaMuQi": 56,
        "TiNaLi": 57,
        "TuoMa": 58,
        "WeiBing": 59,
        "WenDi": 60,
        "WuLang": 61,
        "XiangLing": 62,
        "Xiao": 63,
        "XiaoGong": 64,
        "XingQiu": 65,
        "XiNuoLa": 66,
        "XinYan": 67,
        "YanFei": 68,
        "YaoYao": 69,
        "YeLan": 70,
        "Ying": 71,
        "YunJing": 72,
        "ZaoYou": 73,
        "ZhongLi": 74
    }
}

重点参数如下
"log_interval": 2000,  //每2000steps验证一次模型
"eval_interval": 2000,  //每2000steps保存一次模型
"learning_rate": 0.0001,  //学习率
"batch_size": 1,  //每次取1条语音进行训练(显存6GB推荐)
"keep_ckpts": 0,  //保存最新几个模型,0为不删除模型
"speech_encoder": "vec768l12" //编码器

训练

将G_0.pth、D_0.pth拷贝至logs/44k目录下

开始训练

python train.py -c configs/config.json -m 44k

训练至较好效果后,关闭训练(注意不要过拟合),压缩模型

注:G_xxx.pth为生成器(Generator)模型,D_xxx.pth为判别器(Discriminator)模型,前者用于推理,后者在训练结束后可删除

python compress_model.py -c="configs/config.json" -i="logs/44k/G_200000.pth" -o="logs/44k/GeshinImpact.pth"

推理

# Example
python inference_main.py -m "logs/44k/GeshinImpact.pth" -c "configs/config.json" -n "干声(不含伴奏).wav" -t 0 -s "NaXiDa" -f0p crepe -lea 1

干声可使用Facebook Research的demucs分离,具体见其GitHub项目

Demo

Downloads last month
2
Unable to determine this model’s pipeline type. Check the docs .