Mahiruoshi committed
Commit 9169788
1 parent: 3d43932

Upload 120 files

This view is limited to 50 files because it contains too many changes. See the raw diff for the full change set.
Files changed (50)
  1. Data/BanGDream/configs/config.json +106 -0
  2. Data/BanGDream/filelists/Mygo.list +0 -0
  3. Data/BanGDream/filelists/val.list +8 -0
  4. Data/BanGDream/models/G_49000.pth +3 -0
  5. README.md +8 -6
  6. app.py +159 -382
  7. attentions_onnx.py +378 -0
  8. bert/bert-base-japanese-v3/README.md +1 -1
  9. bert/bert-base-japanese-v3/vocab.txt +1 -1
  10. bert/bert-large-japanese-v2/.gitattributes +34 -0
  11. bert/bert-large-japanese-v2/README.md +53 -0
  12. bert/bert-large-japanese-v2/config.json +19 -0
  13. bert/bert-large-japanese-v2/tokenizer_config.json +10 -0
  14. bert/bert-large-japanese-v2/vocab.txt +0 -0
  15. bert/bert_models.json +14 -0
  16. bert/chinese-roberta-wwm-ext-large/README.md +5 -5
  17. bert/chinese-roberta-wwm-ext-large/added_tokens.json +1 -1
  18. bert/chinese-roberta-wwm-ext-large/special_tokens_map.json +1 -1
  19. bert/chinese-roberta-wwm-ext-large/tokenizer.json +0 -0
  20. bert/chinese-roberta-wwm-ext-large/tokenizer_config.json +1 -1
  21. bert/deberta-v2-large-japanese-char-wwm/.gitattributes +34 -0
  22. bert/deberta-v2-large-japanese-char-wwm/README.md +89 -0
  23. bert/deberta-v2-large-japanese-char-wwm/config.json +37 -0
  24. bert/deberta-v2-large-japanese-char-wwm/pytorch_model.bin +3 -0
  25. bert/deberta-v2-large-japanese-char-wwm/special_tokens_map.json +7 -0
  26. bert/deberta-v2-large-japanese-char-wwm/tokenizer_config.json +19 -0
  27. bert/deberta-v2-large-japanese-char-wwm/vocab.txt +0 -0
  28. bert/deberta-v2-large-japanese/.gitattributes +34 -0
  29. bert/deberta-v2-large-japanese/README.md +111 -0
  30. bert/deberta-v2-large-japanese/config.json +38 -0
  31. bert/deberta-v2-large-japanese/special_tokens_map.json +9 -0
  32. bert/deberta-v2-large-japanese/tokenizer.json +0 -0
  33. bert/deberta-v2-large-japanese/tokenizer_config.json +15 -0
  34. bert/deberta-v3-large/.gitattributes +27 -0
  35. bert/deberta-v3-large/README.md +93 -0
  36. bert/deberta-v3-large/config.json +22 -0
  37. bert/deberta-v3-large/generator_config.json +22 -0
  38. bert/deberta-v3-large/pytorch_model.bin +3 -0
  39. bert/deberta-v3-large/spm.model +3 -0
  40. bert/deberta-v3-large/tokenizer_config.json +4 -0
  41. bert_gen.py +32 -20
  42. commons.py +7 -1
  43. config.yml +174 -0
  44. configs/config.json +863 -80
  45. data_utils.py +38 -34
  46. default_config.yml +174 -0
  47. emo_gen.py +174 -0
  48. emotional/wav2vec2-large-robust-12-ft-emotion-msp-dim/.gitattributes +28 -0
  49. emotional/wav2vec2-large-robust-12-ft-emotion-msp-dim/LICENSE +437 -0
  50. emotional/wav2vec2-large-robust-12-ft-emotion-msp-dim/README.md +127 -0
Data/BanGDream/configs/config.json ADDED
@@ -0,0 +1,106 @@
+ {
+   "train": {
+     "log_interval": 200,
+     "eval_interval": 1000,
+     "seed": 42,
+     "epochs": 1000,
+     "learning_rate": 0.0002,
+     "betas": [
+       0.8,
+       0.99
+     ],
+     "eps": 1e-09,
+     "batch_size": 2,
+     "fp16_run": false,
+     "lr_decay": 0.99995,
+     "segment_size": 16384,
+     "init_lr_ratio": 1,
+     "warmup_epochs": 0,
+     "c_mel": 45,
+     "c_kl": 1.0,
+     "skip_optimizer": true
+   },
+   "data": {
+     "training_files": "Data/BanGDream/filelists/train.list",
+     "validation_files": "Data/BanGDream/filelists/val.list",
+     "max_wav_value": 32768.0,
+     "sampling_rate": 44100,
+     "filter_length": 2048,
+     "hop_length": 512,
+     "win_length": 2048,
+     "n_mel_channels": 128,
+     "mel_fmin": 0.0,
+     "mel_fmax": null,
+     "add_blank": true,
+     "n_speakers": 896,
+     "cleaned_text": true,
+     "spk2id": {
+       "燈": 0,
+       "愛音": 1,
+       "楽奈": 2,
+       "そよ": 3,
+       "立希": 4,
+       "祥子": 5,
+       "睦": 6,
+       "海鈴": 7,
+       "にゃむ": 8,
+       "初華": 9,
+       "三月七": 10
+     }
+   },
+   "model": {
+     "use_spk_conditioned_encoder": true,
+     "use_noise_scaled_mas": true,
+     "use_mel_posterior_encoder": false,
+     "use_duration_discriminator": true,
+     "inter_channels": 192,
+     "hidden_channels": 192,
+     "filter_channels": 768,
+     "n_heads": 2,
+     "n_layers": 6,
+     "kernel_size": 3,
+     "p_dropout": 0.1,
+     "resblock": "1",
+     "resblock_kernel_sizes": [
+       3,
+       7,
+       11
+     ],
+     "resblock_dilation_sizes": [
+       [
+         1,
+         3,
+         5
+       ],
+       [
+         1,
+         3,
+         5
+       ],
+       [
+         1,
+         3,
+         5
+       ]
+     ],
+     "upsample_rates": [
+       8,
+       8,
+       2,
+       2,
+       2
+     ],
+     "upsample_initial_channel": 512,
+     "upsample_kernel_sizes": [
+       16,
+       16,
+       8,
+       2,
+       2
+     ],
+     "n_layers_q": 3,
+     "use_spectral_norm": false,
+     "gin_channels": 256
+   },
+   "version": "2.1"
+ }
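A minimal sketch (not part of the commit) of how this config can be read. In this same commit, app.py loads the file via utils.get_hparams_from_file and maps speaker names to ids through data.spk2id; the plain-JSON version below assumes only the standard library.

```python
import json

# Load the hyperparameter file added in this commit.
with open("Data/BanGDream/configs/config.json", encoding="utf-8") as f:
    config = json.load(f)

# The speaker table maps display names to the integer ids the model expects.
spk2id = config["data"]["spk2id"]
print(config["data"]["sampling_rate"])  # 44100
print(spk2id["燈"])                     # 0
```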
Data/BanGDream/filelists/Mygo.list ADDED
The diff for this file is too large to render. See raw diff
 
Data/BanGDream/filelists/val.list ADDED
@@ -0,0 +1,8 @@
+ D:/Vits2/Dataset/area5524-005.wav|燈|JP|え......立希ちゃん,そこまでは......|_ e . . . . . . t a t e n o z o m i ch a n , s o k o m a d e w a . . . . . . _|0 0 0 0 0 0 0 0 0 0 1 1 1 1 0 0 0 0 0 0 0 0 0 0 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0|1 1 1 1 1 1 1 1 4 6 1 1 1 1 2 2 2 2 2 1 1 1 1 1 1 1
+ D:/Vits2/Dataset/area5579-002.wav|燈|JP|え......!?おもしろい......?|_ e . . . . . . ! ? o m o sh i r o i . . . . . . ? _|0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0|1 1 1 1 1 1 1 1 1 1 2 2 2 1 1 1 1 1 1 1 1 1 1
+ D:/Vits2/Dataset/event235-03-068.wav|燈|JP|ご,ごめん......|_ g o , g o m e n . . . . . . _|0 0 0 0 0 0 1 1 1 0 0 0 0 0 0 0|1 2 1 2 2 1 1 1 1 1 1 1 1
+ D:/Vits2/Dataset/event235-19-061.wav|燈|JP|......愛音ちゃんは進んでる.行き止まりになっても,ちゃんと道を探して......進もうとしてる......|_ . . . . . . a i o n ch a n w a s u s u n d e r u . i k i d o m a r i n i n a q t e m o , ch a n t o m i ch i o s a g a sh i t e . . . . . . s u s u m o o t o sh i t e r u . . . . . . _|0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 1 1 1 1 0 0 0 0 1 1 1 1 1 1 1 1 0 0 1 1 0 0 0 0 0 0 0 0 1 1 1 0 0 1 1 0 0 0 1 1 1 1 0 0 0 0 0 0 0 0 0 0 1 1 1 1 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0|1 1 1 1 1 1 1 2 2 1 1 1 2 3 2 2 2 1 2 2 2 2 1 2 2 1 2 2 1 2 1 1 1 4 1 3 3 2 1 1 1 1 1 1 3 3 1 2 2 2 2 1 1 1 1 1 1 1
+ D:/Vits2/Dataset/event235-37-046.wav|愛音|JP|どれどれ-......やばっ,何これ?ちゃんとキメてるの私だけじゃん!|_ d o r e d o r e - . . . . . . y a b a q , n a n i k o r e ? ch a n t o k i m e t e r u n o w a t a sh i d a k e j a n ! _|0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 1 1 0 0 0 0 0 0 0 1 1 1 0 0 1 1 1 1 0 0 0 0 0 0 1 1 1 1 0 0 0 0 0 0 0 0 0|1 2 2 2 2 1 1 1 1 1 1 1 2 2 1 1 4 2 2 1 2 1 1 1 2 2 2 2 2 6 2 2 1 1 1 1 1
+ D:/Vits2/Dataset/event240-05-052.wav|愛音|JP|私たちも,りっきーと楽奈ちゃん探そっか!今度は五人で考えよ!|_ w a t a sh i t a ch i m o , r i q k i i t o r a k u n a ch a n s a g a s e s o q k a ! k o n d o w a g o n i n d e k a n g a e y o ! _|0 0 0 1 1 1 1 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 1 1 1 1 1 1 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0|1 6 2 2 2 1 2 2 1 1 2 4 2 1 1 1 6 2 2 1 1 3 2 2 2 3 2 3 3 2 1 1
+ D:/Vits2/Dataset/event235-06-032.wav|愛音|JP|へー,......そういうのもあるんだ.そよさん,バンドのことに詳しいんですね!|_ e e , . . . . . . s o o y u u n o m o a r u n d a . s o y o s a n , b a n d o n o k o t o n i k u w a sh i i n d e s u n e ! _|0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0|1 1 1 1 1 1 1 1 1 1 2 2 1 1 2 2 2 1 1 2 1 2 2 2 1 1 2 2 1 2 2 2 2 3 2 2 1 2 2 2 1 1
+ D:/Vits2/Dataset/event235-03-020.wav|愛音|JP|おーい!えっと,確か......高松さんきゃあ!|_ o o i ! e q t o , t a sh i k a . . . . . . t a k a m a ts u s a n ky a a ! _|0 1 0 0 0 0 1 1 1 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 0 0 0 0 0 0 0 0|1 1 1 1 1 2 1 1 1 3 3 1 1 1 1 1 1 4 4 2 1 1 1 1 1 1
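Each val.list entry is a single pipe-separated record; judging from the lines above, the fields appear to be the wav path, speaker name, language tag, raw text, phoneme sequence, per-phoneme tone flags, and word-to-phoneme counts. A hedged parsing sketch (the field names are illustrative labels, not taken from the repository):

```python
def parse_filelist_line(line: str) -> dict:
    # Field order inferred from the val.list entries above; names are my own.
    path, speaker, language, text, phones, tones, word2ph = line.rstrip("\n").split("|")
    return {
        "path": path,
        "speaker": speaker,
        "language": language,
        "text": text,
        "phones": phones.split(" "),
        "tones": [int(t) for t in tones.split(" ")],
        "word2ph": [int(n) for n in word2ph.split(" ")],
    }


with open("Data/BanGDream/filelists/val.list", encoding="utf-8") as f:
    rows = [parse_filelist_line(line) for line in f if line.strip()]
```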
Data/BanGDream/models/G_49000.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a7d9f2b0baff45ed4d88bbc6162bfaa4c960f5965b0b42085463311a672b350a
+ size 721511718
README.md CHANGED
@@ -1,11 +1,13 @@
  ---
- title: BangDream-Vits-bert
- emoji:
- colorFrom: yellow
  colorTo: green
  sdk: gradio
- sdk_version: 3.15.0
  app_file: app.py
  pinned: false
- license: other
- ---

  ---
+ title: Bushiroad BertVIts2 Emotional
+ emoji: 📚
+ colorFrom: purple
  colorTo: green
  sdk: gradio
+ sdk_version: 4.7.1
  app_file: app.py
  pinned: false
+ license: openrail
+ ---
+
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py CHANGED
@@ -1,5 +1,10 @@
1
- # flake8: noqa: E402
 
 
 
2
  import logging
 
 
3
  logging.getLogger("numba").setLevel(logging.WARNING)
4
  logging.getLogger("markdown_it").setLevel(logging.WARNING)
5
  logging.getLogger("urllib3").setLevel(logging.WARNING)
@@ -10,82 +15,71 @@ logging.basicConfig(
10
  )
11
 
12
  logger = logging.getLogger(__name__)
13
- import datetime
 
14
  import numpy as np
15
  import torch
16
- from ebooklib import epub
17
- import PyPDF2
18
- from PyPDF2 import PdfReader
19
- import zipfile
20
- import shutil
21
- import sys, os
22
- import json
23
- from bs4 import BeautifulSoup
24
- import argparse
 
 
 
 
 
 
 
25
  import commons
 
 
 
26
  import utils
 
27
  from models import SynthesizerTrn
28
  from text.symbols import symbols
29
- from text import cleaned_text_to_sequence, get_bert
30
- from text.cleaner import clean_text
31
- import gradio as gr
32
- import webbrowser
33
- import re
34
- from scipy.io.wavfile import write
35
- from datetime import datetime
36
  net_g = None
37
- BandList = {
38
-
39
- "PoppinParty":["香澄","有咲","たえ","りみ","沙綾"],
40
- "Afterglow":["蘭","モカ","ひまり","巴","つぐみ"],
41
- "HelloHappyWorld":["こころ","美咲","薫","花音","はぐみ"],
42
- "PastelPalettes":["彩","日菜","千聖","イヴ","麻弥"],
43
- "Roselia":["友希那","紗夜","リサ","燐子","あこ"],
44
- "RaiseASuilen":["レイヤ","ロック","ますき","チュチュ","パレオ"],
45
- "Morfonica":["ましろ","瑠唯","つくし","七深","透子"],
46
- "MyGo&AveMujica(Part)":["燈","愛音","そよ","立希","楽奈","祥子","睦","海鈴"],
47
- }
48
 
49
- if sys.platform == "darwin" and torch.backends.mps.is_available():
50
- device = "mps"
51
- os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"
52
- else:
53
- device = "cuda"
54
 
55
- def is_japanese(string):
56
- for ch in string:
57
- if ord(ch) > 0x3040 and ord(ch) < 0x30FF:
58
- return True
59
- return False
 
 
 
 
60
 
61
- def extrac(text):
62
- text = re.sub("<[^>]*>","",text)
63
- result_list = re.split(r'\n', text)
64
- final_list = []
65
- for i in result_list:
66
- i = i.replace('\n','').replace(' ','')
67
- #Current length of single sentence: 20
68
- if len(i)>1:
69
- if len(i) > 20:
70
- try:
71
- cur_list = re.split(r'。|!', i)
72
- for i in cur_list:
73
- if len(i)>1:
74
- final_list.append(i+'。')
75
- except:
76
- pass
77
- else:
78
- final_list.append(i)
79
- '''
80
- final_list.append(i)
81
- '''
82
- final_list = [x for x in final_list if x != '']
83
- return final_list
84
 
85
- def get_text(text, language_str, hps):
 
 
 
 
 
 
 
 
 
 
 
 
86
  norm_text, phone, tone, word2ph = clean_text(text, language_str)
87
  phone, tone, language = cleaned_text_to_sequence(phone, tone, language_str)
88
-
89
  if hps.data.add_blank:
90
  phone = commons.intersperse(phone, 0)
91
  tone = commons.intersperse(tone, 0)
@@ -93,19 +87,24 @@ def get_text(text, language_str, hps):
93
  for i in range(len(word2ph)):
94
  word2ph[i] = word2ph[i] * 2
95
  word2ph[0] += 1
96
- bert = get_bert(norm_text, word2ph, language_str, device)
97
  del word2ph
98
- assert bert.shape[-1] == len(phone), phone
99
 
100
  if language_str == "ZH":
101
- bert = bert
102
- ja_bert = torch.zeros(768, len(phone))
103
- elif language_str == "JA":
104
- ja_bert = bert
105
  bert = torch.zeros(1024, len(phone))
106
- else:
 
 
107
  bert = torch.zeros(1024, len(phone))
108
- ja_bert = torch.zeros(768, len(phone))
 
 
 
109
 
110
  assert bert.shape[-1] == len(
111
  phone
@@ -114,19 +113,53 @@ def get_text(text, language_str, hps):
114
  phone = torch.LongTensor(phone)
115
  tone = torch.LongTensor(tone)
116
  language = torch.LongTensor(language)
117
- return bert, ja_bert, phone, tone, language
118
 
119
 
120
- def infer(text, sdp_ratio, noise_scale, noise_scale_w, length_scale, sid, language):
121
- global net_g
122
- bert, ja_bert, phones, tones, lang_ids = get_text(text, language, hps)
 
 
123
  with torch.no_grad():
124
  x_tst = phones.to(device).unsqueeze(0)
125
  tones = tones.to(device).unsqueeze(0)
126
  lang_ids = lang_ids.to(device).unsqueeze(0)
127
  bert = bert.to(device).unsqueeze(0)
128
  ja_bert = ja_bert.to(device).unsqueeze(0)
 
129
  x_tst_lengths = torch.LongTensor([phones.size(0)]).to(device)
 
130
  del phones
131
  speakers = torch.LongTensor([hps.data.spk2id[sid]]).to(device)
132
  audio = (
@@ -138,6 +171,8 @@ def infer(text, sdp_ratio, noise_scale, noise_scale_w, length_scale, sid, langua
138
  lang_ids,
139
  bert,
140
  ja_bert,
 
 
141
  sdp_ratio=sdp_ratio,
142
  noise_scale=noise_scale,
143
  noise_scale_w=noise_scale_w,
@@ -147,256 +182,39 @@ def infer(text, sdp_ratio, noise_scale, noise_scale_w, length_scale, sid, langua
147
  .float()
148
  .numpy()
149
  )
150
- current_time = datetime.now()
151
- print(str(current_time)+':'+str(sid))
152
- del x_tst, tones, lang_ids, bert, x_tst_lengths, speakers
153
- return audio
154
-
155
-
156
- def tts_fn(
157
- text, speaker, sdp_ratio, noise_scale, noise_scale_w, length_scale,LongSentence
158
- ):
159
- if not LongSentence:
160
- with torch.no_grad():
161
- audio = infer(
162
- text,
163
- sdp_ratio=sdp_ratio,
164
- noise_scale=noise_scale,
165
- noise_scale_w=noise_scale_w,
166
- length_scale=length_scale,
167
- sid=speaker,
168
- language= "JP" if is_japanese(text) else "ZH",
169
- )
170
  torch.cuda.empty_cache()
171
- return (hps.data.sampling_rate, audio)
172
- else:
173
- audiopath = 'voice.wav'
174
- a = ['【','[','(','(']
175
- b = ['】',']',')',')']
176
- for i in a:
177
- text = text.replace(i,'<')
178
- for i in b:
179
- text = text.replace(i,'>')
180
- final_list = extrac(text.replace('“','').replace('”',''))
181
- audio_fin = []
182
- for sentence in final_list:
183
- with torch.no_grad():
184
- audio = infer(
185
- sentence,
186
- sdp_ratio=sdp_ratio,
187
- noise_scale=noise_scale,
188
- noise_scale_w=noise_scale_w,
189
- length_scale=length_scale,
190
- sid=speaker,
191
- language= "JP" if is_japanese(text) else "ZH",
192
- )
193
- audio_fin.append(audio)
194
- return (hps.data.sampling_rate, np.concatenate(audio_fin))
195
-
196
- def split_into_sentences(text):
197
- """将文本分割为句子,基于中文的标点符号"""
198
- sentences = re.split(r'(?<=[。!?…\n])', text)
199
- return [sentence.strip() for sentence in sentences if sentence]
200
-
201
-
202
- def seconds_to_ass_time(seconds):
203
- """将秒数转换为ASS时间格式"""
204
- hours = int(seconds / 3600)
205
- minutes = int((seconds % 3600) / 60)
206
- seconds = int(seconds) % 60
207
- milliseconds = int((seconds - int(seconds)) * 1000)
208
- return "{:01d}:{:02d}:{:02d}.{:02d}".format(hours, minutes, seconds, int(milliseconds / 10))
209
-
210
- def generate_audio_and_srt_for_group(group, outputPath, group_index, sampling_rate, speaker, sdp_ratio, noise_scale, noise_scale_w, length_scale,spealerList,silenceTime):
211
- audio_fin = []
212
- ass_entries = []
213
- start_time = 0
214
-
215
- ass_header = """[Script Info]
216
- ; Script generated by OpenAI Assistant
217
- Title: Audiobook
218
- ScriptType: v4.00+
219
- WrapStyle: 0
220
- PlayResX: 640
221
- PlayResY: 360
222
- ScaledBorderAndShadow: yes
223
- [V4+ Styles]
224
- Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding
225
- Style: Default,Arial,20,&H00FFFFFF,&H000000FF,&H00000000,&H00000000,0,0,0,0,100,100,0,0,1,1,1,2,10,10,10,1
226
- [Events]
227
- Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
228
- """
229
-
230
- for sentence in group:
231
- try:
232
- print(sentence)
233
- FakeSpeaker = sentence.split("|")[0]
234
- print(FakeSpeaker)
235
- SpeakersList = re.split('\n', spealerList)
236
- if FakeSpeaker in list(hps.data.spk2id.keys()):
237
- speaker = FakeSpeaker
238
- for i in SpeakersList:
239
- if FakeSpeaker == i.split("|")[1]:
240
- speaker = i.split("|")[0]
241
- speaker_ids = hps.data.spk2id
242
-
243
- _, audio = tts_fn(sentence.split("|")[-1], speaker=speaker, sdp_ratio=sdp_ratio, noise_scale=noise_scale, noise_scale_w=noise_scale_w, length_scale=length_scale, LongSentence=True)
244
- silence_frames = int(silenceTime * 44010)
245
- silence_data = np.zeros((silence_frames,), dtype=audio.dtype)
246
- audio_fin.append(audio)
247
- audio_fin.append(silence_data)
248
-
249
- duration = len(audio) / sampling_rate
250
- end_time = start_time + duration + silenceTime
251
- ass_entries.append("Dialogue: 0,{},{},".format(seconds_to_ass_time(start_time), seconds_to_ass_time(end_time)) + "Default,,0,0,0,,{}".format(sentence.replace("|",":")))
252
- start_time = end_time
253
- except:
254
- pass
255
- wav_filename = os.path.join(outputPath, f'audiobook_part_{group_index}.wav')
256
- ass_filename = os.path.join(outputPath, f'audiobook_part_{group_index}.ass')
257
-
258
- write(wav_filename, sampling_rate, np.concatenate(audio_fin))
259
-
260
- with open(ass_filename, 'w', encoding='utf-8') as f:
261
- f.write(ass_header + '\n'.join(ass_entries))
262
- return (hps.data.sampling_rate, np.concatenate(audio_fin))
263
- def extract_text_from_epub(file_path):
264
- book = epub.read_epub(file_path)
265
- content = []
266
- for item in book.items:
267
- if isinstance(item, epub.EpubHtml):
268
- soup = BeautifulSoup(item.content, 'html.parser')
269
- content.append(soup.get_text())
270
- return '\n'.join(content)
271
-
272
- def extract_text_from_pdf(file_path):
273
- with open(file_path, 'rb') as file:
274
- reader = PdfReader(file)
275
- content = [page.extract_text() for page in reader.pages]
276
- return '\n'.join(content)
277
 
278
- def extract_text_from_game2(data):
279
- current_content = []
280
-
281
- def _extract(data, current_data=None):
282
- nonlocal current_content
283
-
284
- if current_data is None:
285
- current_data = {}
286
-
287
- if isinstance(data, dict):
288
- if 'name' in data and 'body' in data:
289
- current_name = data['name']
290
- current_body = data['body'].replace('\n', '')
291
- current_content.append(f"{current_name}|{current_body}")
292
-
293
- for key, value in data.items():
294
- _extract(value, dict(current_data))
295
-
296
- elif isinstance(data, list):
297
- for item in data:
298
- _extract(item, dict(current_data))
299
-
300
- _extract(data)
301
- return '\n'.join(current_content)
302
-
303
- def extract_text_from_file(inputFile):
304
- file_extension = os.path.splitext(inputFile)[1].lower()
305
-
306
- if file_extension == ".epub":
307
- return extract_text_from_epub(inputFile)
308
- elif file_extension == ".pdf":
309
- return extract_text_from_pdf(inputFile)
310
- elif file_extension == ".txt":
311
- with open(inputFile, 'r', encoding='utf-8') as f:
312
- return f.read()
313
- elif file_extension == ".asset":
314
- with open(inputFile, 'r', encoding='utf-8') as f:
315
- content = json.load(f)
316
- return extract_text_from_game2(content) if extract_text_from_game2(content) != '' else extract_text_from_game2(content)
317
- else:
318
- raise ValueError(f"Unsupported file format: {file_extension}")
319
-
320
- def audiobook(inputFile, groupsize, speaker, sdp_ratio, noise_scale, noise_scale_w, length_scale,spealerList,silenceTime):
321
- directory_path = "books"
322
- output_path = "books/audiobook_part_1.wav"
323
-
324
- if os.path.exists(directory_path):
325
- shutil.rmtree(directory_path)
326
-
327
- os.makedirs(directory_path)
328
- text = extract_text_from_file(inputFile.name)
329
- sentences = split_into_sentences(text)
330
- GROUP_SIZE = groupsize
331
- for i in range(0, len(sentences), GROUP_SIZE):
332
- group = sentences[i:i+GROUP_SIZE]
333
- if spealerList == "":
334
- spealerList = "无"
335
- result = generate_audio_and_srt_for_group(group,directory_path, i//GROUP_SIZE + 1, 44100, speaker, sdp_ratio, noise_scale, noise_scale_w, length_scale,spealerList,silenceTime)
336
- if not torch.cuda.is_available():
337
- return result
338
- return result
339
 
340
  def loadmodel(model):
341
  _ = net_g.eval()
342
  _ = utils.load_checkpoint(model, net_g, None, skip_optimizer=True)
343
  return "success"
344
 
345
-
346
  if __name__ == "__main__":
347
- parser = argparse.ArgumentParser()
348
- parser.add_argument(
349
- "-m", "--model", default="./logs/BangDream/G_45000.pth", help="path of your model"
350
- )
351
- parser.add_argument(
352
- "-c",
353
- "--config",
354
- default="configs/config.json",
355
- help="path of your config file",
356
- )
357
- parser.add_argument(
358
- "--share", default=True, help="make link public", action="store_true"
359
  )
360
- parser.add_argument(
361
- "-d", "--debug", action="store_true", help="enable DEBUG-LEVEL log"
362
- )
363
-
364
- args = parser.parse_args()
365
- if args.debug:
366
- logger.info("Enable DEBUG-LEVEL log")
367
- logging.basicConfig(level=logging.DEBUG)
368
- device = (
369
- "cuda:0"
370
- if torch.cuda.is_available()
371
- else (
372
- "mps"
373
- if sys.platform == "darwin" and torch.backends.mps.is_available()
374
- else "cpu"
375
- )
376
- )
377
- hps = utils.get_hparams_from_file(args.config)
378
- net_g = SynthesizerTrn(
379
- len(symbols),
380
- hps.data.filter_length // 2 + 1,
381
- hps.train.segment_size // hps.data.hop_length,
382
- n_speakers=hps.data.n_speakers,
383
- **hps.model,
384
- ).to(device)
385
- loadmodel(args.model)
386
  speaker_ids = hps.data.spk2id
387
  speakers = list(speaker_ids.keys())
388
- languages = ["ZH", "JP"]
389
- examples = [
390
- ["filelist/Scenarioband6-018.asset", 500, "つくし", "ましろ|真白\n七深|七深\n透子|透子\nつくし|筑紫\n瑠唯|瑠唯\nそよ|素世\n祥子|祥子", "扩展功能"],
391
- ]
392
  modelPaths = []
393
- for dirpath, dirnames, filenames in os.walk("./logs/BangDream/"):
394
  for filename in filenames:
395
  modelPaths.append(os.path.join(dirpath, filename))
396
  with gr.Blocks() as app:
397
- gr.Markdown(
398
- f"少歌邦邦全员TTS,使用本模型请严格遵守法律法规!\n 发布二创作品请注明项目和本模型作者<a href='https://space.bilibili.com/19874615/'>B站@Mahiroshi</a>及项目链接\n从 <a href='https://nijigaku.top/2023/10/03/BangDreamTTS/'>我的博客站点</a> 查看使用说明</a>"
399
- )
400
  for band in BandList:
401
  with gr.TabItem(band):
402
  for name in BandList[band]:
@@ -412,20 +230,10 @@ if __name__ == "__main__":
412
  length_scale = gr.Slider(
413
  minimum=0.1, maximum=2, value=1, step=0.01, label="语速调节"
414
  )
415
- with gr.Accordion(label="切换模型(合成中文建议切换为早期模型)", open=False):
416
- modelstrs = gr.Dropdown(label = "模型", choices = modelPaths, value = modelPaths[0], type = "value")
417
- btnMod = gr.Button("载入模型")
418
- statusa = gr.TextArea()
419
- btnMod.click(loadmodel, inputs=[modelstrs], outputs = [statusa])
420
- with gr.Column():
421
- text = gr.TextArea(
422
- label="输入纯日语或者中文",
423
- placeholder="输入纯日语或者中文",
424
- value="有个人躺在地上,哀嚎......\n有个人睡着了,睡在盒子里。\n我要把它打开,看看他的梦是什么。",
425
- )
426
- btn = gr.Button("点击生成", variant="primary")
427
- audio_output = gr.Audio(label="Output Audio")
428
- with gr.Accordion(label="其它参数设定", open=False):
429
  sdp_ratio = gr.Slider(
430
  minimum=0, maximum=1, value=0.2, step=0.01, label="SDP/DP混合比"
431
  )
@@ -435,73 +243,42 @@ if __name__ == "__main__":
435
  noise_scale_w = gr.Slider(
436
  minimum=0.1, maximum=2, value=0.8, step=0.01, label="音素长度"
437
  )
438
- LongSentence = gr.Checkbox(value=True, label="Generate LongSentence")
439
  speaker = gr.Dropdown(
440
  choices=speakers, value=name, label="说话人"
441
- )
 
  btn.click(
443
- tts_fn,
444
  inputs=[
445
  text,
446
- speaker,
447
  sdp_ratio,
448
  noise_scale,
449
  noise_scale_w,
450
  length_scale,
451
- LongSentence,
452
- ],
453
- outputs=[audio_output],
454
- )
455
- for i in examples:
456
- with gr.Tab(i[-1]):
457
- with gr.Row():
458
- with gr.Column():
459
- gr.Markdown(
460
- f"从 <a href='https://nijigaku.top/2023/10/03/BangDreamTTS/'>我的博客站点</a> 查看自制galgame使用说明\n</a>"
461
- )
462
- inputFile = gr.inputs.File(label="上传txt(可设置角色对应表)、epub或mobi文件")
463
- groupSize = gr.Slider(
464
- minimum=10, maximum=1000,value = i[1], step=1, label="当个音频文件包含的最大字数"
465
- )
466
- silenceTime = gr.Slider(
467
- minimum=0, maximum=1, value=0.5, step=0.1, label="句子的间隔"
468
- )
469
- spealerList = gr.TextArea(
470
- label="角色对应表",
471
- placeholder="左边是你想要在每一句话合成中用到的speaker(见角色清单)右边是你上传文本时分隔符左边设置的说话人:{ChoseSpeakerFromConfigList1}|{SeakerInUploadText1}\n{ChoseSpeakerFromConfigList2}|{SeakerInUploadText2}\n{ChoseSpeakerFromConfigList3}|{SeakerInUploadText3}\n",
472
- value = i[3],
473
- )
474
- speaker = gr.Dropdown(
475
- choices=speakers, value = i[2], label="选择默认说话人"
476
- )
477
- with gr.Column():
478
- sdp_ratio = gr.Slider(
479
- minimum=0, maximum=1, value=0.2, step=0.01, label="SDP/DP混合比"
480
- )
481
- noise_scale = gr.Slider(
482
- minimum=0.1, maximum=2, value=0.6, step=0.01, label="感情调节"
483
- )
484
- noise_scale_w = gr.Slider(
485
- minimum=0.1, maximum=2, value=0.8, step=0.01, label="音素长度"
486
- )
487
- length_scale = gr.Slider(
488
- minimum=0.1, maximum=2, value=1, step=0.01, label="生成长度"
489
- )
490
- LastAudioOutput = gr.Audio(label="当用cuda在本地运行时才能在book文件夹下浏览全部合成内容")
491
- btn2 = gr.Button("点击生成", variant="primary")
492
- btn2.click(
493
- audiobook,
494
- inputs=[
495
- inputFile,
496
- groupSize,
497
  speaker,
498
- sdp_ratio,
499
- noise_scale,
500
- noise_scale_w,
501
- length_scale,
502
- spealerList,
503
- silenceTime
504
  ],
505
- outputs=[LastAudioOutput],
506
  )
507
- app.launch()
 
 
 
1
+ import argparse
2
+ import os
3
+ from pathlib import Path
4
+
5
  import logging
6
+ import re_matching
7
+
8
  logging.getLogger("numba").setLevel(logging.WARNING)
9
  logging.getLogger("markdown_it").setLevel(logging.WARNING)
10
  logging.getLogger("urllib3").setLevel(logging.WARNING)
 
15
  )
16
 
17
  logger = logging.getLogger(__name__)
18
+
19
+ import librosa
20
  import numpy as np
21
  import torch
22
+ import torch.nn as nn
23
+ from torch.utils.data import Dataset
24
+ from torch.utils.data import DataLoader, Dataset
25
+ from tqdm import tqdm
26
+ from transformers import Wav2Vec2Processor
27
+ from transformers.models.wav2vec2.modeling_wav2vec2 import (
28
+ Wav2Vec2Model,
29
+ Wav2Vec2PreTrainedModel,
30
+ )
31
+
32
+ import gradio as gr
33
+
34
+ import utils
35
+ from config import config
36
+
37
+ import torch
38
  import commons
39
+ from text import cleaned_text_to_sequence, get_bert
40
+ from emo_gen import process_func, EmotionModel, Wav2Vec2Processor, Wav2Vec2Model, Wav2Vec2PreTrainedModel, RegressionHead
41
+ from text.cleaner import clean_text
42
  import utils
43
+
44
  from models import SynthesizerTrn
45
  from text.symbols import symbols
46
+ import sys
47
+
 
 
 
 
 
48
  net_g = None
 
 
 
 
 
 
 
 
 
 
 
49
 
50
+ device = 'cpu'
 
 
 
 
51
 
52
+ device = (
53
+ "cuda:0"
54
+ if torch.cuda.is_available()
55
+ else (
56
+ "mps"
57
+ if sys.platform == "darwin" and torch.backends.mps.is_available()
58
+ else "cpu"
59
+ )
60
+ )
61
 
62
+ BandList = {
63
+ "MyGo&AveMujica(Part)":["燈","愛音","そよ","立希","楽奈"],
64
+ "AveMujica":["祥子","睦","海鈴","にゃむ","初華"]
65
+ }
 
66
 
67
+ def get_net_g(model_path: str, version: str, device: str, hps):
68
+ net_g = SynthesizerTrn(
69
+ len(symbols),
70
+ hps.data.filter_length // 2 + 1,
71
+ hps.train.segment_size // hps.data.hop_length,
72
+ n_speakers=hps.data.n_speakers,
73
+ **hps.model,
74
+ ).to(device)
75
+ _ = net_g.eval()
76
+ _ = utils.load_checkpoint(model_path, net_g, None, skip_optimizer=True)
77
+ return net_g
78
+
79
+ def get_text(text, language_str, hps, device):
80
  norm_text, phone, tone, word2ph = clean_text(text, language_str)
81
  phone, tone, language = cleaned_text_to_sequence(phone, tone, language_str)
82
+ print(text)
83
  if hps.data.add_blank:
84
  phone = commons.intersperse(phone, 0)
85
  tone = commons.intersperse(tone, 0)
 
87
  for i in range(len(word2ph)):
88
  word2ph[i] = word2ph[i] * 2
89
  word2ph[0] += 1
90
+ bert_ori = get_bert(norm_text, word2ph, language_str, device)
91
  del word2ph
92
+ assert bert_ori.shape[-1] == len(phone), phone
93
 
94
  if language_str == "ZH":
95
+ bert = bert_ori
96
+ ja_bert = torch.zeros(1024, len(phone))
97
+ en_bert = torch.zeros(1024, len(phone))
98
+ elif language_str == "JP":
99
  bert = torch.zeros(1024, len(phone))
100
+ ja_bert = bert_ori
101
+ en_bert = torch.zeros(1024, len(phone))
102
+ elif language_str == "EN":
103
  bert = torch.zeros(1024, len(phone))
104
+ ja_bert = torch.zeros(1024, len(phone))
105
+ en_bert = bert_ori
106
+ else:
107
+ raise ValueError("language_str should be ZH, JP or EN")
108
 
109
  assert bert.shape[-1] == len(
110
  phone
 
113
  phone = torch.LongTensor(phone)
114
  tone = torch.LongTensor(tone)
115
  language = torch.LongTensor(language)
116
+ return bert, ja_bert, en_bert, phone, tone, language
117
 
118
+ def get_emo_(reference_audio, emotion):
119
+ emo = (
120
+ torch.from_numpy(get_emo(reference_audio))
121
+ if reference_audio
122
+ else torch.Tensor([emotion])
123
+ )
124
+ return emo
125
+
126
+ def get_emo(path):
127
+ wav, sr = librosa.load(path, 16000)
128
+ device = config.bert_gen_config.device
129
+ return process_func(
130
+ np.expand_dims(wav, 0).astype(np.float),
131
+ sr,
132
+ emotional_model,
133
+ emotional_processor,
134
+ device,
135
+ embeddings=True,
136
+ ).squeeze(0)
137
+
138
+ def infer(
139
+ text,
140
+ sdp_ratio,
141
+ noise_scale,
142
+ noise_scale_w,
143
+ length_scale,
144
+ sid,
145
+ reference_audio=None,
146
+ emotion=None,
147
+ ):
148
 
149
+ language= 'JP' if is_japanese(text) else 'ZH'
150
+ bert, ja_bert, en_bert, phones, tones, lang_ids = get_text(
151
+ text, language, hps, device
152
+ )
153
+ emo = get_emo_(reference_audio, emotion)
154
  with torch.no_grad():
155
  x_tst = phones.to(device).unsqueeze(0)
156
  tones = tones.to(device).unsqueeze(0)
157
  lang_ids = lang_ids.to(device).unsqueeze(0)
158
  bert = bert.to(device).unsqueeze(0)
159
  ja_bert = ja_bert.to(device).unsqueeze(0)
160
+ en_bert = en_bert.to(device).unsqueeze(0)
161
  x_tst_lengths = torch.LongTensor([phones.size(0)]).to(device)
162
+ emo = emo.to(device).unsqueeze(0)
163
  del phones
164
  speakers = torch.LongTensor([hps.data.spk2id[sid]]).to(device)
165
  audio = (
 
171
  lang_ids,
172
  bert,
173
  ja_bert,
174
+ en_bert,
175
+ emo,
176
  sdp_ratio=sdp_ratio,
177
  noise_scale=noise_scale,
178
  noise_scale_w=noise_scale_w,
 
182
  .float()
183
  .numpy()
184
  )
185
+ del x_tst, tones, lang_ids, bert, x_tst_lengths, speakers, ja_bert, en_bert, emo
186
+ if torch.cuda.is_available():
 
187
  torch.cuda.empty_cache()
188
+ return (hps.data.sampling_rate,audio)
 
189
 
190
+ def is_japanese(string):
191
+ for ch in string:
192
+ if ord(ch) > 0x3040 and ord(ch) < 0x30FF:
193
+ return True
194
+ return False
 
195
 
196
  def loadmodel(model):
197
  _ = net_g.eval()
198
  _ = utils.load_checkpoint(model, net_g, None, skip_optimizer=True)
199
  return "success"
200
 
 
201
  if __name__ == "__main__":
202
+ emotional_model_name = "./emotional/wav2vec2-large-robust-12-ft-emotion-msp-dim"
203
+ REPO_ID = "audeering/wav2vec2-large-robust-12-ft-emotion-msp-dim"
204
+ emotional_processor = Wav2Vec2Processor.from_pretrained(emotional_model_name)
205
+ emotional_model = EmotionModel.from_pretrained(emotional_model_name).to(device)
206
+ hps = utils.get_hparams_from_file('Data/BanGDream/configs/config.json')
207
+ net_g = get_net_g(
208
+ model_path='Data/BangDream/models/G_49000.pth', version="2.1", device=device, hps=hps
 
 
 
 
 
209
  )
 
210
  speaker_ids = hps.data.spk2id
211
  speakers = list(speaker_ids.keys())
212
+ languages = [ "Auto", "ZH", "JP"]
 
 
 
213
  modelPaths = []
214
+ for dirpath, dirnames, filenames in os.walk("Data/BanGDream/models/"):
215
  for filename in filenames:
216
  modelPaths.append(os.path.join(dirpath, filename))
217
  with gr.Blocks() as app:
 
 
 
218
  for band in BandList:
219
  with gr.TabItem(band):
220
  for name in BandList[band]:
 
230
  length_scale = gr.Slider(
231
  minimum=0.1, maximum=2, value=1, step=0.01, label="语速调节"
232
  )
233
+ emotion = gr.Slider(
234
+ minimum=0, maximum=9, value=0, step=1, label="Emotion"
235
+ )
236
+ with gr.Accordion(label="参数设定", open=False):
 
 
 
 
 
 
 
 
 
 
237
  sdp_ratio = gr.Slider(
238
  minimum=0, maximum=1, value=0.2, step=0.01, label="SDP/DP混合比"
239
  )
 
243
  noise_scale_w = gr.Slider(
244
  minimum=0.1, maximum=2, value=0.8, step=0.01, label="音素长度"
245
  )
 
246
  speaker = gr.Dropdown(
247
  choices=speakers, value=name, label="说话人"
248
+ )
249
+ with gr.Accordion(label="切换模型", open=False):
250
+ modelstrs = gr.Dropdown(label = "模型", choices = modelPaths, value = modelPaths[0], type = "value")
251
+ btnMod = gr.Button("载入模型")
252
+ statusa = gr.TextArea()
253
+ btnMod.click(loadmodel, inputs=[modelstrs], outputs = [statusa])
254
+ with gr.Column():
255
+ text = gr.TextArea(
256
+ label="输入纯日语或者中文",
257
+ placeholder="输入纯日语或者中文",
258
+ value="为什么要演奏春日影!",
259
+ )
260
+ reference_audio = gr.Audio(label="情感参考音频(WAV 格式):用于生成语音的情感参考。(WAV 格式)", type="filepath")
261
+ btn = gr.Button("点击生成", variant="primary")
262
+ audio_output = gr.Audio(label="Output Audio")
263
+ '''
264
+ btntran = gr.Button("快速中翻日")
265
+ translateResult = gr.TextArea("从这复制翻译后的文本")
266
+ btntran.click(translate, inputs=[text], outputs = [translateResult])
267
+ '''
268
  btn.click(
269
+ infer,
270
  inputs=[
271
  text,
 
272
  sdp_ratio,
273
  noise_scale,
274
  noise_scale_w,
275
  length_scale,
 
276
  speaker,
277
+ reference_audio,
278
+ emotion,
 
 
 
 
279
  ],
280
+ outputs=[audio_output],
281
  )
282
+
283
+ print("推理页面已开启!")
284
+ app.launch()
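For reference, a hedged sketch of calling the new infer() defined above outside of the Gradio UI, assuming the module-level setup in app.py's __main__ block has already run (hps, net_g, emotional_model and emotional_processor loaded). The argument values mirror the slider defaults and are illustrative only.

```python
# Illustrative only: relies on the globals initialized in app.py's __main__ block.
sampling_rate, audio = infer(
    "为什么要演奏春日影!",   # text; Japanese vs. Chinese is auto-detected by is_japanese()
    sdp_ratio=0.2,
    noise_scale=0.6,
    noise_scale_w=0.8,
    length_scale=1.0,
    sid="燈",                 # any key of hps.data.spk2id
    reference_audio=None,     # or a WAV path used as an emotion reference
    emotion=0,                # 0-9 slider value used when no reference audio is given
)

# Write the result to disk, as the earlier version of app.py did with scipy.
from scipy.io.wavfile import write
write("voice.wav", sampling_rate, audio)
```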
attentions_onnx.py ADDED
@@ -0,0 +1,378 @@
1
+ import math
2
+ import torch
3
+ from torch import nn
4
+ from torch.nn import functional as F
5
+
6
+ import commons
7
+ import logging
8
+
9
+ logger = logging.getLogger(__name__)
10
+
11
+
12
+ class LayerNorm(nn.Module):
13
+ def __init__(self, channels, eps=1e-5):
14
+ super().__init__()
15
+ self.channels = channels
16
+ self.eps = eps
17
+
18
+ self.gamma = nn.Parameter(torch.ones(channels))
19
+ self.beta = nn.Parameter(torch.zeros(channels))
20
+
21
+ def forward(self, x):
22
+ x = x.transpose(1, -1)
23
+ x = F.layer_norm(x, (self.channels,), self.gamma, self.beta, self.eps)
24
+ return x.transpose(1, -1)
25
+
26
+
27
+ @torch.jit.script
28
+ def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels):
29
+ n_channels_int = n_channels[0]
30
+ in_act = input_a + input_b
31
+ t_act = torch.tanh(in_act[:, :n_channels_int, :])
32
+ s_act = torch.sigmoid(in_act[:, n_channels_int:, :])
33
+ acts = t_act * s_act
34
+ return acts
35
+
36
+
37
+ class Encoder(nn.Module):
38
+ def __init__(
39
+ self,
40
+ hidden_channels,
41
+ filter_channels,
42
+ n_heads,
43
+ n_layers,
44
+ kernel_size=1,
45
+ p_dropout=0.0,
46
+ window_size=4,
47
+ isflow=True,
48
+ **kwargs
49
+ ):
50
+ super().__init__()
51
+ self.hidden_channels = hidden_channels
52
+ self.filter_channels = filter_channels
53
+ self.n_heads = n_heads
54
+ self.n_layers = n_layers
55
+ self.kernel_size = kernel_size
56
+ self.p_dropout = p_dropout
57
+ self.window_size = window_size
58
+ # if isflow:
59
+ # cond_layer = torch.nn.Conv1d(256, 2*hidden_channels*n_layers, 1)
60
+ # self.cond_pre = torch.nn.Conv1d(hidden_channels, 2*hidden_channels, 1)
61
+ # self.cond_layer = weight_norm(cond_layer, name='weight')
62
+ # self.gin_channels = 256
63
+ self.cond_layer_idx = self.n_layers
64
+ if "gin_channels" in kwargs:
65
+ self.gin_channels = kwargs["gin_channels"]
66
+ if self.gin_channels != 0:
67
+ self.spk_emb_linear = nn.Linear(self.gin_channels, self.hidden_channels)
68
+ # vits2 says 3rd block, so idx is 2 by default
69
+ self.cond_layer_idx = (
70
+ kwargs["cond_layer_idx"] if "cond_layer_idx" in kwargs else 2
71
+ )
72
+ logging.debug(self.gin_channels, self.cond_layer_idx)
73
+ assert (
74
+ self.cond_layer_idx < self.n_layers
75
+ ), "cond_layer_idx should be less than n_layers"
76
+ self.drop = nn.Dropout(p_dropout)
77
+ self.attn_layers = nn.ModuleList()
78
+ self.norm_layers_1 = nn.ModuleList()
79
+ self.ffn_layers = nn.ModuleList()
80
+ self.norm_layers_2 = nn.ModuleList()
81
+ for i in range(self.n_layers):
82
+ self.attn_layers.append(
83
+ MultiHeadAttention(
84
+ hidden_channels,
85
+ hidden_channels,
86
+ n_heads,
87
+ p_dropout=p_dropout,
88
+ window_size=window_size,
89
+ )
90
+ )
91
+ self.norm_layers_1.append(LayerNorm(hidden_channels))
92
+ self.ffn_layers.append(
93
+ FFN(
94
+ hidden_channels,
95
+ hidden_channels,
96
+ filter_channels,
97
+ kernel_size,
98
+ p_dropout=p_dropout,
99
+ )
100
+ )
101
+ self.norm_layers_2.append(LayerNorm(hidden_channels))
102
+
103
+ def forward(self, x, x_mask, g=None):
104
+ attn_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1)
105
+ x = x * x_mask
106
+ for i in range(self.n_layers):
107
+ if i == self.cond_layer_idx and g is not None:
108
+ g = self.spk_emb_linear(g.transpose(1, 2))
109
+ g = g.transpose(1, 2)
110
+ x = x + g
111
+ x = x * x_mask
112
+ y = self.attn_layers[i](x, x, attn_mask)
113
+ y = self.drop(y)
114
+ x = self.norm_layers_1[i](x + y)
115
+
116
+ y = self.ffn_layers[i](x, x_mask)
117
+ y = self.drop(y)
118
+ x = self.norm_layers_2[i](x + y)
119
+ x = x * x_mask
120
+ return x
121
+
122
+
123
+ class MultiHeadAttention(nn.Module):
124
+ def __init__(
125
+ self,
126
+ channels,
127
+ out_channels,
128
+ n_heads,
129
+ p_dropout=0.0,
130
+ window_size=None,
131
+ heads_share=True,
132
+ block_length=None,
133
+ proximal_bias=False,
134
+ proximal_init=False,
135
+ ):
136
+ super().__init__()
137
+ assert channels % n_heads == 0
138
+
139
+ self.channels = channels
140
+ self.out_channels = out_channels
141
+ self.n_heads = n_heads
142
+ self.p_dropout = p_dropout
143
+ self.window_size = window_size
144
+ self.heads_share = heads_share
145
+ self.block_length = block_length
146
+ self.proximal_bias = proximal_bias
147
+ self.proximal_init = proximal_init
148
+ self.attn = None
149
+
150
+ self.k_channels = channels // n_heads
151
+ self.conv_q = nn.Conv1d(channels, channels, 1)
152
+ self.conv_k = nn.Conv1d(channels, channels, 1)
153
+ self.conv_v = nn.Conv1d(channels, channels, 1)
154
+ self.conv_o = nn.Conv1d(channels, out_channels, 1)
155
+ self.drop = nn.Dropout(p_dropout)
156
+
157
+ if window_size is not None:
158
+ n_heads_rel = 1 if heads_share else n_heads
159
+ rel_stddev = self.k_channels**-0.5
160
+ self.emb_rel_k = nn.Parameter(
161
+ torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels)
162
+ * rel_stddev
163
+ )
164
+ self.emb_rel_v = nn.Parameter(
165
+ torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels)
166
+ * rel_stddev
167
+ )
168
+
169
+ nn.init.xavier_uniform_(self.conv_q.weight)
170
+ nn.init.xavier_uniform_(self.conv_k.weight)
171
+ nn.init.xavier_uniform_(self.conv_v.weight)
172
+ if proximal_init:
173
+ with torch.no_grad():
174
+ self.conv_k.weight.copy_(self.conv_q.weight)
175
+ self.conv_k.bias.copy_(self.conv_q.bias)
176
+
177
+ def forward(self, x, c, attn_mask=None):
178
+ q = self.conv_q(x)
179
+ k = self.conv_k(c)
180
+ v = self.conv_v(c)
181
+
182
+ x, self.attn = self.attention(q, k, v, mask=attn_mask)
183
+
184
+ x = self.conv_o(x)
185
+ return x
186
+
187
+ def attention(self, query, key, value, mask=None):
188
+ # reshape [b, d, t] -> [b, n_h, t, d_k]
189
+ b, d, t_s, t_t = (*key.size(), query.size(2))
190
+ query = query.view(b, self.n_heads, self.k_channels, t_t).transpose(2, 3)
191
+ key = key.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3)
192
+ value = value.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3)
193
+
194
+ scores = torch.matmul(query / math.sqrt(self.k_channels), key.transpose(-2, -1))
195
+ if self.window_size is not None:
196
+ assert (
197
+ t_s == t_t
198
+ ), "Relative attention is only available for self-attention."
199
+ key_relative_embeddings = self._get_relative_embeddings(self.emb_rel_k, t_s)
200
+ rel_logits = self._matmul_with_relative_keys(
201
+ query / math.sqrt(self.k_channels), key_relative_embeddings
202
+ )
203
+ scores_local = self._relative_position_to_absolute_position(rel_logits)
204
+ scores = scores + scores_local
205
+ if self.proximal_bias:
206
+ assert t_s == t_t, "Proximal bias is only available for self-attention."
207
+ scores = scores + self._attention_bias_proximal(t_s).to(
208
+ device=scores.device, dtype=scores.dtype
209
+ )
210
+ if mask is not None:
211
+ scores = scores.masked_fill(mask == 0, -1e4)
212
+ if self.block_length is not None:
213
+ assert (
214
+ t_s == t_t
215
+ ), "Local attention is only available for self-attention."
216
+ block_mask = (
217
+ torch.ones_like(scores)
218
+ .triu(-self.block_length)
219
+ .tril(self.block_length)
220
+ )
221
+ scores = scores.masked_fill(block_mask == 0, -1e4)
222
+ p_attn = F.softmax(scores, dim=-1) # [b, n_h, t_t, t_s]
223
+ p_attn = self.drop(p_attn)
224
+ output = torch.matmul(p_attn, value)
225
+ if self.window_size is not None:
226
+ relative_weights = self._absolute_position_to_relative_position(p_attn)
227
+ value_relative_embeddings = self._get_relative_embeddings(
228
+ self.emb_rel_v, t_s
229
+ )
230
+ output = output + self._matmul_with_relative_values(
231
+ relative_weights, value_relative_embeddings
232
+ )
233
+ output = (
234
+ output.transpose(2, 3).contiguous().view(b, d, t_t)
235
+ ) # [b, n_h, t_t, d_k] -> [b, d, t_t]
236
+ return output, p_attn
237
+
238
+ def _matmul_with_relative_values(self, x, y):
239
+ """
240
+ x: [b, h, l, m]
241
+ y: [h or 1, m, d]
242
+ ret: [b, h, l, d]
243
+ """
244
+ ret = torch.matmul(x, y.unsqueeze(0))
245
+ return ret
246
+
247
+ def _matmul_with_relative_keys(self, x, y):
248
+ """
249
+ x: [b, h, l, d]
250
+ y: [h or 1, m, d]
251
+ ret: [b, h, l, m]
252
+ """
253
+ ret = torch.matmul(x, y.unsqueeze(0).transpose(-2, -1))
254
+ return ret
255
+
256
+ def _get_relative_embeddings(self, relative_embeddings, length):
257
+ max_relative_position = 2 * self.window_size + 1
258
+ # Pad first before slice to avoid using cond ops.
259
+ pad_length = max(length - (self.window_size + 1), 0)
260
+ slice_start_position = max((self.window_size + 1) - length, 0)
261
+ slice_end_position = slice_start_position + 2 * length - 1
262
+ if pad_length > 0:
263
+ padded_relative_embeddings = F.pad(
264
+ relative_embeddings,
265
+ commons.convert_pad_shape([[0, 0], [pad_length, pad_length], [0, 0]]),
266
+ )
267
+ else:
268
+ padded_relative_embeddings = relative_embeddings
269
+ used_relative_embeddings = padded_relative_embeddings[
270
+ :, slice_start_position:slice_end_position
271
+ ]
272
+ return used_relative_embeddings
273
+
274
+ def _relative_position_to_absolute_position(self, x):
275
+ """
276
+ x: [b, h, l, 2*l-1]
277
+ ret: [b, h, l, l]
278
+ """
279
+ batch, heads, length, _ = x.size()
280
+ # Concat columns of pad to shift from relative to absolute indexing.
281
+ x = F.pad(x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, 1]]))
282
+
283
+ # Concat extra elements so to add up to shape (len+1, 2*len-1).
284
+ x_flat = x.view([batch, heads, length * 2 * length])
285
+ x_flat = F.pad(
286
+ x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [0, length - 1]])
287
+ )
288
+
289
+ # Reshape and slice out the padded elements.
290
+ x_final = x_flat.view([batch, heads, length + 1, 2 * length - 1])[
291
+ :, :, :length, length - 1 :
292
+ ]
293
+ return x_final
294
+
295
+ def _absolute_position_to_relative_position(self, x):
296
+ """
297
+ x: [b, h, l, l]
298
+ ret: [b, h, l, 2*l-1]
299
+ """
300
+ batch, heads, length, _ = x.size()
301
+ # padd along column
302
+ x = F.pad(
303
+ x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, length - 1]])
304
+ )
305
+ x_flat = x.view([batch, heads, length**2 + length * (length - 1)])
306
+ # add 0's in the beginning that will skew the elements after reshape
307
+ x_flat = F.pad(x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [length, 0]]))
308
+ x_final = x_flat.view([batch, heads, length, 2 * length])[:, :, :, 1:]
309
+ return x_final
310
+
311
+ def _attention_bias_proximal(self, length):
312
+ """Bias for self-attention to encourage attention to close positions.
313
+ Args:
314
+ length: an integer scalar.
315
+ Returns:
316
+ a Tensor with shape [1, 1, length, length]
317
+ """
318
+ r = torch.arange(length, dtype=torch.float32)
319
+ diff = torch.unsqueeze(r, 0) - torch.unsqueeze(r, 1)
320
+ return torch.unsqueeze(torch.unsqueeze(-torch.log1p(torch.abs(diff)), 0), 0)
321
+
322
+
323
+ class FFN(nn.Module):
324
+ def __init__(
325
+ self,
326
+ in_channels,
327
+ out_channels,
328
+ filter_channels,
329
+ kernel_size,
330
+ p_dropout=0.0,
331
+ activation=None,
332
+ causal=False,
333
+ ):
334
+ super().__init__()
335
+ self.in_channels = in_channels
336
+ self.out_channels = out_channels
337
+ self.filter_channels = filter_channels
338
+ self.kernel_size = kernel_size
339
+ self.p_dropout = p_dropout
340
+ self.activation = activation
341
+ self.causal = causal
342
+
343
+ if causal:
344
+ self.padding = self._causal_padding
345
+ else:
346
+ self.padding = self._same_padding
347
+
348
+ self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size)
349
+ self.conv_2 = nn.Conv1d(filter_channels, out_channels, kernel_size)
350
+ self.drop = nn.Dropout(p_dropout)
351
+
352
+ def forward(self, x, x_mask):
353
+ x = self.conv_1(self.padding(x * x_mask))
354
+ if self.activation == "gelu":
355
+ x = x * torch.sigmoid(1.702 * x)
356
+ else:
357
+ x = torch.relu(x)
358
+ x = self.drop(x)
359
+ x = self.conv_2(self.padding(x * x_mask))
360
+ return x * x_mask
361
+
362
+ def _causal_padding(self, x):
363
+ if self.kernel_size == 1:
364
+ return x
365
+ pad_l = self.kernel_size - 1
366
+ pad_r = 0
367
+ padding = [[0, 0], [0, 0], [pad_l, pad_r]]
368
+ x = F.pad(x, commons.convert_pad_shape(padding))
369
+ return x
370
+
371
+ def _same_padding(self, x):
372
+ if self.kernel_size == 1:
373
+ return x
374
+ pad_l = (self.kernel_size - 1) // 2
375
+ pad_r = self.kernel_size // 2
376
+ padding = [[0, 0], [0, 0], [pad_l, pad_r]]
377
+ x = F.pad(x, commons.convert_pad_shape(padding))
378
+ return x
bert/bert-base-japanese-v3/README.md CHANGED
@@ -50,4 +50,4 @@ The pretrained models are distributed under the Apache License 2.0.
 
  ## Acknowledgments
 
- This model is trained with Cloud TPUs provided by [TPU Research Cloud](https://sites.research.google/trc/about/) program.

 
  ## Acknowledgments
 
+ This model is trained with Cloud TPUs provided by [TPU Research Cloud](https://sites.research.google/trc/about/) program.
bert/bert-base-japanese-v3/vocab.txt CHANGED
@@ -13,7 +13,7 @@
  [unused7]
  [unused8]
  [unused9]
-
  !
  "
  #

  [unused7]
  [unused8]
  [unused9]
+
  !
  "
  #
bert/bert-large-japanese-v2/.gitattributes ADDED
@@ -0,0 +1,34 @@
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tflite filter=lfs diff=lfs merge=lfs -text
29
+ *.tgz filter=lfs diff=lfs merge=lfs -text
30
+ *.wasm filter=lfs diff=lfs merge=lfs -text
31
+ *.xz filter=lfs diff=lfs merge=lfs -text
32
+ *.zip filter=lfs diff=lfs merge=lfs -text
33
+ *.zst filter=lfs diff=lfs merge=lfs -text
34
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
bert/bert-large-japanese-v2/README.md ADDED
@@ -0,0 +1,53 @@
1
+ ---
2
+ license: apache-2.0
3
+ datasets:
4
+ - cc100
5
+ - wikipedia
6
+ language:
7
+ - ja
8
+ widget:
9
+ - text: 東北大学で[MASK]の研究をしています。
10
+ ---
11
+
12
+ # BERT large Japanese (unidic-lite with whole word masking, CC-100 and jawiki-20230102)
13
+
14
+ This is a [BERT](https://github.com/google-research/bert) model pretrained on texts in the Japanese language.
15
+
16
+ This version of the model processes input texts with word-level tokenization based on the Unidic 2.1.2 dictionary (available in [unidic-lite](https://pypi.org/project/unidic-lite/) package), followed by the WordPiece subword tokenization.
17
+ Additionally, the model is trained with the whole word masking enabled for the masked language modeling (MLM) objective.
18
+
19
+ The codes for the pretraining are available at [cl-tohoku/bert-japanese](https://github.com/cl-tohoku/bert-japanese/).
20
+
21
+ ## Model architecture
22
+
23
+ The model architecture is the same as the original BERT large model; 24 layers, 1024 dimensions of hidden states, and 16 attention heads.
24
+
25
+ ## Training Data
26
+
27
+ The model is trained on the Japanese portion of [CC-100 dataset](https://data.statmt.org/cc-100/) and the Japanese version of Wikipedia.
28
+ For Wikipedia, we generated a text corpus from the [Wikipedia Cirrussearch dump file](https://dumps.wikimedia.org/other/cirrussearch/) as of January 2, 2023.
29
+ The corpus files generated from CC-100 and Wikipedia are 74.3GB and 4.9GB in size and consist of approximately 392M and 34M sentences, respectively.
30
+
31
+ For the purpose of splitting texts into sentences, we used [fugashi](https://github.com/polm/fugashi) with [mecab-ipadic-NEologd](https://github.com/neologd/mecab-ipadic-neologd) dictionary (v0.0.7).
32
+
33
+ ## Tokenization
34
+
35
+ The texts are first tokenized by MeCab with the Unidic 2.1.2 dictionary and then split into subwords by the WordPiece algorithm.
36
+ The vocabulary size is 32768.
37
+
38
+ We used [fugashi](https://github.com/polm/fugashi) and [unidic-lite](https://github.com/polm/unidic-lite) packages for the tokenization.
39
+
40
+ ## Training
41
+
42
+ We trained the model first on the CC-100 corpus for 1M steps and then on the Wikipedia corpus for another 1M steps.
43
+ For training of the MLM (masked language modeling) objective, we introduced whole word masking in which all of the subword tokens corresponding to a single word (tokenized by MeCab) are masked at once.
44
+
45
+ For training of each model, we used a v3-8 instance of Cloud TPUs provided by [TPU Research Cloud](https://sites.research.google/trc/about/).
46
+
47
+ ## Licenses
48
+
49
+ The pretrained models are distributed under the Apache License 2.0.
50
+
51
+ ## Acknowledgments
52
+
53
+ This model is trained with Cloud TPUs provided by [TPU Research Cloud](https://sites.research.google/trc/about/) program.
bert/bert-large-japanese-v2/config.json ADDED
@@ -0,0 +1,19 @@
1
+ {
2
+ "architectures": [
3
+ "BertForPreTraining"
4
+ ],
5
+ "attention_probs_dropout_prob": 0.1,
6
+ "hidden_act": "gelu",
7
+ "hidden_dropout_prob": 0.1,
8
+ "hidden_size": 1024,
9
+ "initializer_range": 0.02,
10
+ "intermediate_size": 4096,
11
+ "layer_norm_eps": 1e-12,
12
+ "max_position_embeddings": 512,
13
+ "model_type": "bert",
14
+ "num_attention_heads": 16,
15
+ "num_hidden_layers": 24,
16
+ "pad_token_id": 0,
17
+ "type_vocab_size": 2,
18
+ "vocab_size": 32768
19
+ }
bert/bert-large-japanese-v2/tokenizer_config.json ADDED
@@ -0,0 +1,10 @@
1
+ {
2
+ "tokenizer_class": "BertJapaneseTokenizer",
3
+ "model_max_length": 512,
4
+ "do_lower_case": false,
5
+ "word_tokenizer_type": "mecab",
6
+ "subword_tokenizer_type": "wordpiece",
7
+ "mecab_kwargs": {
8
+ "mecab_dic": "unidic_lite"
9
+ }
10
+ }
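A hedged sketch (not part of the commit) showing how a tokenizer configured like this — BertJapaneseTokenizer with MeCab word tokenization over unidic-lite followed by WordPiece — can be loaded with transformers. It assumes the fugashi and unidic-lite packages mentioned in the model card are installed and that the vocab file sits next to this config.

```python
from transformers import AutoTokenizer

# Loads tokenizer_config.json / vocab.txt from the local folder added in this commit;
# the MeCab word tokenizer needs `pip install fugashi unidic-lite`.
tokenizer = AutoTokenizer.from_pretrained("./bert/bert-large-japanese-v2")

tokens = tokenizer.tokenize("東北大学で自然言語処理の研究をしています。")
print(tokens)                # MeCab word split, then WordPiece subwords
print(tokenizer.mask_token)  # "[MASK]"
```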
bert/bert-large-japanese-v2/vocab.txt ADDED
The diff for this file is too large to render. See raw diff
 
bert/bert_models.json ADDED
@@ -0,0 +1,14 @@
1
+ {
2
+ "deberta-v2-large-japanese-char-wwm": {
3
+ "repo_id": "ku-nlp/deberta-v2-large-japanese-char-wwm",
4
+ "files": ["pytorch_model.bin"]
5
+ },
6
+ "chinese-roberta-wwm-ext-large": {
7
+ "repo_id": "hfl/chinese-roberta-wwm-ext-large",
8
+ "files": ["pytorch_model.bin"]
9
+ },
10
+ "deberta-v3-large": {
11
+ "repo_id": "microsoft/deberta-v3-large",
12
+ "files": ["spm.model", "pytorch_model.bin"]
13
+ }
14
+ }
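bert_models.json maps each local bert/ subfolder to a Hugging Face repo and the large files that are not checked in directly (e.g. pytorch_model.bin). A hedged sketch of how such a manifest could be used to fetch the missing weights with huggingface_hub — the helper below is illustrative, not the project's own downloader:

```python
import json
import os

from huggingface_hub import hf_hub_download


def download_bert_weights(manifest_path: str = "bert/bert_models.json") -> None:
    """Fetch the files listed in bert_models.json into the matching bert/ folders."""
    with open(manifest_path, encoding="utf-8") as f:
        manifest = json.load(f)
    for local_name, entry in manifest.items():
        target_dir = os.path.join("bert", local_name)
        for filename in entry["files"]:
            hf_hub_download(
                repo_id=entry["repo_id"],
                filename=filename,
                local_dir=target_dir,
            )


if __name__ == "__main__":
    download_bert_weights()
```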
bert/chinese-roberta-wwm-ext-large/README.md CHANGED
@@ -1,5 +1,5 @@
1
  ---
2
- language:
3
  - zh
4
  tags:
5
  - bert
@@ -9,9 +9,9 @@ license: "apache-2.0"
9
  # Please use 'Bert' related functions to load this model!
10
 
11
  ## Chinese BERT with Whole Word Masking
12
- For further accelerating Chinese natural language processing, we provide **Chinese pre-trained BERT with Whole Word Masking**.
13
 
14
- **[Pre-Training with Whole Word Masking for Chinese BERT](https://arxiv.org/abs/1906.08101)**
15
  Yiming Cui, Wanxiang Che, Ting Liu, Bing Qin, Ziqing Yang, Shijin Wang, Guoping Hu
16
 
17
  This repository is developed based on:https://github.com/google-research/bert
@@ -46,7 +46,7 @@ If you find the technical report or resource is useful, please cite the followin
46
  pages = "657--668",
47
  }
48
  ```
49
- - Secondary: https://arxiv.org/abs/1906.08101
50
  ```
51
  @article{chinese-bert-wwm,
52
  title={Pre-Training with Whole Word Masking for Chinese BERT},
@@ -54,4 +54,4 @@ If you find the technical report or resource is useful, please cite the followin
54
  journal={arXiv preprint arXiv:1906.08101},
55
  year={2019}
56
  }
57
- ```
 
1
  ---
2
+ language:
3
  - zh
4
  tags:
5
  - bert
 
9
  # Please use 'Bert' related functions to load this model!
10
 
11
  ## Chinese BERT with Whole Word Masking
12
+ For further accelerating Chinese natural language processing, we provide **Chinese pre-trained BERT with Whole Word Masking**.
13
 
14
+ **[Pre-Training with Whole Word Masking for Chinese BERT](https://arxiv.org/abs/1906.08101)**
15
  Yiming Cui, Wanxiang Che, Ting Liu, Bing Qin, Ziqing Yang, Shijin Wang, Guoping Hu
16
 
17
  This repository is developed based on:https://github.com/google-research/bert
 
46
  pages = "657--668",
47
  }
48
  ```
49
+ - Secondary: https://arxiv.org/abs/1906.08101
50
  ```
51
  @article{chinese-bert-wwm,
52
  title={Pre-Training with Whole Word Masking for Chinese BERT},
 
54
  journal={arXiv preprint arXiv:1906.08101},
55
  year={2019}
56
  }
57
+ ```
bert/chinese-roberta-wwm-ext-large/added_tokens.json CHANGED
@@ -1 +1 @@
1
- {}
 
1
+ {}
bert/chinese-roberta-wwm-ext-large/special_tokens_map.json CHANGED
@@ -1 +1 @@
1
- {"unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]"}
 
1
+ {"unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]"}
bert/chinese-roberta-wwm-ext-large/tokenizer.json CHANGED
The diff for this file is too large to render. See raw diff
 
bert/chinese-roberta-wwm-ext-large/tokenizer_config.json CHANGED
@@ -1 +1 @@
1
- {"init_inputs": []}
 
1
+ {"init_inputs": []}
bert/deberta-v2-large-japanese-char-wwm/.gitattributes ADDED
@@ -0,0 +1,34 @@
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tflite filter=lfs diff=lfs merge=lfs -text
29
+ *.tgz filter=lfs diff=lfs merge=lfs -text
30
+ *.wasm filter=lfs diff=lfs merge=lfs -text
31
+ *.xz filter=lfs diff=lfs merge=lfs -text
32
+ *.zip filter=lfs diff=lfs merge=lfs -text
33
+ *.zst filter=lfs diff=lfs merge=lfs -text
34
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
bert/deberta-v2-large-japanese-char-wwm/README.md ADDED
@@ -0,0 +1,89 @@
1
+ ---
2
+ language: ja
3
+ license: cc-by-sa-4.0
4
+ library_name: transformers
5
+ tags:
6
+ - deberta
7
+ - deberta-v2
8
+ - fill-mask
9
+ - character
10
+ - wwm
11
+ datasets:
12
+ - wikipedia
13
+ - cc100
14
+ - oscar
15
+ metrics:
16
+ - accuracy
17
+ mask_token: "[MASK]"
18
+ widget:
19
+ - text: "京都大学で自然言語処理を[MASK][MASK]する。"
20
+ ---
21
+
22
+ # Model Card for Japanese character-level DeBERTa V2 large
23
+
24
+ ## Model description
25
+
26
+ This is a Japanese DeBERTa V2 large model pre-trained on Japanese Wikipedia, the Japanese portion of CC-100, and the Japanese portion of OSCAR.
27
+ This model is trained with character-level tokenization and whole word masking.
28
+
29
+ ## How to use
30
+
31
+ You can use this model for masked language modeling as follows:
32
+
33
+ ```python
34
+ from transformers import AutoTokenizer, AutoModelForMaskedLM
35
+ tokenizer = AutoTokenizer.from_pretrained('ku-nlp/deberta-v2-large-japanese-char-wwm')
36
+ model = AutoModelForMaskedLM.from_pretrained('ku-nlp/deberta-v2-large-japanese-char-wwm')
37
+
38
+ sentence = '京都大学で自然言語処理を[MASK][MASK]する。'
39
+ encoding = tokenizer(sentence, return_tensors='pt')
40
+ ...
41
+ ```
42
+
43
+ You can also fine-tune this model on downstream tasks.
44
+
45
+ ## Tokenization
46
+
47
+ There is no need to tokenize texts in advance, and you can give raw texts to the tokenizer.
48
+ The texts are tokenized into character-level tokens by [sentencepiece](https://github.com/google/sentencepiece).
49
+
50
+ ## Training data
51
+
52
+ We used the following corpora for pre-training:
53
+
54
+ - Japanese Wikipedia (as of 20221020, 3.2GB, 27M sentences, 1.3M documents)
55
+ - Japanese portion of CC-100 (85GB, 619M sentences, 66M documents)
56
+ - Japanese portion of OSCAR (54GB, 326M sentences, 25M documents)
57
+
58
+ Note that we filtered out documents annotated with "header", "footer", or "noisy" tags in OSCAR.
59
+ Also note that Japanese Wikipedia was duplicated 10 times to make the total size of the corpus comparable to that of CC-100 and OSCAR. As a result, the total size of the training data is 171GB.
60
+
61
+ ## Training procedure
62
+
63
+ We first segmented texts in the corpora into words using [Juman++ 2.0.0-rc3](https://github.com/ku-nlp/jumanpp/releases/tag/v2.0.0-rc3) for whole word masking.
64
+ Then, we built a sentencepiece model with 22,012 tokens including all characters that appear in the training corpus.
65
+
66
+ We tokenized raw corpora into character-level subwords using the sentencepiece model and trained the Japanese DeBERTa model using [transformers](https://github.com/huggingface/transformers) library.
67
+ The training took 26 days using 16 NVIDIA A100-SXM4-40GB GPUs.
68
+
69
+ The following hyperparameters were used during pre-training:
70
+
71
+ - learning_rate: 1e-4
72
+ - per_device_train_batch_size: 26
73
+ - distributed_type: multi-GPU
74
+ - num_devices: 16
75
+ - gradient_accumulation_steps: 8
76
+ - total_train_batch_size: 3,328
77
+ - max_seq_length: 512
78
+ - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-06
79
+ - lr_scheduler_type: linear schedule with warmup (lr = 0 at 300k steps)
80
+ - training_steps: 260,000
81
+ - warmup_steps: 10,000
82
+
83
+ The accuracy of the trained model on the masked language modeling task was 0.795.
84
+ The evaluation set consists of 5,000 randomly sampled documents from each of the training corpora.
85
+
86
+ ## Acknowledgments
87
+
88
+ This work was supported by Joint Usage/Research Center for Interdisciplinary Large-scale Information Infrastructures (JHPCN) through General Collaboration Project no. jh221004, "Developing a Platform for Constructing and Sharing of Large-Scale Japanese Language Models".
89
+ For training models, we used the mdx: a platform for the data-driven future.
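The usage snippet in this model card stops at an ellipsis after building `encoding`. A minimal continuation is sketched below, assuming the standard transformers masked-LM API (logits over the vocabulary at each position); it is not taken from the model card itself.

```python
# Hedged continuation of the README snippet: score the [MASK] positions.
import torch
from transformers import AutoTokenizer, AutoModelForMaskedLM

tokenizer = AutoTokenizer.from_pretrained("ku-nlp/deberta-v2-large-japanese-char-wwm")
model = AutoModelForMaskedLM.from_pretrained("ku-nlp/deberta-v2-large-japanese-char-wwm")

sentence = "京都大学で自然言語処理を[MASK][MASK]する。"
encoding = tokenizer(sentence, return_tensors="pt")

with torch.no_grad():
    logits = model(**encoding).logits  # shape: (1, seq_len, vocab_size)

# Indices of the [MASK] tokens, then the highest-scoring character for each.
mask_positions = (encoding["input_ids"] == tokenizer.mask_token_id).nonzero(as_tuple=True)[1]
for pos in mask_positions:
    top_id = logits[0, pos].argmax(-1).item()
    print(tokenizer.decode([top_id]))
```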
bert/deberta-v2-large-japanese-char-wwm/config.json ADDED
@@ -0,0 +1,37 @@
1
+ {
2
+ "architectures": [
3
+ "DebertaV2ForMaskedLM"
4
+ ],
5
+ "attention_head_size": 64,
6
+ "attention_probs_dropout_prob": 0.1,
7
+ "conv_act": "gelu",
8
+ "conv_kernel_size": 3,
9
+ "hidden_act": "gelu",
10
+ "hidden_dropout_prob": 0.1,
11
+ "hidden_size": 1024,
12
+ "initializer_range": 0.02,
13
+ "intermediate_size": 4096,
14
+ "layer_norm_eps": 1e-07,
15
+ "max_position_embeddings": 512,
16
+ "max_relative_positions": -1,
17
+ "model_type": "deberta-v2",
18
+ "norm_rel_ebd": "layer_norm",
19
+ "num_attention_heads": 16,
20
+ "num_hidden_layers": 24,
21
+ "pad_token_id": 0,
22
+ "pooler_dropout": 0,
23
+ "pooler_hidden_act": "gelu",
24
+ "pooler_hidden_size": 1024,
25
+ "pos_att_type": [
26
+ "p2c",
27
+ "c2p"
28
+ ],
29
+ "position_biased_input": false,
30
+ "position_buckets": 256,
31
+ "relative_attention": true,
32
+ "share_att_key": true,
33
+ "torch_dtype": "float16",
34
+ "transformers_version": "4.25.1",
35
+ "type_vocab_size": 0,
36
+ "vocab_size": 22012
37
+ }
bert/deberta-v2-large-japanese-char-wwm/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bf0dab8ad87bd7c22e85ec71e04f2240804fda6d33196157d6b5923af6ea1201
3
+ size 1318456639
bert/deberta-v2-large-japanese-char-wwm/special_tokens_map.json ADDED
@@ -0,0 +1,7 @@
1
+ {
2
+ "cls_token": "[CLS]",
3
+ "mask_token": "[MASK]",
4
+ "pad_token": "[PAD]",
5
+ "sep_token": "[SEP]",
6
+ "unk_token": "[UNK]"
7
+ }
bert/deberta-v2-large-japanese-char-wwm/tokenizer_config.json ADDED
@@ -0,0 +1,19 @@
1
+ {
2
+ "cls_token": "[CLS]",
3
+ "do_lower_case": false,
4
+ "do_subword_tokenize": true,
5
+ "do_word_tokenize": true,
6
+ "jumanpp_kwargs": null,
7
+ "mask_token": "[MASK]",
8
+ "mecab_kwargs": null,
9
+ "model_max_length": 1000000000000000019884624838656,
10
+ "never_split": null,
11
+ "pad_token": "[PAD]",
12
+ "sep_token": "[SEP]",
13
+ "special_tokens_map_file": null,
14
+ "subword_tokenizer_type": "character",
15
+ "sudachi_kwargs": null,
16
+ "tokenizer_class": "BertJapaneseTokenizer",
17
+ "unk_token": "[UNK]",
18
+ "word_tokenizer_type": "basic"
19
+ }
bert/deberta-v2-large-japanese-char-wwm/vocab.txt ADDED
The diff for this file is too large to render. See raw diff
 
bert/deberta-v2-large-japanese/.gitattributes ADDED
@@ -0,0 +1,34 @@
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tflite filter=lfs diff=lfs merge=lfs -text
29
+ *.tgz filter=lfs diff=lfs merge=lfs -text
30
+ *.wasm filter=lfs diff=lfs merge=lfs -text
31
+ *.xz filter=lfs diff=lfs merge=lfs -text
32
+ *.zip filter=lfs diff=lfs merge=lfs -text
33
+ *.zst filter=lfs diff=lfs merge=lfs -text
34
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
bert/deberta-v2-large-japanese/README.md ADDED
@@ -0,0 +1,111 @@
1
+ ---
2
+ language: ja
3
+ license: cc-by-sa-4.0
4
+ library_name: transformers
5
+ tags:
6
+ - deberta
7
+ - deberta-v2
8
+ - fill-mask
9
+ datasets:
10
+ - wikipedia
11
+ - cc100
12
+ - oscar
13
+ metrics:
14
+ - accuracy
15
+ mask_token: "[MASK]"
16
+ widget:
17
+ - text: "京都 大学 で 自然 言語 処理 を [MASK] する 。"
18
+ ---
19
+
20
+ # Model Card for Japanese DeBERTa V2 large
21
+
22
+ ## Model description
23
+
24
+ This is a Japanese DeBERTa V2 large model pre-trained on Japanese Wikipedia, the Japanese portion of CC-100, and the
25
+ Japanese portion of OSCAR.
26
+
27
+ ## How to use
28
+
29
+ You can use this model for masked language modeling as follows:
30
+
31
+ ```python
32
+ from transformers import AutoTokenizer, AutoModelForMaskedLM
33
+
34
+ tokenizer = AutoTokenizer.from_pretrained('ku-nlp/deberta-v2-large-japanese')
35
+ model = AutoModelForMaskedLM.from_pretrained('ku-nlp/deberta-v2-large-japanese')
36
+
37
+ sentence = '京都 大学 で 自然 言語 処理 を [MASK] する 。' # input should be segmented into words by Juman++ in advance
38
+ encoding = tokenizer(sentence, return_tensors='pt')
39
+ ...
40
+ ```
41
+
42
+ You can also fine-tune this model on downstream tasks.
43
+
44
+ ## Tokenization
45
+
46
+ The input text should be segmented into words by [Juman++](https://github.com/ku-nlp/jumanpp) in
47
+ advance. [Juman++ 2.0.0-rc3](https://github.com/ku-nlp/jumanpp/releases/tag/v2.0.0-rc3) was used for pre-training. Each
48
+ word is tokenized into subwords by [sentencepiece](https://github.com/google/sentencepiece).
49
+
50
+ ## Training data
51
+
52
+ We used the following corpora for pre-training:
53
+
54
+ - Japanese Wikipedia (as of 20221020, 3.2GB, 27M sentences, 1.3M documents)
55
+ - Japanese portion of CC-100 (85GB, 619M sentences, 66M documents)
56
+ - Japanese portion of OSCAR (54GB, 326M sentences, 25M documents)
57
+
58
+ Note that we filtered out documents annotated with "header", "footer", or "noisy" tags in OSCAR.
59
+ Also note that Japanese Wikipedia was duplicated 10 times to make the total size of the corpus comparable to that of
60
+ CC-100 and OSCAR. As a result, the total size of the training data is 171GB.
61
+
62
+ ## Training procedure
63
+
64
+ We first segmented texts in the corpora into words using [Juman++](https://github.com/ku-nlp/jumanpp).
65
+ Then, we built a sentencepiece model with 32000 tokens including words ([JumanDIC](https://github.com/ku-nlp/JumanDIC))
66
+ and subwords induced by the unigram language model of [sentencepiece](https://github.com/google/sentencepiece).
67
+
68
+ We tokenized the segmented corpora into subwords using the sentencepiece model and trained the Japanese DeBERTa model
69
+ using [transformers](https://github.com/huggingface/transformers) library.
70
+ The training took 36 days using 8 NVIDIA A100-SXM4-40GB GPUs.
71
+
72
+ The following hyperparameters were used during pre-training:
73
+
74
+ - learning_rate: 1e-4
75
+ - per_device_train_batch_size: 18
76
+ - distributed_type: multi-GPU
77
+ - num_devices: 8
78
+ - gradient_accumulation_steps: 16
79
+ - total_train_batch_size: 2,304
80
+ - max_seq_length: 512
81
+ - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-06
82
+ - lr_scheduler_type: linear schedule with warmup
83
+ - training_steps: 300,000
84
+ - warmup_steps: 10,000
85
+
86
+ The accuracy of the trained model on the masked language modeling task was 0.799.
87
+ The evaluation set consists of 5,000 randomly sampled documents from each of the training corpora.
88
+
89
+ ## Fine-tuning on NLU tasks
90
+
91
+ We fine-tuned the following models and evaluated them on the dev set of JGLUE.
92
+ We tuned learning rate and training epochs for each model and task
93
+ following [the JGLUE paper](https://www.jstage.jst.go.jp/article/jnlp/30/1/30_63/_pdf/-char/ja).
94
+
95
+ | Model | MARC-ja/acc | JSTS/pearson | JSTS/spearman | JNLI/acc | JSQuAD/EM | JSQuAD/F1 | JComQA/acc |
96
+ |-------------------------------|-------------|--------------|---------------|----------|-----------|-----------|------------|
97
+ | Waseda RoBERTa base | 0.965 | 0.913 | 0.876 | 0.905 | 0.853 | 0.916 | 0.853 |
98
+ | Waseda RoBERTa large (seq512) | 0.969 | 0.925 | 0.890 | 0.928 | 0.910 | 0.955 | 0.900 |
99
+ | LUKE Japanese base* | 0.965 | 0.916 | 0.877 | 0.912 | - | - | 0.842 |
100
+ | LUKE Japanese large* | 0.965 | 0.932 | 0.902 | 0.927 | - | - | 0.893 |
101
+ | DeBERTaV2 base | 0.970 | 0.922 | 0.886 | 0.922 | 0.899 | 0.951 | 0.873 |
102
+ | DeBERTaV2 large | 0.968 | 0.925 | 0.892 | 0.924 | 0.912 | 0.959 | 0.890 |
103
+
104
+ *The scores of LUKE are from [the official repository](https://github.com/studio-ousia/luke).
105
+
106
+ ## Acknowledgments
107
+
108
+ This work was supported by Joint Usage/Research Center for Interdisciplinary Large-scale Information Infrastructures (
109
+ JHPCN) through General Collaboration Project no. jh221004, "Developing a Platform for Constructing and Sharing of
110
+ Large-Scale Japanese Language Models".
111
+ For training models, we used the mdx: a platform for the data-driven future.
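This model card requires the input to be segmented into words by Juman++ before tokenization. The sketch below shows one way to do that pre-segmentation step with the `pyknp` wrapper; it assumes a local Juman++ installation and is illustrative only, since the commit does not show how this project performs the segmentation.

```python
# Hypothetical pre-segmentation step (assumes pyknp + a jumanpp binary on PATH).
from pyknp import Juman
from transformers import AutoTokenizer

jumanpp = Juman()  # wraps the locally installed Juman++ binary
tokenizer = AutoTokenizer.from_pretrained("ku-nlp/deberta-v2-large-japanese")

raw = "京都大学で自然言語処理を研究する。"
words = [m.midasi for m in jumanpp.analysis(raw).mrph_list()]
segmented = " ".join(words)  # e.g. "京都 大学 で 自然 言語 処理 を 研究 する 。"

encoding = tokenizer(segmented, return_tensors="pt")
print(encoding["input_ids"].shape)
```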
bert/deberta-v2-large-japanese/config.json ADDED
@@ -0,0 +1,38 @@
1
+ {
2
+ "_name_or_path": "configs/deberta_v2_large.json",
3
+ "architectures": [
4
+ "DebertaV2ForMaskedLM"
5
+ ],
6
+ "attention_head_size": 64,
7
+ "attention_probs_dropout_prob": 0.1,
8
+ "conv_act": "gelu",
9
+ "conv_kernel_size": 3,
10
+ "hidden_act": "gelu",
11
+ "hidden_dropout_prob": 0.1,
12
+ "hidden_size": 1024,
13
+ "initializer_range": 0.02,
14
+ "intermediate_size": 4096,
15
+ "layer_norm_eps": 1e-07,
16
+ "max_position_embeddings": 512,
17
+ "max_relative_positions": -1,
18
+ "model_type": "deberta-v2",
19
+ "norm_rel_ebd": "layer_norm",
20
+ "num_attention_heads": 16,
21
+ "num_hidden_layers": 24,
22
+ "pad_token_id": 0,
23
+ "pooler_dropout": 0,
24
+ "pooler_hidden_act": "gelu",
25
+ "pooler_hidden_size": 1024,
26
+ "pos_att_type": [
27
+ "p2c",
28
+ "c2p"
29
+ ],
30
+ "position_biased_input": false,
31
+ "position_buckets": 256,
32
+ "relative_attention": true,
33
+ "share_att_key": true,
34
+ "torch_dtype": "float32",
35
+ "transformers_version": "4.23.1",
36
+ "type_vocab_size": 0,
37
+ "vocab_size": 32000
38
+ }
bert/deberta-v2-large-japanese/special_tokens_map.json ADDED
@@ -0,0 +1,9 @@
1
+ {
2
+ "bos_token": "[CLS]",
3
+ "cls_token": "[CLS]",
4
+ "eos_token": "[SEP]",
5
+ "mask_token": "[MASK]",
6
+ "pad_token": "[PAD]",
7
+ "sep_token": "[SEP]",
8
+ "unk_token": "[UNK]"
9
+ }
bert/deberta-v2-large-japanese/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
bert/deberta-v2-large-japanese/tokenizer_config.json ADDED
@@ -0,0 +1,15 @@
1
+ {
2
+ "bos_token": "[CLS]",
3
+ "cls_token": "[CLS]",
4
+ "do_lower_case": false,
5
+ "eos_token": "[SEP]",
6
+ "keep_accents": true,
7
+ "mask_token": "[MASK]",
8
+ "pad_token": "[PAD]",
9
+ "sep_token": "[SEP]",
10
+ "sp_model_kwargs": {},
11
+ "special_tokens_map_file": null,
12
+ "split_by_punct": false,
13
+ "tokenizer_class": "DebertaV2Tokenizer",
14
+ "unk_token": "[UNK]"
15
+ }
bert/deberta-v3-large/.gitattributes ADDED
@@ -0,0 +1,27 @@
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bin.* filter=lfs diff=lfs merge=lfs -text
5
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.model filter=lfs diff=lfs merge=lfs -text
12
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
13
+ *.onnx filter=lfs diff=lfs merge=lfs -text
14
+ *.ot filter=lfs diff=lfs merge=lfs -text
15
+ *.parquet filter=lfs diff=lfs merge=lfs -text
16
+ *.pb filter=lfs diff=lfs merge=lfs -text
17
+ *.pt filter=lfs diff=lfs merge=lfs -text
18
+ *.pth filter=lfs diff=lfs merge=lfs -text
19
+ *.rar filter=lfs diff=lfs merge=lfs -text
20
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
21
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
22
+ *.tflite filter=lfs diff=lfs merge=lfs -text
23
+ *.tgz filter=lfs diff=lfs merge=lfs -text
24
+ *.xz filter=lfs diff=lfs merge=lfs -text
25
+ *.zip filter=lfs diff=lfs merge=lfs -text
26
+ *.zstandard filter=lfs diff=lfs merge=lfs -text
27
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
bert/deberta-v3-large/README.md ADDED
@@ -0,0 +1,93 @@
1
+ ---
2
+ language: en
3
+ tags:
4
+ - deberta
5
+ - deberta-v3
6
+ - fill-mask
7
+ thumbnail: https://huggingface.co/front/thumbnails/microsoft.png
8
+ license: mit
9
+ ---
10
+
11
+ ## DeBERTaV3: Improving DeBERTa using ELECTRA-Style Pre-Training with Gradient-Disentangled Embedding Sharing
12
+
13
+ [DeBERTa](https://arxiv.org/abs/2006.03654) improves the BERT and RoBERTa models using disentangled attention and an enhanced mask decoder. With those two improvements, DeBERTa outperforms RoBERTa on a majority of NLU tasks with 80GB of training data.
14
+
15
+ In [DeBERTa V3](https://arxiv.org/abs/2111.09543), we further improved the efficiency of DeBERTa using ELECTRA-style pre-training with Gradient-Disentangled Embedding Sharing. Compared to DeBERTa, our V3 version significantly improves model performance on downstream tasks. You can find more technical details about the new model in our [paper](https://arxiv.org/abs/2111.09543).
16
+
17
+ Please check the [official repository](https://github.com/microsoft/DeBERTa) for more implementation details and updates.
18
+
19
+ The DeBERTa V3 large model comes with 24 layers and a hidden size of 1024. It has 304M backbone parameters and a vocabulary of 128K tokens, which introduces 131M parameters in the embedding layer. This model was trained on the same 160GB data as DeBERTa V2.
20
+
21
+
22
+ #### Fine-tuning on NLU tasks
23
+
24
+ We present the dev results on SQuAD 2.0 and MNLI tasks.
25
+
26
+ | Model |Vocabulary(K)|Backbone #Params(M)| SQuAD 2.0(F1/EM) | MNLI-m/mm(ACC)|
27
+ |-------------------|----------|-------------------|-----------|----------|
28
+ | RoBERTa-large |50 |304 | 89.4/86.5 | 90.2 |
29
+ | XLNet-large |32 |- | 90.6/87.9 | 90.8 |
30
+ | DeBERTa-large |50 |- | 90.7/88.0 | 91.3 |
31
+ | **DeBERTa-v3-large**|128|304 | **91.5/89.0**| **91.8/91.9**|
32
+
33
+
34
+ #### Fine-tuning with HF transformers
35
+
36
+ ```bash
37
+ #!/bin/bash
38
+
39
+ cd transformers/examples/pytorch/text-classification/
40
+
41
+ pip install datasets
42
+ export TASK_NAME=mnli
43
+
44
+ output_dir="ds_results"
45
+
46
+ num_gpus=8
47
+
48
+ batch_size=8
49
+
50
+ python -m torch.distributed.launch --nproc_per_node=${num_gpus} \
51
+ run_glue.py \
52
+ --model_name_or_path microsoft/deberta-v3-large \
53
+ --task_name $TASK_NAME \
54
+ --do_train \
55
+ --do_eval \
56
+ --evaluation_strategy steps \
57
+ --max_seq_length 256 \
58
+ --warmup_steps 50 \
59
+ --per_device_train_batch_size ${batch_size} \
60
+ --learning_rate 6e-6 \
61
+ --num_train_epochs 2 \
62
+ --output_dir $output_dir \
63
+ --overwrite_output_dir \
64
+ --logging_steps 1000 \
65
+ --logging_dir $output_dir
66
+
67
+ ```
68
+
69
+ ### Citation
70
+
71
+ If you find DeBERTa useful for your work, please cite the following papers:
72
+
73
+ ``` latex
74
+ @misc{he2021debertav3,
75
+ title={DeBERTaV3: Improving DeBERTa using ELECTRA-Style Pre-Training with Gradient-Disentangled Embedding Sharing},
76
+ author={Pengcheng He and Jianfeng Gao and Weizhu Chen},
77
+ year={2021},
78
+ eprint={2111.09543},
79
+ archivePrefix={arXiv},
80
+ primaryClass={cs.CL}
81
+ }
82
+ ```
83
+
84
+ ``` latex
85
+ @inproceedings{
86
+ he2021deberta,
87
+ title={DEBERTA: DECODING-ENHANCED BERT WITH DISENTANGLED ATTENTION},
88
+ author={Pengcheng He and Xiaodong Liu and Jianfeng Gao and Weizhu Chen},
89
+ booktitle={International Conference on Learning Representations},
90
+ year={2021},
91
+ url={https://openreview.net/forum?id=XPZIaotutsD}
92
+ }
93
+ ```
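Since this commit vendors config.json, spm.model and pytorch_model.bin under bert/deberta-v3-large, the checkpoint can also be loaded from that local directory rather than from the Hub. A minimal sketch is below; the class names assume the standard transformers DeBERTa-v2 implementation and `sentencepiece` being installed, and the local path is this repository's layout, not something the model card prescribes.

```python
# Hedged sketch: load the locally vendored DeBERTa-v3-large copy added here.
from transformers import DebertaV2Tokenizer, DebertaV2Model

local_dir = "bert/deberta-v3-large"  # contains config.json, spm.model, pytorch_model.bin
tokenizer = DebertaV2Tokenizer.from_pretrained(local_dir)
model = DebertaV2Model.from_pretrained(local_dir)

out = model(**tokenizer("A quick smoke test.", return_tensors="pt"))
print(out.last_hidden_state.shape)  # (1, seq_len, 1024)
```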
bert/deberta-v3-large/config.json ADDED
@@ -0,0 +1,22 @@
1
+ {
2
+ "model_type": "deberta-v2",
3
+ "attention_probs_dropout_prob": 0.1,
4
+ "hidden_act": "gelu",
5
+ "hidden_dropout_prob": 0.1,
6
+ "hidden_size": 1024,
7
+ "initializer_range": 0.02,
8
+ "intermediate_size": 4096,
9
+ "max_position_embeddings": 512,
10
+ "relative_attention": true,
11
+ "position_buckets": 256,
12
+ "norm_rel_ebd": "layer_norm",
13
+ "share_att_key": true,
14
+ "pos_att_type": "p2c|c2p",
15
+ "layer_norm_eps": 1e-7,
16
+ "max_relative_positions": -1,
17
+ "position_biased_input": false,
18
+ "num_attention_heads": 16,
19
+ "num_hidden_layers": 24,
20
+ "type_vocab_size": 0,
21
+ "vocab_size": 128100
22
+ }
bert/deberta-v3-large/generator_config.json ADDED
@@ -0,0 +1,22 @@
1
+ {
2
+ "model_type": "deberta-v2",
3
+ "attention_probs_dropout_prob": 0.1,
4
+ "hidden_act": "gelu",
5
+ "hidden_dropout_prob": 0.1,
6
+ "hidden_size": 1024,
7
+ "initializer_range": 0.02,
8
+ "intermediate_size": 4096,
9
+ "max_position_embeddings": 512,
10
+ "relative_attention": true,
11
+ "position_buckets": 256,
12
+ "norm_rel_ebd": "layer_norm",
13
+ "share_att_key": true,
14
+ "pos_att_type": "p2c|c2p",
15
+ "layer_norm_eps": 1e-7,
16
+ "max_relative_positions": -1,
17
+ "position_biased_input": false,
18
+ "num_attention_heads": 16,
19
+ "num_hidden_layers": 12,
20
+ "type_vocab_size": 0,
21
+ "vocab_size": 128100
22
+ }
bert/deberta-v3-large/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dd5b5d93e2db101aaf281df0ea1216c07ad73620ff59c5b42dccac4bf2eef5b5
3
+ size 873673253
bert/deberta-v3-large/spm.model ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c679fbf93643d19aab7ee10c0b99e460bdbc02fedf34b92b05af343b4af586fd
3
+ size 2464616
bert/deberta-v3-large/tokenizer_config.json ADDED
@@ -0,0 +1,4 @@
1
+ {
2
+ "do_lower_case": false,
3
+ "vocab_type": "spm"
4
+ }
bert_gen.py CHANGED
@@ -6,14 +6,19 @@ from tqdm import tqdm
6
  from text import cleaned_text_to_sequence, get_bert
7
  import argparse
8
  import torch.multiprocessing as mp
 
9
 
10
 
11
  def process_line(line):
12
- rank = mp.current_process()._identity
13
- rank = rank[0] if len(rank) > 0 else 0
14
- if torch.cuda.is_available():
15
- gpu_id = rank % torch.cuda.device_count()
16
- device = torch.device(f"cuda:{gpu_id}")
 
 
 
 
17
  wav_path, _, language_str, text, phones, tone, word2ph = line.strip().split("|")
18
  phone = phones.split(" ")
19
  tone = [int(i) for i in tone.split(" ")]
@@ -21,15 +26,14 @@ def process_line(line):
21
  word2ph = [i for i in word2ph]
22
  phone, tone, language = cleaned_text_to_sequence(phone, tone, language_str)
23
 
24
- if hps.data.add_blank:
25
- phone = commons.intersperse(phone, 0)
26
- tone = commons.intersperse(tone, 0)
27
- language = commons.intersperse(language, 0)
28
- for i in range(len(word2ph)):
29
- word2ph[i] = word2ph[i] * 2
30
- word2ph[0] += 1
31
 
32
- bert_path = wav_path.replace(".wav", ".bert.pt")
33
 
34
  try:
35
  bert = torch.load(bert_path)
@@ -40,11 +44,17 @@ def process_line(line):
40
  torch.save(bert, bert_path)
41
 
42
 
 
 
43
  if __name__ == "__main__":
44
  parser = argparse.ArgumentParser()
45
- parser.add_argument("-c", "--config", type=str, default="configs/config.json")
46
- parser.add_argument("--num_processes", type=int, default=2)
47
- args = parser.parse_args()
 
 
 
 
48
  config_path = args.config
49
  hps = utils.get_hparams_from_file(config_path)
50
  lines = []
@@ -53,8 +63,10 @@ if __name__ == "__main__":
53
 
54
  with open(hps.data.validation_files, encoding="utf-8") as f:
55
  lines.extend(f.readlines())
 
 
 
 
 
56
 
57
- num_processes = args.num_processes
58
- with Pool(processes=num_processes) as pool:
59
- for _ in tqdm(pool.imap_unordered(process_line, lines), total=len(lines)):
60
- pass
 
6
  from text import cleaned_text_to_sequence, get_bert
7
  import argparse
8
  import torch.multiprocessing as mp
9
+ from config import config
10
 
11
 
12
  def process_line(line):
13
+ device = config.bert_gen_config.device
14
+ if config.bert_gen_config.use_multi_device:
15
+ rank = mp.current_process()._identity
16
+ rank = rank[0] if len(rank) > 0 else 0
17
+ if torch.cuda.is_available():
18
+ gpu_id = rank % torch.cuda.device_count()
19
+ device = torch.device(f"cuda:{gpu_id}")
20
+ else:
21
+ device = torch.device("cpu")
22
  wav_path, _, language_str, text, phones, tone, word2ph = line.strip().split("|")
23
  phone = phones.split(" ")
24
  tone = [int(i) for i in tone.split(" ")]
 
26
  word2ph = [i for i in word2ph]
27
  phone, tone, language = cleaned_text_to_sequence(phone, tone, language_str)
28
 
29
+ phone = commons.intersperse(phone, 0)
30
+ tone = commons.intersperse(tone, 0)
31
+ language = commons.intersperse(language, 0)
32
+ for i in range(len(word2ph)):
33
+ word2ph[i] = word2ph[i] * 2
34
+ word2ph[0] += 1
 
35
 
36
+ bert_path = wav_path.replace(".WAV", ".wav").replace(".wav", ".bert.pt")
37
 
38
  try:
39
  bert = torch.load(bert_path)
 
44
  torch.save(bert, bert_path)
45
 
46
 
47
+ preprocess_text_config = config.preprocess_text_config
48
+
49
  if __name__ == "__main__":
50
  parser = argparse.ArgumentParser()
51
+ parser.add_argument(
52
+ "-c", "--config", type=str, default=config.bert_gen_config.config_path
53
+ )
54
+ parser.add_argument(
55
+ "--num_processes", type=int, default=config.bert_gen_config.num_processes
56
+ )
57
+ args, _ = parser.parse_known_args()
58
  config_path = args.config
59
  hps = utils.get_hparams_from_file(config_path)
60
  lines = []
 
63
 
64
  with open(hps.data.validation_files, encoding="utf-8") as f:
65
  lines.extend(f.readlines())
66
+ if len(lines) != 0:
67
+ num_processes = args.num_processes
68
+ with Pool(processes=num_processes) as pool:
69
+ for _ in tqdm(pool.imap_unordered(process_line, lines), total=len(lines)):
70
+ pass
71
 
72
+ print(f"bert生成完毕!, 共有{len(lines)}个bert.pt生成!")
 
 
 
commons.py CHANGED
@@ -50,7 +50,13 @@ def slice_segments(x, ids_str, segment_size=4):
50
  for i in range(x.size(0)):
51
  idx_str = ids_str[i]
52
  idx_end = idx_str + segment_size
53
- ret[i] = x[i, :, idx_str:idx_end]
 
 
 
 
 
 
54
  return ret
55
 
56
 
 
50
  for i in range(x.size(0)):
51
  idx_str = ids_str[i]
52
  idx_end = idx_str + segment_size
53
+ if idx_str < 0:
54
+ i1 = x.size(2) + idx_str
55
+ r1 = x[i, :, i1:]
56
+ r2 = x[i, :, :idx_end]
57
+ ret[i] = torch.cat([r1, r2], dim=1)
58
+ else:
59
+ ret[i] = x[i, :, idx_str:idx_end]
60
  return ret
61
 
62
 
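The commons.py change above makes slice_segments tolerate a negative start index by wrapping the segment around the end of the sequence. A small self-contained check of that wrap-around behaviour, on a single (batch, channel, time) tensor, is sketched below; the sample values are illustrative.

```python
# Verify the wrap-around slice: a negative start takes the tail, then the head.
import torch

x = torch.arange(10).view(1, 1, 10)   # (batch=1, channels=1, length=10)
segment_size, idx_str = 4, -2          # start two frames before the end
idx_end = idx_str + segment_size

if idx_str < 0:
    head = x[0, :, x.size(2) + idx_str:]  # last two frames: 8, 9
    tail = x[0, :, :idx_end]              # first two frames: 0, 1
    segment = torch.cat([head, tail], dim=1)
else:
    segment = x[0, :, idx_str:idx_end]

print(segment)  # tensor([[8, 9, 0, 1]])
```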
config.yml ADDED
@@ -0,0 +1,174 @@
1
+ # 全局配置
2
+ # 对于希望在同一时间使用多个配置文件的情况,例如两个GPU同时跑两个训练集:通过环境变量指定配置文件,不指定则默认为./config.yml
3
+
4
+ # 拟提供通用路径配置,统一存放数据,避免数据放得很乱
5
+ # 每个数据集与其对应的模型存放至统一路径下,后续所有的路径配置均为相对于datasetPath的路径
6
+ # 不填或者填空则路径为相对于项目根目录的路径
7
+ dataset_path: "Data/BanGDream"
8
+
9
+ # 模型镜像源,默认huggingface,使用openi镜像源需指定openi_token
10
+ mirror: "openai"
11
+ openi_token: "" # openi token
12
+
13
+ # resample 音频重采样配置
14
+ # 注意, “:” 后需要加空格
15
+ resample:
16
+ # 目标重采样率
17
+ sampling_rate: 44100
18
+ # 音频文件输入路径,重采样会将该路径下所有.wav音频文件重采样
19
+ # 请填入相对于datasetPath的相对路径
20
+ in_dir: "" # 相对于根目录的路径为 /datasetPath/in_dir
21
+ # 音频文件重采样后输出路径
22
+ out_dir: ""
23
+
24
+
25
+ # preprocess_text 数据集预处理相关配置
26
+ # 注意, “:” 后需要加空格
27
+ preprocess_text:
28
+ # 原始文本文件路径,文本格式应为{wav_path}|{speaker_name}|{language}|{text}。
29
+ transcription_path: "filelists/Mygo.list"
30
+ # 数据清洗后文本路径,可以不填。不填则将在原始文本目录生成
31
+ cleaned_path: ""
32
+ # 训练集路径
33
+ train_path: "filelists/train.list"
34
+ # 验证集路径
35
+ val_path: "filelists/val.list"
36
+ # 配置文件路径
37
+ config_path: "configs/config.json"
38
+ # 每个speaker的验证集条数
39
+ val_per_spk: 4
40
+ # 验证集最大条数,多于的会被截断并放到训练集中
41
+ max_val_total: 8
42
+ # 是否进行数据清洗
43
+ clean: true
44
+
45
+
46
+ # bert_gen 相关配置
47
+ # 注意, “:” 后需要加空格
48
+ bert_gen:
49
+ # 训练数据集配置文件路径
50
+ config_path: "configs/config.json"
51
+ # 并行数
52
+ num_processes: 2
53
+ # 使用设备:可选项 "cuda" 显卡推理,"cpu" cpu推理
54
+ # 该选项同时决定了get_bert_feature的默认设备
55
+ device: "cuda"
56
+ # 使用多卡推理
57
+ use_multi_device: false
58
+
59
+ # emo_gen 相关配置
60
+ # 注意, “:” 后需要加空格
61
+ emo_gen:
62
+ # 训练数据集配置文件路径
63
+ config_path: "configs/config.json"
64
+ # 并行数
65
+ num_processes: 2
66
+ # 使用设备:可选项 "cuda" 显卡推理,"cpu" cpu推理
67
+ device: "cuda"
68
+
69
+ # train 训练配置
70
+ # 注意, “:” 后需要加空格
71
+ train_ms:
72
+ env:
73
+ MASTER_ADDR: "localhost"
74
+ MASTER_PORT: 10086
75
+ WORLD_SIZE: 1
76
+ LOCAL_RANK: 0
77
+ RANK: 0
78
+ # 可以填写任意名的环境变量
79
+ # THE_ENV_VAR_YOU_NEED_TO_USE: "1234567"
80
+ # 底模设置
81
+ base:
82
+ use_base_model: True
83
+ repo_id: "Stardust_minus/Bert-VITS2"
84
+ model_image: "Bert-VITS2_2.1-Emo底模" # openi网页的模型名
85
+ # 训练模型存储目录:与旧版本的区别,原先数据集是存放在logs/model_name下的,现在改为统一存放在Data/你的数据集/models下
86
+ model: "models"
87
+ # 配置文件路径
88
+ config_path: "configs/config.json"
89
+ # 训练使用的worker,不建议超过CPU核心数
90
+ num_workers: 16
91
+ # 关闭此项可以节约接近50%的磁盘空间,但是可能导致实际训练速度变慢和更高的CPU使用率。
92
+ spec_cache: True
93
+ # 保存的检查点数量,多于此数目的权重会被删除来节省空间。
94
+ keep_ckpts: 8
95
+
96
+
97
+ # webui webui配置
98
+ # 注意, “:” 后需要加空格
99
+ webui:
100
+ # 推理设备
101
+ device: "cpu"
102
+ # 模型路径
103
+ model: "models/G_30000.pth"
104
+ # 配置文件路径
105
+ config_path: "configs/config.json"
106
+ # 端口号
107
+ port: 7860
108
+ # 是否公开部署,对外网开放
109
+ share: false
110
+ # 是否开启debug模式
111
+ debug: false
112
+ # 语种识别库,可选langid, fastlid
113
+ language_identification_library: "langid"
114
+
115
+
116
+ # server api配置
117
+ # 注意, “:” 后需要加空格
118
+ # 注意,本配置下的所有配置均为相对于根目录的路径
119
+ server:
120
+ # 端口号
121
+ port: 5000
122
+ # 模型默认使用设备:但是当前并没有实现这个配置。
123
+ device: "cuda"
124
+ # 需要加载的所有模型的配置
125
+ # 注意,所有模型都必须正确配置model与config的路径,空路径会导致加载错误。
126
+ models:
127
+ - # 模型的路径
128
+ model: ""
129
+ # 模型config.json的路径
130
+ config: ""
131
+ # 模型使用设备,若填写则会覆盖默认配置
132
+ device: "cuda"
133
+ # 模型默认使用的语言
134
+ language: "ZH"
135
+ # 模型人物默认参数
136
+ # 不必填写所有人物,不填的使用默认值
137
+ # 暂时不用填写,当前尚未实现按人区分配置
138
+ speakers:
139
+ - speaker: "科比"
140
+ sdp_ratio: 0.2
141
+ noise_scale: 0.6
142
+ noise_scale_w: 0.8
143
+ length_scale: 1
144
+ - speaker: "五条悟"
145
+ sdp_ratio: 0.3
146
+ noise_scale: 0.7
147
+ noise_scale_w: 0.8
148
+ length_scale: 0.5
149
+ - speaker: "安倍晋三"
150
+ sdp_ratio: 0.2
151
+ noise_scale: 0.6
152
+ noise_scale_w: 0.8
153
+ length_scale: 1.2
154
+ - # 模型的路径
155
+ model: ""
156
+ # 模型config.json的路径
157
+ config: ""
158
+ # 模型使用设备,若填写则会覆盖默认配置
159
+ device: "cpu"
160
+ # 模型默认使用的语言
161
+ language: "JP"
162
+ # 模型人物默认参数
163
+ # 不必填写所有人物,不填的使用默认值
164
+ speakers: [ ] # 也可以不填
165
+
166
+
167
+ # 百度翻译开放平台 api配置
168
+ # api接入文档 https://api.fanyi.baidu.com/doc/21
169
+ # 请不要在github等网站公开分享你的app id 与 key
170
+ translate:
171
+ # 你的APPID
172
+ "app_key": ""
173
+ # 你的密钥
174
+ "secret_key": ""
configs/config.json CHANGED
@@ -2,9 +2,9 @@
2
  "train": {
3
  "log_interval": 200,
4
  "eval_interval": 1000,
5
- "seed": 52,
6
- "epochs": 10000,
7
- "learning_rate": 0.0003,
8
  "betas": [
9
  0.8,
10
  0.99
@@ -12,7 +12,7 @@
12
  "eps": 1e-09,
13
  "batch_size": 24,
14
  "fp16_run": false,
15
- "lr_decay": 0.999875,
16
  "segment_size": 16384,
17
  "init_lr_ratio": 1,
18
  "warmup_epochs": 0,
@@ -32,82 +32,864 @@
32
  "mel_fmin": 0.0,
33
  "mel_fmax": null,
34
  "add_blank": true,
35
- "n_speakers": 256,
36
  "cleaned_text": true,
37
  "spk2id": {
38
- "biaobei": 0,
39
- "香澄": 1,
40
- "有咲": 2,
41
- "沙綾": 3,
42
- "りみ": 4,
43
- "たえ": 5,
44
- "沙綾、りみ、たえ": 6,
45
- "": 7,
46
- "一同": 8,
47
- "まりな": 9,
48
- "ゆり": 10,
49
- "ポピパ一同": 11,
50
- "明日香": 12,
51
- "???": 13,
52
- "オーナー": 14,
53
- "全員": 15,
54
- "Poppin'Party": 16,
55
- "ひまり": 17,
56
- "モカ": 18,
57
- "つぐみ": 19,
58
- "": 20,
59
- "リサ": 21,
60
- "千聖": 22,
61
- "花音": 23,
62
- "イヴ": 24,
63
- "日菜": 25,
64
- "友希那": 26,
65
- "紗夜": 27,
66
- "Afterglow": 28,
67
- "こころ": 29,
68
- "美咲": 30,
69
- "": 31,
70
- "はぐみ": 32,
71
- "ミッシェル": 33,
72
- "マリー": 34,
73
- "怪盗ハロハッピー": 35,
74
- "ハロー、ハッピーワールド!": 36,
75
- "ニコリーナ": 37,
76
- "": 38,
77
- "麻弥": 39,
78
- "パスパレ一同": 40,
79
- "燐子": 41,
80
- "あこ": 42,
81
- "あこのチャット": 43,
82
- "燐子のチャット": 44,
83
- "燐子チャット": 45,
84
- "Roselia": 46,
85
- "ゆきな": 47,
86
- "ましろ": 48,
87
- "つくし": 49,
88
- "透子": 50,
89
- "七深": 51,
90
- "瑠唯": 52,
91
- "六花": 53,
92
- "パレオ": 54,
93
- "レイヤ": 55,
94
- "マスキング": 56,
95
- "チュチュ": 57,
96
- "ますき": 58,
97
- "ロック": 59,
98
- "令王那": 60,
99
- "CHIYU": 61,
100
- "レイ": 62,
101
- "詩船": 63,
102
- "珠手ちゆ": 64,
103
- "": 65,
104
- "そよ": 66,
105
- "祥子": 67,
106
- "立希": 68,
107
- "": 69,
108
- "愛音": 70,
109
- "楽奈": 71,
110
- "海鈴": 72
 
 
 
 
111
  }
112
  },
113
  "model": {
@@ -163,5 +945,6 @@
163
  "n_layers_q": 3,
164
  "use_spectral_norm": false,
165
  "gin_channels": 256
166
- }
167
- }
 
 
2
  "train": {
3
  "log_interval": 200,
4
  "eval_interval": 1000,
5
+ "seed": 42,
6
+ "epochs": 1000,
7
+ "learning_rate": 0.0002,
8
  "betas": [
9
  0.8,
10
  0.99
 
12
  "eps": 1e-09,
13
  "batch_size": 24,
14
  "fp16_run": false,
15
+ "lr_decay": 0.99995,
16
  "segment_size": 16384,
17
  "init_lr_ratio": 1,
18
  "warmup_epochs": 0,
 
32
  "mel_fmin": 0.0,
33
  "mel_fmax": null,
34
  "add_blank": true,
35
+ "n_speakers": 896,
36
  "cleaned_text": true,
37
  "spk2id": {
38
+ "派蒙_ZH": 0,
39
+ "纳西妲_ZH": 1,
40
+ "凯亚_ZH": 2,
41
+ "阿贝多_ZH": 3,
42
+ "温迪_ZH": 4,
43
+ "枫原万叶_ZH": 5,
44
+ "钟离_ZH": 6,
45
+ "荒泷一斗_ZH": 7,
46
+ "八重神子_ZH": 8,
47
+ "艾尔海森_ZH": 9,
48
+ "提纳里_ZH": 10,
49
+ "迪希雅_ZH": 11,
50
+ "卡维_ZH": 12,
51
+ "宵宫_ZH": 13,
52
+ "那维莱特_ZH": 14,
53
+ "莱依拉_ZH": 15,
54
+ "赛诺_ZH": 16,
55
+ "莫娜_ZH": 17,
56
+ "诺艾尔_ZH": 18,
57
+ "托马_ZH": 19,
58
+ "凝光_ZH": 20,
59
+ "林尼_ZH": 21,
60
+ "北斗_ZH": 22,
61
+ "柯莱_ZH": 23,
62
+ "神里绫华_ZH": 24,
63
+ "可莉_ZH": 25,
64
+ "芭芭拉_ZH": 26,
65
+ "雷电将军_ZH": 27,
66
+ "娜维娅_ZH": 28,
67
+ "芙宁娜_ZH": 29,
68
+ "珊瑚宫心海_ZH": 30,
69
+ "鹿野院平藏_ZH": 31,
70
+ "迪奥娜_ZH": 32,
71
+ "琴_ZH": 33,
72
+ "五郎_ZH": 34,
73
+ "班尼特_ZH": 35,
74
+ "达达利亚_ZH": 36,
75
+ "安柏_ZH": 37,
76
+ "莱欧斯利_ZH": 38,
77
+ "夜兰_ZH": 39,
78
+ "妮露_ZH": 40,
79
+ "辛焱_ZH": 41,
80
+ "丽莎_ZH": 42,
81
+ "珐露珊_ZH": 43,
82
+ "魈_ZH": 44,
83
+ "香菱_ZH": 45,
84
+ "迪卢克_ZH": 46,
85
+ "砂糖_ZH": 47,
86
+ "烟绯_ZH": 48,
87
+ "早柚_ZH": 49,
88
+ "云堇_ZH": 50,
89
+ "刻晴_ZH": 51,
90
+ "重云_ZH": 52,
91
+ "优菈_ZH": 53,
92
+ "胡桃_ZH": 54,
93
+ "流浪者_ZH": 55,
94
+ "久岐忍_ZH": 56,
95
+ "神里绫人_ZH": 57,
96
+ "甘雨_ZH": 58,
97
+ "戴因斯雷布_ZH": 59,
98
+ "菲谢尔_ZH": 60,
99
+ "白术_ZH": 61,
100
+ "行秋_ZH": 62,
101
+ "九条裟罗_ZH": 63,
102
+ "夏洛蒂_ZH": 64,
103
+ "雷泽_ZH": 65,
104
+ "申鹤_ZH": 66,
105
+ "荧_ZH": 67,
106
+ "空_ZH": 68,
107
+ "迪娜泽黛_ZH": 69,
108
+ "凯瑟琳_ZH": 70,
109
+ "多莉_ZH": 71,
110
+ "坎蒂丝_ZH": 72,
111
+ "琳妮特_ZH": 73,
112
+ "萍姥姥_ZH": 74,
113
+ "罗莎莉亚_ZH": 75,
114
+ "埃德_ZH": 76,
115
+ "爱贝尔_ZH": 77,
116
+ "伊迪娅_ZH": 78,
117
+ "留云借风真君_ZH": 79,
118
+ "绮良良_ZH": 80,
119
+ "七七_ZH": 81,
120
+ "式大将_ZH": 82,
121
+ "瑶瑶_ZH": 83,
122
+ "奥兹_ZH": 84,
123
+ "菲米尼_ZH": 85,
124
+ "米卡_ZH": 86,
125
+ "哲平_ZH": 87,
126
+ "大肉丸_ZH": 88,
127
+ "托克_ZH": 89,
128
+ "蒂玛乌斯_ZH": 90,
129
+ "昆钧_ZH": 91,
130
+ "欧菲妮_ZH": 92,
131
+ "塞琉斯_ZH": 93,
132
+ "仆人_ZH": 94,
133
+ "迈勒斯_ZH": 95,
134
+ "希格雯_ZH": 96,
135
+ "阿守_ZH": 97,
136
+ "拉赫曼_ZH": 98,
137
+ "杜拉夫_ZH": 99,
138
+ "伊利亚斯_ZH": 100,
139
+ "阿晃_ZH": 101,
140
+ "旁白_ZH": 102,
141
+ "爱德琳_ZH": 103,
142
+ "埃洛伊_ZH": 104,
143
+ "德沃沙克_ZH": 105,
144
+ "玛乔丽_ZH": 106,
145
+ "塞塔蕾_ZH": 107,
146
+ "柊千里_ZH": 108,
147
+ "海芭夏_ZH": 109,
148
+ "九条镰治_ZH": 110,
149
+ "阿娜耶_ZH": 111,
150
+ "笼钓瓶一心_ZH": 112,
151
+ "回声海螺_ZH": 113,
152
+ "劳维克_ZH": 114,
153
+ "元太_ZH": 115,
154
+ "阿扎尔_ZH": 116,
155
+ "查尔斯_ZH": 117,
156
+ "阿洛瓦_ZH": 118,
157
+ "埃勒曼_ZH": 119,
158
+ "纳比尔_ZH": 120,
159
+ "莎拉_ZH": 121,
160
+ "康纳_ZH": 122,
161
+ "博来_ZH": 123,
162
+ "玛塞勒_ZH": 124,
163
+ "阿祇_ZH": 125,
164
+ "博士_ZH": 126,
165
+ "玛格丽特_ZH": 127,
166
+ "迪尔菲_ZH": 128,
167
+ "宛烟_ZH": 129,
168
+ "羽生田千鹤_ZH": 130,
169
+ "海妮耶_ZH": 131,
170
+ "旅行者_ZH": 132,
171
+ "霍夫曼_ZH": 133,
172
+ "佐西摩斯_ZH": 134,
173
+ "鹿野奈奈_ZH": 135,
174
+ "舒伯特_ZH": 136,
175
+ "天叔_ZH": 137,
176
+ "艾莉丝_ZH": 138,
177
+ "龙二_ZH": 139,
178
+ "莺儿_ZH": 140,
179
+ "嘉良_ZH": 141,
180
+ "一心传名刀_ZH": 142,
181
+ "费迪南德_ZH": 143,
182
+ "珊瑚_ZH": 144,
183
+ "言笑_ZH": 145,
184
+ "久利须_ZH": 146,
185
+ "嘉玛_ZH": 147,
186
+ "艾文_ZH": 148,
187
+ "克洛琳德_ZH": 149,
188
+ "丹吉尔_ZH": 150,
189
+ "女士_ZH": 151,
190
+ "白老先生_ZH": 152,
191
+ "天目十五_ZH": 153,
192
+ "老孟_ZH": 154,
193
+ "巴达维_ZH": 155,
194
+ "长生_ZH": 156,
195
+ "吴船长_ZH": 157,
196
+ "拉齐_ZH": 158,
197
+ "艾伯特_ZH": 159,
198
+ "松浦_ZH": 160,
199
+ "埃泽_ZH": 161,
200
+ "阿圆_ZH": 162,
201
+ "莫塞伊思_ZH": 163,
202
+ "阿拉夫_ZH": 164,
203
+ "杜吉耶_ZH": 165,
204
+ "石头_ZH": 166,
205
+ "百闻_ZH": 167,
206
+ "波洛_ZH": 168,
207
+ "斯坦利_ZH": 169,
208
+ "博易_ZH": 170,
209
+ "迈蒙_ZH": 171,
210
+ "掇星攫辰天君_ZH": 172,
211
+ "毗伽尔_ZH": 173,
212
+ "芙卡洛斯_ZH": 174,
213
+ "恶龙_ZH": 175,
214
+ "恕筠_ZH": 176,
215
+ "知易_ZH": 177,
216
+ "克列门特_ZH": 178,
217
+ "大慈树王_ZH": 179,
218
+ "西拉杰_ZH": 180,
219
+ "上杉_ZH": 181,
220
+ "阿尔卡米_ZH": 182,
221
+ "纯水精灵_ZH": 183,
222
+ "常九爷_ZH": 184,
223
+ "沙扎曼_ZH": 185,
224
+ "田铁嘴_ZH": 186,
225
+ "克罗索_ZH": 187,
226
+ "阿巴图伊_ZH": 188,
227
+ "悦_ZH": 189,
228
+ "阿佩普_ZH": 190,
229
+ "埃尔欣根_ZH": 191,
230
+ "萨赫哈蒂_ZH": 192,
231
+ "塔杰·拉德卡尼_ZH": 193,
232
+ "安西_ZH": 194,
233
+ "埃舍尔_ZH": 195,
234
+ "萨齐因_ZH": 196,
235
+ "派蒙_JP": 197,
236
+ "纳西妲_JP": 198,
237
+ "凯亚_JP": 199,
238
+ "阿贝多_JP": 200,
239
+ "温迪_JP": 201,
240
+ "枫原万叶_JP": 202,
241
+ "钟离_JP": 203,
242
+ "荒泷一斗_JP": 204,
243
+ "八重神子_JP": 205,
244
+ "艾尔海森_JP": 206,
245
+ "提纳里_JP": 207,
246
+ "迪希雅_JP": 208,
247
+ "卡维_JP": 209,
248
+ "宵宫_JP": 210,
249
+ "那维莱特_JP": 211,
250
+ "莱依拉_JP": 212,
251
+ "赛诺_JP": 213,
252
+ "莫娜_JP": 214,
253
+ "诺艾尔_JP": 215,
254
+ "托马_JP": 216,
255
+ "凝光_JP": 217,
256
+ "林尼_JP": 218,
257
+ "北斗_JP": 219,
258
+ "柯莱_JP": 220,
259
+ "神里绫华_JP": 221,
260
+ "可莉_JP": 222,
261
+ "芭芭拉_JP": 223,
262
+ "雷电将军_JP": 224,
263
+ "娜维娅_JP": 225,
264
+ "芙宁娜_JP": 226,
265
+ "珊瑚宫心海_JP": 227,
266
+ "鹿野院平藏_JP": 228,
267
+ "迪奥娜_JP": 229,
268
+ "琴_JP": 230,
269
+ "五郎_JP": 231,
270
+ "班尼特_JP": 232,
271
+ "达达利亚_JP": 233,
272
+ "安柏_JP": 234,
273
+ "莱欧斯利_JP": 235,
274
+ "夜兰_JP": 236,
275
+ "妮露_JP": 237,
276
+ "辛焱_JP": 238,
277
+ "丽莎_JP": 239,
278
+ "珐露珊_JP": 240,
279
+ "魈_JP": 241,
280
+ "香菱_JP": 242,
281
+ "迪卢克_JP": 243,
282
+ "砂糖_JP": 244,
283
+ "烟绯_JP": 245,
284
+ "早柚_JP": 246,
285
+ "云堇_JP": 247,
286
+ "刻晴_JP": 248,
287
+ "重云_JP": 249,
288
+ "优菈_JP": 250,
289
+ "胡桃_JP": 251,
290
+ "流浪者_JP": 252,
291
+ "久岐忍_JP": 253,
292
+ "神里绫人_JP": 254,
293
+ "甘雨_JP": 255,
294
+ "戴因斯雷布_JP": 256,
295
+ "菲谢尔_JP": 257,
296
+ "白术_JP": 258,
297
+ "行秋_JP": 259,
298
+ "九条裟罗_JP": 260,
299
+ "夏洛蒂_JP": 261,
300
+ "雷泽_JP": 262,
301
+ "申鹤_JP": 263,
302
+ "空_JP": 264,
303
+ "荧_JP": 265,
304
+ "迪娜泽黛_JP": 266,
305
+ "凯瑟琳_JP": 267,
306
+ "多莉_JP": 268,
307
+ "坎蒂丝_JP": 269,
308
+ "琳妮特_JP": 270,
309
+ "萍姥姥_JP": 271,
310
+ "罗莎莉亚_JP": 272,
311
+ "埃德_JP": 273,
312
+ "爱贝尔_JP": 274,
313
+ "伊迪娅_JP": 275,
314
+ "留云借风真君_JP": 276,
315
+ "绮良良_JP": 277,
316
+ "七七_JP": 278,
317
+ "式大将_JP": 279,
318
+ "瑶瑶_JP": 280,
319
+ "奥兹_JP": 281,
320
+ "菲米尼_JP": 282,
321
+ "米卡_JP": 283,
322
+ "哲平_JP": 284,
323
+ "大肉丸_JP": 285,
324
+ "托克_JP": 286,
325
+ "蒂玛乌斯_JP": 287,
326
+ "昆钧_JP": 288,
327
+ "欧菲妮_JP": 289,
328
+ "塞琉斯_JP": 290,
329
+ "仆人_JP": 291,
330
+ "迈勒斯_JP": 292,
331
+ "希格雯_JP": 293,
332
+ "阿守_JP": 294,
333
+ "拉赫曼_JP": 295,
334
+ "杜拉夫_JP": 296,
335
+ "伊利亚斯_JP": 297,
336
+ "阿晃_JP": 298,
337
+ "旁白_JP": 299,
338
+ "爱德琳_JP": 300,
339
+ "埃洛伊_JP": 301,
340
+ "德沃沙克_JP": 302,
341
+ "玛乔丽_JP": 303,
342
+ "塞塔蕾_JP": 304,
343
+ "柊千里_JP": 305,
344
+ "海芭夏_JP": 306,
345
+ "九条镰治_JP": 307,
346
+ "阿娜耶_JP": 308,
347
+ "笼钓瓶一心_JP": 309,
348
+ "回声海螺_JP": 310,
349
+ "劳维克_JP": 311,
350
+ "元太_JP": 312,
351
+ "阿扎尔_JP": 313,
352
+ "查尔斯_JP": 314,
353
+ "阿洛瓦_JP": 315,
354
+ "埃勒曼_JP": 316,
355
+ "纳比尔_JP": 317,
356
+ "莎拉_JP": 318,
357
+ "康纳_JP": 319,
358
+ "博来_JP": 320,
359
+ "玛塞勒_JP": 321,
360
+ "阿祇_JP": 322,
361
+ "博士_JP": 323,
362
+ "迪尔菲_JP": 324,
363
+ "玛格丽特_JP": 325,
364
+ "宛烟_JP": 326,
365
+ "羽生田千鹤_JP": 327,
366
+ "海妮耶_JP": 328,
367
+ "霍夫曼_JP": 329,
368
+ "旅行者_JP": 330,
369
+ "佐西摩斯_JP": 331,
370
+ "舒伯特_JP": 332,
371
+ "鹿野奈奈_JP": 333,
372
+ "天叔_JP": 334,
373
+ "龙二_JP": 335,
374
+ "艾莉丝_JP": 336,
375
+ "莺儿_JP": 337,
376
+ "嘉良_JP": 338,
377
+ "珊瑚_JP": 339,
378
+ "言笑_JP": 340,
379
+ "一心传名刀_JP": 341,
380
+ "费迪南德_JP": 342,
381
+ "久利须_JP": 343,
382
+ "嘉玛_JP": 344,
383
+ "艾文_JP": 345,
384
+ "克洛琳德_JP": 346,
385
+ "丹吉尔_JP": 347,
386
+ "天目十五_JP": 348,
387
+ "女士_JP": 349,
388
+ "老孟_JP": 350,
389
+ "白老先生_JP": 351,
390
+ "舍利夫_JP": 352,
391
+ "巴达维_JP": 353,
392
+ "拉齐_JP": 354,
393
+ "长生_JP": 355,
394
+ "吴船长_JP": 356,
395
+ "艾伯特_JP": 357,
396
+ "松浦_JP": 358,
397
+ "埃泽_JP": 359,
398
+ "阿圆_JP": 360,
399
+ "阿拉夫_JP": 361,
400
+ "莫塞伊思_JP": 362,
401
+ "石头_JP": 363,
402
+ "百闻_JP": 364,
403
+ "杜吉耶_JP": 365,
404
+ "波洛_JP": 366,
405
+ "掇星攫辰天君_JP": 367,
406
+ "迈蒙_JP": 368,
407
+ "博易_JP": 369,
408
+ "诗筠_JP": 370,
409
+ "斯坦利_JP": 371,
410
+ "毗伽尔_JP": 372,
411
+ "芙卡洛斯_JP": 373,
412
+ "恶龙_JP": 374,
413
+ "小仓澪_JP": 375,
414
+ "恕筠_JP": 376,
415
+ "知易_JP": 377,
416
+ "克列门特_JP": 378,
417
+ "大慈树王_JP": 379,
418
+ "望雅_JP": 380,
419
+ "黑田_JP": 381,
420
+ "卡莉娜_JP": 382,
421
+ "马姆杜_JP": 383,
422
+ "科林斯_JP": 384,
423
+ "上杉_JP": 385,
424
+ "西拉杰_JP": 386,
425
+ "菲尔戈黛特_JP": 387,
426
+ "一平_JP": 388,
427
+ "纯水精灵_JP": 389,
428
+ "阿尔卡米_JP": 390,
429
+ "老戴_JP": 391,
430
+ "谢赫祖拜尔_JP": 392,
431
+ "沙扎曼_JP": 393,
432
+ "田铁嘴_JP": 394,
433
+ "小野寺_JP": 395,
434
+ "百识_JP": 396,
435
+ "克罗索_JP": 397,
436
+ "莱斯格_JP": 398,
437
+ "芷巧_JP": 399,
438
+ "加藤洋平_JP": 400,
439
+ "阿巴图伊_JP": 401,
440
+ "埃尔欣根_JP": 402,
441
+ "斯嘉莉_JP": 403,
442
+ "阿佩普_JP": 404,
443
+ "巫女_JP": 405,
444
+ "卡布斯_JP": 406,
445
+ "洛伦佐_JP": 407,
446
+ "萨赫哈蒂_JP": 408,
447
+ "娜德瓦_JP": 409,
448
+ "塞德娜_JP": 410,
449
+ "塔杰·拉德卡尼_JP": 411,
450
+ "绘星_JP": 412,
451
+ "泽田_JP": 413,
452
+ "安西_JP": 414,
453
+ "拉伊德_JP": 415,
454
+ "亚卡巴_JP": 416,
455
+ "有乐斋_JP": 417,
456
+ "莱昂_JP": 418,
457
+ "尤苏波夫_JP": 419,
458
+ "夏妮_JP": 420,
459
+ "埃舍尔_JP": 421,
460
+ "萨齐因_JP": 422,
461
+ "古山_JP": 423,
462
+ "自称渊上之物_JP": 424,
463
+ "丹羽_JP": 425,
464
+ "塞萨尔的日记_JP": 426,
465
+ "派蒙_EN": 427,
466
+ "纳西妲_EN": 428,
467
+ "凯亚_EN": 429,
468
+ "阿贝多_EN": 430,
469
+ "温迪_EN": 431,
470
+ "枫原万叶_EN": 432,
471
+ "钟离_EN": 433,
472
+ "荒泷一斗_EN": 434,
473
+ "八重神子_EN": 435,
474
+ "艾尔海森_EN": 436,
475
+ "提纳里_EN": 437,
476
+ "迪希雅_EN": 438,
477
+ "卡维_EN": 439,
478
+ "宵宫_EN": 440,
479
+ "莱依拉_EN": 441,
480
+ "那维莱特_EN": 442,
481
+ "赛诺_EN": 443,
482
+ "莫娜_EN": 444,
483
+ "诺艾尔_EN": 445,
484
+ "托马_EN": 446,
485
+ "凝光_EN": 447,
486
+ "林尼_EN": 448,
487
+ "北斗_EN": 449,
488
+ "柯莱_EN": 450,
489
+ "神里绫华_EN": 451,
490
+ "可莉_EN": 452,
491
+ "芭芭拉_EN": 453,
492
+ "雷电将军_EN": 454,
493
+ "娜维娅_EN": 455,
494
+ "芙宁娜_EN": 456,
495
+ "珊瑚宫心海_EN": 457,
496
+ "鹿野院平藏_EN": 458,
497
+ "迪奥娜_EN": 459,
498
+ "五郎_EN": 460,
499
+ "琴_EN": 461,
500
+ "班尼特_EN": 462,
501
+ "达达利亚_EN": 463,
502
+ "安柏_EN": 464,
503
+ "莱欧斯利_EN": 465,
504
+ "夜兰_EN": 466,
505
+ "妮露_EN": 467,
506
+ "辛焱_EN": 468,
507
+ "珐露珊_EN": 469,
508
+ "丽莎_EN": 470,
509
+ "魈_EN": 471,
510
+ "香菱_EN": 472,
511
+ "迪卢克_EN": 473,
512
+ "砂糖_EN": 474,
513
+ "���绯_EN": 475,
514
+ "早柚_EN": 476,
515
+ "云堇_EN": 477,
516
+ "刻晴_EN": 478,
517
+ "重云_EN": 479,
518
+ "优菈_EN": 480,
519
+ "胡桃_EN": 481,
520
+ "流浪者_EN": 482,
521
+ "久岐忍_EN": 483,
522
+ "神里绫人_EN": 484,
523
+ "甘雨_EN": 485,
524
+ "戴因斯雷布_EN": 486,
525
+ "菲谢尔_EN": 487,
526
+ "白术_EN": 488,
527
+ "行秋_EN": 489,
528
+ "九条裟罗_EN": 490,
529
+ "夏洛蒂_EN": 491,
530
+ "雷泽_EN": 492,
531
+ "申鹤_EN": 493,
532
+ "荧_EN": 494,
533
+ "空_EN": 495,
534
+ "迪娜泽黛_EN": 496,
535
+ "凯瑟琳_EN": 497,
536
+ "多莉_EN": 498,
537
+ "坎蒂丝_EN": 499,
538
+ "琳妮特_EN": 500,
539
+ "萍姥姥_EN": 501,
540
+ "罗莎莉亚_EN": 502,
541
+ "埃德_EN": 503,
542
+ "爱贝尔_EN": 504,
543
+ "伊迪娅_EN": 505,
544
+ "留云借风真君_EN": 506,
545
+ "绮良良_EN": 507,
546
+ "七七_EN": 508,
547
+ "式大将_EN": 509,
548
+ "瑶瑶_EN": 510,
549
+ "奥兹_EN": 511,
550
+ "菲米尼_EN": 512,
551
+ "米卡_EN": 513,
552
+ "哲平_EN": 514,
553
+ "大肉丸_EN": 515,
554
+ "托克_EN": 516,
555
+ "蒂玛乌斯_EN": 517,
556
+ "昆钧_EN": 518,
557
+ "欧菲妮_EN": 519,
558
+ "塞琉斯_EN": 520,
559
+ "仆人_EN": 521,
560
+ "迈勒斯_EN": 522,
561
+ "希格雯_EN": 523,
562
+ "阿守_EN": 524,
563
+ "拉赫曼_EN": 525,
564
+ "杜拉夫_EN": 526,
565
+ "伊利亚斯_EN": 527,
566
+ "阿晃_EN": 528,
567
+ "旁白_EN": 529,
568
+ "爱德琳_EN": 530,
569
+ "埃洛伊_EN": 531,
570
+ "德沃沙克_EN": 532,
571
+ "玛乔丽_EN": 533,
572
+ "塞塔蕾_EN": 534,
573
+ "柊千里_EN": 535,
574
+ "海芭夏_EN": 536,
575
+ "九条镰治_EN": 537,
576
+ "阿娜耶_EN": 538,
577
+ "笼钓瓶一心_EN": 539,
578
+ "回声海螺_EN": 540,
579
+ "劳维克_EN": 541,
580
+ "元太_EN": 542,
581
+ "阿扎尔_EN": 543,
582
+ "查尔斯_EN": 544,
583
+ "阿洛瓦_EN": 545,
584
+ "埃勒曼_EN": 546,
585
+ "纳比尔_EN": 547,
586
+ "莎拉_EN": 548,
587
+ "康纳_EN": 549,
588
+ "博来_EN": 550,
589
+ "玛塞勒_EN": 551,
590
+ "阿祇_EN": 552,
591
+ "博士_EN": 553,
592
+ "迪尔菲_EN": 554,
593
+ "宛烟_EN": 555,
594
+ "玛格丽特_EN": 556,
595
+ "羽生田千鹤_EN": 557,
596
+ "海妮耶_EN": 558,
597
+ "霍夫曼_EN": 559,
598
+ "旅行者_EN": 560,
599
+ "佐西摩斯_EN": 561,
600
+ "鹿野奈奈_EN": 562,
601
+ "舒伯特_EN": 563,
602
+ "天叔_EN": 564,
603
+ "艾莉丝_EN": 565,
604
+ "龙二_EN": 566,
605
+ "莺儿_EN": 567,
606
+ "嘉良_EN": 568,
607
+ "珊瑚_EN": 569,
608
+ "费迪南德_EN": 570,
609
+ "言笑_EN": 571,
610
+ "一心传名刀_EN": 572,
611
+ "久利须_EN": 573,
612
+ "嘉玛_EN": 574,
613
+ "艾文_EN": 575,
614
+ "克洛琳德_EN": 576,
615
+ "丹吉尔_EN": 577,
616
+ "女士_EN": 578,
617
+ "天目十五_EN": 579,
618
+ "老孟_EN": 580,
619
+ "白老先生_EN": 581,
620
+ "舍利夫_EN": 582,
621
+ "巴达维_EN": 583,
622
+ "拉齐_EN": 584,
623
+ "长生_EN": 585,
624
+ "吴船长_EN": 586,
625
+ "艾伯特_EN": 587,
626
+ "松浦_EN": 588,
627
+ "埃泽_EN": 589,
628
+ "阿圆_EN": 590,
629
+ "阿拉夫_EN": 591,
630
+ "莫塞伊思_EN": 592,
631
+ "石头_EN": 593,
632
+ "百闻_EN": 594,
633
+ "杜吉耶_EN": 595,
634
+ "波洛_EN": 596,
635
+ "斯坦利_EN": 597,
636
+ "掇星攫辰天君_EN": 598,
637
+ "迈蒙_EN": 599,
638
+ "博易_EN": 600,
639
+ "诗筠_EN": 601,
640
+ "毗伽尔_EN": 602,
641
+ "慧心_EN": 603,
642
+ "芙卡洛斯_EN": 604,
643
+ "恶龙_EN": 605,
644
+ "小仓澪_EN": 606,
645
+ "恕筠_EN": 607,
646
+ "知易_EN": 608,
647
+ "克列门特_EN": 609,
648
+ "大慈树王_EN": 610,
649
+ "维多利亚_EN": 611,
650
+ "黑田_EN": 612,
651
+ "马姆杜_EN": 613,
652
+ "科林斯_EN": 614,
653
+ "上杉_EN": 615,
654
+ "西拉杰_EN": 616,
655
+ "宁禄_EN": 617,
656
+ "纯水精灵_EN": 618,
657
+ "常九爷_EN": 619,
658
+ "阿尔卡米_EN": 620,
659
+ "沙扎曼_EN": 621,
660
+ "田铁嘴_EN": 622,
661
+ "加萨尼_EN": 623,
662
+ "克罗索_EN": 624,
663
+ "星稀_EN": 625,
664
+ "莱斯格_EN": 626,
665
+ "阿巴图伊_EN": 627,
666
+ "悦_EN": 628,
667
+ "德田_EN": 629,
668
+ "埃尔欣根_EN": 630,
669
+ "阿佩普_EN": 631,
670
+ "萨赫哈蒂_EN": 632,
671
+ "洛伦佐_EN": 633,
672
+ "塔杰·拉德卡尼_EN": 634,
673
+ "泽田_EN": 635,
674
+ "安西_EN": 636,
675
+ "理水叠山真君_EN": 637,
676
+ "埃舍尔_EN": 638,
677
+ "萨齐因_EN": 639,
678
+ "古田_EN": 640,
679
+ "陆景和": 641,
680
+ "莫弈": 642,
681
+ "左然": 643,
682
+ "夏彦": 644,
683
+ "三月七_ZH": 645,
684
+ "丹恒_ZH": 646,
685
+ "希儿_ZH": 647,
686
+ "娜塔莎_ZH": 648,
687
+ "希露瓦_ZH": 649,
688
+ "瓦尔特_ZH": 650,
689
+ "佩拉_ZH": 651,
690
+ "布洛妮娅_ZH": 652,
691
+ "虎克_ZH": 653,
692
+ "素裳_ZH": 654,
693
+ "克拉拉_ZH": 655,
694
+ "符玄_ZH": 656,
695
+ "白露_ZH": 657,
696
+ "杰帕德_ZH": 658,
697
+ "景元_ZH": 659,
698
+ "藿藿_ZH": 660,
699
+ "姬子_ZH": 661,
700
+ "穹_ZH": 662,
701
+ "星_ZH": 663,
702
+ "卡芙卡_ZH": 664,
703
+ "桂乃芬_ZH": 665,
704
+ "艾丝妲_ZH": 666,
705
+ "玲可_ZH": 667,
706
+ "彦卿_ZH": 668,
707
+ "托帕_ZH": 669,
708
+ "驭空_ZH": 670,
709
+ "浮烟_ZH": 671,
710
+ "停云_ZH": 672,
711
+ "镜流_ZH": 673,
712
+ "罗刹_ZH": 674,
713
+ "卢卡_ZH": 675,
714
+ "史瓦罗_ZH": 676,
715
+ "黑塔_ZH": 677,
716
+ "桑博_ZH": 678,
717
+ "伦纳德_ZH": 679,
718
+ "明曦_ZH": 680,
719
+ "银狼_ZH": 681,
720
+ "帕姆_ZH": 682,
721
+ "青雀_ZH": 683,
722
+ "乔瓦尼_ZH": 684,
723
+ "公输师傅_ZH": 685,
724
+ "晴霓_ZH": 686,
725
+ "螺丝咕姆_ZH": 687,
726
+ "阿兰_ZH": 688,
727
+ "奥列格_ZH": 689,
728
+ "丹枢_ZH": 690,
729
+ "尾巴_ZH": 691,
730
+ "寒鸦_ZH": 692,
731
+ "雪衣_ZH": 693,
732
+ "可可利亚_ZH": 694,
733
+ "青镞_ZH": 695,
734
+ "半夏_ZH": 696,
735
+ "银枝_ZH": 697,
736
+ "大毫_ZH": 698,
737
+ "霄翰_ZH": 699,
738
+ "信使_ZH": 700,
739
+ "费斯曼_ZH": 701,
740
+ "绿芙蓉_ZH": 702,
741
+ "dev_成男_ZH": 703,
742
+ "金人会长_ZH": 704,
743
+ "维利特_ZH": 705,
744
+ "维尔德_ZH": 706,
745
+ "斯科特_ZH": 707,
746
+ "卡波特_ZH": 708,
747
+ "刃_ZH": 709,
748
+ "岩明_ZH": 710,
749
+ "浣溪_ZH": 711,
750
+ "三月七_JP": 712,
751
+ "丹恒_JP": 713,
752
+ "希儿_JP": 714,
753
+ "娜塔莎_JP": 715,
754
+ "希露瓦_JP": 716,
755
+ "瓦尔特_JP": 717,
756
+ "佩拉_JP": 718,
757
+ "布洛妮娅_JP": 719,
758
+ "虎克_JP": 720,
759
+ "素裳_JP": 721,
760
+ "克拉拉_JP": 722,
761
+ "符玄_JP": 723,
762
+ "白露_JP": 724,
763
+ "杰帕德_JP": 725,
764
+ "景元_JP": 726,
765
+ "藿藿_JP": 727,
766
+ "姬子_JP": 728,
767
+ "卡芙卡_JP": 729,
768
+ "穹_JP": 730,
769
+ "星_JP": 731,
770
+ "桂乃芬_JP": 732,
771
+ "艾丝妲_JP": 733,
772
+ "彦卿_JP": 734,
773
+ "玲可_JP": 735,
774
+ "托帕_JP": 736,
775
+ "驭空_JP": 737,
776
+ "浮烟_JP": 738,
777
+ "停云_JP": 739,
778
+ "镜流_JP": 740,
779
+ "罗刹_JP": 741,
780
+ "卢卡_JP": 742,
781
+ "史瓦罗_JP": 743,
782
+ "黑塔_JP": 744,
783
+ "桑博_JP": 745,
784
+ "伦纳德_JP": 746,
785
+ "明曦_JP": 747,
786
+ "银狼_JP": 748,
787
+ "帕姆_JP": 749,
788
+ "青雀_JP": 750,
789
+ "乔瓦尼_JP": 751,
790
+ "公输师傅_JP": 752,
791
+ "晴霓_JP": 753,
792
+ "螺丝咕姆_JP": 754,
793
+ "阿兰_JP": 755,
794
+ "奥列格_JP": 756,
795
+ "丹枢_JP": 757,
796
+ "尾巴_JP": 758,
797
+ "寒鸦_JP": 759,
798
+ "雪衣_JP": 760,
799
+ "可可利亚_JP": 761,
800
+ "青镞_JP": 762,
801
+ "半夏_JP": 763,
802
+ "银枝_JP": 764,
803
+ "大毫_JP": 765,
804
+ "霄翰_JP": 766,
805
+ "信使_JP": 767,
806
+ "费斯曼_JP": 768,
807
+ "绿芙蓉_JP": 769,
808
+ "dev_成男_JP": 770,
809
+ "金人会长_JP": 771,
810
+ "维利特_JP": 772,
811
+ "维尔德_JP": 773,
812
+ "斯科特_JP": 774,
813
+ "刃_JP": 775,
814
+ "卡波特_JP": 776,
815
+ "岩明_JP": 777,
816
+ "浣溪_JP": 778,
817
+ "净砚_JP": 779,
818
+ "紫月季_JP": 780,
819
+ "歌蒂_JP": 781,
820
+ "奇怪的云骑_JP": 782,
821
+ "幻胧_JP": 783,
822
+ "斯薇塔_JP": 784,
823
+ "隐书_JP": 785,
824
+ "三月七_EN": 786,
825
+ "丹恒_EN": 787,
826
+ "希儿_EN": 788,
827
+ "娜塔莎_EN": 789,
828
+ "希露瓦_EN": 790,
829
+ "瓦尔特_EN": 791,
830
+ "佩拉_EN": 792,
831
+ "布洛妮娅_EN": 793,
832
+ "虎克_EN": 794,
833
+ "素裳_EN": 795,
834
+ "克拉拉_EN": 796,
835
+ "符玄_EN": 797,
836
+ "白露_EN": 798,
837
+ "杰帕德_EN": 799,
838
+ "景元_EN": 800,
839
+ "藿藿_EN": 801,
840
+ "姬子_EN": 802,
841
+ "卡芙卡_EN": 803,
842
+ "穹_EN": 804,
843
+ "星_EN": 805,
844
+ "桂乃芬_EN": 806,
845
+ "艾丝妲_EN": 807,
846
+ "彦卿_EN": 808,
847
+ "玲可_EN": 809,
848
+ "托帕_EN": 810,
849
+ "驭空_EN": 811,
850
+ "浮烟_EN": 812,
851
+ "停云_EN": 813,
852
+ "镜流_EN": 814,
853
+ "罗刹_EN": 815,
854
+ "卢卡_EN": 816,
855
+ "史瓦罗_EN": 817,
856
+ "黑塔_EN": 818,
857
+ "桑博_EN": 819,
858
+ "伦纳德_EN": 820,
859
+ "明曦_EN": 821,
860
+ "银狼_EN": 822,
861
+ "帕姆_EN": 823,
862
+ "青雀_EN": 824,
863
+ "乔瓦尼_EN": 825,
864
+ "公输师傅_EN": 826,
865
+ "晴霓_EN": 827,
866
+ "螺丝咕姆_EN": 828,
867
+ "阿兰_EN": 829,
868
+ "奥列格_EN": 830,
869
+ "丹枢_EN": 831,
870
+ "尾巴_EN": 832,
871
+ "寒鸦_EN": 833,
872
+ "雪衣_EN": 834,
873
+ "可可利亚_EN": 835,
874
+ "青镞_EN": 836,
875
+ "半夏_EN": 837,
876
+ "银枝_EN": 838,
877
+ "大毫_EN": 839,
878
+ "霄翰_EN": 840,
879
+ "信使_EN": 841,
880
+ "费斯曼_EN": 842,
881
+ "绿芙蓉_EN": 843,
882
+ "dev_成男_EN": 844,
883
+ "金人会长_EN": 845,
884
+ "维利特_EN": 846,
885
+ "维尔德_EN": 847,
886
+ "刃_EN": 848,
887
+ "卡波特_EN": 849,
888
+ "岩明_EN": 850,
889
+ "浣溪_EN": 851,
890
+ "紫月季_EN": 852,
891
+ "幻胧_EN": 853,
892
+ "女声_EN": 854
893
  }
894
  },
895
  "model": {
 
945
  "n_layers_q": 3,
946
  "use_spectral_norm": false,
947
  "gin_channels": 256
948
+ },
949
+ "version": "2.1"
950
+ }
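The rewritten configs/config.json replaces the BanG Dream speaker table with a much larger spk2id map (speaker name plus language suffix → integer id) and raises n_speakers to 896, adding a "version": "2.1" field. The sketch below shows how such a table is typically consumed at inference time, i.e. turning a speaker name into the integer id fed to the speaker embedding; the lookup key is just an example entry from the table above.

```python
# Hedged sketch: resolve a speaker name to its embedding id via spk2id.
import json

with open("configs/config.json", encoding="utf-8") as f:
    hps = json.load(f)

spk2id = hps["data"]["spk2id"]
assert len(spk2id) <= hps["data"]["n_speakers"]  # 896 embedding slots in this config

sid = spk2id["派蒙_ZH"]  # -> 0; passed to the model as the speaker id tensor
print(sid, hps["version"])
```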
data_utils.py CHANGED
@@ -3,11 +3,13 @@ import random
3
  import torch
4
  import torch.utils.data
5
  from tqdm import tqdm
6
- from loguru import logger
 
7
  import commons
8
  from mel_processing import spectrogram_torch, mel_spectrogram_torch
9
  from utils import load_wav_to_torch, load_filepaths_and_text
10
- from text import cleaned_text_to_sequence, get_bert
 
11
 
12
  """Multi speaker version"""
13
 
@@ -40,7 +42,7 @@ class TextAudioSpeakerLoader(torch.utils.data.Dataset):
40
 
41
  self.add_blank = hparams.add_blank
42
  self.min_text_len = getattr(hparams, "min_text_len", 1)
43
- self.max_text_len = getattr(hparams, "max_text_len", 300)
44
 
45
  random.seed(1234)
46
  random.shuffle(self.audiopaths_sid_text)
@@ -85,13 +87,14 @@ class TextAudioSpeakerLoader(torch.utils.data.Dataset):
85
  # separate filename, speaker_id and text
86
  audiopath, sid, language, text, phones, tone, word2ph = audiopath_sid_text
87
 
88
- bert, ja_bert, phones, tone, language = self.get_text(
89
  text, word2ph, phones, tone, language, audiopath
90
  )
91
 
92
  spec, wav = self.get_audio(audiopath)
93
  sid = torch.LongTensor([int(self.spk_map[sid])])
94
- return (phones, spec, wav, sid, tone, language, bert, ja_bert)
 
95
 
96
  def get_audio(self, filename):
97
  audio, sampling_rate = load_wav_to_torch(filename)
@@ -131,7 +134,8 @@ class TextAudioSpeakerLoader(torch.utils.data.Dataset):
131
  center=False,
132
  )
133
  spec = torch.squeeze(spec, 0)
134
- torch.save(spec, spec_filename)
 
135
  return spec, audio_norm
136
 
137
  def get_text(self, text, word2ph, phone, tone, language_str, wav_path):
@@ -145,40 +149,28 @@ class TextAudioSpeakerLoader(torch.utils.data.Dataset):
145
  word2ph[0] += 1
146
  bert_path = wav_path.replace(".wav", ".bert.pt")
147
  try:
148
- bert = torch.load(bert_path)
149
- assert bert.shape[-1] == len(phone)
150
- except:
151
- bert = get_bert(text, word2ph, language_str)
152
- torch.save(bert, bert_path)
153
- assert bert.shape[-1] == len(phone), phone
154
 
155
  if language_str == "ZH":
156
- bert = bert
157
- ja_bert = torch.zeros(768, len(phone))
158
- elif language_str == "JA":
159
- ja_bert = bert
160
  bert = torch.zeros(1024, len(phone))
161
- else:
 
 
162
  bert = torch.zeros(1024, len(phone))
163
- ja_bert = torch.zeros(768, len(phone))
164
- assert bert.shape[-1] == len(phone), (
165
- bert.shape,
166
- len(phone),
167
- sum(word2ph),
168
- p1,
169
- p2,
170
- t1,
171
- t2,
172
- pold,
173
- pold2,
174
- word2ph,
175
- text,
176
- w2pho,
177
- )
178
  phone = torch.LongTensor(phone)
179
  tone = torch.LongTensor(tone)
180
  language = torch.LongTensor(language)
181
- return bert, ja_bert, phone, tone, language
182
 
183
  def get_sid(self, sid):
184
  sid = torch.LongTensor([int(sid)])
@@ -221,7 +213,9 @@ class TextAudioSpeakerCollate:
221
  tone_padded = torch.LongTensor(len(batch), max_text_len)
222
  language_padded = torch.LongTensor(len(batch), max_text_len)
223
  bert_padded = torch.FloatTensor(len(batch), 1024, max_text_len)
224
- ja_bert_padded = torch.FloatTensor(len(batch), 768, max_text_len)
 
 
225
 
226
  spec_padded = torch.FloatTensor(len(batch), batch[0][1].size(0), max_spec_len)
227
  wav_padded = torch.FloatTensor(len(batch), 1, max_wav_len)
@@ -232,6 +226,9 @@ class TextAudioSpeakerCollate:
232
  wav_padded.zero_()
233
  bert_padded.zero_()
234
  ja_bert_padded.zero_()
 
 
 
235
  for i in range(len(ids_sorted_decreasing)):
236
  row = batch[ids_sorted_decreasing[i]]
237
 
@@ -261,6 +258,11 @@ class TextAudioSpeakerCollate:
261
  ja_bert = row[7]
262
  ja_bert_padded[i, :, : ja_bert.size(1)] = ja_bert
263
 
 
 
 
 
 
264
  return (
265
  text_padded,
266
  text_lengths,
@@ -273,6 +275,8 @@ class TextAudioSpeakerCollate:
273
  language_padded,
274
  bert_padded,
275
  ja_bert_padded,
 
 
276
  )
277
 
278
 
 
3
  import torch
4
  import torch.utils.data
5
  from tqdm import tqdm
6
+ import numpy as np
7
+ from tools.log import logger
8
  import commons
9
  from mel_processing import spectrogram_torch, mel_spectrogram_torch
10
  from utils import load_wav_to_torch, load_filepaths_and_text
11
+ from text import cleaned_text_to_sequence
12
+ from config import config
13
 
14
  """Multi speaker version"""
15
 
 
42
 
43
  self.add_blank = hparams.add_blank
44
  self.min_text_len = getattr(hparams, "min_text_len", 1)
45
+ self.max_text_len = getattr(hparams, "max_text_len", 384)
46
 
47
  random.seed(1234)
48
  random.shuffle(self.audiopaths_sid_text)
 
87
  # separate filename, speaker_id and text
88
  audiopath, sid, language, text, phones, tone, word2ph = audiopath_sid_text
89
 
90
+ bert, ja_bert, en_bert, phones, tone, language = self.get_text(
91
  text, word2ph, phones, tone, language, audiopath
92
  )
93
 
94
  spec, wav = self.get_audio(audiopath)
95
  sid = torch.LongTensor([int(self.spk_map[sid])])
96
+ emo = torch.FloatTensor(np.load(audiopath.replace(".wav", ".emo.npy")))
97
+ return (phones, spec, wav, sid, tone, language, bert, ja_bert, en_bert, emo)
98
 
99
  def get_audio(self, filename):
100
  audio, sampling_rate = load_wav_to_torch(filename)
 
134
  center=False,
135
  )
136
  spec = torch.squeeze(spec, 0)
137
+ if config.train_ms_config.spec_cache:
138
+ torch.save(spec, spec_filename)
139
  return spec, audio_norm
140
 
141
  def get_text(self, text, word2ph, phone, tone, language_str, wav_path):
 
149
  word2ph[0] += 1
150
  bert_path = wav_path.replace(".wav", ".bert.pt")
151
  try:
152
+ bert_ori = torch.load(bert_path)
153
+ assert bert_ori.shape[-1] == len(phone)
154
+ except Exception as e:
155
+ logger.warning("Bert load Failed")
156
+ logger.warning(e)
 
157
 
158
  if language_str == "ZH":
159
+ bert = bert_ori
160
+ ja_bert = torch.zeros(1024, len(phone))
161
+ en_bert = torch.zeros(1024, len(phone))
162
+ elif language_str == "JP":
163
  bert = torch.zeros(1024, len(phone))
164
+ ja_bert = bert_ori
165
+ en_bert = torch.zeros(1024, len(phone))
166
+ elif language_str == "EN":
167
  bert = torch.zeros(1024, len(phone))
168
+ ja_bert = torch.zeros(1024, len(phone))
169
+ en_bert = bert_ori
 
 
 
 
 
 
 
 
 
 
 
 
 
170
  phone = torch.LongTensor(phone)
171
  tone = torch.LongTensor(tone)
172
  language = torch.LongTensor(language)
173
+ return bert, ja_bert, en_bert, phone, tone, language
174
 
175
  def get_sid(self, sid):
176
  sid = torch.LongTensor([int(sid)])
 
213
  tone_padded = torch.LongTensor(len(batch), max_text_len)
214
  language_padded = torch.LongTensor(len(batch), max_text_len)
215
  bert_padded = torch.FloatTensor(len(batch), 1024, max_text_len)
216
+ ja_bert_padded = torch.FloatTensor(len(batch), 1024, max_text_len)
217
+ en_bert_padded = torch.FloatTensor(len(batch), 1024, max_text_len)
218
+ emo = torch.FloatTensor(len(batch), 1024)
219
 
220
  spec_padded = torch.FloatTensor(len(batch), batch[0][1].size(0), max_spec_len)
221
  wav_padded = torch.FloatTensor(len(batch), 1, max_wav_len)
 
226
  wav_padded.zero_()
227
  bert_padded.zero_()
228
  ja_bert_padded.zero_()
229
+ en_bert_padded.zero_()
230
+ emo.zero_()
231
+
232
  for i in range(len(ids_sorted_decreasing)):
233
  row = batch[ids_sorted_decreasing[i]]
234
 
 
258
  ja_bert = row[7]
259
  ja_bert_padded[i, :, : ja_bert.size(1)] = ja_bert
260
 
261
+ en_bert = row[8]
262
+ en_bert_padded[i, :, : en_bert.size(1)] = en_bert
263
+
264
+ emo[i, :] = row[9]
265
+
266
  return (
267
  text_padded,
268
  text_lengths,
 
275
  language_padded,
276
  bert_padded,
277
  ja_bert_padded,
278
+ en_bert_padded,
279
+ emo,
280
  )
281
 
282
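In short, the `data_utils.py` changes make each dataset item carry `en_bert` and a per-utterance emotion embedding, so every training wav must have a sibling `.emo.npy` file (written by `emo_gen.py`, added below) that the loader reads with `np.load`. A minimal sketch of that on-disk contract, using a hypothetical wav path:

```python
# Sketch of the on-disk contract introduced above: each wav needs a sibling
# .emo.npy holding the pooled wav2vec2 emotion embedding (1024 features).
import numpy as np
import torch

audiopath = "Data/BanGDream/audios/wavs/example.wav"  # hypothetical path
emo = torch.FloatTensor(np.load(audiopath.replace(".wav", ".emo.npy")))
print(emo.shape)  # 1024 features, matching the collate buffer FloatTensor(len(batch), 1024)
```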
 
default_config.yml ADDED
@@ -0,0 +1,174 @@
1
+ # 全局配置
2
+ # 对于希望在同一时间使用多个配置文件的情况,例如两个GPU同时跑两个训练集:通过环境变量指定配置文件,不指定则默认为./config.yml
3
+
4
+ # 拟提供通用路径配置,统一存放数据,避免数据放得很乱
5
+ # 每个数据集与其对应的模型存放至统一路径下,后续所有的路径配置均为相对于datasetPath的路径
6
+ # 不填或者填空则路径为相对于项目根目录的路径
7
+ dataset_path: "Data/"
8
+
9
+ # 模型镜像源,默认huggingface,使用openi镜像源需指定openi_token
10
+ mirror: ""
11
+ openi_token: "" # openi token
12
+
13
+ # resample 音频重采样配置
14
+ # 注意, “:” 后需要加空格
15
+ resample:
16
+ # 目标重采样率
17
+ sampling_rate: 44100
18
+ # 音频文件输入路径,重采样会将该路径下所有.wav音频文件重采样
19
+ # 请填入相对于datasetPath的相对路径
20
+ in_dir: "audios/raw" # 相对于根目录的路径为 /datasetPath/in_dir
21
+ # 音频文件重采样后输出路径
22
+ out_dir: "audios/wavs"
23
+
24
+
25
+ # preprocess_text 数据集预处理相关配置
26
+ # 注意, “:” 后需要加空格
27
+ preprocess_text:
28
+ # 原始文本文件路径,文本格式应为{wav_path}|{speaker_name}|{language}|{text}。
29
+ transcription_path: "filelists/你的数据集文本.list"
30
+ # 数据清洗后文本路径,可以不填。不填则将在原始文本目录生成
31
+ cleaned_path: ""
32
+ # 训练集路径
33
+ train_path: "filelists/train.list"
34
+ # 验证集路径
35
+ val_path: "filelists/val.list"
36
+ # 配置文件路径
37
+ config_path: "config.json"
38
+ # 每个speaker的验证集条数
39
+ val_per_spk: 4
40
+ # 验证集最大条数,多于的会被截断并放到训练集中
41
+ max_val_total: 8
42
+ # 是否进行数据清洗
43
+ clean: true
44
+
45
+
46
+ # bert_gen 相关配置
47
+ # 注意, “:” 后需要加空格
48
+ bert_gen:
49
+ # 训练数据集配置文件路径
50
+ config_path: "config.json"
51
+ # 并行数
52
+ num_processes: 2
53
+ # 使用设备:可选项 "cuda" 显卡推理,"cpu" cpu推理
54
+ # 该选项同时决定了get_bert_feature的默认设备
55
+ device: "cuda"
56
+ # 使用多卡推理
57
+ use_multi_device: false
58
+
59
+ # emo_gen 相关配置
60
+ # 注意, “:” 后需要加空格
61
+ emo_gen:
62
+ # 训练数据集配置文件路径
63
+ config_path: "config.json"
64
+ # 并行数
65
+ num_processes: 2
66
+ # 使用设备:可选项 "cuda" 显卡推理,"cpu" cpu推理
67
+ device: "cuda"
68
+
69
+ # train 训练配置
70
+ # 注意, “:” 后需要加空格
71
+ train_ms:
72
+ env:
73
+ MASTER_ADDR: "localhost"
74
+ MASTER_PORT: 10086
75
+ WORLD_SIZE: 1
76
+ LOCAL_RANK: 0
77
+ RANK: 0
78
+ # 可以填写任意名的环境变量
79
+ # THE_ENV_VAR_YOU_NEED_TO_USE: "1234567"
80
+ # 底模设置
81
+ base:
82
+ use_base_model: false
83
+ repo_id: "Stardust_minus/Bert-VITS2"
84
+ model_image: "Bert-VITS2_2.1-Emo底模" # openi网页的模型名
85
+ # 训练模型存储目录:与旧版本的区别,原先数据集是存放在logs/model_name下的,现在改为统一存放在Data/你的数据集/models下
86
+ model: "models"
87
+ # 配置文件路径
88
+ config_path: "configs/config.json"
89
+ # 训练使用的worker,不建议超过CPU核心数
90
+ num_workers: 16
91
+ # 关闭此项可以节约接近50%的磁盘空间,但是可能导致实际训练速度变慢和更高的CPU使用率。
92
+ spec_cache: True
93
+ # 保存的检查点数量,多于此数目的权重会被删除来节省空间。
94
+ keep_ckpts: 8
95
+
96
+
97
+ # webui webui配置
98
+ # 注意, “:” 后需要加空格
99
+ webui:
100
+ # 推理设备
101
+ device: "cuda"
102
+ # 模型路径
103
+ model: "genshin/models/G_8000.pth"
104
+ # 配置文件路径
105
+ config_path: "configs/config.json"
106
+ # 端口号
107
+ port: 7860
108
+ # 是否公开部署,对外网开放
109
+ share: false
110
+ # 是否开启debug模式
111
+ debug: false
112
+ # 语种识别库,可选langid, fastlid
113
+ language_identification_library: "langid"
114
+
115
+
116
+ # server api配置
117
+ # 注意, “:” 后需要加空格
118
+ # 注意,本配置下的所有配置均为相对于根目录的路径
119
+ server:
120
+ # 端口号
121
+ port: 5000
122
+ # 模型默认使用设备:但是当前并没有实现这个配置。
123
+ device: "cuda"
124
+ # 需要加载的所有模型的配置
125
+ # 注意,所有模型都必须正确配置model与config的路径,空路径会导致加载错误。
126
+ models:
127
+ - # 模型的路径
128
+ model: ""
129
+ # 模型config.json的路径
130
+ config: ""
131
+ # 模型使用设备,若填写则会覆盖默认配置
132
+ device: "cuda"
133
+ # 模型默认使用的语言
134
+ language: "ZH"
135
+ # 模型人物默认参数
136
+ # 不必填写所有人物,不填的使用默认值
137
+ # 暂时不用填写,当前尚未实现按人区分配置
138
+ speakers:
139
+ - speaker: "科比"
140
+ sdp_ratio: 0.2
141
+ noise_scale: 0.6
142
+ noise_scale_w: 0.8
143
+ length_scale: 1
144
+ - speaker: "五条悟"
145
+ sdp_ratio: 0.3
146
+ noise_scale: 0.7
147
+ noise_scale_w: 0.8
148
+ length_scale: 0.5
149
+ - speaker: "安倍晋三"
150
+ sdp_ratio: 0.2
151
+ noise_scale: 0.6
152
+ noise_scale_w: 0.8
153
+ length_scale: 1.2
154
+ - # 模型的路径
155
+ model: ""
156
+ # 模型config.json的路径
157
+ config: ""
158
+ # 模型使用设备,若填写则会覆盖默认配置
159
+ device: "cpu"
160
+ # 模型默认使用的语言
161
+ language: "JP"
162
+ # 模型人物默认参数
163
+ # 不必填写所有人物,不填的使用默认值
164
+ speakers: [ ] # 也可以不填
165
+
166
+
167
+ # 百度翻译开放平台 api配置
168
+ # api接入文档 https://api.fanyi.baidu.com/doc/21
169
+ # 请不要在github等网站公开分享你的app id 与 key
170
+ translate:
171
+ # 你的APPID
172
+ "app_key": ""
173
+ # 你的密钥
174
+ "secret_key": ""
emo_gen.py ADDED
@@ -0,0 +1,174 @@
1
+ import argparse
2
+ import os
3
+ from pathlib import Path
4
+
5
+ import librosa
6
+ import numpy as np
7
+ import torch
8
+ import torch.nn as nn
9
+ from torch.utils.data import Dataset
10
+ from torch.utils.data import DataLoader, Dataset
11
+ from tqdm import tqdm
12
+ from transformers import Wav2Vec2Processor
13
+ from transformers.models.wav2vec2.modeling_wav2vec2 import (
14
+ Wav2Vec2Model,
15
+ Wav2Vec2PreTrainedModel,
16
+ )
17
+ import sys
18
+ import utils
19
+ from config import config
20
+
21
+
22
+ class RegressionHead(nn.Module):
23
+ r"""Classification head."""
24
+
25
+ def __init__(self, config):
26
+ super().__init__()
27
+
28
+ self.dense = nn.Linear(config.hidden_size, config.hidden_size)
29
+ self.dropout = nn.Dropout(config.final_dropout)
30
+ self.out_proj = nn.Linear(config.hidden_size, config.num_labels)
31
+
32
+ def forward(self, features, **kwargs):
33
+ x = features
34
+ x = self.dropout(x)
35
+ x = self.dense(x)
36
+ x = torch.tanh(x)
37
+ x = self.dropout(x)
38
+ x = self.out_proj(x)
39
+
40
+ return x
41
+
42
+
43
+ class EmotionModel(Wav2Vec2PreTrainedModel):
44
+ r"""Speech emotion classifier."""
45
+
46
+ def __init__(self, config):
47
+ super().__init__(config)
48
+
49
+ self.config = config
50
+ self.wav2vec2 = Wav2Vec2Model(config)
51
+ self.classifier = RegressionHead(config)
52
+ self.init_weights()
53
+
54
+ def forward(
55
+ self,
56
+ input_values,
57
+ ):
58
+ outputs = self.wav2vec2(input_values)
59
+ hidden_states = outputs[0]
60
+ hidden_states = torch.mean(hidden_states, dim=1)
61
+ logits = self.classifier(hidden_states)
62
+
63
+ return hidden_states, logits
64
+
65
+
66
+ class AudioDataset(Dataset):
67
+ def __init__(self, list_of_wav_files, sr, processor):
68
+ self.list_of_wav_files = list_of_wav_files
69
+ self.processor = processor
70
+ self.sr = sr
71
+
72
+ def __len__(self):
73
+ return len(self.list_of_wav_files)
74
+
75
+ def __getitem__(self, idx):
76
+ wav_file = self.list_of_wav_files[idx]
77
+ audio_data, _ = librosa.load(wav_file, sr=self.sr)
78
+ processed_data = self.processor(audio_data, sampling_rate=self.sr)[
79
+ "input_values"
80
+ ][0]
81
+ return torch.from_numpy(processed_data)
82
+
83
+
84
+ def process_func(
85
+ x: np.ndarray,
86
+ sampling_rate: int,
87
+ model: EmotionModel,
88
+ processor: Wav2Vec2Processor,
89
+ device: str,
90
+ embeddings: bool = False,
91
+ ) -> np.ndarray:
92
+ device = (
93
+ "cuda:0"
94
+ if torch.cuda.is_available()
95
+ else (
96
+ "mps"
97
+ if sys.platform == "darwin" and torch.backends.mps.is_available()
98
+ else "cpu"
99
+ )
100
+ )
101
+ r"""Predict emotions or extract embeddings from raw audio signal."""
102
+ model = model.to(device)
103
+ y = processor(x, sampling_rate=sampling_rate)
104
+ y = y["input_values"][0]
105
+ y = torch.from_numpy(y).unsqueeze(0).to(device)
106
+
107
+ # run through model
108
+ with torch.no_grad():
109
+ y = model(y)[0 if embeddings else 1]
110
+
111
+ # convert to numpy
112
+ y = y.detach().cpu().numpy()
113
+
114
+ return y
115
+
116
+
117
+
118
+ def get_emo(path):
119
+ wav, sr = librosa.load(path, sr=16000)
120
+ device = config.bert_gen_config.device
121
+ print("successfully generate the emo vec")
122
+ return process_func(
123
+ np.expand_dims(wav, 0).astype(np.float32),
124
+ sr,
125
+ model,
126
+ processor,
127
+ device,
128
+ embeddings=True,
129
+ ).squeeze(0)
130
+
131
+
132
+ if __name__ == "__main__":
133
+ parser = argparse.ArgumentParser()
134
+ parser.add_argument(
135
+ "-c", "--config", type=str, default=config.bert_gen_config.config_path
136
+ )
137
+ parser.add_argument(
138
+ "--num_processes", type=int, default=config.bert_gen_config.num_processes
139
+ )
140
+ args, _ = parser.parse_known_args()
141
+ config_path = args.config
142
+ hps = utils.get_hparams_from_file(config_path)
143
+
144
+ device = config.bert_gen_config.device
145
+
146
+ model_name = "./emotional/wav2vec2-large-robust-12-ft-emotion-msp-dim"
147
+ REPO_ID = "audeering/wav2vec2-large-robust-12-ft-emotion-msp-dim"
148
+ if not Path(model_name).joinpath("pytorch_model.bin").exists():
149
+ utils.download_emo_models(config.mirror, REPO_ID, model_name)
150
+
151
+ processor = Wav2Vec2Processor.from_pretrained(model_name)
152
+ model = EmotionModel.from_pretrained(model_name).to(device)
153
+
154
+ lines = []
155
+ with open(hps.data.training_files, encoding="utf-8") as f:
156
+ lines.extend(f.readlines())
157
+
158
+ with open(hps.data.validation_files, encoding="utf-8") as f:
159
+ lines.extend(f.readlines())
160
+
161
+ wavnames = [line.split("|")[0] for line in lines]
162
+ dataset = AudioDataset(wavnames, 16000, processor)
163
+ data_loader = DataLoader(dataset, batch_size=1, shuffle=False, num_workers=16)
164
+
165
+ with torch.no_grad():
166
+ for i, data in tqdm(enumerate(data_loader), total=len(data_loader)):
167
+ wavname = wavnames[i]
168
+ emo_path = wavname.replace(".wav", ".emo.npy")
169
+ if os.path.exists(emo_path):
170
+ continue
171
+ emb = model(data.to(device))[0].detach().cpu().numpy()
172
+ np.save(emo_path, emb)
173
+
174
+ print("Emo vec 生成完毕!")
emotional/wav2vec2-large-robust-12-ft-emotion-msp-dim/.gitattributes ADDED
@@ -0,0 +1,28 @@
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bin.* filter=lfs diff=lfs merge=lfs -text
5
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.model filter=lfs diff=lfs merge=lfs -text
12
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
13
+ *.onnx filter=lfs diff=lfs merge=lfs -text
14
+ *.ot filter=lfs diff=lfs merge=lfs -text
15
+ *.parquet filter=lfs diff=lfs merge=lfs -text
16
+ *.pb filter=lfs diff=lfs merge=lfs -text
17
+ *.pt filter=lfs diff=lfs merge=lfs -text
18
+ *.pth filter=lfs diff=lfs merge=lfs -text
19
+ *.rar filter=lfs diff=lfs merge=lfs -text
20
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
21
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
22
+ *.tflite filter=lfs diff=lfs merge=lfs -text
23
+ *.tgz filter=lfs diff=lfs merge=lfs -text
24
+ *.wasm filter=lfs diff=lfs merge=lfs -text
25
+ *.xz filter=lfs diff=lfs merge=lfs -text
26
+ *.zip filter=lfs diff=lfs merge=lfs -text
27
+ *.zstandard filter=lfs diff=lfs merge=lfs -text
28
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
emotional/wav2vec2-large-robust-12-ft-emotion-msp-dim/LICENSE ADDED
@@ -0,0 +1,437 @@
1
+ Attribution-NonCommercial-ShareAlike 4.0 International
2
+
3
+ =======================================================================
4
+
5
+ Creative Commons Corporation ("Creative Commons") is not a law firm and
6
+ does not provide legal services or legal advice. Distribution of
7
+ Creative Commons public licenses does not create a lawyer-client or
8
+ other relationship. Creative Commons makes its licenses and related
9
+ information available on an "as-is" basis. Creative Commons gives no
10
+ warranties regarding its licenses, any material licensed under their
11
+ terms and conditions, or any related information. Creative Commons
12
+ disclaims all liability for damages resulting from their use to the
13
+ fullest extent possible.
14
+
15
+ Using Creative Commons Public Licenses
16
+
17
+ Creative Commons public licenses provide a standard set of terms and
18
+ conditions that creators and other rights holders may use to share
19
+ original works of authorship and other material subject to copyright
20
+ and certain other rights specified in the public license below. The
21
+ following considerations are for informational purposes only, are not
22
+ exhaustive, and do not form part of our licenses.
23
+
24
+ Considerations for licensors: Our public licenses are
25
+ intended for use by those authorized to give the public
26
+ permission to use material in ways otherwise restricted by
27
+ copyright and certain other rights. Our licenses are
28
+ irrevocable. Licensors should read and understand the terms
29
+ and conditions of the license they choose before applying it.
30
+ Licensors should also secure all rights necessary before
31
+ applying our licenses so that the public can reuse the
32
+ material as expected. Licensors should clearly mark any
33
+ material not subject to the license. This includes other CC-
34
+ licensed material, or material used under an exception or
35
+ limitation to copyright. More considerations for licensors:
36
+ wiki.creativecommons.org/Considerations_for_licensors
37
+
38
+ Considerations for the public: By using one of our public
39
+ licenses, a licensor grants the public permission to use the
40
+ licensed material under specified terms and conditions. If
41
+ the licensor's permission is not necessary for any reason--for
42
+ example, because of any applicable exception or limitation to
43
+ copyright--then that use is not regulated by the license. Our
44
+ licenses grant only permissions under copyright and certain
45
+ other rights that a licensor has authority to grant. Use of
46
+ the licensed material may still be restricted for other
47
+ reasons, including because others have copyright or other
48
+ rights in the material. A licensor may make special requests,
49
+ such as asking that all changes be marked or described.
50
+ Although not required by our licenses, you are encouraged to
51
+ respect those requests where reasonable. More considerations
52
+ for the public:
53
+ wiki.creativecommons.org/Considerations_for_licensees
54
+
55
+ =======================================================================
56
+
57
+ Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International
58
+ Public License
59
+
60
+ By exercising the Licensed Rights (defined below), You accept and agree
61
+ to be bound by the terms and conditions of this Creative Commons
62
+ Attribution-NonCommercial-ShareAlike 4.0 International Public License
63
+ ("Public License"). To the extent this Public License may be
64
+ interpreted as a contract, You are granted the Licensed Rights in
65
+ consideration of Your acceptance of these terms and conditions, and the
66
+ Licensor grants You such rights in consideration of benefits the
67
+ Licensor receives from making the Licensed Material available under
68
+ these terms and conditions.
69
+
70
+
71
+ Section 1 -- Definitions.
72
+
73
+ a. Adapted Material means material subject to Copyright and Similar
74
+ Rights that is derived from or based upon the Licensed Material
75
+ and in which the Licensed Material is translated, altered,
76
+ arranged, transformed, or otherwise modified in a manner requiring
77
+ permission under the Copyright and Similar Rights held by the
78
+ Licensor. For purposes of this Public License, where the Licensed
79
+ Material is a musical work, performance, or sound recording,
80
+ Adapted Material is always produced where the Licensed Material is
81
+ synched in timed relation with a moving image.
82
+
83
+ b. Adapter's License means the license You apply to Your Copyright
84
+ and Similar Rights in Your contributions to Adapted Material in
85
+ accordance with the terms and conditions of this Public License.
86
+
87
+ c. BY-NC-SA Compatible License means a license listed at
88
+ creativecommons.org/compatiblelicenses, approved by Creative
89
+ Commons as essentially the equivalent of this Public License.
90
+
91
+ d. Copyright and Similar Rights means copyright and/or similar rights
92
+ closely related to copyright including, without limitation,
93
+ performance, broadcast, sound recording, and Sui Generis Database
94
+ Rights, without regard to how the rights are labeled or
95
+ categorized. For purposes of this Public License, the rights
96
+ specified in Section 2(b)(1)-(2) are not Copyright and Similar
97
+ Rights.
98
+
99
+ e. Effective Technological Measures means those measures that, in the
100
+ absence of proper authority, may not be circumvented under laws
101
+ fulfilling obligations under Article 11 of the WIPO Copyright
102
+ Treaty adopted on December 20, 1996, and/or similar international
103
+ agreements.
104
+
105
+ f. Exceptions and Limitations means fair use, fair dealing, and/or
106
+ any other exception or limitation to Copyright and Similar Rights
107
+ that applies to Your use of the Licensed Material.
108
+
109
+ g. License Elements means the license attributes listed in the name
110
+ of a Creative Commons Public License. The License Elements of this
111
+ Public License are Attribution, NonCommercial, and ShareAlike.
112
+
113
+ h. Licensed Material means the artistic or literary work, database,
114
+ or other material to which the Licensor applied this Public
115
+ License.
116
+
117
+ i. Licensed Rights means the rights granted to You subject to the
118
+ terms and conditions of this Public License, which are limited to
119
+ all Copyright and Similar Rights that apply to Your use of the
120
+ Licensed Material and that the Licensor has authority to license.
121
+
122
+ j. Licensor means the individual(s) or entity(ies) granting rights
123
+ under this Public License.
124
+
125
+ k. NonCommercial means not primarily intended for or directed towards
126
+ commercial advantage or monetary compensation. For purposes of
127
+ this Public License, the exchange of the Licensed Material for
128
+ other material subject to Copyright and Similar Rights by digital
129
+ file-sharing or similar means is NonCommercial provided there is
130
+ no payment of monetary compensation in connection with the
131
+ exchange.
132
+
133
+ l. Share means to provide material to the public by any means or
134
+ process that requires permission under the Licensed Rights, such
135
+ as reproduction, public display, public performance, distribution,
136
+ dissemination, communication, or importation, and to make material
137
+ available to the public including in ways that members of the
138
+ public may access the material from a place and at a time
139
+ individually chosen by them.
140
+
141
+ m. Sui Generis Database Rights means rights other than copyright
142
+ resulting from Directive 96/9/EC of the European Parliament and of
143
+ the Council of 11 March 1996 on the legal protection of databases,
144
+ as amended and/or succeeded, as well as other essentially
145
+ equivalent rights anywhere in the world.
146
+
147
+ n. You means the individual or entity exercising the Licensed Rights
148
+ under this Public License. Your has a corresponding meaning.
149
+
150
+
151
+ Section 2 -- Scope.
152
+
153
+ a. License grant.
154
+
155
+ 1. Subject to the terms and conditions of this Public License,
156
+ the Licensor hereby grants You a worldwide, royalty-free,
157
+ non-sublicensable, non-exclusive, irrevocable license to
158
+ exercise the Licensed Rights in the Licensed Material to:
159
+
160
+ a. reproduce and Share the Licensed Material, in whole or
161
+ in part, for NonCommercial purposes only; and
162
+
163
+ b. produce, reproduce, and Share Adapted Material for
164
+ NonCommercial purposes only.
165
+
166
+ 2. Exceptions and Limitations. For the avoidance of doubt, where
167
+ Exceptions and Limitations apply to Your use, this Public
168
+ License does not apply, and You do not need to comply with
169
+ its terms and conditions.
170
+
171
+ 3. Term. The term of this Public License is specified in Section
172
+ 6(a).
173
+
174
+ 4. Media and formats; technical modifications allowed. The
175
+ Licensor authorizes You to exercise the Licensed Rights in
176
+ all media and formats whether now known or hereafter created,
177
+ and to make technical modifications necessary to do so. The
178
+ Licensor waives and/or agrees not to assert any right or
179
+ authority to forbid You from making technical modifications
180
+ necessary to exercise the Licensed Rights, including
181
+ technical modifications necessary to circumvent Effective
182
+ Technological Measures. For purposes of this Public License,
183
+ simply making modifications authorized by this Section 2(a)
184
+ (4) never produces Adapted Material.
185
+
186
+ 5. Downstream recipients.
187
+
188
+ a. Offer from the Licensor -- Licensed Material. Every
189
+ recipient of the Licensed Material automatically
190
+ receives an offer from the Licensor to exercise the
191
+ Licensed Rights under the terms and conditions of this
192
+ Public License.
193
+
194
+ b. Additional offer from the Licensor -- Adapted Material.
195
+ Every recipient of Adapted Material from You
196
+ automatically receives an offer from the Licensor to
197
+ exercise the Licensed Rights in the Adapted Material
198
+ under the conditions of the Adapter's License You apply.
199
+
200
+ c. No downstream restrictions. You may not offer or impose
201
+ any additional or different terms or conditions on, or
202
+ apply any Effective Technological Measures to, the
203
+ Licensed Material if doing so restricts exercise of the
204
+ Licensed Rights by any recipient of the Licensed
205
+ Material.
206
+
207
+ 6. No endorsement. Nothing in this Public License constitutes or
208
+ may be construed as permission to assert or imply that You
209
+ are, or that Your use of the Licensed Material is, connected
210
+ with, or sponsored, endorsed, or granted official status by,
211
+ the Licensor or others designated to receive attribution as
212
+ provided in Section 3(a)(1)(A)(i).
213
+
214
+ b. Other rights.
215
+
216
+ 1. Moral rights, such as the right of integrity, are not
217
+ licensed under this Public License, nor are publicity,
218
+ privacy, and/or other similar personality rights; however, to
219
+ the extent possible, the Licensor waives and/or agrees not to
220
+ assert any such rights held by the Licensor to the limited
221
+ extent necessary to allow You to exercise the Licensed
222
+ Rights, but not otherwise.
223
+
224
+ 2. Patent and trademark rights are not licensed under this
225
+ Public License.
226
+
227
+ 3. To the extent possible, the Licensor waives any right to
228
+ collect royalties from You for the exercise of the Licensed
229
+ Rights, whether directly or through a collecting society
230
+ under any voluntary or waivable statutory or compulsory
231
+ licensing scheme. In all other cases the Licensor expressly
232
+ reserves any right to collect such royalties, including when
233
+ the Licensed Material is used other than for NonCommercial
234
+ purposes.
235
+
236
+
237
+ Section 3 -- License Conditions.
238
+
239
+ Your exercise of the Licensed Rights is expressly made subject to the
240
+ following conditions.
241
+
242
+ a. Attribution.
243
+
244
+ 1. If You Share the Licensed Material (including in modified
245
+ form), You must:
246
+
247
+ a. retain the following if it is supplied by the Licensor
248
+ with the Licensed Material:
249
+
250
+ i. identification of the creator(s) of the Licensed
251
+ Material and any others designated to receive
252
+ attribution, in any reasonable manner requested by
253
+ the Licensor (including by pseudonym if
254
+ designated);
255
+
256
+ ii. a copyright notice;
257
+
258
+ iii. a notice that refers to this Public License;
259
+
260
+ iv. a notice that refers to the disclaimer of
261
+ warranties;
262
+
263
+ v. a URI or hyperlink to the Licensed Material to the
264
+ extent reasonably practicable;
265
+
266
+ b. indicate if You modified the Licensed Material and
267
+ retain an indication of any previous modifications; and
268
+
269
+ c. indicate the Licensed Material is licensed under this
270
+ Public License, and include the text of, or the URI or
271
+ hyperlink to, this Public License.
272
+
273
+ 2. You may satisfy the conditions in Section 3(a)(1) in any
274
+ reasonable manner based on the medium, means, and context in
275
+ which You Share the Licensed Material. For example, it may be
276
+ reasonable to satisfy the conditions by providing a URI or
277
+ hyperlink to a resource that includes the required
278
+ information.
279
+ 3. If requested by the Licensor, You must remove any of the
280
+ information required by Section 3(a)(1)(A) to the extent
281
+ reasonably practicable.
282
+
283
+ b. ShareAlike.
284
+
285
+ In addition to the conditions in Section 3(a), if You Share
286
+ Adapted Material You produce, the following conditions also apply.
287
+
288
+ 1. The Adapter's License You apply must be a Creative Commons
289
+ license with the same License Elements, this version or
290
+ later, or a BY-NC-SA Compatible License.
291
+
292
+ 2. You must include the text of, or the URI or hyperlink to, the
293
+ Adapter's License You apply. You may satisfy this condition
294
+ in any reasonable manner based on the medium, means, and
295
+ context in which You Share Adapted Material.
296
+
297
+ 3. You may not offer or impose any additional or different terms
298
+ or conditions on, or apply any Effective Technological
299
+ Measures to, Adapted Material that restrict exercise of the
300
+ rights granted under the Adapter's License You apply.
301
+
302
+
303
+ Section 4 -- Sui Generis Database Rights.
304
+
305
+ Where the Licensed Rights include Sui Generis Database Rights that
306
+ apply to Your use of the Licensed Material:
307
+
308
+ a. for the avoidance of doubt, Section 2(a)(1) grants You the right
309
+ to extract, reuse, reproduce, and Share all or a substantial
310
+ portion of the contents of the database for NonCommercial purposes
311
+ only;
312
+
313
+ b. if You include all or a substantial portion of the database
314
+ contents in a database in which You have Sui Generis Database
315
+ Rights, then the database in which You have Sui Generis Database
316
+ Rights (but not its individual contents) is Adapted Material,
317
+ including for purposes of Section 3(b); and
318
+
319
+ c. You must comply with the conditions in Section 3(a) if You Share
320
+ all or a substantial portion of the contents of the database.
321
+
322
+ For the avoidance of doubt, this Section 4 supplements and does not
323
+ replace Your obligations under this Public License where the Licensed
324
+ Rights include other Copyright and Similar Rights.
325
+
326
+
327
+ Section 5 -- Disclaimer of Warranties and Limitation of Liability.
328
+
329
+ a. UNLESS OTHERWISE SEPARATELY UNDERTAKEN BY THE LICENSOR, TO THE
330
+ EXTENT POSSIBLE, THE LICENSOR OFFERS THE LICENSED MATERIAL AS-IS
331
+ AND AS-AVAILABLE, AND MAKES NO REPRESENTATIONS OR WARRANTIES OF
332
+ ANY KIND CONCERNING THE LICENSED MATERIAL, WHETHER EXPRESS,
333
+ IMPLIED, STATUTORY, OR OTHER. THIS INCLUDES, WITHOUT LIMITATION,
334
+ WARRANTIES OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR
335
+ PURPOSE, NON-INFRINGEMENT, ABSENCE OF LATENT OR OTHER DEFECTS,
336
+ ACCURACY, OR THE PRESENCE OR ABSENCE OF ERRORS, WHETHER OR NOT
337
+ KNOWN OR DISCOVERABLE. WHERE DISCLAIMERS OF WARRANTIES ARE NOT
338
+ ALLOWED IN FULL OR IN PART, THIS DISCLAIMER MAY NOT APPLY TO YOU.
339
+
340
+ b. TO THE EXTENT POSSIBLE, IN NO EVENT WILL THE LICENSOR BE LIABLE
341
+ TO YOU ON ANY LEGAL THEORY (INCLUDING, WITHOUT LIMITATION,
342
+ NEGLIGENCE) OR OTHERWISE FOR ANY DIRECT, SPECIAL, INDIRECT,
343
+ INCIDENTAL, CONSEQUENTIAL, PUNITIVE, EXEMPLARY, OR OTHER LOSSES,
344
+ COSTS, EXPENSES, OR DAMAGES ARISING OUT OF THIS PUBLIC LICENSE OR
345
+ USE OF THE LICENSED MATERIAL, EVEN IF THE LICENSOR HAS BEEN
346
+ ADVISED OF THE POSSIBILITY OF SUCH LOSSES, COSTS, EXPENSES, OR
347
+ DAMAGES. WHERE A LIMITATION OF LIABILITY IS NOT ALLOWED IN FULL OR
348
+ IN PART, THIS LIMITATION MAY NOT APPLY TO YOU.
349
+
350
+ c. The disclaimer of warranties and limitation of liability provided
351
+ above shall be interpreted in a manner that, to the extent
352
+ possible, most closely approximates an absolute disclaimer and
353
+ waiver of all liability.
354
+
355
+
356
+ Section 6 -- Term and Termination.
357
+
358
+ a. This Public License applies for the term of the Copyright and
359
+ Similar Rights licensed here. However, if You fail to comply with
360
+ this Public License, then Your rights under this Public License
361
+ terminate automatically.
362
+
363
+ b. Where Your right to use the Licensed Material has terminated under
364
+ Section 6(a), it reinstates:
365
+
366
+ 1. automatically as of the date the violation is cured, provided
367
+ it is cured within 30 days of Your discovery of the
368
+ violation; or
369
+
370
+ 2. upon express reinstatement by the Licensor.
371
+
372
+ For the avoidance of doubt, this Section 6(b) does not affect any
373
+ right the Licensor may have to seek remedies for Your violations
374
+ of this Public License.
375
+
376
+ c. For the avoidance of doubt, the Licensor may also offer the
377
+ Licensed Material under separate terms or conditions or stop
378
+ distributing the Licensed Material at any time; however, doing so
379
+ will not terminate this Public License.
380
+
381
+ d. Sections 1, 5, 6, 7, and 8 survive termination of this Public
382
+ License.
383
+
384
+
385
+ Section 7 -- Other Terms and Conditions.
386
+
387
+ a. The Licensor shall not be bound by any additional or different
388
+ terms or conditions communicated by You unless expressly agreed.
389
+
390
+ b. Any arrangements, understandings, or agreements regarding the
391
+ Licensed Material not stated herein are separate from and
392
+ independent of the terms and conditions of this Public License.
393
+
394
+
395
+ Section 8 -- Interpretation.
396
+
397
+ a. For the avoidance of doubt, this Public License does not, and
398
+ shall not be interpreted to, reduce, limit, restrict, or impose
399
+ conditions on any use of the Licensed Material that could lawfully
400
+ be made without permission under this Public License.
401
+
402
+ b. To the extent possible, if any provision of this Public License is
403
+ deemed unenforceable, it shall be automatically reformed to the
404
+ minimum extent necessary to make it enforceable. If the provision
405
+ cannot be reformed, it shall be severed from this Public License
406
+ without affecting the enforceability of the remaining terms and
407
+ conditions.
408
+
409
+ c. No term or condition of this Public License will be waived and no
410
+ failure to comply consented to unless expressly agreed to by the
411
+ Licensor.
412
+
413
+ d. Nothing in this Public License constitutes or may be interpreted
414
+ as a limitation upon, or waiver of, any privileges and immunities
415
+ that apply to the Licensor or You, including from the legal
416
+ processes of any jurisdiction or authority.
417
+
418
+ =======================================================================
419
+
420
+ Creative Commons is not a party to its public
421
+ licenses. Notwithstanding, Creative Commons may elect to apply one of
422
+ its public licenses to material it publishes and in those instances
423
+ will be considered the “Licensor.” The text of the Creative Commons
424
+ public licenses is dedicated to the public domain under the CC0 Public
425
+ Domain Dedication. Except for the limited purpose of indicating that
426
+ material is shared under a Creative Commons public license or as
427
+ otherwise permitted by the Creative Commons policies published at
428
+ creativecommons.org/policies, Creative Commons does not authorize the
429
+ use of the trademark "Creative Commons" or any other trademark or logo
430
+ of Creative Commons without its prior written consent including,
431
+ without limitation, in connection with any unauthorized modifications
432
+ to any of its public licenses or any other arrangements,
433
+ understandings, or agreements concerning use of licensed material. For
434
+ the avoidance of doubt, this paragraph does not form part of the
435
+ public licenses.
436
+
437
+ Creative Commons may be contacted at creativecommons.org.
emotional/wav2vec2-large-robust-12-ft-emotion-msp-dim/README.md ADDED
@@ -0,0 +1,127 @@
1
+ ---
2
+ language: en
3
+ datasets:
4
+ - msp-podcast
5
+ inference: true
6
+ tags:
7
+ - speech
8
+ - audio
9
+ - wav2vec2
10
+ - audio-classification
11
+ - emotion-recognition
12
+ license: cc-by-nc-sa-4.0
13
+ pipeline_tag: audio-classification
14
+ ---
15
+
16
+ # Model for Dimensional Speech Emotion Recognition based on Wav2vec 2.0
17
+
18
+ The model expects a raw audio signal as input and outputs predictions for arousal, dominance and valence in a range of approximately 0...1. In addition, it also provides the pooled states of the last transformer layer. The model was created by fine-tuning [
19
+ Wav2Vec2-Large-Robust](https://huggingface.co/facebook/wav2vec2-large-robust) on [MSP-Podcast](https://ecs.utdallas.edu/research/researchlabs/msp-lab/MSP-Podcast.html) (v1.7). The model was pruned from 24 to 12 transformer layers before fine-tuning. An [ONNX](https://onnx.ai/) export of the model is available from [doi:10.5281/zenodo.6221127](https://zenodo.org/record/6221127). Further details are given in the associated [paper](https://arxiv.org/abs/2203.07378) and [tutorial](https://github.com/audeering/w2v2-how-to).
20
+
21
+ # Usage
22
+
23
+ ```python
24
+ import numpy as np
25
+ import torch
26
+ import torch.nn as nn
27
+ from transformers import Wav2Vec2Processor
28
+ from transformers.models.wav2vec2.modeling_wav2vec2 import (
29
+ Wav2Vec2Model,
30
+ Wav2Vec2PreTrainedModel,
31
+ )
32
+
33
+
34
+ class RegressionHead(nn.Module):
35
+ r"""Classification head."""
36
+
37
+ def __init__(self, config):
38
+
39
+ super().__init__()
40
+
41
+ self.dense = nn.Linear(config.hidden_size, config.hidden_size)
42
+ self.dropout = nn.Dropout(config.final_dropout)
43
+ self.out_proj = nn.Linear(config.hidden_size, config.num_labels)
44
+
45
+ def forward(self, features, **kwargs):
46
+
47
+ x = features
48
+ x = self.dropout(x)
49
+ x = self.dense(x)
50
+ x = torch.tanh(x)
51
+ x = self.dropout(x)
52
+ x = self.out_proj(x)
53
+
54
+ return x
55
+
56
+
57
+ class EmotionModel(Wav2Vec2PreTrainedModel):
58
+ r"""Speech emotion classifier."""
59
+
60
+ def __init__(self, config):
61
+
62
+ super().__init__(config)
63
+
64
+ self.config = config
65
+ self.wav2vec2 = Wav2Vec2Model(config)
66
+ self.classifier = RegressionHead(config)
67
+ self.init_weights()
68
+
69
+ def forward(
70
+ self,
71
+ input_values,
72
+ ):
73
+
74
+ outputs = self.wav2vec2(input_values)
75
+ hidden_states = outputs[0]
76
+ hidden_states = torch.mean(hidden_states, dim=1)
77
+ logits = self.classifier(hidden_states)
78
+
79
+ return hidden_states, logits
80
+
81
+
82
+
83
+ # load model from hub
84
+ device = 'cpu'
85
+ model_name = 'audeering/wav2vec2-large-robust-12-ft-emotion-msp-dim'
86
+ processor = Wav2Vec2Processor.from_pretrained(model_name)
87
+ model = EmotionModel.from_pretrained(model_name)
88
+
89
+ # dummy signal
90
+ sampling_rate = 16000
91
+ signal = np.zeros((1, sampling_rate), dtype=np.float32)
92
+
93
+
94
+ def process_func(
95
+ x: np.ndarray,
96
+ sampling_rate: int,
97
+ embeddings: bool = False,
98
+ ) -> np.ndarray:
99
+ r"""Predict emotions or extract embeddings from raw audio signal."""
100
+
101
+ # run through processor to normalize signal
102
+ # always returns a batch, so we just get the first entry
103
+ # then we put it on the device
104
+ y = processor(x, sampling_rate=sampling_rate)
105
+ y = y['input_values'][0]
106
+ y = y.reshape(1, -1)
107
+ y = torch.from_numpy(y).to(device)
108
+
109
+ # run through model
110
+ with torch.no_grad():
111
+ y = model(y)[0 if embeddings else 1]
112
+
113
+ # convert to numpy
114
+ y = y.detach().cpu().numpy()
115
+
116
+ return y
117
+
118
+
119
+ print(process_func(signal, sampling_rate))
120
+ # Arousal dominance valence
121
+ # [[0.5460754 0.6062266 0.40431657]]
122
+
123
+ print(process_func(signal, sampling_rate, embeddings=True))
124
+ # Pooled hidden states of last transformer layer
125
+ # [[-0.00752167 0.0065819 -0.00746342 ... 0.00663632 0.00848748
126
+ # 0.00599211]]
127
+ ```