Mahiruoshi committed
Commit 9169788
1 parent: 3d43932

Upload 120 files

This view is limited to 50 files because it contains too many changes. See the raw diff for the full change set.
Files changed (50)
  1. Data/BanGDream/configs/config.json +106 -0
  2. Data/BanGDream/filelists/Mygo.list +0 -0
  3. Data/BanGDream/filelists/val.list +8 -0
  4. Data/BanGDream/models/G_49000.pth +3 -0
  5. README.md +8 -6
  6. app.py +159 -382
  7. attentions_onnx.py +378 -0
  8. bert/bert-base-japanese-v3/README.md +1 -1
  9. bert/bert-base-japanese-v3/vocab.txt +1 -1
  10. bert/bert-large-japanese-v2/.gitattributes +34 -0
  11. bert/bert-large-japanese-v2/README.md +53 -0
  12. bert/bert-large-japanese-v2/config.json +19 -0
  13. bert/bert-large-japanese-v2/tokenizer_config.json +10 -0
  14. bert/bert-large-japanese-v2/vocab.txt +0 -0
  15. bert/bert_models.json +14 -0
  16. bert/chinese-roberta-wwm-ext-large/README.md +5 -5
  17. bert/chinese-roberta-wwm-ext-large/added_tokens.json +1 -1
  18. bert/chinese-roberta-wwm-ext-large/special_tokens_map.json +1 -1
  19. bert/chinese-roberta-wwm-ext-large/tokenizer.json +0 -0
  20. bert/chinese-roberta-wwm-ext-large/tokenizer_config.json +1 -1
  21. bert/deberta-v2-large-japanese-char-wwm/.gitattributes +34 -0
  22. bert/deberta-v2-large-japanese-char-wwm/README.md +89 -0
  23. bert/deberta-v2-large-japanese-char-wwm/config.json +37 -0
  24. bert/deberta-v2-large-japanese-char-wwm/pytorch_model.bin +3 -0
  25. bert/deberta-v2-large-japanese-char-wwm/special_tokens_map.json +7 -0
  26. bert/deberta-v2-large-japanese-char-wwm/tokenizer_config.json +19 -0
  27. bert/deberta-v2-large-japanese-char-wwm/vocab.txt +0 -0
  28. bert/deberta-v2-large-japanese/.gitattributes +34 -0
  29. bert/deberta-v2-large-japanese/README.md +111 -0
  30. bert/deberta-v2-large-japanese/config.json +38 -0
  31. bert/deberta-v2-large-japanese/special_tokens_map.json +9 -0
  32. bert/deberta-v2-large-japanese/tokenizer.json +0 -0
  33. bert/deberta-v2-large-japanese/tokenizer_config.json +15 -0
  34. bert/deberta-v3-large/.gitattributes +27 -0
  35. bert/deberta-v3-large/README.md +93 -0
  36. bert/deberta-v3-large/config.json +22 -0
  37. bert/deberta-v3-large/generator_config.json +22 -0
  38. bert/deberta-v3-large/pytorch_model.bin +3 -0
  39. bert/deberta-v3-large/spm.model +3 -0
  40. bert/deberta-v3-large/tokenizer_config.json +4 -0
  41. bert_gen.py +32 -20
  42. commons.py +7 -1
  43. config.yml +174 -0
  44. configs/config.json +863 -80
  45. data_utils.py +38 -34
  46. default_config.yml +174 -0
  47. emo_gen.py +174 -0
  48. emotional/wav2vec2-large-robust-12-ft-emotion-msp-dim/.gitattributes +28 -0
  49. emotional/wav2vec2-large-robust-12-ft-emotion-msp-dim/LICENSE +437 -0
  50. emotional/wav2vec2-large-robust-12-ft-emotion-msp-dim/README.md +127 -0
Data/BanGDream/configs/config.json ADDED
@@ -0,0 +1,106 @@
+ {
+   "train": {
+     "log_interval": 200,
+     "eval_interval": 1000,
+     "seed": 42,
+     "epochs": 1000,
+     "learning_rate": 0.0002,
+     "betas": [
+       0.8,
+       0.99
+     ],
+     "eps": 1e-09,
+     "batch_size": 2,
+     "fp16_run": false,
+     "lr_decay": 0.99995,
+     "segment_size": 16384,
+     "init_lr_ratio": 1,
+     "warmup_epochs": 0,
+     "c_mel": 45,
+     "c_kl": 1.0,
+     "skip_optimizer": true
+   },
+   "data": {
+     "training_files": "Data/BanGDream/filelists/train.list",
+     "validation_files": "Data/BanGDream/filelists/val.list",
+     "max_wav_value": 32768.0,
+     "sampling_rate": 44100,
+     "filter_length": 2048,
+     "hop_length": 512,
+     "win_length": 2048,
+     "n_mel_channels": 128,
+     "mel_fmin": 0.0,
+     "mel_fmax": null,
+     "add_blank": true,
+     "n_speakers": 896,
+     "cleaned_text": true,
+     "spk2id": {
+       "燈": 0,
+       "愛音": 1,
+       "楽奈": 2,
+       "そよ": 3,
+       "立希": 4,
+       "祥子": 5,
+       "睦": 6,
+       "海鈴": 7,
+       "にゃむ": 8,
+       "初華": 9,
+       "三月七": 10
+     }
+   },
+   "model": {
+     "use_spk_conditioned_encoder": true,
+     "use_noise_scaled_mas": true,
+     "use_mel_posterior_encoder": false,
+     "use_duration_discriminator": true,
+     "inter_channels": 192,
+     "hidden_channels": 192,
+     "filter_channels": 768,
+     "n_heads": 2,
+     "n_layers": 6,
+     "kernel_size": 3,
+     "p_dropout": 0.1,
+     "resblock": "1",
+     "resblock_kernel_sizes": [
+       3,
+       7,
+       11
+     ],
+     "resblock_dilation_sizes": [
+       [
+         1,
+         3,
+         5
+       ],
+       [
+         1,
+         3,
+         5
+       ],
+       [
+         1,
+         3,
+         5
+       ]
+     ],
+     "upsample_rates": [
+       8,
+       8,
+       2,
+       2,
+       2
+     ],
+     "upsample_initial_channel": 512,
+     "upsample_kernel_sizes": [
+       16,
+       16,
+       8,
+       2,
+       2
+     ],
+     "n_layers_q": 3,
+     "use_spectral_norm": false,
+     "gin_channels": 256
+   },
+   "version": "2.1"
+ }
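A minimal sketch (not part of the commit) of how this config can be read. In this same commit, app.py loads the file via utils.get_hparams_from_file and maps speaker names to ids through data.spk2id; the plain-JSON version below assumes only the standard library.

```python
import json

# Load the hyperparameter file added in this commit.
with open("Data/BanGDream/configs/config.json", encoding="utf-8") as f:
    config = json.load(f)

# The speaker table maps display names to the integer ids the model expects.
spk2id = config["data"]["spk2id"]
print(config["data"]["sampling_rate"])  # 44100
print(spk2id["燈"])                     # 0
```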
Data/BanGDream/filelists/Mygo.list ADDED
The diff for this file is too large to render. See raw diff
 
Data/BanGDream/filelists/val.list ADDED
@@ -0,0 +1,8 @@
+ D:/Vits2/Dataset/area5524-005.wav|燈|JP|え......立希ちゃん,そこまでは......|_ e . . . . . . t a t e n o z o m i ch a n , s o k o m a d e w a . . . . . . _|0 0 0 0 0 0 0 0 0 0 1 1 1 1 0 0 0 0 0 0 0 0 0 0 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0|1 1 1 1 1 1 1 1 4 6 1 1 1 1 2 2 2 2 2 1 1 1 1 1 1 1
+ D:/Vits2/Dataset/area5579-002.wav|燈|JP|え......!?おもしろい......?|_ e . . . . . . ! ? o m o sh i r o i . . . . . . ? _|0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0|1 1 1 1 1 1 1 1 1 1 2 2 2 1 1 1 1 1 1 1 1 1 1
+ D:/Vits2/Dataset/event235-03-068.wav|燈|JP|ご,ごめん......|_ g o , g o m e n . . . . . . _|0 0 0 0 0 0 1 1 1 0 0 0 0 0 0 0|1 2 1 2 2 1 1 1 1 1 1 1 1
+ D:/Vits2/Dataset/event235-19-061.wav|燈|JP|......愛音ちゃんは進んでる.行き止まりになっても,ちゃんと道を探して......進もうとしてる......|_ . . . . . . a i o n ch a n w a s u s u n d e r u . i k i d o m a r i n i n a q t e m o , ch a n t o m i ch i o s a g a sh i t e . . . . . . s u s u m o o t o sh i t e r u . . . . . . _|0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 1 1 1 1 0 0 0 0 1 1 1 1 1 1 1 1 0 0 1 1 0 0 0 0 0 0 0 0 1 1 1 0 0 1 1 0 0 0 1 1 1 1 0 0 0 0 0 0 0 0 0 0 1 1 1 1 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0|1 1 1 1 1 1 1 2 2 1 1 1 2 3 2 2 2 1 2 2 2 2 1 2 2 1 2 2 1 2 1 1 1 4 1 3 3 2 1 1 1 1 1 1 3 3 1 2 2 2 2 1 1 1 1 1 1 1
+ D:/Vits2/Dataset/event235-37-046.wav|愛音|JP|どれどれ-......やばっ,何これ?ちゃんとキメてるの私だけじゃん!|_ d o r e d o r e - . . . . . . y a b a q , n a n i k o r e ? ch a n t o k i m e t e r u n o w a t a sh i d a k e j a n ! _|0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 1 1 0 0 0 0 0 0 0 1 1 1 0 0 1 1 1 1 0 0 0 0 0 0 1 1 1 1 0 0 0 0 0 0 0 0 0|1 2 2 2 2 1 1 1 1 1 1 1 2 2 1 1 4 2 2 1 2 1 1 1 2 2 2 2 2 6 2 2 1 1 1 1 1
+ D:/Vits2/Dataset/event240-05-052.wav|愛音|JP|私たちも,りっきーと楽奈ちゃん探そっか!今度は五人で考えよ!|_ w a t a sh i t a ch i m o , r i q k i i t o r a k u n a ch a n s a g a s e s o q k a ! k o n d o w a g o n i n d e k a n g a e y o ! _|0 0 0 1 1 1 1 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 1 1 1 1 1 1 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0|1 6 2 2 2 1 2 2 1 1 2 4 2 1 1 1 6 2 2 1 1 3 2 2 2 3 2 3 3 2 1 1
+ D:/Vits2/Dataset/event235-06-032.wav|愛音|JP|へー,......そういうのもあるんだ.そよさん,バンドのことに詳しいんですね!|_ e e , . . . . . . s o o y u u n o m o a r u n d a . s o y o s a n , b a n d o n o k o t o n i k u w a sh i i n d e s u n e ! _|0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0|1 1 1 1 1 1 1 1 1 1 2 2 1 1 2 2 2 1 1 2 1 2 2 2 1 1 2 2 1 2 2 2 2 3 2 2 1 2 2 2 1 1
+ D:/Vits2/Dataset/event235-03-020.wav|愛音|JP|おーい!えっと,確か......高松さんきゃあ!|_ o o i ! e q t o , t a sh i k a . . . . . . t a k a m a ts u s a n ky a a ! _|0 1 0 0 0 0 1 1 1 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 0 0 0 0 0 0 0 0|1 1 1 1 1 2 1 1 1 3 3 1 1 1 1 1 1 4 4 2 1 1 1 1 1 1
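Each val.list entry is a single pipe-separated record; judging from the lines above, the fields appear to be the wav path, speaker name, language tag, raw text, phoneme sequence, per-phoneme tone flags, and word-to-phoneme counts. A hedged parsing sketch (the field names are illustrative labels, not taken from the repository):

```python
def parse_filelist_line(line: str) -> dict:
    # Field order inferred from the val.list entries above; names are my own.
    path, speaker, language, text, phones, tones, word2ph = line.rstrip("\n").split("|")
    return {
        "path": path,
        "speaker": speaker,
        "language": language,
        "text": text,
        "phones": phones.split(" "),
        "tones": [int(t) for t in tones.split(" ")],
        "word2ph": [int(n) for n in word2ph.split(" ")],
    }


with open("Data/BanGDream/filelists/val.list", encoding="utf-8") as f:
    rows = [parse_filelist_line(line) for line in f if line.strip()]
```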
Data/BanGDream/models/G_49000.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a7d9f2b0baff45ed4d88bbc6162bfaa4c960f5965b0b42085463311a672b350a
+ size 721511718
README.md CHANGED
@@ -1,11 +1,13 @@
  ---
- title: BangDream-Vits-bert
- emoji:
- colorFrom: yellow
  colorTo: green
  sdk: gradio
- sdk_version: 3.15.0
  app_file: app.py
  pinned: false
- license: other
- ---

  ---
+ title: Bushiroad BertVIts2 Emotional
+ emoji: 📚
+ colorFrom: purple
  colorTo: green
  sdk: gradio
+ sdk_version: 4.7.1
  app_file: app.py
  pinned: false
+ license: openrail
+ ---
+
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py CHANGED
@@ -1,5 +1,10 @@
1
- # flake8: noqa: E402
 
 
 
2
  import logging
 
 
3
  logging.getLogger("numba").setLevel(logging.WARNING)
4
  logging.getLogger("markdown_it").setLevel(logging.WARNING)
5
  logging.getLogger("urllib3").setLevel(logging.WARNING)
@@ -10,82 +15,71 @@ logging.basicConfig(
10
  )
11
 
12
  logger = logging.getLogger(__name__)
13
- import datetime
 
14
  import numpy as np
15
  import torch
16
- from ebooklib import epub
17
- import PyPDF2
18
- from PyPDF2 import PdfReader
19
- import zipfile
20
- import shutil
21
- import sys, os
22
- import json
23
- from bs4 import BeautifulSoup
24
- import argparse
 
 
 
 
 
 
 
25
  import commons
 
 
 
26
  import utils
 
27
  from models import SynthesizerTrn
28
  from text.symbols import symbols
29
- from text import cleaned_text_to_sequence, get_bert
30
- from text.cleaner import clean_text
31
- import gradio as gr
32
- import webbrowser
33
- import re
34
- from scipy.io.wavfile import write
35
- from datetime import datetime
36
  net_g = None
37
- BandList = {
38
-
39
- "PoppinParty":["香澄","有咲","たえ","りみ","沙綾"],
40
- "Afterglow":["蘭","モカ","ひまり","巴","つぐみ"],
41
- "HelloHappyWorld":["こころ","美咲","薫","花音","はぐみ"],
42
- "PastelPalettes":["彩","日菜","千聖","イヴ","麻弥"],
43
- "Roselia":["友希那","紗夜","リサ","燐子","あこ"],
44
- "RaiseASuilen":["レイヤ","ロック","ますき","チュチュ","パレオ"],
45
- "Morfonica":["ましろ","瑠唯","つくし","七深","透子"],
46
- "MyGo&AveMujica(Part)":["燈","愛音","そよ","立希","楽奈","祥子","睦","海鈴"],
47
- }
48
 
49
- if sys.platform == "darwin" and torch.backends.mps.is_available():
50
- device = "mps"
51
- os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"
52
- else:
53
- device = "cuda"
54
 
55
- def is_japanese(string):
56
- for ch in string:
57
- if ord(ch) > 0x3040 and ord(ch) < 0x30FF:
58
- return True
59
- return False
 
 
 
 
60
 
61
- def extrac(text):
62
- text = re.sub("<[^>]*>","",text)
63
- result_list = re.split(r'\n', text)
64
- final_list = []
65
- for i in result_list:
66
- i = i.replace('\n','').replace(' ','')
67
- #Current length of single sentence: 20
68
- if len(i)>1:
69
- if len(i) > 20:
70
- try:
71
- cur_list = re.split(r'。|!', i)
72
- for i in cur_list:
73
- if len(i)>1:
74
- final_list.append(i+'。')
75
- except:
76
- pass
77
- else:
78
- final_list.append(i)
79
- '''
80
- final_list.append(i)
81
- '''
82
- final_list = [x for x in final_list if x != '']
83
- return final_list
84
 
85
- def get_text(text, language_str, hps):
 
 
 
 
 
 
 
 
 
 
 
 
86
  norm_text, phone, tone, word2ph = clean_text(text, language_str)
87
  phone, tone, language = cleaned_text_to_sequence(phone, tone, language_str)
88
-
89
  if hps.data.add_blank:
90
  phone = commons.intersperse(phone, 0)
91
  tone = commons.intersperse(tone, 0)
@@ -93,19 +87,24 @@ def get_text(text, language_str, hps):
93
  for i in range(len(word2ph)):
94
  word2ph[i] = word2ph[i] * 2
95
  word2ph[0] += 1
96
- bert = get_bert(norm_text, word2ph, language_str, device)
97
  del word2ph
98
- assert bert.shape[-1] == len(phone), phone
99
 
100
  if language_str == "ZH":
101
- bert = bert
102
- ja_bert = torch.zeros(768, len(phone))
103
- elif language_str == "JA":
104
- ja_bert = bert
105
  bert = torch.zeros(1024, len(phone))
106
- else:
 
 
107
  bert = torch.zeros(1024, len(phone))
108
- ja_bert = torch.zeros(768, len(phone))
 
 
 
109
 
110
  assert bert.shape[-1] == len(
111
  phone
@@ -114,19 +113,53 @@ def get_text(text, language_str, hps):
114
  phone = torch.LongTensor(phone)
115
  tone = torch.LongTensor(tone)
116
  language = torch.LongTensor(language)
117
- return bert, ja_bert, phone, tone, language
118
 
119
 
120
- def infer(text, sdp_ratio, noise_scale, noise_scale_w, length_scale, sid, language):
121
- global net_g
122
- bert, ja_bert, phones, tones, lang_ids = get_text(text, language, hps)
 
 
123
  with torch.no_grad():
124
  x_tst = phones.to(device).unsqueeze(0)
125
  tones = tones.to(device).unsqueeze(0)
126
  lang_ids = lang_ids.to(device).unsqueeze(0)
127
  bert = bert.to(device).unsqueeze(0)
128
  ja_bert = ja_bert.to(device).unsqueeze(0)
 
129
  x_tst_lengths = torch.LongTensor([phones.size(0)]).to(device)
 
130
  del phones
131
  speakers = torch.LongTensor([hps.data.spk2id[sid]]).to(device)
132
  audio = (
@@ -138,6 +171,8 @@ def infer(text, sdp_ratio, noise_scale, noise_scale_w, length_scale, sid, langua
138
  lang_ids,
139
  bert,
140
  ja_bert,
 
 
141
  sdp_ratio=sdp_ratio,
142
  noise_scale=noise_scale,
143
  noise_scale_w=noise_scale_w,
@@ -147,256 +182,39 @@ def infer(text, sdp_ratio, noise_scale, noise_scale_w, length_scale, sid, langua
147
  .float()
148
  .numpy()
149
  )
150
- current_time = datetime.now()
151
- print(str(current_time)+':'+str(sid))
152
- del x_tst, tones, lang_ids, bert, x_tst_lengths, speakers
153
- return audio
154
-
155
-
156
- def tts_fn(
157
- text, speaker, sdp_ratio, noise_scale, noise_scale_w, length_scale,LongSentence
158
- ):
159
- if not LongSentence:
160
- with torch.no_grad():
161
- audio = infer(
162
- text,
163
- sdp_ratio=sdp_ratio,
164
- noise_scale=noise_scale,
165
- noise_scale_w=noise_scale_w,
166
- length_scale=length_scale,
167
- sid=speaker,
168
- language= "JP" if is_japanese(text) else "ZH",
169
- )
170
  torch.cuda.empty_cache()
171
- return (hps.data.sampling_rate, audio)
172
- else:
173
- audiopath = 'voice.wav'
174
- a = ['【','[','(','(']
175
- b = ['】',']',')',')']
176
- for i in a:
177
- text = text.replace(i,'<')
178
- for i in b:
179
- text = text.replace(i,'>')
180
- final_list = extrac(text.replace('“','').replace('”',''))
181
- audio_fin = []
182
- for sentence in final_list:
183
- with torch.no_grad():
184
- audio = infer(
185
- sentence,
186
- sdp_ratio=sdp_ratio,
187
- noise_scale=noise_scale,
188
- noise_scale_w=noise_scale_w,
189
- length_scale=length_scale,
190
- sid=speaker,
191
- language= "JP" if is_japanese(text) else "ZH",
192
- )
193
- audio_fin.append(audio)
194
- return (hps.data.sampling_rate, np.concatenate(audio_fin))
195
-
196
- def split_into_sentences(text):
197
- """将文本分割为句子,基于中文的标点符号"""
198
- sentences = re.split(r'(?<=[。!?…\n])', text)
199
- return [sentence.strip() for sentence in sentences if sentence]
200
-
201
-
202
- def seconds_to_ass_time(seconds):
203
- """将秒数转换为ASS时间格式"""
204
- hours = int(seconds / 3600)
205
- minutes = int((seconds % 3600) / 60)
206
- seconds = int(seconds) % 60
207
- milliseconds = int((seconds - int(seconds)) * 1000)
208
- return "{:01d}:{:02d}:{:02d}.{:02d}".format(hours, minutes, seconds, int(milliseconds / 10))
209
-
210
- def generate_audio_and_srt_for_group(group, outputPath, group_index, sampling_rate, speaker, sdp_ratio, noise_scale, noise_scale_w, length_scale,spealerList,silenceTime):
211
- audio_fin = []
212
- ass_entries = []
213
- start_time = 0
214
-
215
- ass_header = """[Script Info]
216
- ; Script generated by OpenAI Assistant
217
- Title: Audiobook
218
- ScriptType: v4.00+
219
- WrapStyle: 0
220
- PlayResX: 640
221
- PlayResY: 360
222
- ScaledBorderAndShadow: yes
223
- [V4+ Styles]
224
- Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding
225
- Style: Default,Arial,20,&H00FFFFFF,&H000000FF,&H00000000,&H00000000,0,0,0,0,100,100,0,0,1,1,1,2,10,10,10,1
226
- [Events]
227
- Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
228
- """
229
-
230
- for sentence in group:
231
- try:
232
- print(sentence)
233
- FakeSpeaker = sentence.split("|")[0]
234
- print(FakeSpeaker)
235
- SpeakersList = re.split('\n', spealerList)
236
- if FakeSpeaker in list(hps.data.spk2id.keys()):
237
- speaker = FakeSpeaker
238
- for i in SpeakersList:
239
- if FakeSpeaker == i.split("|")[1]:
240
- speaker = i.split("|")[0]
241
- speaker_ids = hps.data.spk2id
242
-
243
- _, audio = tts_fn(sentence.split("|")[-1], speaker=speaker, sdp_ratio=sdp_ratio, noise_scale=noise_scale, noise_scale_w=noise_scale_w, length_scale=length_scale, LongSentence=True)
244
- silence_frames = int(silenceTime * 44010)
245
- silence_data = np.zeros((silence_frames,), dtype=audio.dtype)
246
- audio_fin.append(audio)
247
- audio_fin.append(silence_data)
248
-
249
- duration = len(audio) / sampling_rate
250
- end_time = start_time + duration + silenceTime
251
- ass_entries.append("Dialogue: 0,{},{},".format(seconds_to_ass_time(start_time), seconds_to_ass_time(end_time)) + "Default,,0,0,0,,{}".format(sentence.replace("|",":")))
252
- start_time = end_time
253
- except:
254
- pass
255
- wav_filename = os.path.join(outputPath, f'audiobook_part_{group_index}.wav')
256
- ass_filename = os.path.join(outputPath, f'audiobook_part_{group_index}.ass')
257
-
258
- write(wav_filename, sampling_rate, np.concatenate(audio_fin))
259
-
260
- with open(ass_filename, 'w', encoding='utf-8') as f:
261
- f.write(ass_header + '\n'.join(ass_entries))
262
- return (hps.data.sampling_rate, np.concatenate(audio_fin))
263
- def extract_text_from_epub(file_path):
264
- book = epub.read_epub(file_path)
265
- content = []
266
- for item in book.items:
267
- if isinstance(item, epub.EpubHtml):
268
- soup = BeautifulSoup(item.content, 'html.parser')
269
- content.append(soup.get_text())
270
- return '\n'.join(content)
271
-
272
- def extract_text_from_pdf(file_path):
273
- with open(file_path, 'rb') as file:
274
- reader = PdfReader(file)
275
- content = [page.extract_text() for page in reader.pages]
276
- return '\n'.join(content)
277
 
278
- def extract_text_from_game2(data):
279
- current_content = []
280
-
281
- def _extract(data, current_data=None):
282
- nonlocal current_content
283
-
284
- if current_data is None:
285
- current_data = {}
286
-
287
- if isinstance(data, dict):
288
- if 'name' in data and 'body' in data:
289
- current_name = data['name']
290
- current_body = data['body'].replace('\n', '')
291
- current_content.append(f"{current_name}|{current_body}")
292
-
293
- for key, value in data.items():
294
- _extract(value, dict(current_data))
295
-
296
- elif isinstance(data, list):
297
- for item in data:
298
- _extract(item, dict(current_data))
299
-
300
- _extract(data)
301
- return '\n'.join(current_content)
302
-
303
- def extract_text_from_file(inputFile):
304
- file_extension = os.path.splitext(inputFile)[1].lower()
305
-
306
- if file_extension == ".epub":
307
- return extract_text_from_epub(inputFile)
308
- elif file_extension == ".pdf":
309
- return extract_text_from_pdf(inputFile)
310
- elif file_extension == ".txt":
311
- with open(inputFile, 'r', encoding='utf-8') as f:
312
- return f.read()
313
- elif file_extension == ".asset":
314
- with open(inputFile, 'r', encoding='utf-8') as f:
315
- content = json.load(f)
316
- return extract_text_from_game2(content) if extract_text_from_game2(content) != '' else extract_text_from_game2(content)
317
- else:
318
- raise ValueError(f"Unsupported file format: {file_extension}")
319
-
320
- def audiobook(inputFile, groupsize, speaker, sdp_ratio, noise_scale, noise_scale_w, length_scale,spealerList,silenceTime):
321
- directory_path = "books"
322
- output_path = "books/audiobook_part_1.wav"
323
-
324
- if os.path.exists(directory_path):
325
- shutil.rmtree(directory_path)
326
-
327
- os.makedirs(directory_path)
328
- text = extract_text_from_file(inputFile.name)
329
- sentences = split_into_sentences(text)
330
- GROUP_SIZE = groupsize
331
- for i in range(0, len(sentences), GROUP_SIZE):
332
- group = sentences[i:i+GROUP_SIZE]
333
- if spealerList == "":
334
- spealerList = "无"
335
- result = generate_audio_and_srt_for_group(group,directory_path, i//GROUP_SIZE + 1, 44100, speaker, sdp_ratio, noise_scale, noise_scale_w, length_scale,spealerList,silenceTime)
336
- if not torch.cuda.is_available():
337
- return result
338
- return result
339
 
340
  def loadmodel(model):
341
  _ = net_g.eval()
342
  _ = utils.load_checkpoint(model, net_g, None, skip_optimizer=True)
343
  return "success"
344
 
345
-
346
  if __name__ == "__main__":
347
- parser = argparse.ArgumentParser()
348
- parser.add_argument(
349
- "-m", "--model", default="./logs/BangDream/G_45000.pth", help="path of your model"
350
- )
351
- parser.add_argument(
352
- "-c",
353
- "--config",
354
- default="configs/config.json",
355
- help="path of your config file",
356
- )
357
- parser.add_argument(
358
- "--share", default=True, help="make link public", action="store_true"
359
  )
360
- parser.add_argument(
361
- "-d", "--debug", action="store_true", help="enable DEBUG-LEVEL log"
362
- )
363
-
364
- args = parser.parse_args()
365
- if args.debug:
366
- logger.info("Enable DEBUG-LEVEL log")
367
- logging.basicConfig(level=logging.DEBUG)
368
- device = (
369
- "cuda:0"
370
- if torch.cuda.is_available()
371
- else (
372
- "mps"
373
- if sys.platform == "darwin" and torch.backends.mps.is_available()
374
- else "cpu"
375
- )
376
- )
377
- hps = utils.get_hparams_from_file(args.config)
378
- net_g = SynthesizerTrn(
379
- len(symbols),
380
- hps.data.filter_length // 2 + 1,
381
- hps.train.segment_size // hps.data.hop_length,
382
- n_speakers=hps.data.n_speakers,
383
- **hps.model,
384
- ).to(device)
385
- loadmodel(args.model)
386
  speaker_ids = hps.data.spk2id
387
  speakers = list(speaker_ids.keys())
388
- languages = ["ZH", "JP"]
389
- examples = [
390
- ["filelist/Scenarioband6-018.asset", 500, "つくし", "ましろ|真白\n七深|七深\n透子|透子\nつくし|筑紫\n瑠唯|瑠唯\nそよ|素世\n祥子|祥子", "扩展功能"],
391
- ]
392
  modelPaths = []
393
- for dirpath, dirnames, filenames in os.walk("./logs/BangDream/"):
394
  for filename in filenames:
395
  modelPaths.append(os.path.join(dirpath, filename))
396
  with gr.Blocks() as app:
397
- gr.Markdown(
398
- f"少歌邦邦全员TTS,使用本模型请严格遵守法律法规!\n 发布二创作品请注明项目和本模型作者<a href='https://space.bilibili.com/19874615/'>B站@Mahiroshi</a>及项目链接\n从 <a href='https://nijigaku.top/2023/10/03/BangDreamTTS/'>我的博客站点</a> 查看使用说明</a>"
399
- )
400
  for band in BandList:
401
  with gr.TabItem(band):
402
  for name in BandList[band]:
@@ -412,20 +230,10 @@ if __name__ == "__main__":
412
  length_scale = gr.Slider(
413
  minimum=0.1, maximum=2, value=1, step=0.01, label="语速调节"
414
  )
415
- with gr.Accordion(label="切换模型(合成中文建议切换为早期模型)", open=False):
416
- modelstrs = gr.Dropdown(label = "模型", choices = modelPaths, value = modelPaths[0], type = "value")
417
- btnMod = gr.Button("载入模型")
418
- statusa = gr.TextArea()
419
- btnMod.click(loadmodel, inputs=[modelstrs], outputs = [statusa])
420
- with gr.Column():
421
- text = gr.TextArea(
422
- label="输入纯日语或者中文",
423
- placeholder="输入纯日语或者中文",
424
- value="有个人躺在地上,哀嚎......\n有个人睡着了,睡在盒子里。\n我要把它打开,看看他的梦是什么。",
425
- )
426
- btn = gr.Button("点击生成", variant="primary")
427
- audio_output = gr.Audio(label="Output Audio")
428
- with gr.Accordion(label="其它参数设定", open=False):
429
  sdp_ratio = gr.Slider(
430
  minimum=0, maximum=1, value=0.2, step=0.01, label="SDP/DP混合比"
431
  )
@@ -435,73 +243,42 @@ if __name__ == "__main__":
435
  noise_scale_w = gr.Slider(
436
  minimum=0.1, maximum=2, value=0.8, step=0.01, label="音素长度"
437
  )
438
- LongSentence = gr.Checkbox(value=True, label="Generate LongSentence")
439
  speaker = gr.Dropdown(
440
  choices=speakers, value=name, label="说话人"
441
- )
 
  btn.click(
443
- tts_fn,
444
  inputs=[
445
  text,
446
- speaker,
447
  sdp_ratio,
448
  noise_scale,
449
  noise_scale_w,
450
  length_scale,
451
- LongSentence,
452
- ],
453
- outputs=[audio_output],
454
- )
455
- for i in examples:
456
- with gr.Tab(i[-1]):
457
- with gr.Row():
458
- with gr.Column():
459
- gr.Markdown(
460
- f"从 <a href='https://nijigaku.top/2023/10/03/BangDreamTTS/'>我的博客站点</a> 查看自制galgame使用说明\n</a>"
461
- )
462
- inputFile = gr.inputs.File(label="上传txt(可设置角色对应表)、epub或mobi文件")
463
- groupSize = gr.Slider(
464
- minimum=10, maximum=1000,value = i[1], step=1, label="当个音频文件包含的最大字数"
465
- )
466
- silenceTime = gr.Slider(
467
- minimum=0, maximum=1, value=0.5, step=0.1, label="句子的间隔"
468
- )
469
- spealerList = gr.TextArea(
470
- label="角色对应表",
471
- placeholder="左边是你想要在每一句话合成中用到的speaker(见角色清单)右边是你上传文本时分隔符左边设置的说话人:{ChoseSpeakerFromConfigList1}|{SeakerInUploadText1}\n{ChoseSpeakerFromConfigList2}|{SeakerInUploadText2}\n{ChoseSpeakerFromConfigList3}|{SeakerInUploadText3}\n",
472
- value = i[3],
473
- )
474
- speaker = gr.Dropdown(
475
- choices=speakers, value = i[2], label="选择默认说话人"
476
- )
477
- with gr.Column():
478
- sdp_ratio = gr.Slider(
479
- minimum=0, maximum=1, value=0.2, step=0.01, label="SDP/DP混合比"
480
- )
481
- noise_scale = gr.Slider(
482
- minimum=0.1, maximum=2, value=0.6, step=0.01, label="感情调节"
483
- )
484
- noise_scale_w = gr.Slider(
485
- minimum=0.1, maximum=2, value=0.8, step=0.01, label="音素长度"
486
- )
487
- length_scale = gr.Slider(
488
- minimum=0.1, maximum=2, value=1, step=0.01, label="生成长度"
489
- )
490
- LastAudioOutput = gr.Audio(label="当用cuda在本地运行时才能在book文件夹下浏览全部合成内容")
491
- btn2 = gr.Button("点击生成", variant="primary")
492
- btn2.click(
493
- audiobook,
494
- inputs=[
495
- inputFile,
496
- groupSize,
497
  speaker,
498
- sdp_ratio,
499
- noise_scale,
500
- noise_scale_w,
501
- length_scale,
502
- spealerList,
503
- silenceTime
504
  ],
505
- outputs=[LastAudioOutput],
506
  )
507
- app.launch()
 
 
 
1
+ import argparse
2
+ import os
3
+ from pathlib import Path
4
+
5
  import logging
6
+ import re_matching
7
+
8
  logging.getLogger("numba").setLevel(logging.WARNING)
9
  logging.getLogger("markdown_it").setLevel(logging.WARNING)
10
  logging.getLogger("urllib3").setLevel(logging.WARNING)
 
15
  )
16
 
17
  logger = logging.getLogger(__name__)
18
+
19
+ import librosa
20
  import numpy as np
21
  import torch
22
+ import torch.nn as nn
23
+ from torch.utils.data import Dataset
24
+ from torch.utils.data import DataLoader, Dataset
25
+ from tqdm import tqdm
26
+ from transformers import Wav2Vec2Processor
27
+ from transformers.models.wav2vec2.modeling_wav2vec2 import (
28
+ Wav2Vec2Model,
29
+ Wav2Vec2PreTrainedModel,
30
+ )
31
+
32
+ import gradio as gr
33
+
34
+ import utils
35
+ from config import config
36
+
37
+ import torch
38
  import commons
39
+ from text import cleaned_text_to_sequence, get_bert
40
+ from emo_gen import process_func, EmotionModel, Wav2Vec2Processor, Wav2Vec2Model, Wav2Vec2PreTrainedModel, RegressionHead
41
+ from text.cleaner import clean_text
42
  import utils
43
+
44
  from models import SynthesizerTrn
45
  from text.symbols import symbols
46
+ import sys
47
+
 
 
 
 
 
48
  net_g = None
 
 
 
 
 
 
 
 
 
 
 
49
 
50
+ device = 'cpu'
 
 
 
 
51
 
52
+ device = (
53
+ "cuda:0"
54
+ if torch.cuda.is_available()
55
+ else (
56
+ "mps"
57
+ if sys.platform == "darwin" and torch.backends.mps.is_available()
58
+ else "cpu"
59
+ )
60
+ )
61
 
62
+ BandList = {
63
+ "MyGo&AveMujica(Part)":["燈","愛音","そよ","立希","楽奈"],
64
+ "AveMujica":["祥子","睦","海鈴","にゃむ","初華"]
65
+ }
 
66
 
67
+ def get_net_g(model_path: str, version: str, device: str, hps):
68
+ net_g = SynthesizerTrn(
69
+ len(symbols),
70
+ hps.data.filter_length // 2 + 1,
71
+ hps.train.segment_size // hps.data.hop_length,
72
+ n_speakers=hps.data.n_speakers,
73
+ **hps.model,
74
+ ).to(device)
75
+ _ = net_g.eval()
76
+ _ = utils.load_checkpoint(model_path, net_g, None, skip_optimizer=True)
77
+ return net_g
78
+
79
+ def get_text(text, language_str, hps, device):
80
  norm_text, phone, tone, word2ph = clean_text(text, language_str)
81
  phone, tone, language = cleaned_text_to_sequence(phone, tone, language_str)
82
+ print(text)
83
  if hps.data.add_blank:
84
  phone = commons.intersperse(phone, 0)
85
  tone = commons.intersperse(tone, 0)
 
87
  for i in range(len(word2ph)):
88
  word2ph[i] = word2ph[i] * 2
89
  word2ph[0] += 1
90
+ bert_ori = get_bert(norm_text, word2ph, language_str, device)
91
  del word2ph
92
+ assert bert_ori.shape[-1] == len(phone), phone
93
 
94
  if language_str == "ZH":
95
+ bert = bert_ori
96
+ ja_bert = torch.zeros(1024, len(phone))
97
+ en_bert = torch.zeros(1024, len(phone))
98
+ elif language_str == "JP":
99
  bert = torch.zeros(1024, len(phone))
100
+ ja_bert = bert_ori
101
+ en_bert = torch.zeros(1024, len(phone))
102
+ elif language_str == "EN":
103
  bert = torch.zeros(1024, len(phone))
104
+ ja_bert = torch.zeros(1024, len(phone))
105
+ en_bert = bert_ori
106
+ else:
107
+ raise ValueError("language_str should be ZH, JP or EN")
108
 
109
  assert bert.shape[-1] == len(
110
  phone
 
113
  phone = torch.LongTensor(phone)
114
  tone = torch.LongTensor(tone)
115
  language = torch.LongTensor(language)
116
+ return bert, ja_bert, en_bert, phone, tone, language
117
 
118
+ def get_emo_(reference_audio, emotion):
119
+ emo = (
120
+ torch.from_numpy(get_emo(reference_audio))
121
+ if reference_audio
122
+ else torch.Tensor([emotion])
123
+ )
124
+ return emo
125
+
126
+ def get_emo(path):
127
+ wav, sr = librosa.load(path, 16000)
128
+ device = config.bert_gen_config.device
129
+ return process_func(
130
+ np.expand_dims(wav, 0).astype(np.float),
131
+ sr,
132
+ emotional_model,
133
+ emotional_processor,
134
+ device,
135
+ embeddings=True,
136
+ ).squeeze(0)
137
+
138
+ def infer(
139
+ text,
140
+ sdp_ratio,
141
+ noise_scale,
142
+ noise_scale_w,
143
+ length_scale,
144
+ sid,
145
+ reference_audio=None,
146
+ emotion=None,
147
+ ):
148
 
149
+ language= 'JP' if is_japanese(text) else 'ZH'
150
+ bert, ja_bert, en_bert, phones, tones, lang_ids = get_text(
151
+ text, language, hps, device
152
+ )
153
+ emo = get_emo_(reference_audio, emotion)
154
  with torch.no_grad():
155
  x_tst = phones.to(device).unsqueeze(0)
156
  tones = tones.to(device).unsqueeze(0)
157
  lang_ids = lang_ids.to(device).unsqueeze(0)
158
  bert = bert.to(device).unsqueeze(0)
159
  ja_bert = ja_bert.to(device).unsqueeze(0)
160
+ en_bert = en_bert.to(device).unsqueeze(0)
161
  x_tst_lengths = torch.LongTensor([phones.size(0)]).to(device)
162
+ emo = emo.to(device).unsqueeze(0)
163
  del phones
164
  speakers = torch.LongTensor([hps.data.spk2id[sid]]).to(device)
165
  audio = (
 
171
  lang_ids,
172
  bert,
173
  ja_bert,
174
+ en_bert,
175
+ emo,
176
  sdp_ratio=sdp_ratio,
177
  noise_scale=noise_scale,
178
  noise_scale_w=noise_scale_w,
 
182
  .float()
183
  .numpy()
184
  )
185
+ del x_tst, tones, lang_ids, bert, x_tst_lengths, speakers, ja_bert, en_bert, emo
186
+ if torch.cuda.is_available():
 
187
  torch.cuda.empty_cache()
188
+ return (hps.data.sampling_rate,audio)
 
189
 
190
+ def is_japanese(string):
191
+ for ch in string:
192
+ if ord(ch) > 0x3040 and ord(ch) < 0x30FF:
193
+ return True
194
+ return False
 
195
 
196
  def loadmodel(model):
197
  _ = net_g.eval()
198
  _ = utils.load_checkpoint(model, net_g, None, skip_optimizer=True)
199
  return "success"
200
 
 
201
  if __name__ == "__main__":
202
+ emotional_model_name = "./emotional/wav2vec2-large-robust-12-ft-emotion-msp-dim"
203
+ REPO_ID = "audeering/wav2vec2-large-robust-12-ft-emotion-msp-dim"
204
+ emotional_processor = Wav2Vec2Processor.from_pretrained(emotional_model_name)
205
+ emotional_model = EmotionModel.from_pretrained(emotional_model_name).to(device)
206
+ hps = utils.get_hparams_from_file('Data/BanGDream/configs/config.json')
207
+ net_g = get_net_g(
208
+ model_path='Data/BangDream/models/G_49000.pth', version="2.1", device=device, hps=hps
 
 
 
 
 
209
  )
 
210
  speaker_ids = hps.data.spk2id
211
  speakers = list(speaker_ids.keys())
212
+ languages = [ "Auto", "ZH", "JP"]
 
 
 
213
  modelPaths = []
214
+ for dirpath, dirnames, filenames in os.walk("Data/BanGDream/models/"):
215
  for filename in filenames:
216
  modelPaths.append(os.path.join(dirpath, filename))
217
  with gr.Blocks() as app:
 
 
 
218
  for band in BandList:
219
  with gr.TabItem(band):
220
  for name in BandList[band]:
 
230
  length_scale = gr.Slider(
231
  minimum=0.1, maximum=2, value=1, step=0.01, label="语速调节"
232
  )
233
+ emotion = gr.Slider(
234
+ minimum=0, maximum=9, value=0, step=1, label="Emotion"
235
+ )
236
+ with gr.Accordion(label="参数设定", open=False):
 
 
 
 
 
 
 
 
 
 
237
  sdp_ratio = gr.Slider(
238
  minimum=0, maximum=1, value=0.2, step=0.01, label="SDP/DP混合比"
239
  )
 
243
  noise_scale_w = gr.Slider(
244
  minimum=0.1, maximum=2, value=0.8, step=0.01, label="音素长度"
245
  )
 
246
  speaker = gr.Dropdown(
247
  choices=speakers, value=name, label="说话人"
248
+ )
249
+ with gr.Accordion(label="切换模型", open=False):
250
+ modelstrs = gr.Dropdown(label = "模型", choices = modelPaths, value = modelPaths[0], type = "value")
251
+ btnMod = gr.Button("载入模型")
252
+ statusa = gr.TextArea()
253
+ btnMod.click(loadmodel, inputs=[modelstrs], outputs = [statusa])
254
+ with gr.Column():
255
+ text = gr.TextArea(
256
+ label="输入纯日语或者中文",
257
+ placeholder="输入纯日语或者中文",
258
+ value="为什么要演奏春日影!",
259
+ )
260
+ reference_audio = gr.Audio(label="情感参考音频(WAV 格式):用于生成语音的情感参考。(WAV 格式)", type="filepath")
261
+ btn = gr.Button("点击生成", variant="primary")
262
+ audio_output = gr.Audio(label="Output Audio")
263
+ '''
264
+ btntran = gr.Button("快速中翻日")
265
+ translateResult = gr.TextArea("从这复制翻译后的文本")
266
+ btntran.click(translate, inputs=[text], outputs = [translateResult])
267
+ '''
268
  btn.click(
269
+ infer,
270
  inputs=[
271
  text,
 
272
  sdp_ratio,
273
  noise_scale,
274
  noise_scale_w,
275
  length_scale,
 
276
  speaker,
277
+ reference_audio,
278
+ emotion,
 
 
 
 
279
  ],
280
+ outputs=[audio_output],
281
  )
282
+
283
+ print("推理页面已开启!")
284
+ app.launch()
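For reference, a hedged sketch of calling the new infer() defined above outside of the Gradio UI, assuming the module-level setup in app.py's __main__ block has already run (hps, net_g, emotional_model and emotional_processor loaded). The argument values mirror the slider defaults and are illustrative only.

```python
# Illustrative only: relies on the globals initialized in app.py's __main__ block.
sampling_rate, audio = infer(
    "为什么要演奏春日影!",   # text; Japanese vs. Chinese is auto-detected by is_japanese()
    sdp_ratio=0.2,
    noise_scale=0.6,
    noise_scale_w=0.8,
    length_scale=1.0,
    sid="燈",                 # any key of hps.data.spk2id
    reference_audio=None,     # or a WAV path used as an emotion reference
    emotion=0,                # 0-9 slider value used when no reference audio is given
)

# Write the result to disk, as the earlier version of app.py did with scipy.
from scipy.io.wavfile import write
write("voice.wav", sampling_rate, audio)
```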
attentions_onnx.py ADDED
@@ -0,0 +1,378 @@
1
+ import math
2
+ import torch
3
+ from torch import nn
4
+ from torch.nn import functional as F
5
+
6
+ import commons
7
+ import logging
8
+
9
+ logger = logging.getLogger(__name__)
10
+
11
+
12
+ class LayerNorm(nn.Module):
13
+ def __init__(self, channels, eps=1e-5):
14
+ super().__init__()
15
+ self.channels = channels
16
+ self.eps = eps
17
+
18
+ self.gamma = nn.Parameter(torch.ones(channels))
19
+ self.beta = nn.Parameter(torch.zeros(channels))
20
+
21
+ def forward(self, x):
22
+ x = x.transpose(1, -1)
23
+ x = F.layer_norm(x, (self.channels,), self.gamma, self.beta, self.eps)
24
+ return x.transpose(1, -1)
25
+
26
+
27
+ @torch.jit.script
28
+ def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels):
29
+ n_channels_int = n_channels[0]
30
+ in_act = input_a + input_b
31
+ t_act = torch.tanh(in_act[:, :n_channels_int, :])
32
+ s_act = torch.sigmoid(in_act[:, n_channels_int:, :])
33
+ acts = t_act * s_act
34
+ return acts
35
+
36
+
37
+ class Encoder(nn.Module):
38
+ def __init__(
39
+ self,
40
+ hidden_channels,
41
+ filter_channels,
42
+ n_heads,
43
+ n_layers,
44
+ kernel_size=1,
45
+ p_dropout=0.0,
46
+ window_size=4,
47
+ isflow=True,
48
+ **kwargs
49
+ ):
50
+ super().__init__()
51
+ self.hidden_channels = hidden_channels
52
+ self.filter_channels = filter_channels
53
+ self.n_heads = n_heads
54
+ self.n_layers = n_layers
55
+ self.kernel_size = kernel_size
56
+ self.p_dropout = p_dropout
57
+ self.window_size = window_size
58
+ # if isflow:
59
+ # cond_layer = torch.nn.Conv1d(256, 2*hidden_channels*n_layers, 1)
60
+ # self.cond_pre = torch.nn.Conv1d(hidden_channels, 2*hidden_channels, 1)
61
+ # self.cond_layer = weight_norm(cond_layer, name='weight')
62
+ # self.gin_channels = 256
63
+ self.cond_layer_idx = self.n_layers
64
+ if "gin_channels" in kwargs:
65
+ self.gin_channels = kwargs["gin_channels"]
66
+ if self.gin_channels != 0:
67
+ self.spk_emb_linear = nn.Linear(self.gin_channels, self.hidden_channels)
68
+ # vits2 says 3rd block, so idx is 2 by default
69
+ self.cond_layer_idx = (
70
+ kwargs["cond_layer_idx"] if "cond_layer_idx" in kwargs else 2
71
+ )
72
+ logging.debug(self.gin_channels, self.cond_layer_idx)
73
+ assert (
74
+ self.cond_layer_idx < self.n_layers
75
+ ), "cond_layer_idx should be less than n_layers"
76
+ self.drop = nn.Dropout(p_dropout)
77
+ self.attn_layers = nn.ModuleList()
78
+ self.norm_layers_1 = nn.ModuleList()
79
+ self.ffn_layers = nn.ModuleList()
80
+ self.norm_layers_2 = nn.ModuleList()
81
+ for i in range(self.n_layers):
82
+ self.attn_layers.append(
83
+ MultiHeadAttention(
84
+ hidden_channels,
85
+ hidden_channels,
86
+ n_heads,
87
+ p_dropout=p_dropout,
88
+ window_size=window_size,
89
+ )
90
+ )
91
+ self.norm_layers_1.append(LayerNorm(hidden_channels))
92
+ self.ffn_layers.append(
93
+ FFN(
94
+ hidden_channels,
95
+ hidden_channels,
96
+ filter_channels,
97
+ kernel_size,
98
+ p_dropout=p_dropout,
99
+ )
100
+ )
101
+ self.norm_layers_2.append(LayerNorm(hidden_channels))
102
+
103
+ def forward(self, x, x_mask, g=None):
104
+ attn_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1)
105
+ x = x * x_mask
106
+ for i in range(self.n_layers):
107
+ if i == self.cond_layer_idx and g is not None:
108
+ g = self.spk_emb_linear(g.transpose(1, 2))
109
+ g = g.transpose(1, 2)
110
+ x = x + g
111
+ x = x * x_mask
112
+ y = self.attn_layers[i](x, x, attn_mask)
113
+ y = self.drop(y)
114
+ x = self.norm_layers_1[i](x + y)
115
+
116
+ y = self.ffn_layers[i](x, x_mask)
117
+ y = self.drop(y)
118
+ x = self.norm_layers_2[i](x + y)
119
+ x = x * x_mask
120
+ return x
121
+
122
+
123
+ class MultiHeadAttention(nn.Module):
124
+ def __init__(
125
+ self,
126
+ channels,
127
+ out_channels,
128
+ n_heads,
129
+ p_dropout=0.0,
130
+ window_size=None,
131
+ heads_share=True,
132
+ block_length=None,
133
+ proximal_bias=False,
134
+ proximal_init=False,
135
+ ):
136
+ super().__init__()
137
+ assert channels % n_heads == 0
138
+
139
+ self.channels = channels
140
+ self.out_channels = out_channels
141
+ self.n_heads = n_heads
142
+ self.p_dropout = p_dropout
143
+ self.window_size = window_size
144
+ self.heads_share = heads_share
145
+ self.block_length = block_length
146
+ self.proximal_bias = proximal_bias
147
+ self.proximal_init = proximal_init
148
+ self.attn = None
149
+
150
+ self.k_channels = channels // n_heads
151
+ self.conv_q = nn.Conv1d(channels, channels, 1)
152
+ self.conv_k = nn.Conv1d(channels, channels, 1)
153
+ self.conv_v = nn.Conv1d(channels, channels, 1)
154
+ self.conv_o = nn.Conv1d(channels, out_channels, 1)
155
+ self.drop = nn.Dropout(p_dropout)
156
+
157
+ if window_size is not None:
158
+ n_heads_rel = 1 if heads_share else n_heads
159
+ rel_stddev = self.k_channels**-0.5
160
+ self.emb_rel_k = nn.Parameter(
161
+ torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels)
162
+ * rel_stddev
163
+ )
164
+ self.emb_rel_v = nn.Parameter(
165
+ torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels)
166
+ * rel_stddev
167
+ )
168
+
169
+ nn.init.xavier_uniform_(self.conv_q.weight)
170
+ nn.init.xavier_uniform_(self.conv_k.weight)
171
+ nn.init.xavier_uniform_(self.conv_v.weight)
172
+ if proximal_init:
173
+ with torch.no_grad():
174
+ self.conv_k.weight.copy_(self.conv_q.weight)
175
+ self.conv_k.bias.copy_(self.conv_q.bias)
176
+
177
+ def forward(self, x, c, attn_mask=None):
178
+ q = self.conv_q(x)
179
+ k = self.conv_k(c)
180
+ v = self.conv_v(c)
181
+
182
+ x, self.attn = self.attention(q, k, v, mask=attn_mask)
183
+
184
+ x = self.conv_o(x)
185
+ return x
186
+
187
+ def attention(self, query, key, value, mask=None):
188
+ # reshape [b, d, t] -> [b, n_h, t, d_k]
189
+ b, d, t_s, t_t = (*key.size(), query.size(2))
190
+ query = query.view(b, self.n_heads, self.k_channels, t_t).transpose(2, 3)
191
+ key = key.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3)
192
+ value = value.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3)
193
+
194
+ scores = torch.matmul(query / math.sqrt(self.k_channels), key.transpose(-2, -1))
195
+ if self.window_size is not None:
196
+ assert (
197
+ t_s == t_t
198
+ ), "Relative attention is only available for self-attention."
199
+ key_relative_embeddings = self._get_relative_embeddings(self.emb_rel_k, t_s)
200
+ rel_logits = self._matmul_with_relative_keys(
201
+ query / math.sqrt(self.k_channels), key_relative_embeddings
202
+ )
203
+ scores_local = self._relative_position_to_absolute_position(rel_logits)
204
+ scores = scores + scores_local
205
+ if self.proximal_bias:
206
+ assert t_s == t_t, "Proximal bias is only available for self-attention."
207
+ scores = scores + self._attention_bias_proximal(t_s).to(
208
+ device=scores.device, dtype=scores.dtype
209
+ )
210
+ if mask is not None:
211
+ scores = scores.masked_fill(mask == 0, -1e4)
212
+ if self.block_length is not None:
213
+ assert (
214
+ t_s == t_t
215
+ ), "Local attention is only available for self-attention."
216
+ block_mask = (
217
+ torch.ones_like(scores)
218
+ .triu(-self.block_length)
219
+ .tril(self.block_length)
220
+ )
221
+ scores = scores.masked_fill(block_mask == 0, -1e4)
222
+ p_attn = F.softmax(scores, dim=-1) # [b, n_h, t_t, t_s]
223
+ p_attn = self.drop(p_attn)
224
+ output = torch.matmul(p_attn, value)
225
+ if self.window_size is not None:
226
+ relative_weights = self._absolute_position_to_relative_position(p_attn)
227
+ value_relative_embeddings = self._get_relative_embeddings(
228
+ self.emb_rel_v, t_s
229
+ )
230
+ output = output + self._matmul_with_relative_values(
231
+ relative_weights, value_relative_embeddings
232
+ )
233
+ output = (
234
+ output.transpose(2, 3).contiguous().view(b, d, t_t)
235
+ ) # [b, n_h, t_t, d_k] -> [b, d, t_t]
236
+ return output, p_attn
237
+
238
+ def _matmul_with_relative_values(self, x, y):
239
+ """
240
+ x: [b, h, l, m]
241
+ y: [h or 1, m, d]
242
+ ret: [b, h, l, d]
243
+ """
244
+ ret = torch.matmul(x, y.unsqueeze(0))
245
+ return ret
246
+
247
+ def _matmul_with_relative_keys(self, x, y):
248
+ """
249
+ x: [b, h, l, d]
250
+ y: [h or 1, m, d]
251
+ ret: [b, h, l, m]
252
+ """
253
+ ret = torch.matmul(x, y.unsqueeze(0).transpose(-2, -1))
254
+ return ret
255
+
256
+ def _get_relative_embeddings(self, relative_embeddings, length):
257
+ max_relative_position = 2 * self.window_size + 1
258
+ # Pad first before slice to avoid using cond ops.
259
+ pad_length = max(length - (self.window_size + 1), 0)
260
+ slice_start_position = max((self.window_size + 1) - length, 0)
261
+ slice_end_position = slice_start_position + 2 * length - 1
262
+ if pad_length > 0:
263
+ padded_relative_embeddings = F.pad(
264
+ relative_embeddings,
265
+ commons.convert_pad_shape([[0, 0], [pad_length, pad_length], [0, 0]]),
266
+ )
267
+ else:
268
+ padded_relative_embeddings = relative_embeddings
269
+ used_relative_embeddings = padded_relative_embeddings[
270
+ :, slice_start_position:slice_end_position
271
+ ]
272
+ return used_relative_embeddings
273
+
274
+ def _relative_position_to_absolute_position(self, x):
275
+ """
276
+ x: [b, h, l, 2*l-1]
277
+ ret: [b, h, l, l]
278
+ """
279
+ batch, heads, length, _ = x.size()
280
+ # Concat columns of pad to shift from relative to absolute indexing.
281
+ x = F.pad(x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, 1]]))
282
+
283
+ # Concat extra elements so to add up to shape (len+1, 2*len-1).
284
+ x_flat = x.view([batch, heads, length * 2 * length])
285
+ x_flat = F.pad(
286
+ x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [0, length - 1]])
287
+ )
288
+
289
+ # Reshape and slice out the padded elements.
290
+ x_final = x_flat.view([batch, heads, length + 1, 2 * length - 1])[
291
+ :, :, :length, length - 1 :
292
+ ]
293
+ return x_final
294
+
295
+ def _absolute_position_to_relative_position(self, x):
296
+ """
297
+ x: [b, h, l, l]
298
+ ret: [b, h, l, 2*l-1]
299
+ """
300
+ batch, heads, length, _ = x.size()
301
+ # padd along column
302
+ x = F.pad(
303
+ x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, length - 1]])
304
+ )
305
+ x_flat = x.view([batch, heads, length**2 + length * (length - 1)])
306
+ # add 0's in the beginning that will skew the elements after reshape
307
+ x_flat = F.pad(x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [length, 0]]))
308
+ x_final = x_flat.view([batch, heads, length, 2 * length])[:, :, :, 1:]
309
+ return x_final
310
+
311
+ def _attention_bias_proximal(self, length):
312
+ """Bias for self-attention to encourage attention to close positions.
313
+ Args:
314
+ length: an integer scalar.
315
+ Returns:
316
+ a Tensor with shape [1, 1, length, length]
317
+ """
318
+ r = torch.arange(length, dtype=torch.float32)
319
+ diff = torch.unsqueeze(r, 0) - torch.unsqueeze(r, 1)
320
+ return torch.unsqueeze(torch.unsqueeze(-torch.log1p(torch.abs(diff)), 0), 0)
321
+
322
+
323
+ class FFN(nn.Module):
324
+ def __init__(
325
+ self,
326
+ in_channels,
327
+ out_channels,
328
+ filter_channels,
329
+ kernel_size,
330
+ p_dropout=0.0,
331
+ activation=None,
332
+ causal=False,
333
+ ):
334
+ super().__init__()
335
+ self.in_channels = in_channels
336
+ self.out_channels = out_channels
337
+ self.filter_channels = filter_channels
338
+ self.kernel_size = kernel_size
339
+ self.p_dropout = p_dropout
340
+ self.activation = activation
341
+ self.causal = causal
342
+
343
+ if causal:
344
+ self.padding = self._causal_padding
345
+ else:
346
+ self.padding = self._same_padding
347
+
348
+ self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size)
349
+ self.conv_2 = nn.Conv1d(filter_channels, out_channels, kernel_size)
350
+ self.drop = nn.Dropout(p_dropout)
351
+
352
+ def forward(self, x, x_mask):
353
+ x = self.conv_1(self.padding(x * x_mask))
354
+ if self.activation == "gelu":
355
+ x = x * torch.sigmoid(1.702 * x)
356
+ else:
357
+ x = torch.relu(x)
358
+ x = self.drop(x)
359
+ x = self.conv_2(self.padding(x * x_mask))
360
+ return x * x_mask
361
+
362
+ def _causal_padding(self, x):
363
+ if self.kernel_size == 1:
364
+ return x
365
+ pad_l = self.kernel_size - 1
366
+ pad_r = 0
367
+ padding = [[0, 0], [0, 0], [pad_l, pad_r]]
368
+ x = F.pad(x, commons.convert_pad_shape(padding))
369
+ return x
370
+
371
+ def _same_padding(self, x):
372
+ if self.kernel_size == 1:
373
+ return x
374
+ pad_l = (self.kernel_size - 1) // 2
375
+ pad_r = self.kernel_size // 2
376
+ padding = [[0, 0], [0, 0], [pad_l, pad_r]]
377
+ x = F.pad(x, commons.convert_pad_shape(padding))
378
+ return x
bert/bert-base-japanese-v3/README.md CHANGED
@@ -50,4 +50,4 @@ The pretrained models are distributed under the Apache License 2.0.
 
  ## Acknowledgments
 
- This model is trained with Cloud TPUs provided by [TPU Research Cloud](https://sites.research.google/trc/about/) program.

 
  ## Acknowledgments
 
+ This model is trained with Cloud TPUs provided by [TPU Research Cloud](https://sites.research.google/trc/about/) program.
bert/bert-base-japanese-v3/vocab.txt CHANGED
@@ -13,7 +13,7 @@
  [unused7]
  [unused8]
  [unused9]
-
  !
  "
  #

  [unused7]
  [unused8]
  [unused9]
+
  !
  "
  #
bert/bert-large-japanese-v2/.gitattributes ADDED
@@ -0,0 +1,34 @@
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tflite filter=lfs diff=lfs merge=lfs -text
29
+ *.tgz filter=lfs diff=lfs merge=lfs -text
30
+ *.wasm filter=lfs diff=lfs merge=lfs -text
31
+ *.xz filter=lfs diff=lfs merge=lfs -text
32
+ *.zip filter=lfs diff=lfs merge=lfs -text
33
+ *.zst filter=lfs diff=lfs merge=lfs -text
34
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
bert/bert-large-japanese-v2/README.md ADDED
@@ -0,0 +1,53 @@
1
+ ---
2
+ license: apache-2.0
3
+ datasets:
4
+ - cc100
5
+ - wikipedia
6
+ language:
7
+ - ja
8
+ widget:
9
+ - text: 東北大学で[MASK]の研究をしています。
10
+ ---
11
+
12
+ # BERT large Japanese (unidic-lite with whole word masking, CC-100 and jawiki-20230102)
13
+
14
+ This is a [BERT](https://github.com/google-research/bert) model pretrained on texts in the Japanese language.
15
+
16
+ This version of the model processes input texts with word-level tokenization based on the Unidic 2.1.2 dictionary (available in [unidic-lite](https://pypi.org/project/unidic-lite/) package), followed by the WordPiece subword tokenization.
17
+ Additionally, the model is trained with the whole word masking enabled for the masked language modeling (MLM) objective.
18
+
19
+ The codes for the pretraining are available at [cl-tohoku/bert-japanese](https://github.com/cl-tohoku/bert-japanese/).
20
+
21
+ ## Model architecture
22
+
23
+ The model architecture is the same as the original BERT large model; 24 layers, 1024 dimensions of hidden states, and 16 attention heads.
24
+
25
+ ## Training Data
26
+
27
+ The model is trained on the Japanese portion of [CC-100 dataset](https://data.statmt.org/cc-100/) and the Japanese version of Wikipedia.
28
+ For Wikipedia, we generated a text corpus from the [Wikipedia Cirrussearch dump file](https://dumps.wikimedia.org/other/cirrussearch/) as of January 2, 2023.
29
+ The corpus files generated from CC-100 and Wikipedia are 74.3GB and 4.9GB in size and consist of approximately 392M and 34M sentences, respectively.
30
+
31
+ For the purpose of splitting texts into sentences, we used [fugashi](https://github.com/polm/fugashi) with [mecab-ipadic-NEologd](https://github.com/neologd/mecab-ipadic-neologd) dictionary (v0.0.7).
32
+
33
+ ## Tokenization
34
+
35
+ The texts are first tokenized by MeCab with the Unidic 2.1.2 dictionary and then split into subwords by the WordPiece algorithm.
36
+ The vocabulary size is 32768.
37
+
38
+ We used [fugashi](https://github.com/polm/fugashi) and [unidic-lite](https://github.com/polm/unidic-lite) packages for the tokenization.
39
+
40
+ ## Training
41
+
42
+ We trained the model first on the CC-100 corpus for 1M steps and then on the Wikipedia corpus for another 1M steps.
43
+ For training of the MLM (masked language modeling) objective, we introduced whole word masking in which all of the subword tokens corresponding to a single word (tokenized by MeCab) are masked at once.
44
+
45
+ For training of each model, we used a v3-8 instance of Cloud TPUs provided by [TPU Research Cloud](https://sites.research.google/trc/about/).
46
+
47
+ ## Licenses
48
+
49
+ The pretrained models are distributed under the Apache License 2.0.
50
+
51
+ ## Acknowledgments
52
+
53
+ This model is trained with Cloud TPUs provided by [TPU Research Cloud](https://sites.research.google/trc/about/) program.
bert/bert-large-japanese-v2/config.json ADDED
@@ -0,0 +1,19 @@
1
+ {
2
+ "architectures": [
3
+ "BertForPreTraining"
4
+ ],
5
+ "attention_probs_dropout_prob": 0.1,
6
+ "hidden_act": "gelu",
7
+ "hidden_dropout_prob": 0.1,
8
+ "hidden_size": 1024,
9
+ "initializer_range": 0.02,
10
+ "intermediate_size": 4096,
11
+ "layer_norm_eps": 1e-12,
12
+ "max_position_embeddings": 512,
13
+ "model_type": "bert",
14
+ "num_attention_heads": 16,
15
+ "num_hidden_layers": 24,
16
+ "pad_token_id": 0,
17
+ "type_vocab_size": 2,
18
+ "vocab_size": 32768
19
+ }
bert/bert-large-japanese-v2/tokenizer_config.json ADDED
@@ -0,0 +1,10 @@
1
+ {
2
+ "tokenizer_class": "BertJapaneseTokenizer",
3
+ "model_max_length": 512,
4
+ "do_lower_case": false,
5
+ "word_tokenizer_type": "mecab",
6
+ "subword_tokenizer_type": "wordpiece",
7
+ "mecab_kwargs": {
8
+ "mecab_dic": "unidic_lite"
9
+ }
10
+ }
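A hedged sketch (not part of the commit) showing how a tokenizer configured like this — BertJapaneseTokenizer with MeCab word tokenization over unidic-lite followed by WordPiece — can be loaded with transformers. It assumes the fugashi and unidic-lite packages mentioned in the model card are installed and that the vocab file sits next to this config.

```python
from transformers import AutoTokenizer

# Loads tokenizer_config.json / vocab.txt from the local folder added in this commit;
# the MeCab word tokenizer needs `pip install fugashi unidic-lite`.
tokenizer = AutoTokenizer.from_pretrained("./bert/bert-large-japanese-v2")

tokens = tokenizer.tokenize("東北大学で自然言語処理の研究をしています。")
print(tokens)                # MeCab word split, then WordPiece subwords
print(tokenizer.mask_token)  # "[MASK]"
```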
bert/bert-large-japanese-v2/vocab.txt ADDED
The diff for this file is too large to render. See raw diff
 
bert/bert_models.json ADDED
@@ -0,0 +1,14 @@
1
+ {
2
+ "deberta-v2-large-japanese-char-wwm": {
3
+ "repo_id": "ku-nlp/deberta-v2-large-japanese-char-wwm",
4
+ "files": ["pytorch_model.bin"]
5
+ },
6
+ "chinese-roberta-wwm-ext-large": {
7
+ "repo_id": "hfl/chinese-roberta-wwm-ext-large",
8
+ "files": ["pytorch_model.bin"]
9
+ },
10
+ "deberta-v3-large": {
11
+ "repo_id": "microsoft/deberta-v3-large",
12
+ "files": ["spm.model", "pytorch_model.bin"]
13
+ }
14
+ }
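bert_models.json maps each local bert/ subfolder to a Hugging Face repo and the large files that are not checked in directly (e.g. pytorch_model.bin). A hedged sketch of how such a manifest could be used to fetch the missing weights with huggingface_hub — the helper below is illustrative, not the project's own downloader:

```python
import json
import os

from huggingface_hub import hf_hub_download


def download_bert_weights(manifest_path: str = "bert/bert_models.json") -> None:
    """Fetch the files listed in bert_models.json into the matching bert/ folders."""
    with open(manifest_path, encoding="utf-8") as f:
        manifest = json.load(f)
    for local_name, entry in manifest.items():
        target_dir = os.path.join("bert", local_name)
        for filename in entry["files"]:
            hf_hub_download(
                repo_id=entry["repo_id"],
                filename=filename,
                local_dir=target_dir,
            )


if __name__ == "__main__":
    download_bert_weights()
```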
bert/chinese-roberta-wwm-ext-large/README.md CHANGED
@@ -1,5 +1,5 @@
1
  ---
2
- language:
3
  - zh
4
  tags:
5
  - bert
@@ -9,9 +9,9 @@ license: "apache-2.0"
9
  # Please use 'Bert' related functions to load this model!
10
 
11
  ## Chinese BERT with Whole Word Masking
12
- For further accelerating Chinese natural language processing, we provide **Chinese pre-trained BERT with Whole Word Masking**.
13
 
14
- **[Pre-Training with Whole Word Masking for Chinese BERT](https://arxiv.org/abs/1906.08101)**
15
  Yiming Cui, Wanxiang Che, Ting Liu, Bing Qin, Ziqing Yang, Shijin Wang, Guoping Hu
16
 
17
  This repository is developed based on:https://github.com/google-research/bert
@@ -46,7 +46,7 @@ If you find the technical report or resource is useful, please cite the followin
46
  pages = "657--668",
47
  }
48
  ```
49
- - Secondary: https://arxiv.org/abs/1906.08101
50
  ```
51
  @article{chinese-bert-wwm,
52
  title={Pre-Training with Whole Word Masking for Chinese BERT},
@@ -54,4 +54,4 @@ If you find the technical report or resource is useful, please cite the followin
54
  journal={arXiv preprint arXiv:1906.08101},
55
  year={2019}
56
  }
57
- ```
 
1
  ---
2
+ language:
3
  - zh
4
  tags:
5
  - bert
 
9
  # Please use 'Bert' related functions to load this model!
10
 
11
  ## Chinese BERT with Whole Word Masking
12
+ For further accelerating Chinese natural language processing, we provide **Chinese pre-trained BERT with Whole Word Masking**.
13
 
14
+ **[Pre-Training with Whole Word Masking for Chinese BERT](https://arxiv.org/abs/1906.08101)**
15
  Yiming Cui, Wanxiang Che, Ting Liu, Bing Qin, Ziqing Yang, Shijin Wang, Guoping Hu
16
 
17
  This repository is developed based on:https://github.com/google-research/bert
 
46
  pages = "657--668",
47
  }
48
  ```
49
+ - Secondary: https://arxiv.org/abs/1906.08101
50
  ```
51
  @article{chinese-bert-wwm,
52
  title={Pre-Training with Whole Word Masking for Chinese BERT},
 
54
  journal={arXiv preprint arXiv:1906.08101},
55
  year={2019}
56
  }
57
+ ```
bert/chinese-roberta-wwm-ext-large/added_tokens.json CHANGED
@@ -1 +1 @@
1
- {}
 
1
+ {}
bert/chinese-roberta-wwm-ext-large/special_tokens_map.json CHANGED
@@ -1 +1 @@
1
- {"unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]"}
 
1
+ {"unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]"}
bert/chinese-roberta-wwm-ext-large/tokenizer.json CHANGED
The diff for this file is too large to render. See raw diff
 
bert/chinese-roberta-wwm-ext-large/tokenizer_config.json CHANGED
@@ -1 +1 @@
1
- {"init_inputs": []}
 
1
+ {"init_inputs": []}
bert/deberta-v2-large-japanese-char-wwm/.gitattributes ADDED
@@ -0,0 +1,34 @@
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tflite filter=lfs diff=lfs merge=lfs -text
29
+ *.tgz filter=lfs diff=lfs merge=lfs -text
30
+ *.wasm filter=lfs diff=lfs merge=lfs -text
31
+ *.xz filter=lfs diff=lfs merge=lfs -text
32
+ *.zip filter=lfs diff=lfs merge=lfs -text
33
+ *.zst filter=lfs diff=lfs merge=lfs -text
34
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
bert/deberta-v2-large-japanese-char-wwm/README.md ADDED
@@ -0,0 +1,89 @@
1
+ ---
2
+ language: ja
3
+ license: cc-by-sa-4.0
4
+ library_name: transformers
5
+ tags:
6
+ - deberta
7
+ - deberta-v2
8
+ - fill-mask
9
+ - character
10
+ - wwm
11
+ datasets:
12
+ - wikipedia
13
+ - cc100
14
+ - oscar
15
+ metrics:
16
+ - accuracy
17
+ mask_token: "[MASK]"
18
+ widget:
19
+ - text: "京都大学で自然言語処理を[MASK][MASK]する。"
20
+ ---
21
+
22
+ # Model Card for Japanese character-level DeBERTa V2 large
23
+
24
+ ## Model description
25
+
26
+ This is a Japanese DeBERTa V2 large model pre-trained on Japanese Wikipedia, the Japanese portion of CC-100, and the Japanese portion of OSCAR.
27
+ This model is trained with character-level tokenization and whole word masking.
28
+
29
+ ## How to use
30
+
31
+ You can use this model for masked language modeling as follows:
32
+
33
+ ```python
34
+ from transformers import AutoTokenizer, AutoModelForMaskedLM
35
+ tokenizer = AutoTokenizer.from_pretrained('ku-nlp/deberta-v2-large-japanese-char-wwm')
36
+ model = AutoModelForMaskedLM.from_pretrained('ku-nlp/deberta-v2-large-japanese-char-wwm')
37
+
38
+ sentence = '京都大学で自然言語処理を[MASK][MASK]する。'
39
+ encoding = tokenizer(sentence, return_tensors='pt')
40
+ ...
41
+ ```
42
+
43
+ You can also fine-tune this model on downstream tasks.
44
+
45
+ ## Tokenization
46
+
47
+ There is no need to tokenize texts in advance, and you can give raw texts to the tokenizer.
48
+ The texts are tokenized into character-level tokens by [sentencepiece](https://github.com/google/sentencepiece).
49
+
50
+ ## Training data
51
+
52
+ We used the following corpora for pre-training:
53
+
54
+ - Japanese Wikipedia (as of 20221020, 3.2GB, 27M sentences, 1.3M documents)
55
+ - Japanese portion of CC-100 (85GB, 619M sentences, 66M documents)
56
+ - Japanese portion of OSCAR (54GB, 326M sentences, 25M documents)
57
+
58
+ Note that we filtered out documents annotated with "header", "footer", or "noisy" tags in OSCAR.
59
+ Also note that Japanese Wikipedia was duplicated 10 times to make the total size of the corpus comparable to that of CC-100 and OSCAR. As a result, the total size of the training data is 171GB.
60
+
61
+ ## Training procedure
62
+
63
+ We first segmented texts in the corpora into words using [Juman++ 2.0.0-rc3](https://github.com/ku-nlp/jumanpp/releases/tag/v2.0.0-rc3) for whole word masking.
64
+ Then, we built a sentencepiece model with 22,012 tokens including all characters that appear in the training corpus.
65
+
66
+ We tokenized raw corpora into character-level subwords using the sentencepiece model and trained the Japanese DeBERTa model using [transformers](https://github.com/huggingface/transformers) library.
67
+ The training took 26 days using 16 NVIDIA A100-SXM4-40GB GPUs.
68
+
69
+ The following hyperparameters were used during pre-training:
70
+
71
+ - learning_rate: 1e-4
72
+ - per_device_train_batch_size: 26
73
+ - distributed_type: multi-GPU
74
+ - num_devices: 16
75
+ - gradient_accumulation_steps: 8
76
+ - total_train_batch_size: 3,328
77
+ - max_seq_length: 512
78
+ - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-06
79
+ - lr_scheduler_type: linear schedule with warmup (lr = 0 at 300k steps)
80
+ - training_steps: 260,000
81
+ - warmup_steps: 10,000
82
+
83
+ The accuracy of the trained model on the masked language modeling task was 0.795.
84
+ The evaluation set consists of 5,000 randomly sampled documents from each of the training corpora.
85
+
86
+ ## Acknowledgments
87
+
88
+ This work was supported by Joint Usage/Research Center for Interdisciplinary Large-scale Information Infrastructures (JHPCN) through General Collaboration Project no. jh221004, "Developing a Platform for Constructing and Sharing of Large-Scale Japanese Language Models".
89
+ For training models, we used the mdx: a platform for the data-driven future.
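The usage snippet in this model card stops at an ellipsis after building `encoding`. A minimal continuation is sketched below, assuming the standard transformers masked-LM API (logits over the vocabulary at each position); it is not taken from the model card itself.

```python
# Hedged continuation of the README snippet: score the [MASK] positions.
import torch
from transformers import AutoTokenizer, AutoModelForMaskedLM

tokenizer = AutoTokenizer.from_pretrained("ku-nlp/deberta-v2-large-japanese-char-wwm")
model = AutoModelForMaskedLM.from_pretrained("ku-nlp/deberta-v2-large-japanese-char-wwm")

sentence = "京都大学で自然言語処理を[MASK][MASK]する。"
encoding = tokenizer(sentence, return_tensors="pt")

with torch.no_grad():
    logits = model(**encoding).logits  # shape: (1, seq_len, vocab_size)

# Indices of the [MASK] tokens, then the highest-scoring character for each.
mask_positions = (encoding["input_ids"] == tokenizer.mask_token_id).nonzero(as_tuple=True)[1]
for pos in mask_positions:
    top_id = logits[0, pos].argmax(-1).item()
    print(tokenizer.decode([top_id]))
```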
bert/deberta-v2-large-japanese-char-wwm/config.json ADDED
@@ -0,0 +1,37 @@
1
+ {
2
+ "architectures": [
3
+ "DebertaV2ForMaskedLM"
4
+ ],
5
+ "attention_head_size": 64,
6
+ "attention_probs_dropout_prob": 0.1,
7
+ "conv_act": "gelu",
8
+ "conv_kernel_size": 3,
9
+ "hidden_act": "gelu",
10
+ "hidden_dropout_prob": 0.1,
11
+ "hidden_size": 1024,
12
+ "initializer_range": 0.02,
13
+ "intermediate_size": 4096,
14
+ "layer_norm_eps": 1e-07,
15
+ "max_position_embeddings": 512,
16
+ "max_relative_positions": -1,
17
+ "model_type": "deberta-v2",
18
+ "norm_rel_ebd": "layer_norm",
19
+ "num_attention_heads": 16,
20
+ "num_hidden_layers": 24,
21
+ "pad_token_id": 0,
22
+ "pooler_dropout": 0,
23
+ "pooler_hidden_act": "gelu",
24
+ "pooler_hidden_size": 1024,
25
+ "pos_att_type": [
26
+ "p2c",
27
+ "c2p"
28
+ ],
29
+ "position_biased_input": false,
30
+ "position_buckets": 256,
31
+ "relative_attention": true,
32
+ "share_att_key": true,
33
+ "torch_dtype": "float16",
34
+ "transformers_version": "4.25.1",
35
+ "type_vocab_size": 0,
36
+ "vocab_size": 22012
37
+ }
bert/deberta-v2-large-japanese-char-wwm/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bf0dab8ad87bd7c22e85ec71e04f2240804fda6d33196157d6b5923af6ea1201
3
+ size 1318456639
bert/deberta-v2-large-japanese-char-wwm/special_tokens_map.json ADDED
@@ -0,0 +1,7 @@
1
+ {
2
+ "cls_token": "[CLS]",
3
+ "mask_token": "[MASK]",
4
+ "pad_token": "[PAD]",
5
+ "sep_token": "[SEP]",
6
+ "unk_token": "[UNK]"
7
+ }
bert/deberta-v2-large-japanese-char-wwm/tokenizer_config.json ADDED
@@ -0,0 +1,19 @@
1
+ {
2
+ "cls_token": "[CLS]",
3
+ "do_lower_case": false,
4
+ "do_subword_tokenize": true,
5
+ "do_word_tokenize": true,
6
+ "jumanpp_kwargs": null,
7
+ "mask_token": "[MASK]",
8
+ "mecab_kwargs": null,
9
+ "model_max_length": 1000000000000000019884624838656,
10
+ "never_split": null,
11
+ "pad_token": "[PAD]",
12
+ "sep_token": "[SEP]",
13
+ "special_tokens_map_file": null,
14
+ "subword_tokenizer_type": "character",
15
+ "sudachi_kwargs": null,
16
+ "tokenizer_class": "BertJapaneseTokenizer",
17
+ "unk_token": "[UNK]",
18
+ "word_tokenizer_type": "basic"
19
+ }
bert/deberta-v2-large-japanese-char-wwm/vocab.txt ADDED
The diff for this file is too large to render. See raw diff
 
bert/deberta-v2-large-japanese/.gitattributes ADDED
@@ -0,0 +1,34 @@
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tflite filter=lfs diff=lfs merge=lfs -text
29
+ *.tgz filter=lfs diff=lfs merge=lfs -text
30
+ *.wasm filter=lfs diff=lfs merge=lfs -text
31
+ *.xz filter=lfs diff=lfs merge=lfs -text
32
+ *.zip filter=lfs diff=lfs merge=lfs -text
33
+ *.zst filter=lfs diff=lfs merge=lfs -text
34
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
bert/deberta-v2-large-japanese/README.md ADDED
@@ -0,0 +1,111 @@
1
+ ---
2
+ language: ja
3
+ license: cc-by-sa-4.0
4
+ library_name: transformers
5
+ tags:
6
+ - deberta
7
+ - deberta-v2
8
+ - fill-mask
9
+ datasets:
10
+ - wikipedia
11
+ - cc100
12
+ - oscar
13
+ metrics:
14
+ - accuracy
15
+ mask_token: "[MASK]"
16
+ widget:
17
+ - text: "京都 大学 で 自然 言語 処理 を [MASK] する 。"
18
+ ---
19
+
20
+ # Model Card for Japanese DeBERTa V2 large
21
+
22
+ ## Model description
23
+
24
+ This is a Japanese DeBERTa V2 large model pre-trained on Japanese Wikipedia, the Japanese portion of CC-100, and the
25
+ Japanese portion of OSCAR.
26
+
27
+ ## How to use
28
+
29
+ You can use this model for masked language modeling as follows:
30
+
31
+ ```python
32
+ from transformers import AutoTokenizer, AutoModelForMaskedLM
33
+
34
+ tokenizer = AutoTokenizer.from_pretrained('ku-nlp/deberta-v2-large-japanese')
35
+ model = AutoModelForMaskedLM.from_pretrained('ku-nlp/deberta-v2-large-japanese')
36
+
37
+ sentence = '京都 大学 で 自然 言語 処理 を [MASK] する 。' # input should be segmented into words by Juman++ in advance
38
+ encoding = tokenizer(sentence, return_tensors='pt')
39
+ ...
40
+ ```
41
+
42
+ You can also fine-tune this model on downstream tasks.
43
+
44
+ ## Tokenization
45
+
46
+ The input text should be segmented into words by [Juman++](https://github.com/ku-nlp/jumanpp) in
47
+ advance. [Juman++ 2.0.0-rc3](https://github.com/ku-nlp/jumanpp/releases/tag/v2.0.0-rc3) was used for pre-training. Each
48
+ word is tokenized into subwords by [sentencepiece](https://github.com/google/sentencepiece).
49
+
50
+ ## Training data
51
+
52
+ We used the following corpora for pre-training:
53
+
54
+ - Japanese Wikipedia (as of 20221020, 3.2GB, 27M sentences, 1.3M documents)
55
+ - Japanese portion of CC-100 (85GB, 619M sentences, 66M documents)
56
+ - Japanese portion of OSCAR (54GB, 326M sentences, 25M documents)
57
+
58
+ Note that we filtered out documents annotated with "header", "footer", or "noisy" tags in OSCAR.
59
+ Also note that Japanese Wikipedia was duplicated 10 times to make the total size of the corpus comparable to that of
60
+ CC-100 and OSCAR. As a result, the total size of the training data is 171GB.
61
+
62
+ ## Training procedure
63
+
64
+ We first segmented texts in the corpora into words using [Juman++](https://github.com/ku-nlp/jumanpp).
65
+ Then, we built a sentencepiece model with 32000 tokens including words ([JumanDIC](https://github.com/ku-nlp/JumanDIC))
66
+ and subwords induced by the unigram language model of [sentencepiece](https://github.com/google/sentencepiece).
67
+
68
+ We tokenized the segmented corpora into subwords using the sentencepiece model and trained the Japanese DeBERTa model
69
+ using [transformers](https://github.com/huggingface/transformers) library.
70
+ The training took 36 days using 8 NVIDIA A100-SXM4-40GB GPUs.
71
+
72
+ The following hyperparameters were used during pre-training:
73
+
74
+ - learning_rate: 1e-4
75
+ - per_device_train_batch_size: 18
76
+ - distributed_type: multi-GPU
77
+ - num_devices: 8
78
+ - gradient_accumulation_steps: 16
79
+ - total_train_batch_size: 2,304
80
+ - max_seq_length: 512
81
+ - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-06
82
+ - lr_scheduler_type: linear schedule with warmup
83
+ - training_steps: 300,000
84
+ - warmup_steps: 10,000
85
+
86
+ The accuracy of the trained model on the masked language modeling task was 0.799.
87
+ The evaluation set consists of 5,000 randomly sampled documents from each of the training corpora.
88
+
89
+ ## Fine-tuning on NLU tasks
90
+
91
+ We fine-tuned the following models and evaluated them on the dev set of JGLUE.
92
+ We tuned learning rate and training epochs for each model and task
93
+ following [the JGLUE paper](https://www.jstage.jst.go.jp/article/jnlp/30/1/30_63/_pdf/-char/ja).
94
+
95
+ | Model | MARC-ja/acc | JSTS/pearson | JSTS/spearman | JNLI/acc | JSQuAD/EM | JSQuAD/F1 | JComQA/acc |
96
+ |-------------------------------|-------------|--------------|---------------|----------|-----------|-----------|------------|
97
+ | Waseda RoBERTa base | 0.965 | 0.913 | 0.876 | 0.905 | 0.853 | 0.916 | 0.853 |
98
+ | Waseda RoBERTa large (seq512) | 0.969 | 0.925 | 0.890 | 0.928 | 0.910 | 0.955 | 0.900 |
99
+ | LUKE Japanese base* | 0.965 | 0.916 | 0.877 | 0.912 | - | - | 0.842 |
100
+ | LUKE Japanese large* | 0.965 | 0.932 | 0.902 | 0.927 | - | - | 0.893 |
101
+ | DeBERTaV2 base | 0.970 | 0.922 | 0.886 | 0.922 | 0.899 | 0.951 | 0.873 |
102
+ | DeBERTaV2 large | 0.968 | 0.925 | 0.892 | 0.924 | 0.912 | 0.959 | 0.890 |
103
+
104
+ *The scores of LUKE are from [the official repository](https://github.com/studio-ousia/luke).
105
+
106
+ ## Acknowledgments
107
+
108
+ This work was supported by Joint Usage/Research Center for Interdisciplinary Large-scale Information Infrastructures (
109
+ JHPCN) through General Collaboration Project no. jh221004, "Developing a Platform for Constructing and Sharing of
110
+ Large-Scale Japanese Language Models".
111
+ For training models, we used the mdx: a platform for the data-driven future.
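This model card requires the input to be segmented into words by Juman++ before tokenization. The sketch below shows one way to do that pre-segmentation step with the `pyknp` wrapper; it assumes a local Juman++ installation and is illustrative only, since the commit does not show how this project performs the segmentation.

```python
# Hypothetical pre-segmentation step (assumes pyknp + a jumanpp binary on PATH).
from pyknp import Juman
from transformers import AutoTokenizer

jumanpp = Juman()  # wraps the locally installed Juman++ binary
tokenizer = AutoTokenizer.from_pretrained("ku-nlp/deberta-v2-large-japanese")

raw = "京都大学で自然言語処理を研究する。"
words = [m.midasi for m in jumanpp.analysis(raw).mrph_list()]
segmented = " ".join(words)  # e.g. "京都 大学 で 自然 言語 処理 を 研究 する 。"

encoding = tokenizer(segmented, return_tensors="pt")
print(encoding["input_ids"].shape)
```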
bert/deberta-v2-large-japanese/config.json ADDED
@@ -0,0 +1,38 @@
1
+ {
2
+ "_name_or_path": "configs/deberta_v2_large.json",
3
+ "architectures": [
4
+ "DebertaV2ForMaskedLM"
5
+ ],
6
+ "attention_head_size": 64,
7
+ "attention_probs_dropout_prob": 0.1,
8
+ "conv_act": "gelu",
9
+ "conv_kernel_size": 3,
10
+ "hidden_act": "gelu",
11
+ "hidden_dropout_prob": 0.1,
12
+ "hidden_size": 1024,
13
+ "initializer_range": 0.02,
14
+ "intermediate_size": 4096,
15
+ "layer_norm_eps": 1e-07,
16
+ "max_position_embeddings": 512,
17
+ "max_relative_positions": -1,
18
+ "model_type": "deberta-v2",
19
+ "norm_rel_ebd": "layer_norm",
20
+ "num_attention_heads": 16,
21
+ "num_hidden_layers": 24,
22
+ "pad_token_id": 0,
23
+ "pooler_dropout": 0,
24
+ "pooler_hidden_act": "gelu",
25
+ "pooler_hidden_size": 1024,
26
+ "pos_att_type": [
27
+ "p2c",
28
+ "c2p"
29
+ ],
30
+ "position_biased_input": false,
31
+ "position_buckets": 256,
32
+ "relative_attention": true,
33
+ "share_att_key": true,
34
+ "torch_dtype": "float32",
35
+ "transformers_version": "4.23.1",
36
+ "type_vocab_size": 0,
37
+ "vocab_size": 32000
38
+ }
bert/deberta-v2-large-japanese/special_tokens_map.json ADDED
@@ -0,0 +1,9 @@
1
+ {
2
+ "bos_token": "[CLS]",
3
+ "cls_token": "[CLS]",
4
+ "eos_token": "[SEP]",
5
+ "mask_token": "[MASK]",
6
+ "pad_token": "[PAD]",
7
+ "sep_token": "[SEP]",
8
+ "unk_token": "[UNK]"
9
+ }
bert/deberta-v2-large-japanese/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
bert/deberta-v2-large-japanese/tokenizer_config.json ADDED
@@ -0,0 +1,15 @@
1
+ {
2
+ "bos_token": "[CLS]",
3
+ "cls_token": "[CLS]",
4
+ "do_lower_case": false,
5
+ "eos_token": "[SEP]",
6
+ "keep_accents": true,
7
+ "mask_token": "[MASK]",
8
+ "pad_token": "[PAD]",
9
+ "sep_token": "[SEP]",
10
+ "sp_model_kwargs": {},
11
+ "special_tokens_map_file": null,
12
+ "split_by_punct": false,
13
+ "tokenizer_class": "DebertaV2Tokenizer",
14
+ "unk_token": "[UNK]"
15
+ }
bert/deberta-v3-large/.gitattributes ADDED
@@ -0,0 +1,27 @@
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bin.* filter=lfs diff=lfs merge=lfs -text
5
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.model filter=lfs diff=lfs merge=lfs -text
12
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
13
+ *.onnx filter=lfs diff=lfs merge=lfs -text
14
+ *.ot filter=lfs diff=lfs merge=lfs -text
15
+ *.parquet filter=lfs diff=lfs merge=lfs -text
16
+ *.pb filter=lfs diff=lfs merge=lfs -text
17
+ *.pt filter=lfs diff=lfs merge=lfs -text
18
+ *.pth filter=lfs diff=lfs merge=lfs -text
19
+ *.rar filter=lfs diff=lfs merge=lfs -text
20
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
21
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
22
+ *.tflite filter=lfs diff=lfs merge=lfs -text
23
+ *.tgz filter=lfs diff=lfs merge=lfs -text
24
+ *.xz filter=lfs diff=lfs merge=lfs -text
25
+ *.zip filter=lfs diff=lfs merge=lfs -text
26
+ *.zstandard filter=lfs diff=lfs merge=lfs -text
27
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
bert/deberta-v3-large/README.md ADDED
@@ -0,0 +1,93 @@
1
+ ---
2
+ language: en
3
+ tags:
4
+ - deberta
5
+ - deberta-v3
6
+ - fill-mask
7
+ thumbnail: https://huggingface.co/front/thumbnails/microsoft.png
8
+ license: mit
9
+ ---
10
+
11
+ ## DeBERTaV3: Improving DeBERTa using ELECTRA-Style Pre-Training with Gradient-Disentangled Embedding Sharing
12
+
13
+ [DeBERTa](https://arxiv.org/abs/2006.03654) improves the BERT and RoBERTa models using disentangled attention and an enhanced mask decoder. With those two improvements, DeBERTa outperforms RoBERTa on a majority of NLU tasks with 80GB of training data.
14
+
15
+ In [DeBERTa V3](https://arxiv.org/abs/2111.09543), we further improved the efficiency of DeBERTa using ELECTRA-style pre-training with Gradient-Disentangled Embedding Sharing. Compared to DeBERTa, our V3 version significantly improves model performance on downstream tasks. You can find more technical details about the new model in our [paper](https://arxiv.org/abs/2111.09543).
16
+
17
+ Please check the [official repository](https://github.com/microsoft/DeBERTa) for more implementation details and updates.
18
+
19
+ The DeBERTa V3 large model comes with 24 layers and a hidden size of 1024. It has 304M backbone parameters and a vocabulary of 128K tokens, which introduces 131M parameters in the embedding layer. This model was trained on the same 160GB data as DeBERTa V2.
20
+
21
+
22
+ #### Fine-tuning on NLU tasks
23
+
24
+ We present the dev results on SQuAD 2.0 and MNLI tasks.
25
+
26
+ | Model |Vocabulary(K)|Backbone #Params(M)| SQuAD 2.0(F1/EM) | MNLI-m/mm(ACC)|
27
+ |-------------------|----------|-------------------|-----------|----------|
28
+ | RoBERTa-large |50 |304 | 89.4/86.5 | 90.2 |
29
+ | XLNet-large |32 |- | 90.6/87.9 | 90.8 |
30
+ | DeBERTa-large |50 |- | 90.7/88.0 | 91.3 |
31
+ | **DeBERTa-v3-large**|128|304 | **91.5/89.0**| **91.8/91.9**|
32
+
33
+
34
+ #### Fine-tuning with HF transformers
35
+
36
+ ```bash
37
+ #!/bin/bash
38
+
39
+ cd transformers/examples/pytorch/text-classification/
40
+
41
+ pip install datasets
42
+ export TASK_NAME=mnli
43
+
44
+ output_dir="ds_results"
45
+
46
+ num_gpus=8
47
+
48
+ batch_size=8
49
+
50
+ python -m torch.distributed.launch --nproc_per_node=${num_gpus} \
51
+ run_glue.py \
52
+ --model_name_or_path microsoft/deberta-v3-large \
53
+ --task_name $TASK_NAME \
54
+ --do_train \
55
+ --do_eval \
56
+ --evaluation_strategy steps \
57
+ --max_seq_length 256 \
58
+ --warmup_steps 50 \
59
+ --per_device_train_batch_size ${batch_size} \
60
+ --learning_rate 6e-6 \
61
+ --num_train_epochs 2 \
62
+ --output_dir $output_dir \
63
+ --overwrite_output_dir \
64
+ --logging_steps 1000 \
65
+ --logging_dir $output_dir
66
+
67
+ ```
68
+
69
+ ### Citation
70
+
71
+ If you find DeBERTa useful for your work, please cite the following papers:
72
+
73
+ ``` latex
74
+ @misc{he2021debertav3,
75
+ title={DeBERTaV3: Improving DeBERTa using ELECTRA-Style Pre-Training with Gradient-Disentangled Embedding Sharing},
76
+ author={Pengcheng He and Jianfeng Gao and Weizhu Chen},
77
+ year={2021},
78
+ eprint={2111.09543},
79
+ archivePrefix={arXiv},
80
+ primaryClass={cs.CL}
81
+ }
82
+ ```
83
+
84
+ ``` latex
85
+ @inproceedings{
86
+ he2021deberta,
87
+ title={DEBERTA: DECODING-ENHANCED BERT WITH DISENTANGLED ATTENTION},
88
+ author={Pengcheng He and Xiaodong Liu and Jianfeng Gao and Weizhu Chen},
89
+ booktitle={International Conference on Learning Representations},
90
+ year={2021},
91
+ url={https://openreview.net/forum?id=XPZIaotutsD}
92
+ }
93
+ ```
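Since this commit vendors config.json, spm.model and pytorch_model.bin under bert/deberta-v3-large, the checkpoint can also be loaded from that local directory rather than from the Hub. A minimal sketch is below; the class names assume the standard transformers DeBERTa-v2 implementation and `sentencepiece` being installed, and the local path is this repository's layout, not something the model card prescribes.

```python
# Hedged sketch: load the locally vendored DeBERTa-v3-large copy added here.
from transformers import DebertaV2Tokenizer, DebertaV2Model

local_dir = "bert/deberta-v3-large"  # contains config.json, spm.model, pytorch_model.bin
tokenizer = DebertaV2Tokenizer.from_pretrained(local_dir)
model = DebertaV2Model.from_pretrained(local_dir)

out = model(**tokenizer("A quick smoke test.", return_tensors="pt"))
print(out.last_hidden_state.shape)  # (1, seq_len, 1024)
```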
bert/deberta-v3-large/config.json ADDED
@@ -0,0 +1,22 @@
1
+ {
2
+ "model_type": "deberta-v2",
3
+ "attention_probs_dropout_prob": 0.1,
4
+ "hidden_act": "gelu",
5
+ "hidden_dropout_prob": 0.1,
6
+ "hidden_size": 1024,
7
+ "initializer_range": 0.02,
8
+ "intermediate_size": 4096,
9
+ "max_position_embeddings": 512,
10
+ "relative_attention": true,
11
+ "position_buckets": 256,
12
+ "norm_rel_ebd": "layer_norm",
13
+ "share_att_key": true,
14
+ "pos_att_type": "p2c|c2p",
15
+ "layer_norm_eps": 1e-7,
16
+ "max_relative_positions": -1,
17
+ "position_biased_input": false,
18
+ "num_attention_heads": 16,
19
+ "num_hidden_layers": 24,
20
+ "type_vocab_size": 0,
21
+ "vocab_size": 128100
22
+ }
bert/deberta-v3-large/generator_config.json ADDED
@@ -0,0 +1,22 @@
1
+ {
2
+ "model_type": "deberta-v2",
3
+ "attention_probs_dropout_prob": 0.1,
4
+ "hidden_act": "gelu",
5
+ "hidden_dropout_prob": 0.1,
6
+ "hidden_size": 1024,
7
+ "initializer_range": 0.02,
8
+ "intermediate_size": 4096,
9
+ "max_position_embeddings": 512,
10
+ "relative_attention": true,
11
+ "position_buckets": 256,
12
+ "norm_rel_ebd": "layer_norm",
13
+ "share_att_key": true,
14
+ "pos_att_type": "p2c|c2p",
15
+ "layer_norm_eps": 1e-7,
16
+ "max_relative_positions": -1,
17
+ "position_biased_input": false,
18
+ "num_attention_heads": 16,
19
+ "num_hidden_layers": 12,
20
+ "type_vocab_size": 0,
21
+ "vocab_size": 128100
22
+ }
bert/deberta-v3-large/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dd5b5d93e2db101aaf281df0ea1216c07ad73620ff59c5b42dccac4bf2eef5b5
3
+ size 873673253
bert/deberta-v3-large/spm.model ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c679fbf93643d19aab7ee10c0b99e460bdbc02fedf34b92b05af343b4af586fd
3
+ size 2464616
bert/deberta-v3-large/tokenizer_config.json ADDED
@@ -0,0 +1,4 @@
1
+ {
2
+ "do_lower_case": false,
3
+ "vocab_type": "spm"
4
+ }
bert_gen.py CHANGED
@@ -6,14 +6,19 @@ from tqdm import tqdm
6
  from text import cleaned_text_to_sequence, get_bert
7
  import argparse
8
  import torch.multiprocessing as mp
 
9
 
10
 
11
  def process_line(line):
12
- rank = mp.current_process()._identity
13
- rank = rank[0] if len(rank) > 0 else 0
14
- if torch.cuda.is_available():
15
- gpu_id = rank % torch.cuda.device_count()
16
- device = torch.device(f"cuda:{gpu_id}")
 
 
 
 
17
  wav_path, _, language_str, text, phones, tone, word2ph = line.strip().split("|")
18
  phone = phones.split(" ")
19
  tone = [int(i) for i in tone.split(" ")]
@@ -21,15 +26,14 @@ def process_line(line):
21
  word2ph = [i for i in word2ph]
22
  phone, tone, language = cleaned_text_to_sequence(phone, tone, language_str)
23
 
24
- if hps.data.add_blank:
25
- phone = commons.intersperse(phone, 0)
26
- tone = commons.intersperse(tone, 0)
27
- language = commons.intersperse(language, 0)
28
- for i in range(len(word2ph)):
29
- word2ph[i] = word2ph[i] * 2
30
- word2ph[0] += 1
31
 
32
- bert_path = wav_path.replace(".wav", ".bert.pt")
33
 
34
  try:
35
  bert = torch.load(bert_path)
@@ -40,11 +44,17 @@ def process_line(line):
40
  torch.save(bert, bert_path)
41
 
42
 
 
 
43
  if __name__ == "__main__":
44
  parser = argparse.ArgumentParser()
45
- parser.add_argument("-c", "--config", type=str, default="configs/config.json")
46
- parser.add_argument("--num_processes", type=int, default=2)
47
- args = parser.parse_args()
 
 
 
 
48
  config_path = args.config
49
  hps = utils.get_hparams_from_file(config_path)
50
  lines = []
@@ -53,8 +63,10 @@ if __name__ == "__main__":
53
 
54
  with open(hps.data.validation_files, encoding="utf-8") as f:
55
  lines.extend(f.readlines())
 
 
 
 
 
56
 
57
- num_processes = args.num_processes
58
- with Pool(processes=num_processes) as pool:
59
- for _ in tqdm(pool.imap_unordered(process_line, lines), total=len(lines)):
60
- pass
 
6
  from text import cleaned_text_to_sequence, get_bert
7
  import argparse
8
  import torch.multiprocessing as mp
9
+ from config import config
10
 
11
 
12
  def process_line(line):
13
+ device = config.bert_gen_config.device
14
+ if config.bert_gen_config.use_multi_device:
15
+ rank = mp.current_process()._identity
16
+ rank = rank[0] if len(rank) > 0 else 0
17
+ if torch.cuda.is_available():
18
+ gpu_id = rank % torch.cuda.device_count()
19
+ device = torch.device(f"cuda:{gpu_id}")
20
+ else:
21
+ device = torch.device("cpu")
22
  wav_path, _, language_str, text, phones, tone, word2ph = line.strip().split("|")
23
  phone = phones.split(" ")
24
  tone = [int(i) for i in tone.split(" ")]
 
26
  word2ph = [i for i in word2ph]
27
  phone, tone, language = cleaned_text_to_sequence(phone, tone, language_str)
28
 
29
+ phone = commons.intersperse(phone, 0)
30
+ tone = commons.intersperse(tone, 0)
31
+ language = commons.intersperse(language, 0)
32
+ for i in range(len(word2ph)):
33
+ word2ph[i] = word2ph[i] * 2
34
+ word2ph[0] += 1
 
35
 
36
+ bert_path = wav_path.replace(".WAV", ".wav").replace(".wav", ".bert.pt")
37
 
38
  try:
39
  bert = torch.load(bert_path)
 
44
  torch.save(bert, bert_path)
45
 
46
 
47
+ preprocess_text_config = config.preprocess_text_config
48
+
49
  if __name__ == "__main__":
50
  parser = argparse.ArgumentParser()
51
+ parser.add_argument(
52
+ "-c", "--config", type=str, default=config.bert_gen_config.config_path
53
+ )
54
+ parser.add_argument(
55
+ "--num_processes", type=int, default=config.bert_gen_config.num_processes
56
+ )
57
+ args, _ = parser.parse_known_args()
58
  config_path = args.config
59
  hps = utils.get_hparams_from_file(config_path)
60
  lines = []
 
63
 
64
  with open(hps.data.validation_files, encoding="utf-8") as f:
65
  lines.extend(f.readlines())
66
+ if len(lines) != 0:
67
+ num_processes = args.num_processes
68
+ with Pool(processes=num_processes) as pool:
69
+ for _ in tqdm(pool.imap_unordered(process_line, lines), total=len(lines)):
70
+ pass
71
 
72
+ print(f"bert生成完毕!, 共有{len(lines)}个bert.pt生成!")
 
 
 
commons.py CHANGED
@@ -50,7 +50,13 @@ def slice_segments(x, ids_str, segment_size=4):
50
  for i in range(x.size(0)):
51
  idx_str = ids_str[i]
52
  idx_end = idx_str + segment_size
53
- ret[i] = x[i, :, idx_str:idx_end]
 
 
 
 
 
 
54
  return ret
55
 
56
 
 
50
  for i in range(x.size(0)):
51
  idx_str = ids_str[i]
52
  idx_end = idx_str + segment_size
53
+ if idx_str < 0:
54
+ i1 = x.size(2) + idx_str
55
+ r1 = x[i, :, i1:]
56
+ r2 = x[i, :, :idx_end]
57
+ ret[i] = torch.cat([r1, r2], dim=1)
58
+ else:
59
+ ret[i] = x[i, :, idx_str:idx_end]
60
  return ret
61
 
62
 
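The commons.py change above makes slice_segments tolerate a negative start index by wrapping the segment around the end of the sequence. A small self-contained check of that wrap-around behaviour, on a single (batch, channel, time) tensor, is sketched below; the sample values are illustrative.

```python
# Verify the wrap-around slice: a negative start takes the tail, then the head.
import torch

x = torch.arange(10).view(1, 1, 10)   # (batch=1, channels=1, length=10)
segment_size, idx_str = 4, -2          # start two frames before the end
idx_end = idx_str + segment_size

if idx_str < 0:
    head = x[0, :, x.size(2) + idx_str:]  # last two frames: 8, 9
    tail = x[0, :, :idx_end]              # first two frames: 0, 1
    segment = torch.cat([head, tail], dim=1)
else:
    segment = x[0, :, idx_str:idx_end]

print(segment)  # tensor([[8, 9, 0, 1]])
```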
config.yml ADDED
@@ -0,0 +1,174 @@
1
+ # 全局配置
2
+ # 对于希望在同一时间使用多个配置文件的情况,例如两个GPU同时跑两个训练集:通过环境变量指定配置文件,不指定则默认为./config.yml
3
+
4
+ # 拟提供通用路径配置,统一存放数据,避免数据放得很乱
5
+ # 每个数据集与其对应的模型存放至统一路径下,后续所有的路径配置均为相对于datasetPath的路径
6
+ # 不填或者填空则路径为相对于项目根目录的路径
7
+ dataset_path: "Data/BanGDream"
8
+
9
+ # 模型镜像源,默认huggingface,使用openi镜像源需指定openi_token
10
+ mirror: "openai"
11
+ openi_token: "" # openi token
12
+
13
+ # resample 音频重采样配置
14
+ # 注意, “:” 后需要加空格
15
+ resample:
16
+ # 目标重采样率
17
+ sampling_rate: 44100
18
+ # 音频文件输入路径,重采样会将该路径下所有.wav音频文件重采样
19
+ # 请填入相对于datasetPath的相对路径
20
+ in_dir: "" # 相对于根目录的路径为 /datasetPath/in_dir
21
+ # 音频文件重采样后输出路径
22
+ out_dir: ""
23
+
24
+
25
+ # preprocess_text 数据集预处理相关配置
26
+ # 注意, “:” 后需要加空格
27
+ preprocess_text:
28
+ # 原始文本文件路径,文本格式应为{wav_path}|{speaker_name}|{language}|{text}。
29
+ transcription_path: "filelists/Mygo.list"
30
+ # 数据清洗后文本路径,可以不填。不填则将在原始文本目录生成
31
+ cleaned_path: ""
32
+ # 训练集路径
33
+ train_path: "filelists/train.list"
34
+ # 验证集路径
35
+ val_path: "filelists/val.list"
36
+ # 配置文件路径
37
+ config_path: "configs/config.json"
38
+ # 每个speaker的验证集条数
39
+ val_per_spk: 4
40
+ # 验证集最大条数,多于的会被截断并放到训练集中
41
+ max_val_total: 8
42
+ # 是否进行数据清洗
43
+ clean: true
44
+
45
+
46
+ # bert_gen 相关配置
47
+ # 注意, “:” 后需要加空格
48
+ bert_gen:
49
+ # 训练数据集配置文件路径
50
+ config_path: "configs/config.json"
51
+ # 并行数
52
+ num_processes: 2
53
+ # 使用设备:可选项 "cuda" 显卡推理,"cpu" cpu推理
54
+ # 该选项同时决定了get_bert_feature的默认设备
55
+ device: "cuda"
56
+ # 使用多卡推理
57
+ use_multi_device: false
58
+
59
+ # emo_gen 相关配置
60
+ # 注意, “:” 后需要加空格
61
+ emo_gen:
62
+ # 训练数据集配置文件路径
63
+ config_path: "configs/config.json"
64
+ # 并行数
65
+ num_processes: 2
66
+ # 使用设备:可选项 "cuda" 显卡推理,"cpu" cpu推理
67
+ device: "cuda"
68
+
69
+ # train 训练配置
70
+ # 注意, “:” 后需要加空格
71
+ train_ms:
72
+ env:
73
+ MASTER_ADDR: "localhost"
74
+ MASTER_PORT: 10086
75
+ WORLD_SIZE: 1
76
+ LOCAL_RANK: 0
77
+ RANK: 0
78
+ # 可以填写任意名的环境变量
79
+ # THE_ENV_VAR_YOU_NEED_TO_USE: "1234567"
80
+ # 底模设置
81
+ base:
82
+ use_base_model: True
83
+ repo_id: "Stardust_minus/Bert-VITS2"
84
+ model_image: "Bert-VITS2_2.1-Emo底模" # openi网页的模型名
85
+ # 训练模型存储目录:与旧版本的区别,原先数据集是存放在logs/model_name下的,现在改为统一存放在Data/你的数据集/models下
86
+ model: "models"
87
+ # 配置文件路径
88
+ config_path: "configs/config.json"
89
+ # 训练使用的worker,不建议超过CPU核心数
90
+ num_workers: 16
91
+ # 关闭此项可以节约接近50%的磁盘空间,但是可能导致实际训练速度变慢和更高的CPU使用率。
92
+ spec_cache: True
93
+ # 保存的检查点数量,多于此数目的权重会被删除来节省空间。
94
+ keep_ckpts: 8
95
+
96
+
97
+ # webui webui配置
98
+ # 注意, “:” 后需要加空格
99
+ webui:
100
+ # 推理设备
101
+ device: "cpu"
102
+ # 模型路径
103
+ model: "models/G_30000.pth"
104
+ # 配置文件路径
105
+ config_path: "configs/config.json"
106
+ # 端口号
107
+ port: 7860
108
+ # 是否公开部署,对外网开放
109
+ share: false
110
+ # 是否开启debug模式
111
+ debug: false
112
+ # 语种识别库,可选langid, fastlid
113
+ language_identification_library: "langid"
114
+
115
+
116
+ # server api配置
117
+ # 注意, “:” 后需要加空格
118
+ # 注意,本配置下的所有配置均为相对于根目录的路径
119
+ server:
120
+ # 端口号
121
+ port: 5000
122
+ # 模型默认使用设备:但是当前并没有实现这个配置。
123
+ device: "cuda"
124
+ # 需要加载的所有模型的配置
125
+ # 注意,所有模型都必须正确配置model与config的路径,空路径会导致加载错误。
126
+ models:
127
+ - # 模型的路径
128
+ model: ""
129
+ # 模型config.json的路径
130
+ config: ""
131
+ # 模型使用设备,若填写则会覆盖默认配置
132
+ device: "cuda"
133
+ # 模型默认使用的语言
134
+ language: "ZH"
135
+ # 模型人物默认参数
136
+ # 不必填写所有人物,不填的使用默认值
137
+ # 暂时不用填写,当前尚未实现按人区分配置
138
+ speakers:
139
+ - speaker: "科比"
140
+ sdp_ratio: 0.2
141
+ noise_scale: 0.6
142
+ noise_scale_w: 0.8
143
+ length_scale: 1
144
+ - speaker: "五条悟"
145
+ sdp_ratio: 0.3
146
+ noise_scale: 0.7
147
+ noise_scale_w: 0.8
148
+ length_scale: 0.5
149
+ - speaker: "安倍晋三"
150
+ sdp_ratio: 0.2
151
+ noise_scale: 0.6
152
+ noise_scale_w: 0.8
153
+ length_scale: 1.2
154
+ - # 模型的路径
155
+ model: ""
156
+ # 模型config.json的路径
157
+ config: ""
158
+ # 模型使用设备,若填写则会覆盖默认配置
159
+ device: "cpu"
160
+ # 模型默认使用的语言
161
+ language: "JP"
162
+ # 模型人物默认参数
163
+ # 不必填写所有人物,不填的使用默认值
164
+ speakers: [ ] # 也可以不填
165
+
166
+
167
+ # 百度翻译开放平台 api配置
168
+ # api接入文档 https://api.fanyi.baidu.com/doc/21
169
+ # 请不要在github等网站公开分享你的app id 与 key
170
+ translate:
171
+ # 你的APPID
172
+ "app_key": ""
173
+ # 你的密钥
174
+ "secret_key": ""
configs/config.json CHANGED
@@ -2,9 +2,9 @@
2
  "train": {
3
  "log_interval": 200,
4
  "eval_interval": 1000,
5
- "seed": 52,
6
- "epochs": 10000,
7
- "learning_rate": 0.0003,
8
  "betas": [
9
  0.8,
10
  0.99
@@ -12,7 +12,7 @@
12
  "eps": 1e-09,
13
  "batch_size": 24,
14
  "fp16_run": false,
15
- "lr_decay": 0.999875,
16
  "segment_size": 16384,
17
  "init_lr_ratio": 1,
18
  "warmup_epochs": 0,
@@ -32,82 +32,864 @@
32
  "mel_fmin": 0.0,
33
  "mel_fmax": null,
34
  "add_blank": true,
35
- "n_speakers": 256,
36
  "cleaned_text": true,
37
  "spk2id": {
38
- "biaobei": 0,
39
- "香澄": 1,
40
- "有咲": 2,
41
- "沙綾": 3,
42
- "りみ": 4,
43
- "たえ": 5,
44
- "沙綾、りみ、たえ": 6,
45
- "": 7,
46
- "一同": 8,
47
- "まりな": 9,
48
- "ゆり": 10,
49
- "ポピパ一同": 11,
50
- "明日香": 12,
51
- "???": 13,
52
- "オーナー": 14,
53
- "全員": 15,
54
- "Poppin'Party": 16,
55
- "ひまり": 17,
56
- "モカ": 18,
57
- "つぐみ": 19,
58
- "": 20,
59
- "リサ": 21,
60
- "千聖": 22,
61
- "花音": 23,
62
- "イヴ": 24,
63
- "日菜": 25,
64
- "友希那": 26,
65
- "紗夜": 27,
66
- "Afterglow": 28,
67
- "こころ": 29,
68
- "美咲": 30,
69
- "": 31,
70
- "はぐみ": 32,
71
- "ミッシェル": 33,
72
- "マリー": 34,
73
- "怪盗ハロハッピー": 35,
74
- "ハロー、ハッピーワールド!": 36,
75
- "ニコリーナ": 37,
76
- "": 38,
77
- "麻弥": 39,
78
- "パスパレ一同": 40,
79
- "燐子": 41,
80
- "あこ": 42,
81
- "あこのチャット": 43,
82
- "燐子のチャット": 44,
83
- "燐子チャット": 45,
84
- "Roselia": 46,
85
- "ゆきな": 47,
86
- "ましろ": 48,
87
- "つくし": 49,
88
- "透子": 50,
89
- "七深": 51,
90
- "瑠唯": 52,
91
- "六花": 53,
92
- "パレオ": 54,
93
- "レイヤ": 55,
94
- "マスキング": 56,
95
- "チュチュ": 57,
96
- "ますき": 58,
97
- "ロック": 59,
98
- "令王那": 60,
99
- "CHIYU": 61,
100
- "レイ": 62,
101
- "詩船": 63,
102
- "珠手ちゆ": 64,
103
- "": 65,
104
- "そよ": 66,
105
- "祥子": 67,
106
- "立希": 68,
107
- "": 69,
108
- "愛音": 70,
109
- "楽奈": 71,
110
- "海鈴": 72
 
 
 
 
111
  }
112
  },
113
  "model": {
@@ -163,5 +945,6 @@
163
  "n_layers_q": 3,
164
  "use_spectral_norm": false,
165
  "gin_channels": 256
166
- }
167
- }
 
 
2
  "train": {
3
  "log_interval": 200,
4
  "eval_interval": 1000,
5
+ "seed": 42,
6
+ "epochs": 1000,
7
+ "learning_rate": 0.0002,
8
  "betas": [
9
  0.8,
10
  0.99
 
12
  "eps": 1e-09,
13
  "batch_size": 24,
14
  "fp16_run": false,
15
+ "lr_decay": 0.99995,
16
  "segment_size": 16384,
17
  "init_lr_ratio": 1,
18
  "warmup_epochs": 0,
 
32
  "mel_fmin": 0.0,
33
  "mel_fmax": null,
34
  "add_blank": true,
35
+ "n_speakers": 896,
36
  "cleaned_text": true,
37
  "spk2id": {
38
+ "派蒙_ZH": 0,
39
+ "纳西妲_ZH": 1,
40
+ "凯亚_ZH": 2,
41
+ "阿贝多_ZH": 3,
42
+ "温迪_ZH": 4,
43
+ "枫原万叶_ZH": 5,
44
+ "钟离_ZH": 6,
45
+ "荒泷一斗_ZH": 7,
46
+ "八重神子_ZH": 8,
47
+ "艾尔海森_ZH": 9,
48
+ "提纳里_ZH": 10,
49
+ "迪希雅_ZH": 11,
50
+ "卡维_ZH": 12,
51
+ "宵宫_ZH": 13,
52
+ "那维莱特_ZH": 14,
53
+ "莱依拉_ZH": 15,
54
+ "赛诺_ZH": 16,
55
+ "莫娜_ZH": 17,
56
+ "诺艾尔_ZH": 18,
57
+ "托马_ZH": 19,
58
+ "凝光_ZH": 20,
59
+ "林尼_ZH": 21,
60
+ "北斗_ZH": 22,
61
+ "柯莱_ZH": 23,
62
+ "神里绫华_ZH": 24,
63
+ "可莉_ZH": 25,
64
+ "芭芭拉_ZH": 26,
65
+ "雷电将军_ZH": 27,
66
+ "娜维娅_ZH": 28,
67
+ "芙宁娜_ZH": 29,
68
+ "珊瑚宫心海_ZH": 30,
69
+ "鹿野院平藏_ZH": 31,
70
+ "迪奥娜_ZH": 32,
71
+ "琴_ZH": 33,
72
+ "五郎_ZH": 34,
73
+ "班尼特_ZH": 35,
74
+ "达达利亚_ZH": 36,
75
+ "安柏_ZH": 37,
76
+ "莱欧斯利_ZH": 38,
77
+ "夜兰_ZH": 39,
78
+ "妮露_ZH": 40,
79
+ "辛焱_ZH": 41,
80
+ "丽莎_ZH": 42,
81
+ "珐露珊_ZH": 43,
82
+ "魈_ZH": 44,
83
+ "香菱_ZH": 45,
84
+ "迪卢克_ZH": 46,
85
+ "砂糖_ZH": 47,
86
+ "烟绯_ZH": 48,
87
+ "早柚_ZH": 49,
88
+ "云堇_ZH": 50,
89
+ "刻晴_ZH": 51,
90
+ "重云_ZH": 52,
91
+ "优菈_ZH": 53,
92
+ "胡桃_ZH": 54,
93
+ "流浪者_ZH": 55,
94
+ "久岐忍_ZH": 56,
95
+ "神里绫人_ZH": 57,
96
+ "甘雨_ZH": 58,
97
+ "戴因斯雷布_ZH": 59,
98
+ "菲谢尔_ZH": 60,
99
+ "白术_ZH": 61,
100
+ "行秋_ZH": 62,
101
+ "九条裟罗_ZH": 63,
102
+ "夏洛蒂_ZH": 64,
103
+ "雷泽_ZH": 65,
104
+ "申鹤_ZH": 66,
105
+ "荧_ZH": 67,
106
+ "空_ZH": 68,
107
+ "迪娜泽黛_ZH": 69,
108
+ "凯瑟琳_ZH": 70,
109
+ "多莉_ZH": 71,
110
+ "坎蒂丝_ZH": 72,
111
+ "琳妮特_ZH": 73,
112
+ "萍姥姥_ZH": 74,
113
+ "罗莎莉亚_ZH": 75,
114
+ "埃德_ZH": 76,
115
+ "爱贝尔_ZH": 77,
116
+ "伊迪娅_ZH": 78,
117
+ "留云借风真君_ZH": 79,
118
+ "绮良良_ZH": 80,
119
+ "七七_ZH": 81,
120
+ "式大将_ZH": 82,
121
+ "瑶瑶_ZH": 83,
122
+ "奥兹_ZH": 84,
123
+ "菲米尼_ZH": 85,
124
+ "米卡_ZH": 86,
125
+ "哲平_ZH": 87,
126
+ "大肉丸_ZH": 88,
127
+ "托克_ZH": 89,
128
+ "蒂玛乌斯_ZH": 90,
129
+ "昆钧_ZH": 91,
130
+ "欧菲妮_ZH": 92,
131
+ "塞琉斯_ZH": 93,
132
+ "仆人_ZH": 94,
133
+ "迈勒斯_ZH": 95,
134
+ "希格雯_ZH": 96,
135
+ "阿守_ZH": 97,
136
+ "拉赫曼_ZH": 98,
137
+ "杜拉夫_ZH": 99,
138
+ "伊利亚斯_ZH": 100,
139
+ "阿晃_ZH": 101,
140
+ "旁白_ZH": 102,
141
+ "爱德琳_ZH": 103,
142
+ "埃洛伊_ZH": 104,
143
+ "德沃沙克_ZH": 105,
144
+ "玛乔丽_ZH": 106,
145
+ "塞塔蕾_ZH": 107,
146
+ "柊千里_ZH": 108,
147
+ "海芭夏_ZH": 109,
148
+ "九条镰治_ZH": 110,
149
+ "阿娜耶_ZH": 111,
150
+ "笼钓瓶一心_ZH": 112,
151
+ "回声海螺_ZH": 113,
152
+ "劳维克_ZH": 114,
153
+ "元太_ZH": 115,
154
+ "阿扎尔_ZH": 116,
155
+ "查尔斯_ZH": 117,
156
+ "阿洛瓦_ZH": 118,
157
+ "埃勒曼_ZH": 119,
158
+ "纳比尔_ZH": 120,
159
+ "莎拉_ZH": 121,
160
+ "康纳_ZH": 122,
161
+ "博来_ZH": 123,
162
+ "玛塞勒_ZH": 124,
163
+ "阿祇_ZH": 125,
164
+ "博士_ZH": 126,
165
+ "玛格丽特_ZH": 127,
166
+ "迪尔菲_ZH": 128,
167
+ "宛烟_ZH": 129,
168
+ "羽生田千鹤_ZH": 130,
169
+ "海妮耶_ZH": 131,
170
+ "旅行者_ZH": 132,
171
+ "霍夫曼_ZH": 133,
172
+ "佐西摩斯_ZH": 134,
173
+ "鹿野奈奈_ZH": 135,
174
+ "舒伯特_ZH": 136,
175
+ "天叔_ZH": 137,
176
+ "艾莉丝_ZH": 138,
177
+ "龙二_ZH": 139,
178
+ "莺儿_ZH": 140,
179
+ "嘉良_ZH": 141,
180
+ "一心传名刀_ZH": 142,
181
+ "费迪南德_ZH": 143,
182
+ "珊瑚_ZH": 144,
183
+ "言笑_ZH": 145,
184
+ "久利须_ZH": 146,
185
+ "嘉玛_ZH": 147,
186
+ "艾文_ZH": 148,
187
+ "克洛琳德_ZH": 149,
188
+ "丹吉尔_ZH": 150,
189
+ "女士_ZH": 151,
190
+ "白老先生_ZH": 152,
191
+ "天目十五_ZH": 153,
192
+ "老孟_ZH": 154,
193
+ "巴达维_ZH": 155,
194
+ "长生_ZH": 156,
195
+ "吴船长_ZH": 157,
196
+ "拉齐_ZH": 158,
197
+ "艾伯特_ZH": 159,
198
+ "松浦_ZH": 160,
199
+ "埃泽_ZH": 161,
200
+ "阿圆_ZH": 162,
201
+ "莫塞伊思_ZH": 163,
202
+ "阿拉夫_ZH": 164,
203
+ "杜吉耶_ZH": 165,
204
+ "石头_ZH": 166,
205
+ "百闻_ZH": 167,
206
+ "波洛_ZH": 168,
207
+ "斯坦利_ZH": 169,
208
+ "博易_ZH": 170,
209
+ "迈蒙_ZH": 171,
210
+ "掇星攫辰天君_ZH": 172,
211
+ "毗伽尔_ZH": 173,
212
+ "芙卡洛斯_ZH": 174,
213
+ "恶龙_ZH": 175,
214
+ "恕筠_ZH": 176,
215
+ "知易_ZH": 177,
216
+ "克列门特_ZH": 178,
217
+ "大慈树王_ZH": 179,
218
+ "西拉杰_ZH": 180,
219
+ "上杉_ZH": 181,
220
+ "阿尔卡米_ZH": 182,
221
+ "纯水精灵_ZH": 183,
222
+ "常九爷_ZH": 184,
223
+ "沙扎曼_ZH": 185,
224
+ "田铁嘴_ZH": 186,
225
+ "克罗索_ZH": 187,
226
+ "阿巴图伊_ZH": 188,
227
+ "悦_ZH": 189,
228
+ "阿佩普_ZH": 190,
229
+ "埃尔欣根_ZH": 191,
230
+ "萨赫哈蒂_ZH": 192,
231
+ "塔杰·拉德卡尼_ZH": 193,
232
+ "安西_ZH": 194,
233
+ "埃舍尔_ZH": 195,
234
+ "萨齐因_ZH": 196,
235
+ "派蒙_JP": 197,
236
+ "纳西妲_JP": 198,
237
+ "凯亚_JP": 199,
238
+ "阿贝多_JP": 200,
239
+ "温迪_JP": 201,
240
+ "枫原万叶_JP": 202,
241
+ "钟离_JP": 203,
242
+ "荒泷一斗_JP": 204,
243
+ "八重神子_JP": 205,
244
+ "艾尔海森_JP": 206,
245
+ "提纳里_JP": 207,
246
+ "迪希雅_JP": 208,
247
+ "卡维_JP": 209,
248
+ "宵宫_JP": 210,
249
+ "那维莱特_JP": 211,
250
+ "莱依拉_JP": 212,
251
+ "赛诺_JP": 213,
252
+ "莫娜_JP": 214,
253
+ "诺艾尔_JP": 215,
254
+ "托马_JP": 216,
255
+ "凝光_JP": 217,
256
+ "林尼_JP": 218,
257
+ "北斗_JP": 219,
258
+ "柯莱_JP": 220,
259
+ "神里绫华_JP": 221,
260
+ "可莉_JP": 222,
261
+ "芭芭拉_JP": 223,
262
+ "雷电将军_JP": 224,
263
+ "娜维娅_JP": 225,
264
+ "芙宁娜_JP": 226,
265
+ "珊瑚宫心海_JP": 227,
266
+ "鹿野院平藏_JP": 228,
267
+ "迪奥娜_JP": 229,
268
+ "琴_JP": 230,
269
+ "五郎_JP": 231,
270
+ "班尼特_JP": 232,
271
+ "达达利亚_JP": 233,
272
+ "安柏_JP": 234,
273
+ "莱欧斯利_JP": 235,
274
+ "夜兰_JP": 236,
275
+ "妮露_JP": 237,
276
+ "辛焱_JP": 238,
277
+ "丽莎_JP": 239,
278
+ "珐露珊_JP": 240,
279
+ "魈_JP": 241,
280
+ "香菱_JP": 242,
281
+ "迪卢克_JP": 243,
282
+ "砂糖_JP": 244,
283
+ "烟绯_JP": 245,
284
+ "早柚_JP": 246,
285
+ "云堇_JP": 247,
286
+ "刻晴_JP": 248,
287
+ "重云_JP": 249,
288
+ "优菈_JP": 250,
289
+ "胡桃_JP": 251,
290
+ "流浪者_JP": 252,
291
+ "久岐忍_JP": 253,
292
+ "神里绫人_JP": 254,
293
+ "甘雨_JP": 255,
294
+ "戴因斯雷布_JP": 256,
295
+ "菲谢尔_JP": 257,
296
+ "白术_JP": 258,
297
+ "行秋_JP": 259,
298
+ "九条裟罗_JP": 260,
299
+ "夏洛蒂_JP": 261,
300
+ "雷泽_JP": 262,
301
+ "申鹤_JP": 263,
302
+ "空_JP": 264,
303
+ "荧_JP": 265,
304
+ "迪娜泽黛_JP": 266,
305
+ "凯瑟琳_JP": 267,
306
+ "多莉_JP": 268,
307
+ "坎蒂丝_JP": 269,
308
+ "琳妮特_JP": 270,
309
+ "萍姥姥_JP": 271,
310
+ "罗莎莉亚_JP": 272,
311
+ "埃德_JP": 273,
312
+ "爱贝尔_JP": 274,
313
+ "伊迪娅_JP": 275,
314
+ "留云借风真君_JP": 276,
315
+ "绮良良_JP": 277,
316
+ "七七_JP": 278,
317
+ "式大将_JP": 279,
318
+ "瑶瑶_JP": 280,
319
+ "奥兹_JP": 281,
320
+ "菲米尼_JP": 282,
321
+ "米卡_JP": 283,
322
+ "哲平_JP": 284,
323
+ "大肉丸_JP": 285,
324
+ "托克_JP": 286,
325
+ "蒂玛乌斯_JP": 287,
326
+ "昆钧_JP": 288,
327
+ "欧菲妮_JP": 289,
328
+ "塞琉斯_JP": 290,
329
+ "仆人_JP": 291,
330
+ "迈勒斯_JP": 292,
331
+ "希格雯_JP": 293,
332
+ "阿守_JP": 294,
333
+ "拉赫曼_JP": 295,
334
+ "杜拉夫_JP": 296,
335
+ "伊利亚斯_JP": 297,
336
+ "阿晃_JP": 298,
337
+ "旁白_JP": 299,
338
+ "爱德琳_JP": 300,
339
+ "埃洛伊_JP": 301,
340
+ "德沃沙克_JP": 302,
341
+ "玛乔丽_JP": 303,
342
+ "塞塔蕾_JP": 304,
343
+ "柊千里_JP": 305,
344
+ "海芭夏_JP": 306,
345
+ "九条镰治_JP": 307,
346
+ "阿娜耶_JP": 308,
347
+ "笼钓瓶一心_JP": 309,
348
+ "回声海螺_JP": 310,
349
+ "劳维克_JP": 311,
350
+ "元太_JP": 312,
351
+ "阿扎尔_JP": 313,
352
+ "查尔斯_JP": 314,
353
+ "阿洛瓦_JP": 315,
354
+ "埃勒曼_JP": 316,
355
+ "纳比尔_JP": 317,
356
+ "莎拉_JP": 318,
357
+ "康纳_JP": 319,
358
+ "博来_JP": 320,
359
+ "玛塞勒_JP": 321,
360
+ "阿祇_JP": 322,
361
+ "博士_JP": 323,
362
+ "迪尔菲_JP": 324,
363
+ "玛格丽特_JP": 325,
364
+ "宛烟_JP": 326,
365
+ "羽生田千鹤_JP": 327,
366
+ "海妮耶_JP": 328,
367
+ "霍夫曼_JP": 329,
368
+ "旅行者_JP": 330,
369
+ "佐西摩斯_JP": 331,
370
+ "舒伯特_JP": 332,
371
+ "鹿野奈奈_JP": 333,
372
+ "天叔_JP": 334,
373
+ "龙二_JP": 335,
374
+ "艾莉丝_JP": 336,
375
+ "莺儿_JP": 337,
376
+ "嘉良_JP": 338,
377
+ "珊瑚_JP": 339,
378
+ "言笑_JP": 340,
379
+ "一心传名刀_JP": 341,
380
+ "费迪南德_JP": 342,
381
+ "久利须_JP": 343,
382
+ "嘉玛_JP": 344,
383
+ "艾文_JP": 345,
384
+ "克洛琳德_JP": 346,
385
+ "丹吉尔_JP": 347,
386
+ "天目十五_JP": 348,
387
+ "女士_JP": 349,
388
+ "老孟_JP": 350,
389
+ "白老先生_JP": 351,
390
+ "舍利夫_JP": 352,
391
+ "巴达维_JP": 353,
392
+ "拉齐_JP": 354,
393
+ "长生_JP": 355,
394
+ "吴船长_JP": 356,
395
+ "艾伯特_JP": 357,
396
+ "松浦_JP": 358,
397
+ "埃泽_JP": 359,
398
+ "阿圆_JP": 360,
399
+ "阿拉夫_JP": 361,
400
+ "莫塞伊思_JP": 362,
401
+ "石头_JP": 363,
402
+ "百闻_JP": 364,
403
+ "杜吉耶_JP": 365,
404
+ "波洛_JP": 366,
405
+ "掇星攫辰天君_JP": 367,
406
+ "迈蒙_JP": 368,
407
+ "博易_JP": 369,
408
+ "诗筠_JP": 370,
409
+ "斯坦利_JP": 371,
410
+ "毗伽尔_JP": 372,
411
+ "芙卡洛斯_JP": 373,
412
+ "恶龙_JP": 374,
413
+ "小仓澪_JP": 375,
414
+ "恕筠_JP": 376,
415
+ "知易_JP": 377,
416
+ "克列门特_JP": 378,
417
+ "大慈树王_JP": 379,
418
+ "望雅_JP": 380,
419
+ "黑田_JP": 381,
420
+ "卡莉娜_JP": 382,
421
+ "马姆杜_JP": 383,
422
+ "科林斯_JP": 384,
423
+ "上杉_JP": 385,
424
+ "西拉杰_JP": 386,
425
+ "菲尔戈黛特_JP": 387,
426
+ "一平_JP": 388,
427
+ "纯水精灵_JP": 389,
428
+ "阿尔卡米_JP": 390,
429
+ "老戴_JP": 391,
430
+ "谢赫祖拜尔_JP": 392,
431
+ "沙扎曼_JP": 393,
432
+ "田铁嘴_JP": 394,
433
+ "小野寺_JP": 395,
434
+ "百识_JP": 396,
435
+ "克罗索_JP": 397,
436
+ "莱斯格_JP": 398,
437
+ "芷巧_JP": 399,
438
+ "加藤洋平_JP": 400,
439
+ "阿巴图伊_JP": 401,
440
+ "埃尔欣根_JP": 402,
441
+ "斯嘉莉_JP": 403,
442
+ "阿佩普_JP": 404,
443
+ "巫女_JP": 405,
444
+ "卡布斯_JP": 406,
445
+ "洛伦佐_JP": 407,
446
+ "萨赫哈蒂_JP": 408,
447
+ "娜德瓦_JP": 409,
448
+ "塞德娜_JP": 410,
449
+ "塔杰·拉德卡尼_JP": 411,
450
+ "绘星_JP": 412,
451
+ "泽田_JP": 413,
452
+ "安西_JP": 414,
453
+ "拉伊德_JP": 415,
454
+ "亚卡巴_JP": 416,
455
+ "有乐斋_JP": 417,
456
+ "莱昂_JP": 418,
457
+ "尤苏波夫_JP": 419,
458
+ "夏妮_JP": 420,
459
+ "埃舍尔_JP": 421,
460
+ "萨齐因_JP": 422,
461
+ "古山_JP": 423,
462
+ "自称渊上之物_JP": 424,
463
+ "丹羽_JP": 425,
464
+ "塞萨尔的日记_JP": 426,
465
+ "派蒙_EN": 427,
466
+ "纳西妲_EN": 428,
467
+ "凯亚_EN": 429,
468
+ "阿贝多_EN": 430,
469
+ "温迪_EN": 431,
470
+ "枫原万叶_EN": 432,
471
+ "钟离_EN": 433,
472
+ "荒泷一斗_EN": 434,
473
+ "八重神子_EN": 435,
474
+ "艾尔海森_EN": 436,
475
+ "提纳里_EN": 437,
476
+ "迪希雅_EN": 438,
477
+ "卡维_EN": 439,
478
+ "宵宫_EN": 440,
479
+ "莱依拉_EN": 441,
480
+ "那维莱特_EN": 442,
481
+ "赛诺_EN": 443,
482
+ "莫娜_EN": 444,
483
+ "诺艾尔_EN": 445,
484
+ "托马_EN": 446,
485
+ "凝光_EN": 447,
486
+ "林尼_EN": 448,
487
+ "北斗_EN": 449,
488
+ "柯莱_EN": 450,
489
+ "神里绫华_EN": 451,
490
+ "可莉_EN": 452,
491
+ "芭芭拉_EN": 453,
492
+ "雷电将军_EN": 454,
493
+ "娜维娅_EN": 455,
494
+ "芙宁娜_EN": 456,
495
+ "珊瑚宫心海_EN": 457,
496
+ "鹿野院平藏_EN": 458,
497
+ "迪奥娜_EN": 459,
498
+ "五郎_EN": 460,
499
+ "琴_EN": 461,
500
+ "班尼特_EN": 462,
501
+ "达达利亚_EN": 463,
502
+ "安柏_EN": 464,
503
+ "莱欧斯利_EN": 465,
504
+ "夜兰_EN": 466,
505
+ "妮露_EN": 467,
506
+ "辛焱_EN": 468,
507
+ "珐露珊_EN": 469,
508
+ "丽莎_EN": 470,
509
+ "魈_EN": 471,
510
+ "香菱_EN": 472,
511
+ "迪卢克_EN": 473,
512
+ "砂糖_EN": 474,
513
+ "���绯_EN": 475,
514
+ "早柚_EN": 476,
515
+ "云堇_EN": 477,
516
+ "刻晴_EN": 478,
517
+ "重云_EN": 479,
518
+ "优菈_EN": 480,
519
+ "胡桃_EN": 481,
520
+ "流浪者_EN": 482,
521
+ "久岐忍_EN": 483,
522
+ "神里绫人_EN": 484,
523
+ "甘雨_EN": 485,
524
+ "戴因斯雷布_EN": 486,
525
+ "菲谢尔_EN": 487,
526
+ "白术_EN": 488,
527
+ "行秋_EN": 489,
528
+ "九条裟罗_EN": 490,
529
+ "夏洛蒂_EN": 491,
530
+ "雷泽_EN": 492,
531
+ "申鹤_EN": 493,
532
+ "荧_EN": 494,
533
+ "空_EN": 495,
534
+ "迪娜泽黛_EN": 496,
535
+ "凯瑟琳_EN": 497,
536
+ "多莉_EN": 498,
537
+ "坎蒂丝_EN": 499,
538
+ "琳妮特_EN": 500,
539
+ "萍姥姥_EN": 501,
540
+ "罗莎莉亚_EN": 502,
541
+ "埃德_EN": 503,
542
+ "爱贝尔_EN": 504,
543
+ "伊迪娅_EN": 505,
544
+ "留云借风真君_EN": 506,
545
+ "绮良良_EN": 507,
546
+ "七七_EN": 508,
547
+ "式大将_EN": 509,
548
+ "瑶瑶_EN": 510,
549
+ "奥兹_EN": 511,
550
+ "菲米尼_EN": 512,
551
+ "米卡_EN": 513,
552
+ "哲平_EN": 514,
553
+ "大肉丸_EN": 515,
554
+ "托克_EN": 516,
555
+ "蒂玛乌斯_EN": 517,
556
+ "昆钧_EN": 518,
557
+ "欧菲妮_EN": 519,
558
+ "塞琉斯_EN": 520,
559
+ "仆人_EN": 521,
560
+ "迈勒斯_EN": 522,
561
+ "希格雯_EN": 523,
562
+ "阿守_EN": 524,
563
+ "拉赫曼_EN": 525,
564
+ "杜拉夫_EN": 526,
565
+ "伊利亚斯_EN": 527,
566
+ "阿晃_EN": 528,
567
+ "旁白_EN": 529,
568
+ "爱德琳_EN": 530,
569
+ "埃洛伊_EN": 531,
570
+ "德沃沙克_EN": 532,
571
+ "玛乔丽_EN": 533,
572
+ "塞塔蕾_EN": 534,
573
+ "柊千里_EN": 535,
574
+ "海芭夏_EN": 536,
575
+ "九条镰治_EN": 537,
576
+ "阿娜耶_EN": 538,
577
+ "笼钓瓶一心_EN": 539,
578
+ "回声海螺_EN": 540,
579
+ "劳维克_EN": 541,
580
+ "元太_EN": 542,
581
+ "阿扎尔_EN": 543,
582
+ "查尔斯_EN": 544,
583
+ "阿洛瓦_EN": 545,
584
+ "埃勒曼_EN": 546,
585
+ "纳比尔_EN": 547,
586
+ "莎拉_EN": 548,
587
+ "康纳_EN": 549,
588
+ "博来_EN": 550,
589
+ "玛塞勒_EN": 551,
590
+ "阿祇_EN": 552,
591
+ "博士_EN": 553,
592
+ "迪尔菲_EN": 554,
593
+ "宛烟_EN": 555,
594
+ "玛格丽特_EN": 556,
595
+ "羽生田千鹤_EN": 557,
596
+ "海妮耶_EN": 558,
597
+ "霍夫曼_EN": 559,
598
+ "旅行者_EN": 560,
599
+ "佐西摩斯_EN": 561,
600
+ "鹿野奈奈_EN": 562,
601
+ "舒伯特_EN": 563,
602
+ "天叔_EN": 564,
603
+ "艾莉丝_EN": 565,
604
+ "龙二_EN": 566,
605
+ "莺儿_EN": 567,
606
+ "嘉良_EN": 568,
607
+ "珊瑚_EN": 569,
608
+ "费迪南德_EN": 570,
609
+ "言笑_EN": 571,
610
+ "一心传名刀_EN": 572,
611
+ "久利须_EN": 573,
612
+ "嘉玛_EN": 574,
613
+ "艾文_EN": 575,
614
+ "克洛琳德_EN": 576,
615
+ "丹吉尔_EN": 577,
616
+ "女士_EN": 578,
617
+ "天目十五_EN": 579,
618
+ "老孟_EN": 580,
619
+ "白老先生_EN": 581,
620
+ "舍利夫_EN": 582,
621
+ "巴达维_EN": 583,
622
+ "拉齐_EN": 584,
623
+ "长生_EN": 585,
624
+ "吴船长_EN": 586,
625
+ "艾伯特_EN": 587,
626
+ "松浦_EN": 588,
627
+ "埃泽_EN": 589,
628
+ "阿圆_EN": 590,
629
+ "阿拉夫_EN": 591,
630
+ "莫塞伊思_EN": 592,
631
+ "石头_EN": 593,
632
+ "百闻_EN": 594,
633
+ "杜吉耶_EN": 595,
634
+ "波洛_EN": 596,
635
+ "斯坦利_EN": 597,
636
+ "掇星攫辰天君_EN": 598,
637
+ "迈蒙_EN": 599,
638
+ "博易_EN": 600,
639
+ "诗筠_EN": 601,
640
+ "毗伽尔_EN": 602,
641
+ "慧心_EN": 603,
642
+ "芙卡洛斯_EN": 604,
643
+ "恶龙_EN": 605,
644
+ "小仓澪_EN": 606,
645
+ "恕筠_EN": 607,
646
+ "知易_EN": 608,
647
+ "克列门特_EN": 609,
648
+ "大慈树王_EN": 610,
649
+ "维多利亚_EN": 611,
650
+ "黑田_EN": 612,
651
+ "马姆杜_EN": 613,
652
+ "科林斯_EN": 614,
653
+ "上杉_EN": 615,
654
+ "西拉杰_EN": 616,
655
+ "宁禄_EN": 617,
656
+ "纯水精灵_EN": 618,
657
+ "常九爷_EN": 619,
658
+ "阿尔卡米_EN": 620,
659
+ "沙扎曼_EN": 621,
660
+ "田铁嘴_EN": 622,
661
+ "加萨尼_EN": 623,
662
+ "克罗索_EN": 624,
663
+ "星稀_EN": 625,
664
+ "莱斯格_EN": 626,
665
+ "阿巴图伊_EN": 627,
666
+ "悦_EN": 628,
667
+ "德田_EN": 629,
668
+ "埃尔欣根_EN": 630,
669
+ "阿佩普_EN": 631,
670
+ "萨赫哈蒂_EN": 632,
671
+ "洛伦佐_EN": 633,
672
+ "塔杰·拉德卡尼_EN": 634,
673
+ "泽田_EN": 635,
674
+ "安西_EN": 636,
675
+ "理水叠山真君_EN": 637,
676
+ "埃舍尔_EN": 638,
677
+ "萨齐因_EN": 639,
678
+ "古田_EN": 640,
679
+ "陆景和": 641,
680
+ "莫弈": 642,
681
+ "左然": 643,
682
+ "夏彦": 644,
683
+ "三月七_ZH": 645,
684
+ "丹恒_ZH": 646,
685
+ "希儿_ZH": 647,
686
+ "娜塔莎_ZH": 648,
687
+ "希露瓦_ZH": 649,
688
+ "瓦尔特_ZH": 650,
689
+ "佩拉_ZH": 651,
690
+ "布洛妮娅_ZH": 652,
691
+ "虎克_ZH": 653,
692
+ "素裳_ZH": 654,
693
+ "克拉拉_ZH": 655,
694
+ "符玄_ZH": 656,
695
+ "白露_ZH": 657,
696
+ "杰帕德_ZH": 658,
697
+ "景元_ZH": 659,
698
+ "藿藿_ZH": 660,
699
+ "姬子_ZH": 661,
700
+ "穹_ZH": 662,
701
+ "星_ZH": 663,
702
+ "卡芙卡_ZH": 664,
703
+ "桂乃芬_ZH": 665,
704
+ "艾丝妲_ZH": 666,
705
+ "玲可_ZH": 667,
706
+ "彦卿_ZH": 668,
707
+ "托帕_ZH": 669,
708
+ "驭空_ZH": 670,
709
+ "浮烟_ZH": 671,
710
+ "停云_ZH": 672,
711
+ "镜流_ZH": 673,
712
+ "罗刹_ZH": 674,
713
+ "卢卡_ZH": 675,
714
+ "史瓦罗_ZH": 676,
715
+ "黑塔_ZH": 677,
716
+ "桑博_ZH": 678,
717
+ "伦纳德_ZH": 679,
718
+ "明曦_ZH": 680,
719
+ "银狼_ZH": 681,
720
+ "帕姆_ZH": 682,
721
+ "青雀_ZH": 683,
722
+ "乔瓦尼_ZH": 684,
723
+ "公输师傅_ZH": 685,
724
+ "晴霓_ZH": 686,
725
+ "螺丝咕姆_ZH": 687,
726
+ "阿兰_ZH": 688,
727
+ "奥列格_ZH": 689,
728
+ "丹枢_ZH": 690,
729
+ "尾巴_ZH": 691,
730
+ "寒鸦_ZH": 692,
731
+ "雪衣_ZH": 693,
732
+ "可可利亚_ZH": 694,
733
+ "青镞_ZH": 695,
734
+ "半夏_ZH": 696,
735
+ "银枝_ZH": 697,
736
+ "大毫_ZH": 698,
737
+ "霄翰_ZH": 699,
738
+ "信使_ZH": 700,
739
+ "费斯曼_ZH": 701,
740
+ "绿芙蓉_ZH": 702,
741
+ "dev_成男_ZH": 703,
742
+ "金人会长_ZH": 704,
743
+ "维利特_ZH": 705,
744
+ "维尔德_ZH": 706,
745
+ "斯科特_ZH": 707,
746
+ "卡波特_ZH": 708,
747
+ "刃_ZH": 709,
748
+ "岩明_ZH": 710,
749
+ "浣溪_ZH": 711,
750
+ "三月七_JP": 712,
751
+ "丹恒_JP": 713,
752
+ "希儿_JP": 714,
753
+ "娜塔莎_JP": 715,
754
+ "希露瓦_JP": 716,
755
+ "瓦尔特_JP": 717,
756
+ "佩拉_JP": 718,
757
+ "布洛妮娅_JP": 719,
758
+ "虎克_JP": 720,
759
+ "素裳_JP": 721,
760
+ "克拉拉_JP": 722,
761
+ "符玄_JP": 723,
762
+ "白露_JP": 724,
763
+ "杰帕德_JP": 725,
764
+ "景元_JP": 726,
765
+ "藿藿_JP": 727,
766
+ "姬子_JP": 728,
767
+ "卡芙卡_JP": 729,
768
+ "穹_JP": 730,
769
+ "星_JP": 731,
770
+ "桂乃芬_JP": 732,
771
+ "艾丝妲_JP": 733,
772
+ "彦卿_JP": 734,
773
+ "玲可_JP": 735,
774
+ "托帕_JP": 736,
775
+ "驭空_JP": 737,
776
+ "浮烟_JP": 738,
777
+ "停云_JP": 739,
778
+ "镜流_JP": 740,
779
+ "罗刹_JP": 741,
780
+ "卢卡_JP": 742,
781
+ "史瓦罗_JP": 743,
782
+ "黑塔_JP": 744,
783
+ "桑博_JP": 745,
784
+ "伦纳德_JP": 746,
785
+ "明曦_JP": 747,
786
+ "银狼_JP": 748,
787
+ "帕姆_JP": 749,
788
+ "青雀_JP": 750,
789
+ "乔瓦尼_JP": 751,
790
+ "公输师傅_JP": 752,
791
+ "晴霓_JP": 753,
792
+ "螺丝咕姆_JP": 754,
793
+ "阿兰_JP": 755,
794
+ "奥列格_JP": 756,
795
+ "丹枢_JP": 757,
796
+ "尾巴_JP": 758,
797
+ "寒鸦_JP": 759,
798
+ "雪衣_JP": 760,
799
+ "可可利亚_JP": 761,
800
+ "青镞_JP": 762,
801
+ "半夏_JP": 763,
802
+ "银枝_JP": 764,
803
+ "大毫_JP": 765,
804
+ "霄翰_JP": 766,
805
+ "信使_JP": 767,
806
+ "费斯曼_JP": 768,
807
+ "绿芙蓉_JP": 769,
808
+ "dev_成男_JP": 770,
809
+ "金人会长_JP": 771,
810
+ "维利特_JP": 772,
811
+ "维尔德_JP": 773,
812
+ "斯科特_JP": 774,
813
+ "刃_JP": 775,
814
+ "卡波特_JP": 776,
815
+ "岩明_JP": 777,
816
+ "浣溪_JP": 778,
817
+ "净砚_JP": 779,
818
+ "紫月季_JP": 780,
819
+ "歌蒂_JP": 781,
820
+ "奇怪的云骑_JP": 782,
821
+ "幻胧_JP": 783,
822
+ "斯薇塔_JP": 784,
823
+ "隐书_JP": 785,
824
+ "三月七_EN": 786,
825
+ "丹恒_EN": 787,
826
+ "希儿_EN": 788,
827
+ "娜塔莎_EN": 789,
828
+ "希露瓦_EN": 790,
829
+ "瓦尔特_EN": 791,
830
+ "佩拉_EN": 792,
831
+ "布洛妮娅_EN": 793,
832
+ "虎克_EN": 794,
833
+ "素裳_EN": 795,
834
+ "克拉拉_EN": 796,
835
+ "符玄_EN": 797,
836
+ "白露_EN": 798,
837
+ "杰帕德_EN": 799,
838
+ "景元_EN": 800,
839
+ "藿藿_EN": 801,
840
+ "姬子_EN": 802,
841
+ "卡芙卡_EN": 803,
842
+ "穹_EN": 804,
843
+ "星_EN": 805,
844
+ "桂乃芬_EN": 806,
845
+ "艾丝妲_EN": 807,
846
+ "彦卿_EN": 808,
847
+ "玲可_EN": 809,
848
+ "托帕_EN": 810,
849
+ "驭空_EN": 811,
850
+ "浮烟_EN": 812,
851
+ "停云_EN": 813,
852
+ "镜流_EN": 814,
853
+ "罗刹_EN": 815,
854
+ "卢卡_EN": 816,
855
+ "史瓦罗_EN": 817,
856
+ "黑塔_EN": 818,
857
+ "桑博_EN": 819,
858
+ "伦纳德_EN": 820,
859
+ "明曦_EN": 821,
860
+ "银狼_EN": 822,
861
+ "帕姆_EN": 823,
862
+ "青雀_EN": 824,
863
+ "乔瓦尼_EN": 825,
864
+ "公输师傅_EN": 826,
865
+ "晴霓_EN": 827,
866
+ "螺丝咕姆_EN": 828,
867
+ "阿兰_EN": 829,
868
+ "奥列格_EN": 830,
869
+ "丹枢_EN": 831,
870
+ "尾巴_EN": 832,
871
+ "寒鸦_EN": 833,
872
+ "雪衣_EN": 834,
873
+ "可可利亚_EN": 835,
874
+ "青镞_EN": 836,
875
+ "半夏_EN": 837,
876
+ "银枝_EN": 838,
877
+ "大毫_EN": 839,
878
+ "霄翰_EN": 840,
879
+ "信使_EN": 841,
880
+ "费斯曼_EN": 842,
881
+ "绿芙蓉_EN": 843,
882
+ "dev_成男_EN": 844,
883
+ "金人会长_EN": 845,
884
+ "维利特_EN": 846,
885
+ "维尔德_EN": 847,
886
+ "刃_EN": 848,
887
+ "卡波特_EN": 849,
888
+ "岩明_EN": 850,
889
+ "浣溪_EN": 851,
890
+ "紫月季_EN": 852,
891
+ "幻胧_EN": 853,
892
+ "女声_EN": 854
893
  }
894
  },
895
  "model": {
 
945
  "n_layers_q": 3,
946
  "use_spectral_norm": false,
947
  "gin_channels": 256
948
+ },
949
+ "version": "2.1"
950
+ }
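The rewritten configs/config.json replaces the BanG Dream speaker table with a much larger spk2id map (speaker name plus language suffix → integer id) and raises n_speakers to 896, adding a "version": "2.1" field. The sketch below shows how such a table is typically consumed at inference time, i.e. turning a speaker name into the integer id fed to the speaker embedding; the lookup key is just an example entry from the table above.

```python
# Hedged sketch: resolve a speaker name to its embedding id via spk2id.
import json

with open("configs/config.json", encoding="utf-8") as f:
    hps = json.load(f)

spk2id = hps["data"]["spk2id"]
assert len(spk2id) <= hps["data"]["n_speakers"]  # 896 embedding slots in this config

sid = spk2id["派蒙_ZH"]  # -> 0; passed to the model as the speaker id tensor
print(sid, hps["version"])
```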
data_utils.py CHANGED
@@ -3,11 +3,13 @@ import random
3
  import torch
4
  import torch.utils.data
5
  from tqdm import tqdm
6
- from loguru import logger
 
7
  import commons
8
  from mel_processing import spectrogram_torch, mel_spectrogram_torch
9
  from utils import load_wav_to_torch, load_filepaths_and_text
10
- from text import cleaned_text_to_sequence, get_bert
 
11
 
12
  """Multi speaker version"""
13
 
@@ -40,7 +42,7 @@ class TextAudioSpeakerLoader(torch.utils.data.Dataset):
40
 
41
  self.add_blank = hparams.add_blank
42
  self.min_text_len = getattr(hparams, "min_text_len", 1)
43
- self.max_text_len = getattr(hparams, "max_text_len", 300)
44
 
45
  random.seed(1234)
46
  random.shuffle(self.audiopaths_sid_text)
@@ -85,13 +87,14 @@ class TextAudioSpeakerLoader(torch.utils.data.Dataset):
85
  # separate filename, speaker_id and text
86
  audiopath, sid, language, text, phones, tone, word2ph = audiopath_sid_text
87
 
88
- bert, ja_bert, phones, tone, language = self.get_text(
89
  text, word2ph, phones, tone, language, audiopath
90
  )
91
 
92
  spec, wav = self.get_audio(audiopath)
93
  sid = torch.LongTensor([int(self.spk_map[sid])])
94
- return (phones, spec, wav, sid, tone, language, bert, ja_bert)
 
95
 
96
  def get_audio(self, filename):
97
  audio, sampling_rate = load_wav_to_torch(filename)
@@ -131,7 +134,8 @@ class TextAudioSpeakerLoader(torch.utils.data.Dataset):
131
  center=False,
132
  )
133
  spec = torch.squeeze(spec, 0)
134
- torch.save(spec, spec_filename)
 
135
  return spec, audio_norm
136
 
137
  def get_text(self, text, word2ph, phone, tone, language_str, wav_path):
@@ -145,40 +149,28 @@ class TextAudioSpeakerLoader(torch.utils.data.Dataset):
145
  word2ph[0] += 1
146
  bert_path = wav_path.replace(".wav", ".bert.pt")
147
  try:
148
- bert = torch.load(bert_path)
149
- assert bert.shape[-1] == len(phone)
150
- except:
151
- bert = get_bert(text, word2ph, language_str)
152
- torch.save(bert, bert_path)
153
- assert bert.shape[-1] == len(phone), phone
154
 
155
  if language_str == "ZH":
156
- bert = bert
157
- ja_bert = torch.zeros(768, len(phone))
158
- elif language_str == "JA":
159
- ja_bert = bert
160
  bert = torch.zeros(1024, len(phone))
161
- else:
 
 
162
  bert = torch.zeros(1024, len(phone))
163
- ja_bert = torch.zeros(768, len(phone))
164
- assert bert.shape[-1] == len(phone), (
165
- bert.shape,
166
- len(phone),
167
- sum(word2ph),
168
- p1,
169
- p2,
170
- t1,
171
- t2,
172
- pold,
173
- pold2,
174
- word2ph,
175
- text,
176
- w2pho,
177
- )
178
  phone = torch.LongTensor(phone)
179
  tone = torch.LongTensor(tone)
180
  language = torch.LongTensor(language)
181
- return bert, ja_bert, phone, tone, language
182
 
183
  def get_sid(self, sid):
184
  sid = torch.LongTensor([int(sid)])
@@ -221,7 +213,9 @@ class TextAudioSpeakerCollate:
221
  tone_padded = torch.LongTensor(len(batch), max_text_len)
222
  language_padded = torch.LongTensor(len(batch), max_text_len)
223
  bert_padded = torch.FloatTensor(len(batch), 1024, max_text_len)
224
- ja_bert_padded = torch.FloatTensor(len(batch), 768, max_text_len)
 
 
225
 
226
  spec_padded = torch.FloatTensor(len(batch), batch[0][1].size(0), max_spec_len)
227
  wav_padded = torch.FloatTensor(len(batch), 1, max_wav_len)
@@ -232,6 +226,9 @@ class TextAudioSpeakerCollate:
232
  wav_padded.zero_()
233
  bert_padded.zero_()
234
  ja_bert_padded.zero_()
 
 
 
235
  for i in range(len(ids_sorted_decreasing)):
236
  row = batch[ids_sorted_decreasing[i]]
237
 
@@ -261,6 +258,11 @@ class TextAudioSpeakerCollate:
261
  ja_bert = row[7]
262
  ja_bert_padded[i, :, : ja_bert.size(1)] = ja_bert
263
 
 
 
 
 
 
264
  return (
265
  text_padded,
266
  text_lengths,
@@ -273,6 +275,8 @@ class TextAudioSpeakerCollate:
273
  language_padded,
274
  bert_padded,
275
  ja_bert_padded,
 
 
276
  )
277
 
278
 
 
3
  import torch
4
  import torch.utils.data
5
  from tqdm import tqdm
6
+ import numpy as np
7
+ from tools.log import logger
8
  import commons
9
  from mel_processing import spectrogram_torch, mel_spectrogram_torch
10
  from utils import load_wav_to_torch, load_filepaths_and_text
11
+ from text import cleaned_text_to_sequence
12
+ from config import config
13
 
14
  """Multi speaker version"""
15
 
 
42
 
43
  self.add_blank = hparams.add_blank
44
  self.min_text_len = getattr(hparams, "min_text_len", 1)
45
+ self.max_text_len = getattr(hparams, "max_text_len", 384)
46
 
47
  random.seed(1234)
48
  random.shuffle(self.audiopaths_sid_text)
 
87
  # separate filename, speaker_id and text
88
  audiopath, sid, language, text, phones, tone, word2ph = audiopath_sid_text
89
 
90
+ bert, ja_bert, en_bert, phones, tone, language = self.get_text(
91
  text, word2ph, phones, tone, language, audiopath
92
  )
93
 
94
  spec, wav = self.get_audio(audiopath)
95
  sid = torch.LongTensor([int(self.spk_map[sid])])
96
+ emo = torch.FloatTensor(np.load(audiopath.replace(".wav", ".emo.npy")))
97
+ return (phones, spec, wav, sid, tone, language, bert, ja_bert, en_bert, emo)
98
 
99
  def get_audio(self, filename):
100
  audio, sampling_rate = load_wav_to_torch(filename)
 
134
  center=False,
135
  )
136
  spec = torch.squeeze(spec, 0)
137
+ if config.train_ms_config.spec_cache:
138
+ torch.save(spec, spec_filename)
139
  return spec, audio_norm
140
 
141
  def get_text(self, text, word2ph, phone, tone, language_str, wav_path):
 
149
  word2ph[0] += 1
150
  bert_path = wav_path.replace(".wav", ".bert.pt")
151
  try:
152
+ bert_ori = torch.load(bert_path)
153
+ assert bert_ori.shape[-1] == len(phone)
154
+ except Exception as e:
155
+ logger.warning("Bert load Failed")
156
+ logger.warning(e)
 
157
 
158
  if language_str == "ZH":
159
+ bert = bert_ori
160
+ ja_bert = torch.zeros(1024, len(phone))
161
+ en_bert = torch.zeros(1024, len(phone))
162
+ elif language_str == "JP":
163
  bert = torch.zeros(1024, len(phone))
164
+ ja_bert = bert_ori
165
+ en_bert = torch.zeros(1024, len(phone))
166
+ elif language_str == "EN":
167
  bert = torch.zeros(1024, len(phone))
168
+ ja_bert = torch.zeros(1024, len(phone))
169
+ en_bert = bert_ori
 
 
 
 
 
 
 
 
 
 
 
 
 
170
  phone = torch.LongTensor(phone)
171
  tone = torch.LongTensor(tone)
172
  language = torch.LongTensor(language)
173
+ return bert, ja_bert, en_bert, phone, tone, language
174
 
175
  def get_sid(self, sid):
176
  sid = torch.LongTensor([int(sid)])
 
213
  tone_padded = torch.LongTensor(len(batch), max_text_len)
214
  language_padded = torch.LongTensor(len(batch), max_text_len)
215
  bert_padded = torch.FloatTensor(len(batch), 1024, max_text_len)
216
+ ja_bert_padded = torch.FloatTensor(len(batch), 1024, max_text_len)
217
+ en_bert_padded = torch.FloatTensor(len(batch), 1024, max_text_len)
218
+ emo = torch.FloatTensor(len(batch), 1024)
219
 
220
  spec_padded = torch.FloatTensor(len(batch), batch[0][1].size(0), max_spec_len)
221
  wav_padded = torch.FloatTensor(len(batch), 1, max_wav_len)
 
226
  wav_padded.zero_()
227
  bert_padded.zero_()
228
  ja_bert_padded.zero_()
229
+ en_bert_padded.zero_()
230
+ emo.zero_()
231
+
232
  for i in range(len(ids_sorted_decreasing)):
233
  row = batch[ids_sorted_decreasing[i]]
234
 
 
258
  ja_bert = row[7]
259
  ja_bert_padded[i, :, : ja_bert.size(1)] = ja_bert
260
 
261
+ en_bert = row[8]
262
+ en_bert_padded[i, :, : en_bert.size(1)] = en_bert
263
+
264
+ emo[i, :] = row[9]
265
+
266
  return (
267
  text_padded,
268
  text_lengths,
 
275
  language_padded,
276
  bert_padded,
277
  ja_bert_padded,
278
+ en_bert_padded,
279
+ emo,
280
  )
281
 
282
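In short, the `data_utils.py` changes make each dataset item carry `en_bert` and a per-utterance emotion embedding, so every training wav must have a sibling `.emo.npy` file (written by `emo_gen.py`, added below) that the loader reads with `np.load`. A minimal sketch of that on-disk contract, using a hypothetical wav path:

```python
# Sketch of the on-disk contract introduced above: each wav needs a sibling
# .emo.npy holding the pooled wav2vec2 emotion embedding (1024 features).
import numpy as np
import torch

audiopath = "Data/BanGDream/audios/wavs/example.wav"  # hypothetical path
emo = torch.FloatTensor(np.load(audiopath.replace(".wav", ".emo.npy")))
print(emo.shape)  # 1024 features, matching the collate buffer FloatTensor(len(batch), 1024)
```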
 
default_config.yml ADDED
@@ -0,0 +1,174 @@
1
+ # 全局配置
2
+ # 对于希望在同一时间使用多个配置文件的情况,例如两个GPU同时跑两个训练集:通过环境变量指定配置文件,不指定则默认为./config.yml
3
+
4
+ # 拟提供通用路径配置,统一存放数据,避免数据放得很乱
5
+ # 每个数据集与其对应的模型存放至统一路径下,后续所有的路径配置均为相对于datasetPath的路径
6
+ # 不填或者填空则路径为相对于项目根目录的路径
7
+ dataset_path: "Data/"
8
+
9
+ # 模型镜像源,默认huggingface,使用openi镜像源需指定openi_token
10
+ mirror: ""
11
+ openi_token: "" # openi token
12
+
13
+ # resample 音频重采样配置
14
+ # 注意, “:” 后需要加空格
15
+ resample:
16
+ # 目标重采样率
17
+ sampling_rate: 44100
18
+ # 音频文件输入路径,重采样会将该路径下所有.wav音频文件重采样
19
+ # 请填入相对于datasetPath的相对路径
20
+ in_dir: "audios/raw" # 相对于根目录的路径为 /datasetPath/in_dir
21
+ # 音频文件重采样后输出路径
22
+ out_dir: "audios/wavs"
23
+
24
+
25
+ # preprocess_text 数据集预处理相关配置
26
+ # 注意, “:” 后需要加空格
27
+ preprocess_text:
28
+ # 原始文本文件路径,文本格式应为{wav_path}|{speaker_name}|{language}|{text}。
29
+ transcription_path: "filelists/你的数据集文本.list"
30
+ # 数据清洗后文本路径,可以不填。不填则将在原始文本目录生成
31
+ cleaned_path: ""
32
+ # 训练集路径
33
+ train_path: "filelists/train.list"
34
+ # 验证集路径
35
+ val_path: "filelists/val.list"
36
+ # 配置文件路径
37
+ config_path: "config.json"
38
+ # 每个speaker的验证集条数
39
+ val_per_spk: 4
40
+ # 验证集最大条数,多于的会被截断并放到训练集中
41
+ max_val_total: 8
42
+ # 是否进行数据清洗
43
+ clean: true
44
+
45
+
46
+ # bert_gen 相关配置
47
+ # 注意, “:” 后需要加空格
48
+ bert_gen:
49
+ # 训练数据集配置文件路径
50
+ config_path: "config.json"
51
+ # 并行数
52
+ num_processes: 2
53
+ # 使用设备:可选项 "cuda" 显卡推理,"cpu" cpu推理
54
+ # 该选项同时决定了get_bert_feature的默认设备
55
+ device: "cuda"
56
+ # 使用多卡推理
57
+ use_multi_device: false
58
+
59
+ # emo_gen 相关配置
60
+ # 注意, “:” 后需要加空格
61
+ emo_gen:
62
+ # 训练数据集配置文件路径
63
+ config_path: "config.json"
64
+ # 并行数
65
+ num_processes: 2
66
+ # 使用设备:可选项 "cuda" 显卡推理,"cpu" cpu推理
67
+ device: "cuda"
68
+
69
+ # train 训练配置
70
+ # 注意, “:” 后需要加空格
71
+ train_ms:
72
+ env:
73
+ MASTER_ADDR: "localhost"
74
+ MASTER_PORT: 10086
75
+ WORLD_SIZE: 1
76
+ LOCAL_RANK: 0
77
+ RANK: 0
78
+ # 可以填写任意名的环境变量
79
+ # THE_ENV_VAR_YOU_NEED_TO_USE: "1234567"
80
+ # 底模设置
81
+ base:
82
+ use_base_model: false
83
+ repo_id: "Stardust_minus/Bert-VITS2"
84
+ model_image: "Bert-VITS2_2.1-Emo底模" # openi网页的模型名
85
+ # 训练模型存储目录:与旧版本的区别,原先数据集是存放在logs/model_name下的,现在改为统一存放在Data/你的数据集/models下
86
+ model: "models"
87
+ # 配置文件路径
88
+ config_path: "configs/config.json"
89
+ # 训练使用的worker,不建议超过CPU核心数
90
+ num_workers: 16
91
+ # 关闭此项可以节约接近50%的磁盘空间,但是可能导致实际训练速度变慢和更高的CPU使用率。
92
+ spec_cache: True
93
+ # 保存的检查点数量,多于此数目的权重会被删除来节省空间。
94
+ keep_ckpts: 8
95
+
96
+
97
+ # webui webui配置
98
+ # 注意, “:” 后需要加空格
99
+ webui:
100
+ # 推理设备
101
+ device: "cuda"
102
+ # 模型路径
103
+ model: "genshin/models/G_8000.pth"
104
+ # 配置文件路径
105
+ config_path: "configs/config.json"
106
+ # 端口号
107
+ port: 7860
108
+ # 是否公开部署,对外网开放
109
+ share: false
110
+ # 是否开启debug模式
111
+ debug: false
112
+ # 语种识别库,可选langid, fastlid
113
+ language_identification_library: "langid"
114
+
115
+
116
+ # server api配置
117
+ # 注意, “:” 后需要加空格
118
+ # 注意,本配置下的所有配置均为相对于根目录的路径
119
+ server:
120
+ # 端口号
121
+ port: 5000
122
+ # 模型默认使用设备:但是当前并没有实现这个配置。
123
+ device: "cuda"
124
+ # 需要加载的所有模型的配置
125
+ # 注意,所有模型都必须正确配置model与config的路径,空路径会导致加载错误。
126
+ models:
127
+ - # 模型的路径
128
+ model: ""
129
+ # 模型config.json的路径
130
+ config: ""
131
+ # 模型使用设备,若填写则会覆盖默认配置
132
+ device: "cuda"
133
+ # 模型默认使用的语言
134
+ language: "ZH"
135
+ # 模型人物默认参数
136
+ # 不必填写所有人物,不填的使用默认值
137
+ # 暂时不用填写,当前尚未实现按人区分配置
138
+ speakers:
139
+ - speaker: "科比"
140
+ sdp_ratio: 0.2
141
+ noise_scale: 0.6
142
+ noise_scale_w: 0.8
143
+ length_scale: 1
144
+ - speaker: "五条悟"
145
+ sdp_ratio: 0.3
146
+ noise_scale: 0.7
147
+ noise_scale_w: 0.8
148
+ length_scale: 0.5
149
+ - speaker: "安倍晋三"
150
+ sdp_ratio: 0.2
151
+ noise_scale: 0.6
152
+ noise_scale_w: 0.8
153
+ length_scale: 1.2
154
+ - # 模型的路径
155
+ model: ""
156
+ # 模型config.json的路径
157
+ config: ""
158
+ # 模型使用设备,若填写则会覆盖默认配置
159
+ device: "cpu"
160
+ # 模型默认使用的语言
161
+ language: "JP"
162
+ # 模型人物默认参数
163
+ # 不必填写所有人物,不填的使用默认值
164
+ speakers: [ ] # 也可以不填
165
+
166
+
167
+ # 百度翻译开放平台 api配置
168
+ # api接入文档 https://api.fanyi.baidu.com/doc/21
169
+ # 请不要在github等网站公开分享你的app id 与 key
170
+ translate:
171
+ # 你的APPID
172
+ "app_key": ""
173
+ # 你的密钥
174
+ "secret_key": ""
emo_gen.py ADDED
@@ -0,0 +1,174 @@
1
+ import argparse
2
+ import os
3
+ from pathlib import Path
4
+
5
+ import librosa
6
+ import numpy as np
7
+ import torch
8
+ import torch.nn as nn
9
+ from torch.utils.data import Dataset
10
+ from torch.utils.data import DataLoader, Dataset
11
+ from tqdm import tqdm
12
+ from transformers import Wav2Vec2Processor
13
+ from transformers.models.wav2vec2.modeling_wav2vec2 import (
14
+ Wav2Vec2Model,
15
+ Wav2Vec2PreTrainedModel,
16
+ )
17
+ import sys
18
+ import utils
19
+ from config import config
20
+
21
+
22
+ class RegressionHead(nn.Module):
23
+ r"""Classification head."""
24
+
25
+ def __init__(self, config):
26
+ super().__init__()
27
+
28
+ self.dense = nn.Linear(config.hidden_size, config.hidden_size)
29
+ self.dropout = nn.Dropout(config.final_dropout)
30
+ self.out_proj = nn.Linear(config.hidden_size, config.num_labels)
31
+
32
+ def forward(self, features, **kwargs):
33
+ x = features
34
+ x = self.dropout(x)
35
+ x = self.dense(x)
36
+ x = torch.tanh(x)
37
+ x = self.dropout(x)
38
+ x = self.out_proj(x)
39
+
40
+ return x
41
+
42
+
43
+ class EmotionModel(Wav2Vec2PreTrainedModel):
44
+ r"""Speech emotion classifier."""
45
+
46
+ def __init__(self, config):
47
+ super().__init__(config)
48
+
49
+ self.config = config
50
+ self.wav2vec2 = Wav2Vec2Model(config)
51
+ self.classifier = RegressionHead(config)
52
+ self.init_weights()
53
+
54
+ def forward(
55
+ self,
56
+ input_values,
57
+ ):
58
+ outputs = self.wav2vec2(input_values)
59
+ hidden_states = outputs[0]
60
+ hidden_states = torch.mean(hidden_states, dim=1)
61
+ logits = self.classifier(hidden_states)
62
+
63
+ return hidden_states, logits
64
+
65
+
66
+ class AudioDataset(Dataset):
67
+ def __init__(self, list_of_wav_files, sr, processor):
68
+ self.list_of_wav_files = list_of_wav_files
69
+ self.processor = processor
70
+ self.sr = sr
71
+
72
+ def __len__(self):
73
+ return len(self.list_of_wav_files)
74
+
75
+ def __getitem__(self, idx):
76
+ wav_file = self.list_of_wav_files[idx]
77
+ audio_data, _ = librosa.load(wav_file, sr=self.sr)
78
+ processed_data = self.processor(audio_data, sampling_rate=self.sr)[
79
+ "input_values"
80
+ ][0]
81
+ return torch.from_numpy(processed_data)
82
+
83
+
84
+ def process_func(
85
+ x: np.ndarray,
86
+ sampling_rate: int,
87
+ model: EmotionModel,
88
+ processor: Wav2Vec2Processor,
89
+ device: str,
90
+ embeddings: bool = False,
91
+ ) -> np.ndarray:
92
+ device = (
93
+ "cuda:0"
94
+ if torch.cuda.is_available()
95
+ else (
96
+ "mps"
97
+ if sys.platform == "darwin" and torch.backends.mps.is_available()
98
+ else "cpu"
99
+ )
100
+ )
101
+ r"""Predict emotions or extract embeddings from raw audio signal."""
102
+ model = model.to(device)
103
+ y = processor(x, sampling_rate=sampling_rate)
104
+ y = y["input_values"][0]
105
+ y = torch.from_numpy(y).unsqueeze(0).to(device)
106
+
107
+ # run through model
108
+ with torch.no_grad():
109
+ y = model(y)[0 if embeddings else 1]
110
+
111
+ # convert to numpy
112
+ y = y.detach().cpu().numpy()
113
+
114
+ return y
115
+
116
+
117
+
118
+ def get_emo(path):
119
+ wav, sr = librosa.load(path, sr=16000)
120
+ device = config.bert_gen_config.device
121
+ print("successfully generate the emo vec")
122
+ return process_func(
123
+ np.expand_dims(wav, 0).astype(np.float32),
124
+ sr,
125
+ model,
126
+ processor,
127
+ device,
128
+ embeddings=True,
129
+ ).squeeze(0)
130
+
131
+
132
+ if __name__ == "__main__":
133
+ parser = argparse.ArgumentParser()
134
+ parser.add_argument(
135
+ "-c", "--config", type=str, default=config.bert_gen_config.config_path
136
+ )
137
+ parser.add_argument(
138
+ "--num_processes", type=int, default=config.bert_gen_config.num_processes
139
+ )
140
+ args, _ = parser.parse_known_args()
141
+ config_path = args.config
142
+ hps = utils.get_hparams_from_file(config_path)
143
+
144
+ device = config.bert_gen_config.device
145
+
146
+ model_name = "./emotional/wav2vec2-large-robust-12-ft-emotion-msp-dim"
147
+ REPO_ID = "audeering/wav2vec2-large-robust-12-ft-emotion-msp-dim"
148
+ if not Path(model_name).joinpath("pytorch_model.bin").exists():
149
+ utils.download_emo_models(config.mirror, REPO_ID, model_name)
150
+
151
+ processor = Wav2Vec2Processor.from_pretrained(model_name)
152
+ model = EmotionModel.from_pretrained(model_name).to(device)
153
+
154
+ lines = []
155
+ with open(hps.data.training_files, encoding="utf-8") as f:
156
+ lines.extend(f.readlines())
157
+
158
+ with open(hps.data.validation_files, encoding="utf-8") as f:
159
+ lines.extend(f.readlines())
160
+
161
+ wavnames = [line.split("|")[0] for line in lines]
162
+ dataset = AudioDataset(wavnames, 16000, processor)
163
+ data_loader = DataLoader(dataset, batch_size=1, shuffle=False, num_workers=16)
164
+
165
+ with torch.no_grad():
166
+ for i, data in tqdm(enumerate(data_loader), total=len(data_loader)):
167
+ wavname = wavnames[i]
168
+ emo_path = wavname.replace(".wav", ".emo.npy")
169
+ if os.path.exists(emo_path):
170
+ continue
171
+ emb = model(data.to(device))[0].detach().cpu().numpy()
172
+ np.save(emo_path, emb)
173
+
174
+ print("Emo vec 生成完毕!")
emotional/wav2vec2-large-robust-12-ft-emotion-msp-dim/.gitattributes ADDED
@@ -0,0 +1,28 @@
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bin.* filter=lfs diff=lfs merge=lfs -text
5
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.model filter=lfs diff=lfs merge=lfs -text
12
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
13
+ *.onnx filter=lfs diff=lfs merge=lfs -text
14
+ *.ot filter=lfs diff=lfs merge=lfs -text
15
+ *.parquet filter=lfs diff=lfs merge=lfs -text
16
+ *.pb filter=lfs diff=lfs merge=lfs -text
17
+ *.pt filter=lfs diff=lfs merge=lfs -text
18
+ *.pth filter=lfs diff=lfs merge=lfs -text
19
+ *.rar filter=lfs diff=lfs merge=lfs -text
20
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
21
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
22
+ *.tflite filter=lfs diff=lfs merge=lfs -text
23
+ *.tgz filter=lfs diff=lfs merge=lfs -text
24
+ *.wasm filter=lfs diff=lfs merge=lfs -text
25
+ *.xz filter=lfs diff=lfs merge=lfs -text
26
+ *.zip filter=lfs diff=lfs merge=lfs -text
27
+ *.zstandard filter=lfs diff=lfs merge=lfs -text
28
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
emotional/wav2vec2-large-robust-12-ft-emotion-msp-dim/LICENSE ADDED
@@ -0,0 +1,437 @@
1
+ Attribution-NonCommercial-ShareAlike 4.0 International
2
+
3
+ =======================================================================
4
+
5
+ Creative Commons Corporation ("Creative Commons") is not a law firm and
6
+ does not provide legal services or legal advice. Distribution of
7
+ Creative Commons public licenses does not create a lawyer-client or
8
+ other relationship. Creative Commons makes its licenses and related
9
+ information available on an "as-is" basis. Creative Commons gives no
10
+ warranties regarding its licenses, any material licensed under their
11
+ terms and conditions, or any related information. Creative Commons
12
+ disclaims all liability for damages resulting from their use to the
13
+ fullest extent possible.
14
+
15
+ Using Creative Commons Public Licenses
16
+
17
+ Creative Commons public licenses provide a standard set of terms and
18
+ conditions that creators and other rights holders may use to share
19
+ original works of authorship and other material subject to copyright
20
+ and certain other rights specified in the public license below. The
21
+ following considerations are for informational purposes only, are not
22
+ exhaustive, and do not form part of our licenses.
23
+
24
+ Considerations for licensors: Our public licenses are
25
+ intended for use by those authorized to give the public
26
+ permission to use material in ways otherwise restricted by
27
+ copyright and certain other rights. Our licenses are
28
+ irrevocable. Licensors should read and understand the terms
29
+ and conditions of the license they choose before applying it.
30
+ Licensors should also secure all rights necessary before
31
+ applying our licenses so that the public can reuse the
32
+ material as expected. Licensors should clearly mark any
33
+ material not subject to the license. This includes other CC-
34
+ licensed material, or material used under an exception or
35
+ limitation to copyright. More considerations for licensors:
36
+ wiki.creativecommons.org/Considerations_for_licensors
37
+
38
+ Considerations for the public: By using one of our public
39
+ licenses, a licensor grants the public permission to use the
40
+ licensed material under specified terms and conditions. If
41
+ the licensor's permission is not necessary for any reason--for
42
+ example, because of any applicable exception or limitation to
43
+ copyright--then that use is not regulated by the license. Our
44
+ licenses grant only permissions under copyright and certain
45
+ other rights that a licensor has authority to grant. Use of
46
+ the licensed material may still be restricted for other
47
+ reasons, including because others have copyright or other
48
+ rights in the material. A licensor may make special requests,
49
+ such as asking that all changes be marked or described.
50
+ Although not required by our licenses, you are encouraged to
51
+ respect those requests where reasonable. More considerations
52
+ for the public:
53
+ wiki.creativecommons.org/Considerations_for_licensees
54
+
55
+ =======================================================================
56
+
57
+ Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International
58
+ Public License
59
+
60
+ By exercising the Licensed Rights (defined below), You accept and agree
61
+ to be bound by the terms and conditions of this Creative Commons
62
+ Attribution-NonCommercial-ShareAlike 4.0 International Public License
63
+ ("Public License"). To the extent this Public License may be
64
+ interpreted as a contract, You are granted the Licensed Rights in
65
+ consideration of Your acceptance of these terms and conditions, and the
66
+ Licensor grants You such rights in consideration of benefits the
67
+ Licensor receives from making the Licensed Material available under
68
+ these terms and conditions.
69
+
70
+
71
+ Section 1 -- Definitions.
72
+
73
+ a. Adapted Material means material subject to Copyright and Similar
74
+ Rights that is derived from or based upon the Licensed Material
75
+ and in which the Licensed Material is translated, altered,
76
+ arranged, transformed, or otherwise modified in a manner requiring
77
+ permission under the Copyright and Similar Rights held by the
78
+ Licensor. For purposes of this Public License, where the Licensed
79
+ Material is a musical work, performance, or sound recording,
80
+ Adapted Material is always produced where the Licensed Material is
81
+ synched in timed relation with a moving image.
82
+
83
+ b. Adapter's License means the license You apply to Your Copyright
84
+ and Similar Rights in Your contributions to Adapted Material in
85
+ accordance with the terms and conditions of this Public License.
86
+
87
+ c. BY-NC-SA Compatible License means a license listed at
88
+ creativecommons.org/compatiblelicenses, approved by Creative
89
+ Commons as essentially the equivalent of this Public License.
90
+
91
+ d. Copyright and Similar Rights means copyright and/or similar rights
92
+ closely related to copyright including, without limitation,
93
+ performance, broadcast, sound recording, and Sui Generis Database
94
+ Rights, without regard to how the rights are labeled or
95
+ categorized. For purposes of this Public License, the rights
96
+ specified in Section 2(b)(1)-(2) are not Copyright and Similar
97
+ Rights.
98
+
99
+ e. Effective Technological Measures means those measures that, in the
100
+ absence of proper authority, may not be circumvented under laws
101
+ fulfilling obligations under Article 11 of the WIPO Copyright
102
+ Treaty adopted on December 20, 1996, and/or similar international
103
+ agreements.
104
+
105
+ f. Exceptions and Limitations means fair use, fair dealing, and/or
106
+ any other exception or limitation to Copyright and Similar Rights
107
+ that applies to Your use of the Licensed Material.
108
+
109
+ g. License Elements means the license attributes listed in the name
110
+ of a Creative Commons Public License. The License Elements of this
111
+ Public License are Attribution, NonCommercial, and ShareAlike.
112
+
113
+ h. Licensed Material means the artistic or literary work, database,
114
+ or other material to which the Licensor applied this Public
115
+ License.
116
+
117
+ i. Licensed Rights means the rights granted to You subject to the
118
+ terms and conditions of this Public License, which are limited to
119
+ all Copyright and Similar Rights that apply to Your use of the
120
+ Licensed Material and that the Licensor has authority to license.
121
+
122
+ j. Licensor means the individual(s) or entity(ies) granting rights
123
+ under this Public License.
124
+
125
+ k. NonCommercial means not primarily intended for or directed towards
126
+ commercial advantage or monetary compensation. For purposes of
127
+ this Public License, the exchange of the Licensed Material for
128
+ other material subject to Copyright and Similar Rights by digital
129
+ file-sharing or similar means is NonCommercial provided there is
130
+ no payment of monetary compensation in connection with the
131
+ exchange.
132
+
133
+ l. Share means to provide material to the public by any means or
134
+ process that requires permission under the Licensed Rights, such
135
+ as reproduction, public display, public performance, distribution,
136
+ dissemination, communication, or importation, and to make material
137
+ available to the public including in ways that members of the
138
+ public may access the material from a place and at a time
139
+ individually chosen by them.
140
+
141
+ m. Sui Generis Database Rights means rights other than copyright
142
+ resulting from Directive 96/9/EC of the European Parliament and of
143
+ the Council of 11 March 1996 on the legal protection of databases,
144
+ as amended and/or succeeded, as well as other essentially
145
+ equivalent rights anywhere in the world.
146
+
147
+ n. You means the individual or entity exercising the Licensed Rights
148
+ under this Public License. Your has a corresponding meaning.
149
+
150
+
151
+ Section 2 -- Scope.
152
+
153
+ a. License grant.
154
+
155
+ 1. Subject to the terms and conditions of this Public License,
156
+ the Licensor hereby grants You a worldwide, royalty-free,
157
+ non-sublicensable, non-exclusive, irrevocable license to
158
+ exercise the Licensed Rights in the Licensed Material to:
159
+
160
+ a. reproduce and Share the Licensed Material, in whole or
161
+ in part, for NonCommercial purposes only; and
162
+
163
+ b. produce, reproduce, and Share Adapted Material for
164
+ NonCommercial purposes only.
165
+
166
+ 2. Exceptions and Limitations. For the avoidance of doubt, where
167
+ Exceptions and Limitations apply to Your use, this Public
168
+ License does not apply, and You do not need to comply with
169
+ its terms and conditions.
170
+
171
+ 3. Term. The term of this Public License is specified in Section
172
+ 6(a).
173
+
174
+ 4. Media and formats; technical modifications allowed. The
175
+ Licensor authorizes You to exercise the Licensed Rights in
176
+ all media and formats whether now known or hereafter created,
177
+ and to make technical modifications necessary to do so. The
178
+ Licensor waives and/or agrees not to assert any right or
179
+ authority to forbid You from making technical modifications
180
+ necessary to exercise the Licensed Rights, including
181
+ technical modifications necessary to circumvent Effective
182
+ Technological Measures. For purposes of this Public License,
183
+ simply making modifications authorized by this Section 2(a)
184
+ (4) never produces Adapted Material.
185
+
186
+ 5. Downstream recipients.
187
+
188
+ a. Offer from the Licensor -- Licensed Material. Every
189
+ recipient of the Licensed Material automatically
190
+ receives an offer from the Licensor to exercise the
191
+ Licensed Rights under the terms and conditions of this
192
+ Public License.
193
+
194
+ b. Additional offer from the Licensor -- Adapted Material.
195
+ Every recipient of Adapted Material from You
196
+ automatically receives an offer from the Licensor to
197
+ exercise the Licensed Rights in the Adapted Material
198
+ under the conditions of the Adapter's License You apply.
199
+
200
+ c. No downstream restrictions. You may not offer or impose
201
+ any additional or different terms or conditions on, or
202
+ apply any Effective Technological Measures to, the
203
+ Licensed Material if doing so restricts exercise of the
204
+ Licensed Rights by any recipient of the Licensed
205
+ Material.
206
+
207
+ 6. No endorsement. Nothing in this Public License constitutes or
208
+ may be construed as permission to assert or imply that You
209
+ are, or that Your use of the Licensed Material is, connected
210
+ with, or sponsored, endorsed, or granted official status by,
211
+ the Licensor or others designated to receive attribution as
212
+ provided in Section 3(a)(1)(A)(i).
213
+
214
+ b. Other rights.
215
+
216
+ 1. Moral rights, such as the right of integrity, are not
217
+ licensed under this Public License, nor are publicity,
218
+ privacy, and/or other similar personality rights; however, to
219
+ the extent possible, the Licensor waives and/or agrees not to
220
+ assert any such rights held by the Licensor to the limited
221
+ extent necessary to allow You to exercise the Licensed
222
+ Rights, but not otherwise.
223
+
224
+ 2. Patent and trademark rights are not licensed under this
225
+ Public License.
226
+
227
+ 3. To the extent possible, the Licensor waives any right to
228
+ collect royalties from You for the exercise of the Licensed
229
+ Rights, whether directly or through a collecting society
230
+ under any voluntary or waivable statutory or compulsory
231
+ licensing scheme. In all other cases the Licensor expressly
232
+ reserves any right to collect such royalties, including when
233
+ the Licensed Material is used other than for NonCommercial
234
+ purposes.
235
+
236
+
237
+ Section 3 -- License Conditions.
238
+
239
+ Your exercise of the Licensed Rights is expressly made subject to the
240
+ following conditions.
241
+
242
+ a. Attribution.
243
+
244
+ 1. If You Share the Licensed Material (including in modified
245
+ form), You must:
246
+
247
+ a. retain the following if it is supplied by the Licensor
248
+ with the Licensed Material:
249
+
250
+ i. identification of the creator(s) of the Licensed
251
+ Material and any others designated to receive
252
+ attribution, in any reasonable manner requested by
253
+ the Licensor (including by pseudonym if
254
+ designated);
255
+
256
+ ii. a copyright notice;
257
+
258
+ iii. a notice that refers to this Public License;
259
+
260
+ iv. a notice that refers to the disclaimer of
261
+ warranties;
262
+
263
+ v. a URI or hyperlink to the Licensed Material to the
264
+ extent reasonably practicable;
265
+
266
+ b. indicate if You modified the Licensed Material and
267
+ retain an indication of any previous modifications; and
268
+
269
+ c. indicate the Licensed Material is licensed under this
270
+ Public License, and include the text of, or the URI or
271
+ hyperlink to, this Public License.
272
+
273
+ 2. You may satisfy the conditions in Section 3(a)(1) in any
274
+ reasonable manner based on the medium, means, and context in
275
+ which You Share the Licensed Material. For example, it may be
276
+ reasonable to satisfy the conditions by providing a URI or
277
+ hyperlink to a resource that includes the required
278
+ information.
279
+ 3. If requested by the Licensor, You must remove any of the
280
+ information required by Section 3(a)(1)(A) to the extent
281
+ reasonably practicable.
282
+
283
+ b. ShareAlike.
284
+
285
+ In addition to the conditions in Section 3(a), if You Share
286
+ Adapted Material You produce, the following conditions also apply.
287
+
288
+ 1. The Adapter's License You apply must be a Creative Commons
289
+ license with the same License Elements, this version or
290
+ later, or a BY-NC-SA Compatible License.
291
+
292
+ 2. You must include the text of, or the URI or hyperlink to, the
293
+ Adapter's License You apply. You may satisfy this condition
294
+ in any reasonable manner based on the medium, means, and
295
+ context in which You Share Adapted Material.
296
+
297
+ 3. You may not offer or impose any additional or different terms
298
+ or conditions on, or apply any Effective Technological
299
+ Measures to, Adapted Material that restrict exercise of the
300
+ rights granted under the Adapter's License You apply.
301
+
302
+
303
+ Section 4 -- Sui Generis Database Rights.
304
+
305
+ Where the Licensed Rights include Sui Generis Database Rights that
306
+ apply to Your use of the Licensed Material:
307
+
308
+ a. for the avoidance of doubt, Section 2(a)(1) grants You the right
309
+ to extract, reuse, reproduce, and Share all or a substantial
310
+ portion of the contents of the database for NonCommercial purposes
311
+ only;
312
+
313
+ b. if You include all or a substantial portion of the database
314
+ contents in a database in which You have Sui Generis Database
315
+ Rights, then the database in which You have Sui Generis Database
316
+ Rights (but not its individual contents) is Adapted Material,
317
+ including for purposes of Section 3(b); and
318
+
319
+ c. You must comply with the conditions in Section 3(a) if You Share
320
+ all or a substantial portion of the contents of the database.
321
+
322
+ For the avoidance of doubt, this Section 4 supplements and does not
323
+ replace Your obligations under this Public License where the Licensed
324
+ Rights include other Copyright and Similar Rights.
325
+
326
+
327
+ Section 5 -- Disclaimer of Warranties and Limitation of Liability.
328
+
329
+ a. UNLESS OTHERWISE SEPARATELY UNDERTAKEN BY THE LICENSOR, TO THE
330
+ EXTENT POSSIBLE, THE LICENSOR OFFERS THE LICENSED MATERIAL AS-IS
331
+ AND AS-AVAILABLE, AND MAKES NO REPRESENTATIONS OR WARRANTIES OF
332
+ ANY KIND CONCERNING THE LICENSED MATERIAL, WHETHER EXPRESS,
333
+ IMPLIED, STATUTORY, OR OTHER. THIS INCLUDES, WITHOUT LIMITATION,
334
+ WARRANTIES OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR
335
+ PURPOSE, NON-INFRINGEMENT, ABSENCE OF LATENT OR OTHER DEFECTS,
336
+ ACCURACY, OR THE PRESENCE OR ABSENCE OF ERRORS, WHETHER OR NOT
337
+ KNOWN OR DISCOVERABLE. WHERE DISCLAIMERS OF WARRANTIES ARE NOT
338
+ ALLOWED IN FULL OR IN PART, THIS DISCLAIMER MAY NOT APPLY TO YOU.
339
+
340
+ b. TO THE EXTENT POSSIBLE, IN NO EVENT WILL THE LICENSOR BE LIABLE
341
+ TO YOU ON ANY LEGAL THEORY (INCLUDING, WITHOUT LIMITATION,
342
+ NEGLIGENCE) OR OTHERWISE FOR ANY DIRECT, SPECIAL, INDIRECT,
343
+ INCIDENTAL, CONSEQUENTIAL, PUNITIVE, EXEMPLARY, OR OTHER LOSSES,
344
+ COSTS, EXPENSES, OR DAMAGES ARISING OUT OF THIS PUBLIC LICENSE OR
345
+ USE OF THE LICENSED MATERIAL, EVEN IF THE LICENSOR HAS BEEN
346
+ ADVISED OF THE POSSIBILITY OF SUCH LOSSES, COSTS, EXPENSES, OR
347
+ DAMAGES. WHERE A LIMITATION OF LIABILITY IS NOT ALLOWED IN FULL OR
348
+ IN PART, THIS LIMITATION MAY NOT APPLY TO YOU.
349
+
350
+ c. The disclaimer of warranties and limitation of liability provided
351
+ above shall be interpreted in a manner that, to the extent
352
+ possible, most closely approximates an absolute disclaimer and
353
+ waiver of all liability.
354
+
355
+
356
+ Section 6 -- Term and Termination.
357
+
358
+ a. This Public License applies for the term of the Copyright and
359
+ Similar Rights licensed here. However, if You fail to comply with
360
+ this Public License, then Your rights under this Public License
361
+ terminate automatically.
362
+
363
+ b. Where Your right to use the Licensed Material has terminated under
364
+ Section 6(a), it reinstates:
365
+
366
+ 1. automatically as of the date the violation is cured, provided
367
+ it is cured within 30 days of Your discovery of the
368
+ violation; or
369
+
370
+ 2. upon express reinstatement by the Licensor.
371
+
372
+ For the avoidance of doubt, this Section 6(b) does not affect any
373
+ right the Licensor may have to seek remedies for Your violations
374
+ of this Public License.
375
+
376
+ c. For the avoidance of doubt, the Licensor may also offer the
377
+ Licensed Material under separate terms or conditions or stop
378
+ distributing the Licensed Material at any time; however, doing so
379
+ will not terminate this Public License.
380
+
381
+ d. Sections 1, 5, 6, 7, and 8 survive termination of this Public
382
+ License.
383
+
384
+
385
+ Section 7 -- Other Terms and Conditions.
386
+
387
+ a. The Licensor shall not be bound by any additional or different
388
+ terms or conditions communicated by You unless expressly agreed.
389
+
390
+ b. Any arrangements, understandings, or agreements regarding the
391
+ Licensed Material not stated herein are separate from and
392
+ independent of the terms and conditions of this Public License.
393
+
394
+
395
+ Section 8 -- Interpretation.
396
+
397
+ a. For the avoidance of doubt, this Public License does not, and
398
+ shall not be interpreted to, reduce, limit, restrict, or impose
399
+ conditions on any use of the Licensed Material that could lawfully
400
+ be made without permission under this Public License.
401
+
402
+ b. To the extent possible, if any provision of this Public License is
403
+ deemed unenforceable, it shall be automatically reformed to the
404
+ minimum extent necessary to make it enforceable. If the provision
405
+ cannot be reformed, it shall be severed from this Public License
406
+ without affecting the enforceability of the remaining terms and
407
+ conditions.
408
+
409
+ c. No term or condition of this Public License will be waived and no
410
+ failure to comply consented to unless expressly agreed to by the
411
+ Licensor.
412
+
413
+ d. Nothing in this Public License constitutes or may be interpreted
414
+ as a limitation upon, or waiver of, any privileges and immunities
415
+ that apply to the Licensor or You, including from the legal
416
+ processes of any jurisdiction or authority.
417
+
418
+ =======================================================================
419
+
420
+ Creative Commons is not a party to its public
421
+ licenses. Notwithstanding, Creative Commons may elect to apply one of
422
+ its public licenses to material it publishes and in those instances
423
+ will be considered the “Licensor.” The text of the Creative Commons
424
+ public licenses is dedicated to the public domain under the CC0 Public
425
+ Domain Dedication. Except for the limited purpose of indicating that
426
+ material is shared under a Creative Commons public license or as
427
+ otherwise permitted by the Creative Commons policies published at
428
+ creativecommons.org/policies, Creative Commons does not authorize the
429
+ use of the trademark "Creative Commons" or any other trademark or logo
430
+ of Creative Commons without its prior written consent including,
431
+ without limitation, in connection with any unauthorized modifications
432
+ to any of its public licenses or any other arrangements,
433
+ understandings, or agreements concerning use of licensed material. For
434
+ the avoidance of doubt, this paragraph does not form part of the
435
+ public licenses.
436
+
437
+ Creative Commons may be contacted at creativecommons.org.
emotional/wav2vec2-large-robust-12-ft-emotion-msp-dim/README.md ADDED
@@ -0,0 +1,127 @@
1
+ ---
2
+ language: en
3
+ datasets:
4
+ - msp-podcast
5
+ inference: true
6
+ tags:
7
+ - speech
8
+ - audio
9
+ - wav2vec2
10
+ - audio-classification
11
+ - emotion-recognition
12
+ license: cc-by-nc-sa-4.0
13
+ pipeline_tag: audio-classification
14
+ ---
15
+
16
+ # Model for Dimensional Speech Emotion Recognition based on Wav2vec 2.0
17
+
18
+ The model expects a raw audio signal as input and outputs predictions for arousal, dominance and valence in a range of approximately 0...1. In addition, it also provides the pooled states of the last transformer layer. The model was created by fine-tuning [
19
+ Wav2Vec2-Large-Robust](https://huggingface.co/facebook/wav2vec2-large-robust) on [MSP-Podcast](https://ecs.utdallas.edu/research/researchlabs/msp-lab/MSP-Podcast.html) (v1.7). The model was pruned from 24 to 12 transformer layers before fine-tuning. An [ONNX](https://onnx.ai/) export of the model is available from [doi:10.5281/zenodo.6221127](https://zenodo.org/record/6221127). Further details are given in the associated [paper](https://arxiv.org/abs/2203.07378) and [tutorial](https://github.com/audeering/w2v2-how-to).
20
+
21
+ # Usage
22
+
23
+ ```python
24
+ import numpy as np
25
+ import torch
26
+ import torch.nn as nn
27
+ from transformers import Wav2Vec2Processor
28
+ from transformers.models.wav2vec2.modeling_wav2vec2 import (
29
+ Wav2Vec2Model,
30
+ Wav2Vec2PreTrainedModel,
31
+ )
32
+
33
+
34
+ class RegressionHead(nn.Module):
35
+ r"""Classification head."""
36
+
37
+ def __init__(self, config):
38
+
39
+ super().__init__()
40
+
41
+ self.dense = nn.Linear(config.hidden_size, config.hidden_size)
42
+ self.dropout = nn.Dropout(config.final_dropout)
43
+ self.out_proj = nn.Linear(config.hidden_size, config.num_labels)
44
+
45
+ def forward(self, features, **kwargs):
46
+
47
+ x = features
48
+ x = self.dropout(x)
49
+ x = self.dense(x)
50
+ x = torch.tanh(x)
51
+ x = self.dropout(x)
52
+ x = self.out_proj(x)
53
+
54
+ return x
55
+
56
+
57
+ class EmotionModel(Wav2Vec2PreTrainedModel):
58
+ r"""Speech emotion classifier."""
59
+
60
+ def __init__(self, config):
61
+
62
+ super().__init__(config)
63
+
64
+ self.config = config
65
+ self.wav2vec2 = Wav2Vec2Model(config)
66
+ self.classifier = RegressionHead(config)
67
+ self.init_weights()
68
+
69
+ def forward(
70
+ self,
71
+ input_values,
72
+ ):
73
+
74
+ outputs = self.wav2vec2(input_values)
75
+ hidden_states = outputs[0]
76
+ hidden_states = torch.mean(hidden_states, dim=1)
77
+ logits = self.classifier(hidden_states)
78
+
79
+ return hidden_states, logits
80
+
81
+
82
+
83
+ # load model from hub
84
+ device = 'cpu'
85
+ model_name = 'audeering/wav2vec2-large-robust-12-ft-emotion-msp-dim'
86
+ processor = Wav2Vec2Processor.from_pretrained(model_name)
87
+ model = EmotionModel.from_pretrained(model_name)
88
+
89
+ # dummy signal
90
+ sampling_rate = 16000
91
+ signal = np.zeros((1, sampling_rate), dtype=np.float32)
92
+
93
+
94
+ def process_func(
95
+ x: np.ndarray,
96
+ sampling_rate: int,
97
+ embeddings: bool = False,
98
+ ) -> np.ndarray:
99
+ r"""Predict emotions or extract embeddings from raw audio signal."""
100
+
101
+ # run through processor to normalize signal
102
+ # always returns a batch, so we just get the first entry
103
+ # then we put it on the device
104
+ y = processor(x, sampling_rate=sampling_rate)
105
+ y = y['input_values'][0]
106
+ y = y.reshape(1, -1)
107
+ y = torch.from_numpy(y).to(device)
108
+
109
+ # run through model
110
+ with torch.no_grad():
111
+ y = model(y)[0 if embeddings else 1]
112
+
113
+ # convert to numpy
114
+ y = y.detach().cpu().numpy()
115
+
116
+ return y
117
+
118
+
119
+ print(process_func(signal, sampling_rate))
120
+ # Arousal dominance valence
121
+ # [[0.5460754 0.6062266 0.40431657]]
122
+
123
+ print(process_func(signal, sampling_rate, embeddings=True))
124
+ # Pooled hidden states of last transformer layer
125
+ # [[-0.00752167 0.0065819 -0.00746342 ... 0.00663632 0.00848748
126
+ # 0.00599211]]
127
+ ```