# Spaces:
# No application file
# Warning: This config is still under development and is subject to change.
# Parent config files whose settings this file starts from; the assignments
# below override or replace parts of them.
# NOTE(review): paths look relative to this config file's directory — confirm
# against the config loader in use.
_base_ = [
    "./_base_/archs/diff_svc_v2.py",
    "./_base_/trainers/base.py",
    "./_base_/schedulers/warmup_cosine.py",
    "./_base_/datasets/audio_folder.py",
]
# Phoneme inventory shared by the text feature extractor and the text encoder.
# The order is significant wherever a phoneme is mapped to its list index, so
# do not reorder or insert entries in the middle.
# NOTE(review): "AP" and "SP" are presumably the breath / silence markers used
# by OpenCpop-style transcriptions — confirm against the dataset docs.
phonemes = [
    "AP",
    "SP",
    "E",
    "En",
    "a",
    "ai",
    "an",
    "ang",
    "ao",
    "b",
    "c",
    "ch",
    "d",
    "e",
    "ei",
    "en",
    "eng",
    "er",
    "f",
    "g",
    "h",
    "i",
    "i0",
    "ia",
    "ian",
    "iang",
    "iao",
    "ie",
    "in",
    "ing",
    "iong",
    "ir",
    "iu",
    "j",
    "k",
    "l",
    "m",
    "n",
    "o",
    "ong",
    "ou",
    "p",
    "q",
    "r",
    "s",
    "sh",
    "t",
    "u",
    "ua",
    "uai",
    "uan",
    "uang",
    "ui",
    "un",
    "uo",
    "v",
    "van",
    "ve",
    "vn",
    "w",
    "x",
    "y",
    "z",
    "zh",
]
# Preprocessing settings: selects the extractor used for text features
# (phonemes and durations read from a transcription file) and the extractor
# used for pitch.
preprocessing = dict(
    text_features_extractor=dict(
        type="OpenCpopTranscriptionToPhonemesDuration",
        # Reuses the module-level phoneme inventory defined above.
        phonemes=phonemes,
        transcription_path="dataset/transcriptions.txt",
    ),
    pitch_extractor=dict(
        type="ParselMouthPitchExtractor",
    ),
)
# Model overrides applied on top of the architecture pulled in via `_base_`.
model = dict(
    type="DiffSinger",
    text_encoder=dict(
        # In mmengine-style configs, `_delete_=True` replaces the inherited
        # `text_encoder` entry wholesale instead of merging keys into it.
        # NOTE(review): confirm the loader used here honours `_delete_`.
        _delete_=True,
        type="NaiveProjectionEncoder",
        # Encoder input width is derived from the phoneme inventory size.
        # NOTE(review): the reason for `* 2 + 2` is not visible in this file;
        # presumably it matches the feature width emitted by the text
        # features extractor — confirm before changing `phonemes`.
        input_size=len(phonemes) * 2 + 2,
        output_size=256,
    ),
    diffusion=dict(
        max_beta=0.02,
    ),
)
# Dataset definition with separate train and validation audio folders, both
# tagged with speaker id 0.
dataset = dict(
    # In mmengine-style configs, `_delete_=True` discards the `dataset` entry
    # inherited from `_base_` rather than merging into it.
    # NOTE(review): confirm the loader used here honours `_delete_`.
    _delete_=True,
    train=dict(
        type="AudioFolderDataset",
        path="dataset/diff-singer/train",
        speaker_id=0,
    ),
    valid=dict(
        type="AudioFolderDataset",
        path="dataset/diff-singer/valid",
        speaker_id=0,
    ),
)