Spaces:
No application file
No application file
File size: 1,686 Bytes
a6df73d |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 |
# Warning: This config is under active development and subject to change.
# Base config fragments inherited by this file — presumably merged by an
# mmcv/mmengine-style config loader (TODO confirm which loader and its
# override order).
_base_ = [
    "./_base_/archs/diff_svc_v2.py",  # model architecture defaults
    "./_base_/trainers/base.py",  # trainer defaults
    "./_base_/schedulers/warmup_cosine.py",  # LR schedule: warmup + cosine
    "./_base_/datasets/audio_folder.py",  # dataset/pipeline defaults
]
# Phoneme inventory (64 symbols). "AP"/"SP" look like aspiration/silence
# markers and the rest resemble opencpop-style pinyin units — TODO confirm
# against the transcription source. Order matters: index = phoneme ID.
phonemes = (
    "AP SP E En a ai an ang ao b c ch d e ei en eng er f g h i i0 "
    "ia ian iang iao ie in ing iong ir iu j k l m n o ong ou p q r s "
    "sh t u ua uai uan uang ui un uo v van ve vn w x y z zh"
).split()
# Feature extraction applied during preprocessing. The extractor names are
# resolved by the framework's registry — presumably the text extractor turns
# opencpop-style transcriptions into per-phoneme durations (TODO confirm).
preprocessing = {
    "text_features_extractor": {
        "type": "OpenCpopTranscriptionToPhonemesDuration",
        "phonemes": phonemes,
        "transcription_path": "dataset/transcriptions.txt",
    },
    # Pitch (f0) extraction, presumably Parselmouth/Praat-backed.
    "pitch_extractor": {"type": "ParselMouthPitchExtractor"},
}
# Model definition, layered on top of the inherited diff_svc_v2 architecture.
model = {
    "type": "DiffSinger",
    "text_encoder": {
        # `_delete_` replaces the inherited text_encoder outright instead of
        # merging with it (config-system convention — confirm with loader docs).
        "_delete_": True,
        "type": "NaiveProjectionEncoder",
        # Two values per phoneme plus two extra dims — TODO confirm what the
        # per-phoneme pair and the extra features represent.
        "input_size": 2 * len(phonemes) + 2,
        "output_size": 256,
    },
    # Override only the noise-schedule ceiling of the inherited diffusion cfg.
    "diffusion": {"max_beta": 0.02},
}
# Train/valid datasets. `_delete_` drops the inherited dataset definition
# entirely rather than merging with it (config-system convention).
_split_paths = None  # (no extra top-level names: loaders treat them as config keys)
del _split_paths
dataset = {
    "_delete_": True,
    "train": {
        "type": "AudioFolderDataset",
        "path": "dataset/diff-singer/train",
        "speaker_id": 0,  # single-speaker setup, presumably — TODO confirm
    },
    "valid": {
        "type": "AudioFolderDataset",
        "path": "dataset/diff-singer/valid",
        "speaker_id": 0,
    },
}
|