fish-diffusion_demo / configs /svs_baseline.py
rinflan's picture
Upload 15 files
a6df73d
# Warning: This config is under development and subject to change.
# Base config files this config builds on. Going by the path names they cover
# the architecture, trainer, scheduler and dataset defaults.
# NOTE(review): presumably consumed by an mmengine-style config loader that
# merges these files before applying the overrides below; keep this a plain
# list literal of string paths -- confirm against the loader.
_base_ = [
"./_base_/archs/diff_svc_v2.py",
"./_base_/trainers/base.py",
"./_base_/schedulers/warmup_cosine.py",
"./_base_/datasets/audio_folder.py",
]
# Phoneme inventory shared by the text feature extractor and the text encoder
# size computation in this config.
# The order is kept exactly as originally listed.
# NOTE(review): position in this list presumably determines each phoneme's
# index, so do not reorder or insert in the middle -- confirm against the
# extractor implementation.
# fmt: off
phonemes = [
    # Upper-case entries (presumably special / non-lexical markers -- confirm).
    "AP", "SP", "E", "En",
    # Remaining entries, one row per leading letter.
    "a", "ai", "an", "ang", "ao",
    "b",
    "c", "ch",
    "d",
    "e", "ei", "en", "eng", "er",
    "f",
    "g",
    "h",
    "i", "i0", "ia", "ian", "iang", "iao", "ie", "in", "ing", "iong", "ir", "iu",
    "j",
    "k",
    "l",
    "m",
    "n",
    "o", "ong", "ou",
    "p",
    "q",
    "r",
    "s", "sh",
    "t",
    "u", "ua", "uai", "uan", "uang", "ui", "un", "uo",
    "v", "van", "ve", "vn",
    "w",
    "x",
    "y",
    "z", "zh",
]
# fmt: on
# Preprocessing overrides: which text-feature extractor and pitch extractor
# to use when preparing the data.
preprocessing = {
    # Text features are derived from a transcription file, using the phoneme
    # inventory defined in this config.
    "text_features_extractor": {
        "type": "OpenCpopTranscriptionToPhonemesDuration",
        "phonemes": phonemes,
        "transcription_path": "dataset/transcriptions.txt",
    },
    # Pitch extractor selected by type name only; no extra options are set.
    "pitch_extractor": {
        "type": "ParselMouthPitchExtractor",
    },
}
# Model overrides: selects the "DiffSinger" model type, swaps in a projection
# text encoder sized from the phoneme inventory, and sets the diffusion
# `max_beta`.
model = {
    "type": "DiffSinger",
    "text_encoder": {
        # NOTE(review): `_delete_` presumably tells the config loader to drop
        # the inherited `text_encoder` settings instead of merging into them
        # -- confirm against the loader's documentation.
        "_delete_": True,
        "type": "NaiveProjectionEncoder",
        # Input width is twice the phoneme count plus two.
        # NOTE(review): the reason for the "* 2 + 2" is not visible in this
        # file; confirm against the text feature extractor before changing.
        "input_size": 2 * len(phonemes) + 2,
        "output_size": 256,
    },
    "diffusion": {
        "max_beta": 0.02,
    },
}
# Dataset overrides: train and validation splits are both audio folders,
# and both use speaker id 0.
dataset = {
    # NOTE(review): `_delete_` presumably makes the config loader discard the
    # inherited `dataset` section rather than merge with it -- confirm.
    "_delete_": True,
    "train": {
        "type": "AudioFolderDataset",
        "path": "dataset/diff-singer/train",
        "speaker_id": 0,
    },
    "valid": {
        "type": "AudioFolderDataset",
        "path": "dataset/diff-singer/valid",
        "speaker_id": 0,
    },
}