# tortoise-filtering-models / train_voice_voice_clip.yml
#### general settings
name: train_voice_voice_clip
use_tb_logger: true
gpu_ids: [0]
start_step: 0
fp16: false
checkpointing_enabled: true
wandb: false
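
# Both dataset entries below load raw audio clips (mode: unsupervised_audio). Length
# values are in samples at sampling_rate 22050: pad_to_samples 80000 is ~3.6s per clip,
# and min_length 40000 (~1.8s) presumably drops clips shorter than that.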
datasets:
  train:
    name: clips
    n_workers: 4
    batch_size: 512
    mode: unsupervised_audio
    path: [/y/clips,
           /y/bigasr_dataset/libritts/train-clean-100, /y/bigasr_dataset/libritts/train-clean-360,
           /y/bigasr_dataset/libritts/train-other-500, /y/bigasr_dataset/ljspeech/wavs]
    exclusions: [/y/clips/books1-hifreq.txt, /y/clips/podcasts-0-hifreq.txt,
                 /y/clips/books2-hifreq.txt, /y/bigasr_dataset/libritts-hifreq.txt]
    cache_path: /y/clips-cache-hifreq.pth
    sampling_rate: 22050
    do_augmentation: false
    pad_to_samples: 80000
    resample_clip: false
    min_length: 40000
    debug_loading_failures: false
  val:
    name: clips_val
    n_workers: 1
    batch_size: 512
    mode: unsupervised_audio
    path: [/h/libritts/test-clean]
    cache_path: /h/libritts/test-clean/cache.pth
    sampling_rate: 22050
    do_augmentation: false
    pad_to_samples: 80000
    resample_clip: false
    min_length: 40000
    debug_loading_failures: false
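
# Network being trained: a voice-to-voice CLIP model (selected in DLAS via
# which_model_G). encoder_output: 512 is presumably the dimensionality of the
# voice embeddings it produces.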
networks:
  clip:
    type: generator
    which_model_G: voice_to_voice_clip
    kwargs:
      encoder_output: 512
#### path
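# pretrain_model_clip loads initial weights for the 'clip' network; uncomment
# resume_state instead to continue a previous run with its optimizer/scheduler state.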
path:
  strict_load: true
  #resume_state: ../experiments/train_voice_voice_clip/training_state/56000.state
  pretrain_model_clip: voice_voice_clip.pth
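
# A single training step optimizes the 'clip' network: each iteration runs the
# injectors in order and then applies the listed losses.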
steps:
  clip_train:
    training: clip
    loss_log_buffer: 250

    # Generally follows the recipe from the DALLE paper.
    optimizer: adamw
    optimizer_params:
      lr: !!float 1e-4
      weight_decay: 0
      beta1: 0.9
      beta2: 0.99
    clip_grad_eps: 4  # TODO: remove clipping after warmup steps.
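
    # Pipeline: speech_to_mel turns the input clip into a (normalized) mel spectrogram,
    # then the forward injector runs the clip network on [speech_mel, clip_lengths] and
    # emits clip_loss, which the 'direct' loss below uses as the training objective.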
    injectors:
      # Speech only
      speech_to_mel:
        type: torch_mel_spectrogram
        mel_norm_file: ../experiments/clips_mel_norms.pth
        in: clip
        out: speech_mel
      forward:
        type: generator
        generator: clip
        in: [speech_mel, clip_lengths]
        out: clip_loss
    losses:
      clip_loss_ce:
        type: direct
        weight: 1
        key: clip_loss
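
# mega_batch_factor: 1 presumably means no batch chunking / gradient accumulation.
# MultiStepLR scales the LR by lr_gamma (0.2) at each milestone in gen_lr_steps,
# after a 1000-step warmup.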
train:
  niter: 500000
  warmup_iter: -1
  mega_batch_factor: 1
  ema_rate: .999
  val_freq: 500

  default_lr_scheme: MultiStepLR
  gen_lr_steps: [ 20000, 40000, 60000 ]
  lr_gamma: 0.2
  warmup_steps: 1000
  #force_lr: !!float 4e-5
eval:
  pure: true
logger:
  print_freq: 10
  save_checkpoint_freq: 500
  visuals: []
  is_mel_spectrogram: true
  visual_debug_rate: 100
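
# Illustrative launch command (assumes a standard DL Art School checkout):
#   python train.py -opt train_voice_voice_clip.yml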