# Source: Hugging Face Space mrfakename/DMOSpeech2 (running on Zero)
# File: DMOSpeech2/funasr_detach/models/uniasr/template.yaml (4.1 kB)
# Last commit: 0102e16 by mrfakename (verified; "8 months ago" at capture time)
#   "Super-squash branch 'main' using huggingface_hub"

	# This is an example that demonstrates how to configure a model file.
	# You can modify the configuration according to your own requirements.

	# to print the register_table:
	# from funasr.register import tables
	# tables.print()

	# network architecture
	model: UniASR
	model_conf:
	ctc_weight: 0.0
	lsm_weight: 0.1
	length_normalized_loss: true
	predictor_weight: 1.0
	decoder_attention_chunk_type: chunk
	ctc_weight2: 0.0
	predictor_weight2: 1.0
	decoder_attention_chunk_type2: chunk
	loss_weight_model1: 0.5


	# encoder
	encoder: SANMEncoderChunkOpt
	encoder_conf:
	output_size: 320
	attention_heads: 4
	linear_units: 1280
	num_blocks: 35
	dropout_rate: 0.1
	positional_dropout_rate: 0.1
	attention_dropout_rate: 0.1
	input_layer: pe
	pos_enc_class: SinusoidalPositionEncoder
	normalize_before: true
	kernel_size: 11
	sanm_shfit: 0
	selfattention_layer_type: sanm
	chunk_size:
	- 20
	- 60
	stride:
	- 10
	- 40
	pad_left:
	- 5
	- 10
	encoder_att_look_back_factor:
	- 0
	- 0
	decoder_att_look_back_factor:
	- 0
	- 0

	# decoder
	decoder: FsmnDecoderSCAMAOpt
	decoder_conf:
	attention_dim: 256
	attention_heads: 4
	linear_units: 1024
	num_blocks: 12
	dropout_rate: 0.1
	positional_dropout_rate: 0.1
	self_attention_dropout_rate: 0.1
	src_attention_dropout_rate: 0.1
	att_layer_num: 6
	kernel_size: 11
	concat_embeds: true

	# predictor
	predictor: CifPredictorV2
	predictor_conf:
	idim: 320
	threshold: 1.0
	l_order: 1
	r_order: 1


	# encoder2
	encoder2: SANMEncoderChunkOpt
	encoder2_conf:
	output_size: 320
	attention_heads: 4
	linear_units: 1280
	num_blocks: 20
	dropout_rate: 0.1
	positional_dropout_rate: 0.1
	attention_dropout_rate: 0.1
	input_layer: pe
	pos_enc_class: SinusoidalPositionEncoder
	normalize_before: true
	kernel_size: 21
	sanm_shfit: 0
	selfattention_layer_type: sanm
	chunk_size:
	- 45
	- 70
	stride:
	- 35
	- 50
	pad_left:
	- 5
	- 10
	encoder_att_look_back_factor:
	- 0
	- 0
	decoder_att_look_back_factor:
	- 0
	- 0

	# decoder
	decoder2: FsmnDecoderSCAMAOpt
	decoder2_conf:
	attention_dim: 320
	attention_heads: 4
	linear_units: 1280
	num_blocks: 12
	dropout_rate: 0.1
	positional_dropout_rate: 0.1
	self_attention_dropout_rate: 0.1
	src_attention_dropout_rate: 0.1
	att_layer_num: 6
	kernel_size: 11
	concat_embeds: true

	predictor2: CifPredictorV2
	predictor2_conf:
	idim: 320
	threshold: 1.0
	l_order: 1
	r_order: 1

	stride_conv: stride_conv1d
	stride_conv_conf:
	kernel_size: 2
	stride: 2
	pad:
	- 0
	- 1

	# frontend related
	frontend: WavFrontend
	frontend_conf:
	fs: 16000
	window: hamming
	n_mels: 80
	frame_length: 25
	frame_shift: 10
	lfr_m: 7
	lfr_n: 6
	dither: 0.0

	specaug: SpecAugLFR
	specaug_conf:
	apply_time_warp: false
	time_warp_window: 5
	time_warp_mode: bicubic
	apply_freq_mask: true
	freq_mask_width_range:
	- 0
	- 30
	lfr_rate: 6
	num_freq_mask: 1
	apply_time_mask: true
	time_mask_width_range:
	- 0
	- 12
	num_time_mask: 1

	train_conf:
	accum_grad: 1
	grad_clip: 5
	max_epoch: 150
	keep_nbest_models: 10
	avg_nbest_model: 5
	log_interval: 50

	optim: adam
	optim_conf:
	lr: 0.0001
	scheduler: warmuplr
	scheduler_conf:
	warmup_steps: 30000

	dataset: AudioDataset
	dataset_conf:
	index_ds: IndexDSJsonl
	batch_sampler: DynamicBatchLocalShuffleSampler
	batch_type: example # example or length
	batch_size: 1 # if batch_type is example, batch_size is the numbers of samples; if length, batch_size is source_token_len+target_token_len;
	max_token_length: 2048 # filter samples if source_token_len+target_token_len > max_token_length,
	buffer_size: 500
	shuffle: True
	num_workers: 0

	tokenizer: CharTokenizer
	tokenizer_conf:
	unk_symbol: <unk>
	split_with_space: true


	ctc_conf:
	dropout_rate: 0.0
	ctc_type: builtin
	reduce: true
	ignore_nan_grad: true
	normalize: null