Spaces:

mpc001
/

auto_avsr

Runtime error

App Files Community

mpc001 committed on Jun 15, 2023

Commit

d319e26

•

1 Parent(s): 5cebf32

Upload 7 files

Browse files

Files changed (7) hide show

benchmarks/LRS3/models/LRS3_AV_WER0.9/model.json +162 -0
benchmarks/LRS3/models/LRS3_AV_WER0.9/model.pth +3 -0
benchmarks/LRS3/models/LRS3_A_WER1.0/model.json +177 -0
benchmarks/LRS3/models/LRS3_A_WER1.0/model.pth +3 -0
benchmarks/LRS3/models/LRS3_V_WER19.1/model.json +177 -0
benchmarks/LRS3/models/LRS3_V_WER19.1/model.pth +3 -0
benchmarks/LRS3/models/README.md +1 -0

benchmarks/LRS3/models/LRS3_AV_WER0.9/model.json ADDED Viewed

	@@ -0,0 +1,162 @@

+[
+    9216,
+    41,
+    {
+        "a_upsample_ratio": 1,
+        "accum_grad": 2,
+        "adim": 768,
+        "aheads": 12,
+        "apply_uttmvn": true,
+        "aux_a_upsample_ratio": 1,
+        "aux_adim": 768,
+        "aux_aheads": 12,
+        "aux_cnn_module_kernel": 31,
+        "aux_dropout_rate": 0.1,
+        "aux_elayers": 12,
+        "aux_eunits": 3072,
+        "aux_lsm_weight": 0.0,
+        "aux_macaron_style": 1,
+        "aux_transformer_attn_dropout_rate": 0.1,
+        "aux_transformer_encoder_attn_layer_type": "rel_mha",
+        "aux_transformer_input_layer": "conv1d",
+        "aux_use_cnn_module": 1,
+        "backend": "pytorch",
+        "badim": 320,
+        "batch_bins": 0,
+        "batch_count": "auto",
+        "batch_frames_in": 0,
+        "batch_frames_inout": 0,
+        "batch_frames_out": 0,
+        "bdropout_rate": 0.0,
+        "beam_size": 4,
+        "blayers": 2,
+        "bnmask": 2,
+        "bprojs": 300,
+        "btype": "blstmp",
+        "bunits": 300,
+        "cnn_module_kernel": 31,
+        "config2": null,
+        "config3": null,
+        "context_residual": false,
+        "criterion": "acc",
+        "ctc_type": "warpctc",
+        "ctc_weight": 0.3,
+        "debugmode": 1,
+        "dec_init": null,
+        "dec_init_mods": [
+            "att.",
+            " dec."
+        ],
+        "dict": "data/lang_1char/units.txt",
+        "dlayers": 6,
+        "dropout_rate": 0.1,
+        "dunits": 3072,
+        "early_stop_criterion": "validation/main/acc",
+        "elayers": 12,
+        "enc_init": null,
+        "enc_init_mods": [
+            "enc.enc."
+        ],
+        "eps": 1e-08,
+        "eps_decay": 0.01,
+        "eunits": 3072,
+        "fbank_fmax": null,
+        "fbank_fmin": 0.0,
+        "fbank_fs": 16000,
+        "fusion_hdim": 8192,
+        "fusion_norm": "batchnorm",
+        "grad_clip": 5.0,
+        "grad_noise": false,
+        "labels_type": "unigram5000",
+        "lm_weight": 0.1,
+        "lsm_weight": 0.1,
+        "macaron_style": 1,
+        "maxlen_in": 220,
+        "maxlen_out": 220,
+        "maxlenratio": 0.0,
+        "minibatches": 0,
+        "minlenratio": 0.0,
+        "model_module": "espnet.nets.pytorch_backend.e2e_asr_transformer_multitask_dual:E2E",
+        "mtlalpha": 0.1,
+        "n_iter_processes": 12,
+        "n_mels": 80,
+        "nbest": 1,
+        "ngpu": 1,
+        "num_encs": 1,
+        "num_input": 2,
+        "num_save_attention": 3,
+        "num_spkrs": 1,
+        "opt": "noam",
+        "patience": 0,
+        "penalty": 0.0,
+        "preprocess_conf": null,
+        "raw_max_freq_width": 150,
+        "raw_max_speed_rate": 1.1,
+        "raw_max_time_width": 0.4,
+        "raw_min_speed_rate": 0.9,
+        "raw_n_freq_mask": 2,
+        "raw_n_time_mask": 2,
+        "raw_speech_do_normalize": false,
+        "ref_channel": -1,
+        "rel_pos_type": "latest",
+        "relu_type": "swish",
+        "report_cer": false,
+        "report_interval_iters": 100,
+        "report_wer": false,
+        "rnnlm": null,
+        "rnnlm_conf": null,
+        "save_interval_iters": 0,
+        "seed": 1,
+        "sortagrad": 0,
+        "specaug_max_freq_width": 30,
+        "specaug_max_time_warp": 5,
+        "specaug_max_time_width": 40,
+        "specaug_n_freq_mask": 2,
+        "specaug_n_time_mask": 2,
+        "sr_interp_mode": "nearest",
+        "sr_interp_scale_factor": 1.0,
+        "stats_file": null,
+        "sym_blank": "<blank>",
+        "sym_space": "<space>",
+        "threshold": 0.0001,
+        "train_dtype": "float32",
+        "transformer_attn_dropout_rate": 0.1,
+        "transformer_encoder_attn_layer_type": "rel_mha",
+        "transformer_init": "pytorch",
+        "transformer_input_layer": "conv3d",
+        "transformer_length_normalized_loss": 0,
+        "transformer_warmup_steps": 25000,
+        "use_beamformer": true,
+        "use_cnn_module": 1,
+        "use_dnn_mask_for_wpe": false,
+        "use_freqmask": false,
+        "use_frontend": false,
+        "use_noiseaug": false,
+        "use_specaug": false,
+        "use_speedaug": false,
+        "use_timemask": false,
+        "use_v_adaptive_timemask": true,
+        "use_v_cutout": false,
+        "use_v_timemask": false,
+        "use_wpe": false,
+        "uttmvn_norm_means": true,
+        "uttmvn_norm_vars": false,
+        "v_cutout_max_hole_length": 22,
+        "v_cutout_n_holes": 1,
+        "v_raw_max_time_width": 0.4,
+        "v_raw_n_time_mask": 1,
+        "v_timemask_replace_with_zero": false,
+        "v_timemask_stride": 1.0,
+        "verbose": 0,
+        "wavaugments": null,
+        "wdropout_rate": 0.0,
+        "weight_decay": 0.0,
+        "wlayers": 2,
+        "wpe_delay": 3,
+        "wpe_taps": 5,
+        "wprojs": 300,
+        "wtype": "blstmp",
+        "wunits": 300,
+        "zero_triu": false
+    }
+]

benchmarks/LRS3/models/LRS3_AV_WER0.9/model.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:16304660a2181629215f8c7390565750784379f6e8a2b5c8d155fd9574afcd6e
+size 1776719497

benchmarks/LRS3/models/LRS3_A_WER1.0/model.json ADDED Viewed

	@@ -0,0 +1,177 @@

+[
+    9216,
+    41,
+    {
+        "a_upsample_ratio": 1,
+        "accum_grad": 2,
+        "adim": 768,
+        "aheads": 12,
+        "apply_uttmvn": true,
+        "aux_lsm_weight": 0.0,
+        "backend": "pytorch",
+        "badim": 320,
+        "batch_bins": 0,
+        "batch_count": "auto",
+        "batch_frames_in": 0,
+        "batch_frames_inout": 0,
+        "batch_frames_out": 0,
+        "bdropout_rate": 0.0,
+        "beam_size": 4,
+        "blayers": 2,
+        "bnmask": 2,
+        "bprojs": 300,
+        "btype": "blstmp",
+        "bunits": 300,
+        "cnn_module_kernel": 31,
+        "config2": null,
+        "config3": null,
+        "context_residual": false,
+        "criterion": "acc",
+        "ctc_type": "warpctc",
+        "ctc_weight": 0.3,
+        "debugmode": 1,
+        "dec_init": null,
+        "dec_init_mods": [
+            "att.",
+            " dec."
+        ],
+        "dict": "data/lang_1char/units.txt",
+        "dlayers": 6,
+        "dropout_rate": 0.1,
+        "dunits": 3072,
+        "early_stop_criterion": "validation/main/acc",
+        "elayers": 12,
+        "enc_init": null,
+        "enc_init_mods": [
+            "enc.enc."
+        ],
+        "eps": 1e-08,
+        "eps_decay": 0.01,
+        "eunits": 3072,
+        "fbank_fmax": null,
+        "fbank_fmin": 0.0,
+        "fbank_fs": 16000,
+        "grad_clip": 5.0,
+        "grad_noise": false,
+        "labels_type": "unigram5000",
+        "lm_weight": 0.1,
+        "lsm_weight": 0.1,
+        "macaron_style": 1,
+        "maxlen_in": 220,
+        "maxlen_out": 220,
+        "maxlenratio": 0.0,
+        "minibatches": 0,
+        "minlenratio": 0.0,
+        "model_module": "espnet.nets.pytorch_backend.e2e_asr_transformer_multitask_dual:E2E",
+        "mtl_custom_worker_l1_weight": 0.0,
+        "mtl_custom_worker_length_normalized_loss": 0,
+        "mtl_custom_worker_mlp_hdim": 256,
+        "mtl_custom_worker_mlp_nlayers": 2,
+        "mtl_custom_worker_mlp_nonlin_end": 0,
+        "mtl_custom_worker_mlp_nonlin_type": "relu",
+        "mtl_custom_worker_name": "patrickvonplaten/wav2vec2-base",
+        "mtl_custom_worker_task_type": "",
+        "mtl_custom_worker_tgt_type": "projected_quantized_states",
+        "mtl_kl_weight": 0.0,
+        "mtl_kl_weight_2": 0.0,
+        "mtl_l1_weight": 0.4,
+        "mtl_l1_weight_2": 0.4,
+        "mtl_length_normalized_loss": 1,
+        "mtl_length_normalized_loss_2": 1,
+        "mtl_mlp_hdim": 256,
+        "mtl_mlp_hdim_2": 256,
+        "mtl_mlp_nlayers": 1,
+        "mtl_mlp_nlayers_2": 1,
+        "mtl_mlp_nonlin_end": 0,
+        "mtl_mlp_nonlin_end_2": 0,
+        "mtl_mlp_nonlin_type": "relu",
+        "mtl_mlp_nonlin_type_2": "relu",
+        "mtl_task_layer": "conformer6",
+        "mtl_task_type": "l1",
+        "mtl_task_type_2": "l1",
+        "mtl_worker_source": "conv1d_lrs3_v04_lrs2",
+        "mtl_worker_source_2": "conv3d_lrs3_v04_lrs2_dual",
+        "mtlalpha": 0.1,
+        "n_iter_processes": 12,
+        "n_mels": 80,
+        "nbest": 1,
+        "ngpu": 1,
+        "num_encs": 1,
+        "num_input": 2,
+        "num_save_attention": 3,
+        "num_spkrs": 1,
+        "opt": "noam",
+        "patience": 0,
+        "penalty": 0.0,
+        "preprocess_conf": null,
+        "pretrain_dataset": "lrs2_full_dual_ignore",
+        "raw_max_freq_width": 150,
+        "raw_max_speed_rate": 1.1,
+        "raw_max_time_width": 0.4,
+        "raw_min_speed_rate": 0.9,
+        "raw_n_freq_mask": 2,
+        "raw_n_time_mask": 2,
+        "raw_speech_do_normalize": false,
+        "ref_channel": -1,
+        "rel_pos_type": "latest",
+        "relu_type": "swish",
+        "report_cer": false,
+        "report_interval_iters": 100,
+        "report_wer": false,
+        "rnnlm": null,
+        "rnnlm_conf": null,
+        "save_interval_iters": 0,
+        "seed": 1,
+        "sortagrad": 0,
+        "specaug_max_freq_width": 30,
+        "specaug_max_time_warp": 5,
+        "specaug_max_time_width": 40,
+        "specaug_n_freq_mask": 2,
+        "specaug_n_time_mask": 2,
+        "sr_interp_mode": "nearest",
+        "sr_interp_scale_factor": 1.0,
+        "stats_file": null,
+        "sym_blank": "<blank>",
+        "sym_space": "<space>",
+        "threshold": 0.0001,
+        "train_dtype": "float32",
+        "transformer_attn_dropout_rate": 0.1,
+        "transformer_encoder_attn_layer_type": "rel_mha",
+        "transformer_init": "pytorch",
+        "transformer_input_layer": "conv1d",
+        "transformer_length_normalized_loss": 0,
+        "transformer_warmup_steps": 25000,
+        "use_beamformer": true,
+        "use_cnn_module": 1,
+        "use_dnn_mask_for_wpe": false,
+        "use_freqmask": false,
+        "use_frontend": false,
+        "use_noiseaug": false,
+        "use_specaug": false,
+        "use_speedaug": false,
+        "use_timemask": false,
+        "use_v_adaptive_timemask": true,
+        "use_v_cutout": false,
+        "use_v_timemask": false,
+        "use_wpe": false,
+        "uttmvn_norm_means": true,
+        "uttmvn_norm_vars": false,
+        "v_cutout_max_hole_length": 22,
+        "v_cutout_n_holes": 1,
+        "v_raw_max_time_width": 0.4,
+        "v_raw_n_time_mask": 1,
+        "v_timemask_replace_with_zero": false,
+        "v_timemask_stride": 1.0,
+        "verbose": 0,
+        "wavaugments": null,
+        "wdropout_rate": 0.0,
+        "weight_decay": 0.0,
+        "wlayers": 2,
+        "wpe_delay": 3,
+        "wpe_taps": 5,
+        "wprojs": 300,
+        "wtype": "blstmp",
+        "wunits": 300,
+        "zero_triu": false
+    }
+]

benchmarks/LRS3/models/LRS3_A_WER1.0/model.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b6443e830edbdd1886854ba64219b29430eca2ee055497ffc4f04419258ff3b2
+size 972562947

benchmarks/LRS3/models/LRS3_V_WER19.1/model.json ADDED Viewed

	@@ -0,0 +1,177 @@

+[
+    9216,
+    41,
+    {
+        "a_upsample_ratio": 1,
+        "accum_grad": 2,
+        "adim": 768,
+        "aheads": 12,
+        "apply_uttmvn": true,
+        "aux_lsm_weight": 0.0,
+        "backend": "pytorch",
+        "badim": 320,
+        "batch_bins": 0,
+        "batch_count": "auto",
+        "batch_frames_in": 0,
+        "batch_frames_inout": 0,
+        "batch_frames_out": 0,
+        "bdropout_rate": 0.0,
+        "beam_size": 4,
+        "blayers": 2,
+        "bnmask": 2,
+        "bprojs": 300,
+        "btype": "blstmp",
+        "bunits": 300,
+        "cnn_module_kernel": 31,
+        "config2": null,
+        "config3": null,
+        "context_residual": false,
+        "criterion": "acc",
+        "ctc_type": "warpctc",
+        "ctc_weight": 0.3,
+        "debugmode": 1,
+        "dec_init": null,
+        "dec_init_mods": [
+            "att.",
+            " dec."
+        ],
+        "dict": "data/lang_1char/units.txt",
+        "dlayers": 6,
+        "dropout_rate": 0.1,
+        "dunits": 3072,
+        "early_stop_criterion": "validation/main/acc",
+        "elayers": 12,
+        "enc_init": null,
+        "enc_init_mods": [
+            "enc.enc."
+        ],
+        "eps": 1e-08,
+        "eps_decay": 0.01,
+        "eunits": 3072,
+        "fbank_fmax": null,
+        "fbank_fmin": 0.0,
+        "fbank_fs": 16000,
+        "grad_clip": 5.0,
+        "grad_noise": false,
+        "labels_type": "unigram5000",
+        "lm_weight": 0.1,
+        "lsm_weight": 0.1,
+        "macaron_style": 1,
+        "maxlen_in": 220,
+        "maxlen_out": 220,
+        "maxlenratio": 0.0,
+        "minibatches": 0,
+        "minlenratio": 0.0,
+        "model_module": "espnet.nets.pytorch_backend.e2e_asr_transformer_multitask_dual:E2E",
+        "mtl_custom_worker_l1_weight": 0.0,
+        "mtl_custom_worker_length_normalized_loss": 0,
+        "mtl_custom_worker_mlp_hdim": 256,
+        "mtl_custom_worker_mlp_nlayers": 2,
+        "mtl_custom_worker_mlp_nonlin_end": 0,
+        "mtl_custom_worker_mlp_nonlin_type": "relu",
+        "mtl_custom_worker_name": "patrickvonplaten/wav2vec2-base",
+        "mtl_custom_worker_task_type": "",
+        "mtl_custom_worker_tgt_type": "projected_quantized_states",
+        "mtl_kl_weight": 0.0,
+        "mtl_kl_weight_2": 0.0,
+        "mtl_l1_weight": 0.4,
+        "mtl_l1_weight_2": 0.4,
+        "mtl_length_normalized_loss": 1,
+        "mtl_length_normalized_loss_2": 1,
+        "mtl_mlp_hdim": 256,
+        "mtl_mlp_hdim_2": 256,
+        "mtl_mlp_nlayers": 1,
+        "mtl_mlp_nlayers_2": 1,
+        "mtl_mlp_nonlin_end": 0,
+        "mtl_mlp_nonlin_end_2": 0,
+        "mtl_mlp_nonlin_type": "relu",
+        "mtl_mlp_nonlin_type_2": "relu",
+        "mtl_task_layer": "conformer6",
+        "mtl_task_type": "l1",
+        "mtl_task_type_2": "l1",
+        "mtl_worker_source": "conv1d_lrs3_v04_lrs2",
+        "mtl_worker_source_2": "conv3d_lrs3_v04_lrs2_dual",
+        "mtlalpha": 0.1,
+        "n_iter_processes": 12,
+        "n_mels": 80,
+        "nbest": 1,
+        "ngpu": 1,
+        "num_encs": 1,
+        "num_input": 2,
+        "num_save_attention": 3,
+        "num_spkrs": 1,
+        "opt": "noam",
+        "patience": 0,
+        "penalty": 0.0,
+        "preprocess_conf": null,
+        "pretrain_dataset": "lrs2_full_dual_ignore",
+        "raw_max_freq_width": 150,
+        "raw_max_speed_rate": 1.1,
+        "raw_max_time_width": 0.4,
+        "raw_min_speed_rate": 0.9,
+        "raw_n_freq_mask": 2,
+        "raw_n_time_mask": 2,
+        "raw_speech_do_normalize": false,
+        "ref_channel": -1,
+        "rel_pos_type": "latest",
+        "relu_type": "swish",
+        "report_cer": false,
+        "report_interval_iters": 100,
+        "report_wer": false,
+        "rnnlm": null,
+        "rnnlm_conf": null,
+        "save_interval_iters": 0,
+        "seed": 1,
+        "sortagrad": 0,
+        "specaug_max_freq_width": 30,
+        "specaug_max_time_warp": 5,
+        "specaug_max_time_width": 40,
+        "specaug_n_freq_mask": 2,
+        "specaug_n_time_mask": 2,
+        "sr_interp_mode": "nearest",
+        "sr_interp_scale_factor": 1.0,
+        "stats_file": null,
+        "sym_blank": "<blank>",
+        "sym_space": "<space>",
+        "threshold": 0.0001,
+        "train_dtype": "float32",
+        "transformer_attn_dropout_rate": 0.1,
+        "transformer_encoder_attn_layer_type": "rel_mha",
+        "transformer_init": "pytorch",
+        "transformer_input_layer": "conv3d",
+        "transformer_length_normalized_loss": 0,
+        "transformer_warmup_steps": 25000,
+        "use_beamformer": true,
+        "use_cnn_module": 1,
+        "use_dnn_mask_for_wpe": false,
+        "use_freqmask": false,
+        "use_frontend": false,
+        "use_noiseaug": false,
+        "use_specaug": false,
+        "use_speedaug": false,
+        "use_timemask": false,
+        "use_v_adaptive_timemask": true,
+        "use_v_cutout": false,
+        "use_v_timemask": false,
+        "use_wpe": false,
+        "uttmvn_norm_means": true,
+        "uttmvn_norm_vars": false,
+        "v_cutout_max_hole_length": 22,
+        "v_cutout_n_holes": 1,
+        "v_raw_max_time_width": 0.4,
+        "v_raw_n_time_mask": 1,
+        "v_timemask_replace_with_zero": false,
+        "v_timemask_stride": 1.0,
+        "verbose": 0,
+        "wavaugments": null,
+        "wdropout_rate": 0.0,
+        "weight_decay": 0.0,
+        "wlayers": 2,
+        "wpe_delay": 3,
+        "wpe_taps": 5,
+        "wprojs": 300,
+        "wtype": "blstmp",
+        "wunits": 300,
+        "zero_triu": false
+    }
+]

benchmarks/LRS3/models/LRS3_V_WER19.1/model.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e740cef369abeabd0ba2c18e37a0661342e1d94d432d6caa77755a11821d8fe3
+size 1001908942

benchmarks/LRS3/models/README.md ADDED Viewed

	@@ -0,0 +1 @@


1	+ Put model folders here.