Titouan committed
Commit
916c9ff
1 Parent(s): 7c41d58

Update with respect to PR 1329: a better and smaller model.

Files changed (3):
  1. asr.ckpt +2 -2
  2. hyperparams.yaml +14 -11
  3. normalizer.ckpt +2 -2
asr.ckpt CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:01bfdfb2bbfcb99605a6ad68f6f539785bd598385c4020f579a0a16f9bd803f7
-size 654179252
+oid sha256:2db0253dbd1cc0116907d789252d0e6154522cdf686138c1c0a38137668d7fa7
+size 291335121
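Note: the checkpoint shrinks from ~654 MB to ~291 MB, which lines up with the smaller transformer configured in hyperparams.yaml below (d_model 768 -> 512, nhead 8 -> 4, d_ffn 3072 -> 2048). A back-of-envelope sketch of the trunk parameter count, assuming float32 storage and counting only the dominant attention and feed-forward weight matrices (embeddings, the CNN front end, biases and norms are ignored):

# Rough parameter count for the transformer trunk only; a sketch,
# not SpeechBrain's own accounting.
def trunk_params(d_model, d_ffn, n_enc, n_dec):
    attn = 4 * d_model * d_model   # Q, K, V and output projections
    ffn = 2 * d_model * d_ffn      # the two feed-forward projections
    return n_enc * (attn + ffn) + n_dec * (2 * attn + ffn)

old = trunk_params(768, 3072, 12, 6)   # ~141.6M params, ~566 MB in float32
new = trunk_params(512, 2048, 12, 6)   # ~62.9M params,  ~252 MB in float32
print(f"old ~{old / 1e6:.1f}M, new ~{new / 1e6:.1f}M")

The remaining checkpoint bytes (embeddings, CNN front end, output layers) account for the gap between these estimates and the LFS sizes above.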
hyperparams.yaml CHANGED
@@ -15,11 +15,11 @@ n_mels: 80
 
 ####################### Model parameters ###########################
 # Transformer
-d_model: 768
-nhead: 8
+d_model: 512
+nhead: 4
 num_encoder_layers: 12
 num_decoder_layers: 6
-d_ffn: 3072
+d_ffn: 2048
 transformer_dropout: 0.0
 activation: !name:torch.nn.GELU
 output_neurons: 5000
@@ -38,9 +38,9 @@ min_decode_ratio: 0.0
 max_decode_ratio: 1.0
 valid_search_interval: 10
 valid_beam_size: 10
-test_beam_size: 10
+test_beam_size: 66
 lm_weight: 0.60
-ctc_weight_decode: 0.52
+ctc_weight_decode: 0.40
 
 ############################## models ################################
 
@@ -48,14 +48,14 @@ CNN: !new:speechbrain.lobes.models.convolution.ConvolutionFrontEnd
 input_shape: (8, 10, 80)
 num_blocks: 3
 num_layers_per_block: 1
-out_channels: (128, 256, 512)
-kernel_sizes: (3, 3, 1)
+out_channels: (64, 64, 64)
+kernel_sizes: (5, 5, 1)
 strides: (2, 2, 1)
-residuals: (False, False, False)
-norm: !name:speechbrain.nnet.normalization.BatchNorm2d
+residuals: (False, False, True)
+norm: !name:speechbrain.nnet.normalization.LayerNorm
 
 Transformer: !new:speechbrain.lobes.models.transformer.TransformerASR.TransformerASR
-input_size: 10240
+input_size: 1280
 tgt_vocab: !ref <output_neurons>
 d_model: !ref <d_model>
 nhead: !ref <nhead>
@@ -64,7 +64,10 @@ Transformer: !new:speechbrain.lobes.models.transformer.TransformerASR.TransformerASR
 d_ffn: !ref <d_ffn>
 dropout: !ref <transformer_dropout>
 activation: !ref <activation>
-normalize_before: False
+encoder_module: transformer
+attention_type: regularMHA
+normalize_before: True
+causal: False
 
 ctc_lin: !new:speechbrain.nnet.linear.Linear
 input_size: !ref <d_model>
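Two of the new values are easy to sanity-check. First, input_size: the transformer consumes the flattened CNN output, i.e. the last out_channels entry times the downsampled feature dimension. With strides (2, 2, 1), two of the three blocks halve the 80 mel bins, so both the old and new values follow directly; a small sketch:

# Why input_size drops from 10240 to 1280 (a sanity check, not
# SpeechBrain code): flattened CNN output = out_channels[-1] * feat_bins.
n_mels = 80
feat_bins = n_mels // (2 * 2 * 1)   # strides (2, 2, 1) -> 80 / 4 = 20
assert 512 * feat_bins == 10240     # old: out_channels (128, 256, 512)
assert 64 * feat_bins == 1280       # new: out_channels (64, 64, 64)

Second, the decoding weights. SpeechBrain's joint CTC/attention beam search interpolates the attention decoder, CTC, and language-model scores; roughly (a sketch of the scoring rule, omitting the partial CTC prefix bookkeeping and length handling):

# Hedged sketch of how ctc_weight_decode and lm_weight combine per
# hypothesis during beam search (log-probability space).
def joint_score(logp_att, logp_ctc, logp_lm,
                ctc_weight=0.40, lm_weight=0.60):
    return ((1.0 - ctc_weight) * logp_att
            + ctc_weight * logp_ctc
            + lm_weight * logp_lm)

With ctc_weight_decode lowered to 0.40, decoding leans more on the attention decoder, and the much larger test_beam_size (10 -> 66) widens the test-time search.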
normalizer.ckpt CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:48683d4aeeabada648645b74d4d006f2947406c27b660739f3d96a660f10f8d5
-size 1793
+oid sha256:d5650a789fc12867073ec4d44e9b5343f3db2b043ebfbf5843a2c02db5ca0c69
+size 1703
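Since both LFS pointers changed, any locally cached copy of this model should be re-downloaded. A minimal usage sketch with SpeechBrain's pretrained interface; the source repo id below is a placeholder for this model's actual Hugging Face path:

# Load and run the updated model; "<org>/<this-model-repo>" is a
# placeholder, and a fresh savedir avoids reusing the old 654 MB asr.ckpt.
from speechbrain.pretrained import EncoderDecoderASR

asr_model = EncoderDecoderASR.from_hparams(
    source="<org>/<this-model-repo>",
    savedir="pretrained_models/asr-transformer",
)
print(asr_model.transcribe_file("example.wav"))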