dragonSwing commited on
Commit
6b9cca3
1 Parent(s): 38cffaa

Upload model

Browse files

Signed-off-by: Binh Le <binhnd234@gmail.com>

Files changed (4) hide show
  1. README.md +13 -0
  2. config.json +107 -0
  3. preprocessor_config.json +9 -0
  4. pytorch_model.bin +3 -0
README.md CHANGED
@@ -1,3 +1,16 @@
1
  ---
2
  license: cc-by-sa-4.0
 
 
 
 
3
  ---
 
 
 
 
 
 
 
 
 
1
  ---
2
  license: cc-by-sa-4.0
3
+ language: vi
4
+ tags:
5
+ - speech
6
+ - automatic-speech-recognition
7
  ---
8
+ # Wav2Vec2 base model trained of 1.5K hours of Vietnamese speech
9
+ The base model is pre-trained on 16kHz sampled speech audio from Vietnamese speech corpus containing 1.5K hours of reading and broadcasting speech. When using the model make sure that your speech input is also sampled at 16Khz. Note that this model should be fine-tuned on a downstream task, like Vietnamese Automatic Speech Recognition.
10
+ **Note**: This model does not have a tokenizer as it was pretrained on audio alone. In order to use this model speech recognition, a tokenizer should be created and the model should be fine-tuned on labeled text data. Check out [this blog](https://huggingface.co/blog/fine-tune-wav2vec2-english) for more in-detail explanation of how to fine-tune the model.
11
+ [Facebook's Wav2Vec2 blog](https://ai.facebook.com/blog/wav2vec-20-learning-the-structure-of-speech-from-raw-audio/)
12
+ [Paper](https://arxiv.org/abs/2006.11477)
13
+
14
+
15
+ # Usage
16
+ See [this notebook](https://colab.research.google.com/drive/1FjTsqbYKphl9kL-eILgUc-bl4zVThL8F?usp=sharing) for more information on how to fine-tune the English pre-trained model.
config.json ADDED
@@ -0,0 +1,107 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "/content/pretrain_models/m2",
3
+ "activation_dropout": 0.1,
4
+ "adapter_kernel_size": 3,
5
+ "adapter_stride": 2,
6
+ "add_adapter": false,
7
+ "apply_spec_augment": true,
8
+ "architectures": [
9
+ "Wav2Vec2ForPreTraining"
10
+ ],
11
+ "attention_dropout": 0.1,
12
+ "bos_token_id": 1,
13
+ "classifier_proj_size": 256,
14
+ "codevector_dim": 256,
15
+ "contrastive_logits_temperature": 0.1,
16
+ "conv_bias": false,
17
+ "conv_dim": [
18
+ 512,
19
+ 512,
20
+ 512,
21
+ 512,
22
+ 512,
23
+ 512,
24
+ 512
25
+ ],
26
+ "conv_kernel": [
27
+ 10,
28
+ 3,
29
+ 3,
30
+ 3,
31
+ 3,
32
+ 2,
33
+ 2
34
+ ],
35
+ "conv_stride": [
36
+ 5,
37
+ 2,
38
+ 2,
39
+ 2,
40
+ 2,
41
+ 2,
42
+ 2
43
+ ],
44
+ "ctc_loss_reduction": "sum",
45
+ "ctc_zero_infinity": false,
46
+ "diversity_loss_weight": 0.1,
47
+ "do_stable_layer_norm": false,
48
+ "eos_token_id": 2,
49
+ "feat_extract_activation": "gelu",
50
+ "feat_extract_norm": "group",
51
+ "feat_proj_dropout": 0.1,
52
+ "feat_quantizer_dropout": 0.0,
53
+ "final_dropout": 0.1,
54
+ "gradient_checkpointing": false,
55
+ "hidden_act": "gelu",
56
+ "hidden_dropout": 0.1,
57
+ "hidden_size": 768,
58
+ "initializer_range": 0.02,
59
+ "intermediate_size": 3072,
60
+ "layer_norm_eps": 1e-05,
61
+ "layerdrop": 0.1,
62
+ "mask_feature_length": 10,
63
+ "mask_feature_min_masks": 0,
64
+ "mask_feature_prob": 0.0,
65
+ "mask_time_length": 10,
66
+ "mask_time_min_masks": 2,
67
+ "mask_time_prob": 0.05,
68
+ "model_type": "wav2vec2",
69
+ "num_adapter_layers": 3,
70
+ "num_attention_heads": 12,
71
+ "num_codevector_groups": 2,
72
+ "num_codevectors_per_group": 320,
73
+ "num_conv_pos_embedding_groups": 16,
74
+ "num_conv_pos_embeddings": 128,
75
+ "num_feat_extract_layers": 7,
76
+ "num_hidden_layers": 12,
77
+ "num_negatives": 100,
78
+ "output_hidden_size": 768,
79
+ "pad_token_id": 0,
80
+ "proj_codevector_dim": 256,
81
+ "tdnn_dilation": [
82
+ 1,
83
+ 2,
84
+ 3,
85
+ 1,
86
+ 1
87
+ ],
88
+ "tdnn_dim": [
89
+ 512,
90
+ 512,
91
+ 512,
92
+ 512,
93
+ 1500
94
+ ],
95
+ "tdnn_kernel": [
96
+ 5,
97
+ 3,
98
+ 3,
99
+ 1,
100
+ 1
101
+ ],
102
+ "torch_dtype": "float32",
103
+ "transformers_version": "4.17.0",
104
+ "use_weighted_layer_sum": false,
105
+ "vocab_size": 32,
106
+ "xvector_output_dim": 512
107
+ }
preprocessor_config.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "do_normalize": false,
3
+ "feature_extractor_type": "Wav2Vec2FeatureExtractor",
4
+ "feature_size": 1,
5
+ "padding_side": "right",
6
+ "padding_value": 0.0,
7
+ "return_attention_mask": false,
8
+ "sampling_rate": 16000
9
+ }
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f88c3f4619831af349484ddf23dc5b740bf6786422805eb865583c7afdccdcc6
3
+ size 380263437