Liusuthu committed on
Commit
c0fd774
•
1 Parent(s): 0a6d3ed

Delete pretrained_models/facebook

Browse files
pretrained_models/facebook/wav2vecChinese/README.md DELETED
@@ -1,61 +0,0 @@
1
- ---
2
- license: mit
3
- ---
4
- Pretrained on 10k hours WenetSpeech L subset. More details in [TencentGameMate/chinese_speech_pretrain](https://github.com/TencentGameMate/chinese_speech_pretrain)
5
-
6
- This model does not have a tokenizer as it was pretrained on audio alone.
7
- In order to use this model for speech recognition, a tokenizer should be created and the model should be fine-tuned on labeled text data.
8
-
9
- python package:
10
- transformers==4.16.2
11
-
12
- ```python
13
-
14
-
15
- import torch
16
- import torch.nn.functional as F
17
- import soundfile as sf
18
- from fairseq import checkpoint_utils
19
-
20
- from transformers import (
21
- Wav2Vec2FeatureExtractor,
22
- Wav2Vec2ForPreTraining,
23
- Wav2Vec2Model,
24
- )
25
- from transformers.models.wav2vec2.modeling_wav2vec2 import _compute_mask_indices
26
-
27
- model_path=""
28
- wav_path=""
29
- mask_prob=0.0
30
- mask_length=10
31
-
32
- feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(model_path)
33
- model = Wav2Vec2Model.from_pretrained(model_path)
34
-
35
- # for pretrain: Wav2Vec2ForPreTraining
36
- # model = Wav2Vec2ForPreTraining.from_pretrained(model_path)
37
-
38
- model = model.to(device)
39
- model = model.half()
40
- model.eval()
41
-
42
- wav, sr = sf.read(wav_path)
43
- input_values = feature_extractor(wav, return_tensors="pt").input_values
44
- input_values = input_values.half()
45
- input_values = input_values.to(device)
46
-
47
- # for Wav2Vec2ForPreTraining
48
- # batch_size, raw_sequence_length = input_values.shape
49
- # sequence_length = model._get_feat_extract_output_lengths(raw_sequence_length)
50
- # mask_time_indices = _compute_mask_indices((batch_size, sequence_length), mask_prob=0.0, mask_length=2)
51
- # mask_time_indices = torch.tensor(mask_time_indices, device=input_values.device, dtype=torch.long)
52
-
53
- with torch.no_grad():
54
- outputs = model(input_values)
55
- last_hidden_state = outputs.last_hidden_state
56
-
57
- # for Wav2Vec2ForPreTraining
58
- # outputs = model(input_values, mask_time_indices=mask_time_indices, output_hidden_states=True)
59
- # last_hidden_state = outputs.hidden_states[-1]
60
-
61
- ```
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
pretrained_models/facebook/wav2vecChinese/config.json DELETED
@@ -1,115 +0,0 @@
1
- {
2
- "activation_dropout": 0.0,
3
- "adapter_kernel_size": 3,
4
- "adapter_stride": 2,
5
- "add_adapter": false,
6
- "apply_spec_augment": true,
7
- "architectures": [
8
- "Wav2Vec2ForPreTraining"
9
- ],
10
- "attention_dropout": 0.1,
11
- "bos_token_id": 1,
12
- "classifier_proj_size": 256,
13
- "codevector_dim": 768,
14
- "contrastive_logits_temperature": 0.1,
15
- "conv_bias": true,
16
- "conv_dim": [
17
- 512,
18
- 512,
19
- 512,
20
- 512,
21
- 512,
22
- 512,
23
- 512
24
- ],
25
- "conv_kernel": [
26
- 10,
27
- 3,
28
- 3,
29
- 3,
30
- 3,
31
- 2,
32
- 2
33
- ],
34
- "conv_stride": [
35
- 5,
36
- 2,
37
- 2,
38
- 2,
39
- 2,
40
- 2,
41
- 2
42
- ],
43
- "ctc_loss_reduction": "sum",
44
- "ctc_zero_infinity": false,
45
- "diversity_loss_weight": 0.1,
46
- "do_stable_layer_norm": true,
47
- "eos_token_id": 2,
48
- "feat_extract_activation": "gelu",
49
- "feat_extract_dropout": 0.0,
50
- "feat_extract_norm": "layer",
51
- "feat_proj_dropout": 0.1,
52
- "feat_quantizer_dropout": 0.0,
53
- "final_dropout": 0.0,
54
- "gradient_checkpointing": false,
55
- "hidden_act": "gelu",
56
- "hidden_dropout": 0.1,
57
- "hidden_size": 1024,
58
- "initializer_range": 0.02,
59
- "intermediate_size": 4096,
60
- "layer_norm_eps": 1e-05,
61
- "layerdrop": 0.1,
62
- "mask_channel_length": 10,
63
- "mask_channel_min_space": 1,
64
- "mask_channel_other": 0.0,
65
- "mask_channel_prob": 0.0,
66
- "mask_channel_selection": "static",
67
- "mask_feature_length": 10,
68
- "mask_feature_min_masks": 0,
69
- "mask_feature_prob": 0.0,
70
- "mask_time_length": 10,
71
- "mask_time_min_masks": 2,
72
- "mask_time_min_space": 1,
73
- "mask_time_other": 0.0,
74
- "mask_time_prob": 0.075,
75
- "mask_time_selection": "static",
76
- "model_type": "wav2vec2",
77
- "num_adapter_layers": 3,
78
- "num_attention_heads": 16,
79
- "num_codevector_groups": 2,
80
- "num_codevectors_per_group": 320,
81
- "num_conv_pos_embedding_groups": 16,
82
- "num_conv_pos_embeddings": 128,
83
- "num_feat_extract_layers": 7,
84
- "num_hidden_layers": 24,
85
- "num_negatives": 100,
86
- "output_hidden_size": 1024,
87
- "pad_token_id": 0,
88
- "proj_codevector_dim": 768,
89
- "tdnn_dilation": [
90
- 1,
91
- 2,
92
- 3,
93
- 1,
94
- 1
95
- ],
96
- "tdnn_dim": [
97
- 512,
98
- 512,
99
- 512,
100
- 512,
101
- 1500
102
- ],
103
- "tdnn_kernel": [
104
- 5,
105
- 3,
106
- 3,
107
- 1,
108
- 1
109
- ],
110
- "torch_dtype": "float32",
111
- "transformers_version": "4.16.2",
112
- "use_weighted_layer_sum": false,
113
- "vocab_size": 32,
114
- "xvector_output_dim": 512
115
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
pretrained_models/facebook/wav2vecChinese/gitattributes.txt DELETED
@@ -1,27 +0,0 @@
1
- *.7z filter=lfs diff=lfs merge=lfs -text
2
- *.arrow filter=lfs diff=lfs merge=lfs -text
3
- *.bin filter=lfs diff=lfs merge=lfs -text
4
- *.bz2 filter=lfs diff=lfs merge=lfs -text
5
- *.ftz filter=lfs diff=lfs merge=lfs -text
6
- *.gz filter=lfs diff=lfs merge=lfs -text
7
- *.h5 filter=lfs diff=lfs merge=lfs -text
8
- *.joblib filter=lfs diff=lfs merge=lfs -text
9
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
10
- *.model filter=lfs diff=lfs merge=lfs -text
11
- *.msgpack filter=lfs diff=lfs merge=lfs -text
12
- *.onnx filter=lfs diff=lfs merge=lfs -text
13
- *.ot filter=lfs diff=lfs merge=lfs -text
14
- *.parquet filter=lfs diff=lfs merge=lfs -text
15
- *.pb filter=lfs diff=lfs merge=lfs -text
16
- *.pt filter=lfs diff=lfs merge=lfs -text
17
- *.pth filter=lfs diff=lfs merge=lfs -text
18
- *.rar filter=lfs diff=lfs merge=lfs -text
19
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
20
- *.tar.* filter=lfs diff=lfs merge=lfs -text
21
- *.tflite filter=lfs diff=lfs merge=lfs -text
22
- *.tgz filter=lfs diff=lfs merge=lfs -text
23
- *.wasm filter=lfs diff=lfs merge=lfs -text
24
- *.xz filter=lfs diff=lfs merge=lfs -text
25
- *.zip filter=lfs diff=lfs merge=lfs -text
26
- *.zstandard filter=lfs diff=lfs merge=lfs -text
27
- *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
pretrained_models/facebook/wav2vecChinese/hyperparams.yaml DELETED
@@ -1,59 +0,0 @@
1
- # ############################################################################
2
- # Model: WAV2VEC base for Emotion Recognition
3
- # ############################################################################
4
-
5
-
6
- # Hparams NEEDED
7
- HPARAMS_NEEDED: ["encoder_dim", "out_n_neurons", "label_encoder", "softmax"]
8
- # Modules Needed
9
- MODULES_NEEDED: ["wav2vec2", "avg_pool", "output_mlp"]
10
-
11
- # Feature parameters
12
- wav2vec2_hub: wav2vecChinese
13
-
14
- # Pretrain folder (HuggingFace)
15
- pretrained_path: emotion-recognition-wav2vec2-IEMOCAP
16
-
17
- # parameters
18
- encoder_dim: 768
19
- out_n_neurons: 4
20
-
21
- wav2vec2: !new:speechbrain.lobes.models.huggingface_wav2vec.HuggingFaceWav2Vec2
22
- source: D:/pycharm2020/code/yuyin_ChineseWav2vec/pretrained_models/facebook/wav2vec2-base
23
- output_norm: True
24
- freeze: True
25
- save_path: wav2vec2_checkpoints
26
-
27
- avg_pool: !new:speechbrain.nnet.pooling.StatisticsPooling
28
- return_std: False
29
-
30
- output_mlp: !new:speechbrain.nnet.linear.Linear
31
- input_size: !ref <encoder_dim>
32
- n_neurons: !ref <out_n_neurons>
33
- bias: False
34
-
35
- model: !new:torch.nn.ModuleList
36
- - [!ref <output_mlp>]
37
-
38
- modules:
39
- wav2vec2: !ref <wav2vec2>
40
- output_mlp: !ref <output_mlp>
41
- avg_pool: !ref <avg_pool>
42
-
43
- softmax: !new:speechbrain.nnet.activations.Softmax
44
-
45
-
46
- label_encoder: !new:speechbrain.dataio.encoder.CategoricalEncoder
47
-
48
-
49
- pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
50
- loadables:
51
- wav2vec2: !ref <wav2vec2>
52
- model: !ref <model>
53
- label_encoder: !ref <label_encoder>
54
- paths:
55
- wav2vec2: !ref <pretrained_path>/wav2vec2.ckpt
56
- model: !ref <pretrained_path>/model.ckpt
57
- label_encoder: !ref <pretrained_path>/label_encoder.txt
58
-
59
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
pretrained_models/facebook/wav2vecChinese/preprocessor_config.json DELETED
@@ -1,9 +0,0 @@
1
- {
2
- "do_normalize": true,
3
- "feature_extractor_type": "Wav2Vec2FeatureExtractor",
4
- "feature_size": 1,
5
- "padding_side": "right",
6
- "padding_value": 0,
7
- "return_attention_mask": true,
8
- "sampling_rate": 16000
9
- }
 
 
 
 
 
 
 
 
 
 
pretrained_models/facebook/wav2vecChinese/pytorch_model.bin DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:c8a5554a79c3bbbe76f2e43d3d4b4369c8c2abd5515e623192e0381d7e5e7b3f
3
- size 1269726951