Liusuthu committed
Commit 495b7f0 • Parent: 2ba8ae1

Upload folder using huggingface_hub

a.wav CHANGED
Binary files a/a.wav and b/a.wav differ
 
emotion-recognition-wav2vec2-IEMOCAP/hyperparams.yaml CHANGED
@@ -9,10 +9,10 @@ HPARAMS_NEEDED: ["encoder_dim", "out_n_neurons", "label_encoder", "softmax"]
  MODULES_NEEDED: ["wav2vec2", "avg_pool", "output_mlp", "LSTM"]

  # Feature parameters
- wav2vec2_hub: pretrained_models\facebook\wav2vecChinese
+ wav2vec2_hub: facebook\wav2vecChinese

  # Pretrain folder (HuggingFace)
- pretrained_path: pretrained_models\speechbrain\emotion-recognition-wav2vec2-IEMOCAP
+ pretrained_path: pretrained_path

  # parameters
  encoder_dim: 1024
facebook/wav2vecChinese/README.md ADDED
@@ -0,0 +1,61 @@
+ ---
+ license: mit
+ ---
+ Pretrained on the 10k-hour WenetSpeech L subset. More details in [TencentGameMate/chinese_speech_pretrain](https://github.com/TencentGameMate/chinese_speech_pretrain)
+
+ This model does not have a tokenizer, as it was pretrained on audio alone.
+ In order to use this model for speech recognition, a tokenizer should be created and the model should be fine-tuned on labeled text data.
+
+ Python package:
+ transformers==4.16.2
+
+ ```python
+ import torch
+ import torch.nn.functional as F
+ import soundfile as sf
+
+ from transformers import (
+     Wav2Vec2FeatureExtractor,
+     Wav2Vec2ForPreTraining,
+     Wav2Vec2Model,
+ )
+ from transformers.models.wav2vec2.modeling_wav2vec2 import _compute_mask_indices
+
+ model_path = ""
+ wav_path = ""
+ mask_prob = 0.0
+ mask_length = 10
+
+ # pick a device (the original snippet used `device` without defining it)
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+ feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(model_path)
+ model = Wav2Vec2Model.from_pretrained(model_path)
+
+ # for pretraining: Wav2Vec2ForPreTraining
+ # model = Wav2Vec2ForPreTraining.from_pretrained(model_path)
+
+ model = model.to(device)
+ model = model.half()
+ model.eval()
+
+ wav, sr = sf.read(wav_path)
+ input_values = feature_extractor(wav, return_tensors="pt").input_values
+ input_values = input_values.half()
+ input_values = input_values.to(device)
+
+ # for Wav2Vec2ForPreTraining
+ # batch_size, raw_sequence_length = input_values.shape
+ # sequence_length = model._get_feat_extract_output_lengths(raw_sequence_length)
+ # mask_time_indices = _compute_mask_indices((batch_size, sequence_length), mask_prob=0.0, mask_length=2)
+ # mask_time_indices = torch.tensor(mask_time_indices, device=input_values.device, dtype=torch.long)
+
+ with torch.no_grad():
+     outputs = model(input_values)
+     last_hidden_state = outputs.last_hidden_state
+
+ # for Wav2Vec2ForPreTraining
+ # outputs = model(input_values, mask_time_indices=mask_time_indices, output_hidden_states=True)
+ # last_hidden_state = outputs.hidden_states[-1]
+ ```
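As a hedged follow-up to the README snippet above (reusing its `last_hidden_state` variable, so this is an assumption about how one would continue that code): the frame-level wav2vec2 features can be mean-pooled over time into a single utterance embedding, which is essentially what the `avg_pool` (StatisticsPooling) module in the hyperparams files below does before classification.

```python
# Continues the README snippet: `last_hidden_state` has shape [batch, frames, 1024].
# Mean-pooling over the time axis yields one fixed-size vector per utterance,
# analogous to the avg_pool / StatisticsPooling stage in the SpeechBrain configs below.
utterance_embedding = last_hidden_state.mean(dim=1)  # shape: [batch, 1024]
print(utterance_embedding.shape)
```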
facebook/wav2vecChinese/config.json ADDED
@@ -0,0 +1,115 @@
+ {
+   "activation_dropout": 0.0,
+   "adapter_kernel_size": 3,
+   "adapter_stride": 2,
+   "add_adapter": false,
+   "apply_spec_augment": true,
+   "architectures": [
+     "Wav2Vec2ForPreTraining"
+   ],
+   "attention_dropout": 0.1,
+   "bos_token_id": 1,
+   "classifier_proj_size": 256,
+   "codevector_dim": 768,
+   "contrastive_logits_temperature": 0.1,
+   "conv_bias": true,
+   "conv_dim": [
+     512,
+     512,
+     512,
+     512,
+     512,
+     512,
+     512
+   ],
+   "conv_kernel": [
+     10,
+     3,
+     3,
+     3,
+     3,
+     2,
+     2
+   ],
+   "conv_stride": [
+     5,
+     2,
+     2,
+     2,
+     2,
+     2,
+     2
+   ],
+   "ctc_loss_reduction": "sum",
+   "ctc_zero_infinity": false,
+   "diversity_loss_weight": 0.1,
+   "do_stable_layer_norm": true,
+   "eos_token_id": 2,
+   "feat_extract_activation": "gelu",
+   "feat_extract_dropout": 0.0,
+   "feat_extract_norm": "layer",
+   "feat_proj_dropout": 0.1,
+   "feat_quantizer_dropout": 0.0,
+   "final_dropout": 0.0,
+   "gradient_checkpointing": false,
+   "hidden_act": "gelu",
+   "hidden_dropout": 0.1,
+   "hidden_size": 1024,
+   "initializer_range": 0.02,
+   "intermediate_size": 4096,
+   "layer_norm_eps": 1e-05,
+   "layerdrop": 0.1,
+   "mask_channel_length": 10,
+   "mask_channel_min_space": 1,
+   "mask_channel_other": 0.0,
+   "mask_channel_prob": 0.0,
+   "mask_channel_selection": "static",
+   "mask_feature_length": 10,
+   "mask_feature_min_masks": 0,
+   "mask_feature_prob": 0.0,
+   "mask_time_length": 10,
+   "mask_time_min_masks": 2,
+   "mask_time_min_space": 1,
+   "mask_time_other": 0.0,
+   "mask_time_prob": 0.075,
+   "mask_time_selection": "static",
+   "model_type": "wav2vec2",
+   "num_adapter_layers": 3,
+   "num_attention_heads": 16,
+   "num_codevector_groups": 2,
+   "num_codevectors_per_group": 320,
+   "num_conv_pos_embedding_groups": 16,
+   "num_conv_pos_embeddings": 128,
+   "num_feat_extract_layers": 7,
+   "num_hidden_layers": 24,
+   "num_negatives": 100,
+   "output_hidden_size": 1024,
+   "pad_token_id": 0,
+   "proj_codevector_dim": 768,
+   "tdnn_dilation": [
+     1,
+     2,
+     3,
+     1,
+     1
+   ],
+   "tdnn_dim": [
+     512,
+     512,
+     512,
+     512,
+     1500
+   ],
+   "tdnn_kernel": [
+     5,
+     3,
+     3,
+     1,
+     1
+   ],
+   "torch_dtype": "float32",
+   "transformers_version": "4.16.2",
+   "use_weighted_layer_sum": false,
+   "vocab_size": 32,
+   "xvector_output_dim": 512
+ }
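An illustrative sketch of reading this config with the 🤗 Transformers classes it names (`Wav2Vec2ForPreTraining`, `transformers==4.16.2` per the README above); the local relative path is an assumption about where the folder sits on disk.

```python
from transformers import Wav2Vec2Config, Wav2Vec2ForPreTraining

# Assumption: the facebook/wav2vecChinese folder from this commit is available
# as a local path relative to the working directory.
config = Wav2Vec2Config.from_pretrained("facebook/wav2vecChinese")
print(config.num_hidden_layers, config.hidden_size)  # 24, 1024 per config.json

# Building from the config alone creates the architecture with random weights;
# from_pretrained on the same folder would also load pytorch_model.bin.
model = Wav2Vec2ForPreTraining(config)
```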
facebook/wav2vecChinese/gitattributes.txt ADDED
@@ -0,0 +1,27 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zstandard filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
facebook/wav2vecChinese/hyperparams.yaml ADDED
@@ -0,0 +1,59 @@
+ # ############################################################################
+ # Model: WAV2VEC base for Emotion Recognition
+ # ############################################################################
+
+
+ # Hparams NEEDED
+ HPARAMS_NEEDED: ["encoder_dim", "out_n_neurons", "label_encoder", "softmax"]
+ # Modules Needed
+ MODULES_NEEDED: ["wav2vec2", "avg_pool", "output_mlp"]
+
+ # Feature parameters
+ wav2vec2_hub: wav2vecChinese
+
+ # Pretrain folder (HuggingFace)
+ pretrained_path: emotion-recognition-wav2vec2-IEMOCAP
+
+ # parameters
+ encoder_dim: 768
+ out_n_neurons: 4
+
+ wav2vec2: !new:speechbrain.lobes.models.huggingface_wav2vec.HuggingFaceWav2Vec2
+     source: D:/pycharm2020/code/yuyin_ChineseWav2vec/pretrained_models/facebook/wav2vec2-base
+     output_norm: True
+     freeze: True
+     save_path: wav2vec2_checkpoints
+
+ avg_pool: !new:speechbrain.nnet.pooling.StatisticsPooling
+     return_std: False
+
+ output_mlp: !new:speechbrain.nnet.linear.Linear
+     input_size: !ref <encoder_dim>
+     n_neurons: !ref <out_n_neurons>
+     bias: False
+
+ model: !new:torch.nn.ModuleList
+     - [!ref <output_mlp>]
+
+ modules:
+     wav2vec2: !ref <wav2vec2>
+     output_mlp: !ref <output_mlp>
+     avg_pool: !ref <avg_pool>
+
+ softmax: !new:speechbrain.nnet.activations.Softmax
+
+ label_encoder: !new:speechbrain.dataio.encoder.CategoricalEncoder
+
+ pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
+     loadables:
+         wav2vec2: !ref <wav2vec2>
+         model: !ref <model>
+         label_encoder: !ref <label_encoder>
+     paths:
+         wav2vec2: !ref <pretrained_path>/wav2vec2.ckpt
+         model: !ref <pretrained_path>/model.ckpt
+         label_encoder: !ref <pretrained_path>/label_encoder.txt
facebook/wav2vecChinese/preprocessor_config.json ADDED
@@ -0,0 +1,9 @@
+ {
+   "do_normalize": true,
+   "feature_extractor_type": "Wav2Vec2FeatureExtractor",
+   "feature_size": 1,
+   "padding_side": "right",
+   "padding_value": 0,
+   "return_attention_mask": true,
+   "sampling_rate": 16000
+ }
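The preprocessor expects mono 16 kHz audio (`sampling_rate: 16000`, `feature_size: 1`). A minimal sketch, assuming torchaudio is available, of bringing an arbitrary wav (for example the repo's `a.wav`) to that format before feature extraction:

```python
import torchaudio

wav, sr = torchaudio.load("a.wav")  # wav: [channels, samples]
if sr != 16000:
    # Resample to the 16 kHz rate declared in preprocessor_config.json
    wav = torchaudio.functional.resample(wav, orig_freq=sr, new_freq=16000)
wav = wav.mean(dim=0)  # downmix to mono (feature_size is 1)
```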
facebook/wav2vecChinese/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c8a5554a79c3bbbe76f2e43d3d4b4369c8c2abd5515e623192e0381d7e5e7b3f
+ size 1269726951
hyperparams.yaml ADDED
@@ -0,0 +1,65 @@
+ # ############################################################################
+ # Model: WAV2VEC base for Emotion Recognition
+ # ############################################################################
+
+
+ # Hparams NEEDED
+ HPARAMS_NEEDED: ["encoder_dim", "out_n_neurons", "label_encoder", "softmax"]
+ # Modules Needed
+ MODULES_NEEDED: ["wav2vec2", "avg_pool", "output_mlp", "LSTM"]
+
+ # Feature parameters
+ wav2vec2_hub: facebook\wav2vecChinese
+
+ # Pretrain folder (HuggingFace)
+ pretrained_path: emotion-recognition-wav2vec2-IEMOCAP
+
+ # parameters
+ encoder_dim: 1024
+ out_n_neurons: 2
+
+ wav2vec2: !new:speechbrain.lobes.models.huggingface_wav2vec.HuggingFaceWav2Vec2
+     source: !ref <wav2vec2_hub>
+     output_norm: True
+     freeze: True
+     save_path: wav2vec2_checkpoints
+
+ LSTM: !new:speechbrain.nnet.RNN.LSTM
+     input_size: !ref <encoder_dim>
+     hidden_size: !ref <encoder_dim>
+
+ avg_pool: !new:speechbrain.nnet.pooling.StatisticsPooling
+     return_std: False
+
+ output_mlp: !new:speechbrain.nnet.linear.Linear
+     input_size: !ref <encoder_dim>
+     n_neurons: !ref <out_n_neurons>
+     bias: False
+
+ model: !new:torch.nn.ModuleList
+     - [!ref <output_mlp>]
+
+ modules:
+     wav2vec2: !ref <wav2vec2>
+     output_mlp: !ref <output_mlp>
+     avg_pool: !ref <avg_pool>
+     LSTM: !ref <LSTM>
+
+ softmax: !new:speechbrain.nnet.activations.Softmax
+ log_softmax: !new:speechbrain.nnet.activations.Softmax
+     apply_log: True
+
+ label_encoder: !new:speechbrain.dataio.encoder.CategoricalEncoder
+
+ pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
+     loadables:
+         wav2vec2: !ref <wav2vec2>
+         model: !ref <model>
+         label_encoder: !ref <label_encoder>
+     paths:
+         wav2vec2: !ref <pretrained_path>/wav2vec2.ckpt
+         model: !ref <pretrained_path>/model.ckpt
+         label_encoder: !ref <pretrained_path>/label_encoder.txt
out.wav CHANGED
Binary files a/out.wav and b/out.wav differ
 
paraformer_logs/log/info.log CHANGED
@@ -391,3 +391,54 @@
  [INFO] 2024-02-19 11:37:47,247 [D:\DD\paraformer\runtime\python\utils\singleton.py:29] get_instance: create PuncOrtInferRuntimeSession instance finished
  [INFO] 2024-02-19 11:37:47,248 [D:\DD\paraformer\runtime\python\utils\singleton.py:29] get_instance: create CT_Transformer instance finished
  [INFO] 2024-02-19 11:37:47,249 [D:\DD\paraformer\runtime\python\cttPunctuator.py:26] __init__: Offline punctuator instance initialized.
+ [INFO] 2024-02-19 11:41:21,395 [D:\DD\paraformer\runtime\python\utils\singleton.py:26] get_instance: creating Campplus instance
+ [INFO] 2024-02-19 11:41:22,000 [D:\DD\paraformer\runtime\python\utils\singleton.py:29] get_instance: create Campplus instance finished
+ [INFO] 2024-02-19 11:41:25,167 [D:\DD\paraformer\runtime\python\paraformerInfer.py:51] __init__: Load onnx model dir at D:\DD\paraformer\onnx\asr_offline
+ [INFO] 2024-02-19 11:41:25,168 [D:\DD\paraformer\runtime\python\utils\singleton.py:26] get_instance: creating ParaformerOfflineModel instance
+ [INFO] 2024-02-19 11:41:25,175 [D:\DD\paraformer\runtime\python\utils\singleton.py:26] get_instance: creating AsrOfflineOrtInferRuntimeSession instance
+ [INFO] 2024-02-19 11:41:28,326 [D:\DD\paraformer\runtime\python\utils\singleton.py:29] get_instance: create AsrOfflineOrtInferRuntimeSession instance finished
+ [INFO] 2024-02-19 11:41:28,326 [D:\DD\paraformer\runtime\python\utils\singleton.py:29] get_instance: create ParaformerOfflineModel instance finished
+ [INFO] 2024-02-19 11:41:28,333 [D:\DD\paraformer\runtime\python\utils\singleton.py:26] get_instance: creating VadOrtInferRuntimeSession instance
+ [INFO] 2024-02-19 11:41:28,366 [D:\DD\paraformer\runtime\python\utils\singleton.py:29] get_instance: create VadOrtInferRuntimeSession instance finished
+ [INFO] 2024-02-19 11:41:28,367 [D:\DD\paraformer\runtime\python\cttPunctuator.py:24] __init__: Initializing punctuator instance with offline mode.
+ [INFO] 2024-02-19 11:41:28,368 [D:\DD\paraformer\runtime\python\utils\singleton.py:26] get_instance: creating CT_Transformer instance
+ [INFO] 2024-02-19 11:41:28,369 [D:\DD\paraformer\runtime\python\model\punc\punctuator.py:52] __init__: Loading config file D:\DD\paraformer\onnx\punc\config.pkl
+ [INFO] 2024-02-19 11:41:28,438 [D:\DD\paraformer\runtime\python\model\punc\punctuator.py:56] __init__: Loading config file D:\DD\paraformer\onnx\punc\config.pkl finished, takes 0.06800723075866699 s
+ [INFO] 2024-02-19 11:41:28,513 [D:\DD\paraformer\runtime\python\utils\singleton.py:26] get_instance: creating PuncOrtInferRuntimeSession instance
+ [INFO] 2024-02-19 11:41:32,325 [D:\DD\paraformer\runtime\python\utils\singleton.py:29] get_instance: create PuncOrtInferRuntimeSession instance finished
+ [INFO] 2024-02-19 11:41:32,326 [D:\DD\paraformer\runtime\python\utils\singleton.py:29] get_instance: create CT_Transformer instance finished
+ [INFO] 2024-02-19 11:41:32,327 [D:\DD\paraformer\runtime\python\cttPunctuator.py:26] __init__: Offline punctuator instance initialized.
+ [INFO] 2024-02-19 12:20:59,536 [D:\DD\paraformer\runtime\python\utils\singleton.py:26] get_instance: creating Campplus instance
+ [INFO] 2024-02-19 12:21:00,086 [D:\DD\paraformer\runtime\python\utils\singleton.py:29] get_instance: create Campplus instance finished
+ [INFO] 2024-02-19 12:21:03,528 [D:\DD\paraformer\runtime\python\paraformerInfer.py:51] __init__: Load onnx model dir at D:\DD\paraformer\onnx\asr_offline
+ [INFO] 2024-02-19 12:21:03,529 [D:\DD\paraformer\runtime\python\utils\singleton.py:26] get_instance: creating ParaformerOfflineModel instance
+ [INFO] 2024-02-19 12:21:03,534 [D:\DD\paraformer\runtime\python\utils\singleton.py:26] get_instance: creating AsrOfflineOrtInferRuntimeSession instance
+ [INFO] 2024-02-19 12:21:06,482 [D:\DD\paraformer\runtime\python\utils\singleton.py:29] get_instance: create AsrOfflineOrtInferRuntimeSession instance finished
+ [INFO] 2024-02-19 12:21:06,482 [D:\DD\paraformer\runtime\python\utils\singleton.py:29] get_instance: create ParaformerOfflineModel instance finished
+ [INFO] 2024-02-19 12:21:06,490 [D:\DD\paraformer\runtime\python\utils\singleton.py:26] get_instance: creating VadOrtInferRuntimeSession instance
+ [INFO] 2024-02-19 12:21:06,540 [D:\DD\paraformer\runtime\python\utils\singleton.py:29] get_instance: create VadOrtInferRuntimeSession instance finished
+ [INFO] 2024-02-19 12:21:06,542 [D:\DD\paraformer\runtime\python\cttPunctuator.py:24] __init__: Initializing punctuator instance with offline mode.
+ [INFO] 2024-02-19 12:21:06,542 [D:\DD\paraformer\runtime\python\utils\singleton.py:26] get_instance: creating CT_Transformer instance
+ [INFO] 2024-02-19 12:21:06,544 [D:\DD\paraformer\runtime\python\model\punc\punctuator.py:52] __init__: Loading config file D:\DD\paraformer\onnx\punc\config.pkl
+ [INFO] 2024-02-19 12:21:06,626 [D:\DD\paraformer\runtime\python\model\punc\punctuator.py:56] __init__: Loading config file D:\DD\paraformer\onnx\punc\config.pkl finished, takes 0.08150649070739746 s
+ [INFO] 2024-02-19 12:21:06,711 [D:\DD\paraformer\runtime\python\utils\singleton.py:26] get_instance: creating PuncOrtInferRuntimeSession instance
+ [INFO] 2024-02-19 12:21:10,434 [D:\DD\paraformer\runtime\python\utils\singleton.py:29] get_instance: create PuncOrtInferRuntimeSession instance finished
+ [INFO] 2024-02-19 12:21:10,435 [D:\DD\paraformer\runtime\python\utils\singleton.py:29] get_instance: create CT_Transformer instance finished
+ [INFO] 2024-02-19 12:21:10,436 [D:\DD\paraformer\runtime\python\cttPunctuator.py:26] __init__: Offline punctuator instance initialized.
+ [INFO] 2024-02-19 12:24:23,118 [D:\DD\paraformer\runtime\python\utils\singleton.py:26] get_instance: creating Campplus instance
+ [INFO] 2024-02-19 12:24:23,663 [D:\DD\paraformer\runtime\python\utils\singleton.py:29] get_instance: create Campplus instance finished
+ [INFO] 2024-02-19 12:24:27,366 [D:\DD\paraformer\runtime\python\paraformerInfer.py:51] __init__: Load onnx model dir at D:\DD\paraformer\onnx\asr_offline
+ [INFO] 2024-02-19 12:24:27,367 [D:\DD\paraformer\runtime\python\utils\singleton.py:26] get_instance: creating ParaformerOfflineModel instance
+ [INFO] 2024-02-19 12:24:27,371 [D:\DD\paraformer\runtime\python\utils\singleton.py:26] get_instance: creating AsrOfflineOrtInferRuntimeSession instance
+ [INFO] 2024-02-19 12:24:30,101 [D:\DD\paraformer\runtime\python\utils\singleton.py:29] get_instance: create AsrOfflineOrtInferRuntimeSession instance finished
+ [INFO] 2024-02-19 12:24:30,102 [D:\DD\paraformer\runtime\python\utils\singleton.py:29] get_instance: create ParaformerOfflineModel instance finished
+ [INFO] 2024-02-19 12:24:30,109 [D:\DD\paraformer\runtime\python\utils\singleton.py:26] get_instance: creating VadOrtInferRuntimeSession instance
+ [INFO] 2024-02-19 12:24:30,158 [D:\DD\paraformer\runtime\python\utils\singleton.py:29] get_instance: create VadOrtInferRuntimeSession instance finished
+ [INFO] 2024-02-19 12:24:30,159 [D:\DD\paraformer\runtime\python\cttPunctuator.py:24] __init__: Initializing punctuator instance with offline mode.
+ [INFO] 2024-02-19 12:24:30,160 [D:\DD\paraformer\runtime\python\utils\singleton.py:26] get_instance: creating CT_Transformer instance
+ [INFO] 2024-02-19 12:24:30,162 [D:\DD\paraformer\runtime\python\model\punc\punctuator.py:52] __init__: Loading config file D:\DD\paraformer\onnx\punc\config.pkl
+ [INFO] 2024-02-19 12:24:30,244 [D:\DD\paraformer\runtime\python\model\punc\punctuator.py:56] __init__: Loading config file D:\DD\paraformer\onnx\punc\config.pkl finished, takes 0.08200550079345703 s
+ [INFO] 2024-02-19 12:24:30,327 [D:\DD\paraformer\runtime\python\utils\singleton.py:26] get_instance: creating PuncOrtInferRuntimeSession instance
+ [INFO] 2024-02-19 12:24:33,888 [D:\DD\paraformer\runtime\python\utils\singleton.py:29] get_instance: create PuncOrtInferRuntimeSession instance finished
+ [INFO] 2024-02-19 12:24:33,889 [D:\DD\paraformer\runtime\python\utils\singleton.py:29] get_instance: create CT_Transformer instance finished
+ [INFO] 2024-02-19 12:24:33,889 [D:\DD\paraformer\runtime\python\cttPunctuator.py:26] __init__: Offline punctuator instance initialized.
pretrained_path/custom_interface.py ADDED
@@ -0,0 +1,221 @@
+ import torch
+ from speechbrain.pretrained import Pretrained
+
+
+ class CustomEncoderWav2vec2Classifier(Pretrained):
+     """A ready-to-use class for utterance-level classification (e.g., speaker-id,
+     language-id, emotion recognition, keyword spotting, etc.).
+
+     The class assumes that a self-supervised encoder like wav2vec2/hubert and a
+     classifier model are defined in the yaml file. If you want to convert the
+     predicted index into a corresponding text label, please provide the path of
+     the label_encoder in a variable called 'lab_encoder_file' within the yaml.
+
+     The class can be used either to run only the encoder (encode_batch()) to
+     extract embeddings or to run a classification step (classify_batch()).
+
+     Example
+     -------
+     >>> import torchaudio
+     >>> from speechbrain.pretrained import EncoderClassifier
+     >>> # Model is downloaded from the speechbrain HuggingFace repo
+     >>> tmpdir = getfixture("tmpdir")
+     >>> classifier = EncoderClassifier.from_hparams(
+     ...     source="speechbrain/spkrec-ecapa-voxceleb",
+     ...     savedir=tmpdir,
+     ... )
+
+     >>> # Compute embeddings
+     >>> signal, fs = torchaudio.load("samples/audio_samples/example1.wav")
+     >>> embeddings = classifier.encode_batch(signal)
+
+     >>> # Classification
+     >>> prediction = classifier.classify_batch(signal)
+     """
+
+     def __init__(self, *args, **kwargs):
+         super().__init__(*args, **kwargs)
+
+     def encode_batch(self, wavs, wav_lens=None, normalize=False):
+         """Encodes the input audio into a single vector embedding.
+
+         The waveforms should already be in the model's desired format.
+         You can call:
+         ``normalized = <this>.normalizer(signal, sample_rate)``
+         to get a correctly converted signal in most cases.
+
+         Arguments
+         ---------
+         wavs : torch.tensor
+             Batch of waveforms [batch, time, channels] or [batch, time]
+             depending on the model. Make sure the sample rate is fs=16000 Hz.
+         wav_lens : torch.tensor
+             Lengths of the waveforms relative to the longest one in the
+             batch, tensor of shape [batch]. The longest one should have
+             relative length 1.0 and others len(waveform) / max_length.
+             Used for ignoring padding.
+         normalize : bool
+             If True, it normalizes the embeddings with the statistics
+             contained in mean_var_norm_emb.
+
+         Returns
+         -------
+         torch.tensor
+             The encoded batch
+         """
+         # Manage single waveforms in input
+         if len(wavs.shape) == 1:
+             wavs = wavs.unsqueeze(0)
+
+         # Assign full length if wav_lens is not assigned
+         if wav_lens is None:
+             wav_lens = torch.ones(wavs.shape[0], device=self.device)
+
+         # Storing waveform in the specified device
+         wavs, wav_lens = wavs.to(self.device), wav_lens.to(self.device)
+         wavs = wavs.float()
+
+         # Computing features and embeddings
+         outputs = self.mods.wav2vec2(wavs)
+
+         # last dim will be used for adaptive average pooling
+         outputs = self.mods.avg_pool(outputs, wav_lens)
+         outputs = outputs.view(outputs.shape[0], -1)
+
+         return outputs
+
+     def classify_batch(self, wavs, wav_lens=None):
+         """Performs classification on top of the encoded features.
+
+         It returns the posterior probabilities, the index and, if the label
+         encoder is specified, also the text label.
+
+         Arguments
+         ---------
+         wavs : torch.tensor
+             Batch of waveforms [batch, time, channels] or [batch, time]
+             depending on the model. Make sure the sample rate is fs=16000 Hz.
+         wav_lens : torch.tensor
+             Lengths of the waveforms relative to the longest one in the
+             batch, tensor of shape [batch]. The longest one should have
+             relative length 1.0 and others len(waveform) / max_length.
+             Used for ignoring padding.
+
+         Returns
+         -------
+         out_prob
+             The log posterior probabilities of each class ([batch, N_class])
+         score
+             The value of the log-posterior for the best class ([batch,])
+         index
+             The indexes of the best class ([batch,])
+         text_lab
+             List with the text labels corresponding to the indexes
+             (a label encoder should be provided).
+         """
+         outputs = self.encode_batch(wavs, wav_lens)
+         # outputs = self.CH(wavs, wav_lens)
+         outputs = self.mods.output_mlp(outputs)
+         out_prob = self.hparams.softmax(outputs)
+         score, index = torch.max(out_prob, dim=-1)
+         text_lab = self.hparams.label_encoder.decode_torch(index)
+         return out_prob, score, index, text_lab
+
+     def CH(self, wavs, wav_lens=None):
+         # Alternative feature extractor based on a local chinese-hubert fairseq
+         # checkpoint; kept for reference and not used by classify_batch above.
+         import torch
+         import torch.nn.functional as F
+         import soundfile as sf
+         from fairseq import checkpoint_utils
+
+         device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+         model_path = r"D:\pycharm2020\code\yuyin_ChineseWav2vec\pretrained_models\Chinses_hubert\chinese-hubert-large-fairseq-ckpt.pt"
+         wav_path = wavs
+
+         def postprocess(feats, normalize=False):
+             if feats.dim() == 2:
+                 feats = feats.mean(-1)
+
+             assert feats.dim() == 1, feats.dim()
+
+             if normalize:
+                 with torch.no_grad():
+                     feats = F.layer_norm(feats, feats.shape)
+             return feats
+
+         print("loading model(s) from {}".format(model_path))
+         models, saved_cfg, task = checkpoint_utils.load_model_ensemble_and_task(
+             [model_path],
+             suffix="",
+         )
+         print("loaded model(s) from {}".format(model_path))
+         print(f"normalize: {saved_cfg.task.normalize}")
+
+         model = models[0]
+         model = model.to(device)
+         model = model.half()
+         model.eval()
+
+         # wav, sr = sf.read(wav_path)
+         # feat = torch.from_numpy(wav_path).float()
+         feat = postprocess(wav_path, normalize=saved_cfg.task.normalize)
+         feats = feat.view(1, -1)
+         padding_mask = torch.BoolTensor(feats.shape).fill_(False)
+         inputs = {
+             "source": feats.half().to(device),
+             "padding_mask": padding_mask.to(device),
+         }
+
+         with torch.no_grad():
+             logits = model.extract_features(**inputs)
+             outputs = self.mods.avg_pool(logits[0], wav_lens)
+             outputs = outputs.view(outputs.shape[0], -1)
+
+         return outputs
+
+     def classify_file(self, path):
+         """Classifies the given audio file into the given set of labels.
+
+         Arguments
+         ---------
+         path : str
+             Path to audio file to classify.
+
+         Returns
+         -------
+         out_prob
+             The log posterior probabilities of each class ([batch, N_class])
+         score
+             The value of the log-posterior for the best class ([batch,])
+         index
+             The indexes of the best class ([batch,])
+         text_lab
+             List with the text labels corresponding to the indexes
+             (a label encoder should be provided).
+         """
+         waveform = self.load_audio(path)
+         # Fake a batch:
+         batch = waveform.unsqueeze(0)
+         rel_length = torch.tensor([1.0])
+         outputs = self.encode_batch(batch, rel_length)
+         outputs = self.mods.output_mlp(outputs).squeeze(1)
+         out_prob = self.hparams.softmax(outputs)
+         score, index = torch.max(out_prob, dim=-1)
+         text_lab = self.hparams.label_encoder.decode_torch(index)
+         return out_prob, score, index, text_lab
+
+     def forward(self, wavs, wav_lens=None, normalize=False):
+         return self.encode_batch(
+             wavs=wavs, wav_lens=wav_lens, normalize=normalize
+         )
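A minimal sketch of how an interface like this is usually loaded through SpeechBrain's `foreign_class` helper. Which folder holds `hyperparams.yaml` versus the checkpoints in this repo is an assumption here (the commit contains both a root `hyperparams.yaml` and an `emotion-recognition-wav2vec2-IEMOCAP/` one), so treat the `source` value as illustrative:

```python
from speechbrain.pretrained.interfaces import foreign_class

# Assumption: `source` points at a folder containing hyperparams.yaml and
# custom_interface.py; the checkpoints are then resolved through its
# `pretrained_path` entry by the Pretrainer.
classifier = foreign_class(
    source="emotion-recognition-wav2vec2-IEMOCAP",
    pymodule_file="custom_interface.py",
    classname="CustomEncoderWav2vec2Classifier",
)

# classify_file returns log-probabilities, the best score, the class index, and the
# decoded label ('depress' / 'non_depress' per pretrained_path/label_encoder.txt).
out_prob, score, index, text_lab = classifier.classify_file("a.wav")
print(text_lab, float(score))
```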
pretrained_path/label_encoder.ckpt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b7bb126752aed05089d4f211eaf82ceab98c4a17c2813774b58091441064c6ad
+ size 73
pretrained_path/label_encoder.txt ADDED
@@ -0,0 +1,4 @@
+ 'depress' => 0
+ 'non_depress' => 1
+ ================
+ 'starting_index' => 0
pretrained_path/model.ckpt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:293e54327bc8c42df3028fa045a7c043a5c25d05085ae32d5af1c022820be1ae
+ size 8997
pretrained_path/wav2vec2.ckpt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:31fd83ae093ab6a97f04cd8612d419cca8fc78a0cf9f4b11b35dfc29dd5cd5e3
+ size 1261924189
speech.py CHANGED
@@ -75,4 +75,4 @@ demo = gr.Interface(
      ],
  )

- demo.launch()
+ demo.launch(share=True)
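For context on the `speech.py` change: `share=True` makes Gradio expose a temporary public `*.gradio.live` URL in addition to the local server. A minimal, self-contained sketch (the interface below is a placeholder, not the one defined in speech.py):

```python
import gradio as gr

# Placeholder app; speech.py defines its own fn/inputs/outputs.
def echo(text: str) -> str:
    return text

demo = gr.Interface(fn=echo, inputs="text", outputs="text")

# share=True additionally creates a temporary public link, which is handy when
# the app runs on a machine that is not directly reachable from the browser.
demo.launch(share=True)
```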