Liusuthu committed
Commit 495b7f0 • Parent: 2ba8ae1

Upload folder using huggingface_hub

a.wav CHANGED
Binary files a/a.wav and b/a.wav differ
 
emotion-recognition-wav2vec2-IEMOCAP/hyperparams.yaml CHANGED
@@ -9,10 +9,10 @@ HPARAMS_NEEDED: ["encoder_dim", "out_n_neurons", "label_encoder", "softmax"]
  MODULES_NEEDED: ["wav2vec2", "avg_pool", "output_mlp", "LSTM"]

  # Feature parameters
- wav2vec2_hub: pretrained_models\facebook\wav2vecChinese
+ wav2vec2_hub: facebook\wav2vecChinese

  # Pretrain folder (HuggingFace)
- pretrained_path: pretrained_models\speechbrain\emotion-recognition-wav2vec2-IEMOCAP
+ pretrained_path: pretrained_path

  # parameters
  encoder_dim: 1024
facebook/wav2vecChinese/README.md ADDED
@@ -0,0 +1,61 @@
+ ---
+ license: mit
+ ---
+ Pretrained on the 10k-hour WenetSpeech L subset. More details in [TencentGameMate/chinese_speech_pretrain](https://github.com/TencentGameMate/chinese_speech_pretrain)
+
+ This model does not have a tokenizer, as it was pretrained on audio alone.
+ In order to use this model for speech recognition, a tokenizer should be created and the model should be fine-tuned on labeled text data.
+
+ Python package:
+ transformers==4.16.2
+
+ ```python
+ import torch
+ import torch.nn.functional as F
+ import soundfile as sf
+
+ from transformers import (
+     Wav2Vec2FeatureExtractor,
+     Wav2Vec2ForPreTraining,
+     Wav2Vec2Model,
+ )
+ from transformers.models.wav2vec2.modeling_wav2vec2 import _compute_mask_indices
+
+ model_path = ""
+ wav_path = ""
+ mask_prob = 0.0
+ mask_length = 10
+
+ # pick a device (the original snippet used `device` without defining it)
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+ feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(model_path)
+ model = Wav2Vec2Model.from_pretrained(model_path)
+
+ # for pretraining: Wav2Vec2ForPreTraining
+ # model = Wav2Vec2ForPreTraining.from_pretrained(model_path)
+
+ model = model.to(device)
+ model = model.half()
+ model.eval()
+
+ wav, sr = sf.read(wav_path)
+ input_values = feature_extractor(wav, return_tensors="pt").input_values
+ input_values = input_values.half()
+ input_values = input_values.to(device)
+
+ # for Wav2Vec2ForPreTraining
+ # batch_size, raw_sequence_length = input_values.shape
+ # sequence_length = model._get_feat_extract_output_lengths(raw_sequence_length)
+ # mask_time_indices = _compute_mask_indices((batch_size, sequence_length), mask_prob=0.0, mask_length=2)
+ # mask_time_indices = torch.tensor(mask_time_indices, device=input_values.device, dtype=torch.long)
+
+ with torch.no_grad():
+     outputs = model(input_values)
+     last_hidden_state = outputs.last_hidden_state
+
+ # for Wav2Vec2ForPreTraining
+ # outputs = model(input_values, mask_time_indices=mask_time_indices, output_hidden_states=True)
+ # last_hidden_state = outputs.hidden_states[-1]
+ ```
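As a hedged follow-up to the README snippet above (reusing its `last_hidden_state` variable, so this is an assumption about how one would continue that code): the frame-level wav2vec2 features can be mean-pooled over time into a single utterance embedding, which is essentially what the `avg_pool` (StatisticsPooling) module in the hyperparams files below does before classification.

```python
# Continues the README snippet: `last_hidden_state` has shape [batch, frames, 1024].
# Mean-pooling over the time axis yields one fixed-size vector per utterance,
# analogous to the avg_pool / StatisticsPooling stage in the SpeechBrain configs below.
utterance_embedding = last_hidden_state.mean(dim=1)  # shape: [batch, 1024]
print(utterance_embedding.shape)
```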
facebook/wav2vecChinese/config.json ADDED
@@ -0,0 +1,115 @@
+ {
+   "activation_dropout": 0.0,
+   "adapter_kernel_size": 3,
+   "adapter_stride": 2,
+   "add_adapter": false,
+   "apply_spec_augment": true,
+   "architectures": [
+     "Wav2Vec2ForPreTraining"
+   ],
+   "attention_dropout": 0.1,
+   "bos_token_id": 1,
+   "classifier_proj_size": 256,
+   "codevector_dim": 768,
+   "contrastive_logits_temperature": 0.1,
+   "conv_bias": true,
+   "conv_dim": [
+     512,
+     512,
+     512,
+     512,
+     512,
+     512,
+     512
+   ],
+   "conv_kernel": [
+     10,
+     3,
+     3,
+     3,
+     3,
+     2,
+     2
+   ],
+   "conv_stride": [
+     5,
+     2,
+     2,
+     2,
+     2,
+     2,
+     2
+   ],
+   "ctc_loss_reduction": "sum",
+   "ctc_zero_infinity": false,
+   "diversity_loss_weight": 0.1,
+   "do_stable_layer_norm": true,
+   "eos_token_id": 2,
+   "feat_extract_activation": "gelu",
+   "feat_extract_dropout": 0.0,
+   "feat_extract_norm": "layer",
+   "feat_proj_dropout": 0.1,
+   "feat_quantizer_dropout": 0.0,
+   "final_dropout": 0.0,
+   "gradient_checkpointing": false,
+   "hidden_act": "gelu",
+   "hidden_dropout": 0.1,
+   "hidden_size": 1024,
+   "initializer_range": 0.02,
+   "intermediate_size": 4096,
+   "layer_norm_eps": 1e-05,
+   "layerdrop": 0.1,
+   "mask_channel_length": 10,
+   "mask_channel_min_space": 1,
+   "mask_channel_other": 0.0,
+   "mask_channel_prob": 0.0,
+   "mask_channel_selection": "static",
+   "mask_feature_length": 10,
+   "mask_feature_min_masks": 0,
+   "mask_feature_prob": 0.0,
+   "mask_time_length": 10,
+   "mask_time_min_masks": 2,
+   "mask_time_min_space": 1,
+   "mask_time_other": 0.0,
+   "mask_time_prob": 0.075,
+   "mask_time_selection": "static",
+   "model_type": "wav2vec2",
+   "num_adapter_layers": 3,
+   "num_attention_heads": 16,
+   "num_codevector_groups": 2,
+   "num_codevectors_per_group": 320,
+   "num_conv_pos_embedding_groups": 16,
+   "num_conv_pos_embeddings": 128,
+   "num_feat_extract_layers": 7,
+   "num_hidden_layers": 24,
+   "num_negatives": 100,
+   "output_hidden_size": 1024,
+   "pad_token_id": 0,
+   "proj_codevector_dim": 768,
+   "tdnn_dilation": [
+     1,
+     2,
+     3,
+     1,
+     1
+   ],
+   "tdnn_dim": [
+     512,
+     512,
+     512,
+     512,
+     1500
+   ],
+   "tdnn_kernel": [
+     5,
+     3,
+     3,
+     1,
+     1
+   ],
+   "torch_dtype": "float32",
+   "transformers_version": "4.16.2",
+   "use_weighted_layer_sum": false,
+   "vocab_size": 32,
+   "xvector_output_dim": 512
+ }
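An illustrative sketch of reading this config with the 🤗 Transformers classes it names (`Wav2Vec2ForPreTraining`, `transformers==4.16.2` per the README above); the local relative path is an assumption about where the folder sits on disk.

```python
from transformers import Wav2Vec2Config, Wav2Vec2ForPreTraining

# Assumption: the facebook/wav2vecChinese folder from this commit is available
# as a local path relative to the working directory.
config = Wav2Vec2Config.from_pretrained("facebook/wav2vecChinese")
print(config.num_hidden_layers, config.hidden_size)  # 24, 1024 per config.json

# Building from the config alone creates the architecture with random weights;
# from_pretrained on the same folder would also load pytorch_model.bin.
model = Wav2Vec2ForPreTraining(config)
```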
facebook/wav2vecChinese/gitattributes.txt ADDED
@@ -0,0 +1,27 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zstandard filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
facebook/wav2vecChinese/hyperparams.yaml ADDED
@@ -0,0 +1,59 @@
+ # ############################################################################
+ # Model: WAV2VEC base for Emotion Recognition
+ # ############################################################################
+
+
+ # Hparams NEEDED
+ HPARAMS_NEEDED: ["encoder_dim", "out_n_neurons", "label_encoder", "softmax"]
+ # Modules Needed
+ MODULES_NEEDED: ["wav2vec2", "avg_pool", "output_mlp"]
+
+ # Feature parameters
+ wav2vec2_hub: wav2vecChinese
+
+ # Pretrain folder (HuggingFace)
+ pretrained_path: emotion-recognition-wav2vec2-IEMOCAP
+
+ # parameters
+ encoder_dim: 768
+ out_n_neurons: 4
+
+ wav2vec2: !new:speechbrain.lobes.models.huggingface_wav2vec.HuggingFaceWav2Vec2
+     source: D:/pycharm2020/code/yuyin_ChineseWav2vec/pretrained_models/facebook/wav2vec2-base
+     output_norm: True
+     freeze: True
+     save_path: wav2vec2_checkpoints
+
+ avg_pool: !new:speechbrain.nnet.pooling.StatisticsPooling
+     return_std: False
+
+ output_mlp: !new:speechbrain.nnet.linear.Linear
+     input_size: !ref <encoder_dim>
+     n_neurons: !ref <out_n_neurons>
+     bias: False
+
+ model: !new:torch.nn.ModuleList
+     - [!ref <output_mlp>]
+
+ modules:
+     wav2vec2: !ref <wav2vec2>
+     output_mlp: !ref <output_mlp>
+     avg_pool: !ref <avg_pool>
+
+ softmax: !new:speechbrain.nnet.activations.Softmax
+
+ label_encoder: !new:speechbrain.dataio.encoder.CategoricalEncoder
+
+ pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
+     loadables:
+         wav2vec2: !ref <wav2vec2>
+         model: !ref <model>
+         label_encoder: !ref <label_encoder>
+     paths:
+         wav2vec2: !ref <pretrained_path>/wav2vec2.ckpt
+         model: !ref <pretrained_path>/model.ckpt
+         label_encoder: !ref <pretrained_path>/label_encoder.txt
facebook/wav2vecChinese/preprocessor_config.json ADDED
@@ -0,0 +1,9 @@
+ {
+   "do_normalize": true,
+   "feature_extractor_type": "Wav2Vec2FeatureExtractor",
+   "feature_size": 1,
+   "padding_side": "right",
+   "padding_value": 0,
+   "return_attention_mask": true,
+   "sampling_rate": 16000
+ }
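The preprocessor expects mono 16 kHz audio (`sampling_rate: 16000`, `feature_size: 1`). A minimal sketch, assuming torchaudio is available, of bringing an arbitrary wav (for example the repo's `a.wav`) to that format before feature extraction:

```python
import torchaudio

wav, sr = torchaudio.load("a.wav")  # wav: [channels, samples]
if sr != 16000:
    # Resample to the 16 kHz rate declared in preprocessor_config.json
    wav = torchaudio.functional.resample(wav, orig_freq=sr, new_freq=16000)
wav = wav.mean(dim=0)  # downmix to mono (feature_size is 1)
```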
facebook/wav2vecChinese/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c8a5554a79c3bbbe76f2e43d3d4b4369c8c2abd5515e623192e0381d7e5e7b3f
+ size 1269726951
hyperparams.yaml ADDED
@@ -0,0 +1,65 @@
+ # ############################################################################
+ # Model: WAV2VEC base for Emotion Recognition
+ # ############################################################################
+
+
+ # Hparams NEEDED
+ HPARAMS_NEEDED: ["encoder_dim", "out_n_neurons", "label_encoder", "softmax"]
+ # Modules Needed
+ MODULES_NEEDED: ["wav2vec2", "avg_pool", "output_mlp", "LSTM"]
+
+ # Feature parameters
+ wav2vec2_hub: facebook\wav2vecChinese
+
+ # Pretrain folder (HuggingFace)
+ pretrained_path: emotion-recognition-wav2vec2-IEMOCAP
+
+ # parameters
+ encoder_dim: 1024
+ out_n_neurons: 2
+
+ wav2vec2: !new:speechbrain.lobes.models.huggingface_wav2vec.HuggingFaceWav2Vec2
+     source: !ref <wav2vec2_hub>
+     output_norm: True
+     freeze: True
+     save_path: wav2vec2_checkpoints
+
+ LSTM: !new:speechbrain.nnet.RNN.LSTM
+     input_size: !ref <encoder_dim>
+     hidden_size: !ref <encoder_dim>
+
+ avg_pool: !new:speechbrain.nnet.pooling.StatisticsPooling
+     return_std: False
+
+ output_mlp: !new:speechbrain.nnet.linear.Linear
+     input_size: !ref <encoder_dim>
+     n_neurons: !ref <out_n_neurons>
+     bias: False
+
+ model: !new:torch.nn.ModuleList
+     - [!ref <output_mlp>]
+
+ modules:
+     wav2vec2: !ref <wav2vec2>
+     output_mlp: !ref <output_mlp>
+     avg_pool: !ref <avg_pool>
+     LSTM: !ref <LSTM>
+
+ softmax: !new:speechbrain.nnet.activations.Softmax
+ log_softmax: !new:speechbrain.nnet.activations.Softmax
+     apply_log: True
+
+ label_encoder: !new:speechbrain.dataio.encoder.CategoricalEncoder
+
+ pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
+     loadables:
+         wav2vec2: !ref <wav2vec2>
+         model: !ref <model>
+         label_encoder: !ref <label_encoder>
+     paths:
+         wav2vec2: !ref <pretrained_path>/wav2vec2.ckpt
+         model: !ref <pretrained_path>/model.ckpt
+         label_encoder: !ref <pretrained_path>/label_encoder.txt
out.wav CHANGED
Binary files a/out.wav and b/out.wav differ
 
paraformer_logs/log/info.log CHANGED
@@ -391,3 +391,54 @@
  [INFO] 2024-02-19 11:37:47,247 [D:\DD\paraformer\runtime\python\utils\singleton.py:29] get_instance: create PuncOrtInferRuntimeSession instance finished
  [INFO] 2024-02-19 11:37:47,248 [D:\DD\paraformer\runtime\python\utils\singleton.py:29] get_instance: create CT_Transformer instance finished
  [INFO] 2024-02-19 11:37:47,249 [D:\DD\paraformer\runtime\python\cttPunctuator.py:26] __init__: Offline punctuator instance initialized.
+ [INFO] 2024-02-19 11:41:21,395 [D:\DD\paraformer\runtime\python\utils\singleton.py:26] get_instance: creating Campplus instance
+ [INFO] 2024-02-19 11:41:22,000 [D:\DD\paraformer\runtime\python\utils\singleton.py:29] get_instance: create Campplus instance finished
+ [INFO] 2024-02-19 11:41:25,167 [D:\DD\paraformer\runtime\python\paraformerInfer.py:51] __init__: Load onnx model dir at D:\DD\paraformer\onnx\asr_offline
+ [INFO] 2024-02-19 11:41:25,168 [D:\DD\paraformer\runtime\python\utils\singleton.py:26] get_instance: creating ParaformerOfflineModel instance
+ [INFO] 2024-02-19 11:41:25,175 [D:\DD\paraformer\runtime\python\utils\singleton.py:26] get_instance: creating AsrOfflineOrtInferRuntimeSession instance
+ [INFO] 2024-02-19 11:41:28,326 [D:\DD\paraformer\runtime\python\utils\singleton.py:29] get_instance: create AsrOfflineOrtInferRuntimeSession instance finished
+ [INFO] 2024-02-19 11:41:28,326 [D:\DD\paraformer\runtime\python\utils\singleton.py:29] get_instance: create ParaformerOfflineModel instance finished
+ [INFO] 2024-02-19 11:41:28,333 [D:\DD\paraformer\runtime\python\utils\singleton.py:26] get_instance: creating VadOrtInferRuntimeSession instance
+ [INFO] 2024-02-19 11:41:28,366 [D:\DD\paraformer\runtime\python\utils\singleton.py:29] get_instance: create VadOrtInferRuntimeSession instance finished
+ [INFO] 2024-02-19 11:41:28,367 [D:\DD\paraformer\runtime\python\cttPunctuator.py:24] __init__: Initializing punctuator instance with offline mode.
+ [INFO] 2024-02-19 11:41:28,368 [D:\DD\paraformer\runtime\python\utils\singleton.py:26] get_instance: creating CT_Transformer instance
+ [INFO] 2024-02-19 11:41:28,369 [D:\DD\paraformer\runtime\python\model\punc\punctuator.py:52] __init__: Loading config file D:\DD\paraformer\onnx\punc\config.pkl
+ [INFO] 2024-02-19 11:41:28,438 [D:\DD\paraformer\runtime\python\model\punc\punctuator.py:56] __init__: Loading config file D:\DD\paraformer\onnx\punc\config.pkl finished, takes 0.06800723075866699 s
+ [INFO] 2024-02-19 11:41:28,513 [D:\DD\paraformer\runtime\python\utils\singleton.py:26] get_instance: creating PuncOrtInferRuntimeSession instance
+ [INFO] 2024-02-19 11:41:32,325 [D:\DD\paraformer\runtime\python\utils\singleton.py:29] get_instance: create PuncOrtInferRuntimeSession instance finished
+ [INFO] 2024-02-19 11:41:32,326 [D:\DD\paraformer\runtime\python\utils\singleton.py:29] get_instance: create CT_Transformer instance finished
+ [INFO] 2024-02-19 11:41:32,327 [D:\DD\paraformer\runtime\python\cttPunctuator.py:26] __init__: Offline punctuator instance initialized.
+ [INFO] 2024-02-19 12:20:59,536 [D:\DD\paraformer\runtime\python\utils\singleton.py:26] get_instance: creating Campplus instance
+ [INFO] 2024-02-19 12:21:00,086 [D:\DD\paraformer\runtime\python\utils\singleton.py:29] get_instance: create Campplus instance finished
+ [INFO] 2024-02-19 12:21:03,528 [D:\DD\paraformer\runtime\python\paraformerInfer.py:51] __init__: Load onnx model dir at D:\DD\paraformer\onnx\asr_offline
+ [INFO] 2024-02-19 12:21:03,529 [D:\DD\paraformer\runtime\python\utils\singleton.py:26] get_instance: creating ParaformerOfflineModel instance
+ [INFO] 2024-02-19 12:21:03,534 [D:\DD\paraformer\runtime\python\utils\singleton.py:26] get_instance: creating AsrOfflineOrtInferRuntimeSession instance
+ [INFO] 2024-02-19 12:21:06,482 [D:\DD\paraformer\runtime\python\utils\singleton.py:29] get_instance: create AsrOfflineOrtInferRuntimeSession instance finished
+ [INFO] 2024-02-19 12:21:06,482 [D:\DD\paraformer\runtime\python\utils\singleton.py:29] get_instance: create ParaformerOfflineModel instance finished
+ [INFO] 2024-02-19 12:21:06,490 [D:\DD\paraformer\runtime\python\utils\singleton.py:26] get_instance: creating VadOrtInferRuntimeSession instance
+ [INFO] 2024-02-19 12:21:06,540 [D:\DD\paraformer\runtime\python\utils\singleton.py:29] get_instance: create VadOrtInferRuntimeSession instance finished
+ [INFO] 2024-02-19 12:21:06,542 [D:\DD\paraformer\runtime\python\cttPunctuator.py:24] __init__: Initializing punctuator instance with offline mode.
+ [INFO] 2024-02-19 12:21:06,542 [D:\DD\paraformer\runtime\python\utils\singleton.py:26] get_instance: creating CT_Transformer instance
+ [INFO] 2024-02-19 12:21:06,544 [D:\DD\paraformer\runtime\python\model\punc\punctuator.py:52] __init__: Loading config file D:\DD\paraformer\onnx\punc\config.pkl
+ [INFO] 2024-02-19 12:21:06,626 [D:\DD\paraformer\runtime\python\model\punc\punctuator.py:56] __init__: Loading config file D:\DD\paraformer\onnx\punc\config.pkl finished, takes 0.08150649070739746 s
+ [INFO] 2024-02-19 12:21:06,711 [D:\DD\paraformer\runtime\python\utils\singleton.py:26] get_instance: creating PuncOrtInferRuntimeSession instance
+ [INFO] 2024-02-19 12:21:10,434 [D:\DD\paraformer\runtime\python\utils\singleton.py:29] get_instance: create PuncOrtInferRuntimeSession instance finished
+ [INFO] 2024-02-19 12:21:10,435 [D:\DD\paraformer\runtime\python\utils\singleton.py:29] get_instance: create CT_Transformer instance finished
+ [INFO] 2024-02-19 12:21:10,436 [D:\DD\paraformer\runtime\python\cttPunctuator.py:26] __init__: Offline punctuator instance initialized.
+ [INFO] 2024-02-19 12:24:23,118 [D:\DD\paraformer\runtime\python\utils\singleton.py:26] get_instance: creating Campplus instance
+ [INFO] 2024-02-19 12:24:23,663 [D:\DD\paraformer\runtime\python\utils\singleton.py:29] get_instance: create Campplus instance finished
+ [INFO] 2024-02-19 12:24:27,366 [D:\DD\paraformer\runtime\python\paraformerInfer.py:51] __init__: Load onnx model dir at D:\DD\paraformer\onnx\asr_offline
+ [INFO] 2024-02-19 12:24:27,367 [D:\DD\paraformer\runtime\python\utils\singleton.py:26] get_instance: creating ParaformerOfflineModel instance
+ [INFO] 2024-02-19 12:24:27,371 [D:\DD\paraformer\runtime\python\utils\singleton.py:26] get_instance: creating AsrOfflineOrtInferRuntimeSession instance
+ [INFO] 2024-02-19 12:24:30,101 [D:\DD\paraformer\runtime\python\utils\singleton.py:29] get_instance: create AsrOfflineOrtInferRuntimeSession instance finished
+ [INFO] 2024-02-19 12:24:30,102 [D:\DD\paraformer\runtime\python\utils\singleton.py:29] get_instance: create ParaformerOfflineModel instance finished
+ [INFO] 2024-02-19 12:24:30,109 [D:\DD\paraformer\runtime\python\utils\singleton.py:26] get_instance: creating VadOrtInferRuntimeSession instance
+ [INFO] 2024-02-19 12:24:30,158 [D:\DD\paraformer\runtime\python\utils\singleton.py:29] get_instance: create VadOrtInferRuntimeSession instance finished
+ [INFO] 2024-02-19 12:24:30,159 [D:\DD\paraformer\runtime\python\cttPunctuator.py:24] __init__: Initializing punctuator instance with offline mode.
+ [INFO] 2024-02-19 12:24:30,160 [D:\DD\paraformer\runtime\python\utils\singleton.py:26] get_instance: creating CT_Transformer instance
+ [INFO] 2024-02-19 12:24:30,162 [D:\DD\paraformer\runtime\python\model\punc\punctuator.py:52] __init__: Loading config file D:\DD\paraformer\onnx\punc\config.pkl
+ [INFO] 2024-02-19 12:24:30,244 [D:\DD\paraformer\runtime\python\model\punc\punctuator.py:56] __init__: Loading config file D:\DD\paraformer\onnx\punc\config.pkl finished, takes 0.08200550079345703 s
+ [INFO] 2024-02-19 12:24:30,327 [D:\DD\paraformer\runtime\python\utils\singleton.py:26] get_instance: creating PuncOrtInferRuntimeSession instance
+ [INFO] 2024-02-19 12:24:33,888 [D:\DD\paraformer\runtime\python\utils\singleton.py:29] get_instance: create PuncOrtInferRuntimeSession instance finished
+ [INFO] 2024-02-19 12:24:33,889 [D:\DD\paraformer\runtime\python\utils\singleton.py:29] get_instance: create CT_Transformer instance finished
+ [INFO] 2024-02-19 12:24:33,889 [D:\DD\paraformer\runtime\python\cttPunctuator.py:26] __init__: Offline punctuator instance initialized.
pretrained_path/custom_interface.py ADDED
@@ -0,0 +1,221 @@
+ import torch
+ from speechbrain.pretrained import Pretrained
+
+
+ class CustomEncoderWav2vec2Classifier(Pretrained):
+     """A ready-to-use class for utterance-level classification (e.g., speaker-id,
+     language-id, emotion recognition, keyword spotting, etc.).
+
+     The class assumes that a self-supervised encoder like wav2vec2/hubert and a
+     classifier model are defined in the yaml file. If you want to convert the
+     predicted index into a corresponding text label, please provide the path of
+     the label_encoder in a variable called 'lab_encoder_file' within the yaml.
+
+     The class can be used either to run only the encoder (encode_batch()) to
+     extract embeddings or to run a classification step (classify_batch()).
+
+     Example
+     -------
+     >>> import torchaudio
+     >>> from speechbrain.pretrained import EncoderClassifier
+     >>> # Model is downloaded from the speechbrain HuggingFace repo
+     >>> tmpdir = getfixture("tmpdir")
+     >>> classifier = EncoderClassifier.from_hparams(
+     ...     source="speechbrain/spkrec-ecapa-voxceleb",
+     ...     savedir=tmpdir,
+     ... )
+
+     >>> # Compute embeddings
+     >>> signal, fs = torchaudio.load("samples/audio_samples/example1.wav")
+     >>> embeddings = classifier.encode_batch(signal)
+
+     >>> # Classification
+     >>> prediction = classifier.classify_batch(signal)
+     """
+
+     def __init__(self, *args, **kwargs):
+         super().__init__(*args, **kwargs)
+
+     def encode_batch(self, wavs, wav_lens=None, normalize=False):
+         """Encodes the input audio into a single vector embedding.
+
+         The waveforms should already be in the model's desired format.
+         You can call:
+         ``normalized = <this>.normalizer(signal, sample_rate)``
+         to get a correctly converted signal in most cases.
+
+         Arguments
+         ---------
+         wavs : torch.tensor
+             Batch of waveforms [batch, time, channels] or [batch, time]
+             depending on the model. Make sure the sample rate is fs=16000 Hz.
+         wav_lens : torch.tensor
+             Lengths of the waveforms relative to the longest one in the
+             batch, tensor of shape [batch]. The longest one should have
+             relative length 1.0 and others len(waveform) / max_length.
+             Used for ignoring padding.
+         normalize : bool
+             If True, it normalizes the embeddings with the statistics
+             contained in mean_var_norm_emb.
+
+         Returns
+         -------
+         torch.tensor
+             The encoded batch
+         """
+         # Manage single waveforms in input
+         if len(wavs.shape) == 1:
+             wavs = wavs.unsqueeze(0)
+
+         # Assign full length if wav_lens is not assigned
+         if wav_lens is None:
+             wav_lens = torch.ones(wavs.shape[0], device=self.device)
+
+         # Storing waveform in the specified device
+         wavs, wav_lens = wavs.to(self.device), wav_lens.to(self.device)
+         wavs = wavs.float()
+
+         # Computing features and embeddings
+         outputs = self.mods.wav2vec2(wavs)
+
+         # last dim will be used for adaptive average pooling
+         outputs = self.mods.avg_pool(outputs, wav_lens)
+         outputs = outputs.view(outputs.shape[0], -1)
+
+         return outputs
+
+     def classify_batch(self, wavs, wav_lens=None):
+         """Performs classification on top of the encoded features.
+
+         It returns the posterior probabilities, the index and, if the label
+         encoder is specified, also the text label.
+
+         Arguments
+         ---------
+         wavs : torch.tensor
+             Batch of waveforms [batch, time, channels] or [batch, time]
+             depending on the model. Make sure the sample rate is fs=16000 Hz.
+         wav_lens : torch.tensor
+             Lengths of the waveforms relative to the longest one in the
+             batch, tensor of shape [batch]. The longest one should have
+             relative length 1.0 and others len(waveform) / max_length.
+             Used for ignoring padding.
+
+         Returns
+         -------
+         out_prob
+             The log posterior probabilities of each class ([batch, N_class])
+         score
+             The value of the log-posterior for the best class ([batch,])
+         index
+             The indexes of the best class ([batch,])
+         text_lab
+             List with the text labels corresponding to the indexes
+             (a label encoder should be provided).
+         """
+         outputs = self.encode_batch(wavs, wav_lens)
+         # outputs = self.CH(wavs, wav_lens)
+         outputs = self.mods.output_mlp(outputs)
+         out_prob = self.hparams.softmax(outputs)
+         score, index = torch.max(out_prob, dim=-1)
+         text_lab = self.hparams.label_encoder.decode_torch(index)
+         return out_prob, score, index, text_lab
+
+     def CH(self, wavs, wav_lens=None):
+         # Alternative feature extractor based on a local chinese-hubert fairseq
+         # checkpoint; kept for reference and not used by classify_batch above.
+         import torch
+         import torch.nn.functional as F
+         import soundfile as sf
+         from fairseq import checkpoint_utils
+
+         device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+         model_path = r"D:\pycharm2020\code\yuyin_ChineseWav2vec\pretrained_models\Chinses_hubert\chinese-hubert-large-fairseq-ckpt.pt"
+         wav_path = wavs
+
+         def postprocess(feats, normalize=False):
+             if feats.dim() == 2:
+                 feats = feats.mean(-1)
+
+             assert feats.dim() == 1, feats.dim()
+
+             if normalize:
+                 with torch.no_grad():
+                     feats = F.layer_norm(feats, feats.shape)
+             return feats
+
+         print("loading model(s) from {}".format(model_path))
+         models, saved_cfg, task = checkpoint_utils.load_model_ensemble_and_task(
+             [model_path],
+             suffix="",
+         )
+         print("loaded model(s) from {}".format(model_path))
+         print(f"normalize: {saved_cfg.task.normalize}")
+
+         model = models[0]
+         model = model.to(device)
+         model = model.half()
+         model.eval()
+
+         # wav, sr = sf.read(wav_path)
+         # feat = torch.from_numpy(wav_path).float()
+         feat = postprocess(wav_path, normalize=saved_cfg.task.normalize)
+         feats = feat.view(1, -1)
+         padding_mask = torch.BoolTensor(feats.shape).fill_(False)
+         inputs = {
+             "source": feats.half().to(device),
+             "padding_mask": padding_mask.to(device),
+         }
+
+         with torch.no_grad():
+             logits = model.extract_features(**inputs)
+             outputs = self.mods.avg_pool(logits[0], wav_lens)
+             outputs = outputs.view(outputs.shape[0], -1)
+
+         return outputs
+
+     def classify_file(self, path):
+         """Classifies the given audio file into the given set of labels.
+
+         Arguments
+         ---------
+         path : str
+             Path to audio file to classify.
+
+         Returns
+         -------
+         out_prob
+             The log posterior probabilities of each class ([batch, N_class])
+         score
+             The value of the log-posterior for the best class ([batch,])
+         index
+             The indexes of the best class ([batch,])
+         text_lab
+             List with the text labels corresponding to the indexes
+             (a label encoder should be provided).
+         """
+         waveform = self.load_audio(path)
+         # Fake a batch:
+         batch = waveform.unsqueeze(0)
+         rel_length = torch.tensor([1.0])
+         outputs = self.encode_batch(batch, rel_length)
+         outputs = self.mods.output_mlp(outputs).squeeze(1)
+         out_prob = self.hparams.softmax(outputs)
+         score, index = torch.max(out_prob, dim=-1)
+         text_lab = self.hparams.label_encoder.decode_torch(index)
+         return out_prob, score, index, text_lab
+
+     def forward(self, wavs, wav_lens=None, normalize=False):
+         return self.encode_batch(
+             wavs=wavs, wav_lens=wav_lens, normalize=normalize
+         )
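A minimal sketch of how an interface like this is usually loaded through SpeechBrain's `foreign_class` helper. Which folder holds `hyperparams.yaml` versus the checkpoints in this repo is an assumption here (the commit contains both a root `hyperparams.yaml` and an `emotion-recognition-wav2vec2-IEMOCAP/` one), so treat the `source` value as illustrative:

```python
from speechbrain.pretrained.interfaces import foreign_class

# Assumption: `source` points at a folder containing hyperparams.yaml and
# custom_interface.py; the checkpoints are then resolved through its
# `pretrained_path` entry by the Pretrainer.
classifier = foreign_class(
    source="emotion-recognition-wav2vec2-IEMOCAP",
    pymodule_file="custom_interface.py",
    classname="CustomEncoderWav2vec2Classifier",
)

# classify_file returns log-probabilities, the best score, the class index, and the
# decoded label ('depress' / 'non_depress' per pretrained_path/label_encoder.txt).
out_prob, score, index, text_lab = classifier.classify_file("a.wav")
print(text_lab, float(score))
```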
pretrained_path/label_encoder.ckpt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b7bb126752aed05089d4f211eaf82ceab98c4a17c2813774b58091441064c6ad
+ size 73
pretrained_path/label_encoder.txt ADDED
@@ -0,0 +1,4 @@
+ 'depress' => 0
+ 'non_depress' => 1
+ ================
+ 'starting_index' => 0
pretrained_path/model.ckpt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:293e54327bc8c42df3028fa045a7c043a5c25d05085ae32d5af1c022820be1ae
+ size 8997
pretrained_path/wav2vec2.ckpt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:31fd83ae093ab6a97f04cd8612d419cca8fc78a0cf9f4b11b35dfc29dd5cd5e3
+ size 1261924189
speech.py CHANGED
@@ -75,4 +75,4 @@ demo = gr.Interface(
      ],
  )

- demo.launch()
+ demo.launch(share=True)
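For context on the `speech.py` change: `share=True` makes Gradio expose a temporary public `*.gradio.live` URL in addition to the local server. A minimal, self-contained sketch (the interface below is a placeholder, not the one defined in speech.py):

```python
import gradio as gr

# Placeholder app; speech.py defines its own fn/inputs/outputs.
def echo(text: str) -> str:
    return text

demo = gr.Interface(fn=echo, inputs="text", outputs="text")

# share=True additionally creates a temporary public link, which is handy when
# the app runs on a machine that is not directly reachable from the browser.
demo.launch(share=True)
```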