devashish-bhake committed
Commit fbfe17a
1 Parent(s): 08f5fbb

modified: SER_model/config.json
modified: SER_model/preprocessor_config.json
modified: SER_model/pytorch_model.bin
modified: SER_model/training_args.bin
modified: app.py

SER_model/config.json CHANGED
@@ -1,9 +1,6 @@
 {
-  "_name_or_path": "harshit345/xlsr-wav2vec-speech-emotion-recognition",
+  "_name_or_path": "lighteternal/wav2vec2-large-xlsr-53-greek",
   "activation_dropout": 0.0,
-  "adapter_kernel_size": 3,
-  "adapter_stride": 2,
-  "add_adapter": false,
   "apply_spec_augment": true,
   "architectures": [
     "Wav2Vec2ForSequenceClassification"
@@ -53,6 +50,7 @@
   "feat_quantizer_dropout": 0.0,
   "final_dropout": 0.0,
   "finetuning_task": "wav2vec2_clf",
+  "gradient_checkpointing": true,
   "hidden_act": "gelu",
   "hidden_dropout": 0.1,
   "hidden_size": 1024,
@@ -80,16 +78,13 @@
   "mask_channel_prob": 0.0,
   "mask_channel_selection": "static",
   "mask_feature_length": 10,
-  "mask_feature_min_masks": 0,
   "mask_feature_prob": 0.0,
   "mask_time_length": 10,
-  "mask_time_min_masks": 2,
   "mask_time_min_space": 1,
   "mask_time_other": 0.0,
   "mask_time_prob": 0.05,
   "mask_time_selection": "static",
   "model_type": "wav2vec2",
-  "num_adapter_layers": 3,
   "num_attention_heads": 16,
   "num_codevector_groups": 2,
   "num_codevectors_per_group": 320,
@@ -98,35 +93,12 @@
   "num_feat_extract_layers": 7,
   "num_hidden_layers": 24,
   "num_negatives": 100,
-  "output_hidden_size": 1024,
   "pad_token_id": 54,
   "pooling_mode": "mean",
   "problem_type": "single_label_classification",
   "proj_codevector_dim": 256,
-  "tdnn_dilation": [
-    1,
-    2,
-    3,
-    1,
-    1
-  ],
-  "tdnn_dim": [
-    512,
-    512,
-    512,
-    512,
-    1500
-  ],
-  "tdnn_kernel": [
-    5,
-    3,
-    3,
-    1,
-    1
-  ],
   "torch_dtype": "float32",
-  "transformers_version": "4.18.0",
+  "transformers_version": "4.11.0.dev0",
   "use_weighted_layer_sum": false,
-  "vocab_size": 55,
-  "xvector_output_dim": 512
+  "vocab_size": 55
 }
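Note: the config was regenerated against transformers 4.11.0.dev0 (down from 4.18.0) after switching the base checkpoint from harshit345/xlsr-wav2vec-speech-emotion-recognition to lighteternal/wav2vec2-large-xlsr-53-greek. That older version predates the adapter, TDNN, and x-vector config keys, which is why they drop out, and it still accepts gradient_checkpointing as a config field (recent releases expect model.gradient_checkpointing_enable() instead). A minimal sketch of loading the regenerated config, assuming the repo is pulled locally into ./SER_model:

import transformers
from transformers import AutoConfig

# Path is illustrative; custom keys such as pooling_mode are kept as attributes.
config = AutoConfig.from_pretrained("./SER_model")
print(config._name_or_path)    # lighteternal/wav2vec2-large-xlsr-53-greek
print(config.finetuning_task)  # wav2vec2_clf
print(config.pooling_mode)     # mean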
SER_model/preprocessor_config.json CHANGED
@@ -4,6 +4,6 @@
   "feature_size": 1,
   "padding_side": "right",
   "padding_value": 0.0,
-  "return_attention_mask": false,
+  "return_attention_mask": true,
   "sampling_rate": 16000
 }
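Flipping return_attention_mask to true matches the large XLSR (layer-norm) wav2vec2 variants, which expect padded frames to be masked out. A short sketch of the effect, assuming a local ./SER_model directory:

import numpy as np
from transformers import Wav2Vec2FeatureExtractor

fe = Wav2Vec2FeatureExtractor.from_pretrained("./SER_model")
batch = fe([np.zeros(16000), np.zeros(8000)], sampling_rate=16000,
           padding=True, return_tensors="np")
# The padded batch now carries an attention_mask alongside input_values,
# distinguishing real audio frames from padding.
print(batch["input_values"].shape, batch["attention_mask"].shape)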
SER_model/pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:86f15f40f02086b67ac11d9a0d86ea0543e566b4b5ea68ebe40727039ba10283
-size 1262954093
+oid sha256:6fca4831614ee2cf814899e61045641219ed7f3f7dc12e95c1ed1f99ccecd501
+size 1266137389
SER_model/training_args.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:831905a10c8c6e96d743a653765f1b83c8f209b13341540da2ef34e8b0263a13
-size 3119
+oid sha256:edfa74cde1a819557f67b8d79ebbc972342f238ccab6b70489c4baf413332bb7
+size 2799
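Both .bin files are Git LFS pointers, so only the oid/size lines change; the new pytorch_model.bin is the retrained checkpoint (about 1.27 GB). To verify a pulled checkpoint against the new pointer, a sketch:

import hashlib

def sha256_of(path, chunk_size=1 << 20):
    # Stream the file so the 1.27 GB checkpoint never sits in memory at once.
    digest = hashlib.sha256()
    with open(path, "rb") as f:
        for block in iter(lambda: f.read(chunk_size), b""):
            digest.update(block)
    return digest.hexdigest()

assert sha256_of("SER_model/pytorch_model.bin") == (
    "6fca4831614ee2cf814899e61045641219ed7f3f7dc12e95c1ed1f99ccecd501")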
app.py CHANGED
@@ -14,14 +14,13 @@ def speech_file_to_array_fn(path, sampling_rate):
     try:
         speech_array, _sampling_rate = torchaudio.load(path)
         resampler = torchaudio.transforms.Resample(_sampling_rate)
-        speech = resampler(speech_array).squeeze().numpy()
+        speech = resampler(speech_array[1]).squeeze().numpy()
         return speech
     except:
         speech_array, _sampling_rate = torchaudio.load(path)
         resampler = torchaudio.transforms.Resample(_sampling_rate)
-        speech = resampler(speech_array[1]).squeeze().numpy()
+        speech = resampler(speech_array).squeeze().numpy()
         return speech
-
 
 
 def predict(path, sampling_rate, feature_extractor, device, model, config):
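This hunk swaps which branch indexes a channel: speech_array[1] selects the second channel of a multi-channel tensor and raises IndexError on mono input, which the bare except then catches, so the new ordering effectively tries the multi-channel path first. A more defensive variant (a sketch, not the committed code) downmixes instead of indexing and resamples to an explicit target rate:

import torchaudio

def speech_file_to_array_fn(path, sampling_rate):
    # Load once, average all channels to mono, and resample to the
    # feature extractor's rate rather than Resample's default.
    speech_array, orig_rate = torchaudio.load(path)  # (channels, frames)
    mono = speech_array.mean(dim=0)
    resampler = torchaudio.transforms.Resample(orig_rate, sampling_rate)
    return resampler(mono).numpy()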
@@ -59,8 +58,8 @@ def get_sos_status(transcription, key_phrase):
 
 def main(audio):
     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-    SPT_MODEL = "./SPT_model"
-    model_name_or_path = "./SER_model"
+    SPT_MODEL = "D:\kaggle_practice\KJSCE_hack\SERModel\SPT_model"
+    model_name_or_path = "D:\kaggle_practice\KJSCE_hack\SERModel\SER_model"
     config = AutoConfig.from_pretrained(model_name_or_path)
     feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(model_name_or_path)
     sampling_rate = feature_extractor.sampling_rate
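The new hard-coded D:\ paths only work because \k, \K, and \S happen not to be recognized escape sequences; a segment beginning with \t or \n would be silently mangled, and absolute local paths will not resolve on a hosted Space, where the earlier relative ./SPT_model and ./SER_model were the portable choice. If absolute Windows paths are intended, raw strings are the safer spelling (a sketch using the exact paths from this commit):

from pathlib import Path

# Raw strings prevent Python from interpreting backslash escapes.
SPT_MODEL = Path(r"D:\kaggle_practice\KJSCE_hack\SERModel\SPT_model")
model_name_or_path = Path(r"D:\kaggle_practice\KJSCE_hack\SERModel\SER_model")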
@@ -80,8 +79,8 @@ def main(audio):
         emotion = i['Emotion']
         if emotion in ['disgust', 'fear', 'sadness']:
             emotion = 'negative'
-        elif emotion == 'anger':
-            emotion = 'anger'
+        elif emotion == 'neutral':
+            emotion = 'neutral'
         else:
             emotion = 'positive'
 
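Semantics worth noting: anger previously kept its own label, but after this change it falls through to the else branch and is reported as positive, while neutral gains a pass-through case. If grouping anger with the negative emotions was the intent, a table-driven mapping keeps every bucket explicit (a sketch; the anger-to-negative assignment is an assumption, not what this commit does):

# Explicit label buckets; unknown labels still default to 'positive'.
EMOTION_GROUPS = {
    "disgust": "negative",
    "fear": "negative",
    "sadness": "negative",
    "anger": "negative",   # assumption: treat anger as negative
    "neutral": "neutral",
}
emotion = EMOTION_GROUPS.get(i["Emotion"], "positive")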