devashish-bhake committed
Commit fbfe17a
1 Parent(s): 08f5fbb

modified: SER_model/config.json
modified: SER_model/preprocessor_config.json
modified: SER_model/pytorch_model.bin
modified: SER_model/training_args.bin
modified: app.py

SER_model/config.json CHANGED
@@ -1,9 +1,6 @@
 {
-  "_name_or_path": "harshit345/xlsr-wav2vec-speech-emotion-recognition",
+  "_name_or_path": "lighteternal/wav2vec2-large-xlsr-53-greek",
   "activation_dropout": 0.0,
-  "adapter_kernel_size": 3,
-  "adapter_stride": 2,
-  "add_adapter": false,
   "apply_spec_augment": true,
   "architectures": [
     "Wav2Vec2ForSequenceClassification"
@@ -53,6 +50,7 @@
   "feat_quantizer_dropout": 0.0,
   "final_dropout": 0.0,
   "finetuning_task": "wav2vec2_clf",
+  "gradient_checkpointing": true,
   "hidden_act": "gelu",
   "hidden_dropout": 0.1,
   "hidden_size": 1024,
@@ -80,16 +78,13 @@
   "mask_channel_prob": 0.0,
   "mask_channel_selection": "static",
   "mask_feature_length": 10,
-  "mask_feature_min_masks": 0,
   "mask_feature_prob": 0.0,
   "mask_time_length": 10,
-  "mask_time_min_masks": 2,
   "mask_time_min_space": 1,
   "mask_time_other": 0.0,
   "mask_time_prob": 0.05,
   "mask_time_selection": "static",
   "model_type": "wav2vec2",
-  "num_adapter_layers": 3,
   "num_attention_heads": 16,
   "num_codevector_groups": 2,
   "num_codevectors_per_group": 320,
@@ -98,35 +93,12 @@
   "num_feat_extract_layers": 7,
   "num_hidden_layers": 24,
   "num_negatives": 100,
-  "output_hidden_size": 1024,
   "pad_token_id": 54,
   "pooling_mode": "mean",
   "problem_type": "single_label_classification",
   "proj_codevector_dim": 256,
-  "tdnn_dilation": [
-    1,
-    2,
-    3,
-    1,
-    1
-  ],
-  "tdnn_dim": [
-    512,
-    512,
-    512,
-    512,
-    1500
-  ],
-  "tdnn_kernel": [
-    5,
-    3,
-    3,
-    1,
-    1
-  ],
   "torch_dtype": "float32",
-  "transformers_version": "4.18.0",
+  "transformers_version": "4.11.0.dev0",
   "use_weighted_layer_sum": false,
-  "vocab_size": 55,
-  "xvector_output_dim": 512
+  "vocab_size": 55
 }
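Note: the config was regenerated against transformers 4.11.0.dev0 (down from 4.18.0) after switching the base checkpoint from harshit345/xlsr-wav2vec-speech-emotion-recognition to lighteternal/wav2vec2-large-xlsr-53-greek. That older version predates the adapter, TDNN, and x-vector config keys, which is why they drop out, and it still accepts gradient_checkpointing as a config field (recent releases expect model.gradient_checkpointing_enable() instead). A minimal sketch of loading the regenerated config, assuming the repo is pulled locally into ./SER_model:

import transformers
from transformers import AutoConfig

# Path is illustrative; custom keys such as pooling_mode are kept as attributes.
config = AutoConfig.from_pretrained("./SER_model")
print(config._name_or_path)    # lighteternal/wav2vec2-large-xlsr-53-greek
print(config.finetuning_task)  # wav2vec2_clf
print(config.pooling_mode)     # mean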
SER_model/preprocessor_config.json CHANGED
@@ -4,6 +4,6 @@
   "feature_size": 1,
   "padding_side": "right",
   "padding_value": 0.0,
-  "return_attention_mask": false,
+  "return_attention_mask": true,
   "sampling_rate": 16000
 }
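Flipping return_attention_mask to true matches the large XLSR (layer-norm) wav2vec2 variants, which expect padded frames to be masked out. A short sketch of the effect, assuming a local ./SER_model directory:

import numpy as np
from transformers import Wav2Vec2FeatureExtractor

fe = Wav2Vec2FeatureExtractor.from_pretrained("./SER_model")
batch = fe([np.zeros(16000), np.zeros(8000)], sampling_rate=16000,
           padding=True, return_tensors="np")
# The padded batch now carries an attention_mask alongside input_values,
# distinguishing real audio frames from padding.
print(batch["input_values"].shape, batch["attention_mask"].shape)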
SER_model/pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:86f15f40f02086b67ac11d9a0d86ea0543e566b4b5ea68ebe40727039ba10283
-size 1262954093
+oid sha256:6fca4831614ee2cf814899e61045641219ed7f3f7dc12e95c1ed1f99ccecd501
+size 1266137389
SER_model/training_args.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:831905a10c8c6e96d743a653765f1b83c8f209b13341540da2ef34e8b0263a13
-size 3119
+oid sha256:edfa74cde1a819557f67b8d79ebbc972342f238ccab6b70489c4baf413332bb7
+size 2799
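Both .bin files are Git LFS pointers, so only the oid/size lines change; the new pytorch_model.bin is the retrained checkpoint (about 1.27 GB). To verify a pulled checkpoint against the new pointer, a sketch:

import hashlib

def sha256_of(path, chunk_size=1 << 20):
    # Stream the file so the 1.27 GB checkpoint never sits in memory at once.
    digest = hashlib.sha256()
    with open(path, "rb") as f:
        for block in iter(lambda: f.read(chunk_size), b""):
            digest.update(block)
    return digest.hexdigest()

assert sha256_of("SER_model/pytorch_model.bin") == (
    "6fca4831614ee2cf814899e61045641219ed7f3f7dc12e95c1ed1f99ccecd501")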
app.py CHANGED
@@ -14,14 +14,13 @@ def speech_file_to_array_fn(path, sampling_rate):
     try:
         speech_array, _sampling_rate = torchaudio.load(path)
         resampler = torchaudio.transforms.Resample(_sampling_rate)
-        speech = resampler(speech_array).squeeze().numpy()
+        speech = resampler(speech_array[1]).squeeze().numpy()
         return speech
     except:
         speech_array, _sampling_rate = torchaudio.load(path)
         resampler = torchaudio.transforms.Resample(_sampling_rate)
-        speech = resampler(speech_array[1]).squeeze().numpy()
+        speech = resampler(speech_array).squeeze().numpy()
         return speech
-
 
 
 def predict(path, sampling_rate, feature_extractor, device, model, config):
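This hunk swaps which branch indexes a channel: speech_array[1] selects the second channel of a multi-channel tensor and raises IndexError on mono input, which the bare except then catches, so the new ordering effectively tries the multi-channel path first. A more defensive variant (a sketch, not the committed code) downmixes instead of indexing and resamples to an explicit target rate:

import torchaudio

def speech_file_to_array_fn(path, sampling_rate):
    # Load once, average all channels to mono, and resample to the
    # feature extractor's rate rather than Resample's default.
    speech_array, orig_rate = torchaudio.load(path)  # (channels, frames)
    mono = speech_array.mean(dim=0)
    resampler = torchaudio.transforms.Resample(orig_rate, sampling_rate)
    return resampler(mono).numpy()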
@@ -59,8 +58,8 @@ def get_sos_status(transcription, key_phrase):
 
 def main(audio):
     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-    SPT_MODEL = "./SPT_model"
-    model_name_or_path = "./SER_model"
+    SPT_MODEL = "D:\kaggle_practice\KJSCE_hack\SERModel\SPT_model"
+    model_name_or_path = "D:\kaggle_practice\KJSCE_hack\SERModel\SER_model"
     config = AutoConfig.from_pretrained(model_name_or_path)
     feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(model_name_or_path)
     sampling_rate = feature_extractor.sampling_rate
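The new hard-coded D:\ paths only work because \k, \K, and \S happen not to be recognized escape sequences; a segment beginning with \t or \n would be silently mangled, and absolute local paths will not resolve on a hosted Space, where the earlier relative ./SPT_model and ./SER_model were the portable choice. If absolute Windows paths are intended, raw strings are the safer spelling (a sketch using the exact paths from this commit):

from pathlib import Path

# Raw strings prevent Python from interpreting backslash escapes.
SPT_MODEL = Path(r"D:\kaggle_practice\KJSCE_hack\SERModel\SPT_model")
model_name_or_path = Path(r"D:\kaggle_practice\KJSCE_hack\SERModel\SER_model")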
@@ -80,8 +79,8 @@ def main(audio):
         emotion = i['Emotion']
         if emotion in ['disgust', 'fear', 'sadness']:
             emotion = 'negative'
-        elif emotion == 'anger':
-            emotion = 'anger'
+        elif emotion == 'neutral':
+            emotion = 'neutral'
         else:
             emotion = 'positive'
 
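Semantics worth noting: anger previously kept its own label, but after this change it falls through to the else branch and is reported as positive, while neutral gains a pass-through case. If grouping anger with the negative emotions was the intent, a table-driven mapping keeps every bucket explicit (a sketch; the anger-to-negative assignment is an assumption, not what this commit does):

# Explicit label buckets; unknown labels still default to 'positive'.
EMOTION_GROUPS = {
    "disgust": "negative",
    "fear": "negative",
    "sadness": "negative",
    "anger": "negative",   # assumption: treat anger as negative
    "neutral": "neutral",
}
emotion = EMOTION_GROUPS.get(i["Emotion"], "positive")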