harveysamson committed on
Commit
6886c22
1 Parent(s): 5851ff3

added models and inference

__pycache__/models.cpython-39.pyc ADDED
Binary file (5.59 kB).
 
app.py ADDED
@@ -0,0 +1,41 @@
+import torch
+import torch.nn.functional as F
+from transformers import AutoConfig, Wav2Vec2FeatureExtractor
+from src.models import Wav2Vec2ForSpeechClassification
+import gradio as gr
+import librosa
+
+# Load the fine-tuned wav2vec2 emotion-recognition checkpoint on CPU.
+device = torch.device("cpu")
+model_name_or_path = "harshit345/xlsr-wav2vec-speech-emotion-recognition"
+config = AutoConfig.from_pretrained(model_name_or_path)
+feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(model_name_or_path)
+sampling_rate = feature_extractor.sampling_rate
+model = Wav2Vec2ForSpeechClassification.from_pretrained(model_name_or_path).to(device)
+
+
+def load_data(path):
+    """Load an audio file, downmix stereo to mono, and resample to the model's rate."""
+    speech, sr = librosa.load(path)
+    if len(speech.shape) > 1:
+        speech = speech[:, 0] + speech[:, 1]
+    if sr != sampling_rate:
+        speech = librosa.resample(speech, orig_sr=sr, target_sr=sampling_rate)
+    return speech
+
+
+def inference(path):
+    """Return a label-to-confidence mapping for a single audio file."""
+    speech = load_data(path)
+    inputs = feature_extractor(speech, sampling_rate=sampling_rate, return_tensors="pt").input_values
+    with torch.no_grad():
+        logits = model(inputs.to(device)).logits
+    scores = F.softmax(logits, dim=1).detach().cpu().numpy()[0]
+    return {config.id2label[i]: float(round(score, 2)) for i, score in enumerate(scores)}
+
+
+examples = ['data/test_audio.wav', 'data/test_audio_2.wav']
+inputs = gr.inputs.Audio(label="Input Audio", type="filepath", source="microphone")
+outputs = gr.outputs.Label(type="confidences", label="Output Scores")
+iface = gr.Interface(inference, inputs, outputs=outputs, examples=examples)
+iface.launch(debug=True)
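With the definitions above in scope (everything up to, but not including, iface.launch), the prediction function can also be called directly; a minimal sketch:

# Minimal sketch: call the prediction function directly, assuming app.py's
# definitions (feature_extractor, model, config, inference) are already in scope.
scores = inference("data/test_audio.wav")
print(scores)  # dict mapping each emotion label in config.id2label to a confidence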
data/test_audio.wav ADDED
Binary file (505 kB).
 
data/test_audio_2.wav ADDED
Binary file (538 kB).
 
src/__init__.py ADDED
File without changes
src/__pycache__/__init__.cpython-39.pyc ADDED
Binary file (149 Bytes).
 
src/__pycache__/collator.cpython-39.pyc ADDED
Binary file (3.29 kB).
 
src/__pycache__/modeling_outputs.cpython-39.pyc ADDED
Binary file (700 Bytes).
 
src/__pycache__/models.cpython-39.pyc ADDED
Binary file (3.42 kB).
 
src/__pycache__/trainer.cpython-39.pyc ADDED
Binary file (2.04 kB).
 
src/collator.py ADDED
@@ -0,0 +1,60 @@
+from dataclasses import dataclass
+from typing import Dict, List, Optional, Union
+
+import torch
+from transformers import Wav2Vec2FeatureExtractor
+
+
+@dataclass
+class DataCollatorCTCWithPadding:
+    """
+    Data collator that dynamically pads the inputs received.
+    Args:
+        feature_extractor (:class:`~transformers.Wav2Vec2FeatureExtractor`):
+            The feature extractor used for processing the data.
+        padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`True`):
+            Select a strategy to pad the returned sequences (according to the model's padding side and padding index)
+            among:
+            * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a
+              single sequence is provided).
+            * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
+              maximum acceptable input length for the model if that argument is not provided.
+            * :obj:`False` or :obj:`'do_not_pad'`: No padding (i.e., can output a batch with sequences of
+              different lengths).
+        max_length (:obj:`int`, `optional`):
+            Maximum length of the ``input_values`` of the returned list and optionally padding length (see above).
+        max_length_labels (:obj:`int`, `optional`):
+            Maximum length of the ``labels`` of the returned list and optionally padding length (see above).
+        pad_to_multiple_of (:obj:`int`, `optional`):
+            If set, will pad the sequence to a multiple of the provided value.
+            This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability >=
+            7.5 (Volta).
+    """
+
+    feature_extractor: Wav2Vec2FeatureExtractor
+    padding: Union[bool, str] = True
+    max_length: Optional[int] = None
+    max_length_labels: Optional[int] = None
+    pad_to_multiple_of: Optional[int] = None
+    pad_to_multiple_of_labels: Optional[int] = None
+
+    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
+        # Separate the raw audio inputs from their classification labels.
+        input_features = [{"input_values": feature["input_values"]} for feature in features]
+        label_features = [feature["labels"] for feature in features]
+
+        # Integer labels imply single-label classification; floats imply regression or multi-label targets.
+        d_type = torch.long if isinstance(label_features[0], int) else torch.float
+
+        # Pad the audio inputs to a common length within the batch.
+        batch = self.feature_extractor.pad(
+            input_features,
+            padding=self.padding,
+            max_length=self.max_length,
+            pad_to_multiple_of=self.pad_to_multiple_of,
+            return_tensors="pt",
+        )
+
+        batch["labels"] = torch.tensor(label_features, dtype=d_type)
+
+        return batch
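For illustration, a small, hypothetical usage sketch of the collator (the dummy utterances and labels are placeholders; the checkpoint is the one app.py loads):

# Hypothetical usage sketch of DataCollatorCTCWithPadding (dummy inputs).
from transformers import Wav2Vec2FeatureExtractor
from src.collator import DataCollatorCTCWithPadding

extractor = Wav2Vec2FeatureExtractor.from_pretrained(
    "harshit345/xlsr-wav2vec-speech-emotion-recognition"
)
collator = DataCollatorCTCWithPadding(feature_extractor=extractor, padding=True)

# Two raw-audio "utterances" of different lengths with integer class labels.
features = [
    {"input_values": [0.0] * 16000, "labels": 2},
    {"input_values": [0.0] * 8000, "labels": 5},
]
batch = collator(features)
print(batch["input_values"].shape)  # torch.Size([2, 16000]) -- padded to the longest clip
print(batch["labels"])              # tensor([2, 5])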
src/modeling_outputs.py ADDED
@@ -0,0 +1,12 @@
+from dataclasses import dataclass
+from typing import Optional, Tuple
+import torch
+from transformers.file_utils import ModelOutput
+
+
+@dataclass
+class SpeechClassifierOutput(ModelOutput):
+    loss: Optional[torch.FloatTensor] = None
+    logits: torch.FloatTensor = None
+    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
+    attentions: Optional[Tuple[torch.FloatTensor]] = None
src/models.py ADDED
@@ -0,0 +1,111 @@
+import torch
+import torch.nn as nn
+from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
+
+from transformers.models.wav2vec2.modeling_wav2vec2 import (
+    Wav2Vec2PreTrainedModel,
+    Wav2Vec2Model,
+)
+
+from src.modeling_outputs import SpeechClassifierOutput
+
+
+class Wav2Vec2ClassificationHead(nn.Module):
+    """Head for wav2vec classification task."""
+
+    def __init__(self, config):
+        super().__init__()
+        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+        self.dropout = nn.Dropout(config.final_dropout)
+        self.out_proj = nn.Linear(config.hidden_size, config.num_labels)
+
+    def forward(self, features, **kwargs):
+        x = features
+        x = self.dropout(x)
+        x = self.dense(x)
+        x = torch.tanh(x)
+        x = self.dropout(x)
+        x = self.out_proj(x)
+        return x
+
+
+class Wav2Vec2ForSpeechClassification(Wav2Vec2PreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+        self.num_labels = config.num_labels
+        self.pooling_mode = config.pooling_mode
+        self.config = config
+
+        self.wav2vec2 = Wav2Vec2Model(config)
+        self.classifier = Wav2Vec2ClassificationHead(config)
+
+        self.init_weights()
+
+    def freeze_feature_extractor(self):
+        self.wav2vec2.feature_extractor._freeze_parameters()
+
+    def merged_strategy(self, hidden_states, mode="mean"):
+        # Pool the frame-level hidden states into one utterance-level vector.
+        if mode == "mean":
+            outputs = torch.mean(hidden_states, dim=1)
+        elif mode == "sum":
+            outputs = torch.sum(hidden_states, dim=1)
+        elif mode == "max":
+            outputs = torch.max(hidden_states, dim=1)[0]
+        else:
+            raise ValueError("Unknown pooling mode; it must be one of ['mean', 'sum', 'max'].")
+
+        return outputs
+
+    def forward(
+        self,
+        input_values,
+        attention_mask=None,
+        output_attentions=None,
+        output_hidden_states=None,
+        return_dict=None,
+        labels=None,
+    ):
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        outputs = self.wav2vec2(
+            input_values,
+            attention_mask=attention_mask,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        hidden_states = outputs[0]
+        hidden_states = self.merged_strategy(hidden_states, mode=self.pooling_mode)
+        logits = self.classifier(hidden_states)
+
+        loss = None
+        if labels is not None:
+            # Infer the problem type from the number of labels and the label dtype.
+            if self.config.problem_type is None:
+                if self.num_labels == 1:
+                    self.config.problem_type = "regression"
+                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
+                    self.config.problem_type = "single_label_classification"
+                else:
+                    self.config.problem_type = "multi_label_classification"
+
+            if self.config.problem_type == "regression":
+                loss_fct = MSELoss()
+                loss = loss_fct(logits.view(-1, self.num_labels), labels)
+            elif self.config.problem_type == "single_label_classification":
+                loss_fct = CrossEntropyLoss()
+                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
+            elif self.config.problem_type == "multi_label_classification":
+                loss_fct = BCEWithLogitsLoss()
+                loss = loss_fct(logits, labels)
+
+        if not return_dict:
+            output = (logits,) + outputs[2:]
+            return ((loss,) + output) if loss is not None else output
+
+        return SpeechClassifierOutput(
+            loss=loss,
+            logits=logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
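The merged_strategy pooling above collapses the frame axis of the wav2vec2 hidden states into one vector per utterance before classification; a dummy-tensor sketch (the 49 frames and hidden size of 1024 are illustrative values):

# Dummy-tensor sketch of the pooling done by merged_strategy: frame-level hidden
# states of shape (batch, frames, hidden_size) reduce to (batch, hidden_size).
import torch

hidden_states = torch.randn(2, 49, 1024)
pooled_mean = torch.mean(hidden_states, dim=1)   # mode="mean"
pooled_max = torch.max(hidden_states, dim=1)[0]  # mode="max"
print(pooled_mean.shape, pooled_max.shape)       # torch.Size([2, 1024]) torch.Size([2, 1024])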
src/trainer.py ADDED
@@ -0,0 +1,64 @@
+from typing import Any, Dict, Union
+
+import torch
+from packaging import version
+from torch import nn
+
+from transformers import (
+    Trainer,
+    is_apex_available,
+)
+
+if is_apex_available():
+    from apex import amp
+
+if version.parse(torch.__version__) >= version.parse("1.6"):
+    _is_native_amp_available = True
+    from torch.cuda.amp import autocast
+
+
+class CTCTrainer(Trainer):
+    def training_step(self, model: nn.Module, inputs: Dict[str, Union[torch.Tensor, Any]]) -> torch.Tensor:
+        """
+        Perform a training step on a batch of inputs.
+
+        Subclass and override to inject custom behavior.
+
+        Args:
+            model (:obj:`nn.Module`):
+                The model to train.
+            inputs (:obj:`Dict[str, Union[torch.Tensor, Any]]`):
+                The inputs and targets of the model.
+
+                The dictionary will be unpacked before being fed to the model. Most models expect the targets under the
+                argument :obj:`labels`. Check your model's documentation for all accepted arguments.
+
+        Return:
+            :obj:`torch.Tensor`: The tensor with training loss on this batch.
+        """
+
+        model.train()
+        inputs = self._prepare_inputs(inputs)
+
+        # Compute the loss, under native AMP autocast if mixed precision is enabled.
+        if self.use_amp:
+            with autocast():
+                loss = self.compute_loss(model, inputs)
+        else:
+            loss = self.compute_loss(model, inputs)
+
+        if self.args.gradient_accumulation_steps > 1:
+            loss = loss / self.args.gradient_accumulation_steps
+
+        # Backpropagate with the backend that matches the active training setup.
+        if self.use_amp:
+            self.scaler.scale(loss).backward()
+        elif self.use_apex:
+            with amp.scale_loss(loss, self.optimizer) as scaled_loss:
+                scaled_loss.backward()
+        elif self.deepspeed:
+            self.deepspeed.backward(loss)
+        else:
+            loss.backward()
+
+        return loss.detach()
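For context, the pieces in this commit wire together in the usual Trainer fashion. The sketch below is hypothetical and not part of the Space: the tiny in-memory dataset and hyperparameters are placeholders, and it assumes a transformers version contemporary with this commit, where Trainer still exposes use_amp, use_apex and deepspeed as attributes.

# Hypothetical training wiring (placeholders throughout; not part of this Space).
from transformers import TrainingArguments, Wav2Vec2FeatureExtractor

from src.collator import DataCollatorCTCWithPadding
from src.models import Wav2Vec2ForSpeechClassification
from src.trainer import CTCTrainer

ckpt = "harshit345/xlsr-wav2vec-speech-emotion-recognition"
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(ckpt)
model = Wav2Vec2ForSpeechClassification.from_pretrained(ckpt)
model.freeze_feature_extractor()  # keep the convolutional feature encoder frozen

# Placeholder dataset: two silent "utterances" with integer emotion labels.
train_dataset = [
    {"input_values": [0.0] * 16000, "labels": 0},
    {"input_values": [0.0] * 8000, "labels": 1},
]

training_args = TrainingArguments(
    output_dir="./outputs",  # placeholder path
    per_device_train_batch_size=2,
    num_train_epochs=1,
    logging_steps=1,
)

trainer = CTCTrainer(
    model=model,
    args=training_args,
    data_collator=DataCollatorCTCWithPadding(feature_extractor=feature_extractor, padding=True),
    train_dataset=train_dataset,
)
trainer.train()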