khizon committed
Commit c3e5c63
1 Parent(s): fdcd61b

initial commit

Files changed (11)
  1. .gitignore +132 -0
  2. Procfile +1 -0
  3. README.md +5 -33
  4. app.py +78 -0
  5. download_dataset.py +74 -0
  6. download_model.py +19 -0
  7. main.py +382 -0
  8. requirements.txt +13 -0
  9. setup.sh +8 -0
  10. test.py +61 -0
  11. utils.py +30 -0
.gitignore ADDED
@@ -0,0 +1,132 @@
+ # Byte-compiled / optimized / DLL files
+ __pycache__/
+ *.py[cod]
+ *$py.class
+
+ # C extensions
+ *.so
+
+ # Distribution / packaging
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ pip-wheel-metadata/
+ share/python-wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+ MANIFEST
+
+ # PyInstaller
+ # Usually these files are written by a python script from a template
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
+ *.manifest
+ *.spec
+
+ # Installer logs
+ pip-log.txt
+ pip-delete-this-directory.txt
+
+ # Unit test / coverage reports
+ htmlcov/
+ .tox/
+ .nox/
+ .coverage
+ .coverage.*
+ .cache
+ nosetests.xml
+ coverage.xml
+ *.cover
+ *.py,cover
+ .hypothesis/
+ .pytest_cache/
+
+ # Translations
+ *.mo
+ *.pot
+
+ # Django stuff:
+ *.log
+ local_settings.py
+ db.sqlite3
+ db.sqlite3-journal
+
+ # Flask stuff:
+ instance/
+ .webassets-cache
+
+ # Scrapy stuff:
+ .scrapy
+
+ # Sphinx documentation
+ docs/_build/
+
+ # PyBuilder
+ target/
+
+ # Jupyter Notebook
+ .ipynb_checkpoints
+
+ # IPython
+ profile_default/
+ ipython_config.py
+
+ # pyenv
+ .python-version
+
+ # pipenv
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
+ # install all needed dependencies.
+ #Pipfile.lock
+
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow
+ __pypackages__/
+
+ # Celery stuff
+ celerybeat-schedule
+ celerybeat.pid
+
+ # SageMath parsed files
+ *.sage.py
+
+ # Environments
+ .env
+ .venv
+ env/
+ venv/
+ ENV/
+ env.bak/
+ venv.bak/
+
+ # Spyder project settings
+ .spyderproject
+ .spyproject
+
+ # Rope project settings
+ .ropeproject
+
+ # mkdocs documentation
+ /site
+
+ # mypy
+ .mypy_cache/
+ .dmypy.json
+ dmypy.json
+
+ # Pyre type checker
+ .pyre/
+
+ data/aesdd/*
+ artifacts/*
Procfile ADDED
@@ -0,0 +1 @@
+ web: sh setup.sh && python download_dataset.py && streamlit run app.py
README.md CHANGED
@@ -1,37 +1,9 @@
- ---
- title: Emotion Classifier Demo
- emoji: 😻
- colorFrom: green
- colorTo: gray
- sdk: streamlit
- app_file: app.py
- pinned: false
- ---
-
- # Configuration
-
- `title`: _string_
- Display title for the Space
-
- `emoji`: _string_
- Space emoji (emoji-only character allowed)
-
- `colorFrom`: _string_
- Color for Thumbnail gradient (red, yellow, green, blue, indigo, purple, pink, gray)
-
- `colorTo`: _string_
- Color for Thumbnail gradient (red, yellow, green, blue, indigo, purple, pink, gray)
-
- `sdk`: _string_
- Can be either `gradio`, `streamlit`, or `static`
-
- `sdk_version` : _string_
- Only applicable for `streamlit` SDK.
- See [doc](https://hf.co/docs/hub/spaces) for more info on supported versions.
-
- `app_file`: _string_
- Path to your main application file (which contains either `gradio` or `streamlit` Python code, or `static` html code).
- Path is relative to the root of the repository.
-
- `pinned`: _boolean_
- Whether the Space stays on top of your list.
+ # EE286_final_project
+
+ Emotion Classifier of Greek Speech Audio Using a Fine-tuned Wav2Vec2 Model
+
+ Original code from: [Mehrdad Farahani](https://huggingface.co/m3hrdadfi/wav2vec2-xlsr-greek-speech-emotion-recognition)
+
+ Google Colab Demo can be accessed [here](https://colab.research.google.com/drive/1xgbm7f0j8jSPWF4YrnaQxwe_6ktW_TND?usp=sharing)
+
+ Video recording of the demo can be accessed [here](https://youtu.be/ae79DOj5yZI)
app.py ADDED
@@ -0,0 +1,78 @@
+ import numpy as np
+ import pandas as pd
+ from main import SpeechClassifierOutput, Wav2Vec2ForSpeechClassification
+ from datasets import load_dataset
+ from transformers import AutoConfig, Wav2Vec2Processor
+ import torchaudio
+ import torch
+ import torch.nn.functional as F
+ import seaborn as sns
+ import matplotlib.pyplot as plt
+ import streamlit as st
+ import os
+
+ sns.set_theme(style="darkgrid", palette="pastel")
+
+ def demo_speech_file_to_array_fn(path):
+     speech_array, _sampling_rate = torchaudio.load(path, normalize=True)
+     resampler = torchaudio.transforms.Resample(_sampling_rate, 16_000)
+     speech = resampler(speech_array).squeeze().numpy()
+     return speech
+
+ def demo_predict(df_row):
+     path, emotion = df_row["path"], df_row["emotion"]
+     speech = demo_speech_file_to_array_fn(path)
+     features = processor(speech, sampling_rate=16_000, return_tensors="pt", padding=True)
+
+     input_values = features.input_values.to(device)
+     attention_mask = features.attention_mask.to(device)
+
+     with torch.no_grad():
+         logits = model(input_values, attention_mask=attention_mask).logits
+
+     scores = F.softmax(logits, dim=1).detach().cpu().numpy()[0]
+     outputs = [{"Emotion": config.id2label[i], "Score": round(score * 100, 3)} for i, score in enumerate(scores)]
+     return outputs
+
+ def cache_model():
+     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+     model_name_or_path = 'm3hrdadfi/wav2vec2-xlsr-greek-speech-emotion-recognition'
+     config = AutoConfig.from_pretrained(model_name_or_path)
+     processor = Wav2Vec2Processor.from_pretrained(model_name_or_path)
+     model = Wav2Vec2ForSpeechClassification.from_pretrained(model_name_or_path).to(device)
+     return config, processor, model, device
+
+ @st.cache
+ def load_data():
+     return pd.read_csv('data/test.csv', delimiter = '\t')
+
+ def bar_plot(df):
+     fig = plt.figure(figsize=(8, 6))
+     plt.title("Prediction Scores")
+     plt.xticks(fontsize=12)
+     sns.barplot(x="Score", y="Emotion", data=df)
+     st.pyplot(fig)
+
+ if __name__ == '__main__':
+     os.system('python download_dataset.py')
+     test = load_data()
+
+     config, processor, model, device = cache_model()
+     print('Model loaded')
+
+     st.title("Emotion Classifier for Greek Speech Audio Demo")
+     if st.button("Classify Random Audio"):
+         # Load demo file
+         idx = np.random.randint(0, len(test))
+         sample = test.iloc[idx]
+
+         audio_file = open(sample['path'], 'rb')
+         audio_bytes = audio_file.read()
+
+         st.success(f'Label: {sample["emotion"]}')
+         st.audio(audio_bytes, format='audio/ogg')
+
+         outputs = demo_predict(sample)
+         r = pd.DataFrame(outputs)
+         # st.dataframe(r)
+         bar_plot(r)
download_dataset.py ADDED
@@ -0,0 +1,74 @@
+ import pandas as pd
+ import numpy as np
+
+ import os
+ import gdown
+
+ from pathlib import Path
+ from tqdm import tqdm
+ from sklearn.model_selection import train_test_split
+
+ import torchaudio
+
+ if __name__ == '__main__':
+
+     if not os.path.exists(os.path.join('data')):
+         os.makedirs(os.path.join('data'))
+
+     os.system('gdown https://drive.google.com/uc?id=1_IAWexEWpH-ly_JaA5EGfZDp-_3flkN1')
+     os.system('unzip -q aesdd.zip -d data/')
+     os.rename(os.path.join('data', 'Acted Emotional Speech Dynamic Database'),
+               os.path.join('data', 'aesdd'))
+
+     data = []
+     # Load the annotations file
+     for path in tqdm(Path("data/aesdd").glob("**/*.wav")):
+         name = str(path).split("/")[-1]
+         label = str(path).split('/')[-2]
+         path = os.path.join("data", "aesdd", label, name)
+         print(path)
+
+         try:
+             # There are some broken files
+             s = torchaudio.load(path)
+             print(s)
+             data.append({
+                 "name": name,
+                 "path": path,
+                 "emotion": label
+             })
+         except Exception as e:
+             # print(str(path), e)
+             pass
+
+
+
+     df = pd.DataFrame(data)
+     print(df.head())
+
+     # Filter out broken and non-existent paths
+
+     print(f"Step 0: {len(df)}")
+
+     df["status"] = df["path"].apply(lambda path: True if os.path.exists(path) else None)
+     df = df.dropna(subset=["status"])
+     df = df.drop(columns=["status"])
+     print(f"Step 1: {len(df)}")
+
+     df = df.sample(frac=1)
+     df = df.reset_index(drop=True)
+
+     # Train test split
+     save_path = "data"
+
+     train_df, test_df = train_test_split(df, test_size=0.2, random_state=101, stratify=df["emotion"])
+
+     train_df = train_df.reset_index(drop=True)
+     test_df = test_df.reset_index(drop=True)
+
+     train_df.to_csv(f"{save_path}/train.csv", sep="\t", encoding="utf-8", index=False)
+     test_df.to_csv(f"{save_path}/test.csv", sep="\t", encoding="utf-8", index=False)
+
+
+     print(train_df.shape)
+     print(test_df.shape)
download_model.py ADDED
@@ -0,0 +1,19 @@
+ import wandb
+ from main import *
+
+ def cache_model():
+     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+     generic_greek_model = 'lighteternal/wav2vec2-large-xlsr-53-greek'
+     local_model = 'artifacts/aesdd_classifier-v0'
+     config = AutoConfig.from_pretrained(local_model)
+     processor = Wav2Vec2Processor.from_pretrained(generic_greek_model)
+     model = Wav2Vec2ForSpeechClassification.from_pretrained(local_model).to(device)
+     return config, processor, model, device
+
+ if __name__ == '__main__':
+     # with wandb.init() as run:
+     #     artifact = run.use_artifact('khizon/EE286_final_project/aesdd_classifier:v0', type='model')
+     #     artifact_dir = artifact.download()
+     config, processor, model, device = cache_model()
+
+     model.push_to_hub("greek-emotion-classifier-demo")
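For reference, a minimal sketch (not part of this commit) of how the classifier pushed above could later be loaded back for inference. The Hub repo id below is an assumption (author namespace plus the name passed to `push_to_hub`), and the processor comes from the same base Greek checkpoint used in `download_model.py` and `test.py`:

```python
# Illustrative sketch only; the repo id is assumed, not confirmed by this commit.
import torch
import torch.nn.functional as F
from transformers import AutoConfig, Wav2Vec2Processor

from main import Wav2Vec2ForSpeechClassification

REPO_ID = "khizon/greek-emotion-classifier-demo"          # assumed <user>/<push_to_hub name>
BASE_MODEL = "lighteternal/wav2vec2-large-xlsr-53-greek"  # processor source, as in download_model.py

config = AutoConfig.from_pretrained(REPO_ID)
processor = Wav2Vec2Processor.from_pretrained(BASE_MODEL)
model = Wav2Vec2ForSpeechClassification.from_pretrained(REPO_ID).eval()

def classify(speech):
    """speech: 1-D numpy waveform at 16 kHz (see demo_speech_file_to_array_fn in app.py)."""
    features = processor(speech, sampling_rate=16_000, return_tensors="pt", padding=True)
    with torch.no_grad():
        logits = model(features.input_values, attention_mask=features.attention_mask).logits
    scores = F.softmax(logits, dim=1)[0]
    return {config.id2label[i]: float(s) for i, s in enumerate(scores)}
```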
main.py ADDED
@@ -0,0 +1,382 @@
+ import pandas as pd
+ import numpy as np
+ import torchaudio
+ from packaging import version
+
+ from datasets import load_dataset, load_metric
+
+ from dataclasses import dataclass
+ from typing import Any, Dict, List, Optional, Tuple, Union
+
+ import torch
+ import torch.nn as nn
+ from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
+
+
+ import transformers
+ from transformers import AutoConfig, Wav2Vec2Processor
+ from transformers.file_utils import ModelOutput
+ from transformers.models.wav2vec2.modeling_wav2vec2 import (
+     Wav2Vec2PreTrainedModel,
+     Wav2Vec2Model
+ )
+ from transformers.file_utils import ModelOutput
+ from transformers import EvalPrediction
+ from transformers import TrainingArguments
+ from transformers import (
+     Trainer,
+     is_apex_available,
+ )
+
+
+ if is_apex_available():
+     from apex import amp
+
+ if version.parse(torch.__version__) >= version.parse("1.6"):
+     _is_native_amp_available = True
+     from torch.cuda.amp import autocast
+
+ def speech_file_to_array_fn(path):
+     speech_array, sampling_rate = torchaudio.load(path)
+     resampler = torchaudio.transforms.Resample(sampling_rate, target_sampling_rate)
+     speech = resampler(speech_array).squeeze().numpy()
+     return speech
+
+ def label_to_id(label, label_list):
+
+     if len(label_list) > 0:
+         return label_list.index(label) if label in label_list else -1
+
+     return label
+
+ def preprocess_function(examples):
+     speech_list = [speech_file_to_array_fn(path) for path in examples[input_column]]
+     target_list = [label_to_id(label, label_list) for label in examples[output_column]]
+
+     result = processor(speech_list, sampling_rate=target_sampling_rate)
+     result["labels"] = list(target_list)
+
+     return result
+
+ @dataclass
+ class SpeechClassifierOutput(ModelOutput):
+     loss: Optional[torch.FloatTensor] = None
+     logits: torch.FloatTensor = None
+     hidden_states: Optional[Tuple[torch.FloatTensor]] = None
+     attentions: Optional[Tuple[torch.FloatTensor]] = None
+
+ class Wav2Vec2ClassificationHead(nn.Module):
+     """Head for wav2vec classification task."""
+
+     def __init__(self, config):
+         super().__init__()
+         self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+         self.dropout = nn.Dropout(config.final_dropout)
+         self.out_proj = nn.Linear(config.hidden_size, config.num_labels)
+
+     def forward(self, features, **kwargs):
+         x = features
+         x = self.dropout(x)
+         x = self.dense(x)
+         x = torch.tanh(x)
+         x = self.dropout(x)
+         x = self.out_proj(x)
+         return x
+
+
+ class Wav2Vec2ForSpeechClassification(Wav2Vec2PreTrainedModel):
+     def __init__(self, config):
+         super().__init__(config)
+         self.num_labels = config.num_labels
+         self.pooling_mode = config.pooling_mode
+         self.config = config
+
+         self.wav2vec2 = Wav2Vec2Model(config)
+         self.classifier = Wav2Vec2ClassificationHead(config)
+
+         self.init_weights()
+
+     def freeze_feature_extractor(self):
+         self.wav2vec2.feature_extractor._freeze_parameters()
+
+     def merged_strategy(
+             self,
+             hidden_states,
+             mode="mean"
+     ):
+         if mode == "mean":
+             outputs = torch.mean(hidden_states, dim=1)
+         elif mode == "sum":
+             outputs = torch.sum(hidden_states, dim=1)
+         elif mode == "max":
+             outputs = torch.max(hidden_states, dim=1)[0]
+         else:
+             raise Exception(
+                 "The pooling method hasn't been defined! Your pooling mode must be one of these ['mean', 'sum', 'max']")
+
+         return outputs
+
+     def forward(
+             self,
+             input_values,
+             attention_mask=None,
+             output_attentions=None,
+             output_hidden_states=None,
+             return_dict=None,
+             labels=None,
+     ):
+         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+         outputs = self.wav2vec2(
+             input_values,
+             attention_mask=attention_mask,
+             output_attentions=output_attentions,
+             output_hidden_states=output_hidden_states,
+             return_dict=return_dict,
+         )
+         hidden_states = outputs[0]
+         hidden_states = self.merged_strategy(hidden_states, mode=self.pooling_mode)
+         logits = self.classifier(hidden_states)
+
+         loss = None
+         if labels is not None:
+             if self.config.problem_type is None:
+                 if self.num_labels == 1:
+                     self.config.problem_type = "regression"
+                 elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
+                     self.config.problem_type = "single_label_classification"
+                 else:
+                     self.config.problem_type = "multi_label_classification"
+
+             if self.config.problem_type == "regression":
+                 loss_fct = MSELoss()
+                 loss = loss_fct(logits.view(-1, self.num_labels), labels)
+             elif self.config.problem_type == "single_label_classification":
+                 loss_fct = CrossEntropyLoss()
+                 loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
+             elif self.config.problem_type == "multi_label_classification":
+                 loss_fct = BCEWithLogitsLoss()
+                 loss = loss_fct(logits, labels)
+
+         if not return_dict:
+             output = (logits,) + outputs[2:]
+             return ((loss,) + output) if loss is not None else output
+
+         return SpeechClassifierOutput(
+             loss=loss,
+             logits=logits,
+             hidden_states=outputs.hidden_states,
+             attentions=outputs.attentions,
+         )
+
+ def compute_metrics(p: EvalPrediction):
+     preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
+     preds = np.squeeze(preds) if is_regression else np.argmax(preds, axis=1)
+
+     if is_regression:
+         return {"mse": ((preds - p.label_ids) ** 2).mean().item()}
+     else:
+         return {"accuracy": (preds == p.label_ids).astype(np.float32).mean().item()}
+
+ @dataclass
+ class DataCollatorCTCWithPadding:
+     """
+     Data collator that will dynamically pad the inputs received.
+     Args:
+         processor (:class:`~transformers.Wav2Vec2Processor`)
+             The processor used for processing the data.
+         padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`True`):
+             Select a strategy to pad the returned sequences (according to the model's padding side and padding index)
+             among:
+             * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
+               sequence is provided).
+             * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
+               maximum acceptable input length for the model if that argument is not provided.
+             * :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
+               different lengths).
+         max_length (:obj:`int`, `optional`):
+             Maximum length of the ``input_values`` of the returned list and optionally padding length (see above).
+         max_length_labels (:obj:`int`, `optional`):
+             Maximum length of the ``labels`` returned list and optionally padding length (see above).
+         pad_to_multiple_of (:obj:`int`, `optional`):
+             If set will pad the sequence to a multiple of the provided value.
+             This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability >=
+             7.5 (Volta).
+     """
+
+     processor: Wav2Vec2Processor
+     padding: Union[bool, str] = True
+     max_length: Optional[int] = None
+     max_length_labels: Optional[int] = None
+     pad_to_multiple_of: Optional[int] = None
+     pad_to_multiple_of_labels: Optional[int] = None
+
+     def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
+         input_features = [{"input_values": feature["input_values"]} for feature in features]
+         label_features = [feature["labels"] for feature in features]
+
+         d_type = torch.long if isinstance(label_features[0], int) else torch.float
+
+         batch = self.processor.pad(
+             input_features,
+             padding=self.padding,
+             max_length=self.max_length,
+             pad_to_multiple_of=self.pad_to_multiple_of,
+             return_tensors="pt",
+         )
+
+         batch["labels"] = torch.tensor(label_features, dtype=d_type)
+
+         return batch
+
+ class CTCTrainer(Trainer):
+     def training_step(self, model: nn.Module, inputs: Dict[str, Union[torch.Tensor, Any]]) -> torch.Tensor:
+         """
+         Perform a training step on a batch of inputs.
+
+         Subclass and override to inject custom behavior.
+
+         Args:
+             model (:obj:`nn.Module`):
+                 The model to train.
+             inputs (:obj:`Dict[str, Union[torch.Tensor, Any]]`):
+                 The inputs and targets of the model.
+
+                 The dictionary will be unpacked before being fed to the model. Most models expect the targets under the
+                 argument :obj:`labels`. Check your model's documentation for all accepted arguments.
+
+         Return:
+             :obj:`torch.Tensor`: The tensor with training loss on this batch.
+         """
+
+         model.train()
+         inputs = self._prepare_inputs(inputs)
+
+         if self.use_amp:
+             with autocast():
+                 loss = self.compute_loss(model, inputs)
+         else:
+             loss = self.compute_loss(model, inputs)
+
+         if self.args.gradient_accumulation_steps > 1:
+             loss = loss / self.args.gradient_accumulation_steps
+
+         if self.use_amp:
+             self.scaler.scale(loss).backward()
+         elif self.use_apex:
+             with amp.scale_loss(loss, self.optimizer) as scaled_loss:
+                 scaled_loss.backward()
+         elif self.deepspeed:
+             self.deepspeed.backward(loss)
+         else:
+             loss.backward()
+
+         return loss.detach()
+
+ if __name__ == '__main__':
+
+     WANDB_SILENT=True
+     WANDB_LOG_MODEL=True
+
+     # Load dataset
+     data_files = {
+         "train": "data/train.csv",
+         "validation": "data/test.csv",
+     }
+
+     dataset = load_dataset("csv", data_files=data_files, delimiter="\t", )
+
+     train_dataset = dataset["train"]
+     eval_dataset = dataset["validation"]
+
+     print(train_dataset)
+     print(eval_dataset)
+
+     # We need to specify the input and output columns
+     input_column = "path"
+     output_column = "emotion"
+
+     # We need to distinguish the unique labels in our SER dataset
+     label_list = train_dataset.unique(output_column)
+     label_list.sort()  # Let's sort it for determinism
+     num_labels = len(label_list)
+     print(f"A classification problem with {num_labels} classes: {label_list}")
+
+     # Specify the pre-trained model that we will fine-tune
+     model_name_or_path = "lighteternal/wav2vec2-large-xlsr-53-greek"
+     pooling_mode = "mean"
+
+     # Model configuration
+     config = AutoConfig.from_pretrained(
+         model_name_or_path,
+         num_labels=num_labels,
+         label2id={label: i for i, label in enumerate(label_list)},
+         id2label={i: label for i, label in enumerate(label_list)},
+         finetuning_task="wav2vec2_clf",
+     )
+     setattr(config, 'pooling_mode', pooling_mode)
+
+     # The processor is the combination of feature extractor and tokenizer
+     processor = Wav2Vec2Processor.from_pretrained(model_name_or_path,)
+     target_sampling_rate = processor.feature_extractor.sampling_rate
+     print(f"The target sampling rate: {target_sampling_rate}")
+
+     # So far, our dataset only contains the path to the audio.
+     # Using the mapper, we will load the audio files and also compute
+     # the features
+
+     train_dataset = train_dataset.map(
+         preprocess_function,
+         batch_size=100,
+         batched=True,
+         num_proc=4
+     )
+
+     eval_dataset = eval_dataset.map(
+         preprocess_function,
+         batch_size=100,
+         batched=True,
+         num_proc=4
+     )
+
+     data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)
+
+     is_regression = False
+
+     # Instantiate the classifier model
+     model = Wav2Vec2ForSpeechClassification.from_pretrained(
+         model_name_or_path,
+         config=config,
+     )
+
+     # The model's initial layers are CNNs and are already pre-trained, so we freeze their weights for this demo
+     model.freeze_feature_extractor()
+
+     training_args = TrainingArguments(
+         report_to = 'wandb',
+         output_dir="data/wav2vec2-xlsr-greek-speech-emotion-recognition",
+         per_device_train_batch_size=4,
+         per_device_eval_batch_size=4,
+         gradient_accumulation_steps=2,
+         evaluation_strategy="steps",
+         num_train_epochs=3.0,
+         fp16=True,
+         save_steps=20,
+         eval_steps=30,
+         logging_steps=10,
+         learning_rate=1e-4,
+         save_total_limit=2,
+         run_name = 'custom_training'  # name of the W&B run
+     )
+
+     trainer = CTCTrainer(
+         model=model,
+         data_collator=data_collator,
+         args=training_args,
+         compute_metrics=compute_metrics,
+         train_dataset=train_dataset,
+         eval_dataset=eval_dataset,
+         tokenizer=processor.feature_extractor,
+     )
+
+     trainer.train()
+
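As a small aside (not part of the commit), `merged_strategy` above is what reduces the wav2vec2 output of shape `(batch, time, hidden)` to a single utterance-level vector before the classification head; a toy sketch of the three supported pooling modes:

```python
# Toy illustration of the pooling performed by Wav2Vec2ForSpeechClassification.merged_strategy.
import torch

hidden_states = torch.randn(2, 50, 1024)         # (batch, time frames, hidden size)

pooled_mean = torch.mean(hidden_states, dim=1)   # mode="mean" (the pooling_mode configured above)
pooled_sum = torch.sum(hidden_states, dim=1)     # mode="sum"
pooled_max = torch.max(hidden_states, dim=1)[0]  # mode="max"

print(pooled_mean.shape)                         # torch.Size([2, 1024]): one vector per clip
```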
requirements.txt ADDED
@@ -0,0 +1,13 @@
+ -f https://download.pytorch.org/whl/cpu/torch_stable.html
+ numpy==1.21.5
+ pandas==1.3.5
+ datasets==1.17.0
+ transformers==4.15.0
+ torch==1.10.1+cpu
+ torchaudio==0.10.1+cpu
+ matplotlib==3.5.1
+ matplotlib-inline==0.1.3
+ streamlit==1.3.1
+ seaborn==0.11.2
+ gdown==4.2.0
+ scikit-learn==1.0.2
setup.sh ADDED
@@ -0,0 +1,8 @@
+ mkdir -p ~/.streamlit/
+ echo "\
+ [server]\n\
+ headless = true\n\
+ port = $PORT\n\
+ enableCORS = false\n\
+ \n\
+ " > ~/.streamlit/config.toml
test.py ADDED
@@ -0,0 +1,61 @@
+ from main import *
+
+ from sklearn.metrics import classification_report
+
+ def speech_file_to_array_fn(batch):
+     speech_array, sampling_rate = torchaudio.load(batch["path"])
+     speech_array = speech_array
+     resampler = torchaudio.transforms.Resample(sampling_rate, 16_000)
+     speech_array = resampler(speech_array).squeeze().numpy()
+
+     batch["speech"] = speech_array
+     return batch
+
+
+ def predict(batch):
+     features = processor(batch["speech"], sampling_rate=processor.feature_extractor.sampling_rate, return_tensors="pt", padding=True)
+
+     input_values = features.input_values.to(device)
+     attention_mask = features.attention_mask.to(device)
+
+     with torch.no_grad():
+         logits = model(input_values, attention_mask=attention_mask).logits
+
+     pred_ids = torch.argmax(logits, dim=-1).detach().cpu().numpy()
+     batch["predicted"] = pred_ids
+     return batch
+
+ if __name__ == '__main__':
+
+     data_files = {
+         "test" : 'data/test.csv'
+     }
+     test_dataset = load_dataset('csv', data_files = data_files, delimiter = "\t")["test"]
+     print(test_dataset)
+
+     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+     print(f"Device: {device}")
+
+     # model_name_or_path = "m3hrdadfi/wav2vec2-xlsr-greek-speech-emotion-recognition"
+     model_name_or_path2 = "lighteternal/wav2vec2-large-xlsr-53-greek"
+     # model_name_or_path = "data/wav2vec2-xlsr-greek-speech-emotion-recognition/checkpoint-180"
+     model_name_or_path = 'artifacts/aesdd_classifier:v0'
+     config = AutoConfig.from_pretrained(model_name_or_path)
+     processor = Wav2Vec2Processor.from_pretrained(model_name_or_path2)
+     model = Wav2Vec2ForSpeechClassification.from_pretrained(model_name_or_path).to(device)
+
+     test_dataset = test_dataset.map(speech_file_to_array_fn)
+
+     result = test_dataset.map(predict, batched=True, batch_size=8)
+
+     label_names = [config.id2label[i] for i in range(config.num_labels)]
+
+     print(f'Labels: {label_names}')
+
+     y_true = [config.label2id[name] for name in result["emotion"]]
+     y_pred = result["predicted"]
+
+     print(y_true[:5])
+     print(y_pred[:5])
+
+     print(classification_report(y_true, y_pred, target_names=label_names))
utils.py ADDED
@@ -0,0 +1,30 @@
+ import torchaudio
+
+ def speech_file_to_array_fn(path):
+     speech_array, sampling_rate = torchaudio.load(path)
+     resampler = torchaudio.transforms.Resample(sampling_rate, target_sampling_rate)
+     speech = resampler(speech_array).squeeze().numpy()
+     return speech
+
+ def label_to_id(label, label_list):
+
+     if len(label_list) > 0:
+         return label_list.index(label) if label in label_list else -1
+
+     return label
+
+ def preprocess_function(examples):
+     speech_list = [speech_file_to_array_fn(path) for path in examples[input_column]]
+     target_list = [label_to_id(label, label_list) for label in examples[output_column]]
+
+     result = processor(speech_list, sampling_rate=target_sampling_rate)
+     result["labels"] = list(target_list)
+
+     return result
+
+ @dataclass
+ class SpeechClassifierOutput(ModelOutput):
+     loss: Optional[torch.FloatTensor] = None
+     logits: torch.FloatTensor = None
+     hidden_states: Optional[Tuple[torch.FloatTensor]] = None
+     attentions: Optional[Tuple[torch.FloatTensor]] = None