Spaces:

khizon
/

emotion-classifier-demo

Runtime error

App Files Files Community

khizon committed on Jan 9, 2022

Commit

c3e5c63

1 Parent(s): fdcd61b

initial commit

Browse files

Files changed (11) hide show

.gitignore +132 -0
Procfile +1 -0
README.md +5 -33
app.py +78 -0
download_dataset.py +74 -0
download_model.py +19 -0
main.py +382 -0
requirements.txt +13 -0
setup.sh +8 -0
test.py +61 -0
utils.py +30 -0

.gitignore ADDED Viewed

	@@ -0,0 +1,132 @@

+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+# C extensions
+*.so
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+pip-wheel-metadata/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+# Translations
+*.mo
+*.pot
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+# Flask stuff:
+instance/
+.webassets-cache
+# Scrapy stuff:
+.scrapy
+# Sphinx documentation
+docs/_build/
+# PyBuilder
+target/
+# Jupyter Notebook
+.ipynb_checkpoints
+# IPython
+profile_default/
+ipython_config.py
+# pyenv
+.python-version
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow
+__pypackages__/
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+# SageMath parsed files
+*.sage.py
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+# Spyder project settings
+.spyderproject
+.spyproject
+# Rope project settings
+.ropeproject
+# mkdocs documentation
+/site
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+# Pyre type checker
+.pyre/
+data/aesdd/*
+artifacts/*

Procfile ADDED Viewed

	@@ -0,0 +1 @@


1	+ web: sh setup.sh && python download_dataset.py && streamlit run app.py

README.md CHANGED Viewed

@@ -1,37 +1,9 @@
----
-title: Emotion Classifier Demo
-emoji: 😻
-colorFrom: green
-colorTo: gray
-sdk: streamlit
-app_file: app.py
-pinned: false
----
-# Configuration
-`title`: _string_
-Display title for the Space
-`emoji`: _string_
-Space emoji (emoji-only character allowed)
-`colorFrom`: _string_
-Color for Thumbnail gradient (red, yellow, green, blue, indigo, purple, pink, gray)
-`colorTo`: _string_
-Color for Thumbnail gradient (red, yellow, green, blue, indigo, purple, pink, gray)
-`sdk`: _string_
-Can be either `gradio`, `streamlit`, or `static`
-`sdk_version` : _string_
-Only applicable for `streamlit` SDK.
-See [doc](https://hf.co/docs/hub/spaces) for more info on supported versions.
-`app_file`: _string_
-Path to your main application file (which contains either `gradio` or `streamlit` Python code, or `static` html code).
-Path is relative to the root of the repository.
-`pinned`: _boolean_
-Whether the Space stays on top of your list.

+# EE286_final_project
+Emotion Classifier of Greek Speech Audio Using a Fine-tuned Wav2Vec2 Model
+Original code from: [Mehrdad Farahani](https://huggingface.co/m3hrdadfi/wav2vec2-xlsr-greek-speech-emotion-recognition)
+Google Colab Demo can be accessed [here](https://colab.research.google.com/drive/1xgbm7f0j8jSPWF4YrnaQxwe_6ktW_TND?usp=sharing)
+Video recording of the demo can be accessed [here](https://youtu.be/ae79DOj5yZI)

app.py ADDED Viewed

	@@ -0,0 +1,78 @@

+import numpy as np
+import pandas as pd
+from main import SpeechClassifierOutput, Wav2Vec2ForSpeechClassification
+from datasets import load_dataset
+from transformers import AutoConfig, Wav2Vec2Processor
+import torchaudio
+import torch
+import torch.nn.functional as F
+import seaborn as sns
+import matplotlib.pyplot as plt
+import streamlit as st
+import os
+sns.set_theme(style="darkgrid", palette="pastel")
+def demo_speech_file_to_array_fn(path):
+    speech_array, _sampling_rate = torchaudio.load(path, normalize=True)
+    resampler = torchaudio.transforms.Resample(_sampling_rate, 16_000)
+    speech = resampler(speech_array).squeeze().numpy()
+    return speech
+def demo_predict(df_row):
+    path, emotion = df_row["path"], df_row["emotion"]
+    speech = demo_speech_file_to_array_fn(path)
+    features = processor(speech, sampling_rate=16_000, return_tensors="pt", padding=True)
+    input_values = features.input_values.to(device)
+    attention_mask = features.attention_mask.to(device)
+    with torch.no_grad():
+        logits = model(input_values, attention_mask=attention_mask).logits
+    scores = F.softmax(logits, dim=1).detach().cpu().numpy()[0]
+    outputs = [{"Emotion": config.id2label[i], "Score": round(score * 100, 3)} for i, score in enumerate(scores)]
+    return outputs
+def cache_model():
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    model_name_or_path = 'm3hrdadfi/wav2vec2-xlsr-greek-speech-emotion-recognition'
+    config = AutoConfig.from_pretrained(model_name_or_path)
+    processor = Wav2Vec2Processor.from_pretrained(model_name_or_path)
+    model = Wav2Vec2ForSpeechClassification.from_pretrained(model_name_or_path).to(device)
+    return config, processor, model, device
+@st.cache
+def load_data():
+    return pd.read_csv('data/test.csv', delimiter = '\t')
+def bar_plot(df):
+    fig = plt.figure(figsize=(8, 6))
+    plt.title("Prediction Scores")
+    plt.xticks(fontsize=12)
+    sns.barplot(x="Score", y="Emotion", data=df)
+    st.pyplot(fig)
+if __name__ == '__main__':
+    os.system('python download_dataset.py')
+    test = load_data()
+    config, processor, model, device = cache_model()
+    print('Model loaded')
+    st.title("Emotion Classifier for Greek Speech Audio Demo")
+    if st.button("Classify Random Audio"):
+        # Load demo file
+        idx = np.random.randint(0, len(test))
+        sample = test.iloc[idx]
+        audio_file = open(sample['path'], 'rb')
+        audio_bytes = audio_file.read()
+        st.success(f'Label: {sample["emotion"]}')
+        st.audio(audio_bytes, format='audio/ogg')
+        outputs = demo_predict(sample)
+        r = pd.DataFrame(outputs)
+        # st.dataframe(r)
+        bar_plot(r)

download_dataset.py ADDED Viewed

	@@ -0,0 +1,74 @@

+import pandas as pd
+import numpy as np
+import os
+import gdown
+from pathlib import Path
+from tqdm import tqdm
+from sklearn.model_selection import train_test_split
+import torchaudio
+if __name__ == '__main__':
+    if not os.path.exists(os.path.join('data')):
+        os.makedirs(os.path.join('data'))
+    os.system('gdown https://drive.google.com/uc?id=1_IAWexEWpH-ly_JaA5EGfZDp-_3flkN1')
+    os.system('unzip -q aesdd.zip -d data/')
+    os.rename(os.path.join('data', 'Acted Emotional Speech Dynamic Database'),
+              os.path.join('data', 'aesdd'))
+    data = []
+    # Load the annotations file
+    for path in tqdm(Path("data/aesdd").glob("**/*.wav")):
+        name = str(path).split("/")[-1]
+        label = str(path).split('/')[-2]
+        path = os.path.join("data", "aesdd", label, name)
+        print(path)
+        try:
+            # There are some broken files
+            s = torchaudio.load(path)
+            print(s)
+            data.append({
+                "name": name,
+                "path": path,
+                "emotion": label
+            })
+        except Exception as e:
+            # print(str(path), e)
+            pass
+    df = pd.DataFrame(data)
+    print(df.head())
+    # Filter broken and non-existed paths
+    print(f"Step 0: {len(df)}")
+    df["status"] = df["path"].apply(lambda path: True if os.path.exists(path) else None)
+    df = df.dropna(subset=["path"])
+    df = df.drop("status", 1)
+    print(f"Step 1: {len(df)}")
+    df = df.sample(frac=1)
+    df = df.reset_index(drop=True)
+    # Train test split
+    save_path = "data"
+    train_df, test_df = train_test_split(df, test_size=0.2, random_state=101, stratify=df["emotion"])
+    train_df = train_df.reset_index(drop=True)
+    test_df = test_df.reset_index(drop=True)
+    train_df.to_csv(f"{save_path}/train.csv", sep="\t", encoding="utf-8", index=False)
+    test_df.to_csv(f"{save_path}/test.csv", sep="\t", encoding="utf-8", index=False)
+    print(train_df.shape)
+    print(test_df.shape)

download_model.py ADDED Viewed

	@@ -0,0 +1,19 @@

+import wandb
+from main import *
+def cache_model():
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    generic_greek_model = 'lighteternal/wav2vec2-large-xlsr-53-greek'
+    local_model = 'artifacts/aesdd_classifier-v0'
+    config = AutoConfig.from_pretrained(local_model)
+    processor = Wav2Vec2Processor.from_pretrained(generic_greek_model)
+    model = Wav2Vec2ForSpeechClassification.from_pretrained(local_model).to(device)
+    return config, processor, model, device
+if __name__ == '__main__':
+    # with wandb.init() as run:
+    #     artifact = run.use_artifact('khizon/EE286_final_project/aesdd_classifier:v0', type='model')
+    #     artifact_dir = artifact.download()
+    config, processor, model, device = cache_model()
+    model.push_to_hub("greek-emotion-classifier-demo")

main.py ADDED Viewed

	@@ -0,0 +1,382 @@

+import pandas as pd
+import numpy as np
+import torchaudio
+from packaging import version
+from datasets import load_dataset, load_metric
+from dataclasses import dataclass
+from typing import Any, Dict, List, Optional, Tuple, Union
+import torch
+import torch.nn as nn
+from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
+import transformers
+from transformers import AutoConfig, Wav2Vec2Processor
+from transformers.file_utils import ModelOutput
+from transformers.models.wav2vec2.modeling_wav2vec2 import (
+    Wav2Vec2PreTrainedModel,
+    Wav2Vec2Model
+)
+from transformers.file_utils import ModelOutput
+from transformers import EvalPrediction
+from transformers import TrainingArguments
+from transformers import (
+    Trainer,
+    is_apex_available,
+)
+if is_apex_available():
+    from apex import amp
+if version.parse(torch.__version__) >= version.parse("1.6"):
+    _is_native_amp_available = True
+    from torch.cuda.amp import autocast
+def speech_file_to_array_fn(path):
+    speech_array, sampling_rate = torchaudio.load(path)
+    resampler = torchaudio.transforms.Resample(sampling_rate, target_sampling_rate)
+    speech = resampler(speech_array).squeeze().numpy()
+    return speech
+def label_to_id(label, label_list):
+    if len(label_list) > 0:
+        return label_list.index(label) if label in label_list else -1
+    return label
+def preprocess_function(examples):
+    speech_list = [speech_file_to_array_fn(path) for path in examples[input_column]]
+    target_list = [label_to_id(label, label_list) for label in examples[output_column]]
+    result = processor(speech_list, sampling_rate=target_sampling_rate)
+    result["labels"] = list(target_list)
+    return result
+@dataclass
+class SpeechClassifierOutput(ModelOutput):
+    """Return type of ``Wav2Vec2ForSpeechClassification.forward`` when a dict-style output is requested."""
+    # Training loss; stays None when no labels were passed to forward().
+    loss: Optional[torch.FloatTensor] = None
+    # Output of the classification head.
+    logits: torch.FloatTensor = None
+    # Forwarded unchanged from the wrapped Wav2Vec2Model output.
+    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
+    attentions: Optional[Tuple[torch.FloatTensor]] = None
+class Wav2Vec2ClassificationHead(nn.Module):
+    """Head for wav2vec classification task."""
+    def __init__(self, config):
+        super().__init__()
+        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+        self.dropout = nn.Dropout(config.final_dropout)
+        self.out_proj = nn.Linear(config.hidden_size, config.num_labels)
+    def forward(self, features, **kwargs):
+        x = features
+        x = self.dropout(x)
+        x = self.dense(x)
+        x = torch.tanh(x)
+        x = self.dropout(x)
+        x = self.out_proj(x)
+        return x
+class Wav2Vec2ForSpeechClassification(Wav2Vec2PreTrainedModel):
+    """Wav2Vec2 encoder followed by time pooling and a classification head."""
+    def __init__(self, config):
+        """Build the encoder and head from ``config`` (needs ``num_labels`` and ``pooling_mode``)."""
+        super().__init__(config)
+        self.num_labels = config.num_labels
+        self.pooling_mode = config.pooling_mode
+        self.config = config
+        self.wav2vec2 = Wav2Vec2Model(config)
+        self.classifier = Wav2Vec2ClassificationHead(config)
+        self.init_weights()
+    def freeze_feature_extractor(self):
+        """Freeze the parameters of the encoder's feature extractor."""
+        self.wav2vec2.feature_extractor._freeze_parameters()
+    def merged_strategy(
+            self,
+            hidden_states,
+            mode="mean"
+    ):
+        """Pool ``hidden_states`` over dim 1 using ``mode`` ('mean', 'sum' or 'max').
+
+        Raises:
+            Exception: if ``mode`` is not one of the three supported values.
+        """
+        if mode == "mean":
+            outputs = torch.mean(hidden_states, dim=1)
+        elif mode == "sum":
+            outputs = torch.sum(hidden_states, dim=1)
+        elif mode == "max":
+            # torch.max over a dim returns (values, indices); keep the values.
+            outputs = torch.max(hidden_states, dim=1)[0]
+        else:
+            raise Exception(
+                "The pooling method hasn't been defined! Your pooling mode must be one of these ['mean', 'sum', 'max']")
+        return outputs
+    def forward(
+            self,
+            input_values,
+            attention_mask=None,
+            output_attentions=None,
+            output_hidden_states=None,
+            return_dict=None,
+            labels=None,
+    ):
+        """Encode the audio, pool it, classify it and optionally compute a loss.
+
+        Returns a ``SpeechClassifierOutput`` when ``return_dict`` resolves to
+        true; otherwise a tuple ``(logits, *outputs[2:])`` with the loss
+        prepended when ``labels`` is given.
+        """
+        # Fall back to the config's setting when the caller does not choose.
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        outputs = self.wav2vec2(
+            input_values,
+            attention_mask=attention_mask,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        # First element of the encoder output is pooled over dim 1 before classification.
+        hidden_states = outputs[0]
+        hidden_states = self.merged_strategy(hidden_states, mode=self.pooling_mode)
+        logits = self.classifier(hidden_states)
+        loss = None
+        if labels is not None:
+            # Infer the problem type once from label count and dtype, and
+            # store it on the config for subsequent calls.
+            if self.config.problem_type is None:
+                if self.num_labels == 1:
+                    self.config.problem_type = "regression"
+                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
+                    self.config.problem_type = "single_label_classification"
+                else:
+                    self.config.problem_type = "multi_label_classification"
+            if self.config.problem_type == "regression":
+                loss_fct = MSELoss()
+                loss = loss_fct(logits.view(-1, self.num_labels), labels)
+            elif self.config.problem_type == "single_label_classification":
+                loss_fct = CrossEntropyLoss()
+                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
+            elif self.config.problem_type == "multi_label_classification":
+                loss_fct = BCEWithLogitsLoss()
+                loss = loss_fct(logits, labels)
+        if not return_dict:
+            # Tuple form: logits followed by encoder outputs from index 2 on.
+            output = (logits,) + outputs[2:]
+            return ((loss,) + output) if loss is not None else output
+        return SpeechClassifierOutput(
+            loss=loss,
+            logits=logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+def compute_metrics(p: EvalPrediction):
+    preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
+    preds = np.squeeze(preds) if is_regression else np.argmax(preds, axis=1)
+    if is_regression:
+        return {"mse": ((preds - p.label_ids) ** 2).mean().item()}
+    else:
+        return {"accuracy": (preds == p.label_ids).astype(np.float32).mean().item()}
+@dataclass
+class DataCollatorCTCWithPadding:
+    """
+    Data collator that will dynamically pad the inputs received.
+    Args:
+        processor (:class:`~transformers.Wav2Vec2Processor`)
+            The processor used for processing the data.
+        padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`True`):
+            Select a strategy to pad the returned sequences (according to the model's padding side and padding index)
+            among:
+            * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
+              sequence is provided).
+            * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
+              maximum acceptable input length for the model if that argument is not provided.
+            * :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
+              different lengths).
+        max_length (:obj:`int`, `optional`):
+            Maximum length of the ``input_values`` of the returned list and optionally padding length (see above).
+        max_length_labels (:obj:`int`, `optional`):
+            Maximum length of the ``labels`` returned list and optionally padding length (see above).
+            Not read by ``__call__``.
+        pad_to_multiple_of (:obj:`int`, `optional`):
+            If set will pad the sequence to a multiple of the provided value.
+            This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability >=
+            7.5 (Volta).
+        pad_to_multiple_of_labels (:obj:`int`, `optional`):
+            Not read by ``__call__``.
+    """
+    processor: Wav2Vec2Processor
+    padding: Union[bool, str] = True
+    max_length: Optional[int] = None
+    max_length_labels: Optional[int] = None
+    pad_to_multiple_of: Optional[int] = None
+    pad_to_multiple_of_labels: Optional[int] = None
+    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
+        """Pad the ``input_values`` of ``features`` into one batch and attach a ``labels`` tensor."""
+        input_features = [{"input_values": feature["input_values"]} for feature in features]
+        label_features = [feature["labels"] for feature in features]
+        # The first label decides the dtype: int labels -> long, anything else -> float.
+        d_type = torch.long if isinstance(label_features[0], int) else torch.float
+        batch = self.processor.pad(
+            input_features,
+            padding=self.padding,
+            max_length=self.max_length,
+            pad_to_multiple_of=self.pad_to_multiple_of,
+            return_tensors="pt",
+        )
+        batch["labels"] = torch.tensor(label_features, dtype=d_type)
+        return batch
+class CTCTrainer(Trainer):
+    """Trainer subclass with a custom ``training_step`` that picks the backward path by precision mode."""
+    def training_step(self, model: nn.Module, inputs: Dict[str, Union[torch.Tensor, Any]]) -> torch.Tensor:
+        """
+        Perform a training step on a batch of inputs.
+        Subclass and override to inject custom behavior.
+        Args:
+            model (:obj:`nn.Module`):
+                The model to train.
+            inputs (:obj:`Dict[str, Union[torch.Tensor, Any]]`):
+                The inputs and targets of the model.
+                The dictionary will be unpacked before being fed to the model. Most models expect the targets under the
+                argument :obj:`labels`. Check your model's documentation for all accepted arguments.
+        Return:
+            :obj:`torch.Tensor`: The tensor with training loss on this batch.
+        """
+        model.train()
+        inputs = self._prepare_inputs(inputs)
+        # With native AMP enabled the forward pass runs under autocast.
+        # NOTE(review): `autocast` is only imported at module level when
+        # torch >= 1.6 — confirm use_amp is never set on older versions.
+        if self.use_amp:
+            with autocast():
+                loss = self.compute_loss(model, inputs)
+        else:
+            loss = self.compute_loss(model, inputs)
+        # Scale the loss down by the number of accumulation steps.
+        if self.args.gradient_accumulation_steps > 1:
+            loss = loss / self.args.gradient_accumulation_steps
+        # Backward pass through whichever back-end is active:
+        # native AMP scaler, apex, deepspeed, or a plain backward().
+        if self.use_amp:
+            self.scaler.scale(loss).backward()
+        elif self.use_apex:
+            with amp.scale_loss(loss, self.optimizer) as scaled_loss:
+                scaled_loss.backward()
+        elif self.deepspeed:
+            self.deepspeed.backward(loss)
+        else:
+            loss.backward()
+        return loss.detach()
+if __name__ == '__main__':
+    WANDB_SILENT=True
+    WANDB_LOG_MODEL=True
+    # Load dataset
+    data_files = {
+        "train": "data/train.csv",
+        "validation": "data/test.csv",
+    }
+    dataset = load_dataset("csv", data_files=data_files, delimiter="\t", )
+    train_dataset = dataset["train"]
+    eval_dataset = dataset["validation"]
+    print(train_dataset)
+    print(eval_dataset)
+    # We need to specify the input and output column
+    input_column = "path"
+    output_column = "emotion"
+    # we need to distinguish the unique labels in our SER dataset
+    label_list = train_dataset.unique(output_column)
+    label_list.sort()  # Let's sort it for determinism
+    num_labels = len(label_list)
+    print(f"A classification problem with {num_labels} classes: {label_list}")
+    # Specify the pre-trained model that we will fine tune
+    model_name_or_path = "lighteternal/wav2vec2-large-xlsr-53-greek"
+    pooling_mode = "mean"
+    # Model Configuration
+    config = AutoConfig.from_pretrained(
+        model_name_or_path,
+        num_labels=num_labels,
+        label2id={label: i for i, label in enumerate(label_list)},
+        id2label={i: label for i, label in enumerate(label_list)},
+        finetuning_task="wav2vec2_clf",
+    )
+    setattr(config, 'pooling_mode', pooling_mode)
+    # Processor is the combination of feature extractor and tokenizer
+    processor = Wav2Vec2Processor.from_pretrained(model_name_or_path,)
+    target_sampling_rate = processor.feature_extractor.sampling_rate
+    print(f"The target sampling rate: {target_sampling_rate}")
+    # So far, our dataset only contains the path to the audio
+    # Using the mapper, we will load the audio files and also compute
+    # the features
+    train_dataset = train_dataset.map(
+        preprocess_function,
+        batch_size=100,
+        batched=True,
+        num_proc=4
+    )
+    eval_dataset = eval_dataset.map(
+        preprocess_function,
+        batch_size=100,
+        batched=True,
+        num_proc=4
+    )
+    data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)
+    is_regression = False
+    # Instantiate the Classifier model
+    model = Wav2Vec2ForSpeechClassification.from_pretrained(
+        model_name_or_path,
+        config=config,
+    )
+    # The model's initial  layers are CNNs and are already pre-trained so we will freeze their weights for this demo
+    model.freeze_feature_extractor()
+    training_args = TrainingArguments(
+        report_to = 'wandb',
+        output_dir="data/wav2vec2-xlsr-greek-speech-emotion-recognition",
+        per_device_train_batch_size=4,
+        per_device_eval_batch_size=4,
+        gradient_accumulation_steps=2,
+        evaluation_strategy="steps",
+        num_train_epochs=3.0,
+        fp16=True,
+        save_steps=20,
+        eval_steps=30,
+        logging_steps=10,
+        learning_rate=1e-4,
+        save_total_limit=2,
+        run_name = 'custom_training'            # name of the W&B run
+    )
+    trainer = CTCTrainer(
+        model=model,
+        data_collator=data_collator,
+        args=training_args,
+        compute_metrics=compute_metrics,
+        train_dataset=train_dataset,
+        eval_dataset=eval_dataset,
+        tokenizer=processor.feature_extractor,
+    )
+    trainer.train()

requirements.txt ADDED Viewed

	@@ -0,0 +1,13 @@

+-f https://download.pytorch.org/whl/cpu/torch_stable.html
+numpy==1.21.5
+pandas==1.3.5
+datasets==1.17.0
+transformers==4.15.0
+torch==1.10.1+cpu
+torchaudio==0.10.1+cpu
+matplotlib==3.5.1
+matplotlib-inline==0.1.3
+streamlit==1.3.1
+seaborn==0.11.2
+gdown==4.2.0
+scikit-learn==1.0.2

setup.sh ADDED Viewed

	@@ -0,0 +1,8 @@

+mkdir -p ~/.streamlit/
+echo "\
+[server]\n\
+headless = true\n\
+port = $PORT\n\
+enableCORS = false\n\
+\n\
+" > ~/.streamlit/config.toml

test.py ADDED Viewed

	@@ -0,0 +1,61 @@

+from main import *
+from sklearn.metrics import classification_report
+def speech_file_to_array_fn(batch):
+    speech_array, sampling_rate = torchaudio.load(batch["path"])
+    speech_array = speech_array
+    resampler = torchaudio.transforms.Resample(sampling_rate, 16_000)
+    speech_array = resampler(speech_array).squeeze().numpy()
+    batch["speech"] = speech_array
+    return batch
+def predict(batch):
+    features = processor(batch["speech"], sampling_rate=processor.feature_extractor.sampling_rate, return_tensors="pt", padding=True)
+    input_values = features.input_values.to(device)
+    attention_mask = features.attention_mask.to(device)
+    with torch.no_grad():
+        logits = model(input_values, attention_mask=attention_mask).logits
+    pred_ids = torch.argmax(logits, dim=-1).detach().cpu().numpy()
+    batch["predicted"] = pred_ids
+    return batch
+if __name__ == '__main__':
+    data_files = {
+        "test" : 'data/test.csv'
+    }
+    test_dataset = load_dataset('csv', data_files = data_files, delimiter = "\t")["test"]
+    print(test_dataset)
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    print(f"Device: {device}")
+    # model_name_or_path = "m3hrdadfi/wav2vec2-xlsr-greek-speech-emotion-recognition"
+    model_name_or_path2 = "lighteternal/wav2vec2-large-xlsr-53-greek"
+    # model_name_or_path = "data/wav2vec2-xlsr-greek-speech-emotion-recognition/checkpoint-180"
+    model_name_or_path = 'artifacts/aesdd_classifier:v0'
+    config = AutoConfig.from_pretrained(model_name_or_path)
+    processor = Wav2Vec2Processor.from_pretrained(model_name_or_path2)
+    model = Wav2Vec2ForSpeechClassification.from_pretrained(model_name_or_path).to(device)
+    test_dataset = test_dataset.map(speech_file_to_array_fn)
+    result = test_dataset.map(predict, batched=True, batch_size=8)
+    label_names = [config.id2label[i] for i in range(config.num_labels)]
+    print(f'Labels: {label_names}')
+    y_true = [config.label2id[name] for name in result["emotion"]]
+    y_pred = result["predicted"]
+    print(y_true[:5])
+    print(y_pred[:5])
+    print(classification_report(y_true, y_pred, target_names=label_names))

utils.py ADDED Viewed

	@@ -0,0 +1,30 @@

+import torchaudio
+def speech_file_to_array_fn(path):
+    speech_array, sampling_rate = torchaudio.load(path)
+    resampler = torchaudio.transforms.Resample(sampling_rate, target_sampling_rate)
+    speech = resampler(speech_array).squeeze().numpy()
+    return speech
+def label_to_id(label, label_list):
+    if len(label_list) > 0:
+        return label_list.index(label) if label in label_list else -1
+    return label
+def preprocess_function(examples):
+    speech_list = [speech_file_to_array_fn(path) for path in examples[input_column]]
+    target_list = [label_to_id(label, label_list) for label in examples[output_column]]
+    result = processor(speech_list, sampling_rate=target_sampling_rate)
+    result["labels"] = list(target_list)
+    return result
+@dataclass
+class SpeechClassifierOutput(ModelOutput):
+    loss: Optional[torch.FloatTensor] = None
+    logits: torch.FloatTensor = None
+    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
+    attentions: Optional[Tuple[torch.FloatTensor]] = None