harshit345 commited on
Commit
683b5f3
·
1 Parent(s): edc4233

Upload README.md

Browse files
Files changed (1) hide show
  1. README.md +81 -0
README.md ADDED
@@ -0,0 +1,81 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ language: el
3
+ datasets:
4
+ - aesdd
5
+ tags:
6
+ - audio
7
+ - audio-classification
8
+ - speech
9
+ license: apache-2.0
10
+ ---
11
+
12
+
13
+ ~~~
14
+ # Required packages
15
+ !pip install git+https://github.com/huggingface/datasets.git
16
+ !pip install git+https://github.com/huggingface/transformers.git
17
+ !pip install torchaudio
18
+ !pip install librosa
19
+ !git clone https://github.com/m3hrdadfi/soxan
20
+ cd soxan
21
+ ~~~
22
+
23
+
24
+ # Prediction setup
25
+ ~~~
26
+ import torch
27
+ import torch.nn as nn
28
+ import torch.nn.functional as F
29
+ import torchaudio
30
+ from transformers import AutoConfig, Wav2Vec2FeatureExtractor
31
+
32
+ import librosa
33
+ import IPython.display as ipd
34
+ import numpy as np
35
+ import pandas as pd
36
+ ~~~
37
+
38
+ ~~~
39
# Wav2Vec2ForSpeechClassification is not part of transformers; it is defined
# in the soxan repository cloned in the install step, so this import only
# resolves when the code runs from inside the soxan checkout.
from src.models import Wav2Vec2ForSpeechClassification

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_name_or_path = "Bagus/wav2vec2-xlsr-greek-speech-emotion-recognition"
# The config supplies the id -> label mapping used to name each predicted class.
config = AutoConfig.from_pretrained(model_name_or_path)
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(model_name_or_path)
# Audio is resampled to the rate the feature extractor declares.
sampling_rate = feature_extractor.sampling_rate
model = Wav2Vec2ForSpeechClassification.from_pretrained(model_name_or_path).to(device)
45
+ ~~~
46
+
47
+ ~~~
48
def speech_file_to_array_fn(path, sampling_rate):
    """Load an audio file and resample it to ``sampling_rate``.

    Args:
        path: Location of the audio file, passed to ``torchaudio.load``.
        sampling_rate: Target sampling rate in Hz for the returned waveform.

    Returns:
        The resampled waveform as a NumPy array with size-1 dimensions
        squeezed out.
    """
    speech_array, _sampling_rate = torchaudio.load(path)
    # Pass the target rate explicitly: with only the source rate given,
    # Resample falls back to its default new_freq (16 kHz) and the
    # ``sampling_rate`` argument would be silently ignored.
    resampler = torchaudio.transforms.Resample(
        orig_freq=_sampling_rate,
        new_freq=sampling_rate,
    )
    speech = resampler(speech_array).squeeze().numpy()
    return speech
53
+
54
+
55
def predict(path, sampling_rate):
    """Classify the emotion expressed in one audio file.

    Args:
        path: Location of the audio file to classify.
        sampling_rate: Sampling rate in Hz handed to the loader and to the
            feature extractor.

    Returns:
        A list with one dict per class, in class-id order. Each dict has an
        ``"Emotion"`` key (the label from ``config.id2label``) and a
        ``"Score"`` key (the softmax probability formatted as a percentage
        string with one decimal place).
    """
    waveform = speech_file_to_array_fn(path, sampling_rate)
    features = feature_extractor(
        waveform,
        sampling_rate=sampling_rate,
        return_tensors="pt",
        padding=True,
    )
    # Move every input tensor to the same device as the model.
    features = {name: features[name].to(device) for name in features}

    # Inference only, so gradient tracking is switched off.
    with torch.no_grad():
        logits = model(**features).logits

    # Only the first item of the batch is reported.
    probabilities = F.softmax(logits, dim=1).detach().cpu().numpy()[0]

    results = []
    for label_id, probability in enumerate(probabilities):
        percent = round(probability * 100, 3)
        results.append(
            {"Emotion": config.id2label[label_id], "Score": f"{percent:.1f}%"}
        )
    return results
66
+ ~~~
67
+
68
+ # Running a prediction
69
+ ~~~
70
+ # Path to a sample audio file (replace it with the path to your own recording)
71
+ path = '/data/jtes_v1.1/wav/f01/ang/f01_ang_01.wav'
72
+ outputs = predict(path, sampling_rate)
73
+ ~~~
74
+
75
+ ~~~
76
+ [{'Emotion': 'anger', 'Score': '98.3%'},
77
+ {'Emotion': 'disgust', 'Score': '0.0%'},
78
+ {'Emotion': 'fear', 'Score': '0.4%'},
79
+ {'Emotion': 'happiness', 'Score': '0.7%'},
80
+ {'Emotion': 'sadness', 'Score': '0.5%'}]
81
+ ~~~