m3hrdadfi committed on
Commit
bfd1618
1 Parent(s): 492067e

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +12 -14
README.md CHANGED
@@ -32,7 +32,7 @@ import torch
32
  import torch.nn as nn
33
  import torch.nn.functional as F
34
  import torchaudio
35
- from transformers import AutoConfig, Wav2Vec2Processor
36
 
37
  import librosa
38
  import IPython.display as ipd
@@ -44,8 +44,8 @@ import pandas as pd
44
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
45
  model_name_or_path = "m3hrdadfi/wav2vec2-xlsr-greek-speech-emotion-recognition"
46
  config = AutoConfig.from_pretrained(model_name_or_path)
47
- processor = Wav2Vec2Processor.from_pretrained(model_name_or_path)
48
- sampling_rate = processor.feature_extractor.sampling_rate
49
  model = Wav2Vec2ForSpeechClassification.from_pretrained(model_name_or_path).to(device)
50
  ```
51
 
@@ -59,13 +59,11 @@ def speech_file_to_array_fn(path, sampling_rate):
59
 
60
  def predict(path, sampling_rate):
61
  speech = speech_file_to_array_fn(path, sampling_rate)
62
- features = processor(speech, sampling_rate=sampling_rate, return_tensors="pt", padding=True)
63
-
64
- input_values = features.input_values.to(device)
65
- attention_mask = features.attention_mask.to(device)
66
 
67
  with torch.no_grad():
68
- logits = model(input_values, attention_mask=attention_mask).logits
69
 
70
  scores = F.softmax(logits, dim=1).detach().cpu().numpy()[0]
71
  outputs = [{"Emotion": config.id2label[i], "Score": f"{round(score * 100, 3):.1f}%"} for i, score in enumerate(scores)]
@@ -73,17 +71,17 @@ def predict(path, sampling_rate):
73
  ```
74
 
75
  ```python
76
- path = "/path/to/audio.wav"
77
  outputs = predict(path, sampling_rate)
78
  ```
79
 
80
  ```bash
81
  [
82
- {'Emotion': 'anger', 'Score': '0.0%'},
83
- {'Emotion': 'disgust', 'Score': '99.2%'},
84
- {'Emotion': 'fear', 'Score': '0.1%'},
85
- {'Emotion': 'happiness', 'Score': '0.3%'},
86
- {'Emotion': 'sadness', 'Score': '0.5%'}
87
  ]
88
  ```
89
 
 
32
  import torch.nn as nn
33
  import torch.nn.functional as F
34
  import torchaudio
35
+ from transformers import AutoConfig, Wav2Vec2FeatureExtractor
36
 
37
  import librosa
38
  import IPython.display as ipd
 
44
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
45
  model_name_or_path = "m3hrdadfi/wav2vec2-xlsr-greek-speech-emotion-recognition"
46
  config = AutoConfig.from_pretrained(model_name_or_path)
47
+ feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(model_name_or_path)
48
+ sampling_rate = feature_extractor.sampling_rate
49
  model = Wav2Vec2ForSpeechClassification.from_pretrained(model_name_or_path).to(device)
50
  ```
51
 
 
59
 
60
  def predict(path, sampling_rate):
61
  speech = speech_file_to_array_fn(path, sampling_rate)
62
+ inputs = feature_extractor(speech, sampling_rate=sampling_rate, return_tensors="pt", padding=True)
63
+ inputs = {key: inputs[key].to(device) for key in inputs}
 
 
64
 
65
  with torch.no_grad():
66
+ logits = model(**inputs).logits
67
 
68
  scores = F.softmax(logits, dim=1).detach().cpu().numpy()[0]
69
  outputs = [{"Emotion": config.id2label[i], "Score": f"{round(score * 100, 3):.1f}%"} for i, score in enumerate(scores)]
 
71
  ```
72
 
73
  ```python
74
+ path = "/path/to/disgust.wav"
75
  outputs = predict(path, sampling_rate)
76
  ```
77
 
78
  ```bash
79
  [
80
+ \t{'Emotion': 'anger', 'Score': '0.0%'},
81
+ \t{'Emotion': 'disgust', 'Score': '99.2%'},
82
+ \t{'Emotion': 'fear', 'Score': '0.1%'},
83
+ \t{'Emotion': 'happiness', 'Score': '0.3%'},
84
+ \t{'Emotion': 'sadness', 'Score': '0.5%'}
85
  ]
86
  ```
87