import gradio as gr
import torch
import librosa
from transformers import Wav2Vec2Model, Wav2Vec2FeatureExtractor
import torch.nn as nn

# Emotion labels (order must match the label indices used in training)
emotion_list = ['anger', 'disgust', 'fear', 'happy', 'neutral', 'sad']

# Define the model: a wav2vec2 backbone followed by a small Transformer
# encoder and a linear classification head
class EmotionClassifier(nn.Module):
    def __init__(self, num_classes):
        super().__init__()
        self.wav2vec2 = Wav2Vec2Model.from_pretrained('facebook/wav2vec2-base')
        encoder_layer = nn.TransformerEncoderLayer(d_model=self.wav2vec2.config.hidden_size, nhead=8, batch_first=True)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=2)
        self.classifier = nn.Linear(self.wav2vec2.config.hidden_size, num_classes)

    def forward(self, input_values):
        # (batch, samples) -> (batch, frames, hidden_size)
        outputs = self.wav2vec2(input_values).last_hidden_state
        encoded = self.transformer_encoder(outputs)
        # Classify from the encoding of the first frame
        logits = self.classifier(encoded[:, 0, :])
        return logits
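
# Illustrative shape check (a sketch, not part of the app itself; the heads
# here are randomly initialised, so use it only as a local sanity test):
#   m = EmotionClassifier(num_classes=6)
#   m(torch.randn(1, 16000)).shape  # -> torch.Size([1, 6]) for 1 s of 16 kHz audio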

# Load your trained model
model_path = "best_model_state_dict.pth"
num_classes = len(emotion_list)
model = EmotionClassifier(num_classes)
model.load_state_dict(torch.load(model_path, map_location=torch.device('cpu')))
model.eval()

# facebook/wav2vec2-base ships feature-extractor config but no tokenizer files,
# so loading the combined Wav2Vec2Processor from it can fail; the feature
# extractor alone is all this app needs
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained('facebook/wav2vec2-base')

def predict_emotion(audio_path):
    # Load the audio and resample to the 16 kHz rate wav2vec2 expects
    waveform, sr = librosa.load(audio_path, sr=16000)
    # input_values already has shape (1, num_samples), ready for the model
    inputs = feature_extractor(waveform, sampling_rate=sr, return_tensors="pt", padding=True).input_values
    with torch.no_grad():
        logits = model(inputs).squeeze(0)

    # Map each label to its softmax probability for display in gr.Label
    probabilities = torch.nn.functional.softmax(logits, dim=-1).cpu().numpy()
    predictions = {emotion: float(prob) for emotion, prob in zip(emotion_list, probabilities)}
    return predictions
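
# Example call outside the UI (a sketch; "sample.wav" is a hypothetical local file):
#   scores = predict_emotion("sample.wav")
#   print(max(scores, key=scores.get))  # top predicted emotion label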

# Create Gradio interface
interface = gr.Interface(
    fn=predict_emotion,
    inputs=gr.Audio(type="filepath"),
    outputs=gr.Label(num_top_classes=3),
    title="Speech Emotion Recognition",
    description="Upload an audio file (.wav or .mp3) or record your voice to predict its emotion."
)

# Launch the app
if __name__ == "__main__":
    interface.launch()
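    # Note: interface.launch(share=True) would also create a temporary public
    # URL, which is handy when testing from another device.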