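"""Gradio demo: classify a spoken utterance as a question or a declarative sentence.

Features are extracted with openSMILE (emobase functionals) and classified by
one of two pre-trained models: a DNN served via ONNX Runtime, or a
scikit-learn RandomForest that uses 20 selected features.
"""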
import opensmile
import joblib
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

import onnxruntime
import torch

import gradio as gr


model_names = ["DNN", "RandomForest"]

rf_model_path = "RF_emobase_20_model_top1_score0.6863_20231207_1537.joblib"
dnn_model_path = "NN_emobase_allfeature_model_score_69.00_20240304_1432.onnx"

# Load both models once at startup so each request only runs inference.
dnn_model = onnxruntime.InferenceSession(dnn_model_path)
rf_model = joblib.load(rf_model_path)

def extract_features_rf(audio_path):
    """Extract the 20 emobase functionals the RandomForest was trained on."""
    smile = opensmile.Smile(
        feature_set=opensmile.FeatureSet.emobase,
        feature_level=opensmile.FeatureLevel.Functionals,
    )
    feature_df = smile.process_files(audio_path)
    # The 20 features selected during RandomForest training.
    output_features = [
        'F0env_sma_de_amean', 'lspFreq_sma_de[5]_linregc1',
        'mfcc_sma[3]_linregc1', 'lspFreq_sma[6]_quartile1',
        'lspFreq_sma_de[6]_linregerrQ', 'lspFreq_sma_de[6]_maxPos',
        'lspFreq_sma_de[6]_iqr2-3', 'lspFreq_sma_de[7]_minPos',
        'lspFreq_sma_de[4]_linregc1', 'lspFreq_sma_de[6]_linregerrA',
        'lspFreq_sma_de[6]_linregc2', 'lspFreq_sma[5]_amean',
        'lspFreq_sma_de[6]_iqr1-2', 'mfcc_sma[1]_minPos',
        'mfcc_sma[4]_linregc1', 'mfcc_sma[9]_iqr2-3',
        'lspFreq_sma[5]_kurtosis', 'lspFreq_sma_de[3]_skewness',
        'mfcc_sma[3]_minPos', 'mfcc_sma[12]_linregc1',
    ]
    df = pd.DataFrame(feature_df.values[0], index=feature_df.columns)
    df = df[df.index.isin(output_features)]
    df = df.T
    # NOTE: fitting StandardScaler on a single sample zeroes every feature
    # (each column's mean equals its only value), so the model effectively
    # sees an all-zero vector; ideally the scaler fitted on the training
    # data would be loaded and applied here instead.
    scaler = StandardScaler()
    feature = scaler.fit_transform(df)
    return feature
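
# A minimal sketch of the usual remedy, assuming the training pipeline also
# saved its fitted scaler (the file name "rf_scaler.joblib" is hypothetical;
# no such artifact ships with this demo):
#
#     rf_scaler = joblib.load("rf_scaler.joblib")   # fitted on training data
#     feature = rf_scaler.transform(df)             # transform only; never re-fit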

def predict_rf(audio_path):
    # Extract features with openSMILE
    feature_vector = extract_features_rf([audio_path])

    # Run inference with the loaded model
    prediction = rf_model.predict(feature_vector)
    return prediction


def extract_features_dnn(audio_path):
    """Extract the full emobase functional set for the DNN."""
    smile = opensmile.Smile(
        feature_set=opensmile.FeatureSet.emobase,
        feature_level=opensmile.FeatureLevel.Functionals,
    )
    feature_df = smile.process_files(audio_path)
    df = pd.DataFrame(feature_df.values[0], index=feature_df.columns)
    df = df.T
    # Same caveat as above: a scaler fitted on one sample zeroes the features.
    scaler = StandardScaler()
    feature = scaler.fit_transform(df)
    return feature

def softmax_calc_(pred):
    # argmax alone picks the label; softmax is monotonic, so applying it
    # first would not change the result.
    if torch.argmax(pred) == 0:
        return "question"
    return "declarative"
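
# If class probabilities are wanted for display, a sketch of what predict_dnn
# below could do instead (assumes the model emits raw two-class logits with
# index 0 = "question"):
#
#     probs = torch.softmax(torch.FloatTensor(onnx_outs[0]), dim=-1)
#     question_prob = probs[0][0].item()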

def predict_dnn(audio_path):
    # Extract features with openSMILE
    feature_vector = extract_features_dnn([audio_path])

    # Run inference with the loaded ONNX model. ONNX Runtime is strict about
    # dtypes, and models exported from torch typically expect float32.
    onnx_outs = dnn_model.run(None, {"model_input": feature_vector.astype(np.float32)})
    prediction = softmax_calc_(torch.FloatTensor(onnx_outs[0]))
    return prediction

def main(model, audio):
    # With live=True, Gradio may call this before both inputs are set.
    if audio is None:
        return ""
    if model == "DNN":
        return predict_dnn(audio)
    if model == "RandomForest":
        return predict_rf(audio)
    return ""

iface = gr.Interface(
    fn=main,
    inputs=[
        gr.Dropdown(choices=model_names, label="Model"),
        gr.Audio(sources=["microphone", "upload"], type="filepath"),
    ],
    outputs=["textbox"],
    live=True,
    description="Demo: classify a spoken utterance as a question or a declarative sentence.",
)

iface.launch()
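
# Run locally with `python app.py`; Gradio serves the UI at the printed local
# URL (typically http://127.0.0.1:7860).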