import joblib
import numpy as np
import onnxruntime
import opensmile
import pandas as pd
import torch
import gradio as gr
from sklearn.preprocessing import StandardScaler

# Available back ends and their serialized model files
model_names = ["DNN", "RandomForest"]
rf_model_path = "RF_emobase_20_model_top1_score0.6863_20231207_1537.joblib"
dnn_model_path = "NN_emobase_allfeature_model_score_69.00_20240304_1432.onnx"

# Load both models once at startup
dnn_model = onnxruntime.InferenceSession(dnn_model_path)
rf_model = joblib.load(rf_model_path)
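
# The ONNX graph is fed below under the input name "model_input"; the
# names and shapes the export actually expects can be verified at startup
# with the standard onnxruntime API:
for inp in dnn_model.get_inputs():
    print(inp.name, inp.shape, inp.type)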

def extract_features_rf(audio_path):
    # Extract emobase functionals with openSMILE
    smile = opensmile.Smile(
        feature_set=opensmile.FeatureSet.emobase,
        feature_level=opensmile.FeatureLevel.Functionals,
    )
    feature_df = smile.process_files(audio_path)
    # The 20 features the Random Forest model was trained on
    output_features = ['F0env_sma_de_amean', 'lspFreq_sma_de[5]_linregc1', 'mfcc_sma[3]_linregc1', 'lspFreq_sma[6]_quartile1', 'lspFreq_sma_de[6]_linregerrQ', 'lspFreq_sma_de[6]_maxPos', 'lspFreq_sma_de[6]_iqr2-3', 'lspFreq_sma_de[7]_minPos', 'lspFreq_sma_de[4]_linregc1', 'lspFreq_sma_de[6]_linregerrA', 'lspFreq_sma_de[6]_linregc2', 'lspFreq_sma[5]_amean', 'lspFreq_sma_de[6]_iqr1-2', 'mfcc_sma[1]_minPos', 'mfcc_sma[4]_linregc1', 'mfcc_sma[9]_iqr2-3', 'lspFreq_sma[5]_kurtosis', 'lspFreq_sma_de[3]_skewness', 'mfcc_sma[3]_minPos', 'mfcc_sma[12]_linregc1']
    df = pd.DataFrame(feature_df.values[0], index=feature_df.columns)
    df = df[df.index.isin(output_features)]
    df = df.T
    # Standardize the selected features (fitted on this single sample)
    scaler = StandardScaler()
    feature = scaler.fit_transform(df)
    print(df.shape)  # debug: feature matrix shape
    return feature
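
# Caveat: fitting a StandardScaler on a single sample centers every column
# on itself, so the model always receives a zero vector regardless of the
# input audio. A more faithful deployment would persist the scaler fitted
# on the training data and only call transform() here; a minimal sketch,
# assuming a hypothetical "scaler.joblib" saved alongside the model during
# training:
#
#     scaler = joblib.load("scaler.joblib")
#     feature = scaler.transform(df)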

def predict_rf(audio):
    # Extract features with openSMILE
    feature_vector = extract_features_rf([audio])
    # Run inference with the loaded Random Forest model
    prediction = rf_model.predict(feature_vector)
    return prediction[0]  # single predicted label

def extract_features_dnn(audio_path):
    # Extract the full emobase functional set with openSMILE
    smile = opensmile.Smile(
        feature_set=opensmile.FeatureSet.emobase,
        feature_level=opensmile.FeatureLevel.Functionals,
    )
    feature_df = smile.process_files(audio_path)
    df = pd.DataFrame(feature_df.values[0], index=feature_df.columns)
    df = df.T
    # Standardize the feature vector (see the scaler caveat above)
    scaler = StandardScaler()
    feature = scaler.fit_transform(df)
    print(df.shape)  # debug: feature matrix shape
    return feature

def softmax_calc_(pred):
    # Map the larger logit to its class label; argmax over raw logits
    # selects the same class as argmax over softmax probabilities
    if torch.argmax(pred) == torch.tensor(0):
        prediction = "question"
    else:
        prediction = "declarative"
    return prediction
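
# If a confidence score is wanted alongside the label, the logits can be
# passed through an explicit softmax; a minimal sketch, assuming pred is a
# 1-D tensor of two logits ordered [question, declarative]:
#
#     probs = torch.softmax(pred, dim=-1)
#     print(f"question: {probs[0]:.2f}, declarative: {probs[1]:.2f}")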

def predict_dnn(audio):
    # Extract features with openSMILE
    feature_vector = extract_features_dnn([audio])
    # Run inference with the loaded ONNX model; ONNX Runtime expects
    # float32 input and returns a list of output arrays
    onnx_outs = dnn_model.run(None, {"model_input": feature_vector.astype(np.float32)})
    prediction = softmax_calc_(torch.FloatTensor(onnx_outs[0]))
    print(f"Prediction: {prediction}")
    return prediction

def main(model, audio):
    # Guard against live=True firing before a model or audio is selected
    if model is None or audio is None:
        return ""
    if model == "DNN":
        predict = predict_dnn(audio)
    elif model == "RandomForest":
        predict = predict_rf(audio)
    else:
        predict = ""
    return predict

iface = gr.Interface(
    fn=main,
    inputs=[
        gr.Dropdown(choices=model_names),
        gr.Audio(sources=["microphone", "upload"], type="filepath"),
    ],
    outputs=["textbox"],
    live=True,
    description="Demo: classify an audio utterance as a question or a declarative sentence",
)

iface.launch()
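
# Running this file with `python app.py` starts a local Gradio server; a
# temporary public URL can be requested with the standard share flag,
# e.g. iface.launch(share=True).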