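# Sentiment Audio Analysis
# Gradio app: records microphone audio, transcribes it with OpenAI Whisper,
# and classifies the sentiment of the transcript with a RoBERTa model.
# An audio-emotion branch (Keras model trained on RAVDESS/CREMA-D) is
# included below but currently disabled.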
import os
# Whisper is installed at runtime (a common workaround on Hugging Face Spaces)
os.system("pip install git+https://github.com/openai/whisper.git")
import gradio as gr
import whisper
from huggingface_hub import from_pretrained_keras
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import pipeline
from sklearn.preprocessing import StandardScaler
import logging
import librosa
import numpy as np
import pickle



# load the tokenizer and NLP model for sentiment classification of the transcribed text
tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment-latest")
model_nlp = AutoModelForSequenceClassification.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment-latest")


# load the Whisper model for speech-to-text
model = whisper.load_model("small")

# load the Keras model for audio emotion classification
reloaded_model = from_pretrained_keras('jmparejaz/RAVDESS-CREMAD_AudioEmotionClassifier')

# load the fitted feature scaler and label encoder
# (scaler.pkl and encoder.pkl are expected next to this script)
with open("scaler.pkl", "rb") as f:
    scaler = pickle.load(f)

with open("encoder.pkl", "rb") as f:
    encoder = pickle.load(f)



def inference_audio(audio):
    # load the recording and pad/trim it to the 30-second window Whisper expects
    audio = whisper.load_audio(audio)
    audio = whisper.pad_or_trim(audio)

    # compute the log-Mel spectrogram on the same device as the model
    mel = whisper.log_mel_spectrogram(audio).to(model.device)

    # detect the spoken language (the probabilities are not used further here)
    _, probs = model.detect_language(mel)

    # decode without fp16 so the app also runs on CPU
    options = whisper.DecodingOptions(fp16=False)
    result = whisper.decode(model, mel, options)

    return result.text

def inference_text(audio):
    # transcribe the audio, then run sentiment analysis on the transcript
    text = inference_audio(audio)

    sentiment_task = pipeline("sentiment-analysis", model=model_nlp, tokenizer=tokenizer)
    res = sentiment_task(text)[0]

    return text, res['label'], res['score']

    
def extract_features(data, sample_rate):
    # sample_rate is required for the chroma, MFCC and Mel-spectrogram features
    result = np.array([])

    # zero-crossing rate
    zcr = np.mean(librosa.feature.zero_crossing_rate(y=data).T, axis=0)
    result = np.hstack((result, zcr))  # stacking horizontally

    # chroma STFT
    stft = np.abs(librosa.stft(data))
    chroma_stft = np.mean(librosa.feature.chroma_stft(S=stft, sr=sample_rate).T, axis=0)
    result = np.hstack((result, chroma_stft))  # stacking horizontally

    # MFCC
    mfcc = np.mean(librosa.feature.mfcc(y=data, sr=sample_rate).T, axis=0)
    result = np.hstack((result, mfcc))  # stacking horizontally

    # root mean square energy
    rms = np.mean(librosa.feature.rms(y=data).T, axis=0)
    result = np.hstack((result, rms))  # stacking horizontally

    # Mel spectrogram
    mel = np.mean(librosa.feature.melspectrogram(y=data, sr=sample_rate).T, axis=0)
    result = np.hstack((result, mel))  # stacking horizontally

    return result
"""
def audio_emotions(audio):
    sr,data = audio
    features_audio = extract_features(data)
    features_audio = np.array(features_audio)
    scaled_features=scaler.transform(features_audio)
    scaled_features = np.expand_dims(scaled_features, axis=2)
    prediction=reloaded_model.predict(scaled_features)
    y_pred = encoder.inverse_transform(prediction)
    return y_pred
"""
def main(audio):
    # returns the transcript, the sentiment label and the sentiment score
    r1, r2, r3 = inference_text(audio)
    # r3 = audio_emotions(audio)
    return r1, r2, r3
    

audio = gr.Audio(
    label="Input Audio",
    show_label=False,
    source="microphone",
    type="filepath",
)
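# note: `source=` is the Gradio 3.x argument name; newer Gradio releases use
# `sources=["microphone"]` instead.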


app = gr.Interface(
    title="Sentiment Audio Analysis",
    fn=main,
    inputs=audio,
    outputs=["text", "text", "text"],
).launch(debug=True)