Spaces:
Sleeping
Sleeping
| import os | |
| import pickle | |
| import numpy as np | |
| import pandas as pd | |
| import gradio as gr | |
| import soundfile as sf | |
| from faster_whisper import WhisperModel | |
| from sentence_transformers import SentenceTransformer | |
| from sklearn.metrics.pairwise import cosine_similarity | |
# Sentence-embedding model used to embed each transcribed sentence.
EMBED_MODEL_NAME = "sentence-transformers/all-mpnet-base-v2"
# Pickled dict mapping emotion label -> mean embedding vector (class centroid).
CENTROIDS_PATH = "emotion_avg.pkl"

# NOTE(review): pickle.load is unsafe on untrusted data — confirm that
# emotion_avg.pkl ships with the app and is never user-supplied.
with open(CENTROIDS_PATH, "rb") as f:
    emotion_avg = pickle.load(f)

# Coerce every centroid to a numpy array so it can be reshaped for
# cosine-similarity computation below.
for k in list(emotion_avg.keys()):
    emotion_avg[k] = np.array(emotion_avg[k])

# Fixed label set, in the order the centroids were stored in the pickle.
EMOTIONS = list(emotion_avg.keys())

# Both models are loaded once at import time and reused across requests.
embedder = SentenceTransformer(EMBED_MODEL_NAME)
whisper_model = WhisperModel("base", compute_type="int8")  # int8: smaller/CPU-friendly
def predict_emotion_sentence(sentence):
    """Classify one sentence by nearest emotion centroid in embedding space.

    Embeds *sentence* with the module-level SentenceTransformer, computes
    cosine similarity against every centroid in ``emotion_avg`` in a single
    vectorized pass, and returns the best match.

    Args:
        sentence: Text of a single sentence.

    Returns:
        dict with keys:
            "emotion": best-matching label from EMOTIONS,
            "score":   cosine similarity to that centroid (float),
            "margin":  gap between best and second-best similarity (float;
                       0.0 when only one emotion exists).
    """
    emb = embedder.encode([sentence], convert_to_numpy=True)[0]

    # One (num_emotions, dim) matrix instead of a per-emotion sklearn call:
    # a single matrix-vector product computes all similarities at once.
    centroids = np.stack([emotion_avg[e] for e in EMOTIONS])
    denom = np.linalg.norm(centroids, axis=1) * np.linalg.norm(emb)
    # Match sklearn's convention: a zero-norm vector yields similarity 0,
    # not NaN — replace zero denominators by 1 (numerator is 0 there anyway).
    sims = centroids @ emb / np.where(denom == 0.0, 1.0, denom)

    order = np.argsort(sims)[::-1]  # indices sorted by similarity, descending
    best = order[0]
    runner_up = order[1] if order.size > 1 else best
    return {
        "emotion": EMOTIONS[best],
        "score": float(sims[best]),
        "margin": float(sims[best] - sims[runner_up]),
    }
def analyze_audio(audio_path):
    """Transcribe an audio file and classify each sentence's emotion.

    Args:
        audio_path: Filesystem path from the gr.Audio component, or None
            when no audio has been provided.

    Returns:
        Tuple of (transcript str, latest emotion label str, latest margin
        float, DataFrame with columns sentence/emotion/score/margin).
    """
    columns = ["sentence", "emotion", "score", "margin"]
    if audio_path is None:
        # Nothing to analyze — keep the output shapes the UI expects.
        return "No transcript yet.", "None", 0.0, pd.DataFrame(columns=columns)

    segments, _ = whisper_model.transcribe(audio_path)
    transcript_parts = []
    rows = []
    for seg in segments:
        text = seg.text.strip()
        if not text:
            continue  # skip empty/whitespace-only segments
        transcript_parts.append(text)
        pred = predict_emotion_sentence(text)
        rows.append({
            "sentence": text,
            "emotion": pred["emotion"],
            "score": pred["score"],
            "margin": pred["margin"],
        })

    transcript = " ".join(transcript_parts).strip()
    if rows:
        latest = rows[-1]
        latest_emotion = latest["emotion"]
        latest_margin = latest["margin"]
    else:
        latest_emotion = "None"
        latest_margin = 0.0
    # Explicit columns keep the DataFrame schema stable even when the audio
    # produced zero usable segments (pd.DataFrame([]) would otherwise have
    # no columns, diverging from the None path and the gr.Dataframe headers).
    df = pd.DataFrame(rows, columns=columns)
    return transcript, latest_emotion, latest_margin, df
# --- Gradio UI ------------------------------------------------------------
# Layout: left column holds the audio input and trigger button; right column
# shows the transcript, the latest-sentence emotion/margin, and the per-
# sentence results table. Statement order defines the visual layout.
with gr.Blocks(title="Emotion Speech Classifier") as demo:
    gr.Markdown("# Emotion Speech Classifier")
    gr.Markdown("Upload or record audio, transcribe it, and detect sentence-level emotion.")
    with gr.Row():
        with gr.Column(scale=1):
            # type="filepath" hands analyze_audio a path (or None), matching
            # its signature.
            audio_input = gr.Audio(
                sources=["microphone", "upload"],
                type="filepath",
                label="Audio Input"
            )
            run_btn = gr.Button("Analyze Audio")
        with gr.Column(scale=2):
            transcript_box = gr.Textbox(label="Transcript", lines=8)
            with gr.Row():
                latest_emotion_box = gr.Textbox(label="Latest Emotion")
                margin_box = gr.Number(label="Match Margin")
            # Headers mirror the DataFrame columns analyze_audio returns.
            results_df = gr.Dataframe(
                headers=["sentence", "emotion", "score", "margin"],
                label="Sentence Analysis"
            )
    # Wire the button to the pipeline; outputs map 1:1 to analyze_audio's
    # 4-tuple return value.
    run_btn.click(
        fn=analyze_audio,
        inputs=audio_input,
        outputs=[transcript_box, latest_emotion_box, margin_box, results_df]
    )

if __name__ == "__main__":
    demo.launch()