# recorder/app.py
import gradio as gr
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image
import librosa
import time
from datetime import datetime
import pandas as pd
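# Local filenames for the model configuration, preprocessor configuration, and training artifacts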
HOME_DIR = ""
local_config_path = 'config.json'
local_preprocessor_config_path = 'preprocessor_config.json'
local_weights_path = 'pytorch_model.bin'
local_training_args_path = 'training_args.bin'
import torch
import torch.nn.functional as F
from tqdm import tqdm
# Define the id2label mapping
id2label = {
    0: "angry",
    1: "disgust",
    2: "fear",
    3: "happy",
    4: "neutral",
    5: "sad",
    6: "surprise"
}
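# Run a single forward pass over a raw audio array and return the predicted
# label together with the per-class probability tensor.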
def predict(model, feature_extractor, data, max_length, id2label):
    # Extract features (request PyTorch tensors directly)
    print(datetime.now().strftime('%Y-%m-%d %H:%M:%S'), ": Extracting features...")
    inputs = feature_extractor(data, sampling_rate=16000, max_length=max_length, return_tensors='pt', padding=True, truncation=True)
    torch_inputs = inputs['input_values'].to(torch.float32)
    print(datetime.now().strftime('%Y-%m-%d %H:%M:%S'), ": Predicting...")
    # Forward pass; the model returns the logits tensor directly
    logits = model(input_values=torch_inputs)
    # Apply softmax to get probabilities
    probabilities = F.softmax(logits, dim=-1)
    # Get the predicted class index and map it to its label
    predicted_class_idx = torch.argmax(probabilities, dim=-1).item()
    predicted_label = id2label[predicted_class_idx]
    return predicted_label, probabilities
from transformers import Wav2Vec2Config, Wav2Vec2Model
import torch.nn as nn
from huggingface_hub import PyTorchModelHubMixin
config = Wav2Vec2Config.from_pretrained(local_config_path)
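# Wav2Vec2 encoder topped with a small classification head (dense -> ReLU -> dropout ->
# linear projection to the emotion classes); PyTorchModelHubMixin adds Hub save/load helpers.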
class Wav2Vec2ForSpeechClassification(nn.Module, PyTorchModelHubMixin):
    def __init__(self, config):
        super(Wav2Vec2ForSpeechClassification, self).__init__()
        self.wav2vec2 = Wav2Vec2Model(config)
        self.classifier = nn.ModuleDict({
            'dense': nn.Linear(config.hidden_size, config.hidden_size),
            'activation': nn.ReLU(),
            'dropout': nn.Dropout(config.final_dropout),
            'out_proj': nn.Linear(config.hidden_size, config.num_labels)
        })

    def forward(self, input_values):
        outputs = self.wav2vec2(input_values)
        hidden_states = outputs.last_hidden_state
        # Classify from the hidden state of the first time step
        x = self.classifier['dense'](hidden_states[:, 0, :])
        x = self.classifier['activation'](x)
        x = self.classifier['dropout'](x)
        logits = self.classifier['out_proj'](x)
        return logits
import json
from transformers import Wav2Vec2FeatureExtractor, Wav2Vec2Processor
# Load the preprocessor configuration from the local file
with open(local_preprocessor_config_path, 'r') as file:
    preprocessor_config = json.load(file)
# Initialize the preprocessor using the loaded configuration
feature_extractor = Wav2Vec2FeatureExtractor(
    do_normalize=preprocessor_config["do_normalize"],
    feature_extractor_type=preprocessor_config["feature_extractor_type"],
    feature_size=preprocessor_config["feature_size"],
    padding_side=preprocessor_config["padding_side"],
    padding_value=preprocessor_config["padding_value"],
    processor_class=preprocessor_config["processor_class"],
    return_attention_mask=preprocessor_config["return_attention_mask"],
    sampling_rate=preprocessor_config["sampling_rate"]
)
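# As configured above, the feature extractor normalizes and pads raw 16 kHz waveforms
# before they are passed to the model.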
# Download the finetuned model weights from the Hugging Face Hub
from huggingface_hub import hf_hub_download
model_path = hf_hub_download(
    repo_id="kvilla/wav2vec-english-speech-emotion-recognition-finetuned",
    filename="model_finetuned.pth"
)
# Load the downloaded state dict onto CPU
saved_model = torch.load(model_path, map_location=torch.device('cpu'))
# Create the model with the loaded configuration
model = Wav2Vec2ForSpeechClassification(config=config)
# Load the state dictionary
model.load_state_dict(saved_model)
print("Model initialized successfully.")
model.eval()
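# Quick local sanity check (illustrative only; "sample.wav" is a placeholder path):
#   data, _ = librosa.load("sample.wav", sr=16000)
#   label, probs = predict(model, feature_extractor, data, 48000, id2label)
#   print(label)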
def recognize_emotion(audio):
    # Gradio delivers audio as a (sample_rate, numpy array) tuple
    sample_rate, audio_data = audio
    # Ensure audio data is in floating-point format
    if not np.issubdtype(audio_data.dtype, np.floating):
        audio_data = audio_data.astype(np.float32)
    # Resample with librosa if the input is not already at 16 kHz
    if sample_rate != 16000:
        print(datetime.now().strftime('%Y-%m-%d %H:%M:%S'), ": Resampling audio...")
        audio_data = librosa.resample(audio_data, orig_sr=sample_rate, target_sr=16000)
    emotion, probabilities = predict(model, feature_extractor, audio_data, 48000, id2label)  # 48000 samples = 3 seconds at 16 kHz
    print(probabilities)
    probs = probabilities.detach().numpy().flatten().tolist()
    print(probs)
    # Convert probabilities to percentages
    percentages = [round(prob * 100, 2) for prob in probs]
    print(percentages)
    # Class labels in the same order as the model's output logits
    labels = ["angry", "disgust", "fear", "happy", "neutral", "sad", "surprise"]
    print(labels)
    # Build a DataFrame of per-class probabilities, highest first
    df = pd.DataFrame({"Emotion": labels, "Probability (%)": percentages})
    df = df.sort_values(by="Probability (%)", ascending=False)
    print(datetime.now().strftime('%Y-%m-%d %H:%M:%S'), df)
    return emotion, get_emotion_image(emotion), df
def get_emotion_image(emotion):
    # Map each emotion label to a bundled illustration
    emotion_to_image = {
        "angry": "angry.jpeg",
        "disgust": "disgust.jpeg",
        "fear": "fear.jpeg",
        "happy": "happy.jpeg",
        "neutral": "neutral.jpeg",
        "sad": "sad.jpeg",
        "surprise": "surprise.jpeg"
    }
    # Default image if the emotion is not found
    image_path = emotion_to_image.get(emotion, "default.jpg")
    # Load and return the image
    return Image.open(image_path)
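# Build the Gradio UI: streaming microphone input plus outputs for the recognized
# emotion, per-class probabilities, an illustrative image, and a running log.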
demo = gr.Blocks(theme=gr.themes.Soft())

with demo:
    audio_input = gr.Audio(type="numpy",
                           sources=["microphone"],
                           show_label=True,
                           streaming=True)
    text_output = gr.Textbox(label="Recognized Emotion")
    output_df = gr.DataFrame(label="Emotion Probabilities")
    image_output = gr.Image(label="Emotion Image", scale=1, interactive=False)
    df_logs = gr.DataFrame(label="Output Logs", headers=['Timestamp', 'Emotion'])
    def process_audio(audio, emotion, image, state, df_probs, df_logs):
        # Throttle predictions: run at most once every 10 seconds (wall-clock)
        current_time = time.time()
        if state is None or (current_time - state >= 10):
            state = current_time
            emotion, image, df_probs = recognize_emotion(audio)
            timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            # Append the new prediction to the log DataFrame
            new_row = {'Timestamp': timestamp, 'Emotion': emotion}
            df_logs = pd.concat([df_logs, pd.DataFrame([new_row])], ignore_index=True)
            print(datetime.now().strftime('%Y-%m-%d %H:%M:%S'), "Predicted emotion: ", emotion)
            return emotion, image, state, df_probs, df_logs
        else:
            print(datetime.now().strftime('%Y-%m-%d %H:%M:%S'), "Not yet time")
            return emotion, image, state, df_probs, df_logs

    # Automatically call recognize_emotion (via process_audio) as audio streams in
    state = gr.State(None)
    audio_input.stream(fn=process_audio,
                       inputs=[audio_input, text_output, image_output, state, output_df, df_logs],
                       outputs=[text_output, image_output, state, output_df, df_logs])

demo.launch(share=True)