wave-app / app.py
ajnx014's picture
Update app.py
b430002 verified
import gradio as gr
import os
import numpy as np
import librosa
from resemblyzer import VoiceEncoder
import soundfile as sf
# Initialize the encoder once at import time; train_voice_samples and
# test_voice both compute embeddings through this shared instance.
encoder = VoiceEncoder()
# Reference embeddings for the enrolled voice. Starts as an empty list and is
# replaced by train_voice_samples with the array that extract_embeddings returns.
reference_embeddings = []
def load_audio(file, target_sr=16000):
    """Read an audio file through librosa at ``target_sr`` and return the samples.

    Args:
        file: Audio source handed straight to ``librosa.load``.
        target_sr: Sample rate requested from ``librosa.load`` (default 16000).

    Returns:
        The first element of the ``librosa.load`` result (the audio samples);
        the sample rate that comes back with it is discarded.
    """
    samples, _ = librosa.load(file, sr=target_sr)
    return samples
def extract_embeddings(encoder, audio_files):
    """Build one voice embedding per audio file.

    Args:
        encoder: Object exposing ``embed_utterance(audio)``.
        audio_files: Iterable of audio file paths; each is read with
            ``load_audio`` before being embedded.

    Returns:
        ``np.ndarray`` built from the per-file embeddings, in input order.
    """
    vectors = [
        encoder.embed_utterance(load_audio(path))  # load + preprocess, then embed
        for path in audio_files
    ]
    return np.array(vectors)
def compute_similarity(embedding, reference_embeddings):
    """Return the mean cosine similarity between a test embedding and the references.

    Args:
        embedding: 1-D vector (array-like) for the voice under test.
        reference_embeddings: Array-like of reference vectors, one per row.
            A single 1-D reference vector is also accepted and treated as a
            one-row set.

    Returns:
        NumPy scalar: the average of the per-reference cosine similarities.
        A pair involving a zero-length (all-zero) vector contributes 0.0
        instead of NaN.

    Raises:
        ValueError: If ``reference_embeddings`` contains no data.
    """
    probe = np.asarray(embedding)
    refs = np.atleast_2d(np.asarray(reference_embeddings))
    if refs.size == 0:
        raise ValueError("reference_embeddings must contain at least one embedding")

    dots = np.dot(refs, probe)
    norms = np.linalg.norm(refs, axis=1) * np.linalg.norm(probe)

    # Divide only where the norm product is non-zero; a zero vector has no
    # direction, so its similarity is reported as 0.0 rather than 0/0 = NaN
    # (which would poison the mean and emit a RuntimeWarning).
    similarities = np.zeros(dots.shape, dtype=np.result_type(dots, norms))
    np.divide(dots, norms, out=similarities, where=norms != 0)
    return np.mean(similarities)
def train_voice_samples(files):
    """Gradio callback: embed the uploaded training samples and store them.

    Replaces the module-level ``reference_embeddings`` with the embeddings of
    the uploaded files. The stored value is left untouched when validation or
    extraction fails.

    Args:
        files: Uploaded items from the training ``gr.File`` component. Each
            item may be a path (``str`` / ``os.PathLike``) or an object whose
            ``.name`` attribute is the path. ``None`` or an empty list means
            nothing was uploaded.

    Returns:
        A status message for the output textbox.
    """
    global reference_embeddings

    # Clicking "Train Model" with no upload would otherwise crash on len(None)
    # or report success for zero samples.
    if not files:
        return "Please upload at least one training voice sample."
    if len(files) > 50:
        return "Please upload up to 50 files only."

    # Path-like items are used as-is (Path.name would be only the basename);
    # anything else is expected to carry its path in `.name`.
    paths = [f if isinstance(f, (str, os.PathLike)) else f.name for f in files]

    try:
        embeddings = extract_embeddings(encoder, paths)
    except Exception as e:
        # UI boundary: report the failure as text, mirroring test_voice.
        return f"Error: {str(e)}"

    reference_embeddings = embeddings
    return f"Extracted embeddings from {len(files)} voice samples. Ready for testing!"
def test_voice(file):
    """Gradio callback: score an uploaded voice file against the trained references.

    Args:
        file: Uploaded item from the test ``gr.File`` component: a path
            (``str`` / ``os.PathLike``) or an object whose ``.name`` attribute
            is the path. ``None`` means nothing was uploaded.

    Returns:
        A message for the output textbox: the similarity score plus a
        match / no-match verdict, a prompt when references or the test file
        are missing, or ``"Error: ..."`` if loading or scoring fails.
    """
    if reference_embeddings is None or len(reference_embeddings) == 0:
        return "No reference voice samples found. Please upload training samples first."
    # Without this guard a missing upload surfaces as an opaque AttributeError text.
    if file is None:
        return "Please upload a test voice file first."

    try:
        # Path-like items are used as-is (Path.name would be only the basename);
        # anything else is expected to carry its path in `.name`.
        path = file if isinstance(file, (str, os.PathLike)) else file.name

        # Debugging: check that the file was received
        print(f"Received test file: {path}")

        # Load at 16 kHz, the same rate load_audio uses by default for the
        # training samples.
        test_audio, sr = librosa.load(path, sr=16000)

        # Debugging: check audio shape
        print(f"Loaded test audio, shape: {test_audio.shape}, Sample rate: {sr}")

        test_embedding = encoder.embed_utterance(test_audio)
        similarity_score = compute_similarity(test_embedding, reference_embeddings)

        # Debugging: check similarity score
        print(f"Computed similarity score: {similarity_score}")

        result = f"Similarity Score: {similarity_score:.2f}\n"
        # A mean similarity above 0.8 counts as a match.
        if similarity_score > 0.8:
            result += "The voice matches closely with the training samples!\n"
            result += "You can access the demo content via the following link:\n"
            # Link emoji restored; the label was previously mis-encoded.
            result += "[🔗 Access Demo](https://example.com/demo)"
        else:
            result += "The voice does not match the training samples."
        return result
    except Exception as e:
        # UI boundary: show the failure to the user instead of raising.
        return f"Error: {str(e)}"
# Gradio UI: a training section (upload samples -> train_voice_samples) and a
# testing section (upload one file -> test_voice), each reporting to a textbox.
# NOTE(review): the original indentation was not recoverable; this layout
# presumes each Row holds only the file input and its button, with the output
# textbox below the row. Confirm against the deployed layout.
with gr.Blocks() as app:
    gr.Markdown("## Voice Recognition with Similarity Testing")
    gr.Markdown("**Instruction:** Upload a single file of more than 1-minute duration or multiple files totaling more than 1 minute.")
    # Link emoji restored; the label was previously mis-encoded.
    gr.Markdown("[🔗 Link to Eleven Labs](https://elevenlabs.io/app/speech-synthesis/text-to-speech)")
    gr.Markdown("**Access Eleven Labs to test the model on multiple voices**")

    # Training section
    with gr.Row():
        train_audio = gr.File(label="Upload up to 50 training voice samples", file_types=[".wav"], file_count="multiple")
        train_button = gr.Button("Train Model")
    train_output = gr.Textbox()
    train_button.click(train_voice_samples, inputs=train_audio, outputs=train_output)

    # Testing section
    with gr.Row():
        test_audio = gr.File(label="Upload a test voice file", file_types=[".wav"])
        test_button = gr.Button("Test Voice")
    test_output = gr.Textbox()
    test_button.click(test_voice, inputs=test_audio, outputs=test_output)

# Start the app; share=True requests a public share link from Gradio.
app.launch(share=True)