"""Gradio demo: compare a test voice against uploaded reference voice samples.

Reference ("training") samples are embedded with a resemblyzer VoiceEncoder
and kept in a module-level array; a test sample is then scored by its mean
cosine similarity to those reference embeddings.
"""

import os

import gradio as gr
import librosa
import numpy as np
import soundfile as sf
from resemblyzer import VoiceEncoder

# Sample rate every audio file is resampled to before embedding.
SAMPLE_RATE = 16000
# Upper bound on the number of reference files accepted in one upload.
MAX_TRAINING_FILES = 50
# Mean cosine similarity above which the test voice counts as a match.
MATCH_THRESHOLD = 0.8

# Initialize the encoder
encoder = VoiceEncoder()
# Embeddings of the reference samples; replaced by train_voice_samples().
reference_embeddings = []


def _file_path(file):
    """Return the filesystem path for an uploaded file.

    Accepts either an object exposing a ``name`` attribute or a plain path
    string, so the handlers work with both shapes of upload value.
    """
    return getattr(file, "name", file)


def load_audio(file, target_sr=SAMPLE_RATE):
    """Load an audio file and resample it to the target sample rate."""
    audio, _ = librosa.load(file, sr=target_sr)
    return audio


def extract_embeddings(encoder, audio_files):
    """Extract one voice embedding per audio file path.

    Returns:
        A NumPy array with one row per entry of ``audio_files``.
    """
    return np.array(
        [encoder.embed_utterance(load_audio(path)) for path in audio_files]
    )


def compute_similarity(embedding, reference_embeddings):
    """Return the mean cosine similarity between a test and reference embeddings.

    Args:
        embedding: 1-D embedding of the test sample.
        reference_embeddings: 2-D array (or list of 1-D embeddings), one row
            per reference sample.

    Returns:
        The mean of the per-reference cosine similarities as a float.
    """
    references = np.atleast_2d(np.asarray(reference_embeddings, dtype=float))
    embedding = np.asarray(embedding, dtype=float)
    dots = references @ embedding
    norms = np.linalg.norm(references, axis=1) * np.linalg.norm(embedding)
    # A zero-norm vector has no direction; score it 0.0 rather than dividing
    # by zero and propagating NaN/inf into the mean.
    similarities = np.divide(
        dots, norms, out=np.zeros_like(dots), where=norms > 0
    )
    return float(np.mean(similarities))


def train_voice_samples(files):
    """Embed the uploaded reference samples and store them for later tests.

    Args:
        files: The uploaded file(s); may be ``None`` when nothing was
            uploaded, a single file, or a list of files.

    Returns:
        A status message for display in the UI.
    """
    global reference_embeddings
    if not files:
        return "No files uploaded. Please upload at least one training sample."
    if not isinstance(files, (list, tuple)):
        files = [files]
    if len(files) > MAX_TRAINING_FILES:
        return f"Please upload up to {MAX_TRAINING_FILES} files only."
    try:
        embeddings = extract_embeddings(
            encoder, [_file_path(file) for file in files]
        )
    except Exception as e:  # UI boundary: report instead of crashing the app
        return f"Error: {str(e)}"
    # Only replace the stored references once every file embedded cleanly.
    reference_embeddings = embeddings
    return f"Extracted embeddings from {len(files)} voice samples. Ready for testing!"


def test_voice(file):
    """Score an uploaded test sample against the stored reference embeddings.

    Args:
        file: The uploaded test file, or ``None`` when nothing was uploaded.

    Returns:
        A result message containing the similarity score and match verdict,
        or an explanatory/error message.
    """
    try:
        if reference_embeddings is None or len(reference_embeddings) == 0:
            return "No reference voice samples found. Please upload training samples first."
        if file is None:
            return "No test file provided. Please upload a test voice file."

        path = _file_path(file)
        # Debugging: Check if file is received
        print(f"Received test file: {path}")

        # Load test audio the same way the reference samples were loaded
        test_audio = load_audio(path)
        # Debugging: Check audio shape
        print(f"Loaded test audio, shape: {test_audio.shape}, Sample rate: {SAMPLE_RATE}")

        # Extract embedding
        test_embedding = encoder.embed_utterance(test_audio)

        # Compute similarity
        similarity_score = compute_similarity(test_embedding, reference_embeddings)
        # Debugging: Check similarity score
        print(f"Computed similarity score: {similarity_score}")

        # Generate result message
        result = f"Similarity Score: {similarity_score:.2f}\n"
        if similarity_score > MATCH_THRESHOLD:
            result += "The voice matches closely with the training samples!\n"
            result += "You can access the demo content via the following link:\n"
            result += "[🔗 Access Demo](https://example.com/demo)"
        else:
            result += "The voice does not match the training samples."
        return result
    except Exception as e:  # UI boundary: surface the failure as text
        return f"Error: {str(e)}"


with gr.Blocks() as app:
    gr.Markdown("## Voice Recognition with Similarity Testing")
    gr.Markdown("**Instruction:** Upload a single file of more than 1-minute duration or multiple files totaling more than 1 minute.")
    gr.Markdown("[🔗 Link to Eleven Labs](https://elevenlabs.io/app/speech-synthesis/text-to-speech)")
    gr.Markdown("**Access Eleven Labs to test the model on multiple voices**")

    with gr.Row():
        train_audio = gr.File(
            label=f"Upload up to {MAX_TRAINING_FILES} training voice samples",
            file_types=[".wav"],
            file_count="multiple",
        )
        train_button = gr.Button("Train Model")
        train_output = gr.Textbox()
        train_button.click(train_voice_samples, inputs=train_audio, outputs=train_output)

    with gr.Row():
        test_audio = gr.File(label="Upload a test voice file", file_types=[".wav"])
        test_button = gr.Button("Test Voice")
        test_output = gr.Textbox()
        test_button.click(test_voice, inputs=test_audio, outputs=test_output)

# Launch only when run as a script so importing this module has no network
# side effects.
if __name__ == "__main__":
    app.launch(share=True)