|
|
import gradio as gr |
|
|
import os |
|
|
import numpy as np |
|
|
import librosa |
|
|
from resemblyzer import VoiceEncoder |
|
|
import soundfile as sf |
|
|
|
|
|
|
|
|
# Speaker encoder shared by every handler; built once when the module loads.
encoder = VoiceEncoder()

# Embeddings of the training samples; starts empty and is replaced by
# train_voice_samples().
reference_embeddings = []
|
|
|
|
|
def load_audio(file, target_sr=16000):
    """Read ``file`` through librosa at ``target_sr`` Hz and return the samples.

    The sample rate reported by ``librosa.load`` is discarded; only the audio
    data is handed back to the caller.
    """
    samples, _ = librosa.load(file, sr=target_sr)
    return samples
|
|
|
|
|
def extract_embeddings(encoder, audio_files):
    """Embed every audio file and stack the results into one NumPy array.

    Each path in ``audio_files`` is loaded with ``load_audio`` and passed to
    ``encoder.embed_utterance``; the embeddings are returned in input order.
    """
    return np.array(
        [encoder.embed_utterance(load_audio(path)) for path in audio_files]
    )
|
|
|
|
|
def compute_similarity(embedding, reference_embeddings):
    """Return the mean cosine similarity between an embedding and references.

    Args:
        embedding: 1-D array-like test embedding.
        reference_embeddings: 2-D array-like with one reference embedding per
            row, or a single 1-D reference embedding.

    Returns:
        The mean of the per-reference cosine similarities. A pair in which
        either vector has zero norm contributes 0.0 instead of NaN, and an
        empty reference set yields 0.0.
    """
    test_vec = np.asarray(embedding, dtype=float)
    # A lone 1-D reference is promoted to a one-row matrix so axis=1 is valid.
    references = np.atleast_2d(np.asarray(reference_embeddings, dtype=float))
    if references.size == 0:
        return 0.0

    dots = references @ test_vec
    norms = np.linalg.norm(references, axis=1) * np.linalg.norm(test_vec)
    # Only divide where the norm product is non-zero; other entries stay 0.0,
    # which avoids the divide-by-zero warning and a NaN score.
    similarities = np.divide(
        dots, norms, out=np.zeros_like(dots), where=norms > 0
    )
    return np.mean(similarities)
|
|
|
|
|
def train_voice_samples(files):
    """Build the global reference embeddings from uploaded training files.

    Args:
        files: Sequence of uploaded files. Each item may be an object whose
            ``name`` attribute holds the file path, or a plain path string.
            ``None`` or an empty sequence means nothing was uploaded.

    Returns:
        A status message for the UI: a validation message, an ``Error: ...``
        message if embedding extraction fails, or a success message.
    """
    global reference_embeddings

    # Guard against a click with no upload; len(None) would raise TypeError.
    if not files:
        return "Please upload at least one voice sample."
    if len(files) > 50:
        return "Please upload up to 50 files only."

    paths = [getattr(item, "name", item) for item in files]
    try:
        embeddings = extract_embeddings(encoder, paths)
    except Exception as e:
        # Report failures as text, matching how test_voice reports errors.
        return f"Error: {str(e)}"

    # Assign only after success so a failed run keeps the previous references.
    reference_embeddings = embeddings
    return f"Extracted embeddings from {len(files)} voice samples. Ready for testing!"
|
|
|
|
|
def test_voice(file): |
|
|
try: |
|
|
if reference_embeddings is None or len(reference_embeddings) == 0: |
|
|
return "No reference voice samples found. Please upload training samples first." |
|
|
|
|
|
|
|
|
print(f"Received test file: {file.name}") |
|
|
|
|
|
|
|
|
test_audio, sr = librosa.load(file.name, sr=16000) |
|
|
|
|
|
|
|
|
print(f"Loaded test audio, shape: {test_audio.shape}, Sample rate: {sr}") |
|
|
|
|
|
|
|
|
test_embedding = encoder.embed_utterance(test_audio) |
|
|
|
|
|
|
|
|
similarity_score = compute_similarity(test_embedding, reference_embeddings) |
|
|
|
|
|
|
|
|
print(f"Computed similarity score: {similarity_score}") |
|
|
|
|
|
|
|
|
result = f"Similarity Score: {similarity_score:.2f}\n" |
|
|
if similarity_score > 0.8: |
|
|
result += "The voice matches closely with the training samples!\n" |
|
|
result += "You can access the demo content via the following link:\n" |
|
|
result += "[π Access Demo](https://example.com/demo)" |
|
|
else: |
|
|
result += "The voice does not match the training samples." |
|
|
return result |
|
|
|
|
|
except Exception as e: |
|
|
return f"Error: {str(e)}" |
|
|
|
|
|
# Assemble the Gradio interface: header text, a training row and a testing row.
with gr.Blocks() as app:
    gr.Markdown("## Voice Recognition with Similarity Testing")
    gr.Markdown(
        "**Instruction:** Upload a single file of more than 1-minute duration "
        "or multiple files totaling more than 1 minute."
    )
    gr.Markdown(
        "[π Link to Eleven Labs]"
        "(https://elevenlabs.io/app/speech-synthesis/text-to-speech)"
    )
    gr.Markdown("**Access Eleven Labs to test the model on multiple voices**")

    # Training row: upload reference samples and build their embeddings.
    with gr.Row():
        train_audio = gr.File(
            label="Upload up to 50 training voice samples",
            file_types=[".wav"],
            file_count="multiple",
        )
        train_button = gr.Button("Train Model")
        train_output = gr.Textbox()
        train_button.click(
            train_voice_samples,
            inputs=train_audio,
            outputs=train_output,
        )

    # Testing row: upload one file and compare it with the references.
    with gr.Row():
        test_audio = gr.File(
            label="Upload a test voice file",
            file_types=[".wav"],
        )
        test_button = gr.Button("Test Voice")
        test_output = gr.Textbox()
        test_button.click(
            test_voice,
            inputs=test_audio,
            outputs=test_output,
        )

# Start the app; share=True is passed through to launch() unchanged.
app.launch(share=True)