File size: 4,603 Bytes
dbc5a52
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
import functools

import gradio as gr
import librosa
import noisereduce as nr
import torch
import torch.nn.functional as F
import torchaudio
from speechbrain.inference.speaker import EncoderClassifier
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan

# Load the SpeechBrain speaker-recognition model once, at import time, so every
# request reuses the same instance. `savedir` is the local directory handed to
# SpeechBrain for the downloaded model files.
classifier = EncoderClassifier.from_hparams(
    source="speechbrain/spkrec-xvect-voxceleb",
    savedir="pretrained_models/spkrec-xvect-voxceleb",
)


def f2embed(wav_file, classifier, size_embed):
    """Compute a normalized speaker embedding for an audio file.

    The file is loaded and down-mixed with ``stereo_to_mono``, resampled with
    ``resample_to_16000`` when its sample rate is not 16000 Hz, and then passed
    to ``classifier.encode_batch``. The result is L2-normalized along dim 2,
    squeezed, and returned as a NumPy array.

    Args:
        wav_file: Path of the audio file to embed.
        classifier: Object exposing ``encode_batch(signal)``.
        size_embed: Expected length of the first axis of the embedding.

    Returns:
        The embedding as a NumPy array, or ``None`` when loading or
        resampling the audio failed.

    Raises:
        ValueError: If the sample rate is not 16000 Hz after resampling, or
            if the embedding's first axis does not have ``size_embed`` items.
    """
    signal, fs = stereo_to_mono(wav_file)
    if signal is None:
        return None

    if fs != 16000:
        signal, fs = resample_to_16000(signal, fs)
        if signal is None:
            return None

    # Explicit checks instead of `assert`: assertions are stripped when Python
    # runs with -O, which would silently disable this validation.
    if fs != 16000:
        raise ValueError(f"Expected a 16000 Hz signal, got {fs}")

    with torch.no_grad():
        embeddings = classifier.encode_batch(signal)
        embeddings = F.normalize(embeddings, dim=2)
        embeddings = embeddings.squeeze().cpu().numpy()

    # `shape[:1]` is `()` for a 0-d array, so this also rejects a scalar
    # result instead of failing with an IndexError.
    if embeddings.shape[:1] != (size_embed,):
        raise ValueError(
            f"Expected an embedding of size {size_embed}, got shape {embeddings.shape}"
        )
    return embeddings

def stereo_to_mono(wav_file):
    """Load an audio file and down-mix it to a single channel.

    Any input with more than one channel (not only stereo) is averaged across
    channels, so the returned tensor always has a leading dimension of 1.
    Single-channel input is returned as loaded.

    Args:
        wav_file: Path of the audio file, passed to ``torchaudio.load``.

    Returns:
        A ``(signal, sample_rate)`` tuple, where ``signal`` is a tensor whose
        first dimension is 1. Returns ``(None, None)`` if loading or
        down-mixing fails; the error is printed.
    """
    try:
        signal, fs = torchaudio.load(wav_file)
        if signal.shape[0] > 1:
            # Average over the channel axis, keeping it as a size-1 dimension.
            signal_mono = signal.mean(dim=0, keepdim=True)
        else:
            signal_mono = signal  # Already mono
        print(f"Converted to mono: {signal_mono.shape}")
        return signal_mono, fs
    except Exception as e:
        # Best-effort loader: report the problem and let the caller decide.
        print(f"Error in stereo_to_mono: {e}")
        return None, None

def resample_to_16000(signal, original_sr):
    """Resample a signal tensor to 16000 Hz.

    The tensor is converted to a flat NumPy array, resampled with
    ``librosa.resample``, and wrapped back into a tensor with a leading
    dimension of 1.

    Args:
        signal: Tensor holding the audio samples.
        original_sr: Sample rate of ``signal`` in Hz.

    Returns:
        ``(resampled_tensor, 16000)`` on success, or ``(None, None)`` if any
        step fails; the error is printed.
    """
    target_sr = 16000
    try:
        flat_samples = signal.numpy().flatten()
        resampled = librosa.resample(
            flat_samples, orig_sr=original_sr, target_sr=target_sr
        )
        resampled_tensor = torch.from_numpy(resampled).unsqueeze(0)
        print(f"Resampled to 16000 Hz: {resampled_tensor.shape}")
        return resampled_tensor, target_sr
    except Exception as err:
        print(f"Error in resample_to_16000: {err}")
        return None, None

def reduce_noise(speech, noise_reduction_amount=0.5):
    """Denoise ``speech`` with ``noisereduce``, falling back to the input.

    Args:
        speech: Audio samples passed to ``nr.reduce_noise`` as ``y`` with a
            sample rate of 16000.
        noise_reduction_amount: Accepted for API compatibility; the body does
            not currently use it.

    Returns:
        The denoised audio, or ``speech`` unchanged if denoising raised; the
        error is printed.
    """
    try:
        return nr.reduce_noise(y=speech, sr=16000)
    except Exception as err:
        print(f"Error in reduce_noise: {err}")
    return speech



@functools.lru_cache(maxsize=1)
def _load_tts_components():
    """Load the SpeechT5 processor, TTS model and HiFi-GAN vocoder once.

    The result is cached so the models are not re-instantiated on every
    call to ``process_audio``.
    """
    processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
    model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
    vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
    return processor, model, vocoder


def process_audio(wav_file, text):
    """Synthesize ``text`` using a speaker embedding taken from ``wav_file``.

    Args:
        wav_file: Path of the reference speaker recording.
        text: Text to convert to speech.

    Returns:
        ``(audio, 16000)`` on success, where ``audio`` is the denoised
        waveform. On failure returns ``(None, message)`` with a short error
        description; exceptions are printed rather than propagated.
    """
    # Reject missing or blank text up front instead of synthesizing from an
    # empty prompt.
    if not isinstance(text, str) or not text.strip():
        return None, "Error: no text provided"

    try:
        # Extract speaker embeddings
        speaker_embeddings = f2embed(wav_file, classifier, 512)
        if speaker_embeddings is None:
            return None, "Error in speaker embedding extraction"

        # Add a leading batch dimension for the TTS model.
        embeddings = torch.tensor(speaker_embeddings).unsqueeze(0)

        # Convert text to speech using the speaker embeddings
        processor, model, vocoder = _load_tts_components()
        inputs = processor(text=text, return_tensors="pt")
        speech = model.generate_speech(
            inputs["input_ids"],
            speaker_embeddings=embeddings,
            vocoder=vocoder,
        )
        print(f"Generated speech, shape: {speech.shape}")

        # Convert to NumPy before denoising so that the value returned is an
        # array even when reduce_noise falls back to returning its input.
        speech_np = speech.detach().cpu().numpy()

        # Reduce noise
        speech_denoised = reduce_noise(speech_np)
        print(f"Reduced noise, signal shape: {speech_denoised.shape}")
        return speech_denoised, 16000
    except Exception as e:
        print(f"Error in process_audio: {e}")
        return None, "Error in audio processing"

# Gradio interface
def gradio_interface(wav_file, text):
    """Gradio callback: run ``process_audio`` and return audio for ``gr.Audio``.

    Args:
        wav_file: Path of the uploaded speaker recording.
        text: Text entered by the user.

    Returns:
        A ``(sample_rate, audio)`` tuple for the audio output component.

    Raises:
        gr.Error: If processing fails. The output component expects audio,
            so failures are raised as ``gr.Error`` instead of being returned
            as a plain string.
    """
    generic_message = "Error occurred during processing"
    try:
        processed_audio, rate = process_audio(wav_file, text)
    except Exception as e:
        print(f"Error in gradio_interface: {e}")
        raise gr.Error(generic_message) from e

    if processed_audio is None:
        # On failure process_audio puts its error message in the second slot.
        message = rate if isinstance(rate, str) else generic_message
        raise gr.Error(message)
    return (rate, processed_audio)

# Build the Gradio UI: an audio upload (delivered to the callback as a file
# path) and a text box feed `gradio_interface`, whose result is shown in an
# audio output component.
gr_interface = gr.Interface(
    fn=gradio_interface,
    inputs=[
        gr.Audio(type="filepath"),
        gr.Textbox(lines=2, placeholder="Enter text here..."),
    ],
    outputs=gr.Audio(type="numpy"),
    title="Text-to-Speech with Speaker Embeddings",
    description="Upload a speaker audio file and enter text to convert the text to speech using the speaker's voice.",
)

# Start the app.
gr_interface.launch()


# process_audio("/content/Network Chunck.mp3","Hello this network chunk")