| | |
| | import torch |
| | import torchaudio |
| | import librosa |
| | import sounddevice as sd |
| | import streamlit as st |
| | from transformers import HubertModel, HubertProcessor |
| | from speechbrain.inference import HIFIGAN |
| |
|
| | |
| | |
| | |
| | st.write("Loading models, please wait...") |
| | processor = HubertProcessor.from_pretrained("facebook/hubert-large-ls960-ft") |
| | hubert_model = HubertModel.from_pretrained("facebook/hubert-large-ls960-ft") |
| | vocoder = HIFIGAN.from_hparams(source="speechbrain/tts-hifigan-ljspeech", savedir="tmp_hifigan") |
| |
|
| | |
| | |
| | |
def extract_speaker_embedding(audio_path):
    """Return a mean-pooled HuBERT hidden-state vector for an audio file.

    The file is loaded at 16 kHz (HuBERT's expected rate), run through the
    frozen encoder, and the hidden states are averaged over the time axis,
    yielding a single embedding tensor of shape (1, hidden_dim).
    """
    signal, rate = librosa.load(audio_path, sr=16000)
    model_inputs = processor(signal, sampling_rate=rate, return_tensors="pt", padding=True)
    with torch.no_grad():
        hidden_states = hubert_model(**model_inputs).last_hidden_state
    return hidden_states.mean(dim=1)
| |
|
def audio_to_mel(audio_path, target_sr=22050, n_mels=80):
    """Compute a log-mel spectrogram compatible with the LJSpeech HiFi-GAN vocoder.

    Bug fixed: the original call `MelSpectrogram(sample_rate)(waveform)` used
    all torchaudio defaults — a 128-bin *power* spectrogram at the file's
    native sample rate — which the speechbrain `tts-hifigan-ljspeech` vocoder
    (trained on 80-bin log magnitude mels at 22.05 kHz) cannot consume.

    Args:
        audio_path: path to an audio file readable by torchaudio.
        target_sr: sample rate the vocoder was trained at (default 22050 Hz).
        n_mels: number of mel bins the vocoder expects (default 80).

    Returns:
        Log-compressed mel spectrogram tensor of shape (1, n_mels, frames).
    """
    waveform, sample_rate = torchaudio.load(audio_path)
    # Fold multi-channel audio to mono; the vocoder is single-channel.
    if waveform.size(0) > 1:
        waveform = waveform.mean(dim=0, keepdim=True)
    # Resample to the vocoder's training rate if needed.
    if sample_rate != target_sr:
        waveform = torchaudio.functional.resample(waveform, sample_rate, target_sr)
    # Parameters follow the standard HiFi-GAN LJSpeech recipe
    # (n_fft/win 1024, hop 256, fmax 8 kHz, slaney mel filterbank,
    # magnitude spectrogram) — TODO confirm against the speechbrain hparams.
    mel_transform = torchaudio.transforms.MelSpectrogram(
        sample_rate=target_sr,
        n_fft=1024,
        win_length=1024,
        hop_length=256,
        f_min=0.0,
        f_max=8000.0,
        n_mels=n_mels,
        power=1.0,
        norm="slaney",
        mel_scale="slaney",
    )
    mel_spec = mel_transform(waveform)
    # Dynamic-range compression (log with a clamp floor), as in HiFi-GAN training.
    return torch.log(torch.clamp(mel_spec, min=1e-5))
| |
|
def mel_to_audio(mel_spec):
    """Vocode a batch of mel spectrograms back into a waveform tensor.

    Thin wrapper over the module-level speechbrain HiFi-GAN vocoder.
    """
    return vocoder.decode_batch(mel_spec)
| |
|
def convert_voice(input_path, reference_path):
    """Naively bias the input's mel spectrogram with the reference speaker embedding.

    NOTE(review): this is NOT a real voice-conversion model — simply adding a
    HuBERT embedding to a mel spectrogram has no learned mapping behind it and
    will not transfer speaker identity.
    """
    ref_emb = extract_speaker_embedding(reference_path)
    mel_spec = audio_to_mel(input_path)
    # NOTE(review): likely broadcast bug — ref_emb is (1, hidden_dim) (1024 for
    # hubert-large), so unsqueeze(-1) gives (1, 1024, 1), while mel_spec's
    # mel-bin axis is a different size (torchaudio default 128). The addition
    # should fail at runtime with a shape mismatch — confirm and redesign.
    converted_mel = mel_spec + ref_emb.unsqueeze(-1)
    waveform = mel_to_audio(converted_mel)
    return waveform
| |
|
def play_audio(waveform, sample_rate=16000):
    """Play a torch waveform on the default sound device, blocking until done."""
    samples = waveform.squeeze().cpu().numpy()
    sd.play(samples, sample_rate)
    sd.wait()
| |
|
| | |
| | |
| | |
| | st.title("Advanced RVC Voice Converter") |
| |
|
| | input_audio = st.file_uploader("Upload Input Audio", type=["wav", "mp3"]) |
| | reference_audio = st.file_uploader("Upload Reference Audio", type=["wav", "mp3"]) |
| |
|
| | if input_audio and reference_audio: |
| | if st.button("Convert Voice"): |
| | |
| | input_path = "temp_input.wav" |
| | reference_path = "temp_reference.wav" |
| | with open(input_path, "wb") as f: |
| | f.write(input_audio.read()) |
| | with open(reference_path, "wb") as f: |
| | f.write(reference_audio.read()) |
| | |
| | st.write("Converting voice...") |
| | waveform = convert_voice(input_path, reference_path) |
| | st.write("Playing converted audio...") |
| | play_audio(waveform) |
| | st.audio(waveform.squeeze().cpu().numpy(), format="audio/wav") |
| |
|