Spaces:
Sleeping
Sleeping
import torch | |
import torchaudio | |
import voicebox.src.attacks.offline.perturbation.voicebox.voicebox as vb #To access VoiceBox class | |
#import voicebox.src.attacks.online.voicebox_streamer as streamer #To access VoiceBoxStreamer class | |
import numpy as np | |
from voicebox.src.constants import PPG_PRETRAINED_PATH | |
from voicebox.src.models import ResNetSE34V2 | |
# Default VoiceBox hyperparameters.
# They are unpacked into the VoiceBox constructor further down, and the
# checkpoint is then loaded with strict=True, so every entry has to describe
# the same architecture the checkpoint was saved from.
LOOKAHEAD = 5  # value used for 'bottleneck_lookahead_frames'
voicebox_kwargs = dict(
    win_length=256,
    ppg_encoder_hidden_size=256,
    use_phoneme_encoder=True,
    use_pitch_encoder=True,
    use_loudness_encoder=True,
    spec_encoder_lookahead_frames=0,
    spec_encoder_type='mel',
    spec_encoder_mlp_depth=2,
    bottleneck_lookahead_frames=LOOKAHEAD,
    ppg_encoder_path=PPG_PRETRAINED_PATH,
    n_bands=128,
    spec_encoder_hidden_size=512,
    bottleneck_skip=True,
    bottleneck_hidden_size=512,
    bottleneck_feedforward_size=512,
    bottleneck_type='lstm',
    bottleneck_depth=2,
    control_eps=0.5,
    projection_norm=float('inf'),
    conditioning_dim=512,
)
# Disabled alternative: build the streaming variant from its YAML config
# instead of the offline model. Kept for reference only.
# NOTE: re-enabling it needs the commented-out `streamer` import at the top of
# the file, plus `import yaml`, which is not imported in the visible part of
# this file.
#
# #Set streamer default parameters:
# config_path = 'voicebox/pretrained/voicebox/voicebox_final.yaml'
# with open(config_path) as f:
#     config = yaml.safe_load(f)
# #Load pretrained model (streamer):
# model = streamer.VoiceBoxStreamer(**config)
# model.load_state_dict(torch.load('voicebox/pretrained/voicebox/voicebox_final.pt', map_location=torch.device('cpu')), strict=True)
# model.eval()

# Load pretrained model (VoiceBox):
model = vb.VoiceBox(**voicebox_kwargs)
_checkpoint_path = 'voicebox/pretrained/voicebox/voicebox_final.pt'
# map_location pins every tensor in the checkpoint to the CPU.
_state_dict = torch.load(_checkpoint_path, map_location=torch.device('cpu'))
# strict=True: the checkpoint keys must match the model's parameters exactly.
model.load_state_dict(_state_dict, strict=True)
# The model is only used for inference, so put it in evaluation mode.
model.eval()
#Define function to convert final audio format: | |
def float32_to_int16(waveform):
    """Peak-normalise a float waveform and convert it to flat 16-bit PCM.

    Args:
        waveform: NumPy array (or array-like) of audio samples, any shape.

    Returns:
        A 1-D ``np.int16`` array. The samples are scaled so that the largest
        absolute value maps to 32767. An all-zero signal stays all zeros and
        an empty signal yields an empty array.
    """
    waveform = np.asarray(waveform)
    # A silent or empty signal has no peak to divide by; dividing anyway would
    # produce NaNs (or raise on empty input), so skip the normalisation then.
    peak = np.abs(waveform).max() if waveform.size else 0
    if peak > 0:
        waveform = waveform / peak
    waveform = waveform * 32767
    # astype truncates toward zero; ravel flattens any channel dimension.
    return waveform.astype(np.int16).ravel()
# Speaker encoder shared by all get_embedding() calls; built on first use.
_speaker_encoder = None


def get_embedding(recording):
    """Compute a speaker embedding for a recording with ResNetSE34V2.

    Args:
        recording: ``torch.Tensor`` of audio samples. It is flattened to a
            single row of shape ``(1, n_samples)`` before being encoded.

    Returns:
        The encoder output for the flattened recording, detached from
        autograd. Presumably 512 values, given ``nOut=512`` -- TODO confirm.
    """
    global _speaker_encoder
    if _speaker_encoder is None:
        # Build the encoder once instead of on every request, so repeated
        # calls use the same weights and skip the construction cost.
        # NOTE(review): no checkpoint is loaded here. Unless ResNetSE34V2
        # restores pretrained weights internally, the embedding comes from
        # randomly initialised parameters -- confirm and load weights if so.
        _speaker_encoder = ResNetSE34V2(nOut=512, encoder_type='ASP')
        # Inference only: evaluation mode keeps normalisation layers from
        # updating their statistics on each request.
        _speaker_encoder.eval()
    # reshape (unlike view) also accepts non-contiguous tensors.
    recording = recording.reshape(1, -1)
    with torch.no_grad():
        return _speaker_encoder(recording)
#Define predict function: | |
def predict(inp):
    """Run an audio file through VoiceBox and return Gradio-playable audio.

    Args:
        inp: Path of the audio file to process (the Gradio input component
            is configured with ``type="filepath"``).

    Returns:
        A ``(sample_rate, samples)`` tuple: the sample rate is always 16000
        and ``samples`` is the flat int16 array from ``float32_to_int16``.

    Raises:
        ValueError: If no audio file was supplied.
    """
    if not inp:
        # Gradio passes None when the form is submitted without audio.
        raise ValueError("No audio provided")

    target_rate = 16000

    # Load the file as a (channels, samples) float tensor.
    waveform, sample_rate = torchaudio.load(inp)

    # Mix multi-channel audio down to mono. Otherwise the embedding step
    # would flatten the channels end-to-end and the model would receive
    # more than one row.
    if waveform.shape[0] > 1:
        waveform = waveform.mean(dim=0, keepdim=True)

    # Resample to 16kHz (skipped when the file is already at that rate).
    if sample_rate != target_rate:
        resample = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=target_rate)
        waveform = resample(waveform)

    # Inference only: keep autograd off for the embedding and the model.
    with torch.no_grad():
        # Get speaker embedding and tile it along the time axis.
        condition_tensor = get_embedding(waveform).reshape(1, 1, -1)
        # NOTE(review): this is the number of audio samples, not a frame
        # count, so the conditioning tensor has one row per sample. Confirm
        # the shape VoiceBox expects for `y`.
        n_frames = waveform.shape[1]
        condition_tensor = condition_tensor.repeat(1, n_frames, 1)
        waveform = model(x=waveform, y=condition_tensor)

    # Transform output audio into gradio-readable format.
    waveform = waveform.detach().cpu().numpy()
    return target_rate, float32_to_int16(waveform)
# Set up gradio interface: one audio input routed through predict() to one
# audio output, then start the app.
import gradio as gr

# type="filepath" makes Gradio hand predict() a path rather than raw samples.
audio_in = gr.Audio(type="filepath")
audio_out = gr.Audio()
interface = gr.Interface(fn=predict, inputs=audio_in, outputs=audio_out)
interface.launch()