import json

import soundfile as sf
import numpy as np
import gradio as gr

import torch
import random

random.seed(0)
torch.manual_seed(0)
np.random.seed(0)

from util import print_size, sampling
from network import CleanUNet

import torchaudio
import torchaudio.transforms as T

SAMPLE_RATE = 22050


def load_simple(filename):
    # Load an audio file and resample it to the model's 22.05 kHz rate.
    wav, sr = torchaudio.load(filename)
    resampler = T.Resample(sr, SAMPLE_RATE, dtype=wav.dtype)
    return resampler(wav)


CONFIG = "configs/DNS-large-full.json"
CHECKPOINT = "./exp/DNS-large-full/checkpoint/pretrained.pkl"

# Parse configs; module-level globals are the simplest fit here.
with open(CONFIG) as f:
    config = json.load(f)

gen_config = config["gen_config"]
network_config = config["network_config"]    # defines the CleanUNet model
train_config = config["train_config"]        # training configuration
trainset_config = config["trainset_config"]  # trainset configuration


def denoise(filename, ckpt_path=CHECKPOINT, out="out.wav"):
    """
    Denoise a single audio file with a pretrained CleanUNet.

    Parameters:
    filename (str):   path to the noisy input audio
    ckpt_path (str):  path of the pretrained checkpoint to load
    out (str):        path the enhanced (denoised) audio is written to

    Returns the output path.
    """
    # setup local experiment path
    exp_path = train_config["exp_path"]
    print('exp_path:', exp_path)

    # predefine model
    net = CleanUNet(**network_config)
    print_size(net)

    # load checkpoint
    checkpoint = torch.load(ckpt_path, map_location='cpu')
    net.load_state_dict(checkpoint['model_state_dict'])
    net.eval()

    # inference; autocast is only enabled when CUDA is actually available,
    # so the demo also runs on CPU-only hosts
    device = "cuda" if torch.cuda.is_available() else "cpu"
    net = net.to(device)
    noisy_audio = load_simple(filename).to(device)
    with torch.no_grad():
        with torch.cuda.amp.autocast(enabled=(device == "cuda")):
            generated_audio = sampling(net, noisy_audio)

    generated_audio = generated_audio[0].squeeze().cpu().numpy()
    sf.write(out, np.ravel(generated_audio), SAMPLE_RATE)
    return out


# Earlier file-upload interface, kept for reference:
# audio = gr.inputs.Audio(label="Audio to denoise", type='filepath')
# inputs = [audio]
# outputs = gr.outputs.Audio(label="Denoised audio", type='filepath')
# title = "Speech Denoising in the Waveform Domain with Self-Attention from Nvidia"
# gr.Interface(denoise, inputs, outputs, title=title, enable_queue=True).launch()

mic_denoise = gr.Interface(
    fn=denoise,
    inputs=[
        gr.inputs.Audio(source="microphone", label="Audio to denoise",
                        type="filepath", optional=True),
    ],
    outputs=gr.outputs.Audio(label="Denoised audio", type="filepath"),
    layout="horizontal",
    title="My Demo: Speech enhancement",
    allow_flagging="never",
)

mic_denoise.launch()
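
# A minimal sketch of scripted (non-UI) use, assuming a local "noisy.wav"
# exists; the file names here are hypothetical. To use it, replace the
# launch() call above with:
#
#     result = denoise("noisy.wav", out="denoised.wav")
#     print("wrote", result)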