File size: 1,541 Bytes
eb0573c b80ddf0 c269c69 b80ddf0 ff6ce01 529fa0b b80ddf0 001ec5c 529fa0b b80ddf0 3e91b4e b80ddf0 ff6ce01 b80ddf0 ff6ce01 b80ddf0 3e91b4e b80ddf0 529fa0b ff6ce01 0c6f787 45caaa1 635d837 a841262 3e91b4e |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 |
import os
import torchaudio
import torch
import numpy as np
import gradio as gr
from hubert.hubert_manager import HuBERTManager
from hubert.pre_kmeans_hubert import CustomHubert
from hubert.customtokenizer import CustomTokenizer
from encodec import EncodecModel
from encodec.utils import convert_audio
# Checkpoint files are resolved relative to the working directory.
_HUBERT_CHECKPOINT = 'hubert.pt'
_TOKENIZER_CHECKPOINT = 'polish-HuBERT-quantizer_8_epoch.pth'

# HuBERT wrapper used by process_audio to produce semantic vectors.
hubert_model = CustomHubert(checkpoint_path=_HUBERT_CHECKPOINT)

# 24 kHz EnCodec model, configured for a 6.0 target bandwidth.
model = EncodecModel.encodec_model_24khz()
model.set_target_bandwidth(6.0)

# Quantizer that maps semantic vectors to tokens; weights are mapped to the CPU.
tokenizer = CustomTokenizer.load_from_checkpoint(
    _TOKENIZER_CHECKPOINT,
    map_location=torch.device('cpu'),
)
def _to_numpy(value):
    """Return *value* as a NumPy array, copying torch tensors to host memory."""
    if isinstance(value, torch.Tensor):
        # detach + cpu so tensors that track gradients or live on an
        # accelerator can still be serialized by NumPy.
        return value.detach().cpu().numpy()
    return np.asarray(value)


def process_audio(in_file):
    """Create a voice-prompt ``.npz`` archive from an uploaded audio file.

    The audio is loaded with torchaudio, mixed down to a single channel,
    passed through ``hubert_model`` and ``tokenizer`` to obtain semantic
    tokens, and encoded with the EnCodec ``model`` to obtain codebook
    indices. Three arrays are stored in the archive:

    * ``semantic_prompt`` -- output of ``tokenizer.get_token``.
    * ``fine_prompt`` -- all EnCodec code rows.
    * ``coarse_prompt`` -- the first two rows of ``fine_prompt``.

    Args:
        in_file: Either an object exposing the file path as ``.name`` (as
            passed by the Gradio file input) or a path string / path-like.

    Returns:
        Path of the written archive: the input path with its extension
        replaced by ``.npz``.

    Raises:
        ValueError: If no file was supplied.
    """
    if in_file is None:
        raise ValueError("No input audio file was provided.")

    # Plain paths are checked first because pathlib objects also have a
    # ``.name`` attribute, which holds only the basename.
    if isinstance(in_file, (str, os.PathLike)):
        input_filename = os.fspath(in_file)
    else:
        input_filename = in_file.name

    wav, sr = torchaudio.load(input_filename)

    # Mix any multi-channel recording down to mono, not only stereo.
    if wav.shape[0] > 1:
        wav = wav.mean(0, keepdim=True)

    # Inference only: no gradients are needed for any of the models.
    with torch.no_grad():
        semantic_vectors = hubert_model.forward(wav, input_sample_hz=sr)
        semantic_tokens = tokenizer.get_token(semantic_vectors)

        # Match the codec's sample rate / channel layout and add a batch axis.
        wav = convert_audio(wav, sr, model.sample_rate, model.channels)
        encoded_frames = model.encode(wav.unsqueeze(0))

    # Drop only the batch axis; a bare squeeze() would also collapse a
    # length-1 time axis and break the 2-D slicing below.
    codes = torch.cat([encoded[0] for encoded in encoded_frames], dim=-1).squeeze(0)

    fine_prompt = _to_numpy(codes)
    coarse_prompt = fine_prompt[:2, :]

    output_filename = os.path.splitext(input_filename)[0] + '.npz'
    np.savez(
        output_filename,
        semantic_prompt=_to_numpy(semantic_tokens),
        fine_prompt=fine_prompt,
        coarse_prompt=coarse_prompt,
    )
    return output_filename
# Web UI: one uploaded file in, one downloadable file out.
audio_input = gr.inputs.File(label="Input Audio")
prompt_output = gr.outputs.File(label="Output File")

iface = gr.Interface(
    fn=process_audio,
    inputs=audio_input,
    outputs=prompt_output,
)
iface.launch()
|