|
|
|
import os |
|
import torchaudio |
|
import torch |
|
import numpy as np |
|
import gradio as gr |
|
|
|
from hubert.hubert_manager import HuBERTManager |
|
from hubert.pre_kmeans_hubert import CustomHubert |
|
from hubert.customtokenizer import CustomTokenizer |
|
from encodec import EncodecModel |
|
from encodec.utils import convert_audio |
|
|
|
# --- Model loading (module-level, runs once at startup) ---
# HuBERT feature extractor; expects 'hubert.pt' in the working directory —
# presumably downloaded beforehand via HuBERTManager (imported above but unused here).
hubert_model = CustomHubert(checkpoint_path='hubert.pt')

# Encodec 24 kHz neural codec used to produce the acoustic (fine/coarse) prompts.
model = EncodecModel.encodec_model_24khz()

# 6.0 kbps target bandwidth — NOTE(review): this selects the number of codebooks
# Encodec emits; confirm it matches what the downstream consumer expects.
model.set_target_bandwidth(6.0)

# Quantizer mapping HuBERT features to discrete semantic tokens.
# Loaded on CPU; checkpoint filename suggests a Polish-language tokenizer.
tokenizer = CustomTokenizer.load_from_checkpoint('polish-HuBERT-quantizer_8_epoch.pth', map_location=torch.device('cpu'))
|
|
|
|
|
def process_audio(in_file):
    """Build a Bark-style voice-prompt .npz from an uploaded audio file.

    Extracts HuBERT semantic tokens and Encodec acoustic codes from the
    audio, then saves them as ``semantic_prompt`` / ``fine_prompt`` /
    ``coarse_prompt`` arrays next to the input file.

    Args:
        in_file: Uploaded file object (Gradio File); only ``.name`` (the
            temp-file path) is used.

    Returns:
        str: Path of the written ``.npz`` file.
    """
    input_filename = in_file.name

    wav, sr = torchaudio.load(input_filename)
    # Downmix ANY multi-channel input to mono (the original only handled
    # exactly 2 channels, silently passing >2-channel audio through).
    if wav.shape[0] > 1:
        wav = wav.mean(0, keepdim=True)

    # Inference only — run the whole pipeline without gradient tracking
    # (the original tracked gradients through the HuBERT pass needlessly).
    with torch.no_grad():
        # Semantic path: HuBERT features -> discrete semantic tokens.
        semantic_vectors = hubert_model.forward(wav, input_sample_hz=sr)
        semantic_tokens = tokenizer.get_token(semantic_vectors)

        # Acoustic path: resample/re-channel to Encodec's expected format.
        wav = convert_audio(wav, sr, model.sample_rate, model.channels)
        wav = wav.unsqueeze(0)  # add batch dimension
        encoded_frames = model.encode(wav)

    # Concatenate per-frame codebooks along time: (n_codebooks, T).
    codes = torch.cat([encoded[0] for encoded in encoded_frames], dim=-1).squeeze()

    fine_prompt = codes
    # Coarse prompt = first two codebooks only (Bark convention).
    coarse_prompt = fine_prompt[:2, :]

    output_filename = os.path.splitext(input_filename)[0] + '.npz'

    np.savez(output_filename, semantic_prompt=semantic_tokens, fine_prompt=fine_prompt, coarse_prompt=coarse_prompt)
    return output_filename
|
|
|
# Gradio UI: one file in, one file out.
# NOTE: gr.inputs.File / gr.outputs.File are the pre-3.0 namespaces and were
# removed in Gradio 3.x; gr.File works as both an input and output component.
iface = gr.Interface(fn=process_audio, inputs=gr.File(label="Input Audio"), outputs=gr.File(label="Output File"))

iface.launch()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|