File size: 1,541 Bytes
eb0573c
b80ddf0
 
c269c69
b80ddf0
 
ff6ce01
 
 
 
 
 
 
529fa0b
b80ddf0
 
001ec5c
 
529fa0b
b80ddf0
3e91b4e
 
 
b80ddf0
ff6ce01
 
b80ddf0
 
ff6ce01
 
b80ddf0
 
 
 
3e91b4e
 
 
 
 
b80ddf0
 
529fa0b
ff6ce01
 
0c6f787
45caaa1
635d837
a841262
3e91b4e
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50

import os
import torchaudio
import torch
import numpy as np
import gradio as gr

from hubert.hubert_manager import HuBERTManager
from hubert.pre_kmeans_hubert import CustomHubert
from hubert.customtokenizer import CustomTokenizer
from encodec import EncodecModel
from encodec.utils import convert_audio

# Checkpoint files are resolved relative to the current working directory.
_HUBERT_CHECKPOINT = 'hubert.pt'
_TOKENIZER_CHECKPOINT = 'polish-HuBERT-quantizer_8_epoch.pth'
# Target bandwidth passed to EnCodec (kbps according to the EnCodec docs).
_ENCODEC_BANDWIDTH = 6.0

# HuBERT feature extractor used to obtain the semantic vectors.
hubert_model = CustomHubert(checkpoint_path=_HUBERT_CHECKPOINT)

# 24 kHz EnCodec model used to produce the acoustic codebook tokens.
model = EncodecModel.encodec_model_24khz()
model.set_target_bandwidth(_ENCODEC_BANDWIDTH)

# Quantizer that maps HuBERT vectors to semantic tokens; loaded onto the CPU.
tokenizer = CustomTokenizer.load_from_checkpoint(
    _TOKENIZER_CHECKPOINT,
    map_location=torch.device('cpu'),
)


def _to_numpy(value):
    """Return *value* as a NumPy array, detaching tensors and moving them to the CPU."""
    if isinstance(value, torch.Tensor):
        return value.detach().cpu().numpy()
    return np.asarray(value)


def process_audio(in_file):
    """Convert an uploaded audio file into an ``.npz`` prompt archive.

    The audio is downmixed to mono, passed through the HuBERT model and the
    tokenizer to obtain semantic tokens, and encoded with EnCodec to obtain
    the codebook tokens. Three arrays are written to the archive:

    * ``semantic_prompt`` - output of ``tokenizer.get_token``.
    * ``fine_prompt`` - all EnCodec codebooks.
    * ``coarse_prompt`` - the first two rows of ``fine_prompt``.

    Args:
        in_file: Either a file-like object exposing the path in ``.name``
            or the path itself as a string.

    Returns:
        Path of the written archive: the input path with its extension
        replaced by ``.npz``.

    Raises:
        ValueError: If no file was provided.
    """
    if in_file is None:
        raise ValueError('No input audio file was provided.')

    # Accept both a file wrapper (path in ``.name``) and a plain path string.
    input_filename = getattr(in_file, 'name', in_file)

    wav, sr = torchaudio.load(input_filename)
    # Downmix any multi-channel recording (not only stereo) to mono.
    if wav.shape[0] > 1:
        wav = wav.mean(0, keepdim=True)

    # Pure inference: no autograd graph is needed for any of the models.
    with torch.no_grad():
        semantic_vectors = hubert_model.forward(wav, input_sample_hz=sr)
        semantic_tokens = tokenizer.get_token(semantic_vectors)

        # Resample / re-channel to what EnCodec expects, then add a batch axis.
        wav = convert_audio(wav, sr, model.sample_rate, model.channels)
        encoded_frames = model.encode(wav.unsqueeze(0))

    # Join the frames along time and drop only the batch axis; a bare
    # ``squeeze()`` would also remove the time axis for a single-step result
    # and break the 2-D slicing below.
    codes = torch.cat([encoded[0] for encoded in encoded_frames], dim=-1).squeeze(0)
    fine_prompt = codes
    coarse_prompt = fine_prompt[:2, :]

    output_filename = os.path.splitext(input_filename)[0] + '.npz'

    # Convert explicitly so saving does not depend on NumPy's implicit
    # handling of torch tensors.
    np.savez(
        output_filename,
        semantic_prompt=_to_numpy(semantic_tokens),
        fine_prompt=_to_numpy(fine_prompt),
        coarse_prompt=_to_numpy(coarse_prompt),
    )
    return output_filename

# Upload widget for the source audio and download widget for the resulting archive.
audio_input = gr.inputs.File(label="Input Audio")
archive_output = gr.outputs.File(label="Output File")

# Wire process_audio into a Gradio interface and start serving it.
iface = gr.Interface(
    fn=process_audio,
    inputs=audio_input,
    outputs=archive_output,
)
iface.launch()