from transformers import Wav2Vec2Processor, HubertModel
import soundfile as sf
import numpy as np
import torch

print("Loading the Wav2Vec2 Processor...") |
|
wav2vec2_processor = Wav2Vec2Processor.from_pretrained("facebook/hubert-large-ls960-ft") |
|
print("Loading the HuBERT Model...") |
|
hubert_model = HubertModel.from_pretrained("facebook/hubert-large-ls960-ft") |
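# Note: processor and model both come from the "facebook/hubert-large-ls960-ft"
# checkpoint, which expects 16 kHz mono audio and produces 1024-dim hidden states,
# one frame per 320 input samples (~50 Hz).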


def get_hubert_from_16k_wav(wav_16k_name):
    speech_16k, _ = sf.read(wav_16k_name)
    hubert = get_hubert_from_16k_speech(speech_16k)
    return hubert

@torch.no_grad()
def get_hubert_from_16k_speech(speech, device="cuda:0"):
    global hubert_model
    hubert_model = hubert_model.to(device)
    if speech.ndim == 2:
        speech = speech[:, 0]  # stereo -> mono: keep the first channel
    input_values_all = wav2vec2_processor(speech, return_tensors="pt", sampling_rate=16000).input_values  # [1, T]
    input_values_all = input_values_all.to(device)
    # Long clips are processed in chunks to bound GPU memory. HuBERT's convolutional
    # front-end acts like a single Conv1D with kernel 400 and stride 320 samples
    # (25 ms window, 20 ms hop at 16 kHz), so a waveform of t samples yields roughly
    # (t - (kernel - stride)) // stride frames. Each chunk spans clip_length samples
    # plus a (kernel - stride) tail so that the concatenated per-chunk outputs line
    # up with what a single full-length forward pass would produce.
    kernel = 400
    stride = 320
    clip_length = stride * 1000  # samples per chunk -> 1000 output frames per chunk
    num_iter = input_values_all.shape[1] // clip_length
    expected_T = (input_values_all.shape[1] - (kernel - stride)) // stride
    res_lst = []
    for i in range(num_iter):
        if i == 0:
            start_idx = 0
            end_idx = clip_length - stride + kernel
        else:
            start_idx = clip_length * i
            end_idx = start_idx + (clip_length - stride + kernel)
        input_values = input_values_all[:, start_idx: end_idx]
        hidden_states = hubert_model(input_values).last_hidden_state  # [1, chunk_T, 1024]
        res_lst.append(hidden_states[0])
    # Handle the remainder that did not fill a whole chunk.
    if num_iter > 0:
        input_values = input_values_all[:, clip_length * num_iter:]
    else:
        input_values = input_values_all

    if input_values.shape[1] >= kernel:  # skip remainders shorter than one receptive field
        hidden_states = hubert_model(input_values).last_hidden_state
        res_lst.append(hidden_states[0])
    ret = torch.cat(res_lst, dim=0).cpu()  # [T, 1024]

    # Chunked and full-length processing can differ by at most one frame;
    # pad or trim so the output length is exactly expected_T.
    assert abs(ret.shape[0] - expected_T) <= 1
    if ret.shape[0] < expected_T:
        ret = torch.nn.functional.pad(ret, (0, 0, 0, expected_T - ret.shape[0]))
    else:
        ret = ret[:expected_T]
    return ret
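
# Standalone check of the extractor (hypothetical file name):
#   feats = get_hubert_from_16k_wav("sample_16k.wav")  # torch.FloatTensor, shape [T, 1024]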


def make_even_first_dim(tensor):
    # Drop the last frame if the frame count is odd, so the tensor can be
    # reshaped into pairs of consecutive frames below.
    size = list(tensor.size())
    if size[0] % 2 == 1:
        size[0] -= 1
        return tensor[:size[0]]
    return tensor
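
# e.g. a 7-frame tensor is trimmed to 6 frames:
#   make_even_first_dim(torch.zeros(7, 1024)).shape  ->  torch.Size([6, 1024])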

from argparse import ArgumentParser
import librosa

parser = ArgumentParser()
parser.add_argument('--wav', type=str, help='path to the input wav file')
args = parser.parse_args()

wav_name = args.wav

speech, sr = sf.read(wav_name)
if speech.ndim == 2:
    speech = speech[:, 0]  # stereo wav: keep the first channel before resampling
speech_16k = librosa.resample(speech, orig_sr=sr, target_sr=16000)
print("SR: {} to {}".format(sr, 16000))

hubert_hidden = get_hubert_from_16k_speech(speech_16k)
hubert_hidden = make_even_first_dim(hubert_hidden).reshape(-1, 2, 1024)  # [N, 2, 1024]: pairs of consecutive 50 Hz frames
np.save(wav_name.replace('.wav', '_hu.npy'), hubert_hidden.detach().numpy())
print(hubert_hidden.detach().numpy().shape)
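
# Example usage (assuming this script is saved as hubert.py):
#   python hubert.py --wav data/sample.wav
# writes data/sample_hu.npy with shape (N, 2, 1024): two consecutive 50 Hz HuBERT
# frames per row, which lines up with 25 fps video frames if that is the downstream target.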