Delete dreamvoice/src/feats/.ipynb_checkpoints
Browse files
dreamvoice/src/feats/.ipynb_checkpoints/contentvec-checkpoint.py
DELETED
@@ -1,42 +0,0 @@
|
|
1 |
-
import torch
|
2 |
-
import librosa
|
3 |
-
from fairseq import checkpoint_utils
|
4 |
-
import torch.nn.functional as F
|
5 |
-
|
6 |
-
|
7 |
-
def get_model(vec_path):
    """Load a checkpoint through fairseq and return its first model in eval mode.

    Args:
        vec_path: Checkpoint path handed to
            ``checkpoint_utils.load_model_ensemble_and_task``.

    Returns:
        The first model of the loaded ensemble, after ``eval()`` was called on it.
    """
    print("load model(s) from {}".format(vec_path))
    ensemble, _saved_cfg, _task = checkpoint_utils.load_model_ensemble_and_task(
        [vec_path], suffix=""
    )
    # Only the first member of the ensemble is used.
    first_model = ensemble[0]
    first_model.eval()
    return first_model
|
16 |
-
|
17 |
-
|
18 |
-
@torch.no_grad()
def get_content(hmodel, wav_16k_tensor, device='cuda', layer=12):
    """Run ``hmodel.extract_features`` on a padded waveform tensor.

    Args:
        hmodel: Object exposing
            ``extract_features(source, padding_mask, output_layer)``.
        wav_16k_tensor: Waveform tensor whose last dimension is padded.
            Presumably 16 kHz audio shaped (batch, samples) -- confirm with
            callers.
        device: Device the waveform is moved to before extraction.
        layer: Forwarded to the model as ``output_layer``.

    Returns:
        The first element of the value returned by ``hmodel.extract_features``.
    """
    wav_16k_tensor = wav_16k_tensor.to(device)
    # Original note: pad so that the output length is len(audio) // 320.
    pad = (400 - 320) // 2
    feats = F.pad(wav_16k_tensor, (pad, pad))
    # All-False padding mask, allocated directly with the shape and device of
    # ``feats`` rather than built on the CPU and copied over afterwards.
    padding_mask = torch.zeros_like(feats, dtype=torch.bool)
    features = hmodel.extract_features(
        source=feats,
        padding_mask=padding_mask,
        output_layer=layer,
    )
    return features[0]
|
34 |
-
|
35 |
-
|
36 |
-
if __name__ == '__main__':
    # Smoke test: extract features for the first 100 * 320 samples of test.wav.
    audio, sr = librosa.load('test.wav', sr=16000)
    audio = audio[:100*320]
    model = get_model('../../ckpts/checkpoint_best_legacy_500.pt')
    model = model.cuda()
    # Wrap the NumPy array directly and add a leading dimension;
    # torch.tensor([ndarray]) goes through a Python list, which PyTorch warns
    # is extremely slow.
    content = get_content(model, torch.from_numpy(audio).unsqueeze(0))
    print(content)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
dreamvoice/src/feats/.ipynb_checkpoints/contentvec_hf-checkpoint.py
DELETED
@@ -1,40 +0,0 @@
|
|
1 |
-
from transformers import HubertModel
|
2 |
-
import torch.nn as nn
|
3 |
-
import torch
|
4 |
-
import torch.nn.functional as F
|
5 |
-
import librosa
|
6 |
-
|
7 |
-
|
8 |
-
class HubertModelWithFinalProj(HubertModel):
    """``HubertModel`` subclass that additionally owns a ``final_proj`` layer."""

    def __init__(self, config):
        super().__init__(config)

        # Per the original note, the final projection layer is kept only for
        # backward compatibility; see
        # https://github.com/auspicious3000/contentvec/issues/6
        # Nothing in this class calls it.
        in_features = config.hidden_size
        out_features = config.classifier_proj_size
        self.final_proj = nn.Linear(in_features, out_features)
|
16 |
-
|
17 |
-
|
18 |
-
def get_content_model(config='lengyue233/content-vec-best'):
    """Load ``HubertModelWithFinalProj`` via ``from_pretrained`` in eval mode.

    Args:
        config: Identifier or path passed straight to ``from_pretrained``.

    Returns:
        The loaded model after ``eval()`` was called on it.
    """
    content_model = HubertModelWithFinalProj.from_pretrained(config)
    content_model.eval()
    return content_model
|
22 |
-
|
23 |
-
|
24 |
-
@torch.no_grad()
def get_content(model, wav_16k_tensor, device='cuda'):
    """Return ``model``'s ``last_hidden_state`` for a padded waveform tensor.

    Args:
        model: Callable returning a mapping with a ``'last_hidden_state'`` entry.
        wav_16k_tensor: Waveform tensor whose last dimension is padded.
            Presumably 16 kHz audio shaped (batch, samples) -- confirm with
            callers.
        device: Device the waveform is moved to before the forward pass.

    Returns:
        The ``'last_hidden_state'`` entry of the model output.
    """
    # Original note: pad so that the output length is len(audio) // 320.
    margin = (400 - 320) // 2
    padded = F.pad(wav_16k_tensor.to(device), (margin, margin))
    outputs = model(padded)
    return outputs['last_hidden_state']
|
32 |
-
|
33 |
-
|
34 |
-
if __name__ == '__main__':
    # Smoke test: extract features for the first 100 * 320 samples of test.wav.
    model = get_content_model().cuda()
    audio, sr = librosa.load('test.wav', sr=16000)
    audio = audio[:100*320]
    # Wrap the NumPy array directly and add a leading dimension;
    # torch.tensor([ndarray]) goes through a Python list, which PyTorch warns
    # is extremely slow.
    audio = torch.from_numpy(audio).unsqueeze(0)
    content = get_content(model, audio, 'cuda')
    print(content)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
dreamvoice/src/feats/.ipynb_checkpoints/hubert_model-checkpoint.py
DELETED
@@ -1,24 +0,0 @@
|
|
1 |
-
import torch, torchaudio
|
2 |
-
from .hubert.hubert import HubertSoft
|
3 |
-
from torch.nn.modules.utils import consume_prefix_in_state_dict_if_present
|
4 |
-
import librosa
|
5 |
-
|
6 |
-
|
7 |
-
def get_soft_model(model_path):
    """Build a ``HubertSoft`` model and load its weights from a checkpoint.

    Args:
        model_path: Path of a checkpoint whose ``"hubert"`` entry holds the
            state dict.

    Returns:
        The ``HubertSoft`` instance with weights loaded, in eval mode.
    """
    hubert = HubertSoft()
    # Load checkpoint (either hubert_soft or hubert_discrete).
    # map_location="cpu" lets a checkpoint saved from a GPU load on a host
    # without CUDA; the freshly built model lives on the CPU anyway.
    # NOTE(review): torch.load unpickles the file -- only use trusted checkpoints.
    checkpoint = torch.load(model_path, map_location="cpu")
    state_dict = checkpoint["hubert"]
    # Strips a leading "module." from the keys, in place, when present.
    consume_prefix_in_state_dict_if_present(state_dict, "module.")
    hubert.load_state_dict(state_dict)
    hubert.eval()
    return hubert
|
16 |
-
|
17 |
-
|
18 |
-
@torch.no_grad()
def get_hubert_soft_content(hmodel, wav_16k_tensor, device='cuda'):
    """Compute ``hmodel.units`` for a waveform and return the result on the CPU.

    Args:
        hmodel: Object exposing a ``units`` method.
        wav_16k_tensor: Waveform tensor; a new dimension is inserted at index 1
            before it is passed on. Presumably 16 kHz audio shaped
            (batch, samples) -- confirm with callers.
        device: Device the waveform is moved to before the call.

    Returns:
        The output of ``hmodel.units`` moved to the CPU.
    """
    on_device = wav_16k_tensor.to(device)
    with_channel = on_device.unsqueeze(1)
    soft_units = hmodel.units(with_channel)
    return soft_units.cpu()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
dreamvoice/src/feats/.ipynb_checkpoints/test-checkpoint.py
DELETED
@@ -1,22 +0,0 @@
|
|
1 |
-
import torch, torchaudio
|
2 |
-
from hubert.hubert import HubertSoft
|
3 |
-
from torch.nn.modules.utils import consume_prefix_in_state_dict_if_present
|
4 |
-
import librosa
|
5 |
-
|
6 |
-
|
7 |
-
def get_soft_model(model_path):
    """Build a ``HubertSoft`` model and load its weights from a checkpoint.

    Args:
        model_path: Path of a checkpoint whose ``"hubert"`` entry holds the
            state dict.

    Returns:
        The ``HubertSoft`` instance with weights loaded, in eval mode.
    """
    hubert = HubertSoft()
    # Load checkpoint (either hubert_soft or hubert_discrete).
    # map_location="cpu" lets a checkpoint saved from a GPU load on a host
    # without CUDA; the freshly built model lives on the CPU anyway.
    # NOTE(review): torch.load unpickles the file -- only use trusted checkpoints.
    checkpoint = torch.load(model_path, map_location="cpu")
    state_dict = checkpoint["hubert"]
    # Strips a leading "module." from the keys, in place, when present.
    consume_prefix_in_state_dict_if_present(state_dict, "module.")
    hubert.load_state_dict(state_dict)
    hubert.eval()
    # The original returned the undefined name ``model`` (NameError).
    return hubert
|
16 |
-
|
17 |
-
|
18 |
-
@torch.no_grad()
|
19 |
-
def get_hubert_soft_content(hmodel, wav_16k_tensor, device='cuda'):
|
20 |
-
wav_16k_tensor = wav_16k_tensor.to(device)
|
21 |
-
units = hmodel.units(wav_16k_tensor)
|
22 |
-
return units.cpu()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|