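"""Gradio demo: classify a singing recording as male/female bel canto or folk singing.

The audio is trimmed of silence, sliced into fixed-width Mel/CQT/chroma spectrogram
images, and each image is classified by an EvalNet model loaded by log name; the
majority prediction across slices is returned. (Summary of the code below;
`create_dir` and `find_wav_files` are expected to come from the `utils` module
imported via the wildcard import.)
"""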
import os
import shutil
import warnings
from collections import Counter

import gradio as gr
import librosa
import librosa.display
import matplotlib.pyplot as plt
import numpy as np
import torch
import torchvision.transforms as transforms
from PIL import Image

from model import EvalNet
from utils import *

warnings.filterwarnings("ignore")
# Label order used to map model output indices back to class names
classes = ['m_bel', 'f_bel', 'm_folk', 'f_folk']
def most_common_element(input_list):
    # Count occurrences of each element and return the most frequent one
    counter = Counter(input_list)
    most_common, _ = counter.most_common(1)[0]
    return most_common
def wav_to_mel(audio_path: str, width=1.6, topdb=40):
    """Slice the audio into `width`-second Mel-spectrogram images and save them to ./tmp."""
    create_dir('./tmp')
    try:
        # Load at 48 kHz and drop silent regions
        y, sr = librosa.load(audio_path, sr=48000)
        non_silents = librosa.effects.split(y, top_db=topdb)
        non_silent = np.concatenate(
            [y[start:end] for start, end in non_silents]
        )
        # Log-scaled Mel spectrogram of the non-silent signal
        mel_spec = librosa.feature.melspectrogram(y=non_silent, sr=sr)
        log_mel_spec = librosa.power_to_db(mel_spec, ref=np.max)
        dur = librosa.get_duration(y=non_silent, sr=sr)
        total_frames = log_mel_spec.shape[1]
        # Frames per `width`-second slice, with slices centered in the spectrogram
        step = int(width * total_frames / dur)
        count = int(total_frames / step)
        begin = int(0.5 * (total_frames - count * step))
        end = begin + step * count
        for i in range(begin, end, step):
            librosa.display.specshow(log_mel_spec[:, i:i + step])
            plt.axis('off')
            plt.savefig(
                f'./tmp/mel_{round(dur, 2)}_{i}.jpg',
                bbox_inches='tight',
                pad_inches=0.0
            )
            plt.close()
    except Exception as e:
        print(f'Error converting {audio_path}: {e}')
def wav_to_cqt(audio_path: str, width=1.6, topdb=40):
    """Slice the audio into `width`-second CQT-spectrogram images and save them to ./tmp."""
    create_dir('./tmp')
    try:
        y, sr = librosa.load(audio_path, sr=48000)
        non_silents = librosa.effects.split(y, top_db=topdb)
        non_silent = np.concatenate(
            [y[start:end] for start, end in non_silents]
        )
        # Log-scaled power CQT spectrogram of the non-silent signal
        cqt_spec = librosa.cqt(y=non_silent, sr=sr)
        log_cqt_spec = librosa.power_to_db(np.abs(cqt_spec)**2, ref=np.max)
        dur = librosa.get_duration(y=non_silent, sr=sr)
        total_frames = log_cqt_spec.shape[1]
        step = int(width * total_frames / dur)
        count = int(total_frames / step)
        begin = int(0.5 * (total_frames - count * step))
        end = begin + step * count
        for i in range(begin, end, step):
            librosa.display.specshow(log_cqt_spec[:, i:i + step])
            plt.axis('off')
            plt.savefig(
                f'./tmp/cqt_{round(dur, 2)}_{i}.jpg',
                bbox_inches='tight',
                pad_inches=0.0
            )
            plt.close()
    except Exception as e:
        print(f'Error converting {audio_path}: {e}')
def wav_to_chroma(audio_path: str, width=1.6, topdb=40):
    """Slice the audio into `width`-second chromagram images and save them to ./tmp."""
    create_dir('./tmp')
    try:
        y, sr = librosa.load(audio_path, sr=48000)
        non_silents = librosa.effects.split(y, top_db=topdb)
        non_silent = np.concatenate(
            [y[start:end] for start, end in non_silents]
        )
        # Log-scaled chroma features of the non-silent signal
        chroma_spec = librosa.feature.chroma_stft(y=non_silent, sr=sr)
        log_chroma_spec = librosa.power_to_db(
            np.abs(chroma_spec)**2,
            ref=np.max
        )
        dur = librosa.get_duration(y=non_silent, sr=sr)
        total_frames = log_chroma_spec.shape[1]
        step = int(width * total_frames / dur)
        count = int(total_frames / step)
        begin = int(0.5 * (total_frames - count * step))
        end = begin + step * count
        for i in range(begin, end, step):
            librosa.display.specshow(log_chroma_spec[:, i:i + step])
            plt.axis('off')
            plt.savefig(
                f'./tmp/chroma_{round(dur, 2)}_{i}.jpg',
                bbox_inches='tight',
                pad_inches=0.0
            )
            plt.close()
    except Exception as e:
        print(f'Error converting {audio_path}: {e}')
def embed_img(img_path, input_size=224):
    """Load a spectrogram image and convert it to a normalized 1x3xHxW tensor."""
    transform = transforms.Compose([
        transforms.Resize([input_size, input_size]),
        transforms.ToTensor(),
        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
    ])
    img = Image.open(img_path).convert("RGB")
    return transform(img).unsqueeze(0)
def inference(wav_path, log_name, folder_path='./tmp'):
    if os.path.exists(folder_path):
        shutil.rmtree(folder_path)

    # Fall back to a bundled example when no audio is uploaded
    if not wav_path:
        wav_path = './examples/f_bel.wav'

    model = EvalNet(log_name).model
    # The spectrogram type (mel / cqt / chroma) is encoded in the model log name
    spec = log_name.split('_')[-3]
    eval(f'wav_to_{spec}')(wav_path)

    # Classify every slice and keep the majority vote
    outputs = []
    for file_name in os.listdir(folder_path):
        if file_name.lower().endswith('.jpg'):
            file_path = os.path.join(folder_path, file_name)
            image = embed_img(file_path)
            output = model(image)
            pred_id = torch.max(output.data, 1)[1].item()
            outputs.append(pred_id)

    max_count_item = most_common_element(outputs)
    shutil.rmtree(folder_path)
    return translate[classes[max_count_item]]
# Available checkpoints; the spectrogram type is the third-from-last '_'-separated field
models = [
    'vit_b_16_mel_2024-01-07_05-16-24',
    'swin_b_chroma_2024-01-07_14-01-10'
]

translate = {
    'm_bel': 'male bel canto',
    'm_folk': 'male folk singing',
    'f_bel': 'female bel canto',
    'f_folk': 'female folk singing'
}
# Pair each bundled example wav with the default model
examples = [[wav, models[0]] for wav in find_wav_files()]
iface = gr.Interface(
    fn=inference,
    inputs=[
        gr.Audio(label='Upload audio', type='filepath'),
        gr.Dropdown(
            choices=models,
            label='Select model',
            value=models[0]
        )
    ],
    outputs=gr.Textbox(label='Singing method'),
    examples=examples
)

iface.launch()
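# Usage note (assumption: this file is the Space's app.py and the `model`/`utils`
# modules plus ./examples/*.wav ship alongside it): run `python app.py` and open
# the local URL that Gradio prints.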