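"""Gradio demo for singing-method recognition.

Summary inferred from the code in this file (not from separate documentation):
an uploaded recording is converted into short mel / CQT / chroma spectrogram
image slices, each slice is classified by an EvalNet model, and a majority
vote over the slices yields the final label (chest or falsetto voice,
male or female).
"""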
import os
import random
import shutil
import warnings
from collections import Counter

import gradio as gr
import librosa
import librosa.display
import matplotlib.pyplot as plt
import numpy as np
import torch
import torchvision.transforms as transforms
from PIL import Image

from model import EvalNet
from utils import get_modelist, find_wav_files

TRANSLATE = {
    "m_chest": "Chest voice, male",
    "f_chest": "Chest voice, female",
    "m_falsetto": "Falsetto voice, male",
    "f_falsetto": "Falsetto voice, female",
}
CLASSES = list(TRANSLATE.keys())


def most_common_element(input_list):
    # Count how many times each element occurs
    counter = Counter(input_list)
    # most_common(1) returns the (element, count) pair with the highest count
    element, _ = counter.most_common(1)[0]
    return element
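
# e.g. most_common_element([1, 2, 2, 3]) -> 2 (ties resolve to the element seen first)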


def wav_to_mel(audio_path: str, width=0.07):
    """Slice the log-mel spectrogram of a WAV file into `width`-second images in ./tmp."""
    os.makedirs("./tmp", exist_ok=True)
    try:
        y, sr = librosa.load(audio_path, sr=48000)
        mel_spec = librosa.feature.melspectrogram(y=y, sr=sr)
        log_mel_spec = librosa.power_to_db(mel_spec, ref=np.max)
        dur = librosa.get_duration(y=y, sr=sr)
        total_frames = log_mel_spec.shape[1]
        # Number of spectrogram frames that cover `width` seconds
        step = int(width * total_frames / dur)
        count = int(total_frames / step)
        # Center the slices so leftover frames are split between both ends
        begin = int(0.5 * (total_frames - count * step))
        end = begin + step * count
        for i in range(begin, end, step):
            librosa.display.specshow(log_mel_spec[:, i : i + step])
            plt.axis("off")
            plt.savefig(
                f"./tmp/mel_{round(dur, 2)}_{i}.jpg",
                bbox_inches="tight",
                pad_inches=0.0,
            )
            plt.close()
    except Exception as e:
        print(f"Error converting {audio_path} : {e}")


def wav_to_cqt(audio_path: str, width=0.07):
    """Slice the log-power CQT spectrogram of a WAV file into `width`-second images in ./tmp."""
    os.makedirs("./tmp", exist_ok=True)
    try:
        y, sr = librosa.load(audio_path, sr=48000)
        cqt_spec = librosa.cqt(y=y, sr=sr)
        log_cqt_spec = librosa.power_to_db(np.abs(cqt_spec) ** 2, ref=np.max)
        dur = librosa.get_duration(y=y, sr=sr)
        total_frames = log_cqt_spec.shape[1]
        step = int(width * total_frames / dur)
        count = int(total_frames / step)
        begin = int(0.5 * (total_frames - count * step))
        end = begin + step * count
        for i in range(begin, end, step):
            librosa.display.specshow(log_cqt_spec[:, i : i + step])
            plt.axis("off")
            plt.savefig(
                f"./tmp/cqt_{round(dur, 2)}_{i}.jpg",
                bbox_inches="tight",
                pad_inches=0.0,
            )
            plt.close()
    except Exception as e:
        print(f"Error converting {audio_path} : {e}")


def wav_to_chroma(audio_path: str, width=0.07):
    """Slice the chromagram of a WAV file into `width`-second images in ./tmp."""
    os.makedirs("./tmp", exist_ok=True)
    try:
        y, sr = librosa.load(audio_path, sr=48000)
        chroma_spec = librosa.feature.chroma_stft(y=y, sr=sr)
        log_chroma_spec = librosa.power_to_db(np.abs(chroma_spec) ** 2, ref=np.max)
        dur = librosa.get_duration(y=y, sr=sr)
        total_frames = log_chroma_spec.shape[1]
        step = int(width * total_frames / dur)
        count = int(total_frames / step)
        begin = int(0.5 * (total_frames - count * step))
        end = begin + step * count
        for i in range(begin, end, step):
            librosa.display.specshow(log_chroma_spec[:, i : i + step])
            plt.axis("off")
            plt.savefig(
                f"./tmp/chroma_{round(dur, 2)}_{i}.jpg",
                bbox_inches="tight",
                pad_inches=0.0,
            )
            plt.close()
    except Exception as e:
        print(f"Error converting {audio_path} : {e}")


def embed_img(img_path, input_size=224):
    """Load an image, resize it to `input_size`, and return a normalized (1, 3, H, W) tensor."""
    transform = transforms.Compose(
        [
            transforms.Resize([input_size, input_size]),
            transforms.ToTensor(),
            transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
        ]
    )
    img = Image.open(img_path).convert("RGB")
    return transform(img).unsqueeze(0)
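
# Example (hypothetical slice path produced by one of the wav_to_* helpers above):
# embed_img("./tmp/mel_5.0_0.jpg") -> torch.Tensor of shape (1, 3, 224, 224)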


def inference(wav_path, log_name: str, folder_path="./tmp"):
    # Start from a clean slice folder
    if os.path.exists(folder_path):
        shutil.rmtree(folder_path)

    # Fall back to a bundled example when no recording is provided
    if not wav_path:
        wav_path = "./examples/m_falsetto.wav"

    model = EvalNet(log_name).model
    # The model name ends with the spectrogram type, e.g. "..._mel" -> wav_to_mel
    spec = log_name.split("_")[-1]
    eval("wav_to_%s" % spec)(wav_path)

    # Classify every generated slice, then majority-vote the predictions
    outputs = []
    all_files = os.listdir(folder_path)
    for file_name in all_files:
        if file_name.lower().endswith(".jpg"):
            file_path = os.path.join(folder_path, file_name)
            input_tensor = embed_img(file_path)
            output = model(input_tensor)
            pred_id = torch.max(output.data, 1)[1]
            outputs.append(int(pred_id))

    max_count_item = most_common_element(outputs)
    shutil.rmtree(folder_path)
    return os.path.basename(wav_path), TRANSLATE[CLASSES[max_count_item]]


if __name__ == "__main__":
    warnings.filterwarnings("ignore")
    models = get_modelist()
    examples = []
    example_wavs = find_wav_files()
    model_num = len(models)
    # Pair each example recording with a randomly chosen model
    for wav in example_wavs:
        examples.append([wav, models[random.randint(0, model_num - 1)]])

    with gr.Blocks() as demo:
        gr.Interface(
            fn=inference,
            inputs=[
                gr.Audio(label="Upload a recording", type="filepath"),
                gr.Dropdown(choices=models, label="Select a model", value=models[0]),
            ],
            outputs=[
                gr.Textbox(label="Audio filename", show_copy_button=True),
                gr.Textbox(label="Singing method recognition", show_copy_button=True),
            ],
            examples=examples,
            allow_flagging="never",
            title="It is recommended to keep the recording to about 5 s; overly long recordings slow down recognition.",
        )

    demo.launch()