Monet Joe committed
Commit b4d0177
1 Parent(s): 20e29fa

Update app.py

Files changed (1)
app.py +176 -176
app.py CHANGED
@@ -1,176 +1,176 @@
- import os
- import torch
- import random
- import shutil
- import librosa
- import warnings
- import numpy as np
- import gradio as gr
- import librosa.display
- import matplotlib.pyplot as plt
- import torchvision.transforms as transforms
- from utils import get_modelist, find_wav_files
- from collections import Counter
- from model import EvalNet
- from PIL import Image
-
-
- TRANSLATE = {
-     "m_chest": "男真声",
-     "f_chest": "女真声",
-     "m_falsetto": "男假声",
-     "f_falsetto": "女假声",
- }
- CLASSES = list(TRANSLATE.keys())
-
-
- def most_common_element(input_list):
-     # Use Counter to count how many times each element occurs
-     counter = Counter(input_list)
-     # Use most_common to get the element that occurs most often
-     most_common_element, _ = counter.most_common(1)[0]
-     return most_common_element
-
-
- def wav_to_mel(audio_path: str, width=0.07):
-     os.makedirs("./tmp", exist_ok=True)
-     try:
-         y, sr = librosa.load(audio_path, sr=48000)
-         mel_spec = librosa.feature.melspectrogram(y=y, sr=sr)
-         log_mel_spec = librosa.power_to_db(mel_spec, ref=np.max)
-         dur = librosa.get_duration(y=y, sr=sr)
-         total_frames = log_mel_spec.shape[1]
-         step = int(width * total_frames / dur)
-         count = int(total_frames / step)
-         begin = int(0.5 * (total_frames - count * step))
-         end = begin + step * count
-         for i in range(begin, end, step):
-             librosa.display.specshow(log_mel_spec[:, i : i + step])
-             plt.axis("off")
-             plt.savefig(
-                 f"./tmp/mel_{round(dur, 2)}_{i}.jpg",
-                 bbox_inches="tight",
-                 pad_inches=0.0,
-             )
-             plt.close()
-
-     except Exception as e:
-         print(f"Error converting {audio_path} : {e}")
-
-
- def wav_to_cqt(audio_path: str, width=0.07):
-     os.makedirs("./tmp", exist_ok=True)
-     try:
-         y, sr = librosa.load(audio_path, sr=48000)
-         cqt_spec = librosa.cqt(y=y, sr=sr)
-         log_cqt_spec = librosa.power_to_db(np.abs(cqt_spec) ** 2, ref=np.max)
-         dur = librosa.get_duration(y=y, sr=sr)
-         total_frames = log_cqt_spec.shape[1]
-         step = int(width * total_frames / dur)
-         count = int(total_frames / step)
-         begin = int(0.5 * (total_frames - count * step))
-         end = begin + step * count
-         for i in range(begin, end, step):
-             librosa.display.specshow(log_cqt_spec[:, i : i + step])
-             plt.axis("off")
-             plt.savefig(
-                 f"./tmp/cqt_{round(dur, 2)}_{i}.jpg",
-                 bbox_inches="tight",
-                 pad_inches=0.0,
-             )
-             plt.close()
-
-     except Exception as e:
-         print(f"Error converting {audio_path} : {e}")
-
-
- def wav_to_chroma(audio_path: str, width=0.07):
-     os.makedirs("./tmp", exist_ok=True)
-     try:
-         y, sr = librosa.load(audio_path, sr=48000)
-         chroma_spec = librosa.feature.chroma_stft(y=y, sr=sr)
-         log_chroma_spec = librosa.power_to_db(np.abs(chroma_spec) ** 2, ref=np.max)
-         dur = librosa.get_duration(y=y, sr=sr)
-         total_frames = log_chroma_spec.shape[1]
-         step = int(width * total_frames / dur)
-         count = int(total_frames / step)
-         begin = int(0.5 * (total_frames - count * step))
-         end = begin + step * count
-         for i in range(begin, end, step):
-             librosa.display.specshow(log_chroma_spec[:, i : i + step])
-             plt.axis("off")
-             plt.savefig(
-                 f"./tmp/chroma_{round(dur, 2)}_{i}.jpg",
-                 bbox_inches="tight",
-                 pad_inches=0.0,
-             )
-             plt.close()
-
-     except Exception as e:
-         print(f"Error converting {audio_path} : {e}")
-
-
- def embed_img(img_path, input_size=224):
-     transform = transforms.Compose(
-         [
-             transforms.Resize([input_size, input_size]),
-             transforms.ToTensor(),
-             transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
-         ]
-     )
-     img = Image.open(img_path).convert("RGB")
-     return transform(img).unsqueeze(0)
-
-
- def inference(wav_path, log_name: str, folder_path="./tmp"):
-     if os.path.exists(folder_path):
-         shutil.rmtree(folder_path)
-
-     if not wav_path:
-         wav_path = "./examples/m_falsetto.wav"
-
-     model = EvalNet(log_name).model
-     spec = log_name.split("_")[-1]
-     eval("wav_to_%s" % spec)(wav_path)
-     outputs = []
-     all_files = os.listdir(folder_path)
-     for file_name in all_files:
-         if file_name.lower().endswith(".jpg"):
-             file_path = os.path.join(folder_path, file_name)
-             input = embed_img(file_path)
-             output = model(input)
-             pred_id = torch.max(output.data, 1)[1]
-             outputs.append(pred_id)
-
-     max_count_item = most_common_element(outputs)
-     shutil.rmtree(folder_path)
-     return os.path.basename(wav_path), TRANSLATE[CLASSES[max_count_item]]
-
-
- if __name__ == "__main__":
-     warnings.filterwarnings("ignore")
-
-     models = get_modelist()
-     examples = []
-     example_wavs = find_wav_files()
-     model_num = len(models)
-     for wav in example_wavs:
-         examples.append([wav, models[random.randint(0, model_num - 1)]])
-
-     with gr.Blocks() as demo:
-         gr.Interface(
-             fn=inference,
-             inputs=[
-                 gr.Audio(label="上传录音", type="filepath"),
-                 gr.Dropdown(choices=models, label="选择模型", value=models[0]),
-             ],
-             outputs=[
-                 gr.Textbox(label="音频文件名", show_copy_button=True),
-                 gr.Textbox(label="唱法识别", show_copy_button=True),
-             ],
-             examples=examples,
-             allow_flagging="never",
-             title="建议录音时长保持在 5s 左右, 过长会影响识别效率",
-         )
-
-     demo.launch()
 
+ import os
+ import torch
+ import random
+ import shutil
+ import librosa
+ import warnings
+ import numpy as np
+ import gradio as gr
+ import librosa.display
+ import matplotlib.pyplot as plt
+ import torchvision.transforms as transforms
+ from utils import get_modelist, find_wav_files
+ from collections import Counter
+ from model import EvalNet
+ from PIL import Image
+
+
+ TRANSLATE = {
+     "m_chest": "Chest voice, male",
+     "f_chest": "Chest voice, female",
+     "m_falsetto": "Falsetto voice, male",
+     "f_falsetto": "Falsetto voice, female",
+ }
+ CLASSES = list(TRANSLATE.keys())
+
+
+ def most_common_element(input_list):
+     # Use Counter to count how many times each element occurs
+     counter = Counter(input_list)
+     # Use most_common to get the element that occurs most often
+     most_common_element, _ = counter.most_common(1)[0]
+     return most_common_element
+
+
+ def wav_to_mel(audio_path: str, width=0.07):
+     os.makedirs("./tmp", exist_ok=True)
+     try:
+         y, sr = librosa.load(audio_path, sr=48000)
+         mel_spec = librosa.feature.melspectrogram(y=y, sr=sr)
+         log_mel_spec = librosa.power_to_db(mel_spec, ref=np.max)
+         dur = librosa.get_duration(y=y, sr=sr)
+         total_frames = log_mel_spec.shape[1]
+         step = int(width * total_frames / dur)
+         count = int(total_frames / step)
+         begin = int(0.5 * (total_frames - count * step))
+         end = begin + step * count
+         for i in range(begin, end, step):
+             librosa.display.specshow(log_mel_spec[:, i : i + step])
+             plt.axis("off")
+             plt.savefig(
+                 f"./tmp/mel_{round(dur, 2)}_{i}.jpg",
+                 bbox_inches="tight",
+                 pad_inches=0.0,
+             )
+             plt.close()
+
+     except Exception as e:
+         print(f"Error converting {audio_path} : {e}")
+
+
+ def wav_to_cqt(audio_path: str, width=0.07):
+     os.makedirs("./tmp", exist_ok=True)
+     try:
+         y, sr = librosa.load(audio_path, sr=48000)
+         cqt_spec = librosa.cqt(y=y, sr=sr)
+         log_cqt_spec = librosa.power_to_db(np.abs(cqt_spec) ** 2, ref=np.max)
+         dur = librosa.get_duration(y=y, sr=sr)
+         total_frames = log_cqt_spec.shape[1]
+         step = int(width * total_frames / dur)
+         count = int(total_frames / step)
+         begin = int(0.5 * (total_frames - count * step))
+         end = begin + step * count
+         for i in range(begin, end, step):
+             librosa.display.specshow(log_cqt_spec[:, i : i + step])
+             plt.axis("off")
+             plt.savefig(
+                 f"./tmp/cqt_{round(dur, 2)}_{i}.jpg",
+                 bbox_inches="tight",
+                 pad_inches=0.0,
+             )
+             plt.close()
+
+     except Exception as e:
+         print(f"Error converting {audio_path} : {e}")
+
+
+ def wav_to_chroma(audio_path: str, width=0.07):
+     os.makedirs("./tmp", exist_ok=True)
+     try:
+         y, sr = librosa.load(audio_path, sr=48000)
+         chroma_spec = librosa.feature.chroma_stft(y=y, sr=sr)
+         log_chroma_spec = librosa.power_to_db(np.abs(chroma_spec) ** 2, ref=np.max)
+         dur = librosa.get_duration(y=y, sr=sr)
+         total_frames = log_chroma_spec.shape[1]
+         step = int(width * total_frames / dur)
+         count = int(total_frames / step)
+         begin = int(0.5 * (total_frames - count * step))
+         end = begin + step * count
+         for i in range(begin, end, step):
+             librosa.display.specshow(log_chroma_spec[:, i : i + step])
+             plt.axis("off")
+             plt.savefig(
+                 f"./tmp/chroma_{round(dur, 2)}_{i}.jpg",
+                 bbox_inches="tight",
+                 pad_inches=0.0,
+             )
+             plt.close()
+
+     except Exception as e:
+         print(f"Error converting {audio_path} : {e}")
+
+
+ def embed_img(img_path, input_size=224):
+     transform = transforms.Compose(
+         [
+             transforms.Resize([input_size, input_size]),
+             transforms.ToTensor(),
+             transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
+         ]
+     )
+     img = Image.open(img_path).convert("RGB")
+     return transform(img).unsqueeze(0)
+
+
+ def inference(wav_path, log_name: str, folder_path="./tmp"):
+     if os.path.exists(folder_path):
+         shutil.rmtree(folder_path)
+
+     if not wav_path:
+         wav_path = "./examples/m_falsetto.wav"
+
+     model = EvalNet(log_name).model
+     spec = log_name.split("_")[-1]
+     eval("wav_to_%s" % spec)(wav_path)
+     outputs = []
+     all_files = os.listdir(folder_path)
+     for file_name in all_files:
+         if file_name.lower().endswith(".jpg"):
+             file_path = os.path.join(folder_path, file_name)
+             input = embed_img(file_path)
+             output = model(input)
+             pred_id = torch.max(output.data, 1)[1]
+             outputs.append(pred_id)
+
+     max_count_item = most_common_element(outputs)
+     shutil.rmtree(folder_path)
+     return os.path.basename(wav_path), TRANSLATE[CLASSES[max_count_item]]
+
+
+ if __name__ == "__main__":
+     warnings.filterwarnings("ignore")
+
+     models = get_modelist()
+     examples = []
+     example_wavs = find_wav_files()
+     model_num = len(models)
+     for wav in example_wavs:
+         examples.append([wav, models[random.randint(0, model_num - 1)]])
+
+     with gr.Blocks() as demo:
+         gr.Interface(
+             fn=inference,
+             inputs=[
+                 gr.Audio(label="Upload a recording", type="filepath"),
+                 gr.Dropdown(choices=models, label="Select a model", value=models[0]),
+             ],
+             outputs=[
+                 gr.Textbox(label="Audio filename", show_copy_button=True),
+                 gr.Textbox(label="Singing method recognition", show_copy_button=True),
+             ],
+             examples=examples,
+             allow_flagging="never",
+             title="It is recommended to keep the recording length around 5s; recordings that are too long will affect recognition efficiency.",
+         )
+
+     demo.launch()
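
For context (not part of this commit): a minimal, self-contained sketch of how the per-slice predictions collected in inference() are reduced to one of the new English labels through most_common_element and the TRANSLATE/CLASSES mapping. The class ids below are made-up example values rather than real model output.

from collections import Counter

TRANSLATE = {
    "m_chest": "Chest voice, male",
    "f_chest": "Chest voice, female",
    "m_falsetto": "Falsetto voice, male",
    "f_falsetto": "Falsetto voice, female",
}
CLASSES = list(TRANSLATE.keys())

def most_common_element(input_list):
    # Majority vote: return the value that occurs most often in the list.
    counter = Counter(input_list)
    element, _ = counter.most_common(1)[0]
    return element

# Hypothetical class ids predicted for five spectrogram slices of one recording.
slice_predictions = [2, 2, 3, 2, 0]
winner = most_common_element(slice_predictions)
print(TRANSLATE[CLASSES[winner]])  # prints "Falsetto voice, male"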