Monet Joe committed
Commit e8a66fd
Parent: 98945f7

Update app.py

Files changed (1)
app.py +183 -183
app.py CHANGED
@@ -1,183 +1,183 @@
-import os
-import torch
-import random
-import shutil
-import librosa
-import warnings
-import numpy as np
-import gradio as gr
-import librosa.display
-import matplotlib.pyplot as plt
-import torchvision.transforms as transforms
-from utils import get_modelist, find_wav_files
-from collections import Counter
-from model import EvalNet
-from PIL import Image
-
-
-TRANSLATE = {
-    "m_bel": "男声美声唱法",
-    "f_bel": "女声美声唱法",
-    "m_folk": "男声民族唱法",
-    "f_folk": "女声民族唱法",
-}
-
-CLASSES = list(TRANSLATE.keys())
-
-
-def most_common_element(input_list):
-    # Use Counter to count the occurrences of each element
-    counter = Counter(input_list)
-    # Use most_common to get the most frequent element
-    most_common_element, _ = counter.most_common(1)[0]
-    return most_common_element
-
-
-def wav_to_mel(audio_path: str, width=1.6, topdb=40):
-    os.makedirs("./tmp", exist_ok=True)
-    try:
-        y, sr = librosa.load(audio_path, sr=48000)
-        non_silents = librosa.effects.split(y, top_db=topdb)
-        non_silent = np.concatenate([y[start:end] for start, end in non_silents])
-        mel_spec = librosa.feature.melspectrogram(y=non_silent, sr=sr)
-        log_mel_spec = librosa.power_to_db(mel_spec, ref=np.max)
-        dur = librosa.get_duration(y=non_silent, sr=sr)
-        total_frames = log_mel_spec.shape[1]
-        step = int(width * total_frames / dur)
-        count = int(total_frames / step)
-        begin = int(0.5 * (total_frames - count * step))
-        end = begin + step * count
-        for i in range(begin, end, step):
-            librosa.display.specshow(log_mel_spec[:, i : i + step])
-            plt.axis("off")
-            plt.savefig(
-                f"./tmp/mel_{round(dur, 2)}_{i}.jpg",
-                bbox_inches="tight",
-                pad_inches=0.0,
-            )
-            plt.close()
-
-    except Exception as e:
-        print(f"Error converting {audio_path} : {e}")
-
-
-def wav_to_cqt(audio_path: str, width=1.6, topdb=40):
-    os.makedirs("./tmp", exist_ok=True)
-    try:
-        y, sr = librosa.load(audio_path, sr=48000)
-        non_silents = librosa.effects.split(y, top_db=topdb)
-        non_silent = np.concatenate([y[start:end] for start, end in non_silents])
-        cqt_spec = librosa.cqt(y=non_silent, sr=sr)
-        log_cqt_spec = librosa.power_to_db(np.abs(cqt_spec) ** 2, ref=np.max)
-        dur = librosa.get_duration(y=non_silent, sr=sr)
-        total_frames = log_cqt_spec.shape[1]
-        step = int(width * total_frames / dur)
-        count = int(total_frames / step)
-        begin = int(0.5 * (total_frames - count * step))
-        end = begin + step * count
-        for i in range(begin, end, step):
-            librosa.display.specshow(log_cqt_spec[:, i : i + step])
-            plt.axis("off")
-            plt.savefig(
-                f"./tmp/cqt_{round(dur, 2)}_{i}.jpg",
-                bbox_inches="tight",
-                pad_inches=0.0,
-            )
-            plt.close()
-
-    except Exception as e:
-        print(f"Error converting {audio_path} : {e}")
-
-
-def wav_to_chroma(audio_path: str, width=1.6, topdb=40):
-    os.makedirs("./tmp", exist_ok=True)
-    try:
-        y, sr = librosa.load(audio_path, sr=48000)
-        non_silents = librosa.effects.split(y, top_db=topdb)
-        non_silent = np.concatenate([y[start:end] for start, end in non_silents])
-        chroma_spec = librosa.feature.chroma_stft(y=non_silent, sr=sr)
-        log_chroma_spec = librosa.power_to_db(np.abs(chroma_spec) ** 2, ref=np.max)
-        dur = librosa.get_duration(y=non_silent, sr=sr)
-        total_frames = log_chroma_spec.shape[1]
-        step = int(width * total_frames / dur)
-        count = int(total_frames / step)
-        begin = int(0.5 * (total_frames - count * step))
-        end = begin + step * count
-        for i in range(begin, end, step):
-            librosa.display.specshow(log_chroma_spec[:, i : i + step])
-            plt.axis("off")
-            plt.savefig(
-                f"./tmp/chroma_{round(dur, 2)}_{i}.jpg",
-                bbox_inches="tight",
-                pad_inches=0.0,
-            )
-            plt.close()
-
-    except Exception as e:
-        print(f"Error converting {audio_path} : {e}")
-
-
-def embed_img(img_path, input_size=224):
-    transform = transforms.Compose(
-        [
-            transforms.Resize([input_size, input_size]),
-            transforms.ToTensor(),
-            transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
-        ]
-    )
-    img = Image.open(img_path).convert("RGB")
-    return transform(img).unsqueeze(0)
-
-
-def inference(wav_path: str, log_name: str, folder_path="./tmp"):
-    if os.path.exists(folder_path):
-        shutil.rmtree(folder_path)
-
-    if not wav_path:
-        wav_path = "./examples/f_bel.wav"
-
-    model = EvalNet(log_name).model
-    spec = log_name.split("_")[-1]
-    eval("wav_to_%s" % spec)(wav_path)
-    outputs = []
-    all_files = os.listdir(folder_path)
-    for file_name in all_files:
-        if file_name.lower().endswith(".jpg"):
-            file_path = os.path.join(folder_path, file_name)
-            input = embed_img(file_path)
-            output = model(input)
-            pred_id = torch.max(output.data, 1)[1]
-            outputs.append(pred_id)
-
-    max_count_item = most_common_element(outputs)
-    shutil.rmtree(folder_path)
-    return os.path.basename(wav_path), TRANSLATE[CLASSES[max_count_item]]
-
-
-if __name__ == "__main__":
-    warnings.filterwarnings("ignore")
-
-    models = get_modelist()
-    examples = []
-    example_wavs = find_wav_files()
-    model_num = len(models)
-    for wav in example_wavs:
-        examples.append([wav, models[random.randint(0, model_num - 1)]])
-
-    with gr.Blocks() as demo:
-        gr.Interface(
-            fn=inference,
-            inputs=[
-                gr.Audio(label="上传录音", type="filepath"),
-                gr.Dropdown(choices=models, label="选择模型", value=models[0]),
-            ],
-            outputs=[
-                gr.Textbox(label="音频文件名", show_copy_button=True),
-                gr.Textbox(label="唱法识别", show_copy_button=True),
-            ],
-            examples=examples,
-            allow_flagging="never",
-            title="建议录音时长保持在 5s 左右, 过长会影响识别效率",
-        )
-
-    demo.launch()
 
+import os
+import torch
+import random
+import shutil
+import librosa
+import warnings
+import numpy as np
+import gradio as gr
+import librosa.display
+import matplotlib.pyplot as plt
+import torchvision.transforms as transforms
+from utils import get_modelist, find_wav_files
+from collections import Counter
+from model import EvalNet
+from PIL import Image
+
+
+TRANSLATE = {
+    "m_bel": "Bel Canto Male",
+    "f_bel": "Bel Canto Female",
+    "m_folk": "Folk Singing Male",
+    "f_folk": "Folk Singing Female",
+}
+
+CLASSES = list(TRANSLATE.keys())
+
+
+def most_common_element(input_list):
+    # Use Counter to count the occurrences of each element
+    counter = Counter(input_list)
+    # Use most_common to get the most frequent element
+    most_common_element, _ = counter.most_common(1)[0]
+    return most_common_element
+
+
+def wav_to_mel(audio_path: str, width=1.6, topdb=40):
+    os.makedirs("./tmp", exist_ok=True)
+    try:
+        y, sr = librosa.load(audio_path, sr=48000)
+        non_silents = librosa.effects.split(y, top_db=topdb)
+        non_silent = np.concatenate([y[start:end] for start, end in non_silents])
+        mel_spec = librosa.feature.melspectrogram(y=non_silent, sr=sr)
+        log_mel_spec = librosa.power_to_db(mel_spec, ref=np.max)
+        dur = librosa.get_duration(y=non_silent, sr=sr)
+        total_frames = log_mel_spec.shape[1]
+        step = int(width * total_frames / dur)
+        count = int(total_frames / step)
+        begin = int(0.5 * (total_frames - count * step))
+        end = begin + step * count
+        for i in range(begin, end, step):
+            librosa.display.specshow(log_mel_spec[:, i : i + step])
+            plt.axis("off")
+            plt.savefig(
+                f"./tmp/mel_{round(dur, 2)}_{i}.jpg",
+                bbox_inches="tight",
+                pad_inches=0.0,
+            )
+            plt.close()
+
+    except Exception as e:
+        print(f"Error converting {audio_path} : {e}")
+
+
+def wav_to_cqt(audio_path: str, width=1.6, topdb=40):
+    os.makedirs("./tmp", exist_ok=True)
+    try:
+        y, sr = librosa.load(audio_path, sr=48000)
+        non_silents = librosa.effects.split(y, top_db=topdb)
+        non_silent = np.concatenate([y[start:end] for start, end in non_silents])
+        cqt_spec = librosa.cqt(y=non_silent, sr=sr)
+        log_cqt_spec = librosa.power_to_db(np.abs(cqt_spec) ** 2, ref=np.max)
+        dur = librosa.get_duration(y=non_silent, sr=sr)
+        total_frames = log_cqt_spec.shape[1]
+        step = int(width * total_frames / dur)
+        count = int(total_frames / step)
+        begin = int(0.5 * (total_frames - count * step))
+        end = begin + step * count
+        for i in range(begin, end, step):
+            librosa.display.specshow(log_cqt_spec[:, i : i + step])
+            plt.axis("off")
+            plt.savefig(
+                f"./tmp/cqt_{round(dur, 2)}_{i}.jpg",
+                bbox_inches="tight",
+                pad_inches=0.0,
+            )
+            plt.close()
+
+    except Exception as e:
+        print(f"Error converting {audio_path} : {e}")
+
+
+def wav_to_chroma(audio_path: str, width=1.6, topdb=40):
+    os.makedirs("./tmp", exist_ok=True)
+    try:
+        y, sr = librosa.load(audio_path, sr=48000)
+        non_silents = librosa.effects.split(y, top_db=topdb)
+        non_silent = np.concatenate([y[start:end] for start, end in non_silents])
+        chroma_spec = librosa.feature.chroma_stft(y=non_silent, sr=sr)
+        log_chroma_spec = librosa.power_to_db(np.abs(chroma_spec) ** 2, ref=np.max)
+        dur = librosa.get_duration(y=non_silent, sr=sr)
+        total_frames = log_chroma_spec.shape[1]
+        step = int(width * total_frames / dur)
+        count = int(total_frames / step)
+        begin = int(0.5 * (total_frames - count * step))
+        end = begin + step * count
+        for i in range(begin, end, step):
+            librosa.display.specshow(log_chroma_spec[:, i : i + step])
+            plt.axis("off")
+            plt.savefig(
+                f"./tmp/chroma_{round(dur, 2)}_{i}.jpg",
+                bbox_inches="tight",
+                pad_inches=0.0,
+            )
+            plt.close()
+
+    except Exception as e:
+        print(f"Error converting {audio_path} : {e}")
+
+
+def embed_img(img_path, input_size=224):
+    transform = transforms.Compose(
+        [
+            transforms.Resize([input_size, input_size]),
+            transforms.ToTensor(),
+            transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
+        ]
+    )
+    img = Image.open(img_path).convert("RGB")
+    return transform(img).unsqueeze(0)
+
+
+def inference(wav_path: str, log_name: str, folder_path="./tmp"):
+    if os.path.exists(folder_path):
+        shutil.rmtree(folder_path)
+
+    if not wav_path:
+        wav_path = "./examples/f_bel.wav"
+
+    model = EvalNet(log_name).model
+    spec = log_name.split("_")[-1]
+    eval("wav_to_%s" % spec)(wav_path)
+    outputs = []
+    all_files = os.listdir(folder_path)
+    for file_name in all_files:
+        if file_name.lower().endswith(".jpg"):
+            file_path = os.path.join(folder_path, file_name)
+            input = embed_img(file_path)
+            output = model(input)
+            pred_id = torch.max(output.data, 1)[1]
+            outputs.append(pred_id)
+
+    max_count_item = most_common_element(outputs)
+    shutil.rmtree(folder_path)
+    return os.path.basename(wav_path), TRANSLATE[CLASSES[max_count_item]]
+
+
+if __name__ == "__main__":
+    warnings.filterwarnings("ignore")
+
+    models = get_modelist()
+    examples = []
+    example_wavs = find_wav_files()
+    model_num = len(models)
+    for wav in example_wavs:
+        examples.append([wav, models[random.randint(0, model_num - 1)]])
+
+    with gr.Blocks() as demo:
+        gr.Interface(
+            fn=inference,
+            inputs=[
+                gr.Audio(label="Upload a recording", type="filepath"),
+                gr.Dropdown(choices=models, label="Select a model", value=models[0]),
+            ],
+            outputs=[
+                gr.Textbox(label="Audio filename", show_copy_button=True),
+                gr.Textbox(label="Singing method recognition", show_copy_button=True),
+            ],
+            examples=examples,
+            allow_flagging="never",
+            title="It is recommended to keep the recording length around 5s, too long will affect the recognition efficiency.",
+        )
+
+    demo.launch()