Monan Zhou committed
Commit e7b9543
1 Parent(s): b9c341a

Update app.py

Files changed (1)
  1. app.py +182 -182
app.py CHANGED
@@ -1,182 +1,182 @@
- import os
- import torch
- import random
- import shutil
- import librosa
- import warnings
- import numpy as np
- import gradio as gr
- import librosa.display
- import matplotlib.pyplot as plt
- import torchvision.transforms as transforms
- from utils import get_modelist, find_wav_files
- from collections import Counter
- from model import EvalNet
- from PIL import Image
-
-
- CLASSES = ["m_bel", "f_bel", "m_folk", "f_folk"]
-
-
- def most_common_element(input_list):
-     # Use Counter to count the occurrences of each element
-     counter = Counter(input_list)
-     # Use the most_common method to get the most frequent element
-     most_common_element, _ = counter.most_common(1)[0]
-     return most_common_element
-
-
- def wav_to_mel(audio_path: str, width=1.6, topdb=40):
-     os.makedirs("./tmp", exist_ok=True)
-     try:
-         y, sr = librosa.load(audio_path, sr=48000)
-         non_silents = librosa.effects.split(y, top_db=topdb)
-         non_silent = np.concatenate([y[start:end] for start, end in non_silents])
-         mel_spec = librosa.feature.melspectrogram(y=non_silent, sr=sr)
-         log_mel_spec = librosa.power_to_db(mel_spec, ref=np.max)
-         dur = librosa.get_duration(y=non_silent, sr=sr)
-         total_frames = log_mel_spec.shape[1]
-         step = int(width * total_frames / dur)
-         count = int(total_frames / step)
-         begin = int(0.5 * (total_frames - count * step))
-         end = begin + step * count
-         for i in range(begin, end, step):
-             librosa.display.specshow(log_mel_spec[:, i : i + step])
-             plt.axis("off")
-             plt.savefig(
-                 f"./tmp/mel_{round(dur, 2)}_{i}.jpg",
-                 bbox_inches="tight",
-                 pad_inches=0.0,
-             )
-             plt.close()
-
-     except Exception as e:
-         print(f"Error converting {audio_path} : {e}")
-
-
- def wav_to_cqt(audio_path: str, width=1.6, topdb=40):
-     os.makedirs("./tmp", exist_ok=True)
-     try:
-         y, sr = librosa.load(audio_path, sr=48000)
-         non_silents = librosa.effects.split(y, top_db=topdb)
-         non_silent = np.concatenate([y[start:end] for start, end in non_silents])
-         cqt_spec = librosa.cqt(y=non_silent, sr=sr)
-         log_cqt_spec = librosa.power_to_db(np.abs(cqt_spec) ** 2, ref=np.max)
-         dur = librosa.get_duration(y=non_silent, sr=sr)
-         total_frames = log_cqt_spec.shape[1]
-         step = int(width * total_frames / dur)
-         count = int(total_frames / step)
-         begin = int(0.5 * (total_frames - count * step))
-         end = begin + step * count
-         for i in range(begin, end, step):
-             librosa.display.specshow(log_cqt_spec[:, i : i + step])
-             plt.axis("off")
-             plt.savefig(
-                 f"./tmp/cqt_{round(dur, 2)}_{i}.jpg",
-                 bbox_inches="tight",
-                 pad_inches=0.0,
-             )
-             plt.close()
-
-     except Exception as e:
-         print(f"Error converting {audio_path} : {e}")
-
-
- def wav_to_chroma(audio_path: str, width=1.6, topdb=40):
-     os.makedirs("./tmp", exist_ok=True)
-     try:
-         y, sr = librosa.load(audio_path, sr=48000)
-         non_silents = librosa.effects.split(y, top_db=topdb)
-         non_silent = np.concatenate([y[start:end] for start, end in non_silents])
-         chroma_spec = librosa.feature.chroma_stft(y=non_silent, sr=sr)
-         log_chroma_spec = librosa.power_to_db(np.abs(chroma_spec) ** 2, ref=np.max)
-         dur = librosa.get_duration(y=non_silent, sr=sr)
-         total_frames = log_chroma_spec.shape[1]
-         step = int(width * total_frames / dur)
-         count = int(total_frames / step)
-         begin = int(0.5 * (total_frames - count * step))
-         end = begin + step * count
-         for i in range(begin, end, step):
-             librosa.display.specshow(log_chroma_spec[:, i : i + step])
-             plt.axis("off")
-             plt.savefig(
-                 f"./tmp/chroma_{round(dur, 2)}_{i}.jpg",
-                 bbox_inches="tight",
-                 pad_inches=0.0,
-             )
-             plt.close()
-
-     except Exception as e:
-         print(f"Error converting {audio_path} : {e}")
-
-
- def embed_img(img_path, input_size=224):
-     transform = transforms.Compose(
-         [
-             transforms.Resize([input_size, input_size]),
-             transforms.ToTensor(),
-             transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
-         ]
-     )
-     img = Image.open(img_path).convert("RGB")
-     return transform(img).unsqueeze(0)
-
-
- def inference(wav_path: str, log_name: str, folder_path="./tmp"):
-     if os.path.exists(folder_path):
-         shutil.rmtree(folder_path)
-
-     if not wav_path:
-         wav_path = "./examples/f_bel.wav"
-
-     model = EvalNet(log_name).model
-     spec = log_name.split("_")[-1]
-     eval("wav_to_%s" % spec)(wav_path)
-     outputs = []
-     all_files = os.listdir(folder_path)
-     for file_name in all_files:
-         if file_name.lower().endswith(".jpg"):
-             file_path = os.path.join(folder_path, file_name)
-             input = embed_img(file_path)
-             output = model(input)
-             pred_id = torch.max(output.data, 1)[1]
-             outputs.append(pred_id)
-
-     max_count_item = most_common_element(outputs)
-     shutil.rmtree(folder_path)
-     return os.path.basename(wav_path), translate[CLASSES[max_count_item]]
-
-
- if __name__ == "__main__":
-     warnings.filterwarnings("ignore")
-
-     models = get_modelist()
-     translate = {
-         "m_bel": "男声美声唱法",
-         "m_folk": "男声民族唱法",
-         "f_bel": "女声美声唱法",
-         "f_folk": "女声民族唱法",
-     }
-     examples = []
-     example_wavs = find_wav_files()
-     model_num = len(models)
-     for wav in example_wavs:
-         examples.append([wav, models[random.randint(0, model_num - 1)]])
-
-     with gr.Blocks() as demo:
-         gr.Interface(
-             fn=inference,
-             inputs=[
-                 gr.Audio(label="上传录音", type="filepath"),
-                 gr.Dropdown(choices=models, label="选择模型", value=models[0]),
-             ],
-             outputs=[
-                 gr.Textbox(label="音频文件名", show_copy_button=True),
-                 gr.Textbox(label="唱法识别", show_copy_button=True),
-             ],
-             examples=examples,
-             allow_flagging="never",
-             title="建议录音时长保持在 5s 左右, 过长会影响识别效率",
-         )
-
-     demo.launch()
+ import os
+ import torch
+ import random
+ import shutil
+ import librosa
+ import warnings
+ import numpy as np
+ import gradio as gr
+ import librosa.display
+ import matplotlib.pyplot as plt
+ import torchvision.transforms as transforms
+ from utils import get_modelist, find_wav_files
+ from collections import Counter
+ from model import EvalNet
+ from PIL import Image
+
+
+ CLASSES = ["m_bel", "f_bel", "m_folk", "f_folk"]
+
+
+ def most_common_element(input_list):
+     # Use Counter to count the occurrences of each element
+     counter = Counter(input_list)
+     # Use the most_common method to get the most frequent element
+     most_common_element, _ = counter.most_common(1)[0]
+     return most_common_element
+
+
+ def wav_to_mel(audio_path: str, width=1.6, topdb=40):
+     os.makedirs("./tmp", exist_ok=True)
+     try:
+         y, sr = librosa.load(audio_path, sr=48000)
+         non_silents = librosa.effects.split(y, top_db=topdb)
+         non_silent = np.concatenate([y[start:end] for start, end in non_silents])
+         mel_spec = librosa.feature.melspectrogram(y=non_silent, sr=sr)
+         log_mel_spec = librosa.power_to_db(mel_spec, ref=np.max)
+         dur = librosa.get_duration(y=non_silent, sr=sr)
+         total_frames = log_mel_spec.shape[1]
+         step = int(width * total_frames / dur)
+         count = int(total_frames / step)
+         begin = int(0.5 * (total_frames - count * step))
+         end = begin + step * count
+         for i in range(begin, end, step):
+             librosa.display.specshow(log_mel_spec[:, i : i + step])
+             plt.axis("off")
+             plt.savefig(
+                 f"./tmp/mel_{round(dur, 2)}_{i}.jpg",
+                 bbox_inches="tight",
+                 pad_inches=0.0,
+             )
+             plt.close()
+
+     except Exception as e:
+         print(f"Error converting {audio_path} : {e}")
+
+
+ def wav_to_cqt(audio_path: str, width=1.6, topdb=40):
+     os.makedirs("./tmp", exist_ok=True)
+     try:
+         y, sr = librosa.load(audio_path, sr=48000)
+         non_silents = librosa.effects.split(y, top_db=topdb)
+         non_silent = np.concatenate([y[start:end] for start, end in non_silents])
+         cqt_spec = librosa.cqt(y=non_silent, sr=sr)
+         log_cqt_spec = librosa.power_to_db(np.abs(cqt_spec) ** 2, ref=np.max)
+         dur = librosa.get_duration(y=non_silent, sr=sr)
+         total_frames = log_cqt_spec.shape[1]
+         step = int(width * total_frames / dur)
+         count = int(total_frames / step)
+         begin = int(0.5 * (total_frames - count * step))
+         end = begin + step * count
+         for i in range(begin, end, step):
+             librosa.display.specshow(log_cqt_spec[:, i : i + step])
+             plt.axis("off")
+             plt.savefig(
+                 f"./tmp/cqt_{round(dur, 2)}_{i}.jpg",
+                 bbox_inches="tight",
+                 pad_inches=0.0,
+             )
+             plt.close()
+
+     except Exception as e:
+         print(f"Error converting {audio_path} : {e}")
+
+
+ def wav_to_chroma(audio_path: str, width=1.6, topdb=40):
+     os.makedirs("./tmp", exist_ok=True)
+     try:
+         y, sr = librosa.load(audio_path, sr=48000)
+         non_silents = librosa.effects.split(y, top_db=topdb)
+         non_silent = np.concatenate([y[start:end] for start, end in non_silents])
+         chroma_spec = librosa.feature.chroma_stft(y=non_silent, sr=sr)
+         log_chroma_spec = librosa.power_to_db(np.abs(chroma_spec) ** 2, ref=np.max)
+         dur = librosa.get_duration(y=non_silent, sr=sr)
+         total_frames = log_chroma_spec.shape[1]
+         step = int(width * total_frames / dur)
+         count = int(total_frames / step)
+         begin = int(0.5 * (total_frames - count * step))
+         end = begin + step * count
+         for i in range(begin, end, step):
+             librosa.display.specshow(log_chroma_spec[:, i : i + step])
+             plt.axis("off")
+             plt.savefig(
+                 f"./tmp/chroma_{round(dur, 2)}_{i}.jpg",
+                 bbox_inches="tight",
+                 pad_inches=0.0,
+             )
+             plt.close()
+
+     except Exception as e:
+         print(f"Error converting {audio_path} : {e}")
+
+
+ def embed_img(img_path, input_size=224):
+     transform = transforms.Compose(
+         [
+             transforms.Resize([input_size, input_size]),
+             transforms.ToTensor(),
+             transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
+         ]
+     )
+     img = Image.open(img_path).convert("RGB")
+     return transform(img).unsqueeze(0)
+
+
+ def inference(wav_path: str, log_name: str, folder_path="./tmp"):
+     if os.path.exists(folder_path):
+         shutil.rmtree(folder_path)
+
+     if not wav_path:
+         wav_path = "./examples/f_bel.wav"
+
+     model = EvalNet(log_name).model
+     spec = log_name.split("_")[-1]
+     eval("wav_to_%s" % spec)(wav_path)
+     outputs = []
+     all_files = os.listdir(folder_path)
+     for file_name in all_files:
+         if file_name.lower().endswith(".jpg"):
+             file_path = os.path.join(folder_path, file_name)
+             input = embed_img(file_path)
+             output = model(input)
+             pred_id = torch.max(output.data, 1)[1]
+             outputs.append(pred_id)
+
+     max_count_item = most_common_element(outputs)
+     shutil.rmtree(folder_path)
+     return os.path.basename(wav_path), translate[CLASSES[max_count_item]]
+
+
+ if __name__ == "__main__":
+     warnings.filterwarnings("ignore")
+
+     models = get_modelist()
+     translate = {
+         "m_bel": "Male bel canto",
+         "m_folk": "Male folk singing",
+         "f_bel": "Female bel canto",
+         "f_folk": "Female folk singing",
+     }
+     examples = []
+     example_wavs = find_wav_files()
+     model_num = len(models)
+     for wav in example_wavs:
+         examples.append([wav, models[random.randint(0, model_num - 1)]])
+
+     with gr.Blocks() as demo:
+         gr.Interface(
+             fn=inference,
+             inputs=[
+                 gr.Audio(label="Uploading a recording", type="filepath"),
+                 gr.Dropdown(choices=models, label="Select a model", value=models[0]),
+             ],
+             outputs=[
+                 gr.Textbox(label="Audio filename", show_copy_button=True),
+                 gr.Textbox(label="Singing style recognition", show_copy_button=True),
+             ],
+             examples=examples,
+             allow_flagging="never",
+             title="It is recommended to keep the recording length around 5s, too long will affect the recognition efficiency.",
+         )
+
+     demo.launch()
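
Note on the dispatch in inference(): the converter is selected by evaluating the suffix of the chosen model's log name via eval("wav_to_%s" % spec). A minimal alternative sketch using an explicit lookup table is shown below; it assumes only the three converters defined in app.py above, and the names SPEC_CONVERTERS and convert_wav are hypothetical, not part of the committed file.

SPEC_CONVERTERS = {
    "mel": wav_to_mel,
    "cqt": wav_to_cqt,
    "chroma": wav_to_chroma,
}

def convert_wav(wav_path: str, spec: str):
    # An unknown suffix fails with a clear KeyError instead of being passed to eval().
    return SPEC_CONVERTERS[spec](wav_path)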