Monan Zhou committed
Commit 752fe95
Parent: 9518326

Update app.py

Files changed (1)
  1. app.py +176 -176
app.py CHANGED
@@ -1,176 +1,176 @@
- import os
- import torch
- import random
- import shutil
- import librosa
- import warnings
- import numpy as np
- import gradio as gr
- import librosa.display
- import matplotlib.pyplot as plt
- import torchvision.transforms as transforms
- from utils import get_modelist, find_wav_files
- from collections import Counter
- from model import EvalNet
- from PIL import Image
-
-
- CLASSES = ["m_chest", "f_chest", "m_falsetto", "f_falsetto"]
-
-
- def most_common_element(input_list):
-     # Use Counter to tally how many times each element appears
-     counter = Counter(input_list)
-     # Use most_common to get the element with the highest count
-     most_common_element, _ = counter.most_common(1)[0]
-     return most_common_element
-
-
- def wav_to_mel(audio_path: str, width=0.07):
-     os.makedirs("./tmp", exist_ok=True)
-     try:
-         y, sr = librosa.load(audio_path, sr=48000)
-         mel_spec = librosa.feature.melspectrogram(y=y, sr=sr)
-         log_mel_spec = librosa.power_to_db(mel_spec, ref=np.max)
-         dur = librosa.get_duration(y=y, sr=sr)
-         total_frames = log_mel_spec.shape[1]
-         step = int(width * total_frames / dur)
-         count = int(total_frames / step)
-         begin = int(0.5 * (total_frames - count * step))
-         end = begin + step * count
-         for i in range(begin, end, step):
-             librosa.display.specshow(log_mel_spec[:, i : i + step])
-             plt.axis("off")
-             plt.savefig(
-                 f"./tmp/mel_{round(dur, 2)}_{i}.jpg",
-                 bbox_inches="tight",
-                 pad_inches=0.0,
-             )
-             plt.close()
-
-     except Exception as e:
-         print(f"Error converting {audio_path} : {e}")
-
-
- def wav_to_cqt(audio_path: str, width=0.07):
-     os.makedirs("./tmp", exist_ok=True)
-     try:
-         y, sr = librosa.load(audio_path, sr=48000)
-         cqt_spec = librosa.cqt(y=y, sr=sr)
-         log_cqt_spec = librosa.power_to_db(np.abs(cqt_spec) ** 2, ref=np.max)
-         dur = librosa.get_duration(y=y, sr=sr)
-         total_frames = log_cqt_spec.shape[1]
-         step = int(width * total_frames / dur)
-         count = int(total_frames / step)
-         begin = int(0.5 * (total_frames - count * step))
-         end = begin + step * count
-         for i in range(begin, end, step):
-             librosa.display.specshow(log_cqt_spec[:, i : i + step])
-             plt.axis("off")
-             plt.savefig(
-                 f"./tmp/cqt_{round(dur, 2)}_{i}.jpg",
-                 bbox_inches="tight",
-                 pad_inches=0.0,
-             )
-             plt.close()
-
-     except Exception as e:
-         print(f"Error converting {audio_path} : {e}")
-
-
- def wav_to_chroma(audio_path: str, width=0.07):
-     os.makedirs("./tmp", exist_ok=True)
-     try:
-         y, sr = librosa.load(audio_path, sr=48000)
-         chroma_spec = librosa.feature.chroma_stft(y=y, sr=sr)
-         log_chroma_spec = librosa.power_to_db(np.abs(chroma_spec) ** 2, ref=np.max)
-         dur = librosa.get_duration(y=y, sr=sr)
-         total_frames = log_chroma_spec.shape[1]
-         step = int(width * total_frames / dur)
-         count = int(total_frames / step)
-         begin = int(0.5 * (total_frames - count * step))
-         end = begin + step * count
-         for i in range(begin, end, step):
-             librosa.display.specshow(log_chroma_spec[:, i : i + step])
-             plt.axis("off")
-             plt.savefig(
-                 f"./tmp/chroma_{round(dur, 2)}_{i}.jpg",
-                 bbox_inches="tight",
-                 pad_inches=0.0,
-             )
-             plt.close()
-
-     except Exception as e:
-         print(f"Error converting {audio_path} : {e}")
-
-
- def embed_img(img_path, input_size=224):
-     transform = transforms.Compose(
-         [
-             transforms.Resize([input_size, input_size]),
-             transforms.ToTensor(),
-             transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
-         ]
-     )
-     img = Image.open(img_path).convert("RGB")
-     return transform(img).unsqueeze(0)
-
-
- def inference(wav_path, log_name: str, folder_path="./tmp"):
-     if os.path.exists(folder_path):
-         shutil.rmtree(folder_path)
-
-     if not wav_path:
-         wav_path = "./examples/m_falsetto.wav"
-
-     model = EvalNet(log_name).model
-     spec = log_name.split("_")[-1]
-     eval("wav_to_%s" % spec)(wav_path)
-     outputs = []
-     all_files = os.listdir(folder_path)
-     for file_name in all_files:
-         if file_name.lower().endswith(".jpg"):
-             file_path = os.path.join(folder_path, file_name)
-             input = embed_img(file_path)
-             output = model(input)
-             pred_id = torch.max(output.data, 1)[1]
-             outputs.append(pred_id)
-
-     max_count_item = most_common_element(outputs)
-     shutil.rmtree(folder_path)
-     return os.path.basename(wav_path), translate[CLASSES[max_count_item]]
-
-
- if __name__ == "__main__":
-     warnings.filterwarnings("ignore")
-
-     models = get_modelist()
-     translate = {
-         "m_chest": "男真声",
-         "f_chest": "女真声",
-         "m_falsetto": "男假声",
-         "f_falsetto": "女假声",
-     }
-     examples = []
-     example_wavs = find_wav_files()
-     model_num = len(models)
-     for wav in example_wavs:
-         examples.append([wav, models[random.randint(0, model_num - 1)]])
-
-     with gr.Blocks() as demo:
-         gr.Interface(
-             fn=inference,
-             inputs=[
-                 gr.Audio(label="上传录音", type="filepath"),
-                 gr.Dropdown(choices=models, label="选择模型", value=models[0]),
-             ],
-             outputs=[
-                 gr.Textbox(label="音频文件名", show_copy_button=True),
-                 gr.Textbox(label="唱法识别", show_copy_button=True),
-             ],
-             examples=examples,
-             allow_flagging="never",
-             title="建议录音时长保持在 5s 左右, 过长会影响识别效率",
-         )
-
-     demo.launch()
+ import os
+ import torch
+ import random
+ import shutil
+ import librosa
+ import warnings
+ import numpy as np
+ import gradio as gr
+ import librosa.display
+ import matplotlib.pyplot as plt
+ import torchvision.transforms as transforms
+ from utils import get_modelist, find_wav_files
+ from collections import Counter
+ from model import EvalNet
+ from PIL import Image
+
+
+ CLASSES = ["m_chest", "f_chest", "m_falsetto", "f_falsetto"]
+
+
+ def most_common_element(input_list):
+     # Use Counter to tally how many times each element appears
+     counter = Counter(input_list)
+     # Use most_common to get the element with the highest count
+     most_common_element, _ = counter.most_common(1)[0]
+     return most_common_element
+
+
+ def wav_to_mel(audio_path: str, width=0.07):
+     os.makedirs("./tmp", exist_ok=True)
+     try:
+         y, sr = librosa.load(audio_path, sr=48000)
+         mel_spec = librosa.feature.melspectrogram(y=y, sr=sr)
+         log_mel_spec = librosa.power_to_db(mel_spec, ref=np.max)
+         dur = librosa.get_duration(y=y, sr=sr)
+         total_frames = log_mel_spec.shape[1]
+         step = int(width * total_frames / dur)
+         count = int(total_frames / step)
+         begin = int(0.5 * (total_frames - count * step))
+         end = begin + step * count
+         for i in range(begin, end, step):
+             librosa.display.specshow(log_mel_spec[:, i : i + step])
+             plt.axis("off")
+             plt.savefig(
+                 f"./tmp/mel_{round(dur, 2)}_{i}.jpg",
+                 bbox_inches="tight",
+                 pad_inches=0.0,
+             )
+             plt.close()
+
+     except Exception as e:
+         print(f"Error converting {audio_path} : {e}")
+
+
+ def wav_to_cqt(audio_path: str, width=0.07):
+     os.makedirs("./tmp", exist_ok=True)
+     try:
+         y, sr = librosa.load(audio_path, sr=48000)
+         cqt_spec = librosa.cqt(y=y, sr=sr)
+         log_cqt_spec = librosa.power_to_db(np.abs(cqt_spec) ** 2, ref=np.max)
+         dur = librosa.get_duration(y=y, sr=sr)
+         total_frames = log_cqt_spec.shape[1]
+         step = int(width * total_frames / dur)
+         count = int(total_frames / step)
+         begin = int(0.5 * (total_frames - count * step))
+         end = begin + step * count
+         for i in range(begin, end, step):
+             librosa.display.specshow(log_cqt_spec[:, i : i + step])
+             plt.axis("off")
+             plt.savefig(
+                 f"./tmp/cqt_{round(dur, 2)}_{i}.jpg",
+                 bbox_inches="tight",
+                 pad_inches=0.0,
+             )
+             plt.close()
+
+     except Exception as e:
+         print(f"Error converting {audio_path} : {e}")
+
+
+ def wav_to_chroma(audio_path: str, width=0.07):
+     os.makedirs("./tmp", exist_ok=True)
+     try:
+         y, sr = librosa.load(audio_path, sr=48000)
+         chroma_spec = librosa.feature.chroma_stft(y=y, sr=sr)
+         log_chroma_spec = librosa.power_to_db(np.abs(chroma_spec) ** 2, ref=np.max)
+         dur = librosa.get_duration(y=y, sr=sr)
+         total_frames = log_chroma_spec.shape[1]
+         step = int(width * total_frames / dur)
+         count = int(total_frames / step)
+         begin = int(0.5 * (total_frames - count * step))
+         end = begin + step * count
+         for i in range(begin, end, step):
+             librosa.display.specshow(log_chroma_spec[:, i : i + step])
+             plt.axis("off")
+             plt.savefig(
+                 f"./tmp/chroma_{round(dur, 2)}_{i}.jpg",
+                 bbox_inches="tight",
+                 pad_inches=0.0,
+             )
+             plt.close()
+
+     except Exception as e:
+         print(f"Error converting {audio_path} : {e}")
+
+
+ def embed_img(img_path, input_size=224):
+     transform = transforms.Compose(
+         [
+             transforms.Resize([input_size, input_size]),
+             transforms.ToTensor(),
+             transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
+         ]
+     )
+     img = Image.open(img_path).convert("RGB")
+     return transform(img).unsqueeze(0)
+
+
+ def inference(wav_path, log_name: str, folder_path="./tmp"):
+     if os.path.exists(folder_path):
+         shutil.rmtree(folder_path)
+
+     if not wav_path:
+         wav_path = "./examples/m_falsetto.wav"
+
+     model = EvalNet(log_name).model
+     spec = log_name.split("_")[-1]
+     eval("wav_to_%s" % spec)(wav_path)
+     outputs = []
+     all_files = os.listdir(folder_path)
+     for file_name in all_files:
+         if file_name.lower().endswith(".jpg"):
+             file_path = os.path.join(folder_path, file_name)
+             input = embed_img(file_path)
+             output = model(input)
+             pred_id = torch.max(output.data, 1)[1]
+             outputs.append(pred_id)
+
+     max_count_item = most_common_element(outputs)
+     shutil.rmtree(folder_path)
+     return os.path.basename(wav_path), translate[CLASSES[max_count_item]]
+
+
+ if __name__ == "__main__":
+     warnings.filterwarnings("ignore")
+
+     models = get_modelist()
+     translate = {
+         "m_chest": "Male chest voice",
+         "f_chest": "Female chest voice",
+         "m_falsetto": "Male falsetto voice",
+         "f_falsetto": "Female falsetto voice",
+     }
+     examples = []
+     example_wavs = find_wav_files()
+     model_num = len(models)
+     for wav in example_wavs:
+         examples.append([wav, models[random.randint(0, model_num - 1)]])
+
+     with gr.Blocks() as demo:
+         gr.Interface(
+             fn=inference,
+             inputs=[
+                 gr.Audio(label="Uploading a recording", type="filepath"),
+                 gr.Dropdown(choices=models, label="Select a model", value=models[0]),
+             ],
+             outputs=[
+                 gr.Textbox(label="Audio filename", show_copy_button=True),
+                 gr.Textbox(label="Singing style recognition", show_copy_button=True),
+             ],
+             examples=examples,
+             allow_flagging="never",
+             title="It is recommended to keep the recording length around 5s, too long will affect the recognition efficiency.",
+         )
+
+     demo.launch()
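
For reference, a minimal sketch (not part of this commit) of how the updated `inference` entry point could be exercised outside the Gradio UI, assuming the repository's `utils` and `model` modules are importable from the working directory and that `get_modelist()` returns names whose suffix after the last `_` is `mel`, `cqt`, or `chroma`. Because `app.py` only builds the `translate` mapping under the `__main__` guard, the sketch sets it on the module before calling.

```python
# Hypothetical local sanity check; assumes the repo's utils and model modules resolve.
import app

# inference() reads the module-level `translate` dict, which app.py defines only
# under `if __name__ == "__main__":`, so provide it here before calling.
app.translate = {
    "m_chest": "Male chest voice",
    "f_chest": "Female chest voice",
    "m_falsetto": "Male falsetto voice",
    "f_falsetto": "Female falsetto voice",
}

models = app.get_modelist()  # assumed: checkpoint names like "<backbone>_mel"
filename, label = app.inference("./examples/m_falsetto.wav", models[0])
print(filename, label)
```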