monet-joe committed
Commit 20e29fa
1 parent: 3b9e533

Upload 3 files

Files changed (2)
  1. app.py +176 -176
  2. model.py +144 -142
app.py CHANGED
@@ -1,176 +1,176 @@
- import os
- import torch
- import random
- import shutil
- import librosa
- import warnings
- import numpy as np
- import gradio as gr
- import librosa.display
- import matplotlib.pyplot as plt
- import torchvision.transforms as transforms
- from utils import get_modelist, find_wav_files
- from collections import Counter
- from model import EvalNet
- from PIL import Image
-
-
- CLASSES = ["m_chest", "f_chest", "m_falsetto", "f_falsetto"]
-
-
- def most_common_element(input_list):
-     # Count how many times each element appears with Counter
-     counter = Counter(input_list)
-     # most_common returns the element with the highest count
-     most_common_element, _ = counter.most_common(1)[0]
-     return most_common_element
-
-
- def wav_to_mel(audio_path: str, width=0.07):
-     os.makedirs("./tmp", exist_ok=True)
-     try:
-         y, sr = librosa.load(audio_path, sr=48000)
-         mel_spec = librosa.feature.melspectrogram(y=y, sr=sr)
-         log_mel_spec = librosa.power_to_db(mel_spec, ref=np.max)
-         dur = librosa.get_duration(y=y, sr=sr)
-         total_frames = log_mel_spec.shape[1]
-         step = int(width * total_frames / dur)
-         count = int(total_frames / step)
-         begin = int(0.5 * (total_frames - count * step))
-         end = begin + step * count
-         for i in range(begin, end, step):
-             librosa.display.specshow(log_mel_spec[:, i : i + step])
-             plt.axis("off")
-             plt.savefig(
-                 f"./tmp/mel_{round(dur, 2)}_{i}.jpg",
-                 bbox_inches="tight",
-                 pad_inches=0.0,
-             )
-             plt.close()
-
-     except Exception as e:
-         print(f"Error converting {audio_path} : {e}")
-
-
- def wav_to_cqt(audio_path: str, width=0.07):
-     os.makedirs("./tmp", exist_ok=True)
-     try:
-         y, sr = librosa.load(audio_path, sr=48000)
-         cqt_spec = librosa.cqt(y=y, sr=sr)
-         log_cqt_spec = librosa.power_to_db(np.abs(cqt_spec) ** 2, ref=np.max)
-         dur = librosa.get_duration(y=y, sr=sr)
-         total_frames = log_cqt_spec.shape[1]
-         step = int(width * total_frames / dur)
-         count = int(total_frames / step)
-         begin = int(0.5 * (total_frames - count * step))
-         end = begin + step * count
-         for i in range(begin, end, step):
-             librosa.display.specshow(log_cqt_spec[:, i : i + step])
-             plt.axis("off")
-             plt.savefig(
-                 f"./tmp/cqt_{round(dur, 2)}_{i}.jpg",
-                 bbox_inches="tight",
-                 pad_inches=0.0,
-             )
-             plt.close()
-
-     except Exception as e:
-         print(f"Error converting {audio_path} : {e}")
-
-
- def wav_to_chroma(audio_path: str, width=0.07):
-     os.makedirs("./tmp", exist_ok=True)
-     try:
-         y, sr = librosa.load(audio_path, sr=48000)
-         chroma_spec = librosa.feature.chroma_stft(y=y, sr=sr)
-         log_chroma_spec = librosa.power_to_db(np.abs(chroma_spec) ** 2, ref=np.max)
-         dur = librosa.get_duration(y=y, sr=sr)
-         total_frames = log_chroma_spec.shape[1]
-         step = int(width * total_frames / dur)
-         count = int(total_frames / step)
-         begin = int(0.5 * (total_frames - count * step))
-         end = begin + step * count
-         for i in range(begin, end, step):
-             librosa.display.specshow(log_chroma_spec[:, i : i + step])
-             plt.axis("off")
-             plt.savefig(
-                 f"./tmp/chroma_{round(dur, 2)}_{i}.jpg",
-                 bbox_inches="tight",
-                 pad_inches=0.0,
-             )
-             plt.close()
-
-     except Exception as e:
-         print(f"Error converting {audio_path} : {e}")
-
-
- def embed_img(img_path, input_size=224):
-     transform = transforms.Compose(
-         [
-             transforms.Resize([input_size, input_size]),
-             transforms.ToTensor(),
-             transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
-         ]
-     )
-     img = Image.open(img_path).convert("RGB")
-     return transform(img).unsqueeze(0)
-
-
- def inference(wav_path, log_name: str, folder_path="./tmp"):
-     if os.path.exists(folder_path):
-         shutil.rmtree(folder_path)
-
-     if not wav_path:
-         wav_path = "./examples/m_falsetto.wav"
-
-     model = EvalNet(log_name).model
-     spec = log_name.split("_")[-1]
-     eval("wav_to_%s" % spec)(wav_path)
-     outputs = []
-     all_files = os.listdir(folder_path)
-     for file_name in all_files:
-         if file_name.lower().endswith(".jpg"):
-             file_path = os.path.join(folder_path, file_name)
-             input = embed_img(file_path)
-             output = model(input)
-             pred_id = torch.max(output.data, 1)[1]
-             outputs.append(pred_id)
-
-     max_count_item = most_common_element(outputs)
-     shutil.rmtree(folder_path)
-     return os.path.basename(wav_path), translate[CLASSES[max_count_item]]
-
-
- if __name__ == "__main__":
-     warnings.filterwarnings("ignore")
-
-     models = get_modelist()
-     translate = {
-         "m_chest": "Male chest voice",
-         "f_chest": "Female chest voice",
-         "m_falsetto": "Male falsetto voice",
-         "f_falsetto": "Female falsetto voice",
-     }
-     examples = []
-     example_wavs = find_wav_files()
-     model_num = len(models)
-     for wav in example_wavs:
-         examples.append([wav, models[random.randint(0, model_num - 1)]])
-
-     with gr.Blocks() as demo:
-         gr.Interface(
-             fn=inference,
-             inputs=[
-                 gr.Audio(label="Uploading a recording", type="filepath"),
-                 gr.Dropdown(choices=models, label="Select a model", value=models[0]),
-             ],
-             outputs=[
-                 gr.Textbox(label="Audio filename", show_copy_button=True),
-                 gr.Textbox(label="Singing style recognition", show_copy_button=True),
-             ],
-             examples=examples,
-             allow_flagging="never",
-             title="It is recommended to keep the recording length around 5s, too long will affect the recognition efficiency.",
-         )
-
-     demo.launch()
 
+ import os
+ import torch
+ import random
+ import shutil
+ import librosa
+ import warnings
+ import numpy as np
+ import gradio as gr
+ import librosa.display
+ import matplotlib.pyplot as plt
+ import torchvision.transforms as transforms
+ from utils import get_modelist, find_wav_files
+ from collections import Counter
+ from model import EvalNet
+ from PIL import Image
+
+
+ TRANSLATE = {
+     "m_chest": "男真声",  # male chest voice
+     "f_chest": "女真声",  # female chest voice
+     "m_falsetto": "男假声",  # male falsetto
+     "f_falsetto": "女假声",  # female falsetto
+ }
+ CLASSES = list(TRANSLATE.keys())
+
+
+ def most_common_element(input_list):
+     # Count how many times each element appears with Counter
+     counter = Counter(input_list)
+     # most_common returns the element with the highest count
+     most_common_element, _ = counter.most_common(1)[0]
+     return most_common_element
+
+
+ def wav_to_mel(audio_path: str, width=0.07):
+     os.makedirs("./tmp", exist_ok=True)
+     try:
+         y, sr = librosa.load(audio_path, sr=48000)
+         mel_spec = librosa.feature.melspectrogram(y=y, sr=sr)
+         log_mel_spec = librosa.power_to_db(mel_spec, ref=np.max)
+         dur = librosa.get_duration(y=y, sr=sr)
+         total_frames = log_mel_spec.shape[1]
+         step = int(width * total_frames / dur)
+         count = int(total_frames / step)
+         begin = int(0.5 * (total_frames - count * step))
+         end = begin + step * count
+         for i in range(begin, end, step):
+             librosa.display.specshow(log_mel_spec[:, i : i + step])
+             plt.axis("off")
+             plt.savefig(
+                 f"./tmp/mel_{round(dur, 2)}_{i}.jpg",
+                 bbox_inches="tight",
+                 pad_inches=0.0,
+             )
+             plt.close()
+
+     except Exception as e:
+         print(f"Error converting {audio_path} : {e}")
+
+
+ def wav_to_cqt(audio_path: str, width=0.07):
+     os.makedirs("./tmp", exist_ok=True)
+     try:
+         y, sr = librosa.load(audio_path, sr=48000)
+         cqt_spec = librosa.cqt(y=y, sr=sr)
+         log_cqt_spec = librosa.power_to_db(np.abs(cqt_spec) ** 2, ref=np.max)
+         dur = librosa.get_duration(y=y, sr=sr)
+         total_frames = log_cqt_spec.shape[1]
+         step = int(width * total_frames / dur)
+         count = int(total_frames / step)
+         begin = int(0.5 * (total_frames - count * step))
+         end = begin + step * count
+         for i in range(begin, end, step):
+             librosa.display.specshow(log_cqt_spec[:, i : i + step])
+             plt.axis("off")
+             plt.savefig(
+                 f"./tmp/cqt_{round(dur, 2)}_{i}.jpg",
+                 bbox_inches="tight",
+                 pad_inches=0.0,
+             )
+             plt.close()
+
+     except Exception as e:
+         print(f"Error converting {audio_path} : {e}")
+
+
+ def wav_to_chroma(audio_path: str, width=0.07):
+     os.makedirs("./tmp", exist_ok=True)
+     try:
+         y, sr = librosa.load(audio_path, sr=48000)
+         chroma_spec = librosa.feature.chroma_stft(y=y, sr=sr)
+         log_chroma_spec = librosa.power_to_db(np.abs(chroma_spec) ** 2, ref=np.max)
+         dur = librosa.get_duration(y=y, sr=sr)
+         total_frames = log_chroma_spec.shape[1]
+         step = int(width * total_frames / dur)
+         count = int(total_frames / step)
+         begin = int(0.5 * (total_frames - count * step))
+         end = begin + step * count
+         for i in range(begin, end, step):
+             librosa.display.specshow(log_chroma_spec[:, i : i + step])
+             plt.axis("off")
+             plt.savefig(
+                 f"./tmp/chroma_{round(dur, 2)}_{i}.jpg",
+                 bbox_inches="tight",
+                 pad_inches=0.0,
+             )
+             plt.close()
+
+     except Exception as e:
+         print(f"Error converting {audio_path} : {e}")
+
+
+ def embed_img(img_path, input_size=224):
+     transform = transforms.Compose(
+         [
+             transforms.Resize([input_size, input_size]),
+             transforms.ToTensor(),
+             transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
+         ]
+     )
+     img = Image.open(img_path).convert("RGB")
+     return transform(img).unsqueeze(0)
+
+
+ def inference(wav_path, log_name: str, folder_path="./tmp"):
+     if os.path.exists(folder_path):
+         shutil.rmtree(folder_path)
+
+     if not wav_path:
+         wav_path = "./examples/m_falsetto.wav"
+
+     model = EvalNet(log_name).model
+     spec = log_name.split("_")[-1]
+     eval("wav_to_%s" % spec)(wav_path)
+     outputs = []
+     all_files = os.listdir(folder_path)
+     for file_name in all_files:
+         if file_name.lower().endswith(".jpg"):
+             file_path = os.path.join(folder_path, file_name)
+             input = embed_img(file_path)
+             output = model(input)
+             pred_id = torch.max(output.data, 1)[1]
+             outputs.append(pred_id)
+
+     max_count_item = most_common_element(outputs)
+     shutil.rmtree(folder_path)
+     return os.path.basename(wav_path), TRANSLATE[CLASSES[max_count_item]]
+
+
+ if __name__ == "__main__":
+     warnings.filterwarnings("ignore")
+
+     models = get_modelist()
+     examples = []
+     example_wavs = find_wav_files()
+     model_num = len(models)
+     for wav in example_wavs:
+         examples.append([wav, models[random.randint(0, model_num - 1)]])
+
+     with gr.Blocks() as demo:
+         gr.Interface(
+             fn=inference,
+             inputs=[
+                 gr.Audio(label="上传录音", type="filepath"),  # "Upload a recording"
+                 gr.Dropdown(choices=models, label="选择模型", value=models[0]),  # "Select a model"
+             ],
+             outputs=[
+                 gr.Textbox(label="音频文件名", show_copy_button=True),  # "Audio filename"
+                 gr.Textbox(label="唱法识别", show_copy_button=True),  # "Singing style recognition"
+             ],
+             examples=examples,
+             allow_flagging="never",
+             title="建议录音时长保持在 5s 左右, 过长会影响识别效率",  # "Keep recordings around 5 s; overly long clips slow down recognition"
+         )
+
+     demo.launch()
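
For reference, the prediction path that app.py wires into Gradio can also be driven directly from Python. The sketch below is illustrative only: it assumes a trained checkpoint folder such as "squeezenet1_1_mel" under MODEL_DIR (any "<torchvision backbone>_<mel|cqt|chroma>" log name returned by get_modelist() would work) and reuses the example recording bundled with the repo.

# Minimal sketch, not part of the commit: classify one clip without the UI.
# "squeezenet1_1_mel" is a hypothetical log name; substitute one that exists.
from app import inference

if __name__ == "__main__":
    filename, label = inference(
        "./examples/m_falsetto.wav",  # bundled example; empty input falls back to it
        "squeezenet1_1_mel",          # "<backbone>_<spectrogram type>"
    )
    print(filename, "->", label)

The log name matters because inference() splits it on "_": the last token picks the wav_to_* spectrogram function, and the rest selects the torchvision backbone inside EvalNet.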
model.py CHANGED
@@ -1,142 +1,144 @@
- import torch
- import torch.nn as nn
- import torchvision.models as models
- from modelscope.msdatasets import MsDataset
- from utils import MODEL_DIR
-
-
- def get_backbone(ver, backbone_list):
-     for bb in backbone_list:
-         if ver == bb["ver"]:
-             return bb
-
-     print("Backbone name not found, using default option - alexnet.")
-     return backbone_list[0]
-
-
- def model_info(m_ver):
-     backbone_list = MsDataset.load("monetjoe/cv_backbones", split="train")
-     backbone = get_backbone(m_ver, backbone_list)
-     m_type = str(backbone["type"])
-     input_size = int(backbone["input_size"])
-     return m_type, input_size
-
-
- def Classifier(cls_num: int, output_size: int, linear_output: bool):
-     q = (1.0 * output_size / cls_num) ** 0.25
-     l1 = int(q * cls_num)
-     l2 = int(q * l1)
-     l3 = int(q * l2)
-
-     if linear_output:
-         return torch.nn.Sequential(
-             nn.Dropout(),
-             nn.Linear(output_size, l3),
-             nn.ReLU(inplace=True),
-             nn.Dropout(),
-             nn.Linear(l3, l2),
-             nn.ReLU(inplace=True),
-             nn.Dropout(),
-             nn.Linear(l2, l1),
-             nn.ReLU(inplace=True),
-             nn.Linear(l1, cls_num),
-         )
-
-     else:
-         return torch.nn.Sequential(
-             nn.Dropout(),
-             nn.Conv2d(output_size, l3, kernel_size=(1, 1), stride=(1, 1)),
-             nn.ReLU(inplace=True),
-             nn.AdaptiveAvgPool2d(output_size=(1, 1)),
-             nn.Flatten(),
-             nn.Linear(l3, l2),
-             nn.ReLU(inplace=True),
-             nn.Dropout(),
-             nn.Linear(l2, l1),
-             nn.ReLU(inplace=True),
-             nn.Linear(l1, cls_num),
-         )
-
-
- class EvalNet:
-     model = None
-     m_type = "squeezenet"
-     input_size = 224
-     output_size = 512
-
-     def __init__(self, log_name: str, cls_num=4):
-         saved_model_path = f"{MODEL_DIR}/{log_name}/save.pt"
-         m_ver = "_".join(log_name.split("_")[:-1])
-         self.m_type, self.input_size = model_info(m_ver)
-
-         if not hasattr(models, m_ver):
-             print("Unsupported model.")
-             exit()
-
-         self.model = eval("models.%s()" % m_ver)
-         linear_output = self._set_outsize()
-         self._set_classifier(cls_num, linear_output)
-         checkpoint = torch.load(saved_model_path, map_location="cpu")
-         if torch.cuda.is_available():
-             checkpoint = torch.load(saved_model_path)
-
-         self.model.load_state_dict(checkpoint, False)
-         self.model.eval()
-
-     def _set_outsize(self, debug_mode=False):
-         for name, module in self.model.named_modules():
-             if (
-                 str(name).__contains__("classifier")
-                 or str(name).__eq__("fc")
-                 or str(name).__contains__("head")
-             ):
-                 if isinstance(module, torch.nn.Linear):
-                     self.output_size = module.in_features
-                     if debug_mode:
-                         print(
-                             f"{name}(Linear): {self.output_size} -> {module.out_features}"
-                         )
-                     return True
-
-                 if isinstance(module, torch.nn.Conv2d):
-                     self.output_size = module.in_channels
-                     if debug_mode:
-                         print(
-                             f"{name}(Conv2d): {self.output_size} -> {module.out_channels}"
-                         )
-                     return False
-
-         return False
-
-     def _set_classifier(self, cls_num, linear_output):
-         if self.m_type == "convnext":
-             del self.model.classifier[2]
-             self.model.classifier = nn.Sequential(
-                 *list(self.model.classifier)
-                 + list(Classifier(cls_num, self.output_size, linear_output))
-             )
-             return
-
-         if hasattr(self.model, "classifier"):
-             self.model.classifier = Classifier(cls_num, self.output_size, linear_output)
-             return
-
-         elif hasattr(self.model, "fc"):
-             self.model.fc = Classifier(cls_num, self.output_size, linear_output)
-             return
-
-         elif hasattr(self.model, "head"):
-             self.model.head = Classifier(cls_num, self.output_size, linear_output)
-             return
-
-         self.model.heads.head = Classifier(cls_num, self.output_size, linear_output)
-
-     def forward(self, x):
-         if torch.cuda.is_available():
-             x = x.cuda()
-             self.model = self.model.cuda()
-
-         if self.m_type == "googlenet" and self.training:
-             return self.model(x)[0]
-         else:
-             return self.model(x)
 
 
 
+ import torch
+ import torch.nn as nn
+ import torchvision.models as models
+ from modelscope.msdatasets import MsDataset
+ from utils import MODEL_DIR
+
+
+ def get_backbone(ver, backbone_list):
+     for bb in backbone_list:
+         if ver == bb["ver"]:
+             return bb
+
+     print("Backbone name not found, using default option - alexnet.")
+     return backbone_list[0]
+
+
+ def model_info(m_ver):
+     backbone_list = MsDataset.load(
+         "monetjoe/cv_backbones", split="train"
+     )
+     backbone = get_backbone(m_ver, backbone_list)
+     m_type = str(backbone["type"])
+     input_size = int(backbone["input_size"])
+     return m_type, input_size
+
+
+ def Classifier(cls_num: int, output_size: int, linear_output: bool):
+     q = (1.0 * output_size / cls_num) ** 0.25
+     l1 = int(q * cls_num)
+     l2 = int(q * l1)
+     l3 = int(q * l2)
+
+     if linear_output:
+         return torch.nn.Sequential(
+             nn.Dropout(),
+             nn.Linear(output_size, l3),
+             nn.ReLU(inplace=True),
+             nn.Dropout(),
+             nn.Linear(l3, l2),
+             nn.ReLU(inplace=True),
+             nn.Dropout(),
+             nn.Linear(l2, l1),
+             nn.ReLU(inplace=True),
+             nn.Linear(l1, cls_num),
+         )
+
+     else:
+         return torch.nn.Sequential(
+             nn.Dropout(),
+             nn.Conv2d(output_size, l3, kernel_size=(1, 1), stride=(1, 1)),
+             nn.ReLU(inplace=True),
+             nn.AdaptiveAvgPool2d(output_size=(1, 1)),
+             nn.Flatten(),
+             nn.Linear(l3, l2),
+             nn.ReLU(inplace=True),
+             nn.Dropout(),
+             nn.Linear(l2, l1),
+             nn.ReLU(inplace=True),
+             nn.Linear(l1, cls_num),
+         )
+
+
+ class EvalNet:
+     model = None
+     m_type = "squeezenet"
+     input_size = 224
+     output_size = 512
+
+     def __init__(self, log_name: str, cls_num=4):
+         saved_model_path = f"{MODEL_DIR}/{log_name}/save.pt"
+         m_ver = "_".join(log_name.split("_")[:-1])
+         self.m_type, self.input_size = model_info(m_ver)
+
+         if not hasattr(models, m_ver):
+             print("Unsupported model.")
+             exit()
+
+         self.model = eval("models.%s()" % m_ver)
+         linear_output = self._set_outsize()
+         self._set_classifier(cls_num, linear_output)
+         checkpoint = torch.load(saved_model_path, map_location="cpu")
+         if torch.cuda.is_available():
+             checkpoint = torch.load(saved_model_path)
+
+         self.model.load_state_dict(checkpoint, False)
+         self.model.eval()
+
+     def _set_outsize(self, debug_mode=False):
+         for name, module in self.model.named_modules():
+             if (
+                 str(name).__contains__("classifier")
+                 or str(name).__eq__("fc")
+                 or str(name).__contains__("head")
+             ):
+                 if isinstance(module, torch.nn.Linear):
+                     self.output_size = module.in_features
+                     if debug_mode:
+                         print(
+                             f"{name}(Linear): {self.output_size} -> {module.out_features}"
+                         )
+                     return True
+
+                 if isinstance(module, torch.nn.Conv2d):
+                     self.output_size = module.in_channels
+                     if debug_mode:
+                         print(
+                             f"{name}(Conv2d): {self.output_size} -> {module.out_channels}"
+                         )
+                     return False
+
+         return False
+
+     def _set_classifier(self, cls_num, linear_output):
+         if self.m_type == "convnext":
+             del self.model.classifier[2]
+             self.model.classifier = nn.Sequential(
+                 *list(self.model.classifier)
+                 + list(Classifier(cls_num, self.output_size, linear_output))
+             )
+             return
+
+         if hasattr(self.model, "classifier"):
+             self.model.classifier = Classifier(cls_num, self.output_size, linear_output)
+             return
+
+         elif hasattr(self.model, "fc"):
+             self.model.fc = Classifier(cls_num, self.output_size, linear_output)
+             return
+
+         elif hasattr(self.model, "head"):
+             self.model.head = Classifier(cls_num, self.output_size, linear_output)
+             return
+
+         self.model.heads.head = Classifier(cls_num, self.output_size, linear_output)
+
+     def forward(self, x):
+         if torch.cuda.is_available():
+             x = x.cuda()
+             self.model = self.model.cuda()
+
+         if self.m_type == "googlenet" and self.training:
+             return self.model(x)[0]
+         else:
+             return self.model(x)
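
As a side note on model.py (unchanged by this commit apart from the reformatted MsDataset.load call), the Classifier() helper tapers the replacement head geometrically between the backbone's feature width and the class count, so each hidden layer shrinks by the same factor q. A quick standalone check of the widths it derives for EvalNet's defaults (output_size=512, cls_num=4) is sketched below; no torch is needed to reproduce the arithmetic.

# Sketch: layer widths produced by Classifier() for output_size=512, cls_num=4.
output_size, cls_num = 512, 4
q = (1.0 * output_size / cls_num) ** 0.25  # common shrink factor, about 3.36 here
l1 = int(q * cls_num)  # 13
l2 = int(q * l1)       # 43
l3 = int(q * l2)       # 144
print(f"{output_size} -> {l3} -> {l2} -> {l1} -> {cls_num}")
# 512 -> 144 -> 43 -> 13 -> 4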