pengdaqian committed
Commit dbea546
1 Parent(s): c868571
Files changed (5)
  1. app.py +12 -5
  2. torchspleeter/utils.py +2 -0
  3. utils/__init__.py +0 -0
  4. utils/utils.py +13 -0
  5. whisper/inference.py +5 -3
app.py CHANGED
@@ -1,6 +1,7 @@
 import os
 import sys
 from music.search import get_youtube, download_random
+from utils.utils import log_execution_time
 from vits.models import SynthesizerInfer
 import whisper.inference
 from omegaconf import OmegaConf
@@ -35,6 +36,7 @@ def load_svc_model(checkpoint_path, model):
     return model


+@log_execution_time
 def compute_f0_nn(filename, device):
     audio, sr = librosa.load(filename, sr=16000)
     assert sr == 16000
@@ -82,16 +84,21 @@ load_svc_model("vits_pretrain/sovits5.0-48k-debug.pth", model)
 model.eval()
 model.to(device)
 whisper_model = whisper.inference.load_model(os.path.join("whisper_pretrain", "medium.pt"))
+whisper_quant_model = torch.quantization.quantize_dynamic(
+    whisper_model, {torch.nn.Linear}, dtype=torch.qint8
+)
 splitter_model = Splitter.from_pretrained(os.path.join("torchspleeter/models/2stems", "spleeter.pth")).to(device).eval()


 # warm up
 # separator.separate_to_file('warm.wav', '/tmp/warm')

-
+@log_execution_time
 def svc_change(argswave, argsspk):
-    argsppg = "svc_tmp.ppg.npy"
-    whisper.inference.pred_ppg(whisper_model, argswave, argsppg)
+    argsppg = "svc_tmp_quant.ppg.npy"
+    # whisper.inference.pred_ppg(whisper_model, argswave, argsppg)
+    whisper.inference.pred_ppg(whisper_quant_model, argswave, argsppg)
+
     # os.system(f"python whisper/inference.py -w {argswave} -p {argsppg}")

     spk = np.load(argsspk)
@@ -173,6 +180,7 @@ def svc_change(argswave, argsspk):
     return out_audio


+@log_execution_time
 def svc_main(sid, input_audio):
     if input_audio is None:
         return "You need to upload an audio", None
@@ -218,12 +226,10 @@ def svc_main(sid, input_audio):
     soundfile.write(out_vocals_filepath, out_vocals, 48000, format="wav")
     print(f"out_vocals_filepath: {out_vocals_filepath}")

-    print("start to mix")
     sound1 = AudioSegment.from_file(out_vocals_filepath)
     sound2 = AudioSegment.from_file(accompaniment_filepath)

     played_togther = sound1.overlay(sound2)
-    print("mix done")

     result_path = os.path.join(curr_tmp_path, 'out_song.wav')
     played_togther.export(result_path, format="wav")
@@ -234,6 +240,7 @@ def svc_main(sid, input_audio):
     return "Success", (sampling_rate, result)


+@log_execution_time
 def auto_search(name):
     save_music_path = '/tmp/downloaded'
     if not os.path.exists(save_music_path):
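Note on the quantization step above: torch.quantization.quantize_dynamic swaps the model's nn.Linear modules for int8 versions whose weights are stored quantized and whose activations are quantized on the fly, and it targets CPU inference. A minimal sketch of the same call on a stand-in module, useful for eyeballing the size and latency effect before wiring it into app.py (TinyEncoder and the /tmp paths are illustrative only, not part of this repo):

import os
import time

import torch
import torch.nn as nn


class TinyEncoder(nn.Module):
    # Hypothetical stand-in for the Whisper encoder: a stack of Linear layers,
    # which is the module type targeted by quantize_dynamic below.
    def __init__(self, dim=1024, layers=4):
        super().__init__()
        self.net = nn.Sequential(*[nn.Linear(dim, dim) for _ in range(layers)])

    def forward(self, x):
        return self.net(x)


model = TinyEncoder().eval()
quant_model = torch.quantization.quantize_dynamic(
    model, {torch.nn.Linear}, dtype=torch.qint8
)

# Compare serialized sizes (dynamic quantization stores int8 weights).
torch.save(model.state_dict(), "/tmp/fp32.pth")
torch.save(quant_model.state_dict(), "/tmp/int8.pth")
print(os.path.getsize("/tmp/fp32.pth"), os.path.getsize("/tmp/int8.pth"))

# Compare CPU latency on a dummy batch; dynamic quantization only
# affects CPU inference, so run both models on CPU.
x = torch.randn(16, 1024)
for name, m in [("fp32", model), ("int8", quant_model)]:
    start = time.time()
    with torch.no_grad():
        m(x)
    print(name, time.time() - start)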
torchspleeter/utils.py CHANGED
@@ -4,9 +4,11 @@ from pathlib import Path

 import torch

+from utils.utils import log_execution_time
 from .splitter import Splitter


+@log_execution_time
 def sound_split(
     model: Splitter,
     input: str = "data/audio_example.mp3",
utils/__init__.py ADDED
File without changes
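The empty utils/__init__.py marks utils as a regular package, which is what lets the new "from utils.utils import log_execution_time" imports in this commit resolve.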
utils/utils.py ADDED
@@ -0,0 +1,13 @@
+import time
+
+
+def log_execution_time(func):
+    def wrapper(*args, **kwargs):
+        start_time = time.time()
+        result = func(*args, **kwargs)
+        end_time = time.time()
+        execution_time = end_time - start_time
+        print(f"Func {func.__name__} Cost {execution_time} s")
+        return result
+
+    return wrapper
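The decorator is applied with plain @log_execution_time, as in the other files of this commit. A small usage sketch, with an optional functools.wraps variant that preserves the wrapped function's name and docstring (slow_add is illustrative only):

import functools
import time


def log_execution_time(func):
    @functools.wraps(func)  # keep func.__name__ / __doc__ on the wrapper
    def wrapper(*args, **kwargs):
        start_time = time.time()
        result = func(*args, **kwargs)
        print(f"Func {func.__name__} Cost {time.time() - start_time} s")
        return result

    return wrapper


@log_execution_time
def slow_add(a, b):
    time.sleep(0.1)
    return a + b


print(slow_add(1, 2))      # prints the timing line, then 3
print(slow_add.__name__)   # 'slow_add', thanks to functools.wraps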
whisper/inference.py CHANGED
@@ -3,6 +3,7 @@ import numpy as np
 import argparse
 import torch

+from utils.utils import log_execution_time
 from whisper.model import Whisper, ModelDimensions
 from whisper.audio import load_audio, pad_or_trim, log_mel_spectrogram

@@ -16,6 +17,7 @@ def load_model(path) -> Whisper:
     return model.to(device)


+@log_execution_time
 def pred_ppg(whisper: Whisper, wavPath, ppgPath):
     audio = load_audio(wavPath)
     audln = audio.shape[0]
@@ -29,7 +31,7 @@ def pred_ppg(whisper: Whisper, wavPath, ppgPath):
         mel = log_mel_spectrogram(short).to(whisper.device)
         with torch.no_grad():
             ppg = whisper.encoder(mel.unsqueeze(0)).squeeze().data.cpu().float().numpy()
-            ppg = ppg[:ppgln,]  # [length, dim=1024]
+            ppg = ppg[:ppgln, ]  # [length, dim=1024]
         ppg_a.extend(ppg)
     if idx_s < audln:
         short = audio[idx_s:audln]
@@ -38,7 +40,7 @@ def pred_ppg(whisper: Whisper, wavPath, ppgPath):
         mel = log_mel_spectrogram(short).to(whisper.device)
         with torch.no_grad():
             ppg = whisper.encoder(mel.unsqueeze(0)).squeeze().data.cpu().float().numpy()
-            ppg = ppg[:ppgln,]  # [length, dim=1024]
+            ppg = ppg[:ppgln, ]  # [length, dim=1024]
         ppg_a.extend(ppg)
     np.save(ppgPath, ppg_a, allow_pickle=False)

@@ -48,7 +50,7 @@ if __name__ == "__main__":
     parser.description = 'please enter embed parameter ...'
     parser.add_argument("-w", "--wav", help="wav", dest="wav")
     parser.add_argument("-p", "--ppg", help="ppg", dest="ppg")
-
+
     args = parser.parse_args()
     print(args.wav)
     print(args.ppg)
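Since pred_ppg is now decorated with @log_execution_time, each call prints its own runtime, which makes it easy to compare the full-precision and quantized encoders on the same clip. A rough sketch of such a check, mirroring how app.py builds the quantized copy (the wav path and output names are placeholders):

import numpy as np
import torch

import whisper.inference

# Load the full-precision model, then derive an int8-quantized copy
# the same way app.py does.
whisper_model = whisper.inference.load_model("whisper_pretrain/medium.pt")
whisper_quant_model = torch.quantization.quantize_dynamic(
    whisper_model, {torch.nn.Linear}, dtype=torch.qint8
)

# pred_ppg prints "Func pred_ppg Cost ... s" on each call, so running
# both models on the same clip gives a direct timing comparison.
whisper.inference.pred_ppg(whisper_model, "some_input.wav", "fp32.ppg.npy")
whisper.inference.pred_ppg(whisper_quant_model, "some_input.wav", "int8.ppg.npy")

# Both outputs should have shape [length, 1024] per the comment in pred_ppg.
print(np.load("fp32.ppg.npy").shape, np.load("int8.ppg.npy").shape)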