Commit 33085cc · minor fixes
Parent: 5856dbb

Files changed:
- app.py (+19, -7)
- separate.py (+27, -21)
app.py CHANGED
@@ -107,11 +107,11 @@ def process(model_name, in_filename: str):
     logging.info(f"model_name: {model_name}")
     logging.info(f"in_filename: {in_filename}")
 
-
-
-
+    samples, sample_rate = load_audio(in_filename)
+    samples = np.transpose(samples)
+    samples = np.ascontiguousarray(samples)
 
-    duration =
+    duration = samples.shape[1] / sample_rate  # in seconds
 
     sp = load_model(model_name)
 
@@ -121,7 +121,7 @@ def process(model_name, in_filename: str):
 
     start = time.time()
 
-    output = sp.process(sample_rate=
+    output = sp.process(sample_rate=sample_rate, samples=samples)
 
     date_time = now.strftime("%Y-%m-%d %H:%M:%S.%f")
     end = time.time()
@@ -154,6 +154,17 @@ def process(model_name, in_filename: str):
 
 
 title = "# Source separation with Next-gen Kaldi"
+description = """
+This space shows how to do source separation with Next-gen Kaldi.
+
+It is running on CPU within a docker container provided by Hugging Face.
+
+See more information by visiting the following links:
+
+- <https://github.com/k2-fsa/sherpa-onnx>
+
+Everything is open-sourced.
+"""
 
 # css style is copied from
 # https://huggingface.co/spaces/alphacep/asr/blob/main/app.py#L113
@@ -172,9 +183,9 @@ with demo:
     gr.Markdown(title)
 
     model_dropdown = gr.Dropdown(
-        choices=model_list[
+        choices=model_list[0],
         label="Select a model",
-        value=model_list[
+        value=model_list[0],
     )
 
     with gr.Tabs():
@@ -259,6 +270,7 @@ with demo:
         inputs=[model_dropdown, url_textbox],
        outputs=[url_vocals, url_non_vocals, url_html_info],
     )
+    gr.Markdown(description)
 
 if __name__ == "__main__":
     formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
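As a sanity check on the audio handling added above, here is a minimal, self-contained sketch (not part of the commit) of the duration arithmetic, assuming the (num_channels, num_samples) layout that the new load_audio in separate.py returns:

# Minimal sketch (not from the commit): (num_channels, num_samples) layout
# and the duration arithmetic used in process() above.
import numpy as np

sample_rate = 44100
# 2 channels, 5 seconds of silence, float32 as load_audio asserts
samples = np.zeros((2, 5 * sample_rate), dtype=np.float32)

# np.ascontiguousarray guarantees C-contiguous memory, which native
# (ONNX/C++) backends typically require after a transpose
samples = np.ascontiguousarray(samples)

duration = samples.shape[1] / sample_rate  # in seconds
print(duration)  # 5.0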
separate.py CHANGED
@@ -1,39 +1,45 @@
 #!/usr/bin/env python3
 # Copyright 2023 Xiaomi Corp. (authors: Fangjun Kuang)
 
+import logging
+import os
 from functools import lru_cache
 
-import ffmpeg
 import numpy as np
-from huggingface_hub import hf_hub_download
 import sherpa_onnx
+import soundfile as sf
+from huggingface_hub import hf_hub_download
+import uuid
 
 
-
+def convert_to_wav(in_filename: str) -> str:
+    """Convert the input audio file to a wave file"""
+    out_filename = str(uuid.uuid4())
+    out_filename = f"{in_filename}.wav"
 
+    logging.info(f"Converting '{in_filename}' to '{out_filename}'")
+    _ = os.system(
+        f"ffmpeg -hide_banner -loglevel error -i '{in_filename}' -ar 44100 -ac 2 '{out_filename}' -y"
+    )
 
-
-    probe = ffmpeg.probe(filename)
-    if "streams" not in probe or len(probe["streams"]) == 0:
-        raise ValueError("No stream was found with ffprobe")
+    return out_filename
 
-    metadata = next(
-        stream for stream in probe["streams"] if stream["codec_type"] == "audio"
-    )
-    n_channels = metadata["channels"]
 
-
-
-
-
-    )
-
-
+def load_audio(filename):
+    filename = convert_to_wav(filename)
+
+    samples, sample_rate = sf.read(filename, dtype="float32", always_2d=True)
+    samples = np.transpose(samples)
+    # now samples is of shape (num_channels, num_samples)
+    assert (
+        samples.shape[1] > samples.shape[0]
+    ), f"You should use (num_channels, num_samples). {samples.shape}"
 
-
-
+    assert (
+        samples.dtype == np.float32
+    ), f"Expect np.float32 as dtype. Given: {samples.dtype}"
 
-    return
+    return samples, sample_rate
 
 
 @lru_cache(maxsize=10)
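The transpose and the first assertion in the new load_audio exist because sf.read(..., always_2d=True) returns audio as (num_samples, num_channels), while the rest of the pipeline wants (num_channels, num_samples). A small self-contained check, not part of the commit (the file name probe.wav is arbitrary):

# Layout soundfile returns versus the layout load_audio's assertions expect.
import numpy as np
import soundfile as sf

sr = 44100
stereo = np.zeros((sr, 2), dtype=np.float32)  # 1 second, (num_samples, num_channels)
sf.write("probe.wav", stereo, sr)

samples, sample_rate = sf.read("probe.wav", dtype="float32", always_2d=True)
print(samples.shape)  # (44100, 2): samples-major, as soundfile returns it

samples = np.transpose(samples)
print(samples.shape)  # (2, 44100): channels-major, what the asserts require
assert samples.shape[1] > samples.shape[0]
assert samples.dtype == np.float32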
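The @lru_cache(maxsize=10) decorator at the tail of the hunk memoizes the model loader that app.py calls as load_model (its body falls outside this diff), so re-selecting an already-used model in the dropdown reuses the loaded instance instead of reading it from disk again. A minimal sketch of the effect with a stand-in loader:

# Minimal sketch of the caching effect; the real load_model builds a
# sherpa_onnx model, this stand-in only shows the memoization.
from functools import lru_cache

@lru_cache(maxsize=10)  # keep up to 10 distinct models alive
def load_model(model_name: str):
    print(f"loading {model_name} ...")  # runs only on a cache miss
    return object()

a = load_model("demo-model")  # prints: loading demo-model ...
b = load_model("demo-model")  # cache hit: no print, same object back
assert a is b
print(load_model.cache_info())  # CacheInfo(hits=1, misses=1, maxsize=10, currsize=1)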