feat: initial commit
- .gitattributes +1 -0
- .python-version +1 -0
- README.md +4 -3
- __init__.py +0 -0
- app.py +181 -0
- data/data/raw/sample_0.wav +3 -0
- data/data/raw/sample_1.wav +3 -0
- data/data/raw/sample_2.wav +3 -0
- data/model/config.yaml +8 -0
- data/model/weights/best.pt +3 -0
- requirements.txt +5 -0
- utils.py +335 -0
.gitattributes
CHANGED
@@ -29,6 +29,7 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.tflite filter=lfs diff=lfs merge=lfs -text
 *.tgz filter=lfs diff=lfs merge=lfs -text
 *.wasm filter=lfs diff=lfs merge=lfs -text
+*.wav filter=lfs diff=lfs merge=lfs -text
 *.xz filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
.python-version
ADDED
@@ -0,0 +1 @@
3.10.12
README.md
CHANGED
@@ -1,13 +1,14 @@
 ---
 title: Forest Elephant Rumbles Detection
-emoji:
-
+emoji: π
+python_version: 3.10.12
+colorFrom: yellow
 colorTo: purple
 sdk: gradio
 sdk_version: 5.4.0
 app_file: app.py
 pinned: false
-short_description:
+short_description: Detection and analysis of elephants communication
 ---

 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
__init__.py
ADDED
File without changes
app.py
ADDED
@@ -0,0 +1,181 @@
"""
Gradio app to showcase the elephant rumbles detector.
"""

from pathlib import Path
from typing import Tuple

import gradio as gr
import pandas as pd
from PIL import Image
from ultralytics import YOLO

from utils import (
    bgr_to_rgb,
    chunk,
    get_concat_v,
    inference,
    load_audio,
    to_dataframe,
    waveform_to_np_image,
    yaml_read,
)


def prediction_to_str(df: pd.DataFrame) -> str:
    """
    Turn the YOLO prediction dataframe into a human-friendly string.
    """
    n = len(df)
    return f"{n} elephant rumbles detected in the audio sequence."


def interface_fn(
    model: YOLO,
    audio_filepath: str,
    config_model: dict[str, float | int],
) -> Tuple[Image.Image, pd.DataFrame, str]:
    """
    Main interface function that runs the model on the provided audio_filepath and
    returns the expected tuple to populate the gradio interface.

    Args:
        model (YOLO): loaded ultralytics YOLO model.
        audio_filepath (str): audio to run inference on.
        config_model (dict[str, float | int]): config of the model.

    Returns:
        pil_image_spectrogram_with_prediction (Image.Image): spectrogram with overlaid predictions.
        df (pd.DataFrame): results postprocessed as a pd.DataFrame.
        prediction_str (str): human-readable summary of the prediction.
    """
    overlap = 10.0

    waveform, sample_rate = load_audio(Path(audio_filepath))
    waveforms = chunk(
        waveform=waveform,
        sample_rate=sample_rate,
        duration=config_model["duration"],
        overlap=overlap,
    )

    yolov8_predictions = inference(
        model=model,
        audio_filepath=Path(audio_filepath),
        duration=config_model["duration"],
        overlap=overlap,
        width=config_model["width"],
        height=config_model["height"],
        freq_max=config_model["freq_max"],
        n_fft=config_model["n_fft"],
        hop_length=config_model["hop_length"],
        batch_size=16,
        output_dir=Path("."),
        save_spectrograms=False,
        save_predictions=False,
        verbose=True,
    )
    df = to_dataframe(
        yolov8_predictions=yolov8_predictions,
        duration=config_model["duration"],
        overlap=overlap,
        freq_min=config_model["freq_min"],
        freq_max=config_model["freq_max"],
    )

    spectrograms_array_images = [
        waveform_to_np_image(
            waveform=waveform,
            sample_rate=sample_rate,
            n_fft=config_model["n_fft"],
            hop_length=config_model["hop_length"],
            freq_max=config_model["freq_max"],
            width=config_model["width"],
            height=config_model["height"],
        )
        for waveform in waveforms
    ]

    spectrograms_pil_images = [Image.fromarray(a) for a in spectrograms_array_images]

    array_image = waveform_to_np_image(
        waveform=waveforms[0],
        sample_rate=sample_rate,
        n_fft=config_model["n_fft"],
        hop_length=config_model["hop_length"],
        freq_max=config_model["freq_max"],
        width=config_model["width"],
        height=config_model["height"],
    )

    predictions = model.predict(spectrograms_pil_images)
    pil_image_spectrogram_with_prediction = Image.fromarray(
        bgr_to_rgb(predictions[0].plot())
    )

    for i in range(1, len(predictions)):
        pil_image_spectrogram_with_prediction = get_concat_v(
            pil_image_spectrogram_with_prediction,
            Image.fromarray(bgr_to_rgb(predictions[i].plot())),
        )

    return (pil_image_spectrogram_with_prediction, df, prediction_to_str(df=df))


def examples(dir_examples: Path) -> list[Path]:
    """
    List the sound filepaths from the dir_examples directory.

    Returns:
        filepaths (list[Path]): list of sound filepaths.
    """
    return list(dir_examples.glob("*.wav"))


def load_model(filepath_weights: Path) -> YOLO:
    """
    Load the YOLO model given the filepath_weights.
    """
    return YOLO(filepath_weights)


MODEL_FILEPATH_WEIGHTS = Path("data/model/weights/best.pt")
MODEL_FILEPATH_CONFIG = Path("data/model/config.yaml")
# Directory containing the sample .wav files committed in this repository.
DIR_EXAMPLES = Path("data/data/raw")
DEFAULT_VALUE_INDEX = 0

with gr.Blocks() as demo:
    model = load_model(MODEL_FILEPATH_WEIGHTS)
    sound_filepaths = examples(dir_examples=DIR_EXAMPLES)
    config_model = yaml_read(MODEL_FILEPATH_CONFIG)
    print(config_model)
    default_value_input = sound_filepaths[DEFAULT_VALUE_INDEX]
    input = gr.Audio(
        value=default_value_input,
        sources=["upload"],
        type="filepath",
        label="input audio",
    )
    output_image = gr.Image(type="pil", label="model prediction")
    output_raw = gr.Text(label="raw prediction")
    output_dataframe = gr.DataFrame(
        headers=["t_start", "t_end", "freq_start", "freq_end", "probability"],
        label="prediction as CSV",
    )

    fn = lambda audio_filepath: interface_fn(
        model=model,
        audio_filepath=audio_filepath,
        config_model=config_model,
    )
    gr.Interface(
        title="ML model for forest elephant rumble detection π",
        fn=fn,
        inputs=input,
        outputs=[output_image, output_dataframe, output_raw],
        examples=sound_filepaths,
        flagging_mode="never",
    )

demo.launch()
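The Gradio UI is only a thin wrapper around the helpers in utils.py, so the same prediction path can be exercised headlessly. The following is a minimal sketch, not part of the commit, that reuses the committed weights, config, and one of the committed sample files; the hard-coded 10.0 s overlap mirrors interface_fn above.

from pathlib import Path

from ultralytics import YOLO

from utils import inference, to_dataframe, yaml_read

config = yaml_read(Path("data/model/config.yaml"))
model = YOLO("data/model/weights/best.pt")

# Run the chunk -> spectrogram -> YOLO pipeline on one committed sample file.
predictions = inference(
    model=model,
    audio_filepath=Path("data/data/raw/sample_0.wav"),
    duration=config["duration"],
    overlap=10.0,
    width=config["width"],
    height=config["height"],
    freq_max=config["freq_max"],
    n_fft=config["n_fft"],
    hop_length=config["hop_length"],
    batch_size=16,
    output_dir=Path("."),
    save_spectrograms=False,
    save_predictions=False,
    verbose=False,
)

# Same postprocessing as the app: boxes expressed in seconds and Hz.
df = to_dataframe(
    yolov8_predictions=predictions,
    duration=config["duration"],
    overlap=10.0,
    freq_min=config["freq_min"],
    freq_max=config["freq_max"],
)
print(f"{len(df)} rumbles detected")

The resulting dataframe carries the same t_start / t_end / freq_start / freq_end / probability columns that the app displays.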
data/data/raw/sample_0.wav
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:0179dbd11e36dba96bf1f55a542697ea382330e701953af3a5d2116f41f38da0
size 4800590
data/data/raw/sample_1.wav
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:d8add2dde6bb272816be81bbd5555c84dfbb917cffc26714d74f1d08e7b730f6
size 4800590
data/data/raw/sample_2.wav
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:eaedb8aa45f4b1b95073bfa249bb3dac925f46a87694c618bd4ed0a59cee7a3c
size 4800590
data/model/config.yaml
ADDED
@@ -0,0 +1,8 @@
---
duration: 164.0
freq_min: 0.0
freq_max: 250.0
n_fft: 4096
hop_length: 1024
width: 640
height: 256
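These values are read by yaml_read and drive both the chunking of the recording and the geometry of the spectrogram images fed to YOLO. A small sketch of how they are consumed (the 10.0 s overlap is the constant hard-coded in interface_fn; everything else is read straight from this file):

from pathlib import Path

from utils import yaml_read

config = yaml_read(Path("data/model/config.yaml"))
# One spectrogram covers `duration` seconds of audio, rendered as a width x height
# image band-limited to freq_max Hz; consecutive spectrograms share 10.0 s of audio.
step = config["duration"] - 10.0
print(
    f"new spectrogram every {step:.0f}s, "
    f"{config['width']}x{config['height']} px, 0-{config['freq_max']:.0f} Hz"
)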
data/model/weights/best.pt
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:8aa9884841054eeef0cb3a0a7c09eb34b51c971aaacf52b592cd024ea212b961
size 6218137
requirements.txt
ADDED
@@ -0,0 +1,5 @@
gradio==5.4.*
torch==2.5.*
torchaudio==2.5.*
torchvision==0.20.*
ultralytics==8.3.*
utils.py
ADDED
@@ -0,0 +1,335 @@
import logging
import math
import time
from pathlib import Path
from typing import Tuple

import cv2
import numpy as np
import pandas as pd
import torch
import torchaudio
import torchaudio.transforms as T
import yaml
from PIL import Image
from tqdm import tqdm
from ultralytics import YOLO


def yaml_read(path: Path) -> dict:
    """Returns yaml content as a python dict."""
    with open(path, "r") as f:
        return yaml.safe_load(f)


def clip(
    waveform: torch.Tensor,
    offset: float,
    duration: float,
    sample_rate: int,
) -> torch.Tensor:
    """
    Returns a clipped waveform of `duration` seconds starting at `offset` seconds.
    """
    offset_frames_start = int(offset * sample_rate)
    offset_frames_end = offset_frames_start + int(duration * sample_rate)
    return waveform[:, offset_frames_start:offset_frames_end]


def chunk(
    waveform: torch.Tensor,
    sample_rate: int,
    duration: float,
    overlap: float,
) -> list[torch.Tensor]:
    """
    Returns a list of waveforms as torch.Tensor. Each of these waveforms has the
    specified duration and the specified overlap in seconds.
    """
    total_seconds = waveform.shape[1] / sample_rate
    number_spectrograms = total_seconds / (duration - overlap)
    offsets = [
        idx * (duration - overlap) for idx in range(0, math.floor(number_spectrograms))
    ]
    return [
        clip(
            waveform=waveform,
            offset=offset,
            duration=duration,
            sample_rate=sample_rate,
        )
        for offset in offsets
    ]


def load_audio(audio_filepath: Path) -> Tuple[torch.Tensor, int]:
    """
    Loads an audio_filepath and returns the waveform and sample_rate of the file.
    """
    start_time = time.time()
    waveform, sample_rate = torchaudio.load(audio_filepath)
    end_time = time.time()
    elapsed_time = end_time - start_time
    logging.info(
        f"Elapsed time to load audio file {audio_filepath.name}: {elapsed_time:.2f}s"
    )
    return waveform, sample_rate


def waveform_to_spectrogram(
    waveform: torch.Tensor,
    sample_rate: int,
    n_fft: int,
    hop_length: int,
    freq_max: float,
) -> torch.Tensor:
    """
    Returns a spectrogram as a torch.Tensor given the provided arguments.
    See torchaudio.transforms.Spectrogram for more details about the parameters.

    Args:
        waveform (torch.Tensor): audio waveform of dimension `(..., time)`
        sample_rate (int): sampling rate of the waveform, e.g. 44100 (Hz)
        n_fft (int): size of FFT
        hop_length (int): length of hop between STFT windows
        freq_max (float): cutoff frequency (Hz)
    """
    filtered_waveform = torchaudio.functional.lowpass_biquad(
        waveform=waveform, sample_rate=sample_rate, cutoff_freq=freq_max
    )
    transform = T.Spectrogram(n_fft=n_fft, hop_length=hop_length, power=2)
    spectrogram = transform(filtered_waveform)
    spectrogram_db = torchaudio.transforms.AmplitudeToDB()(spectrogram)
    frequencies = torch.linspace(0, sample_rate // 2, spectrogram_db.size(1))
    max_freq_bin = torch.searchsorted(frequencies, freq_max).item()
    filtered_spectrogram_db = spectrogram_db[:, :max_freq_bin, :]
    return filtered_spectrogram_db


def normalize(x: np.ndarray, max_value: int = 255) -> np.ndarray:
    """
    Returns the normalized array with values in [0, max_value].
    Useful for image conversion.
    """
    _min, _max = x.min(), x.max()
    x_normalized = max_value * (x - _min) / (_max - _min)
    return x_normalized.astype(np.uint8)


def spectrogram_tensor_to_np_image(
    spectrogram: torch.Tensor, width: int, height: int
) -> np.ndarray:
    """
    Returns a numpy array of shape (height, width) that represents the spectrogram tensor as an image.
    """
    spectrogram_db_np = spectrogram[0].numpy()
    # Normalize to [0, 255] for image conversion
    spectrogram_db_normalized = normalize(spectrogram_db_np, max_value=255)
    resized_spectrogram_array = cv2.resize(
        spectrogram_db_normalized, (width, height), interpolation=cv2.INTER_LINEAR
    )
    # Vertical flip so the low frequencies end up at the bottom of the image instead of the top
    flipped_resized_spectrogram_array = np.flipud(resized_spectrogram_array)
    return flipped_resized_spectrogram_array


def waveform_to_np_image(
    waveform: torch.Tensor,
    sample_rate: int,
    n_fft: int,
    hop_length: int,
    freq_max: float,
    width: int,
    height: int,
) -> np.ndarray:
    """
    Returns a numpy image of shape (height, width) that represents the waveform tensor as an image of its spectrogram.

    Args:
        waveform (torch.Tensor): audio waveform of dimension `(..., time)`
        sample_rate (int): sampling rate of the waveform, e.g. 44100 (Hz)
        n_fft (int): size of FFT
        hop_length (int): length of hop between STFT windows
        freq_max (float): cutoff frequency (Hz)
        width (int): width of the generated image
        height (int): height of the generated image
    """
    spectrogram = waveform_to_spectrogram(
        waveform=waveform,
        sample_rate=sample_rate,
        n_fft=n_fft,
        hop_length=hop_length,
        freq_max=freq_max,
    )
    return spectrogram_tensor_to_np_image(
        spectrogram=spectrogram,
        width=width,
        height=height,
    )


def batch_sequence(xs: list, batch_size: int):
    """
    Yields successive batch_size-sized batches from xs.
    """
    for i in range(0, len(xs), batch_size):
        yield xs[i : i + batch_size]


def inference(
    model: YOLO,
    audio_filepath: Path,
    duration: float,
    overlap: float,
    width: int,
    height: int,
    freq_max: float,
    n_fft: int,
    hop_length: int,
    batch_size: int,
    output_dir: Path,
    save_spectrograms: bool,
    save_predictions: bool,
    verbose: bool,
) -> list:
    """
    Inference entry point for running on an entire audio_filepath sound file.
    """
    logging.info(f"Loading audio filepath {audio_filepath}")
    waveform, sample_rate = load_audio(audio_filepath)
    waveforms = chunk(
        waveform=waveform,
        sample_rate=sample_rate,
        duration=duration,
        overlap=overlap,
    )
    logging.info(f"Chunking the waveform into {len(waveforms)} overlapping clips")
    logging.info(f"Generating {len(waveforms)} spectrograms")
    images = [
        Image.fromarray(
            waveform_to_np_image(
                waveform=y,
                sample_rate=sample_rate,
                n_fft=n_fft,
                hop_length=hop_length,
                freq_max=freq_max,
                width=width,
                height=height,
            )
        )
        for y in tqdm(waveforms)
    ]
    if save_spectrograms:
        save_dir = output_dir / "spectrograms"
        logging.info(f"Saving spectrograms in {save_dir}")
        save_dir.mkdir(exist_ok=True, parents=True)
        for i, image in tqdm(enumerate(images), total=len(images)):
            image.save(save_dir / f"spectrogram_{i}.png")

    results = []

    batches = list(batch_sequence(images, batch_size=batch_size))
    logging.info(f"Running inference on the spectrograms, {len(batches)} batches")
    for batch in tqdm(batches):
        results.extend(model.predict(batch, verbose=verbose))

    if save_predictions:
        save_dir = output_dir / "predictions"
        save_dir.mkdir(parents=True, exist_ok=True)
        logging.info(f"Saving predictions in {save_dir}")
        for i, yolov8_prediction in tqdm(enumerate(results), total=len(results)):
            yolov8_prediction.save(str(save_dir / f"prediction_{i}.png"))

    return results


def index_to_relative_offset(idx: int, duration: float, overlap: float) -> float:
    """
    Returns the relative offset in seconds based on the provided spectrogram index, the duration and the overlap.
    """
    return idx * (duration - overlap)


def from_yolov8_prediction(
    yolov8_prediction,
    idx: int,
    duration: float,
    overlap: float,
    freq_min: float,
    freq_max: float,
) -> list[dict]:
    """
    Turns a single yolov8 prediction into a list of dicts, one per detected box,
    with times in seconds and frequencies in Hz.
    """
    results = []
    for k, box_xyxyn in enumerate(yolov8_prediction.boxes.xyxyn):
        conf = yolov8_prediction.boxes.conf[k].item()
        x1, y1, x2, y2 = box_xyxyn.numpy()
        xmin = min(x1, x2)
        xmax = max(x1, x2)
        ymin = min(y1, y2)
        ymax = max(y1, y2)
        freq_start = ymin * (freq_max - freq_min)
        freq_end = ymax * (freq_max - freq_min)
        t_start = xmin * duration + index_to_relative_offset(
            idx=idx, duration=duration, overlap=overlap
        )
        t_end = xmax * duration + index_to_relative_offset(
            idx=idx, duration=duration, overlap=overlap
        )
        data = {
            "probability": conf,
            "freq_start": freq_start,
            "freq_end": freq_end,
            "t_start": t_start,
            "t_end": t_end,
        }
        results.append(data)
    return results


def to_dataframe(
    yolov8_predictions,
    duration: float,
    overlap: float,
    freq_min: float,
    freq_max: float,
) -> pd.DataFrame:
    """
    Turns the yolov8 predictions into a pandas dataframe, taking into account the relative offset of each prediction.
    The dataframe contains the following columns:
        probability (float): value in [0, 1] that represents the probability that this is an actual rumble
        freq_start (float): Hz - where the box starts on the frequency axis
        freq_end (float): Hz - where the box ends on the frequency axis
        t_start (float): seconds - where the box starts on the time axis
        t_end (float): seconds - where the box ends on the time axis
    """
    results = []
    for idx, yolov8_prediction in enumerate(yolov8_predictions):
        results.extend(
            from_yolov8_prediction(
                yolov8_prediction,
                idx=idx,
                duration=duration,
                overlap=overlap,
                freq_min=freq_min,
                freq_max=freq_max,
            )
        )
    return pd.DataFrame(results)


def bgr_to_rgb(a: np.ndarray) -> np.ndarray:
    """
    Turn a BGR numpy array into an RGB numpy array when the array `a` represents an image.
    """
    return a[:, :, ::-1]


def get_concat_v(im1: Image.Image, im2: Image.Image) -> Image.Image:
    """
    Concatenate two PIL images vertically.
    """
    dst = Image.new("RGB", (im1.width, im1.height + im2.height))
    dst.paste(im1, (0, 0))
    dst.paste(im2, (0, im1.height))
    return dst
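To make the chunk / index_to_relative_offset bookkeeping concrete, here is a small illustrative sketch against the functions above; the 10-second synthetic waveform and the 4 s / 1 s numbers are invented for the example (the app itself uses duration=164.0 from config.yaml and overlap=10.0):

import torch

from utils import chunk, index_to_relative_offset

sample_rate = 1000  # Hz, synthetic
waveform = torch.zeros(1, 10 * sample_rate)  # 10 s of silence, shape (1, 10000)

clips = chunk(waveform=waveform, sample_rate=sample_rate, duration=4.0, overlap=1.0)
print(len(clips), clips[0].shape)  # 3 clips, each of shape (1, 4000)

# Clip k starts k * (duration - overlap) seconds into the recording; to_dataframe
# uses this offset to shift per-clip box times back onto the absolute time axis.
print([index_to_relative_offset(idx=k, duration=4.0, overlap=1.0) for k in range(len(clips))])
# [0.0, 3.0, 6.0]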