achouffe committed
Commit 4aa9c9f · verified · 1 Parent(s): eafa076

feat: initial commit
.gitattributes CHANGED
@@ -29,6 +29,7 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.tflite filter=lfs diff=lfs merge=lfs -text
  *.tgz filter=lfs diff=lfs merge=lfs -text
  *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.wav filter=lfs diff=lfs merge=lfs -text
  *.xz filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
.python-version ADDED
@@ -0,0 +1 @@
+ 3.10.12
README.md CHANGED
@@ -1,13 +1,14 @@
  ---
  title: Forest Elephant Rumbles Detection
- emoji: πŸƒ
- colorFrom: blue
+ emoji: 🐘
+ python_version: 3.10.12
+ colorFrom: yellow
  colorTo: purple
  sdk: gradio
  sdk_version: 5.4.0
  app_file: app.py
  pinned: false
- short_description: Detect forest elephant rumbles
+ short_description: Detection and analysis of elephant communication
  ---

  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
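
To try the Space locally under the same pinned versions, a minimal sketch (an assumption on my part, not part of the commit: it presumes `pip install -r requirements.txt` has been run on Python 3.10.12 and that git-lfs has pulled the `.wav` examples and `best.pt` weights):

```python
# Local smoke test for the Space entry point (sketch, not part of the repo).
from pathlib import Path

assert Path("data/model/weights/best.pt").exists(), "run `git lfs pull` first"

# Importing app.py runs its module-level code: it loads the YOLO weights,
# builds the gr.Blocks demo and calls demo.launch().
import app  # noqa: F401
```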
__init__.py ADDED
File without changes
app.py ADDED
@@ -0,0 +1,181 @@
+ """
+ Gradio app to showcase the elephant rumbles detector.
+ """
+
+ from pathlib import Path
+ from typing import Tuple
+
+ import gradio as gr
+ import pandas as pd
+ from PIL import Image
+ from ultralytics import YOLO
+
+ from utils import (
+     bgr_to_rgb,
+     chunk,
+     get_concat_v,
+     inference,
+     load_audio,
+     to_dataframe,
+     waveform_to_np_image,
+     yaml_read,
+ )
+
+
+ def prediction_to_str(df: pd.DataFrame) -> str:
+     """
+     Turn the yolo prediction into a human friendly string.
+     """
+     n = len(df)
+     return f"""{n} elephant rumbles detected in the audio sequence."""
+
+
+ def interface_fn(
+     model: YOLO,
+     audio_filepath: str,
+     config_model: dict[str, float | int],
+ ) -> Tuple[Image.Image, pd.DataFrame, str]:
+     """
+     Main interface function that runs the model on the provided audio_filepath and
+     returns the expected tuple to populate the gradio interface.
+
+     Args:
+         model (YOLO): Loaded ultralytics YOLO model.
+         audio_filepath (str): audio to run inference on.
+         config_model (dict[str, float | int]): config of the model.
+
+     Returns:
+         pil_image_spectrogram_with_prediction (PIL): spectrogram with overlaid
+           predictions
+         df (pd.DataFrame): results postprocessed as a pd.DataFrame
+         prediction_str (str): human readable summary of the predictions.
+     """
+     overlap = 10.0
+
+     waveform, sample_rate = load_audio(Path(audio_filepath))
+     waveforms = chunk(
+         waveform=waveform,
+         sample_rate=sample_rate,
+         duration=config_model["duration"],
+         overlap=overlap,
+     )
+
+     yolov8_predictions = inference(
+         model=model,
+         audio_filepath=Path(audio_filepath),
+         duration=config_model["duration"],
+         overlap=overlap,
+         width=config_model["width"],
+         height=config_model["height"],
+         freq_max=config_model["freq_max"],
+         n_fft=config_model["n_fft"],
+         hop_length=config_model["hop_length"],
+         batch_size=16,
+         output_dir=Path("."),
+         save_spectrograms=False,
+         save_predictions=False,
+         verbose=True,
+     )
+     df = to_dataframe(
+         yolov8_predictions=yolov8_predictions,
+         duration=config_model["duration"],
+         overlap=overlap,
+         freq_min=config_model["freq_min"],
+         freq_max=config_model["freq_max"],
+     )
+
+     spectrograms_array_images = [
+         waveform_to_np_image(
+             waveform=waveform,
+             sample_rate=sample_rate,
+             n_fft=config_model["n_fft"],
+             hop_length=config_model["hop_length"],
+             freq_max=config_model["freq_max"],
+             width=config_model["width"],
+             height=config_model["height"],
+         )
+         for waveform in waveforms
+     ]
+
+     spectrograms_pil_images = [Image.fromarray(a) for a in spectrograms_array_images]
+
+     array_image = waveform_to_np_image(
+         waveform=waveforms[0],
+         sample_rate=sample_rate,
+         n_fft=config_model["n_fft"],
+         hop_length=config_model["hop_length"],
+         freq_max=config_model["freq_max"],
+         width=config_model["width"],
+         height=config_model["height"],
+     )
+
+     predictions = model.predict(spectrograms_pil_images)
+     pil_image_spectrogram_with_prediction = Image.fromarray(
+         bgr_to_rgb(predictions[0].plot())
+     )
+
+     for i in range(1, len(predictions)):
+         pil_image_spectrogram_with_prediction = get_concat_v(
+             pil_image_spectrogram_with_prediction,
+             Image.fromarray(bgr_to_rgb(predictions[i].plot())),
+         )
+
+     return (pil_image_spectrogram_with_prediction, df, prediction_to_str(df=df))
+
+
+ def examples(dir_examples: Path) -> list[Path]:
+     """
+     List the sound filepaths from the dir_examples directory.
+
+     Returns:
+         filepaths (list[Path]): list of sound filepaths.
+     """
+     return list(dir_examples.glob("*.wav"))
+
+
+ def load_model(filepath_weights: Path) -> YOLO:
+     """
+     Load the YOLO model given the filepath_weights.
+     """
+     return YOLO(filepath_weights)
+
+
+ MODEL_FILEPATH_WEIGHTS = Path("data/model/weights/best.pt")
+ MODEL_FILEPATH_CONFIG = Path("data/model/config.yaml")
+ DIR_EXAMPLES = Path("data/sounds/raw")
+ DEFAULT_VALUE_INDEX = 0
+
+ with gr.Blocks() as demo:
+     model = load_model(MODEL_FILEPATH_WEIGHTS)
+     sound_filepaths = examples(dir_examples=DIR_EXAMPLES)
+     config_model = yaml_read(MODEL_FILEPATH_CONFIG)
+     print(config_model)
+     default_value_input = sound_filepaths[DEFAULT_VALUE_INDEX]
+     input = gr.Audio(
+         value=default_value_input,
+         sources=["upload"],
+         type="filepath",
+         label="input audio",
+     )
+     output_image = gr.Image(type="pil", label="model prediction")
+     output_raw = gr.Text(label="raw prediction")
+     output_dataframe = gr.DataFrame(
+         headers=["t_start", "t_end", "freq_start", "freq_end", "probability"],
+         label="prediction as CSV",
+     )
+
+     fn = lambda audio_filepath: interface_fn(
+         model=model,
+         audio_filepath=audio_filepath,
+         config_model=config_model,
+     )
+     gr.Interface(
+         title="ML model for forest elephant rumble detection 🐘",
+         fn=fn,
+         inputs=input,
+         outputs=[output_image, output_dataframe, output_raw],
+         examples=sound_filepaths,
+         flagging_mode="never",
+     )
+
+ demo.launch()
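
The same utilities can be driven without the Gradio UI. Below is a headless sketch of the inference path used by `interface_fn` above; the 10 s overlap and batch size mirror app.py, while the input path is an assumption (one of the bundled LFS samples, any `.wav` readable by torchaudio should work):

```python
# Headless batch inference sketch (assumes the repository layout of this commit:
# data/model/config.yaml, data/model/weights/best.pt and an LFS-pulled .wav file).
from pathlib import Path

from ultralytics import YOLO

from utils import inference, to_dataframe, yaml_read

config = yaml_read(Path("data/model/config.yaml"))
model = YOLO(Path("data/model/weights/best.pt"))

predictions = inference(
    model=model,
    audio_filepath=Path("data/data/raw/sample_0.wav"),  # assumed input file
    duration=config["duration"],
    overlap=10.0,  # same overlap as the Gradio app
    width=config["width"],
    height=config["height"],
    freq_max=config["freq_max"],
    n_fft=config["n_fft"],
    hop_length=config["hop_length"],
    batch_size=16,
    output_dir=Path("."),
    save_spectrograms=False,
    save_predictions=False,
    verbose=False,
)
df = to_dataframe(
    yolov8_predictions=predictions,
    duration=config["duration"],
    overlap=10.0,
    freq_min=config["freq_min"],
    freq_max=config["freq_max"],
)
df.to_csv("rumbles.csv", index=False)  # t_start/t_end in seconds, freq_* in Hz
```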
data/data/raw/sample_0.wav ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0179dbd11e36dba96bf1f55a542697ea382330e701953af3a5d2116f41f38da0
+ size 4800590
data/data/raw/sample_1.wav ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d8add2dde6bb272816be81bbd5555c84dfbb917cffc26714d74f1d08e7b730f6
+ size 4800590
data/data/raw/sample_2.wav ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:eaedb8aa45f4b1b95073bfa249bb3dac925f46a87694c618bd4ed0a59cee7a3c
+ size 4800590
data/model/config.yaml ADDED
@@ -0,0 +1,8 @@
+ ---
+ duration: 164.0
+ freq_min: 0.0
+ freq_max: 250.0
+ n_fft: 4096
+ hop_length: 1024
+ width: 640
+ height: 256
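
These values parameterize the preprocessing in utils.py: each `duration`-second clip is low-pass filtered at `freq_max`, converted to a power spectrogram with the given `n_fft`/`hop_length`, and resized to a `width` x `height` grayscale image that the YOLO model consumes. A small sketch of that mapping (the sample path and the 10 s overlap are assumptions taken from app.py):

```python
# Sketch: turn one audio clip into a model-ready spectrogram image using the
# values from data/model/config.yaml (assumes the recording contains at least
# one full `duration`-second clip).
from pathlib import Path

from utils import chunk, load_audio, waveform_to_np_image, yaml_read

cfg = yaml_read(Path("data/model/config.yaml"))
waveform, sample_rate = load_audio(Path("data/data/raw/sample_0.wav"))

clips = chunk(
    waveform=waveform, sample_rate=sample_rate, duration=cfg["duration"], overlap=10.0
)
image = waveform_to_np_image(
    waveform=clips[0],
    sample_rate=sample_rate,
    n_fft=cfg["n_fft"],
    hop_length=cfg["hop_length"],
    freq_max=cfg["freq_max"],
    width=cfg["width"],
    height=cfg["height"],
)
print(image.shape)  # (256, 640) == (height, width)
```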
data/model/weights/best.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:8aa9884841054eeef0cb3a0a7c09eb34b51c971aaacf52b592cd024ea212b961
+ size 6218137
requirements.txt ADDED
@@ -0,0 +1,5 @@
+ gradio==5.4.*
+ torch==2.5.*
+ torchaudio==2.5.*
+ torchvision==0.20.*
+ ultralytics==8.3.*
utils.py ADDED
@@ -0,0 +1,335 @@
+ import logging
+ import math
+ import time
+ from pathlib import Path
+ from typing import Tuple
+
+ import cv2
+ import numpy as np
+ import pandas as pd
+ import torch
+ import torchaudio
+ import torchaudio.transforms as T
+ import yaml
+ from PIL import Image
+ from tqdm import tqdm
+ from ultralytics import YOLO
+
+
+ def yaml_read(path: Path) -> dict:
+     """Returns yaml content as a python dict."""
+     with open(path, "r") as f:
+         return yaml.safe_load(f)
+
+
+ def clip(
+     waveform: torch.Tensor,
+     offset: float,
+     duration: float,
+     sample_rate: int,
+ ) -> torch.Tensor:
+     """
+     Returns a clipped waveform of `duration` seconds at `offset` in seconds.
+     """
+     offset_frames_start = int(offset * sample_rate)
+     offset_frames_end = offset_frames_start + int(duration * sample_rate)
+     return waveform[:, offset_frames_start:offset_frames_end]
+
+
+ def chunk(
+     waveform: torch.Tensor,
+     sample_rate: int,
+     duration: float,
+     overlap: float,
+ ) -> list[torch.Tensor]:
+     """
+     Returns a list of waveforms as torch.Tensor. Each of these waveforms has the
+     specified duration and the specified overlap in seconds.
+     """
+     total_seconds = waveform.shape[1] / sample_rate
+     number_spectrograms = total_seconds / (duration - overlap)
+     offsets = [
+         idx * (duration - overlap) for idx in range(0, math.floor(number_spectrograms))
+     ]
+     return [
+         clip(
+             waveform=waveform,
+             offset=offset,
+             duration=duration,
+             sample_rate=sample_rate,
+         )
+         for offset in offsets
+     ]
+
+
+ def load_audio(audio_filepath: Path) -> Tuple[torch.Tensor, int]:
+     """
+     Loads an audio_filepath and returns the waveform and sample_rate of the file.
+     """
+     start_time = time.time()
+     waveform, sample_rate = torchaudio.load(audio_filepath)
+     end_time = time.time()
+     elapsed_time = end_time - start_time
+     logging.info(
+         f"Elapsed time to load audio file {audio_filepath.name}: {elapsed_time:.2f}s"
+     )
+     return waveform, sample_rate
+
+
+ def waveform_to_spectrogram(
+     waveform: torch.Tensor,
+     sample_rate: int,
+     n_fft: int,
+     hop_length: int,
+     freq_max: float,
+ ) -> torch.Tensor:
+     """
+     Returns a spectrogram as a torch.Tensor given the provided arguments.
+     See torchaudio.transforms.Spectrogram for more details about the parameters.
+
+     Args:
+         waveform (torch.Tensor): audio waveform of dimension of `(..., time)`
+         sample_rate (int): sampling rate of the waveform, e.g. 44100 (Hz)
+         n_fft (int): Size of FFT
+         hop_length (int): Length of hop between STFT windows.
+         freq_max (float): cutoff frequency (Hz)
+     """
+     filtered_waveform = torchaudio.functional.lowpass_biquad(
+         waveform=waveform, sample_rate=sample_rate, cutoff_freq=freq_max
+     )
+     transform = T.Spectrogram(n_fft=n_fft, hop_length=hop_length, power=2)
+     spectrogram = transform(filtered_waveform)
+     spectrogram_db = torchaudio.transforms.AmplitudeToDB()(spectrogram)
+     frequencies = torch.linspace(0, sample_rate // 2, spectrogram_db.size(1))
+     max_freq_bin = torch.searchsorted(frequencies, freq_max).item()
+     filtered_spectrogram_db = spectrogram_db[:, :max_freq_bin, :]
+     return filtered_spectrogram_db
+
+
+ def normalize(x: np.ndarray, max_value: int = 255) -> np.ndarray:
+     """
+     Returns the normalized array, values in [0, max_value].
+     Useful for image conversion.
+     """
+     _min, _max = x.min(), x.max()
+     x_normalized = max_value * (x - _min) / (_max - _min)
+     return x_normalized.astype(np.uint8)
+
+
+ def spectrogram_tensor_to_np_image(
+     spectrogram: torch.Tensor, width: int, height: int
+ ) -> np.ndarray:
+     """
+     Returns a numpy array of shape (height, width) that represents the spectrogram
+     tensor as an image.
+     """
+     spectrogram_db_np = spectrogram[0].numpy()
+     # Normalize to [0, 255] for image conversion
+     spectrogram_db_normalized = normalize(spectrogram_db_np, max_value=255)
+     resized_spectrogram_array = cv2.resize(
+         spectrogram_db_normalized, (width, height), interpolation=cv2.INTER_LINEAR
+     )
+     # Vertical flip so that the low frequency range shows at the bottom left of the
+     # image instead of the top left
+     flipped_resized_spectrogram_array = np.flipud(resized_spectrogram_array)
+     return flipped_resized_spectrogram_array
+
+
+ def waveform_to_np_image(
+     waveform: torch.Tensor,
+     sample_rate: int,
+     n_fft: int,
+     hop_length: int,
+     freq_max: float,
+     width: int,
+     height: int,
+ ) -> np.ndarray:
+     """
+     Returns a numpy image of shape (height, width) that represents the waveform
+     tensor as an image of its spectrogram.
+
+     Args:
+         waveform (torch.Tensor): audio waveform of dimension of `(..., time)`
+         sample_rate (int): sampling rate of the waveform, e.g. 44100 (Hz)
+         n_fft (int): Size of FFT
+         hop_length (int): Length of hop between STFT windows.
+         freq_max (float): cutoff frequency (Hz)
+         width (int): width of the generated image
+         height (int): height of the generated image
+     """
+     spectrogram = waveform_to_spectrogram(
+         waveform=waveform,
+         sample_rate=sample_rate,
+         n_fft=n_fft,
+         hop_length=hop_length,
+         freq_max=freq_max,
+     )
+     return spectrogram_tensor_to_np_image(
+         spectrogram=spectrogram,
+         width=width,
+         height=height,
+     )
+
+
+ def batch_sequence(xs: list, batch_size: int):
+     """
+     Yields successive batch_size-sized batches from xs.
+     """
+     for i in range(0, len(xs), batch_size):
+         yield xs[i : i + batch_size]
+
+
+ def inference(
+     model: YOLO,
+     audio_filepath: Path,
+     duration: float,
+     overlap: float,
+     width: int,
+     height: int,
+     freq_max: float,
+     n_fft: int,
+     hop_length: int,
+     batch_size: int,
+     output_dir: Path,
+     save_spectrograms: bool,
+     save_predictions: bool,
+     verbose: bool,
+ ) -> list:
+     """
+     Inference entry point for running on an entire audio_filepath sound file.
+     """
+     logging.info(f"Loading audio filepath {audio_filepath}")
+     waveform, sample_rate = load_audio(audio_filepath)
+     waveforms = chunk(
+         waveform=waveform,
+         sample_rate=sample_rate,
+         duration=duration,
+         overlap=overlap,
+     )
+     logging.info(f"Chunking the waveform into {len(waveforms)} overlapping clips")
+     logging.info(f"Generating {len(waveforms)} spectrograms")
+     images = [
+         Image.fromarray(
+             waveform_to_np_image(
+                 waveform=y,
+                 sample_rate=sample_rate,
+                 n_fft=n_fft,
+                 hop_length=hop_length,
+                 freq_max=freq_max,
+                 width=width,
+                 height=height,
+             )
+         )
+         for y in tqdm(waveforms)
+     ]
+     if save_spectrograms:
+         save_dir = output_dir / "spectrograms"
+         logging.info(f"Saving spectrograms in {save_dir}")
+         save_dir.mkdir(exist_ok=True, parents=True)
+         for i, image in tqdm(enumerate(images), total=len(images)):
+             image.save(save_dir / f"spectrogram_{i}.png")
+
+     results = []
+
+     batches = list(batch_sequence(images, batch_size=batch_size))
+     logging.info(f"Running inference on the spectrograms, {len(batches)} batches")
+     for batch in tqdm(batches):
+         results.extend(model.predict(batch, verbose=verbose))
+
+     if save_predictions:
+         save_dir = output_dir / "predictions"
+         save_dir.mkdir(parents=True, exist_ok=True)
+         logging.info(f"Saving predictions in {save_dir}")
+         for i, yolov8_prediction in tqdm(enumerate(results), total=len(results)):
+             yolov8_prediction.save(str(save_dir / f"prediction_{i}.png"))
+
+     return results
+
+
+ def index_to_relative_offset(idx: int, duration: float, overlap: float) -> float:
+     """
+     Returns the relative offset in seconds based on the provided spectrogram index,
+     the duration and the overlap.
+     """
+     return idx * (duration - overlap)
+
+
+ def from_yolov8_prediction(
+     yolov8_prediction,
+     idx: int,
+     duration: float,
+     overlap: float,
+     freq_min: float,
+     freq_max: float,
+ ) -> list[dict]:
+     results = []
+     for k, box_xyxyn in enumerate(yolov8_prediction.boxes.xyxyn):
+         conf = yolov8_prediction.boxes.conf[k].item()
+         x1, y1, x2, y2 = box_xyxyn.numpy()
+         xmin = min(x1, x2)
+         xmax = max(x1, x2)
+         ymin = min(y1, y2)
+         ymax = max(y1, y2)
+         freq_start = ymin * (freq_max - freq_min)
+         freq_end = ymax * (freq_max - freq_min)
+         t_start = xmin * duration + index_to_relative_offset(
+             idx=idx, duration=duration, overlap=overlap
+         )
+         t_end = xmax * duration + index_to_relative_offset(
+             idx=idx, duration=duration, overlap=overlap
+         )
+         data = {
+             "probability": conf,
+             "freq_start": freq_start,
+             "freq_end": freq_end,
+             "t_start": t_start,
+             "t_end": t_end,
+         }
+         results.append(data)
+     return results
+
+
+ def to_dataframe(
+     yolov8_predictions,
+     duration: float,
+     overlap: float,
+     freq_min: float,
+     freq_max: float,
+ ) -> pd.DataFrame:
+     """
+     Turns the yolov8 predictions into a pandas dataframe, taking into account the
+     relative offset of each prediction.
+     The dataframe contains the following columns:
+         probability (float): float in 0-1 that represents the probability that this
+           is an actual rumble
+         freq_start (float): Hz - where the box starts on the frequency axis
+         freq_end (float): Hz - where the box ends on the frequency axis
+         t_start (float): seconds - where the box starts on the time axis
+         t_end (float): seconds - where the box ends on the time axis
+     """
+     results = []
+     for idx, yolov8_prediction in enumerate(yolov8_predictions):
+         results.extend(
+             from_yolov8_prediction(
+                 yolov8_prediction,
+                 idx=idx,
+                 duration=duration,
+                 overlap=overlap,
+                 freq_min=freq_min,
+                 freq_max=freq_max,
+             )
+         )
+     return pd.DataFrame(results)
+
+
+ def bgr_to_rgb(a: np.ndarray) -> np.ndarray:
+     """
+     Turn a BGR numpy array into a RGB numpy array when the array `a` represents
+     an image.
+     """
+     return a[:, :, ::-1]
+
+
+ def get_concat_v(im1: Image.Image, im2: Image.Image) -> Image.Image:
+     """
+     Concatenate vertically two PIL images.
+     """
+     dst = Image.new("RGB", (im1.width, im1.height + im2.height))
+     dst.paste(im1, (0, 0))
+     dst.paste(im2, (0, im1.height))
+     return dst
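
For reference, a worked example of the time/frequency arithmetic in `index_to_relative_offset` and `from_yolov8_prediction`, using the values from data/model/config.yaml (the box coordinates are made up for illustration):

```python
# Mirrors the arithmetic in from_yolov8_prediction (illustrative numbers only).
duration, overlap = 164.0, 10.0            # seconds, from config.yaml / app.py
freq_min, freq_max = 0.0, 250.0            # Hz, from config.yaml

idx = 2                                    # third spectrogram of the recording
offset = idx * (duration - overlap)        # index_to_relative_offset -> 308.0 s

# Normalized YOLO box (xmin, ymin, xmax, ymax) on that spectrogram:
xmin, ymin, xmax, ymax = 0.25, 0.10, 0.30, 0.40
t_start = xmin * duration + offset         # 349.0 s into the original recording
t_end = xmax * duration + offset           # 357.2 s
freq_start = ymin * (freq_max - freq_min)  # 25.0 Hz
freq_end = ymax * (freq_max - freq_min)    # 100.0 Hz
print(t_start, t_end, freq_start, freq_end)
```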