Spaces:
Running
Running
"""Utils to load CSV file of audio datasets.""" | |
import os | |
import pandas as pd | |
import shared.utils as su | |
def configure_paths_sound_of_water( | |
data_root="/work/piyush/from_nfs2/datasets/SoundOfWater", | |
): | |
paths = { | |
"data_dir": data_root, | |
"video_clip_dir": os.path.join(data_root, "videos"), | |
"audio_clip_dir": os.path.join(data_root, "videos"), | |
"annot_dir": os.path.join(data_root, "annotations"), | |
"split_dir": os.path.join(data_root, "splits"), | |
} | |
return paths | |
def load_csv_sound_of_water( | |
paths: dict, | |
csv_filters=dict(), | |
csv_name="localisation.csv", | |
ds_name="SoundOfWater", | |
split=None, | |
check_first_frame_annots=True, | |
): | |
"""Loads CSV containing metadata of the dataset.""" | |
su.log.print_update( | |
f" [:::] Loading {ds_name}.", | |
pos="left", | |
fillchar=".", | |
) | |
# Configure paths | |
video_clip_dir = paths["video_clip_dir"] | |
audio_clip_dir = paths["audio_clip_dir"] | |
# Load main CSV | |
path = os.path.join( | |
paths["annot_dir"], csv_name, | |
) | |
assert os.path.exists(path), \ | |
f"CSV file not found at {path}." | |
print(" [:::] CSV path:", path) | |
df = pd.read_csv(path) | |
# Load side information: containers | |
container_path = os.path.join( | |
paths['annot_dir'], "containers.yaml", | |
) | |
assert os.path.exists(container_path) | |
containers = su.io.load_yml(container_path) | |
# Update CSV with container information (optional) | |
update_with_container_info = True | |
if update_with_container_info: | |
rows = [] | |
for row in df.iterrows(): | |
row = row[1].to_dict() | |
row.update(containers[row["container_id"]]) | |
rows.append(row) | |
df = pd.DataFrame(rows) | |
print(" [:::] Shape of CSV: ", df.shape) | |
# 1. Update item_id | |
df["item_id"] = df.apply( | |
lambda d: f"{d['video_id']}_{d['start_time']:.1f}_{d['end_time']:.1f}", | |
axis=1, | |
) | |
# 2. Update video_clip_path | |
# df["video_path"] = df["video_id"].apply( | |
# lambda d: os.path.join( | |
# video_dir, f"{d}.mp4" | |
# ) | |
# ) | |
df["video_clip_path"] = df["item_id"].apply( | |
lambda d: os.path.join( | |
video_clip_dir, f"{d}.mp4" | |
) | |
) | |
df = df[df["video_clip_path"].apply(os.path.exists)] | |
print(" [:::] Shape of CSV with available video: ", df.shape) | |
# 3. Update audio_clip_path | |
# df["audio_path"] = df["video_id"].apply( | |
# lambda d: os.path.join( | |
# audio_dir, f"{d}.mp4" | |
# ) | |
# ) | |
df["audio_clip_path"] = df["item_id"].apply( | |
lambda d: os.path.join( | |
audio_clip_dir, f"{d}.mp4" | |
) | |
) | |
df = df[df["audio_clip_path"].apply(os.path.exists)] | |
print(" [:::] Shape of CSV with available audio: ", df.shape) | |
# Add first frame annotation paths | |
if check_first_frame_annots: | |
frame_annot_dir = os.path.join(paths["annot_dir"], "container_bboxes") | |
df["box_path"] = df["video_id"].apply( | |
lambda d: os.path.join(frame_annot_dir, f"{d}_box.npy"), | |
) | |
df["mask_path"] = df["video_id"].apply( | |
lambda d: os.path.join(frame_annot_dir, f"{d}_mask.npy"), | |
) | |
df = df[df["box_path"].apply(os.path.exists)] | |
df = df[df["mask_path"].apply(os.path.exists)] | |
print(" [:::] Shape of CSV with first frame annotations: ", df.shape) | |
# Add split filter | |
if split is not None and ("item_id" not in csv_filters): | |
assert "split_dir" in paths | |
split_path = os.path.join(paths["split_dir"], f"{split}") | |
assert os.path.exists(split_path), \ | |
f"Split file not found at {split_path}." | |
item_ids = su.io.load_txt(split_path) | |
print(" [:::] Number of item_ids in split:", len(item_ids)) | |
csv_filters["item_id"] = item_ids | |
# Apply filter to the CSV | |
if len(csv_filters) > 0: | |
df = su.pd_utils.apply_filters(df, csv_filters) | |
print(" [:::] Shape of CSV after filtering: ", df.shape) | |
return df | |
if __name__ == "__main__": | |
paths = configure_paths_sound_of_water() | |
df = load_csv_sound_of_water(paths) | |
row = df.iloc[0].to_dict() | |
su.log.json_print(row) | |