# Provenance: uploaded via huggingface_hub by bpiyush (commit c5f65a4, 4.27 kB).
"""Utils to load CSV file of audio datasets."""
import os
import pandas as pd
import shared.utils as su
def configure_paths_sound_of_water(
    data_root="/work/piyush/from_nfs2/datasets/SoundOfWater",
):
    """Return the standard directory layout of the SoundOfWater dataset.

    Note: audio clips live alongside the video clips, so both
    ``video_clip_dir`` and ``audio_clip_dir`` point at the same
    ``videos`` sub-directory.
    """
    join = os.path.join
    clip_dir = join(data_root, "videos")
    return {
        "data_dir": data_root,
        "video_clip_dir": clip_dir,
        "audio_clip_dir": clip_dir,
        "annot_dir": join(data_root, "annotations"),
        "split_dir": join(data_root, "splits"),
    }
def load_csv_sound_of_water(
    paths: dict,
    csv_filters=None,
    csv_name="localisation.csv",
    ds_name="SoundOfWater",
    split=None,
    check_first_frame_annots=True,
):
    """Load, enrich and filter the metadata CSV of the SoundOfWater dataset.

    Args:
        paths: Directory layout as returned by ``configure_paths_sound_of_water``.
            Must contain ``annot_dir``, ``video_clip_dir`` and ``audio_clip_dir``
            (and ``split_dir`` when ``split`` is given).
        csv_filters: Optional mapping of column -> allowed values, applied via
            ``su.pd_utils.apply_filters``. Never mutated.
        csv_name: CSV file name inside ``paths["annot_dir"]``.
        ds_name: Dataset name; only used for logging.
        split: Optional split file name inside ``paths["split_dir"]``. Its item
            ids become an ``item_id`` filter unless one is already supplied.
        check_first_frame_annots: If True, keep only rows whose first-frame
            box/mask ``.npy`` annotations exist on disk.

    Returns:
        ``pd.DataFrame`` with one row per clip that is actually available on
        disk, including ``item_id``, ``video_clip_path`` and
        ``audio_clip_path`` columns.
    """
    # BUGFIX: the previous default `csv_filters=dict()` was a shared mutable
    # default that this function mutates (the split's item ids were written
    # into it), leaking filters across calls. Copy defensively so a
    # caller-supplied dict is never modified either.
    csv_filters = dict(csv_filters) if csv_filters else dict()

    su.log.print_update(
        f" [:::] Loading {ds_name}.",
        pos="left",
        fillchar=".",
    )

    # Configure paths
    video_clip_dir = paths["video_clip_dir"]
    audio_clip_dir = paths["audio_clip_dir"]

    # Load main CSV
    path = os.path.join(
        paths["annot_dir"], csv_name,
    )
    assert os.path.exists(path), \
        f"CSV file not found at {path}."
    print(" [:::] CSV path:", path)
    df = pd.read_csv(path)

    # Side information: per-container metadata keyed by container_id
    container_path = os.path.join(
        paths['annot_dir'], "containers.yaml",
    )
    assert os.path.exists(container_path)
    containers = su.io.load_yml(container_path)

    # Merge the container attributes into every row of the CSV
    rows = []
    for _, row in df.iterrows():
        record = row.to_dict()
        record.update(containers[record["container_id"]])
        rows.append(record)
    df = pd.DataFrame(rows)
    print(" [:::] Shape of CSV: ", df.shape)

    # 1. Unique clip id: <video_id>_<start>_<end> (times at 1 decimal place)
    df["item_id"] = df.apply(
        lambda d: f"{d['video_id']}_{d['start_time']:.1f}_{d['end_time']:.1f}",
        axis=1,
    )

    # 2. Keep only rows whose video clip exists on disk
    df["video_clip_path"] = df["item_id"].apply(
        lambda d: os.path.join(
            video_clip_dir, f"{d}.mp4"
        )
    )
    df = df[df["video_clip_path"].apply(os.path.exists)]
    print(" [:::] Shape of CSV with available video: ", df.shape)

    # 3. Keep only rows whose audio clip exists on disk (audio is read from
    #    the same .mp4 files as the video — see configure_paths)
    df["audio_clip_path"] = df["item_id"].apply(
        lambda d: os.path.join(
            audio_clip_dir, f"{d}.mp4"
        )
    )
    df = df[df["audio_clip_path"].apply(os.path.exists)]
    print(" [:::] Shape of CSV with available audio: ", df.shape)

    # 4. Optionally require first-frame box/mask annotations per video
    if check_first_frame_annots:
        frame_annot_dir = os.path.join(paths["annot_dir"], "container_bboxes")
        df["box_path"] = df["video_id"].apply(
            lambda d: os.path.join(frame_annot_dir, f"{d}_box.npy"),
        )
        df["mask_path"] = df["video_id"].apply(
            lambda d: os.path.join(frame_annot_dir, f"{d}_mask.npy"),
        )
        df = df[df["box_path"].apply(os.path.exists)]
        df = df[df["mask_path"].apply(os.path.exists)]
        print(" [:::] Shape of CSV with first frame annotations: ", df.shape)

    # 5. Restrict to a split's item ids, unless the caller already filters
    #    on item_id explicitly
    if split is not None and ("item_id" not in csv_filters):
        assert "split_dir" in paths
        split_path = os.path.join(paths["split_dir"], f"{split}")
        assert os.path.exists(split_path), \
            f"Split file not found at {split_path}."
        item_ids = su.io.load_txt(split_path)
        print(" [:::] Number of item_ids in split:", len(item_ids))
        csv_filters["item_id"] = item_ids

    # 6. Apply the column filters to the CSV
    if len(csv_filters) > 0:
        df = su.pd_utils.apply_filters(df, csv_filters)
        print(" [:::] Shape of CSV after filtering: ", df.shape)

    return df
if __name__ == "__main__":
    # Smoke test: load the dataset CSV with default paths and pretty-print
    # the first row.
    dataset_paths = configure_paths_sound_of_water()
    metadata = load_csv_sound_of_water(dataset_paths)
    first_row = metadata.iloc[0].to_dict()
    su.log.json_print(first_row)