Spaces:

bpiyush
/

SoundOfWater

Running

App Files Files Community

SoundOfWater / sound_of_water /data /csv_loader.py

bpiyush

Upload folder using huggingface_hub

c5f65a4 verified 5 days ago

raw

history blame

4.27 kB

	"""Utils to load CSV file of audio datasets."""
	import os

	import pandas as pd
	import shared.utils as su


	def configure_paths_sound_of_water(
	    data_root="/work/piyush/from_nfs2/datasets/SoundOfWater",
	):
	    """Return the directory layout of the SoundOfWater dataset.

	    Args:
	        data_root: Root folder of the dataset on disk.

	    Returns:
	        dict with the keys ``data_dir`` (the root itself),
	        ``video_clip_dir``, ``audio_clip_dir``, ``annot_dir`` and
	        ``split_dir``, each joined onto ``data_root``.
	    """
	    # (key, sub-folder) pairs. Video and audio clips both resolve to the
	    # same "videos" folder.
	    subfolders = (
	        ("video_clip_dir", "videos"),
	        ("audio_clip_dir", "videos"),
	        ("annot_dir", "annotations"),
	        ("split_dir", "splits"),
	    )

	    layout = {"data_dir": data_root}
	    for key, folder in subfolders:
	        layout[key] = os.path.join(data_root, folder)
	    return layout


	def load_csv_sound_of_water(
	    paths: dict,
	    csv_filters=None,
	    csv_name="localisation.csv",
	    ds_name="SoundOfWater",
	    split=None,
	    check_first_frame_annots=True,
	):
	    """Load the metadata CSV of the dataset and keep only usable rows.

	    The CSV at ``paths["annot_dir"]/csv_name`` is read, every row is
	    merged with its entry from ``containers.yaml`` (looked up by the
	    ``container_id`` column), and the columns ``item_id``,
	    ``video_clip_path`` and ``audio_clip_path`` are added. Rows whose
	    clip files do not exist on disk are dropped. The code reads the CSV
	    columns ``container_id``, ``video_id``, ``start_time`` and
	    ``end_time``.

	    Args:
	        paths: Directory layout; must contain ``video_clip_dir``,
	            ``audio_clip_dir`` and ``annot_dir``, plus ``split_dir``
	            when ``split`` is used.
	        csv_filters: Optional dict of filters handed to
	            ``su.pd_utils.apply_filters`` (presumably column name ->
	            allowed values; confirm against that helper). ``None`` means
	            no filters. The dict is copied, never modified.
	        csv_name: File name of the CSV inside ``paths["annot_dir"]``.
	        ds_name: Dataset name, used only in the log banner.
	        split: Optional file name inside ``paths["split_dir"]`` listing
	            the item ids to keep. Ignored when ``csv_filters`` already
	            has an ``"item_id"`` entry.
	        check_first_frame_annots: If true, add ``box_path`` and
	            ``mask_path`` columns and drop rows whose ``*_box.npy`` or
	            ``*_mask.npy`` file is missing.

	    Returns:
	        pandas.DataFrame with the surviving rows.

	    Raises:
	        AssertionError: If the CSV, ``containers.yaml`` or the split file
	            does not exist, or ``split`` is given without
	            ``paths["split_dir"]``.
	    """
	    # Work on a private copy. The split step below inserts "item_id";
	    # with a shared (default or caller-owned) dict that entry would leak
	    # into later calls and make them silently ignore their own `split`.
	    csv_filters = dict(csv_filters) if csv_filters is not None else {}

	    su.log.print_update(
	        f" [:::] Loading {ds_name}.",
	        pos="left",
	        fillchar=".",
	    )

	    # Configure paths
	    video_clip_dir = paths["video_clip_dir"]
	    audio_clip_dir = paths["audio_clip_dir"]

	    # Load main CSV
	    path = os.path.join(paths["annot_dir"], csv_name)
	    assert os.path.exists(path), \
	        f"CSV file not found at {path}."
	    print(" [:::] CSV path:", path)
	    df = pd.read_csv(path)

	    # Load side information: containers
	    container_path = os.path.join(paths["annot_dir"], "containers.yaml")
	    assert os.path.exists(container_path), \
	        f"Containers file not found at {container_path}."
	    containers = su.io.load_yml(container_path)

	    # Update CSV with container information (optional)
	    update_with_container_info = True
	    if update_with_container_info:
	        rows = []
	        for _, series in df.iterrows():
	            row = series.to_dict()
	            # On a key clash the container fields overwrite the CSV value.
	            row.update(containers[row["container_id"]])
	            rows.append(row)
	        df = pd.DataFrame(rows)
	    print(" [:::] Shape of CSV: ", df.shape)

	    # 1. Update item_id: "<video_id>_<start>_<end>", times with 1 decimal
	    df["item_id"] = df.apply(
	        lambda d: f"{d['video_id']}_{d['start_time']:.1f}_{d['end_time']:.1f}",
	        axis=1,
	    )

	    # 2. Update video_clip_path and keep rows whose clip exists on disk
	    df["video_clip_path"] = df["item_id"].apply(
	        lambda d: os.path.join(video_clip_dir, f"{d}.mp4")
	    )
	    df = df[df["video_clip_path"].apply(os.path.exists)]
	    print(" [:::] Shape of CSV with available video: ", df.shape)

	    # 3. Update audio_clip_path (audio is looked up as an .mp4 as well)
	    df["audio_clip_path"] = df["item_id"].apply(
	        lambda d: os.path.join(audio_clip_dir, f"{d}.mp4")
	    )
	    df = df[df["audio_clip_path"].apply(os.path.exists)]
	    print(" [:::] Shape of CSV with available audio: ", df.shape)

	    # Add first frame annotation paths (keyed by video_id, not item_id)
	    if check_first_frame_annots:
	        frame_annot_dir = os.path.join(paths["annot_dir"], "container_bboxes")
	        df["box_path"] = df["video_id"].apply(
	            lambda d: os.path.join(frame_annot_dir, f"{d}_box.npy"),
	        )
	        df["mask_path"] = df["video_id"].apply(
	            lambda d: os.path.join(frame_annot_dir, f"{d}_mask.npy"),
	        )
	        df = df[df["box_path"].apply(os.path.exists)]
	        df = df[df["mask_path"].apply(os.path.exists)]
	        print(" [:::] Shape of CSV with first frame annotations: ", df.shape)

	    # Add split filter; an explicit "item_id" filter takes precedence
	    if split is not None and ("item_id" not in csv_filters):
	        assert "split_dir" in paths
	        split_path = os.path.join(paths["split_dir"], f"{split}")
	        assert os.path.exists(split_path), \
	            f"Split file not found at {split_path}."
	        item_ids = su.io.load_txt(split_path)
	        print(" [:::] Number of item_ids in split:", len(item_ids))
	        csv_filters["item_id"] = item_ids

	    # Apply filter to the CSV
	    if csv_filters:
	        df = su.pd_utils.apply_filters(df, csv_filters)
	        print(" [:::] Shape of CSV after filtering: ", df.shape)

	    return df


	if __name__ == "__main__":
	    # Smoke test: load the dataset from its default location and
	    # pretty-print the first metadata row.
	    df = load_csv_sound_of_water(configure_paths_sound_of_water())
	    first_record = df.iloc[0].to_dict()
	    su.log.json_print(first_record)