""" To check the files (images), whether any of them has unusual size / dimension. """ import os from PIL import Image import pandas as pd import argparse from tqdm import tqdm from typing import Union def main(args): dataset_path = args.dataset_path #"/workspace/llava_from_src/LLaVA/playground/data/images" sizes = [] dimensions = [] fpaths = [] # to contain all the filenames (imagenames) size_less_than_100X100 = [] # to contain 1 if size of the image < 100 * 100 else 0 for filename in tqdm(os.listdir(dataset_path)): if filename.endswith(".jpg") or filename.endswith(".png"): image_path = os.path.join(dataset_path, filename) fpaths.append(image_path) with Image.open(image_path) as img: sizes.append(os.path.getsize(image_path)) dim = img.size dimensions.append(img.size) size_less_than_100X100.append((lambda dim: 1 if dim[0]*dim[1] < 10000 else 0)(dim)) if args.create_dataframe: df = pd.DataFrame({ "fpath": fpaths, "img_size": sizes, "dimensions": dimensions, "small_size": size_less_than_100X100 }) df.to_csv(args.create_dataframe, index=False) print(f"Dataframe saved at {args.create_dataframe}.") # Analyze the sizes and dimensions # print("Max size:", max(sizes)) # print("Min size:", min(sizes)) # print("Avg size:", sum(sizes) / len(sizes)) # print("Unique dimensions:", set(dimensions)) print(pd.Series(sizes).describe()) if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument("--dataset-path", type=str, required=True, help="Path of the dataset of images to be checked.") parser.add_argument("--create-dataframe", type=str, default="report_imgs_size.csv", help="Name of the dataframe if you want to create.") args = parser.parse_args() main(args)