llava-1-5 / utils_finetuning /get_imgs_json_according_to_size.py
saurabh-straive's picture
initial commit
8874c09
"""
This code is to get a json of the images of smaller size.
The record of all the images should be there in a dataframe.
The dataframe either :
1. should have a column named `small_size` which contains the flag
1 if the size is less than 100*100 else 0.
Or
2. should have a column named `dimensions` containing the size tuple.
"""
import json
import os
import pandas as pd
import argparse
def _has_small_dimension(dimensions, threshold=100):
    """Return True when any entry of *dimensions* is below *threshold*.

    Cells read back from a CSV arrive as strings such as ``"(50, 200)"``,
    so string input is parsed into a Python literal before comparing.
    Missing cells (NaN) are treated as "not small".

    Raises:
        ValueError / SyntaxError: if a string cell is not a valid literal.
    """
    # Local import keeps this block self-contained; `ast` is stdlib.
    import ast

    if isinstance(dimensions, str):
        # literal_eval only evaluates literals, so it is safe on file content.
        dimensions = ast.literal_eval(dimensions)
    elif isinstance(dimensions, float) and pd.isna(dimensions):
        # An empty CSV cell is loaded as NaN and carries no size information.
        return False
    return any(side < threshold for side in dimensions)


def main(args):
    """Save the JSON records whose image is flagged as small.

    Args:
        args: namespace with the attributes
            ``json_path`` (input JSON: a list of records with an "image" key),
            ``dataframe_path`` (CSV with an "fpath" column and either the
            flag column named by ``column`` or a "dimensions" column),
            ``column`` (optional name of a 1/0 flag column), and
            ``save_json`` (path of the JSON file to write).

    Raises:
        ValueError: if no ``column`` is given and the CSV has no
            "dimensions" column.
    """
    # load json
    with open(args.json_path, "r") as fp:
        data_from_json = json.load(fp)

    # load the df
    data_from_df = pd.read_csv(args.dataframe_path)

    # build a boolean mask selecting the rows of the small images
    if args.column:
        small_mask = data_from_df[args.column] == 1
    else:
        # Explicit raise instead of assert: asserts vanish under `python -O`.
        if "dimensions" not in data_from_df.columns:
            raise ValueError(
                "Either column should be given or a column named 'dimensions' should be there in the df."
            )
        # A row is small when any of its dimensions is less than 100.
        # astype(bool) keeps the mask usable even for an empty dataframe.
        small_mask = data_from_df["dimensions"].apply(_has_small_dimension).astype(bool)

    files_small_size = data_from_df.loc[small_mask, "fpath"]

    # Only the basenames are compared; a set gives O(1) membership tests.
    small_basenames = {os.path.basename(filename) for filename in files_small_size}

    print("Creating json records with the images with small size.")
    data_small_imgs = [
        record
        for record in data_from_json
        if os.path.basename(record["image"]) in small_basenames
    ]

    print(f"Saving the json at {args.save_json}.")
    with open(args.save_json, "w") as fp:
        json.dump(data_small_imgs, fp, indent=4)
if __name__ == "__main__":
    # Command-line entry point: gather the paths and hand them to main().
    parser = argparse.ArgumentParser(
        description="Create a json with only the records of the small images."
    )
    # The three paths are always used by main(), so they are mandatory;
    # omitting one now yields a usage error instead of a failure in open().
    parser.add_argument("--json-path", type=str, required=True,
                        help="Path of the json containing images data.")
    parser.add_argument("--dataframe-path", type=str, required=True,
                        help="Path of the dataframe of all the images containing info about dimension.")
    # Optional: when absent, main() falls back to the `dimensions` column.
    parser.add_argument("--column", type=str,
                        help="Name of the column if any, to consider instead of `dimensions` column. This column should contain flag 1 or 0 when the dimension is < 100*100 respectively.")
    parser.add_argument("--save-json", type=str, required=True,
                        help="Path of the json to be saved with all the files with size < 100*100.")
    args = parser.parse_args()
    main(args)