Compute statistics for the datasets
Build the database of labels:
DATASET=vg-densecap-local
python scripts/tools/build_annotation_db.py \
train_data='['"$DATASET"']' train_data_overrides='[data.with_image\=False]' \
eval_data='['"$DATASET"']' eval_data_overrides='[data.with_image\=False]' \
training.dataloader_num_workers=10
# training.output_dir=tmp/data/$DATASET
The table schema:
CREATE TABLE IF NOT EXISTS {table_name} (
region_id INTEGER PRIMARY KEY,
image_id INTEGER,
width INTEGER,
height INTEGER,
file_name TEXT,
coco_url TEXT,
task_type TEXT,
phrases TEXT,
tokenized_phrases TEXT,
x REAL,
y REAL,
region_width REAL,
region_height REAL
)
The distribution of the number of text tokens in VG is analyzed in scripts/notebooks/dataset_statstics_db.ipynb
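For a quick look outside the notebook, the token-count distribution can also be pulled straight from the database. The sketch below is a rough example, not the notebook's actual code: the table name annotations and the assumption that tokenized_phrases stores a JSON-encoded list of token lists both depend on how build_annotation_db.py serializes the rows.

import json
import sqlite3
from collections import Counter

# Sketch only: the table name and the JSON encoding of tokenized_phrases are assumptions.
conn = sqlite3.connect("tmp/annotation_db/vg-densecap-region_descriptions/annotations.db")
token_counts = Counter()
for (tokenized_phrases,) in conn.execute("SELECT tokenized_phrases FROM annotations"):
    for tokens in json.loads(tokenized_phrases):
        token_counts[len(tokens)] += 1
conn.close()

for num_tokens, freq in sorted(token_counts.items()):
    print(f"{num_tokens} tokens: {freq} regions")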
Build annotation database (deprecated)
# for DATASET in objects365-local coco-instance v3det-local vg-densecap-region_descriptions refcocog-google; do
for DATASET in vg-densecap-region_descriptions ; do
python scripts/tools/build_annotation_db.py \
train_data='['"$DATASET"']' train_data_overrides='[data.with_image\=False]' \
eval_data='['"$DATASET"']' eval_data_overrides='[data.with_image\=False]' \
training.output_dir='tmp/annotation_db/'"$DATASET"
done
Extract nouns from the annotation database
ANNOTATION_DB_PATH=
python scripts/tools/add_pos_table_annotation_db.py --db $ANNOTATION_DB_PATH
for i in coco-instance objects365-local refcocog-google v3det-local vg-densecap-region_descriptions ; do
python scripts/tools/add_pos_table_annotation_db.py --db tmp/annotation_db/$i/annotations.db
done
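For reference, the core of the POS step is tagging every phrase and keeping the nouns. The sketch below is a rough stand-in for add_pos_table_annotation_db.py, assuming spaCy as the tagger and a JSON-encoded phrases column in a table named annotations; the real script may differ on all three points.

import json
import sqlite3
import spacy

# Sketch only: the tagger, the table name, and the phrase encoding are assumptions.
nlp = spacy.load("en_core_web_sm")
conn = sqlite3.connect("tmp/annotation_db/coco-instance/annotations.db")
nouns_per_region = {}
for region_id, phrases in conn.execute("SELECT region_id, phrases FROM annotations"):
    nouns = []
    for doc in nlp.pipe(json.loads(phrases)):
        nouns.extend(tok.lemma_.lower() for tok in doc if tok.pos_ in ("NOUN", "PROPN"))
    nouns_per_region[region_id] = nouns
conn.close()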
Visualize annotations with the Gradio app
ANNOTATION_DB_PATH=
python scripts/apps/annotation_db_app.py --db $ANNOTATION_DB_PATH
Compare nouns across annotation databases
# Default output path: tmp/annotation_db_noun_stats/compare_nouns_annotation_db.xlsx
for i in coco-instance objects365-local v3det-local refcocog-google ; do
python scripts/tools/compare_nouns_annotation_db.py \
--db tmp/annotation_db/$i/annotations.db \
--db tmp/annotation_db/vg-densecap-region_descriptions/annotations.db # -o OUTPUT_PATH
done
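Conceptually, the comparison is an intersection of the two noun vocabularies. A minimal sketch, assuming each database got a table named pos with a noun column from the previous step (the real table and column names may differ), and printing set sizes instead of writing the xlsx report:

import sqlite3

def noun_vocab(db_path):
    # Assumption: a `pos` table with one noun per row; adjust to the real schema.
    with sqlite3.connect(db_path) as conn:
        return {row[0] for row in conn.execute("SELECT noun FROM pos")}

det_nouns = noun_vocab("tmp/annotation_db/coco-instance/annotations.db")
vg_nouns = noun_vocab("tmp/annotation_db/vg-densecap-region_descriptions/annotations.db")
print(f"detection-only: {len(det_nouns - vg_nouns)}, "
      f"VG-only: {len(vg_nouns - det_nouns)}, shared: {len(det_nouns & vg_nouns)}")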
To get dataset statistics (deprecated)
The statistics results are saved in */dataset_statistics.log, so we need to parse the log files.
Load with images and check their sanity (slow)
- Objects365: 1742289 images, ~ 3 hours.
- V3Det: 183348 images, ~ 30 minutes.
for DATASET in objects365-local; do
torchrun --nproc-per-node 12 --standalone scripts/tools/dataset_statistics.py \
train_data='['"$DATASET"']' \
eval_data='['"$DATASET"']' \
training.output_dir='tmp/dataset_statistics-w_image/'"$DATASET"
done
Only do statistics (fast, < 2 min)
for DATASET in objects365-local; do
torchrun --nproc-per-node 12 --standalone scripts/tools/dataset_statistics.py \
train_data='['"$DATASET"']' train_data_overrides='[data.with_image\=False]' \
eval_data='['"$DATASET"']' eval_data_overrides='[data.with_image\=False]' \
training.output_dir='tmp/dataset_statistics-wo_image/'"$DATASET"
done
Do not use multiple eval datasets.
Parse the logs to CSV
#!/bin/bash
# Input argument: base_dir
base_dir=$1
output_file="${base_dir}/dataset_statistics-full.$(date +%m%d%y).csv"
# Create a CSV file with the header
echo "dataset,split,total samples,total regions,total sents,total tokens,total words" > "$output_file"
# Find all "dataset_statistics.log" files under the base_dir
find "${base_dir}" -type f -name "dataset_statistics.log" | while read log_file; do
# Extract the dataset name from the directory path
dataset=$(basename $(dirname "${log_file}"))
# Parse the log file and extract the required information
grep -E "\[FULL\]: split name" "${log_file}" | while read line; do
# Extract the values for each field
split=$(echo "${line}" | grep -oP "split name: \K\w+")
total_samples=$(echo "${line}" | grep -oP "total samples: \K\d+")
total_regions=$(echo "${line}" | grep -oP "total regions: \K\d+")
total_sents=$(echo "${line}" | grep -oP "total sents: \K\d+")
total_tokens=$(echo "${line}" | grep -oP "total tokens: \K\d+")
total_words=$(echo "${line}" | grep -oP "total words: \K\d+")
# Append the parsed information to the CSV file
echo "${dataset},${split},${total_samples},${total_regions},${total_sents},${total_tokens},${total_words}" >> "$output_file"
done
done
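Saved as, say, parse_dataset_statistics.sh (the file name is arbitrary), the script takes the statistics output directory as its only argument:

bash parse_dataset_statistics.sh tmp/dataset_statistics-wo_image
# writes tmp/dataset_statistics-wo_image/dataset_statistics-full.<MMDDYY>.csv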
Test dataloading.
python scripts/tools/test_dataset_loading.py \
train_data='[vg-densecap-region_descriptions]' \
eval_data='[vg-densecap-region_descriptions]' \
+data_transforms=lsj-1_0-2_0 \
+model=base_sca_multitask_v2 \
training.do_train=True \
training.do_eval=True \
training.per_device_train_batch_size=1 \
training.num_masks_per_sample=16 \
training.dataloader_num_workers=10
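If you only need a quick sanity check outside this script, the same pattern can be reproduced with any PyTorch DataLoader: iterate a few batches and inspect keys and shapes. The snippet below uses a dummy dataset purely to illustrate the pattern; it does not build the real SCA datasets.

import torch
from torch.utils.data import DataLoader, Dataset

class DummyRegionDataset(Dataset):
    # Stand-in for the real dataset; only the smoke-test loop matters here.
    def __len__(self):
        return 8
    def __getitem__(self, idx):
        return {"pixel_values": torch.zeros(3, 1024, 1024), "num_masks": 16}

loader = DataLoader(DummyRegionDataset(), batch_size=2, num_workers=0)
for step, batch in enumerate(loader):
    print(step, {k: tuple(v.shape) for k, v in batch.items()})
    if step >= 2:
        break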
Get parameter sizes
for i in 2 4 8 12 24; do
python scripts/tools/count_num_params.py \
train_data='[vg-densecap-local]' \
eval_data='[vg-densecap-local]' \
training.do_train=True training.do_eval=True \
+model=base_sca_multitask_v2 model.num_caption_tokens=8 model.additional_num_hidden_layers=$i model.num_task_tokens=6 | grep mask_decoder.additional_transformer >> tmp/mixer_size.txt
done
for i in facebook/sam-vit-huge facebook/sam-vit-large facebook/sam-vit-base ; do
python scripts/tools/count_num_params.py \
train_data='[vg-densecap-local]' \
eval_data='[vg-densecap-local]' \
training.do_train=True training.do_eval=True \
+model=base_sca_multitask_v2 model.sam_model_name_or_path=$i | grep vision_encoder >> tmp/vision_encoder_size.txt
done
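The grep filters above just pick out per-module lines from the script's output. The same grouping can be reproduced directly on any PyTorch module; the helper below is a generic sketch, not the output format of count_num_params.py.

from collections import defaultdict

import torch.nn as nn

def count_params_by_prefix(model: nn.Module, depth: int = 2):
    # Group parameter counts by the first `depth` components of each parameter name,
    # e.g. to inspect mask_decoder.additional_transformer or vision_encoder sizes.
    counts = defaultdict(int)
    for name, param in model.named_parameters():
        counts[".".join(name.split(".")[:depth])] += param.numel()
    return dict(counts)

# Example with a stand-in model; pass the instantiated SCA model instead.
model = nn.Sequential(nn.Linear(16, 32), nn.Linear(32, 8))
for prefix, n in count_params_by_prefix(model).items():
    print(f"{prefix}: {n / 1e6:.3f}M params")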