# alps / unitable /CONFIG.mk
# yumikimi381's picture
# Upload folder using huggingface_hub
# daf0288 verified
##################################################
# Configurations #
##################################################
#
# Datasets
#
# Label type.
# Every LABEL_* sets trainer.label_type; the html, cell and bbox variants also
# set the matching trainer.train.loss_weights entry to 1.
LABEL_IMAGE = ++trainer.label_type="image"
LABEL_HTML  = ++trainer.label_type="html" \
    "++trainer.train.loss_weights.html=1"
LABEL_CELL  = ++trainer.label_type="cell" \
    "++trainer.train.loss_weights.cell=1"
LABEL_BBOX  = ++trainer.label_type="bbox" \
    "++trainer.train.loss_weights.bbox=1"

# Three-component mean / std lists, substituted into the augmentation
# settings below.
MEAN = [0.86597056,0.88463002,0.87491087]
STD  = [0.20686628,0.18201602,0.18485524]

# Augmentation.
# AUG_BEIT and AUG_RESIZE_NORM pass MEAN and STD through; AUG_VQVAE does not.
AUG_VQVAE = dataset/augmentation=vqvae
AUG_BEIT = \
    dataset/augmentation=beit \
    ++dataset.augmentation.mean=$(MEAN) \
    ++dataset.augmentation.std=$(STD)
AUG_RESIZE_NORM = \
    dataset/augmentation=resize_normalize \
    ++dataset.augmentation.transforms.2.mean=$(MEAN) \
    ++dataset.augmentation.transforms.2.std=$(STD)
# Single dataset.
# DATA_SINGLE picks the single_dataset config; PUBTABNET and MINIPUBTABNET
# then attach one dataset's train / valid / test entries to it.
DATA_SINGLE = dataset=single_dataset

PUBTABNET = \
    $(DATA_SINGLE) \
    +dataset/pubtabnet@dataset.train_dataset=train_dataset \
    +dataset/pubtabnet@dataset.valid_dataset=valid_dataset \
    +dataset/pubtabnet@dataset.test_dataset=test_dataset

MINIPUBTABNET = \
    $(DATA_SINGLE) \
    +dataset/mini_pubtabnet@dataset.train_dataset=train_dataset \
    +dataset/mini_pubtabnet@dataset.valid_dataset=valid_dataset \
    +dataset/mini_pubtabnet@dataset.test_dataset=test_dataset
# Multiple datasets.
# DATA_MULTI picks the concat_dataset config. Every *_M variable adds one
# source dataset to the train / valid / test groups under its own slot name
# (d1-d5 and d7-d9 are used here; there is no d6 entry in this file).
DATA_MULTI = dataset=concat_dataset

PUBTABNET_M = \
    +dataset/pubtabnet@dataset.train.d1=train_dataset \
    +dataset/pubtabnet@dataset.valid.d1=valid_dataset \
    +dataset/pubtabnet@dataset.test.d1=test_dataset
SYN_MARKET_M = \
    +dataset/synthtabnet_marketing@dataset.train.d2=train_dataset \
    +dataset/synthtabnet_marketing@dataset.valid.d2=valid_dataset \
    +dataset/synthtabnet_marketing@dataset.test.d2=test_dataset
SYN_FIN_M = \
    +dataset/synthtabnet_fintabnet@dataset.train.d3=train_dataset \
    +dataset/synthtabnet_fintabnet@dataset.valid.d3=valid_dataset \
    +dataset/synthtabnet_fintabnet@dataset.test.d3=test_dataset
SYN_SPARSE_M = \
    +dataset/synthtabnet_sparse@dataset.train.d4=train_dataset \
    +dataset/synthtabnet_sparse@dataset.valid.d4=valid_dataset \
    +dataset/synthtabnet_sparse@dataset.test.d4=test_dataset
SYN_PUB_M = \
    +dataset/synthtabnet_pubtabnet@dataset.train.d5=train_dataset \
    +dataset/synthtabnet_pubtabnet@dataset.valid.d5=valid_dataset \
    +dataset/synthtabnet_pubtabnet@dataset.test.d5=test_dataset
PUBTABLES_M = \
    +dataset/pubtables1m@dataset.train.d7=train_dataset \
    +dataset/pubtables1m@dataset.valid.d7=valid_dataset \
    +dataset/pubtables1m@dataset.test.d7=test_dataset
TABLEBANK_M = \
    +dataset/tablebank@dataset.train.d8=train_dataset \
    +dataset/tablebank@dataset.valid.d8=valid_dataset \
    +dataset/tablebank@dataset.test.d8=test_dataset
FINTABNET_M = \
    +dataset/fintabnet@dataset.train.d9=train_dataset \
    +dataset/fintabnet@dataset.valid.d9=valid_dataset \
    +dataset/fintabnet@dataset.test.d9=test_dataset

# Combinations: DATA_MULTI followed by the chosen *_M entries.
DATA_VQVAE_1M = $(DATA_MULTI) $(PUBTABNET_M) \
    $(SYN_MARKET_M) $(SYN_FIN_M) $(SYN_SPARSE_M)
DATA_VQVAE_2M = $(DATA_MULTI) $(PUBTABNET_M) \
    $(SYN_MARKET_M) $(SYN_FIN_M) $(SYN_SPARSE_M) $(SYN_PUB_M) \
    $(PUBTABLES_M) $(TABLEBANK_M)
PUBTABLES1M   = $(DATA_MULTI) $(PUBTABLES_M)
FINTABNET     = $(DATA_MULTI) $(FINTABNET_M)
PUB_SYN = $(DATA_MULTI) $(PUBTABNET_M) \
    $(SYN_MARKET_M) $(SYN_FIN_M) $(SYN_SPARSE_M) $(SYN_PUB_M)
PUB_SYN_FIN = $(DATA_MULTI) \
    $(PUBTABNET_M) $(FINTABNET_M) \
    $(SYN_MARKET_M) $(SYN_FIN_M) $(SYN_SPARSE_M) $(SYN_PUB_M)
PUB_SYN_PUB1M = $(DATA_MULTI) \
    $(PUBTABNET_M) $(PUBTABLES_M) \
    $(SYN_MARKET_M) $(SYN_FIN_M) $(SYN_SPARSE_M) $(SYN_PUB_M)
SYN = $(DATA_MULTI) \
    $(SYN_MARKET_M) $(SYN_FIN_M) $(SYN_SPARSE_M) $(SYN_PUB_M)

# One synthetic source at a time.
SYN_fin    = $(DATA_MULTI) $(SYN_FIN_M)
SYN_market = $(DATA_MULTI) $(SYN_MARKET_M)
SYN_pub    = $(DATA_MULTI) $(SYN_PUB_M)
SYN_sparse = $(DATA_MULTI) $(SYN_SPARSE_M)
#
# Vocab
#
# One variable per vocab choice.
VOCAB_NONE = vocab=empty
VOCAB_HTML = vocab=html
VOCAB_BBOX = vocab=bbox
VOCAB_CELL = vocab=cell

#
# Trainer
#

# Trainer type.
TRAINER_VQVAE = trainer=vqvae
TRAINER_BEIT  = trainer=beit
TRAINER_TABLE = trainer=table

# Input image size: two-element list assigned to trainer.img_size.
I224     = ++trainer.img_size=[224,224]
I448     = ++trainer.img_size=[448,448]
I112_448 = ++trainer.img_size=[112,448]

# Maximum sequence length.
# NOTE(review): unlike the other trainer settings these carry no "++" prefix;
# confirm that is intentional.
SEQ200  = trainer.max_seq_len=200
SEQ512  = trainer.max_seq_len=512
SEQ1024 = trainer.max_seq_len=1024
# Batch size + epoch.
# Each BATCH<n> applies the same batch size to the train and the valid
# dataloader.
BATCH24 = \
    ++trainer.train.dataloader.batch_size=24 \
    ++trainer.valid.dataloader.batch_size=24
BATCH48 = \
    ++trainer.train.dataloader.batch_size=48 \
    ++trainer.valid.dataloader.batch_size=48
BATCH72 = \
    ++trainer.train.dataloader.batch_size=72 \
    ++trainer.valid.dataloader.batch_size=72
BATCH80 = \
    ++trainer.train.dataloader.batch_size=80 \
    ++trainer.valid.dataloader.batch_size=80
BATCH96 = \
    ++trainer.train.dataloader.batch_size=96 \
    ++trainer.valid.dataloader.batch_size=96
BATCH256 = \
    ++trainer.train.dataloader.batch_size=256 \
    ++trainer.valid.dataloader.batch_size=256
BATCH384 = \
    ++trainer.train.dataloader.batch_size=384 \
    ++trainer.valid.dataloader.batch_size=384

EPOCH24 = ++trainer.train.epochs=24
EPOCH30 = ++trainer.train.epochs=30
EPOCH48 = ++trainer.train.epochs=48

# Optimizer.
OPT_ADAMW = trainer/train/optimizer=adamw
OPT_WD5e2 = ++trainer.train.optimizer.weight_decay=5e-2

# Learning rate + scheduler.
LR_5e4 = ++trainer.train.optimizer.lr=5e-4
LR_3e4 = ++trainer.train.optimizer.lr=3e-4
LR_1e4 = ++trainer.train.optimizer.lr=1e-4
LR_8e5 = ++trainer.train.optimizer.lr=8e-5

# LR_cosine selects the cosine scheduler with min_ratio=5e-3. The variants
# below append total_step and warmup; their names abbreviate those two values.
LR_cosine = \
    trainer/train/lr_scheduler=cosine \
    ++trainer.train.lr_scheduler.lr_lambda.min_ratio=5e-3
LR_cosine93k_warm6k = $(LR_cosine) \
    ++trainer.train.lr_scheduler.lr_lambda.total_step=93400 \
    ++trainer.train.lr_scheduler.lr_lambda.warmup=5800
LR_cosine77k_warm8k = $(LR_cosine) \
    ++trainer.train.lr_scheduler.lr_lambda.total_step=76600 \
    ++trainer.train.lr_scheduler.lr_lambda.warmup=7660
LR_cosine30k_warm4k = $(LR_cosine) \
    ++trainer.train.lr_scheduler.lr_lambda.total_step=30500 \
    ++trainer.train.lr_scheduler.lr_lambda.warmup=4000
LR_cosine8k_warm1k = $(LR_cosine) \
    ++trainer.train.lr_scheduler.lr_lambda.total_step=7600 \
    ++trainer.train.lr_scheduler.lr_lambda.warmup=800
LR_cosine44k_warm6k = $(LR_cosine) \
    ++trainer.train.lr_scheduler.lr_lambda.total_step=44100 \
    ++trainer.train.lr_scheduler.lr_lambda.warmup=5500
LR_cosine118k_warm15k = $(LR_cosine) \
    ++trainer.train.lr_scheduler.lr_lambda.total_step=117800 \
    ++trainer.train.lr_scheduler.lr_lambda.warmup=14700
LR_cosine216k_warm27k = $(LR_cosine) \
    ++trainer.train.lr_scheduler.lr_lambda.total_step=216000 \
    ++trainer.train.lr_scheduler.lr_lambda.warmup=27000
# No warmup (warmup=0).
LR_cosine32k = $(LR_cosine) \
    ++trainer.train.lr_scheduler.lr_lambda.total_step=32000 \
    ++trainer.train.lr_scheduler.lr_lambda.warmup=0
LR_cosine118k = $(LR_cosine) \
    ++trainer.train.lr_scheduler.lr_lambda.total_step=118000 \
    ++trainer.train.lr_scheduler.lr_lambda.warmup=0

GRAD_CLIP12 = ++trainer.train.grad_clip=12
# vqvae: starting temperature, minimum temperature and anneal rate.
VQVAE_TEMP_1M = \
    ++trainer.train.starting_temp=1. \
    ++trainer.train.temp_min=5e-3 \
    ++trainer.train.temp_anneal_rate=1e-3
VQVAE_TEMP_2M = \
    ++trainer.train.starting_temp=1. \
    ++trainer.train.temp_min=1e-3 \
    ++trainer.train.temp_anneal_rate=2e-4

# Pretraining specific.
TRANS448_VQVAE224_GRID28_MASK300 = \
    ++trainer.trans_size=[448,448] \
    ++trainer.vqvae_size=[224,224] \
    ++trainer.grid_size=28 \
    ++trainer.num_mask_patches=300
# MODEL_VQVAE and MODEL_VQVAE_L are defined further down in this file; these
# two use recursive "=" so the reference is resolved when the variable is
# expanded, not here.
VQVAE1M_WEIGHTS = $(MODEL_VQVAE) \
    ++trainer.vqvae_weights="../unitable_weights/vqvae_1m.pt"
VQVAE2M_WEIGHTS = $(MODEL_VQVAE_L) \
    ++trainer.vqvae_weights="../unitable_weights/vqvae_2m.pt"

# Finetuning specific: pretrained weight files, plus the freeze_beit_epoch
# setting.
WEIGHTS_mtim_1m_base = \
    ++trainer.trainer.beit_pretrained_weights="../unitable_weights/ssp_1m_base.pt"
WEIGHTS_mtim_1m_large = \
    ++trainer.trainer.beit_pretrained_weights="../unitable_weights/ssp_1m_large.pt"
WEIGHTS_mtim_2m_base = \
    ++trainer.trainer.beit_pretrained_weights="../unitable_weights/ssp_2m_base.pt"
WEIGHTS_mtim_2m_large = \
    ++trainer.trainer.beit_pretrained_weights="../unitable_weights/ssp_2m_large.pt"
LOCK_MTIM_4 = ++trainer.trainer.freeze_beit_epoch=4
#
# Models
#
# Model type.
# MODEL_VQVAE_L is MODEL_VQVAE plus codebook_tokens=16384 and hidden_dim=512.
MODEL_VQVAE           = model=vqvae
MODEL_VQVAE_L         = $(MODEL_VQVAE) \
    ++model.codebook_tokens=16384 \
    ++model.hidden_dim=512
MODEL_BEIT            = model=beit
MODEL_ENCODER_DECODER = model=encoderdecoder

# Backbone for input preprocessing: resnet, linear projection, and convstem.
IMGCNN      = model/model/backbone=imgcnn
IMGLINEAR   = model/model/backbone=imglinear
IMGCONVSTEM = model/model/backbone=imgconvstem

# Number of layers (E* = encoder, D* = decoder).
E4  = ++model.model.encoder.nlayer=4
E12 = ++model.model.encoder.nlayer=12
E24 = ++model.model.encoder.nlayer=24
D4  = ++model.model.decoder.nlayer=4

# Transformer layer: attention heads, hidden size, activation, norm.
FF4        = ++model.ff_ratio=4
NHEAD8     = ++model.nhead=8
NHEAD12    = ++model.nhead=12
NORM_FIRST = ++model.norm_first=true
NORM_LAST  = ++model.norm_first=false
ACT_RELU   = ++model.activation="relu"
ACT_GELU   = ++model.activation="gelu"
D_MODEL512 = ++model.d_model=512
D_MODEL768 = ++model.d_model=768

# Regularization (dropout).
REG_d00 = ++model.dropout=0.0
REG_d02 = ++model.dropout=0.2

# Linear projection patch size.
P16 = ++model.backbone_downsampling_factor=16
P28 = ++model.backbone_downsampling_factor=28
P32 = ++model.backbone_downsampling_factor=32

# CNN backbone.
R18 = \
    ++model.model.backbone.backbone._target_=torchvision.models.resnet18 \
    ++model.model.backbone.output_channels=512

# Presets. BASE and LARGE differ in head count (8 vs 12), d_model (512 vs 768)
# and encoder depth (4 vs 12); ARCH_* append the encoder-decoder model and a
# 4-layer decoder to the matching MTIM_* preset.
MTIM_BASE = \
    $(MODEL_BEIT) $(IMGLINEAR) $(NHEAD8) $(FF4) $(ACT_GELU) \
    $(NORM_FIRST) $(D_MODEL512) $(REG_d02) $(P16) $(E4)
MTIM_LARGE = \
    $(MODEL_BEIT) $(IMGLINEAR) $(NHEAD12) $(FF4) $(ACT_GELU) \
    $(NORM_FIRST) $(D_MODEL768) $(REG_d02) $(P16) $(E12)
ARCH_BASE  = $(MTIM_BASE) $(MODEL_ENCODER_DECODER) $(D4)
ARCH_LARGE = $(MTIM_LARGE) $(MODEL_ENCODER_DECODER) $(D4)
###############################################
# Experiments #
###############################################
# Settings shared by the vqvae experiments. ":=" expands the referenced
# variables right here, so they must already be defined above this line.
TRAIN_vqvae := \
    $(VOCAB_NONE) $(LABEL_IMAGE) $(AUG_VQVAE) $(I224) \
    $(TRAINER_VQVAE) $(OPT_ADAMW) $(LR_1e4) $(EPOCH24)
# Settings shared by the mtim experiments.
TRAIN_mtim := \
    $(VOCAB_NONE) $(LABEL_IMAGE) $(AUG_BEIT) \
    $(TRAINER_BEIT) $(OPT_ADAMW) $(OPT_WD5e2) $(LR_5e4) \
    $(TRANS448_VQVAE224_GRID28_MASK300)

#
# mini_pubtabnet pretraining example (dataset code: mini)
#

# vq-vae
# > make experiments/vqvae_mini/.done_pretrain
EXP_vqvae_mini := $(TRAIN_vqvae) $(MINIPUBTABNET) \
    $(VQVAE_TEMP_2M) $(BATCH80) $(MODEL_VQVAE) $(LR_cosine32k)

# visual encoder pretraining - masked tabular image modeling (MTIM)
# > make experiments/mtim_mini_base/.done_pretrain
EXP_mtim_mini_base := $(TRAIN_mtim) $(MINIPUBTABNET) \
    $(VQVAE2M_WEIGHTS) $(MTIM_BASE) \
    $(BATCH384) $(LR_cosine8k_warm1k) $(EPOCH24)
#
# mini_pubtabnet finetuning example
#
# Table structure (task code: html).
# > make experiments/ssp_2m_mini_html_base/.done_finetune
TRAIN_mini_html := \
    $(VOCAB_HTML) $(MINIPUBTABNET) $(LABEL_HTML) $(AUG_RESIZE_NORM) \
    $(TRAINER_TABLE) $(I448) $(SEQ512) \
    $(EPOCH48) $(OPT_ADAMW) $(OPT_WD5e2) $(LR_8e5)
EXP_ssp_2m_mini_html_base := \
    $(TRAIN_mini_html) $(ARCH_BASE) $(WEIGHTS_mtim_2m_base) \
    $(LOCK_MTIM_4) $(BATCH72) $(LR_cosine93k_warm6k)

# Table cell bbox (task code: bbox).
# > make experiments/ssp_2m_mini_bbox_base/.done_finetune
TRAIN_mini_bbox := \
    $(VOCAB_BBOX) $(MINIPUBTABNET) $(LABEL_BBOX) $(AUG_RESIZE_NORM) \
    $(TRAINER_TABLE) $(I448) $(SEQ1024) \
    $(EPOCH30) $(OPT_ADAMW) $(OPT_WD5e2) $(LR_3e4) $(GRAD_CLIP12)
EXP_ssp_2m_mini_bbox_base := \
    $(TRAIN_mini_bbox) $(ARCH_BASE) $(WEIGHTS_mtim_2m_base) \
    $(LOCK_MTIM_4) $(BATCH48) $(LR_cosine77k_warm8k)

# Table cell content (task code: cell).
# > make experiments/ssp_2m_mini_cell_base/.done_finetune
TRAIN_mini_cell := \
    $(VOCAB_CELL) $(MINIPUBTABNET) $(LABEL_CELL) $(AUG_RESIZE_NORM) \
    $(TRAINER_TABLE) $(I112_448) $(SEQ200) \
    $(EPOCH24) $(OPT_ADAMW) $(OPT_WD5e2) $(LR_8e5) $(GRAD_CLIP12)
EXP_ssp_2m_mini_cell_base := \
    $(TRAIN_mini_cell) $(ARCH_BASE) $(WEIGHTS_mtim_2m_base) \
    $(LOCK_MTIM_4) $(BATCH24) $(LR_cosine216k_warm27k)
#
# cross-dataset pretraining
#
# vq-vae: the 1M run uses MODEL_VQVAE, the 2M run uses MODEL_VQVAE_L.
EXP_vqvae_1M := $(TRAIN_vqvae) $(DATA_VQVAE_1M) \
    $(VQVAE_TEMP_1M) $(BATCH80) $(MODEL_VQVAE) $(LR_cosine32k)
EXP_vqvae_2M := $(TRAIN_vqvae) $(DATA_VQVAE_2M) \
    $(VQVAE_TEMP_2M) $(BATCH48) $(MODEL_VQVAE_L) $(LR_cosine118k)

# Visual encoder pretraining: base and large presets, on the PUB_SYN (1M)
# and DATA_VQVAE_2M (2M) dataset combinations.
EXP_mtim_1M_base := \
    $(TRAIN_mtim) $(PUB_SYN) $(VQVAE1M_WEIGHTS) $(MTIM_BASE) \
    $(BATCH384) $(LR_cosine8k_warm1k) $(EPOCH24)
EXP_mtim_1M_large := \
    $(TRAIN_mtim) $(PUB_SYN) $(VQVAE1M_WEIGHTS) $(MTIM_LARGE) \
    $(BATCH96) $(LR_cosine30k_warm4k) $(EPOCH24)
EXP_mtim_2M_base := \
    $(TRAIN_mtim) $(DATA_VQVAE_2M) $(VQVAE2M_WEIGHTS) $(MTIM_BASE) \
    $(BATCH256) $(LR_cosine44k_warm6k) $(EPOCH48)
EXP_mtim_2M_large := \
    $(TRAIN_mtim) $(DATA_VQVAE_2M) $(VQVAE2M_WEIGHTS) $(MTIM_LARGE) \
    $(BATCH96) $(LR_cosine118k_warm15k) $(EPOCH48)
#
# cross-dataset finetuning
#
# Table structure.
# The usage line names the experiment defined below (EXP_<name>), following
# the same pattern as the mini_pubtabnet examples in this file.
# > make experiments/ssp_2m_syn_pub_html_large/.done_finetune
TRAIN_syn_pub_html := $(VOCAB_HTML) \
    $(PUB_SYN) $(LABEL_HTML) $(AUG_RESIZE_NORM) \
    $(TRAINER_TABLE) $(I448) $(SEQ512) \
    $(EPOCH48) $(OPT_ADAMW) $(OPT_WD5e2) $(LR_8e5)
EXP_ssp_2m_syn_pub_html_large := $(TRAIN_syn_pub_html) $(ARCH_LARGE) \
    $(WEIGHTS_mtim_2m_large) $(LOCK_MTIM_4) $(BATCH72) $(LR_cosine93k_warm6k)

# Table cell bbox.
# > make experiments/ssp_2m_syn_pub_bbox_large/.done_finetune
TRAIN_syn_pub_bbox := $(VOCAB_BBOX) \
    $(PUB_SYN) $(LABEL_BBOX) $(AUG_RESIZE_NORM) \
    $(TRAINER_TABLE) $(I448) $(SEQ1024) \
    $(EPOCH30) $(OPT_ADAMW) $(OPT_WD5e2) $(LR_3e4) $(GRAD_CLIP12)
EXP_ssp_2m_syn_pub_bbox_large := $(TRAIN_syn_pub_bbox) $(ARCH_LARGE) \
    $(WEIGHTS_mtim_2m_large) $(LOCK_MTIM_4) $(BATCH48) $(LR_cosine77k_warm8k)
# Table cell content.
# > make experiments/ssp_2m_syn_pub_pub1m_cell_large/.done_finetune
TRAIN_syn_pub_pub1m_cell := $(VOCAB_CELL) \
    $(PUB_SYN_PUB1M) $(LABEL_CELL) $(AUG_RESIZE_NORM) \
    $(TRAINER_TABLE) $(I112_448) $(SEQ200) \
    $(EPOCH24) $(OPT_ADAMW) $(OPT_WD5e2) $(LR_8e5) $(GRAD_CLIP12)
# ARCH_LARGE (12 heads, d_model 768, 12 encoder layers) is paired with the
# *_large pretrained weights, as in the other *_large experiments in this
# file; the *_base weights are the ones used with ARCH_BASE.
EXP_ssp_2m_syn_pub_pub1m_cell_large := $(TRAIN_syn_pub_pub1m_cell) $(ARCH_LARGE) \
    $(WEIGHTS_mtim_2m_large) $(LOCK_MTIM_4) $(BATCH24) $(LR_cosine216k_warm27k)