DEFAULT_TEST_DATASET = dict(
    flickr=dict(
        filename='./reactiondata/real_test.jsonl',
        image_folder='./reaction_image',
        template_file='./config/_base_/dataset/template/reaction.json',
        type='FlickrDataset'),
    reg=dict(
        filename='./reactiondata/train_OCR.jsonl',
        image_folder='./reaction_image_OCR',
        template_file='./config/_base_/dataset/template/OCR.json',
        type='REGDataset'))

DEFAULT_TRAIN_DATASET = dict(
    flickr=dict(
        filename='./reactiondata/reaction_real_structed.jsonl',
        image_folder='./reaction_image',
        template_file='./config/_base_/dataset/template/reaction.json',
        type='FlickrDataset'),
    reg=dict(
        filename='./reactiondata/train_OCR.jsonl',
        image_folder='./reaction_image_OCR',
        template_file='./config/_base_/dataset/template/OCR.json',
        type='REGDataset'))
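
# DEFAULT_TEST_DATASET and DEFAULT_TRAIN_DATASET act as a small registry of
# reusable per-task dataset configs; the cfgs lists inside data_args below
# repeat the same entries. A minimal sketch of resolving such an entry by its
# 'type' key follows; the build_dataset helper and the dataset_classes mapping
# are illustrative assumptions, not the project's actual builder code.
def build_dataset(cfg, dataset_classes):
    # Look up the class named by cfg['type'] and pass the remaining keys
    # (filename, image_folder, template_file) as constructor kwargs.
    kwargs = {k: v for k, v in cfg.items() if k != 'type'}
    return dataset_classes[cfg['type']](**kwargs)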

data_args = dict(
    collator_kwargs=dict(max_length=1024, padding=True),
    compute_metric=None,
    gen_kwargs=dict(max_new_tokens=1024, num_beams=1),
    test=None,
    train=dict(
        cfgs=[
            dict(
                filename='./reactiondata/train_OCR.jsonl',
                image_folder='./reaction_image_OCR',
                template_file='./config/_base_/dataset/template/OCR.json',
                type='REGDataset'),
            dict(
                filename='./reactiondata/reaction_real_structed.jsonl',
                image_folder='./reaction_image',
                template_file='./config/_base_/dataset/template/reaction.json',
                type='FlickrDataset'),
        ],
        probabilities=[0.0, 1],
        seed=None,
        stopping_strategy='first_exhausted',
        type='InterleaveDateset'),
    validation=dict(
        cfgs=[
            dict(
                filename='./reactiondata/real_test.jsonl',
                image_folder='./reaction_image',
                template_file='./config/_base_/dataset/template/reaction.json',
                type='FlickrDataset'),
        ],
        type='ConcatDatasetWithShuffle'))
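
# The train block above mirrors the signature of Hugging Face
# datasets.interleave_datasets (probabilities, seed, stopping_strategy), so a
# toy, self-contained sketch of that sampling behaviour is shown here; the
# in-memory datasets are placeholders, not the reaction data. With weights
# [0.0, 1] only the second (reaction) dataset is ever sampled, and
# 'first_exhausted' stops the mixture once one source runs out.
from datasets import Dataset, interleave_datasets

_ocr_toy = Dataset.from_dict({'source': ['ocr'] * 4})
_reaction_toy = Dataset.from_dict({'source': ['reaction'] * 4})
_mixed_toy = interleave_datasets(
    [_ocr_toy, _reaction_toy],
    probabilities=[0.0, 1.0],
    seed=None,
    stopping_strategy='first_exhausted')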

model_args = dict(
    cache_dir=None,
    conv_args=dict(
        conv_template='vicuna_v1.1',
        tokenize_kwargs=dict(truncation_size=2048)),
    freeze_backbone=False,
    freeze_mm_mlp_adapter=False,
    gen_kwargs_set_bos_token_id=True,
    gen_kwargs_set_eos_token_id=True,
    gen_kwargs_set_pad_token_id=True,
    image_token_len=300,
    mm_use_im_start_end=True,
    mm_vision_select_layer=-2,
    model_max_length=2048,
    model_name_or_path='./exp/reaction_4.2.1',
    pretrain_mm_mlp_adapter=None,
    process_func_args=dict(
        conv=dict(type='ShikraConvProcess'),
        image=dict(type='ShikraImageProcessor'),
        target=dict(type='BoxFormatProcess'),
        text=dict(type='ShikraTextProcess')),
    sep_image_conv_front=False,
    target_processor=dict(boxes=dict(type='PlainBoxFormatter')),
    tune_mm_mlp_adapter=False,
    type='shikra',
    version='v1',
    vision_tower='SenseTime/deformable-detr')
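
# The gen_kwargs_set_*_token_id flags above suggest the tokenizer's special
# token ids are passed to generation alongside gen_kwargs. A hedged sketch of
# that call, assuming `model` is a loaded shikra checkpoint, `tokenizer` its
# tokenizer, and `inputs` a tokenized batch; this is not the project's actual
# inference loop.
def generate_with_special_ids(model, tokenizer, inputs):
    return model.generate(
        **inputs,
        max_new_tokens=1024,
        num_beams=1,
        bos_token_id=tokenizer.bos_token_id,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.pad_token_id)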

training_args = dict(
    bf16=True,
    dataloader_num_workers=4,
    do_eval=False,
    do_predict=False,
    do_train=True,
    evaluation_strategy='no',
    fsdp='full_shard auto_wrap',
    fsdp_transformer_layer_cls_to_wrap='LlamaDecoderLayer',
    gradient_accumulation_steps=1,
    gradient_checkpointing=True,
    learning_rate=2e-05,
    logging_steps=10,
    lr_scheduler_type='cosine',
    num_train_epochs=50,
    output_dir='./exp/reaction_4.2.2-large',
    overwrite_output_dir=False,
    per_device_eval_batch_size=4,
    per_device_train_batch_size=4,
    predict_with_generate=True,
    remove_unused_columns=False,
    report_to='none',
    save_steps=10000,
    save_strategy='steps',
    save_total_limit=1,
    seed=42,
    tf32=True,
    warmup_ratio=0.03,
    weight_decay=0.05)
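
# Every key in training_args matches a field of
# transformers.Seq2SeqTrainingArguments (predict_with_generate is the
# Seq2Seq-specific one), so the dict can be expanded straight into that
# dataclass. A minimal sketch, assuming a Hugging Face style trainer consumes
# this config; bf16/tf32 need suitable GPU support at construction time.
from transformers import Seq2SeqTrainingArguments

hf_training_args = Seq2SeqTrainingArguments(**training_args)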