# Configuration overrides layered on top of the base config listed in `_base_`.
# Only the keys set here are specified by this file; everything else is
# expected to come from the base config (not visible here).
_base_ = [
    'svtr-tiny_20e_st_mj.py',
]

model = dict(
    # Preprocessor output size; same (48, 160) pair as the encoder `img_size`.
    preprocessor=dict(output_image_size=(48, 160)),
    encoder=dict(
        img_size=[48, 160],
        max_seq_len=40,
        out_channels=256,
        # Three-entry settings: one value per entry of `depth`.
        embed_dims=[128, 256, 384],
        depth=[3, 6, 9],
        num_heads=[4, 8, 12],
        # 8 'Local' followed by 10 'Global' entries = 18 in total, which
        # equals sum(depth) (3 + 6 + 9).
        # NOTE(review): presumably one mixer type per encoder block --
        # confirm against the encoder implementation before changing `depth`.
        mixer_types=['Local'] * 8 + ['Global'] * 10,
    ),
    # Matches the encoder's `out_channels` above.
    decoder=dict(in_channels=256),
)

train_dataloader = dict(batch_size=256)