requesting an ONNX model for ViClip-InternVid-10M-FLT.pth


Hello, and first of all thank you for the repository. I was trying to convert the ViClip-InternVid-10M-FLT.pth model to ONNX for a personal use case and am having difficulty converting it. I would appreciate it if you could convert it or reply with the steps to convert it correctly.

Thank you.

It might be a long time already, but it seems we need to split the model into its text and vision encoders. We also need to modify the vision part to disable torch.utils.checkpoint during export, since gradient checkpointing cannot be traced by the ONNX exporter. By the way, I used the base model instead of the large one, so you might need to change a few things.
In ViCLIP-vision.py:

import torch
import torch.nn as nn
from torch.utils import checkpoint

# ResidualAttentionBlock is defined earlier in ViCLIP-vision.py.
class Transformer(nn.Module):
    def __init__(self, width, layers, heads, drop_path=0., checkpoint_num=0, dropout=0.):
        super().__init__()
        # Stochastic-depth rates increase linearly across layers.
        dpr = [x.item() for x in torch.linspace(0, drop_path, layers)]
        self.resblocks = nn.ModuleList()
        for idx in range(layers):
            self.resblocks.append(ResidualAttentionBlock(width, heads, drop_path=dpr[idx], dropout=dropout))
        self.checkpoint_num = checkpoint_num

    def forward(self, x):
        if torch.onnx.is_in_onnx_export():
            # Gradient checkpointing cannot be traced, so run the blocks directly during export.
            for blk in self.resblocks:
                x = blk(x)
        else:
            # Original behavior: checkpoint the first `checkpoint_num` blocks to save memory.
            for idx, blk in enumerate(self.resblocks):
                if idx < self.checkpoint_num:
                    x = checkpoint.checkpoint(blk, x)
                else:
                    x = blk(x)
        return x

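If you want to sanity-check that pattern in isolation before editing the real file, here is a minimal self-contained sketch. TinyBlock and TinyTransformer are made-up stand-ins for ViCLIP's modules, not part of the repo; the point is only that torch.onnx.is_in_onnx_export() returns True while the exporter traces, so the checkpointed path is never hit.

import torch
import torch.nn as nn
from torch.utils import checkpoint

class TinyBlock(nn.Module):
    # Hypothetical stand-in for ResidualAttentionBlock.
    def __init__(self, width):
        super().__init__()
        self.fc = nn.Linear(width, width)

    def forward(self, x):
        return x + self.fc(x)

class TinyTransformer(nn.Module):
    def __init__(self, width=16, layers=4, checkpoint_num=2):
        super().__init__()
        self.resblocks = nn.ModuleList([TinyBlock(width) for _ in range(layers)])
        self.checkpoint_num = checkpoint_num

    def forward(self, x):
        if torch.onnx.is_in_onnx_export():
            # Checkpointing bypassed while the ONNX exporter traces the model.
            for blk in self.resblocks:
                x = blk(x)
        else:
            for idx, blk in enumerate(self.resblocks):
                if idx < self.checkpoint_num:
                    x = checkpoint.checkpoint(blk, x)
                else:
                    x = blk(x)
        return x

m = TinyTransformer().eval()
torch.onnx.export(m, torch.randn(2, 16), "tiny.onnx", opset_version=17)  # exports without tracer errors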

And for the export script:

import torch

# get_clip is the helper in this repo that builds the full ViCLIP model.
model, _ = get_clip(name='viclip')
model.eval()

# --- Text encoder ---
tm = model.text_encoder
dummy_input = torch.randint(low=0, high=10000, size=(1, 32), dtype=torch.int32)  # random token IDs, sequence length 32
torch.onnx.export(
    tm,
    dummy_input,
    "ViCLIP-B-text.onnx",
    export_params=True,        # store trained parameters in the ONNX file
    opset_version=17,          # ONNX opset version (adjust as needed)
    do_constant_folding=True,  # fold constants for a smaller, faster graph
    input_names=["input"],
    output_names=["output"],
    dynamic_axes={"input": {0: "batch_size"}, "output": {0: "batch_size"}},  # enable dynamic batching
)
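You can sanity-check the text export with onnxruntime before moving on. This is just a sketch: it assumes onnxruntime is installed (pip install onnxruntime) and that the text encoder returns a single feature tensor, as the export above implies.

import numpy as np
import onnxruntime as ort

tokens = np.random.randint(0, 10000, size=(4, 32), dtype=np.int32)  # batch of 4 exercises the dynamic batch axis
sess = ort.InferenceSession("ViCLIP-B-text.onnx")
onnx_out = sess.run(None, {"input": tokens})[0]

with torch.no_grad():
    torch_out = tm(torch.from_numpy(tokens)).numpy()
print(np.abs(onnx_out - torch_out).max())  # expect a small value, e.g. below 1e-4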
# --- Vision encoder ---
vm = model.vision_encoder
# Expected input: (batch, channels, num_frames, height, width) = (1, 3, 8, 224, 224), float32.
dummy_input = torch.randn(size=(1, 3, 8, 224, 224), dtype=torch.float32)
torch.onnx.export(
    vm,
    dummy_input,
    "ViCLIP-B-vision.onnx",
    export_params=True,        # store trained parameters in the ONNX file
    opset_version=14,          # ONNX opset version (adjust as needed)
    do_constant_folding=True,  # fold constants for a smaller, faster graph
    input_names=["input"],
    output_names=["output"],
    dynamic_axes={"input": {0: "batch_size", 2: "num_frames"}, "output": {0: "batch_size"}},
)
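And the same kind of check for the vision side. Note that the num_frames axis is declared dynamic above, but whether frame counts other than 8 actually work depends on the model's temporal position embeddings, so this sketch only varies the batch size.

import numpy as np
import onnxruntime as ort

video = np.random.randn(2, 3, 8, 224, 224).astype(np.float32)  # batch of 2, 8 frames
sess = ort.InferenceSession("ViCLIP-B-vision.onnx")
onnx_out = sess.run(None, {"input": video})[0]

with torch.no_grad():
    torch_out = vm(torch.from_numpy(video)).numpy()
print(np.abs(onnx_out - torch_out).max())  # expect close agreement with the PyTorch model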
