requesting an ONNX model for ViClip-InternVid-10M-FLT.pth


Hello, and first of all thank you for the repository. I was trying to convert the ViClip-InternVid-10M-FLT.pth model to ONNX for a personal use case and am having difficulty converting it. I would appreciate it if you could convert it or reply with the steps to convert it correctly.

Thank you.

It might be a long time already, but it seems we need to split the model into its text and vision encoders. We also need to modify the vision part to disable torch.utils.checkpoint during export, since gradient checkpointing cannot be traced by the ONNX exporter. By the way, I used the base model instead of the large one, so you might need to change a few things.
In ViCLIP-vision.py:

import torch
import torch.nn as nn
from torch.utils import checkpoint

# ResidualAttentionBlock is defined earlier in ViCLIP-vision.py.
class Transformer(nn.Module):
    def __init__(self, width, layers, heads, drop_path=0., checkpoint_num=0, dropout=0.):
        super().__init__()
        # Stochastic-depth rates increase linearly across layers.
        dpr = [x.item() for x in torch.linspace(0, drop_path, layers)]
        self.resblocks = nn.ModuleList()
        for idx in range(layers):
            self.resblocks.append(ResidualAttentionBlock(width, heads, drop_path=dpr[idx], dropout=dropout))
        self.checkpoint_num = checkpoint_num

    def forward(self, x):
        if torch.onnx.is_in_onnx_export():
            # Gradient checkpointing cannot be traced, so run the blocks directly during export.
            for blk in self.resblocks:
                x = blk(x)
        else:
            # Original behavior: checkpoint the first `checkpoint_num` blocks to save memory.
            for idx, blk in enumerate(self.resblocks):
                if idx < self.checkpoint_num:
                    x = checkpoint.checkpoint(blk, x)
                else:
                    x = blk(x)
        return x

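If you want to sanity-check that pattern in isolation before editing the real file, here is a minimal self-contained sketch. TinyBlock and TinyTransformer are made-up stand-ins for ViCLIP's modules, not part of the repo; the point is only that torch.onnx.is_in_onnx_export() returns True while the exporter traces, so the checkpointed path is never hit.

import torch
import torch.nn as nn
from torch.utils import checkpoint

class TinyBlock(nn.Module):
    # Hypothetical stand-in for ResidualAttentionBlock.
    def __init__(self, width):
        super().__init__()
        self.fc = nn.Linear(width, width)

    def forward(self, x):
        return x + self.fc(x)

class TinyTransformer(nn.Module):
    def __init__(self, width=16, layers=4, checkpoint_num=2):
        super().__init__()
        self.resblocks = nn.ModuleList([TinyBlock(width) for _ in range(layers)])
        self.checkpoint_num = checkpoint_num

    def forward(self, x):
        if torch.onnx.is_in_onnx_export():
            # Checkpointing bypassed while the ONNX exporter traces the model.
            for blk in self.resblocks:
                x = blk(x)
        else:
            for idx, blk in enumerate(self.resblocks):
                if idx < self.checkpoint_num:
                    x = checkpoint.checkpoint(blk, x)
                else:
                    x = blk(x)
        return x

m = TinyTransformer().eval()
torch.onnx.export(m, torch.randn(2, 16), "tiny.onnx", opset_version=17)  # exports without tracer errors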

And for the export script:

import torch

# get_clip is the helper in this repo that builds the full ViCLIP model.
model, _ = get_clip(name='viclip')
model.eval()

# --- Text encoder ---
tm = model.text_encoder
dummy_input = torch.randint(low=0, high=10000, size=(1, 32), dtype=torch.int32)  # random token IDs, sequence length 32
torch.onnx.export(
    tm,
    dummy_input,
    "ViCLIP-B-text.onnx",
    export_params=True,        # store trained parameters in the ONNX file
    opset_version=17,          # ONNX opset version (adjust as needed)
    do_constant_folding=True,  # fold constants for a smaller, faster graph
    input_names=["input"],
    output_names=["output"],
    dynamic_axes={"input": {0: "batch_size"}, "output": {0: "batch_size"}},  # enable dynamic batching
)
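You can sanity-check the text export with onnxruntime before moving on. This is just a sketch: it assumes onnxruntime is installed (pip install onnxruntime) and that the text encoder returns a single feature tensor, as the export above implies.

import numpy as np
import onnxruntime as ort

tokens = np.random.randint(0, 10000, size=(4, 32), dtype=np.int32)  # batch of 4 exercises the dynamic batch axis
sess = ort.InferenceSession("ViCLIP-B-text.onnx")
onnx_out = sess.run(None, {"input": tokens})[0]

with torch.no_grad():
    torch_out = tm(torch.from_numpy(tokens)).numpy()
print(np.abs(onnx_out - torch_out).max())  # expect a small value, e.g. below 1e-4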
# --- Vision encoder ---
vm = model.vision_encoder
# Expected input: (batch, channels, num_frames, height, width) = (1, 3, 8, 224, 224), float32.
dummy_input = torch.randn(size=(1, 3, 8, 224, 224), dtype=torch.float32)
torch.onnx.export(
    vm,
    dummy_input,
    "ViCLIP-B-vision.onnx",
    export_params=True,        # store trained parameters in the ONNX file
    opset_version=14,          # ONNX opset version (adjust as needed)
    do_constant_folding=True,  # fold constants for a smaller, faster graph
    input_names=["input"],
    output_names=["output"],
    dynamic_axes={"input": {0: "batch_size", 2: "num_frames"}, "output": {0: "batch_size"}},
)
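And the same kind of check for the vision side. Note that the num_frames axis is declared dynamic above, but whether frame counts other than 8 actually work depends on the model's temporal position embeddings, so this sketch only varies the batch size.

import numpy as np
import onnxruntime as ort

video = np.random.randn(2, 3, 8, 224, 224).astype(np.float32)  # batch of 2, 8 frames
sess = ort.InferenceSession("ViCLIP-B-vision.onnx")
onnx_out = sess.run(None, {"input": video})[0]

with torch.no_grad():
    torch_out = vm(torch.from_numpy(video)).numpy()
print(np.abs(onnx_out - torch_out).max())  # expect close agreement with the PyTorch model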
