import torch
from transformers import PreTrainedModel, XLMRobertaConfig, XLMRobertaModel


class MCLIPConfig(XLMRobertaConfig):
    """Configuration for M-CLIP: an XLM-RoBERTa text encoder whose pooled
    output is projected linearly into the image-embedding space."""

    model_type = "M-CLIP"

    def __init__(self, transformerDimSize=1024, imageDimSize=768, **kwargs):
        # Hidden size of the text transformer and dimensionality of the
        # target image-embedding space, respectively.
        self.transformerDimensions = transformerDimSize
        self.numDims = imageDimSize
        super().__init__(**kwargs)


class MultilingualCLIP(PreTrainedModel):
    config_class = MCLIPConfig

    def __init__(self, config, *args, **kwargs):
        super().__init__(config, *args, **kwargs)
        self.transformer = XLMRobertaModel(config)
        # Maps the pooled text embedding into the image-embedding space.
        self.LinearTransformation = torch.nn.Linear(
            in_features=config.transformerDimensions, out_features=config.numDims
        )

    def forward(self, input_ids, attention_mask):
        # Last hidden states of the text transformer: (batch, seq_len, hidden_size).
        embs = self.transformer(input_ids=input_ids, attention_mask=attention_mask)[0]
        # Masked mean pooling: average token embeddings over non-padding positions.
        embs2 = (embs * attention_mask.unsqueeze(2)).sum(dim=1) / attention_mask.sum(dim=1)[:, None]
        # Return the projected sentence embedding alongside the raw token states.
        return self.LinearTransformation(embs2), embs
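
# Minimal usage sketch, assuming a published M-CLIP checkpoint such as
# "M-CLIP/XLM-Roberta-Large-Vit-B-32" is available on the Hugging Face Hub
# and matches this architecture; substitute any compatible checkpoint. The
# example sentences are illustrative only.
if __name__ == "__main__":
    from transformers import AutoTokenizer

    model_name = "M-CLIP/XLM-Roberta-Large-Vit-B-32"  # assumed checkpoint name
    model = MultilingualCLIP.from_pretrained(model_name)
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model.eval()

    texts = ["A photo of a dog.", "Ein Foto von einem Hund."]
    batch = tokenizer(texts, padding=True, return_tensors="pt")
    with torch.no_grad():
        projected, token_embs = model(batch["input_ids"], batch["attention_mask"])
    # projected: (batch, config.numDims) sentence embeddings in the image space;
    # token_embs: (batch, seq_len, config.transformerDimensions) raw token states.
    print(projected.shape, token_embs.shape)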