update all file v1
This view is limited to 50 files because it contains too many changes.
- Model/AttDes/__init__.py +4 -0
- Model/AttDes/__pycache__/__init__.cpython-38.pyc +0 -0
- Model/AttDes/dataset/data_loader.py +170 -0
- Model/AttDes/models/AttDes.py +17 -0
- Model/AttDes/models/Chinese_tokenizer.pth +3 -0
- Model/AttDes/models/__init__.py +1 -0
- Model/AttDes/models/__pycache__/__init__.cpython-38.pyc +0 -0
- Model/AttDes/models/language_model/bert.py +50 -0
- Model/AttDes/models/prefixLM.py +108 -0
- Model/AttDes/models/resblock.py +353 -0
- Model/AttDes/models/tokenizer.py +92 -0
- Model/AttDes/models/transformer.py +291 -0
- Model/AttDes/models/visual_model/Chinese_tokenizer.pth +3 -0
- Model/AttDes/models/visual_model/backbone.py +121 -0
- Model/AttDes/models/visual_model/position_encoding.py +89 -0
- Model/AttDes/validate_local.py +399 -0
- Model/AttDes/validate_local_gennerate.py +332 -0
- Model/CLIP/cn_clip/__init__.py +0 -0
- Model/CLIP/cn_clip/__pycache__/__init__.cpython-38.pyc +0 -0
- Model/CLIP/cn_clip/clip/__init__.py +5 -0
- Model/CLIP/cn_clip/clip/__pycache__/__init__.cpython-38.pyc +0 -0
- Model/CLIP/cn_clip/clip/__pycache__/bert_tokenizer.cpython-38.pyc +0 -0
- Model/CLIP/cn_clip/clip/__pycache__/utils.cpython-38.pyc +0 -0
- Model/CLIP/cn_clip/clip/bert_tokenizer.py +436 -0
- Model/CLIP/cn_clip/clip/configuration_bert.py +84 -0
- Model/CLIP/cn_clip/clip/model.py +504 -0
- Model/CLIP/cn_clip/clip/model_configs/RBT3-chinese.json +13 -0
- Model/CLIP/cn_clip/clip/model_configs/RN50.json +7 -0
- Model/CLIP/cn_clip/clip/model_configs/RoBERTa-wwm-ext-base-chinese.json +13 -0
- Model/CLIP/cn_clip/clip/model_configs/RoBERTa-wwm-ext-large-chinese.json +13 -0
- Model/CLIP/cn_clip/clip/model_configs/ViT-B-16.json +7 -0
- Model/CLIP/cn_clip/clip/model_configs/ViT-B-32.json +7 -0
- Model/CLIP/cn_clip/clip/model_configs/ViT-H-14.json +8 -0
- Model/CLIP/cn_clip/clip/model_configs/ViT-L-14-336.json +7 -0
- Model/CLIP/cn_clip/clip/model_configs/ViT-L-14.json +7 -0
- Model/CLIP/cn_clip/clip/model_configs/for_learn.py +16 -0
- Model/CLIP/cn_clip/clip/modeling_bert.py +460 -0
- Model/CLIP/cn_clip/clip/utils.py +196 -0
- Model/CLIP/cn_clip/clip/vocab.txt +0 -0
- Model/CLIP/cn_clip/eval/__init__.py +0 -0
- Model/CLIP/cn_clip/eval/data.py +167 -0
- Model/CLIP/cn_clip/eval/evaluation.py +157 -0
- Model/CLIP/cn_clip/eval/evaluation_tr.py +157 -0
- Model/CLIP/cn_clip/eval/extract_features.py +205 -0
- Model/CLIP/cn_clip/eval/imagenet_zeroshot_templates.py +194 -0
- Model/CLIP/cn_clip/eval/make_topk_predictions.py +88 -0
- Model/CLIP/cn_clip/eval/make_topk_predictions_tr.py +88 -0
- Model/CLIP/cn_clip/eval/transform_ir_annotation_to_tr.py +36 -0
- Model/CLIP/cn_clip/eval/zeroshot_evaluation.py +189 -0
- Model/CLIP/cn_clip/preprocess/__init__.py +0 -0
Model/AttDes/__init__.py
ADDED
@@ -0,0 +1,4 @@
import Model.AttDes.models
import Model.AttDes.dataset
Model/AttDes/__pycache__/__init__.cpython-38.pyc
ADDED
Binary file (208 Bytes).
Model/AttDes/dataset/data_loader.py
ADDED
@@ -0,0 +1,170 @@
import os
import re
import time

import cv2
import sys
import json

import matplotlib.pyplot as plt
import torch
from torch import nn
import numpy as np
import pandas as pd
import os.path as osp
import scipy.io as sio
import torch.utils.data as data
from PIL import Image
import matplotlib.image as mping
import torchvision.transforms as transforms

from PIL import Image
from pytorch_pretrained_bert.tokenization import BertTokenizer


def get_data_from_csv(path):
    data_csv = pd.read_csv(path, encoding='utf-8')
    # print(data_csv)
    pic_id_list = data_csv['pic_id'].values
    seg_id_list = data_csv['seg_id'].values
    object_list = data_csv['object'].values
    segment_list = data_csv['segment'].values
    adj_list = data_csv['adj'].values
    des_list = data_csv['des'].values

    return pic_id_list, seg_id_list, object_list, segment_list, adj_list, des_list

class AttDesDataset(data.Dataset):

    def __init__(self, data_root, dataset_name, img_root, dataset_split='train', transform=None,
                 bert_model='bert-base-chinese',
                 des_len=256, obj_len=8, tgt_len=32
                 ):
        self.images = []
        self.data_root = data_root
        self.dataset_name = dataset_name
        self.transform = transform
        self.img_root = img_root
        self.tokenizer = BertTokenizer.from_pretrained(bert_model)
        self.des_len = des_len
        self.obj_len = obj_len
        self.tgt_len = tgt_len
        assert self.transform is not None
        self.pic_id_list, self.seg_id_list, self.object_list, self.segment_list, self.adj_list, self.des_list = \
            get_data_from_csv(self.data_root)
        self.data_csv = pd.read_csv(data_root, encoding='utf-8')

    def get_data_from_csv_by_id(self, id, dict=None):
        pic_id_list = self.data_csv['pic_id'].values
        # {id: des_}
        des_list = self.data_csv['des'].values
        start_time = time.time()
        for i in range(len(pic_id_list)):
            if str(pic_id_list[i]) == str(id):
                # print("find: str(pic_id_list[i]) == str(id)", time.time() - start_time)
                return des_list[i]
        return ""

    def get_img_from_id(self, img_id):
        img_filename = self.img_root
        img_filename = img_filename + '/' + str(img_id) + '.jpg'
        img = Image.open(img_filename)
        if self.transform:
            img = self.transform(img)
        return img

    def encode_text_bert(self, text):
        tokens = []
        tokens.append("[CLS]")
        token_obj = self.tokenizer.tokenize(text)
        for token in token_obj:
            tokens.append(token)
        tokens.append("[SEP]")
        tokens = self.tokenizer.convert_tokens_to_ids(tokens)
        return tokens

    def get_all_from_id(self, img_id, obj_given):
        img_id = str(img_id)
        if img_id[0] == '#':
            des = ""
        else:
            des = self.get_data_from_csv_by_id(img_id)
        img = self.get_img_from_id(img_id)
        des = self.encode_text_bert(des)
        obj_given = self.encode_text_bert(obj_given)
        while(len(des) < self.des_len):
            des.append(100)
        while(len(obj_given) < self.obj_len):
            obj_given.append(0)
        assert len(des) == self.des_len
        return img, torch.from_numpy(np.array(des)), torch.from_numpy(np.array(obj_given))

    def __getitem__(self, idx):
        img_id = self.pic_id_list[idx]
        img = self.get_img_from_id(img_id)

        # des = self.des_list[idx].split('[,,;]')
        des = re.split(',|;', str(self.des_list[idx]))
        masked_des = "" # chinese
        for i in range(len(des)):
            if i != int(self.seg_id_list[idx]):
                masked_des = masked_des + des[i] + ' '

        obj = self.object_list[idx] # chinese
        segment = self.segment_list[idx] # chinese
        masked_des = self.encode_text_bert(masked_des)
        obj = self.encode_text_bert(obj)
        segment = self.encode_text_bert(segment)
        while(len(masked_des) < self.des_len):
            masked_des.append(100)
        while(len(obj) < self.obj_len):
            obj.append(0)
        while(len(segment) < self.tgt_len):
            segment.append(0)

        assert len(masked_des) == self.des_len
        assert len(obj) == self.obj_len
        assert len(segment) == self.tgt_len
        return img, np.array(masked_des), np.array(obj), np.array(segment), img_id

    def __len__(self):
        return len(self.pic_id_list)


if __name__ == '__main__':
    data_root = r'E:\data\Download\fur\dataset\data_for_test1.csv'
    split_root = ''
    dataset_name = 'Furniture'
    #
    # get_data_from_csv(data_root)
    # img_id = 550709
    # img = get_img_from_id(img_id)
    # plt.imshow(img)
    # plt.show()
    normalize = transforms.Normalize(mean=[0, 0, 0],
                                     std=[1, 1, 1])
    dataset = AttDesDataset(data_root, dataset_name, transform=transforms.Compose([
        transforms.Resize((448,448)),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        normalize,
    ]))
    img, masked_des, obj, segment = dataset.__getitem__(100)

    img_show = np.zeros((len(img[0]), len(img[0][0]), 3))
    img_show[:, :, 0] = img[0]
    img_show[:, :, 1] = img[1]
    img_show[:, :, 2] = img[2]
    plt.imshow(img_show)
    plt.show()
    print(masked_des, len(masked_des))
    print(obj, len(obj))
    print(segment, len(segment))
    print(dataset.__len__())

    # sentence_for_test = "原木地板的厚实与白色纱幔的轻飘营造朴素和浪漫的氛围,而一张编织餐椅灵动轻巧"
    # tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
    # tokenizer.tokenize(sentence_for_test)
    # print(tokenizer.tokenize(sentence_for_test))
    # print(tokenizer.convert_tokens_to_ids(sentence_for_test))
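Note: a minimal sketch of how AttDesDataset might be wrapped in a standard torch DataLoader for training; the CSV and image paths below are placeholders, and the transform mirrors the __main__ block above.

# Hypothetical usage sketch (paths are placeholders, not part of the commit).
import torch.utils.data as data
import torchvision.transforms as transforms
from Model.AttDes.dataset.data_loader import AttDesDataset

transform = transforms.Compose([
    transforms.Resize((448, 448)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0, 0, 0], std=[1, 1, 1]),
])

dataset = AttDesDataset(
    data_root='path/to/data.csv',   # placeholder CSV with pic_id/seg_id/object/segment/adj/des columns
    dataset_name='Furniture',
    img_root='path/to/images',      # directory containing <pic_id>.jpg files
    transform=transform,
)

# Each batch yields (img, masked_des, obj, segment, img_id), as produced by __getitem__.
loader = data.DataLoader(dataset, batch_size=8, shuffle=True, num_workers=2)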
Model/AttDes/models/AttDes.py
ADDED
@@ -0,0 +1,17 @@
import torch
import torch.nn as nn
import torch.nn.functional as F

from pytorch_pretrained_bert.modeling import BertModel


class AttDes(nn.Module):
    def __init__(self, args):
        super(AttDes, self).__init__()
        hidden_dim = args.AD_hidden_dim
Model/AttDes/models/Chinese_tokenizer.pth
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:2403030d0e018aedffec4a62d69b124c350a5b1ef03035395dbcb3593deca8dd
size 142959
Model/AttDes/models/__init__.py
ADDED
@@ -0,0 +1 @@
Model/AttDes/models/__pycache__/__init__.cpython-38.pyc
ADDED
Binary file (145 Bytes).
Model/AttDes/models/language_model/bert.py
ADDED
@@ -0,0 +1,50 @@
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
"""
Backbone modules.
"""
from collections import OrderedDict

import torch
import torch.nn.functional as F

from torch import nn
from typing import Dict, List

from pytorch_pretrained_bert.modeling import BertModel
from utils.misc import NestedTensor


class BERT(nn.Module):
    def __init__(self, name: str, train_bert: bool, hidden_dim: int, max_len: int, enc_num):
        super().__init__()
        if name == 'bert-base-uncased':
            self.num_channels = 768
        else:
            self.num_channels = 1024
        self.enc_num = enc_num

        self.bert = BertModel.from_pretrained(name)

        if not train_bert:
            for parameter in self.bert.parameters():
                parameter.requires_grad_(False)

    def forward(self, tensor_list: NestedTensor):

        if self.enc_num > 0:
            all_encoder_layers, _ = self.bert(tensor_list.tensors, token_type_ids=None, attention_mask=tensor_list.mask)
            # use the output of the X-th transformer encoder layers
            xs = all_encoder_layers[self.enc_num - 1]
        else:
            xs = self.bert.embeddings.word_embeddings(tensor_list.tensors)

        mask = tensor_list.mask.to(torch.bool)
        mask = ~mask
        out = NestedTensor(xs, mask)

        return out
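Note: a minimal sketch of how the BERT wrapper above might be driven. It assumes utils.misc.NestedTensor (not part of this commit) is a simple container built as NestedTensor(tensors, mask) and exposing .tensors and .mask, as forward() expects; the token ids below are dummies.

# Hypothetical usage sketch; NestedTensor construction is an assumption.
import torch
from utils.misc import NestedTensor
from Model.AttDes.models.language_model.bert import BERT

model = BERT('bert-base-uncased', train_bert=False, hidden_dim=256, max_len=40, enc_num=12)

token_ids = torch.randint(0, 30522, (2, 40))          # dummy ids for a batch of 2 sequences
attention_mask = torch.ones(2, 40, dtype=torch.long)  # 1 = real token, 0 = padding

out = model(NestedTensor(token_ids, attention_mask))
# out.tensors: (2, 40, 768) hidden states from the 12th encoder layer
# out.mask:    inverted padding mask, True where positions should be ignored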
Model/AttDes/models/prefixLM.py
ADDED
@@ -0,0 +1,108 @@
'''
author: yulong-XJTU
'''
import torch
from torch import nn
import torch.nn.functional as F
import copy
from AttDes.models.transformer import Transformer, subsequent_mask, ModelOne, Model005, Model006
from axial_positional_embedding import AxialPositionalEmbedding
from AttDes.models.resblock import BottleneckBlock
from random import randint
from einops import rearrange

def clone(module,N):
    '''copy the given module N times'''
    return nn.ModuleList([copy.deepcopy(module) for _ in range(N)])

class PrefixLM(nn.Module):
    def __init__(
        self, des_len, obj_len, tgt_len,
        d_model=512,
        input_resolution=224,
        patch_size=16,
        num_text_tokens=10000,
        txt_seq_len=256,
        prefix_txt_len=25,
        target_txt_len=52,
        max_trunc_txt_len=15,
        heads=8,
        enc_depth=12,
        dec_depth=12,
        d_ff=1024,
        dropout=0.,
    ):
        super(PrefixLM,self).__init__()
        assert input_resolution % patch_size==0 and max_trunc_txt_len<=prefix_txt_len and max_trunc_txt_len<txt_seq_len
        self.ResNet = nn.Sequential(*[nn.Conv2d(in_channels=3, out_channels=64, kernel_size=patch_size, stride=patch_size, bias=True),
                                      BottleneckBlock(in_channels=64,out_channels=256,bottleneck_channels=64,),
                                      BottleneckBlock(in_channels=256,out_channels=d_model,bottleneck_channels=128)])
        self.des_len = des_len
        self.obj_len = obj_len
        self.tgt_len = tgt_len
        self.txt_embed = nn.Embedding(num_text_tokens, d_model, padding_idx=0)
        self.txt_pos_embed = nn.Embedding(self.des_len,d_model)
        image_fmap_size = input_resolution // patch_size  # 448 // 16
        self.img_tokens_len=image_fmap_size ** 2
        # self.img_pos_embed=nn.Embedding(self.img_tokens_len,d_model)
        self.img_pos_embed = AxialPositionalEmbedding(d_model, axial_shape=(image_fmap_size, image_fmap_size))
        self.txt_seq_len = txt_seq_len
        self.target_txt_len = target_txt_len
        self.prefix_txt_len = prefix_txt_len

        self.max_trunc_txt_len=max_trunc_txt_len
        self.num_text_tokens = num_text_tokens
        self.dim_embed=d_model
        self.input_resolution=input_resolution
        self.patch_size=patch_size
        # self.temperature = nn.Parameter(torch.tensor(1.))  # not mentioned in the paper
        self.transformer=Transformer(d_model,heads,enc_depth,dec_depth,d_ff,dropout=dropout)
        self.ModelOne = Model005(d_model,heads,enc_depth,dec_depth,d_ff,dropout=dropout)
        self.to_logits = nn.Sequential(
            nn.LayerNorm(d_model),
            nn.Linear(d_model, self.num_text_tokens)
        )

    def forward(self, img, des, obj, tgt, return_loss=False):
        device = des.device
        n = des.shape[0]
        img_emed = self.ResNet(img)
        img_emed = rearrange(img_emed,'b c h w -> b (h w) c')
        img_emed = img_emed + self.img_pos_embed(img_emed)
        del img
        # add <CLS>; if you change the tokenizer, don't forget to change the token ID. Another [SEP] token is added at the end (in tokenizer.py, please check).
        tgt = F.pad(tgt, (1, 0), value=4)
        labels = tgt[:,1:]
        tgt = tgt[:,:-1]
        # print('des:', torch.min(des), torch.max(des))
        des_embed = self.txt_embed(des)
        des_embed = des_embed + self.txt_pos_embed(torch.arange(self.des_len, device=device))

        obj_embed = self.txt_embed(obj)
        obj_embed = obj_embed + self.txt_pos_embed(torch.arange(self.obj_len, device=device))

        tgt_embed = self.txt_embed(tgt)
        tgt_embed = tgt_embed + self.txt_pos_embed(torch.arange(self.tgt_len, device=device))
        tgt_mask = subsequent_mask(self.tgt_len).to(device)

        # baseline
        # prefix = torch.cat((img_emed, des_embed, obj_embed), dim=1)
        # tgt_mask = subsequent_mask(self.tgt_len).to(device)
        # out = self.transformer(prefix, tgt_embed, tgt_mask=tgt_mask)

        # ModelOne: call the Model005 instance constructed in __init__
        out = self.ModelOne(q=obj_embed, k=img_emed, v=img_emed,
                            tgt_embeded=tgt_embed, des_embed=des_embed, obj_embed=obj_embed, img_embed=img_emed,
                            tgt_mask=tgt_mask)

        logits = self.to_logits(out)
        return logits, labels
        # if not return_loss:
        #     return logits
        # # temp = self.temperature.exp()
        # logits = rearrange(logits, 'b n c -> b c n')
        # # logits=logits*temp  # with temperature parameter
        # loss=F.cross_entropy(logits,labels,ignore_index=0)
        # return loss
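Note: the training loss itself is left commented out at the end of forward(); the sketch below shows that same cross-entropy-over-vocabulary scheme applied to the (logits, labels) pair the model returns. The shapes are assumptions taken from the code above (batch, tgt_len, num_text_tokens).

# Hypothetical sketch of the commented-out loss computation, applied outside the model.
import torch
import torch.nn.functional as F
from einops import rearrange

def prefix_lm_loss(logits, labels):
    # logits: (batch, tgt_len, num_text_tokens), labels: (batch, tgt_len)
    logits = rearrange(logits, 'b n c -> b c n')            # cross_entropy expects (batch, classes, positions)
    return F.cross_entropy(logits, labels, ignore_index=0)  # token id 0 is the [PAD] id

# e.g. with dummy tensors:
logits = torch.randn(2, 32, 10000)
labels = torch.randint(1, 10000, (2, 32))
loss = prefix_lm_loss(logits, labels)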
Model/AttDes/models/resblock.py
ADDED
@@ -0,0 +1,353 @@
# Code adapted from https://github.com/facebookresearch/detectron2

from torch import nn
import torch
import torch.nn.functional as F

def c2_msra_fill(module: nn.Module) -> None:
    """
    Initialize `module.weight` using the "MSRAFill" implemented in Caffe2.
    Also initializes `module.bias` to 0.
    Args:
        module (torch.nn.Module): module to initialize.
    """
    nn.init.kaiming_normal_(module.weight, mode="fan_out", nonlinearity="relu")
    if module.bias is not None:
        # pyre-fixme[6]: Expected `Tensor` for 1st param but got `Union[nn.Module,
        # torch.Tensor]`.
        nn.init.constant_(module.bias, 0)

def get_norm(norm, out_channels):
    """
    Args:
        norm (str or callable): either one of BN, SyncBN, FrozenBN, GN;
            or a callable that takes a channel number and returns
            the normalization layer as a nn.Module.

    Returns:
        nn.Module or None: the normalization layer
    """
    if norm is None:
        return None
    if isinstance(norm, str):
        if len(norm) == 0:
            return None
        norm = {
            "BN": torch.nn.BatchNorm2d,
            # Fixed in https://github.com/pytorch/pytorch/pull/36382
            #"SyncBN": NaiveSyncBatchNorm if env.TORCH_VERSION <= (1, 5) else nn.SyncBatchNorm,
            "FrozenBN": FrozenBatchNorm2d,
            "GN": lambda channels: nn.GroupNorm(32, channels),
            # for debugging:
            "nnSyncBN": nn.SyncBatchNorm,
            #"naiveSyncBN": NaiveSyncBatchNorm,
            # expose stats_mode N as an option to caller, required for zero-len inputs
            #"naiveSyncBN_N": lambda channels: NaiveSyncBatchNorm(channels, stats_mode="N"),
        }[norm]
    return norm(out_channels)

class Conv2d(torch.nn.Conv2d):
    """
    A wrapper around :class:`torch.nn.Conv2d` to support empty inputs and more features.
    """

    def __init__(self, *args, **kwargs):
        """
        Extra keyword arguments supported in addition to those in `torch.nn.Conv2d`:

        Args:
            norm (nn.Module, optional): a normalization layer
            activation (callable(Tensor) -> Tensor): a callable activation function

        It assumes that norm layer is used before activation.
        """
        norm = kwargs.pop("norm", None)
        activation = kwargs.pop("activation", None)
        super().__init__(*args, **kwargs)

        self.norm = norm
        self.activation = activation

    def forward(self, x):
        # torchscript does not support SyncBatchNorm yet
        # https://github.com/pytorch/pytorch/issues/40507
        # and we skip these codes in torchscript since:
        # 1. currently we only support torchscript in evaluation mode
        # 2. features needed by exporting module to torchscript are added in PyTorch 1.6 or
        # later version, `Conv2d` in these PyTorch versions has already supported empty inputs.
        if not torch.jit.is_scripting():
            if x.numel() == 0 and self.training:
                # https://github.com/pytorch/pytorch/issues/12013
                assert not isinstance(
                    self.norm, torch.nn.SyncBatchNorm
                ), "SyncBatchNorm does not support empty inputs!"

        x = F.conv2d(
            x, self.weight, self.bias, self.stride, self.padding, self.dilation, self.groups
        )
        if self.norm is not None:
            x = self.norm(x)
        if self.activation is not None:
            x = self.activation(x)
        return x


class FrozenBatchNorm2d(nn.Module):
    """
    BatchNorm2d where the batch statistics and the affine parameters are fixed.

    It contains non-trainable buffers called
    "weight" and "bias", "running_mean", "running_var",
    initialized to perform identity transformation.

    The pre-trained backbone models from Caffe2 only contain "weight" and "bias",
    which are computed from the original four parameters of BN.
    The affine transform `x * weight + bias` will perform the equivalent
    computation of `(x - running_mean) / sqrt(running_var) * weight + bias`.
    When loading a backbone model from Caffe2, "running_mean" and "running_var"
    will be left unchanged as identity transformation.

    Other pre-trained backbone models may contain all 4 parameters.

    The forward is implemented by `F.batch_norm(..., training=False)`.
    """

    _version = 3

    def __init__(self, num_features, eps=1e-5):
        super().__init__()
        self.num_features = num_features
        self.eps = eps
        self.register_buffer("weight", torch.ones(num_features))
        self.register_buffer("bias", torch.zeros(num_features))
        self.register_buffer("running_mean", torch.zeros(num_features))
        self.register_buffer("running_var", torch.ones(num_features) - eps)

    def forward(self, x):
        if x.requires_grad:
            # When gradients are needed, F.batch_norm will use extra memory
            # because its backward op computes gradients for weight/bias as well.
            scale = self.weight * (self.running_var + self.eps).rsqrt()
            bias = self.bias - self.running_mean * scale
            scale = scale.reshape(1, -1, 1, 1)
            bias = bias.reshape(1, -1, 1, 1)
            out_dtype = x.dtype  # may be half
            return x * scale.to(out_dtype) + bias.to(out_dtype)
        else:
            # When gradients are not needed, F.batch_norm is a single fused op
            # and provide more optimization opportunities.
            return F.batch_norm(
                x,
                self.running_mean,
                self.running_var,
                self.weight,
                self.bias,
                training=False,
                eps=self.eps,
            )

    def _load_from_state_dict(
        self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs
    ):
        version = local_metadata.get("version", None)

        if version is None or version < 2:
            # No running_mean/var in early versions
            # This will silent the warnings
            if prefix + "running_mean" not in state_dict:
                state_dict[prefix + "running_mean"] = torch.zeros_like(self.running_mean)
            if prefix + "running_var" not in state_dict:
                state_dict[prefix + "running_var"] = torch.ones_like(self.running_var)

        super()._load_from_state_dict(
            state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs
        )

    def __repr__(self):
        return "FrozenBatchNorm2d(num_features={}, eps={})".format(self.num_features, self.eps)

    @classmethod
    def convert_frozen_batchnorm(cls, module):
        """
        Convert all BatchNorm/SyncBatchNorm in module into FrozenBatchNorm.

        Args:
            module (torch.nn.Module):

        Returns:
            If module is BatchNorm/SyncBatchNorm, returns a new module.
            Otherwise, in-place convert module and return it.

        Similar to convert_sync_batchnorm in
        https://github.com/pytorch/pytorch/blob/master/torch/nn/modules/batchnorm.py
        """
        bn_module = nn.modules.batchnorm
        bn_module = (bn_module.BatchNorm2d, bn_module.SyncBatchNorm)
        res = module
        if isinstance(module, bn_module):
            res = cls(module.num_features)
            if module.affine:
                res.weight.data = module.weight.data.clone().detach()
                res.bias.data = module.bias.data.clone().detach()
            res.running_mean.data = module.running_mean.data
            res.running_var.data = module.running_var.data
            res.eps = module.eps
        else:
            for name, child in module.named_children():
                new_child = cls.convert_frozen_batchnorm(child)
                if new_child is not child:
                    res.add_module(name, new_child)
        return res


class CNNBlockBase(nn.Module):
    """
    A CNN block is assumed to have input channels, output channels and a stride.
    The input and output of `forward()` method must be NCHW tensors.
    The method can perform arbitrary computation but must match the given
    channels and stride specification.

    Attribute:
        in_channels (int):
        out_channels (int):
        stride (int):
    """

    def __init__(self, in_channels, out_channels, stride):
        """
        The `__init__` method of any subclass should also contain these arguments.

        Args:
            in_channels (int):
            out_channels (int):
            stride (int):
        """
        super().__init__()
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.stride = stride

    def freeze(self):
        """
        Make this block not trainable.
        This method sets all parameters to `requires_grad=False`,
        and convert all BatchNorm layers to FrozenBatchNorm

        Returns:
            the block itself
        """
        for p in self.parameters():
            p.requires_grad = False
        FrozenBatchNorm2d.convert_frozen_batchnorm(self)
        return self

class BottleneckBlock(CNNBlockBase):
    """
    The standard bottleneck residual block used by ResNet-50, 101 and 152
    defined in :paper:`ResNet`. It contains 3 conv layers with kernels
    1x1, 3x3, 1x1, and a projection shortcut if needed.
    """

    def __init__(
        self,
        in_channels,
        out_channels,
        #*,
        bottleneck_channels,
        stride=1,
        num_groups=1,
        norm="BN",
        stride_in_1x1=False,
        dilation=1,
    ):
        """
        Args:
            bottleneck_channels (int): number of output channels for the 3x3
                "bottleneck" conv layers.
            num_groups (int): number of groups for the 3x3 conv layer.
            norm (str or callable): normalization for all conv layers.
                See :func:`layers.get_norm` for supported format.
            stride_in_1x1 (bool): when stride>1, whether to put stride in the
                first 1x1 convolution or the bottleneck 3x3 convolution.
            dilation (int): the dilation rate of the 3x3 conv layer.
        """
        super().__init__(in_channels, out_channels, stride)

        if in_channels != out_channels:
            self.shortcut = Conv2d(
                in_channels,
                out_channels,
                kernel_size=1,
                stride=stride,
                bias=False,
                norm=get_norm(norm, out_channels),
            )
        else:
            self.shortcut = None

        # The original MSRA ResNet models have stride in the first 1x1 conv
        # The subsequent fb.torch.resnet and Caffe2 ResNe[X]t implementations have
        # stride in the 3x3 conv
        stride_1x1, stride_3x3 = (stride, 1) if stride_in_1x1 else (1, stride)

        self.conv1 = Conv2d(
            in_channels,
            bottleneck_channels,
            kernel_size=1,
            stride=stride_1x1,
            bias=False,
            norm=get_norm(norm, bottleneck_channels),
        )

        self.conv2 = Conv2d(
            bottleneck_channels,
            bottleneck_channels,
            kernel_size=3,
            stride=stride_3x3,
            padding=1 * dilation,
            bias=False,
            groups=num_groups,
            dilation=dilation,
            norm=get_norm(norm, bottleneck_channels),
        )

        self.conv3 = Conv2d(
            bottleneck_channels,
            out_channels,
            kernel_size=1,
            bias=False,
            norm=get_norm(norm, out_channels),
        )

        for layer in [self.conv1, self.conv2, self.conv3, self.shortcut]:
            if layer is not None:  # shortcut can be None
                c2_msra_fill(layer)

        # Zero-initialize the last normalization in each residual branch,
        # so that at the beginning, the residual branch starts with zeros,
        # and each residual block behaves like an identity.
        # See Sec 5.1 in "Accurate, Large Minibatch SGD: Training ImageNet in 1 Hour":
        # "For BN layers, the learnable scaling coefficient γ is initialized
        # to be 1, except for each residual block's last BN
        # where γ is initialized to be 0."

        # nn.init.constant_(self.conv3.norm.weight, 0)
        # TODO this somehow hurts performance when training GN models from scratch.
        # Add it as an option when we need to use this code to train a backbone.

    def forward(self, x):
        out = self.conv1(x)
        out = F.relu_(out)

        out = self.conv2(out)
        out = F.relu_(out)

        out = self.conv3(out)

        if self.shortcut is not None:
            shortcut = self.shortcut(x)
        else:
            shortcut = x

        out += shortcut
        out = F.relu_(out)
        return out
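Note: a stand-alone sketch of the patchify-then-bottleneck stack that PrefixLM builds from these blocks (a strided Conv2d with stride equal to the patch size, followed by two BottleneckBlocks); the 224x224 input size and d_model of 512 are assumptions chosen for illustration.

# Hypothetical shape check for the BottleneckBlock stack used as PrefixLM.ResNet.
import torch
from torch import nn
from Model.AttDes.models.resblock import BottleneckBlock

stem = nn.Sequential(
    nn.Conv2d(in_channels=3, out_channels=64, kernel_size=16, stride=16, bias=True),  # 16x16 patchify
    BottleneckBlock(in_channels=64, out_channels=256, bottleneck_channels=64),
    BottleneckBlock(in_channels=256, out_channels=512, bottleneck_channels=128),
)
stem.eval()  # avoid BatchNorm batch-statistics issues on a tiny dummy batch

with torch.no_grad():
    feats = stem(torch.randn(1, 3, 224, 224))
print(feats.shape)  # torch.Size([1, 512, 14, 14]); flattened to 196 image tokens of width d_model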
Model/AttDes/models/tokenizer.py
ADDED
@@ -0,0 +1,92 @@
# take from https://github.com/openai/CLIP/blob/main/clip/simple_tokenizer.py
# to give users a quick easy start to training DALL-E without doing BPE

import torch


# from transformers import BertTokenizer

import html
import os
from functools import lru_cache
from pathlib import Path
import ftfy
import regex as re

# OpenAI simple tokenizer

@lru_cache()
def default_bpe():
    return os.path.join(os.path.dirname(os.path.abspath(__file__)), "data/bpe_simple_vocab_16e6.txt")

@lru_cache()
def bytes_to_unicode():
    bs = list(range(ord("!"), ord("~") + 1)) + list(range(ord("¡"), ord("¬") + 1)) + list(range(ord("®"), ord("ÿ") + 1))
    cs = bs[:]
    n = 0
    for b in range(2 ** 8):
        if b not in bs:
            bs.append(b)
            cs.append(2 ** 8 + n)
            n += 1
    cs = [chr(n) for n in cs]
    return dict(zip(bs, cs))

def get_pairs(word):
    pairs = set()
    prev_char = word[0]
    for char in word[1:]:
        pairs.add((prev_char, char))
        prev_char = char
    return pairs

def basic_clean(text):
    text = ftfy.fix_text(text)
    text = html.unescape(html.unescape(text))
    return text.strip()

def whitespace_clean(text):
    text = re.sub(r'\s+', ' ', text)
    text = text.strip()
    return text


# chinese tokenizer
class ChineseTokenizer:
    def __init__(self):
        tokenizer = torch.load('./models/Chinese_tokenizer.pth')  # BertTokenizer.from_pretrained('bert-base-chinese')
        self.tokenizer = tokenizer
        self.vocab_size = tokenizer.vocab_size+2

    def decode(self, tokens):
        if torch.is_tensor(tokens):
            tokens = tokens.tolist()

        tokens = [token for token in tokens if token not in (0,)]
        return self.tokenizer.decode(tokens)

    def encode(self, text,train=False):
        t=torch.tensor(self.tokenizer.encode(text, add_special_tokens=False))
        if train:
            return torch.cat([t,torch.tensor([5])],dim=-1)
        else:
            return t
    #special token: [CLS]==4,[SEP]==5, [PAD]==0,<bos>=7

    def tokenize(self, texts, context_length = 77, truncate_text = False,train=True):
        if isinstance(texts, str):
            texts = [texts]

        all_tokens = [self.encode(text,train=train) for text in texts]

        result = torch.zeros(len(all_tokens), context_length, dtype=torch.long)
        for i, tokens in enumerate(all_tokens):
            if len(tokens) > context_length:
                if truncate_text:
                    tokens = tokens[:context_length]
                else:
                    raise RuntimeError(f"Input {texts[i]} is too long for context length {context_length}")
            result[i, :len(tokens)] = torch.tensor(tokens)

        return result
Model/AttDes/models/transformer.py
ADDED
@@ -0,0 +1,291 @@
import torch
import copy
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import math
# helpers
def clone(module,N):
    '''copy the given module N times'''
    return nn.ModuleList([copy.deepcopy(module) for _ in range(N)])
def subsequent_mask(size):
    attn_shape=(1,size,size)
    subsequent_mask=np.triu(np.ones(attn_shape),k=1).astype(bool)
    return torch.from_numpy(subsequent_mask)==False


def attention(query, key, value, mask=None, dropout=None):
    d_k = query.size(-1)
    scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(d_k)
    if mask is not None:
        scores = scores.masked_fill(mask == 0, -1e9)
    p_attn = F.softmax(scores, dim=-1)
    if dropout is not None:
        p_attn = dropout(p_attn)
    return torch.matmul(p_attn, value), p_attn


class MultiHeadedAttention(nn.Module):
    def __init__(self, h, d_model, dropout=0.1):
        super(MultiHeadedAttention, self).__init__()
        assert d_model % h == 0
        self.d_k = d_model // h
        self.h = h
        self.linears = clone(nn.Linear(d_model, d_model), 4)
        self.attn = None
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, query, key, value, mask=None):
        if mask is not None:
            mask = mask.unsqueeze(1)
        '''print('q:',query)
        print('k:',key)
        print('v:',value)'''
        nbatchs = query.size(0)
        query, key, value = [l(x).view(nbatchs, -1, self.h, self.d_k).transpose(1, 2) \
                             for l, x in zip(self.linears, (query, key, value))]
        x, self.attn = attention(query, key, value, mask=mask, dropout=self.dropout)
        x = x.transpose(1, 2).contiguous().view(nbatchs, -1, self.h * self.d_k)
        return self.linears[-1](x)

class Feedforward(nn.Module):
    def __init__(self,d_model,d_ff,dropout=0.1):
        super(Feedforward,self).__init__()
        self.w_1=nn.Linear(d_model,d_ff)
        self.w_2=nn.Linear(d_ff,d_model)
        self.dropout=nn.Dropout(dropout)

    def forward(self,x):
        return self.w_2(self.dropout(F.relu((self.w_1(x)))))


class LayerNorm(nn.Module):
    def __init__(self,features,eps=1e-6):
        super(LayerNorm,self).__init__()
        self.a_2=nn.Parameter(torch.ones(features))
        self.b_2=nn.Parameter(torch.zeros(features))
        self.eps=eps

    def forward(self,x):
        mean=x.mean(-1,keepdim=True)
        std=x.std(-1,keepdim=True)
        return self.a_2*(x-mean)/(std+self.eps)+self.b_2


class Generator(nn.Module):
    def __init__(self,d_model,vocab):
        super(Generator,self).__init__()
        self.proj=nn.Linear(d_model,vocab)

    def forward(self,x):
        return F.log_softmax(self.proj(x),dim=-1)


# encoderLayer clone numbers times of enc_depth.
# repeat the encoderLayer enc_depth times
class Encoder(nn.Module):
    def __init__(self, layer, N):
        '''N encoder layers '''
        super(Encoder,self).__init__()
        self.layers = clone(layer, N)
        self.norm = LayerNorm(layer.size)

    def forward(self,x,mask=None):
        for layer in self.layers:
            x = layer(x, mask)
        return self.norm(x)


class SublayerConnection(nn.Module):
    '''LayerNorm +subLayer+dropout+residual connection'''
    def __init__(self,size,dropout):
        super(SublayerConnection,self).__init__()
        self.norm=LayerNorm(size)
        self.dropout=nn.Dropout(dropout)

    def forward(self,x,sublayer):
        return x+self.dropout(sublayer(self.norm(x)))


class EncoderLayer(nn.Module):
    def __init__(self,size,self_attn,feed_forward,dropout):
        '''size is the embedding dimension'''
        super(EncoderLayer,self).__init__()
        self.self_attn = self_attn
        self.feed_forward = feed_forward
        self.sublayer = clone(SublayerConnection(size,dropout),2)
        self.size = size

    def forward(self,x,mask=None):
        x = self.sublayer[0](x, lambda x: self.self_attn(x,x,x,mask))
        return self.sublayer[1](x, self.feed_forward)


class Decoder(nn.Module):
    def __init__(self,layer,N):
        super(Decoder,self).__init__()
        self.layers = clone(layer,N)
        self.norm = LayerNorm(layer.size)

    def forward(self,x, memory,src_mask=None,tgt_mask=None):
        for layer in self.layers:
            x = layer(x, memory, src_mask, tgt_mask)
        return self.norm(x)


class DecoderLayer(nn.Module):
    def __init__(self,size,self_attn,src_attn,feed_forward,dropout):
        super(DecoderLayer,self).__init__()
        self.size = size
        self.self_attn = self_attn
        self.src_attn = src_attn
        self.feed_forward = feed_forward
        self.sublayer = clone(SublayerConnection(size,dropout),3)

    def forward(self,x,memory,src_mask=None,tgt_mask=None):
        m = memory
        x = self.sublayer[0](x,lambda x: self.self_attn(x,x,x,tgt_mask))
        x = self.sublayer[1](x,lambda x: self.src_attn(x,m,m,src_mask))
        return self.sublayer[2](x,self.feed_forward)


class CrossAttLayer(nn.Module):
    def __init__(self,d_model,self_attn,feed_forward,dropout=0.1):
        super(CrossAttLayer, self).__init__()
        self.size = d_model
        self.self_attn = self_attn
        # self.self_attn_0 = copy.deepcopy(self_attn)
        self.feed_forward = feed_forward
        self.dropout = nn.Dropout(dropout)
        self.sublayer = clone(SublayerConnection(d_model, dropout), 2)
        # self.sublayer = clone(SublayerConnection(d_model,dropout),3)  # could be changed to three sublayers, with the first one being self_attn

    def forward(self,q,k,v,src_mask=None):
        # k = self.sublayer[0](k, lambda k: self.self_attn_0(k,k,k))
        # q = self.sublayer[0](q, lambda q: self.self_attn_0(q,q,q))
        # x = self.sublayer[1](q, lambda q: self.self_attn(q,k,k,src_mask))
        # x = self.sublayer[2](x, self.feed_forward)
        x = self.sublayer[0](q, lambda q: self.self_attn(q,k,k,src_mask))
        x = self.sublayer[1](x, self.feed_forward)
        return x


class CrossAtt(nn.Module):
    def __init__(self, crossAttlayer, N=1):
        super(CrossAtt, self).__init__()
        self.layers = clone(crossAttlayer,N)
        self.norm = LayerNorm(crossAttlayer.size)

    def forward(self, q, k, v, src_mask=None):
        for crossAttnLayer in self.layers:
            q = crossAttnLayer(q, k, v, src_mask)
        return self.norm(q)

class Transformer(nn.Module):
    def __init__(self,d_model=512,heads=8,enc_depth=8,dec_depth=8,d_ff=1024,dropout=0.1):
        super(Transformer,self).__init__()
        c = copy.deepcopy
        attn = MultiHeadedAttention(heads,d_model)
        ff = Feedforward(d_model,d_ff,dropout)
        self.encoder = Encoder(EncoderLayer(d_model,c(attn),c(ff),dropout),enc_depth)
        self.decoder = Decoder(DecoderLayer(d_model,c(attn),c(attn),c(ff),dropout),dec_depth)
        #self.register_buffer('src_mask', src_mask, persistent=False)
        #self.register_buffer('tgt_mask', tgt_mask, persistent=False)
        for p in self.parameters():
            if p.dim() > 1:
                nn.init.xavier_uniform_(p)

    def forward(self,src_embeded,tgt_embeded,src_mask=None,tgt_mask=None):
        return self.decode(self.encode(src_embeded,src_mask),tgt_embeded,src_mask,tgt_mask)

    def encode(self,src_embeded,src_mask=None):
        return self.encoder(src_embeded,src_mask)

    def decode(self,memory,tgt_embeded,src_mask=None,tgt_mask=None):
        return self.decoder(tgt_embeded,memory,src_mask,tgt_mask)


class ModelOne(nn.Module):
    def __init__(self,d_model=512,heads=8,enc_depth=8,dec_depth=8,d_ff=1024,dropout=0.1):
        super(ModelOne,self).__init__()
        c = copy.deepcopy
        attn = MultiHeadedAttention(heads,d_model)
        ff = Feedforward(d_model,d_ff,dropout)
        self.CrossAtt = CrossAtt(CrossAttLayer(d_model,c(attn),c(ff),dropout),N=1)
        self.encoder = Encoder(EncoderLayer(d_model,c(attn),c(ff),dropout),enc_depth)
        self.decoder = Decoder(DecoderLayer(d_model,c(attn),c(attn),c(ff),dropout),dec_depth)
        #self.register_buffer('src_mask', src_mask, persistent=False)
        #self.register_buffer('tgt_mask', tgt_mask, persistent=False)
        for p in self.parameters():
            if p.dim() > 1:
                nn.init.xavier_uniform_(p)

    def forward(self, q, k, v, tgt_embeded, des_embed, obj_embed, img_embed, src_mask=None, tgt_mask=None):
        # x = self.CrossAtt(q, img_embed, img_embed)
        # x2 = self.CrossAtt(q, des_embed, des_embed)
        des_embed_self = self.CrossAtt(des_embed, des_embed, des_embed)
        x3 = self.CrossAtt(img_embed, des_embed_self, des_embed_self)
        # src_embeded = torch.cat((x, des_embed, obj_embed), dim=1)
        src_embeded = torch.cat((x3, obj_embed), dim=1)
        x = self.encode(src_embeded,src_mask)
        x = self.decode(x, tgt_embeded,src_mask, tgt_mask)
        return x

    def encode(self,src_embeded,src_mask=None):
        return self.encoder(src_embeded,src_mask)

    def decode(self,memory,tgt_embeded,src_mask=None,tgt_mask=None):
        return self.decoder(tgt_embeded,memory,src_mask,tgt_mask)

class Model005(nn.Module):
    def __init__(self,d_model=512,heads=8,enc_depth=8,dec_depth=8,d_ff=1024,dropout=0.1):
        super(Model005,self).__init__()
        c = copy.deepcopy
        attn = MultiHeadedAttention(heads,d_model)
        ff = Feedforward(d_model,d_ff,dropout)
        self.CrossAtt = CrossAtt(CrossAttLayer(d_model,c(attn),c(ff),dropout),N=1)
        self.encoder = Encoder(EncoderLayer(d_model,c(attn),c(ff),dropout),enc_depth)
        self.decoder = Decoder(DecoderLayer(d_model,c(attn),c(attn),c(ff),dropout),dec_depth)
        for p in self.parameters():
            if p.dim() > 1:
                nn.init.xavier_uniform_(p)

    def forward(self, q, k, v, tgt_embeded, des_embed, obj_embed, img_embed, src_mask=None, tgt_mask=None):
        x = self.CrossAtt(q, img_embed, img_embed)
        src_embeded = torch.cat((x, des_embed, obj_embed), dim=1)
        x = self.encode(src_embeded,src_mask)
        x = self.decode(x, tgt_embeded,src_mask, tgt_mask)
        return x

    def encode(self,src_embeded,src_mask=None):
        return self.encoder(src_embeded,src_mask)

    def decode(self,memory,tgt_embeded,src_mask=None,tgt_mask=None):
        return self.decoder(tgt_embeded,memory,src_mask,tgt_mask)

class Model006(nn.Module):
    def __init__(self,d_model=512,heads=8,enc_depth=8,dec_depth=8,d_ff=1024,dropout=0.1):
        super(Model006,self).__init__()
        c = copy.deepcopy
        attn = MultiHeadedAttention(heads,d_model)
        ff = Feedforward(d_model,d_ff,dropout)
        self.CrossAtt = CrossAtt(CrossAttLayer(d_model,c(attn),c(ff),dropout),N=1)
        self.encoder = Encoder(EncoderLayer(d_model,c(attn),c(ff),dropout),enc_depth)
        self.decoder = Decoder(DecoderLayer(d_model,c(attn),c(attn),c(ff),dropout),dec_depth)
        for p in self.parameters():
            if p.dim() > 1:
                nn.init.xavier_uniform_(p)

    def forward(self, q, k, v, tgt_embeded, des_embed, obj_embed, img_embed, src_mask=None, tgt_mask=None):
        x = self.CrossAtt(img_embed, img_embed, img_embed)
        x = self.CrossAtt(obj_embed, x, x)
        src_embeded = torch.cat((x, des_embed, obj_embed), dim=1)
        x = self.encode(src_embeded,src_mask)
        x = self.decode(x, tgt_embeded,src_mask, tgt_mask)
        return x

    def encode(self,src_embeded,src_mask=None):
        return self.encoder(src_embeded,src_mask)

    def decode(self,memory,tgt_embeded,src_mask=None,tgt_mask=None):
        return self.decoder(tgt_embeded,memory,src_mask,tgt_mask)
Model/AttDes/models/visual_model/Chinese_tokenizer.pth
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:2403030d0e018aedffec4a62d69b124c350a5b1ef03035395dbcb3593deca8dd
size 142959
Model/AttDes/models/visual_model/backbone.py
ADDED
@@ -0,0 +1,121 @@
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
"""
Backbone modules.
"""
from collections import OrderedDict

import torch
import torch.nn.functional as F
import torchvision
from torch import nn
from torchvision.models._utils import IntermediateLayerGetter
from typing import Dict, List

from utils.misc import NestedTensor, is_main_process

from .position_encoding import build_position_encoding


class FrozenBatchNorm2d(torch.nn.Module):
    """
    BatchNorm2d where the batch statistics and the affine parameters are fixed.

    Copy-paste from torchvision.misc.ops with added eps before rqsrt,
    without which any other models than torchvision.models.resnet[18,34,50,101]
    produce nans.
    """

    def __init__(self, n):
        super(FrozenBatchNorm2d, self).__init__()
        self.register_buffer("weight", torch.ones(n))
        self.register_buffer("bias", torch.zeros(n))
        self.register_buffer("running_mean", torch.zeros(n))
        self.register_buffer("running_var", torch.ones(n))

    def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict,
                              missing_keys, unexpected_keys, error_msgs):
        num_batches_tracked_key = prefix + 'num_batches_tracked'
        if num_batches_tracked_key in state_dict:
            del state_dict[num_batches_tracked_key]

        super(FrozenBatchNorm2d, self)._load_from_state_dict(
            state_dict, prefix, local_metadata, strict,
            missing_keys, unexpected_keys, error_msgs)

    def forward(self, x):
        # move reshapes to the beginning
        # to make it fuser-friendly
        w = self.weight.reshape(1, -1, 1, 1)
        b = self.bias.reshape(1, -1, 1, 1)
        rv = self.running_var.reshape(1, -1, 1, 1)
        rm = self.running_mean.reshape(1, -1, 1, 1)
        eps = 1e-5
        scale = w * (rv + eps).rsqrt()
        bias = b - rm * scale
        return x * scale + bias


class BackboneBase(nn.Module):

    def __init__(self, name:str, backbone: nn.Module, num_channels: int, return_interm_layers: bool):
        super().__init__()
        for name, parameter in backbone.named_parameters():
            if 'layer2' not in name and 'layer3' not in name and 'layer4' not in name:
                parameter.requires_grad_(False)
        if return_interm_layers:
            return_layers = {"layer1": "0", "layer2": "1", "layer3": "2", "layer4": "3"}
        else:
            return_layers = {'layer4': "0"}
        self.body = IntermediateLayerGetter(backbone, return_layers=return_layers)
        self.num_channels = num_channels

    def forward(self, tensor_list: NestedTensor):
        xs = self.body(tensor_list.tensors)
        out: Dict[str, NestedTensor] = {}
        for name, x in xs.items():
            m = tensor_list.mask
            assert m is not None
            mask = F.interpolate(m[None].float(), size=x.shape[-2:]).to(torch.bool)[0]
            out[name] = NestedTensor(x, mask)
        return out


class Backbone(BackboneBase):
    """ResNet backbone with frozen BatchNorm."""
    def __init__(self, name: str,
                 return_interm_layers: bool,
                 dilation: bool):

        backbone = getattr(torchvision.models, name)(
            replace_stride_with_dilation=[False, False, dilation],
            pretrained=False, norm_layer=FrozenBatchNorm2d)
        # pretrained=is_main_process(), norm_layer=FrozenBatchNorm2d)
        assert name in ('resnet50', 'resnet101')
        num_channels = 2048
        super().__init__(name, backbone, num_channels, return_interm_layers)


class Joiner(nn.Sequential):
    def __init__(self, backbone, position_embedding):
        super().__init__(backbone, position_embedding)

    def forward(self, tensor_list: NestedTensor):
        xs = self[0](tensor_list)
        out: List[NestedTensor] = []
        pos = []
        for name, x in xs.items():
            out.append(x)
            # position encoding
            pos.append(self[1](x).to(x.tensors.dtype))

        return out, pos


def build_backbone(args):
    position_embedding = build_position_encoding(args)
    # train_backbone = args.lr_detr > 0
    return_interm_layers = False
    backbone = Backbone(args.backbone, return_interm_layers, args.dilation)
    model = Joiner(backbone, position_embedding)
    model.num_channels = backbone.num_channels
    return model
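Note: FrozenBatchNorm2d can be exercised on its own; the sketch below just shows that with its default buffers it applies a fixed per-channel affine transform that is effectively the identity. Driving build_backbone end to end would additionally require the utils.misc helpers, which are not part of this commit.

# Hypothetical sketch: with default buffers the transform is (x - 0) / sqrt(1 + eps) * 1 + 0.
import torch
from Model.AttDes.models.visual_model.backbone import FrozenBatchNorm2d

fbn = FrozenBatchNorm2d(8)
x = torch.randn(2, 8, 16, 16)
y = fbn(x)
print(torch.allclose(y, x, atol=1e-4))  # True up to the 1e-5 eps folded into the scale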
Model/AttDes/models/visual_model/position_encoding.py
ADDED
@@ -0,0 +1,89 @@
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
"""
Various positional encodings for the visual model.
"""
import math
import torch
from torch import nn

from utils.misc import NestedTensor


class PositionEmbeddingSine(nn.Module):
    """
    This is a more standard version of the position embedding, very similar to the one
    used by the Attention is all you need paper, generalized to work on images.
    """
    def __init__(self, num_pos_feats=64, temperature=10000, normalize=False, scale=None):
        super().__init__()
        self.num_pos_feats = num_pos_feats
        self.temperature = temperature
        self.normalize = normalize
        if scale is not None and normalize is False:
            raise ValueError("normalize should be True if scale is passed")
        if scale is None:
            scale = 2 * math.pi
        self.scale = scale

    def forward(self, tensor_list: NestedTensor):
        x = tensor_list.tensors
        mask = tensor_list.mask
        assert mask is not None
        not_mask = ~mask
        y_embed = not_mask.cumsum(1, dtype=torch.float32)
        x_embed = not_mask.cumsum(2, dtype=torch.float32)
        if self.normalize:
            eps = 1e-6
            y_embed = y_embed / (y_embed[:, -1:, :] + eps) * self.scale
            x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale

        dim_t = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device)
        dim_t = self.temperature ** (2 * (dim_t // 2) / self.num_pos_feats)

        pos_x = x_embed[:, :, :, None] / dim_t
        pos_y = y_embed[:, :, :, None] / dim_t
        pos_x = torch.stack((pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4).flatten(3)
        pos_y = torch.stack((pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4).flatten(3)
        pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2)
        return pos


class PositionEmbeddingLearned(nn.Module):
    """
    Absolute pos embedding, learned.
    """
    def __init__(self, num_pos_feats=256):
        super().__init__()
        self.row_embed = nn.Embedding(50, num_pos_feats)
        self.col_embed = nn.Embedding(50, num_pos_feats)
        self.reset_parameters()

    def reset_parameters(self):
        nn.init.uniform_(self.row_embed.weight)
        nn.init.uniform_(self.col_embed.weight)

    def forward(self, tensor_list: NestedTensor):
        x = tensor_list.tensors
        h, w = x.shape[-2:]
        i = torch.arange(w, device=x.device)
        j = torch.arange(h, device=x.device)
        x_emb = self.col_embed(i)
        y_emb = self.row_embed(j)
        pos = torch.cat([
            x_emb.unsqueeze(0).repeat(h, 1, 1),
            y_emb.unsqueeze(1).repeat(1, w, 1),
        ], dim=-1).permute(2, 0, 1).unsqueeze(0).repeat(x.shape[0], 1, 1, 1)
        return pos


def build_position_encoding(args):
    N_steps = args.hidden_dim // 2
    if args.position_embedding in ('v2', 'sine'):
        # TODO find a better way of exposing other arguments
        position_embedding = PositionEmbeddingSine(N_steps, normalize=True)
    elif args.position_embedding in ('v3', 'learned'):
        position_embedding = PositionEmbeddingLearned(N_steps)
    else:
        raise ValueError(f"not supported {args.position_embedding}")

    return position_embedding
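A small, hedged sketch of the sine encoding on its own; the all-False mask (every pixel valid) and the feature shape are assumptions for illustration:

import torch
from utils.misc import NestedTensor  # same class the module imports

feat = torch.rand(2, 256, 28, 28)                # e.g. a backbone feature map
mask = torch.zeros(2, 28, 28, dtype=torch.bool)  # False = valid position
pos = PositionEmbeddingSine(num_pos_feats=128, normalize=True)(NestedTensor(feat, mask))
print(pos.shape)  # torch.Size([2, 256, 28, 28]); channels = 2 * num_pos_feats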
Model/AttDes/validate_local.py
ADDED
@@ -0,0 +1,399 @@
import argparse
import datetime
import json
import random
import time
import math
import os
import pandas as pd
import numpy as np
from pathlib import Path
import torch
from nltk.translate import bleu_score
import sys
sys.path.append(r"E:\data\streamlit\Model\AttDes")
sys.path.append(r"E:\data\streamlit\Model\CLIP")
from AttDes import dataset
from AttDes.dataset import data_loader
from torch.utils.data import DataLoader, DistributedSampler
import torchvision.transforms as transforms
import AttDes.models as models
from AttDes.models import prefixLM, tokenizer

import nltk
import jieba
# from engine import train_one_epoch, validate
#
# import utils.misc as utils
# from models import __init__
# from dataset import build_dataset
# from engine import train_one_epoch, validate_txt

from einops import rearrange
from pytorch_pretrained_bert.tokenization import BertTokenizer

def get_args_parser():
    parser = argparse.ArgumentParser('Set parser', add_help=False)
    parser.add_argument('--device', default='cuda')
    # parser.add_argument('--gpu_id', default='0', type=str)

    # Dataset parameters
    parser.add_argument('--data_root', type=str, default='/hy-nas/zhanghe/data/fur/txt/data_for_test2.csv')
    parser.add_argument('--dataset_name', type=str, default='Furniture')
    parser.add_argument('--img_root', type=str, default='/hy-nas/zhanghe/data/fur/processed_img')
    parser.add_argument('--output_dir', default='./outputs/validate', help='path where to save, empty for no saving')
    parser.add_argument('--seed', default=2022, type=int)
    parser.add_argument('--resume', default='', help='resume for checkpoint')
    parser.add_argument('--bert_model', default='bert-base-chinese', type=str)
    parser.add_argument('--des_len', default=256, type=int)
    parser.add_argument('--obj_len', default=8, type=int)
    parser.add_argument('--tgt_len', default=35, type=int)

    # Train parameters
    parser.add_argument('--lr', default=1e-4, type=float)
    parser.add_argument('--batch_size', default=1, type=int)
    parser.add_argument('--weight_decay', default=1e-4, type=float)
    parser.add_argument('--optimizer', default='adamw', type=str)
    parser.add_argument('--lr_scheduler', default='step', type=str)
    parser.add_argument('--lr_drop', default=5, type=int)
    parser.add_argument('--start_epoch', default=0, type=int)
    parser.add_argument('--epochs', default=1, type=int)

    # Model parameters
    parser.add_argument('--AD_hidden_dim', default=256, type=int)
    parser.add_argument('--d_model', default=512, type=int)
    # visual_model parameters
    parser.add_argument('--backbone', default='resnet50', type=str,
                        help="Name of the convolutional backbone to use")

    return parser


def main(args):
    device = torch.device(args.device)

    # seed = args.seed
    # torch.manual_seed(seed)
    # np.random.seed(seed)
    # random.seed(seed)
    normalize = transforms.Normalize(mean=[0.5024, 0.4993, 0.4992],
                                     std=[0.1673, 0.1695, 0.1705])
    the_transforms = transforms.Compose([transforms.Resize((448, 448)),
                                         transforms.RandomHorizontalFlip(),
                                         transforms.ToTensor(),
                                         normalize,
                                         ])
    # use the `dataset` module imported above (the bare name `AttDes` is not bound in this file)
    dataset_all = dataset.data_loader.AttDesDataset(args.data_root, args.dataset_name,
                                                    des_len=args.des_len,
                                                    obj_len=args.obj_len,
                                                    tgt_len=args.tgt_len,
                                                    img_root=args.img_root,
                                                    transform=the_transforms)

    dataloader_val = DataLoader(dataset_all,
                                batch_size=args.batch_size,
                                shuffle=False)
    print("data loaded...")

    Tokenizer = tokenizer.ChineseTokenizer()
    PrefixLM_configure = dict(d_model=args.d_model, des_len=args.des_len, obj_len=args.obj_len, tgt_len=args.tgt_len,
                              input_resolution=448,
                              patch_size=16,
                              num_text_tokens=20000,
                              txt_seq_len=10000,
                              heads=4,
                              enc_depth=8,
                              dec_depth=8,
                              d_ff=1024,
                              dropout=0.1)
    model = prefixLM.PrefixLM(**PrefixLM_configure).to(device)
    model.load_state_dict(torch.load('./outputs/005/checkpoint0019.pth'))

    output_dir = Path(args.output_dir)
    with (output_dir / "log.txt").open("a") as f:
        f.write(str(args) + "\n")

    print("start validate...")
    start_time = time.time()
    # optimizer = torch.optim.AdamW(model.parameters(), lr=args.lr, weight_decay=args.weight_decay)
    # lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(optimizer, T_0=2000)
    # lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, args.lr_drop)
    for epoch in range(args.start_epoch, args.epochs):
        validate_txt(args, model, dataloader_val, device, batch_size=args.batch_size)

    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print('Validate time {}'.format(total_time_str))


def load_AttDes_Model(model_path, device):
    parser = argparse.ArgumentParser('AttDes training script', parents=[get_args_parser()])
    args = parser.parse_args()
    normalize = transforms.Normalize(mean=[0.5024, 0.4993, 0.4992],
                                     std=[0.1673, 0.1695, 0.1705])
    the_transforms = transforms.Compose([transforms.Resize((448, 448)),
                                         transforms.RandomHorizontalFlip(),
                                         transforms.ToTensor(),
                                         normalize,
                                         ])
    dataset_all = data_loader.AttDesDataset(args.data_root, args.dataset_name,
                                            des_len=args.des_len,
                                            obj_len=args.obj_len,
                                            tgt_len=args.tgt_len,
                                            img_root=args.img_root,
                                            transform=the_transforms)
    PrefixLM_configure = dict(d_model=args.d_model, des_len=args.des_len, obj_len=args.obj_len, tgt_len=args.tgt_len,
                              input_resolution=448,
                              patch_size=16,
                              num_text_tokens=20000,
                              txt_seq_len=10000,
                              heads=4,
                              enc_depth=8,
                              dec_depth=8,
                              d_ff=1024,
                              dropout=0.1)
    time_1 = time.time()
    model = prefixLM.PrefixLM(**PrefixLM_configure).to(device)
    model.load_state_dict(torch.load(model_path))
    tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
    time_2 = time.time()
    print('Load model takes {}s'.format(time_2 - time_1))
    return model, dataset_all, tokenizer


def validate(img1_id, img2_id, obj, model_path):
    parser = argparse.ArgumentParser('AttDes training script', parents=[get_args_parser()])
    args = parser.parse_args()
    device = torch.device(args.device)
    #
    # seed = args.seed
    # torch.manual_seed(seed)
    # np.random.seed(seed)
    # random.seed(seed)
    normalize = transforms.Normalize(mean=[0.5024, 0.4993, 0.4992],
                                     std=[0.1673, 0.1695, 0.1705])

    the_transforms = transforms.Compose([transforms.Resize((448, 448)),
                                         transforms.RandomHorizontalFlip(),
                                         transforms.ToTensor(),
                                         normalize,
                                         ])
    dataset_all = dataset.data_loader.AttDesDataset(args.data_root, args.dataset_name,
                                                    des_len=args.des_len,
                                                    obj_len=args.obj_len,
                                                    tgt_len=args.tgt_len,
                                                    img_root=args.img_root,
                                                    transform=the_transforms)
    PrefixLM_configure = dict(d_model=args.d_model, des_len=args.des_len, obj_len=args.obj_len, tgt_len=args.tgt_len,
                              input_resolution=448,
                              patch_size=16,
                              num_text_tokens=20000,
                              txt_seq_len=10000,
                              heads=4,
                              enc_depth=8,
                              dec_depth=8,
                              d_ff=1024,
                              dropout=0.1)
    time_1 = time.time()
    model = prefixLM.PrefixLM(**PrefixLM_configure).to(device)
    model.load_state_dict(torch.load(model_path))
    tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
    time_2 = time.time()
    print('Load model takes {}s'.format(time_2 - time_1))
    out_list = []

    label_txt, output1, output2, output3 = validate_one_img(model, dataset_all, img1_id, obj, device, tokenizer)
    out_list.append([label_txt, output1, output2, output3])
    label_txt, output1, output2, output3 = validate_one_img(model, dataset_all, img2_id, obj, device, tokenizer)
    out_list.append([label_txt, output1, output2, output3])
    return out_list

def get_data_from_csv_by_id(path, id):
    data_csv = pd.read_csv(path, encoding='utf-8')
    # print(data_csv)
    pic_id_list = data_csv['pic_id'].values
    des_list = data_csv['des'].values

    for i in range(len(pic_id_list)):
        if str(pic_id_list[i]) == str(id):
            return des_list[i]

    return ""

def validate_one_img(model, dataset_all, img_ids, obj_given, device, tokenizer):
    batch_size = len(img_ids)
    start_time = time.time()
    model.eval()
    imgs = []
    dess = []
    objs = []
    for i in range(len(img_ids)):
        img, des, obj = dataset_all.get_all_from_id(img_ids[i], obj_given[i])
        # print("get img from id time:", time.time() - start_time)  # 3s
        imgs.append(img)
        dess.append(des)
        objs.append(obj)
    img_data = torch.stack(imgs).to(device)
    des_data = torch.stack(dess).to(device)
    obj_data = torch.stack(objs).to(device)
    # print("get batch time:", time.time() - start_time)  # 3s
    img_emed = model.ResNet(img_data)
    img_emed = rearrange(img_emed, 'b c h w -> b (h w) c')
    img_emed += model.img_pos_embed(img_emed)

    des_embed = model.txt_embed(des_data)
    des_embed += model.txt_pos_embed(torch.arange(model.des_len, device=device))
    obj_embed = model.txt_embed(obj_data)
    obj_embed = obj_embed + model.txt_pos_embed(torch.arange(model.obj_len, device=device))

    tgt_txt = torch.zeros(batch_size, 1, dtype=torch.long, device=device) + 101
    tgt_txt_embed = model.txt_embed(tgt_txt)
    tgt_txt_embed += model.txt_pos_embed(torch.arange(1, device=device) + model.tgt_len)

    # M_005
    out = model.ModelOne(q=obj_embed, k=img_emed, v=img_emed,
                         tgt_embeded=tgt_txt_embed, des_embed=des_embed, obj_embed=obj_embed, img_embed=img_emed,
                         tgt_mask=None)
    logits = model.to_logits(out)[:, -1]
    _, index = logits.topk(3, dim=-1)
    # value: tensor([[7.3227, 7.2289, 6.4169],
    #                [9.6868, 7.0598, 6.3911]], device='cuda:0', grad_fn=<TopkBackward0>)
    # index: tensor([[4677, 2199, 2647],
    #                [4510, 3763, 2145]], device='cuda:0')
    sample_1st = index[:, 0]
    sample_2nd = index[:, 1]
    sample_3rd = index[:, 2]
    tgt_txt0 = tgt_txt
    output_list = []
    # print("get 1,2,3 sample time:", time.time() - start_time)  # 0.01s
    for sample in [sample_1st, sample_2nd, sample_3rd]:
        tgt_txt = tgt_txt0
        cur_len = 1
        while (cur_len < model.tgt_len and sample.max() != 102):  # 102 is the id of [SEP]
            tgt_txt = torch.cat((tgt_txt, sample.unsqueeze(1)), dim=-1)
            tgt_txt_embed = model.txt_embed(tgt_txt)
            cur_len += 1
            tgt_txt_embed += model.txt_pos_embed(torch.arange(cur_len, device=device))
            # out = model.transformer(prefix, tgt_txt_embed)
            out = model.ModelOne(q=obj_embed, k=img_emed, v=img_emed,
                                 tgt_embeded=tgt_txt_embed, des_embed=des_embed, obj_embed=obj_embed, img_embed=img_emed,
                                 tgt_mask=None)
            logits = model.to_logits(out)[:, -1]
            sample = torch.argmax(logits, dim=-1)
        # print("one batch sentence token time:", time.time() - start_time)  # 0.6s
        output_1 = []
        for i in range(batch_size):
            output_txt = []
            for token in tgt_txt[i].tolist():
                if token > 103:
                    output_txt.append(token)
            output_txt = tokenizer.convert_ids_to_tokens(output_txt)
            output_txt = ''.join(output_txt)
            output_1.append(output_txt[1:])
        output_list.append(output_1)
    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print('Validate time {}'.format(total_time_str))
    # print(output_list)
    return output_list


def generate_texts(img_id, obj, model_path):
    parser = argparse.ArgumentParser('AttDes training script', parents=[get_args_parser()])
    args = parser.parse_args()
    device = torch.device(args.device)
    # seed = args.seed
    # torch.manual_seed(seed)
    # np.random.seed(seed)
    # random.seed(seed)
    normalize = transforms.Normalize(mean=[0.5024, 0.4993, 0.4992],
                                     std=[0.1673, 0.1695, 0.1705])
    the_transforms = transforms.Compose([transforms.Resize((448, 448)),
                                         transforms.RandomHorizontalFlip(),
                                         transforms.ToTensor(),
                                         normalize,
                                         ])
    dataset_all = dataset.data_loader.AttDesDataset(args.data_root, args.dataset_name,
                                                    des_len=args.des_len,
                                                    obj_len=args.obj_len,
                                                    tgt_len=args.tgt_len,
                                                    img_root=args.img_root,
                                                    transform=the_transforms)
    PrefixLM_configure = dict(d_model=args.d_model, des_len=args.des_len, obj_len=args.obj_len, tgt_len=args.tgt_len,
                              input_resolution=448,
                              patch_size=16,
                              num_text_tokens=20000,
                              txt_seq_len=10000,
                              heads=4,
                              enc_depth=8,
                              dec_depth=8,
                              d_ff=1024,
                              dropout=0.1)
    time_1 = time.time()
    model = prefixLM.PrefixLM(**PrefixLM_configure).to(device)
    model.load_state_dict(torch.load(model_path))
    tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
    time_2 = time.time()
    print('Load model takes {}s'.format(time_2 - time_1))

    print("start generate_texts")
    start_time = time.time()
    end1_time = time.time()
    model.eval()
    img_data, des, obj_data, target, img_id, obj_given = dataset_all.get_all_from_id(img_id, obj)

    img_data = img_data.unsqueeze(0).to(device)
    des = des.unsqueeze(0).to(device)
    obj_given = obj_given.unsqueeze(0).to(device)
    label = target.unsqueeze(0).to(device)

    img_emed = model.ResNet(img_data)

    img_emed = rearrange(img_emed, 'b c h w -> b (h w) c')
    img_emed += model.img_pos_embed(img_emed)

    des_embed = model.txt_embed(des)
    des_embed += model.txt_pos_embed(torch.arange(model.des_len, device=device))
    obj_embed = model.txt_embed(obj_given)
    obj_embed = obj_embed + model.txt_pos_embed(torch.arange(model.obj_len, device=device))
    tgt_txt = torch.zeros(1, 1, dtype=torch.long, device=device) + 101
    tgt_txt_embed = model.txt_embed(tgt_txt)
    tgt_txt_embed += model.txt_pos_embed(torch.arange(1, device=device) + model.tgt_len)

    # M_005
    out = model.ModelOne(q=obj_embed, k=img_emed, v=img_emed,
                         tgt_embeded=tgt_txt_embed, des_embed=des_embed, obj_embed=obj_embed, img_embed=img_emed,
                         tgt_mask=None)


if __name__ == '__main__':
    # os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
    # parser = argparse.ArgumentParser('AttDes training script', parents=[get_args_parser()])
    # args = parser.parse_args()
    # os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu_id
    # if args.output_dir:
    #     Path(args.output_dir).mkdir(parents=True, exist_ok=True)
    # main(args)
    model_name = '005'
    model_path = r'E:\data\Download\models\attribute_desciption\outputs' + '/' + model_name + '/' + 'checkpoint0019.pth'
    obj = ["空间","客厅","卧室","墙面","餐厅","公寓","住宅","沙发","家具","地毯","厨房","书房","背景墙","吊灯","墙",
           "卫生间","儿童","床品","装饰","壁纸","地板","窗帘","吊顶","餐椅","别墅","地面","结构","布艺","餐桌","画"]

    out = generate_texts('550695', obj, model_path)
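A hedged sketch of how the helpers above would be called from another script; the checkpoint path, image id, and object word are placeholders, and the import path assumes the sys.path setup at the top of this file:

import torch
from validate_local import load_AttDes_Model, validate_one_img

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model, dataset_all, bert_tok = load_AttDes_Model('outputs/005/checkpoint0019.pth', device)
outputs = validate_one_img(model, dataset_all, ['550695'], ['沙发'], device, bert_tok)
# outputs holds three lists (decodes seeded by the top-1/2/3 first tokens), one description per image id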
Model/AttDes/validate_local_gennerate.py
ADDED
@@ -0,0 +1,332 @@
import argparse
import datetime
import json
import random
import time
import math
import os

import numpy as np
from pathlib import Path

import torch
from nltk.translate import bleu_score

import dataset.data_loader
import torch.backends.cudnn as cudnn
from torch.utils.data import DataLoader, DistributedSampler
import torchvision.transforms as transforms
from models import prefixLM, tokenizer
import nltk
import jieba
# from engine import train_one_epoch, validate
#
# import utils.misc as utils
from models import __init__
# from dataset import build_dataset
# from engine import train_one_epoch, validate_txt

from einops import rearrange
from pytorch_pretrained_bert.tokenization import BertTokenizer

def get_args_parser():
    parser = argparse.ArgumentParser('Set parser', add_help=False)
    parser.add_argument('--device', default='cuda')
    parser.add_argument('--gpu_id', default='0', type=str)

    # Dataset parameters
    parser.add_argument('--data_root', type=str, default=r'E:\data\Download\fur\dataset\data_for_test2.csv')
    parser.add_argument('--dataset_name', type=str, default='Furniture')
    parser.add_argument('--img_root', type=str, default=r'E:\data\pictures')
    parser.add_argument('--output_dir', default='./outputs/validate', help='path where to save, empty for no saving')
    parser.add_argument('--seed', default=2022, type=int)
    parser.add_argument('--resume', default='', help='resume for checkpoint')
    parser.add_argument('--bert_model', default='bert-base-chinese', type=str)
    parser.add_argument('--des_len', default=256, type=int)
    parser.add_argument('--obj_len', default=8, type=int)
    parser.add_argument('--tgt_len', default=35, type=int)

    # Train parameters
    parser.add_argument('--lr', default=1e-4, type=float)
    parser.add_argument('--batch_size', default=1, type=int)
    parser.add_argument('--weight_decay', default=1e-4, type=float)
    parser.add_argument('--optimizer', default='adamw', type=str)
    parser.add_argument('--lr_scheduler', default='step', type=str)
    parser.add_argument('--lr_drop', default=5, type=int)
    parser.add_argument('--start_epoch', default=0, type=int)
    parser.add_argument('--epochs', default=1, type=int)

    # Model parameters
    parser.add_argument('--AD_hidden_dim', default=256, type=int)
    parser.add_argument('--d_model', default=512, type=int)
    # visual_model parameters
    parser.add_argument('--backbone', default='resnet50', type=str,
                        help="Name of the convolutional backbone to use")

    return parser


def main(args):
    device = torch.device(args.device)

    seed = args.seed
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)
    normalize = transforms.Normalize(mean=[0.5024, 0.4993, 0.4992],
                                     std=[0.1673, 0.1695, 0.1705])
    the_transforms = transforms.Compose([transforms.Resize((448, 448)),
                                         transforms.RandomHorizontalFlip(),
                                         transforms.ToTensor(),
                                         normalize,
                                         ])
    dataset_all = dataset.data_loader.AttDesDataset(args.data_root, args.dataset_name,
                                                    des_len=args.des_len,
                                                    obj_len=args.obj_len,
                                                    tgt_len=args.tgt_len,
                                                    img_root=args.img_root,
                                                    transform=the_transforms)

    dataloader_val = DataLoader(dataset_all,
                                batch_size=args.batch_size,
                                shuffle=False)
    print("data loaded...")

    Tokenizer = tokenizer.ChineseTokenizer()
    PrefixLM_configure = dict(d_model=args.d_model, des_len=args.des_len, obj_len=args.obj_len, tgt_len=args.tgt_len,
                              input_resolution=448,
                              patch_size=16,
                              num_text_tokens=20000,
                              txt_seq_len=10000,
                              heads=4,
                              enc_depth=8,
                              dec_depth=8,
                              d_ff=1024,
                              dropout=0.1)
    model = prefixLM.PrefixLM(**PrefixLM_configure).to(device)
    model.load_state_dict(torch.load('./outputs/005/checkpoint0019.pth'))

    output_dir = Path(args.output_dir)
    with (output_dir / "log.txt").open("a") as f:
        f.write(str(args) + "\n")

    print("start validate...")
    start_time = time.time()
    # optimizer = torch.optim.AdamW(model.parameters(), lr=args.lr, weight_decay=args.weight_decay)
    # lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(optimizer, T_0=2000)
    # lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, args.lr_drop)
    for epoch in range(args.start_epoch, args.epochs):
        validate_txt(args, model, dataloader_val, device, batch_size=args.batch_size)

    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print('Validate time {}'.format(total_time_str))


def validate(img1_id, img2_id, obj, model_path):
    parser = argparse.ArgumentParser('AttDes training script', parents=[get_args_parser()])
    args = parser.parse_args()
    device = torch.device(args.device)

    seed = args.seed
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)
    normalize = transforms.Normalize(mean=[0.5024, 0.4993, 0.4992],
                                     std=[0.1673, 0.1695, 0.1705])
    the_transforms = transforms.Compose([transforms.Resize((448, 448)),
                                         transforms.RandomHorizontalFlip(),
                                         transforms.ToTensor(),
                                         normalize,
                                         ])
    dataset_all = dataset.data_loader.AttDesDataset(args.data_root, args.dataset_name,
                                                    des_len=args.des_len,
                                                    obj_len=args.obj_len,
                                                    tgt_len=args.tgt_len,
                                                    img_root=args.img_root,
                                                    transform=the_transforms)
    PrefixLM_configure = dict(d_model=args.d_model, des_len=args.des_len, obj_len=args.obj_len, tgt_len=args.tgt_len,
                              input_resolution=448,
                              patch_size=16,
                              num_text_tokens=20000,
                              txt_seq_len=10000,
                              heads=4,
                              enc_depth=8,
                              dec_depth=8,
                              d_ff=1024,
                              dropout=0.1)
    time_1 = time.time()
    model = prefixLM.PrefixLM(**PrefixLM_configure).to(device)
    model.load_state_dict(torch.load(model_path))
    tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
    time_2 = time.time()
    print('Load model takes {}s'.format(time_2 - time_1))
    out_list = []

    label_txt, output1, output2, output3 = validate_one_img(model, dataset_all, img1_id, obj, device, tokenizer)
    out_list.append([label_txt, output1, output2, output3])
    label_txt, output1, output2, output3 = validate_one_img(model, dataset_all, img2_id, obj, device, tokenizer)
    out_list.append([label_txt, output1, output2, output3])
    return out_list

def validate_one_img(model, dataset_all, img_id, obj, device, tokenizer):
    # print("start validate...")
    start_time = time.time()
    end1_time = time.time()
    model.eval()
    print(obj)
    img_data, des, obj_data, target, img_id, obj_given = dataset_all.get_all_from_id(img_id, obj)
    print(obj_given)
    img_data = img_data.unsqueeze(0).to(device)
    des = des.unsqueeze(0).to(device)
    obj_given = obj_given.unsqueeze(0).to(device)
    label = target.unsqueeze(0).to(device)

    img_emed = model.ResNet(img_data)

    img_emed = rearrange(img_emed, 'b c h w -> b (h w) c')
    img_emed += model.img_pos_embed(img_emed)

    des_embed = model.txt_embed(des)
    des_embed += model.txt_pos_embed(torch.arange(model.des_len, device=device))
    obj_embed = model.txt_embed(obj_given)
    obj_embed = obj_embed + model.txt_pos_embed(torch.arange(model.obj_len, device=device))
    tgt_txt = torch.zeros(1, 1, dtype=torch.long, device=device) + 101
    tgt_txt_embed = model.txt_embed(tgt_txt)
    tgt_txt_embed += model.txt_pos_embed(torch.arange(1, device=device) + model.tgt_len)

    # M_005
    out = model.ModelOne(q=obj_embed, k=img_emed, v=img_emed,
                         tgt_embeded=tgt_txt_embed, des_embed=des_embed, obj_embed=obj_embed, img_embed=img_emed,
                         tgt_mask=None)
    logits = model.to_logits(out)[:, -1]
    sample = torch.argmax(logits, dim=-1)
    value, index = logits.topk(3, dim=-1)
    sample = index[0][0].unsqueeze(0)
    sample_2nd = index[0][1].unsqueeze(0)
    sample_3rd = index[0][2].unsqueeze(0)
    tgt_txt_2nd = tgt_txt
    tgt_txt_3rd = tgt_txt

    cur_len = 1
    while (cur_len < model.tgt_len and sample != 102):  # 102 is the id of [SEP]
        tgt_txt = torch.cat((tgt_txt, sample.unsqueeze(1)), dim=-1)
        tgt_txt_embed = model.txt_embed(tgt_txt)
        cur_len += 1
        tgt_txt_embed += model.txt_pos_embed(torch.arange(cur_len, device=device))
        # out = model.transformer(prefix, tgt_txt_embed)
        out = model.ModelOne(q=obj_embed, k=img_emed, v=img_emed,
                             tgt_embeded=tgt_txt_embed, des_embed=des_embed, obj_embed=obj_embed, img_embed=img_emed,
                             tgt_mask=None)
        logits = model.to_logits(out)[:, -1]
        sample = torch.argmax(logits, dim=-1)
    label_txt = []
    output_txt = []
    obj_txt = []
    for token in des[0].tolist():
        if token > 103:
            label_txt.append(token)
    for token in tgt_txt[0].tolist():
        if token > 103:
            output_txt.append(token)
    # for token in obj_data[0].tolist():
    #     if token > 103:
    #         obj_txt.append(token)
    label_txt = tokenizer.convert_ids_to_tokens(label_txt)
    label_txt = ''.join(label_txt)

    # obj_txt = tokenizer.convert_ids_to_tokens(obj_txt)
    output_txt = tokenizer.convert_ids_to_tokens(output_txt)
    output1 = ''.join(output_txt)

    # 2nd
    cur_len = 1
    while (cur_len < model.tgt_len and sample_2nd != 102):  # 102 is the id of [SEP]
        tgt_txt_2nd = torch.cat((tgt_txt_2nd, sample_2nd.unsqueeze(1)), dim=-1)
        tgt_txt_embed = model.txt_embed(tgt_txt_2nd)
        cur_len += 1
        tgt_txt_embed += model.txt_pos_embed(torch.arange(cur_len, device=device))
        # out = model.transformer(prefix, tgt_txt_embed)
        out = model.ModelOne(q=obj_embed, k=img_emed, v=img_emed,
                             tgt_embeded=tgt_txt_embed, des_embed=des_embed, obj_embed=obj_embed, img_embed=img_emed,
                             tgt_mask=None)
        logits = model.to_logits(out)[:, -1]
        # logits = logits[:, :-26]
        # print(logits)
        sample_2nd = torch.argmax(logits, dim=-1)

    output_txt = []
    for token in tgt_txt_2nd[0].tolist():
        if token > 103:
            output_txt.append(token)
    output_txt = tokenizer.convert_ids_to_tokens(output_txt)
    output2 = ''.join(output_txt)
    # 3rd
    cur_len = 1
    while (cur_len < model.tgt_len and sample_3rd != 102):  # 102 is the id of [SEP]
        tgt_txt_3rd = torch.cat((tgt_txt_3rd, sample_3rd.unsqueeze(1)), dim=-1)
        tgt_txt_embed = model.txt_embed(tgt_txt_3rd)
        cur_len += 1
        tgt_txt_embed += model.txt_pos_embed(torch.arange(cur_len, device=device))
        # out = model.transformer(prefix, tgt_txt_embed)
        out = model.ModelOne(q=obj_embed, k=img_emed, v=img_emed,
                             tgt_embeded=tgt_txt_embed, des_embed=des_embed, obj_embed=obj_embed, img_embed=img_emed,
                             tgt_mask=None)
        logits = model.to_logits(out)[:, -1]
        # logits = logits[:, :-26]
        sample_3rd = torch.argmax(logits, dim=-1)

    output_txt = []
    for token in tgt_txt_3rd[0].tolist():
        if token > 103:
            output_txt.append(token)
    output_txt = tokenizer.convert_ids_to_tokens(output_txt)
    output3 = ''.join(output_txt)

    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print(output1)
    print(output2)
    print(output3)
    print('Validate time {}'.format(total_time_str))
    return label_txt, output1, output2, output3


if __name__ == '__main__':
    os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
    # parser = argparse.ArgumentParser('AttDes training script', parents=[get_args_parser()])
    # args = parser.parse_args()
    # os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu_id
    # if args.output_dir:
    #     Path(args.output_dir).mkdir(parents=True, exist_ok=True)
    # main(args)
    model_name = '005'
    model_path = r'E:\data\Download\models\attribute_desciption\outputs' + '/' + model_name + '/' + 'checkpoint0019.pth'
    objs = ["空间","客厅","卧室","墙面","餐厅","公寓","住宅","沙发","家具","地毯","厨房","书房","背景墙","吊灯","墙",
            "卫生间","儿童","床品","装饰","壁纸","地板","窗帘","吊顶","餐椅","别墅","地面","结构","布艺","餐桌","画"]
    for obj in objs:
        print(obj)
        out = validate('550695', '550567', obj, model_path)
        sentences1 = out[0][0].replace(';', ',').split(',')
        # gt = ""
        #
        # for i in sentences1:
        #     if obj in i:
        #         gt = i
        # gt = " ".join(jieba.cut(gt))
        # print(gt)
        # for i in out[0]:
        #     i = " ".join(jieba.cut(i))
        #     print(i)
        #     print(gt)
        #     bleu = nltk.translate.bleu_score.sentence_bleu([i], gt)
        #     print(bleu)
Model/CLIP/cn_clip/__init__.py
ADDED
File without changes
|
Model/CLIP/cn_clip/__pycache__/__init__.cpython-38.pyc
ADDED
Binary file (144 Bytes). View file
|
|
Model/CLIP/cn_clip/clip/__init__.py
ADDED
@@ -0,0 +1,5 @@
from .bert_tokenizer import FullTokenizer

_tokenizer = FullTokenizer()
from .utils import load_from_name, available_models, tokenize, image_transform, load
Model/CLIP/cn_clip/clip/__pycache__/__init__.cpython-38.pyc
ADDED
Binary file (352 Bytes). View file
|
|
Model/CLIP/cn_clip/clip/__pycache__/bert_tokenizer.cpython-38.pyc
ADDED
Binary file (11.2 kB). View file
|
|
Model/CLIP/cn_clip/clip/__pycache__/utils.cpython-38.pyc
ADDED
Binary file (5.99 kB). View file
|
|
Model/CLIP/cn_clip/clip/bert_tokenizer.py
ADDED
@@ -0,0 +1,436 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# coding=utf-8
|
2 |
+
# Copyright 2018 The Google AI Language Team Authors.
|
3 |
+
#
|
4 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
5 |
+
# you may not use this file except in compliance with the License.
|
6 |
+
# You may obtain a copy of the License at
|
7 |
+
#
|
8 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
9 |
+
#
|
10 |
+
# Unless required by applicable law or agreed to in writing, software
|
11 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
12 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
13 |
+
# See the License for the specific language governing permissions and
|
14 |
+
# limitations under the License.
|
15 |
+
|
16 |
+
"""Tokenization classes."""
|
17 |
+
|
18 |
+
from __future__ import absolute_import
|
19 |
+
from __future__ import division
|
20 |
+
from __future__ import print_function
|
21 |
+
|
22 |
+
import collections
|
23 |
+
import re
|
24 |
+
import unicodedata
|
25 |
+
import six
|
26 |
+
from functools import lru_cache
|
27 |
+
import os
|
28 |
+
|
29 |
+
@lru_cache()
|
30 |
+
def default_vocab():
|
31 |
+
return os.path.join(os.path.dirname(os.path.abspath(__file__)), "vocab.txt")
|
32 |
+
|
33 |
+
def validate_case_matches_checkpoint(do_lower_case, init_checkpoint):
|
34 |
+
"""Checks whether the casing config is consistent with the checkpoint name."""
|
35 |
+
|
36 |
+
# The casing has to be passed in by the user and there is no explicit check
|
37 |
+
# as to whether it matches the checkpoint. The casing information probably
|
38 |
+
# should have been stored in the bert_config.json file, but it's not, so
|
39 |
+
# we have to heuristically detect it to validate.
|
40 |
+
|
41 |
+
if not init_checkpoint:
|
42 |
+
return
|
43 |
+
|
44 |
+
m = re.match("^.*?([A-Za-z0-9_-]+)/bert_model.ckpt", init_checkpoint)
|
45 |
+
if m is None:
|
46 |
+
return
|
47 |
+
|
48 |
+
model_name = m.group(1)
|
49 |
+
|
50 |
+
lower_models = [
|
51 |
+
"uncased_L-24_H-1024_A-16", "uncased_L-12_H-768_A-12",
|
52 |
+
"multilingual_L-12_H-768_A-12", "chinese_L-12_H-768_A-12"
|
53 |
+
]
|
54 |
+
|
55 |
+
cased_models = [
|
56 |
+
"cased_L-12_H-768_A-12", "cased_L-24_H-1024_A-16",
|
57 |
+
"multi_cased_L-12_H-768_A-12"
|
58 |
+
]
|
59 |
+
|
60 |
+
is_bad_config = False
|
61 |
+
if model_name in lower_models and not do_lower_case:
|
62 |
+
is_bad_config = True
|
63 |
+
actual_flag = "False"
|
64 |
+
case_name = "lowercased"
|
65 |
+
opposite_flag = "True"
|
66 |
+
|
67 |
+
if model_name in cased_models and do_lower_case:
|
68 |
+
is_bad_config = True
|
69 |
+
actual_flag = "True"
|
70 |
+
case_name = "cased"
|
71 |
+
opposite_flag = "False"
|
72 |
+
|
73 |
+
if is_bad_config:
|
74 |
+
raise ValueError(
|
75 |
+
"You passed in `--do_lower_case=%s` with `--init_checkpoint=%s`. "
|
76 |
+
"However, `%s` seems to be a %s model, so you "
|
77 |
+
"should pass in `--do_lower_case=%s` so that the fine-tuning matches "
|
78 |
+
"how the model was pre-training. If this error is wrong, please "
|
79 |
+
"just comment out this check." % (actual_flag, init_checkpoint,
|
80 |
+
model_name, case_name, opposite_flag))
|
81 |
+
|
82 |
+
|
83 |
+
def convert_to_unicode(text):
|
84 |
+
"""Converts `text` to Unicode (if it's not already), assuming utf-8 input."""
|
85 |
+
if six.PY3:
|
86 |
+
if isinstance(text, str):
|
87 |
+
return text
|
88 |
+
elif isinstance(text, bytes):
|
89 |
+
return text.decode("utf-8", "ignore")
|
90 |
+
else:
|
91 |
+
raise ValueError("Unsupported string type: %s" % (type(text)))
|
92 |
+
elif six.PY2:
|
93 |
+
if isinstance(text, str):
|
94 |
+
return text.decode("utf-8", "ignore")
|
95 |
+
elif isinstance(text, unicode):
|
96 |
+
return text
|
97 |
+
else:
|
98 |
+
raise ValueError("Unsupported string type: %s" % (type(text)))
|
99 |
+
else:
|
100 |
+
raise ValueError("Not running on Python2 or Python 3?")
|
101 |
+
|
102 |
+
|
103 |
+
def printable_text(text):
|
104 |
+
"""Returns text encoded in a way suitable for print or `tf.logging`."""
|
105 |
+
|
106 |
+
# These functions want `str` for both Python2 and Python3, but in one case
|
107 |
+
# it's a Unicode string and in the other it's a byte string.
|
108 |
+
if six.PY3:
|
109 |
+
if isinstance(text, str):
|
110 |
+
return text
|
111 |
+
elif isinstance(text, bytes):
|
112 |
+
return text.decode("utf-8", "ignore")
|
113 |
+
else:
|
114 |
+
raise ValueError("Unsupported string type: %s" % (type(text)))
|
115 |
+
elif six.PY2:
|
116 |
+
if isinstance(text, str):
|
117 |
+
return text
|
118 |
+
elif isinstance(text, unicode):
|
119 |
+
return text.encode("utf-8")
|
120 |
+
else:
|
121 |
+
raise ValueError("Unsupported string type: %s" % (type(text)))
|
122 |
+
else:
|
123 |
+
raise ValueError("Not running on Python2 or Python 3?")
|
124 |
+
|
125 |
+
|
126 |
+
def load_vocab(vocab_file):
|
127 |
+
"""Loads a vocabulary file into a dictionary."""
|
128 |
+
vocab = collections.OrderedDict()
|
129 |
+
index = 0
|
130 |
+
with open(vocab_file, "r", encoding='utf-8') as reader:
|
131 |
+
while True:
|
132 |
+
token = convert_to_unicode(reader.readline())
|
133 |
+
if not token:
|
134 |
+
break
|
135 |
+
token = token.strip()
|
136 |
+
vocab[token] = index
|
137 |
+
index += 1
|
138 |
+
return vocab
|
139 |
+
|
140 |
+
|
141 |
+
def convert_by_vocab(vocab, items):
|
142 |
+
"""Converts a sequence of [tokens|ids] using the vocab."""
|
143 |
+
output = []
|
144 |
+
for item in items:
|
145 |
+
output.append(vocab[item])
|
146 |
+
return output
|
147 |
+
|
148 |
+
|
149 |
+
def convert_tokens_to_ids(vocab, tokens):
|
150 |
+
return convert_by_vocab(vocab, tokens)
|
151 |
+
|
152 |
+
|
153 |
+
def convert_ids_to_tokens(inv_vocab, ids):
|
154 |
+
return convert_by_vocab(inv_vocab, ids)
|
155 |
+
|
156 |
+
|
157 |
+
def whitespace_tokenize(text):
|
158 |
+
"""Runs basic whitespace cleaning and splitting on a piece of text."""
|
159 |
+
text = text.strip()
|
160 |
+
if not text:
|
161 |
+
return []
|
162 |
+
tokens = text.split()
|
163 |
+
return tokens
|
164 |
+
|
165 |
+
|
166 |
+
class FullTokenizer(object):
|
167 |
+
"""Runs end-to-end tokenziation."""
|
168 |
+
|
169 |
+
def __init__(self, vocab_file=default_vocab(), do_lower_case=True):
|
170 |
+
self.vocab = load_vocab(vocab_file)
|
171 |
+
self.inv_vocab = {v: k for k, v in self.vocab.items()}
|
172 |
+
self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case)
|
173 |
+
self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab)
|
174 |
+
|
175 |
+
def tokenize(self, text):
|
176 |
+
split_tokens = []
|
177 |
+
for token in self.basic_tokenizer.tokenize(text):
|
178 |
+
for sub_token in self.wordpiece_tokenizer.tokenize(token):
|
179 |
+
split_tokens.append(sub_token)
|
180 |
+
|
181 |
+
return split_tokens
|
182 |
+
|
183 |
+
def convert_tokens_to_ids(self, tokens):
|
184 |
+
return convert_by_vocab(self.vocab, tokens)
|
185 |
+
|
186 |
+
def convert_ids_to_tokens(self, ids):
|
187 |
+
return convert_by_vocab(self.inv_vocab, ids)
|
188 |
+
|
189 |
+
@staticmethod
|
190 |
+
def convert_tokens_to_string(tokens, clean_up_tokenization_spaces=True):
|
191 |
+
""" Converts a sequence of tokens (string) in a single string. """
|
192 |
+
|
193 |
+
def clean_up_tokenization(out_string):
|
194 |
+
""" Clean up a list of simple English tokenization artifacts
|
195 |
+
like spaces before punctuations and abreviated forms.
|
196 |
+
"""
|
197 |
+
out_string = (
|
198 |
+
out_string.replace(" .", ".")
|
199 |
+
.replace(" ?", "?")
|
200 |
+
.replace(" !", "!")
|
201 |
+
.replace(" ,", ",")
|
202 |
+
.replace(" ' ", "'")
|
203 |
+
.replace(" n't", "n't")
|
204 |
+
.replace(" 'm", "'m")
|
205 |
+
.replace(" 's", "'s")
|
206 |
+
.replace(" 've", "'ve")
|
207 |
+
.replace(" 're", "'re")
|
208 |
+
)
|
209 |
+
return out_string
|
210 |
+
|
211 |
+
text = ' '.join(tokens).replace(' ##', '').strip()
|
212 |
+
if clean_up_tokenization_spaces:
|
213 |
+
clean_text = clean_up_tokenization(text)
|
214 |
+
return clean_text
|
215 |
+
else:
|
216 |
+
return text
|
217 |
+
|
218 |
+
def vocab_size(self):
|
219 |
+
return len(self.vocab)
|
220 |
+
|
221 |
+
|
222 |
+
class BasicTokenizer(object):
|
223 |
+
"""Runs basic tokenization (punctuation splitting, lower casing, etc.)."""
|
224 |
+
|
225 |
+
def __init__(self, do_lower_case=True):
|
226 |
+
"""Constructs a BasicTokenizer.
|
227 |
+
|
228 |
+
Args:
|
229 |
+
do_lower_case: Whether to lower case the input.
|
230 |
+
"""
|
231 |
+
self.do_lower_case = do_lower_case
|
232 |
+
|
233 |
+
def tokenize(self, text):
|
234 |
+
"""Tokenizes a piece of text."""
|
235 |
+
text = convert_to_unicode(text)
|
236 |
+
text = self._clean_text(text)
|
237 |
+
|
238 |
+
# This was added on November 1st, 2018 for the multilingual and Chinese
|
239 |
+
# models. This is also applied to the English models now, but it doesn't
|
240 |
+
# matter since the English models were not trained on any Chinese data
|
241 |
+
# and generally don't have any Chinese data in them (there are Chinese
|
242 |
+
# characters in the vocabulary because Wikipedia does have some Chinese
|
243 |
+
# words in the English Wikipedia.).
|
244 |
+
text = self._tokenize_chinese_chars(text)
|
245 |
+
|
246 |
+
orig_tokens = whitespace_tokenize(text)
|
247 |
+
split_tokens = []
|
248 |
+
for token in orig_tokens:
|
249 |
+
if self.do_lower_case:
|
250 |
+
token = token.lower()
|
251 |
+
token = self._run_strip_accents(token)
|
252 |
+
split_tokens.extend(self._run_split_on_punc(token))
|
253 |
+
|
254 |
+
output_tokens = whitespace_tokenize(" ".join(split_tokens))
|
255 |
+
return output_tokens
|
256 |
+
|
257 |
+
def _run_strip_accents(self, text):
|
258 |
+
"""Strips accents from a piece of text."""
|
259 |
+
text = unicodedata.normalize("NFD", text)
|
260 |
+
output = []
|
261 |
+
for char in text:
|
262 |
+
cat = unicodedata.category(char)
|
263 |
+
if cat == "Mn":
|
264 |
+
continue
|
265 |
+
output.append(char)
|
266 |
+
return "".join(output)
|
267 |
+
|
268 |
+
def _run_split_on_punc(self, text):
|
269 |
+
"""Splits punctuation on a piece of text."""
|
270 |
+
chars = list(text)
|
271 |
+
i = 0
|
272 |
+
start_new_word = True
|
273 |
+
output = []
|
274 |
+
while i < len(chars):
|
275 |
+
char = chars[i]
|
276 |
+
if _is_punctuation(char):
|
277 |
+
output.append([char])
|
278 |
+
start_new_word = True
|
279 |
+
else:
|
280 |
+
if start_new_word:
|
281 |
+
output.append([])
|
282 |
+
start_new_word = False
|
283 |
+
output[-1].append(char)
|
284 |
+
i += 1
|
285 |
+
|
286 |
+
return ["".join(x) for x in output]
|
287 |
+
|
288 |
+
def _tokenize_chinese_chars(self, text):
|
289 |
+
"""Adds whitespace around any CJK character."""
|
290 |
+
output = []
|
291 |
+
for char in text:
|
292 |
+
cp = ord(char)
|
293 |
+
if self._is_chinese_char(cp):
|
294 |
+
output.append(" ")
|
295 |
+
output.append(char)
|
296 |
+
output.append(" ")
|
297 |
+
else:
|
298 |
+
output.append(char)
|
299 |
+
return "".join(output)
|
300 |
+
|
301 |
+
def _is_chinese_char(self, cp):
|
302 |
+
"""Checks whether CP is the codepoint of a CJK character."""
|
303 |
+
# This defines a "chinese character" as anything in the CJK Unicode block:
|
304 |
+
# https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
|
305 |
+
#
|
306 |
+
# Note that the CJK Unicode block is NOT all Japanese and Korean characters,
|
307 |
+
# despite its name. The modern Korean Hangul alphabet is a different block,
|
308 |
+
# as is Japanese Hiragana and Katakana. Those alphabets are used to write
|
309 |
+
# space-separated words, so they are not treated specially and handled
|
310 |
+
# like the all of the other languages.
|
311 |
+
if ((cp >= 0x4E00 and cp <= 0x9FFF) or #
|
312 |
+
(cp >= 0x3400 and cp <= 0x4DBF) or #
|
313 |
+
(cp >= 0x20000 and cp <= 0x2A6DF) or #
|
314 |
+
(cp >= 0x2A700 and cp <= 0x2B73F) or #
|
315 |
+
(cp >= 0x2B740 and cp <= 0x2B81F) or #
|
316 |
+
(cp >= 0x2B820 and cp <= 0x2CEAF) or
|
317 |
+
(cp >= 0xF900 and cp <= 0xFAFF) or #
|
318 |
+
(cp >= 0x2F800 and cp <= 0x2FA1F)): #
|
319 |
+
return True
|
320 |
+
|
321 |
+
return False
|
322 |
+
|
323 |
+
def _clean_text(self, text):
|
324 |
+
"""Performs invalid character removal and whitespace cleanup on text."""
|
325 |
+
output = []
|
326 |
+
for char in text:
|
327 |
+
cp = ord(char)
|
328 |
+
if cp == 0 or cp == 0xfffd or _is_control(char):
|
329 |
+
continue
|
330 |
+
if _is_whitespace(char):
|
331 |
+
output.append(" ")
|
332 |
+
else:
|
333 |
+
                output.append(char)
        return "".join(output)


class WordpieceTokenizer(object):
    """Runs WordPiece tokenization."""

    def __init__(self, vocab, unk_token="[UNK]", max_input_chars_per_word=200):
        self.vocab = vocab
        self.unk_token = unk_token
        self.max_input_chars_per_word = max_input_chars_per_word

    def tokenize(self, text):
        """Tokenizes a piece of text into its word pieces.

        This uses a greedy longest-match-first algorithm to perform tokenization
        using the given vocabulary.

        For example:
            input = "unaffable"
            output = ["un", "##aff", "##able"]

        Args:
            text: A single token or whitespace-separated tokens. This should have
                already been passed through `BasicTokenizer`.

        Returns:
            A list of wordpiece tokens.
        """

        text = convert_to_unicode(text)

        output_tokens = []
        for token in whitespace_tokenize(text):
            chars = list(token)
            if len(chars) > self.max_input_chars_per_word:
                output_tokens.append(self.unk_token)
                continue

            is_bad = False
            start = 0
            sub_tokens = []
            while start < len(chars):
                end = len(chars)
                cur_substr = None
                while start < end:
                    substr = "".join(chars[start:end])
                    if start > 0:
                        substr = "##" + substr
                    if substr in self.vocab:
                        cur_substr = substr
                        break
                    end -= 1
                if cur_substr is None:
                    is_bad = True
                    break
                sub_tokens.append(cur_substr)
                start = end

            if is_bad:
                output_tokens.append(self.unk_token)
            else:
                output_tokens.extend(sub_tokens)
        return output_tokens


def _is_whitespace(char):
    """Checks whether `char` is a whitespace character."""
    # \t, \n, and \r are technically control characters but we treat them
    # as whitespace since they are generally considered as such.
    if char == " " or char == "\t" or char == "\n" or char == "\r":
        return True
    cat = unicodedata.category(char)
    if cat == "Zs":
        return True
    return False


def _is_control(char):
    """Checks whether `char` is a control character."""
    # These are technically control characters but we count them as whitespace
    # characters.
    if char == "\t" or char == "\n" or char == "\r":
        return False
    cat = unicodedata.category(char)
    if cat in ("Cc", "Cf"):
        return True
    return False


def _is_punctuation(char):
    """Checks whether `char` is a punctuation character."""
    cp = ord(char)
    # We treat all non-letter/number ASCII as punctuation.
    # Characters such as "^", "$", and "`" are not in the Unicode
    # Punctuation class but we treat them as punctuation anyway, for
    # consistency.
    if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or
            (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)):
        return True
    cat = unicodedata.category(char)
    if cat.startswith("P"):
        return True
    return False
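As a quick illustration of the greedy longest-match-first loop implemented above, the sketch below runs WordpieceTokenizer against a made-up vocabulary. The toy_vocab set is hypothetical (the real tokenizer is built from the vocab.txt shipped in this folder), and the sketch assumes the module-level helpers convert_to_unicode and whitespace_tokenize defined earlier in bert_tokenizer.py.

toy_vocab = {"[UNK]", "un", "##aff", "##able"}
wp = WordpieceTokenizer(vocab=toy_vocab)

print(wp.tokenize("unaffable"))    # ['un', '##aff', '##able']: longest prefix first, then '##' continuations
print(wp.tokenize("unknownword"))  # ['[UNK]']: no sequence of sub-tokens covers the whole word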
Model/CLIP/cn_clip/clip/configuration_bert.py
ADDED
@@ -0,0 +1,84 @@
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" BERT model configuration """

from __future__ import absolute_import, division, print_function, unicode_literals

import logging

logger = logging.getLogger(__name__)


class BertConfig(object):
    r"""
    :class:`~transformers.BertConfig` is the configuration class to store the configuration of a
    `BertModel`.

    Arguments:
        vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `BertModel`.
        hidden_size: Size of the encoder layers and the pooler layer.
        num_hidden_layers: Number of hidden layers in the Transformer encoder.
        num_attention_heads: Number of attention heads for each attention layer in
            the Transformer encoder.
        intermediate_size: The size of the "intermediate" (i.e., feed-forward)
            layer in the Transformer encoder.
        hidden_act: The non-linear activation function (function or string) in the
            encoder and pooler. If string, "gelu", "relu", "swish" and "gelu_new" are supported.
        hidden_dropout_prob: The dropout probability for all fully connected
            layers in the embeddings, encoder, and pooler.
        attention_probs_dropout_prob: The dropout ratio for the attention
            probabilities.
        max_position_embeddings: The maximum sequence length that this model might
            ever be used with. Typically set this to something large just in case
            (e.g., 512 or 1024 or 2048).
        type_vocab_size: The vocabulary size of the `token_type_ids` passed into
            `BertModel`.
        initializer_range: The stddev of the truncated_normal_initializer for
            initializing all weight matrices.
        layer_norm_eps: The epsilon used by LayerNorm.
    """

    def __init__(self,
                 vocab_size_or_config_json_file=30522,
                 hidden_size=768,
                 num_hidden_layers=12,
                 num_attention_heads=12,
                 intermediate_size=3072,
                 hidden_act="gelu",
                 hidden_dropout_prob=0.1,
                 attention_probs_dropout_prob=0.1,
                 max_position_embeddings=512,
                 type_vocab_size=2,
                 initializer_range=0.02,
                 layer_norm_eps=1e-12,
                 output_attentions=False,
                 output_hidden_states=False
                 ):
        self.vocab_size = vocab_size_or_config_json_file
        self.hidden_size = hidden_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.hidden_act = hidden_act
        self.intermediate_size = intermediate_size
        self.hidden_dropout_prob = hidden_dropout_prob
        self.attention_probs_dropout_prob = attention_probs_dropout_prob
        self.max_position_embeddings = max_position_embeddings
        self.type_vocab_size = type_vocab_size
        self.initializer_range = initializer_range
        self.layer_norm_eps = layer_norm_eps
        self.output_attentions = output_attentions
        self.output_hidden_states = output_hidden_states
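A minimal sketch of how this configuration class is used elsewhere in this commit; the import path matches the one in model.py below, and the values mirror the RBT3-chinese text config added later in this diff.

from cn_clip.clip.configuration_bert import BertConfig

# Text tower of the RN50@RBT3-chinese pairing: a 3-layer Chinese BERT.
config = BertConfig(
    vocab_size_or_config_json_file=21128,
    hidden_size=768,
    num_hidden_layers=3,
    num_attention_heads=12,
    intermediate_size=3072,
)
print(config.vocab_size, config.num_hidden_layers)  # 21128 3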
Model/CLIP/cn_clip/clip/model.py
ADDED
@@ -0,0 +1,504 @@
1 |
+
from collections import OrderedDict
|
2 |
+
from typing import Tuple, Union
|
3 |
+
from itertools import repeat
|
4 |
+
import collections.abc
|
5 |
+
|
6 |
+
import math
|
7 |
+
import logging
|
8 |
+
import numpy as np
|
9 |
+
import torch
|
10 |
+
import torch.nn.functional as F
|
11 |
+
from torch import nn
|
12 |
+
from torch.utils.checkpoint import checkpoint
|
13 |
+
|
14 |
+
from cn_clip.clip import _tokenizer
|
15 |
+
from cn_clip.clip.configuration_bert import BertConfig
|
16 |
+
from cn_clip.clip.modeling_bert import BertModel
|
17 |
+
|
18 |
+
|
19 |
+
class Bottleneck(nn.Module):
|
20 |
+
expansion = 4
|
21 |
+
|
22 |
+
def __init__(self, inplanes, planes, stride=1):
|
23 |
+
super().__init__()
|
24 |
+
|
25 |
+
# all conv layers have stride 1. an avgpool is performed after the second convolution when stride > 1
|
26 |
+
self.conv1 = nn.Conv2d(inplanes, planes, 1, bias=False)
|
27 |
+
self.bn1 = nn.BatchNorm2d(planes)
|
28 |
+
|
29 |
+
self.conv2 = nn.Conv2d(planes, planes, 3, padding=1, bias=False)
|
30 |
+
self.bn2 = nn.BatchNorm2d(planes)
|
31 |
+
|
32 |
+
self.avgpool = nn.AvgPool2d(stride) if stride > 1 else nn.Identity()
|
33 |
+
|
34 |
+
self.conv3 = nn.Conv2d(planes, planes * self.expansion, 1, bias=False)
|
35 |
+
self.bn3 = nn.BatchNorm2d(planes * self.expansion)
|
36 |
+
|
37 |
+
self.relu = nn.ReLU(inplace=True)
|
38 |
+
self.downsample = None
|
39 |
+
self.stride = stride
|
40 |
+
|
41 |
+
if stride > 1 or inplanes != planes * Bottleneck.expansion:
|
42 |
+
# downsampling layer is prepended with an avgpool, and the subsequent convolution has stride 1
|
43 |
+
self.downsample = nn.Sequential(OrderedDict([
|
44 |
+
("-1", nn.AvgPool2d(stride)),
|
45 |
+
("0", nn.Conv2d(inplanes, planes * self.expansion, 1, stride=1, bias=False)),
|
46 |
+
("1", nn.BatchNorm2d(planes * self.expansion))
|
47 |
+
]))
|
48 |
+
|
49 |
+
def forward(self, x: torch.Tensor):
|
50 |
+
identity = x
|
51 |
+
|
52 |
+
out = self.relu(self.bn1(self.conv1(x)))
|
53 |
+
out = self.relu(self.bn2(self.conv2(out)))
|
54 |
+
out = self.avgpool(out)
|
55 |
+
out = self.bn3(self.conv3(out))
|
56 |
+
|
57 |
+
if self.downsample is not None:
|
58 |
+
identity = self.downsample(x)
|
59 |
+
|
60 |
+
out += identity
|
61 |
+
out = self.relu(out)
|
62 |
+
return out
|
63 |
+
|
64 |
+
|
65 |
+
class AttentionPool2d(nn.Module):
|
66 |
+
def __init__(self, spacial_dim: int, embed_dim: int, num_heads: int, output_dim: int = None):
|
67 |
+
super().__init__()
|
68 |
+
self.positional_embedding = nn.Parameter(torch.randn(spacial_dim ** 2 + 1, embed_dim) / embed_dim ** 0.5)
|
69 |
+
self.k_proj = nn.Linear(embed_dim, embed_dim)
|
70 |
+
self.q_proj = nn.Linear(embed_dim, embed_dim)
|
71 |
+
self.v_proj = nn.Linear(embed_dim, embed_dim)
|
72 |
+
self.c_proj = nn.Linear(embed_dim, output_dim or embed_dim)
|
73 |
+
self.num_heads = num_heads
|
74 |
+
|
75 |
+
def forward(self, x):
|
76 |
+
x = x.reshape(x.shape[0], x.shape[1], x.shape[2] * x.shape[3]).permute(2, 0, 1) # NCHW -> (HW)NC
|
77 |
+
x = torch.cat([x.mean(dim=0, keepdim=True), x], dim=0) # (HW+1)NC
|
78 |
+
x = x + self.positional_embedding[:, None, :].to(x.dtype) # (HW+1)NC
|
79 |
+
x, _ = F.multi_head_attention_forward(
|
80 |
+
query=x, key=x, value=x,
|
81 |
+
embed_dim_to_check=x.shape[-1],
|
82 |
+
num_heads=self.num_heads,
|
83 |
+
q_proj_weight=self.q_proj.weight,
|
84 |
+
k_proj_weight=self.k_proj.weight,
|
85 |
+
v_proj_weight=self.v_proj.weight,
|
86 |
+
in_proj_weight=None,
|
87 |
+
in_proj_bias=torch.cat([self.q_proj.bias, self.k_proj.bias, self.v_proj.bias]),
|
88 |
+
bias_k=None,
|
89 |
+
bias_v=None,
|
90 |
+
add_zero_attn=False,
|
91 |
+
dropout_p=0,
|
92 |
+
out_proj_weight=self.c_proj.weight,
|
93 |
+
out_proj_bias=self.c_proj.bias,
|
94 |
+
use_separate_proj_weight=True,
|
95 |
+
training=self.training,
|
96 |
+
need_weights=False
|
97 |
+
)
|
98 |
+
|
99 |
+
return x[0]
|
100 |
+
|
101 |
+
|
102 |
+
class ModifiedResNet(nn.Module):
|
103 |
+
"""
|
104 |
+
A ResNet class that is similar to torchvision's but contains the following changes:
|
105 |
+
- There are now 3 "stem" convolutions as opposed to 1, with an average pool instead of a max pool.
|
106 |
+
- Performs anti-aliasing strided convolutions, where an avgpool is prepended to convolutions with stride > 1
|
107 |
+
- The final pooling layer is a QKV attention instead of an average pool
|
108 |
+
"""
|
109 |
+
|
110 |
+
def __init__(self, layers, output_dim, heads, input_resolution=224, width=64):
|
111 |
+
super().__init__()
|
112 |
+
self.output_dim = output_dim
|
113 |
+
self.input_resolution = input_resolution
|
114 |
+
|
115 |
+
# the 3-layer stem
|
116 |
+
self.conv1 = nn.Conv2d(3, width // 2, kernel_size=3, stride=2, padding=1, bias=False)
|
117 |
+
self.bn1 = nn.BatchNorm2d(width // 2)
|
118 |
+
self.conv2 = nn.Conv2d(width // 2, width // 2, kernel_size=3, padding=1, bias=False)
|
119 |
+
self.bn2 = nn.BatchNorm2d(width // 2)
|
120 |
+
self.conv3 = nn.Conv2d(width // 2, width, kernel_size=3, padding=1, bias=False)
|
121 |
+
self.bn3 = nn.BatchNorm2d(width)
|
122 |
+
self.avgpool = nn.AvgPool2d(2)
|
123 |
+
self.relu = nn.ReLU(inplace=True)
|
124 |
+
|
125 |
+
# residual layers
|
126 |
+
self._inplanes = width # this is a *mutable* variable used during construction
|
127 |
+
self.layer1 = self._make_layer(width, layers[0])
|
128 |
+
self.layer2 = self._make_layer(width * 2, layers[1], stride=2)
|
129 |
+
self.layer3 = self._make_layer(width * 4, layers[2], stride=2)
|
130 |
+
self.layer4 = self._make_layer(width * 8, layers[3], stride=2)
|
131 |
+
|
132 |
+
embed_dim = width * 32 # the ResNet feature dimension
|
133 |
+
self.attnpool = AttentionPool2d(input_resolution // 32, embed_dim, heads, output_dim)
|
134 |
+
|
135 |
+
def _make_layer(self, planes, blocks, stride=1):
|
136 |
+
layers = [Bottleneck(self._inplanes, planes, stride)]
|
137 |
+
|
138 |
+
self._inplanes = planes * Bottleneck.expansion
|
139 |
+
for _ in range(1, blocks):
|
140 |
+
layers.append(Bottleneck(self._inplanes, planes))
|
141 |
+
|
142 |
+
return nn.Sequential(*layers)
|
143 |
+
|
144 |
+
@torch.jit.ignore
|
145 |
+
def set_grad_checkpointing(self, enable=True):
|
146 |
+
# FIXME support for non-transformer
|
147 |
+
pass
|
148 |
+
|
149 |
+
def forward(self, x):
|
150 |
+
def stem(x):
|
151 |
+
for conv, bn in [(self.conv1, self.bn1), (self.conv2, self.bn2), (self.conv3, self.bn3)]:
|
152 |
+
x = self.relu(bn(conv(x)))
|
153 |
+
x = self.avgpool(x)
|
154 |
+
return x
|
155 |
+
|
156 |
+
x = x.type(self.conv1.weight.dtype)
|
157 |
+
x = stem(x)
|
158 |
+
x = self.layer1(x)
|
159 |
+
x = self.layer2(x)
|
160 |
+
x = self.layer3(x)
|
161 |
+
x = self.layer4(x)
|
162 |
+
x = self.attnpool(x)
|
163 |
+
|
164 |
+
return x
|
165 |
+
|
166 |
+
|
167 |
+
class LayerNorm(nn.LayerNorm):
|
168 |
+
"""Subclass torch's LayerNorm to handle fp16."""
|
169 |
+
|
170 |
+
def forward(self, x: torch.Tensor):
|
171 |
+
orig_type = x.dtype
|
172 |
+
ret = super().forward(x.type(torch.float32))
|
173 |
+
return ret.type(orig_type)
|
174 |
+
|
175 |
+
|
176 |
+
class QuickGELU(nn.Module):
|
177 |
+
def forward(self, x: torch.Tensor):
|
178 |
+
return x * torch.sigmoid(1.702 * x)
|
179 |
+
|
180 |
+
|
181 |
+
class ResidualAttentionBlock(nn.Module):
|
182 |
+
def __init__(self, d_model: int, n_head: int, attn_mask: torch.Tensor = None):
|
183 |
+
super().__init__()
|
184 |
+
|
185 |
+
self.attn = nn.MultiheadAttention(d_model, n_head)
|
186 |
+
self.ln_1 = LayerNorm(d_model)
|
187 |
+
self.mlp = nn.Sequential(OrderedDict([
|
188 |
+
("c_fc", nn.Linear(d_model, d_model * 4)),
|
189 |
+
("gelu", QuickGELU()),
|
190 |
+
("c_proj", nn.Linear(d_model * 4, d_model))
|
191 |
+
]))
|
192 |
+
self.ln_2 = LayerNorm(d_model)
|
193 |
+
self.attn_mask = attn_mask
|
194 |
+
|
195 |
+
def attention(self, x: torch.Tensor):
|
196 |
+
self.attn_mask = self.attn_mask.to(dtype=x.dtype, device=x.device) if self.attn_mask is not None else None
|
197 |
+
return self.attn(x, x, x, need_weights=False, attn_mask=self.attn_mask)[0]
|
198 |
+
|
199 |
+
def forward(self, x: torch.Tensor):
|
200 |
+
x = x + self.attention(self.ln_1(x))
|
201 |
+
x = x + self.mlp(self.ln_2(x))
|
202 |
+
return x
|
203 |
+
|
204 |
+
|
205 |
+
class Transformer(nn.Module):
|
206 |
+
def __init__(self, width: int, layers: int, heads: int, attn_mask: torch.Tensor = None):
|
207 |
+
super().__init__()
|
208 |
+
self.width = width
|
209 |
+
self.layers = layers
|
210 |
+
self.grad_checkpointing = False
|
211 |
+
self.resblocks = nn.Sequential(*[ResidualAttentionBlock(width, heads, attn_mask) for _ in range(layers)])
|
212 |
+
|
213 |
+
def forward(self, x: torch.Tensor):
|
214 |
+
if self.grad_checkpointing and not torch.jit.is_scripting():
|
215 |
+
for r in self.resblocks:
|
216 |
+
x = checkpoint(r, x)
|
217 |
+
return x
|
218 |
+
return self.resblocks(x)
|
219 |
+
|
220 |
+
|
221 |
+
class VisualTransformer(nn.Module):
|
222 |
+
def __init__(self, input_resolution: int, patch_size: int, width: int, layers: int, heads: int, output_dim: int):
|
223 |
+
super().__init__()
|
224 |
+
self.input_resolution = input_resolution
|
225 |
+
self.grid_size = (self.input_resolution // patch_size, self.input_resolution // patch_size)
|
226 |
+
self.output_dim = output_dim
|
227 |
+
self.conv1 = nn.Conv2d(in_channels=3, out_channels=width, kernel_size=patch_size, stride=patch_size, bias=False)
|
228 |
+
|
229 |
+
scale = width ** -0.5
|
230 |
+
self.class_embedding = nn.Parameter(scale * torch.randn(width))
|
231 |
+
self.positional_embedding = nn.Parameter(scale * torch.randn((input_resolution // patch_size) ** 2 + 1, width))
|
232 |
+
self.ln_pre = LayerNorm(width)
|
233 |
+
|
234 |
+
self.transformer = Transformer(width, layers, heads)
|
235 |
+
|
236 |
+
self.ln_post = LayerNorm(width)
|
237 |
+
self.proj = nn.Parameter(scale * torch.randn(width, output_dim))
|
238 |
+
|
239 |
+
@torch.jit.ignore
|
240 |
+
def set_grad_checkpointing(self, enable=True):
|
241 |
+
self.transformer.grad_checkpointing = enable
|
242 |
+
|
243 |
+
def forward(self, x: torch.Tensor):
|
244 |
+
x = self.conv1(x) # shape = [*, width, grid, grid]
|
245 |
+
x = x.reshape(x.shape[0], x.shape[1], -1) # shape = [*, width, grid ** 2]
|
246 |
+
x = x.permute(0, 2, 1) # shape = [*, grid ** 2, width]
|
247 |
+
x = torch.cat([self.class_embedding.to(x.dtype) + torch.zeros(x.shape[0], 1, x.shape[-1], dtype=x.dtype, device=x.device), x], dim=1) # shape = [*, grid ** 2 + 1, width]
|
248 |
+
x = x + self.positional_embedding.to(x.dtype)
|
249 |
+
x = self.ln_pre(x)
|
250 |
+
|
251 |
+
x = x.permute(1, 0, 2) # NLD -> LND
|
252 |
+
x = self.transformer(x)
|
253 |
+
x = x.permute(1, 0, 2) # LND -> NLD
|
254 |
+
|
255 |
+
x = self.ln_post(x[:, 0, :])
|
256 |
+
|
257 |
+
if self.proj is not None:
|
258 |
+
x = x @ self.proj
|
259 |
+
|
260 |
+
return x
|
261 |
+
|
262 |
+
|
263 |
+
class CLIP(nn.Module):
|
264 |
+
def __init__(self,
|
265 |
+
embed_dim: int,
|
266 |
+
# vision
|
267 |
+
image_resolution: int,
|
268 |
+
vision_layers: Union[Tuple[int, int, int, int], int],
|
269 |
+
vision_width: int,
|
270 |
+
vision_patch_size: int,
|
271 |
+
# text
|
272 |
+
vocab_size: int,
|
273 |
+
text_attention_probs_dropout_prob: float,
|
274 |
+
text_hidden_act: str,
|
275 |
+
text_hidden_dropout_prob: float,
|
276 |
+
text_hidden_size: int,
|
277 |
+
text_initializer_range: float,
|
278 |
+
text_intermediate_size: int,
|
279 |
+
text_max_position_embeddings: int,
|
280 |
+
text_num_attention_heads: int,
|
281 |
+
text_num_hidden_layers: int,
|
282 |
+
text_type_vocab_size: int,
|
283 |
+
tokenizer = _tokenizer,
|
284 |
+
# vision head width, added this param for ViT-H
|
285 |
+
vision_head_width: int = 64,
|
286 |
+
):
|
287 |
+
super().__init__()
|
288 |
+
|
289 |
+
if isinstance(vision_layers, (tuple, list)):
|
290 |
+
vision_heads = vision_width * 32 // vision_head_width
|
291 |
+
self.visual = ModifiedResNet(
|
292 |
+
layers=vision_layers,
|
293 |
+
output_dim=embed_dim,
|
294 |
+
heads=vision_heads,
|
295 |
+
input_resolution=image_resolution,
|
296 |
+
width=vision_width
|
297 |
+
)
|
298 |
+
else:
|
299 |
+
vision_heads = vision_width // vision_head_width
|
300 |
+
self.visual = VisualTransformer(
|
301 |
+
input_resolution=image_resolution,
|
302 |
+
patch_size=vision_patch_size,
|
303 |
+
width=vision_width,
|
304 |
+
layers=vision_layers,
|
305 |
+
heads=vision_heads,
|
306 |
+
output_dim=embed_dim
|
307 |
+
)
|
308 |
+
|
309 |
+
self.bert_config = BertConfig(
|
310 |
+
vocab_size_or_config_json_file=vocab_size,
|
311 |
+
hidden_size=text_hidden_size,
|
312 |
+
num_hidden_layers=text_num_hidden_layers,
|
313 |
+
num_attention_heads=text_num_attention_heads,
|
314 |
+
intermediate_size=text_intermediate_size,
|
315 |
+
hidden_act=text_hidden_act,
|
316 |
+
hidden_dropout_prob=text_hidden_dropout_prob,
|
317 |
+
attention_probs_dropout_prob=text_attention_probs_dropout_prob,
|
318 |
+
max_position_embeddings=text_max_position_embeddings,
|
319 |
+
type_vocab_size=text_type_vocab_size,
|
320 |
+
initializer_range=text_initializer_range,
|
321 |
+
layer_norm_eps=1e-12,
|
322 |
+
)
|
323 |
+
self.bert = BertModel(self.bert_config)
|
324 |
+
|
325 |
+
self.text_projection = nn.Parameter(torch.empty(text_hidden_size, embed_dim))
|
326 |
+
self.logit_scale = nn.Parameter(torch.ones([]) * np.log(1 / 0.07))
|
327 |
+
|
328 |
+
self.tokenizer = tokenizer
|
329 |
+
|
330 |
+
self.initialize_parameters()
|
331 |
+
|
332 |
+
def initialize_parameters(self):
|
333 |
+
self.logit_scale = nn.Parameter(torch.ones([]) * np.log(1 / 0.07))
|
334 |
+
|
335 |
+
if isinstance(self.visual, ModifiedResNet):
|
336 |
+
if self.visual.attnpool is not None:
|
337 |
+
std = self.visual.attnpool.c_proj.in_features ** -0.5
|
338 |
+
nn.init.normal_(self.visual.attnpool.q_proj.weight, std=std)
|
339 |
+
nn.init.normal_(self.visual.attnpool.k_proj.weight, std=std)
|
340 |
+
nn.init.normal_(self.visual.attnpool.v_proj.weight, std=std)
|
341 |
+
nn.init.normal_(self.visual.attnpool.c_proj.weight, std=std)
|
342 |
+
|
343 |
+
for resnet_block in [self.visual.layer1, self.visual.layer2, self.visual.layer3, self.visual.layer4]:
|
344 |
+
for name, param in resnet_block.named_parameters():
|
345 |
+
if name.endswith("bn3.weight"):
|
346 |
+
nn.init.zeros_(param)
|
347 |
+
|
348 |
+
if self.text_projection is not None:
|
349 |
+
nn.init.normal_(self.text_projection, std=self.bert_config.hidden_size ** -0.5)
|
350 |
+
|
351 |
+
@torch.jit.ignore
|
352 |
+
def set_grad_checkpointing(self, enable=True):
|
353 |
+
self.visual.set_grad_checkpointing(enable)
|
354 |
+
self.bert.set_grad_checkpointing(enable)
|
355 |
+
|
356 |
+
@property
|
357 |
+
def dtype(self):
|
358 |
+
return self.visual.conv1.weight.dtype
|
359 |
+
|
360 |
+
def encode_image(self, image):
|
361 |
+
return self.visual(image.type(self.dtype))
|
362 |
+
|
363 |
+
def encode_text(self, text):
|
364 |
+
pad_index = self.tokenizer.vocab['[PAD]']
|
365 |
+
attn_mask = text.ne(pad_index).type(self.dtype)
|
366 |
+
x = self.bert(text, attention_mask=attn_mask)[0].type(self.dtype) # [batch_size, seq_length, hidden_size]
|
367 |
+
return x[:, 0, :] @ self.text_projection
|
368 |
+
|
369 |
+
def forward(self, image, text):
|
370 |
+
assert image is not None or text is not None, "text and image cannot both be None!"
|
371 |
+
|
372 |
+
if image is None:
|
373 |
+
return self.encode_text(text)
|
374 |
+
elif text is None:
|
375 |
+
return self.encode_image(image)
|
376 |
+
|
377 |
+
image_features = self.encode_image(image)
|
378 |
+
text_features = self.encode_text(text)
|
379 |
+
|
380 |
+
image_features = image_features / image_features.norm(dim=-1, keepdim=True)
|
381 |
+
text_features = text_features / text_features.norm(dim=-1, keepdim=True)
|
382 |
+
|
383 |
+
return image_features, text_features, self.logit_scale.exp()
|
384 |
+
|
385 |
+
def get_similarity(self, image, text):
|
386 |
+
image_features = self.encode_image(image)
|
387 |
+
text_features = self.encode_text(text)
|
388 |
+
|
389 |
+
# normalized features
|
390 |
+
image_features = image_features / image_features.norm(dim=1, keepdim=True)
|
391 |
+
text_features = text_features / text_features.norm(dim=1, keepdim=True)
|
392 |
+
|
393 |
+
# cosine similarity as logits
|
394 |
+
logit_scale = self.logit_scale.exp()
|
395 |
+
logits_per_image = logit_scale * image_features @ text_features.t()
|
396 |
+
logits_per_text = logits_per_image.t()
|
397 |
+
|
398 |
+
# shape = [global_batch_size, global_batch_size]
|
399 |
+
return logits_per_image, logits_per_text
|
400 |
+
|
401 |
+
|
402 |
+
def convert_models_to_fp32(model):
|
403 |
+
for p in model.parameters():
|
404 |
+
p.data = p.data.float()
|
405 |
+
if p.grad is not None:
|
406 |
+
p.grad.data = p.grad.data.float()
|
407 |
+
|
408 |
+
|
409 |
+
def convert_weights(model: nn.Module):
|
410 |
+
"""Convert applicable model parameters to fp16"""
|
411 |
+
|
412 |
+
def _convert_weights_to_fp16(l):
|
413 |
+
if isinstance(l, (nn.Conv1d, nn.Conv2d, nn.Linear)):
|
414 |
+
l.weight.data = l.weight.data.half()
|
415 |
+
if l.bias is not None:
|
416 |
+
l.bias.data = l.bias.data.half()
|
417 |
+
|
418 |
+
if isinstance(l, nn.MultiheadAttention):
|
419 |
+
for attr in [*[f"{s}_proj_weight" for s in ["in", "q", "k", "v"]], "in_proj_bias", "bias_k", "bias_v"]:
|
420 |
+
tensor = getattr(l, attr)
|
421 |
+
if tensor is not None:
|
422 |
+
tensor.data = tensor.data.half()
|
423 |
+
|
424 |
+
if isinstance(l, BertModel):
|
425 |
+
l.to(torch.half)
|
426 |
+
|
427 |
+
for name in ["text_projection", "proj"]:
|
428 |
+
if hasattr(l, name):
|
429 |
+
attr = getattr(l, name)
|
430 |
+
if attr is not None:
|
431 |
+
attr.data = attr.data.half()
|
432 |
+
|
433 |
+
model.apply(_convert_weights_to_fp16)
|
434 |
+
|
435 |
+
|
436 |
+
def restore_model(model, clip_state_dict: dict, bert_state_dict: dict):
|
437 |
+
merged_state_dict = {}
|
438 |
+
|
439 |
+
# use clip_state_dict to initialize the image encoder & logit scale
|
440 |
+
if clip_state_dict is not None:
|
441 |
+
for k, v in clip_state_dict.items():
|
442 |
+
if k.startswith("visual") or k == "logit_scale":
|
443 |
+
merged_state_dict[k] = v
|
444 |
+
|
445 |
+
# use bert_state_dict to initialize the text encoder
|
446 |
+
if bert_state_dict is not None:
|
447 |
+
for k, v in bert_state_dict.items():
|
448 |
+
if k.startswith("bert") and "bert.pooler" not in k:
|
449 |
+
merged_state_dict[k] = v
|
450 |
+
|
451 |
+
convert_weights(model)
|
452 |
+
resize_pos_embed(merged_state_dict, model)
|
453 |
+
model.load_state_dict(merged_state_dict, strict=False)
|
454 |
+
return model.eval()
|
455 |
+
|
456 |
+
|
457 |
+
def resize_pos_embed(state_dict, model, interpolation: str = 'bicubic', seq_dim=1, prefix=""):
|
458 |
+
# Rescale the grid of position embeddings when loading from state_dict
|
459 |
+
old_pos_embed = state_dict.get(prefix + 'visual.positional_embedding', None)
|
460 |
+
model = model.module if hasattr(model, 'module') else model
|
461 |
+
if old_pos_embed is None or not hasattr(model.visual, 'grid_size'):
|
462 |
+
return
|
463 |
+
grid_size = to_2tuple(model.visual.grid_size)
|
464 |
+
extra_tokens = 1 # FIXME detect different token configs (ie no class token, or more)
|
465 |
+
new_seq_len = grid_size[0] * grid_size[1] + extra_tokens
|
466 |
+
if new_seq_len == old_pos_embed.shape[0]:
|
467 |
+
return
|
468 |
+
|
469 |
+
if extra_tokens:
|
470 |
+
pos_emb_tok, pos_emb_img = old_pos_embed[:extra_tokens], old_pos_embed[extra_tokens:]
|
471 |
+
else:
|
472 |
+
pos_emb_tok, pos_emb_img = None, old_pos_embed
|
473 |
+
old_grid_size = to_2tuple(int(math.sqrt(len(pos_emb_img))))
|
474 |
+
|
475 |
+
logging.info('Resizing position embedding grid-size from %s to %s', old_grid_size, grid_size)
|
476 |
+
pos_emb_img = pos_emb_img.reshape(1, old_grid_size[0], old_grid_size[1], -1).permute(0, 3, 1, 2)
|
477 |
+
pos_emb_img = F.interpolate(
|
478 |
+
pos_emb_img,
|
479 |
+
size=grid_size,
|
480 |
+
mode=interpolation,
|
481 |
+
align_corners=True,
|
482 |
+
)
|
483 |
+
pos_emb_img = pos_emb_img.permute(0, 2, 3, 1).reshape(1, grid_size[0] * grid_size[1], -1)[0]
|
484 |
+
if pos_emb_tok is not None:
|
485 |
+
new_pos_embed = torch.cat([pos_emb_tok, pos_emb_img], dim=0)
|
486 |
+
else:
|
487 |
+
new_pos_embed = pos_emb_img
|
488 |
+
state_dict[prefix + 'visual.positional_embedding'] = new_pos_embed
|
489 |
+
|
490 |
+
|
491 |
+
# From PyTorch internals
|
492 |
+
def _ntuple(n):
|
493 |
+
def parse(x):
|
494 |
+
if isinstance(x, collections.abc.Iterable):
|
495 |
+
return x
|
496 |
+
return tuple(repeat(x, n))
|
497 |
+
return parse
|
498 |
+
|
499 |
+
|
500 |
+
to_1tuple = _ntuple(1)
|
501 |
+
to_2tuple = _ntuple(2)
|
502 |
+
to_3tuple = _ntuple(3)
|
503 |
+
to_4tuple = _ntuple(4)
|
504 |
+
to_ntuple = lambda n, x: _ntuple(n)(x)
|
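To make the contract of CLIP.forward concrete, here is a small sketch of how its three return values (image features, text features, logit scale) are usually combined into contrastive logits, mirroring get_similarity above. The feature tensors are random stand-ins rather than real encoder outputs, and 512 is the ViT-B-16 embed_dim from the configs below.

import torch

image_features = torch.randn(4, 512)   # stand-in for encode_image(images)
text_features = torch.randn(4, 512)    # stand-in for encode_text(token_ids)
logit_scale = torch.tensor(1 / 0.07)   # value of logit_scale.exp() at initialization

image_features = image_features / image_features.norm(dim=-1, keepdim=True)
text_features = text_features / text_features.norm(dim=-1, keepdim=True)

logits_per_image = logit_scale * image_features @ text_features.t()  # shape [4, 4]
logits_per_text = logits_per_image.t()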
Model/CLIP/cn_clip/clip/model_configs/RBT3-chinese.json
ADDED
@@ -0,0 +1,13 @@
{
    "vocab_size": 21128,
    "text_attention_probs_dropout_prob": 0.1,
    "text_hidden_act": "gelu",
    "text_hidden_dropout_prob": 0.1,
    "text_hidden_size": 768,
    "text_initializer_range": 0.02,
    "text_intermediate_size": 3072,
    "text_max_position_embeddings": 512,
    "text_num_attention_heads": 12,
    "text_num_hidden_layers": 3,
    "text_type_vocab_size": 2
}
Model/CLIP/cn_clip/clip/model_configs/RN50.json
ADDED
@@ -0,0 +1,7 @@
{
    "embed_dim": 1024,
    "image_resolution": 224,
    "vision_layers": "[3,4,6,3]",
    "vision_width": 64,
    "vision_patch_size": null
}
Model/CLIP/cn_clip/clip/model_configs/RoBERTa-wwm-ext-base-chinese.json
ADDED
@@ -0,0 +1,13 @@
{
    "vocab_size": 21128,
    "text_attention_probs_dropout_prob": 0.1,
    "text_hidden_act": "gelu",
    "text_hidden_dropout_prob": 0.1,
    "text_hidden_size": 768,
    "text_initializer_range": 0.02,
    "text_intermediate_size": 3072,
    "text_max_position_embeddings": 512,
    "text_num_attention_heads": 12,
    "text_num_hidden_layers": 12,
    "text_type_vocab_size": 2
}
Model/CLIP/cn_clip/clip/model_configs/RoBERTa-wwm-ext-large-chinese.json
ADDED
@@ -0,0 +1,13 @@
{
    "vocab_size": 21128,
    "text_attention_probs_dropout_prob": 0.1,
    "text_hidden_act": "gelu",
    "text_hidden_dropout_prob": 0.1,
    "text_hidden_size": 1024,
    "text_initializer_range": 0.02,
    "text_intermediate_size": 4096,
    "text_max_position_embeddings": 512,
    "text_num_attention_heads": 16,
    "text_num_hidden_layers": 24,
    "text_type_vocab_size": 2
}
Model/CLIP/cn_clip/clip/model_configs/ViT-B-16.json
ADDED
@@ -0,0 +1,7 @@
{
    "embed_dim": 512,
    "image_resolution": 224,
    "vision_layers": 12,
    "vision_width": 768,
    "vision_patch_size": 16
}
Model/CLIP/cn_clip/clip/model_configs/ViT-B-32.json
ADDED
@@ -0,0 +1,7 @@
{
    "embed_dim": 512,
    "image_resolution": 224,
    "vision_layers": 12,
    "vision_width": 768,
    "vision_patch_size": 32
}
Model/CLIP/cn_clip/clip/model_configs/ViT-H-14.json
ADDED
@@ -0,0 +1,8 @@
{
    "embed_dim": 1024,
    "image_resolution": 224,
    "vision_layers": 32,
    "vision_width": 1280,
    "vision_head_width": 80,
    "vision_patch_size": 14
}
Model/CLIP/cn_clip/clip/model_configs/ViT-L-14-336.json
ADDED
@@ -0,0 +1,7 @@
{
    "embed_dim": 768,
    "image_resolution": 336,
    "vision_layers": 24,
    "vision_width": 1024,
    "vision_patch_size": 14
}
Model/CLIP/cn_clip/clip/model_configs/ViT-L-14.json
ADDED
@@ -0,0 +1,7 @@
{
    "embed_dim": 768,
    "image_resolution": 224,
    "vision_layers": 24,
    "vision_width": 1024,
    "vision_patch_size": 14
}
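The JSON files above split the model definition into vision configs (embed_dim, vision_*) and text configs (vocab_size, text_*). Below is a sketch of how the two halves can be merged into keyword arguments for the CLIP constructor, assuming it is run from this model_configs directory; for_learn.py below does essentially the same thing.

import json

with open("ViT-B-16.json") as fv, open("RoBERTa-wwm-ext-base-chinese.json") as ft:
    model_info = json.load(fv)
    model_info.update(json.load(ft))

# RN50.json stores vision_layers as the string "[3,4,6,3]", so convert it when present.
if isinstance(model_info["vision_layers"], str):
    model_info["vision_layers"] = eval(model_info["vision_layers"])

# model = CLIP(**model_info)  # as constructed in cn_clip.clip.model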
Model/CLIP/cn_clip/clip/model_configs/for_learn.py
ADDED
@@ -0,0 +1,16 @@
import json
from pathlib import Path
import os

vision_model = "ViT-B-16"
vision_model_config_file = \
    Path(__file__).parent / f"{vision_model.replace('/', '-')}.json"
print('Loading vision model config from', vision_model_config_file)
assert os.path.exists(vision_model_config_file)
with open(vision_model_config_file, 'r') as fv:
    model_info = json.load(fv)  # keep the parsed dict so its keys can be indexed below
print('Model info:', model_info)
if isinstance(model_info['vision_layers'], str):
    model_info['vision_layers'] = eval(model_info['vision_layers'])
Model/CLIP/cn_clip/clip/modeling_bert.py
ADDED
@@ -0,0 +1,460 @@
1 |
+
# coding=utf-8
|
2 |
+
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
|
3 |
+
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
|
4 |
+
#
|
5 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
6 |
+
# you may not use this file except in compliance with the License.
|
7 |
+
# You may obtain a copy of the License at
|
8 |
+
#
|
9 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
10 |
+
#
|
11 |
+
# Unless required by applicable law or agreed to in writing, software
|
12 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
13 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
14 |
+
# See the License for the specific language governing permissions and
|
15 |
+
# limitations under the License.
|
16 |
+
"""PyTorch BERT model. """
|
17 |
+
|
18 |
+
from __future__ import absolute_import, division, print_function, unicode_literals
|
19 |
+
|
20 |
+
import json
|
21 |
+
import logging
|
22 |
+
import math
|
23 |
+
import os
|
24 |
+
import sys
|
25 |
+
from io import open
|
26 |
+
|
27 |
+
import torch
|
28 |
+
from torch import nn
|
29 |
+
from torch.utils.checkpoint import checkpoint
|
30 |
+
|
31 |
+
from .configuration_bert import BertConfig
|
32 |
+
|
33 |
+
logger = logging.getLogger(__name__)
|
34 |
+
|
35 |
+
def gelu(x):
|
36 |
+
""" Original Implementation of the gelu activation function in Google Bert repo when initially created.
|
37 |
+
For information: OpenAI GPT's gelu is slightly different (and gives slightly different results):
|
38 |
+
0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
|
39 |
+
Also see https://arxiv.org/abs/1606.08415
|
40 |
+
"""
|
41 |
+
return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))
|
42 |
+
|
43 |
+
def gelu_new(x):
|
44 |
+
""" Implementation of the gelu activation function currently in Google Bert repo (identical to OpenAI GPT).
|
45 |
+
Also see https://arxiv.org/abs/1606.08415
|
46 |
+
"""
|
47 |
+
return 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
|
48 |
+
|
49 |
+
def swish(x):
|
50 |
+
return x * torch.sigmoid(x)
|
51 |
+
|
52 |
+
|
53 |
+
ACT2FN = {"gelu": gelu, "relu": torch.nn.functional.relu, "swish": swish, "gelu_new": gelu_new}
|
54 |
+
|
55 |
+
|
56 |
+
BertLayerNorm = torch.nn.LayerNorm
|
57 |
+
|
58 |
+
class BertEmbeddings(nn.Module):
|
59 |
+
"""Construct the embeddings from word, position and token_type embeddings.
|
60 |
+
"""
|
61 |
+
def __init__(self, config):
|
62 |
+
super(BertEmbeddings, self).__init__()
|
63 |
+
self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=0)
|
64 |
+
self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
|
65 |
+
self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)
|
66 |
+
|
67 |
+
# self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load
|
68 |
+
# any TensorFlow checkpoint file
|
69 |
+
self.LayerNorm = BertLayerNorm(config.hidden_size, eps=config.layer_norm_eps)
|
70 |
+
self.dropout = nn.Dropout(config.hidden_dropout_prob)
|
71 |
+
|
72 |
+
def forward(self, input_ids, token_type_ids=None, position_ids=None):
|
73 |
+
seq_length = input_ids.size(1)
|
74 |
+
if position_ids is None:
|
75 |
+
position_ids = torch.arange(seq_length, dtype=torch.long, device=input_ids.device)
|
76 |
+
position_ids = position_ids.unsqueeze(0).expand_as(input_ids)
|
77 |
+
if token_type_ids is None:
|
78 |
+
token_type_ids = torch.zeros_like(input_ids)
|
79 |
+
|
80 |
+
words_embeddings = self.word_embeddings(input_ids)
|
81 |
+
position_embeddings = self.position_embeddings(position_ids)
|
82 |
+
token_type_embeddings = self.token_type_embeddings(token_type_ids)
|
83 |
+
|
84 |
+
embeddings = words_embeddings + position_embeddings + token_type_embeddings
|
85 |
+
embeddings = self.LayerNorm(embeddings)
|
86 |
+
embeddings = self.dropout(embeddings)
|
87 |
+
return embeddings
|
88 |
+
|
89 |
+
|
90 |
+
class BertSelfAttention(nn.Module):
|
91 |
+
def __init__(self, config):
|
92 |
+
super(BertSelfAttention, self).__init__()
|
93 |
+
if config.hidden_size % config.num_attention_heads != 0:
|
94 |
+
raise ValueError(
|
95 |
+
"The hidden size (%d) is not a multiple of the number of attention "
|
96 |
+
"heads (%d)" % (config.hidden_size, config.num_attention_heads))
|
97 |
+
self.output_attentions = config.output_attentions
|
98 |
+
|
99 |
+
self.num_attention_heads = config.num_attention_heads
|
100 |
+
self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
|
101 |
+
self.all_head_size = self.num_attention_heads * self.attention_head_size
|
102 |
+
|
103 |
+
self.query = nn.Linear(config.hidden_size, self.all_head_size)
|
104 |
+
self.key = nn.Linear(config.hidden_size, self.all_head_size)
|
105 |
+
self.value = nn.Linear(config.hidden_size, self.all_head_size)
|
106 |
+
|
107 |
+
self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
|
108 |
+
|
109 |
+
def transpose_for_scores(self, x):
|
110 |
+
|
111 |
+
new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
|
112 |
+
x = x.view(*new_x_shape)
|
113 |
+
return x.permute(0, 2, 1, 3)
|
114 |
+
|
115 |
+
def forward(self, hidden_states, attention_mask=None, head_mask=None):
|
116 |
+
mixed_query_layer = self.query(hidden_states)
|
117 |
+
mixed_key_layer = self.key(hidden_states)
|
118 |
+
mixed_value_layer = self.value(hidden_states)
|
119 |
+
|
120 |
+
query_layer = self.transpose_for_scores(mixed_query_layer)
|
121 |
+
key_layer = self.transpose_for_scores(mixed_key_layer)
|
122 |
+
value_layer = self.transpose_for_scores(mixed_value_layer)
|
123 |
+
|
124 |
+
# Take the dot product between "query" and "key" to get the raw attention scores.
|
125 |
+
attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
|
126 |
+
attention_scores = attention_scores / math.sqrt(self.attention_head_size)
|
127 |
+
if attention_mask is not None:
|
128 |
+
# Apply the attention mask is (precomputed for all layers in BertModel forward() function)
|
129 |
+
attention_scores = attention_scores + attention_mask
|
130 |
+
|
131 |
+
# Normalize the attention scores to probabilities.
|
132 |
+
attention_probs = nn.Softmax(dim=-1)(attention_scores)
|
133 |
+
|
134 |
+
# This is actually dropping out entire tokens to attend to, which might
|
135 |
+
# seem a bit unusual, but is taken from the original Transformer paper.
|
136 |
+
attention_probs = self.dropout(attention_probs)
|
137 |
+
|
138 |
+
# Mask heads if we want to
|
139 |
+
if head_mask is not None:
|
140 |
+
attention_probs = attention_probs * head_mask
|
141 |
+
|
142 |
+
context_layer = torch.matmul(attention_probs, value_layer)
|
143 |
+
|
144 |
+
context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
|
145 |
+
new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
|
146 |
+
context_layer = context_layer.view(*new_context_layer_shape)
|
147 |
+
|
148 |
+
outputs = (context_layer, attention_probs) if self.output_attentions else (context_layer,)
|
149 |
+
return outputs
|
150 |
+
|
151 |
+
|
152 |
+
class BertSelfOutput(nn.Module):
|
153 |
+
def __init__(self, config):
|
154 |
+
super(BertSelfOutput, self).__init__()
|
155 |
+
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
|
156 |
+
self.LayerNorm = BertLayerNorm(config.hidden_size, eps=config.layer_norm_eps)
|
157 |
+
self.dropout = nn.Dropout(config.hidden_dropout_prob)
|
158 |
+
|
159 |
+
def forward(self, hidden_states, input_tensor):
|
160 |
+
hidden_states = self.dense(hidden_states)
|
161 |
+
hidden_states = self.dropout(hidden_states)
|
162 |
+
hidden_states = self.LayerNorm(hidden_states + input_tensor)
|
163 |
+
return hidden_states
|
164 |
+
|
165 |
+
|
166 |
+
class BertAttention(nn.Module):
|
167 |
+
def __init__(self, config):
|
168 |
+
super(BertAttention, self).__init__()
|
169 |
+
self.self = BertSelfAttention(config)
|
170 |
+
self.output = BertSelfOutput(config)
|
171 |
+
self.pruned_heads = set()
|
172 |
+
|
173 |
+
def forward(self, input_tensor, attention_mask=None, head_mask=None):
|
174 |
+
self_outputs = self.self(input_tensor, attention_mask, head_mask)
|
175 |
+
attention_output = self.output(self_outputs[0], input_tensor)
|
176 |
+
outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them
|
177 |
+
return outputs
|
178 |
+
|
179 |
+
|
180 |
+
class BertIntermediate(nn.Module):
|
181 |
+
def __init__(self, config):
|
182 |
+
super(BertIntermediate, self).__init__()
|
183 |
+
self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
|
184 |
+
if isinstance(config.hidden_act, str) or (sys.version_info[0] == 2 and isinstance(config.hidden_act, unicode)):
|
185 |
+
self.intermediate_act_fn = ACT2FN[config.hidden_act]
|
186 |
+
else:
|
187 |
+
self.intermediate_act_fn = config.hidden_act
|
188 |
+
|
189 |
+
def forward(self, hidden_states):
|
190 |
+
hidden_states = self.dense(hidden_states)
|
191 |
+
hidden_states = self.intermediate_act_fn(hidden_states)
|
192 |
+
return hidden_states
|
193 |
+
|
194 |
+
|
195 |
+
class BertOutput(nn.Module):
|
196 |
+
def __init__(self, config):
|
197 |
+
super(BertOutput, self).__init__()
|
198 |
+
self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
|
199 |
+
self.LayerNorm = BertLayerNorm(config.hidden_size, eps=config.layer_norm_eps)
|
200 |
+
self.dropout = nn.Dropout(config.hidden_dropout_prob)
|
201 |
+
|
202 |
+
def forward(self, hidden_states, input_tensor):
|
203 |
+
hidden_states = self.dense(hidden_states)
|
204 |
+
hidden_states = self.dropout(hidden_states)
|
205 |
+
hidden_states = self.LayerNorm(hidden_states + input_tensor)
|
206 |
+
return hidden_states
|
207 |
+
|
208 |
+
|
209 |
+
class BertLayer(nn.Module):
|
210 |
+
def __init__(self, config):
|
211 |
+
super(BertLayer, self).__init__()
|
212 |
+
self.attention = BertAttention(config)
|
213 |
+
self.intermediate = BertIntermediate(config)
|
214 |
+
self.output = BertOutput(config)
|
215 |
+
|
216 |
+
def forward(self, hidden_states, attention_mask=None, head_mask=None):
|
217 |
+
attention_outputs = self.attention(hidden_states, attention_mask, head_mask)
|
218 |
+
attention_output = attention_outputs[0]
|
219 |
+
intermediate_output = self.intermediate(attention_output)
|
220 |
+
layer_output = self.output(intermediate_output, attention_output)
|
221 |
+
outputs = (layer_output,) + attention_outputs[1:] # add attentions if we output them
|
222 |
+
if len(outputs) == 1:
|
223 |
+
return outputs[0]
|
224 |
+
return outputs
|
225 |
+
|
226 |
+
|
227 |
+
class BertEncoder(nn.Module):
|
228 |
+
def __init__(self, config):
|
229 |
+
super(BertEncoder, self).__init__()
|
230 |
+
self.output_attentions = config.output_attentions
|
231 |
+
self.output_hidden_states = config.output_hidden_states
|
232 |
+
self.grad_checkpointing = False
|
233 |
+
self.layer = nn.ModuleList([BertLayer(config) for _ in range(config.num_hidden_layers)])
|
234 |
+
|
235 |
+
def forward(self, hidden_states, attention_mask=None, head_mask=None):
|
236 |
+
all_hidden_states = ()
|
237 |
+
all_attentions = ()
|
238 |
+
for i, layer_module in enumerate(self.layer):
|
239 |
+
if self.output_hidden_states:
|
240 |
+
all_hidden_states = all_hidden_states + (hidden_states,)
|
241 |
+
|
242 |
+
if self.grad_checkpointing and not torch.jit.is_scripting():
|
243 |
+
layer_outputs = checkpoint(layer_module, hidden_states, attention_mask, head_mask[i])
|
244 |
+
else:
|
245 |
+
layer_outputs = layer_module(hidden_states, attention_mask, head_mask[i])
|
246 |
+
if not isinstance(layer_outputs, tuple):
|
247 |
+
layer_outputs = (layer_outputs, )
|
248 |
+
hidden_states = layer_outputs[0]
|
249 |
+
|
250 |
+
if self.output_attentions:
|
251 |
+
all_attentions = all_attentions + (layer_outputs[1],)
|
252 |
+
|
253 |
+
# Add last layer
|
254 |
+
if self.output_hidden_states:
|
255 |
+
all_hidden_states = all_hidden_states + (hidden_states,)
|
256 |
+
|
257 |
+
outputs = (hidden_states,)
|
258 |
+
if self.output_hidden_states:
|
259 |
+
outputs = outputs + (all_hidden_states,)
|
260 |
+
if self.output_attentions:
|
261 |
+
outputs = outputs + (all_attentions,)
|
262 |
+
return outputs # last-layer hidden state, (all hidden states), (all attentions)
|
263 |
+
|
264 |
+
|
265 |
+
class BertPooler(nn.Module):
|
266 |
+
def __init__(self, config):
|
267 |
+
super(BertPooler, self).__init__()
|
268 |
+
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
|
269 |
+
self.activation = nn.Tanh()
|
270 |
+
|
271 |
+
def forward(self, hidden_states):
|
272 |
+
# We "pool" the model by simply taking the hidden state corresponding
|
273 |
+
# to the first token.
|
274 |
+
first_token_tensor = hidden_states[:, 0]
|
275 |
+
pooled_output = self.dense(first_token_tensor)
|
276 |
+
pooled_output = self.activation(pooled_output)
|
277 |
+
return pooled_output
|
278 |
+
|
279 |
+
|
280 |
+
class BertPredictionHeadTransform(nn.Module):
|
281 |
+
def __init__(self, config):
|
282 |
+
super(BertPredictionHeadTransform, self).__init__()
|
283 |
+
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
|
284 |
+
if isinstance(config.hidden_act, str) or (sys.version_info[0] == 2 and isinstance(config.hidden_act, unicode)):
|
285 |
+
self.transform_act_fn = ACT2FN[config.hidden_act]
|
286 |
+
else:
|
287 |
+
self.transform_act_fn = config.hidden_act
|
288 |
+
self.LayerNorm = BertLayerNorm(config.hidden_size, eps=config.layer_norm_eps)
|
289 |
+
|
290 |
+
def forward(self, hidden_states):
|
291 |
+
hidden_states = self.dense(hidden_states)
|
292 |
+
hidden_states = self.transform_act_fn(hidden_states)
|
293 |
+
hidden_states = self.LayerNorm(hidden_states)
|
294 |
+
return hidden_states
|
295 |
+
|
296 |
+
|
297 |
+
class BertLMPredictionHead(nn.Module):
|
298 |
+
def __init__(self, config):
|
299 |
+
super(BertLMPredictionHead, self).__init__()
|
300 |
+
self.transform = BertPredictionHeadTransform(config)
|
301 |
+
|
302 |
+
# The output weights are the same as the input embeddings, but there is
|
303 |
+
# an output-only bias for each token.
|
304 |
+
self.decoder = nn.Linear(config.hidden_size,
|
305 |
+
config.vocab_size,
|
306 |
+
bias=False)
|
307 |
+
|
308 |
+
self.bias = nn.Parameter(torch.zeros(config.vocab_size))
|
309 |
+
|
310 |
+
def forward(self, hidden_states):
|
311 |
+
hidden_states = self.transform(hidden_states)
|
312 |
+
hidden_states = self.decoder(hidden_states) + self.bias
|
313 |
+
return hidden_states
|
314 |
+
|
315 |
+
|
316 |
+
class BertOnlyMLMHead(nn.Module):
|
317 |
+
def __init__(self, config):
|
318 |
+
super(BertOnlyMLMHead, self).__init__()
|
319 |
+
self.predictions = BertLMPredictionHead(config)
|
320 |
+
|
321 |
+
def forward(self, sequence_output):
|
322 |
+
prediction_scores = self.predictions(sequence_output)
|
323 |
+
return prediction_scores
|
324 |
+
|
325 |
+
|
326 |
+
class BertOnlyNSPHead(nn.Module):
|
327 |
+
def __init__(self, config):
|
328 |
+
super(BertOnlyNSPHead, self).__init__()
|
329 |
+
self.seq_relationship = nn.Linear(config.hidden_size, 2)
|
330 |
+
|
331 |
+
def forward(self, pooled_output):
|
332 |
+
seq_relationship_score = self.seq_relationship(pooled_output)
|
333 |
+
return seq_relationship_score
|
334 |
+
|
335 |
+
|
336 |
+
class BertPreTrainingHeads(nn.Module):
|
337 |
+
def __init__(self, config):
|
338 |
+
super(BertPreTrainingHeads, self).__init__()
|
339 |
+
self.predictions = BertLMPredictionHead(config)
|
340 |
+
self.seq_relationship = nn.Linear(config.hidden_size, 2)
|
341 |
+
|
342 |
+
def forward(self, sequence_output, pooled_output):
|
343 |
+
prediction_scores = self.predictions(sequence_output)
|
344 |
+
seq_relationship_score = self.seq_relationship(pooled_output)
|
345 |
+
return prediction_scores, seq_relationship_score
|
346 |
+
|
347 |
+
|
348 |
+
class BertPreTrainedModel(nn.Module):
|
349 |
+
config_class = BertConfig
|
350 |
+
base_model_prefix = "bert"
|
351 |
+
|
352 |
+
def __init__(self, config):
|
353 |
+
super(BertPreTrainedModel, self).__init__()
|
354 |
+
self.config = config
|
355 |
+
|
356 |
+
def _init_weights(self, module):
|
357 |
+
""" Initialize the weights """
|
358 |
+
if isinstance(module, (nn.Linear, nn.Embedding)):
|
359 |
+
# Slightly different from the TF version which uses truncated_normal for initialization
|
360 |
+
# cf https://github.com/pytorch/pytorch/pull/5617
|
361 |
+
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
|
362 |
+
elif isinstance(module, BertLayerNorm):
|
363 |
+
module.bias.data.zero_()
|
364 |
+
module.weight.data.fill_(1.0)
|
365 |
+
if isinstance(module, nn.Linear) and module.bias is not None:
|
366 |
+
module.bias.data.zero_()
|
367 |
+
|
368 |
+
|
369 |
+
class BertModel(BertPreTrainedModel):
|
370 |
+
r"""
|
371 |
+
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
|
372 |
+
**last_hidden_state**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, hidden_size)``
|
373 |
+
Sequence of hidden-states at the output of the last layer of the model.
|
374 |
+
**pooler_output**: ``torch.FloatTensor`` of shape ``(batch_size, hidden_size)``
|
375 |
+
Last layer hidden-state of the first token of the sequence (classification token)
|
376 |
+
further processed by a Linear layer and a Tanh activation function. The Linear
|
377 |
+
layer weights are trained from the next sentence prediction (classification)
|
378 |
+
objective during Bert pretraining. This output is usually *not* a good summary
|
379 |
+
of the semantic content of the input, you're often better with averaging or pooling
|
380 |
+
the sequence of hidden-states for the whole input sequence.
|
381 |
+
**hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
|
382 |
+
list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
|
383 |
+
of shape ``(batch_size, sequence_length, hidden_size)``:
|
384 |
+
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
|
385 |
+
**attentions**: (`optional`, returned when ``config.output_attentions=True``)
|
386 |
+
list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
|
387 |
+
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
|
388 |
+
|
389 |
+
Examples::
|
390 |
+
|
391 |
+
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
|
392 |
+
model = BertModel.from_pretrained('bert-base-uncased')
|
393 |
+
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1
|
394 |
+
outputs = model(input_ids)
|
395 |
+
last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
|
396 |
+
|
397 |
+
"""
|
398 |
+
def __init__(self, config):
|
399 |
+
super(BertModel, self).__init__(config)
|
400 |
+
|
401 |
+
self.embeddings = BertEmbeddings(config)
|
402 |
+
self.encoder = BertEncoder(config)
|
403 |
+
# self.pooler = BertPooler(config)
|
404 |
+
|
405 |
+
self.apply(self._init_weights)
|
406 |
+
|
407 |
+
@torch.jit.ignore
|
408 |
+
def set_grad_checkpointing(self, enable=True):
|
409 |
+
if enable:
|
410 |
+
assert not self.config.output_attentions, \
|
411 |
+
"Grad checkpointing is currently conflict with output_attentions for BertEncoder, \
|
412 |
+
please set it to False in BertConfig"
|
413 |
+
self.encoder.grad_checkpointing = enable
|
414 |
+
|
415 |
+
def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None):
|
416 |
+
if attention_mask is None:
|
417 |
+
attention_mask = torch.ones_like(input_ids)
|
418 |
+
if token_type_ids is None:
|
419 |
+
token_type_ids = torch.zeros_like(input_ids)
|
420 |
+
|
421 |
+
# We create a 3D attention mask from a 2D tensor mask.
|
422 |
+
# Sizes are [batch_size, 1, 1, to_seq_length]
|
423 |
+
# So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]
|
424 |
+
# this attention mask is more simple than the triangular masking of causal attention
|
425 |
+
# used in OpenAI GPT, we just need to prepare the broadcast dimension here.
|
426 |
+
extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)
|
427 |
+
|
428 |
+
# Since attention_mask is 1.0 for positions we want to attend and 0.0 for
|
429 |
+
# masked positions, this operation will create a tensor which is 0.0 for
|
430 |
+
# positions we want to attend and -10000.0 for masked positions.
|
431 |
+
# Since we are adding it to the raw scores before the softmax, this is
|
432 |
+
# effectively the same as removing these entirely.
|
433 |
+
extended_attention_mask = extended_attention_mask.to(dtype=next(self.parameters()).dtype) # fp16 compatibility
|
434 |
+
extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0
|
435 |
+
|
436 |
+
# Prepare head mask if needed
|
437 |
+
# 1.0 in head_mask indicate we keep the head
|
438 |
+
# attention_probs has shape bsz x n_heads x N x N
|
439 |
+
# input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
|
440 |
+
# and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
|
441 |
+
if head_mask is not None:
|
442 |
+
if head_mask.dim() == 1:
|
443 |
+
head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1)
|
444 |
+
head_mask = head_mask.expand(self.config.num_hidden_layers, -1, -1, -1, -1)
|
445 |
+
elif head_mask.dim() == 2:
|
446 |
+
head_mask = head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1) # We can specify head_mask for each layer
|
447 |
+
head_mask = head_mask.to(dtype=next(self.parameters()).dtype) # switch to float if needed + fp16 compatibility
|
448 |
+
else:
|
449 |
+
head_mask = [None] * self.config.num_hidden_layers
|
450 |
+
|
451 |
+
embedding_output = self.embeddings(input_ids, position_ids=position_ids, token_type_ids=token_type_ids)
|
452 |
+
encoder_outputs = self.encoder(embedding_output,
|
453 |
+
extended_attention_mask,
|
454 |
+
head_mask=head_mask)
|
455 |
+
sequence_output = encoder_outputs[0]
|
456 |
+
# pooled_output = self.pooler(sequence_output)
|
457 |
+
pooled_output = None
|
458 |
+
|
459 |
+
outputs = (sequence_output, pooled_output,) + encoder_outputs[1:] # add hidden_states and attentions if they are here
|
460 |
+
return outputs # sequence_output, pooled_output, (hidden_states), (attentions)
|
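A minimal sketch of how CLIP.encode_text in model.py drives this BertModel: token ids go in together with a padding-aware attention mask, and only the hidden state at the first ([CLS]) position is kept for projection. The ids below are arbitrary, and [PAD] is assumed to be id 0 as in the bundled Chinese vocabulary.

import torch
from cn_clip.clip.configuration_bert import BertConfig
from cn_clip.clip.modeling_bert import BertModel

config = BertConfig(vocab_size_or_config_json_file=21128, num_hidden_layers=3)
bert = BertModel(config)

input_ids = torch.tensor([[101, 2769, 4263, 102, 0, 0]])  # one sequence padded to length 6
attn_mask = input_ids.ne(0).float()                       # 1.0 for real tokens, 0.0 for [PAD]

sequence_output = bert(input_ids, attention_mask=attn_mask)[0]  # [1, 6, 768]
cls_state = sequence_output[:, 0, :]                            # what encode_text projects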
Model/CLIP/cn_clip/clip/utils.py
ADDED
@@ -0,0 +1,196 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Code modified from https://github.com/openai/CLIP

import json
import os
from pathlib import Path
from typing import Union, List
import urllib

import torch
from torchvision.transforms import Compose, ToTensor, Normalize, Resize
from tqdm import tqdm

from cn_clip.clip import _tokenizer
from cn_clip.clip.model import convert_weights, CLIP, restore_model

__all__ = ["load", "tokenize", "available_models", "image_transform", "load_from_name"]

_MODELS = {
    "ViT-B-16": "https://clip-cn-beijing.oss-cn-beijing.aliyuncs.com/checkpoints/clip_cn_vit-b-16.pt",
    "ViT-L-14": "https://clip-cn-beijing.oss-cn-beijing.aliyuncs.com/checkpoints/clip_cn_vit-l-14.pt",
    "ViT-L-14-336": "https://clip-cn-beijing.oss-cn-beijing.aliyuncs.com/checkpoints/clip_cn_vit-l-14-336.pt",
    "ViT-H-14": "https://clip-cn-beijing.oss-cn-beijing.aliyuncs.com/checkpoints/clip_cn_vit-h-14.pt",
    "RN50": "https://clip-cn-beijing.oss-cn-beijing.aliyuncs.com/checkpoints/clip_cn_rn50.pt",
}
_MODEL_INFO = {
    "ViT-B-16": {
        "struct": "ViT-B-16@RoBERTa-wwm-ext-base-chinese",
        "input_resolution": 224
    },
    "ViT-L-14": {
        "struct": "ViT-L-14@RoBERTa-wwm-ext-base-chinese",
        "input_resolution": 224
    },
    "ViT-L-14-336": {
        "struct": "ViT-L-14-336@RoBERTa-wwm-ext-base-chinese",
        "input_resolution": 336
    },
    "ViT-H-14": {
        "struct": "ViT-H-14@RoBERTa-wwm-ext-large-chinese",
        "input_resolution": 224
    },
    "RN50": {
        "struct": "RN50@RBT3-chinese",
        "input_resolution": 224
    },
}


def _download(url: str, root: str):
    os.makedirs(root, exist_ok=True)
    filename = os.path.basename(url)

    download_target = os.path.join(root, filename)

    if os.path.exists(download_target) and not os.path.isfile(download_target):
        raise RuntimeError(f"{download_target} exists and is not a regular file")

    if os.path.isfile(download_target):
        return download_target

    with urllib.request.urlopen(url) as source, open(download_target, "wb") as output:
        with tqdm(total=int(source.info().get("Content-Length")), ncols=80, unit='iB', unit_scale=True,
                  unit_divisor=1024) as loop:
            while True:
                buffer = source.read(8192)
                if not buffer:
                    break

                output.write(buffer)
                loop.update(len(buffer))

    return download_target


def _convert_image_to_rgb(image):
    return image.convert("RGB")


def available_models() -> List[str]:
    """Returns the names of available CLIP models"""
    return list(_MODELS.keys())


def load_from_name(name: str, device: Union[str, torch.device] = "cuda" if torch.cuda.is_available() else "cpu",
                   download_root: str = None, resume: str = None):
    if resume is not None:
        model_path = resume
    elif name in _MODELS:
        model_path = _download(_MODELS[name], download_root or os.path.expanduser("~/.cache/clip"))
    elif os.path.isfile(name):
        model_path = name
    else:
        raise RuntimeError(f"Model {name} not found; available models = {available_models()}")

    with open(model_path, 'rb') as opened_file:
        # loading saved checkpoint
        checkpoint = torch.load(opened_file, map_location="cpu")

    model = create_model(_MODEL_INFO[name]['struct'], checkpoint)
    if str(device) == "cpu":
        model.float()
    else:
        model.to(device)

    return model, image_transform(_MODEL_INFO[name]['input_resolution'])


def load(model, device: Union[str, torch.device] = "cuda" if torch.cuda.is_available() else "cpu", clip_path=None,
         bert_path=None):
    """Load CLIP and BERT model weights"""

    bert_state_dict = torch.load(bert_path, map_location="cpu") if bert_path else None
    clip_state_dict = torch.load(clip_path, map_location="cpu") if clip_path else None

    restore_model(model, clip_state_dict, bert_state_dict).to(device)

    if str(device) == "cpu":
        model.float()
    return model


def tokenize(texts: Union[str, List[str]], context_length: int = 64) -> torch.LongTensor:
    """
    Returns the tokenized representation of given input string(s)

    Parameters
    ----------
    texts : Union[str, List[str]]
        An input string or a list of input strings to tokenize
    context_length : int
        The context length to use; all baseline models use 24 as the context length

    Returns
    -------
    A two-dimensional tensor containing the resulting tokens, shape = [number of input strings, context_length]
    """
    if isinstance(texts, str):
        texts = [texts]

    all_tokens = []
    for text in texts:
        all_tokens.append([_tokenizer.vocab['[CLS]']] + _tokenizer.convert_tokens_to_ids(_tokenizer.tokenize(text))[
            :context_length - 2] + [_tokenizer.vocab['[SEP]']])

    result = torch.zeros(len(all_tokens), context_length, dtype=torch.long)

    for i, tokens in enumerate(all_tokens):
        assert len(tokens) <= context_length
        result[i, :len(tokens)] = torch.tensor(tokens)

    return result


def _convert_to_rgb(image):
    return image.convert('RGB')


def image_transform(image_size=224):
    transform = Compose([
        _convert_to_rgb,
        Resize((image_size, image_size)),
        ToTensor(),
        Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711)),
    ])
    return transform


def create_model(model_name, checkpoint=None):
    vision_model, text_model = model_name.split('@')
    # Initialize the model.
    vision_model_config_file = Path(
        __file__).parent / f"model_configs/{vision_model.replace('/', '-')}.json"
    # print('Loading vision model config from', vision_model_config_file)
    assert os.path.exists(vision_model_config_file)

    text_model_config_file = Path(
        __file__).parent / f"model_configs/{text_model.replace('/', '-')}.json"
    # print('Loading text model config from', text_model_config_file)
    assert os.path.exists(text_model_config_file)

    with open(vision_model_config_file, 'r') as fv, open(text_model_config_file, 'r') as ft:
        model_info = json.load(fv)
        for k, v in json.load(ft).items():
            model_info[k] = v
        if isinstance(model_info['vision_layers'], str):
            model_info['vision_layers'] = eval(model_info['vision_layers'])
    # print('Model info', model_info)
    model = CLIP(**model_info)
    convert_weights(model)
    if checkpoint:
        sd = checkpoint["state_dict"]
        if next(iter(sd.items()))[0].startswith('module'):
            sd = {k[len('module.'):]: v for k, v in sd.items() if "bert.pooler" not in k}
        model.load_state_dict(sd)
    return model
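Taken together, load_from_name, image_transform and tokenize give the full inference path for this module. A minimal usage sketch follows; the image path is hypothetical, and the model(image, None) / model(None, text) encoding convention is taken from cn_clip/eval/extract_features.py later in this diff rather than from anything defined in this file.

import torch
from PIL import Image
from cn_clip.clip.utils import load_from_name, tokenize

device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = load_from_name("ViT-B-16", device=device)  # downloads the checkpoint on first use
model.eval()

image = preprocess(Image.open("example.jpg")).unsqueeze(0).to(device)  # hypothetical local image
texts = tokenize(["一只猫", "一只狗"]).to(device)

with torch.no_grad():
    image_feat = model(image, None)   # image tower only
    text_feat = model(None, texts)    # text tower only
    image_feat /= image_feat.norm(dim=-1, keepdim=True)
    text_feat /= text_feat.norm(dim=-1, keepdim=True)
    similarity = image_feat @ text_feat.t()  # cosine similarity, higher = better match
print(similarity)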
Model/CLIP/cn_clip/clip/vocab.txt
ADDED
The diff for this file is too large to render.
See raw diff
|
|
Model/CLIP/cn_clip/eval/__init__.py
ADDED
File without changes
|
Model/CLIP/cn_clip/eval/data.py
ADDED
@@ -0,0 +1,167 @@
import os
import logging
import json
from dataclasses import dataclass
from pathlib import Path

from PIL import Image
import base64
from io import BytesIO

import lmdb

from torchvision.transforms import Compose, Resize, ToTensor, Normalize, InterpolationMode

import torch
from torch.utils.data import Dataset, DataLoader, SubsetRandomSampler
from torch.utils.data.distributed import DistributedSampler
from torch.utils.data.sampler import SequentialSampler

import torchvision.datasets as datasets

from cn_clip.clip import tokenize

def _convert_to_rgb(image):
    return image.convert('RGB')


def _preprocess_text(text):
    # adapt the text to Chinese BERT vocab
    text = text.lower().replace("“", "\"").replace("”", "\"")
    return text

class EvalTxtDataset(Dataset):
    def __init__(self, jsonl_filename, max_txt_length=24):
        assert os.path.exists(jsonl_filename), "The annotation datafile {} does not exist!".format(jsonl_filename)

        logging.debug(f'Loading jsonl data from {jsonl_filename}.')
        self.texts = []
        with open(jsonl_filename, "r") as fin:
            for line in fin:
                obj = json.loads(line.strip())
                text_id = obj['text_id']
                text = obj['text']
                self.texts.append((text_id, text))
        logging.debug(f'Finished loading jsonl data from {jsonl_filename}.')

        self.max_txt_length = max_txt_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text_id, text = self.texts[idx]
        text = tokenize([_preprocess_text(str(text))], context_length=self.max_txt_length)[0]
        return text_id, text

class EvalImgDataset(Dataset):
    def __init__(self, lmdb_imgs, resolution=224):
        assert os.path.isdir(lmdb_imgs), "The image LMDB directory {} does not exist!".format(lmdb_imgs)

        logging.debug(f'Loading image LMDB from {lmdb_imgs}.')

        self.env_imgs = lmdb.open(lmdb_imgs, readonly=True, create=False, lock=False, readahead=False, meminit=False)
        self.txn_imgs = self.env_imgs.begin(buffers=True)
        self.cursor_imgs = self.txn_imgs.cursor()
        self.iter_imgs = iter(self.cursor_imgs)
        self.number_images = int(self.txn_imgs.get(key=b'num_images').tobytes().decode('utf-8'))
        logging.info("The specified LMDB directory contains {} images.".format(self.number_images))

        self.transform = self._build_transform(resolution)

    def _build_transform(self, resolution):
        normalize = Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711))
        return Compose([
            Resize((resolution, resolution), interpolation=InterpolationMode.BICUBIC),
            _convert_to_rgb,
            ToTensor(),
            normalize,
        ])

    def __len__(self):
        return self.number_images

    def __getitem__(self, idx):
        img_id, image_b64 = next(self.iter_imgs)
        if img_id == b"num_images":
            img_id, image_b64 = next(self.iter_imgs)

        img_id = img_id.tobytes()
        image_b64 = image_b64.tobytes()

        img_id = int(img_id.decode(encoding="utf8", errors="ignore"))
        image_b64 = image_b64.decode(encoding="utf8", errors="ignore")
        image = Image.open(BytesIO(base64.urlsafe_b64decode(image_b64)))  # already resized
        image = self.transform(image)

        return img_id, image

@dataclass
class DataInfo:
    dataloader: DataLoader
    sampler: DistributedSampler

def get_eval_txt_dataset(args, max_txt_length=24):
    input_filename = args.text_data
    dataset = EvalTxtDataset(
        input_filename,
        max_txt_length=max_txt_length)
    num_samples = len(dataset)
    sampler = SequentialSampler(dataset)

    dataloader = DataLoader(
        dataset,
        batch_size=args.text_batch_size,
        num_workers=0,
        pin_memory=True,
        sampler=sampler,
        drop_last=False,
    )
    dataloader.num_samples = num_samples
    dataloader.num_batches = len(dataloader)

    return DataInfo(dataloader, sampler)

def fetch_resolution(vision_model):
    # fetch the resolution from the vision model config
    vision_model_config_file = Path(__file__).parent.parent / f"clip/model_configs/{vision_model.replace('/', '-')}.json"
    with open(vision_model_config_file, 'r') as fv:
        model_info = json.load(fv)
    return model_info["image_resolution"]

def get_eval_img_dataset(args):
    lmdb_imgs = args.image_data
    dataset = EvalImgDataset(
        lmdb_imgs, resolution=fetch_resolution(args.vision_model))
    num_samples = len(dataset)
    sampler = SequentialSampler(dataset)

    dataloader = DataLoader(
        dataset,
        batch_size=args.img_batch_size,
        num_workers=0,
        pin_memory=True,
        sampler=sampler,
        drop_last=False,
    )
    dataloader.num_samples = num_samples
    dataloader.num_batches = len(dataloader)

    return DataInfo(dataloader, sampler)

def get_imagenet_dataset(args, preprocess_fn, split):
    assert split in ["val"]

    data_path = args.imagenet_val
    assert data_path

    dataset = datasets.ImageFolder(data_path, transform=preprocess_fn)

    dataloader = torch.utils.data.DataLoader(
        dataset,
        batch_size=args.img_batch_size,
        num_workers=args.num_workers,
        sampler=None,
    )

    return DataInfo(dataloader, None)
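The eval datasets are constructed from a plain args object, so they can be exercised without the full training harness. A minimal sketch, assuming a small queries jsonl exists at the hypothetical path below (one {"text_id": ..., "text": ...} object per line):

from argparse import Namespace
from cn_clip.eval.data import get_eval_txt_dataset

# Only the attributes read by get_eval_txt_dataset need to be present.
args = Namespace(text_data="valid_texts.jsonl", text_batch_size=32)  # hypothetical path

data_info = get_eval_txt_dataset(args, max_txt_length=24)
for text_ids, token_batch in data_info.dataloader:
    print(text_ids[:3], token_batch.shape)  # token_batch: [batch_size, 24] LongTensor
    break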
Model/CLIP/cn_clip/eval/evaluation.py
ADDED
@@ -0,0 +1,157 @@
# -*- coding: utf-8 -*-
'''
This script computes the recall scores given the ground-truth annotations and predictions.
'''

import json
import sys
import os
import string
import numpy as np
import time

NUM_K = 10

def read_submission(submit_path, reference, k=5):
    # check whether the path of the submitted file exists
    if not os.path.exists(submit_path):
        raise Exception("The submission file is not found!")

    submission_dict = {}
    ref_qids = set(reference.keys())

    with open(submit_path) as fin:
        for line in fin:
            line = line.strip()
            try:
                pred_obj = json.loads(line)
            except:
                raise Exception('Cannot parse this line into json object: {}'.format(line))
            if "text_id" not in pred_obj:
                raise Exception('There exists one line not containing text_id: {}'.format(line))
            if not isinstance(pred_obj['text_id'], int):
                raise Exception('Found an invalid text_id {}, it should be an integer (not string), please check your schema'.format(pred_obj['text_id']))
            qid = pred_obj["text_id"]
            if "image_ids" not in pred_obj:
                raise Exception('There exists one line not containing the predicted image_ids: {}'.format(line))
            image_ids = pred_obj["image_ids"]
            if not isinstance(image_ids, list):
                raise Exception('The image_ids field of text_id {} is not a list, please check your schema'.format(qid))
            # check whether there are K predicted products for each text
            if len(image_ids) != k:
                raise Exception('Text_id {} has wrong number of predicted image_ids! Require {}, but {} found.'.format(qid, k, len(image_ids)))
            # check whether there exists an invalid prediction for any text
            for rank, image_id in enumerate(image_ids):
                if not isinstance(image_id, int):
                    raise Exception('Text_id {} has an invalid predicted image_id {} at rank {}, it should be an integer (not string), please check your schema'.format(qid, image_id, rank + 1))
            # check whether there are duplicate predicted products for a single text
            if len(set(image_ids)) != k:
                raise Exception('Text_id {} has duplicate products in your prediction. Please check again!'.format(qid))
            submission_dict[qid] = image_ids  # here we save the list of product ids

    # check if any text is missing in the submission
    pred_qids = set(submission_dict.keys())
    nopred_qids = ref_qids - pred_qids
    if len(nopred_qids) != 0:
        raise Exception('The following text_ids have no prediction in your submission, please check again: {}'.format(", ".join([str(idx) for idx in nopred_qids])))

    return submission_dict


def dump_2_json(info, path):
    with open(path, 'w') as output_json_file:
        json.dump(info, output_json_file)


def report_error_msg(detail, showMsg, out_p):
    error_dict = dict()
    error_dict['errorDetail'] = detail
    error_dict['errorMsg'] = showMsg
    error_dict['score'] = 0
    error_dict['scoreJson'] = {}
    error_dict['success'] = False
    dump_2_json(error_dict, out_p)


def report_score(r1, r5, r10, out_p):
    result = dict()
    result['success'] = True
    mean_recall = (r1 + r5 + r10) / 3.0
    result['score'] = mean_recall * 100
    result['scoreJson'] = {'score': mean_recall * 100, 'mean_recall': mean_recall * 100, 'r1': r1 * 100, 'r5': r5 * 100, 'r10': r10 * 100}
    dump_2_json(result, out_p)


def read_reference(path):
    fin = open(path)
    reference = dict()
    for line in fin:
        line = line.strip()
        obj = json.loads(line)
        reference[obj['text_id']] = obj['image_ids']
    return reference

def compute_score(golden_file, predict_file):
    # read ground-truth
    reference = read_reference(golden_file)

    # read predictions
    k = 10
    predictions = read_submission(predict_file, reference, k)

    # compute score for each text
    r1_stat, r5_stat, r10_stat = 0, 0, 0
    for qid in reference.keys():
        ground_truth_ids = set(reference[qid])
        top10_pred_ids = predictions[qid]
        if any([idx in top10_pred_ids[:1] for idx in ground_truth_ids]):
            r1_stat += 1
        if any([idx in top10_pred_ids[:5] for idx in ground_truth_ids]):
            r5_stat += 1
        if any([idx in top10_pred_ids[:10] for idx in ground_truth_ids]):
            r10_stat += 1
    # the higher the score, the better
    r1, r5, r10 = r1_stat * 1.0 / len(reference), r5_stat * 1.0 / len(reference), r10_stat * 1.0 / len(reference)
    mean_recall = (r1 + r5 + r10) / 3.0
    result = [mean_recall, r1, r5, r10]
    result = [score * 100 for score in result]
    return result


if __name__ == "__main__":
    # the path of the answer json file (eg. test_queries_answers.jsonl)
    standard_path = sys.argv[1]
    # the path of the prediction file (eg. example_pred.jsonl)
    submit_path = sys.argv[2]
    # the score will be dumped into this output json file
    out_path = sys.argv[3]

    print("Read standard from %s" % standard_path)
    print("Read user submit file from %s" % submit_path)

    try:
        # read ground-truth
        reference = read_reference(standard_path)

        # read predictions
        k = 10
        predictions = read_submission(submit_path, reference, k)

        # compute score for each text
        r1_stat, r5_stat, r10_stat = 0, 0, 0
        for qid in reference.keys():
            ground_truth_ids = set(reference[qid])
            top10_pred_ids = predictions[qid]
            if any([idx in top10_pred_ids[:1] for idx in ground_truth_ids]):
                r1_stat += 1
            if any([idx in top10_pred_ids[:5] for idx in ground_truth_ids]):
                r5_stat += 1
            if any([idx in top10_pred_ids[:10] for idx in ground_truth_ids]):
                r10_stat += 1
        # the higher the score, the better
        r1, r5, r10 = r1_stat * 1.0 / len(reference), r5_stat * 1.0 / len(reference), r10_stat * 1.0 / len(reference)
        report_score(r1, r5, r10, out_path)
        print("The evaluation finished successfully.")
    except Exception as e:
        report_error_msg(e.args[0], e.args[0], out_path)
        print("The evaluation failed: {}".format(e.args[0]))
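Both the ground-truth file and the submission are jsonl with one object per text query, and compute_score returns [mean_recall, R@1, R@5, R@10] scaled to percentages. A small self-contained sketch with made-up ids and hypothetical filenames (it assumes this module is importable as cn_clip.eval.evaluation):

import json
from cn_clip.eval.evaluation import compute_score

# Ground truth: each text query lists its relevant image ids.
with open("golden.jsonl", "w") as f:
    f.write(json.dumps({"text_id": 1, "image_ids": [101]}) + "\n")

# Submission: exactly k=10 distinct integer image ids per text query.
with open("pred.jsonl", "w") as f:
    f.write(json.dumps({"text_id": 1, "image_ids": [101, 102, 103, 104, 105, 106, 107, 108, 109, 110]}) + "\n")

mean_recall, r1, r5, r10 = compute_score("golden.jsonl", "pred.jsonl")
print(mean_recall, r1, r5, r10)  # 100.0 100.0 100.0 100.0 for this toy case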
Model/CLIP/cn_clip/eval/evaluation_tr.py
ADDED
@@ -0,0 +1,157 @@
1 |
+
# -*- coding: utf-8 -*-
|
2 |
+
'''
|
3 |
+
This script computes the recall scores given the ground-truth annotations and predictions.
|
4 |
+
'''
|
5 |
+
|
6 |
+
import json
|
7 |
+
import sys
|
8 |
+
import os
|
9 |
+
import string
|
10 |
+
import numpy as np
|
11 |
+
import time
|
12 |
+
|
13 |
+
NUM_K = 10
|
14 |
+
|
15 |
+
def read_submission(submit_path, reference, k=5):
|
16 |
+
# check whether the path of submitted file exists
|
17 |
+
if not os.path.exists(submit_path):
|
18 |
+
raise Exception("The submission file is not found!")
|
19 |
+
|
20 |
+
submission_dict = {}
|
21 |
+
ref_image_ids = set(reference.keys())
|
22 |
+
|
23 |
+
with open(submit_path) as fin:
|
24 |
+
for line in fin:
|
25 |
+
line = line.strip()
|
26 |
+
try:
|
27 |
+
pred_obj = json.loads(line)
|
28 |
+
except:
|
29 |
+
raise Exception('Cannot parse this line into json object: {}'.format(line))
|
30 |
+
if "image_id" not in pred_obj:
|
31 |
+
raise Exception('There exists one line not containing image_id: {}'.format(line))
|
32 |
+
if not isinstance(pred_obj['image_id'], int):
|
33 |
+
raise Exception('Found an invalid image_id {}, it should be an integer (not string), please check your schema'.format(pred_obj['image_id']))
|
34 |
+
image_id = pred_obj['image_id']
|
35 |
+
if "text_ids" not in pred_obj:
|
36 |
+
raise Exception('There exists one line not containing the predicted text_ids: {}'.format(line))
|
37 |
+
text_ids = pred_obj["text_ids"]
|
38 |
+
if not isinstance(text_ids, list):
|
39 |
+
raise Exception('The text_ids field of image_id {} is not a list, please check your schema'.format(image_id))
|
40 |
+
# check whether there are K products for each text
|
41 |
+
if len(text_ids) != k:
|
42 |
+
raise Exception('Image_id {} has wrong number of predicted text_ids! Require {}, but {} found.'.format(image_id, k, len(text_ids)))
|
43 |
+
# check whether there exist an invalid prediction for any text
|
44 |
+
for rank, text_id in enumerate(text_ids):
|
45 |
+
if not isinstance(text_id, int):
|
46 |
+
raise Exception('Image_id {} has an invalid predicted text_id {} at rank {}, it should be an integer (not string), please check your schema'.format(image_id, text_id, rank + 1))
|
47 |
+
# check whether there are duplicate predicted products for a single text
|
48 |
+
if len(set(text_ids)) != k:
|
49 |
+
raise Exception('Image_id {} has duplicate text_ids in your prediction. Please check again!'.format(image_id))
|
50 |
+
submission_dict[image_id] = text_ids # here we save the list of product ids
|
51 |
+
|
52 |
+
# check if any text is missing in the submission
|
53 |
+
pred_image_ids = set(submission_dict.keys())
|
54 |
+
nopred_image_ids = ref_image_ids - pred_image_ids
|
55 |
+
if len(nopred_image_ids) != 0:
|
56 |
+
raise Exception('The following image_ids have no prediction in your submission, please check again: {}'.format(", ".join([str(idx) for idx in nopred_image_ids])))
|
57 |
+
|
58 |
+
return submission_dict
|
59 |
+
|
60 |
+
|
61 |
+
def dump_2_json(info, path):
|
62 |
+
with open(path, 'w') as output_json_file:
|
63 |
+
json.dump(info, output_json_file)
|
64 |
+
|
65 |
+
|
66 |
+
def report_error_msg(detail, showMsg, out_p):
|
67 |
+
error_dict=dict()
|
68 |
+
error_dict['errorDetail']=detail
|
69 |
+
error_dict['errorMsg']=showMsg
|
70 |
+
error_dict['score']=0
|
71 |
+
error_dict['scoreJson']={}
|
72 |
+
error_dict['success']=False
|
73 |
+
dump_2_json(error_dict,out_p)
|
74 |
+
|
75 |
+
|
76 |
+
def report_score(r1, r5, r10, out_p):
|
77 |
+
result = dict()
|
78 |
+
result['success']=True
|
79 |
+
mean_recall = (r1 + r5 + r10) / 3.0
|
80 |
+
result['score'] = mean_recall * 100
|
81 |
+
result['scoreJson'] = {'score': mean_recall * 100, 'mean_recall': mean_recall * 100, 'r1': r1 * 100, 'r5': r5 * 100, 'r10': r10 * 100}
|
82 |
+
dump_2_json(result,out_p)
|
83 |
+
|
84 |
+
|
85 |
+
def read_reference(path):
|
86 |
+
fin = open(path)
|
87 |
+
reference = dict()
|
88 |
+
for line in fin:
|
89 |
+
line = line.strip()
|
90 |
+
obj = json.loads(line)
|
91 |
+
reference[obj['image_id']] = obj['text_ids']
|
92 |
+
return reference
|
93 |
+
|
94 |
+
def compute_score(golden_file, predict_file):
|
95 |
+
# read ground-truth
|
96 |
+
reference = read_reference(golden_file)
|
97 |
+
|
98 |
+
# read predictions
|
99 |
+
k = 10
|
100 |
+
predictions = read_submission(predict_file, reference, k)
|
101 |
+
|
102 |
+
# compute score for each text
|
103 |
+
r1_stat, r5_stat, r10_stat = 0, 0, 0
|
104 |
+
for qid in reference.keys():
|
105 |
+
ground_truth_ids = set(reference[qid])
|
106 |
+
top10_pred_ids = predictions[qid]
|
107 |
+
if any([idx in top10_pred_ids[:1] for idx in ground_truth_ids]):
|
108 |
+
r1_stat += 1
|
109 |
+
if any([idx in top10_pred_ids[:5] for idx in ground_truth_ids]):
|
110 |
+
r5_stat += 1
|
111 |
+
if any([idx in top10_pred_ids[:10] for idx in ground_truth_ids]):
|
112 |
+
r10_stat += 1
|
113 |
+
# the higher score, the better
|
114 |
+
r1, r5, r10 = r1_stat * 1.0 / len(reference), r5_stat * 1.0 / len(reference), r10_stat * 1.0 / len(reference)
|
115 |
+
mean_recall = (r1 + r5 + r10) / 3.0
|
116 |
+
result = [mean_recall, r1, r5, r10]
|
117 |
+
result = [score * 100 for score in result]
|
118 |
+
return result
|
119 |
+
|
120 |
+
|
121 |
+
if __name__=="__main__":
|
122 |
+
# the path of answer json file (eg. test_queries_answers.jsonl)
|
123 |
+
standard_path = sys.argv[1]
|
124 |
+
# the path of prediction file (eg. example_pred.jsonl)
|
125 |
+
submit_path = sys.argv[2]
|
126 |
+
# the score will be dumped into this output json file
|
127 |
+
out_path = sys.argv[3]
|
128 |
+
|
129 |
+
print("Read standard from %s" % standard_path)
|
130 |
+
print("Read user submit file from %s" % submit_path)
|
131 |
+
|
132 |
+
try:
|
133 |
+
# read ground-truth
|
134 |
+
reference = read_reference(standard_path)
|
135 |
+
|
136 |
+
# read predictions
|
137 |
+
k = 10
|
138 |
+
predictions = read_submission(submit_path, reference, k)
|
139 |
+
|
140 |
+
# compute score for each text
|
141 |
+
r1_stat, r5_stat, r10_stat = 0, 0, 0
|
142 |
+
for qid in reference.keys():
|
143 |
+
ground_truth_ids = set(reference[qid])
|
144 |
+
top10_pred_ids = predictions[qid]
|
145 |
+
if any([idx in top10_pred_ids[:1] for idx in ground_truth_ids]):
|
146 |
+
r1_stat += 1
|
147 |
+
if any([idx in top10_pred_ids[:5] for idx in ground_truth_ids]):
|
148 |
+
r5_stat += 1
|
149 |
+
if any([idx in top10_pred_ids[:10] for idx in ground_truth_ids]):
|
150 |
+
r10_stat += 1
|
151 |
+
# the higher score, the better
|
152 |
+
r1, r5, r10 = r1_stat * 1.0 / len(reference), r5_stat * 1.0 / len(reference), r10_stat * 1.0 / len(reference)
|
153 |
+
report_score(r1, r5, r10, out_path)
|
154 |
+
print("The evaluation finished successfully.")
|
155 |
+
except Exception as e:
|
156 |
+
report_error_msg(e.args[0], e.args[0], out_path)
|
157 |
+
print("The evaluation failed: {}".format(e.args[0]))
|
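evaluation_tr.py is the image-to-text mirror of evaluation.py above: the reference maps each image_id to its relevant text_ids, and the submission must supply exactly k predicted text_ids per image. A sketch of the two line formats it expects (made-up ids):

import json

golden_line = {"image_id": 1001, "text_ids": [7]}                               # ground-truth annotation line
pred_line = {"image_id": 1001, "text_ids": [7, 3, 9, 12, 4, 8, 21, 30, 2, 5]}   # exactly top-10, all ints, no duplicates

print(json.dumps(golden_line))
print(json.dumps(pred_line))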
Model/CLIP/cn_clip/eval/extract_features.py
ADDED
@@ -0,0 +1,205 @@
# -*- coding: utf-8 -*-
'''
This script extracts image and text features for evaluation (with a single GPU).
'''

import os
import argparse
import logging
from pathlib import Path
import json

import torch
from tqdm import tqdm

from cn_clip.clip.model import convert_weights, CLIP
from cn_clip.training.main import convert_models_to_fp32
from cn_clip.eval.data import get_eval_img_dataset, get_eval_txt_dataset

def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--extract-image-feats',
        action="store_true",
        default=False,
        help="Whether to extract image features."
    )
    parser.add_argument(
        '--extract-text-feats',
        action="store_true",
        default=False,
        help="Whether to extract text features."
    )
    parser.add_argument(
        '--image-data',
        type=str,
        default="../Multimodal_Retrieval/lmdb/test/imgs",
        help="If --extract-image-feats is True, specify the path of the LMDB directory storing input image base64 strings."
    )
    parser.add_argument(
        '--text-data',
        type=str,
        default="../Multimodal_Retrieval/test_texts.jsonl",
        help="If --extract-text-feats is True, specify the path of the input text Jsonl file."
    )
    parser.add_argument(
        '--image-feat-output-path',
        type=str,
        default=None,
        help="If --extract-image-feats is True, specify the path of the output image features."
    )
    parser.add_argument(
        '--text-feat-output-path',
        type=str,
        default=None,
        help="If --extract-text-feats is True, specify the path of the output text features."
    )
    parser.add_argument(
        "--img-batch-size", type=int, default=64, help="Image batch size."
    )
    parser.add_argument(
        "--text-batch-size", type=int, default=64, help="Text batch size."
    )
    parser.add_argument(
        "--context-length", type=int, default=64, help="The maximum length of input text (including [CLS] & [SEP] tokens)."
    )
    parser.add_argument(
        "--resume",
        default=None,
        type=str,
        help="path to latest checkpoint (default: none)",
    )
    parser.add_argument(
        "--precision",
        choices=["amp", "fp16", "fp32"],
        default="amp",
        help="Floating point precision."
    )
    parser.add_argument(
        "--vision-model",
        choices=["ViT-B-32", "ViT-B-16", "ViT-L-14", "ViT-L-14-336", "ViT-H-14", "RN50"],
        default="ViT-B-16",
        help="Name of the vision backbone to use.",
    )
    parser.add_argument(
        "--text-model",
        choices=["RoBERTa-wwm-ext-base-chinese", "RoBERTa-wwm-ext-large-chinese", "RBT3-chinese"],
        default="RoBERTa-wwm-ext-base-chinese",
        help="Name of the text backbone to use.",
    )
    parser.add_argument(
        "--debug",
        default=False,
        action="store_true",
        help="If true, more information is logged."
    )
    args = parser.parse_args()

    return args


if __name__ == "__main__":
    args = parse_args()

    assert args.extract_image_feats or args.extract_text_feats, "--extract-image-feats and --extract-text-feats cannot both be False!"

    # Log params.
    print("Params:")
    for name in sorted(vars(args)):
        val = getattr(args, name)
        print(f"  {name}: {val}")

    args.gpu = 0
    torch.cuda.set_device(args.gpu)

    # Initialize the model.
    vision_model_config_file = Path(__file__).parent.parent / f"clip/model_configs/{args.vision_model.replace('/', '-')}.json"
    print('Loading vision model config from', vision_model_config_file)
    assert os.path.exists(vision_model_config_file)

    text_model_config_file = Path(__file__).parent.parent / f"clip/model_configs/{args.text_model.replace('/', '-')}.json"
    print('Loading text model config from', text_model_config_file)
    assert os.path.exists(text_model_config_file)

    with open(vision_model_config_file, 'r') as fv, open(text_model_config_file, 'r') as ft:
        model_info = json.load(fv)
        if isinstance(model_info['vision_layers'], str):
            model_info['vision_layers'] = eval(model_info['vision_layers'])
        for k, v in json.load(ft).items():
            model_info[k] = v

    model = CLIP(**model_info)
    convert_weights(model)

    # See https://discuss.pytorch.org/t/valueerror-attemting-to-unscale-fp16-gradients/81372
    if args.precision == "amp" or args.precision == "fp32":
        convert_models_to_fp32(model)
    model.cuda(args.gpu)
    if args.precision == "fp16":
        convert_weights(model)

    # Get data.
    if args.extract_image_feats:
        print("Preparing image inference dataset.")
        img_data = get_eval_img_dataset(args)
    if args.extract_text_feats:
        print("Preparing text inference dataset.")
        text_data = get_eval_txt_dataset(args, max_txt_length=args.context_length)

    # Resume from a checkpoint.
    print("Begin to load model checkpoint from {}.".format(args.resume))
    assert os.path.exists(args.resume), "The checkpoint file {} does not exist!".format(args.resume)
    # Map model to be loaded to specified single gpu.
    loc = "cuda:{}".format(args.gpu)
    checkpoint = torch.load(args.resume, map_location='cpu')
    start_epoch = checkpoint["epoch"]
    sd = checkpoint["state_dict"]
    if next(iter(sd.items()))[0].startswith('module'):
        sd = {k[len('module.'):]: v for k, v in sd.items() if "bert.pooler" not in k}
    model.load_state_dict(sd)
    print(
        f"=> loaded checkpoint '{args.resume}' (epoch {checkpoint['epoch']} @ {checkpoint['step']} steps)"
    )

    # Make inference for texts
    if args.extract_text_feats:
        print('Make inference for texts...')
        if args.text_feat_output_path is None:
            args.text_feat_output_path = "{}.txt_feat.jsonl".format(args.text_data[:-6])
        write_cnt = 0
        with open(args.text_feat_output_path, "w") as fout:
            model.eval()
            dataloader = text_data.dataloader
            with torch.no_grad():
                for batch in tqdm(dataloader):
                    text_ids, texts = batch
                    texts = texts.cuda(args.gpu, non_blocking=True)
                    text_features = model(None, texts)
                    text_features /= text_features.norm(dim=-1, keepdim=True)
                    for text_id, text_feature in zip(text_ids.tolist(), text_features.tolist()):
                        fout.write("{}\n".format(json.dumps({"text_id": text_id, "feature": text_feature})))
                        write_cnt += 1
        print('{} text features are stored in {}'.format(write_cnt, args.text_feat_output_path))

    # Make inference for images
    if args.extract_image_feats:
        print('Make inference for images...')
        if args.image_feat_output_path is None:
            # by default, we store the image features under the same directory as the text features
            args.image_feat_output_path = "{}.img_feat.jsonl".format(args.text_data.replace("_texts.jsonl", "_imgs"))
        write_cnt = 0
        with open(args.image_feat_output_path, "w") as fout:
            model.eval()
            dataloader = img_data.dataloader
            with torch.no_grad():
                for batch in tqdm(dataloader):
                    image_ids, images = batch
                    images = images.cuda(args.gpu, non_blocking=True)
                    image_features = model(images, None)
                    image_features /= image_features.norm(dim=-1, keepdim=True)
                    for image_id, image_feature in zip(image_ids.tolist(), image_features.tolist()):
                        fout.write("{}\n".format(json.dumps({"image_id": image_id, "feature": image_feature})))
                        write_cnt += 1
        print('{} image features are stored in {}'.format(write_cnt, args.image_feat_output_path))

    print("Done!")
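Each output line written above is a flat JSON object holding the id and the already L2-normalized feature vector, which is what the top-k scripts below rely on. A small sketch that reads such a file back and sanity-checks the normalization (the filename is hypothetical):

import json
import numpy as np

with open("valid_imgs.img_feat.jsonl") as fin:  # hypothetical output of --extract-image-feats
    for line in fin:
        record = json.loads(line)
        feat = np.asarray(record["feature"], dtype=np.float32)
        # Features were divided by their norm before being written, so this should be ~1.0.
        print(record["image_id"], float(np.linalg.norm(feat)))
        break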
Model/CLIP/cn_clip/eval/imagenet_zeroshot_templates.py
ADDED
@@ -0,0 +1,194 @@
1 |
+
# -*- coding: utf-8 -*-
|
2 |
+
'''
|
3 |
+
This script records the imagenet classnames and templates (both translated in Chinese)
|
4 |
+
used for zero-shot evaluation.
|
5 |
+
|
6 |
+
The original classnames and templates in English are derived from open_clip
|
7 |
+
(https://github.com/mlfoundations/open_clip/blob/main/src/training/imagenet_zeroshot_data.py)
|
8 |
+
The translated classnames and templates in Chinese are derived from wukong
|
9 |
+
(https://gitee.com/mindspore/models/tree/master/research/mm/wukong)
|
10 |
+
'''
|
11 |
+
|
12 |
+
imagenet_classnames = [ "丁鲷", "金鱼", "大白鲨", "虎鲨", "锤头鲨", "电鳐", "黄貂鱼", "公鸡", "母鸡", "鸵鸟",
|
13 |
+
"燕雀", "金翅雀", "家朱雀", "灯芯草雀", "靛蓝雀", "蓝鹀", "夜莺", "松鸦", "喜鹊", "山雀",
|
14 |
+
"河鸟", "鸢(猛禽)", "秃头鹰", "秃鹫", "大灰猫头鹰", "欧洲火蝾螈", "普通蝾螈", "水蜥", "斑点蝾螈", "蝾螈",
|
15 |
+
"牛蛙", "树蛙", "尾蛙", "红海龟", "皮革龟", "泥龟", "淡水龟", "箱龟", "带状壁虎", "普通鬣蜥",
|
16 |
+
"美国变色龙", "鞭尾蜥蜴", "飞龙科蜥蜴", "褶边蜥蜴", "鳄鱼蜥蜴", "毒蜥", "绿蜥蜴", "非洲变色龙", "科莫多蜥蜴", "非洲鳄",
|
17 |
+
"美国鳄鱼", "三角龙", "雷蛇", "环蛇", "希腊蛇", "绿蛇", "国王蛇", "袜带蛇", "水蛇", "藤蛇",
|
18 |
+
"夜蛇", "大蟒蛇", "岩石蟒蛇", "印度眼镜蛇", "绿曼巴", "海蛇", "角腹蛇", "菱纹响尾蛇", "角响尾蛇", "三叶虫",
|
19 |
+
"盲蜘蛛", "蝎子", "黑金花园蜘蛛", "谷仓蜘蛛", "花园蜘蛛", "黑寡妇蜘蛛", "狼蛛", "狼蜘蛛", "壁虱", "蜈蚣",
|
20 |
+
"黑松鸡", "松鸡", "披肩鸡", "草原鸡", "孔雀", "鹌鹑", "鹧鸪", "非洲灰鹦鹉", "金刚鹦鹉", "硫冠鹦鹉",
|
21 |
+
"短尾鹦鹉", "褐翅鸦鹃", "食蜂鸟;蜂虎", "犀鸟", "蜂鸟", "鹟䴕", "巨嘴鸟;大嘴鸟", "野鸭", "红胸秋沙鸭", "鹅",
|
22 |
+
"黑天鹅", "大象", "针鼹鼠", "鸭嘴兽", "沙袋鼠", "考拉", "袋熊", "水母", "海葵", "脑珊瑚",
|
23 |
+
"扁形虫扁虫", "线虫", "海螺", "蜗牛", "鼻涕虫", "海蛞蝓;海参", "石鳖", "鹦鹉螺", "珍宝蟹", "石蟹",
|
24 |
+
"招潮蟹", "帝王蟹", "美国龙虾", "大螯虾", "小龙虾", "寄居蟹", "等足目动物(明虾和螃蟹近亲)", "白鹳", "黑鹳", "鹭",
|
25 |
+
"火烈鸟", "小蓝鹭", "美国鹭", "麻鸦", "鹤", "秧鹤", "欧洲水鸡", "沼泽泥母鸡", "鸨", "红翻石鹬",
|
26 |
+
"红背鹬", "红脚鹬", "半蹼鹬", "蛎鹬", "鹈鹕", "国王企鹅", "信天翁", "灰鲸", "杀人鲸", "海牛",
|
27 |
+
"海狮", "吉娃娃", "日本狆犬", "马尔济斯犬", "狮子狗", "西施犬", "布莱尼姆猎犬", "巴比狗", "玩具犬", "罗得西亚长背猎狗",
|
28 |
+
"阿富汗猎犬", "巴吉度猎犬", "比格犬", "侦探犬", "蓝色快狗", "黑褐猎浣熊犬", "沃克猎犬", "英国猎狐犬", "美洲赤狗", "俄罗斯猎狼犬",
|
29 |
+
"爱尔兰猎狼犬", "意大利灰狗", "惠比特犬", "依比沙猎犬", "挪威猎犬", "奥达猎犬", "沙克犬", "苏格兰猎鹿犬", "威玛猎犬", "斯塔福德郡斗牛犬",
|
30 |
+
"美国斯塔福德郡梗", "贝德灵顿梗", "边境梗", "凯丽蓝梗", "爱尔兰梗", "诺福克梗", "诺维奇梗", "约克犬;约克夏梗犬", "刚毛猎狐梗", "莱克兰梗",
|
31 |
+
"锡利哈姆梗", "艾尔谷犬", "凯恩梗", "澳大利亚梗", "丹迪丁蒙梗", "波士顿梗", "迷你雪纳瑞犬", "巨型雪纳瑞犬", "标准雪纳瑞犬", "苏格兰梗犬",
|
32 |
+
"西藏梗", "丝毛梗", "爱尔兰软毛梗犬", "西高地白梗", "拉萨阿普索犬", "平毛寻回犬", "卷毛寻回犬", "金毛猎犬", "拉布拉多猎犬", "乞沙比克猎犬",
|
33 |
+
"德国短毛指示犬", "维兹拉犬", "英国塞特犬", "爱尔兰雪达犬", "戈登雪达犬", "布列塔尼犬猎犬", "黄毛", "英国史宾格犬", "威尔士史宾格犬", "可卡犬",
|
34 |
+
"萨塞克斯猎犬", "爱尔兰水猎犬", "哥威斯犬", "舒柏奇犬", "比利时牧羊犬", "马里努阿犬", "伯瑞犬", "凯尔皮犬", "匈牙利牧羊犬", "老英国牧羊犬",
|
35 |
+
"喜乐蒂牧羊犬", "牧羊犬", "边境牧羊犬", "法兰德斯牧牛狗", "罗特韦尔犬", "德国牧羊犬", "多伯曼犬", "鹿犬;迷你杜宾犬", "大瑞士山地犬", "伯恩山犬",
|
36 |
+
"阿策尔山犬", "恩特尔布赫山犬", "拳师狗", "斗牛獒", "藏獒", "法国斗牛犬", "大丹犬", "圣伯纳德狗", "爱斯基摩犬", "阿拉斯加雪橇犬",
|
37 |
+
"哈士奇", "达尔马提亚", "狮毛狗", "巴辛吉狗", "八哥犬", "莱昂贝格狗", "纽芬兰犬", "大白熊犬", "萨摩耶犬", "博美犬",
|
38 |
+
"松狮", "凯斯犬", "布鲁塞尔格林芬犬", "彭布洛克威尔士科基犬", "威尔士柯基犬", "玩具贵宾犬", "迷你贵宾犬", "标准贵宾犬", "墨西哥无毛犬", "灰狼",
|
39 |
+
"白狼", "红太狼", "狼", "澳洲野狗", "豺", "非洲猎犬", "鬣狗", "红狐狸", "沙狐", "北极狐狸",
|
40 |
+
"灰狐狸", "虎斑猫", "山猫", "波斯猫", "暹罗猫", "埃及猫", "美洲狮", "猞猁", "豹子", "雪豹",
|
41 |
+
"美洲虎", "狮子", "老虎", "猎豹", "棕熊", "美洲黑熊", "冰熊", "懒熊", "獴", "猫鼬",
|
42 |
+
"虎甲虫", "瓢虫", "土鳖虫", "天牛", "龟甲虫", "粪甲虫", "犀牛甲虫", "象甲", "苍蝇", "蜜蜂",
|
43 |
+
"蚂蚁", "蚱蜢", "蟋蟀", "竹节虫", "蟑螂", "螳螂", "蝉", "叶蝉", "草蜻蛉", "蜻蜓",
|
44 |
+
"豆娘", "优红蛱蝶", "小环蝴蝶", "君主蝴蝶", "菜粉蝶", "白蝴蝶", "灰蝶", "海星", "海胆", "海黄瓜;海参",
|
45 |
+
"野兔", "兔", "安哥拉兔", "仓鼠", "刺猬", "黑松鼠", "土拨鼠", "海狸", "豚鼠", "栗色马",
|
46 |
+
"斑马", "猪", "野猪", "疣猪", "河马", "牛", "水牛", "野牛", "公羊", "大角羊",
|
47 |
+
"山羊", "狷羚", "黑斑羚", "瞪羚", "阿拉伯单峰骆驼", "骆驼", "黄鼠狼", "水貂", "臭猫", "黑足鼬",
|
48 |
+
"水獭", "臭鼬", "獾", "犰狳", "树懒", "猩猩", "大猩猩", "黑猩猩", "长臂猿", "合趾猿长臂猿",
|
49 |
+
"长尾猴", "赤猴", "狒狒", "恒河猴", "白头叶猴", "疣猴", "长鼻猴", "狨(美洲产小型长尾猴)", "卷尾猴", "吼猴",
|
50 |
+
"伶猴", "蜘蛛猴", "松鼠猴", "马达加斯加环尾狐猴", "大狐猴", "印度大象", "非洲象", "小熊猫", "大熊猫", "杖鱼",
|
51 |
+
"鳗鱼", "银鲑", "三色刺蝶鱼", "海葵鱼", "鲟鱼", "雀鳝", "狮子鱼", "河豚", "算盘", "长袍",
|
52 |
+
"学位袍", "手风琴", "原声吉他", "航空母舰", "客机", "飞艇", "祭坛", "救护车", "水陆两用车", "模拟时钟",
|
53 |
+
"蜂房", "围裙", "垃圾桶", "攻击步枪", "背包", "面包店", "平衡木", "热气球", "圆珠笔", "创可贴",
|
54 |
+
"班卓琴", "栏杆", "杠铃", "理发师的椅子", "理发店", "牲口棚", "晴雨表", "圆筒", "园地小车", "棒球",
|
55 |
+
"篮球", "婴儿床", "巴松管", "游泳帽", "沐浴毛巾", "浴缸", "沙滩车", "灯塔", "烧杯", "熊皮高帽",
|
56 |
+
"啤酒瓶", "啤酒杯", "钟塔", "(小儿用的)围嘴", "串联自行车", "比基尼", "装订册", "双筒望远镜", "鸟舍", "船库",
|
57 |
+
"双人雪橇", "饰扣式领带", "阔边女帽", "书橱", "书店", "瓶盖", "弓箭", "蝴蝶结领结", "铜制牌位", "奶罩",
|
58 |
+
"防波堤", "铠甲", "扫帚", "桶", "扣环", "防弹背心", "动车", "肉铺", "出租车", "大锅",
|
59 |
+
"蜡烛", "大炮", "独木舟", "开瓶器", "开衫", "车镜", "旋转木马", "木匠的工具包", "纸箱", "车轮",
|
60 |
+
"取款机", "盒式录音带", "卡带播放器", "城堡", "双体船", "CD播放器", "大提琴", "移动电话", "铁链", "围栏",
|
61 |
+
"链甲", "电锯", "箱子", "梳妆台", "编钟", "中国橱柜", "圣诞袜", "教堂", "电影院", "切肉刀",
|
62 |
+
"悬崖屋", "斗篷", "木屐", "鸡尾酒调酒器", "咖啡杯", "咖啡壶", "螺旋结构(楼梯)", "组合锁", "电脑键盘", "糖果",
|
63 |
+
"集装箱船", "敞篷车", "瓶塞钻", "短号", "牛仔靴", "牛仔帽", "摇篮", "起重机", "头盔", "板条箱",
|
64 |
+
"小儿床", "砂锅", "槌球", "拐杖", "胸甲", "大坝", "书桌", "台式电脑", "有线电话", "尿布湿",
|
65 |
+
"数字时钟", "数字手表", "餐桌板", "抹布", "洗碗机", "盘式制动器", "码头", "狗拉雪橇", "圆顶", "门垫",
|
66 |
+
"钻井平台", "鼓", "鼓槌", "哑铃", "荷兰烤箱", "电风扇", "电吉他", "电力机车", "组合电视柜", "信封",
|
67 |
+
"浓缩咖啡机", "扑面粉", "女用长围巾", "文件", "消防船", "消防车", "火炉栏", "旗杆", "长笛", "折叠椅",
|
68 |
+
"橄榄球头盔", "叉车", "喷泉", "钢笔", "有四根帷柱的床", "运货车厢", "圆号", "煎锅", "裘皮大衣", "垃圾车",
|
69 |
+
"防毒面具", "汽油泵", "高脚杯", "卡丁车", "高尔夫球", "高尔夫球车", "狭长小船", "锣", "礼服", "钢琴",
|
70 |
+
"温室", "散热器格栅", "杂货店", "断头台", "小发夹", "头发喷雾", "半履带装甲车", "锤子", "大篮子", "手摇鼓风机",
|
71 |
+
"手提电脑", "手帕", "硬盘", "口琴", "竖琴", "收割机", "斧头", "手枪皮套", "家庭影院", "蜂窝",
|
72 |
+
"钩爪", "衬裙", "单杠", "马车", "沙漏", "iPod", "熨斗", "南瓜灯笼", "牛仔裤", "吉普车",
|
73 |
+
"T恤衫", "拼图", "人力车", "操纵杆", "和服", "护膝", "蝴蝶结", "大褂", "长柄勺", "灯罩",
|
74 |
+
"笔记本电脑", "割草机", "镜头盖", "开信刀;拆信刀", "图书馆", "救生艇", "点火器", "豪华轿车", "远洋班轮", "唇膏",
|
75 |
+
"平底便鞋", "洗剂", "扬声器", "放大镜", "锯木厂", "磁罗盘", "邮袋", "信箱", "女游泳衣", "有肩带浴衣",
|
76 |
+
"窨井盖", "沙球(一种打击乐器)", "马林巴木琴", "面膜", "火柴", "花柱", "迷宫", "量杯", "药箱", "巨石",
|
77 |
+
"麦克风", "微波炉", "军装", "奶桶", "迷你巴士", "迷你裙", "面包车;小型货车", "导弹", "连指手套", "搅拌钵",
|
78 |
+
"活动房屋(由汽车拖拉的)", "福特T型车", "调制解调器;光猫", "修道院", "显示器", "电瓶车", "砂浆", "学士", "清真寺", "蚊帐",
|
79 |
+
"摩托车", "山地自行车", "登山帐", "鼠标", "捕鼠器", "搬家货车", "动物的口套", "金属钉子", "颈托", "项链",
|
80 |
+
"乳头(瓶)", "平板电脑", "方尖碑", "双簧管", "小鹅笛;球形笛(管身椭圆形)", "里程表", "滤油器", "风琴", "示波器", "罩裙",
|
81 |
+
"牛车", "氧气面罩", "包装", "船桨", "明轮", "挂锁", "画笔", "睡衣", "宫殿", "排箫",
|
82 |
+
"纸巾", "降落伞", "双杠", "公园长椅", "停车收费表", "客车", "露台", "付费电话", "基座", "铅笔盒",
|
83 |
+
"卷笔刀", "香水(瓶)", "培养皿", "复印机", "拨弦片", "尖顶头盔", "用尖板条连成的尖桩篱栅", "皮卡", "桥墩", "存钱罐",
|
84 |
+
"药瓶", "枕头", "乒乓球", "风车", "海盗船", "水罐", "木工刨", "天文馆", "塑料袋", "板架",
|
85 |
+
"犁型铲雪机", "手压皮碗泵", "宝丽来相机", "电线杆", "警车", "雨披", "台球桌", "充气饮料瓶", "花盆", "陶工旋盘",
|
86 |
+
"电钻", "祈祷垫", "打印机", "监狱", "炮弹", "投影仪", "冰球", "沙包", "小钱袋;手袋", "羽管笔",
|
87 |
+
"被子", "赛车", "球拍", "散热器", "收音机", "射电望远镜", "雨桶", "休闲车", "卷轴", "反射式照相机",
|
88 |
+
"冰箱", "遥控器", "餐厅", "左轮手枪", "步枪", "摇椅", "电转烤肉架", "橡皮", "橄榄球", "直尺",
|
89 |
+
"跑步鞋", "保险柜", "安全别针", "盐瓶(调味用)", "凉鞋", "纱笼", "萨克斯管", "剑鞘", "秤", "校车",
|
90 |
+
"帆船", "记分牌", "屏幕", "螺丝", "螺丝刀", "安全带", "缝纫机", "盾牌", "皮鞋店", "障子",
|
91 |
+
"购物篮", "购物车", "铁锹", "浴帽", "浴帘", "滑雪板", "滑雪面罩", "睡袋", "滑尺", "滑动门",
|
92 |
+
"角子老虎机", "潜水通气管", "摩托雪橇;雪地机动车", "扫雪机", "皂液器", "足球", "袜子", "碟式太阳能", "宽边帽", "汤碗",
|
93 |
+
"空格键", "空间加热器", "航天飞机", "锅铲;做饭的铲子", "快艇", "蜘蛛网", "纺锤;手纺用的绕线杆", "跑车", "聚光灯", "舞台",
|
94 |
+
"蒸汽机车", "钢拱桥", "钢滚筒", "听诊器", "女用披肩", "石头墙", "秒表", "火炉", "过滤器", "有轨电车",
|
95 |
+
"担架", "沙发床", "佛塔", "潜艇", "套装", "日晷", "太阳镜", "太阳镜", "防晒霜", "悬索桥",
|
96 |
+
"拖把", "运动衫", "游泳裤", "秋千", "开关", "注射器;吸管", "台灯", "坦克", "录音机", "茶壶",
|
97 |
+
"泰迪", "电视", "网球;打网球的球", "茅草", "幕布", "顶针", "打谷机;脱粒机", "宝座", "瓦屋顶", "烤面包机",
|
98 |
+
"烟草店", "马桶", "火炬", "图腾柱", "拖车;牵引车", "玩具店", "拖拉机", "半挂汽车", "托盘", "风衣",
|
99 |
+
"三轮车", "三体船", "三脚架", "凯旋门", "无轨电车", "长号", "浴盆", "旋转式栅门", "打字机键盘", "伞",
|
100 |
+
"独轮车", "直立式钢琴", "吸尘器", "花瓶;装饰瓶", "拱顶", "天鹅绒", "自动售货机", "法衣;祭衣;祭服", "高架桥", "小提琴",
|
101 |
+
"排球", "松饼机", "挂钟", "钱包;钱夹", "衣柜衣橱", "军用飞机", "洗脸盆", "洗衣机", "水瓶", "水壶",
|
102 |
+
"水塔", "威士忌壶", "哨子", "假发", "纱窗", "百叶窗", "温莎领带", "葡萄酒瓶", "飞机翅膀", "炒菜锅",
|
103 |
+
"木勺子;木头勺子", "毛织品", "原木栅栏", "沉船", "双桅船", "蒙古包", "网站;网页", "漫画", "纵横字谜", "路标",
|
104 |
+
"交通信号灯", "防尘罩", "菜单", "盘子", "墨西哥鳄梨酱;墨西哥牛油果酱", "清炖肉汤", "火锅", "乳脂蛋糕;英国甜点", "冰淇淋", "冰棍;雪糕",
|
105 |
+
"法式面包", "百吉饼", "椒盐脆饼", "芝士汉堡", "热狗", "土豆泥", "结球甘蓝", "西兰花;绿菜花", "菜花;花椰菜", "西葫芦",
|
106 |
+
"金丝瓜;意面南瓜;面条瓜", "绿色小南瓜;青南瓜", "南瓜", "黄瓜", "洋蓟;球蓟", "甜椒", "刺棘蓟", "蘑菇", "绿苹果", "草莓",
|
107 |
+
"橘子", "柠檬", "无花果", "菠萝", "香蕉", "菠萝蜜", "番荔枝", "石榴", "干草", "培根蛋酱意大利面",
|
108 |
+
"巧克力酱", "生面;面团", "瑞士肉包", "披萨", "馅饼", "卷饼", "红葡萄酒", "意式浓缩咖啡", "杯子", "蛋酒",
|
109 |
+
"高山", "泡泡", "悬崖", "珊瑚礁", "间歇泉;间断喷发的温泉", "湖边", "岬角;深入海中的狭长高地", "沙洲", "沙滩", "峡谷",
|
110 |
+
"火山", "棒球运动员", "新郎", "潜水员", "油菜", "雏菊", "黄色杓兰", "玉米", "橡子", "玫瑰果",
|
111 |
+
"七叶树果实", "珊瑚菌", "木耳", "鹿花菌", "臭角菇", "地星", "多叶奇果菌", "牛肝菌", "玉米棒子", "卫生纸"]
|
112 |
+
|
113 |
+
openai_imagenet_template = [
|
114 |
+
lambda c: f'{c}的照片。',
|
115 |
+
lambda c: f'质量差的{c}的照片。',
|
116 |
+
lambda c: f'许多{c}的照片。',
|
117 |
+
lambda c: f'{c}的雕塑。',
|
118 |
+
lambda c: f'难以看到{c}的照片。',
|
119 |
+
lambda c: f'{c}的低分辨率照片。',
|
120 |
+
lambda c: f'{c}的渲染。',
|
121 |
+
lambda c: f'涂鸦{c}。',
|
122 |
+
lambda c: f'{c}的糟糕照片。',
|
123 |
+
lambda c: f'{c}的裁剪照片。',
|
124 |
+
lambda c: f'{c}的纹身。',
|
125 |
+
lambda c: f'{c}的刺绣照片。',
|
126 |
+
lambda c: f'很难看到{c}的照片。',
|
127 |
+
lambda c: f'{c}的明亮照片。',
|
128 |
+
lambda c: f'一张干净的{c}的照片。',
|
129 |
+
lambda c: f'一张包含{c}的照片。',
|
130 |
+
lambda c: f'{c}的深色照片。',
|
131 |
+
lambda c: f'{c}的手绘画。',
|
132 |
+
lambda c: f'���的{c}的照片。',
|
133 |
+
lambda c: f'不自然的{c}的照片。',
|
134 |
+
lambda c: f'一张酷的{c}的照片。',
|
135 |
+
lambda c: f'{c}的特写照片。',
|
136 |
+
lambda c: f'{c}的黑白照片。',
|
137 |
+
lambda c: f'一幅{c}的画。',
|
138 |
+
lambda c: f'一幅{c}的绘画。',
|
139 |
+
lambda c: f'一张{c}的像素照片。',
|
140 |
+
lambda c: f'{c}的雕像。',
|
141 |
+
lambda c: f'一张{c}的明亮照片。',
|
142 |
+
lambda c: f'{c}的裁剪照片。',
|
143 |
+
lambda c: f'人造的{c}的照片。',
|
144 |
+
lambda c: f'一张关于{c}的照片。',
|
145 |
+
lambda c: f'损坏的{c}的jpeg照片。',
|
146 |
+
lambda c: f'{c}的模糊照片。',
|
147 |
+
lambda c: f'{c}的相片。',
|
148 |
+
lambda c: f'一张{c}的好照片。',
|
149 |
+
lambda c: f'{c}的渲染照。',
|
150 |
+
lambda c: f'视频游戏中的{c}。',
|
151 |
+
lambda c: f'一张{c}的照片。',
|
152 |
+
lambda c: f'{c}的涂鸦。',
|
153 |
+
lambda c: f'{c}的近距离照片。',
|
154 |
+
lambda c: f'{c}的折纸。',
|
155 |
+
lambda c: f'{c}在视频游戏中。',
|
156 |
+
lambda c: f'{c}的草图。',
|
157 |
+
lambda c: f'{c}的涂鸦照。',
|
158 |
+
lambda c: f'{c}的折纸形状。',
|
159 |
+
lambda c: f'低分辨率的{c}的照片。',
|
160 |
+
lambda c: f'玩具{c}。',
|
161 |
+
lambda c: f'{c}的副本。',
|
162 |
+
lambda c: f'{c}的干净的照片。',
|
163 |
+
lambda c: f'一张大{c}的照片。',
|
164 |
+
lambda c: f'{c}的重现。',
|
165 |
+
lambda c: f'一张漂亮的{c}的照片。',
|
166 |
+
lambda c: f'一张奇怪的{c}的照片。',
|
167 |
+
lambda c: f'模糊的{c}的照片。',
|
168 |
+
lambda c: f'卡通{c}。',
|
169 |
+
lambda c: f'{c}的艺术作品。',
|
170 |
+
lambda c: f'{c}的素描。',
|
171 |
+
lambda c: f'刺绣{c}。',
|
172 |
+
lambda c: f'{c}的像素照。',
|
173 |
+
lambda c: f'{c}的拍照。',
|
174 |
+
lambda c: f'{c}的损坏的照片。',
|
175 |
+
lambda c: f'高质量的{c}的照片。',
|
176 |
+
lambda c: f'毛绒玩具{c}。',
|
177 |
+
lambda c: f'漂亮的{c}的照片。',
|
178 |
+
lambda c: f'小{c}的照片。',
|
179 |
+
lambda c: f'照片是奇怪的{c}。',
|
180 |
+
lambda c: f'漫画{c}。',
|
181 |
+
lambda c: f'{c}的艺术照。',
|
182 |
+
lambda c: f'{c}的图形。',
|
183 |
+
lambda c: f'大{c}的照片。',
|
184 |
+
lambda c: f'黑白的{c}的照片。',
|
185 |
+
lambda c: f'{c}毛绒玩具。',
|
186 |
+
lambda c: f'一张{c}的深色照片。',
|
187 |
+
lambda c: f'{c}的摄影图。',
|
188 |
+
lambda c: f'{c}的涂鸦照。',
|
189 |
+
lambda c: f'玩具形状的{c}。',
|
190 |
+
lambda c: f'拍了{c}的照片。',
|
191 |
+
lambda c: f'酷酷的{c}的照片。',
|
192 |
+
lambda c: f'照片里的小{c}。',
|
193 |
+
lambda c: f'{c}的刺青。',
|
194 |
+
]
|
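The classnames and prompt templates above are meant to be combined at evaluation time: every class gets one prompt per template, the prompts are tokenized and encoded with the text tower, and the per-class embeddings are averaged into a zero-shot classifier. A minimal sketch of the prompt-building step; the commented encode call is only indicative, following the model(None, texts) convention used elsewhere in this diff:

import torch
from cn_clip.clip.utils import tokenize
from cn_clip.eval.imagenet_zeroshot_templates import imagenet_classnames, openai_imagenet_template

classname = imagenet_classnames[0]                  # "丁鲷"
prompts = [template(classname) for template in openai_imagenet_template]
print(prompts[:3])                                  # e.g. ['丁鲷的照片。', '质量差的丁鲷的照片。', ...]

tokens = tokenize(prompts, context_length=32)       # [num_templates, 32]
# with torch.no_grad():
#     class_embedding = model(None, tokens.cuda()).mean(dim=0)  # average over templates (model from load_from_name)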
Model/CLIP/cn_clip/eval/make_topk_predictions.py
ADDED
@@ -0,0 +1,88 @@
# -*- coding: utf-8 -*-
'''
This script performs kNN search on inferenced image and text features (on a single GPU) and outputs a text-to-image prediction file for evaluation.
'''

import argparse
import numpy
from tqdm import tqdm
import json

import numpy as np
import torch

def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--image-feats',
        type=str,
        required=True,
        help="Specify the path of image features."
    )
    parser.add_argument(
        '--text-feats',
        type=str,
        required=True,
        help="Specify the path of text features."
    )
    parser.add_argument(
        '--top-k',
        type=int,
        default=10,
        help="Specify the k value of top-k predictions."
    )
    parser.add_argument(
        '--eval-batch-size',
        type=int,
        default=32768,
        help="Specify the image-side batch size when computing the inner products, defaults to 32768"
    )
    parser.add_argument(
        '--output',
        type=str,
        required=True,
        help="Specify the output jsonl prediction filepath."
    )
    return parser.parse_args()

if __name__ == "__main__":
    args = parse_args()

    # Log params.
    print("Params:")
    for name in sorted(vars(args)):
        val = getattr(args, name)
        print(f"  {name}: {val}")

    print("Begin to load image features...")
    image_ids = []
    image_feats = []
    with open(args.image_feats, "r") as fin:
        for line in tqdm(fin):
            obj = json.loads(line.strip())
            image_ids.append(obj['image_id'])
            image_feats.append(obj['feature'])
    image_feats_array = np.array(image_feats, dtype=np.float32)
    print("Finished loading image features.")

    print("Begin to compute top-{} predictions for texts...".format(args.top_k))
    with open(args.output, "w") as fout:
        with open(args.text_feats, "r") as fin:
            for line in tqdm(fin):
                obj = json.loads(line.strip())
                text_id = obj['text_id']
                text_feat = obj['feature']
                score_tuples = []
                text_feat_tensor = torch.tensor([text_feat], dtype=torch.float).cuda()  # [1, feature_dim]
                idx = 0
                while idx < len(image_ids):
                    img_feats_tensor = torch.from_numpy(image_feats_array[idx : min(idx + args.eval_batch_size, len(image_ids))]).cuda()  # [batch_size, feature_dim]
                    batch_scores = text_feat_tensor @ img_feats_tensor.t()  # [1, batch_size]
                    for image_id, score in zip(image_ids[idx : min(idx + args.eval_batch_size, len(image_ids))], batch_scores.squeeze(0).tolist()):
                        score_tuples.append((image_id, score))
                    idx += args.eval_batch_size
                top_k_predictions = sorted(score_tuples, key=lambda x: x[1], reverse=True)[:args.top_k]
                fout.write("{}\n".format(json.dumps({"text_id": text_id, "image_ids": [entry[0] for entry in top_k_predictions]})))

    print("Top-{} predictions are saved in {}".format(args.top_k, args.output))
    print("Done!")
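Because both feature sets are unit-normalized, the batched matrix product above is just cosine similarity, and the sort gives the top-k nearest images for each text. The same ranking can be written in a few lines of plain torch on dummy data (no CUDA required):

import torch

torch.manual_seed(0)
text_feat = torch.nn.functional.normalize(torch.randn(1, 512), dim=-1)       # one query text
image_feats = torch.nn.functional.normalize(torch.randn(1000, 512), dim=-1)  # gallery of 1000 images

scores = text_feat @ image_feats.t()   # [1, 1000] cosine similarities
topk = scores.squeeze(0).topk(k=10)
print(topk.indices.tolist())           # indices of the 10 best-matching images
print(topk.values.tolist())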
Model/CLIP/cn_clip/eval/make_topk_predictions_tr.py
ADDED
@@ -0,0 +1,88 @@
1 |
+
# -*- coding: utf-8 -*-
|
2 |
+
'''
|
3 |
+
This scripts performs kNN search on inferenced image and text features (on single-GPU) and outputs image-to-text retrieval prediction file for evaluation.
|
4 |
+
'''
|
5 |
+
|
6 |
+
import argparse
|
7 |
+
import numpy
|
8 |
+
from tqdm import tqdm
|
9 |
+
import json
|
10 |
+
|
11 |
+
import numpy as np
|
12 |
+
import torch
|
13 |
+
|
14 |
+
def parse_args():
|
15 |
+
parser = argparse.ArgumentParser()
|
16 |
+
parser.add_argument(
|
17 |
+
'--image-feats',
|
18 |
+
type=str,
|
19 |
+
required=True,
|
20 |
+
help="Specify the path of image features."
|
21 |
+
)
|
22 |
+
parser.add_argument(
|
23 |
+
'--text-feats',
|
24 |
+
type=str,
|
25 |
+
required=True,
|
26 |
+
help="Specify the path of text features."
|
27 |
+
)
|
28 |
+
parser.add_argument(
|
29 |
+
'--top-k',
|
30 |
+
type=int,
|
31 |
+
default=10,
|
32 |
+
help="Specify the k value of top-k predictions."
|
33 |
+
)
|
34 |
+
parser.add_argument(
|
35 |
+
'--eval-batch-size',
|
36 |
+
type=int,
|
37 |
+
default=32768,
|
38 |
+
help="Specify the image-side batch size when computing the inner products, default to 8192"
|
39 |
+
)
|
40 |
+
parser.add_argument(
|
41 |
+
'--output',
|
42 |
+
type=str,
|
43 |
+
required=True,
|
44 |
+
help="Specify the output jsonl prediction filepath."
|
45 |
+
)
|
46 |
+
return parser.parse_args()
|
47 |
+
|
48 |
+
if __name__ == "__main__":
|
49 |
+
args = parse_args()
|
50 |
+
|
51 |
+
# Log params.
|
52 |
+
print("Params:")
|
53 |
+
for name in sorted(vars(args)):
|
54 |
+
val = getattr(args, name)
|
55 |
+
print(f" {name}: {val}")
|
56 |
+
|
57 |
+
print("Begin to load text features...")
|
58 |
+
text_ids = []
|
59 |
+
text_feats = []
|
60 |
+
with open(args.text_feats, "r") as fin:
|
61 |
+
for line in tqdm(fin):
|
62 |
+
obj = json.loads(line.strip())
|
63 |
+
text_ids.append(obj['text_id'])
|
64 |
+
text_feats.append(obj['feature'])
|
65 |
+
text_feats_array = np.array(text_feats, dtype=np.float32)
|
66 |
+
print("Finished loading text features.")
|
67 |
+
|
68 |
+
print("Begin to compute top-{} predictions for images...".format(args.top_k))
|
69 |
+
with open(args.output, "w") as fout:
|
70 |
+
with open(args.image_feats, "r") as fin:
|
71 |
+
for line in tqdm(fin):
|
72 |
+
obj = json.loads(line.strip())
|
73 |
+
image_id = obj['image_id']
|
74 |
+
image_feat = obj['feature']
|
75 |
+
score_tuples = []
|
76 |
+
image_feat_tensor = torch.tensor([image_feat], dtype=torch.float).cuda() # [1, feature_dim]
|
77 |
+
idx = 0
|
78 |
+
while idx < len(text_ids):
|
79 |
+
text_feats_tensor = torch.from_numpy(text_feats_array[idx : min(idx + args.eval_batch_size, len(text_ids))]).cuda() # [batch_size, feature_dim]
|
80 |
+
batch_scores = image_feat_tensor @ text_feats_tensor.t() # [1, batch_size]
|
81 |
+
for text_id, score in zip(text_ids[idx : min(idx + args.eval_batch_size, len(text_ids))], batch_scores.squeeze(0).tolist()):
|
82 |
+
score_tuples.append((text_id, score))
|
83 |
+
idx += args.eval_batch_size
|
84 |
+
top_k_predictions = sorted(score_tuples, key=lambda x:x[1], reverse=True)[:args.top_k]
|
85 |
+
fout.write("{}\n".format(json.dumps({"image_id": image_id, "text_ids": [entry[0] for entry in top_k_predictions]})))
|
86 |
+
|
87 |
+
print("Top-{} predictions are saved in {}".format(args.top_k, args.output))
|
88 |
+
print("Done!")
|
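
The core of this script is the batched inner-product scoring between one image feature and all text features. A self-contained sketch of the same ranking step on random CPU tensors (hypothetical sizes, no GPU needed); torch.topk yields the same ranking as the script's sorted(...) call:

import torch

# Hypothetical sizes: one query image embedding against 10 candidate text embeddings of dim 4.
image_feat = torch.randn(1, 4)
text_feats = torch.randn(10, 4)

scores = image_feat @ text_feats.t()          # [1, 10] inner products, as in the loop above
top_k = torch.topk(scores.squeeze(0), k=3)    # rank candidates, keep the 3 best
print(top_k.indices.tolist())                 # candidate indices, analogous to the emitted text_ids
print(top_k.values.tolist())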
Model/CLIP/cn_clip/eval/transform_ir_annotation_to_tr.py
ADDED
@@ -0,0 +1,36 @@
# -*- coding: utf-8 -*-
from tqdm import tqdm
import argparse
import json

def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--input',
        type=str,
        required=True,
        help="Input path of text-to-image Jsonl annotation file."
    )
    return parser.parse_args()

if __name__ == "__main__":
    args = parse_args()

    t2i_record = dict()

    with open(args.input, "r") as fin:
        for line in tqdm(fin):
            obj = json.loads(line.strip())
            text_id = obj['text_id']
            image_ids = obj['image_ids']
            for image_id in image_ids:
                if image_id not in t2i_record:
                    t2i_record[image_id] = []
                t2i_record[image_id].append(text_id)

    with open(args.input.replace(".jsonl", "") + ".tr.jsonl", "w") as fout:
        for image_id, text_ids in t2i_record.items():
            out_obj = {"image_id": image_id, "text_ids": text_ids}
            fout.write("{}\n".format(json.dumps(out_obj)))

    print("Done!")
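
As a toy illustration of the inversion this script performs (fabricated ids), two text-to-image annotation lines are regrouped into per-image lists of text ids:

import json
from collections import defaultdict

# Fabricated text-to-image annotations in the input jsonl format.
t2i_lines = [
    {"text_id": 1, "image_ids": [10, 11]},
    {"text_id": 2, "image_ids": [11]},
]

# Invert to image-to-text, mirroring the script's t2i_record logic.
i2t = defaultdict(list)
for obj in t2i_lines:
    for image_id in obj["image_ids"]:
        i2t[image_id].append(obj["text_id"])

for image_id, text_ids in i2t.items():
    print(json.dumps({"image_id": image_id, "text_ids": text_ids}))
# -> {"image_id": 10, "text_ids": [1]}
#    {"image_id": 11, "text_ids": [1, 2]}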
Model/CLIP/cn_clip/eval/zeroshot_evaluation.py
ADDED
@@ -0,0 +1,189 @@
# -*- coding: utf-8 -*-
'''
This script performs zero-shot evaluation on ImageNet-1K (on a single GPU).
'''

import os
import argparse
from pathlib import Path
import json
from tqdm import tqdm

import torch

from cn_clip.clip.model import convert_weights, CLIP
from cn_clip.clip import tokenize
from cn_clip.training.main import convert_models_to_fp32
from cn_clip.clip.utils import image_transform
from cn_clip.eval.data import get_imagenet_dataset, _preprocess_text
from cn_clip.eval.imagenet_zeroshot_templates import imagenet_classnames, openai_imagenet_template


def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--vision-model",
        choices=["ViT-B-32", "ViT-B-16", "ViT-L-14", "ViT-L-14-336", "ViT-H-14", "RN50"],
        default="ViT-B-16",
        help="Name of the vision backbone to use.",
    )
    parser.add_argument(
        "--text-model",
        choices=["RoBERTa-wwm-ext-base-chinese", "RoBERTa-wwm-ext-large-chinese", "RBT3-chinese"],
        default="RoBERTa-wwm-ext-base-chinese",
        help="Name of the text backbone to use.",
    )
    parser.add_argument(
        "--precision",
        choices=["amp", "fp16", "fp32"],
        default="amp",
        help="Floating point precision."
    )
    parser.add_argument(
        "--imagenet-val",
        type=str,
        required=True,
        help="Path to imagenet val set for conducting zero-shot evaluation.",
    )
    parser.add_argument(
        "--img-batch-size", type=int, default=64, help="Image batch size."
    )
    parser.add_argument(
        "--context-length",
        type=int,
        default=32,
        help="The maximum length of input text (including [CLS] & [SEP] tokens)."
    )
    parser.add_argument(
        "--resume",
        default=None,
        type=str,
        help="path to latest checkpoint (default: none)",
    )
    parser.add_argument(
        "--num-workers", type=int, default=4, help="Number of workers for ImageNet dataset."
    )
    args = parser.parse_args()

    return args


def zero_shot_classifier(model, classnames, templates, args):
    with torch.no_grad():
        zeroshot_weights = []
        for classname in tqdm(classnames):
            texts = [_preprocess_text(template(classname)) for template in templates]  # format with class
            texts = tokenize(texts, context_length=args.context_length).to(args.gpu)  # tokenize
            class_embeddings = model(None, texts)
            class_embeddings /= class_embeddings.norm(dim=-1, keepdim=True)
            class_embedding = class_embeddings.mean(dim=0)
            class_embedding /= class_embedding.norm()
            zeroshot_weights.append(class_embedding)
        zeroshot_weights = torch.stack(zeroshot_weights, dim=1).to(args.gpu)
    return zeroshot_weights


def accuracy(output, target, topk=(1,)):
    pred = output.topk(max(topk), 1, True, True)[1].t()
    correct = pred.eq(target.view(1, -1).expand_as(pred))
    return [float(correct[:k].reshape(-1).float().sum(0, keepdim=True).cpu().numpy()) for k in topk]


def run(model, classifier, dataloader, args):
    with torch.no_grad():
        top1, top5, n = 0., 0., 0.
        for images, target in tqdm(dataloader):
            images = images.to(args.gpu)
            target = target.to(args.gpu)

            # predict
            image_features = model(images, None)
            image_features /= image_features.norm(dim=-1, keepdim=True)
            logits = 100. * image_features @ classifier

            # measure accuracy
            acc1, acc5 = accuracy(logits, target, topk=(1, 5))
            top1 += acc1
            top5 += acc5
            n += images.size(0)

        top1 = (top1 / n)
        top5 = (top5 / n)
    return top1, top5


if __name__ == "__main__":
    args = parse_args()

    # Log params.
    print("Params:")
    for name in sorted(vars(args)):
        val = getattr(args, name)
        print(f"  {name}: {val}")

    args.gpu = 0
    torch.cuda.set_device(args.gpu)

    # Initialize the model.
    vision_model_config_file = Path(__file__).parent.parent / f"clip/model_configs/{args.vision_model.replace('/', '-')}.json"
    print('Loading vision model config from', vision_model_config_file)
    assert os.path.exists(vision_model_config_file)

    text_model_config_file = Path(__file__).parent.parent / f"clip/model_configs/{args.text_model.replace('/', '-')}.json"
    print('Loading text model config from', text_model_config_file)
    assert os.path.exists(text_model_config_file)

    with open(vision_model_config_file, 'r') as fv, open(text_model_config_file, 'r') as ft:
        model_info = json.load(fv)
        if isinstance(model_info['vision_layers'], str):
            model_info['vision_layers'] = eval(model_info['vision_layers'])
        for k, v in json.load(ft).items():
            model_info[k] = v

    model = CLIP(**model_info)
    convert_weights(model)

    # See https://discuss.pytorch.org/t/valueerror-attemting-to-unscale-fp16-gradients/81372
    if args.precision == "amp" or args.precision == "fp32":
        convert_models_to_fp32(model)
    model.cuda(args.gpu)
    if args.precision == "fp16":
        convert_weights(model)

    # Get imagenet eval data.
    print("Preparing imagenet val dataset.")
    data = {}
    data["imagenet-val"] = get_imagenet_dataset(args, image_transform(model_info['image_resolution']), "val")

    # Resume from a checkpoint.
    print("Begin to load model checkpoint from {}.".format(args.resume))
    assert os.path.exists(args.resume), "The checkpoint file {} does not exist!".format(args.resume)
    # Map model to be loaded to specified single gpu.
    loc = "cuda:{}".format(args.gpu)
    checkpoint = torch.load(args.resume, map_location='cpu')
    start_epoch = checkpoint["epoch"]
    sd = checkpoint["state_dict"]
    if next(iter(sd.items()))[0].startswith('module'):
        sd = {k[len('module.'):]: v for k, v in sd.items() if "bert.pooler" not in k}
    model.load_state_dict(sd)
    print(
        f"=> loaded checkpoint '{args.resume}' (epoch {checkpoint['epoch']} @ {checkpoint['step']} steps)"
    )

    # Compute ensembled class embeddings
    print('Building zero-shot classifier')

    model.eval()

    classifier = zero_shot_classifier(model, imagenet_classnames, openai_imagenet_template, args)

    # Make inference and evaluation
    print('Using classifier')
    results = {}
    top1, top5 = run(model, classifier, data['imagenet-val'].dataloader, args)
    results['imagenet-zeroshot-val-top1'] = top1
    results['imagenet-zeroshot-val-top5'] = top5

    print('Result:')
    print(", ".join(["{}: {}".format(k, v) for k, v in results.items()]))
    print('Finished.')
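
To make the accuracy() helper above concrete, here is a tiny worked example with fabricated logits for two samples over four classes; the helper returns top-k hit counts, which run() later divides by the number of samples n:

import torch

def accuracy(output, target, topk=(1,)):
    # Same logic as the helper above: count how often the target class appears among the top-k logits.
    pred = output.topk(max(topk), 1, True, True)[1].t()
    correct = pred.eq(target.view(1, -1).expand_as(pred))
    return [float(correct[:k].reshape(-1).float().sum(0, keepdim=True).cpu().numpy()) for k in topk]

# Fabricated logits: sample 0 ranks class 2 first (its true class); sample 1 ranks class 0 first (true class is 3).
logits = torch.tensor([[0.1, 0.2, 0.9, 0.0],
                       [0.8, 0.1, 0.05, 0.05]])
target = torch.tensor([2, 3])

acc1, acc2 = accuracy(logits, target, topk=(1, 2))
print(acc1, acc2)  # 1.0 1.0 -> one of the two samples is a hit at top-1, and still only one within top-2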
Model/CLIP/cn_clip/preprocess/__init__.py
ADDED
File without changes