Geonmo committed on
Commit
cacafc1
•
1 Parent(s): 3f0fd05

initial commit

Browse files
Files changed (12)
  1. README.md +1 -1
  2. app.py +220 -0
  3. data_utils.py +67 -0
  4. encode_with_pseudo_tokens.py +54 -0
  5. eval_templates.py +70 -0
  6. generate_test_submission.py +363 -0
  7. loader.py +632 -0
  8. models.py +192 -0
  9. requirements.txt +8 -0
  10. train_phi.py +317 -0
  11. utils.py +182 -0
  12. validate.py +650 -0
README.md CHANGED
@@ -1,6 +1,6 @@
1
  ---
2
  title: LinCIR
3
- emoji: 🐨
4
  colorFrom: purple
5
  colorTo: yellow
6
  sdk: gradio
 
1
  ---
2
  title: LinCIR
3
+ emoji: 📚
4
  colorFrom: purple
5
  colorTo: yellow
6
  sdk: gradio
app.py ADDED
@@ -0,0 +1,220 @@
1
+ '''
2
+ LinCIR
3
+ Copyright (c) 2023-present NAVER Corp.
4
+ CC BY-NC-4.0 (https://creativecommons.org/licenses/by-nc/4.0/)
5
+ '''
6
+ import os
7
+ import time
8
+ from argparse import ArgumentParser
9
+
10
+ import numpy as np
11
+ import torch
12
+ import gradio as gr
13
+ from clip_retrieval.clip_client import ClipClient
14
+
15
+ from encode_with_pseudo_tokens import encode_with_pseudo_tokens_HF
16
+ from models import build_text_encoder, Phi, PIC2WORD
17
+
18
+ import transformers
19
+ from huggingface_hub import hf_hub_url, cached_download
20
+
21
+
22
+ def parse_args():
23
+ parser = ArgumentParser()
24
+ parser.add_argument("--lincir_ckpt_path", default=None, type=str,
25
+ help="Path to the trained LinCIR (phi) checkpoint")
26
+ parser.add_argument("--pic2word_ckpt_path", default=None, type=str)
27
+ parser.add_argument("--cache_dir", default="./hf_models", type=str,
28
+ help="Path to model cache folder")
29
+ parser.add_argument("--clip_model_name", default="large", type=str,
30
+ help="CLIP model to use, e.g. 'large', 'huge', 'giga'")
31
+ parser.add_argument("--mixed_precision", default="fp16", type=str)
32
+ parser.add_argument("--test_fps", action="store_true")
33
+ args = parser.parse_args()
34
+ return args
35
+
36
+
37
+ def load_models(args):
38
+ if torch.cuda.is_available():
39
+ device = 'cuda:0'
40
+ dtype = torch.float16
41
+ else:
42
+ device = 'cpu'
43
+ dtype = torch.float32
44
+
45
+ clip_vision_model, clip_preprocess, clip_text_model, tokenizer = build_text_encoder(args)
46
+
47
+ tokenizer.add_special_tokens({'additional_special_tokens':["[$]"]}) # 49408
48
+
49
+ # ours
50
+ phi = Phi(input_dim=clip_text_model.config.projection_dim,
51
+ hidden_dim=clip_text_model.config.projection_dim * 4,
52
+ output_dim=clip_text_model.config.hidden_size, dropout=0.0)
53
+ phi.eval()
54
+
55
+ # searle
56
+ phi_searle, _ = torch.hub.load(repo_or_dir='miccunifi/SEARLE', model='searle', source='github',
57
+ backbone='ViT-L/14')
58
+ phi_searle.eval()
59
+
60
+ # pic2word
61
+ phi_pic2word = PIC2WORD(embed_dim=clip_text_model.config.projection_dim,
62
+ output_dim=clip_text_model.config.hidden_size)
63
+ phi_pic2word.eval()
64
+
65
+ clip_vision_model.to(device, dtype=dtype)
66
+ clip_text_model.to(device, dtype=dtype)
67
+
68
+ if not args.test_fps:
69
+ # download and load the pretrained state dicts
70
+ if not os.path.exists('./pretrained_models/lincir_large.pt'):
71
+ model_file_url = hf_hub_url(repo_id='navervision/zeroshot-cir-models', filename='lincir_large.pt')
72
+ cached_download(model_file_url, cache_dir='./pretrained_models', force_filename='lincir_large.pt')
73
+ state_dict = torch.load('./pretrained_models/lincir_large.pt', map_location=device)
74
+ phi.load_state_dict(state_dict['Phi'])
75
+
76
+ if not os.path.exists('./pretrained_models/pic2word_large.pt'):
77
+ model_file_url = hf_hub_url(repo_id='navervision/zeroshot-cir-models', filename='pic2word_large.pt')
78
+ cached_download(model_file_url, cache_dir='./pretrained_models', force_filename='pic2word_large.pt')
79
+ sd = torch.load('./pretrained_models/pic2word_large.pt', map_location=device)['state_dict_img2text']
80
+ sd = {k[len('module.'):]: v for k, v in sd.items()}
81
+ phi_pic2word.load_state_dict(sd)
82
+
83
+ phi.to(device, dtype=dtype)
84
+ phi_searle.to(device, dtype=dtype)
85
+ phi_pic2word.to(device, dtype=dtype)
86
+
87
+ decoder = None
88
+
89
+ return {'clip_vision_model': clip_vision_model,
90
+ 'clip_preprocess': clip_preprocess,
91
+ 'clip_text_model': clip_text_model,
92
+ 'tokenizer': tokenizer,
93
+ 'phi': phi,
94
+ 'phi_searle': phi_searle,
95
+ 'phi_pic2word': phi_pic2word,
96
+ 'decoder': decoder,
97
+ 'device': device,
98
+ 'dtype': dtype,
99
+ 'clip_model_name': args.clip_model_name,
100
+ }
101
+
102
+
103
+ def predict(images, input_text, model_name):
104
+ start_time = time.time()
105
+ input_images = model_dict['clip_preprocess'](images, return_tensors='pt')['pixel_values'].to(model_dict['device'])
106
+ input_text = input_text.replace('$', '[$]')
107
+ input_tokens = model_dict['tokenizer'](text=input_text, return_tensors='pt', padding='max_length', truncation=True)['input_ids'].to(model_dict['device'])
108
+ input_tokens = torch.where(input_tokens == 49408,
109
+ torch.ones_like(input_tokens) * 259,
110
+ input_tokens)
111
+ image_features = model_dict['clip_vision_model'](pixel_values=input_images.to(model_dict['dtype'])).image_embeds
112
+ clip_image_time = time.time() - start_time
113
+
114
+ start_time = time.time()
115
+ if model_name == 'lincir':
116
+ estimated_token_embeddings = model_dict['phi'](image_features)
117
+ elif model_name == 'searle':
118
+ estimated_token_embeddings = model_dict['phi_searle'](image_features)
119
+ else: # model_name == 'pic2word'
120
+ estimated_token_embeddings = model_dict['phi_pic2word'](image_features)
121
+ phi_time = time.time() - start_time
122
+
123
+ start_time = time.time()
124
+ text_embeddings, text_last_hidden_states = encode_with_pseudo_tokens_HF(model_dict['clip_text_model'], input_tokens, estimated_token_embeddings, return_last_states=True)
125
+ clip_text_time = time.time() - start_time
126
+
127
+ start_time = time.time()
128
+ results = client.query(embedding_input=text_embeddings[0].tolist())
129
+ retrieval_time = time.time() - start_time
130
+
131
+ output = ''
132
+
133
+ for idx, result in enumerate(results):
134
+ image_url = result['url']
135
+ output += f'![image]({image_url})\n'
136
+
137
+ time_output = {'CLIP visual extractor': clip_image_time,
138
+ 'CLIP textual extractor': clip_text_time,
139
+ 'Phi projection': phi_time,
140
+ 'CLIP retrieval': retrieval_time,
141
+ }
142
+ setup_output = {'device': model_dict['device'],
143
+ 'dtype': model_dict['dtype'],
144
+ 'Phi': model_name,
145
+ 'CLIP': model_dict['clip_model_name'],
146
+ }
147
+
148
+ return {'time': time_output, 'setup': setup_output}, output
149
+
150
+
151
+ def test_fps(batch_size=1):
152
+ dummy_images = torch.rand([batch_size, 3, 224, 224])
153
+
154
+ todo_list = ['phi', 'phi_pic2word']
155
+
156
+ input_tokens = model_dict['tokenizer'](text=['a photo of $1 with flowers'] * batch_size, return_tensors='pt', padding='max_length', truncation=True)['input_ids'].to(model_dict['device'])
157
+ input_tokens = torch.where(input_tokens == 49409,
158
+ torch.ones_like(input_tokens) * 259,
159
+ input_tokens)
160
+
161
+ for model_name in todo_list:
162
+ time_array = []
163
+ n_repeat = 100
164
+ for _ in range(n_repeat):
165
+ start_time = time.time()
166
+ image_features = model_dict['clip_vision_model'](pixel_values=dummy_images.to(model_dict['clip_vision_model'].device, dtype=model_dict['clip_vision_model'].dtype)).image_embeds
167
+ token_embeddings = model_dict[model_name](image_features)
168
+ text_embeddings = encode_with_pseudo_tokens_HF(model_dict['clip_text_model'], input_tokens, token_embeddings)
169
+ end_time = time.time()
170
+ if _ > 5:
171
+ time_array.append(end_time - start_time)
172
+ print(f"{model_name}: {np.mean(time_array):.4f}")
173
+
174
+
175
+ if __name__ == '__main__':
176
+ args = parse_args()
177
+
178
+ global model_dict, client
179
+
180
+ model_dict = load_models(args)
181
+
182
+ if args.test_fps:
183
+ # check FPS of all models.
184
+ test_fps(1)
185
+ exit()
186
+
187
+
188
+ client = ClipClient(url="https://knn.laion.ai/knn-service",
189
+ indice_name="laion5B-H-14" if args.clip_model_name == "huge" else "laion5B-L-14",
190
+ )
191
+
192
+ title = 'Zeroshot CIR demo'
193
+
194
+ md_title = f'''# {title}
195
+ [LinCIR](https://arxiv.org/abs/2312.01998): Language-only Training of Zero-shot Composed Image Retrieval
196
+ [SEARLE](https://arxiv.org/abs/2303.15247): Zero-shot Composed Image Retrieval with Textual Inversion
197
+ [Pic2Word](https://arxiv.org/abs/2302.03084): Mapping Pictures to Words for Zero-shot Composed Image Retrieval
198
+
199
+ The k-NN index used for the retrieval results is built over the entire LAION-5B image set. This is made possible thanks to the great work of [rom1504](https://github.com/rom1504/clip-retrieval).
200
+ '''
201
+
202
+ with gr.Blocks(title=title) as demo:
203
+ gr.Markdown(md_title)
204
+ with gr.Row():
205
+ with gr.Column():
206
+ with gr.Row():
207
+ image_source = gr.Image(type='pil', label='image1')
208
+ model_name = gr.Radio(['lincir', 'searle', 'pic2word'], label='Phi model', value='lincir')
209
+ text_input = gr.Textbox(value='', label='Input text guidance. Special token is $')
210
+ submit_button = gr.Button('Submit')
211
+ gr.Examples([["example1.jpg", "$, pencil sketch", 'lincir']], inputs=[image_source, text_input, model_name])
212
+ with gr.Column():
213
+ json_output = gr.JSON(label='Processing time')
214
+ md_output = gr.Markdown(label='Output')
215
+
216
+ submit_button.click(predict, inputs=[image_source, text_input, model_name], outputs=[json_output, md_output])
217
+
218
+ demo.queue()
219
+
220
+ demo.launch()
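For context on the retrieval backend wired up above: the demo sends a composed text embedding to the public LAION-5B k-NN service through clip-retrieval. The minimal sketch below (not part of the commit) queries the same service and index with a plain text prompt instead of the composed embedding that `predict()` sends via `embedding_input`; the prompt string is only an illustration.

```python
# Minimal sketch: query the same LAION-5B knn service that app.py uses, with a
# plain text prompt instead of the composed embedding sent via embedding_input.
from clip_retrieval.clip_client import ClipClient

client = ClipClient(url="https://knn.laion.ai/knn-service",
                    indice_name="laion5B-L-14")  # same index app.py picks for the 'large' CLIP model
results = client.query(text="a watercolor painting of a koala")  # illustrative prompt
for result in results[:3]:
    print(result["url"], result.get("similarity"))
```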
data_utils.py ADDED
@@ -0,0 +1,67 @@
1
+ from pathlib import Path
2
+
3
+ import PIL
4
+ import torch
5
+ import torchvision.transforms.functional as FT
6
+ from torch.utils.data import Dataset
7
+ from torchvision.transforms import Compose, CenterCrop, ToTensor, Normalize, Resize
8
+ from torchvision.transforms import InterpolationMode
9
+
10
+ PROJECT_ROOT = Path(__file__).absolute().parents[1].absolute()
11
+
12
+
13
+ def _convert_image_to_rgb(image):
14
+ return image.convert("RGB")
15
+
16
+
17
+ def collate_fn(batch):
18
+ '''
19
+ Function which discards None images in a batch when using a torch DataLoader
20
+ :param batch: input_batch
21
+ :return: output_batch = input_batch - None_values
22
+ '''
23
+ batch = list(filter(lambda x: x is not None, batch))
24
+ return torch.utils.data.dataloader.default_collate(batch)
25
+
26
+
27
+ class TargetPad:
28
+ """
29
+ If an image aspect ratio is above a target ratio, pad the image to match the target ratio.
30
+ For more details see Baldrati et al. 'Effective conditioned and composed image retrieval combining clip-based features.' Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (2022).
31
+ """
32
+
33
+ def __init__(self, target_ratio: float, size: int):
34
+ """
35
+ :param target_ratio: target ratio
36
+ :param size: preprocessing output dimension
37
+ """
38
+ self.size = size
39
+ self.target_ratio = target_ratio
40
+
41
+ def __call__(self, image: PIL.Image.Image) -> PIL.Image.Image:
42
+ w, h = image.size
43
+ actual_ratio = max(w, h) / min(w, h)
44
+ if actual_ratio < self.target_ratio: # check if the ratio is above or below the target ratio
45
+ return image
46
+ scaled_max_wh = max(w, h) / self.target_ratio # rescale the pad to match the target ratio
47
+ hp = max(int((scaled_max_wh - w) / 2), 0)
48
+ vp = max(int((scaled_max_wh - h) / 2), 0)
49
+ padding = [hp, vp, hp, vp]
50
+ return FT.pad(image, padding, 0, 'constant')
51
+
52
+
53
+ def targetpad_transform(target_ratio: float, dim: int) -> Compose:
54
+ """
55
+ CLIP-like preprocessing transform computed after using TargetPad pad
56
+ :param target_ratio: target ratio for TargetPad
57
+ :param dim: image output dimension
58
+ :return: CLIP-like torchvision Compose transform
59
+ """
60
+ return Compose([
61
+ TargetPad(target_ratio, dim),
62
+ Resize(dim, interpolation=InterpolationMode.BICUBIC),
63
+ CenterCrop(dim),
64
+ _convert_image_to_rgb,
65
+ ToTensor(),
66
+ Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711)),
67
+ ])
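As a quick usage note, the transform returned by `targetpad_transform` can be applied directly to a PIL image. A minimal sketch follows; the file name is a placeholder, and the 1.25 ratio simply matches the value used in this repo's evaluation scripts.

```python
# Minimal usage sketch: build the TargetPad-based CLIP preprocessing defined above
# and apply it to a single image. The file name is a placeholder.
import PIL.Image
from data_utils import targetpad_transform

preprocess = targetpad_transform(target_ratio=1.25, dim=224)  # 1.25 is the ratio used in the eval scripts
image = PIL.Image.open("example1.jpg")                        # example1.jpg is the demo image referenced in app.py
tensor = preprocess(image)                                    # torch.Tensor of shape [3, 224, 224]
print(tensor.shape)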
encode_with_pseudo_tokens.py ADDED
@@ -0,0 +1,54 @@
1
+ '''
2
+ LinCIR
3
+ Copyright (c) 2023-present NAVER Corp.
4
+ CC BY-NC-4.0 (https://creativecommons.org/licenses/by-nc/4.0/)
5
+ '''
6
+ import torch
7
+ from clip.model import CLIP
8
+ from transformers import CLIPTextModelWithProjection
9
+
10
+
11
+ def _make_causal_mask(
12
+ input_ids_shape: torch.Size, dtype: torch.dtype, device: torch.device, past_key_values_length: int = 0
13
+ ):
14
+ """
15
+ Make causal mask used for bi-directional self-attention.
16
+ Copy-paste from https://github.com/huggingface/transformers/blob/v4.34.0/src/transformers/models/clip/modeling_clip.py#L679-L693
17
+ """
18
+ bsz, tgt_len = input_ids_shape
19
+ mask = torch.full((tgt_len, tgt_len), torch.finfo(dtype).min, device=device)
20
+ mask_cond = torch.arange(mask.size(-1), device=device)
21
+ mask.masked_fill_(mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 0)
22
+ mask = mask.to(dtype)
23
+
24
+ if past_key_values_length > 0:
25
+ mask = torch.cat([torch.zeros(tgt_len, past_key_values_length, dtype=dtype, device=device), mask], dim=-1)
26
+ return mask[None, None, :, :].expand(bsz, 1, tgt_len, tgt_len + past_key_values_length)
27
+
28
+
29
+ def encode_with_pseudo_tokens_HF(clip_model: CLIPTextModelWithProjection, text: torch.Tensor, pseudo_tokens: torch.Tensor,
30
+ num_tokens=1, return_last_states=False) -> torch.Tensor:
31
+ x = clip_model.text_model.embeddings.token_embedding(text).type(clip_model.dtype) # [batch_size, n_ctx, d_model]
32
+ x = torch.where(text.unsqueeze(-1) == 259,
33
+ pseudo_tokens.unsqueeze(1).type(clip_model.dtype),
34
+ x)
35
+ x = x + clip_model.text_model.embeddings.position_embedding(clip_model.text_model.embeddings.position_ids)
36
+ _causal_attention_mask = _make_causal_mask(text.shape, x.dtype, device=x.device)
37
+ x = clip_model.text_model.encoder(inputs_embeds=x,
38
+ attention_mask=None,
39
+ causal_attention_mask=_causal_attention_mask,
40
+ output_attentions=False,
41
+ output_hidden_states=False,
42
+ return_dict=False)
43
+ x = x[0]
44
+ x_last = clip_model.text_model.final_layer_norm(x)
45
+ x = x_last[torch.arange(x_last.shape[0], device=x_last.device),
46
+ text.to(dtype=torch.int, device=x_last.device).argmax(dim=-1),
47
+ ]
48
+ if hasattr(clip_model, 'text_projection'):
49
+ x = clip_model.text_projection(x)
50
+
51
+ if return_last_states:
52
+ return x, x_last
53
+ else:
54
+ return x
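To illustrate how a pseudo token is spliced into the text encoder, here is a minimal sketch. It assumes the openai/clip-vit-large-patch14 checkpoint stands in for the 'large' backbone that `models.build_text_encoder` actually loads, and it uses a random vector in place of the Phi output.

```python
# Minimal sketch: splice a (random) pseudo token into the CLIP text tower via
# encode_with_pseudo_tokens_HF. The checkpoint name is an assumption standing in
# for the 'large' backbone built by models.build_text_encoder.
import torch
from transformers import CLIPTextModelWithProjection, CLIPTokenizer
from encode_with_pseudo_tokens import encode_with_pseudo_tokens_HF

name = "openai/clip-vit-large-patch14"
text_model = CLIPTextModelWithProjection.from_pretrained(name).eval()
tokenizer = CLIPTokenizer.from_pretrained(name)
tokenizer.add_special_tokens({'additional_special_tokens': ['[$]']})  # id 49408, as in app.py

ids = tokenizer(text='a photo of [$] that is red', return_tensors='pt',
                padding='max_length', truncation=True)['input_ids']
ids = torch.where(ids == 49408, torch.full_like(ids, 259), ids)  # 259 is the id the function replaces

pseudo_token = torch.randn(1, text_model.config.hidden_size)  # normally Phi(image_features)
with torch.no_grad():
    text_features = encode_with_pseudo_tokens_HF(text_model, ids, pseudo_token)
print(text_features.shape)  # [1, projection_dim]
```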
eval_templates.py ADDED
@@ -0,0 +1,70 @@
1
+ '''
2
+ LinCIR
3
+ Copyright (c) 2023-present NAVER Corp.
4
+ CC BY-NC-4.0 (https://creativecommons.org/licenses/by-nc/4.0/)
5
+ '''
6
+ templates = [
7
+ lambda caption: f"a photo of $ that {caption}",
8
+ lambda caption: f"$ that {caption}",
9
+ lambda caption: f"$ with {caption}",
10
+ lambda caption: f"$ , {caption}",
11
+ lambda caption: f"$ adapted to {caption}",
12
+ lambda caption: f"$ modified by {caption}",
13
+ lambda caption: f"$ in response to {caption}",
14
+ lambda caption: f"$ transformed by {caption}",
15
+ lambda caption: f"$ influenced by {caption}",
16
+ lambda caption: f"Retrieval of $ using feedback {caption}",
17
+ lambda caption: f"$ guided by {caption}",
18
+ lambda caption: f"$ adjusted to {caption}",
19
+ lambda caption: f"$ in alignment with {caption}",
20
+ lambda caption: f"$ in correspondence to {caption}",
21
+ lambda caption: f"$ refined with {caption}",
22
+ lambda caption: f"$ as directed by {caption}",
23
+ lambda caption: f"$ evolved from {caption}",
24
+ lambda caption: f"$ inspired by {caption}",
25
+ lambda caption: f"$ with adjustments from {caption}",
26
+ lambda caption: f"$ in consideration of {caption}",
27
+ lambda caption: f"$ , taking into account {caption}",
28
+ lambda caption: f"$ as influenced by the query {caption}",
29
+ lambda caption: f"$ reshaped by {caption}",
30
+ lambda caption: f"$ curated based on {caption}",
31
+ lambda caption: f"$ showcasing {caption}",
32
+ lambda caption: f"An instance of $ where {caption}",
33
+ lambda caption: f"$ highlighting {caption}",
34
+ lambda caption: f"A depiction of $ exhibiting {caption}",
35
+ lambda caption: f"$ as exemplified by {caption}",
36
+ lambda caption: f"$ demonstrating {caption}",
37
+ lambda caption: f"An illustration of $ portraying {caption}",
38
+ lambda caption: f"$ in the context of {caption}",
39
+ lambda caption: f"$ as influenced by {caption}",
40
+ lambda caption: f"$ characterized by {caption}",
41
+ lambda caption: f"$ : An exploration of {caption}",
42
+ lambda caption: f"A presentation of $ underlined by {caption}",
43
+ lambda caption: f"A manifestation of $ reflecting {caption}",
44
+ lambda caption: f"$ in light of {caption}",
45
+ lambda caption: f"$ as a testament to {caption}",
46
+ lambda caption: f"$ intertwined with {caption}",
47
+ lambda caption: f"$ complemented by {caption}",
48
+ lambda caption: f"$ juxtaposed with {caption}",
49
+ lambda caption: f"A representation of $ in relation to {caption}",
50
+ lambda caption: f"$ that {caption}",
51
+ lambda caption: f"$ which {caption}",
52
+ lambda caption: f"$ where it {caption}",
53
+ lambda caption: f"Discover $ that {caption}",
54
+ lambda caption: f"Retrieve $ that {caption}",
55
+ lambda caption: f"Search for $ that {caption}",
56
+ lambda caption: f"Identify $ which {caption}",
57
+ lambda caption: f"Highlight $ that {caption}",
58
+ lambda caption: f"Present $ where it {caption}",
59
+ lambda caption: f"Showcase $ that {caption}",
60
+ lambda caption: f"Explore $ which {caption}",
61
+ lambda caption: f"Find $ that {caption}",
62
+ lambda caption: f"Source $ which {caption}",
63
+ lambda caption: f"View $ where it {caption}",
64
+ lambda caption: f"Examine $ that {caption}",
65
+ lambda caption: f"Analyze $ which {caption}",
66
+ lambda caption: f"Observe $ that {caption}",
67
+ lambda caption: f"Report $ which {caption}",
68
+ lambda caption: f"See $ where it {caption}",
69
+ lambda caption: f"Document $ that {caption}"
70
+ ]
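The entries above are plain callables that wrap a relative caption around the '$' placeholder; a minimal usage sketch:

```python
# Minimal usage sketch: each template is a callable taking a relative caption.
from eval_templates import templates

caption = "has a red roof"
print(templates[0](caption))   # -> "a photo of $ that has a red roof"
print(len(templates))          # number of prompt variants in the list
```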
generate_test_submission.py ADDED
@@ -0,0 +1,363 @@
1
+ import os
2
+ import json
3
+ import pickle
4
+ from argparse import ArgumentParser
5
+ from typing import List, Tuple, Dict
6
+
7
+ import clip
8
+ import numpy as np
9
+ import torch
10
+ import torch.nn.functional as F
11
+ from clip.model import CLIP
12
+ from torch.utils.data import DataLoader
13
+ from tqdm import tqdm
14
+
15
+ from data_utils import PROJECT_ROOT, targetpad_transform
16
+ from loader import CIRRDataset, CIRCODataset
17
+ from encode_with_pseudo_tokens import encode_with_pseudo_tokens_HF
18
+ from models import build_text_encoder, Phi, PIC2WORD
19
+ from utils import extract_image_features, device, collate_fn, extract_pseudo_tokens_with_phi
20
+
21
+
22
+ @torch.no_grad()
23
+ def cirr_generate_test_submission_file(dataset_path: str, image_encoder, text_encoder, ref_names_list: List[str],
24
+ pseudo_tokens: torch.Tensor, preprocess: callable, submission_name: str) -> None:
25
+ """
26
+ Generate the test submission file for the CIRR dataset given the pseudo tokens
27
+ """
28
+
29
+ # Load the CLIP model
30
+ #clip_model, _ = clip.load(clip_model_name, device=device, jit=False)
31
+ #clip_model = clip_model.float().eval()
32
+
33
+ # Compute the index features
34
+ classic_test_dataset = CIRRDataset(dataset_path, 'test1', 'classic', preprocess)
35
+ index_features, index_names = extract_image_features(classic_test_dataset, image_encoder)
36
+
37
+ relative_test_dataset = CIRRDataset(dataset_path, 'test1', 'relative', preprocess)
38
+
39
+ # Get the predictions dicts
40
+ pairid_to_retrieved_images, pairid_to_group_retrieved_images = \
41
+ cirr_generate_test_dicts(relative_test_dataset, text_encoder, index_features, index_names,
42
+ ref_names_list, pseudo_tokens)
43
+
44
+ submission = {
45
+ 'version': 'rc2',
46
+ 'metric': 'recall'
47
+ }
48
+ group_submission = {
49
+ 'version': 'rc2',
50
+ 'metric': 'recall_subset'
51
+ }
52
+
53
+ submission.update(pairid_to_retrieved_images)
54
+ group_submission.update(pairid_to_group_retrieved_images)
55
+
56
+ submissions_folder_path = os.path.join('./submission', 'cirr')
57
+ os.makedirs(submissions_folder_path, exist_ok=True)
58
+
59
+ with open(os.path.join(submissions_folder_path, f"{submission_name}.json"), 'w+') as file:
60
+ json.dump(submission, file, sort_keys=True)
61
+
62
+ with open(os.path.join(submissions_folder_path, f"subset_{submission_name}.json"), 'w+') as file:
63
+ json.dump(group_submission, file, sort_keys=True)
64
+
65
+
66
+ def cirr_generate_test_dicts(relative_test_dataset: CIRRDataset, clip_model, index_features: torch.Tensor,
67
+ index_names: List[str], ref_names_list: List[str], pseudo_tokens: List[str]) \
68
+ -> Tuple[Dict[str, List[str]], Dict[str, List[str]]]:
69
+ """
70
+ Generate the test submission dicts for the CIRR dataset given the pseudo tokens
71
+ """
72
+
73
+ # Get the predicted features
74
+ predicted_features, reference_names, pairs_id, group_members = \
75
+ cirr_generate_test_predictions(clip_model, relative_test_dataset, ref_names_list, pseudo_tokens)
76
+
77
+ print(f"Compute CIRR prediction dicts")
78
+
79
+ # Normalize the index features
80
+ index_features = index_features.to(device)
81
+ index_features = F.normalize(index_features, dim=-1).float()
82
+
83
+ # Compute the distances and sort the results
84
+ distances = 1 - predicted_features @ index_features.T
85
+ sorted_indices = torch.argsort(distances, dim=-1).cpu()
86
+ sorted_index_names = np.array(index_names)[sorted_indices]
87
+
88
+ # Delete the reference image from the results
89
+ reference_mask = torch.tensor(
90
+ sorted_index_names != np.repeat(np.array(reference_names), len(index_names)).reshape(len(sorted_index_names),
91
+ -1))
92
+ sorted_index_names = sorted_index_names[reference_mask].reshape(sorted_index_names.shape[0],
93
+ sorted_index_names.shape[1] - 1)
94
+ # Compute the subset predictions
95
+ group_members = np.array(group_members)
96
+ group_mask = (sorted_index_names[..., None] == group_members[:, None, :]).sum(-1).astype(bool)
97
+ sorted_group_names = sorted_index_names[group_mask].reshape(sorted_index_names.shape[0], -1)
98
+
99
+ # Generate prediction dicts
100
+ pairid_to_retrieved_images = {str(int(pair_id)): prediction[:50].tolist() for (pair_id, prediction) in
101
+ zip(pairs_id, sorted_index_names)}
102
+ pairid_to_group_retrieved_images = {str(int(pair_id)): prediction[:3].tolist() for (pair_id, prediction) in
103
+ zip(pairs_id, sorted_group_names)}
104
+
105
+ return pairid_to_retrieved_images, pairid_to_group_retrieved_images
106
+
107
+
108
+ def cirr_generate_test_predictions(clip_model, relative_test_dataset: CIRRDataset, ref_names_list: List[str],
109
+ pseudo_tokens: torch.Tensor) -> \
110
+ Tuple[torch.Tensor, List[str], List[str], List[List[str]]]:
111
+ """
112
+ Generate the test prediction features for the CIRR dataset given the pseudo tokens
113
+ """
114
+
115
+ # Create the test dataloader
116
+ relative_test_loader = DataLoader(dataset=relative_test_dataset, batch_size=32, num_workers=10,
117
+ pin_memory=False)
118
+
119
+ predicted_features_list = []
120
+ reference_names_list = []
121
+ pair_id_list = []
122
+ group_members_list = []
123
+
124
+ # Compute the predictions
125
+ for batch in tqdm(relative_test_loader):
126
+ reference_names = batch['reference_name']
127
+ pairs_id = batch['pair_id']
128
+ relative_captions = batch['relative_caption']
129
+ group_members = batch['group_members']
130
+
131
+ group_members = np.array(group_members).T.tolist()
132
+
133
+ input_captions = [
134
+ f"a photo of $ that {rel_caption}" for rel_caption in relative_captions]
135
+
136
+ batch_tokens = torch.vstack([pseudo_tokens[ref_names_list.index(ref)].unsqueeze(0) for ref in reference_names])
137
+ tokenized_input_captions = clip.tokenize(input_captions, context_length=77).to(device)
138
+ text_features = encode_with_pseudo_tokens_HF(clip_model, tokenized_input_captions, batch_tokens)
139
+
140
+ predicted_features = F.normalize(text_features)
141
+
142
+ predicted_features_list.append(predicted_features)
143
+ reference_names_list.extend(reference_names)
144
+ pair_id_list.extend(pairs_id)
145
+ group_members_list.extend(group_members)
146
+
147
+ predicted_features = torch.vstack(predicted_features_list)
148
+
149
+ return predicted_features, reference_names_list, pair_id_list, group_members_list
150
+
151
+
152
+ @torch.no_grad()
153
+ def circo_generate_test_submission_file(dataset_path: str, image_encoder, text_encoder, ref_names_list: List[str],
154
+ pseudo_tokens: torch.Tensor, preprocess: callable,
155
+ submission_name: str) -> None:
156
+ """
157
+ Generate the test submission file for the CIRCO dataset given the pseudo tokens
158
+ """
159
+
160
+ # Load the CLIP model
161
+ #clip_model, _ = clip.load(clip_model_name, device=device, jit=False)
162
+ #clip_model = clip_model.float().eval().requires_grad_(False)
163
+
164
+ # Compute the index features
165
+ classic_test_dataset = CIRCODataset(dataset_path, 'test', 'classic', preprocess)
166
+ index_features, index_names = extract_image_features(classic_test_dataset, image_encoder)
167
+
168
+ relative_test_dataset = CIRCODataset(dataset_path, 'test', 'relative', preprocess)
169
+
170
+ # Get the predictions dict
171
+ queryid_to_retrieved_images = circo_generate_test_dict(relative_test_dataset, text_encoder, index_features,
172
+ index_names, ref_names_list, pseudo_tokens)
173
+
174
+ submissions_folder_path = os.path.join('./submission', 'circo')
175
+ os.makedirs(submissions_folder_path, exist_ok=True)
176
+
177
+ with open(os.path.join(submissions_folder_path, f"{submission_name}.json"), 'w+') as file:
178
+ json.dump(queryid_to_retrieved_images, file, sort_keys=True)
179
+
180
+
181
+ def circo_generate_test_predictions(clip_model, relative_test_dataset: CIRCODataset, ref_names_list: List[str],
182
+ pseudo_tokens: torch.Tensor) -> Tuple[torch.Tensor, List[str]]:
183
+ """
184
+ Generate the test prediction features for the CIRCO dataset given the pseudo tokens
185
+ """
186
+
187
+ # Create the test dataloader
188
+ relative_test_loader = DataLoader(dataset=relative_test_dataset, batch_size=32, num_workers=10,
189
+ pin_memory=False, collate_fn=collate_fn, shuffle=False)
190
+
191
+ predicted_features_list = []
192
+ query_ids_list = []
193
+
194
+ # Compute the predictions
195
+ for batch in tqdm(relative_test_loader):
196
+ reference_names = batch['reference_name']
197
+ relative_captions = batch['relative_caption']
198
+ query_ids = batch['query_id']
199
+
200
+ input_captions = [f"a photo of $ that {caption}" for caption in relative_captions]
201
+ batch_tokens = torch.vstack([pseudo_tokens[ref_names_list.index(ref)].unsqueeze(0) for ref in reference_names])
202
+ tokenized_input_captions = clip.tokenize(input_captions, context_length=77).to(device)
203
+ text_features = encode_with_pseudo_tokens_HF(clip_model, tokenized_input_captions, batch_tokens)
204
+ predicted_features = F.normalize(text_features)
205
+
206
+ predicted_features_list.append(predicted_features)
207
+ query_ids_list.extend(query_ids)
208
+
209
+ predicted_features = torch.vstack(predicted_features_list)
210
+ return predicted_features, query_ids_list
211
+
212
+
213
+ def circo_generate_test_dict(relative_test_dataset: CIRCODataset, clip_model, index_features: torch.Tensor,
214
+ index_names: List[str], ref_names_list: List[str], pseudo_tokens: torch.Tensor) \
215
+ -> Dict[str, List[str]]:
216
+ """
217
+ Generate the test submission dicts for the CIRCO dataset given the pseudo tokens
218
+ """
219
+
220
+ # Get the predicted features
221
+ predicted_features, query_ids = circo_generate_test_predictions(clip_model, relative_test_dataset,
222
+ ref_names_list, pseudo_tokens)
223
+
224
+ # Normalize the features
225
+ index_features = index_features.float().to(device)
226
+ index_features = F.normalize(index_features, dim=-1)
227
+
228
+ # Compute the similarity
229
+ similarity = predicted_features @ index_features.T
230
+ sorted_indices = torch.topk(similarity, dim=-1, k=50).indices.cpu()
231
+ sorted_index_names = np.array(index_names)[sorted_indices]
232
+
233
+ # Generate prediction dicts
234
+ queryid_to_retrieved_images = {query_id: query_sorted_names[:50].tolist() for
235
+ (query_id, query_sorted_names) in zip(query_ids, sorted_index_names)}
236
+
237
+ return queryid_to_retrieved_images
238
+
239
+
240
+ def main():
241
+ parser = ArgumentParser()
242
+ parser.add_argument("--submission-name", type=str, required=True, help="Filename of the generated submission file")
243
+ parser.add_argument("--exp-name", type=str, help="Experiment to evaluate")
244
+ parser.add_argument("--dataset", type=str, required=True, choices=['cirr', 'circo'], help="Dataset to use")
245
+ parser.add_argument("--dataset-path", type=str, help="Path to the dataset", required=True)
246
+ parser.add_argument("--eval-type", type=str, choices=['oti', 'phi', 'searle', 'searle-xl', 'pic2word'], required=True,
247
+ help="If 'oti' evaluates directly using the inverted oti pseudo tokens, "
248
+ "if 'phi' predicts the pseudo tokens using the phi network, "
249
+ "if 'searle' uses the pre-trained SEARLE model to predict the pseudo tokens, "
250
+ "if 'searle-xl' uses the pre-trained SEARLE-XL model to predict the pseudo tokens")
251
+
252
+ parser.add_argument("--preprocess-type", default="clip", type=str, choices=['clip', 'targetpad'],
253
+ help="Preprocess pipeline to use")
254
+ parser.add_argument("--phi-checkpoint-name", type=str,
255
+ help="Phi checkpoint to use, needed when using phi, e.g. 'phi_20.pt'")
256
+
257
+ parser.add_argument("--clip_model_name", default="giga", type=str)
258
+ parser.add_argument("--cache_dir", default="./hf_models", type=str)
259
+
260
+ parser.add_argument("--l2_normalize", action="store_true", help="Whether or not to use l2 normalization")
261
+
262
+ args = parser.parse_args()
263
+
264
+ if args.eval_type == 'oti':
265
+ experiment_path = PROJECT_ROOT / 'data' / "oti_pseudo_tokens" / args.dataset.lower() / 'test' / args.exp_name
266
+
267
+ with open(experiment_path / 'hyperparameters.json') as f:
268
+ hyperparameters = json.load(f)
269
+
270
+ pseudo_tokens = torch.load(experiment_path / 'ema_oti_pseudo_tokens.pt', map_location=device)
271
+ with open(experiment_path / 'image_names.pkl', 'rb') as f:
272
+ ref_names_list = pickle.load(f)
273
+
274
+ clip_model_name = hyperparameters['clip_model_name']
275
+ clip_model, clip_preprocess = clip.load(clip_model_name, device='cpu', jit=False)
276
+
277
+ if args.preprocess_type == 'targetpad':
278
+ print('Target pad preprocess pipeline is used')
279
+ preprocess = targetpad_transform(1.25, clip_model.visual.input_resolution)
280
+ elif args.preprocess_type == 'clip':
281
+ print('CLIP preprocess pipeline is used')
282
+ preprocess = clip_preprocess
283
+ else:
284
+ raise ValueError("Preprocess type not supported")
285
+
286
+
287
+ elif args.eval_type in ['phi', 'searle', 'searle-xl', 'pic2word']:
288
+ if args.eval_type == 'phi':
289
+ args.mixed_precision = 'fp16'
290
+ image_encoder, clip_preprocess, text_encoder, tokenizer = build_text_encoder(args)
291
+
292
+ phi = Phi(input_dim=text_encoder.config.projection_dim,
293
+ hidden_dim=text_encoder.config.projection_dim * 4,
294
+ output_dim=text_encoder.config.hidden_size, dropout=0.5).to(
295
+ device)
296
+
297
+ phi.load_state_dict(
298
+ torch.load(args.phi_checkpoint_name, map_location=device)[
299
+ phi.__class__.__name__])
300
+ phi = phi.eval()
301
+
302
+ elif args.eval_type == 'pic2word':
303
+ args.mixed_precision = 'fp16'
304
+ image_encoder, clip_preprocess, text_encoder, tokenizer = build_text_encoder(args)
305
+
306
+ phi = PIC2WORD(embed_dim=text_encoder.config.projection_dim,
307
+ output_dim=text_encoder.config.hidden_size,
308
+ ).to(device)
309
+ sd = torch.load(args.phi_checkpoint_name, map_location=device)['state_dict_img2text']
310
+ sd = {k[len('module.'):]: v for k, v in sd.items()}
311
+ phi.load_state_dict(sd)
312
+ phi = phi.eval()
313
+
314
+ else: # searle or searle-xl
315
+ if args.eval_type == 'searle':
316
+ clip_model_name = 'ViT-B/32'
317
+ else: # args.eval_type == 'searle-xl':
318
+ clip_model_name = 'ViT-L/14'
319
+ phi, _ = torch.hub.load(repo_or_dir='miccunifi/SEARLE', model='searle', source='github',
320
+ backbone=clip_model_name)
321
+
322
+ phi = phi.to(device).eval()
323
+ clip_model, clip_preprocess = clip.load(clip_model_name, device=device, jit=False)
324
+
325
+ if args.preprocess_type == 'targetpad':
326
+ print('Target pad preprocess pipeline is used')
327
+ preprocess = targetpad_transform(1.25, clip_model.visual.input_resolution)
328
+ elif args.preprocess_type == 'clip':
329
+ print('CLIP preprocess pipeline is used')
330
+ preprocess = clip_preprocess
331
+ else:
332
+ raise ValueError("Preprocess type not supported")
333
+
334
+ if args.dataset.lower() == 'cirr':
335
+ relative_test_dataset = CIRRDataset(args.dataset_path, 'test', 'relative', preprocess, no_duplicates=True)
336
+ elif args.dataset.lower() == 'circo':
337
+ relative_test_dataset = CIRCODataset(args.dataset_path, 'test', 'relative', preprocess)
338
+ else:
339
+ raise ValueError("Dataset not supported")
340
+
341
+ #clip_model = clip_model.float().to(device)
342
+ image_encoder = image_encoder.float().to(device)
343
+ text_encoder = text_encoder.float().to(device)
344
+ pseudo_tokens, ref_names_list = extract_pseudo_tokens_with_phi(image_encoder, phi, relative_test_dataset, args)
345
+ pseudo_tokens = pseudo_tokens.to(device)
346
+ else:
347
+ raise ValueError("Eval type not supported")
348
+
349
+ print(f"Eval type = {args.eval_type} \t exp name = {args.exp_name} \t")
350
+
351
+ if args.dataset == 'cirr':
352
+ cirr_generate_test_submission_file(args.dataset_path, image_encoder, text_encoder, ref_names_list, pseudo_tokens,
353
+ preprocess, args.submission_name)
354
+ elif args.dataset == 'circo':
355
+ circo_generate_test_submission_file(args.dataset_path, image_encoder, text_encoder, ref_names_list, pseudo_tokens,
356
+ preprocess, args.submission_name)
357
+
358
+ else:
359
+ raise ValueError("Dataset not supported")
360
+
361
+
362
+ if __name__ == '__main__':
363
+ main()
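The core of the CIRCO submission step above is a cosine-similarity ranking followed by a top-50 cut, as in `circo_generate_test_dict`. A self-contained sketch with random placeholder features and made-up gallery/query names:

```python
# Self-contained sketch of the ranking step in circo_generate_test_dict, using
# random placeholder features and made-up gallery/query names.
import numpy as np
import torch
import torch.nn.functional as F

index_names = [f"img_{i}" for i in range(1000)]               # placeholder gallery
index_features = F.normalize(torch.randn(1000, 768), dim=-1)  # normally CLIP image embeddings
predicted_features = F.normalize(torch.randn(4, 768), dim=-1) # normally composed text embeddings

similarity = predicted_features @ index_features.T
sorted_indices = torch.topk(similarity, dim=-1, k=50).indices.cpu()
sorted_index_names = np.array(index_names)[sorted_indices]

query_ids = ["q0", "q1", "q2", "q3"]
queryid_to_retrieved_images = {qid: names[:50].tolist()
                               for qid, names in zip(query_ids, sorted_index_names)}
print(len(queryid_to_retrieved_images["q0"]))  # 50
```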
loader.py ADDED
@@ -0,0 +1,632 @@
1
+ '''
2
+ LinCIR
3
+ Copyright (c) 2023-present NAVER Corp.
4
+ CC BY-NC-4.0 (https://creativecommons.org/licenses/by-nc/4.0/)
5
+ '''
6
+ import os
7
+ import functools
8
+ import glob
9
+ import random
10
+ import json
11
+ from pathlib import Path
12
+ from typing import List, Optional, Union, Dict, Literal
13
+ import PIL
14
+ import PIL.Image
15
+ import torch
16
+ from torch.utils.data import Dataset
17
+ import webdataset as wds
18
+ import spacy
19
+ import numpy as np
20
+ import sng_parser
21
+ import datasets
22
+
23
+
24
+ def extract_keywords(spacy_nlp, caption):
25
+ candidates = []
26
+ nlp_caption = caption
27
+
28
+ doc = spacy_nlp(nlp_caption)
29
+
30
+ tmp = ''
31
+ for word in doc:
32
+ if word.pos_ == 'ADJ':
33
+ if tmp == '':
34
+ tmp += word.text
35
+ else:
36
+ tmp += ' ' + word.text
37
+ elif word.pos_ == 'NOUN' or word.pos_ == 'PROPN':
38
+ if tmp == '':
39
+ tmp += word.text
40
+ else:
41
+ tmp += ' ' + word.text
42
+ else:
43
+ if tmp != '':
44
+ candidates.append(tmp)
45
+ tmp = ''
46
+ if tmp != '':
47
+ candidates.append(tmp)
48
+
49
+ candidates = list(set(candidates))
50
+
51
+ return candidates
52
+
53
+
54
+ def extract_keywords_spacy(spacy_nlp, caption):
55
+ sequences = []
56
+ current_sequence = []
57
+ doc = spacy_nlp(caption)
58
+ for token in doc:
59
+ # Check if the token is a noun, proper noun, or adjective
60
+ if token.pos_ in ['NOUN', 'PROPN', 'ADJ', 'DET']:
61
+ current_sequence.append(token.text)
62
+ else:
63
+ # If we encounter a token that's not one of the desired POS and current_sequence is not empty
64
+ if current_sequence:
65
+ sequences.append(" ".join(current_sequence))
66
+ current_sequence = []
67
+
68
+ # Adding any remaining sequence after the loop
69
+ if current_sequence:
70
+ sequences.append(" ".join(current_sequence))
71
+
72
+ return sequences
73
+
74
+
75
+ def extract_sng(caption):
76
+ graph = sng_parser.parse(caption)
77
+ entities = [x['head'] for i, x in enumerate(graph['entities'])]
78
+ relations = [{'subject': entities[x['subject']], 'object': entities[x['object']], 'relation': x['relation']} for x in graph['relations']]
79
+ return entities, relations
80
+
81
+
82
+ def clean_caption(caption, tokenizer):
83
+ if caption is None:
84
+ caption = ''
85
+ if '<PERSON>' in caption: # to handle with GCC12M
86
+ caption = caption.replace('<PERSON>', 'person')
87
+ caption = caption.lower().replace('$', '').strip()
88
+ tokens = tokenizer.encode(caption, padding='longest', return_tensors='pt')
89
+ if tokens.shape[1] > 77:
90
+ caption = tokenizer.batch_decode(tokens[:,1:76])[0]
91
+ return caption
92
+
93
+
94
+ def preprocess_precomputed_base(sample, spacy_nlp, keywords_list, tokenizer):
95
+ '''
96
+ 'image_feature.npy','json'
97
+ '''
98
+ image_feature, image_feature_giga, meta = sample
99
+
100
+ caption = clean_caption(meta['source_caption'], tokenizer)
101
+
102
+ keywords = ['']
103
+ try:
104
+ keywords = extract_keywords_spacy(spacy_nlp, caption)
105
+ except Exception as e:
106
+ #print(e)
107
+ pass
108
+
109
+ # for keywords
110
+ indicator = 1
111
+ replaced_caption = caption
112
+ for keyword in keywords:
113
+ if keyword != '' and keyword in caption:
114
+ replaced_caption = replaced_caption.replace(keyword, '[$]')
115
+ else:
116
+ tmp_keywords = caption.split(' ')
117
+ if len(tmp_keywords) > 0:
118
+ selected_keywords = random.sample(tmp_keywords, k=min(int(len(tmp_keywords) * 1.0), 1))
119
+ for selected_keyword in selected_keywords:
120
+ replaced_caption = replaced_caption.replace(selected_keyword, '[$]')
121
+ else:
122
+ replaced_caption = f'a photo of [$] that {caption}'
123
+ indicator = 0
124
+ break
125
+
126
+ token_dict = tokenizer(text=caption, return_tensors='pt', padding='max_length', truncation=True)
127
+ tokens, attention_mask = token_dict['input_ids'][0], token_dict['attention_mask'][0]
128
+
129
+ replaced_token_dict = tokenizer(text=replaced_caption, return_tensors='pt', padding='max_length', truncation=True)
130
+ replaced_tokens, replaced_attention_mask = replaced_token_dict['input_ids'][0], replaced_token_dict['attention_mask'][0]
131
+
132
+ replaced_tokens = torch.where(replaced_tokens == 49408,
133
+ torch.ones_like(replaced_tokens) * 259,
134
+ replaced_tokens)
135
+
136
+ if 259 not in replaced_tokens:
137
+ replaced_caption = 'a photo of [$]'
138
+ replaced_token_dict = tokenizer(text=replaced_caption, return_tensors='pt', padding='max_length', truncation=True)
139
+ replaced_tokens, replaced_attention_mask = replaced_token_dict['input_ids'][0], replaced_token_dict['attention_mask'][0]
140
+
141
+ replaced_tokens = torch.where(replaced_tokens == 49408,
142
+ torch.ones_like(replaced_tokens) * 259,
143
+ replaced_tokens)
144
+ indicator = 0
145
+
146
+ new_sample = [tokens, replaced_tokens, indicator]
147
+
148
+ return tuple(new_sample)
149
+
150
+
151
+ class CaptionDataset(Dataset):
152
+ def __init__(self, captions, tokenizer, spacy_nlp):
153
+ self.captions = captions
154
+ self.tokenizer = tokenizer
155
+ self.spacy_nlp = spacy_nlp
156
+
157
+ def __len__(self):
158
+ return len(self.captions)
159
+
160
+ def __getitem__(self, idx):
161
+ caption = self.captions[idx]
162
+
163
+ caption = clean_caption(caption, self.tokenizer)
164
+
165
+ keywords = [""]
166
+ try:
167
+ keywords = extract_keywords_spacy(self.spacy_nlp, caption)
168
+ except Exception as e:
169
+ #print(e)
170
+ pass
171
+
172
+ # for keywords
173
+ indicator = 1
174
+ replaced_caption = caption
175
+
176
+ if len(keywords) == 0:
177
+ keywords = [""]
178
+
179
+ for keyword in keywords:
180
+ if keyword != '' and keyword in caption:
181
+ replaced_caption = replaced_caption.replace(keyword, '[$]')
182
+ else:
183
+ tmp_keywords = caption.split(' ')
184
+ if len(tmp_keywords) > 0:
185
+ selected_keywords = random.sample(tmp_keywords, k=min(int(len(tmp_keywords) * 1.0), 1))
186
+ for selected_keyword in selected_keywords:
187
+ replaced_caption = replaced_caption.replace(selected_keyword, '[$]')
188
+ else:
189
+ replaced_caption = f'a photo of [$] that {caption}'
190
+ indicator = 0
191
+ break
192
+
193
+ token_dict = self.tokenizer(text=caption, return_tensors='pt', padding='max_length', truncation=True)
194
+ tokens, attention_mask = token_dict['input_ids'][0], token_dict['attention_mask'][0]
195
+
196
+ replaced_token_dict = self.tokenizer(text=replaced_caption, return_tensors='pt', padding='max_length', truncation=True)
197
+ replaced_tokens, replaced_attention_mask = replaced_token_dict['input_ids'][0], replaced_token_dict['attention_mask'][0]
198
+
199
+ replaced_tokens = torch.where(replaced_tokens == 49408,
200
+ torch.ones_like(replaced_tokens) * 259,
201
+ replaced_tokens)
202
+
203
+ if 259 not in replaced_tokens:
204
+ replaced_caption = 'a photo of [$]'
205
+ replaced_token_dict = self.tokenizer(text=replaced_caption, return_tensors='pt', padding='max_length', truncation=True)
206
+ replaced_tokens, replaced_attention_mask = replaced_token_dict['input_ids'][0], replaced_token_dict['attention_mask'][0]
207
+
208
+ replaced_tokens = torch.where(replaced_tokens == 49408,
209
+ torch.ones_like(replaced_tokens) * 259,
210
+ replaced_tokens)
211
+ indicator = 0
212
+
213
+ return tokens, replaced_tokens, indicator
214
+
215
+
216
+ def build_loader(args, tokenizer, accelerator):
217
+ data_names = {'dataset1': 'dangne/gcc_caption_only',
218
+ 'dataset2': 'FredZhang7/stable-diffusion-prompts-2.47M',
219
+ 'dataset3': 'Geonmo/midjourney-prompts-only',
220
+ }
221
+
222
+ for k, v in data_names.items():
223
+ if not os.path.exists(os.path.join('./datasets', k)):
224
+ if accelerator.is_main_process:
225
+ print('Downloading captions is required')
226
+ db = datasets.load_dataset(v, cache_dir=os.path.join('./datasets', k))
227
+
228
+ captions = []
229
+ for k, v in data_names.items():
230
+ db = datasets.load_dataset(v, cache_dir=os.path.join('./datasets', k))
231
+ captions += db['train']['text']
232
+
233
+ dataset = CaptionDataset(captions, tokenizer, spacy.load('en_core_web_sm'))
234
+ data_loader = torch.utils.data.DataLoader(dataset, batch_size=args.batch_size, num_workers=args.num_workers, drop_last=True, shuffle=True)
235
+
236
+ return data_loader
237
+
238
+
239
+ class FashionIQDataset(Dataset):
240
+ """
241
+ Copy-paste from https://github.com/miccunifi/SEARLE/blob/main/src/datasets.py
242
+ FashionIQ dataset class for PyTorch.
243
+ The dataset can be used in 'relative' or 'classic' mode:
244
+ - In 'classic' mode the dataset yields a dict with keys ['image', 'image_name']
245
+ - In 'relative' mode the dataset yields a dict with keys:
246
+ - ['reference_image', 'reference_name', 'target_image', 'target_name', 'relative_captions'] when
247
+ split in ['train', 'val']
248
+ - ['reference_image', 'reference_name', 'relative_captions'] when split == test
249
+ """
250
+
251
+ def __init__(self, dataset_path: Union[Path, str], split: Literal['train', 'val', 'test'], dress_types: List[str],
252
+ mode: Literal['relative', 'classic'], preprocess: callable, no_duplicates: Optional[bool] = False):
253
+ """
254
+ :param dataset_path: path to the FashionIQ dataset
255
+ :param split: dataset split, should be in ['train', 'val', 'test']
256
+ :param dress_types: list of fashionIQ categories, each category should be in ['dress', 'shirt', 'toptee']
257
+ :param mode: dataset mode, should be in ['relative', 'classic']:
258
+ - In 'classic' mode the dataset yields a dict with keys ['image', 'image_name']
259
+ - In 'relative' mode the dataset yields a dict with keys:
260
+ - ['reference_image', 'reference_name', 'target_image', 'target_name', 'relative_captions']
261
+ when split in ['train', 'val']
262
+ - ['reference_image', 'reference_name', 'relative_captions'] when split == test
263
+ :param preprocess: function which preprocesses the image
264
+ :param no_duplicates: if True, the dataset will not yield duplicate images in relative mode, does not affect classic mode
265
+ """
266
+ dataset_path = Path(dataset_path)
267
+ self.dataset_path = dataset_path
268
+ self.mode = mode
269
+ self.dress_types = dress_types
270
+ self.split = split
271
+ self.no_duplicates = no_duplicates
272
+
273
+ # Validate the inputs
274
+ if mode not in ['relative', 'classic']:
275
+ raise ValueError("mode should be in ['relative', 'classic']")
276
+ if split not in ['test', 'train', 'val']:
277
+ raise ValueError("split should be in ['test', 'train', 'val']")
278
+ for dress_type in dress_types:
279
+ if dress_type not in ['dress', 'shirt', 'toptee']:
280
+ raise ValueError("dress_type should be in ['dress', 'shirt', 'toptee']")
281
+
282
+ self.preprocess = preprocess
283
+
284
+ # get triplets made by (reference_image, target_image, a pair of relative captions)
285
+ self.triplets: List[dict] = []
286
+ for dress_type in dress_types:
287
+ with open(dataset_path / 'captions' / f'cap.{dress_type}.{split}.json') as f:
288
+ self.triplets.extend(json.load(f))
289
+
290
+ # Remove duplicates from triplets
291
+ if self.no_duplicates:
292
+ seen = set()
293
+ new_triplets = []
294
+ for triplet in self.triplets:
295
+ if triplet['candidate'] not in seen:
296
+ seen.add(triplet['candidate'])
297
+ new_triplets.append(triplet)
298
+ self.triplets = new_triplets
299
+
300
+ # get the image names
301
+ self.image_names: list = []
302
+ for dress_type in dress_types:
303
+ with open(dataset_path / 'image_splits' / f'split.{dress_type}.{split}.json') as f:
304
+ self.image_names.extend(json.load(f))
305
+
306
+ print(f"FashionIQ {split} - {dress_types} dataset in {mode} mode initialized")
307
+
308
+ def __getitem__(self, index) -> dict:
309
+ try:
310
+ if self.mode == 'relative':
311
+ relative_captions = self.triplets[index]['captions']
312
+ reference_name = self.triplets[index]['candidate']
313
+
314
+ if self.split in ['train', 'val']:
315
+ reference_image_path = self.dataset_path / 'images' / f"{reference_name}.jpg"
316
+ reference_image = self.preprocess(PIL.Image.open(reference_image_path), return_tensors='pt')['pixel_values'][0]
317
+ target_name = self.triplets[index]['target']
318
+ target_image_path = self.dataset_path / 'images' / f"{target_name}.jpg"
319
+ target_image = self.preprocess(PIL.Image.open(target_image_path), return_tensors='pt')['pixel_values'][0]
320
+
321
+ return {
322
+ 'reference_image': reference_image,
323
+ 'reference_name': reference_name,
324
+ 'target_image': target_image,
325
+ 'target_name': target_name,
326
+ 'relative_captions': relative_captions
327
+ }
328
+
329
+ elif self.split == 'test':
330
+ reference_image_path = self.dataset_path / 'images' / f"{reference_name}.jpg"
331
+ reference_image = self.preprocess(PIL.Image.open(reference_image_path), return_tensors='pt')['pixel_values'][0]
332
+
333
+ return {
334
+ 'reference_image': reference_image,
335
+ 'reference_name': reference_name,
336
+ 'relative_captions': relative_captions
337
+ }
338
+
339
+ elif self.mode == 'classic':
340
+ image_name = self.image_names[index]
341
+ image_path = self.dataset_path / 'images' / f"{image_name}.jpg"
342
+ image = self.preprocess(PIL.Image.open(image_path), return_tensors='pt')['pixel_values'][0]
343
+
344
+ return {
345
+ 'image': image,
346
+ 'image_name': image_name
347
+ }
348
+
349
+ else:
350
+ raise ValueError("mode should be in ['relative', 'classic']")
351
+ except Exception as e:
352
+ print(f"Exception: {e}")
353
+
354
+ def __len__(self):
355
+ if self.mode == 'relative':
356
+ return len(self.triplets)
357
+ elif self.mode == 'classic':
358
+ return len(self.image_names)
359
+ else:
360
+ raise ValueError("mode should be in ['relative', 'classic']")
361
+
362
+
363
+ class CIRRDataset(Dataset):
364
+ """
365
+ Copy-paste from https://github.com/miccunifi/SEARLE/blob/main/src/datasets.py
366
+ CIRR dataset class for PyTorch dataloader.
367
+ The dataset can be used in 'relative' or 'classic' mode:
368
+ - In 'classic' mode the dataset yields a dict with keys ['image', 'image_name']
369
+ - In 'relative' mode the dataset yields a dict with keys:
370
+ - ['reference_image', 'reference_name', 'target_image', 'target_name', 'relative_caption', 'group_members']
371
+ when split in ['train', 'val']
372
+ - ['reference_image', 'reference_name', 'relative_caption', 'group_members', 'pair_id'] when split == test
373
+ """
374
+
375
+ def __init__(self, dataset_path: Union[Path, str], split: Literal['train', 'val', 'test'],
376
+ mode: Literal['relative', 'classic'], preprocess: callable, no_duplicates: Optional[bool] = False):
377
+ """
378
+ :param dataset_path: path to the CIRR dataset
379
+ :param split: dataset split, should be in ['train', 'val', 'test']
380
+ :param mode: dataset mode, should be in ['relative', 'classic']:
381
+ - In 'classic' mode the dataset yields a dict with keys ['image', 'image_name']
382
+ - In 'relative' mode the dataset yields a dict with keys:
383
+ - ['reference_image', 'reference_name', 'target_image', 'target_name', 'relative_caption',
384
+ 'group_members'] when split in ['train', 'val']
385
+ - ['reference_image', 'reference_name', 'relative_caption', 'group_members', 'pair_id'] when split == test
386
+ :param preprocess: function which preprocesses the image
387
+ :param no_duplicates: if True, the dataset will not yield duplicate images in relative mode, does not affect classic mode
388
+ """
389
+ dataset_path = Path(dataset_path)
390
+ self.dataset_path = dataset_path
391
+ self.preprocess = preprocess
392
+ self.mode = mode
393
+ self.split = split
394
+ self.no_duplicates = no_duplicates
395
+
396
+ if split == "test":
397
+ split = "test1"
398
+ self.split = "test1"
399
+
400
+ # Validate inputs
401
+ if split not in ['test1', 'train', 'val']:
402
+ raise ValueError("split should be in ['test1', 'train', 'val']")
403
+ if mode not in ['relative', 'classic']:
404
+ raise ValueError("mode should be in ['relative', 'classic']")
405
+
406
+ # get triplets made by (reference_image, target_image, relative caption)
407
+ with open(dataset_path / 'cirr' / 'captions' / f'cap.rc2.{split}.json') as f:
408
+ self.triplets = json.load(f)
409
+
410
+ # Remove duplicates from triplets
411
+ if self.no_duplicates:
412
+ seen = set()
413
+ new_triplets = []
414
+ for triplet in self.triplets:
415
+ if triplet['reference'] not in seen:
416
+ seen.add(triplet['reference'])
417
+ new_triplets.append(triplet)
418
+ self.triplets = new_triplets
419
+
420
+ # get a mapping from image name to relative path
421
+ with open(dataset_path / 'cirr' / 'image_splits' / f'split.rc2.{split}.json') as f:
422
+ self.name_to_relpath = json.load(f)
423
+
424
+ print(f"CIRR {split} dataset in {mode} mode initialized")
425
+
426
+ def __getitem__(self, index) -> dict:
427
+ try:
428
+ if self.mode == 'relative':
429
+ group_members = self.triplets[index]['img_set']['members']
430
+ reference_name = self.triplets[index]['reference']
431
+ relative_caption = self.triplets[index]['caption']
432
+
433
+ if self.split in ['train', 'val']:
434
+ reference_image_path = self.dataset_path / self.name_to_relpath[reference_name]
435
+ reference_image = self.preprocess(PIL.Image.open(reference_image_path), return_tensors='pt')['pixel_values'][0]
436
+ target_hard_name = self.triplets[index]['target_hard']
437
+ target_image_path = self.dataset_path / self.name_to_relpath[target_hard_name]
438
+ target_image = self.preprocess(PIL.Image.open(target_image_path), return_tensors='pt')['pixel_values'][0]
439
+
440
+ return {
441
+ 'reference_image': reference_image,
442
+ 'reference_name': reference_name,
443
+ 'target_image': target_image,
444
+ 'target_name': target_hard_name,
445
+ 'relative_caption': relative_caption,
446
+ 'group_members': group_members
447
+ }
448
+
449
+ elif self.split == 'test1':
450
+ pair_id = self.triplets[index]['pairid']
451
+ reference_image_path = self.dataset_path / self.name_to_relpath[reference_name]
452
+ reference_image = self.preprocess(PIL.Image.open(reference_image_path), return_tensors='pt')['pixel_values'][0]
453
+ return {
454
+ 'reference_image': reference_image,
455
+ 'reference_name': reference_name,
456
+ 'relative_caption': relative_caption,
457
+ 'group_members': group_members,
458
+ 'pair_id': pair_id
459
+ }
460
+
461
+ elif self.mode == 'classic':
462
+ image_name = list(self.name_to_relpath.keys())[index]
463
+ image_path = self.dataset_path / self.name_to_relpath[image_name]
464
+ im = PIL.Image.open(image_path)
465
+ image = self.preprocess(im, return_tensors='pt')['pixel_values'][0]
466
+
467
+ return {
468
+ 'image': image,
469
+ 'image_name': image_name
470
+ }
471
+
472
+ else:
473
+ raise ValueError("mode should be in ['relative', 'classic']")
474
+
475
+ except Exception as e:
476
+ print(f"Exception: {e}")
477
+
478
+ def __len__(self):
479
+ if self.mode == 'relative':
480
+ return len(self.triplets)
481
+ elif self.mode == 'classic':
482
+ return len(self.name_to_relpath)
483
+ else:
484
+ raise ValueError("mode should be in ['relative', 'classic']")
485
+
486
+
487
+ class CIRCODataset(Dataset):
488
+ """
489
+ Copy-paste from https://github.com/miccunifi/SEARLE/blob/main/src/datasets.py
490
+ CIRCO dataset class for PyTorch.
491
+ The dataset can be used in 'relative' or 'classic' mode:
492
+ - In 'classic' mode the dataset yields a dict with keys ['image', 'image_name']
493
+ - In 'relative' mode the dataset yields a dict with keys:
494
+ - ['reference_image', 'reference_name', 'target_image', 'target_name', 'relative_captions', 'shared_concept',
495
+ 'gt_img_ids', 'query_id'] when split == 'val'
496
+ - ['reference_image', 'reference_name', 'relative_captions', 'shared_concept', 'query_id'] when split == test
497
+ """
498
+
499
+ def __init__(self, dataset_path: Union[str, Path], split: Literal['val', 'test'],
500
+ mode: Literal['relative', 'classic'], preprocess: callable):
501
+ """
502
+ Args:
503
+ dataset_path (Union[str, Path]): path to CIRCO dataset
504
+ split (str): dataset split, should be in ['test', 'val']
505
+ mode (str): dataset mode, should be in ['relative', 'classic']
506
+ preprocess (callable): function which preprocesses the image
507
+ """
508
+
509
+ # Set dataset paths and configurations
510
+ dataset_path = Path(dataset_path)
511
+ self.mode = mode
512
+ self.split = split
513
+ self.preprocess = preprocess
514
+ self.data_path = dataset_path
515
+
516
+ # Ensure input arguments are valid
517
+ if mode not in ['relative', 'classic']:
518
+ raise ValueError("mode should be in ['relative', 'classic']")
519
+ if split not in ['test', 'val']:
520
+ raise ValueError("split should be in ['test', 'val']")
521
+
522
+ # Load COCO images information
523
+ with open(dataset_path / 'COCO2017_unlabeled' / "annotations" / "image_info_unlabeled2017.json", "r") as f:
524
+ imgs_info = json.load(f)
525
+
526
+ self.img_paths = [dataset_path / 'COCO2017_unlabeled' / "unlabeled2017" / img_info["file_name"] for img_info in
527
+ imgs_info["images"]]
528
+ self.img_ids = [img_info["id"] for img_info in imgs_info["images"]]
529
+ self.img_ids_indexes_map = {str(img_id): i for i, img_id in enumerate(self.img_ids)}
530
+
531
+ # get CIRCO annotations
532
+ with open(dataset_path / 'annotations' / f'{split}.json', "r") as f:
533
+ self.annotations: List[dict] = json.load(f)
534
+
535
+ # Get maximum number of ground truth images (for padding when loading the images)
536
+ self.max_num_gts = 23 # Maximum number of ground truth images
537
+
538
+ print(f"CIRCODataset {split} dataset in {mode} mode initialized")
539
+
540
+ def get_target_img_ids(self, index) -> Dict[str, int]:
541
+ """
542
+ Returns the id of the target image and ground truth images for a given query
543
+
544
+ Args:
545
+ index (int): id of the query
546
+
547
+ Returns:
548
+ Dict[str, int]: dictionary containing target image id and a list of ground truth image ids
549
+ """
550
+
551
+ return {
552
+ 'target_img_id': self.annotations[index]['target_img_id'],
553
+ 'gt_img_ids': self.annotations[index]['gt_img_ids']
554
+ }
555
+
556
+ def __getitem__(self, index) -> dict:
557
+ """
558
+ Returns a specific item from the dataset based on the index.
559
+
560
+ In 'classic' mode, the dataset yields a dictionary with the following keys: [image, image_name]
561
+ In 'relative' mode, the dataset yields dictionaries with the following keys:
562
+ - [reference_image, reference_name, target_image, target_name, relative_caption, shared_concept, gt_img_ids,
563
+ query_id]
564
+ if split == 'val'
565
+ - [reference_image, reference_name, relative_caption, shared_concept, query_id] if split == 'test'
566
+ """
567
+
568
+ if self.mode == 'relative':
569
+ # Get the query id
570
+ query_id = str(self.annotations[index]['id'])
571
+
572
+ # Get relative caption and shared concept
573
+ relative_caption = self.annotations[index]['relative_caption']
574
+ shared_concept = self.annotations[index]['shared_concept']
575
+
576
+ # Get the reference image
577
+ reference_img_id = str(self.annotations[index]['reference_img_id'])
578
+ reference_img_path = self.img_paths[self.img_ids_indexes_map[reference_img_id]]
579
+ reference_img = self.preprocess(PIL.Image.open(reference_img_path), return_tensors='pt')['pixel_values'][0]
580
+
581
+ if self.split == 'val':
582
+ # Get the target image and ground truth images
583
+ target_img_id = str(self.annotations[index]['target_img_id'])
584
+ gt_img_ids = [str(x) for x in self.annotations[index]['gt_img_ids']]
585
+ target_img_path = self.img_paths[self.img_ids_indexes_map[target_img_id]]
586
+ target_img = self.preprocess(PIL.Image.open(target_img_path), return_tensors='pt')['pixel_values'][0]
587
+
588
+ # Pad ground truth image IDs with empty strings (up to max_num_gts) so collate_fn sees equal-length lists
589
+ gt_img_ids += [''] * (self.max_num_gts - len(gt_img_ids))
590
+
591
+ return {
592
+ 'reference_image': reference_img,
593
+ 'reference_name': reference_img_id,
594
+ 'target_image': target_img,
595
+ 'target_name': target_img_id,
596
+ 'relative_caption': relative_caption,
597
+ 'shared_concept': shared_concept,
598
+ 'gt_img_ids': gt_img_ids,
599
+ 'query_id': query_id,
600
+ }
601
+
602
+ elif self.split == 'test':
603
+ return {
604
+ 'reference_image': reference_img,
605
+ 'reference_name': reference_img_id,
606
+ 'relative_caption': relative_caption,
607
+ 'shared_concept': shared_concept,
608
+ 'query_id': query_id,
609
+ }
610
+
611
+ elif self.mode == 'classic':
612
+ # Get image ID and image path
613
+ img_id = str(self.img_ids[index])
614
+ img_path = self.img_paths[index]
615
+
616
+ # Preprocess image and return
617
+ img = self.preprocess(PIL.Image.open(img_path), return_tensors='pt')['pixel_values'][0]
618
+ return {
619
+ 'image': img,
620
+ 'image_name': img_id
621
+ }
622
+
623
+ def __len__(self):
624
+ """
625
+ Returns the length of the dataset.
626
+ """
627
+ if self.mode == 'relative':
628
+ return len(self.annotations)
629
+ elif self.mode == 'classic':
630
+ return len(self.img_ids)
631
+ else:
632
+ raise ValueError("mode should be in ['relative', 'classic']")
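For orientation, a minimal sketch (not part of the committed files) of driving CIRCODataset above in 'classic' mode to build a retrieval index. The dataset root is a placeholder, and the specific CLIPImageProcessor checkpoint is an assumption; any preprocess callable with the same interface works.

```python
from torch.utils.data import DataLoader
from transformers import CLIPImageProcessor

from loader import CIRCODataset

# Placeholder path; the CIRCO annotations and COCO2017_unlabeled images must live under it.
preprocess = CLIPImageProcessor.from_pretrained('openai/clip-vit-large-patch14')
dataset = CIRCODataset('/path/to/CIRCO', split='val', mode='classic', preprocess=preprocess)
loader = DataLoader(dataset, batch_size=32, num_workers=4)

for batch in loader:
    images = batch['image']          # (B, 3, 224, 224) float tensor
    image_ids = batch['image_name']  # list of COCO image ids as strings
    break
```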
models.py ADDED
@@ -0,0 +1,192 @@
1
+ '''
2
+ LinCIR
3
+ Copyright (c) 2023-present NAVER Corp.
4
+ CC BY-NC-4.0 (https://creativecommons.org/licenses/by-nc/4.0/)
5
+ '''
6
+ import copy
+ import torch
7
+ import torch.nn as nn
8
+ import torch.nn.functional as F
9
+ from transformers import CLIPTextModelWithProjection, CLIPVisionModelWithProjection, CLIPImageProcessor, CLIPTokenizer
10
+
11
+
12
+ def build_text_encoder(args):
13
+ clip_model_dict = {'base32': 'openai/clip-vit-base-patch32',
14
+ 'base': 'openai/clip-vit-base-patch16',
15
+ 'large': 'openai/clip-vit-large-patch14',
16
+ 'huge': 'laion/CLIP-ViT-H-14-laion2B-s32B-b79K',
17
+ 'giga': 'Geonmo/CLIP-Giga-config-fixed',
18
+ 'meta-large': 'facebook/metaclip-l14-fullcc2.5b',
19
+ 'meta-huge': 'facebook/metaclip-h14-fullcc2.5b',
20
+ }
21
+
22
+ clip_preprocess = CLIPImageProcessor(crop_size={'height': 224, 'width': 224},
23
+ do_center_crop=True,
24
+ do_convert_rgb=True,
25
+ do_normalize=True,
26
+ do_rescale=True,
27
+ do_resize=True,
28
+ image_mean=[0.48145466, 0.4578275, 0.40821073],
29
+ image_std=[0.26862954, 0.26130258, 0.27577711],
30
+ resample=3,
31
+ size={'shortest_edge': 224},
32
+ )
33
+
34
+ clip_vision_model = CLIPVisionModelWithProjection.from_pretrained(clip_model_dict[args.clip_model_name], torch_dtype=torch.float16 if args.mixed_precision == 'fp16' else torch.float32, cache_dir=args.cache_dir)
35
+
36
+ clip_text_model = CLIPTextModelWithProjection.from_pretrained(clip_model_dict[args.clip_model_name], torch_dtype=torch.float16 if args.mixed_precision == 'fp16' else torch.float32, cache_dir=args.cache_dir)
37
+
38
+ tokenizer = CLIPTokenizer.from_pretrained('stabilityai/stable-diffusion-xl-base-1.0', subfolder='tokenizer_2', cache_dir=args.cache_dir)
39
+ tokenizer.add_special_tokens({'additional_special_tokens':["[$]"]}) # NOTE: 49408
40
+
41
+ return clip_vision_model, clip_preprocess, clip_text_model, tokenizer
42
+
43
+
44
+ class Phi(nn.Module):
45
+ """
46
+ Textual Inversion Phi network.
47
+ Takes as input the visual features of an image and outputs the pseudo-word embedding.
48
+ Copy-paste from https://github.com/miccunifi/SEARLE/blob/main/src/phi.py
49
+ """
50
+
51
+ def __init__(self, input_dim: int, hidden_dim: int, output_dim: int, dropout: float):
52
+ super().__init__()
53
+ self.layers = nn.Sequential(
54
+ nn.Linear(input_dim, hidden_dim),
55
+ nn.GELU(),
56
+ nn.Dropout(p=dropout),
57
+ nn.Linear(hidden_dim, hidden_dim),
58
+ nn.GELU(),
59
+ nn.Dropout(p=dropout),
60
+ nn.Linear(hidden_dim, output_dim),
61
+ )
62
+
63
+ def forward(self, x):
64
+ #x = F.normalize(x, dim=-1)
65
+ return self.layers(x)
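A small sketch of what Phi consumes and produces, assuming the 'large' CLIP backbone where both the projection dimension and the token hidden size are 768 (other backbones differ); not part of the committed files.

```python
import torch
from models import Phi

# Dimensions assume CLIP ViT-L/14: projection_dim = hidden_size = 768.
phi = Phi(input_dim=768, hidden_dim=768 * 4, output_dim=768, dropout=0.5)
phi.eval()

image_embeds = torch.randn(4, 768)      # CLIP features in the shared projection space
with torch.no_grad():
    pseudo_tokens = phi(image_embeds)   # (4, 768) pseudo-word token embeddings
```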
66
+
67
+
68
+ class EMAModel:
69
+ """
70
+ Exponential Moving Average of model weights
71
+ """
72
+
73
+ def __init__(self, parameters, decay=0.9999):
74
+ parameters = list(parameters)
75
+ self.shadow_params = [p.clone().detach() for p in parameters]
76
+
77
+ self.collected_params = None
78
+
79
+ self.decay = decay
80
+ self.optimization_step = 0
81
+
82
+ @torch.no_grad()
83
+ def step(self, parameters):
84
+ parameters = list(parameters)
85
+
86
+ self.optimization_step += 1
87
+
88
+ # Compute the decay factor for the exponential moving average.
89
+ value = (1 + self.optimization_step) / (10 + self.optimization_step)
90
+ one_minus_decay = 1 - min(self.decay, value)
91
+
92
+ for s_param, param in zip(self.shadow_params, parameters):
93
+ if param.requires_grad:
94
+ s_param.sub_(one_minus_decay * (s_param - param))
95
+ else:
96
+ s_param.copy_(param)
97
+
98
+ torch.cuda.empty_cache()
99
+
100
+ def copy_to(self, parameters) -> None:
101
+ """
102
+ Copy current averaged parameters into given collection of parameters.
103
+ Args:
104
+ parameters: Iterable of `torch.nn.Parameter`; the parameters to be
105
+ updated with the stored moving averages. If `None`, the
106
+ parameters with which this `ExponentialMovingAverage` was
107
+ initialized will be used.
108
+ """
109
+ parameters = list(parameters)
110
+ for s_param, param in zip(self.shadow_params, parameters):
111
+ param.data.copy_(s_param.data)
112
+
113
+ def to(self, device=None, dtype=None) -> None:
114
+ r"""Move internal buffers of the ExponentialMovingAverage to `device`.
115
+ Args:
116
+ device: like `device` argument to `torch.Tensor.to`
117
+ """
118
+ # .to() on the tensors handles None correctly
119
+ self.shadow_params = [
120
+ p.to(device=device, dtype=dtype) if p.is_floating_point() else p.to(device=device)
121
+ for p in self.shadow_params
122
+ ]
123
+
124
+ def state_dict(self) -> dict:
125
+ r"""
126
+ Returns the state of the ExponentialMovingAverage as a dict.
127
+ This method is used by accelerate during checkpointing to save the ema state dict.
128
+ """
129
+ # Following PyTorch conventions, references to tensors are returned:
130
+ # "returns a reference to the state and not its copy!" -
131
+ # https://pytorch.org/tutorials/beginner/saving_loading_models.html#what-is-a-state-dict
132
+ return {
133
+ "decay": self.decay,
134
+ "optimization_step": self.optimization_step,
135
+ "shadow_params": self.shadow_params,
136
+ "collected_params": self.collected_params,
137
+ }
138
+
139
+ def load_state_dict(self, state_dict: dict) -> None:
140
+ r"""
141
+ Loads the ExponentialMovingAverage state.
142
+ This method is used by accelerate during checkpointing to save the ema state dict.
143
+ Args:
144
+ state_dict (dict): EMA state. Should be an object returned
145
+ from a call to :meth:`state_dict`.
146
+ """
147
+ # deepcopy, to be consistent with module API
148
+ state_dict = copy.deepcopy(state_dict)
149
+
150
+ self.decay = state_dict["decay"]
151
+ if self.decay < 0.0 or self.decay > 1.0:
152
+ raise ValueError("Decay must be between 0 and 1")
153
+
154
+ self.optimization_step = state_dict["optimization_step"]
155
+ if not isinstance(self.optimization_step, int):
156
+ raise ValueError("Invalid optimization_step")
157
+
158
+ self.shadow_params = state_dict["shadow_params"]
159
+ if not isinstance(self.shadow_params, list):
160
+ raise ValueError("shadow_params must be a list")
161
+ if not all(isinstance(p, torch.Tensor) for p in self.shadow_params):
162
+ raise ValueError("shadow_params must all be Tensors")
163
+
164
+ self.collected_params = state_dict["collected_params"]
165
+ if self.collected_params is not None:
166
+ if not isinstance(self.collected_params, list):
167
+ raise ValueError("collected_params must be a list")
168
+ if not all(isinstance(p, torch.Tensor) for p in self.collected_params):
169
+ raise ValueError("collected_params must all be Tensors")
170
+ if len(self.collected_params) != len(self.shadow_params):
171
+ raise ValueError("collected_params and shadow_params must have the same length")
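A hedged sketch of how EMAModel is intended to be driven, with a toy objective and optimizer standing in for the real training loop:

```python
import torch
from models import Phi, EMAModel

phi = Phi(input_dim=768, hidden_dim=3072, output_dim=768, dropout=0.0)
ema = EMAModel(phi.parameters(), decay=0.9999)
optimizer = torch.optim.AdamW(phi.parameters(), lr=1e-4)

for _ in range(3):
    loss = phi(torch.randn(8, 768)).pow(2).mean()   # dummy objective
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    ema.step(phi.parameters())   # shadow params track the warmup-aware moving average

# copy_to() overwrites the given parameters in place, so it is normally applied
# to a deepcopy of the model before evaluation (as train_phi.py does).
ema.copy_to(phi.parameters())
```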
172
+
173
+
174
+ class PIC2WORD(nn.Module):
175
+ def __init__(self, embed_dim=512, middle_dim=512, output_dim=512, n_layer=2, dropout=0.1):
176
+ super().__init__()
177
+ self.fc_out = nn.Linear(middle_dim, output_dim)
178
+ layers = []
179
+ dim = embed_dim
180
+ for _ in range(n_layer):
181
+ block = []
182
+ block.append(nn.Linear(dim, middle_dim))
183
+ block.append(nn.Dropout(dropout))
184
+ block.append(nn.ReLU())
185
+ dim = middle_dim
186
+ layers.append(nn.Sequential(*block))
187
+ self.layers = nn.Sequential(*layers)
188
+
189
+ def forward(self, x: torch.Tensor):
190
+ for layer in self.layers:
191
+ x = layer(x)
192
+ return self.fc_out(x)
requirements.txt ADDED
@@ -0,0 +1,8 @@
1
+ numpy
2
+ torch
3
+ transformers
4
+ diffusers
5
+ accelerate
6
+ datasets
7
+ spacy
8
+ clip-retrieval
train_phi.py ADDED
@@ -0,0 +1,317 @@
1
+ '''
2
+ LinCIR
3
+ Copyright (c) 2023-present NAVER Corp.
4
+ CC BY-NC-4.0 (https://creativecommons.org/licenses/by-nc/4.0/)
5
+ '''
6
+ import json
7
+ import os
8
+ import pickle
9
+ import random
10
+ import math
11
+ from argparse import ArgumentParser
12
+ from pathlib import Path
13
+ from typing import Literal, Tuple, Dict, List, Set
14
+ import logging
15
+
16
+ import numpy as np
17
+ import torch
18
+ import torch.nn.functional as F
19
+ from tqdm import tqdm
20
+
21
+ from loader import build_loader, CIRRDataset
22
+ from encode_with_pseudo_tokens import encode_with_pseudo_tokens_HF
23
+ from models import build_text_encoder, Phi, EMAModel
24
+ from utils import extract_image_features, extract_pseudo_tokens_with_phi
25
+ from validate import cirr_compute_val_metrics
26
+
27
+ import transformers
28
+ from transformers import get_scheduler
29
+ from accelerate import Accelerator, DeepSpeedPlugin
30
+ from accelerate.logging import get_logger
31
+ from accelerate.utils import set_seed
32
+ from accelerate.state import AcceleratorState
34
+
35
+
36
+ logger = get_logger(__name__)
37
+
38
+
39
+ def parse_args():
40
+ parser = ArgumentParser()
41
+
42
+ parser.add_argument("--output_dir", default="trained_models", type=str,
43
+ help="The output directory where the model predictions and checkpoints will be written")
44
+ parser.add_argument("--logging_dir", default="logs", type=str, help="tensorboard logs will be saved here")
45
+ parser.add_argument("--cache_dir", default="./hf_models", type=str,
46
+ help="Path to model cache folder")
47
+ parser.add_argument("--report_to", default="tensorboard", type=str, help="")
48
+ parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank")
49
+
50
+ parser.add_argument("--clip_model_name", default="giga", type=str,
51
+ help="CLIP model to use, e.g 'large', 'giga'")
52
+ parser.add_argument("--cirr_dataset_path", type=str, help="Path to CIRR dataset", required=True)
53
+ parser.add_argument("--keywords_path", type=str, help="Path to keywords json file")
54
+ parser.add_argument("--resume", default=None, type=str, help="Path to pretrained ckpt")
55
+
56
+ parser.add_argument("--use_8bit_adam", action="store_true", help="Whether or not to use 8-bit Adam from bitsandbytes.")
57
+ parser.add_argument("--lr_scheduler", type=str, default="constant",
58
+ choices=["linear", "cosine", "cosine_with_restarts", "polynomial", "constant", "constant_with_warmup"],
59
+ help="")
60
+ parser.add_argument("--lr_warmup_steps", type=int, default=500, help="Number of steps for the warmup in the lr scheduler.")
61
+ parser.add_argument("--max_train_steps", type=int, default=50000, help="Total number of training steps to perform")
62
+ parser.add_argument("--phi_dropout", default=0.5, type=float, help="Dropout probability for the phi network")
63
+ parser.add_argument("--l2_normalize", action="store_true", help="Whether or not to use l2 normalization")
64
+ parser.add_argument("--batch_size", default=256, type=int, help="Phi training batch size")
65
+ parser.add_argument("--num_workers", default=10, type=int, help="Number of workers")
66
+ parser.add_argument("--learning_rate", default=1e-4, type=float, help="Learning rate")
67
+ parser.add_argument("--weight_decay", type=float, default=0.01, help="Weight decay")
68
+ parser.add_argument("--gradient_accumulation_steps", default=1, type=int, help="Number of updates steps to accumulate before performing a backward/update pass")
69
+ parser.add_argument("--max_grad_norm", default=None, type=float, help="Max gradient norm.")
70
+ parser.add_argument("--mixed_precision", default=None, type=str, choices=["no", "fp16", "bf16"], help="mixed precision")
71
+ parser.add_argument("--validation_steps", default=1, type=int, help="Validation frequency expressed in epochs")
72
+ parser.add_argument("--checkpointing_steps", default=None, type=int, help="Save a checkpoint of the training state every X updates")
73
+ parser.add_argument("--use_ema", action="store_true", help="Whether to use EMA model.")
74
+
75
+ parser.add_argument("--seed", type=int, default=None, help="seed for reproducibility")
76
+
77
+ args = parser.parse_args()
78
+
79
+ env_local_rank = int(os.environ.get("LOCAL_RANK", -1))
80
+ if env_local_rank != -1 and env_local_rank != args.local_rank:
81
+ args.local_rank = env_local_rank
82
+
83
+ return args
84
+
85
+
86
+ def save_phi(name: str, cur_epoch: int, model_to_save: Phi, training_path: Path) -> None:
87
+ """
88
+ Save the weights of Phi during training
89
+ """
90
+ models_path = os.path.join(training_path, "checkpoints")
91
+ os.makedirs(models_path, exist_ok=True)
92
+ model_name = model_to_save.__class__.__name__
93
+ torch.save({
94
+ 'epoch': cur_epoch,
95
+ model_name: model_to_save.state_dict(),
96
+ }, os.path.join(models_path, f'{name}.pt'))
97
+
98
+
99
+ def train_phi(args):
100
+ # We use pre-extracted CLIP image features for training, so the image encoder is only needed for validation.
101
+
102
+ ### init accelerator here
103
+ logging_dir = os.path.join(args.output_dir, args.logging_dir)
104
+ accelerator = Accelerator(
105
+ gradient_accumulation_steps=args.gradient_accumulation_steps,
106
+ mixed_precision=args.mixed_precision,
107
+ log_with=args.report_to,
108
+ project_dir=logging_dir,
109
+ )
110
+
111
+ os.makedirs(args.output_dir, exist_ok=True)
112
+
113
+ logging.basicConfig(
114
+ format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
115
+ datefmt="%m/%d/%Y %H:%M:%S",
116
+ level=logging.INFO,
117
+ )
118
+ logger.info(accelerator.state, main_process_only=False)
119
+
120
+ if accelerator.is_local_main_process:
121
+ transformers.utils.logging.set_verbosity_info()
122
+ else:
123
+ transformers.utils.logging.set_verbosity_error()
124
+
125
+ if args.seed is not None:
126
+ set_seed(args.seed)
127
+
128
+ ### Define the text encoder from clip
129
+ image_encoder, clip_preprocess, text_encoder, tokenizer = build_text_encoder(args)
130
+
131
+ ### Define the phi model
132
+ phi = Phi(input_dim=text_encoder.config.projection_dim,
133
+ hidden_dim=text_encoder.config.projection_dim * 4,
134
+ output_dim=text_encoder.config.hidden_size, dropout=args.phi_dropout)
135
+
136
+ if args.resume:
137
+ phi.load_state_dict(
138
+ torch.load(args.resume, map_location=accelerator.device)[
139
+ phi.__class__.__name__])
140
+
141
+
142
+ ### GPU handling
143
+ weight_dtype = torch.float32
144
+ if accelerator.mixed_precision == "fp16":
145
+ weight_dtype = torch.float16
146
+ elif accelerator.mixed_precision == "bf16":
147
+ weight_dtype = torch.bfloat16
148
+
149
+ image_encoder.to(accelerator.device, dtype=weight_dtype)
150
+ text_encoder.to(accelerator.device, dtype=weight_dtype)
151
+
152
+ image_encoder.requires_grad_(False)
153
+ text_encoder.requires_grad_(False)
154
+
155
+ if args.use_ema:
156
+ import copy
157
+ ema_phi = copy.deepcopy(phi)
158
+ ema_phi = EMAModel(ema_phi.parameters())
159
+ ema_phi.to(accelerator.device, dtype=weight_dtype)
160
+
161
+ ### Define the train datasets
162
+ print('pytorch loader')
163
+ train_dataset = build_loader(args, tokenizer, accelerator)
164
+
165
+ ## evaluator
166
+ if accelerator.is_main_process:
167
+ ## Define CIRR validation set
168
+ cirr_relative_val_dataset = CIRRDataset(args.cirr_dataset_path, 'val', 'relative', clip_preprocess)
169
+ cirr_classic_val_dataset = CIRRDataset(args.cirr_dataset_path, 'val', 'classic', clip_preprocess)
170
+
171
+ # Extract the features for the CIRR validation set
172
+ cirr_val_index_features, cirr_val_index_names = extract_image_features(cirr_classic_val_dataset, image_encoder)
173
+
174
+ # Define the optimizer, the loss and the grad scaler
175
+ if args.use_8bit_adam:
176
+ try:
177
+ import bitsandbytes as bnb
178
+ except ImportError:
179
+ raise ImportError(
180
+ "Please install bitsandbytes to use 8-bit Adam. You can do so by running `pip install bitsandbytes`"
181
+ )
182
+ optimizer_cls = bnb.optim.AdamW8bit
183
+ else:
184
+ optimizer_cls = torch.optim.AdamW
185
+
186
+ optimizer = optimizer_cls(phi.parameters(),
187
+ lr=args.learning_rate,
188
+ weight_decay=args.weight_decay)
189
+
190
+ lr_scheduler = get_scheduler(
191
+ args.lr_scheduler,
192
+ optimizer=optimizer,
193
+ num_warmup_steps=args.lr_warmup_steps * args.gradient_accumulation_steps * accelerator.num_processes,
194
+ num_training_steps=args.max_train_steps * args.gradient_accumulation_steps * accelerator.num_processes,
195
+ )
196
+
197
+ phi, optimizer, lr_scheduler, train_dataset = accelerator.prepare(
198
+ phi, optimizer, lr_scheduler, train_dataset
199
+ )
200
+
201
+ if accelerator.is_main_process:
202
+ accelerator.init_trackers("zeroshot-cir", config=vars(args))
203
+
204
+ # Start with the training loop
205
+ total_batch_size = args.batch_size * accelerator.num_processes * args.gradient_accumulation_steps
206
+
207
+ logger.info("***** Running training *****")
208
+ logger.info(f" Instantaneous batch size per device = {args.batch_size}")
209
+ logger.info(f" Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}")
210
+ logger.info(f" Gradient Accumulation steps = {args.gradient_accumulation_steps}")
211
+ logger.info(f" Total steps = {args.max_train_steps}")
212
+
213
+ phi.train()
214
+
215
+ train_loss = 0.0
216
+ global_step = 0
217
+ best_recall = -1
218
+
219
+ progress_bar = tqdm(range(global_step, args.max_train_steps), disable=not accelerator.is_local_main_process)
220
+ progress_bar.set_description("Steps")
221
+
222
+ while True:
223
+ for idx, (original_tokens, replaced_tokens, indicators) in enumerate(train_dataset):
224
+ original_tokens = original_tokens.to(accelerator.device)
225
+ replaced_tokens = replaced_tokens.to(accelerator.device)
226
+
227
+ org = text_encoder(input_ids=original_tokens)
228
+ original_text_embeddings, original_last_hidden_states = org.text_embeds, org.last_hidden_state
229
+ input_features = original_text_embeddings.clone()
230
+ input_features += 1.0 * torch.rand(input_features.shape[0], device=input_features.device).unsqueeze(-1) * torch.randn(input_features.shape, device=input_features.device)
231
+
232
+ # normalize test
233
+ if args.l2_normalize:
234
+ input_features = F.normalize(input_features, dim=-1)
235
+ #################
236
+
237
+ estimated_token_embeddings = phi(input_features)
238
+
239
+ replaced_text_embeddings, replaced_last_hidden_states = encode_with_pseudo_tokens_HF(text_encoder, replaced_tokens, estimated_token_embeddings, return_last_states=True)
240
+
241
+ loss = F.mse_loss(replaced_text_embeddings.float(), original_text_embeddings.float(), reduction="mean")
242
+
243
+ avg_loss = accelerator.gather(loss.repeat(args.batch_size)).mean()
244
+ train_loss += avg_loss.item() / args.gradient_accumulation_steps
245
+
246
+ # Backpropagation
247
+ accelerator.backward(loss)
248
+ if accelerator.sync_gradients and args.max_grad_norm is not None:
249
+ accelerator.clip_grad_norm_(phi.parameters(), args.max_grad_norm)
250
+ optimizer.step()
251
+ lr_scheduler.step()
252
+ optimizer.zero_grad()
253
+
254
+ if accelerator.sync_gradients:
255
+ if args.use_ema:
256
+ ema_phi.step(phi.module.parameters())
257
+ progress_bar.update(1)
258
+ global_step += 1
259
+ accelerator.log({"train/train_loss": train_loss}, step=global_step)
260
+ train_loss = 0.0
261
+
262
+ accelerator.log({'train/lr': lr_scheduler.get_last_lr()[0]}, step=global_step)
263
+ accelerator.log({'train/preproc_rate': torch.sum(indicators).item() / len(indicators)}, step=global_step)
264
+ if args.checkpointing_steps and global_step % args.checkpointing_steps == 0:
265
+ if accelerator.is_main_process:
266
+ logger.info(f"model saving... step: {global_step}")
267
+ save_phi(f"phi_{global_step:09}", global_step, accelerator.unwrap_model(phi), args.output_dir)
268
+ save_phi(f"phi_latest", global_step, accelerator.unwrap_model(phi), args.output_dir)
269
+ if args.use_ema:
270
+ phi_for_saving = copy.deepcopy(accelerator.unwrap_model(phi))
271
+ ema_phi.copy_to(phi_for_saving.parameters())
272
+ save_phi(f"ema_phi_{global_step:09}", global_step, phi_for_saving, args.output_dir)
273
+ save_phi(f"ema_phi_latest", global_step, phi_for_saving, args.output_dir)
274
+
275
+ if global_step % args.validation_steps == 0 or global_step == 50:
276
+ if accelerator.is_main_process:
277
+ logger.info(f"evaluate model... step: {global_step}")
278
+
279
+ if args.use_ema:
280
+ phi_for_eval = copy.deepcopy(accelerator.unwrap_model(phi))
281
+ ema_phi.copy_to(phi_for_eval.parameters())
282
+ else:
283
+ phi_for_eval = phi
284
+
285
+ phi_for_eval.eval()
286
+
287
+ # Extract the pseudo tokens for the CIRR validation set using Phi
288
+ cirr_val_pseudo_tokens, cirr_val_ref_names_list = extract_pseudo_tokens_with_phi(image_encoder, phi_for_eval,
289
+ cirr_relative_val_dataset, args)
290
+ cirr_val_pseudo_tokens = cirr_val_pseudo_tokens.to(accelerator.device)
291
+
292
+ # Compute the CIRR validation metrics
293
+ cirr_results_dict = cirr_compute_val_metrics(cirr_relative_val_dataset, text_encoder,
294
+ cirr_val_index_features, cirr_val_index_names,
295
+ cirr_val_ref_names_list, cirr_val_pseudo_tokens)
296
+ check_list = ['cirr_recall_at1', 'cirr_recall_at5', 'cirr_recall_at10', 'cirr_recall_at50']
297
+ for check_key in check_list:
298
+ accelerator.log({f"validate/{check_key}": cirr_results_dict[check_key]}, step=global_step)
299
+ print(json.dumps(cirr_results_dict, indent=4))
300
+
301
+ # Save the best model.
302
+ if args.checkpointing_steps:
303
+ if cirr_results_dict['cirr_recall_at1'] > best_recall:
304
+ best_recall = cirr_results_dict['cirr_recall_at1']
305
+ logger.info(f"best model saving... step: {global_step}")
306
+ save_phi("phi_best", global_step, accelerator.unwrap_model(phi), args.output_dir)
307
+
308
+ phi.train()
309
+
310
+ if global_step >= args.max_train_steps:
311
+ return  # a bare break would only exit the inner loop and the while-loop would restart it
312
+
313
+
314
+ if __name__ == '__main__':
315
+ args = parse_args()
316
+
317
+ train_phi(args)
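The loop above trains phi from language alone: a frozen CLIP text embedding is perturbed with per-sample Gaussian noise, mapped by phi to a pseudo token that fills the [$] slot, re-encoded, and regressed back onto the original embedding with an MSE loss. A standalone sketch of just the noise-injection step, using a hypothetical batch of text embeddings:

```python
import torch

text_embeds = torch.randn(8, 768)            # stand-in for frozen CLIP text projections
scale = torch.rand(text_embeds.size(0), 1)   # per-sample noise magnitude in [0, 1)
noisy = text_embeds + 1.0 * scale * torch.randn_like(text_embeds)
```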
utils.py ADDED
@@ -0,0 +1,182 @@
1
+ from typing import Optional, Tuple, List
2
+
3
+ import torch
4
+ import torch.nn.functional as F
5
+ from clip.model import CLIP
6
+ from transformers import CLIPVisionModelWithProjection
7
+ from torch.utils.data import DataLoader
8
+ from torch.utils.data import Dataset
9
+ from tqdm import tqdm
10
+
11
+ from data_utils import collate_fn
12
+ from models import Phi
13
+
14
+
15
+ if torch.cuda.is_available():
16
+ device = torch.device("cuda")
17
+ dtype = torch.float16
18
+ else:
19
+ device = torch.device("cpu")
20
+ dtype = torch.float32
21
+
22
+
23
+ @torch.no_grad()
24
+ def extract_image_features(dataset: Dataset, clip_model: CLIPVisionModelWithProjection, batch_size: Optional[int] = 32,
25
+ num_workers: Optional[int] = 10) -> Tuple[torch.Tensor, List[str]]:
26
+ """
27
+ Extracts image features from a dataset using a CLIP model.
28
+ """
29
+ # Create data loader
30
+ loader = DataLoader(dataset=dataset, batch_size=batch_size,
31
+ num_workers=num_workers, pin_memory=True, collate_fn=collate_fn)
32
+
33
+ index_features = []
34
+ index_names = []
35
+ try:
36
+ print(f"extracting image features {dataset.__class__.__name__} - {dataset.split}")
37
+ except Exception as e:
38
+ pass
39
+
40
+ # Extract features
41
+ for batch in tqdm(loader):
42
+ images = batch.get('image')
43
+ names = batch.get('image_name')
44
+ if images is None:
45
+ images = batch.get('reference_image')
46
+ if names is None:
47
+ names = batch.get('reference_name')
48
+
49
+ images = images.to(clip_model.device)
50
+ with torch.no_grad():
51
+ batch_features = clip_model(pixel_values=images.to(clip_model.dtype)).image_embeds #.encode_image(images)
52
+ index_features.append(batch_features.cpu())
53
+ index_names.extend(names)
54
+
55
+ index_features = torch.vstack(index_features)
56
+ return index_features, index_names
57
+
58
+
59
+ def contrastive_loss(v1: torch.Tensor, v2: torch.Tensor, temperature: float) -> torch.Tensor:
60
+ # Based on https://github.com/NVlabs/PALAVRA/blob/main/utils/nv.py
61
+ v1 = F.normalize(v1, dim=1)
62
+ v2 = F.normalize(v2, dim=1)
63
+
64
+ numerator = torch.exp(torch.diag(torch.inner(v1, v2)) / temperature)
65
+ numerator = torch.cat((numerator, numerator), 0)
66
+ joint_vector = torch.cat((v1, v2), 0)
67
+ pairs_product = torch.exp(torch.mm(joint_vector, joint_vector.t()) / temperature)
68
+ denominator = torch.sum(pairs_product - pairs_product * torch.eye(joint_vector.shape[0]).to(device), 0)
69
+
70
+ loss = -torch.mean(torch.log(numerator / denominator))
71
+
72
+ return loss
73
+
74
+
75
+ @torch.no_grad()
76
+ def extract_pseudo_tokens_with_phi(clip_model: CLIPVisionModelWithProjection, phi: Phi, dataset: Dataset, args) -> Tuple[torch.Tensor, List[str]]:
77
+ """
78
+ Extracts pseudo tokens from a dataset using a CLIP model and a phi model
79
+ """
80
+ data_loader = DataLoader(dataset=dataset, batch_size=32, num_workers=10, pin_memory=False,
81
+ collate_fn=collate_fn)
82
+ predicted_tokens = []
83
+ names_list = []
84
+ print(f"Extracting tokens using phi model")
85
+ for batch in tqdm(data_loader):
86
+ images = batch.get('image')
87
+ names = batch.get('image_name')
88
+ if images is None:
89
+ images = batch.get('reference_image')
90
+ if names is None:
91
+ names = batch.get('reference_name')
92
+
93
+ images = images.to(device)
94
+ image_features = clip_model(pixel_values=images.half()).image_embeds
95
+ if args.l2_normalize:
96
+ image_features = F.normalize(image_features, dim=-1)
97
+ batch_predicted_tokens = phi(image_features)
98
+ predicted_tokens.append(batch_predicted_tokens.cpu())
99
+ names_list.extend(names)
100
+
101
+ predicted_tokens = torch.vstack(predicted_tokens)
102
+ return predicted_tokens, names_list
103
+
104
+
105
+ @torch.no_grad()
106
+ def extract_image_features_with_names(clip_model: CLIPVisionModelWithProjection, dataset: Dataset) -> Tuple[torch.Tensor, List[str]]:
107
+ """
108
+ Extracts image features from a dataset using a CLIP model
109
+ """
110
+ data_loader = DataLoader(dataset=dataset, batch_size=32, num_workers=10, pin_memory=False,
111
+ collate_fn=collate_fn)
112
+ predicted_tokens = []
113
+ names_list = []
114
+ print("Extracting image features")
115
+ for batch in tqdm(data_loader):
116
+ images = batch.get('image')
117
+ names = batch.get('image_name')
118
+ if images is None:
119
+ images = batch.get('reference_image')
120
+ if names is None:
121
+ names = batch.get('reference_name')
122
+
123
+ images = images.to(device)
124
+ image_features = clip_model(pixel_values=images.to(clip_model.dtype)).image_embeds
125
+
126
+ #batch_predicted_tokens = phi(image_features)
127
+ batch_predicted_tokens = image_features
128
+ predicted_tokens.append(batch_predicted_tokens.cpu())
129
+ names_list.extend(names)
130
+
131
+ predicted_tokens = torch.vstack(predicted_tokens)
132
+ return predicted_tokens, names_list
133
+
134
+ class CustomTensorDataset(Dataset):
135
+ """
136
+ Custom Tensor Dataset which yields image_features and image_names
137
+ """
138
+
139
+ def __init__(self, images: torch.Tensor, names: torch.Tensor):
140
+ self.images = images
141
+ self.names = names
142
+
143
+ def __getitem__(self, index) -> dict:
144
+ return {'image': self.images[index],
145
+ 'image_name': self.names[index]
146
+ }
147
+
148
+ def __len__(self):
149
+ return len(self.images)
150
+
151
+
152
+ def get_templates():
153
+ """
154
+ Return a list of templates
155
+ Same templates as in PALAVRA: https://arxiv.org/abs/2204.01694
156
+ """
157
+ return [
158
+ "This is a photo of a {}",
159
+ "This photo contains a {}",
160
+ "A photo of a {}",
161
+ "This is an illustration of a {}",
162
+ "This illustration contains a {}",
163
+ "An illustrations of a {}",
164
+ "This is a sketch of a {}",
165
+ "This sketch contains a {}",
166
+ "A sketch of a {}",
167
+ "This is a diagram of a {}",
168
+ "This diagram contains a {}",
169
+ "A diagram of a {}",
170
+ "A {}",
171
+ "We see a {}",
172
+ "{}",
173
+ "We see a {} in this photo",
174
+ "We see a {} in this image",
175
+ "We see a {} in this illustration",
176
+ "We see a {} photo",
177
+ "We see a {} image",
178
+ "We see a {} illustration",
179
+ "{} photo",
180
+ "{} image",
181
+ "{} illustration",
182
+ ]
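get_templates() returns format strings with a single {} slot; filling them with a (hypothetical) concept word looks like this:

```python
from utils import get_templates

captions = [template.format("sneaker") for template in get_templates()]
print(captions[0])   # "This is a photo of a sneaker"
```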
validate.py ADDED
@@ -0,0 +1,650 @@
1
+ import json
2
+ import pickle
3
+ from argparse import ArgumentParser
4
+ from typing import List, Dict, Tuple
5
+
6
+ import clip
7
+ import numpy as np
8
+ import torch
9
+ import torch.nn.functional as F
10
+ from clip.model import CLIP
11
+ from transformers import CLIPTextModelWithProjection
12
+ from torch.utils.data import DataLoader
13
+ from torch.utils.data import Dataset
14
+ from tqdm import tqdm
15
+
16
+ from data_utils import collate_fn, PROJECT_ROOT, targetpad_transform
17
+ from loader import FashionIQDataset, CIRRDataset, CIRCODataset
18
+ from encode_with_pseudo_tokens import encode_with_pseudo_tokens_HF
19
+ from models import build_text_encoder, Phi, PIC2WORD
20
+ from utils import extract_image_features, device, extract_pseudo_tokens_with_phi
21
+
22
+ torch.multiprocessing.set_sharing_strategy('file_system')
23
+
24
+
25
+ @torch.no_grad()
26
+ def fiq_generate_val_predictions(clip_model, relative_val_dataset: Dataset, ref_names_list: List[str],
27
+ pseudo_tokens: torch.Tensor) -> Tuple[torch.Tensor, List[str]]:
28
+ """
29
+ Generates feature predictions for the validation set of Fashion IQ.
30
+ """
31
+
32
+ # Create data loader
33
+ relative_val_loader = DataLoader(dataset=relative_val_dataset, batch_size=32, num_workers=10,
34
+ pin_memory=False, collate_fn=collate_fn, shuffle=False)
35
+
36
+ predicted_features_list = []
37
+ target_names_list = []
38
+
39
+ # Compute features
40
+ for batch in tqdm(relative_val_loader):
41
+ reference_names = batch['reference_name']
42
+ target_names = batch['target_name']
43
+ relative_captions = batch['relative_captions']
44
+
45
+ flattened_captions: list = np.array(relative_captions).T.flatten().tolist()
46
+ input_captions = [
47
+ f"{flattened_captions[i].strip('.?, ')} and {flattened_captions[i + 1].strip('.?, ')}" for
48
+ i in range(0, len(flattened_captions), 2)]
49
+ input_captions_reversed = [
50
+ f"{flattened_captions[i + 1].strip('.?, ')} and {flattened_captions[i].strip('.?, ')}" for
51
+ i in range(0, len(flattened_captions), 2)]
52
+
53
+ input_captions = [
54
+ f"a photo of $ that {in_cap}" for in_cap in input_captions]
55
+ batch_tokens = torch.vstack([pseudo_tokens[ref_names_list.index(ref)].unsqueeze(0) for ref in reference_names])
56
+ tokenized_input_captions = clip.tokenize(input_captions, context_length=77).to(device)
57
+ text_features = encode_with_pseudo_tokens_HF(clip_model, tokenized_input_captions, batch_tokens)
58
+
59
+ input_captions_reversed = [
60
+ f"a photo of $ that {in_cap}" for in_cap in input_captions_reversed]
61
+ tokenized_input_captions_reversed = clip.tokenize(input_captions_reversed, context_length=77).to(device)
62
+ text_features_reversed = encode_with_pseudo_tokens_HF(clip_model, tokenized_input_captions_reversed,
63
+ batch_tokens)
64
+
65
+ predicted_features = F.normalize((F.normalize(text_features) + F.normalize(text_features_reversed)) / 2)
66
+ # predicted_features = F.normalize((text_features + text_features_reversed) / 2)
67
+
68
+ predicted_features_list.append(predicted_features)
69
+ target_names_list.extend(target_names)
70
+
71
+ predicted_features = torch.vstack(predicted_features_list)
72
+ return predicted_features, target_names_list
73
+
74
+
75
+ @torch.no_grad()
76
+ def fiq_compute_val_metrics(relative_val_dataset: Dataset, clip_model, index_features: torch.Tensor,
77
+ index_names: List[str], ref_names_list: List[str], pseudo_tokens: torch.Tensor) \
78
+ -> Dict[str, float]:
79
+ """
80
+ Compute the retrieval metrics on the FashionIQ validation set given the dataset, pseudo tokens and the reference names
81
+ """
82
+
83
+ # Generate the predicted features
84
+ predicted_features, target_names = fiq_generate_val_predictions(clip_model, relative_val_dataset, ref_names_list,
85
+ pseudo_tokens)
86
+
87
+ # Move the features to the device
88
+ index_features = index_features.to(device)
89
+ predicted_features = predicted_features.to(device)
90
+
91
+ # Normalize the features
92
+ index_features = F.normalize(index_features.float())
93
+
94
+ # Compute the distances
95
+ distances = 1 - predicted_features @ index_features.T
96
+ sorted_indices = torch.argsort(distances, dim=-1).cpu()
97
+ sorted_index_names = np.array(index_names)[sorted_indices]
98
+
99
+ # Check if the target names are in the top 10 and top 50
100
+ labels = torch.tensor(
101
+ sorted_index_names == np.repeat(np.array(target_names), len(index_names)).reshape(len(target_names), -1))
102
+ assert torch.equal(torch.sum(labels, dim=-1).int(), torch.ones(len(target_names)).int())
103
+
104
+ # Compute the metrics
105
+ recall_at10 = (torch.sum(labels[:, :10]) / len(labels)).item() * 100
106
+ recall_at50 = (torch.sum(labels[:, :50]) / len(labels)).item() * 100
107
+
108
+ return {'fiq_recall_at10': recall_at10,
109
+ 'fiq_recall_at50': recall_at50}
110
+
111
+
112
+ @torch.no_grad()
113
+ def fiq_val_retrieval(dataset_path: str, dress_type: str, image_encoder, text_encoder, ref_names_list: List[str],
114
+ pseudo_tokens: torch.Tensor, preprocess: callable) -> Dict[str, float]:
115
+ """
116
+ Compute the retrieval metrics on the FashionIQ validation set given the pseudo tokens and the reference names
117
+ """
118
+ # Load the model
119
+ #clip_model, _ = clip.load(clip_model_name, device=device, jit=False)
120
+ #clip_model = clip_model.float().eval().requires_grad_(False)
121
+
122
+ # Extract the index features
123
+ classic_val_dataset = FashionIQDataset(dataset_path, 'val', [dress_type], 'classic', preprocess)
124
+ index_features, index_names = extract_image_features(classic_val_dataset, image_encoder)
125
+
126
+ # Define the relative dataset
127
+ relative_val_dataset = FashionIQDataset(dataset_path, 'val', [dress_type], 'relative', preprocess)
128
+
129
+ return fiq_compute_val_metrics(relative_val_dataset, text_encoder, index_features, index_names, ref_names_list,
130
+ pseudo_tokens)
131
+
132
+
133
+ @torch.no_grad()
134
+ def cirr_generate_val_predictions(clip_model: CLIPTextModelWithProjection, relative_val_dataset: Dataset, ref_names_list: List[str],
135
+ pseudo_tokens: torch.Tensor) -> \
136
+ Tuple[torch.Tensor, List[str], List[str], List[List[str]]]:
137
+ """
138
+ Generates feature predictions for the validation set of CIRR
139
+ """
140
+
141
+ # Define the dataloader
142
+ relative_val_loader = DataLoader(dataset=relative_val_dataset, batch_size=32, num_workers=10,
143
+ pin_memory=False, collate_fn=collate_fn)
144
+ predicted_features_list = []
145
+ target_names_list = []
146
+ group_members_list = []
147
+ reference_names_list = []
148
+
149
+ for batch in tqdm(relative_val_loader):
150
+ reference_names = batch['reference_name']
151
+ target_names = batch['target_name']
152
+ relative_captions = batch['relative_caption']
153
+ group_members = batch['group_members']
154
+
155
+ group_members = np.array(group_members).T.tolist()
156
+
157
+ input_captions = [
158
+ f"a photo of $ that {rel_caption}" for rel_caption in relative_captions]
159
+
160
+ batch_tokens = torch.vstack([pseudo_tokens[ref_names_list.index(ref)].unsqueeze(0) for ref in reference_names])
161
+ tokenized_input_captions = clip.tokenize(input_captions, context_length=77).to(device)
162
+ text_features = encode_with_pseudo_tokens_HF(clip_model, tokenized_input_captions, batch_tokens)
163
+
164
+ predicted_features = F.normalize(text_features)
165
+
166
+ predicted_features_list.append(predicted_features)
167
+ target_names_list.extend(target_names)
168
+ group_members_list.extend(group_members)
169
+ reference_names_list.extend(reference_names)
170
+
171
+ predicted_features = torch.vstack(predicted_features_list)
172
+
173
+ return predicted_features, reference_names_list, target_names_list, group_members_list
174
+
175
+
176
+ @torch.no_grad()
177
+ def cirr_generate_val_predictions_with_phi(clip_model: CLIPTextModelWithProjection, phi, relative_val_dataset: Dataset, ref_names_list: List[str],
178
+ image_features: torch.Tensor) -> \
179
+ Tuple[torch.Tensor, List[str], List[str], List[List[str]]]:
180
+ """
181
+ Generates feature predictions for the validation set of CIRR
182
+ """
183
+
184
+ # Define the dataloader
185
+ relative_val_loader = DataLoader(dataset=relative_val_dataset, batch_size=32, num_workers=10,
186
+ pin_memory=False, collate_fn=collate_fn)
187
+ predicted_features_list = []
188
+ target_names_list = []
189
+ group_members_list = []
190
+ reference_names_list = []
191
+
192
+ for batch in tqdm(relative_val_loader):
193
+ reference_names = batch['reference_name']
194
+ target_names = batch['target_name']
195
+ relative_captions = batch['relative_caption']
196
+ group_members = batch['group_members']
197
+
198
+ group_members = np.array(group_members).T.tolist()
199
+
200
+ input_captions = [
201
+ f"a photo of $ that {rel_caption}" for rel_caption in relative_captions]
202
+
203
+ # we need to make batch_tokens with selected_image_features
204
+ selected_image_features = torch.vstack([image_features[ref_names_list.index(ref)] for ref in reference_names])
205
+ tokenized_input_captions = clip.tokenize(input_captions, context_length=77).to(device)
206
+ context = clip_model.text_model.embeddings.token_embedding(tokenized_input_captions) + clip_model.text_model.embeddings.position_embedding(clip_model.text_model.embeddings.position_ids)
207
+ batch_tokens = phi(selected_image_features, context)
208
+ #batch_tokens = torch.vstack([pseudo_tokens[ref_names_list.index(ref)].unsqueeze(0) for ref in reference_names])
209
+ text_features = encode_with_pseudo_tokens_HF(clip_model, tokenized_input_captions, batch_tokens)
210
+
211
+ predicted_features = F.normalize(text_features)
212
+
213
+ predicted_features_list.append(predicted_features)
214
+ target_names_list.extend(target_names)
215
+ group_members_list.extend(group_members)
216
+ reference_names_list.extend(reference_names)
217
+
218
+ predicted_features = torch.vstack(predicted_features_list)
219
+
220
+ return predicted_features, reference_names_list, target_names_list, group_members_list
221
+
222
+
223
+ @torch.no_grad()
224
+ def cirr_compute_val_metrics(relative_val_dataset: Dataset, clip_model, index_features: torch.Tensor,
225
+ index_names: List[str], ref_names_list: List[str], pseudo_tokens: torch.Tensor) \
226
+ -> Dict[str, float]:
227
+ """
228
+ Compute the retrieval metrics on the CIRR validation set given the dataset, pseudo tokens and the reference names
229
+ """
230
+
231
+ # Generate the predicted features
232
+ predicted_features, reference_names, target_names, group_members = \
233
+ cirr_generate_val_predictions(clip_model, relative_val_dataset, ref_names_list, pseudo_tokens)
234
+
235
+ index_features = index_features.to(device)
236
+ predicted_features = predicted_features.to(device)
237
+
238
+ # Normalize the index features
239
+ index_features = F.normalize(index_features, dim=-1).float()
240
+ predicted_features = predicted_features.float()
241
+
242
+ # Compute the distances and sort the results
243
+ distances = 1 - predicted_features @ index_features.T
244
+ sorted_indices = torch.argsort(distances, dim=-1).cpu()
245
+ sorted_index_names = np.array(index_names)[sorted_indices]
246
+
247
+ # Delete the reference image from the results
248
+ reference_mask = torch.tensor(
249
+ sorted_index_names != np.repeat(np.array(reference_names), len(index_names)).reshape(len(target_names), -1))
250
+ sorted_index_names = sorted_index_names[reference_mask].reshape(sorted_index_names.shape[0],
251
+ sorted_index_names.shape[1] - 1)
252
+ # Compute the ground-truth labels wrt the predictions
253
+ labels = torch.tensor(
254
+ sorted_index_names == np.repeat(np.array(target_names), len(index_names) - 1).reshape(len(target_names), -1))
255
+
256
+ # Compute the subset predictions and ground-truth labels
257
+ group_members = np.array(group_members)
258
+ group_mask = (sorted_index_names[..., None] == group_members[:, None, :]).sum(-1).astype(bool)
259
+ group_labels = labels[group_mask].reshape(labels.shape[0], -1)
260
+
261
+ assert torch.equal(torch.sum(labels, dim=-1).int(), torch.ones(len(target_names)).int())
262
+ assert torch.equal(torch.sum(group_labels, dim=-1).int(), torch.ones(len(target_names)).int())
263
+
264
+ # Compute the metrics
265
+ recall_at1 = (torch.sum(labels[:, :1]) / len(labels)).item() * 100
266
+ recall_at5 = (torch.sum(labels[:, :5]) / len(labels)).item() * 100
267
+ recall_at10 = (torch.sum(labels[:, :10]) / len(labels)).item() * 100
268
+ recall_at50 = (torch.sum(labels[:, :50]) / len(labels)).item() * 100
269
+ group_recall_at1 = (torch.sum(group_labels[:, :1]) / len(group_labels)).item() * 100
270
+ group_recall_at2 = (torch.sum(group_labels[:, :2]) / len(group_labels)).item() * 100
271
+ group_recall_at3 = (torch.sum(group_labels[:, :3]) / len(group_labels)).item() * 100
272
+
273
+ return {
274
+ 'cirr_recall_at1': recall_at1,
275
+ 'cirr_recall_at5': recall_at5,
276
+ 'cirr_recall_at10': recall_at10,
277
+ 'cirr_recall_at50': recall_at50,
278
+ 'cirr_group_recall_at1': group_recall_at1,
279
+ 'cirr_group_recall_at2': group_recall_at2,
280
+ 'cirr_group_recall_at3': group_recall_at3,
281
+ }
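The reference-image removal above relies on a boolean mask plus a reshape that drops exactly one column per row (each query's own reference image). A toy check with made-up names, not part of the committed files:

```python
import numpy as np
import torch

sorted_index_names = np.array([['a', 'b', 'c'],
                               ['b', 'c', 'a']])
reference_names = np.array(['b', 'a'])

# Same masking pattern as cirr_compute_val_metrics, on a 2x3 toy ranking.
reference_mask = torch.tensor(sorted_index_names != np.repeat(reference_names, 3).reshape(2, 3))
filtered = sorted_index_names[reference_mask.numpy()].reshape(2, 2)
# filtered == [['a', 'c'], ['b', 'c']]
```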
282
+
283
+
284
+ @torch.no_grad()
285
+ def cirr_compute_val_metrics_with_phi(relative_val_dataset: Dataset, clip_model: CLIPTextModelWithProjection, phi, index_features: torch.Tensor,
286
+ index_names: List[str], ref_names_list: List[str], image_features: torch.Tensor) \
287
+ -> Dict[str, float]:
288
+ """
289
+ Compute the retrieval metrics on the CIRR validation set given the dataset, pseudo tokens and the reference names
290
+ """
291
+
292
+ # Generate the predicted features
293
+ predicted_features, reference_names, target_names, group_members = \
294
+ cirr_generate_val_predictions_with_phi(clip_model, phi, relative_val_dataset, ref_names_list, image_features)
295
+
296
+ index_features = index_features.to(device)
297
+ predicted_features = predicted_features.to(device)
298
+
299
+ # Normalize the index features
300
+ index_features = F.normalize(index_features, dim=-1).float()
301
+ predicted_features = predicted_features.float()
302
+
303
+ # Compute the distances and sort the results
304
+ distances = 1 - predicted_features @ index_features.T
305
+ sorted_indices = torch.argsort(distances, dim=-1).cpu()
306
+ sorted_index_names = np.array(index_names)[sorted_indices]
307
+
308
+ # Delete the reference image from the results
309
+ reference_mask = torch.tensor(
310
+ sorted_index_names != np.repeat(np.array(reference_names), len(index_names)).reshape(len(target_names), -1))
311
+ sorted_index_names = sorted_index_names[reference_mask].reshape(sorted_index_names.shape[0],
312
+ sorted_index_names.shape[1] - 1)
313
+ # Compute the ground-truth labels wrt the predictions
314
+ labels = torch.tensor(
315
+ sorted_index_names == np.repeat(np.array(target_names), len(index_names) - 1).reshape(len(target_names), -1))
316
+
317
+ # Compute the subset predictions and ground-truth labels
318
+ group_members = np.array(group_members)
319
+ group_mask = (sorted_index_names[..., None] == group_members[:, None, :]).sum(-1).astype(bool)
320
+ group_labels = labels[group_mask].reshape(labels.shape[0], -1)
321
+
322
+ assert torch.equal(torch.sum(labels, dim=-1).int(), torch.ones(len(target_names)).int())
323
+ assert torch.equal(torch.sum(group_labels, dim=-1).int(), torch.ones(len(target_names)).int())
324
+
325
+ # Compute the metrics
326
+ recall_at1 = (torch.sum(labels[:, :1]) / len(labels)).item() * 100
327
+ recall_at5 = (torch.sum(labels[:, :5]) / len(labels)).item() * 100
328
+ recall_at10 = (torch.sum(labels[:, :10]) / len(labels)).item() * 100
329
+ recall_at50 = (torch.sum(labels[:, :50]) / len(labels)).item() * 100
330
+ group_recall_at1 = (torch.sum(group_labels[:, :1]) / len(group_labels)).item() * 100
331
+ group_recall_at2 = (torch.sum(group_labels[:, :2]) / len(group_labels)).item() * 100
332
+ group_recall_at3 = (torch.sum(group_labels[:, :3]) / len(group_labels)).item() * 100
333
+
334
+ return {
335
+ 'cirr_recall_at1': recall_at1,
336
+ 'cirr_recall_at5': recall_at5,
337
+ 'cirr_recall_at10': recall_at10,
338
+ 'cirr_recall_at50': recall_at50,
339
+ 'cirr_group_recall_at1': group_recall_at1,
340
+ 'cirr_group_recall_at2': group_recall_at2,
341
+ 'cirr_group_recall_at3': group_recall_at3,
342
+ }
343
+
344
+
345
+ @torch.no_grad()
346
+ def cirr_val_retrieval(dataset_path: str, image_encoder, text_encoder, ref_names_list: list, pseudo_tokens: torch.Tensor,
347
+ preprocess: callable) -> Dict[str, float]:
348
+ """
349
+ Compute the retrieval metrics on the CIRR validation set given the pseudo tokens and the reference names
350
+ """
351
+
352
+ # Load the model
353
+ #clip_model, _ = clip.load(clip_model_name, device=device, jit=False)
354
+ #clip_model = clip_model.float().eval().requires_grad_(False)
355
+
356
+ # Extract the index features
357
+ classic_val_dataset = CIRRDataset(dataset_path, 'val', 'classic', preprocess)
358
+ index_features, index_names = extract_image_features(classic_val_dataset, image_encoder)
359
+
360
+ # Define the relative validation dataset
361
+ relative_val_dataset = CIRRDataset(dataset_path, 'val', 'relative', preprocess)
362
+
363
+ return cirr_compute_val_metrics(relative_val_dataset, text_encoder, index_features, index_names,
364
+ ref_names_list, pseudo_tokens)
365
+
366
+
367
+ @torch.no_grad()
368
+ def circo_generate_val_predictions(clip_model, relative_val_dataset: Dataset, ref_names_list: List[str],
369
+ pseudo_tokens: torch.Tensor) -> Tuple[
370
+ torch.Tensor, List[str], list]:
371
+ """
372
+ Generates feature predictions for the validation set of CIRCO
373
+ """
374
+
375
+ # Create the data loader
376
+ relative_val_loader = DataLoader(dataset=relative_val_dataset, batch_size=32, num_workers=10,
377
+ pin_memory=False, collate_fn=collate_fn, shuffle=False)
378
+
379
+ predicted_features_list = []
380
+ target_names_list = []
381
+ gts_img_ids_list = []
382
+
383
+ # Compute the features
384
+ for batch in tqdm(relative_val_loader):
385
+ reference_names = batch['reference_name']
386
+ target_names = batch['target_name']
387
+ relative_captions = batch['relative_caption']
388
+ gt_img_ids = batch['gt_img_ids']
389
+
390
+ gt_img_ids = np.array(gt_img_ids).T.tolist()
391
+ input_captions = [f"a photo of $ that {caption}" for caption in relative_captions]
392
+ batch_tokens = torch.vstack([pseudo_tokens[ref_names_list.index(ref)].unsqueeze(0) for ref in reference_names])
393
+ tokenized_input_captions = clip.tokenize(input_captions, context_length=77).to(device)
394
+ text_features = encode_with_pseudo_tokens_HF(clip_model, tokenized_input_captions, batch_tokens)
395
+ predicted_features = F.normalize(text_features)
396
+
397
+ predicted_features_list.append(predicted_features)
398
+ target_names_list.extend(target_names)
399
+ gts_img_ids_list.extend(gt_img_ids)
400
+
401
+ predicted_features = torch.vstack(predicted_features_list)
402
+
403
+ return predicted_features, target_names_list, gts_img_ids_list
404
+
405
+
406
+ @torch.no_grad()
407
+ def circo_compute_val_metrics(relative_val_dataset: Dataset, clip_model, index_features: torch.Tensor,
408
+ index_names: List[str], ref_names_list: List[str], pseudo_tokens: torch.Tensor) \
409
+ -> Dict[str, float]:
410
+ """
411
+ Compute the retrieval metrics on the CIRCO validation set given the dataset, pseudo tokens and the reference names
412
+ """
413
+
414
+ # Generate the predicted features
415
+ predicted_features, target_names, gts_img_ids = circo_generate_val_predictions(clip_model, relative_val_dataset,
416
+ ref_names_list, pseudo_tokens)
417
+ ap_at5 = []
418
+ ap_at10 = []
419
+ ap_at25 = []
420
+ ap_at50 = []
421
+
422
+ recall_at5 = []
423
+ recall_at10 = []
424
+ recall_at25 = []
425
+ recall_at50 = []
426
+
427
+ # Move the features to the device
428
+ index_features = index_features.to(device)
429
+ predicted_features = predicted_features.to(device)
430
+
431
+ # Normalize the features
432
+ index_features = F.normalize(index_features.float())
433
+
434
+ for predicted_feature, target_name, gt_img_ids in tqdm(zip(predicted_features, target_names, gts_img_ids)):
435
+ gt_img_ids = np.array(gt_img_ids)[
436
+ np.array(gt_img_ids) != ''] # remove trailing empty strings added for collate_fn
437
+ similarity = predicted_feature @ index_features.T
438
+ sorted_indices = torch.topk(similarity, dim=-1, k=50).indices.cpu()
439
+ sorted_index_names = np.array(index_names)[sorted_indices]
440
+ map_labels = torch.tensor(np.isin(sorted_index_names, gt_img_ids), dtype=torch.uint8)
441
+ precisions = torch.cumsum(map_labels, dim=0) * map_labels # Consider only positions corresponding to GTs
442
+ precisions = precisions / torch.arange(1, map_labels.shape[0] + 1) # Compute precision for each position
443
+
444
+ ap_at5.append(float(torch.sum(precisions[:5]) / min(len(gt_img_ids), 5)))
445
+ ap_at10.append(float(torch.sum(precisions[:10]) / min(len(gt_img_ids), 10)))
446
+ ap_at25.append(float(torch.sum(precisions[:25]) / min(len(gt_img_ids), 25)))
447
+ ap_at50.append(float(torch.sum(precisions[:50]) / min(len(gt_img_ids), 50)))
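+ # Illustrative toy example of the AP@k computation above: with gt_img_ids = ['a', 'b'] and
+ # top-5 retrieved names ['a', 'x', 'b', 'y', 'z'], map_labels[:5] = [1, 0, 1, 0, 0],
+ # precisions[:5] = [1/1, 0, 2/3, 0, 0], so ap_at5 = (1 + 2/3) / min(2, 5) ~= 0.83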
448
+
449
+ assert target_name == gt_img_ids[0], f"Target name {target_name} is not the first ground-truth image in {gt_img_ids}"
450
+ single_gt_labels = torch.tensor(sorted_index_names == target_name)
451
+ recall_at5.append(float(torch.sum(single_gt_labels[:5])))
452
+ recall_at10.append(float(torch.sum(single_gt_labels[:10])))
453
+ recall_at25.append(float(torch.sum(single_gt_labels[:25])))
454
+ recall_at50.append(float(torch.sum(single_gt_labels[:50])))
455
+
456
+ map_at5 = np.mean(ap_at5) * 100
457
+ map_at10 = np.mean(ap_at10) * 100
458
+ map_at25 = np.mean(ap_at25) * 100
459
+ map_at50 = np.mean(ap_at50) * 100
460
+ recall_at5 = np.mean(recall_at5) * 100
461
+ recall_at10 = np.mean(recall_at10) * 100
462
+ recall_at25 = np.mean(recall_at25) * 100
463
+ recall_at50 = np.mean(recall_at50) * 100
464
+
465
+ return {
466
+ 'circo_map_at5': map_at5,
467
+ 'circo_map_at10': map_at10,
468
+ 'circo_map_at25': map_at25,
469
+ 'circo_map_at50': map_at50,
470
+ 'circo_recall_at5': recall_at5,
471
+ 'circo_recall_at10': recall_at10,
472
+ 'circo_recall_at25': recall_at25,
473
+ 'circo_recall_at50': recall_at50,
474
+ }
475
+
476
+
477
+ @torch.no_grad()
478
+ def circo_val_retrieval(dataset_path: str, image_encoder, text_encoder, ref_names_list: List[str], pseudo_tokens: torch.Tensor,
479
+ preprocess: callable) -> Dict[str, float]:
480
+ """
481
+ Compute the retrieval metrics on the CIRCO validation set given the pseudo tokens and the reference names
482
+ """
483
+ # Load the model
484
+ #clip_model, _ = clip.load(clip_model_name, device=device, jit=False)
485
+ #clip_model = clip_model.float().eval().requires_grad_(False)
486
+
487
+ # Extract the index features
488
+ classic_val_dataset = CIRCODataset(dataset_path, 'val', 'classic', preprocess)
489
+ index_features, index_names = extract_image_features(classic_val_dataset, image_encoder)
490
+
491
+ # Define the relative validation dataset
492
+ relative_val_dataset = CIRCODataset(dataset_path, 'val', 'relative', preprocess)
493
+
494
+ return circo_compute_val_metrics(relative_val_dataset, text_encoder, index_features, index_names, ref_names_list,
495
+ pseudo_tokens)
496
+
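+ # Illustrative usage of circo_val_retrieval (a sketch, not executed here; the dataset path is a
+ # placeholder, and image_encoder, text_encoder, ref_names_list and pseudo_tokens are assumed to
+ # have been prepared as in main() below):
+ #   metrics = circo_val_retrieval('/path/to/CIRCO', image_encoder, text_encoder,
+ #                                 ref_names_list, pseudo_tokens, preprocess)
+ #   print(metrics['circo_map_at10'])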
497
+
498
+ def main():
499
+ parser = ArgumentParser()
500
+ parser.add_argument("--exp-name", type=str, help="Experiment to evaluate")
501
+ parser.add_argument("--eval-type", type=str, choices=['oti', 'phi', 'searle', 'searle-xl', 'pic2word'], required=True,
502
+ help="If 'oti' evaluate directly using the inverted oti pseudo tokens, "
503
+ "if 'phi' predicts the pseudo tokens using the phi network, "
504
+ "if 'searle' uses the pre-trained SEARLE model to predict the pseudo tokens, "
505
+ "if 'searle-xl' uses the pre-trained SEARLE-XL model to predict the pseudo tokens"
506
+ )
507
+ parser.add_argument("--dataset", type=str, required=True, choices=['cirr', 'fashioniq', 'circo'],
508
+ help="Dataset to use")
509
+ parser.add_argument("--dataset-path", type=str, help="Path to the dataset", required=True)
510
+
511
+ parser.add_argument("--preprocess-type", default="clip", type=str, choices=['clip', 'targetpad'],
512
+ help="Preprocess pipeline to use")
513
+ parser.add_argument("--phi-checkpoint-name", type=str,
514
+ help="Phi checkpoint to use, needed when using phi, e.g. 'phi_20.pt'")
515
+ parser.add_argument("--clip_model_name", default="giga", type=str)
516
+ parser.add_argument("--cache_dir", default="./hf_models", type=str)
517
+
518
+ parser.add_argument("--l2_normalize", action="store_true", help="Whether or not to use l2 normalization")
519
+
520
+ args = parser.parse_args()
521
+
522
+ #if args.eval_type in ['phi', 'oti'] and args.exp_name is None:
523
+ # raise ValueError("Experiment name is required when using phi or oti evaluation type")
524
+ if args.eval_type == 'phi' and args.phi_checkpoint_name is None:
525
+ raise ValueError("Phi checkpoint name is required when using phi evaluation type")
526
+
527
+ if args.eval_type == 'oti':
528
+ experiment_path = PROJECT_ROOT / 'data' / "oti_pseudo_tokens" / args.dataset.lower() / 'val' / args.exp_name
529
+ if not experiment_path.exists():
530
+ raise ValueError(f"Experiment {args.exp_name} not found")
531
+
532
+ with open(experiment_path / 'hyperparameters.json') as f:
533
+ hyperparameters = json.load(f)
534
+
535
+ pseudo_tokens = torch.load(experiment_path / 'ema_oti_pseudo_tokens.pt', map_location=device)
536
+ with open(experiment_path / 'image_names.pkl', 'rb') as f:
537
+ ref_names_list = pickle.load(f)
538
+
539
+ clip_model_name = hyperparameters['clip_model_name']
540
+ clip_model, clip_preprocess = clip.load(clip_model_name, device='cpu', jit=False)
541
+
542
+ if args.preprocess_type == 'targetpad':
543
+ print('Target pad preprocess pipeline is used')
544
+ preprocess = targetpad_transform(1.25, clip_model.visual.input_resolution)
545
+ elif args.preprocess_type == 'clip':
546
+ print('CLIP preprocess pipeline is used')
547
+ preprocess = clip_preprocess
548
+ else:
549
+ raise ValueError("Preprocess type not supported")
550
+
551
+
552
+ elif args.eval_type in ['phi', 'searle', 'searle-xl', 'pic2word']:
553
+ if args.eval_type == 'phi':
554
+ args.mixed_precision = 'fp16'
555
+ image_encoder, clip_preprocess, text_encoder, tokenizer = build_text_encoder(args)
556
+
557
+ phi = Phi(input_dim=text_encoder.config.projection_dim,
558
+ hidden_dim=text_encoder.config.projection_dim * 4,
559
+ output_dim=text_encoder.config.hidden_size, dropout=0.5).to(
560
+ device)
561
+
562
+ phi.load_state_dict(
563
+ torch.load(args.phi_checkpoint_name, map_location=device)[
564
+ phi.__class__.__name__])
565
+
566
+ phi = phi.eval()
567
+
568
+ elif args.eval_type == 'pic2word':
569
+ args.mixed_precision = 'fp16'
570
+ image_encoder, clip_preprocess, text_encoder, tokenizer = build_text_encoder(args)
571
+ phi = PIC2WORD(embed_dim=text_encoder.config.projection_dim,
572
+ output_dim=text_encoder.config.hidden_size,
573
+ ).to(device)
574
+ sd = torch.load(args.phi_checkpoint_name, map_location=device)['state_dict_img2text']
575
+ sd = {k[len('module.'):]: v for k, v in sd.items()}
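+ # strip the 'module.' prefix (added when the checkpoint was saved from a DataParallel/DDP-wrapped model)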
576
+ phi.load_state_dict(sd)
577
+ phi = phi.eval()
578
+
579
+ else: # searle or searle-xl
580
+ if args.eval_type == 'searle':
581
+ clip_model_name = 'ViT-B/32'
582
+ else: # args.eval_type == 'searle-xl':
583
+ clip_model_name = 'ViT-L/14'
584
+ phi, _ = torch.hub.load(repo_or_dir='miccunifi/SEARLE', model='searle', source='github',
585
+ backbone=clip_model_name)
586
+ phi = phi.to(device).eval()
587
+ clip_model, clip_preprocess = clip.load(clip_model_name, device=device, jit=False)
588
+
589
+ if args.preprocess_type == 'targetpad':
590
+ print('Target pad preprocess pipeline is used')
591
+ preprocess = targetpad_transform(1.25, clip_model.visual.input_resolution)
592
+ elif args.preprocess_type == 'clip':
593
+ print('CLIP preprocess pipeline is used')
594
+ preprocess = clip_preprocess
595
+ else:
596
+ raise ValueError("Preprocess type not supported")
597
+
598
+ if args.dataset.lower() == 'fashioniq':
599
+ relative_val_dataset = FashionIQDataset(args.dataset_path, 'val', ['dress', 'toptee', 'shirt'],
600
+ 'relative', preprocess, no_duplicates=True)
601
+ elif args.dataset.lower() == 'cirr':
602
+ relative_val_dataset = CIRRDataset(args.dataset_path, 'val', 'relative', preprocess,
603
+ no_duplicates=True)
604
+ elif args.dataset.lower() == 'circo':
605
+ relative_val_dataset = CIRCODataset(args.dataset_path, 'val', 'relative', preprocess)
606
+ else:
607
+ raise ValueError("Dataset not supported")
608
+
609
+ #clip_model = clip_model.float().to(device)
610
+ image_encoder = image_encoder.float().to(device)
611
+ text_encoder = text_encoder.float().to(device)
612
+ pseudo_tokens, ref_names_list = extract_pseudo_tokens_with_phi(image_encoder, phi, relative_val_dataset, args)
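+ # map each reference image to a pseudo word token: CLIP image features are projected into the
+ # token-embedding space by phi (or by the SEARLE / Pic2Word networks loaded above)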
613
+ pseudo_tokens = pseudo_tokens.to(device)
614
+ else:
615
+ raise ValueError("Eval type not supported")
616
+
617
+ print(f"Eval type = {args.eval_type} \t exp name = {args.exp_name} \t")
618
+ if args.dataset.lower() == 'fashioniq':
619
+ recalls_at10 = []
620
+ recalls_at50 = []
621
+ for dress_type in ['shirt', 'dress', 'toptee']:
622
+ fiq_metrics = fiq_val_retrieval(args.dataset_path, dress_type, image_encoder, text_encoder, ref_names_list,
623
+ pseudo_tokens, preprocess)
624
+ recalls_at10.append(fiq_metrics['fiq_recall_at10'])
625
+ recalls_at50.append(fiq_metrics['fiq_recall_at50'])
626
+
627
+ for k, v in fiq_metrics.items():
628
+ print(f"{dress_type}_{k} = {v:.2f}")
629
+ print("\n")
630
+
631
+ print(f"average_fiq_recall_at10 = {np.mean(recalls_at10):.2f}")
632
+ print(f"average_fiq_recall_at50 = {np.mean(recalls_at50):.2f}")
633
+
634
+ elif args.dataset.lower() == 'cirr':
635
+ cirr_metrics = cirr_val_retrieval(args.dataset_path, image_encoder, text_encoder, ref_names_list, pseudo_tokens,
636
+ preprocess)
637
+
638
+ for k, v in cirr_metrics.items():
639
+ print(f"{k} = {v:.2f}")
640
+
641
+ elif args.dataset.lower() == 'circo':
642
+ circo_metrics = circo_val_retrieval(args.dataset_path, image_encoder, text_encoder, ref_names_list, pseudo_tokens,
643
+ preprocess)
644
+
645
+ for k, v in circo_metrics.items():
646
+ print(f"{k} = {v:.2f}")
647
+
648
+
649
+ if __name__ == '__main__':
650
+ main()
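
Example invocation (an illustrative sketch only; it assumes this script is saved as validate.py, that a local copy of the CIRCO dataset and a trained phi checkpoint are available, and the paths below are placeholders to be replaced with your own):

python validate.py --eval-type phi --dataset circo --dataset-path /path/to/CIRCO --phi-checkpoint-name ./trained_models/phi_best.pt --clip_model_name large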