jiaqingj committed
Commit 85a5010 · 1 Parent(s): 8ec26be
POS_classifier.py ADDED
@@ -0,0 +1,69 @@
+ from nltk.tokenize import word_tokenize
+ from nltk import pos_tag
+ import torch
+ import json
+
+ def batch_texts_POS_analysis(batch_texts, pos_templete, device="cuda"):
+     """Tag each text with universal POS tags and score it against a
+     part-of-speech template: the fraction of template slots whose expected
+     tag matches the predicted tag. (The device argument is currently unused.)"""
+     batch_size = len(batch_texts)
+     pos_tags = []
+     pos_scores = torch.zeros(batch_size)
+
+     for b_id in range(batch_size):
+         text = batch_texts[b_id]
+         words = word_tokenize(text)
+         word_tag = pos_tag(words, tagset="universal")
+         res_tag = [tag[1] for tag in word_tag]
+         total_num = len(pos_templete)
+         correct = 0
+         # Pad or truncate the predicted tags to the template length.
+         if len(res_tag) <= total_num:
+             cur_tag = res_tag + [""] * (total_num - len(res_tag))
+         else:
+             cur_tag = res_tag[:total_num]
+         for word_id in range(len(cur_tag)):
+             if pos_templete[word_id] == "":
+                 correct += 1
+             elif cur_tag[word_id] in pos_templete[word_id]:
+                 correct += 1
+         acc = correct / total_num
+         pos_tags.append(res_tag)
+         pos_scores[b_id] = acc
+
+     return pos_tags, pos_scores
+
+ def text_POS_analysis(text):
+     words = word_tokenize(text)
+     word_tag = pos_tag(words, tagset="universal")
+     res_tag = [tag[1] for tag in word_tag]
+
+     return res_tag
+
+ if __name__ == "__main__":
+     batch_texts = ["A cat sitting in the bed.",
+                    "Two men in a nice hotel room one playing a video game with a remote control.",
+                    "The man sitting in the chair feels like an invisible, dead man."]
+     pos_templete = ['DET', 'NOUN', 'ADP', 'ADJ', 'NOUN', '.', 'NOUN', 'CONJ', 'NOUN', 'ADP', 'PRON', '.']
+
+     batch_texts_POS_analysis(batch_texts, pos_templete, device="cuda")
+     cur_path = "iter_15.json"
+     all_caption = []
+
+     with open(cur_path, "r") as cur_json_file:
+         all_res = list(json.load(cur_json_file).values())
+         for res in all_res:
+             if isinstance(res, list):
+                 all_caption += res
+             else:
+                 all_caption.append(res)
+     pos_tags, pos_scores = batch_texts_POS_analysis(all_caption, pos_templete, device="cuda")
+     word_id = 12
+     pos_dict = {"ADJ": 0, "ADP": 0, "ADV": 0,
+                 "CONJ": 0, "DET": 0, "NOUN": 0, "X": 0,
+                 "NUM": 0, "PRT": 0, "PRON": 0, "VERB": 0, ".": 0}
+     # Count which tag appears at position word_id across all captions
+     # (loop variable renamed so it no longer shadows the imported pos_tag).
+     for cur_tags in pos_tags:
+         if word_id < len(cur_tags):
+             pos_dict[cur_tags[word_id]] += 1
+     print(pos_dict)
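A minimal usage sketch for the POS scorer above (illustrative inputs; assumes the NLTK punkt, averaged_perceptron_tagger, and universal_tagset data are installed):

```python
# Score two candidate captions against a 6-slot POS template.
# Each slot names the universal tag expected at that position; "" accepts anything.
from POS_classifier import batch_texts_POS_analysis

template = ["DET", "NOUN", "VERB", "ADP", "DET", "NOUN"]
captions = ["The cat sits on the mat.", "Running quickly away."]

tags, scores = batch_texts_POS_analysis(captions, template)
print(tags[0])   # universal POS tags of the first caption
print(scores)    # per-caption fraction of template slots matched
```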
app.py ADDED
@@ -0,0 +1,285 @@
+ from utils import create_logger, set_seed, format_output
+ import os
+ import time
+ import argparse
+ import json
+ from PIL import Image
+ import torch
+ import gradio as gr
+ import nltk
+
+ from clip.clip import CLIP
+ from gen_utils import generate_caption
+ from control_gen_utils import control_generate_caption
+ from transformers import AutoModelForMaskedLM, AutoTokenizer
+
+
+ def get_args():
+     parser = argparse.ArgumentParser()
+
+     parser.add_argument("--seed", type=int, default=42)
+     parser.add_argument("--batch_size", type=int, default=1, help="Only supports batch_size=1 currently.")
+     parser.add_argument("--device", type=str,
+                         default='cpu', choices=['cuda', 'cpu'])
+
+     ## Generation and Controllable Type
+     parser.add_argument('--run_type',
+                         default='caption',
+                         nargs='?',
+                         choices=['caption', 'controllable'])
+     parser.add_argument('--prompt',
+                         default='Image of a', type=str)
+     parser.add_argument('--order',
+                         default='shuffle',
+                         nargs='?',
+                         choices=['sequential', 'shuffle', 'span', 'random', 'parallel'],
+                         help="Generation order of text")
+     parser.add_argument('--control_type',
+                         default='sentiment',
+                         nargs='?',
+                         choices=["sentiment", "pos"],
+                         help="which controllable task to conduct")
+     parser.add_argument('--pos_type', type=list,
+                         default=[['DET'], ['ADJ', 'NOUN'], ['NOUN'],
+                                  ['VERB'], ['VERB'], ['ADV'], ['ADP'],
+                                  ['DET', 'NOUN'], ['NOUN'], ['NOUN', '.'],
+                                  ['.', 'NOUN'], ['.', 'NOUN']],
+                         help="predefined part-of-speech template")
+     parser.add_argument('--sentiment_type',
+                         default="positive",
+                         nargs='?',
+                         choices=["positive", "negative"])
+     parser.add_argument('--samples_num',
+                         default=2, type=int)
+
+     ## Hyperparameters
+     parser.add_argument("--sentence_len", type=int, default=10)
+     parser.add_argument("--candidate_k", type=int, default=200)
+     parser.add_argument("--alpha", type=float, default=0.02, help="weight for fluency")
+     parser.add_argument("--beta", type=float, default=2.0, help="weight for image-matching degree")
+     parser.add_argument("--gamma", type=float, default=5.0, help="weight for controllable degree")
+     parser.add_argument("--lm_temperature", type=float, default=0.1)
+     parser.add_argument("--num_iterations", type=int, default=1, help="predefined iterations for Gibbs Sampling")
+
+     ## Models and Paths
+     parser.add_argument("--lm_model", type=str, default='bert-base-uncased',
+                         help="Path to language model")  # bert, roberta
+     parser.add_argument("--match_model", type=str, default='clip-vit-base-patch32',
+                         help="Path to Image-Text model")  # clip, align
+     parser.add_argument("--caption_img_path", type=str, default='./examples/girl.jpg',
+                         help="file path of the image for captioning")
+     parser.add_argument("--stop_words_path", type=str, default='stop_words.txt',
+                         help="Path to stop_words.txt")
+     parser.add_argument("--add_extra_stopwords", type=list, default=[],
+                         help="you can add some extra stop words")
+
+     args = parser.parse_args()
+
+     return args
+
+ def run_caption(args, image, lm_model, lm_tokenizer, clip, token_mask, logger):
+     FinalCaptionList = []
+     BestCaptionList = []
+     image_instance = image.convert("RGB")
+     for sample_id in range(args.samples_num):
+         logger.info(f"Sample {sample_id}: ")
+         gen_texts, clip_scores = generate_caption(lm_model, clip, lm_tokenizer, image_instance, token_mask, logger,
+                                                   prompt=args.prompt, batch_size=args.batch_size, max_len=args.sentence_len,
+                                                   top_k=args.candidate_k, temperature=args.lm_temperature,
+                                                   max_iter=args.num_iterations, alpha=args.alpha, beta=args.beta,
+                                                   generate_order=args.order)
+         FinalCaptionStr = "Sample {}: ".format(sample_id + 1) + gen_texts[-2]
+         BestCaptionStr = "Sample {}: ".format(sample_id + 1) + gen_texts[-1]
+         FinalCaptionList.append(FinalCaptionStr)
+         BestCaptionList.append(BestCaptionStr)
+     return FinalCaptionList, BestCaptionList
+
+
+ def run_control(run_type, args, image, lm_model, lm_tokenizer, clip, token_mask, logger):
+     FinalCaptionList = []
+     BestCaptionList = []
+     image_instance = image.convert("RGB")
+     for sample_id in range(args.samples_num):
+         logger.info(f"Sample {sample_id}: ")
+         gen_texts, clip_scores = control_generate_caption(lm_model, clip, lm_tokenizer, image_instance, token_mask, logger,
+                                                           prompt=args.prompt, batch_size=args.batch_size, max_len=args.sentence_len,
+                                                           top_k=args.candidate_k, temperature=args.lm_temperature,
+                                                           max_iter=args.num_iterations, alpha=args.alpha,
+                                                           beta=args.beta, gamma=args.gamma,
+                                                           ctl_type=args.control_type, style_type=args.sentiment_type,
+                                                           pos_type=args.pos_type, generate_order=args.order)
+         FinalCaptionStr = "Sample {}: ".format(sample_id + 1) + gen_texts[-2]
+         BestCaptionStr = "Sample {}: ".format(sample_id + 1) + gen_texts[-1]
+         FinalCaptionList.append(FinalCaptionStr)
+         BestCaptionList.append(BestCaptionStr)
+     return FinalCaptionList, BestCaptionList
+
+ def Demo(RunType, ControlType, SentimentType, Order, Length, NumIterations, SamplesNum, Alpha, Beta, Gamma, Img):
+     args = get_args()
+     set_seed(args.seed)
+
+     args.num_iterations = NumIterations
+     args.sentence_len = Length
+     args.run_type = RunType
+     args.control_type = ControlType
+     args.sentiment_type = SentimentType
+     args.alpha = Alpha
+     args.beta = Beta
+     args.gamma = Gamma
+     args.samples_num = SamplesNum
+     args.order = Order
+     img = Img
+
+     run_type = "caption" if args.run_type == "caption" else args.control_type
+     if run_type == "sentiment":
+         run_type = args.sentiment_type
+
+     if not os.path.exists("logger"):
+         os.mkdir("logger")
+     logger = create_logger(
+         "logger", 'demo_{}_{}_len{}_topk{}_alpha{}_beta{}_gamma{}_lmtemp{}_{}.log'.format(
+             run_type, args.order, args.sentence_len,
+             args.candidate_k, args.alpha, args.beta, args.gamma, args.lm_temperature,
+             time.strftime("%Y-%m-%d-%H-%M-%S", time.localtime())))
+
+     logger.info(f"Generating order:{args.order}")
+     logger.info(f"Run type:{run_type}")
+     logger.info(args)
+
+     # Load pre-trained model (weights)
+     lm_model = AutoModelForMaskedLM.from_pretrained(args.lm_model)
+     lm_tokenizer = AutoTokenizer.from_pretrained(args.lm_model)
+     lm_model.eval()
+     clip = CLIP(args.match_model)
+     clip.eval()
+
+     lm_model = lm_model.to(args.device)
+     clip = clip.to(args.device)
+
+     ## Build a vocabulary mask that zeroes out stop-word token ids
+     with open(args.stop_words_path, 'r', encoding='utf-8') as stop_words_file:
+         stop_words = stop_words_file.readlines()
+     stop_words_ = [stop_word.rstrip('\n') for stop_word in stop_words]
+     stop_words_ += args.add_extra_stopwords
+     stop_ids = lm_tokenizer.convert_tokens_to_ids(stop_words_)
+     token_mask = torch.ones((1, lm_tokenizer.vocab_size))
+     for stop_id in stop_ids:
+         token_mask[0, stop_id] = 0
+     token_mask = token_mask.to(args.device)
+
+     if args.run_type == 'caption':
+         FinalCaption, BestCaption = run_caption(args, img, lm_model, lm_tokenizer, clip, token_mask, logger)
+     elif args.run_type == 'controllable':
+         FinalCaption, BestCaption = run_control(run_type, args, img, lm_model, lm_tokenizer, clip, token_mask, logger)
+     else:
+         raise Exception('run_type must be caption or controllable!')
+
+     logger.handlers = []
+
+     FinalCaptionFormat, BestCaptionFormat = format_output(SamplesNum, FinalCaption, BestCaption)
+     return FinalCaptionFormat, BestCaptionFormat
+
+
+ def RunTypeChange(choice):
+     if choice == "caption":
+         return gr.update(visible=False)
+     elif choice == "controllable":
+         return gr.update(visible=True)
+
+
+ def ControlTypeChange(choice):
+     if choice == "pos":
+         return gr.update(visible=False)
+     elif choice == "sentiment":
+         return gr.update(visible=True)
+
+ with gr.Blocks() as demo:
+
+     gr.Markdown("""
+     # ConZIC
+     ### Controllable Zero-shot Image Captioning by Sampling-Based Polishing
+     """)
+
+     with gr.Row():
+         with gr.Column():
+             RunType = gr.Radio(
+                 ["caption", "controllable"], value="caption", label="Run Type", info="Select the Run Type"
+             )
+             ControlType = gr.Radio(
+                 ["sentiment", "pos"], value="sentiment", label="Control Type", info="Select the Control Type",
+                 visible=False, interactive=True
+             )
+             SentimentType = gr.Radio(
+                 ["positive", "negative"], value="positive", label="Sentiment Type", info="Select the Sentiment Type",
+                 visible=False, interactive=True
+             )
+             Order = gr.Radio(
+                 ["sequential", "shuffle", "random"], value="shuffle", label="Order", info="Generation order of text"
+             )
+
+             RunType.change(fn=RunTypeChange, inputs=RunType, outputs=SentimentType)
+             RunType.change(fn=RunTypeChange, inputs=RunType, outputs=ControlType)
+             ControlType.change(fn=ControlTypeChange, inputs=ControlType, outputs=SentimentType)
+
+             with gr.Row():
+                 Length = gr.Slider(
+                     5, 15, value=10, label="Sentence Length", info="Choose between 5 and 15", step=1
+                 )
+                 NumIterations = gr.Slider(
+                     1, 15, value=10, label="Num Iterations", info="Predefined iterations for Gibbs Sampling", step=1
+                 )
+             with gr.Row():
+                 SamplesNum = gr.Slider(
+                     1, 5, value=2, label="Samples Num", step=1
+                 )
+                 Alpha = gr.Slider(
+                     0, 1, value=0.02, label="Alpha", info="Weight for fluency", step=0.01
+                 )
+             with gr.Row():
+                 Beta = gr.Slider(
+                     1, 5, value=2, label="Beta", info="Weight for image-matching degree", step=0.5
+                 )
+                 Gamma = gr.Slider(
+                     1, 10, value=5, label="Gamma", info="Weight for controllable degree", step=0.5
+                 )
+         with gr.Column():
+
+             Img = gr.Image(label="Upload Picture", type="pil")
+
+             FinalCaption = gr.Textbox(label="Final Caption", lines=5, placeholder="Final Caption")
+             BestCaption = gr.Textbox(label="Best Caption", lines=5, placeholder="Best Caption")
+             with gr.Row():
+                 gen_button = gr.Button("Submit")
+                 clear_button = gr.Button("Reset")
+
+     gen_button.click(
+         fn=Demo,
+         inputs=[
+             RunType, ControlType, SentimentType, Order, Length, NumIterations, SamplesNum, Alpha, Beta, Gamma, Img
+         ],
+         outputs=[
+             FinalCaption, BestCaption
+         ]
+     )
+     clear_button.click(
+         fn=lambda: [gr.Radio.update(value='caption'), gr.Radio.update(value='pos'), gr.Radio.update(value='positive'),
+                     gr.Radio.update(value='shuffle'), gr.Slider.update(value=10), gr.Slider.update(value=10),
+                     gr.Slider.update(value=2), gr.Slider.update(value=0.02), gr.Slider.update(value=2),
+                     gr.Slider.update(value=5)],
+         inputs=[],
+         outputs=[
+             RunType, ControlType, SentimentType, Order, Length, NumIterations, SamplesNum, Alpha, Beta, Gamma
+         ]
+     )
+ if __name__ == "__main__":
+
+     nltk.download('wordnet')
+     nltk.download('punkt')
+     nltk.download('averaged_perceptron_tagger')
+     nltk.download('sentiwordnet')
+     # Required by pos_tag(..., tagset="universal") in POS_classifier.py.
+     nltk.download('universal_tagset')
+
+     demo.launch()
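The stop-word handling above zeroes entries of a (1, vocab_size) mask so the samplers in gen_utils.py never propose those tokens. A self-contained sketch of the same masking idea with a hypothetical 5-token vocabulary:

```python
import torch
import torch.nn.functional as F

# Hypothetical 5-token vocabulary; ids 1 and 3 are "stop words".
logits = torch.tensor([[2.0, 5.0, 1.0, 4.0, 0.5]])
token_mask = torch.ones(1, 5)
for stop_id in (1, 3):
    token_mask[0, stop_id] = 0

probs = F.softmax(logits, dim=-1) * token_mask  # masked ids get probability 0
top_probs, top_ids = probs.topk(3, dim=-1)
print(top_ids)  # tensor([[0, 2, 4]]) -- the stop-word ids never surface
```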
clip/build_text_index.py ADDED
@@ -0,0 +1,105 @@
+ import sys
+ import os
+ import json
+ import argparse
+ import torch
+ import numpy as np
+ import progressbar
+
+ def parse_config():
+     parser = argparse.ArgumentParser()
+     parser.add_argument("--clip_name", type=str, default="openai/clip-vit-base-patch32")
+     parser.add_argument("--text_file_path", type=str)
+     # save configuration
+     parser.add_argument("--save_index_prefix", type=str, help='where to save the mips index')
+     parser.add_argument("--save_index_name", type=str)
+     parser.add_argument("--save_mapping_dict_name", type=str,
+                         help="a json file that stores a dictionary mapping mips index entries to caption text")
+     # inference configuration
+     parser.add_argument("--batch_size", type=int, help="the batch size used to conduct inference with CLIP")
+     return parser.parse_args()
+
+ def load_batch_text(text_file_path, batch_size):
+     with open(text_file_path) as f:
+         item_list = json.load(f)
+
+     text_list = []
+     for item in item_list:
+         captions = item["captions"]
+         for cap in captions:
+             text_list.append(cap)
+     print('Number of text instances is {}'.format(len(text_list)))
+
+     data_num = len(text_list)
+     batch_num = data_num // batch_size
+     batch_text_list = []
+     s_idx, e_idx = 0, batch_size
+     for p_idx in range(batch_num):
+         one_batch_text_list = []
+         for idx in range(s_idx, e_idx):
+             one_batch_text_list.append(text_list[idx])
+         batch_text_list.append(one_batch_text_list)
+         # Advance the window; the original loop never did, so every batch
+         # repeated the first one. The trailing partial batch is dropped.
+         s_idx += batch_size
+         e_idx += batch_size
+     return batch_text_list
+
+
+ if __name__ == '__main__':
+     if torch.cuda.is_available():
+         print('Cuda is available.')
+     cuda_available = torch.cuda.is_available()
+     args = parse_config()
+     device = torch.device('cuda')
+
+     if not os.path.exists(args.save_index_prefix):
+         # recursively construct directory
+         os.makedirs(args.save_index_prefix, exist_ok=True)
+
+     print('Loading CLIP...')
+     from clip import CLIP
+     model = CLIP(args.clip_name)
+     if cuda_available:
+         model = model.cuda(device)
+     model.eval()
+     print('CLIP loaded!')
+
+     print('Loading text data...')
+     batch_text_list = load_batch_text(args.text_file_path, args.batch_size)
+     print('Text data loaded.')
+
+     res_text_vec_list, res_text_list = [], []
+     batch_num = len(batch_text_list)
+     print('Number of batches is {}'.format(batch_num))
+     print('Start inference...')
+     p = progressbar.ProgressBar(batch_num)
+     p.start()
+     with torch.no_grad():
+         for p_idx in range(batch_num):
+             p.update(p_idx)
+             one_text_batch = batch_text_list[p_idx]
+             one_batch_vec = model.compute_batch_index_text_representation(one_text_batch).detach().cpu()
+             one_batch_vec_list = one_batch_vec.unbind(dim=0)
+             bsz = len(one_batch_vec_list)
+             for k in range(bsz):
+                 res_text_vec_list.append(one_batch_vec_list[k].numpy())
+                 res_text_list.append(one_text_batch[k])
+     p.finish()
+     assert len(res_text_vec_list) == len(res_text_list)
+     print('Inference completed!')
+
+     index_text_mapping_dict = {}
+     for k in range(len(res_text_list)):
+         index_text_mapping_dict[k] = res_text_list[k]
+     mapping_list_save_path = args.save_index_prefix + '/' + args.save_mapping_dict_name
+     with open(mapping_list_save_path, 'w') as outfile:
+         json.dump(index_text_mapping_dict, outfile, indent=4)
+     print('Mapping dictionary saved!')
+
+     print('Start building index...')
+     index_save_path = args.save_index_prefix + '/' + args.save_index_name
+     with open(index_save_path, 'w', encoding='utf8') as o:
+         for vec in res_text_vec_list:
+             one_text = ' '.join([str(num) for num in vec]).strip()
+             o.writelines(one_text + '\n')
+     print('Index completed!')
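As a sanity check on the corrected batching above, an equivalent slice-based version (like the script, it drops the trailing partial batch):

```python
def make_batches(text_list, batch_size):
    # Keep only full batches, mirroring load_batch_text above.
    batch_num = len(text_list) // batch_size
    return [text_list[i * batch_size:(i + 1) * batch_size] for i in range(batch_num)]

assert make_batches(list("abcdefg"), 3) == [["a", "b", "c"], ["d", "e", "f"]]
```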
clip/clip.py ADDED
@@ -0,0 +1,146 @@
+ import torch
+ from torch import nn
+ from PIL import Image
+ from transformers import CLIPProcessor, CLIPModel, CLIPTokenizer
+
+ class CLIP(nn.Module):
+     def __init__(self, model_name):
+         super(CLIP, self).__init__()
+         # model name: e.g. openai/clip-vit-base-patch32
+         print('Initializing CLIP model...')
+         self.model = CLIPModel.from_pretrained(model_name)
+         self.model.eval()
+         self.processor = CLIPProcessor.from_pretrained(model_name)
+         self.tokenizer = CLIPTokenizer.from_pretrained(model_name)
+         self.cuda_has_been_checked = False
+         print('CLIP model initialized.')
+
+     def check_cuda(self):
+         self.cuda_available = next(self.model.parameters()).is_cuda
+         self.device = next(self.model.parameters()).get_device()
+         if self.cuda_available:
+             print('Cuda is available.')
+         else:
+             print('Cuda is not available.')
+         print('Device is {}'.format(self.device))
+
+     @torch.no_grad()
+     def compute_image_representation_from_image_path(self, image_path):
+         if not self.cuda_has_been_checked:
+             self.check_cuda()
+             self.cuda_has_been_checked = True
+         # image_path: the path of the image
+         image = Image.open(image_path)
+         inputs = self.processor(images=image, return_tensors="pt")
+         pixel_values = inputs['pixel_values']
+         if self.cuda_available:
+             pixel_values = pixel_values.cuda(self.device)
+         visual_outputs = self.model.vision_model(pixel_values=pixel_values)
+         image_embeds = visual_outputs[1]
+         image_embeds = self.model.visual_projection(image_embeds)  # [1 x embed_dim]
+         return image_embeds
+
+     def compute_image_representation_from_image_instance(self, image):
+         if not self.cuda_has_been_checked:
+             self.check_cuda()
+             self.cuda_has_been_checked = True
+         # image: a PIL.Image instance
+         inputs = self.processor(images=image, return_tensors="pt")
+         pixel_values = inputs['pixel_values']
+         if self.cuda_available:
+             pixel_values = pixel_values.cuda(self.device)
+         visual_outputs = self.model.vision_model(pixel_values=pixel_values)
+         image_embeds = visual_outputs[1]
+         image_embeds = self.model.visual_projection(image_embeds)  # [1 x embed_dim]
+         return image_embeds
+
+     def compute_text_representation(self, text_list):
+         if not self.cuda_has_been_checked:
+             self.check_cuda()
+             self.cuda_has_been_checked = True
+         # text_list: a list of text
+         text_inputs = self.tokenizer(text_list, padding=True, return_tensors="pt",
+                                      max_length=self.tokenizer.max_len_single_sentence + 2, truncation=True)
+         # self.tokenizer.max_len_single_sentence + 2 = 77
+         input_ids, attention_mask = text_inputs['input_ids'], text_inputs['attention_mask']
+         if self.cuda_available:
+             input_ids = input_ids.cuda(self.device)
+             attention_mask = attention_mask.cuda(self.device)
+         text_outputs = self.model.text_model(
+             input_ids=input_ids,
+             attention_mask=attention_mask
+         )
+         text_embeds = text_outputs[1]
+         text_embeds = self.model.text_projection(text_embeds)
+         return text_embeds
+
+     def compute_image_text_similarity_via_embeddings(self, image_embeds, text_embeds):
+         '''
+         image_embeds: 1 x embed_dim
+         text_embeds: len(text_list) x embed_dim
+         '''
+         image_embeds = image_embeds / image_embeds.norm(dim=-1, keepdim=True)
+         text_embeds = text_embeds / text_embeds.norm(dim=-1, keepdim=True)
+         logit_scale = self.model.logit_scale.exp()
+         logits_per_text = torch.matmul(text_embeds, image_embeds.t()) * logit_scale
+         logits_per_image = logits_per_text.T
+         return logits_per_image.softmax(dim=1), logits_per_image / logit_scale  # 1 x len(text_list)
+
+     def compute_image_text_similarity_via_raw_text(self, image_embeds, text_list):
+         text_embeds = self.compute_text_representation(text_list)
+         return self.compute_image_text_similarity_via_embeddings(image_embeds, text_embeds)
+
+     ### -------------------- functions for building index ---------------------- ###
+     def compute_batch_index_image_features(self, image_list):
+         '''
+         image_list: a list of image instances
+         '''
+         if not self.cuda_has_been_checked:
+             self.check_cuda()
+             self.cuda_has_been_checked = True
+         inputs = self.processor(images=image_list, return_tensors="pt")
+         pixel_values = inputs['pixel_values']
+         if self.cuda_available:
+             pixel_values = pixel_values.cuda(self.device)
+         visual_outputs = self.model.vision_model(pixel_values=pixel_values)
+         image_embeds = visual_outputs[1]
+         image_embeds = self.model.visual_projection(image_embeds)
+         return image_embeds  # len(image_list) x embed_dim
+
+     def compute_batch_index_text_representation(self, text_list):
+         if not self.cuda_has_been_checked:
+             self.check_cuda()
+             self.cuda_has_been_checked = True
+         # text_list: a list of text
+         text_inputs = self.tokenizer(text_list, padding=True, return_tensors="pt",
+                                      max_length=self.tokenizer.max_len_single_sentence + 2, truncation=True)
+         input_ids, attention_mask = text_inputs['input_ids'], text_inputs['attention_mask']
+         if self.cuda_available:
+             input_ids = input_ids.cuda(self.device)
+             attention_mask = attention_mask.cuda(self.device)
+         text_outputs = self.model.text_model(
+             input_ids=input_ids,
+             attention_mask=attention_mask
+         )
+         text_embeds = text_outputs[1]
+         text_embeds = self.model.text_projection(text_embeds)
+         return text_embeds
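A usage sketch for the wrapper above (assumes the repo root is on the import path, the openai/clip-vit-base-patch32 checkpoint is downloadable, and examples/girl.jpg exists):

```python
import torch
from clip.clip import CLIP

model = CLIP("openai/clip-vit-base-patch32")
model.eval()

with torch.no_grad():
    image_embeds = model.compute_image_representation_from_image_path("examples/girl.jpg")
    probs, raw = model.compute_image_text_similarity_via_raw_text(
        image_embeds, ["a girl smiling", "a bowl of soup"])
print(probs)  # softmax over the two captions, shape (1, 2)
```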
clip/clipretrieval.py ADDED
@@ -0,0 +1,135 @@
+ import os
+ import json
+ import argparse
+ import torch
+ import progressbar
+ import numpy as np
+ from PIL import Image
+
+ class CLIPIndex:
+     def __init__(self, index_matrix_path, mapping_dict_path, clip):
+         '''
+         index_matrix_path: path to the pre-built index matrix
+         mapping_dict_path: path to the pre-indexed mapping dictionary
+         clip: the pre-trained clip model
+         '''
+         print('Loading index...')
+         self.index_matrix = self.normalization(self.load_matrix(index_matrix_path))
+         print('Index loaded.')
+         print(self.index_matrix.shape)
+         with open(mapping_dict_path) as f:
+             self.mapping_dict = json.load(f)
+         self.clip = clip
+
+     def load_matrix(self, in_f):
+         matrix_list = []
+         with open(in_f, 'r', encoding='utf8') as i:
+             lines = i.readlines()
+         for l in lines:
+             one_vec = [float(num) for num in l.strip('\n').split()]
+             matrix_list.append(one_vec)
+         return np.array(matrix_list)
+
+     def normalization(self, matrix):
+         '''
+         matrix: num_instance x num_feature
+         '''
+         return matrix / np.linalg.norm(matrix, axis=1, keepdims=True)
+
+     def get_image_representation(self, image_path):
+         image_instance = Image.open(image_path)
+         image_vec = self.clip.compute_batch_index_image_features([image_instance]).detach().cpu().numpy()
+         image_vec = self.normalization(image_vec)
+         return image_vec
+
+     def search_text(self, image_path):
+         image_vec = self.get_image_representation(image_path)
+         sort_idx_list = np.matmul(image_vec, self.index_matrix.transpose())[0].argsort()[::-1]
+         top_idx = sort_idx_list[0]
+         return self.mapping_dict[str(top_idx)]
+
+
+ def parse_config():
+     parser = argparse.ArgumentParser()
+     parser.add_argument("--clip_name", type=str)
+     parser.add_argument("--test_image_prefix_path", type=str, help="the folder that stores all test images")
+     parser.add_argument("--test_path", type=str)
+     # index configuration
+     parser.add_argument("--index_matrix_path", type=str)
+     parser.add_argument("--mapping_dict_path", type=str)
+     # save configuration
+     parser.add_argument("--save_path_prefix", type=str, help="save the result in which directory")
+     parser.add_argument("--save_name", type=str, help="the name of the saved file")
+     return parser.parse_args()
+
+ if __name__ == '__main__':
+     if torch.cuda.is_available():
+         print('Cuda is available.')
+     cuda_available = torch.cuda.is_available()
+     args = parse_config()
+     device = torch.device('cuda')
+
+     save_path_prefix = args.save_path_prefix
+     if not os.path.exists(save_path_prefix):
+         # recursively construct directory
+         os.makedirs(save_path_prefix, exist_ok=True)
+     # parse save name
+     save_name = args.save_name
+     full_save_path = save_path_prefix + '/' + save_name
+     print('full save path is {}'.format(full_save_path))
+
+     print('Loading CLIP...')
+     from clip import CLIP
+     clip = CLIP(args.clip_name)
+     if cuda_available:
+         clip = clip.cuda(device)
+     clip.eval()
+     print('CLIP loaded!')
+
+     clipindex = CLIPIndex(args.index_matrix_path, args.mapping_dict_path, clip)
+
+     print('Loading data...')
+     with open(args.test_path) as f:
+         item_list = json.load(f)
+     print('Data loaded.')
+     print('Number of test instances is {}'.format(len(item_list)))
+
+     result_list = []
+     invalid_num = 0
+     print('----------------------------------------------------------------')
+     with torch.no_grad():
+         test_num = len(item_list)
+         print('Number of inference instances is {}'.format(test_num))
+         p = progressbar.ProgressBar(test_num)
+         p.start()
+         for p_idx in range(test_num):
+             p.update(p_idx)
+             one_test_dict = item_list[p_idx]
+
+             one_res_dict = {
+                 'split': one_test_dict['split'],
+                 'image_name': one_test_dict['image_name'],
+                 'captions': one_test_dict['captions']
+             }
+
+             image_full_path = args.test_image_prefix_path + '/' + one_test_dict['image_name']
+             try:
+                 output_text = clipindex.search_text(image_full_path)
+                 one_res_dict['prediction'] = output_text
+                 result_list.append(one_res_dict)
+             except Exception:
+                 invalid_num += 1
+                 print('invalid number is {}'.format(invalid_num))
+                 continue
+         p.finish()
+     print('Inference completed!')
+
+     with open(full_save_path, 'w') as outfile:
+         json.dump(result_list, outfile, indent=4)
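The core of CLIPIndex.search_text is maximum inner-product search over L2-normalized rows, which is cosine similarity. A tiny NumPy sketch of that step with synthetic vectors:

```python
import numpy as np

np.random.seed(0)
index = np.random.randn(4, 8)
index /= np.linalg.norm(index, axis=1, keepdims=True)     # normalize rows
query = index[2:3] + 0.01 * np.random.randn(1, 8)         # noisy copy of row 2
query /= np.linalg.norm(query, axis=1, keepdims=True)

top_idx = np.matmul(query, index.T)[0].argsort()[::-1][0]
print(top_idx)  # 2 -- the nearest neighbor is the row the query was built from
```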
control_gen_utils.py ADDED
@@ -0,0 +1,223 @@
+ import numpy as np
+ import torch
+ import torch.nn.functional as F
+ import random
+ import time
+
+ from utils import get_init_text, update_token_mask
+ from sentiments_classifer import batch_texts_POS_Sentiments_analysis
+ from POS_classifier import batch_texts_POS_analysis
+
+
+ def generate_caption_step(out, gen_idx, mask, temperature=None, top_k=0):
+     """ Generate a word from out[gen_idx]
+
+     args:
+         - out (torch.Tensor): tensor of logits of size batch_size x seq_len x vocab_size
+         - gen_idx (int): location for which to generate
+         - top_k (int): if >0, only sample from the top k most probable words
+     """
+     logits = out[:, gen_idx]
+     if temperature is not None:
+         logits = logits / temperature
+
+     probs = F.softmax(logits, dim=-1)
+     probs *= (mask)
+     top_k_probs, top_k_ids = probs.topk(top_k, dim=-1)
+
+     return top_k_probs, top_k_ids
+
+ def sentiment_sequential_generation(model, clip, tokenizer, image_instance, token_mask, prompt, logger,
+                                     max_len=15, top_k=0, temperature=None, alpha=0.7, beta=1,
+                                     max_iters=20, batch_size=1,
+                                     verbose=True, gamma=5, ctl_signal="positive"):
+     """ Generate one word at a time, in L->R order """
+     seed_len = len(prompt.split()) + 1
+     batch = get_init_text(tokenizer, prompt, max_len, batch_size)
+     image_embeds = clip.compute_image_representation_from_image_instance(image_instance)
+     clip_score_sequence = []
+     best_clip_score = 0
+     inp = torch.tensor(batch).to(image_embeds.device)
+     gen_texts = []
+     for iter_num in range(max_iters):
+         for ii in range(max_len):
+             token_mask = update_token_mask(tokenizer, token_mask, max_len, ii)
+             for jj in range(batch_size):
+                 inp[jj][seed_len + ii] = tokenizer.mask_token_id
+             inp_ = inp.clone().detach()
+             out = model(inp).logits
+             probs, idxs = generate_caption_step(out, gen_idx=seed_len + ii, mask=token_mask, top_k=top_k, temperature=temperature)
+             for jj in range(batch_size):
+                 topk_inp = inp_.repeat(top_k, 1)
+                 idxs_ = (idxs[jj] * token_mask[0][idxs[jj]]).long()
+                 topk_inp[:, ii + seed_len] = idxs_
+                 repeats = ((idxs_[:, None] == topk_inp).float().sum(1) - 1)  # penalize repeated tokens
+                 batch_text_list = tokenizer.batch_decode(topk_inp, skip_special_tokens=True)
+                 sentiment_probs, sentiment_scores, pos_tags, wordnet_pos_tags = batch_texts_POS_Sentiments_analysis(
+                     batch_text_list, 1, topk_inp.device, sentiment_ctl=ctl_signal)
+                 clip_score, clip_ref = clip.compute_image_text_similarity_via_raw_text(image_embeds, batch_text_list)
+                 final_score = alpha * probs + beta * clip_score + gamma * sentiment_probs[None, :] + 0.1 * (1 - torch.exp(repeats))[None, :]
+                 best_clip_id = final_score.argmax()
+
+                 inp[jj][seed_len + ii] = idxs_[best_clip_id]
+                 current_clip_score = clip_ref[jj][best_clip_id]
+                 current_senti_score = sentiment_scores[best_clip_id]
+                 clip_score_sequence.append(current_clip_score.cpu().item())
+
+         if verbose and np.mod(iter_num + 1, 1) == 0:
+             for_print = tokenizer.decode(inp[0])
+             cur_text = tokenizer.decode(inp[0], skip_special_tokens=True)
+             if best_clip_score < current_clip_score.cpu().item():
+                 best_clip_score = current_clip_score.cpu().item()
+                 best_caption = cur_text
+             gen_texts.append(cur_text)
+             logger.info(f"iter {iter_num + 1}, clip score {current_clip_score:.3f}, ctl score {current_senti_score:.3f}: " + for_print)
+
+     gen_texts.append(best_caption)
+     clip_score_sequence.append(best_clip_score)
+
+     return gen_texts, clip_score_sequence
+
+ def sentiment_shuffle_generation(model, clip, tokenizer, image_instance, token_mask, prompt, logger,
+                                  max_len=15, top_k=0, temperature=None, alpha=0.7, beta=1,
+                                  max_iters=20, batch_size=1,
+                                  verbose=True, gamma=5, ctl_signal="positive"):
+     """ Generate one word at a time, in random generation order """
+     seed_len = len(prompt.split()) + 1
+     batch = get_init_text(tokenizer, prompt, max_len, batch_size)
+     image_embeds = clip.compute_image_representation_from_image_instance(image_instance)
+     inp = torch.tensor(batch).to(image_embeds.device)
+     clip_score_sequence = []
+     best_clip_score = 0
+     random_lst = list(range(max_len))
+     random.shuffle(random_lst)
+     logger.info(f"Order_list:{random_lst}")
+     gen_texts = []
+     for iter_num in range(max_iters):
+         for ii in random_lst:
+             token_mask = update_token_mask(tokenizer, token_mask, max_len, ii)
+             for jj in range(batch_size):
+                 inp[jj][seed_len + ii] = tokenizer.mask_token_id
+
+             inp_ = inp.clone().detach()
+             out = model(inp).logits
+             probs, idxs = generate_caption_step(out, gen_idx=seed_len + ii, mask=token_mask, top_k=top_k, temperature=temperature)
+             for jj in range(batch_size):
+                 topk_inp = inp_.repeat(top_k, 1)
+                 idxs_ = (idxs[jj] * token_mask[0][idxs[jj]]).long()
+                 topk_inp[:, ii + seed_len] = idxs_
+                 repeats = ((idxs_[:, None] == topk_inp).float().sum(1) - 1)  # penalize repeated tokens
+                 batch_text_list = tokenizer.batch_decode(topk_inp, skip_special_tokens=True)
+                 sentiment_probs, sentiment_scores, pos_tags, wordnet_pos_tags = batch_texts_POS_Sentiments_analysis(
+                     batch_text_list, 1, topk_inp.device, sentiment_ctl=ctl_signal)
+
+                 clip_score, clip_ref = clip.compute_image_text_similarity_via_raw_text(image_embeds, batch_text_list)
+                 final_score = alpha * probs + beta * clip_score + gamma * sentiment_probs[None, :] + 0.01 * (1 - torch.exp(repeats))[None, :]
+                 best_clip_id = final_score.argmax()
+
+                 inp[jj][seed_len + ii] = idxs_[best_clip_id]
+                 current_clip_score = clip_ref[jj][best_clip_id]
+                 current_senti_score = sentiment_scores[best_clip_id]
+                 clip_score_sequence.append(current_clip_score.cpu().item())
+         if verbose and np.mod(iter_num + 1, 1) == 0:
+             for_print = tokenizer.decode(inp[0])
+             cur_text = tokenizer.decode(inp[0], skip_special_tokens=True)
+             if best_clip_score < current_clip_score.cpu().item():
+                 best_clip_score = current_clip_score.cpu().item()
+                 best_caption = cur_text
+             gen_texts.append(cur_text)
+             logger.info(f"iter {iter_num + 1}, clip score {current_clip_score:.3f}, ctl score {current_senti_score:.3f}: " + for_print)
+     gen_texts.append(best_caption)
+     clip_score_sequence.append(best_clip_score)
+
+     return gen_texts, clip_score_sequence
+
+ def POS_sequential_generation(model, clip, tokenizer, image_instance, token_mask, prompt, logger,
+                               max_len=15, top_k=0, temperature=None, alpha=0.7, beta=1, gamma=0.1,
+                               max_iters=20, batch_size=1, ctl_signal=["DET"],
+                               verbose=True):
+     """ Generate one word at a time, in L->R order, guided by a POS template """
+
+     seed_len = len(prompt.split()) + 1
+     logger.info(ctl_signal)
+     batch = get_init_text(tokenizer, prompt, max_len, batch_size)
+     image_embeds = clip.compute_image_representation_from_image_instance(image_instance)
+     clip_score_sequence = []
+     best_clip_score = 0
+     inp = torch.tensor(batch).to(image_embeds.device)
+     gen_texts = []
+     for iter_num in range(max_iters):
+         for ii in range(max_len):
+             token_mask = update_token_mask(tokenizer, token_mask, max_len, ii)
+             for jj in range(batch_size):
+                 inp[jj][seed_len + ii] = tokenizer.mask_token_id
+             inp_ = inp.clone().detach()
+             out = model(inp).logits
+             probs, idxs = generate_caption_step(out, gen_idx=seed_len + ii, mask=token_mask, top_k=top_k, temperature=temperature)
+             for jj in range(batch_size):
+                 topk_inp = inp_.repeat(top_k, 1)
+                 idxs_ = (idxs[jj] * token_mask[0][idxs[jj]]).long()
+                 topk_inp[:, ii + seed_len] = idxs_
+                 batch_text_list = tokenizer.batch_decode(topk_inp, skip_special_tokens=True)
+                 pos_tags, pos_scores = batch_texts_POS_analysis(batch_text_list, ctl_signal, device=idxs_.device)
+                 pos_probs = torch.softmax(pos_scores / 0.1, dim=-1).to(idxs_.device)
+                 clip_score, clip_ref = clip.compute_image_text_similarity_via_raw_text(image_embeds, batch_text_list)
+                 final_score = alpha * probs + beta * clip_score + gamma * pos_probs[None, :]
+                 best_clip_id = final_score.argmax()
+
+                 inp[jj][seed_len + ii] = idxs_[best_clip_id]
+                 current_clip_score = clip_ref[jj][best_clip_id]
+                 current_ctl_score = pos_scores[best_clip_id]
+                 current_pos_tag = pos_tags[best_clip_id]
+                 clip_score_sequence.append(current_clip_score.cpu().item())
+         if verbose and np.mod(iter_num + 1, 1) == 0:
+             for_print = tokenizer.decode(inp[0])
+             cur_text = tokenizer.decode(inp[0], skip_special_tokens=True)
+             if best_clip_score < current_clip_score.cpu().item():
+                 best_clip_score = current_clip_score.cpu().item()
+                 best_caption = cur_text
+             gen_texts.append(cur_text)
+             logger.info(f"iter {iter_num + 1}, clip score {current_clip_score.cpu().item():.3f}, ctl score {current_ctl_score.cpu().item():.3f}: " + for_print)
+             logger.info(current_pos_tag)
+
+     gen_texts.append(best_caption)
+     clip_score_sequence.append(best_clip_score)
+
+     return gen_texts, clip_score_sequence
+
+ def control_generate_caption(model, clip, tokenizer, image_instance, token_mask, logger,
+                              prompt="", batch_size=10, max_len=25,
+                              top_k=100, temperature=1.0, max_iter=500, alpha=0.7, beta=1, gamma=5,
+                              ctl_type="sentiment", style_type="positive", pos_type=None, generate_order="sequential"):
+     # controllable functions to call
+     start_time = time.time()
+     if ctl_type == "sentiment":  # sentiment control
+         if generate_order == "sequential":
+             generate_texts, clip_scores = sentiment_sequential_generation(model, clip, tokenizer, image_instance, token_mask, prompt, logger,
+                                                                           batch_size=batch_size, max_len=max_len, top_k=top_k,
+                                                                           alpha=alpha, beta=beta, gamma=gamma, temperature=temperature,
+                                                                           max_iters=max_iter, ctl_signal=style_type)
+         else:
+             generate_texts, clip_scores = sentiment_shuffle_generation(model, clip, tokenizer, image_instance, token_mask, prompt, logger,
+                                                                        batch_size=batch_size, max_len=max_len, top_k=top_k,
+                                                                        alpha=alpha, beta=beta, gamma=gamma, temperature=temperature,
+                                                                        max_iters=max_iter, ctl_signal=style_type)
+
+     else:  # POS control
+         generate_texts, clip_scores = POS_sequential_generation(model, clip, tokenizer, image_instance, token_mask, prompt, logger,
+                                                                 batch_size=batch_size, max_len=max_len, top_k=top_k,
+                                                                 alpha=alpha, beta=beta, gamma=gamma, temperature=temperature,
+                                                                 ctl_signal=pos_type, max_iters=max_iter)
+
+     logger.info("Finished in %.3fs" % (time.time() - start_time))
+     logger.info(f"final caption: {generate_texts[-2]}")
+     logger.info(f"best caption: {generate_texts[-1]}")
+     return generate_texts, clip_scores
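All three controllable samplers above select the candidate that maximizes a weighted sum of LM probability, CLIP score, and control score. A toy illustration of that selection rule with made-up numbers (weights are the app.py defaults):

```python
import torch

alpha, beta, gamma = 0.02, 2.0, 5.0               # fluency / image-match / control weights
lm_probs   = torch.tensor([0.30, 0.25, 0.45])     # hypothetical top-k LM probabilities
clip_score = torch.tensor([0.20, 0.50, 0.30])     # hypothetical CLIP softmax scores
ctl_probs  = torch.tensor([0.10, 0.10, 0.80])     # hypothetical control (sentiment/POS) scores

final_score = alpha * lm_probs + beta * clip_score + gamma * ctl_probs
print(final_score.argmax().item())  # 2 -- the large gamma lets the control term dominate
```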
examples/Gosh.jpeg ADDED
examples/cat.png ADDED
examples/girl.jpg ADDED
examples/horse.png ADDED
gen_utils.py ADDED
@@ -0,0 +1,324 @@
1
+ import numpy as np
2
+ import torch
3
+ import torch.nn.functional as F
4
+ import random
5
+ from utils import get_init_text, update_token_mask
6
+ import time
7
+
8
+
9
+
10
+ def generate_step(out, gen_idx, temperature=None, top_k=0, sample=False, return_list=True):
11
+ """ Generate a word from out[gen_idx]
12
+
13
+ args:
14
+ - out (torch.Tensor): tensor of logits of size batch_size x seq_len x vocab_size
15
+ - gen_idx (int): location for which to generate for
16
+ - top_k (int): if >0, only sample from the top k most probable words
17
+ - sample (Bool): if True, sample from full distribution. Overridden by top_k
18
+ """
19
+ logits = out[:, gen_idx]
20
+ if temperature is not None:
21
+ logits = logits / temperature
22
+ if top_k > 0:
23
+ kth_vals, kth_idx = logits.topk(top_k, dim=-1)
24
+ dist = torch.distributions.categorical.Categorical(logits=kth_vals)
25
+ idx = kth_idx.gather(dim=1, index=dist.sample().unsqueeze(-1)).squeeze(-1)
26
+ elif sample:
27
+ dist = torch.distributions.categorical.Categorical(logits=logits)
28
+ idx = dist.sample().squeeze(-1)
29
+ else:
30
+ idx = torch.argmax(logits, dim=-1)
31
+ return idx.tolist() if return_list else idx
32
+
33
+ def generate_caption_step(out, gen_idx, mask, temperature=None, top_k=100):
34
+ """ Generate a word from out[gen_idx]
35
+ args:
36
+ - out (torch.Tensor): tensor of logits of size (batch_size, seq_len, vocab_size)
37
+ - gen_idx (int): location for which to generate for
38
+ - mask (torch.Tensor): (1, vocab_size)
39
+ - top_k (int): candidate k
40
+ """
41
+ logits = out[:, gen_idx]
42
+ if temperature is not None:
43
+ logits = logits / temperature
44
+
45
+ probs = F.softmax(logits, dim=-1)
46
+ probs *= (mask)
47
+ top_k_probs, top_k_ids = probs.topk(top_k, dim=-1)
48
+
49
+ return top_k_probs, top_k_ids
50
+
51
+ def sequential_generation(model, clip, tokenizer, image_instance,token_mask, prompt, logger,
52
+ max_len=15, top_k=100,temperature=None, alpha=0.7,beta=1,
53
+ max_iters=20,batch_size=1, verbose=True):
54
+ """ Generate one word at a time, in L->R order """
55
+
56
+ seed_len = len(prompt.split())+1
57
+ batch = get_init_text(tokenizer, prompt, max_len, batch_size)
58
+ image_embeds = clip.compute_image_representation_from_image_instance(image_instance)
59
+ clip_score_sequence = []
60
+ best_clip_score = 0
61
+ inp = torch.tensor(batch).to(image_embeds.device)
62
+ gen_texts = []
63
+ for iter_num in range(max_iters):
64
+ for ii in range(max_len):
65
+ token_mask = update_token_mask(tokenizer, token_mask, max_len, ii)
66
+ for jj in range(batch_size):
67
+ inp[jj][seed_len + ii] = tokenizer.mask_token_id
68
+ inp_ = inp.clone().detach()
69
+ out = model(inp).logits
70
+ probs, idxs = generate_caption_step(out, gen_idx=seed_len + ii,mask=token_mask, top_k=top_k, temperature=temperature)
71
+ for jj in range(batch_size):
72
+ topk_inp = inp_.repeat(top_k, 1)
73
+ idxs_ = (idxs[jj] * token_mask[0][idxs[jj]]).long()
74
+ topk_inp[:, ii + seed_len] = idxs_
75
+ batch_text_list = tokenizer.batch_decode(topk_inp, skip_special_tokens=True)
76
+
77
+ clip_score, clip_ref = clip.compute_image_text_similarity_via_raw_text(image_embeds, batch_text_list)
78
+ final_score = alpha * probs + beta * clip_score
79
+ best_clip_id = final_score.argmax()
80
+
81
+ inp[jj][seed_len + ii] = idxs_[best_clip_id]
82
+ current_clip_score = clip_ref[jj][best_clip_id]
83
+ clip_score_sequence.append(current_clip_score.cpu().item())
84
+
85
+ if verbose and np.mod(iter_num + 1, 1) == 0:
86
+ for_print = tokenizer.decode(inp[0])
87
+ cur_text = tokenizer.decode(inp[0],skip_special_tokens=True)
88
+ if best_clip_score < current_clip_score.cpu().item():
89
+ best_clip_score = current_clip_score.cpu().item()
90
+ best_caption = cur_text
91
+ gen_texts.append(cur_text)
92
+ logger.info(f"iter {iter_num + 1}, clip score {current_clip_score:.3f}: "+ for_print)
93
+
94
+ gen_texts.append(best_caption)
95
+ clip_score_sequence.append(best_clip_score)
96
+
97
+ return gen_texts, clip_score_sequence
98
+
99
+ def shuffle_generation(model, clip, tokenizer,image_instance,token_mask, prompt, logger,
100
+ max_len=15, top_k=0,temperature=None, alpha=0.7,beta=1,
101
+ max_iters=20,batch_size=1,
102
+ verbose=True):
103
+ """ Generate one word at a time, in random generation order """
104
+ seed_len = len(prompt.split())+1
105
+ batch = get_init_text(tokenizer,prompt, max_len, batch_size)
106
+ image_embeds = clip.compute_image_representation_from_image_instance(image_instance)
107
+ inp = torch.tensor(batch).to(image_embeds.device)
108
+ clip_score_sequence = []
109
+ best_clip_score = 0
110
+ random_lst = list(range(max_len))
111
+ random.shuffle(random_lst)
112
+ logger.info(f"Order_list:{random_lst}")
113
+ gen_texts = []
114
+ for iter_num in range(max_iters):
115
+ for ii in random_lst:
116
+ token_mask = update_token_mask(tokenizer, token_mask, max_len, ii)
117
+ for jj in range(batch_size):
118
+ inp[jj][seed_len + ii] = tokenizer.mask_token_id
119
+ inp_ = inp.clone().detach()
120
+ out = model(inp).logits
121
+ probs, idxs = generate_caption_step(out, gen_idx=seed_len + ii,mask=token_mask, top_k=top_k, temperature=temperature)
122
+ for jj in range(batch_size):
123
+ topk_inp = inp_.repeat(top_k, 1)
124
+ topk_inp[:, ii + seed_len] = (idxs[jj] * token_mask[0][idxs[jj]]).long()
125
+ batch_text_list = tokenizer.batch_decode(topk_inp, skip_special_tokens=True)
126
+ clip_score,clip_ref = clip.compute_image_text_similarity_via_raw_text(image_embeds, batch_text_list)
127
+ final_score = alpha * probs + beta * clip_score
128
+ best_clip_id = final_score.argmax()
129
+ inp[jj][seed_len + ii] = idxs[jj][best_clip_id]
130
+ current_clip_score = clip_ref[jj][best_clip_id]
131
+ clip_score_sequence.append(current_clip_score.cpu().item())
132
+ if verbose and np.mod(iter_num + 1, 1) == 0:
133
+ for_print = tokenizer.decode(inp[0])
134
+ cur_text = tokenizer.decode(inp[0],skip_special_tokens=True)
135
+ gen_texts.append(cur_text)
136
+ if best_clip_score < current_clip_score.cpu().item():
137
+ best_clip_score = current_clip_score.cpu().item()
138
+ best_caption = cur_text
139
+ logger.info(f"iter {iter_num + 1}, clip score {current_clip_score:.3f}: "+for_print)
140
+ gen_texts.append(best_caption)
141
+ clip_score_sequence.append(best_clip_score)
142
+
143
+ return gen_texts, clip_score_sequence
144
+
145
+ def span_generation(model, clip, tokenizer,image_instance,token_mask, prompt, logger,
146
+ max_len=15, top_k=0,temperature=None, alpha=0.7,beta=1,
147
+ max_iters=20,batch_size=1,verbose=True):
148
+ """ Generate multiple words at a time (span generation), in L->R order """
149
+ seed_len = len(prompt.split())+1
150
+ span_len = 2
151
+ batch = get_init_text(tokenizer,prompt, max_len, batch_size)
152
+ image_embeds = clip.compute_image_representation_from_image_instance(image_instance)
153
+ clip_score_sequence = []
154
+ best_clip_score = 0
155
+ inp = torch.tensor(batch).to(image_embeds.device)
156
+ gen_texts = []
157
+ for iter_num in range(max_iters):
158
+ for span_start in range(0,max_len,span_len):
159
+ span_end = min(span_start+span_len,max_len)
160
+ for jj in range(batch_size):
161
+ inp[jj][seed_len + span_start: seed_len + span_end] = tokenizer.mask_token_id
162
+ out = model(inp).logits
163
+
164
+ for ii in range(span_start,span_end):
165
+ token_mask = update_token_mask(tokenizer, token_mask, max_len, ii)
166
+ inp_ = inp.clone().detach()
167
+ probs, idxs = generate_caption_step(out, gen_idx=seed_len + ii, mask=token_mask, top_k=top_k,
168
+ temperature=temperature)
169
+ for jj in range(batch_size):
170
+ topk_inp = inp_.repeat(top_k, 1)
171
+ idxs_ = (idxs[jj] * token_mask[0][idxs[jj]]).long()
172
+ topk_inp[:, ii + seed_len] = idxs_
173
+ batch_text_list = tokenizer.batch_decode(topk_inp, skip_special_tokens=True)
174
+
175
+ clip_score, clip_ref = clip.compute_image_text_similarity_via_raw_text(image_embeds, batch_text_list)
176
+ final_score = alpha * probs + beta * clip_score
177
+ best_clip_id = final_score.argmax()
178
+
179
+ inp[jj][seed_len + ii] = idxs_[best_clip_id]
180
+ current_clip_score = clip_ref[jj][best_clip_id]
181
+ clip_score_sequence.append(current_clip_score.cpu().item())
182
+
183
+ if verbose and np.mod(iter_num + 1, 1) == 0:
184
+ for_print = tokenizer.decode(inp[0])
185
+ cur_text = tokenizer.decode(inp[0],skip_special_tokens=True)
186
+ if best_clip_score < current_clip_score.cpu().item():
187
+ best_clip_score = current_clip_score.cpu().item()
188
+ best_caption = cur_text
189
+ gen_texts.append(cur_text)
190
+ logger.info(f"iter {iter_num + 1}, clip score {current_clip_score:.3f}: "+ for_print)
191
+ gen_texts.append(best_caption)
192
+ clip_score_sequence.append(best_clip_score)
193
+
194
+ return gen_texts, clip_score_sequence
195
+
196
+ def random_generation(model, clip, tokenizer,image_instance,token_mask, prompt, logger,
197
+ max_len=15, top_k=0, temperature=None,alpha=0.7,beta=2,
198
+ max_iters=300,print_every=10,batch_size=1,
199
+ verbose=True):
200
+ """ Generate for one random position at a timestep"""
201
+
202
+ seed_len = len(prompt.split())+1
203
+ batch = get_init_text(tokenizer, prompt, max_len, batch_size)
204
+ image_embeds = clip.compute_image_representation_from_image_instance(image_instance)
205
+ clip_score_sequence = []
206
+ best_clip_score = 0
207
+ inp = torch.tensor(batch).to(image_embeds.device)
208
+ gen_texts = []
209
+ for ii in range(max_iters):
210
+ kk = np.random.randint(0, max_len)
211
+ token_mask = update_token_mask(tokenizer, token_mask, max_len, kk)
212
+ for jj in range(batch_size):
213
+ inp[jj][seed_len + kk] = tokenizer.mask_token_id
214
+ inp_ = inp.clone().detach()
215
+ out = model(inp).logits
216
+ probs, idxs = generate_caption_step(out,gen_idx=seed_len + kk,mask=token_mask, top_k=top_k, temperature=temperature)
217
+ for jj in range(batch_size):
218
+ topk_inp = inp_.repeat(top_k, 1)
219
+ topk_inp[:, kk + seed_len] = (idxs[jj] * token_mask[0][idxs[jj]]).long()
220
+ batch_text_list = tokenizer.batch_decode(topk_inp, skip_special_tokens=True)
221
+
222
+ clip_score, clip_ref = clip.compute_image_text_similarity_via_raw_text(image_embeds, batch_text_list)
223
+ final_score = alpha * probs + beta * clip_score
224
+ best_clip_id = final_score.argmax()
225
+
226
+ inp[jj][seed_len + kk] = idxs[jj][best_clip_id]
227
+ current_clip_score = clip_ref[jj][best_clip_id]
228
+ clip_score_sequence.append(current_clip_score.cpu().item())
229
+ if best_clip_score < current_clip_score.cpu().item():
230
+ best_clip_score = current_clip_score.cpu().item()
231
+ best_caption = tokenizer.decode(inp[0], skip_special_tokens=True)
232
+
233
+ if verbose and np.mod(ii + 1, print_every) == 0:
234
+ for_print = tokenizer.decode(inp[0])
235
+ logger.info(f"iter {ii + 1}, clip score {current_clip_score:.3f}: "+for_print)
236
+ cur_text = tokenizer.decode(inp[0], skip_special_tokens=True)
237
+ gen_texts.append(cur_text)
238
+ gen_texts.append(best_caption)
239
+ clip_score_sequence.append(best_clip_score)
240
+
        return gen_texts, clip_score_sequence


def parallel_generation(model, clip, tokenizer, image_instance, token_mask, prompt, logger,
                        max_len=15, top_k=0, temperature=None, alpha=0.1, beta=1,
                        max_iters=300, batch_size=1, print_every=1, verbose=True):
    """Generate candidates for all positions at each time step."""
    seed_len = len(prompt.split()) + 1
    batch = get_init_text(tokenizer, prompt, max_len, batch_size)
    image_embeds = clip.compute_image_representation_from_image_instance(image_instance)
    clip_score_sequence = []
    inp = torch.tensor(batch).to(image_embeds.device)
    gen_texts = []
    best_clip_score = 0
    best_caption = ""  # guard against the score never improving before the final append

    for ii in range(max_iters):
        inp_ = inp.clone().detach()
        out = model(inp).logits
        for kk in range(max_len):
            probs, idxs = generate_caption_step(out, gen_idx=seed_len + kk, mask=token_mask,
                                                top_k=top_k, temperature=temperature)
            for jj in range(batch_size):
                topk_inp = inp_.repeat(top_k, 1)
                # place the top-k candidates at the position currently being scored
                # (kk + seed_len, matching the update of inp below)
                topk_inp[:, kk + seed_len] = (idxs[jj] * token_mask[0][idxs[jj]]).long()
                batch_text_list = tokenizer.batch_decode(topk_inp, skip_special_tokens=True)
                clip_score, clip_ref = clip.compute_image_text_similarity_via_raw_text(image_embeds, batch_text_list)
                final_score = alpha * probs + beta * clip_score
                best_clip_id = final_score.argmax()

                inp[jj][seed_len + kk] = idxs[jj][best_clip_id]
                current_clip_score = clip_ref[jj][best_clip_id]
                clip_score_sequence.append(current_clip_score.cpu().item())

        if verbose and ii % print_every == 0:
            logger.info(f"iter {ii + 1}, clip score {current_clip_score:.3f}: " + tokenizer.decode(inp[0]))
        cur_text = tokenizer.decode(inp[0], skip_special_tokens=True)
        if best_clip_score < current_clip_score.cpu().item():
            best_clip_score = current_clip_score.cpu().item()
            best_caption = cur_text
        gen_texts.append(cur_text)
    gen_texts.append(best_caption)
    clip_score_sequence.append(best_clip_score)

    return gen_texts, clip_score_sequence


def generate_caption(model, clip, tokenizer, image_instance, token_mask, logger,
                     prompt="", batch_size=1, max_len=15,
                     top_k=100, temperature=1.0, max_iter=500, alpha=0.7, beta=1,
                     generate_order="sequential"):
    """Main generation entry point; dispatches on the requested generation order."""
    start_time = time.time()

    if generate_order == "sequential":
        generate_texts, clip_scores = sequential_generation(model, clip, tokenizer, image_instance, token_mask, prompt, logger,
                                                            batch_size=batch_size, max_len=max_len, top_k=top_k,
                                                            alpha=alpha, beta=beta, temperature=temperature,
                                                            max_iters=max_iter)

    elif generate_order == "shuffle":
        generate_texts, clip_scores = shuffle_generation(model, clip, tokenizer, image_instance, token_mask, prompt, logger,
                                                         batch_size=batch_size, max_len=max_len, top_k=top_k,
                                                         alpha=alpha, beta=beta, temperature=temperature, max_iters=max_iter)

    elif generate_order == "random":
        # each iteration touches a single random position, so scale the budget by the length
        max_iter *= max_len
        print_every = max_len
        generate_texts, clip_scores = random_generation(model, clip, tokenizer, image_instance, token_mask, prompt, logger,
                                                        max_len=max_len, top_k=top_k, alpha=alpha, beta=beta, print_every=print_every,
                                                        temperature=temperature, max_iters=max_iter, verbose=True)

    elif generate_order == "span":
        generate_texts, clip_scores = span_generation(model, clip, tokenizer, image_instance, token_mask, prompt, logger,
                                                      batch_size=batch_size, max_len=max_len, top_k=top_k,
                                                      alpha=alpha, beta=beta, temperature=temperature, max_iters=max_iter)

    elif generate_order == "parallel":
        generate_texts, clip_scores = parallel_generation(model, clip, tokenizer, image_instance, token_mask, prompt, logger,
                                                          max_len=max_len, temperature=temperature, top_k=top_k, alpha=alpha, beta=beta,
                                                          max_iters=max_iter, verbose=True)

    logger.info("Finished in %.3fs" % (time.time() - start_time))
    logger.info(f"final caption: {generate_texts[-2]}")
    logger.info(f"best caption: {generate_texts[-1]}")
    return generate_texts, clip_scores
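For orientation, a minimal driver sketch under stated assumptions: the LM name, the image path, and the all-ones token mask are illustrative, and the CLIP wrapper and logger are left as comments because their constructors are not shown in this diff.

# Illustrative sketch only; "bert-base-uncased" and the mask are assumptions.
import torch
from PIL import Image
from transformers import AutoModelForMaskedLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
lm_model = AutoModelForMaskedLM.from_pretrained("bert-base-uncased")
token_mask = torch.ones(1, tokenizer.vocab_size)   # 1 = token may be sampled
image = Image.open("example.jpg")                  # hypothetical path
# clip = ... (CLIP wrapper) and logger = create_logger(...) as elsewhere in the repo
# texts, scores = generate_caption(lm_model, clip, tokenizer, image, token_mask, logger,
#                                  prompt="Image of", generate_order="sequential")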
requirements.txt ADDED
@@ -0,0 +1,3 @@
colorlog
nltk
transformers
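The NLTK calls in sentiments_classifer.py below (tokenizer, POS tagger, SentiWordNet) rely on corpora that ship separately from the pip package; a one-time download along these lines is needed (standard NLTK resource names, not specific to this repo):

import nltk
# tokenizer, POS tagger, and the SentiWordNet/WordNet corpora used below
for pkg in ["punkt", "averaged_perceptron_tagger", "universal_tagset",
            "sentiwordnet", "wordnet", "omw-1.4"]:
    nltk.download(pkg)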
sentiments_classifer.py ADDED
@@ -0,0 +1,51 @@
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import sentiwordnet
import torch
import torch.nn.functional as F


def text_POS_Sentiments_analysis(text, sentiment_ctl=None):
    """
    Tag a sentence and score its sentiment with SentiWordNet.
    WordNet POS ids 0-4 correspond to: none, n (noun), v (verb), a (adjective), r (adverb).
    """
    words = word_tokenize(text)

    word_tag = pos_tag(words)
    res_tag = [tag[1] for tag in word_tag]
    # map Penn Treebank tags to WordNet POS letters
    tag_map = {'NN': 'n', 'NNP': 'n', 'NNPS': 'n', 'NNS': 'n', 'UH': 'n',
               'VB': 'v', 'VBD': 'v', 'VBG': 'v', 'VBN': 'v', 'VBP': 'v', 'VBZ': 'v',
               'JJ': 'a', 'JJR': 'a', 'JJS': 'a',
               'RB': 'r', 'RBR': 'r', 'RBS': 'r', 'RP': 'r', 'WRB': 'r'}

    word_tag = [(t[0], tag_map[t[1]]) if t[1] in tag_map else (t[0], '') for t in word_tag]

    wordnet_tag = [tag[1] for tag in word_tag]
    sentiment_synsets = [list(sentiwordnet.senti_synsets(t[0], t[1])) for t in word_tag]

    if sentiment_ctl is None:
        return 0, res_tag, wordnet_tag
    # per word: average (positive - negative) score over its synsets; sum over words
    score = sum(sum(x.pos_score() - x.neg_score() for x in s) / len(s) for s in sentiment_synsets if len(s) != 0)
    if sentiment_ctl == "negative":
        score = -score
    return score, res_tag, wordnet_tag


def batch_texts_POS_Sentiments_analysis(batch_texts, temperature, device, sentiment_ctl=None):
    batch_size = len(batch_texts)
    senti_scores = torch.zeros(batch_size)
    pos_tags = []
    wordnet_pos_tags = []
    for b_id in range(batch_size):
        text = batch_texts[b_id]
        score, cur_tag, cur_word_tag = text_POS_Sentiments_analysis(text, sentiment_ctl=sentiment_ctl)
        senti_scores[b_id] = score
        pos_tags.append(cur_tag)
        wordnet_pos_tags.append(cur_word_tag)
    final_prob_score = F.softmax(senti_scores / temperature, dim=0).to(device)

    return final_prob_score, senti_scores, pos_tags, wordnet_pos_tags
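A quick usage sketch (the sentences are illustrative; requires the NLTK downloads noted above):

texts = ["A happy dog plays in the sunny park.", "A sad man sits alone in the rain."]
probs, scores, pos_tags, wn_tags = batch_texts_POS_Sentiments_analysis(
    texts, temperature=0.1, device="cpu", sentiment_ctl="positive")
# probs is a softmax over the batch, so the more positive caption receives the
# larger sampling weight; sentiment_ctl="negative" flips the sign of the scores.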
stop_words.txt ADDED
@@ -0,0 +1,2835 @@
1
+ ...
2
+ [unused0]
3
+ [unused1]
4
+ [unused2]
5
+ [unused3]
6
+ [unused4]
7
+ [unused5]
8
+ [unused6]
9
+ [unused7]
10
+ [unused8]
11
+ [unused9]
12
+ [unused10]
13
+ [unused11]
14
+ [unused12]
15
+ [unused13]
16
+ [unused14]
17
+ [unused15]
18
+ [unused16]
19
+ [unused17]
20
+ [unused18]
21
+ [unused19]
22
+ [unused20]
23
+ [unused21]
24
+ [unused22]
25
+ [unused23]
26
+ [unused24]
27
+ [unused25]
28
+ [unused26]
29
+ [unused27]
30
+ [unused28]
31
+ [unused29]
32
+ [unused30]
33
+ [unused31]
34
+ [unused32]
35
+ [unused33]
36
+ [unused34]
37
+ [unused35]
38
+ [unused36]
39
+ [unused37]
40
+ [unused38]
41
+ [unused39]
42
+ [unused40]
43
+ [unused41]
44
+ [unused42]
45
+ [unused43]
46
+ [unused44]
47
+ [unused45]
48
+ [unused46]
49
+ [unused47]
50
+ [unused48]
51
+ [unused49]
52
+ [unused50]
53
+ [unused51]
54
+ [unused52]
55
+ [unused53]
56
+ [unused54]
57
+ [unused55]
58
+ [unused56]
59
+ [unused57]
60
+ [unused58]
61
+ [unused59]
62
+ [unused60]
63
+ [unused61]
64
+ [unused62]
65
+ [unused63]
66
+ [unused64]
67
+ [unused65]
68
+ [unused66]
69
+ [unused67]
70
+ [unused68]
71
+ [unused69]
72
+ [unused70]
73
+ [unused71]
74
+ [unused72]
75
+ [unused73]
76
+ [unused74]
77
+ [unused75]
78
+ [unused76]
79
+ [unused77]
80
+ [unused78]
81
+ [unused79]
82
+ [unused80]
83
+ [unused81]
84
+ [unused82]
85
+ [unused83]
86
+ [unused84]
87
+ [unused85]
88
+ [unused86]
89
+ [unused87]
90
+ [unused88]
91
+ [unused89]
92
+ [unused90]
93
+ [unused91]
94
+ [unused92]
95
+ [unused93]
96
+ [unused94]
97
+ [unused95]
98
+ [unused96]
99
+ [unused97]
100
+ [unused98]
101
+ [unused99]
102
+ [unused100]
103
+ [unused101]
104
+ [unused102]
105
+ [unused103]
106
+ [unused104]
107
+ [unused105]
108
+ [unused106]
109
+ [unused107]
110
+ [unused108]
111
+ [unused109]
112
+ [unused110]
113
+ [unused111]
114
+ [unused112]
115
+ [unused113]
116
+ [unused114]
117
+ [unused115]
118
+ [unused116]
119
+ [unused117]
120
+ [unused118]
121
+ [unused119]
122
+ [unused120]
123
+ [unused121]
124
+ [unused122]
125
+ [unused123]
126
+ [unused124]
127
+ [unused125]
128
+ [unused126]
129
+ [unused127]
130
+ [unused128]
131
+ [unused129]
132
+ [unused130]
133
+ [unused131]
134
+ [unused132]
135
+ [unused133]
136
+ [unused134]
137
+ [unused135]
138
+ [unused136]
139
+ [unused137]
140
+ [unused138]
141
+ [unused139]
142
+ [unused140]
143
+ [unused141]
144
+ [unused142]
145
+ [unused143]
146
+ [unused144]
147
+ [unused145]
148
+ [unused146]
149
+ [unused147]
150
+ [unused148]
151
+ [unused149]
152
+ [unused150]
153
+ [unused151]
154
+ [unused152]
155
+ [unused153]
156
+ [unused154]
157
+ [unused155]
158
+ [unused156]
159
+ [unused157]
160
+ [unused158]
161
+ [unused159]
162
+ [unused160]
163
+ [unused161]
164
+ [unused162]
165
+ [unused163]
166
+ [unused164]
167
+ [unused165]
168
+ [unused166]
169
+ [unused167]
170
+ [unused168]
171
+ [unused169]
172
+ [unused170]
173
+ [unused171]
174
+ [unused172]
175
+ [unused173]
176
+ [unused174]
177
+ [unused175]
178
+ [unused176]
179
+ [unused177]
180
+ [unused178]
181
+ [unused179]
182
+ [unused180]
183
+ [unused181]
184
+ [unused182]
185
+ [unused183]
186
+ [unused184]
187
+ [unused185]
188
+ [unused186]
189
+ [unused187]
190
+ [unused188]
191
+ [unused189]
192
+ [unused190]
193
+ [unused191]
194
+ [unused192]
195
+ [unused193]
196
+ [unused194]
197
+ [unused195]
198
+ [unused196]
199
+ [unused197]
200
+ [unused198]
201
+ [unused199]
202
+ [unused200]
203
+ [unused201]
204
+ [unused202]
205
+ [unused203]
206
+ [unused204]
207
+ [unused205]
208
+ [unused206]
209
+ [unused207]
210
+ [unused208]
211
+ [unused209]
212
+ [unused210]
213
+ [unused211]
214
+ [unused212]
215
+ [unused213]
216
+ [unused214]
217
+ [unused215]
218
+ [unused216]
219
+ [unused217]
220
+ [unused218]
221
+ [unused219]
222
+ [unused220]
223
+ [unused221]
224
+ [unused222]
225
+ [unused223]
226
+ [unused224]
227
+ [unused225]
228
+ [unused226]
229
+ [unused227]
230
+ [unused228]
231
+ [unused229]
232
+ [unused230]
233
+ [unused231]
234
+ [unused232]
235
+ [unused233]
236
+ [unused234]
237
+ [unused235]
238
+ [unused236]
239
+ [unused237]
240
+ [unused238]
241
+ [unused239]
242
+ [unused240]
243
+ [unused241]
244
+ [unused242]
245
+ [unused243]
246
+ [unused244]
247
+ [unused245]
248
+ [unused246]
249
+ [unused247]
250
+ [unused248]
251
+ [unused249]
252
+ [unused250]
253
+ [unused251]
254
+ [unused252]
255
+ [unused253]
256
+ [unused254]
257
+ [unused255]
258
+ [unused256]
259
+ [unused257]
260
+ [unused258]
261
+ [unused259]
262
+ [unused260]
263
+ [unused261]
264
+ [unused262]
265
+ [unused263]
266
+ [unused264]
267
+ [unused265]
268
+ [unused266]
269
+ [unused267]
270
+ [unused268]
271
+ [unused269]
272
+ [unused270]
273
+ [unused271]
274
+ [unused272]
275
+ [unused273]
276
+ [unused274]
277
+ [unused275]
278
+ [unused276]
279
+ [unused277]
280
+ [unused278]
281
+ [unused279]
282
+ [unused280]
283
+ [unused281]
284
+ [unused282]
285
+ [unused283]
286
+ [unused284]
287
+ [unused285]
288
+ [unused286]
289
+ [unused287]
290
+ [unused288]
291
+ [unused289]
292
+ [unused290]
293
+ [unused291]
294
+ [unused292]
295
+ [unused293]
296
+ [unused294]
297
+ [unused295]
298
+ [unused296]
299
+ [unused297]
300
+ [unused298]
301
+ [unused299]
302
+ [unused300]
303
+ [unused301]
304
+ [unused302]
305
+ [unused303]
306
+ [unused304]
307
+ [unused305]
308
+ [unused306]
309
+ [unused307]
310
+ [unused308]
311
+ [unused309]
312
+ [unused310]
313
+ [unused311]
314
+ [unused312]
315
+ [unused313]
316
+ [unused314]
317
+ [unused315]
318
+ [unused316]
319
+ [unused317]
320
+ [unused318]
321
+ [unused319]
322
+ [unused320]
323
+ [unused321]
324
+ [unused322]
325
+ [unused323]
326
+ [unused324]
327
+ [unused325]
328
+ [unused326]
329
+ [unused327]
330
+ [unused328]
331
+ [unused329]
332
+ [unused330]
333
+ [unused331]
334
+ [unused332]
335
+ [unused333]
336
+ [unused334]
337
+ [unused335]
338
+ [unused336]
339
+ [unused337]
340
+ [unused338]
341
+ [unused339]
342
+ [unused340]
343
+ [unused341]
344
+ [unused342]
345
+ [unused343]
346
+ [unused344]
347
+ [unused345]
348
+ [unused346]
349
+ [unused347]
350
+ [unused348]
351
+ [unused349]
352
+ [unused350]
353
+ [unused351]
354
+ [unused352]
355
+ [unused353]
356
+ [unused354]
357
+ [unused355]
358
+ [unused356]
359
+ [unused357]
360
+ [unused358]
361
+ [unused359]
362
+ [unused360]
363
+ [unused361]
364
+ [unused362]
365
+ [unused363]
366
+ [unused364]
367
+ [unused365]
368
+ [unused366]
369
+ [unused367]
370
+ [unused368]
371
+ [unused369]
372
+ [unused370]
373
+ [unused371]
374
+ [unused372]
375
+ [unused373]
376
+ [unused374]
377
+ [unused375]
378
+ [unused376]
379
+ [unused377]
380
+ [unused378]
381
+ [unused379]
382
+ [unused380]
383
+ [unused381]
384
+ [unused382]
385
+ [unused383]
386
+ [unused384]
387
+ [unused385]
388
+ [unused386]
389
+ [unused387]
390
+ [unused388]
391
+ [unused389]
392
+ [unused390]
393
+ [unused391]
394
+ [unused392]
395
+ [unused393]
396
+ [unused394]
397
+ [unused395]
398
+ [unused396]
399
+ [unused397]
400
+ [unused398]
401
+ [unused399]
402
+ [unused400]
403
+ [unused401]
404
+ [unused402]
405
+ [unused403]
406
+ [unused404]
407
+ [unused405]
408
+ [unused406]
409
+ [unused407]
410
+ [unused408]
411
+ [unused409]
412
+ [unused410]
413
+ [unused411]
414
+ [unused412]
415
+ [unused413]
416
+ [unused414]
417
+ [unused415]
418
+ [unused416]
419
+ [unused417]
420
+ [unused418]
421
+ [unused419]
422
+ [unused420]
423
+ [unused421]
424
+ [unused422]
425
+ [unused423]
426
+ [unused424]
427
+ [unused425]
428
+ [unused426]
429
+ [unused427]
430
+ [unused428]
431
+ [unused429]
432
+ [unused430]
433
+ [unused431]
434
+ [unused432]
435
+ [unused433]
436
+ [unused434]
437
+ [unused435]
438
+ [unused436]
439
+ [unused437]
440
+ [unused438]
441
+ [unused439]
442
+ [unused440]
443
+ [unused441]
444
+ [unused442]
445
+ [unused443]
446
+ [unused444]
447
+ [unused445]
448
+ [unused446]
449
+ [unused447]
450
+ [unused448]
451
+ [unused449]
452
+ [unused450]
453
+ [unused451]
454
+ [unused452]
455
+ [unused453]
456
+ [unused454]
457
+ [unused455]
458
+ [unused456]
459
+ [unused457]
460
+ [unused458]
461
+ [unused459]
462
+ [unused460]
463
+ [unused461]
464
+ [unused462]
465
+ [unused463]
466
+ [unused464]
467
+ [unused465]
468
+ [unused466]
469
+ [unused467]
470
+ [unused468]
471
+ [unused469]
472
+ [unused470]
473
+ [unused471]
474
+ [unused472]
475
+ [unused473]
476
+ [unused474]
477
+ [unused475]
478
+ [unused476]
479
+ [unused477]
480
+ [unused478]
481
+ [unused479]
482
+ [unused480]
483
+ [unused481]
484
+ [unused482]
485
+ [unused483]
486
+ [unused484]
487
+ [unused485]
488
+ [unused486]
489
+ [unused487]
490
+ [unused488]
491
+ [unused489]
492
+ [unused490]
493
+ [unused491]
494
+ [unused492]
495
+ [unused493]
496
+ [unused494]
497
+ [unused495]
498
+ [unused496]
499
+ [unused497]
500
+ [unused498]
501
+ [unused499]
502
+ [unused500]
503
+ [unused501]
504
+ [unused502]
505
+ [unused503]
506
+ [unused504]
507
+ [unused505]
508
+ [unused506]
509
+ [unused507]
510
+ [unused508]
511
+ [unused509]
512
+ [unused510]
513
+ [unused511]
514
+ [unused512]
515
+ [unused513]
516
+ [unused514]
517
+ [unused515]
518
+ [unused516]
519
+ [unused517]
520
+ [unused518]
521
+ [unused519]
522
+ [unused520]
523
+ [unused521]
524
+ [unused522]
525
+ [unused523]
526
+ [unused524]
527
+ [unused525]
528
+ [unused526]
529
+ [unused527]
530
+ [unused528]
531
+ [unused529]
532
+ [unused530]
533
+ [unused531]
534
+ [unused532]
535
+ [unused533]
536
+ [unused534]
537
+ [unused535]
538
+ [unused536]
539
+ [unused537]
540
+ [unused538]
541
+ [unused539]
542
+ [unused540]
543
+ [unused541]
544
+ [unused542]
545
+ [unused543]
546
+ [unused544]
547
+ [unused545]
548
+ [unused546]
549
+ [unused547]
550
+ [unused548]
551
+ [unused549]
552
+ [unused550]
553
+ [unused551]
554
+ [unused552]
555
+ [unused553]
556
+ [unused554]
557
+ [unused555]
558
+ [unused556]
559
+ [unused557]
560
+ [unused558]
561
+ [unused559]
562
+ [unused560]
563
+ [unused561]
564
+ [unused562]
565
+ [unused563]
566
+ [unused564]
567
+ [unused565]
568
+ [unused566]
569
+ [unused567]
570
+ [unused568]
571
+ [unused569]
572
+ [unused570]
573
+ [unused571]
574
+ [unused572]
575
+ [unused573]
576
+ [unused574]
577
+ [unused575]
578
+ [unused576]
579
+ [unused577]
580
+ [unused578]
581
+ [unused579]
582
+ [unused580]
583
+ [unused581]
584
+ [unused582]
585
+ [unused583]
586
+ [unused584]
587
+ [unused585]
588
+ [unused586]
589
+ [unused587]
590
+ [unused588]
591
+ [unused589]
592
+ [unused590]
593
+ [unused591]
594
+ [unused592]
595
+ [unused593]
596
+ [unused594]
597
+ [unused595]
598
+ [unused596]
599
+ [unused597]
600
+ [unused598]
601
+ [unused599]
602
+ [unused600]
603
+ [unused601]
604
+ [unused602]
605
+ [unused603]
606
+ [unused604]
607
+ [unused605]
608
+ [unused606]
609
+ [unused607]
610
+ [unused608]
611
+ [unused609]
612
+ [unused610]
613
+ [unused611]
614
+ [unused612]
615
+ [unused613]
616
+ [unused614]
617
+ [unused615]
618
+ [unused616]
619
+ [unused617]
620
+ [unused618]
621
+ [unused619]
622
+ [unused620]
623
+ [unused621]
624
+ [unused622]
625
+ [unused623]
626
+ [unused624]
627
+ [unused625]
628
+ [unused626]
629
+ [unused627]
630
+ [unused628]
631
+ [unused629]
632
+ [unused630]
633
+ [unused631]
634
+ [unused632]
635
+ [unused633]
636
+ [unused634]
637
+ [unused635]
638
+ [unused636]
639
+ [unused637]
640
+ [unused638]
641
+ [unused639]
642
+ [unused640]
643
+ [unused641]
644
+ [unused642]
645
+ [unused643]
646
+ [unused644]
647
+ [unused645]
648
+ [unused646]
649
+ [unused647]
650
+ [unused648]
651
+ [unused649]
652
+ [unused650]
653
+ [unused651]
654
+ [unused652]
655
+ [unused653]
656
+ [unused654]
657
+ [unused655]
658
+ [unused656]
659
+ [unused657]
660
+ [unused658]
661
+ [unused659]
662
+ [unused660]
663
+ [unused661]
664
+ [unused662]
665
+ [unused663]
666
+ [unused664]
667
+ [unused665]
668
+ [unused666]
669
+ [unused667]
670
+ [unused668]
671
+ [unused669]
672
+ [unused670]
673
+ [unused671]
674
+ [unused672]
675
+ [unused673]
676
+ [unused674]
677
+ [unused675]
678
+ [unused676]
679
+ [unused677]
680
+ [unused678]
681
+ [unused679]
682
+ [unused680]
683
+ [unused681]
684
+ [unused682]
685
+ [unused683]
686
+ [unused684]
687
+ [unused685]
688
+ [unused686]
689
+ [unused687]
690
+ [unused688]
691
+ [unused689]
692
+ [unused690]
693
+ [unused691]
694
+ [unused692]
695
+ [unused693]
696
+ [unused694]
697
+ [unused695]
698
+ [unused696]
699
+ [unused697]
700
+ [unused698]
701
+ [unused699]
702
+ [unused700]
703
+ [unused701]
704
+ [unused702]
705
+ [unused703]
706
+ [unused704]
707
+ [unused705]
708
+ [unused706]
709
+ [unused707]
710
+ [unused708]
711
+ [unused709]
712
+ [unused710]
713
+ [unused711]
714
+ [unused712]
715
+ [unused713]
716
+ [unused714]
717
+ [unused715]
718
+ [unused716]
719
+ [unused717]
720
+ [unused718]
721
+ [unused719]
722
+ [unused720]
723
+ [unused721]
724
+ [unused722]
725
+ [unused723]
726
+ [unused724]
727
+ [unused725]
728
+ [unused726]
729
+ [unused727]
730
+ [unused728]
731
+ [unused729]
732
+ [unused730]
733
+ [unused731]
734
+ [unused732]
735
+ [unused733]
736
+ [unused734]
737
+ [unused735]
738
+ [unused736]
739
+ [unused737]
740
+ [unused738]
741
+ [unused739]
742
+ [unused740]
743
+ [unused741]
744
+ [unused742]
745
+ [unused743]
746
+ [unused744]
747
+ [unused745]
748
+ [unused746]
749
+ [unused747]
750
+ [unused748]
751
+ [unused749]
752
+ [unused750]
753
+ [unused751]
754
+ [unused752]
755
+ [unused753]
756
+ [unused754]
757
+ [unused755]
758
+ [unused756]
759
+ [unused757]
760
+ [unused758]
761
+ [unused759]
762
+ [unused760]
763
+ [unused761]
764
+ [unused762]
765
+ [unused763]
766
+ [unused764]
767
+ [unused765]
768
+ [unused766]
769
+ [unused767]
770
+ [unused768]
771
+ [unused769]
772
+ [unused770]
773
+ [unused771]
774
+ [unused772]
775
+ [unused773]
776
+ [unused774]
777
+ [unused775]
778
+ [unused776]
779
+ [unused777]
780
+ [unused778]
781
+ [unused779]
782
+ [unused780]
783
+ [unused781]
784
+ [unused782]
785
+ [unused783]
786
+ [unused784]
787
+ [unused785]
788
+ [unused786]
789
+ [unused787]
790
+ [unused788]
791
+ [unused789]
792
+ [unused790]
793
+ [unused791]
794
+ [unused792]
795
+ [unused793]
796
+ [unused794]
797
+ [unused795]
798
+ [unused796]
799
+ [unused797]
800
+ [unused798]
801
+ [unused799]
802
+ [unused800]
803
+ [unused801]
804
+ [unused802]
805
+ [unused803]
806
+ [unused804]
807
+ [unused805]
808
+ [unused806]
809
+ [unused807]
810
+ [unused808]
811
+ [unused809]
812
+ [unused810]
813
+ [unused811]
814
+ [unused812]
815
+ [unused813]
816
+ [unused814]
817
+ [unused815]
818
+ [unused816]
819
+ [unused817]
820
+ [unused818]
821
+ [unused819]
822
+ [unused820]
823
+ [unused821]
824
+ [unused822]
825
+ [unused823]
826
+ [unused824]
827
+ [unused825]
828
+ [unused826]
829
+ [unused827]
830
+ [unused828]
831
+ [unused829]
832
+ [unused830]
833
+ [unused831]
834
+ [unused832]
835
+ [unused833]
836
+ [unused834]
837
+ [unused835]
838
+ [unused836]
839
+ [unused837]
840
+ [unused838]
841
+ [unused839]
842
+ [unused840]
843
+ [unused841]
844
+ [unused842]
845
+ [unused843]
846
+ [unused844]
847
+ [unused845]
848
+ [unused846]
849
+ [unused847]
850
+ [unused848]
851
+ [unused849]
852
+ [unused850]
853
+ [unused851]
854
+ [unused852]
855
+ [unused853]
856
+ [unused854]
857
+ [unused855]
858
+ [unused856]
859
+ [unused857]
860
+ [unused858]
861
+ [unused859]
862
+ [unused860]
863
+ [unused861]
864
+ [unused862]
865
+ [unused863]
866
+ [unused864]
867
+ [unused865]
868
+ [unused866]
869
+ [unused867]
870
+ [unused868]
871
+ [unused869]
872
+ [unused870]
873
+ [unused871]
874
+ [unused872]
875
+ [unused873]
876
+ [unused874]
877
+ [unused875]
878
+ [unused876]
879
+ [unused877]
880
+ [unused878]
881
+ [unused879]
882
+ [unused880]
883
+ [unused881]
884
+ [unused882]
885
+ [unused883]
886
+ [unused884]
887
+ [unused885]
888
+ [unused886]
889
+ [unused887]
890
+ [unused888]
891
+ [unused889]
892
+ [unused890]
893
+ [unused891]
894
+ [unused892]
895
+ [unused893]
896
+ [unused894]
897
+ [unused895]
898
+ [unused896]
899
+ [unused897]
900
+ [unused898]
901
+ [unused899]
902
+ [unused900]
903
+ [unused901]
904
+ [unused902]
905
+ [unused903]
906
+ [unused904]
907
+ [unused905]
908
+ [unused906]
909
+ [unused907]
910
+ [unused908]
911
+ [unused909]
912
+ [unused910]
913
+ [unused911]
914
+ [unused912]
915
+ [unused913]
916
+ [unused914]
917
+ [unused915]
918
+ [unused916]
919
+ [unused917]
920
+ [unused918]
921
+ [unused919]
922
+ [unused920]
923
+ [unused921]
924
+ [unused922]
925
+ [unused923]
926
+ [unused924]
927
+ [unused925]
928
+ [unused926]
929
+ [unused927]
930
+ [unused928]
931
+ [unused929]
932
+ [unused930]
933
+ [unused931]
934
+ [unused932]
935
+ [unused933]
936
+ [unused934]
937
+ [unused935]
938
+ [unused936]
939
+ [unused937]
940
+ [unused938]
941
+ [unused939]
942
+ [unused940]
943
+ [unused941]
944
+ [unused942]
945
+ [unused943]
946
+ [unused944]
947
+ [unused945]
948
+ [unused946]
949
+ [unused947]
950
+ [unused948]
951
+ [unused949]
952
+ [unused950]
953
+ [unused951]
954
+ [unused952]
955
+ [unused953]
956
+ [unused954]
957
+ [unused955]
958
+ [unused956]
959
+ [unused957]
960
+ [unused958]
961
+ [unused959]
962
+ [unused960]
963
+ [unused961]
964
+ [unused962]
965
+ [unused963]
966
+ [unused964]
967
+ [unused965]
968
+ [unused966]
969
+ [unused967]
970
+ [unused968]
971
+ [unused969]
972
+ [unused970]
973
+ [unused971]
974
+ [unused972]
975
+ [unused973]
976
+ [unused974]
977
+ [unused975]
978
+ [unused976]
979
+ [unused977]
980
+ [unused978]
981
+ [unused979]
982
+ [unused980]
983
+ [unused981]
984
+ [unused982]
985
+ [unused983]
986
+ [unused984]
987
+ [unused985]
988
+ [unused986]
989
+ [unused987]
990
+ [unused988]
991
+ [unused989]
992
+ [unused990]
993
+ [unused991]
994
+ [unused992]
995
+ [unused993]
996
+ !
997
+ "
998
+ #
999
+ $
1000
+ %
1001
+ &
1002
+ '
1003
+ (
1004
+ )
1005
+ *
1006
+ +
1007
+ ,
1008
+ -
1009
+ /
1010
+ :
1011
+ ;
1012
+ <
1013
+ =
1014
+ >
1015
+ ?
1016
+ @
1017
+ [
1018
+ \
1019
+ ]
1020
+ ^
1021
+ _
1022
+ `
1023
+ {
1024
+ |
1025
+ }
1026
+ ~
1027
+ ¡
1028
+ ¢
1029
+ £
1030
+ ¤
1031
+ ¥
1032
+ ¦
1033
+ §
1034
+ ¨
1035
+ ©
1036
+ ª
1037
+ «
1038
+ ¬
1039
+ ®
1040
+ °
1041
+ ±
1042
+ ²
1043
+ ³
1044
+ ´
1045
+ µ
1046
+
1047
+ ·
1048
+ ¹
1049
+ º
1050
+ »
1051
+ ¼
1052
+ ½
1053
+ ¾
1054
+ ¿
1055
+ ×
1056
+ ß
1057
+ æ
1058
+ ð
1059
+ ÷
1060
+ ø
1061
+ þ
1062
+ đ
1063
+ ħ
1064
+ ı
1065
+ ł
1066
+ ŋ
1067
+ œ
1068
+ ƒ
1069
+ ɐ
1070
+ ɑ
1071
+ ɒ
1072
+ ɔ
1073
+ ɕ
1074
+ ə
1075
+ ɛ
1076
+ ɡ
1077
+ ɣ
1078
+ ɨ
1079
+ ɪ
1080
+ ɫ
1081
+ ɬ
1082
+ ɯ
1083
+ ɲ
1084
+ ɴ
1085
+ ɹ
1086
+ ɾ
1087
+ ʀ
1088
+ ʁ
1089
+ ʂ
1090
+ ʃ
1091
+ ʉ
1092
+ ʊ
1093
+ ʋ
1094
+ ʌ
1095
+ ʎ
1096
+ ʐ
1097
+ ʑ
1098
+ ʒ
1099
+ ʔ
1100
+ ʰ
1101
+ ʲ
1102
+ ʳ
1103
+ ʷ
1104
+ ʸ
1105
+ ʻ
1106
+ ʼ
1107
+ ʾ
1108
+ ʿ
1109
+ ˈ
1110
+ ː
1111
+ ˡ
1112
+ ˢ
1113
+ ˣ
1114
+ ˤ
1115
+ α
1116
+ β
1117
+ γ
1118
+ δ
1119
+ ε
1120
+ ζ
1121
+ η
1122
+ θ
1123
+ ι
1124
+ κ
1125
+ λ
1126
+ μ
1127
+ ν
1128
+ ξ
1129
+ ο
1130
+ π
1131
+ ρ
1132
+ ς
1133
+ σ
1134
+ τ
1135
+ υ
1136
+ φ
1137
+ χ
1138
+ ψ
1139
+ ω
1140
+ а
1141
+ б
1142
+ в
1143
+ г
1144
+ д
1145
+ е
1146
+ ж
1147
+ з
1148
+ и
1149
+ к
1150
+ л
1151
+ м
1152
+ н
1153
+ о
1154
+ п
1155
+ р
1156
+ с
1157
+ т
1158
+ у
1159
+ ф
1160
+ х
1161
+ ц
1162
+ ч
1163
+ ш
1164
+ щ
1165
+ ъ
1166
+ ы
1167
+ ь
1168
+ э
1169
+ ю
1170
+ я
1171
+ ђ
1172
+ є
1173
+ і
1174
+ ј
1175
+ љ
1176
+ њ
1177
+ ћ
1178
+ ӏ
1179
+ ա
1180
+ բ
1181
+ գ
1182
+ դ
1183
+ ե
1184
+ թ
1185
+ ի
1186
+ լ
1187
+ կ
1188
+ հ
1189
+ մ
1190
+ յ
1191
+ ն
1192
+ ո
1193
+ պ
1194
+ ս
1195
+ վ
1196
+ տ
1197
+ ր
1198
+ ւ
1199
+ ք
1200
+ ־
1201
+ א
1202
+ ב
1203
+ ג
1204
+ ד
1205
+ ה
1206
+ ו
1207
+ ז
1208
+ ח
1209
+ ט
1210
+ י
1211
+ ך
1212
+ כ
1213
+ ל
1214
+ ם
1215
+ מ
1216
+ ן
1217
+ נ
1218
+ ס
1219
+ ע
1220
+ ף
1221
+ פ
1222
+ ץ
1223
+ צ
1224
+ ק
1225
+ ר
1226
+ ש
1227
+ ת
1228
+ ،
1229
+ ء
1230
+ ا
1231
+ ب
1232
+ ة
1233
+ ت
1234
+ ث
1235
+ ج
1236
+ ح
1237
+ خ
1238
+ د
1239
+ ذ
1240
+ ر
1241
+ ز
1242
+ س
1243
+ ش
1244
+ ص
1245
+ ض
1246
+ ط
1247
+ ظ
1248
+ ع
1249
+ غ
1250
+ ـ
1251
+ ف
1252
+ ق
1253
+ ك
1254
+ ل
1255
+ م
1256
+ ن
1257
+ ه
1258
+ و
1259
+ ى
1260
+ ي
1261
+ ٹ
1262
+ پ
1263
+ چ
1264
+ ک
1265
+ گ
1266
+ ں
1267
+ ھ
1268
+ ہ
1269
+ ی
1270
+ ے
1271
+
1272
+
1273
+
1274
+
1275
+
1276
+
1277
+
1278
+
1279
+
1280
+
1281
+
1282
+
1283
+
1284
+
1285
+
1286
+
1287
+
1288
+
1289
+
1290
+
1291
+
1292
+
1293
+
1294
+
1295
+
1296
+
1297
+
1298
+
1299
+
1300
+
1301
+ ि
1302
+
1303
+
1304
+
1305
+
1306
+
1307
+
1308
+
1309
+
1310
+
1311
+
1312
+
1313
+
1314
+
1315
+
1316
+
1317
+
1318
+
1319
+
1320
+
1321
+
1322
+
1323
+
1324
+
1325
+
1326
+
1327
+
1328
+
1329
+
1330
+
1331
+
1332
+
1333
+
1334
+
1335
+
1336
+
1337
+
1338
+
1339
+ ি
1340
+
1341
+
1342
+
1343
+
1344
+
1345
+
1346
+
1347
+
1348
+
1349
+
1350
+
1351
+
1352
+
1353
+
1354
+
1355
+
1356
+ ி
1357
+
1358
+
1359
+
1360
+
1361
+
1362
+
1363
+
1364
+
1365
+
1366
+
1367
+
1368
+
1369
+
1370
+
1371
+
1372
+
1373
+
1374
+
1375
+
1376
+
1377
+
1378
+
1379
+
1380
+
1381
+
1382
+
1383
+
1384
+
1385
+
1386
+
1387
+
1388
+
1389
+
1390
+
1391
+
1392
+
1393
+
1394
+
1395
+
1396
+
1397
+
1398
+
1399
+
1400
+
1401
+
1402
+
1403
+
1404
+
1405
+
1406
+
1407
+
1408
+
1409
+
1410
+
1411
+
1412
+
1413
+
1414
+
1415
+
1416
+
1417
+
1418
+
1419
+
1420
+
1421
+
1422
+
1423
+
1424
+
1425
+
1426
+
1427
+
1428
+
1429
+
1430
+
1431
+
1432
+
1433
+
1434
+
1435
+
1436
+
1437
+
1438
+
1439
+
1440
+
1441
+
1442
+
1443
+
1444
+
1445
+
1446
+
1447
+
1448
+
1449
+
1450
+
1451
+
1452
+
1453
+
1454
+
1455
+
1456
+
1457
+
1458
+
1459
+
1460
+
1461
+
1462
+
1463
+
1464
+
1465
+
1466
+
1467
+
1468
+
1469
+
1470
+
1471
+
1472
+
1473
+
1474
+
1475
+
1476
+
1477
+
1478
+
1479
+
1480
+
1481
+
1482
+
1483
+
1484
+
1485
+
1486
+
1487
+
1488
+
1489
+
1490
+
1491
+
1492
+
1493
+
1494
+
1495
+
1496
+
1497
+
1498
+
1499
+
1500
+
1501
+
1502
+
1503
+
1504
+
1505
+
1506
+
1507
+
1508
+
1509
+
1510
+
1511
+
1512
+
1513
+
1514
+
1515
+
1516
+
1517
+
1518
+
1519
+
1520
+
1521
+
1522
+
1523
+
1524
+
1525
+
1526
+
1527
+
1528
+
1529
+
1530
+
1531
+
1532
+
1533
+
1534
+
1535
+
1536
+
1537
+
1538
+
1539
+
1540
+
1541
+
1542
+
1543
+
1544
+
1545
+
1546
+
1547
+
1548
+
1549
+
1550
+
1551
+
1552
+
1553
+
1554
+
1555
+
1556
+
1557
+
1558
+
1559
+
1560
+
1561
+
1562
+
1563
+
1564
+
1565
+
1566
+
1567
+
1568
+
1569
+
1570
+
1571
+
1572
+
1573
+
1574
+
1575
+
1576
+
1577
+
1578
+
1579
+
1580
+
1581
+
1582
+
1583
+
1584
+
1585
+
1586
+
1587
+
1588
+
1589
+
1590
+
1591
+
1592
+
1593
+
1594
+
1595
+
1596
+
1597
+
1598
+
1599
+
1600
+
1601
+
1602
+
1603
+
1604
+
1605
+
1606
+
1607
+
1608
+
1609
+
1610
+
1611
+
1612
+
1613
+
1614
+
1615
+
1616
+
1617
+
1618
+
1619
+
1620
+
1621
+
1622
+
1623
+
1624
+
1625
+
1626
+
1627
+
1628
+
1629
+
1630
+
1631
+
1632
+
1633
+
1634
+
1635
+
1636
+
1637
+
1638
+
1639
+
1640
+
1641
+
1642
+
1643
+
1644
+
1645
+
1646
+
1647
+
1648
+
1649
+
1650
+
1651
+
1652
+
1653
+
1654
+
1655
+
1656
+
1657
+
1658
+
1659
+
1660
+
1661
+
1662
+
1663
+
1664
+
1665
+
1666
+
1667
+
1668
+
1669
+
1670
+
1671
+
1672
+
1673
+
1674
+
1675
+
1676
+
1677
+
1678
+
1679
+
1680
+
1681
+
1682
+
1683
+
1684
+
1685
+
1686
+
1687
+
1688
+
1689
+
1690
+
1691
+
1692
+
1693
+
1694
+
1695
+
1696
+
1697
+
1698
+
1699
+
1700
+
1701
+
1702
+
1703
+
1704
+
1705
+
1706
+
1707
+
1708
+
1709
+
1710
+
1711
+
1712
+
1713
+
1714
+
1715
+
1716
+
1717
+
1718
+
1719
+
1720
+
1721
+
1722
+
1723
+
1724
+
1725
+
1726
+
1727
+
1728
+
1729
+
1730
+
1731
+
1732
+
1733
+
1734
+
1735
+
1736
+
1737
+
1738
+
1739
+
1740
+
1741
+
1742
+
1743
+
1744
+
1745
+
1746
+
1747
+
1748
+
1749
+
1750
+
1751
+
1752
+
1753
+
1754
+
1755
+
1756
+
1757
+
1758
+
1759
+
1760
+
1761
+
1762
+
1763
+
1764
+
1765
+
1766
+
1767
+
1768
+
1769
+
1770
+
1771
+
1772
+
1773
+
1774
+
1775
+
1776
+
1777
+
1778
+
1779
+
1780
+
1781
+
1782
+
1783
+
1784
+
1785
+
1786
+ 宿
1787
+
1788
+
1789
+
1790
+
1791
+
1792
+
1793
+
1794
+
1795
+
1796
+
1797
+ 巿
1798
+
1799
+
1800
+
1801
+
1802
+ 广
1803
+
1804
+
1805
+
1806
+
1807
+
1808
+
1809
+
1810
+
1811
+
1812
+
1813
+
1814
+
1815
+
1816
+
1817
+
1818
+
1819
+
1820
+
1821
+
1822
+
1823
+
1824
+
1825
+
1826
+
1827
+
1828
+
1829
+
1830
+
1831
+
1832
+
1833
+
1834
+
1835
+
1836
+
1837
+
1838
+
1839
+
1840
+
1841
+
1842
+
1843
+
1844
+
1845
+
1846
+
1847
+
1848
+
1849
+
1850
+
1851
+
1852
+
1853
+
1854
+
1855
+
1856
+
1857
+
1858
+
1859
+
1860
+
1861
+
1862
+
1863
+
1864
+
1865
+
1866
+
1867
+
1868
+
1869
+
1870
+
1871
+
1872
+
1873
+
1874
+
1875
+
1876
+
1877
+
1878
+
1879
+
1880
+
1881
+
1882
+
1883
+
1884
+
1885
+
1886
+
1887
+
1888
+
1889
+
1890
+
1891
+
1892
+
1893
+
1894
+
1895
+
1896
+
1897
+
1898
+
1899
+
1900
+
1901
+
1902
+
1903
+
1904
+
1905
+
1906
+
1907
+ 西
1908
+
1909
+
1910
+
1911
+
1912
+
1913
+
1914
+
1915
+
1916
+
1917
+
1918
+
1919
+
1920
+
1921
+
1922
+
1923
+
1924
+
1925
+
1926
+
1927
+
1928
+
1929
+
1930
+
1931
+
1932
+
1933
+
1934
+
1935
+
1936
+
1937
+
1938
+
1939
+
1940
+
1941
+
1942
+
1943
+
1944
+
1945
+
1946
+
1947
+
1948
+
1949
+
1950
+
1951
+
1952
+
1953
+
1954
+
1955
+ 0
1956
+ 1
1957
+ 2
1958
+ 3
1959
+ 4
1960
+ 5
1961
+ 6
1962
+ 7
1963
+ 8
1964
+ 9
1965
+ ²
1966
+ ³
1967
+ ¹
1968
+
1969
+
1970
+
1971
+
1972
+
1973
+
1974
+
1975
+
1976
+
1977
+
1978
+
1979
+
1980
+
1981
+
1982
+
1983
+
1984
+
1985
+ 10
1986
+ 000
1987
+ 2010
1988
+ 2011
1989
+ 12
1990
+ 2012
1991
+ 2008
1992
+ 2009
1993
+ 2013
1994
+ 2007
1995
+ 2006
1996
+ 2014
1997
+ 15
1998
+ 20
1999
+ 18
2000
+ 2015
2001
+ 11
2002
+ 2016
2003
+ 30
2004
+ 2005
2005
+ 16
2006
+ 14
2007
+ 13
2008
+ 2017
2009
+ 25
2010
+ 2004
2011
+ 2000
2012
+ 17
2013
+ 24
2014
+ 2003
2015
+ 2002
2016
+ 100
2017
+ 21
2018
+ 19
2019
+ 2001
2020
+ 22
2021
+ 23
2022
+ 1999
2023
+ 28
2024
+ 26
2025
+ 27
2026
+ 1998
2027
+ 1997
2028
+ 1996
2029
+ 50
2030
+ 29
2031
+ 2018
2032
+ 1995
2033
+ 1994
2034
+ 1992
2035
+ 1993
2036
+ 31
2037
+ 40
2038
+ 1991
2039
+ 1990
2040
+ 1989
2041
+ 1988
2042
+ 1987
2043
+ 1986
2044
+ 1985
2045
+ 1984
2046
+ 1980
2047
+ 500
2048
+ 1983
2049
+ 1982
2050
+ 1979
2051
+ 1981
2052
+ 200
2053
+ 1972
2054
+ 1976
2055
+ 1978
2056
+ 1974
2057
+ 1975
2058
+ 1977
2059
+ 1970
2060
+ 1968
2061
+ 1973
2062
+ 1945
2063
+ 1971
2064
+ 45
2065
+ 60
2066
+ 1969
2067
+ 1967
2068
+ 35
2069
+ 65
2070
+ 1964
2071
+ 1966
2072
+ 1965
2073
+ 32
2074
+ 1960
2075
+ 1944
2076
+ 1963
2077
+ 1962
2078
+ 1942
2079
+ 80
2080
+ 1961
2081
+ 1943
2082
+ 1956
2083
+ 1958
2084
+ 1959
2085
+ 1941
2086
+ 1940
2087
+ 1948
2088
+ 1957
2089
+ 1939
2090
+ 1946
2091
+ 1950
2092
+ 90
2093
+ 33
2094
+ 70
2095
+ 1955
2096
+ 300
2097
+ 1952
2098
+ 00
2099
+ 1947
2100
+ 44
2101
+ 36
2102
+ 1954
2103
+ 1953
2104
+ 1949
2105
+ 34
2106
+ 1951
2107
+ 64
2108
+ 38
2109
+ 1938
2110
+ 37
2111
+ 1936
2112
+ 1918
2113
+ 400
2114
+ 75
2115
+ 1937
2116
+ 42
2117
+ 1935
2118
+ 1920
2119
+ 39
2120
+ 48
2121
+ 1930
2122
+ 1919
2123
+ 1933
2124
+ 1914
2125
+ 1934
2126
+ 55
2127
+ 1917
2128
+ 41
2129
+ 1929
2130
+ 1928
2131
+ 1932
2132
+ 47
2133
+ 52
2134
+ 43
2135
+ 1931
2136
+ 49
2137
+ 1927
2138
+ 1922
2139
+ 46
2140
+ 1924
2141
+ 1925
2142
+ 51
2143
+ 1912
2144
+ 1926
2145
+ 1921
2146
+ 978
2147
+ 1923
2148
+ 1915
2149
+ 1916
2150
+ 1910
2151
+ 150
2152
+ 1913
2153
+ 54
2154
+ 1900
2155
+ 600
2156
+ 56
2157
+ 1911
2158
+ 53
2159
+ 1908
2160
+ 95
2161
+ 59
2162
+ 800
2163
+ 58
2164
+ 57
2165
+ 1905
2166
+ 08
2167
+ 1906
2168
+ 1907
2169
+ 250
2170
+ 1909
2171
+ 99
2172
+ 85
2173
+ 09
2174
+ 1904
2175
+ 05
2176
+ 07
2177
+ 06
2178
+ 66
2179
+ 1902
2180
+ 1901
2181
+ 1903
2182
+ 62
2183
+ 98
2184
+ 72
2185
+ 04
2186
+ 01
2187
+ 96
2188
+ 97
2189
+ 03
2190
+ 120
2191
+ 1898
2192
+ 88
2193
+ 61
2194
+ 93
2195
+ 76
2196
+ 67
2197
+ 1899
2198
+ 02
2199
+ 63
2200
+ 1890
2201
+ 91
2202
+ 92
2203
+ 77
2204
+ 68
2205
+ 78
2206
+ 81
2207
+ 1895
2208
+ 1896
2209
+ 1897
2210
+ 700
2211
+ 69
2212
+ 74
2213
+ 94
2214
+ 71
2215
+ 84
2216
+ 73
2217
+ 82
2218
+ 1889
2219
+ 89
2220
+ 1893
2221
+ 1892
2222
+ 79
2223
+ 1894
2224
+ 86
2225
+ 1885
2226
+ 87
2227
+ 1891
2228
+ 83
2229
+ 1888
2230
+ 1000
2231
+ 1864
2232
+ 1865
2233
+ 1880
2234
+ 1887
2235
+ 1861
2236
+ 1862
2237
+ 1863
2238
+ 1886
2239
+ 1870
2240
+ 1884
2241
+ 1881
2242
+ 1882
2243
+ 1883
2244
+ 1878
2245
+ 110
2246
+ 1860
2247
+ 1876
2248
+ 1871
2249
+ 1879
2250
+ 1875
2251
+ 1867
2252
+ 1877
2253
+ 130
2254
+ 1872
2255
+ 1868
2256
+ 1874
2257
+ 1873
2258
+ 1866
2259
+ 900
2260
+ 1869
2261
+ 101
2262
+ 1850
2263
+ 1848
2264
+ 160
2265
+ 1859
2266
+ 1857
2267
+ 180
2268
+ 1854
2269
+ 1855
2270
+ 1858
2271
+ 140
2272
+ 350
2273
+ 1856
2274
+ 125
2275
+ 105
2276
+ 1852
2277
+ 1851
2278
+ 1840
2279
+ 1853
2280
+ 1849
2281
+ 1847
2282
+ 1846
2283
+ 102
2284
+ 360
2285
+ 1830
2286
+ 1845
2287
+ 104
2288
+ 750
2289
+ 1837
2290
+ 1844
2291
+ 103
2292
+ 1800
2293
+ 1841
2294
+ 1812
2295
+ 1838
2296
+ 1842
2297
+ 1839
2298
+ 1843
2299
+ 1836
2300
+ 106
2301
+ 1835
2302
+ 1832
2303
+ 450
2304
+ 1500
2305
+ 2019
2306
+ 220
2307
+ 107
2308
+ 115
2309
+ 1815
2310
+ 1834
2311
+ 108
2312
+ 170
2313
+ 1831
2314
+ 1814
2315
+ 1833
2316
+ 1820
2317
+ 111
2318
+ 112
2319
+ 240
2320
+ 1825
2321
+ 135
2322
+ 1828
2323
+ 109
2324
+ 1829
2325
+ 1824
2326
+ 1821
2327
+ 1810
2328
+ 230
2329
+ 190
2330
+ 128
2331
+ 3000
2332
+ 1826
2333
+ 1818
2334
+ 113
2335
+ 1813
2336
+ 1822
2337
+ 1827
2338
+ 1816
2339
+ 1793
2340
+ 1801
2341
+ 114
2342
+ 1806
2343
+ 1823
2344
+ 1817
2345
+ 1819
2346
+ 117
2347
+ 121
2348
+ 2020
2349
+ 1803
2350
+ 1809
2351
+ 175
2352
+ 210
2353
+ 116
2354
+ 118
2355
+ 127
2356
+ 1798
2357
+ 1808
2358
+ 1811
2359
+ 122
2360
+ 1805
2361
+ 123
2362
+ 1804
2363
+ 1794
2364
+ 1807
2365
+ 550
2366
+ 119
2367
+ 1790
2368
+ 1795
2369
+ 124
2370
+ 1792
2371
+ 280
2372
+ 5000
2373
+ 1802
2374
+ 260
2375
+ 320
2376
+ 1789
2377
+ 145
2378
+ 270
2379
+ 650
2380
+ 1799
2381
+ 1796
2382
+ 165
2383
+ 1776
2384
+ 126
2385
+ 132
2386
+ 1797
2387
+ 155
2388
+ 330
2389
+ 1775
2390
+ 1791
2391
+ 129
2392
+ 133
2393
+ 131
2394
+ 144
2395
+ 1200
2396
+ 1600
2397
+ 137
2398
+ 225
2399
+ 152
2400
+ 138
2401
+ 1780
2402
+ 134
2403
+ 1783
2404
+ 185
2405
+ 136
2406
+ 141
2407
+ 1788
2408
+ 850
2409
+ 340
2410
+ 1787
2411
+ 143
2412
+ 142
2413
+ 1777
2414
+ 501
2415
+ 205
2416
+ 1778
2417
+ 146
2418
+ 201
2419
+ 370
2420
+ 148
2421
+ 147
2422
+ 1784
2423
+ 151
2424
+ 1700
2425
+ 139
2426
+ 154
2427
+ 153
2428
+ 156
2429
+ 167
2430
+ 1781
2431
+ 202
2432
+ 1758
2433
+ 1782
2434
+ 168
2435
+ 380
2436
+ 310
2437
+ 290
2438
+ 1785
2439
+ 460
2440
+ 256
2441
+ 480
2442
+ 195
2443
+ 149
2444
+ 161
2445
+ 157
2446
+ 215
2447
+ 440
2448
+ 1786
2449
+ 420
2450
+ 1772
2451
+ 275
2452
+ 1774
2453
+ 192
2454
+ 1779
2455
+ 182
2456
+ 158
2457
+ 1770
2458
+ 235
2459
+ 162
2460
+ 163
2461
+ 164
2462
+ 1660
2463
+ 375
2464
+ 177
2465
+ 212
2466
+ 1750
2467
+ 171
2468
+ 172
2469
+ 1763
2470
+ 208
2471
+ 203
2472
+ 176
2473
+ 169
2474
+ 181
2475
+ 166
2476
+ 183
2477
+ 206
2478
+ 159
2479
+ 222
2480
+ 1760
2481
+ 188
2482
+ 301
2483
+ 410
2484
+ 211
2485
+ 178
2486
+ 365
2487
+ 209
2488
+ 173
2489
+ 187
2490
+ 174
2491
+ 1300
2492
+ 430
2493
+ 221
2494
+ 186
2495
+ 520
2496
+ 204
2497
+ 325
2498
+ 184
2499
+ 224
2500
+ 640
2501
+ 1768
2502
+ 610
2503
+ 207
2504
+ 191
2505
+ 213
2506
+ 1773
2507
+ 214
2508
+ 194
2509
+ 197
2510
+ 193
2511
+ 303
2512
+ 911
2513
+ 198
2514
+ 390
2515
+ 196
2516
+ 4000
2517
+ 540
2518
+ 216
2519
+ 231
2520
+ 179
2521
+ 950
2522
+ 217
2523
+ 305
2524
+ 189
2525
+ 265
2526
+ 219
2527
+ 255
2528
+ 1400
2529
+ 1769
2530
+ 232
2531
+ 1771
2532
+ 199
2533
+ 218
2534
+ 1765
2535
+ 223
2536
+ 1762
2537
+ 660
2538
+ 245
2539
+ 226
2540
+ 312
2541
+ 470
2542
+ 333
2543
+ 560
2544
+ 1761
2545
+ 1766
2546
+ 1755
2547
+ 1764
2548
+ 227
2549
+ 1767
2550
+ 1640
2551
+ 264
2552
+ 1759
2553
+ 295
2554
+ 1740
2555
+ 285
2556
+ 1745
2557
+ 1650
2558
+ 262
2559
+ 234
2560
+ 238
2561
+ 302
2562
+ 737
2563
+ 1100
2564
+ 233
2565
+ 254
2566
+ 228
2567
+ 490
2568
+ 241
2569
+ 1756
2570
+ 246
2571
+ 242
2572
+ 1648
2573
+ 251
2574
+ 1754
2575
+ 1715
2576
+ 1757
2577
+ 401
2578
+ 1689
2579
+ 229
2580
+ 625
2581
+ 720
2582
+ 243
2583
+ 252
2584
+ 315
2585
+ 281
2586
+ 313
2587
+ 287
2588
+ 253
2589
+ 1730
2590
+ 425
2591
+ 237
2592
+ 247
2593
+ 510
2594
+ 1644
2595
+ 530
2596
+ 311
2597
+ 1720
2598
+ 236
2599
+ 630
2600
+ 620
2601
+ 249
2602
+ 239
2603
+ 580
2604
+ 322
2605
+ 345
2606
+ 1753
2607
+ 1710
2608
+ 304
2609
+ 802
2610
+ 680
2611
+ 316
2612
+ 405
2613
+ 321
2614
+ 1661
2615
+ 1642
2616
+ 1688
2617
+ 435
2618
+ 244
2619
+ 272
2620
+ 308
2621
+ 1620
2622
+ 257
2623
+ 258
2624
+ 512
2625
+ 335
2626
+ 385
2627
+ 1751
2628
+ 261
2629
+ 1748
2630
+ 1746
2631
+ 1747
2632
+ 307
2633
+ 248
2634
+ 1680
2635
+ 306
2636
+ 760
2637
+ 395
2638
+ 415
2639
+ 1749
2640
+ 278
2641
+ 1752
2642
+ 1690
2643
+ 404
2644
+ 288
2645
+ 570
2646
+ 286
2647
+ 1630
2648
+ 1707
2649
+ 309
2650
+ 1685
2651
+ 271
2652
+ 2500
2653
+ 276
2654
+ 268
2655
+ 266
2656
+ 590
2657
+ 259
2658
+ 980
2659
+ 1714
2660
+ 263
2661
+ 328
2662
+ 1741
2663
+ 1727
2664
+ 273
2665
+ 747
2666
+ 323
2667
+ 267
2668
+ 283
2669
+ 1643
2670
+ 670
2671
+ 277
2672
+ 274
2673
+ 001
2674
+ 1743
2675
+ 525
2676
+ 1603
2677
+ 1725
2678
+ 2021
2679
+ 1641
2680
+ 1742
2681
+ 269
2682
+ 279
2683
+ 292
2684
+ 1610
2685
+ 1739
2686
+ 740
2687
+ 1744
2688
+ 412
2689
+ 999
2690
+ 1662
2691
+ 299
2692
+ 6000
2693
+ 1701
2694
+ 1735
2695
+ 1645
2696
+ 357
2697
+ 1550
2698
+ 1670
2699
+ 314
2700
+ 1625
2701
+ 282
2702
+ 355
2703
+ 1724
2704
+ 319
2705
+ 1649
2706
+ 1723
2707
+ 317
2708
+ 960
2709
+ 820
2710
+ 1722
2711
+ 1737
2712
+ 1702
2713
+ 1728
2714
+ 880
2715
+ 284
2716
+ 293
2717
+ 521
2718
+ 1718
2719
+ 318
2720
+ 1713
2721
+ 1621
2722
+ 289
2723
+ 291
2724
+ 1675
2725
+ 296
2726
+ 1733
2727
+ 324
2728
+ 298
2729
+ 1672
2730
+ 1708
2731
+ 1734
2732
+ 1666
2733
+ 1683
2734
+ 1635
2735
+ 406
2736
+ 1654
2737
+ 1638
2738
+ 297
2739
+ 356
2740
+ 411
2741
+ 417
2742
+ 1717
2743
+ 331
2744
+ 1540
2745
+ 1732
2746
+ 1667
2747
+ 875
2748
+ 710
2749
+ 1665
2750
+ 1721
2751
+ 910
2752
+ 1704
2753
+ 343
2754
+ 354
2755
+ 1629
2756
+ 338
2757
+ 1679
2758
+ 336
2759
+ 730
2760
+ 1738
2761
+ 441
2762
+ 402
2763
+ 1609
2764
+ 690
2765
+ 840
2766
+ 1622
2767
+ 294
2768
+ 451
2769
+ 1719
2770
+ 326
2771
+ 1736
2772
+ 1086
2773
+ 1605
2774
+ 403
2775
+ 1716
2776
+ 1632
2777
+ 475
2778
+ 1580
2779
+ 1659
2780
+ 1726
2781
+ 341
2782
+ 1703
2783
+ 1656
2784
+ 1655
2785
+ 1731
2786
+ 1729
2787
+ 1711
2788
+ 1712
2789
+ 327
2790
+ 351
2791
+ 1664
2792
+ 337
2793
+ 1634
2794
+ 1624
2795
+ 780
2796
+ 1692
2797
+ 1628
2798
+ 1697
2799
+ 1016
2800
+ 050
2801
+ 1699
2802
+ 1604
2803
+ 1611
2804
+ 1646
2805
+ 1626
2806
+ 1652
2807
+ 870
2808
+ 1570
2809
+ 352
2810
+ 407
2811
+ 1658
2812
+ 505
2813
+ 1709
2814
+ 339
2815
+ 1663
2816
+ 1618
2817
+ 1623
2818
+ 770
2819
+ 1651
2820
+ 1695
2821
+ 1560
2822
+ 1612
2823
+ 422
2824
+ 495
2825
+ 1653
2826
+ 1705
2827
+ 332
2828
+ 381
2829
+ 930
2830
+ 344
2831
+ 421
2832
+ 1682
2833
+ 555
2834
+ 334
2835
+ 329
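The entries above mirror BERT's WordPiece vocabulary (the [unused*] slots, bare punctuation, digits, and years), which suggests the file feeds the token mask that bans these entries during generation. A sketch of that reading, under that assumption (build_token_mask is a hypothetical helper, not defined in this diff):

import torch

def build_token_mask(tokenizer, stop_words_path="stop_words.txt"):
    """Hypothetical loader: zero out vocabulary entries listed in stop_words.txt."""
    with open(stop_words_path, encoding="utf-8") as f:
        stop_words = {line.strip() for line in f if line.strip()}
    mask = torch.ones(1, tokenizer.vocab_size)
    for token, idx in tokenizer.vocab.items():
        if token in stop_words:
            mask[0, idx] = 0  # banned during sampling
    return mask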
utils.py ADDED
@@ -0,0 +1,74 @@
import numpy as np
import os
import logging
import colorlog
import random
import torch


def create_logger(folder, filename):
    log_colors = {
        'DEBUG': 'blue',
        'INFO': 'white',
        'WARNING': 'green',
        'ERROR': 'red',
        'CRITICAL': 'yellow',
    }

    logger = logging.getLogger('ConZIC')
    # colored, message-only console format
    LOGFORMAT = "%(log_color)s%(message)s%(reset)s"
    LOG_LEVEL = logging.DEBUG
    logging.root.setLevel(LOG_LEVEL)
    stream = logging.StreamHandler()
    stream.setLevel(LOG_LEVEL)
    stream.setFormatter(colorlog.ColoredFormatter(LOGFORMAT, datefmt='%d %H:%M', log_colors=log_colors))

    # also print to a log file
    hdlr = logging.FileHandler(os.path.join(folder, filename))
    hdlr.setLevel(LOG_LEVEL)
    hdlr.setFormatter(logging.Formatter("%(message)s"))
    logger.addHandler(hdlr)
    logger.addHandler(stream)
    return logger


def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False


def get_init_text(tokenizer, seed_text, max_len, batch_size=1):
    """Build the initial sentence by padding seed_text with max_len [MASK] tokens."""
    text = seed_text + tokenizer.mask_token * max_len
    ids = tokenizer.encode(text)
    batch = [ids] * batch_size
    return batch


def update_token_mask(tokenizer, token_mask, max_len, index):
    """'.' (full stop) is only allowed at the last token position."""
    if index == max_len - 1:
        token_mask[:, tokenizer.vocab['.']] = 1
    else:
        token_mask[:, tokenizer.vocab['.']] = 0
    return token_mask


def format_output(sample_num, FinalCaption, BestCaption):
    """Join up to five final/best captions, one per line."""
    n = min(max(sample_num, 1), 5)
    return "\n".join(FinalCaption[:n]), "\n".join(BestCaption[:n])
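To see the two sentence helpers above in action (the model name is an assumption; any BERT-style tokenizer with a [MASK] token works):

import torch
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("bert-base-uncased")  # assumed LM
batch = get_init_text(tok, "Image of a", max_len=5, batch_size=2)
print(tok.decode(batch[0]))
# -> [CLS] image of a [MASK] [MASK] [MASK] [MASK] [MASK] [SEP]

mask = torch.ones(1, tok.vocab_size)
mask = update_token_mask(tok, mask, max_len=5, index=4)  # last slot: '.' allowed
mask = update_token_mask(tok, mask, max_len=5, index=2)  # elsewhere: '.' banned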