diff --git a/.gitattributes b/.gitattributes
index c7d9f3332a950355d5a77d85000f05e6f45435ea..338964404899f86fb20b81c252a8c23cbd5dc26e 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -32,3 +32,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text
+MiniGPT_4.pdf filter=lfs diff=lfs merge=lfs -text
diff --git a/CODEOWNERS b/CODEOWNERS
new file mode 100644
index 0000000000000000000000000000000000000000..41016a6bcf7375d0a3fb3115b303791930e93c2d
--- /dev/null
+++ b/CODEOWNERS
@@ -0,0 +1,2 @@
+# Comment line immediately above ownership line is reserved for related gus information. Please be careful while editing.
+#ECCN:Open Source
\ No newline at end of file
diff --git a/LICENSE.txt b/LICENSE.txt
new file mode 100644
index 0000000000000000000000000000000000000000..9ba97919e5b9568c8b9c42ea85251f01049a220e
--- /dev/null
+++ b/LICENSE.txt
@@ -0,0 +1,14 @@
+BSD 3-Clause License
+
+Copyright (c) 2022 Salesforce, Inc.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+
+1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
+
+3. Neither the name of Salesforce.com nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/MANIFEST.in b/MANIFEST.in
new file mode 100644
index 0000000000000000000000000000000000000000..481e101713e7fef33d177aa5f107480c9919a474
--- /dev/null
+++ b/MANIFEST.in
@@ -0,0 +1,7 @@
+recursive-include minigpt4/configs *.yaml *.json
+recursive-include minigpt4/projects *.yaml *.json
+
+recursive-exclude minigpt4/datasets/download_scripts *
+recursive-exclude minigpt4/output *
+
+include requirements.txt
diff --git a/MiniGPT_4.pdf b/MiniGPT_4.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..3ec99b0bf46a595b643e132daff041d77437a8ea
--- /dev/null
+++ b/MiniGPT_4.pdf
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5ef8de6eeefee0dcf33dea53e8de2a884939dc20617362052232e7a223941260
+size 6614913
diff --git a/README.md b/README.md
index 37c851295b02e70829e1089ad0a3c26458f0ce0c..75e26209461a0c2ff662712dca8fc6c19335ae12 100644
--- a/README.md
+++ b/README.md
@@ -1,12 +1,139 @@
----
-title: Minigpt4
-emoji: 🌖
-colorFrom: purple
-colorTo: pink
-sdk: gradio
-sdk_version: 3.24.1
-app_file: app.py
-pinned: false
----
-
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+# MiniGPT-4: Enhancing Vision-language Understanding with Advanced Large Language Models
+[Deyao Zhu](https://tsutikgiau.github.io/)* (On Job Market!), [Jun Chen](https://junchen14.github.io/)* (On Job Market!), Xiaoqian Shen, Xiang Li, and Mohamed Elhoseiny. *Equal Contribution
+
+**King Abdullah University of Science and Technology**
+
+[[Project Website]](https://minigpt-4.github.io/) [[Paper]](MiniGPT_4.pdf) [Online Demo]
+
+
+## Online Demo
+
+Chat with MiniGPT-4 around your images
+
+
+## Examples
+ | | |
+:-------------------------:|:-------------------------:
+![find wild](examples/wop_2.png) | ![write story](examples/ad_2.png)
+![solve problem](examples/fix_1.png) | ![write Poem](examples/rhyme_1.png)
+
+
+
+
+
+## Abstract
+The recent GPT-4 has demonstrated extraordinary multi-modal abilities, such as directly generating websites from handwritten text and identifying humorous elements within images. These features are rarely observed in previous vision-language models. We believe the primary reason for GPT-4's advanced multi-modal generation capabilities lies in the utilization of a more advanced large language model (LLM). To examine this phenomenon, we present MiniGPT-4, which aligns a frozen visual encoder with a frozen LLM, Vicuna, using just one projection layer.
+Our findings reveal that MiniGPT-4 possesses many capabilities similar to those exhibited by GPT-4, such as detailed image description generation and website creation from hand-written drafts. Furthermore, we also observe other emerging capabilities in MiniGPT-4, including writing stories and poems inspired by given images, providing solutions to problems shown in images, teaching users how to cook based on food photos, etc.
+These advanced capabilities can be attributed to the use of a more advanced large language model.
+Furthermore, our method is computationally efficient, as we only train a projection layer using roughly 5 million aligned image-text pairs and an additional 3,500 carefully curated high-quality pairs.
+
+
+
+
+
+
+
+
+## Getting Started
+### Installation
+
+1. Prepare the code and the environment
+
+Git clone our repository, create a Python environment, and activate it via the following commands
+
+```bash
+git clone https://github.com/Vision-CAIR/MiniGPT-4.git
+cd MiniGPT-4
+conda env create -f environment.yml
+conda activate minigpt4
+```
+
+
+2. Prepare the pretrained Vicuna weights
+
+The current version of MiniGPT-4 is built on the v0 version of Vicuna-13B.
+Please refer to their instructions [here](https://huggingface.co/lmsys/vicuna-13b-delta-v0) to obtain the weights.
+The final weights would be in a single folder with the following structure:
+
+```
+vicuna_weights
+├── config.json
+├── generation_config.json
+├── pytorch_model.bin.index.json
+├── pytorch_model-00001-of-00003.bin
+...
+```
+
+Then, set the path to the vicuna weight in the model config file
+[here](minigpt4/configs/models/minigpt4.yaml#L21) at Line 21.
+
+3. Prepare the pretrained MiniGPT-4 checkpoint
+
+To play with our pretrained model, download the pretrained checkpoint
+[here](https://drive.google.com/file/d/1a4zLvaiDBr-36pasffmgpvH5P7CKmpze/view?usp=share_link).
+Then, set the path to the pretrained checkpoint in the evaluation config file
+in [eval_configs/minigpt4.yaml](eval_configs/minigpt4.yaml#L15) at Line 15.
+
+
+
+
+
+### Launching Demo Locally
+
+Try out our demo [demo.py](app.py) on your local machine with your own images by running
+
+```
+python demo.py --cfg-path eval_configs/minigpt4.yaml
+```
+
+
+
+
+
+### Training
+The training of MiniGPT-4 consists of two alignment stages.
+In the first stage, the model is trained using image-text pairs from Laion and CC datasets
+to align the vision and language model. To download and prepare the datasets, please check
+[here](dataset/readme.md).
+After the first stage, the visual features are mapped and can be understood by the language
+model.
+To launch the first stage training, run
+
+```bash
+torchrun --nproc-per-node NUM_GPU train.py --cfg-path train_config/minigpt4_stage1_laion.yaml
+```
+
+In the second stage, we use a small high quality image-text pair dataset created by ourselves
+and convert it to a conversation format to further align MiniGPT-4.
+Our second stage dataset can be downloaded from
+[here](https://drive.google.com/file/d/1RnS0mQJj8YU0E--sfH08scu5-ALxzLNj/view?usp=share_link).
+After the second stage alignment, MiniGPT-4 is able to talk about the image in
+a smooth way.
+To launch the second stage alignment, run
+
+```bash
+torchrun --nproc-per-node NUM_GPU train.py --cfg-path train_config/minigpt4_stage2_align.yaml
+```
+
+
+
+
+
+## Acknowledgement
+
++ [BLIP2](https://huggingface.co/docs/transformers/main/model_doc/blip-2)
++ [Vicuna](https://github.com/lm-sys/FastChat)
+
+
+If you're using MiniGPT-4 in your research or applications, please cite using this BibTeX:
+```bibtex
+@misc{zhu2022minigpt4,
+ title={MiniGPT-4: Enhancing Vision-language Understanding with Advanced Large Language Models},
+ author={Deyao Zhu and Jun Chen and Xiaoqian Shen and Xiang Li and Mohamed Elhoseiny},
+ year={2023},
+}
+```
+
+## License
+This repository is built on [Lavis](https://github.com/salesforce/LAVIS) with BSD 3-Clause License
+[BSD 3-Clause License](LICENSE.txt)
diff --git a/app.py b/app.py
new file mode 100644
index 0000000000000000000000000000000000000000..7e73082911f187477fee7e45cf9ec86ae919cd61
--- /dev/null
+++ b/app.py
@@ -0,0 +1,146 @@
+import argparse
+import os
+import random
+
+import numpy as np
+import torch
+import torch.backends.cudnn as cudnn
+import gradio as gr
+
+from minigpt4.common.config import Config
+from minigpt4.common.dist_utils import get_rank
+from minigpt4.common.registry import registry
+from minigpt4.conversation.conversation import Chat, CONV_VISION
+
+# imports modules for registration
+from minigpt4.datasets.builders import *
+from minigpt4.models import *
+from minigpt4.processors import *
+from minigpt4.runners import *
+from minigpt4.tasks import *
+
+
+def parse_args():
+ parser = argparse.ArgumentParser(description="Demo")
+ parser.add_argument("--cfg-path", type=str, default='eval_configs/minigpt4.yaml', help="path to configuration file.")
+ parser.add_argument(
+ "--options",
+ nargs="+",
+ help="override some settings in the used config, the key-value pair "
+ "in xxx=yyy format will be merged into config file (deprecate), "
+ "change to --cfg-options instead.",
+ )
+ args = parser.parse_args()
+ return args
+
+
+def setup_seeds(config):
+ seed = config.run_cfg.seed + get_rank()
+
+ random.seed(seed)
+ np.random.seed(seed)
+ torch.manual_seed(seed)
+
+ cudnn.benchmark = False
+ cudnn.deterministic = True
+
+
+# ========================================
+# Model Initialization
+# ========================================
+
+print('Initializing Chat')
+cfg = Config(parse_args())
+
+model_config = cfg.model_cfg
+model_cls = registry.get_model_class(model_config.arch)
+model = model_cls.from_config(model_config).to('cuda:0')
+
+vis_processor_cfg = cfg.datasets_cfg.cc_align.vis_processor.train
+vis_processor = registry.get_processor_class(vis_processor_cfg.name).from_config(vis_processor_cfg)
+chat = Chat(model, vis_processor)
+print('Initialization Finished')
+
+# ========================================
+# Gradio Setting
+# ========================================
+
+def gradio_reset(chat_state, img_list):
+ chat_state.messages = []
+ img_list = []
+ return None, gr.update(value=None, interactive=True), gr.update(placeholder='Please upload your image first', interactive=False), gr.update(value="Upload & Start Chat", interactive=True), chat_state, img_list
+
+def upload_img(gr_img, text_input, chat_state):
+ if gr_img is None:
+ return None, None, gr.update(interactive=True)
+ chat_state = CONV_VISION.copy()
+ img_list = []
+ llm_message = chat.upload_img(gr_img, chat_state, img_list)
+ return gr.update(interactive=False), gr.update(interactive=True, placeholder='Type and press Enter'), gr.update(value="Start Chatting", interactive=False), chat_state, img_list
+
+def gradio_ask(user_message, chatbot, chat_state):
+ if len(user_message) == 0:
+ return gr.update(interactive=True, placeholder='Input should not be empty!'), chatbot, chat_state
+ chat.ask(user_message, chat_state)
+ chatbot = chatbot + [[user_message, None]]
+ return '', chatbot, chat_state
+
+
+def gradio_answer(chatbot, chat_state, img_list, num_beams, temperature):
+ llm_message = chat.answer(conv=chat_state, img_list=img_list, max_new_tokens=1000, num_beams=num_beams, temperature=temperature)[0]
+ chatbot[-1][1] = llm_message
+ return chatbot, chat_state, img_list
+
+title = """
Demo of MiniGPT-4
"""
+description = """This is the demo of MiniGPT-4. Upload your images and start chatting!
"""
+article = """Paper: Here
+Code: Here
+Project Page: Here
+"""
+
+#TODO show examples below
+
+with gr.Blocks() as demo:
+ gr.Markdown(title)
+ gr.Markdown(description)
+ gr.Markdown(article)
+
+ with gr.Row():
+ with gr.Column(scale=0.5):
+ image = gr.Image(type="pil")
+ upload_button = gr.Button(value="Upload & Start Chat", interactive=True, variant="primary")
+ clear = gr.Button("Restart")
+
+ num_beams = gr.Slider(
+ minimum=1,
+ maximum=16,
+ value=5,
+ step=1,
+ interactive=True,
+ label="beam search numbers)",
+ )
+
+ temperature = gr.Slider(
+ minimum=0.1,
+ maximum=2.0,
+ value=1.0,
+ step=0.1,
+ interactive=True,
+ label="Temperature",
+ )
+
+
+ with gr.Column():
+ chat_state = gr.State()
+ img_list = gr.State()
+ chatbot = gr.Chatbot(label='MiniGPT-4')
+ text_input = gr.Textbox(label='User', placeholder='Please upload your image first', interactive=False)
+
+ upload_button.click(upload_img, [image, text_input, chat_state], [image, text_input, upload_button, chat_state, img_list])
+
+ text_input.submit(gradio_ask, [text_input, chatbot, chat_state], [text_input, chatbot, chat_state]).then(
+ gradio_answer, [chatbot, chat_state, img_list, num_beams, temperature], [chatbot, chat_state, img_list]
+ )
+ clear.click(gradio_reset, [chat_state, img_list], [chatbot, image, text_input, upload_button, chat_state, img_list], queue=False)
+
+demo.launch(share=True, enable_queue=True)
\ No newline at end of file
diff --git a/create_align_dataset.py b/create_align_dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..16a5de1eecb979937692731dd0f13bc85f99d3fd
--- /dev/null
+++ b/create_align_dataset.py
@@ -0,0 +1,134 @@
+import argparse
+import os
+import json
+from tqdm import tqdm
+import random
+import numpy as np
+from PIL import Image
+import webdataset as wds
+import torch
+from torchvision.datasets import ImageFolder
+import torchvision.transforms as transforms
+
+import openai
+from tenacity import (
+ retry,
+ stop_after_attempt,
+ wait_random_exponential,
+) # for exponential backoff
+
+from minigpt4.common.config import Config
+from minigpt4.common.registry import registry
+from minigpt4.conversation.conversation import Chat
+
+openai.api_key = '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'
+
+
+def prepare_chatgpt_message(task_prompt, paragraph):
+ messages = [{"role": "system", "content": task_prompt},
+ {"role": "user", "content": paragraph}]
+ return messages
+
+
+@retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(6))
+def call_chatgpt(chatgpt_messages, max_tokens=200, model="gpt-3.5-turbo"):
+ response = openai.ChatCompletion.create(model=model, messages=chatgpt_messages, temperature=0.7, max_tokens=max_tokens)
+ reply = response['choices'][0]['message']['content']
+ total_tokens = response['usage']['total_tokens']
+ return reply, total_tokens
+
+
+def main(args):
+
+ print('Initializing Chat')
+ cfg = Config(args)
+
+ model_config = cfg.model_cfg
+ model_cls = registry.get_model_class(model_config.arch)
+ model = model_cls.from_config(model_config).to('cuda:{}'.format(args.device))
+
+ ckpt_path = '/ibex/project/c2133/vicuna_ckpt_test/Vicuna_pretrain_stage2_cc/20230405233_3GPU40kSTEP_MAIN/checkpoint_3.pth'
+ ckpt = torch.load(ckpt_path)
+ msg = model.load_state_dict(ckpt['model'], strict=False)
+
+
+ vis_processor_cfg = cfg.datasets_cfg.cc_combine.vis_processor.train
+ vis_processor = registry.get_processor_class(vis_processor_cfg.name).from_config(vis_processor_cfg)
+
+ text_processor_cfg = cfg.datasets_cfg.laion.text_processor.train
+ text_processor = registry.get_processor_class(text_processor_cfg.name).from_config(text_processor_cfg)
+
+ chat = Chat(model, vis_processor, args.device)
+ print('Initialization Finished')
+
+
+
+ texts = {}
+ negative_list = []
+
+ for i in tqdm(range(args.begin_id, args.end_id)):
+ image = Image.open(os.path.join(args.save_dir, 'image/{}.jpg'.format(i))).convert('RGB')
+
+ fix_prompt = \
+ "Fix the error in the given paragraph. " \
+ "Remove any repeating sentences, meanless characters, not English sentences, and so on." \
+ "Remove unnecessary repetition." \
+ "Rewrite any incomplete sentences." \
+ "Return directly the results WITHOUT explanation." \
+ "Return directly the input paragraph if it is already correct WITHOUT explanation."
+
+ answers = []
+ answer_tokens = 0
+ chat.reset()
+ chat.upload_img(image)
+ chat.ask("Describe this image in detail. Give as many details as possible. Say everything you see.")
+ answer, tokens = chat.answer()
+ answers.append(answer)
+ answer_tokens += tokens
+ if len(answer_tokens) < 80:
+ chat.ask("Continue")
+ answer, answer_token = chat.answer()
+ answers.append(answer)
+ answer_tokens += tokens
+ answer = ' '.join(answers)
+
+ chatgpt_message = prepare_chatgpt_message(fix_prompt, answer)
+ improved_answer, num_token = call_chatgpt(chatgpt_message)
+
+ if 'already correct' in improved_answer:
+ if 'repetition' in improved_answer:
+ continue
+ improved_answer = answer
+ if 'incomplete' in improved_answer or len(improved_answer) < 50:
+ negative_list.append(improved_answer)
+ else:
+ texts[i] = improved_answer
+
+ with open(os.path.join(args.save_dir, "cap_{}_{}.json".format(args.begin_id, args.end_id)), "w") as outfile:
+ # write the dictionary to the file in JSON format
+ json.dump(texts, outfile)
+
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser(description="Create Alignment")
+
+ parser.add_argument("--cfg-path", default='train_config/minigpt4_stage2_align.yaml')
+ parser.add_argument("--save-dir", default="/ibex/project/c2133/blip_dataset/image_alignment")
+ parser.add_argument("--begin-id", type=int)
+ parser.add_argument("--end-id", type=int)
+ parser.add_argument("--device", type=int)
+ parser.add_argument(
+ "--options",
+ nargs="+",
+ help="override some settings in the used config, the key-value pair "
+ "in xxx=yyy format will be merged into config file (deprecate), "
+ "change to --cfg-options instead.",
+ )
+
+ args = parser.parse_args()
+
+ print("begin_id: ", args.begin_id)
+ print("end_id: ", args.end_id)
+ print("device:", args.device)
+
+ main(args)
diff --git a/dataset/convert_cc_sbu.py b/dataset/convert_cc_sbu.py
new file mode 100644
index 0000000000000000000000000000000000000000..8c325ed3afa3ddb81c5535b5a6febc23d3d5ceee
--- /dev/null
+++ b/dataset/convert_cc_sbu.py
@@ -0,0 +1,20 @@
+import json
+import csv
+
+# specify input and output file paths
+input_file = 'ccs_synthetic_filtered_large.json'
+output_file = 'ccs_synthetic_filtered_large.tsv'
+
+# load JSON data from input file
+with open(input_file, 'r') as f:
+ data = json.load(f)
+
+# extract header and data from JSON
+header = data[0].keys()
+rows = [x.values() for x in data]
+
+# write data to TSV file
+with open(output_file, 'w') as f:
+ writer = csv.writer(f, delimiter='\t')
+ writer.writerow(header)
+ writer.writerows(rows)
diff --git a/dataset/convert_laion.py b/dataset/convert_laion.py
new file mode 100644
index 0000000000000000000000000000000000000000..b793579ce276b72a4313bba4f237b8cb0becb294
--- /dev/null
+++ b/dataset/convert_laion.py
@@ -0,0 +1,20 @@
+import json
+import csv
+
+# specify input and output file paths
+input_file = 'laion_synthetic_filtered_large.json'
+output_file = 'laion_synthetic_filtered_large.tsv'
+
+# load JSON data from input file
+with open(input_file, 'r') as f:
+ data = json.load(f)
+
+# extract header and data from JSON
+header = data[0].keys()
+rows = [x.values() for x in data]
+
+# write data to TSV file
+with open(output_file, 'w') as f:
+ writer = csv.writer(f, delimiter='\t')
+ writer.writerow(header)
+ writer.writerows(rows)
diff --git a/dataset/download_cc_sbu.sh b/dataset/download_cc_sbu.sh
new file mode 100644
index 0000000000000000000000000000000000000000..ec102bde61c5a65aafb56545dc3f62d6d6cb4494
--- /dev/null
+++ b/dataset/download_cc_sbu.sh
@@ -0,0 +1,6 @@
+#!/bin/bash
+
+img2dataset --url_list ccs_synthetic_filtered_large.tsv --input_format "tsv"\
+ --url_col "url" --caption_col "caption" --output_format webdataset\
+ --output_folder cc_sbu_dataset --processes_count 16 --thread_count 128 --image_size 256 \
+ --enable_wandb True
diff --git a/dataset/download_laion.sh b/dataset/download_laion.sh
new file mode 100644
index 0000000000000000000000000000000000000000..5de38e2cbbb55083a819374192f2943ec8ead9cd
--- /dev/null
+++ b/dataset/download_laion.sh
@@ -0,0 +1,6 @@
+#!/bin/bash
+
+img2dataset --url_list laion_synthetic_filtered_large.tsv --input_format "tsv"\
+ --url_col "url" --caption_col "caption" --output_format webdataset\
+ --output_folder laion_dataset --processes_count 16 --thread_count 128 --image_size 256 \
+ --enable_wandb True
diff --git a/dataset/readme.md b/dataset/readme.md
new file mode 100644
index 0000000000000000000000000000000000000000..7b94182c482b993e0a91d1f0802d2dc901badedb
--- /dev/null
+++ b/dataset/readme.md
@@ -0,0 +1,92 @@
+## Download the filtered Conceptual Captions, SBU, LAION datasets
+
+### Pre-training datasets download:
+We use the filtered synthetic captions prepared by BLIP. For more details about the dataset, please refer to [BLIP](https://github.com/salesforce/BLIP).
+
+Storing the LAION and CC3M+CC12M+SBU datasets requires about 2.3 TB of disk space.
+
+Image source | Filtered synthetic caption by ViT-L
+--- | :---:
+CC3M+CC12M+SBU | Download
+LAION115M | Download
+
+This will download two json files
+```
+ccs_synthetic_filtered_large.json
+laion_synthetic_filtered_large.json
+```
+
+## prepare the data step-by-step
+
+
+### setup the dataset folder and move the annotation file to the data storage folder
+```
+export MINIGPT4_DATASET=/YOUR/PATH/FOR/LARGE/DATASET/
+mkdir ${MINIGPT4_DATASET}/cc_sbu
+mkdir ${MINIGPT4_DATASET}/laion
+mv ccs_synthetic_filtered_large.json ${MINIGPT4_DATASET}/cc_sbu
+mv laion_synthetic_filtered_large.json ${MINIGPT4_DATASET}/laion
+```
+
+### Copy the scripts to the data storage folder
+```
+cp convert_cc_sbu.py ${MINIGPT4_DATASET}/cc_sbu
+cp download_cc_sbu.sh ${MINIGPT4_DATASET}/cc_sbu
+cp convert_laion.py ${MINIGPT4_DATASET}/laion
+cp download_laion.sh ${MINIGPT4_DATASET}/laion
+```
+
+
+### Convert the laion and cc_sbu annotation file format to be img2dataset format
+```
+cd ${MINIGPT4_DATASET}/cc_sbu
+python convert_cc_sbu.py
+
+cd ${MINIGPT4_DATASET}/laion
+python convert_laion.py
+```
+
+### Download the datasets with img2dataset
+```
+cd ${MINIGPT4_DATASET}/cc_sbu
+sh download_cc_sbu.sh
+cd ${MINIGPT4_DATASET}/laion
+sh download_laion.sh
+```
+
+
+The final dataset structure
+
+```
+.
+├── ${MINIGPT4_DATASET}
+│ ├── cc_sbu
+│ ├── convert_cc_sbu.py
+│ ├── download_cc_sbu.sh
+│ ├── ccs_synthetic_filtered_large.json
+│ ├── ccs_synthetic_filtered_large.tsv
+│ └── cc_sbu_dataset
+│ ├── 00000.tar
+│ ├── 00000.parquet
+│ ...
+│ ├── laion
+│ ├── convert_laion.py
+│ ├── download_laion.sh
+│ ├── laion_synthetic_filtered_large.json
+│ ├── laion_synthetic_filtered_large.tsv
+│ └── laion_dataset
+│ ├── 00000.tar
+│ ├── 00000.parquet
+│ ...
+...
+```
+
+
+## Set up the dataset configuration files
+
+Then, set up the LAION dataset loading path in [here](../minigpt4/configs/datasets/laion/defaults.yaml#L13) at Line 13 as ${MINIGPT4_DATASET}/laion/laion_dataset/{00000..10488}.tar
+
+Then, set up the Conceptual Caption and SBU datasets loading path in [here](../minigpt4/configs/datasets/cc_sbu/defaults.yaml#L13) at Line 13 as ${MINIGPT4_DATASET}/cc_sbu/cc_sbu_dataset/{00000..01255}.tar
+
+
+
diff --git a/demo_dev.ipynb b/demo_dev.ipynb
new file mode 100644
index 0000000000000000000000000000000000000000..5a0138228afa89fae0f7e9180a81b59871b630fe
--- /dev/null
+++ b/demo_dev.ipynb
@@ -0,0 +1,3491 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "id": "1d3a1f40",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import argparse\n",
+ "import os\n",
+ "import random\n",
+ "import requests\n",
+ "from io import BytesIO\n",
+ "\n",
+ "import numpy as np\n",
+ "from PIL import Image\n",
+ "import torch\n",
+ "import torch.backends.cudnn as cudnn\n",
+ "import gradio as gr\n",
+ "\n",
+ "import minigpt4.tasks as tasks\n",
+ "from minigpt4.common.config import Config\n",
+ "from minigpt4.common.dist_utils import get_rank, init_distributed_mode\n",
+ "from minigpt4.common.logger import setup_logger\n",
+ "from minigpt4.common.optims import (\n",
+ " LinearWarmupCosineLRScheduler,\n",
+ " LinearWarmupStepLRScheduler,\n",
+ ")\n",
+ "from minigpt4.common.registry import registry\n",
+ "from minigpt4.common.utils import now\n",
+ "from minigpt4.conversation.conversation import Conversation, SeparatorStyle, StoppingCriteriaList, StoppingCriteriaSub\n",
+ "\n",
+ "# imports modules for registration\n",
+ "from minigpt4.datasets.builders import *\n",
+ "from minigpt4.models import *\n",
+ "from minigpt4.processors import *\n",
+ "from minigpt4.runners import *\n",
+ "from minigpt4.tasks import *"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "id": "af62dac2",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "_StoreAction(option_strings=['--options'], dest='options', nargs='+', const=None, default=None, type=None, choices=None, required=False, help='override some settings in the used config, the key-value pair in xxx=yyy format will be merged into config file (deprecate), change to --cfg-options instead.', metavar=None)"
+ ]
+ },
+ "execution_count": 2,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "parser = argparse.ArgumentParser(description=\"Demo\")\n",
+ "parser.add_argument(\"--cfg-path\", required=True, help=\"path to configuration file.\")\n",
+ "parser.add_argument(\n",
+ " \"--options\",\n",
+ " nargs=\"+\",\n",
+ " help=\"override some settings in the used config, the key-value pair \"\n",
+ " \"in xxx=yyy format will be merged into config file (deprecate), \"\n",
+ " \"change to --cfg-options instead.\",\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "id": "1d50fdae",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Initializing Chat\n",
+ "Loading VIT\n",
+ "Loading VIT Done\n",
+ "Loading Q-Former\n",
+ "Loading Q-Former Done\n",
+ "Loading LLAMA\n"
+ ]
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "ac6a4d59a5bc49b3ba420537c3a7dbc0",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "Loading checkpoint shards: 0%| | 0/3 [00:00, ?it/s]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Loading LLAMA Done\n",
+ "Load 4 training prompts\n",
+ "Prompt Example \n",
+ "###Human: Could you describe the contents of this image for me? ###Assistant: \n",
+ "Load BLIP2-LLM Checkpoint: /ibex/project/c2133/vicuna_ckpt_test/Vicuna_stage3_align/20230412191_laion_ckpt3/checkpoint_1.pth\n",
+ "Initialization Finished\n"
+ ]
+ }
+ ],
+ "source": [
+ "print('Initializing Chat')\n",
+ "cfg = Config(parser.parse_args(['--cfg-path', 'eval_configs/minigpt4.yaml']))\n",
+ "\n",
+ "model_config = cfg.model_cfg\n",
+ "model_cls = registry.get_model_class(model_config.arch)\n",
+ "model = model_cls.from_config(model_config).to('cuda:0')\n",
+ "\n",
+ "vis_processor_cfg = cfg.datasets_cfg.cc_align.vis_processor.train\n",
+ "vis_processor = registry.get_processor_class(vis_processor_cfg.name).from_config(vis_processor_cfg)\n",
+ "\n",
+ "# chat = Chat(model, vis_processor)\n",
+ "print('Initialization Finished')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 462,
+ "id": "ea32d3b0",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "ckpt_path = '/ibex/project/c2133/vicuna_ckpt_test/Vicuna_prompt_stage2_quick_laion/20230411114/checkpoint_0.pth'\n",
+ "# ckpt_path = '/ibex/project/c2133/vicuna_ckpt_test/Vicuna_prompt_stage2_laion/20230410145/checkpoint_0.pth'\n",
+ "ckpt_path = '/ibex/project/c2133/vicuna_ckpt_test/Vicuna_prompt_stage2_cc/20230408151_3GPU50kStep_Multi/checkpoint_19.pth'\n",
+ "ckpt_path = '/ibex/project/c2133/vicuna_ckpt_test/Vicuna_pretrain_stage2_cc/20230405233_3GPU40kSTEP_MAIN/checkpoint_3.pth'\n",
+ "ckpt_path = '/ibex/project/c2133/vicuna_ckpt_test/Vicuna_stage3_align/20230412172/checkpoint_0.pth'\n",
+ "ckpt_path = '/ibex/project/c2133/vicuna_ckpt_test/Vicuna_stage3_align/20230412191_laion_ckpt3/checkpoint_1.pth'\n",
+ "# ckpt_path = '/ibex/project/c2133/vicuna_ckpt_test/Vicuna_stage3_align/20230412203_laion_prompt_ckpt12/checkpoint_1.pth'\n",
+ "# ckpt_path = '/ibex/project/c2133/vicuna_ckpt_test/minigpt4_stage2_align/20230415090/checkpoint_2.pth'\n",
+ "\n",
+ "\n",
+ "ckpt = torch.load(ckpt_path)\n",
+ "msg = model.load_state_dict(ckpt['model'], strict=False)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "id": "41e3a310",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# from transformers.generation.configuration_utils import GenerationConfig\n",
+ "# new_generation_config = GenerationConfig.from_model_config(chat.model.llama_model.config)\n",
+ "# chat.model.llama_model.generation_config\n",
+ "# new_generation_config"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "id": "a1a25602",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "CONV_VISION = Conversation(\n",
+ " system=\"Give the following image: ImageContent. \"\n",
+ " \"You will be able to see the image once I provide it to you. Please answer my questions.\",\n",
+ " roles=(\"Human\", \"Assistant\"),\n",
+ " messages=[],\n",
+ " offset=2,\n",
+ " sep_style=SeparatorStyle.SINGLE,\n",
+ " sep=\"###\",\n",
+ ")\n",
+ "\n",
+ "\n",
+ "class Chat:\n",
+ "    \"\"\"One chat session on top of a vision-language model.\n",
+ "\n",
+ "    Holds a private copy of ``CONV_VISION`` as the running conversation, the\n",
+ "    embeddings of every uploaded image, and the raw decoded text of each answer.\n",
+ "    \"\"\"\n",
+ "\n",
+ "    # Marker in the prompt that get_context_emb() swaps for an image embedding.\n",
+ "    IMG_PLACEHOLDER = '<ImageHere>'\n",
+ "    # Closing tag that terminates an image-only user message.\n",
+ "    IMG_END_TAG = '</Img>'\n",
+ "    # User message appended by upload_img() for every image.\n",
+ "    IMG_MESSAGE = '<Img><ImageHere></Img>'\n",
+ "\n",
+ "    def __init__(self, model, vis_processor, device='cuda:0'):\n",
+ "        \"\"\"Store the model and processor and start an empty conversation.\n",
+ "\n",
+ "        Args:\n",
+ "            model: object exposing ``llama_model``, ``llama_tokenizer`` and ``encode_img``.\n",
+ "            vis_processor: callable applied to a PIL image before encoding.\n",
+ "            device: torch device that inputs and stop-word ids are moved to.\n",
+ "        \"\"\"\n",
+ "        self.device = device\n",
+ "        self.model = model\n",
+ "        self.vis_processor = vis_processor\n",
+ "\n",
+ "        self.conv = CONV_VISION.copy()\n",
+ "        self.img_list = []\n",
+ "        self.raw_answers = []\n",
+ "\n",
+ "        stop_words_ids = [torch.tensor([835]).to(self.device),\n",
+ "                          torch.tensor([2277, 29937]).to(self.device)]  # '###' can be encoded in two different ways.\n",
+ "        self.stopping_criteria = StoppingCriteriaList([StoppingCriteriaSub(stops=stop_words_ids)])\n",
+ "\n",
+ "    def reset(self):\n",
+ "        \"\"\"Drop the conversation history, the uploaded images and the raw answers.\"\"\"\n",
+ "        self.conv.messages = []\n",
+ "        self.img_list = []\n",
+ "        # self.img_list = [img for img in self.conv.system_img]\n",
+ "        self.raw_answers = []\n",
+ "\n",
+ "    def ask(self, text):\n",
+ "        \"\"\"Add a user message, merging it into a pending image-only message if there is one.\"\"\"\n",
+ "        messages = self.conv.messages\n",
+ "        last_is_image = (\n",
+ "            len(messages) > 0\n",
+ "            and messages[-1][0] == self.conv.roles[0]\n",
+ "            and messages[-1][1].endswith(self.IMG_END_TAG)  # last message is image.\n",
+ "        )\n",
+ "        if last_is_image:\n",
+ "            # Keep image and question in one user turn: '<Img>...</Img> question'.\n",
+ "            messages[-1][1] = ' '.join([messages[-1][1], text])\n",
+ "        else:\n",
+ "            self.conv.append_message(self.conv.roles[0], text)\n",
+ "\n",
+ "    def answer(self, max_new_tokens=200, num_beams=5, min_length=1, top_p=0.9,\n",
+ "               repetition_penalty=1.0, length_penalty=1, temperature=1):\n",
+ "        \"\"\"Generate the assistant reply for the current conversation.\n",
+ "\n",
+ "        The keyword arguments are forwarded unchanged to ``llama_model.generate``.\n",
+ "\n",
+ "        Returns:\n",
+ "            Tuple ``(output_text, output_token)``: the cleaned reply string and the\n",
+ "            generated token ids converted to a numpy array.\n",
+ "        \"\"\"\n",
+ "        # Open an empty assistant turn so the prompt ends with the assistant role.\n",
+ "        self.conv.append_message(self.conv.roles[1], None)\n",
+ "        embs = self.get_context_emb()\n",
+ "        outputs = self.model.llama_model.generate(\n",
+ "            inputs_embeds=embs,\n",
+ "            max_new_tokens=max_new_tokens,\n",
+ "            stopping_criteria=self.stopping_criteria,\n",
+ "            num_beams=num_beams,\n",
+ "            min_length=min_length,\n",
+ "            top_p=top_p,\n",
+ "            repetition_penalty=repetition_penalty,\n",
+ "            length_penalty=length_penalty,\n",
+ "            temperature=temperature,\n",
+ "        )\n",
+ "        output_token = outputs[0]\n",
+ "        if output_token[0] == 0:  # skip a leading token with id 0\n",
+ "            output_token = output_token[1:]\n",
+ "        output_text = self.model.llama_tokenizer.decode(output_token, add_special_tokens=False)\n",
+ "        self.raw_answers.append(output_text)\n",
+ "        output_text = output_text.split('###')[0]  # remove the stop sign '###'\n",
+ "        output_text = output_text.split('Assistant:')[-1].strip()\n",
+ "        self.conv.messages[-1][1] = output_text\n",
+ "        return output_text, output_token.cpu().numpy()\n",
+ "\n",
+ "    def upload_img(self, image):\n",
+ "        \"\"\"Encode an image and register it as a new user message.\n",
+ "\n",
+ "        Args:\n",
+ "            image: a file path (str), a ``PIL.Image.Image`` or a ``torch.Tensor``;\n",
+ "                a 3-dimensional tensor gets a leading dimension added.\n",
+ "\n",
+ "        Returns:\n",
+ "            The acknowledgement string ``'Received.'``.\n",
+ "\n",
+ "        Raises:\n",
+ "            TypeError: if ``image`` is none of the supported types.\n",
+ "        \"\"\"\n",
+ "        if isinstance(image, str):  # is a image path\n",
+ "            image = Image.open(image).convert('RGB')\n",
+ "\n",
+ "        if isinstance(image, Image.Image):\n",
+ "            image = self.vis_processor(image).unsqueeze(0).to(self.device)\n",
+ "        elif isinstance(image, torch.Tensor):\n",
+ "            if len(image.shape) == 3:\n",
+ "                image = image.unsqueeze(0)\n",
+ "            image = image.to(self.device)\n",
+ "        else:\n",
+ "            # Previously unsupported inputs fell through to encode_img() unchanged.\n",
+ "            raise TypeError(f'Unsupported image type: {type(image).__name__}')\n",
+ "\n",
+ "        image_emb, _ = self.model.encode_img(image)\n",
+ "        self.img_list.append(image_emb)\n",
+ "        # One placeholder per image keeps get_context_emb()'s segment count in sync.\n",
+ "        self.conv.append_message(self.conv.roles[0], self.IMG_MESSAGE)\n",
+ "        msg = \"Received.\"\n",
+ "        # self.conv.append_message(self.conv.roles[1], msg)\n",
+ "        return msg\n",
+ "\n",
+ "    def get_context_emb(self):\n",
+ "        \"\"\"Build the input embeddings for the current prompt.\n",
+ "\n",
+ "        The prompt is split at every image placeholder; the text segments are\n",
+ "        tokenized and embedded, interleaved with the stored image embeddings and\n",
+ "        concatenated along dim 1.\n",
+ "        \"\"\"\n",
+ "        prompt = self.conv.get_prompt()\n",
+ "        # Splitting on an empty string would raise ValueError, so use the marker.\n",
+ "        prompt_segs = prompt.split(self.IMG_PLACEHOLDER)\n",
+ "        assert len(prompt_segs) == len(self.img_list) + 1, \"Unmatched numbers of image placeholders and images.\"\n",
+ "        seg_tokens = [\n",
+ "            self.model.llama_tokenizer(\n",
+ "                seg, return_tensors=\"pt\", add_special_tokens=i == 0).to(self.device).input_ids  # only add bos to the first seg\n",
+ "            for i, seg in enumerate(prompt_segs)\n",
+ "        ]\n",
+ "        seg_embs = [self.model.llama_model.model.embed_tokens(seg_t) for seg_t in seg_tokens]\n",
+ "        # text_0, img_0, text_1, img_1, ..., text_n\n",
+ "        mixed_embs = [emb for pair in zip(seg_embs[:-1], self.img_list) for emb in pair] + [seg_embs[-1]]\n",
+ "\n",
+ "        mixed_embs = torch.cat(mixed_embs, dim=1)\n",
+ "        return mixed_embs"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "id": "9f4378a8",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import requests\n",
+ "from io import BytesIO"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 362,
+ "id": "99b48b03",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "chat = Chat(model, vis_processor)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 489,
+ "id": "b157b73b",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# URL of the image used for the next conversation.\n",
+ "img_url = 'https://newsfeed.time.com/wp-content/uploads/sites/9/2010/11/grandma-dj.jpg?w=720&h=480&crop=1'"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 497,
+ "id": "aa998179",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Bound the request time and fail on HTTP errors instead of handing an error page to PIL.\n",
+ "response = requests.get(img_url, timeout=30)\n",
+ "response.raise_for_status()\n",
+ "image_buffer = BytesIO(response.content)\n",
+ "image = Image.open(image_buffer).convert(\"RGB\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 504,
+ "id": "4822c36d",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "chat.reset()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 505,
+ "id": "c0cff044",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "'Received.'"
+ ]
+ },
+ "execution_count": 505,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "chat.upload_img(image)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 506,
+ "id": "2afe06ec",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Sure, here's a rap song based on the image you provided:\n",
+ "\n",
+ "Verse 1:\n",
+ "I'm an old lady, but I'm still flyin'\n",
+ "I've got my headphones on and I'm feelin' high\n",
+ "I'm in the studio, makin' some beats\n",
+ "I'm the queen of hip hop, can't nobody touch my feet\n",
+ "\n",
+ "Chorus:\n",
+ "I'm the grandma of rap, I'm still spittin' fire\n",
+ "I've got my green jacket on, I'm lookin' fly\n",
+ "I've got my sunglasses on, I'm shining bright\n",
+ "I'm the queen of the game, I'm makin' it right\n",
+ "\n",
+ "Verse 2:\n",
+ "I've been in the game for years, I've paid my dues\n",
+ "I've got a microphone in my hand and I'm singin' the blues\n",
+ "I've got the crowd in the palm of my hand\n",
+ "I'm the queen of rap, I'm takin' over the land\n",
+ "\n",
+ "Chorus:\n",
+ "I'm the grandma of rap, I'm still spittin' fire\n",
+ "I've got my green jacket on, I'm lookin' fly\n",
+ "I've got my sunglasses on, I'm shining bright\n",
+ "I'm the queen of the game, I'm makin' it right\n",
+ "\n",
+ "Verse 3:\n",
+ "I've got my beats in my headphones, I'm feelin' the vibe\n",
+ "I'm in the zone, I'm in my element, I'm in my ride\n",
+ "I'm the queen of rap, I'm in control\n",
+ "I'm makin' hits, I'm takin' over the world\n",
+ "\n",
+ "Chorus:\n",
+ "I'm the grandma of rap, I'm still spittin' fire\n",
+ "I've got my green jacket on, I'm lookin' fly\n",
+ "I've got my sunglasses on, I'm shining bright\n",
+ "I'm the queen of the game, I'm makin' it right\n",
+ "\n",
+ "Outro:\n",
+ "I'm the grandma of rap, I'm still spittin' fire\n",
+ "I've got my green jacket on, I'm lookin' fly\n",
+ "I've got my sunglasses on, I'm shining bright\n",
+ "I'm the queen of the game, I'm makin' it right.\n"
+ ]
+ }
+ ],
+ "source": [
+ "# chat.ask(\"How old is this DJ in this image?\")\n",
+ "chat.ask(\"Can you write me a master rap song that rhymes very well based on this image?\")\n",
+ "a, a_token = chat.answer(max_new_tokens=800, temperature=2)\n",
+ "print(a)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 158,
+ "id": "4805b5a8",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "id": "962bd3c0",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "image/png": "",
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 14,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 508,
+ "id": "b10e1974",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Keep only the text of every turn: anything up to the closing image tag is dropped.\n",
+ "# (Splitting on an empty string, as before, raises ValueError.)\n",
+ "conv = [\n",
+ "    message.split('</Img>')[-1].strip()\n",
+ "    for _, message in chat.conv.messages\n",
+ "]\n",
+ "\n",
+ "total_data.append({'img': img_url, 'conv': conv})"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "b04b8155",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 509,
+ "id": "5144bf9f",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "[{'img': 'https://brandforma.com/wp-content/uploads/2019/04/flamingo_logo_for_sale.png',\n",
+ " 'conv': ['What do you think of this logo design.',\n",
+ " \"The logo design is simple and minimalistic, with a pink line drawing of a flamingo standing on one leg in the water. The design is clean and easy to recognize, making it suitable for use in various contexts such as a logo for a beach resort or a flamingo-themed event. The use of a flamingo as a symbol adds a touch of whimsy and fun to the design, making it memorable and eye-catching. Overall, it's a well-designed logo that effectively communicates the brand's message.\"]},\n",
+ " {'img': 'https://i.imgflip.com/c37k1.jpg',\n",
+ " 'conv': ['Explain why this meme is funny.',\n",
+ " 'This meme is funny because it shows a dog laying on the floor with its head resting on its paws, as if it is tired or sleepy. The caption, \"monday just monday,\" adds to the humor by suggesting that the dog is feeling the same way as many people do on Mondays, which are often considered to be the most dreaded day of the week. The meme is relatable and humorous, making it a popular one among internet users.']},\n",
+ " {'img': 'https://contentcenter-drcn.dbankcdn.cn/img/pub_1/Browser_contentImg_1019_8/3f/v3/10510195ebff150e9eb42d6ae2da8bac4979d23/5e1515931d554ff1bcdf94a82e5ef223_5_1/hd.webp',\n",
+ " 'conv': ['Describe this image as detailed as possible.',\n",
+ " 'The image shows a busy city street with a clock tower in the background. The street is lined with shops and restaurants on both sides. There are several motorcycles parked on the side of the road, and people are walking down the street. The clock tower has a large clock face with Roman numerals and a small spire on top. The buildings on either side of the street have ornate facades and balconies. The street is paved with cobblestones and there are streetlights on either side of the road. The sky is clear and blue, and there are a few clouds in the distance.']},\n",
+ " {'img': 'https://cdn.ttv.com.tw/manasystem/FileData/News/c232e38d-0239-4681-bc28-0f55a3331e3d.jpg',\n",
+ " 'conv': ['Describe this image as detailed as possible.',\n",
+ " \"The image shows a group of musicians performing on stage in front of a large audience. The musicians are playing electric guitars and are wearing black and white clothing. The audience is made up of people of different ages and ethnicities who are watching the performance and taking photos with their phones. The stage is lit up with bright lights and there is a large screen displaying the band's name and lyrics in Chinese characters. The atmosphere is lively and energetic, with the audience cheering and singing along to the music.\"]},\n",
+ " {'img': 'https://newsfeed.time.com/wp-content/uploads/sites/9/2010/11/grandma-dj.jpg?w=720&h=480&crop=1',\n",
+ " 'conv': ['Can you write me a master rap song that rhymes very well based on this image?',\n",
+ " \"Sure, here's a rap song based on the image you provided:\\n\\nVerse 1:\\nI'm an old lady, but I'm still flyin'\\nI've got my headphones on and I'm feelin' high\\nI'm in the studio, makin' some beats\\nI'm the queen of hip hop, can't nobody touch my feet\\n\\nChorus:\\nI'm the grandma of rap, I'm still spittin' fire\\nI've got my green jacket on, I'm lookin' fly\\nI've got my sunglasses on, I'm shining bright\\nI'm the queen of the game, I'm makin' it right\\n\\nVerse 2:\\nI've been in the game for years, I've paid my dues\\nI've got a microphone in my hand and I'm singin' the blues\\nI've got the crowd in the palm of my hand\\nI'm the queen of rap, I'm takin' over the land\\n\\nChorus:\\nI'm the grandma of rap, I'm still spittin' fire\\nI've got my green jacket on, I'm lookin' fly\\nI've got my sunglasses on, I'm shining bright\\nI'm the queen of the game, I'm makin' it right\\n\\nVerse 3:\\nI've got my beats in my headphones, I'm feelin' the vibe\\nI'm in the zone, I'm in my element, I'm in my ride\\nI'm the queen of rap, I'm in control\\nI'm makin' hits, I'm takin' over the world\\n\\nChorus:\\nI'm the grandma of rap, I'm still spittin' fire\\nI've got my green jacket on, I'm lookin' fly\\nI've got my sunglasses on, I'm shining bright\\nI'm the queen of the game, I'm makin' it right\\n\\nOutro:\\nI'm the grandma of rap, I'm still spittin' fire\\nI've got my green jacket on, I'm lookin' fly\\nI've got my sunglasses on, I'm shining bright\\nI'm the queen of the game, I'm makin' it right.\"]}]"
+ ]
+ },
+ "execution_count": 509,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "total_data"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "b2d6dbcf",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 31,
+ "id": "1d4af846",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "[['Human', ' What do you think of this logo design.'],\n",
+ " ['Assistant',\n",
+ " \"The logo design is simple and minimalistic, with a pink line drawing of a flamingo standing on one leg in the water. The design is clean and easy to recognize, making it suitable for use in various contexts such as a logo for a beach resort or a flamingo-themed event. The use of a flamingo as a symbol adds a touch of whimsy and fun to the design, making it memorable and eye-catching. Overall, it's a well-designed logo that effectively communicates the brand's message.\"]]"
+ ]
+ },
+ "execution_count": 31,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "chat.conv.messages"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "90a8aa05",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "1019cfaf",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 434,
+ "id": "b50f755c",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "[{'img': 'https://iso.500px.com/wp-content/uploads/2016/11/stock-photo-159533631-1500x1000.jpg',\n",
+ " 'conv': ['Can you write me a beautiful poem about this image?',\n",
+ " \"Sure, here is a poem about the image you provided:\\n\\nA man sits on a hill, with his dog by his side\\nThe sun sets behind them, casting a golden light\\nThe city below is bustling, but they are at peace\\nEnjoying the moment, in each other's embrace\\n\\nThe man's arms are wrapped around his furry friend\\nAs they watch the world go by, with a gentle bend\\nThe dog's head rests on the man's shoulder\\nA symbol of their unbreakable bond, forever\\n\\nThe sky is ablaze with color, a beautiful sight\\nBut it's the love between the man and his dog that shines bright\\nIt's a moment of pure joy, a memory to keep\\nForever etched in their hearts, as they sit and weep\\n\\nSo let us cherish these moments, with our four-legged friends\\nFor they bring us so much happiness, until the very end\\nAnd even when they're gone, their love will live on\\nIn the memories we hold, forever and strong.\"]},\n",
+ " {'img': 'https://f.i.uol.com.br/fotografia/2023/03/25/1679774982641f55063b5cd_1679774982_2x3_md.jpg',\n",
+ " 'conv': ['Who is him?',\n",
+ " 'The person in the image is Pope Francis, the current Pope of the Catholic Church.',\n",
+ " 'Based on this photo, write me a short rap song that rhymes well.',\n",
+ " \"(Verse 1)\\n\\nI'm Pope Francis, the man of the hour\\nWalking down the street in my puffer coat\\nPeople stare, they can't believe their eyes\\nSeeing the Pope in all this disguise\\n\\n(Chorus)\\n\\nPope Francis, the man of the hour\\nWalking down the street in my puffer coat\\nPeople stare, they can't believe their eyes\\nSeeing the Pope in all this disguise\\n\\n(Verse 2)\\n\\nI'm the leader of the Catholic Church\\nBut today I'm dressed like a bum\\nI'm spreading the word of God in my own way\\nWalking down the street in my puffer coat today\\n\\n(Chorus)\\n\\nPope Francis, the man of the hour\\nWalking down the street in my puffer coat\\nPeople stare, they can't believe their eyes\\nSeeing the Pope in all this disguise\\n\\n(Bridge)\\n\\nI may be the Pope, but I'm just like you\\nI walk the streets, I do my thing too\\nI may be different, but we're all the same\\nWalking down the street in my puffer coat in this game\\n\\n(Chorus)\\n\\nPope Francis, the man of the hour\\nWalking down the street in my puffer coat\\nPeople stare, they can't believe their eyes\\nSeeing the Pope in all this disguise\"]},\n",
+ " {'img': 'https://i.pinimg.com/236x/d5/fa/3d/d5fa3d292466eeb048048c42d774b03f--hot-dog-buns-hot-dogs.jpg',\n",
+ " 'conv': ['Why does this image look funny?',\n",
+ " \"The image looks funny because it appears to be a small dog sitting on a plate of bread, which is an unusual and humorous sight. The dog's expression and the way it is sitting on the bread also adds to the humor of the image.\\n\\nIt's possible that the image was intended to be humorous or playful, or it could simply be a candid shot of a dog enjoying a meal. Regardless of the intention behind the image, it is likely to elicit a smile or chuckle from anyone who sees it.\"]},\n",
+ " {'img': 'https://i.etsystatic.com/5220905/r/il/6e5166/2603519433/il_fullxfull.2603519433_7i6k.jpg',\n",
+ " 'conv': ['Can you write a story around this image for my child?',\n",
+ " 'Sure, here\\'s a story based on the image:\\n\\nOnce upon a time, there was a brown bear, a white rabbit, and a black cat who lived in a forest. They were good friends and loved to spend time together. One day, they decided to have a picnic under a tree. The bear brought some honey, the rabbit brought some carrots, and the cat brought some milk.\\n\\nAs they sat around the table, the bear said, \"I\\'m so hungry, I could eat a whole honey pot!\" The rabbit giggled and said, \"I\\'m so hungry, I could eat a whole bag of carrots!\" The cat meowed and said, \"I\\'m so hungry, I could drink a whole bowl of milk!\"\\n\\nThey all laughed and dug in to their food. The bear loved the honey, the rabbit loved the carrots, and the cat loved the milk. They were all very happy and content.\\n\\nAfter they finished their meal, they decided to go for a walk in the forest. They came across a stream and the bear said, \"Let\\'s go for a swim!\" The rabbit and cat were a bit hesitant, but the bear convinced them to join in.\\n\\nThey all had a great time swimming and playing in the water. The bear loved to splash around, the rabbit loved to dive, and the cat loved to chase the splashes. They were all very happy and had a lot of fun.\\n\\nAfter their swim, they decided to go back to their picnic spot and have some more food. They sat around the table and the bear said, \"I\\'m so hungry, I could eat a whole honey pot again!\" The rabbit giggled and said, \"I\\'m so hungry, I could eat a whole bag of carrots again!\" The cat meowed and said, \"I\\'m so hungry, I could drink a whole bowl of milk again!\"\\n\\nThey all laughed and dug in to their food. They were all very happy and content. And they lived happily ever after.']},\n",
+ " {'img': 'https://media.discordapp.net/attachments/1008571152479826060/1096134597626036274/Sam1_A_Monk_has_a_huge_brain_Full_body_shot_pray_sharp_colors_r_dc6d519b-edc6-48b2-9401-b58998f311b6.png?width=1280&height=1280',\n",
+ " 'conv': ['I designed a character for my game as shown in the image. Can you write me a background story for him?',\n",
+ " \"Sure, here's a possible background story for the character in the image:\\n\\nThe monk in the image is a wise and enlightened being who has spent many years studying the workings of the mind. He has spent countless hours meditating and contemplating the nature of consciousness, and as a result, he has gained a deep understanding of the inner workings of the mind.\\n\\nThe brain in his hand is a symbol of his deep understanding of the mind. He uses it to help others understand the nature of their own thoughts and emotions, and to guide them on the path to enlightenment.\\n\\nDespite his great wisdom, the monk is a humble and compassionate being. He uses his knowledge to help others, and is always willing to share his insights with those who seek his guidance. He is a revered figure in his community, and many come to him seeking his help and guidance.\\n\\nThe monk's robes are a symbol of his commitment to his spiritual path. He wears them as a reminder of his vows of poverty, chastity, and obedience, and as a symbol of his commitment to living a life of service to others.\\n\\nOverall, the monk in the image is a wise and compassionate being who has dedicated his life to understanding the nature of the mind and helping others on the path to enlightenment.\"]},\n",
+ " {'img': 'https://cdn.britannica.com/55/188355-050-D5E49258/Salvatore-Corsitto-The-Godfather-Marlon-Brando-Francis.jpg',\n",
+ " 'conv': ['Can you give me a short introduction to this movie?',\n",
+ " 'The Godfather is a classic American crime drama film directed by Francis Ford Coppola and released in 1972. It is based on the novel of the same name by Mario Puzo and follows the story of the Corleone family, a powerful mafia clan in New York City. The film stars Marlon Brando as the patriarch of the family, Vito Corleone, and Al Pacino as his son, Michael Corleone. The Godfather is known for its iconic performances, memorable dialogue, and its portrayal of the American mafia lifestyle. It is considered one of the greatest films of all time and has had a lasting impact on popular culture.']},\n",
+ " {'img': 'https://therecipecritic.com/wp-content/uploads/2020/01/broiled_lobster.jpg',\n",
+ " 'conv': ['How should I make something like this?',\n",
+ " 'To make this dish, you will need the following ingredients:\\n\\n* 4 large lobsters\\n* 2 tablespoons olive oil\\n* 2 cloves garlic, minced\\n* 1 teaspoon dried thyme\\n* 1 teaspoon dried oregano\\n* 1/2 teaspoon paprika\\n* Salt and pepper, to taste\\n* 4 lemon wedges, for serving\\n\\n1. Preheat the oven to 400°F (200°C).\\n2. In a large baking dish, toss the lobsters with the olive oil, garlic, thyme, oregano, paprika, salt, and pepper.\\n3. Roast the lobsters in the preheated oven for 20-25 minutes, or until they are cooked through.\\n4. Remove the lobsters from the oven and let them cool for a few minutes.\\n5. Serve the lobsters with the lemon wedges on the side.\\n\\nI hope this helps! Let me know if you have any other questions.']},\n",
+ " {'img': 'https://www.kazoart.com/blog/wp-content/uploads/2019/04/Dali-persistance-of-memory.jpg',\n",
+ " 'conv': ['Can you give me a detailed introduction to this painting, and also the artist?',\n",
+ " 'This is a surrealist painting by Salvador Dali, titled \"The Persistence of Memory.\" It is one of Dali\\'s most famous works and is considered one of the greatest works of surrealist art. The painting depicts melting clocks and other objects in a desert landscape, with the sun setting in the background. The overall theme of the painting is the idea that time is not fixed and that it can be distorted and manipulated. The painting is considered a masterpiece of surrealism and is widely recognized as one of Dali\\'s most important works.\\n\\nThe artist, Salvador Dali, was a Spanish surrealist painter and sculptor. He is considered one of the most important figures in the history of art and is known for his unique and imaginative works. Dali was born in 1904 in Spain and began painting at a young age. He is best known for his surrealist paintings, which often feature melting clocks, distorted figures, and other strange and fantastical elements. Dali\\'s work has had a significant influence on the art world and continues to be widely admired and studied today.']}]"
+ ]
+ },
+ "execution_count": 434,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "total_data"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 383,
+ "id": "61f4bb45",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import yaml"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 510,
+ "id": "e9179d57",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "to_save = {j: info for j, info in enumerate(total_data)}"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 511,
+ "id": "fb7537e8",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "with open('/home/zhud/project/blip2/demo_example/others3/samples.yaml', 'w') as f:\n",
+ " yaml.dump(to_save, f)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "5e8f3c30",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "c2e7034f",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "dfa0ad2c",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 244,
+ "id": "81458521",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "total_data = []"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "ab09fdf3",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "91f0b955",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 186,
+ "id": "fb2f8452",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "text = \\\n",
+ "\"### Human: What's your name?\" \\\n",
+ "\"### Assistant: \"\n",
+ "\n",
+ "\n",
+ "llama_tokens = model.llama_tokenizer(\n",
+ " text, \n",
+ " return_tensors=\"pt\", \n",
+ " ).to('cuda:0')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 188,
+ "id": "0e2976a9",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "### Human: What's your name?### Assistant: \n",
+ "\n",
+ "I'm sorry, I am an AI language model and do not have a\n"
+ ]
+ }
+ ],
+ "source": [
+ "outputs = model.llama_model.generate(\n",
+ " input_ids=llama_tokens.input_ids,\n",
+ " query_embeds=None,\n",
+ " attention_mask=llama_tokens.attention_mask,\n",
+ " max_new_tokens=20,\n",
+ " )\n",
+ "output_text = model.llama_tokenizer.decode(outputs[0])\n",
+ "print(output_text)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "037a901f",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 132,
+ "id": "b2f93372",
+ "metadata": {
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "{'input_ids': [1533, 25518, 29958], 'attention_mask': [1, 1, 1]}"
+ ]
+ },
+ "execution_count": 132,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "chat.model.llama_tokenizer('', add_special_tokens=False)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 60,
+ "id": "3b6f10df",
+ "metadata": {
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "['Assistant: This is a photograph of a person standing in the rain on a city street. The person is holding an umbrella and is standing in the middle of the street. There are tall buildings on either side of the street and the sky is cloudy.\\n\\nThe photograph was taken in the evening, as the sky is dark and the streetlights are on. The rain is coming down in large drops and the person is standing in the middle of the street, looking down at their phone. The buildings on either side of the street are tall and have many windows. The street is wet and there are puddles on the ground.\\n###',\n",
+ " 'Assistant: This is a photograph of a horse standing in front of a barn. The horse is standing on its hind legs and has a large harness on its back. There are two men standing next to the horse, one of them is holding a whip. The other man is holding a bucket of water. The barn has a large door and a small window on the side. The sky is cloudy and there are trees in the background.\\n###',\n",
+ " 'Assistant: The horse is standing on its hind legs and has a large harness on its back.\\n###',\n",
+ " 'Assistant: A whiteboard with a joke written on it\\nthe joke is written in black ink on a whiteboard\\na joke written on a whiteboard\\na joke written on a whiteboard\\na joke written on a whiteboard\\na joke written on a whiteboard\\na joke written on a whiteboard\\na joke written on a whiteboard\\na joke written on a whiteboard\\na joke written on a whiteboard\\na joke written on a whiteboard\\na joke written on a whiteboard\\na joke written on a whiteboard\\na joke written on a whiteboard\\na joke written on a whiteboard\\na joke written on a whiteboard\\na joke written on a whiteboard\\na joke written on a whiteboard\\na joke written on a whiteboard\\na joke written on a whiteboard\\na joke written on a whiteboard\\na',\n",
+ " 'Human: a man in a suit drinking a glass of wine\\na man in a suit drinking a glass of wine\\na man in a suit drinking a glass of wine\\na man in a suit drinking a glass of wine\\na man in a suit drinking a glass of wine\\na man in a suit drinking a glass of wine\\na man in a suit drinking a glass of wine\\na man in a suit drinking a glass of wine\\na man in a suit drinking a glass of wine\\na man in a suit drinking a glass of wine\\na man in a suit drinking a glass of wine\\na man in a suit drinking a glass of wine\\na man in a suit drinking a glass of wine\\na man in a suit drinking a glass of wine\\na man in a suit drinking a glass of wine\\na man in a suit drinking a glass of wine\\na man in a suit',\n",
+ " \"Human: a man in a suit drinking a glass of wine\\nsays when people say they don't drink wine they're\\n###\",\n",
+ " \"Human: a man in a suit holding a glass of wine\\nsays when people say they're not drinking, they're\\n###\"]"
+ ]
+ },
+ "execution_count": 60,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "chat.raw_answers"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "id": "53264936",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# ========================================\n",
+ "# Gradio Setting\n",
+ "# ========================================\n",
+ "def gradio_reset():\n",
+ "    \"\"\"Reset the chat session; the returned None is written to the chatbot output.\"\"\"\n",
+ "    chat.reset()\n",
+ "    return None\n",
+ "\n",
+ "\n",
+ "def gradio_ask(user_message, chatbot):\n",
+ "    \"\"\"Record the user message and return ('', history) to clear the textbox.\"\"\"\n",
+ "    chat.ask(user_message)\n",
+ "    chatbot = chatbot + [[user_message, None]]\n",
+ "    return '', chatbot\n",
+ "\n",
+ "\n",
+ "def gradio_answer(chatbot):\n",
+ "    \"\"\"Generate the reply for the last user turn and store it in the history.\"\"\"\n",
+ "    # chat.answer() returns (text, token_ids); only the text belongs in the chat window.\n",
+ "    llm_message, _ = chat.answer(1000)\n",
+ "    chatbot[-1][1] = llm_message\n",
+ "    return chatbot\n",
+ "\n",
+ "\n",
+ "def gradio_upload_img(gr_img, chatbot):\n",
+ "    \"\"\"Send the selected image to the chat session and show the acknowledgement.\"\"\"\n",
+ "    if gr_img is None:  # button clicked without an image selected\n",
+ "        return chatbot\n",
+ "    llm_message = chat.upload_img(gr_img)\n",
+ "    chatbot = chatbot + [[(gr_img,), None]]\n",
+ "    chatbot[-1][1] = llm_message\n",
+ "    return chatbot\n",
+ "\n",
+ "\n",
+ "with gr.Blocks() as demo:\n",
+ "    gr.Markdown(\"## GPT-4 Mini\")\n",
+ "    with gr.Row():\n",
+ "        with gr.Column(scale=0.5):\n",
+ "            image = gr.Image(type=\"filepath\")\n",
+ "            upload = gr.Button(\"Upload Image\")\n",
+ "            clear = gr.Button(\"Restart\")\n",
+ "        with gr.Column():\n",
+ "            chatbot = gr.Chatbot()\n",
+ "            text_input = gr.Textbox()\n",
+ "\n",
+ "    # Submitting text first records the question, then generates the answer.\n",
+ "    text_input.submit(gradio_ask, [text_input, chatbot], [text_input, chatbot], queue=False).then(\n",
+ "        gradio_answer, chatbot, chatbot\n",
+ "    )\n",
+ "\n",
+ "    upload.click(gradio_upload_img, [image, chatbot], chatbot)\n",
+ "    clear.click(gradio_reset, None, chatbot, queue=False)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "id": "e34fc8ce",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Running on local URL: http://127.0.0.1:7860\n",
+ "Running on public URL: https://6261e2449344bfbe7a.gradio.live\n",
+ "\n",
+ "This share link expires in 72 hours. For free permanent hosting and GPU upgrades (NEW!), check out Spaces: https://huggingface.co/spaces\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ ""
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/plain": []
+ },
+ "execution_count": 6,
+ "metadata": {},
+ "output_type": "execute_result"
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/home/zhud/anaconda3/envs/eye/lib/python3.9/site-packages/transformers/generation/utils.py:1219: UserWarning: You have modified the pretrained model configuration to control generation. This is a deprecated strategy to control generation and will be removed soon, in a future version. Please use a generation configuration file (see https://huggingface.co/docs/transformers/main_classes/text_generation)\n",
+ " warnings.warn(\n"
+ ]
+ }
+ ],
+ "source": [
+ "demo.launch(share=True)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "id": "86943179",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "demo.close()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "b815e52c",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "233c9f93",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "4298ba71",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "6eea726c",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "markdown",
+ "id": "b3251a51",
+ "metadata": {},
+ "source": [
+ "### Alignment Dataset Preparation"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 256,
+ "id": "11b36de5",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import os\n",
+ "\n",
+ "import openai\n",
+ "from tenacity import (\n",
+ "    retry,\n",
+ "    stop_after_attempt,\n",
+ "    wait_random_exponential,\n",
+ ")  # for exponential backoff\n",
+ "\n",
+ "# SECURITY: never hard-code API keys in a notebook; read the key from the environment.\n",
+ "# Any key that was ever committed in this cell must be treated as leaked and revoked.\n",
+ "openai.api_key = os.environ.get('OPENAI_API_KEY')\n",
+ "if not openai.api_key:\n",
+ "    raise RuntimeError('Set the OPENAI_API_KEY environment variable before running this cell.')\n",
+ "\n",
+ "\n",
+ "def prepare_chatgpt_message(task_prompt, paragraph):\n",
+ "    \"\"\"Build the two-message chat payload: `task_prompt` as the system message, `paragraph` as the user message.\"\"\"\n",
+ "    messages = [{\"role\": \"system\", \"content\": task_prompt},\n",
+ "                {\"role\": \"user\", \"content\": paragraph}]\n",
+ "    return messages\n",
+ "\n",
+ "\n",
+ "@retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(6))\n",
+ "def call_chatgpt(chatgpt_messages, max_tokens=200, model=\"gpt-3.5-turbo\"):\n",
+ "    \"\"\"Send `chatgpt_messages` to the chat completion endpoint.\n",
+ "\n",
+ "    Returns a tuple `(reply, total_tokens)`: the content of the first choice and the\n",
+ "    `total_tokens` value from the response's usage block. The decorator retries a failed\n",
+ "    call with randomized exponential backoff and gives up after 6 attempts.\n",
+ "    \"\"\"\n",
+ "    response = openai.ChatCompletion.create(\n",
+ "        model=model,\n",
+ "        messages=chatgpt_messages,\n",
+ "        temperature=0.7,\n",
+ "        max_tokens=max_tokens,\n",
+ "    )\n",
+ "    reply = response['choices'][0]['message']['content']\n",
+ "    total_tokens = response['usage']['total_tokens']\n",
+ "    return reply, total_tokens"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 413,
+ "id": "b30c475e",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import webdataset as wds\n",
+ "from lavis.datasets.datasets.base_dataset import BaseDataset\n",
+ "\n",
+ "\n",
+ "class PILDataset(BaseDataset):\n",
+ "    \"\"\"Wraps a webdataset pipeline over the shards at `location`.\n",
+ "\n",
+ "    Each sample is decoded with the \"pilrgb\" decoder, reduced to its (jpg, json) pair and\n",
+ "    mapped to a dict by `to_dict`. The visual processor is not applied here, so the image\n",
+ "    is passed through exactly as decoded. The pipeline is exposed as `inner_dataset`.\n",
+ "    \"\"\"\n",
+ "\n",
+ "    def __init__(self, vis_processor, text_processor, location):\n",
+ "        super().__init__(vis_processor=vis_processor, text_processor=text_processor)\n",
+ "\n",
+ "        # Every stage that accepts a handler warns and skips the offending sample.\n",
+ "        on_error = wds.warn_and_continue\n",
+ "        stages = [\n",
+ "            wds.ResampledShards(location),\n",
+ "            wds.tarfile_to_samples(handler=on_error),\n",
+ "            wds.shuffle(1000, handler=on_error),\n",
+ "            wds.decode(\"pilrgb\", handler=on_error),\n",
+ "            wds.to_tuple(\"jpg\", \"json\", handler=on_error),\n",
+ "            wds.map(self.to_dict, handler=on_error),\n",
+ "        ]\n",
+ "        self.inner_dataset = wds.DataPipeline(*stages)\n",
+ "\n",
+ "    def to_dict(self, sample):\n",
+ "        \"\"\"Turn a (jpg, json) tuple into {'image', 'text_input'}; only the caption is processed.\"\"\"\n",
+ "        picture = sample[0]\n",
+ "        caption = self.text_processor(sample[1][\"caption\"])\n",
+ "        return {\"image\": picture, \"text_input\": caption}\n",
+ "\n",
+ "\n",
+ "# Instantiate the train-time processors declared for the cc_combine dataset.\n",
+ "vis_processor_cfg = cfg.datasets_cfg.cc_combine.vis_processor.train\n",
+ "vis_processor = registry.get_processor_class(vis_processor_cfg.name).from_config(vis_processor_cfg)\n",
+ "\n",
+ "text_processor_cfg = cfg.datasets_cfg.cc_combine.text_processor.train\n",
+ "text_processor = registry.get_processor_class(text_processor_cfg.name).from_config(text_processor_cfg)\n",
+ "\n",
+ "# Only the underlying pipeline is kept; the wrapper object itself is discarded.\n",
+ "dataset = PILDataset(\n",
+ "    vis_processor,\n",
+ "    text_processor,\n",
+ "    cfg.datasets_cfg.cc_combine.build_info.storage,\n",
+ ").inner_dataset\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 332,
+ "id": "5b967080",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from tqdm import tqdm"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 291,
+ "id": "fd8e84f5",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "'/ibex/project/c2133/blip_dataset/laion_1b/laion_gpu/{00000..10488}.tar'"
+ ]
+ },
+ "execution_count": 291,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "cfg.datasets_cfg.laion.build_info.storage"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 408,
+ "id": "18315556",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "builder = registry.get_builder_class('cc_combine')(cfg.datasets_cfg['cc_combine'])\n",
+ "dataset = builder.build_datasets()['train']"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 416,
+ "id": "2f6386b4",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "data_iter = iter(dataset)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 417,
+ "id": "bd81ba4c",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "data_point = next(data_iter)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 418,
+ "id": "247b872f",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "image = data_point['image']"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 419,
+ "id": "51d9fb01",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "image/png": "",
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 419,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "image"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 324,
+ "id": "b0d47ad6",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "'a hiker pauses along the rim of the grand canyon'"
+ ]
+ },
+ "execution_count": 324,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "data_point['text_input']"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 381,
+ "id": "fffa9245",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 20/20 [02:14<00:00, 6.72s/it]\n"
+ ]
+ }
+ ],
+ "source": [
+ "save_dir = '/ibex/project/c2133/blip_dataset/image_alignment'\n",
+ "\n",
+ "image_list = []\n",
+ "text_list = []\n",
+ "description_list = []\n",
+ "negative_list = []\n",
+ "verify_list = []\n",
+ "\n",
+ "# Clean-up instruction sent to ChatGPT as the system message. It does not change between\n",
+ "# iterations, so it is built once; the wording is kept verbatim.\n",
+ "fix_prompt = (\n",
+ "    \"Fix the error in given paragraph. \"\n",
+ "    \"Remove any repeating sentences, meanless characters, not English sentences, and so on.\"\n",
+ "    \"Rewrite any incompleted sentences.\"\n",
+ "    \"Return directly the results WITHOUT explaination.\"\n",
+ "    \"Return directly the input paragraph if it is already correct WITHOUT explaination.\"\n",
+ ")\n",
+ "\n",
+ "for i in tqdm(range(20)):\n",
+ "    data_point = next(data_iter)\n",
+ "    image = data_point['image']\n",
+ "    text = data_point['text_input']\n",
+ "\n",
+ "    # Ask the model for a detailed description of the image.\n",
+ "    answers = []\n",
+ "    answer_tokens = 0\n",
+ "    chat.reset()\n",
+ "    chat.upload_img(image)\n",
+ "    chat.ask(\"Describe this image in detail. Give as many details as possible. Say everything you see.\")\n",
+ "    answer, tokens = chat.answer()\n",
+ "    answers.append(answer)\n",
+ "    # NOTE(review): assumes chat.answer() returns the generated token ids as a sized\n",
+ "    # sequence (the previous code called len() on their running sum) - confirm.\n",
+ "    answer_tokens += len(tokens)\n",
+ "    if answer_tokens < 80:\n",
+ "        # Short reply: request a continuation and count the tokens of that new reply\n",
+ "        # (previously the first reply's tokens were added a second time).\n",
+ "        chat.ask(\"Continue\")\n",
+ "        answer, tokens = chat.answer()\n",
+ "        answers.append(answer)\n",
+ "        answer_tokens += len(tokens)\n",
+ "    answer = ' '.join(answers)\n",
+ "\n",
+ "    # Let ChatGPT clean up the raw description.\n",
+ "    chatgpt_message = prepare_chatgpt_message(fix_prompt, answer)\n",
+ "    improved_answer, num_token = call_chatgpt(chatgpt_message)\n",
+ "\n",
+ "    # A reply containing 'is already correct' is replaced by the raw description.\n",
+ "    if 'is already correct' in improved_answer:\n",
+ "        improved_answer = answer\n",
+ "    # Replies that mention 'incomplete' or are shorter than 50 characters are rejected.\n",
+ "    if 'incomplete' in improved_answer or len(improved_answer) < 50:\n",
+ "        negative_list.append(improved_answer)\n",
+ "    else:\n",
+ "        image_list.append(image)\n",
+ "        text_list.append(text)\n",
+ "        description_list.append(improved_answer)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 329,
+ "id": "1fc5f266",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "' This This is an image of a person hiking on a trail in the grand canyon. The person is wearing a backpack and has a hiking pole in their hand. They are looking out over the canyon, which is visible in the background. The sky is cloudy and there are some dark clouds in the distance. The landscape is rugged and rocky, with steep cliffs and valleys visible in the distance. The trail is narrow and winds through the rocky terrain. There are no other people visible in the image.'"
+ ]
+ },
+ "execution_count": 329,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "answer"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 330,
+ "id": "d2651aa6",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "'This is an image of a person hiking on a trail in the grand canyon. The person is wearing a backpack and has a hiking pole in their hand. They are looking out over the canyon, which is visible in the background. The sky is cloudy and there are some dark clouds in the distance. The landscape is rugged and rocky, with steep cliffs and valleys visible in the distance. The trail is narrow and winds through the rocky terrain. There are no other people visible in the image.'"
+ ]
+ },
+ "execution_count": 330,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "improved_answer"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 389,
+ "id": "34b6f1f6",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "'and protect\" tube also contains NovaMin technology, which is designed to help repair and protect sensitive teeth.'"
+ ]
+ },
+ "execution_count": 389,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "description_list[2]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 383,
+ "id": "320368ce",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "['This is an incomplete sentence. Please provide the complete paragraph for correction.',\n",
+ " 'no specific action or activity taking place.',\n",
+ " 'The image shows a woman standing in front of a',\n",
+ " 'touch of sophistication to the look.',\n",
+ " 'This is an image of a car.',\n",
+ " 'Input paragraph is incomplete. Please provide a complete paragraph to fix.']"
+ ]
+ },
+ "execution_count": 383,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "negative_list"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 372,
+ "id": "ff92e94a",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "34"
+ ]
+ },
+ "execution_count": 372,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "len('The input paragraph is incomplete.')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 420,
+ "id": "b0223541",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5000/5000 [00:08<00:00, 583.08it/s]\n"
+ ]
+ }
+ ],
+ "source": [
+ "save_dir = '/ibex/project/c2133/blip_dataset/image_alignment_cc'\n",
+ "# Create the image folder up front so the save below does not depend on it already existing.\n",
+ "os.makedirs(os.path.join(save_dir, 'image'), exist_ok=True)\n",
+ "\n",
+ "# Maps the running index i to the caption of the image written as image/<i>.jpg.\n",
+ "texts = {}\n",
+ "for i in tqdm(range(5000)):\n",
+ "    data_point = next(data_iter)\n",
+ "    image = data_point['image']\n",
+ "    texts[i] = data_point['text_input']\n",
+ "    image.save(os.path.join(save_dir, \"image/{}.jpg\".format(i)))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 394,
+ "id": "12933b52",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "{0: 'spinach and artichoke fippers on a white plate'}"
+ ]
+ },
+ "execution_count": 394,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 397,
+ "id": "75971b3c",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import json\n",
+ "\n",
+ "\n",
+ "# open a file in write mode\n",
+ "with open(os.path.join(save_dir,\"old_cap.json\"), \"w\") as outfile:\n",
+ " # write the dictionary to the file in JSON format\n",
+ " json.dump(texts, outfile)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "d0eb5390",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "d8911c5d",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "1bdb050e",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "744da257",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "131dad16",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 516,
+ "id": "a88ca86a",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import json\n",
+ "import shutil"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 514,
+ "id": "56db0857",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "with open('/ibex/project/c2133/blip_dataset/image_alignment_cc_prepare/filter_cap.json', 'r') as f:\n",
+ " caps = json.load(f)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 517,
+ "id": "110b2eb6",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import os\n",
+ "\n",
+ "root = '/ibex/project/c2133/blip_dataset/image_alignment_cc_prepare/'\n",
+ "# shutil.copy does not create missing directories, so make sure the target folder exists.\n",
+ "os.makedirs(root + 'used_train', exist_ok=True)\n",
+ "\n",
+ "# Copy each image listed in caps['annotations'] from train/ to used_train/.\n",
+ "for content in caps['annotations']:\n",
+ "    src_file = root + 'train/{}.jpg'.format(content['image_id'])\n",
+ "    dest_file = root + 'used_train/{}.jpg'.format(content['image_id'])\n",
+ "\n",
+ "    shutil.copy(src_file, dest_file)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 518,
+ "id": "1824499f",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "{'annotations': [{'image_id': '2',\n",
+ " 'caption': 'The image shows a man fishing on a lawn next to a river with a bridge in the background. Trees can be seen on the other side of the river, and the sky is cloudy.'},\n",
+ " {'image_id': '3',\n",
+ " 'caption': 'This image shows a kitchen with stainless steel appliances, including a refrigerator, oven, and dishwasher. The countertops are made of black granite, and there is a white backsplash behind the stove. The floor is made of beige tiles, and the walls are painted white. There is a door that leads to the outside.'},\n",
+ " {'image_id': '4',\n",
+ " 'caption': \"The image shows a group of people walking down a sidewalk, holding signs that read `get scientific right'. They are wearing red shirts and carrying banners with the same message. The people in the image appear to be young adults and children, and they are walking in a line, with some people in front and some behind. The image is taken from a distance, so it's difficult to see their faces, but it's clear that they are all holding up their signs and walking in the same direction. The background of the image is a residential neighborhood with trees and houses visible in the distance.\"},\n",
+ " {'image_id': '5',\n",
+ " 'caption': 'This image shows a group of people standing in front of a building with pink and white balloons in the air. The people are dressed in formal attire, with the men wearing black tuxedos and the women wearing white dresses. There is a large tree in the background, and several cars parked in front of the building. The atmosphere of the image is joyful and celebratory.'},\n",
+ " {'image_id': '8',\n",
+ " 'caption': 'This image is a cartoon illustration of an astronaut floating in the air with his arms outstretched. The astronaut wears a white spacesuit with a helmet and gloves, and the background is a gradient of light blue and white. This vector illustration can be scaled to any size without losing quality, making it suitable for use in various contexts.'},\n",
+ " {'image_id': '10',\n",
+ " 'caption': 'The image depicts two ducks flying over a body of water. One duck is in the foreground and the other is in the background. The duck in the foreground is flying towards the left side of the image, while the duck in the background is flying towards the right side of the image. Both ducks have their wings outstretched and their beaks open, as if they are calling out to each other. The background of the image is a dark blue sky with a few clouds in it. The water below the ducks is a light blue color with some ripples in it. There are some trees on the left side of the image, and some bushes on the right side. Overall, the image is a beautiful depiction of two ducks flying over a peaceful body of water. It could be used as a wallpaper or as part of a larger design.'},\n",
+ " {'image_id': '11',\n",
+ " 'caption': 'This image shows a bedroom with a balcony overlooking the ocean. The room has white walls, a wooden floor, and a large glass door that leads to the balcony. The bed is made with white sheets.'},\n",
+ " {'image_id': '12',\n",
+ " 'caption': \"The image depicts a watch with a red leather strap, a green dial, and a gold case and buckle. The face of the watch displays two hands, one pointing to 12 o'clock and the other pointing to 6 o'clock, with a small red dot at the 12 o'clock position. The watch is set against a white background.\"},\n",
+ " {'image_id': '13',\n",
+ " 'caption': 'The image shows a slice of cheesecake on a red and white checkered plate with a fork next to it. The cheesecake appears to be baked in a cast iron skillet and is topped with a sprinkle of powdered sugar. The plate is sitting on top of a striped napkin on a wooden surface.'},\n",
+ " {'image_id': '14',\n",
+ " 'caption': 'This image shows a man in a blue suit and a red tie standing at a podium with his hands in the air. He has a serious expression on his face and is speaking into a microphone. Behind him, there is a blue curtain and a Canadian flag hanging on the wall. The image is well lit and the colors are bright and vivid.'},\n",
+ " {'image_id': '16',\n",
+ " 'caption': 'This image shows a silhouette of a crane on top of a building. The crane appears to be lifting something off the roof of the building. The sky in the background appears to be cloudy, with dark clouds in the distance. The crane appears to be made of metal and has a long arm with a hook on the end. The building appears to be made of brick and has several windows on the top floor. There is a ladder leading up to the roof of the building.'},\n",
+ " {'image_id': '20',\n",
+ " 'caption': 'This image depicts an open wooden door in the middle of a dark blue room. Light shines through the door and illuminates the surrounding area. The walls are painted light blue, and there are no windows. The floor is tiled, with a small area rug in front of the door. The image creates a sense of mystery and intrigue, leaving the viewer to wonder what lies beyond the open door.'},\n",
+ " {'image_id': '21',\n",
+ " 'caption': 'The image shows a young boy sitting at a table with several rolls of toilet paper in front of him. He is wearing a medical face mask.'},\n",
+ " {'image_id': '22',\n",
+ " 'caption': 'This image shows a wall made of concrete blocks stacked on top of each other to form a gray, rough-textured solid structure. There are no windows or doors on the wall. It is located on the side of a building with a sidewalk in front of it. No other visible objects are present in the image.'},\n",
+ " {'image_id': '23',\n",
+ " 'caption': 'The image shows a beautiful sunset over the ocean, with the sun setting behind a row of palm trees on the horizon. The sky is a mix of orange, pink, and purple hues, with a few clouds scattered throughout. The buildings in the foreground are small, colorful houses with red roofs and white walls, and there are a few people standing on the balconies, watching the sunset. The overall mood of the image is peaceful and serene, with a sense of calm and tranquility.'},\n",
+ " {'image_id': '24',\n",
+ " 'caption': 'The image shows a group of planets in different sizes and colors. The largest planet appears to be made up of red and orange flames, while the smaller planets appear to be made up of different shades of blue and green. The planets are arranged in a circular formation, with the largest planet in the center and the smaller planets orbiting around it. The planets appear to be floating in space, with no visible background or other objects in the image.'},\n",
+ " {'image_id': '25',\n",
+ " 'caption': 'This image depicts a wooden tree house with a ladder, small window, and door. The roof is made of wooden shingles with a chimney on top. The tree house is situated among trees with a small path leading up to it.'},\n",
+ " {'image_id': '26',\n",
+ " 'caption': 'The image displays a wedding cake with white frosting decorated with purple and white flowers on a gold plated tray. The cake is placed on a white tablecloth on a wooden table.'},\n",
+ " {'image_id': '28',\n",
+ " 'caption': 'This image shows a female chef preparing food in a professional kitchen. She is wearing a white apron and has a smile on her face. The kitchen is well equipped with stainless steel countertops, sinks, and cooking utensils. There are several pots and pans on the stove, and a cutting board on the counter in front of the chef. The lighting in the kitchen is bright, and there are several windows letting in natural light. The overall atmosphere is clean, organized, and professional.'},\n",
+ " {'image_id': '29',\n",
+ " 'caption': 'This is a black and white photograph of a lone tree in the middle of a field. The tree appears to be dead, with no leaves or branches. The sky behind the tree is dark and stormy, with dark clouds and lightning in the distance. The ground is dry and cracked, with no signs of water or life. The overall mood of the image is one of loneliness and desolation.'},\n",
+ " {'image_id': '31',\n",
+ " 'caption': 'The image is a video game cover for the game \"Wayne Gretzky\\'s 3D Hockey\". It features a black and white image of a hockey player wearing a red and white jersey with the number 99 on the back. The player is holding a hockey stick and appears to be skating towards the right side of the image. The title of the game, \"Wayne Gretzky\\'s 3D Hockey,\" is written in white letters above the player\\'s head. The game\\'s publisher, Electronic Arts, is written in small white letters at the bottom of the image.'},\n",
+ " {'image_id': '32',\n",
+ " 'caption': 'A dark brown horse with a black mane and tail standing on all four legs in a lush green field with tall grass and trees in the distance. The horse has a muscular build and is holding its head high. A white fence surrounds the field in the background.'},\n",
+ " {'image_id': '33',\n",
+ " 'caption': 'This image appears to be a waving flag of the country of Poland. The flag is made up of horizontal red and white stripes with a red and white diagonal stripe in the top left corner. There is a small white emblem in the center of the flag, but it is difficult to make out what it is.'},\n",
+ " {'image_id': '34',\n",
+ " 'caption': 'This image shows a classroom with students sitting at desks and a teacher standing at the front of the room. The walls are painted white and there are windows on either side of the room, letting in natural light. The room is well lit with fixtures hanging from the ceiling. The desks are arranged in rows and there is a whiteboard at the front. The teacher is holding a piece of chalk and facing the students who are focused on their work and appear engaged in the lesson.'},\n",
+ " {'image_id': '35',\n",
+ " 'caption': 'This image is an abstract painting with orange, yellow, and pink colors on a white background. There are circular shapes in different sizes and shades of orange, yellow, and pink on the painting, which appear to be floating on the surface and overlapping each other. Various brush strokes and techniques have been used to create the shapes and colors, resulting in a vibrant and energetic effect.'},\n",
+ " {'image_id': '37',\n",
+ " 'caption': 'The image shows a woman holding an old book titled \"The Living and the Dead\" with a serious expression on her face. The book\\'s worn cover suggests signs of wear and tear, and the woman may be reading it with a sense of sadness or nostalgia. The book could be a memoir, biography, or story about people who have passed away and their impact on the living.'},\n",
+ " {'image_id': '38',\n",
+ " 'caption': 'The image shows a large, modern hotel building with several floors and a large parking lot in front of it. The hotel\\'s name, \"Courtyard by Marriott,\" is written in large letters on the front of the building. There are several windows on the upper floors, and the building appears to have an atrium or lobby area in the center. The building is surrounded by trees and other landscaping, and there is a sidewalk leading to the entrance.'},\n",
+ " {'image_id': '39',\n",
+ " 'caption': 'This image shows a pregnant woman sitting on a bed with her hands on her stomach. She is wearing a white shirt and blue jeans, and has her hair pulled back in a ponytail. The woman is smiling and appears to be in good spirits.'},\n",
+ " {'image_id': '40',\n",
+ " 'caption': 'This image shows a woman wearing a black shirt and a purple bow tie. She is posing with her hands on her chin, looking at the camera with a serious expression. The background is black, and there are no other objects or people visible in the image.\\n\\nDescription: This image shows a woman wearing a black shirt and a purple bow tie. She is posing with her hands on her chin, looking at the camera with a serious expression. The background is black, and there are no other objects or people visible in the image.'},\n",
+ " {'image_id': '43',\n",
+ " 'caption': \"The image is a black and white silhouette of two swans in the shape of a heart, with the sun in the background. The swans face each other with their necks intertwined, while the sun is in the shape of a circle. It is a simple and elegant image that could be used as a logo for companies or organizations that promote love, romance, relationships, peace, harmony, or unity. Additionally, it could be used as a symbol for wedding or anniversary celebrations, or as a decoration for Valentine's Day or other romantic occasions.\"},\n",
+ " {'image_id': '44',\n",
+ " 'caption': 'This image is a colorful and dynamic abstract design with a yellow and green background, various shapes and lines made with brush strokes, and a white circle in the center. The colors used are bright and bold, with a mix of yellow, green, and black, and the shapes and lines are varied and playful. This design could be used for a variety of purposes, such as a background for a website or social media post, or as a design element in a print or digital project.'},\n",
+ " {'image_id': '45',\n",
+ " 'caption': \"This is an image of a man wearing a black suit with a white shirt and a black tie. He is standing with his hands in his pockets, looking down at the ground. The image is in black and white, and the background is not visible. The man's face is not visible, but he appears to have a serious expression. He is wearing a black hat that is tilted slightly to the side, and his hair appears to be short and neatly styled. The man's hands are visible in his pockets, and he appears to be wearing black gloves. This image could be used for advertising a clothing brand or promoting a business event.\"},\n",
+ " {'image_id': '47',\n",
+ " 'caption': 'In the image, a small wooden hut sits atop a hill, overlooking a valley with mountains in the background. The hut has a small chimney on top, and a person stands on its porch, gazing at the view. The sky is clear and blue, with some distant clouds. The snow-covered mountains have a glacier visible on the side of one of the peaks, while the green valley below has scattered trees.'},\n",
+ " {'image_id': '48',\n",
+ " 'caption': 'This is a photograph of a residential street with houses on either side and a car parked on the side of the road. The houses are all different colors and styles, some having red, white, or blue roofs. The road is narrow with no sidewalks. The sky is cloudy with dark clouds in the background.\\n\\nA small white car with tinted windows is parked on the side of the road at an angle with one side facing the camera and the other side facing away.\\n\\nSeveral houses are visible in the background with different colors, styles, and some with balconies or porches. A few trees are also visible.'},\n",
+ " {'image_id': '49',\n",
+ " 'caption': 'The image shows a beautiful garden with a fountain in the center. The fountain is surrounded by lush green plants and flowers. Several umbrellas and chairs are set up around the fountain for people to sit and relax in the shade. The sky is dark and cloudy with a few visible stars, creating a peaceful and serene mood.'},\n",
+ " {'image_id': '50',\n",
+ " 'caption': \"The image shows a group of young women sitting on a staircase. They are all wearing different clothing, with some wearing red and white outfits and others wearing black and white outfits. They are all smiling and looking at the camera. The image is well lit and the colors are bright and vibrant.\\n\\nThe girls are sitting on the stairs in a casual pose, with their legs crossed and their arms resting on their knees. They are all wearing different shoes, with some wearing sneakers and others wearing boots. The background is white and there are no other objects or people visible in the image.\\n\\nOverall, the image is a cheerful and upbeat representation of a group of young women enjoying each other's company. The bright colors and natural lighting give the image a lively and energetic feel.\"},\n",
+ " {'image_id': '52',\n",
+ " 'caption': \"The man in the image is wearing a yellow shirt and brown pants. He is holding a trophy in his left hand and smiling at the camera. There is a red carpet on the ground in front of him. Behind him, there is a wall with a banner that reads 'Indian Film Academy Awards' in white letters. There are several people in the background, some of whom are clapping and others are standing around. The overall mood of the image is celebratory and joyful.\"},\n",
+ " {'image_id': '55',\n",
+ " 'caption': 'This is a black and white photograph of a bride and groom standing next to each other in formal attire. The bride is wearing a long, white dress with a veil and carrying a bouquet of flowers. The groom is wearing a black tuxedo with a white shirt and bow tie. They are both smiling at the camera. In the background, there is a large, ornate mirror hanging on the wall.'},\n",
+ " {'image_id': '56',\n",
+ " 'caption': 'The image shows a small outdoor office space with a white desk and chair, surrounded by green grass and trees in the background. The walls of the office are made of wood and have large windows that allow natural light to enter. The roof of the office is made of corrugated metal and has a skylight in the center. The floor of the office is made of wood and has a rug on it. There is a white door on the side of the office that leads to the outside.'},\n",
+ " {'image_id': '57',\n",
+ " 'caption': 'Input: This image is a painting of a baby boy sitting in a bathtub filled with water. The baby is wearing a white diaper and is looking up at the viewer with a curious expression. The background of the painting is a blue and white tiled bathroom with a sink and toilet in the corner. There is also a shower head on the wall above the bathtub. The overall mood of the painting is calm and peaceful.\\n\\nOutput: This image is a painting of a baby boy sitting in a bathtub filled with water. The baby is wearing a white diaper and is looking up at the viewer with a curious expression. The background of the painting is a blue and white tiled bathroom with a sink, toilet, and shower head. The overall mood of the painting is calm and peaceful.'},\n",
+ " {'image_id': '58',\n",
+ " 'caption': 'This image shows a group of people sitting around a long wooden table. They are all wearing casual clothing and appear to be engrossed in conversation. The room has large windows that let in a lot of natural light, and there are plants in pots on the windowsill. The walls are painted a light color, and there are wooden beams on the ceiling. The overall atmosphere of the room is warm and inviting.'},\n",
+ " {'image_id': '59',\n",
+ " 'caption': 'The image shows a woman walking down a hallway wearing a yellow dress with a pleated skirt and black shoes. The dress has a v-neckline and short sleeves. The woman is wearing a pair of black sunglasses and has her hair styled in a messy bun. The walls of the hallway are made of white tiles and there are black and white tiles on the floor. There is a large window on one side of the hallway that lets in a lot of natural light. The overall atmosphere of the image is bright and cheerful.'},\n",
+ " {'image_id': '60',\n",
+ " 'caption': 'This is a photo of a young woman lying on the floor with her daughter on her back. They are both smiling and looking at the camera. The woman is wearing a white shirt and jeans, and her daughter is wearing a pink dress. They are in a living room with a couch and a coffee table in the background. The walls are painted white and there are some plants in pots on the floor. The lighting is bright and natural, coming from the windows on the left and right sides of the room. The overall mood of the photo is happy and relaxed.'},\n",
+ " {'image_id': '62',\n",
+ " 'caption': 'This is a black and white image of a young girl with a crown on her head. She is wearing a white dress and has her hands on her hips. The girl is looking down and has a serious expression on her face.'},\n",
+ " {'image_id': '63',\n",
+ " 'caption': 'The image shows a coral reef with a variety of fish swimming around it. The coral reef is made up of many different types of coral, including brain coral, star coral, and mushroom coral. There are also many different types of fish swimming around the reef, including angelfish, butterflyfish, and parrotfish. The water is clear and blue, and there is a lot of sunlight shining down on the reef. The overall scene is very colorful and vibrant, with a lot of different textures and shapes in the coral and fish.'},\n",
+ " {'image_id': '64',\n",
+ " 'caption': 'The image shows a bulletin board with various items on it. At the top, there is a red and white sign that reads \"Working Wall.\" Below the sign, there are several pieces of paper and posters pinned to the board. One poster has a drawing of a person with a speech bubble coming out of their mouth. Another poster has a quote that reads, \"Believe in yourself and all that you are. Know that there is something inside you that is greater than any obstacle.\" The board also has several notes and reminders pinned to it, including one that reads, \"Don\\'t let yesterday take up too much of today.\"'},\n",
+ " {'image_id': '65',\n",
+ " 'caption': 'The image shows a small body of water in the middle of a snowy forest. The water has a reddish color and is surrounded by trees and rocks. There is a small stream flowing into the water from the left side. The sky is clear and blue.'},\n",
+ " {'image_id': '66',\n",
+ " 'caption': 'The woman in the image is wearing ripped jeans and a white t-shirt with a black and white graphic on the front. She is standing with her hands on her hips and looking at the camera. The jeans are frayed at the knees and have holes in the thighs.'},\n",
+ " {'image_id': '68',\n",
+ " 'caption': \"The image depicts a construction site with a large crane lifting heavy equipment. The crane has a long arm with a hook at the end, which is being used to lift a large metal beam. The beam is suspended from the crane's hook and is being lowered onto a pile of other metal beams on the ground.\"},\n",
+ " {'image_id': '69',\n",
+ " 'caption': 'This is an architectural drawing of a building. It shows the front and side elevations of the building. The building is made up of several floors, each with multiple windows and balconies. There is a central staircase that leads to the upper floors. The building has a symmetrical facade, with a large central entrance and two smaller entrances on either side. The roof is flat and has several skylights.'},\n",
+ " {'image_id': '70',\n",
+ " 'caption': 'The image shows a desk with a stack of books on it. There is a red apple sitting on top of the stack of books with a pencil next to it. There is a painting on the wall behind the desk of a landscape with mountains in the background and a river running through it. The window behind the desk looks out onto the same landscape. The room is well lit with natural light coming in from the window.'},\n",
+ " {'image_id': '71',\n",
+ " 'caption': 'The building in the image appears to be a multi-storey commercial building with several shops on the ground floor and offices on the upper floors. The facade of the building is white with blue accents, and there are several windows on each floor. The building appears to be well-maintained with clean windows and fresh paint. There are several people walking on the sidewalk and a few cars parked on the street, suggesting a bustling commercial area with a mix of shops and offices.'},\n",
+ " {'image_id': '72',\n",
+ " 'caption': 'This image shows a doctor and a patient sitting at a table in a hospital or clinic. The doctor is wearing a white lab coat and has a stethoscope around his neck. The patient is wearing a pink headscarf and is seated in front of the doctor. There is a table between them with various medical equipment on it, including a syringe and a vial of liquid. The room has a white wall and a window with blinds in the background.'},\n",
+ " {'image_id': '73',\n",
+ " 'caption': 'The image shows a woman The woman has long, wavy, blonde hair and is wearing a grey sweater. She has a serious expression and is looking directly at the camera. The background is dark and the room appears to be dimly lit. There are no other objects or people visible in the image.'},\n",
+ " {'image_id': '75',\n",
+ " 'caption': 'The image shows a group of men wearing black suits and orange socks standing in a circle. One of the men is wearing a white shirt and black pants, while the others are wearing black pants and white shirts. They are all looking at the camera and smiling. The background is a brick wall with graffiti on it.'},\n",
+ " {'image_id': '77',\n",
+ " 'caption': 'This image shows a dirt road surrounded by pine trees. Two cars are parked on the side of the road. The road is in good condition with no visible potholes or debris. The trees on either side are tall and full, providing shade. The sky is clear and blue with a few fluffy clouds visible in the distance.'},\n",
+ " {'image_id': '78',\n",
+ " 'caption': 'The image shows a cluster of glowing green mushrooms of various shapes and sizes growing out of the ground in the dark. The mushrooms overlap each other, creating an eerie and mysterious atmosphere.'},\n",
+ " {'image_id': '79',\n",
+ " 'caption': 'The image shows a night sky with stars and constellations visible in the sky. The constellations are labeled with their names, such as Orion, Cassiopeia, and Andromeda. The stars in the sky are also visible, with some of them labeled with their names, such as Alpha Centauri and Betelgeuse. The sky appears to be clear, with no clouds or other obstructions visible.'},\n",
+ " {'image_id': '80',\n",
+ " 'caption': 'This image shows a dining room with dark wood paneling on the walls.'},\n",
+ " {'image_id': '82',\n",
+ " 'caption': 'The image shows a pink slice of cake with a crumbly texture on a plate with a fork on top. The plate is on a table with a white tablecloth, and there is a cake stand with more slices of cake in the background.'},\n",
+ " {'image_id': '84',\n",
+ " 'caption': 'This image is a geometric pattern made up of triangles arranged in a symmetrical pattern. The triangles are white and made up of straight lines against a light gray background. The image gives a modern and simple feel.'},\n",
+ " {'image_id': '85',\n",
+ " 'caption': 'This image is a quote from the book of Psalms in the Bible, which reads, \"For the Lord your God is a sun and shield; the Lord bestows favor and honor. No good thing does he withhold from those who walk uprightly\" (Psalm 84:11). The quote is written in black ink on a white background.'},\n",
+ " {'image_id': '86',\n",
+ " 'caption': 'the second row are all tails up. The tails of the coins in the first row face the heads of the coins in the second row, and vice versa.'},\n",
+ " {'image_id': '87',\n",
+ " 'caption': 'The image is outdoors.\\n* There is a clear blue sky in the background.'},\n",
+ " {'image_id': '88',\n",
+ " 'caption': 'This image shows two people sitting at a desk in front of two computer monitors in an office or workspace with white walls, ceiling, and floor. The people are dressed in business attire, with the woman wearing a blue shirt and black pants, and the man wearing a white shirt and black pants. Both are looking at the computer screens intently, with the woman pointing at something on the left monitor and the man looking at something on the right monitor.'},\n",
+ " {'image_id': '89',\n",
+ " 'caption': 'This image shows a bowl of noodles with shrimp, mushrooms, and other vegetables, cooked and ready to eat. The bowl is sitting on a dark surface, likely a table or countertop. No other items are visible in the image.'},\n",
+ " {'image_id': '90',\n",
+ " 'caption': 'The image shows a storefront window with posters of various musicians and bands displayed on it. The posters are of different sizes and designs, with some featuring images of the musicians and others featuring their names and album covers. The posters are arranged in a haphazard manner, creating a vibrant and colorful display in the window.'},\n",
+ " {'image_id': '91',\n",
+ " 'caption': 'This image shows a woman walking down a city street wearing a white coat, black boots, and carrying a black handbag. The street is lined with tall buildings, trees, and parked cars. The sky is cloudy, and there are no other people visible.'},\n",
+ " {'image_id': '92',\n",
+ " 'caption': 'The image shows a cartoon fox pulling a sled with a Christmas tree through a snowy forest. The fox wears a red scarf and has a happy expression. Pine trees are in the background and the sky is cloudy.'},\n",
+ " {'image_id': '93',\n",
+ " 'caption': 'The image shows a floor plan of a two bedroom, two bathroom apartment with an open concept living and dining area, and a kitchen with an island. The bedrooms are located at the back of the apartment, with one having a walk in closet and the other having a balcony. The bathrooms are located at the front of the apartment, with one having a shower and the other having a bathtub. There is also a laundry room with a washer and dryer. The apartment has hardwood floors throughout and a large balcony with a view of the city.'},\n",
+ " {'image_id': '94',\n",
+ " 'caption': 'This image shows a group of women wearing pink hats standing in front of a market stall that sells various types of jewelry, including necklaces, bracelets, and earrings. The women are looking at the jewelry on display, and one of them is holding a pair of earrings. The background of the image is filled with other market stalls and people walking by.'},\n",
+ " {'image_id': '96',\n",
+ " 'caption': 'This is a black and white image of a military font, consisting of various block-style letters and numbers in different sizes and styles, arranged in a stylized manner, and standing out against the white background.'},\n",
+ " {'image_id': '97',\n",
+ " 'caption': 'This image depicts a group of people dressed in traditional Mexican clothing, including a woman with a guitar and a man with a sombrero. They are standing in front of a campfire and there are several horses in the background. The overall tone of the image is warm and inviting, with the orange and yellow flames of the fire casting a warm glow on the faces of the people.'},\n",
+ " {'image_id': '98',\n",
+ " 'caption': 'The image shows a group of football players in maroon and gold uniforms walking onto the field. They are carrying their helmets in their hands and walking in a line, with one player leading the way and the others following behind. The field is green with white lines on it. The stadium is in the background and there are people in the stands watching the game.'},\n",
+ " {'image_id': '99',\n",
+ " 'caption': 'This is a black and white photograph of a woman carrying a small child on her back as they walk down a dirt road surrounded by trees. The woman is wearing a white shirt and blue jeans, and the child is wearing a yellow shirt and blue overalls. The woman is smiling, and the child is looking up at her with a happy expression. There are trees on either side of the road, and the sky is visible in the background.'},\n",
+ " {'image_id': '101',\n",
+ " 'caption': 'The image shows a small island in the middle of the ocean, with palm trees and coral reefs surrounding it. There is a small boat floating in the water near the island, with fish swimming around it. The sky is dark and there is a full moon in the background.\\n\\nThe colors used in the image are mostly shades of blue and green, with some brown and yellow for the sand and coral. The overall mood of the image is peaceful and serene, with a sense of calmness and tranquility. The image could be used to depict a tropical paradise or a secluded island getaway.'},\n",
+ " {'image_id': '103',\n",
+ " 'caption': \"This image shows a bride standing in a park, wearing a white wedding dress and holding a bouquet of flowers. The bride's veil is blowing in the wind and her hair is styled in loose waves. The trees in the background are covered in green leaves and there is a path leading through the park. The colors are bright and vivid.\"},\n",
+ " {'image_id': '105',\n",
+ " 'caption': 'The image shows a close up view of a flower. The flower appears to be in full bloom and is a vibrant pink color. The petals are delicately arranged and the center of the flower is a darker pink. The image is taken from a low angle, giving the viewer a sense of the size and beauty of the flower.\\n\\nThe flower appears to be in full bloom and is a vibrant pink color. The petals are delicately arranged and the center of the flower is a darker pink. The image is taken from a low angle, giving the viewer a sense of the size and beauty of the flower.'},\n",
+ " {'image_id': '107',\n",
+ " 'caption': 'The image shows a watch with a silver case, black leather strap, white dial, black hands and hour markers, and a small red dot on the face of the watch.'},\n",
+ " {'image_id': '108',\n",
+ " 'caption': \"This image is a sunset over a body of water with a large bridge in the distance. The sun is setting behind the bridge, casting a warm orange glow over the water. The sky is filled with clouds, some of which are lit up by the sun's rays. There are no boats or other objects visible in the water. The bridge appears to be a suspension bridge, with cables running from the towers to the deck. The deck is wide and appears to be made of concrete or asphalt. The towers are tall and slender, with cables running from the top of each tower down to the deck. The bridge appears to be in good condition, with no visible damage or wear. The orange and pink hues of the sky contrast nicely with the deep blue of the water.\"},\n",
+ " {'image_id': '110',\n",
+ " 'caption': 'This is an image of a group of investors watching a number on a screen. They appear to be interested and focused on the information being presented. The image is in black and white, with the investors in suits and ties sitting at a table in front of a large screen displaying the number. There are no other details visible in the image.'},\n",
+ " {'image_id': '111',\n",
+ " 'caption': 'This image shows a person watering a small plant in a pot. The person is holding a watering can and spraying water onto the soil around the plant. There are other potted plants in the background, and a greenhouse or other structure can be seen in the distance.'},\n",
+ " {'image_id': '112',\n",
+ " 'caption': 'The image shows a red 2020 Audi S7 Sportback parked in front of a mountain range at sunset. The car has a sleek and modern design with sharp lines and a bold front grille. It has large alloy wheels and a spoiler on the back. The windows are tinted and the car has a panoramic sunroof. The car is parked on a rocky terrain with mountains in the background.'},\n",
+ " {'image_id': '113',\n",
+ " 'caption': \"The image shows a watch with an olive green nylon strap and a black dial. The dial has white hour markers and hands, as well as a date display at the 6 o'clock position and a small seconds subdial at the 9 o'clock position. The watch has a stainless steel case.\"},\n",
+ " {'image_id': '118',\n",
+ " 'caption': 'The woman in the image is wearing a black bikini and standing in front of a tall, spiky cactus. She poses with her hands on her hips, showcasing her long, dark hair. The bikini top features a deep V-neckline, while the bottoms are high-waisted.'},\n",
+ " {'image_id': '119',\n",
+ " 'caption': 'The image shows a yellow blimp flying in the sky with the words \"The Rolling Stones\" written on the side. The blimp has a large mouth with a tongue sticking out, and it appears to be smiling. The sky is a bright blue and there are a few clouds in the background. The Rolling Stones are a British rock band that formed in 1962. The band consists of Mick Jagger, Keith Richards, Charlie Watts, and Ronnie Wood. They have released many hit songs, including \"Satisfaction,\" \"Jumpin\\'Jack Flash,\" and \"Brown Sugar.\" The Rolling Stones are considered one of the most influential and iconic bands in rock music history.'},\n",
+ " {'image_id': '120',\n",
+ " 'caption': 'The image shows a wooden surface with various decorations on it. There are gold and silver stars hanging from the ceiling, as well as a red and white deer head hanging on the wall. There is also a white ceramic mug sitting on the wooden surface.'},\n",
+ " {'image_id': '121',\n",
+ " 'caption': 'The image shows a soccer game with two teams playing. The goalkeeper for the white team is diving to his left to make a save and stop the ball from going into the net. The ball is coming from the right side of the image. The blue team is trying to score while the white team is trying to defend.'},\n",
+ " {'image_id': '122',\n",
+ " 'caption': 'The image shows a group of children walking around the world, carrying books and backpacks. They are smiling and seem to be enjoying their journey. The background is a blue sky with clouds, and there are no other objects or people in the image.\\n\\nThe children are dressed in different clothing, with some wearing uniforms and others wearing casual clothes. They all have backpacks on their backs and are holding books in their hands. The image has a cartoon style, with exaggerated features and bright colors.\\n\\nThe overall theme of the image is education and travel, with the children representing students going on a field trip or exploring the world. The image could be used to promote a travel or education brand, or to encourage children to explore and learn about the world around them.'},\n",
+ " {'image_id': '124',\n",
+ " 'caption': 'The image shows a group of five coins, each with a different design on it. The coins are made of metal and have a shiny surface. The designs on the coins depict various historical events and figures, including the Statue of Liberty, the American flag, and a bald eagle. The coins are arranged in a row, with each coin slightly overlapping the one in front of it. The overall image is symmetrical, with the coins arranged in a straight line.'},\n",
+ " {'image_id': '126',\n",
+ " 'caption': 'This image shows a bride and groom standing under a gazebo during their wedding ceremony. The bride is wearing a white wedding dress and holding a bouquet of flowers, while the groom is wearing a black tuxedo and holding a single red rose. The couple is surrounded by a group of people in formal attire, who are watching the ceremony from the sidelines. The ceremony is taking place in an outdoor setting, with trees and greenery visible in the background. The sky is overcast, with clouds covering the sun.'},\n",
+ " {'image_id': '127',\n",
+ " 'caption': 'This image shows a clothing store with various clothing items hanging on racks and displayed on shelves. The walls are decorated with a large cityscape mural and several black and white photographs. The floor is made of wooden planks, and there is a black and white tiled pattern on the walls. The lighting in the room is dim, and there are several black and white lamps hanging from the ceiling. The atmosphere of the room is sophisticated and modern.'},\n",
+ " {'image_id': '128',\n",
+ " 'caption': 'The image depicts a person wearing a crown and holding a basketball in his right hand. He is standing in front of a blue and purple background with a pattern of stars and other celestial objects. The person is wearing a purple and gold jersey with the number 23 on the back and a crown on his head. He has a serious expression on his face and is looking directly at the camera.\\n\\nIn terms of details, the person is wearing a purple and gold jersey with the number 23 on the back and a crown on his head. He has a serious expression on his face and is looking directly at the camera. The background is a mix of blue and purple with a pattern of stars and other celestial objects.'},\n",
+ " {'image_id': '129',\n",
+ " 'caption': 'This is a black and white image of the logo for the fashion brand, Chanel. The logo is made up of the letters \"C\" and \"H\" in white, on a black background. The letters are stacked on top of each other, with the \"C\" on the left and the \"H\" on the right. This creates a sense of balance and symmetry in the design. The black and white color scheme gives it a sleek and sophisticated look.'},\n",
+ " {'image_id': '132',\n",
+ " 'caption': 'There are two dogs lying on the floor, a golden retriever and a black labrador, with the former wearing a harness and the latter wearing a collar. They seem relaxed and content. In the background, there is a brown leather couch, and in front of it, there is a wooden coffee table, indicating that the room is a living room.'},\n",
+ " {'image_id': '133',\n",
+ " 'caption': 'The image shows a dump truck carrying a load of dirt and rocks driving down a dirt road in a large open pit mine. The pit is surrounded by large piles of dirt and rocks, and there are no trees or other vegetation in the area. The sky is clear and blue.'},\n",
+ " {'image_id': '134',\n",
+ " 'caption': 'The image is a quote that reads, \"Being a mechanical engineer is easy, it\\'s like riding a bike you never forget how to do it.\" The quote is written in black text on a white background in large letters. It is meant to be inspirational and motivational, encouraging people to pursue careers in mechanical engineering and reminding them of the skills they have learned.'},\n",
+ " {'image_id': '135',\n",
+ " 'caption': 'The image shows a green hill with a tower on top of it. The tower is tall and has a pointed top. The sky is cloudy and there are some clouds in the background. The grass on the hill is long and lush. There are no other buildings or structures visible in the image.\\n\\nThe image shows a green hill with a tower on top of it. The tower is tall and has a pointed top. The sky is cloudy and there are some clouds in the background. The grass on the hill is long and lush. There are no other buildings or structures visible in the image.'},\n",
+ " {'image_id': '137',\n",
+ " 'caption': 'The image shows a white sandy beach with clear blue water and a few people walking along the shore. There are several buildings in the background, which appear to be hotels or resorts. The sky is a bright blue and there are a few clouds in the distance.'},\n",
+ " {'image_id': '138',\n",
+ " 'caption': 'the majority of the space in the foreground and the pot of stew and plate of herbs providing context in the background. The colors of the stew and herbs complement each other well, creating an appetizing and visually appealing image.'},\n",
+ " {'image_id': '139',\n",
+ " 'caption': 'This is an image of a tapered roller bearing. The bearing is made up of an outer ring, an inner ring, and a number of tapered rollers in between to support a heavy load while still being able to rotate smoothly. The outer ring is typically larger in diameter and made of steel or cast iron, while the inner ring is typically smaller in diameter and made of a softer material like brass or bronze. The rollers are made of hardened steel and positioned between the outer and inner rings. This type of bearing is commonly used in machinery, vehicles, and heavy equipment due to its ability to handle high loads.'},\n",
+ " {'image_id': '142',\n",
+ " 'caption': 'This is an image of three alpacas standing in a pen. The first alpaca is black with a white face, the second is white with a black face, and the third is brown with a white face. They are all standing next to each other and looking at the camera. There is a fence separating them from the viewer, and there is hay in the pen with them. The background is a mix of dirt and grass.'},\n",
+ " {'image_id': '143',\n",
+ " 'caption': 'This is a black dress with lace detailing on the neckline and sleeves. The dress has a fitted bodice and a flared skirt.'},\n",
+ " {'image_id': '145',\n",
+ " 'caption': 'The image shows a group of people standing around a large ice cream cone. The cone is green and has a scoop of ice cream on top. The people in the image are reaching out to touch the ice cream cone. They are all smiling and appear to be enjoying the ice cream. There is a white background behind the people and the ice cream cone.'},\n",
+ " {'image_id': '146',\n",
+ " 'caption': 'This image is a colorful striped background consisting of pastel shades of pink, yellow, blue, and green arranged vertically in a striped pattern with each color separated by a thin white line. The background appears to be a digital image giving the image a cheerful and playful feel. It could be used as a design element for a website or app, or as part of a graphic design project. It could also be used as a background for a social media post or as part of a marketing campaign to draw attention to a particular product or brand. In conclusion, the image is visually appealing and versatile for use in a variety of design contexts.'},\n",
+ " {'image_id': '147',\n",
+ " 'caption': 'This is an image of a man sitting on a bench with his knee wrapped in a bandage. He is wearing a white t-shirt and black shorts, and is holding his right arm with his left hand. The man appears to be in pain, as his facial expression is one of discomfort. The background of the image is a city street with trees and buildings visible in the distance.'},\n",
+ " {'image_id': '148',\n",
+ " 'caption': 'The image shows a pair of tan suede high top sneakers with laces on the front and no laces on the back. There is a small bow on the front of each shoe. The soles of the shoes are made of a light-colored material and have a small amount of dirt on them. The shoes appear to be in good condition.'},\n",
+ " {'image_id': '149',\n",
+ " 'caption': 'This image shows a man in a blue shirt and white shorts holding a tennis racket on a green tennis court. He is wearing white socks and black tennis shoes. The court has white lines on it and there is a net in the background.'},\n",
+ " {'image_id': '151',\n",
+ " 'caption': 'The image shows a group of men in white shirts standing on the balcony of a building. They are all smiling and posing for the camera. The building behind them appears to be made of stone and has arched windows and balconies. There are trees in the background and the sky appears to be cloudy.'},\n",
+ " {'image_id': '152',\n",
+ " 'caption': 'This image shows a crowd of people at a concert, with their hands raised in the air. The stage is lit up with bright blue and green lights, and the people in the crowd are wearing a variety of clothing. Some are holding up their phones to take pictures or record the concert. The atmosphere is lively and energetic, with the crowd cheering and singing along to the music.'},\n",
+ " {'image_id': '153',\n",
+ " 'caption': 'This is a toy truck with a trailer attached to the back. The trailer is made of plastic and has a small door on the side. The truck is white with red and black accents and has a small cab with a steering wheel and two seats. There is a small ladder attached to the back of the truck.'},\n",
+ " {'image_id': '154',\n",
+ " 'caption': 'There are several types of cakes and desserts in the box. The cakes include chocolate, vanilla, carrot, and cheesecake. All cakes are covered in frosting and decorated with toppings like sprinkles, chocolate chips, and nuts. Additionally, the cakes have pieces of fruit such as grapes, strawberries, and blueberries arranged on top. The box sits on a white tablecloth with a knife and plate nearby.'},\n",
+ " {'image_id': '155',\n",
+ " 'caption': 'This image shows The building in the image is a warehouse with a large parking lot in front of it. The warehouse appears to be made of concrete and steel, with large windows on the front and sides. There are several large trucks parked in front of the warehouse. The sky is cloudy and there are some trees in the background.'},\n",
+ " {'image_id': '158',\n",
+ " 'caption': 'This is a clear wine glass with the words \"best dad in the world\" engraved on the side. It has a stem and bowl and is of standard size.'},\n",
+ " {'image_id': '159',\n",
+ " 'caption': 'There is a bed in the room with a white comforter and pillows. There are two lamps on either side of the bed. The walls are painted green and there is a window with white curtains. There is a desk with a computer on it in the corner of the room.'},\n",
+ " {'image_id': '160',\n",
+ " 'caption': 'The image shows a resort with a large swimming pool surrounded by lounge chairs and umbrellas. There are several buildings in the background with white walls and blue roofs. There are sand dunes and palm trees in the background indicating that the resort is located in a desert area. The sky is clear and blue with a few fluffy clouds in the distance.'},\n",
+ " {'image_id': '161',\n",
+ " 'caption': 'The image is a menu for a restaurant, displaying a range of dishes and drinks.'},\n",
+ " {'image_id': '162',\n",
+ " 'caption': 'This image shows a large open space with high ceilings and exposed beams. The walls are painted white and there are large windows letting in natural light. The floor is made of wooden planks and there are several pieces of furniture in the room, including a couch, chairs, and a dining table. The room appears to be a loft apartment with an industrial feel.'},\n",
+ " {'image_id': '163',\n",
+ " 'caption': 'The image shows a purple dresser with gold accents and two framed mirrors hanging on the wall above it. The dresser has six drawers and is made of wood with a high gloss finish. The mirrors have ornate gold frames and beveled edges. The room has hardwood floors and white walls. There are no other pieces of furniture in the room.'},\n",
+ " {'image_id': '166',\n",
+ " 'caption': 'The image shows the front cover of a book with the title'},\n",
+ " {'image_id': '168',\n",
+ " 'caption': 'The image shows a bottle of Coca Cola and a glass of iced tea on a wooden table with a dark background.\\n\\nThe bottle of Coca Cola is made of clear glass and has a red and white label with the brand name and logo. The glass of iced tea is also made of clear glass and has a small amount of ice cubes floating in it.\\n\\nThe table is made of wood and has a rough, textured surface. The background is very dark, almost black, and there is no other visible objects in the image.'},\n",
+ " {'image_id': '169',\n",
+ " 'caption': 'The painting depicts a river with trees on either side. The water is calm and reflects the clouds in the sky. The sky is a bright blue with fluffy white clouds. The trees on the left side of the painting are tall and thin, while the trees on the right side are shorter and more spread out. There are no people or animals in the painting.'},\n",
+ " {'image_id': '170',\n",
+ " 'caption': 'The image shows a road at night with several cars driving on it. The headlights of the cars are shining brightly in the dark, and there are some streetlights on the side of the road. There are also some trees and buildings visible in the background. The image is taken at night, so the sky is dark and there are no stars visible.\\n\\nThe cars in the image are driving on the left side of the road, which is typical for countries that drive on the left side of the road. The headlights of the cars are shining brightly, which helps to illuminate the road and make it easier for the drivers to see where they are going. The streetlights on the side of the road also help to provide additional illumination.\\n\\nOverall, the image shows a typical night scene on a road with cars driving on it.'},\n",
+ " {'image_id': '171',\n",
+ " 'caption': 'a turban and has a serious expression on his face. Behind him are two flags on the wall, one of which is the Indian flag. The room is dimly lit with a red carpet and dark curtains on the windows. The man has a lot of medals and insignias on his chest and is holding a microphone in his left hand. The desk in front of him has a laptop and some papers on it.'},\n",
+ " {'image_id': '173',\n",
+ " 'caption': 'The image shows a man standing in front of an iron gate. The gate is old and rusty, with the words \"Grand Nefat\" engraved on it. The man is wearing a white shirt and black pants, and has his arms crossed in front of him. Behind him, you can see a vineyard with grapevines growing on it. The sky is clear and blue, and there are some clouds in the distance.'},\n",
+ " {'image_id': '174',\n",
+ " 'caption': 'This image shows a crab on the sand with an orange body, white spots, and a brown head with black eyes. Its legs are long and thin, and its claws are large and sharp. The sand around the crab is light brown, and there are small rocks and pebbles in the background. The sky is cloudy and overcast.'},\n",
+ " {'image_id': '175',\n",
+ " 'caption': 'The image is a bracelet made of white pearls that are strung together on a thin and delicate gold chain with a clasp that fastens the bracelet around the wrist. The pearls are large, round, and have a smooth, glossy surface. The overall design of the bracelet is simple and elegant, making it suitable for both casual and formal wear.'},\n",
+ " {'image_id': '177',\n",
+ " 'caption': 'The image shows a group of men dressed in red and black uniforms, carrying rifles and marching down a street. They are part of a military parade or ceremony. The uniforms are similar to those worn by soldiers in the British army during the 19th century. The men are wearing red coats with black facings, black trousers, and black boots. They are also wearing black helmets with plumes on top. The rifles they are carrying are also black, with brass fittings and bayonets. The men are marching in formation, with one leading the way and the others following closely behind. The street is lined with people watching the parade, and there are buildings on either side. The image is in black and white, which gives it a historical feel.'},\n",
+ " {'image_id': '178',\n",
+ " 'caption': 'This image is a collage of various landmarks from around the world including the Statue of Liberty, Big Ben, the Eiffel Tower, the Taj Mahal, the Great Wall of China, and the Sydney Opera House. The landmarks are depicted in a colorful, abstract style on a multicolored background. The Statue of Liberty is located in New York City and is a symbol of freedom and democracy in the United States. Big Ben is the nickname for the Great Bell of the clock at the north end of the Palace of Westminster in London, England. It is often used as a symbol of London. The Eiffel Tower is a wrought iron lattice tower located on the Champ de Mars in Paris, France.'},\n",
+ " {'image_id': '181',\n",
+ " 'caption': 'The image shows a mother and two children sitting on the grass in front of a brick wall. The mother is wearing a white and red striped shirt, and the children are wearing white shirts with red and white stripes. The mother is holding a baby in her arms, and the children are looking up at her with smiles on their faces.'},\n",
+ " {'image_id': '182',\n",
+ " 'caption': \"This is a picture of a woman's dark brown hair in a messy bun with some highlights. The hair is pulled back into a ponytail and then twisted into a bun, with some strands sticking out of it. The bun is held in place with a hair tie, and the woman's face is not visible in the picture.\"},\n",
+ " {'image_id': '185',\n",
+ " 'caption': 'The image shows a group of people standing in front of a desk in an office. They are all smiling and posing for the camera. The people are dressed in casual clothing, with one person wearing a blue shirt and another wearing a green shirt. There is a logo on the wall behind the desk that says \"Tesla\". The room appears to be well lit and spacious.'},\n",
+ " {'image_id': '187',\n",
+ " 'caption': 'This image shows a wallpaper with a floral pattern on a light blue background. The pattern features various birds, fruits, and leaves in shades of green, pink, and yellow. The birds are perched on branches and leaves, while the fruits are depicted as apples, pears, and peaches. The leaves are depicted in various sizes and shapes, with some of them being more detailed than others. The overall design of the wallpaper is elegant and sophisticated, with a focus on natural elements.'},\n",
+ " {'image_id': '189',\n",
+ " 'caption': 'This image shows a group of people, including three adults and two children, sitting on a wooden deck with a rope railing overlooking a body of water. The adults are sitting in chairs and the children are standing next to them. They all seem to be enjoying their time together and are smiling. The background includes trees and a dock that leads out into the water.'},\n",
+ " {'image_id': '191',\n",
+ " 'caption': 'The image shows a piece of cake on a plate with white frosting and sprinkles on top. The plate has a red and white checkered napkin on it. The cake appears to be moist and fluffy, and the frosting is smooth and creamy. There is a bite taken out of the cake, revealing the moist and fluffy cake inside. The cake appears to be homemade and made with fresh ingredients.'},\n",
+ " {'image_id': '192',\n",
+ " 'caption': 'The image shows a piece of chocolate cake with sprinkles on top, sitting on a white plate with a pink and white floral design on it. The plate is sitting on top of a pink and white polka dot tablecloth. There are also some colorful sprinkles scattered around the plate.\\n\\nThe cake appears to be made of chocolate, with a smooth and even texture. The sprinkles on top are a variety of colors, including pink, yellow, and blue. The plate has a floral design on it, with pink and white flowers. The tablecloth also has a pink and white polka dot pattern.\\n\\nOverall, the image depicts a sweet and colorful dessert on a pretty table setting.'},\n",
+ " {'image_id': '195',\n",
+ " 'caption': 'The image shows a man standing on a stage with a microphone in his hand, singing into a microphone. He is wearing a green shirt and has a serious expression on his face. The background is dark and there are no other people visible in the image.'},\n",
+ " {'image_id': '196',\n",
+ " 'caption': 'The woman is wearing a white tank top and pink headband, with loose wavy curls and pink lipstick. She is smiling and looking at the camera against a background of a white sink, mirror, and towels in what appears to be a bathroom.'},\n",
+ " {'image_id': '197',\n",
+ " 'caption': \"The image is a quote that reads, `'to the world you may be just one person, but to one person you may be the world.'\\n\\nIt is a motivational quote that emphasizes the importance of individuality and the impact that one person can have on the world. The quote is written in a decorative font and is surrounded by a variety of colors and patterns, including stripes, polka dots, and flowers. The background is a light blue color with white clouds and a yellow sun. The overall design is cheerful and uplifting.\"},\n",
+ " {'image_id': '198',\n",
+ " 'caption': \"This image shows a person's hand holding an oatmeal mixture in a glass.\"},\n",
+ " {'image_id': '199',\n",
+ " 'caption': 'The image shows a group of football players running on a field at night. The players are wearing maroon and white uniforms with numbers on the front and back. One player is carrying the ball and the others are chasing after him. The field is lit up by stadium lights and there are trees in the background.'},\n",
+ " {'image_id': '201',\n",
+ " 'caption': 'The image shows a vegetable garden in a raised bed made of wooden planks, filled with various types of plants such as kale, lettuce, and chard. The plants are growing in different sizes, some reaching up to the top of the bed. The garden is surrounded by a lawn and trees are visible in the background.'},\n",
+ " {'image_id': '202',\n",
+ " 'caption': 'This is a diagram of a cross section of a rectangular prism. The rectangular prism is made up of four rectangular faces that are perpendicular to each other. Each face is made up of two parallel lines, one horizontal and one vertical, that intersect at right angles. The rectangular prism also has four vertices and four edges.'},\n",
+ " {'image_id': '203',\n",
+ " 'caption': \"The image shows a woman in a white wedding dress sitting on a wooden bench under a large tree. The tree has a banner hanging from its branches that reads, `'to the moon and back.' The woman is looking off into the distance, and the image has a dreamy, ethereal quality to it.\"},\n",
+ " {'image_id': '204',\n",
+ " 'caption': 'The image shows a group of people standing on top of a large rock formation in the middle of the ocean, jutting out of the water. There is a small island visible in the distance. The clear, blue sky has some clouds in the distance. The people appear to be enjoying the view and taking in the scenery. The water around the formation is choppy with waves crashing against the rocks.'},\n",
+ " {'image_id': '206',\n",
+ " 'caption': 'This image shows a bride and groom cutting their wedding cake at an outdoor reception. The bride is wearing a long dress and the groom is wearing a black tuxedo with a white shirt and black tie. The multi-tiered cake has white frosting and is decorated with pink and white flowers. The table is covered with a white tablecloth and chairs are set up for guests to sit and watch. The background is a wooded area with trees and greenery.'},\n",
+ " {'image_id': '207',\n",
+ " 'caption': 'This is an image of a black deer standing in the middle of a lush and green forest. The deer has large antlers on its head and is looking straight at the camera with a curious expression on its face. The forest is surrounded by tall trees and bushes, and ferns and other vegetation are visible in the background. The sunlight is shining through the trees, casting dappled shadows on the ground.'},\n",
+ " {'image_id': '208',\n",
+ " 'caption': 'The image displays a wooden house with a red roof, two large windows on either side and a door in the middle. The house is surrounded by a well-maintained lawn and several trees. A pathway leads to the front door of the house.'},\n",
+ " {'image_id': '209',\n",
+ " 'caption': 'The image is a birthday cake decorated with purple and white frosting, featuring a fleur de lis symbol of the LSU Tigers football team. The words \"happy birthday\" are written on the cake in white frosting.'},\n",
+ " {'image_id': '212',\n",
+ " 'caption': 'The image shows a bedroom with pink walls and a white ceiling. There is a large window with white curtains and a balcony with a view of the city. The bed is made with white sheets and has two pillows on it. There is a small table next to the bed with a vase of yellow flowers on it. The room appears to be well lit and spacious.'},\n",
+ " {'image_id': '215',\n",
+ " 'caption': 'The image shows a dark grey 2019 volkswagen passat\\n\\nThe 2019 volkswagen passat is a midsize sedan that has a sleek and modern design. The front of the car has a large grille with the vw logo in the center, surrounded by LED headlights. The sides of the car have a flowing line that runs from the front to the back, giving the car a smooth and aerodynamic look. The back of the car has a sloping roofline and a pair of tailpipes on either side. The car is shown in a light grey color.'},\n",
+ " {'image_id': '216',\n",
+ " 'caption': 'This image shows a small island surrounded by turquoise water. The island is covered in green vegetation and has a sandy beach on one side. There is a small boat in the water near the island. The sky is clear and blue, with a few fluffy clouds visible in the distance.'},\n",
+ " {'image_id': '217',\n",
+ " 'caption': 'This image shows a group of people climbing on a rocky mountain. They are wearing climbing gear, including helmets, harnesses, and ropes. The sky is clear and blue, and there are some clouds in the distance. The landscape is rocky and barren, with no vegetation.'},\n",
+ " {'image_id': '218',\n",
+ " 'caption': 'There is a woman sitting in an armchair wearing a green dress and holding a phone to her ear. She appears to be talking on the phone. A bookshelf with several books is visible in the background.'},\n",
+ " {'image_id': '219',\n",
+ " 'caption': 'The image shows a man in a tuxedo The man in the image is wearing a black tuxedo with a white shirt and black tie. He has a beard and is holding an Oscar statuette in his left hand. The statuette is made of gold and is engraved with the name of the award and the year it was given. The man has a serious expression on his face and is looking directly at the camera. The background of the image is blurred and there are no other objects or people visible.'},\n",
+ " {'image_id': '223',\n",
+ " 'caption': 'The image displays the Sydney skyline at night, featuring the Sydney Harbour Bridge in the background. The bridge is illuminated with blue and white lights, and is a significant symbol of the city. The steel arch bridge serves as an important transportation link, connecting the north and south shores of Sydney. A concrete pier with a metal railing extends from the foreground into the harbour, with a few boats docked to it, and their lights reflecting in the water.'},\n",
+ " {'image_id': '224',\n",
+ " 'caption': 'The image shows a large, ornate room with high vaulted ceilings and large stained glass windows. The walls are made of stone and there are marble columns and arches throughout the room. The floor is made of checkerboard tiles and there are several people walking around the room. There is a large chandelier hanging from the center of the room and several other light fixtures on the walls. The room appears to be very grand and luxurious.'},\n",
+ " {'image_id': '225',\n",
+ " 'caption': 'The image shows a train traveling on tracks through a mountainous area with trees and mountains in the background. The train appears to be a commuter train, with a white body and red stripes. There are no passengers visible in the image.\\n\\nIn the foreground, there are some trees and bushes growing along the side of the tracks. The leaves on the trees are starting to change color, indicating that it is autumn. The mountains in the background are covered in a mix of evergreen and deciduous trees, with some of the leaves starting to change color as well. The sky is clear and blue, with a few fluffy clouds visible.\\n\\nOverall, the image depicts a peaceful scene of a train traveling through a beautiful mountainous area on a sunny autumn day.'},\n",
+ " {'image_id': '226',\n",
+ " 'caption': 'The image shows a group of people, including a woman in a white dress and a man in a suit, standing in front of a group of soldiers dressed in military uniforms. The woman is holding a bouquet of flowers and the man is standing next to her with his hands behind his back. The soldiers are standing at attention in a line behind the couple, with their arms at their sides. The image appears to have been taken in a formal setting, such as a military parade or ceremony.\\n\\nThe woman in the image is likely a member of a royal family or other high-ranking dignitary, given her formal attire and the presence of soldiers in military uniforms. The man standing next to her may be her spouse or a member of her entourage. The soldiers in the image are likely part of a ceremonial guard or honor guard, responsible for providing a visual display of military precision and discipline during public events.\\n\\nOverall, this'},\n",
+ " {'image_id': '229',\n",
+ " 'caption': 'This image shows a samurai helmet on a wooden stand. The helmet is made of metal and has intricate designs on the front and back. It has a pointed top, a curved brim, and two small horns on the top. The helmet is in good condition and appears to be well made.'},\n",
+ " {'image_id': '230',\n",
+ " 'caption': 'The image shows a garden with a wooden bench surrounded by white flowers and greenery. Tall trees and a stone wall make up the background, creating a peaceful and serene atmosphere.'},\n",
+ " {'image_id': '234',\n",
+ " 'caption': 'This is a photograph of a kitchen. The walls are painted white, and the floor is made of wooden planks. The countertops and cabinets are made of black granite and wood respectively. There is a stainless steel sink on top of the counter along with a faucet and garbage disposal. All the appliances including refrigerator, stove, and dishwasher are made of stainless steel. A large window above the sink allows natural light to come in making the room well lit. The overall design of the kitchen is modern and sleek with clean lines and a minimalist aesthetic.'},\n",
+ " {'image_id': '235',\n",
+ " 'caption': 'The image shows a group of people standing on a stage in front of a microphone. They are all dressed in formal attire, with one person holding a trophy and another person holding a microphone. Behind them, there is a curtain and a banner that reads \"Congratulations\" in large letters. The people in the image appear to be smiling and happy.'},\n",
+ " {'image_id': '238',\n",
+ " 'caption': 'This is a collage of various images, including a banana, a man in a suit, and a woman in a bikini.'},\n",
+ " {'image_id': '239',\n",
+ " 'caption': 'The image shows a man and a woman walking on the beach. The man is wearing a white robe and the woman is wearing a black robe. They are holding hands and walking towards the water. The beach is sandy and there are some rocks in the water. The sky is clear and there are some clouds in the distance. The sun is shining brightly and casting shadows on the sand.\\n\\nThe man and woman appear to be enjoying a leisurely walk on the beach. They are not in a hurry and seem to be taking in the sights and sounds of the ocean. The man is wearing a white robe and the woman is wearing a black robe. They are holding hands and walking towards the water. The beach is sandy and there are some rocks in the water. The sky is clear and there are some clouds in the distance. The sun is shining brightly and casting shadows on the sand.'},\n",
+ " {'image_id': '240',\n",
+ " 'caption': 'The image shows a gearbox, which is a mechanical device used to transmit power from one rotating shaft to another. The gearbox consists of several gears and shafts that work together to change the speed and torque of the rotating shafts. The gears are made of metal and have teeth that mesh with each other to transmit power. The shafts are used to hold the gears in place. The gearbox is used in automobiles, motorcycles, and industrial machinery.'},\n",
+ " {'image_id': '241',\n",
+ " 'caption': 'This is an image of Sonic the Hedgehog, a popular video game character known for his speed and agility. He is shown here in his classic form, with his trademark blue spikes and red shoes. The caption reads, \"\\'Lose rings fall into water die\\'\\' which is a reference to the game Sonic the Hedgehog, in which players must collect rings while avoiding obstacles and enemies. The Sonic the Hedgehog video game series was first released in 1991 for the Sega Genesis console and has since become a beloved franchise among gamers.'},\n",
+ " {'image_id': '242',\n",
+ " 'caption': 'This image appears to be a seamless pattern made up of various shades of green, purple, and brown. The colors are swirled together to create a flowing, organic design that could be used as a background or texture for a variety of projects. The overall effect is one of sophistication and elegance.'},\n",
+ " {'image_id': '243',\n",
+ " 'caption': 'This is an image of a man and a child sitting on a bench in front of a hut. The man is holding a bowl of food and the child is looking up at him. The image is in black and white.\\n\\nThe man is wearing a hat and a shirt, and the child is wearing a shirt and shorts. The hut in the background is made of wood and has a thatched roof. There is a small fence in front of the hut, and some trees can be seen in the background. The overall mood of the image is peaceful and serene.'},\n",
+ " {'image_id': '245',\n",
+ " 'caption': \"This image shows a man and a woman walking down the street holding hands. They are both wearing casual clothing, with the man wearing a white shirt and jeans, and the woman wearing a blue shirt and white pants. They are both smiling and looking at each other as they walk down the sidewalk. There is a building in the background with a large arched doorway, and there are trees and other buildings visible in the distance. The lighting in the image is natural, with the sun shining down from the right and casting shadows on the couple's faces. The overall mood of the image is happy and romantic, as the couple appears to be enjoying their time together.\"},\n",
+ " {'image_id': '246',\n",
+ " 'caption': 'This image shows a street scene in the French Quarter of New Orleans, Louisiana. The tall and ornate buildings on either side of the street have balconies and wrought iron railings. The street is lined with parked cars, and people are walking on the sidewalk. In the distance, a trolley car travels down the street. The sky is clear and blue without any clouds.'},\n",
+ " {'image_id': '248',\n",
+ " 'caption': 'The image depicts two hands holding a cat and a dog in the shape of a heart. The background is a red circle with the words \"love\" written in the center.\\n\\nThe image is a representation of the love between a cat and a dog. The cat and dog are holding hands in the shape of a heart, symbolizing their love and affection for each other. The background is a red circle with the word \"love\" written in the center, further emphasizing the theme of love and affection.'},\n",
+ " {'image_id': '250',\n",
+ " 'caption': 'The dress is a silver metallic dress with a plunging neckline and a thigh high slit. It has a fitted bodice and a flared skirt, made of shiny, reflective fabric that catches the eye. This modern statement piece is perfect for a night out or special occasion.'},\n",
+ " {'image_id': '251',\n",
+ " 'caption': \"The image shows a person sitting at a desk with a laptop open in front of them. They are wearing a suit and have a pen in their hand, which they are using to write on a piece of paper on the desk in front of them. Behind them, there is a window with a view of a city skyline.\\n\\nThe image is a representation of a business person working on their laptop, possibly analyzing data or working on a report. The graph on the laptop screen could be a representation of the company's financial performance, or any other type of data that the person is analyzing.\\n\\nOverall, the image is a representation of a business person working on their laptop, possibly analyzing data or working on a report. The graph on the laptop screen could be a representation of the company's financial performance, or any other type of data that the person is analyzing.\"},\n",
+ " {'image_id': '254',\n",
+ " 'caption': 'The image shows a row of red scooters parked on the side of a narrow street with buildings on either side. The scooters are lined up in a row, with some parked on the sidewalk and others parked on the street. The buildings on either side of the street are old and made of brick, with balconies and shutters on the windows. The street is lined with cobblestones, and there are no cars or other vehicles in sight. The sky is clear and blue, with a few fluffy clouds in the distance.'},\n",
+ " {'image_id': '257',\n",
+ " 'caption': 'This image shows a red lifted truck with large tires parked in a grassy area. The truck has a lifted suspension and large tires on both the front and rear axles. Additionally, the front bumper is lifted and has a winch mounted on it. The truck seems to be in good condition and is prepared for off-road use.'},\n",
+ " {'image_id': '260',\n",
+ " 'caption': 'This is a photograph of a rocky shoreline on a cloudy day. The waves are crashing against the shore and there is a small amount of foam on the water. The sky is overcast with clouds in the background. There are no people or other objects in the image.'},\n",
+ " {'image_id': '261',\n",
+ " 'caption': 'The image shows a map of the world with a golden airplane on top of it. The map displays the continents and countries of the world.'},\n",
+ " {'image_id': '262',\n",
+ " 'caption': \"This image shows a green frog sitting on a leaf in a puddle of water. The frog's body is translucent, allowing the viewer to see its internal organs, including its heart and lungs. The frog's eyes are large and black, and its legs are long and thin. The leaf has some water droplets on it, and the background is a blurred image of what appears to be a window or glass pane. The overall effect of the image is one of mystery and intrigue.\"},\n",
+ " {'image_id': '263',\n",
+ " 'caption': 'crumbling and destroyed, giving the impression that the city is under attack. The color scheme of the image is mostly black and grey, with hints of orange and red from the flames.\\n\\nOverall, the poster suggests that the movie or TV show is an action-packed, intense, and suspenseful adventure featuring the iconic character of Batman.'},\n",
+ " {'image_id': '265',\n",
+ " 'caption': 'The image shows a woman wearing a black and gold patterned midi skirt. The skirt has an asymmetrical hem and is worn with a pair of black ankle boots. The woman is also wearing a black top with long sleeves and a gold necklace with a pendant. The overall look is elegant and chic, perfect for a night out or a special occasion.'},\n",
+ " {'image_id': '266',\n",
+ " 'caption': 'The entrance to a brick hotel is shown in the image, with \"federal hotel\" written on the front of the building in large letters. The upper floors have several windows, and a large sign hangs above the entrance saying \"federal hotel\" in bold letters. The building is brightly lit at night.'},\n",
+ " {'image_id': '268',\n",
+ " 'caption': 'This image shows two scientists working in a laboratory. They are both wearing white lab coats and gloves, and one of them is holding a microscope while the other is holding a test tube with a liquid in it. The background of the image is a white, sterile laboratory with various equipment such as beakers, flasks, and test tubes. There is also a large window in the background that lets in natural light. The overall atmosphere of the image is professional and scientific.'},\n",
+ " {'image_id': '269',\n",
+ " 'caption': 'The image shows a circuit board with various electronic components attached to it, including capacitors, resistors, diodes, and transistors. The circuit board is connected to a power source, which is indicated by the positive and negative terminals on the left and right sides of the image. There is also a microcontroller on the circuit board, which is responsible for processing and controlling the various electronic components.\\n\\nThe image shows a circuit board with various electronic components attached to it, including capacitors, resistors, diodes, and transistors. The circuit board is connected to a power source, which is indicated by the positive and negative terminals on the left and right sides of the image. There is also a microcontroller on the circuit board, which is responsible for processing and controlling the various electronic components.'},\n",
+ " {'image_id': '270',\n",
+ " 'caption': 'The image shows a variety of ingredients for cooking, including rice, noodles, sauces, spices, a bottle of soy sauce, and a package of ramen noodles.'},\n",
+ " {'image_id': '271',\n",
+ " 'caption': \"This image shows a close-up view of a tiger's face. The tiger appears to be staring directly at the camera with a serious expression. The tiger's fur is dark brown and its eyes are yellow with black pupils. Its whiskers are long and bushy. The tiger's nose is black, and its mouth is open slightly, revealing its sharp teeth. The tiger's ears are perked up, and its head is tilted slightly to the side. The background is black and there is no other visible detail in the image.\"},\n",
+ " {'image_id': '272',\n",
+ " 'caption': 'cartoon astronaut waving in a welcoming gesture, wearing a helmet and a spacesuit, with a background gradient of blue and purple and stars and planets visible in the distance.\"'},\n",
+ " {'image_id': '276',\n",
+ " 'caption': 'The image shows a group of three people standing in front of a stone wall. They are all wearing long robes and holding swords. The person on the left is Harry Potter, the person in the middle is Hermione Granger, and the person on the right is Ron Weasley. They are all looking at the camera with serious expressions.\\n\\nIn the background, there is a castle with tall towers and a large gate. The castle is surrounded by a moat and there is a bridge leading to the gate. The sky is cloudy and there is a rainbow in the distance.\\n\\nOverall, the image depicts a scene from the Harry Potter series, with the main characters standing in front of Hogwarts School of Witchcraft and Wizardry.'},\n",
+ " {'image_id': '278',\n",
+ " 'caption': 'The image shows two people hugging One person is wearing a black leather jacket and the other is wearing a red sweater. They are both smiling and embracing each other in a hug.'},\n",
+ " {'image_id': '279',\n",
+ " 'caption': 'This image is a collage made up of different pieces of paper. The center of the collage is a picture of Abraham Lincoln, the 16th President of the United States, depicted in a suit and tie with a serious expression on his face. The background of the collage is made up of various papers with different colors, patterns, words, images, and designs. The overall effect is one of complexity and diversity with different elements coming together to create a cohesive whole.'},\n",
+ " {'image_id': '280',\n",
+ " 'caption': 'This image appears to be a retro distressed sticker of a cartoon bottle of whiskey. The bottle appears to be made of glass and has a cork in the top. There is a label on the front of the bottle that reads \"Whiskey\" in bold white letters. The background of the sticker is a distressed texture, giving it a vintage look. The overall design of the sticker is simple and straightforward, making it suitable for use in various applications such as logos, branding, and web design.'},\n",
+ " {'image_id': '282',\n",
+ " 'caption': \"The image shows a woman standing next to a car, smiling and holding a car key in her hand. She is wearing a business suit and has her hair styled in a bun. The car behind her is a sleek, black sedan with tinted windows and a shiny finish. The dealership's showroom is visible in the background, with several other cars parked in the lot.\\n\\nThe woman is likely a salesperson at the dealership, and the car she is standing next to is likely a new model that she is showing to a potential customer. The key she is holding is likely the key to the car, which she is offering to the customer as part of the sales pitch. The image suggests that the dealership is modern and well-equipped, with a clean and well-lit showroom and a variety of cars to choose from.\"},\n",
+ " {'image_id': '283',\n",
+ " 'caption': \"This image is a black and white photograph of a person's face with a serious expression, looking directly at the camera. The details of the person's face are clear and visible, with short, dark hair and a well-defined jawline. The well-composed image has no distracting elements in the background, creating an overall effect of simplicity and clarity.\"},\n",
+ " {'image_id': '284',\n",
+ " 'caption': 'This image depicts a wooden mannequin holding a large, grey hat with a red ribbon tied around the brim. The mannequin is wearing a white shirt and black pants. There is a white background behind the mannequin.\\n\\nDescription:\\n\\n* The mannequin is holding a large, grey hat with a red ribbon tied around the brim.\\n* The mannequin is wearing a white shirt and black pants.\\n* There is a white background behind the mannequin.\\n* The mannequin is made of wood.\\n* The hat is made of grey paper.\\n* The red ribbon is tied around the brim of the hat.'},\n",
+ " {'image_id': '286',\n",
+ " 'caption': 'This image shows a coastal area with sandy beaches and dunes, as well as a river flowing into the ocean. There are also some rocks and cliffs visible in the background. The water appears to be a deep blue color, and there are some waves crashing against the shore. The sand on the beach is a light brown color, and there are some small plants growing in the dunes. The sky appears to be cloudy, with some dark clouds in the distance. Overall, this is a beautiful image of a natural coastal area.'},\n",
+ " {'image_id': '287',\n",
+ " 'caption': 'This image depicts a patio area enclosed by a wrought iron fence.'},\n",
+ " {'image_id': '288',\n",
+ " 'caption': 'The image shows a white flower in full bloom, with a long curved petal and a thin straight stem attached to the base of the flower. The background is light green.'},\n",
+ " {'image_id': '290',\n",
+ " 'caption': \"The image shows a brightly colored building with blue walls and a green door. The sign above the door reads `poke bar'in white letters. There are several potted plants on the sidewalk in front of the building.\"},\n",
+ " {'image_id': '291',\n",
+ " 'caption': 'The image shows a man sitting on a chair with his legs crossed wearing traditional Indian attire. He has a serious expression on his face and is looking at something in the distance. Behind him is a wall with graffiti, and in the background, there is a city skyline with the sun setting behind it. Two dogs sit next to the man, one on each side. It appears to be a still from a movie or TV show set in an urban environment, possibly from a Bollywood film.'},\n",
+ " {'image_id': '292',\n",
+ " 'caption': 'The image depicts a green field with a white house on a hill overlooking the ocean. The house is small with a red roof and chimney, surrounded by green grass and trees in the foreground. The clear blue sky has some clouds and the ocean is visible in the background.'},\n",
+ " {'image_id': '293',\n",
+ " 'caption': 'This image shows a man playing a guitar in a recording studio. He is seated in a chair, wearing headphones, with a microphone in front of him and a recording device on a stand nearby. Other instruments and equipment are visible in the background.'},\n",
+ " {'image_id': '294',\n",
+ " 'caption': 'The image shows a basketball player wearing a white jersey with the number 23 on the back and black shorts, taking a shot at the hoop during a game. He is holding the ball in his right hand, ready to shoot. The hoop and backboard are in the background, with a crowd of people watching from the stands.'},\n",
+ " {'image_id': '296',\n",
+ " 'caption': 'The image is a black leather wallet with a clear plastic window on the front. The wallet has two compartments, one for cash and the other for cards. The wallet is made of high quality leather and has a sturdy zipper closure. The wallet also has a small pocket on the back for storing receipts or other small items. The wallet has a sleek and modern design, making it a great accessory for both casual and formal outfits.'},\n",
+ " {'image_id': '297',\n",
+ " 'caption': 'This image shows a small stream flowing through a wooded area. The trees on either side of the stream are in full autumn color, with orange, yellow, and red leaves. The sky above is cloudy, with dark clouds gathering in the distance. The overall mood of the image is peaceful and serene, with the sound of the stream providing a calming background.'},\n",
+ " {'image_id': '298',\n",
+ " 'caption': 'The image shows a soccer player in a red and blue jersey running with the ball at his feet. He is wearing white shorts and black cleats. The field behind him is green and there are spectators in the stands watching the game. The sky is blue and there are clouds in the background.'},\n",
+ " {'image_id': '300',\n",
+ " 'caption': 'The image shows a double rainbow over a campsite with several RVs parked in the foreground. The sky is clear and blue, with the rainbow stretching across it. The rainbow has a bright, vibrant color scheme, with shades of pink, orange, yellow, green, and blue. The RVs are parked in a row, with their awnings open and their doors closed. The grass in the foreground is green and lush, and there are some trees in the background. The overall mood of the image is peaceful and serene, with the rainbow adding a touch of magic and wonder to the scene.'},\n",
+ " {'image_id': '301',\n",
+ " 'caption': 'The image shows two kayaks on a sandy beach with palm trees in the background. The kayaks appear to be made of orange plastic and have paddles attached to them. There is a small building in the background that appears to be a beach hut or kiosk. The sky is blue and there are some clouds in the distance.'},\n",
+ " {'image_id': '302',\n",
+ " 'caption': 'This is a black and white photograph of a man in a suit and tie. He has a beard and is wearing a bow tie. He is looking directly at the camera with a serious expression on his face. The background is not visible in this image.'},\n",
+ " {'image_id': '304',\n",
+ " 'caption': 'This image shows a person wearing a backpack and standing in a park with a view of a city skyline in the background. The person appears to be looking down at something on the ground in front of them. There are several other people in the background, walking or sitting on the grass. The sky is cloudy and there are some trees in the foreground.'},\n",
+ " {'image_id': '305',\n",
+ " 'caption': 'The image shows a silhouette of a person standing on a tree stump in the middle of a forest. The person is wearing a headdress made of feathers and has their arms outstretched. The sun is setting in the background, casting a warm orange glow over the scene. The trees in the background are tall and thin, with branches that stretch up towards the sky. There are no other visible objects in the image.'},\n",
+ " {'image_id': '306',\n",
+ " 'caption': \"The image shows a full moon shining over a rocky coastline with a lighthouse in the distance. The sky is dark, but the moon's light illuminates the landscape. The water is calm and still, reflecting the moon's glow. The lighthouse is a tall, white structure with a red roof and a beacon on top. It stands on a rocky outcropping near the water's edge. In the foreground, there are several large rocks jutting out of the water, with smaller rocks and pebbles scattered around them. The overall mood of the image is peaceful and serene, with the moon casting a magical glow over the scene.\"},\n",
+ " {'image_id': '309',\n",
+ " 'caption': 'The image depicts a concrete and steel hydroelectric dam with turbines on top. Water flows over the dam creating a visible mist. On one side of the dam, there is a road and some trees. The sky is clear and blue.'},\n",
+ " {'image_id': '310',\n",
+ " 'caption': 'This image is a logo for a baby care company. It features a cute cartoon bee sitting on a flower with the words \"baby care\" written underneath it. The bee has a smiling face and is surrounded by colorful flowers and leaves. The design is simple and cute, making it suitable for use on a wide range of products.\\n\\nThe colors used in this logo are pastel shades of pink, yellow, and green. The font used for the text is playful and childlike, which fits well with the overall theme of the logo. This logo is a great representation of a baby care company and would be well suited for use on a wide range of products.'},\n",
+ " {'image_id': '312',\n",
+ " 'caption': 'This image shows a woman crossing the street at a crosswalk. She is wearing a black shirt, black pants, and black shoes. There is a yellow caution tape on the ground in front of her, indicating that there is a hole in the street. The woman is looking down at the ground as she walks, and there are cars and buildings in the background.'},\n",
+ " {'image_id': '315',\n",
+ " 'caption': 'The image shows a bedroom with a large white bed and two lamps on either side. The walls are painted light green and have floral wallpaper. The bed is made with a white comforter and pillows, and there is a white nightstand on each side. The room is well lit with two lamps and a large window.'},\n",
+ " {'image_id': '316',\n",
+ " 'caption': \"This is an image of a green leaf with a map of the world imprinted on it. The map is made up of different shades of green, with the continents and oceans depicted in different shades of the color. The leaf has a rough texture and appears to be from a plant that is native to a tropical or subtropical region. The image is a representation of the interconnectedness of the world's ecosystems and the importance of preserving the natural environment.\"},\n",
+ " {'image_id': '317',\n",
+ " 'caption': 'This image shows a woman wearing a white mask and pointing at something with her right hand. The woman is wearing a black top and brown pants, and her hair is pulled back into a ponytail. The background is black.'},\n",
+ " {'image_id': '318',\n",
+ " 'caption': 'The image shows a man sitting at a desk with a computer in front of him. He is wearing glasses and has a serious expression on his face. There is a lamp on the desk next to him, and a green background behind him. The image is in a flat, cartoon style.\\n\\nThe man is sitting at a desk with a computer in front of him. He is wearing glasses and has a serious expression on his face. There is a lamp on the desk next to him, and a green background behind him. The image is in a flat, cartoon style.'},\n",
+ " {'image_id': '320',\n",
+ " 'caption': 'This image shows a group of people, including a man, woman and two children, standing in front of a body of water. The man is wearing a black shirt, black pants, and black shoes. The woman is wearing a white shirt, black pants, and black shoes. The children are both wearing black shirts, black pants, and black shoes. They are all smiling and posing for the camera with a clear blue sky and mountains in the background.'},\n",
+ " {'image_id': '322',\n",
+ " 'caption': 'This is an image of a metal badge with a white background and a black star in the center. It is made of metal and has a shiny finish. There is no text or writing on the badge.'},\n",
+ " {'image_id': '323',\n",
+ " 'caption': 'This is a math talk mat with the number 10 written on it. There are 10 circles on the mat, each with a number from 1 to 10. The mat is surrounded by red and blue dots.'},\n",
+ " {'image_id': '324',\n",
+ " 'caption': 'branches and leaves depicted in a unique and artistic manner.'},\n",
+ " {'image_id': '325',\n",
+ " 'caption': 'The image shows three cans of olive oil, each with a different label. The first can is labeled \"Classic Olive Oil,\" the second can is labeled \"Premium Olive Oil,\" and the third can is labeled \"Organic Olive Oil.\" All three cans have a green cap on top and are sitting on a green surface.'},\n",
+ " {'image_id': '326',\n",
+ " 'caption': \"This image shows a butterfly resting on a flower in a field. The butterfly is a yellow and black tiger swallowtail, and it is perched on a pink flower with green leaves. The butterfly's wings are spread out and its antennae are extended. The background of the image is a grassy field with some trees in the distance. There are also some pink flowers and green leaves in the foreground. The lighting in the image is natural, with the sun shining down from the right and casting shadows on the butterfly's wings. The overall mood of the image is peaceful and serene, with the butterfly resting calmly on the flower.\"},\n",
+ " {'image_id': '327',\n",
+ " 'caption': \"The image shows a woman standing on \\u200b\\n\\nThe image shows a woman standing on a frozen lake with her dog. The woman is wearing a pink jacket, white pants, and red boots. She is holding the dog's leash in her left hand and petting the dog with her right hand. The dog is a husky breed and is wearing a collar with a tag on it. The lake is covered in ice and there are some trees visible in the background. The sky is cloudy and it appears to be cold weather.\"},\n",
+ " {'image_id': '328',\n",
+ " 'caption': \"The image displays a map of the German Empire in 1871, which was formed after the unification of various German states. It depicts the different regions of the empire, including Prussia, Bavaria, Saxony, and Württemberg. Kaiser Wilhelm I was crowned in 1871 as the ruler of the empire. It was dissolved in 1918 following Germany's defeat in World War I.\"},\n",
+ " {'image_id': '329',\n",
+ " 'caption': 'This image is of a young woman on a green surfboard in the ocean. She is wearing a black bikini and has long, curly brown hair. The water is a deep blue with waves in the background. The sun is shining and there are clouds in the sky. The woman is smiling and standing on the surfboard with one foot while holding onto the rope with her hand. The water is up to her waist. The image is clear and well lit.'},\n",
+ " {'image_id': '330',\n",
+ " 'caption': 'This image shows a dining room with a long wooden table and several chairs. The walls are lined with shelves filled with plates, bowls, and other kitchenware. The table is set for a meal, with plates, silverware, and glasses arranged on it. The room is well lit, with natural light coming in from large windows on one wall. The floor is made of wooden planks, and there is a rug in front of the table. The atmosphere of the room is cozy and welcoming.'},\n",
+ " {'image_id': '332',\n",
+ " 'caption': \"The man in the image is wearing a black leather jacket, black pants, and white sneakers. He is standing in front of a large black and white image on the wall behind him. The image appears to be a close up of a person's face. The man is smiling and his arms are crossed in front of him.\\n\\nThe image appears to have been taken in a dimly lit room, as the man's face is partially obscured by shadows. The wall behind him appears to be made of concrete or some other rough material. There are no other objects or people visible in the image.\"},\n",
+ " {'image_id': '333',\n",
+ " 'caption': 'This image depicts a skull wearing a traditional Mexican sombrero hat with flowers around it. The hat has a wide brim and can be made of straw or felt. The skull is also wearing sunglasses, perched on the brim of the hat.'},\n",
+ " {'image_id': '334',\n",
+ " 'caption': \"The image shows a farmer driving a tractor through a field of crops. The tractor is pulling a plow to till the soil for planting. The farmer is wearing a hat and a short-sleeved shirt, holding onto the steering wheel with both hands. Rows of crops that have already been plowed are visible in the background. The sky is clear and blue, and there are no other buildings or structures in the image.\\n\\nReasons for population growth:\\n\\n* Farming methods have increased the world's food supply\\n* Advances in transportation and communication have made travel and communication easier\\n* Medical advances have improved health and increased life expectancy\\n* Increased access to education has led to higher literacy and better job opportunities\"},\n",
+ " {'image_id': '335',\n",
+ " 'caption': 'The image shows a group of people rafting down a river in the middle of a canyon. The canyon walls are steep and rocky, and there are mountains in the background. The water is clear and the sky is blue. The people in the raft are wearing life jackets and helmets, and they appear to be enjoying the ride. There are no other boats or people visible in the image.'},\n",
+ " {'image_id': '336',\n",
+ " 'caption': 'This image shows a large, modern lobby with high ceilings and large windows letting in plenty of natural light. The floor is made of white and black tiles, while the walls are painted in a light shade of grey. The ceiling is made of wooden beams, with recessed lighting fixtures hanging from them. The furniture in the lobby consists of a large, round coffee table in the center, surrounded by several chairs and a sofa. There is also a reception desk in the corner of the room. The overall design of the lobby is sleek and modern, with clean lines and minimal decor.'},\n",
+ " {'image_id': '337',\n",
+ " 'caption': 'There are two beds in the room, one double and one single, both with black leather headboards and white linens. The room has white walls and a white tiled floor. In the corner of the room, there is a black leather couch and a black leather chair. A large window lets in plenty of natural light, and in front of the window is a black leather ottoman.'},\n",
+ " {'image_id': '340',\n",
+ " 'caption': 'This is a photograph of a city square in Prague, Czech Republic. There are several buildings in the background, including a large church with a tall steeple. People are walking around the square, and there is a statue in the center. The sky is cloudy and overcast.'},\n",
+ " {'image_id': '342',\n",
+ " 'caption': 'This image is a still life painting of a bowl of sliced lemons arranged in a symmetrical pattern. The background is white, and there are no other objects depicted. The composition is balanced with the lemons in the foreground and mostly white background. Natural lighting creates shadows on the table and lemons. The colors used are mostly shades of yellow, with the lemons being bright and vibrant. The overall impression is that of a well-executed still life painting.'},\n",
+ " {'image_id': '344',\n",
+ " 'caption': 'The image shows a small, white house with a blue light shining from the top of the roof, surrounded by trees and a dark sky in the background. In the foreground, a person wearing a black hoodie and pants stands on the grass, looking up at the light. The overall mood of the image is eerie and mysterious, with the blue light casting an otherworldly glow on the house and person. The dark sky and trees in the background add to the sense of isolation and uncertainty.'},\n",
+ " {'image_id': '348',\n",
+ " 'caption': 'This image depicts a large, ornate door with intricate carvings on it. The carvings appear to depict various figures, including angels, demons, and other mythical creatures. The door appears to be made of stone or some other type of material that has been carved to create the intricate designs. The carvings are very detailed, with attention paid to the facial expressions and body language of each figure. Some of the figures appear to be in motion, as if they are about to come to life and step out of the door.'},\n",
+ " {'image_id': '349',\n",
+ " 'caption': 'a suspenseful and thrilling horror story that will keep the audience on the edge of their seats. The poster features a woman standing in front of a dark, ominous forest. She is wearing a long, flowing dress and has her back turned to the viewer. Her hair is long and flowing down her back, and her face is obscured by shadows. The forest behind her is full of twisted, gnarled trees that seem to be reaching out towards her. The overall tone of the poster is ominous and foreboding, suggesting that the film will be a scary and suspenseful horror story.'},\n",
+ " {'image_id': '350',\n",
+ " 'caption': 'The woman in the image is wearing a denim shirt and jeans. She has long, curly hair and is posing with her hands on her hips. The shirt is a light blue color and has a button-down collar. The jeans are a dark blue color and have frayed hems. The woman is wearing a pair of silver hoop earrings and a necklace with a small pendant. She has a tattoo of a flower on her left wrist. The background is a light blue color.'},\n",
+ " {'image_id': '351',\n",
+ " 'caption': 'The image shows a beautiful autumn scene with colorful leaves on the trees and on the ground. The trees are in the foreground, with their branches and leaves in various shades of orange, yellow, and red. In the background, there is a winding path that leads through the woods. The path is lined with colorful foliage, and there are glimpses of other trees and shrubs in the distance. The overall effect is one of warmth and coziness, as if the viewer is standing in the midst of a beautiful fall forest.'},\n",
+ " {'image_id': '352',\n",
+ " 'caption': \"buttigieg president ' are written in white letters on a blue background. The hoodie is made of light grey material with a hood that can be pulled over the head.\"},\n",
+ " {'image_id': '353',\n",
+ " 'caption': 'This mug has the words \"Home is where the dog is\" written in black on a white background with pink triangles.'},\n",
+ " {'image_id': '355',\n",
+ " 'caption': 'The image shows a plate with bread and soup on a checkered tablecloth on a wooden table with beer glasses. The background is a brick wall with hanging plants. The atmosphere is cozy.'},\n",
+ " {'image_id': '356',\n",
+ " 'caption': 'decorative pattern on top of the cheesecake. The cheesecake appears to have a smooth and creamy texture, and the graham cracker crust is visible on the edges. The plate the cheesecake is on is pink in color and appears to be made of ceramic.\\n\\nThe colors in the image are bright and vibrant, with the strawberries being a deep red color and the cheesecake being a light pink color. The lighting in the image is also well done.\\n\\nOverall, the image is visually appealing and makes the viewer want to try a slice of the strawberry cheesecake.'},\n",
+ " {'image_id': '357',\n",
+ " 'caption': \"This image depicts a black and white silhouette of a man and a woman's head. The man's head is on the left side and facing towards the left, while the woman's head is on the right side and facing towards the right. Both heads are slightly tilted in opposite directions. The man has short, dark hair, while the woman has long, light hair. Both faces are expressionless. This symbolizes the concept of equality between men and women.\"},\n",
+ " {'image_id': '359',\n",
+ " 'caption': 'This is an image of a person swimming in a pool. The person is wearing a blue swimsuit and goggles, and they are holding their arms out in front of them as they swim. The water in the pool is clear and blue, and there are other people in the pool with the person in the image. Some of these people are swimming, while others are lounging on pool floats or sitting on the edge of the pool. In the background, there are several buildings and trees visible through the windows of the pool area.'},\n",
+ " {'image_id': '360',\n",
+ " 'caption': 'The painting depicts a serene scene of a large blue house with white trim and a tall tower on top, surrounded by a white picket fence and a small green lawn. A woman in a flowing dress stands on the porch, looking out at the view, with a contemplative expression. Behind her is a large window that frames a view of the ocean and some boats in the distance, with two pale blue curtains. The bright blue sky with fluffy white clouds adds to the peaceful mood of the image.'},\n",
+ " {'image_id': '361',\n",
+ " 'caption': 'This image shows a man in a black shirt and headphones standing in front of a DJ booth. The man is holding a pair of headphones and appears to be in the process of mixing music. The background is dark, with blue and purple lights illuminating the scene.'},\n",
+ " {'image_id': '362', 'caption': 'The image is a cartoon illustration'},\n",
+ " {'image_id': '363',\n",
+ " 'caption': 'es people to pursue their aspirations. The t-shirt features black color and bold gold lettering in the center of the chest that reads, \"we\\'re gonna do what we can\\'t be done.\" The design is made of a lightweight and thin material with short sleeves and no other visible graphics. The quote is often used in various contexts to inspire individuals to strive for their goals and push themselves beyond their limits.'},\n",
+ " {'image_id': '364',\n",
+ " 'caption': 'The image depicts a family riding bicycles in a park at sunset. The father is riding a bicycle with his son on the back, while the mother is riding a bicycle with her daughter on the back. The family is enjoying the beautiful sunset and the peaceful atmosphere of the park.\\n\\nIn the background, there is a tree with birds perched on its branches. The silhouette of the tree and the birds creates a beautiful contrast against the orange and pink sky of the sunset. The overall mood of the image is peaceful and happy, as the family is spending quality time together in nature.'},\n",
+ " {'image_id': '366',\n",
+ " 'caption': 'The image is of a blue backpack with a zipper on the top and two straps on the sides. It also features a small pocket on the front and a larger pocket on the back.'},\n",
+ " {'image_id': '368',\n",
+ " 'caption': 'This image is a purple background with white text that says \"all the good girls go to hell\". The text is written in a stylized font and appears to be glowing, as if it is made up of light trails. The overall effect of the image is dark and mysterious.'},\n",
+ " {'image_id': '369',\n",
+ " 'caption': 'This is a photo of a white boat with a sign on the side that says \"berkeley\" in black letters. The boat is docked at a pier and there are several people standing on the deck. The sky is cloudy and there are buildings in the background.'},\n",
+ " {'image_id': '370',\n",
+ " 'caption': 'This image shows a house with a red metal roof, surrounded by trees and mountains in the background. The house appears to be made of wood and has large windows on the front and side. There is a gravel driveway leading up to the house, and a small outdoor patio area with a table and chairs to the right of the house. The sun is setting behind the mountains, casting a warm orange glow on the scene.'},\n",
+ " {'image_id': '372',\n",
+ " 'caption': 'This The image shows a group of walnuts arranged in the shape of a heart on a wooden table. There are several walnuts on the left side of the heart, and one walnut on the right side. The walnuts are arranged in a way that they form the shape of a heart. The background is a wooden table with a rough texture.'},\n",
+ " {'image_id': '373',\n",
+ " 'caption': \"This is an image of a white swan swimming alone in a calm river. The swan's neck is stretched out and its beak is open as it swims towards the left side of the image. The water reflects the cloudy sky and some trees on the riverbank, as well as some buildings in the background. The swan's feathers look smooth and glossy in the water. It is a peaceful and serene image.\"},\n",
+ " {'image_id': '375',\n",
+ " 'caption': 'This is an image of a dollar bill with a syringe sitting on top of it. The syringe appears to be filled with a clear liquid and has a needle attached to the end. The dollar bill is lying on top of a white surface, and there is a small puddle of liquid next to the syringe. The background of the image is blurred, but it appears to be a light colored surface.'},\n",
+ " {'image_id': '376',\n",
+ " 'caption': 'This image is a silhouette of a raven, a black bird with a large beak and long legs, standing on the ground with its wings spread out. The raven is a symbol of death and mourning in many cultures and is often depicted as a scavenger, feeding on carrion. It is also known for its intelligence and ability to solve problems. The raven is a bird of the crow family and is found in many parts of the world. It is a common sight in urban areas and is often seen perched on telephone wires or on the ground searching for food. The raven is a symbol of mystery and magic in many cultures and is often associated with the occult and the supernatural. It is also a symbol of transformation and renewal, as it is often seen as a messenger between the physical and spiritual worlds.'},\n",
+ " {'image_id': '377',\n",
+ " 'caption': 'The image shows a circle with a diameter of 8cm. The center of the circle is at the origin (0,0) and the coordinates of the endpoints of the chord are (4,0) and (-4,0). The equation of the chord is x = 4 and y = 0. The equation of the circle is x^2 + y^2 = 16. The length of the chord is 8cm.'},\n",
+ " {'image_id': '378',\n",
+ " 'caption': 'The image shows a white rabbit with pink ears and a pink nose standing on its hind legs. The rabbit has a pink ribbon tied around its neck and is placed on a wooden surface.'},\n",
+ " {'image_id': '379',\n",
+ " 'caption': 'The image shows a man in a red and orange costume standing on a stage in front of a group of people. He is wearing a long, flowing cloak and holding a sword in his right hand. The people in the background are dressed in various costumes and are watching him on stage.'},\n",
+ " {'image_id': '380',\n",
+ " 'caption': 'This image displays a collection of military-related icons including tanks, helicopters, soldiers, and other equipment. The icons are arranged in a grid pattern on a beige background in a simple, flat style. It seems to be a design element for a website or application associated with the defense industry.'},\n",
+ " {'image_id': '382',\n",
+ " 'caption': 'This image shows a group of people standing in a boxing ring. Two men are boxing, one in a red shirt and black pants and the other in a white shirt and black pants, both wearing boxing gloves. There is a referee in a black and white striped shirt and black pants, standing in the corner of the ring with a whistle in his mouth, watching the match closely. Several people are sitting in the stands, wearing different clothing and cheering on the fighters. The room is dimly lit and shadows can be seen on the walls.'},\n",
+ " {'image_id': '383',\n",
+ " 'caption': 'This is an image of the Hulk, a superhero character from the Marvel Comics universe. The Hulk is a large, green figure with bulging muscles and a mean expression on his face. He is wearing green pants, revealing his muscular chest and shoulders. The Hulk is known for his incredible strength and ability to transform into a giant, rampaging monster when he becomes angry.\\n\\nIn the image, the Hulk is shown with his fists clenched and his eyes narrowed in a menacing expression. He appears ready to fight, and his muscles are bulging and tense. This image captures his strength and intensity well.'},\n",
+ " {'image_id': '384',\n",
+ " 'caption': 'This image shows a small room with a blue wall and a white ceiling. There is a red fire extinguisher hanging on the wall and a white cabinet with drawers underneath it. The floor is made of gray tiles and there is a white door on the right side of the room.'},\n",
+ " {'image_id': '385',\n",
+ " 'caption': 'This image shows a bedroom with yellow walls and wooden beams. There is a large bed with a wooden headboard and white linens. The room has large windows that let in a lot of natural light. There is a chandelier hanging from the ceiling and a small table with a vase of flowers on it. The room appears to be spacious and well lit.'},\n",
+ " {'image_id': '386',\n",
+ " 'caption': \"This is a black and white photograph of a man and woman standing under a tree in the snow. The man is holding the woman's hand and they are both looking at something in the distance. There is a streetlight in the background, casting a warm glow on the couple. The snow is falling heavily, covering the ground and the branches of the tree. The couple is dressed in winter clothing, with the man wearing a hat and scarf and the woman wearing a coat and boots.\"},\n",
+ " {'image_id': '387',\n",
+ " 'caption': 'This image depicts a room with a blue door and a chalkboard on the wall featuring various drawings and sketches of people, animals, and objects. A woman is standing in the doorway, observing the chalkboard. It seems to be an office or workspace.'},\n",
+ " {'image_id': '388',\n",
+ " 'caption': 'The image shows a table with a glass of wine and a bowl of nuts on it. The background is a green field with trees in the distance.'},\n",
+ " {'image_id': '389',\n",
+ " 'caption': \"The image shows a woman in a blue dress carrying a man on her back while wearing skates. They both seem to be enjoying themselves and smiling. The man is holding onto the woman's waist with his arms. The woman has long blonde hair and is wearing a pair of earrings. The man has short brown hair and is wearing a pair of sunglasses.\"},\n",
+ " {'image_id': '390',\n",
+ " 'caption': 'This image appears to be a graphic design with the words \"the earth laughs in flowers\" written in white letters on a gray background. The words are written in a stylized font with a vintage feel, and the graphic design appears to be a sticker or decal that can be placed on various surfaces. The phrase is often attributed to Ralph Waldo Emerson, who used it in his essay \"Nature,\" and it is interpreted as a reminder to appreciate the beauty and resilience of nature and find joy in the simple things in life. The image conveys a message of environmentalism and appreciation for the natural world.'},\n",
+ " {'image_id': '391',\n",
+ " 'caption': 'There are several people standing in the middle of a large, empty room. They are standing in a circle and facing each other. Several blue chairs are arranged in a circle around them. The walls are white and there are large windows on one side of the room. The floor is made of concrete. There are no other objects in the room.'},\n",
+ " {'image_id': '392',\n",
+ " 'caption': 'There are several potted plants on the shelf, including a spider plant, a snake plant, and a peace lily. There are also several books stacked on the shelf, including one with the title \"The Secret Life of Plants\" and another with the title \"How to Keep Your Houseplants Alive.\" There is a mug of coffee on the shelf as well.'},\n",
+ " {'image_id': '394',\n",
+ " 'caption': 'This is a diagram of a building with several floors, including the ground floor, first floor, second floor, third floor, and fourth floor. Each floor has several rooms with different functions, such as a kitchen, living room, bedroom, bathroom, and office. The building also has a rooftop terrace with a view of the city.'},\n",
+ " {'image_id': '395',\n",
+ " 'caption': 'The theme of this image is success and accomplishment.'},\n",
+ " {'image_id': '396',\n",
+ " 'caption': 'This painting depicts a group of people gathered around a table, enjoying a meal together. The colors used in the painting are vibrant and add to the joyful atmosphere of the scene. The artist has paid attention to detail, as seen in the intricate patterns on the tablecloth and the expressions on the faces of the people in the painting. The use of light and shadow creates a sense of depth and makes the painting come alive. Overall, it is a beautiful piece of art that captures a moment of happiness and togetherness.'},\n",
+ " {'image_id': '397',\n",
+ " 'caption': 'The image shows a table with a white tablecloth and two chairs. On the table, there is a photo of a woman in a wedding dress and a man in a tuxedo, as well as a bouquet of flowers. The wall behind the table is decorated with pictures of the bride and groom.'},\n",
+ " {'image_id': '398',\n",
+ " 'caption': 'The image shows a group of three people, two women and one man, standing together in a hallway. They are all wearing backpacks and smiling at each other. The woman on the left is wearing a white shirt and black pants, while the woman on the right is wearing a black shirt and white pants. The man in the middle is wearing a white shirt and black pants. They are all standing in a row, with the woman on the left standing closest to the camera and the man in the middle standing closest to the wall. The sun is shining through the windows behind them, casting a warm light on their faces.'},\n",
+ " {'image_id': '402',\n",
+ " 'caption': 'This image shows a close up view of a pork roast cooking in a slow cooker that is made of black plastic and has a removable lid. The pork roast has been seasoned with herbs and spices and is browning on the outside while cooking to perfection on the inside. The slow cooker is placed on a white countertop with other kitchen appliances visible in the background.'},\n",
+ " {'image_id': '403',\n",
+ " 'caption': 'The image shows a hotel room with a large bed, a desk, a chair, and a television. The walls are painted white and there is a large window with wooden blinds. The carpet is a light brown color and there are two lamps on either side of the bed. The room appears to be well lit and spacious.'},\n",
+ " {'image_id': '404',\n",
+ " 'caption': 'There are several rows of seats in the dining car, each with a table in front of them. The seats are upholstered in a light beige color with red and blue accents on the armrests. The tables are made of wood and have white tablecloths on them. There is a large window behind the seats that lets in natural light. The walls are painted in a light gray color and there are several light fixtures hanging from the ceiling.'},\n",
+ " {'image_id': '405',\n",
+ " 'caption': 'life and delicious cuisine, and many tourists visit the island specifically to experience both.\\n\\nThe image depicts a white windmill with a red and white striped body on top of a hill overlooking the ocean. A small building with white walls and a red roof is in the foreground. There are several boats in the water and some buildings on the shore in the background. The sky is clear and blue with a few fluffy clouds. The windmill is an iconic symbol of Mykonos, a Greek island in the Aegean Sea, and was used to grind wheat and other grains into flour. The small building in the foreground may be a taverna, a type of restaurant or cafe that serves traditional Greek food and drinks.'},\n",
+ " {'image_id': '406',\n",
+ " 'caption': 'The image shows a small plane flying over a body of water with clouds in the background. The plane appears to be flying at a low altitude, with its nose pointed down towards the water. The clouds in the background are white and fluffy, with some of them appearing to be shaped like cotton balls. The sky is a bright blue color, with a few small, white clouds scattered throughout it. The water below the plane is a deep blue color, with some small waves visible on its surface. There are no other objects visible in the image besides the plane and the clouds.'},\n",
+ " {'image_id': '408',\n",
+ " 'caption': 'The image shows a group of children in a boat on a river with ducks and geese swimming in the water. The children are smiling and enjoying their time on the river. There is a forest in the background with trees and greenery. The sky is clear and blue with fluffy clouds. The overall mood of the image is happy and carefree.'},\n",
+ " {'image_id': '409',\n",
+ " 'caption': 'This image depicts a set of stairs with colorful book covers on each step, which appear to be from the Harry Potter series by J.K. Rowling.'},\n",
+ " {'image_id': '412',\n",
+ " 'caption': 'The image shows a balcony or terrace with a table and chairs overlooking a lush green valley. There are trees and mountains in the background, and the sky appears to be clear and blue. The furniture on the balcony is made of wood, and there are plants in pots on the table. The overall feeling of the image is peaceful and serene, with a sense of being surrounded by nature.'},\n",
+ " {'image_id': '414',\n",
+ " 'caption': 'The image shows the entrance to a garden. The gate is made of wrought iron and decorated with vines and flowers. The garden walls are made of stone and covered in ivy. The path leading to the gate is made of cobblestones and lined with trees.'},\n",
+ " {'image_id': '415',\n",
+ " 'caption': 'The red brick building with large windows and a green lawn in front of it appears to be a well-maintained residential or office complex surrounded by trees. The building has several floors with apartments or offices on each floor, and the windows are made of glass, allowing natural light to enter the building. The sidewalk in front of the building is wide and has several benches, while the cars parked on the street are mostly sedans and SUVs.'},\n",
+ " {'image_id': '417',\n",
+ " 'caption': 'The image shows a dirt road lined with colorful trees on either side. The trees have bright red, orange, and yellow leaves. The sky is a bright blue with fluffy white clouds. There is a wooden fence on the left side of the road and a grassy area on the right. The road appears to be winding and going through a rural area.'},\n",
+ " {'image_id': '419',\n",
+ " 'caption': \"The image shows a table with different items on it, such as a book, a cup of coffee, a vinyl record, and a wooden keychain with the word 'person' engraved on it. The background is a wooden surface with scratches and imperfections.\"},\n",
+ " {'image_id': '421',\n",
+ " 'caption': \"This image shows a person's hand wearing purple gloves holding up a piece of metal that has been cut into the shape of a rectangle. The metal has a shiny surface and appears to be made of a thin sheet of metal. The person's hand is positioned in front of a green background.\"},\n",
+ " {'image_id': '424',\n",
+ " 'caption': 'The image shows a man sitting on the balcony of a building overlooking the ocean. The man is wearing a black t'},\n",
+ " {'image_id': '425',\n",
+ " 'caption': 'The image shows a person riding a mountain bike down a dirt trail in the middle of a lush green forest. The person is wearing a yellow shirt and black pants, and has a backpack on their back. The trail is narrow and winds through the trees, with dirt and rocks on either side. The sky is clear and blue, with fluffy clouds in the distance. There are no other people or objects in the image.\\n\\nDescription:\\n\\n* The image shows a person riding a mountain bike down a dirt trail in the middle of a lush green forest.\\n* The person is wearing a yellow shirt and black pants, and has a backpack on their back.\\n* The trail is narrow and winds through the trees, with dirt and rocks on either side.\\n* The sky is clear and blue, with fluffy clouds in the distance.\\n* There are no other people or objects in the image.'},\n",
+ " {'image_id': '426',\n",
+ " 'caption': 'This is an image of a hair clipper with a black and silver body and a cord attached to it, plugged into an electrical outlet on the right side of the image. The clipper has a rotating blade on the top that can be adjusted to different lengths. Its handle is made of black plastic and has a comfortable grip, with a switch that controls the power of the clipper.'},\n",
+ " {'image_id': '427',\n",
+ " 'caption': \"The image shows a view of a lake with mountains in the background. There is a wooden bench sitting on the grass next to the water's edge. The sky is clear and there are some clouds in the distance. It looks like a peaceful place to relax and enjoy the scenery.\"},\n",
+ " {'image_id': '429',\n",
+ " 'caption': 'The image shows a man in a tuxedo holding a trophy. The man is smiling and holding the trophy with both hands. He is wearing a black tuxedo with a white shirt and black tie. The trophy is made of metal and has a rectangular shape with rounded edges. It is engraved with the name of the award and the year it was given. The background is a dark color, possibly black, and there are no other objects or people visible in the image.'},\n",
+ " {'image_id': '431',\n",
+ " 'caption': 'The painting depicts a church with a tall steeple, a large stained glass window, and a cemetery with several graves and headstones. The sky is a deep blue with a few white clouds, and the sun is setting behind the church, casting a warm orange glow on the building and the surrounding area.'},\n",
+ " {'image_id': '433',\n",
+ " 'caption': 'This image depicts a robotic hand reaching out to a human hand. The robotic hand is made of metal and plastic, with fingers that can move and grip objects. The human hand is also made of metal and plastic, with fingers that can move and grip objects. Both hands appear to be reaching out to each other, as if they are trying to touch or shake hands. The image is in black and white, with a white background.\\n\\nThe image shows a robotic hand reaching out to a human hand. The robotic hand is made of metal and plastic, with fingers that can move and grip objects. The human hand is also made of metal and plastic, with fingers that can move and grip objects. Both hands appear to be reaching out to each other, as if they are trying to touch or shake hands. The image is in black and white, with a white background.'},\n",
+ " {'image_id': '434',\n",
+ " 'caption': 'This is a living room with white furniture, including a couch, chairs, and a coffee table. The walls are painted in a light beige color, and there is a fireplace in the corner of the room. The flooring is hardwood, and there is a rug on the floor in front of the couch. The windows are large and let in a lot of natural light.'},\n",
+ " {'image_id': '435',\n",
+ " 'caption': 'The image shows a group of people in brightly colored kayaks on a large body of water. The sky is overcast and cloudy, with dark clouds in the background. The people in the kayaks appear to be smiling and enjoying their time on the water. There are several other kayaks in the background, as well as a few people standing on the shore. The water is calm and still, with no waves or ripples visible. The overall mood of the image is one of relaxation and enjoyment.'},\n",
+ " {'image_id': '438',\n",
+ " 'caption': 'The image is a pair of sunglasses. The frame is made of tortoiseshell acetate and has a rectangular shape with rounded edges. The lenses are tinted in a gradient shade of purple. The sunglasses have no visible branding or logos on them.\\n\\nIn terms of style, the sunglasses have a classic, timeless look that would work well with a variety of outfits, from casual to formal. The tortoiseshell acetate frame gives the sunglasses a sophisticated, elegant look, while the purple lenses add a touch of playfulness and personality.\\n\\nOverall, these sunglasses are a great choice for anyone looking for a versatile, stylish pair of sunglasses that can be worn in a variety of settings.'},\n",
+ " {'image_id': '442',\n",
+ " 'caption': 'The woman is wearing a white shirt and blue jeans while holding a straw basket. She stands in front of a stone building with arches and a wooden door on a cobblestone street. Trees are visible in the cloudy background.'},\n",
+ " {'image_id': '443',\n",
+ " 'caption': 'The image shows a group of people standing on top of a snow-covered mountain at sunset. The sun is setting in the background, casting a warm orange glow over the scene. The people in the image are wearing winter clothing and appear to be enjoying the view from the top of the mountain. There are some rocks and boulders in the foreground, as well as some trees in the background. The overall mood of the image is peaceful and serene.'},\n",
+ " {'image_id': '445',\n",
+ " 'caption': 'The image shows a large group of people standing in front of a white building. They are holding signs and looking at the camera. There is a large American flag hanging from the top of the building. The people in the image are diverse in age and ethnicity. They are standing in front of the building, which appears to be the White House.'},\n",
+ " {'image_id': '448',\n",
+ " 'caption': 'This image shows a woman sitting on a large mat in the middle of a desert. The mat is covered in colorful pillows and blankets. The sky is clear with a few clouds in the distance. The woman is wearing a white shirt and black pants, with her hair tied back in a ponytail. No other people or structures are visible.'},\n",
+ " {'image_id': '449',\n",
+ " 'caption': 'The image is a black and white illustration of an old fashioned pickup truck. The truck appears to be vintage and has an open bed with a wooden tailgate. The front of the truck has a large grille and headlights, while the sides have large fenders and tires. The back of the truck has a bumper with a license plate and a pair of taillights. The truck appears to be in good condition, with no major dents or scratches. The tires appear to be in good shape, with no signs of wear or tear. The overall appearance of the truck suggests that it is a classic, vintage vehicle that has been well maintained over the years.'},\n",
+ " {'image_id': '451',\n",
+ " 'caption': 'The image shows a withered red tulip in a clear glass vase on a bright green background.'},\n",
+ " {'image_id': '452',\n",
+ " 'caption': \"This image is a hand-drawn doodle of different objects like food, drinks, and other items. The objects are depicted in a cartoon style with simple lines and basic shapes. The colors used are mostly black and white, with some shading and highlights added to provide depth and dimension. The overall effect is a playful and whimsical design suitable for various purposes like children's books, party decorations, or greeting cards.\"},\n",
+ " {'image_id': '454',\n",
+ " 'caption': 'This image shows a microphone in a recording studio. The microphone has a black body with a silver grille and a silver stand that is adjustable. The background of the image is a green screen mounted on the wall behind the microphone and extends to the top of the image. There are no other objects or people visible.'},\n",
+ " {'image_id': '456',\n",
+ " 'caption': 'The man in the image is wearing a red coat, black pants, and a black hat with a feather. He is holding a sword in one hand and a pistol in the other. He is standing on a rock in the middle of a body of water. The sky is dark and stormy, with lightning flashing in the distance. The man appears to be a pirate, with a long beard and a scruffy appearance.'},\n",
+ " {'image_id': '457',\n",
+ " 'caption': 'The image shows a woman wearing a white wedding dress with spaghetti straps and a plunging neckline. The dress has a lace bodice and a long, flowing skirt. The woman is smiling and holding the train of her dress, which is draped behind her. She is standing in front of a white backdrop.'},\n",
+ " {'image_id': '458',\n",
+ " 'caption': 'The woman in the image is wearing a lilac colored dress with a cowl neck and long sleeves. The dress has a high slit on the side, revealing her legs. She is also wearing a pair of beige suede ankle boots with a small heel. Her hair is styled in loose waves and she has a pair of sunglasses perched on her head. She is holding a small white purse and standing in front of a white door with a wreath on it.'},\n",
+ " {'image_id': '460',\n",
+ " 'caption': \"This image is a red shirt with white lettering that reads, `'all i want for christmas is to sleep in heavenly peace'. The lettering is written in a casual, handwritten style and is centered on the front of the shirt. The shirt appears to be a standard, short-sleeved t-shirt with a crew neckline. It is not clear from the image what material the shirt is made of, but it appears to be a thin, lightweight fabric. The image does not show any other details about the shirt.\"},\n",
+ " {'image_id': '461',\n",
+ " 'caption': 'The image shows a dart hitting a bullseye on a target. There are three darts in the bullseye, with one in the center and two on either side. The darts are colored red, blue, and green. The background is a blue sky with clouds. The image is a flat design, with no shading or texture. It is a simple illustration of a target with darts hitting the bullseye.'},\n",
+ " {'image_id': '463',\n",
+ " 'caption': 'white jersey with a red, white, and blue design on it, along with black shorts and shoes with white soles. The man is wearing a black and white helmet with a red and white stripe, black gloves with white stripes, and a black and white cap with a red and white stripe. He is also wearing black socks with white stripes and black and white sunglasses. The bicycle he is riding has black handlebars and black wheels, and the track he is riding on is made of asphalt with white lines. The background of the image is a blue sky.'},\n",
+ " {'image_id': '464',\n",
+ " 'caption': 'The bike in the image is a full suspension mountain bike with a silver frame and black and blue accents. It has 26 inch knobby tires on both the front and rear. The bike is equipped with a dropper post, which enables the rider to lower the saddle for better control on steep descents.'},\n",
+ " {'image_id': '466',\n",
+ " 'caption': 'This image shows a large crowd of people gathered on a grassy area in front of a row of white tents. In the background, there are several large buildings with balconies and windows. The sky is cloudy and there are a few fluffy clouds. The people in the image are dressed in a variety of clothing, including shorts, t shirts, and hats. Some are carrying drinks and others are chatting with each other. There are also a few vendors selling food and other items.'},\n",
+ " {'image_id': '467',\n",
+ " 'caption': 'The image shows a small garden area with chairs and an umbrella. The grass is well-kept and the area is surrounded by trees and a fence. The sky is clear and blue with a few distant clouds visible.'},\n",
+ " {'image_id': '468',\n",
+ " 'caption': 'There are several items on the table, including a watch, a bracelet, and a pair of earrings. The watch has a black face with white numbers and a gold band. The bracelet is made of black leather and has a gold clasp. The earrings are made of white gold and have small diamonds on them.\\n\\nThe watch has a black face with white numbers and a gold band. The bracelet is made of black leather and has a gold clasp. The earrings are made of white gold and have small diamonds on them.'},\n",
+ " {'image_id': '469',\n",
+ " 'caption': 'The image depicts a wooden shelf with clear glass objects, such as a vase, glass, and bottle, on it. The shelf has a light brown color and is set against a white background.'},\n",
+ " {'image_id': '470',\n",
+ " 'caption': 'This image depicts a cartoon character standing in front of an exit sign in a hospital corridor. The character is wearing a business suit and holding a briefcase with his arms crossed and a look of determination on his face. The exit sign has an arrow pointing to the right, indicating that it leads to the outside of the building. The image could be used to represent the idea of leaving a place or completing a task, symbolizing determination and readiness to move on to the next stage. The exit sign could also symbolize the end of a journey or a difficult situation.'},\n",
+ " {'image_id': '471',\n",
+ " 'caption': 'This image shows a large, luxurious house made of wood and stone, with large windows overlooking the ocean. A swimming pool in the front yard, surrounded by lush greenery and trees. The beach is visible in the distance, with waves crashing against the shore. The sky is clear and blue, with a few fluffy clouds floating in the distance.'},\n",
+ " {'image_id': '472',\n",
+ " 'caption': 'ard is chasing the hyenas, while the hyenas are trying to escape. The environment around them appears to be dry and rocky, with dust and dirt in the air.'},\n",
+ " {'image_id': '474',\n",
+ " 'caption': 'The image shows a table with a white tablecloth, two glasses of red wine, a plate of sliced oranges and lemons, and a small bowl of olives. A painting of a landscape with mountains and a river is hanging on the wall in the background. There is also a small vase of flowers on the table. The image depicts a cozy and inviting atmosphere with a focus on wine and food.'},\n",
+ " {'image_id': '475',\n",
+ " 'caption': 'The image shows a white building with balconies and stairs leading up to the second floor. There is a large palm tree in front of the building, and several potted plants on the ground. The building appears to be in a residential area, with other houses visible in the background.'},\n",
+ " {'image_id': '476',\n",
+ " 'caption': \"This image is a diagram of the earth's energy balance. It shows the incoming energy from the sun (solar radiation) and the outgoing energy from the earth (longwave radiation). The diagram displays the earth's atmosphere, which absorbs and reflects some of the incoming energy. The earth's surface also absorbs and reflects some of the incoming energy.\"},\n",
+ " {'image_id': '477',\n",
+ " 'caption': 'a natural backdrop. The colors are vivid and eye-catching, with the green and black of the butterfly contrasting nicely with the various shades of green in the background. The image is a beautiful representation of nature and the delicate balance of life within it.'},\n",
+ " {'image_id': '478',\n",
+ " 'caption': 'The image shows a woman wearing a white dress with a plunging neckline, standing at a red carpet event. Her hair is styled in loose, wavy waves and she has a smokey eye makeup look with dark eyeliner and brown eyeshadow. She is wearing a pair of silver earrings and a bracelet on her left wrist. She is looking directly at the camera with a serious expression on her face.'},\n",
+ " {'image_id': '479',\n",
+ " 'caption': 'The image is a seamless pattern of blue and gray roller skates on a white background, with black wheels and silver laces. It can be used for various purposes such as wallpaper, upholstery, or clothing design.'},\n",
+ " {'image_id': '481',\n",
+ " 'caption': 'This is a screenshot of a first person shooter video game set in a post-apocalyptic world. The player is standing in front of a ruined building with debris and rubble scattered around. They are holding a gun and aiming at a target in the center of the screen.'},\n",
+ " {'image_id': '482',\n",
+ " 'caption': 'This image shows a young man sitting on the grass in a park. He is wearing a black hoodie.'},\n",
+ " {'image_id': '483',\n",
+ " 'caption': 'This image shows a hotel room with a large bed, a desk, a chair, and a television. The walls are painted white and there is a large window with a view of the city. The bed is made with white linens and there are two pillows on it. The desk has a lamp and a computer on it, and there is a chair in front of it. The television is mounted on the wall and there is a small table next to it with a vase of flowers on it. The room appears to be spacious and comfortable.'},\n",
+ " {'image_id': '484',\n",
+ " 'caption': 'This image depicts a group of men posing in front of a red and white striped tent in a grassy field with trees in the distance. They are all dressed in floral shirts and shorts, and one of them is holding a beer can.'},\n",
+ " {'image_id': '485',\n",
+ " 'caption': 'This image is a seamless pattern of coffee cups with steam coming out of the top. The cups are brown and have a brown steam coming out of the top. The background is white.\\n\\nThis pattern could be used for a variety of purposes, such as wallpaper, fabric, or wrapping paper. It could also be used as a design element in a website or graphic design project.'},\n",
+ " {'image_id': '487',\n",
+ " 'caption': 'This image shows a shelf with several bottles of different types of drinks. The bottles are lined up in a row on the shelf, with some of them stacked on top of each other. The labels on the bottles indicate that they contain various types of drinks, including juices, sodas, and other beverages. The bottles are made of glass and have different colors and designs on them. The shelf appears to be made of wood, and there are some small shelves on the wall behind the bottles. The overall color scheme of the image is warm and inviting, with a mix of natural and artificial lighting.'},\n",
+ " {'image_id': '488',\n",
+ " 'caption': 'The woman in the image is wearing a grey coat, black tights, and knee-high boots. She is carrying a black handbag and standing next to a wooden fence made of wooden planks. There is a row of bushes growing along the side of the fence. The woman is looking down at her black phone with a white screen. The image is taken from a distance, and the woman is standing in front of the fence with her back to the camera.'},\n",
+ " {'image_id': '489',\n",
+ " 'caption': 'This image is a photograph of a white bowl filled with a salad made up of a variety of greens, including romaine lettuce, red radishes, and thin slices of red onion. The bowl is sitting on top of a wooden table with a white tablecloth. There is a fork and knife in the bowl, as well as a sprig of fresh rosemary on the side of the bowl. The overall color scheme of the image is bright and fresh, with the green of the lettuce and radishes contrasting with the white of the bowl and tablecloth.'},\n",
+ " {'image_id': '490',\n",
+ " 'caption': 'The image shows a white convertible car parked in a large, white room. The car has a sleek and modern design, with a long hood and a sloping roofline. The front of the car has a large grille and headlights, and the sides have large wheels and tires. The car appears to be in excellent condition, with no visible scratches or dents.'},\n",
+ " {'image_id': '492',\n",
+ " 'caption': 'The image shows three white bowls filled with different spices, including turmeric, cinnamon, and ginger. There is also a wooden spoon in one of the bowls, which appears to be stirring the mixture. The background is a wooden table with a rustic texture.'},\n",
+ " {'image_id': '494',\n",
+ " 'caption': 'The image shows a young boy sitting on the floor with a pacifier in his mouth. He is wearing a yellow shirt and green overalls. The background is white. The image is a cartoon illustration.'},\n",
+ " {'image_id': '495',\n",
+ " 'caption': \"woman in a military uniform, wearing a long coat, hat, gloves, and belt with a buckle. She is standing with her hands on her hips and looking directly at the camera. Her hair is styled in a bob and she is wearing a pair of sunglasses. The background is not visible, but it appears to be a studio setting. The photograph is of high quality and the details of the woman's uniform and accessories are clearly visible.\"},\n",
+ " {'image_id': '496',\n",
+ " 'caption': 'This is a large, red brick house with a green lawn and palm trees in the front yard. There is a driveway leading up to the house and a garage on the right side. The windows have white shutters, and the roof is tiled with a chimney on the left side. The lawn is well-manicured with a sprinkler system in place, and the street is lined with palm trees and a sidewalk leading up to the house.'},\n",
+ " {'image_id': '497',\n",
+ " 'caption': \"The image depicts a person standing before a large sphinx statue made of stone. The statue is in the shape of a lion with a human head. The person is gazing up at the night sky filled with stars and constellations, including the Big Dipper and Orion's Belt. The person seems to be enjoying the view.\"},\n",
+ " {'image_id': '498',\n",
+ " 'caption': 'This is an image of a woman holding a trophy. She is wearing a blue dress and standing in front of a red and black background. The trophy is made of metal and has an engraved design on it. The woman is smiling and holding the trophy with both hands. There are other people in the background, but they are not visible in this image.'},\n",
+ " {'image_id': '500',\n",
+ " 'caption': \"This image appears to be a page from a book, with a black and white illustration of a cat sitting on a windowsill. The cat is looking out the window and appears to be staring at something outside. The image is in black and white, with the cat's fur and the scenery outside sketched in intricate detail.\"},\n",
+ " {'image_id': '501',\n",
+ " 'caption': 'This is a collage of four different characters from the TV show Arrow. From left to right, the characters are: 1. Oliver Queen, also known as the Arrow, 2. Barry Allen, also known as the Flash, 3. Kara Danvers, also known as Supergirl, and 4. Clark Kent, also known as Superman. These characters are all part of the DC Comics universe and have appeared in various TV shows and movies.'},\n",
+ " {'image_id': '502',\n",
+ " 'caption': 'The image shows a white 2020 Infiniti QX80 driving on a dirt road surrounded by trees. The vehicle has a large grille and headlights, with the Infiniti logo on the front. It has large wheels and tires, and there is a spare tire on the back. The windows are tinted, and there is a sunroof on the top. The vehicle appears to be in good condition, with no visible damage.'},\n",
+ " {'image_id': '503',\n",
+ " 'caption': 'The painting depicts the ruins of an ancient temple on a hill overlooking a lake at sunset. The temple is in a state of disrepair, with crumbling columns and broken pediments. The sky is a vibrant orange and pink at sunset, with clouds in the distance. The water in the lake is calm and reflects the sky. There are no people or animals in the scene.'},\n",
+ " {'image_id': '504',\n",
+ " 'caption': 'This image shows a lion standing on its hind legs with its front paws resting on the ground. The lion has a large mane and is looking to the left. The image is in front of a white background.'},\n",
+ " {'image_id': '506',\n",
+ " 'caption': 'This image shows a group of people on an airplane. They are all sitting in the aisle, with some standing in the back of the plane. They are all looking at the camera, with some smiling and others looking serious. The plane appears to be a small one, with only a few rows of seats. The windows are tinted, making it difficult to see outside. The interior of the plane appears to be clean and well maintained.'},\n",
+ " {'image_id': '507',\n",
+ " 'caption': \"The image is a silhouette of an eagle in flight, with its wings spread out and its talons open, appearing to be soaring through the sky with the sun shining behind it. The eagle's body and wings are outlined in brown against a black and white background.\\n\\nThe eagle is a symbol of strength, courage, and freedom in various cultures around the world and is often used as a symbol of the United States. It is known for its sharp vision to spot prey from high up in the sky and its powerful talons to catch and kill its prey. The eagle is a majestic and powerful bird that is frequently depicted in art and other forms of media.\"},\n",
+ " {'image_id': '508',\n",
+ " 'caption': 'The image shows a night scene with a full moon shining brightly in the sky. In the foreground, there is a silhouette of a tree standing alone on a hill.'},\n",
+ " {'image_id': '509',\n",
+ " 'caption': 'The painting depicts a group of people sitting in the shade of a tree in a park. There is a woman holding a small plant in her hand, sitting on the ground with her back to the viewer. Behind her, a man and another woman are sitting on a bench, with the latter facing the viewer. In the background, there is a row of trees and a path leading into the distance. The colors used in the painting are muted and earthy, with green and brown tones dominating. The figures are depicted realistically, with attention paid to their clothing and facial expressions. The tree in the foreground is detailed, with individual leaves and branches visible. The painting has a peaceful and serene mood, with a well-balanced composition.'},\n",
+ " {'image_id': '510',\n",
+ " 'caption': 'The image shows a large cave with a river flowing through it. The walls of the cave are made up of large boulders and rock formations. The water in the river is crystal clear and there is a small waterfall coming down from the top of the cave. The cave is surrounded by lush greenery and trees. There is a small path leading to the entrance of the cave.'},\n",
+ " {'image_id': '511',\n",
+ " 'caption': \"This image shows a wedding ceremony taking place in a large room with a long aisle lined with chairs. The aisle is decorated with flowers and there are two large windows on either side of the room, letting in natural light. The bride and groom are standing at the end of the aisle, facing each other and holding hands. The bride is wearing a white wedding dress and the groom is wearing a black tuxedo. The guests are seated in the chairs and watching the ceremony. On the wall behind the bride and groom, there is a large banner with the words 'I remember this day forever and always' written on it.\"},\n",
+ " {'image_id': '513',\n",
+ " 'caption': 'This image depicts a man in a suit and glasses standing with his arms outstretched and a big smile on his face. He is wearing a beige suit and brown shoes. The background is white.\\n\\nThe man is wearing glasses and a beige suit. He is standing with his arms outstretched and a big smile on his face. He is wearing a beige suit and brown shoes. The background is white.'},\n",
+ " {'image_id': '514',\n",
+ " 'caption': 'This image shows a blender filled with sliced cucumbers, watermelon, and other ingredients. The blender has a stainless steel blade and a clear plastic container with measurements on the side. The image is on a white background with a pink and white striped towel in the background.'},\n",
+ " {'image_id': '515',\n",
+ " 'caption': 'The image shows a map of the world made up of gold coins. The coins are arranged in the shape of the continents and countries on the map. The image is on a black background.'},\n",
+ " {'image_id': '516',\n",
+ " 'caption': 'The image shows a red silk or satin envelope adorned with gold heart-shaped decorations.'},\n",
+ " {'image_id': '517',\n",
+ " 'caption': 'This is an image of a person wearing a football helmet and holding a football. The text on the image reads, \"Kittle over the middle,\" which is likely a reference to a football play where the quarterback throws the ball to a tight end in the middle of the field. The person in the image is wearing the jersey of the San Francisco 49ers, a professional football team in the National Football League (NFL). The team\\'s colors are red, gold, and white, and their logo is a stylized version of the number 49. The helmet on the person\\'s head is also red and gold, with the team\\'s logo on the side. The football in the person\\'s hand is also red and white, with the team\\'s logo on it.'},\n",
+ " {'image_id': '518',\n",
+ " 'caption': 'The image is a black and white illustration of a bouquet of flowers on a black background. The flowers are drawn in a simple, stylized style, with each petal outlined in white. The bouquet is arranged in a loose, casual style, with the stems and leaves trailing off the sides of the image. The flowers include roses, peonies, and daisies, giving the bouquet a charming rustic look, perfect for country or rustic themed designs or events.'},\n",
+ " {'image_id': '519',\n",
+ " 'caption': 'The image shows a hallway with blue walls, a white door, and a black rubbish bin. The walls are painted in a light shade of blue, and the floor is made of wooden planks. The door has a white frame and a small window at the top. The rubbish bin is made of plastic and has a black lid. There are no other objects in the hallway.\\n\\nThe walls are painted in a light shade of blue.\\n\\nThe floor is made of wooden planks.\\n\\nThe door has a white frame and a small window at the top.\\n\\nThe rubbish bin is made of plastic and has a black lid.\\n\\nThere are no other objects in the hallway.'},\n",
+ " {'image_id': '520',\n",
+ " 'caption': 'The image shows a man wearing a top hat, trench coat, and boots walking down a city street with his back to the camera. He is carrying a large backpack and appears to be looking at something in the distance. There are several buildings in the background, including a tall one with a clock tower on top. The overall tone of the image is dark and mysterious.'},\n",
+ " {'image_id': '521',\n",
+ " 'caption': \"This image shows a sign hanging from the side of a brick building that says `future's closet boutique' in white letters on a black background. The letters are stylized and have a futuristic look. The sign appears to be made of metal or plastic. No other details are visible in the image.\"},\n",
+ " {'image_id': '522',\n",
+ " 'caption': 'The image shows a group of people dressed in formal attire, standing on a red carpet at a movie premiere. The men are wearing black tuxedos and the women are wearing a red dress. They are all smiling and posing for the camera. In the background, there are posters and banners for the movie they are at the premiere for.'},\n",
+ " {'image_id': '523',\n",
+ " 'caption': 'which make up the majority of their diet. Koalas are also known for their slow metabolism and low energy levels, which is why they spend most of their time sleeping in trees. Koalas are native to Australia and are considered a national symbol. However, they are listed as a vulnerable species due to habitat loss and disease. Conservation efforts are underway to protect koalas and their habitats.'},\n",
+ " {'image_id': '524',\n",
+ " 'caption': 'This image shows the superficial muscles of the human body, including the biceps, triceps, pectorals, and deltoids. The biceps bend the elbow and flex the forearm, while the triceps straighten the elbow and extend the forearm. The pectorals move the arms forward and upward, and the deltoids move the arms backward and downward.'},\n",
+ " {'image_id': '525',\n",
+ " 'caption': 'in a dimly lit room with a curtain in the background. The image depicts a romantic and intimate scene between the two actors.'},\n",
+ " {'image_id': '526',\n",
+ " 'caption': 'This image shows a living room with a fireplace and a chair in front of it. The walls are made of bricks and there is a clock on the wall above the fireplace. The floor is made of wooden planks and there is a rug in front of the fireplace. The room is well lit and there are curtains on the windows.'},\n",
+ " {'image_id': '527',\n",
+ " 'caption': 'This image is a set of thin line icons related to social media and communication. The icons include a speech bubble, a thought bubble, a heart, a speech balloon, a chat bubble, a smiley face, a thumbs up and a thumbs down. The icons are arranged in a grid pattern on a white background, and they are simple and minimalistic in design, making them suitable for use in various applications.\\n\\nThe icons can be used to create a consistent visual identity for a brand or product, and they can represent various actions and emotions related to social media, such as sharing, liking, commenting, and connecting with others. They are versatile and can be used in various contexts, such as in marketing materials, presentations, and infographics.'},\n",
+ " {'image_id': '529',\n",
+ " 'caption': \"This image is a black and white photograph of a man wearing a baseball cap and holding his hands up to his face, as if he's covering his eyes. The man's face is obscured by shadows, but you can see the outline of his features. The background of the image appears to be a dark, solid color. There are no other visible objects or elements in the image, except for the man's hands and the baseball cap he's wearing.\"},\n",
+ " {'image_id': '531',\n",
+ " 'caption': 'The image shows a dog and a cat lying on a red and black blanket on the floor. The dog has its mouth open and appears to be panting, while the cat is lying on its side with its paws tucked underneath it. The background of the image appears to be a living room with a couch and other furniture in the background.'},\n",
+ " {'image_id': '532',\n",
+ " 'caption': \"The image shows a luxurious resort on a lush green hillside. The resort comprises several wooden buildings with large windows providing natural light. The surroundings are filled with trees and greenery, creating a serene environment. The picture was taken at night, with the buildings' lights shining brightly against the dark sky.\"},\n",
+ " {'image_id': '534',\n",
+ " 'caption': \"The image shows a pregnant woman holding a piece of paper with two drawings on it. The drawings are of a man and a woman, with the man holding the woman's hand. The woman's stomach is visible in the image, and she is wearing a yellow shirt.\"},\n",
+ " {'image_id': '535',\n",
+ " 'caption': 'The image depicts a woman sitting at a sewing machine, surrounded by flowers. She is wearing a white apron and has a pair of scissors in her hand. The background is floral with pink, purple, and white flowers. The woman is smiling and appears to be happy while working on her sewing project. The theme of the image is creativity and craftsmanship, and the message is that creativity and hard work can lead to happiness and fulfillment.'},\n",
+ " {'image_id': '536',\n",
+ " 'caption': 'This image is a logo for a hair salon. The logo features a silhouette of a woman\\'s head with long, curly hair in shades of purple and blue. The words \"you gane hair\" are written in elegant script underneath the silhouette. The design is simple and elegant, with a focus on the hair and the salon\\'s name. It could be used on business cards, flyers, and other promotional materials for the salon.'},\n",
+ " {'image_id': '538',\n",
+ " 'caption': ', the image depicts glasses of green smoothie made with avocado and garnished with sliced avocado and mint leaves. The avocado provides healthy fats, vitamins, and minerals and adds creaminess to the smoothie. The mint leaves add freshness and a cooling sensation. Sliced avocado also adds texture and flavor to the drink.'},\n",
+ " {'image_id': '539',\n",
+ " 'caption': \"This is an image of a person holding a small black and tan dachshund's mouth open to reveal its teeth. The dog is wearing a collar with a tag on it. The person is using a toothbrush to brush the dog's teeth. The image is in focus, with the person and dog in sharp relief against the white background.\"},\n",
+ " {'image_id': '540',\n",
+ " 'caption': 'The woman in the image is wearing a white dress with a floral pattern on it, which has a high neckline and long sleeves. She is also wearing a black blazer over a white shirt with a collar, which is tucked into the dress. The woman is holding a brown bag in her left hand and has her right hand in her pocket. She is standing in front of a mirror, looking at herself. The room appears to be a bedroom with a bed, white comforter, and pillows in the background. The walls are painted a light color and there are white curtains on the windows. The floor is wooden with a white rug on the ground. Overall, the image depicts a woman taking a selfie in front of a mirror in a bedroom.'},\n",
+ " {'image_id': '541',\n",
+ " 'caption': 'This is a black and white photograph of an old car parked on a street in Havana, Cuba. The car appears to be a vintage model, possibly from the 1950s or 1960s. It is parked on the right side of the road, facing in the direction of the camera. There are no people or other vehicles visible in the image. The street is lined with old buildings on both sides, some of which appear to be in disrepair. The sky is cloudy and overcast, with the sun setting behind the buildings on the left side of the image. The overall mood of the image is one of nostalgia and decay.'},\n",
+ " {'image_id': '543',\n",
+ " 'caption': 'This is a photo of a room with a lot of old, rustic items in it. The walls are made of wooden planks and there is a wooden floor. The room is dimly lit by a few lanterns hanging from the ceiling. There is a large wooden table in the center of the room, covered in various items such as bottles, pots, and pans. Some old tools, such as hammers and saws, are hanging on the walls. The overall atmosphere of the room is very rustic and old fashioned.'},\n",
+ " {'image_id': '544',\n",
+ " 'caption': 'The image shows a group of plants growing in a garden. The plants are tall and green, with long, thin leaves. They appear to be growing in a row, with some of them leaning over the edge of the garden bed. There is a wooden fence in the background, and some trees can be seen in the distance. The overall color scheme of the image is green, with some brown and gray tones in the sky and the fence.'},\n",
+ " {'image_id': '546',\n",
+ " 'caption': 'This image shows a bride standing in front of a mirror, looking at her reflection. She is wearing a white wedding dress and has her hair styled in a loose bun. The bride is holding a bouquet of flowers in her hand. The room is decorated with white flowers and candles, and there is a window with white drapes in the background.'},\n",
+ " {'image_id': '548',\n",
+ " 'caption': 'The man in the image is wearing a gray suit, a black hat, and a white shirt. He is standing with his hands in his pockets and giving a thumbs up gesture. The background is white.'},\n",
+ " {'image_id': '549',\n",
+ " 'caption': 'The painting depicts a landscape with a river running through it, with trees on either side. The sky is dark and stormy, with clouds and lightning in the distance. In the foreground, there is a woman standing on the bank of the river, looking out at the storm. Behind her, there is a man standing on a bridge, looking out at the storm as well. In the background, there are mountains and more trees, with the stormy sky continuing into the distance.'},\n",
+ " {'image_id': '550',\n",
+ " 'caption': 'In the image, there is a bowl of cereal on a wooden table with a spoon next to it. The cereal appears to be a mixture of oatmeal, pomegranate seeds, and other ingredients. The background is a brown wooden surface.'},\n",
+ " {'image_id': '552',\n",
+ " 'caption': \"The image is a quote that reads, `'casting all your cares upon him, for he cares for you.' It is written in a calligraphy style, with the words cast and cares written in a bold, black font and the rest of the quote written in a lighter, cursive font. The background of the image is a light blue or turquoise color, and there is a white border around the text. The overall design of the image is simple and elegant, with the focus on the text and the message it conveys.\"},\n",
+ " {'image_id': '553',\n",
+ " 'caption': 'This is an image of a woman riding a horse in an indoor riding arena. The woman is wearing a helmet and riding gear, and the horse is wearing a bridle and saddle. The arena has a dirt floor and wooden walls, and there are other horses and riders in the background. The woman is holding the reins with her hands. The horse is standing still, and the woman is smiling. The lighting in the arena is bright, and there are shadows on the walls from the sunlight coming in through the windows. The overall mood of the image is peaceful and happy.'},\n",
+ " {'image_id': '555',\n",
+ " 'caption': 'This is a pair of sunglasses sitting on top of a wooden table. The sunglasses have a black frame and brown lenses. The lenses are tinted and the frame is made of metal. The sunglasses have a brown leather strap that goes around the back of the head. There is also a brown leather case on the table next to the sunglasses. The case is open and you can see the sunglasses inside.'},\n",
+ " {'image_id': '556',\n",
+ " 'caption': \"This is an image of a female doctor examining a patient's neck. The doctor is wearing a white lab coat and has a stethoscope around her neck. The patient is lying on an examination table and is looking up at the doctor, who is holding a stethoscope to the patient's neck. The room is well lit and has a white background.\"},\n",
+ " {'image_id': '557',\n",
+ " 'caption': 'This image is a photograph of a courtyard with a rectangular stone fountain in the center. The courtyard is surrounded by arches and domes, adorned with intricate geometric patterns on the walls. The water in the fountain is clear and reflects the blue sky above. A person, wearing a blue shirt and pants with their hands in their pockets, stands on the edge of the fountain, looking at the camera. The courtyard is otherwise empty except for the fountain and the person.'},\n",
+ " {'image_id': '558',\n",
+ " 'caption': ' The bike in the image is a full suspension mountain bike with a blue and white color scheme. The frame is made of aluminum and has a carbon fiber front triangle. The bike has 27.5 inch wheels and a dropper post, which allows the rider to lower the saddle for better control on steep descents.'},\n",
+ " {'image_id': '560',\n",
+ " 'caption': 'This image is a cartoon illustration of a baby sleeping on the crescent of the moon. The baby is wearing a blue and white outfit and has a pacifier in its mouth. The background is a starry night sky.'},\n",
+ " {'image_id': '564',\n",
+ " 'caption': 'The woman in the image is sitting on a wooden chair with her legs crossed and smiling. She has long, curly brown hair and is wearing a black velvet top with silver embroidery, blue jeans, and white sneakers. The background is black and the lighting is dim.'},\n",
+ " {'image_id': '566',\n",
+ " 'caption': 'The image shows a football player in a blue and gold uniform running with the ball during a game. The player is wearing a helmet and has a number on the back of his jersey. The stadium in the background is filled with fans watching the game.'},\n",
+ " {'image_id': '567',\n",
+ " 'caption': \"The power lines and towers lead the viewer's eye towards the sunset, and the mix of orange and purple in the sky adds to the beauty of the scene. The colors are vibrant and the lighting is well done, with the sun casting a warm glow over the power plant.\"},\n",
+ " {'image_id': '568',\n",
+ " 'caption': 'The image shows a drawing on a cement wall. The drawing depicts a red heart with a black arrow pointing downwards towards the bottom of the heart. The cement wall is grey and has some cracks in it. There are no other objects or people in the image.'},\n",
+ " {'image_id': '569',\n",
+ " 'caption': 'The image shows a modern dining room with a white marble dining table and chairs. The table top has a rectangular shape and is made of white marble. The legs of the table are made of metal and have a sleek modern design. The chairs are also sleek and modern in design, made of black leather. The walls of the room are painted in a light gray color, and there is a white fur rug on the floor. In the corner of the room, there is a large white Christmas tree decorated with silver and white ornaments.'},\n",
+ " {'image_id': '570',\n",
+ " 'caption': 'This image shows a group of people gathered on the deck of a large sailboat in a harbor or marina. The boat has several masts and sails, and the people are dressed in a variety of clothing, including shorts, t-shirts, and hats. Some of them are holding drinks and chatting with each other, while others are simply standing and looking out over the water. The overall atmosphere of the image is relaxed and casual, with people enjoying a day out on the water.'},\n",
+ " {'image_id': '571',\n",
+ " 'caption': 'This is an image of a person using a screwdriver to remove the propeller from a small RC plane. The person is holding the propeller with their left hand and using the screwdriver with their right hand to loosen the screws that hold the propeller in place. The plane has a black body and green wings, and the propeller is made of black plastic with three green blades. The person is wearing black gloves and a black shirt, and there is a white table in the background with some tools on it.'},\n",
+ " {'image_id': '572',\n",
+ " 'caption': \"This image shows a bride and groom sitting in the back of a vintage car. The bride is wearing a white wedding dress with a long veil, and the groom is wearing a black tuxedo with a white shirt and black tie. They are both smiling and looking at each other. The car appears to be a classic convertible, with the top down and the wind blowing through the bride's hair. The background is blurred, but it appears to be a city street with buildings and trees in the distance.\"},\n",
+ " {'image_id': '575',\n",
+ " 'caption': \"This image appears to be a drawing of a person wearing a mask and holding a cat. The person is wearing a black and white striped shirt, black pants, and black boots. The cat is sitting on the person's shoulder and appears to be looking up at the person's face. The image is in black and white and appears to have been drawn with a pen or pencil.\"},\n",
+ " {'image_id': '576',\n",
+ " 'caption': 'The image shows two flags, one green and the other white with a red crescent and star on it. The flags are waving in the wind against a dark cloudy sky.\\n\\nThe green flag has a white crescent and star in the center. The white flag has a green crescent and star in the center. The flags are waving in the wind against a dark cloudy sky.'},\n",
+ " {'image_id': '577',\n",
+ " 'caption': 'The image shows a bedroom with a black carpet and white walls. There is a large window with white curtains and a small table with a lamp on it. The bed is made with a black and white striped comforter, and there is a black and white striped rug on the floor. A black and white striped chair is placed in the corner of the room, with a cushion on it. The bed also has a black and white striped blanket and pillow. On the nightstand, there is a black and white striped lamp and vase. The window has a black and white striped curtain, and the bathroom door has a black and white striped shower curtain. Additionally, there is a black and white striped towel on the towel rack.'},\n",
+ " {'image_id': '578',\n",
+ " 'caption': 'This image shows a man wearing glasses and a denim shirt, leaning against a wall with his hands in his pockets.'},\n",
+ " {'image_id': '579',\n",
+ " 'caption': 'This image shows a man in a grey suit and tie standing next to a yellow vintage car from the 1920s or 1930s. The man is holding a briefcase in one hand and a cell phone in the other. The image was taken on a city street with buildings and other cars visible in the background.'},\n",
+ " {'image_id': '581',\n",
+ " 'caption': 'The image shows two hands holding a bowl of popcorn'},\n",
+ " {'image_id': '582',\n",
+ " 'caption': 'The image shows a plate with a sandwich on it. Slices of bananas are on the sandwich, and a jar of peanut butter and a knife are next to it. The plate is sitting on a kitchen counter.'},\n",
+ " {'image_id': '585',\n",
+ " 'caption': 'The image is a top view of a modern garden design featuring a large pond with a fountain in the center, surrounded by a variety of trees, shrubs, and flowers. There is also a patio area with outdoor furniture, including a table and chairs. The color scheme is neutral and the design has clean lines.'},\n",
+ " {'image_id': '586',\n",
+ " 'caption': 'This image appears to be an illustration of a woman standing on a rooftop, looking down at the ground below. The woman is wearing a long, flowing dress and has her hair pulled back into a ponytail. She is holding a sword in one hand and has a determined expression on her face. The sky behind her is dark and stormy, with lightning flashing in the distance. The rooftop appears to be made of shingles or tiles, and there are several chimneys visible in the background. The overall mood of the image is ominous and tense, as if the woman is preparing for a battle or confrontation.'},\n",
+ " {'image_id': '587',\n",
+ " 'caption': 'The image shows a close up of a red velvet cupcake with white frosting on top and a bite taken out of it. The cupcake is on a black background and there are other cupcakes in the background.\\n\\nThe cupcake appears to be made with red velvet cake mix and topped with cream cheese frosting. The frosting has a smooth, creamy texture and is piped onto the cupcake in swirls. The cupcake itself is a deep red color and has a smooth, glossy finish. The overall appearance of the cupcake is visually appealing and appetizing.'},\n",
+ " {'image_id': '590',\n",
+ " 'caption': 'This image is a pattern of princesses in various poses on a white background.'},\n",
+ " {'image_id': '591',\n",
+ " 'caption': 'This image shows a pile of old rope made up of several strands twisted and knotted together. Some strands are frayed and broken, while others are still intact. The rope appears to be made of natural fibers such as hemp or jute and is covered in dirt and debris. A small amount of green twine is wrapped around the base of the pile, giving it an overall effect of neglect and abandonment.'},\n",
+ " {'image_id': '592',\n",
+ " 'caption': 'This is a photo of a shelf in a grocery store. The shelf is filled with jars of spices and seasonings, including salt, pepper, and various herbs and spices. The jars are stacked in a grid pattern and placed on a wooden shelf with a metal frame. Above the shelf, there is a sign that says \"Spices\" in large letters. It is located in the spice aisle of the grocery store.'},\n",
+ " {'image_id': '593',\n",
+ " 'caption': 'This image shows a group of people standing in front of a brick building. The building has a sign on it that reads, \"Canterbury Hospital Medical Centre.\" Some people are wearing hats. There are also plants and flowers in front of the building.'},\n",
+ " {'image_id': '594',\n",
+ " 'caption': 'This The image shows a small body of water surrounded by tall reeds and other vegetation. The water is shallow and appears to be stagnant. There is a small tree growing out of the water in the center of the image. The sky is clear and there are some clouds visible in the distance.\\n\\nIn the foreground, there are some small plants growing in the water, including some lily pads and other aquatic vegetation. There are also some small fish swimming in the water.\\n\\nIn the background, there are some trees and other vegetation visible on the banks of the water. It appears to be a wetland or marsh area.\\n\\nOverall, this is a peaceful and serene image of a natural body of water surrounded by plants and wildlife.'},\n",
+ " {'image_id': '595',\n",
+ " 'caption': 'This image shows a night scene of a beach restaurant with a thatched roof, wooden tables and chairs, and a view of the ocean in the background. The lighting is provided by lanterns hanging from the roof, casting a warm glow over the area. The atmosphere is relaxed and peaceful, with the sound of waves crashing against the shore in the background.'},\n",
+ " {'image_id': '596',\n",
+ " 'caption': 'This image shows a man wearing a blue shirt with a pattern of horses on it. The shirt has short sleeves and is unbuttoned at the collar. He is wearing pink shorts and white sneakers, and standing with his hands in his pockets while looking directly at the camera. The shirt is made of lightweight material and has a slim fit with small brown horses on a navy blue background. The pattern on the shirt is made up of small brown horses on a navy blue background. The shorts are also made of lightweight material with an elastic waistband. The sneakers are white with a rubber sole and laces. The man has short, dark hair and a well-groomed beard, and a serious expression on his face. The background is not visible.'},\n",
+ " {'image_id': '597',\n",
+ " 'caption': 'This is an image of a market stall with colorful beads hanging from the ceiling. The stall appears to be selling various types of beads and other jewelry items. There are several customers browsing through the items on display. The overall atmosphere of the image is lively and vibrant, with bright colors and a lot of activity.'},\n",
+ " {'image_id': '598',\n",
+ " 'caption': 'symbol of innocence, purity, and beauty, which makes it a popular choice for jewelry.\\n\\nOverall, the postage stamp image portrays a graceful and refined young woman, adorned with pearls and exuding a sense of tranquility.'},\n",
+ " {'image_id': '599',\n",
+ " 'caption': \"C's. The Gucci logo features the brand's name in a bold, serif font, while the Prada logo features the brand's name in a simple, sans-serif font. Overall, the image serves as a visual representation of some of the most well-known and iconic brands in the fashion industry.\"},\n",
+ " {'image_id': '600',\n",
+ " 'caption': 'This image displays a pair of grey combat boots with a floral design on the side. They have a side zipper and top laces. The rubber soles have treads for improved traction. The boots are in good shape.'},\n",
+ " {'image_id': '601',\n",
+ " 'caption': 'The image shows a The image depicts a cityscape with colorful houses and buildings on either side of the road. There is a police car parked on the side of the road, and a man walking towards it. The sky is clear and blue, with a few clouds in the distance. The image is set against a white background.'},\n",
+ " {'image_id': '603',\n",
+ " 'caption': \"The image depicts a futuristic city featuring a large floating globe surrounded by a halo of light in the foreground. The cityscape in the background showcases a combination of modern and futuristic architecture, neon lights, and towering skyscrapers that illuminate the night sky. The image conveys a sense of progress and innovation, with the globe symbolizing the world's interconnectedness and the city representing humanity's advancement towards a brighter, more technologically advanced future.\"},\n",
+ " {'image_id': '604',\n",
+ " 'caption': 'This is a red dress with white lace detailing on the sleeves and hem, featuring a sweetheart neckline and a flared skirt. The model completes the outfit with white tights and red shoes with white laces, resulting in a cute and feminine look.'},\n",
+ " {'image_id': '605',\n",
+ " 'caption': 'The sun is shining brightly in the blue sky, and there is a single sunflower in the foreground. The sunflower is standing tall and proud, with its bright yellow petals and dark brown center. The sky is a bright blue, with fluffy white clouds floating in the distance. There is a sense of peace and tranquility in this image.'},\n",
+ " {'image_id': '606',\n",
+ " 'caption': 'The image shows a large, dimly lit auditorium with rows of seats facing a stage. The stage is empty without actors or props. The auditorium has large windows that let in natural light and a high ceiling. The walls are white without decorations, and the floor is made of wooden planks without carpets or rugs. The atmosphere is empty and still.'},\n",
+ " {'image_id': '608',\n",
+ " 'caption': 'The building in the image appears to be a hotel or restaurant. It has a red brick facade with large windows and a balcony on the second floor. There are several tables and chairs on the sidewalk in front of the building, and a few cars are parked in the parking lot. The building is surrounded by trees and greenery, and there is a clear blue sky in the background.'},\n",
+ " {'image_id': '609',\n",
+ " 'caption': 'This image shows a group of women standing around a table with plates of food on it. They appear to be preparing the food for a meal. The women are wearing aprons and hats, and one of them is holding a tray of drinks. There is a large window behind them that lets in a lot of natural light. The room appears to be a kitchen or dining area.'},\n",
+ " {'image_id': '610',\n",
+ " 'caption': 'There is a white plate with chicken, rice, and green beans on it. The chicken is delicious.'},\n",
+ " {'image_id': '612',\n",
+ " 'caption': \"This is an image of a person's wrist with a metal chain attached to it. The chain has a clasp at the end and the person's hand is holding it. They are also wearing a watch on their other wrist. The background of the image is a wooden table with various objects on it.\"},\n",
+ " {'image_id': '613',\n",
+ " 'caption': 'stillness in the scene, as the tree stands alone in the vast expanse of the plain.'},\n",
+ " {'image_id': '614',\n",
+ " 'caption': 'This is an aerial view of a large, luxurious mansion made of stone surrounded by lush green trees and shrubs. It has several windows and balconies. There is a large swimming pool in the front yard and a small pond in the backyard. The property is surrounded by a tall fence and there is a gate leading to the driveway.'},\n",
+ " {'image_id': '615',\n",
+ " 'caption': 'This is a photo of a bedroom on a boat. The bed is made up with white sheets and blankets, and there are two pillows on the bed. There is a small table next to the bed with a vase of flowers on it. The walls are painted white and there is a large window that lets in a lot of natural light. The floor is made of wood and there is a rug on the floor.\\n\\nThis is a photo of a small bedroom on a boat. The bed is made up with white sheets and blankets, and there are two pillows on the bed. There is a small table next to the bed with a vase of flowers on it. The walls are painted white and there is a large window that lets in a lot of natural light. The floor is made of wood and there is a rug on the floor.'},\n",
+ " {'image_id': '616',\n",
+ " 'caption': 'The image shows a clear phone case with a red strap attached to it, lying on a white surface next to a green leafy plant. The phone case appears to be made of clear plastic and has a sleek, modern design. The red strap is made of a thick, braided material and is attached to the phone case with a metal clasp. The leafy plant in the background is a large, tropical plant with broad, green leaves and a thick, woody stem. The plant appears to be healthy and well cared for.'},\n",
+ " {'image_id': '617',\n",
+ " 'caption': 'This image shows the interior of a restaurant with a long bar and several tables with white tablecloths and black chairs. The walls are made of exposed brick and the ceiling is made of wood beams. There are several large windows letting in natural light.'},\n",
+ " {'image_id': '618',\n",
+ " 'caption': \"The image shows a man wearing a black suit with a white shirt and a green tie. The tie has a pattern of small green dots on it. He is holding the tie in his left hand and appears to be adjusting it with his right hand. The man has short, dark hair and a serious expression on his face while looking directly at the camera. The background is a dark grey and the lighting in the image is slightly dim, with shadows on the man's face and the tie. The overall mood of the image is professional and serious.\"},\n",
+ " {'image_id': '619',\n",
+ " 'caption': 'The image shows a small, white house made of wood with a pitched roof and chimney on top. The house is surrounded by lush green trees and shrubs. The windows have white frames and shutters and the front door is wooden with a small porch in front of it. The lawn in front of the house is overgrown with moss and weeds, and there is a small path leading up to the front door. The trees in the background are tall and leafy, with branches that stretch up towards the sky.'},\n",
+ " {'image_id': '620',\n",
+ " 'caption': 'This is a map that shows the location of a village in the countryside. The village is surrounded by fields and forests, and there are several roads that lead to it. The map also shows the location of a river that runs through the area. There are several buildings marked on the map, including a church, a school, and a few houses. The map also shows the location of a cemetery, which is located on the outskirts of the village.'},\n",
+ " {'image_id': '621',\n",
+ " 'caption': 'the surrounding landscape is beautifully depicted with tall trees and a gray, cloudy sky.'},\n",
+ " {'image_id': '622',\n",
+ " 'caption': 'The image shows a silver coin with a tree image on it, specifically a silver fern which is a symbol of New Zealand. It could be a commemorative or collectible item.'},\n",
+ " {'image_id': '623',\n",
+ " 'caption': 'The image depicts a two-story house with blue exterior and white trim, featuring a balcony on the second floor. The front gate is made of metal and has a sign that reads \"person\". A small garden with plants and a fountain is visible in front of the house. The sky is cloudy and it seems to be dusk.'},\n",
+ " {'image_id': '625',\n",
+ " 'caption': 'shows a rocky coastline with a small town in the background. There are some boats in the water and a few people standing on the shore. The sky is cloudy and there are some mountains in the distance.'},\n",
+ " {'image_id': '626',\n",
+ " 'caption': 'The image shows two large telescopes on top of a snowy mountain. The telescopes are white and have large domes on top of them. The sky is cloudy and there is snow on the ground. The telescopes are pointed towards the sky, as if they are being used to observe the stars.'},\n",
+ " {'image_id': '627',\n",
+ " 'caption': 'This image depicts a decorative wallpaper design featuring a large, stylized floral motif in shades of purple and gold. The pattern is made up of a series of overlapping petals and leaves, arranged in an intricate, swirling design. The background of the wallpaper is a neutral, off-white color. The overall effect is a luxurious, ornate look that would be suitable for use in a formal setting such as a dining room or living room.'},\n",
+ " {'image_id': '629',\n",
+ " 'caption': 'This image is a shirt design of a cute koala bear wearing a flower crown and sitting on a tree branch. The words \"save the koalas\" are written in a playful font. The design would make a great gift for anyone interested in conservation efforts.'},\n",
+ " {'image_id': '630',\n",
+ " 'caption': 'The image shows a pile of video game controllers, including NES, SNES, and GBA controllers in good condition with no visible damage or wear. The controllers are arranged haphazardly and have a mix of black, white, and gray colors, with the NES controller having a distinctive red and white design. The image suggests a collection of vintage video game controllers gathered together for a purpose such as display or sale.'},\n",
+ " {'image_id': '631',\n",
+ " 'caption': 'This image depicts a dark and eerie scene, with a large twisted tree in the center of the frame. The sky above is dark and stormy, with lightning flashes illuminating the scene. In the foreground, there is a figure standing on a rocky outcropping, looking down at the tree. The figure is dressed in dark, tattered robes and holds a scythe in one hand. The overall mood of the image is ominous and foreboding.'},\n",
+ " {'image_id': '632',\n",
+ " 'caption': 'This image shows a woman standing outside of a building, wearing a black coat, black pants, and white sneakers. She is carrying a red tote bag and has a pair of black sunglasses on her head. The building in the background has a white wall and a large window. There are some plants in pots on the sidewalk in front of the building.'},\n",
+ " {'image_id': '633',\n",
+ " 'caption': 'This image is a stained glass window depicting a whirlpool in the ocean. The colors used in the window are blues, greens, and yellows. The whirlpool is depicted in the center of the window, with waves and bubbles swirling around it. On the left side of the window, there is a large fish swimming in the ocean. On the right side, there is a smaller fish swimming near the whirlpool. The overall effect of the window is one of peacefulness and serenity.'},\n",
+ " {'image_id': '634',\n",
+ " 'caption': 'The image depicts a yellow flower growing in a rocky mountain landscape with a cloudy and misty mountain range in the background. The petals of the flower are long and thin, with a slightly wavy texture, and the center of the flower is a darker yellow color. Small green plants surround the base of the flower, some of which appear to be moss or lichen. The angle of the image is low, looking up at the flower, which is the main focus of the image.'},\n",
+ " {'image_id': '635',\n",
+ " 'caption': 'be an old fashioned machine used for weaving fabric. There are several spools of thread hanging from the loom, and the woman on the left is holding a piece of fabric that has been woven on the loom. The background of the photograph appears to be a factory or workshop, with pipes and other machinery visible.'},\n",
+ " {'image_id': '636',\n",
+ " 'caption': 'This image is a black tank top with the words \"training for the class wars\" written in yellow on the front. The words are written in a bold font and are centered on the chest of the shirt. The shirt is made of 100% cotton and has a relaxed fit. It is a unisex tank top that can be worn by both men and women.'},\n",
+ " {'image_id': '637',\n",
+ " 'caption': 'This The image shows a group of papaya trees growing in a field. The trees are tall, with long, thin trunks and large, green leaves. The fruit on the trees is not yet ripe, as it is still small and green. The trees are surrounded by dirt and grass, and there is a clear blue sky in the background.\\n\\nThe papaya trees are a type of tropical fruit tree that is commonly grown in many parts of the world. They are known for their sweet, juicy fruit, which is often used in smoothies, salads, and other dishes. The trees can grow to be very tall, and they require a lot of sunlight and water to thrive.\\n\\nOverall, this image shows a group of papaya trees growing in a field, with the fruit still unripe. The trees are surrounded by dirt and grass, and there is a clear blue sky in the background.'},\n",
+ " {'image_id': '638',\n",
+ " 'caption': 'This image shows a hotel room with a large bed, a desk, and a television. The walls are covered in a beige patterned wallpaper, and there are two lamps on either side of the bed. The room is spacious and well lit, with large windows letting in natural light.'},\n",
+ " {'image_id': '639',\n",
+ " 'caption': 'The image shows a muscular man holding an orange in one hand and a glass of orange juice in the other. He is standing in front of a white background and is wearing a sleeveless shirt that reveals his toned abs. He has a serious expression on his face and is holding the orange in a way that suggests he is about to take a bite out of it. The glass of orange juice is half full and appears to be freshly squeezed. The overall impression of the image is one of health and vitality.'},\n",
+ " {'image_id': '640',\n",
+ " 'caption': 'This image is a close up of a white background with pink and purple cherry blossom flowers in full bloom. The petals are delicate and have a soft, feathery texture. The flowers are arranged randomly, some hanging down and others standing upright. The colors used in this image are soft and pastel, providing a pop of color against the white background. The overall effect is tranquil and beautiful, with a delicate, almost ethereal quality to the flowers. The composition is well done, drawing the eye towards the flowers.'},\n",
+ " {'image_id': '641',\n",
+ " 'caption': 'This image shows a pair of grey and white leather shoes with a blue sole, designed for golfing. The lace-up closure and small brand logo on the side of the shoe are visible. The shoes are in good condition and suitable for use on the golf course.'},\n",
+ " {'image_id': '642',\n",
+ " 'caption': 'This image shows a wooden swing set with a wooden frame and a wooden seat, located outside in front of a building with a large garage door open. The swing set appears to be in good condition and is ready for use.'},\n",
+ " {'image_id': '644',\n",
+ " 'caption': 'The image shows a group of people playing a game of basketball on a court. The players are wearing different colored jerseys and are dribbling the ball.'},\n",
+ " {'image_id': '645',\n",
+ " 'caption': \"The image shows two tote bags with colorful designs on them. One bag has the words 'happy birthday' written on it in pink letters, while the other has the words 'sweet baby' written on it in blue letters. Both bags have large bows on them, one pink and the other blue. The bags are on top of a wooden surface, which appears to be a table or easel. There are various colors and shapes of paintbrushes and art supplies scattered around.\"},\n",
+ " {'image_id': '646',\n",
+ " 'caption': 'This image is a map of the world. It shows the different countries and their capitals. The countries are colored in different shades of blue, green, yellow, red, and orange, and are labeled with their respective names. The map is divided into different regions, including Europe, Asia, Africa, North America, South America, Australia, and Antarctica. The capitals of the countries are highlighted in a different color. The map also shows the oceans and the equator, which divides the world into the northern and southern hemispheres.'},\n",
+ " {'image_id': '647',\n",
+ " 'caption': 'This image shows a group of people playing a game of shuffleboard in a bar. The table is made of wood and has a green felt surface. The players are dressed in casual clothing, with one man wearing a white t-shirt and jeans, and the other wearing a black t-shirt and shorts. There is a beer bottle and glasses on the table in front of them. The room has a wooden floor and a high ceiling with exposed beams. There is a large window behind the players that lets in a lot of natural light. The overall atmosphere of the image is relaxed and casual.'},\n",
+ " {'image_id': '650',\n",
+ " 'caption': 'This is an image of a woman standing in a fenced-in area with several small goats around her. The woman is wearing a black and white striped shirt with her hair tied back in a ponytail. The goats are different colors and sizes, nibbling on the grass in the enclosure. A wooden fence surrounds the area with some trees visible in the background.'},\n",
+ " {'image_id': '651',\n",
+ " 'caption': \"The image shows a bouquet of red roses in a vase. The roses have long stems and are tied together with a ribbon. There are a few baby's breath flowers in the bouquet as well. The vase is clear glass and has a curved shape. The background is white.\"},\n",
+ " {'image_id': '652',\n",
+ " 'caption': 'This is a black and white drawing of a large brick building with several windows and a tall, pointed roof. It has a large entrance and appears to be an office or government building. Several palm trees are in front of the building and a sidewalk leads up to the entrance.'},\n",
+ " {'image_id': '654',\n",
+ " 'caption': 'This is an image of a green rectangle with a black border on the top and bottom, and a white border on the left and right sides. There is a small black square in the top left corner of the image, and a small white square in the bottom right corner.\\n\\nThe green color of the rectangle creates a sense of calm and tranquility, while the black border adds a sense of sophistication and elegance. The white border also adds a sense of cleanliness and purity. The small black square in the top left corner adds a sense of mystery and intrigue, while the small white square in the bottom right corner adds a sense of balance and symmetry.\\n\\nOverall, this image is visually pleasing and creates a sense of harmony and balance.'},\n",
+ " {'image_id': '655',\n",
+ " 'caption': 'This is a photograph of a farm field with rows of soybean crops growing. The sun is setting in the background, casting a warm orange glow over the scene. The straight and evenly spaced rows indicate that the crops have been planted and tended to with care. The dark and rich soil suggests fertility, allowing the crops to grow. This peaceful and picturesque image captures a farm at sunset.'},\n",
+ " {'image_id': '656',\n",
+ " 'caption': 'city contrasting with the dark sky. The water in the foreground adds a sense of tranquility to the scene, while the bridge in the distance gives a sense of depth to the image.'},\n",
+ " {'image_id': '657',\n",
+ " 'caption': \"This image shows a sunset over an oil field with pump jacks in the foreground. The sun is setting behind the pump jacks, casting a warm glow over the scene. The sky is filled with clouds, some of which are lit up by the sun's rays. The pump jacks are tall, metal structures with arms that move up and down to pump oil out of the ground. They are arranged in a row, with some of them visible in the foreground and others in the background. The oil field is surrounded by flat, barren land. There are no other buildings or structures visible in the image.\"},\n",
+ " {'image_id': '660',\n",
+ " 'caption': 'This image shows a group of black and white sheep standing on a dirt path next to a fence. The sheep are all facing the same direction and appear to be looking at something outside of the frame. The fence is made of metal bars and there is a small opening in it where the sheep are standing. The ground is covered in dirt and there is a small patch of grass visible in the foreground. The sky is cloudy and overcast.'},\n",
+ " {'image_id': '661',\n",
+ " 'caption': 'The image shows a cityscape with tall concrete and glass buildings in the background, some with balconies on the upper floors. The foreground features a small park with a fountain and a few benches surrounded by a mix of deciduous and evergreen trees such as oak, maple, and pine. The sky is clear and blue with a few fluffy clouds visible in the distance.'},\n",
+ " {'image_id': '662',\n",
+ " 'caption': \"This image shows a young man wearing a black suit and white shirt, smiling at the camera. He has short, dark hair and is clean shaven. He is wearing a pair of black shoes and his hands are in his pockets. The background of the image appears to be an outdoor event or party, with people milling around in the background. The image is well lit and the man's face is clearly visible.\"},\n",
+ " {'image_id': '663',\n",
+ " 'caption': 'The image shows a grassy hill with a view of a city in the distance. The hill is covered in green grass and there are no trees or other vegetation in the foreground. The sky is clear and blue, with a few fluffy clouds visible in the distance. The sun is shining down on the hill, casting long shadows across the grass. The city in the distance appears to be quite large, with several tall buildings visible. There is a road running along the base of the hill, and a few cars can be seen driving along it. Overall, the scene is peaceful and serene, with the green grass and blue sky creating a calming atmosphere.'},\n",
+ " {'image_id': '664',\n",
+ " 'caption': 'This image shows the words \"Magic Warriors\" written in white, glowing letters on a dark background. The letters seem to be floating and are stylized with the \"W\" in \"Warriors\" forming into a sword, creating a mysterious and magical effect.'},\n",
+ " {'image_id': '665',\n",
+ " 'caption': \"The image shows a race car driving on a race track. The car is blue and white with a green stripe down the side. It has a number 22 on the side and a sponsor's logo on the hood. The wheels are black and the tires are white. The track is made of asphalt and has painted lines on it. There are trees in the background and the sky appears to be cloudy.\"},\n",
+ " {'image_id': '666',\n",
+ " 'caption': \"This image shows the London skyline at night, with the London Eye ferris wheel in the foreground and the Houses of Parliament in the background. The London Eye is a large ferris wheel located on the south bank of the River Thames in London, England. It is one of the city's most recognizable landmarks and a popular tourist attraction. The Houses of Parliament, also known as the Palace of Westminster, is the seat of the British government and is located on the north bank of the River Thames. It is a complex of buildings that includes the House of Commons, the House of Lords, and the clock tower known as Big Ben.\"},\n",
+ " {'image_id': '667',\n",
+ " 'caption': 'The painting depicts a view of a canal in Venice, Italy, lined with buildings on both sides with balconies overlooking the water. A gondola with two people sitting in it is in the middle of the canal. The left buildings are taller and more ornate than the right. The sky is overcast with mostly muted colors of browns, grays, and blues. The overall mood is quiet and peaceful, with a sense of stillness in the water and air.'},\n",
+ " {'image_id': '668',\n",
+ " 'caption': \"a red collar with a bell on it and is depicted in various facial expressions including happiness, sadness, surprise, anger, and confusion while standing on its hind legs with front paws on its hips. The cat has large expressive eyes, whiskers, an open mouth as if meowing, perked up ears, and a fluffy tail. Its body is orange, with a white belly, paws, and a white patch on its chest. The cat's cheeks have black whiskers, and its fur is fluffy and well-groomed.\"},\n",
+ " {'image_id': '669',\n",
+ " 'caption': 'This is an image of a person sitting in a green tent in the snow. The person is wearing snowshoes and has skis leaning against the tent wall. There is a small amount of snow on the ground in front of the tent, and trees can be seen in the background.'},\n",
+ " {'image_id': '670',\n",
+ " 'caption': 'The image depicts a collage of various plants, including potted plants, hanging plants, and plants growing in a garden. The plants are a mix of different species, such as ferns, succulents, and flowering plants, arranged in a haphazard manner, with some overlapping and some standing alone. The result is a lush and vibrant display of greenery.'},\n",
+ " {'image_id': '672',\n",
+ " 'caption': 'This image shows a plate of food on a wooden table. The plate has a steak, french fries, and a tomato on it. There are also two glasses of drinks on the table, one of which has a straw in it. The background of the image is a wooden wall and a window with curtains.'},\n",
+ " {'image_id': '673',\n",
+ " 'caption': 'This is a photograph of a baseball game being played in a large stadium. The stadium is filled with people sitting in the stands and watching the game. The field is green and the players are on it playing the game. The sky is clear and blue with no clouds. The sun is setting in the background, casting long shadows on the field. The stadium lights are on, illuminating the field.'},\n",
+ " {'image_id': '674',\n",
+ " 'caption': 'The image is a blue and purple background with the words \"I am a recovering perfectionist\" written on it in white letters. The words are centered and slightly raised above the background. There are no other visible objects or elements in the image.'},\n",
+ " {'image_id': '675',\n",
+ " 'caption': \"This is an image of a white sedan car with a sleek and modern design. It has a large front grille, headlights, and a spoiler on the back. The car's wheels are equipped with black rims. The car is parked on a grey background.\"},\n",
+ " {'image_id': '676',\n",
+ " 'caption': 'This image shows a bracelet made of green emeralds and white diamonds. The emeralds and diamonds are set in white gold. The bracelet has a clasp made of white gold. The emeralds are oval shaped, bright green, and have a total weight of approximately 1.5 carats. The diamonds are round shaped and have a total weight of approximately 0.5 carats. The overall look of the bracelet is elegant and sophisticated.'},\n",
+ " {'image_id': '678',\n",
+ " 'caption': 'This is an image of a kite with a red, white, and blue design featuring an eagle at the center, holding a ribbon in its beak and having its wings spread out. The kite has a long tail made of red and white ribbons and is suspended from a string that is attached to the top of the image. The image is on a white background.'},\n",
+ " {'image_id': '679',\n",
+ " 'caption': \"This is an image of a person wearing shoes with flames coming out of the bottom. The shoes have a yellow and black design on them and appear to be made of rubber or plastic. The person is standing on a dark surface, possibly asphalt or concrete, and there is a car parked in the background. The image appears to have been taken at night with streetlights in the background, and the person's face is partially obscured by shadows.\"},\n",
+ " {'image_id': '680',\n",
+ " 'caption': 'This image shows a living room with a couch, coffee table, and shelves on the wall. There is a man standing in the middle of the room, looking at the shelves. The room has a neutral color scheme with white walls, beige carpet, and brown furniture. There are two windows on either side of the room, letting in natural light. The shelves are made of wood and have various items on them, including books, vases, and a clock. The coffee table has a lamp on it, and there is a throw pillow on the couch. The overall atmosphere of the room is cozy and welcoming.'},\n",
+ " {'image_id': '682',\n",
+ " 'caption': 'This is an image of a large, cylindrical machine in a laboratory setting. The machine appears to be made of metal and has several gauges and dials on the front. There are several pipes and tubes coming out of the top and bottom of the machine. The machine is sitting on a wheeled stand and there is a chain attached to the bottom of it. There are also several other machines and equipment visible in the background.'},\n",
+ " {'image_id': '684',\n",
+ " 'caption': 'The image shows a pair of white sandals with a braided design on the straps. The straps are made of woven rope and have a tassel at the end. The soles of the sandals appear to be made of a light colored material, possibly leather. The overall design of the sandals is simple and elegant.'},\n",
+ " {'image_id': '685',\n",
+ " 'caption': 'The image shows a gold necklace with small white pearls hanging from it. The pearls are arranged in a row, with a small gap between each one. The necklace has a delicate, dainty look to it.'},\n",
+ " {'image_id': '687',\n",
+ " 'caption': \"The image shows two cats, one is black and white and the other is grey and white. The black and white cat is laying on its back with its paws in the air, while the grey and white cat is sitting next to it with its head tilted to the side. Both cats have their eyes open and appear to be looking at the camera. The caption reads, `'do you want to know something funny? nope.'''\"},\n",
+ " {'image_id': '688',\n",
+ " 'caption': 'This image is a birthday greeting with the words \"happy birthday\" written in colorful letters in a playful, childlike font. The background is white with no other design elements, making it a simple and fun way to wish someone a happy birthday.'},\n",
+ " {'image_id': '689',\n",
+ " 'caption': 'The image shows a large, ornate cathedral with intricate, gold mosaics on the walls and ceiling depicting various religious scenes, including the crucifixion and resurrection of Jesus. The cathedral also has large stained glass windows that let in a warm, golden light. Rows of pews are in front of the altar where people can sit and pray, creating an overall atmosphere of grandeur and reverence.'},\n",
+ " {'image_id': '690',\n",
+ " 'caption': \"There are several tables with black chairs in the center of a modern and minimalist room. Large windows surround the tables, letting in natural light. The walls are painted white with a few black and white photographs hanging on them. The room's floor is made of wooden planks with some potted plants scattered around.\"},\n",
+ " {'image_id': '691',\n",
+ " 'caption': 'The image shows a man standing in front of a cash register with a shopping cart full of groceries in the background. The man is wearing a suit and holding a shopping list in his hand. The cash register has a screen displaying the total cost of the items in the cart.\\n\\nThe image depicts a scene from a supermarket or grocery store, where customers can pay for their purchases at the cash register. The cash register has a screen that displays the total cost of the items in the cart, and the man is standing in front of it, ready to pay for his groceries. The shopping cart behind him is full of various items, such as bread, milk, eggs, and other groceries.\\n\\nThe image could be used to illustrate various concepts related to shopping, such as paying for groceries, checking out at the cash register, or buying items at a supermarket'},\n",
+ " {'image_id': '692',\n",
+ " 'caption': \"The image shows a group of rugby players huddled together on the field, with one player sitting on the ground with his head in his hands. The players are wearing blue and red jerseys, with the number 10 on the back of one player's jersey. The field is green and there are spectators in the stands watching the game.\"},\n",
+ " {'image_id': '694',\n",
+ " 'caption': 'The image shows a dish of fish with vegetables such as broccoli and carrots on a white plate. A lemon wedge is on the side of the plate, and a small bowl of sauce next to it. The fish is grilled or baked, with a crispy brown exterior and a moist interior. The vegetables are cooked but still crisp, and the sauce is creamy and flavorful. The presentation of the dish is visually appealing and appetizing.'},\n",
+ " {'image_id': '695',\n",
+ " 'caption': 'This image shows a stack of whole wheat pancakes with sliced bananas on top. A hand is pouring golden honey syrup over them. The pancakes are on a blue and white checkered plate.'},\n",
+ " {'image_id': '696',\n",
+ " 'caption': 'This is an image of a broken egg. It appears to have cracked open and the yolk is leaking out of it. The egg is lying on the ground and there is dirt and debris around it. The image is in black and white.'},\n",
+ " {'image_id': '697',\n",
+ " 'caption': 'This image is a drawing of a military truck. The truck is a large vehicle with a box on the back. The front of the truck has a hood, grille, and headlights. The windshield wraps around the front. Large wheels and tires are on the sides. The back of the truck has a box with a ramp and a winch.\\n\\nDimensions: 20 feet long, 8 feet wide, and 10 feet tall. Box: 8 feet wide and 12 feet long. Wheels: 4 feet in diameter.'},\n",
+ " {'image_id': '698',\n",
+ " 'caption': 'The image shows a large school of fish swimming in clear blue water. The fish are of different sizes and are swimming together in a coordinated manner. The water is clear and the sunlight is shining down on the fish, making them appear to be glistening in the sun. There are no other objects in the image except for the fish and the water.'},\n",
+ " {'image_id': '699',\n",
+ " 'caption': \"The Church of the Savior on Spilled Blood, also known as the Church of the Resurrection, is a Russian Orthodox church situated in St. Petersburg, Russia. Built in the late 19th century, it is one of the city's most recognizable landmarks. The church is located on the banks of the Griboyedva River and has brightly colored domes and intricate architecture.\\n\\nThe church was built to commemorate the assassination of Emperor Alexander II in 1881. Its design is a fusion of traditional Russian and Byzantine styles, with a large central dome and four smaller domes on the corners. The exterior is adorned with intricate mosaics and frescoes depicting scenes from the Bible and Russian history.\\n\\nThe interior of the church is equally impressive, with marble columns and intricate mosaics covering the walls and ceiling.\"},\n",
+ " {'image_id': '700',\n",
+ " 'caption': \"The image depicts a man performing an abdominal exercise on a machine. The man is wearing a blue workout suit and has his hands on the machine's handles. The machine has several cables and pulleys attached to it, which are used to provide resistance to the exerciser's movements. The man's body is in a bent position, with his legs extended behind him and his arms extended forward. The image is in a white background.\"},\n",
+ " {'image_id': '701',\n",
+ " 'caption': 'the image depicts a young girl in a fairy costume standing on a natural-looking globe while holding a wand with a star on the end. She is looking up at the stars in the night sky with a facial expression of awe and wonder. The background shows mountains and valleys. The mood of the image is one of wonder and magic.'},\n",
+ " {'image_id': '702',\n",
+ " 'caption': 'The image shows a group of businessmen climbing up a steep cliff, with one holding a red flag and the other holding a blue flag. They are trying to reach the top of the cliff, which is depicted as a large mountain in the background. The overall theme of the image appears to be one of determination and teamwork, as the men work together to reach the top of the mountain.\\n\\nThe colors used in the image are mostly shades of blue and red, with the red flag standing out prominently against the blue sky in the background. The overall mood of the image is one of determination and perseverance, as the men work together to achieve their goal of reaching the top of the mountain. The image could be used to convey a message of teamwork and the importance of working together to achieve a common goal.'},\n",
+ " {'image_id': '703',\n",
+ " 'caption': 'The image shows a police officer standing next to a car that has been involved in an accident. The car appears to have sustained significant damage to the front end, with the hood crumpled and the windshield shattered. The officer is standing next to the car, with his arms crossed and a look of concern on his face. Behind the car, there are several other vehicles parked in the lot, including a large truck with the letters \"pt\" on the side.'},\n",
+ " {'image_id': '705',\n",
+ " 'caption': 'The image shows a black car driving down a road with mountains in the background. The sun is shining brightly in the sky, and there are no other cars on the road. The car has dark tinted windows and a spoiler on the back. It appears to be a sports car, with a large engine and low profile tires. The license plate on the car is not visible, but it appears to be from a state in the United States.\\n\\nThe car appears to be in good condition, with no visible dents or scratches on the body. The tires are also in good shape, with no signs of wear or tear. The windows are tinted, so it is not possible to see inside the car. However, it appears to be a luxury sports car, with leather seats and a high tech dashboard.\\n\\nOverall, the image shows a sleek, black sports car driving down a desert road at sunset'},\n",
+ " {'image_id': '706',\n",
+ " 'caption': 'The image shows a white cosmetic pouch with gold foil lettering on the front that reads \"glamorous\".'},\n",
+ " {'image_id': '709',\n",
+ " 'caption': 'The image shows a large stone tower with a pointed top, surrounded by trees and a foggy sky. The tower has several windows and a small door on the side. There are no people or other objects in the image.'},\n",
+ " {'image_id': '710',\n",
+ " 'caption': 'The image shows a potted plant sitting on a windowsill'},\n",
+ " {'image_id': '711',\n",
+ " 'caption': \"The image depicts a silhouette of a person holding a star in their hand. The person is standing in front of a white background. The image is a simple, flat illustration with no shading or texture. The person's face is not visible, only the outline of their body and the star they are holding. The image could be used as a symbol for achievement, success, or recognition.\"},\n",
+ " {'image_id': '713',\n",
+ " 'caption': 'There is a bed with a blanket and pillows on it, a dresser with a television on top, a desk with a lamp and a chair in front of it, and a window with curtains and a view of the outside.'},\n",
+ " {'image_id': '714',\n",
+ " 'caption': 'The image depicts an elderly woman wearing a straw hat, green pants, and a red shirt. She is holding a map in her left hand and a suitcase in her right hand. The woman is standing with her legs apart and looking down at the map. The image is in a flat, cartoon style and is on a white background. The woman is wearing a straw hat, green pants, and a red shirt. She is holding a map in her left hand and a suitcase in her right hand. The woman is standing with her legs apart and looking down at the map. The image is in a flat, cartoon style and is on a white background.'},\n",
+ " {'image_id': '718',\n",
+ " 'caption': 'visible texture or patterns. The reflective quality of the oval gives it a three-dimensional appearance, as it appears to be raised off the surface it is placed on. Overall, the image showcases a bright and shiny yellow oval shape with a reflective surface, placed on a white background to create contrast.'},\n",
+ " {'image_id': '720',\n",
+ " 'caption': 'The image depicts a silhouette of a person riding a horse. The person is wearing a black cape and has a sword in their hand. The horse is galloping with its mane and tail flowing in the wind. The image is on a yellow background.\\n\\n'},\n",
+ " {'image_id': '723',\n",
+ " 'caption': 'This image is a 3D representation of a dental implant, which is a titanium post surgically placed into the jawbone to support a replacement tooth or bridge. The implant is connected to the jawbone through osseointegration, allowing the bone to grow around the implant and hold it in place. The implant is then capped with a crown, the visible part of the tooth in the mouth.'},\n",
+ " {'image_id': '724',\n",
+ " 'caption': \"This is an illustration of various objects hanging from strings, with the words 'things that art' written above them. The objects depicted include a pair of scissors, a paintbrush, a pencil, and a piece of paper. The words are written in a stylized font, with each letter hanging from a different string. The overall effect is one of creativity and inspiration, as if the objects are there as a reminder to create something beautiful.\"},\n",
+ " {'image_id': '726',\n",
+ " 'caption': \"The image shows an old, yellowed piece of paper with handwriting on it, which appears to be from the 1800s and is written in a formal, cursive style. The letter is addressed to someone named `person' from someone named `person' and is written in ink, possibly with a quill pen. Despite some stains and smudges, the words on the paper are still legible.\"},\n",
+ " {'image_id': '727',\n",
+ " 'caption': 'This is a photograph of a bedroom. The walls are painted white and there is a grey carpet on the floor. There is a large bed in the middle of the room with a yellow bedspread and white pillows. Two lamps are placed on either side of the bed, one of which is turned on. There is a door on the left and a window on the right. The room is well lit and appears to be clean and well kept.'},\n",
+ " {'image_id': '728',\n",
+ " 'caption': 'The image shows a close-up view of a small, green plant growing out of the ground. The plant has several small leaves and a stem that is covered in water droplets, which are reflecting the light in different directions. The leaves are also covered in water droplets, giving them a shiny appearance. The background is black. The plant appears to be a type of grass or weed, with thin, green leaves and a long, thin stem.'},\n",
+ " {'image_id': '730',\n",
+ " 'caption': 'The image shows a family sitting on the sand at the beach. The mother is holding a baby and the father is sitting next to her with his arm around her. They are all smiling and seem to be enjoying their time together. The sun is shining brightly in the background, casting long shadows on the sand. There are some tall grasses growing in the foreground, and some trees can be seen in the background.'},\n",
+ " {'image_id': '731',\n",
+ " 'caption': 'This is an image of a Boeing 747-8 airplane painted with a horse on the side parked on the tarmac at an airport.'},\n",
+ " {'image_id': '732',\n",
+ " 'caption': \"The image is a quote from a book or movie, written in white on a navy blue background. The quote reads, `'It is not the critic who counts; not the man who points out how the strong man stumbles, or where the doer of deeds could have done them better. The credit belongs to the man who is actually in the arena, whose face is marred by dust and sweat and blood; who strives valiantly; who errs, who comes short again and again, because there is no effort without error and shortcoming; but who does actually strive to do the deeds; who knows great enthusiasms, the great devotions; who spends himself in a worthy cause; who at the best knows in the end the triumph of high achievement, and who at the worst, if he fails, at least fails while daring greatly, so that his place shall never be with those cold and timid souls.\"},\n",
+ " {'image_id': '733',\n",
+ " 'caption': 'the image is calm and reflects the blue sky above. There are ripples in the water caused by the boats and the people walking on the dock. The picnic tables nearby are made of wood and have benches attached to them. Some of the people sitting at the tables are eating and drinking, while others are simply enjoying the view.'},\n",
+ " {'image_id': '734',\n",
+ " 'caption': 'This image shows a kitchen with white cabinets and countertops made of granite. There are three pendant lights hanging from the ceiling. The large island in the center of the kitchen has a stainless steel sink and a white refrigerator on the other side. The walls are painted in a light gray color with a white tile backsplash behind the stove. The hardwood floor has a rug in front of the island.'},\n",
+ " {'image_id': '735',\n",
+ " 'caption': 'This is a black and white line drawing of a cartoon girl throwing a frisbee. The girl is wearing a pink shirt and shorts, and she is holding a frisbee in her right hand. The frisbee is flying through the air, and the girl is looking up at it with a determined expression on her face. The background is empty, and there are no other objects or people in the image.'},\n",
+ " {'image_id': '736',\n",
+ " 'caption': 'This is an image of a machine being worked on in a factory. The machine is made of metal and has several pipes and hoses attached to it. A person stands next to the machine holding a wrench and looking at the pipes. The machine is being repaired or maintained.'},\n",
+ " {'image_id': '737',\n",
+ " 'caption': 'The image shows a group of people walking down a snowy sidewalk in front of a red brick building. The people are dressed in winter clothing, including hats, scarves, and coats. Some of them are carrying backpacks and other bags, while others have their hands in their pockets. In the background, there are several trees with snow on their branches, and a few cars parked on the street. The sky is cloudy and overcast, with no sun in sight.'},\n",
+ " {'image_id': '739',\n",
+ " 'caption': 'The image shows a large auditorium filled with people sitting in rows of chairs facing a stage. On the stage, there is a large orchestra with a conductor standing in front of them. The orchestra is playing instruments such as violins, violas, cellos, and double basses. The audience is watching the performance, and some people are clapping their hands. The ceiling of the auditorium is high, and there are large chandeliers hanging from it. The walls are decorated with paintings and murals, and there are large windows on either side of the auditorium letting in natural light.'},\n",
+ " {'image_id': '740',\n",
+ " 'caption': 'This is a diagram of a formal plot of a story. It shows the rising action, climax, falling action, and resolution of the story. The rising action is the beginning of the story, where the characters are introduced and the conflict is established. The climax is the turning point of the story, where the conflict is at its highest point and the outcome is uncertain. The falling action is the part of the story after the climax, where the conflict is resolved and the outcome is determined. The resolution is the ending of the story, where the loose ends are tied up and the story comes to a close. This diagram is a useful tool for visualizing the structure of a story and identifying its key elements.'},\n",
+ " {'image_id': '741',\n",
+ " 'caption': \"Constellation Leo is one of the twelve zodiac constellations in the night sky. It is named after the lion in Greek mythology and located in the northern hemisphere. Leo is easily recognizable by its seven bright stars, which form the shape of a lion's head and mane. The brightest star in Leo is Regulus at the base of the lion's tail. Leo covers an area of about 900 square degrees and is bordered by Cancer to the east, Hydra to the south, and Virgo to the west. It is visible in the northern hemisphere from around July to October and is best viewed in the early evening during the summer months. Leo has several interesting deep sky objects associated with it, including the Leo.\"},\n",
+ " {'image_id': '742',\n",
+ " 'caption': \"This image is a cartoon of a pig wearing overalls and a blue shirt with the text 'just a girl who loves pigs' written on it in white letters. The pig is standing on its hind legs, with its front legs resting on a wooden fence. The background is a grassy field with some trees in the distance. The overall theme of the image is cute and playful, with a focus on the love of pigs. It could be used for a variety of purposes, such as a social media post, a greeting card, or a t-shirt design.\"},\n",
+ " {'image_id': '743',\n",
+ " 'caption': 'The image displays a brown sign with a fish and an arrow pointing towards it, situated in a wooded area with fallen leaves on the ground.'},\n",
+ " {'image_id': '744',\n",
+ " 'caption': 'This image shows two women, one older and one younger, standing outside in front of a house. They are both smiling and posing for the camera. The older woman has gray hair and is wearing glasses, while the younger woman has blonde hair and is wearing a blue shirt and jeans. The house behind them has a white picket fence and a green lawn. The sky is cloudy and there are trees in the background.'},\n",
+ " {'image_id': '745',\n",
+ " 'caption': 'The image shows a group of people standing on top of a truck holding a trophy. They are all wearing blue shirts and one person is holding a microphone. The trophy appears to be made of metal and has a golden base. The people in the image are smiling and waving at the camera. There is a crowd of people in the background watching the scene.'},\n",
+ " {'image_id': '746',\n",
+ " 'caption': 'The woman in the image is wearing a black tank top, black tights, and black high heels. Her hair is styled in loose waves and she is wearing black sunglasses. She is posing with her hands on her hips and looking directly at the camera. The image is in black and white.'},\n",
+ " {'image_id': '747',\n",
+ " 'caption': 'The \"koleif\" logo is a modern and sleek design consisting of a letter \"k\" in green, blue, and red. The colors used in the logo represent growth, stability, trust, reliability, energy, and passion. The unique shape of the letter \"k\" is memorable and represents the company\\'s commitment to precision and accuracy.'},\n",
+ " {'image_id': '748',\n",
+ " 'caption': \"This image is a diagram of a person's brain, with different areas highlighted and labeled with terms related to psychology and neuroscience. The brain is depicted as a tree with branches and roots, with each area of the brain represented by a different shape and color. It is intended to be used as a visual aid for teaching or explaining brain functions.\"},\n",
+ " {'image_id': '750',\n",
+ " 'caption': 'This image shows a balcony with a table and chairs overlooking a body of water. The table is set for a meal, with plates, glasses, and silverware. The balcony is surrounded by a wooden railing, and there are trees in the background. The sky is clear, and the sun is shining down on the water.'},\n",
+ " {'image_id': '752',\n",
+ " 'caption': \"The image is a bangle bracelet with the words'baseball mom'engraved on it. The bracelet has three charms hanging from it, including a baseball, a heart, and a star. The charms are made of silver and have a shiny finish. The bracelet is adjustable and can be worn by anyone.\"},\n",
+ " {'image_id': '753',\n",
+ " 'caption': 'This is an image of a woman with a pink lipstick on her lips. She is wearing a blue shirt and has her hair styled in a messy bun. The image is well lit and the colors are bright and vibrant.\\n\\nIn the image, the woman is looking directly at the camera with a slight smile on her face. Her eyes are brown and her eyelashes are long and curled. The lipstick she is wearing is a bright pink shade with a glossy finish. The color looks great on her lips and complements her skin tone.\\n\\nOverall, this is a beautiful image of a woman with a great makeup look. The lighting, colors, and details are all well done.'},\n",
+ " {'image_id': '755',\n",
+ " 'caption': 'This is an image of a rectangular metal object sitting on a surface made of the same material. Metal pieces are attached to the top and bottom of the object to hold it in place. A ruler or measuring tool is placed next to the object to measure its dimensions.'},\n",
+ " {'image_id': '757',\n",
+ " 'caption': 'There are two women dressed in 18th century clothing standing next to each other. The woman on the left is wearing a black dress with a white apron and a red scarf around her neck. The woman on the right is wearing a blue and white striped dress with a yellow apron and a red scarf around her neck. Both women have their hair styled in loose curls and are wearing gold hoop earrings. They are standing on a street with a crowd of people in the background.'},\n",
+ " {'image_id': '758',\n",
+ " 'caption': 'This is a black t-shirt made of 100% cotton, featuring a skull and crossbones in the center. The words \"Feed the Meat Grinder\" are written in red above and below the skull. The shirt has a standard fit and is suitable for both casual and formal wear. It can be paired with jeans or shorts for a cool and edgy look. The design is bold and eye-catching, making it perfect for anyone who loves horror or punk style. It is a great t-shirt for anyone who wants to make a statement with their clothing.'},\n",
+ " {'image_id': '760',\n",
+ " 'caption': 'boy riding the horse through a desert with cacti in the background.'},\n",
+ " {'image_id': '761',\n",
+ " 'caption': 'This image is a photograph of a person on a rocky beach with the ocean in the background. The person is standing with their back to the camera, looking out at the water. The sky is bright blue, and the water is deep blue. The sand on the beach is light brown, and there are rocks in the foreground. The person appears lost in thought as they gaze out at the ocean, creating a peaceful and serene mood.'},\n",
+ " {'image_id': '763',\n",
+ " 'caption': 'This image shows a green, vintage military jeep with a large engine on the back parked in a parking lot. The jeep appears to be in good condition, with no visible damage or wear. The front of the jeep has a large grille and headlights, and the windshield appears to be in good condition. The tires on the jeep appear to be in good condition as well.'},\n",
+ " {'image_id': '764',\n",
+ " 'caption': 'The image shows a The image shows the interior of an airplane with three rows of seats. The seats are arranged in a 2 - 2 configuration, with each row having two seats on either side of the aisle. The seats are upholstered in a light grey fabric with blue and green accents on the headrests and armrests. There are two windows on either side of the plane, and each window has a shade that can be raised or lowered. There is a small table between the seats in the middle row, with a tray for drinks or other items. The carpet on the floor is a light beige color.'},\n",
+ " {'image_id': '765',\n",
+ " 'caption': 'image captures the essence of an extravagant and sophisticated lifestyle, with the grand staircase and lavish decor providing a backdrop for the elegant attire and social interaction of the people in the image.'},\n",
+ " {'image_id': '766',\n",
+ " 'caption': 'This is a photograph of a young girl playing in the sand on a beach. The girl is wearing a black and white dress and is sitting on her hands and knees in the sand. In the background, you can see the ocean and some palm trees. The girl appears to be building something out of the sand with her hands. The sand around her is smooth and flat, and there are no footprints or other marks in it. The sky is clear and blue, and there are a few fluffy clouds visible in the distance.'},\n",
+ " {'image_id': '767',\n",
+ " 'caption': \"This image is a logo for the Ram truck brand. It features a ram's head in the center of the logo, with the words 'RAM' written in bold letters above and below the image. The ram's head is shown in a profile view, with its horns and beard clearly visible. The overall design of the logo is sleek and modern, with a black and white color scheme that gives it a bold, eye-catching look.\"},\n",
+ " {'image_id': '768',\n",
+ " 'caption': \"The image depicts an abstract representation of a lighthouse. The lighthouse is made up of geometric shapes and lines, with the light shining from the top of the structure. The lighthouse stands on a rocky outcropping, with waves crashing against the shore in the foreground. The image is in black and white, with the lighthouse and waves standing out against the white background.\\n\\nThe lighthouse is a symbol of guidance and direction, and is often used as a metaphor for finding one's way in life. The waves represent the challenges and obstacles that we face in our journey, and the light from the lighthouse represents the hope and promise of a better future. The image could be interpreted as a reminder to keep moving forward, even in the face of adversity.\"},\n",
+ " {'image_id': '769',\n",
+ " 'caption': 'This image is of a bedroom with a yellow and white theme. The walls are painted yellow, and there is a white canopy bed with a red and white striped rug on the floor. The room has large windows that let in a lot of natural light. There is a white desk with a computer on it in the corner, as well as a red and white striped chair.'},\n",
+ " {'image_id': '770',\n",
+ " 'caption': 'This image is an abstract design composed of diagonal lines in different shades of purple, green, and white. The lines form a swirling and chaotic pattern that gives a sense of energy and movement. The design is dynamic and eye-catching, suitable for use in several contexts like a website background or a graphic element in a marketing campaign.'},\n",
+ " {'image_id': '771',\n",
+ " 'caption': 'This image appears The image is a painting of two people, a man and a woman, standing next to each other. The man is wearing a tuxedo and the woman is wearing a long, flowing dress. They are both looking at each other with smiles on their faces. The background of the painting is a golden, ornate frame.'},\n",
+ " {'image_id': '772',\n",
+ " 'caption': 'The painting depicts a snowy road in the woods at sunset. The sky is pink and there are trees on either side of the road. The road is covered in snow and there are tire tracks in the snow. There are no buildings or other structures visible in the painting. The overall mood of the painting is peaceful and serene.'},\n",
+ " {'image_id': '773',\n",
+ " 'caption': \"This is a close up shot of a bride's hands, adorned with intricate henna designs. The bride is wearing a red and gold lehnga, a traditional Indian outfit for brides. The henna designs on her hands symbolize good luck and prosperity in Indian culture. The bride's hands are also adorned with gold jewelry, including bangles and rings. The henna designs on her hands are intricate and detailed, featuring floral motifs and geometric patterns, creating a beautiful and traditional look for the bride on her wedding day.\"},\n",
+ " {'image_id': '774',\n",
+ " 'caption': \"The image shows a wire that is connected to a small, circular object. The object appears to be made of plastic and has a small hole in the center. The wire is connected to the object at one end and is looped around the object at the other end. The wire is thin and appears to be made of metal. There are no other objects visible in the image.'\"},\n",
+ " {'image_id': '775',\n",
+ " 'caption': 'the statue add to the sense of reverence for the player and his accomplishments. The building in the background also adds to the sense of importance and grandeur surrounding the statue.'},\n",
+ " {'image_id': '777', 'caption': 'This is an image of a postage'},\n",
+ " {'image_id': '778',\n",
+ " 'caption': \"This image shows a bride and groom standing on a wooden fence in the middle of a field. The bride is wearing a white dress and the groom is wearing a white shirt and black pants. The couple is embracing and looking into each other's eyes. The sky is blue and there are some clouds in the background. The grass is green and there are some trees in the distance. The image is taken from a low angle, looking up at the couple.\"},\n",
+ " {'image_id': '779',\n",
+ " 'caption': \"This is a framed picture with a brown wooden frame. Inside the frame, there is a black and white photograph of a man standing on a stage with a microphone in his hand. The man is wearing a suit and tie, and he is looking directly at the camera with a serious expression on his face. Behind the man, there is a large banner with the words 'person' printed on it in white letters. There is a spotlight shining down on the man from above, and it is casting a shadow on the floor in front of him. The background of the photograph is dark.\"},\n",
+ " {'image_id': '780',\n",
+ " 'caption': 'This image shows a small room with a bunk bed and a ladder leading to the top bunk. The walls are painted red, and there are curtains hanging from the ceiling. The room appears to be in a state of disarray, with clothes and other items scattered on the floor.'},\n",
+ " {'image_id': '781',\n",
+ " 'caption': 'The image shows a boat floating in the middle of a large body of water at sunset. The sun is setting behind the horizon, casting a warm orange glow on the water and the boat. The boat appears to be made of wood and has a curved shape with a pointed bow and stern. It has a single mast with a sail that is billowing in the breeze. The water is calm and there are no other boats or objects visible in the scene. The sky is a deep shade of purple with clouds that are tinged with pink and orange. The overall effect of the image is peaceful and serene, with a sense of stillness in the water and the sky.'},\n",
+ " {'image_id': '782',\n",
+ " 'caption': 'The image shows a living room with a table, chairs, and a lamp. There is also a window with a view of the outside. The room is decorated with a rug and a painting on the wall.'},\n",
+ " {'image_id': '784',\n",
+ " 'caption': 'This image shows a bathroom with a sink, toilet, and bathtub. Two beds are present in the room, one with a white bedspread and the other with a green blanket. The walls are painted white, and there are two windows with white curtains. The floor is tiled with white and grey tiles.'},\n",
+ " {'image_id': '785',\n",
+ " 'caption': 'The image shows a dart hitting a bullseye in the center of a yellow and black target with a red dot in the middle. The dart is stuck in the bullseye and the image is on a black background.'},\n",
+ " {'image_id': '787',\n",
+ " 'caption': 'This image shows a setup for an outdoor event. Tables covered in black tablecloths and white paper lanterns are hanging from the ceiling. Pink and white flowers adorn the tables, and chairs are set up for guests to sit in. It appears to be a wedding or other formal gathering.'},\n",
+ " {'image_id': '788',\n",
+ " 'caption': 'This image shows a bride and groom sharing a kiss at their wedding. The bride is wearing a white wedding dress and the groom is wearing a black tuxedo. They are standing in front of a fireplace with a large mirror above it. The room appears to be decorated in a rustic style, with wooden beams on the ceiling and a wooden floor. There are several pieces of furniture in the room, including a large wooden table and chairs.'},\n",
+ " {'image_id': '790',\n",
+ " 'caption': 'This image shows a bride and groom standing in a field of flowers. The bride is wearing a white dress and the groom is wearing a black suit with a white shirt and tie. They are both smiling and looking at each other. The sun is shining on their faces and there are flowers in the background.'},\n",
+ " {'image_id': '791',\n",
+ " 'caption': 'The image shows a bottle of hand sanitizer on a blue cloth. The bottle is clear and has a white cap with a spray nozzle on the top. The nozzle is open and there is a small amount of liquid coming out of it. The background is a white and blue striped cloth.'},\n",
+ " {'image_id': '792',\n",
+ " 'caption': \"This is an image of a disassembled laptop with the hard drive being removed from the motherboard. The hard drive is connected to the motherboard by two cables, with one being held by the person's hand in the image. The person is gently pulling on the cable to remove the hard drive.\\n\\nThe laptop has a black case with a silver keyboard and touchpad. There are vents on the bottom of the laptop to cool the components. The hard drive is on the right side of the laptop, and the power supply is on the left.\\n\\nSeveral screws hold the laptop together, which need to be removed to access the hard drive. The screws are near the vents on the bottom of the laptop.\"},\n",
+ " {'image_id': '793',\n",
+ " 'caption': 'The image shows a bowl filled with oatmeal, which is a type of cereal made from rolled oats. The oatmeal is a light brown color and appears to be uncooked. The bowl is made of dark brown clay and has a rough surface. There is a small amount of the oatmeal spilled out of the bowl onto the white surface below it. The image is a top down view of the bowl and the oatmeal inside it.'},\n",
+ " {'image_id': '794',\n",
+ " 'caption': \"This image depicts a close-up view of a person's face, with their eyes closed and their mouth slightly open. The person appears to be in a state of deep thought or contemplation. The lighting in the image is dim, with shadows cast on the person's face and the wall behind them. The person's skin appears to be smooth and unblemished, and their hair appears to be dark and straight. The overall mood of the image is introspective and thoughtful.\"},\n",
+ " {'image_id': '795',\n",
+ " 'caption': 'This image shows a bedroom with a beige carpet and white walls. There is a large window with wooden shutters on the left side of the room. The bed is made with a beige comforter and pillows, and there is a brown leather chair in the corner of the room. The room has a high ceiling with wooden beams, and there is a chandelier hanging from the center of the room.'},\n",
+ " {'image_id': '796',\n",
+ " 'caption': 'This image shows a wooden porch with two rocking chairs and a stone wall in the background. The chairs have cushions on them. The porch has a wooden railing and a wooden floor. The stone wall is made of large stones and has a wooden door in the center with a knob. Trees can be seen in the background and the sky is cloudy.'},\n",
+ " {'image_id': '799',\n",
+ " 'caption': 'The image shows a group of young men posing with a basketball in a gym. They are all wearing red and white uniforms, and one of them is holding a ball. They are all smiling and looking at the camera. The walls of the gym are made of wood and there are baskets hanging from the ceiling. The floor is made of hardwood and there are lines on it. There are windows on one side of the gym that let in natural light. The image is well lit and the colors are vibrant.'},\n",
+ " {'image_id': '800',\n",
+ " 'caption': 'This is a bathroom with a large bathtub, a sink, a toilet, and a walk-in shower. The walls are beige, and the floor is tiled. There is a large mirror on the wall above the sink, and a large window lets in plenty of natural light. The room is spacious and well-lit.'},\n",
+ " {'image_id': '801',\n",
+ " 'caption': 'The painting shows a group The painting depicts a group of people gathered around a table in a dimly lit room. They are dressed in period clothing and appear to be enjoying a meal together. The table is set with plates, glasses, and silverware. The room is decorated with tapestries on the walls and a chandelier hanging from the ceiling. The overall mood of the painting is warm and inviting.'},\n",
+ " {'image_id': '802',\n",
+ " 'caption': 'This cake is a yellow cake with a cartoon bird image on top. The bird is smiling and has blue eyes and a pink beak. The cake is decorated with white frosting and has yellow flowers on the sides.'},\n",
+ " {'image_id': '803',\n",
+ " 'caption': 'This is an image of a sunflower painted in watercolor style with yellow petals and a brown center on a light beige background. There is a green stem coming out of the top of the sunflower and leaves are scattered around it.'},\n",
+ " {'image_id': '804',\n",
+ " 'caption': 'This image shows a red fire truck parked in a lot, with a ladder on the back that indicates it may be used for aerial operations.'},\n",
+ " {'image_id': '805',\n",
+ " 'caption': 'This image shows two military vehicles parked next to each other in front of a large building. The first vehicle is a small, armored vehicle with tracks and a large gun mounted on top. The second vehicle is a larger, armored vehicle with tracks and a large gun mounted on top as well. The building behind them appears to be a warehouse or storage facility.'},\n",
+ " {'image_id': '806',\n",
+ " 'caption': 'This image shows an older man in a blue shirt pointing at a large, old, hand-drawn map of a city or town on the wall. The map has various streets, buildings, and landmarks marked on it. The man is pointing at a specific location, perhaps indicating where he lives or where he is from. He has a serious expression on his face and glasses perched on his nose. The room appears dimly lit with only a small amount of light coming in from the window behind him. The image suggests a sense of history and nostalgia.'},\n",
+ " {'image_id': '807',\n",
+ " 'caption': 'The image shows a group of people walking on a sandy beach next to a large body of water. The water is a bright blue and there are mountains in the background. The sky is clear and the sun is shining down on the scene. There are trees on the sides of the beach and some rocks in the water.'},\n",
+ " {'image_id': '808',\n",
+ " 'caption': 'The image shows a green clock with a white face and yellow hands mounted on a white wall. The clock has a round shape.'},\n",
+ " {'image_id': '809',\n",
+ " 'caption': 'This image shows a bedroom with a beige carpet and white walls. There is a large bed with a wooden frame and a beige bedspread. The bed is positioned in front of a large window that looks out onto a balcony or patio. There is a small table with a lamp on it next to the bed. The room is spacious and well lit.'},\n",
+ " {'image_id': '812',\n",
+ " 'caption': 'The image shows a group of people rafting down a river. They are all wearing life jackets and are smiling as they ride the rapids. The water is splashing up around them and they are holding onto the raft for dear life.'},\n",
+ " {'image_id': '813',\n",
+ " 'caption': 'This is an image of a modern and minimalist house with a wooden facade, a small garden, a large window on the second floor, and a small balcony on the first floor. The garden is filled with small trees and bushes, and there is a path leading to the front door.'},\n",
+ " {'image_id': '814',\n",
+ " 'caption': 'The image shows a group of men playing volleyball. They are all wearing red and white uniforms, with one player wearing a blue shirt. The player in the middle is reaching up to block the ball with his hand. The other players are positioned around the court, ready to hit the ball.'},\n",
+ " {'image_id': '818',\n",
+ " 'caption': 'There are several muffins on a white table with a plate of sliced strawberries in the background. The muffins have a crumbly texture and are topped with a sprinkle of sugar. The strawberries are sliced and arranged in a bowl on the table.'},\n",
+ " {'image_id': '819',\n",
+ " 'caption': 'This is a two-story house with a large front yard and a driveway leading to the garage. The exterior of the house is made of stucco and brick, with large windows on the front and side. The roof is made of shingles and there is a chimney on the left side. The front yard is landscaped with trees and shrubs, and there is a patio area in the back.'},\n",
+ " {'image_id': '820',\n",
+ " 'caption': \"The image shows a businessman in a suit and tie pressing a button on a touch screen. The background is a blue gradient with white squares and rectangles. The businessman's hand is pressed against the screen, with his index finger on the button. The image is a representation of technology and the use of touch screens.\"},\n",
+ " {'image_id': '821',\n",
+ " 'caption': 'This is an image of a bartender pouring a cocktail made with vodka, lime juice, and ice into a clear glass. The bartender is wearing a white shirt with black sleeves and a black apron and is holding a stainless steel cocktail shaker with a black handle in their right hand. The drink is garnished with a lemon wedge and a thin slice of lemon on the rim of the glass. The background is a wooden bar with a black countertop and shelves of liquor bottles behind the bartender.'},\n",
+ " {'image_id': '822',\n",
+ " 'caption': 'people in lab coats looking at beakers and test tubes filled with different colored liquids on a wall behind them. The background features a green and blue gradient circle.'},\n",
+ " {'image_id': '823',\n",
+ " 'caption': 'The image depicts a red Dodge Charger driving down a tree-lined road. The car is a large, four door sedan with a bold front grille, sleek lines, tinted windows, and a spoiler on the back. It appears to be in good condition.'},\n",
+ " {'image_id': '824',\n",
+ " 'caption': 'This image shows a bride and groom walking down the aisle at their wedding in front of a large, ornate building. The sun is shining brightly in the background, casting long shadows on the ground and illuminating the scene. The bride is wearing a white wedding gown and carrying a bouquet of flowers, while the groom is wearing a black tuxedo and holding a single red rose. The ceremony is taking place on a grassy area in front of the building, and there are several rows of chairs set up for guests to watch. The building has a large, arched entrance and several windows, and there are several trees and bushes in the foreground.'},\n",
+ " {'image_id': '825',\n",
+ " 'caption': 'The image depicts a colorful mural on the side of a building. The mural shows a group of people holding umbrellas and walking in the rain, with the umbrellas having intricate designs. The people in the mural are smiling and seem to be enjoying their walk in the rain. The background of the mural is a bright blue sky with fluffy clouds. The overall effect of the mural is cheerful and uplifting.'},\n",
+ " {'image_id': '826',\n",
+ " 'caption': 'This image shows a young boy standing in front of a fence with a group of cows behind him. The boy is reaching his hand out to pet one of the cows. The cows are in a pen, and there is hay on the ground in front of them. The boy is wearing a plaid shirt and jeans, and he has short brown hair. The cows are brown with white spots, and they have long tails. The fence is made of wood and has a gate in the middle. There are trees in the background, and the sky is cloudy.'},\n",
+ " {'image_id': '828',\n",
+ " 'caption': 'This image depicts a white hatchback car driving alongside a body of water on a road with a mountain in the background. The car has four doors, tinted windows, alloy wheels, and a small rear spoiler. It appears to be in good condition with a clean and shiny white finish, no visible scratches or dents, and a well-maintained appearance. The front grille of the car displays its logo, and the license plate is not visible.'},\n",
+ " {'image_id': '830',\n",
+ " 'caption': 'This image shows a bedroom with white walls, hardwood floors, and a large window with white curtains. The bed has a white headboard, blue and white striped bedspread, and blue pillows. There is also a small table with a lamp next to the bed.'},\n",
+ " {'image_id': '831',\n",
+ " 'caption': 'The image shows a couple standing on a grassy hill overlooking a body of water. The man is holding the woman in his arms and they are both smiling. The sky is cloudy and there are mountains in the background.'},\n",
+ " {'image_id': '832',\n",
+ " 'caption': 'The image shows a group of people standing on top of a snowy mountain. They are wearing backpacks and winter clothing, and are posing for a group photo. The sky is cloudy and there are mountains in the background.'},\n",
+ " {'image_id': '833',\n",
+ " 'caption': 'This image is a black and white photograph of a glass dome with a silhouette of a person inside it. The person is standing with their back to the viewer, and their head is tilted to the side. The words \"i am not a robot\" are written in white letters on the black background. The image is simple and minimalistic, with a clear focus on the person inside the dome.'},\n",
+ " {'image_id': '834',\n",
+ " 'caption': 'The image shows two ears of corn on the cob. The corn is yellow and appears to be freshly harvested. There are several kernels visible on each ear, and the husks are still attached. The ears are lying on top of each other, with one ear slightly tilted to the side. The background is a blue sky with fluffy white clouds.'},\n",
+ " {'image_id': '835',\n",
+ " 'caption': 'The image is a rubber stamp with the words blood sugar level written on it in blue ink. The stamp appears to have been stamped onto a transparent surface, such as a piece of paper or plastic. The words blood sugar level are written in capital letters, with the first letter of each word in a larger font than the rest of the words. The stamp appears to be made of rubber, with a slightly raised texture on the surface. The words blood sugar level are written in blue ink, with the first letter of each word in a slightly darker shade of blue than the rest of the words. The stamp appears to have been stamped onto the surface with some force, as there are small indentations in the rubber where the ink has been pressed down. The overall appearance of the stamp is slightly worn, with some scratches and scuffs on the surface.'},\n",
+ " {'image_id': '837',\n",
+ " 'caption': 'This is an image of a white plastic water filtration system with two hoses connected to the top. The words \"triple clear\" are written on it in blue letters. The filtration system removes impurities from water, making it safe to drink.'},\n",
+ " {'image_id': '838',\n",
+ " 'caption': 'The image depicts a rural field with fully grown crops, likely wheat or corn. No buildings or structures are visible. The sky is overcast, and the image was probably taken during the growing season. It is unclear whether the farm is small and independent or larger and commercial, or whether it grows crops for personal use or sale. The image highlights the important role of the agricultural industry in providing food for people worldwide.'},\n",
+ " {'image_id': '839',\n",
+ " 'caption': 'the background. The harness has a leash attached to it, and the dog is also wearing a yellow vest and collar with a tag. The background consists of trees and a cloudy sky.'},\n",
+ " {'image_id': '840',\n",
+ " 'caption': 'The image shows a group of young men standing on the deck of a ship. They are all wearing coats and scarves, and one of them is holding a guitar. The ship appears to be in a state of disrepair, with rust and debris visible on its hull and deck. The men are looking out to sea, as if they are waiting for something to happen. The overall mood of the image is one of anticipation and uncertainty.'},\n",
+ " {'image_id': '841',\n",
+ " 'caption': \"This image is a creative illustration of a cartoon tea kettle with steam coming out of its spout. The tea kettle is depicted in a stylized manner, with a round body and a curved spout. The steam coming out of the spout is depicted as a wavy line. The overall color scheme of the image is a warm gradient, with the tea kettle and steam depicted in shades of yellow and orange. The background of the image is not visible, as it is obscured by the steam. This image could be used in a variety of contexts, such as in a children's book illustration, on a product label, or as part of a graphic design. It could also be used as a decorative element in a home or office setting.\"},\n",
+ " {'image_id': '842',\n",
+ " 'caption': 'This is an image of gold and silver coins arranged in a pyramid shape on a black background. The gold coins are on the bottom and the silver coins are on top. The gold coins are shiny and reflective, while the silver coins are dull and matte. The image conveys a sense of wealth and prosperity.'},\n",
+ " {'image_id': '843',\n",
+ " 'caption': 'The image shows a group of cartoon characters The cartoon characters are all different animals, including a raccoon, a cat, a dog, and a rat. They are all wearing different outfits and posing in different positions. The background is a plain yellow color.'},\n",
+ " {'image_id': '846',\n",
+ " 'caption': 'The bride is wearing a white robe and standing in front of a window with a wreath hanging on it. The wreath is made of greenery and has a bow on top. The bride is looking at her reflection in the mirror.'},\n",
+ " {'image_id': '847',\n",
+ " 'caption': 'The image shows a female golfer hugging a male golfer on a golf course. The female golfer is wearing a green polo shirt and white pants, while the male golfer is wearing a white polo shirt and black pants. Both golfers have their arms around each other and are smiling at the camera. The background is a green golf course with trees and a blue sky.\\n\\nThe female golfer is wearing a green polo shirt and white pants, while the male golfer is wearing a white polo shirt and black pants. Both golfers have their arms around each other and are smiling at the camera. The background is a green golf course with trees and a blue sky.'},\n",
+ " {'image_id': '848',\n",
+ " 'caption': 'The image is of a woman wearing a black t-shirt with the words \"I\\'m a drummer\\'s wife\" written in pink letters on the front. The shirt has a relaxed fit and is made of a soft, breathable fabric. It is a simple and stylish design, which is suitable for anyone who loves music or wants to show their love for drumming. The shirt is comfortable and easy to wear, making it a great way to showcase your personality and style.'},\n",
+ " {'image_id': '850',\n",
+ " 'caption': 'This image shows a large, tan building with white columns and black shutters on the windows. The front of the building has a covered porch with white pillars and a balcony on the second floor. The building is surrounded by trees and landscaping, including bushes and flowers. There is a sidewalk leading up to the entrance of the building and a driveway on the right side. The building appears to be well maintained and in good condition.'},\n",
+ " {'image_id': '851',\n",
+ " 'caption': 'The image depicts a building with pink and blue bricks and a pool located at the center. The pool has pink and blue tiles surrounding it and a blue diving board at one end. A ladder with pink and blue painted rungs leads down to the pool. The building has windows on the top floor and a pink and blue roof. The sky is blue with clouds in the background.'},\n",
+ " {'image_id': '852',\n",
+ " 'caption': 'This image shows a man standing next to a table holding a trophy. The man is wearing a black leather jacket and is smiling at the camera. The trophy appears to be made of metal and has engravings on it. The background of the image is a dimly lit room with a white ceiling and walls.'},\n",
+ " {'image_id': '853',\n",
+ " 'caption': \"The image shows a small wooden house on a hill with a red roof and white walls overlooking a lake surrounded by green hills and flowers. A path leads down to the water's edge. The sky is bright blue with fluffy white clouds, creating a peaceful and serene mood.\"},\n",
+ " {'image_id': '854',\n",
+ " 'caption': 'This image shows a group of people standing outside in front of a building. They are all dressed in different clothing, with one person wearing a hat and sunglasses, another wearing a suit and tie, and the others wearing casual clothing. They are all smiling and appear to be having a good time. There is a car parked in front of the building and some trees in the background.'},\n",
+ " {'image_id': '855',\n",
+ " 'caption': 'The image shows a herd of bison grazing near a lake in a forest area. The bison are standing on the grassy shore of the lake, with their heads down as they eat. In the background, there are tall pine trees and mountains visible. The sky is clear and blue, with a few fluffy clouds visible. \\n\\nThere are several bison in the image, ranging in size from small calves to large adults. They have shaggy brown fur and large heads with distinctive humps on their backs. Some of the bison are standing on their hind legs, while others are lying down or grazing on the grass.\\n\\nThe lake is surrounded by lush green forest, with tall pine trees and other types of foliage visible. The water is clear and reflects the trees and mountains in the background.'},\n",
+ " {'image_id': '856',\n",
+ " 'caption': \"flowing through the valley and the surrounding hills and mountains providing a breathtaking backdrop. The crystal clear water, pink flowers, and lush greenery add to the scene's tranquility, while the rocky outcrop and trees provide a sense of stability and grounding. The presence of snow on the distant mountains also adds a touch of harshness and contrast to the otherwise peaceful scene.\"},\n",
+ " {'image_id': '857',\n",
+ " 'caption': 'The dress is made of a sheer, lacy material with intricate embroidery on the bodice and sleeves. It has a high neckline and a fitted waist, with a flared skirt that falls just above the knee. The dress has a zipper closure on the back. The color of the dress is a light grey with white and silver embroidery. The model is wearing a pair of strappy sandals with the dress.'},\n",
+ " {'image_id': '858',\n",
+ " 'caption': 'The image depicts a sliced orange on a white background. There are six slices of the orange in the image, each with its own unique shape and color. The orange slices are arranged in a circular pattern, with one slice in the center and the other slices surrounding it. The orange slices have a bright, vibrant color that stands out against the white background. The overall effect of the image is one of freshness and vitality, as the orange slices appear to be freshly sliced and ready to be enjoyed.'},\n",
+ " {'image_id': '860',\n",
+ " 'caption': \"The image is a white t-shirt with a red and orange design on the front that reads `libor university'. The design features a shield with the words `libor university' written on it in red and orange letters. The shield is surrounded by a red and orange border with the same words. The t-shirt also features the university's logo, which is a shield with the same words written on it in red and orange letters.\"},\n",
+ " {'image_id': '862',\n",
+ " 'caption': \"The image depicts a cartoon bear wearing a suit and bow tie, standing with his hands in his pockets. The bear is wearing a plaid jacket and brown pants, and has a brown bow tie around its neck. The bear's face is not visible, but it appears to be smiling. The image is in black and white, with a white background.\\n\\nThe bear is wearing a suit and bow tie, which suggests that it is dressed up for a formal occasion. The brown pants and jacket give the bear a rustic, outdoorsy look, while the bow tie adds a touch of sophistication. The bear appears to be smiling, which suggests that it is happy or excited about something. The white background helps to make the bear stand out and gives the image a clean, crisp look.\"},\n",
+ " {'image_id': '863',\n",
+ " 'caption': 'There are three tables in the image, all made of wood with round tops. One has a brown finish while the other two have a lighter color. The tables are arranged in a row, with the largest on the left and the smallest on the right. The room has concrete walls and a metal door on the left side. No chairs or other furniture are present.'},\n",
+ " {'image_id': '865',\n",
+ " 'caption': 'This image shows a metal pot filled with a white liquid being stirred by a wooden spoon. A measuring cup with an unidentified liquid is placed beside the pot on a wooden table with a brown surface.'},\n",
+ " {'image_id': '867',\n",
+ " 'caption': 'The image shows a man and a woman standing next to each other at a party. The man is wearing a black suit and the woman is wearing a brown dress. They are both holding drinks and smiling at the camera. There are other people in the background, sitting at tables and standing around them. The room is dimly lit and there are chandeliers hanging from the ceiling.'},\n",
+ " {'image_id': '869',\n",
+ " 'caption': \"This is a photo of a bookstore. The walls are painted white and there are shelves of books on the walls, filled with a variety of books, including fiction, nonfiction, and children's books. The floor is made of tiles and there is a carpeted area in the center of the room. The room is well lit with overhead lights and floor lamps on the shelves. There is a large window on one wall that lets in a lot of natural light. Two people are in the photo, one standing by the window and the other in front of the shelves.\"},\n",
+ " {'image_id': '870',\n",
+ " 'caption': 'This image is a flyer or poster advertising a yoga class or event. It features a photograph of a person in a yoga pose on the left side, with the words \"Sound Immersions at Brevard Yoga on the 3rd Sunday of the month at 7pm\" written above. On the right side, there is an image of a set of drums and other musical instruments, with the same text written above them. The overall design is clean and simple, with a white background and bold, black text.'},\n",
+ " {'image_id': '872',\n",
+ " 'caption': \"This is a white husky puppy sitting on the grass, looking up at the camera with its tongue out. The puppy's fur is fluffy and white, and its eyes are bright blue. The grass is green and lush, and there is a blue sky in the background. The puppy's ears are perked up and its tail is wagging. There is a fence and some trees in the distance.\"},\n",
+ " {'image_id': '875',\n",
+ " 'caption': 'The image depicts a glowing globe surrounded by a network of lines and dots. The lines and dots appear to be connected to the globe and to each other, forming a complex pattern. The globe is glowing with a bright blue light, and the lines and dots around it are also glowing in different shades of blue and green. The overall effect is a futuristic, high-tech image that suggests the interconnectedness of technology and the world.\\n\\nThe image could be used to represent a variety of concepts related to technology and the world, such as globalization, interconnectedness, and the role of technology in shaping our understanding of the world. It could also be used to represent the idea of a \"global village,\" in which technology has made the world smaller and more interconnected. Overall, the image suggests a futuristic, high-tech vision of the world that is connected and interdependent.'},\n",
+ " {'image_id': '876',\n",
+ " 'caption': 'This image shows the inside of a building with intricate carvings and decorations on the walls and ceiling. The floor is made of wooden planks and there are two large elephant tusks on either side of the entrance. The walls are decorated with carvings of various animals, including elephants, lions, and giraffes. There is a large red curtain hanging from the ceiling in the center of the room.'},\n",
+ " {'image_id': '877',\n",
+ " 'caption': 'This image depicts a person pushing a cart full of snow-covered boxes down a snowy street. The person is wearing a hooded sweatshirt and gloves, and is using both hands to push the cart. The boxes are stacked on top of each other. Cars are parked on both sides of the street, and buildings can be seen in the background. The sky is cloudy and snow is falling.'},\n",
+ " {'image_id': '878',\n",
+ " 'caption': 'The image shows a woman wearing a pink dress and a white hat. She has a serious expression on her face and is looking directly at the'},\n",
+ " {'image_id': '879',\n",
+ " 'caption': \"This image depicts the planet Mars, which is the fourth planet from the sun in our solar system. It is known for its reddish appearance and is often referred to as the Red Planet. The image shows a view of the planet from space, with the planet's surface visible in the foreground and the blackness of space in the background. There is a small satellite orbiting the planet, which can be seen in the upper left corner of the image. The satellite is equipped with various instruments to study the planet's atmosphere and surface.\"},\n",
+ " {'image_id': '880',\n",
+ " 'caption': 'This image depicts a djembe, which is a type of drum that originated in West Africa. The djembe is made from a hollowed tree trunk and has a goat skin head. It is played with sticks and is used in traditional African music. The image shows the djembe on a white background.'},\n",
+ " {'image_id': '881',\n",
+ " 'caption': 'The image shows a shoe organizer with six compartments, each containing a pair of shoes. The compartments are lined with beige fabric and have brown straps to keep the shoes in place. The bottom of the organizer has a zipper to close it.'},\n",
+ " {'image_id': '882',\n",
+ " 'caption': 'This image shows a clean desk, which is a sign of an organized and tidy workspace. The desk appears to be made of wood and has a computer, monitor, keyboard, mouse, and other office supplies on it. There is a lamp on the desk, and the room appears to be well lit. The walls of the room are painted white, and there is a window behind the desk that lets in natural light. The floor is made of wood or laminate, and there are no visible clutter or mess on the desk or surrounding area. The overall impression is one of cleanliness and organization.'},\n",
+ " {'image_id': '884',\n",
+ " 'caption': 'This image is a production cel from the animated film, \"Snow White and the Seven Dwarfs.\"'},\n",
+ " {'image_id': '885',\n",
+ " 'caption': 'This image shows a man sitting at a table with a plate of food in front of him. The man is wearing a suit and holding a glass of wine. There are several other plates of food on the table, including meat, vegetables, and bread. The table is covered with a white tablecloth and has a vase of flowers. The room appears to be a dining room, with a large window behind the man and a chandelier hanging from the ceiling. The walls are painted a light color, and there are several pieces of artwork. The overall atmosphere of the image is elegant and sophisticated.'},\n",
+ " {'image_id': '887',\n",
+ " 'caption': 'The image shows a group of people standing in a circle on a green lawn, holding hands and looking towards the center where a small tree with a red ribbon tied around it is located. They appear to be happy and smiling, while the sun shines down on the scene, casting long shadows on the grass. The mood of the image is joyful and unifying.'},\n",
+ " {'image_id': '888',\n",
+ " 'caption': 'The image shows a metal pegboard with tools arranged in the shape of the letter \"b\", including hammers, wrenches, and screwdrivers. It is mounted on a white background.'},\n",
+ " {'image_id': '889',\n",
+ " 'caption': \"This is a Lego model of a humanoid robot. It has a white and gray color scheme with black accents on the arms and legs. The robot is standing on its hind legs with its arms outstretched. It appears to be holding something, but it's unclear what it is. The robot has a sleek and futuristic design, with intricate details on the body and head, resembling a character from a science fiction movie or video game.\"},\n",
+ " {'image_id': '890',\n",
+ " 'caption': 'The image shows a beach with the ocean in the background. A large tree branch hangs over the sand, and a small dog stands on the beach in front of it. The sky is clear and blue with a few fluffy clouds in the distance. The water is deep blue with small waves washing up onto the shore. The sand is white with small rocks and shells scattered around. Palm trees and greenery are in the background.'},\n",
+ " {'image_id': '891',\n",
+ " 'caption': 'The image shows a rooftop garden surrounded by a metal fence with stairs leading up to it. The garden contains various plants, including grasses and shrubs. In the background, there is a construction site with cranes and scaffolding. The sky is clear and blue.'},\n",
+ " {'image_id': '892',\n",
+ " 'caption': 'This is a white mug with an image of a person wearing a red and white striped shirt and black hair. The mug is placed on a white background.'},\n",
+ " {'image_id': '893',\n",
+ " 'caption': 'This image depicts a young boy playing tennis with a racket on a white background. The boy is wearing blue shorts and a green shirt, and he is holding a tennis racket in his hand. The image is a simple illustration of a young boy playing tennis, and it could be used in a variety of contexts, such as on a website, in a book, or as part of a design for a poster or brochure. The image could also be used as a decorative element, or as a part of a larger illustration or design.'},\n",
+ " {'image_id': '894',\n",
+ " 'caption': 'The image shows a small waterfall flowing over rocks in a wooded area surrounded by lush green foliage and trees. The rocks are covered in moss and lichen. The water is clear and appears to be flowing from a higher elevation to a lower one, giving a sense of tranquility with the sound of the water and peaceful surroundings.'},\n",
+ " {'image_id': '895',\n",
+ " 'caption': \"The image is a movie poster for the comedy horror film Young Frankenstein, directed by Mel Brooks and starring Gene Wilder, Madeline Kahn, and Cloris Leachman. It was released in 1974 and parodies the classic Universal horror films of the 1930s and 1940s, specifically Frankenstein and Bride of Frankenstein. The story follows a young neurosurgeon who inherits his grandfather's castle in Transylvania and attempts to reanimate his grandfather's monster. The film features iconic characters such as the monster, played by Peter Boyle, and the mad scientist, played by Gene Wilder. Young Frankenstein was a box office success and has become a cult classic.\"},\n",
+ " {'image_id': '897',\n",
+ " 'caption': 'the foreground and a distant mountain range in the distance. The sky is a deep orange color, with a few clouds visible in the distance.'},\n",
+ " {'image_id': '898',\n",
+ " 'caption': 'The image shows a woman standing next to a car at a gas station. She is wearing a blue crop top and ripped jeans, and has her hair styled in a messy bun. There is a gas pump in the foreground, and the woman appears to be getting out of the car. The image is well lit and in focus, with a clear blue sky in the background.'},\n",
+ " {'image_id': '899',\n",
+ " 'caption': 'The image shows a group of people standing around a white Mercedes S Class Coupe on display at an auto show. The car has a sleek, modern design. Some people in the image are taking pictures of the car while others are just standing and looking at it. In the background, there are other cars on display, including a red sports car and a black sedan. The people in the image are dressed in a variety of clothing, including suits, dresses, and casual outfits. The overall atmosphere of the image is one of excitement and interest in the cars on display.'},\n",
+ " {'image_id': '902',\n",
+ " 'caption': 'This image shows a bride and groom walking through an olive grove at sunset. The bride is wearing a white wedding dress and the groom is wearing a black tuxedo. They are holding hands and looking at each other with smiles on their faces. The sun is setting behind them, casting a warm orange glow over the scene. The trees in the background are tall and thin with changing leaves for the fall. The ground is covered with dry leaves and twigs, and there is a dirt path that leads through the grove.'},\n",
+ " {'image_id': '904',\n",
+ " 'caption': \"This is an image of a police officer wearing a black uniform with a rainbow heart badge on the chest. The officer is holding a pair of handcuffs in one hand and a baton in the other. The officer's face is not visible in the image. The image is in black and white.\"},\n",
+ " {'image_id': '905',\n",
+ " 'caption': 'This is a black and white photograph of a man in ski gear kneeling on a snowy slope with skis and poles at his feet. The man is wearing a white helmet, goggles, gloves, and a black jacket with white stripes on the sleeves. He is also wearing black pants and boots. In the background, there are mountains with snow on them and a blue sky.\\n\\nThe man is kneeling on the snow with his skis and poles at his feet. He is wearing a white helmet, goggles, gloves, and a black jacket with white stripes on the sleeves. He is also wearing black pants and boots. In the background, there are mountains with snow on them and a blue sky.'},\n",
+ " {'image_id': '906',\n",
+ " 'caption': 'This image is a black and white photograph of a woman in a ball gown standing in front of a mirror. The woman is wearing a long, flowing dress with a low neckline and a slit up the side. The dress is adorned with beads and sequins, and the woman is holding a bouquet of flowers in her left hand. Behind the woman, there is a reflection of a young girl in a pink dress standing on the other side of the mirror. The girl is looking at the woman in the ball gown with a curious expression on her face. The overall tone of the image is elegant and sophisticated.'},\n",
+ " {'image_id': '908',\n",
+ " 'caption': 'This is an image of a fish. It has a long, slender body with a small head and large eyes. The fish has a brown and white striped pattern on its body.'},\n",
+ " {'image_id': '909',\n",
+ " 'caption': 'This image shows a group of people sitting in a theater watching a movie. They are all wearing casual clothing and smiling at the camera. Some of them are holding drinks and snacks, while others are holding their phones. The theater is dimly lit, with only the light from the screen illuminating the room. The people in the image appear to be enjoying themselves and having a good time.'},\n",
+ " {'image_id': '910',\n",
+ " 'caption': 'The woman in the image is wearing a green and gold patterned dress with a plunging neckline, long sleeves, a fitted bodice, and a flared skirt. She is also wearing a brown jacket with gold detailing on the lapels and pockets, which has long sleeves and is open at the front. Her black high heels complement her outfit, and her hair is styled in loose waves. She is posing with her hands on her hips and a smile on her face in a dimly lit room with tables and chairs set up for a party or event.'},\n",
+ " {'image_id': '911',\n",
+ " 'caption': 'There is a stack of old and worn books on a desk in front of a chalkboard in a dimly lit room. An apple with a bite taken out of it sits on top of the stack. The chalkboard has writing on it, but the words are unclear. A single lightbulb hangs from the ceiling.'},\n",
+ " {'image_id': '912',\n",
+ " 'caption': 'The bride and groom are standing in front of a large tree with colorful leaves in the background. They are both smiling and looking at the camera. The bride is wearing a white wedding dress with a long veil and the groom is wearing a black tuxedo with a white shirt and tie.'},\n",
+ " {'image_id': '914',\n",
+ " 'caption': \"The image is a black and white photograph of a woman wearing a dress with a tropical print. The dress has a high neckline, long sleeves, and a flared skirt. The woman is wearing a pair of black heels and has her hair styled in a bun. The photograph is taken from a low angle, looking up at the woman's face and the dress. The background is a blur of green leaves and plants.\"},\n",
+ " {'image_id': '915',\n",
+ " 'caption': \"This is an image of a television screen with a red background displaying an image of a person standing in front of a white wall. The person is wearing a red shirt and black pants. The image is clear, well-defined, and high-resolution with no pixelation or blurriness. Fine details such as the wrinkles in the person's shirt and the texture of the wall can be seen. The colors are vibrant and well-saturated, with the red background providing bold contrast to the white wall and the person's clothing. Overall, this is a well-composed and visually appealing image.\"},\n",
+ " {'image_id': '916',\n",
+ " 'caption': 'This is a small white church with a steeple on top and a cross on the front. The building is surrounded by green grass and trees.'},\n",
+ " {'image_id': '918',\n",
+ " 'caption': 'The image shows a group of colorful buttons arranged in a circular pattern on a white cloth. Each button has a different color and shape, and they are all tied together with a piece of yarn. The buttons appear to be made of plastic or some other synthetic material, and they have a smooth, glossy surface. The overall effect of the image is one of bright, cheerful colors and playful, whimsical design.'},\n",
+ " {'image_id': '919',\n",
+ " 'caption': \"The image shows a group of people standing together in a line, with their arms around each other's shoulders. They are all smiling and looking at the camera. The background is white and there is no text or other elements in the image. The people in the image appear to be a group of friends or colleagues, standing together in a casual pose. They are all dressed in different clothing, with some wearing jeans and t-shirts, and others wearing more formal attire such as suits and dresses. The overall mood of the image is one of happiness and camaraderie.\"},\n",
+ " {'image_id': '922',\n",
+ " 'caption': 'This image shows a bowl of zucchini noodles with beef and vegetables, topped with sliced beef and vegetables, served in a white bowl with a wooden spoon holding the noodles. The background is a wooden table with a white tablecloth.'},\n",
+ " {'image_id': '923',\n",
+ " 'caption': 'This image features three sailboats sailing towards the camera on a calm body of water. In the background, there are tall buildings made of brick or concrete with windows, balconies, and some flags or banners. The sky is overcast with dark clouds in the distance. The sailboats include a small white sailboat with a red hull, a larger white sailboat with a blue hull and white sails, and a small white sailboat with a blue hull and white sails.'},\n",
+ " {'image_id': '925',\n",
+ " 'caption': 'This image depicts a large white building with many windows and balconies, located on a city street at dusk. The building has several floors and appears to be made of concrete and steel. There are several cars parked on the street in front of the building, and people can be seen walking on the sidewalks. The sky is pink and orange, indicating a sunset.'},\n",
+ " {'image_id': '927',\n",
+ " 'caption': 'This is an image of a black and white cat with yellow eyes looking directly at the camera with a curious expression. The text on the image reads, \"the amazing adventures of Marcello the cat.\" The cat appears to be well groomed and in good health. Its fur is smooth and shiny, eyes are bright and alert, whiskers are long and bushy, and ears are perked up and attentive. The background is solid black, making the cat stand out. The overall effect of the image is one of playfulness and cuteness.'},\n",
+ " {'image_id': '928',\n",
+ " 'caption': \"This image shows a group of people sitting on the steps of a building. They are all dressed in white shirts and shorts, and one person is holding a dog on a leash. The building behind them appears to be made of stone and has several windows and a door. The sky is cloudy and there are some trees in the background. The people in the image appear to be relaxing and enjoying each other's company. They are sitting in a row, with the person holding the dog sitting at the end. The person on the left is wearing sunglasses and has their arm around the person next to them. The person in the middle is smiling and has their hand on the shoulder of the person on the right, who is looking down and appears to be petting the dog. This image is a snapshot of a group of friends or family members enjoying a moment together outside.\"},\n",
+ " {'image_id': '929',\n",
+ " 'caption': 'The image shows a group of people standing in front of a large body of water, with lightning flashing in the background. The people in the image are dressed in dark clothing, with one person holding a staff and another holding a sword. The image appears to be a scene from a movie.'},\n",
+ " {'image_id': '930',\n",
+ " 'caption': 'This image is a botanical illustration of a plant with large, green leaves and small, white flowers. The leaves are long and thin, and the flowers are small and white with yellow centers. The plant appears to be growing out of the ground and has a long, thin stem. The background of the image is white.'},\n",
+ " {'image_id': '931',\n",
+ " 'caption': 'The image shows a bowl of white whipped cream topped with sliced lemons. The lemons are scattered around the bowl and on the table in front of it. The background appears to be a wooden surface.'},\n",
+ " {'image_id': '934',\n",
+ " 'caption': 'The image shows a beige scooter parked on a gray surface. The scooter has a brown seat and black wheels. The scooter appears to be in good condition, with no visible scratches or dents.\\n\\nThe scooter has a sleek, modern design, with a streamlined body and a small, round headlight. The handlebars are low and wide, making it easy for the rider to control the scooter. The fuel tank is located under the seat, and there is a small storage compartment under the seat as well.\\n\\nOverall, the scooter appears to be a reliable and stylish mode of transportation.'},\n",
+ " {'image_id': '935',\n",
+ " 'caption': \"This image is a statue of an angel The statue is made of stone and depicts an angel holding a baby in its arms. The angel's face is serene and its wings are outstretched, as if it is about to take flight. The statue is located in a garden surrounded by shrubs and trees.\"},\n",
+ " {'image_id': '936',\n",
+ " 'caption': 'The image shows a man and The man is wearing a black suit with a white shirt and a black tie. The woman is wearing a black suit with a white blouse and a black tie. Both are wearing black shoes and have their hands in their pockets. They are standing in front of a white background.'},\n",
+ " {'image_id': '937',\n",
+ " 'caption': 'This image shows a man wearing a blue shirt, white pants, and white sneakers walking down a light blue hallway with a blue carpet. He has his hands in his pockets, wears sunglasses, and has a serious expression on his face. No other people are visible in the image.'},\n",
+ " {'image_id': '938',\n",
+ " 'caption': 'This is an image of a man wearing a white apron and gloves, standing in front of a kitchen counter. The man is holding a large knife in his right hand and appears to be preparing food. The kitchen behind him is well equipped with various utensils and appliances, including a stove, oven, and refrigerator. The overall atmosphere of the image is warm and inviting, with soft lighting and a cozy ambiance.'},\n",
+ " {'image_id': '939',\n",
+ " 'caption': 'This image shows an older man standing next to a wooden fence with a cross on it. The man is wearing a gray sweater and has gray hair and a beard. He is looking at the camera with a serious expression on his face. Behind him, there is a house with a white picket fence and a small garden with some flowers in it. The sky is cloudy and there are some trees in the background.'},\n",
+ " {'image_id': '940',\n",
+ " 'caption': 'The image is a handshake between two people, one wearing a shirt with the colors of the South African flag and the other wearing a shirt with the colors of the United States flag. The handshake is a symbol of unity and cooperation between the two countries.\\n\\nThe colors of the South African flag are black, green, yellow, red, and blue, while the colors of the United States flag are red, white, and blue. The handshake is taking place in front of a white background, and both people are looking at each other with smiles on their faces.\\n\\nThe image can be interpreted as a symbol of friendship and cooperation between South Africa and the United States, as well as a representation of the diversity and unity of the two countries.'},\n",
+ " {'image_id': '942',\n",
+ " 'caption': 'This is a black and white ink drawing of a gate in the middle of a field. The gate is made of metal and has a rusty appearance. Tall grasses and weeds are growing on either side of the gate. In the background, a hill with trees on it can be seen, and in the distance, there is a body of water. The overall mood of the image is peaceful and serene.'},\n",
+ " {'image_id': '945',\n",
+ " 'caption': 'The image shows a person standing on top of a rocky mountain with snow covered peaks in the background. The person is wearing a red jacket, black pants, and a backpack. The sky is cloudy and there are no trees or other vegetation visible.'},\n",
+ " {'image_id': '946',\n",
+ " 'caption': 'The image shows a body of water with a large glass building in the background. Several boats are in the water, and a few people are standing on the shore. The sky is clear and blue with clouds in the distance. Tall buildings and trees are visible on the other side of the water.'},\n",
+ " {'image_id': '948',\n",
+ " 'caption': 'This image shows a white wedding dress with lace detailing on the bodice and skirt hanging on a wooden rack in a dimly lit room with wooden beams. The strapless gown has a sweetheart neckline and is made of a delicate, sheer material. Light is coming in from the windows and chandelier hanging from the ceiling. The room has white walls and a wooden plank floor with a rug in front of the dress.'},\n",
+ " {'image_id': '949',\n",
+ " 'caption': \"This image is of a Paw Patrol themed birthday party. The plate features several characters from the show, including Chase, the main character, and various vehicles such as a fire truck and police car. The plate has a blue and white background with the Paw Patrol logo in the center. Balloons and streamers in blue, red, and yellow, the show's signature colors, decorate the plate.\"},\n",
+ " {'image_id': '950',\n",
+ " 'caption': 'The woman in the image is wearing a white dress and a wide-brimmed straw hat while standing on a sandy beach. She has her hands on her hips and is looking off into the distance. The dress has a high neckline and long sleeves, and the hat has a wide brim to shade her face from the sun. She is also wearing white sneakers and has a small bag slung over her shoulder. The background features sandy beach, rocks, and palm trees in the distance.'},\n",
+ " {'image_id': '951',\n",
+ " 'caption': 'This is a map of the city of Mumbai, India. Mumbai is located on the coast of the Arabian Sea and is the largest city in India. It is divided into several districts, including South Mumbai, North Mumbai, East Mumbai, and West Mumbai. The city is known for its bustling streets, vibrant culture, and iconic landmarks such as the Gateway of India and the Taj Mahal Palace Hotel.'},\n",
+ " {'image_id': '952',\n",
+ " 'caption': \"This image shows a silver 2005 Honda Odyssey minivan parked in front of a brick building with graffiti on the side. The license plate on the back of the car reads `person'. The car appears to be in good condition, with no visible dents or scratches. The windows are tinted and the wheels are silver. There is a small amount of graffiti on the side of the building.\"},\n",
+ " {'image_id': '953',\n",
+ " 'caption': 'The image shows heart-shaped sunglasses with metal frames in a matte finish. The lenses are tinted in shades of pink, brown, and purple. The sunglasses are displayed in a grid pattern against a white background.'},\n",
+ " {'image_id': '954',\n",
+ " 'caption': 'The image shows two hands, one holding a small plant in a pot, and the other hand placed on top of the plant as if to protect it. The background is plain white, and the overall tone is peaceful and serene.'},\n",
+ " {'image_id': '955',\n",
+ " 'caption': 'This image shows the inside of a large, ornate cathedral with high vaulted ceilings and large stained glass windows. The walls are made of stone and have intricate carvings and decorations. The floor is made of marble and there are several pews for people to sit on. The altar at the front of the cathedral has a large crucifix on it and there are several candles lit around it. The overall atmosphere of the image is one of grandeur and reverence.'},\n",
+ " {'image_id': '956',\n",
+ " 'caption': 'This is a black folding bicycle with white accents. The front wheel is removed and leaning against a white wall. The bike has a black frame, white wheels, and white accents on the handlebars, seat, and pedals. The tires are black and in good condition. The bike has a small amount of dirt but is in overall good condition.'},\n",
+ " {'image_id': '957',\n",
+ " 'caption': 'The image depicts a grey leather shoulder bag with silver studs on the sides and a zipper closure on the top. The bag has a long adjustable strap for wearing it as a shoulder or crossbody bag. It also features a small tassel on the zipper pull. The bag appears to be empty with no visible pockets or other features.'},\n",
+ " {'image_id': '958',\n",
+ " 'caption': \"This image is a representation of a computer screen with an image of a person's hand pointing towards it. The background is a colorful, abstract design with various shapes and lines. The hand appears to be reaching out towards the screen, as if it is trying to interact with the image.\"},\n",
+ " {'image_id': '959',\n",
+ " 'caption': 'The image is of an olive green canvas waist bag with multiple compartments and straps. It has a zipper closure on the top and two zippered pockets on the front - one on the left and one on the right. It also has two adjustable straps on the back made of the same material as the bag with metal buckles. The bottom of the bag is reinforced for extra support.'},\n",
+ " {'image_id': '960',\n",
+ " 'caption': 'This image appears to be a patchwork quilt hanging on a clothesline. The quilt is made up of different colored and patterned squares, with a blue, orange, and green color scheme. The overall design of the quilt appears to be random, with no discernible pattern or design.'},\n",
+ " {'image_id': '961',\n",
+ " 'caption': 'The image shows a soccer player standing on the field with his arms crossed in front of him. He is wearing a blue and white jersey with the number 10 on the back. Behind him, you can see a group of fans in the stands watching the game. The stadium is surrounded by green grass and trees, and there is a clear blue sky in the background.'},\n",
+ " {'image_id': '962',\n",
+ " 'caption': 'This is an image of a magazine cover titled \"Vogue\". It features a woman wearing a red dress and holding a red rose in front of a white background. The cover also includes a stylized black and white \"v\" logo. The design is modern with bold typography and a minimalist color scheme. The composition is well balanced, with the woman\\'s figure in the foreground and the title and logo in the background. The use of negative space draws attention to the woman and the title, creating an overall effect of sophistication and elegance.'},\n",
+ " {'image_id': '963',\n",
+ " 'caption': \"The image shows a baby lying on its stomach on a white blanket. The baby is wearing a white onesie and has its arms stretched out to the sides. The baby's eyes are closed and it appears to be sleeping. The image is well lit and the details of the baby's face and body are clear.\"},\n",
+ " {'image_id': '964',\n",
+ " 'caption': 'The image shows a woman wearing a leopard print long cardigan sweater over a black tank top and ripped jeans. She is standing with her hands in her pockets, looking at the camera with a neutral expression. The cardigan has a hood and long sleeves, and is open at the front to reveal a brown and tan striped shirt underneath. The jeans are ripped at the knees and have a frayed hem. The woman is wearing brown ankle boots with a square toe and low heel. The overall style of the outfit is casual and comfortable, with a mix of leopard print and neutral colors.'},\n",
+ " {'image_id': '965',\n",
+ " 'caption': \"This image shows a man standing on a balcony overlooking the city. He is wearing a black shirt with white sleeves and a hat, with his hands in his pockets. A table with a record player and records stacked next to it is in front of him. The man is looking into the distance with a thoughtful expression. The composition is well done, with the man in the foreground and the cityscape in the background. The lighting is bright, with the sun casting shadows on his body. The colors are muted, except for the man's clothing. The overall effect is one of calm and contemplation.\"},\n",
+ " {'image_id': '968',\n",
+ " 'caption': 'This is a black and white photograph of a room with a billiards table in the center. The walls are decorated with paintings and mirrors, and there are several pieces of furniture in the room, including a desk and chairs. The floor is covered with a rug, and there is a fireplace in the corner of the room.'},\n",
+ " {'image_id': '969',\n",
+ " 'caption': 'This image shows a large, ornate ballroom with high ceilings, chandeliers, and large windows. The walls are painted in a pale blue color, and the floors are made of marble. There is a large, round chandelier hanging from the center of the ceiling.'},\n",
+ " {'image_id': '970',\n",
+ " 'caption': 'The image shows a pair of white eggs on a white background. The eggs are oval in shape and have smooth, glossy surfaces indicating their freshness and unbroken state.'},\n",
+ " {'image_id': '972',\n",
+ " 'caption': 'The dress is a green and purple gown with a mermaid style bodice and a long, flowing skirt. The bodice has a sweetheart neckline and a purple bow at the waist. The skirt is made of layers of green and purple tulle, with a layer of purple tulle at the bottom. The dress has a long, flowing train in the back.'},\n",
+ " {'image_id': '974',\n",
+ " 'caption': 'There are two dogs sitting on the floor, one wearing a red sweater and the other wearing a green collar. They are both looking at the camera with their tongues out. The background is a wooden floor and there are some plants on the windowsill.'},\n",
+ " {'image_id': '975',\n",
+ " 'caption': 'The image shows a group of underwater creatures, including fish, turtles, and jellyfish, swimming in the ocean. There are also coral reefs and seaweed in the background. The image is colorful and vibrant, with a lot of detail in the sea creatures and the ocean floor. The overall mood of the image is peaceful and serene, as if the viewer is looking at a beautiful underwater scene.'},\n",
+ " {'image_id': '976',\n",
+ " 'caption': 'This image appears to be a logo for a business or brand. It features the letter \"i\" in a cursive font with a crown on top, which is a simple and elegant design that adds a regal touch to the logo. The color scheme is mostly white with a yellow background, and the overall design is simple and clean, making it easy to read and recognize.'},\n",
+ " {'image_id': '977',\n",
+ " 'caption': 'The image is a level 3 question that asks for a description of the image.'},\n",
+ " {'image_id': '978',\n",
+ " 'caption': 'This is a dish of grilled lamb chops on a white plate, garnished with parsley and black pepper. The lamb chops are cooked to perfection, with a crispy exterior and juicy interior. The parsley and black pepper add a fresh and aromatic flavor to the dish. The plate is surrounded by a white tablecloth, and there is a fork and knife on the side. The overall presentation of the dish is visually appealing and appetizing.'},\n",
+ " {'image_id': '979',\n",
+ " 'caption': 'The image shows a basketball player wearing a white jersey with the number 23 on the back and black shorts. He is holding a basketball in his right hand and appears to be walking on a court with a crowd in the background.'},\n",
+ " {'image_id': '980',\n",
+ " 'caption': 'etsuit and holding a surfboard under their arm. The ocean is a deep blue color with some waves, while the sky is a light blue with distant clouds. The image gives off a relaxed and peaceful vibe.'},\n",
+ " {'image_id': '981',\n",
+ " 'caption': 'The image shows a group of people engaging in various physical activities, such as running, jumping, and lifting weights. The people are diverse in age, gender, and ethnicity. They are all wearing athletic clothing, such as shorts, t-shirts, and sneakers. The setting is a gym or fitness center, with equipment such as treadmills, weights, and exercise balls visible in the background. The overall theme of the image is fitness and exercise.'},\n",
+ " {'image_id': '982',\n",
+ " 'caption': 'The image shows a large luxury yacht docked in a marina. The yacht is white with black accents and has several decks and balconies. There are several people standing on the dock, watching the yacht. The sky is clear and blue, with a few clouds in the distance. The water in the harbor is calm and still.'},\n",
+ " {'image_id': '983',\n",
+ " 'caption': \"This is a double decker bus. It is green with a yellow stripe down the side. The front of the bus has the words `person' written on it in white letters. The windows are tinted and the doors are open, revealing the inside of the bus. There is a sign on the front of the bus that says `person' in white letters. The bus is parked on the side of the road and there is a building in the background.\"},\n",
+ " {'image_id': '984',\n",
+ " 'caption': 'This is an image of a slow cooker with a reddish sauce in it. A metal spoon is visible, stirring the liquid. The slow cooker is made of black plastic and has a lid on it.'},\n",
+ " {'image_id': '986',\n",
+ " 'caption': 'The image shows a cyclist wearing an orange jersey and black shorts riding a bicycle on a dirt road. The cyclist is wearing a black helmet and gloves and has a black backpack on his back. There are trees on either side of the road and a blue sky in the background.\\n\\nThe cyclist is wearing an orange jersey with black shorts and a black helmet and gloves. He has a black backpack on his back and is riding a bicycle on a dirt road. There are trees on either side of the road and a blue sky in the background.'},\n",
+ " {'image_id': '987',\n",
+ " 'caption': 'The image shows a cartoon character holding a guitar on the stage of a theater. The character is wearing a black and white striped shirt, black pants, and black boots. The stage is empty and there are no other characters or objects in the image. The walls of the theater are made of wooden panels and there is a curtain hanging from the ceiling. The lighting in the theater is dim, with a few spotlights shining down on the stage.'},\n",
+ " {'image_id': '988',\n",
+ " 'caption': 'This image shows a bedroom with white walls, hardwood floors, and a large window. There is a white bed with a wooden headboard and white bedding. The walls have framed pictures hanging on them. The room is well lit, with a lamp on the bedside table. The bedroom is spacious and airy, with plenty of natural light coming in through the window. The hardwood floors add a warm and cozy feel to the room. The white walls and bedding create a clean and crisp look, while the wooden headboard adds a rustic touch. The framed pictures on the wall add a personal touch and make the room feel more inviting. This bedroom is a great example of a modern and stylish space that is both functional and aesthetically pleasing.'},\n",
+ " {'image_id': '989',\n",
+ " 'caption': 'The image shows a large stone slab with hieroglyphics carved into it. The hieroglyphics depict several figures, including a man and a woman. The stone is displayed in a glass case, surrounded by other artifacts in a museum.'},\n",
+ " {'image_id': '990',\n",
+ " 'caption': 'The image shows a group of birds perched on a wire fence, with palm trees in the background. The fence appears to be made of metal and has a diamond pattern. The birds are perched on the top of the fence, with their wings spread out, and some have their beaks open. The sky is clear and blue with a few fluffy clouds visible, and there is a small building in the distance with a red roof and white walls.'},\n",
+ " {'image_id': '991',\n",
+ " 'caption': 'The image shows a cocktail glass filled with a clear liquid and garnished with a slice of lime on the side, sitting on top of a wooden surface. The drink appears to be a mojito cocktail, with a small pile of ice cubes in the glass. Several other glasses and bottles are visible in the background.'},\n",
+ " {'image_id': '995',\n",
+ " 'caption': 'This is a bedroom with a bed, dresser, and television. The walls are painted purple, and there is a window with white curtains. The bed has a white comforter and pillows, and there is a lamp on the nightstand. The dresser has a mirror on top and several drawers. The television is mounted on the wall above the bed, and there is a small table next to it with a vase of flowers.'},\n",
+ " {'image_id': '997',\n",
+ " 'caption': 'characters in an engaging way. The use of bold typography and contrasting colors helps to make the title of the movie or show stand out, while the two characters provide a sense of intrigue and excitement.'},\n",
+ " {'image_id': '998',\n",
+ " 'caption': 'The image shows a pair of eels swimming in the ocean. The eels are brown in color and have long, slender bodies with small heads and large eyes. They appear to be swimming in a school, with one eel leading the way and the other following closely behind. The background of the image is a mix of rocks, coral, and sand, with various fish and other sea creatures visible in the distance. The overall mood of the image is peaceful and serene.'},\n",
+ " {'image_id': '999',\n",
+ " 'caption': \"This is a black and white photograph of a police officer standing next to a police car. The officer is wearing a uniform with a badge on his chest and a hat on his head. The car has the word'police'written on the side in white letters. The officer is standing with his arms crossed in front of him, looking at the camera. There is a chain link fence in the background.\"},\n",
+ " {'image_id': '1000',\n",
+ " 'caption': \"The image shows a family of four standing in front of a blue carpet with snowflakes on it. The father is wearing a white shirt and black pants, the mother is wearing a red dress, and the two children are wearing matching outfits. They all have smiles on their faces and are posing for the camera. In the background, there is a large movie poster with the title'frozen'written in white letters on a blue background.\"},\n",
+ " {'image_id': '1002',\n",
+ " 'caption': \"The image is a logo for a company called TLC West Boulevard, LLC. The logo features a colorful square with the company's initials, TLC, in the center. The letters are stacked on top of each other in a stylized font. The logo is designed to be modern and eye catching, with bright colors that stand out against a white background.\"},\n",
+ " {'image_id': '1004',\n",
+ " 'caption': 'The image shows a woman standing in front of a large glass door wearing a white shirt and black pants with her hands in her pockets. Behind her, there are several other glass doors, all of which are open. The walls of the building behind her are white, and there are some plants in pots on the ground in front of the doors. In the foreground, there are several people walking on the sidewalk outside the building, dressed in different clothing, and some of them are carrying bags or other items. The sidewalk is made of gray concrete, and there are some trees on either side of it. In the background, you can see other buildings and some cars driving on the street.'},\n",
+ " {'image_id': '1005',\n",
+ " 'caption': \"This image depicts a soccer ball with the flag of Singapore on it.\\n\\nThe flag of Singapore consists of a red field with a white crescent and five white stars in the center. The crescent and stars are arranged in a semicircle on the left side of the flag. The flag is a symbol of the country's independence and sovereignty.\\n\\nThe soccer ball has a red and white design, with the flag's colors and emblem in the center. The ball is a popular sport in Singapore and is often used in international competitions.\\n\\nOverall, this image depicts a soccer ball with the flag of Singapore, which represents the country's independence and sovereignty.\"},\n",
+ " {'image_id': '1006',\n",
+ " 'caption': 'This image depicts a large room with rows of black chairs lined up in front of a large window overlooking a city skyline. There is a man standing at the back of the room, looking out the window. The room is spacious and well lit, with a high ceiling and exposed beams. The walls are painted white and there are no decorations or furniture in the room. The floor is made of hardwood and there are no rugs or carpets. There are no other people in the room.'},\n",
+ " {'image_id': '1007',\n",
+ " 'caption': 'There are several buses parked on the side of the road at night. The buses are yellow and have the words \"ImageContent\" written on them. There are people walking on the sidewalk next to the buses. The buildings in the background are tall and have many windows. There are streetlights on the poles along the road. The sky is dark and there are no stars visible.'},\n",
+ " {'image_id': '1008',\n",
+ " 'caption': 'This image shows a clear glass jar filled with small white pills. The jar is sitting on top of a white surface, which appears to be a table or shelf. The pills are arranged in a neat, organized pattern inside the jar.'},\n",
+ " {'image_id': '1009',\n",
+ " 'caption': 'This is an image of a healthy fern plant with water droplets on its leaves, formed due to condensation in a humid environment.'},\n",
+ " {'image_id': '1011',\n",
+ " 'caption': 'whimsical image of a cartoon chef on a grey hoodie. The character is depicted as happy and skilled in the kitchen, adding a playful touch to the clothing item.'},\n",
+ " {'image_id': '1012',\n",
+ " 'caption': \"This image shows a close-up view of a woman's face wearing a green and blue mask with a feathered headdress. The mask is decorated with beads and sequins, and the woman's hair is styled in a long braid. The background is a blurred image of a crowd of people in colorful costumes.\"},\n",
+ " {'image_id': '1013',\n",
+ " 'caption': \"The image shows a group of children dressed in animal costumes performing on a stage. There is a lion, a zebra, a monkey, and a tiger on the stage. The children are wearing colorful costumes and masks, and they are all smiling. There is a rainbow in the background, and the stage is set up with a curtain and a microphone. The children are performing in front of an audience, which is not shown in the image. It could be used to illustrate a story about children performing in a play or a musical, or to promote a children's theater or a performance group.\"},\n",
+ " {'image_id': '1014',\n",
+ " 'caption': 'The image shows a bottle of wine with a black label that says \"ImageContent\" on it. Several silverware items, including forks, knives, and spoons, are arranged around the bottle, and there are some olive branches in the background.'},\n",
+ " {'image_id': '1016',\n",
+ " 'caption': 'This image is a silhouette of a group of people standing in a circle, with one person in the center raising their arms. The people in the image appear to be a mix of adults and children, and they are all wearing different clothing. The background of the image is white.\\n\\nThe people in the image are standing in a circle, with one person in the center raising their arms. The people in the image appear to be a mix of adults and children, and they are all wearing different clothing. The background of the image is white.'},\n",
+ " {'image_id': '1019',\n",
+ " 'caption': 'The image shows a residential area with houses on either side of the road. A fence separates the road from the well-maintained grassy area in front of the houses. The houses have white walls and red roofs, some even have balconies or patios. Trees on either side of the road provide shade to the houses, and a few cars are parked on the side of the road. The sky is clear and cloudless.'},\n",
+ " {'image_id': '1022',\n",
+ " 'caption': 'This is an image of a table with a game on top of it. The table is made of metal and has a grid pattern on top. The game is made up of different colored blocks of various shapes, including squares, rectangles, and triangles. Some blocks are stacked on top of each other, while others are placed next to each other. There is a small opening in the middle of the table where the blocks can be placed. A tree can be seen in the background, indicating that the table is outdoors.'},\n",
+ " {'image_id': '1023',\n",
+ " 'caption': 'The image shows a landscape with a large body of water in the foreground and a small island in the background. The water is a deep blue color with small waves on the surface. The island is covered in green grass and has a few trees on it, and there is a small path leading to the island from the shore. The sky is bright blue with fluffy clouds, creating a peaceful and serene atmosphere.'},\n",
+ " {'image_id': '1024',\n",
+ " 'caption': 'This is a white t-shirt with the words \"I am the bride\" printed in pink on the front in cursive font. A small pink heart is underneath the text. The model wears the untucked shirt with white pants and black shoes. The image is clear and well-lit, showcasing the details of the shirt and the model\\'s face.'},\n",
+ " {'image_id': '1025',\n",
+ " 'caption': 'This image depicts a man sitting at a desk in a factory, surrounded by machinery and tools. The man is wearing a suit and tie, and appears to be working on a document or piece of paper. There are other workers in the background, some of whom are also working on machinery. The overall tone of the image is one of industry and productivity.'},\n",
+ " {'image_id': '1026',\n",
+ " 'caption': \"This image appears to be a logo or emblem for a woodworking or carpentry company. The design features a wood grain pattern in the shape of a circle, with the company's name written in a stylized font inside the circle. The image is in black and white, with the wood grain pattern in shades of brown. The overall design is simple and clean, with a focus on the company's name and the woodworking theme.\"},\n",
+ " {'image_id': '1027',\n",
+ " 'caption': 'This image shows a pink and white bathroom with a sink, mirror, and walk-in shower. The walls are painted pink and there are pink tiles on the floor.'},\n",
+ " {'image_id': '1029',\n",
+ " 'caption': 'this is a set of three brochures, each with a different image on the cover. The first brochure is about the forest, describing its different types of trees and animals, and emphasizing the importance of preserving the forest and its ecosystem. The second brochure is about the river, describing its various aquatic animals and emphasizing the importance of preserving the river and its ecosystem. The third brochure is about the road, describing different types of vehicles and pedestrians, and emphasizing the importance of preserving the road and its infrastructure.'},\n",
+ " {'image_id': '1031',\n",
+ " 'caption': 'This image shows a man sitting at a table with his hands resting on his chin. He is wearing a blue shirt and has a serious expression on his face. Behind him, there is a whiteboard with several diagrams and equations written on it. The room appears to be an office or laboratory setting.'},\n",
+ " {'image_id': '1032',\n",
+ " 'caption': 'This image shows a group of people wearing helmets and standing in front of a fence at a carnival or event. The woman on the left is wearing a purple shirt and green pants, while the man on the right is wearing a yellow shirt and black pants. They both have yellow helmets on their heads. The image is well lit with bright sunlight shining on the people and the fence behind them, and the colors are vibrant.'},\n",
+ " {'image_id': '1033',\n",
+ " 'caption': 'This is an image of a female athlete running in a marathon. She is wearing a black and red swimsuit and has a number on her chest.'},\n",
+ " {'image_id': '1034',\n",
+ " 'caption': 'A rational number is a real number that can be written as a fraction.'},\n",
+ " {'image_id': '1035',\n",
+ " 'caption': \"The image shows a blue Toyota Corolla parked in front of a graffiti-covered wall. The car's design features a bold front grille and headlights, a sloping roofline that gives it a sporty appearance, and modern taillights and bumper. The car seems to be in good condition with no visible dents or scratches. The bright blue color contrasts well with the colorful graffiti behind it, making for an overall stylish image.\"},\n",
+ " {'image_id': '1038',\n",
+ " 'caption': 'The image shows a group of people gathered around a well. They are all dressed in traditional clothing, with the men wearing turbans and the women wearing saris. The men are holding buckets and the women are holding pots, which they are using to draw water from the well. The scene is set in a rural area, with trees and a few buildings visible in the background. The overall mood of the image is peaceful and serene, with the people going about their daily tasks in a calm and orderly manner.'},\n",
+ " {'image_id': '1039',\n",
+ " 'caption': 'This image shows a small, green camper trailer parked in a grassy area. The trailer has four wheels and appears to be a small, compact trailer that can be towed behind a vehicle for camping or other outdoor activities. There are no other details visible in this image.'},\n",
+ " {'image_id': '1040',\n",
+ " 'caption': 'The image shows a tall building with a curved facade made up of white, wavy lines creating a wave-like pattern. The building is made of white painted concrete or similar material with a small entrance opening at the top. There are no visible windows or doors on the facade. The foreground shows some vegetation including small trees and bushes. The overall impression of the image is a modern, sleek building with a unique, wave-like design.'},\n",
+ " {'image_id': '1041',\n",
+ " 'caption': 'The image shows two advertisements for a dental practice. The first advertisement has a blue background with white text that reads, \"IT support for your dental practice.\" The second advertisement has a white background with blue text that reads, \"Two Dogs IT Support for your dental practice.\" Both advertisements feature the same image of two dogs sitting next to each other. The first advertisement emphasizes the importance of IT support for a dental practice, while the second advertisement highlights the benefits of using Two Dogs IT Support for dental professionals.'},\n",
+ " {'image_id': '1042',\n",
+ " 'caption': 'This is an image of a house with white walls and a green garage door. There is a small driveway in front of the house that leads to the garage. The house appears to be in a rural area, surrounded by trees and greenery. The image is taken from the side of the house, looking down the driveway towards the garage.'},\n",
+ " {'image_id': '1045',\n",
+ " 'caption': 'The image is a round beach towel with the words \"let your dreams come true\" written in blue on a pink background.'},\n",
+ " {'image_id': '1046',\n",
+ " 'caption': 'This image shows a group of people in a meeting. They are dressed in business attire and sitting around a conference table in a well-lit office or conference room. A large television screen on the wall behind them displays a news program.'},\n",
+ " {'image_id': '1047',\n",
+ " 'caption': 'This is an image of a large bird, possibly a vulture, with its wings spread out and its head tilted to the side. The bird appears to be perched on something, possibly a tree branch or a rock. The image is in black and white.'},\n",
+ " {'image_id': '1049',\n",
+ " 'caption': 'The image shows a man and a woman standing in front of a large, ornate building. The man is wearing a black suit and the woman is wearing a black dress. They are both smiling and posing for the camera. The building behind them is a large, old mansion with many windows and a balcony. There are trees in the background and the sky is cloudy.'},\n",
+ " {'image_id': '1050',\n",
+ " 'caption': 'The image shows three people standing in front of a tree. The first person is wearing a black leather jacket and a red dress. The second person is wearing a black leather jacket and a white shirt. The third person is wearing a black leather jacket and black pants. All three people have their hands on their hips and are looking at the camera. Behind them, there is a large tree with branches and leaves. The background is a dark sky with stars.'},\n",
+ " {'image_id': '1051',\n",
+ " 'caption': 'The image shows a piece of pumpkin cake with cream cheese frosting on a white plate. The cake has a swirl of cream cheese frosting on top and is topped with a sprinkle of cinnamon sugar. There is a fork in the picture.'},\n",
+ " {'image_id': '1052',\n",
+ " 'caption': \"This is an image of a woman standing in front of a white background, holding up a sign that says, ` we are here to help '. The woman is wearing a gray shirt and has short, curly hair. She is smiling and looking directly at the camera.\"},\n",
+ " {'image_id': '1053',\n",
+ " 'caption': 'The image shows a set of six gargoyle figurines made of black metal. Each figurine has a unique pose and expression, with intricate details such as wings, horns, and fangs. The gargoyles are perched on small pedestals with their wings spread out and their mouths open in a roar or snarl, giving an ominous and menacing effect as if they are about to attack.'},\n",
+ " {'image_id': '1054',\n",
+ " 'caption': 'This is an image of a building with graffiti on the walls. It appears to be an abandoned building with broken windows and Arabic graffiti. Two people are sitting on the stairs in front of the building.'},\n",
+ " {'image_id': '1055',\n",
+ " 'caption': 'The image shows a group of cacti with different colored flowers on them. The cacti are all different sizes and shapes, and they are arranged in a circle. The flowers on the cacti are bright and colorful, and they are different types of flowers. There are also green leaves on the cacti. The background of the image is a light green color, and it looks like the cacti are growing in a pot.'},\n",
+ " {'image_id': '1056',\n",
+ " 'caption': 'The image is a book cover with the title \"The Guts to Create an Authentic Heart\" in red letters on a white background. The cover features a heart made out of gears.'},\n",
+ " {'image_id': '1058',\n",
+ " 'caption': 'This image depicts a woman holding a white sign with the word stop written on it. The woman is wearing a white shirt and has long blonde hair. She is holding the sign with both hands and looking directly at the camera. In the background, there is a blue wall.'},\n",
+ " {'image_id': '1060',\n",
+ " 'caption': 'This image shows a bathroom with a white sink, toilet, and bathtub. There is a large mirror on the wall above the sink. The floor is made of tiles and the walls are painted white. There is a window on one side of the room and a door on the other side that leads to another room.'},\n",
+ " {'image_id': '1061',\n",
+ " 'caption': 'The image shows a Christmas tree with presents underneath it. The tree is decorated with red and green ornaments and has a star on top. There are also presents wrapped in red and green paper with bows on them next to the tree.'},\n",
+ " {'image_id': '1063',\n",
+ " 'caption': \"This image shows a black dragon flying through a dark, stormy sky with its wings spread out. The dragon's eyes are glowing red and its mouth is open, revealing sharp teeth. The landscape below is rocky and rugged, with mountains and valleys visible in the distance. The overall mood of the image is ominous and foreboding.\"},\n",
+ " {'image_id': '1064',\n",
+ " 'caption': 'The image shows a black wallet with a zipper closure and multiple compartments inside. The wallet is made of leather and has gold accents on the zipper and around the edges. There is also a small pocket on the outside of the wallet with a zipper closure.\\n\\nInside the wallet, there are several compartments for storing cash, credit cards, and other small items. There is also a larger compartment with a zipper closure that can be used to store larger items such as a phone or passport. The wallet has a sleek, modern design and would be suitable for both casual and formal wear.'},\n",
+ " {'image_id': '1065',\n",
+ " 'caption': 'This image shows a group of people standing in front of a large tree in a park. They are all dressed in casual clothing and are smiling at the camera. The tree has a plaque on it that reads, \"In memory of those who gave their lives for our country.\"\\n\\nThe people in the image are a mix of ages and ethnicities. They are all standing in front of the tree, which has a thick trunk and branches that stretch up towards the sky. The background of the image is a mix of green grass and trees, with some buildings visible in the distance. It appears to be a sunny day.\\n\\nOverall, this image depicts a group of people paying their respects to those who have given their lives.'},\n",
+ " {'image_id': '1066',\n",
+ " 'caption': 'This image is a white t-shirt with black text that reads, \"only a ginger can call another ginger ginger\". The text is written in a bold, sans-serif font.'},\n",
+ " {'image_id': '1067',\n",
+ " 'caption': 'This image shows a room with a high ceiling and large windows that let in natural light. The walls are painted white and the floor is made of wooden planks. Several pieces of furniture, including a desk, chairs, and a bookshelf, are present in the room, indicating that it is a study or office area.'},\n",
+ " {'image_id': '1068',\n",
+ " 'caption': 'This is an image of an alpaca, a domesticated animal native to South America. Alpacas are known for their soft, woolly coats which are used to make clothing and textiles. They are also kept as pets due to their gentle nature. The alpaca in the image is light brown with a fluffy coat, large round eyes, a small pointed nose, long floppy ears, and a short curved tail. It is standing in a fenced enclosure, looking at the camera. Overall, this is a cute and appealing image of an alpaca.'},\n",
+ " {'image_id': '1070',\n",
+ " 'caption': 'The woman in the image is wearing a long, dark green velvet gown with a plunging neckline and long sleeves. The dress has a high slit on the side and is cinched at the waist with a belt. Her hair is styled in loose, curly waves and she is wearing high heels. She is posing with her hands on her hips and looking directly at the camera. The background is a dark grey.'},\n",
+ " {'image_id': '1071',\n",
+ " 'caption': 'This image depicts a person holding a fishing rod and reeling in a fish. The person is wearing a red shirt, blue jeans, and white sneakers. The background is a blank white paper.'},\n",
+ " {'image_id': '1072',\n",
+ " 'caption': \"This image is a quote that reads, 'the revolution will not be supervised'. It is written in white text on a black background, with the words 'revolution'and 'supervised'in bold letters. The font used is a sans serif font. The message conveyed by this image is one of rebellion and resistance against authority. It suggests that the people will not be controlled or monitored by those in power, and that they will take matters into their own hands to bring about change.\"},\n",
+ " {'image_id': '1074',\n",
+ " 'caption': 'This image is a screenshot of a PowerPoint presentation document in Microsoft Word. The document has a blue background with white text and an American flag in the center, waving in the wind with the stars and stripes visible. A text box with the title \"ImageContent\" is also present on the right side of the page.'},\n",
+ " {'image_id': '1075',\n",
+ " 'caption': 'The image shows a person sitting at a table with a white bowl filled with rice and vegetables in front of them. The person is wearing a white sweater and is eating the food in the bowl with a pair of metal chopsticks. The bowl has a pink and white pattern on it and appears to be made of ceramic. The chopsticks have a shiny finish and the table is made of wood with a white tablecloth. There are flowers in a vase on the table, but they are not visible in the image. The atmosphere of the image is cozy and relaxed, with the person enjoying their meal and the food looking appetizing.'},\n",
+ " {'image_id': '1076',\n",
+ " 'caption': 'This image is a stylized representation of a human skull depicted in a low polygonal style with sharp angles and clean lines. The skull is shown in profile with the left side visible and the right side obscured by the hairline. It appears to be made up of several geometric shapes, including triangles and rectangles. The background is a slightly lighter shade of the same color as the skull, creating an overall effect of simplicity and minimalism.'},\n",
+ " {'image_id': '1078',\n",
+ " 'caption': 'This image shows a group of people standing on the rooftop of a building with a city skyline in the background. The people are dressed in formal attire, and they appear to be chatting and enjoying the view. The skyline in the background is made up of tall buildings with lit up windows, and there are also some clouds in the sky. The overall mood of the image is one of relaxation and enjoyment.'},\n",
+ " {'image_id': '1079',\n",
+ " 'caption': 'This image shows a large conference room with a long wooden table and several chairs. The walls are made of stone and the floor is made of tiles. The room has large windows that let in a lot of natural light. There is a painting on the wall behind the table, and a sculpture on the floor in front of the table.'},\n",
+ " {'image_id': '1080',\n",
+ " 'caption': \"This image shows a black dog sitting in the snow, looking up at the camera with a curious expression on its face. The dog's fur is fluffy and white, and its eyes are bright and alert. The background is a blanket of white snow. The dog appears to be in good condition, with no visible injuries or signs of distress. It is wearing a collar with a tag, but it is not clear what the tag says. The dog's expression is friendly and curious.\"},\n",
+ " {'image_id': '1081',\n",
+ " 'caption': 'The image shows a group of clothing items hanging on a rack, including a striped shirt, a brown skirt, and a pair of black shoes. The shirt has a striped pattern and is tucked into the skirt. The skirt has a brown leather belt with a gold buckle. The shoes have a pointed toe and are made of black leather.'},\n",
+ " {'image_id': '1082',\n",
+ " 'caption': \"This image appears to be a movie poster for a French film, possibly from the 1950s or 1960s. The title of the film is written in large, bold letters at the top of the poster, with an image of a woman in a red dress and a man in a suit dancing in front of her.\\n\\nBelow the title, there is a tagline in French that reads `'un amour en paris'`, which translates to `'a love in Paris'`. The poster appears to be a romantic comedy or drama, set in the city of Paris.\"},\n",
+ " {'image_id': '1083',\n",
+ " 'caption': 'The image shows a view of the twelve apostles rock formations on the great ocean road in Australia. The cliffs are tall and rocky, jutting out into the deep blue ocean under a clear and sunny sky without any clouds. The cliffs are covered in green vegetation and some small plants grow on the ground. There are no people in the image. The apostles are a popular tourist attraction, visited by millions of people each year, and are a testament to the power of erosion and the forces of nature. They remind us of the beauty and majesty of the natural world and are a must-see for anyone visiting Australia.'},\n",
+ " {'image_id': '1084',\n",
+ " 'caption': 'The image depicts a stainless steel bracelet with a silver cross engraved in the center. The bracelet features a chain link design and a clasp closure, making it adjustable to fit different sizes. The image shows the bracelet on a white background.'},\n",
+ " {'image_id': '1085',\n",
+ " 'caption': \"The image shows two children, a boy and a girl, standing next to each other with their arms around each other. The boy is wearing a red shirt and blue jeans, while the girl is wearing a pink shirt and white shorts. They are both looking at the camera with smiles on their faces. The quote on the image reads, `'The best gift you could give your child is your time.'`\"},\n",
+ " {'image_id': '1086',\n",
+ " 'caption': 'The image depicts a yellow and black taxi with the words \"Tokyo Taxi\" written on the side. It is parked on the side of the road with a person standing next to it. The taxi sign on the roof indicates its readiness for use.'},\n",
+ " {'image_id': '1087',\n",
+ " 'caption': 'This image is a photograph of a person standing on a rocky beach with the ocean in the background. The person is standing with their back to the camera, looking out at the waves. The person is wearing a black hoodie and black pants, and their hair is pulled back into a ponytail. The beach is covered in large rocks and pebbles, and there are some small waves washing up onto the shore. The sky is a deep shade of blue, and there are some clouds visible in the distance. The overall mood of the image is peaceful and serene.'},\n",
+ " {'image_id': '1089',\n",
+ " 'caption': 'This image shows a woman in a garden, crouching down and tending to a flower bed. The woman is wearing a black shirt and brown pants, and is holding a watering can in her right hand. The garden is filled with a variety of colorful flowers, including red, orange, yellow, and pink. There are also some green leaves and stems visible in the background. The woman appears to be focused on her work, and is carefully watering each plant in the bed. The overall effect of the image is one of peace and tranquility, as the woman is surrounded by vibrant colors and the sound of water dripping from the can.'},\n",
+ " {'image_id': '1090',\n",
+ " 'caption': 'The image depicts a couple standing on top of a hill, looking out at a pink sky with a full moon in the background. The couple is holding hands and looking at each other with love in their eyes. The sky is filled with clouds and stars, and there is a silhouette of a tree in the foreground. The overall mood of the image is romantic and dreamy.'},\n",
+ " {'image_id': '1091',\n",
+ " 'caption': 'The image shows a cup and saucer with a floral design on it. The cup has a curved handle and the saucer has a scalloped edge. The design on the cup and saucer is red and white, with pink roses and green leaves. There is a small spoon resting on the saucer. The overall style of the cup and saucer appears to be vintage or antique.'},\n",
+ " {'image_id': '1092',\n",
+ " 'caption': \"The image is a drawing of a person's head with a map of a city on it. The person's head is shown in profile, with their eyes, nose, and mouth visible. The map is shown on the right side of the image, with various streets and landmarks labeled. The image is in black and white, with the map shown in shades of gray.\\n\\nThe image appears to be a representation of a person's mental map of a city, with the various streets and landmarks representing different memories or experiences that the person has had in that city. The use of black and white gives the image a stark, minimalist feel, emphasizing the simplicity of the map and the person's mental representation of the city.\\n\\nOverall, the image is a creative and interesting representation of the concept of mental mapping, and could be used in a variety of contexts, such as in psychology or urban planning.\"},\n",
+ " {'image_id': '1093',\n",
+ " 'caption': 'The image shows a small room with a wooden shelf on the wall, several rolls of wrapping paper stacked on it, a door on the right side, and a window on the left side. The walls are painted white and there is a wooden floor.'},\n",
+ " {'image_id': '1095',\n",
+ " 'caption': 'The image shows a woman in a The image shows a woman standing in a field of purple flowers. She is wearing a straw hat and a plaid shirt, and is looking down at the flowers in front of her. The flowers are in full bloom and appear to be a type of orchid. The woman is standing in the middle of the field, surrounded by the flowers on all sides. The background of the image is a clear blue sky with a few fluffy clouds.'},\n",
+ " {'image_id': '1097',\n",
+ " 'caption': 'This image shows a wooden bridge in the middle of a swampy area surrounded by mangrove trees. The bridge appears to be old and weathered, with wooden planks and ropes hanging from it. The water below is murky and full of debris, including sticks, leaves, and other vegetation. The trees on either side of the bridge are tall and thin, with their branches reaching out over the water. The sky is cloudy and overcast, casting a gloomy mood over the scene.'},\n",
+ " {'image_id': '1098',\n",
+ " 'caption': 'The woman is wearing a colorful shirt, blue leggings, and white sneakers while carrying a black purse and looking at her phone. There are several parked cars in the background.'},\n",
+ " {'image_id': '1099',\n",
+ " 'caption': 'This is a hotel room with a red accent wall and a large bed with a white comforter. There is also a desk and chair in the corner.'},\n",
+ " {'image_id': '1101',\n",
+ " 'caption': 'The image shows a group of minerals found in nature. These minerals are made up of substances that were never living organisms. They are made up of elements such as calcium, sodium, and magnesium. Some of these minerals are found in rocks, while others can be found in soil or water.\\n\\nThe minerals in the image are likely to have been formed through geological processes, such as the solidification of magma or the precipitation of dissolved minerals from water. These minerals can be used in a variety of ways, such as in the production of glass, ceramics, and other materials.\\n\\nOverall, the image shows a group of minerals that are found in nature and are made up of elements that were never living organisms. These minerals can be used in a variety of ways and are likely to have been formed through geological processes.'},\n",
+ " {'image_id': '1102',\n",
+ " 'caption': 'This image shows a balcony with a view of a river. The balcony is made of wood and has a glass railing. There are two blue cushions on the bench and a small table with a vase of flowers on it. The trees on the other side of the river are reflected in the water. The sky is clear and there are some clouds in the distance.'},\n",
+ " {'image_id': '1103',\n",
+ " 'caption': 'This is a black lace dress with long sleeves and a deep V neckline. The dress has intricate lace detailing on the bodice and sleeves, with a sheer lace overlay on the skirt. It has a fitted bodice and a flared skirt, with a long train at the back. The dress also features a zipper closure at the back.'},\n",
+ " {'image_id': '1105',\n",
+ " 'caption': 'The image shows a pair of white shoes with gold glitter on the sides. The shoes have an open toe and a low heel. The soles are made of rubber and the insoles are made of leather. The shoes are designed for casual wear and are suitable for both men and women.'},\n",
+ " {'image_id': '1106',\n",
+ " 'caption': 'The woman in the image is standing in a field at sunset. She is wearing a long, flowing dress and has her hair pulled back in a ponytail. Her arms are crossed in front of her and she is looking off into the distance. The sky behind her is orange and pink, with clouds that look like they are on fire. There are no other objects or people in the image.'},\n",
+ " {'image_id': '1107',\n",
+ " 'caption': 'The image shows a man in a business suit walking across a wooden bridge suspended over a deep chasm. The man is walking towards the edge of the cliff and looking down at the ground below. The bridge has a handrail on one side and the sky is cloudy with no other objects in the scene. The image is in isometric perspective, with the man and bridge in the foreground and the cliff and sky in the background.'},\n",
+ " {'image_id': '1108',\n",
+ " 'caption': 'The image shows a circular diagram with the words one health, food, environment, and lifestyle written around it. The words are arranged in a circle with one health and food at the top, environment in the middle, and lifestyle at the bottom. The words are written in different colors, with one health in red, food in green, environment in blue, and lifestyle in yellow. In the center of the circle, there is an illustration of a person standing on top of a mountain, with their arms stretched out to the sides, surrounded by trees and a clear blue sky. The illustration is done in a cartoon style, with the person having a happy expression on their face.'},\n",
+ " {'image_id': '1109',\n",
+ " 'caption': 'The woman in the image is wearing a blue dress with a floral pattern on it. The dress has a v-neckline and short sleeves. She has her hair styled in loose waves and is wearing a pair of silver earrings. Her makeup consists of a light foundation, pink blush, and brown eyeshadow. She has a small smile on her face and is posing for the camera. The background of the image appears to be a blue carpet with a white wall behind it. There are no other objects or people visible in the image.'},\n",
+ " {'image_id': '1110',\n",
+ " 'caption': 'The image shows a group of babies sitting on clouds, holding balloons. The babies are wearing diapers and are smiling. The background is a blue sky with fluffy clouds.'},\n",
+ " {'image_id': '1111',\n",
+ " 'caption': 'This image shows a helicopter flying over the ocean with a large ship in the background. The helicopter appears to be a military.'},\n",
+ " {'image_id': '1113',\n",
+ " 'caption': 'The image shows a woman wearing a grey tank The woman is wearing a grey tank top and black skinny jeans. The tank top has a scoop neckline and is made of a thin, stretchy material. The jeans are fitted and have a high waist. The woman is standing with her hands in her pockets and is looking down at the ground.\\n\\nThe tank top is a simple, basic style that can be worn with a variety of outfits. It has a scoop neckline, which adds a touch of femininity to the design. The material is thin and stretchy, which makes it comfortable to wear and easy to move in.\\n\\nThe skinny jeans are a classic style that is popular for both casual and formal wear. They have a high waist, which helps to create a streamlined silhouette and elongate the legs. The jeans are fitted, which means they hug the body and create a slim, sleek look.'},\n",
+ " {'image_id': '1114',\n",
+ " 'caption': 'The painting depicts a group of people on a boat in the middle of the ocean. The sun is setting in the background, casting a warm orange glow over the scene. The people on the boat are dressed in period clothing, with some wearing hats and others holding onto ropes. In the foreground, there is a small rowboat with two people in it, heading towards the larger boat. The water is choppy, with waves crashing over the sides of the boat. The sky is filled with clouds, with the sun peeking out from behind them. The overall mood of the painting is one of adventure and exploration.'},\n",
+ " {'image_id': '1115',\n",
+ " 'caption': \"This image shows a plate with a piece of vanilla cake topped with whipped cream and sliced strawberries. In the background, there are several more slices of cake on a plate. A person's hand is shown dipping a spoon into the whipped cream on the top of the cake.\"},\n",
+ " {'image_id': '1116',\n",
+ " 'caption': \"This image shows a man sitting at a desk in an office. The man is wearing a suit and tie and is looking at a computer screen in front of him. There are several other desks and chairs in the room, as well as a map on the wall behind the man. The room appears to be dimly lit, and there is a clock on the wall above the man's head.\"},\n",
+ " {'image_id': '1118',\n",
+ " 'caption': \"This image shows a person's hand holding a gas nozzle at a gas pump. The hand is wearing a black bracelet and holding the nozzle with fingers. The gas pump is white with a hose connected to the car's gas tank. The car is parked next to the pump with a partially visible license plate.\"},\n",
+ " {'image_id': '1121',\n",
+ " 'caption': 'This image shows a group of people gathered around a table in a dimly lit room. A man in a grey suit and white shirt is standing at the head of the table, speaking into a microphone and holding a glass of wine. Several other people are sitting at the table and looking at him. A dog is sitting on the floor in front of the table, looking up at the man. The room has white walls with paintings hanging on them, wooden floors, and chandeliers hanging from the ceiling. Plants in pots are placed on the floor.'},\n",
+ " {'image_id': '1123',\n",
+ " 'caption': 'The image shows a blue leather jewelry box with a zipper closure on the top. The box has a rectangular shape with rounded corners and is made of a smooth, glossy material. The color of the box is a deep, navy blue. There are no visible markings or logos on the box. The box appears to be empty.'},\n",
+ " {'image_id': '1124',\n",
+ " 'caption': 'The image shows a person standing on a rope in the middle of a large indoor play area. The person is wearing a harness and is holding onto the rope with both hands. The rope is suspended from the ceiling, and there are several other ropes hanging down from the ceiling as well. The play area is filled with various obstacles, such as tunnels, slides, and climbing walls. There are also several other people in the image, some of whom are climbing on the ropes and others who are watching from the sidelines. The overall atmosphere of the image is one of fun and adventure.'},\n",
+ " {'image_id': '1125',\n",
+ " 'caption': 'The image shows a woman in a white shirt and black pants, wearing a hat and smoking a cigarette. She is standing outside in a grassy area with trees in the background.'},\n",
+ " {'image_id': '1127',\n",
+ " 'caption': 'This image shows a wooden cutting board with steak, vegetables, and herbs. The steak is sliced and seasoned with salt, pepper, and other spices. The vegetables include asparagus, bell peppers, and carrots. The herbs include rosemary and thyme. There is also a bottle of olive oil on the cutting board. In the background, there is a wooden table with a white tablecloth and a vase of fresh flowers. The table appears to be set for a meal. Overall, this image suggests a rustic, home cooked meal with fresh, seasonal ingredients.'},\n",
+ " {'image_id': '1130',\n",
+ " 'caption': \"This is a black and white photograph of a couple dancing in a living room. The man is wearing a white shirt and black pants, while the woman is wearing a black dress with white polka dots. They are both holding onto each other as they dance, with the woman's arms wrapped around the man's neck. There is a wooden floor in the background, and a window with curtains in the left corner of the image. The woman is wearing a pair of white sneakers, and the man is wearing a pair of black loafers. The photograph is well lit, with a bright light source coming from the left side of the image. The overall mood of the image is happy and carefree.\"},\n",
+ " {'image_id': '1131',\n",
+ " 'caption': \"The image shows a baby's crib with a dark wooden frame and a white mattress. The crib has a modern design with clean lines and a neutral color scheme. It is placed in a spacious room with white walls, hardwood flooring, and a large window that lets in natural light. There is a white dresser in the background.\"},\n",
+ " {'image_id': '1133',\n",
+ " 'caption': 'The image shows a green hill with a castle on top of it. The castle is made of stone and appears to be abandoned. The sky is dark with clouds and there is a full moon shining brightly in the sky. There are no other buildings or structures visible in the image. The hillside is covered in grass and there are no trees or other vegetation visible.\\n\\nIn the foreground of the image, there is a small path that leads up to the castle. The path is made of dirt and rocks and appears to be overgrown with weeds. There are no people or animals visible in the image.\\n\\nThe overall mood of the image is eerie and abandoned. The dark sky and the abandoned castle give the impression that the place has been abandoned for a long time. The lack of other buildings or structures in the image adds to this feeling of isolation.\\n\\nThe image is well composed, with the castle in the center of the frame'},\n",
+ " {'image_id': '1134',\n",
+ " 'caption': \"This image shows a close up view of a grasshopper's head. The grasshopper is a green insect with long antennae and large eyes. The grasshopper appears to be looking directly at the camera with a curious expression on its face.\"},\n",
+ " {'image_id': '1135',\n",
+ " 'caption': 'This image shows a group of people gathered in front of a building. There is a man standing in the center of the group, wearing a white robe and holding a staff. The other people in the group are dressed in a variety of clothing, including robes and hats. There is a woman standing on the left side of the image, wearing a long dress and holding a baby. On the right side of the image, there is a man standing in front of a wall, wearing a robe and holding a book. In the background, there is a city with buildings and people walking around. The overall mood of the image is peaceful and serene.'},\n",
+ " {'image_id': '1136',\n",
+ " 'caption': 'The image is a video game called Inquisition. It is developed by Bioware and published by Electronic Arts. The game is a role-playing game set in the fantasy world of Thedas. Players take on the role of an inquisitor, tasked with uncovering the truth behind a conspiracy that threatens to tear the world apart. The game features a vast open world to explore with a variety of characters and creatures to interact with, as well as a deep and engaging storyline to follow. Inquisition is available on multiple platforms, including PlayStation 4, Xbox One, and PC.'},\n",
+ " {'image_id': '1138',\n",
+ " 'caption': 'This is an illustration of a young girl with red hair, wearing a blue and white striped dress with a white apron and red and white striped stockings. She is standing with her arms crossed in front of her, looking down at the ground. The background is a light blue gradient.'},\n",
+ " {'image_id': '1139',\n",
+ " 'caption': 'The image shows a scenic view of a canyon with a river running through it. The canyon walls are made up of red rock formations and there are trees growing on the sides. In the distance, there is a large body of water with mountains in the background. The sky is clear and blue.'},\n",
+ " {'image_id': '1140',\n",
+ " 'caption': \"This image is a quote that reads 'these are difficult times' in white text on a black background. The text is written in a stylized font, with the words 'difficult' and 'times' written in capital letters. The overall design of the image is simple and minimalist, with the text standing out against the dark background. It could be interpreted as a statement about the challenges and struggles that people face in their lives, or as a reminder to stay strong and persevere through difficult times.\"},\n",
+ " {'image_id': '1141',\n",
+ " 'caption': 'The bike in the image is a road bike with a carbon fiber frame and disc brakes. It has a dropper post, allowing for a more aggressive riding position. The handlebars are also dropper and can be adjusted for a more upright riding position. The bike has a full carbon fiber fork and wheels, with deep section rims for improved aerodynamics and stiffness. The tires are tubeless, providing better grip and traction at lower pressures. The saddle is lightweight and ergonomic with a cutout to reduce pressure on the perineum.'},\n",
+ " {'image_id': '1142',\n",
+ " 'caption': 'This image depicts a hallway in a hotel or office building. The walls are painted in light beige, and the floor is covered with light brown carpet. The ceiling has wooden beams with light fixtures hanging from it. Modern artwork decorates the walls. Doors on either side of the hallway are made of wood and have frosted glass panels. A chandelier hangs from the ceiling center with several lights shining down onto the floor. Potted plants are placed on either side of the corridor.'},\n",
+ " {'image_id': '1143',\n",
+ " 'caption': 'The image shows a couple standing in front of a body of water, with mountains in the background. The woman is wearing a green jacket and black pants, while the man is wearing a black jacket and blue jeans. They are both looking at each other with smiles on their faces. The water is clear and there are some rocks visible in the foreground. The mountains in the background are covered in snow, and there are some trees visible on the left side of the image.'},\n",
+ " {'image_id': '1144',\n",
+ " 'caption': 'The woman is wearing a white t-shirt, ripped jeans, and a black cardigan sweater. She is holding a cell phone in her right hand and has her left hand in her pocket. The background is a mirror.'},\n",
+ " {'image_id': '1145',\n",
+ " 'caption': 'The image shows a green road sign with the words Chicago County written on it. The sign is a standard road sign with white lettering on a green background. The word Chicago is written in large letters at the top of the sign, while the word County is written in smaller letters below it. The sign is mounted on a green pole with a white base. There is no other information or context provided in the image.'},\n",
+ " {'image_id': '1146',\n",
+ " 'caption': 'The image shows a brown woven basket with a lid on top, sitting on a wooden floor. The basket has a handle on the side and is open, revealing the contents inside. There are no other objects in the image.'},\n",
+ " {'image_id': '1150',\n",
+ " 'caption': 'This image shows the engine compartment of a car, with the hood open to reveal the engine. The engine is a 4 cylinder, with a displacement of approximately 2.5 liters. It is equipped with a turbocharger and direct fuel injection, which helps to improve its performance and efficiency. The engine is connected to a 6 speed automatic transmission, which allows for smooth and seamless shifting.\\n\\nThe engine compartment also features several other components, including the air filter, which helps to clean the air that enters the engine, and the oil filter, which helps to remove impurities from the oil. There is also a radiator, which helps to cool the engine by circulating coolant through the engine and releasing heat into the air.\\n\\nOverall, this engine appears to be well designed and equipped to provide good performance and reliability in a variety of driving conditions.'},\n",
+ " {'image_id': '1151',\n",
+ " 'caption': 'The image shows a group of people playing music on stage. There are two men playing guitars and a woman playing a keyboard. They are all wearing casual clothing and appear to be enjoying themselves. There is a microphone in front of them and a drum set in the background. The lighting is dim and there are curtains hanging on either side of the stage.'},\n",
+ " {'image_id': '1152',\n",
+ " 'caption': \"The image shows a man holding an orange in front of his face with his mouth open, as if he's about to take a bite. The man is wearing a black t-shirt and has short, dark hair. He has a serious expression on his face and is looking directly at the camera. Behind him, there is a dark grey background.\"},\n",
+ " {'image_id': '1153',\n",
+ " 'caption': 'This is an empty room with white walls and hardwood floors. There is a large window on one side of the room and a sliding glass door on the other side. The room is empty and there is no furniture in it.'},\n",
+ " {'image_id': '1154',\n",
+ " 'caption': 'There is a dirt road in the middle of a field. On the left side of the road, there is a fence made of wooden poles with barbed wire on top. On the right side of the road, there is a person walking towards the camera. The person is wearing a blue shirt and shorts, and has a backpack on their back. Behind the person, there is a hill with some trees on it. The sky is clear and blue.'},\n",
+ " {'image_id': '1156',\n",
+ " 'caption': 'The image shows a tropical island with green hills and vegetation surrounding the coastline. Bright blue water and several boats are anchored in the bay. A small village can be seen in the distance on the other side of the island. The sky is clear with a few fluffy clouds.'},\n",
+ " {'image_id': '1157',\n",
+ " 'caption': 'The image shows a silver sports car parked on a dirt road in the middle of a forest. The car has tinted windows and the driver\\'s side door is open, revealing the interior of the car. The license plate on the front of the car reads \"TOP GEAR\" in white letters on a black background. The forest in the background is dense, with tall trees on either side of the road. The sky is overcast, with clouds covering the sun.'},\n",
+ " {'image_id': '1158',\n",
+ " 'caption': 'This image shows a large group of people sitting at long tables in a wooden structure. The tables have white tablecloths and candles. The walls are made of wood and there are strings of lights hanging from the ceiling. The people are dressed in formal attire and appear to be enjoying a meal.'},\n",
+ " {'image_id': '1159',\n",
+ " 'caption': 'The image shows a woman standing in front of a backdrop decorated with balloons in shades of blue, green, and gold. The woman is wearing a pink dress and has her hair styled in loose, wavy curls. She is looking down at her phone, which is sitting on a table in front of her. The table is covered with a white tablecloth and has a vase of pink and white flowers on it. Behind the woman, there are more balloons hanging from the ceiling, as well as streamers in the same colors as the balloons. The overall atmosphere of the image is festive and celebratory.'},\n",
+ " {'image_id': '1161',\n",
+ " 'caption': 'This image is a stylized representation of two people, one in blue and the other in orange, with their arms outstretched as if they are reaching out to each other. It could be used as a logo for a company or organization that promotes teamwork, cooperation, or partnership. The image is simple and easy to recognize, making it a good choice for a logo that needs to be easily recognizable and memorable. The colors are bright and eye-catching, which can help to make the logo stand out and be more memorable. Overall, this image is a good choice for a logo that needs to convey a sense of teamwork and cooperation.'},\n",
+ " {'image_id': '1165',\n",
+ " 'caption': 'The image shows a sweater with a color block design. The top part of the sweater is navy blue, while the bottom part is olive green. The collar and cuffs of the sweater are also navy blue. The sweater appears to be made of a knit material and has a slim fit.'},\n",
+ " {'image_id': '1167',\n",
+ " 'caption': 'This image shows a group of people gathered around a table in a hospital room. They are all dressed in professional attire, with some wearing lab coats and others in suits and ties. One person is standing at the head of the table, speaking to the group. The room is well lit and there are several pieces of medical equipment visible in the background, including a computer monitor and a printer.'},\n",
+ " {'image_id': '1168',\n",
+ " 'caption': 'The image shows a mason jar filled with carrots. The carrots are chopped into small pieces and appear to be floating in the liquid inside the jar. There is a small amount of liquid at the top of the jar. The jar is sitting on a white surface, which appears to be a table or countertop. There are no other objects visible in the image.'},\n",
+ " {'image_id': '1169',\n",
+ " 'caption': 'The woman is wearing an orange jacket and a black skirt. She is sitting on a stone wall and smiling at the camera. There is a tree in the background and some stairs leading up to a building.'},\n",
+ " {'image_id': '1170',\n",
+ " 'caption': \"The image is a yellow and grey geometric pattern on wallpaper. The pattern consists of triangles of varying sizes and shades of yellow and grey, arranged in a seemingly random way to form a cohesive design. The wallpaper has a subtle texture that adds depth and interest. The color scheme is bright and cheerful, making it suitable for a child's room or a playful living space.\"},\n",
+ " {'image_id': '1172',\n",
+ " 'caption': \"The image shows a farmer's field with various vegetables and fruits growing in it. The vegetables include tomatoes, peppers, carrots, and onions. The fruits include apples, pears, and grapes. There are also some flowers in the field, including daisies and sunflowers. The sky is clear and blue, and there is a barn in the background.\\n\\nThe image is a beautiful representation of a farm and the different crops that are grown there. The colors are vibrant and the details are clear, making it a very visually appealing image. The vegetables and fruits are arranged in a way that looks natural and realistic. The flowers add a touch of beauty to the scene, and the barn in the background gives a sense of scale and context. Overall, it is a very well composed image that captures the essence of a farm.\"},\n",
+ " {'image_id': '1173',\n",
+ " 'caption': 'The image shows a cartoon character from the TV show The Simpsons holding a hammer and standing in front of a black background. The character is wearing a yellow shirt and blue pants, and their face is not visible.'},\n",
+ " {'image_id': '1176',\n",
+ " 'caption': 'living room, perfect for relaxing or entertaining guests.'},\n",
+ " {'image_id': '1177',\n",
+ " 'caption': 'This image shows a plate of food with chicken, french fries, coleslaw, and a lemon wedge on the side. The plate is on a white tablecloth with a red and white checkered pattern. There is a red and white checkered napkin on the table next to the plate. The chicken appears to be baked or grilled and is served with a side of french fries and coleslaw. The coleslaw is made with shredded cabbage, carrots, and a creamy dressing. The lemon wedge adds a touch of acidity to the dish. Overall, this appears to be a hearty and satisfying meal.'},\n",
+ " {'image_id': '1179',\n",
+ " 'caption': 'The image shows a garden with sunflowers, daisies, and butterflies arranged in a vase. In the foreground, there is a chalkboard with the words \"14 February Valentine\\'s Day\" written on it. The background consists of a lush green meadow with tall grass and a clear blue sky.'},\n",
+ " {'image_id': '1180',\n",
+ " 'caption': \"The image shows a bride and groom standing in a field at sunset, with the groom kissing the bride's cheek\"},\n",
+ " {'image_id': '1184',\n",
+ " 'caption': \"This is a baby's room with white walls and a wooden floor. A crib is placed in the corner with a zebra print rug beneath it. Animal-themed artwork decorates the walls, featuring a giraffe and a zebra. A white chair and small table are positioned in another corner.\"},\n",
+ " {'image_id': '1185',\n",
+ " 'caption': 'image shows a man wearing a black coat with a grey and white pattern, black pants, and white sneakers with a white sole and laces. He is carrying a black backpack with white accents on the straps, which has a zipper closure. He is standing on a sidewalk next to a building with white walls and a grey roof. A black car with tinted windows is parked on the right side of the image.'},\n",
+ " {'image_id': '1187',\n",
+ " 'caption': 'This image shows a group of people throwing confetti at a bride and groom as they exit their wedding ceremony. The bride is wearing a white wedding dress and the groom is wearing a black tuxedo. They are surrounded by their friends and family who are throwing confetti at them as they walk down the street. The image is taken from a low angle and shows the bride and groom walking towards the camera with a big smile on their faces.'},\n",
+ " {'image_id': '1188',\n",
+ " 'caption': 'This image shows a black jeep with large tires parked in a parking lot. The jeep has a lifted suspension and large tires on both the front and back. The body of the jeep appears to be in good condition, with no visible scratches or dents. The windows are tinted, and the wheels are shiny and clean. The license plate on the jeep is not visible in this image.'},\n",
+ " {'image_id': '1189',\n",
+ " 'caption': 'This image shows a hotel room with a large bed, a desk, a chair, and a lamp. The bed is made with a white comforter and pillows, and there is a nightstand with a lamp on it. The desk has a laptop and a phone on it, and there is a chair in front of it. The room has a door that leads to a balcony with a view of the city.'},\n",
+ " {'image_id': '1190',\n",
+ " 'caption': 'The image shows a pile of soybeans, which are a type of legume that is commonly used in cooking and as a source of protein. The beans are arranged in a pile, with some of them spilling out of the pile and onto the ground. The beans are a light brown color and appear to be freshly harvested. The image has a white background, which helps to highlight the beans and make them appear more crisp and fresh. The overall effect of the image is one of abundance and wholesomeness, as the pile of beans suggests a bountiful harvest.'},\n",
+ " {'image_id': '1192',\n",
+ " 'caption': 'The image shows a person\\'s hand reaching into a glass jar filled with coins, with the words \"retirement savings\" written on a piece of paper next to it. The jar is sitting on top of a wooden table, surrounded by various other items such as a clock, a plant, and a book. The scene appears to be taking place outside, as there are trees and a blue sky visible in the background.'},\n",
+ " {'image_id': '1193',\n",
+ " 'caption': 'This image shows a series of photographs of a man riding a horse. The man is wearing a suit and a hat, and is holding the reins of the horse with both hands. The horse is also wearing a bridle and a saddle. The photographs show the man and horse in various poses, including jumping over obstacles, running, and standing still. The background of the photographs appears to be a grassy field with trees in the distance.'},\n",
+ " {'image_id': '1194',\n",
+ " 'caption': 'or he may be someone exploring the abandoned building. The cloudy and overcast sky adds to the overall dreary and desolate mood of the image.'},\n",
+ " {'image_id': '1195',\n",
+ " 'caption': 'The image shows a group of anime characters with different facial expressions, hairstyles, and clothing. Some characters are smiling, while others have serious or frowning expressions. The characters are wearing school uniforms or casual clothing, and the background is plain white.'},\n",
+ " {'image_id': '1196',\n",
+ " 'caption': 'The image is a cartoon of an ice cream cone with a scoop of vanilla ice cream on top and a cinnamon stick as a garnish, on a pink background. The image is simple, colorful, and the ice cream cone is the main focus, with the cinnamon stick being a small detail on top.'},\n",
+ " {'image_id': '1197',\n",
+ " 'caption': 'This image shows a large, white stone building with columns and a dome on top. Several windows are visible on the sides, and there are trees in front of the building. The background shows a cloudy sky.'},\n",
+ " {'image_id': '1198',\n",
+ " 'caption': 'The image shows a large group of people wearing masks and standing in front of a brick wall. The people are all wearing different types of masks, with some wearing full face masks and others wearing half masks. They are all standing in a line, with their arms crossed in front of them. The people in the image are all wearing black and white clothing, with some wearing black pants and white shirts, and others wearing black and white striped shirts. The people are also wearing different types of shoes, with some wearing sneakers and others wearing boots.'},\n",
+ " {'image_id': '1199',\n",
+ " 'caption': 'There are two birds walking on the sidewalk in front of a house. The birds are walking in the direction of the house. There is a car parked on the right side of the street in front of the house. The street is lined with trees on both sides and has a concrete sidewalk on the left side.'},\n",
+ " {'image_id': '1200',\n",
+ " 'caption': 'This image shows a bride and groom sharing their first dance at their wedding reception. The bride is wearing a white wedding dress and the groom is wearing a black tuxedo. They are both smiling and looking at each other as they dance. In the background, there is a crowd of people watching them. The room is decorated with white tablecloths, candles, and greenery. The walls are painted white and there are large windows with white drapes that go all the way up to the ceiling. The floor is made of wooden planks.'},\n",
+ " {'image_id': '1201',\n",
+ " 'caption': 'This image shows a woman standing in front of a mirror, looking at her reflection. She is wearing a black dress with a low neckline and long sleeves, and her hair is styled in loose waves. The room is dimly lit, with a chandelier hanging from the ceiling and a table with a vase of flowers on it in the foreground. The walls are adorned with paintings and mirrors, and there is a grandfather clock in the corner. The overall atmosphere is elegant and sophisticated.'},\n",
+ " {'image_id': '1202',\n",
+ " 'caption': 'The image shows two boxer dogs, one brown and white and the other black and white, sitting next to each other on a white background. Both dogs have their mouths open, as if they are barking or about to bark. The brown and white dog has a collar around its neck, while the black and white dog has no collar. The image is well lit and the dogs appear to be in good condition.'},\n",
+ " {'image_id': '1203',\n",
+ " 'caption': \"a Christian event or to show the importance of religion in people's lives. It could also be used in a historical context to show how people dressed for religious events in the past.\"},\n",
+ " {'image_id': '1205',\n",
+ " 'caption': 'This is a magazine cover featuring two women standing in front of a building. The woman on the left is holding a book and the woman on the right is holding a piece of paper. The title of the magazine is written in Arabic at the top of the cover, and the name of the magazine is written in English at the bottom of the cover. There is also a small illustration of a camera on the left side of the cover.'},\n",
+ " {'image_id': '1207',\n",
+ " 'caption': 'This is an image of an espresso machine made of stainless steel with a sleek, modern design.'},\n",
+ " {'image_id': '1208',\n",
+ " 'caption': \"The image is a black mug with white text on it that reads, `'takes a strong man to raise children, but an even stronger man to make his mother proud'`.\"},\n",
+ " {'image_id': '1209',\n",
+ " 'caption': 'In the image, a woman is seen sitting at a table with a plate of food in front of her. She wears a striped shirt and has short, curly hair. On the table, there is a plate with two pieces of chicken and french fries, as well as two glasses of drink - one water and one soda. The background seems to be a restaurant or diner.'},\n",
+ " {'image_id': '1210',\n",
+ " 'caption': 'and ready to eat. The bowl is made of black metal and has a handle on the side. The wooden table has a rough and textured surface. In the background, there is a black chalkboard with some writing on it.'},\n",
+ " {'image_id': '1211',\n",
+ " 'caption': \"The image shows a toy store window display with various toys, such as cars and dolls, arranged for easy viewing. The store's name is written in red letters at the top of the window, and posters and advertisements adorn the walls.\"},\n",
+ " {'image_id': '1212',\n",
+ " 'caption': 'The image shows a group of people sitting around a dining table, raising their glasses in a toast. They are all smiling and appear to be enjoying themselves. The table is set with plates, silverware, and glasses. There is a vase of flowers in the center of the table. The walls are painted a light color and there is a chandelier hanging from the ceiling. The room appears to be well lit and spacious.'},\n",
+ " {'image_id': '1214',\n",
+ " 'caption': 'This is a photograph of a man lying on a bed with his baby on his chest. The man is wearing a blue shirt and jeans, while the baby is wearing a white onesie. The baby is making a peace sign with its hand, while the man is looking down at it with a smile on his face. The background of the photo is a white wall.'},\n",
+ " {'image_id': '1215',\n",
+ " 'caption': 'This The image shows a seagull perched on a wooden post in front of a body of water. In the background, there is a small town with houses and boats in the water. The seagull has its wings spread out and appears to be looking at something in the distance. The sky is clear and blue, with a few fluffy clouds visible.'},\n",
+ " {'image_id': '1216',\n",
+ " 'caption': 'This is an image of a toilet with a child standing next to it. The toilet is white and has a tank on top and a bowl at the bottom. There is a handle on the side of the tank that is used to flush the toilet. The child appears to be reaching for something in the toilet, but it is not clear what they are reaching for. The room is a bathroom, as there is a sink and a shower in the background. The walls and floor are white and made of tile.'},\n",
+ " {'image_id': '1217',\n",
+ " 'caption': 'The image shows a computer monitor with a blank screen. There is nothing else in the image.'},\n",
+ " {'image_id': '1218',\n",
+ " 'caption': 'The image is a collage of various sports equipment, including a soccer ball, a basketball, a volleyball, a football, a tennis racket, and a baseball bat. There is also a text overlay on the image that reads,'},\n",
+ " {'image_id': '1221',\n",
+ " 'caption': 'The image shows a glass of aloe vera juice with a straw in it and some aloe vera leaves on the side. Aloe vera is a succulent plant with thick, fleshy leaves used for medicinal purposes. The juice is made from the inner leaves and is known for its soothing and healing properties. It is used to treat skin conditions such as sunburn, eczema, and psoriasis, as well as for digestive issues and as a natural remedy for colds and flu. The plant is native to tropical and subtropical regions and is widely cultivated for its medicinal properties. It has a long history of use in traditional medicine and is still used today as a natural remedy for a variety of ailments.'},\n",
+ " {'image_id': '1224',\n",
+ " 'caption': \"This is a children's bedroom with two twin beds, each with pink and white striped bedding. The walls are covered in a colorful floral wallpaper, and there are pink and white curtains on the windows. There is a white desk with a pink chair in the corner of the room, and a white bookshelf with pink and white books on it. The floor is covered in a light pink carpet, and there is a pink and white rug in front of the beds.\"},\n",
+ " {'image_id': '1225',\n",
+ " 'caption': 'The image shows a bowl of cooked onions, a knife, and a cutting board on a wooden surface. The onions are sliced and appear to have been cooked in a pan. The knife is next to the bowl and appears to have been used to cut the onions. The cutting board has a brown surface and appears to have been used to prepare the onions.\\n\\nThe image shows a bowl of cooked onions, a knife, and a cutting board on a wooden surface. The onions are sliced and appear to have been cooked in a pan. The knife is next to the bowl and appears to have been used to cut the onions. The cutting board has a brown surface and appears to have been used to prepare the onions.'},\n",
+ " {'image_id': '1226',\n",
+ " 'caption': 'The image is of a blue shirt with a pink \"soon to be dad\" design on the front in white letters. The shirt has short sleeves, a round neckline, and is made of a lightweight material like cotton or polyester. The design is centered on the chest.'},\n",
+ " {'image_id': '1228',\n",
+ " 'caption': 'The image shows a black and grey jacket with the words \"The North Face\" written on the left chest in white.'},\n",
+ " {'image_id': '1229',\n",
+ " 'caption': 'This image depicts a lineman working on an electrical pole. The lineman is wearing a yellow hard hat, safety glasses, gloves, and a harness. He is holding a bucket in one hand and using the other to climb up the pole. The pole is wooden and has metal crossbars for the lineman to hold onto while working. The sky is cloudy and there is a power line running from the pole to a nearby building.'},\n",
+ " {'image_id': '1230',\n",
+ " 'caption': \"This image shows a bride and groom standing in a field of tall grass at sunset. The couple is embracing and looking into each other's eyes. The bride is wearing a white wedding dress and the groom is wearing a black tuxedo. The sun is setting behind them, casting a warm orange light on their faces and the grass. There are trees in the background, and the sky is pink and orange from the sunset.\"},\n",
+ " {'image_id': '1231',\n",
+ " 'caption': 'The image depicts a red 2020 VW Tiguan SUV driving on a wet road with water splashing up from its tires. The SUV has a panoramic sunroof and two people are standing under an umbrella in front of it, looking at the car as it passes by. The VW Tiguan is a midsize SUV with spacious, comfortable interior and strong performance. It also boasts advanced features such as panoramic sunroof and safety features like lane departure warning and automatic emergency braking.'},\n",
+ " {'image_id': '1232',\n",
+ " 'caption': 'This image shows a living room with a wooden floor and a large window that lets in a lot of natural light. The walls are painted in a light blue color, and there are several shelves with books and other items on them. There is a large wooden table in the center of the room, with four chairs around it upholstered in a dark brown leather, and two lamps on the table. The room has a high ceiling with exposed wooden beams, and there is a chandelier hanging from the center. The overall style is modern and minimalist, with a focus on natural materials and neutral colors.'},\n",
+ " {'image_id': '1233',\n",
+ " 'caption': 'This image is a graphic design with the phrase \"you got this\" written in rainbow colors on a blue and pink background, using a stylized font.'},\n",
+ " {'image_id': '1234',\n",
+ " 'caption': 'The image shows a living room with two couches, a coffee table, and a piano. There are two children sitting on the couches, one playing on a tablet and the other playing with a toy. The room is well lit with natural light coming in from the windows. The walls are painted in a neutral color, and there is a rug on the floor. The overall atmosphere of the room is cozy and comfortable.'},\n",
+ " {'image_id': '1237',\n",
+ " 'caption': 'This image appears to be a photograph of a city skyline at dusk, with purple and pink hues in the sky. The text \"welcome to the broads\" is written in white letters on the image. The skyline appears to be that of a large city, with multiple skyscrapers and buildings visible in the distance. There are also some clouds in the sky, which are illuminated by the setting sun. The overall mood of the image is peaceful and serene, with the purple and pink hues giving it a dreamy, ethereal quality.'},\n",
+ " {'image_id': '1239',\n",
+ " 'caption': 'This image is a logo for a company or brand that uses the initials BM. The logo consists of a wreath made of laurel leaves with the initials BM in the center. The design is simple and elegant, making it suitable for use in various applications such as business cards, letterheads, and websites. The color scheme is black and white, with the initials in black and the laurel wreath in white. The overall design is modern and professional, conveying a sense of sophistication and elegance.'},\n",
+ " {'image_id': '1240',\n",
+ " 'caption': 'This is a photograph of a street in a small town. The buildings on either side of the street are different colors and styles. There are cars parked on the street and people walking on the sidewalk. The sky is clear and blue with a few distant clouds. The street is lined with trees and has small shops and restaurants.'},\n",
+ " {'image_id': '1241',\n",
+ " 'caption': 'image depicts a man walking a camel through the desert, with sand dunes and hills in the background, and a clear blue sky above. The man is wearing a white robe and headscarf, and is leading the camel by a rope. The camel is also wearing a rope around its neck and appears to be following behind the man. There are no other people or animals visible in the image.'},\n",
+ " {'image_id': '1242',\n",
+ " 'caption': 'This image shows a fallen tree in front of a house. The tree has fallen across the driveway, blocking access to the house. It appears to have been uprooted by strong winds, and its branches are scattered across the ground. The house is undamaged, and the surrounding area is littered with debris from the fallen tree. The sky is cloudy and overcast, casting a gloomy mood over the scene.'},\n",
+ " {'image_id': '1243',\n",
+ " 'caption': \"This image shows a large collection of nutcrackers on display in a store window. The nutcrackers are made of various materials such as wood, metal, and plastic, and they have different shapes, sizes, and colors. Some are standing, while others are hanging or sitting on shelves, forming a festive display for the holiday season. In the background, we can see the store's interior reflected in the window. There are some decorations hanging from the ceiling, but they are not visible in this image. The image depicts a festive atmosphere and the joy of the holiday season.\"},\n",
+ " {'image_id': '1244',\n",
+ " 'caption': 'The image shows a large, modern building with a glass facade and several floors. On the ground floor, there is a large open space with plants and trees growing in the middle. On the upper floors, there are several balconies with glass railings that offer a view of the city. The building appears to be part of a larger complex, with other buildings visible in the background. The interior of the building is well lit and spacious, with natural light coming in through the large windows. The walls are made of white and gray materials, and there are several pieces of modern furniture scattered throughout the space. The overall design of the building is sleek and modern.'},\n",
+ " {'image_id': '1245',\n",
+ " 'caption': 'This is a photograph of a cup of coffee and a muffin on a table. The coffee is in a white ceramic cup with a brown handle, while the muffin is on a plate next to the cup. There is a brown and white checkered napkin on the table next to them. A brown and white checkered box is also on the table in front of them, with a brown and white checkered label on it saying \"seed and sprout\" in brown letters. In front of the box, there is a brown and white checkered plate with a brown and white checkered fork, knife, and spoon on it.'},\n",
+ " {'image_id': '1248',\n",
+ " 'caption': 'The image shows a silhouette of a cat sitting on a windowsill looking out at the night sky with stars and a crescent moon. The background is a purple gradient with curtains hanging from the top of the window.'},\n",
+ " {'image_id': '1249',\n",
+ " 'caption': 'The image shows a metal sculpture of a globe on a stand. The globe is made of metal and has a polished finish. The stand is also made of metal and has a curved shape. The globe appears to be spinning on its axis. There are no other objects in the image. The background is white.'},\n",
+ " {'image_id': '1250',\n",
+ " 'caption': 'The image shows a map of a city with roads, buildings, parks, and landmarks. There is a large green area in the center of the map, surrounded by roads and buildings. The roads are labeled with different names and directions, and there are several parks marked on the map, including a hospital, a school, and a shopping center. The map is well-organized and easy to read.'},\n",
+ " {'image_id': '1251',\n",
+ " 'caption': 'horizontal and evenly spaced. The background behind the duck and carpet is white.'},\n",
+ " {'image_id': '1252',\n",
+ " 'caption': 'This image shows a large, modern building with several floors and many windows. It appears to be a hotel, with the word \"hotel\" written in large letters on the front of the building. There are several cars parked on the street in front of the building, and a few pedestrians can be seen walking on the sidewalk. The building is surrounded by trees and other buildings, and there is a clear blue sky in the background.'},\n",
+ " {'image_id': '1253',\n",
+ " 'caption': 'This is a tiny house made of wood with a porch on the front, a large window on the side, and a shingle roof. The house has wheels on the bottom for easy mobility.'},\n",
+ " {'image_id': '1254',\n",
+ " 'caption': 'This is an advertisement for a Canadian beer brand featuring a woman wearing a straw hat and holding a can of the beer. The image is in black and white, with the words \"the easy one from canada dry\" on the side of the can. The woman is smiling and appears to be enjoying the beer.'},\n",
+ " {'image_id': '1255',\n",
+ " 'caption': 'The image shows a group of children playing in the park. They are climbing on the tree and swinging on the swings. There is a fence surrounding the park and a white picket fence in the foreground. The sky is blue and there are clouds in the background.'},\n",
+ " {'image_id': '1258',\n",
+ " 'caption': 'This The image shows a red background with the words \"Welcome to the Blue Cat Lodge Marina\" written in blue letters. The words are written in a fun, playful font that matches the nautical theme of the marina. The image also features an illustration of a cat with a fishing rod in its mouth, swimming in the water in front of the marina. The cat looks happy and content, as if it has just caught a big fish. The overall design of the image is bright, colorful, and inviting, conveying a sense of fun and relaxation.'},\n",
+ " {'image_id': '1259',\n",
+ " 'caption': 'ce gown in a champagne color. The dress has a fitted bodice with a sweetheart neckline and a sheer lace overlay. The skirt of the dress is a mermaid style with a long train. She completed her look with a pair of silver heels and a matching clutch.'},\n",
+ " {'image_id': '1260',\n",
+ " 'caption': 'There are three penguins standing. The first penguin is standing on its hind legs with its beak open, calling out to the other penguins. The second penguin is standing on its hind legs with its beak closed. The third penguin is standing on all fours, with its head tilted to the side as if looking at the others. They are in a zoo enclosure with a fence and a gate in the background. The ground is covered in dirt and rocks, with some plants growing in the background. The image depicts a group of penguins in a zoo enclosure, with one of them calling out to the others.'},\n",
+ " {'image_id': '1261',\n",
+ " 'caption': 'people walking down a sidewalk next to a parked SUV. The car is a 2020 Ford Edge in a dark brown color with tinted windows. The license plate is not visible. The people are wearing casual clothing and carrying bags, with one person in a black hoodie and the other in a white shirt and black pants. The sidewalk is made of concrete with a yellow line down the middle and trees on either side. Buildings can be seen in the background and the sky is cloudy with no visible sun.'},\n",
+ " {'image_id': '1262',\n",
+ " 'caption': 'This image is a pattern or texture of green, blue, and purple shades. It can be used as a background or design element in graphic or web design.'},\n",
+ " {'image_id': '1264',\n",
+ " 'caption': \"This image is a map of the state of Wyoming in the United States. The state's name is written in white letters on a black background, with a white outline of the state's shape. Inside the state's shape, there is a white silhouette of the Rocky Mountains, which are located in the western part of Wyoming. The state's capital, Cheyenne, is marked with a white dot in the southeastern part of the state. There are also several white lines crisscrossing the state, which may represent highways or rivers. The overall design of the map is simple and easy to read, with clear labels and a minimalist style.\"},\n",
+ " {'image_id': '1265',\n",
+ " 'caption': 'The image shows a group of pink cherry blossom trees in full bloom, surrounded by tall, dark pine trees. A group of people can be seen walking along a path through the trees, enjoying the beauty of the blossoms. The sky is cloudy, with sunlight peeking through the clouds, casting a warm glow on the scene.'},\n",
+ " {'image_id': '1266',\n",
+ " 'caption': 'This is a living room with light pink walls, a white couch, and a grey carpet. A flat screen television is mounted on the wall, and a small white coffee table is placed in front of the couch. The room has large windows that provide ample natural light. No other furniture is present in the room.'},\n",
+ " {'image_id': '1267',\n",
+ " 'caption': 'This is an image of a room with many cardboard boxes stacked on top of each other, leaving some empty spaces. The room has wooden walls, a window on one side, wooden plank flooring with a rug on top, and a door on the other side that is not visible in the image.'},\n",
+ " {'image_id': '1268',\n",
+ " 'caption': 'This image depicts a person riding a skateboard. The person is wearing a yellow shirt and blue shorts, and is holding a bottle of water in one hand while riding the skateboard with the other. The background is a white surface. The overall mood of the image is casual and relaxed. The image could be used to depict a person enjoying a leisurely activity such as skateboarding.'},\n",
+ " {'image_id': '1269',\n",
+ " 'caption': 'The image is a black and white icon of a euro sign (€) with two arrows coming out of it. The arrow on the left points to the right, and the arrow on the right points to the left. The euro symbol is surrounded by a circle.\\n\\nThe euro symbol is the official currency of the European Union (EU). It was introduced in 1999 and is used by 19 of the 27 member states of the EU. The euro is the second largest and second most traded currency in the world after the US dollar. It is used as a medium of exchange in many countries, and its value is determined by the market demand for it.\\n\\nThe two arrows in the image represent the flow of currency, with one arrow pointing to the right indicating an increase in value and the other arrow pointing to the left indicating a decrease in value. The circle surrounding the euro symbol represents the stability and security of the currency.'},\n",
+ " {'image_id': '1271',\n",
+ " 'caption': 'The image shows a sunset over a city skyline. The sky is cloudy and the sun is setting behind the buildings, casting long shadows on the rooftops. The buildings are a mix of residential and commercial structures, with some tall skyscrapers in the background. There are also some trees and power lines visible in the foreground. The overall mood of the image is peaceful and serene, with the sun casting a warm glow over the scene.'},\n",
+ " {'image_id': '1272',\n",
+ " 'caption': \"The very hungry caterpillar is a popular children's book character created by author and illustrator Eric Carle. The book tells the story of a caterpillar who eats his way through different foods and then transforms into a beautiful butterfly, teaching children about the life cycle of a butterfly and the importance of eating healthy foods. The book has been translated into many languages and has become a classic in children's literature.\"},\n",
+ " {'image_id': '1273',\n",
+ " 'caption': \"The image shows the interior of a car with a steering wheel and dashboard in front of the driver's seat.\"},\n",
+ " {'image_id': '1274',\n",
+ " 'caption': 'The image shows a red beret commonly worn by military personnel, police officers, and other professionals. It is made of wool or cotton and has a flat top and a strap that goes around the back of the head. The beret is maroon in color and has a badge or emblem on the front, which appears to be a shield or crest with some sort of insignia or design on it. The strap that goes around the back of the head is not visible in this image.'},\n",
+ " {'image_id': '1275',\n",
+ " 'caption': 'This image depicts a The image shows a statue of the goddess of justice, holding a scale in one hand and a sword in the other. The statue is made of bronze and stands on a marble pedestal. The goddess is depicted in a classical pose, with a serious expression on her face and a flowing robe draped around her body. The background is a blurred image of trees and greenery, with sunlight filtering through the leaves. The overall effect is one of grandeur and reverence, conveying the idea of justice and fairness.'},\n",
+ " {'image_id': '1276',\n",
+ " 'caption': 'This image shows a person sitting in the back of a golf cart with a stuffed animal in their lap. The person is wearing a black and white striped shirt, black pants, and black shoes. They are holding a golf club in their right hand. The cart is parked on a grassy area with trees in the background. Other people can be seen playing golf in the distance.'},\n",
+ " {'image_id': '1278',\n",
+ " 'caption': \"The image is a logo for the Tampa Bay Lightning, a professional ice hockey team in the National Hockey League. The logo features a blue and white lightning bolt with the team's name in white letters on a black background. The team's colors are blue, white, and black. The Tampa Bay Lightning were founded in 1992 and have played in the NHL since 1993. They won one Stanley Cup championship in 2004. The team's home arena is the Amalie Arena in Tampa, Florida. The Lightning have a large fan base and are known for their fast and exciting style of play.\"},\n",
+ " {'image_id': '1282',\n",
+ " 'caption': 'The image shows an underwater housing for a camera, which is designed to protect the camera from water and other environmental factors. The housing has a large lens on the front, several buttons and controls to adjust the settings, and an LED light to illuminate subjects in low light conditions. It is made of durable materials, such as metal and plastic, to withstand the rigors of underwater use.'},\n",
+ " {'image_id': '1283',\n",
+ " 'caption': 'dimly lit bar or tavern with shelves of alcohol, people drinking and chatting at the bar, and a variety of clothing worn by the patrons.'},\n",
+ " {'image_id': '1285',\n",
+ " 'caption': 'This image shows two hands holding a small electronic device. The device has a black and green circuit board on the top and a white and green circuit board on the bottom. The hands are holding the device by the sides, with the thumb and index finger on one side and the middle and ring fingers on the other. The device appears to have a small screen on the front and several small buttons on the sides.'},\n",
+ " {'image_id': '1287',\n",
+ " 'caption': \"This is a pencil drawing of a woman wearing a Native American headdress and holding a leopard. The woman has long, straight hair and is wearing a feathered headdress with a leopard skin pattern on it. The leopard is also depicted in the drawing, with its head peeking out from behind the woman's shoulder. The overall style of the drawing is very detailed and realistic, with a lot of attention paid to the fur and feathers on the headdress and the leopard. The woman's face is also very well drawn, with a lot of attention paid to the details of her features and expression.\"},\n",
+ " {'image_id': '1288',\n",
+ " 'caption': 'This image shows a cabin in the woods at night, with snow covering the ground and trees around it. The cabin is lit up by the light coming through the windows, casting a warm glow on the surrounding area. It appears to be made of wood and has a pitched roof, with a chimney on top. There is a porch in front of the cabin, with a wooden railing and steps leading down to the ground. The trees around the cabin are tall and covered in snow, with branches hanging down from the top. The sky is clear and dark, with stars visible in the background.'},\n",
+ " {'image_id': '1289',\n",
+ " 'caption': 'The image depicts a body of water with a small island in the distance. The island features a lighthouse and there are rocks in the foreground. The background consists of a large body of water with mountains visible. The sky is cloudy and a boat is present in the water.'},\n",
+ " {'image_id': '1291',\n",
+ " 'caption': 'This image shows a man standing on a ladder in front of a colorful graffiti mural. The man is wearing a black hoodie and appears to be painting the mural. The mural features a variety of colors, including green, blue, and orange. There are also various shapes and designs in the mural, including letters, numbers, and abstract designs.'},\n",
+ " {'image_id': '1292',\n",
+ " 'caption': 'The car in the image is a BMW M4, which is a luxury sports car manufactured by the German automaker BMW. The car has a sleek and stylish design.'},\n",
+ " {'image_id': '1293',\n",
+ " 'caption': 'The image shows a beach with tall buildings in the background. People are walking on the sandy beach. The clear, blue sky has a few distant clouds. A tall, thin structure in the center appears to be a lighthouse.'},\n",
+ " {'image_id': '1294',\n",
+ " 'caption': \"This is a movie poster. The image shows a man and a woman in a romantic embrace. The man is wearing a white shirt and black pants, the woman a red dress. The background is a gradient of blue and purple. The title of the movie is written in large, bold letters at the top of the poster with the actors' names below. The poster appears to be from the 1960s or 1970s based on the style and fashion of the clothing.\"},\n",
+ " {'image_id': '1296',\n",
+ " 'caption': 'The image shows a silver necklace with a small anchor charm made of metal and coated with blue enamel. The silver chain has a lobster claw clasp and the anchor charm has a small hole at the top for hanging it on the necklace. The anchor symbolizes hope and strength, particularly associated with the sea and sailing. The blue enamel coating adds a pop of color to the piece, making it stand out, and the silver chain and lobster claw clasp complete the look, making it a stylish and durable piece of jewelry.'},\n",
+ " {'image_id': '1297',\n",
+ " 'caption': 'The image shows a group of people sitting around a long table. They are all dressed in formal attire, such as suits and ties for the men and dresses for the women. They are all looking at a piece of paper on the table in front of them. There are pens and other writing utensils on the table as well. The room is well lit and appears to be an office or meeting room.'},\n",
+ " {'image_id': '1301',\n",
+ " 'caption': 'The image shows a woman wearing a yellow The woman is wearing a yellow gown with a black belt around the waist. The gown has a deep v neckline and a long slit on the side. The woman is posing on the red carpet and has her hands on her hips. She is wearing high heels and a pair of earrings. The background is a blurred image of people and lights.'},\n",
+ " {'image_id': '1302',\n",
+ " 'caption': 'The image shows a set of plastic containers with different colored lids. The containers are stacked on top of each other and are filled with different types of fruits, such as grapes, strawberries, and raspberries. Each container has a different color lid, with some lids being green, blue, yellow, and red. The containers appear to be made of clear plastic, allowing the contents inside to be seen.'},\n",
+ " {'image_id': '1303',\n",
+ " 'caption': 'The image shows a group of people standing in front of a large banner with a cartoon character on it. The people in the image are all smiling and seem to be enjoying themselves. The person in the center of the image is holding a small child, who is looking up at the camera with a big smile on their face. The other people in the image are standing around the child and the person holding them, and they all seem to be part of the same group or family. The background of the image is not very clear, but it appears to be a large, open space with some trees and buildings visible in the distance.'},\n",
+ " {'image_id': '1304',\n",
+ " 'caption': 'The image depicts a young man wearing suspenders and a red shirt, standing with his arms crossed in front of him. He is looking up at something with a surprised expression on his face. The image is in a flat, cartoon style.\\n\\nThe man is wearing suspenders and a red shirt. He is standing with his arms crossed in front of him, looking up at something with a surprised expression on his face. The image is in a flat, cartoon style.'},\n",
+ " {'image_id': '1305',\n",
+ " 'caption': 'The image shows a wooden statue of a woman holding a book and standing on a pedestal. The statue is intricately carved with detailed clothing and facial features. The pedestal is also carved with a decorative design.'},\n",
+ " {'image_id': '1307',\n",
+ " 'caption': 'This is an image of a black and white cartoon cat with a big grin on its face, standing on its hind legs with its arms outstretched to the sides. The cat is wearing a blue shirt with white sleeves and a white collar. Its eyes are large and white with black pupils, and its mouth is open in a smile. The cat is also wearing black pants with white stripes down the sides, and is standing on a light blue background. There are no other objects or characters in the image.'},\n",
+ " {'image_id': '1308',\n",
+ " 'caption': 'The image shows a group of people hiking on a rocky mountain trail. They are wearing backpacks and hiking boots, and some of them are carrying trekking poles. The sky is clear and blue, and there are some clouds in the distance. The mountains in the background are rugged and rocky, with some trees and shrubs growing on them. The trail is steep and narrow, with large rocks and boulders on either side. It looks like a challenging hike, but the group seems to be enjoying themselves.'},\n",
+ " {'image_id': '1310',\n",
+ " 'caption': 'This is an image of a table with several columns and rows. The columns are labeled with different numbers and the rows are labeled with different countries.'},\n",
+ " {'image_id': '1311',\n",
+ " 'caption': \"This is an image of a black and white cow standing in a dirt field. The cow appears to be grazing on the grass in the middle of the field. There are no other cows visible in the image. The sky is cloudy and there are some trees in the background. The image appears to have been taken from a distance, as there is a fence in the foreground that blocks the view of the cow's face.\"},\n",
+ " {'image_id': '1315',\n",
+ " 'caption': 'image depicts a strong and masculine man with a rugged appearance and a serious expression, showcasing his muscular physique and tattoos.'},\n",
+ " {'image_id': '1316',\n",
+ " 'caption': 'The image shows a bunch of green asparagus spears in a brown paper bag on a wooden surface. The spears are long and thin, with a bright green color and no signs of damage or decay. There is moisture on the surface of the asparagus, which suggests that it has recently been harvested. The overall appearance of the asparagus is fresh and healthy.'},\n",
+ " {'image_id': '1317',\n",
+ " 'caption': 'The image shows a small garden area with a white building in the background. The building appears to be made of wood and has a thatched roof. There are several trees and plants in the garden, including a large palm tree in the center. The garden is surrounded by a white picket fence and there is a path leading to the front door of the building.'},\n",
+ " {'image_id': '1318',\n",
+ " 'caption': \"This is an image of a metal box with the letters 'c s a' engraved on it. The box is gray and rectangular with rounded corners. The bold, capitalized letters are engraved into the surface. The box sits on a wooden surface visible in the background.\"},\n",
+ " {'image_id': '1319',\n",
+ " 'caption': 'The image shows a woman wearing a pink kimono with a pattern of pink and white flowers on a white background. Her hair is styled in a bun on top of her head and she has pink lipstick on her lips. It appears to be a still from a fashion show or advertisement.'},\n",
+ " {'image_id': '1320',\n",
+ " 'caption': \"This image is a clay sculpture of a bull with horns and a long tail. The bull is standing on its hind legs, with its front legs resting on the ground. The sculpture appears to be made of clay and is painted with various colors, including red, brown, and yellow. The bull's body is long and slender, with a curved back and a small head. The horns are large and curved, and the tail is long and thin. The overall appearance of the sculpture is very detailed and realistic, with a lot of attention paid to the animal's anatomy and features.\"},\n",
+ " {'image_id': '1321',\n",
+ " 'caption': 'The woman in the image is wearing a white t-shirt, black jacket, ripped jeans, and white sneakers. She is holding a cellphone in one hand and a coffee cup in the other while standing in front of a building with a large window and metal door. The sidewalk in front of the building is made of cobblestones and there are no other people in the image.'},\n",
+ " {'image_id': '1322',\n",
+ " 'caption': \"The image is a pencil drawing of a woman's head with long, straight hair. The hair is drawn in pencil with shading and highlights to give it a realistic look. The woman's face is not shown, but the hair is drawn in great detail, with strands of hair hanging down from the top of her head. There is a pencil next to the drawing, which is used to create the image.\"},\n",
+ " {'image_id': '1323',\n",
+ " 'caption': 'The image shows a group of people standing around a tree that has been planted in the ground. One person is holding a hose and spraying water onto the roots of the tree, while another person is holding a shovel and digging around the base of the tree. The people are wearing yellow shirts and green hats, and there is a green tarp on the ground next to the tree. The background is a grassy field with some trees in the distance.'},\n",
+ " {'image_id': '1324',\n",
+ " 'caption': \"The image shows a group of people standing in a crowd. They appear to be looking at something in the distance. Some of them are wearing hats and coats, while others are not. The image is in black and white, with the exception of the people's faces, which are in color. The people in the image appear to be of different ages and ethnicities. There is a sense of tension in the image, as the people seem to be waiting for something to happen. Overall, the image is a snapshot of a moment in time, capturing the diversity and humanity of the people in the crowd.\"},\n",
+ " {'image_id': '1326',\n",
+ " 'caption': 'The image shows a view of a harbor with a large cruise ship docked in the foreground. Several smaller boats and yachts are also docked around it. In the background, there is a large cliff with a city built into the side of it. The water in the harbor is a deep blue color, and there are several small islands visible in the distance. The sky is clear and blue, with a few fluffy clouds visible.'},\n",
+ " {'image_id': '1327',\n",
+ " 'caption': 'This is an image of a brown leather bag with a zipper on the top, sitting on a wooden bench. The bag has a shoulder strap. Next to the bag, there are a pair of sunglasses, a water bottle, and a book on the bench. The background of the image is a park or outdoor area, with trees and greenery visible.'},\n",
+ " {'image_id': '1328',\n",
+ " 'caption': 'The image shows a clear glass vase sitting on a white surface with a pink flower arrangement inside it. The vase has a round shape and is made of clear glass. The arrangement inside the vase consists of small pink flowers and green leaves. There are no other objects in the image.'},\n",
+ " {'image_id': '1329',\n",
+ " 'caption': 'The image shows a set of different types of eyeglasses on a white background. The eyeglasses are arranged in a circle, with each one having a different shape, color, and material. Some of the eyeglasses have tinted lenses for specific purposes, such as driving or reading, while others have clear lenses. They are also designed as fashion accessories.'},\n",
+ " {'image_id': '1330',\n",
+ " 'caption': 'The image shows a plate with various fruits on it - watermelon, strawberries, grapes, kiwi, starfruit, and pineapple. The plate is set against a white background and is surrounded by a black border.'},\n",
+ " {'image_id': '1331',\n",
+ " 'caption': 'The image shows a black and brown dog standing on a leash outside a brick building with a wooden door and a small window. The dog is wearing a collar and its tail is wagging while its ears are perked up. The ground is made of cement and there are some puddles on it. The sky is cloudy and it appears to be raining.'},\n",
+ " {'image_id': '1332',\n",
+ " 'caption': 'The image shows a group of people standing behind a podium in front of a large screen displaying a map of the world. The people are dressed in business attire and appear to be listening to someone speaking at the podium. There are several laptops and other electronic devices on the table in front of them.'},\n",
+ " {'image_id': '1333',\n",
+ " 'caption': 'In the image, there is a body of water with a sandy beach next to it. The water is clear and calm, reflecting the blue sky above. On the other side of the water, there is a grassy area with some trees and bushes. In the background, there is a building with white walls and a red roof, which appears to be a house. There are no people visible in the image.'},\n",
+ " {'image_id': '1334',\n",
+ " 'caption': \"The building in the image is a large, modern hotel with several floors and many windows. It appears to be made of concrete and steel, with a gray and white color scheme. There are several balconies on the upper floors, and a large sign with the hotel's name is visible on the front of the building. There are several cars parked in front of the hotel, and a sidewalk runs along the front of the building.\"},\n",
+ " {'image_id': '1335',\n",
+ " 'caption': 'The image shows the River Thames in London at night with the Houses of Parliament and Big Ben in the background. Two people are sitting on a bench in the foreground, looking out over the river. The illuminated lights on the bridge and buildings create a festive atmosphere.'},\n",
+ " {'image_id': '1336',\n",
+ " 'caption': \"The image shows two hearts, one red and one black and white. The red heart is on the left side and the black and white heart is on the right side. The red heart has a black outline around it and the black and white heart has a white outline around it. The background is a black and white checkerboard pattern.\\n\\nA heart is a symbol of love and affection. It is often associated with romantic feelings and emotions. The red heart is a symbol of passion and desire, while the black and white heart is a symbol of balance and harmony. The black and white checkerboard pattern is a symbol of contrast and duality.\\n\\nThe image could be used to represent a romantic or passionate theme, such as a valentine's day card or a romantic movie poster. It could also be used to represent a theme of balance and harmony, such as a yin and yang symbol.\"},\n",
+ " {'image_id': '1337',\n",
+ " 'caption': 'The image shows a swimming pool in a hotel with a wooden deck, lounge chairs, and umbrellas. The building behind the pool has two floors with balconies on the second floor. The balconies have wooden railings and potted plants. The sky is clear and blue without any clouds.'},\n",
+ " {'image_id': '1338',\n",
+ " 'caption': 'The image shows a reception desk with two women standing behind it. The desk is made of wood with a white countertop and two chairs in front of it. The walls of the room are wood panels, and there is a large window behind the desk that lets in natural light. The floor is tiled, and there is a rug in front of the desk.'},\n",
+ " {'image_id': '1339',\n",
+ " 'caption': 'The image is a brown and pink floral scarf made of silk or silk blend material. The scarf has tassels on the ends and is draped over a white surface.'},\n",
+ " {'image_id': '1340',\n",
+ " 'caption': 'This image is a watercolor painting of a woman sitting on a bench in a park. The woman is wearing a white dress and has long, curly brown hair. She is looking down at her phone, which is sitting on the bench next to her. The background of the image is a park with trees and a path leading into the distance. There is a blue sky with fluffy clouds in the background. The overall mood of the image is one of relaxation and contemplation.'},\n",
+ " {'image_id': '1342',\n",
+ " 'caption': 'This is a dish of pork with orange sauce, served with rice and chopsticks on a white plate on a bamboo mat.'},\n",
+ " {'image_id': '1343',\n",
+ " 'caption': 'The image shows an open window with a view of a mountain range in the distance. The window is made of stone and has a wooden frame. The view through the window is of a valley with trees and a river running through it. The sky is clear and blue, with a few fluffy clouds in the distance. The sun is shining down on the landscape, casting long shadows on the ground. There are no other buildings or structures visible in the image.'},\n",
+ " {'image_id': '1345',\n",
+ " 'caption': 'The image shows a room with a black grand piano in the center, surrounded by black and white album covers hanging on the wall. There are several framed photographs on the wall, including one of a group of people in the 1960s. The room is dimly lit, with a single spotlight shining down on the piano.'},\n",
+ " {'image_id': '1346',\n",
+ " 'caption': 'The image shows a white candle with a gold ribbon wrapped around it, sitting on top of a stack of books. The books are stacked in a pyramid shape, with the largest book at the bottom and the smallest at the top. There is a gold star on the cover of each book. The background is a light grey.'},\n",
+ " {'image_id': '1347',\n",
+ " 'caption': 'The image shows a woman wearing a beige trench dress and black sunglasses standing on the street. The dress has a belt around the waist and the sleeves are rolled up to the elbows. The woman is wearing a pair of black sunglasses and carrying a black handbag. The background is a street with buildings and trees.\\n\\nAnswer: The image shows a woman wearing a beige trench dress and black sunglasses standing on the street. The dress has a belt around the waist and the sleeves are rolled up to the elbows. The woman is wearing a pair of black sunglasses and carrying a black handbag. The background is a street with buildings and trees.'},\n",
+ " {'image_id': '1348',\n",
+ " 'caption': 'The image shows a plate with several green and orange fruits on it. The fruits appear to be sliced in half and arranged in a circular pattern on the plate. The plate is sitting on top of a black and white checkered floor.'},\n",
+ " {'image_id': '1349',\n",
+ " 'caption': 'The image shows a man wearing sunglasses and a green polo shirt, sitting at a table with a laptop in front of him. He has short, curly hair and is wearing a watch on his left wrist. The background is not visible, but it appears to be a room with a white wall and some furniture. The image is well lit and the colors are bright and vivid.'},\n",
+ " {'image_id': '1351',\n",
+ " 'caption': \"This image is a bronze statue of a woman wearing a dress and holding a flower in her hand. The statue is standing on a black marble base. The woman's hair is styled in a bun and she has a serene expression on her face. The statue has intricate details, including the folds in the woman's dress and the petals of the flower. The overall effect is one of elegance and grace.\"},\n",
+ " {'image_id': '1352',\n",
+ " 'caption': 'little girl in a pink bikini holding a pink bag with a serious expression on her face. Her curly hair and pink sunglasses add to her cute appearance. The background is simple with a light gray color and a white wall.'},\n",
+ " {'image_id': '1356',\n",
+ " 'caption': 'The image shows a soccer player standing on a field with a ball at his feet. The player is wearing a yellow shirt and black shorts. The background is a dark night sky with a few stars visible. There is a crowd of people watching the game from behind a fence.'},\n",
+ " {'image_id': '1357',\n",
+ " 'caption': \"a gateway to another dimension. The Dark Side of the Moon remains one of Pink Floyd's most popular albums and is considered a classic of the rock genre.\"},\n",
+ " {'image_id': '1360',\n",
+ " 'caption': 'There are two beds in the room, one is a double bed and the other is a single bed. Both beds are made with white sheets.'},\n",
+ " {'image_id': '1361',\n",
+ " 'caption': 'This is a black and white photograph of a woman standing in front of a brick wall. She is wearing a white shirt and black pants, and has her arms crossed in front of her chest. Her hair is pulled back into a ponytail, and she has a serious expression on her face. The photograph is taken from a low angle, looking up at the woman from below her waist. There is a small amount of grain visible in the image, but it does not detract from the overall quality of the photograph. The image is well composed, with the woman standing in the center of the frame and the brick wall providing an interesting background.'},\n",
+ " ...]}"
+ ]
+ },
+ "execution_count": 518,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "caps"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "acde12da",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "4a76d90b",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "7841cbd8",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "eye",
+ "language": "python",
+ "name": "eye"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.9.16"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/develop.ipynb b/develop.ipynb
new file mode 100644
index 0000000000000000000000000000000000000000..074068320f5826debcc46dd6608cd1b4d1f0d500
--- /dev/null
+++ b/develop.ipynb
@@ -0,0 +1,929 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "id": "d5ac353e",
+ "metadata": {
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "import argparse\n",
+ "import os\n",
+ "import shutil\n",
+ "import random\n",
+ "from PIL import Image\n",
+ "\n",
+ "import numpy as np\n",
+ "import torch\n",
+ "import torch.backends.cudnn as cudnn\n",
+ "from transformers import StoppingCriteria, StoppingCriteriaList\n",
+ "\n",
+ "import lavis.tasks as tasks\n",
+ "from lavis.common.config import Config\n",
+ "from lavis.common.dist_utils import get_rank, init_distributed_mode\n",
+ "from lavis.common.logger import setup_logger\n",
+ "from lavis.common.optims import (\n",
+ " LinearWarmupCosineLRScheduler,\n",
+ " LinearWarmupStepLRScheduler,\n",
+ ")\n",
+ "from lavis.common.registry import registry\n",
+ "from lavis.common.utils import now\n",
+ "\n",
+ "# imports modules for registration\n",
+ "from lavis.datasets.builders import *\n",
+ "from lavis.models import *\n",
+ "from lavis.processors import *\n",
+ "from lavis.runners import *\n",
+ "from lavis.tasks import *"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "4fdef7a6",
+ "metadata": {
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "shutil.copytree('/ibex/project/c2133/vicuna', '/tmp/vicuna')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "id": "661f9e80",
+ "metadata": {
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "class StoppingCriteriaSub(StoppingCriteria):\n",
+ "\n",
+ " def __init__(self, stops = [], encounters=1):\n",
+ " super().__init__()\n",
+ " self.stops = stops\n",
+ "\n",
+ " def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor):\n",
+ " for stop in self.stops:\n",
+ " if torch.all((stop == input_ids[0][-len(stop):])).item():\n",
+ " return True\n",
+ "\n",
+ " return False\n",
+ "\n",
+ "\n",
+ "stop_words_ids = [torch.tensor([835]).to('cuda:0'), \n",
+ " torch.tensor([2277, 29937]).to('cuda:0')] # '###' can be encoded in different ways.\n",
+ "stopping_criteria = StoppingCriteriaList([StoppingCriteriaSub(stops=stop_words_ids)])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "id": "1822a77a",
+ "metadata": {
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "parser = argparse.ArgumentParser(description=\"Training\")\n",
+ "\n",
+ "parser.add_argument(\"--cfg-path\", required=True, help=\"path to configuration file.\")\n",
+ "parser.add_argument(\n",
+ " \"--options\",\n",
+ " nargs=\"+\",\n",
+ " help=\"override some settings in the used config, the key-value pair \"\n",
+ " \"in xxx=yyy format will be merged into config file (deprecate), \"\n",
+ " \"change to --cfg-options instead.\",\n",
+ ")\n",
+ "\n",
+ "args = parser.parse_args([\"--cfg-path\", \"lavis/projects/blip2/train/vicuna_pretrain_stage2_cc.yaml\"])\n",
+ "\n",
+ "cfg = Config(args)\n",
+ "device = 'cuda:0'"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "id": "57e90f19",
+ "metadata": {
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "vis_processor_cfg = cfg.datasets_cfg.cc_combine.vis_processor.train\n",
+ "vis_processor = registry.get_processor_class(vis_processor_cfg.name).from_config(vis_processor_cfg)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "id": "4cc521da",
+ "metadata": {
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Loading LLAMA\n"
+ ]
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "abeac6970d914446adc1fb73f7e5b5f9",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "Loading checkpoint shards: 0%| | 0/3 [00:00, ?it/s]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Loading LLAMA Done\n",
+ "Load BLIP2-LLM Checkpoint: /home/zhud/project/blip2/lavis/output/BLIP2/Vicuna_pretrain_stage2_cc/20230405233/checkpoint_3.pth\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "╭─────────────────────────────── Traceback (most recent call last) ────────────────────────────────╮\n",
+ "│ in <module>:2 │\n",
+ "│ │\n",
+ "│ 1 task = tasks.setup_task(cfg) │\n",
+ "│ ❱ 2 model = task.build_model(cfg) │\n",
+ "│ 3 │\n",
+ "│ │\n",
+ "│ /home/zhud/project/blip2/lavis/tasks/base_task.py:33 in build_model │\n",
+ "│ │\n",
+ "│ 30 │ │ model_config = cfg.model_cfg │\n",
+ "│ 31 │ │ │\n",
+ "│ 32 │ │ model_cls = registry.get_model_class(model_config.arch) │\n",
+ "│ ❱ 33 │ │ return model_cls.from_config(model_config) │\n",
+ "│ 34 │ │\n",
+ "│ 35 │ def build_datasets(self, cfg): │\n",
+ "│ 36 │ │ \"\"\" │\n",
+ "│ │\n",
+ "│ /home/zhud/project/blip2/lavis/models/blip2_models/blip2_llama.py:315 in from_config │\n",
+ "│ │\n",
+ "│ 312 │ │ ckpt_path = cfg.get(\"ckpt\", \"\") │\n",
+ "│ 313 │ │ if ckpt_path: │\n",
+ "│ 314 │ │ │ print(\"Load BLIP2-LLM Checkpoint: {}\".format(ckpt_path)) │\n",
+ "│ ❱ 315 │ │ │ ckpt = torch.load(ckpt_path, map_location=\"cpu\") │\n",
+ "│ 316 │ │ │ msg = model.load_state_dict(ckpt['model'], strict=False) │\n",
+ "│ 317 │ │ │\n",
+ "│ 318 │ │ return model │\n",
+ "│ │\n",
+ "│ /home/zhud/anaconda3/envs/eye/lib/python3.9/site-packages/torch/serialization.py:791 in load │\n",
+ "│ │\n",
+ "│ 788 │ if 'encoding' not in pickle_load_args.keys(): │\n",
+ "│ 789 │ │ pickle_load_args['encoding'] = 'utf-8' │\n",
+ "│ 790 │ │\n",
+ "│ ❱ 791 │ with _open_file_like(f, 'rb') as opened_file: │\n",
+ "│ 792 │ │ if _is_zipfile(opened_file): │\n",
+ "│ 793 │ │ │ # The zipfile reader is going to advance the current file position. │\n",
+ "│ 794 │ │ │ # If we want to actually tail call to torch.jit.load, we need to │\n",
+ "│ │\n",
+ "│ /home/zhud/anaconda3/envs/eye/lib/python3.9/site-packages/torch/serialization.py:271 in │\n",
+ "│ _open_file_like │\n",
+ "│ │\n",
+ "│ 268 │\n",
+ "│ 269 def _open_file_like(name_or_buffer, mode): │\n",
+ "│ 270 │ if _is_path(name_or_buffer): │\n",
+ "│ ❱ 271 │ │ return _open_file(name_or_buffer, mode) │\n",
+ "│ 272 │ else: │\n",
+ "│ 273 │ │ if 'w' in mode: │\n",
+ "│ 274 │ │ │ return _open_buffer_writer(name_or_buffer) │\n",
+ "│ │\n",
+ "│ /home/zhud/anaconda3/envs/eye/lib/python3.9/site-packages/torch/serialization.py:252 in __init__ │\n",
+ "│ │\n",
+ "│ 249 │\n",
+ "│ 250 class _open_file(_opener): │\n",
+ "│ 251 │ def __init__(self, name, mode): │\n",
+ "│ ❱ 252 │ │ super().__init__(open(name, mode)) │\n",
+ "│ 253 │ │\n",
+ "│ 254 │ def __exit__(self, *args): │\n",
+ "│ 255 │ │ self.file_like.close() │\n",
+ "╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n",
+ "FileNotFoundError: [Errno 2] No such file or directory: \n",
+ "'/home/zhud/project/blip2/lavis/output/BLIP2/Vicuna_pretrain_stage2_cc/20230405233/checkpoint_3.pth'\n",
+ "
\n"
+ ],
+ "text/plain": [
+ "\u001B[31m╭─\u001B[0m\u001B[31m──────────────────────────────\u001B[0m\u001B[31m \u001B[0m\u001B[1;31mTraceback \u001B[0m\u001B[1;2;31m(most recent call last)\u001B[0m\u001B[31m \u001B[0m\u001B[31m───────────────────────────────\u001B[0m\u001B[31m─╮\u001B[0m\n",
+ "\u001B[31m│\u001B[0m in \u001B[92m\u001B[0m:\u001B[94m2\u001B[0m \u001B[31m│\u001B[0m\n",
+ "\u001B[31m│\u001B[0m \u001B[31m│\u001B[0m\n",
+ "\u001B[31m│\u001B[0m \u001B[2m1 \u001B[0mtask = tasks.setup_task(cfg) \u001B[31m│\u001B[0m\n",
+ "\u001B[31m│\u001B[0m \u001B[31m❱ \u001B[0m2 model = task.build_model(cfg) \u001B[31m│\u001B[0m\n",
+ "\u001B[31m│\u001B[0m \u001B[2m3 \u001B[0m \u001B[31m│\u001B[0m\n",
+ "\u001B[31m│\u001B[0m \u001B[31m│\u001B[0m\n",
+ "\u001B[31m│\u001B[0m \u001B[2;33m/home/zhud/project/blip2/lavis/tasks/\u001B[0m\u001B[1;33mbase_task.py\u001B[0m:\u001B[94m33\u001B[0m in \u001B[92mbuild_model\u001B[0m \u001B[31m│\u001B[0m\n",
+ "\u001B[31m│\u001B[0m \u001B[31m│\u001B[0m\n",
+ "\u001B[31m│\u001B[0m \u001B[2m 30 \u001B[0m\u001B[2m│ │ \u001B[0mmodel_config = cfg.model_cfg \u001B[31m│\u001B[0m\n",
+ "\u001B[31m│\u001B[0m \u001B[2m 31 \u001B[0m\u001B[2m│ │ \u001B[0m \u001B[31m│\u001B[0m\n",
+ "\u001B[31m│\u001B[0m \u001B[2m 32 \u001B[0m\u001B[2m│ │ \u001B[0mmodel_cls = registry.get_model_class(model_config.arch) \u001B[31m│\u001B[0m\n",
+ "\u001B[31m│\u001B[0m \u001B[31m❱ \u001B[0m 33 \u001B[2m│ │ \u001B[0m\u001B[94mreturn\u001B[0m model_cls.from_config(model_config) \u001B[31m│\u001B[0m\n",
+ "\u001B[31m│\u001B[0m \u001B[2m 34 \u001B[0m\u001B[2m│ \u001B[0m \u001B[31m│\u001B[0m\n",
+ "\u001B[31m│\u001B[0m \u001B[2m 35 \u001B[0m\u001B[2m│ \u001B[0m\u001B[94mdef\u001B[0m \u001B[92mbuild_datasets\u001B[0m(\u001B[96mself\u001B[0m, cfg): \u001B[31m│\u001B[0m\n",
+ "\u001B[31m│\u001B[0m \u001B[2m 36 \u001B[0m\u001B[2;90m│ │ \u001B[0m\u001B[33m\"\"\"\u001B[0m \u001B[31m│\u001B[0m\n",
+ "\u001B[31m│\u001B[0m \u001B[31m│\u001B[0m\n",
+ "\u001B[31m│\u001B[0m \u001B[2;33m/home/zhud/project/blip2/lavis/models/blip2_models/\u001B[0m\u001B[1;33mblip2_llama.py\u001B[0m:\u001B[94m315\u001B[0m in \u001B[92mfrom_config\u001B[0m \u001B[31m│\u001B[0m\n",
+ "\u001B[31m│\u001B[0m \u001B[31m│\u001B[0m\n",
+ "\u001B[31m│\u001B[0m \u001B[2m312 \u001B[0m\u001B[2m│ │ \u001B[0mckpt_path = cfg.get(\u001B[33m\"\u001B[0m\u001B[33mckpt\u001B[0m\u001B[33m\"\u001B[0m, \u001B[33m\"\u001B[0m\u001B[33m\"\u001B[0m) \u001B[31m│\u001B[0m\n",
+ "\u001B[31m│\u001B[0m \u001B[2m313 \u001B[0m\u001B[2m│ │ \u001B[0m\u001B[94mif\u001B[0m ckpt_path: \u001B[31m│\u001B[0m\n",
+ "\u001B[31m│\u001B[0m \u001B[2m314 \u001B[0m\u001B[2m│ │ │ \u001B[0m\u001B[96mprint\u001B[0m(\u001B[33m\"\u001B[0m\u001B[33mLoad BLIP2-LLM Checkpoint: \u001B[0m\u001B[33m{}\u001B[0m\u001B[33m\"\u001B[0m.format(ckpt_path)) \u001B[31m│\u001B[0m\n",
+ "\u001B[31m│\u001B[0m \u001B[31m❱ \u001B[0m315 \u001B[2m│ │ │ \u001B[0mckpt = torch.load(ckpt_path, map_location=\u001B[33m\"\u001B[0m\u001B[33mcpu\u001B[0m\u001B[33m\"\u001B[0m) \u001B[31m│\u001B[0m\n",
+ "\u001B[31m│\u001B[0m \u001B[2m316 \u001B[0m\u001B[2m│ │ │ \u001B[0mmsg = model.load_state_dict(ckpt[\u001B[33m'\u001B[0m\u001B[33mmodel\u001B[0m\u001B[33m'\u001B[0m], strict=\u001B[94mFalse\u001B[0m) \u001B[31m│\u001B[0m\n",
+ "\u001B[31m│\u001B[0m \u001B[2m317 \u001B[0m\u001B[2m│ │ \u001B[0m \u001B[31m│\u001B[0m\n",
+ "\u001B[31m│\u001B[0m \u001B[2m318 \u001B[0m\u001B[2m│ │ \u001B[0m\u001B[94mreturn\u001B[0m model \u001B[31m│\u001B[0m\n",
+ "\u001B[31m│\u001B[0m \u001B[31m│\u001B[0m\n",
+ "\u001B[31m│\u001B[0m \u001B[2;33m/home/zhud/anaconda3/envs/eye/lib/python3.9/site-packages/torch/\u001B[0m\u001B[1;33mserialization.py\u001B[0m:\u001B[94m791\u001B[0m in \u001B[92mload\u001B[0m \u001B[31m│\u001B[0m\n",
+ "\u001B[31m│\u001B[0m \u001B[31m│\u001B[0m\n",
+ "\u001B[31m│\u001B[0m \u001B[2m 788 \u001B[0m\u001B[2m│ \u001B[0m\u001B[94mif\u001B[0m \u001B[33m'\u001B[0m\u001B[33mencoding\u001B[0m\u001B[33m'\u001B[0m \u001B[95mnot\u001B[0m \u001B[95min\u001B[0m pickle_load_args.keys(): \u001B[31m│\u001B[0m\n",
+ "\u001B[31m│\u001B[0m \u001B[2m 789 \u001B[0m\u001B[2m│ │ \u001B[0mpickle_load_args[\u001B[33m'\u001B[0m\u001B[33mencoding\u001B[0m\u001B[33m'\u001B[0m] = \u001B[33m'\u001B[0m\u001B[33mutf-8\u001B[0m\u001B[33m'\u001B[0m \u001B[31m│\u001B[0m\n",
+ "\u001B[31m│\u001B[0m \u001B[2m 790 \u001B[0m\u001B[2m│ \u001B[0m \u001B[31m│\u001B[0m\n",
+ "\u001B[31m│\u001B[0m \u001B[31m❱ \u001B[0m 791 \u001B[2m│ \u001B[0m\u001B[94mwith\u001B[0m _open_file_like(f, \u001B[33m'\u001B[0m\u001B[33mrb\u001B[0m\u001B[33m'\u001B[0m) \u001B[94mas\u001B[0m opened_file: \u001B[31m│\u001B[0m\n",
+ "\u001B[31m│\u001B[0m \u001B[2m 792 \u001B[0m\u001B[2m│ │ \u001B[0m\u001B[94mif\u001B[0m _is_zipfile(opened_file): \u001B[31m│\u001B[0m\n",
+ "\u001B[31m│\u001B[0m \u001B[2m 793 \u001B[0m\u001B[2m│ │ │ \u001B[0m\u001B[2m# The zipfile reader is going to advance the current file position.\u001B[0m \u001B[31m│\u001B[0m\n",
+ "\u001B[31m│\u001B[0m \u001B[2m 794 \u001B[0m\u001B[2m│ │ │ \u001B[0m\u001B[2m# If we want to actually tail call to torch.jit.load, we need to\u001B[0m \u001B[31m│\u001B[0m\n",
+ "\u001B[31m│\u001B[0m \u001B[31m│\u001B[0m\n",
+ "\u001B[31m│\u001B[0m \u001B[2;33m/home/zhud/anaconda3/envs/eye/lib/python3.9/site-packages/torch/\u001B[0m\u001B[1;33mserialization.py\u001B[0m:\u001B[94m271\u001B[0m in \u001B[31m│\u001B[0m\n",
+ "\u001B[31m│\u001B[0m \u001B[92m_open_file_like\u001B[0m \u001B[31m│\u001B[0m\n",
+ "\u001B[31m│\u001B[0m \u001B[31m│\u001B[0m\n",
+ "\u001B[31m│\u001B[0m \u001B[2m 268 \u001B[0m \u001B[31m│\u001B[0m\n",
+ "\u001B[31m│\u001B[0m \u001B[2m 269 \u001B[0m\u001B[94mdef\u001B[0m \u001B[92m_open_file_like\u001B[0m(name_or_buffer, mode): \u001B[31m│\u001B[0m\n",
+ "\u001B[31m│\u001B[0m \u001B[2m 270 \u001B[0m\u001B[2m│ \u001B[0m\u001B[94mif\u001B[0m _is_path(name_or_buffer): \u001B[31m│\u001B[0m\n",
+ "\u001B[31m│\u001B[0m \u001B[31m❱ \u001B[0m 271 \u001B[2m│ │ \u001B[0m\u001B[94mreturn\u001B[0m _open_file(name_or_buffer, mode) \u001B[31m│\u001B[0m\n",
+ "\u001B[31m│\u001B[0m \u001B[2m 272 \u001B[0m\u001B[2m│ \u001B[0m\u001B[94melse\u001B[0m: \u001B[31m│\u001B[0m\n",
+ "\u001B[31m│\u001B[0m \u001B[2m 273 \u001B[0m\u001B[2m│ │ \u001B[0m\u001B[94mif\u001B[0m \u001B[33m'\u001B[0m\u001B[33mw\u001B[0m\u001B[33m'\u001B[0m \u001B[95min\u001B[0m mode: \u001B[31m│\u001B[0m\n",
+ "\u001B[31m│\u001B[0m \u001B[2m 274 \u001B[0m\u001B[2m│ │ │ \u001B[0m\u001B[94mreturn\u001B[0m _open_buffer_writer(name_or_buffer) \u001B[31m│\u001B[0m\n",
+ "\u001B[31m│\u001B[0m \u001B[31m│\u001B[0m\n",
+ "\u001B[31m│\u001B[0m \u001B[2;33m/home/zhud/anaconda3/envs/eye/lib/python3.9/site-packages/torch/\u001B[0m\u001B[1;33mserialization.py\u001B[0m:\u001B[94m252\u001B[0m in \u001B[92m__init__\u001B[0m \u001B[31m│\u001B[0m\n",
+ "\u001B[31m│\u001B[0m \u001B[31m│\u001B[0m\n",
+ "\u001B[31m│\u001B[0m \u001B[2m 249 \u001B[0m \u001B[31m│\u001B[0m\n",
+ "\u001B[31m│\u001B[0m \u001B[2m 250 \u001B[0m\u001B[94mclass\u001B[0m \u001B[4;92m_open_file\u001B[0m(_opener): \u001B[31m│\u001B[0m\n",
+ "\u001B[31m│\u001B[0m \u001B[2m 251 \u001B[0m\u001B[2m│ \u001B[0m\u001B[94mdef\u001B[0m \u001B[92m__init__\u001B[0m(\u001B[96mself\u001B[0m, name, mode): \u001B[31m│\u001B[0m\n",
+ "\u001B[31m│\u001B[0m \u001B[31m❱ \u001B[0m 252 \u001B[2m│ │ \u001B[0m\u001B[96msuper\u001B[0m().\u001B[92m__init__\u001B[0m(\u001B[96mopen\u001B[0m(name, mode)) \u001B[31m│\u001B[0m\n",
+ "\u001B[31m│\u001B[0m \u001B[2m 253 \u001B[0m\u001B[2m│ \u001B[0m \u001B[31m│\u001B[0m\n",
+ "\u001B[31m│\u001B[0m \u001B[2m 254 \u001B[0m\u001B[2m│ \u001B[0m\u001B[94mdef\u001B[0m \u001B[92m__exit__\u001B[0m(\u001B[96mself\u001B[0m, *args): \u001B[31m│\u001B[0m\n",
+ "\u001B[31m│\u001B[0m \u001B[2m 255 \u001B[0m\u001B[2m│ │ \u001B[0m\u001B[96mself\u001B[0m.file_like.close() \u001B[31m│\u001B[0m\n",
+ "\u001B[31m╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\u001B[0m\n",
+ "\u001B[1;91mFileNotFoundError: \u001B[0m\u001B[1m[\u001B[0mErrno \u001B[1;36m2\u001B[0m\u001B[1m]\u001B[0m No such file or directory: \n",
+ "\u001B[32m'/home/zhud/project/blip2/lavis/output/BLIP2/Vicuna_pretrain_stage2_cc/20230405233/checkpoint_3.pth'\u001B[0m\n"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "task = tasks.setup_task(cfg)\n",
+ "model = task.build_model(cfg)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "id": "ba874036",
+ "metadata": {
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "'/ibex/project/c2133/vicuna'"
+ ]
+ },
+ "execution_count": 9,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": []
+ },
+ {
+ "cell_type": "markdown",
+ "id": "bf1c4e1c",
+ "metadata": {
+ "pycharm": {
+ "name": "#%% md\n"
+ }
+ },
+ "source": [
+ "### Load Checkpoint"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "a2a7f2bd",
+ "metadata": {
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "ckpt_path = '/ibex/project/c2133/vicuna_ckpt_test/Vicuna_prompt_stage2_laion/20230410145/checkpoint_4.pth'\n",
+ "ckpt = torch.load(ckpt_path, map_location=\"cpu\")\n",
+ "msg = model.load_state_dict(ckpt['model'], strict=False)\n",
+ "model = model.to(device)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "035a495f",
+ "metadata": {
+ "pycharm": {
+ "name": "#%% md\n"
+ }
+ },
+ "source": [
+ "### Example of Tokenizer"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 35,
+ "id": "3426ae10",
+ "metadata": {
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "texts = [\"A chat\", \"The assistant gives helpful\"]\n",
+ "\n",
+ "llama_tokens = model.llama_tokenizer(\n",
+ " texts, \n",
+ " return_tensors=\"pt\", \n",
+ " padding=\"longest\",\n",
+ " truncation=True,\n",
+ " max_length=10).to(device)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "id": "376400a4",
+ "metadata": {
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "texts = \"The assistant gives helpful\"\n",
+ "\n",
+ "llama_tokens = model.llama_tokenizer(\n",
+ " texts, \n",
+ " return_tensors=\"pt\", \n",
+ " padding=\"longest\",\n",
+ " truncation=True,\n",
+ " max_length=10).to(device)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "id": "6988ee66",
+ "metadata": {
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "torch.Size([1, 5])"
+ ]
+ },
+ "execution_count": 14,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "llama_tokens.attention_mask.shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "id": "dc9e376d",
+ "metadata": {
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "targets = llama_tokens.input_ids.masked_fill(\n",
+ " llama_tokens.input_ids == model.llama_tokenizer.pad_token_id, -100\n",
+ " )"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "id": "e458fa52",
+ "metadata": {
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "torch.Size([2, 3])"
+ ]
+ },
+ "execution_count": 10,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "torch.ones([targets.shape[0], targets.shape[0]+1]).shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "24607f7a",
+ "metadata": {
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "text = \\\n",
+ "\"### Human: What's your name?\" \\\n",
+ "\"### Assistant: \"\n",
+ "\n",
+ "\n",
+ "llama_tokens = model.llama_tokenizer(\n",
+ " text, \n",
+ " return_tensors=\"pt\", \n",
+ " ).to(device)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "5e69d3e1",
+ "metadata": {
+ "pycharm": {
+ "name": "#%% md\n"
+ }
+ },
+ "source": [
+ "### Example of Emb Input"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 188,
+ "id": "205b092f",
+ "metadata": {
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "\n",
+ "I'm sorry, I am an AI language model and do not have a physical form or a name. My purpose is to assist you with any questions or tasks you may have to the best of my ability. Is there anything specific you would like help with?\n",
+ "###\n"
+ ]
+ }
+ ],
+ "source": [
+ "inputs_embeds = model.llama_model.model.embed_tokens(llama_tokens.input_ids)\n",
+ "outputs = model.llama_model.generate(\n",
+ " inputs_embeds=inputs_embeds,\n",
+ " query_embeds=None,\n",
+ " attention_mask=llama_tokens.attention_mask,\n",
+ " max_new_tokens=500,\n",
+ " stopping_criteria=stopping_criteria,\n",
+ " )\n",
+ "output_text = model.llama_tokenizer.decode(outputs[0])\n",
+ "print(output_text)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 189,
+ "id": "561b42f5",
+ "metadata": {
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "torch.Size([1, 16, 5120])"
+ ]
+ },
+ "execution_count": 189,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "inputs_embeds.shape"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "a1694ad6",
+ "metadata": {
+ "pycharm": {
+ "name": "#%% md\n"
+ }
+ },
+ "source": [
+ "### Example of ID Input"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "c1dc7841",
+ "metadata": {
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "outputs = model.llama_model.generate(\n",
+ " input_ids=llama_tokens.input_ids,\n",
+ " query_embeds=None,\n",
+ " attention_mask=llama_tokens.attention_mask,\n",
+ " max_new_tokens=500,\n",
+ " stopping_criteria=stopping_criteria,\n",
+ " )\n",
+ "output_text = model.llama_tokenizer.decode(outputs[0])\n",
+ "print(output_text)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "19dd1f9d",
+ "metadata": {
+ "pycharm": {
+ "name": "#%% md\n"
+ }
+ },
+ "source": []
+ },
+ {
+ "cell_type": "markdown",
+ "id": "468ac97e",
+ "metadata": {
+ "pycharm": {
+ "name": "#%% md\n"
+ }
+ },
+ "source": [
+ "### Example of Mixed Input"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 47,
+ "id": "4af3a9bf",
+ "metadata": {
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "ckpt_path = '/home/zhud/project/blip2/lavis/output/BLIP2/Vicuna_pretrain_stage2_cc/20230408015/checkpoint_2.pth'\n",
+ "ckpt = torch.load(ckpt_path, map_location=\"cpu\")\n",
+ "msg = model.load_state_dict(ckpt['model'], strict=False)\n",
+ "model = model.to(device)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 48,
+ "id": "c3148611",
+ "metadata": {
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "# Load the image using PIL\n",
+ "image = Image.open('test_img5.jpg').convert('RGB')\n",
+ "image = vis_processor(image).unsqueeze(0).to(device)\n",
+ "inputs_llama, atts_llama = model.encode_img(image)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 53,
+ "id": "07b82707",
+ "metadata": {
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "text = \\\n",
+ "\"A chat between a curious human and an artificial intelligence assistant. \" \\\n",
+ "\"The assistant gives helpful, detailed, and polite answers to the human's questions. \"\\\n",
+ "\"Human may ask questions related to a given image. \" \\\n",
+ "\"The image will be wrapped as IMAGE_CONTENT \" \\\n",
+ "\"### Human: To_Split \" \\\n",
+ "\"### Assistant: Received the image. \" \\\n",
+ "\"### Human: Describe the image in detail. Say everthing you see. Describe all the things.\" \\\n",
+ "\"### Assistant: \"\n",
+ "\n",
+ "\n",
+ "text = \\\n",
+ "\"A chat between a curious human and an artificial intelligence assistant. \" \\\n",
+ "\"The assistant gives helpful, detailed, and polite answers to the human's questions. \"\\\n",
+ "\"Human may ask questions related to a given image. \" \\\n",
+ "\"The image will be wrapped as IMAGE_CONTENT \" \\\n",
+ "\"### Human: Describe the image in detail. Say everthing you see. To_Split \" \\\n",
+ "\"### Assistant: \"\n",
+ "\n",
+ "text = \\\n",
+ "\"### Human: Describe the image in detail. Say everthing you see. To_Split \" \\\n",
+ "\"### Assistant: \"\n",
+ "\n",
+ "\n",
+ "\n",
+ "# text = \\\n",
+ "# \"A chat between a curious human and an artificial intelligence assistant. \" \\\n",
+ "# \"The assistant gives helpful, detailed, and polite answers to the human's questions. \"\\\n",
+ "# \"Human may ask questions related to a given image. \" \\\n",
+ "# \"The image will be wrapped as IMAGE_CONTENT \" \\\n",
+ "# \"### Human: To_Split \" \\\n",
+ "# \"### Assistant: Received the image. \" \\\n",
+ "# \"### Human: This is a draft of a website. Give me the html code to write this website. \" \\\n",
+ "# \"Btw, you need to come up with some jokes in the website to fill the placeholders. \" \\\n",
+ "# \"Also, make the website colorful and vivid. \" \\\n",
+ "# \"### Assistant: \"\n",
+ "\n",
+ "\n",
+ "# text = \\\n",
+ "# \"Return what the human says. \" \\\n",
+ "# \"### Human: There is a big elephant in the sky. \" \\\n",
+ "# \"### Assistant: There is a big elephant in the sky. \" \\\n",
+ "# \"### Human: fdjlks klcznv_l1 \" \\\n",
+ "# \"### Assistant: fdjlks klcznv_l1 \" \\\n",
+ "# \"### Human: To_Split \" \\\n",
+ "# \"### Assistant: \"\n",
+ "\n",
+ "\n",
+ "text_1, text_2 = text.split('To_Split')\n",
+ "\n",
+ "text_1_tokens = model.llama_tokenizer(text_1, return_tensors=\"pt\").to(device)\n",
+ "text_2_tokens = model.llama_tokenizer(text_2, return_tensors=\"pt\", add_special_tokens=False).to(device)\n",
+ "text_1_emb = model.llama_model.model.embed_tokens(text_1_tokens.input_ids)\n",
+ "text_2_emb = model.llama_model.model.embed_tokens(text_2_tokens.input_ids)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 54,
+ "id": "136b9e97",
+ "metadata": {
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "\n",
+ "The image shows a small bird perched on a tree stump, with a camera lens in the background\n",
+ "\n",
+ "The bird is a small bird, with a bright yellow beak and black feathers. It is perched on a tree stump, with its wings spread out and its beak open. The bird is looking to the left, as if it is about to take off.\n",
+ "\n",
+ "The camera lens in the background is a large, black lens with a silver ring around the front. The lens is attached to a camera, which is not visible in the image. The lens is pointed at the bird, with the camera's viewfinder showing the bird in the center of the frame.\n",
+ "\n",
+ "The background of the image is a forest, with trees and foliage visible in the distance. The trees are covered in leaves, and there is a thick layer of mist or fog in the air, which gives the image a dreamy, ethereal quality.\n",
+ "\n",
+ "The lighting in the image is soft and diffused, with the sun shining through the trees and casting a warm, golden light on the bird and the tree stump. The lighting creates deep shadows in the forest, which add to the sense of mystery and wonder in the image.\n",
+ "\n",
+ "The overall effect of the image is one of peacefulness and tranquility, with the bird and the forest creating a sense of calm and serenity. The image is beautifully composed, with the bird and the camera lens creating a visual balance that draws the viewer's eye to the center of the frame.\n",
+ "###\n"
+ ]
+ }
+ ],
+ "source": [
+ "outputs = model.llama_model.generate(\n",
+ " inputs_embeds=torch.concat([text_1_emb, inputs_llama, text_2_emb], dim=1),\n",
+ " query_embeds=None,\n",
+ " attention_mask=torch.concat([text_1_tokens.attention_mask, atts_llama, text_2_tokens.attention_mask], dim=1),\n",
+ " max_new_tokens=600,\n",
+ " stopping_criteria=stopping_criteria,\n",
+ " )\n",
+ "output_text = model.llama_tokenizer.decode(outputs[0])\n",
+ "print(output_text)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 83,
+ "id": "54cc3d4a",
+ "metadata": {
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "with open('lavis/prompts/image_caption.txt', 'r') as f:\n",
+ " prompts = f.read().splitlines()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 92,
+ "id": "f52cd85c",
+ "metadata": {
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "prompt_token = model.llama_tokenizer(prompts, return_tensors=\"pt\", padding=\"longest\",)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 103,
+ "id": "4b0cf1d0",
+ "metadata": {
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[(15, 6), (16, 11), (17, 17), (18, 17), (19, 27), (20, 18), (21, 21), (22, 4), (23, 6), (24, 2)]\n"
+ ]
+ }
+ ],
+ "source": [
+ "\n",
+ "\n",
+ "my_list = prompt_token.attention_mask.sum(1).numpy()\n",
+ "counts = {}\n",
+ "\n",
+ "for element in my_list:\n",
+ " if element in counts:\n",
+ " counts[element] += 1\n",
+ " else:\n",
+ " counts[element] = 1\n",
+ "\n",
+ "print(sorted(counts.items(), key=lambda item: item[0]))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 58,
+ "id": "f7919e93",
+ "metadata": {
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[1, 2, 1, 2, 1, 2]\n"
+ ]
+ }
+ ],
+ "source": [
+ "a,b = [1,1,1], [2,2,2]\n",
+ "c = [i for pair in zip(a,b) for i in pair]\n",
+ "print(c)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "3c64a037",
+ "metadata": {
+ "pycharm": {
+ "name": "#%% md\n"
+ }
+ },
+ "source": [
+ "### Example of Image Input"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 67,
+ "id": "87164578",
+ "metadata": {
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "a bird eating from a bird feeder\n",
+ "\n",
+ "bird feeder, bird feeder, bird feeder, bird feeder, bird feeder, bird feeder, bird\n",
+ "bird feeder, bird feeder, bird feeder, bird feeder, bird feeder, bird feeder, bird\n",
+ "bird feeder, bird feeder, bird feeder, bird feeder, bird feeder, bird feeder, bird\n",
+ "bird feeder, bird feeder, bird feeder\n"
+ ]
+ }
+ ],
+ "source": [
+ "inputs_embeds = model.llama_model.model.embed_tokens(llama_tokens.input_ids)\n",
+ "bos_embeds = model.llama_model.model.embed_tokens(torch.tensor(model.llama_tokenizer.bos_token_id, device=device))[None, None]\n",
+ "outputs = model.llama_model.generate(\n",
+ " inputs_embeds=torch.concat([bos_embeds, inputs_llama], dim=1),\n",
+ " query_embeds=None,\n",
+ " attention_mask=torch.concat([atts_llama[:, :1], atts_llama], dim=1),\n",
+ " max_new_tokens=100,\n",
+ " stopping_criteria=stopping_criteria,\n",
+ " )\n",
+ "output_text = model.llama_tokenizer.decode(outputs[0])\n",
+ "print(output_text)"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "eye",
+ "language": "python",
+ "name": "eye"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.9.16"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
\ No newline at end of file
diff --git a/environment.yml b/environment.yml
new file mode 100644
index 0000000000000000000000000000000000000000..bd7d73e43166b529932197a683047e32cc9b33f4
--- /dev/null
+++ b/environment.yml
@@ -0,0 +1,56 @@
+# Conda environment definition for the `minigpt4` environment.
+name: minigpt4
+channels:
+  - pytorch
+  - defaults
+dependencies:
+  # Packages resolved by conda from the channels above.
+  - python=3.9
+  - pip
+  - pytorch=1.12.1
+  - pytorch-mutex=1.0=cuda
+  - torchaudio=0.12.1
+  - torchvision=0.13.1
+  # Requirements below must stay nested under `pip:` so they are handed to pip
+  # rather than being read as conda package specs.
+  - pip:
+      - accelerate==0.16.0
+      - aiohttp==3.8.4
+      - aiosignal==1.3.1
+      - async-timeout==4.0.2
+      - attrs==22.2.0
+      - bitsandbytes==0.37.0
+      - cchardet==2.1.7
+      - chardet==5.1.0
+      - contourpy==1.0.7
+      - cycler==0.11.0
+      - filelock==3.9.0
+      - fonttools==4.38.0
+      - frozenlist==1.3.3
+      - huggingface-hub==0.12.1
+      - importlib-resources==5.12.0
+      - kiwisolver==1.4.4
+      - matplotlib==3.7.0
+      - multidict==6.0.4
+      - openai==0.27.0
+      - packaging==23.0
+      - psutil==5.9.4
+      - pycocotools==2.0.6
+      - pyparsing==3.0.9
+      - python-dateutil==2.8.2
+      - pyyaml==6.0
+      - regex==2022.10.31
+      - tokenizers==0.13.2
+      - tqdm==4.64.1
+      - transformers==4.28.0
+      - timm==0.6.13
+      - spacy==3.5.1
+      - webdataset==0.2.48
+      - scikit-learn==1.2.2
+      - scipy==1.10.1
+      - yarl==1.8.2
+      - zipp==3.14.0
+      - tenacity==8.2.2
+      # The remaining requirements carry no version specifier.
+      - peft
+      - pycocoevalcap
+      - sentence-transformers
+      - umap-learn
+      - notebook
+      - gradio
+      - wandb
diff --git a/eval_configs/minigpt4.yaml b/eval_configs/minigpt4.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..2cf7e84d15541548a4c72079814703c5c0174660
--- /dev/null
+++ b/eval_configs/minigpt4.yaml
@@ -0,0 +1,30 @@
+ # Copyright (c) 2022, salesforce.com, inc.
+ # All rights reserved.
+ # SPDX-License-Identifier: BSD-3-Clause
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+
+# Model settings; merged over the default config of the selected arch/model_type.
+model:
+  arch: mini_gpt4
+  model_type: pretrain_vicuna
+  freeze_vit: true
+  freeze_qformer: true
+  max_txt_len: 160
+  end_sym: "###"
+  prompt_path: "prompts/alignment.txt"
+  prompt_template: '###Human: {} ###Assistant: '
+  # NOTE(review): absolute, machine-specific path; replace it with the location
+  # of a local checkpoint before running.
+  ckpt: '/ibex/project/c2133/vicuna_ckpt_test/Vicuna_stage3_align/20230412191_laion_ckpt3/checkpoint_1.pth'
+
+
+# Per-dataset processor settings, keyed by dataset name.
+datasets:
+  cc_align:
+    vis_processor:
+      train:
+        name: "blip2_image_eval"
+        image_size: 224
+    text_processor:
+      train:
+        name: "blip_caption"
+
+run:
+  task: image_text_pretrain
+
diff --git a/examples/ad_1.png b/examples/ad_1.png
new file mode 100644
index 0000000000000000000000000000000000000000..d0378e43e9a0e797b2ab32f4d8f6261fa2224408
Binary files /dev/null and b/examples/ad_1.png differ
diff --git a/examples/ad_2.png b/examples/ad_2.png
new file mode 100644
index 0000000000000000000000000000000000000000..674248b723bee885c43a55d85d83ea1c0fa41477
Binary files /dev/null and b/examples/ad_2.png differ
diff --git a/examples/cook_1.png b/examples/cook_1.png
new file mode 100644
index 0000000000000000000000000000000000000000..d8cdb45c98492afd4f975b8626bb590b580616a5
Binary files /dev/null and b/examples/cook_1.png differ
diff --git a/examples/cook_2.png b/examples/cook_2.png
new file mode 100644
index 0000000000000000000000000000000000000000..d08272b3733dda976bfa78733d9ca4eb544fee52
Binary files /dev/null and b/examples/cook_2.png differ
diff --git a/examples/describe_1.png b/examples/describe_1.png
new file mode 100644
index 0000000000000000000000000000000000000000..02f3c92f54749fa354a5f8c617f24301728555b2
Binary files /dev/null and b/examples/describe_1.png differ
diff --git a/examples/describe_2.png b/examples/describe_2.png
new file mode 100644
index 0000000000000000000000000000000000000000..20bf8c7cd86c03f9ed77f95d912057438997277d
Binary files /dev/null and b/examples/describe_2.png differ
diff --git a/examples/fact_1.png b/examples/fact_1.png
new file mode 100644
index 0000000000000000000000000000000000000000..1f7522871916c3d1bd113cfb88c45386d8abda7a
Binary files /dev/null and b/examples/fact_1.png differ
diff --git a/examples/fact_2.png b/examples/fact_2.png
new file mode 100644
index 0000000000000000000000000000000000000000..de6ef53ef7afd72894711a3a5288a24d62c39182
Binary files /dev/null and b/examples/fact_2.png differ
diff --git a/examples/fix_1.png b/examples/fix_1.png
new file mode 100644
index 0000000000000000000000000000000000000000..023cfe6610747868805c70001e14f2b408f3cebb
Binary files /dev/null and b/examples/fix_1.png differ
diff --git a/examples/fix_2.png b/examples/fix_2.png
new file mode 100644
index 0000000000000000000000000000000000000000..f60da5ff9bdef7018e98a92b19c0e59d31acd059
Binary files /dev/null and b/examples/fix_2.png differ
diff --git a/examples/fun_1.png b/examples/fun_1.png
new file mode 100644
index 0000000000000000000000000000000000000000..f720ea603f88019e24dbcb569328c3083c832baf
Binary files /dev/null and b/examples/fun_1.png differ
diff --git a/examples/fun_2.png b/examples/fun_2.png
new file mode 100644
index 0000000000000000000000000000000000000000..1d37a8068feda3f7ecc2d0b22893d26071e68b64
Binary files /dev/null and b/examples/fun_2.png differ
diff --git a/examples/logo_1.png b/examples/logo_1.png
new file mode 100644
index 0000000000000000000000000000000000000000..8bbe438bdc05ce023251045575c6b7e7b04f210f
Binary files /dev/null and b/examples/logo_1.png differ
diff --git a/examples/op_1.png b/examples/op_1.png
new file mode 100644
index 0000000000000000000000000000000000000000..3dbb2ff51ca08f62171f48167bbb97ad604cc4d0
Binary files /dev/null and b/examples/op_1.png differ
diff --git a/examples/op_2.png b/examples/op_2.png
new file mode 100644
index 0000000000000000000000000000000000000000..2cd3e1f8b0326dea14d45bf866b244deb38ef409
Binary files /dev/null and b/examples/op_2.png differ
diff --git a/examples/people_1.png b/examples/people_1.png
new file mode 100644
index 0000000000000000000000000000000000000000..7e95c42c710aef5efe94a52da280fd7451f185d7
Binary files /dev/null and b/examples/people_1.png differ
diff --git a/examples/people_2.png b/examples/people_2.png
new file mode 100644
index 0000000000000000000000000000000000000000..aec6c83b217c96af91668dbc566e05a14238b2a8
Binary files /dev/null and b/examples/people_2.png differ
diff --git a/examples/rhyme_1.png b/examples/rhyme_1.png
new file mode 100644
index 0000000000000000000000000000000000000000..7d133878d8b534867253c7be7b2805faffbd6ad7
Binary files /dev/null and b/examples/rhyme_1.png differ
diff --git a/examples/rhyme_2.png b/examples/rhyme_2.png
new file mode 100644
index 0000000000000000000000000000000000000000..6cf9bf8958302e461dec8b58dd7cbbe2224a8e5c
Binary files /dev/null and b/examples/rhyme_2.png differ
diff --git a/examples/story_1.png b/examples/story_1.png
new file mode 100644
index 0000000000000000000000000000000000000000..3eb6ccb93fb5c866eeb758ba962904e5c3d57875
Binary files /dev/null and b/examples/story_1.png differ
diff --git a/examples/story_2.png b/examples/story_2.png
new file mode 100644
index 0000000000000000000000000000000000000000..9d37142a9ae32f20d47ebd10cf1c395f10b363f7
Binary files /dev/null and b/examples/story_2.png differ
diff --git a/examples/web_1.png b/examples/web_1.png
new file mode 100644
index 0000000000000000000000000000000000000000..8943842c08609713b78d95ab3b5c418995569505
Binary files /dev/null and b/examples/web_1.png differ
diff --git a/examples/wop_1.png b/examples/wop_1.png
new file mode 100644
index 0000000000000000000000000000000000000000..88f37d672bb2dd3dac34caced2ed4bebcfe15412
Binary files /dev/null and b/examples/wop_1.png differ
diff --git a/examples/wop_2.png b/examples/wop_2.png
new file mode 100644
index 0000000000000000000000000000000000000000..8255974176014db0b388617821630bdb438b5e6b
Binary files /dev/null and b/examples/wop_2.png differ
diff --git a/minigpt4/__init__.py b/minigpt4/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..ec06cef0e2e4e39e450746b0f3136776f6bcf143
--- /dev/null
+++ b/minigpt4/__init__.py
@@ -0,0 +1,31 @@
+"""
+ Copyright (c) 2022, salesforce.com, inc.
+ All rights reserved.
+ SPDX-License-Identifier: BSD-3-Clause
+ For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+"""
+
+import os
+import sys
+
+from omegaconf import OmegaConf
+
+from minigpt4.common.registry import registry
+
+from minigpt4.datasets.builders import *
+from minigpt4.models import *
+from minigpt4.processors import *
+from minigpt4.tasks import *
+
+
+# Package bootstrap: runs once on `import minigpt4` and records well-known paths
+# and constants in the shared registry.
+
+# Absolute directory that contains this __init__.py.
+root_dir = os.path.dirname(os.path.abspath(__file__))
+# Defaults shipped with the package; read here only for `env.cache_root` below.
+default_cfg = OmegaConf.load(os.path.join(root_dir, "configs/default.yaml"))
+
+registry.register_path("library_root", root_dir)
+# Parent of the package directory (left un-normalized, i.e. it ends in "..").
+repo_root = os.path.join(root_dir, "..")
+registry.register_path("repo_root", repo_root)
+# Joined onto repo_root; os.path.join discards repo_root if the configured
+# value is itself an absolute path.
+cache_root = os.path.join(repo_root, default_cfg.env.cache_root)
+registry.register_path("cache_root", cache_root)
+
+# Shared constants looked up through the registry elsewhere in the project.
+registry.register("MAX_INT", sys.maxsize)
+registry.register("SPLIT_NAMES", ["train", "val", "test"])
diff --git a/minigpt4/common/__init__.py b/minigpt4/common/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/minigpt4/common/config.py b/minigpt4/common/config.py
new file mode 100644
index 0000000000000000000000000000000000000000..32f58a6ba471fb448d71302f746ea82593a0b52a
--- /dev/null
+++ b/minigpt4/common/config.py
@@ -0,0 +1,468 @@
+"""
+ Copyright (c) 2022, salesforce.com, inc.
+ All rights reserved.
+ SPDX-License-Identifier: BSD-3-Clause
+ For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+"""
+
+import logging
+import json
+from typing import Dict
+
+from omegaconf import OmegaConf
+from minigpt4.common.registry import registry
+
+
+class Config:
+ def __init__(self, args):
+ self.config = {}
+
+ self.args = args
+
+ # Register the config and configuration for setup
+ registry.register("configuration", self)
+
+ user_config = self._build_opt_list(self.args.options)
+
+ config = OmegaConf.load(self.args.cfg_path)
+
+ runner_config = self.build_runner_config(config)
+ model_config = self.build_model_config(config, **user_config)
+ dataset_config = self.build_dataset_config(config)
+
+ # Validate the user-provided runner configuration
+ # model and dataset configuration are supposed to be validated by the respective classes
+ # [TODO] validate the model/dataset configuration
+ # self._validate_runner_config(runner_config)
+
+ # Override the default configuration with user options.
+ self.config = OmegaConf.merge(
+ runner_config, model_config, dataset_config, user_config
+ )
+
+ def _validate_runner_config(self, runner_config):
+ """
+ This method validates the configuration, such that
+ 1) all the user specified options are valid;
+ 2) no type mismatches between the user specified options and the config.
+ """
+ runner_config_validator = create_runner_config_validator()
+ runner_config_validator.validate(runner_config)
+
+ def _build_opt_list(self, opts):
+ opts_dot_list = self._convert_to_dot_list(opts)
+ return OmegaConf.from_dotlist(opts_dot_list)
+
+ @staticmethod
+ def build_model_config(config, **kwargs):
+ model = config.get("model", None)
+ assert model is not None, "Missing model configuration file."
+
+ model_cls = registry.get_model_class(model.arch)
+ assert model_cls is not None, f"Model '{model.arch}' has not been registered."
+
+ model_type = kwargs.get("model.model_type", None)
+ if not model_type:
+ model_type = model.get("model_type", None)
+ # else use the model type selected by user.
+
+ assert model_type is not None, "Missing model_type."
+
+ model_config_path = model_cls.default_config_path(model_type=model_type)
+
+ model_config = OmegaConf.create()
+ # hiararchy override, customized config > default config
+ model_config = OmegaConf.merge(
+ model_config,
+ OmegaConf.load(model_config_path),
+ {"model": config["model"]},
+ )
+
+ return model_config
+
+ @staticmethod
+ def build_runner_config(config):
+ return {"run": config.run}
+
+ @staticmethod
+ def build_dataset_config(config):
+ datasets = config.get("datasets", None)
+ if datasets is None:
+ raise KeyError(
+ "Expecting 'datasets' as the root key for dataset configuration."
+ )
+
+ dataset_config = OmegaConf.create()
+
+ for dataset_name in datasets:
+ builder_cls = registry.get_builder_class(dataset_name)
+
+ dataset_config_type = datasets[dataset_name].get("type", "default")
+ dataset_config_path = builder_cls.default_config_path(
+ type=dataset_config_type
+ )
+
+ # hiararchy override, customized config > default config
+ dataset_config = OmegaConf.merge(
+ dataset_config,
+ OmegaConf.load(dataset_config_path),
+ {"datasets": {dataset_name: config["datasets"][dataset_name]}},
+ )
+
+ return dataset_config
+
+ def _convert_to_dot_list(self, opts):
+ if opts is None:
+ opts = []
+
+ if len(opts) == 0:
+ return opts
+
+ has_equal = opts[0].find("=") != -1
+
+ if has_equal:
+ return opts
+
+ return [(opt + "=" + value) for opt, value in zip(opts[0::2], opts[1::2])]
+
+ def get_config(self):
+ return self.config
+
+ @property
+ def run_cfg(self):
+ return self.config.run
+
+ @property
+ def datasets_cfg(self):
+ return self.config.datasets
+
+ @property
+ def model_cfg(self):
+ return self.config.model
+
+ def pretty_print(self):
+ logging.info("\n===== Running Parameters =====")
+ logging.info(self._convert_node_to_json(self.config.run))
+
+ logging.info("\n====== Dataset Attributes ======")
+ datasets = self.config.datasets
+
+ for dataset in datasets:
+ if dataset in self.config.datasets:
+ logging.info(f"\n======== {dataset} =======")
+ dataset_config = self.config.datasets[dataset]
+ logging.info(self._convert_node_to_json(dataset_config))
+ else:
+ logging.warning(f"No dataset named '{dataset}' in config. Skipping")
+
+ logging.info(f"\n====== Model Attributes ======")
+ logging.info(self._convert_node_to_json(self.config.model))
+
+ def _convert_node_to_json(self, node):
+ container = OmegaConf.to_container(node, resolve=True)
+ return json.dumps(container, indent=4, sort_keys=True)
+
+ def to_dict(self):
+ return OmegaConf.to_container(self.config)
+
+
+def node_to_dict(node):
+ return OmegaConf.to_container(node)
+
+
+ class ConfigValidator:
+     """
+     Helper that checks a dict-like configuration (typically loaded from a yaml
+     file) against a set of declared arguments.
+
+     This is a preliminary implementation to centralize and validate the
+     configuration. May be altered in the future.
+
+     It serves the following purposes:
+         1. Ensure every option in the config was declared via ``add_argument``;
+            fail otherwise.
+         2. Raise an error when a value cannot be converted to its declared type.
+         3. Provide a central place to store and display help messages for the
+            supported options.
+     """
+
+     class _Argument:
+         """Declaration of a single option: name, allowed choices, type and help."""
+
+         def __init__(self, name, choices=None, type=None, help=None):
+             self.name = name
+             # Converted value, filled in by ConfigValidator.validate().
+             self.val = None
+             self.choices = choices
+             self.type = type
+             self.help = help
+
+         def __str__(self):
+             s = f"{self.name}={self.val}"
+             if self.type is not None:
+                 s += f", ({self.type})"
+             if self.choices is not None:
+                 s += f", choices: {self.choices}"
+             if self.help is not None:
+                 s += f", ({self.help})"
+             return s
+
+     def __init__(self, description):
+         self.description = description
+
+         # Maps argument name -> _Argument declaration.
+         self.arguments = dict()
+
+         # Last config that passed validate(); None until then.
+         self.parsed_args = None
+
+     def __getitem__(self, key):
+         assert self.parsed_args is not None, "No arguments parsed yet."
+
+         return self.parsed_args[key]
+
+     def __str__(self) -> str:
+         return self.format_help()
+
+     def add_argument(self, *args, **kwargs):
+         """
+         Declare a supported option.
+
+         The first positional argument is the option name; the remaining
+         arguments are forwarded to ``_Argument`` (choices, type, help).
+         """
+         self.arguments[args[0]] = self._Argument(*args, **kwargs)
+
+     def validate(self, config=None):
+         """
+         Check every key/value pair of ``config`` against the declared arguments.
+
+         Args:
+             config: dict-like object exposing ``items()``. ``None`` is treated
+                 as "nothing to validate".
+
+         Returns:
+             The ``config`` object that was passed in, unchanged.
+
+         Raises:
+             AssertionError: if a key was never declared, or a value is not
+                 among the declared choices.
+             ValueError: if a value cannot be converted to the declared type.
+         """
+         if config is None:
+             return config
+
+         for k, v in config.items():
+             assert (
+                 k in self.arguments
+             ), f"""{k} is not a valid argument. Support arguments are {self.format_arguments()}."""
+
+             argument = self.arguments[k]
+
+             if argument.type is not None:
+                 try:
+                     argument.val = argument.type(v)
+                 except (TypeError, ValueError) as err:
+                     # Converters raise TypeError as well as ValueError
+                     # (e.g. int(None)); report both the same way.
+                     raise ValueError(
+                         f"{k} is not a valid {argument.type}."
+                     ) from err
+
+             if argument.choices is not None:
+                 assert (
+                     v in argument.choices
+                 ), f"""{k} must be one of {argument.choices}."""
+
+         # Remember the validated config so that validator[key] works.
+         self.parsed_args = config
+
+         return config
+
+     def format_arguments(self):
+         """Return the sorted list of declared argument names as a string."""
+         return str([f"{k}" for k in sorted(self.arguments.keys())])
+
+     def format_help(self):
+         """Return the description followed by the available argument names."""
+         help_msg = str(self.description)
+         return help_msg + ", available arguments: " + self.format_arguments()
+
+     def print_help(self):
+         """Print the help message to stdout."""
+         print(self.format_help())
+
+
+ def create_runner_config_validator():
+     """
+     Build the ConfigValidator that declares every option accepted in the
+     runner section of a configuration.
+
+     Returns:
+         ConfigValidator: validator with all runner options registered. The
+         choices for ``lr_sched`` and ``task`` are read from the registry at
+         call time.
+     """
+     validator = ConfigValidator(description="Runner configurations")
+
+     validator.add_argument(
+         "runner",
+         type=str,
+         choices=["runner_base", "runner_iter"],
+         help="""Runner to use. The "runner_base" uses epoch-based training while iter-based
+             runner runs based on iters. Default: runner_base""",
+     )
+     # add arguments for training dataset ratios
+     validator.add_argument(
+         "train_dataset_ratios",
+         # A plain dict is used as the converter: the typing alias
+         # Dict[str, float] cannot be called and raises TypeError.
+         type=dict,
+         help="""Ratios of training dataset. This is used in iteration-based runner.
+         Do not support for epoch-based runner because how to define an epoch becomes tricky.
+         Default: None""",
+     )
+     validator.add_argument(
+         "max_iters",
+         type=float,
+         help="Maximum number of iterations to run.",
+     )
+     validator.add_argument(
+         "max_epoch",
+         type=int,
+         help="Maximum number of epochs to run.",
+     )
+     # add arguments for iters_per_inner_epoch
+     validator.add_argument(
+         "iters_per_inner_epoch",
+         type=float,
+         help="Number of iterations per inner epoch. This is required when runner is runner_iter.",
+     )
+     lr_scheds_choices = registry.list_lr_schedulers()
+     validator.add_argument(
+         "lr_sched",
+         type=str,
+         choices=lr_scheds_choices,
+         help="Learning rate scheduler to use, from {}".format(lr_scheds_choices),
+     )
+     task_choices = registry.list_tasks()
+     validator.add_argument(
+         "task",
+         type=str,
+         choices=task_choices,
+         help="Task to use, from {}".format(task_choices),
+     )
+     # add arguments for init_lr
+     validator.add_argument(
+         "init_lr",
+         type=float,
+         help="Initial learning rate. This will be the learning rate after warmup and before decay.",
+     )
+     # add arguments for min_lr
+     validator.add_argument(
+         "min_lr",
+         type=float,
+         help="Minimum learning rate (after decay).",
+     )
+     # add arguments for warmup_lr
+     validator.add_argument(
+         "warmup_lr",
+         type=float,
+         help="Starting learning rate for warmup.",
+     )
+     # add arguments for learning rate decay rate
+     validator.add_argument(
+         "lr_decay_rate",
+         type=float,
+         help="Learning rate decay rate. Required if using a decaying learning rate scheduler.",
+     )
+     # add arguments for weight decay
+     validator.add_argument(
+         "weight_decay",
+         type=float,
+         help="Weight decay rate.",
+     )
+     # add arguments for training batch size
+     validator.add_argument(
+         "batch_size_train",
+         type=int,
+         help="Training batch size.",
+     )
+     # add arguments for evaluation batch size
+     validator.add_argument(
+         "batch_size_eval",
+         type=int,
+         help="Evaluation batch size, including validation and testing.",
+     )
+     # add arguments for number of workers for data loading
+     validator.add_argument(
+         "num_workers",
+         help="Number of workers for data loading.",
+     )
+     # add arguments for warm up steps
+     validator.add_argument(
+         "warmup_steps",
+         type=int,
+         help="Number of warmup steps. Required if a warmup schedule is used.",
+     )
+     # add arguments for random seed
+     validator.add_argument(
+         "seed",
+         type=int,
+         help="Random seed.",
+     )
+     # add arguments for output directory
+     validator.add_argument(
+         "output_dir",
+         type=str,
+         help="Output directory to save checkpoints and logs.",
+     )
+     # add arguments for whether only use evaluation
+     validator.add_argument(
+         "evaluate",
+         help="Whether to only evaluate the model. If true, training will not be performed.",
+     )
+     # add arguments for splits used for training, e.g. ["train", "val"]
+     validator.add_argument(
+         "train_splits",
+         type=list,
+         help="Splits to use for training.",
+     )
+     # add arguments for splits used for validation, e.g. ["val"]
+     validator.add_argument(
+         "valid_splits",
+         type=list,
+         help="Splits to use for validation. If not provided, will skip the validation.",
+     )
+     # add arguments for splits used for testing, e.g. ["test"]
+     validator.add_argument(
+         "test_splits",
+         type=list,
+         help="Splits to use for testing. If not provided, will skip the testing.",
+     )
+     # add arguments for accumulating gradient for iterations
+     validator.add_argument(
+         "accum_grad_iters",
+         type=int,
+         help="Number of iterations to accumulate gradient for.",
+     )
+
+     # ====== distributed training ======
+     validator.add_argument(
+         "device",
+         type=str,
+         choices=["cpu", "cuda"],
+         help="Device to use. Support 'cuda' or 'cpu' as for now.",
+     )
+     validator.add_argument(
+         "world_size",
+         type=int,
+         help="Number of processes participating in the job.",
+     )
+     validator.add_argument("dist_url", type=str)
+     validator.add_argument("distributed", type=bool)
+     # add arguments to opt using distributed sampler during evaluation or not
+     validator.add_argument(
+         "use_dist_eval_sampler",
+         type=bool,
+         help="Whether to use distributed sampler during evaluation or not.",
+     )
+
+     # ====== task specific ======
+     # generation task specific arguments
+     # add arguments for maximal length of text output
+     validator.add_argument(
+         "max_len",
+         type=int,
+         help="Maximal length of text output.",
+     )
+     # add arguments for minimal length of text output
+     validator.add_argument(
+         "min_len",
+         type=int,
+         help="Minimal length of text output.",
+     )
+     # add arguments number of beams
+     validator.add_argument(
+         "num_beams",
+         type=int,
+         help="Number of beams used for beam search.",
+     )
+
+     # vqa task specific arguments
+     # add arguments for number of answer candidates
+     validator.add_argument(
+         "num_ans_candidates",
+         type=int,
+         help="""For ALBEF and BLIP, these models first rank answers according to likelihood to select answer candidates.""",
+     )
+     # add arguments for inference method
+     validator.add_argument(
+         "inference_method",
+         type=str,
+         # "generate" was previously misspelled, which rejected the valid value.
+         choices=["generate", "rank"],
+         help="""Inference method to use for question answering. If rank, requires a answer list.""",
+     )
+
+     # ====== model specific ======
+     validator.add_argument(
+         "k_test",
+         type=int,
+         help="Number of top k most similar samples from ITC/VTC selection to be tested.",
+     )
+
+     return validator
diff --git a/minigpt4/common/dist_utils.py b/minigpt4/common/dist_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..296a3c86f29c6e82fa8f1108c7dd9fa7d3e9ce45
--- /dev/null
+++ b/minigpt4/common/dist_utils.py
@@ -0,0 +1,137 @@
+"""
+ Copyright (c) 2022, salesforce.com, inc.
+ All rights reserved.
+ SPDX-License-Identifier: BSD-3-Clause
+ For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+"""
+
+import datetime
+import functools
+import os
+
+import torch
+import torch.distributed as dist
+import timm.models.hub as timm_hub
+
+
+ def setup_for_distributed(is_master):
+     """
+     Replace the builtin ``print`` so that only the master process prints.
+
+     Any process can still print by passing ``force=True`` to ``print``.
+     """
+     import builtins
+
+     original_print = builtins.print
+
+     def print(*args, **kwargs):
+         forced = kwargs.pop("force", False)
+         if not (is_master or forced):
+             return
+         original_print(*args, **kwargs)
+
+     builtins.print = print
+
+
+ def is_dist_avail_and_initialized():
+     """Return True only when torch.distributed is both available and initialized."""
+     return bool(dist.is_available() and dist.is_initialized())
+
+
+ def get_world_size():
+     """Return the distributed world size, or 1 when not running distributed."""
+     return dist.get_world_size() if is_dist_avail_and_initialized() else 1
+
+
+ def get_rank():
+     """Return this process's distributed rank, or 0 when not running distributed."""
+     return dist.get_rank() if is_dist_avail_and_initialized() else 0
+
+
+ def is_main_process():
+     """Return True when the current process has rank 0."""
+     current_rank = get_rank()
+     return current_rank == 0
+
+
+ def init_distributed_mode(args):
+ """
+ Initialize torch.distributed for this process from environment variables.
+
+ Writes ``rank``, ``gpu``, ``distributed`` and ``dist_backend`` onto ``args``
+ (plus ``world_size`` in the RANK/WORLD_SIZE case). When neither set of
+ environment variables is present, sets ``args.distributed = False`` and
+ returns without initializing anything.
+
+ Args:
+ args: mutable object; ``args.dist_url`` and ``args.world_size`` are
+ read when the process group is created.
+ """
+ # Case 1: RANK and WORLD_SIZE are set in the environment. LOCAL_RANK is read
+ # without a membership check, so a missing LOCAL_RANK raises KeyError.
+ if "RANK" in os.environ and "WORLD_SIZE" in os.environ:
+ args.rank = int(os.environ["RANK"])
+ args.world_size = int(os.environ["WORLD_SIZE"])
+ args.gpu = int(os.environ["LOCAL_RANK"])
+ # Case 2: SLURM_PROCID is set. world_size is not assigned in this branch, so
+ # args.world_size must already be set by the caller (it is read below).
+ elif "SLURM_PROCID" in os.environ:
+ args.rank = int(os.environ["SLURM_PROCID"])
+ args.gpu = args.rank % torch.cuda.device_count()
+ else:
+ print("Not using distributed mode")
+ args.distributed = False
+ return
+
+ args.distributed = True
+
+ # Bind this process to its GPU before creating the process group.
+ torch.cuda.set_device(args.gpu)
+ args.dist_backend = "nccl"
+ print(
+ "| distributed init (rank {}, world {}): {}".format(
+ args.rank, args.world_size, args.dist_url
+ ),
+ flush=True,
+ )
+ torch.distributed.init_process_group(
+ backend=args.dist_backend,
+ init_method=args.dist_url,
+ world_size=args.world_size,
+ rank=args.rank,
+ timeout=datetime.timedelta(
+ days=365
+ ), # allow auto-downloading and de-compressing
+ )
+ # Wait until every process has joined, then silence print on non-zero ranks.
+ torch.distributed.barrier()
+ setup_for_distributed(args.rank == 0)
+
+
+ def get_dist_info():
+     """
+     Return ``(rank, world_size)`` for the current process.
+
+     Falls back to ``(0, 1)`` when torch.distributed is unavailable or has not
+     been initialized (non-distributed training).
+     """
+     if torch.__version__ < "1.0":
+         initialized = dist._initialized
+     else:
+         # Check availability first: builds without distributed support do not
+         # expose is_initialized(), so calling it directly would fail.
+         initialized = dist.is_available() and dist.is_initialized()
+     if initialized:
+         rank = dist.get_rank()
+         world_size = dist.get_world_size()
+     else:  # non-distributed training
+         rank = 0
+         world_size = 1
+     return rank, world_size
+
+
+ def main_process(func):
+     """
+     Decorator that runs ``func`` only on rank 0.
+
+     On every other rank the wrapped call does nothing and returns ``None``.
+     """
+
+     @functools.wraps(func)
+     def wrapper(*args, **kwargs):
+         current_rank = get_dist_info()[0]
+         if current_rank != 0:
+             return None
+         return func(*args, **kwargs)
+
+     return wrapper
+
+
+ def download_cached_file(url, check_hash=True, progress=False):
+     """
+     Download a file from a URL into the timm cache and return its local path.
+     If the file already exists, it is not downloaded again.
+     If distributed, only the main process downloads the file, and the other
+     processes wait at a barrier until the download has finished.
+     """
+     if is_main_process():
+         timm_hub.download_cached_file(url, check_hash, progress)
+
+     if is_dist_avail_and_initialized():
+         dist.barrier()
+
+     # a hack to sync the file path across processes: every rank derives the
+     # same path from the URL instead of receiving it from the main process
+     url_path = torch.hub.urlparse(url).path
+     cached_file = os.path.join(timm_hub.get_cache_dir(), os.path.basename(url_path))
+     return cached_file
diff --git a/minigpt4/common/gradcam.py b/minigpt4/common/gradcam.py
new file mode 100644
index 0000000000000000000000000000000000000000..d53a5254d4b319eaf2cbfbd081b0ca8e38c5c7a0
--- /dev/null
+++ b/minigpt4/common/gradcam.py
@@ -0,0 +1,24 @@
+import numpy as np
+from matplotlib import pyplot as plt
+from scipy.ndimage import filters
+from skimage import transform as skimage_transform
+
+
+ def getAttMap(img, attMap, blur=True, overlap=True):
+     """
+     Normalize an attention map to [0, 1], resize it to the image size and
+     optionally blend it onto the image as a "jet" heat map.
+
+     Args:
+         img: image array; ``img.shape[:2]`` is used as the target size.
+             # assumes an H x W x 3 float image when overlap=True — TODO confirm
+         attMap: 2-D attention map. The caller's array is not modified.
+         blur: if True, apply a Gaussian blur with sigma = 0.02 * max(H, W)
+             and re-normalize.
+         overlap: if True, return the blend of image and heat map; otherwise
+             return the normalized attention map itself.
+
+     Returns:
+         The blended image when ``overlap`` is True, else the normalized map.
+     """
+     # Build a new array instead of using "-=", which would modify the
+     # caller's array in place.
+     attMap = attMap - attMap.min()
+     peak = attMap.max()
+     if peak > 0:
+         attMap = attMap / peak
+     attMap = skimage_transform.resize(attMap, (img.shape[:2]), order=3, mode="constant")
+     if blur:
+         attMap = filters.gaussian_filter(attMap, 0.02 * max(img.shape[:2]))
+         attMap -= attMap.min()
+         # A constant map has max 0 after the shift; skip the division so the
+         # result stays all-zero instead of becoming NaN.
+         peak = attMap.max()
+         if peak > 0:
+             attMap /= peak
+     cmap = plt.get_cmap("jet")
+     attMapV = cmap(attMap)
+     # Drop the alpha channel returned by the colormap.
+     attMapV = np.delete(attMapV, 3, 2)
+     if overlap:
+         attMap = (
+             1 * (1 - attMap**0.7).reshape(attMap.shape + (1,)) * img
+             + (attMap**0.7).reshape(attMap.shape + (1,)) * attMapV
+         )
+     return attMap
diff --git a/minigpt4/common/logger.py b/minigpt4/common/logger.py
new file mode 100644
index 0000000000000000000000000000000000000000..6ccc100be114951acb6b97296244b939265400cf
--- /dev/null
+++ b/minigpt4/common/logger.py
@@ -0,0 +1,195 @@
+"""
+ Copyright (c) 2022, salesforce.com, inc.
+ All rights reserved.
+ SPDX-License-Identifier: BSD-3-Clause
+ For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+"""
+
+import datetime
+import logging
+import time
+from collections import defaultdict, deque
+
+import torch
+import torch.distributed as dist
+
+from minigpt4.common import dist_utils
+
+
+ class SmoothedValue(object):
+     """Keep the most recent values in a fixed-size window together with
+     running totals, exposing window statistics and the global average.
+     """
+
+     def __init__(self, window_size=20, fmt=None):
+         self.deque = deque(maxlen=window_size)
+         self.total = 0.0
+         self.count = 0
+         self.fmt = "{median:.4f} ({global_avg:.4f})" if fmt is None else fmt
+
+     def update(self, value, n=1):
+         """Record ``value`` once in the window and ``n`` times in the totals."""
+         self.deque.append(value)
+         self.count += n
+         self.total += value * n
+
+     def synchronize_between_processes(self):
+         """
+         Sum ``count`` and ``total`` across all processes.
+
+         Warning: does not synchronize the deque!
+         """
+         if dist_utils.is_dist_avail_and_initialized():
+             stats = torch.tensor(
+                 [self.count, self.total], dtype=torch.float64, device="cuda"
+             )
+             dist.barrier()
+             dist.all_reduce(stats)
+             synced_count, synced_total = stats.tolist()
+             self.count = int(synced_count)
+             self.total = synced_total
+
+     @property
+     def median(self):
+         """Median of the values currently in the window."""
+         return torch.tensor(list(self.deque)).median().item()
+
+     @property
+     def avg(self):
+         """Mean of the values currently in the window."""
+         window = torch.tensor(list(self.deque), dtype=torch.float32)
+         return window.mean().item()
+
+     @property
+     def global_avg(self):
+         """Average over everything ever recorded (total / count)."""
+         return self.total / self.count
+
+     @property
+     def max(self):
+         """Largest value currently in the window."""
+         return max(self.deque)
+
+     @property
+     def value(self):
+         """Most recently recorded value."""
+         return self.deque[-1]
+
+     def __str__(self):
+         stats = dict(
+             median=self.median,
+             avg=self.avg,
+             global_avg=self.global_avg,
+             max=self.max,
+             value=self.value,
+         )
+         return self.fmt.format(**stats)
+
+
+ class MetricLogger(object):
+     """
+     Collection of named SmoothedValue meters with periodic progress printing.
+
+     Meters are created on first use and can also be read as attributes
+     (``logger.loss``).
+     """
+
+     def __init__(self, delimiter="\t"):
+         self.meters = defaultdict(SmoothedValue)
+         self.delimiter = delimiter
+
+     def update(self, **kwargs):
+         """Record one value per keyword; tensors are converted with ``.item()``."""
+         for k, v in kwargs.items():
+             if isinstance(v, torch.Tensor):
+                 v = v.item()
+             assert isinstance(v, (float, int))
+             self.meters[k].update(v)
+
+     def __getattr__(self, attr):
+         # Only reached when normal attribute lookup failed. Read "meters" via
+         # __dict__ so that an instance without it (e.g. while being copied or
+         # unpickled) raises AttributeError instead of recursing forever.
+         meters = self.__dict__.get("meters", {})
+         if attr in meters:
+             return meters[attr]
+         raise AttributeError(
+             "'{}' object has no attribute '{}'".format(type(self).__name__, attr)
+         )
+
+     def __str__(self):
+         loss_str = []
+         for name, meter in self.meters.items():
+             loss_str.append("{}: {}".format(name, str(meter)))
+         return self.delimiter.join(loss_str)
+
+     def global_avg(self):
+         """Return "name: global average" for every meter, joined by the delimiter."""
+         loss_str = []
+         for name, meter in self.meters.items():
+             loss_str.append("{}: {:.4f}".format(name, meter.global_avg))
+         return self.delimiter.join(loss_str)
+
+     def synchronize_between_processes(self):
+         for meter in self.meters.values():
+             meter.synchronize_between_processes()
+
+     def add_meter(self, name, meter):
+         self.meters[name] = meter
+
+     def log_every(self, iterable, print_freq, header=None):
+         """
+         Yield the items of ``iterable`` while printing progress.
+
+         A status line is printed every ``print_freq`` items and for the last
+         item, followed by a total-time summary once the iterable is exhausted.
+
+         Args:
+             iterable: sized iterable (``len()`` is required).
+             print_freq: number of items between two status lines.
+             header: optional prefix for every printed line.
+         """
+         i = 0
+         if not header:
+             header = ""
+         num_items = len(iterable)
+         use_cuda = torch.cuda.is_available()
+         start_time = time.time()
+         end = time.time()
+         iter_time = SmoothedValue(fmt="{avg:.4f}")
+         data_time = SmoothedValue(fmt="{avg:.4f}")
+         space_fmt = ":" + str(len(str(num_items))) + "d"
+         log_msg = [
+             header,
+             "[{0" + space_fmt + "}/{1}]",
+             "eta: {eta}",
+             "{meters}",
+             "time: {time}",
+             "data: {data}",
+         ]
+         if use_cuda:
+             log_msg.append("max mem: {memory:.0f}")
+         log_msg = self.delimiter.join(log_msg)
+         MB = 1024.0 * 1024.0
+         for obj in iterable:
+             data_time.update(time.time() - end)
+             yield obj
+             iter_time.update(time.time() - end)
+             if i % print_freq == 0 or i == num_items - 1:
+                 eta_seconds = iter_time.global_avg * (num_items - i)
+                 eta_string = str(datetime.timedelta(seconds=int(eta_seconds)))
+                 fields = dict(
+                     eta=eta_string,
+                     meters=str(self),
+                     time=str(iter_time),
+                     data=str(data_time),
+                 )
+                 if use_cuda:
+                     fields["memory"] = torch.cuda.max_memory_allocated() / MB
+                 print(log_msg.format(i, num_items, **fields))
+             i += 1
+             end = time.time()
+         total_time = time.time() - start_time
+         total_time_str = str(datetime.timedelta(seconds=int(total_time)))
+         print(
+             "{} Total time: {} ({:.4f} s / it)".format(
+                 # max(..., 1) avoids a ZeroDivisionError on an empty iterable
+                 header, total_time_str, total_time / max(num_items, 1)
+             )
+         )
+
+
+ class AttrDict(dict):
+     """dict whose keys can also be read and written as attributes."""
+
+     def __init__(self, *args, **kwargs):
+         dict.__init__(self, *args, **kwargs)
+         # Make the attribute namespace the dict itself.
+         self.__dict__ = self
+
+
+ def setup_logger():
+     """
+     Configure root logging to a stream handler: INFO on the main process,
+     WARN on every other process.
+     """
+     if dist_utils.is_main_process():
+         verbosity = logging.INFO
+     else:
+         verbosity = logging.WARN
+
+     logging.basicConfig(
+         level=verbosity,
+         format="%(asctime)s [%(levelname)s] %(message)s",
+         handlers=[logging.StreamHandler()],
+     )
diff --git a/minigpt4/common/optims.py b/minigpt4/common/optims.py
new file mode 100644
index 0000000000000000000000000000000000000000..fd10715172cec4bc9980690ab3ec07cfe9117b38
--- /dev/null
+++ b/minigpt4/common/optims.py
@@ -0,0 +1,119 @@
+"""
+ Copyright (c) 2022, salesforce.com, inc.
+ All rights reserved.
+ SPDX-License-Identifier: BSD-3-Clause
+ For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+"""
+
+import math
+
+from minigpt4.common.registry import registry
+
+
+ @registry.register_lr_scheduler("linear_warmup_step_lr")
+ class LinearWarmupStepLRScheduler:
+     """
+     Linear warmup during epoch 0, then per-epoch exponential step decay
+     (``init_lr * decay_rate**epoch``, floored at ``min_lr``).
+     """
+
+     def __init__(
+         self,
+         optimizer,
+         max_epoch,
+         min_lr,
+         init_lr,
+         decay_rate=1,
+         warmup_start_lr=-1,
+         warmup_steps=0,
+         **kwargs
+     ):
+         self.optimizer = optimizer
+         self.max_epoch = max_epoch
+
+         self.init_lr = init_lr
+         self.min_lr = min_lr
+         self.decay_rate = decay_rate
+
+         self.warmup_steps = warmup_steps
+         # A negative warmup_start_lr means "start the warmup at init_lr".
+         if warmup_start_lr >= 0:
+             self.warmup_start_lr = warmup_start_lr
+         else:
+             self.warmup_start_lr = init_lr
+
+     def step(self, cur_epoch, cur_step):
+         """Set the optimizer's learning rate for the given epoch and step."""
+         if cur_epoch != 0:
+             step_lr_schedule(
+                 epoch=cur_epoch,
+                 optimizer=self.optimizer,
+                 init_lr=self.init_lr,
+                 min_lr=self.min_lr,
+                 decay_rate=self.decay_rate,
+             )
+             return
+
+         warmup_lr_schedule(
+             step=cur_step,
+             optimizer=self.optimizer,
+             max_step=self.warmup_steps,
+             init_lr=self.warmup_start_lr,
+             max_lr=self.init_lr,
+         )
+
+
+ @registry.register_lr_scheduler("linear_warmup_cosine_lr")
+ class LinearWarmupCosineLRScheduler:
+     """
+     Linear warmup for the first ``warmup_steps`` global steps, then cosine
+     decay from ``init_lr`` to ``min_lr`` over ``max_epoch * iters_per_epoch``
+     steps.
+     """
+
+     def __init__(
+         self,
+         optimizer,
+         max_epoch,
+         iters_per_epoch,
+         min_lr,
+         init_lr,
+         warmup_steps=0,
+         warmup_start_lr=-1,
+         **kwargs
+     ):
+         self.optimizer = optimizer
+
+         self.max_epoch = max_epoch
+         self.iters_per_epoch = iters_per_epoch
+         self.min_lr = min_lr
+
+         self.init_lr = init_lr
+         self.warmup_steps = warmup_steps
+         # A negative warmup_start_lr means "start the warmup at init_lr".
+         self.warmup_start_lr = warmup_start_lr if warmup_start_lr >= 0 else init_lr
+
+     def step(self, cur_epoch, cur_step):
+         """
+         Set the optimizer's learning rate.
+
+         Args:
+             cur_epoch: index of the current epoch.
+             cur_step: step index within the current epoch.
+         """
+         total_cur_step = cur_epoch * self.iters_per_epoch + cur_step
+         if total_cur_step < self.warmup_steps:
+             warmup_lr_schedule(
+                 # Use the global step: the per-epoch step restarts at 0 each
+                 # epoch and would reset the warmup when it spans several epochs.
+                 step=total_cur_step,
+                 optimizer=self.optimizer,
+                 max_step=self.warmup_steps,
+                 init_lr=self.warmup_start_lr,
+                 max_lr=self.init_lr,
+             )
+         else:
+             cosine_lr_schedule(
+                 epoch=total_cur_step,
+                 optimizer=self.optimizer,
+                 max_epoch=self.max_epoch * self.iters_per_epoch,
+                 init_lr=self.init_lr,
+                 min_lr=self.min_lr,
+             )
+
+
+ def cosine_lr_schedule(optimizer, epoch, max_epoch, init_lr, min_lr):
+     """Set every param group's lr on a half-cosine curve from init_lr down to min_lr."""
+     amplitude = init_lr - min_lr
+     phase = math.pi * epoch / max_epoch
+     lr = amplitude * 0.5 * (1.0 + math.cos(phase)) + min_lr
+     for group in optimizer.param_groups:
+         group["lr"] = lr
+
+
+ def warmup_lr_schedule(optimizer, step, max_step, init_lr, max_lr):
+     """Set every param group's lr on a linear ramp from init_lr, capped at max_lr."""
+     total_steps = max(max_step, 1)
+     ramped = init_lr + (max_lr - init_lr) * step / total_steps
+     lr = min(max_lr, ramped)
+     for group in optimizer.param_groups:
+         group["lr"] = lr
+
+
+ def step_lr_schedule(optimizer, epoch, init_lr, min_lr, decay_rate):
+     """Set every param group's lr to init_lr * decay_rate**epoch, floored at min_lr."""
+     decayed = init_lr * (decay_rate**epoch)
+     lr = max(min_lr, decayed)
+     for group in optimizer.param_groups:
+         group["lr"] = lr
diff --git a/minigpt4/common/registry.py b/minigpt4/common/registry.py
new file mode 100644
index 0000000000000000000000000000000000000000..33dfe7692e401d1b2cb3c3d221a5787dca419b86
--- /dev/null
+++ b/minigpt4/common/registry.py
@@ -0,0 +1,329 @@
+"""
+ Copyright (c) 2022, salesforce.com, inc.
+ All rights reserved.
+ SPDX-License-Identifier: BSD-3-Clause
+ For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+"""
+
+
+ class Registry:
+     """
+     Process-wide registry mapping string names to classes, paths and
+     arbitrary state.
+
+     All data lives in the class-level ``mapping`` dict, so the class and every
+     instance share the same tables.
+     """
+
+     mapping = {
+         "builder_name_mapping": {},
+         "task_name_mapping": {},
+         "processor_name_mapping": {},
+         "model_name_mapping": {},
+         "lr_scheduler_name_mapping": {},
+         "runner_name_mapping": {},
+         "state": {},
+         "paths": {},
+     }
+
+     @classmethod
+     def _register_unique(cls, table_name, name, obj):
+         """Store ``obj`` under ``name`` in ``mapping[table_name]``; KeyError on duplicates."""
+         table = cls.mapping[table_name]
+         if name in table:
+             raise KeyError(
+                 "Name '{}' already registered for {}.".format(name, table[name])
+             )
+         table[name] = obj
+         return obj
+
+     @classmethod
+     def register_builder(cls, name):
+         r"""Register a dataset builder to registry with key 'name'
+
+         Args:
+             name: Key with which the builder will be registered.
+
+         Returns:
+             A class decorator. The decorated class must inherit
+             BaseDatasetBuilder (AssertionError otherwise); a duplicate name
+             raises KeyError.
+         """
+
+         def wrap(builder_cls):
+             # Imported lazily, at decoration time rather than module import.
+             from minigpt4.datasets.builders.base_dataset_builder import BaseDatasetBuilder
+
+             assert issubclass(
+                 builder_cls, BaseDatasetBuilder
+             ), "All builders must inherit BaseDatasetBuilder class, found {}".format(
+                 builder_cls
+             )
+             return cls._register_unique("builder_name_mapping", name, builder_cls)
+
+         return wrap
+
+     @classmethod
+     def register_task(cls, name):
+         r"""Register a task to registry with key 'name'
+
+         Args:
+             name: Key with which the task will be registered.
+
+         Returns:
+             A class decorator. The decorated class must inherit BaseTask
+             (AssertionError otherwise); a duplicate name raises KeyError.
+         """
+
+         def wrap(task_cls):
+             from minigpt4.tasks.base_task import BaseTask
+
+             assert issubclass(
+                 task_cls, BaseTask
+             ), "All tasks must inherit BaseTask class"
+             return cls._register_unique("task_name_mapping", name, task_cls)
+
+         return wrap
+
+     @classmethod
+     def register_model(cls, name):
+         r"""Register a model to registry with key 'name'
+
+         Args:
+             name: Key with which the model will be registered.
+
+         Returns:
+             A class decorator. The decorated class must inherit BaseModel
+             (AssertionError otherwise); a duplicate name raises KeyError.
+         """
+
+         def wrap(model_cls):
+             from minigpt4.models import BaseModel
+
+             assert issubclass(
+                 model_cls, BaseModel
+             ), "All models must inherit BaseModel class"
+             return cls._register_unique("model_name_mapping", name, model_cls)
+
+         return wrap
+
+     @classmethod
+     def register_processor(cls, name):
+         r"""Register a processor to registry with key 'name'
+
+         Args:
+             name: Key with which the processor will be registered.
+
+         Returns:
+             A class decorator. The decorated class must inherit BaseProcessor
+             (AssertionError otherwise); a duplicate name raises KeyError.
+         """
+
+         def wrap(processor_cls):
+             from minigpt4.processors import BaseProcessor
+
+             assert issubclass(
+                 processor_cls, BaseProcessor
+             ), "All processors must inherit BaseProcessor class"
+             return cls._register_unique("processor_name_mapping", name, processor_cls)
+
+         return wrap
+
+     @classmethod
+     def register_lr_scheduler(cls, name):
+         r"""Register a learning rate scheduler to registry with key 'name'
+
+         Args:
+             name: Key with which the scheduler will be registered.
+
+         Returns:
+             A class decorator; a duplicate name raises KeyError.
+         """
+
+         def wrap(lr_sched_cls):
+             return cls._register_unique("lr_scheduler_name_mapping", name, lr_sched_cls)
+
+         return wrap
+
+     @classmethod
+     def register_runner(cls, name):
+         r"""Register a runner to registry with key 'name'
+
+         Args:
+             name: Key with which the runner will be registered.
+
+         Returns:
+             A class decorator; a duplicate name raises KeyError.
+         """
+
+         def wrap(runner_cls):
+             return cls._register_unique("runner_name_mapping", name, runner_cls)
+
+         return wrap
+
+     @classmethod
+     def register_path(cls, name, path):
+         r"""Register a path to registry with key 'name'
+
+         Args:
+             name: Key with which the path will be registered.
+             path: The path, which must be a str.
+
+         Raises:
+             AssertionError: if ``path`` is not a str.
+             KeyError: if ``name`` is already registered.
+         """
+         assert isinstance(path, str), "All path must be str."
+         if name in cls.mapping["paths"]:
+             raise KeyError("Name '{}' already registered.".format(name))
+         cls.mapping["paths"][name] = path
+
+     @classmethod
+     def register(cls, name, obj):
+         r"""Register an item to registry with key 'name'
+
+         A dotted name such as ``"a.b"`` is stored as nested dicts
+         (``state["a"]["b"]``); missing intermediate dicts are created.
+
+         Args:
+             name: Key with which the item will be registered.
+             obj: The item to store.
+
+         Usage::
+
+             from minigpt4.common.registry import registry
+
+             registry.register("config", {})
+         """
+         path = name.split(".")
+         current = cls.mapping["state"]
+
+         for part in path[:-1]:
+             if part not in current:
+                 current[part] = {}
+             current = current[part]
+
+         current[path[-1]] = obj
+
+     # @classmethod
+     # def get_trainer_class(cls, name):
+     #     return cls.mapping["trainer_name_mapping"].get(name, None)
+
+     @classmethod
+     def get_builder_class(cls, name):
+         return cls.mapping["builder_name_mapping"].get(name, None)
+
+     @classmethod
+     def get_model_class(cls, name):
+         return cls.mapping["model_name_mapping"].get(name, None)
+
+     @classmethod
+     def get_task_class(cls, name):
+         return cls.mapping["task_name_mapping"].get(name, None)
+
+     @classmethod
+     def get_processor_class(cls, name):
+         return cls.mapping["processor_name_mapping"].get(name, None)
+
+     @classmethod
+     def get_lr_scheduler_class(cls, name):
+         return cls.mapping["lr_scheduler_name_mapping"].get(name, None)
+
+     @classmethod
+     def get_runner_class(cls, name):
+         return cls.mapping["runner_name_mapping"].get(name, None)
+
+     @classmethod
+     def list_runners(cls):
+         return sorted(cls.mapping["runner_name_mapping"].keys())
+
+     @classmethod
+     def list_models(cls):
+         return sorted(cls.mapping["model_name_mapping"].keys())
+
+     @classmethod
+     def list_tasks(cls):
+         return sorted(cls.mapping["task_name_mapping"].keys())
+
+     @classmethod
+     def list_processors(cls):
+         return sorted(cls.mapping["processor_name_mapping"].keys())
+
+     @classmethod
+     def list_lr_schedulers(cls):
+         return sorted(cls.mapping["lr_scheduler_name_mapping"].keys())
+
+     @classmethod
+     def list_datasets(cls):
+         return sorted(cls.mapping["builder_name_mapping"].keys())
+
+     @classmethod
+     def get_path(cls, name):
+         return cls.mapping["paths"].get(name, None)
+
+     @classmethod
+     def get(cls, name, default=None, no_warning=False):
+         r"""Get an item from registry with key 'name'
+
+         Args:
+             name (string): Key whose value needs to be retrieved. Dotted names
+                 are resolved through nested dicts.
+             default: Returned when the key is not in the registry.
+                 Default: None
+             no_warning (bool): If True, no warning is emitted when the key
+                 doesn't exist. Default: False
+
+         Returns:
+             The stored value, or ``default`` when the path cannot be resolved.
+         """
+         original_name = name
+         name = name.split(".")
+         value = cls.mapping["state"]
+         for subname in name:
+             if not hasattr(value, "get"):
+                 # The path runs through a leaf value, so it cannot exist.
+                 value = default
+                 break
+             value = value.get(subname, default)
+             if value is default:
+                 break
+
+         # A warning is only emitted when a "writer" object has been
+         # registered in the state.
+         if (
+             "writer" in cls.mapping["state"]
+             and value == default
+             and no_warning is False
+         ):
+             cls.mapping["state"]["writer"].warning(
+                 "Key {} is not present in registry, returning default value "
+                 "of {}".format(original_name, default)
+             )
+         return value
+
+     @classmethod
+     def unregister(cls, name):
+         r"""Remove an item from registry with key 'name'
+
+         Dotted names are resolved the same way as in ``register``.
+
+         Args:
+             name: Key which needs to be removed.
+
+         Returns:
+             The removed item, or None when the key is not registered.
+
+         Usage::
+
+             from minigpt4.common.registry import registry
+
+             config = registry.unregister("config")
+         """
+         *parents, leaf = name.split(".")
+         current = cls.mapping["state"]
+         for part in parents:
+             if not isinstance(current, dict) or part not in current:
+                 return None
+             current = current[part]
+         if not isinstance(current, dict):
+             return None
+         return current.pop(leaf, None)
+
+
+ registry = Registry()
diff --git a/minigpt4/common/utils.py b/minigpt4/common/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..d536eac1d32b35ad9e97abb29895120d850aacaf
--- /dev/null
+++ b/minigpt4/common/utils.py
@@ -0,0 +1,424 @@
+"""
+ Copyright (c) 2022, salesforce.com, inc.
+ All rights reserved.
+ SPDX-License-Identifier: BSD-3-Clause
+ For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+"""
+
+import io
+import json
+import logging
+import os
+import pickle
+import re
+import shutil
+import urllib
+import urllib.error
+import urllib.request
+from typing import Optional
+from urllib.parse import urlparse
+
+import numpy as np
+import pandas as pd
+import yaml
+from iopath.common.download import download
+from iopath.common.file_io import file_lock, g_pathmgr
+from minigpt4.common.registry import registry
+from torch.utils.model_zoo import tqdm
+from torchvision.datasets.utils import (
+ check_integrity,
+ download_file_from_google_drive,
+ extract_archive,
+)
+
+
+ def now():
+     """
+     Return the current local time formatted as ``%Y%m%d%H%M`` with the last
+     character (the final minute digit) removed.
+     """
+     from datetime import datetime
+
+     stamp = datetime.now().strftime("%Y%m%d%H%M")
+     return stamp[:-1]
+
+
+ def is_url(url_or_filename):
+     """Return True when the string has an ``http`` or ``https`` scheme."""
+     scheme = urlparse(url_or_filename).scheme
+     return scheme in ("http", "https")
+
+
+ def get_cache_path(rel_path):
+     """Join ``rel_path`` onto the registered "cache_root" path and expand ``~``."""
+     cache_root = registry.get_path("cache_root")
+     joined = os.path.join(cache_root, rel_path)
+     return os.path.expanduser(joined)
+
+
+ def get_abs_path(rel_path):
+     """Join ``rel_path`` onto the registered "library_root" path."""
+     library_root = registry.get_path("library_root")
+     return os.path.join(library_root, rel_path)
+
+
+ def load_json(filename):
+     """Read ``filename`` and return its parsed JSON content."""
+     with open(filename, "r") as handle:
+         content = json.load(handle)
+     return content
+
+
+# The following are adapted from torchvision and vissl
+# torchvision: https://github.com/pytorch/vision
+# vissl: https://github.com/facebookresearch/vissl/blob/main/vissl/utils/download.py
+
+
+ def makedir(dir_path):
+     """
+     Create the directory if it does not exist.
+
+     Returns:
+         bool: True when the directory exists or was created, False when the
+         attempt raised an error (a message is printed in that case).
+     """
+     is_success = False
+     try:
+         if not g_pathmgr.exists(dir_path):
+             g_pathmgr.mkdirs(dir_path)
+         is_success = True
+     except Exception:
+         # Deliberately best-effort, but catch Exception rather than
+         # BaseException so KeyboardInterrupt / SystemExit still propagate.
+         print(f"Error creating directory: {dir_path}")
+     return is_success
+
+
+ def get_redirected_url(url: str):
+     """
+     Follow redirects for ``url`` and return the final URL, or the original
+     URL when the request was not redirected.
+     """
+     import requests
+
+     with requests.Session() as session:
+         with session.get(url, stream=True, allow_redirects=True) as response:
+             # A non-empty history means at least one redirect was followed.
+             return response.url if response.history else url
+
+
+ def to_google_drive_download_url(view_url: str) -> str:
+     """
+     Turn a Google Drive "view" URL into the matching direct-download URL.
+     Example input:
+         https://drive.google.com/file/d/137RyRjvTBkBiIfeYBNZBtViDHQ6_Ewsp/view
+     Example output:
+         https://drive.google.com/uc?export=download&id=137RyRjvTBkBiIfeYBNZBtViDHQ6_Ewsp
+     """
+     # Only the last two path segments matter: ".../<file_id>/view".
+     tail = view_url.rsplit("/", 2)
+     assert tail[-1] == "view"
+     file_id = tail[-2]
+     return "https://drive.google.com/uc?export=download&id=" + f"{file_id}"
+
+
+ def download_google_drive_url(url: str, output_path: str, output_file_name: str):
+ """
+ Download a file from google drive
+ Downloading an URL from google drive requires confirmation when
+ the file of the size is too big (google drive notifies that
+ anti-viral checks cannot be performed on such files)
+
+ Args:
+ url: download URL of the file.
+ output_path: directory the file is written to; created if missing.
+ output_file_name: name of the file inside ``output_path``.
+ """
+ import requests
+
+ with requests.Session() as session:
+
+ # First get the confirmation token and append it to the URL
+ with session.get(url, stream=True, allow_redirects=True) as response:
+ # The token is the value of a cookie whose name starts with "download_warning".
+ for k, v in response.cookies.items():
+ if k.startswith("download_warning"):
+ url = url + "&confirm=" + v
+
+ # Then download the content of the file
+ # NOTE(review): the response status is not checked before writing to disk.
+ with session.get(url, stream=True, verify=True) as response:
+ makedir(output_path)
+ path = os.path.join(output_path, output_file_name)
+ # Progress bar total; 0 when the server sends no Content-length header.
+ total_size = int(response.headers.get("Content-length", 0))
+ with open(path, "wb") as file:
+ from tqdm import tqdm
+
+ with tqdm(total=total_size) as progress_bar:
+ for block in response.iter_content(
+ chunk_size=io.DEFAULT_BUFFER_SIZE
+ ):
+ file.write(block)
+ progress_bar.update(len(block))
+
+
+ def _get_google_drive_file_id(url: str) -> Optional[str]:
+     """
+     Extract the file id from a Google Drive / Docs file URL.
+
+     Args:
+         url: URL to inspect.
+
+     Returns:
+         The id found in a ``/file/d/<id>`` path, or None when the host is not
+         a drive/docs google.com host or the path has a different shape.
+     """
+     parts = urlparse(url)
+
+     if re.match(r"(drive|docs)[.]google[.]com", parts.netloc) is None:
+         return None
+
+     # The group must be named "id" because it is read back by name below.
+     match = re.match(r"/file/d/(?P<id>[^/]*)", parts.path)
+     if match is None:
+         return None
+
+     return match.group("id")
+
+
+ def _urlretrieve(url: str, filename: str, chunk_size: int = 1024) -> None:
+     """
+     Stream ``url`` into ``filename`` in chunks while showing a progress bar.
+
+     Args:
+         url: URL to download.
+         filename: path of the file to write (opened in binary write mode).
+         chunk_size: number of bytes requested per read.
+     """
+     with open(filename, "wb") as fh:
+         with urllib.request.urlopen(
+             urllib.request.Request(url, headers={"User-Agent": "vissl"})
+         ) as response:
+             with tqdm(total=response.length) as pbar:
+                 # read() returns bytes, so the end-of-stream sentinel is b"".
+                 for chunk in iter(lambda: response.read(chunk_size), b""):
+                     # Advance by the bytes actually received; the last chunk
+                     # may be shorter than chunk_size.
+                     pbar.update(len(chunk))
+                     fh.write(chunk)
+
+
+ def download_url(
+     url: str,
+     root: str,
+     filename: Optional[str] = None,
+     md5: Optional[str] = None,
+ ) -> None:
+     """Fetch ``url`` into the directory ``root``, skipping the download when a
+     verified copy is already there.
+     Args:
+         url (str): URL to download file from
+         root (str): Directory to place downloaded file in
+         filename (str, optional): Name to save the file under.
+             If None, use the basename of the URL.
+         md5 (str, optional): MD5 checksum of the download. If None, do not check
+     Raises:
+         RuntimeError: if the downloaded file is missing or fails the check.
+     """
+     root = os.path.expanduser(root)
+     if not filename:
+         filename = os.path.basename(url)
+     target = os.path.join(root, filename)
+
+     makedir(root)
+
+     # nothing to do when a verified copy already exists locally
+     if check_integrity(target, md5):
+         print("Using downloaded and verified file: " + target)
+         return
+
+     # resolve the redirect chain, if any
+     url = get_redirected_url(url)
+
+     # Google Drive files go through the dedicated downloader
+     drive_id = _get_google_drive_file_id(url)
+     if drive_id is not None:
+         return download_file_from_google_drive(drive_id, root, filename, md5)
+
+     # regular download, retried once over plain http when https fails
+     try:
+         print("Downloading " + url + " to " + target)
+         _urlretrieve(url, target)
+     except (urllib.error.URLError, IOError) as e:  # type: ignore[attr-defined]
+         if url[:5] != "https":
+             raise e
+         url = url.replace("https:", "http:")
+         print(
+             "Failed download. Trying https -> http instead."
+             " Downloading " + url + " to " + target
+         )
+         _urlretrieve(url, target)
+
+     # verify what was just downloaded
+     if not check_integrity(target, md5):
+         raise RuntimeError("File not found or corrupted.")
+
+
+def download_and_extract_archive(
+ url: str,
+ download_root: str,
+ extract_root: Optional[str] = None,
+ filename: Optional[str] = None,
+ md5: Optional[str] = None,
+ remove_finished: bool = False,
+) -> None:
+ download_root = os.path.expanduser(download_root)
+ if extract_root is None:
+ extract_root = download_root
+ if not filename:
+ filename = os.path.basename(url)
+
+ download_url(url, download_root, filename, md5)
+
+ archive = os.path.join(download_root, filename)
+ print("Extracting {} to {}".format(archive, extract_root))
+ extract_archive(archive, extract_root, remove_finished)
+
+
+def cache_url(url: str, cache_dir: str) -> str:
+ """
+ This implementation downloads the remote resource and caches it locally.
+ The resource will only be downloaded if not previously requested.
+ """
+ parsed_url = urlparse(url)
+ dirname = os.path.join(cache_dir, os.path.dirname(parsed_url.path.lstrip("/")))
+ makedir(dirname)
+ filename = url.split("/")[-1]
+ cached = os.path.join(dirname, filename)
+ with file_lock(cached):
+ if not os.path.isfile(cached):
+ logging.info(f"Downloading {url} to {cached} ...")
+ cached = download(url, dirname, filename=filename)
+ logging.info(f"URL {url} cached in {cached}")
+ return cached
+
+
+# TODO (prigoyal): convert this into RAII-style API
+def create_file_symlink(file1, file2):
+ """
+ Simply create the symlinks for a given file1 to file2.
+ Useful during model checkpointing to symlinks to the
+ latest successful checkpoint.
+ """
+ try:
+ if g_pathmgr.exists(file2):
+ g_pathmgr.rm(file2)
+ g_pathmgr.symlink(file1, file2)
+ except Exception as e:
+ logging.info(f"Could NOT create symlink. Error: {e}")
+
+
+def save_file(data, filename, append_to_json=True, verbose=True):
+ """
+ Common i/o utility to handle saving data to various file formats.
+ Supported:
+ .pkl, .pickle, .npy, .json
+ Specifically for .json, users have the option to either append (default)
+ or rewrite by passing in Boolean value to append_to_json.
+ """
+ if verbose:
+ logging.info(f"Saving data to file: {filename}")
+ file_ext = os.path.splitext(filename)[1]
+ if file_ext in [".pkl", ".pickle"]:
+ with g_pathmgr.open(filename, "wb") as fopen:
+ pickle.dump(data, fopen, pickle.HIGHEST_PROTOCOL)
+ elif file_ext == ".npy":
+ with g_pathmgr.open(filename, "wb") as fopen:
+ np.save(fopen, data)
+ elif file_ext == ".json":
+ if append_to_json:
+ with g_pathmgr.open(filename, "a") as fopen:
+ fopen.write(json.dumps(data, sort_keys=True) + "\n")
+ fopen.flush()
+ else:
+ with g_pathmgr.open(filename, "w") as fopen:
+ fopen.write(json.dumps(data, sort_keys=True) + "\n")
+ fopen.flush()
+ elif file_ext == ".yaml":
+ with g_pathmgr.open(filename, "w") as fopen:
+ dump = yaml.dump(data)
+ fopen.write(dump)
+ fopen.flush()
+ else:
+ raise Exception(f"Saving {file_ext} is not supported yet")
+
+ if verbose:
+ logging.info(f"Saved data to file: {filename}")
+
+
+def load_file(filename, mmap_mode=None, verbose=True, allow_pickle=False):
+ """
+ Common i/o utility to handle loading data from various file formats.
+ The format is chosen from the file extension.
+ Supported:
+ .txt (returned as a list of lines), .pkl, .pickle, .npy, .json,
+ .yaml, .csv (read with pandas)
+ Any other extension raises an Exception.
+ For the npy files, we support reading the files in mmap_mode.
+ If the mmap_mode of reading is not successful, we load data without the
+ mmap_mode.
+ """
+ if verbose:
+ logging.info(f"Loading data from file: {filename}")
+
+ file_ext = os.path.splitext(filename)[1]
+ if file_ext == ".txt":
+ with g_pathmgr.open(filename, "r") as fopen:
+ data = fopen.readlines()
+ elif file_ext in [".pkl", ".pickle"]:
+ # NOTE(review): unpickling can execute arbitrary code; only load trusted files.
+ with g_pathmgr.open(filename, "rb") as fopen:
+ data = pickle.load(fopen, encoding="latin1")
+ elif file_ext == ".npy":
+ if mmap_mode:
+ # Three attempts, in order: mmap through g_pathmgr, mmap straight from
+ # the path, and finally a plain (non-mmap) load through g_pathmgr.
+ try:
+ with g_pathmgr.open(filename, "rb") as fopen:
+ data = np.load(
+ fopen,
+ allow_pickle=allow_pickle,
+ encoding="latin1",
+ mmap_mode=mmap_mode,
+ )
+ except ValueError as e:
+ logging.info(
+ f"Could not mmap {filename}: {e}. Trying without g_pathmgr"
+ )
+ data = np.load(
+ filename,
+ allow_pickle=allow_pickle,
+ encoding="latin1",
+ mmap_mode=mmap_mode,
+ )
+ logging.info("Successfully loaded without g_pathmgr")
+ except Exception:
+ logging.info("Could not mmap without g_pathmgr. Trying without mmap")
+ with g_pathmgr.open(filename, "rb") as fopen:
+ data = np.load(fopen, allow_pickle=allow_pickle, encoding="latin1")
+ else:
+ with g_pathmgr.open(filename, "rb") as fopen:
+ data = np.load(fopen, allow_pickle=allow_pickle, encoding="latin1")
+ elif file_ext == ".json":
+ with g_pathmgr.open(filename, "r") as fopen:
+ data = json.load(fopen)
+ elif file_ext == ".yaml":
+ # NOTE(review): FullLoader is more permissive than yaml.safe_load; consider
+ # safe_load if these files can come from untrusted sources.
+ with g_pathmgr.open(filename, "r") as fopen:
+ data = yaml.load(fopen, Loader=yaml.FullLoader)
+ elif file_ext == ".csv":
+ with g_pathmgr.open(filename, "r") as fopen:
+ data = pd.read_csv(fopen)
+ else:
+ raise Exception(f"Reading from {file_ext} is not supported yet")
+ return data
+
+
+def abspath(resource_path: str):
+ """
+ Make a path absolute, but take into account prefixes like
+ "http://" or "manifold://"
+ """
+ regex = re.compile(r"^\w+://")
+ if regex.match(resource_path) is None:
+ return os.path.abspath(resource_path)
+ else:
+ return resource_path
+
+
+def makedir(dir_path):
+ """
+ Create the directory if it does not exist.
+ """
+ is_success = False
+ try:
+ if not g_pathmgr.exists(dir_path):
+ g_pathmgr.mkdirs(dir_path)
+ is_success = True
+ except BaseException:
+ logging.info(f"Error creating directory: {dir_path}")
+ return is_success
+
+
+def is_url(input_url):
+ """
+ Check if an input string is a url. look for http(s):// and ignoring the case
+ """
+ is_url = re.match(r"^(?:http)s?://", input_url, re.IGNORECASE) is not None
+ return is_url
+
+
+def cleanup_dir(dir):
+ """
+ Utility for deleting a directory. Useful for cleaning the storage space
+ that contains various training artifacts like checkpoints, data etc.
+ """
+ if os.path.exists(dir):
+ logging.info(f"Deleting directory: {dir}")
+ shutil.rmtree(dir)
+ logging.info(f"Deleted contents of directory: {dir}")
+
+
+def get_file_size(filename):
+ """
+ Given a file, get the size of file in MB
+ """
+ size_in_mb = os.path.getsize(filename) / float(1024**2)
+ return size_in_mb
diff --git a/minigpt4/configs/datasets/cc_combine/align.yaml b/minigpt4/configs/datasets/cc_combine/align.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..fbfa24f9c21236fc2822dc2cefb8718948b57ebd
--- /dev/null
+++ b/minigpt4/configs/datasets/cc_combine/align.yaml
@@ -0,0 +1,16 @@
+ # Copyright (c) 2022, salesforce.com, inc.
+ # All rights reserved.
+ # SPDX-License-Identifier: BSD-3-Clause
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+
+datasets:
+  cc_align:
+    # Key under build_info that holds the visual data location (here: images).
+    data_type: images
+    build_info:
+      # Be careful not to append minus sign (-) before split to avoid itemizing
+      # One entry per split; each split has its own url/storage pair.
+      annotations:
+        train:
+          url: placeholder
+          storage: /ibex/project/c2133/blip_dataset/image_alignment_cc/filter_cap.json
+      images:
+        storage: /ibex/project/c2133/blip_dataset/image_alignment_cc/
diff --git a/minigpt4/configs/datasets/cc_combine/defaults.yaml b/minigpt4/configs/datasets/cc_combine/defaults.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..53956604d11459a3f2b39c614d6754fae210b1f5
--- /dev/null
+++ b/minigpt4/configs/datasets/cc_combine/defaults.yaml
@@ -0,0 +1,11 @@
+ # Copyright (c) 2022, salesforce.com, inc.
+ # All rights reserved.
+ # SPDX-License-Identifier: BSD-3-Clause
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+
+datasets:
+  cc_combine:
+    data_type: images
+    build_info:
+      # Be careful not to append minus sign (-) before split to avoid itemizing
+      # Shard pattern; the builder passes it to the dataset as `location`.
+      storage: /ibex/project/c2133/blip_dataset/cc3m/cc3m_cc12m_sbu/{00000..01255}.tar
diff --git a/minigpt4/configs/datasets/laion/defaults.yaml b/minigpt4/configs/datasets/laion/defaults.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..bbd37dfb6b080ccc7ef70ca7858c6b029ec0020e
--- /dev/null
+++ b/minigpt4/configs/datasets/laion/defaults.yaml
@@ -0,0 +1,13 @@
+ # Copyright (c) 2022, salesforce.com, inc.
+ # All rights reserved.
+ # SPDX-License-Identifier: BSD-3-Clause
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+
+datasets:
+  laion:
+
+    data_type: images
+
+    build_info:
+      # Be careful not to append minus sign (-) before split to avoid itemizing
+      # Shard pattern; the builder passes it to the dataset as `location`.
+      storage: /ibex/project/c2133/blip_dataset/laion_1b/laion_gpu/{00000..10488}.tar
diff --git a/minigpt4/configs/default.yaml b/minigpt4/configs/default.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..644bf96b66603d6dfd3baf237a9096c1f130103a
--- /dev/null
+++ b/minigpt4/configs/default.yaml
@@ -0,0 +1,10 @@
+ # Copyright (c) 2022, salesforce.com, inc.
+ # All rights reserved.
+ # SPDX-License-Identifier: BSD-3-Clause
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+
+env:
+  # Default users: uncomment the relative cache directory below.
+  # cache_root: "cache"
+  # Internal use: persistent storage location.
+  cache_root: "/export/home/.cache/minigpt4"
diff --git a/minigpt4/configs/models/minigpt4.yaml b/minigpt4/configs/models/minigpt4.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..2e04f3751e10bf355f49e542e5d5c5610382b236
--- /dev/null
+++ b/minigpt4/configs/models/minigpt4.yaml
@@ -0,0 +1,39 @@
+ # Copyright (c) 2022, salesforce.com, inc.
+ # All rights reserved.
+ # SPDX-License-Identifier: BSD-3-Clause
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+
+model:
+  arch: mini_gpt4
+
+  # vit encoder
+  image_size: 224
+  drop_path_rate: 0
+  use_grad_checkpoint: false
+  vit_precision: "fp16"
+  freeze_vit: true
+  freeze_qformer: true
+
+  # Q-Former
+  num_query_token: 32
+
+  # Vicuna
+  llama_model: "/path/to/vicuna/weights/"
+
+  # generation configs
+  prompt: ""
+
+
+# Processor settings; each of vis_processor / text_processor has one entry
+# for training ("train") and one for evaluation ("eval").
+preprocess:
+  vis_processor:
+    train:
+      name: "blip2_image_train"
+      image_size: 224
+    eval:
+      name: "blip2_image_eval"
+      image_size: 224
+  text_processor:
+    train:
+      name: "blip_caption"
+    eval:
+      name: "blip_caption"
diff --git a/minigpt4/conversation/__init__.py b/minigpt4/conversation/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/minigpt4/conversation/conversation.py b/minigpt4/conversation/conversation.py
new file mode 100644
index 0000000000000000000000000000000000000000..b0c27115bdf58c5a13d497a6bc447998f1f44213
--- /dev/null
+++ b/minigpt4/conversation/conversation.py
@@ -0,0 +1,195 @@
+import argparse
+import time
+from PIL import Image
+
+import torch
+from transformers import AutoTokenizer, AutoModelForCausalLM, LlamaTokenizer
+from transformers import StoppingCriteria, StoppingCriteriaList
+
+import dataclasses
+from enum import auto, Enum
+from typing import List, Tuple, Any
+
+from minigpt4.common.registry import registry
+
+
+class SeparatorStyle(Enum):
+ """Different separator style."""
+ SINGLE = auto()
+ TWO = auto()
+
+
+@dataclasses.dataclass
+class Conversation:
+ """A class that keeps all conversation history."""
+ system: str
+ roles: List[str]
+ messages: List[List[str]]
+ offset: int
+ # system_img: List[Image.Image] = []
+ sep_style: SeparatorStyle = SeparatorStyle.SINGLE
+ sep: str = "###"
+ sep2: str = None
+
+ skip_next: bool = False
+ conv_id: Any = None
+
+ def get_prompt(self):
+ if self.sep_style == SeparatorStyle.SINGLE:
+ ret = self.system + self.sep
+ for role, message in self.messages:
+ if message:
+ ret += role + ": " + message + self.sep
+ else:
+ ret += role + ":"
+ return ret
+ elif self.sep_style == SeparatorStyle.TWO:
+ seps = [self.sep, self.sep2]
+ ret = self.system + seps[0]
+ for i, (role, message) in enumerate(self.messages):
+ if message:
+ ret += role + ": " + message + seps[i % 2]
+ else:
+ ret += role + ":"
+ return ret
+ else:
+ raise ValueError(f"Invalid style: {self.sep_style}")
+
+ def append_message(self, role, message):
+ self.messages.append([role, message])
+
+ def to_gradio_chatbot(self):
+ ret = []
+ for i, (role, msg) in enumerate(self.messages[self.offset:]):
+ if i % 2 == 0:
+ ret.append([msg, None])
+ else:
+ ret[-1][-1] = msg
+ return ret
+
+ def copy(self):
+ return Conversation(
+ system=self.system,
+ # system_img=self.system_img,
+ roles=self.roles,
+ messages=[[x, y] for x, y in self.messages],
+ offset=self.offset,
+ sep_style=self.sep_style,
+ sep=self.sep,
+ sep2=self.sep2,
+ conv_id=self.conv_id)
+
+ def dict(self):
+ return {
+ "system": self.system,
+ # "system_img": self.system_img,
+ "roles": self.roles,
+ "messages": self.messages,
+ "offset": self.offset,
+ "sep": self.sep,
+ "sep2": self.sep2,
+ "conv_id": self.conv_id,
+ }
+
+
+class StoppingCriteriaSub(StoppingCriteria):
+
+ def __init__(self, stops=[], encounters=1):
+ super().__init__()
+ self.stops = stops
+
+ def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor):
+ for stop in self.stops:
+ if torch.all((stop == input_ids[0][-len(stop):])).item():
+ return True
+
+ return False
+
+
+CONV_VISION = Conversation(
+ system="Give the following image: ImageContent. "
+ "You will be able to see the image once I provide it to you. Please answer my questions.",
+ roles=("Human", "Assistant"),
+ messages=[],
+ offset=2,
+ sep_style=SeparatorStyle.SINGLE,
+ sep="###",
+)
+
+
+
+class Chat:
+ def __init__(self, model, vis_processor, device='cuda:0'):
+ self.device = device
+ self.model = model
+ self.vis_processor = vis_processor
+ stop_words_ids = [torch.tensor([835]).to(self.device),
+ torch.tensor([2277, 29937]).to(self.device)] # '###' can be encoded in two different ways.
+ self.stopping_criteria = StoppingCriteriaList([StoppingCriteriaSub(stops=stop_words_ids)])
+
+ def ask(self, text, conv):
+ if len(conv.messages) > 0 and conv.messages[-1][0] == conv.roles[0] \
+ and conv.messages[-1][1][-6:] == '': # last message is image.
+ conv.messages[-1][1] = ' '.join([conv.messages[-1][1], text])
+ else:
+ conv.append_message(conv.roles[0], text)
+
+ def answer(self, conv, img_list, max_new_tokens=200, num_beams=5, min_length=1, top_p=0.9,
+ repetition_penalty=1.0, length_penalty=1, temperature=1):
+ conv.append_message(conv.roles[1], None)
+ embs = self.get_context_emb(conv, img_list)
+ outputs = self.model.llama_model.generate(
+ inputs_embeds=embs,
+ max_new_tokens=max_new_tokens,
+ stopping_criteria=self.stopping_criteria,
+ num_beams=num_beams,
+ min_length=min_length,
+ top_p=top_p,
+ repetition_penalty=repetition_penalty,
+ length_penalty=length_penalty,
+ temperature=temperature,
+ )
+ output_token = outputs[0]
+ if output_token[0] == 0:
+ output_token = output_token[1:]
+ output_text = self.model.llama_tokenizer.decode(output_token, add_special_tokens=False)
+ output_text = output_text.split('###')[0] # remove the stop sign '###'
+ output_text = output_text.split('Assistant:')[-1].strip()
+ conv.messages[-1][1] = output_text
+ return output_text, output_token.cpu().numpy()
+
+ def upload_img(self, image, conv, img_list):
+ if isinstance(image, str): # is a image path
+ raw_image = Image.open(image).convert('RGB')
+ image = self.vis_processor(raw_image).unsqueeze(0).to(self.device)
+ elif isinstance(image, Image.Image):
+ raw_image = image
+ image = self.vis_processor(raw_image).unsqueeze(0).to(self.device)
+ elif isinstance(image, torch.Tensor):
+ if len(image.shape) == 3:
+ image = image.unsqueeze(0)
+ image = image.to(self.device)
+
+ image_emb, _ = self.model.encode_img(image)
+ img_list.append(image_emb)
+ conv.append_message(conv.roles[0], "")
+ msg = "Received."
+ # self.conv.append_message(self.conv.roles[1], msg)
+ return msg
+
+ def get_context_emb(self, conv, img_list):
+ prompt = conv.get_prompt()
+ prompt_segs = prompt.split('')
+ assert len(prompt_segs) == len(img_list) + 1, "Unmatched numbers of image placeholders and images."
+ seg_tokens = [
+ self.model.llama_tokenizer(
+ seg, return_tensors="pt", add_special_tokens=i == 0).to(self.device).input_ids
+ # only add bos to the first seg
+ for i, seg in enumerate(prompt_segs)
+ ]
+ seg_embs = [self.model.llama_model.model.embed_tokens(seg_t) for seg_t in seg_tokens]
+ mixed_embs = [emb for pair in zip(seg_embs[:-1], img_list) for emb in pair] + [seg_embs[-1]]
+ mixed_embs = torch.cat(mixed_embs, dim=1)
+ return mixed_embs
+
+
diff --git a/minigpt4/datasets/__init__.py b/minigpt4/datasets/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/minigpt4/datasets/builders/__init__.py b/minigpt4/datasets/builders/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a1f19e672f951204dc80067f30db368818fa4e00
--- /dev/null
+++ b/minigpt4/datasets/builders/__init__.py
@@ -0,0 +1,72 @@
+"""
+ Copyright (c) 2022, salesforce.com, inc.
+ All rights reserved.
+ SPDX-License-Identifier: BSD-3-Clause
+ For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+"""
+
+from minigpt4.datasets.builders.base_dataset_builder import load_dataset_config
+from minigpt4.datasets.builders.image_text_pair_builder import (
+ CCCombineBuilder,
+ LaionBuilder,
+ CCAlignBuilder
+)
+from minigpt4.common.registry import registry
+
+__all__ = [
+ "CCCombineBuilder",
+ "LaionBuilder",
+ "CCAlignBuilder"
+]
+
+
+def load_dataset(name, cfg_path=None, vis_path=None, data_type=None):
+ """
+ Example
+
+ >>> dataset = load_dataset("coco_caption", cfg=None)
+ >>> splits = dataset.keys()
+ >>> print([len(dataset[split]) for split in splits])
+
+ """
+ if cfg_path is None:
+ cfg = None
+ else:
+ cfg = load_dataset_config(cfg_path)
+
+ try:
+ builder = registry.get_builder_class(name)(cfg)
+ except TypeError:
+ print(
+ f"Dataset {name} not found. Available datasets:\n"
+ + ", ".join([str(k) for k in dataset_zoo.get_names()])
+ )
+ exit(1)
+
+ if vis_path is not None:
+ if data_type is None:
+ # use default data type in the config
+ data_type = builder.config.data_type
+
+ assert (
+ data_type in builder.config.build_info
+ ), f"Invalid data_type {data_type} for {name}."
+
+ builder.config.build_info.get(data_type).storage = vis_path
+
+ dataset = builder.build_datasets()
+ return dataset
+
+
+class DatasetZoo:
+ def __init__(self) -> None:
+ self.dataset_zoo = {
+ k: list(v.DATASET_CONFIG_DICT.keys())
+ for k, v in sorted(registry.mapping["builder_name_mapping"].items())
+ }
+
+ def get_names(self):
+ return list(self.dataset_zoo.keys())
+
+
+dataset_zoo = DatasetZoo()
diff --git a/minigpt4/datasets/builders/base_dataset_builder.py b/minigpt4/datasets/builders/base_dataset_builder.py
new file mode 100644
index 0000000000000000000000000000000000000000..8885b2c2de676b717081a512230ed6f90a0064ce
--- /dev/null
+++ b/minigpt4/datasets/builders/base_dataset_builder.py
@@ -0,0 +1,235 @@
+"""
+ Copyright (c) 2022, salesforce.com, inc.
+ All rights reserved.
+ SPDX-License-Identifier: BSD-3-Clause
+ For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+"""
+
+import logging
+import os
+import shutil
+import warnings
+
+from omegaconf import OmegaConf
+import torch.distributed as dist
+from torchvision.datasets.utils import download_url
+
+import minigpt4.common.utils as utils
+from minigpt4.common.dist_utils import is_dist_avail_and_initialized, is_main_process
+from minigpt4.common.registry import registry
+from minigpt4.processors.base_processor import BaseProcessor
+
+
+
+class BaseDatasetBuilder:
+ """
+ Base class for dataset builders: loads a dataset config, prepares
+ annotation / visual data locations, builds processors and creates one
+ dataset per split. Subclasses set train_dataset_cls / eval_dataset_cls
+ and DATASET_CONFIG_DICT.
+ """
+ train_dataset_cls, eval_dataset_cls = None, None
+
+ def __init__(self, cfg=None):
+ super().__init__()
+
+ if cfg is None:
+ # help to create datasets from default config.
+ self.config = load_dataset_config(self.default_config_path())
+ elif isinstance(cfg, str):
+ # cfg is a path to a dataset config file.
+ self.config = load_dataset_config(cfg)
+ else:
+ # when called from task.build_dataset()
+ self.config = cfg
+
+ self.data_type = self.config.data_type
+
+ # Placeholder processors; replaced by build_processors() when configured.
+ self.vis_processors = {"train": BaseProcessor(), "eval": BaseProcessor()}
+ self.text_processors = {"train": BaseProcessor(), "eval": BaseProcessor()}
+
+ def build_datasets(self):
+ """Prepare the data on the main process, then build and return the datasets."""
+ # download, split, etc...
+ # only called on 1 GPU/TPU in distributed
+
+ if is_main_process():
+ self._download_data()
+
+ # Other processes wait here until the main process has finished.
+ if is_dist_avail_and_initialized():
+ dist.barrier()
+
+ # at this point, all the annotations and image/videos should be all downloaded to the specified locations.
+ logging.info("Building datasets...")
+ datasets = self.build() # dataset['train'/'val'/'test']
+
+ return datasets
+
+ def build_processors(self):
+ """Create train/eval visual and text processors from the config, if present."""
+ vis_proc_cfg = self.config.get("vis_processor")
+ txt_proc_cfg = self.config.get("text_processor")
+
+ if vis_proc_cfg is not None:
+ vis_train_cfg = vis_proc_cfg.get("train")
+ vis_eval_cfg = vis_proc_cfg.get("eval")
+
+ self.vis_processors["train"] = self._build_proc_from_cfg(vis_train_cfg)
+ self.vis_processors["eval"] = self._build_proc_from_cfg(vis_eval_cfg)
+
+ if txt_proc_cfg is not None:
+ txt_train_cfg = txt_proc_cfg.get("train")
+ txt_eval_cfg = txt_proc_cfg.get("eval")
+
+ self.text_processors["train"] = self._build_proc_from_cfg(txt_train_cfg)
+ self.text_processors["eval"] = self._build_proc_from_cfg(txt_eval_cfg)
+
+ @staticmethod
+ def _build_proc_from_cfg(cfg):
+ """Instantiate the processor registered under cfg.name; None when cfg is None."""
+ return (
+ registry.get_processor_class(cfg.name).from_config(cfg)
+ if cfg is not None
+ else None
+ )
+
+ @classmethod
+ def default_config_path(cls, type="default"):
+ """Return the absolute path of the config listed under `type` in DATASET_CONFIG_DICT."""
+ return utils.get_abs_path(cls.DATASET_CONFIG_DICT[type])
+
+ def _download_data(self):
+ """Prepare annotations first, then the visual data."""
+ self._download_ann()
+ self._download_vis()
+
+ def _download_ann(self):
+ """
+ Download annotation files if necessary.
+ All the vision-language datasets should have annotations of unified format.
+
+ storage_path can be:
+ (1) relative/absolute: will be prefixed with env.cache_root to make full path if relative.
+ (2) basename/dirname: will be suffixed with base name of URL if dirname is provided.
+
+ Local annotation paths should be relative.
+ """
+ anns = self.config.build_info.annotations
+
+ splits = anns.keys()
+
+ cache_root = registry.get_path("cache_root")
+
+ for split in splits:
+ info = anns[split]
+
+ # NOTE(review): "url" defaults to None when missing, in which case the
+ # len(urls) below raises TypeError.
+ urls, storage_paths = info.get("url", None), info.storage
+
+ # Accept a single string as well as a list for both fields.
+ if isinstance(urls, str):
+ urls = [urls]
+ if isinstance(storage_paths, str):
+ storage_paths = [storage_paths]
+
+ assert len(urls) == len(storage_paths)
+
+ for url_or_filename, storage_path in zip(urls, storage_paths):
+ # if storage_path is relative, make it full by prefixing with cache_root.
+ if not os.path.isabs(storage_path):
+ storage_path = os.path.join(cache_root, storage_path)
+
+ dirname = os.path.dirname(storage_path)
+ if not os.path.exists(dirname):
+ os.makedirs(dirname)
+
+ if os.path.isfile(url_or_filename):
+ # The source is a local file: copy it unless the target exists.
+ src, dst = url_or_filename, storage_path
+ if not os.path.exists(dst):
+ shutil.copyfile(src=src, dst=dst)
+ else:
+ logging.info("Using existing file {}.".format(dst))
+ else:
+ if os.path.isdir(storage_path):
+ # a directory is rejected here: storage_path must name a file.
+ raise ValueError(
+ "Expecting storage_path to be a file path, got directory {}".format(
+ storage_path
+ )
+ )
+ else:
+ filename = os.path.basename(storage_path)
+
+ download_url(url=url_or_filename, root=dirname, filename=filename)
+
+ def _download_vis(self):
+ """Only checks that the visual storage path exists; warns when it does not."""
+
+ storage_path = self.config.build_info.get(self.data_type).storage
+ storage_path = utils.get_cache_path(storage_path)
+
+ if not os.path.exists(storage_path):
+ warnings.warn(
+ f"""
+ The specified path {storage_path} for visual inputs does not exist.
+ Please provide a correct path to the visual inputs or
+ refer to datasets/download_scripts/README.md for downloading instructions.
+ """
+ )
+
+ def build(self):
+ """
+ Create by split datasets inheriting torch.utils.data.Datasets.
+
+ # build() can be dataset-specific. Overwrite to customize.
+ """
+ self.build_processors()
+
+ build_info = self.config.build_info
+
+ ann_info = build_info.annotations
+ vis_info = build_info.get(self.data_type)
+
+ datasets = dict()
+ for split in ann_info.keys():
+ # Splits other than train/val/test are ignored.
+ if split not in ["train", "val", "test"]:
+ continue
+
+ is_train = split == "train"
+
+ # processors
+ vis_processor = (
+ self.vis_processors["train"]
+ if is_train
+ else self.vis_processors["eval"]
+ )
+ text_processor = (
+ self.text_processors["train"]
+ if is_train
+ else self.text_processors["eval"]
+ )
+
+ # annotation path
+ ann_paths = ann_info.get(split).storage
+ if isinstance(ann_paths, str):
+ ann_paths = [ann_paths]
+
+ # Relative annotation paths are resolved through utils.get_cache_path.
+ abs_ann_paths = []
+ for ann_path in ann_paths:
+ if not os.path.isabs(ann_path):
+ ann_path = utils.get_cache_path(ann_path)
+ abs_ann_paths.append(ann_path)
+ ann_paths = abs_ann_paths
+
+ # visual data storage path
+ # (the split name is appended to the configured storage directory)
+ vis_path = os.path.join(vis_info.storage, split)
+
+ if not os.path.isabs(vis_path):
+ # vis_path = os.path.join(utils.get_cache_path(), vis_path)
+ vis_path = utils.get_cache_path(vis_path)
+
+ if not os.path.exists(vis_path):
+ warnings.warn("storage path {} does not exist.".format(vis_path))
+
+ # create datasets
+ dataset_cls = self.train_dataset_cls if is_train else self.eval_dataset_cls
+ datasets[split] = dataset_cls(
+ vis_processor=vis_processor,
+ text_processor=text_processor,
+ ann_paths=ann_paths,
+ vis_root=vis_path,
+ )
+
+ return datasets
+
+
+def load_dataset_config(cfg_path):
+ cfg = OmegaConf.load(cfg_path).datasets
+ cfg = cfg[list(cfg.keys())[0]]
+
+ return cfg
diff --git a/minigpt4/datasets/builders/image_text_pair_builder.py b/minigpt4/datasets/builders/image_text_pair_builder.py
new file mode 100644
index 0000000000000000000000000000000000000000..3752be547778b7460c4cf3739723e723f9cb50d3
--- /dev/null
+++ b/minigpt4/datasets/builders/image_text_pair_builder.py
@@ -0,0 +1,86 @@
+"""
+ Copyright (c) 2022, salesforce.com, inc.
+ All rights reserved.
+ SPDX-License-Identifier: BSD-3-Clause
+ For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+"""
+
+import os
+
+from minigpt4.common.registry import registry
+from minigpt4.datasets.builders.base_dataset_builder import BaseDatasetBuilder
+from minigpt4.datasets.datasets.laion_dataset import LaionDataset
+from minigpt4.datasets.datasets.cc_combine_dataset import CCCombineDataset, CCAlignDataset
+
+
+@registry.register_builder("cc_combine")
+class CCCombineBuilder(BaseDatasetBuilder):
+ train_dataset_cls = CCCombineDataset
+
+ DATASET_CONFIG_DICT = {"default": "configs/datasets/cc_combine/defaults.yaml"}
+
+ def _download_ann(self):
+ pass
+
+ def _download_vis(self):
+ pass
+
+ def build(self):
+ self.build_processors()
+
+ build_info = self.config.build_info
+
+ datasets = dict()
+ split = "train"
+
+ # create datasets
+ # [NOTE] return inner_datasets (wds.DataPipeline)
+ dataset_cls = self.train_dataset_cls
+ datasets[split] = dataset_cls(
+ vis_processor=self.vis_processors[split],
+ text_processor=self.text_processors[split],
+ location=build_info.storage,
+ ).inner_dataset
+
+ return datasets
+
+
+@registry.register_builder("laion")
+class LaionBuilder(BaseDatasetBuilder):
+ train_dataset_cls = LaionDataset
+
+ DATASET_CONFIG_DICT = {"default": "configs/datasets/laion/defaults.yaml"}
+
+ def _download_ann(self):
+ pass
+
+ def _download_vis(self):
+ pass
+
+ def build(self):
+ self.build_processors()
+
+ build_info = self.config.build_info
+
+ datasets = dict()
+ split = "train"
+
+ # create datasets
+ # [NOTE] return inner_datasets (wds.DataPipeline)
+ dataset_cls = self.train_dataset_cls
+ datasets[split] = dataset_cls(
+ vis_processor=self.vis_processors[split],
+ text_processor=self.text_processors[split],
+ location=build_info.storage,
+ ).inner_dataset
+
+ return datasets
+
+
+@registry.register_builder("cc_align")
+class CCAlignBuilder(BaseDatasetBuilder):
+ train_dataset_cls = CCAlignDataset
+
+ DATASET_CONFIG_DICT = {
+ "default": "configs/datasets/cc_combine/align.yaml",
+ }
\ No newline at end of file
diff --git a/minigpt4/datasets/data_utils.py b/minigpt4/datasets/data_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..cddc4d68a8fa5a4e39bea0055d131c96ee81e7b7
--- /dev/null
+++ b/minigpt4/datasets/data_utils.py
@@ -0,0 +1,196 @@
+"""
+ Copyright (c) 2022, salesforce.com, inc.
+ All rights reserved.
+ SPDX-License-Identifier: BSD-3-Clause
+ For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+"""
+
+import gzip
+import logging
+import os
+import random as rnd
+import tarfile
+import zipfile
+import random
+from typing import List
+from tqdm import tqdm
+
+import decord
+from decord import VideoReader
+import webdataset as wds
+import numpy as np
+import torch
+from torch.utils.data.dataset import IterableDataset
+
+from minigpt4.common.registry import registry
+from minigpt4.datasets.datasets.base_dataset import ConcatDataset
+
+
+decord.bridge.set_bridge("torch")
+MAX_INT = registry.get("MAX_INT")
+
+
+class ChainDataset(wds.DataPipeline):
+    r"""Randomly interleave samples drawn from several :class:`DataPipeline` s.
+
+    Each step picks one of the underlying streams with probability
+    proportional to its ``sample_ratio`` attribute (1 when the attribute is
+    missing) and yields that stream's next sample. Sampling is done
+    on-the-fly, so no dataset is materialised.
+
+    Iteration ends cleanly as soon as the selected stream is exhausted.
+
+    Args:
+        datasets (iterable of IterableDataset): datasets to be chained together
+    """
+
+    def __init__(self, datasets: List[wds.DataPipeline]) -> None:
+        super().__init__()
+        self.datasets = datasets
+        self.prob = []  # sampling weight per dataset, same order as `datasets`
+        self.names = []  # display name per dataset, 'Unknown' when not provided
+        for dataset in self.datasets:
+            self.names.append(getattr(dataset, "name", "Unknown"))
+            if hasattr(dataset, "sample_ratio"):
+                self.prob.append(dataset.sample_ratio)
+            else:
+                self.prob.append(1)
+                logging.info("One of the datapipeline doesn't define ratio and set to 1 automatically.")
+
+    def __iter__(self):
+        datastreams = [iter(dataset) for dataset in self.datasets]
+        while True:
+            select_datastream = random.choices(datastreams, weights=self.prob, k=1)[0]
+            try:
+                sample = next(select_datastream)
+            except StopIteration:
+                # A StopIteration escaping a generator body is turned into a
+                # RuntimeError (PEP 479); end the iteration explicitly instead.
+                return
+            yield sample
+
+
+def apply_to_sample(f, sample):
+    """Apply ``f`` to every tensor reachable through dicts and lists in ``sample``.
+
+    Dicts and lists are rebuilt with the same keys/order; any other value
+    (including tuples and their contents) is returned untouched. An empty
+    ``sample`` yields ``{}``.
+    """
+    if len(sample) == 0:
+        return {}
+
+    def _walk(node):
+        if torch.is_tensor(node):
+            return f(node)
+        if isinstance(node, dict):
+            return {name: _walk(child) for name, child in node.items()}
+        if isinstance(node, list):
+            return [_walk(child) for child in node]
+        return node
+
+    return _walk(sample)
+
+
+def move_to_cuda(sample):
+    """Return ``sample`` with each tensor replaced by the result of its ``.cuda()`` call."""
+    return apply_to_sample(lambda tensor: tensor.cuda(), sample)
+
+
+def prepare_sample(samples, cuda_enabled=True):
+    """Move ``samples`` to CUDA when ``cuda_enabled`` is truthy; otherwise return them as-is."""
+    # TODO fp16 support
+    return move_to_cuda(samples) if cuda_enabled else samples
+
+
+def reorg_datasets_by_split(datasets):
+    """
+    Regroups a per-dataset mapping of splits into a per-split list of datasets.
+
+    Args:
+        datasets: dict whose values are themselves dicts of {split_name: dataset}.
+            The outer keys are ignored.
+
+    Returns:
+        Dict {split_name: [dataset, ...]}; within each list the datasets keep
+        the iteration order of the outer dict.
+    """
+    by_split = dict()
+
+    for splits in datasets.values():
+        for split_name, split_dataset in splits.items():
+            by_split.setdefault(split_name, []).append(split_dataset)
+
+    return by_split
+
+
+def concat_datasets(datasets):
+    """
+    Collapses the list of datasets of every split into a single object.
+
+    Map-style datasets and WebDataset DataPipelines are supported. A generic
+    IterableDataset is rejected with NotImplementedError.
+
+    Only the "train" split may hold several datasets; every other split must
+    hold exactly one (asserted), because metrics should not be computed on
+    concatenated datasets.
+
+    Args:
+        datasets: dict {split_name: list of datasets}. It is updated in place.
+
+    Returns:
+        The same dict. Non-train splits map to their single dataset. "train"
+        maps to a ConcatDataset of the map-style datasets and/or a
+        DataPipeline (a ChainDataset when there is more than one pipeline):
+        the object itself when only one kind is present, otherwise the tuple
+        (map-style concatenation, pipeline).
+    """
+    for split_name in datasets:
+        if split_name != "train":
+            assert (
+                len(datasets[split_name]) == 1
+            ), "Do not support multiple {} datasets.".format(split_name)
+            datasets[split_name] = datasets[split_name][0]
+            continue
+
+        pipelines, map_style = [], []
+        for dataset in datasets[split_name]:
+            # DataPipeline must be tested before the generic IterableDataset check.
+            if isinstance(dataset, wds.DataPipeline):
+                logging.info(
+                    "Dataset {} is IterableDataset, can't be concatenated.".format(
+                        dataset
+                    )
+                )
+                pipelines.append(dataset)
+            elif isinstance(dataset, IterableDataset):
+                raise NotImplementedError(
+                    "Do not support concatenation of generic IterableDataset."
+                )
+            else:
+                map_style.append(dataset)
+
+        # pipelines and map-style datasets are combined separately
+        if len(pipelines) > 1:
+            chained = ChainDataset(pipelines)
+        elif len(pipelines) == 1:
+            chained = pipelines[0]
+        else:
+            chained = None
+
+        merged = ConcatDataset(map_style) if len(map_style) > 0 else None
+
+        present = tuple(part for part in (merged, chained) if part is not None)
+        datasets[split_name] = present[0] if len(present) == 1 else present
+
+    return datasets
+
diff --git a/minigpt4/datasets/datasets/__init__.py b/minigpt4/datasets/datasets/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/minigpt4/datasets/datasets/base_dataset.py b/minigpt4/datasets/datasets/base_dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..e9e1c6f26f09adc3500ab7253cb555d54daf76ae
--- /dev/null
+++ b/minigpt4/datasets/datasets/base_dataset.py
@@ -0,0 +1,68 @@
+"""
+ Copyright (c) 2022, salesforce.com, inc.
+ All rights reserved.
+ SPDX-License-Identifier: BSD-3-Clause
+ For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+"""
+
+import json
+from typing import Iterable
+
+from torch.utils.data import Dataset, ConcatDataset
+from torch.utils.data.dataloader import default_collate
+
+
+class BaseDataset(Dataset):
+    """Map-style dataset backed by a list of annotation dicts loaded from JSON files."""
+
+    def __init__(
+        self, vis_processor=None, text_processor=None, vis_root=None, ann_paths=[]
+    ):
+        """
+        vis_processor: stored as-is on the instance
+        text_processor: stored as-is on the instance
+        vis_root (string): Root directory of images (e.g. coco/images/)
+        ann_paths (list of string): JSON files; the list under the
+            'annotations' key of each file is appended to self.annotation
+        """
+        self.vis_root = vis_root
+
+        self.annotation = []
+        # ann_paths is only iterated, never mutated, so the shared default list is harmless.
+        for ann_path in ann_paths:
+            # Close each annotation file deterministically instead of leaking the handle.
+            with open(ann_path, "r") as ann_file:
+                self.annotation.extend(json.load(ann_file)['annotations'])
+
+        self.vis_processor = vis_processor
+        self.text_processor = text_processor
+
+        self._add_instance_ids()
+
+    def __len__(self):
+        """Number of loaded annotations."""
+        return len(self.annotation)
+
+    def collater(self, samples):
+        """Batch ``samples`` with torch's ``default_collate``."""
+        return default_collate(samples)
+
+    def set_processors(self, vis_processor, text_processor):
+        """Replace both processors."""
+        self.vis_processor = vis_processor
+        self.text_processor = text_processor
+
+    def _add_instance_ids(self, key="instance_id"):
+        """Write the annotation's list index, as a string, into each annotation under ``key``."""
+        for idx, ann in enumerate(self.annotation):
+            ann[key] = str(idx)
+
+
+class ConcatDataset(ConcatDataset):
+    """torch ``ConcatDataset`` extended with a ``collater`` method."""
+
+    def __init__(self, datasets: Iterable[Dataset]) -> None:
+        super().__init__(datasets)
+
+    def collater(self, samples):
+        """Collate ``samples`` after dropping keys that are not present in all of them.
+
+        The trimmed samples are passed to the ``collater`` of the first
+        underlying dataset.
+        """
+        # TODO For now only supports datasets with same underlying collater implementations
+
+        key_sets = [set(sample.keys()) for sample in samples]
+        shared_keys = set.intersection(*key_sets) if key_sets else set()
+
+        trimmed = [
+            {key: sample[key] for key in sample.keys() if key in shared_keys}
+            for sample in samples
+        ]
+
+        return self.datasets[0].collater(trimmed)
diff --git a/minigpt4/datasets/datasets/caption_datasets.py b/minigpt4/datasets/datasets/caption_datasets.py
new file mode 100644
index 0000000000000000000000000000000000000000..8c8fb2014794a2800e5bea480ba3ecd353512915
--- /dev/null
+++ b/minigpt4/datasets/datasets/caption_datasets.py
@@ -0,0 +1,85 @@
+"""
+ Copyright (c) 2022, salesforce.com, inc.
+ All rights reserved.
+ SPDX-License-Identifier: BSD-3-Clause
+ For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+"""
+
+import os
+from collections import OrderedDict
+
+from minigpt4.datasets.datasets.base_dataset import BaseDataset
+from PIL import Image
+
+
+class __DisplMixin:
+    """Mixin that exposes one item in a display-friendly ordered mapping."""
+
+    def displ_item(self, index):
+        """Return an OrderedDict with the annotation's file name and caption and the sample's image."""
+        sample = self.__getitem__(index)
+        ann = self.annotation[index]
+
+        return OrderedDict(
+            [
+                ("file", ann["image"]),
+                ("caption", ann["caption"]),
+                ("image", sample["image"]),
+            ]
+        )
+
+
+class CaptionDataset(BaseDataset, __DisplMixin):
+    """Image-caption dataset whose image file names are derived from ``image_id``."""
+
+    def __init__(self, vis_processor, text_processor, vis_root, ann_paths):
+        """
+        vis_root (string): Root directory of images (e.g. coco/images/)
+        ann_paths: annotation files, forwarded to BaseDataset
+        """
+        super().__init__(vis_processor, text_processor, vis_root, ann_paths)
+
+        # Give every distinct image_id a dense integer index, in order of first appearance.
+        self.img_ids = {}
+        for ann in self.annotation:
+            self.img_ids.setdefault(ann["image_id"], len(self.img_ids))
+
+    def __getitem__(self, index):
+        """Load, convert to RGB and process the image; process the caption; return both with the dense image index."""
+        # TODO this assumes image input, not general enough
+        ann = self.annotation[index]
+
+        # image_id is right-aligned in a 12-character, zero-filled file name
+        file_name = '{:0>12}.jpg'.format(ann["image_id"])
+        rgb = Image.open(os.path.join(self.vis_root, file_name)).convert("RGB")
+
+        return {
+            "image": self.vis_processor(rgb),
+            "text_input": self.text_processor(ann["caption"]),
+            "image_id": self.img_ids[ann["image_id"]],
+        }
+
+
+class CaptionEvalDataset(BaseDataset, __DisplMixin):
+    """Evaluation dataset returning processed images with their ids (no caption)."""
+
+    def __init__(self, vis_processor, text_processor, vis_root, ann_paths):
+        """
+        vis_root (string): Root directory of images (e.g. coco/images/)
+        ann_paths: annotation files, forwarded to BaseDataset
+        """
+        super().__init__(vis_processor, text_processor, vis_root, ann_paths)
+
+    def __getitem__(self, index):
+        """Return the processed RGB image plus the annotation's ``image_id`` and ``instance_id``."""
+        ann = self.annotation[index]
+
+        # here the file name comes straight from the annotation's "image" field
+        rgb = Image.open(os.path.join(self.vis_root, ann["image"])).convert("RGB")
+
+        return {
+            "image": self.vis_processor(rgb),
+            "image_id": ann["image_id"],
+            "instance_id": ann["instance_id"],
+        }
diff --git a/minigpt4/datasets/datasets/cc_combine_dataset.py b/minigpt4/datasets/datasets/cc_combine_dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..def863d405a4bbe34b8a46c7d9a3220efec2aaf6
--- /dev/null
+++ b/minigpt4/datasets/datasets/cc_combine_dataset.py
@@ -0,0 +1,53 @@
+"""
+ Copyright (c) 2022, salesforce.com, inc.
+ All rights reserved.
+ SPDX-License-Identifier: BSD-3-Clause
+ For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+"""
+import os
+from PIL import Image
+import webdataset as wds
+from minigpt4.datasets.datasets.base_dataset import BaseDataset
+from minigpt4.datasets.datasets.caption_datasets import CaptionDataset
+
+
+class CCCombineDataset(BaseDataset):
+    """Wrapper that assembles a WebDataset pipeline and exposes it as ``inner_dataset``."""
+
+    def __init__(self, vis_processor, text_processor, location):
+        """
+        vis_processor: applied to the first element of each ("jpg", "json") tuple
+        text_processor: applied to the "caption" entry of the second element
+        location: passed to ``wds.ResampledShards``
+        """
+        super().__init__(vis_processor=vis_processor, text_processor=text_processor)
+
+        # every stage that accepts a handler uses the same one
+        on_error = wds.warn_and_continue
+        stages = [
+            wds.ResampledShards(location),
+            wds.tarfile_to_samples(handler=on_error),
+            wds.shuffle(1000, handler=on_error),
+            wds.decode("pilrgb", handler=on_error),
+            wds.to_tuple("jpg", "json", handler=on_error),
+            wds.map_tuple(self.vis_processor, handler=on_error),
+            wds.map(self.to_dict, handler=on_error),
+        ]
+        self.inner_dataset = wds.DataPipeline(*stages)
+
+    def to_dict(self, sample):
+        """Turn an (image, metadata) tuple into ``{"image", "text_input"}``."""
+        image, meta = sample[0], sample[1]
+        return {
+            "image": image,
+            "text_input": self.text_processor(meta["caption"]),
+        }
+
+
+class CCAlignDataset(CaptionDataset):
+    """CaptionDataset variant with un-padded file names and unprocessed captions."""
+
+    def __getitem__(self, index):
+        """Return the processed RGB image, the caption exactly as stored, and the dense image index."""
+        # TODO this assumes image input, not general enough
+        ann = self.annotation[index]
+
+        # file name is the image_id itself, with no zero padding
+        file_name = '{}.jpg'.format(ann["image_id"])
+        rgb = Image.open(os.path.join(self.vis_root, file_name)).convert("RGB")
+
+        return {
+            "image": self.vis_processor(rgb),
+            # the caption is not run through text_processor here
+            "text_input": ann["caption"],
+            "image_id": self.img_ids[ann["image_id"]],
+        }
\ No newline at end of file
diff --git a/minigpt4/datasets/datasets/dataloader_utils.py b/minigpt4/datasets/datasets/dataloader_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..3459972e5bda3e4a40788acf97ebe3c114fe7c3e
--- /dev/null
+++ b/minigpt4/datasets/datasets/dataloader_utils.py
@@ -0,0 +1,162 @@
+"""
+ Copyright (c) 2022, salesforce.com, inc.
+ All rights reserved.
+ SPDX-License-Identifier: BSD-3-Clause
+ For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+"""
+
+import time
+import random
+import torch
+from minigpt4.datasets.data_utils import move_to_cuda
+from torch.utils.data import DataLoader
+
+
+class MultiIterLoader:
+    """
+    A simple wrapper for iterating over multiple iterators.
+
+    Each call to ``next`` picks one loader at random, weighted by ``ratios``,
+    and returns that loader's next item. The object is its own iterator, so
+    it can be used directly in a ``for`` loop.
+
+    Args:
+        loaders (List[Loader]): List of Iterator loaders.
+        ratios (List[float]): List of ratios to sample from each loader. If None, all loaders are sampled uniformly.
+    """
+
+    def __init__(self, loaders, ratios=None):
+        # assert all loaders has __next__ method
+        for loader in loaders:
+            assert hasattr(
+                loader, "__next__"
+            ), "Loader {} has no __next__ method.".format(loader)
+
+        if ratios is None:
+            ratios = [1.0] * len(loaders)
+        else:
+            assert len(ratios) == len(loaders)
+            # normalise so the stored ratios sum to 1; the total is computed once
+            total = sum(ratios)
+            ratios = [float(ratio) / total for ratio in ratios]
+
+        self.loaders = loaders
+        self.ratios = ratios
+
+    def __iter__(self):
+        # Completes the iterator protocol alongside __next__.
+        return self
+
+    def __next__(self):
+        # random sample from each loader by ratio
+        loader_idx = random.choices(range(len(self.loaders)), self.ratios, k=1)[0]
+        return next(self.loaders[loader_idx])
+
+
+class PrefetchLoader(object):
+ """
+ Modified from https://github.com/ChenRocks/UNITER.
+
+ overlap compute and cuda data transfer
+ (copied and then modified from nvidia apex)
+
+ Wraps ``loader``: while one batch is being consumed, the next one is
+ fetched and moved to CUDA inside a dedicated ``torch.cuda.Stream``.
+ """
+
+ def __init__(self, loader):
+ # loader: the wrapped iterable; stream: the CUDA stream used by preload()
+ self.loader = loader
+ self.stream = torch.cuda.Stream()
+
+ def __iter__(self):
+ # Prime the pipeline with one batch, then keep yielding until next()
+ # returns None (set by preload() when the wrapped iterator is exhausted).
+ loader_it = iter(self.loader)
+ self.preload(loader_it)
+ batch = self.next(loader_it)
+ while batch is not None:
+ # A tuple batch is unpacked as (task, batch) and yielded as the same
+ # pair; unpacking requires exactly two elements.
+ is_tuple = isinstance(batch, tuple)
+ if is_tuple:
+ task, batch = batch
+
+ if is_tuple:
+ yield task, batch
+ else:
+ yield batch
+ batch = self.next(loader_it)
+
+ def __len__(self):
+ # Length of the wrapped loader.
+ return len(self.loader)
+
+ def preload(self, it):
+ # Fetch the next batch from `it` into self.batch and move it to CUDA
+ # under self.stream; self.batch becomes None once `it` is exhausted.
+ try:
+ self.batch = next(it)
+ except StopIteration:
+ self.batch = None
+ return
+ # if record_stream() doesn't work, another option is to make sure
+ # device inputs are created on the main stream.
+ # self.next_input_gpu = torch.empty_like(self.next_input,
+ # device='cuda')
+ # self.next_target_gpu = torch.empty_like(self.next_target,
+ # device='cuda')
+ # Need to make sure the memory allocated for next_* is not still in use
+ # by the main stream at the time we start copying to next_*:
+ # self.stream.wait_stream(torch.cuda.current_stream())
+ with torch.cuda.stream(self.stream):
+ self.batch = move_to_cuda(self.batch)
+ # more code for the alternative if record_stream() doesn't work:
+ # copy_ will record the use of the pinned source tensor in this
+ # side stream.
+ # self.next_input_gpu.copy_(self.next_input, non_blocking=True)
+ # self.next_target_gpu.copy_(self.next_target, non_blocking=True)
+ # self.next_input = self.next_input_gpu
+ # self.next_target = self.next_target_gpu
+
+ def next(self, it):
+ # Make the current stream wait for the side stream, hand out the batch
+ # that was preloaded, and immediately start preloading the following one.
+ torch.cuda.current_stream().wait_stream(self.stream)
+ batch = self.batch
+ if batch is not None:
+ # record the current stream on every tensor contained in the batch
+ record_cuda_stream(batch)
+ self.preload(it)
+ return batch
+
+ def __getattr__(self, name):
+ # Attributes not found on the wrapper are looked up on the wrapped loader.
+ # NOTE(review): if `loader` itself is not set yet (e.g. on an instance
+ # created without __init__), `self.loader` re-enters __getattr__ - confirm
+ # this cannot happen in practice.
+ method = self.loader.__getattribute__(name)
+ return method
+
+
+def record_cuda_stream(batch):
+    """Call ``record_stream`` with the current CUDA stream on every tensor in ``batch``.
+
+    Lists, tuples and dict values are traversed recursively; anything else
+    is ignored.
+    """
+    if isinstance(batch, torch.Tensor):
+        batch.record_stream(torch.cuda.current_stream())
+        return
+
+    if isinstance(batch, (list, tuple)):
+        children = batch
+    elif isinstance(batch, dict):
+        children = batch.values()
+    else:
+        return
+
+    for child in children:
+        record_cuda_stream(child)
+
+
+class IterLoader:
+    """
+    Wraps a DataLoader so that ``next`` keeps returning batches: when the
+    underlying iterator is exhausted a new one is created and the epoch
+    counter is incremented.
+
+    Modified from:
+        https://github.com/open-mmlab/mmcv/blob/master/mmcv/runner/iter_based_runner.py
+    """
+
+    def __init__(self, dataloader: DataLoader, use_distributed: bool = False):
+        self._dataloader = dataloader
+        self.iter_loader = iter(self._dataloader)
+        self._use_distributed = use_distributed
+        self._epoch = 0
+
+    @property
+    def epoch(self) -> int:
+        """Number of times the underlying loader has been restarted."""
+        return self._epoch
+
+    def __iter__(self):
+        return self
+
+    def __len__(self):
+        return len(self._dataloader)
+
+    def __next__(self):
+        try:
+            return next(self.iter_loader)
+        except StopIteration:
+            # Start the next epoch: bump the counter, inform the sampler when
+            # running distributed, then restart the loader.
+            self._epoch += 1
+            if hasattr(self._dataloader.sampler, "set_epoch") and self._use_distributed:
+                self._dataloader.sampler.set_epoch(self._epoch)
+            time.sleep(2)  # Prevent possible deadlock during epoch transition
+            self.iter_loader = iter(self._dataloader)
+            return next(self.iter_loader)
diff --git a/minigpt4/datasets/datasets/laion_dataset.py b/minigpt4/datasets/datasets/laion_dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..78568fc7df9cc8213899e564babc00658c8575ac
--- /dev/null
+++ b/minigpt4/datasets/datasets/laion_dataset.py
@@ -0,0 +1,31 @@
+"""
+ Copyright (c) 2022, salesforce.com, inc.
+ All rights reserved.
+ SPDX-License-Identifier: BSD-3-Clause
+ For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+"""
+
+import webdataset as wds
+from minigpt4.datasets.datasets.base_dataset import BaseDataset
+
+
+class LaionDataset(BaseDataset):
+    """Wrapper that assembles a WebDataset pipeline and exposes it as ``inner_dataset``."""
+
+    def __init__(self, vis_processor, text_processor, location):
+        """
+        vis_processor: applied to the first element of each ("jpg", "json") tuple
+        text_processor: applied to the "caption" entry of the second element
+        location: passed to ``wds.ResampledShards``
+        """
+        super().__init__(vis_processor=vis_processor, text_processor=text_processor)
+
+        # every stage that accepts a handler uses the same one
+        on_error = wds.warn_and_continue
+        stages = [
+            wds.ResampledShards(location),
+            wds.tarfile_to_samples(handler=on_error),
+            wds.shuffle(1000, handler=on_error),
+            wds.decode("pilrgb", handler=on_error),
+            wds.to_tuple("jpg", "json", handler=on_error),
+            wds.map_tuple(self.vis_processor, handler=on_error),
+            wds.map(self.to_dict, handler=on_error),
+        ]
+        self.inner_dataset = wds.DataPipeline(*stages)
+
+    def to_dict(self, sample):
+        """Turn an (image, metadata) tuple into ``{"image", "text_input"}``."""
+        image, meta = sample[0], sample[1]
+        return {
+            "image": image,
+            "text_input": self.text_processor(meta["caption"]),
+        }
+
diff --git a/minigpt4/models/Qformer.py b/minigpt4/models/Qformer.py
new file mode 100644
index 0000000000000000000000000000000000000000..e71b12375e10511858a9c505dc795181e6ce5603
--- /dev/null
+++ b/minigpt4/models/Qformer.py
@@ -0,0 +1,1216 @@
+"""
+ * Copyright (c) 2023, salesforce.com, inc.
+ * All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ * For full license text, see LICENSE.txt file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+ * By Junnan Li
+ * Based on huggingface code base
+ * https://github.com/huggingface/transformers/blob/v4.15.0/src/transformers/models/bert
+"""
+
+import math
+import os
+import warnings
+from dataclasses import dataclass
+from typing import Optional, Tuple, Dict, Any
+
+import torch
+from torch import Tensor, device, dtype, nn
+import torch.utils.checkpoint
+from torch import nn
+from torch.nn import CrossEntropyLoss
+import torch.nn.functional as F
+
+from transformers.activations import ACT2FN
+from transformers.file_utils import (
+ ModelOutput,
+)
+from transformers.modeling_outputs import (
+ BaseModelOutputWithPastAndCrossAttentions,
+ BaseModelOutputWithPoolingAndCrossAttentions,
+ CausalLMOutputWithCrossAttentions,
+ MaskedLMOutput,
+ MultipleChoiceModelOutput,
+ NextSentencePredictorOutput,
+ QuestionAnsweringModelOutput,
+ SequenceClassifierOutput,
+ TokenClassifierOutput,
+)
+from transformers.modeling_utils import (
+ PreTrainedModel,
+ apply_chunking_to_forward,
+ find_pruneable_heads_and_indices,
+ prune_linear_layer,
+)
+from transformers.utils import logging
+from transformers.models.bert.configuration_bert import BertConfig
+
+logger = logging.get_logger(__name__)
+
+
+class BertEmbeddings(nn.Module):
+    """Construct the embeddings from word and position embeddings, optionally prefixed by query embeddings."""
+
+    def __init__(self, config):
+        super().__init__()
+        hidden = config.hidden_size
+        max_positions = config.max_position_embeddings
+
+        self.word_embeddings = nn.Embedding(
+            config.vocab_size, hidden, padding_idx=config.pad_token_id
+        )
+        self.position_embeddings = nn.Embedding(max_positions, hidden)
+
+        # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load
+        # any TensorFlow checkpoint file
+        self.LayerNorm = nn.LayerNorm(hidden, eps=config.layer_norm_eps)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+
+        # position_ids (1, len position emb) is contiguous in memory and exported when serialized
+        self.register_buffer(
+            "position_ids", torch.arange(max_positions).expand((1, -1))
+        )
+        self.position_embedding_type = getattr(
+            config, "position_embedding_type", "absolute"
+        )
+
+        self.config = config
+
+    def forward(
+        self,
+        input_ids=None,
+        position_ids=None,
+        query_embeds=None,
+        past_key_values_length=0,
+    ):
+        """Embed ``input_ids`` and/or ``query_embeds``, then apply LayerNorm and dropout.
+
+        With ``input_ids``: word embeddings, plus position embeddings when the
+        position embedding type is "absolute"; ``query_embeds``, if given, is
+        concatenated in front along dim 1. Without ``input_ids`` the result is
+        built from ``query_embeds`` alone.
+        """
+        seq_length = 0 if input_ids is None else input_ids.size()[1]
+
+        if position_ids is None:
+            # default positions continue after the cached prefix
+            start = past_key_values_length
+            position_ids = self.position_ids[:, start : seq_length + start].clone()
+
+        if input_ids is None:
+            embeddings = query_embeds
+        else:
+            embeddings = self.word_embeddings(input_ids)
+            if self.position_embedding_type == "absolute":
+                embeddings = embeddings + self.position_embeddings(position_ids)
+            if query_embeds is not None:
+                embeddings = torch.cat((query_embeds, embeddings), dim=1)
+
+        return self.dropout(self.LayerNorm(embeddings))
+
+
+class BertSelfAttention(nn.Module):
+ # Multi-head scaled dot-product attention used both as self-attention and,
+ # when `is_cross_attention` is set, as cross-attention whose key/value
+ # projections take inputs of width `config.encoder_width`.
+ def __init__(self, config, is_cross_attention):
+ super().__init__()
+ self.config = config
+ # hidden_size must split evenly across heads unless the config carries
+ # an `embedding_size` attribute
+ if config.hidden_size % config.num_attention_heads != 0 and not hasattr(
+ config, "embedding_size"
+ ):
+ raise ValueError(
+ "The hidden size (%d) is not a multiple of the number of attention "
+ "heads (%d)" % (config.hidden_size, config.num_attention_heads)
+ )
+
+ self.num_attention_heads = config.num_attention_heads
+ self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
+ self.all_head_size = self.num_attention_heads * self.attention_head_size
+
+ self.query = nn.Linear(config.hidden_size, self.all_head_size)
+ if is_cross_attention:
+ self.key = nn.Linear(config.encoder_width, self.all_head_size)
+ self.value = nn.Linear(config.encoder_width, self.all_head_size)
+ else:
+ self.key = nn.Linear(config.hidden_size, self.all_head_size)
+ self.value = nn.Linear(config.hidden_size, self.all_head_size)
+
+ self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
+ self.position_embedding_type = getattr(
+ config, "position_embedding_type", "absolute"
+ )
+ # relative position variants get one embedding per signed distance
+ # in [-(max_position_embeddings - 1), max_position_embeddings - 1]
+ if (
+ self.position_embedding_type == "relative_key"
+ or self.position_embedding_type == "relative_key_query"
+ ):
+ self.max_position_embeddings = config.max_position_embeddings
+ self.distance_embedding = nn.Embedding(
+ 2 * config.max_position_embeddings - 1, self.attention_head_size
+ )
+ # when set to True, forward() stores the cross-attention map and registers
+ # a hook that stores its gradients
+ self.save_attention = False
+
+ # The four accessors below only store/return attributes; the getters raise
+ # AttributeError if nothing has been saved yet.
+ def save_attn_gradients(self, attn_gradients):
+ self.attn_gradients = attn_gradients
+
+ def get_attn_gradients(self):
+ return self.attn_gradients
+
+ def save_attention_map(self, attention_map):
+ self.attention_map = attention_map
+
+ def get_attention_map(self):
+ return self.attention_map
+
+ def transpose_for_scores(self, x):
+ # Split the last dimension into (num_attention_heads, attention_head_size)
+ # and swap axes 1 and 2 of the resulting 4-D tensor.
+ new_x_shape = x.size()[:-1] + (
+ self.num_attention_heads,
+ self.attention_head_size,
+ )
+ x = x.view(*new_x_shape)
+ return x.permute(0, 2, 1, 3)
+
+ def forward(
+ self,
+ hidden_states,
+ attention_mask=None,
+ head_mask=None,
+ encoder_hidden_states=None,
+ encoder_attention_mask=None,
+ past_key_value=None,
+ output_attentions=False,
+ ):
+ # Returns a tuple: (context_layer, attention_probs, past_key_value) when
+ # `output_attentions` is truthy, otherwise (context_layer, past_key_value).
+ # The returned past_key_value is the (key_layer, value_layer) pair used
+ # in this call.
+
+ # If this is instantiated as a cross-attention module, the keys
+ # and values come from an encoder; the attention mask needs to be
+ # such that the encoder's padding tokens are not attended to.
+ is_cross_attention = encoder_hidden_states is not None
+
+ if is_cross_attention:
+ key_layer = self.transpose_for_scores(self.key(encoder_hidden_states))
+ value_layer = self.transpose_for_scores(self.value(encoder_hidden_states))
+ attention_mask = encoder_attention_mask
+ elif past_key_value is not None:
+ # prepend the cached keys/values along dim 2
+ key_layer = self.transpose_for_scores(self.key(hidden_states))
+ value_layer = self.transpose_for_scores(self.value(hidden_states))
+ key_layer = torch.cat([past_key_value[0], key_layer], dim=2)
+ value_layer = torch.cat([past_key_value[1], value_layer], dim=2)
+ else:
+ key_layer = self.transpose_for_scores(self.key(hidden_states))
+ value_layer = self.transpose_for_scores(self.value(hidden_states))
+
+ mixed_query_layer = self.query(hidden_states)
+
+ query_layer = self.transpose_for_scores(mixed_query_layer)
+
+ past_key_value = (key_layer, value_layer)
+
+ # Take the dot product between "query" and "key" to get the raw attention scores.
+ attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
+
+ if (
+ self.position_embedding_type == "relative_key"
+ or self.position_embedding_type == "relative_key_query"
+ ):
+ # NOTE(review): distances are built from hidden_states' length for both
+ # axes - confirm this is intended when keys have a different length
+ # (cross-attention or cached keys).
+ seq_length = hidden_states.size()[1]
+ position_ids_l = torch.arange(
+ seq_length, dtype=torch.long, device=hidden_states.device
+ ).view(-1, 1)
+ position_ids_r = torch.arange(
+ seq_length, dtype=torch.long, device=hidden_states.device
+ ).view(1, -1)
+ distance = position_ids_l - position_ids_r
+ # shift distances so the smallest one maps to embedding index 0
+ positional_embedding = self.distance_embedding(
+ distance + self.max_position_embeddings - 1
+ )
+ positional_embedding = positional_embedding.to(
+ dtype=query_layer.dtype
+ ) # fp16 compatibility
+
+ if self.position_embedding_type == "relative_key":
+ relative_position_scores = torch.einsum(
+ "bhld,lrd->bhlr", query_layer, positional_embedding
+ )
+ attention_scores = attention_scores + relative_position_scores
+ elif self.position_embedding_type == "relative_key_query":
+ relative_position_scores_query = torch.einsum(
+ "bhld,lrd->bhlr", query_layer, positional_embedding
+ )
+ relative_position_scores_key = torch.einsum(
+ "bhrd,lrd->bhlr", key_layer, positional_embedding
+ )
+ attention_scores = (
+ attention_scores
+ + relative_position_scores_query
+ + relative_position_scores_key
+ )
+
+ attention_scores = attention_scores / math.sqrt(self.attention_head_size)
+ if attention_mask is not None:
+ # Apply the attention mask is (precomputed for all layers in BertModel forward() function)
+ attention_scores = attention_scores + attention_mask
+
+ # Normalize the attention scores to probabilities.
+ attention_probs = nn.Softmax(dim=-1)(attention_scores)
+
+ if is_cross_attention and self.save_attention:
+ self.save_attention_map(attention_probs)
+ attention_probs.register_hook(self.save_attn_gradients)
+
+ # This is actually dropping out entire tokens to attend to, which might
+ # seem a bit unusual, but is taken from the original Transformer paper.
+ attention_probs_dropped = self.dropout(attention_probs)
+
+ # Mask heads if we want to
+ if head_mask is not None:
+ attention_probs_dropped = attention_probs_dropped * head_mask
+
+ context_layer = torch.matmul(attention_probs_dropped, value_layer)
+
+ # merge the head axis back into the last dimension
+ context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
+ new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
+ context_layer = context_layer.view(*new_context_layer_shape)
+
+ # the probabilities returned here are the pre-dropout, pre-head-mask ones
+ outputs = (
+ (context_layer, attention_probs) if output_attentions else (context_layer,)
+ )
+
+ outputs = outputs + (past_key_value,)
+ return outputs
+
+
+class BertSelfOutput(nn.Module):
+    """Linear projection, dropout, then LayerNorm over the sum with the residual input."""
+
+    def __init__(self, config):
+        super().__init__()
+        width = config.hidden_size
+        self.dense = nn.Linear(width, width)
+        self.LayerNorm = nn.LayerNorm(width, eps=config.layer_norm_eps)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+
+    def forward(self, hidden_states, input_tensor):
+        """Return ``LayerNorm(dropout(dense(hidden_states)) + input_tensor)``."""
+        projected = self.dropout(self.dense(hidden_states))
+        return self.LayerNorm(projected + input_tensor)
+
+
+class BertAttention(nn.Module):
+    """Attention sub-layer: a ``BertSelfAttention`` followed by a ``BertSelfOutput``."""
+
+    def __init__(self, config, is_cross_attention=False):
+        super().__init__()
+        self.self = BertSelfAttention(config, is_cross_attention)
+        self.output = BertSelfOutput(config)
+        self.pruned_heads = set()
+
+    def prune_heads(self, heads):
+        """Remove the given attention heads from the projections; a no-op for an empty ``heads``."""
+        if len(heads) == 0:
+            return
+
+        attn = self.self
+        heads, index = find_pruneable_heads_and_indices(
+            heads,
+            attn.num_attention_heads,
+            attn.attention_head_size,
+            self.pruned_heads,
+        )
+
+        # Prune linear layers
+        for proj_name in ("query", "key", "value"):
+            setattr(attn, proj_name, prune_linear_layer(getattr(attn, proj_name), index))
+        self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)
+
+        # Update hyper params and store pruned heads
+        attn.num_attention_heads = attn.num_attention_heads - len(heads)
+        attn.all_head_size = attn.attention_head_size * attn.num_attention_heads
+        self.pruned_heads = self.pruned_heads.union(heads)
+
+    def forward(
+        self,
+        hidden_states,
+        attention_mask=None,
+        head_mask=None,
+        encoder_hidden_states=None,
+        encoder_attention_mask=None,
+        past_key_value=None,
+        output_attentions=False,
+    ):
+        """Run attention, then the output projection with ``hidden_states`` as residual.
+
+        Returns the projected output followed by every extra element the
+        attention module returned after its first one.
+        """
+        self_outputs = self.self(
+            hidden_states,
+            attention_mask,
+            head_mask,
+            encoder_hidden_states,
+            encoder_attention_mask,
+            past_key_value,
+            output_attentions,
+        )
+        attention_output = self.output(self_outputs[0], hidden_states)
+
+        # add attentions if we output them
+        return (attention_output,) + self_outputs[1:]
+
+
+class BertIntermediate(nn.Module):
+    """Linear expansion to ``intermediate_size`` followed by the configured activation."""
+
+    def __init__(self, config):
+        super().__init__()
+        self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
+        # hidden_act is either a key into ACT2FN or the activation callable itself
+        activation = config.hidden_act
+        self.intermediate_act_fn = (
+            ACT2FN[activation] if isinstance(activation, str) else activation
+        )
+
+    def forward(self, hidden_states):
+        """Return ``activation(dense(hidden_states))``."""
+        return self.intermediate_act_fn(self.dense(hidden_states))
+
+
+class BertOutput(nn.Module):
+    """Projection from ``intermediate_size`` back to ``hidden_size``, dropout, then residual LayerNorm."""
+
+    def __init__(self, config):
+        super().__init__()
+        width = config.hidden_size
+        self.dense = nn.Linear(config.intermediate_size, width)
+        self.LayerNorm = nn.LayerNorm(width, eps=config.layer_norm_eps)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+
+    def forward(self, hidden_states, input_tensor):
+        """Return ``LayerNorm(dropout(dense(hidden_states)) + input_tensor)``."""
+        projected = self.dropout(self.dense(hidden_states))
+        return self.LayerNorm(projected + input_tensor)
+
+
+class BertLayer(nn.Module):
+ def __init__(self, config, layer_num):
+ # Builds one layer: self-attention, an optional cross-attention block, and
+ # two separate feed-forward stacks (`intermediate`/`output` and
+ # `intermediate_query`/`output_query`).
+ super().__init__()
+ self.config = config
+ self.chunk_size_feed_forward = config.chunk_size_feed_forward
+ # dimension along which apply_chunking_to_forward splits its input
+ self.seq_len_dim = 1
+ self.attention = BertAttention(config)
+ self.layer_num = layer_num
+ # cross-attention exists only when enabled in the config and only on
+ # layers whose index is a multiple of `cross_attention_freq`
+ if (
+ self.config.add_cross_attention
+ and layer_num % self.config.cross_attention_freq == 0
+ ):
+ self.crossattention = BertAttention(
+ config, is_cross_attention=self.config.add_cross_attention
+ )
+ self.has_cross_attention = True
+ else:
+ self.has_cross_attention = False
+ self.intermediate = BertIntermediate(config)
+ self.output = BertOutput(config)
+
+ # second feed-forward stack, used by feed_forward_chunk_query
+ self.intermediate_query = BertIntermediate(config)
+ self.output_query = BertOutput(config)
+
+ def forward(
+ self,
+ hidden_states,
+ attention_mask=None,
+ head_mask=None,
+ encoder_hidden_states=None,
+ encoder_attention_mask=None,
+ past_key_value=None,
+ output_attentions=False,
+ query_length=0,
+ ):
+ # Returns a tuple: (layer_output, *attention outputs, present_key_value).
+ # When query_length > 0 the first `query_length` positions along dim 1 go
+ # through (optional) cross-attention and the query feed-forward stack;
+ # any remaining positions go through the regular feed-forward stack.
+
+ # decoder uni-directional self-attention cached key/values tuple is at positions 1,2
+ # NOTE(review): the slice below takes the first two entries (indices 0 and 1).
+ self_attn_past_key_value = (
+ past_key_value[:2] if past_key_value is not None else None
+ )
+ self_attention_outputs = self.attention(
+ hidden_states,
+ attention_mask,
+ head_mask,
+ output_attentions=output_attentions,
+ past_key_value=self_attn_past_key_value,
+ )
+ attention_output = self_attention_outputs[0]
+ # everything between the first and last element (empty when no attention
+ # weights are returned)
+ outputs = self_attention_outputs[1:-1]
+
+ present_key_value = self_attention_outputs[-1]
+
+ if query_length > 0:
+ query_attention_output = attention_output[:, :query_length, :]
+
+ # cross-attention is only applied on this query_length > 0 path
+ if self.has_cross_attention:
+ assert (
+ encoder_hidden_states is not None
+ ), "encoder_hidden_states must be given for cross-attention layers"
+ cross_attention_outputs = self.crossattention(
+ query_attention_output,
+ attention_mask,
+ head_mask,
+ encoder_hidden_states,
+ encoder_attention_mask,
+ output_attentions=output_attentions,
+ )
+ query_attention_output = cross_attention_outputs[0]
+ outputs = (
+ outputs + cross_attention_outputs[1:-1]
+ ) # add cross attentions if we output attention weights
+
+ layer_output = apply_chunking_to_forward(
+ self.feed_forward_chunk_query,
+ self.chunk_size_feed_forward,
+ self.seq_len_dim,
+ query_attention_output,
+ )
+ # positions beyond the query prefix use the regular feed-forward stack
+ # and are concatenated back after the query part
+ if attention_output.shape[1] > query_length:
+ layer_output_text = apply_chunking_to_forward(
+ self.feed_forward_chunk,
+ self.chunk_size_feed_forward,
+ self.seq_len_dim,
+ attention_output[:, query_length:, :],
+ )
+ layer_output = torch.cat([layer_output, layer_output_text], dim=1)
+ else:
+ # no query prefix: the whole sequence uses the regular feed-forward stack
+ layer_output = apply_chunking_to_forward(
+ self.feed_forward_chunk,
+ self.chunk_size_feed_forward,
+ self.seq_len_dim,
+ attention_output,
+ )
+ outputs = (layer_output,) + outputs
+
+ outputs = outputs + (present_key_value,)
+
+ return outputs
+
+ def feed_forward_chunk(self, attention_output):
+ intermediate_output = self.intermediate(attention_output)
+ layer_output = self.output(intermediate_output, attention_output)
+ return layer_output
+
+ def feed_forward_chunk_query(self, attention_output):
+ intermediate_output = self.intermediate_query(attention_output)
+ layer_output = self.output_query(intermediate_output, attention_output)
+ return layer_output
+
+
+class BertEncoder(nn.Module):
+    """Stack of ``config.num_hidden_layers`` :class:`BertLayer` modules applied in sequence."""
+
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+        self.layer = nn.ModuleList(
+            [BertLayer(config, i) for i in range(config.num_hidden_layers)]
+        )
+
+    def forward(
+        self,
+        hidden_states,
+        attention_mask=None,
+        head_mask=None,
+        encoder_hidden_states=None,
+        encoder_attention_mask=None,
+        past_key_values=None,
+        use_cache=None,
+        output_attentions=False,
+        output_hidden_states=False,
+        return_dict=True,
+        query_length=0,
+    ):
+        """
+        Run ``hidden_states`` through every layer, optionally collecting per-layer outputs.
+
+        Args:
+            hidden_states: input to the first layer.
+            attention_mask: forwarded unchanged to every layer.
+            head_mask: indexable per layer (``head_mask[i]``) or ``None``.
+            encoder_hidden_states: forwarded to every layer for cross-attention.
+            encoder_attention_mask: forwarded to every layer for cross-attention.
+            past_key_values: indexable per layer (``past_key_values[i]``) or ``None``.
+            use_cache: when truthy, the last element of each layer's output is collected.
+            output_attentions: when truthy, self-attention outputs are collected from every layer
+                and cross-attention outputs from the layers that actually ran cross-attention.
+            output_hidden_states: when truthy, the input of every layer and the final output are collected.
+            return_dict: return a ``BaseModelOutputWithPastAndCrossAttentions`` instead of a tuple.
+            query_length: forwarded to every layer.
+
+        Returns:
+            A ``BaseModelOutputWithPastAndCrossAttentions``, or (``return_dict=False``) a tuple of the
+            non-``None`` values among last hidden state, cache, hidden states, self-attentions and
+            cross-attentions, in that order.
+        """
+        all_hidden_states = () if output_hidden_states else None
+        all_self_attentions = () if output_attentions else None
+        all_cross_attentions = (
+            () if output_attentions and self.config.add_cross_attention else None
+        )
+
+        next_decoder_cache = () if use_cache else None
+
+        for i in range(self.config.num_hidden_layers):
+            layer_module = self.layer[i]
+            if output_hidden_states:
+                all_hidden_states = all_hidden_states + (hidden_states,)
+
+            layer_head_mask = head_mask[i] if head_mask is not None else None
+            past_key_value = past_key_values[i] if past_key_values is not None else None
+
+            if getattr(self.config, "gradient_checkpointing", False) and self.training:
+
+                if use_cache:
+                    # `Logger.warn` is a deprecated alias of `Logger.warning`.
+                    logger.warning(
+                        "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
+                    )
+                    use_cache = False
+
+                def create_custom_forward(module):
+                    # checkpoint() only forwards positional tensors, so the remaining
+                    # arguments are supplied through the closure.
+                    def custom_forward(*inputs):
+                        return module(
+                            *inputs, past_key_value, output_attentions, query_length
+                        )
+
+                    return custom_forward
+
+                layer_outputs = torch.utils.checkpoint.checkpoint(
+                    create_custom_forward(layer_module),
+                    hidden_states,
+                    attention_mask,
+                    layer_head_mask,
+                    encoder_hidden_states,
+                    encoder_attention_mask,
+                )
+            else:
+                layer_outputs = layer_module(
+                    hidden_states,
+                    attention_mask,
+                    layer_head_mask,
+                    encoder_hidden_states,
+                    encoder_attention_mask,
+                    past_key_value,
+                    output_attentions,
+                    query_length,
+                )
+
+            hidden_states = layer_outputs[0]
+            if use_cache:
+                next_decoder_cache += (layer_outputs[-1],)
+            if output_attentions:
+                all_self_attentions = all_self_attentions + (layer_outputs[1],)
+                # A layer emits a cross-attention entry at index 2 only if it owns a
+                # cross-attention block and was given query positions; otherwise index 2 is
+                # the key/value cache and must not be recorded. The accumulator is None when
+                # the config has no cross-attention at all.
+                ran_cross_attention = (
+                    getattr(layer_module, "has_cross_attention", False)
+                    and query_length > 0
+                )
+                if all_cross_attentions is not None and ran_cross_attention:
+                    all_cross_attentions = all_cross_attentions + (layer_outputs[2],)
+
+        if output_hidden_states:
+            all_hidden_states = all_hidden_states + (hidden_states,)
+
+        if not return_dict:
+            return tuple(
+                v
+                for v in [
+                    hidden_states,
+                    next_decoder_cache,
+                    all_hidden_states,
+                    all_self_attentions,
+                    all_cross_attentions,
+                ]
+                if v is not None
+            )
+        return BaseModelOutputWithPastAndCrossAttentions(
+            last_hidden_state=hidden_states,
+            past_key_values=next_decoder_cache,
+            hidden_states=all_hidden_states,
+            attentions=all_self_attentions,
+            cross_attentions=all_cross_attentions,
+        )
+
+
+class BertPooler(nn.Module):
+    """Pools a sequence by passing the hidden state at position 0 through a dense layer and ``tanh``."""
+
+    def __init__(self, config):
+        super().__init__()
+        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+        self.activation = nn.Tanh()
+
+    def forward(self, hidden_states):
+        # Only index 0 along dim 1 contributes to the pooled vector.
+        leading_state = hidden_states[:, 0]
+        return self.activation(self.dense(leading_state))
+
+
+class BertPredictionHeadTransform(nn.Module):
+    """Dense layer, activation and LayerNorm applied before the vocabulary projection."""
+
+    def __init__(self, config):
+        super().__init__()
+        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+        # `hidden_act` is either a key into ACT2FN or already a callable.
+        activation = config.hidden_act
+        self.transform_act_fn = (
+            ACT2FN[activation] if isinstance(activation, str) else activation
+        )
+        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+
+    def forward(self, hidden_states):
+        """Return ``LayerNorm(act(dense(hidden_states)))``."""
+        activated = self.transform_act_fn(self.dense(hidden_states))
+        return self.LayerNorm(activated)
+
+
+class BertLMPredictionHead(nn.Module):
+    """Transforms hidden states and projects them onto the vocabulary."""
+
+    def __init__(self, config):
+        super().__init__()
+        self.transform = BertPredictionHeadTransform(config)
+
+        # The projection is created without its own bias; a separate per-token
+        # bias parameter is attached below.
+        self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+        self.bias = nn.Parameter(torch.zeros(config.vocab_size))
+
+        # Share the parameter so that `resize_token_embeddings` resizes the bias too.
+        self.decoder.bias = self.bias
+
+    def forward(self, hidden_states):
+        """Return ``decoder(transform(hidden_states))``."""
+        transformed = self.transform(hidden_states)
+        return self.decoder(transformed)
+
+
+class BertOnlyMLMHead(nn.Module):
+    """Thin wrapper that exposes a :class:`BertLMPredictionHead` as ``predictions``."""
+
+    def __init__(self, config):
+        super().__init__()
+        self.predictions = BertLMPredictionHead(config)
+
+    def forward(self, sequence_output):
+        """Return the prediction scores computed by ``self.predictions``."""
+        return self.predictions(sequence_output)
+
+
+class BertPreTrainedModel(PreTrainedModel):
+    """
+    Abstract base that supplies weight initialization and the pretrained-model
+    download/loading interface inherited from ``PreTrainedModel``.
+    """
+
+    config_class = BertConfig
+    base_model_prefix = "bert"
+    _keys_to_ignore_on_load_missing = [r"position_ids"]
+
+    def _init_weights(self, module):
+        """Initialize the weights of a single sub-module."""
+        if isinstance(module, (nn.Linear, nn.Embedding)):
+            # Plain normal init; the TF version uses truncated_normal instead
+            # (cf https://github.com/pytorch/pytorch/pull/5617).
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+            if isinstance(module, nn.Linear) and module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, nn.LayerNorm):
+            module.bias.data.zero_()
+            module.weight.data.fill_(1.0)
+
+
+class BertModel(BertPreTrainedModel):
+ """
+ The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of
+ cross-attention is added between the self-attention layers, following the architecture described in `Attention is
+ all you need <https://arxiv.org/abs/1706.03762>`__ by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit,
+ Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin.
+
+ Cross-attention layers are created when the configuration has :obj:`add_cross_attention` set to :obj:`True`; an
+ :obj:`encoder_hidden_states` is then expected as an input to the forward pass. Causal (decoder) masking is
+ selected per call through the :obj:`is_decoder` argument of :meth:`forward`.
+ """
+
+ def __init__(self, config, add_pooling_layer=False):
+ super().__init__(config)
+ self.config = config
+
+ self.embeddings = BertEmbeddings(config)
+
+ self.encoder = BertEncoder(config)
+
+ self.pooler = BertPooler(config) if add_pooling_layer else None
+
+ self.init_weights()
+
+ def get_input_embeddings(self):
+ """Return the word-embedding module of ``self.embeddings``."""
+ return self.embeddings.word_embeddings
+
+ def set_input_embeddings(self, value):
+ """Replace the word-embedding module of ``self.embeddings`` with ``value``."""
+ self.embeddings.word_embeddings = value
+
+ def _prune_heads(self, heads_to_prune):
+ """
+ Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
+ class PreTrainedModel
+ """
+ for layer, heads in heads_to_prune.items():
+ self.encoder.layer[layer].attention.prune_heads(heads)
+
+ def get_extended_attention_mask(
+ self,
+ attention_mask: Tensor,
+ input_shape: Tuple[int],
+ device: device,
+ is_decoder: bool,
+ has_query: bool = False,
+ ) -> Tensor:
+ """
+ Makes broadcastable attention and causal masks so that future and masked tokens are ignored.
+
+ Arguments:
+ attention_mask (:obj:`torch.Tensor`):
+ Mask with ones indicating tokens to attend to, zeros for tokens to ignore. Must be 2D or 3D.
+ input_shape (:obj:`Tuple[int]`):
+ The shape of the input to the model; unpacked as ``(batch_size, seq_length)`` in decoder mode.
+ device: (:obj:`torch.device`):
+ The device of the input to the model.
+ is_decoder (:obj:`bool`):
+ Whether to combine a causal mask with a 2D padding mask.
+ has_query (:obj:`bool`, `optional`):
+ In decoder mode, when the padding mask is longer than ``seq_length``, also prepend zero rows for
+ the prefix positions (UniLM-style mask).
+
+ Returns:
+ :obj:`torch.Tensor` The extended attention mask, cast to the model dtype (``self.dtype``), holding 0.0
+ at positions to attend to and -10000.0 at masked positions.
+
+ Raises:
+ ValueError: if ``attention_mask`` is neither 2D nor 3D.
+ """
+ # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
+ # ourselves in which case we just need to make it broadcastable to all heads.
+ if attention_mask.dim() == 3:
+ extended_attention_mask = attention_mask[:, None, :, :]
+ elif attention_mask.dim() == 2:
+ # Provided a padding mask of dimensions [batch_size, seq_length]
+ # - if the model is a decoder, apply a causal mask in addition to the padding mask
+ # - if the model is an encoder, make the mask broadcastable to [batch_size, num_heads, seq_length, seq_length]
+ if is_decoder:
+ batch_size, seq_length = input_shape
+
+ # Lower-triangular mask: position i may attend to positions j <= i.
+ seq_ids = torch.arange(seq_length, device=device)
+ causal_mask = (
+ seq_ids[None, None, :].repeat(batch_size, seq_length, 1)
+ <= seq_ids[None, :, None]
+ )
+
+ # add a prefix ones mask to the causal mask
+ # causal and attention masks must have same type with pytorch version < 1.3
+ causal_mask = causal_mask.to(attention_mask.dtype)
+
+ # The padding mask is longer than the causal mask when extra prefix positions
+ # precede the `seq_length` positions described by `input_shape`.
+ if causal_mask.shape[1] < attention_mask.shape[1]:
+ prefix_seq_len = attention_mask.shape[1] - causal_mask.shape[1]
+ if has_query: # UniLM style attention mask
+ # Prepend zero rows: prefix positions do not attend to the `seq_length` positions.
+ causal_mask = torch.cat(
+ [
+ torch.zeros(
+ (batch_size, prefix_seq_len, seq_length),
+ device=device,
+ dtype=causal_mask.dtype,
+ ),
+ causal_mask,
+ ],
+ axis=1,
+ )
+ # Prepend one columns: every row may attend to all prefix positions.
+ causal_mask = torch.cat(
+ [
+ torch.ones(
+ (batch_size, causal_mask.shape[1], prefix_seq_len),
+ device=device,
+ dtype=causal_mask.dtype,
+ ),
+ causal_mask,
+ ],
+ axis=-1,
+ )
+ extended_attention_mask = (
+ causal_mask[:, None, :, :] * attention_mask[:, None, None, :]
+ )
+ else:
+ extended_attention_mask = attention_mask[:, None, None, :]
+ else:
+ raise ValueError(
+ "Wrong shape for input_ids (shape {}) or attention_mask (shape {})".format(
+ input_shape, attention_mask.shape
+ )
+ )
+
+ # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
+ # masked positions, this operation will create a tensor which is 0.0 for
+ # positions we want to attend and -10000.0 for masked positions.
+ # Since we are adding it to the raw scores before the softmax, this is
+ # effectively the same as removing these entirely.
+ extended_attention_mask = extended_attention_mask.to(
+ dtype=self.dtype
+ ) # fp16 compatibility
+ extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0
+ return extended_attention_mask
+
+ def forward(
+ self,
+ input_ids=None,
+ attention_mask=None,
+ position_ids=None,
+ head_mask=None,
+ query_embeds=None,
+ encoder_hidden_states=None,
+ encoder_attention_mask=None,
+ past_key_values=None,
+ use_cache=None,
+ output_attentions=None,
+ output_hidden_states=None,
+ return_dict=None,
+ is_decoder=False,
+ ):
+ r"""
+ query_embeds (`optional`):
+ Passed to the embedding layer together with :obj:`input_ids`; its size along dim 1 is used as the
+ query length. Required when :obj:`input_ids` is :obj:`None`.
+ encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
+ Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
+ the model is configured as a decoder.
+ encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
+ Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
+ the cross-attention if the model is configured as a decoder. Mask values selected in ``[0, 1]``:
+ - 1 for tokens that are **not masked**,
+ - 0 for tokens that are **masked**.
+ past_key_values (:obj:`tuple(tuple(torch.FloatTensor))` of length :obj:`config.n_layers` with each tuple having 4 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
+ Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
+ If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids`
+ (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)`
+ instead of all :obj:`decoder_input_ids` of shape :obj:`(batch_size, sequence_length)`.
+ use_cache (:obj:`bool`, `optional`):
+ If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up
+ decoding (see :obj:`past_key_values`).
+ is_decoder (:obj:`bool`, `optional`):
+ Whether to build a causal self-attention mask (see :meth:`get_extended_attention_mask`).
+ """
+ output_attentions = (
+ output_attentions
+ if output_attentions is not None
+ else self.config.output_attentions
+ )
+ output_hidden_states = (
+ output_hidden_states
+ if output_hidden_states is not None
+ else self.config.output_hidden_states
+ )
+ return_dict = (
+ return_dict if return_dict is not None else self.config.use_return_dict
+ )
+
+ # use_cache = use_cache if use_cache is not None else self.config.use_cache
+
+ if input_ids is None:
+ assert (
+ query_embeds is not None
+ ), "You have to specify query_embeds when input_ids is None"
+
+ # past_key_values_length
+ # Cached length minus `config.query_length`; presumably the cache also holds the query
+ # positions, which must not count towards text position offsets — TODO confirm.
+ past_key_values_length = (
+ past_key_values[0][0].shape[2] - self.config.query_length
+ if past_key_values is not None
+ else 0
+ )
+
+ query_length = query_embeds.shape[1] if query_embeds is not None else 0
+
+ embedding_output = self.embeddings(
+ input_ids=input_ids,
+ position_ids=position_ids,
+ query_embeds=query_embeds,
+ past_key_values_length=past_key_values_length,
+ )
+
+ input_shape = embedding_output.size()[:-1]
+ batch_size, seq_length = input_shape
+ device = embedding_output.device
+
+ # Default mask: attend to every current and cached position.
+ if attention_mask is None:
+ attention_mask = torch.ones(
+ ((batch_size, seq_length + past_key_values_length)), device=device
+ )
+
+ # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
+ # ourselves in which case we just need to make it broadcastable to all heads.
+ if is_decoder:
+ # The causal part is sized from `input_ids` alone; extra leading mask columns are
+ # handled as a prefix inside get_extended_attention_mask.
+ # NOTE(review): this dereferences `input_ids`, which the check above allows to be None —
+ # confirm decoder calls always pass `input_ids`.
+ extended_attention_mask = self.get_extended_attention_mask(
+ attention_mask,
+ input_ids.shape,
+ device,
+ is_decoder,
+ has_query=(query_embeds is not None),
+ )
+ else:
+ extended_attention_mask = self.get_extended_attention_mask(
+ attention_mask, input_shape, device, is_decoder
+ )
+
+ # If a 2D or 3D attention mask is provided for the cross-attention
+ # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length]
+ if encoder_hidden_states is not None:
+ # A list of encoder states is accepted; the shape is then read from its first element.
+ if type(encoder_hidden_states) == list:
+ encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states[
+ 0
+ ].size()
+ else:
+ (
+ encoder_batch_size,
+ encoder_sequence_length,
+ _,
+ ) = encoder_hidden_states.size()
+ encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length)
+
+ if type(encoder_attention_mask) == list:
+ encoder_extended_attention_mask = [
+ self.invert_attention_mask(mask) for mask in encoder_attention_mask
+ ]
+ elif encoder_attention_mask is None:
+ # No encoder mask given: attend to every encoder position.
+ encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device)
+ encoder_extended_attention_mask = self.invert_attention_mask(
+ encoder_attention_mask
+ )
+ else:
+ encoder_extended_attention_mask = self.invert_attention_mask(
+ encoder_attention_mask
+ )
+ else:
+ encoder_extended_attention_mask = None
+
+ # Prepare head mask if needed
+ # 1.0 in head_mask indicate we keep the head
+ # attention_probs has shape bsz x n_heads x N x N
+ # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
+ # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
+ head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)
+
+ encoder_outputs = self.encoder(
+ embedding_output,
+ attention_mask=extended_attention_mask,
+ head_mask=head_mask,
+ encoder_hidden_states=encoder_hidden_states,
+ encoder_attention_mask=encoder_extended_attention_mask,
+ past_key_values=past_key_values,
+ use_cache=use_cache,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ query_length=query_length,
+ )
+ sequence_output = encoder_outputs[0]
+ # `pooler` is None unless the model was built with add_pooling_layer=True.
+ pooled_output = (
+ self.pooler(sequence_output) if self.pooler is not None else None
+ )
+
+ if not return_dict:
+ return (sequence_output, pooled_output) + encoder_outputs[1:]
+
+ return BaseModelOutputWithPoolingAndCrossAttentions(
+ last_hidden_state=sequence_output,
+ pooler_output=pooled_output,
+ past_key_values=encoder_outputs.past_key_values,
+ hidden_states=encoder_outputs.hidden_states,
+ attentions=encoder_outputs.attentions,
+ cross_attentions=encoder_outputs.cross_attentions,
+ )
+
+
+class BertLMHeadModel(BertPreTrainedModel):
+    """:class:`BertModel` (without pooler) topped with an LM head for next-token prediction."""
+
+    _keys_to_ignore_on_load_unexpected = [r"pooler"]
+    _keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias"]
+
+    def __init__(self, config):
+        super().__init__(config)
+
+        self.bert = BertModel(config, add_pooling_layer=False)
+        self.cls = BertOnlyMLMHead(config)
+
+        self.init_weights()
+
+    def get_output_embeddings(self):
+        """Return the vocabulary projection layer of the LM head."""
+        return self.cls.predictions.decoder
+
+    def set_output_embeddings(self, new_embeddings):
+        """Replace the vocabulary projection layer of the LM head."""
+        self.cls.predictions.decoder = new_embeddings
+
+    def forward(
+        self,
+        input_ids=None,
+        attention_mask=None,
+        position_ids=None,
+        head_mask=None,
+        query_embeds=None,
+        encoder_hidden_states=None,
+        encoder_attention_mask=None,
+        labels=None,
+        past_key_values=None,
+        use_cache=True,
+        output_attentions=None,
+        output_hidden_states=None,
+        return_dict=None,
+        return_logits=False,
+        is_decoder=True,
+        reduction="mean",
+    ):
+        r"""
+        Run the underlying :class:`BertModel` and score the non-query positions with the LM head.
+
+        Args:
+            query_embeds (`optional`):
+                Forwarded to the base model; dropped when :obj:`past_key_values` is given. The
+                positions it occupies are cut from the sequence output before scoring.
+            encoder_hidden_states, encoder_attention_mask (`optional`):
+                Forwarded to the base model for cross-attention.
+            labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
+                Labels for the left-to-right language modeling loss. Scores and labels are shifted
+                by one position before the loss is computed; supplying labels forces
+                ``use_cache=False``.
+            past_key_values, use_cache (`optional`):
+                Forwarded to the base model.
+            return_logits (:obj:`bool`, `optional`):
+                If set, return only the prediction scores without their last position.
+            reduction (:obj:`str`, `optional`):
+                Reduction passed to ``CrossEntropyLoss``; with ``"none"`` the per-token losses are
+                summed per batch row.
+
+        Returns:
+            The shifted logits tensor (``return_logits``), a tuple (``return_dict`` false), or a
+            ``CausalLMOutputWithCrossAttentions``.
+        """
+        if return_dict is None:
+            return_dict = self.config.use_return_dict
+        if labels is not None:
+            use_cache = False
+        if past_key_values is not None:
+            query_embeds = None
+
+        outputs = self.bert(
+            input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            query_embeds=query_embeds,
+            encoder_hidden_states=encoder_hidden_states,
+            encoder_attention_mask=encoder_attention_mask,
+            past_key_values=past_key_values,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            is_decoder=is_decoder,
+        )
+
+        # Drop the leading query positions (if any) so only the remaining positions are scored.
+        if query_embeds is None:
+            sequence_output = outputs[0]
+        else:
+            sequence_output = outputs[0][:, query_embeds.shape[1] :, :]
+
+        prediction_scores = self.cls(sequence_output)
+
+        if return_logits:
+            return prediction_scores[:, :-1, :].contiguous()
+
+        lm_loss = None
+        if labels is not None:
+            # Next-token prediction: scores at position t are matched with the label at t + 1.
+            shifted_scores = prediction_scores[:, :-1, :].contiguous()
+            labels = labels[:, 1:].contiguous()
+            criterion = CrossEntropyLoss(reduction=reduction, label_smoothing=0.1)
+            lm_loss = criterion(
+                shifted_scores.view(-1, self.config.vocab_size),
+                labels.view(-1),
+            )
+            if reduction == "none":
+                lm_loss = lm_loss.view(prediction_scores.size(0), -1).sum(1)
+
+        if not return_dict:
+            output = (prediction_scores,) + outputs[2:]
+            if lm_loss is None:
+                return output
+            return (lm_loss,) + output
+
+        return CausalLMOutputWithCrossAttentions(
+            loss=lm_loss,
+            logits=prediction_scores,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+            cross_attentions=outputs.cross_attentions,
+        )
+
+    def prepare_inputs_for_generation(
+        self, input_ids, query_embeds, past=None, attention_mask=None, **model_kwargs
+    ):
+        """Assemble the keyword arguments for one generation step."""
+        # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly
+        if attention_mask is None:
+            text_mask = input_ids.new_ones(input_ids.shape)
+        else:
+            text_mask = attention_mask
+        # The query positions are always attended to and come first in the mask.
+        query_mask = input_ids.new_ones(query_embeds.shape[:-1])
+        full_mask = torch.cat([query_mask, text_mask], dim=-1)
+
+        # With a cache, only the most recent token needs to be fed.
+        step_ids = input_ids if past is None else input_ids[:, -1:]
+
+        return {
+            "input_ids": step_ids,
+            "query_embeds": query_embeds,
+            "attention_mask": full_mask,
+            "past_key_values": past,
+            "encoder_hidden_states": model_kwargs.get("encoder_hidden_states", None),
+            "encoder_attention_mask": model_kwargs.get("encoder_attention_mask", None),
+            "is_decoder": True,
+        }
+
+    def _reorder_cache(self, past, beam_idx):
+        """Re-index every cached tensor along dim 0 with ``beam_idx``."""
+        return tuple(
+            tuple(state.index_select(0, beam_idx) for state in layer_past)
+            for layer_past in past
+        )
+
+
+class BertForMaskedLM(BertPreTrainedModel):
+    """:class:`BertModel` (without pooler) topped with a masked-language-modeling head."""
+
+    _keys_to_ignore_on_load_unexpected = [r"pooler"]
+    _keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias"]
+
+    def __init__(self, config):
+        super().__init__(config)
+
+        self.bert = BertModel(config, add_pooling_layer=False)
+        self.cls = BertOnlyMLMHead(config)
+
+        self.init_weights()
+
+    def get_output_embeddings(self):
+        """Return the vocabulary projection layer of the MLM head."""
+        return self.cls.predictions.decoder
+
+    def set_output_embeddings(self, new_embeddings):
+        """Replace the vocabulary projection layer of the MLM head."""
+        self.cls.predictions.decoder = new_embeddings
+
+    def forward(
+        self,
+        input_ids=None,
+        attention_mask=None,
+        position_ids=None,
+        head_mask=None,
+        query_embeds=None,
+        encoder_hidden_states=None,
+        encoder_attention_mask=None,
+        labels=None,
+        output_attentions=None,
+        output_hidden_states=None,
+        return_dict=None,
+        return_logits=False,
+        is_decoder=False,
+    ):
+        r"""
+        Run the underlying :class:`BertModel` and score the non-query positions with the MLM head.
+
+        Args:
+            query_embeds (`optional`):
+                Forwarded to the base model. When given, the positions it occupies are cut from the
+                sequence output before scoring; when omitted, the full sequence output is scored.
+            labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
+                Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ...,
+                config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored
+                (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]``
+            return_logits (:obj:`bool`, `optional`):
+                If set, return only the prediction scores.
+
+        Returns:
+            The logits tensor (``return_logits``), a tuple (``return_dict`` false), or a ``MaskedLMOutput``.
+        """
+
+        return_dict = (
+            return_dict if return_dict is not None else self.config.use_return_dict
+        )
+
+        outputs = self.bert(
+            input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            query_embeds=query_embeds,
+            encoder_hidden_states=encoder_hidden_states,
+            encoder_attention_mask=encoder_attention_mask,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            is_decoder=is_decoder,
+        )
+
+        # Always bind `sequence_output`; previously it was assigned only when `query_embeds`
+        # was given, so text-only calls failed with UnboundLocalError. This mirrors
+        # BertLMHeadModel.forward.
+        sequence_output = outputs[0]
+        if query_embeds is not None:
+            sequence_output = sequence_output[:, query_embeds.shape[1] :, :]
+        prediction_scores = self.cls(sequence_output)
+
+        if return_logits:
+            return prediction_scores
+
+        masked_lm_loss = None
+        if labels is not None:
+            loss_fct = CrossEntropyLoss()  # -100 index = padding token
+            masked_lm_loss = loss_fct(
+                prediction_scores.view(-1, self.config.vocab_size), labels.view(-1)
+            )
+
+        if not return_dict:
+            output = (prediction_scores,) + outputs[2:]
+            return (
+                ((masked_lm_loss,) + output) if masked_lm_loss is not None else output
+            )
+
+        return MaskedLMOutput(
+            loss=masked_lm_loss,
+            logits=prediction_scores,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
diff --git a/minigpt4/models/__init__.py b/minigpt4/models/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..5096a04317a51def52386b728124962b72a2574c
--- /dev/null
+++ b/minigpt4/models/__init__.py
@@ -0,0 +1,200 @@
+"""
+ Copyright (c) 2022, salesforce.com, inc.
+ All rights reserved.
+ SPDX-License-Identifier: BSD-3-Clause
+ For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+"""
+
+import logging
+import torch
+from omegaconf import OmegaConf
+
+from minigpt4.common.registry import registry
+from minigpt4.models.base_model import BaseModel
+from minigpt4.models.blip2 import Blip2Base
+from minigpt4.models.mini_gpt4 import MiniGPT4
+from minigpt4.processors.base_processor import BaseProcessor
+
+
+# Names exported by `from minigpt4.models import *`.
+__all__ = [
+ "load_model",
+ "BaseModel",
+ "Blip2Base",
+ "MiniGPT4",
+]
+
+
+def load_model(name, model_type, is_eval=False, device="cpu", checkpoint=None):
+    """
+    Load supported models.
+
+    To list all available models and types in registry:
+    >>> from minigpt4.models import model_zoo
+    >>> print(model_zoo)
+
+    Args:
+        name (str): name of the model.
+        model_type (str): type of the model.
+        is_eval (bool): whether the model is in eval mode. Default: False.
+        device (str or torch.device): device to use. Default: "cpu".
+        checkpoint (str): path or URL of a checkpoint. Default: None.
+            Note that expecting the checkpoint to have the same keys in state_dict as the model.
+
+    Returns:
+        model (torch.nn.Module): model.
+    """
+
+    model = registry.get_model_class(name).from_pretrained(model_type=model_type)
+
+    if checkpoint is not None:
+        model.load_checkpoint(checkpoint)
+
+    if is_eval:
+        model.eval()
+
+    # Cast to float32 on CPU whether the device is given as the string "cpu" or as
+    # torch.device("cpu"), matching load_model_and_preprocess.
+    if device == "cpu" or device == torch.device("cpu"):
+        model = model.float()
+
+    return model.to(device)
+
+
+def load_preprocess(config):
+    """
+    Build the visual and text preprocessors described by ``config``.
+
+    Any processor that is not configured falls back to ``BaseProcessor``, which does not do
+    any preprocessing.
+
+    Args:
+        config (dict): preprocessor configs, read under the keys "vis_processor" and
+            "text_processor", each optionally holding "train" and "eval" entries.
+
+    Returns:
+        vis_processors (dict): preprocessors for visual inputs.
+        txt_processors (dict): preprocessors for text inputs.
+
+        Key is "train" or "eval" for processors used in training and evaluation respectively.
+    """
+
+    def _make_processor(proc_cfg):
+        if proc_cfg is None:
+            return BaseProcessor()
+        return registry.get_processor_class(proc_cfg.name).from_config(proc_cfg)
+
+    def _train_eval_cfgs(section):
+        if section is None:
+            return None, None
+        return section.get("train"), section.get("eval")
+
+    vis_section = config.get("vis_processor")
+    txt_section = config.get("text_processor")
+
+    vis_train_cfg, vis_eval_cfg = _train_eval_cfgs(vis_section)
+    vis_processors = {}
+    vis_processors["train"] = _make_processor(vis_train_cfg)
+    vis_processors["eval"] = _make_processor(vis_eval_cfg)
+
+    txt_train_cfg, txt_eval_cfg = _train_eval_cfgs(txt_section)
+    txt_processors = {}
+    txt_processors["train"] = _make_processor(txt_train_cfg)
+    txt_processors["eval"] = _make_processor(txt_eval_cfg)
+
+    return vis_processors, txt_processors
+
+
+def load_model_and_preprocess(name, model_type, is_eval=False, device="cpu"):
+    """
+    Build a model together with the preprocessors named in its default config.
+
+    List all available models and types in registry:
+    >>> from minigpt4.models import model_zoo
+    >>> print(model_zoo)
+
+    Args:
+        name (str): name of the model.
+        model_type (str): type of the model.
+        is_eval (bool): whether the model is in eval mode. Default: False.
+        device (str): device to use. Default: "cpu".
+
+    Returns:
+        model (torch.nn.Module): model.
+        vis_processors (dict): preprocessors for visual inputs.
+        txt_processors (dict): preprocessors for text inputs.
+    """
+    model_cls = registry.get_model_class(name)
+
+    # Model first, then its preprocessors.
+    model = model_cls.from_pretrained(model_type=model_type)
+    if is_eval:
+        model.eval()
+
+    cfg = OmegaConf.load(model_cls.default_config_path(model_type))
+    if cfg is None:
+        vis_processors, txt_processors = None, None
+        logging.info(
+            f"""No default preprocess for model {name} ({model_type}).
+ This can happen if the model is not finetuned on downstream datasets,
+ or it is not intended for direct use without finetuning.
+ """
+        )
+    else:
+        vis_processors, txt_processors = load_preprocess(cfg.preprocess)
+
+    # On CPU the model is cast to float32 before being moved.
+    targets_cpu = device == "cpu" or device == torch.device("cpu")
+    if targets_cpu:
+        model = model.float()
+
+    return model.to(device), vis_processors, txt_processors
+
+
+class ModelZoo:
+    """
+    Printable catalogue of the registered model architectures and their types.
+
+    >>> from minigpt4.models import model_zoo
+    >>> # list all available models
+    >>> print(model_zoo)
+    >>> # show total number of models
+    >>> print(len(model_zoo))
+    """
+
+    def __init__(self) -> None:
+        # Maps each registered architecture name to the list of its pretrained config keys.
+        catalogue = {}
+        for arch_name, model_cls in registry.mapping["model_name_mapping"].items():
+            catalogue[arch_name] = list(model_cls.PRETRAINED_MODEL_CONFIG_DICT.keys())
+        self.model_zoo = catalogue
+
+    def __str__(self) -> str:
+        rule = "=" * 50
+        header = f"{'Architectures':<30} {'Types'}"
+        rows = [
+            f"{name:<30} {', '.join(types)}" for name, types in self.model_zoo.items()
+        ]
+        return rule + "\n" + header + "\n" + rule + "\n" + "\n".join(rows)
+
+    def __iter__(self):
+        return iter(self.model_zoo.items())
+
+    def __len__(self):
+        # Total number of model types across all architectures.
+        return sum(len(types) for types in self.model_zoo.values())
+
+
+# Module-level instance, built at import time from the models registered so far.
+model_zoo = ModelZoo()
diff --git a/minigpt4/models/base_model.py b/minigpt4/models/base_model.py
new file mode 100644
index 0000000000000000000000000000000000000000..dbfaf8e989d509bef7c4f06ac6d3de2b085e5d38
--- /dev/null
+++ b/minigpt4/models/base_model.py
@@ -0,0 +1,247 @@
+"""
+ Copyright (c) 2022, salesforce.com, inc.
+ All rights reserved.
+ SPDX-License-Identifier: BSD-3-Clause
+ For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+"""
+
+import logging
+import os
+
+import numpy as np
+import torch
+import torch.nn as nn
+from minigpt4.common.dist_utils import download_cached_file, is_dist_avail_and_initialized
+from minigpt4.common.utils import get_abs_path, is_url
+from omegaconf import OmegaConf
+
+
+class BaseModel(nn.Module):
+    """Base class for models.
+
+    Subclasses are expected to provide ``PRETRAINED_MODEL_CONFIG_DICT`` (model type ->
+    config path), a ``from_config`` classmethod and, for the non-finetuned loading
+    path, a ``load_from_pretrained`` method; none of these are defined here.
+    """
+
+    def __init__(self):
+        super().__init__()
+
+    @property
+    def device(self):
+        """Device of the first parameter (raises IndexError if the model has no parameters)."""
+        return list(self.parameters())[0].device
+
+    def load_checkpoint(self, url_or_filename):
+        """
+        Load from a finetuned checkpoint.
+
+        This should expect no mismatch in the model keys and the checkpoint keys.
+
+        Args:
+            - url_or_filename (str): URL (downloaded through the cache) or local file path.
+
+        Returns:
+            - the result of ``load_state_dict(..., strict=False)``.
+
+        Raises:
+            - RuntimeError: if the argument is neither a URL nor an existing file.
+        """
+
+        if is_url(url_or_filename):
+            cached_file = download_cached_file(
+                url_or_filename, check_hash=False, progress=True
+            )
+            checkpoint = torch.load(cached_file, map_location="cpu")
+        elif os.path.isfile(url_or_filename):
+            checkpoint = torch.load(url_or_filename, map_location="cpu")
+        else:
+            raise RuntimeError("checkpoint url or path is invalid")
+
+        # Checkpoints either wrap the weights under a "model" key or are the state dict itself.
+        state_dict = checkpoint["model"] if "model" in checkpoint else checkpoint
+
+        msg = self.load_state_dict(state_dict, strict=False)
+
+        logging.info("Missing keys {}".format(msg.missing_keys))
+        logging.info("load checkpoint from %s" % url_or_filename)
+
+        return msg
+
+    @classmethod
+    def from_pretrained(cls, model_type):
+        """
+        Build a pretrained model from default configuration file, specified by model_type.
+
+        Args:
+            - model_type (str): model type, specifying architecture and checkpoints.
+
+        Returns:
+            - model (nn.Module): pretrained or finetuned model, depending on the configuration.
+        """
+        model_cfg = OmegaConf.load(cls.default_config_path(model_type)).model
+        model = cls.from_config(model_cfg)
+
+        return model
+
+    @classmethod
+    def default_config_path(cls, model_type):
+        """Return the absolute path of the default config registered for ``model_type``."""
+        assert (
+            model_type in cls.PRETRAINED_MODEL_CONFIG_DICT
+        ), "Unknown model type {}".format(model_type)
+        return get_abs_path(cls.PRETRAINED_MODEL_CONFIG_DICT[model_type])
+
+    def load_checkpoint_from_config(self, cfg, **kwargs):
+        """
+        Load checkpoint as specified in the config file.
+
+        If load_finetuned is True, load the finetuned model; otherwise, load the pretrained model.
+        When loading the pretrained model, each task-specific architecture may define their
+        own load_from_pretrained() method.
+
+        Args:
+            - cfg: mapping-like config read with ``cfg.get``; keys used are
+              ``load_finetuned`` (default True), ``finetuned`` and ``pretrained``.
+            - **kwargs: forwarded to ``load_from_pretrained`` on the pretrained path only.
+
+        Raises:
+            - AssertionError: if the path required by the selected branch is missing.
+        """
+        load_finetuned = cfg.get("load_finetuned", True)
+        if load_finetuned:
+            finetune_path = cfg.get("finetuned", None)
+            assert (
+                finetune_path is not None
+            ), "Found load_finetuned is True, but finetune_path is None."
+            self.load_checkpoint(url_or_filename=finetune_path)
+        else:
+            # load pre-trained weights
+            pretrain_path = cfg.get("pretrained", None)
+            # The original asserted on the message string itself, which is always truthy,
+            # so a missing path slipped through; check the path explicitly.
+            assert (
+                pretrain_path is not None
+            ), "Found load_finetuned is False, but pretrain_path is None."
+            self.load_from_pretrained(url_or_filename=pretrain_path, **kwargs)
+
+    def before_evaluation(self, **kwargs):
+        """Hook invoked before evaluation; no-op by default."""
+        pass
+
+    def show_n_params(self, return_str=True):
+        """
+        Count the elements of all parameters.
+
+        Returns:
+            - a string such as "1.2M" / "3.4K" when ``return_str`` is True, else the raw int.
+        """
+        tot = sum(p.numel() for p in self.parameters())
+        if return_str:
+            if tot >= 1e6:
+                return "{:.1f}M".format(tot / 1e6)
+            else:
+                return "{:.1f}K".format(tot / 1e3)
+        else:
+            return tot
+
+
+class BaseEncoder(nn.Module):
+    """
+    Common parent for primitive encoders (ViT, TimeSformer, ...).
+
+    Concrete encoders must override ``forward_features``.
+    """
+
+    def __init__(self):
+        super().__init__()
+
+    def forward_features(self, samples, **kwargs):
+        """Abstract: compute features for ``samples``."""
+        raise NotImplementedError
+
+    @property
+    def device(self):
+        """Device on which the first parameter lives."""
+        all_params = list(self.parameters())
+        first_param = all_params[0]
+        return first_param.device
+
+
+class SharedQueueMixin:
+    """Mixin that maintains ring-buffer queues of image/text features (and optional ids).
+
+    The host class must provide ``queue_ptr``, ``queue_size``, ``image_queue``,
+    ``text_queue`` and, when ``idxs`` is passed, ``idx_queue``.
+    """
+
+    @torch.no_grad()
+    def _dequeue_and_enqueue(self, image_feat, text_feat, idxs=None):
+        """Overwrite the queue slots at the current pointer with the gathered features."""
+        # collect the features from every worker before touching the queues
+        gathered_images = concat_all_gather(image_feat)
+        gathered_texts = concat_all_gather(text_feat)
+
+        n_new = gathered_images.shape[0]
+
+        start = int(self.queue_ptr)
+        # for simplicity the queue length must be a multiple of the gathered batch
+        assert self.queue_size % n_new == 0
+        stop = start + n_new
+
+        # features are stored column-wise, hence the transpose
+        self.image_queue[:, start:stop] = gathered_images.T
+        self.text_queue[:, start:stop] = gathered_texts.T
+
+        if idxs is not None:
+            gathered_idxs = concat_all_gather(idxs)
+            self.idx_queue[:, start:stop] = gathered_idxs.T
+
+        # advance the pointer, wrapping around at the end of the queue
+        self.queue_ptr[0] = (start + n_new) % self.queue_size
+
+
+class MomentumDistilationMixin:
+    """Mixin keeping momentum copies of models in sync.
+
+    The host class must provide ``model_pairs`` (each entry indexed as
+    ``[source_model, momentum_model]``) and, for ``_momentum_update``, ``momentum``.
+    """
+
+    @torch.no_grad()
+    def copy_params(self):
+        """Initialise every momentum model from its source and freeze its parameters."""
+        for pair in self.model_pairs:
+            source_params = pair[0].parameters()
+            momentum_params = pair[1].parameters()
+            for src, tgt in zip(source_params, momentum_params):
+                tgt.data.copy_(src.data)  # initialize
+                tgt.requires_grad = False  # not update by gradient
+
+    @torch.no_grad()
+    def _momentum_update(self):
+        """Blend each momentum parameter with its source: m * old + (1 - m) * new."""
+        for pair in self.model_pairs:
+            source_params = pair[0].parameters()
+            momentum_params = pair[1].parameters()
+            for src, tgt in zip(source_params, momentum_params):
+                kept = tgt.data * self.momentum
+                fresh = src.data * (1.0 - self.momentum)
+                tgt.data = kept + fresh
+
+
+class GatherLayer(torch.autograd.Function):
+    """
+    all_gather that stays differentiable: unlike a plain
+    torch.distributed.all_gather, gradients flow back to the local input.
+    """
+
+    @staticmethod
+    def forward(ctx, x):
+        """Gather ``x`` from every worker; returns a tuple with one tensor per rank."""
+        world_size = torch.distributed.get_world_size()
+        gathered = []
+        for _ in range(world_size):
+            gathered.append(torch.zeros_like(x))
+        torch.distributed.all_gather(gathered, x)
+        return tuple(gathered)
+
+    @staticmethod
+    def backward(ctx, *grads):
+        """Sum the incoming gradients across workers and return this rank's slice."""
+        stacked = torch.stack(grads)
+        torch.distributed.all_reduce(stacked)
+        own_rank = torch.distributed.get_rank()
+        return stacked[own_rank]
+
+
+def all_gather_with_grad(tensors):
+    """
+    all_gather the given tensor across workers while keeping the autograd graph
+    connected, then concatenate the per-rank results along dim 0.
+    """
+    # With a single process there is nothing to gather: hand the input back untouched.
+    if torch.distributed.get_world_size() == 1:
+        return tensors
+
+    per_rank = GatherLayer.apply(tensors)
+    return torch.cat(per_rank, dim=0)
+
+
+@torch.no_grad()
+def concat_all_gather(tensor):
+    """
+    all_gather ``tensor`` across workers and concatenate the results along dim 0.
+    *** Warning ***: torch.distributed.all_gather has no gradient.
+    """
+    # outside of distributed training the input is returned as-is
+    if not is_dist_avail_and_initialized():
+        return tensor
+
+    world_size = torch.distributed.get_world_size()
+    buffers = [torch.ones_like(tensor) for _ in range(world_size)]
+    torch.distributed.all_gather(buffers, tensor, async_op=False)
+
+    return torch.cat(buffers, dim=0)
+
+
+def tile(x, dim, n_tile):
+    """
+    Repeat every slice of ``x`` along ``dim`` ``n_tile`` times, keeping the copies of a
+    slice adjacent (e.g. rows [a, b] with n_tile=2 become [a, a, b, b]).
+    """
+    size_along_dim = x.size(dim)
+
+    # Repeat the whole tensor n_tile times along `dim` ...
+    repeats = [1] * x.dim()
+    repeats[dim] = n_tile
+    repeated = x.repeat(*repeats)
+
+    # ... then reorder so that the copies of each original slice sit next to each other.
+    per_slice_positions = []
+    for offset in range(size_along_dim):
+        per_slice_positions.append(size_along_dim * np.arange(n_tile) + offset)
+    gather_order = torch.LongTensor(np.concatenate(per_slice_positions))
+
+    return torch.index_select(repeated, dim, gather_order.to(repeated.device))
diff --git a/minigpt4/models/blip2.py b/minigpt4/models/blip2.py
new file mode 100644
index 0000000000000000000000000000000000000000..bb04c4d95f102266d9f97ddd98050d65f5be7ff2
--- /dev/null
+++ b/minigpt4/models/blip2.py
@@ -0,0 +1,221 @@
+"""
+ Copyright (c) 2023, salesforce.com, inc.
+ All rights reserved.
+ SPDX-License-Identifier: BSD-3-Clause
+ For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+"""
+import contextlib
+import logging
+import os
+import time
+import datetime
+
+import torch
+import torch.nn as nn
+import torch.distributed as dist
+import torch.nn.functional as F
+
+import minigpt4.common.dist_utils as dist_utils
+from minigpt4.common.dist_utils import download_cached_file
+from minigpt4.common.utils import is_url
+from minigpt4.common.logger import MetricLogger
+from minigpt4.models.base_model import BaseModel
+from minigpt4.models.Qformer import BertConfig, BertLMHeadModel
+from minigpt4.models.eva_vit import create_eva_vit_g
+from transformers import BertTokenizer
+
+
+class Blip2Base(BaseModel):
+    """Shared helpers for BLIP-2 style models: tokenizer, Q-Former and vision-encoder
+    construction, autocast selection and pretrained-checkpoint loading."""
+
+    @classmethod
+    def init_tokenizer(cls):
+        """Return the ``bert-base-uncased`` tokenizer with ``[DEC]`` registered as BOS token."""
+        bert_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
+        bert_tokenizer.add_special_tokens({"bos_token": "[DEC]"})
+        return bert_tokenizer
+
+    def maybe_autocast(self, dtype=torch.float16):
+        """Return a CUDA autocast context with ``dtype``, or a no-op context when on CPU."""
+        on_cpu = self.device == torch.device("cpu")
+        if on_cpu:
+            # autocast is not used on cpu
+            return contextlib.nullcontext()
+        return torch.cuda.amp.autocast(dtype=dtype)
+
+    @classmethod
+    def init_Qformer(cls, num_query_token, vision_width, cross_attention_freq=2):
+        """
+        Build the Q-Former and its learnable query tokens.
+
+        Returns:
+            - (Qformer, query_tokens): a ``BertLMHeadModel`` and an ``nn.Parameter`` of
+              shape (1, num_query_token, hidden_size).
+        """
+        qformer_cfg = BertConfig.from_pretrained("bert-base-uncased")
+        qformer_cfg.encoder_width = vision_width
+        # cross-attention is inserted every `cross_attention_freq` blocks
+        qformer_cfg.add_cross_attention = True
+        qformer_cfg.cross_attention_freq = cross_attention_freq
+        qformer_cfg.query_length = num_query_token
+        Qformer = BertLMHeadModel(config=qformer_cfg)
+
+        zeros = torch.zeros(1, num_query_token, qformer_cfg.hidden_size)
+        query_tokens = nn.Parameter(zeros)
+        query_tokens.data.normal_(mean=0.0, std=qformer_cfg.initializer_range)
+        return Qformer, query_tokens
+
+    @classmethod
+    def init_vision_encoder(
+        cls, model_name, img_size, drop_path_rate, use_grad_checkpoint, precision
+    ):
+        """Create the EVA ViT-g encoder together with a LayerNorm over its features."""
+        assert model_name == "eva_clip_g", "vit model must be eva_clip_g for current version of MiniGPT-4"
+        vit = create_eva_vit_g(
+            img_size, drop_path_rate, use_grad_checkpoint, precision
+        )
+        vit_norm = LayerNorm(vit.num_features)
+        return vit, vit_norm
+
+    def load_from_pretrained(self, url_or_filename):
+        """
+        Load the ``"model"`` entry of a checkpoint non-strictly into this module.
+
+        Raises:
+            - RuntimeError: if ``url_or_filename`` is neither a URL nor an existing file.
+        """
+        if is_url(url_or_filename):
+            local_path = download_cached_file(
+                url_or_filename, check_hash=False, progress=True
+            )
+        elif os.path.isfile(url_or_filename):
+            local_path = url_or_filename
+        else:
+            raise RuntimeError("checkpoint url or path is invalid")
+
+        checkpoint = torch.load(local_path, map_location="cpu")
+        weights = checkpoint["model"]
+
+        msg = self.load_state_dict(weights, strict=False)
+
+        # logging.info("Missing keys {}".format(msg.missing_keys))
+        logging.info("load checkpoint from %s" % url_or_filename)
+
+        return msg
+
+
+def disabled_train(self, mode=True):
+    """
+    Drop-in replacement for ``model.train``: ignores ``mode`` and returns the module
+    unchanged, so the train/eval state can no longer be switched.
+    """
+    frozen_module = self
+    return frozen_module
+
+
+class LayerNorm(nn.LayerNorm):
+    """nn.LayerNorm variant that normalises in float32 and casts back (fp16-friendly)."""
+
+    def forward(self, x: torch.Tensor):
+        input_dtype = x.dtype
+        # run the actual normalisation in full precision
+        x_fp32 = x.type(torch.float32)
+        normed = super().forward(x_fp32)
+        # hand the result back in the caller's dtype
+        return normed.type(input_dtype)
+
+
+def compute_sim_matrix(model, data_loader, **kwargs):
+    """
+    Compute image->text and text->image score matrices for retrieval evaluation.
+
+    A coarse similarity between every image and every text is computed first; the
+    top ``k_test`` candidates of each row are then re-scored with ``model.compute_itm``
+    and the sum (itm score + coarse similarity) is written into the score matrix.
+    All other entries keep the initial value -100.
+
+    Args:
+        model: must provide ``tokenizer``, ``device``, ``forward_text``, ``text_proj``,
+            ``forward_image``, ``vision_proj`` and ``compute_itm``.
+        data_loader: iterable of samples with an ``"image"`` entry; its ``dataset``
+            must expose ``text`` and ``image``.
+        **kwargs: must contain ``k_test`` (KeyError otherwise).
+
+    Returns:
+        (score_matrix_i2t, score_matrix_t2i) as numpy arrays of shape
+        (num_images, num_texts) and (num_texts, num_images).
+    """
+    k_test = kwargs.pop("k_test")
+
+    metric_logger = MetricLogger(delimiter=" ")
+    header = "Evaluation:"
+
+    logging.info("Computing features for evaluation...")
+    start_time = time.time()
+
+    # ---- text features, computed in batches of `text_bs` ----
+    texts = data_loader.dataset.text
+    num_text = len(texts)
+    text_bs = 256
+    text_ids = []
+    text_embeds = []
+    text_atts = []
+    for i in range(0, num_text, text_bs):
+        text = texts[i : min(num_text, i + text_bs)]
+        text_input = model.tokenizer(
+            text,
+            padding="max_length",
+            truncation=True,
+            max_length=35,
+            return_tensors="pt",
+        ).to(model.device)
+        text_feat = model.forward_text(text_input)
+        text_embed = F.normalize(model.text_proj(text_feat))
+        text_embeds.append(text_embed)
+        text_ids.append(text_input.input_ids)
+        text_atts.append(text_input.attention_mask)
+
+    text_embeds = torch.cat(text_embeds, dim=0)
+    text_ids = torch.cat(text_ids, dim=0)
+    text_atts = torch.cat(text_atts, dim=0)
+
+    # ---- image features ----
+    vit_feats = []
+    image_embeds = []
+    for samples in data_loader:
+        image = samples["image"]
+
+        image = image.to(model.device)
+        image_feat, vit_feat = model.forward_image(image)
+        image_embed = model.vision_proj(image_feat)
+        image_embed = F.normalize(image_embed, dim=-1)
+
+        # raw ViT features are parked on the CPU and moved back per query below
+        vit_feats.append(vit_feat.cpu())
+        image_embeds.append(image_embed)
+
+    vit_feats = torch.cat(vit_feats, dim=0)
+    image_embeds = torch.cat(image_embeds, dim=0)
+
+    # ---- coarse similarity: one row per image, one column per text ----
+    sims_matrix = []
+    for image_embed in image_embeds:
+        # presumably image_embed is (num_query_tokens, dim), so the max over dim 0
+        # keeps the best-matching query per text -- TODO confirm
+        sim_q2t = image_embed @ text_embeds.t()
+        sim_i2t, _ = sim_q2t.max(0)
+        sims_matrix.append(sim_i2t)
+    sims_matrix = torch.stack(sims_matrix, dim=0)
+
+    score_matrix_i2t = torch.full(
+        (len(data_loader.dataset.image), len(texts)), -100.0
+    ).to(model.device)
+
+    # each rank re-scores a contiguous slice [start, end) of the rows
+    num_tasks = dist_utils.get_world_size()
+    rank = dist_utils.get_rank()
+    step = sims_matrix.size(0) // num_tasks + 1
+    start = rank * step
+    end = min(sims_matrix.size(0), start + step)
+
+    # ---- image -> text: re-score the top-k texts of every image ----
+    for i, sims in enumerate(
+        metric_logger.log_every(sims_matrix[start:end], 50, header)
+    ):
+        topk_sim, topk_idx = sims.topk(k=k_test, dim=0)
+        image_inputs = vit_feats[start + i].repeat(k_test, 1, 1).to(model.device)
+        score = model.compute_itm(
+            image_inputs=image_inputs,
+            text_ids=text_ids[topk_idx],
+            text_atts=text_atts[topk_idx],
+        ).float()
+        score_matrix_i2t[start + i, topk_idx] = score + topk_sim
+
+    # ---- text -> image: same procedure on the transposed similarity matrix ----
+    sims_matrix = sims_matrix.t()
+    score_matrix_t2i = torch.full(
+        (len(texts), len(data_loader.dataset.image)), -100.0
+    ).to(model.device)
+
+    step = sims_matrix.size(0) // num_tasks + 1
+    start = rank * step
+    end = min(sims_matrix.size(0), start + step)
+
+    for i, sims in enumerate(
+        metric_logger.log_every(sims_matrix[start:end], 50, header)
+    ):
+        topk_sim, topk_idx = sims.topk(k=k_test, dim=0)
+        image_inputs = vit_feats[topk_idx.cpu()].to(model.device)
+        score = model.compute_itm(
+            image_inputs=image_inputs,
+            text_ids=text_ids[start + i].repeat(k_test, 1),
+            text_atts=text_atts[start + i].repeat(k_test, 1),
+        ).float()
+        score_matrix_t2i[start + i, topk_idx] = score + topk_sim
+
+    # Merge the per-rank slices by summing the matrices across ranks.
+    # NOTE(review): every rank initialises its full matrix to -100, so after the SUM
+    # the filled entries carry an extra -100 * (world_size - 1) and untouched entries
+    # become -100 * world_size -- confirm downstream code only relies on ranking.
+    if dist_utils.is_dist_avail_and_initialized():
+        dist.barrier()
+        torch.distributed.all_reduce(
+            score_matrix_i2t, op=torch.distributed.ReduceOp.SUM
+        )
+        torch.distributed.all_reduce(
+            score_matrix_t2i, op=torch.distributed.ReduceOp.SUM
+        )
+
+    total_time = time.time() - start_time
+    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
+    logging.info("Evaluation time {}".format(total_time_str))
+
+    return score_matrix_i2t.cpu().numpy(), score_matrix_t2i.cpu().numpy()
diff --git a/minigpt4/models/blip2_outputs.py b/minigpt4/models/blip2_outputs.py
new file mode 100644
index 0000000000000000000000000000000000000000..1135e8b51b8a92e0ea5bf67725fb989236e7a703
--- /dev/null
+++ b/minigpt4/models/blip2_outputs.py
@@ -0,0 +1,110 @@
+"""
+ Copyright (c) 2022, salesforce.com, inc.
+ All rights reserved.
+ SPDX-License-Identifier: BSD-3-Clause
+ For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+"""
+
+from dataclasses import dataclass
+from typing import Optional
+
+import torch
+from transformers.modeling_outputs import (
+ ModelOutput,
+ BaseModelOutputWithPoolingAndCrossAttentions,
+ CausalLMOutputWithCrossAttentions,
+)
+
+
+@dataclass
+class BlipSimilarity(ModelOutput):
+    # Container for image/text similarity tensors; every field defaults to None.
+
+    # image-to-text and text-to-image similarities
+    sim_i2t: torch.FloatTensor = None
+    sim_t2i: torch.FloatTensor = None
+
+    # presumably the same similarities computed with the momentum encoders -- TODO confirm
+    sim_i2t_m: Optional[torch.FloatTensor] = None
+    sim_t2i_m: Optional[torch.FloatTensor] = None
+
+    # targets associated with the two similarity directions
+    sim_i2t_targets: Optional[torch.FloatTensor] = None
+    sim_t2i_targets: Optional[torch.FloatTensor] = None
+
+
+@dataclass
+class BlipIntermediateOutput(ModelOutput):
+    """
+    Data class for intermediate outputs of BLIP models.
+
+    All fields default to None, so a model only fills in the ones it computes.
+
+    image_embeds (torch.FloatTensor): Image embeddings, shape (batch_size, num_patches, embed_dim).
+    text_embeds (torch.FloatTensor): Text embeddings, shape (batch_size, seq_len, embed_dim).
+
+    image_embeds_m (torch.FloatTensor): Image embeddings from momentum visual encoder, shape (batch_size, num_patches, embed_dim).
+    text_embeds_m (torch.FloatTensor): Text embeddings from momentum text encoder, shape (batch_size, seq_len, embed_dim).
+
+    encoder_output (BaseModelOutputWithPoolingAndCrossAttentions): output from the image-grounded text encoder.
+    encoder_output_neg (BaseModelOutputWithPoolingAndCrossAttentions): output from the image-grounded text encoder for negative pairs.
+
+    decoder_output (CausalLMOutputWithCrossAttentions): output from the image-grounded text decoder.
+    decoder_labels (torch.LongTensor): labels for the captioning loss.
+
+    itm_logits (torch.FloatTensor): logits for the image-text matching loss, shape (batch_size * 3, 2).
+    itm_labels (torch.LongTensor): labels for the image-text matching loss, shape (batch_size * 3,)
+
+    """
+
+    # uni-modal features
+    image_embeds: torch.FloatTensor = None
+    text_embeds: Optional[torch.FloatTensor] = None
+
+    # uni-modal features from the momentum encoders
+    image_embeds_m: Optional[torch.FloatTensor] = None
+    text_embeds_m: Optional[torch.FloatTensor] = None
+
+    # intermediate outputs of multimodal encoder
+    encoder_output: Optional[BaseModelOutputWithPoolingAndCrossAttentions] = None
+    encoder_output_neg: Optional[BaseModelOutputWithPoolingAndCrossAttentions] = None
+
+    # image-text matching head
+    itm_logits: Optional[torch.FloatTensor] = None
+    itm_labels: Optional[torch.LongTensor] = None
+
+    # intermediate outputs of multimodal decoder
+    decoder_output: Optional[CausalLMOutputWithCrossAttentions] = None
+    decoder_labels: Optional[torch.LongTensor] = None
+
+
+@dataclass
+class BlipOutput(ModelOutput):
+    # Top-level output container of BLIP models; every field defaults to None.
+
+    # some finetuned models (e.g. BlipVQA) do not compute similarity, thus optional.
+    sims: Optional[BlipSimilarity] = None
+
+    # intermediate tensors, see BlipIntermediateOutput
+    intermediate_output: BlipIntermediateOutput = None
+
+    # overall loss
+    loss: Optional[torch.FloatTensor] = None
+
+    # presumably the image-text contrastive loss -- TODO confirm naming
+    loss_itc: Optional[torch.FloatTensor] = None
+
+    # presumably the image-text matching loss -- TODO confirm naming
+    loss_itm: Optional[torch.FloatTensor] = None
+
+    # presumably the language-modeling loss -- TODO confirm naming
+    loss_lm: Optional[torch.FloatTensor] = None
+
+
+@dataclass
+class BlipOutputFeatures(ModelOutput):
+    """
+    Data class of features from BlipFeatureExtractor.
+
+    Args:
+        image_embeds: (torch.FloatTensor) of shape (batch_size, num_patches+1, embed_dim), optional
+        image_embeds_proj: (torch.FloatTensor) of shape (batch_size, num_patches+1, feature_dim), optional
+        text_embeds: (torch.FloatTensor) of shape (batch_size, sequence_length+1, embed_dim), optional
+        text_embeds_proj: (torch.FloatTensor) of shape (batch_size, sequence_length+1, feature_dim), optional
+        multimodal_embeds: (torch.FloatTensor), optional
+
+    The first embedding or feature is for the [CLS] token.
+
+    The ``*_proj`` features are obtained by projecting the corresponding embedding into a normalized low-dimensional space.
+    """
+
+    # image embeddings and their projected counterpart
+    image_embeds: Optional[torch.FloatTensor] = None
+    image_embeds_proj: Optional[torch.FloatTensor] = None
+
+    # text embeddings and their projected counterpart
+    text_embeds: Optional[torch.FloatTensor] = None
+    text_embeds_proj: Optional[torch.FloatTensor] = None
+
+    # presumably joint image-text embeddings -- TODO confirm against the feature extractor
+    multimodal_embeds: Optional[torch.FloatTensor] = None
diff --git a/minigpt4/models/eva_vit.py b/minigpt4/models/eva_vit.py
new file mode 100644
index 0000000000000000000000000000000000000000..7fcc63a74049f1faf65c99943ef94f72383ca3f5
--- /dev/null
+++ b/minigpt4/models/eva_vit.py
@@ -0,0 +1,442 @@
+# Based on EVA, BEIT, timm and DeiT code bases
+# https://github.com/baaivision/EVA
+# https://github.com/rwightman/pytorch-image-models/tree/master/timm
+# https://github.com/microsoft/unilm/tree/master/beit
+# https://github.com/facebookresearch/deit/
+# https://github.com/facebookresearch/dino
+# --------------------------------------------------------'
+import math
+from functools import partial
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torch.utils.checkpoint as checkpoint
+from timm.models.layers import drop_path, to_2tuple, trunc_normal_
+from timm.models.registry import register_model
+
+from minigpt4.common.dist_utils import download_cached_file
+
+def _cfg(url='', **kwargs):
+    """Return the default model config dict for ``url``; ``kwargs`` override or extend it."""
+    config = dict(
+        url=url,
+        num_classes=1000,
+        input_size=(3, 224, 224),
+        pool_size=None,
+        crop_pct=.9,
+        interpolation='bicubic',
+        mean=(0.5, 0.5, 0.5),
+        std=(0.5, 0.5, 0.5),
+    )
+    # caller-supplied entries win over the defaults
+    config.update(kwargs)
+    return config
+
+
+class DropPath(nn.Module):
+    """Stochastic depth: per-sample path dropping, for use on the main path of residual blocks.
+    """
+    def __init__(self, drop_prob=None):
+        super().__init__()
+        self.drop_prob = drop_prob
+
+    def forward(self, x):
+        # delegate to timm's functional implementation; only active in training mode
+        is_training = self.training
+        return drop_path(x, self.drop_prob, is_training)
+
+    def extra_repr(self) -> str:
+        return f'p={self.drop_prob}'
+
+
+class Mlp(nn.Module):
+    """Two-layer feed-forward block: fc1 -> activation -> fc2 -> dropout."""
+
+    def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.):
+        super().__init__()
+        # hidden and output widths fall back to the input width when not given
+        hidden_width = hidden_features or in_features
+        output_width = out_features or in_features
+        self.fc1 = nn.Linear(in_features, hidden_width)
+        self.act = act_layer()
+        self.fc2 = nn.Linear(hidden_width, output_width)
+        self.drop = nn.Dropout(drop)
+
+    def forward(self, x):
+        hidden = self.act(self.fc1(x))
+        # no dropout after the activation (commented out to match the original BERT
+        # implementation); dropout is applied only to the output
+        projected = self.fc2(hidden)
+        return self.drop(projected)
+
+
+class Attention(nn.Module):
+    """Multi-head self-attention with optional learnable q/v biases and an optional
+    per-layer relative position bias table (enabled by passing ``window_size``)."""
+
+    def __init__(
+            self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0.,
+            proj_drop=0., window_size=None, attn_head_dim=None):
+        """
+        Args:
+            dim: size of the input (and output) feature dimension.
+            num_heads: number of attention heads.
+            qkv_bias: if True, create learnable biases for q and v (k gets none).
+            qk_scale: scaling applied to q; falls back to ``head_dim ** -0.5`` when falsy.
+            attn_drop: dropout probability on the attention weights.
+            proj_drop: dropout probability after the output projection.
+            window_size: (height, width) of the token grid; when truthy a relative
+                position bias table and its index buffer are created.
+            attn_head_dim: overrides the per-head dimension ``dim // num_heads``.
+        """
+        super().__init__()
+        self.num_heads = num_heads
+        head_dim = dim // num_heads
+        if attn_head_dim is not None:
+            head_dim = attn_head_dim
+        all_head_dim = head_dim * self.num_heads
+        self.scale = qk_scale or head_dim ** -0.5
+
+        # The projection itself is bias-free; the bias vector is assembled in forward().
+        self.qkv = nn.Linear(dim, all_head_dim * 3, bias=False)
+        if qkv_bias:
+            self.q_bias = nn.Parameter(torch.zeros(all_head_dim))
+            self.v_bias = nn.Parameter(torch.zeros(all_head_dim))
+        else:
+            self.q_bias = None
+            self.v_bias = None
+
+        if window_size:
+            self.window_size = window_size
+            # the 3 extra entries are the cls-related cases listed below
+            self.num_relative_distance = (2 * window_size[0] - 1) * (2 * window_size[1] - 1) + 3
+            self.relative_position_bias_table = nn.Parameter(
+                torch.zeros(self.num_relative_distance, num_heads))  # 2*Wh-1 * 2*Ww-1, nH
+            # cls to token & token 2 cls & cls to cls
+
+            # get pair-wise relative position index for each token inside the window
+            coords_h = torch.arange(window_size[0])
+            coords_w = torch.arange(window_size[1])
+            coords = torch.stack(torch.meshgrid([coords_h, coords_w]))  # 2, Wh, Ww
+            coords_flatten = torch.flatten(coords, 1)  # 2, Wh*Ww
+            relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :]  # 2, Wh*Ww, Wh*Ww
+            relative_coords = relative_coords.permute(1, 2, 0).contiguous()  # Wh*Ww, Wh*Ww, 2
+            relative_coords[:, :, 0] += window_size[0] - 1  # shift to start from 0
+            relative_coords[:, :, 1] += window_size[1] - 1
+            # make the row offset the major component so (row, col) maps to a unique flat index
+            relative_coords[:, :, 0] *= 2 * window_size[1] - 1
+            # one extra row/column (index 0) is reserved for the cls token
+            relative_position_index = \
+                torch.zeros(size=(window_size[0] * window_size[1] + 1, ) * 2, dtype=relative_coords.dtype)
+            relative_position_index[1:, 1:] = relative_coords.sum(-1)  # Wh*Ww, Wh*Ww
+            # the last three table entries are used for row 0, column 0 and the (0, 0) cell
+            relative_position_index[0, 0:] = self.num_relative_distance - 3
+            relative_position_index[0:, 0] = self.num_relative_distance - 2
+            relative_position_index[0, 0] = self.num_relative_distance - 1
+
+            self.register_buffer("relative_position_index", relative_position_index)
+        else:
+            self.window_size = None
+            self.relative_position_bias_table = None
+            self.relative_position_index = None
+
+        self.attn_drop = nn.Dropout(attn_drop)
+        self.proj = nn.Linear(all_head_dim, dim)
+        self.proj_drop = nn.Dropout(proj_drop)
+
+    def forward(self, x, rel_pos_bias=None):
+        """
+        Args:
+            x: tensor unpacked as (B, N, C).
+            rel_pos_bias: optional bias added to the attention logits, e.g. one shared
+                across layers; applied in addition to this layer's own table, if any.
+
+        Returns:
+            Tensor with the same (B, N) leading shape, projected back by ``self.proj``.
+        """
+        B, N, C = x.shape
+        qkv_bias = None
+        if self.q_bias is not None:
+            # k has no learnable bias: zeros are placed between q_bias and v_bias
+            qkv_bias = torch.cat((self.q_bias, torch.zeros_like(self.v_bias, requires_grad=False), self.v_bias))
+        # qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
+        qkv = F.linear(input=x, weight=self.qkv.weight, bias=qkv_bias)
+        qkv = qkv.reshape(B, N, 3, self.num_heads, -1).permute(2, 0, 3, 1, 4)
+        q, k, v = qkv[0], qkv[1], qkv[2]  # make torchscript happy (cannot use tensor as tuple)
+
+        q = q * self.scale
+        attn = (q @ k.transpose(-2, -1))
+
+        if self.relative_position_bias_table is not None:
+            # look up this layer's bias for every (query, key) position pair
+            relative_position_bias = \
+                self.relative_position_bias_table[self.relative_position_index.view(-1)].view(
+                    self.window_size[0] * self.window_size[1] + 1,
+                    self.window_size[0] * self.window_size[1] + 1, -1)  # Wh*Ww,Wh*Ww,nH
+            relative_position_bias = relative_position_bias.permute(2, 0, 1).contiguous()  # nH, Wh*Ww, Wh*Ww
+            attn = attn + relative_position_bias.unsqueeze(0)
+
+        if rel_pos_bias is not None:
+            attn = attn + rel_pos_bias
+
+        attn = attn.softmax(dim=-1)
+        attn = self.attn_drop(attn)
+
+        # merge the heads back into a single feature dimension
+        x = (attn @ v).transpose(1, 2).reshape(B, N, -1)
+        x = self.proj(x)
+        x = self.proj_drop(x)
+        return x
+
+
+class Block(nn.Module):
+    """Pre-norm transformer block: attention and MLP sub-layers, each wrapped in a
+    residual connection with optional drop-path and optional learnable scaling
+    (``gamma_1`` / ``gamma_2``, enabled by a positive ``init_values``)."""
+
+    def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop=0., attn_drop=0.,
+                 drop_path=0., init_values=None, act_layer=nn.GELU, norm_layer=nn.LayerNorm,
+                 window_size=None, attn_head_dim=None):
+        super().__init__()
+        self.norm1 = norm_layer(dim)
+        self.attn = Attention(
+            dim,
+            num_heads=num_heads,
+            qkv_bias=qkv_bias,
+            qk_scale=qk_scale,
+            attn_drop=attn_drop,
+            proj_drop=drop,
+            window_size=window_size,
+            attn_head_dim=attn_head_dim,
+        )
+        # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here
+        if drop_path > 0.:
+            self.drop_path = DropPath(drop_path)
+        else:
+            self.drop_path = nn.Identity()
+        self.norm2 = norm_layer(dim)
+        self.mlp = Mlp(
+            in_features=dim,
+            hidden_features=int(dim * mlp_ratio),
+            act_layer=act_layer,
+            drop=drop,
+        )
+
+        use_layer_scale = init_values is not None and init_values > 0
+        if use_layer_scale:
+            self.gamma_1 = nn.Parameter(init_values * torch.ones((dim)), requires_grad=True)
+            self.gamma_2 = nn.Parameter(init_values * torch.ones((dim)), requires_grad=True)
+        else:
+            self.gamma_1 = None
+            self.gamma_2 = None
+
+    def forward(self, x, rel_pos_bias=None):
+        scaled = self.gamma_1 is not None
+
+        # attention sub-layer
+        attn_out = self.attn(self.norm1(x), rel_pos_bias=rel_pos_bias)
+        if scaled:
+            attn_out = self.gamma_1 * attn_out
+        x = x + self.drop_path(attn_out)
+
+        # feed-forward sub-layer
+        mlp_out = self.mlp(self.norm2(x))
+        if scaled:
+            mlp_out = self.gamma_2 * mlp_out
+        x = x + self.drop_path(mlp_out)
+        return x
+
+
+class PatchEmbed(nn.Module):
+    """ Split an image into non-overlapping patches and embed each one with a strided conv.
+    """
+    def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768):
+        super().__init__()
+        img_size = to_2tuple(img_size)
+        patch_size = to_2tuple(patch_size)
+        grid_h = img_size[0] // patch_size[0]
+        grid_w = img_size[1] // patch_size[1]
+
+        self.patch_shape = (grid_h, grid_w)
+        self.img_size = img_size
+        self.patch_size = patch_size
+        self.num_patches = grid_w * grid_h
+
+        # kernel == stride, so every patch is projected exactly once
+        self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size)
+
+    def forward(self, x, **kwargs):
+        _, _, height, width = x.shape
+        expected_h, expected_w = self.img_size[0], self.img_size[1]
+        # FIXME look at relaxing size constraints
+        assert height == expected_h and width == expected_w, \
+            f"Input image size ({height}*{width}) doesn't match model ({expected_h}*{expected_w})."
+        patches = self.proj(x)
+        # flatten the spatial grid and move the embedding dimension last
+        return patches.flatten(2).transpose(1, 2)
+
+
+class RelativePositionBias(nn.Module):
+    """Learnable relative position bias over a ``window_size`` token grid plus one
+    extra leading position (the cls token, per the comments below).
+
+    ``forward()`` takes no input and returns the bias tensor laid out as
+    (num_heads, Wh*Ww + 1, Wh*Ww + 1).
+    """
+
+    def __init__(self, window_size, num_heads):
+        """
+        Args:
+            window_size: (height, width) of the token grid.
+            num_heads: number of attention heads; one bias value per head and offset.
+        """
+        super().__init__()
+        self.window_size = window_size
+        # the 3 extra entries are the cls-related cases listed below
+        self.num_relative_distance = (2 * window_size[0] - 1) * (2 * window_size[1] - 1) + 3
+        self.relative_position_bias_table = nn.Parameter(
+            torch.zeros(self.num_relative_distance, num_heads))  # 2*Wh-1 * 2*Ww-1, nH
+        # cls to token & token 2 cls & cls to cls
+
+        # get pair-wise relative position index for each token inside the window
+        coords_h = torch.arange(window_size[0])
+        coords_w = torch.arange(window_size[1])
+        coords = torch.stack(torch.meshgrid([coords_h, coords_w]))  # 2, Wh, Ww
+        coords_flatten = torch.flatten(coords, 1)  # 2, Wh*Ww
+        relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :]  # 2, Wh*Ww, Wh*Ww
+        relative_coords = relative_coords.permute(1, 2, 0).contiguous()  # Wh*Ww, Wh*Ww, 2
+        relative_coords[:, :, 0] += window_size[0] - 1  # shift to start from 0
+        relative_coords[:, :, 1] += window_size[1] - 1
+        # make the row offset the major component so (row, col) maps to a unique flat index
+        relative_coords[:, :, 0] *= 2 * window_size[1] - 1
+        # one extra row/column (index 0) is reserved for the cls token
+        relative_position_index = \
+            torch.zeros(size=(window_size[0] * window_size[1] + 1,) * 2, dtype=relative_coords.dtype)
+        relative_position_index[1:, 1:] = relative_coords.sum(-1)  # Wh*Ww, Wh*Ww
+        # the last three table entries are used for row 0, column 0 and the (0, 0) cell
+        relative_position_index[0, 0:] = self.num_relative_distance - 3
+        relative_position_index[0:, 0] = self.num_relative_distance - 2
+        relative_position_index[0, 0] = self.num_relative_distance - 1
+
+        self.register_buffer("relative_position_index", relative_position_index)
+
+        # trunc_normal_(self.relative_position_bias_table, std=.02)
+
+    def forward(self):
+        # gather the table entry for every (query, key) position pair
+        relative_position_bias = \
+            self.relative_position_bias_table[self.relative_position_index.view(-1)].view(
+                self.window_size[0] * self.window_size[1] + 1,
+                self.window_size[0] * self.window_size[1] + 1, -1)  # Wh*Ww,Wh*Ww,nH
+        return relative_position_bias.permute(2, 0, 1).contiguous()  # nH, Wh*Ww, Wh*Ww
+
+
+class VisionTransformer(nn.Module):
+ """ Vision Transformer with support for patch or hybrid CNN input stage
+ """
+ def __init__(self, img_size=224, patch_size=16, in_chans=3, num_classes=1000, embed_dim=768, depth=12,
+ num_heads=12, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop_rate=0., attn_drop_rate=0.,
+ drop_path_rate=0., norm_layer=nn.LayerNorm, init_values=None,
+ use_abs_pos_emb=True, use_rel_pos_bias=False, use_shared_rel_pos_bias=False,
+ use_mean_pooling=True, init_scale=0.001, use_checkpoint=False):
+ super().__init__()
+ self.image_size = img_size
+ self.num_classes = num_classes
+ self.num_features = self.embed_dim = embed_dim # num_features for consistency with other models
+
+ self.patch_embed = PatchEmbed(
+ img_size=img_size, patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim)
+ num_patches = self.patch_embed.num_patches
+
+ self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
+ if use_abs_pos_emb:
+ self.pos_embed = nn.Parameter(torch.zeros(1, num_patches + 1, embed_dim))
+ else:
+ self.pos_embed = None
+ self.pos_drop = nn.Dropout(p=drop_rate)
+
+ if use_shared_rel_pos_bias:
+ self.rel_pos_bias = RelativePositionBias(window_size=self.patch_embed.patch_shape, num_heads=num_heads)
+ else:
+ self.rel_pos_bias = None
+ self.use_checkpoint = use_checkpoint
+
+ dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)] # stochastic depth decay rule
+ self.use_rel_pos_bias = use_rel_pos_bias
+ self.blocks = nn.ModuleList([
+ Block(
+ dim=embed_dim, num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale,
+ drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[i], norm_layer=norm_layer,
+ init_values=init_values, window_size=self.patch_embed.patch_shape if use_rel_pos_bias else None)
+ for i in range(depth)])
+# self.norm = nn.Identity() if use_mean_pooling else norm_layer(embed_dim)
+# self.fc_norm = norm_layer(embed_dim) if use_mean_pooling else None
+# self.head = nn.Linear(embed_dim, num_classes) if num_classes > 0 else nn.Identity()
+
+ if self.pos_embed is not None:
+ trunc_normal_(self.pos_embed, std=.02)
+ trunc_normal_(self.cls_token, std=.02)
+ # trunc_normal_(self.mask_token, std=.02)
+# if isinstance(self.head, nn.Linear):
+# trunc_normal_(self.head.weight, std=.02)
+ self.apply(self._init_weights)
+ self.fix_init_weight()
+# if isinstance(self.head, nn.Linear):
+# self.head.weight.data.mul_(init_scale)
+# self.head.bias.data.mul_(init_scale)
+
+ def fix_init_weight(self):
+ def rescale(param, layer_id):
+ param.div_(math.sqrt(2.0 * layer_id))
+
+ for layer_id, layer in enumerate(self.blocks):
+ rescale(layer.attn.proj.weight.data, layer_id + 1)
+ rescale(layer.mlp.fc2.weight.data, layer_id + 1)
+
+ def _init_weights(self, m):
+ if isinstance(m, nn.Linear):
+ trunc_normal_(m.weight, std=.02)
+ if isinstance(m, nn.Linear) and m.bias is not None:
+ nn.init.constant_(m.bias, 0)
+ elif isinstance(m, nn.LayerNorm):
+ nn.init.constant_(m.bias, 0)
+ nn.init.constant_(m.weight, 1.0)
+
+ def get_classifier(self):
+ return self.head
+
+ def reset_classifier(self, num_classes, global_pool=''):
+ self.num_classes = num_classes
+ self.head = nn.Linear(self.embed_dim, num_classes) if num_classes > 0 else nn.Identity()
+
+    def forward_features(self, x):
+        """Embed ``x`` into patch tokens and run them through every block.
+
+        A class token is prepended, the absolute position embedding is added
+        when the model has one, and the output of the last block is returned
+        as-is (no final norm or pooling is applied here).
+        """
+        tokens = self.patch_embed(x)
+        num_samples, _, _ = tokens.size()
+
+        # stole cls_tokens impl from Phil Wang, thanks
+        cls_tokens = self.cls_token.expand(num_samples, -1, -1)
+        tokens = torch.cat((cls_tokens, tokens), dim=1)
+        if self.pos_embed is not None:
+            tokens = tokens + self.pos_embed
+        tokens = self.pos_drop(tokens)
+
+        # The relative position bias is computed once and handed to every block.
+        shared_bias = None if self.rel_pos_bias is None else self.rel_pos_bias()
+        for block in self.blocks:
+            if self.use_checkpoint:
+                tokens = checkpoint.checkpoint(block, tokens, shared_bias)
+            else:
+                tokens = block(tokens, shared_bias)
+        return tokens
+# x = self.norm(x)
+
+# if self.fc_norm is not None:
+# t = x[:, 1:, :]
+# return self.fc_norm(t.mean(1))
+# else:
+# return x[:, 0]
+
+    def forward(self, x):
+        """Return ``forward_features(x)`` unchanged; no head is applied."""
+        # x = self.head(x)
+        return self.forward_features(x)
+
+    def get_intermediate_layers(self, x):
+        """Run the encoder and return the output of every block as a list.
+
+        The input pipeline matches ``forward_features`` (patch embedding, class
+        token, optional position embedding, dropout), but the blocks are always
+        called directly and each block's output is collected in order.
+        """
+        hidden = self.patch_embed(x)
+        num_samples, _, _ = hidden.size()
+
+        # stole cls_tokens impl from Phil Wang, thanks
+        cls_tokens = self.cls_token.expand(num_samples, -1, -1)
+        hidden = torch.cat((cls_tokens, hidden), dim=1)
+        if self.pos_embed is not None:
+            hidden = hidden + self.pos_embed
+        hidden = self.pos_drop(hidden)
+
+        shared_bias = None if self.rel_pos_bias is None else self.rel_pos_bias()
+        collected = []
+        for block in self.blocks:
+            hidden = block(hidden, shared_bias)
+            collected.append(hidden)
+        return collected
+
+
+def interpolate_pos_embed(model, checkpoint_model):
+    """Resize a checkpoint's absolute position embedding to the model's patch grid.
+
+    ``checkpoint_model`` is modified in place: when its ``'pos_embed'`` entry
+    was produced for a different (square) patch grid than ``model`` uses, the
+    per-patch embeddings are bicubically interpolated to the new grid and the
+    entry is replaced. The leading extra tokens (class token etc.) are copied
+    through unchanged.
+
+    Nothing happens when the checkpoint has no ``'pos_embed'`` entry, when the
+    model has no absolute position embedding (``model.pos_embed`` is ``None``),
+    or when both grids already have the same size.
+
+    Args:
+        model: object exposing ``patch_embed.num_patches`` and ``pos_embed``.
+        checkpoint_model: state dict (mapping of names to tensors).
+    """
+    if 'pos_embed' not in checkpoint_model:
+        return
+    # A model built without an absolute position embedding keeps ``pos_embed``
+    # as None; dereferencing ``.shape`` on it would raise AttributeError.
+    if getattr(model, 'pos_embed', None) is None:
+        return
+
+    pos_embed_checkpoint = checkpoint_model['pos_embed'].float()
+    embedding_size = pos_embed_checkpoint.shape[-1]
+    num_patches = model.patch_embed.num_patches
+    num_extra_tokens = model.pos_embed.shape[-2] - num_patches
+    # height (== width) for the checkpoint position embedding
+    orig_size = int((pos_embed_checkpoint.shape[-2] - num_extra_tokens) ** 0.5)
+    # height (== width) for the new position embedding
+    new_size = int(num_patches ** 0.5)
+    if orig_size == new_size:
+        return
+
+    print("Position interpolate from %dx%d to %dx%d" % (orig_size, orig_size, new_size, new_size))
+    # class_token and dist_token are kept unchanged
+    extra_tokens = pos_embed_checkpoint[:, :num_extra_tokens]
+    # only the position tokens are interpolated
+    pos_tokens = pos_embed_checkpoint[:, num_extra_tokens:]
+    # (1, N, C) -> (1, C, H, W) so the spatial interpolation sees a 2-D grid
+    pos_tokens = pos_tokens.reshape(-1, orig_size, orig_size, embedding_size).permute(0, 3, 1, 2)
+    pos_tokens = torch.nn.functional.interpolate(
+        pos_tokens, size=(new_size, new_size), mode='bicubic', align_corners=False)
+    # back to (1, H*W, C)
+    pos_tokens = pos_tokens.permute(0, 2, 3, 1).flatten(1, 2)
+    checkpoint_model['pos_embed'] = torch.cat((extra_tokens, pos_tokens), dim=1)
+
+
+def convert_weights_to_fp16(model: nn.Module):
+    """Convert applicable model parameters to fp16
+
+    Only ``nn.Conv1d``, ``nn.Conv2d`` and ``nn.Linear`` sub-modules are
+    touched: their weight, and bias when present, are replaced by half
+    precision copies. All other modules keep their dtype.
+    """
+    half_precision_layers = (nn.Conv1d, nn.Conv2d, nn.Linear)
+
+    def _to_half(layer):
+        if not isinstance(layer, half_precision_layers):
+            return
+        layer.weight.data = layer.weight.data.half()
+        if layer.bias is not None:
+            layer.bias.data = layer.bias.data.half()
+
+#        if isinstance(l, (nn.MultiheadAttention, Attention)):
+#            for attr in [*[f"{s}_proj_weight" for s in ["in", "q", "k", "v"]], "in_proj_bias", "bias_k", "bias_v"]:
+#                tensor = getattr(l, attr)
+#                if tensor is not None:
+#                    tensor.data = tensor.data.half()
+
+    model.apply(_to_half)
+
+
+def create_eva_vit_g(img_size=224,drop_path_rate=0.4,use_checkpoint=False,precision="fp16"):
+    """Build the EVA ViT-g encoder and load its pretrained weights.
+
+    The checkpoint is fetched through ``download_cached_file`` and loaded
+    non-strictly after its position embedding has been adapted to
+    ``img_size``. The weights are converted to half precision only when
+    ``precision == "fp16"``; any other value leaves them as loaded.
+
+    Args:
+        img_size: input resolution handed to ``VisionTransformer``.
+        drop_path_rate: stochastic-depth rate handed to ``VisionTransformer``.
+        use_checkpoint: forwarded to ``VisionTransformer``.
+        precision: ``"fp16"`` to convert conv/linear weights to half precision.
+
+    Returns:
+        The constructed ``VisionTransformer``.
+    """
+    vit_kwargs = dict(
+        img_size=img_size,
+        patch_size=14,
+        use_mean_pooling=False,
+        embed_dim=1408,
+        depth=39,
+        num_heads=1408//88,
+        mlp_ratio=4.3637,
+        qkv_bias=True,
+        drop_path_rate=drop_path_rate,
+        norm_layer=partial(nn.LayerNorm, eps=1e-6),
+        use_checkpoint=use_checkpoint,
+    )
+    model = VisionTransformer(**vit_kwargs)
+
+    checkpoint_url = "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/eva_vit_g.pth"
+    local_path = download_cached_file(checkpoint_url, check_hash=False, progress=True)
+    weights = torch.load(local_path, map_location="cpu")
+    interpolate_pos_embed(model, weights)
+
+    # Non-strict load: keys that do not match the model are tolerated.
+    model.load_state_dict(weights, strict=False)
+
+    if precision == "fp16":
+        convert_weights_to_fp16(model)
+    return model
\ No newline at end of file
diff --git a/minigpt4/models/mini_gpt4.py b/minigpt4/models/mini_gpt4.py
new file mode 100644
index 0000000000000000000000000000000000000000..a1c3a44d76d4554c26abb4d0700ff46d3c4c8a19
--- /dev/null
+++ b/minigpt4/models/mini_gpt4.py
@@ -0,0 +1,256 @@
+"""
+ Copyright (c) 2023, salesforce.com, inc.
+ All rights reserved.
+ SPDX-License-Identifier: BSD-3-Clause
+ For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+"""
+import logging
+import random
+
+import torch
+from torch.cuda.amp import autocast as autocast
+import torch.nn as nn
+
+from minigpt4.common.registry import registry
+from minigpt4.models.blip2 import Blip2Base, disabled_train
+from minigpt4.models.modeling_llama import LlamaForCausalLM
+from transformers import LlamaTokenizer
+
+
+@registry.register_model("mini_gpt4")
+class MiniGPT4(Blip2Base):
+    """
+    BLIP2 GPT-LLAMA model.
+
+    An image is encoded by the vision encoder and the Q-Former into
+    ``num_query_token`` query embeddings, ``llama_proj`` maps them to the LLaMA
+    hidden size, and the (frozen) LLaMA model is asked to regress the target
+    text that follows the image, optionally wrapped in a textual prompt.
+    """
+
+    PRETRAINED_MODEL_CONFIG_DICT = {
+        "pretrain_vicuna": "configs/models/minigpt4.yaml",
+    }
+
+    # Marker inside a prompt that shows where the image embeddings are spliced
+    # in. Prompts are split on it into a "before image" and "after image" part.
+    # NOTE(review): confirm the literal against the prompt files in use.
+    IMAGE_PLACEHOLDER = "<ImageHere>"
+
+    def __init__(
+        self,
+        vit_model="eva_clip_g",
+        q_former_model="https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_flant5xxl.pth",
+        img_size=224,
+        drop_path_rate=0,
+        use_grad_checkpoint=False,
+        vit_precision="fp16",
+        freeze_vit=True,
+        freeze_qformer=True,
+        num_query_token=32,
+        llama_model="",
+        llama_cache_dir='',
+        prompt_path="",
+        prompt_template="",
+        max_txt_len=32,
+        end_sym='\n',
+    ):
+        """Build the vision encoder, Q-Former, LLaMA model and projection layer.
+
+        Args:
+            vit_model, img_size, drop_path_rate, use_grad_checkpoint,
+            vit_precision: forwarded to ``init_vision_encoder``.
+            q_former_model: URL or path passed to ``load_from_pretrained``.
+            freeze_vit: freeze the vision encoder and its layer norm.
+            freeze_qformer: freeze the Q-Former and the query tokens.
+            num_query_token: number of learned query tokens.
+            llama_model: name or path given to the LLaMA tokenizer and model.
+            llama_cache_dir: optional ``cache_dir`` for loading LLaMA.
+            prompt_path: optional text file with one prompt per line; only
+                lines containing ``IMAGE_PLACEHOLDER`` are kept.
+            prompt_template: ``str.format`` template applied to each prompt.
+            max_txt_len: maximum number of target-text tokens.
+            end_sym: string appended to every target text.
+        """
+        super().__init__()
+
+        self.tokenizer = self.init_tokenizer()
+
+        print('Loading VIT')
+        self.visual_encoder, self.ln_vision = self.init_vision_encoder(
+            vit_model, img_size, drop_path_rate, use_grad_checkpoint, vit_precision
+        )
+        if freeze_vit:
+            for param in self.visual_encoder.parameters():
+                param.requires_grad = False
+            self.visual_encoder = self.visual_encoder.eval()
+            # Replace ``train`` so later ``model.train()`` calls cannot switch
+            # the frozen module back to training mode.
+            self.visual_encoder.train = disabled_train
+            for param in self.ln_vision.parameters():
+                param.requires_grad = False
+            self.ln_vision = self.ln_vision.eval()
+            self.ln_vision.train = disabled_train
+            logging.info("freeze vision encoder")
+        print('Loading VIT Done')
+
+        print('Loading Q-Former')
+        self.Qformer, self.query_tokens = self.init_Qformer(
+            num_query_token, self.visual_encoder.num_features
+        )
+        # Drop the Q-Former sub-modules that this model does not use.
+        self.Qformer.cls = None
+        self.Qformer.bert.embeddings.word_embeddings = None
+        self.Qformer.bert.embeddings.position_embeddings = None
+        for layer in self.Qformer.bert.encoder.layer:
+            layer.output = None
+            layer.intermediate = None
+        self.load_from_pretrained(url_or_filename=q_former_model)
+
+        if freeze_qformer:
+            for param in self.Qformer.parameters():
+                param.requires_grad = False
+            self.Qformer = self.Qformer.eval()
+            self.Qformer.train = disabled_train
+            self.query_tokens.requires_grad = False
+            logging.info("freeze Qformer")
+        print('Loading Q-Former Done')
+
+        print('Loading LLAMA')
+        self.llama_tokenizer = LlamaTokenizer.from_pretrained(llama_model, use_fast=False)
+        self.llama_tokenizer.pad_token = self.llama_tokenizer.eos_token
+
+        llama_kwargs = dict(load_in_8bit=True, torch_dtype=torch.float16, device_map={'': 0})
+        if llama_cache_dir:
+            llama_kwargs['cache_dir'] = llama_cache_dir
+        self.llama_model = LlamaForCausalLM.from_pretrained(llama_model, **llama_kwargs)
+        for param in self.llama_model.parameters():
+            param.requires_grad = False
+        print('Loading LLAMA Done')
+
+        self.llama_proj = nn.Linear(
+            self.Qformer.config.hidden_size, self.llama_model.config.hidden_size
+        )
+        self.max_txt_len = max_txt_len
+        self.end_sym = end_sym
+
+        if prompt_path:
+            with open(prompt_path, 'r') as f:
+                raw_prompts = f.read().splitlines()
+            # Keep only prompts that say where the image goes; ``prompt_wrap``
+            # cannot split a prompt that lacks the placeholder.
+            filted_prompts = [
+                raw_prompt for raw_prompt in raw_prompts
+                if self.IMAGE_PLACEHOLDER in raw_prompt
+            ]
+            self.prompt_list = [prompt_template.format(p) for p in filted_prompts]
+            print('Load {} training prompts'.format(len(self.prompt_list)))
+            if self.prompt_list:
+                # ``random.choice`` raises IndexError on an empty list.
+                print('Prompt Example \n{}'.format(random.choice(self.prompt_list)))
+        else:
+            self.prompt_list = []
+
+    def encode_img(self, image):
+        """Encode ``image`` into LLaMA-space embeddings.
+
+        Returns:
+            ``(inputs_llama, atts_llama)``: the projected Q-Former outputs and
+            an all-ones attention mask covering every dimension of
+            ``inputs_llama`` except the last.
+        """
+        with self.maybe_autocast():
+            image_embeds = self.ln_vision(self.visual_encoder(image))
+            image_atts = torch.ones(image_embeds.size()[:-1], dtype=torch.long).to(
+                image.device
+            )
+
+            query_tokens = self.query_tokens.expand(image_embeds.shape[0], -1, -1)
+            query_output = self.Qformer.bert(
+                query_embeds=query_tokens,
+                encoder_hidden_states=image_embeds,
+                encoder_attention_mask=image_atts,
+                return_dict=True,
+            )
+
+            inputs_llama = self.llama_proj(query_output.last_hidden_state)
+            atts_llama = torch.ones(inputs_llama.size()[:-1], dtype=torch.long).to(image.device)
+        return inputs_llama, atts_llama
+
+    def prompt_wrap(self, img_embeds, atts_img, prompt):
+        """Surround the image embeddings with the embedded text of ``prompt``.
+
+        ``prompt`` must contain ``IMAGE_PLACEHOLDER`` exactly once; the text
+        before and after it is tokenized, embedded and concatenated around
+        ``img_embeds``. An empty ``prompt`` returns the inputs unchanged.
+
+        Raises:
+            ValueError: if ``prompt`` is non-empty and does not split into
+                exactly two parts around the placeholder.
+        """
+        if not prompt:
+            return img_embeds, atts_img
+
+        batch_size = img_embeds.shape[0]
+        p_before, p_after = prompt.split(self.IMAGE_PLACEHOLDER)
+        p_before_tokens = self.llama_tokenizer(
+            p_before, return_tensors="pt", add_special_tokens=False).to(img_embeds.device)
+        p_after_tokens = self.llama_tokenizer(
+            p_after, return_tensors="pt", add_special_tokens=False).to(img_embeds.device)
+        embed_tokens = self.llama_model.model.embed_tokens
+        p_before_embeds = embed_tokens(p_before_tokens.input_ids).expand(batch_size, -1, -1)
+        p_after_embeds = embed_tokens(p_after_tokens.input_ids).expand(batch_size, -1, -1)
+        wrapped_img_embeds = torch.cat([p_before_embeds, img_embeds, p_after_embeds], dim=1)
+        # Stretch the first mask column over the longer, wrapped sequence.
+        wrapped_atts_img = atts_img[:, :1].expand(-1, wrapped_img_embeds.shape[1])
+        return wrapped_img_embeds, wrapped_atts_img
+
+    def forward(self, samples):
+        """Compute the language-modelling loss for a batch.
+
+        Args:
+            samples: batch providing ``"image"`` and ``"text_input"``; a batch
+                that carries ``question_split`` is treated as a VQA batch.
+
+        Returns:
+            ``{"loss": loss}`` with the loss reported by the LLaMA model.
+        """
+        image = samples["image"]
+        img_embeds, atts_img = self.encode_img(image)
+        # ``samples`` is indexed like a mapping above, so a key lookup is
+        # required; ``hasattr`` alone never sees dictionary keys.
+        if hasattr(samples, 'question_split') or 'question_split' in samples:  # VQA dataset
+            print('VQA Batch')
+            # NOTE(review): the <Img> ... </Img> wrapper is assumed; confirm it
+            # matches the prompt format used elsewhere in the project.
+            vqa_prompt = '###Human: <Img>' + self.IMAGE_PLACEHOLDER + '</Img> '
+            img_embeds, atts_img = self.prompt_wrap(img_embeds, atts_img, vqa_prompt)
+        elif self.prompt_list:
+            prompt = random.choice(self.prompt_list)
+            img_embeds, atts_img = self.prompt_wrap(img_embeds, atts_img, prompt)
+
+        self.llama_tokenizer.padding_side = "right"
+
+        text = [t + self.end_sym for t in samples["text_input"]]
+
+        to_regress_tokens = self.llama_tokenizer(
+            text,
+            return_tensors="pt",
+            padding="longest",
+            truncation=True,
+            max_length=self.max_txt_len,
+            add_special_tokens=False
+        ).to(image.device)
+
+        # Padding positions are labelled -100 so they are excluded from the loss.
+        targets = to_regress_tokens.input_ids.masked_fill(
+            to_regress_tokens.input_ids == self.llama_tokenizer.pad_token_id, -100
+        )
+
+        # No loss on the BOS token or on the (prompt-wrapped) image positions.
+        empty_targets = torch.full(
+            (atts_img.shape[0], atts_img.shape[1] + 1),  # plus one for bos
+            -100,
+            dtype=torch.long,
+            device=image.device,
+        )
+        targets = torch.cat([empty_targets, targets], dim=1)
+
+        batch_size = img_embeds.shape[0]
+        bos = torch.full(
+            (batch_size, 1),
+            self.llama_tokenizer.bos_token_id,
+            dtype=to_regress_tokens.input_ids.dtype,
+            device=to_regress_tokens.input_ids.device,
+        )
+        bos_embeds = self.llama_model.model.embed_tokens(bos)
+        atts_bos = atts_img[:, :1]
+
+        to_regress_embeds = self.llama_model.model.embed_tokens(to_regress_tokens.input_ids)
+        inputs_embeds = torch.cat([bos_embeds, img_embeds, to_regress_embeds], dim=1)
+        attention_mask = torch.cat([atts_bos, atts_img, to_regress_tokens.attention_mask], dim=1)
+
+        with self.maybe_autocast():
+            outputs = self.llama_model(
+                inputs_embeds=inputs_embeds,
+                attention_mask=attention_mask,
+                return_dict=True,
+                labels=targets,
+            )
+        loss = outputs.loss
+
+        return {"loss": loss}
+
+    @classmethod
+    def from_config(cls, cfg):
+        """Create a model from ``cfg`` and optionally load a checkpoint.
+
+        Every constructor argument is read from ``cfg`` with the same default
+        as ``__init__`` (``image_size``, ``num_query_token`` and
+        ``llama_model`` have no default here). When ``cfg`` carries a non-empty
+        ``ckpt`` path, its ``'model'`` entry is loaded non-strictly.
+        """
+        vit_model = cfg.get("vit_model", "eva_clip_g")
+        q_former_model = cfg.get("q_former_model", "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_flant5xxl.pth")
+        img_size = cfg.get("image_size")
+        num_query_token = cfg.get("num_query_token")
+        llama_model = cfg.get("llama_model")
+
+        drop_path_rate = cfg.get("drop_path_rate", 0)
+        use_grad_checkpoint = cfg.get("use_grad_checkpoint", False)
+        vit_precision = cfg.get("vit_precision", "fp16")
+        freeze_vit = cfg.get("freeze_vit", True)
+        freeze_qformer = cfg.get("freeze_qformer", True)
+        llama_cache_dir = cfg.get("llama_cache_dir", "")
+
+        prompt_path = cfg.get("prompt_path", "")
+        prompt_template = cfg.get("prompt_template", "")
+        max_txt_len = cfg.get("max_txt_len", 32)
+        end_sym = cfg.get("end_sym", '\n')
+
+        model = cls(
+            vit_model=vit_model,
+            q_former_model=q_former_model,
+            img_size=img_size,
+            drop_path_rate=drop_path_rate,
+            use_grad_checkpoint=use_grad_checkpoint,
+            vit_precision=vit_precision,
+            freeze_vit=freeze_vit,
+            freeze_qformer=freeze_qformer,
+            llama_cache_dir=llama_cache_dir,
+            num_query_token=num_query_token,
+            llama_model=llama_model,
+            prompt_path=prompt_path,
+            prompt_template=prompt_template,
+            max_txt_len=max_txt_len,
+            end_sym=end_sym
+        )
+
+        ckpt_path = cfg.get("ckpt", "")  # load weights of MiniGPT-4
+        if ckpt_path:
+            print("Load BLIP2-LLM Checkpoint: {}".format(ckpt_path))
+            ckpt = torch.load(ckpt_path, map_location="cpu")
+            model.load_state_dict(ckpt['model'], strict=False)
+
+        return model
diff --git a/minigpt4/models/modeling_llama.py b/minigpt4/models/modeling_llama.py
new file mode 100644
index 0000000000000000000000000000000000000000..2cebacf454dd9cf82823fc372d7673142b2082c7
--- /dev/null
+++ b/minigpt4/models/modeling_llama.py
@@ -0,0 +1,772 @@
+# coding=utf-8
+# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
+#
+# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
+# and OPT implementations in this library. It has been modified from its
+# original forms to accommodate minor architectural differences compared
+# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" PyTorch LLaMA model."""
+import math
+from typing import List, Optional, Tuple, Union
+
+import torch
+import torch.utils.checkpoint
+from torch import nn
+from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
+
+from transformers.activations import ACT2FN
+from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, SequenceClassifierOutputWithPast
+from transformers.modeling_utils import PreTrainedModel
+from transformers.utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings
+from transformers.models.llama.configuration_llama import LlamaConfig
+
+
+logger = logging.get_logger(__name__)
+
+_CONFIG_FOR_DOC = "LlamaConfig"
+
+
+# Copied from transformers.models.bart.modeling_bart._make_causal_mask
+def _make_causal_mask(
+    input_ids_shape: torch.Size, dtype: torch.dtype, device: torch.device, past_key_values_length: int = 0
+):
+    """
+    Build the additive causal (look-back only) attention mask.
+
+    Entry ``[i, j]`` is 0 where query position ``i`` may attend to key position
+    ``j`` (``j <= i``) and ``torch.finfo(dtype).min`` otherwise. When
+    ``past_key_values_length > 0`` that many all-zero columns are prepended, so
+    every cached position is visible. The result has shape
+    ``(bsz, 1, tgt_len, tgt_len + past_key_values_length)``.
+    """
+    bsz, tgt_len = input_ids_shape
+    positions = torch.arange(tgt_len, device=device)
+    # visible[i, j] is True when key j is at or before query i
+    visible = positions.unsqueeze(0) < (positions + 1).unsqueeze(1)
+    blocked_everywhere = torch.full(
+        (tgt_len, tgt_len), torch.tensor(torch.finfo(dtype).min, device=device), device=device
+    )
+    causal = blocked_everywhere.masked_fill(visible, 0).to(dtype)
+
+    if past_key_values_length > 0:
+        cached_columns = torch.zeros(tgt_len, past_key_values_length, dtype=dtype, device=device)
+        causal = torch.cat([cached_columns, causal], dim=-1)
+    return causal[None, None, :, :].expand(bsz, 1, tgt_len, tgt_len + past_key_values_length)
+
+
+# Copied from transformers.models.bart.modeling_bart._expand_mask
+def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None):
+    """
+    Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.
+
+    The result is additive: positions where ``mask`` is 1 become 0 and every
+    other position becomes ``torch.finfo(dtype).min``. ``tgt_len`` defaults to
+    the source length.
+    """
+    bsz, src_len = mask.size()
+    if tgt_len is None:
+        tgt_len = src_len
+
+    keep = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype)
+    blocked = 1.0 - keep
+    is_blocked = blocked.to(torch.bool)
+    return blocked.masked_fill(is_blocked, torch.finfo(dtype).min)
+
+
+class LlamaRMSNorm(nn.Module):
+    """Root-mean-square layer normalisation with a learned per-channel scale."""
+
+    def __init__(self, hidden_size, eps=1e-6):
+        """
+        LlamaRMSNorm is equivalent to T5LayerNorm
+
+        Args:
+            hidden_size: size of the last (normalised) dimension.
+            eps: value added to the mean square before the reciprocal root.
+        """
+        super().__init__()
+        self.weight = nn.Parameter(torch.ones(hidden_size))
+        self.variance_epsilon = eps
+
+    def forward(self, hidden_states):
+        """Scale ``hidden_states`` by the inverse RMS of its last dimension."""
+        # The statistic is accumulated in float32 regardless of the input dtype.
+        mean_square = hidden_states.to(torch.float32).pow(2).mean(-1, keepdim=True)
+        normalised = hidden_states * torch.rsqrt(mean_square + self.variance_epsilon)
+
+        # convert into half-precision if necessary
+        if self.weight.dtype in (torch.float16, torch.bfloat16):
+            normalised = normalised.to(self.weight.dtype)
+
+        return self.weight * normalised
+
+
+class LlamaRotaryEmbedding(torch.nn.Module):
+    """Cached cosine/sine tables for rotary position embeddings.
+
+    The tables are built once for ``max_position_embeddings`` positions and
+    rebuilt (larger) whenever ``forward`` is asked for more positions.
+    """
+
+    def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
+        """
+        Args:
+            dim: size of the rotated (per-head) dimension.
+            max_position_embeddings: number of positions to pre-compute.
+            base: base of the geometric frequency progression.
+            device: device on which the frequencies are created.
+        """
+        super().__init__()
+        inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float().to(device) / dim))
+        self.register_buffer("inv_freq", inv_freq)
+
+        # Build here to make `torch.jit.trace` work.
+        self._build_cache(max_position_embeddings, self.inv_freq.device)
+
+    def _build_cache(self, seq_len, device):
+        """(Re)compute the cos/sin tables for ``seq_len`` positions on ``device``."""
+        self.max_seq_len_cached = seq_len
+        t = torch.arange(seq_len, device=device, dtype=self.inv_freq.dtype)
+        freqs = torch.einsum("i,j->ij", t, self.inv_freq)
+        # Different from paper, but it uses a different permutation in order to obtain the same calculation
+        emb = torch.cat((freqs, freqs), dim=-1).to(device)
+        # Non-persistent: the tables are derived data and stay out of state_dict.
+        self.register_buffer("cos_cached", emb.cos()[None, None, :, :], persistent=False)
+        self.register_buffer("sin_cached", emb.sin()[None, None, :, :], persistent=False)
+
+    def forward(self, x, seq_len=None):
+        """Return ``(cos, sin)`` tables for the first ``seq_len`` positions.
+
+        Args:
+            x: tensor of shape ``[bs, num_attention_heads, seq_len, head_size]``;
+                only its dtype, device and (when ``seq_len`` is omitted) its
+                second-to-last dimension are used.
+            seq_len: number of positions needed. Defaults to ``x.shape[-2]``;
+                previously the declared default of ``None`` raised a TypeError
+                in the comparison below.
+
+        Returns:
+            Two tensors of shape ``[1, 1, seq_len, dim]`` in ``x.dtype``.
+        """
+        if seq_len is None:
+            seq_len = x.shape[-2]
+        # This `if` block is unlikely to be run after we build sin/cos in `__init__`. Keep the logic here just in case.
+        if seq_len > self.max_seq_len_cached:
+            self._build_cache(seq_len, x.device)
+        return (
+            self.cos_cached[:, :, :seq_len, ...].to(dtype=x.dtype),
+            self.sin_cached[:, :, :seq_len, ...].to(dtype=x.dtype),
+        )
+
+
+def rotate_half(x):
+    """Rotates half the hidden dims of the input.
+
+    The last dimension is split at its midpoint into ``(front, back)`` and
+    returned as ``(-back, front)``.
+    """
+    midpoint = x.shape[-1] // 2
+    front, back = x[..., :midpoint], x[..., midpoint:]
+    return torch.cat((-back, front), dim=-1)
+
+
+def apply_rotary_pos_emb(q, k, cos, sin, position_ids):
+    """Apply rotary position embeddings to the query and key tensors.
+
+    ``position_ids`` (``[bs, seq_len]``) selects, per sample and token, which
+    row of the ``cos`` / ``sin`` tables is used.
+
+    Returns:
+        ``(q_embed, k_embed)``.
+    """
+    # [bs, seq_len] -> [bs, 1, seq_len, 1], then tiled to the tables' head and feature sizes
+    row_index = position_ids[:, None, :, None].repeat(1, cos.shape[1], 1, cos.shape[3])
+    batch = row_index.shape[0]
+
+    def _rows_for_positions(table):
+        # Pick the table row of every token along the position axis (dim 2).
+        return torch.gather(table.repeat(batch, 1, 1, 1), 2, row_index)
+
+    cos = _rows_for_positions(cos)
+    sin = _rows_for_positions(sin)
+    q_embed = (q * cos) + (rotate_half(q) * sin)
+    k_embed = (k * cos) + (rotate_half(k) * sin)
+    return q_embed, k_embed
+
+
+class LlamaMLP(nn.Module):
+    """Gated feed-forward block: ``down_proj(act(gate_proj(x)) * up_proj(x))``."""
+
+    def __init__(
+        self,
+        hidden_size: int,
+        intermediate_size: int,
+        hidden_act: str,
+    ):
+        """
+        Args:
+            hidden_size: input and output feature size.
+            intermediate_size: feature size of the gated inner projection.
+            hidden_act: key into ``ACT2FN`` selecting the activation.
+        """
+        super().__init__()
+        # All three projections are bias-free.
+        self.gate_proj = nn.Linear(hidden_size, intermediate_size, bias=False)
+        self.down_proj = nn.Linear(intermediate_size, hidden_size, bias=False)
+        self.up_proj = nn.Linear(hidden_size, intermediate_size, bias=False)
+        self.act_fn = ACT2FN[hidden_act]
+
+    def forward(self, x):
+        """Apply the gated projection to ``x``."""
+        gate = self.act_fn(self.gate_proj(x))
+        gated = gate * self.up_proj(x)
+        return self.down_proj(gated)
+
+
+class LlamaAttention(nn.Module):
+    """Multi-headed attention from 'Attention Is All You Need' paper"""
+
+    def __init__(self, config: LlamaConfig):
+        """Create the bias-free q/k/v/o projections and the rotary embedding.
+
+        Raises:
+            ValueError: if ``config.hidden_size`` is not divisible by
+                ``config.num_attention_heads``.
+        """
+        super().__init__()
+        self.config = config
+        self.hidden_size = config.hidden_size
+        self.num_heads = config.num_attention_heads
+        self.head_dim = self.hidden_size // self.num_heads
+        self.max_position_embeddings = config.max_position_embeddings
+
+        if (self.head_dim * self.num_heads) != self.hidden_size:
+            raise ValueError(
+                f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}"
+                f" and `num_heads`: {self.num_heads})."
+            )
+        self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False)
+        self.k_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False)
+        self.v_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False)
+        self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False)
+        self.rotary_emb = LlamaRotaryEmbedding(self.head_dim, max_position_embeddings=self.max_position_embeddings)
+
+    def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
+        """Reshape ``[bsz, seq_len, hidden]`` into ``[bsz, num_heads, seq_len, head_dim]``."""
+        return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_value: Optional[Tuple[torch.Tensor]] = None,
+        output_attentions: bool = False,
+        use_cache: bool = False,
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+        """Self-attention over ``hidden_states`` with rotary position embeddings.
+
+        Args:
+            hidden_states: tensor of shape ``(bsz, q_len, hidden_size)``.
+            attention_mask: optional additive mask of shape
+                ``(bsz, 1, q_len, kv_seq_len)``.
+            position_ids: position index of every token, handed to
+                ``apply_rotary_pos_emb``.
+            past_key_value: optional cached ``(key, value)`` pair that is
+                prepended along the sequence dimension.
+            output_attentions: also return the attention probabilities.
+            use_cache: return the updated ``(key, value)`` pair.
+
+        Returns:
+            ``(attn_output, attn_weights or None, past_key_value or None)``.
+
+        Raises:
+            ValueError: if the attention scores, mask or output have an
+                unexpected shape.
+        """
+        bsz, q_len, _ = hidden_states.size()
+
+        query_states = self.q_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+        key_states = self.k_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+        value_states = self.v_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+
+        # Total key/value length includes any cached positions.
+        kv_seq_len = key_states.shape[-2]
+        if past_key_value is not None:
+            kv_seq_len += past_key_value[0].shape[-2]
+        cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
+        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
+        # [bsz, nh, t, hd]
+
+        if past_key_value is not None:
+            # reuse k, v, self_attention
+            key_states = torch.cat([past_key_value[0], key_states], dim=2)
+            value_states = torch.cat([past_key_value[1], value_states], dim=2)
+
+        past_key_value = (key_states, value_states) if use_cache else None
+
+        attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)
+
+        if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len):
+            # Report the same 4-D shape that the check above compares against.
+            raise ValueError(
+                f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is"
+                f" {attn_weights.size()}"
+            )
+
+        if attention_mask is not None:
+            if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
+                raise ValueError(
+                    f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}"
+                )
+            attn_weights = attn_weights + attention_mask
+            # Adding two large negative masks can overflow; clamp to the dtype minimum.
+            attn_weights = torch.max(
+                attn_weights,
+                torch.tensor(torch.finfo(attn_weights.dtype).min, device=attn_weights.device),
+            )
+
+        # upcast attention to fp32
+        attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
+        attn_output = torch.matmul(attn_weights, value_states)
+
+        if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
+            raise ValueError(
+                f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
+                f" {attn_output.size()}"
+            )
+
+        # [bsz, nh, q_len, hd] -> [bsz, q_len, hidden_size]
+        attn_output = attn_output.transpose(1, 2)
+        attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
+
+        attn_output = self.o_proj(attn_output)
+
+        if not output_attentions:
+            attn_weights = None
+
+        return attn_output, attn_weights, past_key_value
+
+
+class LlamaDecoderLayer(nn.Module):
+    """One decoder layer: pre-norm self-attention and pre-norm MLP, each with a residual."""
+
+    def __init__(self, config: LlamaConfig):
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        self.self_attn = LlamaAttention(config=config)
+        self.mlp = LlamaMLP(
+            hidden_size=self.hidden_size,
+            intermediate_size=config.intermediate_size,
+            hidden_act=config.hidden_act,
+        )
+        self.input_layernorm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_value: Optional[Tuple[torch.Tensor]] = None,
+        output_attentions: Optional[bool] = False,
+        use_cache: Optional[bool] = False,
+    ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
+        """
+        Args:
+            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
+            attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
+                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
+            position_ids (`torch.LongTensor`, *optional*): passed through to the self-attention module.
+            output_attentions (`bool`, *optional*):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+                returned tensors for more detail.
+            use_cache (`bool`, *optional*):
+                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
+                (see `past_key_values`).
+            past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
+
+        Returns:
+            A tuple starting with the layer output, followed by the attention weights when
+            `output_attentions` is set and by the present key/value pair when `use_cache` is set.
+        """
+        # Self Attention
+        attn_input = self.input_layernorm(hidden_states)
+        attn_output, self_attn_weights, present_key_value = self.self_attn(
+            hidden_states=attn_input,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_value=past_key_value,
+            output_attentions=output_attentions,
+            use_cache=use_cache,
+        )
+        after_attention = hidden_states + attn_output
+
+        # Fully Connected
+        mlp_output = self.mlp(self.post_attention_layernorm(after_attention))
+        layer_output = after_attention + mlp_output
+
+        outputs = (layer_output,)
+        if output_attentions:
+            outputs = outputs + (self_attn_weights,)
+        if use_cache:
+            outputs = outputs + (present_key_value,)
+        return outputs
+
+
+# Class-level documentation text passed to `add_start_docstrings` on the model classes in this file.
+LLAMA_START_DOCSTRING = r"""
+ This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
+ library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
+ etc.)
+
+ This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
+ Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
+ and behavior.
+
+ Parameters:
+ config ([`LlamaConfig`]):
+ Model configuration class with all the parameters of the model. Initializing with a config file does not
+ load the weights associated with the model, only the configuration. Check out the
+ [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+
+
+@add_start_docstrings(
+    "The bare LLaMA Model outputting raw hidden-states without any specific head on top.",
+    LLAMA_START_DOCSTRING,
+)
+class LlamaPreTrainedModel(PreTrainedModel):
+    """Common base of the LLaMA models: config class, loading hints and weight init."""
+
+    config_class = LlamaConfig
+    base_model_prefix = "model"
+    supports_gradient_checkpointing = True
+    _no_split_modules = ["LlamaDecoderLayer"]
+    _keys_to_ignore_on_load_unexpected = [r"decoder\.version"]
+
+    def _init_weights(self, module):
+        """Initialise ``nn.Linear`` and ``nn.Embedding`` modules.
+
+        Weights are drawn from a normal distribution with standard deviation
+        ``config.initializer_range``; linear biases and the embedding row at
+        ``padding_idx`` (when set) are zeroed afterwards.
+        """
+        std = self.config.initializer_range
+        if isinstance(module, (nn.Linear, nn.Embedding)):
+            module.weight.data.normal_(mean=0.0, std=std)
+        if isinstance(module, nn.Linear):
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, nn.Embedding):
+            if module.padding_idx is not None:
+                module.weight.data[module.padding_idx].zero_()
+
+    def _set_gradient_checkpointing(self, module, value=False):
+        """Set ``gradient_checkpointing`` on ``module`` when it is a ``LlamaModel``."""
+        if not isinstance(module, LlamaModel):
+            return
+        module.gradient_checkpointing = value
+
+
+# Argument documentation passed to `add_start_docstrings_to_model_forward` on the model `forward` methods.
+LLAMA_INPUTS_DOCSTRING = r"""
+ Args:
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+ Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
+ it.
+
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+ [`PreTrainedTokenizer.__call__`] for details.
+
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+ - 1 for tokens that are **not masked**,
+ - 0 for tokens that are **masked**.
+
+ [What are attention masks?](../glossary#attention-mask)
+
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+ [`PreTrainedTokenizer.__call__`] for details.
+
+ If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
+ `past_key_values`).
+
+ If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
+ and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
+ information on the default strategy.
+
+ - 1 indicates the head is **not masked**,
+ - 0 indicates the head is **masked**.
+ position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
+ config.n_positions - 1]`.
+
+ [What are position IDs?](../glossary#position-ids)
+ past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+ Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
+ `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape
+ `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
+
+ Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
+ blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
+
+ If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
+ don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
+ `decoder_input_ids` of shape `(batch_size, sequence_length)`.
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
+ is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
+ model's internal embedding lookup matrix.
+ use_cache (`bool`, *optional*):
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
+ `past_key_values`).
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+ tensors for more detail.
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+ more detail.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+"""
+
+
+@add_start_docstrings(
+ "The bare LLaMA Model outputting raw hidden-states without any specific head on top.",
+ LLAMA_START_DOCSTRING,
+)
+class LlamaModel(LlamaPreTrainedModel):
+ """
+ Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`LlamaDecoderLayer`]
+
+ Args:
+ config: LlamaConfig
+ """
+
+ def __init__(self, config: LlamaConfig):
+ super().__init__(config)
+ self.padding_idx = config.pad_token_id
+ self.vocab_size = config.vocab_size
+
+ # Token embedding table (padding_idx comes from config.pad_token_id), the stack of
+ # config.num_hidden_layers decoder layers, and the RMSNorm applied after the last layer.
+ self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
+ self.layers = nn.ModuleList([LlamaDecoderLayer(config) for _ in range(config.num_hidden_layers)])
+ self.norm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+ self.gradient_checkpointing = False
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ def get_input_embeddings(self):
+ return self.embed_tokens
+
+ def set_input_embeddings(self, value):
+ self.embed_tokens = value
+
+ # Copied from transformers.models.bart.modeling_bart.BartDecoder._prepare_decoder_attention_mask
+ def _prepare_decoder_attention_mask(self, attention_mask, input_shape, inputs_embeds, past_key_values_length):
+ # Returns the sum of the causal mask (built only when the target length is > 1) and the
+ # expanded `attention_mask`. Either part may be missing; the result is None when both are.
+ # create causal mask
+ # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
+ combined_attention_mask = None
+ if input_shape[-1] > 1:
+ combined_attention_mask = _make_causal_mask(
+ input_shape,
+ inputs_embeds.dtype,
+ device=inputs_embeds.device,
+ past_key_values_length=past_key_values_length,
+ )
+
+ if attention_mask is not None:
+ # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
+ expanded_attn_mask = _expand_mask(attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]).to(
+ inputs_embeds.device
+ )
+ combined_attention_mask = (
+ expanded_attn_mask if combined_attention_mask is None else expanded_attn_mask + combined_attention_mask
+ )
+
+ return combined_attention_mask
+
+ @add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING)
+ def forward(
+ self,
+ input_ids: torch.LongTensor = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.LongTensor] = None,
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
+ inputs_embeds: Optional[torch.FloatTensor] = None,
+ query_embeds: Optional[torch.FloatTensor] = None,
+ use_cache: Optional[bool] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[Tuple, BaseModelOutputWithPast]:
+ # Runs the decoder stack. Exactly one of `input_ids` / `inputs_embeds` must be given
+ # (both or neither raises ValueError). `query_embeds`, when given, is prepended to the
+ # token embeddings along dim 1. Returns a BaseModelOutputWithPast, or a tuple of its
+ # non-None fields when `return_dict` is false.
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ use_cache = use_cache if use_cache is not None else self.config.use_cache
+
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ # retrieve input_ids and inputs_embeds
+ if input_ids is not None and inputs_embeds is not None:
+ raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time")
+ elif input_ids is not None:
+ batch_size, seq_length = input_ids.shape
+ elif inputs_embeds is not None:
+ batch_size, seq_length, _ = inputs_embeds.shape
+ else:
+ raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds")
+
+ if inputs_embeds is None:
+ inputs_embeds = self.embed_tokens(input_ids)
+ # Prepend the extra query embeddings and recompute the shapes from the concatenated sequence.
+ if query_embeds is not None:
+ inputs_embeds = torch.cat([query_embeds, inputs_embeds], dim=1)
+ batch_size, seq_length, _ = inputs_embeds.shape
+
+ seq_length_with_past = seq_length
+ past_key_values_length = 0
+
+ if past_key_values is not None:
+ # Cached length is read from dim 2 of the first cached tensor of the first layer.
+ past_key_values_length = past_key_values[0][0].shape[2]
+ seq_length_with_past = seq_length_with_past + past_key_values_length
+
+ if position_ids is None:
+ # Default positions continue from the cached length.
+ device = input_ids.device if input_ids is not None else inputs_embeds.device
+ position_ids = torch.arange(
+ past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device
+ )
+ position_ids = position_ids.unsqueeze(0).view(-1, seq_length)
+ else:
+ position_ids = position_ids.view(-1, seq_length).long()
+
+ # build the attention mask (defaults to all ones over past + current positions)
+ if attention_mask is None:
+ attention_mask = torch.ones(
+ (batch_size, seq_length_with_past), dtype=torch.bool, device=inputs_embeds.device
+ )
+ attention_mask = self._prepare_decoder_attention_mask(
+ attention_mask, (batch_size, seq_length), inputs_embeds, past_key_values_length
+ )
+
+ hidden_states = inputs_embeds
+
+ if self.gradient_checkpointing and self.training:
+ if use_cache:
+ logger.warning_once(
+ "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
+ )
+ use_cache = False
+
+ # decoder layers
+ all_hidden_states = () if output_hidden_states else None
+ all_self_attns = () if output_attentions else None
+ next_decoder_cache = () if use_cache else None
+
+ for idx, decoder_layer in enumerate(self.layers):
+ if output_hidden_states:
+ all_hidden_states += (hidden_states,)
+
+ past_key_value = past_key_values[idx] if past_key_values is not None else None
+
+ if self.gradient_checkpointing and self.training:
+
+ # Checkpointed path: the layer is invoked positionally and past_key_value is passed as None.
+ def create_custom_forward(module):
+ def custom_forward(*inputs):
+ # None for past_key_value
+ return module(*inputs, output_attentions, None)
+
+ return custom_forward
+
+ layer_outputs = torch.utils.checkpoint.checkpoint(
+ create_custom_forward(decoder_layer),
+ hidden_states,
+ attention_mask,
+ position_ids,
+ None,
+ )
+ else:
+ layer_outputs = decoder_layer(
+ hidden_states,
+ attention_mask=attention_mask,
+ position_ids=position_ids,
+ past_key_value=past_key_value,
+ output_attentions=output_attentions,
+ use_cache=use_cache,
+ )
+
+ hidden_states = layer_outputs[0]
+
+ if use_cache:
+ # The cache entry sits after the attention weights when those are returned.
+ next_decoder_cache += (layer_outputs[2 if output_attentions else 1],)
+
+ if output_attentions:
+ all_self_attns += (layer_outputs[1],)
+
+ hidden_states = self.norm(hidden_states)
+
+ # add hidden states from the last decoder layer
+ if output_hidden_states:
+ all_hidden_states += (hidden_states,)
+
+ next_cache = next_decoder_cache if use_cache else None
+ if not return_dict:
+ return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None)
+ return BaseModelOutputWithPast(
+ last_hidden_state=hidden_states,
+ past_key_values=next_cache,
+ hidden_states=all_hidden_states,
+ attentions=all_self_attns,
+ )
+
+
+class LlamaForCausalLM(LlamaPreTrainedModel):
+ # Causal-LM wrapper: a LlamaModel followed by a bias-free linear projection to vocabulary logits.
+ def __init__(self, config):
+ super().__init__(config)
+ self.model = LlamaModel(config)
+
+ self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ def get_input_embeddings(self):
+ return self.model.embed_tokens
+
+ def set_input_embeddings(self, value):
+ self.model.embed_tokens = value
+
+ def get_output_embeddings(self):
+ return self.lm_head
+
+ def set_output_embeddings(self, new_embeddings):
+ self.lm_head = new_embeddings
+
+ def set_decoder(self, decoder):
+ self.model = decoder
+
+ def get_decoder(self):
+ return self.model
+
+ @add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING)
+ @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
+ def forward(
+ self,
+ input_ids: torch.LongTensor = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.LongTensor] = None,
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
+ inputs_embeds: Optional[torch.FloatTensor] = None,
+ query_embeds: Optional[torch.FloatTensor] = None,
+ labels: Optional[torch.LongTensor] = None,
+ use_cache: Optional[bool] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[Tuple, CausalLMOutputWithPast]:
+ r"""
+ Args:
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
+ config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+ (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+
+ Returns:
+
+ Example:
+
+ ```python
+ >>> from transformers import AutoTokenizer, LlamaForCausalLM
+
+ >>> model = LlamaForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS)
+ >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER)
+
+ >>> prompt = "Hey, are you conscious? Can you talk to me?"
+ >>> inputs = tokenizer(prompt, return_tensors="pt")
+
+ >>> # Generate
+ >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
+ >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+ "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
+ ```"""
+
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
+ outputs = self.model(
+ input_ids=input_ids,
+ attention_mask=attention_mask,
+ position_ids=position_ids,
+ past_key_values=past_key_values,
+ inputs_embeds=inputs_embeds,
+ query_embeds=query_embeds,
+ use_cache=use_cache,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+ # Index 0 is the final hidden state for both the tuple and the ModelOutput return forms.
+ hidden_states = outputs[0]
+ logits = self.lm_head(hidden_states)
+
+ loss = None
+ if labels is not None:
+ # Shift so that tokens < n predict n
+ shift_logits = logits[..., :-1, :].contiguous()
+ shift_labels = labels[..., 1:].contiguous()
+ # Flatten the tokens
+ loss_fct = CrossEntropyLoss()
+ shift_logits = shift_logits.view(-1, self.config.vocab_size)
+ shift_labels = shift_labels.view(-1)
+ # Enable model parallelism
+ shift_labels = shift_labels.to(shift_logits.device)
+ loss = loss_fct(shift_logits, shift_labels)
+
+ if not return_dict:
+ # Tuple form: (loss,) is prepended only when labels were supplied.
+ output = (logits,) + outputs[1:]
+ return (loss,) + output if loss is not None else output
+
+ return CausalLMOutputWithPast(
+ loss=loss,
+ logits=logits,
+ past_key_values=outputs.past_key_values,
+ hidden_states=outputs.hidden_states,
+ attentions=outputs.attentions,
+ )
+
+ def prepare_inputs_for_generation(
+ self, input_ids, query_embeds=None, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs
+ ):
+ # Assembles the keyword arguments for one `forward` call during generation.
+ # Once a cache exists, only the most recent token id is fed to the model.
+ if past_key_values:
+ input_ids = input_ids[:, -1:]
+
+ position_ids = kwargs.get("position_ids", None)
+ if attention_mask is not None and position_ids is None:
+ # create position_ids on the fly for batch generation
+ position_ids = attention_mask.long().cumsum(-1) - 1
+ position_ids.masked_fill_(attention_mask == 0, 1)
+ if past_key_values:
+ position_ids = position_ids[:, -1].unsqueeze(-1)
+ # NOTE(review): indentation is lost in this view; the reset below presumably belongs to the
+ # `if past_key_values:` branch, so query_embeds are only used on the first step - confirm.
+ query_embeds = None
+
+ # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
+ if inputs_embeds is not None and past_key_values is None:
+ model_inputs = {"inputs_embeds": inputs_embeds}
+ else:
+ model_inputs = {"input_ids": input_ids}
+
+ model_inputs.update(
+ {
+ "position_ids": position_ids,
+ "query_embeds": query_embeds,
+ "past_key_values": past_key_values,
+ "use_cache": kwargs.get("use_cache"),
+ "attention_mask": attention_mask,
+ }
+ )
+ return model_inputs
+
+ @staticmethod
+ def _reorder_cache(past_key_values, beam_idx):
+ # Re-selects every cached tensor of every layer along dim 0 according to beam_idx.
+ reordered_past = ()
+ for layer_past in past_key_values:
+ reordered_past += (tuple(past_state.index_select(0, beam_idx) for past_state in layer_past),)
+ return reordered_past
+
diff --git a/minigpt4/processors/__init__.py b/minigpt4/processors/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..cfb0908e7603881b41be0228d7f8346f0d00840e
--- /dev/null
+++ b/minigpt4/processors/__init__.py
@@ -0,0 +1,33 @@
+"""
+ Copyright (c) 2022, salesforce.com, inc.
+ All rights reserved.
+ SPDX-License-Identifier: BSD-3-Clause
+ For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+"""
+
+from minigpt4.processors.base_processor import BaseProcessor
+from minigpt4.processors.blip_processors import (
+ Blip2ImageTrainProcessor,
+ Blip2ImageEvalProcessor,
+ BlipCaptionProcessor,
+)
+
+from minigpt4.common.registry import registry
+
+__all__ = [
+ "BaseProcessor",
+ "Blip2ImageTrainProcessor",
+ "Blip2ImageEvalProcessor",
+ "BlipCaptionProcessor",
+]
+
+
+def load_processor(name, cfg=None):
+    """
+    Instantiate the processor registered under ``name``.
+
+    The class is looked up with ``registry.get_processor_class(name)`` and built
+    through its ``from_config(cfg)`` classmethod.
+
+    Example
+
+    >>> processor = load_processor("blip2_image_eval", cfg=None)
+    """
+    processor_cls = registry.get_processor_class(name)
+    return processor_cls.from_config(cfg)
diff --git a/minigpt4/processors/base_processor.py b/minigpt4/processors/base_processor.py
new file mode 100644
index 0000000000000000000000000000000000000000..b4c9d86859270a046623661a632587f2b3136b46
--- /dev/null
+++ b/minigpt4/processors/base_processor.py
@@ -0,0 +1,26 @@
+"""
+ Copyright (c) 2022, salesforce.com, inc.
+ All rights reserved.
+ SPDX-License-Identifier: BSD-3-Clause
+ For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+"""
+
+from omegaconf import OmegaConf
+
+
+class BaseProcessor:
+    """Pass-through processor: ``transform`` is the identity until a subclass replaces it."""
+
+    def __init__(self):
+        self.transform = lambda item: item
+
+    def __call__(self, item):
+        """Apply ``self.transform`` to ``item`` and return the result."""
+        transformed = self.transform(item)
+        return transformed
+
+    @classmethod
+    def from_config(cls, cfg=None):
+        """Create an instance; this base implementation ignores ``cfg``."""
+        return cls()
+
+    def build(self, **kwargs):
+        """Wrap the keyword arguments in an OmegaConf config and delegate to ``from_config``."""
+        return self.from_config(OmegaConf.create(kwargs))
diff --git a/minigpt4/processors/blip_processors.py b/minigpt4/processors/blip_processors.py
new file mode 100644
index 0000000000000000000000000000000000000000..9853aedc2d51c546b9b34ff4c6ec587aded93dbf
--- /dev/null
+++ b/minigpt4/processors/blip_processors.py
@@ -0,0 +1,141 @@
+"""
+ Copyright (c) 2022, salesforce.com, inc.
+ All rights reserved.
+ SPDX-License-Identifier: BSD-3-Clause
+ For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+"""
+
+import re
+
+from minigpt4.common.registry import registry
+from minigpt4.processors.base_processor import BaseProcessor
+from minigpt4.processors.randaugment import RandomAugment
+from omegaconf import OmegaConf
+from torchvision import transforms
+from torchvision.transforms.functional import InterpolationMode
+
+
+class BlipImageBaseProcessor(BaseProcessor):
+    """Base for the image processors; it only prepares ``self.normalize``."""
+
+    def __init__(self, mean=None, std=None):
+        # Per-channel statistics used when the caller does not supply any.
+        default_mean = (0.48145466, 0.4578275, 0.40821073)
+        default_std = (0.26862954, 0.26130258, 0.27577711)
+
+        channel_mean = default_mean if mean is None else mean
+        channel_std = default_std if std is None else std
+
+        self.normalize = transforms.Normalize(channel_mean, channel_std)
+
+
+@registry.register_processor("blip_caption")
+class BlipCaptionProcessor(BaseProcessor):
+    """Text processor that normalises a caption and prefixes it with a fixed prompt."""
+
+    def __init__(self, prompt="", max_words=50):
+        self.prompt = prompt
+        self.max_words = max_words
+
+    def __call__(self, caption):
+        """Return ``prompt`` followed by the cleaned caption."""
+        return self.prompt + self.pre_caption(caption)
+
+    @classmethod
+    def from_config(cls, cfg=None):
+        """Build from a config; missing keys fall back to ``prompt=""`` and ``max_words=50``."""
+        cfg = OmegaConf.create() if cfg is None else cfg
+        return cls(
+            prompt=cfg.get("prompt", ""),
+            max_words=cfg.get("max_words", 50),
+        )
+
+    def pre_caption(self, caption):
+        """Lower-case, replace punctuation by spaces, squeeze whitespace and cap the word count."""
+        # Every listed punctuation character becomes a single space ...
+        cleaned = re.sub(r"([.!\"()*#:;~])", " ", caption.lower())
+        # ... and each run of two or more whitespace characters collapses to one space.
+        cleaned = re.sub(r"\s{2,}", " ", cleaned)
+        cleaned = cleaned.rstrip("\n").strip(" ")
+
+        # Keep at most ``max_words`` space-separated words.
+        words = cleaned.split(" ")
+        if len(words) <= self.max_words:
+            return cleaned
+        return " ".join(words[: self.max_words])
+
+
+@registry.register_processor("blip2_image_train")
+class Blip2ImageTrainProcessor(BlipImageBaseProcessor):
+    """Training image pipeline: random resized crop, tensor conversion, normalisation."""
+
+    def __init__(self, image_size=224, mean=None, std=None, min_scale=0.5, max_scale=1.0):
+        super().__init__(mean=mean, std=std)
+
+        random_crop = transforms.RandomResizedCrop(
+            image_size,
+            scale=(min_scale, max_scale),
+            interpolation=InterpolationMode.BICUBIC,
+        )
+        self.transform = transforms.Compose(
+            [random_crop, transforms.ToTensor(), self.normalize]
+        )
+
+    def __call__(self, item):
+        """Run ``item`` through the composed transform."""
+        return self.transform(item)
+
+    @classmethod
+    def from_config(cls, cfg=None):
+        """
+        Build from a config. Missing keys fall back to ``image_size=224``,
+        ``mean=None``, ``std=None``, ``min_scale=0.5`` and ``max_scale=1.0``.
+        """
+        cfg = OmegaConf.create() if cfg is None else cfg
+        return cls(
+            image_size=cfg.get("image_size", 224),
+            mean=cfg.get("mean", None),
+            std=cfg.get("std", None),
+            min_scale=cfg.get("min_scale", 0.5),
+            max_scale=cfg.get("max_scale", 1.0),
+        )
+
+
+@registry.register_processor("blip2_image_eval")
+class Blip2ImageEvalProcessor(BlipImageBaseProcessor):
+    """Evaluation image pipeline: fixed square resize, tensor conversion, normalisation."""
+
+    def __init__(self, image_size=224, mean=None, std=None):
+        super().__init__(mean=mean, std=std)
+
+        square_resize = transforms.Resize(
+            (image_size, image_size), interpolation=InterpolationMode.BICUBIC
+        )
+        self.transform = transforms.Compose(
+            [square_resize, transforms.ToTensor(), self.normalize]
+        )
+
+    def __call__(self, item):
+        """Run ``item`` through the composed transform."""
+        return self.transform(item)
+
+    @classmethod
+    def from_config(cls, cfg=None):
+        """
+        Build from a config. Missing keys fall back to ``image_size=224``,
+        ``mean=None`` and ``std=None``.
+        """
+        cfg = OmegaConf.create() if cfg is None else cfg
+        return cls(
+            image_size=cfg.get("image_size", 224),
+            mean=cfg.get("mean", None),
+            std=cfg.get("std", None),
+        )
\ No newline at end of file
diff --git a/minigpt4/processors/randaugment.py b/minigpt4/processors/randaugment.py
new file mode 100644
index 0000000000000000000000000000000000000000..5c6a9e6d62f74358f490d19546c9829b3ac6aaef
--- /dev/null
+++ b/minigpt4/processors/randaugment.py
@@ -0,0 +1,398 @@
+"""
+ Copyright (c) 2022, salesforce.com, inc.
+ All rights reserved.
+ SPDX-License-Identifier: BSD-3-Clause
+ For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+"""
+
+import cv2
+import numpy as np
+
+import torch
+
+
+## aug functions
+def identity_func(img):
+    """No-op augmentation: hand back ``img`` itself, unchanged."""
+    unchanged = img
+    return unchanged
+
+
+def autocontrast_func(img, cutoff=0):
+    """
+    Stretch each channel so its lowest/highest retained value maps to 0/255.
+
+    Intended to give the same output as PIL.ImageOps.autocontrast.
+
+    Args:
+        img: integer-valued image array (pixel values index a 256-entry lookup
+            table); channels are obtained with ``cv2.split``.
+        cutoff: percentage of pixels discarded from each end of the histogram
+            before the range is measured.
+
+    Returns:
+        The remapped channels recombined with ``cv2.merge``.
+    """
+    n_bins = 256
+
+    def tune_channel(ch):
+        n = ch.size
+        cut = cutoff * n // 100
+        if cut == 0:
+            # Use Python ints: negating a NumPy uint8 scalar wraps around
+            # (``-np.uint8(10)`` is 246), which would corrupt the offset below
+            # whenever the channel minimum is above 0.
+            high, low = int(ch.max()), int(ch.min())
+        else:
+            hist = cv2.calcHist([ch], [0], None, [n_bins], [0, n_bins])
+            low_idx = np.argwhere(np.cumsum(hist) > cut)
+            low = 0 if low_idx.shape[0] == 0 else int(low_idx[0, 0])
+            high_idx = np.argwhere(np.cumsum(hist[::-1]) > cut)
+            high = n_bins - 1 if high_idx.shape[0] == 0 else n_bins - 1 - int(high_idx[0, 0])
+        if high <= low:
+            # Flat (or empty) range: leave the channel as it is.
+            table = np.arange(n_bins)
+        else:
+            scale = (n_bins - 1) / (high - low)
+            offset = -low * scale
+            table = np.arange(n_bins) * scale + offset
+            table[table < 0] = 0
+            table[table > n_bins - 1] = n_bins - 1
+        table = table.clip(0, 255).astype(np.uint8)
+        return table[ch]
+
+    channels = [tune_channel(ch) for ch in cv2.split(img)]
+    out = cv2.merge(channels)
+    return out
+
+
+def equalize_func(img):
+    """
+    Histogram-equalise every channel through a lookup table.
+
+    same output as PIL.ImageOps.equalize
+    PIL's implementation is different from cv2.equalize
+    """
+    n_bins = 256
+
+    def tune_channel(ch):
+        hist = cv2.calcHist([ch], [0], None, [n_bins], [0, n_bins])
+        populated = hist[hist != 0].reshape(-1)
+        # Step size is derived from all populated bins except the last one.
+        step = np.sum(populated[:-1]) // (n_bins - 1)
+        if step == 0:
+            return ch
+        shifted = np.empty_like(hist)
+        shifted[0] = step // 2
+        shifted[1:] = hist[:-1]
+        lut = (np.cumsum(shifted) // step).clip(0, 255).astype(np.uint8)
+        return lut[ch]
+
+    return cv2.merge([tune_channel(ch) for ch in cv2.split(img)])
+
+
+def rotate_func(img, degree, fill=(0, 0, 0)):
+    """
+    Rotate the image about its centre, keeping the original size.
+
+    like PIL, rotate by degree, not radians; uncovered pixels take ``fill``.
+    """
+    height, width = img.shape[0], img.shape[1]
+    rotation = cv2.getRotationMatrix2D((width / 2, height / 2), degree, 1)
+    return cv2.warpAffine(img, rotation, (width, height), borderValue=fill)
+
+
+def solarize_func(img, thresh=128):
+    """
+    Invert every pixel value that is at or above ``thresh``.
+
+    Values below ``thresh`` pass through unchanged; all others become
+    ``255 - value`` (the rule used by PIL.ImageOps.solarize).
+    """
+    levels = np.arange(256)
+    lut = np.where(levels < thresh, levels, 255 - levels)
+    lut = lut.clip(0, 255).astype(np.uint8)
+    return lut[img]
+
+
+def color_func(img, factor):
+    """
+    Adjust colour saturation with one 3x3 matrix product per pixel.
+
+    same output as PIL.ImageEnhance.Color
+    """
+    # The direct PIL-style formulation (blend with the grey image) is quite slow,
+    # so the blend is folded into a single matrix:
+    # degenerate = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)[:, :, np.newaxis]
+    # out = blend(degenerate, img, factor)
+    # M = (
+    #     np.eye(3) * factor
+    #     + np.float32([0.114, 0.587, 0.299]).reshape(3, 1) * (1. - factor)
+    # )[np.newaxis, np.newaxis, :]
+    saturation = np.float32(
+        [[0.886, -0.114, -0.114], [-0.587, 0.413, -0.587], [-0.299, -0.299, 0.701]]
+    )
+    luma = np.float32([[0.114], [0.587], [0.299]])
+    mix = saturation * factor + luma
+    mixed = np.matmul(img, mix)
+    return mixed.clip(0, 255).astype(np.uint8)
+
+
+def contrast_func(img, factor):
+    """
+    Scale pixel values away from (or towards) the weighted mean intensity.
+
+    same output as PIL.ImageEnhance.Contrast
+    """
+    channel_weights = np.array([0.114, 0.587, 0.299])
+    mean = np.sum(np.mean(img, axis=(0, 1)) * channel_weights)
+    levels = np.arange(256)
+    lut = ((levels - mean) * factor + mean).clip(0, 255).astype(np.uint8)
+    return lut[img]
+
+
+def brightness_func(img, factor):
+    """
+    Multiply every pixel value by ``factor`` and clip to [0, 255]
+    (the operation of PIL.ImageEnhance.Brightness).
+    """
+    levels = np.arange(256, dtype=np.float32)
+    lut = np.clip(levels * factor, 0, 255).astype(np.uint8)
+    return lut[img]
+
+
+def sharpness_func(img, factor):
+    """
+    Blend the image with a smoothed copy of itself.
+
+    ``factor == 0.0`` returns the smoothed image, ``factor == 1.0`` returns
+    ``img`` itself, and any other value interpolates or extrapolates between
+    the two on the interior pixels (the one-pixel border keeps the input values).
+
+    The differences between this result and PIL are all on the 4 boundaries;
+    the center areas are the same.
+
+    Args:
+        img: image array with a trailing channel axis (it is sliced as
+            ``[1:-1, 1:-1, :]``).
+        factor: blend weight of the original image against the smoothed one.
+
+    Returns:
+        The blended image; uint8 in the general case.
+    """
+    kernel = np.ones((3, 3), dtype=np.float32)
+    kernel[1][1] = 5
+    kernel /= 13
+    degenerate = cv2.filter2D(img, -1, kernel)
+    if factor == 0.0:
+        out = degenerate
+    elif factor == 1.0:
+        out = img
+    else:
+        out = img.astype(np.float32)
+        degenerate = degenerate.astype(np.float32)[1:-1, 1:-1, :]
+        out[1:-1, 1:-1, :] = degenerate + factor * (out[1:-1, 1:-1, :] - degenerate)
+        # Extrapolation (factor > 1) can leave [0, 255]; clip before the cast so
+        # the uint8 conversion cannot wrap around.
+        out = out.clip(0, 255).astype(np.uint8)
+    return out
+
+
+def shear_x_func(img, factor, fill=(0, 0, 0)):
+    """Shear the image along x by ``factor``; uncovered pixels take ``fill``."""
+    height, width = img.shape[0], img.shape[1]
+    affine = np.float32([[1, factor, 0], [0, 1, 0]])
+    warped = cv2.warpAffine(
+        img, affine, (width, height), borderValue=fill, flags=cv2.INTER_LINEAR
+    )
+    return warped.astype(np.uint8)
+
+
+def translate_x_func(img, offset, fill=(0, 0, 0)):
+    """
+    Shift the image horizontally by ``-offset`` pixels; uncovered pixels take ``fill``.
+
+    same output as PIL.Image.transform
+    """
+    height, width = img.shape[0], img.shape[1]
+    affine = np.float32([[1, 0, -offset], [0, 1, 0]])
+    warped = cv2.warpAffine(
+        img, affine, (width, height), borderValue=fill, flags=cv2.INTER_LINEAR
+    )
+    return warped.astype(np.uint8)
+
+
+def translate_y_func(img, offset, fill=(0, 0, 0)):
+    """
+    Shift the image vertically by ``-offset`` pixels; uncovered pixels take ``fill``.
+
+    same output as PIL.Image.transform
+    """
+    height, width = img.shape[0], img.shape[1]
+    affine = np.float32([[1, 0, 0], [0, 1, -offset]])
+    warped = cv2.warpAffine(
+        img, affine, (width, height), borderValue=fill, flags=cv2.INTER_LINEAR
+    )
+    return warped.astype(np.uint8)
+
+
+def posterize_func(img, bits):
+    """
+    Keep only the ``bits`` most significant bits of every pixel value.
+
+    same output as PIL.ImageOps.posterize
+
+    Args:
+        img: uint8 image array.
+        bits: number of high-order bits to keep, 0..8.
+
+    Returns:
+        Array of the same shape with the low ``8 - bits`` bits cleared.
+    """
+    # Reduce the shifted mask to 8 bits *before* building the uint8 scalar:
+    # ``255 << (8 - bits)`` exceeds 255 for bits < 8, and NumPy 2 raises
+    # OverflowError when an out-of-range Python int is converted to uint8.
+    mask = np.uint8((0xFF << (8 - bits)) & 0xFF)
+    out = np.bitwise_and(img, mask)
+    return out
+
+
+def shear_y_func(img, factor, fill=(0, 0, 0)):
+    """Shear the image along y by ``factor``; uncovered pixels take ``fill``."""
+    height, width = img.shape[0], img.shape[1]
+    affine = np.float32([[1, 0, 0], [factor, 1, 0]])
+    warped = cv2.warpAffine(
+        img, affine, (width, height), borderValue=fill, flags=cv2.INTER_LINEAR
+    )
+    return warped.astype(np.uint8)
+
+
+def cutout_func(img, pad_size, replace=(0, 0, 0)):
+    """
+    Overwrite a randomly centred window with ``replace`` on a copy of ``img``.
+
+    The window extends ``pad_size // 2`` pixels from its centre in every
+    direction and is clipped to the image bounds; ``img`` itself is not modified.
+    """
+    fill = np.array(replace, dtype=np.uint8)
+    height, width = img.shape[0], img.shape[1]
+    rand_row, rand_col = np.random.random(2)
+    half = pad_size // 2
+    center_row, center_col = int(rand_row * height), int(rand_col * width)
+    top, bottom = max(center_row - half, 0), min(center_row + half, height)
+    left, right = max(center_col - half, 0), min(center_col + half, width)
+    out = img.copy()
+    out[top:bottom, left:right, :] = fill
+    return out
+
+
+### level to args
+def enhance_level_to_args(MAX_LEVEL):
+    """
+    Build a converter from an integer level to a one-element tuple holding the
+    enhancement factor ``(level / MAX_LEVEL) * 1.8 + 0.1``.
+    """
+
+    def level_to_args(level):
+        factor = (level / MAX_LEVEL) * 1.8 + 0.1
+        return (factor,)
+
+    return level_to_args
+
+
+def shear_level_to_args(MAX_LEVEL, replace_value):
+    """
+    Build a converter from a level to ``(shear_factor, replace_value)``.
+
+    The magnitude is ``(level / MAX_LEVEL) * 0.3``; it is negated whenever a
+    fresh ``np.random.random()`` draw exceeds 0.5.
+    """
+
+    def level_to_args(level):
+        magnitude = (level / MAX_LEVEL) * 0.3
+        negate = np.random.random() > 0.5
+        return (-magnitude if negate else magnitude, replace_value)
+
+    return level_to_args
+
+
+def translate_level_to_args(translate_const, MAX_LEVEL, replace_value):
+    """
+    Build a converter from a level to ``(offset, replace_value)``.
+
+    The magnitude is ``(level / MAX_LEVEL) * float(translate_const)``; it is
+    negated whenever a fresh ``np.random.random()`` draw exceeds 0.5.
+    """
+
+    def level_to_args(level):
+        magnitude = (level / MAX_LEVEL) * float(translate_const)
+        negate = np.random.random() > 0.5
+        return (-magnitude if negate else magnitude, replace_value)
+
+    return level_to_args
+
+
+def cutout_level_to_args(cutout_const, MAX_LEVEL, replace_value):
+    """
+    Build a converter from a level to ``(pad_size, replace_value)`` where
+    ``pad_size`` is ``int((level / MAX_LEVEL) * cutout_const)``.
+    """
+
+    def level_to_args(level):
+        pad_size = int((level / MAX_LEVEL) * cutout_const)
+        return (pad_size, replace_value)
+
+    return level_to_args
+
+
+def solarize_level_to_args(MAX_LEVEL):
+    """
+    Build a converter from a level to a one-element tuple holding the solarize
+    threshold ``int((level / MAX_LEVEL) * 256)``.
+    """
+
+    def level_to_args(level):
+        threshold = int((level / MAX_LEVEL) * 256)
+        return (threshold,)
+
+    return level_to_args
+
+
+def none_level_to_args(level):
+    """Converter for operations without extra arguments: always an empty tuple."""
+    return tuple()
+
+
+def posterize_level_to_args(MAX_LEVEL):
+    """
+    Build a converter from a level to a one-element tuple holding the number of
+    bits to keep, ``int((level / MAX_LEVEL) * 4)``.
+    """
+
+    def level_to_args(level):
+        bits = int((level / MAX_LEVEL) * 4)
+        return (bits,)
+
+    return level_to_args
+
+
+def rotate_level_to_args(MAX_LEVEL, replace_value):
+    """
+    Build a converter from a level to ``(degrees, replace_value)``.
+
+    The magnitude is ``(level / MAX_LEVEL) * 30``; it is negated whenever a
+    fresh ``np.random.random()`` draw is below 0.5.
+    """
+
+    def level_to_args(level):
+        degrees = (level / MAX_LEVEL) * 30
+        negate = np.random.random() < 0.5
+        return (-degrees if negate else degrees, replace_value)
+
+    return level_to_args
+
+
+# Maps an augmentation name to the function implementing it.
+# cutout_func is defined above but has no entry here.
+func_dict = {
+ "Identity": identity_func,
+ "AutoContrast": autocontrast_func,
+ "Equalize": equalize_func,
+ "Rotate": rotate_func,
+ "Solarize": solarize_func,
+ "Color": color_func,
+ "Contrast": contrast_func,
+ "Brightness": brightness_func,
+ "Sharpness": sharpness_func,
+ "ShearX": shear_x_func,
+ "TranslateX": translate_x_func,
+ "TranslateY": translate_y_func,
+ "Posterize": posterize_func,
+ "ShearY": shear_y_func,
+}
+
+# Translation magnitude (in the units of the offset argument) reached at level == MAX_LEVEL.
+translate_const = 10
+# Level at which every level-to-args converter below reaches its full magnitude.
+MAX_LEVEL = 10
+# Passed as the `fill` argument of the rotate/shear/translate functions.
+replace_value = (128, 128, 128)
+# Maps an augmentation name to a callable that turns a level into the extra positional
+# arguments for the func_dict entry of the same name. The key order is also the order of
+# the default operation list built by the augmenter classes from `arg_dict.keys()`.
+arg_dict = {
+ "Identity": none_level_to_args,
+ "AutoContrast": none_level_to_args,
+ "Equalize": none_level_to_args,
+ "Rotate": rotate_level_to_args(MAX_LEVEL, replace_value),
+ "Solarize": solarize_level_to_args(MAX_LEVEL),
+ "Color": enhance_level_to_args(MAX_LEVEL),
+ "Contrast": enhance_level_to_args(MAX_LEVEL),
+ "Brightness": enhance_level_to_args(MAX_LEVEL),
+ "Sharpness": enhance_level_to_args(MAX_LEVEL),
+ "ShearX": shear_level_to_args(MAX_LEVEL, replace_value),
+ "TranslateX": translate_level_to_args(translate_const, MAX_LEVEL, replace_value),
+ "TranslateY": translate_level_to_args(translate_const, MAX_LEVEL, replace_value),
+ "Posterize": posterize_level_to_args(MAX_LEVEL),
+ "ShearY": shear_level_to_args(MAX_LEVEL, replace_value),
+}
+
+
+class RandomAugment(object):
+    """
+    Applies ``N`` randomly sampled augmentations at level ``M`` to one image.
+
+    Each sampled operation is applied with probability 0.5. With ``isPIL`` set,
+    the input is first converted with ``np.array``.
+    """
+
+    def __init__(self, N=2, M=10, isPIL=False, augs=[]):
+        self.N = N
+        self.M = M
+        self.isPIL = isPIL
+        # An empty ``augs`` selects every operation listed in ``arg_dict``.
+        self.augs = augs if augs else list(arg_dict.keys())
+
+    def get_random_ops(self):
+        """Sample ``N`` operation names and pair each with probability 0.5 and level ``M``."""
+        ops = []
+        for op in np.random.choice(self.augs, self.N):
+            ops.append((op, 0.5, self.M))
+        return ops
+
+    def __call__(self, img):
+        if self.isPIL:
+            img = np.array(img)
+        for name, prob, level in self.get_random_ops():
+            # One uniform draw per operation decides whether it is applied.
+            if np.random.random() <= prob:
+                args = arg_dict[name](level)
+                img = func_dict[name](img, *args)
+        return img
+
+
+class VideoRandomAugment(object):
+    """
+    Applies the same randomly sampled augmentations to every frame of a clip.
+
+    ``N`` distinct operations are sampled once per call, each at level ``M``;
+    one boolean per operation (a uniform draw compared against ``p``) decides
+    whether it is applied, and that decision is shared by all frames.
+    """
+
+    def __init__(self, N=2, M=10, p=0.0, tensor_in_tensor_out=True, augs=[]):
+        self.N = N
+        self.M = M
+        self.p = p
+        self.tensor_in_tensor_out = tensor_in_tensor_out
+        # An empty ``augs`` selects every operation listed in ``arg_dict``.
+        self.augs = augs if augs else list(arg_dict.keys())
+
+    def get_random_ops(self):
+        """Sample ``N`` distinct operation names and pair each with level ``M``."""
+        ops = []
+        for op in np.random.choice(self.augs, self.N, replace=False):
+            ops.append((op, self.M))
+        return ops
+
+    def __call__(self, frames):
+        assert (
+            frames.shape[-1] == 3
+        ), "Expecting last dimension for 3-channels RGB (b, h, w, c)."
+
+        if self.tensor_in_tensor_out:
+            frames = frames.numpy().astype(np.uint8)
+
+        # Drawn once and reused for every frame along the first axis.
+        shared_ops = self.get_random_ops()
+        shared_mask = np.random.random(size=self.N) > self.p
+
+        augmented = [self._aug(frame, shared_ops, shared_mask) for frame in frames]
+        return torch.stack(augmented, dim=0).float()
+
+    def _aug(self, img, ops, apply_or_not):
+        """Apply the selected operations to one frame and return it as a tensor."""
+        for i, (name, level) in enumerate(ops):
+            if apply_or_not[i]:
+                args = arg_dict[name](level)
+                img = func_dict[name](img, *args)
+        return torch.from_numpy(img)
+
+
+if __name__ == "__main__":
+    # Smoke test. Most augmentations index 256-entry lookup tables with the
+    # pixel values, so the sample image must be uint8 rather than float noise.
+    a = RandomAugment()
+    img = np.random.randint(0, 256, size=(32, 32, 3), dtype=np.uint8)
+    a(img)
diff --git a/minigpt4/runners/__init__.py b/minigpt4/runners/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..5d1dae7693eabffc2400d29642f56aa5115d43ca
--- /dev/null
+++ b/minigpt4/runners/__init__.py
@@ -0,0 +1,10 @@
+"""
+ Copyright (c) 2022, salesforce.com, inc.
+ All rights reserved.
+ SPDX-License-Identifier: BSD-3-Clause
+ For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+"""
+
+from minigpt4.runners.runner_base import RunnerBase
+
+__all__ = ["RunnerBase"]
diff --git a/minigpt4/runners/runner_base.py b/minigpt4/runners/runner_base.py
new file mode 100644
index 0000000000000000000000000000000000000000..5f667f213d3874e3b616080df22de9ff91a9844b
--- /dev/null
+++ b/minigpt4/runners/runner_base.py
@@ -0,0 +1,658 @@
+"""
+ Copyright (c) 2022, salesforce.com, inc.
+ All rights reserved.
+ SPDX-License-Identifier: BSD-3-Clause
+ For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+"""
+
+import datetime
+import json
+import logging
+import os
+import time
+from pathlib import Path
+
+import torch
+import torch.distributed as dist
+import webdataset as wds
+from minigpt4.common.dist_utils import (
+ download_cached_file,
+ get_rank,
+ get_world_size,
+ is_main_process,
+ main_process,
+)
+from minigpt4.common.registry import registry
+from minigpt4.common.utils import is_url
+from minigpt4.datasets.data_utils import concat_datasets, reorg_datasets_by_split, ChainDataset
+from minigpt4.datasets.datasets.dataloader_utils import (
+ IterLoader,
+ MultiIterLoader,
+ PrefetchLoader,
+)
+from torch.nn.parallel import DistributedDataParallel as DDP
+from torch.utils.data import DataLoader, DistributedSampler
+
+
+@registry.register_runner("runner_base")
+class RunnerBase:
+ """
+ A runner class to train and evaluate a model given a task and datasets.
+
+ The runner uses pytorch distributed data parallel by default. Future release
+ will support other distributed frameworks.
+ """
+
+    def __init__(self, cfg, task, model, datasets, job_id):
+        """Store the run inputs, reset the lazily built members and set up the output dir."""
+        # Inputs handed in by the caller.
+        self.config = cfg
+        self.job_id = job_id
+        self.task = task
+        self.datasets = datasets
+        self._model = model
+
+        # Members populated on first access by the corresponding properties.
+        self._wrapped_model = self._device = self._optimizer = None
+        self._scaler = self._dataloaders = self._lr_sched = None
+
+        self.start_epoch = 0
+
+        # self.setup_seeds()
+        self.setup_output_dir()
+
+    @property
+    def device(self):
+        """``torch.device`` built from ``config.run_cfg.device`` on first access, then cached."""
+        if self._device is not None:
+            return self._device
+
+        self._device = torch.device(self.config.run_cfg.device)
+        return self._device
+
+    @property
+    def use_distributed(self):
+        """Value of ``config.run_cfg.distributed``."""
+        distributed = self.config.run_cfg.distributed
+        return distributed
+
+    @property
+    def model(self):
+        """
+        The model on ``self.device``, wrapped in DDP when distributed training is on.
+
+        The DDP wrapper is created once and reused; without distributed training
+        the bare model is returned.
+        """
+        # move model to device
+        if self._model.device != self.device:
+            self._model = self._model.to(self.device)
+
+        # distributed training wrapper
+        if not self.use_distributed:
+            self._wrapped_model = self._model
+        elif self._wrapped_model is None:
+            self._wrapped_model = DDP(
+                self._model, device_ids=[self.config.run_cfg.gpu]
+            )
+
+        return self._wrapped_model
+
+    @property
+    def optimizer(self):
+        """
+        AdamW optimizer over the trainable parameters, built on first access.
+
+        Parameters with ``requires_grad`` unset are skipped. Parameters with fewer
+        than two dimensions, or whose name contains "bias", "ln" or "bn", go into
+        a group with weight decay 0; all others use ``run_cfg.weight_decay``.
+        The learning rate is ``run_cfg.init_lr`` and beta2 is ``run_cfg.beta2``
+        (0.999 when absent).
+        """
+        # TODO make optimizer class and configurations
+        if self._optimizer is None:
+            num_parameters = 0
+            p_wd, p_non_wd = [], []
+            for n, p in self.model.named_parameters():
+                if not p.requires_grad:
+                    continue  # frozen weights
+                # Parameter names go to the debug log rather than stdout.
+                logging.debug("trainable parameter: %s", n)
+                if p.ndim < 2 or "bias" in n or "ln" in n or "bn" in n:
+                    p_non_wd.append(p)
+                else:
+                    p_wd.append(p)
+                num_parameters += p.data.nelement()
+            logging.info("number of trainable parameters: %d" % num_parameters)
+            optim_params = [
+                {
+                    "params": p_wd,
+                    "weight_decay": float(self.config.run_cfg.weight_decay),
+                },
+                {"params": p_non_wd, "weight_decay": 0},
+            ]
+            beta2 = self.config.run_cfg.get("beta2", 0.999)
+            self._optimizer = torch.optim.AdamW(
+                optim_params,
+                lr=float(self.config.run_cfg.init_lr),
+                weight_decay=float(self.config.run_cfg.weight_decay),
+                betas=(0.9, beta2),
+            )
+
+        return self._optimizer
+
+    @property
+    def scaler(self):
+        """
+        Gradient scaler for mixed precision.
+
+        A ``torch.cuda.amp.GradScaler`` is created on first access when
+        ``run_cfg.amp`` is truthy; otherwise the stored value (initially None)
+        is returned as is.
+        """
+        amp_enabled = self.config.run_cfg.get("amp", False)
+
+        if amp_enabled and self._scaler is None:
+            self._scaler = torch.cuda.amp.GradScaler()
+
+        return self._scaler
+
+    @property
+    def lr_scheduler(self):
+        """
+        Learning rate scheduler, created on first access and cached afterwards.
+
+        The scheduler class is looked up in the registry under ``run_cfg.lr_sched``.
+        """
+        if self._lr_sched is not None:
+            return self._lr_sched
+
+        run_cfg = self.config.run_cfg
+        lr_sched_cls = registry.get_lr_scheduler_class(run_cfg.lr_sched)
+
+        # These three are read from attributes of the runner rather than from
+        # run_cfg directly.
+        max_epoch = self.max_epoch
+        min_lr = self.min_lr
+        init_lr = self.init_lr
+
+        # optional parameters
+        decay_rate = run_cfg.get("lr_decay_rate", None)
+        warmup_start_lr = run_cfg.get("warmup_lr", -1)
+        warmup_steps = run_cfg.get("warmup_steps", 0)
+        iters_per_epoch = run_cfg.get("iters_per_epoch", None)
+
+        if iters_per_epoch is None:
+            # Fall back to the length of the train loader, or to 10000 when
+            # that loader has no usable length.
+            try:
+                iters_per_epoch = len(self.dataloaders['train'])
+            except (AttributeError, TypeError):
+                iters_per_epoch = 10000
+
+        self._lr_sched = lr_sched_cls(
+            optimizer=self.optimizer,
+            max_epoch=max_epoch,
+            iters_per_epoch=iters_per_epoch,
+            min_lr=min_lr,
+            init_lr=init_lr,
+            decay_rate=decay_rate,
+            warmup_start_lr=warmup_start_lr,
+            warmup_steps=warmup_steps,
+        )
+        return self._lr_sched
+
+    @property
+    def dataloaders(self) -> dict:
+        """
+        Lazily build and cache one dataloader per split.
+
+        On first access the datasets are regrouped by split with
+        ``reorg_datasets_by_split``, record counts are logged, and the loaders
+        are created through ``create_loaders``. A split whose value is a tuple
+        or list of datasets gets one collate function per member dataset.
+
+        Currently do not support multiple datasets for validation and test.
+
+        Returns:
+            dict: {split_name: (tuples of) dataloader}
+        """
+        if self._dataloaders is not None:
+            return self._dataloaders
+
+        logging.info(
+            "dataset_ratios not specified, datasets will be concatenated (map-style datasets) or chained (webdataset.DataPipeline)."
+        )
+
+        self.datasets = reorg_datasets_by_split(self.datasets)
+
+        # print dataset statistics for every split
+        for split_name in self.datasets:
+            split_data = self.datasets[split_name]
+
+            if isinstance(split_data, (tuple, list)):
+                # mixed wds.DataPipeline and torch.utils.data.Dataset:
+                # streaming members contribute no length
+                num_records = sum(
+                    0 if type(d) in [wds.DataPipeline, ChainDataset] else len(d)
+                    for d in split_data
+                )
+            elif hasattr(split_data, "__len__"):
+                # a single map-style dataset
+                num_records = len(split_data)
+            else:
+                # a single wds.DataPipeline
+                num_records = -1
+                logging.info(
+                    "Only a single wds.DataPipeline dataset, no __len__ attribute."
+                )
+
+            if num_records >= 0:
+                logging.info(
+                    "Loaded {} records for {} split from the dataset.".format(
+                        num_records, split_name
+                    )
+                )
+
+        # create dataloaders, one entry per split in sorted order
+        split_names = sorted(self.datasets.keys())
+        per_split_data = [self.datasets[name] for name in split_names]
+        train_flags = [name in self.train_splits for name in split_names]
+
+        batch_sizes = []
+        for name in split_names:
+            if name == "train":
+                batch_sizes.append(self.config.run_cfg.batch_size_train)
+            else:
+                batch_sizes.append(self.config.run_cfg.batch_size_eval)
+
+        collate_fns = [
+            [getattr(d, "collater", None) for d in data]
+            if isinstance(data, (tuple, list))
+            else getattr(data, "collater", None)
+            for data in per_split_data
+        ]
+
+        loaders = self.create_loaders(
+            datasets=per_split_data,
+            num_workers=self.config.run_cfg.num_workers,
+            batch_sizes=batch_sizes,
+            is_trains=train_flags,
+            collate_fns=collate_fns,
+        )
+
+        self._dataloaders = dict(zip(split_names, loaders))
+        return self._dataloaders
+
+    @property
+    def cuda_enabled(self):
+        """True when the runner's device type is "cuda"."""
+        device_type = self.device.type
+        return device_type == "cuda"
+
+    @property
+    def max_epoch(self):
+        """``run_cfg.max_epoch`` converted to int."""
+        configured = self.config.run_cfg.max_epoch
+        return int(configured)
+
+    @property
+    def log_freq(self):
+        """``run_cfg.log_freq`` converted to int; defaults to 50 when unset."""
+        return int(self.config.run_cfg.get("log_freq", 50))
+
+    @property
+    def init_lr(self):
+        """``run_cfg.init_lr`` converted to float."""
+        configured = self.config.run_cfg.init_lr
+        return float(configured)
+
+    @property
+    def min_lr(self):
+        """``run_cfg.min_lr`` converted to float."""
+        configured = self.config.run_cfg.min_lr
+        return float(configured)
+
+    @property
+    def accum_grad_iters(self):
+        """``run_cfg.accum_grad_iters`` converted to int; defaults to 1 when unset."""
+        configured = self.config.run_cfg.get("accum_grad_iters", 1)
+        return int(configured)
+
+    @property
+    def valid_splits(self):
+        """
+        ``run_cfg.valid_splits`` (empty list when unset).
+
+        Logs an info message on every access that finds no validation split.
+        """
+        splits = self.config.run_cfg.get("valid_splits", [])
+
+        if len(splits) < 1:
+            logging.info("No validation splits found.")
+
+        return splits
+
+    @property
+    def test_splits(self):
+        """``run_cfg.test_splits`` (empty list when unset)."""
+        return self.config.run_cfg.get("test_splits", [])
+
+    @property
+    def train_splits(self):
+        """
+        ``run_cfg.train_splits`` (empty list when unset).
+
+        Logs an info message on every access that finds no train split.
+        """
+        splits = self.config.run_cfg.get("train_splits", [])
+
+        if len(splits) < 1:
+            logging.info("Empty train splits.")
+
+        return splits
+
+    @property
+    def evaluate_only(self):
+        """
+        Value of ``run_cfg.evaluate``; when true, training is skipped.
+        """
+        run_cfg = self.config.run_cfg
+        return run_cfg.evaluate
+
+    @property
+    def use_dist_eval_sampler(self):
+        """``run_cfg.use_dist_eval_sampler``; defaults to True when unset."""
+        run_cfg = self.config.run_cfg
+        return run_cfg.get("use_dist_eval_sampler", True)
+
+    @property
+    def resume_ckpt_path(self):
+        """``run_cfg.resume_ckpt_path``; None when unset."""
+        run_cfg = self.config.run_cfg
+        return run_cfg.get("resume_ckpt_path", None)
+
+    @property
+    def train_loader(self):
+        """The dataloader registered under the "train" split."""
+        return self.dataloaders["train"]
+
+    def setup_output_dir(self):
+        """
+        Create the job's output and result directories and register them.
+
+        The output directory is ``<library_root>/<run_cfg.output_dir>/<job_id>``
+        and the result directory is its ``result`` sub-directory. Both paths
+        are registered in the registry and stored on the runner.
+        """
+        library_root = Path(registry.get_path("library_root"))
+
+        out_dir = library_root / self.config.run_cfg.output_dir / self.job_id
+        res_dir = out_dir / "result"
+
+        for directory in (out_dir, res_dir):
+            directory.mkdir(parents=True, exist_ok=True)
+
+        registry.register_path("result_dir", str(res_dir))
+        registry.register_path("output_dir", str(out_dir))
+
+        self.result_dir = res_dir
+        self.output_dir = out_dir
+
+    def train(self):
+        """
+        Run the full train / validate / test schedule.
+
+        For every epoch from ``self.start_epoch`` up to ``self.max_epoch``:
+        train (unless in evaluate-only mode), then evaluate on each validation
+        split. On the main process, a new best ``agg_metrics`` on the "val"
+        split saves ``checkpoint_best.pth``. Without validation splits a
+        checkpoint is saved after every training epoch instead. After the
+        loop the test splits are evaluated, using the best checkpoint when
+        validation splits exist.
+        """
+        start_time = time.time()
+        best_agg_metric = 0
+        best_epoch = 0
+
+        self.log_config()
+
+        # resume from checkpoint if specified
+        if not self.evaluate_only and self.resume_ckpt_path is not None:
+            self._load_checkpoint(self.resume_ckpt_path)
+
+        # Keep cur_epoch bound even when the loop body never runs (e.g. when
+        # resuming a run that already reached max_epoch); it is read again in
+        # the testing phase below.
+        cur_epoch = self.start_epoch - 1
+
+        for cur_epoch in range(self.start_epoch, self.max_epoch):
+            # training phase
+            if not self.evaluate_only:
+                logging.info("Start training")
+                train_stats = self.train_epoch(cur_epoch)
+                self.log_stats(split_name="train", stats=train_stats)
+
+            # evaluation phase
+            if len(self.valid_splits) > 0:
+                for split_name in self.valid_splits:
+                    logging.info("Evaluating on {}.".format(split_name))
+
+                    val_log = self.eval_epoch(
+                        split_name=split_name, cur_epoch=cur_epoch
+                    )
+                    if val_log is not None:
+                        if is_main_process():
+                            assert (
+                                "agg_metrics" in val_log
+                            ), "No agg_metrics found in validation log."
+
+                            agg_metrics = val_log["agg_metrics"]
+                            if agg_metrics > best_agg_metric and split_name == "val":
+                                best_epoch, best_agg_metric = cur_epoch, agg_metrics
+
+                                self._save_checkpoint(cur_epoch, is_best=True)
+
+                            val_log.update({"best_epoch": best_epoch})
+                            self.log_stats(val_log, split_name)
+
+            else:
+                # if no validation split is provided, we just save the checkpoint at the end of each epoch.
+                if not self.evaluate_only:
+                    self._save_checkpoint(cur_epoch, is_best=False)
+
+            if self.evaluate_only:
+                # a single evaluation pass is enough
+                break
+
+            if self.config.run_cfg.distributed:
+                dist.barrier()
+
+        # testing phase
+        test_epoch = "best" if len(self.valid_splits) > 0 else cur_epoch
+        self.evaluate(cur_epoch=test_epoch, skip_reload=self.evaluate_only)
+
+        total_time = time.time() - start_time
+        total_time_str = str(datetime.timedelta(seconds=int(total_time)))
+        logging.info("Training time {}".format(total_time_str))
+
+    def evaluate(self, cur_epoch="best", skip_reload=False):
+        """
+        Evaluate on every configured test split.
+
+        Returns:
+            dict: {split_name: result of ``eval_epoch`` for that split};
+            empty when no test split is configured.
+        """
+        return {
+            split_name: self.eval_epoch(
+                split_name=split_name, cur_epoch=cur_epoch, skip_reload=skip_reload
+            )
+            for split_name in self.test_splits
+        }
+
+    def train_epoch(self, epoch):
+        """
+        Put the model in training mode and delegate one epoch to the task.
+
+        Returns whatever ``self.task.train_epoch`` returns.
+        """
+        self.model.train()
+
+        epoch_kwargs = dict(
+            epoch=epoch,
+            model=self.model,
+            data_loader=self.train_loader,
+            optimizer=self.optimizer,
+            scaler=self.scaler,
+            lr_scheduler=self.lr_scheduler,
+            cuda_enabled=self.cuda_enabled,
+            log_freq=self.log_freq,
+            accum_grad_iters=self.accum_grad_iters,
+        )
+        return self.task.train_epoch(**epoch_kwargs)
+
+    @torch.no_grad()
+    def eval_epoch(self, split_name, cur_epoch, skip_reload=False):
+        """
+        Evaluate the model on a given split.
+
+        Args:
+            split_name (str): name of the split to evaluate on.
+            cur_epoch (int or str): current epoch, or "best" to evaluate the
+                best checkpoint.
+            skip_reload (bool): when False and ``cur_epoch`` is "best", the
+                best checkpoint is reloaded before evaluating; when True the
+                weights currently held by the model are used.
+
+        Returns:
+            The value of ``task.after_evaluation`` when the task produced
+            results, otherwise None.
+        """
+        loader = self.dataloaders.get(split_name, None)
+        assert loader, "data_loader for split {} is None.".format(split_name)
+
+        # TODO In validation, you need to compute loss as well as metrics
+        # TODO consider moving to model.before_evaluation()
+        net = self.unwrap_dist_model(self.model)
+        if cur_epoch == "best" and not skip_reload:
+            net = self._reload_best_model(net)
+        net.eval()
+
+        self.task.before_evaluation(
+            model=net,
+            dataset=self.datasets[split_name],
+        )
+        outputs = self.task.evaluation(net, loader)
+
+        if outputs is None:
+            return None
+
+        return self.task.after_evaluation(
+            val_result=outputs,
+            split_name=split_name,
+            epoch=cur_epoch,
+        )
+
+    def unwrap_dist_model(self, model):
+        """Return ``model.module`` in distributed mode, else ``model`` itself."""
+        return model.module if self.use_distributed else model
+
+    def create_loaders(
+        self,
+        datasets,
+        num_workers,
+        batch_sizes,
+        is_trains,
+        collate_fns,
+        dataset_ratios=None,
+    ):
+        """
+        Create dataloaders for training and validation.
+
+        Args:
+            datasets: one entry per split; each entry is a dataset or a
+                list/tuple of datasets.
+            num_workers: worker count passed to every DataLoader.
+            batch_sizes: batch size per split.
+            is_trains: per-split flag; training loaders are shuffled, drop
+                the last incomplete batch and are wrapped in ``IterLoader``.
+            collate_fns: per-split collate function, or a list of them when
+                the split holds several datasets.
+            dataset_ratios: optional sampling ratios for multi-dataset
+                splits. When None, the ratios are read from each dataset's
+                ``sample_ratio`` attribute if the first dataset has one.
+
+        Returns:
+            list: one loader per split, in the order of ``datasets``.
+        """
+
+        def _create_loader(dataset, num_workers, bsz, is_train, collate_fn):
+            # create a single dataloader for each split
+            if isinstance(dataset, (ChainDataset, wds.DataPipeline)):
+                # wds.WebdDataset instance are chained together
+                # webdataset.DataPipeline has its own sampler and collate_fn
+                loader = iter(
+                    DataLoader(
+                        dataset,
+                        batch_size=bsz,
+                        num_workers=num_workers,
+                        pin_memory=True,
+                    )
+                )
+            else:
+                # map-style dataset are concatenated together
+                # setup distributed sampler
+                if self.use_distributed:
+                    sampler = DistributedSampler(
+                        dataset,
+                        shuffle=is_train,
+                        num_replicas=get_world_size(),
+                        rank=get_rank(),
+                    )
+                    if not self.use_dist_eval_sampler:
+                        # e.g. retrieval evaluation
+                        sampler = sampler if is_train else None
+                else:
+                    sampler = None
+
+                loader = DataLoader(
+                    dataset,
+                    batch_size=bsz,
+                    num_workers=num_workers,
+                    pin_memory=True,
+                    sampler=sampler,
+                    # DataLoader does not accept shuffle together with a sampler
+                    shuffle=sampler is None and is_train,
+                    collate_fn=collate_fn,
+                    drop_last=True if is_train else False,
+                )
+                loader = PrefetchLoader(loader)
+
+                if is_train:
+                    loader = IterLoader(loader, use_distributed=self.use_distributed)
+
+            return loader
+
+        loaders = []
+
+        for dataset, bsz, is_train, collate_fn in zip(
+            datasets, batch_sizes, is_trains, collate_fns
+        ):
+            if isinstance(dataset, (list, tuple)):
+                # Resolve the ratios per split. Writing back to
+                # ``dataset_ratios`` would leak the ratios of the first
+                # multi-dataset split into every later one.
+                ratios = dataset_ratios
+                if ratios is None and hasattr(dataset[0], 'sample_ratio'):
+                    ratios = [d.sample_ratio for d in dataset]
+                loader = MultiIterLoader(
+                    loaders=[
+                        _create_loader(d, num_workers, bsz, is_train, collate_fn[i])
+                        for i, d in enumerate(dataset)
+                    ],
+                    ratios=ratios,
+                )
+            else:
+                loader = _create_loader(dataset, num_workers, bsz, is_train, collate_fn)
+
+            loaders.append(loader)
+
+        return loaders
+
+    @main_process
+    def _save_checkpoint(self, cur_epoch, is_best=False):
+        """
+        Save the checkpoint at the current epoch.
+
+        Parameters that do not require gradients are removed from the saved
+        model state, so the file holds only the trainable weights (plus any
+        state-dict entries that are not parameters). The file is named
+        ``checkpoint_best.pth`` when ``is_best`` is true, otherwise
+        ``checkpoint_<cur_epoch>.pth``.
+        """
+        bare_model = self.unwrap_dist_model(self.model)
+        trainable = {
+            name: param.requires_grad
+            for name, param in bare_model.named_parameters()
+        }
+        state_dict = bare_model.state_dict()
+        # delete parameters that do not require gradient
+        frozen_keys = [key for key in state_dict.keys() if not trainable.get(key, True)]
+        for key in frozen_keys:
+            del state_dict[key]
+
+        save_obj = {
+            "model": state_dict,
+            "optimizer": self.optimizer.state_dict(),
+            "config": self.config.to_dict(),
+            "scaler": self.scaler.state_dict() if self.scaler else None,
+            "epoch": cur_epoch,
+        }
+        tag = "best" if is_best else cur_epoch
+        save_to = os.path.join(self.output_dir, "checkpoint_{}.pth".format(tag))
+        logging.info("Saving checkpoint at epoch {} to {}.".format(cur_epoch, save_to))
+        torch.save(save_obj, save_to)
+
+    def _reload_best_model(self, model):
+        """
+        Load ``checkpoint_best.pth`` from the output directory into ``model``.
+
+        A strict load is attempted first; on a RuntimeError (key mismatch) a
+        warning is logged and the load is retried with ``strict=False``.
+
+        Returns:
+            The same ``model`` object with the checkpoint weights loaded.
+        """
+        best_path = os.path.join(self.output_dir, "checkpoint_best.pth")
+
+        logging.info("Loading checkpoint from {}.".format(best_path))
+        best_state = torch.load(best_path, map_location="cpu")["model"]
+        try:
+            model.load_state_dict(best_state)
+        except RuntimeError:
+            logging.warning(
+ """
+ Key mismatch when loading checkpoint. This is expected if only part of the model is saved.
+ Trying to load the model with strict=False.
+ """
+            )
+            model.load_state_dict(best_state, strict=False)
+        return model
+
+    def _load_checkpoint(self, url_or_filename):
+        """
+        Resume from a checkpoint.
+
+        Restores the model weights, the optimizer state, the gradient scaler
+        state (when both the runner and the checkpoint have one) and sets
+        ``self.start_epoch`` to the epoch after the saved one.
+
+        Args:
+            url_or_filename (str): URL or local path of the checkpoint file.
+
+        Raises:
+            RuntimeError: if the argument is neither a URL nor an existing file.
+        """
+        if is_url(url_or_filename):
+            cached_file = download_cached_file(
+                url_or_filename, check_hash=False, progress=True
+            )
+            checkpoint = torch.load(cached_file, map_location=self.device)
+        elif os.path.isfile(url_or_filename):
+            checkpoint = torch.load(url_or_filename, map_location=self.device)
+        else:
+            raise RuntimeError("checkpoint url or path is invalid")
+
+        state_dict = checkpoint["model"]
+        # ``strict`` is an argument of load_state_dict, not of torch.load.
+        # _save_checkpoint drops frozen parameters, so the saved state dict is
+        # partial and has to be loaded non-strictly.
+        self.unwrap_dist_model(self.model).load_state_dict(state_dict, strict=False)
+
+        self.optimizer.load_state_dict(checkpoint["optimizer"])
+        # _save_checkpoint stores None under "scaler" when AMP was disabled
+        if self.scaler and checkpoint.get("scaler") is not None:
+            self.scaler.load_state_dict(checkpoint["scaler"])
+
+        self.start_epoch = checkpoint["epoch"] + 1
+        logging.info("Resume checkpoint from {}".format(url_or_filename))
+
+    @main_process
+    def log_stats(self, stats, split_name):
+        """
+        Append one JSON line of stats to ``log.txt`` in the output directory.
+
+        Every key is prefixed with ``<split_name>_``. Only dict stats are
+        written; any other type (including list) is ignored.
+        """
+        if not isinstance(stats, dict):
+            return
+
+        prefixed = {f"{split_name}_{key}": value for key, value in stats.items()}
+        log_path = os.path.join(self.output_dir, "log.txt")
+        with open(log_path, "a") as log_file:
+            log_file.write(json.dumps(prefixed) + "\n")
+
+    @main_process
+    def log_config(self):
+        """Append the run configuration, as indented JSON, to ``log.txt``."""
+        config_dump = json.dumps(self.config.to_dict(), indent=4)
+        log_path = os.path.join(self.output_dir, "log.txt")
+        with open(log_path, "a") as log_file:
+            log_file.write(config_dump + "\n")
diff --git a/minigpt4/tasks/__init__.py b/minigpt4/tasks/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..82913e9c1eefeb852eb58d9e4bcaedb8f832ae3b
--- /dev/null
+++ b/minigpt4/tasks/__init__.py
@@ -0,0 +1,26 @@
+"""
+ Copyright (c) 2022, salesforce.com, inc.
+ All rights reserved.
+ SPDX-License-Identifier: BSD-3-Clause
+ For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+"""
+
+from minigpt4.common.registry import registry
+from minigpt4.tasks.base_task import BaseTask
+from minigpt4.tasks.image_text_pretrain import ImageTextPretrainTask
+
+
+def setup_task(cfg):
+    """
+    Instantiate the task named by ``cfg.run_cfg.task``.
+
+    The task class is looked up in the registry and built through its
+    ``setup_task`` classmethod.
+
+    Args:
+        cfg: configuration object exposing ``run_cfg``.
+
+    Returns:
+        The task instance.
+
+    Raises:
+        AssertionError: if no task name is configured, or the lookup or the
+            task construction yields None.
+    """
+    assert "task" in cfg.run_cfg, "Task name must be provided."
+
+    task_name = cfg.run_cfg.task
+
+    # Check the lookup result before using it, so an unknown name reports the
+    # registration error instead of failing on the attribute access below.
+    # NOTE(review): assumes the registry returns None for unknown names - confirm.
+    task_cls = registry.get_task_class(task_name)
+    assert task_cls is not None, "Task {} not properly registered.".format(task_name)
+
+    task = task_cls.setup_task(cfg=cfg)
+    assert task is not None, "Task {} not properly registered.".format(task_name)
+
+    return task
+
+
+__all__ = [
+ "BaseTask",
+ "ImageTextPretrainTask",
+]
diff --git a/minigpt4/tasks/base_task.py b/minigpt4/tasks/base_task.py
new file mode 100644
index 0000000000000000000000000000000000000000..9f82a2a52779a782e5a40dfb6a6d9a57e991e345
--- /dev/null
+++ b/minigpt4/tasks/base_task.py
@@ -0,0 +1,286 @@
+"""
+ Copyright (c) 2022, salesforce.com, inc.
+ All rights reserved.
+ SPDX-License-Identifier: BSD-3-Clause
+ For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+"""
+
+import logging
+import os
+
+import torch
+import torch.distributed as dist
+from minigpt4.common.dist_utils import get_rank, get_world_size, is_main_process, is_dist_avail_and_initialized
+from minigpt4.common.logger import MetricLogger, SmoothedValue
+from minigpt4.common.registry import registry
+from minigpt4.datasets.data_utils import prepare_sample
+
+
+class BaseTask:
+    """
+    Base class for tasks.
+
+    Builds the model and the datasets from the config and provides the
+    generic training loop, the evaluation loop and a helper that merges
+    per-rank result files.
+    """
+
+    def __init__(self, **kwargs):
+        super().__init__()
+
+        # presumably the sample key holding a per-instance id - confirm with users
+        self.inst_id_key = "instance_id"
+
+    @classmethod
+    def setup_task(cls, **kwargs):
+        """Build a task instance; keyword arguments are accepted but unused."""
+        return cls()
+
+    def build_model(self, cfg):
+        """Build the model class registered under ``cfg.model_cfg.arch`` from its config."""
+        model_config = cfg.model_cfg
+
+        model_cls = registry.get_model_class(model_config.arch)
+        return model_cls.from_config(model_config)
+
+    def build_datasets(self, cfg):
+        """
+        Build a dictionary of datasets, keyed by dataset name.
+
+        Each value is what the dataset builder returns from
+        ``build_datasets()``. Its 'train' entry is tagged with the dataset
+        name and, when configured, with ``sample_ratio``.
+
+        Args:
+            cfg (common.config.Config): configuration exposing ``datasets_cfg``.
+
+        Returns:
+            dict: {dataset_name: datasets built by that dataset's builder}.
+        """
+
+        datasets = dict()
+
+        datasets_config = cfg.datasets_cfg
+
+        assert len(datasets_config) > 0, "At least one dataset has to be specified."
+
+        for name in datasets_config:
+            dataset_config = datasets_config[name]
+
+            builder = registry.get_builder_class(name)(dataset_config)
+            dataset = builder.build_datasets()
+
+            dataset['train'].name = name
+            if 'sample_ratio' in dataset_config:
+                dataset['train'].sample_ratio = dataset_config.sample_ratio
+
+            datasets[name] = dataset
+
+        return datasets
+
+    def train_step(self, model, samples):
+        """Run one forward pass and return the "loss" entry of the model output."""
+        loss = model(samples)["loss"]
+        return loss
+
+    def valid_step(self, model, samples):
+        """Evaluate one batch; must be implemented by subclasses."""
+        raise NotImplementedError
+
+    def before_evaluation(self, model, dataset, **kwargs):
+        """Forward the dataset and this task's type to ``model.before_evaluation``."""
+        model.before_evaluation(dataset=dataset, task_type=type(self))
+
+    def after_evaluation(self, **kwargs):
+        """Hook called with the evaluation results; does nothing by default."""
+        pass
+
+    def inference_step(self):
+        """Run inference; must be implemented by subclasses."""
+        raise NotImplementedError
+
+    def evaluation(self, model, data_loader, cuda_enabled=True):
+        """
+        Run ``valid_step`` over every batch of ``data_loader``.
+
+        Returns:
+            list: the concatenated outputs of all ``valid_step`` calls.
+        """
+        metric_logger = MetricLogger(delimiter="  ")
+        header = "Evaluation"
+        # TODO make it configurable
+        print_freq = 10
+
+        results = []
+
+        for samples in metric_logger.log_every(data_loader, print_freq, header):
+            samples = prepare_sample(samples, cuda_enabled=cuda_enabled)
+
+            eval_output = self.valid_step(model=model, samples=samples)
+            results.extend(eval_output)
+
+        if is_dist_avail_and_initialized():
+            dist.barrier()
+
+        return results
+
+    def train_epoch(
+        self,
+        epoch,
+        model,
+        data_loader,
+        optimizer,
+        lr_scheduler,
+        scaler=None,
+        cuda_enabled=False,
+        log_freq=50,
+        accum_grad_iters=1,
+    ):
+        """
+        Train for one epoch of ``lr_scheduler.iters_per_epoch`` iterations.
+
+        Returns the stats dict produced by ``_train_inner_loop``.
+        """
+        return self._train_inner_loop(
+            epoch=epoch,
+            iters_per_epoch=lr_scheduler.iters_per_epoch,
+            model=model,
+            data_loader=data_loader,
+            optimizer=optimizer,
+            scaler=scaler,
+            lr_scheduler=lr_scheduler,
+            log_freq=log_freq,
+            cuda_enabled=cuda_enabled,
+            accum_grad_iters=accum_grad_iters,
+        )
+
+    def train_iters(
+        self,
+        epoch,
+        start_iters,
+        iters_per_inner_epoch,
+        model,
+        data_loader,
+        optimizer,
+        lr_scheduler,
+        scaler=None,
+        cuda_enabled=False,
+        log_freq=50,
+        accum_grad_iters=1,
+    ):
+        """
+        Train for ``iters_per_inner_epoch`` iterations starting at ``start_iters``.
+
+        Returns the stats dict produced by ``_train_inner_loop``.
+        """
+        return self._train_inner_loop(
+            epoch=epoch,
+            start_iters=start_iters,
+            iters_per_epoch=iters_per_inner_epoch,
+            model=model,
+            data_loader=data_loader,
+            optimizer=optimizer,
+            scaler=scaler,
+            lr_scheduler=lr_scheduler,
+            log_freq=log_freq,
+            cuda_enabled=cuda_enabled,
+            accum_grad_iters=accum_grad_iters,
+        )
+
+    def _train_inner_loop(
+        self,
+        epoch,
+        iters_per_epoch,
+        model,
+        data_loader,
+        optimizer,
+        lr_scheduler,
+        scaler=None,
+        start_iters=None,
+        log_freq=50,
+        cuda_enabled=False,
+        accum_grad_iters=1,
+    ):
+        """
+        An inner training loop compatible with both epoch-based and iter-based training.
+
+        When using epoch-based, training stops after one epoch; when using iter-based,
+        training stops after #iters_per_epoch iterations.
+
+        Mixed precision is enabled when ``scaler`` is not None. The optimizer
+        steps every ``accum_grad_iters`` iterations.
+
+        Returns:
+            dict: {meter_name: global average formatted with three decimals}.
+        """
+        use_amp = scaler is not None
+
+        if not hasattr(data_loader, "__next__"):
+            # convert to iterator if not already
+            data_loader = iter(data_loader)
+
+        metric_logger = MetricLogger(delimiter="  ")
+        metric_logger.add_meter("lr", SmoothedValue(window_size=1, fmt="{value:.6f}"))
+        metric_logger.add_meter("loss", SmoothedValue(window_size=1, fmt="{value:.4f}"))
+
+        # if iter-based runner, schedule lr based on inner epoch.
+        logging.info(
+            "Start training epoch {}, {} iters per inner epoch.".format(
+                epoch, iters_per_epoch
+            )
+        )
+        header = "Train: data epoch: [{}]".format(epoch)
+        if start_iters is None:
+            # epoch-based runner
+            inner_epoch = epoch
+        else:
+            # In iter-based runner, we schedule the learning rate based on iterations.
+            inner_epoch = start_iters // iters_per_epoch
+            header = header + "; inner epoch [{}]".format(inner_epoch)
+
+        for i in metric_logger.log_every(range(iters_per_epoch), log_freq, header):
+            # if using iter-based runner, we stop after iters_per_epoch iterations.
+            if i >= iters_per_epoch:
+                break
+
+            samples = next(data_loader)
+
+            samples = prepare_sample(samples, cuda_enabled=cuda_enabled)
+            samples.update(
+                {
+                    "epoch": inner_epoch,
+                    "num_iters_per_epoch": iters_per_epoch,
+                    "iters": i,
+                }
+            )
+
+            lr_scheduler.step(cur_epoch=inner_epoch, cur_step=i)
+
+            with torch.cuda.amp.autocast(enabled=use_amp):
+                loss = self.train_step(model=model, samples=samples)
+
+            # after_train_step()
+            if use_amp:
+                scaler.scale(loss).backward()
+            else:
+                loss.backward()
+
+            # update gradients every accum_grad_iters iterations
+            if (i + 1) % accum_grad_iters == 0:
+                if use_amp:
+                    scaler.step(optimizer)
+                    scaler.update()
+                else:
+                    optimizer.step()
+                optimizer.zero_grad()
+
+            metric_logger.update(loss=loss.item())
+            metric_logger.update(lr=optimizer.param_groups[0]["lr"])
+
+        # after train_epoch()
+        # gather the stats from all processes
+        metric_logger.synchronize_between_processes()
+        logging.info("Averaged stats: " + str(metric_logger.global_avg()))
+        return {
+            k: "{:.3f}".format(meter.global_avg)
+            for k, meter in metric_logger.meters.items()
+        }
+
+    @staticmethod
+    def save_result(result, result_dir, filename, remove_duplicate=""):
+        """
+        Write this rank's results to disk and merge all ranks on the main process.
+
+        Every rank writes ``<filename>_rank<rank>.json``. After a barrier (in
+        distributed mode) the main process concatenates the per-rank files
+        into ``<filename>.json``.
+
+        Args:
+            result (list): JSON-serialisable results of this rank.
+            result_dir (str): directory receiving the files.
+            filename (str): base name of the result files, without extension.
+            remove_duplicate (str): when non-empty, key used to de-duplicate
+                merged entries; the first entry for each key value is kept.
+
+        Returns:
+            str: path of the merged result file (returned on every rank).
+        """
+        import json
+
+        result_file = os.path.join(
+            result_dir, "%s_rank%d.json" % (filename, get_rank())
+        )
+        final_result_file = os.path.join(result_dir, "%s.json" % filename)
+
+        # Close (and thereby flush) the file before the barrier so the main
+        # process never reads a partially written per-rank file.
+        with open(result_file, "w") as f:
+            json.dump(result, f)
+
+        if is_dist_avail_and_initialized():
+            dist.barrier()
+
+        if is_main_process():
+            logging.warning("rank %d starts merging results." % get_rank())
+            # combine results from all processes
+            result = []
+
+            for rank in range(get_world_size()):
+                result_file = os.path.join(
+                    result_dir, "%s_rank%d.json" % (filename, rank)
+                )
+                with open(result_file, "r") as f:
+                    res = json.load(f)
+                result += res
+
+            if remove_duplicate:
+                result_new = []
+                id_list = []
+                for res in result:
+                    if res[remove_duplicate] not in id_list:
+                        id_list.append(res[remove_duplicate])
+                        result_new.append(res)
+                result = result_new
+
+            with open(final_result_file, "w") as f:
+                json.dump(result, f)
+            print("result file saved to %s" % final_result_file)
+
+        return final_result_file
diff --git a/minigpt4/tasks/image_text_pretrain.py b/minigpt4/tasks/image_text_pretrain.py
new file mode 100644
index 0000000000000000000000000000000000000000..a2214a2e887799fa5236f165ac7329b60bc81d8f
--- /dev/null
+++ b/minigpt4/tasks/image_text_pretrain.py
@@ -0,0 +1,18 @@
+"""
+ Copyright (c) 2022, salesforce.com, inc.
+ All rights reserved.
+ SPDX-License-Identifier: BSD-3-Clause
+ For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+"""
+
+from minigpt4.common.registry import registry
+from minigpt4.tasks.base_task import BaseTask
+
+
+@registry.register_task("image_text_pretrain")
+class ImageTextPretrainTask(BaseTask):
+    """Task registered as "image_text_pretrain"; its evaluation is a no-op."""
+
+    def __init__(self):
+        super().__init__()
+
+    def evaluation(self, model, data_loader, cuda_enabled=True):
+        """Skip evaluation: nothing is computed and None is returned."""
+        return None
diff --git a/prompts/alignment.txt b/prompts/alignment.txt
new file mode 100644
index 0000000000000000000000000000000000000000..38ae75a9cee293861f06544cbff6fdc4aa941d85
--- /dev/null
+++ b/prompts/alignment.txt
@@ -0,0 +1,4 @@
+ Describe this image in detail.
+ Take a look at this image and describe what you notice.
+ Please provide a detailed description of the picture.
+ Could you describe the contents of this image for me?
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..bbc3244d2e2c0684f678dba0ccd7c4f87f10b179
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,28 @@
+contexttimer
+decord
+einops>=0.4.1
+fairscale==0.4.4
+ftfy
+iopath
+ipython
+omegaconf
+opencv-python-headless==4.5.5.64
+opendatasets
+packaging
+pandas
+plotly
+pre-commit
+pycocoevalcap
+pycocotools
+python-magic
+scikit-image
+sentencepiece
+spacy
+streamlit
+timm==0.4.12
+torch>=1.10.0
+torchvision
+tqdm
+transformers>=4.25.0,<4.27
+webdataset
+wheel
diff --git a/train.py b/train.py
new file mode 100644
index 0000000000000000000000000000000000000000..a2dd8946e95b70b5d8cc485e8d303c052c5506a6
--- /dev/null
+++ b/train.py
@@ -0,0 +1,103 @@
+"""
+ Copyright (c) 2022, salesforce.com, inc.
+ All rights reserved.
+ SPDX-License-Identifier: BSD-3-Clause
+ For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+"""
+
+import argparse
+import os
+import random
+
+import numpy as np
+import torch
+import torch.backends.cudnn as cudnn
+
+import minigpt4.tasks as tasks
+from minigpt4.common.config import Config
+from minigpt4.common.dist_utils import get_rank, init_distributed_mode
+from minigpt4.common.logger import setup_logger
+from minigpt4.common.optims import (
+ LinearWarmupCosineLRScheduler,
+ LinearWarmupStepLRScheduler,
+)
+from minigpt4.common.registry import registry
+from minigpt4.common.utils import now
+
+# imports modules for registration
+from minigpt4.datasets.builders import *
+from minigpt4.models import *
+from minigpt4.processors import *
+from minigpt4.runners import *
+from minigpt4.tasks import *
+
+
+def parse_args():
+    """
+    Parse the command line of the training script.
+
+    Returns:
+        argparse.Namespace: with ``cfg_path`` (required) and ``options``
+        (list of override strings, or None when not given).
+    """
+    options_help = (
+        "override some settings in the used config, the key-value pair "
+        "in xxx=yyy format will be merged into config file (deprecate), "
+        "change to --cfg-options instead."
+    )
+
+    parser = argparse.ArgumentParser(description="Training")
+    parser.add_argument("--cfg-path", required=True, help="path to configuration file.")
+    parser.add_argument("--options", nargs="+", help=options_help)
+
+    return parser.parse_args()
+
+
+def setup_seeds(config):
+    """
+    Seed Python, NumPy and PyTorch and make cuDNN deterministic.
+
+    The seed is ``config.run_cfg.seed`` offset by the process rank, so each
+    rank gets a different seed.
+    """
+    rank_seed = config.run_cfg.seed + get_rank()
+
+    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
+        seed_fn(rank_seed)
+
+    cudnn.benchmark = False
+    cudnn.deterministic = True
+
+
+def get_runner_class(cfg):
+    """
+    Look up the runner class named by ``cfg.run_cfg.runner`` in the registry.
+
+    The name defaults to "runner_base" when the config does not set one.
+    """
+    runner_name = cfg.run_cfg.get("runner", "runner_base")
+    return registry.get_runner_class(runner_name)
+
+
+def main():
+    """
+    Entry point: load the config, initialise distributed mode, seeds and
+    logging, then build the task, datasets, model and runner and start training.
+    """
+    # allow auto-dl completes on main process without timeout when using NCCL backend.
+    # os.environ["NCCL_BLOCKING_WAIT"] = "1"
+
+    # set before init_distributed_mode() to ensure the same job_id shared across all ranks.
+    job_id = now()
+
+    args = parse_args()
+    cfg = Config(args)
+
+    init_distributed_mode(cfg.run_cfg)
+
+    setup_seeds(cfg)
+
+    # set after init_distributed_mode() to only log on master.
+    setup_logger()
+
+    cfg.pretty_print()
+
+    task = tasks.setup_task(cfg)
+    datasets = task.build_datasets(cfg)
+    model = task.build_model(cfg)
+
+    runner_cls = get_runner_class(cfg)
+    runner = runner_cls(
+        cfg=cfg, job_id=job_id, task=task, model=model, datasets=datasets
+    )
+    runner.train()
+
+
+if __name__ == "__main__":
+ main()
diff --git a/train_configs/minigpt4_stage1_laion.yaml b/train_configs/minigpt4_stage1_laion.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..10d3bd4df77e7465f875651f8a8348d1595b2558
--- /dev/null
+++ b/train_configs/minigpt4_stage1_laion.yaml
@@ -0,0 +1,62 @@
+# Copyright (c) 2022, salesforce.com, inc.
+# All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+
+model:
+  arch: mini_gpt4
+  model_type: pretrain_vicuna
+  freeze_vit: true
+  freeze_qformer: true
+
+
+datasets:
+  laion:
+    vis_processor:
+      train:
+        name: "blip2_image_train"
+        image_size: 224
+    text_processor:
+      train:
+        name: "blip_caption"
+    sample_ratio: 115
+  cc_combine:
+    vis_processor:
+      train:
+        name: "blip2_image_train"
+        image_size: 224
+    text_processor:
+      train:
+        name: "blip_caption"
+    sample_ratio: 14
+
+
+run:
+  task: image_text_pretrain
+  # optimizer
+  lr_sched: "linear_warmup_cosine_lr"
+  init_lr: 1.0e-4
+  min_lr: 3.0e-5
+  warmup_lr: 1.0e-6
+
+  weight_decay: 0.05
+  max_epoch: 4
+  batch_size_train: 64
+  batch_size_eval: 64
+  num_workers: 4
+  warmup_steps: 5000
+  iters_per_epoch: 5000
+
+  seed: 42
+  output_dir: "/path/to/save/your/model/"
+
+  amp: true
+  resume_ckpt_path: null
+
+  evaluate: false
+  train_splits: ["train"]
+
+  device: "cuda"
+  world_size: 1
+  dist_url: "env://"
+  distributed: true
\ No newline at end of file
diff --git a/train_configs/minigpt4_stage2_align.yaml b/train_configs/minigpt4_stage2_align.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..f2fbf5c815d3114ceeaa621a266c49c3f56f91e7
--- /dev/null
+++ b/train_configs/minigpt4_stage2_align.yaml
@@ -0,0 +1,56 @@
+# Copyright (c) 2022, salesforce.com, inc.
+# All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+
+model:
+  arch: mini_gpt4
+  model_type: pretrain_vicuna
+  freeze_vit: true
+  freeze_qformer: true
+  max_txt_len: 160
+  end_sym: "###"
+  prompt_path: "prompts/alignment.txt"
+  prompt_template: '###Human: {} ###Assistant: '
+  ckpt: '/ibex/project/c2133/vicuna_jun_checkpoint_wihtout_prompt/20230412162/checkpoint_3.pth'  # site-specific path: point this at your own stage-1 checkpoint
+
+
+datasets:
+  cc_align:
+    vis_processor:
+      train:
+        name: "blip2_image_train"
+        image_size: 224
+    text_processor:
+      train:
+        name: "blip_caption"
+
+run:
+  task: image_text_pretrain
+  # optimizer
+  lr_sched: "linear_warmup_cosine_lr"
+  init_lr: 3.0e-5
+  min_lr: 1.0e-5
+  warmup_lr: 1.0e-6
+
+  weight_decay: 0.05
+  max_epoch: 5
+  iters_per_epoch: 200
+  batch_size_train: 12
+  batch_size_eval: 12
+  num_workers: 4
+  warmup_steps: 200
+
+  seed: 42
+  output_dir: "/ibex/project/c2133/vicuna_ckpt_test/minigpt4_stage2_align"  # site-specific path: change to your own output location
+
+  amp: true
+  resume_ckpt_path: null
+
+  evaluate: false
+  train_splits: ["train"]
+
+  device: "cuda"
+  world_size: 1
+  dist_url: "env://"
+  distributed: true
\ No newline at end of file