Spaces:

rxtan
/

Koala-video-llm

Sleeping

App Files Files Community

Reuben Tan committed on Mar 12, 2024

Commit

b2afdba

1 Parent(s): 205dfac

initial commit

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

app.py +249 -0
ckpt/VL_LLaMA_2_7B_Finetuned.pth +3 -0
ckpt/finetuned_model.pth +3 -0
demo_video.py +249 -0
eval_configs/conversation_demo.yaml +78 -0
global_local/__init__.py +31 -0
global_local/__pycache__/__init__.cpython-39.pyc +0 -0
global_local/common/__init__.py +0 -0
global_local/common/__pycache__/__init__.cpython-39.pyc +0 -0
global_local/common/__pycache__/config.cpython-39.pyc +0 -0
global_local/common/__pycache__/dist_utils.cpython-39.pyc +0 -0
global_local/common/__pycache__/logger.cpython-39.pyc +0 -0
global_local/common/__pycache__/optims.cpython-39.pyc +0 -0
global_local/common/__pycache__/registry.cpython-39.pyc +0 -0
global_local/common/__pycache__/utils.cpython-39.pyc +0 -0
global_local/common/config.py +468 -0
global_local/common/dist_utils.py +156 -0
global_local/common/gradcam.py +24 -0
global_local/common/logger.py +195 -0
global_local/common/optims.py +134 -0
global_local/common/registry.py +329 -0
global_local/common/utils.py +424 -0
global_local/configs/datasets/cc_sbu/align.yaml +5 -0
global_local/configs/datasets/cc_sbu/defaults.yaml +5 -0
global_local/configs/datasets/instruct/llava_instruct.yaml +6 -0
global_local/configs/datasets/instruct/webvid_instruct.yaml +6 -0
global_local/configs/datasets/laion/defaults.yaml +5 -0
global_local/configs/datasets/webvid/defaults.yaml +6 -0
global_local/configs/default.yaml +5 -0
global_local/configs/models/minigpt4.yaml +33 -0
global_local/configs/models/video_llama.yaml +36 -0
global_local/conversation/__init__.py +0 -0
global_local/conversation/__pycache__/__init__.cpython-39.pyc +0 -0
global_local/conversation/__pycache__/conversation_video.cpython-39.pyc +0 -0
global_local/conversation/conversation_video.py +404 -0
global_local/datasets/__init__.py +0 -0
global_local/datasets/__pycache__/__init__.cpython-39.pyc +0 -0
global_local/datasets/__pycache__/data_utils.cpython-39.pyc +0 -0
global_local/datasets/builders/__init__.py +77 -0
global_local/datasets/builders/__pycache__/__init__.cpython-39.pyc +0 -0
global_local/datasets/builders/__pycache__/base_dataset_builder.cpython-39.pyc +0 -0
global_local/datasets/builders/__pycache__/image_text_pair_builder.cpython-39.pyc +0 -0
global_local/datasets/builders/__pycache__/instruct_builder.cpython-39.pyc +0 -0
global_local/datasets/builders/__pycache__/video_caption_builder.cpython-39.pyc +0 -0
global_local/datasets/builders/base_dataset_builder.py +236 -0
global_local/datasets/builders/image_text_pair_builder.py +106 -0
global_local/datasets/builders/instruct_builder.py +78 -0
global_local/datasets/builders/video_caption_builder.py +34 -0
global_local/datasets/data_utils.py +196 -0
global_local/datasets/datasets/__init__.py +0 -0

app.py ADDED Viewed

	@@ -0,0 +1,249 @@

+"""
+Adapted from: https://github.com/Vision-CAIR/MiniGPT-4/blob/main/demo.py
+"""
+import argparse
+import os
+import sys
+import random
+import numpy as np
+import torch
+import torch.backends.cudnn as cudnn
+import gradio as gr
+from global_local.common.config import Config
+from global_local.common.dist_utils import get_rank
+from global_local.common.registry import registry
+from global_local.conversation.conversation_video import Chat, Conversation, default_conversation,SeparatorStyle,conv_llava_llama_2
+import decord
+decord.bridge.set_bridge('torch')
+#%%
+# imports modules for registration
+from global_local.datasets.builders import *
+from global_local.models import *
+from global_local.processors import *
+from global_local.runners import *
+from global_local.tasks import *
+#%%
+def parse_args():
+    """Parse the command-line options of the demo.
+
+    Returns:
+        argparse.Namespace with the attributes ``cfg_path``, ``gpu_id``,
+        ``model_type``, ``pretrained_weight_path``, ``num_frames_per_clip``,
+        ``num_segments``, ``hierarchical_agg_function`` and ``options``
+        (``None`` when no override is given).
+    """
+    parser = argparse.ArgumentParser(description="Demo")
+    parser.add_argument("--cfg-path", type=str, default='./eval_configs/conversation_demo.yaml', help="path to configuration file.")
+    parser.add_argument("--gpu-id", type=int, default=0, help="specify the gpu to load the model.")
+    parser.add_argument("--model_type", type=str, default='llama_v2', help="specify LLM")
+    parser.add_argument('--pretrained_weight_path', type=str, default="./ckpt/finetuned_model.pth", metavar='PWP',
+                    help='path to the pretrained weights')
+    parser.add_argument('--num_frames_per_clip', type=int, default=16, metavar='NPPC',
+                    help='specify how many frames to use per clip')
+    parser.add_argument('--num_segments', type=int, default=4, metavar='NS',
+                        help='specify number of video segments')
+    parser.add_argument('--hierarchical_agg_function', type=str, default="without-top-final-global-prompts-region-segment-full-dis-spatiotemporal-prompts-attn-early-attn-linear-learned", metavar='HAF',
+                        help='specify function to merge global and clip visual representations')
+    # This parser defines no --cfg-options flag, so the help text must not
+    # send users there; --options is the only override mechanism available.
+    parser.add_argument(
+        "--options",
+        nargs="+",
+        help="override some settings in the used config, the key-value pairs "
+        "in xxx=yyy format will be merged into the config file.",
+    )
+    args = parser.parse_args()
+    return args
+def setup_seeds(config):
+    """Seed the Python, NumPy and PyTorch RNGs and pin cuDNN to deterministic mode.
+
+    The seed is ``config.run_cfg.seed`` plus the value returned by ``get_rank()``.
+    """
+    rank_seed = config.run_cfg.seed + get_rank()
+    for seed_rng in (random.seed, np.random.seed, torch.manual_seed):
+        seed_rng(rank_seed)
+    cudnn.deterministic = True
+    cudnn.benchmark = False
+# ========================================
+#             Model Initialization
+# ========================================
+print('Initializing Chat')
+args = parse_args()
+cfg = Config(args)
+# Single source of truth for the target device so that the model and the Chat
+# wrapper always agree, including when --gpu-id is not 0.
+device = 'cuda:{}'.format(args.gpu_id)
+model_config = cfg.model_cfg
+model_config.device_8bit = args.gpu_id
+model_cls = registry.get_model_class(model_config.arch)
+model = model_cls.from_config(model_config).to(device)
+# These attributes are set before initialize_visual_agg_function() is called.
+model.num_frames_per_clip = args.num_frames_per_clip
+model.num_segments = args.num_segments
+model.hierarchical_agg_function = args.hierarchical_agg_function
+model.global_region_embed_weight = None
+model.initialize_visual_agg_function()
+# NOTE(review): torch.load unpickles the file; only load checkpoints you trust.
+best_checkpoint = torch.load(args.pretrained_weight_path, map_location='cpu')['model_state_dict']
+# Drop every 'module.' occurrence from the checkpoint keys
+# (presumably a DataParallel/DDP prefix -- confirm against the training code).
+pretrained_dict = {k.replace('module.', ''): v for k, v in best_checkpoint.items()}
+# Overlay the fine-tuned tensors on the current state so that keys absent from
+# the checkpoint keep the values they already have.
+model_dict = model.state_dict()
+model_dict.update(pretrained_dict)
+model.load_state_dict(model_dict)
+# Keep the model on the requested GPU: a bare .cuda() would move it to the
+# current default CUDA device and ignore --gpu-id.
+model.to(device).eval()
+vis_processor_cfg = cfg.datasets_cfg.webvid.vis_processor.train
+vis_processor = registry.get_processor_class(vis_processor_cfg.name).from_config(vis_processor_cfg)
+chat = Chat(model, vis_processor, device=device)
+print('Initialization Finished')
+# ========================================
+#             Gradio Setting
+# ========================================
+def gradio_reset(chat_state, img_list):
+    """Clear the conversation and return the values that reset the UI.
+
+    Empties ``chat_state.messages`` (when a state exists) and replaces a
+    non-None ``img_list`` with a fresh empty list.
+
+    Returns seven values: ``None``, two updates that clear a component and
+    re-enable it, an update that disables the text box and restores its
+    placeholder, an update that restores the upload button, ``chat_state``
+    and ``img_list``. The ``outputs`` list of the event that calls this
+    function needs seven matching components.
+    """
+    if chat_state is not None:
+        chat_state.messages = []
+    img_list = img_list if img_list is None else []
+    first_media_update = gr.update(value=None, interactive=True)
+    second_media_update = gr.update(value=None, interactive=True)
+    textbox_update = gr.update(placeholder='Please upload your video first', interactive=False)
+    button_update = gr.update(value="Upload & Start Chat", interactive=True)
+    return (
+        None,
+        first_media_update,
+        second_media_update,
+        textbox_update,
+        button_update,
+        chat_state,
+        img_list,
+    )
+def upload_imgorvideo(gr_video, gr_img, text_input, chat_state,chatbot):
+    """Start a new conversation from an uploaded video or image.
+
+    A fresh conversation template is selected from ``args.model_type`` and the
+    uploaded media is passed to ``chat`` for encoding.
+
+    Args:
+        gr_video: value of the video component, or None.
+        gr_img: value of the image component, or None.
+        text_input: current text box value (not used here).
+        chat_state: previous conversation state; always replaced.
+        chatbot: chat history list; the media is appended as a new entry.
+
+    Returns:
+        A 7-tuple in every branch: (video update, image update, text box
+        update, upload button update, chat_state, img_list, chatbot).
+    """
+    if args.model_type == 'vicuna':
+        chat_state = default_conversation.copy()
+    else:
+        chat_state = conv_llava_llama_2.copy()
+    if gr_img is None and gr_video is None:
+        # Nothing uploaded: leave the button usable and the history untouched.
+        return None, None, None, gr.update(interactive=True), chat_state, None, chatbot
+    if gr_img is not None and gr_video is not None:
+        # Both inputs supplied: lock the widgets and report the restriction.
+        return gr.update(interactive=False), gr.update(interactive=False), gr.update(interactive=False, placeholder='Currently, only one input is supported'), gr.update(value="Currently, only one input is supported", interactive=False), chat_state, None, chatbot
+    # Exactly one of the two inputs is set from here on.
+    media = gr_img if gr_img is not None else gr_video
+    print(media)
+    chatbot = chatbot + [((media,), None)]
+    chat_state.system =  "You are able to understand the visual content that the user provides. Follow the instructions carefully and explain your answers in detail."
+    img_list = []
+    # img_list is handed to chat; gradio_answer later reads it back.
+    if gr_img is not None:
+        chat.upload_img(gr_img, chat_state, img_list)
+    else:
+        chat.upload_video_without_audio(gr_video, chat_state, img_list)
+    return gr.update(interactive=False), gr.update(interactive=False), gr.update(interactive=True, placeholder='Type and press Enter'), gr.update(value="Start Chatting", interactive=False), chat_state, img_list,chatbot
+def gradio_ask(user_message, chatbot, chat_state):
+    """Record the user's message in the conversation and in the chat history.
+
+    Returns (text box value or update, chatbot, chat_state). For an empty
+    message nothing is recorded and the text box gets a warning placeholder.
+    """
+    has_text = len(user_message) != 0
+    if has_text:
+        chat.ask(user_message, chat_state)
+        # The answer slot stays None until gradio_answer fills it in.
+        extended_history = chatbot + [[user_message, None]]
+        return '', extended_history, chat_state
+    empty_notice = gr.update(interactive=True, placeholder='Input should not be empty!')
+    return empty_notice, chatbot, chat_state
+def gradio_answer(chatbot, chat_state, img_list, num_beams, temperature):
+    """Generate the reply for the latest chat turn and store it in the history.
+
+    Calls ``chat.answer`` with a 300-new-token / 2000-total-length budget,
+    writes element 0 of its result into the answer slot of the last
+    ``chatbot`` entry, and prints the prompt and conversation state.
+
+    Returns (chatbot, chat_state, img_list).
+    """
+    generation_kwargs = dict(
+        conv=chat_state,
+        img_list=img_list,
+        num_beams=num_beams,
+        temperature=temperature,
+        max_new_tokens=300,
+        max_length=2000,
+    )
+    answer_outputs = chat.answer(**generation_kwargs)
+    latest_turn = chatbot[-1]
+    latest_turn[1] = answer_outputs[0]
+    print(chat_state.get_prompt())
+    print(chat_state)
+    return chatbot, chat_state, img_list
+# HTML/Markdown header; rendered at the top of the page with gr.Markdown(title).
+title = """
+<h1 align="center">Global-Local QFormer for Long Video Understanding with LLMs</h1>
+<h5 align="center">  Introduction: We introduce a Global-Local QFormer video model that is connected with a Large Language Model to understand \
+                    and answer questions about long videos. </h5>
+<div style='display:flex; gap: 0.25rem; '>
+<a href='https://huggingface.co/spaces/rxtan/rxtan/Global-Local-QFormer-Video-LLM'><img src='https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Spaces-blue'></a>
+<a href=''><img src='https://img.shields.io/badge/Paper-PDF-red'></a>
+</div>
+Thank you for using the Global-Local QFormer Demo Page! If you have any questions or feedback, feel free to contact us.
+Current online demo uses the 7B version of Llama-2 due to resource limitations.
+"""
+# Limitations/acknowledgement note; rendered with gr.Markdown(Note_markdown).
+# NOTE(review): the Spaces link in `title` repeats "rxtan/" and the paper link is empty -- confirm the intended URLs.
+Note_markdown = ("""
+### We note that our Global-Local QFormer model may be limited at understanding videos from rare domains. Due to the pretraining data, the \
+    model may be susceptible to hallucinations
+We would like to acknowledge the Video-LLama repository which we copied the demo layout from.
+**Boston University**
+""")
+# Citation text; currently just a newline, rendered with gr.Markdown(cite_markdown).
+cite_markdown = ("""
+""")
+# Disabled note about clickable examples, kept for when the examples are added.
+#case_note_upload = ("""
+### We provide some examples at the bottom of the page. Simply click on them to try them out directly.
+#""")
+#TODO show examples below
+# Page layout and event wiring of the demo, followed by the app launch.
+with gr.Blocks() as demo:
+    gr.Markdown(title)
+    with gr.Row():
+        with gr.Column(scale=0.5):
+            video = gr.Video()
+            # upload_imgorvideo takes (video, image, text, state, chatbot) and,
+            # like gradio_reset, returns seven values including an image
+            # update. A hidden image component keeps the inputs/outputs lists
+            # aligned with those signatures while only video upload is shown.
+            image = gr.Image(type="filepath", visible=False)
+            #gr.Markdown(case_note_upload)
+            upload_button = gr.Button(value="Upload & Start Chat", interactive=True, variant="primary")
+            clear = gr.Button("Restart")
+            num_beams = gr.Slider(
+                minimum=1,
+                maximum=10,
+                value=1,
+                step=1,
+                interactive=True,
+                label="beam search numbers",
+            )
+            temperature = gr.Slider(
+                minimum=0.1,
+                maximum=2.0,
+                value=1.0,
+                step=0.1,
+                interactive=True,
+                label="Temperature",
+            )
+            # NOTE(review): this checkbox is not passed to any callback below.
+            audio = gr.Checkbox(interactive=True, value=False, label="Audio")
+            gr.Markdown(Note_markdown)
+        with gr.Column():
+            chat_state = gr.State()
+            img_list = gr.State()
+            chatbot = gr.Chatbot(label='Global-Local QFormer')
+            text_input = gr.Textbox(label='User', placeholder='Please upload your video first.', interactive=False)
+    # TODO: enable the examples once the files are available:
+    # with gr.Column():
+    #     gr.Examples(examples=[
+    #         [f"examples/skateboarding_dog.mp4", "What is the dog doing? "],
+    #         [f"examples/birthday.mp4", "What is the boy doing? "],
+    #         [f"examples/IronMan.mp4", "Is the guy in the video Iron Man? "],
+    #     ], inputs=[video, text_input])
+    gr.Markdown(cite_markdown)
+    # inputs must match the callback's parameters one-to-one; outputs must
+    # match the order and count of the values it returns.
+    upload_button.click(upload_imgorvideo, [video, image, text_input, chat_state,chatbot], [video, image, text_input, upload_button, chat_state, img_list,chatbot])
+    # Two-step turn: record the question first, then generate the answer.
+    text_input.submit(gradio_ask, [text_input, chatbot, chat_state], [text_input, chatbot, chat_state]).then(
+        gradio_answer, [chatbot, chat_state, img_list, num_beams, temperature], [chatbot, chat_state, img_list]
+    )
+    clear.click(gradio_reset, [chat_state, img_list], [chatbot, video, image, text_input, upload_button, chat_state, img_list], queue=False)
+#demo.launch(share=False, enable_queue=True, debug=True)
+demo.launch(share=False, debug=True)

ckpt/VL_LLaMA_2_7B_Finetuned.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3cec0e2979ed7656e08ecc5b185c2229a3c577b4b7a4721a94bd461ba0447c6e
+size 265559201

ckpt/finetuned_model.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3795447e3459f467aae141873ba5f666efcc0f1478ddd9316437d3ba56aa72fd
+size 38852011

demo_video.py ADDED Viewed

	@@ -0,0 +1,249 @@

+"""
+Adapted from: https://github.com/Vision-CAIR/MiniGPT-4/blob/main/demo.py
+"""
+import argparse
+import os
+import sys
+import random
+import numpy as np
+import torch
+import torch.backends.cudnn as cudnn
+import gradio as gr
+from global_local.common.config import Config
+from global_local.common.dist_utils import get_rank
+from global_local.common.registry import registry
+from global_local.conversation.conversation_video import Chat, Conversation, default_conversation,SeparatorStyle,conv_llava_llama_2
+import decord
+decord.bridge.set_bridge('torch')
+#%%
+# imports modules for registration
+from global_local.datasets.builders import *
+from global_local.models import *
+from global_local.processors import *
+from global_local.runners import *
+from global_local.tasks import *
+#%%
+def parse_args():
+    """Parse the command-line options of the demo.
+
+    Returns:
+        argparse.Namespace with the attributes ``cfg_path``, ``gpu_id``,
+        ``model_type``, ``pretrained_weight_path``, ``num_frames_per_clip``,
+        ``num_segments``, ``hierarchical_agg_function`` and ``options``
+        (``None`` when no override is given).
+    """
+    parser = argparse.ArgumentParser(description="Demo")
+    parser.add_argument("--cfg-path", type=str, default='./eval_configs/conversation_demo.yaml', help="path to configuration file.")
+    parser.add_argument("--gpu-id", type=int, default=0, help="specify the gpu to load the model.")
+    parser.add_argument("--model_type", type=str, default='llama_v2', help="specify LLM")
+    parser.add_argument('--pretrained_weight_path', type=str, default="./ckpt/finetuned_model.pth", metavar='PWP',
+                    help='path to the pretrained weights')
+    parser.add_argument('--num_frames_per_clip', type=int, default=16, metavar='NPPC',
+                    help='specify how many frames to use per clip')
+    parser.add_argument('--num_segments', type=int, default=4, metavar='NS',
+                        help='specify number of video segments')
+    parser.add_argument('--hierarchical_agg_function', type=str, default="without-top-final-global-prompts-region-segment-full-dis-spatiotemporal-prompts-attn-early-attn-linear-learned", metavar='HAF',
+                        help='specify function to merge global and clip visual representations')
+    # This parser defines no --cfg-options flag, so the help text must not
+    # send users there; --options is the only override mechanism available.
+    parser.add_argument(
+        "--options",
+        nargs="+",
+        help="override some settings in the used config, the key-value pairs "
+        "in xxx=yyy format will be merged into the config file.",
+    )
+    args = parser.parse_args()
+    return args
+def setup_seeds(config):
+    """Seed the Python, NumPy and PyTorch RNGs and pin cuDNN to deterministic mode.
+
+    The seed is ``config.run_cfg.seed`` plus the value returned by ``get_rank()``.
+    """
+    rank_seed = config.run_cfg.seed + get_rank()
+    for seed_rng in (random.seed, np.random.seed, torch.manual_seed):
+        seed_rng(rank_seed)
+    cudnn.deterministic = True
+    cudnn.benchmark = False
+# ========================================
+#             Model Initialization
+# ========================================
+print('Initializing Chat')
+args = parse_args()
+cfg = Config(args)
+# Single source of truth for the target device so that the model and the Chat
+# wrapper always agree, including when --gpu-id is not 0.
+device = 'cuda:{}'.format(args.gpu_id)
+model_config = cfg.model_cfg
+model_config.device_8bit = args.gpu_id
+model_cls = registry.get_model_class(model_config.arch)
+model = model_cls.from_config(model_config).to(device)
+# These attributes are set before initialize_visual_agg_function() is called.
+model.num_frames_per_clip = args.num_frames_per_clip
+model.num_segments = args.num_segments
+model.hierarchical_agg_function = args.hierarchical_agg_function
+model.global_region_embed_weight = None
+model.initialize_visual_agg_function()
+# NOTE(review): torch.load unpickles the file; only load checkpoints you trust.
+best_checkpoint = torch.load(args.pretrained_weight_path, map_location='cpu')['model_state_dict']
+# Drop every 'module.' occurrence from the checkpoint keys
+# (presumably a DataParallel/DDP prefix -- confirm against the training code).
+pretrained_dict = {k.replace('module.', ''): v for k, v in best_checkpoint.items()}
+# Overlay the fine-tuned tensors on the current state so that keys absent from
+# the checkpoint keep the values they already have.
+model_dict = model.state_dict()
+model_dict.update(pretrained_dict)
+model.load_state_dict(model_dict)
+# Keep the model on the requested GPU: a bare .cuda() would move it to the
+# current default CUDA device and ignore --gpu-id.
+model.to(device).eval()
+vis_processor_cfg = cfg.datasets_cfg.webvid.vis_processor.train
+vis_processor = registry.get_processor_class(vis_processor_cfg.name).from_config(vis_processor_cfg)
+chat = Chat(model, vis_processor, device=device)
+print('Initialization Finished')
+# ========================================
+#             Gradio Setting
+# ========================================
+def gradio_reset(chat_state, img_list):
+    """Clear the conversation and return the values that reset the UI.
+
+    Empties ``chat_state.messages`` (when a state exists) and replaces a
+    non-None ``img_list`` with a fresh empty list.
+
+    Returns seven values: ``None``, two updates that clear a component and
+    re-enable it, an update that disables the text box and restores its
+    placeholder, an update that restores the upload button, ``chat_state``
+    and ``img_list``. The ``outputs`` list of the event that calls this
+    function needs seven matching components.
+    """
+    if chat_state is not None:
+        chat_state.messages = []
+    img_list = img_list if img_list is None else []
+    first_media_update = gr.update(value=None, interactive=True)
+    second_media_update = gr.update(value=None, interactive=True)
+    textbox_update = gr.update(placeholder='Please upload your video first', interactive=False)
+    button_update = gr.update(value="Upload & Start Chat", interactive=True)
+    return (
+        None,
+        first_media_update,
+        second_media_update,
+        textbox_update,
+        button_update,
+        chat_state,
+        img_list,
+    )
+def upload_imgorvideo(gr_video, gr_img, text_input, chat_state,chatbot):
+    """Start a new conversation from an uploaded video or image.
+
+    A fresh conversation template is selected from ``args.model_type`` and the
+    uploaded media is passed to ``chat`` for encoding.
+
+    Args:
+        gr_video: value of the video component, or None.
+        gr_img: value of the image component, or None.
+        text_input: current text box value (not used here).
+        chat_state: previous conversation state; always replaced.
+        chatbot: chat history list; the media is appended as a new entry.
+
+    Returns:
+        A 7-tuple in every branch: (video update, image update, text box
+        update, upload button update, chat_state, img_list, chatbot).
+    """
+    if args.model_type == 'vicuna':
+        chat_state = default_conversation.copy()
+    else:
+        chat_state = conv_llava_llama_2.copy()
+    if gr_img is None and gr_video is None:
+        # Nothing uploaded: leave the button usable and the history untouched.
+        return None, None, None, gr.update(interactive=True), chat_state, None, chatbot
+    if gr_img is not None and gr_video is not None:
+        # Both inputs supplied: lock the widgets and report the restriction.
+        return gr.update(interactive=False), gr.update(interactive=False), gr.update(interactive=False, placeholder='Currently, only one input is supported'), gr.update(value="Currently, only one input is supported", interactive=False), chat_state, None, chatbot
+    # Exactly one of the two inputs is set from here on.
+    media = gr_img if gr_img is not None else gr_video
+    print(media)
+    chatbot = chatbot + [((media,), None)]
+    chat_state.system =  "You are able to understand the visual content that the user provides. Follow the instructions carefully and explain your answers in detail."
+    img_list = []
+    # img_list is handed to chat; gradio_answer later reads it back.
+    if gr_img is not None:
+        chat.upload_img(gr_img, chat_state, img_list)
+    else:
+        chat.upload_video_without_audio(gr_video, chat_state, img_list)
+    return gr.update(interactive=False), gr.update(interactive=False), gr.update(interactive=True, placeholder='Type and press Enter'), gr.update(value="Start Chatting", interactive=False), chat_state, img_list,chatbot
+def gradio_ask(user_message, chatbot, chat_state):
+    """Record the user's message in the conversation and in the chat history.
+
+    Returns (text box value or update, chatbot, chat_state). For an empty
+    message nothing is recorded and the text box gets a warning placeholder.
+    """
+    has_text = len(user_message) != 0
+    if has_text:
+        chat.ask(user_message, chat_state)
+        # The answer slot stays None until gradio_answer fills it in.
+        extended_history = chatbot + [[user_message, None]]
+        return '', extended_history, chat_state
+    empty_notice = gr.update(interactive=True, placeholder='Input should not be empty!')
+    return empty_notice, chatbot, chat_state
+def gradio_answer(chatbot, chat_state, img_list, num_beams, temperature):
+    """Generate the reply for the latest chat turn and store it in the history.
+
+    Calls ``chat.answer`` with a 300-new-token / 2000-total-length budget,
+    writes element 0 of its result into the answer slot of the last
+    ``chatbot`` entry, and prints the prompt and conversation state.
+
+    Returns (chatbot, chat_state, img_list).
+    """
+    generation_kwargs = dict(
+        conv=chat_state,
+        img_list=img_list,
+        num_beams=num_beams,
+        temperature=temperature,
+        max_new_tokens=300,
+        max_length=2000,
+    )
+    answer_outputs = chat.answer(**generation_kwargs)
+    latest_turn = chatbot[-1]
+    latest_turn[1] = answer_outputs[0]
+    print(chat_state.get_prompt())
+    print(chat_state)
+    return chatbot, chat_state, img_list
+# HTML/Markdown header; rendered at the top of the page with gr.Markdown(title).
+title = """
+<h1 align="center">Global-Local QFormer for Long Video Understanding with LLMs</h1>
+<h5 align="center">  Introduction: We introduce a Global-Local QFormer video model that is connected with a Large Language Model to understand \
+                    and answer questions about long videos. </h5>
+<div style='display:flex; gap: 0.25rem; '>
+<a href='https://huggingface.co/spaces/rxtan/rxtan/Global-Local-QFormer-Video-LLM'><img src='https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Spaces-blue'></a>
+<a href=''><img src='https://img.shields.io/badge/Paper-PDF-red'></a>
+</div>
+Thank you for using the Global-Local QFormer Demo Page! If you have any questions or feedback, feel free to contact us.
+Current online demo uses the 7B version of Llama-2 due to resource limitations.
+"""
+# Limitations/acknowledgement note; rendered with gr.Markdown(Note_markdown).
+# NOTE(review): the Spaces link in `title` repeats "rxtan/" and the paper link is empty -- confirm the intended URLs.
+Note_markdown = ("""
+### We note that our Global-Local QFormer model may be limited at understanding videos from rare domains. Due to the pretraining data, the \
+    model may be susceptible to hallucinations
+We would like to acknowledge the Video-LLama repository which we copied the demo layout from.
+**Boston University**
+""")
+# Citation text; currently just a newline, rendered with gr.Markdown(cite_markdown).
+cite_markdown = ("""
+""")
+# Disabled note about clickable examples, kept for when the examples are added.
+#case_note_upload = ("""
+### We provide some examples at the bottom of the page. Simply click on them to try them out directly.
+#""")
+#TODO show examples below
+# Page layout and event wiring of the demo, followed by the app launch.
+with gr.Blocks() as demo:
+    gr.Markdown(title)
+    with gr.Row():
+        with gr.Column(scale=0.5):
+            video = gr.Video()
+            # upload_imgorvideo takes (video, image, text, state, chatbot) and,
+            # like gradio_reset, returns seven values including an image
+            # update. A hidden image component keeps the inputs/outputs lists
+            # aligned with those signatures while only video upload is shown.
+            image = gr.Image(type="filepath", visible=False)
+            #gr.Markdown(case_note_upload)
+            upload_button = gr.Button(value="Upload & Start Chat", interactive=True, variant="primary")
+            clear = gr.Button("Restart")
+            num_beams = gr.Slider(
+                minimum=1,
+                maximum=10,
+                value=1,
+                step=1,
+                interactive=True,
+                label="beam search numbers",
+            )
+            temperature = gr.Slider(
+                minimum=0.1,
+                maximum=2.0,
+                value=1.0,
+                step=0.1,
+                interactive=True,
+                label="Temperature",
+            )
+            # NOTE(review): this checkbox is not passed to any callback below.
+            audio = gr.Checkbox(interactive=True, value=False, label="Audio")
+            gr.Markdown(Note_markdown)
+        with gr.Column():
+            chat_state = gr.State()
+            img_list = gr.State()
+            chatbot = gr.Chatbot(label='Global-Local QFormer')
+            text_input = gr.Textbox(label='User', placeholder='Please upload your video first.', interactive=False)
+    # TODO: enable the examples once the files are available:
+    # with gr.Column():
+    #     gr.Examples(examples=[
+    #         [f"examples/skateboarding_dog.mp4", "What is the dog doing? "],
+    #         [f"examples/birthday.mp4", "What is the boy doing? "],
+    #         [f"examples/IronMan.mp4", "Is the guy in the video Iron Man? "],
+    #     ], inputs=[video, text_input])
+    gr.Markdown(cite_markdown)
+    # inputs must match the callback's parameters one-to-one; outputs must
+    # match the order and count of the values it returns.
+    upload_button.click(upload_imgorvideo, [video, image, text_input, chat_state,chatbot], [video, image, text_input, upload_button, chat_state, img_list,chatbot])
+    # Two-step turn: record the question first, then generate the answer.
+    text_input.submit(gradio_ask, [text_input, chatbot, chat_state], [text_input, chatbot, chat_state]).then(
+        gradio_answer, [chatbot, chat_state, img_list, num_beams, temperature], [chatbot, chat_state, img_list]
+    )
+    clear.click(gradio_reset, [chat_state, img_list], [chatbot, video, image, text_input, upload_button, chat_state, img_list], queue=False)
+#demo.launch(share=False, enable_queue=True, debug=True)
+demo.launch(share=False, debug=True)

eval_configs/conversation_demo.yaml ADDED Viewed

	@@ -0,0 +1,78 @@

+model:
+  arch: video_instruction_llama
+  model_type: pretrain_vicuna
+  freeze_vit: True
+  freeze_qformer: True
+  # Q-Former
+  num_query_token: 32
+  # If you want to train models based on LLaMA-2-chat,
+  # some checkpoints can be downloaded from the provided Hugging Face repo,
+  # e.g. https://huggingface.co/DAMO-NLP-SG/Video-LLaMA-2-13B-Finetuned (llama-2-7b-chat-hf)
+  #llama_model: "/projectnb/ivc-ml/rxtan/llama-2-7b-chat-hf/"
+  llama_model: "Video-LLaMA-2-7B-Finetuned/llama-2-7b-chat-hf/"
+  imagebind_ckpt_path: "ckpt/imagebind_path/"
+  # The checkpoint of the vision branch after stage-1 pretraining:
+  ckpt: 'ckpt/VL_LLaMA_2_7B_Finetuned.pth'   # you can use our pretrained ckpt from https://huggingface.co/DAMO-NLP-SG/Video-LLaMA-2-13B-Pretrained/
+  # only train vision branch
+  equip_audio_branch: False  # whether equips the audio branch
+  frozen_llama_proj: False
+  frozen_video_Qformer: True
+  frozen_audio_Qformer: True
+  fusion_head_layers: 2
+  max_frame_pos: 32
+  fusion_header_type: "seqTransf"
+  max_txt_len: 320
+  # for llama_2_chat:
+  end_sym: "</s>"
+  prompt_path: "prompts/alignment_image.txt"
+  prompt_template: '[INST] <<SYS>>\n \n<</SYS>>\n\n{} [/INST] '
+datasets:
+  webvid:
+    vis_processor:
+      train:
+        name: "alpro_video_eval"
+        n_frms: 8
+        image_size: 224
+    text_processor:
+      train:
+        name: "blip_caption"
+run:
+  task: video_text_pretrain
+  # optimizer
+  lr_sched: "linear_warmup_cosine_lr"
+  init_lr: 3e-5
+  min_lr: 1e-5
+  warmup_lr: 1e-6
+  weight_decay: 0.05
+  max_epoch: 3
+  iters_per_epoch: 1000
+  batch_size_train: 4
+  batch_size_eval: 4
+  num_workers: 4
+  warmup_steps: 1000
+  seed: 42
+  output_dir: "output/videollama_stage2_finetune"
+  amp: True
+  resume_ckpt_path: null
+  evaluate: False
+  train_splits: ["train"]
+  device: "cuda"
+  world_size: 1
+  dist_url: "env://"
+  distributed: True

global_local/__init__.py ADDED Viewed

	@@ -0,0 +1,31 @@

+"""
+ Copyright (c) 2022, salesforce.com, inc.
+ All rights reserved.
+ SPDX-License-Identifier: BSD-3-Clause
+ For full license text, see the LICENSE_Lavis file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+"""
+import os
+import sys
+from omegaconf import OmegaConf
+from global_local.common.registry import registry
+from global_local.datasets.builders import *
+from global_local.models import *
+from global_local.processors import *
+from global_local.tasks import *
+# Absolute path of the directory that contains this __init__.py.
+root_dir = os.path.dirname(os.path.abspath(__file__))
+# Package defaults; `env.cache_root` is read from it below.
+default_cfg = OmegaConf.load(os.path.join(root_dir, "configs/default.yaml"))
+# Publish the package directory through the shared registry.
+registry.register_path("library_root", root_dir)
+# Parent of the package directory (left un-normalised, i.e. ending in "/..").
+repo_root = os.path.join(root_dir, "..")
+registry.register_path("repo_root", repo_root)
+# Cache directory joined onto repo_root from the configured value.
+cache_root = os.path.join(repo_root, default_cfg.env.cache_root)
+registry.register_path("cache_root", cache_root)
+# Shared constants made available through the registry.
+registry.register("MAX_INT", sys.maxsize)
+registry.register("SPLIT_NAMES", ["train", "val", "test"])

global_local/__pycache__/__init__.cpython-39.pyc ADDED Viewed

Binary file (1.1 kB). View file

global_local/common/__init__.py ADDED Viewed

File without changes

global_local/common/__pycache__/__init__.cpython-39.pyc ADDED Viewed

Binary file (219 Bytes). View file

global_local/common/__pycache__/config.cpython-39.pyc ADDED Viewed

Binary file (12.2 kB). View file

global_local/common/__pycache__/dist_utils.cpython-39.pyc ADDED Viewed

Binary file (4.55 kB). View file

global_local/common/__pycache__/logger.cpython-39.pyc ADDED Viewed

Binary file (6.48 kB). View file

global_local/common/__pycache__/optims.cpython-39.pyc ADDED Viewed

Binary file (3.89 kB). View file

global_local/common/__pycache__/registry.cpython-39.pyc ADDED Viewed

Binary file (9.11 kB). View file

global_local/common/__pycache__/utils.cpython-39.pyc ADDED Viewed

Binary file (12.7 kB). View file

global_local/common/config.py ADDED Viewed

	@@ -0,0 +1,468 @@

+"""
+ Copyright (c) 2022, salesforce.com, inc.
+ All rights reserved.
+ SPDX-License-Identifier: BSD-3-Clause
+ For full license text, see the LICENSE_Lavis file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+"""
+import logging
+import json
+from typing import Dict
+from omegaconf import OmegaConf
+from global_local.common.registry import registry
+class Config:
+    def __init__(self, args):
+        """Build the merged run/model/dataset configuration.
+
+        Reads ``args.options`` (command-line overrides) and ``args.cfg_path``
+        (the YAML file to load), and registers this instance in the registry
+        under the name "configuration".
+        """
+        self.config = {}
+        self.args = args
+        # Make this instance retrievable through the registry during setup.
+        registry.register("configuration", self)
+        overrides = self._build_opt_list(self.args.options)
+        file_config = OmegaConf.load(self.args.cfg_path)
+        # Only the runner section was ever meant to be validated here; model
+        # and dataset sections are left to their respective classes.
+        # [TODO] validate the model/dataset configuration
+        # self._validate_runner_config(runner_config)
+        sections = (
+            self.build_runner_config(file_config),
+            self.build_model_config(file_config, **overrides),
+            self.build_dataset_config(file_config),
+        )
+        # User options are merged last so that they override everything else.
+        self.config = OmegaConf.merge(*sections, overrides)
+    def _validate_runner_config(self, runner_config):
+        """
+        Validate the runner configuration so that
+            1) every option specified by the user is valid;
+            2) option types agree between the user's values and the config.
+        """
+        validator = create_runner_config_validator()
+        validator.validate(runner_config)
+    def _build_opt_list(self, opts):
+        """Turn the raw command-line override list into an OmegaConf config."""
+        dot_list = self._convert_to_dot_list(opts)
+        overrides = OmegaConf.from_dotlist(dot_list)
+        return overrides
+    @staticmethod
+    def build_model_config(config, **kwargs):
+        """Merge the model's default config file with the ``model`` section of *config*.
+
+        The model type is taken from the ``"model.model_type"`` keyword if it
+        is given and truthy, otherwise from ``config.model.model_type``.
+        Raises AssertionError when the model section, the registered model
+        class, or the model type is missing.
+        """
+        model = config.get("model", None)
+        assert model is not None, "Missing model configuration file."
+        model_cls = registry.get_model_class(model.arch)
+        assert model_cls is not None, f"Model '{model.arch}' has not been registered."
+        # A model type selected by the user wins over the one in the file.
+        model_type = kwargs.get("model.model_type", None) or model.get("model_type", None)
+        assert model_type is not None, "Missing model_type."
+        default_path = model_cls.default_config_path(model_type=model_type)
+        # hierarchy override, customized config > default config
+        return OmegaConf.merge(
+            OmegaConf.create(),
+            OmegaConf.load(default_path),
+            {"model": config["model"]},
+        )
+    @staticmethod
+    def build_runner_config(config):
+        """Wrap the ``run`` section of *config* in a dict under the key "run"."""
+        run_section = config.run
+        return dict(run=run_section)
+    @staticmethod
+    def build_dataset_config(config):
+        """Merge each dataset's default config with its entry in *config*.
+
+        For every name under ``config.datasets`` the builder class registered
+        for it supplies the path of a default config file (chosen by the
+        entry's ``type``, "default" if absent). The entry from *config* is
+        merged on top of it.
+
+        Raises:
+            KeyError: if *config* has no ``datasets`` section.
+        """
+        datasets = config.get("datasets", None)
+        if datasets is None:
+            raise KeyError(
+                "Expecting 'datasets' as the root key for dataset configuration."
+            )
+        merged = OmegaConf.create()
+        for name in datasets:
+            builder = registry.get_builder_class(name)
+            config_type = datasets[name].get("type", "default")
+            default_path = builder.default_config_path(type=config_type)
+            user_entry = {"datasets": {name: config["datasets"][name]}}
+            # hierarchy override, customized config > default config
+            merged = OmegaConf.merge(merged, OmegaConf.load(default_path), user_entry)
+        return merged
+    def _convert_to_dot_list(self, opts):
+        if opts is None:
+            opts = []
+        if len(opts) == 0:
+            return opts
+        has_equal = opts[0].find("=") != -1
+        if has_equal:
+            return opts
+        return [(opt + "=" + value) for opt, value in zip(opts[0::2], opts[1::2])]
+    def get_config(self):
+        return self.config
+    @property
+    def run_cfg(self):
+        return self.config.run
+    @property
+    def datasets_cfg(self):
+        return self.config.datasets
+    @property
+    def model_cfg(self):
+        return self.config.model
+    def pretty_print(self):
+        logging.info("\n=====  Running Parameters    =====")
+        logging.info(self._convert_node_to_json(self.config.run))
+        logging.info("\n======  Dataset Attributes  ======")
+        datasets = self.config.datasets
+        for dataset in datasets:
+            if dataset in self.config.datasets:
+                logging.info(f"\n======== {dataset} =======")
+                dataset_config = self.config.datasets[dataset]
+                logging.info(self._convert_node_to_json(dataset_config))
+            else:
+                logging.warning(f"No dataset named '{dataset}' in config. Skipping")
+        logging.info(f"\n======  Model Attributes  ======")
+        logging.info(self._convert_node_to_json(self.config.model))
+    def _convert_node_to_json(self, node):
+        container = OmegaConf.to_container(node, resolve=True)
+        return json.dumps(container, indent=4, sort_keys=True)
+    def to_dict(self):
+        return OmegaConf.to_container(self.config)
+def node_to_dict(node):
+    return OmegaConf.to_container(node)
+class ConfigValidator:
+    """
+    This is a preliminary implementation to centralize and validate the configuration.
+    May be altered in the future.
+    A helper class to validate configurations from yaml file.
+    This serves the following purposes:
+        1. Ensure all the options in the yaml are defined, raise error if not.
+        2. when type mismatches are found, the validator will raise an error.
+        3. a central place to store and display helpful messages for supported configurations.
+    """
+    class _Argument:
+        def __init__(self, name, choices=None, type=None, help=None):
+            self.name = name
+            self.val = None
+            self.choices = choices
+            self.type = type
+            self.help = help
+        def __str__(self):
+            s = f"{self.name}={self.val}"
+            if self.type is not None:
+                s += f", ({self.type})"
+            if self.choices is not None:
+                s += f", choices: {self.choices}"
+            if self.help is not None:
+                s += f", ({self.help})"
+            return s
+    def __init__(self, description):
+        self.description = description
+        self.arguments = dict()
+        self.parsed_args = None
+    def __getitem__(self, key):
+        assert self.parsed_args is not None, "No arguments parsed yet."
+        return self.parsed_args[key]
+    def __str__(self) -> str:
+        return self.format_help()
+    def add_argument(self, *args, **kwargs):
+        """
+        Assume the first argument is the name of the argument.
+        """
+        self.arguments[args[0]] = self._Argument(*args, **kwargs)
+    def validate(self, config=None):
+        """
+        Convert yaml config (dict-like) to list, required by argparse.
+        """
+        for k, v in config.items():
+            assert (
+                k in self.arguments
+            ), f"""{k} is not a valid argument. Support arguments are {self.format_arguments()}."""
+            if self.arguments[k].type is not None:
+                try:
+                    self.arguments[k].val = self.arguments[k].type(v)
+                except ValueError:
+                    raise ValueError(f"{k} is not a valid {self.arguments[k].type}.")
+            if self.arguments[k].choices is not None:
+                assert (
+                    v in self.arguments[k].choices
+                ), f"""{k} must be one of {self.arguments[k].choices}."""
+        return config
+    def format_arguments(self):
+        return str([f"{k}" for k in sorted(self.arguments.keys())])
+    def format_help(self):
+        # description + key-value pair string for each argument
+        help_msg = str(self.description)
+        return help_msg + ", available arguments: " + self.format_arguments()
+    def print_help(self):
+        # display help message
+        print(self.format_help())
+def create_runner_config_validator():
+    validator = ConfigValidator(description="Runner configurations")
+    validator.add_argument(
+        "runner",
+        type=str,
+        choices=["runner_base", "runner_iter"],
+        help="""Runner to use. The "runner_base" uses epoch-based training while iter-based
+            runner runs based on iters. Default: runner_base""",
+    )
+    # add argumetns for training dataset ratios
+    validator.add_argument(
+        "train_dataset_ratios",
+        type=Dict[str, float],
+        help="""Ratios of training dataset. This is used in iteration-based runner.
+        Do not support for epoch-based runner because how to define an epoch becomes tricky.
+        Default: None""",
+    )
+    validator.add_argument(
+        "max_iters",
+        type=float,
+        help="Maximum number of iterations to run.",
+    )
+    validator.add_argument(
+        "max_epoch",
+        type=int,
+        help="Maximum number of epochs to run.",
+    )
+    # add arguments for iters_per_inner_epoch
+    validator.add_argument(
+        "iters_per_inner_epoch",
+        type=float,
+        help="Number of iterations per inner epoch. This is required when runner is runner_iter.",
+    )
+    lr_scheds_choices = registry.list_lr_schedulers()
+    validator.add_argument(
+        "lr_sched",
+        type=str,
+        choices=lr_scheds_choices,
+        help="Learning rate scheduler to use, from {}".format(lr_scheds_choices),
+    )
+    task_choices = registry.list_tasks()
+    validator.add_argument(
+        "task",
+        type=str,
+        choices=task_choices,
+        help="Task to use, from {}".format(task_choices),
+    )
+    # add arguments for init_lr
+    validator.add_argument(
+        "init_lr",
+        type=float,
+        help="Initial learning rate. This will be the learning rate after warmup and before decay.",
+    )
+    # add arguments for min_lr
+    validator.add_argument(
+        "min_lr",
+        type=float,
+        help="Minimum learning rate (after decay).",
+    )
+    # add arguments for warmup_lr
+    validator.add_argument(
+        "warmup_lr",
+        type=float,
+        help="Starting learning rate for warmup.",
+    )
+    # add arguments for learning rate decay rate
+    validator.add_argument(
+        "lr_decay_rate",
+        type=float,
+        help="Learning rate decay rate. Required if using a decaying learning rate scheduler.",
+    )
+    # add arguments for weight decay
+    validator.add_argument(
+        "weight_decay",
+        type=float,
+        help="Weight decay rate.",
+    )
+    # add arguments for training batch size
+    validator.add_argument(
+        "batch_size_train",
+        type=int,
+        help="Training batch size.",
+    )
+    # add arguments for evaluation batch size
+    validator.add_argument(
+        "batch_size_eval",
+        type=int,
+        help="Evaluation batch size, including validation and testing.",
+    )
+    # add arguments for number of workers for data loading
+    validator.add_argument(
+        "num_workers",
+        help="Number of workers for data loading.",
+    )
+    # add arguments for warm up steps
+    validator.add_argument(
+        "warmup_steps",
+        type=int,
+        help="Number of warmup steps. Required if a warmup schedule is used.",
+    )
+    # add arguments for random seed
+    validator.add_argument(
+        "seed",
+        type=int,
+        help="Random seed.",
+    )
+    # add arguments for output directory
+    validator.add_argument(
+        "output_dir",
+        type=str,
+        help="Output directory to save checkpoints and logs.",
+    )
+    # add arguments for whether only use evaluation
+    validator.add_argument(
+        "evaluate",
+        help="Whether to only evaluate the model. If true, training will not be performed.",
+    )
+    # add arguments for splits used for training, e.g. ["train", "val"]
+    validator.add_argument(
+        "train_splits",
+        type=list,
+        help="Splits to use for training.",
+    )
+    # add arguments for splits used for validation, e.g. ["val"]
+    validator.add_argument(
+        "valid_splits",
+        type=list,
+        help="Splits to use for validation. If not provided, will skip the validation.",
+    )
+    # add arguments for splits used for testing, e.g. ["test"]
+    validator.add_argument(
+        "test_splits",
+        type=list,
+        help="Splits to use for testing. If not provided, will skip the testing.",
+    )
+    # add arguments for accumulating gradient for iterations
+    validator.add_argument(
+        "accum_grad_iters",
+        type=int,
+        help="Number of iterations to accumulate gradient for.",
+    )
+    # ====== distributed training ======
+    validator.add_argument(
+        "device",
+        type=str,
+        choices=["cpu", "cuda"],
+        help="Device to use. Support 'cuda' or 'cpu' as for now.",
+    )
+    validator.add_argument(
+        "world_size",
+        type=int,
+        help="Number of processes participating in the job.",
+    )
+    validator.add_argument("dist_url", type=str)
+    validator.add_argument("distributed", type=bool)
+    # add arguments to opt using distributed sampler during evaluation or not
+    validator.add_argument(
+        "use_dist_eval_sampler",
+        type=bool,
+        help="Whether to use distributed sampler during evaluation or not.",
+    )
+    # ====== task specific ======
+    # generation task specific arguments
+    # add arguments for maximal length of text output
+    validator.add_argument(
+        "max_len",
+        type=int,
+        help="Maximal length of text output.",
+    )
+    # add arguments for minimal length of text output
+    validator.add_argument(
+        "min_len",
+        type=int,
+        help="Minimal length of text output.",
+    )
+    # add arguments number of beams
+    validator.add_argument(
+        "num_beams",
+        type=int,
+        help="Number of beams used for beam search.",
+    )
+    # vqa task specific arguments
+    # add arguments for number of answer candidates
+    validator.add_argument(
+        "num_ans_candidates",
+        type=int,
+        help="""For ALBEF and BLIP, these models first rank answers according to likelihood to select answer candidates.""",
+    )
+    # add arguments for inference method
+    validator.add_argument(
+        "inference_method",
+        type=str,
+        choices=["genearte", "rank"],
+        help="""Inference method to use for question answering. If rank, requires a answer list.""",
+    )
+    # ====== model specific ======
+    validator.add_argument(
+        "k_test",
+        type=int,
+        help="Number of top k most similar samples from ITC/VTC selection to be tested.",
+    )
+    return validator

global_local/common/dist_utils.py ADDED Viewed

	@@ -0,0 +1,156 @@

+"""
+ Copyright (c) 2022, salesforce.com, inc.
+ All rights reserved.
+ SPDX-License-Identifier: BSD-3-Clause
+ For full license text, see the LICENSE_Lavis file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+"""
+import datetime
+import functools
+import os
+import torch
+import torch.distributed as dist
+import timm.models.hub as timm_hub
+def setup_for_distributed(is_master):
+    """
+    This function disables printing when not in master process
+    """
+    import builtins as __builtin__
+    builtin_print = __builtin__.print
+    def print(*args, **kwargs):
+        force = kwargs.pop("force", False)
+        if is_master or force:
+            builtin_print(*args, **kwargs)
+    __builtin__.print = print
+def is_dist_avail_and_initialized():
+    if not dist.is_available():
+        return False
+    if not dist.is_initialized():
+        return False
+    return True
+def get_world_size():
+    if not is_dist_avail_and_initialized():
+        return 1
+    return dist.get_world_size()
+def get_rank():
+    if not is_dist_avail_and_initialized():
+        return 0
+    return dist.get_rank()
+def is_main_process():
+    return get_rank() == 0
+def init_distributed_mode(args):
+    if "RANK" in os.environ and "WORLD_SIZE" in os.environ:
+        args.rank = int(os.environ["RANK"])
+        args.world_size = int(os.environ["WORLD_SIZE"])
+        args.gpu = int(os.environ["LOCAL_RANK"])
+    elif "SLURM_PROCID" in os.environ:
+        args.rank = int(os.environ["SLURM_PROCID"])
+        args.gpu = args.rank % torch.cuda.device_count()
+    else:
+        print("Not using distributed mode")
+        args.distributed = False
+        return
+    args.distributed = True
+    torch.cuda.set_device(args.gpu)
+    args.dist_backend = "nccl"
+    print(
+        "| distributed init (rank {}, world {}): {}".format(
+            args.rank, args.world_size, args.dist_url
+        ),
+        flush=True,
+    )
+    torch.distributed.init_process_group(
+        backend=args.dist_backend,
+        init_method=args.dist_url,
+        world_size=args.world_size,
+        rank=args.rank,
+        timeout=datetime.timedelta(
+            days=365
+        ),  # allow auto-downloading and de-compressing
+    )
+    torch.distributed.barrier()
+    setup_for_distributed(args.rank == 0)
+def get_dist_info():
+    if torch.__version__ < "1.0":
+        initialized = dist._initialized
+    else:
+        initialized = dist.is_initialized()
+    if initialized:
+        rank = dist.get_rank()
+        world_size = dist.get_world_size()
+    else:  # non-distributed training
+        rank = 0
+        world_size = 1
+    return rank, world_size
+def no_grad_all_gather(tensors):
+    """
+    All gathers the provided tensors from all processes across machines.
+    Args:
+        tensors (list): tensors to perform all gather across all processes in
+        all machines.
+    """
+    gather_list = []
+    output_tensor = []
+    world_size = dist.get_world_size()
+    for tensor in tensors:
+        tensor_placeholder = [torch.ones_like(tensor) for _ in range(world_size)]
+        dist.all_gather(tensor_placeholder, tensor, async_op=False)
+        gather_list.append(tensor_placeholder)
+    for gathered_tensor in gather_list:
+        output_tensor.append(torch.cat(gathered_tensor, dim=0))
+    return output_tensor
+def main_process(func):
+    @functools.wraps(func)
+    def wrapper(*args, **kwargs):
+        rank, _ = get_dist_info()
+        if rank == 0:
+            return func(*args, **kwargs)
+    return wrapper
+def download_cached_file(url, check_hash=True, progress=False):
+    """
+    Download a file from a URL and cache it locally. If the file already exists, it is not downloaded again.
+    If distributed, only the main process downloads the file, and the other processes wait for the file to be downloaded.
+    """
+    def get_cached_file_path():
+        # a hack to sync the file path across processes
+        parts = torch.hub.urlparse(url)
+        filename = os.path.basename(parts.path)
+        cached_file = os.path.join(timm_hub.get_cache_dir(), filename)
+        return cached_file
+    if is_main_process():
+        timm_hub.download_cached_file(url, check_hash, progress)
+    #if is_dist_avail_and_initialized():
+    #    dist.barrier()
+    return get_cached_file_path()

global_local/common/gradcam.py ADDED Viewed

	@@ -0,0 +1,24 @@

+import numpy as np
+from matplotlib import pyplot as plt
+from scipy.ndimage import filters
+from skimage import transform as skimage_transform
+def getAttMap(img, attMap, blur=True, overlap=True):
+    attMap -= attMap.min()
+    if attMap.max() > 0:
+        attMap /= attMap.max()
+    attMap = skimage_transform.resize(attMap, (img.shape[:2]), order=3, mode="constant")
+    if blur:
+        attMap = filters.gaussian_filter(attMap, 0.02 * max(img.shape[:2]))
+        attMap -= attMap.min()
+        attMap /= attMap.max()
+    cmap = plt.get_cmap("jet")
+    attMapV = cmap(attMap)
+    attMapV = np.delete(attMapV, 3, 2)
+    if overlap:
+        attMap = (
+            1 * (1 - attMap**0.7).reshape(attMap.shape + (1,)) * img
+            + (attMap**0.7).reshape(attMap.shape + (1,)) * attMapV
+        )
+    return attMap

global_local/common/logger.py ADDED Viewed

	@@ -0,0 +1,195 @@

+"""
+ Copyright (c) 2022, salesforce.com, inc.
+ All rights reserved.
+ SPDX-License-Identifier: BSD-3-Clause
+ For full license text, see the LICENSE_Lavis file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+"""
+import datetime
+import logging
+import time
+from collections import defaultdict, deque
+import torch
+import torch.distributed as dist
+from global_local.common import dist_utils
+class SmoothedValue(object):
+    """Track a series of values and provide access to smoothed values over a
+    window or the global series average.
+    """
+    def __init__(self, window_size=20, fmt=None):
+        if fmt is None:
+            fmt = "{median:.4f} ({global_avg:.4f})"
+        self.deque = deque(maxlen=window_size)
+        self.total = 0.0
+        self.count = 0
+        self.fmt = fmt
+    def update(self, value, n=1):
+        self.deque.append(value)
+        self.count += n
+        self.total += value * n
+    def synchronize_between_processes(self):
+        """
+        Warning: does not synchronize the deque!
+        """
+        if not dist_utils.is_dist_avail_and_initialized():
+            return
+        t = torch.tensor([self.count, self.total], dtype=torch.float64, device="cuda")
+        dist.barrier()
+        dist.all_reduce(t)
+        t = t.tolist()
+        self.count = int(t[0])
+        self.total = t[1]
+    @property
+    def median(self):
+        d = torch.tensor(list(self.deque))
+        return d.median().item()
+    @property
+    def avg(self):
+        d = torch.tensor(list(self.deque), dtype=torch.float32)
+        return d.mean().item()
+    @property
+    def global_avg(self):
+        return self.total / self.count
+    @property
+    def max(self):
+        return max(self.deque)
+    @property
+    def value(self):
+        return self.deque[-1]
+    def __str__(self):
+        return self.fmt.format(
+            median=self.median,
+            avg=self.avg,
+            global_avg=self.global_avg,
+            max=self.max,
+            value=self.value,
+        )
+class MetricLogger(object):
+    def __init__(self, delimiter="\t"):
+        self.meters = defaultdict(SmoothedValue)
+        self.delimiter = delimiter
+    def update(self, **kwargs):
+        for k, v in kwargs.items():
+            if isinstance(v, torch.Tensor):
+                v = v.item()
+            assert isinstance(v, (float, int))
+            self.meters[k].update(v)
+    def __getattr__(self, attr):
+        if attr in self.meters:
+            return self.meters[attr]
+        if attr in self.__dict__:
+            return self.__dict__[attr]
+        raise AttributeError(
+            "'{}' object has no attribute '{}'".format(type(self).__name__, attr)
+        )
+    def __str__(self):
+        loss_str = []
+        for name, meter in self.meters.items():
+            loss_str.append("{}: {}".format(name, str(meter)))
+        return self.delimiter.join(loss_str)
+    def global_avg(self):
+        loss_str = []
+        for name, meter in self.meters.items():
+            loss_str.append("{}: {:.4f}".format(name, meter.global_avg))
+        return self.delimiter.join(loss_str)
+    def synchronize_between_processes(self):
+        for meter in self.meters.values():
+            meter.synchronize_between_processes()
+    def add_meter(self, name, meter):
+        self.meters[name] = meter
+    def log_every(self, iterable, print_freq, header=None):
+        i = 0
+        if not header:
+            header = ""
+        start_time = time.time()
+        end = time.time()
+        iter_time = SmoothedValue(fmt="{avg:.4f}")
+        data_time = SmoothedValue(fmt="{avg:.4f}")
+        space_fmt = ":" + str(len(str(len(iterable)))) + "d"
+        log_msg = [
+            header,
+            "[{0" + space_fmt + "}/{1}]",
+            "eta: {eta}",
+            "{meters}",
+            "time: {time}",
+            "data: {data}",
+        ]
+        if torch.cuda.is_available():
+            log_msg.append("max mem: {memory:.0f}")
+        log_msg = self.delimiter.join(log_msg)
+        MB = 1024.0 * 1024.0
+        for obj in iterable:
+            data_time.update(time.time() - end)
+            yield obj
+            iter_time.update(time.time() - end)
+            if i % print_freq == 0 or i == len(iterable) - 1:
+                eta_seconds = iter_time.global_avg * (len(iterable) - i)
+                eta_string = str(datetime.timedelta(seconds=int(eta_seconds)))
+                if torch.cuda.is_available():
+                    print(
+                        log_msg.format(
+                            i,
+                            len(iterable),
+                            eta=eta_string,
+                            meters=str(self),
+                            time=str(iter_time),
+                            data=str(data_time),
+                            memory=torch.cuda.max_memory_allocated() / MB,
+                        )
+                    )
+                else:
+                    print(
+                        log_msg.format(
+                            i,
+                            len(iterable),
+                            eta=eta_string,
+                            meters=str(self),
+                            time=str(iter_time),
+                            data=str(data_time),
+                        )
+                    )
+            i += 1
+            end = time.time()
+        total_time = time.time() - start_time
+        total_time_str = str(datetime.timedelta(seconds=int(total_time)))
+        print(
+            "{} Total time: {} ({:.4f} s / it)".format(
+                header, total_time_str, total_time / len(iterable)
+            )
+        )
+class AttrDict(dict):
+    def __init__(self, *args, **kwargs):
+        super(AttrDict, self).__init__(*args, **kwargs)
+        self.__dict__ = self
+def setup_logger():
+    logging.basicConfig(
+        level=logging.INFO if dist_utils.is_main_process() else logging.WARN,
+        format="%(asctime)s [%(levelname)s] %(message)s",
+        handlers=[logging.StreamHandler()],
+    )

global_local/common/optims.py ADDED Viewed

	@@ -0,0 +1,134 @@

+"""
+ Copyright (c) 2022, salesforce.com, inc.
+ All rights reserved.
+ SPDX-License-Identifier: BSD-3-Clause
+ For full license text, see the LICENSE_Lavis file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+"""
+import math
+from global_local.common.registry import registry
+from torch.optim.lr_scheduler import LambdaLR
+@registry.register_lr_scheduler("linear_warmup_step_lr")
+class LinearWarmupStepLRScheduler:
+    def __init__(
+        self,
+        optimizer,
+        max_epoch,
+        min_lr,
+        init_lr,
+        decay_rate=1,
+        warmup_start_lr=-1,
+        warmup_steps=0,
+        **kwargs
+    ):
+        self.optimizer = optimizer
+        self.max_epoch = max_epoch
+        self.min_lr = min_lr
+        self.decay_rate = decay_rate
+        self.init_lr = init_lr
+        self.warmup_steps = warmup_steps
+        self.warmup_start_lr = warmup_start_lr if warmup_start_lr >= 0 else init_lr
+    def step(self, cur_epoch, cur_step):
+        if cur_epoch == 0:
+            warmup_lr_schedule(
+                step=cur_step,
+                optimizer=self.optimizer,
+                max_step=self.warmup_steps,
+                init_lr=self.warmup_start_lr,
+                max_lr=self.init_lr,
+            )
+        else:
+            step_lr_schedule(
+                epoch=cur_epoch,
+                optimizer=self.optimizer,
+                init_lr=self.init_lr,
+                min_lr=self.min_lr,
+                decay_rate=self.decay_rate,
+            )
+@registry.register_lr_scheduler("linear_warmup_cosine_lr")
+class LinearWarmupCosineLRScheduler:
+    def __init__(
+        self,
+        optimizer,
+        max_epoch,
+        iters_per_epoch,
+        min_lr,
+        init_lr,
+        warmup_steps=0,
+        warmup_start_lr=-1,
+        **kwargs
+    ):
+        self.optimizer = optimizer
+        self.max_epoch = max_epoch
+        self.iters_per_epoch = iters_per_epoch
+        self.min_lr = min_lr
+        self.init_lr = init_lr
+        self.warmup_steps = warmup_steps
+        self.warmup_start_lr = warmup_start_lr if warmup_start_lr >= 0 else init_lr
+    def step(self, cur_epoch, cur_step):
+        total_cur_step = cur_epoch * self.iters_per_epoch + cur_step
+        if total_cur_step < self.warmup_steps:
+            warmup_lr_schedule(
+                step=cur_step,
+                optimizer=self.optimizer,
+                max_step=self.warmup_steps,
+                init_lr=self.warmup_start_lr,
+                max_lr=self.init_lr,
+            )
+        else:
+            cosine_lr_schedule(
+                epoch=total_cur_step,
+                optimizer=self.optimizer,
+                max_epoch=self.max_epoch * self.iters_per_epoch,
+                init_lr=self.init_lr,
+                min_lr=self.min_lr,
+            )
+def cosine_lr_schedule(optimizer, epoch, max_epoch, init_lr, min_lr):
+    """Decay the learning rate"""
+    lr = (init_lr - min_lr) * 0.5 * (
+        1.0 + math.cos(math.pi * epoch / max_epoch)
+    ) + min_lr
+    for param_group in optimizer.param_groups:
+        param_group["lr"] = lr
+def warmup_lr_schedule(optimizer, step, max_step, init_lr, max_lr):
+    """Warmup the learning rate"""
+    lr = min(max_lr, init_lr + (max_lr - init_lr) * step / max(max_step, 1))
+    for param_group in optimizer.param_groups:
+        param_group["lr"] = lr
+def step_lr_schedule(optimizer, epoch, init_lr, min_lr, decay_rate):
+    """Decay the learning rate"""
+    lr = max(min_lr, init_lr * (decay_rate**epoch))
+    for param_group in optimizer.param_groups:
+        param_group["lr"] = lr
+def get_cosine_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps, num_cycles=0.5, last_epoch=-1):
+    """ Create a schedule with a learning rate that decreases following the
+    values of the cosine function between 0 and `pi * cycles` after a warmup
+    period during which it increases linearly between 0 and 1.
+    """
+    def lr_lambda(current_step):
+        if current_step < num_warmup_steps:
+            return float(current_step) / float(max(1, num_warmup_steps))
+        progress = float(current_step - num_warmup_steps) / float(max(1, num_training_steps - num_warmup_steps))
+        return max(0.0, 0.5 * (1.0 + math.cos(math.pi * float(num_cycles) * 2.0 * progress)))
+    return LambdaLR(optimizer, lr_lambda, last_epoch)

global_local/common/registry.py ADDED Viewed

	@@ -0,0 +1,329 @@

+"""
+ Copyright (c) 2022, salesforce.com, inc.
+ All rights reserved.
+ SPDX-License-Identifier: BSD-3-Clause
+ For full license text, see the LICENSE_Lavis file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+"""
+class Registry:
+    mapping = {
+        "builder_name_mapping": {},
+        "task_name_mapping": {},
+        "processor_name_mapping": {},
+        "model_name_mapping": {},
+        "lr_scheduler_name_mapping": {},
+        "runner_name_mapping": {},
+        "state": {},
+        "paths": {},
+    }
+    @classmethod
+    def register_builder(cls, name):
+        r"""Register a dataset builder to registry with key 'name'
+        Args:
+            name: Key with which the builder will be registered.
+        Usage:
+            from video_llama.common.registry import registry
+            from video_llama.datasets.base_dataset_builder import BaseDatasetBuilder
+        """
+        def wrap(builder_cls):
+            from global_local.datasets.builders.base_dataset_builder import BaseDatasetBuilder
+            assert issubclass(
+                builder_cls, BaseDatasetBuilder
+            ), "All builders must inherit BaseDatasetBuilder class, found {}".format(
+                builder_cls
+            )
+            if name in cls.mapping["builder_name_mapping"]:
+                raise KeyError(
+                    "Name '{}' already registered for {}.".format(
+                        name, cls.mapping["builder_name_mapping"][name]
+                    )
+                )
+            cls.mapping["builder_name_mapping"][name] = builder_cls
+            return builder_cls
+        return wrap
+    @classmethod
+    def register_task(cls, name):
+        r"""Register a task to registry with key 'name'
+        Args:
+            name: Key with which the task will be registered.
+        Usage:
+            from video_llama.common.registry import registry
+        """
+        def wrap(task_cls):
+            from global_local.tasks.base_task import BaseTask
+            assert issubclass(
+                task_cls, BaseTask
+            ), "All tasks must inherit BaseTask class"
+            if name in cls.mapping["task_name_mapping"]:
+                raise KeyError(
+                    "Name '{}' already registered for {}.".format(
+                        name, cls.mapping["task_name_mapping"][name]
+                    )
+                )
+            cls.mapping["task_name_mapping"][name] = task_cls
+            return task_cls
+        return wrap
+    @classmethod
+    def register_model(cls, name):
+        r"""Register a task to registry with key 'name'
+        Args:
+            name: Key with which the task will be registered.
+        Usage:
+            from video_llama.common.registry import registry
+        """
+        def wrap(model_cls):
+            from global_local.models import BaseModel
+            assert issubclass(
+                model_cls, BaseModel
+            ), "All models must inherit BaseModel class"
+            if name in cls.mapping["model_name_mapping"]:
+                raise KeyError(
+                    "Name '{}' already registered for {}.".format(
+                        name, cls.mapping["model_name_mapping"][name]
+                    )
+                )
+            cls.mapping["model_name_mapping"][name] = model_cls
+            return model_cls
+        return wrap
+    @classmethod
+    def register_processor(cls, name):
+        r"""Register a processor to registry with key 'name'
+        Args:
+            name: Key with which the task will be registered.
+        Usage:
+            from video_llama.common.registry import registry
+        """
+        def wrap(processor_cls):
+            from global_local.processors import BaseProcessor
+            assert issubclass(
+                processor_cls, BaseProcessor
+            ), "All processors must inherit BaseProcessor class"
+            if name in cls.mapping["processor_name_mapping"]:
+                raise KeyError(
+                    "Name '{}' already registered for {}.".format(
+                        name, cls.mapping["processor_name_mapping"][name]
+                    )
+                )
+            cls.mapping["processor_name_mapping"][name] = processor_cls
+            return processor_cls
+        return wrap
+    @classmethod
+    def register_lr_scheduler(cls, name):
+        r"""Register a model to registry with key 'name'
+        Args:
+            name: Key with which the task will be registered.
+        Usage:
+            from video_llama.common.registry import registry
+        """
+        def wrap(lr_sched_cls):
+            if name in cls.mapping["lr_scheduler_name_mapping"]:
+                raise KeyError(
+                    "Name '{}' already registered for {}.".format(
+                        name, cls.mapping["lr_scheduler_name_mapping"][name]
+                    )
+                )
+            cls.mapping["lr_scheduler_name_mapping"][name] = lr_sched_cls
+            return lr_sched_cls
+        return wrap
+    @classmethod
+    def register_runner(cls, name):
+        r"""Register a model to registry with key 'name'
+        Args:
+            name: Key with which the task will be registered.
+        Usage:
+            from video_llama.common.registry import registry
+        """
+        def wrap(runner_cls):
+            if name in cls.mapping["runner_name_mapping"]:
+                raise KeyError(
+                    "Name '{}' already registered for {}.".format(
+                        name, cls.mapping["runner_name_mapping"][name]
+                    )
+                )
+            cls.mapping["runner_name_mapping"][name] = runner_cls
+            return runner_cls
+        return wrap
+    @classmethod
+    def register_path(cls, name, path):
+        r"""Register a path to registry with key 'name'
+        Args:
+            name: Key with which the path will be registered.
+        Usage:
+            from video_llama.common.registry import registry
+        """
+        assert isinstance(path, str), "All path must be str."
+        if name in cls.mapping["paths"]:
+            raise KeyError("Name '{}' already registered.".format(name))
+        cls.mapping["paths"][name] = path
+    @classmethod
+    def register(cls, name, obj):
+        r"""Register an item to registry with key 'name'
+        Args:
+            name: Key with which the item will be registered.
+        Usage::
+            from video_llama.common.registry import registry
+            registry.register("config", {})
+        """
+        path = name.split(".")
+        current = cls.mapping["state"]
+        for part in path[:-1]:
+            if part not in current:
+                current[part] = {}
+            current = current[part]
+        current[path[-1]] = obj
+    # @classmethod
+    # def get_trainer_class(cls, name):
+    #     return cls.mapping["trainer_name_mapping"].get(name, None)
+    @classmethod
+    def get_builder_class(cls, name):
+        return cls.mapping["builder_name_mapping"].get(name, None)
+    @classmethod
+    def get_model_class(cls, name):
+        return cls.mapping["model_name_mapping"].get(name, None)
+    @classmethod
+    def get_task_class(cls, name):
+        return cls.mapping["task_name_mapping"].get(name, None)
+    @classmethod
+    def get_processor_class(cls, name):
+        return cls.mapping["processor_name_mapping"].get(name, None)
+    @classmethod
+    def get_lr_scheduler_class(cls, name):
+        return cls.mapping["lr_scheduler_name_mapping"].get(name, None)
+    @classmethod
+    def get_runner_class(cls, name):
+        return cls.mapping["runner_name_mapping"].get(name, None)
+    @classmethod
+    def list_runners(cls):
+        return sorted(cls.mapping["runner_name_mapping"].keys())
+    @classmethod
+    def list_models(cls):
+        return sorted(cls.mapping["model_name_mapping"].keys())
+    @classmethod
+    def list_tasks(cls):
+        return sorted(cls.mapping["task_name_mapping"].keys())
+    @classmethod
+    def list_processors(cls):
+        return sorted(cls.mapping["processor_name_mapping"].keys())
+    @classmethod
+    def list_lr_schedulers(cls):
+        return sorted(cls.mapping["lr_scheduler_name_mapping"].keys())
+    @classmethod
+    def list_datasets(cls):
+        return sorted(cls.mapping["builder_name_mapping"].keys())
+    @classmethod
+    def get_path(cls, name):
+        return cls.mapping["paths"].get(name, None)
+    @classmethod
+    def get(cls, name, default=None, no_warning=False):
+        r"""Get an item from registry with key 'name'
+        Args:
+            name (string): Key whose value needs to be retrieved.
+            default: If passed and key is not in registry, default value will
+                     be returned with a warning. Default: None
+            no_warning (bool): If passed as True, warning when key doesn't exist
+                               will not be generated. Useful for MMF's
+                               internal operations. Default: False
+        """
+        original_name = name
+        name = name.split(".")
+        value = cls.mapping["state"]
+        for subname in name:
+            value = value.get(subname, default)
+            if value is default:
+                break
+        if (
+            "writer" in cls.mapping["state"]
+            and value == default
+            and no_warning is False
+        ):
+            cls.mapping["state"]["writer"].warning(
+                "Key {} is not present in registry, returning default value "
+                "of {}".format(original_name, default)
+            )
+        return value
+    @classmethod
+    def unregister(cls, name):
+        r"""Remove an item from registry with key 'name'
+        Args:
+            name: Key which needs to be removed.
+        Usage::
+            from mmf.common.registry import registry
+            config = registry.unregister("config")
+        """
+        return cls.mapping["state"].pop(name, None)
+# Module-level Registry instance. The methods shown above are classmethods
+# that read and write ``cls.mapping``, so their state lives on the class.
+registry = Registry()

global_local/common/utils.py ADDED Viewed

	@@ -0,0 +1,424 @@

+"""
+ Copyright (c) 2022, salesforce.com, inc.
+ All rights reserved.
+ SPDX-License-Identifier: BSD-3-Clause
+ For full license text, see the LICENSE_Lavis file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+"""
+import io
+import json
+import logging
+import os
+import pickle
+import re
+import shutil
+import urllib
+import urllib.error
+import urllib.request
+from typing import Optional
+from urllib.parse import urlparse
+import numpy as np
+import pandas as pd
+import yaml
+from iopath.common.download import download
+from iopath.common.file_io import file_lock, g_pathmgr
+from global_local.common.registry import registry
+from torch.utils.model_zoo import tqdm
+from torchvision.datasets.utils import (
+    check_integrity,
+    download_file_from_google_drive,
+    extract_archive,
+)
+def now():
+    from datetime import datetime
+    return datetime.now().strftime("%Y%m%d%H%M")[:-1]
+def is_url(url_or_filename):
+    parsed = urlparse(url_or_filename)
+    return parsed.scheme in ("http", "https")
+def get_cache_path(rel_path):
+    return os.path.expanduser(os.path.join(registry.get_path("cache_root"), rel_path))
+def get_abs_path(rel_path):
+    return os.path.join(registry.get_path("library_root"), rel_path)
+def load_json(filename):
+    with open(filename, "r") as f:
+        return json.load(f)
+# The following are adapted from torchvision and vissl
+# torchvision: https://github.com/pytorch/vision
+# vissl: https://github.com/facebookresearch/vissl/blob/main/vissl/utils/download.py
+def makedir(dir_path):
+    """
+    Create the directory if it does not exist.
+    """
+    is_success = False
+    try:
+        if not g_pathmgr.exists(dir_path):
+            g_pathmgr.mkdirs(dir_path)
+        is_success = True
+    except BaseException:
+        print(f"Error creating directory: {dir_path}")
+    return is_success
+def get_redirected_url(url: str):
+    """
+    Given a URL, returns the URL it redirects to or the
+    original URL in case of no indirection
+    """
+    import requests
+    with requests.Session() as session:
+        with session.get(url, stream=True, allow_redirects=True) as response:
+            if response.history:
+                return response.url
+            else:
+                return url
+def to_google_drive_download_url(view_url: str) -> str:
+    """
+    Utility function to transform a view URL of google drive
+    to a download URL for google drive
+    Example input:
+        https://drive.google.com/file/d/137RyRjvTBkBiIfeYBNZBtViDHQ6_Ewsp/view
+    Example output:
+        https://drive.google.com/uc?export=download&id=137RyRjvTBkBiIfeYBNZBtViDHQ6_Ewsp
+    """
+    splits = view_url.split("/")
+    assert splits[-1] == "view"
+    file_id = splits[-2]
+    return f"https://drive.google.com/uc?export=download&id={file_id}"
+def download_google_drive_url(url: str, output_path: str, output_file_name: str):
+    """
+    Download a file from google drive
+    Downloading an URL from google drive requires confirmation when
+    the file of the size is too big (google drive notifies that
+    anti-viral checks cannot be performed on such files)
+    """
+    import requests
+    with requests.Session() as session:
+        # First get the confirmation token and append it to the URL
+        with session.get(url, stream=True, allow_redirects=True) as response:
+            for k, v in response.cookies.items():
+                if k.startswith("download_warning"):
+                    url = url + "&confirm=" + v
+        # Then download the content of the file
+        with session.get(url, stream=True, verify=True) as response:
+            makedir(output_path)
+            path = os.path.join(output_path, output_file_name)
+            total_size = int(response.headers.get("Content-length", 0))
+            with open(path, "wb") as file:
+                from tqdm import tqdm
+                with tqdm(total=total_size) as progress_bar:
+                    for block in response.iter_content(
+                        chunk_size=io.DEFAULT_BUFFER_SIZE
+                    ):
+                        file.write(block)
+                        progress_bar.update(len(block))
+def _get_google_drive_file_id(url: str) -> Optional[str]:
+    parts = urlparse(url)
+    if re.match(r"(drive|docs)[.]google[.]com", parts.netloc) is None:
+        return None
+    match = re.match(r"/file/d/(?P<id>[^/]*)", parts.path)
+    if match is None:
+        return None
+    return match.group("id")
+def _urlretrieve(url: str, filename: str, chunk_size: int = 1024) -> None:
+    with open(filename, "wb") as fh:
+        with urllib.request.urlopen(
+            urllib.request.Request(url, headers={"User-Agent": "vissl"})
+        ) as response:
+            with tqdm(total=response.length) as pbar:
+                for chunk in iter(lambda: response.read(chunk_size), ""):
+                    if not chunk:
+                        break
+                    pbar.update(chunk_size)
+                    fh.write(chunk)
+def download_url(
+    url: str,
+    root: str,
+    filename: Optional[str] = None,
+    md5: Optional[str] = None,
+) -> None:
+    """Download a file from a url and place it in root.
+    Args:
+        url (str): URL to download file from
+        root (str): Directory to place downloaded file in
+        filename (str, optional): Name to save the file under.
+                                  If None, use the basename of the URL.
+        md5 (str, optional): MD5 checksum of the download. If None, do not check
+    """
+    root = os.path.expanduser(root)
+    if not filename:
+        filename = os.path.basename(url)
+    fpath = os.path.join(root, filename)
+    makedir(root)
+    # check if file is already present locally
+    if check_integrity(fpath, md5):
+        print("Using downloaded and verified file: " + fpath)
+        return
+    # expand redirect chain if needed
+    url = get_redirected_url(url)
+    # check if file is located on Google Drive
+    file_id = _get_google_drive_file_id(url)
+    if file_id is not None:
+        return download_file_from_google_drive(file_id, root, filename, md5)
+    # download the file
+    try:
+        print("Downloading " + url + " to " + fpath)
+        _urlretrieve(url, fpath)
+    except (urllib.error.URLError, IOError) as e:  # type: ignore[attr-defined]
+        if url[:5] == "https":
+            url = url.replace("https:", "http:")
+            print(
+                "Failed download. Trying https -> http instead."
+                " Downloading " + url + " to " + fpath
+            )
+            _urlretrieve(url, fpath)
+        else:
+            raise e
+    # check integrity of downloaded file
+    if not check_integrity(fpath, md5):
+        raise RuntimeError("File not found or corrupted.")
+def download_and_extract_archive(
+    url: str,
+    download_root: str,
+    extract_root: Optional[str] = None,
+    filename: Optional[str] = None,
+    md5: Optional[str] = None,
+    remove_finished: bool = False,
+) -> None:
+    download_root = os.path.expanduser(download_root)
+    if extract_root is None:
+        extract_root = download_root
+    if not filename:
+        filename = os.path.basename(url)
+    download_url(url, download_root, filename, md5)
+    archive = os.path.join(download_root, filename)
+    print("Extracting {} to {}".format(archive, extract_root))
+    extract_archive(archive, extract_root, remove_finished)
+def cache_url(url: str, cache_dir: str) -> str:
+    """
+    This implementation downloads the remote resource and caches it locally.
+    The resource will only be downloaded if not previously requested.
+    """
+    parsed_url = urlparse(url)
+    dirname = os.path.join(cache_dir, os.path.dirname(parsed_url.path.lstrip("/")))
+    makedir(dirname)
+    filename = url.split("/")[-1]
+    cached = os.path.join(dirname, filename)
+    with file_lock(cached):
+        if not os.path.isfile(cached):
+            logging.info(f"Downloading {url} to {cached} ...")
+            cached = download(url, dirname, filename=filename)
+    logging.info(f"URL {url} cached in {cached}")
+    return cached
+# TODO (prigoyal): convert this into RAII-style API
+def create_file_symlink(file1, file2):
+    """
+    Simply create the symlinks for a given file1 to file2.
+    Useful during model checkpointing to symlinks to the
+    latest successful checkpoint.
+    """
+    try:
+        if g_pathmgr.exists(file2):
+            g_pathmgr.rm(file2)
+        g_pathmgr.symlink(file1, file2)
+    except Exception as e:
+        logging.info(f"Could NOT create symlink. Error: {e}")
+def save_file(data, filename, append_to_json=True, verbose=True):
+    """
+    Common i/o utility to handle saving data to various file formats.
+    Supported:
+        .pkl, .pickle, .npy, .json
+    Specifically for .json, users have the option to either append (default)
+    or rewrite by passing in Boolean value to append_to_json.
+    """
+    if verbose:
+        logging.info(f"Saving data to file: {filename}")
+    file_ext = os.path.splitext(filename)[1]
+    if file_ext in [".pkl", ".pickle"]:
+        with g_pathmgr.open(filename, "wb") as fopen:
+            pickle.dump(data, fopen, pickle.HIGHEST_PROTOCOL)
+    elif file_ext == ".npy":
+        with g_pathmgr.open(filename, "wb") as fopen:
+            np.save(fopen, data)
+    elif file_ext == ".json":
+        if append_to_json:
+            with g_pathmgr.open(filename, "a") as fopen:
+                fopen.write(json.dumps(data, sort_keys=True) + "\n")
+                fopen.flush()
+        else:
+            with g_pathmgr.open(filename, "w") as fopen:
+                fopen.write(json.dumps(data, sort_keys=True) + "\n")
+                fopen.flush()
+    elif file_ext == ".yaml":
+        with g_pathmgr.open(filename, "w") as fopen:
+            dump = yaml.dump(data)
+            fopen.write(dump)
+            fopen.flush()
+    else:
+        raise Exception(f"Saving {file_ext} is not supported yet")
+    if verbose:
+        logging.info(f"Saved data to file: {filename}")
+def load_file(filename, mmap_mode=None, verbose=True, allow_pickle=False):
+    """
+    Common i/o utility to handle loading data from various file formats.
+    Supported:
+        .pkl, .pickle, .npy, .json
+    For the npy files, we support reading the files in mmap_mode.
+    If the mmap_mode of reading is not successful, we load data without the
+    mmap_mode.
+    """
+    if verbose:
+        logging.info(f"Loading data from file: {filename}")
+    file_ext = os.path.splitext(filename)[1]
+    if file_ext == ".txt":
+        with g_pathmgr.open(filename, "r") as fopen:
+            data = fopen.readlines()
+    elif file_ext in [".pkl", ".pickle"]:
+        with g_pathmgr.open(filename, "rb") as fopen:
+            data = pickle.load(fopen, encoding="latin1")
+    elif file_ext == ".npy":
+        if mmap_mode:
+            try:
+                with g_pathmgr.open(filename, "rb") as fopen:
+                    data = np.load(
+                        fopen,
+                        allow_pickle=allow_pickle,
+                        encoding="latin1",
+                        mmap_mode=mmap_mode,
+                    )
+            except ValueError as e:
+                logging.info(
+                    f"Could not mmap {filename}: {e}. Trying without g_pathmgr"
+                )
+                data = np.load(
+                    filename,
+                    allow_pickle=allow_pickle,
+                    encoding="latin1",
+                    mmap_mode=mmap_mode,
+                )
+                logging.info("Successfully loaded without g_pathmgr")
+            except Exception:
+                logging.info("Could not mmap without g_pathmgr. Trying without mmap")
+                with g_pathmgr.open(filename, "rb") as fopen:
+                    data = np.load(fopen, allow_pickle=allow_pickle, encoding="latin1")
+        else:
+            with g_pathmgr.open(filename, "rb") as fopen:
+                data = np.load(fopen, allow_pickle=allow_pickle, encoding="latin1")
+    elif file_ext == ".json":
+        with g_pathmgr.open(filename, "r") as fopen:
+            data = json.load(fopen)
+    elif file_ext == ".yaml":
+        with g_pathmgr.open(filename, "r") as fopen:
+            data = yaml.load(fopen, Loader=yaml.FullLoader)
+    elif file_ext == ".csv":
+        with g_pathmgr.open(filename, "r") as fopen:
+            data = pd.read_csv(fopen)
+    else:
+        raise Exception(f"Reading from {file_ext} is not supported yet")
+    return data
+def abspath(resource_path: str):
+    """
+    Make a path absolute, but take into account prefixes like
+    "http://" or "manifold://"
+    """
+    regex = re.compile(r"^\w+://")
+    if regex.match(resource_path) is None:
+        return os.path.abspath(resource_path)
+    else:
+        return resource_path
+def makedir(dir_path):
+    """
+    Create the directory if it does not exist.
+    """
+    is_success = False
+    try:
+        if not g_pathmgr.exists(dir_path):
+            g_pathmgr.mkdirs(dir_path)
+        is_success = True
+    except BaseException:
+        logging.info(f"Error creating directory: {dir_path}")
+    return is_success
+def is_url(input_url):
+    """
+    Check if an input string is a url. look for http(s):// and ignoring the case
+    """
+    is_url = re.match(r"^(?:http)s?://", input_url, re.IGNORECASE) is not None
+    return is_url
+def cleanup_dir(dir):
+    """
+    Utility for deleting a directory. Useful for cleaning the storage space
+    that contains various training artifacts like checkpoints, data etc.
+    """
+    if os.path.exists(dir):
+        logging.info(f"Deleting directory: {dir}")
+        shutil.rmtree(dir)
+    logging.info(f"Deleted contents of directory: {dir}")
+def get_file_size(filename):
+    """
+    Given a file, get the size of file in MB
+    """
+    size_in_mb = os.path.getsize(filename) / float(1024**2)
+    return size_in_mb

global_local/configs/datasets/cc_sbu/align.yaml ADDED Viewed

	@@ -0,0 +1,5 @@

+datasets:
+  cc_sbu_align:
+    data_type: images
+    build_info:
+      storage: /path/to/cc_sbu_align_dataset

global_local/configs/datasets/cc_sbu/defaults.yaml ADDED Viewed

	@@ -0,0 +1,5 @@

+datasets:
+  cc_sbu:
+    data_type: images
+    build_info:
+      storage: /path/to/cc_sbu_dataset/{00000..00001}.tar

global_local/configs/datasets/instruct/llava_instruct.yaml ADDED Viewed

	@@ -0,0 +1,6 @@

+datasets:
+  llava_instruct:
+    data_type: image
+    build_info:
+      anno_dir: /path/llava_instruct_150k.json
+      videos_dir: /path/train2014/train2014/

global_local/configs/datasets/instruct/webvid_instruct.yaml ADDED Viewed

	@@ -0,0 +1,6 @@

+datasets:
+  webvid_instruct:
+    data_type: image
+    build_info:
+      anno_dir: /path/webvid_align/videochat_instruct_11k.json
+      videos_dir: /path/webvid_align/videos/

global_local/configs/datasets/laion/defaults.yaml ADDED Viewed

	@@ -0,0 +1,5 @@

+datasets:
+  laion:
+    data_type: images
+    build_info:
+      storage: path/laion/laion_dataset/{00000..00001}.tar

global_local/configs/datasets/webvid/defaults.yaml ADDED Viewed

	@@ -0,0 +1,6 @@

+datasets:
+  webvid:
+    data_type: video
+    build_info:
+      anno_dir: path/webvid/webvid_tain_data/annotations/
+      videos_dir: path//webvid/webvid_tain_data/videos/

global_local/configs/default.yaml ADDED Viewed

	@@ -0,0 +1,5 @@

+env:
+  # For default users
+  # cache_root: "cache"
+  # For internal use with persistent storage
+  cache_root: "/export/home/.cache/minigpt4"

global_local/configs/models/minigpt4.yaml ADDED Viewed

	@@ -0,0 +1,33 @@

+model:
+  arch: mini_gpt4
+  # vit encoder
+  image_size: 224
+  drop_path_rate: 0
+  use_grad_checkpoint: False
+  vit_precision: "fp16"
+  freeze_vit: True
+  freeze_qformer: True
+  # Q-Former
+  num_query_token: 32
+  # Vicuna
+  llama_model: "ckpt/vicuna-13b/"
+  # generation configs
+  prompt: ""
+preprocess:
+    vis_processor:
+        train:
+          name: "blip2_image_train"
+          image_size: 224
+        eval:
+          name: "blip2_image_eval"
+          image_size: 224
+    text_processor:
+        train:
+          name: "blip_caption"
+        eval:
+          name: "blip_caption"

global_local/configs/models/video_llama.yaml ADDED Viewed

	@@ -0,0 +1,36 @@

+model:
+  arch: video_llama
+  # vit encoder
+  image_size: 224
+  drop_path_rate: 0
+  use_grad_checkpoint: False
+  vit_precision: "fp16"
+  freeze_vit: True
+  freeze_qformer: True
+  # Q-Former
+  num_query_token: 32
+  # Vicuna
+  llama_model: "/projectnb/ivc-ml/samarth/projects/misc/minigpt-4-chat-models/Llama-2-7b-chat-hf"
+  # generation configs
+  prompt: ""
+preprocess:
+    vis_processor:
+        train:
+          name: "alpro_video_train"
+          image_size: 224
+          n_frms: 8
+        eval:
+          name: "alpro_video_eval"
+          image_size: 224
+          n_frms: 8
+    text_processor:
+        train:
+          name: "blip_caption"
+        eval:
+          name: "blip_caption"

global_local/conversation/__init__.py ADDED Viewed

File without changes

global_local/conversation/__pycache__/__init__.cpython-39.pyc ADDED Viewed

Binary file (225 Bytes). View file

global_local/conversation/__pycache__/conversation_video.cpython-39.pyc ADDED Viewed

Binary file (12.2 kB). View file

global_local/conversation/conversation_video.py ADDED Viewed

	@@ -0,0 +1,404 @@

+"""
+Conversation prompt template of Video-LLaMA.
+Adapted from: https://github.com/Vision-CAIR/MiniGPT-4/blob/main/minigpt4/conversation/conversation.py
+"""
+import argparse
+import time
+from PIL import Image
+import sys
+import os
+import torch
+from transformers import AutoTokenizer, AutoModelForCausalLM, LlamaTokenizer
+from transformers import StoppingCriteria, StoppingCriteriaList
+import dataclasses
+from enum import auto, Enum
+from typing import List, Tuple, Any
+import os
+import sys
+from global_local.common.registry import registry
+from global_local.processors.video_processor import ToTHWC,ToUint8,load_video
+from global_local.processors import Blip2ImageEvalProcessor
+#from video_llama.models.ImageBind.data import load_and_transform_audio_data
+class SeparatorStyle(Enum):
+    """Different separator style."""
+    SINGLE = auto()
+    TWO = auto()
+    LLAMA_2 = auto()
+@dataclasses.dataclass
+class Conversation:
+    """A class that keeps all conversation history."""
+    system: str
+    roles: List[str]
+    messages: List[List[str]]
+    offset: int
+    # system_img: List[Image.Image] = []
+    sep_style: SeparatorStyle = SeparatorStyle.SINGLE
+    sep: str = "###"
+    sep2: str = None
+    skip_next: bool = False
+    conv_id: Any = None
+    def get_prompt(self):
+        if self.sep_style == SeparatorStyle.SINGLE:
+            ret = self.system + self.sep
+            for role, message in self.messages:
+                if message:
+                    ret += role + ": " + message + self.sep
+                else:
+                    ret += role + ":"
+            return ret
+        elif self.sep_style == SeparatorStyle.TWO:
+            seps = [self.sep, self.sep2]
+            ret = self.system + seps[0]
+            for i, (role, message) in enumerate(self.messages):
+                if message:
+                    ret += role + ": " + message + seps[i % 2]
+                else:
+                    ret += role + ":"
+            return ret
+        elif self.sep_style == SeparatorStyle.LLAMA_2:
+            wrap_sys = lambda msg: f"<<SYS>>\n{msg}\n<</SYS>>\n\n"
+            wrap_inst = lambda msg: f"[INST] {msg} [/INST]"
+            ret = ""
+            for i, (role, message) in enumerate(self.messages):
+                if i == 0:
+                    assert message, "first message should not be none"
+                    assert role == self.roles[0], "first message should come from user"
+                if message:
+                    if type(message) is tuple:
+                        message, _, _ = message
+                    if i == 0: message = wrap_sys(self.system) + message
+                    if i % 2 == 0:
+                        message = wrap_inst(message)
+                        ret += self.sep + message
+                    else:
+                        ret += " " + message + " " + self.sep2
+                else:
+                    ret += ""
+            ret = ret.lstrip(self.sep)
+            return ret
+        else:
+            raise ValueError(f"Invalid style: {self.sep_style}")
+    def append_message(self, role, message):
+        self.messages.append([role, message])
+    def to_gradio_chatbot(self):
+        ret = []
+        for i, (role, msg) in enumerate(self.messages[self.offset:]):
+            if i % 2 == 0:
+                ret.append([msg, None])
+            else:
+                ret[-1][-1] = msg
+        return ret
+    def copy(self):
+        return Conversation(
+            system=self.system,
+            # system_img=self.system_img,
+            roles=self.roles,
+            messages=[[x, y] for x, y in self.messages],
+            offset=self.offset,
+            sep_style=self.sep_style,
+            sep=self.sep,
+            sep2=self.sep2,
+            conv_id=self.conv_id)
+    def dict(self):
+        return {
+            "system": self.system,
+            # "system_img": self.system_img,
+            "roles": self.roles,
+            "messages": self.messages,
+            "offset": self.offset,
+            "sep": self.sep,
+            "sep2": self.sep2,
+            "conv_id": self.conv_id,
+        }
+class StoppingCriteriaSub(StoppingCriteria):
+    def __init__(self, stops=[], encounters=1):
+        super().__init__()
+        self.stops = stops
+    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor):
+        for stop in self.stops:
+            if torch.all((stop == input_ids[0][-len(stop):])).item():
+                return True
+        return False
+# Conversation templates. Each one starts with no messages.
+# CONV_VISION: SINGLE-style template whose system text describes an image
+# that will be supplied.
+CONV_VISION = Conversation(
+    system="Give the following image: <Img>ImageContent</Img>. "
+           "You will be able to see the image once I provide it to you. Please answer my questions.",
+    roles=("Human", "Assistant"),
+    messages=[],
+    offset=0,
+    sep_style=SeparatorStyle.SINGLE,
+    sep="###",
+)
+# default_conversation: same layout as CONV_VISION with an empty system text.
+default_conversation = Conversation(
+    system="",
+    roles=("Human", "Assistant"),
+    messages=[],
+    offset=0,
+    sep_style=SeparatorStyle.SINGLE,
+    sep="###",
+)
+# conv_llava_llama_2: LLAMA_2-style template using "<s>" / "</s>" separators.
+# NOTE(review): ``messages`` is a tuple here, so append_message() would fail
+# on this object directly; Conversation.copy() rebuilds it as a list —
+# presumably callers are expected to copy() first; confirm.
+conv_llava_llama_2 = Conversation(
+    system="You are a helpful language and vision assistant. "
+           "You are able to understand the visual content that the user provides, "
+           "and assist the user with a variety of tasks using natural language.",
+    roles=("USER", "ASSISTANT"),
+    messages=(),
+    offset=0,
+    sep_style=SeparatorStyle.LLAMA_2,
+    sep="<s>",
+    sep2="</s>",
+)
+class Chat:
+    def __init__(self, model, vis_processor, device='cuda:0'):
+        self.device = device
+        self.model = model
+        self.vis_processor = vis_processor
+        self.image_vis_processor = Blip2ImageEvalProcessor()
+        # stop_words_ids = [torch.tensor([835]).to(self.device),
+        #                   torch.tensor([2277, 29937]).to(self.device)]  # '###' can be encoded in two different ways.
+        # self.stopping_criteria = StoppingCriteriaList([StoppingCriteriaSub(stops=stop_words_ids)])
+        self.num_frames_per_clip = 16
+        self.num_segments = 4
+    def ask(self, text, conv):
+        if len(conv.messages) > 0 and conv.messages[-1][0] == conv.roles[0] \
+                and ('</Video>' in conv.messages[-1][1] or '</Image>' in conv.messages[-1][1]):  # last message is image.
+            conv.messages[-1][1] = ' '.join([conv.messages[-1][1], text])
+        else:
+            conv.append_message(conv.roles[0], text)
+    def answer(self, conv, img_list, max_new_tokens=300, num_beams=1, min_length=1, top_p=0.9,
+               repetition_penalty=1.0, length_penalty=1, temperature=1.0, max_length=2000):
+        conv.append_message(conv.roles[1], None)
+        embs = self.get_context_emb(conv, img_list)
+        current_max_len = embs.shape[1] + max_new_tokens
+        if current_max_len - max_length > 0:
+            print('Warning: The number of tokens in current conversation exceeds the max length. '
+                  'The model will not see the contexts outside the range.')
+        begin_idx = max(0, current_max_len - max_length)
+        embs = embs[:, begin_idx:]
+        if conv.sep =="###":
+            stop_words_ids = [torch.tensor([835]).to(self.device),
+                          torch.tensor([2277, 29937]).to(self.device)]  # '###' can be encoded in two different ways.
+            stopping_criteria = StoppingCriteriaList([StoppingCriteriaSub(stops=stop_words_ids)])
+        else:
+            stop_words_ids = [torch.tensor([2]).to(self.device)]
+            stopping_criteria = StoppingCriteriaList([StoppingCriteriaSub(stops=stop_words_ids)])
+        # stopping_criteria
+        outputs = self.model.llama_model.generate(
+            inputs_embeds=embs,
+            max_new_tokens=max_new_tokens,
+            stopping_criteria=stopping_criteria,
+            num_beams=num_beams,
+            do_sample=True,
+            min_length=min_length,
+            top_p=top_p,
+            repetition_penalty=repetition_penalty,
+            length_penalty=length_penalty,
+            temperature=temperature,
+        )
+        output_token = outputs[0]
+        if output_token[0] == 0:  # the model might output a unknow token <unk> at the beginning. remove it
+            output_token = output_token[1:]
+        if output_token[0] == 1:  # some users find that there is a start token <s> at the beginning. remove it
+            output_token = output_token[1:]
+        output_text = self.model.llama_tokenizer.decode(output_token, add_special_tokens=False)
+        if conv.sep =="###":
+            output_text = output_text.split('###')[0]  # remove the stop sign '###'
+            output_text = output_text.split('Assistant:')[-1].strip()
+        else:
+            output_text = output_text.split(conv.sep2)[0]  # remove the stop sign '###'
+            output_text = output_text.split(conv.roles[1]+':')[-1].strip()
+        conv.messages[-1][1] = output_text
+        return output_text, output_token.cpu().numpy()
+    def upload_video(self, video_path, conv, img_list):
+        msg = ""
+        if isinstance(video_path, str):  # is a video path
+            ext = os.path.splitext(video_path)[-1].lower()
+            print(video_path)
+            # image = self.vis_processor(image).unsqueeze(0).to(self.device)
+            video, msg = load_video(
+                video_path=video_path,
+                n_frms=8,
+                height=224,
+                width=224,
+                sampling ="uniform", return_msg = True
+            )
+            video = self.vis_processor.transform(video)
+            video = video.unsqueeze(0).to(self.device)
+            # print(image)
+        else:
+            raise NotImplementedError
+        try:
+            audio_flag = 1
+            audio = load_and_transform_audio_data([video_path],"cpu",  clips_per_video=8)
+            audio = audio.to(self.device)
+        except :
+            print('no audio is found')
+            audio_flag = 0
+        finally:
+            if audio_flag == 1:
+                # image_emb, _ = self.model.encode_videoQformer_audiovideo(video,audio)
+                image_emb, _ = self.model.encode_videoQformer_visual(video)
+                audio_emb,_  = self.model.encode_audioQformer(audio)
+                img_list.append(audio_emb)
+                img_list.append(image_emb)
+                conv.system = ""
+                # conv.append_message(conv.roles[0], "The audio of this video is <Video><ImageHere></Video> ")
+                conv.append_message(conv.roles[0], "Close your eyes, open your ears and you imagine only based on the sound that: <ImageHere>. \
+                Close your ears, open your eyes and you see that <Video><ImageHere></Video>.  \
+                Now answer my question based on what you have just seen and heard.")
+            else:  # only vison no audio
+                # conv.system = "You can understand the video that the user provides. Follow the instructions carefully and explain your answers in detail."
+                image_emb, _ = self.model.encode_videoQformer_visual(video)
+                img_list.append(image_emb)
+                conv.append_message(conv.roles[0], "<Video><ImageHere></Video> "+ msg)
+            return "Received."
+    def upload_video_without_audio(self, video_path, conv, img_list):
+        msg = ""
+        if isinstance(video_path, str):  # is a video path
+            ext = os.path.splitext(video_path)[-1].lower()
+            print(video_path)
+            # image = self.vis_processor(image).unsqueeze(0).to(self.device)
+            video, msg = load_video(
+                video_path=video_path,
+                n_frms=self.num_frames_per_clip*self.num_segments,
+                height=224,
+                width=224,
+                sampling ="uniform", return_msg = True
+            )
+            video = self.vis_processor.transform(video)
+            video = video.unsqueeze(0).to(self.device)
+        else:
+            raise NotImplementedError
+        # conv.system = "You can understand the video that the user provides.  Follow the instructions carefully and explain your answers in detail."
+        #image_emb, _ = self.model.encode_videoQformer_visual(video)
+        image_emb, _ = self.process_video_frames(video)
+        img_list.append(image_emb)
+        conv.append_message(conv.roles[0], "<Video><ImageHere></Video> "+ msg)
+        return "Received."
+    def process_video_frames(self, all_frames):
+        total_num_frames = self.num_frames_per_clip * self.num_segments
+        global_clip_indices = torch.linspace(0, total_num_frames-1, steps=self.num_frames_per_clip)
+        short_window_indices = torch.linspace(0, total_num_frames-1, steps=self.num_frames_per_clip * self.num_segments)
+        global_processed_frames = []
+        for i in global_clip_indices:
+            i = int(i)
+            curr = all_frames[:, :, i]
+            #curr = np.uint8(all_frames[i])
+            #curr = frame_transform(Image.fromarray(curr))
+            global_processed_frames.append(curr)
+        global_processed_frames = torch.stack(global_processed_frames, dim=2)
+        '''if len(global_processed_frames) < args.num_frames_per_clip:
+            diff = args.num_frames_per_clip - len(global_processed_frames)
+            pad = global_processed_frames[-1].unsqueeze(0).repeat(diff, 1, 1, 1)
+            global_processed_frames = torch.cat((global_processed_frames, pad), dim=0)'''
+        short_window_processed_frames = []
+        for i in short_window_indices:
+            i = int(i)
+            curr = all_frames[:, :, i]
+            #curr = np.uint8(all_frames[i])
+            #curr = frame_transform(Image.fromarray(curr))
+            short_window_processed_frames.append(curr)
+        short_window_processed_frames = torch.stack(short_window_processed_frames, dim=2)
+        '''if len(short_window_processed_frames) < args.num_frames_per_clip * args.num_segments:
+            diff = args.num_frames_per_clip * args.num_segments - len(short_window_processed_frames)
+            pad = short_window_processed_frames[-1].unsqueeze(0).repeat(diff, 1, 1, 1)
+            short_window_processed_frames = torch.cat((short_window_processed_frames, pad), dim=0)'''
+        global_attn_mask = torch.zeros((self.num_frames_per_clip))
+        global_attn_mask[:global_processed_frames.size(2)] = True
+        short_window_attn_mask = torch.zeros((self.num_frames_per_clip * self.num_segments))
+        short_window_attn_mask[:short_window_processed_frames.size(2)] = True
+        global_processed_frames = global_processed_frames.permute((0, 2, 1, 3, 4)).cuda()
+        short_window_processed_frames = short_window_processed_frames.permute((0, 2, 1, 3, 4)).cuda()
+        global_frame_attn_mask = global_attn_mask.unsqueeze(0).cuda()
+        segments_frame_attn_mask = short_window_attn_mask.unsqueeze(0).cuda()
+        with torch.no_grad():
+            samples = {'global_video': global_processed_frames, 'global_frame_attn_mask': global_frame_attn_mask, 'segments_video': short_window_processed_frames, 'segments_frame_attn_mask': segments_frame_attn_mask}
+            merged_video_embeds, merged_video_embeds_mask = self.model.compute_merged_video_embeds(samples)
+        return merged_video_embeds, merged_video_embeds_mask
+    def upload_img(self, image, conv, img_list):
+        msg = ""
+        if isinstance(image, str):  # is a image path
+            raw_image = Image.open(image).convert('RGB') # 增加一个时间维度
+            image = self.image_vis_processor(raw_image).unsqueeze(0).unsqueeze(2).to(self.device)
+        elif isinstance(image, Image.Image):
+            raw_image = image
+            image = self.image_vis_processor(raw_image).unsqueeze(0).unsqueeze(2).to(self.device)
+        elif isinstance(image, torch.Tensor):
+            if len(image.shape) == 3:
+                image = image.unsqueeze(0)
+            image = image.to(self.device)
+        else:
+            raise NotImplementedError
+        image_emb, _ = self.model.encode_videoQformer_visual(image)
+        img_list.append(image_emb)
+        # Todo msg=""
+        conv.append_message(conv.roles[0], "<Image><ImageHere></Image> "+ msg)
+        return "Received."
+    def get_context_emb(self, conv, img_list):
+        prompt = conv.get_prompt()
+        prompt_segs = prompt.split('<ImageHere>')
+        assert len(prompt_segs) == len(img_list) + 1, "Unmatched numbers of image placeholders and images."
+        seg_tokens = [
+            self.model.llama_tokenizer(
+                seg, return_tensors="pt", add_special_tokens=i == 0).to(self.device).input_ids
+            # only add bos to the first seg
+            for i, seg in enumerate(prompt_segs)
+        ]
+        seg_embs = [self.model.llama_model.model.embed_tokens(seg_t) for seg_t in seg_tokens]
+        mixed_embs = [emb for pair in zip(seg_embs[:-1], img_list) for emb in pair] + [seg_embs[-1]]
+        mixed_embs = torch.cat(mixed_embs, dim=1)
+        return mixed_embs
+if __name__ =='__main__':
+    # Manual smoke test: run the audio loading/transform helper on one sample clip.
+    # NOTE(review): the path below is a hard-coded absolute path; adjust it before running.
+    video_path = '/mnt/workspace/videoGPT/Video-LLaMA/examples/applausing.mp4'
+    # import torch.classes.torchaudio.ffmpeg_StreamReader
+    # ffmpeg_StreamReader(video_path)
+    load_and_transform_audio_data([video_path],"cpu",  clips_per_video=8)

global_local/datasets/__init__.py ADDED Viewed

File without changes

global_local/datasets/__pycache__/__init__.cpython-39.pyc ADDED Viewed

Binary file (221 Bytes). View file

global_local/datasets/__pycache__/data_utils.cpython-39.pyc ADDED Viewed

Binary file (6.04 kB). View file

global_local/datasets/builders/__init__.py ADDED Viewed

	@@ -0,0 +1,77 @@

+"""
+ Copyright (c) 2022, salesforce.com, inc.
+ All rights reserved.
+ SPDX-License-Identifier: BSD-3-Clause
+ For full license text, see the LICENSE_Lavis file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+"""
+from global_local.datasets.builders.base_dataset_builder import load_dataset_config
+from global_local.datasets.builders.image_text_pair_builder import (
+    CCSBUBuilder,
+    LaionBuilder,
+    CCSBUAlignBuilder
+)
+from global_local.datasets.builders.video_caption_builder import WebvidBuilder
+from global_local.common.registry import registry
+from global_local.datasets.builders.instruct_builder import WebvidInstruct_Builder,LlavaInstruct_Builder
+__all__ = [
+    "CCSBUBuilder",
+    "LaionBuilder",
+    "CCSBUAlignBuilder",
+    "WebvidBuilder",
+    "LlavaInstruct_Builder",
+    "WebvidInstruct_Builder"
+]
+def load_dataset(name, cfg_path=None, vis_path=None, data_type=None):
+    """
+    Example
+    >>> dataset = load_dataset("coco_caption", cfg=None)
+    >>> splits = dataset.keys()
+    >>> print([len(dataset[split]) for split in splits])
+    """
+    if cfg_path is None:
+        cfg = None
+    else:
+        cfg = load_dataset_config(cfg_path)
+    try:
+        builder = registry.get_builder_class(name)(cfg)
+    except TypeError:
+        print(
+            f"Dataset {name} not found. Available datasets:\n"
+            + ", ".join([str(k) for k in dataset_zoo.get_names()])
+        )
+        exit(1)
+    if vis_path is not None:
+        if data_type is None:
+            # use default data type in the config
+            data_type = builder.config.data_type
+        assert (
+            data_type in builder.config.build_info
+        ), f"Invalid data_type {data_type} for {name}."
+        builder.config.build_info.get(data_type).storage = vis_path
+    dataset = builder.build_datasets()
+    return dataset
+class DatasetZoo:
+    def __init__(self) -> None:
+        self.dataset_zoo = {
+            k: list(v.DATASET_CONFIG_DICT.keys())
+            for k, v in sorted(registry.mapping["builder_name_mapping"].items())
+        }
+    def get_names(self):
+        return list(self.dataset_zoo.keys())
+dataset_zoo = DatasetZoo()

global_local/datasets/builders/__pycache__/__init__.cpython-39.pyc ADDED Viewed

Binary file (2.69 kB). View file

global_local/datasets/builders/__pycache__/base_dataset_builder.cpython-39.pyc ADDED Viewed

Binary file (6.15 kB). View file

global_local/datasets/builders/__pycache__/image_text_pair_builder.cpython-39.pyc ADDED Viewed

Binary file (3.06 kB). View file

global_local/datasets/builders/__pycache__/instruct_builder.cpython-39.pyc ADDED Viewed

Binary file (2.62 kB). View file

global_local/datasets/builders/__pycache__/video_caption_builder.cpython-39.pyc ADDED Viewed

Binary file (1.52 kB). View file

global_local/datasets/builders/base_dataset_builder.py ADDED Viewed

	@@ -0,0 +1,236 @@

+"""
+ This file is from
+ Copyright (c) 2022, salesforce.com, inc.
+ All rights reserved.
+ SPDX-License-Identifier: BSD-3-Clause
+ For full license text, see the LICENSE_Lavis file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+"""
+import logging
+import os
+import shutil
+import warnings
+from omegaconf import OmegaConf
+import torch.distributed as dist
+from torchvision.datasets.utils import download_url
+import global_local.common.utils as utils
+from global_local.common.dist_utils import is_dist_avail_and_initialized, is_main_process
+from global_local.common.registry import registry
+from global_local.processors.base_processor import BaseProcessor
+class BaseDatasetBuilder:
+    train_dataset_cls, eval_dataset_cls = None, None
+    def __init__(self, cfg=None):
+        super().__init__()
+        if cfg is None:
+            # help to create datasets from default config.
+            self.config = load_dataset_config(self.default_config_path())
+        elif isinstance(cfg, str):
+            self.config = load_dataset_config(cfg)
+        else:
+            # when called from task.build_dataset()
+            self.config = cfg
+        self.data_type = self.config.data_type
+        self.vis_processors = {"train": BaseProcessor(), "eval": BaseProcessor()}
+        self.text_processors = {"train": BaseProcessor(), "eval": BaseProcessor()}
+    def build_datasets(self):
+        # download, split, etc...
+        # only called on 1 GPU/TPU in distributed
+        if is_main_process():
+            self._download_data()
+        if is_dist_avail_and_initialized():
+            dist.barrier()
+        # at this point, all the annotations and image/videos should be all downloaded to the specified locations.
+        logging.info("Building datasets...")
+        datasets = self.build()  # dataset['train'/'val'/'test']
+        return datasets
+    def build_processors(self):
+        vis_proc_cfg = self.config.get("vis_processor")
+        txt_proc_cfg = self.config.get("text_processor")
+        if vis_proc_cfg is not None:
+            vis_train_cfg = vis_proc_cfg.get("train")
+            vis_eval_cfg = vis_proc_cfg.get("eval")
+            self.vis_processors["train"] = self._build_proc_from_cfg(vis_train_cfg)
+            self.vis_processors["eval"] = self._build_proc_from_cfg(vis_eval_cfg)
+        if txt_proc_cfg is not None:
+            txt_train_cfg = txt_proc_cfg.get("train")
+            txt_eval_cfg = txt_proc_cfg.get("eval")
+            self.text_processors["train"] = self._build_proc_from_cfg(txt_train_cfg)
+            self.text_processors["eval"] = self._build_proc_from_cfg(txt_eval_cfg)
+    @staticmethod
+    def _build_proc_from_cfg(cfg):
+        return (
+            registry.get_processor_class(cfg.name).from_config(cfg)
+            if cfg is not None
+            else None
+        )
+    @classmethod
+    def default_config_path(cls, type="default"):
+        return utils.get_abs_path(cls.DATASET_CONFIG_DICT[type])
+    def _download_data(self):
+        self._download_ann()
+        self._download_vis()
+    def _download_ann(self):
+        """
+        Download annotation files if necessary.
+        All the vision-language datasets should have annotations of unified format.
+        storage_path can be:
+          (1) relative/absolute: will be prefixed with env.cache_root to make full path if relative.
+          (2) basename/dirname: will be suffixed with base name of URL if dirname is provided.
+        Local annotation paths should be relative.
+        """
+        anns = self.config.build_info.annotations
+        splits = anns.keys()
+        cache_root = registry.get_path("cache_root")
+        for split in splits:
+            info = anns[split]
+            urls, storage_paths = info.get("url", None), info.storage
+            if isinstance(urls, str):
+                urls = [urls]
+            if isinstance(storage_paths, str):
+                storage_paths = [storage_paths]
+            assert len(urls) == len(storage_paths)
+            for url_or_filename, storage_path in zip(urls, storage_paths):
+                # if storage_path is relative, make it full by prefixing with cache_root.
+                if not os.path.isabs(storage_path):
+                    storage_path = os.path.join(cache_root, storage_path)
+                dirname = os.path.dirname(storage_path)
+                if not os.path.exists(dirname):
+                    os.makedirs(dirname)
+                if os.path.isfile(url_or_filename):
+                    src, dst = url_or_filename, storage_path
+                    if not os.path.exists(dst):
+                        shutil.copyfile(src=src, dst=dst)
+                    else:
+                        logging.info("Using existing file {}.".format(dst))
+                else:
+                    if os.path.isdir(storage_path):
+                        # if only dirname is provided, suffix with basename of URL.
+                        raise ValueError(
+                            "Expecting storage_path to be a file path, got directory {}".format(
+                                storage_path
+                            )
+                        )
+                    else:
+                        filename = os.path.basename(storage_path)
+                    download_url(url=url_or_filename, root=dirname, filename=filename)
+    def _download_vis(self):
+        storage_path = self.config.build_info.get(self.data_type).storage
+        storage_path = utils.get_cache_path(storage_path)
+        if not os.path.exists(storage_path):
+            warnings.warn(
+                f"""
+                The specified path {storage_path} for visual inputs does not exist.
+                Please provide a correct path to the visual inputs or
+                refer to datasets/download_scripts/README.md for downloading instructions.
+                """
+            )
+    def build(self):
+        """
+        Create by split datasets inheriting torch.utils.data.Datasets.
+        # build() can be dataset-specific. Overwrite to customize.
+        """
+        self.build_processors()
+        build_info = self.config.build_info
+        ann_info = build_info.annotations
+        vis_info = build_info.get(self.data_type)
+        datasets = dict()
+        for split in ann_info.keys():
+            if split not in ["train", "val", "test"]:
+                continue
+            is_train = split == "train"
+            # processors
+            vis_processor = (
+                self.vis_processors["train"]
+                if is_train
+                else self.vis_processors["eval"]
+            )
+            text_processor = (
+                self.text_processors["train"]
+                if is_train
+                else self.text_processors["eval"]
+            )
+            # annotation path
+            ann_paths = ann_info.get(split).storage
+            if isinstance(ann_paths, str):
+                ann_paths = [ann_paths]
+            abs_ann_paths = []
+            for ann_path in ann_paths:
+                if not os.path.isabs(ann_path):
+                    ann_path = utils.get_cache_path(ann_path)
+                abs_ann_paths.append(ann_path)
+            ann_paths = abs_ann_paths
+            # visual data storage path
+            vis_path = os.path.join(vis_info.storage, split)
+            if not os.path.isabs(vis_path):
+                # vis_path = os.path.join(utils.get_cache_path(), vis_path)
+                vis_path = utils.get_cache_path(vis_path)
+            if not os.path.exists(vis_path):
+                warnings.warn("storage path {} does not exist.".format(vis_path))
+            # create datasets
+            dataset_cls = self.train_dataset_cls if is_train else self.eval_dataset_cls
+            datasets[split] = dataset_cls(
+                vis_processor=vis_processor,
+                text_processor=text_processor,
+                ann_paths=ann_paths,
+                vis_root=vis_path,
+            )
+        return datasets
+def load_dataset_config(cfg_path):
+    cfg = OmegaConf.load(cfg_path).datasets
+    cfg = cfg[list(cfg.keys())[0]]
+    return cfg

global_local/datasets/builders/image_text_pair_builder.py ADDED Viewed

	@@ -0,0 +1,106 @@

+import os
+import logging
+import warnings
+from global_local.common.registry import registry
+from global_local.datasets.builders.base_dataset_builder import BaseDatasetBuilder
+from global_local.datasets.datasets.laion_dataset import LaionDataset
+from global_local.datasets.datasets.cc_sbu_dataset import CCSBUDataset, CCSBUAlignDataset
+@registry.register_builder("cc_sbu")
+class CCSBUBuilder(BaseDatasetBuilder):
+    train_dataset_cls = CCSBUDataset
+    DATASET_CONFIG_DICT = {"default": "configs/datasets/cc_sbu/defaults.yaml"}
+    def _download_ann(self):
+        pass
+    def _download_vis(self):
+        pass
+    def build(self):
+        self.build_processors()
+        build_info = self.config.build_info
+        datasets = dict()
+        split = "train"
+        # create datasets
+        # [NOTE] return inner_datasets (wds.DataPipeline)
+        dataset_cls = self.train_dataset_cls
+        datasets[split] = dataset_cls(
+            vis_processor=self.vis_processors[split],
+            text_processor=self.text_processors[split],
+            location=build_info.storage,
+        ).inner_dataset
+        return datasets
+@registry.register_builder("laion")
+class LaionBuilder(BaseDatasetBuilder):
+    train_dataset_cls = LaionDataset
+    DATASET_CONFIG_DICT = {"default": "configs/datasets/laion/defaults.yaml"}
+    def _download_ann(self):
+        pass
+    def _download_vis(self):
+        pass
+    def build(self):
+        self.build_processors()
+        build_info = self.config.build_info
+        datasets = dict()
+        split = "train"
+        # create datasets
+        # [NOTE] return inner_datasets (wds.DataPipeline)
+        dataset_cls = self.train_dataset_cls
+        datasets[split] = dataset_cls(
+            vis_processor=self.vis_processors[split],
+            text_processor=self.text_processors[split],
+            location=build_info.storage,
+        ).inner_dataset
+        return datasets
+@registry.register_builder("cc_sbu_align")
+class CCSBUAlignBuilder(BaseDatasetBuilder):
+    train_dataset_cls = CCSBUAlignDataset
+    DATASET_CONFIG_DICT = {
+        "default": "configs/datasets/cc_sbu/align.yaml",
+    }
+    def build_datasets(self):
+        # at this point, all the annotations and image/videos should be all downloaded to the specified locations.
+        logging.info("Building datasets...")
+        self.build_processors()
+        build_info = self.config.build_info
+        storage_path = build_info.storage
+        datasets = dict()
+        if not os.path.exists(storage_path):
+            warnings.warn("storage path {} does not exist.".format(storage_path))
+        # create datasets
+        dataset_cls = self.train_dataset_cls
+        datasets['train'] = dataset_cls(
+            vis_processor=self.vis_processors["train"],
+            text_processor=self.text_processors["train"],
+            ann_paths=[os.path.join(storage_path, 'filter_cap.json')],
+            vis_root=os.path.join(storage_path, 'image'),
+        )
+        return datasets

global_local/datasets/builders/instruct_builder.py ADDED Viewed

	@@ -0,0 +1,78 @@

+import os
+import logging
+import warnings
+from global_local.common.registry import registry
+from global_local.datasets.builders.base_dataset_builder import BaseDatasetBuilder
+from global_local.datasets.datasets.laion_dataset import LaionDataset
+from global_local.datasets.datasets.llava_instruct_dataset import Instruct_Dataset
+from global_local.datasets.datasets.video_instruct_dataset import Video_Instruct_Dataset
+@registry.register_builder("instruct")
+class Instruct_Builder(BaseDatasetBuilder):
+    train_dataset_cls = Instruct_Dataset
+    DATASET_CONFIG_DICT = {"default": "configs/datasets/instruct/defaults.yaml"}
+    def _download_ann(self):
+        pass
+    def _download_vis(self):
+        pass
+    def build(self):
+        self.build_processors()
+        datasets = dict()
+        split = "train"
+        build_info = self.config.build_info
+        dataset_cls = self.train_dataset_cls
+        if self.config.num_video_query_token:
+            num_video_query_token = self.config.num_video_query_token
+        else:
+            num_video_query_token = 32
+        if self.config.tokenizer_name:
+            tokenizer_name = self.config.tokenizer_name
+        else:
+            tokenizer_name = '/mnt/workspace/ckpt/vicuna-13b/'
+        datasets[split] = dataset_cls(
+            vis_processor=self.vis_processors[split],
+            text_processor=self.text_processors[split],
+            vis_root=build_info.videos_dir,
+            ann_root=build_info.anno_dir,
+            num_video_query_token = num_video_query_token,
+            tokenizer_name = tokenizer_name,
+            data_type = self.config.data_type
+        )
+        return datasets
+@registry.register_builder("webvid_instruct")
+class WebvidInstruct_Builder(Instruct_Builder):
+    train_dataset_cls = Video_Instruct_Dataset
+    DATASET_CONFIG_DICT = {
+        "default": "configs/datasets/instruct/webvid_instruct.yaml",
+    }
+@registry.register_builder("webvid_instruct_zh")
+class WebvidInstruct_zh_Builder(Instruct_Builder):
+    train_dataset_cls = Video_Instruct_Dataset
+    DATASET_CONFIG_DICT = {
+        "default": "configs/datasets/instruct/webvid_instruct.yaml",
+    }
+@registry.register_builder("llava_instruct")
+class LlavaInstruct_Builder(Instruct_Builder):
+    train_dataset_cls = Instruct_Dataset
+    DATASET_CONFIG_DICT = {
+        "default": "configs/datasets/instruct/llava_instruct.yaml",
+    }

global_local/datasets/builders/video_caption_builder.py ADDED Viewed

	@@ -0,0 +1,34 @@

+import os
+import logging
+import warnings
+from global_local.common.registry import registry
+from global_local.datasets.builders.base_dataset_builder import BaseDatasetBuilder
+from global_local.datasets.datasets.webvid_datasets import WebvidDataset
+@registry.register_builder("webvid")
+class WebvidBuilder(BaseDatasetBuilder):
+    train_dataset_cls = WebvidDataset
+    DATASET_CONFIG_DICT = {"default": "configs/datasets/webvid/defaults.yaml"}
+    def _download_ann(self):
+        pass
+    def _download_vis(self):
+        pass
+    def build(self):
+        self.build_processors()
+        datasets = dict()
+        split = "train"
+        build_info = self.config.build_info
+        dataset_cls = self.train_dataset_cls
+        datasets[split] = dataset_cls(
+            vis_processor=self.vis_processors[split],
+            text_processor=self.text_processors[split],
+            vis_root=build_info.videos_dir,
+            ann_root=build_info.anno_dir
+        )
+        return datasets

global_local/datasets/data_utils.py ADDED Viewed

	@@ -0,0 +1,196 @@

+"""
+ Copyright (c) 2022, salesforce.com, inc.
+ All rights reserved.
+ SPDX-License-Identifier: BSD-3-Clause
+ For full license text, see the LICENSE_Lavis file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+"""
+import gzip
+import logging
+import os
+import random as rnd
+import tarfile
+import zipfile
+import random
+from typing import List
+from tqdm import tqdm
+import decord
+from decord import VideoReader
+import webdataset as wds
+import numpy as np
+import torch
+from torch.utils.data.dataset import IterableDataset
+from global_local.common.registry import registry
+from global_local.datasets.datasets.base_dataset import ConcatDataset
+decord.bridge.set_bridge("torch")
+MAX_INT = registry.get("MAX_INT")
+class ChainDataset(wds.DataPipeline):
+    r"""Dataset for chaining multiple :class:`DataPipeline` s.
+    This class is useful to assemble different existing dataset streams. The
+    chaining operation is done on-the-fly, so concatenating large-scale
+    datasets with this class will be efficient.
+    Args:
+        datasets (iterable of IterableDataset): datasets to be chained together
+    """
+    def __init__(self, datasets: List[wds.DataPipeline]) -> None:
+        super().__init__()
+        self.datasets = datasets
+        self.prob = []
+        self.names = []
+        for dataset in self.datasets:
+            if hasattr(dataset, 'name'):
+                self.names.append(dataset.name)
+            else:
+                self.names.append('Unknown')
+            if hasattr(dataset, 'sample_ratio'):
+                self.prob.append(dataset.sample_ratio)
+            else:
+                self.prob.append(1)
+                logging.info("One of the datapipeline doesn't define ratio and set to 1 automatically.")
+    def __iter__(self):
+        datastreams = [iter(dataset) for dataset in self.datasets]
+        while True:
+            select_datastream = random.choices(datastreams, weights=self.prob, k=1)[0]
+            yield next(select_datastream)
+def apply_to_sample(f, sample):
+    if len(sample) == 0:
+        return {}
+    def _apply(x):
+        if torch.is_tensor(x):
+            return f(x)
+        elif isinstance(x, dict):
+            return {key: _apply(value) for key, value in x.items()}
+        elif isinstance(x, list):
+            return [_apply(x) for x in x]
+        else:
+            return x
+    return _apply(sample)
+def move_to_cuda(sample):
+    def _move_to_cuda(tensor):
+        return tensor.cuda()
+    return apply_to_sample(_move_to_cuda, sample)
+def prepare_sample(samples, cuda_enabled=True):
+    if cuda_enabled:
+        samples = move_to_cuda(samples)
+    # TODO fp16 support
+    return samples
+def reorg_datasets_by_split(datasets):
+    """
+    Organizes datasets by split.
+    Args:
+        datasets: dict of torch.utils.data.Dataset objects by name.
+    Returns:
+        Dict of datasets by split {split_name: List[Datasets]}.
+    """
+    # if len(datasets) == 1:
+    #     return datasets[list(datasets.keys())[0]]
+    # else:
+    reorg_datasets = dict()
+    # reorganize by split
+    for _, dataset in datasets.items():
+        for split_name, dataset_split in dataset.items():
+            if split_name not in reorg_datasets:
+                reorg_datasets[split_name] = [dataset_split]
+            else:
+                reorg_datasets[split_name].append(dataset_split)
+    return reorg_datasets
+def concat_datasets(datasets):
+    """
+    Concatenates multiple datasets into a single dataset.
+    It supports may-style datasets and DataPipeline from WebDataset. Currently, does not support
+    generic IterableDataset because it requires creating separate samplers.
+    Now only supports conctenating training datasets and assuming validation and testing
+    have only a single dataset. This is because metrics should not be computed on the concatenated
+    datasets.
+    Args:
+        datasets: dict of torch.utils.data.Dataset objects by split.
+    Returns:
+        Dict of concatenated datasets by split, "train" is the concatenation of multiple datasets,
+        "val" and "test" remain the same.
+        If the input training datasets contain both map-style and DataPipeline datasets, returns
+        a tuple, where the first element is a concatenated map-style dataset and the second
+        element is a chained DataPipeline dataset.
+    """
+    # concatenate datasets in the same split
+    for split_name in datasets:
+        if split_name != "train":
+            assert (
+                len(datasets[split_name]) == 1
+            ), "Do not support multiple {} datasets.".format(split_name)
+            datasets[split_name] = datasets[split_name][0]
+        else:
+            iterable_datasets, map_datasets = [], []
+            for dataset in datasets[split_name]:
+                if isinstance(dataset, wds.DataPipeline):
+                    logging.info(
+                        "Dataset {} is IterableDataset, can't be concatenated.".format(
+                            dataset
+                        )
+                    )
+                    iterable_datasets.append(dataset)
+                elif isinstance(dataset, IterableDataset):
+                    raise NotImplementedError(
+                        "Do not support concatenation of generic IterableDataset."
+                    )
+                else:
+                    map_datasets.append(dataset)
+            # if len(iterable_datasets) > 0:
+            # concatenate map-style datasets and iterable-style datasets separately
+            if len(iterable_datasets) > 1:
+                chained_datasets = (
+                    ChainDataset(iterable_datasets)
+                )
+            elif len(iterable_datasets) == 1:
+                chained_datasets = iterable_datasets[0]
+            else:
+                chained_datasets = None
+            concat_datasets = (
+                ConcatDataset(map_datasets) if len(map_datasets) > 0 else None
+            )
+            train_datasets = concat_datasets, chained_datasets
+            train_datasets = tuple([x for x in train_datasets if x is not None])
+            train_datasets = (
+                train_datasets[0] if len(train_datasets) == 1 else train_datasets
+            )
+            datasets[split_name] = train_datasets
+    return datasets

global_local/datasets/datasets/__init__.py ADDED Viewed

File without changes