leiwx52 committed on
Commit
5a444be
1 Parent(s): 57debeb

VLog hf gradio demo

This view is limited to 50 files because it contains too many changes. See raw diff
Files changed (50)
  1. app.py +159 -0
  2. examples/C8lMW0MODFs.log +3 -0
  3. examples/C8lMW0MODFs.mp4 +3 -0
  4. examples/XZVHmRvfDHM.log +3 -0
  5. examples/XZVHmRvfDHM.mp4 +3 -0
  6. examples/basketball_vlog.log +3 -0
  7. examples/basketball_vlog.mp4 +3 -0
  8. examples/buy_watermelon.log +3 -0
  9. examples/buy_watermelon.mp4 +3 -0
  10. examples/covid.log +3 -0
  11. examples/covid.mp4 +3 -0
  12. examples/huaqiang.log +3 -0
  13. examples/huaqiang.mp4 +3 -0
  14. examples/news.log +3 -0
  15. examples/news.mp4 +3 -0
  16. examples/outcGtbnMuQ.log +3 -0
  17. examples/outcGtbnMuQ.mp4 +3 -0
  18. examples/travel_in_roman.log +3 -0
  19. examples/travel_in_roman.mp4 +3 -0
  20. examples/travel_in_roman_full.log +3 -0
  21. examples/travel_in_roman_full.mp4 +3 -0
  22. examples/vlog.jpg +0 -0
  23. models/__init__.py +3 -0
  24. models/__pycache__/__init__.cpython-38.pyc +0 -0
  25. models/__pycache__/blip2_model.cpython-38.pyc +0 -0
  26. models/__pycache__/clip_model.cpython-38.pyc +0 -0
  27. models/__pycache__/gpt_model.cpython-38.pyc +0 -0
  28. models/__pycache__/grit_model.cpython-38.pyc +0 -0
  29. models/__pycache__/kts_model.cpython-38.pyc +0 -0
  30. models/__pycache__/vlog.cpython-38.pyc +0 -0
  31. models/__pycache__/whisper_model.cpython-38.pyc +0 -0
  32. models/blip2_model.py +47 -0
  33. models/clip_model.py +54 -0
  34. models/gpt_model.py +102 -0
  35. models/grit_model.py +21 -0
  36. models/grit_src/__pycache__/image_dense_captions.cpython-38.pyc +0 -0
  37. models/grit_src/configs/Base.yaml +77 -0
  38. models/grit_src/configs/GRiT_B_DenseCap.yaml +20 -0
  39. models/grit_src/configs/GRiT_B_DenseCap_ObjectDet.yaml +23 -0
  40. models/grit_src/configs/GRiT_B_ObjectDet.yaml +20 -0
  41. models/grit_src/configs/GRiT_H_ObjectDet.yaml +21 -0
  42. models/grit_src/configs/GRiT_L_ObjectDet.yaml +20 -0
  43. models/grit_src/grit/__init__.py +7 -0
  44. models/grit_src/grit/__pycache__/__init__.cpython-38.pyc +0 -0
  45. models/grit_src/grit/__pycache__/config.cpython-38.pyc +0 -0
  46. models/grit_src/grit/__pycache__/predictor.cpython-38.pyc +0 -0
  47. models/grit_src/grit/config.py +50 -0
  48. models/grit_src/grit/custom_solver.py +88 -0
  49. models/grit_src/grit/data/__pycache__/custom_build_augmentation.cpython-38.pyc +0 -0
  50. models/grit_src/grit/data/__pycache__/custom_dataset_mapper.cpython-38.pyc +0 -0
app.py ADDED
@@ -0,0 +1,159 @@
+ import os
+ import gradio as gr
+ import openai
+ import requests
+ import csv
+ import argparse
+ from models.vlog import Vlogger
+
+ parser = argparse.ArgumentParser()
+ parser.add_argument('--video_path', default='examples/huaqiang.mp4')
+ parser.add_argument('--alpha', default=10, type=int, help='Determines the maximum segment number for the KTS algorithm; the larger the value, the fewer segments.')
+ parser.add_argument('--beta', default=1, type=int, help='The smallest time gap between successive clips, in seconds.')
+ parser.add_argument('--data_dir', default='./examples', type=str, help='Directory for saving videos and logs.')
+ parser.add_argument('--tmp_dir', default='./tmp', type=str, help='Directory for saving intermediate files.')
+
+ # * Model settings *
+ parser.add_argument('--openai_api_key', default='xxx', type=str, help='OpenAI API key')
+ parser.add_argument('--image_caption', action='store_true', dest='image_caption', default=True, help='Set this flag to True if you want to use BLIP image captioning')
+ parser.add_argument('--dense_caption', action='store_true', dest='dense_caption', default=True, help='Set this flag to True if you want to use dense captioning')
+ parser.add_argument('--feature_extractor', default='openai/clip-vit-base-patch32', help='Select the feature extractor model for video segmentation')
+ parser.add_argument('--feature_extractor_device', choices=['cuda', 'cpu'], default='cuda', help='Select the device: cuda or cpu')
+ parser.add_argument('--image_captioner', choices=['blip', 'blip2'], dest='captioner_base_model', default='blip2', help='blip2 requires about 15 GB of GPU memory, blip about 6 GB')
+ parser.add_argument('--image_captioner_device', choices=['cuda', 'cpu'], default='cuda', help='Select the device: cuda or cpu; more than 14 GB of GPU memory is recommended')
+ parser.add_argument('--dense_captioner_device', choices=['cuda', 'cpu'], default='cuda', help='Select the device: cuda or cpu; GPUs with less than 6 GB of memory are not recommended')
+ parser.add_argument('--audio_translator', default='large')
+ parser.add_argument('--audio_translator_device', choices=['cuda', 'cpu'], default='cuda')
+ parser.add_argument('--gpt_version', choices=['gpt-3.5-turbo'], default='gpt-3.5-turbo')
+
+ args = parser.parse_args()
+
+
+ def get_empty_state():
+     return {"total_tokens": 0, "messages": []}
+
+
+ def submit_api_key_fn(api_key, vlogger):
+     try:
+         vlogger.init_llm_with_api_key(api_key)
+         return gr.update(value="OpenAI key submitted successfully 🎉"), True, vlogger
+     except Exception as e:
+         return gr.update(value=f"Error: {e}"), False, vlogger
+
+
+ def submit_message(prompt, state, vlogger, api_key_submitted, vlog_loaded):
+     if not api_key_submitted:
+         return gr.update(value=''), [("👀", "Please enter your OpenAI API key 😊")], state, vlogger
+
+     if not vlog_loaded:
+         return gr.update(value=''), [("👀", "Please follow the instructions to select a video and generate the document for chatting 😊")], state, vlogger
+
+     history = state['messages']
+
+     if not prompt:
+         return gr.update(value=''), [(history[i]['content'], history[i + 1]['content']) for i in range(0, len(history) - 1, 2)], state, vlogger
+
+     prompt_msg = {"role": "user", "content": prompt}
+
+     try:
+         history.append(prompt_msg)
+         answer = vlogger.chat2video(prompt)
+         history.append({"role": "system", "content": answer})
+     except Exception as e:
+         history.append(prompt_msg)
+         history.append({
+             "role": "system",
+             "content": f"Error: {e}"
+         })
+
+     chat_messages = [(history[i]['content'], history[i + 1]['content']) for i in range(0, len(history) - 1, 2)]
+     return '', chat_messages, state, vlogger
+
+
+ def clear_conversation(vlogger):
+     vlogger.clean_history()
+     # return input_message, video_inp, chatbot, vlog_outp, state, vlogger, vlog_loaded
+     return gr.update(value=None, visible=True), gr.update(value=None, interactive=False), None, gr.update(value=None, visible=True), get_empty_state(), vlogger, False
+
+
+ def vlog_fn(vid_path, vlogger, api_key_submitted):
+     if not api_key_submitted:
+         log_text = "====== Please enter your OpenAI API key first 😊 ======"
+         return gr.update(value=log_text, visible=True), False, vlogger
+
+     print(vid_path)
+     if vid_path is None:
+         log_text = "====== Please select a video from the examples first 🤔 ======"
+         vloaded_flag = False
+     else:
+         log_list = vlogger.video2log(vid_path)
+         log_text = "\n".join(log_list)
+         vloaded_flag = True
+     return gr.update(value=log_text, visible=True), vloaded_flag, vlogger
+
+
+ css = """
+ #col-container {max-width: 90%; margin-left: auto; margin-right: auto;}
+ #video_inp {min-height: 300px}
+ #chatbox {min-height: 100px;}
+ #header {text-align: center;}
+ #hint {font-size: 0.9em; padding: 0.5em; margin: 0;}
+ .message {font-size: 1.2em;}
+ """
+
+ with gr.Blocks(css=css) as demo:
+
+     state = gr.State(get_empty_state())
+     vlogger = gr.State(Vlogger(args))
+     vlog_loaded = gr.State(False)
+     api_key_submitted = gr.State(False)
+
+     with gr.Column(elem_id="col-container"):
+         gr.Markdown("""## 🎞️ VLog Demo
+ Powered by BLIP2, GRIT, Whisper, ChatGPT and LangChain
+ Github: [https://github.com/showlab/VLog](https://github.com/showlab/VLog)""",
+                     elem_id="header")
+         gr.Markdown("*Instructions*: For the current demo, please enter your OpenAI API key, select an example video, click the button to generate a document, and try chatting over the video 😊", elem_id="hint")
+         with gr.Row():
+             with gr.Column(scale=6):
+                 video_inp = gr.Video(label="video_input", interactive=False)
+                 chatbot = gr.Chatbot(elem_id="chatbox")
+                 input_message = gr.Textbox(show_label=False, placeholder="Enter text and press enter", visible=True).style(container=False)
+                 btn_submit = gr.Button("Submit")
+                 btn_clear_conversation = gr.Button("🔃 Start New Conversation")
+
+             with gr.Column(scale=6):
+                 vlog_btn = gr.Button("Generate Video Document")
+                 vlog_outp = gr.Textbox(label="Document output", lines=30)
+
+             with gr.Column(scale=1):
+                 openai_api_key = gr.Textbox(
+                     placeholder="Input OpenAI API key and press Enter",
+                     show_label=False,
+                     label="OpenAI API Key",
+                     lines=1,
+                     type="password"
+                 )
+                 examples = gr.Examples(
+                     examples=[
+                         ["examples/basketball_vlog.mp4"],
+                         ["examples/travel_in_roman.mp4"],
+                         ["examples/C8lMW0MODFs.mp4"],
+                         ["examples/outcGtbnMuQ.mp4"],
+                         ["examples/huaqiang.mp4"],
+                     ],
+                     inputs=[video_inp],
+                 )
+
+         gr.HTML('''<br><br><br><center>You can duplicate this Space to skip the queue: <a href="https://huggingface.co/spaces/TencentARC/VLog?duplicate=true"><img src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a><br></center>''')
+
+     btn_submit.click(submit_message, [input_message, state, vlogger, api_key_submitted, vlog_loaded], [input_message, chatbot, state, vlogger])
+     input_message.submit(submit_message, [input_message, state, vlogger, api_key_submitted, vlog_loaded], [input_message, chatbot, state, vlogger])
+     btn_clear_conversation.click(clear_conversation, [vlogger], [input_message, video_inp, chatbot, vlog_outp, state, vlogger, vlog_loaded])
+     vlog_btn.click(vlog_fn, [video_inp, vlogger, api_key_submitted], [vlog_outp, vlog_loaded, vlogger])
+     openai_api_key.submit(submit_api_key_fn, [openai_api_key, vlogger], [vlog_outp, api_key_submitted, vlogger])
+     demo.load(queue=False)
+
+ demo.queue(concurrency_count=10)
+ demo.launch(height='800px', server_port=8749, debug=True, share=True)
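The event wiring above maps each UI action to one `Vlogger` call. As a hedged illustration (not part of this commit), the sketch below drives the same calls headless: the `argparse.Namespace` simply mirrors the parser defaults above, the API key is a placeholder, and the question string is hypothetical.

```python
# Minimal sketch: run the VLog pipeline without the Gradio UI, reusing app.py's defaults.
import argparse
from models.vlog import Vlogger

args = argparse.Namespace(
    video_path='examples/huaqiang.mp4', alpha=10, beta=1,
    data_dir='./examples', tmp_dir='./tmp', openai_api_key='sk-...',  # placeholder key
    image_caption=True, dense_caption=True,
    feature_extractor='openai/clip-vit-base-patch32', feature_extractor_device='cuda',
    captioner_base_model='blip2', image_captioner_device='cuda',
    dense_captioner_device='cuda', audio_translator='large',
    audio_translator_device='cuda', gpt_version='gpt-3.5-turbo',
)

vlogger = Vlogger(args)
vlogger.init_llm_with_api_key(args.openai_api_key)   # same call as submit_api_key_fn
log_lines = vlogger.video2log(args.video_path)        # same call as vlog_fn
print("\n".join(log_lines))
print(vlogger.chat2video("What happens in this video?"))  # same call as submit_message
```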
examples/C8lMW0MODFs.log ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b044e554f8dc7a790b02aa1ebc391165b84d93cce9579fa6b2fe0418cd4d1122
+ size 9075
examples/C8lMW0MODFs.mp4 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d094489e459ae952880f4cbd8fdbcc790df1a69ccf9fb4f6c5fca998b6871133
+ size 10537029
examples/XZVHmRvfDHM.log ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e37046ae268e20f3d44df7410954c3cf5ffd73116e6f5e3f9ef73a690f001d51
+ size 7262
examples/XZVHmRvfDHM.mp4 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c2da0eae7e0b18c04ad4f2b8124a09fbbde407eeedb0a532dbf40701c8c744b5
+ size 1961212
examples/basketball_vlog.log ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:8b2d18c6d7d7c5061ae41b9cd2b8cc0828d2aee2b02b40b4286fdd26905b0ac0
+ size 23527
examples/basketball_vlog.mp4 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a5d6034c324f3e9de35278783ed68a85081ef74a252c9394e273b339f7d1b6c3
+ size 32376805
examples/buy_watermelon.log ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0cd0e7bfca9fba4b71428235d41b446083ffe8d7496ef43249f7438017def067
+ size 3922
examples/buy_watermelon.mp4 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:926ee7ec1ca4d3e0674a647bf84887bdf077961c3972148ae23fb569c22e0e4e
+ size 6209789
examples/covid.log ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:50281df7c21815c662e2f03e461b02dd5b2f8253a3f92bcd1dfca4229d89e3ce
+ size 9782
examples/covid.mp4 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:53c35480ff6ac15f2f8747aa9ba9dc36086d5f4e342ac79eac5e43e5bd248817
+ size 16090827
examples/huaqiang.log ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0cd0e7bfca9fba4b71428235d41b446083ffe8d7496ef43249f7438017def067
+ size 3922
examples/huaqiang.mp4 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:926ee7ec1ca4d3e0674a647bf84887bdf077961c3972148ae23fb569c22e0e4e
+ size 6209789
examples/news.log ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:42068b9573daee32bf33d5aa4049f937bfa2cb3c6472d40c42332f8d2173a929
+ size 8968
examples/news.mp4 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:905e453db16213c962d01371357877b8a168da50508676b81cf474d431d3d2ca
+ size 23599849
examples/outcGtbnMuQ.log ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:45a3911acfe78745ed9cfc9502deebef1ab6912dc89566735fcbdf7acda00b44
+ size 63033
examples/outcGtbnMuQ.mp4 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:47f4ddd4debd3c5955cb7c0a1f5e2ffa9c0d6a171931898ee085c5eab521f33d
+ size 98609326
examples/travel_in_roman.log ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f90d4b4c46322b6f15984b64aaedf35232b2fb21ddac518f1c5784fe25944e3c
+ size 9166
examples/travel_in_roman.mp4 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b02522bb72215bcb1a69657af9d08cad0141e1b3e30553024609cb0927471e04
+ size 34442658
examples/travel_in_roman_full.log ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9a163943c7676168b51adf08e5305dd78fba7c54e6cd00330c06541eb23d0d23
+ size 45295
examples/travel_in_roman_full.mp4 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c1cb54427e21a1ccbba23bfe5314e4ae2d45658d0b4b654f815abf1861c1ca3c
+ size 92642344
examples/vlog.jpg ADDED
models/__init__.py ADDED
@@ -0,0 +1,3 @@
+ from .kts_src import *
+ from .clip_model import *
+ from .grit_model import *
models/__pycache__/__init__.cpython-38.pyc ADDED
Binary file (215 Bytes).
models/__pycache__/blip2_model.cpython-38.pyc ADDED
Binary file (2.02 kB).
models/__pycache__/clip_model.cpython-38.pyc ADDED
Binary file (1.91 kB).
models/__pycache__/gpt_model.cpython-38.pyc ADDED
Binary file (3.43 kB).
models/__pycache__/grit_model.cpython-38.pyc ADDED
Binary file (1.21 kB).
models/__pycache__/kts_model.cpython-38.pyc ADDED
Binary file (1.34 kB).
models/__pycache__/vlog.cpython-38.pyc ADDED
Binary file (4.34 kB).
models/__pycache__/whisper_model.cpython-38.pyc ADDED
Binary file (1.24 kB).
models/blip2_model.py ADDED
@@ -0,0 +1,47 @@
+ import torch
+ from PIL import Image
+ from transformers import Blip2Processor, Blip2ForConditionalGeneration, BlipProcessor, BlipForConditionalGeneration
+
+ class ImageCaptioner:
+     def __init__(self, model_name="blip2-opt", device="cpu"):
+         self.model_name = model_name
+         self.device = device
+         self.processor, self.model = self.initialize_model()
+
+     def initialize_model(self):
+         if self.device == 'cpu':
+             self.data_type = torch.float32
+         else:
+             self.data_type = torch.float16
+         processor, model = None, None
+         if self.model_name == "blip2-opt":
+             processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b-coco")
+             model = Blip2ForConditionalGeneration.from_pretrained(
+                 "Salesforce/blip2-opt-2.7b-coco", torch_dtype=self.data_type, low_cpu_mem_usage=True)
+
+         elif self.model_name == "blip2-flan-t5":
+             processor = Blip2Processor.from_pretrained("Salesforce/blip2-flan-t5-xl")
+             model = Blip2ForConditionalGeneration.from_pretrained(
+                 "Salesforce/blip2-flan-t5-xl", torch_dtype=self.data_type, low_cpu_mem_usage=True)
+
+         # for gpu with small memory
+         elif self.model_name == "blip":
+             processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
+             model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
+
+         else:
+             raise NotImplementedError(f"{self.model_name} not implemented.")
+         model.to(self.device)
+
+         if self.device != 'cpu':
+             model.half()
+         return processor, model
+
+     def image_caption(self, image):
+         inputs = self.processor(images=image, return_tensors="pt").to(self.device, self.data_type)
+         generated_ids = self.model.generate(**inputs)
+         generated_text = self.processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()
+         return generated_text
+
+     def image_caption_debug(self, image_src):
+         return "A dish with salmon, broccoli, and something yellow."
models/clip_model.py ADDED
@@ -0,0 +1,54 @@
+ import os
+ import cv2
+ import pdb
+ import torch
+ import numpy as np
+ from PIL import Image
+ from transformers import CLIPProcessor, CLIPVisionModelWithProjection
+ from transformers import logging
+ logging.set_verbosity_error()
+
+ class FeatureExtractor():
+     def __init__(self, args):
+         self.device = args.feature_extractor_device
+         self.beta = args.beta
+         self.processor = CLIPProcessor.from_pretrained(args.feature_extractor)
+         self.model = CLIPVisionModelWithProjection.from_pretrained(args.feature_extractor).to(self.device)
+         self.data_dir = args.data_dir
+         self.tmp_dir = args.tmp_dir
+
+     def __call__(self, video_path, video_id):
+         cap = cv2.VideoCapture(video_path)
+         fps = cap.get(cv2.CAP_PROP_FPS)
+         frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
+         video_length = frame_count / fps
+         sample_rate = int(fps) * self.beta
+
+         save_path = os.path.join(self.tmp_dir, video_id + '.npz')
+         if os.path.exists(save_path):
+             data = np.load(save_path)
+             clip_features = data['features']
+             return clip_features, video_length
+
+         clip_features = []
+         print("Extract the clip feature.")
+         while True:
+             ret, frame = cap.read()
+             if not ret:
+                 break
+
+             if cap.get(cv2.CAP_PROP_POS_FRAMES) % sample_rate == 0:
+                 image = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+                 inputs = self.processor(images=image, return_tensors="pt").pixel_values
+                 inputs = inputs.to(self.device)
+
+                 with torch.no_grad():
+                     feat = self.model(inputs)['image_embeds']
+                 clip_features.append(feat.cpu().numpy())
+         print("Finished.")
+
+         clip_features = np.concatenate(clip_features, axis=0)
+         np.savez_compressed(save_path, features=clip_features)
+
+         return clip_features, video_length
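`FeatureExtractor` samples roughly one frame every `beta` seconds, embeds each frame with CLIP, and caches the result as `<tmp_dir>/<video_id>.npz`. A minimal sketch of calling it directly, assuming a `Namespace` with just the fields the constructor reads:

```python
# Sketch only: extract (or reload cached) CLIP features for one example video.
import os
from argparse import Namespace
from models.clip_model import FeatureExtractor

args = Namespace(feature_extractor='openai/clip-vit-base-patch32',
                 feature_extractor_device='cpu',  # 'cuda' if a GPU is available
                 beta=1, data_dir='./examples', tmp_dir='./tmp')
os.makedirs(args.tmp_dir, exist_ok=True)

extractor = FeatureExtractor(args)
features, video_length = extractor('examples/huaqiang.mp4', 'huaqiang')
print(features.shape, video_length)  # one feature row per sampled frame
```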
models/gpt_model.py ADDED
@@ -0,0 +1,102 @@
+ import os
+ import pdb
+ import pickle
+ from langchain.llms import OpenAI
+ from langchain.vectorstores.faiss import FAISS
+ from langchain.chains import ChatVectorDBChain
+ from langchain.prompts.prompt import PromptTemplate
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+ from langchain.document_loaders import UnstructuredFileLoader
+ from langchain.embeddings import OpenAIEmbeddings
+
+ _template = """Given the following conversation and a follow up question, rephrase the follow up question to be a standalone question.
+ You can assume the discussion is about the video content.
+ Chat History:
+ {chat_history}
+ Follow Up Input: {question}
+ Standalone question:"""
+ CONDENSE_QUESTION_PROMPT = PromptTemplate.from_template(_template)
+
+ qa_template = """You are an AI assistant designed for answering questions about a video.
+ You are given a document and a question; the document records what people see and hear in this video.
+ Try to connect this information and provide a conversational answer.
+ Question: {question}
+ =========
+ {context}
+ =========
+ """
+ QA_PROMPT = PromptTemplate(template=qa_template, input_variables=["question", "context"])
+
+
+ class LlmReasoner():
+     def __init__(self, args):
+         self.history = []
+         self.gpt_version = args.gpt_version
+         self.data_dir = args.data_dir
+         self.tmp_dir = args.tmp_dir
+         self.qa_chain = None
+         self.vectorstore = None
+         self.top_k = 3
+         self.llm = OpenAI(temperature=0, model_name=self.gpt_version)
+
+     def exist_vectorstore(self, video_id):
+         pkl_path = os.path.join(self.tmp_dir, f"{video_id}.pkl")
+         log_path = os.path.join(self.data_dir, f"{video_id}.log")
+         if os.path.exists(pkl_path) and os.path.exists(log_path):
+             with open(pkl_path, 'rb') as file:
+                 self.vectorstore = pickle.load(file)
+
+             self.qa_chain = ChatVectorDBChain.from_llm(
+                 self.llm,
+                 self.vectorstore,
+                 qa_prompt=QA_PROMPT,
+                 condense_question_prompt=CONDENSE_QUESTION_PROMPT,
+             )
+             self.qa_chain.top_k_docs_for_context = self.top_k
+             return True
+         return False
+
+     def create_vectorstore(self, video_id):
+         pkl_path = os.path.join(self.tmp_dir, f"{video_id}.pkl")
+
+         if not os.path.exists(pkl_path):
+             loader = UnstructuredFileLoader(os.path.join(self.data_dir, f"{video_id}.log"))
+             raw_documents = loader.load()
+
+             # Split text
+             text_splitter = RecursiveCharacterTextSplitter()
+             documents = text_splitter.split_documents(raw_documents)
+
+             # Load data into the vectorstore
+             embeddings = OpenAIEmbeddings()
+             vectorstore = FAISS.from_documents(documents, embeddings)
+
+             # Save vectorstore
+             with open(pkl_path, "wb") as f:
+                 pickle.dump(vectorstore, f)
+
+         with open(pkl_path, 'rb') as file:
+             self.vectorstore = pickle.load(file)
+
+         self.qa_chain = ChatVectorDBChain.from_llm(
+             self.llm,
+             self.vectorstore,
+             qa_prompt=QA_PROMPT,
+             condense_question_prompt=CONDENSE_QUESTION_PROMPT,
+         )
+         self.qa_chain.top_k_docs_for_context = self.top_k
+
+         return
+
+     def __call__(self, question):
+         print(f"Question: {question}")
+         response = self.qa_chain({"question": question, "chat_history": self.history})["answer"]
+         self.history.append((question, response))
+
+         print(f"Assistant: {response}")
+         print("\n")
+         return response
+
+     def clean_history(self):
+         self.history = []
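`LlmReasoner` embeds the generated `<video_id>.log` into a FAISS store (pickled under `tmp_dir`) and answers questions through `ChatVectorDBChain`. A hedged sketch of the intended call order, assuming `OPENAI_API_KEY` is set and a log for `huaqiang` was already produced by the pipeline:

```python
# Sketch only: question answering over an already-generated video log.
import os
from argparse import Namespace
from models.gpt_model import LlmReasoner

os.environ["OPENAI_API_KEY"] = "sk-..."  # placeholder; used by OpenAI and OpenAIEmbeddings
args = Namespace(gpt_version='gpt-3.5-turbo', data_dir='./examples', tmp_dir='./tmp')

reasoner = LlmReasoner(args)
if not reasoner.exist_vectorstore('huaqiang'):   # reuse ./tmp/huaqiang.pkl if present
    reasoner.create_vectorstore('huaqiang')      # otherwise embed ./examples/huaqiang.log
print(reasoner("Who appears in the video?"))     # hypothetical question
reasoner.clean_history()
```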
models/grit_model.py ADDED
@@ -0,0 +1,21 @@
+ import os
+ from models.grit_src.image_dense_captions import image_caption_api
+
+ class DenseCaptioner():
+     def __init__(self, device):
+         self.device = device
+
+     def initialize_model(self):
+         pass
+
+     def image_dense_caption_debug(self, image_src):
+         dense_caption = """
+         1. the broccoli is green, [0, 0, 333, 325];
+         2. a piece of broccoli, [0, 147, 143, 324];
+         3. silver fork on plate, [4, 547, 252, 612];
+         """
+         return dense_caption
+
+     def image_dense_caption(self, image_src):
+         dense_caption = image_caption_api(image_src, self.device)
+         return dense_caption
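`DenseCaptioner` is a thin wrapper around `image_caption_api` from `grit_src`. A usage sketch; `image_caption_api` is outside this 50-file view, so passing a file path as `image_src` is an assumption here:

```python
# Sketch only: region-level captions through the DenseCaptioner wrapper above.
from models.grit_model import DenseCaptioner

dense_captioner = DenseCaptioner(device="cuda")  # GRiT inference on CPU is very slow
# Output follows the numbered "caption, [x1, y1, x2, y2]" format shown in
# image_dense_caption_debug() above.
print(dense_captioner.image_dense_caption("examples/vlog.jpg"))  # path argument assumed
```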
models/grit_src/__pycache__/image_dense_captions.cpython-38.pyc ADDED
Binary file (2.33 kB).
models/grit_src/configs/Base.yaml ADDED
@@ -0,0 +1,77 @@
+ MODEL:
+   META_ARCHITECTURE: "GRiT"
+   MASK_ON: True
+   PROPOSAL_GENERATOR:
+     NAME: "CenterNet"
+   FPN:
+     IN_FEATURES: ["layer3", "layer4", "layer5"]
+   PIXEL_MEAN: [123.675, 116.280, 103.530]
+   PIXEL_STD: [58.395, 57.12, 57.375]
+   ROI_HEADS:
+     NAME: GRiTROIHeadsAndTextDecoder
+     IN_FEATURES: ["p3", "p4", "p5"]
+     IOU_THRESHOLDS: [0.6]
+     NUM_CLASSES: 1
+     SCORE_THRESH_TEST: 0.02
+     NMS_THRESH_TEST: 0.5
+     OBJECT_FEAT_POOLER_RES: 14
+   ROI_BOX_CASCADE_HEAD:
+     IOUS: [0.6, 0.7, 0.8]
+   ROI_BOX_HEAD:
+     NAME: "FastRCNNConvFCHead"
+     NUM_FC: 2
+     POOLER_RESOLUTION: 7
+     CLS_AGNOSTIC_BBOX_REG: True
+     MULT_PROPOSAL_SCORE: True
+   ROI_MASK_HEAD:
+     NAME: "MaskRCNNConvUpsampleHead"
+     NUM_CONV: 4
+     POOLER_RESOLUTION: 14
+     CLS_AGNOSTIC_MASK: True
+   CENTERNET:
+     NUM_CLASSES: 1
+     REG_WEIGHT: 1.
+     NOT_NORM_REG: True
+     ONLY_PROPOSAL: True
+     WITH_AGN_HM: True
+     INFERENCE_TH: 0.0001
+     PRE_NMS_TOPK_TRAIN: 4000
+     POST_NMS_TOPK_TRAIN: 2000
+     PRE_NMS_TOPK_TEST: 1000
+     POST_NMS_TOPK_TEST: 256
+     NMS_TH_TRAIN: 0.9
+     NMS_TH_TEST: 0.9
+     POS_WEIGHT: 0.5
+     NEG_WEIGHT: 0.5
+     IGNORE_HIGH_FP: 0.85
+ DATASETS:
+   TRAIN: ("coco_2017_train",)
+   TEST: ("coco_2017_val",)
+ DATALOADER:
+   SAMPLER_TRAIN: "MultiDatasetSampler"
+   DATASET_RATIO: [1]
+   DATASET_INPUT_SIZE: [1024]
+   DATASET_INPUT_SCALE: [[0.1, 2.0]]
+   FILTER_EMPTY_ANNOTATIONS: False
+   NUM_WORKERS: 8
+ TEST:
+   DETECTIONS_PER_IMAGE: 256
+ SOLVER:
+   LR_SCHEDULER_NAME: "WarmupCosineLR"
+   CHECKPOINT_PERIOD: 10000
+   WARMUP_ITERS: 1000
+   WARMUP_FACTOR: 0.001
+   USE_CUSTOM_SOLVER: True
+   OPTIMIZER: "ADAMW"
+   MAX_ITER: 180000
+   IMS_PER_BATCH: 64
+   BASE_LR: 0.00008
+   VIT_LAYER_DECAY: True
+   CLIP_GRADIENTS:
+     ENABLED: True
+ INPUT:
+   FORMAT: RGB
+   CUSTOM_AUG: EfficientDetResizeCrop
+   TRAIN_SIZE: 640
+ USE_ACT_CHECKPOINT: True
+ VERSION: 2
models/grit_src/configs/GRiT_B_DenseCap.yaml ADDED
@@ -0,0 +1,20 @@
+ _BASE_: "Base.yaml"
+ MODEL:
+   TRAIN_TASK: ["DenseCap"]
+   TEST_TASK: "DenseCap"
+   MASK_ON: False
+   ROI_HEADS:
+     SOFT_NMS_ENABLED: False
+   BEAM_SIZE: 1
+   WEIGHTS: "detectron2://ImageNetPretrained/MAE/mae_pretrain_vit_base.pth"
+   BACKBONE:
+     NAME: build_vit_fpn_backbone
+   VIT_LAYERS: 12
+ SOLVER:
+   VIT_LAYER_DECAY_RATE: 0.7
+ DATASETS:
+   TRAIN: ("vg_train",)
+   TEST: ("vg_test",)
+ DATALOADER:
+   DATASET_BS: 2
+ OUTPUT_DIR: "./output/GRiT_B_DenseCap"
models/grit_src/configs/GRiT_B_DenseCap_ObjectDet.yaml ADDED
@@ -0,0 +1,23 @@
+ _BASE_: "Base.yaml"
+ MODEL:
+   TRAIN_TASK: ["ObjectDet", "DenseCap"]
+   TEST_TASK: "DenseCap" # DenseCap or ObjectDet: Choose one for testing
+   MASK_ON: True
+   ROI_HEADS:
+     SOFT_NMS_ENABLED: False
+   BEAM_SIZE: 1
+   WEIGHTS: "detectron2://ImageNetPretrained/MAE/mae_pretrain_vit_base.pth"
+   BACKBONE:
+     NAME: build_vit_fpn_backbone
+   VIT_LAYERS: 12
+ SOLVER:
+   VIT_LAYER_DECAY_RATE: 0.7
+ DATASETS:
+   TRAIN: ("GRiT_coco2017_train", "vg_train")
+   TEST: ("coco_2017_test-dev",)
+ DATALOADER:
+   DATASET_RATIO: [1, 1]
+   DATASET_BS: 2
+   DATASET_INPUT_SIZE: [1024, 1024]
+   DATASET_INPUT_SCALE: [[0.1, 2.0], [0.1, 2.0]]
+ OUTPUT_DIR: "./output/GRiT_B_DenseCap_ObjectDet"
models/grit_src/configs/GRiT_B_ObjectDet.yaml ADDED
@@ -0,0 +1,20 @@
+ _BASE_: "Base.yaml"
+ MODEL:
+   TRAIN_TASK: ["ObjectDet"]
+   TEST_TASK: "ObjectDet"
+   MASK_ON: True
+   ROI_HEADS:
+     SOFT_NMS_ENABLED: True
+   BEAM_SIZE: 3
+   WEIGHTS: "detectron2://ImageNetPretrained/MAE/mae_pretrain_vit_base.pth"
+   BACKBONE:
+     NAME: build_vit_fpn_backbone
+   VIT_LAYERS: 12
+ SOLVER:
+   VIT_LAYER_DECAY_RATE: 0.7
+ DATASETS:
+   TRAIN: ("GRiT_coco2017_train",)
+   TEST: ("coco_2017_val",)
+ DATALOADER:
+   DATASET_BS: 2
+ OUTPUT_DIR: "./output/GRiT_B_ObjectDet"
models/grit_src/configs/GRiT_H_ObjectDet.yaml ADDED
@@ -0,0 +1,21 @@
+ _BASE_: "Base.yaml"
+ MODEL:
+   TRAIN_TASK: ["ObjectDet"]
+   TEST_TASK: "ObjectDet"
+   MASK_ON: True
+   ROI_HEADS:
+     SOFT_NMS_ENABLED: True
+   BEAM_SIZE: 3
+   WEIGHTS: "detectron2://ImageNetPretrained/MAE/mae_pretrain_vit_huge_p14to16.pth"
+   BACKBONE:
+     NAME: build_vit_fpn_backbone_huge
+   VIT_LAYERS: 32
+ SOLVER:
+   MAX_ITER: 135000
+   VIT_LAYER_DECAY_RATE: 0.9
+ DATASETS:
+   TRAIN: ("GRiT_coco2017_train",)
+   TEST: ("coco_2017_val",)
+ DATALOADER:
+   DATASET_BS: 1
+ OUTPUT_DIR: "./output/GRiT_H_ObjectDet"
models/grit_src/configs/GRiT_L_ObjectDet.yaml ADDED
@@ -0,0 +1,20 @@
+ _BASE_: "Base.yaml"
+ MODEL:
+   TRAIN_TASK: ["ObjectDet"]
+   TEST_TASK: "ObjectDet"
+   MASK_ON: True
+   ROI_HEADS:
+     SOFT_NMS_ENABLED: True
+   BEAM_SIZE: 3
+   WEIGHTS: "detectron2://ImageNetPretrained/MAE/mae_pretrain_vit_large.pth"
+   BACKBONE:
+     NAME: build_vit_fpn_backbone_large
+   VIT_LAYERS: 24
+ SOLVER:
+   VIT_LAYER_DECAY_RATE: 0.8
+ DATASETS:
+   TRAIN: ("GRiT_coco2017_train",)
+   TEST: ("coco_2017_val",)
+ DATALOADER:
+   DATASET_BS: 1
+ OUTPUT_DIR: "./output/GRiT_L_ObjectDet"
models/grit_src/grit/__init__.py ADDED
@@ -0,0 +1,7 @@
+ from .modeling.meta_arch import grit
+ from .modeling.roi_heads import grit_roi_heads
+ from .modeling.backbone import vit
+
+ from .data.datasets import object365
+ from .data.datasets import vg
+ from .data.datasets import grit_coco
models/grit_src/grit/__pycache__/__init__.cpython-38.pyc ADDED
Binary file (414 Bytes).
models/grit_src/grit/__pycache__/config.cpython-38.pyc ADDED
Binary file (1.41 kB).
models/grit_src/grit/__pycache__/predictor.cpython-38.pyc ADDED
Binary file (2.65 kB).
models/grit_src/grit/config.py ADDED
@@ -0,0 +1,50 @@
+ from detectron2.config import CfgNode as CN
+
+
+ def add_grit_config(cfg):
+     _C = cfg
+
+     _C.MODEL.BEAM_SIZE = 1
+     _C.MODEL.TRAIN_TASK = ["ObjectDet", "DenseCap"]
+     _C.MODEL.TEST_TASK = "DenseCap"  # This can be varied if the model is jointly trained on multiple tasks
+
+     _C.MODEL.ROI_BOX_HEAD.USE_BIAS = 0.0  # >= 0: not use
+     _C.MODEL.ROI_BOX_HEAD.MULT_PROPOSAL_SCORE = False
+
+     _C.MODEL.ROI_HEADS.MASK_WEIGHT = 1.0
+     _C.MODEL.ROI_HEADS.OBJECT_FEAT_POOLER_RES = 14
+     _C.MODEL.ROI_HEADS.SOFT_NMS_ENABLED = False
+
+     # Backbones
+     _C.MODEL.VIT_LAYERS = 12
+
+     # Text Decoder
+     _C.TEXT_DECODER = CN()
+     _C.TEXT_DECODER.VOCAB_SIZE = 30522
+     _C.TEXT_DECODER.HIDDEN_SIZE = 768
+     _C.TEXT_DECODER.NUM_LAYERS = 6
+     _C.TEXT_DECODER.ATTENTION_HEADS = 12
+     _C.TEXT_DECODER.FEEDFORWARD_SIZE = 768 * 4
+
+     # Multi-dataset dataloader
+     _C.DATALOADER.DATASET_RATIO = [1, 1]  # sample ratio
+     _C.DATALOADER.DATASET_BS = 1
+     _C.DATALOADER.DATASET_INPUT_SIZE = [1024, 1024]
+     _C.DATALOADER.DATASET_INPUT_SCALE = [(0.1, 2.0), (0.1, 2.0)]
+     _C.DATALOADER.DATASET_MIN_SIZES = [(640, 800), (640, 800)]
+     _C.DATALOADER.DATASET_MAX_SIZES = [1333, 1333]
+
+     _C.SOLVER.USE_CUSTOM_SOLVER = True
+     _C.SOLVER.OPTIMIZER = 'ADAMW'
+     _C.SOLVER.VIT_LAYER_DECAY = True
+     _C.SOLVER.VIT_LAYER_DECAY_RATE = 0.7
+
+     _C.INPUT.CUSTOM_AUG = 'EfficientDetResizeCrop'
+     _C.INPUT.TRAIN_SIZE = 1024
+     _C.INPUT.TEST_SIZE = 1024
+     _C.INPUT.SCALE_RANGE = (0.1, 2.)
+     # 'default' for fixed short / long edge
+     _C.INPUT.TEST_INPUT_TYPE = 'default'
+
+     _C.FIND_UNUSED_PARAM = True
+     _C.USE_ACT_CHECKPOINT = True
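`add_grit_config` registers the GRiT-specific keys on top of the stock detectron2 defaults; the YAML files earlier in this commit are then merged over them. A minimal sketch of that flow (note: `Base.yaml` also sets `CENTERNET.*` keys, whose config helper lives elsewhere in this repo and is outside this 50-file view, so the merge step is only indicated in a comment):

```python
# Sketch only: extending a detectron2 config with the GRiT keys defined above.
from detectron2.config import get_cfg
from models.grit_src.grit.config import add_grit_config

cfg = get_cfg()        # stock detectron2 defaults
add_grit_config(cfg)   # registers MODEL.BEAM_SIZE, TEXT_DECODER.*, DATALOADER.*, ...
print(cfg.MODEL.TEST_TASK, cfg.TEXT_DECODER.HIDDEN_SIZE, cfg.SOLVER.VIT_LAYER_DECAY_RATE)
# A full setup would also register the CenterNet proposal-generator keys before
# cfg.merge_from_file("models/grit_src/configs/GRiT_B_DenseCap_ObjectDet.yaml").
```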
models/grit_src/grit/custom_solver.py ADDED
@@ -0,0 +1,88 @@
+ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+ # Modified by Jialian Wu from https://github.com/facebookresearch/Detic/blob/main/detic/custom_solver.py
+ import itertools
+ from typing import Any, Callable, Dict, Iterable, List, Set, Type, Union
+ import torch
+
+ from detectron2.config import CfgNode
+
+ from detectron2.solver.build import maybe_add_gradient_clipping
+
+
+ def build_custom_optimizer(cfg: CfgNode, model: torch.nn.Module) -> torch.optim.Optimizer:
+     params: List[Dict[str, Any]] = []
+     memo: Set[torch.nn.parameter.Parameter] = set()
+     optimizer_type = cfg.SOLVER.OPTIMIZER
+
+     for key, value in model.named_parameters(recurse=True):
+         if not value.requires_grad:
+             continue
+         # Avoid duplicating parameters
+         if value in memo:
+             continue
+         memo.add(value)
+         lr = cfg.SOLVER.BASE_LR
+         weight_decay = cfg.SOLVER.WEIGHT_DECAY
+
+         if cfg.SOLVER.VIT_LAYER_DECAY:
+             lr = lr * get_vit_lr_decay_rate(key, cfg.SOLVER.VIT_LAYER_DECAY_RATE, cfg.MODEL.VIT_LAYERS)
+
+         param = {"params": [value], "lr": lr}
+         if optimizer_type != 'ADAMW':
+             param['weight_decay'] = weight_decay
+         params += [param]
+
+     def maybe_add_full_model_gradient_clipping(optim):  # optim: the optimizer class
+         # detectron2 doesn't have full model gradient clipping now
+         clip_norm_val = cfg.SOLVER.CLIP_GRADIENTS.CLIP_VALUE
+         enable = (
+             cfg.SOLVER.CLIP_GRADIENTS.ENABLED
+             and cfg.SOLVER.CLIP_GRADIENTS.CLIP_TYPE == "full_model"
+             and clip_norm_val > 0.0
+         )
+
+         class FullModelGradientClippingOptimizer(optim):
+             def step(self, closure=None):
+                 all_params = itertools.chain(*[x["params"] for x in self.param_groups])
+                 torch.nn.utils.clip_grad_norm_(all_params, clip_norm_val)
+                 super().step(closure=closure)
+
+         return FullModelGradientClippingOptimizer if enable else optim
+
+     if optimizer_type == 'SGD':
+         optimizer = maybe_add_full_model_gradient_clipping(torch.optim.SGD)(
+             params, cfg.SOLVER.BASE_LR, momentum=cfg.SOLVER.MOMENTUM,
+             nesterov=cfg.SOLVER.NESTEROV
+         )
+     elif optimizer_type == 'ADAMW':
+         optimizer = maybe_add_full_model_gradient_clipping(torch.optim.AdamW)(
+             params, cfg.SOLVER.BASE_LR,
+             weight_decay=cfg.SOLVER.WEIGHT_DECAY
+         )
+     else:
+         raise NotImplementedError(f"no optimizer type {optimizer_type}")
+     if not cfg.SOLVER.CLIP_GRADIENTS.CLIP_TYPE == "full_model":
+         optimizer = maybe_add_gradient_clipping(cfg, optimizer)
+     return optimizer
+
+
+ def get_vit_lr_decay_rate(name, lr_decay_rate=1.0, num_layers=12):
+     """
+     Calculate lr decay rate for different ViT blocks.
+     Args:
+         name (string): parameter name.
+         lr_decay_rate (float): base lr decay rate.
+         num_layers (int): number of ViT blocks.
+
+     Returns:
+         lr decay rate for the given parameter.
+     """
+     layer_id = num_layers + 1
+     if name.startswith("backbone"):
+         if ".pos_embed" in name or ".patch_embed" in name:
+             layer_id = 0
+         elif ".blocks." in name and ".residual." not in name:
+             layer_id = int(name[name.find(".blocks.") :].split(".")[2]) + 1
+
+     return lr_decay_rate ** (num_layers + 1 - layer_id)
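`get_vit_lr_decay_rate` gives earlier ViT blocks smaller learning-rate multipliers. A small worked example with hypothetical parameter names that follow the `backbone...blocks.N` pattern the function parses, using the GRiT-B settings (`VIT_LAYER_DECAY_RATE: 0.7`, 12 layers):

```python
# Sketch only: layer-wise LR multipliers from get_vit_lr_decay_rate() above.
from models.grit_src.grit.custom_solver import get_vit_lr_decay_rate

for name in ["backbone.net.patch_embed.proj.weight",   # embedding layers -> layer_id 0
             "backbone.net.blocks.0.attn.qkv.weight",  # first block      -> layer_id 1
             "backbone.net.blocks.11.mlp.fc1.weight",  # last block       -> layer_id 12
             "roi_heads.box_head.fc1.weight"]:         # non-backbone     -> no decay
    print(name, get_vit_lr_decay_rate(name, lr_decay_rate=0.7, num_layers=12))
# Multipliers: 0.7**13 ~ 0.0097, 0.7**12 ~ 0.0138, 0.7**1 = 0.7, 0.7**0 = 1.0
```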
models/grit_src/grit/data/__pycache__/custom_build_augmentation.cpython-38.pyc ADDED
Binary file (1.22 kB).
models/grit_src/grit/data/__pycache__/custom_dataset_mapper.cpython-38.pyc ADDED
Binary file (5.69 kB).