Update README.md
README.md
CHANGED
@@ -33,7 +33,9 @@ license: mit
 ![](https://black.readthedocs.io/en/stable/_static/license.svg)
 ![](https://img.shields.io/badge/code%20style-black-000000.svg)
 
-An example of using this model to run on your video.
+An example of using this model to run on your video.
+Please first clone [Otter](https://github.com/Luodian/Otter) to your local disk.
+Place the following script inside the `Otter` folder so that it has access to `otter/modeling_otter.py`.
 
 ```python
 import mimetypes
@@ -44,7 +46,6 @@ import requests
 import torch
 import transformers
 from PIL import Image
-
 from otter.modeling_otter import OtterForConditionalGeneration
 
 # Disable warnings
@@ -61,7 +62,7 @@ def get_content_type(file_path):
 # ------------------- Image and Video Handling Functions -------------------
 
 
-def extract_frames(video_path, num_frames=
+def extract_frames(video_path, num_frames=16):
     video = cv2.VideoCapture(video_path)
     total_frames = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
     frame_step = total_frames // num_frames
@@ -83,9 +84,7 @@ def get_image(url: str) -> Union[Image.Image, list]:
     if "://" not in url:  # Local file
         content_type = get_content_type(url)
     else:  # Remote URL
-        content_type = requests.head(url, stream=True, verify=False).headers.get(
-            "Content-Type"
-        )
+        content_type = requests.head(url, stream=True, verify=False).headers.get("Content-Type")
 
     if "image" in content_type:
         if "://" not in url:  # Local file
@@ -114,25 +113,13 @@ def get_formatted_prompt(prompt: str) -> str:
     return f"<image>User: {prompt} GPT:<answer>"
 
 
-def get_response(input_data, prompt: str, model=None, image_processor=None) -> str:
+def get_response(input_data, prompt: str, model=None, image_processor=None, tensor_dtype=None) -> str:
     if isinstance(input_data, Image.Image):
-        vision_x = (
-            image_processor.preprocess([input_data], return_tensors="pt")[
-                "pixel_values"
-            ]
-            .unsqueeze(1)
-            .unsqueeze(0)
-        )
+        vision_x = image_processor.preprocess([input_data], return_tensors="pt")["pixel_values"].unsqueeze(1).unsqueeze(0)
     elif isinstance(input_data, list):  # list of video frames
-        vision_x = (
-            image_processor.preprocess(input_data, return_tensors="pt")["pixel_values"]
-            .unsqueeze(1)
-            .unsqueeze(0)
-        )
+        vision_x = image_processor.preprocess(input_data, return_tensors="pt")["pixel_values"].unsqueeze(0).unsqueeze(0)
     else:
-        raise ValueError(
-            "Invalid input data. Expected PIL Image or list of video frames."
-        )
+        raise ValueError("Invalid input data. Expected PIL Image or list of video frames.")
 
     lang_x = model.text_tokenizer(
         [
@@ -142,7 +129,7 @@ def get_response(input_data, prompt: str, model=None, image_processor=None) -> str:
     )
 
     generated_text = model.generate(
-        vision_x=vision_x.to(model.device),
+        vision_x=vision_x.to(model.device, dtype=tensor_dtype),
         lang_x=lang_x["input_ids"].to(model.device),
         attention_mask=lang_x["attention_mask"].to(model.device),
         max_new_tokens=512,
@@ -162,39 +149,37 @@ def get_response(input_data, prompt: str, model=None, image_processor=None) -> str:
     )
     return parsed_output
 
 
-if load_bit == "fp16":
-    precision = {"torch_dtype": torch.float16}
-elif load_bit == "bf16":
-    precision = {"torch_dtype": torch.bfloat16}
-elif load_bit == "fp32":
-    precision = {"torch_dtype": torch.float32}
-model.text_tokenizer.padding_side = "left"
-tokenizer = model.text_tokenizer
-image_processor = transformers.CLIPImageProcessor()
-model.eval()
+# ------------------- Main Function -------------------
+load_bit = "fp16"
+if load_bit == "fp16":
+    precision = {"torch_dtype": torch.float16}
+elif load_bit == "bf16":
+    precision = {"torch_dtype": torch.bfloat16}
+elif load_bit == "fp32":
+    precision = {"torch_dtype": torch.float32}
+
+# This model version is trained on MIMIC-IT DC dataset.
+model = OtterForConditionalGeneration.from_pretrained("luodian/OTTER-9B-DenseCaption", device_map="auto", **precision)
+tensor_dtype = {"fp16": torch.float16, "bf16": torch.bfloat16, "fp32": torch.float32}[load_bit]
+
+model.text_tokenizer.padding_side = "left"
+tokenizer = model.text_tokenizer
+image_processor = transformers.CLIPImageProcessor()
+model.eval()
+
+while True:
+    video_url = "/path/to/your_video.mp4"  # Replace with the path to your video file; it can be any common format.
+
+    frames_list = get_image(video_url)
+
+    prompts_input = input("Enter prompts (comma-separated): ")
+    prompts = [prompt.strip() for prompt in prompts_input.split(",")]
+
+    for prompt in prompts:
+        print(f"\nPrompt: {prompt}")
+        response = get_response(frames_list, prompt, model, image_processor, tensor_dtype)
+        print(f"Response: {response}")
+
+    if prompts_input.lower() == "quit":
+        break
 ```
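For a one-off query instead of the interactive loop, the same helpers added in this diff can be called directly. A minimal sketch, assuming `model`, `image_processor`, and `tensor_dtype` have been set up as in the script above; the video path is a placeholder:

```python
# Minimal sketch of a single query reusing the helpers defined in the script above.
# Assumes model, image_processor, and tensor_dtype were created as shown in the diff;
# the path below is a placeholder for your own file.
frames_list = get_image("/path/to/your_video.mp4")  # a PIL Image for images, a list of frames for videos
response = get_response(frames_list, "What is happening in this video?", model, image_processor, tensor_dtype)
print(response)
```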