Spaces:

SkalskiP
/

florence-2-video

Running on Zero

+import os
+from unittest.mock import patch
+import gradio as gr
+import numpy as np
+import supervision as sv
+import torch
+from tqdm import tqdm
+from transformers import AutoModelForCausalLM, AutoProcessor
+from utils.imports import fixed_get_imports
+from utils.models import (
+    run_captioning,
+    CAPTIONING_TASK,
+    run_caption_to_phrase_grounding
+)
+from utils.video import (
+    create_directory,
+    remove_files_older_than,
+    generate_file_name,
+    calculate_end_frame_index
+)
+# Header shown at the top of the demo page: a title followed by Colab,
+# Roboflow blog and arXiv badge links.
+MARKDOWN = """
+# Florence-2 for Videos 🎬
+<div>
+    <a href="https://colab.research.google.com/github/roboflow-ai/notebooks/blob/main/notebooks/how-to-finetune-florence-2-on-detection-dataset.ipynb">
+        <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Colab" style="display:inline-block;">
+    </a>
+    <a href="https://blog.roboflow.com/florence-2/">
+        <img src="https://raw.githubusercontent.com/roboflow-ai/notebooks/main/assets/badges/roboflow-blogpost.svg" alt="Roboflow" style="display:inline-block;">
+    </a>
+    <a href="https://arxiv.org/abs/2311.06242">
+        <img src="https://img.shields.io/badge/arXiv-2311.06242-b31b1b.svg" alt="arXiv" style="display:inline-block;">
+    </a>
+</div>
+"""
+# Directory where annotated output videos are written (and later cleaned up).
+RESULTS = "results"
+# Hugging Face Hub id of the Florence-2 checkpoint loaded below.
+CHECKPOINT = "microsoft/Florence-2-base-ft"
+# Use the GPU when torch can see one, otherwise fall back to the CPU.
+DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+# get_imports is replaced by fixed_get_imports only while the model and
+# processor are being loaded; the original function is restored afterwards.
+with patch("transformers.dynamic_module_utils.get_imports", fixed_get_imports):
+    MODEL = AutoModelForCausalLM.from_pretrained(
+        CHECKPOINT, trust_remote_code=True).to(DEVICE)
+    PROCESSOR = AutoProcessor.from_pretrained(
+        CHECKPOINT, trust_remote_code=True)
+# Both annotators use ColorLookup.TRACK to choose colors
+# (presumably keyed on the tracker id - confirm in the supervision docs).
+BOUNDING_BOX_ANNOTATOR = sv.BoundingBoxAnnotator(color_lookup=sv.ColorLookup.TRACK)
+LABEL_ANNOTATOR = sv.LabelAnnotator(color_lookup=sv.ColorLookup.TRACK)
+# Single module-level tracker; process_video resets it at the start of each run.
+TRACKER = sv.ByteTrack()
+# creating video results directory
+create_directory(directory_path=RESULTS)
+def annotate_image(
+    input_image: np.ndarray,
+    detections: sv.Detections
+) -> np.ndarray:
+    output_image = input_image.copy()
+    output_image = BOUNDING_BOX_ANNOTATOR.annotate(output_image, detections)
+    output_image = LABEL_ANNOTATOR.annotate(output_image, detections)
+    return output_image
+def process_video(
+    input_video: str,
+    progress=gr.Progress(track_tqdm=True)
+) -> str:
+    # cleanup of old video files
+    remove_files_older_than(RESULTS, 30)
+    video_info = sv.VideoInfo.from_video_path(input_video)
+    total = calculate_end_frame_index(input_video)
+    frame_generator = sv.get_video_frames_generator(
+        source_path=input_video,
+        end=total
+    )
+    result_file_name = generate_file_name(extension="mp4")
+    result_file_path = os.path.join(RESULTS, result_file_name)
+    TRACKER.reset()
+    with sv.VideoSink(result_file_path, video_info=video_info) as sink:
+        for _ in tqdm(range(total), desc="Processing video..."):
+            frame = next(frame_generator)
+            caption = run_captioning(
+                model=MODEL,
+                processor=PROCESSOR,
+                image=frame,
+                device=DEVICE
+            )[CAPTIONING_TASK]
+            detections = run_caption_to_phrase_grounding(
+                model=MODEL,
+                processor=PROCESSOR,
+                caption=caption,
+                image=frame,
+                device=DEVICE
+            )
+            detections = TRACKER.update_with_detections(detections)
+            frame = annotate_image(
+                input_image=frame,
+                detections=detections
+            )
+            sink.write_frame(frame)
+    return result_file_path
+# UI layout: header, one row with the input/output video players, and one
+# row with the submit button that runs process_video.
+with gr.Blocks() as demo:
+    gr.Markdown(MARKDOWN)
+    with gr.Row():
+        input_video_component = gr.Video(
+            label='Input Video'
+        )
+        output_video_component = gr.Video(
+            label='Output Video'
+        )
+    with gr.Row():
+        submit_button_component = gr.Button(
+            value='Submit',
+            scale=1,
+            variant='primary'
+        )
+    # The uploaded video is the only input; the returned file path feeds the
+    # output player.
+    submit_button_component.click(
+        fn=process_video,
+        inputs=[
+            input_video_component,
+        ],
+        outputs=output_video_component
+    )
+# NOTE(review): max_threads=1 presumably keeps the shared module-level
+# TRACKER from being used by concurrent requests - confirm.
+demo.launch(debug=False, show_error=True, max_threads=1)

local-requirements.txt ADDED Viewed

	@@ -0,0 +1,7 @@

+torch
+tqdm
+einops
+timm
+gradio
+transformers
+git+https://github.com/roboflow/supervision.git

requirements.txt ADDED Viewed

	@@ -0,0 +1,6 @@

+tqdm
+einops
+timm
+gradio
+transformers
+git+https://github.com/roboflow/supervision.git

utils/__init__.py ADDED Viewed

File without changes

utils/imports.py ADDED Viewed

	@@ -0,0 +1,13 @@

+import os
+from typing import Union
+from transformers.dynamic_module_utils import get_imports
+def fixed_get_imports(filename: Union[str, os.PathLike]) -> list[str]:
+    """Work around for https://huggingface.co/microsoft/phi-1_5/discussions/72."""
+    if not str(filename).endswith("/modeling_florence2.py"):
+        return get_imports(filename)
+    imports = get_imports(filename)
+    imports.remove("flash_attn")
+    return imports

utils/models.py ADDED Viewed

	@@ -0,0 +1,48 @@

+import torch
+import numpy as np
+import supervision as sv
+from PIL import Image
+# Task prompt tokens sent to the model; the same strings are passed as
+# `task` when post-processing the generated text.
+CAPTIONING_TASK = "<DETAILED_CAPTION>"
+CAPTION_TO_PHRASE_GROUNDING_TASK = "<CAPTION_TO_PHRASE_GROUNDING>"
+def run_captioning(model, processor, image: np.ndarray, device: torch.device) -> str:
+    image = Image.fromarray(image).convert("RGB")
+    text = "<DETAILED_CAPTION>"
+    inputs = processor(text=text, images=image, return_tensors="pt").to(device)
+    generated_ids = model.generate(
+        input_ids=inputs["input_ids"],
+        pixel_values=inputs["pixel_values"],
+        max_new_tokens=1024,
+        num_beams=3
+    )
+    generated_text = processor.batch_decode(generated_ids, skip_special_tokens=False)[0]
+    return processor.post_process_generation(
+        generated_text, task=CAPTIONING_TASK, image_size=image.size)
+def run_caption_to_phrase_grounding(
+    model,
+    processor,
+    caption: str,
+    image: np.ndarray,
+    device: torch.device
+) -> sv.Detections:
+    image = Image.fromarray(image).convert("RGB")
+    text = f"{CAPTION_TO_PHRASE_GROUNDING_TASK} {caption}"
+    inputs = processor(text=text, images=image, return_tensors="pt").to(device)
+    generated_ids = model.generate(
+        input_ids=inputs["input_ids"],
+        pixel_values=inputs["pixel_values"],
+        max_new_tokens=1024,
+        num_beams=3
+    )
+    generated_text = processor.batch_decode(generated_ids, skip_special_tokens=False)[0]
+    response = processor.post_process_generation(
+        generated_text, task=CAPTION_TO_PHRASE_GROUNDING_TASK, image_size=image.size)
+    return sv.Detections.from_lmm(sv.LMM.FLORENCE_2, response, resolution_wh=image.size)

utils/video.py ADDED Viewed

	@@ -0,0 +1,60 @@

+import os
+import datetime
+import uuid
+from typing import List
+import supervision as sv
+# Upper bound, in seconds, on how much of an input video is processed;
+# calculate_end_frame_index multiplies it by the video's fps.
+MAX_VIDEO_LENGTH_SEC = 1
+# MAX_VIDEO_LENGTH_SEC = 2
+def generate_file_name(extension="mp4"):
+    current_datetime = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
+    unique_id = uuid.uuid4()
+    return f"{current_datetime}_{unique_id}.{extension}"
+def list_files_older_than(directory: str, diff_minutes: int) -> List[str]:
+    diff_seconds = diff_minutes * 60
+    now = datetime.datetime.now()
+    older_files: List[str] = []
+    for filename in os.listdir(directory):
+        file_path = os.path.join(directory, filename)
+        if os.path.isfile(file_path):
+            file_mod_time = os.path.getmtime(file_path)
+            file_mod_datetime = datetime.datetime.fromtimestamp(file_mod_time)
+            time_diff = now - file_mod_datetime
+            if time_diff.total_seconds() > diff_seconds:
+                older_files.append(file_path)
+    return older_files
+def remove_files_older_than(directory: str, diff_minutes: int) -> None:
+    older_files = list_files_older_than(directory, diff_minutes)
+    file_count = len(older_files)
+    for file_path in older_files:
+        os.remove(file_path)
+    now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+    print(
+        f"[{now}] Removed {file_count} files older than {diff_minutes} minutes from "
+        f"'{directory}' directory."
+    )
+def calculate_end_frame_index(source_video_path: str) -> int:
+    video_info = sv.VideoInfo.from_video_path(source_video_path)
+    return min(
+        video_info.total_frames,
+        video_info.fps * MAX_VIDEO_LENGTH_SEC
+    )
+def create_directory(directory_path: str) -> None:
+    if not os.path.exists(directory_path):
+        os.makedirs(directory_path)