SkalskiP committed
Commit
4ae7d54
1 Parent(s): 7c048e0

initial code version

Files changed (9)
  1. .gitignore +3 -0
  2. README.md +2 -2
  3. app.py +136 -0
  4. local-requirements.txt +7 -0
  5. requirements.txt +6 -0
  6. utils/__init__.py +0 -0
  7. utils/imports.py +13 -0
  8. utils/models.py +48 -0
  9. utils/video.py +60 -0
.gitignore ADDED
@@ -0,0 +1,3 @@
+venv
+results
+.idea
README.md CHANGED
@@ -1,6 +1,6 @@
 ---
-title: Florence 2 Video
-emoji: 💻
+title: Florence-2 for Videos
+emoji: 🎬
 colorFrom: green
 colorTo: pink
 sdk: gradio
app.py ADDED
@@ -0,0 +1,136 @@
+import os
+from unittest.mock import patch
+
+import gradio as gr
+import numpy as np
+import supervision as sv
+import torch
+from tqdm import tqdm
+from transformers import AutoModelForCausalLM, AutoProcessor
+
+from utils.imports import fixed_get_imports
+from utils.models import (
+    run_captioning,
+    CAPTIONING_TASK,
+    run_caption_to_phrase_grounding
+)
+from utils.video import (
+    create_directory,
+    remove_files_older_than,
+    generate_file_name,
+    calculate_end_frame_index
+)
+
+MARKDOWN = """
+# Florence-2 for Videos 🎬
+
+<div>
+<a href="https://colab.research.google.com/github/roboflow-ai/notebooks/blob/main/notebooks/how-to-finetune-florence-2-on-detection-dataset.ipynb">
+<img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Colab" style="display:inline-block;">
+</a>
+<a href="https://blog.roboflow.com/florence-2/">
+<img src="https://raw.githubusercontent.com/roboflow-ai/notebooks/main/assets/badges/roboflow-blogpost.svg" alt="Roboflow" style="display:inline-block;">
+</a>
+<a href="https://arxiv.org/abs/2311.06242">
+<img src="https://img.shields.io/badge/arXiv-2311.06242-b31b1b.svg" alt="arXiv" style="display:inline-block;">
+</a>
+</div>
+"""
+
+RESULTS = "results"
+
+CHECKPOINT = "microsoft/Florence-2-base-ft"
+DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+
+with patch("transformers.dynamic_module_utils.get_imports", fixed_get_imports):
+    MODEL = AutoModelForCausalLM.from_pretrained(
+        CHECKPOINT, trust_remote_code=True).to(DEVICE)
+    PROCESSOR = AutoProcessor.from_pretrained(
+        CHECKPOINT, trust_remote_code=True)
+
+
+BOUNDING_BOX_ANNOTATOR = sv.BoundingBoxAnnotator(color_lookup=sv.ColorLookup.TRACK)
+LABEL_ANNOTATOR = sv.LabelAnnotator(color_lookup=sv.ColorLookup.TRACK)
+TRACKER = sv.ByteTrack()
+
+# creating video results directory
+create_directory(directory_path=RESULTS)
+
+
+def annotate_image(
+    input_image: np.ndarray,
+    detections: sv.Detections
+) -> np.ndarray:
+    output_image = input_image.copy()
+    output_image = BOUNDING_BOX_ANNOTATOR.annotate(output_image, detections)
+    output_image = LABEL_ANNOTATOR.annotate(output_image, detections)
+    return output_image
+
+
+def process_video(
+    input_video: str,
+    progress=gr.Progress(track_tqdm=True)
+) -> str:
+    # cleanup of old video files
+    remove_files_older_than(RESULTS, 30)
+
+    video_info = sv.VideoInfo.from_video_path(input_video)
+    total = calculate_end_frame_index(input_video)
+    frame_generator = sv.get_video_frames_generator(
+        source_path=input_video,
+        end=total
+    )
+    result_file_name = generate_file_name(extension="mp4")
+    result_file_path = os.path.join(RESULTS, result_file_name)
+    TRACKER.reset()
+    with sv.VideoSink(result_file_path, video_info=video_info) as sink:
+        for _ in tqdm(range(total), desc="Processing video..."):
+            frame = next(frame_generator)
+            caption = run_captioning(
+                model=MODEL,
+                processor=PROCESSOR,
+                image=frame,
+                device=DEVICE
+            )[CAPTIONING_TASK]
+            detections = run_caption_to_phrase_grounding(
+                model=MODEL,
+                processor=PROCESSOR,
+                caption=caption,
+                image=frame,
+                device=DEVICE
+            )
+            detections = TRACKER.update_with_detections(detections)
+            frame = annotate_image(
+                input_image=frame,
+                detections=detections
+            )
+            sink.write_frame(frame)
+    return result_file_path
+
+
+with gr.Blocks() as demo:
+    gr.Markdown(MARKDOWN)
+    with gr.Row():
+        input_video_component = gr.Video(
+            label='Input Video'
+        )
+        output_video_component = gr.Video(
+            label='Output Video'
+        )
+    with gr.Row():
+        submit_button_component = gr.Button(
+            value='Submit',
+            scale=1,
+            variant='primary'
+        )
+
+    submit_button_component.click(
+        fn=process_video,
+        inputs=[
+            input_video_component,
+        ],
+        outputs=output_video_component
+    )
+
+demo.launch(debug=False, show_error=True, max_threads=1)
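
The per-frame loop in process_video keys both annotators to ByteTrack IDs (color_lookup=sv.ColorLookup.TRACK), so each grounded phrase keeps the same color for as long as its track survives. Below is a minimal sketch of that tracking-plus-annotation pattern on one synthetic frame, with a hypothetical hand-written detection standing in for Florence-2 output:

import numpy as np
import supervision as sv

tracker = sv.ByteTrack()
box_annotator = sv.BoundingBoxAnnotator(color_lookup=sv.ColorLookup.TRACK)

frame = np.zeros((480, 640, 3), dtype=np.uint8)           # placeholder black frame
detections = sv.Detections(                                # hypothetical detection
    xyxy=np.array([[50.0, 60.0, 200.0, 220.0]]),
    confidence=np.array([0.9]),
    class_id=np.array([0]),
)
detections = tracker.update_with_detections(detections)   # assigns tracker_id
annotated = box_annotator.annotate(frame.copy(), detections)

Calling TRACKER.reset() at the start of each request, as app.py does, keeps track IDs from leaking between videos.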
local-requirements.txt ADDED
@@ -0,0 +1,7 @@
+torch
+tqdm
+einops
+timm
+gradio
+transformers
+git+https://github.com/roboflow/supervision.git
requirements.txt ADDED
@@ -0,0 +1,6 @@
+tqdm
+einops
+timm
+gradio
+transformers
+git+https://github.com/roboflow/supervision.git
utils/__init__.py ADDED
File without changes
utils/imports.py ADDED
@@ -0,0 +1,13 @@
+import os
+
+from typing import Union
+from transformers.dynamic_module_utils import get_imports
+
+
+def fixed_get_imports(filename: Union[str, os.PathLike]) -> list[str]:
+    """Work around for https://huggingface.co/microsoft/phi-1_5/discussions/72."""
+    if not str(filename).endswith("/modeling_florence2.py"):
+        return get_imports(filename)
+    imports = get_imports(filename)
+    imports.remove("flash_attn")
+    return imports
utils/models.py ADDED
@@ -0,0 +1,48 @@
+import torch
+
+import numpy as np
+import supervision as sv
+from PIL import Image
+
+
+CAPTIONING_TASK = "<DETAILED_CAPTION>"
+CAPTION_TO_PHRASE_GROUNDING_TASK = "<CAPTION_TO_PHRASE_GROUNDING>"
+
+
+def run_captioning(model, processor, image: np.ndarray, device: torch.device) -> str:
+    image = Image.fromarray(image).convert("RGB")
+    text = "<DETAILED_CAPTION>"
+
+    inputs = processor(text=text, images=image, return_tensors="pt").to(device)
+    generated_ids = model.generate(
+        input_ids=inputs["input_ids"],
+        pixel_values=inputs["pixel_values"],
+        max_new_tokens=1024,
+        num_beams=3
+    )
+    generated_text = processor.batch_decode(generated_ids, skip_special_tokens=False)[0]
+    return processor.post_process_generation(
+        generated_text, task=CAPTIONING_TASK, image_size=image.size)
+
+
+def run_caption_to_phrase_grounding(
+    model,
+    processor,
+    caption: str,
+    image: np.ndarray,
+    device: torch.device
+) -> sv.Detections:
+    image = Image.fromarray(image).convert("RGB")
+    text = f"{CAPTION_TO_PHRASE_GROUNDING_TASK} {caption}"
+
+    inputs = processor(text=text, images=image, return_tensors="pt").to(device)
+    generated_ids = model.generate(
+        input_ids=inputs["input_ids"],
+        pixel_values=inputs["pixel_values"],
+        max_new_tokens=1024,
+        num_beams=3
+    )
+    generated_text = processor.batch_decode(generated_ids, skip_special_tokens=False)[0]
+    response = processor.post_process_generation(
+        generated_text, task=CAPTION_TO_PHRASE_GROUNDING_TASK, image_size=image.size)
+    return sv.Detections.from_lmm(sv.LMM.FLORENCE_2, response, resolution_wh=image.size)
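
utils/models.py chains two Florence-2 prompts: the <DETAILED_CAPTION> result is reused as the text half of a <CAPTION_TO_PHRASE_GROUNDING> prompt, and the grounded phrases come back as supervision Detections via Detections.from_lmm. A minimal single-image sketch, assuming the model is loaded the same way as in app.py and that frame.jpg is a hypothetical local image:

from unittest.mock import patch

import numpy as np
import torch
from PIL import Image
from transformers import AutoModelForCausalLM, AutoProcessor

from utils.imports import fixed_get_imports
from utils.models import (
    run_captioning, run_caption_to_phrase_grounding, CAPTIONING_TASK)

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# patch the import scanner so the optional flash_attn dependency is not required
with patch("transformers.dynamic_module_utils.get_imports", fixed_get_imports):
    MODEL = AutoModelForCausalLM.from_pretrained(
        "microsoft/Florence-2-base-ft", trust_remote_code=True).to(DEVICE)
    PROCESSOR = AutoProcessor.from_pretrained(
        "microsoft/Florence-2-base-ft", trust_remote_code=True)

image = np.array(Image.open("frame.jpg").convert("RGB"))  # hypothetical image path
caption = run_captioning(
    model=MODEL, processor=PROCESSOR, image=image, device=DEVICE)[CAPTIONING_TASK]
detections = run_caption_to_phrase_grounding(
    model=MODEL, processor=PROCESSOR, caption=caption, image=image, device=DEVICE)
print(caption)
print(len(detections), "grounded phrases")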
utils/video.py ADDED
@@ -0,0 +1,60 @@
+import os
+import datetime
+import uuid
+from typing import List
+
+import supervision as sv
+
+
+MAX_VIDEO_LENGTH_SEC = 1
+# MAX_VIDEO_LENGTH_SEC = 2
+
+
+def generate_file_name(extension="mp4"):
+    current_datetime = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
+    unique_id = uuid.uuid4()
+    return f"{current_datetime}_{unique_id}.{extension}"
+
+
+def list_files_older_than(directory: str, diff_minutes: int) -> List[str]:
+    diff_seconds = diff_minutes * 60
+    now = datetime.datetime.now()
+    older_files: List[str] = []
+
+    for filename in os.listdir(directory):
+        file_path = os.path.join(directory, filename)
+        if os.path.isfile(file_path):
+            file_mod_time = os.path.getmtime(file_path)
+            file_mod_datetime = datetime.datetime.fromtimestamp(file_mod_time)
+            time_diff = now - file_mod_datetime
+            if time_diff.total_seconds() > diff_seconds:
+                older_files.append(file_path)
+
+    return older_files
+
+
+def remove_files_older_than(directory: str, diff_minutes: int) -> None:
+    older_files = list_files_older_than(directory, diff_minutes)
+    file_count = len(older_files)
+
+    for file_path in older_files:
+        os.remove(file_path)
+
+    now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+    print(
+        f"[{now}] Removed {file_count} files older than {diff_minutes} minutes from "
+        f"'{directory}' directory."
+    )
+
+
+def calculate_end_frame_index(source_video_path: str) -> int:
+    video_info = sv.VideoInfo.from_video_path(source_video_path)
+    return min(
+        video_info.total_frames,
+        video_info.fps * MAX_VIDEO_LENGTH_SEC
+    )
+
+
+def create_directory(directory_path: str) -> None:
+    if not os.path.exists(directory_path):
+        os.makedirs(directory_path)
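
utils/video.py caps how much of an upload gets processed (MAX_VIDEO_LENGTH_SEC, so the hosted demo annotates roughly the first second of a clip) and keeps the results directory from growing without bound. A minimal sketch of how app.py uses these helpers, assuming a hypothetical input.mp4 in the working directory:

import os

from utils.video import (
    create_directory,
    remove_files_older_than,
    generate_file_name,
    calculate_end_frame_index
)

create_directory("results")                          # make sure the output directory exists
remove_files_older_than("results", 30)               # drop results older than 30 minutes
end_frame = calculate_end_frame_index("input.mp4")   # hypothetical local video
output_path = os.path.join("results", generate_file_name(extension="mp4"))
print(end_frame, output_path)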