prompt + transformers adjustments
app.py CHANGED
```diff
@@ -5,7 +5,7 @@ import tempfile
 import torch
 import spaces
 from pathlib import Path
-from transformers import AutoProcessor,
+from transformers import AutoProcessor, AutoModelForImageTextToText
 import subprocess
 import logging
 
```
```diff
@@ -49,7 +49,7 @@ class VideoHighlightDetector:
 
         # Initialize model and processor
         self.processor = AutoProcessor.from_pretrained(model_path)
-        self.model =
+        self.model = AutoModelForImageTextToText.from_pretrained(
             model_path,
             torch_dtype=torch.bfloat16,
             # _attn_implementation="flash_attention_2"
```
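The two hunks above swap the class used to load the model to `AutoModelForImageTextToText` (the diff viewer truncates the removed lines, so the exact previous class isn't visible). A minimal sketch of the initialization after this commit, assuming only what the visible `+` lines show; the checkpoint name and the CUDA placement are illustrative assumptions, not part of the diff:

```python
import torch
from transformers import AutoProcessor, AutoModelForImageTextToText

# Hypothetical checkpoint for illustration; the Space's real model_path
# is not visible in this diff.
model_path = "HuggingFaceTB/SmolVLM2-2.2B-Instruct"

processor = AutoProcessor.from_pretrained(model_path)
model = AutoModelForImageTextToText.from_pretrained(
    model_path,
    torch_dtype=torch.bfloat16,  # half-precision weights, as in the diff
    # _attn_implementation="flash_attention_2"  # left commented out, as in the diff
).to("cuda")  # assumption: the Space runs on an A100, so weights go to the GPU
```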
```diff
@@ -88,15 +88,11 @@ class VideoHighlightDetector:
         messages = [
             {
                 "role": "system",
-                "content": [{"type": "text", "text": "You are a
+                "content": [{"type": "text", "text": "You are a highlight editor. List archetypal dramatic moments that would make compelling highlights if they appear in the video. Each moment should be specific enough to be recognizable but generic enough to potentially exist in any video of this type."}]
             },
             {
                 "role": "user",
-                "content": [{"type": "text", "text": f"""Here is a description of a video:
-
-{video_description}
-
-Based on this description, list which rare segments should be included in a best of the best highlight."""}]
+                "content": [{"type": "text", "text": f"""Here is a description of a video:\n\n{video_description}\n\nList potential highlight moments to look for in this video:"""}]
             }
         ]
 
```
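This hunk rewrites the prompt that turns a video description into a list of candidate highlights, but the message list keeps the standard chat-template shape. A sketch of how it would typically be run through the processor and model, continuing the loading sketch above; the generation budget is an assumption, not shown in the diff:

```python
# Render the chat template, tokenize, and generate a highlight list.
inputs = processor.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device)

outputs = model.generate(**inputs, max_new_tokens=256)  # assumed token budget
# Decode only the newly generated tokens, not the echoed prompt.
new_tokens = outputs[:, inputs["input_ids"].shape[-1]:]
highlight_list = processor.batch_decode(new_tokens, skip_special_tokens=True)[0]
```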
```diff
@@ -116,18 +112,19 @@ class VideoHighlightDetector:
     def process_segment(self, video_path: str, highlight_types: str) -> bool:
         """Process a video segment and determine if it contains highlights."""
         messages = [
+            {
+                "role": "system",
+                "content": [{"type": "text", "text": "You are a video highlight analyzer. Your role is to identify moments that have high dramatic value, focusing on displays of skill, emotion, personality, or tension. Compare video segments against provided example highlights to find moments with similar emotional impact and visual interest, even if the specific actions differ."}]
+            },
             {
                 "role": "user",
                 "content": [
                     {"type": "video", "path": video_path},
-                    {"type": "text", "text": f"""{highlight_types}
-
-
-Do you see any of those elements in the video? answer yes if you do and answer no if you don't."""}
-                ]
+                    {"type": "text", "text": f"""Given these highlight examples:\n{highlight_types}\n\nDoes this video contain a moment that matches the core action of one of the highlights? Answer with:\n'yes' or 'no'\nIf yes, justify it"""}]
             }
         ]
 
+
         print(messages)
 
 
```
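`process_segment` is typed to return a bool, so the model's free-form reply still has to be reduced to a decision; note the `{"type": "video", "path": video_path}` entry, which the processor's chat template expands into sampled frames in recent transformers releases. A hedged sketch under those assumptions; the decoding details and the prefix check are illustrative, not shown in the diff:

```python
# Run the video + text messages and reduce the reply to a bool.
inputs = processor.apply_chat_template(
    messages,  # includes {"type": "video", "path": video_path}
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device, dtype=torch.bfloat16)  # cast float inputs to match the weights

outputs = model.generate(**inputs, max_new_tokens=64)  # short yes/no answer
new_tokens = outputs[:, inputs["input_ids"].shape[-1]:]
reply = processor.batch_decode(new_tokens, skip_special_tokens=True)[0].lower()

# The new prompt asks for 'yes' or 'no' plus an optional justification,
# so one plausible reading is a prefix check on the reply.
contains_highlight = reply.strip().startswith("yes")
```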