prompt + transformers adjustments
app.py CHANGED
```diff
@@ -5,7 +5,7 @@ import tempfile
 import torch
 import spaces
 from pathlib import Path
-from transformers import AutoProcessor,
+from transformers import AutoProcessor, AutoModelForImageTextToText
 import subprocess
 import logging
 
```
```diff
@@ -49,7 +49,7 @@ class VideoHighlightDetector:
 
         # Initialize model and processor
         self.processor = AutoProcessor.from_pretrained(model_path)
-        self.model =
+        self.model = AutoModelForImageTextToText.from_pretrained(
             model_path,
             torch_dtype=torch.bfloat16,
             # _attn_implementation="flash_attention_2"
```
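The two hunks above swap the class used to load the model to `AutoModelForImageTextToText` (the diff viewer truncates the removed lines, so the exact previous class isn't visible). A minimal sketch of the initialization after this commit, assuming only what the visible `+` lines show; the checkpoint name and the CUDA placement are illustrative assumptions, not part of the diff:

```python
import torch
from transformers import AutoProcessor, AutoModelForImageTextToText

# Hypothetical checkpoint for illustration; the Space's real model_path
# is not visible in this diff.
model_path = "HuggingFaceTB/SmolVLM2-2.2B-Instruct"

processor = AutoProcessor.from_pretrained(model_path)
model = AutoModelForImageTextToText.from_pretrained(
    model_path,
    torch_dtype=torch.bfloat16,  # half-precision weights, as in the diff
    # _attn_implementation="flash_attention_2"  # left commented out, as in the diff
).to("cuda")  # assumption: the Space runs on an A100, so weights go to the GPU
```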
```diff
@@ -88,15 +88,11 @@ class VideoHighlightDetector:
         messages = [
             {
                 "role": "system",
-                "content": [{"type": "text", "text": "You are a
+                "content": [{"type": "text", "text": "You are a highlight editor. List archetypal dramatic moments that would make compelling highlights if they appear in the video. Each moment should be specific enough to be recognizable but generic enough to potentially exist in any video of this type."}]
             },
             {
                 "role": "user",
-                "content": [{"type": "text", "text": f"""Here is a description of a video:
-
-{video_description}
-
-Based on this description, list which rare segments should be included in a best of the best highlight."""}]
+                "content": [{"type": "text", "text": f"""Here is a description of a video:\n\n{video_description}\n\nList potential highlight moments to look for in this video:"""}]
             }
         ]
 
```
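This hunk rewrites the prompt that turns a video description into a list of candidate highlights, but the message list keeps the standard chat-template shape. A sketch of how it would typically be run through the processor and model, continuing the loading sketch above; the generation budget is an assumption, not shown in the diff:

```python
# Render the chat template, tokenize, and generate a highlight list.
inputs = processor.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device)

outputs = model.generate(**inputs, max_new_tokens=256)  # assumed token budget
# Decode only the newly generated tokens, not the echoed prompt.
new_tokens = outputs[:, inputs["input_ids"].shape[-1]:]
highlight_list = processor.batch_decode(new_tokens, skip_special_tokens=True)[0]
```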
```diff
@@ -116,18 +112,19 @@ class VideoHighlightDetector:
     def process_segment(self, video_path: str, highlight_types: str) -> bool:
         """Process a video segment and determine if it contains highlights."""
         messages = [
+            {
+                "role": "system",
+                "content": [{"type": "text", "text": "You are a video highlight analyzer. Your role is to identify moments that have high dramatic value, focusing on displays of skill, emotion, personality, or tension. Compare video segments against provided example highlights to find moments with similar emotional impact and visual interest, even if the specific actions differ."}]
+            },
             {
                 "role": "user",
                 "content": [
                     {"type": "video", "path": video_path},
-                    {"type": "text", "text": f"""{highlight_types}
-
-
-Do you see any of those elements in the video? answer yes if you do and answer no if you don't."""}
-                ]
+                    {"type": "text", "text": f"""Given these highlight examples:\n{highlight_types}\n\nDoes this video contain a moment that matches the core action of one of the highlights? Answer with:\n'yes' or 'no'\nIf yes, justify it"""}]
             }
         ]
 
+
         print(messages)
 
 
```
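`process_segment` is typed to return a bool, so the model's free-form reply still has to be reduced to a decision; note the `{"type": "video", "path": video_path}` entry, which the processor's chat template expands into sampled frames in recent transformers releases. A hedged sketch under those assumptions; the decoding details and the prefix check are illustrative, not shown in the diff:

```python
# Run the video + text messages and reduce the reply to a bool.
inputs = processor.apply_chat_template(
    messages,  # includes {"type": "video", "path": video_path}
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device, dtype=torch.bfloat16)  # cast float inputs to match the weights

outputs = model.generate(**inputs, max_new_tokens=64)  # short yes/no answer
new_tokens = outputs[:, inputs["input_ids"].shape[-1]:]
reply = processor.batch_decode(new_tokens, skip_special_tokens=True)[0].lower()

# The new prompt asks for 'yes' or 'no' plus an optional justification,
# so one plausible reading is a prefix check on the reply.
contains_highlight = reply.strip().startswith("yes")
```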