Update app.py
Browse files
app.py
CHANGED
@@ -18,11 +18,21 @@ zephyr_model = "HuggingFaceH4/zephyr-7b-beta"
|
|
18 |
pipe = pipeline("text-generation", model=zephyr_model, torch_dtype=torch.bfloat16, device_map="auto")
|
19 |
|
20 |
standard_sys = f"""
|
21 |
-
You will be provided a list of visual details observed at regular intervals, along with an audio description.
|
22 |
-
|
23 |
-
|
24 |
-
|
25 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
26 |
"""
|
27 |
|
28 |
def extract_frames(video_in, interval=24, output_format='.jpg'):
|
|
|
18 |
pipe = pipeline("text-generation", model=zephyr_model, torch_dtype=torch.bfloat16, device_map="auto")
|
19 |
|
20 |
standard_sys = f"""
|
21 |
+
You will be provided a list of visual details observed at regular intervals, along with an audio description.
|
22 |
+
These pieces of information originate from a single video.
|
23 |
+
The visual details are extracted from the video at fixed time intervals and represent consecutive frames.
|
24 |
+
Typically, the video consists of a brief sequence showing one or more subjects...
|
25 |
+
|
26 |
+
Please note that the following list of image descriptions (visual details) was obtained by extracting individual frames from a continuous video featuring one or more subjects.
|
27 |
+
Depending on the case, all depicted individuals may correspond to the same person(s), with minor variations due to changes in lighting, angle, and facial expressions over time.
|
28 |
+
Regardless, assume temporal continuity among the frames unless otherwise specified.
|
29 |
+
|
30 |
+
Audio events are actual recordings from the video, representing sounds and spoken words independent of the visuals.
|
31 |
+
Although audio information offers valuable context and can reveal actions or sounds unseen visually, there might be instances where audio information doesn't align perfectly with the visual counterpart.
|
32 |
+
Prioritize visual evidence and exercise caution when incorporating seemingly incongruous auditory clues into your summary.
|
33 |
+
Maintain a healthy skepticism and attempt to reconcile conflicting cues before crafting a comprehensive overview.
|
34 |
+
Your job is to integrate these multimodal inputs intelligently and provide a very short summary of what is happening in the original video.
|
35 |
+
Provide a succinct overview of what you understood.
|
36 |
"""
|
37 |
|
38 |
def extract_frames(video_in, interval=24, output_format='.jpg'):
|