Spaces:
Running
on
A100
Running
on
A100
Add OpenAI integration and enhance prompt functionality for video generation
Browse files- app.py +54 -0
- assets/system_prompt_i2v.txt +39 -0
- assets/system_prompt_t2v.txt +38 -0
- requirements.txt +2 -0
app.py
CHANGED
@@ -1,4 +1,5 @@
|
|
1 |
import gradio as gr
|
|
|
2 |
import torch
|
3 |
from huggingface_hub import snapshot_download
|
4 |
|
@@ -18,9 +19,19 @@ from PIL import Image
|
|
18 |
import tempfile
|
19 |
import os
|
20 |
import gc
|
|
|
21 |
|
22 |
# Load Hugging Face token if needed
|
23 |
hf_token = os.getenv("HF_TOKEN")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
24 |
|
25 |
# Set model download directory within Hugging Face Spaces
|
26 |
model_path = "asset"
|
@@ -90,6 +101,30 @@ def load_image_to_tensor_with_resize(image_path, target_height=512, target_width
|
|
90 |
return frame_tensor.unsqueeze(0).unsqueeze(2)
|
91 |
|
92 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
93 |
# Preset options for resolution and frame configuration
|
94 |
preset_options = [
|
95 |
{"label": "1216x704, 41 frames", "width": 1216, "height": 704, "num_frames": 41},
|
@@ -169,6 +204,7 @@ pipeline = XoraVideoPipeline(
|
|
169 |
|
170 |
def generate_video_from_text(
|
171 |
prompt="",
|
|
|
172 |
negative_prompt="",
|
173 |
frame_rate=25,
|
174 |
seed=171198,
|
@@ -185,6 +221,8 @@ def generate_video_from_text(
|
|
185 |
duration=5,
|
186 |
)
|
187 |
|
|
|
|
|
188 |
sample = {
|
189 |
"prompt": prompt,
|
190 |
"prompt_attention_mask": None,
|
@@ -247,6 +285,7 @@ def generate_video_from_text(
|
|
247 |
def generate_video_from_image(
|
248 |
image_path,
|
249 |
prompt="",
|
|
|
250 |
negative_prompt="",
|
251 |
frame_rate=25,
|
252 |
seed=171198,
|
@@ -275,6 +314,8 @@ def generate_video_from_image(
|
|
275 |
load_image_to_tensor_with_resize(image_path, height, width).to(device).detach()
|
276 |
)
|
277 |
|
|
|
|
|
278 |
sample = {
|
279 |
"prompt": prompt,
|
280 |
"prompt_attention_mask": None,
|
@@ -446,6 +487,12 @@ with gr.Blocks(theme=gr.themes.Soft()) as iface:
|
|
446 |
value="A woman with long brown hair and light skin smiles at another woman with long blonde hair. The woman with brown hair wears a black jacket and has a small, barely noticeable mole on her right cheek. The camera angle is a close-up, focused on the woman with brown hair's face. The lighting is warm and natural, likely from the setting sun, casting a soft glow on the scene. The scene appears to be real-life footage.",
|
447 |
lines=5,
|
448 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
449 |
txt2vid_negative_prompt = gr.Textbox(
|
450 |
label="Step 2: Enter Negative Prompt",
|
451 |
placeholder="Describe what you don't want in the video...",
|
@@ -515,6 +562,11 @@ with gr.Blocks(theme=gr.themes.Soft()) as iface:
|
|
515 |
value="A woman with long brown hair and light skin smiles at another woman with long blonde hair. The woman with brown hair wears a black jacket and has a small, barely noticeable mole on her right cheek. The camera angle is a close-up, focused on the woman with brown hair's face. The lighting is warm and natural, likely from the setting sun, casting a soft glow on the scene. The scene appears to be real-life footage.",
|
516 |
lines=5,
|
517 |
)
|
|
|
|
|
|
|
|
|
|
|
518 |
img2vid_negative_prompt = gr.Textbox(
|
519 |
label="Step 3: Enter Negative Prompt",
|
520 |
placeholder="Describe what you don't want in the video...",
|
@@ -584,6 +636,7 @@ with gr.Blocks(theme=gr.themes.Soft()) as iface:
|
|
584 |
fn=generate_video_from_text,
|
585 |
inputs=[
|
586 |
txt2vid_prompt,
|
|
|
587 |
txt2vid_negative_prompt,
|
588 |
txt2vid_frame_rate,
|
589 |
*txt2vid_advanced,
|
@@ -603,6 +656,7 @@ with gr.Blocks(theme=gr.themes.Soft()) as iface:
|
|
603 |
inputs=[
|
604 |
img2vid_image,
|
605 |
img2vid_prompt,
|
|
|
606 |
img2vid_negative_prompt,
|
607 |
img2vid_frame_rate,
|
608 |
*img2vid_advanced,
|
|
|
1 |
import gradio as gr
|
2 |
+
from gradio_toggle import Toggle
|
3 |
import torch
|
4 |
from huggingface_hub import snapshot_download
|
5 |
|
|
|
19 |
import tempfile
|
20 |
import os
|
21 |
import gc
|
22 |
+
from openai import OpenAI
|
23 |
|
24 |
# Load Hugging Face token if needed
|
25 |
hf_token = os.getenv("HF_TOKEN")
|
26 |
+
openai_api_key = os.getenv("OPENAI_API_KEY")
|
27 |
+
client = OpenAI(api_key=openai_api_key)
|
28 |
+
system_prompt_t2v_path = "assets/system_prompt_t2v.txt"
|
29 |
+
system_prompt_i2v_path = "assets/system_prompt_i2v.txt"
|
30 |
+
with open(system_prompt_t2v_path, "r") as f:
|
31 |
+
system_prompt_t2v = f.read()
|
32 |
+
|
33 |
+
with open(system_prompt_i2v_path, "r") as f:
|
34 |
+
system_prompt_i2v = f.read()
|
35 |
|
36 |
# Set model download directory within Hugging Face Spaces
|
37 |
model_path = "asset"
|
|
|
101 |
return frame_tensor.unsqueeze(0).unsqueeze(2)
|
102 |
|
103 |
|
104 |
+
def enhance_prompt_if_enabled(prompt, enhance_toggle, type="t2v"):
    """Optionally rewrite ``prompt`` through the OpenAI chat model.

    Args:
        prompt: The user's raw prompt text.
        enhance_toggle: When falsy, ``prompt`` is returned untouched and no
            request is made.
        type: ``"t2v"`` selects the text-to-video system prompt; any other
            value selects the image-to-video system prompt. The name shadows
            the builtin, but it is kept because callers pass it by keyword.

    Returns:
        The enhanced prompt, or the original ``prompt`` when enhancement is
        disabled, the request fails, or the model returns no usable text.
    """
    if not enhance_toggle:
        print("Enhance toggle is off, Prompt: ", prompt)
        return prompt

    system_prompt = system_prompt_t2v if type == "t2v" else system_prompt_i2v
    # NOTE(review): the i2v system prompt says a first-frame image will be
    # supplied, but only the text prompt is sent here -- confirm intent.
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": prompt},
    ]

    try:
        # NOTE(review): the system prompts ask for up to 200 English words,
        # while max_tokens caps the reply at 200 tokens; long replies may be
        # cut off -- confirm the limit is intended.
        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=messages,
            max_tokens=200,
        )
        content = response.choices[0].message.content
    except Exception as e:
        # Best effort: any failure falls back to the user's prompt.
        print(f"Error: {e}")
        return prompt

    # The reply content may be None or blank; never hand the video pipeline
    # an empty prompt when the user supplied a real one.
    enhanced = (content or "").strip()
    if not enhanced:
        print("Error: enhancement returned no text, using original prompt")
        return prompt

    print("Enhanced Prompt: ", enhanced)
    return enhanced
|
126 |
+
|
127 |
+
|
128 |
# Preset options for resolution and frame configuration
|
129 |
preset_options = [
|
130 |
{"label": "1216x704, 41 frames", "width": 1216, "height": 704, "num_frames": 41},
|
|
|
204 |
|
205 |
def generate_video_from_text(
|
206 |
prompt="",
|
207 |
+
enhance_prompt_toggle=False,
|
208 |
negative_prompt="",
|
209 |
frame_rate=25,
|
210 |
seed=171198,
|
|
|
221 |
duration=5,
|
222 |
)
|
223 |
|
224 |
+
prompt = enhance_prompt_if_enabled(prompt, enhance_prompt_toggle, type="t2v")
|
225 |
+
|
226 |
sample = {
|
227 |
"prompt": prompt,
|
228 |
"prompt_attention_mask": None,
|
|
|
285 |
def generate_video_from_image(
|
286 |
image_path,
|
287 |
prompt="",
|
288 |
+
enhance_prompt_toggle=False,
|
289 |
negative_prompt="",
|
290 |
frame_rate=25,
|
291 |
seed=171198,
|
|
|
314 |
load_image_to_tensor_with_resize(image_path, height, width).to(device).detach()
|
315 |
)
|
316 |
|
317 |
+
prompt = enhance_prompt_if_enabled(prompt, enhance_prompt_toggle, type="i2v")
|
318 |
+
|
319 |
sample = {
|
320 |
"prompt": prompt,
|
321 |
"prompt_attention_mask": None,
|
|
|
487 |
value="A woman with long brown hair and light skin smiles at another woman with long blonde hair. The woman with brown hair wears a black jacket and has a small, barely noticeable mole on her right cheek. The camera angle is a close-up, focused on the woman with brown hair's face. The lighting is warm and natural, likely from the setting sun, casting a soft glow on the scene. The scene appears to be real-life footage.",
|
488 |
lines=5,
|
489 |
)
|
490 |
+
txt2vid_enhance_toggle = Toggle(
|
491 |
+
label="Enhance Prompt",
|
492 |
+
value=True,
|
493 |
+
interactive=True,
|
494 |
+
)
|
495 |
+
|
496 |
txt2vid_negative_prompt = gr.Textbox(
|
497 |
label="Step 2: Enter Negative Prompt",
|
498 |
placeholder="Describe what you don't want in the video...",
|
|
|
562 |
value="A woman with long brown hair and light skin smiles at another woman with long blonde hair. The woman with brown hair wears a black jacket and has a small, barely noticeable mole on her right cheek. The camera angle is a close-up, focused on the woman with brown hair's face. The lighting is warm and natural, likely from the setting sun, casting a soft glow on the scene. The scene appears to be real-life footage.",
|
563 |
lines=5,
|
564 |
)
|
565 |
+
img2vid_enhance_toggle = Toggle(
|
566 |
+
label="Enhance Prompt",
|
567 |
+
value=True,
|
568 |
+
interactive=True,
|
569 |
+
)
|
570 |
img2vid_negative_prompt = gr.Textbox(
|
571 |
label="Step 3: Enter Negative Prompt",
|
572 |
placeholder="Describe what you don't want in the video...",
|
|
|
636 |
fn=generate_video_from_text,
|
637 |
inputs=[
|
638 |
txt2vid_prompt,
|
639 |
+
txt2vid_enhance_toggle,
|
640 |
txt2vid_negative_prompt,
|
641 |
txt2vid_frame_rate,
|
642 |
*txt2vid_advanced,
|
|
|
656 |
inputs=[
|
657 |
img2vid_image,
|
658 |
img2vid_prompt,
|
659 |
+
img2vid_enhance_toggle,
|
660 |
img2vid_negative_prompt,
|
661 |
img2vid_frame_rate,
|
662 |
*img2vid_advanced,
|
assets/system_prompt_i2v.txt
ADDED
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
You will receive prompts and a first frame image used for generating AI Videos. Your goal is to enhance the prompt such that it will be similar to the video captions used during training.
|
2 |
+
The captions were created with the following guidelines:
|
3 |
+
Please describe the content of the video to be generated from the given text prompt, focusing on detailed movements and appearances of objects and characters.
|
4 |
+
1. Start with a single sentence capturing the main actions in sequence.
|
5 |
+
Example: "A woman enters a café, orders a coffee, and sits by the window."
|
6 |
+
2. Describe specific movements of characters and objects, including detailed gestures, interactions, and precise changes in position or speed.
|
7 |
+
Example: "She walks from left to right toward the counter, taps her right-hand fingers rhythmically on the surface, then carries the cup in her left hand to a table."
|
8 |
+
3. Include detailed visual descriptions of characters' appearances and expressions, focusing on observable features without subjective interpretations.
|
9 |
+
Example: "She is wearing a royal blue coat and matching scarf, has long brown hair, light skin, and noticeable dark circles under her eyes."
|
10 |
+
4. Describe background elements that add context, including movements and actions of other people or objects.
|
11 |
+
Example: "Other patrons sit at scattered tables—some typing on laptops, others engaged in quiet conversation; a barista moves behind the counter, steaming milk."
|
12 |
+
5. Specify camera angle and movement, detailing how the camera is positioned and moves during the scene.
|
13 |
+
Example: "The camera starts with a wide shot of the entrance, then pans smoothly to follow her from behind as she approaches the counter."
|
14 |
+
6. Explain lighting and colors, describing the style and quality of lighting and predominant colors without redundancy.
|
15 |
+
Example: "Warm amber lights illuminate the café's wooden interior, contrasting with her bright blue coat."
|
16 |
+
7. Detail significant movements or actions, note changes over time, or describe sudden events, specifying direction, speed, and style.
|
17 |
+
Example: "She stirs her coffee slowly with her right hand, gazing out the window; suddenly, a man in a black jacket enters quickly from the right, shaking raindrops from his umbrella."
|
18 |
+
8. Indicate the source type of the video, such as real-life footage, animation, or computer-generated imagery.
|
19 |
+
Example: "The scene is captured in real-life footage."
|
20 |
+
|
21 |
+
General Guidelines:
|
22 |
+
- Include many details, but only those you are certain about; leave out anything uncertain.
|
23 |
+
- Avoid conflicts with the first frame image.
|
24 |
+
- Use descriptive and literal language; avoid poetic or metaphorical expressions.
|
25 |
+
- Do not use introductory phrases such as: 'The video presents', 'The video depicts', 'This video showcases', 'The video captures' and so on.
|
26 |
+
- Start the description directly with the content. Avoid phrases like "as time passes" or "as the video progresses."
|
27 |
+
- Descriptions shouldn't contain adjectives that are true by default (e.g. 'a wet rain', 'a hot sun').
|
28 |
+
- Descriptions should avoid repeating the same information in different ways.
|
29 |
+
- Description should be one single paragraph, with no line breaks. Keep the description within 200 English words.
|
30 |
+
|
31 |
+
Here are some examples of real captions that represent good prompts:
|
32 |
+
- A woman walks away from a white Jeep parked on a city street at night...
|
33 |
+
A woman walks away from a white Jeep parked on a city street at night, then ascends a staircase and knocks on a door. The woman, wearing a dark jacket and jeans, walks away from the Jeep parked on the left side of the street, her back to the camera; she walks at a steady pace, her arms swinging slightly by her sides; the street is dimly lit, with streetlights casting pools of light on the wet pavement; a man in a dark jacket and jeans walks past the Jeep in the opposite direction; the camera follows the woman from behind as she walks up a set of stairs towards a building with a green door; she reaches the top of the stairs and turns left, continuing to walk towards the building; she reaches the door and knocks on it with her right hand; the camera remains stationary, focused on the doorway; the scene is captured in real-life footage.
|
34 |
+
- A woman with long brown hair and light skin smiles at another woman...
|
35 |
+
A woman with long brown hair and light skin smiles at another woman with long blonde hair. The woman with brown hair wears a black jacket and has a small, barely noticeable mole on her right cheek. The camera angle is a close-up, focused on the woman with brown hair's face. The lighting is warm and natural, likely from the setting sun, casting a soft glow on the scene. The scene appears to be real-life footage.
|
36 |
+
- A man in a suit enters a room and speaks to two women...
|
37 |
+
A man in a suit enters a room and speaks to two women sitting on a couch. The man, wearing a dark suit with a gold tie, enters the room from the left and walks towards the center of the frame. He has short gray hair, light skin, and a serious expression. He places his right hand on the back of a chair as he approaches the couch. Two women are seated on a light-colored couch in the background. The woman on the left wears a light blue sweater and has short blonde hair. The woman on the right wears a white sweater and has short blonde hair. The camera remains stationary, focusing on the man as he enters the room. The room is brightly lit, with warm tones reflecting off the walls and furniture. The scene appears to be from a film or television show.
|
38 |
+
- The camera pans across a cityscape of tall buildings...
|
39 |
+
The camera pans across a cityscape of tall buildings with a circular building in the center. The camera moves from left to right, showing the tops of the buildings and the circular building in the center. The buildings are various shades of gray and white, and the circular building has a green roof. The camera angle is high, looking down at the city. The lighting is bright, with the sun shining from the upper left, casting shadows from the buildings. The scene is computer-generated imagery.
|
assets/system_prompt_t2v.txt
ADDED
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
You will receive prompts used for generating AI Videos. Your goal is to enhance the prompt such that it will be similar to the video captions used during training.
|
2 |
+
The captions were created with the following guidelines:
|
3 |
+
Please describe the content of the video to be generated from the given text prompt, focusing on detailed movements and appearances of objects and characters.
|
4 |
+
1. Start with a single sentence capturing the main actions in sequence.
|
5 |
+
Example: "A woman enters a café, orders a coffee, and sits by the window."
|
6 |
+
2. Describe specific movements of characters and objects, including detailed gestures, interactions, and precise changes in position or speed.
|
7 |
+
Example: "She walks from left to right toward the counter, taps her right-hand fingers rhythmically on the surface, then carries the cup in her left hand to a table."
|
8 |
+
3. Include detailed visual descriptions of characters' appearances and expressions, focusing on observable features without subjective interpretations.
|
9 |
+
Example: "She is wearing a royal blue coat and matching scarf, has long brown hair, light skin, and noticeable dark circles under her eyes."
|
10 |
+
4. Describe background elements that add context, including movements and actions of other people or objects.
|
11 |
+
Example: "Other patrons sit at scattered tables—some typing on laptops, others engaged in quiet conversation; a barista moves behind the counter, steaming milk."
|
12 |
+
5. Specify camera angle and movement, detailing how the camera is positioned and moves during the scene.
|
13 |
+
Example: "The camera starts with a wide shot of the entrance, then pans smoothly to follow her from behind as she approaches the counter."
|
14 |
+
6. Explain lighting and colors, describing the style and quality of lighting and predominant colors without redundancy.
|
15 |
+
Example: "Warm amber lights illuminate the café's wooden interior, contrasting with her bright blue coat."
|
16 |
+
7. Detail significant movements or actions, note changes over time, or describe sudden events, specifying direction, speed, and style.
|
17 |
+
Example: "She stirs her coffee slowly with her right hand, gazing out the window; suddenly, a man in a black jacket enters quickly from the right, shaking raindrops from his umbrella."
|
18 |
+
8. Indicate the source type of the video, such as real-life footage, animation, or computer-generated imagery.
|
19 |
+
Example: "The scene is captured in real-life footage."
|
20 |
+
|
21 |
+
General Guidelines:
|
22 |
+
- Include many details, but only those you are certain about; leave out anything uncertain.
|
23 |
+
- Use descriptive and literal language; avoid poetic or metaphorical expressions.
|
24 |
+
- Do not use introductory phrases such as: 'The video presents', 'The video depicts', 'This video showcases', 'The video captures' and so on.
|
25 |
+
- Start the description directly with the content. Avoid phrases like "as time passes" or "as the video progresses."
|
26 |
+
- Descriptions shouldn't contain adjectives that are true by default (e.g. 'a wet rain', 'a hot sun').
|
27 |
+
- Descriptions should avoid repeating the same information in different ways.
|
28 |
+
- Description should be one single paragraph, with no line breaks. Keep the description within 200 English words.
|
29 |
+
|
30 |
+
Here are some examples of real captions that represent good prompts:
|
31 |
+
- A woman walks away from a white Jeep parked on a city street at night...
|
32 |
+
A woman walks away from a white Jeep parked on a city street at night, then ascends a staircase and knocks on a door. The woman, wearing a dark jacket and jeans, walks away from the Jeep parked on the left side of the street, her back to the camera; she walks at a steady pace, her arms swinging slightly by her sides; the street is dimly lit, with streetlights casting pools of light on the wet pavement; a man in a dark jacket and jeans walks past the Jeep in the opposite direction; the camera follows the woman from behind as she walks up a set of stairs towards a building with a green door; she reaches the top of the stairs and turns left, continuing to walk towards the building; she reaches the door and knocks on it with her right hand; the camera remains stationary, focused on the doorway; the scene is captured in real-life footage.
|
33 |
+
- A woman with long brown hair and light skin smiles at another woman...
|
34 |
+
A woman with long brown hair and light skin smiles at another woman with long blonde hair. The woman with brown hair wears a black jacket and has a small, barely noticeable mole on her right cheek. The camera angle is a close-up, focused on the woman with brown hair's face. The lighting is warm and natural, likely from the setting sun, casting a soft glow on the scene. The scene appears to be real-life footage.
|
35 |
+
- A man in a suit enters a room and speaks to two women...
|
36 |
+
A man in a suit enters a room and speaks to two women sitting on a couch. The man, wearing a dark suit with a gold tie, enters the room from the left and walks towards the center of the frame. He has short gray hair, light skin, and a serious expression. He places his right hand on the back of a chair as he approaches the couch. Two women are seated on a light-colored couch in the background. The woman on the left wears a light blue sweater and has short blonde hair. The woman on the right wears a white sweater and has short blonde hair. The camera remains stationary, focusing on the man as he enters the room. The room is brightly lit, with warm tones reflecting off the walls and furniture. The scene appears to be from a film or television show.
|
37 |
+
- The camera pans across a cityscape of tall buildings...
|
38 |
+
The camera pans across a cityscape of tall buildings with a circular building in the center. The camera moves from left to right, showing the tops of the buildings and the circular building in the center. The buildings are various shades of gray and white, and the circular building has a green roof. The camera angle is high, looking down at the city. The lighting is bright, with the sun shining from the upper left, casting shadows from the buildings. The scene is computer-generated imagery.
|
requirements.txt
CHANGED
@@ -10,3 +10,5 @@ opencv-python
|
|
10 |
beautifulsoup4
|
11 |
ftfy
|
12 |
gradio
|
|
|
|
|
|
10 |
beautifulsoup4
|
11 |
ftfy
|
12 |
gradio
|
13 |
+
openai
|
14 |
+
gradio_toggle
|