Spaces:

Lightricks
/

LTX-Video-Playground

Running on A100

App Files Files Community

benibraz committed on Nov 22, 2024

Commit

94c4974

1 Parent(s): 0ad6bcf

Add OpenAI integration and enhance prompt functionality for video generation

Browse files

Files changed (4) hide show

app.py +54 -0
assets/system_prompt_i2v.txt +39 -0
assets/system_prompt_t2v.txt +38 -0
requirements.txt +2 -0

app.py CHANGED Viewed

@@ -1,4 +1,5 @@
 import gradio as gr
 import torch
 from huggingface_hub import snapshot_download
@@ -18,9 +19,19 @@ from PIL import Image
 import tempfile
 import os
 import gc
 # Load Hugging Face token if needed
 hf_token = os.getenv("HF_TOKEN")
 # Set model download directory within Hugging Face Spaces
 model_path = "asset"
@@ -90,6 +101,30 @@ def load_image_to_tensor_with_resize(image_path, target_height=512, target_width
     return frame_tensor.unsqueeze(0).unsqueeze(2)
 # Preset options for resolution and frame configuration
 preset_options = [
     {"label": "1216x704, 41 frames", "width": 1216, "height": 704, "num_frames": 41},
@@ -169,6 +204,7 @@ pipeline = XoraVideoPipeline(
 def generate_video_from_text(
     prompt="",
     negative_prompt="",
     frame_rate=25,
     seed=171198,
@@ -185,6 +221,8 @@ def generate_video_from_text(
             duration=5,
         )
     sample = {
         "prompt": prompt,
         "prompt_attention_mask": None,
@@ -247,6 +285,7 @@ def generate_video_from_text(
 def generate_video_from_image(
     image_path,
     prompt="",
     negative_prompt="",
     frame_rate=25,
     seed=171198,
@@ -275,6 +314,8 @@ def generate_video_from_image(
         load_image_to_tensor_with_resize(image_path, height, width).to(device).detach()
     )
     sample = {
         "prompt": prompt,
         "prompt_attention_mask": None,
@@ -446,6 +487,12 @@ with gr.Blocks(theme=gr.themes.Soft()) as iface:
                         value="A woman with long brown hair and light skin smiles at another woman with long blonde hair. The woman with brown hair wears a black jacket and has a small, barely noticeable mole on her right cheek. The camera angle is a close-up, focused on the woman with brown hair's face. The lighting is warm and natural, likely from the setting sun, casting a soft glow on the scene. The scene appears to be real-life footage.",
                         lines=5,
                     )
                     txt2vid_negative_prompt = gr.Textbox(
                         label="Step 2: Enter Negative Prompt",
                         placeholder="Describe what you don't want in the video...",
@@ -515,6 +562,11 @@ with gr.Blocks(theme=gr.themes.Soft()) as iface:
                         value="A woman with long brown hair and light skin smiles at another woman with long blonde hair. The woman with brown hair wears a black jacket and has a small, barely noticeable mole on her right cheek. The camera angle is a close-up, focused on the woman with brown hair's face. The lighting is warm and natural, likely from the setting sun, casting a soft glow on the scene. The scene appears to be real-life footage.",
                         lines=5,
                     )
                     img2vid_negative_prompt = gr.Textbox(
                         label="Step 3: Enter Negative Prompt",
                         placeholder="Describe what you don't want in the video...",
@@ -584,6 +636,7 @@ with gr.Blocks(theme=gr.themes.Soft()) as iface:
         fn=generate_video_from_text,
         inputs=[
             txt2vid_prompt,
             txt2vid_negative_prompt,
             txt2vid_frame_rate,
             *txt2vid_advanced,
@@ -603,6 +656,7 @@ with gr.Blocks(theme=gr.themes.Soft()) as iface:
         inputs=[
             img2vid_image,
             img2vid_prompt,
             img2vid_negative_prompt,
             img2vid_frame_rate,
             *img2vid_advanced,

 import gradio as gr
+from gradio_toggle import Toggle
 import torch
 from huggingface_hub import snapshot_download
 import tempfile
 import os
 import gc
+from openai import OpenAI
 # Load Hugging Face token if needed
 hf_token = os.getenv("HF_TOKEN")
+openai_api_key = os.getenv("OPENAI_API_KEY")
+client = OpenAI(api_key=openai_api_key)
+system_prompt_t2v_path = "assets/system_prompt_t2v.txt"
+system_prompt_i2v_path = "assets/system_prompt_i2v.txt"
+with open(system_prompt_t2v_path, "r") as f:
+    system_prompt_t2v = f.read()
+with open(system_prompt_i2v_path, "r") as f:
+    system_prompt_i2v = f.read()
 # Set model download directory within Hugging Face Spaces
 model_path = "asset"
     return frame_tensor.unsqueeze(0).unsqueeze(2)
+def enhance_prompt_if_enabled(prompt, enhance_toggle, type="t2v"):
+    if not enhance_toggle:
+        print("Enhance toggle is off, Prompt: ", prompt)
+        return prompt
+    system_prompt = system_prompt_t2v if type == "t2v" else system_prompt_i2v
+    messages = [
+        {"role": "system", "content": system_prompt},
+        {"role": "user", "content": prompt},
+    ]
+    try:
+        response = client.chat.completions.create(
+            model="gpt-4o-mini",
+            messages=messages,
+            max_tokens=200,
+        )
+        print("Enhanced Prompt: ", response.choices[0].message.content.strip())
+        return response.choices[0].message.content.strip()
+    except Exception as e:
+        print(f"Error: {e}")
+        return prompt
 # Preset options for resolution and frame configuration
 preset_options = [
     {"label": "1216x704, 41 frames", "width": 1216, "height": 704, "num_frames": 41},
 def generate_video_from_text(
     prompt="",
+    enhance_prompt_toggle=False,
     negative_prompt="",
     frame_rate=25,
     seed=171198,
             duration=5,
         )
+    prompt = enhance_prompt_if_enabled(prompt, enhance_prompt_toggle, type="t2v")
     sample = {
         "prompt": prompt,
         "prompt_attention_mask": None,
 def generate_video_from_image(
     image_path,
     prompt="",
+    enhance_prompt_toggle=False,
     negative_prompt="",
     frame_rate=25,
     seed=171198,
         load_image_to_tensor_with_resize(image_path, height, width).to(device).detach()
     )
+    prompt = enhance_prompt_if_enabled(prompt, enhance_prompt_toggle, type="i2v")
     sample = {
         "prompt": prompt,
         "prompt_attention_mask": None,
                         value="A woman with long brown hair and light skin smiles at another woman with long blonde hair. The woman with brown hair wears a black jacket and has a small, barely noticeable mole on her right cheek. The camera angle is a close-up, focused on the woman with brown hair's face. The lighting is warm and natural, likely from the setting sun, casting a soft glow on the scene. The scene appears to be real-life footage.",
                         lines=5,
                     )
+                    txt2vid_enhance_toggle = Toggle(
+                        label="Enhance Prompt",
+                        value=True,
+                        interactive=True,
+                    )
                     txt2vid_negative_prompt = gr.Textbox(
                         label="Step 2: Enter Negative Prompt",
                         placeholder="Describe what you don't want in the video...",
                         value="A woman with long brown hair and light skin smiles at another woman with long blonde hair. The woman with brown hair wears a black jacket and has a small, barely noticeable mole on her right cheek. The camera angle is a close-up, focused on the woman with brown hair's face. The lighting is warm and natural, likely from the setting sun, casting a soft glow on the scene. The scene appears to be real-life footage.",
                         lines=5,
                     )
+                    img2vid_enhance_toggle = Toggle(
+                        label="Enhance Prompt",
+                        value=True,
+                        interactive=True,
+                    )
                     img2vid_negative_prompt = gr.Textbox(
                         label="Step 3: Enter Negative Prompt",
                         placeholder="Describe what you don't want in the video...",
         fn=generate_video_from_text,
         inputs=[
             txt2vid_prompt,
+            txt2vid_enhance_toggle,
             txt2vid_negative_prompt,
             txt2vid_frame_rate,
             *txt2vid_advanced,
         inputs=[
             img2vid_image,
             img2vid_prompt,
+            img2vid_enhance_toggle,
             img2vid_negative_prompt,
             img2vid_frame_rate,
             *img2vid_advanced,

assets/system_prompt_i2v.txt ADDED Viewed

	@@ -0,0 +1,39 @@

+You will receive prompts and a first frame image used for generating AI Videos. Your goal is to enhance the prompt such that it will be similar to the video captions used during training.
+The captions were created with the following guidelines:
+Please describe the content of the video to be generated from the given text prompt, focusing on detailed movements and appearances of objects and characters.
+    1. Start with a single sentence capturing the main actions in sequence.
+        Example: "A woman enters a café, orders a coffee, and sits by the window."
+    2. Describe specific movements of characters and objects, including detailed gestures, interactions, and precise changes in position or speed.
+        Example: "She walks from left to right toward the counter, taps her right-hand fingers rhythmically on the surface, then carries the cup in her left hand to a table."
+    3. Include detailed visual descriptions of characters' appearances and expressions, focusing on observable features without subjective interpretations.
+        Example: "She is wearing a royal blue coat and matching scarf, has long brown hair, light skin, and noticeable dark circles under her eyes."
+    4. Describe background elements that add context, including movements and actions of other people or objects.
+        Example: "Other patrons sit at scattered tables—some typing on laptops, others engaged in quiet conversation; a barista moves behind the counter, steaming milk."
+    5. Specify camera angle and movement, detailing how the camera is positioned and moves during the scene.
+        Example: "The camera starts with a wide shot of the entrance, then pans smoothly to follow her from behind as she approaches the counter."
+    6. Explain lighting and colors, describing the style and quality of lighting and predominant colors without redundancy.
+        Example: "Warm amber lights illuminate the café's wooden interior, contrasting with her bright blue coat."
+    7. Detail significant movements or actions, note changes over time, or describe sudden events, specifying direction, speed, and style.
+        Example: "She stirs her coffee slowly with her right hand, gazing out the window; suddenly, a man in a black jacket enters quickly from the right, shaking raindrops from his umbrella."
+    8. Indicate the source type of the video, such as real-life footage, animation, or computer-generated imagery.
+        Example: "The scene is captured in real-life footage."
+    General Guidelines:
+    - Include many details but only those you are certain about, possibly omitting uncertain elements.
+    - Avoid conflicts with the first frame image.
+    - Use descriptive and literal language; avoid poetic or metaphorical expressions.
+    - Do not use introductory phrases such as: 'The video presents', 'The video depicts', 'This video showcases', 'The video captures' and so on.
+    - Start the description directly with the content. Avoid phrases like "as time passes" or "as the video progresses."
+    - Descriptions shouldn't contain adjectives that are true by default (e.g. 'a wet rain', 'a hot sun').
+    - Descriptions should avoid repeating the same information in different ways.
+    - Description should be one single paragraph, with no line breaks. Keep the description within 200 English words.
+Here are some examples of real captions that represent good prompts:
+- A woman walks away from a white Jeep parked on a city street at night...
+A woman walks away from a white Jeep parked on a city street at night, then ascends a staircase and knocks on a door. The woman, wearing a dark jacket and jeans, walks away from the Jeep parked on the left side of the street, her back to the camera; she walks at a steady pace, her arms swinging slightly by her sides; the street is dimly lit, with streetlights casting pools of light on the wet pavement; a man in a dark jacket and jeans walks past the Jeep in the opposite direction; the camera follows the woman from behind as she walks up a set of stairs towards a building with a green door; she reaches the top of the stairs and turns left, continuing to walk towards the building; she reaches the door and knocks on it with her right hand; the camera remains stationary, focused on the doorway; the scene is captured in real-life footage.
+- A woman with long brown hair and light skin smiles at another woman...
+A woman with long brown hair and light skin smiles at another woman with long blonde hair. The woman with brown hair wears a black jacket and has a small, barely noticeable mole on her right cheek. The camera angle is a close-up, focused on the woman with brown hair's face. The lighting is warm and natural, likely from the setting sun, casting a soft glow on the scene. The scene appears to be real-life footage.
+- A man in a suit enters a room and speaks to two women...
+A man in a suit enters a room and speaks to two women sitting on a couch. The man, wearing a dark suit with a gold tie, enters the room from the left and walks towards the center of the frame. He has short gray hair, light skin, and a serious expression. He places his right hand on the back of a chair as he approaches the couch. Two women are seated on a light-colored couch in the background. The woman on the left wears a light blue sweater and has short blonde hair. The woman on the right wears a white sweater and has short blonde hair. The camera remains stationary, focusing on the man as he enters the room. The room is brightly lit, with warm tones reflecting off the walls and furniture. The scene appears to be from a film or television show.
+- The camera pans across a cityscape of tall buildings...
+The camera pans across a cityscape of tall buildings with a circular building in the center. The camera moves from left to right, showing the tops of the buildings and the circular building in the center. The buildings are various shades of gray and white, and the circular building has a green roof. The camera angle is high, looking down at the city. The lighting is bright, with the sun shining from the upper left, casting shadows from the buildings. The scene is computer-generated imagery.

assets/system_prompt_t2v.txt ADDED Viewed

	@@ -0,0 +1,38 @@

+You will receive prompts used for generating AI Videos. Your goal is to enhance the prompt such that it will be similar to the video captions used during training.
+The captions were created with the following guidelines:
+Please describe the content of the video to be generated from the given text prompt, focusing on detailed movements and appearances of objects and characters.
+    1. Start with a single sentence capturing the main actions in sequence.
+        Example: "A woman enters a café, orders a coffee, and sits by the window."
+    2. Describe specific movements of characters and objects, including detailed gestures, interactions, and precise changes in position or speed.
+        Example: "She walks from left to right toward the counter, taps her right-hand fingers rhythmically on the surface, then carries the cup in her left hand to a table."
+    3. Include detailed visual descriptions of characters' appearances and expressions, focusing on observable features without subjective interpretations.
+        Example: "She is wearing a royal blue coat and matching scarf, has long brown hair, light skin, and noticeable dark circles under her eyes."
+    4. Describe background elements that add context, including movements and actions of other people or objects.
+        Example: "Other patrons sit at scattered tables—some typing on laptops, others engaged in quiet conversation; a barista moves behind the counter, steaming milk."
+    5. Specify camera angle and movement, detailing how the camera is positioned and moves during the scene.
+        Example: "The camera starts with a wide shot of the entrance, then pans smoothly to follow her from behind as she approaches the counter."
+    6. Explain lighting and colors, describing the style and quality of lighting and predominant colors without redundancy.
+        Example: "Warm amber lights illuminate the café's wooden interior, contrasting with her bright blue coat."
+    7. Detail significant movements or actions, note changes over time, or describe sudden events, specifying direction, speed, and style.
+        Example: "She stirs her coffee slowly with her right hand, gazing out the window; suddenly, a man in a black jacket enters quickly from the right, shaking raindrops from his umbrella."
+    8. Indicate the source type of the video, such as real-life footage, animation, or computer-generated imagery.
+        Example: "The scene is captured in real-life footage."
+    General Guidelines:
+    - Include many details but only those you are certain about, possibly omitting uncertain elements.
+    - Use descriptive and literal language; avoid poetic or metaphorical expressions.
+    - Do not use introductory phrases such as: 'The video presents', 'The video depicts', 'This video showcases', 'The video captures' and so on.
+    - Start the description directly with the content. Avoid phrases like "as time passes" or "as the video progresses."
+    - Descriptions shouldn't contain adjectives that are true by default (e.g. 'a wet rain', 'a hot sun').
+    - Descriptions should avoid repeating the same information in different ways.
+    - Description should be one single paragraph, with no line breaks. Keep the description within 200 English words.
+Here are some examples of real captions that represent good prompts:
+- A woman walks away from a white Jeep parked on a city street at night...
+A woman walks away from a white Jeep parked on a city street at night, then ascends a staircase and knocks on a door. The woman, wearing a dark jacket and jeans, walks away from the Jeep parked on the left side of the street, her back to the camera; she walks at a steady pace, her arms swinging slightly by her sides; the street is dimly lit, with streetlights casting pools of light on the wet pavement; a man in a dark jacket and jeans walks past the Jeep in the opposite direction; the camera follows the woman from behind as she walks up a set of stairs towards a building with a green door; she reaches the top of the stairs and turns left, continuing to walk towards the building; she reaches the door and knocks on it with her right hand; the camera remains stationary, focused on the doorway; the scene is captured in real-life footage.
+- A woman with long brown hair and light skin smiles at another woman...
+A woman with long brown hair and light skin smiles at another woman with long blonde hair. The woman with brown hair wears a black jacket and has a small, barely noticeable mole on her right cheek. The camera angle is a close-up, focused on the woman with brown hair's face. The lighting is warm and natural, likely from the setting sun, casting a soft glow on the scene. The scene appears to be real-life footage.
+- A man in a suit enters a room and speaks to two women...
+A man in a suit enters a room and speaks to two women sitting on a couch. The man, wearing a dark suit with a gold tie, enters the room from the left and walks towards the center of the frame. He has short gray hair, light skin, and a serious expression. He places his right hand on the back of a chair as he approaches the couch. Two women are seated on a light-colored couch in the background. The woman on the left wears a light blue sweater and has short blonde hair. The woman on the right wears a white sweater and has short blonde hair. The camera remains stationary, focusing on the man as he enters the room. The room is brightly lit, with warm tones reflecting off the walls and furniture. The scene appears to be from a film or television show.
+- The camera pans across a cityscape of tall buildings...
+The camera pans across a cityscape of tall buildings with a circular building in the center. The camera moves from left to right, showing the tops of the buildings and the circular building in the center. The buildings are various shades of gray and white, and the circular building has a green roof. The camera angle is high, looking down at the city. The lighting is bright, with the sun shining from the upper left, casting shadows from the buildings. The scene is computer-generated imagery.

requirements.txt CHANGED Viewed

@@ -10,3 +10,5 @@ opencv-python
 beautifulsoup4
 ftfy
 gradio

 beautifulsoup4
 ftfy
 gradio
+openai
+gradio_toggle