Show ffmpeg command in case of error

#6
by Aivo - opened
Files changed (3) hide show
  1. README.md +3 -3
  2. app.py +74 -385
  3. requirements.txt +1 -1
README.md CHANGED
@@ -5,12 +5,12 @@ emoji: 🏞
5
  colorFrom: red
6
  colorTo: yellow
7
  sdk: gradio
8
- sdk_version: 5.34.1
9
  app_file: app.py
10
  pinned: false
11
  disable_embedding: true
12
  models:
13
- - Qwen/Qwen2.5-Coder-32B-Instruct
14
  ---
15
 
16
  # 🏞 AI Video Composer
@@ -78,4 +78,4 @@ AI Video Composer is an intelligent media processing application that uses natur
78
 
79
  If you have ideas for improvements or bug fixes, please open a PR:
80
 
81
- [![Open a Pull Request](https://huggingface.co/datasets/huggingface/badges/raw/main/open-a-pr-lg-light.svg)](https://huggingface.co/spaces/huggingface-projects/video-composer-gpt4/discussions)
 
5
  colorFrom: red
6
  colorTo: yellow
7
  sdk: gradio
8
+ sdk_version: 5.6.0
9
  app_file: app.py
10
  pinned: false
11
  disable_embedding: true
12
  models:
13
+ - Qwen/Qwen2.5-Coder-32B-Instruct
14
  ---
15
 
16
  # 🏞 AI Video Composer
 
78
 
79
  If you have ideas for improvements or bug fixes, please open a PR:
80
 
81
+ [![Open a Pull Request](https://huggingface.co/datasets/huggingface/badges/raw/main/open-a-pr-lg-light.svg)](https://huggingface.co/spaces/huggingface-projects/video-composer-gpt4/discussions)
app.py CHANGED
@@ -1,5 +1,4 @@
1
  import gradio as gr
2
- import spaces
3
 
4
  from PIL import Image
5
  from moviepy.editor import VideoFileClip, AudioFileClip
@@ -13,25 +12,13 @@ import tempfile
13
  import shlex
14
  import shutil
15
 
16
- # Supported models configuration
17
- MODELS = {
18
- "deepseek-ai/DeepSeek-V3": {
19
- "base_url": "https://router.huggingface.co/sambanova/v1",
20
- "env_key": "HF_TOKEN",
21
- "model_name": "DeepSeek-V3-0324",
22
- },
23
- }
24
-
25
- # Initialize client with first available model
26
- client = OpenAI(
27
- base_url=next(iter(MODELS.values()))["base_url"],
28
- api_key=os.environ[next(iter(MODELS.values()))["env_key"]],
29
- )
30
 
31
  allowed_medias = [
32
  ".png",
33
  ".jpg",
34
- ".webp",
35
  ".jpeg",
36
  ".tiff",
37
  ".bmp",
@@ -97,16 +84,7 @@ def get_files_infos(files):
97
  return results
98
 
99
 
100
- def get_completion(
101
- prompt,
102
- files_info,
103
- top_p,
104
- temperature,
105
- model_choice,
106
- conversation_history=None,
107
- previous_error=None,
108
- previous_command=None,
109
- ):
110
  # Create table header
111
  files_info_string = "| Type | Name | Dimensions | Duration | Audio Channels |\n"
112
  files_info_string += "|------|------|------------|-----------|--------|\n"
@@ -125,52 +103,10 @@ def get_completion(
125
 
126
  files_info_string += f"| {file_info['type']} | {file_info['name']} | {dimensions} | {duration} | {audio} |\n"
127
 
128
- # Build the user message with optional error feedback
129
- user_content = f"""Always output the media as video/mp4 and output file with "output.mp4".
130
- The current assets and objective follow.
131
-
132
- AVAILABLE ASSETS LIST:
133
-
134
- {files_info_string}
135
-
136
- OBJECTIVE: {prompt} and output at "output.mp4"
137
-
138
- First, think step-by-step about what I'm asking for and reformulate it into a clear technical specification.
139
- Then provide the FFMPEG command that will accomplish this task."""
140
-
141
- # Add error feedback if this is a retry
142
- if previous_error and previous_command:
143
- user_content += f"""
144
-
145
- IMPORTANT: This is a retry attempt. The previous command failed with the following error:
146
-
147
- PREVIOUS COMMAND (FAILED):
148
- {previous_command}
149
-
150
- ERROR MESSAGE:
151
- {previous_error}
152
-
153
- Please analyze the error and generate a corrected command that addresses the specific issue.
154
-
155
- COMMON SLIDESHOW ERROR FIXES:
156
- - If you see "do not match the corresponding output link" β†’ Images have different dimensions, use scale+pad approach
157
- - If you see "Padded dimensions cannot be smaller than input dimensions" β†’ Fix pad calculation or use standard resolution (1920x1080 or 1080x1920)
158
- - If you see "Failed to configure input pad" β†’ Check scale and pad syntax, ensure proper filter chain
159
- - If you see "Invalid argument" in filters β†’ Simplify filter_complex syntax and check parentheses
160
-
161
- FORMAT DETECTION KEYWORDS:
162
- - "vertical", "portrait", "9:16", "TikTok", "Instagram Stories", "phone" β†’ Use 1080x1920
163
- - "horizontal", "landscape", "16:9", "YouTube", "TV" β†’ Use 1920x1080 (default)
164
- - "square", "1:1", "Instagram post" β†’ Use 1080x1080"""
165
-
166
- user_content += "\n\nYOUR RESPONSE:"
167
-
168
- # Initialize conversation with system message and first user message
169
- if conversation_history is None:
170
- messages = [
171
- {
172
- "role": "system",
173
- "content": """
174
  You are a very experienced media engineer, controlling a UNIX terminal.
175
  You are an FFMPEG expert with years of experience and multiple contributions to the FFMPEG project.
176
 
@@ -181,7 +117,6 @@ You are given:
181
  Your objective is to generate the SIMPLEST POSSIBLE single ffmpeg command to create the requested video.
182
 
183
  Key requirements:
184
- - First, think step-by-step about what the user is asking for and reformulate it into a clear technical specification
185
  - Use the absolute minimum number of ffmpeg options needed
186
  - Avoid complex filter chains or filter_complex if possible
187
  - Prefer simple concatenation, scaling, and basic filters
@@ -192,88 +127,23 @@ Key requirements:
192
  - For image sequences: Use -framerate and pattern matching (like 'img%d.jpg') when possible, falling back to individual image processing with -loop 1 and appropriate filters only when necessary.
193
  - When showing file operations or commands, always use explicit paths and filenames without wildcards - avoid using asterisk (*) or glob patterns. Instead, use specific numbered sequences (like %d), explicit file lists, or show the full filename.
194
 
195
- CRITICAL SLIDESHOW GUIDANCE:
196
- When creating slideshows from multiple images with different dimensions, ALWAYS follow this proven pattern:
197
-
198
- 1. CHOOSE A STANDARD RESOLUTION: Pick 1920x1080 (1080p) as the default target resolution for slideshows, UNLESS the user explicitly requests a different format (e.g., "vertical video", "9:16 ratio", "portrait mode", "TikTok format" β†’ use 1080x1920)
199
- 2. USE SIMPLE SCALE+PAD APPROACH: For each image, scale to fit within the chosen resolution maintaining aspect ratio, then pad with black bars
200
- 3. PROVEN SLIDESHOW PATTERN:
201
- ```
202
- ffmpeg -loop 1 -t 3 -i image1.jpg -loop 1 -t 3 -i image2.jpg -filter_complex "[0]scale=1920:1080:force_original_aspect_ratio=decrease,pad=1920:1080:(ow-iw)/2:(oh-ih)/2,setsar=1[v0];[1]scale=1920:1080:force_original_aspect_ratio=decrease,pad=1920:1080:(ow-iw)/2:(oh-ih)/2,setsar=1[v1];[v0][v1]concat=n=2:v=1:a=0" -c:v libx264 -pix_fmt yuv420p -movflags +faststart output.mp4
203
- ```
204
-
205
- 4. SLIDESHOW RULES:
206
- - Use 1920x1080 as target resolution by default, adjust if user specifies format
207
- - For horizontal: scale=1920:1080:force_original_aspect_ratio=decrease,pad=1920:1080:(ow-iw)/2:(oh-ih)/2
208
- - For vertical: scale=1080:1920:force_original_aspect_ratio=decrease,pad=1080:1920:(ow-iw)/2:(oh-ih)/2
209
- - Always add setsar=1 after padding to fix aspect ratio issues
210
- - Use 3-second duration per image by default (-t 3)
211
- - For 3+ images, extend the pattern: [v0][v1][v2]concat=n=3:v=1:a=0
212
-
213
- 5. DIMENSION MISMATCH FIXES:
214
- - Never try to concat images with different dimensions directly
215
- - Always normalize dimensions first with scale+pad
216
- - Black padding is preferable to stretching/distorting images
217
-
218
- 6. SLIDESHOW TRANSITIONS:
219
- - For fade transitions, add fade=t=in:st=0:d=0.5,fade=t=out:st=2.5:d=0.5 after setsar=1
220
- - Keep transitions simple - complex transitions often fail
221
- - Only add transitions if specifically requested
222
-
223
- 7. SLIDESHOW TIMING:
224
- - Default to 3 seconds per image
225
- - Adjust timing based on user request (e.g., "5 seconds per image")
226
- - Total duration = (number of images Γ— seconds per image)
227
-
228
  Remember: Simpler is better. Only use advanced ffmpeg features if absolutely necessary for the requested output.
229
  """,
230
- },
231
- {
232
- "role": "user",
233
- "content": user_content,
234
- },
235
- ]
236
- else:
237
- # Use existing conversation history
238
- messages = conversation_history[:]
239
-
240
- # If there's a previous error, add it as a separate message exchange
241
- if previous_error and previous_command:
242
- # Add the failed command as assistant response
243
- messages.append({
244
- "role": "assistant",
245
- "content": f"I'll execute this FFmpeg command:\n\n```bash\n{previous_command}\n```"
246
- })
247
-
248
- # Add the error as user feedback
249
- messages.append({
250
- "role": "user",
251
- "content": f"""The command failed with the following error:
252
-
253
- ERROR MESSAGE:
254
- {previous_error}
255
-
256
- Please analyze the error and generate a corrected command that addresses the specific issue.
257
-
258
- COMMON SLIDESHOW ERROR FIXES:
259
- - If you see "do not match the corresponding output link" β†’ Images have different dimensions, use scale+pad approach
260
- - If you see "Padded dimensions cannot be smaller than input dimensions" β†’ Fix pad calculation or use standard resolution (1920x1080 or 1080x1920)
261
- - If you see "Failed to configure input pad" β†’ Check scale and pad syntax, ensure proper filter chain
262
- - If you see "Invalid argument" in filters β†’ Simplify filter_complex syntax and check parentheses
263
-
264
- FORMAT DETECTION KEYWORDS:
265
- - "vertical", "portrait", "9:16", "TikTok", "Instagram Stories", "phone" β†’ Use 1080x1920
266
- - "horizontal", "landscape", "16:9", "YouTube", "TV" β†’ Use 1920x1080 (default)
267
- - "square", "1:1", "Instagram post" β†’ Use 1080x1080
268
-
269
- Please provide a corrected FFmpeg command."""
270
- })
271
- else:
272
- # Add new user request to existing conversation
273
- messages.append({
274
- "role": "user",
275
- "content": user_content,
276
- })
277
  try:
278
  # Print the complete prompt
279
  print("\n=== COMPLETE PROMPT ===")
@@ -282,128 +152,36 @@ Please provide a corrected FFmpeg command."""
282
  print(msg["content"])
283
  print("=====================\n")
284
 
285
- if model_choice not in MODELS:
286
- raise ValueError(f"Model {model_choice} is not supported")
287
-
288
- model_config = MODELS[model_choice]
289
- client.base_url = model_config["base_url"]
290
- client.api_key = os.environ[model_config["env_key"]]
291
- model = model_config.get("model_name", model_choice)
292
-
293
  completion = client.chat.completions.create(
294
- model=model,
295
  messages=messages,
296
  temperature=temperature,
297
  top_p=top_p,
298
  max_tokens=2048,
299
  )
300
  content = completion.choices[0].message.content
301
- print(f"\n=== RAW API RESPONSE ===\n{content}\n========================\n")
302
-
303
  # Extract command from code block if present
304
- import re
305
- command = None
306
-
307
- # Try multiple code block patterns
308
- code_patterns = [
309
- r"```(?:bash|sh|shell)?\n(.*?)\n```", # Standard code blocks
310
- r"```\n(.*?)\n```", # Plain code blocks
311
- r"`([^`]*ffmpeg[^`]*)`", # Inline code with ffmpeg
312
- ]
313
-
314
- for pattern in code_patterns:
315
- matches = re.findall(pattern, content, re.DOTALL | re.IGNORECASE)
316
- for match in matches:
317
- if "ffmpeg" in match.lower():
318
- command = match.strip()
319
- break
320
  if command:
321
- break
322
-
323
- # If no code block found, try to find ffmpeg lines directly
324
- if not command:
325
- ffmpeg_lines = [
326
- line.strip()
327
- for line in content.split("\n")
328
- if line.strip().lower().startswith("ffmpeg")
329
- ]
330
- if ffmpeg_lines:
331
- command = ffmpeg_lines[0]
332
-
333
- # Last resort: look for any line containing ffmpeg
334
- if not command:
335
- for line in content.split("\n"):
336
- line = line.strip()
337
- if "ffmpeg" in line.lower() and len(line) > 10:
338
- command = line
339
- break
340
-
341
- if not command:
342
- print(f"ERROR: No ffmpeg command found in response")
343
- command = content.replace("\n", " ").strip()
344
-
345
- print(f"=== EXTRACTED COMMAND ===\n{command}\n========================\n")
346
-
347
  # remove output.mp4 with the actual output file path
348
  command = command.replace("output.mp4", "")
349
 
350
- # Add the assistant's response to conversation history
351
- messages.append({
352
- "role": "assistant",
353
- "content": content
354
- })
355
-
356
- return command, messages
357
  except Exception as e:
358
  raise Exception("API Error")
359
 
360
 
361
- @spaces.GPU(duration=120)
362
- def execute_ffmpeg_command(args, temp_dir, output_file_path):
363
- """Execute FFmpeg command with GPU acceleration"""
364
- final_command = args + ["-y", output_file_path]
365
- print(f"\n=== EXECUTING FFMPEG COMMAND ===\nffmpeg {' '.join(final_command[1:])}\n")
366
- subprocess.run(final_command, cwd=temp_dir)
367
- return output_file_path
368
-
369
-
370
- def compose_video(
371
- prompt: str,
372
- files: list = None,
373
- top_p: float = 0.7,
374
- temperature: float = 0.1,
375
- model_choice: str = "deepseek-ai/DeepSeek-V3",
376
- ) -> str:
377
- """
378
- Compose videos from existing media assets using natural language instructions.
379
-
380
- This tool is NOT for AI video generation. Instead, it uses AI to generate FFmpeg
381
- commands that combine, edit, and transform your uploaded images, videos, and audio
382
- files based on natural language descriptions.
383
-
384
- Args:
385
- prompt (str): Natural language instructions for video composition (e.g., "Create a slideshow with background music")
386
- files (list, optional): List of media files (images, videos, audio) to use
387
- top_p (float): Top-p sampling parameter for AI model (0.0-1.0, default: 0.7)
388
- temperature (float): Temperature parameter for AI model creativity (0.0-5.0, default: 0.1)
389
- model_choice (str): AI model to use for command generation (default: "deepseek-ai/DeepSeek-V3")
390
-
391
- Returns:
392
- str: Path to the generated video file
393
-
394
- Example:
395
- compose_video("Create a 10-second slideshow from the images with fade transitions", files=[img1, img2, img3])
396
- """
397
- return update(files or [], prompt, top_p, temperature, model_choice)
398
-
399
-
400
- def update(
401
- files,
402
- prompt,
403
- top_p=1,
404
- temperature=1,
405
- model_choice="deepseek-ai/DeepSeek-V3",
406
- ):
407
  if prompt == "":
408
  raise gr.Error("Please enter a prompt.")
409
 
@@ -415,30 +193,16 @@ def update(
415
  raise gr.Error(
416
  "Please make sure all videos are less than 2 minute long."
417
  )
418
- if file_info["size"] > 100000000:
419
- raise gr.Error("Please make sure all files are less than 100MB in size.")
420
 
421
  attempts = 0
422
- command_attempts = []
423
- previous_error = None
424
- previous_command = None
425
- conversation_history = None
426
-
427
  while attempts < 2:
428
- print("ATTEMPT", attempts + 1)
429
  try:
430
- command_string, conversation_history = get_completion(
431
- prompt,
432
- files_info,
433
- top_p,
434
- temperature,
435
- model_choice,
436
- conversation_history,
437
- previous_error,
438
- previous_command,
439
- )
440
  print(
441
- f"""///PROMPT {prompt} \n\n/// START OF COMMAND ///:\n\n{command_string}\n\n/// END OF COMMAND ///\n\n"""
442
  )
443
 
444
  # split command string into list of arguments
@@ -453,112 +217,42 @@ def update(
453
  shutil.copy(file_path, Path(temp_dir) / sanitized_name)
454
 
455
  # test if ffmpeg command is valid dry run
456
- ffmpeg_dry_run = subprocess.run(
457
  args + ["-f", "null", "-"],
458
  stderr=subprocess.PIPE,
459
  text=True,
460
  cwd=temp_dir,
461
  )
462
-
463
- # Extract command for display
464
- command_for_display = f"ffmpeg {' '.join(args[1:])} -y output.mp4"
465
-
466
- if ffmpeg_dry_run.returncode == 0:
467
  print("Command is valid.")
468
- # Add successful command to attempts
469
- command_attempts.append(
470
- {
471
- "command": command_for_display,
472
- "status": "βœ… Valid",
473
- "attempt": attempts + 1,
474
- }
475
- )
476
  else:
477
  print("Command is not valid. Error output:")
478
- print(ffmpeg_dry_run.stderr)
479
-
480
- # Add failed command to attempts with error
481
- command_attempts.append(
482
- {
483
- "command": command_for_display,
484
- "status": "❌ Invalid",
485
- "error": ffmpeg_dry_run.stderr,
486
- "attempt": attempts + 1,
487
- }
488
- )
489
-
490
- # Store error details for next retry
491
- previous_error = ffmpeg_dry_run.stderr
492
- previous_command = command_for_display
493
-
494
  raise Exception(
495
- f"FFMPEG command validation failed: {ffmpeg_dry_run.stderr}"
496
  )
497
 
498
  output_file_name = f"output_{uuid.uuid4()}.mp4"
499
  output_file_path = str((Path(temp_dir) / output_file_name).resolve())
500
- execute_ffmpeg_command(args, temp_dir, output_file_path)
501
-
502
- # Generate command display with all attempts
503
- command_display = generate_command_display(command_attempts)
504
- return output_file_path, gr.update(value=command_display)
505
-
 
506
  except Exception as e:
507
  attempts += 1
508
  if attempts >= 2:
509
  print("FROM UPDATE", e)
510
- # Show all attempted commands even on final failure
511
- command_display = generate_command_display(command_attempts)
512
- command_display += (
513
- f"\n\n### Final Error\n❌ All attempts failed. Last error: {str(e)}"
514
- )
515
- return None, gr.update(value=command_display)
516
-
517
-
518
- def generate_command_display(command_attempts):
519
- """Generate a markdown display of all command attempts"""
520
- if not command_attempts:
521
- return "### No commands generated"
522
-
523
- display = "### Generated Commands\n\n"
524
-
525
- for attempt in command_attempts:
526
- display += f"**Attempt {attempt['attempt']}** {attempt['status']}\n"
527
- display += f"```bash\n{attempt['command']}\n```\n"
528
 
529
- if attempt["status"] == "❌ Invalid" and "error" in attempt:
530
- display += f"<details>\n<summary>πŸ” Error Details</summary>\n\n```\n{attempt['error']}\n```\n</details>\n\n"
531
- else:
532
- display += "\n"
533
-
534
- return display
535
-
536
-
537
- # Create MCP-compatible interface
538
- mcp_interface = gr.Interface(
539
- fn=compose_video,
540
- inputs=[
541
- gr.Textbox(
542
- value="Create a slideshow with background music",
543
- label="Video Composition Instructions",
544
- ),
545
- gr.File(file_count="multiple", label="Media Files", file_types=allowed_medias),
546
- gr.Slider(0.0, 1.0, value=0.7, label="Top-p"),
547
- gr.Slider(0.0, 5.0, value=0.1, label="Temperature"),
548
- gr.Radio(
549
- choices=list(MODELS.keys()), value=list(MODELS.keys())[0], label="Model"
550
- ),
551
- ],
552
- outputs=gr.Video(label="Generated Video"),
553
- title="AI Video Composer MCP Tool",
554
- description="Compose videos from media assets using natural language",
555
- )
556
 
557
  with gr.Blocks() as demo:
558
  gr.Markdown(
559
  """
560
  # 🏞 AI Video Composer
561
- Compose new videos from your assets using natural language. Add video, image and audio assets and let [DeepSeek-V3](https://huggingface.co/deepseek-ai/DeepSeek-V3-0324) generate a new video for you (using FFMPEG).
562
  """,
563
  elem_id="header",
564
  )
@@ -570,17 +264,11 @@ with gr.Blocks() as demo:
570
  file_types=allowed_medias,
571
  )
572
  user_prompt = gr.Textbox(
573
- placeholder="eg: Remove the 3 first seconds of the video",
574
  label="Instructions",
575
- lines=3,
576
  )
577
  btn = gr.Button("Run")
578
  with gr.Accordion("Parameters", open=False):
579
- model_choice = gr.Radio(
580
- choices=list(MODELS.keys()),
581
- value=list(MODELS.keys())[0],
582
- label="Model",
583
- )
584
  top_p = gr.Slider(
585
  minimum=-0,
586
  maximum=1.0,
@@ -605,7 +293,7 @@ with gr.Blocks() as demo:
605
 
606
  btn.click(
607
  fn=update,
608
- inputs=[user_files, user_prompt, top_p, temperature, model_choice],
609
  outputs=[generated_video, generated_command],
610
  )
611
  with gr.Row():
@@ -616,32 +304,37 @@ with gr.Blocks() as demo:
616
  "Use the image as the background with a waveform visualization for the audio positioned in center of the video.",
617
  0.7,
618
  0.1,
619
- list(MODELS.keys())[0],
620
- ],
621
- [
622
- ["./examples/ai_talk.wav", "./examples/bg-image.png"],
623
- "Use the image as the background with a waveform visualization for the audio positioned in center of the video. Make sure the waveform has a max height of 250 pixels.",
624
- 0.7,
625
- 0.1,
626
- list(MODELS.keys())[0],
627
  ],
628
  [
629
  [
 
630
  "./examples/cat1.jpeg",
631
  "./examples/cat2.jpeg",
632
  "./examples/cat3.jpeg",
633
  "./examples/cat4.jpeg",
634
  "./examples/cat5.jpeg",
635
  "./examples/cat6.jpeg",
 
636
  "./examples/heat-wave.mp3",
637
  ],
638
- "Create a 3x2 grid of the cat images with the audio as background music. Make the video duration match the audio duration.",
 
 
 
 
 
 
 
 
 
 
 
 
639
  0.7,
640
  0.1,
641
- list(MODELS.keys())[0],
642
  ],
643
  ],
644
- inputs=[user_files, user_prompt, top_p, temperature, model_choice],
645
  outputs=[generated_video, generated_command],
646
  fn=update,
647
  run_on_click=True,
@@ -657,9 +350,5 @@ with gr.Blocks() as demo:
657
  """,
658
  )
659
 
660
- # Launch MCP interface for tool access
661
- mcp_interface.queue(default_concurrency_limit=200)
662
-
663
- # Launch main demo
664
  demo.queue(default_concurrency_limit=200)
665
- demo.launch(show_api=False, ssr_mode=False, mcp_server=True)
 
1
  import gradio as gr
 
2
 
3
  from PIL import Image
4
  from moviepy.editor import VideoFileClip, AudioFileClip
 
12
  import shlex
13
  import shutil
14
 
15
+ HF_API_KEY = os.environ["HF_TOKEN"]
16
+
17
+ client = OpenAI(base_url="https://api-inference.huggingface.co/v1/", api_key=HF_API_KEY)
 
 
 
 
 
 
 
 
 
 
 
18
 
19
  allowed_medias = [
20
  ".png",
21
  ".jpg",
 
22
  ".jpeg",
23
  ".tiff",
24
  ".bmp",
 
84
  return results
85
 
86
 
87
+ def get_completion(prompt, files_info, top_p, temperature):
 
 
 
 
 
 
 
 
 
88
  # Create table header
89
  files_info_string = "| Type | Name | Dimensions | Duration | Audio Channels |\n"
90
  files_info_string += "|------|------|------------|-----------|--------|\n"
 
103
 
104
  files_info_string += f"| {file_info['type']} | {file_info['name']} | {dimensions} | {duration} | {audio} |\n"
105
 
106
+ messages = [
107
+ {
108
+ "role": "system",
109
+ "content": """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
110
  You are a very experienced media engineer, controlling a UNIX terminal.
111
  You are an FFMPEG expert with years of experience and multiple contributions to the FFMPEG project.
112
 
 
117
  Your objective is to generate the SIMPLEST POSSIBLE single ffmpeg command to create the requested video.
118
 
119
  Key requirements:
 
120
  - Use the absolute minimum number of ffmpeg options needed
121
  - Avoid complex filter chains or filter_complex if possible
122
  - Prefer simple concatenation, scaling, and basic filters
 
127
  - For image sequences: Use -framerate and pattern matching (like 'img%d.jpg') when possible, falling back to individual image processing with -loop 1 and appropriate filters only when necessary.
128
  - When showing file operations or commands, always use explicit paths and filenames without wildcards - avoid using asterisk (*) or glob patterns. Instead, use specific numbered sequences (like %d), explicit file lists, or show the full filename.
129
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
130
  Remember: Simpler is better. Only use advanced ffmpeg features if absolutely necessary for the requested output.
131
  """,
132
+ },
133
+ {
134
+ "role": "user",
135
+ "content": f"""Always output the media as video/mp4 and output file with "output.mp4". Provide only the shell command without any explanations.
136
+ The current assets and objective follow. Reply with the FFMPEG command:
137
+
138
+ AVAILABLE ASSETS LIST:
139
+
140
+ {files_info_string}
141
+
142
+ OBJECTIVE: {prompt} and output at "output.mp4"
143
+ YOUR FFMPEG COMMAND:
144
+ """,
145
+ },
146
+ ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
147
  try:
148
  # Print the complete prompt
149
  print("\n=== COMPLETE PROMPT ===")
 
152
  print(msg["content"])
153
  print("=====================\n")
154
 
 
 
 
 
 
 
 
 
155
  completion = client.chat.completions.create(
156
+ model="Qwen/Qwen2.5-Coder-32B-Instruct",
157
  messages=messages,
158
  temperature=temperature,
159
  top_p=top_p,
160
  max_tokens=2048,
161
  )
162
  content = completion.choices[0].message.content
 
 
163
  # Extract command from code block if present
164
+ if "```" in content:
165
+ # Find content between ```sh or ```bash and the next ```
166
+ import re
167
+
168
+ command = re.search(r"```(?:sh|bash)?\n(.*?)\n```", content, re.DOTALL)
 
 
 
 
 
 
 
 
 
 
 
169
  if command:
170
+ command = command.group(1).strip()
171
+ else:
172
+ command = content.replace("\n", "")
173
+ else:
174
+ command = content.replace("\n", "")
175
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
176
  # remove output.mp4 with the actual output file path
177
  command = command.replace("output.mp4", "")
178
 
179
+ return command
 
 
 
 
 
 
180
  except Exception as e:
181
  raise Exception("API Error")
182
 
183
 
184
+ def update(files, prompt, top_p=1, temperature=1):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
185
  if prompt == "":
186
  raise gr.Error("Please enter a prompt.")
187
 
 
193
  raise gr.Error(
194
  "Please make sure all videos are less than 2 minute long."
195
  )
196
+ if file_info["size"] > 10000000:
197
+ raise gr.Error("Please make sure all files are less than 10MB in size.")
198
 
199
  attempts = 0
 
 
 
 
 
200
  while attempts < 2:
201
+ print("ATTEMPT", attempts)
202
  try:
203
+ command_string = get_completion(prompt, files_info, top_p, temperature)
 
 
 
 
 
 
 
 
 
204
  print(
205
+ f"""///PROMTP {prompt} \n\n/// START OF COMMAND ///:\n\n{command_string}\n\n/// END OF COMMAND ///\n\n"""
206
  )
207
 
208
  # split command string into list of arguments
 
217
  shutil.copy(file_path, Path(temp_dir) / sanitized_name)
218
 
219
  # test if ffmpeg command is valid dry run
220
+ ffmpg_dry_run = subprocess.run(
221
  args + ["-f", "null", "-"],
222
  stderr=subprocess.PIPE,
223
  text=True,
224
  cwd=temp_dir,
225
  )
226
+ if ffmpg_dry_run.returncode == 0:
 
 
 
 
227
  print("Command is valid.")
 
 
 
 
 
 
 
 
228
  else:
229
  print("Command is not valid. Error output:")
230
+ print(ffmpg_dry_run.stderr)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
231
  raise Exception(
232
+ "FFMPEG generated command is not valid. Please try something else."
233
  )
234
 
235
  output_file_name = f"output_{uuid.uuid4()}.mp4"
236
  output_file_path = str((Path(temp_dir) / output_file_name).resolve())
237
+ final_command = args + ["-y", output_file_path]
238
+ print(
239
+ f"\n=== EXECUTING FFMPEG COMMAND ===\nffmpeg {' '.join(final_command[1:])}\n"
240
+ )
241
+ subprocess.run(final_command, cwd=temp_dir)
242
+ generated_command = f"### Generated Command\n```bash\nffmpeg {' '.join(args[1:])} -y output.mp4\n```"
243
+ return output_file_path, gr.update(value=generated_command)
244
  except Exception as e:
245
  attempts += 1
246
  if attempts >= 2:
247
  print("FROM UPDATE", e)
248
+ raise gr.Error(e)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
249
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
250
 
251
  with gr.Blocks() as demo:
252
  gr.Markdown(
253
  """
254
  # 🏞 AI Video Composer
255
+ Compose new videos from your assets using natural language. Add video, image and audio assets and let [Qwen2.5-Coder](https://huggingface.co/Qwen/Qwen2.5-Coder-32B-Instruct) generate a new video for you (using FFMPEG).
256
  """,
257
  elem_id="header",
258
  )
 
264
  file_types=allowed_medias,
265
  )
266
  user_prompt = gr.Textbox(
267
+ placeholder="I want to convert to a gif under 15mb",
268
  label="Instructions",
 
269
  )
270
  btn = gr.Button("Run")
271
  with gr.Accordion("Parameters", open=False):
 
 
 
 
 
272
  top_p = gr.Slider(
273
  minimum=-0,
274
  maximum=1.0,
 
293
 
294
  btn.click(
295
  fn=update,
296
+ inputs=[user_files, user_prompt, top_p, temperature],
297
  outputs=[generated_video, generated_command],
298
  )
299
  with gr.Row():
 
304
  "Use the image as the background with a waveform visualization for the audio positioned in center of the video.",
305
  0.7,
306
  0.1,
 
 
 
 
 
 
 
 
307
  ],
308
  [
309
  [
310
+ "./examples/cat8.jpeg",
311
  "./examples/cat1.jpeg",
312
  "./examples/cat2.jpeg",
313
  "./examples/cat3.jpeg",
314
  "./examples/cat4.jpeg",
315
  "./examples/cat5.jpeg",
316
  "./examples/cat6.jpeg",
317
+ "./examples/cat7.jpeg",
318
  "./examples/heat-wave.mp3",
319
  ],
320
+ "Generate an MP4 slideshow where each photo appears for 2 seconds, using the provided audio as soundtrack.",
321
+ 0.7,
322
+ 0.1,
323
+ ],
324
+ [
325
+ ["./examples/waterfall-overlay.png", "./examples/waterfall.mp4"],
326
+ "Add the overlay to the video.",
327
+ 0.7,
328
+ 0.1,
329
+ ],
330
+ [
331
+ ["./examples/example.mp4"],
332
+ "Make this video 10 times faster",
333
  0.7,
334
  0.1,
 
335
  ],
336
  ],
337
+ inputs=[user_files, user_prompt, top_p, temperature],
338
  outputs=[generated_video, generated_command],
339
  fn=update,
340
  run_on_click=True,
 
350
  """,
351
  )
352
 
 
 
 
 
353
  demo.queue(default_concurrency_limit=200)
354
+ demo.launch(show_api=False, ssr_mode=False)
requirements.txt CHANGED
@@ -1,3 +1,3 @@
1
  openai>=1.55.0
2
- gradio==5.34.1
3
  moviepy==1
 
1
  openai>=1.55.0
2
+ gradio==5.6.0
3
  moviepy==1