import json
import os
import shutil
import subprocess
import sys
import time
import math
import cv2
import requests
from pydub import AudioSegment
import numpy as np
from dotenv import load_dotenv
import gradio as gr
from gradio_client import Client, file
# Function to get a friendly name from an audio file name
def get_friendly_name(filename):
    """Return a display name for an audio file.

    The extension is dropped and the remainder is passed through
    ``str.capitalize`` (first character upper-cased, the rest lower-cased).

    Args:
        filename: File name such as ``"narrator_bob.mp3"``.

    Returns:
        The capitalized stem, e.g. ``"Narrator_bob"``.
    """
    stem, _extension = os.path.splitext(filename)
    return stem.capitalize()
# Get audio files and their friendly names.
# Each entry is a (friendly_name, file_name) pair; the listing is sorted so the
# voice dropdown has a stable order across runs, and extensions are matched
# case-insensitively so "VOICE.MP3" is picked up too.
audio_files_dir = "audio_folder"  # Path to your audio folder
audio_files = [
    (get_friendly_name(f), f)
    for f in sorted(os.listdir(audio_files_dir))
    if f.lower().endswith((".mp3", ".wav"))
]

# Load environment variables
load_dotenv()
# NOTE(review): LEMONFOX_API_KEY is read by generate() and generate_video() but
# was never assigned in this file; the environment variable name used here is
# assumed to match the constant's name -- confirm against your .env file.
LEMONFOX_API_KEY = os.getenv("LEMONFOX_API_KEY")
def parse(narration):
    """Split a generated script into scene elements and narration lines.

    Lines starting with ``Narrator: `` become text elements (surrounding
    double quotes removed); lines starting with ``[`` become image elements
    (square brackets removed). Every other line is ignored.

    Args:
        narration: The raw script, one element per line.

    Returns:
        A ``(data, narrations)`` tuple. ``data`` is the ordered list of
        ``{"type": "text", "content": ...}`` and
        ``{"type": "image", "description": ...}`` dicts; ``narrations`` is the
        list of narration sentences in order of appearance.
    """
    prefix = 'Narrator: '
    data = []
    narrations = []
    for raw_line in narration.split("\n"):
        # Tolerate indentation, trailing spaces and "\r" from CRLF responses.
        line = raw_line.strip()
        if line.startswith(prefix):
            # Slice the prefix off instead of str.replace so a sentence that
            # itself contains "Narrator: " is left intact.
            text = line[len(prefix):].strip().strip('"')
            data.append({
                "type": "text",
                "content": text,
            })
            narrations.append(text)
        elif line.startswith('['):
            background = line.strip('[]')
            data.append({
                "type": "image",
                "description": background,
            })
    return data, narrations
def create(data, output_folder, audio_file):
    """Generate one narration audio file per text element via the voice-clone Space.

    Files are written to ``output_folder`` as ``narration_1.wav``,
    ``narration_2.wav``, ... in the order the text elements appear in ``data``.

    Args:
        data: Parsed script elements; only those with ``type == "text"`` are used.
        output_folder: Directory that receives the narration files (created if
            missing).
        audio_file: File name, inside ``audio_folder``, of the reference voice.
    """
    os.makedirs(output_folder, exist_ok=True)

    # Initialize Gradio Client
    client = Client("tonyassi/voice-clone")
    audio_files_dir = "audio_folder"  # Path to your audio folder
    audio_file_path = os.path.join(audio_files_dir, audio_file)

    # An explicit counter keeps the numbering aligned with the narration index
    # even if the folder already holds unrelated files.
    narration_number = 0
    for element in data:
        if element["type"] != "text":
            continue
        narration_number += 1

        # Make prediction using the provided API
        # NOTE(review): only the `audio=` argument was visible in the damaged
        # source; `text=` and `api_name=` are reconstructed -- confirm against
        # the Space's "Use via API" page.
        result = client.predict(
            text=element["content"],
            audio=file(audio_file_path),  # Include reference style audio for API
            api_name="/predict",
        )

        # Move the response audio file to the output folder
        response_file_path = os.path.join(output_folder, f"narration_{narration_number}.wav")
        shutil.move(result, response_file_path)
        print(f"Audio file generated for '{element['content']}' saved at: {response_file_path}")
def generate(prompt, output_file, size="576x1024"):
    """Request one image for ``prompt`` and save it to ``output_file``.

    Failures are reported with ``print`` and never raised, so one bad prompt
    does not abort the caller's loop.

    Args:
        prompt: Text description of the image.
        output_file: Destination path for the downloaded image bytes.
        size: Image size string sent to the API.
    """
    url = 'https://api.lemonfox.ai/v1/images/generations'
    headers = {
        # Same "Bearer" scheme as the chat-completions request in this file.
        'Authorization': f'Bearer {LEMONFOX_API_KEY}',
        'Content-Type': 'application/json'
    }
    data = {
        'prompt': prompt,
        'size': size,
        'n': 1
    }
    try:
        response = requests.post(url, json=data, headers=headers)
        if not response.ok:
            print(f"Failed to generate image for prompt: {prompt}. Status Code: {response.status_code}")
            return

        response_data = response.json()
        if 'data' not in response_data or len(response_data['data']) == 0:
            print(f"No image data found for prompt: {prompt}")
            return

        image_url = response_data['data'][0]['url']
        image_response = requests.get(image_url)
        if not image_response.ok:
            # Do not save an error page under an image file name.
            print(f"Failed to download image for prompt: {prompt}. Status Code: {image_response.status_code}")
            return

        with open(output_file, 'wb') as f:
            f.write(image_response.content)
    except Exception as e:
        # Deliberately broad: image generation is best-effort.
        print(f"Error occurred while processing prompt: {prompt}")
        print(e)
def create_from_data(data, output_dir):
    """Generate one image per image element of ``data``.

    Images are saved in ``output_dir`` as ``image_1.webp``, ``image_2.webp``,
    ... in the order the image elements appear.

    Args:
        data: Parsed script elements; only those with ``type == "image"`` are
            used, and their ``"description"`` is the generation prompt.
        output_dir: Directory that receives the images (created if missing).
    """
    os.makedirs(output_dir, exist_ok=True)

    image_number = 0
    for element in data:
        if element["type"] != "image":
            continue
        image_number += 1
        image_name = f"image_{image_number}.webp"
        generate(element["description"], os.path.join(output_dir, image_name))
def get_audio_duration(audio_file):
    """Return the length of ``audio_file`` as reported by ``len(AudioSegment)``.

    Callers in this file divide the result by 1000 before multiplying by the
    frame rate, i.e. they treat it as milliseconds.

    Args:
        audio_file: Path to an audio file readable by pydub.
    """
    return len(AudioSegment.from_file(audio_file))
def resize_image(image, width, height):
    """Scale ``image`` to fit inside ``width`` x ``height``, keeping its aspect ratio.

    Args:
        image: Array whose ``shape[0]`` is the height and ``shape[1]`` the width.
        width: Maximum output width in pixels.
        height: Maximum output height in pixels.

    Returns:
        The resized image from ``cv2.resize``.
    """
    aspect_ratio = image.shape[1] / image.shape[0]
    if aspect_ratio > (width / height):
        # Relatively wider than the target box: width is the limiting side.
        new_width = width
        new_height = int(width / aspect_ratio)
    else:
        # Relatively taller (or equal): height is the limiting side.
        new_height = height
        new_width = int(height * aspect_ratio)
    # Extreme aspect ratios could truncate a side to 0, which cv2.resize rejects.
    new_width = max(new_width, 1)
    new_height = max(new_height, 1)
    return cv2.resize(image, (new_width, new_height))
def write_text(text, frame, video_writer):
    """Draw ``text`` centered on ``frame`` and write the frame to ``video_writer``.

    The text is drawn twice: first in black with a thicker stroke, then in
    white on top, which yields white lettering with a black outline.

    Args:
        text: The caption to draw.
        frame: Image array to draw on (modified by ``cv2.putText``).
        video_writer: Object with a ``write(frame)`` method.
    """
    # NOTE(review): `font` was referenced but never defined in the damaged
    # source; FONT_HERSHEY_SIMPLEX is assumed.
    font = cv2.FONT_HERSHEY_SIMPLEX
    white_color = (255, 255, 255)
    black_color = (0, 0, 0)
    thickness = 10
    font_scale = 3
    border = 5

    # Center the text box within the frame.
    text_size = cv2.getTextSize(text, font, font_scale, thickness)[0]
    text_x = (frame.shape[1] - text_size[0]) // 2
    text_y = (frame.shape[0] + text_size[1]) // 2
    org = (text_x, text_y)

    frame = cv2.putText(frame, text, org, font, font_scale, black_color, thickness + border * 2, cv2.LINE_AA)
    frame = cv2.putText(frame, text, org, font, font_scale, white_color, thickness, cv2.LINE_AA)

    video_writer.write(frame)
def add_narration_to_video(narrations, input_video, output_dir, output_file, text_color, text_position):
    """Burn word-by-word captions into a video and mux in the narration audio.

    For each narration the matching ``narrations/narration_<n>.wav`` is loaded;
    each word is shown for a share of the audio length proportional to its
    character count. The captioned video is written to
    ``with_transcript.avi``, the concatenated audio to ``narration.wav``, and
    ffmpeg combines both into ``output_file`` (all inside ``output_dir``).

    Args:
        narrations: Narration sentences, in playback order.
        input_video: Path of the silent source video.
        output_dir: Working directory holding ``narrations/`` and the outputs.
        output_file: File name of the final video inside ``output_dir``.
        text_color: Accepted for interface compatibility; currently unused.
        text_position: Accepted for interface compatibility; currently unused.
    """
    offset = 50  # ms trimmed from the first word's display time
    fps = 30

    cap = cv2.VideoCapture(input_video)
    fourcc = cv2.VideoWriter_fourcc(*'XVID')
    temp_video = os.path.join(output_dir, "with_transcript.avi")
    frame_size = (int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)), int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)))
    out = cv2.VideoWriter(temp_video, fourcc, fps, frame_size)

    full_narration = AudioSegment.empty()
    try:
        for i, narration in enumerate(narrations):
            audio = os.path.join(output_dir, "narrations", f"narration_{i+1}.wav")
            # Decode once and reuse the segment for both length and concatenation.
            segment = AudioSegment.from_file(audio)
            duration = len(segment)
            narration_frames = math.floor(duration / 1000 * fps)
            full_narration += segment

            char_count = len(narration.replace(" ", ""))
            # An empty narration has no characters to distribute time over.
            ms_per_char = duration / char_count if char_count else 0

            frames_written = 0
            for w, word in enumerate(narration.split(" ")):
                word_ms = len(word) * ms_per_char
                if i == 0 and w == 0:
                    word_ms = max(word_ms - offset, 0)
                for _ in range(math.floor(word_ms / 1000 * fps)):
                    ret, frame = cap.read()
                    if not ret:
                        break
                    write_text(word, frame, out)
                    frames_written += 1

            # Per-word flooring loses frames; pad with uncaptioned frames so
            # the video stays in step with the audio.
            for _ in range(narration_frames - frames_written):
                ret, frame = cap.read()
                if not ret:
                    break
                out.write(frame)

        # Copy whatever is left of the source video unchanged.
        while out.isOpened():
            ret, frame = cap.read()
            if not ret:
                break
            out.write(frame)
    finally:
        # The writer must be released before ffmpeg reads the file.
        cap.release()
        out.release()

    temp_narration = os.path.join(output_dir, "narration.wav")
    full_narration.export(temp_narration, format="wav")

    # NOTE(review): the executable name and '-y' (overwrite without prompting)
    # were missing from the damaged source and are reconstructed -- confirm.
    ffmpeg_command = [
        'ffmpeg',
        '-y',
        '-i', temp_video,
        '-i', temp_narration,
        '-map', '0:v',
        '-map', '1:a',
        '-c:v', 'copy',
        '-c:a', 'aac',
        '-strict', 'experimental',
        os.path.join(output_dir, output_file)
    ]
    result = subprocess.run(ffmpeg_command, capture_output=True)
    if result.returncode != 0:
        # Output is captured, so surface the error instead of failing silently.
        print(f"ffmpeg exited with code {result.returncode}: {result.stderr.decode(errors='replace')}")
def create_video(narrations, output_dir, output_file, text_color, text_position):  # Add text_color and text_position parameters here
    """Build the slideshow video from generated images, then add narration.

    Each ``images/image_<n>.webp`` is held on screen for the length of
    ``narrations/narration_<n>.wav`` (less the cross-fade time where a fade
    applies) and then cross-faded into the next image; the last image fades
    back into the first. The silent result is written to ``temp_video.avi``
    and passed to ``add_narration_to_video``.

    Args:
        narrations: Narration sentences, forwarded for captioning.
        output_dir: Working directory holding ``images/`` and ``narrations/``.
        output_file: File name of the final video inside ``output_dir``.
        text_color: Forwarded to ``add_narration_to_video``.
        text_position: Forwarded to ``add_narration_to_video``.
    """
    width, height = 1080, 1920
    frame_rate = 30
    fade_time = 1000  # cross-fade length; same unit as get_audio_duration()

    fourcc = cv2.VideoWriter_fourcc(*'XVID')
    temp_video = os.path.join(output_dir, "temp_video.avi")
    out = cv2.VideoWriter(temp_video, fourcc, frame_rate, (width, height))

    images_dir = os.path.join(output_dir, "images")
    image_count = len(os.listdir(images_dir))
    fade_frames = math.floor(fade_time / 1000 * frame_rate)

    def place_on_canvas(image):
        # Top-left aligned on a black canvas; slicing by the image's own size
        # tolerates an image narrower than the canvas.
        canvas = np.zeros((height, width, 3), dtype=np.uint8)
        canvas[:image.shape[0], :image.shape[1]] = image
        return canvas

    try:
        for i in range(image_count):
            image1 = cv2.imread(os.path.join(images_dir, f"image_{i+1}.webp"))
            if i + 1 < image_count:
                image2 = cv2.imread(os.path.join(images_dir, f"image_{i+2}.webp"))
            else:
                # Last image fades back into the first one.
                image2 = cv2.imread(os.path.join(images_dir, "image_1.webp"))

            image1 = resize_image(image1, width, height)
            image2 = resize_image(image2, width, height)

            narration = os.path.join(output_dir, "narrations", f"narration_{i+1}.wav")
            duration = get_audio_duration(narration)
            if i > 0:
                duration -= fade_time
            if i == image_count - 1:
                duration -= fade_time

            # The still frame is identical for the whole hold, so build it once.
            still_frame = place_on_canvas(image1)
            for _ in range(math.floor(duration / 1000 * frame_rate)):
                out.write(still_frame)

            for alpha in np.linspace(0, 1, fade_frames):
                blended_image = cv2.addWeighted(image1, 1 - alpha, image2, alpha, 0)
                out.write(place_on_canvas(blended_image))
    finally:
        # Finalize the file before it is read back for captioning.
        out.release()

    add_narration_to_video(narrations, temp_video, output_dir, output_file, text_color, text_position)  # Pass text_color and text_position here
def generate_video(topic, voice_choice):
    """Generate a narrated short video about ``topic`` using the chosen voice.

    Steps: request a script from the chat API, parse it, synthesize the
    narrations, generate the images, assemble the video, then delete the
    intermediate script, JSON, narration and image files.

    Args:
        topic: Subject of the video; also used to derive the output file name.
        voice_choice: Friendly voice name as listed in ``audio_files``.

    Returns:
        Path of the generated ``.avi`` file, or ``None`` if the script request
        did not return HTTP 200.

    Raises:
        ValueError: If ``voice_choice`` matches no entry in ``audio_files``.
    """
    # Extract the voice file based on voice_choice
    voice_file = next((path for name, path in audio_files if name == voice_choice), None)
    if voice_file is None:
        raise ValueError(f"Unknown voice: {voice_choice!r}")

    short_id = str(int(time.time()))
    basedir = os.path.join("shorts", short_id)
    os.makedirs(basedir, exist_ok=True)

    filename = topic.replace("_", " ").replace("/", "_").replace(".", "_")
    output_file = f"{filename}.avi"

    chat_url = 'https://api.lemonfox.ai/v1/chat/completions'
    headers = {
        'Authorization': f'Bearer {LEMONFOX_API_KEY}',
        'Content-Type': 'application/json'
    }
    prompt = (
        f"make a short video on: \n\n{topic} Generate 30 seconds to 1 minute of video. "
        "You will need to generate a very short description of images for each of the sentences. "
        "They will be used for background images. "
        "Note that the script will be fed into a text-to-speech engine, so dont use special characters. "
        "Respond with a pair of an image description in square brackets and a script below it. "
        "Both of them should be on their own lines, as follows: ###\n"
        "[Description of a background image]\n"
        'Narrator: "One sentence of narration"\n'
        "### The short should be 6 sentences maximum."
    )
    payload = {
        "model": "mixtral-chat",
        "messages": [
            {
                "role": "system",
                "content": "You are a YouTube short video creator."
            },
            {
                "role": "user",
                "content": prompt
            }
        ]
    }

    response = requests.post(chat_url, json=payload, headers=headers)
    if response.status_code != 200:
        print(f"Failed to generate script for source material: {topic}. Status Code: {response.status_code}")
        return None

    response_text = response.json()['choices'][0]['message']['content']
    # Normalize typographic punctuation to ASCII for the text-to-speech step.
    # Escapes are used so the table cannot be corrupted by a file re-encoding
    # (the left double quote had become mojibake in the source).
    ascii_punctuation = str.maketrans({
        "\u2019": "'",    # right single quotation mark
        "`": "'",
        "\u2026": "...",  # horizontal ellipsis
        "\u201c": '"',    # left double quotation mark
        "\u201d": '"',    # right double quotation mark
    })
    response_text = response_text.translate(ascii_punctuation)

    with open(os.path.join(basedir, "response.txt"), "a", encoding="utf-8") as f:
        f.write(response_text + "\n")

    data, narrations = parse(response_text)
    # "w": an appended second JSON document would make the file unparseable.
    with open(os.path.join(basedir, "data.json"), "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False)

    print(f"Generating narration for: {topic}...")
    create(data, os.path.join(basedir, "narrations"), voice_file)

    print("Generating images...")
    create_from_data(data, os.path.join(basedir, "images"))

    print("Generating video...")
    create_video(narrations, basedir, output_file, text_color="white", text_position="center")  # Pass text_color and text_position here

    print("Deleting files and folders...")
    os.remove(os.path.join(basedir, "response.txt"))
    os.remove(os.path.join(basedir, "data.json"))
    shutil.rmtree(os.path.join(basedir, "narrations"))
    shutil.rmtree(os.path.join(basedir, "images"))

    print(f"DONE! Here's your video: {os.path.join(basedir, output_file)}")
    return os.path.join(basedir, output_file)
iface = gr.Interface(
inputs=["text", gr.Dropdown(choices=[name for name, _ in audio_files], label="Select Voice")],
css="footer {visibility: hidden}",
description="Generate a free short video. Best for YouTube Shorts, Instagram Reels or TikTok. This is a prototype."
title="Text to Short Video Free"