import json
import os
import shutil
import subprocess
import time
import math

import cv2
import requests
from pydub import AudioSegment
import numpy as np
from dotenv import load_dotenv
import gradio as gr
from gradio_client import Client, file


# Get a friendly display name from an audio file name
def get_friendly_name(filename):
    return os.path.splitext(filename)[0].capitalize()


# Collect the available voices and their friendly names
audio_files_dir = "audio_folder"  # Path to your audio folder
audio_files = [
    (get_friendly_name(f), f)
    for f in os.listdir(audio_files_dir)
    if f.endswith(".mp3") or f.endswith(".wav")
]

# Load environment variables
load_dotenv(override=True)
LEMONFOX_API_KEY = os.getenv("LEMONFOX_API_KEY")


def parse(narration):
    """Split the model's response into image descriptions and narration lines."""
    data = []
    narrations = []
    lines = narration.split("\n")
    for line in lines:
        if line.startswith('Narrator: '):
            text = line.replace('Narrator: ', '')
            data.append({
                "type": "text",
                "content": text.strip('"'),
            })
            narrations.append(text.strip('"'))
        elif line.startswith('['):
            background = line.strip('[]')
            data.append({
                "type": "image",
                "description": background,
            })
    return data, narrations


def create(data, output_folder, audio_file):
    """Synthesize one narration audio file per text element."""
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    # Initialize the Gradio client for the voice-cloning Space
    client = Client("tonyassi/voice-clone")
    audio_files_dir = "audio_folder"  # Path to your audio folder

    for element in data:
        if element["type"] != "text":
            continue

        # Make a prediction using the provided API
        audio_file_path = os.path.join(audio_files_dir, audio_file)
        result = client.predict(
            text=element["content"],
            audio=file(audio_file_path),  # Reference-style audio for the API
        )

        # Move the response audio file into the output folder
        response_file_path = os.path.join(
            output_folder, f"narration_{len(os.listdir(output_folder)) + 1}.wav"
        )
        shutil.move(result, response_file_path)
        print(f"Audio file generated for '{element['content']}' saved at: {response_file_path}")


def generate(prompt, output_file, size="576x1024"):
    """Generate one background image for a prompt via the Lemonfox API."""
    url = 'https://api.lemonfox.ai/v1/images/generations'
    headers = {
        'Authorization': f'Bearer {LEMONFOX_API_KEY}',
        'Content-Type': 'application/json'
    }
    data = {
        'prompt': prompt,
        'size': size,
        'n': 1
    }
    try:
        response = requests.post(url, json=data, headers=headers)
        if response.ok:
            response_data = response.json()
            if 'data' in response_data and len(response_data['data']) > 0:
                image_info = response_data['data'][0]
                image_url = image_info['url']
                image_response = requests.get(image_url)
                with open(output_file, 'wb') as f:
                    f.write(image_response.content)
            else:
                print(f"No image data found for prompt: {prompt}")
        else:
            print(f"Failed to generate image for prompt: {prompt}. "
                  f"Status Code: {response.status_code}")
    except Exception as e:
        print(f"Error occurred while processing prompt: {prompt}")
        print(str(e))
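
# A minimal sketch of exercising the pieces above on their own. The sample
# narration text and paths here are hypothetical, for illustration only:
#
#   data, narrations = parse('[A city skyline at night]\nNarrator: "Hello."')
#   create(data, "shorts/demo/narrations", "sample_voice.mp3")
#   generate("A city skyline at night", "shorts/demo/image_1.webp")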
def create_from_data(data, output_dir):
    """Generate one background image per image element."""
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    image_number = 0
    for element in data:
        if element["type"] != "image":
            continue
        image_number += 1
        image_name = f"image_{image_number}.webp"
        generate(element["description"], os.path.join(output_dir, image_name))


def get_audio_duration(audio_file):
    # pydub reports the length in milliseconds
    return len(AudioSegment.from_file(audio_file))


def resize_image(image, width, height):
    """Resize an image to fit within width x height, preserving aspect ratio."""
    aspect_ratio = image.shape[1] / image.shape[0]
    if aspect_ratio > (width / height):
        new_width = width
        new_height = int(width / aspect_ratio)
    else:
        new_height = height
        new_width = int(height * aspect_ratio)
    return cv2.resize(image, (new_width, new_height))


def write_text(text, frame, video_writer):
    """Draw one caption word centered on the frame and write it out.

    Note: captions are currently hardcoded to centered white text; the
    text_color and text_position parameters accepted elsewhere are not yet
    wired through to this function.
    """
    font = cv2.FONT_HERSHEY_SIMPLEX
    white_color = (255, 255, 255)
    black_color = (0, 0, 0)
    thickness = 10
    font_scale = 3
    border = 5

    text_size = cv2.getTextSize(text, font, font_scale, thickness)[0]
    text_x = (frame.shape[1] - text_size[0]) // 2
    text_y = (frame.shape[0] + text_size[1]) // 2
    org = (text_x, text_y)

    # Draw a black outline first, then the white text on top of it
    frame = cv2.putText(frame, text, org, font, font_scale, black_color,
                        thickness + border * 2, cv2.LINE_AA)
    frame = cv2.putText(frame, text, org, font, font_scale, white_color,
                        thickness, cv2.LINE_AA)
    video_writer.write(frame)


def add_narration_to_video(narrations, input_video, output_dir, output_file,
                           text_color, text_position):
    """Overlay word-by-word captions on the video and mux in the narration."""
    frame_rate = 60
    offset = 50  # Shave a few ms off the first word so captions and audio align
    cap = cv2.VideoCapture(input_video)
    fourcc = cv2.VideoWriter_fourcc(*'XVID')
    temp_video = os.path.join(output_dir, "with_transcript.avi")
    out = cv2.VideoWriter(temp_video, fourcc, frame_rate,
                          (int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)),
                           int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))))

    full_narration = AudioSegment.empty()
    for i, narration in enumerate(narrations):
        audio = os.path.join(output_dir, "narrations", f"narration_{i+1}.wav")
        duration = get_audio_duration(audio)
        # Convert milliseconds to frames at the writer's frame rate
        narration_frames = math.floor(duration / 1000 * frame_rate)
        full_narration += AudioSegment.from_file(audio)

        # Apportion the narration's duration across its words by character count
        char_count = len(narration.replace(" ", ""))
        ms_per_char = duration / char_count
        frames_written = 0
        words = narration.split(" ")
        for w, word in enumerate(words):
            word_ms = len(word) * ms_per_char
            if i == 0 and w == 0:
                word_ms -= offset
                if word_ms < 0:
                    word_ms = 0
            for _ in range(math.floor(word_ms / 1000 * frame_rate)):
                ret, frame = cap.read()
                if not ret:
                    break
                write_text(word, frame, out)
                frames_written += 1

        # Pad out any frames the per-word rounding left uncaptioned
        for _ in range(narration_frames - frames_written):
            ret, frame = cap.read()
            if not ret:
                break
            out.write(frame)

    # Copy any remaining frames through without captions
    while out.isOpened():
        ret, frame = cap.read()
        if not ret:
            break
        out.write(frame)

    temp_narration = os.path.join(output_dir, "narration.wav")
    full_narration.export(temp_narration, format="wav")

    cap.release()
    out.release()
    cv2.destroyAllWindows()

    # Mux the captioned video with the stitched narration audio
    ffmpeg_command = [
        'ffmpeg',
        '-y',
        '-i', temp_video,
        '-i', temp_narration,
        '-map', '0:v',
        '-map', '1:a',
        '-c:v', 'copy',
        '-c:a', 'aac',
        '-strict', 'experimental',
        os.path.join(output_dir, output_file),
    ]
    subprocess.run(ffmpeg_command, capture_output=True)

    os.remove(temp_video)
    os.remove(temp_narration)
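
# For reference, the subprocess call above is equivalent to running:
#
#   ffmpeg -y -i with_transcript.avi -i narration.wav \
#          -map 0:v -map 1:a -c:v copy -c:a aac -strict experimental out.avi
#
# i.e. the caption video stream is copied unchanged and the stitched
# narration is encoded to AAC.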
def create_video(narrations, output_dir, output_file, text_color, text_position):
    """Build a vertical slideshow with crossfades, then add captions and audio."""
    width, height = 1080, 1920
    frame_rate = 60
    fade_time = 2000  # Crossfade length in milliseconds

    fourcc = cv2.VideoWriter_fourcc(*'XVID')
    temp_video = os.path.join(output_dir, "temp_video.avi")
    out = cv2.VideoWriter(temp_video, fourcc, frame_rate, (width, height))

    image_paths = os.listdir(os.path.join(output_dir, "images"))
    image_count = len(image_paths)

    for i in range(image_count):
        image1 = cv2.imread(os.path.join(output_dir, "images", f"image_{i+1}.webp"))
        if i + 1 < image_count:
            image2 = cv2.imread(os.path.join(output_dir, "images", f"image_{i+2}.webp"))
        else:
            # Wrap around to the first image for the final crossfade
            image2 = cv2.imread(os.path.join(output_dir, "images", "image_1.webp"))

        image1 = resize_image(image1, width, height)
        image2 = resize_image(image2, width, height)

        narration = os.path.join(output_dir, "narrations", f"narration_{i+1}.wav")
        duration = get_audio_duration(narration)
        # Segments share crossfades with their neighbours, so trim the fade
        # time from the still portion
        if i > 0:
            duration -= fade_time
        if i == image_count - 1:
            duration -= fade_time

        # Hold the still image for the remaining duration (ms -> frames)
        for _ in range(math.floor(duration / 1000 * frame_rate)):
            vertical_video_frame = np.zeros((height, width, 3), dtype=np.uint8)
            vertical_video_frame[:image1.shape[0], :] = image1
            out.write(vertical_video_frame)

        # Crossfade into the next image
        for alpha in np.linspace(0, 1, math.floor(fade_time / 1000 * frame_rate)):
            blended_image = cv2.addWeighted(image1, 1 - alpha, image2, alpha, 0)
            vertical_video_frame = np.zeros((height, width, 3), dtype=np.uint8)
            vertical_video_frame[:image1.shape[0], :] = blended_image
            out.write(vertical_video_frame)

    out.release()
    cv2.destroyAllWindows()

    add_narration_to_video(narrations, temp_video, output_dir, output_file,
                           text_color, text_position)

    os.remove(temp_video)
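
# generate_video() below drives the whole pipeline. The chat prompt asks the
# model to reply in exactly the shape parse() expects, one image description
# per narration line:
#
#   [Description of a background image]
#   Narrator: "One sentence of narration"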
def generate_video(topic, voice_choice):
    short_id = str(int(time.time()))
    basedir = os.path.join("shorts", short_id)
    if not os.path.exists(basedir):
        os.makedirs(basedir)

    filename = topic.replace("_", " ").replace("/", "_").replace(".", "_")
    output_file = f"{filename}.avi"

    # Look up the audio file that matches the selected voice
    voice_file = [f for name, f in audio_files if name == voice_choice][0]

    chat_url = 'https://api.lemonfox.ai/v1/chat/completions'
    headers = {
        'Authorization': f'Bearer {LEMONFOX_API_KEY}',
        'Content-Type': 'application/json'
    }
    payload = {
        "model": "mixtral-chat",
        "messages": [
            {
                "role": "system",
                "content": "You are a YouTube short video creator."
            },
            {
                "role": "user",
                "content": f"""Make a short video on: \n\n{topic}

Generate 60 seconds to 1 minute of video. You will need to generate a very short description of images for each of the sentences. They will be used for background images. Note that the script will be fed into a text-to-speech engine, so do not use special characters.

Respond with a pair of an image description in square brackets and a script below it. Both of them should be on their own lines, as follows:

###

[Description of a background image]

Narrator: "One sentence of narration"

###

The short should be 6 sentences maximum."""
            }
        ]
    }

    response = requests.post(chat_url, json=payload, headers=headers)
    if response.status_code == 200:
        response_text = response.json()['choices'][0]['message']['content']
        # Normalize typographic punctuation so the TTS engine gets plain ASCII
        response_text = (response_text.replace("’", "'").replace("`", "'")
                         .replace("…", "...").replace("“", '"').replace("”", '"'))

        with open(os.path.join(basedir, "response.txt"), "a") as f:
            f.write(response_text + "\n")

        data, narrations = parse(response_text)
        with open(os.path.join(basedir, "data.json"), "a") as f:
            json.dump(data, f, ensure_ascii=False)
            f.write("\n")

        print(f"Generating narration for: {topic}...")
        create(data, os.path.join(basedir, "narrations"), voice_file)

        print("Generating images...")
        create_from_data(data, os.path.join(basedir, "images"))

        print("Generating video...")
        create_video(narrations, basedir, output_file,
                     text_color="white", text_position="center")

        print("Deleting files and folders...")
        os.remove(os.path.join(basedir, "response.txt"))
        os.remove(os.path.join(basedir, "data.json"))
        shutil.rmtree(os.path.join(basedir, "narrations"))
        shutil.rmtree(os.path.join(basedir, "images"))

        print(f"DONE! Here's your video: {os.path.join(basedir, output_file)}")
        return os.path.join(basedir, output_file)
    else:
        print(f"Failed to generate script for source material: {topic}. "
              f"Status Code: {response.status_code}")
        return None


iface = gr.Interface(
    fn=generate_video,
    inputs=[
        "text",
        gr.Dropdown(choices=[name for name, _ in audio_files], label="Select Voice"),
    ],
    outputs="video",
    css="footer {visibility: hidden}",
    description=(
        "Generate a free short video. Best for YouTube Shorts, Instagram Reels "
        "or TikTok. This is a prototype. If you want better software, please "
        "inbox or email me at aheedsajid@gmail.com, and do like and "
        "[Click here to Donate](https://nowpayments.io/donation/aheed)"
    ),
    title="Text to Short Video Free",
)

iface.launch()
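
# Gradio can also expose the app over a temporary public URL if needed:
#   iface.launch(share=True)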