# Hugging Face Space "get_rizzed" — app.py (PlayHT / PlayAI demo)
# Commit 53296c8 ("Update app.py"), originally ~5.71 kB
import os
import google.generativeai as genai
import gradio as gr
import requests
from moviepy.editor import AudioFileClip, ImageClip, CompositeVideoClip
from PIL import Image
# --- Service configuration -------------------------------------------------

# Google Gemini API key, used for the roast-text generation.
genai.configure(api_key=os.getenv("GEMINI_API_KEY"))

# Play.ht / PlayAI credentials for the TTS endpoint.
API_KEY = os.getenv('PLAY_API_KEY')
USER_ID = os.getenv('PLAY_USER_ID')

# moviepy still references Image.ANTIALIAS, which newer Pillow releases
# removed; alias it to LANCZOS (the documented replacement) so resizing
# keeps working.  NOTE: the pasted source had this assignment un-indented,
# which is an IndentationError — restored under the hasattr guard.
if not hasattr(Image, 'ANTIALIAS'):
    Image.ANTIALIAS = Image.LANCZOS

# Gradio theme for the UI.
theme = gr.themes.Base(
    primary_hue="emerald",
)
def upload_to_gemini(path, mime_type="image/jpeg"):
    """Upload a local file to the Gemini Files API.

    Args:
        path: Filesystem path of the image to upload.
        mime_type: MIME type to register the upload under (defaults to JPEG).

    Returns:
        The uploaded file handle, usable as a content part in a chat turn.
    """
    return genai.upload_file(path, mime_type=mime_type)
def generate_roast(image_path):
    """Generate a short two-host "compliment roast" script for an image.

    Uploads the image to Gemini, then asks the model for a dialogue whose
    turns are prefixed "Host 1:" / "Host 2:" so that text_to_speech() can
    split the script between its two configured voices.

    Args:
        image_path: Path to the image file to roast.

    Returns:
        The generated dialogue text, or an error-message string on failure.
    """
    try:
        uploaded_file = upload_to_gemini(image_path)
        generation_config = {
            "temperature": 1,
            "top_p": 0.95,
            "top_k": 40,
            "max_output_tokens": 8192,
            "response_mime_type": "text/plain",
        }
        # BUGFIX: the turn prefixes requested here must match the
        # "turnPrefix" / "turnPrefix2" values sent to Play.ht in
        # text_to_speech() ("Host 1:" / "Host 2:"); the original prompt
        # asked for "Host: 1" / "Host: 2", which the TTS endpoint would
        # never recognize as speaker turns.
        model = genai.GenerativeModel(
            model_name="gemini-1.5-flash-002",
            generation_config=generation_config,
            system_instruction=(
                "Generate a conversation between two women complimenting the "
                "uploaded image in less than 150 words. Please abide by these "
                "guidelines. "
                "1. Begin conversation turns with the prefix Host 1: and Host 2: "
                "2. Uses humor, irony and sarcasm to entertain "
                "3. Your output should be a well-written text suitable for "
                "reading aloud, it will be passed to a generative speech model "
                "to voice it, so don't include any special symbols like double "
                "asterisks, slash, at, em dash, ellipses, and so on. "
            ),
        )
        chat_session = model.start_chat(
            history=[{"role": "user", "parts": [uploaded_file]}]
        )
        response = chat_session.send_message("Rizz this image!")
        return response.text
    except Exception as e:
        # Best-effort UI: surface the error as the "roast" text rather
        # than crashing the Gradio callback.
        return f"Error generating rizz: {e}"
def text_to_speech(text):
    """Synthesize a two-voice dialogue MP3 from text via the Play.ht API.

    The PlayDialog model assigns lines beginning with "Host 1:" to the first
    voice and lines beginning with "Host 2:" to the second.

    Args:
        text: Dialogue script to voice.

    Returns:
        Path to the saved MP3 on success, otherwise an error-message string.
    """
    endpoint = "https://api.play.ai/api/v1/tts/stream"
    request_body = {
        "model": "PlayDialog",
        "voice": "s3://voice-cloning-zero-shot/d9ff78ba-d016-47f6-b0ef-dd630f59414e/female-cs/manifest.json",
        "voice2": "s3://voice-cloning-zero-shot/fdb74aec-ede9-45f8-ad87-71cb45f01816/original/manifest.json",
        "turnPrefix": "Host 1:",
        "turnPrefix2": "Host 2:",
        "output_format": "mp3",
        "text": text,
    }
    request_headers = {
        "content-type": "application/json",
        "Authorization": API_KEY,
        "X-User-ID": USER_ID,
    }
    try:
        response = requests.post(endpoint, json=request_body, headers=request_headers)
        if response.status_code != 200:
            return f"Error generating audio: {response.status_code} - {response.text}"
        # Persist the streamed MP3 bytes next to the app.
        audio_path = "output_audio.mp3"
        with open(audio_path, "wb") as audio_file:
            audio_file.write(response.content)
        return audio_path
    except Exception as e:
        return f"Error generating audio: {e}"
def create_video(image, audio):
    """Render an MP4 of the still image with the audio track and a logo.

    Args:
        image: Path to the background image.
        audio: Path to the narration audio file.

    Returns:
        Path of the rendered MP4 on success, otherwise an error-message string.
    """
    try:
        narration = AudioFileClip(audio)
        duration = narration.duration

        # Still image shown for the full length of the narration.
        background = ImageClip(image).set_duration(duration)

        # Logo overlay, pinned to the bottom centre with a small transparent
        # margin, visible for the whole clip.
        logo = (
            ImageClip("PlayAI-Logo-RIZZ-URL.png")
            .resize(height=75)
            .margin(bottom=10, opacity=0)
            .set_position(("center", "bottom"))
            .set_duration(duration)
        )

        composite = CompositeVideoClip([background, logo]).set_audio(narration)

        output_path = "/tmp/output_video_with_logo.mp4"
        composite.write_videofile(
            output_path,
            fps=30,
            codec="libx264",
            audio_codec="aac",
            preset="slow",
            ffmpeg_params=["-b:v", "2000k"],  # fixed 2 Mbps video bitrate
        )
        return output_path
    except Exception as e:
        return f"Error generating video: {e}"
def process_roast(image_path):
    """Run the full pipeline: roast text -> TTS audio -> composed video.

    Args:
        image_path: Path to the uploaded image.

    Returns:
        Tuple of (roast text, audio file path, video file path).  Any stage
        that fails yields its error-message string in place of the path.
    """
    script = generate_roast(image_path)
    narration = text_to_speech(script)
    clip = create_video(image_path, narration)
    return script, narration, clip
# --- Gradio UI -------------------------------------------------------------
with gr.Blocks(theme=theme) as demo:
    gr.Markdown("# Get Rizzed, Ready?")
    gr.Markdown("Upload an image, click 'Rizz Image', and the AI will roast it")

    with gr.Row():
        image_input = gr.Image(type="filepath", label="Upload Image")
        # NOTE(review): the paste lost indentation — the outputs column is
        # assumed to sit inside the row beside the image; confirm layout.
        with gr.Column():
            output_text = gr.Textbox(label="Roast Text")
            audio_output = gr.Audio(label="Roast Audio")
            video_output = gr.Video(label="Roast Video")

    # One button drives the whole text -> audio -> video pipeline.
    roast_button = gr.Button("Rizz Image")
    roast_button.click(
        process_roast,
        inputs=image_input,
        outputs=[output_text, audio_output, video_output],
    )

    gr.Examples(
        examples=[["elon_musk.png"], ["jensen_huang.png"]],
        inputs=image_input,
    )

# Launch the app; debug=True surfaces callback tracebacks in the console.
demo.launch(debug=True)