# crop_utils.py
import base64
import json
import os
from io import BytesIO
import cv2
import gradio as gr
import numpy as np
import pyrebase
import requests
from openai import OpenAI
from PIL import Image, ImageDraw, ImageFont
from ultralytics import YOLO
from prompts import remove_unwanted_prompt
model = YOLO("yolo11n.pt")
def get_middle_thumbnail(input_image: Image.Image, grid_size=(10, 10), padding=3):
"""
Extract the middle thumbnail from a sprite sheet, handling different aspect ratios
and removing padding.
Args:
input_image: PIL Image
grid_size: Tuple of (columns, rows)
padding: Number of padding pixels on each side (default 3)
Returns:
PIL.Image: The middle thumbnail image with padding removed
"""
sprite_sheet = input_image
# Calculate thumbnail dimensions based on actual sprite sheet size
sprite_width, sprite_height = sprite_sheet.size
thumb_width_with_padding = sprite_width // grid_size[0]
thumb_height_with_padding = sprite_height // grid_size[1]
# Remove padding to get actual image dimensions
thumb_width = thumb_width_with_padding - (2 * padding) # 726 - 6 = 720
thumb_height = thumb_height_with_padding - (2 * padding) # varies based on input
# Calculate the middle position
total_thumbs = grid_size[0] * grid_size[1]
middle_index = total_thumbs // 2
# Calculate row and column of middle thumbnail
middle_row = middle_index // grid_size[0]
middle_col = middle_index % grid_size[0]
# Calculate pixel coordinates for cropping, including padding offset
left = (middle_col * thumb_width_with_padding) + padding
top = (middle_row * thumb_height_with_padding) + padding
right = left + thumb_width # Don't add padding here
bottom = top + thumb_height # Don't add padding here
# Crop and return the middle thumbnail
middle_thumb = sprite_sheet.crop((left, top, right, bottom))
return middle_thumb
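# Illustrative usage (a sketch; the file path and sheet size are hypothetical):
#   sheet = Image.open("sprite_sheet.jpg")            # e.g. a 7260x4060 10x10 sprite sheet
#   middle = get_middle_thumbnail(sheet, grid_size=(10, 10), padding=3)
#   # With the sizes above this yields a 720x400 thumbnail (726-6 by 406-6).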
def encode_image_to_base64(image: Image.Image, format: str = "JPEG") -> str:
"""
Convert a PIL image to a base64 string.
Args:
image: PIL Image object
        format: Image format to use for encoding (default: JPEG)
Returns:
Base64 encoded string of the image
"""
buffered = BytesIO()
image.save(buffered, format=format)
return base64.b64encode(buffered.getvalue()).decode("utf-8")
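# Illustrative usage (sketch): building the data URL that analyze_image sends to the API.
#   b64 = encode_image_to_base64(middle, format="JPEG")
#   data_url = f"data:image/jpeg;base64,{b64}"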
def add_top_numbers(
input_image,
num_divisions=20,
margin=90,
font_size=70,
dot_spacing=20,
):
"""
Add numbered divisions across the top and bottom of any image with dotted vertical lines.
Args:
input_image (Image): PIL Image
num_divisions (int): Number of divisions to create
margin (int): Size of margin in pixels for numbers
font_size (int): Font size for numbers
dot_spacing (int): Spacing between dots in pixels
"""
# Load the image
original_image = input_image
# Create new image with extra space for numbers on top and bottom
new_width = original_image.width
new_height = original_image.height + (
2 * margin
) # Add margin to both top and bottom
new_image = Image.new("RGB", (new_width, new_height), "white")
# Paste original image in the middle
new_image.paste(original_image, (0, margin))
# Initialize drawing context
draw = ImageDraw.Draw(new_image)
try:
font = ImageFont.truetype("arial.ttf", font_size)
except OSError:
print("Using default font")
font = ImageFont.load_default(size=font_size)
# Calculate division width
division_width = original_image.width / num_divisions
# Draw division numbers and dotted lines
for i in range(num_divisions):
x = (i * division_width) + (division_width / 2)
# Draw number at top
draw.text((x, margin // 2), str(i + 1), fill="black", font=font, anchor="mm")
# Draw number at bottom
draw.text(
(x, new_height - (margin // 2)),
str(i + 1),
fill="black",
font=font,
anchor="mm",
)
# Draw dotted line from top margin to bottom margin
y_start = margin
y_end = new_height - margin
# Draw dots with specified spacing
current_y = y_start
while current_y < y_end:
            draw.ellipse(
                [x - 3, current_y - 3, x + 3, current_y + 3],
                fill="black",
            )
current_y += dot_spacing
return new_image
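# Illustrative usage (sketch; the values mirror the call in get_image_crop below):
#   numbered = add_top_numbers(middle, num_divisions=20, margin=50, font_size=30)
#   # The result is the same frame with columns labelled 1-20 along the top and bottom,
#   # which is what the vision model uses to report left_row and right_row.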
def analyze_image(numbered_input_image: Image.Image, prompt, input_image, ct):
    """
    Perform inference on an image using the OpenAI gpt-4o vision model.
    Args:
        numbered_input_image (Image): PIL Image with numbered divisions overlaid
        prompt (str): The prompt/question about the image
        input_image (Image): input image without numbers (currently unused)
        ct: extra caller context (currently unused)
    Returns:
        tuple: (left_row, right_row, num_of_speakers) parsed from the model's JSON response
    """
client = OpenAI()
base64_image = encode_image_to_base64(numbered_input_image, format="JPEG")
messages = [
{
"role": "user",
"content": [
{"type": "text", "text": prompt},
{
"type": "image_url",
"image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
},
],
}
]
response = client.chat.completions.create(
model="gpt-4o", messages=messages, max_tokens=300
)
messages.extend(
[
{"role": "assistant", "content": response.choices[0].message.content},
{
"role": "user",
"content": "please return the response in the json with keys left_row, right_row, and num_of_speakers",
},
],
)
response = (
client.chat.completions.create(model="gpt-4o", messages=messages)
.choices[0]
.message.content
)
    left_index = response.find("{")
    right_index = response.rfind("}")
    try:
        if left_index == -1 or right_index == -1:
            raise ValueError("No JSON object found in model response")
        print(response[left_index : right_index + 1])
        response_json = json.loads(response[left_index : right_index + 1])
    except Exception as e:
        print(e)
        # Fall back to the full frame with a single speaker
        return 1, 20, 1
return (
response_json["left_row"],
response_json["right_row"],
response_json["num_of_speakers"],
)
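# Sketch of the follow-up response consumed above (the keys come from the second prompt;
# the numbers are made up):
#   {"left_row": 4, "right_row": 15, "num_of_speakers": 1}
# analyze_image(numbered, remove_unwanted_prompt(1), middle, ct=None) would then
# return (4, 15, 1).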
def get_sprite_firebase(cid, rsid, uid):
config = {
"apiKey": f"{os.getenv('FIREBASE_API_KEY')}",
"authDomain": f"{os.getenv('FIREBASE_AUTH_DOMAIN')}",
"databaseURL": f"{os.getenv('FIREBASE_DATABASE_URL')}",
"projectId": f"{os.getenv('FIREBASE_PROJECT_ID')}",
"storageBucket": f"{os.getenv('FIREBASE_STORAGE_BUCKET')}",
"messagingSenderId": f"{os.getenv('FIREBASE_MESSAGING_SENDER_ID')}",
"appId": f"{os.getenv('FIREBASE_APP_ID')}",
"measurementId": f"{os.getenv('FIREBASE_MEASUREMENT_ID')}",
}
firebase = pyrebase.initialize_app(config)
db = firebase.database()
account_id = os.getenv("ROLL_ACCOUNT")
COLLAB_EDIT_LINK = "collab_sprite_link_handler"
path = f"{account_id}/{COLLAB_EDIT_LINK}/{uid}/{cid}/{rsid}"
data = db.child(path).get()
return data.val()
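# Sketch: get_sprite_firebase expects the FIREBASE_* and ROLL_ACCOUNT environment variables
# to be set before it is called (the values below are hypothetical):
#   os.environ["FIREBASE_API_KEY"] = "..."
#   os.environ["ROLL_ACCOUNT"] = "acct_123"
#   sprites = get_sprite_firebase(cid="c1", rsid="r1", uid="u1")
#   # sprites is the list of {"url": ..., "duration": ...} dicts used by get_image_crop below.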
def find_persons_center(image, num_of_speakers=1):
"""
Find the center point of the largest num_of_speakers persons in the image.
If multiple persons are detected, merge the bounding boxes of only the largest ones.
Args:
image: CV2/numpy array image
num_of_speakers: Number of speakers to consider (default: 1)
Returns:
int: x-coordinate of the center point of all considered persons
"""
# Detect persons (class 0 in COCO dataset)
results = model(image, classes=[0], conf=0.6)
if not results or len(results[0].boxes) == 0:
# If no persons detected, return center of image
return image.shape[1] // 2
# Get all person boxes
boxes = results[0].boxes.xyxy.cpu().numpy()
# Print the number of persons detected (for debugging)
print(f"Detected {len(boxes)} persons in the image")
if len(boxes) == 1:
# If only one person, return center of their bounding box
x1, _, x2, _ = boxes[0]
center_x = int((x1 + x2) // 2)
print(f"Single person detected at center x: {center_x}")
return center_x
else:
# Multiple persons - consider only the largest num_of_speakers boxes
# Calculate area for each box
box_areas = [(box[2] - box[0]) * (box[3] - box[1]) for box in boxes]
# Sort boxes by area (largest first) and take top num_of_speakers
sorted_indices = sorted(
range(len(box_areas)), key=lambda i: box_areas[i], reverse=True
)
# Use all available boxes if fewer detected than requested
num_boxes_to_use = min(num_of_speakers, len(boxes))
selected_indices = sorted_indices[:num_boxes_to_use]
selected_boxes = [boxes[i] for i in selected_indices]
# Create a merged bounding box from selected boxes
left_x = min(box[0] for box in selected_boxes)
right_x = max(box[2] for box in selected_boxes)
merged_center_x = int((left_x + right_x) // 2)
print(
f"{num_boxes_to_use} largest persons merged bounding box center x: {merged_center_x}"
)
print(f"Merged bounds: left={left_x}, right={right_x}")
return merged_center_x
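# Illustrative usage (sketch; assumes `frame` is a BGR numpy array, e.g. from cv2.imread):
#   center_x = find_persons_center(frame, num_of_speakers=2)
#   # center_x is the horizontal midpoint of the merged boxes of the two largest detections,
#   # or the image midpoint when nothing is detected.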
def create_layouts(image, left_division, right_division, num_of_speakers):
"""
Create different layout variations of the image using specific aspect ratios.
All layout variations will be centered on detected persons.
Args:
image: PIL Image
left_division: Left division index (1-20)
right_division: Right division index (1-20)
Returns:
tuple: (standard_crops, threehalfs_layouts, twothirdhalfs_layouts, twoequalhalfs_layouts, visualization_data)
"""
# Convert PIL Image to cv2 format
if isinstance(image, Image.Image):
image_cv = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
else:
image_cv = image.copy()
# Get image dimensions
height, width = image_cv.shape[:2]
# Calculate division width and crop boundaries
division_width = width / 20 # Assuming 20 divisions
left_boundary = int((left_division - 1) * division_width)
right_boundary = int(right_division * division_width)
# 1. Create cutout image based on divisions
cutout_image = image_cv[:, left_boundary:right_boundary].copy()
cutout_width = right_boundary - left_boundary
cutout_height = cutout_image.shape[0]
# 2. Run YOLO on cutout to get person bounding box and center
results = model(cutout_image, classes=[0], conf=0.6)
# Default center if no detection
cutout_center_x = cutout_image.shape[1] // 2
cutout_center_y = cutout_height // 2
# Default values for bounding box
person_top = 0.0
person_height = float(cutout_height)
if results and len(results[0].boxes) > 0:
# Get person detection
boxes = results[0].boxes.xyxy.cpu().numpy()
if len(boxes) == 1:
# Single person
x1, y1, x2, y2 = boxes[0]
cutout_center_x = int((x1 + x2) // 2)
cutout_center_y = int((y1 + y2) // 2)
person_top = y1
person_height = y2 - y1
else:
# Multiple persons - consider only the largest num_of_speakers boxes
# Calculate area for each box
box_areas = [(box[2] - box[0]) * (box[3] - box[1]) for box in boxes]
# Sort boxes by area (largest first) and take top num_of_speakers
sorted_indices = sorted(
range(len(box_areas)), key=lambda i: box_areas[i], reverse=True
)
# Use all available boxes if fewer detected than requested
num_boxes_to_use = min(num_of_speakers, len(boxes))
selected_indices = sorted_indices[:num_boxes_to_use]
selected_boxes = [boxes[i] for i in selected_indices]
# Merge bounding boxes of selected boxes
left_x = min(box[0] for box in selected_boxes)
right_x = max(box[2] for box in selected_boxes)
top_y = min(box[1] for box in selected_boxes) # Top of highest person
bottom_y = max(box[3] for box in selected_boxes) # Bottom of lowest person
cutout_center_x = int((left_x + right_x) // 2)
cutout_center_y = int((top_y + bottom_y) // 2)
person_top = top_y
person_height = bottom_y - top_y
# 3. Create 16:9 and 9:16 versions with person properly framed
aspect_16_9 = 16 / 9
aspect_9_16 = 9 / 16
# For 16:9 version (with 5% margin above person)
target_height_16_9 = int(cutout_width / aspect_16_9)
if target_height_16_9 <= cutout_height:
# Calculate 5% of person height for top margin
top_margin = int(person_height * 0.05)
# Start 5% above the person's top
y_start = int(max(0, person_top - top_margin))
# If this would make the crop exceed the bottom, adjust y_start
if y_start + target_height_16_9 > cutout_height:
y_start = int(max(0, cutout_height - target_height_16_9))
y_end = int(min(cutout_height, y_start + target_height_16_9))
cutout_16_9 = cutout_image[y_start:y_end, :].copy()
else:
# Handle rare case where we need to adjust width (not expected with normal images)
new_width = int(cutout_height * aspect_16_9)
x_start = max(
0, min(cutout_width - new_width, cutout_center_x - new_width // 2)
)
x_end = min(cutout_width, x_start + new_width)
cutout_16_9 = cutout_image[:, x_start:x_end].copy()
# For 9:16 version (centered on person, adjusted upward for face visibility)
target_width_9_16 = int(cutout_height * aspect_9_16)
# Adjust center point upward by 20% of person height to ensure face is visible
adjusted_center_y = int(cutout_center_y - (person_height * 0.2))
if target_width_9_16 <= cutout_width:
# Center horizontally around person
x_start = int(
max(
0,
min(
cutout_width - target_width_9_16,
cutout_center_x - target_width_9_16 // 2,
),
)
)
x_end = int(min(cutout_width, x_start + target_width_9_16))
        # This branch keeps the full cutout height and only crops horizontally,
        # so the vertical crop starts at the top of the cutout
        y_start = 0
        cutout_9_16 = cutout_image[y_start:, x_start:x_end].copy()
else:
# Handle rare case where we need to adjust height
new_height = int(cutout_width / aspect_9_16)
# Use adjusted center point for vertical positioning
y_start = int(
max(0, min(cutout_height - new_height, adjusted_center_y - new_height // 2))
)
y_end = int(min(cutout_height, y_start + new_height))
cutout_9_16 = cutout_image[y_start:y_end, :].copy()
# 4. Scale the center back to original image coordinates
original_center_x = left_boundary + cutout_center_x
original_center_y = cutout_center_y
original_person_top = person_top
# Store visualization data for drawing
visualization_data = {
"original_center_x": original_center_x,
"original_center_y": original_center_y,
"original_person_top": original_person_top,
"original_person_height": person_height,
"cutout_bounds": (left_boundary, right_boundary),
}
# 5. Create new layout variations - each segment is independently centered on the subject
# ----- Create crops for threehalfs layout -----
# For 16:9 (three 5.3:9 segments, each independently centered)
aspect_5_3_9 = 5.3 / 9
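    # 5.3:9 is roughly one third of a 16:9 frame (16 / 3 ≈ 5.33), so three such segments
    # placed side by side reassemble a full 16:9 canvas.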
# Calculate dimensions for each segment
segment_height_16_9 = cutout_height # Use full height
segment_width_16_9 = int(segment_height_16_9 * aspect_5_3_9)
# Create three segments for 16:9 threehalfs - all centered on the person
threehalfs_16_9_segments = []
for i in range(3):
# Each segment is centered on the person
segment_x_start = int(
max(
0,
min(
cutout_width - segment_width_16_9,
cutout_center_x - segment_width_16_9 // 2,
),
)
)
segment_x_end = int(min(cutout_width, segment_x_start + segment_width_16_9))
# Create the segment
segment = cutout_image[:, segment_x_start:segment_x_end].copy()
# Add a label for visualization
label = f"Part {i+1}"
cv2.putText(
segment,
label,
(10, 30),
cv2.FONT_HERSHEY_SIMPLEX,
0.8,
(255, 255, 255),
2,
cv2.LINE_AA,
)
threehalfs_16_9_segments.append(segment)
# For 9:16 (three 9:5.3 segments, each independently centered)
aspect_9_5_3 = 9 / 5.3
# Calculate dimensions for each segment
segment_width_9_16 = cutout_9_16.shape[1] # Use full width of 9:16 crop
segment_height_9_16 = int(segment_width_9_16 / aspect_9_5_3)
# Get adjusted center for 9:16 segments (move up by 20% of person height)
cutout_9_16_center_y = cutout_9_16.shape[0] // 2
adjusted_9_16_center_y = int(cutout_9_16_center_y - (person_height * 0.2))
cutout_9_16_height = cutout_9_16.shape[0]
# Create three segments for 9:16 threehalfs - all centered on the person
threehalfs_9_16_segments = []
for i in range(3):
# Each segment is centered on the person with adjusted center point
segment_y_start = int(
max(
0,
min(
cutout_9_16_height - segment_height_9_16,
person_top,
),
)
)
segment_y_end = int(
min(cutout_9_16_height, segment_y_start + segment_height_9_16)
)
# Create the segment
segment = cutout_9_16[segment_y_start:segment_y_end, :].copy()
# Add a label for visualization
label = f"Part {i+1}"
cv2.putText(
segment,
label,
(10, 30),
cv2.FONT_HERSHEY_SIMPLEX,
0.8,
(255, 255, 255),
2,
cv2.LINE_AA,
)
threehalfs_9_16_segments.append(segment)
# ----- Create crops for twothirdhalfs layout -----
# For 16:9 (two segments: 10.6:9 and 5.3:9 OR 5.3:9 and 10.6:9)
aspect_10_6_9 = 10.6 / 9
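    # 10.6:9 is roughly two thirds of a 16:9 frame (16 * 2 / 3 ≈ 10.67), so pairing it with a
    # 5.3:9 segment fills a 16:9 canvas.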
# Calculate dimensions for segments
segment1_height_16_9 = cutout_height # Use full height
segment1_width_16_9 = int(segment1_height_16_9 * aspect_10_6_9)
segment2_height_16_9 = cutout_height # Use full height
segment2_width_16_9 = int(segment2_height_16_9 * aspect_5_3_9)
# Create segments for 16:9 twothirdhalfs var1 (10.6:9 then 5.3:9)
# Both segments independently centered on the person
# First segment (10.6:9)
segment_x_start = int(
max(
0,
min(
cutout_width - segment1_width_16_9,
cutout_center_x - segment1_width_16_9 // 2,
),
)
)
segment_x_end = int(min(cutout_width, segment_x_start + segment1_width_16_9))
segment1 = cutout_image[:, segment_x_start:segment_x_end].copy()
# Add label
cv2.putText(
segment1,
"10.6:9",
(10, 30),
cv2.FONT_HERSHEY_SIMPLEX,
0.8,
(255, 255, 255),
2,
cv2.LINE_AA,
)
# Second segment (5.3:9)
segment_x_start = int(
max(
0,
min(
cutout_width - segment2_width_16_9,
cutout_center_x - segment2_width_16_9 // 2,
),
)
)
segment_x_end = int(min(cutout_width, segment_x_start + segment2_width_16_9))
segment2 = cutout_image[:, segment_x_start:segment_x_end].copy()
# Add label
cv2.putText(
segment2,
"5.3:9",
(10, 30),
cv2.FONT_HERSHEY_SIMPLEX,
0.8,
(255, 255, 255),
2,
cv2.LINE_AA,
)
twothirdhalfs_16_9_var1_segments = [segment1, segment2]
# Create segments for 16:9 twothirdhalfs var2 (5.3:9 then 10.6:9)
# First segment (5.3:9) - reuse segment2 from var1
# Second segment (10.6:9) - reuse segment1 from var1
twothirdhalfs_16_9_var2_segments = [segment2.copy(), segment1.copy()]
# For 9:16 (two segments stacked: 9:10.6 and 9:5.3 OR 9:5.3 and 9:10.6)
aspect_9_10_6 = 9 / 10.6
aspect_9_5_3 = 9 / 5.3
# Calculate dimensions for segments
segment1_width_9_16 = cutout_9_16.shape[1] # Use full width of 9:16 crop
segment1_height_9_16 = int(segment1_width_9_16 / aspect_9_10_6)
segment2_width_9_16 = cutout_9_16.shape[1] # Use full width of 9:16 crop
segment2_height_9_16 = int(segment2_width_9_16 / aspect_9_5_3)
# Create segments for 9:16 twothirdhalfs var1 (9:10.6 then 9:5.3)
# Both segments independently centered on the person with adjusted center point
# First segment (9:10.6)
segment_y_start = int(
max(
0,
min(
cutout_9_16_height - segment1_height_9_16,
adjusted_9_16_center_y - segment1_height_9_16 // 2,
),
)
)
segment_y_end = int(min(cutout_9_16_height, segment_y_start + segment1_height_9_16))
segment1 = cutout_9_16[segment_y_start:segment_y_end, :].copy()
# Add label
cv2.putText(
segment1,
"9:10.6",
(10, 30),
cv2.FONT_HERSHEY_SIMPLEX,
0.8,
(255, 255, 255),
2,
cv2.LINE_AA,
)
# Second segment (9:5.3)
segment_y_start = int(
max(
0,
min(
cutout_9_16_height - segment2_height_9_16,
person_top,
),
)
)
segment_y_end = int(min(cutout_9_16_height, segment_y_start + segment2_height_9_16))
segment2 = cutout_9_16[segment_y_start:segment_y_end, :].copy()
# Add label
cv2.putText(
segment2,
"9:5.3",
(10, 30),
cv2.FONT_HERSHEY_SIMPLEX,
0.8,
(255, 255, 255),
2,
cv2.LINE_AA,
)
twothirdhalfs_9_16_var1_segments = [segment1, segment2]
# Create segments for 9:16 twothirdhalfs var2 (9:5.3 then 9:10.6)
# First segment (9:5.3) - reuse segment2 from var1
# Second segment (9:10.6) - reuse segment1 from var1
twothirdhalfs_9_16_var2_segments = [segment2.copy(), segment1.copy()]
# ----- Create crops for twoequalhalfs layout -----
# For 16:9 (two 8:9 segments side by side)
aspect_8_9 = 8 / 9
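    # 8:9 is exactly half of a 16:9 frame, so two equal segments side by side fill a 16:9 canvas.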
# Calculate dimensions for segments
segment_height_16_9_equal = cutout_height # Use full height
segment_width_16_9_equal = int(segment_height_16_9_equal * aspect_8_9)
# Create segments for 16:9 twoequalhalfs - both centered on the person
# First segment (8:9)
segment_x_start = int(
max(
0,
min(
cutout_width - segment_width_16_9_equal,
cutout_center_x - segment_width_16_9_equal // 2,
),
)
)
segment_x_end = int(min(cutout_width, segment_x_start + segment_width_16_9_equal))
segment1 = cutout_image[:, segment_x_start:segment_x_end].copy()
# Add label
cv2.putText(
segment1,
"8:9 (1)",
(10, 30),
cv2.FONT_HERSHEY_SIMPLEX,
0.8,
(255, 255, 255),
2,
cv2.LINE_AA,
)
# Second segment (identical to first for equal halfs)
segment2 = segment1.copy()
# Update label for segment 2
cv2.putText(
segment2,
"8:9 (2)",
(10, 30),
cv2.FONT_HERSHEY_SIMPLEX,
0.8,
(255, 255, 255),
2,
cv2.LINE_AA,
)
twoequalhalfs_16_9_segments = [segment1, segment2]
# For 9:16 (two 9:8 segments stacked)
aspect_9_8 = 9 / 8
# Calculate dimensions for segments
segment_width_9_16_equal = cutout_9_16.shape[1] # Use full width of 9:16 crop
segment_height_9_16_equal = int(segment_width_9_16_equal / aspect_9_8)
# Create segments for 9:16 twoequalhalfs - both centered on the person with adjusted center point
# First segment (9:8)
segment_y_start = int(
max(
0,
min(
cutout_9_16_height - segment_height_9_16_equal,
max(0, person_top - person_height * 0.05),
),
)
)
segment_y_end = int(
min(cutout_9_16_height, segment_y_start + segment_height_9_16_equal)
)
segment1 = cutout_9_16[segment_y_start:segment_y_end, :].copy()
# Add label
cv2.putText(
segment1,
"9:8 (1)",
(10, 30),
cv2.FONT_HERSHEY_SIMPLEX,
0.8,
(255, 255, 255),
2,
cv2.LINE_AA,
)
# Second segment (identical to first for equal halfs)
segment2 = segment1.copy()
# Update label for segment 2
cv2.putText(
segment2,
"9:8 (2)",
(10, 30),
cv2.FONT_HERSHEY_SIMPLEX,
0.8,
(255, 255, 255),
2,
cv2.LINE_AA,
)
twoequalhalfs_9_16_segments = [segment1, segment2]
# 6. Create composite layouts by joining segments
# Function to create a composite image
def create_composite(segments, horizontal=True):
if not segments:
return None
if horizontal:
# Calculate the total width and max height
total_width = sum(segment.shape[1] for segment in segments)
max_height = max(segment.shape[0] for segment in segments)
# Create a canvas
composite = np.zeros((max_height, total_width, 3), dtype=np.uint8)
# Place segments side by side
x_offset = 0
for segment in segments:
h, w = segment.shape[:2]
composite[:h, x_offset : x_offset + w] = segment
x_offset += w
else: # vertical stacking
# Calculate the max width and total height
max_width = max(segment.shape[1] for segment in segments)
total_height = sum(segment.shape[0] for segment in segments)
# Create a canvas
composite = np.zeros((total_height, max_width, 3), dtype=np.uint8)
# Place segments top to bottom
y_offset = 0
for segment in segments:
h, w = segment.shape[:2]
composite[y_offset : y_offset + h, :w] = segment
y_offset += h
return composite
# Create composite layouts
threehalfs_16_9_composite = create_composite(
threehalfs_16_9_segments, horizontal=True
)
threehalfs_9_16_composite = create_composite(
threehalfs_9_16_segments, horizontal=False
)
twothirdhalfs_16_9_var1_composite = create_composite(
twothirdhalfs_16_9_var1_segments, horizontal=True
)
twothirdhalfs_16_9_var2_composite = create_composite(
twothirdhalfs_16_9_var2_segments, horizontal=True
)
twothirdhalfs_9_16_var1_composite = create_composite(
twothirdhalfs_9_16_var1_segments, horizontal=False
)
twothirdhalfs_9_16_var2_composite = create_composite(
twothirdhalfs_9_16_var2_segments, horizontal=False
)
twoequalhalfs_16_9_composite = create_composite(
twoequalhalfs_16_9_segments, horizontal=True
)
twoequalhalfs_9_16_composite = create_composite(
twoequalhalfs_9_16_segments, horizontal=False
)
# Add labels to all composites
def add_label(img, label):
if img is None:
return None
font = cv2.FONT_HERSHEY_SIMPLEX
label_settings = {
"fontScale": 1.0,
"fontFace": font,
"thickness": 2,
}
# Draw background for text
text_size = cv2.getTextSize(
label,
fontFace=label_settings["fontFace"],
fontScale=label_settings["fontScale"],
thickness=label_settings["thickness"],
)
cv2.rectangle(
img,
(10, 10),
(10 + text_size[0][0] + 10, 10 + text_size[0][1] + 10),
(0, 0, 0),
-1,
) # Black background
# Draw text
cv2.putText(
img,
label,
(15, 15 + text_size[0][1]),
fontFace=label_settings["fontFace"],
fontScale=label_settings["fontScale"],
thickness=label_settings["thickness"],
color=(255, 255, 255),
lineType=cv2.LINE_AA,
)
return img
# Label the basic crops
cutout_image_labeled = add_label(cutout_image.copy(), "Cutout")
cutout_16_9_labeled = add_label(cutout_16_9.copy(), "16:9")
cutout_9_16_labeled = add_label(cutout_9_16.copy(), "9:16")
# Label the composite layouts
threehalfs_16_9_labeled = add_label(threehalfs_16_9_composite, "Three Halfs 16:9")
threehalfs_9_16_labeled = add_label(threehalfs_9_16_composite, "Three Halfs 9:16")
twothirdhalfs_16_9_var1_labeled = add_label(
twothirdhalfs_16_9_var1_composite, "Two Thirds Var1 16:9"
)
twothirdhalfs_16_9_var2_labeled = add_label(
twothirdhalfs_16_9_var2_composite, "Two Thirds Var2 16:9"
)
twothirdhalfs_9_16_var1_labeled = add_label(
twothirdhalfs_9_16_var1_composite, "Two Thirds Var1 9:16"
)
twothirdhalfs_9_16_var2_labeled = add_label(
twothirdhalfs_9_16_var2_composite, "Two Thirds Var2 9:16"
)
twoequalhalfs_16_9_labeled = add_label(
twoequalhalfs_16_9_composite, "Two Equal Halfs 16:9"
)
twoequalhalfs_9_16_labeled = add_label(
twoequalhalfs_9_16_composite, "Two Equal Halfs 9:16"
)
# Convert all output images to PIL format
def cv2_to_pil(img):
if img is None:
return None
return Image.fromarray(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
# Convert standard crops
standard_crops = {
"cutout": cv2_to_pil(cutout_image_labeled),
"16:9": cv2_to_pil(cutout_16_9_labeled),
"9:16": cv2_to_pil(cutout_9_16_labeled),
}
# Convert threehalfs layouts
threehalfs_layouts = {
"16:9": cv2_to_pil(threehalfs_16_9_labeled),
"9:16": cv2_to_pil(threehalfs_9_16_labeled),
}
# Convert twothirdhalfs layouts
twothirdhalfs_layouts = {
"16:9_var1": cv2_to_pil(twothirdhalfs_16_9_var1_labeled),
"16:9_var2": cv2_to_pil(twothirdhalfs_16_9_var2_labeled),
"9:16_var1": cv2_to_pil(twothirdhalfs_9_16_var1_labeled),
"9:16_var2": cv2_to_pil(twothirdhalfs_9_16_var2_labeled),
}
# Convert twoequalhalfs layouts
twoequalhalfs_layouts = {
"16:9": cv2_to_pil(twoequalhalfs_16_9_labeled),
"9:16": cv2_to_pil(twoequalhalfs_9_16_labeled),
}
return (
standard_crops,
threehalfs_layouts,
twothirdhalfs_layouts,
twoequalhalfs_layouts,
visualization_data,
)
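# Illustrative usage (sketch; the divisions would normally come from analyze_image):
#   crops, three_halfs, two_thirds, two_equal, viz = create_layouts(
#       middle, left_division=4, right_division=15, num_of_speakers=1
#   )
#   crops["16:9"].save("crop_16_9.jpg")  # hypothetical output path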
def draw_layout_regions(
image, left_division, right_division, visualization_data, layout_type
):
"""
Create a visualization showing the layout regions overlaid on the original image.
Each region is independently centered on the subject, as in practice different videos
would be stacked in these layouts.
Args:
image: PIL Image
left_division: Left division index (1-20)
right_division: Right division index (1-20)
visualization_data: Dictionary with visualization data from create_layouts
layout_type: Type of layout to visualize (e.g., "standard", "threehalfs", "twothirdhalfs_var1", etc.)
Returns:
PIL Image: Original image with layout regions visualized
"""
# Convert PIL Image to cv2 format
if isinstance(image, Image.Image):
image_cv = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
else:
image_cv = image.copy()
# Get a clean copy for drawing
visualization = image_cv.copy()
# Get image dimensions
height, width = image_cv.shape[:2]
# Extract visualization data
original_center_x = visualization_data["original_center_x"]
original_center_y = visualization_data["original_center_y"]
original_person_top = visualization_data["original_person_top"]
original_person_height = visualization_data["original_person_height"]
left_boundary, right_boundary = visualization_data["cutout_bounds"]
cutout_width = right_boundary - left_boundary
# Define colors for different layouts (BGR format)
colors = {
"standard": {"16:9": (0, 255, 0), "9:16": (255, 0, 0)}, # Green, Blue
"threehalfs": {"16:9": (0, 165, 255), "9:16": (255, 255, 0)}, # Orange, Cyan
"twothirdhalfs_var1": {
"16:9": (255, 0, 255),
"9:16": (128, 0, 128),
}, # Magenta, Purple
"twothirdhalfs_var2": {
"16:9": (0, 255, 255),
"9:16": (128, 128, 0),
}, # Yellow, Teal
"twoequalhalfs": {
"16:9": (0, 128, 128),
"9:16": (255, 165, 0),
}, # Dark Cyan, Blue-Green
}
# Define line thickness and font
thickness = 3
font = cv2.FONT_HERSHEY_SIMPLEX
font_scale = 0.8
font_thickness = 2
# Draw standard layouts (16:9 and 9:16)
if layout_type == "standard":
# Draw 16:9 crop
aspect_16_9 = 16 / 9
target_height_16_9 = int(cutout_width / aspect_16_9)
        # Calculate 5% of person height for top margin
top_margin = int(original_person_height * 0.05)
y_start = int(max(0, original_person_top - top_margin))
if y_start + target_height_16_9 > height:
y_start = int(max(0, height - target_height_16_9))
y_end = int(min(height, y_start + target_height_16_9))
cv2.rectangle(
visualization,
(left_boundary, y_start),
(right_boundary, y_end),
colors["standard"]["16:9"],
thickness,
)
cv2.putText(
visualization,
"16:9",
(left_boundary + 5, y_start + 30),
font,
font_scale,
colors["standard"]["16:9"],
font_thickness,
)
# Draw 9:16 crop
aspect_9_16 = 9 / 16
target_width_9_16 = int(height * aspect_9_16)
x_start = max(
0,
min(width - target_width_9_16, original_center_x - target_width_9_16 // 2),
)
x_end = x_start + target_width_9_16
cv2.rectangle(
visualization,
(x_start, 0),
(x_end, height),
colors["standard"]["9:16"],
thickness,
)
cv2.putText(
visualization,
"9:16",
(x_start + 5, 30),
font,
font_scale,
colors["standard"]["9:16"],
font_thickness,
)
# Draw threehalfs layouts - each segment is centered on the subject
elif layout_type == "threehalfs":
# For 16:9 (three 5.3:9 segments side by side - visually only)
aspect_5_3_9 = 5.3 / 9
segment_height = height
segment_width = int(segment_height * aspect_5_3_9)
# Calculate total width for visualization purposes
total_width = segment_width * 3
start_x = max(0, original_center_x - total_width // 2)
for i in range(3):
# For visualization, we'll place them side by side
vis_segment_x_start = start_x + i * segment_width
vis_segment_x_end = vis_segment_x_start + segment_width
# But each segment would actually be centered on the subject independently
# Here we also draw the centered version more faintly
actual_segment_x_start = max(
0, min(width - segment_width, original_center_x - segment_width // 2)
)
actual_segment_x_end = min(width, actual_segment_x_start + segment_width)
# Draw the visualization placement (side by side)
cv2.rectangle(
visualization,
(vis_segment_x_start, 0),
(vis_segment_x_end, segment_height),
colors["threehalfs"]["16:9"],
thickness,
)
# Draw the actual centered placement with dashed lines
if i > 0: # Only draw centered versions for parts 2 and 3
for j in range(0, segment_height, 20): # Dashed line effect
if j % 40 < 20: # Skip every other segment
cv2.line(
visualization,
(actual_segment_x_start, j),
(actual_segment_x_start, min(j + 20, segment_height)),
colors["threehalfs"]["16:9"],
1,
)
cv2.line(
visualization,
(actual_segment_x_end, j),
(actual_segment_x_end, min(j + 20, segment_height)),
colors["threehalfs"]["16:9"],
1,
)
cv2.putText(
visualization,
f"16:9 Part {i+1}",
(vis_segment_x_start + 5, 30 + i * 30),
font,
font_scale,
colors["threehalfs"]["16:9"],
font_thickness,
)
# For 9:16 (three 9:5.3 segments stacked top to bottom - visually only)
aspect_9_16 = 9 / 16
target_width_9_16 = int(height * aspect_9_16)
x_start = max(
0,
min(width - target_width_9_16, original_center_x - target_width_9_16 // 2),
)
x_end = x_start + target_width_9_16
aspect_9_5_3 = 9 / 5.3
segment_width_9_16 = target_width_9_16
segment_height_9_16 = int(segment_width_9_16 / aspect_9_5_3)
# Calculate total height for visualization purposes
total_height = segment_height_9_16 * 3
start_y = max(0, height // 2 - total_height // 2)
for i in range(3):
# For visualization, we'll place them stacked
vis_segment_y_start = start_y + i * segment_height_9_16
vis_segment_y_end = min(height, vis_segment_y_start + segment_height_9_16)
# But each segment would actually be centered on the subject independently
# Here we also draw the centered version more faintly
actual_segment_y_start = max(
0,
min(
height - segment_height_9_16,
original_center_y - segment_height_9_16 // 2,
),
)
actual_segment_y_end = min(
height, actual_segment_y_start + segment_height_9_16
)
# Draw the visualization placement (stacked)
cv2.rectangle(
visualization,
(x_start, vis_segment_y_start),
(x_end, vis_segment_y_end),
colors["threehalfs"]["9:16"],
thickness,
)
# Draw the actual centered placement with dashed lines
if i > 0: # Only draw centered versions for parts 2 and 3
for j in range(x_start, x_end, 20): # Dashed line effect
if j % 40 < 20: # Skip every other segment
cv2.line(
visualization,
(j, actual_segment_y_start),
(min(j + 20, x_end), actual_segment_y_start),
colors["threehalfs"]["9:16"],
1,
)
cv2.line(
visualization,
(j, actual_segment_y_end),
(min(j + 20, x_end), actual_segment_y_end),
colors["threehalfs"]["9:16"],
1,
)
cv2.putText(
visualization,
f"9:16 Part {i+1}",
(x_start + 5, vis_segment_y_start + 30),
font,
font_scale,
colors["threehalfs"]["9:16"],
font_thickness,
)
# Draw twothirdhalfs layouts
elif layout_type == "twothirdhalfs_var1" or layout_type == "twothirdhalfs_var2":
aspect_key = "16:9" if layout_type.endswith("var1") else "9:16"
layout_color = colors[
(
"twothirdhalfs_var1"
if layout_type.endswith("var1")
else "twothirdhalfs_var2"
)
][aspect_key]
if aspect_key == "16:9":
# For 16:9 (two segments side by side)
aspect_10_6_9 = 10.6 / 9
aspect_5_3_9 = 5.3 / 9
segment1_height = height
segment1_width = int(
segment1_height
* (aspect_10_6_9 if layout_type.endswith("var1") else aspect_5_3_9)
)
segment2_height = height
segment2_width = int(
segment2_height
* (aspect_5_3_9 if layout_type.endswith("var1") else aspect_10_6_9)
)
# First segment
segment_center_x = original_center_x - segment2_width // 4
segment_x_start = int(
max(
0,
min(width - segment1_width, segment_center_x - segment1_width // 2),
)
)
segment_x_end = int(min(width, segment_x_start + segment1_width))
cv2.rectangle(
visualization,
(segment_x_start, 0),
(segment_x_end, segment1_height),
layout_color,
thickness,
)
cv2.putText(
visualization,
f"16:9 Part 1",
(segment_x_start + 5, 30),
font,
font_scale,
layout_color,
font_thickness,
)
# Second segment
segment_center_x = original_center_x + segment1_width // 4
segment_x_start = int(
max(
0,
min(width - segment2_width, segment_center_x - segment2_width // 2),
)
)
segment_x_end = int(min(width, segment_x_start + segment2_width))
cv2.rectangle(
visualization,
(segment_x_start, 0),
(segment_x_end, segment2_height),
layout_color,
thickness,
)
cv2.putText(
visualization,
f"16:9 Part 2",
(segment_x_start + 5, 60),
font,
font_scale,
layout_color,
font_thickness,
)
else: # aspect_key == "9:16"
# For 9:16 (two segments stacked)
aspect_9_16 = 9 / 16
target_width_9_16 = int(height * aspect_9_16)
x_start = max(
0,
min(
width - target_width_9_16,
original_center_x - target_width_9_16 // 2,
),
)
x_end = x_start + target_width_9_16
aspect_9_10_6 = 9 / 10.6
aspect_9_5_3 = 9 / 5.3
segment1_width = target_width_9_16
segment1_height = int(
segment1_width
/ (aspect_9_10_6 if layout_type.endswith("var1") else aspect_9_5_3)
)
segment2_width = target_width_9_16
segment2_height = int(
segment2_width
/ (aspect_9_5_3 if layout_type.endswith("var1") else aspect_9_10_6)
)
# First segment (top)
segment_y_start = 0
segment_y_end = min(height, segment_y_start + segment1_height)
cv2.rectangle(
visualization,
(x_start, segment_y_start),
(x_end, segment_y_end),
layout_color,
thickness,
)
cv2.putText(
visualization,
f"9:16 Part 1",
(x_start + 5, segment_y_start + 30),
font,
font_scale,
layout_color,
font_thickness,
)
# Second segment (bottom)
segment_y_start = segment_y_end
segment_y_end = min(height, segment_y_start + segment2_height)
cv2.rectangle(
visualization,
(x_start, segment_y_start),
(x_end, segment_y_end),
layout_color,
thickness,
)
cv2.putText(
visualization,
f"9:16 Part 2",
(x_start + 5, segment_y_start + 30),
font,
font_scale,
layout_color,
font_thickness,
)
# Draw twoequalhalfs layouts
elif layout_type == "twoequalhalfs":
# For 16:9 (two 8:9 segments side by side)
aspect_8_9 = 8 / 9
segment_height = height
segment_width = int(segment_height * aspect_8_9)
# First segment (left)
segment_center_x = original_center_x - segment_width // 2
segment_x_start = int(
max(0, min(width - segment_width, segment_center_x - segment_width // 2))
)
segment_x_end = int(min(width, segment_x_start + segment_width))
cv2.rectangle(
visualization,
(segment_x_start, 0),
(segment_x_end, segment_height),
colors["twoequalhalfs"]["16:9"],
thickness,
)
cv2.putText(
visualization,
f"16:9 Equal 1",
(segment_x_start + 5, 30),
font,
font_scale,
colors["twoequalhalfs"]["16:9"],
font_thickness,
)
# Second segment (right)
segment_center_x = original_center_x + segment_width // 2
segment_x_start = int(
max(0, min(width - segment_width, segment_center_x - segment_width // 2))
)
segment_x_end = int(min(width, segment_x_start + segment_width))
cv2.rectangle(
visualization,
(segment_x_start, 0),
(segment_x_end, segment_height),
colors["twoequalhalfs"]["16:9"],
thickness,
)
cv2.putText(
visualization,
f"16:9 Equal 2",
(segment_x_start + 5, 60),
font,
font_scale,
colors["twoequalhalfs"]["16:9"],
font_thickness,
)
# For 9:16 (two 9:8 segments stacked)
aspect_9_16 = 9 / 16
target_width_9_16 = int(height * aspect_9_16)
x_start = max(
0,
min(width - target_width_9_16, original_center_x - target_width_9_16 // 2),
)
x_end = x_start + target_width_9_16
aspect_9_8 = 9 / 8
segment_width_9_16 = target_width_9_16
segment_height_9_16 = int(segment_width_9_16 / aspect_9_8)
# First segment (top)
segment_y_start = 0
segment_y_end = min(height, segment_y_start + segment_height_9_16)
cv2.rectangle(
visualization,
(x_start, segment_y_start),
(x_end, segment_y_end),
colors["twoequalhalfs"]["9:16"],
thickness,
)
cv2.putText(
visualization,
f"9:16 Equal 1",
(x_start + 5, segment_y_start + 30),
font,
font_scale,
colors["twoequalhalfs"]["9:16"],
font_thickness,
)
# Second segment (bottom)
segment_y_start = segment_y_end
segment_y_end = min(height, segment_y_start + segment_height_9_16)
cv2.rectangle(
visualization,
(x_start, segment_y_start),
(x_end, segment_y_end),
colors["twoequalhalfs"]["9:16"],
thickness,
)
cv2.putText(
visualization,
f"9:16 Equal 2",
(x_start + 5, segment_y_start + 30),
font,
font_scale,
colors["twoequalhalfs"]["9:16"],
font_thickness,
)
# Draw center point of person(s)
center_radius = 8
cv2.circle(
visualization,
(original_center_x, original_center_y),
center_radius,
(255, 255, 255),
-1,
)
cv2.circle(
visualization,
(original_center_x, original_center_y),
center_radius,
(0, 0, 0),
2,
)
# Convert back to PIL format
visualization_pil = Image.fromarray(cv2.cvtColor(visualization, cv2.COLOR_BGR2RGB))
return visualization_pil
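# Illustrative usage (sketch; `middle` is a PIL frame and `viz` is the visualization_data
# dict returned by create_layouts for that frame):
#   overlay = draw_layout_regions(middle, 4, 15, viz, layout_type="threehalfs")
#   overlay.save("threehalfs_overlay.jpg")  # hypothetical output path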
def get_image_crop(cid=None, rsid=None, uid=None, ct=None):
"""
Function that returns both standard and layout variations for visualization.
Returns:
gr.Gallery: Gallery of all generated images
"""
try:
sprites_data = get_sprite_firebase(cid, rsid, uid)
image_paths = [sprite_data["url"] for sprite_data in sprites_data]
durations = [sprite_data["duration"] for sprite_data in sprites_data]
except Exception:
image_paths = [
# "data/C2-Roll3D-i2x-Take2-Nov19.24-PST02.31.31pm.jpg",
# "data/E2-HamzaA-i2x-Take2-Nov19.24-PST02.31.31pm.jpg",
"data/F2-Roll4D-i2x-Take2-Nov19.24-PST02.31.31pm.jpg",
"data/G2-Roll5D-i2x-Take2-Nov19.24-PST02.31.31pm.jpg",
"data/C1-Roll10D-i1x-Take2-Mar20.25-PST12.14.56pm.jpg",
"data/C2-Roll10D-i2x-Take2-Mar20.25-PST12.14.56pm.jpg",
]
# Lists to store all images
all_images = []
all_captions = []
for image_path in image_paths:
# Load image (from local file or URL)
try:
if image_path.startswith(("http://", "https://")):
response = requests.get(image_path)
input_image = Image.open(BytesIO(response.content))
else:
input_image = Image.open(image_path)
except Exception as e:
print(f"Error loading image {image_path}: {e}")
continue
# Get the middle thumbnail
mid_image = get_middle_thumbnail(input_image)
# Add numbered divisions for GPT-4V analysis
numbered_mid_image = add_top_numbers(
input_image=mid_image,
num_divisions=20,
margin=50,
font_size=30,
dot_spacing=20,
)
# Analyze the image to get optimal crop divisions
# This uses GPT-4V to identify the optimal crop points
(left_division, right_division, num_of_speakers) = analyze_image(
numbered_mid_image, remove_unwanted_prompt(1), mid_image, ct
)
# Safety check for divisions
if left_division <= 0:
left_division = 1
if right_division > 20:
right_division = 20
if left_division >= right_division:
left_division = 1
right_division = 20
print(f"Using divisions: left={left_division}, right={right_division}")
# Create layouts and cutouts using the new function
(
standard_crops,
threehalfs_layouts,
twothirdhalfs_layouts,
twoequalhalfs_layouts,
visualization_data,
) = create_layouts(mid_image, left_division, right_division, num_of_speakers)
# Create all the required visualizations
# 1. Standard aspect ratio visualization (16:9 and 9:16)
standard_visualization = draw_layout_regions(
mid_image, left_division, right_division, visualization_data, "standard"
)
all_images.append(standard_visualization)
all_captions.append(
f"Standard Aspect Ratios (16:9 & 9:16) {standard_visualization.size}"
)
# Add input and middle image to gallery
all_images.append(input_image)
all_captions.append(f"Input Image {input_image.size}")
all_images.append(mid_image)
all_captions.append(f"Middle Thumbnail {mid_image.size}")
# Add standard crops
for key, crop in standard_crops.items():
all_images.append(crop)
all_captions.append(f"{key} {crop.size}")
# Add threehalfs layouts
for key, layout in threehalfs_layouts.items():
all_images.append(layout)
all_captions.append(f"Three Halfs {key} {layout.size}")
# Add twothirdhalfs layouts
for key, layout in twothirdhalfs_layouts.items():
all_images.append(layout)
all_captions.append(f"Two-Thirds Halfs {key} {layout.size}")
# Add twoequalhalfs layouts
for key, layout in twoequalhalfs_layouts.items():
all_images.append(layout)
all_captions.append(f"Two Equal Halfs {key} {layout.size}")
# Return gallery with all images
return gr.Gallery(value=list(zip(all_images, all_captions)))
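# Illustrative wiring (sketch; assumes a Gradio Blocks app defined elsewhere in this Space):
#   with gr.Blocks() as demo:
#       gallery = gr.Gallery(label="Crops")
#       btn = gr.Button("Generate crops")
#       btn.click(get_image_crop, inputs=None, outputs=gallery)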