import base64 import os from io import BytesIO import cv2 import gradio as gr import numpy as np import pyrebase import requests from openai import OpenAI from PIL import Image, ImageDraw, ImageFont from ultralytics import YOLO from prompts import remove_unwanted_prompt model = YOLO("yolo11n.pt") def get_middle_thumbnail(input_image: Image, grid_size=(10, 10), padding=3): """ Extract the middle thumbnail from a sprite sheet, handling different aspect ratios and removing padding. Args: input_image: PIL Image grid_size: Tuple of (columns, rows) padding: Number of padding pixels on each side (default 3) Returns: PIL.Image: The middle thumbnail image with padding removed """ sprite_sheet = input_image # Calculate thumbnail dimensions based on actual sprite sheet size sprite_width, sprite_height = sprite_sheet.size thumb_width_with_padding = sprite_width // grid_size[0] thumb_height_with_padding = sprite_height // grid_size[1] # Remove padding to get actual image dimensions thumb_width = thumb_width_with_padding - (2 * padding) # 726 - 6 = 720 thumb_height = thumb_height_with_padding - (2 * padding) # varies based on input # Calculate the middle position total_thumbs = grid_size[0] * grid_size[1] middle_index = total_thumbs // 2 # Calculate row and column of middle thumbnail middle_row = middle_index // grid_size[0] middle_col = middle_index % grid_size[0] # Calculate pixel coordinates for cropping, including padding offset left = (middle_col * thumb_width_with_padding) + padding top = (middle_row * thumb_height_with_padding) + padding right = left + thumb_width # Don't add padding here bottom = top + thumb_height # Don't add padding here # Crop and return the middle thumbnail middle_thumb = sprite_sheet.crop((left, top, right, bottom)) return middle_thumb def encode_image_to_base64(image: Image.Image, format: str = "JPEG") -> str: """ Convert a PIL image to a base64 string. Args: image: PIL Image object format: Image format to use for encoding (default: PNG) Returns: Base64 encoded string of the image """ buffered = BytesIO() image.save(buffered, format=format) return base64.b64encode(buffered.getvalue()).decode("utf-8") def add_top_numbers( input_image, num_divisions=20, margin=90, font_size=70, dot_spacing=20, ): """ Add numbered divisions across the top and bottom of any image with dotted vertical lines. Args: input_image (Image): PIL Image num_divisions (int): Number of divisions to create margin (int): Size of margin in pixels for numbers font_size (int): Font size for numbers dot_spacing (int): Spacing between dots in pixels """ # Load the image original_image = input_image # Create new image with extra space for numbers on top and bottom new_width = original_image.width new_height = original_image.height + ( 2 * margin ) # Add margin to both top and bottom new_image = Image.new("RGB", (new_width, new_height), "white") # Paste original image in the middle new_image.paste(original_image, (0, margin)) # Initialize drawing context draw = ImageDraw.Draw(new_image) try: font = ImageFont.truetype("arial.ttf", font_size) except OSError: print("Using default font") font = ImageFont.load_default(size=font_size) # Calculate division width division_width = original_image.width / num_divisions # Draw division numbers and dotted lines for i in range(num_divisions): x = (i * division_width) + (division_width / 2) # Draw number at top draw.text((x, margin // 2), str(i + 1), fill="black", font=font, anchor="mm") # Draw number at bottom draw.text( (x, new_height - (margin // 2)), str(i + 1), fill="black", font=font, anchor="mm", ) # Draw dotted line from top margin to bottom margin y_start = margin y_end = new_height - margin # Draw dots with specified spacing current_y = y_start while current_y < y_end: draw.circle( [x - 1, current_y - 1, x + 1, current_y + 1], fill="black", width=5, radius=3, ) current_y += dot_spacing return new_image def analyze_image(numbered_input_image: Image, prompt, input_image, ct): """ Perform inference on an image using GPT-4V. Args: numbered_input_image (Image): PIL Image prompt (str): The prompt/question about the image input_image (Image): input image without numbers Returns: str: The model's response """ client = OpenAI() base64_image = encode_image_to_base64(numbered_input_image, format="JPEG") messages = [ { "role": "user", "content": [ {"type": "text", "text": prompt}, { "type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}, }, ], } ] response = client.chat.completions.create( model="gpt-4o", messages=messages, max_tokens=300 ) messages.extend( [ {"role": "assistant", "content": response.choices[0].message.content}, { "role": "user", "content": "please return the response in the json with keys left_row, right_row, and num_of_speakers", }, ], ) response = ( client.chat.completions.create(model="gpt-4o", messages=messages) .choices[0] .message.content ) left_index = response.find("{") right_index = response.rfind("}") try: if left_index != -1 and right_index != -1: print(response[left_index : right_index + 1]) response_json = eval(response[left_index : right_index + 1]) except Exception as e: print(e) return 0, 20 return ( response_json["left_row"], response_json["right_row"], response_json["num_of_speakers"], ) def get_sprite_firebase(cid, rsid, uid): config = { "apiKey": f"{os.getenv('FIREBASE_API_KEY')}", "authDomain": f"{os.getenv('FIREBASE_AUTH_DOMAIN')}", "databaseURL": f"{os.getenv('FIREBASE_DATABASE_URL')}", "projectId": f"{os.getenv('FIREBASE_PROJECT_ID')}", "storageBucket": f"{os.getenv('FIREBASE_STORAGE_BUCKET')}", "messagingSenderId": f"{os.getenv('FIREBASE_MESSAGING_SENDER_ID')}", "appId": f"{os.getenv('FIREBASE_APP_ID')}", "measurementId": f"{os.getenv('FIREBASE_MEASUREMENT_ID')}", } firebase = pyrebase.initialize_app(config) db = firebase.database() account_id = os.getenv("ROLL_ACCOUNT") COLLAB_EDIT_LINK = "collab_sprite_link_handler" path = f"{account_id}/{COLLAB_EDIT_LINK}/{uid}/{cid}/{rsid}" data = db.child(path).get() return data.val() def find_persons_center(image, num_of_speakers=1): """ Find the center point of the largest num_of_speakers persons in the image. If multiple persons are detected, merge the bounding boxes of only the largest ones. Args: image: CV2/numpy array image num_of_speakers: Number of speakers to consider (default: 1) Returns: int: x-coordinate of the center point of all considered persons """ # Detect persons (class 0 in COCO dataset) results = model(image, classes=[0], conf=0.6) if not results or len(results[0].boxes) == 0: # If no persons detected, return center of image return image.shape[1] // 2 # Get all person boxes boxes = results[0].boxes.xyxy.cpu().numpy() # Print the number of persons detected (for debugging) print(f"Detected {len(boxes)} persons in the image") if len(boxes) == 1: # If only one person, return center of their bounding box x1, _, x2, _ = boxes[0] center_x = int((x1 + x2) // 2) print(f"Single person detected at center x: {center_x}") return center_x else: # Multiple persons - consider only the largest num_of_speakers boxes # Calculate area for each box box_areas = [(box[2] - box[0]) * (box[3] - box[1]) for box in boxes] # Sort boxes by area (largest first) and take top num_of_speakers sorted_indices = sorted( range(len(box_areas)), key=lambda i: box_areas[i], reverse=True ) # Use all available boxes if fewer detected than requested num_boxes_to_use = min(num_of_speakers, len(boxes)) selected_indices = sorted_indices[:num_boxes_to_use] selected_boxes = [boxes[i] for i in selected_indices] # Create a merged bounding box from selected boxes left_x = min(box[0] for box in selected_boxes) right_x = max(box[2] for box in selected_boxes) merged_center_x = int((left_x + right_x) // 2) print( f"{num_boxes_to_use} largest persons merged bounding box center x: {merged_center_x}" ) print(f"Merged bounds: left={left_x}, right={right_x}") return merged_center_x def create_layouts(image, left_division, right_division, num_of_speakers): """ Create different layout variations of the image using specific aspect ratios. All layout variations will be centered on detected persons. Args: image: PIL Image left_division: Left division index (1-20) right_division: Right division index (1-20) Returns: tuple: (standard_crops, threehalfs_layouts, twothirdhalfs_layouts, twoequalhalfs_layouts, visualization_data) """ # Convert PIL Image to cv2 format if isinstance(image, Image.Image): image_cv = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR) else: image_cv = image.copy() # Get image dimensions height, width = image_cv.shape[:2] # Calculate division width and crop boundaries division_width = width / 20 # Assuming 20 divisions left_boundary = int((left_division - 1) * division_width) right_boundary = int(right_division * division_width) # 1. Create cutout image based on divisions cutout_image = image_cv[:, left_boundary:right_boundary].copy() cutout_width = right_boundary - left_boundary cutout_height = cutout_image.shape[0] # 2. Run YOLO on cutout to get person bounding box and center results = model(cutout_image, classes=[0], conf=0.6) # Default center if no detection cutout_center_x = cutout_image.shape[1] // 2 cutout_center_y = cutout_height // 2 # Default values for bounding box person_top = 0.0 person_height = float(cutout_height) if results and len(results[0].boxes) > 0: # Get person detection boxes = results[0].boxes.xyxy.cpu().numpy() if len(boxes) == 1: # Single person x1, y1, x2, y2 = boxes[0] cutout_center_x = int((x1 + x2) // 2) cutout_center_y = int((y1 + y2) // 2) person_top = y1 person_height = y2 - y1 else: # Multiple persons - consider only the largest num_of_speakers boxes # Calculate area for each box box_areas = [(box[2] - box[0]) * (box[3] - box[1]) for box in boxes] # Sort boxes by area (largest first) and take top num_of_speakers sorted_indices = sorted( range(len(box_areas)), key=lambda i: box_areas[i], reverse=True ) # Use all available boxes if fewer detected than requested num_boxes_to_use = min(num_of_speakers, len(boxes)) selected_indices = sorted_indices[:num_boxes_to_use] selected_boxes = [boxes[i] for i in selected_indices] # Merge bounding boxes of selected boxes left_x = min(box[0] for box in selected_boxes) right_x = max(box[2] for box in selected_boxes) top_y = min(box[1] for box in selected_boxes) # Top of highest person bottom_y = max(box[3] for box in selected_boxes) # Bottom of lowest person cutout_center_x = int((left_x + right_x) // 2) cutout_center_y = int((top_y + bottom_y) // 2) person_top = top_y person_height = bottom_y - top_y # 3. Create 16:9 and 9:16 versions with person properly framed aspect_16_9 = 16 / 9 aspect_9_16 = 9 / 16 # For 16:9 version (with 5% margin above person) target_height_16_9 = int(cutout_width / aspect_16_9) if target_height_16_9 <= cutout_height: # Calculate 5% of person height for top margin top_margin = int(person_height * 0.05) # Start 5% above the person's top y_start = int(max(0, person_top - top_margin)) # If this would make the crop exceed the bottom, adjust y_start if y_start + target_height_16_9 > cutout_height: y_start = int(max(0, cutout_height - target_height_16_9)) y_end = int(min(cutout_height, y_start + target_height_16_9)) cutout_16_9 = cutout_image[y_start:y_end, :].copy() else: # Handle rare case where we need to adjust width (not expected with normal images) new_width = int(cutout_height * aspect_16_9) x_start = max( 0, min(cutout_width - new_width, cutout_center_x - new_width // 2) ) x_end = min(cutout_width, x_start + new_width) cutout_16_9 = cutout_image[:, x_start:x_end].copy() # For 9:16 version (centered on person, adjusted upward for face visibility) target_width_9_16 = int(cutout_height * aspect_9_16) # Adjust center point upward by 20% of person height to ensure face is visible adjusted_center_y = int(cutout_center_y - (person_height * 0.2)) if target_width_9_16 <= cutout_width: # Center horizontally around person x_start = int( max( 0, min( cutout_width - target_width_9_16, cutout_center_x - target_width_9_16 // 2, ), ) ) x_end = int(min(cutout_width, x_start + target_width_9_16)) # Use adjusted center point for vertical positioning y_start = int( max( 0, min( cutout_height - cutout_height, adjusted_center_y - cutout_height // 2, ), ) ) cutout_9_16 = cutout_image[y_start:, x_start:x_end].copy() else: # Handle rare case where we need to adjust height new_height = int(cutout_width / aspect_9_16) # Use adjusted center point for vertical positioning y_start = int( max(0, min(cutout_height - new_height, adjusted_center_y - new_height // 2)) ) y_end = int(min(cutout_height, y_start + new_height)) cutout_9_16 = cutout_image[y_start:y_end, :].copy() # 4. Scale the center back to original image coordinates original_center_x = left_boundary + cutout_center_x original_center_y = cutout_center_y original_person_top = person_top # Store visualization data for drawing visualization_data = { "original_center_x": original_center_x, "original_center_y": original_center_y, "original_person_top": original_person_top, "original_person_height": person_height, "cutout_bounds": (left_boundary, right_boundary), } # 5. Create new layout variations - each segment is independently centered on the subject # ----- Create crops for threehalfs layout ----- # For 16:9 (three 5.3:9 segments, each independently centered) aspect_5_3_9 = 5.3 / 9 # Calculate dimensions for each segment segment_height_16_9 = cutout_height # Use full height segment_width_16_9 = int(segment_height_16_9 * aspect_5_3_9) # Create three segments for 16:9 threehalfs - all centered on the person threehalfs_16_9_segments = [] for i in range(3): # Each segment is centered on the person segment_x_start = int( max( 0, min( cutout_width - segment_width_16_9, cutout_center_x - segment_width_16_9 // 2, ), ) ) segment_x_end = int(min(cutout_width, segment_x_start + segment_width_16_9)) # Create the segment segment = cutout_image[:, segment_x_start:segment_x_end].copy() # Add a label for visualization label = f"Part {i+1}" cv2.putText( segment, label, (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 0.8, (255, 255, 255), 2, cv2.LINE_AA, ) threehalfs_16_9_segments.append(segment) # For 9:16 (three 9:5.3 segments, each independently centered) aspect_9_5_3 = 9 / 5.3 # Calculate dimensions for each segment segment_width_9_16 = cutout_9_16.shape[1] # Use full width of 9:16 crop segment_height_9_16 = int(segment_width_9_16 / aspect_9_5_3) # Get adjusted center for 9:16 segments (move up by 20% of person height) cutout_9_16_center_y = cutout_9_16.shape[0] // 2 adjusted_9_16_center_y = int(cutout_9_16_center_y - (person_height * 0.2)) cutout_9_16_height = cutout_9_16.shape[0] # Create three segments for 9:16 threehalfs - all centered on the person threehalfs_9_16_segments = [] for i in range(3): # Each segment is centered on the person with adjusted center point segment_y_start = int( max( 0, min( cutout_9_16_height - segment_height_9_16, person_top, ), ) ) segment_y_end = int( min(cutout_9_16_height, segment_y_start + segment_height_9_16) ) # Create the segment segment = cutout_9_16[segment_y_start:segment_y_end, :].copy() # Add a label for visualization label = f"Part {i+1}" cv2.putText( segment, label, (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 0.8, (255, 255, 255), 2, cv2.LINE_AA, ) threehalfs_9_16_segments.append(segment) # ----- Create crops for twothirdhalfs layout ----- # For 16:9 (two segments: 10.6:9 and 5.3:9 OR 5.3:9 and 10.6:9) aspect_10_6_9 = 10.6 / 9 # Calculate dimensions for segments segment1_height_16_9 = cutout_height # Use full height segment1_width_16_9 = int(segment1_height_16_9 * aspect_10_6_9) segment2_height_16_9 = cutout_height # Use full height segment2_width_16_9 = int(segment2_height_16_9 * aspect_5_3_9) # Create segments for 16:9 twothirdhalfs var1 (10.6:9 then 5.3:9) # Both segments independently centered on the person # First segment (10.6:9) segment_x_start = int( max( 0, min( cutout_width - segment1_width_16_9, cutout_center_x - segment1_width_16_9 // 2, ), ) ) segment_x_end = int(min(cutout_width, segment_x_start + segment1_width_16_9)) segment1 = cutout_image[:, segment_x_start:segment_x_end].copy() # Add label cv2.putText( segment1, "10.6:9", (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 0.8, (255, 255, 255), 2, cv2.LINE_AA, ) # Second segment (5.3:9) segment_x_start = int( max( 0, min( cutout_width - segment2_width_16_9, cutout_center_x - segment2_width_16_9 // 2, ), ) ) segment_x_end = int(min(cutout_width, segment_x_start + segment2_width_16_9)) segment2 = cutout_image[:, segment_x_start:segment_x_end].copy() # Add label cv2.putText( segment2, "5.3:9", (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 0.8, (255, 255, 255), 2, cv2.LINE_AA, ) twothirdhalfs_16_9_var1_segments = [segment1, segment2] # Create segments for 16:9 twothirdhalfs var2 (5.3:9 then 10.6:9) # First segment (5.3:9) - reuse segment2 from var1 # Second segment (10.6:9) - reuse segment1 from var1 twothirdhalfs_16_9_var2_segments = [segment2.copy(), segment1.copy()] # For 9:16 (two segments stacked: 9:10.6 and 9:5.3 OR 9:5.3 and 9:10.6) aspect_9_10_6 = 9 / 10.6 aspect_9_5_3 = 9 / 5.3 # Calculate dimensions for segments segment1_width_9_16 = cutout_9_16.shape[1] # Use full width of 9:16 crop segment1_height_9_16 = int(segment1_width_9_16 / aspect_9_10_6) segment2_width_9_16 = cutout_9_16.shape[1] # Use full width of 9:16 crop segment2_height_9_16 = int(segment2_width_9_16 / aspect_9_5_3) # Create segments for 9:16 twothirdhalfs var1 (9:10.6 then 9:5.3) # Both segments independently centered on the person with adjusted center point # First segment (9:10.6) segment_y_start = int( max( 0, min( cutout_9_16_height - segment1_height_9_16, adjusted_9_16_center_y - segment1_height_9_16 // 2, ), ) ) segment_y_end = int(min(cutout_9_16_height, segment_y_start + segment1_height_9_16)) segment1 = cutout_9_16[segment_y_start:segment_y_end, :].copy() # Add label cv2.putText( segment1, "9:10.6", (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 0.8, (255, 255, 255), 2, cv2.LINE_AA, ) # Second segment (9:5.3) segment_y_start = int( max( 0, min( cutout_9_16_height - segment2_height_9_16, person_top, ), ) ) segment_y_end = int(min(cutout_9_16_height, segment_y_start + segment2_height_9_16)) segment2 = cutout_9_16[segment_y_start:segment_y_end, :].copy() # Add label cv2.putText( segment2, "9:5.3", (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 0.8, (255, 255, 255), 2, cv2.LINE_AA, ) twothirdhalfs_9_16_var1_segments = [segment1, segment2] # Create segments for 9:16 twothirdhalfs var2 (9:5.3 then 9:10.6) # First segment (9:5.3) - reuse segment2 from var1 # Second segment (9:10.6) - reuse segment1 from var1 twothirdhalfs_9_16_var2_segments = [segment2.copy(), segment1.copy()] # ----- Create crops for twoequalhalfs layout ----- # For 16:9 (two 8:9 segments side by side) aspect_8_9 = 8 / 9 # Calculate dimensions for segments segment_height_16_9_equal = cutout_height # Use full height segment_width_16_9_equal = int(segment_height_16_9_equal * aspect_8_9) # Create segments for 16:9 twoequalhalfs - both centered on the person # First segment (8:9) segment_x_start = int( max( 0, min( cutout_width - segment_width_16_9_equal, cutout_center_x - segment_width_16_9_equal // 2, ), ) ) segment_x_end = int(min(cutout_width, segment_x_start + segment_width_16_9_equal)) segment1 = cutout_image[:, segment_x_start:segment_x_end].copy() # Add label cv2.putText( segment1, "8:9 (1)", (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 0.8, (255, 255, 255), 2, cv2.LINE_AA, ) # Second segment (identical to first for equal halfs) segment2 = segment1.copy() # Update label for segment 2 cv2.putText( segment2, "8:9 (2)", (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 0.8, (255, 255, 255), 2, cv2.LINE_AA, ) twoequalhalfs_16_9_segments = [segment1, segment2] # For 9:16 (two 9:8 segments stacked) aspect_9_8 = 9 / 8 # Calculate dimensions for segments segment_width_9_16_equal = cutout_9_16.shape[1] # Use full width of 9:16 crop segment_height_9_16_equal = int(segment_width_9_16_equal / aspect_9_8) # Create segments for 9:16 twoequalhalfs - both centered on the person with adjusted center point # First segment (9:8) segment_y_start = int( max( 0, min( cutout_9_16_height - segment_height_9_16_equal, max(0, person_top - person_height * 0.05), ), ) ) segment_y_end = int( min(cutout_9_16_height, segment_y_start + segment_height_9_16_equal) ) segment1 = cutout_9_16[segment_y_start:segment_y_end, :].copy() # Add label cv2.putText( segment1, "9:8 (1)", (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 0.8, (255, 255, 255), 2, cv2.LINE_AA, ) # Second segment (identical to first for equal halfs) segment2 = segment1.copy() # Update label for segment 2 cv2.putText( segment2, "9:8 (2)", (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 0.8, (255, 255, 255), 2, cv2.LINE_AA, ) twoequalhalfs_9_16_segments = [segment1, segment2] # 6. Create composite layouts by joining segments # Function to create a composite image def create_composite(segments, horizontal=True): if not segments: return None if horizontal: # Calculate the total width and max height total_width = sum(segment.shape[1] for segment in segments) max_height = max(segment.shape[0] for segment in segments) # Create a canvas composite = np.zeros((max_height, total_width, 3), dtype=np.uint8) # Place segments side by side x_offset = 0 for segment in segments: h, w = segment.shape[:2] composite[:h, x_offset : x_offset + w] = segment x_offset += w else: # vertical stacking # Calculate the max width and total height max_width = max(segment.shape[1] for segment in segments) total_height = sum(segment.shape[0] for segment in segments) # Create a canvas composite = np.zeros((total_height, max_width, 3), dtype=np.uint8) # Place segments top to bottom y_offset = 0 for segment in segments: h, w = segment.shape[:2] composite[y_offset : y_offset + h, :w] = segment y_offset += h return composite # Create composite layouts threehalfs_16_9_composite = create_composite( threehalfs_16_9_segments, horizontal=True ) threehalfs_9_16_composite = create_composite( threehalfs_9_16_segments, horizontal=False ) twothirdhalfs_16_9_var1_composite = create_composite( twothirdhalfs_16_9_var1_segments, horizontal=True ) twothirdhalfs_16_9_var2_composite = create_composite( twothirdhalfs_16_9_var2_segments, horizontal=True ) twothirdhalfs_9_16_var1_composite = create_composite( twothirdhalfs_9_16_var1_segments, horizontal=False ) twothirdhalfs_9_16_var2_composite = create_composite( twothirdhalfs_9_16_var2_segments, horizontal=False ) twoequalhalfs_16_9_composite = create_composite( twoequalhalfs_16_9_segments, horizontal=True ) twoequalhalfs_9_16_composite = create_composite( twoequalhalfs_9_16_segments, horizontal=False ) # Add labels to all composites def add_label(img, label): if img is None: return None font = cv2.FONT_HERSHEY_SIMPLEX label_settings = { "fontScale": 1.0, "fontFace": font, "thickness": 2, } # Draw background for text text_size = cv2.getTextSize( label, fontFace=label_settings["fontFace"], fontScale=label_settings["fontScale"], thickness=label_settings["thickness"], ) cv2.rectangle( img, (10, 10), (10 + text_size[0][0] + 10, 10 + text_size[0][1] + 10), (0, 0, 0), -1, ) # Black background # Draw text cv2.putText( img, label, (15, 15 + text_size[0][1]), fontFace=label_settings["fontFace"], fontScale=label_settings["fontScale"], thickness=label_settings["thickness"], color=(255, 255, 255), lineType=cv2.LINE_AA, ) return img # Label the basic crops cutout_image_labeled = add_label(cutout_image.copy(), "Cutout") cutout_16_9_labeled = add_label(cutout_16_9.copy(), "16:9") cutout_9_16_labeled = add_label(cutout_9_16.copy(), "9:16") # Label the composite layouts threehalfs_16_9_labeled = add_label(threehalfs_16_9_composite, "Three Halfs 16:9") threehalfs_9_16_labeled = add_label(threehalfs_9_16_composite, "Three Halfs 9:16") twothirdhalfs_16_9_var1_labeled = add_label( twothirdhalfs_16_9_var1_composite, "Two Thirds Var1 16:9" ) twothirdhalfs_16_9_var2_labeled = add_label( twothirdhalfs_16_9_var2_composite, "Two Thirds Var2 16:9" ) twothirdhalfs_9_16_var1_labeled = add_label( twothirdhalfs_9_16_var1_composite, "Two Thirds Var1 9:16" ) twothirdhalfs_9_16_var2_labeled = add_label( twothirdhalfs_9_16_var2_composite, "Two Thirds Var2 9:16" ) twoequalhalfs_16_9_labeled = add_label( twoequalhalfs_16_9_composite, "Two Equal Halfs 16:9" ) twoequalhalfs_9_16_labeled = add_label( twoequalhalfs_9_16_composite, "Two Equal Halfs 9:16" ) # Convert all output images to PIL format def cv2_to_pil(img): if img is None: return None return Image.fromarray(cv2.cvtColor(img, cv2.COLOR_BGR2RGB)) # Convert standard crops standard_crops = { "cutout": cv2_to_pil(cutout_image_labeled), "16:9": cv2_to_pil(cutout_16_9_labeled), "9:16": cv2_to_pil(cutout_9_16_labeled), } # Convert threehalfs layouts threehalfs_layouts = { "16:9": cv2_to_pil(threehalfs_16_9_labeled), "9:16": cv2_to_pil(threehalfs_9_16_labeled), } # Convert twothirdhalfs layouts twothirdhalfs_layouts = { "16:9_var1": cv2_to_pil(twothirdhalfs_16_9_var1_labeled), "16:9_var2": cv2_to_pil(twothirdhalfs_16_9_var2_labeled), "9:16_var1": cv2_to_pil(twothirdhalfs_9_16_var1_labeled), "9:16_var2": cv2_to_pil(twothirdhalfs_9_16_var2_labeled), } # Convert twoequalhalfs layouts twoequalhalfs_layouts = { "16:9": cv2_to_pil(twoequalhalfs_16_9_labeled), "9:16": cv2_to_pil(twoequalhalfs_9_16_labeled), } return ( standard_crops, threehalfs_layouts, twothirdhalfs_layouts, twoequalhalfs_layouts, visualization_data, ) def draw_layout_regions( image, left_division, right_division, visualization_data, layout_type ): """ Create a visualization showing the layout regions overlaid on the original image. Each region is independently centered on the subject, as in practice different videos would be stacked in these layouts. Args: image: PIL Image left_division: Left division index (1-20) right_division: Right division index (1-20) visualization_data: Dictionary with visualization data from create_layouts layout_type: Type of layout to visualize (e.g., "standard", "threehalfs", "twothirdhalfs_var1", etc.) Returns: PIL Image: Original image with layout regions visualized """ # Convert PIL Image to cv2 format if isinstance(image, Image.Image): image_cv = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR) else: image_cv = image.copy() # Get a clean copy for drawing visualization = image_cv.copy() # Get image dimensions height, width = image_cv.shape[:2] # Extract visualization data original_center_x = visualization_data["original_center_x"] original_center_y = visualization_data["original_center_y"] original_person_top = visualization_data["original_person_top"] original_person_height = visualization_data["original_person_height"] left_boundary, right_boundary = visualization_data["cutout_bounds"] cutout_width = right_boundary - left_boundary # Define colors for different layouts (BGR format) colors = { "standard": {"16:9": (0, 255, 0), "9:16": (255, 0, 0)}, # Green, Blue "threehalfs": {"16:9": (0, 165, 255), "9:16": (255, 255, 0)}, # Orange, Cyan "twothirdhalfs_var1": { "16:9": (255, 0, 255), "9:16": (128, 0, 128), }, # Magenta, Purple "twothirdhalfs_var2": { "16:9": (0, 255, 255), "9:16": (128, 128, 0), }, # Yellow, Teal "twoequalhalfs": { "16:9": (0, 128, 128), "9:16": (255, 165, 0), }, # Dark Cyan, Blue-Green } # Define line thickness and font thickness = 3 font = cv2.FONT_HERSHEY_SIMPLEX font_scale = 0.8 font_thickness = 2 # Draw standard layouts (16:9 and 9:16) if layout_type == "standard": # Draw 16:9 crop aspect_16_9 = 16 / 9 target_height_16_9 = int(cutout_width / aspect_16_9) # Calculate 20% of person height for top margin top_margin = int(original_person_height * 0.05) y_start = int(max(0, original_person_top - top_margin)) if y_start + target_height_16_9 > height: y_start = int(max(0, height - target_height_16_9)) y_end = int(min(height, y_start + target_height_16_9)) cv2.rectangle( visualization, (left_boundary, y_start), (right_boundary, y_end), colors["standard"]["16:9"], thickness, ) cv2.putText( visualization, "16:9", (left_boundary + 5, y_start + 30), font, font_scale, colors["standard"]["16:9"], font_thickness, ) # Draw 9:16 crop aspect_9_16 = 9 / 16 target_width_9_16 = int(height * aspect_9_16) x_start = max( 0, min(width - target_width_9_16, original_center_x - target_width_9_16 // 2), ) x_end = x_start + target_width_9_16 cv2.rectangle( visualization, (x_start, 0), (x_end, height), colors["standard"]["9:16"], thickness, ) cv2.putText( visualization, "9:16", (x_start + 5, 30), font, font_scale, colors["standard"]["9:16"], font_thickness, ) # Draw threehalfs layouts - each segment is centered on the subject elif layout_type == "threehalfs": # For 16:9 (three 5.3:9 segments side by side - visually only) aspect_5_3_9 = 5.3 / 9 segment_height = height segment_width = int(segment_height * aspect_5_3_9) # Calculate total width for visualization purposes total_width = segment_width * 3 start_x = max(0, original_center_x - total_width // 2) for i in range(3): # For visualization, we'll place them side by side vis_segment_x_start = start_x + i * segment_width vis_segment_x_end = vis_segment_x_start + segment_width # But each segment would actually be centered on the subject independently # Here we also draw the centered version more faintly actual_segment_x_start = max( 0, min(width - segment_width, original_center_x - segment_width // 2) ) actual_segment_x_end = min(width, actual_segment_x_start + segment_width) # Draw the visualization placement (side by side) cv2.rectangle( visualization, (vis_segment_x_start, 0), (vis_segment_x_end, segment_height), colors["threehalfs"]["16:9"], thickness, ) # Draw the actual centered placement with dashed lines if i > 0: # Only draw centered versions for parts 2 and 3 for j in range(0, segment_height, 20): # Dashed line effect if j % 40 < 20: # Skip every other segment cv2.line( visualization, (actual_segment_x_start, j), (actual_segment_x_start, min(j + 20, segment_height)), colors["threehalfs"]["16:9"], 1, ) cv2.line( visualization, (actual_segment_x_end, j), (actual_segment_x_end, min(j + 20, segment_height)), colors["threehalfs"]["16:9"], 1, ) cv2.putText( visualization, f"16:9 Part {i+1}", (vis_segment_x_start + 5, 30 + i * 30), font, font_scale, colors["threehalfs"]["16:9"], font_thickness, ) # For 9:16 (three 9:5.3 segments stacked top to bottom - visually only) aspect_9_16 = 9 / 16 target_width_9_16 = int(height * aspect_9_16) x_start = max( 0, min(width - target_width_9_16, original_center_x - target_width_9_16 // 2), ) x_end = x_start + target_width_9_16 aspect_9_5_3 = 9 / 5.3 segment_width_9_16 = target_width_9_16 segment_height_9_16 = int(segment_width_9_16 / aspect_9_5_3) # Calculate total height for visualization purposes total_height = segment_height_9_16 * 3 start_y = max(0, height // 2 - total_height // 2) for i in range(3): # For visualization, we'll place them stacked vis_segment_y_start = start_y + i * segment_height_9_16 vis_segment_y_end = min(height, vis_segment_y_start + segment_height_9_16) # But each segment would actually be centered on the subject independently # Here we also draw the centered version more faintly actual_segment_y_start = max( 0, min( height - segment_height_9_16, original_center_y - segment_height_9_16 // 2, ), ) actual_segment_y_end = min( height, actual_segment_y_start + segment_height_9_16 ) # Draw the visualization placement (stacked) cv2.rectangle( visualization, (x_start, vis_segment_y_start), (x_end, vis_segment_y_end), colors["threehalfs"]["9:16"], thickness, ) # Draw the actual centered placement with dashed lines if i > 0: # Only draw centered versions for parts 2 and 3 for j in range(x_start, x_end, 20): # Dashed line effect if j % 40 < 20: # Skip every other segment cv2.line( visualization, (j, actual_segment_y_start), (min(j + 20, x_end), actual_segment_y_start), colors["threehalfs"]["9:16"], 1, ) cv2.line( visualization, (j, actual_segment_y_end), (min(j + 20, x_end), actual_segment_y_end), colors["threehalfs"]["9:16"], 1, ) cv2.putText( visualization, f"9:16 Part {i+1}", (x_start + 5, vis_segment_y_start + 30), font, font_scale, colors["threehalfs"]["9:16"], font_thickness, ) # Draw twothirdhalfs layouts elif layout_type == "twothirdhalfs_var1" or layout_type == "twothirdhalfs_var2": aspect_key = "16:9" if layout_type.endswith("var1") else "9:16" layout_color = colors[ ( "twothirdhalfs_var1" if layout_type.endswith("var1") else "twothirdhalfs_var2" ) ][aspect_key] if aspect_key == "16:9": # For 16:9 (two segments side by side) aspect_10_6_9 = 10.6 / 9 aspect_5_3_9 = 5.3 / 9 segment1_height = height segment1_width = int( segment1_height * (aspect_10_6_9 if layout_type.endswith("var1") else aspect_5_3_9) ) segment2_height = height segment2_width = int( segment2_height * (aspect_5_3_9 if layout_type.endswith("var1") else aspect_10_6_9) ) # First segment segment_center_x = original_center_x - segment2_width // 4 segment_x_start = int( max( 0, min(width - segment1_width, segment_center_x - segment1_width // 2), ) ) segment_x_end = int(min(width, segment_x_start + segment1_width)) cv2.rectangle( visualization, (segment_x_start, 0), (segment_x_end, segment1_height), layout_color, thickness, ) cv2.putText( visualization, f"16:9 Part 1", (segment_x_start + 5, 30), font, font_scale, layout_color, font_thickness, ) # Second segment segment_center_x = original_center_x + segment1_width // 4 segment_x_start = int( max( 0, min(width - segment2_width, segment_center_x - segment2_width // 2), ) ) segment_x_end = int(min(width, segment_x_start + segment2_width)) cv2.rectangle( visualization, (segment_x_start, 0), (segment_x_end, segment2_height), layout_color, thickness, ) cv2.putText( visualization, f"16:9 Part 2", (segment_x_start + 5, 60), font, font_scale, layout_color, font_thickness, ) else: # aspect_key == "9:16" # For 9:16 (two segments stacked) aspect_9_16 = 9 / 16 target_width_9_16 = int(height * aspect_9_16) x_start = max( 0, min( width - target_width_9_16, original_center_x - target_width_9_16 // 2, ), ) x_end = x_start + target_width_9_16 aspect_9_10_6 = 9 / 10.6 aspect_9_5_3 = 9 / 5.3 segment1_width = target_width_9_16 segment1_height = int( segment1_width / (aspect_9_10_6 if layout_type.endswith("var1") else aspect_9_5_3) ) segment2_width = target_width_9_16 segment2_height = int( segment2_width / (aspect_9_5_3 if layout_type.endswith("var1") else aspect_9_10_6) ) # First segment (top) segment_y_start = 0 segment_y_end = min(height, segment_y_start + segment1_height) cv2.rectangle( visualization, (x_start, segment_y_start), (x_end, segment_y_end), layout_color, thickness, ) cv2.putText( visualization, f"9:16 Part 1", (x_start + 5, segment_y_start + 30), font, font_scale, layout_color, font_thickness, ) # Second segment (bottom) segment_y_start = segment_y_end segment_y_end = min(height, segment_y_start + segment2_height) cv2.rectangle( visualization, (x_start, segment_y_start), (x_end, segment_y_end), layout_color, thickness, ) cv2.putText( visualization, f"9:16 Part 2", (x_start + 5, segment_y_start + 30), font, font_scale, layout_color, font_thickness, ) # Draw twoequalhalfs layouts elif layout_type == "twoequalhalfs": # For 16:9 (two 8:9 segments side by side) aspect_8_9 = 8 / 9 segment_height = height segment_width = int(segment_height * aspect_8_9) # First segment (left) segment_center_x = original_center_x - segment_width // 2 segment_x_start = int( max(0, min(width - segment_width, segment_center_x - segment_width // 2)) ) segment_x_end = int(min(width, segment_x_start + segment_width)) cv2.rectangle( visualization, (segment_x_start, 0), (segment_x_end, segment_height), colors["twoequalhalfs"]["16:9"], thickness, ) cv2.putText( visualization, f"16:9 Equal 1", (segment_x_start + 5, 30), font, font_scale, colors["twoequalhalfs"]["16:9"], font_thickness, ) # Second segment (right) segment_center_x = original_center_x + segment_width // 2 segment_x_start = int( max(0, min(width - segment_width, segment_center_x - segment_width // 2)) ) segment_x_end = int(min(width, segment_x_start + segment_width)) cv2.rectangle( visualization, (segment_x_start, 0), (segment_x_end, segment_height), colors["twoequalhalfs"]["16:9"], thickness, ) cv2.putText( visualization, f"16:9 Equal 2", (segment_x_start + 5, 60), font, font_scale, colors["twoequalhalfs"]["16:9"], font_thickness, ) # For 9:16 (two 9:8 segments stacked) aspect_9_16 = 9 / 16 target_width_9_16 = int(height * aspect_9_16) x_start = max( 0, min(width - target_width_9_16, original_center_x - target_width_9_16 // 2), ) x_end = x_start + target_width_9_16 aspect_9_8 = 9 / 8 segment_width_9_16 = target_width_9_16 segment_height_9_16 = int(segment_width_9_16 / aspect_9_8) # First segment (top) segment_y_start = 0 segment_y_end = min(height, segment_y_start + segment_height_9_16) cv2.rectangle( visualization, (x_start, segment_y_start), (x_end, segment_y_end), colors["twoequalhalfs"]["9:16"], thickness, ) cv2.putText( visualization, f"9:16 Equal 1", (x_start + 5, segment_y_start + 30), font, font_scale, colors["twoequalhalfs"]["9:16"], font_thickness, ) # Second segment (bottom) segment_y_start = segment_y_end segment_y_end = min(height, segment_y_start + segment_height_9_16) cv2.rectangle( visualization, (x_start, segment_y_start), (x_end, segment_y_end), colors["twoequalhalfs"]["9:16"], thickness, ) cv2.putText( visualization, f"9:16 Equal 2", (x_start + 5, segment_y_start + 30), font, font_scale, colors["twoequalhalfs"]["9:16"], font_thickness, ) # Draw center point of person(s) center_radius = 8 cv2.circle( visualization, (original_center_x, original_center_y), center_radius, (255, 255, 255), -1, ) cv2.circle( visualization, (original_center_x, original_center_y), center_radius, (0, 0, 0), 2, ) # Convert back to PIL format visualization_pil = Image.fromarray(cv2.cvtColor(visualization, cv2.COLOR_BGR2RGB)) return visualization_pil def get_image_crop(cid=None, rsid=None, uid=None, ct=None): """ Function that returns both standard and layout variations for visualization. Returns: gr.Gallery: Gallery of all generated images """ try: sprites_data = get_sprite_firebase(cid, rsid, uid) image_paths = [sprite_data["url"] for sprite_data in sprites_data] durations = [sprite_data["duration"] for sprite_data in sprites_data] except Exception: image_paths = [ # "data/C2-Roll3D-i2x-Take2-Nov19.24-PST02.31.31pm.jpg", # "data/E2-HamzaA-i2x-Take2-Nov19.24-PST02.31.31pm.jpg", "data/F2-Roll4D-i2x-Take2-Nov19.24-PST02.31.31pm.jpg", "data/G2-Roll5D-i2x-Take2-Nov19.24-PST02.31.31pm.jpg", "data/C1-Roll10D-i1x-Take2-Mar20.25-PST12.14.56pm.jpg", "data/C2-Roll10D-i2x-Take2-Mar20.25-PST12.14.56pm.jpg", ] # Lists to store all images all_images = [] all_captions = [] for image_path in image_paths: # Load image (from local file or URL) try: if image_path.startswith(("http://", "https://")): response = requests.get(image_path) input_image = Image.open(BytesIO(response.content)) else: input_image = Image.open(image_path) except Exception as e: print(f"Error loading image {image_path}: {e}") continue # Get the middle thumbnail mid_image = get_middle_thumbnail(input_image) # Add numbered divisions for GPT-4V analysis numbered_mid_image = add_top_numbers( input_image=mid_image, num_divisions=20, margin=50, font_size=30, dot_spacing=20, ) # Analyze the image to get optimal crop divisions # This uses GPT-4V to identify the optimal crop points (left_division, right_division, num_of_speakers) = analyze_image( numbered_mid_image, remove_unwanted_prompt(1), mid_image, ct ) # Safety check for divisions if left_division <= 0: left_division = 1 if right_division > 20: right_division = 20 if left_division >= right_division: left_division = 1 right_division = 20 print(f"Using divisions: left={left_division}, right={right_division}") # Create layouts and cutouts using the new function ( standard_crops, threehalfs_layouts, twothirdhalfs_layouts, twoequalhalfs_layouts, visualization_data, ) = create_layouts(mid_image, left_division, right_division, num_of_speakers) # Create all the required visualizations # 1. Standard aspect ratio visualization (16:9 and 9:16) standard_visualization = draw_layout_regions( mid_image, left_division, right_division, visualization_data, "standard" ) all_images.append(standard_visualization) all_captions.append( f"Standard Aspect Ratios (16:9 & 9:16) {standard_visualization.size}" ) # Add input and middle image to gallery all_images.append(input_image) all_captions.append(f"Input Image {input_image.size}") all_images.append(mid_image) all_captions.append(f"Middle Thumbnail {mid_image.size}") # Add standard crops for key, crop in standard_crops.items(): all_images.append(crop) all_captions.append(f"{key} {crop.size}") # Add threehalfs layouts for key, layout in threehalfs_layouts.items(): all_images.append(layout) all_captions.append(f"Three Halfs {key} {layout.size}") # Add twothirdhalfs layouts for key, layout in twothirdhalfs_layouts.items(): all_images.append(layout) all_captions.append(f"Two-Thirds Halfs {key} {layout.size}") # Add twoequalhalfs layouts for key, layout in twoequalhalfs_layouts.items(): all_images.append(layout) all_captions.append(f"Two Equal Halfs {key} {layout.size}") # Return gallery with all images return gr.Gallery(value=list(zip(all_images, all_captions)))