# Multi_LLM_Image_Captioning / caption_overlay.py
# HarshitX's picture
# Upload 9 files
# 8a8f3ed verified
import os
import cv2
import numpy as np
from PIL import Image, ImageDraw, ImageFont
class ImageCaptionOverlay:
    """Add text captions to OpenCV images.

    ``add_caption_overlay`` draws the caption directly on top of the picture
    with OpenCV. ``add_caption_background`` renders the caption with Pillow on
    a coloured band placed above the picture.
    """

    @staticmethod
    def _wrap_text(caption, max_width, measure_width):
        """Split *caption* into display lines no wider than *max_width*.

        Args:
            caption: Text to wrap. Explicit line breaks are kept as forced
                breaks; blank lines are dropped.
            max_width: Maximum allowed line width, in the units returned by
                *measure_width*.
            measure_width: Callable taking a string and returning its width.

        Returns:
            A list of lines. A line that already fits is returned unchanged;
            a longer one is re-flowed word by word (greedy). A single word
            wider than *max_width* is kept on its own line rather than split.
            The list is empty when the caption holds no visible text.
        """
        lines = []
        for paragraph in caption.splitlines():
            if not paragraph.strip():
                continue
            if measure_width(paragraph) <= max_width:
                lines.append(paragraph)
                continue
            current = ""
            for word in paragraph.split():
                candidate = f"{current} {word}" if current else word
                if current and measure_width(candidate) > max_width:
                    lines.append(current)
                    current = word
                else:
                    # Either the candidate fits, or the line is still empty
                    # and the word has to go somewhere even if it overflows.
                    current = candidate
            if current:
                lines.append(current)
        return lines

    @staticmethod
    def add_caption_overlay(image: np.ndarray, caption: str, position: str = "bottom",
                            font_size: float = 1, thickness: int = 2) -> np.ndarray:
        """Draw *caption* on a copy of *image* as white text on black boxes.

        Args:
            image: Source image; it is not modified.
            caption: Text to draw. It is wrapped to the image width minus a
                20 px margin on each side. A blank caption draws nothing.
            position: ``"bottom"`` or ``"top"``; any other value centres the
                caption vertically.
            font_size: OpenCV font scale for ``FONT_HERSHEY_SIMPLEX``.
            thickness: Stroke thickness passed to OpenCV.

        Returns:
            A new array with the caption drawn on it.
        """
        img_copy = image.copy()
        # A blank caption would otherwise leave an empty black box on the image.
        if not caption or not caption.strip():
            return img_copy

        height, width = img_copy.shape[:2]
        font = cv2.FONT_HERSHEY_SIMPLEX

        def text_width(text):
            return cv2.getTextSize(text, font, font_size, thickness)[0][0]

        lines = ImageCaptionOverlay._wrap_text(caption, width - 40, text_width)

        # Line pitch: glyph height of the font plus 10 px of spacing.
        line_height = cv2.getTextSize("A", font, font_size, thickness)[0][1] + 10
        total_height = len(lines) * line_height

        if position == "bottom":
            start_y = height - total_height - 20
        elif position == "top":
            start_y = 30
        else:  # center
            start_y = (height - total_height) // 2

        # Work out every line's geometry before drawing anything.
        placements = []
        for index, line in enumerate(lines):
            (line_width, text_height), baseline = cv2.getTextSize(
                line, font, font_size, thickness)
            text_x = (width - line_width) // 2
            text_y = start_y + index * line_height + text_height
            placements.append((line, text_x, text_y, line_width, text_height, baseline))

        # Pass 1: all background boxes. Drawing them before any text keeps a
        # later box from painting over the descenders of the line above it.
        # The box reaches down to the font baseline so descenders stay on black.
        for _, text_x, text_y, line_width, text_height, baseline in placements:
            cv2.rectangle(img_copy,
                          (text_x - 10, text_y - text_height - 5),
                          (text_x + line_width + 10, text_y + max(5, baseline)),
                          (0, 0, 0), -1)

        # Pass 2: the text itself.
        for line, text_x, text_y, *_ in placements:
            cv2.putText(img_copy, line, (text_x, text_y), font, font_size,
                        (255, 255, 255), thickness)

        return img_copy

    @staticmethod
    def add_caption_background(image: np.ndarray, caption: str,
                               font_path: str = None,
                               background_color: tuple = (0, 0, 0),
                               text_color: tuple = (255, 255, 255),
                               margin: int = 50,
                               font_size: int = 24) -> np.ndarray:
        """Return *image* with a caption band added above it.

        The result has the same width as *image*; the caption is centred in a
        band at the top and the original picture is pasted underneath.

        Args:
            image: Source image in OpenCV layout: BGR, BGRA or single-channel.
            caption: Text to render; wrapped to the image width minus
                ``2 * margin``.
            font_path: Optional TrueType font file. When missing, the file
                ``fonts/Poppins-Regular.ttf`` is tried, then Pillow's default
                font.
            background_color: Band colour. It is handed to Pillow before the
                final RGB-to-BGR conversion, so it is interpreted as RGB.
            text_color: Text colour, interpreted as RGB for the same reason.
            margin: Padding in pixels above and below the text, and the
                horizontal margin used for wrapping.
            font_size: Point size used for TrueType fonts. Pillow's default
                fallback font is loaded without a size.

        Returns:
            A new 3-channel BGR array.
        """
        height, width = image.shape[:2]

        # Pick the colour conversion that matches the channel count instead of
        # assuming 3-channel BGR, which raises for grayscale/BGRA input.
        if image.ndim == 2 or image.shape[2] == 1:
            to_rgb = cv2.COLOR_GRAY2RGB
        elif image.shape[2] == 4:
            to_rgb = cv2.COLOR_BGRA2RGB
        else:
            to_rgb = cv2.COLOR_BGR2RGB
        # Use PIL for better text rendering
        pil_image = Image.fromarray(cv2.cvtColor(image, to_rgb))

        # Font selection is deliberately best-effort: any failure falls back
        # to Pillow's built-in font so captioning never fails over a font.
        try:
            if font_path and os.path.exists(font_path):
                font = ImageFont.truetype(font_path, font_size)
            elif os.path.exists("fonts/Poppins-Regular.ttf"):
                font = ImageFont.truetype("fonts/Poppins-Regular.ttf", font_size)
            else:
                font = ImageFont.load_default()
        except Exception:
            font = ImageFont.load_default()

        draw = ImageDraw.Draw(pil_image)

        def text_width(text):
            left, _, right, _ = draw.textbbox((0, 0), text, font=font)
            return right - left

        lines = ImageCaptionOverlay._wrap_text(caption or "", width - (2 * margin), text_width)

        # One shared line height, measured over all the caption's glyphs on a
        # single line, so every row gets the same vertical pitch.
        if lines:
            _, top, _, bottom = draw.textbbox((0, 0), " ".join(lines), font=font)
            text_height = bottom - top
        else:
            text_height = 0
        line_gap = 10
        total_text_height = len(lines) * text_height + max(len(lines) - 1, 0) * line_gap

        # Create new image with space for text, then paste the picture below it.
        band_height = total_text_height + (2 * margin)
        new_image = Image.new('RGB', (width, height + band_height), background_color)
        new_image.paste(pil_image, (0, band_height))

        draw = ImageDraw.Draw(new_image)
        y_offset = margin
        for line in lines:
            x_position = (width - text_width(line)) // 2
            draw.text((x_position, y_offset), line, fill=text_color, font=font)
            y_offset += text_height + line_gap

        # Convert back to OpenCV format
        return cv2.cvtColor(np.array(new_image), cv2.COLOR_RGB2BGR)