Commit · 09fada4
Parent(s): 0633d98
add: street interview crops

Files changed:
- app.py: +1 -1
- crop_utils.py: +66 -131
- prompts.py: +5 -3
app.py CHANGED

@@ -245,7 +245,7 @@ def chat(
         tool_call = response.choices[0].message.tool_calls[0]
         if tool_call.function.name == "get_image":
             # Return the image directly in the chat
-            image_data = get_image_crop(cid, rsid, uid)
+            image_data = get_image_crop(cid, rsid, uid, ct)
             print(response.choices[0].message)
             messages.append(response.choices[0].message)
             function_call_result_message = {
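A note for readers following this branch: the hunk only changes the `get_image_crop` call, and the context cuts off at `function_call_result_message = {`. As a minimal sketch, assuming the standard OpenAI chat-completions tool-calling format (the exact payload in app.py is not shown in this diff), the result message would be built roughly like this:

```python
# Hypothetical sketch of the tool-result message the truncated line likely
# builds; the id and payload strings are placeholder values, not repo code.
def build_tool_result(tool_call_id: str, payload: str) -> dict:
    return {
        "role": "tool",                # tool results are sent back with the "tool" role
        "tool_call_id": tool_call_id,  # must match the id of the model's tool call
        "content": payload,
    }

messages = [build_tool_result("call_abc123", "image crop returned")]
print(messages)
```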
crop_utils.py CHANGED

@@ -151,97 +151,7 @@ def add_top_numbers(
     return new_image
 
 
-def crop_and_draw_divisions(
-    input_image,
-    left_division,
-    right_division,
-    num_divisions=20,
-    line_color=(255, 0, 0),
-    line_width=2,
-    head_margin_percent=0.1,
-):
-    """
-    Create both 9:16 and 16:9 crops and draw guide lines.
-
-    Args:
-        input_image (Image): PIL Image
-        left_division (int): Left-side division number (1-20)
-        right_division (int): Right-side division number (1-20)
-        num_divisions (int): Total number of divisions (default=20)
-        line_color (tuple): RGB color tuple for lines (default: red)
-        line_width (int): Width of lines in pixels (default: 2)
-        head_margin_percent (float): Percentage margin above head (default: 0.1)
-
-    Returns:
-        tuple: (cropped_image_16_9, image_with_lines, cropped_image_9_16)
-    """
-    yolo_model = model
-    # Calculate division width and boundaries
-    division_width = input_image.width / num_divisions
-    left_boundary = (left_division - 1) * division_width
-    right_boundary = right_division * division_width
-
-    # First get the 9:16 crop
-    cropped_image_9_16 = input_image.crop(
-        (left_boundary, 0, right_boundary, input_image.height)
-    )
-
-    # Run YOLO on the 9:16 crop to get person bbox
-    bbox = (
-        yolo_model(cropped_image_9_16, classes=[0], conf=0.6)[0]
-        .boxes.xyxy.cpu()
-        .numpy()[0]
-    )
-    x1, y1, x2, y2 = bbox
-
-    # Calculate top boundary with head margin
-    head_margin = (y2 - y1) * head_margin_percent
-    top_boundary = max(0, y1 - head_margin)
-
-    # Calculate 16:9 dimensions based on the width between divisions
-    crop_width = right_boundary - left_boundary
-    crop_height_16_9 = int(crop_width * 9 / 16)
-
-    # Calculate bottom boundary for 16:9
-    bottom_boundary = min(input_image.height, top_boundary + crop_height_16_9)
-
-    # Create 16:9 crop from original image
-    cropped_image_16_9 = input_image.crop(
-        (left_boundary, top_boundary, right_boundary, bottom_boundary)
-    )
-
-    # Draw guide lines for both crops on original image
-    image_with_lines = input_image.copy()
-    draw = ImageDraw.Draw(image_with_lines)
-
-    # Draw vertical lines (for both crops)
-    draw.line(
-        [(left_boundary, 0), (left_boundary, input_image.height)],
-        fill=line_color,
-        width=line_width,
-    )
-    draw.line(
-        [(right_boundary, 0), (right_boundary, input_image.height)],
-        fill=line_color,
-        width=line_width,
-    )
-
-    # Draw horizontal lines (for 16:9 crop)
-    draw.line(
-        [(left_boundary, top_boundary), (right_boundary, top_boundary)],
-        fill=line_color,
-        width=line_width,
-    )
-    draw.line(
-        [(left_boundary, bottom_boundary), (right_boundary, bottom_boundary)],
-        fill=line_color,
-        width=line_width,
-    )
-
-    return cropped_image_16_9, image_with_lines, cropped_image_9_16
-
-
-def analyze_image(numbered_input_image: Image, prompt, input_image):
+def analyze_image(numbered_input_image: Image, prompt, input_image, ct):
     """
     Perform inference on an image using GPT-4V.
 
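The removed `crop_and_draw_divisions` (and the prompt in prompts.py) both work on a 20-column grid over the frame. A small worked example of the division-to-pixel mapping, with an assumed 1920 px frame width:

```python
# Illustrative numbers only: map division indices (1-20) to pixel boundaries.
width = 1920
num_divisions = 20
division_width = width / num_divisions  # 96.0 px per column

left_division, right_division = 3, 18  # e.g. values returned by analyze_image
left_boundary = (left_division - 1) * division_width  # 192.0: left edge of column 3
right_boundary = right_division * division_width      # 1728.0: right edge of column 18
print(left_boundary, right_boundary)
```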
@@ -278,7 +188,7 @@ def analyze_image(numbered_input_image: Image, prompt, input_image):
         {"role": "assistant", "content": response.choices[0].message.content},
         {
             "role": "user",
-            "content": "please return the response in the json with keys left_row and right_row",
+            "content": "please return the response in the json with keys left_row, right_row, and num_of_speakers",
         },
     ],
 )
@@ -294,24 +204,16 @@ def analyze_image(numbered_input_image: Image, prompt, input_image):
 
     try:
         if left_index != -1 and right_index != -1:
+            print(response[left_index : right_index + 1])
             response_json = eval(response[left_index : right_index + 1])
-            cropped_image_16_9, image_with_lines, cropped_image_9_16 = (
-                crop_and_draw_divisions(
-                    input_image=input_image,
-                    left_division=response_json["left_row"],
-                    right_division=response_json["right_row"],
-                )
-            )
     except Exception as e:
         print(e)
-        return
+        return 0, 20
 
     return (
-        cropped_image_16_9,
-        image_with_lines,
-        cropped_image_9_16,
         response_json["left_row"],
         response_json["right_row"],
+        response_json["num_of_speakers"],
     )
 
 
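The parsing above keeps `eval` on the braced span of the model reply and now falls back to `return 0, 20` on failure. As a hedged alternative sketch (not the repo's code), the same extraction can be done with `json.loads`, with assumed three-value defaults so the fallback arity matches the success path:

```python
import json

# Sketch only: stricter JSON extraction; the (1, 20, 1) defaults are assumptions.
def parse_divisions(response: str):
    left = response.find("{")
    right = response.rfind("}")
    try:
        if left != -1 and right != -1:
            data = json.loads(response[left : right + 1])
            return data["left_row"], data["right_row"], data["num_of_speakers"]
    except (json.JSONDecodeError, KeyError) as e:
        print(e)
    return 1, 20, 1  # full-width crop, single speaker

print(parse_divisions('{"left_row": 3, "right_row": 18, "num_of_speakers": 2}'))
```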
@@ -339,16 +241,17 @@ def get_sprite_firebase(cid, rsid, uid):
     return data.val()
 
 
-def find_persons_center(image):
+def find_persons_center(image, num_of_speakers=1):
     """
-    Find the center point of all persons in the image.
-    If multiple persons are detected, merge the bounding boxes.
+    Find the center point of the largest num_of_speakers persons in the image.
+    If multiple persons are detected, merge the bounding boxes of only the largest ones.
 
     Args:
         image: CV2/numpy array image
+        num_of_speakers: Number of speakers to consider (default: 1)
 
     Returns:
-        int: x-coordinate of the center point of all persons
+        int: x-coordinate of the center point of all considered persons
     """
     # Detect persons (class 0 in COCO dataset)
     results = model(image, classes=[0], conf=0.6)
@@ -370,18 +273,35 @@ def find_persons_center(image):
         print(f"Single person detected at center x: {center_x}")
         return center_x
     else:
-        # Multiple persons - merge all bounding boxes
-        left_x = min(box[0] for box in boxes)
-        right_x = max(box[2] for box in boxes)
+        # Multiple persons - consider only the largest num_of_speakers boxes
+
+        # Calculate area for each box
+        box_areas = [(box[2] - box[0]) * (box[3] - box[1]) for box in boxes]
+
+        # Sort boxes by area (largest first) and take top num_of_speakers
+        sorted_indices = sorted(
+            range(len(box_areas)), key=lambda i: box_areas[i], reverse=True
+        )
+
+        # Use all available boxes if fewer detected than requested
+        num_boxes_to_use = min(num_of_speakers, len(boxes))
+        selected_indices = sorted_indices[:num_boxes_to_use]
+        selected_boxes = [boxes[i] for i in selected_indices]
+
+        # Create a merged bounding box from selected boxes
+        left_x = min(box[0] for box in selected_boxes)
+        right_x = max(box[2] for box in selected_boxes)
         merged_center_x = int((left_x + right_x) // 2)
 
-        print(f"Merged bounding box center x: {merged_center_x}")
+        print(
+            f"{num_boxes_to_use} largest persons merged bounding box center x: {merged_center_x}"
+        )
         print(f"Merged bounds: left={left_x}, right={right_x}")
 
         return merged_center_x
 
 
-def create_layouts(image, left_division, right_division):
+def create_layouts(image, left_division, right_division, num_of_speakers):
     """
     Create different layout variations of the image using specific aspect ratios.
     All layout variations will be centered on detected persons.
|
|
| 436 |
person_top = y1
|
| 437 |
person_height = y2 - y1
|
| 438 |
else:
|
| 439 |
-
# Multiple persons -
|
| 440 |
-
|
| 441 |
-
|
| 442 |
-
|
| 443 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 444 |
|
| 445 |
cutout_center_x = int((left_x + right_x) // 2)
|
| 446 |
cutout_center_y = int((top_y + bottom_y) // 2)
|
|
@@ -451,13 +386,13 @@
     aspect_16_9 = 16 / 9
     aspect_9_16 = 9 / 16
 
-    # For 16:9 version (with margin above person)
+    # For 16:9 version (with 5% margin above person)
     target_height_16_9 = int(cutout_width / aspect_16_9)
     if target_height_16_9 <= cutout_height:
-        # Calculate top margin from person height
+        # Calculate 5% of person height for top margin
         top_margin = int(person_height * 0.05)
 
-        # Start above the person's top
+        # Start 5% above the person's top
         y_start = int(max(0, person_top - top_margin))
 
         # If this would make the crop exceed the bottom, adjust y_start
@@ -1578,7 +1513,7 @@ def draw_layout_regions(
     return visualization_pil
 
 
-def get_image_crop(cid=None, rsid=None, uid=None):
+def get_image_crop(cid=None, rsid=None, uid=None, ct=None):
     """
     Function that returns both standard and layout variations for visualization.
 

@@ -1591,8 +1526,12 @@ def get_image_crop(cid=None, rsid=None, uid=None):
         durations = [sprite_data["duration"] for sprite_data in sprites_data]
     except Exception:
         image_paths = [
-            "data/C2-Roll3D-i2x-Take2-Nov19.24-PST02.31.31pm.jpg",
-            "data/E2-HamzaA-i2x-Take2-Nov19.24-PST02.31.31pm.jpg",
+            # "data/C2-Roll3D-i2x-Take2-Nov19.24-PST02.31.31pm.jpg",
+            # "data/E2-HamzaA-i2x-Take2-Nov19.24-PST02.31.31pm.jpg",
+            "data/F2-Roll4D-i2x-Take2-Nov19.24-PST02.31.31pm.jpg",
+            "data/G2-Roll5D-i2x-Take2-Nov19.24-PST02.31.31pm.jpg",
+            "data/C1-Roll10D-i1x-Take2-Mar20.25-PST12.14.56pm.jpg",
+            "data/C2-Roll10D-i2x-Take2-Mar20.25-PST12.14.56pm.jpg",
         ]
 
     # Lists to store all images
@@ -1625,13 +1564,9 @@ def get_image_crop(cid=None, rsid=None, uid=None):
 
         # Analyze the image to get optimal crop divisions
         # This uses GPT-4V to identify the optimal crop points
-        (
-            _,
-            _,
-            _,
-            left_division,
-            right_division,
-        ) = analyze_image(numbered_mid_image, remove_unwanted_prompt(2), mid_image)
+        (left_division, right_division, num_of_speakers) = analyze_image(
+            numbered_mid_image, remove_unwanted_prompt(1), mid_image, ct
+        )
 
         # Safety check for divisions
         if left_division <= 0:

@@ -1651,7 +1586,7 @@ def get_image_crop(cid=None, rsid=None, uid=None):
         twothirdhalfs_layouts,
         twoequalhalfs_layouts,
         visualization_data,
-    ) = create_layouts(mid_image, left_division, right_division)
+    ) = create_layouts(mid_image, left_division, right_division, num_of_speakers)
 
     # Create all the required visualizations
     # 1. Standard aspect ratio visualization (16:9 and 9:16)
prompts.py CHANGED

@@ -152,12 +152,14 @@ If the user provides the correct call type, use the correct_call_type function t
 
 
 def remove_unwanted_prompt(number_of_speakers: int):
-    if number_of_speakers == 2:
-        return """I want to crop this image only when absolutely necessary to remove partial objects or humans.
+    if number_of_speakers == 1:
+        return """I want to crop this image only when absolutely necessary to remove partial objects or partial humans.
 
 Please analyze the image and tell me:
 1. The column number (1-20) on the left side where I should start the crop. Only suggest cropping (columns 1-4) if there are clear partial objects or humans that need removal. If no cropping is needed on the left, return 1.
 
 2. The column number (1-20) on the right side where I should end the crop. Only suggest cropping (columns 17-20) if there are clear partial objects or humans that need removal. If no cropping is needed on the right, return 20.
 
-I'm looking for minimal cropping - only cut when absolutely necessary to remove distracting partial elements."""
+I'm looking for minimal cropping - only cut when absolutely necessary to remove distracting partial elements.
+Also tell the number of speakers that are completely visible and should be part of the crop. Generally it is either 1 or 2 but can be more.
+"""