AhmadMustafa commited on
Commit
09fada4
·
1 Parent(s): 0633d98

add: street interview crops

Browse files
Files changed (3) hide show
  1. app.py +1 -1
  2. crop_utils.py +66 -131
  3. prompts.py +5 -3
app.py CHANGED
@@ -245,7 +245,7 @@ def chat(
245
  tool_call = response.choices[0].message.tool_calls[0]
246
  if tool_call.function.name == "get_image":
247
  # Return the image directly in the chat
248
- image_data = get_image_crop(cid, rsid, uid)
249
  print(response.choices[0].message)
250
  messages.append(response.choices[0].message)
251
  function_call_result_message = {
 
245
  tool_call = response.choices[0].message.tool_calls[0]
246
  if tool_call.function.name == "get_image":
247
  # Return the image directly in the chat
248
+ image_data = get_image_crop(cid, rsid, uid, ct)
249
  print(response.choices[0].message)
250
  messages.append(response.choices[0].message)
251
  function_call_result_message = {
crop_utils.py CHANGED
@@ -151,97 +151,7 @@ def add_top_numbers(
151
  return new_image
152
 
153
 
154
- def crop_and_draw_divisions(
155
- input_image,
156
- left_division,
157
- right_division,
158
- num_divisions=20,
159
- line_color=(255, 0, 0),
160
- line_width=2,
161
- head_margin_percent=0.1,
162
- ):
163
- """
164
- Create both 9:16 and 16:9 crops and draw guide lines.
165
-
166
- Args:
167
- input_image (Image): PIL Image
168
- left_division (int): Left-side division number (1-20)
169
- right_division (int): Right-side division number (1-20)
170
- num_divisions (int): Total number of divisions (default=20)
171
- line_color (tuple): RGB color tuple for lines (default: red)
172
- line_width (int): Width of lines in pixels (default: 2)
173
- head_margin_percent (float): Percentage margin above head (default: 0.1)
174
-
175
- Returns:
176
- tuple: (cropped_image_16_9, image_with_lines, cropped_image_9_16)
177
- """
178
- yolo_model = model
179
- # Calculate division width and boundaries
180
- division_width = input_image.width / num_divisions
181
- left_boundary = (left_division - 1) * division_width
182
- right_boundary = right_division * division_width
183
-
184
- # First get the 9:16 crop
185
- cropped_image_9_16 = input_image.crop(
186
- (left_boundary, 0, right_boundary, input_image.height)
187
- )
188
-
189
- # Run YOLO on the 9:16 crop to get person bbox
190
- bbox = (
191
- yolo_model(cropped_image_9_16, classes=[0], conf=0.6)[0]
192
- .boxes.xyxy.cpu()
193
- .numpy()[0]
194
- )
195
- x1, y1, x2, y2 = bbox
196
-
197
- # Calculate top boundary with head margin
198
- head_margin = (y2 - y1) * head_margin_percent
199
- top_boundary = max(0, y1 - head_margin)
200
-
201
- # Calculate 16:9 dimensions based on the width between divisions
202
- crop_width = right_boundary - left_boundary
203
- crop_height_16_9 = int(crop_width * 9 / 16)
204
-
205
- # Calculate bottom boundary for 16:9
206
- bottom_boundary = min(input_image.height, top_boundary + crop_height_16_9)
207
-
208
- # Create 16:9 crop from original image
209
- cropped_image_16_9 = input_image.crop(
210
- (left_boundary, top_boundary, right_boundary, bottom_boundary)
211
- )
212
-
213
- # Draw guide lines for both crops on original image
214
- image_with_lines = input_image.copy()
215
- draw = ImageDraw.Draw(image_with_lines)
216
-
217
- # Draw vertical lines (for both crops)
218
- draw.line(
219
- [(left_boundary, 0), (left_boundary, input_image.height)],
220
- fill=line_color,
221
- width=line_width,
222
- )
223
- draw.line(
224
- [(right_boundary, 0), (right_boundary, input_image.height)],
225
- fill=line_color,
226
- width=line_width,
227
- )
228
-
229
- # Draw horizontal lines (for 16:9 crop)
230
- draw.line(
231
- [(left_boundary, top_boundary), (right_boundary, top_boundary)],
232
- fill=line_color,
233
- width=line_width,
234
- )
235
- draw.line(
236
- [(left_boundary, bottom_boundary), (right_boundary, bottom_boundary)],
237
- fill=line_color,
238
- width=line_width,
239
- )
240
-
241
- return cropped_image_16_9, image_with_lines, cropped_image_9_16
242
-
243
-
244
- def analyze_image(numbered_input_image: Image, prompt, input_image):
245
  """
246
  Perform inference on an image using GPT-4V.
247
 
@@ -278,7 +188,7 @@ def analyze_image(numbered_input_image: Image, prompt, input_image):
278
  {"role": "assistant", "content": response.choices[0].message.content},
279
  {
280
  "role": "user",
281
- "content": "please return the response in the json with keys left_row and right_row",
282
  },
283
  ],
284
  )
@@ -294,24 +204,16 @@ def analyze_image(numbered_input_image: Image, prompt, input_image):
294
 
295
  try:
296
  if left_index != -1 and right_index != -1:
 
297
  response_json = eval(response[left_index : right_index + 1])
298
- cropped_image_16_9, image_with_lines, cropped_image_9_16 = (
299
- crop_and_draw_divisions(
300
- input_image=input_image,
301
- left_division=response_json["left_row"],
302
- right_division=response_json["right_row"],
303
- )
304
- )
305
  except Exception as e:
306
  print(e)
307
- return input_image, input_image, input_image, 0, 20
308
 
309
  return (
310
- cropped_image_16_9,
311
- image_with_lines,
312
- cropped_image_9_16,
313
  response_json["left_row"],
314
  response_json["right_row"],
 
315
  )
316
 
317
 
@@ -339,16 +241,17 @@ def get_sprite_firebase(cid, rsid, uid):
339
  return data.val()
340
 
341
 
342
- def find_persons_center(image):
343
  """
344
- Find the center point of all persons in the image.
345
- If multiple persons are detected, merge all bounding boxes and find the center.
346
 
347
  Args:
348
  image: CV2/numpy array image
 
349
 
350
  Returns:
351
- int: x-coordinate of the center point of all persons
352
  """
353
  # Detect persons (class 0 in COCO dataset)
354
  results = model(image, classes=[0], conf=0.6)
@@ -370,18 +273,35 @@ def find_persons_center(image):
370
  print(f"Single person detected at center x: {center_x}")
371
  return center_x
372
  else:
373
- # Multiple persons - create a merged bounding box
374
- left_x = min(box[0] for box in boxes)
375
- right_x = max(box[2] for box in boxes)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
376
  merged_center_x = int((left_x + right_x) // 2)
377
 
378
- print(f"Multiple persons merged bounding box center x: {merged_center_x}")
 
 
379
  print(f"Merged bounds: left={left_x}, right={right_x}")
380
 
381
  return merged_center_x
382
 
383
 
384
- def create_layouts(image, left_division, right_division):
385
  """
386
  Create different layout variations of the image using specific aspect ratios.
387
  All layout variations will be centered on detected persons.
@@ -436,11 +356,26 @@ def create_layouts(image, left_division, right_division):
436
  person_top = y1
437
  person_height = y2 - y1
438
  else:
439
- # Multiple persons - merge bounding boxes
440
- left_x = min(box[0] for box in boxes)
441
- right_x = max(box[2] for box in boxes)
442
- top_y = min(box[1] for box in boxes) # Top of highest person
443
- bottom_y = max(box[3] for box in boxes) # Bottom of lowest person
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
444
 
445
  cutout_center_x = int((left_x + right_x) // 2)
446
  cutout_center_y = int((top_y + bottom_y) // 2)
@@ -451,13 +386,13 @@ def create_layouts(image, left_division, right_division):
451
  aspect_16_9 = 16 / 9
452
  aspect_9_16 = 9 / 16
453
 
454
- # For 16:9 version (with 20% margin above person)
455
  target_height_16_9 = int(cutout_width / aspect_16_9)
456
  if target_height_16_9 <= cutout_height:
457
- # Calculate 20% of person height for top margin
458
  top_margin = int(person_height * 0.05)
459
 
460
- # Start 20% above the person's top
461
  y_start = int(max(0, person_top - top_margin))
462
 
463
  # If this would make the crop exceed the bottom, adjust y_start
@@ -1578,7 +1513,7 @@ def draw_layout_regions(
1578
  return visualization_pil
1579
 
1580
 
1581
- def get_image_crop(cid=None, rsid=None, uid=None):
1582
  """
1583
  Function that returns both standard and layout variations for visualization.
1584
 
@@ -1591,8 +1526,12 @@ def get_image_crop(cid=None, rsid=None, uid=None):
1591
  durations = [sprite_data["duration"] for sprite_data in sprites_data]
1592
  except Exception:
1593
  image_paths = [
1594
- "data/C2-Roll3D-i2x-Take2-Nov19.24-PST02.31.31pm.jpg",
1595
- "data/E2-HamzaA-i2x-Take2-Nov19.24-PST02.31.31pm.jpg",
 
 
 
 
1596
  ]
1597
 
1598
  # Lists to store all images
@@ -1625,13 +1564,9 @@ def get_image_crop(cid=None, rsid=None, uid=None):
1625
 
1626
  # Analyze the image to get optimal crop divisions
1627
  # This uses GPT-4V to identify the optimal crop points
1628
- (
1629
- _,
1630
- _,
1631
- _,
1632
- left_division,
1633
- right_division,
1634
- ) = analyze_image(numbered_mid_image, remove_unwanted_prompt(2), mid_image)
1635
 
1636
  # Safety check for divisions
1637
  if left_division <= 0:
@@ -1651,7 +1586,7 @@ def get_image_crop(cid=None, rsid=None, uid=None):
1651
  twothirdhalfs_layouts,
1652
  twoequalhalfs_layouts,
1653
  visualization_data,
1654
- ) = create_layouts(mid_image, left_division, right_division)
1655
 
1656
  # Create all the required visualizations
1657
  # 1. Standard aspect ratio visualization (16:9 and 9:16)
 
151
  return new_image
152
 
153
 
154
+ def analyze_image(numbered_input_image: Image, prompt, input_image, ct):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
155
  """
156
  Perform inference on an image using GPT-4V.
157
 
 
188
  {"role": "assistant", "content": response.choices[0].message.content},
189
  {
190
  "role": "user",
191
+ "content": "please return the response in the json with keys left_row, right_row, and num_of_speakers",
192
  },
193
  ],
194
  )
 
204
 
205
  try:
206
  if left_index != -1 and right_index != -1:
207
+ print(response[left_index : right_index + 1])
208
  response_json = eval(response[left_index : right_index + 1])
 
 
 
 
 
 
 
209
  except Exception as e:
210
  print(e)
211
+ return 0, 20
212
 
213
  return (
 
 
 
214
  response_json["left_row"],
215
  response_json["right_row"],
216
+ response_json["num_of_speakers"],
217
  )
218
 
219
 
 
241
  return data.val()
242
 
243
 
244
+ def find_persons_center(image, num_of_speakers=1):
245
  """
246
+ Find the center point of the largest num_of_speakers persons in the image.
247
+ If multiple persons are detected, merge the bounding boxes of only the largest ones.
248
 
249
  Args:
250
  image: CV2/numpy array image
251
+ num_of_speakers: Number of speakers to consider (default: 1)
252
 
253
  Returns:
254
+ int: x-coordinate of the center point of all considered persons
255
  """
256
  # Detect persons (class 0 in COCO dataset)
257
  results = model(image, classes=[0], conf=0.6)
 
273
  print(f"Single person detected at center x: {center_x}")
274
  return center_x
275
  else:
276
+ # Multiple persons - consider only the largest num_of_speakers boxes
277
+
278
+ # Calculate area for each box
279
+ box_areas = [(box[2] - box[0]) * (box[3] - box[1]) for box in boxes]
280
+
281
+ # Sort boxes by area (largest first) and take top num_of_speakers
282
+ sorted_indices = sorted(
283
+ range(len(box_areas)), key=lambda i: box_areas[i], reverse=True
284
+ )
285
+
286
+ # Use all available boxes if fewer detected than requested
287
+ num_boxes_to_use = min(num_of_speakers, len(boxes))
288
+ selected_indices = sorted_indices[:num_boxes_to_use]
289
+ selected_boxes = [boxes[i] for i in selected_indices]
290
+
291
+ # Create a merged bounding box from selected boxes
292
+ left_x = min(box[0] for box in selected_boxes)
293
+ right_x = max(box[2] for box in selected_boxes)
294
  merged_center_x = int((left_x + right_x) // 2)
295
 
296
+ print(
297
+ f"{num_boxes_to_use} largest persons merged bounding box center x: {merged_center_x}"
298
+ )
299
  print(f"Merged bounds: left={left_x}, right={right_x}")
300
 
301
  return merged_center_x
302
 
303
 
304
+ def create_layouts(image, left_division, right_division, num_of_speakers):
305
  """
306
  Create different layout variations of the image using specific aspect ratios.
307
  All layout variations will be centered on detected persons.
 
356
  person_top = y1
357
  person_height = y2 - y1
358
  else:
359
+ # Multiple persons - consider only the largest num_of_speakers boxes
360
+
361
+ # Calculate area for each box
362
+ box_areas = [(box[2] - box[0]) * (box[3] - box[1]) for box in boxes]
363
+
364
+ # Sort boxes by area (largest first) and take top num_of_speakers
365
+ sorted_indices = sorted(
366
+ range(len(box_areas)), key=lambda i: box_areas[i], reverse=True
367
+ )
368
+
369
+ # Use all available boxes if fewer detected than requested
370
+ num_boxes_to_use = min(num_of_speakers, len(boxes))
371
+ selected_indices = sorted_indices[:num_boxes_to_use]
372
+ selected_boxes = [boxes[i] for i in selected_indices]
373
+
374
+ # Merge bounding boxes of selected boxes
375
+ left_x = min(box[0] for box in selected_boxes)
376
+ right_x = max(box[2] for box in selected_boxes)
377
+ top_y = min(box[1] for box in selected_boxes) # Top of highest person
378
+ bottom_y = max(box[3] for box in selected_boxes) # Bottom of lowest person
379
 
380
  cutout_center_x = int((left_x + right_x) // 2)
381
  cutout_center_y = int((top_y + bottom_y) // 2)
 
386
  aspect_16_9 = 16 / 9
387
  aspect_9_16 = 9 / 16
388
 
389
+ # For 16:9 version (with 5% margin above person)
390
  target_height_16_9 = int(cutout_width / aspect_16_9)
391
  if target_height_16_9 <= cutout_height:
392
+ # Calculate 5% of person height for top margin
393
  top_margin = int(person_height * 0.05)
394
 
395
+ # Start 5% above the person's top
396
  y_start = int(max(0, person_top - top_margin))
397
 
398
  # If this would make the crop exceed the bottom, adjust y_start
 
1513
  return visualization_pil
1514
 
1515
 
1516
+ def get_image_crop(cid=None, rsid=None, uid=None, ct=None):
1517
  """
1518
  Function that returns both standard and layout variations for visualization.
1519
 
 
1526
  durations = [sprite_data["duration"] for sprite_data in sprites_data]
1527
  except Exception:
1528
  image_paths = [
1529
+ # "data/C2-Roll3D-i2x-Take2-Nov19.24-PST02.31.31pm.jpg",
1530
+ # "data/E2-HamzaA-i2x-Take2-Nov19.24-PST02.31.31pm.jpg",
1531
+ "data/F2-Roll4D-i2x-Take2-Nov19.24-PST02.31.31pm.jpg",
1532
+ "data/G2-Roll5D-i2x-Take2-Nov19.24-PST02.31.31pm.jpg",
1533
+ "data/C1-Roll10D-i1x-Take2-Mar20.25-PST12.14.56pm.jpg",
1534
+ "data/C2-Roll10D-i2x-Take2-Mar20.25-PST12.14.56pm.jpg",
1535
  ]
1536
 
1537
  # Lists to store all images
 
1564
 
1565
  # Analyze the image to get optimal crop divisions
1566
  # This uses GPT-4V to identify the optimal crop points
1567
+ (left_division, right_division, num_of_speakers) = analyze_image(
1568
+ numbered_mid_image, remove_unwanted_prompt(1), mid_image, ct
1569
+ )
 
 
 
 
1570
 
1571
  # Safety check for divisions
1572
  if left_division <= 0:
 
1586
  twothirdhalfs_layouts,
1587
  twoequalhalfs_layouts,
1588
  visualization_data,
1589
+ ) = create_layouts(mid_image, left_division, right_division, num_of_speakers)
1590
 
1591
  # Create all the required visualizations
1592
  # 1. Standard aspect ratio visualization (16:9 and 9:16)
prompts.py CHANGED
@@ -152,12 +152,14 @@ If the user provides the correct call type, use the correct_call_type function t
152
 
153
 
154
  def remove_unwanted_prompt(number_of_speakers: int):
155
- if number_of_speakers == 2:
156
- return """I want to crop this image only when absolutely necessary to remove partial objects or humans.
157
 
158
  Please analyze the image and tell me:
159
  1. The column number (1-20) on the left side where I should start the crop. Only suggest cropping (columns 1-4) if there are clear partial objects or humans that need removal. If no cropping is needed on the left, return 1.
160
 
161
  2. The column number (1-20) on the right side where I should end the crop. Only suggest cropping (columns 17-20) if there are clear partial objects or humans that need removal. If no cropping is needed on the right, return 20.
162
 
163
- I'm looking for minimal cropping - only cut when absolutely necessary to remove distracting partial elements."""
 
 
 
152
 
153
 
154
  def remove_unwanted_prompt(number_of_speakers: int):
155
+ if number_of_speakers == 1:
156
+ return """I want to crop this image only when absolutely necessary to remove partial objects or partial humans.
157
 
158
  Please analyze the image and tell me:
159
  1. The column number (1-20) on the left side where I should start the crop. Only suggest cropping (columns 1-4) if there are clear partial objects or humans that need removal. If no cropping is needed on the left, return 1.
160
 
161
  2. The column number (1-20) on the right side where I should end the crop. Only suggest cropping (columns 17-20) if there are clear partial objects or humans that need removal. If no cropping is needed on the right, return 20.
162
 
163
+ I'm looking for minimal cropping - only cut when absolutely necessary to remove distracting partial elements.
164
+ Also tell me the number of speakers that are completely visible and should be included in the crop. Generally it is either 1 or 2, but it can be more.
165
+ """