Commit 09fada4
1 Parent(s): 0633d98
add: street interview crops
- app.py +1 -1
- crop_utils.py +66 -131
- prompts.py +5 -3
app.py
CHANGED
@@ -245,7 +245,7 @@ def chat(
     tool_call = response.choices[0].message.tool_calls[0]
     if tool_call.function.name == "get_image":
         # Return the image directly in the chat
-        image_data = get_image_crop(cid, rsid, uid)
+        image_data = get_image_crop(cid, rsid, uid, ct)
         print(response.choices[0].message)
         messages.append(response.choices[0].message)
         function_call_result_message = {
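Note on the hunk above: the only change is threading a new `ct` argument (presumably the call type, given the `correct_call_type` reference in prompts.py) into `get_image_crop`. Below is a minimal, hypothetical sketch of the surrounding tool-call flow; the `role: "tool"` result-message shape follows the standard OpenAI tool-calling convention and is not part of this commit.

    # Illustrative only: how the changed line sits inside a tool-call handler.
    # get_image_crop and the ct argument come from this commit; everything
    # else (function name, message shape) is an assumed generic pattern.
    def handle_get_image(response, messages, cid, rsid, uid, ct, get_image_crop):
        tool_call = response.choices[0].message.tool_calls[0]
        if tool_call.function.name == "get_image":
            # ct is the new fourth argument threaded through by this commit
            image_data = get_image_crop(cid, rsid, uid, ct)
            messages.append(response.choices[0].message)
            messages.append({
                "role": "tool",                # result message answering the call
                "content": image_data,
                "tool_call_id": tool_call.id,  # links the result to the request
            })
        return messages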
crop_utils.py
CHANGED
@@ -151,97 +151,7 @@ def add_top_numbers(
     return new_image


-def crop_and_draw_divisions(
-    input_image,
-    left_division,
-    right_division,
-    num_divisions=20,
-    line_color=(255, 0, 0),
-    line_width=2,
-    head_margin_percent=0.1,
-):
-    """
-    Create both 9:16 and 16:9 crops and draw guide lines.
-
-    Args:
-        input_image (Image): PIL Image
-        left_division (int): Left-side division number (1-20)
-        right_division (int): Right-side division number (1-20)
-        num_divisions (int): Total number of divisions (default=20)
-        line_color (tuple): RGB color tuple for lines (default: red)
-        line_width (int): Width of lines in pixels (default: 2)
-        head_margin_percent (float): Percentage margin above head (default: 0.1)
-
-    Returns:
-        tuple: (cropped_image_16_9, image_with_lines, cropped_image_9_16)
-    """
-    yolo_model = model
-    # Calculate division width and boundaries
-    division_width = input_image.width / num_divisions
-    left_boundary = (left_division - 1) * division_width
-    right_boundary = right_division * division_width
-
-    # First get the 9:16 crop
-    cropped_image_9_16 = input_image.crop(
-        (left_boundary, 0, right_boundary, input_image.height)
-    )
-
-    # Run YOLO on the 9:16 crop to get person bbox
-    bbox = (
-        yolo_model(cropped_image_9_16, classes=[0], conf=0.6)[0]
-        .boxes.xyxy.cpu()
-        .numpy()[0]
-    )
-    x1, y1, x2, y2 = bbox
-
-    # Calculate top boundary with head margin
-    head_margin = (y2 - y1) * head_margin_percent
-    top_boundary = max(0, y1 - head_margin)
-
-    # Calculate 16:9 dimensions based on the width between divisions
-    crop_width = right_boundary - left_boundary
-    crop_height_16_9 = int(crop_width * 9 / 16)
-
-    # Calculate bottom boundary for 16:9
-    bottom_boundary = min(input_image.height, top_boundary + crop_height_16_9)
-
-    # Create 16:9 crop from original image
-    cropped_image_16_9 = input_image.crop(
-        (left_boundary, top_boundary, right_boundary, bottom_boundary)
-    )
-
-    # Draw guide lines for both crops on original image
-    image_with_lines = input_image.copy()
-    draw = ImageDraw.Draw(image_with_lines)
-
-    # Draw vertical lines (for both crops)
-    draw.line(
-        [(left_boundary, 0), (left_boundary, input_image.height)],
-        fill=line_color,
-        width=line_width,
-    )
-    draw.line(
-        [(right_boundary, 0), (right_boundary, input_image.height)],
-        fill=line_color,
-        width=line_width,
-    )
-
-    # Draw horizontal lines (for 16:9 crop)
-    draw.line(
-        [(left_boundary, top_boundary), (right_boundary, top_boundary)],
-        fill=line_color,
-        width=line_width,
-    )
-    draw.line(
-        [(left_boundary, bottom_boundary), (right_boundary, bottom_boundary)],
-        fill=line_color,
-        width=line_width,
-    )
-
-    return cropped_image_16_9, image_with_lines, cropped_image_9_16
-
-
-def analyze_image(numbered_input_image: Image, prompt, input_image):
+def analyze_image(numbered_input_image: Image, prompt, input_image, ct):
     """
     Perform inference on an image using GPT-4V.

@@ -278,7 +188,7 @@ def analyze_image(numbered_input_image: Image, prompt, input_image):
             {"role": "assistant", "content": response.choices[0].message.content},
             {
                 "role": "user",
-                "content": "please return the response in the json with keys left_row and right_row",
+                "content": "please return the response in the json with keys left_row, right_row, and num_of_speakers",
             },
         ],
     )
@@ -294,24 +204,16 @@ def analyze_image(numbered_input_image: Image, prompt, input_image):

     try:
         if left_index != -1 and right_index != -1:
+            print(response[left_index : right_index + 1])
             response_json = eval(response[left_index : right_index + 1])
-            cropped_image_16_9, image_with_lines, cropped_image_9_16 = (
-                crop_and_draw_divisions(
-                    input_image=input_image,
-                    left_division=response_json["left_row"],
-                    right_division=response_json["right_row"],
-                )
-            )
     except Exception as e:
         print(e)
-        return
+        return 0, 20

     return (
-        cropped_image_16_9,
-        image_with_lines,
-        cropped_image_9_16,
         response_json["left_row"],
         response_json["right_row"],
+        response_json["num_of_speakers"],
     )


@@ -339,16 +241,17 @@ def get_sprite_firebase(cid, rsid, uid):
     return data.val()


-def find_persons_center(image):
+def find_persons_center(image, num_of_speakers=1):
     """
-    Find the center point of all persons in the image.
-    If multiple persons are detected, merge their bounding boxes.
+    Find the center point of the largest num_of_speakers persons in the image.
+    If multiple persons are detected, merge the bounding boxes of only the largest ones.

     Args:
         image: CV2/numpy array image
+        num_of_speakers: Number of speakers to consider (default: 1)

     Returns:
-        int: x-coordinate of the center point of all persons
+        int: x-coordinate of the center point of all considered persons
     """
     # Detect persons (class 0 in COCO dataset)
     results = model(image, classes=[0], conf=0.6)
@@ -370,18 +273,35 @@ def find_persons_center(image):
         print(f"Single person detected at center x: {center_x}")
         return center_x
     else:
-        # Multiple persons - merge all bounding boxes
-        left_x = min(box[0] for box in boxes)
-        right_x = max(box[2] for box in boxes)
+        # Multiple persons - consider only the largest num_of_speakers boxes
+
+        # Calculate area for each box
+        box_areas = [(box[2] - box[0]) * (box[3] - box[1]) for box in boxes]
+
+        # Sort boxes by area (largest first) and take top num_of_speakers
+        sorted_indices = sorted(
+            range(len(box_areas)), key=lambda i: box_areas[i], reverse=True
+        )
+
+        # Use all available boxes if fewer detected than requested
+        num_boxes_to_use = min(num_of_speakers, len(boxes))
+        selected_indices = sorted_indices[:num_boxes_to_use]
+        selected_boxes = [boxes[i] for i in selected_indices]
+
+        # Create a merged bounding box from selected boxes
+        left_x = min(box[0] for box in selected_boxes)
+        right_x = max(box[2] for box in selected_boxes)
         merged_center_x = int((left_x + right_x) // 2)

-        print(f"Merged bounding box center x: {merged_center_x}")
+        print(
+            f"{num_boxes_to_use} largest persons merged bounding box center x: {merged_center_x}"
+        )
         print(f"Merged bounds: left={left_x}, right={right_x}")

         return merged_center_x


-def create_layouts(image, left_division, right_division):
+def create_layouts(image, left_division, right_division, num_of_speakers):
     """
     Create different layout variations of the image using specific aspect ratios.
     All layout variations will be centered on detected persons.
@@ -436,11 +356,26 @@ def create_layouts(image, left_division, right_division):
         person_top = y1
         person_height = y2 - y1
     else:
-        # Multiple persons - merge bounding boxes of all persons
-        left_x = min(box[0] for box in boxes)
-        right_x = max(box[2] for box in boxes)
-        top_y = min(box[1] for box in boxes)
-        bottom_y = max(box[3] for box in boxes)
+        # Multiple persons - consider only the largest num_of_speakers boxes
+
+        # Calculate area for each box
+        box_areas = [(box[2] - box[0]) * (box[3] - box[1]) for box in boxes]
+
+        # Sort boxes by area (largest first) and take top num_of_speakers
+        sorted_indices = sorted(
+            range(len(box_areas)), key=lambda i: box_areas[i], reverse=True
+        )
+
+        # Use all available boxes if fewer detected than requested
+        num_boxes_to_use = min(num_of_speakers, len(boxes))
+        selected_indices = sorted_indices[:num_boxes_to_use]
+        selected_boxes = [boxes[i] for i in selected_indices]
+
+        # Merge bounding boxes of selected boxes
+        left_x = min(box[0] for box in selected_boxes)
+        right_x = max(box[2] for box in selected_boxes)
+        top_y = min(box[1] for box in selected_boxes)  # Top of highest person
+        bottom_y = max(box[3] for box in selected_boxes)  # Bottom of lowest person

     cutout_center_x = int((left_x + right_x) // 2)
     cutout_center_y = int((top_y + bottom_y) // 2)
@@ -451,13 +386,13 @@ def create_layouts(image, left_division, right_division):
     aspect_16_9 = 16 / 9
     aspect_9_16 = 9 / 16

-    # For 16:9 version (with margin above person)
+    # For 16:9 version (with 5% margin above person)
     target_height_16_9 = int(cutout_width / aspect_16_9)
     if target_height_16_9 <= cutout_height:
-        # Calculate person height for top margin
+        # Calculate 5% of person height for top margin
         top_margin = int(person_height * 0.05)

-        # Start above the person's top
+        # Start 5% above the person's top
         y_start = int(max(0, person_top - top_margin))

         # If this would make the crop exceed the bottom, adjust y_start
@@ -1578,7 +1513,7 @@ def draw_layout_regions(
     return visualization_pil


-def get_image_crop(cid=None, rsid=None, uid=None):
+def get_image_crop(cid=None, rsid=None, uid=None, ct=None):
     """
     Function that returns both standard and layout variations for visualization.

@@ -1591,8 +1526,12 @@ def get_image_crop(cid=None, rsid=None, uid=None):
         durations = [sprite_data["duration"] for sprite_data in sprites_data]
     except Exception:
         image_paths = [
-            "data/C2-Roll3D-i2x-Take2-Nov19.24-PST02.31.31pm.jpg",
-            "data/E2-HamzaA-i2x-Take2-Nov19.24-PST02.31.31pm.jpg",
+            # "data/C2-Roll3D-i2x-Take2-Nov19.24-PST02.31.31pm.jpg",
+            # "data/E2-HamzaA-i2x-Take2-Nov19.24-PST02.31.31pm.jpg",
+            "data/F2-Roll4D-i2x-Take2-Nov19.24-PST02.31.31pm.jpg",
+            "data/G2-Roll5D-i2x-Take2-Nov19.24-PST02.31.31pm.jpg",
+            "data/C1-Roll10D-i1x-Take2-Mar20.25-PST12.14.56pm.jpg",
+            "data/C2-Roll10D-i2x-Take2-Mar20.25-PST12.14.56pm.jpg",
         ]

     # Lists to store all images
@@ -1625,13 +1564,9 @@ def get_image_crop(cid=None, rsid=None, uid=None):

     # Analyze the image to get optimal crop divisions
     # This uses GPT-4V to identify the optimal crop points
-    (
-        _,
-        _,
-        _,
-        left_division,
-        right_division,
-    ) = analyze_image(numbered_mid_image, remove_unwanted_prompt(2), mid_image)
+    (left_division, right_division, num_of_speakers) = analyze_image(
+        numbered_mid_image, remove_unwanted_prompt(1), mid_image, ct
+    )

     # Safety check for divisions
     if left_division <= 0:
@@ -1651,7 +1586,7 @@ def get_image_crop(cid=None, rsid=None, uid=None):
         twothirdhalfs_layouts,
         twoequalhalfs_layouts,
         visualization_data,
-    ) = create_layouts(mid_image, left_division, right_division)
+    ) = create_layouts(mid_image, left_division, right_division, num_of_speakers)

     # Create all the required visualizations
     # 1. Standard aspect ratio visualization (16:9 and 9:16)
prompts.py
CHANGED
@@ -152,12 +152,14 @@ If the user provides the correct call type, use the correct_call_type function t


 def remove_unwanted_prompt(number_of_speakers: int):
-    if number_of_speakers == 2:
-        return """I want to crop this image only when absolutely necessary to remove partial objects or humans.
+    if number_of_speakers == 1:
+        return """I want to crop this image only when absolutely necessary to remove partial objects or partial humans.

 Please analyze the image and tell me:
 1. The column number (1-20) on the left side where I should start the crop. Only suggest cropping (columns 1-4) if there are clear partial objects or humans that need removal. If no cropping is needed on the left, return 1.

 2. The column number (1-20) on the right side where I should end the crop. Only suggest cropping (columns 17-20) if there are clear partial objects or humans that need removal. If no cropping is needed on the right, return 20.

-I'm looking for minimal cropping - only cut when absolutely necessary to remove distracting partial elements."""
+I'm looking for minimal cropping - only cut when absolutely necessary to remove distracting partial elements.
+Also tell the number of speakers that are completely visible and should be part of the crop. Generally it is either 1 or 2 but can be more.
+"""