Commit 09fada4
1 Parent(s): 0633d98
add: street interview crops
- app.py +1 -1
- crop_utils.py +66 -131
- prompts.py +5 -3
app.py
CHANGED
@@ -245,7 +245,7 @@ def chat(
     tool_call = response.choices[0].message.tool_calls[0]
     if tool_call.function.name == "get_image":
         # Return the image directly in the chat
-        image_data = get_image_crop(cid, rsid, uid)
+        image_data = get_image_crop(cid, rsid, uid, ct)
         print(response.choices[0].message)
         messages.append(response.choices[0].message)
         function_call_result_message = {
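Note on the hunk above: the only change is threading a new `ct` argument (presumably the call type, given the `correct_call_type` reference in prompts.py) into `get_image_crop`. Below is a minimal, hypothetical sketch of the surrounding tool-call flow; the `role: "tool"` result-message shape follows the standard OpenAI tool-calling convention and is not part of this commit.

    # Illustrative only: how the changed line sits inside a tool-call handler.
    # get_image_crop and the ct argument come from this commit; everything
    # else (function name, message shape) is an assumed generic pattern.
    def handle_get_image(response, messages, cid, rsid, uid, ct, get_image_crop):
        tool_call = response.choices[0].message.tool_calls[0]
        if tool_call.function.name == "get_image":
            # ct is the new fourth argument threaded through by this commit
            image_data = get_image_crop(cid, rsid, uid, ct)
            messages.append(response.choices[0].message)
            messages.append({
                "role": "tool",                # result message answering the call
                "content": image_data,
                "tool_call_id": tool_call.id,  # links the result to the request
            })
        return messages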
crop_utils.py
CHANGED
@@ -151,97 +151,7 @@ def add_top_numbers(
     return new_image


-def crop_and_draw_divisions(
-    input_image,
-    left_division,
-    right_division,
-    num_divisions=20,
-    line_color=(255, 0, 0),
-    line_width=2,
-    head_margin_percent=0.1,
-):
-    """
-    Create both 9:16 and 16:9 crops and draw guide lines.
-
-    Args:
-        input_image (Image): PIL Image
-        left_division (int): Left-side division number (1-20)
-        right_division (int): Right-side division number (1-20)
-        num_divisions (int): Total number of divisions (default=20)
-        line_color (tuple): RGB color tuple for lines (default: red)
-        line_width (int): Width of lines in pixels (default: 2)
-        head_margin_percent (float): Percentage margin above head (default: 0.1)
-
-    Returns:
-        tuple: (cropped_image_16_9, image_with_lines, cropped_image_9_16)
-    """
-    yolo_model = model
-    # Calculate division width and boundaries
-    division_width = input_image.width / num_divisions
-    left_boundary = (left_division - 1) * division_width
-    right_boundary = right_division * division_width
-
-    # First get the 9:16 crop
-    cropped_image_9_16 = input_image.crop(
-        (left_boundary, 0, right_boundary, input_image.height)
-    )
-
-    # Run YOLO on the 9:16 crop to get person bbox
-    bbox = (
-        yolo_model(cropped_image_9_16, classes=[0], conf=0.6)[0]
-        .boxes.xyxy.cpu()
-        .numpy()[0]
-    )
-    x1, y1, x2, y2 = bbox
-
-    # Calculate top boundary with head margin
-    head_margin = (y2 - y1) * head_margin_percent
-    top_boundary = max(0, y1 - head_margin)
-
-    # Calculate 16:9 dimensions based on the width between divisions
-    crop_width = right_boundary - left_boundary
-    crop_height_16_9 = int(crop_width * 9 / 16)
-
-    # Calculate bottom boundary for 16:9
-    bottom_boundary = min(input_image.height, top_boundary + crop_height_16_9)
-
-    # Create 16:9 crop from original image
-    cropped_image_16_9 = input_image.crop(
-        (left_boundary, top_boundary, right_boundary, bottom_boundary)
-    )
-
-    # Draw guide lines for both crops on original image
-    image_with_lines = input_image.copy()
-    draw = ImageDraw.Draw(image_with_lines)
-
-    # Draw vertical lines (for both crops)
-    draw.line(
-        [(left_boundary, 0), (left_boundary, input_image.height)],
-        fill=line_color,
-        width=line_width,
-    )
-    draw.line(
-        [(right_boundary, 0), (right_boundary, input_image.height)],
-        fill=line_color,
-        width=line_width,
-    )
-
-    # Draw horizontal lines (for 16:9 crop)
-    draw.line(
-        [(left_boundary, top_boundary), (right_boundary, top_boundary)],
-        fill=line_color,
-        width=line_width,
-    )
-    draw.line(
-        [(left_boundary, bottom_boundary), (right_boundary, bottom_boundary)],
-        fill=line_color,
-        width=line_width,
-    )
-
-    return cropped_image_16_9, image_with_lines, cropped_image_9_16
-
-
-def analyze_image(numbered_input_image: Image, prompt, input_image):
+def analyze_image(numbered_input_image: Image, prompt, input_image, ct):
     """
     Perform inference on an image using GPT-4V.

@@ -278,7 +188,7 @@ def analyze_image(numbered_input_image: Image, prompt, input_image):
             {"role": "assistant", "content": response.choices[0].message.content},
             {
                 "role": "user",
-                "content": "please return the response in the json with keys left_row and right_row",
+                "content": "please return the response in the json with keys left_row, right_row, and num_of_speakers",
             },
         ],
     )
@@ -294,24 +204,16 @@ def analyze_image(numbered_input_image: Image, prompt, input_image):

     try:
         if left_index != -1 and right_index != -1:
+            print(response[left_index : right_index + 1])
             response_json = eval(response[left_index : right_index + 1])
-            cropped_image_16_9, image_with_lines, cropped_image_9_16 = (
-                crop_and_draw_divisions(
-                    input_image=input_image,
-                    left_division=response_json["left_row"],
-                    right_division=response_json["right_row"],
-                )
-            )
     except Exception as e:
         print(e)
-        return
+        return 0, 20

     return (
-        cropped_image_16_9,
-        image_with_lines,
-        cropped_image_9_16,
         response_json["left_row"],
         response_json["right_row"],
+        response_json["num_of_speakers"],
     )


@@ -339,16 +241,17 @@ def get_sprite_firebase(cid, rsid, uid):
     return data.val()


-def find_persons_center(image):
+def find_persons_center(image, num_of_speakers=1):
     """
-    Find the center point of all persons in the image.
-    If multiple persons are detected, merge their bounding boxes.
+    Find the center point of the largest num_of_speakers persons in the image.
+    If multiple persons are detected, merge the bounding boxes of only the largest ones.

     Args:
         image: CV2/numpy array image
+        num_of_speakers: Number of speakers to consider (default: 1)

     Returns:
-        int: x-coordinate of the center point of all persons
+        int: x-coordinate of the center point of all considered persons
     """
     # Detect persons (class 0 in COCO dataset)
     results = model(image, classes=[0], conf=0.6)
@@ -370,18 +273,35 @@ def find_persons_center(image):
         print(f"Single person detected at center x: {center_x}")
         return center_x
     else:
-        # Multiple persons - merge all bounding boxes
-        left_x = min(box[0] for box in boxes)
-        right_x = max(box[2] for box in boxes)
+        # Multiple persons - consider only the largest num_of_speakers boxes
+
+        # Calculate area for each box
+        box_areas = [(box[2] - box[0]) * (box[3] - box[1]) for box in boxes]
+
+        # Sort boxes by area (largest first) and take top num_of_speakers
+        sorted_indices = sorted(
+            range(len(box_areas)), key=lambda i: box_areas[i], reverse=True
+        )
+
+        # Use all available boxes if fewer detected than requested
+        num_boxes_to_use = min(num_of_speakers, len(boxes))
+        selected_indices = sorted_indices[:num_boxes_to_use]
+        selected_boxes = [boxes[i] for i in selected_indices]
+
+        # Create a merged bounding box from selected boxes
+        left_x = min(box[0] for box in selected_boxes)
+        right_x = max(box[2] for box in selected_boxes)
         merged_center_x = int((left_x + right_x) // 2)

-        print(f"Merged bounding box center x: {merged_center_x}")
+        print(
+            f"{num_boxes_to_use} largest persons merged bounding box center x: {merged_center_x}"
+        )
         print(f"Merged bounds: left={left_x}, right={right_x}")

         return merged_center_x


-def create_layouts(image, left_division, right_division):
+def create_layouts(image, left_division, right_division, num_of_speakers):
     """
     Create different layout variations of the image using specific aspect ratios.
     All layout variations will be centered on detected persons.
@@ -436,11 +356,26 @@ def create_layouts(image, left_division, right_division):
         person_top = y1
         person_height = y2 - y1
     else:
-        # Multiple persons - merge bounding boxes of all persons
-        left_x = min(box[0] for box in boxes)
-        right_x = max(box[2] for box in boxes)
-        top_y = min(box[1] for box in boxes)
-        bottom_y = max(box[3] for box in boxes)
+        # Multiple persons - consider only the largest num_of_speakers boxes
+
+        # Calculate area for each box
+        box_areas = [(box[2] - box[0]) * (box[3] - box[1]) for box in boxes]
+
+        # Sort boxes by area (largest first) and take top num_of_speakers
+        sorted_indices = sorted(
+            range(len(box_areas)), key=lambda i: box_areas[i], reverse=True
+        )
+
+        # Use all available boxes if fewer detected than requested
+        num_boxes_to_use = min(num_of_speakers, len(boxes))
+        selected_indices = sorted_indices[:num_boxes_to_use]
+        selected_boxes = [boxes[i] for i in selected_indices]
+
+        # Merge bounding boxes of selected boxes
+        left_x = min(box[0] for box in selected_boxes)
+        right_x = max(box[2] for box in selected_boxes)
+        top_y = min(box[1] for box in selected_boxes)  # Top of highest person
+        bottom_y = max(box[3] for box in selected_boxes)  # Bottom of lowest person

     cutout_center_x = int((left_x + right_x) // 2)
     cutout_center_y = int((top_y + bottom_y) // 2)
@@ -451,13 +386,13 @@ def create_layouts(image, left_division, right_division):
     aspect_16_9 = 16 / 9
     aspect_9_16 = 9 / 16

-    # For 16:9 version (with margin above person)
+    # For 16:9 version (with 5% margin above person)
     target_height_16_9 = int(cutout_width / aspect_16_9)
     if target_height_16_9 <= cutout_height:
-        # Calculate person height for top margin
+        # Calculate 5% of person height for top margin
         top_margin = int(person_height * 0.05)

-        # Start above the person's top
+        # Start 5% above the person's top
         y_start = int(max(0, person_top - top_margin))

         # If this would make the crop exceed the bottom, adjust y_start
@@ -1578,7 +1513,7 @@ def draw_layout_regions(
     return visualization_pil


-def get_image_crop(cid=None, rsid=None, uid=None):
+def get_image_crop(cid=None, rsid=None, uid=None, ct=None):
     """
     Function that returns both standard and layout variations for visualization.

@@ -1591,8 +1526,12 @@ def get_image_crop(cid=None, rsid=None, uid=None):
         durations = [sprite_data["duration"] for sprite_data in sprites_data]
     except Exception:
         image_paths = [
-            "data/C2-Roll3D-i2x-Take2-Nov19.24-PST02.31.31pm.jpg",
-            "data/E2-HamzaA-i2x-Take2-Nov19.24-PST02.31.31pm.jpg",
+            # "data/C2-Roll3D-i2x-Take2-Nov19.24-PST02.31.31pm.jpg",
+            # "data/E2-HamzaA-i2x-Take2-Nov19.24-PST02.31.31pm.jpg",
+            "data/F2-Roll4D-i2x-Take2-Nov19.24-PST02.31.31pm.jpg",
+            "data/G2-Roll5D-i2x-Take2-Nov19.24-PST02.31.31pm.jpg",
+            "data/C1-Roll10D-i1x-Take2-Mar20.25-PST12.14.56pm.jpg",
+            "data/C2-Roll10D-i2x-Take2-Mar20.25-PST12.14.56pm.jpg",
         ]

     # Lists to store all images
@@ -1625,13 +1564,9 @@ def get_image_crop(cid=None, rsid=None, uid=None):

     # Analyze the image to get optimal crop divisions
     # This uses GPT-4V to identify the optimal crop points
-    (
-        _,
-        _,
-        _,
-        left_division,
-        right_division,
-    ) = analyze_image(numbered_mid_image, remove_unwanted_prompt(2), mid_image)
+    (left_division, right_division, num_of_speakers) = analyze_image(
+        numbered_mid_image, remove_unwanted_prompt(1), mid_image, ct
+    )

     # Safety check for divisions
     if left_division <= 0:
@@ -1651,7 +1586,7 @@ def get_image_crop(cid=None, rsid=None, uid=None):
         twothirdhalfs_layouts,
         twoequalhalfs_layouts,
         visualization_data,
-    ) = create_layouts(mid_image, left_division, right_division)
+    ) = create_layouts(mid_image, left_division, right_division, num_of_speakers)

     # Create all the required visualizations
     # 1. Standard aspect ratio visualization (16:9 and 9:16)
prompts.py
CHANGED
@@ -152,12 +152,14 @@ If the user provides the correct call type, use the correct_call_type function t


 def remove_unwanted_prompt(number_of_speakers: int):
-    if number_of_speakers == 2:
-        return """I want to crop this image only when absolutely necessary to remove partial objects or humans.
+    if number_of_speakers == 1:
+        return """I want to crop this image only when absolutely necessary to remove partial objects or partial humans.

 Please analyze the image and tell me:
 1. The column number (1-20) on the left side where I should start the crop. Only suggest cropping (columns 1-4) if there are clear partial objects or humans that need removal. If no cropping is needed on the left, return 1.

 2. The column number (1-20) on the right side where I should end the crop. Only suggest cropping (columns 17-20) if there are clear partial objects or humans that need removal. If no cropping is needed on the right, return 20.

-I'm looking for minimal cropping - only cut when absolutely necessary to remove distracting partial elements."""
+I'm looking for minimal cropping - only cut when absolutely necessary to remove distracting partial elements.
+Also tell the number of speakers that are completely visible and should be part of the crop. Generally it is either 1 or 2 but can be more.
+"""