File size: 6,743 Bytes
1abc56b
f5eca3d
1abc56b
f5eca3d
1abc56b
 
f5eca3d
1abc56b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f5eca3d
1abc56b
 
f5eca3d
3e8f2a5
f5eca3d
1abc56b
 
 
 
 
 
 
 
 
 
 
 
 
a2238e9
1abc56b
 
 
3e8f2a5
 
1abc56b
 
3e8f2a5
1abc56b
3e8f2a5
 
 
 
 
1abc56b
3e8f2a5
 
1abc56b
3e8f2a5
 
 
1abc56b
3e8f2a5
1abc56b
3e8f2a5
1abc56b
3e8f2a5
 
 
 
1abc56b
 
3e8f2a5
1abc56b
3e8f2a5
1abc56b
3e8f2a5
 
1abc56b
3e8f2a5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1abc56b
3e8f2a5
1abc56b
 
 
 
 
3e8f2a5
 
1abc56b
3e8f2a5
 
1abc56b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3e8f2a5
1abc56b
 
3e8f2a5
1abc56b
 
 
 
 
 
 
 
f5eca3d
3e8f2a5
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
import base64
import io
import logging
import os
import tempfile

import gradio as gr
import numpy as np
from dotenv import load_dotenv
from openai import OpenAI
from PIL import Image, ImageDraw
from roboflow import Roboflow

# Load environment variables from a local .env file (if present) into os.environ.
load_dotenv()

# Configure module-wide logging: INFO level with timestamped records.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# Read required API keys; fail fast at import time if either is missing
# so the app never starts half-configured.
roboflow_key = os.getenv("ROBOFLOW_API_KEY")
if not roboflow_key:
    raise ValueError("ROBOFLOW_API_KEY is missing. Please add it to the .env file.")

openai_key = os.getenv("OPENAI_API_KEY")
if not openai_key:
    raise ValueError("OPENAI_API_KEY is missing. Please add it to the .env file.")

# Initialize Roboflow and OpenAI clients.
rf = Roboflow(api_key=roboflow_key)
project = rf.workspace("alat-pelindung-diri").project("nescafe-4base")
model = project.version(16).model  # hosted detection model, version 16 of this Roboflow project

client_openai = OpenAI(api_key=openai_key)

# Function to detect objects and estimate occluded objects
def detect_and_estimate_objects(image):
    """Detect Nestlé cans with the Roboflow YOLO model, annotate the image,
    then ask GPT-4o to count visible and occluded cans.

    Args:
        image: PIL image uploaded by the user. NOTE: bounding boxes are drawn
            onto this image in place, so the caller's object is mutated.

    Returns:
        tuple[str, str]: (path to the annotated JPEG, GPT result text) on
        success, or (None, "Error: ...") on failure — errors are reported to
        the UI rather than raised.
    """
    temp_file_path = None
    try:
        # Save the image to a temporary file so the hosted model can read it.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".jpg") as temp_file:
            image.save(temp_file, format="JPEG")
            temp_file_path = temp_file.name

        logger.info("Image saved successfully for processing.")

        # Step 1: YOLO detection via the hosted Roboflow model.
        predictions = model.predict(temp_file_path, confidence=70, overlap=80).json()
        class_count = {}
        object_positions = []

        # Draw bounding boxes onto the input image (mutates it in place).
        draw = ImageDraw.Draw(image)
        for prediction in predictions['predictions']:
            class_name = prediction['class']
            # Roboflow reports box centers plus width/height; convert to corners.
            x, y, width, height = prediction['x'], prediction['y'], prediction['width'], prediction['height']
            left = int(x - width / 2)
            top = int(y - height / 2)
            right = int(x + width / 2)
            bottom = int(y + height / 2)

            draw.rectangle([left, top, right, bottom], outline="red", width=4)

            # Count occurrences of detected classes.
            class_count[class_name] = class_count.get(class_name, 0) + 1
            object_positions.append((left, top, right, bottom))

        logger.info(f"YOLO detected objects: {class_count}")

        # Step 2: Base64-encode the annotated image in memory. The original
        # implementation wrote a second NamedTemporaryFile here, which
        # clobbered temp_file_path and leaked the first temp file every call.
        buffer = io.BytesIO()
        image.save(buffer, format="JPEG")
        base64_image = base64.b64encode(buffer.getvalue()).decode("utf-8")
        logger.info(f"Base64 encoding successful. Length: {len(base64_image)}")

        # Step 3: Use GPT-4 to estimate occluded objects
        response = client_openai.chat.completions.create(
            model="gpt-4o",
            messages=[
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": """Please count the number of cans of the following Nestlé products in the image, including those that are partially obstructed or hidden.
                            For partially visible or obstructed cans, please estimate their number based on visible clues and assume that they belong to the same product in front of them.
                            Please count accurately the number of cans of the following Nestlé products in the image:
                            - Nescafe Mocha
                            - Nescafe Latte
                            - Nescafe Original
                            - Bear Brand
                            - Nescafe Cappuccino
                            - Nescafe Ice Black
                            - Nescafe Coconut Latte
                            - Nescafe Caramel
                            Please note that some products may be partially visible or obstructed, but are still important to count. Products that are only partially visible or obstructed. Think of them as cans of the same product in front of them.
                            Please count the visible cans as well as the occluded ones. For partially hidden cans, assume they are the same product and estimate their presence based on the visible portion.

                            Provide your response in the format:
                            Nescafé Mocha: [number]
                            Nescafé Latte: [number]
                            Nescafé Original: [number]
                            Bear Brand: [number]
                            Nescafé Cappuccino: [number]
                            Nescafé Ice Black: [number]
                            Nescafé Coconut Latte: [number]
                            Nescafé Caramel: [number]
                            Total Nestlé Products: [Total number of Nestlé products]""",
                        },
                        {
                            "type": "image_url",
                            "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
                        },
                    ],
                }
            ],
        )
        gpt_estimation = response.choices[0].message.content.strip()

        logger.info(f"GPT-4 estimation: {gpt_estimation}")

        # Step 4: Combine YOLO and GPT results (GPT-only for now).
        result_text = f"Results from GPT-4:\n{gpt_estimation}"

        # Step 5: Save the annotated image to a portable temp-dir path for
        # the Gradio output component (gettempdir() instead of hard-coded /tmp).
        output_path = os.path.join(tempfile.gettempdir(), "prediction_result.jpg")
        image.save(output_path)

        logger.info("Processed image saved successfully.")

        return output_path, result_text

    except Exception as e:
        logger.error(f"Error during processing: {e}")
        return None, f"Error: {e}"

    finally:
        # Always remove the temp file handed to the YOLO model, even when an
        # exception was raised mid-pipeline (the original leaked it on error).
        if temp_file_path and os.path.exists(temp_file_path):
            os.remove(temp_file_path)

# Build and launch the Gradio UI: one upload slot feeding the detection
# pipeline, with the annotated image and the count text as outputs.
with gr.Blocks() as iface:
    gr.Markdown("### Object Detection and Counting with YOLO and GPT-4 Assistance")

    with gr.Row():
        img_in = gr.Image(type="pil", label="Upload Image")
        img_out = gr.Image(label="Processed Image")
        txt_out = gr.Textbox(label="Results", interactive=False)

    # Clicking the button runs the full YOLO + GPT pipeline on the upload.
    process_button = gr.Button("Process Image")
    process_button.click(
        fn=detect_and_estimate_objects,
        inputs=[img_in],
        outputs=[img_out, txt_out],
    )

iface.launch(debug=True)