|
|
import numpy as np
|
|
|
from PIL import Image
|
|
|
import base64
|
|
|
import io
|
|
|
|
|
|
def process_image_with_models(
    image: Image.Image,
    models: dict,
    box_threshold: float = 0.05,
    iou_threshold: float = 0.1
) -> tuple:
    """Run object detection and captioning on an image.

    The detector stored under ``models['yolo_model']`` is called on the
    image, the processor/caption model pair under ``models['processor']``
    and ``models['caption_model']`` generates captions, and the detector's
    annotated rendering is serialised as a base64-encoded PNG.

    NOTE(review): the detector is assumed to follow the Ultralytics calling
    convention (``conf``/``iou`` keyword arguments, BGR ndarray input, BGR
    output from ``plot()``) -- confirm against the code that builds
    ``models``.

    Args:
        image: Input PIL image. It is converted to 3-channel RGB for the
            detector; the captioning processor receives it unchanged.
        models: Mapping that must provide the keys ``'yolo_model'``,
            ``'processor'`` and ``'caption_model'``.
        box_threshold: Minimum detection confidence, forwarded to the
            detector as ``conf``.
        iou_threshold: IoU threshold for non-maximum suppression, forwarded
            to the detector as ``iou``.

    Returns:
        A 3-tuple ``(img_str, coordinates, captions)``:
            * ``img_str`` -- base64-encoded PNG of the annotated image, as
              ``bytes`` (the raw output of ``base64.b64encode``; call
              ``.decode()`` if text is needed).
            * ``coordinates`` -- ``boxes.xyxy`` of the first result
              converted to a nested Python list (empty when nothing is
              detected).
            * ``captions`` -- list of strings decoded from the caption
              model output with special tokens skipped.

    Raises:
        KeyError: If ``models`` lacks one of the required keys.
    """
    # Normalise to 3-channel RGB so grayscale, palette and RGBA inputs do
    # not reach the detector with an unexpected channel layout.
    rgb_array = np.asarray(image.convert("RGB"))

    # PIL yields RGB, but raw ndarrays are interpreted as BGR by the
    # detector, so reverse the channel axis. ascontiguousarray removes the
    # negative stride left by the reversed view.
    bgr_array = np.ascontiguousarray(rgb_array[:, :, ::-1])

    # Forward the caller's thresholds; previously they were accepted by this
    # function but never applied.
    results = models['yolo_model'](
        bgr_array,
        conf=box_threshold,
        iou=iou_threshold,
    )
    detection = results[0]
    coordinates = detection.boxes.xyxy.cpu().numpy().tolist()

    # Caption the whole image with beam search.
    processor = models['processor']
    inputs = processor(images=image, return_tensors="pt")
    outputs = models['caption_model'].generate(
        **inputs,
        max_length=50,
        num_beams=5,
        early_stopping=True,
    )
    captions = processor.batch_decode(outputs, skip_special_tokens=True)

    # plot() returns the annotated frame in BGR order; flip back to RGB
    # before handing it to PIL so the colours are not swapped.
    annotated_rgb = np.ascontiguousarray(detection.plot()[:, :, ::-1])
    labeled_img = Image.fromarray(annotated_rgb)

    with io.BytesIO() as buffered:
        labeled_img.save(buffered, format="PNG")
        # Kept as bytes (not str) to preserve the existing return contract.
        img_str = base64.b64encode(buffered.getvalue())

    return img_str, coordinates, captions
|
|
|
|