# models/vision.py -- Working
"""Thin wrapper around a Hugging Face ``object-detection`` pipeline."""
from typing import Any, Dict, List

from PIL import Image
from transformers import pipeline

# Score cut-off the object-detection pipeline applies on its own when the
# caller does not pass ``threshold``.
_PIPELINE_DEFAULT_THRESHOLD = 0.5

# Order in which the pipeline's box dict is flattened into a list.
_BOX_KEYS = ("xmin", "ymin", "xmax", "ymax")


class VisionModel:
    """Run object detection on images and return score-filtered detections."""

    def __init__(
        self,
        model_name: str = "valentinafeve/yolos-fashionpedia",
        threshold: float = 0.7,
    ):
        """Build the detection pipeline.

        Args:
            model_name: Model identifier handed to ``pipeline`` as ``model``.
            threshold: Minimum score a detection needs to be returned by
                ``detect``; detections scoring exactly ``threshold`` are kept.
        """
        self.pipe = pipeline("object-detection", model=model_name)
        self.threshold = threshold

    def detect(self, image: Image.Image) -> List[Dict[str, Any]]:
        """Detect objects in ``image`` and keep those at or above the threshold.

        Args:
            image: Image to analyse; converted to RGB first when it is in any
                other mode.

        Returns:
            One dict per kept detection, in the order the pipeline produced
            them, with keys ``"label"`` (taken from the pipeline result),
            ``"score"`` (float) and ``"box"`` (a list of floats
            ``[xmin, ymin, xmax, ymax]``).
        """
        # 1) Ensure RGB
        if image.mode != "RGB":
            image = image.convert("RGB")

        # 2) Run detection.
        # The pipeline drops low-scoring detections itself (default cut-off
        # 0.5), so a configured threshold below that would otherwise never take
        # effect. Only ever lower the pipeline's cut-off: for thresholds at or
        # above the default the call behaves exactly as it did without the
        # argument, and the filter below does the real work.
        pipe_threshold = min(self.threshold, _PIPELINE_DEFAULT_THRESHOLD)
        results = self.pipe(image, threshold=pipe_threshold)

        # 3) Process & filter
        processed = []
        for r in results:
            score = float(r["score"])
            if score < self.threshold:
                continue
            # r["box"] is a dict: {"xmin":..., "ymin":..., "xmax":..., "ymax":...}
            box = r["box"]
            processed.append(
                {
                    "label": r["label"],
                    "score": score,
                    "box": [float(box[key]) for key in _BOX_KEYS],
                }
            )
        return processed