import torch import gradio as gr from transformers import Owlv2Processor, Owlv2ForObjectDetection, pipeline import spaces from pathlib import Path # Use GPU if available if torch.cuda.is_available(): device = torch.device("cuda") else: device = torch.device("cpu") model = Owlv2ForObjectDetection.from_pretrained("google/owlv2-base-patch16-ensemble").to(device) processor = Owlv2Processor.from_pretrained("google/owlv2-base-patch16-ensemble") translator = pipeline("translation", model="Helsinki-NLP/opus-mt-fr-en") @spaces.GPU def query_image(img, description, score_threshold): description=description translation_result = translator(description , src_lang="fr", tgt_lang="en") description = translation_result[0]['translation_text'] description = description.split(",") size = max(img.shape[:2]) target_sizes = torch.Tensor([[size, size]]) inputs = processor(text=description, images=img, return_tensors="pt").to(device) with torch.no_grad(): outputs = model(**inputs) outputs.logits = outputs.logits.cpu() outputs.pred_boxes = outputs.pred_boxes.cpu() results = processor.post_process_object_detection(outputs=outputs, target_sizes=target_sizes) boxes, scores, labels = results[0]["boxes"], results[0]["scores"], results[0]["labels"] result_labels = [] for box, score, label in zip(boxes, scores, labels): box = [int(i) for i in box.tolist()] if score < score_threshold: continue result_labels.append((box, description[label.item()])) return img, result_labels logo = r"""