import numpy as np
from PIL import Image
from typing import List
import os
from googletrans import Translator
import cv2
from inference.models.yolo_world.yolo_world import YOLOWorld
import onnxruntime as ort
import requests
import random
import warnings

# Patch ONNXRuntime to only use CPU globally
original_inference_session = ort.InferenceSession

def patched_inference_session(*args, **kwargs):
    kwargs["providers"] = ["CPUExecutionProvider"]
    return original_inference_session(*args, **kwargs)

ort.InferenceSession = patched_inference_session
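# Note: the patch above unconditionally overwrites any providers the caller
# passes, so every ONNX session created in this process runs on CPU regardless
# of call-site arguments.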
warnings.filterwarnings("ignore", category=UserWarning, module="onnxruntime")

# Define Predefined Class Lists for YOLO-World
PREDEFINED_CLASSES = {
    "tourist": [
        "person", "car", "bus", "train", "truck", "boat", "traffic light",
        "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat",
        "dog", "backpack", "umbrella", "handbag", "tie", "suitcase", "building",
        "signboard", "taxi", "rickshaw", "camera", "map", "monument", "souvenir",
        "statue", "fountain", "street sign", "tour guide", "hotel", "restaurant",
        # Added more tourist/India-specific classes
        "temple", "mosque", "church", "fort", "palace", "museum", "market", "bazaar",
        "auto rickshaw", "cycle rickshaw", "metro", "heritage site",
        "ticket counter", "luggage", "water bottle", "scarf", "hat","bus stop",
        "information center", "shopping bag", "vendor", "street food", "food stall",
        "hawker", "street performer", "camel", "elephant ride", "tour bus", "minaret",
        "gopuram", "chhatri", "ghat", "river", "lake", "bridge", "park", "garden",
       
    ],
    "casual": [
        "person", "bicycle", "car", "motorcycle", "airplane", "bus", "train",
        "truck", "boat", "traffic light", "fire hydrant", "stop sign",
        "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow",
        "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella",
        "handbag", "tie", "suitcase", "frisbee", "skis", "snowboard",
        "sports ball", "kite", "baseball bat", "baseball glove", "skateboard",
        "surfboard", "tennis racket", "bottle", "wine glass", "cup", "fork",
        "knife", "spoon", "bowl", "banana", "apple", "sandwich", "orange",
        "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair",
        "couch", "potted plant", "bed", "dining table", "toilet", "tv",
        "laptop", "mouse", "tv remote","remote control", "keyboard", "cell phone", "microwave",
        "oven", "toaster", "book", "clock",
        "scissors", "teddy bear", "toothbrush", "tree", "flower", "park",
        "computer", "desk", "window", "door",
        # Added Indian road/office-specific classes
        "auto rickshaw", "cycle rickshaw", "scooter", "tempo", "tractor", "e-rickshaw",
        "delivery van", "ambulance", "police car", "roadside stall", "food cart",
        "street vendor", "helmet", "road sign", "speed breaker",
        "divider", "pothole", "bus stop", "petrol pump",
        "water dispenser", "printer", "file cabinet", "whiteboard", "projector",
        "security guard", "id card", "notice board",
        "elevator", "staircase", "canteen", "cafeteria", "tea cup", "tiffin box",
        "lunch box", "stationery", "pen", "notebook", "marker", "mouse pad"
    ],
    "kids": [
        "person", "bicycle", "car", "motorcycle", "airplane", "bus", "train",
        "truck", "boat", "bird", "cat", "dog", "horse", "sheep", "cow",
        "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella",
        "handbag", "tie", "suitcase", "frisbee", "skis", "snowboard",
        "sports ball", "kite", "baseball bat", "baseball glove", "skateboard",
        "surfboard", "tennis racket", "bottle", "banana", "apple", "sandwich",
        "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake",
        "chair", "couch", "potted plant", "bed", "dining table", "toilet", "tv",
        "laptop", "mouse", "remote", "keyboard", "cell phone", "book", "clock",
         "scissors", "teddy bear", "hair drier", "toothbrush", "red", "blue",
        "green", "yellow", "orange", "purple", "pink", "black", "white", "gray",
        "brown", "circle", "square", "triangle", "rectangle", "star", "heart",
        "ball", "block", "toy", "doll", "crayon", "slide", "swing", "duck", "lion",
        "tiger", "monkey", "moon", "sun", "cloud", "rainbow",
        # Added geometric solids
        "cylinder", "rectangular prism", "pyramid", "cube", "cone", "sphere", "triangular prism"
    ]
}

# --- Synonym Mapping ---
# Add known synonyms for potentially ambiguous terms
# Keys are the terms users might input, values are terms the model might recognize better
SYNONYM_MAP = {
    "rickshaw": ["tuk-tuk", "auto rickshaw"],
    # Add more synonyms as needed, e.g.:
    "motorbike": ["motorcycle"],
    "automobile": ["car"],
}

# Reverse map to easily find the 'original' term from a synonym
ORIGINAL_TERM_MAP = {}
for original, synonyms in SYNONYM_MAP.items():
    for synonym in synonyms:
        ORIGINAL_TERM_MAP[synonym] = original
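# Illustrative: with the map above, ORIGINAL_TERM_MAP["tuk-tuk"] == "rickshaw".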

def expand_synonyms(class_list: List[str]) -> List[str]:
    """Expands a list of class names with predefined synonyms."""
    expanded_set = set(class_list) # Start with original classes
    for term in class_list:
        if term in SYNONYM_MAP:
            expanded_set.update(SYNONYM_MAP[term])
    return sorted(list(expanded_set))
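
# Illustrative example of the expansion (values follow from SYNONYM_MAP above):
#   expand_synonyms(["rickshaw", "dog"])
#   -> ["auto rickshaw", "dog", "rickshaw", "tuk-tuk"]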
# --- End Synonym Mapping ---

# --- YOLO-World Model Setup (Local, Preloaded) ---
# Preload a separate YOLO-World model for each profile (casual, tourist, kids)
YOLOWORLD_MODELS = {}
for profile in PREDEFINED_CLASSES.keys():
    YOLOWORLD_MODELS[profile] = YOLOWorld(model_id="yolo_world/l")
    YOLOWORLD_MODELS[profile].set_classes(PREDEFINED_CLASSES[profile])

# Patch requests to route through a random proxy from proxies.txt (best effort;
# assigning to requests.Session.proxies affects the process, not just this module)
def patch_requests_with_proxy():
    proxies_path = os.path.join(os.path.dirname(__file__), "proxies.txt")
    try:
        with open(proxies_path, "r") as f:
            proxies = [line.strip() for line in f if line.strip()]
        if proxies:
            proxy = random.choice(proxies)
            proxy_url = f"http://{proxy}"
            requests.Session.proxies = {
                "http": proxy_url,
                "https": proxy_url
            }
    except Exception:
        pass # Silently pass if proxy setup fails
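# proxies.txt is expected to hold one "host:port" entry per line (an assumed
# format, inferred from the f"http://{proxy}" construction above).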

# --- Google Translate Helper ---
def translate_text(text, dest_lang):
    """Translate text to dest_lang via googletrans; fall back to the input on failure."""
    patch_requests_with_proxy()  # Route this call through a random proxy
    try:
        translator = Translator(service_urls=['translate.googleapis.com'])
        result = translator.translate(text, dest=dest_lang)
        return result.text
    except Exception:
        return text  # Fall back to the original text if translation fails
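# Illustrative call (assumes network access; "hi" is the ISO code for Hindi):
#   translate_text("dog", "hi")  # -> "कुत्ता", or "dog" if the request fails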

# --- Helper function to process YOLO-World prediction results ---
def process_yoloworld_results(predictions, original_w: int, original_h: int, scale: float, pad_top: int, pad_left: int, class_filter=None, target_language="en"):
    """
    Process YOLO-World predictions to match the expected output format.
    Transforms coordinates back to the original image space.
    Only translate label if target_language is not English.
    """
    detections = []
    for pred in predictions:
        class_name = pred.class_name # This should be the English class name from the model
        if class_filter and class_name not in class_filter:
            continue

        # Coordinates from the model are on the resized+padded image
        # (pred.x, pred.y) is the center of the box
        box_center_x_padded = float(pred.x)
        box_center_y_padded = float(pred.y)
        box_width_padded = float(pred.width)
        box_height_padded = float(pred.height)

        # 1. Un-pad: Adjust for padding
        box_center_x_resized = box_center_x_padded - pad_left
        box_center_y_resized = box_center_y_padded - pad_top

        # 2. Un-scale: Adjust for scaling
        original_center_x = box_center_x_resized / scale
        original_center_y = box_center_y_resized / scale
        original_width = box_width_padded / scale
        original_height = box_height_padded / scale

        # Calculate corner coordinates in the original image
        original_x1 = original_center_x - (original_width / 2)
        original_y1 = original_center_y - (original_height / 2)
        original_x2 = original_x1 + original_width
        original_y2 = original_y1 + original_height

        # Clip all four corners to the original image bounds. Computing x2/y2
        # before clipping x1/y1 keeps a box that overhangs the left/top edge
        # from being stretched past its true right/bottom edge.
        original_x1 = max(0, min(original_x1, original_w))
        original_y1 = max(0, min(original_y1, original_h))
        original_x2 = max(0, min(original_x2, original_w))
        original_y2 = max(0, min(original_y2, original_h))
        
        final_width = original_x2 - original_x1
        final_height = original_y2 - original_y1
        final_center_x = original_x1 + final_width / 2
        final_center_y = original_y1 + final_height / 2

        # Translate only if target_language is not English
        if target_language and target_language.lower() != "en":
            label_translated = translate_text(class_name, target_language) # Uses the restored translate_text
        else:
            label_translated = class_name

        detections.append({
            "box": [int(original_x1), int(original_y1), int(final_width), int(final_height)],
            "confidence": float(pred.confidence),
            "label": label_translated,      # translated or original label
            "label_en": class_name,         # always original (English) label
            "centre": [int(final_center_x), int(final_center_y)]
        })
    return detections
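
# Each returned detection is a dict shaped like the following (illustrative values):
#   {"box": [x1, y1, w, h], "confidence": 0.87, "label": "कुत्ता",
#    "label_en": "dog", "centre": [cx, cy]}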

# --- YOLO-World Inference Function ---
def run_yoloworld_detection(image: Image.Image, target_classes: set, confidence_threshold: float = 0.1, iou_threshold: float = 0.4, profile: str = "casual", target_language: str = "en"):
    """Run YOLO-World detection for the given profile and filter by target classes."""
    model = YOLOWORLD_MODELS.get(profile, list(YOLOWORLD_MODELS.values())[0])
    
    # Convert PIL Image to OpenCV format
    image_cv = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)

    # --- Image Resizing (maintaining aspect ratio) ---
    target_size = 640
    h, w = image_cv.shape[:2]
    scale = target_size / max(h, w)
    new_w, new_h = int(w * scale), int(h * scale)
    
    resized_image_cv = cv2.resize(image_cv, (new_w, new_h), interpolation=cv2.INTER_AREA)
    
    # Pad to target_size x target_size
    delta_w = target_size - new_w
    delta_h = target_size - new_h
    top, bottom = delta_h // 2, delta_h - (delta_h // 2)
    left, right = delta_w // 2, delta_w - (delta_w // 2)
    
    padded_image_cv = cv2.copyMakeBorder(resized_image_cv, top, bottom, left, right, 
                                         cv2.BORDER_CONSTANT, value=[114, 114, 114]) # Use a neutral padding color

    # Force CPU provider for inference (required for Hugging Face free tier);
    # the global ONNXRuntime patch above enforces this as well.
    results = model.infer(padded_image_cv, confidence=confidence_threshold, iou=iou_threshold, providers=["CPUExecutionProvider"])

    # The raw predictions are in the coordinate space of padded_image_cv.
    # process_yoloworld_results un-pads (subtracting top/left) and un-scales
    # (dividing by scale) each box, so the returned detections are already in
    # the original image's coordinate space.
        
    detections = process_yoloworld_results(results.predictions, w, h, scale, top, left, class_filter=target_classes, target_language=target_language)
    return detections
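
# --- Usage sketch (illustrative, not part of the original pipeline) ---
# Minimal example of the intended call pattern; the image path, class list,
# and thresholds below are assumptions for demonstration only.
if __name__ == "__main__":
    img = Image.open("sample.jpg").convert("RGB")  # hypothetical local image
    wanted = set(expand_synonyms(["person", "rickshaw"]))
    for det in run_yoloworld_detection(img, wanted, confidence_threshold=0.2,
                                       profile="tourist", target_language="en"):
        print(det["label_en"], round(det["confidence"], 2), det["box"])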