# VisionScout / scene_description.py
# DawnC's picture
# Upload 27 files
# 3172319 verified
import os
import json
from typing import Dict, List, Tuple, Any, Optional
from scene_type import SCENE_TYPES
from scene_detail_templates import SCENE_DETAIL_TEMPLATES
from object_template_fillers import OBJECT_TEMPLATE_FILLERS
from activity_templates import ACTIVITY_TEMPLATES
from safety_templates import SAFETY_TEMPLATES
from confifence_templates import CONFIDENCE_TEMPLATES
class SceneDescriptor:
    """
    Generates natural language descriptions of scenes.
    Handles scene descriptions, activity inference, and safety concerns identification.

    Detected objects are plain dictionaries. Every method reads
    ``obj["class_id"]``; some safety checks additionally read ``region``,
    ``normalized_area``, ``class_name`` and ``normalized_center``.
    Numeric class ids carry the meaning given in the inline comments
    (e.g. 0 = person, 9 = traffic light).
    """

    # Scene type -> name of the method producing its scene-specific concerns.
    # Scene types not listed here only receive the general concerns.
    _SCENE_SAFETY_HANDLERS = {
        "upscale_dining": "_upscale_dining_concerns",
        "asian_commercial_street": "_asian_commercial_street_concerns",
        "financial_district": "_financial_district_concerns",
        "urban_intersection": "_urban_intersection_concerns",
        "transit_hub": "_transit_hub_concerns",
        "shopping_district": "_shopping_district_concerns",
    }

    def __init__(self, scene_types=None, object_categories=None):
        """
        Initialize the scene descriptor

        Args:
            scene_types: Dictionary of scene type definitions
            object_categories: Optional mapping of category name to a list of
                class ids. When omitted (or empty), categories come from
                ``_load_json_data("object_categories")`` if such a loader is
                available on the instance, otherwise from built-in defaults.
        """
        # Both spellings are kept for callers; they share a single object so
        # the two attributes can never drift apart.
        self.scene_types = self.SCENE_TYPES = scene_types or {}

        if object_categories:
            self.OBJECT_CATEGORIES = object_categories
        else:
            # Load from JSON or use default values. The loader is looked up
            # defensively because it is not defined in this class body;
            # calling it unconditionally would raise AttributeError.
            loader = getattr(self, "_load_json_data", None)
            loaded = loader("object_categories") if callable(loader) else None
            self.OBJECT_CATEGORIES = loaded or {
                "furniture": [56, 57, 58, 59, 60, 61],
                "electronics": [62, 63, 64, 65, 66, 67, 68, 69, 70],
                "kitchen_items": [39, 40, 41, 42, 43, 44, 45],
                "food": [46, 47, 48, 49, 50, 51, 52, 53, 54, 55],
                "vehicles": [1, 2, 3, 4, 5, 6, 7, 8],
                "personal_items": [24, 25, 26, 27, 28, 73, 78, 79],
            }

        # Load all template data
        self._load_templates()

    def _load_templates(self):
        """
        Bind the imported template dictionaries to instance attributes.

        The module-level template objects are referenced directly (not
        copied). ``_initialize_fallback_templates`` is not invoked here.
        """
        self.confidence_templates = CONFIDENCE_TEMPLATES
        self.scene_detail_templates = SCENE_DETAIL_TEMPLATES
        self.object_template_fillers = OBJECT_TEMPLATE_FILLERS
        self.safety_templates = SAFETY_TEMPLATES
        self.activity_templates = ACTIVITY_TEMPLATES

    def _initialize_fallback_templates(self):
        """Initialize fallback templates when no external data is available"""
        # Only meant for use when templates cannot be loaded from a file or import
        self.confidence_templates = {
            "high": "{description} {details}",
            "medium": "This appears to be {description} {details}",
            "low": "This might be {description}, but the confidence is low. {details}"
        }
        # Only the most basic templates are provided as a fallback
        self.scene_detail_templates = {
            "default": ["A space with various objects."]
        }
        self.object_template_fillers = {
            "default": ["various items"]
        }
        self.safety_templates = {
            "general": "Pay attention to {safety_element}."
        }
        self.activity_templates = {
            "default": ["General activity"]
        }

    def _get_alternative_scenes(self, scene_scores: Dict[str, float],
                                threshold: float, top_k: int = 2) -> List[Dict]:
        """
        Get alternative scene interpretations with their scores.

        Args:
            scene_scores: Dictionary of scene type scores
            threshold: Minimum confidence threshold
            top_k: Number of alternatives to return; values below zero are
                treated as zero

        Returns:
            List of dictionaries with keys "type", "name" and "confidence",
            ordered by descending score. The best match is excluded, and
            candidates scoring below ``threshold`` are dropped.
        """
        # Sort scenes by score in descending order
        ranked = sorted(scene_scores.items(), key=lambda item: item[1], reverse=True)

        # A negative top_k would otherwise turn into a slice such as [1:-2]
        # and return entries the caller never asked for.
        limit = max(top_k, 0)

        # Skip the first one (best match) and take the next `limit` entries
        return [
            {
                "type": scene_type,
                "name": self.SCENE_TYPES.get(scene_type, {}).get("name", "Unknown"),
                "confidence": score,
            }
            for scene_type, score in ranked[1:1 + limit]
            if score >= threshold
        ]

    def _infer_possible_activities(self, scene_type: str, detected_objects: List[Dict]) -> List[str]:
        """
        Infer possible activities based on scene type and detected objects.

        Args:
            scene_type: Identified scene type
            detected_objects: List of detected objects (each needs "class_id")

        Returns:
            List of possible activities without duplicates, in the order in
            which they were first inferred
        """
        activities = []
        detected_class_ids = [obj["class_id"] for obj in detected_objects]

        if scene_type.startswith("aerial_view_"):
            if scene_type in ("aerial_view_intersection",
                              "aerial_view_commercial_area",
                              "aerial_view_plaza"):
                # Use the predefined activities for this aerial scene
                activities.extend(self.activity_templates.get(scene_type, []))

                if scene_type == "aerial_view_intersection":
                    # Add activities specific to pedestrians and vehicles
                    has_pedestrians = 0 in detected_class_ids
                    has_vehicles = any(v in detected_class_ids for v in (2, 5, 7))  # Car, bus, truck
                    if has_pedestrians and has_vehicles:
                        activities.append("Waiting for an opportunity to cross the street")
                        activities.append("Obeying traffic signals")
            else:
                # Handle other aerial-view scenes that are not explicitly defined
                activities.extend([
                    "Street crossing",
                    "Waiting for signals",
                    "Following traffic rules",
                    "Pedestrian movement",
                ])

        if scene_type in self.activity_templates:
            activities.extend(self.activity_templates[scene_type])
        elif "default" in self.activity_templates:
            activities.extend(self.activity_templates["default"])

        # Add activities based on specific object combinations
        if 62 in detected_class_ids and 57 in detected_class_ids:  # TV and sofa
            activities.append("Watching shows or movies")
        if 63 in detected_class_ids:  # laptop
            activities.append("Using a computer/laptop")
        if 67 in detected_class_ids:  # cell phone
            activities.append("Using a mobile phone")
        if 73 in detected_class_ids:  # book
            activities.append("Reading")
        if any(food_id in detected_class_ids for food_id in range(46, 56)):
            activities.append("Eating or preparing food")

        # Person-specific activities
        if 0 in detected_class_ids:  # Person
            if any(v in detected_class_ids for v in (1, 2, 3, 5, 7)):  # Vehicles
                activities.append("Commuting or traveling")
            if 16 in detected_class_ids:  # Dog
                activities.append("Walking a dog")
            if 24 in detected_class_ids or 26 in detected_class_ids:  # Backpack or handbag
                activities.append("Carrying personal items")

        # Remove duplicates while keeping first-seen order; a plain set()
        # would make the output order vary between interpreter runs.
        return list(dict.fromkeys(activities))

    def _identify_safety_concerns(self, detected_objects: List[Dict], scene_type: str) -> List[str]:
        """
        Identify potential safety concerns based on objects and scene type.

        General checks run for every scene; scene-specific checks are
        dispatched through ``_SCENE_SAFETY_HANDLERS``.

        Args:
            detected_objects: List of detected objects
            scene_type: Identified scene type

        Returns:
            List of potential safety concerns, general ones first, with
            repeated messages collapsed to their first occurrence
        """
        detected_class_ids = [obj["class_id"] for obj in detected_objects]

        concerns = self._general_safety_concerns(detected_objects, detected_class_ids, scene_type)

        handler_name = self._SCENE_SAFETY_HANDLERS.get(scene_type)
        if handler_name is not None:
            concerns.extend(getattr(self, handler_name)(detected_objects, detected_class_ids))

        # Several objects of one class can trigger the identical message
        # (e.g. two elevated bottles); report each message once.
        return list(dict.fromkeys(concerns))

    def _general_safety_concerns(self, detected_objects: List[Dict],
                                 detected_class_ids: List[int], scene_type: str) -> List[str]:
        """Collect the concerns that do not belong to one of the dispatched scene types."""
        concerns = []

        if 42 in detected_class_ids or 43 in detected_class_ids:  # Fork or knife
            concerns.append("Sharp utensils present")
        if 76 in detected_class_ids:  # Scissors
            concerns.append("Cutting tools present")

        # Traffic-related concerns
        if scene_type in ("city_street", "parking_lot"):
            if 0 in detected_class_ids and any(
                    v in detected_class_ids for v in (2, 3, 5, 7, 8)):  # Person near vehicles
                concerns.append("Pedestrians near vehicles")
            if 9 in detected_class_ids:  # Traffic light
                concerns.append("Monitor traffic signals")

        # Identify crowded scenes
        person_count = detected_class_ids.count(0)
        if person_count > 5:
            concerns.append(f"Crowded area with multiple people ({person_count})")

        # Scene-specific concerns
        if scene_type == "kitchen" and (68 in detected_class_ids or 69 in detected_class_ids):  # Microwave or oven
            concerns.append("Hot cooking equipment")

        # Potentially unstable objects: large containers in the top image regions
        for obj in detected_objects:
            if (obj["class_id"] in (39, 40, 41, 45)  # Bottle, wine glass, cup, bowl
                    and obj["region"] in ("top_left", "top_center", "top_right")
                    and obj["normalized_area"] > 0.05):
                concerns.append(f"Elevated {obj['class_name']} might be unstable")

        return concerns

    def _upscale_dining_concerns(self, detected_objects: List[Dict],
                                 detected_class_ids: List[int]) -> List[str]:
        """Collect concerns for the "upscale_dining" scene type."""
        concerns = []

        if 40 in detected_class_ids:  # Wine glass
            concerns.append("Fragile glassware present")

        # Candles cannot be detected directly; a cup (which might include
        # candle holders) is used as a hint in a formal dining scene.
        if 41 in detected_class_ids:  # Cup
            concerns.append("Possible lit candles or decorative items requiring care")

        # Check for overcrowded table: count tableware/food items that share
        # the region of the first detected dining table.
        tables = [obj for obj in detected_objects if obj["class_id"] == 60]  # Dining table
        if tables:
            table_region = tables[0]["region"]
            items_on_table = sum(
                1 for obj in detected_objects
                if obj["class_id"] in range(39, 56) and obj["region"] == table_region
            )
            if items_on_table > 8:
                concerns.append("Dining table has multiple items which should be handled with care")

        return concerns

    def _asian_commercial_street_concerns(self, detected_objects: List[Dict],
                                          detected_class_ids: List[int]) -> List[str]:
        """Collect concerns for the "asian_commercial_street" scene type."""
        concerns = []
        people = [obj for obj in detected_objects if obj["class_id"] == 0]  # Person

        # Check for crowded walkways (simplified density): average pairwise
        # distance between person centers. More than three people guarantees
        # at least one pair, so the division below is safe.
        if len(people) > 3:
            positions = [person["normalized_center"] for person in people]
            total_distance = 0
            pair_count = 0
            for index, first in enumerate(positions):
                for second in positions[index + 1:]:
                    total_distance += ((second[0] - first[0]) ** 2
                                       + (second[1] - first[1]) ** 2) ** 0.5
                    pair_count += 1
            if total_distance / pair_count < 0.1:  # Close proximity
                concerns.append("Crowded walkway with limited personal space")

        # Check for motorcycles/bicycles near pedestrians
        if people and (1 in detected_class_ids or 3 in detected_class_ids):  # Bicycle/motorcycle and person
            concerns.append("Two-wheeled vehicles in pedestrian areas")

        # Trip hazards cannot be detected directly; people located in a
        # "bottom" region are taken as a hint of uneven walking surfaces.
        if any("bottom" in person["region"] for person in people):
            concerns.append("Potential uneven walking surfaces in commercial area")

        return concerns

    def _financial_district_concerns(self, detected_objects: List[Dict],
                                     detected_class_ids: List[int]) -> List[str]:
        """Collect concerns for the "financial_district" scene type."""
        concerns = []

        # Check for heavy traffic conditions
        vehicle_count = sum(1 for class_id in detected_class_ids if class_id in (2, 5, 7))  # Car, bus, truck
        if vehicle_count > 5:
            concerns.append("Heavy vehicle traffic in urban area")

        # Check for pedestrians crossing busy streets
        if 0 in detected_class_ids and any(v in detected_class_ids for v in (2, 3, 5, 7)):
            concerns.append("Pedestrians navigating busy urban traffic")

        # Check for traffic signals
        if 9 in detected_class_ids:  # Traffic light
            concerns.append("Observe traffic signals when navigating this area")
        elif vehicle_count > 3:
            # No traffic lights detected but it's a busy area
            concerns.append("Busy traffic area potentially without visible traffic signals in view")

        # Time of day considerations.
        # NOTE(review): only the presence of a "lighting_conditions" key on
        # some object is tested, not its value - confirm this is intended.
        if vehicle_count and any("lighting_conditions" in obj for obj in detected_objects):
            concerns.append("Reduced visibility conditions during evening commute")

        return concerns

    def _urban_intersection_concerns(self, detected_objects: List[Dict],
                                     detected_class_ids: List[int]) -> List[str]:
        """Collect concerns for the "urban_intersection" scene type."""
        concerns = []
        pedestrians = [obj for obj in detected_objects if obj["class_id"] == 0]
        vehicles = [obj for obj in detected_objects if obj["class_id"] in (2, 3, 5, 7)]
        has_traffic_light = 9 in detected_class_ids

        # Simplified crossing check: at least three pedestrians spread over
        # two or more regions.
        if len(pedestrians) >= 3 and len({person["region"] for person in pedestrians}) >= 2:
            concerns.append("Multiple pedestrians crossing the intersection")

        # Check for traffic signal observation
        if has_traffic_light:
            concerns.append("Observe traffic signals when crossing")

        # Check for busy intersection
        if len(vehicles) > 3:
            concerns.append("Busy intersection with multiple vehicles")

        # Check for pedestrians potentially jay-walking (people but no traffic lights)
        if pedestrians and not has_traffic_light:
            concerns.append("Pedestrians should use designated crosswalks")

        if len(pedestrians) > 5:
            concerns.append("High pedestrian density at crossing points")

        return concerns

    def _transit_hub_concerns(self, detected_objects: List[Dict],
                              detected_class_ids: List[int]) -> List[str]:
        """Collect concerns for the "transit_hub" scene type."""
        concerns = []

        if detected_class_ids.count(0) > 8:  # Person
            concerns.append("Crowded transit area requiring careful navigation")

        # Check for luggage/bags that could be trip hazards
        if 24 in detected_class_ids or 28 in detected_class_ids:  # Backpack or suitcase
            concerns.append("Luggage and personal items may create obstacles")

        # Public transportation vehicles
        if any(v in detected_class_ids for v in (5, 6, 7)):  # Bus, train, truck
            concerns.append("Stay clear of arriving and departing transit vehicles")

        return concerns

    def _shopping_district_concerns(self, detected_objects: List[Dict],
                                    detected_class_ids: List[int]) -> List[str]:
        """Collect concerns for the "shopping_district" scene type."""
        concerns = []

        # Check for crowded shopping areas
        if detected_class_ids.count(0) > 5:  # Person
            concerns.append("Crowded shopping area with multiple people")

        # Check for shopping bags and personal items
        if 24 in detected_class_ids or 26 in detected_class_ids:  # Backpack or handbag
            concerns.append("Mind personal belongings in busy retail environment")

        # Store entrances/exits cannot be detected directly; the presence of
        # people is used as the trigger.
        if 0 in detected_class_ids:
            concerns.append("Be aware of store entrances and exits with potential automatic doors")

        return concerns