|
import os
from dataclasses import dataclass
from typing import Any, Dict, List, Optional, Tuple

from nuplan.common.actor_state.tracked_objects_types import TrackedObjectType
from nuplan.common.maps.abstract_map import SemanticMapLayer
from nuplan.planning.simulation.trajectory.trajectory_sampling import TrajectorySampling

from navsim.agents.transfuser.transfuser_config import TransfuserConfig
|
# Value of the NAVSIM_DEVKIT_ROOT environment variable, or None when it is unset.
NAVSIM_DEVKIT_ROOT = os.getenv("NAVSIM_DEVKIT_ROOT")
|
|
|
@dataclass
class HydraDreamerConfig(TransfuserConfig):
    """Configuration values for the Hydra-Dreamer agent, layered on TransfuserConfig.

    Two behaviours of this class are easy to miss:

    * Only *annotated* attributes are dataclass fields and therefore accepted
      by ``__init__``. Attributes written without an annotation (for example
      ``block_exp`` or ``bev_semantic_classes``) are plain class attributes
      shared by every instance.
    * Defaults computed from other attributes (``img_vert_anchors``,
      ``bev_pixel_size``, ...) are evaluated once, while the class body
      executes, from the defaults declared above them. Overriding e.g.
      ``camera_height`` on an instance does not recompute them.
    """

    decoder_blocks: int = 8
    wm_loss_weight: float = 1.0

    # Loss weighting for the trajectory heads.
    trajectory_imi_weight: float = 1.0
    # Class attribute (unannotated): one dict shared by all instances.
    # NOTE(review): keys look like PDM sub-score names — confirm with the loss code.
    trajectory_pdm_weight = {
        'noc': 3.0,
        'da': 3.0,
        'dd': 3.0,
        'ttc': 2.0,
        'progress': 1.0,
        'comfort': 1.0,
    }
    progress_weight: float = 1.0
    inference_imi_weight: float = 0.1
    inference_da_weight: float = 1.0
    decouple: bool = False
    vocab_size: int = 4096
    # No default path; None means "not configured".
    vocab_path: Optional[str] = None
    normalize_vocab_pos: bool = False
    num_ego_status: int = 1

    # No default checkpoint; None means "not configured".
    ckpt_path: Optional[str] = None
    sigma: float = 0.5
    use_pers_bev_embed: bool = False
    # Field name shadows the ``type`` builtin for the rest of this class body;
    # kept because it is part of the public constructor interface.
    type: str = 'center'
    rel: bool = False
    use_nerf: bool = False
    extra_traj_layer: bool = False

    use_back_view: bool = False

    extra_tr: bool = False
    vadv2_head_nhead: int = 8
    vadv2_head_nlayers: int = 3

    # A single TrajectorySampling instance is created here and used as the
    # default for every config object.
    trajectory_sampling: TrajectorySampling = TrajectorySampling(
        time_horizon=4, interval_length=0.1
    )

    use_final_fpn: bool = False
    use_img_pretrained: bool = False

    # Image backbone selection and checkpoint paths (empty string by default).
    image_architecture: str = "resnet34"
    backbone_type: str = 'resnet'
    vit_ckpt: str = ''
    intern_ckpt: str = ''
    vov_ckpt: str = ''
    eva_ckpt: str = ''
    swin_ckpt: str = ''

    sptr_ckpt: str = ''
    map_ckpt: str = ''

    lr_mult_backbone: float = 1.0
    backbone_wd: float = 0.0

    # LiDAR backbone and rasterisation settings.
    lidar_architecture: str = "resnet34"

    max_height_lidar: float = 100.0
    pixels_per_meter: float = 4.0
    hist_max_per_pixel: int = 5

    lidar_min_x: float = -32
    lidar_max_x: float = 32
    lidar_min_y: float = -32
    lidar_max_y: float = 32

    lidar_split_height: float = 0.2
    use_ground_plane: bool = False

    lidar_seq_len: int = 1

    camera_width: int = 1024
    camera_height: int = 256
    lidar_resolution_width: int = 256
    lidar_resolution_height: int = 256

    # Derived once from the class-level defaults above (see class docstring).
    img_vert_anchors: int = camera_height // 32
    img_horz_anchors: int = camera_width // 32
    lidar_vert_anchors: int = lidar_resolution_height // 32
    lidar_horz_anchors: int = lidar_resolution_width // 32

    # Unannotated, hence class attributes rather than dataclass fields.
    block_exp = 4
    n_layer = 2
    n_head = 4
    n_scale = 4
    embd_pdrop = 0.1
    resid_pdrop = 0.1
    attn_pdrop = 0.1

    gpt_linear_layer_init_mean = 0.0

    gpt_linear_layer_init_std = 0.02

    gpt_layer_norm_init_weight = 1.0

    perspective_downsample_factor = 1
    transformer_decoder_join = True
    detect_boxes = True
    use_bev_semantic = True
    use_semantic = False
    use_depth = False
    add_features = True

    # Transformer hyper-parameters.
    tf_d_model: int = 256
    tf_d_ffn: int = 1024
    tf_num_layers: int = 3
    tf_num_head: int = 8
    tf_dropout: float = 0.0

    num_bounding_boxes: int = 30

    # Loss weights for the auxiliary heads.
    agent_class_weight: float = 10.0
    agent_box_weight: float = 1.0
    bev_semantic_weight: float = 10.0

    # Class attribute (unannotated). Maps an integer label to a pair of
    # (geometry kind, list of map layers or tracked-object types).
    bev_semantic_classes = {
        1: ("polygon", [SemanticMapLayer.LANE, SemanticMapLayer.INTERSECTION]),
        2: ("polygon", [SemanticMapLayer.WALKWAYS]),
        3: ("linestring", [SemanticMapLayer.LANE, SemanticMapLayer.LANE_CONNECTOR]),
        4: (
            "box",
            [
                TrackedObjectType.CZONE_SIGN,
                TrackedObjectType.BARRIER,
                TrackedObjectType.TRAFFIC_CONE,
                TrackedObjectType.GENERIC_OBJECT,
            ],
        ),
        5: ("box", [TrackedObjectType.VEHICLE]),
        6: ("box", [TrackedObjectType.PEDESTRIAN]),
    }

    # Derived once from the class-level defaults above (see class docstring).
    bev_pixel_width: int = lidar_resolution_width
    bev_pixel_height: int = lidar_resolution_height // 2
    bev_pixel_size: float = 1 / pixels_per_meter

    # NOTE(review): one more than the six labels in bev_semantic_classes —
    # presumably label 0 is background; confirm against the BEV head.
    num_bev_classes = 7
    bev_features_channels: int = 64
    bev_down_sample_factor: int = 4
    bev_upsample_factor: int = 2

    @property
    def bev_semantic_frame(self) -> Tuple[int, int]:
        """Return the BEV map size as ``(bev_pixel_height, bev_pixel_width)``."""
        return (self.bev_pixel_height, self.bev_pixel_width)

    @property
    def bev_radius(self) -> float:
        """Return the largest absolute value among the four LiDAR x/y bounds."""
        bounds = (self.lidar_min_x, self.lidar_max_x, self.lidar_min_y, self.lidar_max_y)
        return max(abs(bound) for bound in bounds)
|
|