nguyenminh4099 committed 12 days ago

Commit

a816101

verified ·

1 Parent(s): ce60298

Upload folder using huggingface_hub

Browse files

Files changed (32) hide show

.gitattributes +2 -0
20260315_224809/bboxes_3d/examples/val_1/38a28a3aaf2647f2a8c0e90e31267bf8.png +3 -0
20260315_224809/bboxes_3d/examples/val_2/38a28a3aaf2647f2a8c0e90e31267bf8.png +3 -0
20260315_224809/bboxes_3d/metrics_details.json +0 -0
20260315_224809/bboxes_3d/metrics_summary.json +198 -0
20260315_224809/bboxes_3d/plots/barrier_pr.pdf +0 -0
20260315_224809/bboxes_3d/plots/barrier_tp.pdf +0 -0
20260315_224809/bboxes_3d/plots/bicycle_pr.pdf +0 -0
20260315_224809/bboxes_3d/plots/bicycle_tp.pdf +0 -0
20260315_224809/bboxes_3d/plots/bus_pr.pdf +0 -0
20260315_224809/bboxes_3d/plots/bus_tp.pdf +0 -0
20260315_224809/bboxes_3d/plots/car_pr.pdf +0 -0
20260315_224809/bboxes_3d/plots/car_tp.pdf +0 -0
20260315_224809/bboxes_3d/plots/construction_vehicle_pr.pdf +0 -0
20260315_224809/bboxes_3d/plots/construction_vehicle_tp.pdf +0 -0
20260315_224809/bboxes_3d/plots/dist_pr_0.5.pdf +0 -0
20260315_224809/bboxes_3d/plots/dist_pr_1.0.pdf +0 -0
20260315_224809/bboxes_3d/plots/dist_pr_2.0.pdf +0 -0
20260315_224809/bboxes_3d/plots/dist_pr_4.0.pdf +0 -0
20260315_224809/bboxes_3d/plots/motorcycle_pr.pdf +0 -0
20260315_224809/bboxes_3d/plots/motorcycle_tp.pdf +0 -0
20260315_224809/bboxes_3d/plots/pedestrian_pr.pdf +0 -0
20260315_224809/bboxes_3d/plots/pedestrian_tp.pdf +0 -0
20260315_224809/bboxes_3d/plots/summary.pdf +0 -0
20260315_224809/bboxes_3d/plots/traffic_cone_pr.pdf +0 -0
20260315_224809/bboxes_3d/plots/traffic_cone_tp.pdf +0 -0
20260315_224809/bboxes_3d/plots/trailer_pr.pdf +0 -0
20260315_224809/bboxes_3d/plots/trailer_tp.pdf +0 -0
20260315_224809/bboxes_3d/plots/truck_pr.pdf +0 -0
20260315_224809/bboxes_3d/plots/truck_tp.pdf +0 -0
20260315_224809/bboxes_3d/results_nusc.json +0 -0
20260315_224809/logs.log +1834 -0

.gitattributes CHANGED Viewed

@@ -35,3 +35,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
 20260314_223104/bboxes_3d/examples/val_1/38a28a3aaf2647f2a8c0e90e31267bf8.png filter=lfs diff=lfs merge=lfs -text
 20260314_223104/bboxes_3d/examples/val_2/38a28a3aaf2647f2a8c0e90e31267bf8.png filter=lfs diff=lfs merge=lfs -text

 *tfevents* filter=lfs diff=lfs merge=lfs -text
 20260314_223104/bboxes_3d/examples/val_1/38a28a3aaf2647f2a8c0e90e31267bf8.png filter=lfs diff=lfs merge=lfs -text
 20260314_223104/bboxes_3d/examples/val_2/38a28a3aaf2647f2a8c0e90e31267bf8.png filter=lfs diff=lfs merge=lfs -text
+20260315_224809/bboxes_3d/examples/val_1/38a28a3aaf2647f2a8c0e90e31267bf8.png filter=lfs diff=lfs merge=lfs -text
+20260315_224809/bboxes_3d/examples/val_2/38a28a3aaf2647f2a8c0e90e31267bf8.png filter=lfs diff=lfs merge=lfs -text

20260315_224809/bboxes_3d/examples/val_1/38a28a3aaf2647f2a8c0e90e31267bf8.png ADDED Viewed

Git LFS Details

SHA256: 15e3a35220de2dea4496370a7ad999d2f74cd0e0396f2551202b8c3fe5c7671e
Pointer size: 131 Bytes
Size of remote file: 138 kB

20260315_224809/bboxes_3d/examples/val_2/38a28a3aaf2647f2a8c0e90e31267bf8.png ADDED Viewed

Git LFS Details

SHA256: 15e3a35220de2dea4496370a7ad999d2f74cd0e0396f2551202b8c3fe5c7671e
Pointer size: 131 Bytes
Size of remote file: 138 kB

20260315_224809/bboxes_3d/metrics_details.json ADDED Viewed

The diff for this file is too large to render. See raw diff

20260315_224809/bboxes_3d/metrics_summary.json ADDED Viewed

	@@ -0,0 +1,198 @@

+{
+  "label_aps": {
+    "car": {
+      "0.5": 0.0,
+      "1.0": 0.0,
+      "2.0": 0.0,
+      "4.0": 0.009927908302700019
+    },
+    "truck": {
+      "0.5": 0.0,
+      "1.0": 0.0,
+      "2.0": 0.0,
+      "4.0": 0.0
+    },
+    "bus": {
+      "0.5": 0.0,
+      "1.0": 0.0,
+      "2.0": 0.0,
+      "4.0": 0.0
+    },
+    "trailer": {
+      "0.5": 0.0,
+      "1.0": 0.0,
+      "2.0": 0.0,
+      "4.0": 0.0
+    },
+    "construction_vehicle": {
+      "0.5": 0.0,
+      "1.0": 0.0,
+      "2.0": 0.0,
+      "4.0": 0.0
+    },
+    "pedestrian": {
+      "0.5": 0.0,
+      "1.0": 0.0,
+      "2.0": 0.0,
+      "4.0": 0.0
+    },
+    "motorcycle": {
+      "0.5": 0.0,
+      "1.0": 0.0,
+      "2.0": 0.0,
+      "4.0": 0.0
+    },
+    "bicycle": {
+      "0.5": 0.0,
+      "1.0": 0.0,
+      "2.0": 0.0,
+      "4.0": 0.0
+    },
+    "traffic_cone": {
+      "0.5": 0.0,
+      "1.0": 0.0,
+      "2.0": 0.0,
+      "4.0": 0.0
+    },
+    "barrier": {
+      "0.5": 0.0,
+      "1.0": 0.0,
+      "2.0": 0.0,
+      "4.0": 0.0
+    }
+  },
+  "mean_dist_aps": {
+    "car": 0.0024819770756750047,
+    "truck": 0.0,
+    "bus": 0.0,
+    "trailer": 0.0,
+    "construction_vehicle": 0.0,
+    "pedestrian": 0.0,
+    "motorcycle": 0.0,
+    "bicycle": 0.0,
+    "traffic_cone": 0.0,
+    "barrier": 0.0
+  },
+  "mean_ap": 0.0002481977075675005,
+  "label_tp_errors": {
+    "car": {
+      "trans_err": 1.1822891487922438,
+      "scale_err": 0.9073979925542556,
+      "orient_err": 1.4475431464436141,
+      "vel_err": 0.25779051477943044,
+      "attr_err": 0.4033333333333333
+    },
+    "truck": {
+      "trans_err": 1.0,
+      "scale_err": 1.0,
+      "orient_err": 1.0,
+      "vel_err": 1.0,
+      "attr_err": 1.0
+    },
+    "bus": {
+      "trans_err": 1.0,
+      "scale_err": 1.0,
+      "orient_err": 1.0,
+      "vel_err": 1.0,
+      "attr_err": 1.0
+    },
+    "trailer": {
+      "trans_err": 1.0,
+      "scale_err": 1.0,
+      "orient_err": 1.0,
+      "vel_err": 1.0,
+      "attr_err": 1.0
+    },
+    "construction_vehicle": {
+      "trans_err": 1.0,
+      "scale_err": 1.0,
+      "orient_err": 1.0,
+      "vel_err": 1.0,
+      "attr_err": 1.0
+    },
+    "pedestrian": {
+      "trans_err": 1.0,
+      "scale_err": 1.0,
+      "orient_err": 1.0,
+      "vel_err": 1.0,
+      "attr_err": 1.0
+    },
+    "motorcycle": {
+      "trans_err": 1.0,
+      "scale_err": 1.0,
+      "orient_err": 1.0,
+      "vel_err": 1.0,
+      "attr_err": 1.0
+    },
+    "bicycle": {
+      "trans_err": 1.0,
+      "scale_err": 1.0,
+      "orient_err": 1.0,
+      "vel_err": 1.0,
+      "attr_err": 1.0
+    },
+    "traffic_cone": {
+      "trans_err": 1.0,
+      "scale_err": 1.0,
+      "orient_err": NaN,
+      "vel_err": NaN,
+      "attr_err": NaN
+    },
+    "barrier": {
+      "trans_err": 1.0,
+      "scale_err": 1.0,
+      "orient_err": 1.0,
+      "vel_err": NaN,
+      "attr_err": NaN
+    }
+  },
+  "tp_errors": {
+    "trans_err": 1.0182289148792243,
+    "scale_err": 0.9907397992554255,
+    "orient_err": 1.0497270162715127,
+    "vel_err": 0.9072238143474288,
+    "attr_err": 0.9254166666666667
+  },
+  "tp_scores": {
+    "trans_err": 0.0,
+    "scale_err": 0.009260200744574454,
+    "orient_err": 0.0,
+    "vel_err": 0.09277618565257117,
+    "attr_err": 0.07458333333333333
+  },
+  "nd_score": 0.017786070826831646,
+  "eval_time": 0.11112022399902344,
+  "cfg": {
+    "class_range": {
+      "car": 50,
+      "truck": 50,
+      "bus": 50,
+      "trailer": 50,
+      "construction_vehicle": 50,
+      "pedestrian": 40,
+      "motorcycle": 40,
+      "bicycle": 40,
+      "traffic_cone": 30,
+      "barrier": 30
+    },
+    "dist_fcn": "center_distance",
+    "dist_ths": [
+      0.5,
+      1.0,
+      2.0,
+      4.0
+    ],
+    "dist_th_tp": 2.0,
+    "min_recall": 0.1,
+    "min_precision": 0.1,
+    "max_boxes_per_sample": 500,
+    "mean_ap_weight": 5
+  },
+  "meta": {
+    "use_camera": true,
+    "use_lidar": false,
+    "use_radar": false,
+    "use_map": false,
+    "use_external": false
+  }
+}

20260315_224809/bboxes_3d/plots/barrier_pr.pdf ADDED Viewed

Binary file (12 kB). View file

20260315_224809/bboxes_3d/plots/barrier_tp.pdf ADDED Viewed

Binary file (12.5 kB). View file

20260315_224809/bboxes_3d/plots/bicycle_pr.pdf ADDED Viewed

Binary file (12.2 kB). View file

20260315_224809/bboxes_3d/plots/bicycle_tp.pdf ADDED Viewed

Binary file (12.5 kB). View file

20260315_224809/bboxes_3d/plots/bus_pr.pdf ADDED Viewed

Binary file (12.3 kB). View file

20260315_224809/bboxes_3d/plots/bus_tp.pdf ADDED Viewed

Binary file (12.6 kB). View file

20260315_224809/bboxes_3d/plots/car_pr.pdf ADDED Viewed

Binary file (12.5 kB). View file

20260315_224809/bboxes_3d/plots/car_tp.pdf ADDED Viewed

Binary file (15.3 kB). View file

20260315_224809/bboxes_3d/plots/construction_vehicle_pr.pdf ADDED Viewed

Binary file (12.4 kB). View file

20260315_224809/bboxes_3d/plots/construction_vehicle_tp.pdf ADDED Viewed

Binary file (12.6 kB). View file

20260315_224809/bboxes_3d/plots/dist_pr_0.5.pdf ADDED Viewed

Binary file (14.7 kB). View file

20260315_224809/bboxes_3d/plots/dist_pr_1.0.pdf ADDED Viewed

Binary file (14.7 kB). View file

20260315_224809/bboxes_3d/plots/dist_pr_2.0.pdf ADDED Viewed

Binary file (14.9 kB). View file

20260315_224809/bboxes_3d/plots/dist_pr_4.0.pdf ADDED Viewed

Binary file (15 kB). View file

20260315_224809/bboxes_3d/plots/motorcycle_pr.pdf ADDED Viewed

Binary file (12.1 kB). View file

20260315_224809/bboxes_3d/plots/motorcycle_tp.pdf ADDED Viewed

Binary file (12.4 kB). View file

20260315_224809/bboxes_3d/plots/pedestrian_pr.pdf ADDED Viewed

Binary file (12 kB). View file

20260315_224809/bboxes_3d/plots/pedestrian_tp.pdf ADDED Viewed

Binary file (12.5 kB). View file

20260315_224809/bboxes_3d/plots/summary.pdf ADDED Viewed

Binary file (31.7 kB). View file

20260315_224809/bboxes_3d/plots/traffic_cone_pr.pdf ADDED Viewed

Binary file (12.4 kB). View file

20260315_224809/bboxes_3d/plots/traffic_cone_tp.pdf ADDED Viewed

Binary file (12.7 kB). View file

20260315_224809/bboxes_3d/plots/trailer_pr.pdf ADDED Viewed

Binary file (11.8 kB). View file

20260315_224809/bboxes_3d/plots/trailer_tp.pdf ADDED Viewed

Binary file (11.9 kB). View file

20260315_224809/bboxes_3d/plots/truck_pr.pdf ADDED Viewed

Binary file (12.3 kB). View file

20260315_224809/bboxes_3d/plots/truck_tp.pdf ADDED Viewed

Binary file (12.4 kB). View file

20260315_224809/bboxes_3d/results_nusc.json ADDED Viewed

The diff for this file is too large to render. See raw diff

20260315_224809/logs.log ADDED Viewed

	@@ -0,0 +1,1834 @@

+2026/03/15 22:48:11 - bevformer - INFO -
+------------------------------------------------------------
+System environment:
+    sys.platform: darwin
+    Python: 3.10.0 (default, Oct 17 2021, 11:56:26) [Clang 13.0.0 ]
+    CUDA available: False
+    numpy_random_seed: 321
+    GCC: Apple clang version 15.0.0 (clang-1500.3.9.4)
+    PyTorch: 2.10.0
+    PyTorch compiling details: PyTorch built with:
+  - GCC 4.2
+  - C++ Version: 201703
+  - clang 15.0.0
+  - OpenMP 201811
+  - LAPACK is enabled (usually provided by MKL)
+  - NNPACK is enabled
+  - CPU capability usage: DEFAULT
+  - Build settings: BLAS_INFO=accelerate, BUILD_TYPE=Release, COMMIT_SHA=449b1768410104d3ed79d3bcfe4ba1d65c7f22c0, CXX_COMPILER=/usr/bin/c++, CXX_FLAGS= -fvisibility-inlines-hidden -DUSE_PTHREADPOOL -DNDEBUG -DUSE_KINETO -DLIBKINETO_NOCUPTI -DLIBKINETO_NOROCTRACER -DLIBKINETO_NOXPUPTI=ON -DUSE_PYTORCH_QNNPACK -DAT_BUILD_ARM_VEC256_WITH_SLEEF -DUSE_XNNPACK -DUSE_PYTORCH_METAL_EXPORT -DSYMBOLICATE_MOBILE_DEBUG_HANDLE -DUSE_COREML_DELEGATE -O2 -fPIC -DC10_NODEPRECATED -Wall -Wextra -Werror=return-type -Werror=non-virtual-dtor -Werror=braced-scalar-init -Werror=range-loop-construct -Werror=bool-operation -Wnarrowing -Wno-missing-field-initializers -Wno-unknown-pragmas -Wno-unused-parameter -Wno-strict-overflow -Wno-strict-aliasing -Wvla-extension -Wsuggest-override -Wnewline-eof -Winconsistent-missing-override -Winconsistent-missing-destructor-override -Wno-pass-failed -Wno-error=old-style-cast -Wconstant-conversion -Qunused-arguments -faligned-new -fno-math-errno -fno-trapping-math -Werror=format -DUSE_MPS -Wno-missing-braces, LAPACK_INFO=accelerate, TORCH_VERSION=2.10.0, USE_CUDA=OFF, USE_CUDNN=OFF, USE_CUSPARSELT=OFF, USE_EIGEN_FOR_BLAS=ON, USE_GFLAGS=OFF, USE_GLOG=OFF, USE_GLOO=ON, USE_MKL=OFF, USE_MKLDNN=OFF, USE_MPI=OFF, USE_NCCL=OFF, USE_NNPACK=ON, USE_OPENMP=ON, USE_ROCM=OFF, USE_ROCM_KERNEL_ASSERT=OFF, USE_XCCL=OFF, USE_XPU=OFF,
+    TorchVision: 0.25.0
+    OpenCV: 4.13.0
+    MMEngine: 0.8.5
+Runtime environment:
+    dist_cfg: {'backend': 'nccl'}
+    seed: 321
+    Distributed launcher: none
+    Distributed training: False
+    GPU number: 1
+------------------------------------------------------------
+2026/03/15 22:48:11 - bevformer - INFO - Set random seed to 321, deterministic: False
+2026/03/15 22:48:11 - bevformer - INFO - Building model
+2026/03/15 22:48:12 - bevformer - INFO - Model architecture:
+BEVFormerDetector(
+  (data_preprocessor): BaseDataPreprocessor()
+  (pts_bbox_head): BEVFormerHead(
+    (loss_cls): FocalLoss()
+    (loss_bbox): L1Loss()
+    (loss_iou): GIoULoss()
+    (transformer): PerceptionTransformer(
+      (encoder): BEVFormerEncoder(
+        (layers): ModuleList(
+          (0-2): 3 x BEVFormerLayer(
+            (attentions): ModuleList(
+              (0): TemporalSelfAttention(
+                (sampling_offsets): Linear(in_features=512, out_features=128, bias=True)
+                (attention_weights): Linear(in_features=512, out_features=64, bias=True)
+                (value_proj): Linear(in_features=256, out_features=256, bias=True)
+                (output_proj): Linear(in_features=256, out_features=256, bias=True)
+                (dropout): Dropout(p=0.1, inplace=False)
+              )
+              (1): SpatialCrossAttention(
+                (deformable_attention): MSDeformableAttention3D(
+                  (sampling_offsets): Linear(in_features=256, out_features=128, bias=True)
+                  (attention_weights): Linear(in_features=256, out_features=64, bias=True)
+                  (value_proj): Linear(in_features=256, out_features=256, bias=True)
+                )
+                (output_proj): Linear(in_features=256, out_features=256, bias=True)
+                (dropout): Dropout(p=0.1, inplace=False)
+              )
+            )
+            (ffns): ModuleList(
+              (0): FFN(
+                (layers): Sequential(
+                  (0): Sequential(
+                    (0): Linear(in_features=256, out_features=512, bias=True)
+                    (1): ReLU(inplace=True)
+                    (2): Dropout(p=0.1, inplace=False)
+                  )
+                  (1): Linear(in_features=512, out_features=256, bias=True)
+                  (2): Dropout(p=0.1, inplace=False)
+                )
+                (dropout_layer): Identity()
+                (gamma2): Identity()
+              )
+            )
+            (norms): ModuleList(
+              (0-2): 3 x LayerNorm((256,), eps=1e-05, elementwise_affine=True)
+            )
+          )
+        )
+      )
+      (decoder): DetectionTransformerDecoder(
+        (layers): ModuleList(
+          (0-5): 6 x DetrTransformerDecoderLayer(
+            (attentions): ModuleList(
+              (0): MultiheadAttention(
+                (q_proj): Linear(in_features=256, out_features=256, bias=True)
+                (k_proj): Linear(in_features=256, out_features=256, bias=True)
+                (v_proj): Linear(in_features=256, out_features=256, bias=True)
+                (output_proj): Linear(in_features=256, out_features=256, bias=True)
+                (dropout): Dropout(p=0.1, inplace=False)
+              )
+              (1): CustomMSDeformableAttention(
+                (sampling_offsets): Linear(in_features=256, out_features=64, bias=True)
+                (attention_weights): Linear(in_features=256, out_features=32, bias=True)
+                (value_proj): Linear(in_features=256, out_features=256, bias=True)
+                (output_proj): Linear(in_features=256, out_features=256, bias=True)
+                (dropout): Dropout(p=0.1, inplace=False)
+              )
+            )
+            (ffns): ModuleList(
+              (0): FFN(
+                (layers): Sequential(
+                  (0): Sequential(
+                    (0): Linear(in_features=256, out_features=512, bias=True)
+                    (1): ReLU(inplace=True)
+                    (2): Dropout(p=0.1, inplace=False)
+                  )
+                  (1): Linear(in_features=512, out_features=256, bias=True)
+                  (2): Dropout(p=0.1, inplace=False)
+                )
+                (dropout_layer): Identity()
+                (gamma2): Identity()
+              )
+            )
+            (norms): ModuleList(
+              (0-2): 3 x LayerNorm((256,), eps=1e-05, elementwise_affine=True)
+            )
+          )
+        )
+      )
+      (reference_points): Linear(in_features=256, out_features=3, bias=True)
+      (can_bus_mlp): Sequential(
+        (0): Linear(in_features=18, out_features=128, bias=True)
+        (1): ReLU(inplace=True)
+        (2): Linear(in_features=128, out_features=256, bias=True)
+        (3): ReLU(inplace=True)
+        (norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
+      )
+    )
+    (cls_branches): ModuleList(
+      (0-5): 6 x Sequential(
+        (0): Linear(in_features=256, out_features=256, bias=True)
+        (1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
+        (2): ReLU(inplace=True)
+        (3): Linear(in_features=256, out_features=10, bias=True)
+      )
+    )
+    (reg_branches): ModuleList(
+      (0-5): 6 x Sequential(
+        (0): Linear(in_features=256, out_features=256, bias=True)
+        (1): ReLU()
+        (2): Linear(in_features=256, out_features=256, bias=True)
+        (3): ReLU()
+        (4): Linear(in_features=256, out_features=10, bias=True)
+      )
+    )
+    (bev_embedding): Embedding(2500, 256)
+    (object_query_embedding): Embedding(900, 512)
+    (positional_encoding): LearnedPositionalEncoding(num_feats=128, row_num_embed=50, col_num_embed=50)
+  )
+  (img_backbone): ResNet(
+    (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
+    (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
+    (relu): ReLU(inplace=True)
+    (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
+    (layer1): ResLayer(
+      (0): Bottleneck(
+        (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
+        (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
+        (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
+        (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
+        (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
+        (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
+        (relu): ReLU(inplace=True)
+        (downsample): Sequential(
+          (0): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
+          (1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
+        )
+      )
+      init_cfg={'type': 'Constant', 'val': 0, 'override': {'name': 'norm3'}}
+      (1): Bottleneck(
+        (conv1): Conv2d(256, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
+        (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
+        (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
+        (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
+        (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
+        (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
+        (relu): ReLU(inplace=True)
+      )
+      init_cfg={'type': 'Constant', 'val': 0, 'override': {'name': 'norm3'}}
+      (2): Bottleneck(
+        (conv1): Conv2d(256, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
+        (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
+        (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
+        (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
+        (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
+        (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
+        (relu): ReLU(inplace=True)
+      )
+      init_cfg={'type': 'Constant', 'val': 0, 'override': {'name': 'norm3'}}
+    )
+    (layer2): ResLayer(
+      (0): Bottleneck(
+        (conv1): Conv2d(256, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)
+        (bn1): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
+        (conv2): Conv2d(128, 128, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
+        (bn2): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
+        (conv3): Conv2d(128, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)
+        (bn3): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
+        (relu): ReLU(inplace=True)
+        (downsample): Sequential(
+          (0): Conv2d(256, 512, kernel_size=(1, 1), stride=(2, 2), bias=False)
+          (1): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
+        )
+      )
+      init_cfg={'type': 'Constant', 'val': 0, 'override': {'name': 'norm3'}}
+      (1): Bottleneck(
+        (conv1): Conv2d(512, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)
+        (bn1): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
+        (conv2): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
+        (bn2): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
+        (conv3): Conv2d(128, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)
+        (bn3): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
+        (relu): ReLU(inplace=True)
+      )
+      init_cfg={'type': 'Constant', 'val': 0, 'override': {'name': 'norm3'}}
+      (2): Bottleneck(
+        (conv1): Conv2d(512, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)
+        (bn1): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
+        (conv2): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
+        (bn2): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
+        (conv3): Conv2d(128, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)
+        (bn3): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
+        (relu): ReLU(inplace=True)
+      )
+      init_cfg={'type': 'Constant', 'val': 0, 'override': {'name': 'norm3'}}
+      (3): Bottleneck(
+        (conv1): Conv2d(512, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)
+        (bn1): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
+        (conv2): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
+        (bn2): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
+        (conv3): Conv2d(128, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)
+        (bn3): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
+        (relu): ReLU(inplace=True)
+      )
+      init_cfg={'type': 'Constant', 'val': 0, 'override': {'name': 'norm3'}}
+    )
+    (layer3): ResLayer(
+      (0): Bottleneck(
+        (conv1): Conv2d(512, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
+        (bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
+        (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
+        (bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
+        (conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)
+        (bn3): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
+        (relu): ReLU(inplace=True)
+        (downsample): Sequential(
+          (0): Conv2d(512, 1024, kernel_size=(1, 1), stride=(2, 2), bias=False)
+          (1): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
+        )
+      )
+      init_cfg={'type': 'Constant', 'val': 0, 'override': {'name': 'norm3'}}
+      (1): Bottleneck(
+        (conv1): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
+        (bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
+        (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
+        (bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
+        (conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)
+        (bn3): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
+        (relu): ReLU(inplace=True)
+      )
+      init_cfg={'type': 'Constant', 'val': 0, 'override': {'name': 'norm3'}}
+      (2): Bottleneck(
+        (conv1): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
+        (bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
+        (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
+        (bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
+        (conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)
+        (bn3): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
+        (relu): ReLU(inplace=True)
+      )
+      init_cfg={'type': 'Constant', 'val': 0, 'override': {'name': 'norm3'}}
+      (3): Bottleneck(
+        (conv1): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
+        (bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
+        (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
+        (bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
+        (conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)
+        (bn3): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
+        (relu): ReLU(inplace=True)
+      )
+      init_cfg={'type': 'Constant', 'val': 0, 'override': {'name': 'norm3'}}
+      (4): Bottleneck(
+        (conv1): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
+        (bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
+        (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
+        (bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
+        (conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)
+        (bn3): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
+        (relu): ReLU(inplace=True)
+      )
+      init_cfg={'type': 'Constant', 'val': 0, 'override': {'name': 'norm3'}}
+      (5): Bottleneck(
+        (conv1): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
+        (bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
+        (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
+        (bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
+        (conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)
+        (bn3): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
+        (relu): ReLU(inplace=True)
+      )
+      init_cfg={'type': 'Constant', 'val': 0, 'override': {'name': 'norm3'}}
+    )
+    (layer4): ResLayer(
+      (0): Bottleneck(
+        (conv1): Conv2d(1024, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)
+        (bn1): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
+        (conv2): Conv2d(512, 512, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
+        (bn2): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
+        (conv3): Conv2d(512, 2048, kernel_size=(1, 1), stride=(1, 1), bias=False)
+        (bn3): BatchNorm2d(2048, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
+        (relu): ReLU(inplace=True)
+        (downsample): Sequential(
+          (0): Conv2d(1024, 2048, kernel_size=(1, 1), stride=(2, 2), bias=False)
+          (1): BatchNorm2d(2048, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
+        )
+      )
+      init_cfg={'type': 'Constant', 'val': 0, 'override': {'name': 'norm3'}}
+      (1): Bottleneck(
+        (conv1): Conv2d(2048, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)
+        (bn1): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
+        (conv2): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
+        (bn2): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
+        (conv3): Conv2d(512, 2048, kernel_size=(1, 1), stride=(1, 1), bias=False)
+        (bn3): BatchNorm2d(2048, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
+        (relu): ReLU(inplace=True)
+      )
+      init_cfg={'type': 'Constant', 'val': 0, 'override': {'name': 'norm3'}}
+      (2): Bottleneck(
+        (conv1): Conv2d(2048, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)
+        (bn1): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
+        (conv2): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
+        (bn2): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
+        (conv3): Conv2d(512, 2048, kernel_size=(1, 1), stride=(1, 1), bias=False)
+        (bn3): BatchNorm2d(2048, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
+        (relu): ReLU(inplace=True)
+      )
+      init_cfg={'type': 'Constant', 'val': 0, 'override': {'name': 'norm3'}}
+    )
+  )
+  init_cfg=[{'type': 'Kaiming', 'layer': 'Conv2d'}, {'type': 'Constant', 'val': 1, 'layer': ['_BatchNorm', 'GroupNorm']}]
+  (img_neck): FPN(
+    (lateral_convs): ModuleList(
+      (0): ConvModule(
+        (conv): Conv2d(2048, 256, kernel_size=(1, 1), stride=(1, 1))
+      )
+    )
+    (fpn_convs): ModuleList(
+      (0): ConvModule(
+        (conv): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
+      )
+    )
+  )
+  init_cfg={'type': 'Xavier', 'layer': 'Conv2d', 'distribution': 'uniform'}
+  (grid_mask): GridMask()
+)
+2026/03/15 22:48:12 - bevformer - INFO - Wrapping model
+2026/03/15 22:48:12 - bevformer - INFO - Distributed training is not used, all SyncBatchNorm (SyncBN) layers in the model will be automatically reverted to BatchNormXd layers if they are used.
+2026/03/15 22:48:12 - bevformer - INFO - Hooks will be executed in the following order:
+before_run:
+(VERY_HIGH   ) RuntimeInfoHook
+(BELOW_NORMAL) LoggerHook
+(VERY_LOW    ) CheckpointHookV2
+(VERY_LOW    ) CheckpointUploader
+(VERY_LOW    ) CheckpointResumer
+ --------------------
+before_train:
+(VERY_HIGH   ) RuntimeInfoHook
+(NORMAL      ) IterTimerHook
+(VERY_LOW    ) CheckpointHookV2
+(VERY_LOW    ) CheckpointUploader
+(VERY_LOW    ) CheckpointResumer
+ --------------------
+before_train_epoch:
+(VERY_HIGH   ) RuntimeInfoHook
+(NORMAL      ) IterTimerHook
+(NORMAL      ) DistSamplerSeedHook
+ --------------------
+before_train_iter:
+(VERY_HIGH   ) RuntimeInfoHook
+(NORMAL      ) IterTimerHook
+ --------------------
+after_train_iter:
+(VERY_HIGH   ) RuntimeInfoHook
+(NORMAL      ) IterTimerHook
+(BELOW_NORMAL) LoggerHook
+(LOW         ) ParamSchedulerHook
+(VERY_LOW    ) CheckpointHookV2
+(VERY_LOW    ) CheckpointUploader
+ --------------------
+after_train_epoch:
+(NORMAL      ) IterTimerHook
+(LOW         ) ParamSchedulerHook
+(VERY_LOW    ) CheckpointHookV2
+(VERY_LOW    ) CheckpointUploader
+ --------------------
+before_val:
+(VERY_HIGH   ) RuntimeInfoHook
+(VERY_LOW    ) CheckpointHookV2
+(VERY_LOW    ) CheckpointResumer
+ --------------------
+before_val_epoch:
+(NORMAL      ) IterTimerHook
+ --------------------
+before_val_iter:
+(NORMAL      ) IterTimerHook
+ --------------------
+after_val_iter:
+(NORMAL      ) IterTimerHook
+(BELOW_NORMAL) LoggerHook
+ --------------------
+after_val_epoch:
+(VERY_HIGH   ) RuntimeInfoHook
+(NORMAL      ) IterTimerHook
+(BELOW_NORMAL) LoggerHook
+(LOW         ) ParamSchedulerHook
+(VERY_LOW    ) CheckpointHookV2
+(VERY_LOW    ) CheckpointUploader
+ --------------------
+after_val:
+(VERY_HIGH   ) RuntimeInfoHook
+ --------------------
+after_train:
+(VERY_HIGH   ) RuntimeInfoHook
+(VERY_LOW    ) CheckpointHookV2
+(VERY_LOW    ) CheckpointUploader
+ --------------------
+before_test:
+(VERY_HIGH   ) RuntimeInfoHook
+ --------------------
+before_test_epoch:
+(NORMAL      ) IterTimerHook
+ --------------------
+before_test_iter:
+(NORMAL      ) IterTimerHook
+ --------------------
+after_test_iter:
+(NORMAL      ) IterTimerHook
+(BELOW_NORMAL) LoggerHook
+ --------------------
+after_test_epoch:
+(VERY_HIGH   ) RuntimeInfoHook
+(NORMAL      ) IterTimerHook
+(BELOW_NORMAL) LoggerHook
+ --------------------
+after_test:
+(VERY_HIGH   ) RuntimeInfoHook
+ --------------------
+after_run:
+(BELOW_NORMAL) LoggerHook
+ --------------------
+2026/03/15 22:48:12 - bevformer - INFO - Config:
+_dim_ = 256
+_ffn_dim_ = 512
+_num_levels_ = 1
+_pos_dim_ = 128
+auto_scale_lr = dict(base_batch_size=16, enable=False)
+bev_h_ = 50
+bev_w_ = 50
+by_epoch = False
+class_names = [
+    'car',
+    'truck',
+    'construction_vehicle',
+    'bus',
+    'trailer',
+    'barrier',
+    'motorcycle',
+    'bicycle',
+    'pedestrian',
+    'traffic_cone',
+]
+custom_hooks = [
+    dict(
+        by_epoch=False,
+        clean_local=False,
+        interval=1,
+        repo_id='5421Project',
+        type='CheckpointUploader'),
+    dict(repo_id='5421Project', resume_type='last', type='CheckpointResumer'),
+]
+data = dict(
+    nonshuffler_sampler=dict(type='DistributedSampler'),
+    samples_per_gpu=1,
+    shuffler_sampler=dict(type='DistributedGroupSampler'),
+    test=dict(
+        ann_file='data/nuscenes/v1.0-mini/nuscenes_infos_temporal_val.pkl',
+        bev_size=(
+            50,
+            50,
+        ),
+        classes=[
+            'car',
+            'truck',
+            'construction_vehicle',
+            'bus',
+            'trailer',
+            'barrier',
+            'motorcycle',
+            'bicycle',
+            'pedestrian',
+            'traffic_cone',
+        ],
+        data_root='data/nuscenes/v1.0-mini/',
+        frame=[
+            -3,
+            -2,
+            -1,
+        ],
+        modality=dict(
+            use_camera=True,
+            use_external=False,
+            use_lidar=False,
+            use_map=False,
+            use_radar=False),
+        pipeline=[
+            dict(to_float32=True, type='LoadMultiViewImageFromFiles'),
+            dict(
+                mean=[
+                    123.675,
+                    116.28,
+                    103.53,
+                ],
+                std=[
+                    58.395,
+                    57.12,
+                    57.375,
+                ],
+                to_rgb=True,
+                type='NormalizeMultiviewImage'),
+            dict(
+                flip=False,
+                img_scale=(
+                    800,
+                    450,
+                ),
+                pts_scale_ratio=[
+                    1.0,
+                ],
+                transforms=[
+                    dict(
+                        scales=[
+                            0.5,
+                        ], type='RandomScaleImageMultiViewImage'),
+                    dict(size_divisor=32, type='PadMultiViewImage'),
+                    dict(
+                        class_names=[
+                            'car',
+                            'truck',
+                            'construction_vehicle',
+                            'bus',
+                            'trailer',
+                            'barrier',
+                            'motorcycle',
+                            'bicycle',
+                            'pedestrian',
+                            'traffic_cone',
+                        ],
+                        type='CustomDefaultFormatBundle3D'),
+                    dict(keys=[
+                        'img',
+                    ], type='CustomCollect3D'),
+                ],
+                type='MultiScaleFlipAug3D'),
+        ],
+        test_mode=True,
+        type='CustomNuScenesDataset'),
+    train=dict(
+        ann_file='data/nuscenes/v1.0-mini/nuscenes_infos_temporal_train.pkl',
+        bev_size=(
+            50,
+            50,
+        ),
+        box_type_3d='LiDAR',
+        classes=[
+            'car',
+            'truck',
+            'construction_vehicle',
+            'bus',
+            'trailer',
+            'barrier',
+            'motorcycle',
+            'bicycle',
+            'pedestrian',
+            'traffic_cone',
+        ],
+        data_root='data/nuscenes/v1.0-mini/',
+        modality=dict(
+            use_camera=True,
+            use_external=False,
+            use_lidar=False,
+            use_map=False,
+            use_radar=False),
+        pipeline=[
+            dict(to_float32=True, type='LoadMultiViewImageFromFiles'),
+            dict(
+                type='LoadAnnotations3D',
+                with_bbox_3d=True,
+                with_label_3d=True),
+            dict(
+                point_cloud_range=[
+                    -51.2,
+                    -51.2,
+                    -5.0,
+                    51.2,
+                    51.2,
+                    3.0,
+                ],
+                type='ObjectRangeFilter'),
+            dict(
+                classes=[
+                    'car',
+                    'truck',
+                    'construction_vehicle',
+                    'bus',
+                    'trailer',
+                    'barrier',
+                    'motorcycle',
+                    'bicycle',
+                    'pedestrian',
+                    'traffic_cone',
+                ],
+                type='ObjectNameFilter'),
+            dict(type='PhotoMetricDistortionMultiViewImage'),
+            dict(
+                mean=[
+                    123.675,
+                    116.28,
+                    103.53,
+                ],
+                std=[
+                    58.395,
+                    57.12,
+                    57.375,
+                ],
+                to_rgb=True,
+                type='NormalizeMultiviewImage'),
+            dict(scales=[
+                0.5,
+            ], type='RandomScaleImageMultiViewImage'),
+            dict(size_divisor=32, type='PadMultiViewImage'),
+            dict(
+                class_names=[
+                    'car',
+                    'truck',
+                    'construction_vehicle',
+                    'bus',
+                    'trailer',
+                    'barrier',
+                    'motorcycle',
+                    'bicycle',
+                    'pedestrian',
+                    'traffic_cone',
+                ],
+                type='CustomDefaultFormatBundle3D'),
+            dict(
+                keys=[
+                    'gt_bboxes_3d',
+                    'gt_labels_3d',
+                    'img',
+                ],
+                type='CustomCollect3D'),
+            dict(type='TypeConverter'),
+        ],
+        queue_length=4,
+        test_mode=False,
+        type='CustomNuScenesDataset',
+        use_valid_flag=True),
+    val=dict(
+        ann_file='data/nuscenes/v1.0-mini/nuscenes_infos_temporal_val.pkl',
+        bev_size=(
+            50,
+            50,
+        ),
+        classes=[
+            'car',
+            'truck',
+            'construction_vehicle',
+            'bus',
+            'trailer',
+            'barrier',
+            'motorcycle',
+            'bicycle',
+            'pedestrian',
+            'traffic_cone',
+        ],
+        data_root='data/nuscenes/v1.0-mini/',
+        frame=(),
+        frames=[
+            -3,
+            -2,
+            -1,
+        ],
+        modality=dict(
+            use_camera=True,
+            use_external=False,
+            use_lidar=False,
+            use_map=False,
+            use_radar=False),
+        pipeline=[
+            dict(to_float32=True, type='LoadMultiViewImageFromFiles'),
+            dict(
+                mean=[
+                    123.675,
+                    116.28,
+                    103.53,
+                ],
+                std=[
+                    58.395,
+                    57.12,
+                    57.375,
+                ],
+                to_rgb=True,
+                type='NormalizeMultiviewImage'),
+            dict(
+                flip=False,
+                img_scale=(
+                    800,
+                    450,
+                ),
+                pts_scale_ratio=[
+                    1.0,
+                ],
+                transforms=[
+                    dict(
+                        scales=[
+                            0.5,
+                        ], type='RandomScaleImageMultiViewImage'),
+                    dict(size_divisor=32, type='PadMultiViewImage'),
+                    dict(
+                        class_names=[
+                            'car',
+                            'truck',
+                            'construction_vehicle',
+                            'bus',
+                            'trailer',
+                            'barrier',
+                            'motorcycle',
+                            'bicycle',
+                            'pedestrian',
+                            'traffic_cone',
+                        ],
+                        type='CustomDefaultFormatBundle3D'),
+                    dict(keys=[
+                        'img',
+                    ], type='CustomCollect3D'),
+                ],
+                type='MultiScaleFlipAug3D'),
+        ],
+        samples_per_gpu=1,
+        test_mode=True,
+        type='CustomNuScenesDataset'),
+    workers_per_gpu=4)
+data_root = 'data/nuscenes/v1.0-mini/'
+dataset_type = 'CustomNuScenesDataset'
+decoder = dict(
+    num_layers=6,
+    return_intermediate=True,
+    transformerlayers=dict(
+        attn_cfgs=[
+            dict(
+                dropout=0.1,
+                embed_dims=256,
+                num_heads=8,
+                type='MultiheadAttention'),
+            dict(
+                embed_dims=256,
+                num_levels=1,
+                type='CustomMSDeformableAttention'),
+        ],
+        ffn_cfgs=dict(
+            feedforward_channels=512, ffn_drop=0.1, num_fcs=2, type='FFN'),
+        operation_order=(
+            'self_attn',
+            'norm',
+            'cross_attn',
+            'norm',
+            'ffn',
+            'norm',
+        ),
+        type='DetrTransformerDecoderLayer'),
+    type='DetectionTransformerDecoder')
+default_hooks = dict(
+    checkpoint=dict(
+        by_epoch=False,
+        interval=1,
+        max_keep_ckpts=1,
+        save_best=[
+            'loss',
+            'mAP',
+            'NDS',
+        ],
+        type='CheckpointHookV2'),
+    logger=dict(
+        interval=1,
+        interval_exp_name=1000,
+        log_metric_by_epoch=False,
+        type='LoggerHook'),
+    param_scheduler=dict(type='ParamSchedulerHook'),
+    runtime_info=dict(type='RuntimeInfoHook'),
+    sampler_seed=dict(type='DistSamplerSeedHook'),
+    timer=dict(type='IterTimerHook'))
+encoder = dict(
+    num_layers=3,
+    num_points_in_pillar=8,
+    pc_range=[
+        -51.2,
+        -51.2,
+        -5.0,
+        51.2,
+        51.2,
+        3.0,
+    ],
+    return_intermediate=False,
+    transformerlayers=dict(
+        attn_cfgs=[
+            dict(embed_dims=256, num_levels=1, type='TemporalSelfAttention'),
+            dict(
+                deformable_attention=dict(
+                    embed_dims=256,
+                    num_levels=1,
+                    num_points=8,
+                    type='MSDeformableAttention3D'),
+                embed_dims=256,
+                pc_range=[
+                    -51.2,
+                    -51.2,
+                    -5.0,
+                    51.2,
+                    51.2,
+                    3.0,
+                ],
+                type='SpatialCrossAttention'),
+        ],
+        ffn_cfgs=dict(
+            feedforward_channels=512, ffn_drop=0.1, num_fcs=2, type='FFN'),
+        operation_order=(
+            'self_attn',
+            'norm',
+            'cross_attn',
+            'norm',
+            'ffn',
+            'norm',
+        ),
+        type='BEVFormerLayer'),
+    type='BEVFormerEncoder')
+env_cfg = dict(dist_cfg=dict(backend='nccl'))
+experiment_name = 'debug'
+file_client_args = dict(backend='disk')
+frames = [
+    -3,
+    -2,
+    -1,
+]
+gpu_ids = range(0, 1)
+img_norm_cfg = dict(
+    mean=[
+        123.675,
+        116.28,
+        103.53,
+    ],
+    std=[
+        58.395,
+        57.12,
+        57.375,
+    ],
+    to_rgb=True)
+input_modality = dict(
+    use_camera=True,
+    use_external=False,
+    use_lidar=False,
+    use_map=False,
+    use_radar=False)
+interval = 1
+launcher = 'none'
+load_from = None
+log_interval = 1
+log_processor = dict(window_size=20)
+lr_config = dict(
+    min_lr_ratio=0.001,
+    policy='CosineAnnealing',
+    warmup='linear',
+    warmup_iters=500,
+    warmup_ratio=0.3333333333333333)
+max_epochs = 5
+max_iters = 2
+model = dict(
+    img_backbone=dict(
+        depth=50,
+        frozen_stages=1,
+        norm_cfg=dict(requires_grad=False, type='BN'),
+        norm_eval=True,
+        num_stages=4,
+        out_indices=(3, ),
+        style='pytorch',
+        type='ResNet'),
+    img_neck=dict(
+        add_extra_convs='on_output',
+        in_channels=[
+            2048,
+        ],
+        num_outs=1,
+        out_channels=256,
+        relu_before_extra_convs=True,
+        start_level=0,
+        type='FPN'),
+    pretrained=dict(img='torchvision://resnet50'),
+    pts_bbox_head=dict(
+        as_two_stage=False,
+        bbox_coder=dict(
+            max_num=300,
+            num_classes=10,
+            pc_range=[
+                -51.2,
+                -51.2,
+                -5.0,
+                51.2,
+                51.2,
+                3.0,
+            ],
+            post_center_range=[
+                -61.2,
+                -61.2,
+                -10.0,
+                61.2,
+                61.2,
+                10.0,
+            ],
+            type='NMSFreeCoder',
+            voxel_size=[
+                0.2,
+                0.2,
+                8,
+            ]),
+        bev_h=50,
+        bev_w=50,
+        in_channels=256,
+        loss_bbox=dict(loss_weight=0.5, type='L1Loss'),
+        loss_cls=dict(
+            alpha=0.25,
+            gamma=2.0,
+            loss_weight=2.0,
+            type='FocalLoss',
+            use_sigmoid=True),
+        loss_iou=dict(loss_weight=0.25, type='GIoULoss'),
+        num_classes=10,
+        num_query=900,
+        positional_encoding=dict(
+            col_num_embed=50,
+            num_feats=128,
+            row_num_embed=50,
+            type='LearnedPositionalEncoding'),
+        sync_cls_avg_factor=True,
+        transformer=dict(
+            decoder=dict(
+                num_layers=6,
+                return_intermediate=True,
+                transformerlayers=dict(
+                    attn_cfgs=[
+                        dict(
+                            dropout=0.1,
+                            embed_dims=256,
+                            num_heads=8,
+                            type='MultiheadAttention'),
+                        dict(
+                            embed_dims=256,
+                            num_levels=1,
+                            type='CustomMSDeformableAttention'),
+                    ],
+                    ffn_cfgs=dict(
+                        feedforward_channels=512,
+                        ffn_drop=0.1,
+                        num_fcs=2,
+                        type='FFN'),
+                    operation_order=(
+                        'self_attn',
+                        'norm',
+                        'cross_attn',
+                        'norm',
+                        'ffn',
+                        'norm',
+                    ),
+                    type='DetrTransformerDecoderLayer'),
+                type='DetectionTransformerDecoder'),
+            embed_dims=256,
+            encoder=dict(
+                num_layers=3,
+                num_points_in_pillar=8,
+                pc_range=[
+                    -51.2,
+                    -51.2,
+                    -5.0,
+                    51.2,
+                    51.2,
+                    3.0,
+                ],
+                return_intermediate=False,
+                transformerlayers=dict(
+                    attn_cfgs=[
+                        dict(
+                            embed_dims=256,
+                            num_levels=1,
+                            type='TemporalSelfAttention'),
+                        dict(
+                            deformable_attention=dict(
+                                embed_dims=256,
+                                num_levels=1,
+                                num_points=8,
+                                type='MSDeformableAttention3D'),
+                            embed_dims=256,
+                            pc_range=[
+                                -51.2,
+                                -51.2,
+                                -5.0,
+                                51.2,
+                                51.2,
+                                3.0,
+                            ],
+                            type='SpatialCrossAttention'),
+                    ],
+                    ffn_cfgs=dict(
+                        feedforward_channels=512,
+                        ffn_drop=0.1,
+                        num_fcs=2,
+                        type='FFN'),
+                    operation_order=(
+                        'self_attn',
+                        'norm',
+                        'cross_attn',
+                        'norm',
+                        'ffn',
+                        'norm',
+                    ),
+                    type='BEVFormerLayer'),
+                type='BEVFormerEncoder'),
+            num_cams=6,
+            num_feature_levels=1,
+            rotate_prev_bev=True,
+            type='PerceptionTransformer',
+            use_can_bus=True,
+            use_shift=True),
+        type='BEVFormerHead',
+        with_box_refine=True),
+    train_cfg=dict(
+        pts=dict(
+            assigner=dict(
+                cls_cost=dict(type='FocalCost', weight=2.0),
+                iou_cost=dict(type='SmoothL1Cost', weight=0.25),
+                pc_range=[
+                    -51.2,
+                    -51.2,
+                    -5.0,
+                    51.2,
+                    51.2,
+                    3.0,
+                ],
+                reg_cost=dict(type='BBox3DL1Cost', weight=0.25),
+                type='HungarianAssigner3D'),
+            grid_size=[
+                512,
+                512,
+                1,
+            ],
+            out_size_factor=4,
+            point_cloud_range=[
+                -51.2,
+                -51.2,
+                -5.0,
+                51.2,
+                51.2,
+                3.0,
+            ],
+            voxel_size=[
+                0.2,
+                0.2,
+                8,
+            ])),
+    type='BEVFormerDetector',
+    use_grid_mask=True,
+    video_test_mode=True)
+optim_wrapper = dict(
+    optimizer=dict(lr=0.0001, type='AdamW', weight_decay=0.01),
+    type='OptimWrapper')
+optimizer = dict(lr=0.0001, type='AdamW', weight_decay=0.01)
+param_scheduler = dict(
+    milestones=[
+        1,
+        2,
+    ], type='MultiStepLR')
+point_cloud_range = [
+    -51.2,
+    -51.2,
+    -5.0,
+    51.2,
+    51.2,
+    3.0,
+]
+pts_bbox_head = dict(
+    as_two_stage=False,
+    bbox_coder=dict(
+        max_num=300,
+        num_classes=10,
+        pc_range=[
+            -51.2,
+            -51.2,
+            -5.0,
+            51.2,
+            51.2,
+            3.0,
+        ],
+        post_center_range=[
+            -61.2,
+            -61.2,
+            -10.0,
+            61.2,
+            61.2,
+            10.0,
+        ],
+        type='NMSFreeCoder',
+        voxel_size=[
+            0.2,
+            0.2,
+            8,
+        ]),
+    bev_h=50,
+    bev_w=50,
+    in_channels=256,
+    loss_bbox=dict(loss_weight=0.5, type='L1Loss'),
+    loss_cls=dict(
+        alpha=0.25,
+        gamma=2.0,
+        loss_weight=2.0,
+        type='FocalLoss',
+        use_sigmoid=True),
+    loss_iou=dict(loss_weight=0.25, type='GIoULoss'),
+    num_classes=10,
+    num_query=900,
+    positional_encoding=dict(
+        col_num_embed=50,
+        num_feats=128,
+        row_num_embed=50,
+        type='LearnedPositionalEncoding'),
+    sync_cls_avg_factor=True,
+    transformer=dict(
+        decoder=dict(
+            num_layers=6,
+            return_intermediate=True,
+            transformerlayers=dict(
+                attn_cfgs=[
+                    dict(
+                        dropout=0.1,
+                        embed_dims=256,
+                        num_heads=8,
+                        type='MultiheadAttention'),
+                    dict(
+                        embed_dims=256,
+                        num_levels=1,
+                        type='CustomMSDeformableAttention'),
+                ],
+                ffn_cfgs=dict(
+                    feedforward_channels=512,
+                    ffn_drop=0.1,
+                    num_fcs=2,
+                    type='FFN'),
+                operation_order=(
+                    'self_attn',
+                    'norm',
+                    'cross_attn',
+                    'norm',
+                    'ffn',
+                    'norm',
+                ),
+                type='DetrTransformerDecoderLayer'),
+            type='DetectionTransformerDecoder'),
+        embed_dims=256,
+        encoder=dict(
+            num_layers=3,
+            num_points_in_pillar=8,
+            pc_range=[
+                -51.2,
+                -51.2,
+                -5.0,
+                51.2,
+                51.2,
+                3.0,
+            ],
+            return_intermediate=False,
+            transformerlayers=dict(
+                attn_cfgs=[
+                    dict(
+                        embed_dims=256,
+                        num_levels=1,
+                        type='TemporalSelfAttention'),
+                    dict(
+                        deformable_attention=dict(
+                            embed_dims=256,
+                            num_levels=1,
+                            num_points=8,
+                            type='MSDeformableAttention3D'),
+                        embed_dims=256,
+                        pc_range=[
+                            -51.2,
+                            -51.2,
+                            -5.0,
+                            51.2,
+                            51.2,
+                            3.0,
+                        ],
+                        type='SpatialCrossAttention'),
+                ],
+                ffn_cfgs=dict(
+                    feedforward_channels=512,
+                    ffn_drop=0.1,
+                    num_fcs=2,
+                    type='FFN'),
+                operation_order=(
+                    'self_attn',
+                    'norm',
+                    'cross_attn',
+                    'norm',
+                    'ffn',
+                    'norm',
+                ),
+                type='BEVFormerLayer'),
+            type='BEVFormerEncoder'),
+        num_cams=6,
+        num_feature_levels=1,
+        rotate_prev_bev=True,
+        type='PerceptionTransformer',
+        use_can_bus=True,
+        use_shift=True),
+    type='BEVFormerHead',
+    with_box_refine=True)
+queue_length = 4
+resume = False
+scales = [
+    0.5,
+]
+test_cfg = dict(max_iters=1)
+test_dataloader = dict(
+    batch_size=1,
+    collate_fn=dict(type='test_collate'),
+    dataset=dict(
+        ann_file='data/nuscenes/v1.0-mini/nuscenes_infos_temporal_val.pkl',
+        bev_size=(
+            50,
+            50,
+        ),
+        classes=[
+            'car',
+            'truck',
+            'construction_vehicle',
+            'bus',
+            'trailer',
+            'barrier',
+            'motorcycle',
+            'bicycle',
+            'pedestrian',
+            'traffic_cone',
+        ],
+        data_root='data/nuscenes/v1.0-mini/',
+        frame=[
+            -3,
+            -2,
+            -1,
+        ],
+        modality=dict(
+            use_camera=True,
+            use_external=False,
+            use_lidar=False,
+            use_map=False,
+            use_radar=False),
+        pipeline=[
+            dict(to_float32=True, type='LoadMultiViewImageFromFiles'),
+            dict(
+                mean=[
+                    123.675,
+                    116.28,
+                    103.53,
+                ],
+                std=[
+                    58.395,
+                    57.12,
+                    57.375,
+                ],
+                to_rgb=True,
+                type='NormalizeMultiviewImage'),
+            dict(
+                flip=False,
+                img_scale=(
+                    800,
+                    450,
+                ),
+                pts_scale_ratio=[
+                    1.0,
+                ],
+                transforms=[
+                    dict(
+                        scales=[
+                            0.5,
+                        ], type='RandomScaleImageMultiViewImage'),
+                    dict(size_divisor=32, type='PadMultiViewImage'),
+                    dict(
+                        class_names=[
+                            'car',
+                            'truck',
+                            'construction_vehicle',
+                            'bus',
+                            'trailer',
+                            'barrier',
+                            'motorcycle',
+                            'bicycle',
+                            'pedestrian',
+                            'traffic_cone',
+                        ],
+                        type='CustomDefaultFormatBundle3D'),
+                    dict(keys=[
+                        'img',
+                    ], type='CustomCollect3D'),
+                ],
+                type='MultiScaleFlipAug3D'),
+        ],
+        test_mode=True,
+        type='CustomNuScenesDataset'),
+    num_workers=0,
+    sampler=dict(shuffle=True, type='DefaultSampler'))
+test_evaluator = dict(metrics=[
+    dict(
+        ann_file='data/nuscenes/v1.0-mini/nuscenes_infos_temporal_val.pkl',
+        data_root='data/nuscenes/v1.0-mini/',
+        type='src.NuScenesMetric',
+        version='v1.0-mini'),
+])
+test_max_iters = 1
+test_pipeline = [
+    dict(to_float32=True, type='LoadMultiViewImageFromFiles'),
+    dict(
+        mean=[
+            123.675,
+            116.28,
+            103.53,
+        ],
+        std=[
+            58.395,
+            57.12,
+            57.375,
+        ],
+        to_rgb=True,
+        type='NormalizeMultiviewImage'),
+    dict(
+        flip=False,
+        img_scale=(
+            800,
+            450,
+        ),
+        pts_scale_ratio=[
+            1.0,
+        ],
+        transforms=[
+            dict(scales=[
+                0.5,
+            ], type='RandomScaleImageMultiViewImage'),
+            dict(size_divisor=32, type='PadMultiViewImage'),
+            dict(
+                class_names=[
+                    'car',
+                    'truck',
+                    'construction_vehicle',
+                    'bus',
+                    'trailer',
+                    'barrier',
+                    'motorcycle',
+                    'bicycle',
+                    'pedestrian',
+                    'traffic_cone',
+                ],
+                type='CustomDefaultFormatBundle3D'),
+            dict(keys=[
+                'img',
+            ], type='CustomCollect3D'),
+        ],
+        type='MultiScaleFlipAug3D'),
+]
+train_cfg = dict(by_epoch=False, max_epochs=5, max_iters=2, val_interval=1)
+train_dataloader = dict(
+    batch_size=1,
+    collate_fn=dict(type='train_collate'),
+    dataset=dict(
+        ann_file='data/nuscenes/v1.0-mini/nuscenes_infos_temporal_train.pkl',
+        bev_size=(
+            50,
+            50,
+        ),
+        box_type_3d='LiDAR',
+        classes=[
+            'car',
+            'truck',
+            'construction_vehicle',
+            'bus',
+            'trailer',
+            'barrier',
+            'motorcycle',
+            'bicycle',
+            'pedestrian',
+            'traffic_cone',
+        ],
+        data_root='data/nuscenes/v1.0-mini/',
+        modality=dict(
+            use_camera=True,
+            use_external=False,
+            use_lidar=False,
+            use_map=False,
+            use_radar=False),
+        pipeline=[
+            dict(to_float32=True, type='LoadMultiViewImageFromFiles'),
+            dict(
+                type='LoadAnnotations3D',
+                with_bbox_3d=True,
+                with_label_3d=True),
+            dict(
+                point_cloud_range=[
+                    -51.2,
+                    -51.2,
+                    -5.0,
+                    51.2,
+                    51.2,
+                    3.0,
+                ],
+                type='ObjectRangeFilter'),
+            dict(
+                classes=[
+                    'car',
+                    'truck',
+                    'construction_vehicle',
+                    'bus',
+                    'trailer',
+                    'barrier',
+                    'motorcycle',
+                    'bicycle',
+                    'pedestrian',
+                    'traffic_cone',
+                ],
+                type='ObjectNameFilter'),
+            dict(type='PhotoMetricDistortionMultiViewImage'),
+            dict(
+                mean=[
+                    123.675,
+                    116.28,
+                    103.53,
+                ],
+                std=[
+                    58.395,
+                    57.12,
+                    57.375,
+                ],
+                to_rgb=True,
+                type='NormalizeMultiviewImage'),
+            dict(scales=[
+                0.5,
+            ], type='RandomScaleImageMultiViewImage'),
+            dict(size_divisor=32, type='PadMultiViewImage'),
+            dict(
+                class_names=[
+                    'car',
+                    'truck',
+                    'construction_vehicle',
+                    'bus',
+                    'trailer',
+                    'barrier',
+                    'motorcycle',
+                    'bicycle',
+                    'pedestrian',
+                    'traffic_cone',
+                ],
+                type='CustomDefaultFormatBundle3D'),
+            dict(
+                keys=[
+                    'gt_bboxes_3d',
+                    'gt_labels_3d',
+                    'img',
+                ],
+                type='CustomCollect3D'),
+            dict(type='TypeConverter'),
+        ],
+        queue_length=4,
+        test_mode=False,
+        type='CustomNuScenesDataset',
+        use_valid_flag=True),
+    num_workers=0,
+    sampler=dict(shuffle=True, type='DefaultSampler'))
+train_pipeline = [
+    dict(to_float32=True, type='LoadMultiViewImageFromFiles'),
+    dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True),
+    dict(
+        point_cloud_range=[
+            -51.2,
+            -51.2,
+            -5.0,
+            51.2,
+            51.2,
+            3.0,
+        ],
+        type='ObjectRangeFilter'),
+    dict(
+        classes=[
+            'car',
+            'truck',
+            'construction_vehicle',
+            'bus',
+            'trailer',
+            'barrier',
+            'motorcycle',
+            'bicycle',
+            'pedestrian',
+            'traffic_cone',
+        ],
+        type='ObjectNameFilter'),
+    dict(type='PhotoMetricDistortionMultiViewImage'),
+    dict(
+        mean=[
+            123.675,
+            116.28,
+            103.53,
+        ],
+        std=[
+            58.395,
+            57.12,
+            57.375,
+        ],
+        to_rgb=True,
+        type='NormalizeMultiviewImage'),
+    dict(scales=[
+        0.5,
+    ], type='RandomScaleImageMultiViewImage'),
+    dict(size_divisor=32, type='PadMultiViewImage'),
+    dict(
+        class_names=[
+            'car',
+            'truck',
+            'construction_vehicle',
+            'bus',
+            'trailer',
+            'barrier',
+            'motorcycle',
+            'bicycle',
+            'pedestrian',
+            'traffic_cone',
+        ],
+        type='CustomDefaultFormatBundle3D'),
+    dict(
+        keys=[
+            'gt_bboxes_3d',
+            'gt_labels_3d',
+            'img',
+        ], type='CustomCollect3D'),
+    dict(type='TypeConverter'),
+]
+transformer = dict(
+    decoder=dict(
+        num_layers=6,
+        return_intermediate=True,
+        transformerlayers=dict(
+            attn_cfgs=[
+                dict(
+                    dropout=0.1,
+                    embed_dims=256,
+                    num_heads=8,
+                    type='MultiheadAttention'),
+                dict(
+                    embed_dims=256,
+                    num_levels=1,
+                    type='CustomMSDeformableAttention'),
+            ],
+            ffn_cfgs=dict(
+                feedforward_channels=512, ffn_drop=0.1, num_fcs=2, type='FFN'),
+            operation_order=(
+                'self_attn',
+                'norm',
+                'cross_attn',
+                'norm',
+                'ffn',
+                'norm',
+            ),
+            type='DetrTransformerDecoderLayer'),
+        type='DetectionTransformerDecoder'),
+    embed_dims=256,
+    encoder=dict(
+        num_layers=3,
+        num_points_in_pillar=8,
+        pc_range=[
+            -51.2,
+            -51.2,
+            -5.0,
+            51.2,
+            51.2,
+            3.0,
+        ],
+        return_intermediate=False,
+        transformerlayers=dict(
+            attn_cfgs=[
+                dict(
+                    embed_dims=256, num_levels=1,
+                    type='TemporalSelfAttention'),
+                dict(
+                    deformable_attention=dict(
+                        embed_dims=256,
+                        num_levels=1,
+                        num_points=8,
+                        type='MSDeformableAttention3D'),
+                    embed_dims=256,
+                    pc_range=[
+                        -51.2,
+                        -51.2,
+                        -5.0,
+                        51.2,
+                        51.2,
+                        3.0,
+                    ],
+                    type='SpatialCrossAttention'),
+            ],
+            ffn_cfgs=dict(
+                feedforward_channels=512, ffn_drop=0.1, num_fcs=2, type='FFN'),
+            operation_order=(
+                'self_attn',
+                'norm',
+                'cross_attn',
+                'norm',
+                'ffn',
+                'norm',
+            ),
+            type='BEVFormerLayer'),
+        type='BEVFormerEncoder'),
+    num_cams=6,
+    num_feature_levels=1,
+    rotate_prev_bev=True,
+    type='PerceptionTransformer',
+    use_can_bus=True,
+    use_shift=True)
+val_cfg = dict(max_iters=1)
+val_dataloader = dict(
+    batch_size=1,
+    collate_fn=dict(type='test_collate'),
+    dataset=dict(
+        ann_file='data/nuscenes/v1.0-mini/nuscenes_infos_temporal_val.pkl',
+        bev_size=(
+            50,
+            50,
+        ),
+        classes=[
+            'car',
+            'truck',
+            'construction_vehicle',
+            'bus',
+            'trailer',
+            'barrier',
+            'motorcycle',
+            'bicycle',
+            'pedestrian',
+            'traffic_cone',
+        ],
+        data_root='data/nuscenes/v1.0-mini/',
+        frame=(),
+        frames=[
+            -3,
+            -2,
+            -1,
+        ],
+        modality=dict(
+            use_camera=True,
+            use_external=False,
+            use_lidar=False,
+            use_map=False,
+            use_radar=False),
+        pipeline=[
+            dict(to_float32=True, type='LoadMultiViewImageFromFiles'),
+            dict(
+                mean=[
+                    123.675,
+                    116.28,
+                    103.53,
+                ],
+                std=[
+                    58.395,
+                    57.12,
+                    57.375,
+                ],
+                to_rgb=True,
+                type='NormalizeMultiviewImage'),
+            dict(
+                flip=False,
+                img_scale=(
+                    800,
+                    450,
+                ),
+                pts_scale_ratio=[
+                    1.0,
+                ],
+                transforms=[
+                    dict(
+                        scales=[
+                            0.5,
+                        ], type='RandomScaleImageMultiViewImage'),
+                    dict(size_divisor=32, type='PadMultiViewImage'),
+                    dict(
+                        class_names=[
+                            'car',
+                            'truck',
+                            'construction_vehicle',
+                            'bus',
+                            'trailer',
+                            'barrier',
+                            'motorcycle',
+                            'bicycle',
+                            'pedestrian',
+                            'traffic_cone',
+                        ],
+                        type='CustomDefaultFormatBundle3D'),
+                    dict(keys=[
+                        'img',
+                    ], type='CustomCollect3D'),
+                ],
+                type='MultiScaleFlipAug3D'),
+        ],
+        samples_per_gpu=1,
+        test_mode=True,
+        type='CustomNuScenesDataset'),
+    num_workers=0,
+    sampler=dict(shuffle=True, type='DefaultSampler'))
+val_evaluator = dict(metrics=[
+    dict(
+        ann_file='data/nuscenes/v1.0-mini/nuscenes_infos_temporal_val.pkl',
+        classes=[
+            'car',
+            'truck',
+            'construction_vehicle',
+            'bus',
+            'trailer',
+            'barrier',
+            'motorcycle',
+            'bicycle',
+            'pedestrian',
+            'traffic_cone',
+        ],
+        data_root='data/nuscenes/v1.0-mini/',
+        jsonfile_prefix='results',
+        modality=dict(
+            use_camera=True,
+            use_external=False,
+            use_lidar=False,
+            use_map=False,
+            use_radar=False),
+        plot_every_run=True,
+        plot_examples=1,
+        type='src.NuScenesMetric',
+        version='v1.0-mini'),
+])
+val_interval = 1
+val_max_iters = 1
+version = 'v1.0-mini'
+visualizer = dict(
+    type='Visualizer',
+    vis_backends=[
+        dict(type='LocalVisBackend'),
+        dict(type='TensorboardVisBackend'),
+    ])
+voxel_size = [
+    0.2,
+    0.2,
+    8,
+]
+work_dir = 'experiment'
+2026/03/15 22:48:13 - bevformer - INFO - See full config in 'experiment/debug/bevformer_tiny_test.py'.
+2026/03/15 22:48:15 - bevformer - INFO - Checkpoints will be saved to 'experiment/debug' after every 1 steps.
+2026/03/15 22:48:15 - bevformer - INFO - Initialize best checkpoints by train phase.
+2026/03/15 22:48:15 - bevformer - INFO - Set best path for 'loss' None.
+2026/03/15 22:48:15 - bevformer - INFO - Set best path for 'mAP' None.
+2026/03/15 22:48:15 - bevformer - INFO - Set best path for 'NDS' None.
+2026/03/15 22:48:15 - bevformer - INFO - The best checkpoints will be saved to 'experiment/debug' based on ['loss', 'mAP', 'NDS'] with rules ['less', 'greater', 'greater'] after every 1 steps.
+2026/03/15 22:48:15 - bevformer - INFO - Keep maximum 1 checkpoints in local.
+2026/03/15 22:48:15 - bevformer - INFO - Checkpoints will be pushed to repo 'https://huggingface.co/5421Project/debug' after every 1 steps.
+2026/03/15 22:48:39 - bevformer - INFO - Epoch(train) [1][  1/323]  lr: 1.0000e-04  eta: 0:00:24  time: 24.1245  data_time: 1.0970  loss: 58.3901  loss_cls: 2.3029  loss_bbox: 7.7319  d0.loss_cls: 2.2785  d0.loss_bbox: 7.3663  d1.loss_cls: 2.2856  d1.loss_bbox: 7.4108  d2.loss_cls: 2.2401  d2.loss_bbox: 7.3151  d3.loss_cls: 2.2908  d3.loss_bbox: 7.3724  d4.loss_cls: 2.3867  d4.loss_bbox: 7.4091
+2026/03/15 22:48:39 - bevformer - INFO - Saving checkpoint at 1 iterations
+2026/03/15 22:48:40 - bevformer - INFO - Saving best checkpoints...
+2026/03/15 22:48:40 - bevformer - INFO - Set 'best_score_loss' to +/-inf as it is not in message hub.
+2026/03/15 22:48:40 - bevformer - INFO - [loss]: Best score: inf, current score: 58.39009094238281
+2026/03/15 22:48:41 - bevformer - INFO - The best checkpoint with 58.3901 loss at 1 iter is saved to 'best_loss_iter_1.pth'.
+2026/03/15 22:48:41 - bevformer - INFO - Resaving checkpoint at 1 iter...
+2026/03/15 22:48:42 - bevformer - INFO - Pushing checkpoint at 1 steps...
+2026/03/15 22:48:52 - bevformer - INFO - Pushed last checkpoint 'experiment/debug/iter_1.pth' to repo
+2026/03/15 22:48:56 - bevformer - INFO - Pushed best checkpoint 'best_loss_iter_1.pth' of [loss]...
+2026/03/15 22:48:59 - bevformer - INFO - Epoch(val) [0][ 1/81]    eta: 0:02:55  time: 2.1911  data_time: 0.6630
+2026/03/15 22:49:10 - bevformer - INFO - Epoch(val) [0][81/81]    NDS: 0.0185  mAP: 0.0002  data_time: 0.6630  time: 2.1911
+2026/03/15 22:49:10 - bevformer - INFO - Save best checkpoints after val epoch.
+2026/03/15 22:49:10 - bevformer - INFO - Saving best checkpoints...
+2026/03/15 22:49:10 - bevformer - INFO - Set 'best_score_mAP' to +/-inf as it is not in message hub.
+2026/03/15 22:49:10 - bevformer - INFO - [mAP]: Best score: -inf, current score: 0.00023853881585173065
+2026/03/15 22:49:10 - bevformer - INFO - The best checkpoint with 0.0002 mAP at 1 iter is saved to 'best_mAP_iter_1.pth'.
+2026/03/15 22:49:10 - bevformer - INFO - Set 'best_score_NDS' to +/-inf as it is not in message hub.
+2026/03/15 22:49:10 - bevformer - INFO - [NDS]: Best score: -inf, current score: 0.018478272642075605
+2026/03/15 22:49:11 - bevformer - INFO - The best checkpoint with 0.0185 NDS at 1 iter is saved to 'best_NDS_iter_1.pth'.
+2026/03/15 22:49:11 - bevformer - INFO - Resaving checkpoint at 1 iter...
+2026/03/15 22:49:14 - bevformer - INFO - Pushed best checkpoint 'best_mAP_iter_1.pth' of [mAP]...
+2026/03/15 22:49:16 - bevformer - INFO - Pushed best checkpoint 'best_NDS_iter_1.pth' of [NDS]...
+2026/03/15 22:49:40 - bevformer - INFO - Epoch(train) [1][  2/323]  lr: 1.0000e-04  eta: 0:00:00  time: 32.7943  data_time: 10.6893  loss: 54.7058  loss_cls: 2.2897  loss_bbox: 7.1520  d0.loss_cls: 2.2585  d0.loss_bbox: 6.7714  d1.loss_cls: 2.2267  d1.loss_bbox: 6.7523  d2.loss_cls: 2.1501  d2.loss_bbox: 6.7822  d3.loss_cls: 2.2182  d3.loss_bbox: 6.8959  d4.loss_cls: 2.2799  d4.loss_bbox: 6.9290
+2026/03/15 22:49:40 - bevformer - INFO - Saving checkpoint at 2 iterations
+2026/03/15 22:49:42 - bevformer - INFO - Saving best checkpoints...
+2026/03/15 22:49:42 - bevformer - INFO - Got best score ['loss'] from message hub
+2026/03/15 22:49:42 - bevformer - INFO - [loss]: Best score: 58.39009094238281, current score: 51.02156066894531
+2026/03/15 22:49:42 - bevformer - INFO - The previous best checkpoint 'experiment/debug/best_loss_iter_1.pth' is removed
+2026/03/15 22:49:42 - bevformer - INFO - The best checkpoint with 51.0216 loss at 2 iter is saved to 'best_loss_iter_2.pth'.
+2026/03/15 22:49:42 - bevformer - INFO - Resaving checkpoint at 2 iter...
+2026/03/15 22:49:44 - bevformer - INFO - Pushing checkpoint at 2 steps...
+2026/03/15 22:49:47 - bevformer - INFO - Pushed last checkpoint 'experiment/debug/iter_2.pth' to repo
+2026/03/15 22:49:48 - bevformer - INFO - Removed 'iter_1.pth' from repo
+2026/03/15 22:49:53 - bevformer - INFO - Pushed best checkpoint 'best_loss_iter_2.pth' of [loss]...
+2026/03/15 22:49:55 - bevformer - INFO - Removed 'best_loss_iter_1.pth' from repo
+2026/03/15 22:50:00 - bevformer - INFO - Epoch(val) [0][ 1/81]    eta: 0:03:12  time: 2.3014  data_time: 0.7300
+2026/03/15 22:50:11 - bevformer - INFO - Epoch(val) [0][81/81]    NDS: 0.0178  mAP: 0.0002  data_time: 0.7300  time: 2.3014
+2026/03/15 22:50:11 - bevformer - INFO - Save best checkpoints after val epoch.
+2026/03/15 22:50:11 - bevformer - INFO - Saving best checkpoints...
+2026/03/15 22:50:11 - bevformer - INFO - Got best score ['mAP'] from message hub
+2026/03/15 22:50:11 - bevformer - INFO - [mAP]: Best score: 0.00023853881585173065, current score: 0.0002481977075675005
+2026/03/15 22:50:11 - bevformer - INFO - The previous best checkpoint 'experiment/debug/best_mAP_iter_1.pth' is removed
+2026/03/15 22:50:12 - bevformer - INFO - The best checkpoint with 0.0002 mAP at 2 iter is saved to 'best_mAP_iter_2.pth'.
+2026/03/15 22:50:12 - bevformer - INFO - Got best score ['NDS'] from message hub
+2026/03/15 22:50:12 - bevformer - INFO - [NDS]: Best score: 0.018478272642075605, current score: 0.017786070826831646
+2026/03/15 22:50:12 - bevformer - INFO - Resaving checkpoint at 2 iter...
+2026/03/15 22:50:15 - bevformer - INFO - Pushed best checkpoint 'best_mAP_iter_2.pth' of [mAP]...
+2026/03/15 22:50:16 - bevformer - INFO - Removed 'best_mAP_iter_1.pth' from repo
+2026/03/15 22:50:17 - bevformer - INFO - Pushing visualizing data and safetensors to repo after training...