+
+## You Only :eyes: Once for Panoptic :car: Perception
+> [**You Only Look Once for Panoptic Driving Perception**](https://arxiv.org/abs/2108.11250)
+>
+> by Dong Wu, Manwen Liao, Weitian Zhang, [Xinggang Wang](https://xinggangw.info/) :email:
+>
+> (:email:) corresponding author.
+>
+> *arXiv technical report ([arXiv 2108.11250](https://arxiv.org/abs/2108.11250))*
+
+---
+
+[English Document](https://github.com/hustvl/YOLOP)
+
+### The YOLOP Framework
+
+
+
+### Contributions
+
+* We put forward an efficient multi-task network that can jointly handle three crucial tasks in autonomous driving: object detection, drivable area segmentation and lane detection. It not only saves computational cost and reduces inference time, but also improves the performance of each task. Our work is the first to achieve real-time inference on embedded devices while maintaining state-of-the-art (`SOTA`) performance on the `BDD100K` dataset.
+
+* We design ablation experiments to verify the effectiveness of our multi-task scheme, proving that the three tasks can be learned jointly without tedious alternating optimization.
+
+* We design ablation experiments to show that the grid-based prediction mechanism of the detection task is more closely related to that of the semantic segmentation task; we believe this can serve as a reference for other multi-task learning research.
+
+
+### Results
+
+#### Traffic Object Detection Results:
+
+| Model | Recall(%) | mAP50(%) | Speed(fps) |
+| -------------- | --------- | -------- | ---------- |
+| `Multinet` | 81.3 | 60.2 | 8.6 |
+| `DLT-Net` | 89.4 | 68.4 | 9.3 |
+| `Faster R-CNN` | 77.2 | 55.6 | 5.3 |
+| `YOLOv5s` | 86.8 | 77.2 | 82 |
+| `YOLOP(ours)` | 89.2 | 76.5 | 41 |
+#### Drivable Area Segmentation Results:
+
+| Model | mIOU(%) | Speed(fps) |
+| ------------- | ------- | ---------- |
+| `Multinet` | 71.6 | 8.6 |
+| `DLT-Net` | 71.3 | 9.3 |
+| `PSPNet` | 89.6 | 11.1 |
+| `YOLOP(ours)` | 91.5 | 41 |
+
+#### Lane Detection Results:
+
+| Model | mIOU(%) | IOU(%) |
+| ------------- | ------- | ------ |
+| `ENet` | 34.12 | 14.64 |
+| `SCNN` | 35.79 | 15.84 |
+| `ENet-SAD` | 36.56 | 16.02 |
+| `YOLOP(ours)` | 70.50 | 26.20 |
+
+#### Ablation Study 1: End-to-end v.s. Step-by-step:
+
+| Training_method | Recall(%) | AP(%) | mIoU(%) | Accuracy(%) | IoU(%) |
+| --------------- | --------- | ----- | ------- | ----------- | ------ |
+| `ES-W` | 87.0 | 75.3 | 90.4 | 66.8 | 26.2 |
+| `ED-W` | 87.3 | 76.0 | 91.6 | 71.2 | 26.1 |
+| `ES-D-W` | 87.0 | 75.1 | 91.7 | 68.6 | 27.0 |
+| `ED-S-W` | 87.5 | 76.1 | 91.6 | 68.0 | 26.8 |
+| `End-to-end` | 89.2 | 76.5 | 91.5 | 70.5 | 26.2 |
+
+#### Ablation Study 2: Multi-task v.s. Single task:
+
+| Training_method | Recall(%) | AP(%) | mIoU(%) | Accuracy(%) | IoU(%) | Speed(ms/frame) |
+| --------------- | --------- | ----- | ------- | ----------- | ------ | --------------- |
+| `Det(only)` | 88.2 | 76.9 | - | - | - | 15.7 |
+| `Da-Seg(only)` | - | - | 92.0 | - | - | 14.8 |
+| `Ll-Seg(only)` | - | - | - | 79.6 | 27.9 | 14.8 |
+| `Multitask` | 89.2 | 76.5 | 91.5 | 70.5 | 26.2 | 24.4 |
+
+#### Ablation Study 3: Grid-based v.s. Region-based:
+
+| Training_method | Recall(%) | AP(%) | mIoU(%) | Accuracy(%) | IoU(%) | Speed(ms/frame) |
+| --------------- | --------- | ----- | ------- | ----------- | ------ | --------------- |
+| `R-CNNP Det(only)` | 79.0 | 67.3 | - | - | - | - |
+| `R-CNNP Seg(only)` | - | - | 90.2 | 59.5 | 24.0 | - |
+| `R-CNNP Multitask` | 77.2(-1.8)| 62.6(-4.7)| 86.8(-3.4)| 49.8(-9.7)| 21.5(-2.5)| 103.3 |
+| `YOLOP Det(only)` | 88.2 | 76.9 | - | - | - | - |
+| `YOLOP Seg(only)` | - | - | 91.6 | 69.9 | 26.5 | - |
+| `YOLOP Multitask` | 89.2(+1.0)| 76.5(-0.4)| 91.5(-0.1)| 70.5(+0.6)| 26.2(-0.3)| 24.4 |
+
+**Notes**:
+
+- Our work refers to the following works: `Multinet` ([paper](https://arxiv.org/pdf/1612.07695.pdf?utm_campaign=affiliate-ir-Optimise%20media%28%20South%20East%20Asia%29%20Pte.%20ltd._156_-99_national_R_all_ACQ_cpa_en&utm_content=&utm_source=%20388939),[code](https://github.com/MarvinTeichmann/MultiNet)),`DLT-Net` ([paper](https://ieeexplore.ieee.org/abstract/document/8937825)),`Faster R-CNN` ([paper](https://proceedings.neurips.cc/paper/2015/file/14bfa6bb14875e45bba028a21ed38046-Paper.pdf),[code](https://github.com/ShaoqingRen/faster_rcnn)),`YOLOv5`([code](https://github.com/ultralytics/yolov5)),`PSPNet`([paper](https://openaccess.thecvf.com/content_cvpr_2017/papers/Zhao_Pyramid_Scene_Parsing_CVPR_2017_paper.pdf),[code](https://github.com/hszhao/PSPNet)),`ENet`([paper](https://arxiv.org/pdf/1606.02147.pdf),[code](https://github.com/osmr/imgclsmob)),`SCNN`([paper](https://www.aaai.org/ocs/index.php/AAAI/AAAI18/paper/download/16802/16322),[code](https://github.com/XingangPan/SCNN)),`SAD-ENet`([paper](https://openaccess.thecvf.com/content_ICCV_2019/papers/Hou_Learning_Lightweight_Lane_Detection_CNNs_by_Self_Attention_Distillation_ICCV_2019_paper.pdf),[code](https://github.com/cardwing/Codes-for-Lane-Detection)). Thanks for their wonderful work.
+- In Table 4, E, D, S and W refer to the Encoder, the Detect head, the two Segment heads and the whole network, respectively. So the algorithm (first, we train only the Encoder and Detect head; then we freeze the Encoder and Detect head and train only the two Segment heads; finally, the entire network is trained jointly on all three tasks) can be denoted `ED-S-W`, and similarly for the others.
+
+---
+
+### Visualization
+
+#### Traffic Object Detection Results
+
+
+
+#### Drivable Area Segmentation Results
+
+
+
+#### Lane Line Segmentation Results
+
+
+
+**Notes**:
+
+- The lane line segmentation results are post-processed with curve fitting.
+
+---
+
+### Project Structure
+
+```python
+├─inference
+│ ├─images # inference images
+│ ├─output # inference result
+├─lib
+│ ├─config/default # configuration of training and validation
+│ ├─core
+│ │ ├─activations.py # activation function
+│ │ ├─evaluate.py # calculation of metric
+│ │ ├─function.py # training and validation of model
+│ │ ├─general.py #calculation of metric、nms、conversion of data-format、visualization
+│ │ ├─loss.py # loss function
+│ │ ├─postprocess.py # postprocess(refine da-seg and ll-seg, unrelated to paper)
+│ ├─dataset
+│ │ ├─AutoDriveDataset.py # Superclass dataset,general function
+│ │ ├─bdd.py # Subclass dataset,specific function
+│ │ ├─hust.py # Subclass dataset(Campus scene, unrelated to paper)
+│ │ ├─convect.py
+│ │ ├─DemoDataset.py # demo dataset(image, video and stream)
+│ ├─models
+│ │ ├─YOLOP.py # Setup and Configuration of model
+│ │ ├─light.py # Model lightweight(unrelated to paper, zwt)
+│ │ ├─common.py # calculation module
+│ ├─utils
+│ │ ├─augmentations.py # data augmentation
+│ │ ├─autoanchor.py # auto anchor(k-means)
+│ │ ├─split_dataset.py # (Campus scene, unrelated to paper)
+│ │ ├─utils.py # logging、device_select、time_measure、optimizer_select、model_save&initialize 、Distributed training
+│ ├─run
+│ │ ├─dataset/training time # Visualization, logging and model_save
+├─tools
+│ │ ├─demo.py # demo(folder、camera)
+│ │ ├─test.py
+│ │ ├─train.py
+├─toolkits
+│ │ ├─deploy # Deployment of model
+│ │ ├─datapre # Generation of gt(mask) for drivable area segmentation task
+├─weights # Pretraining model
+```
+
+---
+
+### Requirement
+
+This codebase was developed with Python 3.7, PyTorch 1.7+ and torchvision 0.8+:
+
+```
+conda install pytorch==1.7.0 torchvision==0.8.0 cudatoolkit=10.2 -c pytorch
+```
+
+See `requirements.txt` for the version requirements of the other dependencies:
+
+```setup
+pip install -r requirements.txt
+```
+
+### Data preparation
+
+#### Download
+
+- Download the images from [images](https://bdd-data.berkeley.edu/).
+
+- Download the detection annotations from [det_annotations](https://drive.google.com/file/d/1Ge-R8NTxG1eqd4zbryFo-1Uonuh0Nxyl/view?usp=sharing).
+- Download the drivable area segmentation annotations from [da_seg_annotations](https://drive.google.com/file/d/1xy_DhUZRHR8yrZG3OwTQAHhYTnXn7URv/view?usp=sharing).
+- Download the lane line segmentation annotations from [ll_seg_annotations](https://drive.google.com/file/d/1lDNTPIQj_YLNZVkksKM25CvCHuquJ8AP/view?usp=sharing).
+
+We recommend organizing the dataset with the following directory structure:
+
+```
+├─dataset root
+│ ├─images
+│ │ ├─train
+│ │ ├─val
+│ ├─det_annotations
+│ │ ├─train
+│ │ ├─val
+│ ├─da_seg_annotations
+│ │ ├─train
+│ │ ├─val
+│ ├─ll_seg_annotations
+│ │ ├─train
+│ │ ├─val
+```
+
+Update the dataset path configuration in `./lib/config/default.py`. The fields to edit are shown below.
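+
+These are the fields defined in `lib/config/default.py` in this repo (the paths below are placeholders for your local copies):
+
+```python
+_C.DATASET.DATAROOT = '/data/bdd100k/images'               # the path of the images folder
+_C.DATASET.LABELROOT = '/data/bdd100k/det_annotations'     # the path of the det_annotations folder
+_C.DATASET.MASKROOT = '/data/bdd100k/da_seg_annotations'   # the path of the da_seg_annotations folder
+_C.DATASET.LANEROOT = '/data/bdd100k/ll_seg_annotations'   # the path of the ll_seg_annotations folder
+```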
+
+### Training
+
+You can set the training configuration in `./lib/config/default.py` (including: pretrained model loading, loss functions, data augmentation, optimizer, warm-up and cosine annealing, auto-anchor, training epochs, and batch_size).
+
+
+
+If you want to try alternating optimization or single-task learning, set the corresponding option in `./lib/config/default.py` to `True`. (By default all of the options below are `False`, which means training multiple tasks end to end.)
+
+```python
+# Alternating optimization
+_C.TRAIN.SEG_ONLY = False # Only train two segmentation branchs
+_C.TRAIN.DET_ONLY = False # Only train detection branch
+_C.TRAIN.ENC_SEG_ONLY = False # Only train encoder and two segmentation branchs
+_C.TRAIN.ENC_DET_ONLY = False # Only train encoder and detection branch
+
+# Single task
+_C.TRAIN.DRIVABLE_ONLY = False # Only train da_segmentation task
+_C.TRAIN.LANE_ONLY = False # Only train ll_segmentation task
+_C.TRAIN.DET_ONLY = False # Only train detection task
+```
+
+Start training:
+
+```shell
+python tools/train.py
+```
+Multi-GPU training:
+```
+python -m torch.distributed.launch --nproc_per_node=N tools/train.py # N: the number of GPUs
+```
+
+### Evaluation
+
+You can set the evaluation configuration in `./lib/config/default.py` (including: batch_size and the NMS thresholds).
+
+Start evaluating:
+
+```shell
+python tools/test.py --weights weights/End-to-end.pth
+```
+
+
+
+### Demo Test
+
+We provide two testing methods.
+
+#### Folder
+
+Store the images to be tested under `--source`; the results will be saved under `--save-dir`:
+
+```shell
+python tools/demo.py --source inference/images
+```
+
+
+
+#### Camera (real-time)
+
+If a camera is connected to your computer, you can set `source` to the camera index (the default is 0).
+
+```shell
+python tools/demo.py --source 0
+```
+
+
+
+#### Demonstration
+
+
+
+
+
+### Deployment
+
+Our model can run inference in real time on a `Jetson Tx2` with a `Zed Camera`. We use `TensorRT` to accelerate inference. The complete code for model deployment and inference is provided in `./toolkits/deploy`.
+
+
+
+### Segmentation Label (Mask) Generation
+
+You can generate the mask labels for the drivable area segmentation task by running:
+
+```shell
+python toolkits/datasetpre/gen_bdd_seglabel.py
+```
+
+
+
+## Citation
+
+If you find our code and paper helpful for your research, please consider giving us a star :star: and a citation :pencil: :
+
+```BibTeX
+@article{wu2022yolop,
+ title={Yolop: You only look once for panoptic driving perception},
+ author={Wu, Dong and Liao, Man-Wen and Zhang, Wei-Tian and Wang, Xing-Gang and Bai, Xiang and Cheng, Wen-Qing and Liu, Wen-Yu},
+ journal={Machine Intelligence Research},
+ pages={1--13},
+ year={2022},
+ publisher={Springer}
+}
+```
+
diff --git a/README.md b/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..f0e7a1145e146e96184ebf17e3abca0ce44bd4f3
--- /dev/null
+++ b/README.md
@@ -0,0 +1,347 @@
+
+
+## You Only :eyes: Once for Panoptic :car: Perception
+> [**You Only Look Once for Panoptic Driving Perception**](https://link.springer.com/article/10.1007/s11633-022-1339-y)
+>
+> by Dong Wu, Manwen Liao, Weitian Zhang, [Xinggang Wang](https://xwcv.github.io/) :email:, [Xiang Bai](https://scholar.google.com/citations?user=UeltiQ4AAAAJ&hl=zh-CN), [Wenqing Cheng](http://eic.hust.edu.cn/professor/chengwenqing/), [Wenyu Liu](http://eic.hust.edu.cn/professor/liuwenyu/) [*School of EIC, HUST*](http://eic.hust.edu.cn/English/Home.htm)
+>
+> (:email:) corresponding author.
+>
+> *Machine Intelligence Research 2022 ([paper](https://link.springer.com/article/10.1007/s11633-022-1339-y))*
+
+---
+
+[中文文档](https://github.com/hustvl/YOLOP/blob/main/README%20_CH.md)
+
+### The Illustration of YOLOP
+
+
+
+### Contributions
+
+* We put forward an efficient multi-task network that can jointly handle three crucial tasks in autonomous driving: object detection, drivable area segmentation and lane detection. It saves computational costs and reduces inference time while improving the performance of each task. Our work is the first to reach real-time on embedded devices while maintaining state-of-the-art performance on the `BDD100K` dataset.
+
+* We design ablation experiments to verify the effectiveness of our multi-task scheme, proving that the three tasks can be learned jointly without tedious alternating optimization.
+
+* We design ablation experiments to show that the grid-based prediction mechanism of the detection task is more closely related to that of the semantic segmentation task, which we believe can serve as a reference for other multi-task learning research.
+
+### Results
+
+[](https://paperswithcode.com/sota/traffic-object-detection-on-bdd100k?p=yolop-you-only-look-once-for-panoptic-driving)
+#### Traffic Object Detection Result
+
+| Model | Recall(%) | mAP50(%) | Speed(fps) |
+| -------------- | --------- | -------- | ---------- |
+| `Multinet` | 81.3 | 60.2 | 8.6 |
+| `DLT-Net` | 89.4 | 68.4 | 9.3 |
+| `Faster R-CNN` | 81.2 | 64.9 | 8.8 |
+| `YOLOv5s` | 86.8 | 77.2 | 82 |
+| `YOLOP(ours)` | 89.2 | 76.5 | 41 |
+#### Drivable Area Segmentation Result
+
+| Model | mIOU(%) | Speed(fps) |
+| ------------- | ------- | ---------- |
+| `Multinet` | 71.6 | 8.6 |
+| `DLT-Net` | 71.3 | 9.3 |
+| `PSPNet` | 89.6 | 11.1 |
+| `YOLOP(ours)` | 91.5 | 41 |
+
+#### Lane Detection Result:
+
+| Model | mIOU(%) | IOU(%) |
+| ------------- | ------- | ------ |
+| `ENet` | 34.12 | 14.64 |
+| `SCNN` | 35.79 | 15.84 |
+| `ENet-SAD` | 36.56 | 16.02 |
+| `YOLOP(ours)` | 70.50 | 26.20 |
+
+#### Ablation Studies 1: End-to-end v.s. Step-by-step:
+
+| Training_method | Recall(%) | AP(%) | mIoU(%) | Accuracy(%) | IoU(%) |
+| --------------- | --------- | ----- | ------- | ----------- | ------ |
+| `ES-W` | 87.0 | 75.3 | 90.4 | 66.8 | 26.2 |
+| `ED-W` | 87.3 | 76.0 | 91.6 | 71.2 | 26.1 |
+| `ES-D-W` | 87.0 | 75.1 | 91.7 | 68.6 | 27.0 |
+| `ED-S-W` | 87.5 | 76.1 | 91.6 | 68.0 | 26.8 |
+| `End-to-end` | 89.2 | 76.5 | 91.5 | 70.5 | 26.2 |
+
+#### Ablation Studies 2: Multi-task v.s. Single task:
+
+| Training_method | Recall(%) | AP(%) | mIoU(%) | Accuracy(%) | IoU(%) | Speed(ms/frame) |
+| --------------- | --------- | ----- | ------- | ----------- | ------ | --------------- |
+| `Det(only)` | 88.2 | 76.9 | - | - | - | 15.7 |
+| `Da-Seg(only)` | - | - | 92.0 | - | - | 14.8 |
+| `Ll-Seg(only)` | - | - | - | 79.6 | 27.9 | 14.8 |
+| `Multitask` | 89.2 | 76.5 | 91.5 | 70.5 | 26.2 | 24.4 |
+
+#### Ablation Studies 3: Grid-based v.s. Region-based:
+
+| Training_method | Recall(%) | AP(%) | mIoU(%) | Accuracy(%) | IoU(%) | Speed(ms/frame) |
+| --------------- | --------- | ----- | ------- | ----------- | ------ | --------------- |
+| `R-CNNP Det(only)` | 79.0 | 67.3 | - | - | - | - |
+| `R-CNNP Seg(only)` | - | - | 90.2 | 59.5 | 24.0 | - |
+| `R-CNNP Multitask` | 77.2(-1.8)| 62.6(-4.7)| 86.8(-3.4)| 49.8(-9.7)| 21.5(-2.5)| 103.3 |
+| `YOLOP Det(only)` | 88.2 | 76.9 | - | - | - | - |
+| `YOLOP Seg(only)` | - | - | 91.6 | 69.9 | 26.5 | - |
+| `YOLOP Multitask` | 89.2(+1.0)| 76.5(-0.4)| 91.5(-0.1)| 70.5(+0.6)| 26.2(-0.3)| 24.4 |
+
+
+**Notes**:
+
+- The works we used for reference include `Multinet` ([paper](https://arxiv.org/pdf/1612.07695.pdf?utm_campaign=affiliate-ir-Optimise%20media%28%20South%20East%20Asia%29%20Pte.%20ltd._156_-99_national_R_all_ACQ_cpa_en&utm_content=&utm_source=%20388939),[code](https://github.com/MarvinTeichmann/MultiNet)),`DLT-Net` ([paper](https://ieeexplore.ieee.org/abstract/document/8937825)),`Faster R-CNN` ([paper](https://proceedings.neurips.cc/paper/2015/file/14bfa6bb14875e45bba028a21ed38046-Paper.pdf),[code](https://github.com/ShaoqingRen/faster_rcnn)),`YOLOv5s`([code](https://github.com/ultralytics/yolov5)),`PSPNet`([paper](https://openaccess.thecvf.com/content_cvpr_2017/papers/Zhao_Pyramid_Scene_Parsing_CVPR_2017_paper.pdf),[code](https://github.com/hszhao/PSPNet)),`ENet`([paper](https://arxiv.org/pdf/1606.02147.pdf),[code](https://github.com/osmr/imgclsmob)),`SCNN`([paper](https://www.aaai.org/ocs/index.php/AAAI/AAAI18/paper/download/16802/16322),[code](https://github.com/XingangPan/SCNN)),`SAD-ENet`([paper](https://openaccess.thecvf.com/content_ICCV_2019/papers/Hou_Learning_Lightweight_Lane_Detection_CNNs_by_Self_Attention_Distillation_ICCV_2019_paper.pdf),[code](https://github.com/cardwing/Codes-for-Lane-Detection)). Thanks for their wonderful work.
+- In Table 4, E, D, S and W refer to the Encoder, the Detect head, the two Segment heads and the whole network, respectively. So the algorithm (first, we train only the Encoder and Detect head; then we freeze the Encoder and Detect head and train only the two Segment heads; finally, the entire network is trained jointly on all three tasks) can be marked as `ED-S-W`, and the same for the others. A minimal sketch of this step-by-step paradigm is shown below.
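+
+The following is a minimal PyTorch sketch of the step-by-step idea, not the repo's actual training code (which is driven by the `_C.TRAIN.*` flags in `lib/config/default.py`); `encoder`, `detect_head` and `seg_heads` are hypothetical module names used only for illustration.
+
+```python
+import torch.nn as nn
+
+def set_trainable(module: nn.Module, flag: bool):
+    """Freeze (flag=False) or unfreeze (flag=True) a sub-network."""
+    for p in module.parameters():
+        p.requires_grad = flag
+
+# ED-S-W paradigm: (1) train Encoder + Detect head, (2) freeze them and train the
+# two Segment heads, (3) finally train the whole network jointly.
+def ed_s_w_schedule(encoder, detect_head, seg_heads, stage):
+    if stage == 1:                      # E + D
+        set_trainable(encoder, True)
+        set_trainable(detect_head, True)
+        for h in seg_heads:
+            set_trainable(h, False)
+    elif stage == 2:                    # S: Encoder and Detect head stay frozen
+        set_trainable(encoder, False)
+        set_trainable(detect_head, False)
+        for h in seg_heads:
+            set_trainable(h, True)
+    else:                               # W: train the whole network
+        for m in (encoder, detect_head, *seg_heads):
+            set_trainable(m, True)
+```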
+
+---
+
+### Visualization
+
+#### Traffic Object Detection Result
+
+
+
+#### Drivable Area Segmentation Result
+
+
+
+#### Lane Detection Result
+
+
+
+**Notes**:
+
+- The visualization of the lane detection results has been post-processed by quadratic fitting.
+
+---
+
+### Project Structure
+
+```python
+├─inference
+│ ├─images # inference images
+│ ├─output # inference result
+├─lib
+│ ├─config/default # configuration of training and validation
+│ ├─core
+│ │ ├─activations.py # activation function
+│ │ ├─evaluate.py # calculation of metric
+│ │ ├─function.py # training and validation of model
+│ │ ├─general.py #calculation of metric、nms、conversion of data-format、visualization
+│ │ ├─loss.py # loss function
+│ │ ├─postprocess.py # postprocess(refine da-seg and ll-seg, unrelated to paper)
+│ ├─dataset
+│ │ ├─AutoDriveDataset.py # Superclass dataset,general function
+│ │ ├─bdd.py # Subclass dataset,specific function
+│ │ ├─hust.py # Subclass dataset(Campus scene, unrelated to paper)
+│ │ ├─convect.py
+│ │ ├─DemoDataset.py # demo dataset(image, video and stream)
+│ ├─models
+│ │ ├─YOLOP.py # Setup and Configuration of model
+│ │ ├─light.py # Model lightweight(unrelated to paper, zwt)
+│ │ ├─common.py # calculation module
+│ ├─utils
+│ │ ├─augmentations.py # data augmentation
+│ │ ├─autoanchor.py # auto anchor(k-means)
+│ │ ├─split_dataset.py # (Campus scene, unrelated to paper)
+│ │ ├─utils.py # logging、device_select、time_measure、optimizer_select、model_save&initialize 、Distributed training
+│ ├─run
+│ │ ├─dataset/training time # Visualization, logging and model_save
+├─tools
+│ │ ├─demo.py # demo(folder、camera)
+│ │ ├─test.py
+│ │ ├─train.py
+├─toolkits
+│ │ ├─deploy # Deployment of model
+│ │ ├─datapre # Generation of gt(mask) for drivable area segmentation task
+├─weights # Pretraining model
+```
+
+---
+
+### Requirement
+
+This codebase has been developed with Python 3.7, PyTorch 1.7+ and torchvision 0.8+:
+
+```
+conda install pytorch==1.7.0 torchvision==0.8.0 cudatoolkit=10.2 -c pytorch
+```
+
+See `requirements.txt` for additional dependencies and version requirements.
+
+```setup
+pip install -r requirements.txt
+```
+
+### Data preparation
+
+#### Download
+
+- Download the images from [images](https://bdd-data.berkeley.edu/).
+
+- Download the annotations of detection from [det_annotations](https://drive.google.com/file/d/1Ge-R8NTxG1eqd4zbryFo-1Uonuh0Nxyl/view?usp=sharing).
+- Download the annotations of drivable area segmentation from [da_seg_annotations](https://drive.google.com/file/d/1xy_DhUZRHR8yrZG3OwTQAHhYTnXn7URv/view?usp=sharing).
+- Download the annotations of lane line segmentation from [ll_seg_annotations](https://drive.google.com/file/d/1lDNTPIQj_YLNZVkksKM25CvCHuquJ8AP/view?usp=sharing).
+
+We recommend the dataset directory structure to be the following:
+
+```
+# Files that share the same id (name) across these folders correspond to the same frame
+├─dataset root
+│ ├─images
+│ │ ├─train
+│ │ ├─val
+│ ├─det_annotations
+│ │ ├─train
+│ │ ├─val
+│ ├─da_seg_annotations
+│ │ ├─train
+│ │ ├─val
+│ ├─ll_seg_annotations
+│ │ ├─train
+│ │ ├─val
+```
+
+Update your dataset paths in `./lib/config/default.py`. You can verify the configuration as shown below.
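+
+For a quick check that the paths are picked up (run from the repo root; the printed values come straight from `lib/config/default.py`):
+
+```python
+from lib.config import cfg
+
+print(cfg.DATASET.DATAROOT)   # images folder
+print(cfg.DATASET.LABELROOT)  # detection annotations
+print(cfg.DATASET.MASKROOT)   # drivable area masks
+print(cfg.DATASET.LANEROOT)   # lane line masks
+```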
+
+### Training
+
+You can set the training configuration in `./lib/config/default.py` (including: pretrained model loading, loss, data augmentation, optimizer, warm-up and cosine annealing, auto-anchor, training epochs, and batch_size). A few commonly adjusted fields are shown below.
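+
+A few of the most commonly adjusted fields, with the values they currently have in `lib/config/default.py`:
+
+```python
+_C.TRAIN.LR0 = 0.001               # initial learning rate
+_C.TRAIN.WARMUP_EPOCHS = 5.0       # warm-up length in epochs
+_C.TRAIN.END_EPOCH = 200           # total number of training epochs
+_C.TRAIN.BATCH_SIZE_PER_GPU = 2    # batch size per GPU
+_C.TRAIN.OPTIMIZER = 'adam'        # optimizer type
+```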
+
+If you want to try alternating optimization or train the model for a single task, please set the corresponding option in `./lib/config/default.py` to `True`. (By default, all of the options below are `False`, which means training multiple tasks end to end.)
+
+```python
+# Alternating optimization
+_C.TRAIN.SEG_ONLY = False # Only train two segmentation branchs
+_C.TRAIN.DET_ONLY = False # Only train detection branch
+_C.TRAIN.ENC_SEG_ONLY = False # Only train encoder and two segmentation branchs
+_C.TRAIN.ENC_DET_ONLY = False # Only train encoder and detection branch
+
+# Single task
+_C.TRAIN.DRIVABLE_ONLY = False # Only train da_segmentation task
+_C.TRAIN.LANE_ONLY = False # Only train ll_segmentation task
+_C.TRAIN.DET_ONLY = False # Only train detection task
+```
+
+Start training:
+
+```shell
+python tools/train.py
+```
+Multi-GPU mode:
+```shell
+python -m torch.distributed.launch --nproc_per_node=N tools/train.py # N: the number of GPUs
+```
+
+
+### Evaluation
+
+You can set the evaluation configuration in `./lib/config/default.py` (including: batch_size and the NMS thresholds). The relevant fields are shown below.
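+
+The relevant fields in `lib/config/default.py` (current values shown):
+
+```python
+_C.TEST.BATCH_SIZE_PER_GPU = 8
+_C.TEST.NMS_CONF_THRESHOLD = 0.001   # confidence threshold used by NMS
+_C.TEST.NMS_IOU_THRESHOLD = 0.6      # IoU threshold used by NMS
+```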
+
+Start evaluating:
+
+```shell
+python tools/test.py --weights weights/End-to-end.pth
+```
+
+
+
+### Demo Test
+
+We provide two testing methods.
+
+#### Folder
+
+Store the images or videos under `--source`; the inference results will be saved to `--save-dir`:
+
+```shell
+python tools/demo.py --source inference/images
+```
+
+
+
+#### Camera
+
+If a camera is connected to your computer, you can set `source` to the camera index (the default is 0).
+
+```shell
+python tools/demo.py --source 0
+```
+
+
+
+#### Demonstration
+
+
+
+ | input |
+ output |
+
+
+  |
+  |
+
+
+  |
+  |
+
+
+
+
+
+### Deployment
+
+Our model can run inference in real time on a `Jetson Tx2`, with a `Zed Camera` to capture images. We use `TensorRT` to speed up inference. The code for model deployment and inference is provided in `./toolkits/deploy`.
+
+
+
+### Segmentation Label(Mask) Generation
+
+You can generate the mask labels for the drivable area segmentation task by running:
+
+```shell
+python toolkits/datasetpre/gen_bdd_seglabel.py
+```
+
+
+
+#### Model Transfer
+
+Before running inference with the TensorRT C++ API, you need to convert the `.pth` file into a binary file that can be read by C++.
+
+```shell
+python toolkits/deploy/gen_wts.py
+```
+
+After running the above command, you obtain a binary file named `yolop.wts`.
+
+
+
+#### Running Inference
+
+TensorRT needs an engine file for inference. Building an engine is time-consuming, so it is convenient to save the engine file and reuse it for every subsequent run. This logic is integrated in `main.cpp`, which decides whether to build an engine based on whether your engine file already exists. The idea is sketched below.
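+
+A minimal Python sketch of that check-then-build logic (the actual implementation in this repo is the C++ code in `main.cpp`; `build_fn` here is a placeholder for whatever builds and serializes the TensorRT engine):
+
+```python
+import os
+
+def get_engine(engine_path, build_fn):
+    """Return serialized engine bytes, building and caching them only when the file is missing."""
+    if os.path.exists(engine_path):          # reuse the cached engine
+        with open(engine_path, "rb") as f:
+            return f.read()
+    engine_bytes = build_fn()                # slow path: build the engine once
+    with open(engine_path, "wb") as f:       # cache it for the next run
+        f.write(engine_bytes)
+    return engine_bytes
+```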
+
+
+
+### Third-Party Resources
+* YOLOP OpenCV-DNN C++ Demo: [YOLOP-opencv-dnn](https://github.com/hpc203/YOLOP-opencv-dnn) from [hpc203](https://github.com/hpc203)
+* YOLOP ONNXRuntime C++ Demo: [lite.ai.toolkit](https://github.com/DefTruth/lite.ai.toolkit/blob/main/lite/ort/cv/yolop.cpp) from [DefTruth](https://github.com/DefTruth)
+* YOLOP NCNN C++ Demo: [YOLOP-NCNN](https://github.com/EdVince/YOLOP-NCNN) from [EdVince](https://github.com/EdVince)
+* YOLOP MNN C++ Demo: [YOLOP-MNN](https://github.com/DefTruth/lite.ai.toolkit/blob/main/lite/mnn/cv/mnn_yolop.cpp) from [DefTruth](https://github.com/DefTruth)
+* YOLOP TNN C++ Demo: [YOLOP-TNN](https://github.com/DefTruth/lite.ai.toolkit/blob/main/lite/tnn/cv/tnn_yolop.cpp) from [DefTruth](https://github.com/DefTruth)
+
+
+
+## Citation
+
+If you find our paper and code useful for your research, please consider giving a star :star: and citation :pencil: :
+
+```BibTeX
+@article{wu2022yolop,
+ title={Yolop: You only look once for panoptic driving perception},
+ author={Wu, Dong and Liao, Man-Wen and Zhang, Wei-Tian and Wang, Xing-Gang and Bai, Xiang and Cheng, Wen-Qing and Liu, Wen-Yu},
+ journal={Machine Intelligence Research},
+ pages={1--13},
+ year={2022},
+ publisher={Springer}
+}
+```
+
diff --git a/export_onnx.py b/export_onnx.py
new file mode 100644
index 0000000000000000000000000000000000000000..8f671423e8414b431252a95aafa6a9569b5e9bf2
--- /dev/null
+++ b/export_onnx.py
@@ -0,0 +1,200 @@
+import torch
+import torch.nn as nn
+from lib.models.common import Conv, SPP, Bottleneck, BottleneckCSP, Focus, Concat, Detect, SharpenConv
+from torch.nn import Upsample
+from lib.utils import check_anchor_order
+from lib.utils import initialize_weights
+import argparse
+import onnx
+import onnxruntime as ort
+import onnxsim
+
+import math
+import cv2
+
+# The lane line and drivable area segmentation branches do not share information with each other and are not linked
+YOLOP = [
+ [24, 33, 42], # Det_out_idx, Da_Segout_idx, LL_Segout_idx
+ [-1, Focus, [3, 32, 3]], # 0
+ [-1, Conv, [32, 64, 3, 2]], # 1
+ [-1, BottleneckCSP, [64, 64, 1]], # 2
+ [-1, Conv, [64, 128, 3, 2]], # 3
+ [-1, BottleneckCSP, [128, 128, 3]], # 4
+ [-1, Conv, [128, 256, 3, 2]], # 5
+ [-1, BottleneckCSP, [256, 256, 3]], # 6
+ [-1, Conv, [256, 512, 3, 2]], # 7
+ [-1, SPP, [512, 512, [5, 9, 13]]], # 8 SPP
+ [-1, BottleneckCSP, [512, 512, 1, False]], # 9
+ [-1, Conv, [512, 256, 1, 1]], # 10
+ [-1, Upsample, [None, 2, 'nearest']], # 11
+ [[-1, 6], Concat, [1]], # 12
+ [-1, BottleneckCSP, [512, 256, 1, False]], # 13
+ [-1, Conv, [256, 128, 1, 1]], # 14
+ [-1, Upsample, [None, 2, 'nearest']], # 15
+ [[-1, 4], Concat, [1]], # 16 #Encoder
+
+ [-1, BottleneckCSP, [256, 128, 1, False]], # 17
+ [-1, Conv, [128, 128, 3, 2]], # 18
+ [[-1, 14], Concat, [1]], # 19
+ [-1, BottleneckCSP, [256, 256, 1, False]], # 20
+ [-1, Conv, [256, 256, 3, 2]], # 21
+ [[-1, 10], Concat, [1]], # 22
+ [-1, BottleneckCSP, [512, 512, 1, False]], # 23
+ [[17, 20, 23], Detect,
+ [1, [[3, 9, 5, 11, 4, 20], [7, 18, 6, 39, 12, 31], [19, 50, 38, 81, 68, 157]], [128, 256, 512]]],
+ # Detection head 24: from_(features from specific layers), block, nc(num_classes) anchors ch(channels)
+
+ [16, Conv, [256, 128, 3, 1]], # 25
+ [-1, Upsample, [None, 2, 'nearest']], # 26
+ [-1, BottleneckCSP, [128, 64, 1, False]], # 27
+ [-1, Conv, [64, 32, 3, 1]], # 28
+ [-1, Upsample, [None, 2, 'nearest']], # 29
+ [-1, Conv, [32, 16, 3, 1]], # 30
+ [-1, BottleneckCSP, [16, 8, 1, False]], # 31
+ [-1, Upsample, [None, 2, 'nearest']], # 32
+ [-1, Conv, [8, 2, 3, 1]], # 33 Driving area segmentation head
+
+ [16, Conv, [256, 128, 3, 1]], # 34
+ [-1, Upsample, [None, 2, 'nearest']], # 35
+ [-1, BottleneckCSP, [128, 64, 1, False]], # 36
+ [-1, Conv, [64, 32, 3, 1]], # 37
+ [-1, Upsample, [None, 2, 'nearest']], # 38
+ [-1, Conv, [32, 16, 3, 1]], # 39
+ [-1, BottleneckCSP, [16, 8, 1, False]], # 40
+ [-1, Upsample, [None, 2, 'nearest']], # 41
+ [-1, Conv, [8, 2, 3, 1]] # 42 Lane line segmentation head
+]
+
+
+class MCnet(nn.Module):
+ def __init__(self, block_cfg):
+ super(MCnet, self).__init__()
+ layers, save = [], []
+ self.nc = 1 # traffic or not
+ self.detector_index = -1
+ self.det_out_idx = block_cfg[0][0]
+ self.seg_out_idx = block_cfg[0][1:]
+ self.num_anchors = 3
+ self.num_outchannel = 5 + self.nc # dx,dy,dw,dh,obj_conf+cls_conf
+ # Build model
+ for i, (from_, block, args) in enumerate(block_cfg[1:]):
+ block = eval(block) if isinstance(block, str) else block # eval strings
+ if block is Detect:
+ self.detector_index = i
+ block_ = block(*args)
+ block_.index, block_.from_ = i, from_
+ layers.append(block_)
+ save.extend(x % i for x in ([from_] if isinstance(from_, int) else from_) if x != -1) # append to savelist
+ assert self.detector_index == block_cfg[0][0]
+
+ self.model, self.save = nn.Sequential(*layers), sorted(save)
+ self.names = [str(i) for i in range(self.nc)]
+
+ # set stride、anchor for detector
+ Detector = self.model[self.detector_index] # detector
+ if isinstance(Detector, Detect):
+ s = 128 # 2x min stride
+ # for x in self.forward(torch.zeros(1, 3, s, s)):
+ # print (x.shape)
+ with torch.no_grad():
+ model_out = self.forward(torch.zeros(1, 3, s, s))
+ detects, _, _ = model_out
+ Detector.stride = torch.tensor([s / x.shape[-2] for x in detects]) # forward
+ # print("stride"+str(Detector.stride ))
+ Detector.anchors /= Detector.stride.view(-1, 1, 1) # Set the anchors for the corresponding scale
+ check_anchor_order(Detector)
+ self.stride = Detector.stride
+ # self._initialize_biases()
+ initialize_weights(self)
+
+ def forward(self, x):
+ cache = []
+ out = []
+ det_out = None
+ for i, block in enumerate(self.model):
+ if block.from_ != -1:
+ x = cache[block.from_] if isinstance(block.from_, int) \
+ else [x if j == -1 else cache[j] for j in
+ block.from_] # calculate concat detect
+ x = block(x)
+ if i in self.seg_out_idx: # save driving area segment result
+ # m = nn.Sigmoid()
+ # out.append(m(x))
+ out.append(torch.sigmoid(x))
+ if i == self.detector_index:
+ # det_out = x
+ if self.training:
+ det_out = x
+ else:
+ det_out = x[0] # (torch.cat(z, 1), input_feat) if test
+ cache.append(x if block.index in self.save else None)
+ return det_out, out[0], out[1] # det, da, ll
+ # (1,na*ny*nx*nl,no=2+2+1+nc=xy+wh+obj_conf+cls_prob), (1,2,h,w) (1,2,h,w)
+
+ def _initialize_biases(self, cf=None): # initialize biases into Detect(), cf is class frequency
+ # https://arxiv.org/abs/1708.02002 section 3.3
+ # cf = torch.bincount(torch.tensor(np.concatenate(dataset.labels, 0)[:, 0]).long(), minlength=nc) + 1.
+ # m = self.model[-1] # Detect() module
+ m = self.model[self.detector_index] # Detect() module
+ for mi, s in zip(m.m, m.stride): # from
+ b = mi.bias.view(m.na, -1) # conv.bias(255) to (3,85)
+ b[:, 4] += math.log(8 / (640 / s) ** 2) # obj (8 objects per 640 image)
+ b[:, 5:] += math.log(0.6 / (m.nc - 0.99)) if cf is None else torch.log(cf / cf.sum()) # cls
+ mi.bias = torch.nn.Parameter(b.view(-1), requires_grad=True)
+
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser()
+ parser.add_argument('--height', type=int, default=640) # height
+ parser.add_argument('--width', type=int, default=640) # width
+ args = parser.parse_args()
+
+ do_simplify = True
+
+ device = 'cuda' if torch.cuda.is_available() else 'cpu'
+ model = MCnet(YOLOP)
+ checkpoint = torch.load('./weights/End-to-end.pth', map_location=device)
+ model.load_state_dict(checkpoint['state_dict'])
+ model.eval()
+
+ height = args.height
+ width = args.width
+ print("Load ./weights/End-to-end.pth done!")
+ onnx_path = f'./weights/yolop-{height}-{width}.onnx'
+ inputs = torch.randn(1, 3, height, width)
+
+ print(f"Converting to {onnx_path}")
+ torch.onnx.export(model, inputs, onnx_path,
+ verbose=False, opset_version=12, input_names=['images'],
+ output_names=['det_out', 'drive_area_seg', 'lane_line_seg'])
+ print('convert', onnx_path, 'to onnx finish!!!')
+ # Checks
+ model_onnx = onnx.load(onnx_path) # load onnx model
+ onnx.checker.check_model(model_onnx) # check onnx model
+ print(onnx.helper.printable_graph(model_onnx.graph)) # print
+
+ if do_simplify:
+ print(f'simplifying with onnx-simplifier {onnxsim.__version__}...')
+ model_onnx, check = onnxsim.simplify(model_onnx, check_n=3)
+ assert check, 'assert check failed'
+ onnx.save(model_onnx, onnx_path)
+
+ x = inputs.cpu().numpy()
+ try:
+ sess = ort.InferenceSession(onnx_path)
+
+ for ii in sess.get_inputs():
+ print("Input: ", ii)
+ for oo in sess.get_outputs():
+ print("Output: ", oo)
+
+        print('read onnx using onnxruntime success')
+ except Exception as e:
+ print('read failed')
+ raise e
+
+ """
+ PYTHONPATH=. python3 ./export_onnx.py --height 640 --width 640
+ PYTHONPATH=. python3 ./export_onnx.py --height 1280 --width 1280
+ PYTHONPATH=. python3 ./export_onnx.py --height 320 --width 320
+ """
diff --git a/hubconf.py b/hubconf.py
new file mode 100644
index 0000000000000000000000000000000000000000..8f04375bdbc66f7c7d1d9349bee8198278d85dc5
--- /dev/null
+++ b/hubconf.py
@@ -0,0 +1,30 @@
+# YOLOP by hustvl, MIT License
+dependencies = ['torch']
+import torch
+from lib.utils.utils import select_device
+from lib.config import cfg
+from lib.models import get_net
+from pathlib import Path
+import os
+
+def yolop(pretrained=True, device="cpu"):
+ """Creates YOLOP model
+ Arguments:
+ pretrained (bool): load pretrained weights into the model
+ device (str): cuda device i.e. 0 or 0,1,2,3 or cpu
+ Returns:
+ YOLOP pytorch model
+ """
+ device = select_device(device = device)
+ model = get_net(cfg)
+ if pretrained:
+ path = os.path.join(Path(__file__).resolve().parent, "weights/End-to-end.pth")
+ checkpoint = torch.load(path, map_location= device)
+ model.load_state_dict(checkpoint['state_dict'])
+ model = model.to(device)
+ return model
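+
+# Illustrative usage (assumes this repo is consumed through torch.hub; the yolop() function
+# above is the entry point that torch.hub.load() calls):
+#   model = torch.hub.load('hustvl/yolop', 'yolop', pretrained=True, device='cpu')
+#   det_out, da_seg_out, ll_seg_out = model(torch.randn(1, 3, 640, 640))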
+
+
+
+
diff --git a/lib/__init__.py b/lib/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/lib/__pycache__/__init__.cpython-310.pyc b/lib/__pycache__/__init__.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c81ef5d61e81784d947b7452d423f16a9418a1c1
Binary files /dev/null and b/lib/__pycache__/__init__.cpython-310.pyc differ
diff --git a/lib/config/__init__.py b/lib/config/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..34052907947a5431f48239e16ec45820a06bc441
--- /dev/null
+++ b/lib/config/__init__.py
@@ -0,0 +1,2 @@
+from .default import _C as cfg
+from .default import update_config
\ No newline at end of file
diff --git a/lib/config/__pycache__/__init__.cpython-310.pyc b/lib/config/__pycache__/__init__.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..90b172955d3f6847216d7cb0a9c3d5dbcc85959b
Binary files /dev/null and b/lib/config/__pycache__/__init__.cpython-310.pyc differ
diff --git a/lib/config/__pycache__/__init__.cpython-37.pyc b/lib/config/__pycache__/__init__.cpython-37.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2a6e3db761f6b1eb9b0357a686c486f4728655ce
Binary files /dev/null and b/lib/config/__pycache__/__init__.cpython-37.pyc differ
diff --git a/lib/config/__pycache__/default.cpython-310.pyc b/lib/config/__pycache__/default.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b4d8b7175d93b8f1a32040de5418ba16cdd0e58a
Binary files /dev/null and b/lib/config/__pycache__/default.cpython-310.pyc differ
diff --git a/lib/config/__pycache__/default.cpython-37.pyc b/lib/config/__pycache__/default.cpython-37.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..baab70c9f899175fad40e9e1330fda1704fd7de5
Binary files /dev/null and b/lib/config/__pycache__/default.cpython-37.pyc differ
diff --git a/lib/config/default.py b/lib/config/default.py
new file mode 100644
index 0000000000000000000000000000000000000000..a66d2377ad341b0048680eb27cf835ad82017889
--- /dev/null
+++ b/lib/config/default.py
@@ -0,0 +1,165 @@
+import os
+import platform
+from yacs.config import CfgNode as CN
+
+
+_C = CN()
+
+_C.LOG_DIR = 'runs/'
+_C.GPUS = (0,)  # use only the first GPU
+# Auto-detect optimal worker count based on OS and CPU cores
+# Windows: use 0 to avoid multiprocessing issues (most stable)
+# Linux: use 8 workers for better performance (8-16 recommended on an RTX 4090D)
+_C.WORKERS = 0 if platform.system() == 'Windows' else 8
+_C.PIN_MEMORY = False  # set True to speed up host-to-GPU data transfer (e.g. on an RTX 4090D)
+_C.PRINT_FREQ = 20
+_C.AUTO_RESUME = False  # Resume from the last training interrupt
+_C.NEED_AUTOANCHOR = False  # Re-select the prior anchors (k-means); when training from scratch (epoch=0), set this to true!
+_C.DEBUG = False
+_C.num_seg_class = 2
+
+# Cudnn related params
+_C.CUDNN = CN()
+_C.CUDNN.BENCHMARK = True
+_C.CUDNN.DETERMINISTIC = False
+_C.CUDNN.ENABLED = True
+
+
+# common params for NETWORK
+_C.MODEL = CN(new_allowed=True)
+_C.MODEL.NAME = 'yolop_yolov11_small'
+_C.MODEL.STRU_WITHSHARE = False #add share_block to segbranch
+_C.MODEL.HEADS_NAME = ['']
+_C.MODEL.PRETRAINED = ""
+_C.MODEL.PRETRAINED_DET = ""  # deprecated
+_C.MODEL.IMAGE_SIZE = [640, 640] # width * height, ex: 192 * 256
+_C.MODEL.EXTRA = CN(new_allowed=True)
+
+
+# loss params
+_C.LOSS = CN(new_allowed=True)
+_C.LOSS.LOSS_NAME = ''
+# YOLOv11 Multi-head lambda weights: [cls, obj(unused), box, da_seg, ll_seg, ll_iou]
+# Extra weighting applied on top of each GAIN: final loss = raw loss × GAIN × LAMBDA
+_C.LOSS.MULTI_HEAD_LAMBDA = [1.0, 1.0, 1.0, 0.5, 0.5, 0.8]  # down-weight the segmentation tasks
+_C.LOSS.FL_GAMMA = 0.0 # focal loss gamma
+_C.LOSS.CLS_POS_WEIGHT = 1.0 # classification loss positive weights
+_C.LOSS.OBJ_POS_WEIGHT = 1.0 # object loss positive weights (YOLOv5 only)
+_C.LOSS.SEG_POS_WEIGHT = 1.0 # segmentation loss positive weights
+_C.LOSS.BOX_GAIN = 7.5  # box loss gain (YOLOv11 default, suits the small-magnitude box loss)
+_C.LOSS.CLS_GAIN = 0.5 # classification loss gain
+_C.LOSS.OBJ_GAIN = 1.0 # object loss gain (YOLOv5 only, unused in YOLOv11)
+_C.LOSS.DA_SEG_GAIN = 0.2  # driving area seg loss gain (lowered from 2.0; the seg loss has a large magnitude, so it needs a small weight)
+_C.LOSS.LL_SEG_GAIN = 0.2  # lane line seg loss gain (lowered from 2.0)
+_C.LOSS.LL_IOU_GAIN = 0.2  # lane line IoU loss gain (lowered from 2.0; the IoU loss has a moderate magnitude)
+
+
+# DATASET related params
+_C.DATASET = CN(new_allowed=True)
+_C.DATASET.DATAROOT = 'E:\\minimake\\100k' # the path of images folder
+_C.DATASET.LABELROOT = 'E:/minimake/det_annotations/zwt/bdd/bdd100k/labels/det_annotations' # the path of det_annotations folder label
+_C.DATASET.MASKROOT = 'E:/minimake/da_seg_annotations' # the path of da_seg_annotations folder mask
+_C.DATASET.LANEROOT = 'E:\\minimake\\ll_seg_annotations' # the path of ll_seg_annotations folder lane
+_C.DATASET.DATASET = 'BddDataset'
+_C.DATASET.TRAIN_SET = 'train'
+_C.DATASET.TEST_SET = 'val'
+_C.DATASET.DATA_FORMAT = 'jpg'
+_C.DATASET.SELECT_DATA = False
+_C.DATASET.ORG_IMG_SIZE = [720, 1280]
+
+# training data augmentation
+_C.DATASET.FLIP = True
+_C.DATASET.SCALE_FACTOR = 0.5 # 0.25
+_C.DATASET.ROT_FACTOR = 15 # 10
+_C.DATASET.TRANSLATE = 0.1
+_C.DATASET.SHEAR = 0.0
+_C.DATASET.COLOR_RGB = False
+_C.DATASET.HSV_H = 0.015 # image HSV-Hue augmentation (fraction)
+_C.DATASET.HSV_S = 0.7 # image HSV-Saturation augmentation (fraction)
+_C.DATASET.HSV_V = 0.4 # image HSV-Value augmentation (fraction)
+# TODO: more augmet params to add
+
+
+# train
+_C.TRAIN = CN(new_allowed=True)
+# A large batch size (64) needs a larger learning rate, scaled linearly: lr = base_lr * (batch_size / base_batch)
+# base: lr=0.001, batch=4 → batch=64: lr = 0.001 * (64/4) = 0.016
+_C.TRAIN.LR0 = 0.001  # initial learning rate for batch_size=64 (scaled from 0.001)
+_C.TRAIN.LRF = 0.01  # final OneCycleLR learning rate (lr0 * lrf) - YOLOv11 uses 0.01
+_C.TRAIN.WARMUP_EPOCHS = 5.0  # large batches extend the warm-up to 5 epochs
+_C.TRAIN.WARMUP_BIASE_LR = 0.1
+_C.TRAIN.WARMUP_MOMENTUM = 0.8
+
+_C.TRAIN.OPTIMIZER = 'adam'
+_C.TRAIN.MOMENTUM = 0.937
+_C.TRAIN.WD = 0.0005
+_C.TRAIN.NESTEROV = True
+_C.TRAIN.GAMMA1 = 0.99
+_C.TRAIN.GAMMA2 = 0.0
+
+_C.TRAIN.BEGIN_EPOCH = 0
+_C.TRAIN.END_EPOCH = 200 # YOLOv11 recommends 200+ epochs
+
+_C.TRAIN.VAL_FREQ = 1
+_C.TRAIN.BATCH_SIZE_PER_GPU = 2  # an RTX 4090D (24 GB) can support batch_size=64
+_C.TRAIN.SHUFFLE = True
+
+_C.TRAIN.IOU_THRESHOLD = 0.2
+_C.TRAIN.ANCHOR_THRESHOLD = 4.0
+
+# if training 3 tasks end-to-end, set all parameters as True
+# Alternating optimization
+_C.TRAIN.SEG_ONLY = False # Only train two segmentation branchs
+_C.TRAIN.DET_ONLY = False # Only train detection branch
+_C.TRAIN.ENC_SEG_ONLY = False # Only train encoder and two segmentation branchs
+_C.TRAIN.ENC_DET_ONLY = False # Only train encoder and detection branch
+
+# Single task
+_C.TRAIN.DRIVABLE_ONLY = False # Only train da_segmentation task
+_C.TRAIN.LANE_ONLY = False # Only train ll_segmentation task
+_C.TRAIN.DET_ONLY = False # Only train detection task
+
+
+
+
+_C.TRAIN.PLOT = True #
+
+# testing
+_C.TEST = CN(new_allowed=True)
+_C.TEST.BATCH_SIZE_PER_GPU = 8
+_C.TEST.MODEL_FILE = ''
+_C.TEST.SAVE_JSON = False
+_C.TEST.SAVE_TXT = False
+_C.TEST.PLOTS = True
+_C.TEST.NMS_CONF_THRESHOLD = 0.001
+_C.TEST.NMS_IOU_THRESHOLD = 0.6
+
+
+def update_config(cfg, args):
+ cfg.defrost()
+ # cfg.merge_from_file(args.cfg)
+
+ if args.modelDir:
+ cfg.OUTPUT_DIR = args.modelDir
+
+ if args.logDir:
+ cfg.LOG_DIR = args.logDir
+
+ # if args.conf_thres:
+ # cfg.TEST.NMS_CONF_THRESHOLD = args.conf_thres
+
+ # if args.iou_thres:
+ # cfg.TEST.NMS_IOU_THRESHOLD = args.iou_thres
+
+
+
+ # cfg.MODEL.PRETRAINED = os.path.join(
+ # cfg.DATA_DIR, cfg.MODEL.PRETRAINED
+ # )
+ #
+ # if cfg.TEST.MODEL_FILE:
+ # cfg.TEST.MODEL_FILE = os.path.join(
+ # cfg.DATA_DIR, cfg.TEST.MODEL_FILE
+ # )
+
+ cfg.freeze()
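+
+# Illustrative usage (update_config only reads the `modelDir` and `logDir` attributes of `args`,
+# so any object carrying them works):
+#   import argparse
+#   args = argparse.Namespace(modelDir='', logDir='runs/')
+#   update_config(_C, args)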
diff --git a/lib/config/yolov11.py b/lib/config/yolov11.py
new file mode 100644
index 0000000000000000000000000000000000000000..ee0bdcd3464b6023c15e7e4e221be623f7e2ea9e
--- /dev/null
+++ b/lib/config/yolov11.py
@@ -0,0 +1,30 @@
+"""
+YOLOv11 Backbone 配置文件
+使用 YOLOv11 作为 backbone 的 YOLOP 模型配置
+"""
+import os
+import platform
+from .default import _C as cfg
+
+# Override the default configuration to use YOLOv11
+cfg.MODEL.USE_YOLOV11 = True
+cfg.MODEL.YOLOV11_SCALE = 's'  # 'n', 's', 'm', 'l', 'x'
+cfg.MODEL.YOLOV11_WEIGHTS = ''  # 'weights/yolo11s.pt'
+cfg.MODEL.FREEZE_BACKBONE = False  # whether to freeze the backbone
+
+# Training configuration
+cfg.TRAIN.BATCH_SIZE_PER_GPU = 8  # YOLOv11s can use a larger batch size
+cfg.TRAIN.END_EPOCH = 200  # YOLOv11 recommends 200+ epochs
+
+# Learning rate configuration (a larger learning rate can be used when the backbone is frozen)
+cfg.TRAIN.LR0 = 0.01  # increase the learning rate when the backbone is frozen
+cfg.TRAIN.LRF = 0.01
+cfg.TRAIN.WARMUP_EPOCHS = 3.0  # shorter warm-up
+
+# Loss weights (adapted for YOLOv11)
+cfg.LOSS.MULTI_HEAD_LAMBDA = [1.0, 1.0, 1.0, 0.5, 0.5, 0.8]
+cfg.LOSS.BOX_GAIN = 7.5
+cfg.LOSS.CLS_GAIN = 0.5
+cfg.LOSS.DA_SEG_GAIN = 0.5
+cfg.LOSS.LL_SEG_GAIN = 0.5
+cfg.LOSS.LL_IOU_GAIN = 1.0
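+
+# Illustrative usage (assumption: importing this module mutates the shared default config,
+# because `cfg` above is the same `_C` object exposed by lib/config):
+#   from lib.config.yolov11 import cfg
+#   print(cfg.MODEL.YOLOV11_SCALE, cfg.TRAIN.BATCH_SIZE_PER_GPU)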
diff --git a/lib/core/__init__.py b/lib/core/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..c8baaf0be623aaa200324da599152ce234f40d21
--- /dev/null
+++ b/lib/core/__init__.py
@@ -0,0 +1 @@
+from .function import AverageMeter
\ No newline at end of file
diff --git a/lib/core/__pycache__/__init__.cpython-310.pyc b/lib/core/__pycache__/__init__.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..651af6761baa4e8beebc12ab3d8adcaca6995483
Binary files /dev/null and b/lib/core/__pycache__/__init__.cpython-310.pyc differ
diff --git a/lib/core/__pycache__/__init__.cpython-37.pyc b/lib/core/__pycache__/__init__.cpython-37.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c50fcc9d12b541969662ee7883fd4def7e32230e
Binary files /dev/null and b/lib/core/__pycache__/__init__.cpython-37.pyc differ
diff --git a/lib/core/__pycache__/evaluate.cpython-310.pyc b/lib/core/__pycache__/evaluate.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a523ecd53ea90e2f06b53f0da6b01262f30abf9a
Binary files /dev/null and b/lib/core/__pycache__/evaluate.cpython-310.pyc differ
diff --git a/lib/core/__pycache__/evaluate.cpython-37.pyc b/lib/core/__pycache__/evaluate.cpython-37.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..89c294cb9c9170e51efd794cbf61df660741fa33
Binary files /dev/null and b/lib/core/__pycache__/evaluate.cpython-37.pyc differ
diff --git a/lib/core/__pycache__/function.cpython-310.pyc b/lib/core/__pycache__/function.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e2ba40361167d2530cd135746f35d4dde86595af
Binary files /dev/null and b/lib/core/__pycache__/function.cpython-310.pyc differ
diff --git a/lib/core/__pycache__/function.cpython-37.pyc b/lib/core/__pycache__/function.cpython-37.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2718e69d1e0ce140c1b5dcdd8a4e51fe8e1ffa4b
Binary files /dev/null and b/lib/core/__pycache__/function.cpython-37.pyc differ
diff --git a/lib/core/__pycache__/general.cpython-310.pyc b/lib/core/__pycache__/general.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e6a5ab382e40218e693ee857c4eb8cb922f60a1c
Binary files /dev/null and b/lib/core/__pycache__/general.cpython-310.pyc differ
diff --git a/lib/core/__pycache__/general.cpython-37.pyc b/lib/core/__pycache__/general.cpython-37.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d1f268299aa8091a1e0dc3a058e9f30937cff593
Binary files /dev/null and b/lib/core/__pycache__/general.cpython-37.pyc differ
diff --git a/lib/core/__pycache__/loss.cpython-310.pyc b/lib/core/__pycache__/loss.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ea8365498b5514ddbda64d6e631887d202ca95bb
Binary files /dev/null and b/lib/core/__pycache__/loss.cpython-310.pyc differ
diff --git a/lib/core/__pycache__/postprocess.cpython-310.pyc b/lib/core/__pycache__/postprocess.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f1dbc9e8911f1edf497ce07de8ac4b6b321cea39
Binary files /dev/null and b/lib/core/__pycache__/postprocess.cpython-310.pyc differ
diff --git a/lib/core/__pycache__/postprocess.cpython-37.pyc b/lib/core/__pycache__/postprocess.cpython-37.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ccdf551db2052ad2063ef56ebac87d4d726a3499
Binary files /dev/null and b/lib/core/__pycache__/postprocess.cpython-37.pyc differ
diff --git a/lib/core/activations.py b/lib/core/activations.py
new file mode 100644
index 0000000000000000000000000000000000000000..24f5a30f021fdf0b639116a8c32d6135a2d16750
--- /dev/null
+++ b/lib/core/activations.py
@@ -0,0 +1,72 @@
+# Activation functions
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+
+# Swish https://arxiv.org/pdf/1905.02244.pdf ---------------------------------------------------------------------------
+class Swish(nn.Module): #
+ @staticmethod
+ def forward(x):
+ return x * torch.sigmoid(x)
+
+
+class Hardswish(nn.Module): # export-friendly version of nn.Hardswish()
+ @staticmethod
+ def forward(x):
+ # return x * F.hardsigmoid(x) # for torchscript and CoreML
+ return x * F.hardtanh(x + 3, 0., 6.) / 6. # for torchscript, CoreML and ONNX
+
+
+class MemoryEfficientSwish(nn.Module):
+ class F(torch.autograd.Function):
+ @staticmethod
+ def forward(ctx, x):
+ ctx.save_for_backward(x)
+ return x * torch.sigmoid(x)
+
+ @staticmethod
+ def backward(ctx, grad_output):
+ x = ctx.saved_tensors[0]
+ sx = torch.sigmoid(x)
+ return grad_output * (sx * (1 + x * (1 - sx)))
+
+ def forward(self, x):
+ return self.F.apply(x)
+
+
+# Mish https://github.com/digantamisra98/Mish --------------------------------------------------------------------------
+class Mish(nn.Module):
+ @staticmethod
+ def forward(x):
+ return x * F.softplus(x).tanh()
+
+
+class MemoryEfficientMish(nn.Module):
+ class F(torch.autograd.Function):
+ @staticmethod
+ def forward(ctx, x):
+ ctx.save_for_backward(x)
+ return x.mul(torch.tanh(F.softplus(x))) # x * tanh(ln(1 + exp(x)))
+
+ @staticmethod
+ def backward(ctx, grad_output):
+ x = ctx.saved_tensors[0]
+ sx = torch.sigmoid(x)
+ fx = F.softplus(x).tanh()
+ return grad_output * (fx + x * sx * (1 - fx * fx))
+
+ def forward(self, x):
+ return self.F.apply(x)
+
+
+# FReLU https://arxiv.org/abs/2007.11824 -------------------------------------------------------------------------------
+class FReLU(nn.Module):
+ def __init__(self, c1, k=3): # ch_in, kernel
+ super().__init__()
+ self.conv = nn.Conv2d(c1, c1, k, 1, 1, groups=c1, bias=False)
+ self.bn = nn.BatchNorm2d(c1)
+
+ def forward(self, x):
+ return torch.max(x, self.bn(self.conv(x)))
diff --git a/lib/core/evaluate.py b/lib/core/evaluate.py
new file mode 100644
index 0000000000000000000000000000000000000000..a5812c68aec58898b3eaa9e8a66dc7e9d27bf85c
--- /dev/null
+++ b/lib/core/evaluate.py
@@ -0,0 +1,278 @@
+# Model validation metrics
+
+from pathlib import Path
+
+import matplotlib.pyplot as plt
+import numpy as np
+import torch
+
+from . import general
+
+
+def fitness(x):
+ # Model fitness as a weighted combination of metrics
+ w = [0.0, 0.0, 0.1, 0.9] # weights for [P, R, mAP@0.5, mAP@0.5:0.95]
+ return (x[:, :4] * w).sum(1)
+
+
+def ap_per_class(tp, conf, pred_cls, target_cls, plot=False, save_dir='precision-recall_curve.png', names=[]):
+ """ Compute the average precision, given the recall and precision curves.
+ Source: https://github.com/rafaelpadilla/Object-Detection-Metrics.
+ # Arguments
+ tp: True positives (nparray, nx1 or nx10).
+ conf: Objectness value from 0-1 (nparray).
+ pred_cls: Predicted object classes (nparray).
+ target_cls: True object classes (nparray).
+ plot: Plot precision-recall curve at mAP@0.5
+ save_dir: Plot save directory
+ # Returns
+ The average precision as computed in py-faster-rcnn.
+ """
+
+ # Sort by objectness
+ i = np.argsort(-conf) # sorted index from big to small
+ tp, conf, pred_cls = tp[i], conf[i], pred_cls[i]
+
+ # Find unique classes, each number just showed up once
+ unique_classes = np.unique(target_cls)
+
+ # Create Precision-Recall curve and compute AP for each class
+ px, py = np.linspace(0, 1, 1000), [] # for plotting
+ pr_score = 0.1 # score to evaluate P and R https://github.com/ultralytics/yolov3/issues/898
+ s = [unique_classes.shape[0], tp.shape[1]] # number class, number iou thresholds (i.e. 10 for mAP0.5...0.95)
+ ap, p, r = np.zeros(s), np.zeros((unique_classes.shape[0], 1000)), np.zeros((unique_classes.shape[0], 1000))
+ for ci, c in enumerate(unique_classes):
+ i = pred_cls == c
+ n_l = (target_cls == c).sum() # number of labels
+ n_p = i.sum() # number of predictions
+
+ if n_p == 0 or n_l == 0:
+ continue
+ else:
+ # Accumulate FPs and TPs
+ fpc = (1 - tp[i]).cumsum(0)
+ tpc = tp[i].cumsum(0)
+
+ # Recall
+ recall = tpc / (n_l + 1e-16) # recall curve
+ r[ci] = np.interp(-px, -conf[i], recall[:, 0], left=0) # r at pr_score, negative x, xp because xp decreases
+
+ # Precision
+ precision = tpc / (tpc + fpc) # precision curve
+ p[ci] = np.interp(-px, -conf[i], precision[:, 0], left=1) # p at pr_score
+
+ # AP from recall-precision curve
+ for j in range(tp.shape[1]):
+ ap[ci, j], mpre, mrec = compute_ap(recall[:, j], precision[:, j])
+ if plot and (j == 0):
+ py.append(np.interp(px, mrec, mpre)) # precision at mAP@0.5
+
+ # Compute F1 score (harmonic mean of precision and recall)
+ f1 = 2 * p * r / (p + r + 1e-16)
+ i = r.mean(0).argmax()
+
+ if plot:
+ plot_pr_curve(px, py, ap, save_dir, names)
+
+ return p[:, i], r[:, i], ap, f1, unique_classes.astype('int32')
+
+
+def compute_ap(recall, precision):
+ """ Compute the average precision, given the recall and precision curves
+ # Arguments
+ recall: The recall curve (list)
+ precision: The precision curve (list)
+ # Returns
+ Average precision, precision curve, recall curve
+ """
+
+ # Append sentinel values to beginning and end
+ mrec = np.concatenate(([0.], recall, [recall[-1] + 0.01]))
+ mpre = np.concatenate(([1.], precision, [0.]))
+
+ # Compute the precision envelope
+ mpre = np.flip(np.maximum.accumulate(np.flip(mpre)))
+
+ # Integrate area under curve
+ method = 'interp' # methods: 'continuous', 'interp'
+ if method == 'interp':
+ x = np.linspace(0, 1, 101) # 101-point interp (COCO)
+ ap = np.trapz(np.interp(x, mrec, mpre), x) # integrate
+ else: # 'continuous'
+ i = np.where(mrec[1:] != mrec[:-1])[0] # points where x axis (recall) changes
+ ap = np.sum((mrec[i + 1] - mrec[i]) * mpre[i + 1]) # area under curve
+
+ return ap, mpre, mrec
+
+
+class ConfusionMatrix:
+ # Updated version of https://github.com/kaanakan/object_detection_confusion_matrix
+ def __init__(self, nc, conf=0.25, iou_thres=0.45):
+ self.matrix = np.zeros((nc + 1, nc + 1))
+ self.nc = nc # number of classes
+ self.conf = conf
+ self.iou_thres = iou_thres
+
+ def process_batch(self, detections, labels):
+ """
+ Return intersection-over-union (Jaccard index) of boxes.
+ Both sets of boxes are expected to be in (x1, y1, x2, y2) format.
+ Arguments:
+ detections (Array[N, 6]), x1, y1, x2, y2, conf, class
+ labels (Array[M, 5]), class, x1, y1, x2, y2
+ Returns:
+ None, updates confusion matrix accordingly
+ """
+ detections = detections[detections[:, 4] > self.conf]
+ gt_classes = labels[:, 0].int()
+ detection_classes = detections[:, 5].int()
+ iou = general.box_iou(labels[:, 1:], detections[:, :4])
+
+ x = torch.where(iou > self.iou_thres)
+ if x[0].shape[0]:
+ matches = torch.cat((torch.stack(x, 1), iou[x[0], x[1]][:, None]), 1).cpu().numpy()
+ if x[0].shape[0] > 1:
+ matches = matches[matches[:, 2].argsort()[::-1]]
+ matches = matches[np.unique(matches[:, 1], return_index=True)[1]]
+ matches = matches[matches[:, 2].argsort()[::-1]]
+ matches = matches[np.unique(matches[:, 0], return_index=True)[1]]
+ else:
+ matches = np.zeros((0, 3))
+
+ n = matches.shape[0] > 0
+ m0, m1, _ = matches.transpose().astype(np.int16)
+ for i, gc in enumerate(gt_classes):
+ j = m0 == i
+ if n and sum(j) == 1:
+ self.matrix[gc, detection_classes[m1[j]]] += 1 # correct
+ else:
+ self.matrix[gc, self.nc] += 1 # background FP
+
+ if n:
+ for i, dc in enumerate(detection_classes):
+ if not any(m1 == i):
+ self.matrix[self.nc, dc] += 1 # background FN
+
+ def matrix(self):
+ return self.matrix
+
+ def plot(self, save_dir='', names=()):
+ try:
+ import seaborn as sn
+
+ array = self.matrix / (self.matrix.sum(0).reshape(1, self.nc + 1) + 1E-6) # normalize
+ array[array < 0.005] = np.nan # don't annotate (would appear as 0.00)
+
+ fig = plt.figure(figsize=(12, 9), tight_layout=True)
+ sn.set(font_scale=1.0 if self.nc < 50 else 0.8) # for label size
+ labels = (0 < len(names) < 99) and len(names) == self.nc # apply names to ticklabels
+ sn.heatmap(array, annot=self.nc < 30, annot_kws={"size": 8}, cmap='Blues', fmt='.2f', square=True,
+ xticklabels=names + ['background FN'] if labels else "auto",
+ yticklabels=names + ['background FP'] if labels else "auto").set_facecolor((1, 1, 1))
+ fig.axes[0].set_xlabel('True')
+ fig.axes[0].set_ylabel('Predicted')
+ fig.savefig(Path(save_dir) / 'confusion_matrix.png', dpi=250)
+ except Exception as e:
+ pass
+
+ def print(self):
+ for i in range(self.nc + 1):
+ print(' '.join(map(str, self.matrix[i])))
+
+class SegmentationMetric(object):
+ '''
+ imgLabel [batch_size, height(144), width(256)]
+ confusionMatrix [[0(TN),1(FP)],
+ [2(FN),3(TP)]]
+ '''
+ def __init__(self, numClass):
+ self.numClass = numClass
+ self.confusionMatrix = np.zeros((self.numClass,)*2)
+
+ def pixelAccuracy(self):
+ # return all class overall pixel accuracy
+        # acc = (TP + TN) / (TP + TN + FP + FN)
+ acc = np.diag(self.confusionMatrix).sum() / self.confusionMatrix.sum()
+ return acc
+
+ def lineAccuracy(self):
+ Acc = np.diag(self.confusionMatrix) / (self.confusionMatrix.sum(axis=1) + 1e-12)
+ return Acc[1]
+
+ def classPixelAccuracy(self):
+ # return each category pixel accuracy(A more accurate way to call it precision)
+ # acc = (TP) / TP + FP
+ classAcc = np.diag(self.confusionMatrix) / (self.confusionMatrix.sum(axis=0) + 1e-12)
+ return classAcc
+
+ def meanPixelAccuracy(self):
+ classAcc = self.classPixelAccuracy()
+ meanAcc = np.nanmean(classAcc)
+ return meanAcc
+
+ def meanIntersectionOverUnion(self):
+ # Intersection = TP Union = TP + FP + FN
+ # IoU = TP / (TP + FP + FN)
+ intersection = np.diag(self.confusionMatrix)
+ union = np.sum(self.confusionMatrix, axis=1) + np.sum(self.confusionMatrix, axis=0) - np.diag(self.confusionMatrix)
+ IoU = intersection / union
+ IoU[np.isnan(IoU)] = 0
+ mIoU = np.nanmean(IoU)
+ return mIoU
+
+ def IntersectionOverUnion(self):
+ intersection = np.diag(self.confusionMatrix)
+ union = np.sum(self.confusionMatrix, axis=1) + np.sum(self.confusionMatrix, axis=0) - np.diag(self.confusionMatrix)
+ IoU = intersection / union
+ IoU[np.isnan(IoU)] = 0
+ return IoU[1]
+
+ def genConfusionMatrix(self, imgPredict, imgLabel):
+ # remove classes from unlabeled pixels in gt image and predict
+ # print(imgLabel.shape)
+ mask = (imgLabel >= 0) & (imgLabel < self.numClass)
+ label = self.numClass * imgLabel[mask] + imgPredict[mask]
+ count = np.bincount(label, minlength=self.numClass**2)
+ confusionMatrix = count.reshape(self.numClass, self.numClass)
+ return confusionMatrix
+
+ def Frequency_Weighted_Intersection_over_Union(self):
+ # FWIOU = [(TP+FN)/(TP+FP+TN+FN)] *[TP / (TP + FP + FN)]
+ freq = np.sum(self.confusionMatrix, axis=1) / np.sum(self.confusionMatrix)
+ iu = np.diag(self.confusionMatrix) / (
+ np.sum(self.confusionMatrix, axis=1) + np.sum(self.confusionMatrix, axis=0) -
+ np.diag(self.confusionMatrix))
+ FWIoU = (freq[freq > 0] * iu[freq > 0]).sum()
+ return FWIoU
+
+
+ def addBatch(self, imgPredict, imgLabel):
+ assert imgPredict.shape == imgLabel.shape
+ self.confusionMatrix += self.genConfusionMatrix(imgPredict, imgLabel)
+
+ def reset(self):
+ self.confusionMatrix = np.zeros((self.numClass, self.numClass))
+
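+# Illustrative usage of SegmentationMetric (binary segmentation, class 1 = foreground):
+#   metric = SegmentationMetric(2)
+#   metric.addBatch(np.array([[0, 1], [1, 1]]),   # prediction
+#                   np.array([[0, 1], [0, 1]]))   # ground truth
+#   miou = metric.meanIntersectionOverUnion()
+#   fg_iou = metric.IntersectionOverUnion()       # IoU of class 1 only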
+
+
+
+
+# Plots ----------------------------------------------------------------------------------------------------------------
+
+def plot_pr_curve(px, py, ap, save_dir='.', names=()):
+ fig, ax = plt.subplots(1, 1, figsize=(9, 6), tight_layout=True)
+ py = np.stack(py, axis=1)
+
+    if 0 < len(names) < 21:  # show AP in legend if < 21 classes
+ for i, y in enumerate(py.T):
+ ax.plot(px, y, linewidth=1, label=f'{names[i]} %.3f' % ap[i, 0]) # plot(recall, precision)
+ else:
+ ax.plot(px, py, linewidth=1, color='grey') # plot(recall, precision)
+
+ ax.plot(px, py.mean(1), linewidth=3, color='blue', label='all classes %.3f mAP@0.5' % ap[:, 0].mean())
+ ax.set_xlabel('Recall')
+ ax.set_ylabel('Precision')
+ ax.set_xlim(0, 1)
+ ax.set_ylim(0, 1)
+ plt.legend(bbox_to_anchor=(1.04, 1), loc="upper left")
+ fig.savefig(Path(save_dir) / 'precision_recall_curve.png', dpi=250)
diff --git a/lib/core/function.py b/lib/core/function.py
new file mode 100644
index 0000000000000000000000000000000000000000..554b1fb388c1368daa496e0e6f0a2350367adb21
--- /dev/null
+++ b/lib/core/function.py
@@ -0,0 +1,510 @@
+import time
+from lib.core.evaluate import ConfusionMatrix,SegmentationMetric
+from lib.core.general import non_max_suppression,check_img_size,scale_coords,xyxy2xywh,xywh2xyxy,box_iou,coco80_to_coco91_class,plot_images,ap_per_class,output_to_target
+from lib.utils.utils import time_synchronized
+from lib.utils import plot_img_and_mask,plot_one_box,show_seg_result
+import torch
+from threading import Thread
+import numpy as np
+from PIL import Image
+from torchvision import transforms
+from pathlib import Path
+import json
+import random
+import cv2
+import os
+import math
+from torch.cuda import amp
+from tqdm import tqdm
+
+
+def train(cfg, train_loader, model, criterion, optimizer, scaler, epoch, num_batch, num_warmup,
+ writer_dict, logger, device, rank=-1):
+ """
+ train for one epoch
+
+ Inputs:
+ - cfg: configurations
+ - train_loader: loader for the training data
+ - model: the multi-task network
+ - criterion: (function) calculate all the loss, return total_loss, head_losses
+ - writer_dict: tensorboard writer and global step counters
+ outputs(3,)
+ output[0] len:3, [1,3,32,32,85], [1,3,16,16,85], [1,3,8,8,85]
+ output[1] len:1, [2,256,256]
+ output[2] len:1, [2,256,256]
+ target(3,)
+ target[0] [1,n,5]
+ target[1] [2,256,256]
+ target[2] [2,256,256]
+ Returns:
+ None
+
+ """
+ batch_time = AverageMeter()
+ data_time = AverageMeter()
+ losses = AverageMeter()
+
+ # switch to train mode
+ model.train()
+ start = time.time()
+ for i, (input, target, paths, shapes) in enumerate(train_loader):
+ intermediate = time.time()
+ #print('tims:{}'.format(intermediate-start))
+ num_iter = i + num_batch * (epoch - 1)
+
+ if num_iter < num_warmup:
+ # warm up
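+ # linearly ramp the learning rate (and momentum, if present) from the warmup values to
+ # the cosine-scheduled values over the first num_warmup iterations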
+ lf = lambda x: ((1 + math.cos(x * math.pi / cfg.TRAIN.END_EPOCH)) / 2) * \
+ (1 - cfg.TRAIN.LRF) + cfg.TRAIN.LRF # cosine
+ xi = [0, num_warmup]
+ # model.gr = np.interp(ni, xi, [0.0, 1.0]) # iou loss ratio (obj_loss = 1.0 or iou)
+ for j, x in enumerate(optimizer.param_groups):
+ # bias lr falls from 0.1 to lr0, all other lrs rise from 0.0 to lr0
+ x['lr'] = np.interp(num_iter, xi, [cfg.TRAIN.WARMUP_BIASE_LR if j == 2 else 0.0, x['initial_lr'] * lf(epoch)])
+ if 'momentum' in x:
+ x['momentum'] = np.interp(num_iter, xi, [cfg.TRAIN.WARMUP_MOMENTUM, cfg.TRAIN.MOMENTUM])
+
+ data_time.update(time.time() - start)
+ if not cfg.DEBUG:
+ input = input.to(device, non_blocking=True)
+ assign_target = []
+ for tgt in target:
+ assign_target.append(tgt.to(device))
+ target = assign_target
+ with amp.autocast(enabled=device.type != 'cpu'):
+ outputs = model(input)
+ total_loss, head_losses = criterion(outputs, target, shapes,model)
+ # print(head_losses)
+
+ # compute gradient and do update step
+ optimizer.zero_grad()
+ scaler.scale(total_loss).backward()
+ scaler.step(optimizer)
+ scaler.update()
+
+ if rank in [-1, 0]:
+ # measure accuracy and record loss
+ losses.update(total_loss.item(), input.size(0))
+
+ # _, avg_acc, cnt, pred = accuracy(output.detach().cpu().numpy(),
+ # target.detach().cpu().numpy())
+ # acc.update(avg_acc, cnt)
+
+ # measure elapsed time
+ batch_time.update(time.time() - start)
+ start = time.time() # reset the timer so the next batch is timed on its own
+ if i % cfg.PRINT_FREQ == 0:
+ msg = 'Epoch: [{0}][{1}/{2}]\t' \
+ 'Time {batch_time.val:.3f}s ({batch_time.avg:.3f}s)\t' \
+ 'Speed {speed:.1f} samples/s\t' \
+ 'Data {data_time.val:.3f}s ({data_time.avg:.3f}s)\t' \
+ 'Loss {loss.val:.5f} ({loss.avg:.5f})'.format(
+ epoch, i, len(train_loader), batch_time=batch_time,
+ speed=input.size(0)/batch_time.val,
+ data_time=data_time, loss=losses)
+ logger.info(msg)
+
+ writer = writer_dict['writer']
+ global_steps = writer_dict['train_global_steps']
+ writer.add_scalar('train_loss', losses.val, global_steps)
+ # writer.add_scalar('train_acc', acc.val, global_steps)
+ writer_dict['train_global_steps'] = global_steps + 1
+
+
+def validate(epoch,config, val_loader, val_dataset, model, criterion, output_dir,
+ tb_log_dir, writer_dict=None, logger=None, device='cpu', rank=-1):
+ """
+ validate on the validation set
+
+ Inputs:
+ - config: configurations
+ - val_loader: loader for the validation data
+ - model: the multi-task network
+ - criterion: (function) calculate all the loss, return total_loss, head_losses
+ - writer_dict: tensorboard writer and global step counters
+
+ Returns:
+ da_segment_result, ll_segment_result, detect_result, losses.avg, maps, t
+ """
+ # setting
+ max_stride = 32
+ weights = None
+
+ save_dir = output_dir + os.path.sep + 'visualization'
+ if not os.path.exists(save_dir):
+ os.mkdir(save_dir)
+
+ # print(save_dir)
+ _, imgsz = [check_img_size(x, s=max_stride) for x in config.MODEL.IMAGE_SIZE] #imgsz is multiple of max_stride
+ batch_size = config.TRAIN.BATCH_SIZE_PER_GPU * len(config.GPUS)
+ test_batch_size = config.TEST.BATCH_SIZE_PER_GPU * len(config.GPUS)
+ training = False
+ is_coco = False #is coco dataset
+ save_conf=False # save auto-label confidences
+ verbose=False
+ save_hybrid=False
+ log_imgs, wandb = 16, None # max number of images to log to W&B (if available)
+
+ nc = 1
+ iouv = torch.linspace(0.5,0.95,10).to(device) #iou vector for mAP@0.5:0.95
+ niou = iouv.numel()
+
+ try:
+ import wandb
+ except ImportError:
+ wandb = None
+ log_imgs = 0
+
+ seen = 0
+ confusion_matrix = ConfusionMatrix(nc=model.nc) #detector confusion matrix
+ da_metric = SegmentationMetric(config.num_seg_class) #segment confusion matrix
+ ll_metric = SegmentationMetric(2) #segment confusion matrix
+
+ names = {k: v for k, v in enumerate(model.names if hasattr(model, 'names') else model.module.names)}
+ colors = [[random.randint(0, 255) for _ in range(3)] for _ in names]
+ coco91class = coco80_to_coco91_class()
+
+ s = ('%20s' + '%12s' * 6) % ('Class', 'Images', 'Targets', 'P', 'R', 'mAP@.5', 'mAP@.5:.95')
+ p, r, f1, mp, mr, map50, map, t_inf, t_nms = 0., 0., 0., 0., 0., 0., 0., 0., 0.
+
+ losses = AverageMeter()
+
+ da_acc_seg = AverageMeter()
+ da_IoU_seg = AverageMeter()
+ da_mIoU_seg = AverageMeter()
+
+ ll_acc_seg = AverageMeter()
+ ll_IoU_seg = AverageMeter()
+ ll_mIoU_seg = AverageMeter()
+
+ T_inf = AverageMeter()
+ T_nms = AverageMeter()
+
+ # switch to eval mode
+ model.eval()
+ jdict, stats, ap, ap_class, wandb_images = [], [], [], [], []
+
+ for batch_i, (img, target, paths, shapes) in tqdm(enumerate(val_loader), total=len(val_loader)):
+ if not config.DEBUG:
+ img = img.to(device, non_blocking=True)
+ assign_target = []
+ for tgt in target:
+ assign_target.append(tgt.to(device))
+ target = assign_target
+ nb, _, height, width = img.shape #batch size, channel, height, width
+
+ with torch.no_grad():
+ pad_w, pad_h = shapes[0][1][1]
+ pad_w = int(pad_w)
+ pad_h = int(pad_h)
+ ratio = shapes[0][1][0][0]
+
+ t = time_synchronized()
+ det_out, da_seg_out, ll_seg_out= model(img)
+ t_inf = time_synchronized() - t
+ if batch_i > 0:
+ T_inf.update(t_inf/img.size(0),img.size(0))
+
+ inf_out,train_out = det_out
+
+ #driving area segment evaluation
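+ # da_seg_out and target[1] are assumed to be [bs, num_seg_class, H, W]; argmax over the
+ # channel dimension turns them into per-pixel class maps before the metrics are computed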
+ _,da_predict=torch.max(da_seg_out, 1)
+ _,da_gt=torch.max(target[1], 1)
+ da_predict = da_predict[:, pad_h:height-pad_h, pad_w:width-pad_w]
+ da_gt = da_gt[:, pad_h:height-pad_h, pad_w:width-pad_w]
+
+ da_metric.reset()
+ da_metric.addBatch(da_predict.cpu(), da_gt.cpu())
+ da_acc = da_metric.pixelAccuracy()
+ da_IoU = da_metric.IntersectionOverUnion()
+ da_mIoU = da_metric.meanIntersectionOverUnion()
+
+ da_acc_seg.update(da_acc,img.size(0))
+ da_IoU_seg.update(da_IoU,img.size(0))
+ da_mIoU_seg.update(da_mIoU,img.size(0))
+
+ #lane line segment evaluation
+ _,ll_predict=torch.max(ll_seg_out, 1)
+ _,ll_gt=torch.max(target[2], 1)
+ ll_predict = ll_predict[:, pad_h:height-pad_h, pad_w:width-pad_w]
+ ll_gt = ll_gt[:, pad_h:height-pad_h, pad_w:width-pad_w]
+
+ ll_metric.reset()
+ ll_metric.addBatch(ll_predict.cpu(), ll_gt.cpu())
+ ll_acc = ll_metric.lineAccuracy()
+ ll_IoU = ll_metric.IntersectionOverUnion()
+ ll_mIoU = ll_metric.meanIntersectionOverUnion()
+
+ ll_acc_seg.update(ll_acc,img.size(0))
+ ll_IoU_seg.update(ll_IoU,img.size(0))
+ ll_mIoU_seg.update(ll_mIoU,img.size(0))
+
+ total_loss, head_losses = criterion((train_out,da_seg_out, ll_seg_out), target, shapes,model) #Compute loss
+ losses.update(total_loss.item(), img.size(0))
+
+ #NMS
+ t = time_synchronized()
+ target[0][:, 2:] *= torch.Tensor([width, height, width, height]).to(device) # to pixels
+ lb = [target[0][target[0][:, 0] == i, 1:] for i in range(nb)] if save_hybrid else [] # for autolabelling
+ output = non_max_suppression(inf_out, conf_thres= config.TEST.NMS_CONF_THRESHOLD, iou_thres=config.TEST.NMS_IOU_THRESHOLD, labels=lb)
+ #output = non_max_suppression(inf_out, conf_thres=0.001, iou_thres=0.6)
+ #output = non_max_suppression(inf_out, conf_thres=config.TEST.NMS_CONF_THRES, iou_thres=config.TEST.NMS_IOU_THRES)
+ t_nms = time_synchronized() - t
+ if batch_i > 0:
+ T_nms.update(t_nms/img.size(0),img.size(0))
+
+ if config.TEST.PLOTS:
+ if batch_i == 0:
+ for i in range(test_batch_size):
+ img_test = cv2.imread(paths[i])
+ da_seg_mask = da_seg_out[i][:, pad_h:height-pad_h, pad_w:width-pad_w].unsqueeze(0)
+ da_seg_mask = torch.nn.functional.interpolate(da_seg_mask, scale_factor=int(1/ratio), mode='bilinear')
+ _, da_seg_mask = torch.max(da_seg_mask, 1)
+
+ da_gt_mask = target[1][i][:, pad_h:height-pad_h, pad_w:width-pad_w].unsqueeze(0)
+ da_gt_mask = torch.nn.functional.interpolate(da_gt_mask, scale_factor=int(1/ratio), mode='bilinear')
+ _, da_gt_mask = torch.max(da_gt_mask, 1)
+
+ da_seg_mask = da_seg_mask.int().squeeze().cpu().numpy()
+ da_gt_mask = da_gt_mask.int().squeeze().cpu().numpy()
+ # seg_mask = seg_mask > 0.5
+ # plot_img_and_mask(img_test, seg_mask, i,epoch,save_dir)
+ img_test1 = img_test.copy()
+ _ = show_seg_result(img_test, da_seg_mask, i,epoch,save_dir)
+ _ = show_seg_result(img_test1, da_gt_mask, i, epoch, save_dir, is_gt=True)
+
+ img_ll = cv2.imread(paths[i])
+ ll_seg_mask = ll_seg_out[i][:, pad_h:height-pad_h, pad_w:width-pad_w].unsqueeze(0)
+ ll_seg_mask = torch.nn.functional.interpolate(ll_seg_mask, scale_factor=int(1/ratio), mode='bilinear')
+ _, ll_seg_mask = torch.max(ll_seg_mask, 1)
+
+ ll_gt_mask = target[2][i][:, pad_h:height-pad_h, pad_w:width-pad_w].unsqueeze(0)
+ ll_gt_mask = torch.nn.functional.interpolate(ll_gt_mask, scale_factor=int(1/ratio), mode='bilinear')
+ _, ll_gt_mask = torch.max(ll_gt_mask, 1)
+
+ ll_seg_mask = ll_seg_mask.int().squeeze().cpu().numpy()
+ ll_gt_mask = ll_gt_mask.int().squeeze().cpu().numpy()
+ # seg_mask = seg_mask > 0.5
+ # plot_img_and_mask(img_test, seg_mask, i,epoch,save_dir)
+ img_ll1 = img_ll.copy()
+ _ = show_seg_result(img_ll, ll_seg_mask, i,epoch,save_dir, is_ll=True)
+ _ = show_seg_result(img_ll1, ll_gt_mask, i, epoch, save_dir, is_ll=True, is_gt=True)
+
+ img_det = cv2.imread(paths[i])
+ img_gt = img_det.copy()
+ det = output[i].clone()
+ if len(det):
+ det[:,:4] = scale_coords(img[i].shape[1:],det[:,:4],img_det.shape).round()
+ for *xyxy,conf,cls in reversed(det):
+ #print(cls)
+ label_det_pred = f'{names[int(cls)]} {conf:.2f}'
+ plot_one_box(xyxy, img_det , label=label_det_pred, color=colors[int(cls)], line_thickness=3)
+ cv2.imwrite(save_dir+"/batch_{}_{}_det_pred.png".format(epoch,i),img_det)
+
+ labels = target[0][target[0][:, 0] == i, 1:]
+ # print(labels)
+ labels[:,1:5]=xywh2xyxy(labels[:,1:5])
+ if len(labels):
+ labels[:,1:5]=scale_coords(img[i].shape[1:],labels[:,1:5],img_gt.shape).round()
+ for cls,x1,y1,x2,y2 in labels:
+ #print(names)
+ #print(cls)
+ label_det_gt = f'{names[int(cls)]}'
+ xyxy = (x1,y1,x2,y2)
+ plot_one_box(xyxy, img_gt , label=label_det_gt, color=colors[int(cls)], line_thickness=3)
+ cv2.imwrite(save_dir+"/batch_{}_{}_det_gt.png".format(epoch,i),img_gt)
+
+ # Statistics per image
+ # output([xyxy,conf,cls])
+ # target[0] ([img_id,cls,xyxy])
+ for si, pred in enumerate(output):
+ labels = target[0][target[0][:, 0] == si, 1:] #all object in one image
+ nl = len(labels) # num of object
+ tcls = labels[:, 0].tolist() if nl else [] # target class
+ path = Path(paths[si])
+ seen += 1
+
+ if len(pred) == 0:
+ if nl:
+ stats.append((torch.zeros(0, niou, dtype=torch.bool), torch.Tensor(), torch.Tensor(), tcls))
+ continue
+
+ # Predictions
+ predn = pred.clone()
+ scale_coords(img[si].shape[1:], predn[:, :4], shapes[si][0], shapes[si][1]) # native-space pred
+
+ # Append to text file
+ if config.TEST.SAVE_TXT:
+ gn = torch.tensor(shapes[si][0])[[1, 0, 1, 0]] # normalization gain whwh
+ for *xyxy, conf, cls in predn.tolist():
+ xywh = (xyxy2xywh(torch.tensor(xyxy).view(1, 4)) / gn).view(-1).tolist() # normalized xywh
+ line = (cls, *xywh, conf) if save_conf else (cls, *xywh) # label format
+ with open(save_dir / 'labels' / (path.stem + '.txt'), 'a') as f:
+ f.write(('%g ' * len(line)).rstrip() % line + '\n')
+
+ # W&B logging
+ if config.TEST.PLOTS and len(wandb_images) < log_imgs:
+ box_data = [{"position": {"minX": xyxy[0], "minY": xyxy[1], "maxX": xyxy[2], "maxY": xyxy[3]},
+ "class_id": int(cls),
+ "box_caption": "%s %.3f" % (names[cls], conf),
+ "scores": {"class_score": conf},
+ "domain": "pixel"} for *xyxy, conf, cls in pred.tolist()]
+ boxes = {"predictions": {"box_data": box_data, "class_labels": names}} # inference-space
+ wandb_images.append(wandb.Image(img[si], boxes=boxes, caption=path.name))
+
+ # Append to pycocotools JSON dictionary
+ if config.TEST.SAVE_JSON:
+ # [{"image_id": 42, "category_id": 18, "bbox": [258.15, 41.29, 348.26, 243.78], "score": 0.236}, ...
+ image_id = int(path.stem) if path.stem.isnumeric() else path.stem
+ box = xyxy2xywh(predn[:, :4]) # xywh
+ box[:, :2] -= box[:, 2:] / 2 # xy center to top-left corner
+ for p, b in zip(pred.tolist(), box.tolist()):
+ jdict.append({'image_id': image_id,
+ 'category_id': coco91class[int(p[5])] if is_coco else int(p[5]),
+ 'bbox': [round(x, 3) for x in b],
+ 'score': round(p[4], 5)})
+
+
+ # Assign all predictions as incorrect
+ correct = torch.zeros(pred.shape[0], niou, dtype=torch.bool, device=device)
+ if nl:
+ detected = [] # target indices
+ tcls_tensor = labels[:, 0]
+
+ # target boxes
+ tbox = xywh2xyxy(labels[:, 1:5])
+ scale_coords(img[si].shape[1:], tbox, shapes[si][0], shapes[si][1]) # native-space labels
+ if config.TEST.PLOTS:
+ confusion_matrix.process_batch(pred, torch.cat((labels[:, 0:1], tbox), 1))
+
+ # Per target class
+ for cls in torch.unique(tcls_tensor):
+ ti = (cls == tcls_tensor).nonzero(as_tuple=False).view(-1) # target indices
+ pi = (cls == pred[:, 5]).nonzero(as_tuple=False).view(-1) # prediction indices
+
+ # Search for detections
+ if pi.shape[0]:
+ # Prediction to target ious
+ # n*m n:pred m:label
+ ious, i = box_iou(predn[pi, :4], tbox[ti]).max(1) # best ious, indices
+ # Append detections
+ detected_set = set()
+ for j in (ious > iouv[0]).nonzero(as_tuple=False):
+ d = ti[i[j]] # detected target
+ if d.item() not in detected_set:
+ detected_set.add(d.item())
+ detected.append(d)
+ correct[pi[j]] = ious[j] > iouv # iou_thres is 1xn
+ if len(detected) == nl: # all targets already located in image
+ break
+
+ # Append statistics (correct, conf, pcls, tcls)
+ stats.append((correct.cpu(), pred[:, 4].cpu(), pred[:, 5].cpu(), tcls))
+
+ if config.TEST.PLOTS and batch_i < 3:
+ f = save_dir +'/'+ f'test_batch{batch_i}_labels.jpg' # labels
+ #Thread(target=plot_images, args=(img, target[0], paths, f, names), daemon=True).start()
+ f = save_dir +'/'+ f'test_batch{batch_i}_pred.jpg' # predictions
+ #Thread(target=plot_images, args=(img, output_to_target(output), paths, f, names), daemon=True).start()
+
+ # Compute statistics
+ # stats : [[all_img_correct]...[all_img_tcls]]
+ stats = [np.concatenate(x, 0) for x in zip(*stats)] # to numpy zip(*) :unzip
+
+ map70 = None
+ map75 = None
+ if len(stats) and stats[0].any():
+ p, r, ap, f1, ap_class = ap_per_class(*stats, plot=False, save_dir=save_dir, names=names)
+ ap50, ap70, ap75, ap = ap[:, 0], ap[:, 4], ap[:, 5], ap.mean(1) # AP@0.5, AP@0.7, AP@0.75, AP@0.5:0.95
+ mp, mr, map50, map70, map75, map = p.mean(), r.mean(), ap50.mean(), ap70.mean(),ap75.mean(),ap.mean()
+ nt = np.bincount(stats[3].astype(np.int64), minlength=nc) # number of targets per class
+ else:
+ nt = torch.zeros(1)
+
+ # Print results
+ pf = '%20s' + '%12.3g' * 6 # print format
+ print(pf % ('all', seen, nt.sum(), mp, mr, map50, map))
+ #print(map70)
+ #print(map75)
+
+ # Print results per class
+ if (verbose or (nc <= 20 and not training)) and nc > 1 and len(stats):
+ for i, c in enumerate(ap_class):
+ print(pf % (names[c], seen, nt[c], p[i], r[i], ap50[i], ap[i]))
+
+ # Print speeds
+ t = tuple(x / seen * 1E3 for x in (t_inf, t_nms, t_inf + t_nms)) + (imgsz, imgsz, batch_size) # tuple
+ if not training:
+ print('Speed: %.1f/%.1f/%.1f ms inference/NMS/total per %gx%g image at batch-size %g' % t)
+
+ # Plots
+ if config.TEST.PLOTS:
+ confusion_matrix.plot(save_dir=save_dir, names=list(names.values()))
+ if wandb and wandb.run:
+ wandb.log({"Images": wandb_images})
+ wandb.log({"Validation": [wandb.Image(str(f), caption=f.name) for f in sorted(save_dir.glob('test*.jpg'))]})
+
+ # Save JSON
+ if config.TEST.SAVE_JSON and len(jdict):
+ w = Path(weights[0] if isinstance(weights, list) else weights).stem if weights is not None else '' # weights
+ anno_json = '../coco/annotations/instances_val2017.json' # annotations json
+ pred_json = str(save_dir / f"{w}_predictions.json") # predictions json
+ print('\nEvaluating pycocotools mAP... saving %s...' % pred_json)
+ with open(pred_json, 'w') as f:
+ json.dump(jdict, f)
+
+ try: # https://github.com/cocodataset/cocoapi/blob/master/PythonAPI/pycocoEvalDemo.ipynb
+ from pycocotools.coco import COCO
+ from pycocotools.cocoeval import COCOeval
+
+ anno = COCO(anno_json) # init annotations api
+ pred = anno.loadRes(pred_json) # init predictions api
+ eval = COCOeval(anno, pred, 'bbox')
+ if is_coco:
+ eval.params.imgIds = [int(Path(x).stem) for x in val_loader.dataset.img_files] # image IDs to evaluate
+ eval.evaluate()
+ eval.accumulate()
+ eval.summarize()
+ map, map50 = eval.stats[:2] # update results (mAP@0.5:0.95, mAP@0.5)
+ except Exception as e:
+ print(f'pycocotools unable to run: {e}')
+
+ # Return results
+ if not training:
+ s = f"\n{len(list(save_dir.glob('labels/*.txt')))} labels saved to {save_dir / 'labels'}" if config.TEST.SAVE_TXT else ''
+ print(f"Results saved to {save_dir}{s}")
+ model.float() # for training
+ maps = np.zeros(nc) + map
+ for i, c in enumerate(ap_class):
+ maps[c] = ap[i]
+
+ da_segment_result = (da_acc_seg.avg,da_IoU_seg.avg,da_mIoU_seg.avg)
+ ll_segment_result = (ll_acc_seg.avg,ll_IoU_seg.avg,ll_mIoU_seg.avg)
+
+ # print(da_segment_result)
+ # print(ll_segment_result)
+ detect_result = np.asarray([mp, mr, map50, map])
+ # print('mp:{},mr:{},map50:{},map:{}'.format(mp, mr, map50, map))
+ # print segment_result
+ t = [T_inf.avg, T_nms.avg]
+ return da_segment_result, ll_segment_result, detect_result, losses.avg, maps, t
+
+
+
+class AverageMeter(object):
+ """Computes and stores the average and current value"""
+ def __init__(self):
+ self.reset()
+
+ def reset(self):
+ self.val = 0
+ self.avg = 0
+ self.sum = 0
+ self.count = 0
+
+ def update(self, val, n=1):
+ self.val = val
+ self.sum += val * n
+ self.count += n
+ self.avg = self.sum / self.count if self.count != 0 else 0
\ No newline at end of file
diff --git a/lib/core/general.py b/lib/core/general.py
new file mode 100644
index 0000000000000000000000000000000000000000..4834490cdf298454ca542caded1ea5c09985e3cc
--- /dev/null
+++ b/lib/core/general.py
@@ -0,0 +1,466 @@
+import glob
+import logging
+import os
+import platform
+import random
+import re
+import shutil
+import subprocess
+import time
+import torchvision
+from contextlib import contextmanager
+from copy import copy
+from pathlib import Path
+
+import cv2
+import math
+import matplotlib
+import matplotlib.pyplot as plt
+import numpy as np
+import torch
+import torch.nn as nn
+import yaml
+from PIL import Image
+from scipy.cluster.vq import kmeans
+from scipy.signal import butter, filtfilt
+from tqdm import tqdm
+
+
+def bbox_iou(box1, box2, x1y1x2y2=True, GIoU=False, DIoU=False, CIoU=False, eps=1e-9):
+ # Returns the IoU of box1 to box2. box1 is 4, box2 is nx4
+ box2 = box2.T
+
+ # Get the coordinates of bounding boxes
+ if x1y1x2y2: # x1, y1, x2, y2 = box1
+ b1_x1, b1_y1, b1_x2, b1_y2 = box1[0], box1[1], box1[2], box1[3]
+ b2_x1, b2_y1, b2_x2, b2_y2 = box2[0], box2[1], box2[2], box2[3]
+ else: # transform from xywh to xyxy
+ b1_x1, b1_x2 = box1[0] - box1[2] / 2, box1[0] + box1[2] / 2
+ b1_y1, b1_y2 = box1[1] - box1[3] / 2, box1[1] + box1[3] / 2
+ b2_x1, b2_x2 = box2[0] - box2[2] / 2, box2[0] + box2[2] / 2
+ b2_y1, b2_y2 = box2[1] - box2[3] / 2, box2[1] + box2[3] / 2
+
+ # Intersection area
+ inter = (torch.min(b1_x2, b2_x2) - torch.max(b1_x1, b2_x1)).clamp(0) * \
+ (torch.min(b1_y2, b2_y2) - torch.max(b1_y1, b2_y1)).clamp(0)
+
+ # Union Area
+ w1, h1 = b1_x2 - b1_x1, b1_y2 - b1_y1 + eps
+ w2, h2 = b2_x2 - b2_x1, b2_y2 - b2_y1 + eps
+ union = w1 * h1 + w2 * h2 - inter + eps
+
+ iou = inter / union
+ if GIoU or DIoU or CIoU:
+ cw = torch.max(b1_x2, b2_x2) - torch.min(b1_x1, b2_x1) # convex (smallest enclosing box) width
+ ch = torch.max(b1_y2, b2_y2) - torch.min(b1_y1, b2_y1) # convex height
+ if CIoU or DIoU: # Distance or Complete IoU https://arxiv.org/abs/1911.08287v1
+ c2 = cw ** 2 + ch ** 2 + eps # convex diagonal squared
+ rho2 = ((b2_x1 + b2_x2 - b1_x1 - b1_x2) ** 2 +
+ (b2_y1 + b2_y2 - b1_y1 - b1_y2) ** 2) / 4 # center distance squared
+ if DIoU:
+ return iou - rho2 / c2 # DIoU
+ elif CIoU: # https://github.com/Zzh-tju/DIoU-SSD-pytorch/blob/master/utils/box/box_utils.py#L47
+ v = (4 / math.pi ** 2) * torch.pow(torch.atan(w2 / h2) - torch.atan(w1 / h1), 2)
+ with torch.no_grad():
+ alpha = v / ((1 + eps) - iou + v)
+ return iou - (rho2 / c2 + v * alpha) # CIoU
+ else: # GIoU https://arxiv.org/pdf/1902.09630.pdf
+ c_area = cw * ch + eps # convex area
+ return iou - (c_area - union) / c_area # GIoU
+ else:
+ return iou # IoU
+
+
+def box_iou(box1, box2):
+ # https://github.com/pytorch/vision/blob/master/torchvision/ops/boxes.py
+ """
+ Return intersection-over-union (Jaccard index) of boxes.
+ Both sets of boxes are expected to be in (x1, y1, x2, y2) format.
+ Arguments:
+ box1 (Tensor[N, 4])
+ box2 (Tensor[M, 4])
+ Returns:
+ iou (Tensor[N, M]): the NxM matrix containing the pairwise
+ IoU values for every element in boxes1 and boxes2
+ """
+
+ def box_area(box):
+ # box = 4xn
+ return (box[2] - box[0]) * (box[3] - box[1]) #(x2-x1)*(y2-y1)
+
+ area1 = box_area(box1.T)
+ area2 = box_area(box2.T)
+
+ # inter(N,M) = (rb(N,M,2) - lt(N,M,2)).clamp(0).prod(2)
+ inter = (torch.min(box1[:, None, 2:], box2[:, 2:]) - torch.max(box1[:, None, :2], box2[:, :2])).clamp(0).prod(2)
+ return inter / (area1[:, None] + area2 - inter) # iou = inter / (area1 + area2 - inter)
+
+def non_max_suppression(prediction, conf_thres=0.25, iou_thres=0.45, classes=None, agnostic=False, labels=()):
+ """Performs Non-Maximum Suppression (NMS) on inference results
+
+ Returns:
+ detections with shape: nx6 (x1, y1, x2, y2, conf, cls)
+ """
+
+ nc = prediction.shape[2] - 5 # number of classes
+ xc = prediction[..., 4] > conf_thres # candidates
+
+ # Settings
+ min_wh, max_wh = 2, 4096 # (pixels) minimum and maximum box width and height
+ max_det = 300 # maximum number of detections per image
+ max_nms = 30000 # maximum number of boxes into torchvision.ops.nms()
+ time_limit = 10.0 # seconds to quit after
+ redundant = True # require redundant detections
+ multi_label = nc > 1 # multiple labels per box (adds 0.5ms/img)
+ merge = False # use merge-NMS
+
+ t = time.time()
+ output = [torch.zeros((0, 6), device=prediction.device)] * prediction.shape[0]
+ for xi, x in enumerate(prediction): # image index, image inference
+ # Apply constraints
+ # x[((x[..., 2:4] < min_wh) | (x[..., 2:4] > max_wh)).any(1), 4] = 0 # width-height
+ x = x[xc[xi]] # confidence
+
+ # Cat apriori labels if autolabelling
+ if labels and len(labels[xi]):
+ l = labels[xi]
+ v = torch.zeros((len(l), nc + 5), device=x.device)
+ v[:, :4] = l[:, 1:5] # box
+ v[:, 4] = 1.0 # conf
+ v[range(len(l)), l[:, 0].long() + 5] = 1.0 # cls
+ x = torch.cat((x, v), 0)
+
+ # If none remain process next image
+ if not x.shape[0]:
+ continue
+
+ # Compute conf
+ x[:, 5:] *= x[:, 4:5] # conf = obj_conf * cls_conf
+
+ # Box (center x, center y, width, height) to (x1, y1, x2, y2)
+ box = xywh2xyxy(x[:, :4])
+
+ # Detections matrix nx6 (xyxy, conf, cls)
+ if multi_label:
+ i, j = (x[:, 5:] > conf_thres).nonzero(as_tuple=False).T
+ x = torch.cat((box[i], x[i, j + 5, None], j[:, None].float()), 1)
+ else: # best class only
+ conf, j = x[:, 5:].max(1, keepdim=True)
+ x = torch.cat((box, conf, j.float()), 1)[conf.view(-1) > conf_thres]
+
+ # Filter by class
+ if classes is not None:
+ x = x[(x[:, 5:6] == torch.tensor(classes, device=x.device)).any(1)]
+
+ # Apply finite constraint
+ # if not torch.isfinite(x).all():
+ # x = x[torch.isfinite(x).all(1)]
+
+ # Check shape
+ n = x.shape[0] # number of boxes
+ if not n: # no boxes
+ continue
+ elif n > max_nms: # excess boxes
+ x = x[x[:, 4].argsort(descending=True)[:max_nms]] # sort by confidence
+
+ # Batched NMS
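+ # offsetting the boxes by class index * max_wh pushes boxes of different classes far apart,
+ # so a single torchvision NMS call never suppresses detections across classes (unless agnostic=True)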
+ c = x[:, 5:6] * (0 if agnostic else max_wh) # classes
+ boxes, scores = x[:, :4] + c, x[:, 4] # boxes (offset by class), scores
+ i = torchvision.ops.nms(boxes, scores, iou_thres) # NMS
+ if i.shape[0] > max_det: # limit detections
+ i = i[:max_det]
+ if merge and (1 < n < 3E3): # Merge NMS (boxes merged using weighted mean)
+ # update boxes as boxes(i,4) = weights(i,n) * boxes(n,4)
+ iou = box_iou(boxes[i], boxes) > iou_thres # iou matrix
+ weights = iou * scores[None] # box weights
+ x[i, :4] = torch.mm(weights, x[:, :4]).float() / weights.sum(1, keepdim=True) # merged boxes
+ if redundant:
+ i = i[iou.sum(1) > 1] # require redundancy
+
+ output[xi] = x[i]
+ if (time.time() - t) > time_limit:
+ print(f'WARNING: NMS time limit {time_limit}s exceeded')
+ break # time limit exceeded
+
+ return output
+
+
+def xywh2xyxy(x):
+ # Convert nx4 boxes from [x, y, w, h] to [x1, y1, x2, y2] where xy1=top-left, xy2=bottom-right
+ y = torch.zeros_like(x) if isinstance(x, torch.Tensor) else np.zeros_like(x)
+ y[:, 0] = x[:, 0] - x[:, 2] / 2 # top left x
+ y[:, 1] = x[:, 1] - x[:, 3] / 2 # top left y
+ y[:, 2] = x[:, 0] + x[:, 2] / 2 # bottom right x
+ y[:, 3] = x[:, 1] + x[:, 3] / 2 # bottom right y
+ return y
+
+def fitness(x):
+ # Returns fitness (for use with results.txt or evolve.txt)
+ w = [0.0, 0.0, 0.1, 0.9] # weights for [P, R, mAP@0.5, mAP@0.5:0.95]
+ return (x[:, :4] * w).sum(1)
+
+def check_img_size(img_size, s=32):
+ # Verify img_size is a multiple of stride s
+ new_size = make_divisible(img_size, int(s)) # ceil gs-multiple
+ if new_size != img_size:
+ print('WARNING: --img-size %g must be multiple of max stride %g, updating to %g' % (img_size, s, new_size))
+ return new_size
+
+def scale_coords(img1_shape, coords, img0_shape, ratio_pad=None):
+ # Rescale coords (xyxy) from img1_shape to img0_shape
+ if ratio_pad is None: # calculate from img0_shape
+ gain = min(img1_shape[0] / img0_shape[0], img1_shape[1] / img0_shape[1]) # gain = old / new
+ pad = (img1_shape[1] - img0_shape[1] * gain) / 2, (img1_shape[0] - img0_shape[0] * gain) / 2 # wh padding
+ else:
+ gain = ratio_pad[0][0]
+ pad = ratio_pad[1]
+
+ coords[:, [0, 2]] -= pad[0] # x padding
+ coords[:, [1, 3]] -= pad[1] # y padding
+ coords[:, :4] /= gain
+ clip_coords(coords, img0_shape)
+ return coords
+
+def clip_coords(boxes, img_shape):
+ # Clip xyxy bounding boxes to image shape (height, width)
+ boxes[:, 0].clamp_(0, img_shape[1]) # x1
+ boxes[:, 1].clamp_(0, img_shape[0]) # y1
+ boxes[:, 2].clamp_(0, img_shape[1]) # x2
+ boxes[:, 3].clamp_(0, img_shape[0]) # y2
+
+def make_divisible(x, divisor):
+ # Returns x evenly divisible by divisor
+ return math.ceil(x / divisor) * divisor
+
+def xyxy2xywh(x):
+ # Convert nx4 boxes from [x1, y1, x2, y2] to [x, y, w, h] where xy1=top-left, xy2=bottom-right
+ y = torch.zeros_like(x) if isinstance(x, torch.Tensor) else np.zeros_like(x)
+ y[:, 0] = (x[:, 0] + x[:, 2]) / 2 # x center
+ y[:, 1] = (x[:, 1] + x[:, 3]) / 2 # y center
+ y[:, 2] = x[:, 2] - x[:, 0] # width
+ y[:, 3] = x[:, 3] - x[:, 1] # height
+ return y
+
+def plot_images(images, targets, paths=None, fname='images.jpg', names=None, max_size=640, max_subplots=16):
+ # Plot image grid with labels
+
+ if isinstance(images, torch.Tensor):
+ images = images.cpu().float().numpy()
+ if isinstance(targets, torch.Tensor):
+ targets = targets.cpu().numpy()
+
+ # un-normalise
+ if np.max(images[0]) <= 1:
+ images *= 255
+
+ tl = 3 # line thickness
+ tf = max(tl - 1, 1) # font thickness
+ bs, _, h, w = images.shape # batch size, _, height, width
+ bs = min(bs, max_subplots) # limit plot images
+ ns = np.ceil(bs ** 0.5) # number of subplots (square)
+
+ # Check if we should resize
+ scale_factor = max_size / max(h, w)
+ if scale_factor < 1:
+ h = math.ceil(scale_factor * h)
+ w = math.ceil(scale_factor * w)
+
+ colors = color_list() # list of colors
+ mosaic = np.full((int(ns * h), int(ns * w), 3), 255, dtype=np.uint8) # init
+ for i, img in enumerate(images):
+ if i == max_subplots: # stop once the subplot grid is full
+ break
+
+ block_x = int(w * (i // ns))
+ block_y = int(h * (i % ns))
+
+ img = img.transpose(1, 2, 0)
+ if scale_factor < 1:
+ img = cv2.resize(img, (w, h))
+
+ mosaic[block_y:block_y + h, block_x:block_x + w, :] = img
+ if len(targets) > 0:
+ image_targets = targets[targets[:, 0] == i]
+ boxes = xywh2xyxy(image_targets[:, 2:6]).T
+ classes = image_targets[:, 1].astype('int')
+ labels = image_targets.shape[1] == 6 # labels if no conf column
+ conf = None if labels else image_targets[:, 6] # check for confidence presence (label vs pred)
+
+ if boxes.shape[1]:
+ if boxes.max() <= 1.01: # if normalized with tolerance 0.01
+ boxes[[0, 2]] *= w # scale to pixels
+ boxes[[1, 3]] *= h
+ elif scale_factor < 1: # absolute coords need scale if image scales
+ boxes *= scale_factor
+ boxes[[0, 2]] += block_x
+ boxes[[1, 3]] += block_y
+ for j, box in enumerate(boxes.T):
+ cls = int(classes[j])
+ color = colors[cls % len(colors)]
+ cls = names[cls] if names else cls
+ if labels or conf[j] > 0.25: # 0.25 conf thresh
+ label = '%s' % cls if labels else '%s %.1f' % (cls, conf[j])
+ plot_one_box(box, mosaic, label=label, color=color, line_thickness=tl)
+
+ # Draw image filename labels
+ if paths:
+ label = Path(paths[i]).name[:40] # trim to 40 char
+ t_size = cv2.getTextSize(label, 0, fontScale=tl / 3, thickness=tf)[0]
+ cv2.putText(mosaic, label, (block_x + 5, block_y + t_size[1] + 5), 0, tl / 3, [220, 220, 220], thickness=tf,
+ lineType=cv2.LINE_AA)
+
+ # Image border
+ cv2.rectangle(mosaic, (block_x, block_y), (block_x + w, block_y + h), (255, 255, 255), thickness=3)
+
+ if fname:
+ r = min(1280. / max(h, w) / ns, 1.0) # ratio to limit image size
+ mosaic = cv2.resize(mosaic, (int(ns * w * r), int(ns * h * r)), interpolation=cv2.INTER_AREA)
+ # cv2.imwrite(fname, cv2.cvtColor(mosaic, cv2.COLOR_BGR2RGB)) # cv2 save
+ Image.fromarray(mosaic).save(fname) # PIL save
+ return mosaic
+
+def plot_one_box(x, img, color=None, label=None, line_thickness=None):
+ # Plots one bounding box on image img
+ tl = line_thickness or round(0.002 * (img.shape[0] + img.shape[1]) / 2) + 1 # line/font thickness
+ color = color or [random.randint(0, 255) for _ in range(3)]
+ c1, c2 = (int(x[0]), int(x[1])), (int(x[2]), int(x[3]))
+ cv2.rectangle(img, c1, c2, color, thickness=tl, lineType=cv2.LINE_AA)
+ if label:
+ tf = max(tl - 1, 1) # font thickness
+ t_size = cv2.getTextSize(label, 0, fontScale=tl / 3, thickness=tf)[0]
+ c2 = c1[0] + t_size[0], c1[1] - t_size[1] - 3
+ cv2.rectangle(img, c1, c2, color, -1, cv2.LINE_AA) # filled
+ cv2.putText(img, label, (c1[0], c1[1] - 2), 0, tl / 3, [225, 255, 255], thickness=tf, lineType=cv2.LINE_AA)
+
+def color_list():
+ # Return first 10 plt colors as (r,g,b) https://stackoverflow.com/questions/51350872/python-from-color-name-to-rgb
+ def hex2rgb(h):
+ return tuple(int(str(h[1 + i:1 + i + 2]), 16) for i in (0, 2, 4))
+
+ return [hex2rgb(h) for h in plt.rcParams['axes.prop_cycle'].by_key()['color']]
+
+def ap_per_class(tp, conf, pred_cls, target_cls, plot=False, save_dir='precision-recall_curve.png', names=[]):
+ """ Compute the average precision, given the recall and precision curves.
+ Source: https://github.com/rafaelpadilla/Object-Detection-Metrics.
+ # Arguments
+ tp: True positives (nparray, nx1 or nx10).
+ conf: Objectness value from 0-1 (nparray).
+ pred_cls: Predicted object classes (nparray).
+ target_cls: True object classes (nparray).
+ plot: Plot precision-recall curve at mAP@0.5
+ save_dir: Plot save directory
+ # Returns
+ The average precision as computed in py-faster-rcnn.
+ """
+
+ # Sort by objectness
+ i = np.argsort(-conf)
+ tp, conf, pred_cls = tp[i], conf[i], pred_cls[i]
+
+ # Find unique classes
+ unique_classes = np.unique(target_cls)
+
+ # Create Precision-Recall curve and compute AP for each class
+ px, py = np.linspace(0, 1, 1000), [] # for plotting
+ pr_score = 0.1 # score to evaluate P and R https://github.com/ultralytics/yolov3/issues/898
+ s = [unique_classes.shape[0], tp.shape[1]] # number class, number iou thresholds (i.e. 10 for mAP0.5...0.95)
+ ap, p, r = np.zeros(s), np.zeros((unique_classes.shape[0], 1000)), np.zeros((unique_classes.shape[0], 1000))
+ for ci, c in enumerate(unique_classes):
+ i = pred_cls == c
+ n_l = (target_cls == c).sum() # number of labels
+ n_p = i.sum() # number of predictions
+
+ if n_p == 0 or n_l == 0:
+ continue
+ else:
+ # Accumulate FPs and TPs
+ fpc = (1 - tp[i]).cumsum(0)
+ tpc = tp[i].cumsum(0)
+
+ # Recall
+ recall = tpc / (n_l + 1e-16) # recall curve
+ r[ci] = np.interp(-px, -conf[i], recall[:, 0], left=0) # negative x, xp because xp decreases
+
+ # Precision
+ precision = tpc / (tpc + fpc) # precision curve
+ p[ci] = np.interp(-px, -conf[i], precision[:, 0], left=1) # p at pr_score
+ # AP from recall-precision curve
+ for j in range(tp.shape[1]):
+ ap[ci, j], mpre, mrec = compute_ap(recall[:, j], precision[:, j])
+ if plot and (j == 0):
+ py.append(np.interp(px, mrec, mpre)) # precision at mAP@0.5
+
+ # Compute F1 score (harmonic mean of precision and recall)
+ f1 = 2 * p * r / (p + r + 1e-16)
+ i=r.mean(0).argmax()
+
+ if plot:
+ plot_pr_curve(px, py, ap, save_dir, names)
+
+ return p[:, i], r[:, i], ap, f1[:, i], unique_classes.astype('int32')
+
+def compute_ap(recall, precision):
+ """ Compute the average precision, given the recall and precision curves.
+ Source: https://github.com/rbgirshick/py-faster-rcnn.
+ # Arguments
+ recall: The recall curve (list).
+ precision: The precision curve (list).
+ # Returns
+ The average precision as computed in py-faster-rcnn.
+ """
+
+ # Append sentinel values to beginning and end
+ mrec = np.concatenate(([0.], recall, [recall[-1] + 1E-3]))
+ mpre = np.concatenate(([1.], precision, [0.]))
+
+ # Compute the precision envelope
+ mpre = np.flip(np.maximum.accumulate(np.flip(mpre)))
+
+ # Integrate area under curve
+ method = 'interp' # methods: 'continuous', 'interp'
+ if method == 'interp':
+ x = np.linspace(0, 1, 101) # 101-point interp (COCO)
+ ap = np.trapz(np.interp(x, mrec, mpre), x) # integrate
+
+ else: # 'continuous'
+ i = np.where(mrec[1:] != mrec[:-1])[0] # points where x axis (recall) changes
+ ap = np.sum((mrec[i + 1] - mrec[i]) * mpre[i + 1]) # area under curve
+
+ return ap, mpre, mrec
+
+def coco80_to_coco91_class(): # converts 80-index (val2014) to 91-index (paper)
+ # https://tech.amikelive.com/node-718/what-object-categories-labels-are-in-coco-dataset/
+ # a = np.loadtxt('data/coco.names', dtype='str', delimiter='\n')
+ # b = np.loadtxt('data/coco_paper.names', dtype='str', delimiter='\n')
+ # x1 = [list(a[i] == b).index(True) + 1 for i in range(80)] # darknet to coco
+ # x2 = [list(b[i] == a).index(True) if any(b[i] == a) else None for i in range(91)] # coco to darknet
+ x = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 27, 28, 31, 32, 33, 34,
+ 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
+ 64, 65, 67, 70, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 84, 85, 86, 87, 88, 89, 90]
+ return x
+
+def output_to_target(output):
+ # Convert model output to target format [batch_id, class_id, x, y, w, h, conf]
+ targets = []
+ for i, o in enumerate(output):
+ for *box, conf, cls in o.cpu().numpy():
+ targets.append([i, cls, *list(*xyxy2xywh(np.array(box)[None])), conf])
+ return np.array(targets)
+
+def plot_pr_curve(px, py, ap, save_dir='.', names=()):
+ fig, ax = plt.subplots(1, 1, figsize=(9, 6), tight_layout=True)
+ py = np.stack(py, axis=1)
+
+ if 0 < len(names) < 21: # show per-class AP in legend if fewer than 21 classes
+ for i, y in enumerate(py.T):
+ ax.plot(px, y, linewidth=1, label=f'{names[i]} %.3f' % ap[i, 0]) # plot(recall, precision)
+ else:
+ ax.plot(px, py, linewidth=1, color='grey') # plot(recall, precision)
+
+ ax.plot(px, py.mean(1), linewidth=3, color='blue', label='all classes %.3f mAP@0.5' % ap[:, 0].mean())
+ ax.set_xlabel('Recall')
+ ax.set_ylabel('Precision')
+ ax.set_xlim(0, 1)
+ ax.set_ylim(0, 1)
+ plt.legend(bbox_to_anchor=(1.04, 1), loc="upper left")
+ fig.savefig(Path(save_dir) / 'precision_recall_curve.png', dpi=250)
\ No newline at end of file
diff --git a/lib/core/loss.py b/lib/core/loss.py
new file mode 100644
index 0000000000000000000000000000000000000000..687818de96d16457795766d21a539d4f7ba5c0b8
--- /dev/null
+++ b/lib/core/loss.py
@@ -0,0 +1,249 @@
+import torch.nn as nn
+import torch
+from .general import bbox_iou
+from .postprocess import build_targets
+from lib.core.evaluate import SegmentationMetric
+
+class MultiHeadLoss(nn.Module):
+ """
+ collect all the loss we need
+ """
+ def __init__(self, losses, cfg, lambdas=None):
+ """
+ Inputs:
+ - losses: (list)[nn.Module, nn.Module, ...]
+ - cfg: config object
+ - lambdas: (list) weights for each loss term [cls, obj, iou, da_seg, ll_seg, ll_iou]
+ """
+ super().__init__()
+ # lambdas: [cls, obj, iou, la_seg, ll_seg, ll_iou]
+ if not lambdas:
+ lambdas = [1.0 for _ in range(len(losses) + 3)]
+ assert all(lam >= 0.0 for lam in lambdas)
+
+ self.losses = nn.ModuleList(losses)
+ self.lambdas = lambdas
+ self.cfg = cfg
+
+ def forward(self, head_fields, head_targets, shapes, model):
+ """
+ Inputs:
+ - head_fields: (list) output from each task head
+ - head_targets: (list) ground-truth for each task head
+ - model:
+
+ Returns:
+ - total_loss: sum of all the loss
+ - head_losses: (dict) each individual loss term
+
+ """
+ # head_losses = [ll
+ # for l, f, t in zip(self.losses, head_fields, head_targets)
+ # for ll in l(f, t)]
+ #
+ # assert len(self.lambdas) == len(head_losses)
+ # loss_values = [lam * l
+ # for lam, l in zip(self.lambdas, head_losses)
+ # if l is not None]
+ # total_loss = sum(loss_values) if loss_values else None
+ # print(model.nc)
+ total_loss, head_losses = self._forward_impl(head_fields, head_targets, shapes, model)
+
+ return total_loss, head_losses
+
+ def _forward_impl(self, predictions, targets, shapes, model):
+ """
+
+ Args:
+ predictions: predicts of [[det_head1, det_head2, det_head3], drive_area_seg_head, lane_line_seg_head]
+ targets: gts [det_targets, segment_targets, lane_targets]
+ model:
+
+ Returns:
+ total_loss: sum of all the loss
+ head_losses: dict containing each individual loss term
+
+ """
+ cfg = self.cfg
+ device = targets[0].device
+ lcls, lbox, lobj = torch.zeros(1, device=device), torch.zeros(1, device=device), torch.zeros(1, device=device)
+ tcls, tbox, indices, anchors = build_targets(cfg, predictions[0], targets[0], model) # targets
+
+ # Class label smoothing https://arxiv.org/pdf/1902.04103.pdf eqn 3
+ cp, cn = smooth_BCE(eps=0.0)
+
+ BCEcls, BCEobj, BCEseg = self.losses
+
+ # Calculate Losses
+ nt = 0 # number of targets
+ no = len(predictions[0]) # number of outputs
+ balance = [4.0, 1.0, 0.4] if no == 3 else [4.0, 1.0, 0.4, 0.1] # P3-5 or P3-6
+
+ # calculate detection loss
+ for i, pi in enumerate(predictions[0]): # layer index, layer predictions
+ b, a, gj, gi = indices[i] # image, anchor, gridy, gridx
+ tobj = torch.zeros_like(pi[..., 0], device=device) # target obj
+
+ n = b.shape[0] # number of targets
+ if n:
+ nt += n # cumulative targets
+ ps = pi[b, a, gj, gi] # prediction subset corresponding to targets
+
+ # Regression
+ pxy = ps[:, :2].sigmoid() * 2. - 0.5
+ pwh = (ps[:, 2:4].sigmoid() * 2) ** 2 * anchors[i]
+ pbox = torch.cat((pxy, pwh), 1).to(device) # predicted box
+ iou = bbox_iou(pbox.T, tbox[i], x1y1x2y2=False, CIoU=True) # iou(prediction, target)
+ lbox += (1.0 - iou).mean() # iou loss
+
+ # Objectness
+ tobj[b, a, gj, gi] = (1.0 - model.gr) + model.gr * iou.detach().clamp(0).type(tobj.dtype) # iou ratio
+
+ # Classification
+ # print(model.nc)
+ if model.nc > 1: # cls loss (only if multiple classes)
+ t = torch.full_like(ps[:, 5:], cn, device=device) # targets
+ t[range(n), tcls[i]] = cp
+ lcls += BCEcls(ps[:, 5:], t) # BCE
+ lobj += BCEobj(pi[..., 4], tobj) * balance[i] # obj loss
+
+ drive_area_seg_predicts = predictions[1].view(-1)
+ drive_area_seg_targets = targets[1].view(-1)
+ lseg_da = BCEseg(drive_area_seg_predicts, drive_area_seg_targets)
+
+ lane_line_seg_predicts = predictions[2].view(-1)
+ lane_line_seg_targets = targets[2].view(-1)
+ lseg_ll = BCEseg(lane_line_seg_predicts, lane_line_seg_targets)
+
+ metric = SegmentationMetric(2)
+ nb, _, height, width = targets[1].shape
+ pad_w, pad_h = shapes[0][1][1]
+ pad_w = int(pad_w)
+ pad_h = int(pad_h)
+ _,lane_line_pred=torch.max(predictions[2], 1)
+ _,lane_line_gt=torch.max(targets[2], 1)
+ lane_line_pred = lane_line_pred[:, pad_h:height-pad_h, pad_w:width-pad_w]
+ lane_line_gt = lane_line_gt[:, pad_h:height-pad_h, pad_w:width-pad_w]
+ metric.reset()
+ metric.addBatch(lane_line_pred.cpu(), lane_line_gt.cpu())
+ IoU = metric.IntersectionOverUnion()
+ liou_ll = 1 - IoU
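+ # 1 - IoU of the predicted lane-line mask is used as an extra IoU loss for the lane branch,
+ # computed on CPU with the same SegmentationMetric used at evaluation time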
+
+ s = 3 / no # output count scaling
+ lcls *= cfg.LOSS.CLS_GAIN * s * self.lambdas[0]
+ lobj *= cfg.LOSS.OBJ_GAIN * s * (1.4 if no == 4 else 1.) * self.lambdas[1]
+ lbox *= cfg.LOSS.BOX_GAIN * s * self.lambdas[2]
+
+ lseg_da *= cfg.LOSS.DA_SEG_GAIN * self.lambdas[3]
+ lseg_ll *= cfg.LOSS.LL_SEG_GAIN * self.lambdas[4]
+ liou_ll *= cfg.LOSS.LL_IOU_GAIN * self.lambdas[5]
+
+
+ if cfg.TRAIN.DET_ONLY or cfg.TRAIN.ENC_DET_ONLY:
+ lseg_da = 0 * lseg_da
+ lseg_ll = 0 * lseg_ll
+ liou_ll = 0 * liou_ll
+
+ if cfg.TRAIN.SEG_ONLY or cfg.TRAIN.ENC_SEG_ONLY:
+ lcls = 0 * lcls
+ lobj = 0 * lobj
+ lbox = 0 * lbox
+
+ if cfg.TRAIN.LANE_ONLY:
+ lcls = 0 * lcls
+ lobj = 0 * lobj
+ lbox = 0 * lbox
+ lseg_da = 0 * lseg_da
+
+ if cfg.TRAIN.DRIVABLE_ONLY:
+ lcls = 0 * lcls
+ lobj = 0 * lobj
+ lbox = 0 * lbox
+ lseg_ll = 0 * lseg_ll
+ liou_ll = 0 * liou_ll
+
+ loss = lbox + lobj + lcls + lseg_da + lseg_ll + liou_ll
+ # loss = lseg
+ # return loss * bs, torch.cat((lbox, lobj, lcls, loss)).detach()
+
+ # return a detailed loss dict so each term can be logged to tensorboard
+ loss_dict = {
+ 'box_loss': lbox.item(),
+ 'obj_loss': lobj.item(),
+ 'cls_loss': lcls.item(),
+ 'da_seg_loss': lseg_da.item(),
+ 'll_seg_loss': lseg_ll.item(),
+ 'll_iou_loss': liou_ll.item(),
+ 'total_loss': loss.item()
+ }
+
+ return loss, loss_dict
+
+
+def get_loss(cfg, device):
+ """
+ get MultiHeadLoss
+
+ Inputs:
+ -cfg: configuration; the LOSS section (gains, pos_weights, focal-loss gamma) is used here
+ -device: cpu or gpu device
+
+ Returns:
+ -loss: (MultiHeadLoss)
+
+ """
+ # class loss criteria
+ BCEcls = nn.BCEWithLogitsLoss(pos_weight=torch.Tensor([cfg.LOSS.CLS_POS_WEIGHT])).to(device)
+ # object loss criteria
+ BCEobj = nn.BCEWithLogitsLoss(pos_weight=torch.Tensor([cfg.LOSS.OBJ_POS_WEIGHT])).to(device)
+ # segmentation loss criteria
+ BCEseg = nn.BCEWithLogitsLoss(pos_weight=torch.Tensor([cfg.LOSS.SEG_POS_WEIGHT])).to(device)
+ # Focal loss
+ gamma = cfg.LOSS.FL_GAMMA # focal loss gamma
+ if gamma > 0:
+ BCEcls, BCEobj = FocalLoss(BCEcls, gamma), FocalLoss(BCEobj, gamma)
+
+ loss_list = [BCEcls, BCEobj, BCEseg]
+ loss = MultiHeadLoss(loss_list, cfg=cfg, lambdas=cfg.LOSS.MULTI_HEAD_LAMBDA)
+ return loss
+
+# example
+# class L1_Loss(nn.Module)
+
+
+def smooth_BCE(eps=0.1): # https://github.com/ultralytics/yolov3/issues/238#issuecomment-598028441
+ # return positive, negative label smoothing BCE targets
+ return 1.0 - 0.5 * eps, 0.5 * eps
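+# e.g. smooth_BCE(0.1) returns (0.95, 0.05); the call above passes eps=0.0, which disables smoothing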
+
+
+class FocalLoss(nn.Module):
+ # Wraps focal loss around existing loss_fcn(), i.e. criteria = FocalLoss(nn.BCEWithLogitsLoss(), gamma=1.5)
+ def __init__(self, loss_fcn, gamma=1.5, alpha=0.25):
+ # alpha balance positive & negative samples
+ # gamma focus on difficult samples
+ super(FocalLoss, self).__init__()
+ self.loss_fcn = loss_fcn # must be nn.BCEWithLogitsLoss()
+ self.gamma = gamma
+ self.alpha = alpha
+ self.reduction = loss_fcn.reduction
+ self.loss_fcn.reduction = 'none' # required to apply FL to each element
+
+ def forward(self, pred, true):
+ loss = self.loss_fcn(pred, true)
+ # p_t = torch.exp(-loss)
+ # loss *= self.alpha * (1.000001 - p_t) ** self.gamma # non-zero power for gradient stability
+
+ # TF implementation https://github.com/tensorflow/addons/blob/v0.7.1/tensorflow_addons/losses/focal_loss.py
+ pred_prob = torch.sigmoid(pred) # prob from logits
+ p_t = true * pred_prob + (1 - true) * (1 - pred_prob)
+ alpha_factor = true * self.alpha + (1 - true) * (1 - self.alpha)
+ modulating_factor = (1.0 - p_t) ** self.gamma
+ loss *= alpha_factor * modulating_factor
+
+ if self.reduction == 'mean':
+ return loss.mean()
+ elif self.reduction == 'sum':
+ return loss.sum()
+ else: # 'none'
+ return loss
diff --git a/lib/core/postprocess.py b/lib/core/postprocess.py
new file mode 100644
index 0000000000000000000000000000000000000000..907be0918f4f5f2e0f1599eca9728a9932920fef
--- /dev/null
+++ b/lib/core/postprocess.py
@@ -0,0 +1,225 @@
+import torch
+from lib.utils import is_parallel
+import numpy as np
+np.set_printoptions(threshold=np.inf)
+import cv2
+from sklearn.cluster import DBSCAN
+
+
+def build_targets(cfg, predictions, targets, model):
+ '''
+ predictions
+ [16, 3, 32, 32, 85]
+ [16, 3, 16, 16, 85]
+ [16, 3, 8, 8, 85]
+ torch.tensor(predictions[i].shape)[[3, 2, 3, 2]]
+ [32,32,32,32]
+ [16,16,16,16]
+ [8,8,8,8]
+ targets[3,x,7]
+ t [image_index, class, x, y, w, h, anchor_index]
+ '''
+ # Build targets for compute_loss(), input targets(image,class,x,y,w,h)
+ det = model.module.model[model.module.detector_index] if is_parallel(model) \
+ else model.model[model.detector_index] # Detect() module
+ # print(type(model))
+ # det = model.model[model.detector_index]
+ # print(type(det))
+ na, nt = det.na, targets.shape[0] # number of anchors, targets
+ tcls, tbox, indices, anch = [], [], [], []
+ gain = torch.ones(7, device=targets.device) # normalized to gridspace gain
+ ai = torch.arange(na, device=targets.device).float().view(na, 1).repeat(1, nt) # same as .repeat_interleave(nt)
+ targets = torch.cat((targets.repeat(na, 1, 1), ai[:, :, None]), 2) # append anchor indices
+
+ g = 0.5 # bias
+ off = torch.tensor([[0, 0],
+ [1, 0], [0, 1], [-1, 0], [0, -1], # j,k,l,m
+ # [1, 1], [1, -1], [-1, 1], [-1, -1], # jk,jm,lk,lm
+ ], device=targets.device).float() * g # offsets
+
+ for i in range(det.nl):
+ anchors = det.anchors[i] #[3,2]
+ gain[2:6] = torch.tensor(predictions[i].shape)[[3, 2, 3, 2]] # xyxy gain
+ # Match targets to anchors
+ t = targets * gain
+
+ if nt:
+ # Matches
+ r = t[:, :, 4:6] / anchors[:, None] # wh ratio
+ j = torch.max(r, 1. / r).max(2)[0] < cfg.TRAIN.ANCHOR_THRESHOLD # compare
+ # j = wh_iou(anchors, t[:, 4:6]) > model.hyp['iou_t'] # iou(3,n)=wh_iou(anchors(3,2), gwh(n,2))
+ t = t[j] # filter
+
+ # Offsets
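+ # besides its own cell, a target is also assigned to up to two neighbouring cells whose
+ # centre lies within g=0.5 of the target centre, so each ground truth can be matched by
+ # as many as three grid cells per detection scale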
+ gxy = t[:, 2:4] # grid xy
+ gxi = gain[[2, 3]] - gxy # inverse
+ j, k = ((gxy % 1. < g) & (gxy > 1.)).T
+ l, m = ((gxi % 1. < g) & (gxi > 1.)).T
+ j = torch.stack((torch.ones_like(j), j, k, l, m))
+ t = t.repeat((5, 1, 1))[j]
+ offsets = (torch.zeros_like(gxy)[None] + off[:, None])[j]
+ else:
+ t = targets[0]
+ offsets = 0
+
+ # Define
+ b, c = t[:, :2].long().T # image, class
+ gxy = t[:, 2:4] # grid xy
+ gwh = t[:, 4:6] # grid wh
+ gij = (gxy - offsets).long()
+ gi, gj = gij.T # grid xy indices
+
+ # Append
+ a = t[:, 6].long() # anchor indices
+ indices.append((b, a, gj.clamp_(0, int(gain[3]) - 1), gi.clamp_(0, int(gain[2]) - 1))) # image, anchor, grid indices
+ tbox.append(torch.cat((gxy - gij, gwh), 1)) # box
+ anch.append(anchors[a]) # anchors
+ tcls.append(c) # class
+
+ return tcls, tbox, indices, anch
+
+def morphological_process(image, kernel_size=5, func_type=cv2.MORPH_CLOSE):
+ """
+ morphological processing to fill holes in the binary segmentation result
+ :param image: single-channel binary mask
+ :param kernel_size: size of the elliptical structuring element
+ :param func_type: cv2 morphology operation, cv2.MORPH_CLOSE by default
+ :return: processed mask
+ """
+ if len(image.shape) == 3:
+ raise ValueError('Binary segmentation result image should be a single channel image')
+
+ if image.dtype != np.uint8:
+ image = np.array(image, np.uint8)
+
+ kernel = cv2.getStructuringElement(shape=cv2.MORPH_ELLIPSE, ksize=(kernel_size, kernel_size))
+
+ # closing operation to fill holes
+ closing = cv2.morphologyEx(image, func_type, kernel, iterations=1)
+
+ return closing
+
+def connect_components_analysis(image):
+ """
+ connected component analysis, used to remove small components
+ :param image: binary mask (single channel or BGR)
+ :return: output of cv2.connectedComponentsWithStats (num_labels, labels, stats, centroids)
+ """
+ if len(image.shape) == 3:
+ gray_image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
+ else:
+ gray_image = image
+ # print(gray_image.dtype)
+ return cv2.connectedComponentsWithStats(gray_image, connectivity=8, ltype=cv2.CV_32S)
+
+def if_y(samples_x):
+ for sample_x in samples_x:
+ if len(sample_x):
+ # if len(sample_x) != (sample_x[-1] - sample_x[0] + 1) or sample_x[-1] == sample_x[0]:
+ if sample_x[-1] == sample_x[0]:
+ return False
+ return True
+
+def fitlane(mask, sel_labels, labels, stats):
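+ """
+ For each selected connected component, sample lane points along the vertical axis (or the
+ horizontal axis for near-horizontal lanes), fit a 2nd-order polynomial through the sampled
+ centres and redraw the lane as a thick polyline on the output mask.
+ """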
+ H, W = mask.shape
+ for label_group in sel_labels:
+ states = [stats[k] for k in label_group]
+ x, y, w, h, _ = states[0]
+ # if len(label_group) > 1:
+ # print('in')
+ # for m in range(len(label_group)-1):
+ # labels[labels == label_group[m+1]] = label_group[0]
+ t = label_group[0]
+ # samples_y = np.linspace(y, H-1, 30)
+ # else:
+ samples_y = np.linspace(y, y+h-1, 30)
+
+ samples_x = [np.where(labels[int(sample_y)]==t)[0] for sample_y in samples_y]
+
+ if if_y(samples_x):
+ samples_x = [int(np.mean(sample_x)) if len(sample_x) else -1 for sample_x in samples_x]
+ samples_x = np.array(samples_x)
+ samples_y = np.array(samples_y)
+ samples_y = samples_y[samples_x != -1]
+ samples_x = samples_x[samples_x != -1]
+ func = np.polyfit(samples_y, samples_x, 2)
+ x_limits = np.polyval(func, H-1)
+ # if (y_max + h - 1) >= 720:
+ if x_limits < 0 or x_limits > W:
+ # if (y_max + h - 1) > 720:
+ # draw_y = np.linspace(y, 720-1, 720-y)
+ draw_y = np.linspace(y, y+h-1, h)
+ else:
+ # draw_y = np.linspace(y, y+h-1, y+h-y)
+ draw_y = np.linspace(y, H-1, H-y)
+ draw_x = np.polyval(func, draw_y)
+ # draw_y = draw_y[draw_x < W]
+ # draw_x = draw_x[draw_x < W]
+ draw_points = (np.asarray([draw_x, draw_y]).T).astype(np.int32)
+ cv2.polylines(mask, [draw_points], False, 1, thickness=15)
+ else:
+ # if ( + w - 1) >= 1280:
+ samples_x = np.linspace(x, W-1, 30)
+ # else:
+ # samples_x = np.linspace(x, x_max+w-1, 30)
+ samples_y = [np.where(labels[:, int(sample_x)]==t)[0] for sample_x in samples_x]
+ samples_y = [int(np.mean(sample_y)) if len(sample_y) else -1 for sample_y in samples_y]
+ samples_x = np.array(samples_x)
+ samples_y = np.array(samples_y)
+ samples_x = samples_x[samples_y != -1]
+ samples_y = samples_y[samples_y != -1]
+ try:
+ func = np.polyfit(samples_x, samples_y, 2)
+ except Exception:
+ continue # skip this component if the polynomial fit fails, otherwise func would be undefined below
+ # y_limits = np.polyval(func, 0)
+ # if y_limits > 720 or y_limits < 0:
+ # if (x + w - 1) >= 1280:
+ # draw_x = np.linspace(x, 1280-1, 1280-x)
+ # else:
+ y_limits = np.polyval(func, 0)
+ if y_limits >= H or y_limits < 0:
+ draw_x = np.linspace(x, x+w-1, w)
+ else:
+ y_limits = np.polyval(func, W-1)
+ if y_limits >= H or y_limits < 0:
+ draw_x = np.linspace(x, x+w-1, w)
+ # if x+w-1 < 640:
+ # draw_x = np.linspace(0, x+w-1, w+x-x)
+ else:
+ draw_x = np.linspace(x, W-1, W-x)
+ draw_y = np.polyval(func, draw_x)
+ draw_points = (np.asarray([draw_x, draw_y]).T).astype(np.int32)
+ cv2.polylines(mask, [draw_points], False, 1, thickness=15)
+ return mask
+
+def connect_lane(image, shadow_height=0):
+ if len(image.shape) == 3:
+ gray_image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
+ else:
+ gray_image = image
+ if shadow_height:
+ image[:shadow_height] = 0
+ mask = np.zeros((image.shape[0], image.shape[1]), np.uint8)
+
+ num_labels, labels, stats, centers = cv2.connectedComponentsWithStats(gray_image, connectivity=8, ltype=cv2.CV_32S)
+ # ratios = []
+ selected_label = []
+
+ for t in range(1, num_labels, 1):
+ _, _, _, _, area = stats[t]
+ if area > 400:
+ selected_label.append(t)
+ if len(selected_label) == 0:
+ return mask
+ else:
+ split_labels = [[label,] for label in selected_label]
+ mask_post = fitlane(mask, split_labels, labels, stats)
+ return mask_post
+
+
+
+
+
+
+
+
diff --git a/lib/dataset/AutoDriveDataset.py b/lib/dataset/AutoDriveDataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..e2e7f95a8a451182c66a5b0474fddd33da09edfe
--- /dev/null
+++ b/lib/dataset/AutoDriveDataset.py
@@ -0,0 +1,264 @@
+import cv2
+import numpy as np
+# np.set_printoptions(threshold=np.inf)
+import random
+import torch
+import torchvision.transforms as transforms
+# from visualization import plot_img_and_mask,plot_one_box,show_seg_result
+from pathlib import Path
+from PIL import Image
+from torch.utils.data import Dataset
+from ..utils import letterbox, augment_hsv, random_perspective, xyxy2xywh, cutout
+
+
+class AutoDriveDataset(Dataset):
+ """
+ A general Dataset providing functionality shared by the concrete datasets
+ """
+ def __init__(self, cfg, is_train, inputsize=640, transform=None):
+ """
+ initialize dataset attributes
+
+ Inputs:
+ -cfg: configurations
+ -is_train(bool): whether train set or not
+ -transform: ToTensor and Normalize
+
+ Returns:
+ None
+ """
+ self.is_train = is_train
+ self.cfg = cfg
+ self.transform = transform
+ self.inputsize = inputsize
+ self.Tensor = transforms.ToTensor()
+ img_root = Path(cfg.DATASET.DATAROOT)
+ label_root = Path(cfg.DATASET.LABELROOT)
+ mask_root = Path(cfg.DATASET.MASKROOT)
+ lane_root = Path(cfg.DATASET.LANEROOT)
+ if is_train:
+ indicator = cfg.DATASET.TRAIN_SET
+ else:
+ indicator = cfg.DATASET.TEST_SET
+ self.img_root = img_root / indicator
+ self.label_root = label_root / indicator
+ self.mask_root = mask_root / indicator
+ self.lane_root = lane_root / indicator
+ # self.label_list = self.label_root.iterdir()
+ self.mask_list = self.mask_root.iterdir()
+
+ self.db = []
+
+ self.data_format = cfg.DATASET.DATA_FORMAT
+
+ self.scale_factor = cfg.DATASET.SCALE_FACTOR
+ self.rotation_factor = cfg.DATASET.ROT_FACTOR
+ self.flip = cfg.DATASET.FLIP
+ self.color_rgb = cfg.DATASET.COLOR_RGB
+
+ # self.target_type = cfg.MODEL.TARGET_TYPE
+ self.shapes = np.array(cfg.DATASET.ORG_IMG_SIZE)
+
+ def _get_db(self):
+ """
+ implemented in the child Dataset class (for datasets not in BDD100K format, override this in the child class)
+ """
+ raise NotImplementedError
+
+ def evaluate(self, cfg, preds, output_dir):
+ """
+ implemented in the child Dataset class
+ """
+ raise NotImplementedError
+
+ def __len__(self,):
+ """
+ number of samples in the dataset
+ """
+ return len(self.db)
+
+ def __getitem__(self, idx):
+ """
+ Get input and ground-truth from the database & apply data augmentation to the input
+
+ Inputs:
+ -idx: the index of image in self.db(database)(list)
+ self.db(list) [a,b,c,...]
+ a: (dictionary){'image':, 'information':}
+
+ Returns:
+ -image: transformed image; data augmentation is applied first (numpy array), then self.transform
+ -target: ground truth (det_gt, seg_gt, lane_gt)
+
+ functions that may be useful:
+ cv2.imread
+ cv2.cvtColor(data, cv2.COLOR_BGR2RGB)
+ cv2.warpAffine
+ """
+ data = self.db[idx]
+ img = cv2.imread(data["image"], cv2.IMREAD_COLOR | cv2.IMREAD_IGNORE_ORIENTATION)
+ img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
+ # seg_label = cv2.imread(data["mask"], 0)
+ if self.cfg.num_seg_class == 3:
+ seg_label = cv2.imread(data["mask"])
+ else:
+ seg_label = cv2.imread(data["mask"], 0)
+ lane_label = cv2.imread(data["lane"], 0)
+ #print(lane_label.shape)
+ # print(seg_label.shape)
+ # print(lane_label.shape)
+ # print(seg_label.shape)
+ resized_shape = self.inputsize
+ if isinstance(resized_shape, list):
+ resized_shape = max(resized_shape)
+ h0, w0 = img.shape[:2] # orig hw
+ r = resized_shape / max(h0, w0) # resize image to img_size
+ if r != 1: # always resize down, only resize up if training with augmentation
+ interp = cv2.INTER_AREA if r < 1 else cv2.INTER_LINEAR
+ img = cv2.resize(img, (int(w0 * r), int(h0 * r)), interpolation=interp)
+ seg_label = cv2.resize(seg_label, (int(w0 * r), int(h0 * r)), interpolation=interp)
+ lane_label = cv2.resize(lane_label, (int(w0 * r), int(h0 * r)), interpolation=interp)
+ h, w = img.shape[:2]
+
+ (img, seg_label, lane_label), ratio, pad = letterbox((img, seg_label, lane_label), resized_shape, auto=True, scaleup=self.is_train)
+ shapes = (h0, w0), ((h / h0, w / w0), pad) # for COCO mAP rescaling
+ # ratio = (w / w0, h / h0)
+ # print(resized_shape)
+
+ det_label = data["label"]
+ labels=[]
+
+ if det_label.size > 0:
+ # Normalized xywh to pixel xyxy format
+ labels = det_label.copy()
+ labels[:, 1] = ratio[0] * w * (det_label[:, 1] - det_label[:, 3] / 2) + pad[0] # pad width
+ labels[:, 2] = ratio[1] * h * (det_label[:, 2] - det_label[:, 4] / 2) + pad[1] # pad height
+ labels[:, 3] = ratio[0] * w * (det_label[:, 1] + det_label[:, 3] / 2) + pad[0]
+ labels[:, 4] = ratio[1] * h * (det_label[:, 2] + det_label[:, 4] / 2) + pad[1]
+
+ if self.is_train:
+ combination = (img, seg_label, lane_label)
+ (img, seg_label, lane_label), labels = random_perspective(
+ combination=combination,
+ targets=labels,
+ degrees=self.cfg.DATASET.ROT_FACTOR,
+ translate=self.cfg.DATASET.TRANSLATE,
+ scale=self.cfg.DATASET.SCALE_FACTOR,
+ shear=self.cfg.DATASET.SHEAR
+ )
+ #print(labels.shape)
+ augment_hsv(img, hgain=self.cfg.DATASET.HSV_H, sgain=self.cfg.DATASET.HSV_S, vgain=self.cfg.DATASET.HSV_V)
+ # img, seg_label, labels = cutout(combination=combination, labels=labels)
+
+ if len(labels):
+ # convert xyxy to xywh
+ labels[:, 1:5] = xyxy2xywh(labels[:, 1:5])
+
+ # Normalize coordinates 0 - 1
+ labels[:, [2, 4]] /= img.shape[0] # height
+ labels[:, [1, 3]] /= img.shape[1] # width
+
+ # if self.is_train:
+ # random left-right flip
+ lr_flip = True
+ if lr_flip and random.random() < 0.5:
+ img = np.fliplr(img)
+ seg_label = np.fliplr(seg_label)
+ lane_label = np.fliplr(lane_label)
+ if len(labels):
+ labels[:, 1] = 1 - labels[:, 1]
+
+ # random up-down flip
+ ud_flip = False
+ if ud_flip and random.random() < 0.5:
+ img = np.flipud(img)
+ seg_label = np.flipud(seg_label)
+ lane_label = np.flipud(lane_label)
+ if len(labels):
+ labels[:, 2] = 1 - labels[:, 2]
+
+ else:
+ if len(labels):
+ # convert xyxy to xywh
+ labels[:, 1:5] = xyxy2xywh(labels[:, 1:5])
+
+ # Normalize coordinates 0 - 1
+ labels[:, [2, 4]] /= img.shape[0] # height
+ labels[:, [1, 3]] /= img.shape[1] # width
+
+ labels_out = torch.zeros((len(labels), 6))
+ if len(labels):
+ labels_out[:, 1:] = torch.from_numpy(labels)
+ # Convert
+ # img = img[:, :, ::-1].transpose(2, 0, 1) # BGR to RGB, to 3x416x416
+ # img = img.transpose(2, 0, 1)
+ img = np.ascontiguousarray(img)
+ # seg_label = np.ascontiguousarray(seg_label)
+ # if idx == 0:
+ # print(seg_label[:,:,0])
+
+ if self.cfg.num_seg_class == 3:
+ _,seg0 = cv2.threshold(seg_label[:,:,0],128,255,cv2.THRESH_BINARY)
+ _,seg1 = cv2.threshold(seg_label[:,:,1],1,255,cv2.THRESH_BINARY)
+ _,seg2 = cv2.threshold(seg_label[:,:,2],1,255,cv2.THRESH_BINARY)
+ else:
+ _,seg1 = cv2.threshold(seg_label,1,255,cv2.THRESH_BINARY)
+ _,seg2 = cv2.threshold(seg_label,1,255,cv2.THRESH_BINARY_INV)
+ _,lane1 = cv2.threshold(lane_label,1,255,cv2.THRESH_BINARY)
+ _,lane2 = cv2.threshold(lane_label,1,255,cv2.THRESH_BINARY_INV)
+# _,seg2 = cv2.threshold(seg_label[:,:,2],1,255,cv2.THRESH_BINARY)
+ # # seg1[cutout_mask] = 0
+ # # seg2[cutout_mask] = 0
+
+ # seg_label /= 255
+ # seg0 = self.Tensor(seg0)
+ if self.cfg.num_seg_class == 3:
+ seg0 = self.Tensor(seg0)
+ seg1 = self.Tensor(seg1)
+ seg2 = self.Tensor(seg2)
+ # seg1 = self.Tensor(seg1)
+ # seg2 = self.Tensor(seg2)
+ lane1 = self.Tensor(lane1)
+ lane2 = self.Tensor(lane2)
+
+ # seg_label = torch.stack((seg2[0], seg1[0]),0)
+ if self.cfg.num_seg_class == 3:
+ seg_label = torch.stack((seg0[0],seg1[0],seg2[0]),0)
+ else:
+ seg_label = torch.stack((seg2[0], seg1[0]),0)
+
+ lane_label = torch.stack((lane2[0], lane1[0]),0)
+ # _, gt_mask = torch.max(seg_label, 0)
+ # _ = show_seg_result(img, gt_mask, idx, 0, save_dir='debug', is_gt=True)
+
+
+ target = [labels_out, seg_label, lane_label]
+ img = self.transform(img)
+
+ return img, target, data["image"], shapes
+
+ def select_data(self, db):
+ """
+ You can use this function to filter useless images in the dataset
+
+ Inputs:
+ -db: (list)database
+
+ Returns:
+ -db_selected: (list)filtered dataset
+ """
+ db_selected = ...
+ return db_selected
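+ # A minimal sketch of select_data, assuming each db entry carries the 'label'
+ # array built in _get_db (hypothetical filter: drop images without any boxes):
+ #
+ #   def select_data(self, db):
+ #       return [rec for rec in db if len(rec['label'])]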
+
+ @staticmethod
+ def collate_fn(batch):
+ img, label, paths, shapes= zip(*batch)
+ label_det, label_seg, label_lane = [], [], []
+ for i, l in enumerate(label):
+ l_det, l_seg, l_lane = l
+ l_det[:, 0] = i # add target image index for build_targets()
+ label_det.append(l_det)
+ label_seg.append(l_seg)
+ label_lane.append(l_lane)
+ return torch.stack(img, 0), [torch.cat(label_det, 0), torch.stack(label_seg, 0), torch.stack(label_lane, 0)], paths, shapes
+
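+# Usage sketch (illustrative only; the `cfg` object, input size and transform are
+# assumptions, not fixed APIs of this file):
+#
+#   import torchvision.transforms as transforms
+#   from torch.utils.data import DataLoader
+#   from lib.dataset import BddDataset, AutoDriveDataset
+#
+#   transform = transforms.Compose([transforms.ToTensor()])
+#   dataset = BddDataset(cfg, is_train=True, inputsize=640, transform=transform)
+#   loader = DataLoader(dataset, batch_size=8, shuffle=True,
+#                       collate_fn=AutoDriveDataset.collate_fn)
+#   imgs, (det_targets, da_seg, ll_seg), paths, shapes = next(iter(loader))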
diff --git a/lib/dataset/DemoDataset.py b/lib/dataset/DemoDataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..2a7f39bcfd9e19985a3c87f5c2498a4b982b9a71
--- /dev/null
+++ b/lib/dataset/DemoDataset.py
@@ -0,0 +1,188 @@
+import glob
+import os
+import random
+import shutil
+import time
+from pathlib import Path
+from threading import Thread
+
+import cv2
+import math
+import numpy as np
+import torch
+from PIL import Image, ExifTags
+from torch.utils.data import Dataset
+from tqdm import tqdm
+
+from ..utils import letterbox_for_img, clean_str
+
+img_formats = ['.bmp', '.jpg', '.jpeg', '.png', '.tif', '.tiff', '.dng']
+vid_formats = ['.mov', '.avi', '.mp4', '.mpg', '.mpeg', '.m4v', '.wmv', '.mkv']
+
+class LoadImages: # for inference
+ def __init__(self, path, img_size=640):
+ p = str(Path(path)) # os-agnostic
+ p = os.path.abspath(p) # absolute path
+ if '*' in p:
+ files = sorted(glob.glob(p, recursive=True)) # glob
+ elif os.path.isdir(p):
+ files = sorted(glob.glob(os.path.join(p, '*.*'))) # dir
+ elif os.path.isfile(p):
+ files = [p] # files
+ else:
+ raise Exception('ERROR: %s does not exist' % p)
+
+ images = [x for x in files if os.path.splitext(x)[-1].lower() in img_formats]
+ videos = [x for x in files if os.path.splitext(x)[-1].lower() in vid_formats]
+ ni, nv = len(images), len(videos)
+
+ self.img_size = img_size
+ self.files = images + videos
+ self.nf = ni + nv # number of files
+ self.video_flag = [False] * ni + [True] * nv
+ self.mode = 'images'
+ if any(videos):
+ self.new_video(videos[0]) # new video
+ else:
+ self.cap = None
+ assert self.nf > 0, 'No images or videos found in %s. Supported formats are:\nimages: %s\nvideos: %s' % \
+ (p, img_formats, vid_formats)
+
+ def __iter__(self):
+ self.count = 0
+ return self
+
+ def __next__(self):
+ if self.count == self.nf:
+ raise StopIteration
+ path = self.files[self.count]
+
+ if self.video_flag[self.count]:
+ # Read video
+ self.mode = 'video'
+ ret_val, img0 = self.cap.read()
+ if not ret_val:
+ self.count += 1
+ self.cap.release()
+ if self.count == self.nf: # last video
+ raise StopIteration
+ else:
+ path = self.files[self.count]
+ self.new_video(path)
+ ret_val, img0 = self.cap.read()
+ h0, w0 = img0.shape[:2]
+
+ self.frame += 1
+ print('\n video %g/%g (%g/%g) %s: ' % (self.count + 1, self.nf, self.frame, self.nframes, path), end='')
+
+ else:
+ # Read image
+ self.count += 1
+ img0 = cv2.imread(path, cv2.IMREAD_COLOR | cv2.IMREAD_IGNORE_ORIENTATION) # BGR
+ #img0 = cv2.cvtColor(img0, cv2.COLOR_BGR2RGB)
+ assert img0 is not None, 'Image Not Found ' + path
+ print('image %g/%g %s: \n' % (self.count, self.nf, path), end='')
+ h0, w0 = img0.shape[:2]
+
+ # Padded resize
+ img, ratio, pad = letterbox_for_img(img0, new_shape=self.img_size, auto=True)
+ h, w = img.shape[:2]
+ shapes = (h0, w0), ((h / h0, w / w0), pad)
+
+ # Convert
+ #img = img[:, :, ::-1].transpose(2, 0, 1) # BGR to RGB, to 3x416x416
+ img = np.ascontiguousarray(img)
+
+
+ # cv2.imwrite(path + '.letterbox.jpg', 255 * img.transpose((1, 2, 0))[:, :, ::-1]) # save letterbox image
+ return path, img, img0, self.cap, shapes
+
+ def new_video(self, path):
+ self.frame = 0
+ self.cap = cv2.VideoCapture(path)
+ self.nframes = int(self.cap.get(cv2.CAP_PROP_FRAME_COUNT))
+
+ def __len__(self):
+ return self.nf # number of files
+
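+# Usage sketch for LoadImages (the path is a placeholder):
+#
+#   dataset = LoadImages('inference/images', img_size=640)
+#   for path, img, img0, vid_cap, shapes in dataset:
+#       ...  # img is the letterboxed frame, img0 the original BGR frame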
+
+
+class LoadStreams: # multiple IP or RTSP cameras
+ def __init__(self, sources='streams.txt', img_size=640, auto=True):
+ self.mode = 'stream'
+ self.img_size = img_size
+
+ if os.path.isfile(sources):
+ with open(sources, 'r') as f:
+ sources = [x.strip() for x in f.read().strip().splitlines() if len(x.strip())]
+ else:
+ sources = [sources]
+
+ n = len(sources)
+ self.imgs, self.fps, self.frames, self.threads = [None] * n, [0] * n, [0] * n, [None] * n
+ self.sources = [clean_str(x) for x in sources] # clean source names for later
+ self.auto = auto
+ for i, s in enumerate(sources): # index, source
+ # Start thread to read frames from video stream
+ print(f'{i + 1}/{n}: {s}... ', end='')
+ s = eval(s) if s.isnumeric() else s # i.e. s = '0' local webcam
+ cap = cv2.VideoCapture(s)
+ assert cap.isOpened(), f'Failed to open {s}'
+ w = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
+ h = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
+ self.fps[i] = max(cap.get(cv2.CAP_PROP_FPS) % 100, 0) or 30.0 # 30 FPS fallback
+ self.frames[i] = max(int(cap.get(cv2.CAP_PROP_FRAME_COUNT)), 0) or float('inf') # infinite stream fallback
+
+ _, self.imgs[i] = cap.read() # guarantee first frame
+ self.threads[i] = Thread(target=self.update, args=([i, cap]), daemon=True)
+ print(f" success ({self.frames[i]} frames {w}x{h} at {self.fps[i]:.2f} FPS)")
+ self.threads[i].start()
+ print('') # newline
+
+ # check for common shapes
+
+ s = np.stack([letterbox_for_img(x, self.img_size, auto=self.auto)[0].shape for x in self.imgs], 0) # shapes
+ self.rect = np.unique(s, axis=0).shape[0] == 1 # rect inference if all shapes equal
+ if not self.rect:
+ print('WARNING: Different stream shapes detected. For optimal performance supply similarly-shaped streams.')
+
+ def update(self, i, cap):
+ # Read stream `i` frames in daemon thread
+ n, f, read = 0, self.frames[i], 1 # frame number, frame array, inference every 'read' frame
+ while cap.isOpened() and n < f:
+ n += 1
+ # _, self.imgs[index] = cap.read()
+ cap.grab()
+ if n % read == 0:
+ success, im = cap.retrieve()
+ self.imgs[i] = im if success else self.imgs[i] * 0
+ time.sleep(1 / self.fps[i]) # wait time
+
+ def __iter__(self):
+ self.count = -1
+ return self
+
+ def __next__(self):
+ self.count += 1
+ if not all(x.is_alive() for x in self.threads) or cv2.waitKey(1) == ord('q'): # q to quit
+ cv2.destroyAllWindows()
+ raise StopIteration
+
+ # Letterbox
+ img0 = self.imgs.copy()
+
+ h0, w0 = img0[0].shape[:2]
+ img, _, pad = letterbox_for_img(img0[0], self.img_size, auto=self.rect and self.auto)
+
+ # Stack
+ h, w = img.shape[:2]
+ shapes = (h0, w0), ((h / h0, w / w0), pad)
+
+ # Convert
+ #img = img[..., ::-1].transpose((0, 3, 1, 2)) # BGR to RGB, BHWC to BCHW
+ img = np.ascontiguousarray(img)
+
+ return self.sources, img, img0[0], None, shapes
+
+ def __len__(self):
+ return len(self.sources) # number of sources
\ No newline at end of file
diff --git a/lib/dataset/__init__.py b/lib/dataset/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a9d39e5d658297f283a6c49b6a63c1f143442bc9
--- /dev/null
+++ b/lib/dataset/__init__.py
@@ -0,0 +1,3 @@
+from .bdd import BddDataset
+from .AutoDriveDataset import AutoDriveDataset
+from .DemoDataset import LoadImages, LoadStreams
diff --git a/lib/dataset/__pycache__/AutoDriveDataset.cpython-310.pyc b/lib/dataset/__pycache__/AutoDriveDataset.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..bb4a74de98f76754906aab65ead9035af662b626
Binary files /dev/null and b/lib/dataset/__pycache__/AutoDriveDataset.cpython-310.pyc differ
diff --git a/lib/dataset/__pycache__/AutoDriveDataset.cpython-37.pyc b/lib/dataset/__pycache__/AutoDriveDataset.cpython-37.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6a4ec2ac802c6175e1d7b126028df226ea3e4062
Binary files /dev/null and b/lib/dataset/__pycache__/AutoDriveDataset.cpython-37.pyc differ
diff --git a/lib/dataset/__pycache__/DemoDataset.cpython-310.pyc b/lib/dataset/__pycache__/DemoDataset.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..74a7c4977c499c23358b3d7f77b8c738520ee04e
Binary files /dev/null and b/lib/dataset/__pycache__/DemoDataset.cpython-310.pyc differ
diff --git a/lib/dataset/__pycache__/DemoDataset.cpython-37.pyc b/lib/dataset/__pycache__/DemoDataset.cpython-37.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..10d72d543ce93325fb9be7a1122d6dd4fc5bf029
Binary files /dev/null and b/lib/dataset/__pycache__/DemoDataset.cpython-37.pyc differ
diff --git a/lib/dataset/__pycache__/__init__.cpython-310.pyc b/lib/dataset/__pycache__/__init__.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8c1ab9c872ecd7dde398dc180ca79c1da88beac2
Binary files /dev/null and b/lib/dataset/__pycache__/__init__.cpython-310.pyc differ
diff --git a/lib/dataset/__pycache__/__init__.cpython-37.pyc b/lib/dataset/__pycache__/__init__.cpython-37.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e4d4b7a649073ff40da78b3f2233a09c0e2f06cb
Binary files /dev/null and b/lib/dataset/__pycache__/__init__.cpython-37.pyc differ
diff --git a/lib/dataset/__pycache__/bdd.cpython-310.pyc b/lib/dataset/__pycache__/bdd.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b8377690cf5d4f04b5f323e4604307d5d2ff2e90
Binary files /dev/null and b/lib/dataset/__pycache__/bdd.cpython-310.pyc differ
diff --git a/lib/dataset/__pycache__/bdd.cpython-37.pyc b/lib/dataset/__pycache__/bdd.cpython-37.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..cc5ca855a52ee154a35deae50748225f145b20e0
Binary files /dev/null and b/lib/dataset/__pycache__/bdd.cpython-37.pyc differ
diff --git a/lib/dataset/__pycache__/convert.cpython-310.pyc b/lib/dataset/__pycache__/convert.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..58e860a348c0ace243f2cb723ad9fee8f99e2ee0
Binary files /dev/null and b/lib/dataset/__pycache__/convert.cpython-310.pyc differ
diff --git a/lib/dataset/__pycache__/convert.cpython-37.pyc b/lib/dataset/__pycache__/convert.cpython-37.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e41846711230d8a851730a5a5062045402367350
Binary files /dev/null and b/lib/dataset/__pycache__/convert.cpython-37.pyc differ
diff --git a/lib/dataset/bdd.py b/lib/dataset/bdd.py
new file mode 100644
index 0000000000000000000000000000000000000000..538ab4d5cae57fbfac6d9a3ba07c8f72e083e8c5
--- /dev/null
+++ b/lib/dataset/bdd.py
@@ -0,0 +1,85 @@
+import numpy as np
+import json
+
+from .AutoDriveDataset import AutoDriveDataset
+from .convert import convert, id_dict, id_dict_single
+from tqdm import tqdm
+
+single_cls = True # detect vehicles only (single class)
+
+class BddDataset(AutoDriveDataset):
+ def __init__(self, cfg, is_train, inputsize, transform=None):
+ super().__init__(cfg, is_train, inputsize, transform)
+ self.db = self._get_db()
+ self.cfg = cfg
+
+ def _get_db(self):
+ """
+ get database from the annotation file
+
+ Inputs:
+
+ Returns:
+ gt_db: (list)database [a,b,c,...]
+ a: (dictionary){'image':, 'information':, ......}
+ image: image path
+ mask: path of the segmentation label
+ label: [cls_id, center_x, center_y, w, h] (normalized to [0, 1] by image width/height)
+ """
+ print('building database...')
+ gt_db = []
+ height, width = self.shapes
+ for mask in tqdm(list(self.mask_list)):
+ mask_path = str(mask)
+ label_path = mask_path.replace(str(self.mask_root), str(self.label_root)).replace(".png", ".json")
+ image_path = mask_path.replace(str(self.mask_root), str(self.img_root)).replace(".png", ".jpg")
+ lane_path = mask_path.replace(str(self.mask_root), str(self.lane_root))
+ with open(label_path, 'r') as f:
+ label = json.load(f)
+ data = label['frames'][0]['objects']
+ data = self.filter_data(data)
+ gt = np.zeros((len(data), 5))
+ for idx, obj in enumerate(data):
+ category = obj['category']
+ if category == "traffic light":
+ color = obj['attributes']['trafficLightColor']
+ category = "tl_" + color
+ if category in id_dict.keys():
+ x1 = float(obj['box2d']['x1'])
+ y1 = float(obj['box2d']['y1'])
+ x2 = float(obj['box2d']['x2'])
+ y2 = float(obj['box2d']['y2'])
+ cls_id = id_dict[category]
+ if single_cls:
+ cls_id=0
+ gt[idx][0] = cls_id
+ box = convert((width, height), (x1, x2, y1, y2))
+ gt[idx][1:] = list(box)
+
+
+ rec = [{
+ 'image': image_path,
+ 'label': gt,
+ 'mask': mask_path,
+ 'lane': lane_path
+ }]
+
+ gt_db += rec
+ print('database build finished')
+ return gt_db
+
+ def filter_data(self, data):
+ remain = []
+ for obj in data:
+ if 'box2d' in obj.keys(): # obj.has_key('box2d'):
+ if single_cls:
+ if obj['category'] in id_dict_single.keys():
+ remain.append(obj)
+ else:
+ remain.append(obj)
+ return remain
+
+ def evaluate(self, cfg, preds, output_dir, *args, **kwargs):
+ """
+ """
+ pass
diff --git a/lib/dataset/convert.py b/lib/dataset/convert.py
new file mode 100644
index 0000000000000000000000000000000000000000..da92c95717e8e9554e110e2792a9189ad12d7c10
--- /dev/null
+++ b/lib/dataset/convert.py
@@ -0,0 +1,31 @@
+# bdd_labels = {
+# 'unlabeled':0, 'dynamic': 1, 'ego vehicle': 2, 'ground': 3,
+# 'static': 4, 'parking': 5, 'rail track': 6, 'road': 7,
+# 'sidewalk': 8, 'bridge': 9, 'building': 10, 'fence': 11,
+# 'garage': 12, 'guard rail': 13, 'tunnel': 14, 'wall': 15,
+# 'banner': 16, 'billboard': 17, 'lane divider': 18,'parking sign': 19,
+# 'pole': 20, 'polegroup': 21, 'street light': 22, 'traffic cone': 23,
+# 'traffic device': 24, 'traffic light': 25, 'traffic sign': 26, 'traffic sign frame': 27,
+# 'terrain': 28, 'vegetation': 29, 'sky': 30, 'person': 31,
+# 'rider': 32, 'bicycle': 33, 'bus': 34, 'car': 35,
+# 'caravan': 36, 'motorcycle': 37, 'trailer': 38, 'train': 39,
+# 'truck': 40
+# }
+id_dict = {'person': 0, 'rider': 1, 'car': 2, 'bus': 3, 'truck': 4,
+'bike': 5, 'motor': 6, 'tl_green': 7, 'tl_red': 8,
+'tl_yellow': 9, 'tl_none': 10, 'traffic sign': 11, 'train': 12}
+id_dict_single = {'car': 0, 'bus': 1, 'truck': 2,'train': 3}
+# id_dict = {'car': 0, 'bus': 1, 'truck': 2}
+
+def convert(size, box):
+ dw = 1./(size[0])
+ dh = 1./(size[1])
+ x = (box[0] + box[1])/2.0
+ y = (box[2] + box[3])/2.0
+ w = box[1] - box[0]
+ h = box[3] - box[2]
+ x = x*dw
+ w = w*dw
+ y = y*dh
+ h = h*dh
+ return (x,y,w,h)
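+# Worked example: for a 1280x720 image and a box with x1=100, x2=300, y1=200, y2=400,
+# convert((1280, 720), (100, 300, 200, 400)) returns
+# (200/1280, 300/720, 200/1280, 200/720) = (0.15625, 0.4167, 0.15625, 0.2778),
+# i.e. the normalized (center_x, center_y, width, height) used as detection labels.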
diff --git a/lib/dataset/hust.py b/lib/dataset/hust.py
new file mode 100644
index 0000000000000000000000000000000000000000..145f931cf90c9d12a00c91ffa971ca742a7a91bf
--- /dev/null
+++ b/lib/dataset/hust.py
@@ -0,0 +1,87 @@
+import numpy as np
+import json
+
+from .AutoDriveDataset import AutoDriveDataset
+from .convert import convert, id_dict, id_dict_single
+from tqdm import tqdm
+import os
+
+single_cls = False # set True to detect vehicles only
+
+class HustDataset(AutoDriveDataset):
+ def __init__(self, cfg, is_train, inputsize, transform=None):
+ super().__init__(cfg, is_train, inputsize, transform)
+ self.db = self._get_db()
+ self.cfg = cfg
+
+ def _get_db(self):
+ """
+ get database from the annotation file
+
+ Inputs:
+
+ Returns:
+ gt_db: (list)database [a,b,c,...]
+ a: (dictionary){'image':, 'information':, ......}
+ image: image path
+ mask: path of the segmentation label
+ label: [cls_id, center_x, center_y, w, h] (normalized to [0, 1] by image width/height)
+ """
+ print('building database...')
+ gt_db = []
+ height, width = self.shapes
+ for mask in tqdm(list(self.mask_list)):
+ mask_path = str(mask)
+ label_path = self.label_root
+ # label_path = mask_path.replace(str(self.mask_root), str(self.label_root)).replace(".png", ".json")
+ image_path = mask_path.replace(str(self.mask_root), str(self.img_root)).replace(".png", ".jpg")
+ lane_path = mask_path.replace(str(self.mask_root), str(self.lane_root))
+ with open(label_path, 'r') as f:
+ label = json.load(f)
+ data = label[int(os.path.basename(image_path)[:-4])]["labels"]
+ data = self.filter_data(data)
+ gt = np.zeros((len(data), 5))
+ for idx, obj in enumerate(data):
+ category = obj['category']
+ if category == "traffic light":
+ color = obj['attributes']['Traffic Light Color'][0]
+ category = "tl_" + color
+ if category in id_dict.keys():
+ x1 = float(obj['box2d']['x1'])
+ y1 = float(obj['box2d']['y1'])
+ x2 = float(obj['box2d']['x2'])
+ y2 = float(obj['box2d']['y2'])
+ cls_id = id_dict[category]
+ if single_cls:
+ cls_id=0
+ gt[idx][0] = cls_id
+ box = convert((width, height), (x1, x2, y1, y2))
+ gt[idx][1:] = list(box)
+
+
+ rec = [{
+ 'image': image_path,
+ 'label': gt,
+ 'mask': mask_path,
+ 'lane': lane_path
+ }]
+
+ gt_db += rec
+ print('database build finished')
+ return gt_db
+
+ def filter_data(self, data):
+ remain = []
+ for obj in data:
+ if 'box2d' in obj.keys(): # obj.has_key('box2d'):
+ if single_cls:
+ if obj['category'] in id_dict_single.keys():
+ remain.append(obj)
+ else:
+ remain.append(obj)
+ return remain
+
+ def evaluate(self, cfg, preds, output_dir, *args, **kwargs):
+ """
+ """
+ pass
diff --git a/lib/models/YOLOP.py b/lib/models/YOLOP.py
new file mode 100644
index 0000000000000000000000000000000000000000..4d68f4658a3d1c94c56eab411f166ff8a18e0464
--- /dev/null
+++ b/lib/models/YOLOP.py
@@ -0,0 +1,596 @@
+import torch
+from torch import tensor
+import torch.nn as nn
+import sys,os
+import math
+import sys
+sys.path.append(os.getcwd())
+#sys.path.append("lib/models")
+#sys.path.append("lib/utils")
+#sys.path.append("/workspace/wh/projects/DaChuang")
+from lib.utils import initialize_weights
+# from lib.models.common2 import DepthSeperabelConv2d as Conv
+# from lib.models.common2 import SPP, Bottleneck, BottleneckCSP, Focus, Concat, Detect
+from lib.models.common import Conv, SPP, Bottleneck, BottleneckCSP, Focus, Concat, Detect, SharpenConv
+from torch.nn import Upsample
+from lib.utils import check_anchor_order
+from lib.core.evaluate import SegmentationMetric
+from lib.utils.utils import time_synchronized
+
+"""
+MCnet_SPP = [
+[ -1, Focus, [3, 32, 3]],
+[ -1, Conv, [32, 64, 3, 2]],
+[ -1, BottleneckCSP, [64, 64, 1]],
+[ -1, Conv, [64, 128, 3, 2]],
+[ -1, BottleneckCSP, [128, 128, 3]],
+[ -1, Conv, [128, 256, 3, 2]],
+[ -1, BottleneckCSP, [256, 256, 3]],
+[ -1, Conv, [256, 512, 3, 2]],
+[ -1, SPP, [512, 512, [5, 9, 13]]],
+[ -1, BottleneckCSP, [512, 512, 1, False]],
+[ -1, Conv,[512, 256, 1, 1]],
+[ -1, Upsample, [None, 2, 'nearest']],
+[ [-1, 6], Concat, [1]],
+[ -1, BottleneckCSP, [512, 256, 1, False]],
+[ -1, Conv, [256, 128, 1, 1]],
+[ -1, Upsample, [None, 2, 'nearest']],
+[ [-1,4], Concat, [1]],
+[ -1, BottleneckCSP, [256, 128, 1, False]],
+[ -1, Conv, [128, 128, 3, 2]],
+[ [-1, 14], Concat, [1]],
+[ -1, BottleneckCSP, [256, 256, 1, False]],
+[ -1, Conv, [256, 256, 3, 2]],
+[ [-1, 10], Concat, [1]],
+[ -1, BottleneckCSP, [512, 512, 1, False]],
+# [ [17, 20, 23], Detect, [1, [[3,9,5,11,4,20], [7,18,6,39,12,31], [19,50,38,81,68,157]], [128, 256, 512]]],
+[ [17, 20, 23], Detect, [13, [[3,9,5,11,4,20], [7,18,6,39,12,31], [19,50,38,81,68,157]], [128, 256, 512]]],
+[ 17, Conv, [128, 64, 3, 1]],
+[ -1, Upsample, [None, 2, 'nearest']],
+[ [-1,2], Concat, [1]],
+[ -1, BottleneckCSP, [128, 64, 1, False]],
+[ -1, Conv, [64, 32, 3, 1]],
+[ -1, Upsample, [None, 2, 'nearest']],
+[ -1, Conv, [32, 16, 3, 1]],
+[ -1, BottleneckCSP, [16, 8, 1, False]],
+[ -1, Upsample, [None, 2, 'nearest']],
+[ -1, SPP, [8, 2, [5, 9, 13]]] #segmentation output
+]
+# [2,6,3,9,5,13], [7,19,11,26,17,39], [28,64,44,103,61,183]
+
+MCnet_0 = [
+[ -1, Focus, [3, 32, 3]],
+[ -1, Conv, [32, 64, 3, 2]],
+[ -1, BottleneckCSP, [64, 64, 1]],
+[ -1, Conv, [64, 128, 3, 2]],
+[ -1, BottleneckCSP, [128, 128, 3]],
+[ -1, Conv, [128, 256, 3, 2]],
+[ -1, BottleneckCSP, [256, 256, 3]],
+[ -1, Conv, [256, 512, 3, 2]],
+[ -1, SPP, [512, 512, [5, 9, 13]]],
+[ -1, BottleneckCSP, [512, 512, 1, False]],
+[ -1, Conv,[512, 256, 1, 1]],
+[ -1, Upsample, [None, 2, 'nearest']],
+[ [-1, 6], Concat, [1]],
+[ -1, BottleneckCSP, [512, 256, 1, False]],
+[ -1, Conv, [256, 128, 1, 1]],
+[ -1, Upsample, [None, 2, 'nearest']],
+[ [-1,4], Concat, [1]],
+[ -1, BottleneckCSP, [256, 128, 1, False]],
+[ -1, Conv, [128, 128, 3, 2]],
+[ [-1, 14], Concat, [1]],
+[ -1, BottleneckCSP, [256, 256, 1, False]],
+[ -1, Conv, [256, 256, 3, 2]],
+[ [-1, 10], Concat, [1]],
+[ -1, BottleneckCSP, [512, 512, 1, False]],
+[ [17, 20, 23], Detect, [1, [[3,9,5,11,4,20], [7,18,6,39,12,31], [19,50,38,81,68,157]], [128, 256, 512]]], #Detect output 24
+
+[ 16, Conv, [128, 64, 3, 1]],
+[ -1, Upsample, [None, 2, 'nearest']],
+[ [-1,2], Concat, [1]],
+[ -1, BottleneckCSP, [128, 64, 1, False]],
+[ -1, Conv, [64, 32, 3, 1]],
+[ -1, Upsample, [None, 2, 'nearest']],
+[ -1, Conv, [32, 16, 3, 1]],
+[ -1, BottleneckCSP, [16, 8, 1, False]],
+[ -1, Upsample, [None, 2, 'nearest']],
+[ -1, Conv, [8, 2, 3, 1]], #Driving area segmentation output
+
+[ 16, Conv, [128, 64, 3, 1]],
+[ -1, Upsample, [None, 2, 'nearest']],
+[ [-1,2], Concat, [1]],
+[ -1, BottleneckCSP, [128, 64, 1, False]],
+[ -1, Conv, [64, 32, 3, 1]],
+[ -1, Upsample, [None, 2, 'nearest']],
+[ -1, Conv, [32, 16, 3, 1]],
+[ -1, BottleneckCSP, [16, 8, 1, False]],
+[ -1, Upsample, [None, 2, 'nearest']],
+[ -1, Conv, [8, 2, 3, 1]], #Lane line segmentation output
+]
+
+
+# The lane line and the driving area segment branches share information with each other
+MCnet_share = [
+[ -1, Focus, [3, 32, 3]], #0
+[ -1, Conv, [32, 64, 3, 2]], #1
+[ -1, BottleneckCSP, [64, 64, 1]], #2
+[ -1, Conv, [64, 128, 3, 2]], #3
+[ -1, BottleneckCSP, [128, 128, 3]], #4
+[ -1, Conv, [128, 256, 3, 2]], #5
+[ -1, BottleneckCSP, [256, 256, 3]], #6
+[ -1, Conv, [256, 512, 3, 2]], #7
+[ -1, SPP, [512, 512, [5, 9, 13]]], #8
+[ -1, BottleneckCSP, [512, 512, 1, False]], #9
+[ -1, Conv,[512, 256, 1, 1]], #10
+[ -1, Upsample, [None, 2, 'nearest']], #11
+[ [-1, 6], Concat, [1]], #12
+[ -1, BottleneckCSP, [512, 256, 1, False]], #13
+[ -1, Conv, [256, 128, 1, 1]], #14
+[ -1, Upsample, [None, 2, 'nearest']], #15
+[ [-1,4], Concat, [1]], #16
+[ -1, BottleneckCSP, [256, 128, 1, False]], #17
+[ -1, Conv, [128, 128, 3, 2]], #18
+[ [-1, 14], Concat, [1]], #19
+[ -1, BottleneckCSP, [256, 256, 1, False]], #20
+[ -1, Conv, [256, 256, 3, 2]], #21
+[ [-1, 10], Concat, [1]], #22
+[ -1, BottleneckCSP, [512, 512, 1, False]], #23
+[ [17, 20, 23], Detect, [1, [[3,9,5,11,4,20], [7,18,6,39,12,31], [19,50,38,81,68,157]], [128, 256, 512]]], #Detect output 24
+
+[ 16, Conv, [256, 64, 3, 1]], #25
+[ -1, Upsample, [None, 2, 'nearest']], #26
+[ [-1,2], Concat, [1]], #27
+[ -1, BottleneckCSP, [128, 64, 1, False]], #28
+[ -1, Conv, [64, 32, 3, 1]], #29
+[ -1, Upsample, [None, 2, 'nearest']], #30
+[ -1, Conv, [32, 16, 3, 1]], #31
+[ -1, BottleneckCSP, [16, 8, 1, False]], #32 driving area segment neck
+
+[ 16, Conv, [256, 64, 3, 1]], #33
+[ -1, Upsample, [None, 2, 'nearest']], #34
+[ [-1,2], Concat, [1]], #35
+[ -1, BottleneckCSP, [128, 64, 1, False]], #36
+[ -1, Conv, [64, 32, 3, 1]], #37
+[ -1, Upsample, [None, 2, 'nearest']], #38
+[ -1, Conv, [32, 16, 3, 1]], #39
+[ -1, BottleneckCSP, [16, 8, 1, False]], #40 lane line segment neck
+
+[ [31,39], Concat, [1]], #41
+[ -1, Conv, [32, 8, 3, 1]], #42 Share_Block
+
+
+[ [32,42], Concat, [1]], #43
+[ -1, Upsample, [None, 2, 'nearest']], #44
+[ -1, Conv, [16, 2, 3, 1]], #45 Driving area segmentation output
+
+
+[ [40,42], Concat, [1]], #46
+[ -1, Upsample, [None, 2, 'nearest']], #47
+[ -1, Conv, [16, 2, 3, 1]] #48Lane line segmentation output
+]
+
+# The lane line and the driving area segment branches without share information with each other
+MCnet_no_share = [
+[ -1, Focus, [3, 32, 3]], #0
+[ -1, Conv, [32, 64, 3, 2]], #1
+[ -1, BottleneckCSP, [64, 64, 1]], #2
+[ -1, Conv, [64, 128, 3, 2]], #3
+[ -1, BottleneckCSP, [128, 128, 3]], #4
+[ -1, Conv, [128, 256, 3, 2]], #5
+[ -1, BottleneckCSP, [256, 256, 3]], #6
+[ -1, Conv, [256, 512, 3, 2]], #7
+[ -1, SPP, [512, 512, [5, 9, 13]]], #8
+[ -1, BottleneckCSP, [512, 512, 1, False]], #9
+[ -1, Conv,[512, 256, 1, 1]], #10
+[ -1, Upsample, [None, 2, 'nearest']], #11
+[ [-1, 6], Concat, [1]], #12
+[ -1, BottleneckCSP, [512, 256, 1, False]], #13
+[ -1, Conv, [256, 128, 1, 1]], #14
+[ -1, Upsample, [None, 2, 'nearest']], #15
+[ [-1,4], Concat, [1]], #16
+[ -1, BottleneckCSP, [256, 128, 1, False]], #17
+[ -1, Conv, [128, 128, 3, 2]], #18
+[ [-1, 14], Concat, [1]], #19
+[ -1, BottleneckCSP, [256, 256, 1, False]], #20
+[ -1, Conv, [256, 256, 3, 2]], #21
+[ [-1, 10], Concat, [1]], #22
+[ -1, BottleneckCSP, [512, 512, 1, False]], #23
+[ [17, 20, 23], Detect, [1, [[3,9,5,11,4,20], [7,18,6,39,12,31], [19,50,38,81,68,157]], [128, 256, 512]]], #Detect output 24
+
+[ 16, Conv, [256, 64, 3, 1]], #25
+[ -1, Upsample, [None, 2, 'nearest']], #26
+[ [-1,2], Concat, [1]], #27
+[ -1, BottleneckCSP, [128, 64, 1, False]], #28
+[ -1, Conv, [64, 32, 3, 1]], #29
+[ -1, Upsample, [None, 2, 'nearest']], #30
+[ -1, Conv, [32, 16, 3, 1]], #31
+[ -1, BottleneckCSP, [16, 8, 1, False]], #32 driving area segment neck
+[ -1, Upsample, [None, 2, 'nearest']], #33
+[ -1, Conv, [8, 3, 3, 1]], #34 Driving area segmentation output
+
+[ 16, Conv, [256, 64, 3, 1]], #35
+[ -1, Upsample, [None, 2, 'nearest']], #36
+[ [-1,2], Concat, [1]], #37
+[ -1, BottleneckCSP, [128, 64, 1, False]], #38
+[ -1, Conv, [64, 32, 3, 1]], #39
+[ -1, Upsample, [None, 2, 'nearest']], #40
+[ -1, Conv, [32, 16, 3, 1]], #41
+[ -1, BottleneckCSP, [16, 8, 1, False]], #42 lane line segment neck
+[ -1, Upsample, [None, 2, 'nearest']], #43
+[ -1, Conv, [8, 2, 3, 1]] #44 Lane line segmentation output
+]
+
+MCnet_feedback = [
+[ -1, Focus, [3, 32, 3]], #0
+[ -1, Conv, [32, 64, 3, 2]], #1
+[ -1, BottleneckCSP, [64, 64, 1]], #2
+[ -1, Conv, [64, 128, 3, 2]], #3
+[ -1, BottleneckCSP, [128, 128, 3]], #4
+[ -1, Conv, [128, 256, 3, 2]], #5
+[ -1, BottleneckCSP, [256, 256, 3]], #6
+[ -1, Conv, [256, 512, 3, 2]], #7
+[ -1, SPP, [512, 512, [5, 9, 13]]], #8
+[ -1, BottleneckCSP, [512, 512, 1, False]], #9
+[ -1, Conv,[512, 256, 1, 1]], #10
+[ -1, Upsample, [None, 2, 'nearest']], #11
+[ [-1, 6], Concat, [1]], #12
+[ -1, BottleneckCSP, [512, 256, 1, False]], #13
+[ -1, Conv, [256, 128, 1, 1]], #14
+[ -1, Upsample, [None, 2, 'nearest']], #15
+[ [-1,4], Concat, [1]], #16
+[ -1, BottleneckCSP, [256, 128, 1, False]], #17
+[ -1, Conv, [128, 128, 3, 2]], #18
+[ [-1, 14], Concat, [1]], #19
+[ -1, BottleneckCSP, [256, 256, 1, False]], #20
+[ -1, Conv, [256, 256, 3, 2]], #21
+[ [-1, 10], Concat, [1]], #22
+[ -1, BottleneckCSP, [512, 512, 1, False]], #23
+[ [17, 20, 23], Detect, [1, [[3,9,5,11,4,20], [7,18,6,39,12,31], [19,50,38,81,68,157]], [128, 256, 512]]], #Detect output 24
+
+[ 16, Conv, [256, 128, 3, 1]], #25
+[ -1, Upsample, [None, 2, 'nearest']], #26
+[ -1, BottleneckCSP, [128, 64, 1, False]], #28
+[ -1, Conv, [64, 32, 3, 1]], #29
+[ -1, Upsample, [None, 2, 'nearest']], #30
+[ -1, Conv, [32, 16, 3, 1]], #31
+[ -1, BottleneckCSP, [16, 8, 1, False]], #32 driving area segment neck
+[ -1, Upsample, [None, 2, 'nearest']], #33
+[ -1, Conv, [8, 2, 3, 1]], #34 Driving area segmentation output
+
+[ 16, Conv, [256, 128, 3, 1]], #35
+[ -1, Upsample, [None, 2, 'nearest']], #36
+[ -1, BottleneckCSP, [128, 64, 1, False]], #38
+[ -1, Conv, [64, 32, 3, 1]], #39
+[ -1, Upsample, [None, 2, 'nearest']], #40
+[ -1, Conv, [32, 16, 3, 1]], #41
+[ -1, BottleneckCSP, [16, 8, 1, False]], #42 lane line segment neck
+[ -1, Upsample, [None, 2, 'nearest']], #43
+[ -1, Conv, [8, 2, 3, 1]] #44 Lane line segmentation output
+]
+
+
+MCnet_Da_feedback1 = [
+[46, 26, 35], #Det_out_idx, Da_Segout_idx, LL_Segout_idx
+[ -1, Focus, [3, 32, 3]], #0
+[ -1, Conv, [32, 64, 3, 2]], #1
+[ -1, BottleneckCSP, [64, 64, 1]], #2
+[ -1, Conv, [64, 128, 3, 2]], #3
+[ -1, BottleneckCSP, [128, 128, 3]], #4
+[ -1, Conv, [128, 256, 3, 2]], #5
+[ -1, BottleneckCSP, [256, 256, 3]], #6
+[ -1, Conv, [256, 512, 3, 2]], #7
+[ -1, SPP, [512, 512, [5, 9, 13]]], #8
+[ -1, BottleneckCSP, [512, 512, 1, False]], #9
+[ -1, Conv,[512, 256, 1, 1]], #10
+[ -1, Upsample, [None, 2, 'nearest']], #11
+[ [-1, 6], Concat, [1]], #12
+[ -1, BottleneckCSP, [512, 256, 1, False]], #13
+[ -1, Conv, [256, 128, 1, 1]], #14
+[ -1, Upsample, [None, 2, 'nearest']], #15
+[ [-1,4], Concat, [1]], #16 backbone+fpn
+[ -1,Conv,[256,256,1,1]], #17
+
+
+[ 16, Conv, [256, 128, 3, 1]], #18
+[ -1, Upsample, [None, 2, 'nearest']], #19
+[ -1, BottleneckCSP, [128, 64, 1, False]], #20
+[ -1, Conv, [64, 32, 3, 1]], #21
+[ -1, Upsample, [None, 2, 'nearest']], #22
+[ -1, Conv, [32, 16, 3, 1]], #23
+[ -1, BottleneckCSP, [16, 8, 1, False]], #24 driving area segment neck
+[ -1, Upsample, [None, 2, 'nearest']], #25
+[ -1, Conv, [8, 2, 3, 1]], #26 Driving area segmentation output
+
+
+[ 16, Conv, [256, 128, 3, 1]], #27
+[ -1, Upsample, [None, 2, 'nearest']], #28
+[ -1, BottleneckCSP, [128, 64, 1, False]], #29
+[ -1, Conv, [64, 32, 3, 1]], #30
+[ -1, Upsample, [None, 2, 'nearest']], #31
+[ -1, Conv, [32, 16, 3, 1]], #32
+[ -1, BottleneckCSP, [16, 8, 1, False]], #33 lane line segment neck
+[ -1, Upsample, [None, 2, 'nearest']], #34
+[ -1, Conv, [8, 2, 3, 1]], #35Lane line segmentation output
+
+
+[ 23, Conv, [16, 16, 3, 2]], #36
+[ -1, Conv, [16, 32, 3, 2]], #2 times 2xdownsample 37
+
+[ [-1,17], Concat, [1]], #38
+[ -1, BottleneckCSP, [288, 128, 1, False]], #39
+[ -1, Conv, [128, 128, 3, 2]], #40
+[ [-1, 14], Concat, [1]], #41
+[ -1, BottleneckCSP, [256, 256, 1, False]], #42
+[ -1, Conv, [256, 256, 3, 2]], #43
+[ [-1, 10], Concat, [1]], #44
+[ -1, BottleneckCSP, [512, 512, 1, False]], #45
+[ [39, 42, 45], Detect, [1, [[3,9,5,11,4,20], [7,18,6,39,12,31], [19,50,38,81,68,157]], [128, 256, 512]]] #Detect output 46
+]
+
+
+
+# The lane line and the driving area segment branches share information with each other and feedback to det_head
+MCnet_Da_feedback2 = [
+[47, 26, 35], #Det_out_idx, Da_Segout_idx, LL_Segout_idx
+[25, 28, 31, 33], #layer in Da_branch to do SAD
+[34, 37, 40, 42], #layer in LL_branch to do SAD
+[ -1, Focus, [3, 32, 3]], #0
+[ -1, Conv, [32, 64, 3, 2]], #1
+[ -1, BottleneckCSP, [64, 64, 1]], #2
+[ -1, Conv, [64, 128, 3, 2]], #3
+[ -1, BottleneckCSP, [128, 128, 3]], #4
+[ -1, Conv, [128, 256, 3, 2]], #5
+[ -1, BottleneckCSP, [256, 256, 3]], #6
+[ -1, Conv, [256, 512, 3, 2]], #7
+[ -1, SPP, [512, 512, [5, 9, 13]]], #8
+[ -1, BottleneckCSP, [512, 512, 1, False]], #9
+[ -1, Conv,[512, 256, 1, 1]], #10
+[ -1, Upsample, [None, 2, 'nearest']], #11
+[ [-1, 6], Concat, [1]], #12
+[ -1, BottleneckCSP, [512, 256, 1, False]], #13
+[ -1, Conv, [256, 128, 1, 1]], #14
+[ -1, Upsample, [None, 2, 'nearest']], #15
+[ [-1,4], Concat, [1]], #16 backbone+fpn
+[ -1,Conv,[256,256,1,1]], #17
+
+
+[ 16, Conv, [256, 128, 3, 1]], #18
+[ -1, Upsample, [None, 2, 'nearest']], #19
+[ -1, BottleneckCSP, [128, 64, 1, False]], #20
+[ -1, Conv, [64, 32, 3, 1]], #21
+[ -1, Upsample, [None, 2, 'nearest']], #22
+[ -1, Conv, [32, 16, 3, 1]], #23
+[ -1, BottleneckCSP, [16, 8, 1, False]], #24 driving area segment neck
+[ -1, Upsample, [None, 2, 'nearest']], #25
+[ -1, Conv, [8, 2, 3, 1]], #26 Driving area segmentation output
+
+
+[ 16, Conv, [256, 128, 3, 1]], #27
+[ -1, Upsample, [None, 2, 'nearest']], #28
+[ -1, BottleneckCSP, [128, 64, 1, False]], #29
+[ -1, Conv, [64, 32, 3, 1]], #30
+[ -1, Upsample, [None, 2, 'nearest']], #31
+[ -1, Conv, [32, 16, 3, 1]], #32
+[ -1, BottleneckCSP, [16, 8, 1, False]], #33 lane line segment neck
+[ -1, Upsample, [None, 2, 'nearest']], #34
+[ -1, Conv, [8, 2, 3, 1]], #35Lane line segmentation output
+
+
+[ 23, Conv, [16, 64, 3, 2]], #36
+[ -1, Conv, [64, 256, 3, 2]], #2 times 2xdownsample 37
+
+[ [-1,17], Concat, [1]], #38
+
+[-1, Conv, [512, 256, 3, 1]], #39
+[ -1, BottleneckCSP, [256, 128, 1, False]], #40
+[ -1, Conv, [128, 128, 3, 2]], #41
+[ [-1, 14], Concat, [1]], #42
+[ -1, BottleneckCSP, [256, 256, 1, False]], #43
+[ -1, Conv, [256, 256, 3, 2]], #44
+[ [-1, 10], Concat, [1]], #45
+[ -1, BottleneckCSP, [512, 512, 1, False]], #46
+[ [40, 42, 45], Detect, [1, [[3,9,5,11,4,20], [7,18,6,39,12,31], [19,50,38,81,68,157]], [128, 256, 512]]] #Detect output 47
+]
+
+MCnet_share1 = [
+[24, 33, 45], #Det_out_idx, Da_Segout_idx, LL_Segout_idx
+[25, 28, 31, 33], #layer in Da_branch to do SAD
+[34, 37, 40, 42], #layer in LL_branch to do SAD
+[ -1, Focus, [3, 32, 3]], #0
+[ -1, Conv, [32, 64, 3, 2]], #1
+[ -1, BottleneckCSP, [64, 64, 1]], #2
+[ -1, Conv, [64, 128, 3, 2]], #3
+[ -1, BottleneckCSP, [128, 128, 3]], #4
+[ -1, Conv, [128, 256, 3, 2]], #5
+[ -1, BottleneckCSP, [256, 256, 3]], #6
+[ -1, Conv, [256, 512, 3, 2]], #7
+[ -1, SPP, [512, 512, [5, 9, 13]]], #8
+[ -1, BottleneckCSP, [512, 512, 1, False]], #9
+[ -1, Conv,[512, 256, 1, 1]], #10
+[ -1, Upsample, [None, 2, 'nearest']], #11
+[ [-1, 6], Concat, [1]], #12
+[ -1, BottleneckCSP, [512, 256, 1, False]], #13
+[ -1, Conv, [256, 128, 1, 1]], #14
+[ -1, Upsample, [None, 2, 'nearest']], #15
+[ [-1,4], Concat, [1]], #16
+[ -1, BottleneckCSP, [256, 128, 1, False]], #17
+[ -1, Conv, [128, 128, 3, 2]], #18
+[ [-1, 14], Concat, [1]], #19
+[ -1, BottleneckCSP, [256, 256, 1, False]], #20
+[ -1, Conv, [256, 256, 3, 2]], #21
+[ [-1, 10], Concat, [1]], #22
+[ -1, BottleneckCSP, [512, 512, 1, False]], #23
+[ [17, 20, 23], Detect, [1, [[3,9,5,11,4,20], [7,18,6,39,12,31], [19,50,38,81,68,157]], [128, 256, 512]]], #Detect output 24
+
+[ 16, Conv, [256, 128, 3, 1]], #25
+[ -1, Upsample, [None, 2, 'nearest']], #26
+[ -1, BottleneckCSP, [128, 64, 1, False]], #27
+[ -1, Conv, [64, 32, 3, 1]], #28
+[ -1, Upsample, [None, 2, 'nearest']], #29
+[ -1, Conv, [32, 16, 3, 1]], #30
+
+[ -1, BottleneckCSP, [16, 8, 1, False]], #31 driving area segment neck
+[ -1, Upsample, [None, 2, 'nearest']], #32
+[ -1, Conv, [8, 2, 3, 1]], #33 Driving area segmentation output
+
+[ 16, Conv, [256, 128, 3, 1]], #34
+[ -1, Upsample, [None, 2, 'nearest']], #35
+[ -1, BottleneckCSP, [128, 64, 1, False]], #36
+[ -1, Conv, [64, 32, 3, 1]], #37
+[ -1, Upsample, [None, 2, 'nearest']], #38
+[ -1, Conv, [32, 16, 3, 1]], #39
+
+[ 30, SharpenConv, [16,16, 3, 1]], #40
+[ -1, Conv, [16, 16, 3, 1]], #41
+[ [-1, 39], Concat, [1]], #42
+[ -1, BottleneckCSP, [32, 8, 1, False]], #43 lane line segment neck
+[ -1, Upsample, [None, 2, 'nearest']], #44
+[ -1, Conv, [8, 2, 3, 1]] #45 Lane line segmentation output
+]"""
+
+
+# The lane line and driving area segmentation branches do not share information with each other and have no extra links
+YOLOP = [
+[24, 33, 42], #Det_out_idx, Da_Segout_idx, LL_Segout_idx
+[ -1, Focus, [3, 32, 3]], #0
+[ -1, Conv, [32, 64, 3, 2]], #1
+[ -1, BottleneckCSP, [64, 64, 1]], #2
+[ -1, Conv, [64, 128, 3, 2]], #3
+[ -1, BottleneckCSP, [128, 128, 3]], #4
+[ -1, Conv, [128, 256, 3, 2]], #5
+[ -1, BottleneckCSP, [256, 256, 3]], #6
+[ -1, Conv, [256, 512, 3, 2]], #7
+[ -1, SPP, [512, 512, [5, 9, 13]]], #8
+[ -1, BottleneckCSP, [512, 512, 1, False]], #9
+[ -1, Conv,[512, 256, 1, 1]], #10
+[ -1, Upsample, [None, 2, 'nearest']], #11
+[ [-1, 6], Concat, [1]], #12
+[ -1, BottleneckCSP, [512, 256, 1, False]], #13
+[ -1, Conv, [256, 128, 1, 1]], #14
+[ -1, Upsample, [None, 2, 'nearest']], #15
+[ [-1,4], Concat, [1]], #16 #Encoder
+
+[ -1, BottleneckCSP, [256, 128, 1, False]], #17
+[ -1, Conv, [128, 128, 3, 2]], #18
+[ [-1, 14], Concat, [1]], #19
+[ -1, BottleneckCSP, [256, 256, 1, False]], #20
+[ -1, Conv, [256, 256, 3, 2]], #21
+[ [-1, 10], Concat, [1]], #22
+[ -1, BottleneckCSP, [512, 512, 1, False]], #23
+[ [17, 20, 23], Detect, [1, [[3,9,5,11,4,20], [7,18,6,39,12,31], [19,50,38,81,68,157]], [128, 256, 512]]], #Detection head 24
+
+[ 16, Conv, [256, 128, 3, 1]], #25
+[ -1, Upsample, [None, 2, 'nearest']], #26
+[ -1, BottleneckCSP, [128, 64, 1, False]], #27
+[ -1, Conv, [64, 32, 3, 1]], #28
+[ -1, Upsample, [None, 2, 'nearest']], #29
+[ -1, Conv, [32, 16, 3, 1]], #30
+[ -1, BottleneckCSP, [16, 8, 1, False]], #31
+[ -1, Upsample, [None, 2, 'nearest']], #32
+[ -1, Conv, [8, 2, 3, 1]], #33 Driving area segmentation head
+
+[ 16, Conv, [256, 128, 3, 1]], #34
+[ -1, Upsample, [None, 2, 'nearest']], #35
+[ -1, BottleneckCSP, [128, 64, 1, False]], #36
+[ -1, Conv, [64, 32, 3, 1]], #37
+[ -1, Upsample, [None, 2, 'nearest']], #38
+[ -1, Conv, [32, 16, 3, 1]], #39
+[ -1, BottleneckCSP, [16, 8, 1, False]], #40
+[ -1, Upsample, [None, 2, 'nearest']], #41
+[ -1, Conv, [8, 2, 3, 1]] #42 Lane line segmentation head
+]
+
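+# Each row above is [from, module, args]: `from` gives the index (or indices) of the
+# input layer(s), with -1 meaning the previous layer. The first row lists the output
+# layer indices of the three heads (detection, drivable-area segmentation, lane-line
+# segmentation), which MCnet uses to collect the outputs during forward().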
+
+class MCnet(nn.Module):
+ def __init__(self, block_cfg, **kwargs):
+ super(MCnet, self).__init__()
+ layers, save= [], []
+ self.nc = 1
+ self.detector_index = -1
+ self.det_out_idx = block_cfg[0][0]
+ self.seg_out_idx = block_cfg[0][1:]
+
+
+ # Build model
+ for i, (from_, block, args) in enumerate(block_cfg[1:]):
+ block = eval(block) if isinstance(block, str) else block # eval strings
+ if block is Detect:
+ self.detector_index = i
+ block_ = block(*args)
+ block_.index, block_.from_ = i, from_
+ layers.append(block_)
+ save.extend(x % i for x in ([from_] if isinstance(from_, int) else from_) if x != -1) # append to savelist
+ assert self.detector_index == block_cfg[0][0]
+
+ self.model, self.save = nn.Sequential(*layers), sorted(save)
+ self.names = [str(i) for i in range(self.nc)]
+
+ # set stride and anchors for the detector
+ Detector = self.model[self.detector_index] # detector
+ if isinstance(Detector, Detect):
+ s = 128 # 2x min stride
+ # for x in self.forward(torch.zeros(1, 3, s, s)):
+ # print (x.shape)
+ with torch.no_grad():
+ model_out = self.forward(torch.zeros(1, 3, s, s))
+ detects, _, _= model_out
+ Detector.stride = torch.tensor([s / x.shape[-2] for x in detects]) # forward
+ # print("stride"+str(Detector.stride ))
+ Detector.anchors /= Detector.stride.view(-1, 1, 1) # Set the anchors for the corresponding scale
+ check_anchor_order(Detector)
+ self.stride = Detector.stride
+ self._initialize_biases()
+
+ initialize_weights(self)
+
+ def forward(self, x):
+ cache = []
+ out = []
+ det_out = None
+ Da_fmap = []
+ LL_fmap = []
+ for i, block in enumerate(self.model):
+ if block.from_ != -1:
+ x = cache[block.from_] if isinstance(block.from_, int) else [x if j == -1 else cache[j] for j in block.from_] #calculate concat detect
+ x = block(x)
+ if i in self.seg_out_idx: #save driving area segment result
+ m=nn.Sigmoid()
+ out.append(m(x))
+ if i == self.detector_index:
+ det_out = x
+ cache.append(x if block.index in self.save else None)
+ out.insert(0,det_out)
+ return out
+
+
+ def _initialize_biases(self, cf=None): # initialize biases into Detect(), cf is class frequency
+ # https://arxiv.org/abs/1708.02002 section 3.3
+ # cf = torch.bincount(torch.tensor(np.concatenate(dataset.labels, 0)[:, 0]).long(), minlength=nc) + 1.
+ # m = self.model[-1] # Detect() module
+ m = self.model[self.detector_index] # Detect() module
+ for mi, s in zip(m.m, m.stride): # from
+ b = mi.bias.view(m.na, -1) # conv.bias(255) to (3,85)
+ b.data[:, 4] += math.log(8 / (640 / s) ** 2) # obj (8 objects per 640 image)
+ b.data[:, 5:] += math.log(0.6 / (m.nc - 0.99)) if cf is None else torch.log(cf / cf.sum()) # cls
+ mi.bias = torch.nn.Parameter(b.view(-1), requires_grad=True)
+
+def get_net(cfg, **kwargs):
+ m_block_cfg = YOLOP
+ model = MCnet(m_block_cfg, **kwargs)
+ return model
+
+
+if __name__ == "__main__":
+ from torch.utils.tensorboard import SummaryWriter
+ model = get_net(False)
+ input_ = torch.randn((1, 3, 256, 256))
+ gt_ = torch.rand((1, 2, 256, 256))
+ metric = SegmentationMetric(2)
+ # forward() returns [detection_output, drivable_area_seg, lane_line_seg]
+ detects, driving_area_seg, lane_line_seg = model(input_)
+ for det in detects:
+ print(det.shape)
+ print(driving_area_seg.shape)
+ print(lane_line_seg.shape)
+
diff --git a/lib/models/YOLOP_YOLOv11.py b/lib/models/YOLOP_YOLOv11.py
new file mode 100644
index 0000000000000000000000000000000000000000..e556e96573f8f7ecceb7a63a35cab96815ee9e9c
--- /dev/null
+++ b/lib/models/YOLOP_YOLOv11.py
@@ -0,0 +1,378 @@
+import torch
+import torch.nn as nn
+import math
+from ultralytics import YOLO
+from ultralytics.nn.modules import Conv, Concat
+from lib.models.common import Focus, BottleneckCSP, Detect
+from lib.utils import check_anchor_order
+import logging
+
+class YOLOv11Backbone(nn.Module):
+ def __init__(self, width_multiple=0.25, depth_multiple=0.50, yolo_model_path=None):
+ """
+ YOLOv11 Backbone - extracted directly from an ultralytics YOLO model
+
+ Args:
+ width_multiple: channel scaling factor (n=0.25, s=0.50, m=1.00, l=1.00, x=1.50)
+ depth_multiple: depth scaling factor (n=0.50, s=0.50, m=0.50, l=1.00, x=1.00)
+ yolo_model_path: path to pretrained YOLOv11 weights (optional)
+
+ Warning:
+ Different YOLO scales (n, s, m, l, x) have different structures; the layout below uses small as the example,
+ which happens to output (128, 256, 512) channels (not critical, since channel adapters are used anyway)
+ """
+ super().__init__()
+
+ self.out_indices = [4, 6, 10] # P3, P4, P5
+
+ # If a pretrained model path is provided, load it directly
+ if yolo_model_path:
+ yolo = YOLO(yolo_model_path)
+ yolo_model = yolo.model
+
+ # Extract the backbone layers (0-10)
+ self.layers = nn.ModuleList([yolo_model.model[i] for i in range(11)])
+
+ # Get the output channel counts (both C3k2 and C2PSA expose a cv2 attribute)
+ self.out_channels = [
+ yolo_model.model[self.out_indices[0]].conv.out_channels, # P3 (C3k2)
+ yolo_model.model[self.out_indices[1]].conv.out_channels, # P4 (C3k2)
+ yolo_model.model[self.out_indices[2]].conv.out_channels, # P5 (C2PSA)
+ ]
+ else:
+ # Without pretrained weights, build the backbone from ultralytics modules
+ from ultralytics.nn.modules import Conv, C3k2, SPPF, C2PSA
+
+ # Compute the channel counts from width_multiple
+ def make_divisible(x, divisor=8):
+ """Round the channel count up to a multiple of divisor"""
+ return int(math.ceil(x / divisor) * divisor)
+
+ c1 = make_divisible(64 * width_multiple)
+ c2 = make_divisible(128 * width_multiple)
+ c3 = make_divisible(256 * width_multiple)
+ c4 = make_divisible(512 * width_multiple)
+ c5 = make_divisible(1024 * width_multiple)
+
+ # Compute the repeat counts from depth_multiple
+ n1 = max(round(2 * depth_multiple), 1) # C3k2 repeats
+
+ self.layers = nn.ModuleList([
+ Conv(3, c1, k=3, s=2), # 0
+ Conv(c1, c2, k=3, s=2), # 1
+ C3k2(c2, c3, n=n1, shortcut=False, e=0.25), # 2
+ Conv(c3, c3, k=3, s=2), # 3
+ C3k2(c3, c4, n=n1, shortcut=False, e=0.25), # 4
+ Conv(c4, c4, k=3, s=2), # 5
+ C3k2(c4, c4, n=n1, shortcut=True), # 6
+ Conv(c4, c5, k=3, s=2), # 7
+ C3k2(c5, c5, n=n1, shortcut=True), # 8
+ SPPF(c5, c5, k=5), # 9
+ C2PSA(c5, c5, n=n1), # 10
+ ])
+ self.out_channels = []
+ for i in self.out_indices:
+ layer = self.layers[i]
+ #(Conv)
+ if hasattr(layer, 'conv'):
+ self.out_channels.append(layer.conv.out_channels)
+ elif hasattr(layer, 'cv2'): # (C3k2)
+ self.out_channels.append(layer.cv2.conv.out_channels)
+ else:
+ raise AttributeError(f"Layer {i} has neither a conv nor a cv2 attribute; please check the module structure")
+
+ def forward(self, x):
+ outputs = []
+ for i, layer in enumerate(self.layers):
+ x = layer(x)
+ if i in self.out_indices:
+ outputs.append(x)
+ return outputs
+
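+# For a 640x640 input, forward() returns the three feature maps selected by
+# out_indices = [4, 6, 10], i.e. roughly P3 (stride 8), P4 (stride 16) and
+# P5 (stride 32); their channel counts depend on width_multiple / the loaded weights
+# and are recorded in self.out_channels for the adapters below.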
+class ChannelAdapter(nn.Module):
+ def __init__(self, in_channels, out_channels):
+ super().__init__()
+ self.conv = nn.Conv2d(in_channels, out_channels, kernel_size=1)
+
+ def forward(self, x):
+ return self.conv(x)
+
+class YOLOPWithYOLOv11(nn.Module):
+
+ def __init__(self, num_seg_class=2, yolo_scale='n', yolo_weights_path=None):
+ """
+ YOLOP with YOLOv11 Backbone
+
+ Args:
+ num_seg_class: number of segmentation classes
+ yolo_scale: YOLOv11 scale ('n', 's', 'm', 'l', 'x')
+ yolo_weights_path: path to pretrained YOLOv11 weights (optional)
+ """
+ super().__init__()
+
+ # YOLOv11 scaling parameters
+ scale_configs = {
+ 'n': {'width': 0.25, 'depth': 0.50}, # nano
+ 's': {'width': 0.50, 'depth': 0.50}, # small
+ 'm': {'width': 1.00, 'depth': 0.50}, # medium
+ 'l': {'width': 1.00, 'depth': 1.00}, # large
+ 'x': {'width': 1.50, 'depth': 1.00}, # xlarge
+ }
+
+ if yolo_scale not in scale_configs:
+ raise ValueError(f"Invalid yolo_scale: {yolo_scale}. Must be one of {list(scale_configs.keys())}")
+
+ scale = scale_configs[yolo_scale]
+
+ # If a weight path is provided, extract the backbone directly from the pretrained model
+ if yolo_weights_path:
+ self.backbone = YOLOv11Backbone(yolo_model_path=yolo_weights_path)
+ else:
+ self.backbone = YOLOv11Backbone(width_multiple=scale['width'], depth_multiple=scale['depth'])
+
+ # Adapt the YOLOv11 outputs to the YOLOP neck inputs [128, 256, 512]
+ backbone_channels = self.backbone.out_channels
+ neck_channels = [128, 256, 512]
+
+ self.adapters = nn.ModuleList([
+ ChannelAdapter(backbone_channels[0], neck_channels[0]), # P3
+ ChannelAdapter(backbone_channels[1], neck_channels[1]), # P4
+ ChannelAdapter(backbone_channels[2], neck_channels[2]), # P5
+ ])
+ # YOLOP neck (layers 11-24)
+ self.neck = nn.ModuleList([
+ Conv(512, 256, k=1, s=1), # 11
+ nn.Upsample(scale_factor=2, mode='nearest'), # 12
+ Concat(dimension=1), # 13: Concat [-1, 6]
+ BottleneckCSP(512, 256, n=1, shortcut=False), # 14
+ Conv(256, 128, k=1, s=1), # 15
+ nn.Upsample(scale_factor=2, mode='nearest'), # 16
+ Concat(dimension=1), # 17: Concat [-1, 4]
+ BottleneckCSP(256, 128, n=1, shortcut=False), # 18
+ Conv(128, 128, k=3, s=2), # 19
+ Concat(dimension=1), # 20: Concat [-1, 14]
+ BottleneckCSP(256, 256, n=1, shortcut=False), # 21
+ Conv(256, 256, k=3, s=2), # 22
+ Concat(dimension=1), # 23: Concat [-1, 10]
+ BottleneckCSP(512, 512, n=1, shortcut=False), # 24
+ ])
+ # YOLOP heads
+ self.detect_head = Detect(1, [[3,9,5,11,4,20], [7,18,6,39,12,31], [19,50,38,81,68,157]], [128, 256, 512])
+
+ self.drivable_seg_head = nn.ModuleList([
+ Conv(256, 128, k=3, s=1), # 25
+ nn.Upsample(scale_factor=2, mode='nearest'), # 26
+ BottleneckCSP(128, 64, n=1, shortcut=False), # 27
+ Conv(64, 32, k=3, s=1), # 28
+ nn.Upsample(scale_factor=2, mode='nearest'), # 29
+ Conv(32, 16, k=3, s=1), # 30
+ BottleneckCSP(16, 8, n=1, shortcut=False), # 31
+ nn.Upsample(scale_factor=2, mode='nearest'), # 32
+ Conv(8, num_seg_class, k=3, s=1), # 33
+ ])
+ self.lane_seg_head = nn.ModuleList([
+ Conv(256, 128, k=3, s=1), # 34
+ nn.Upsample(scale_factor=2, mode='nearest'), # 35
+ BottleneckCSP(128, 64, n=1, shortcut=False), # 36
+ Conv(64, 32, k=3, s=1), # 37
+ nn.Upsample(scale_factor=2, mode='nearest'), # 38
+ Conv(32, 16, k=3, s=1), # 39
+ BottleneckCSP(16, 8, n=1, shortcut=False), # 40
+ nn.Upsample(scale_factor=2, mode='nearest'), # 41
+ Conv(8, 2, k=3, s=1), # 42
+ ])
+
+ # Initialize the detection head stride
+ # self.detect_head.stride = torch.tensor([8., 16., 32.]) # strides of P3, P4, P5
+
+ # Compute the strides dynamically at initialization
+ s = 128
+ with torch.no_grad():
+ dummy = torch.zeros(1, 3, s, s)
+ detect_out, _, _ = self.forward(dummy)
+ self.detect_head.stride = torch.tensor([s / x.shape[-2] for x in detect_out])
+ self.detect_head.anchors /= self.detect_head.stride.view(-1, 1, 1) # Set the anchors for the corresponding scale
+ check_anchor_order(self.detect_head)
+ self.stride = self.detect_head.stride
+
+ print(f"Initialized Detect head with strides: {self.detect_head.stride.tolist()}")
+
+ # Add the attributes required for compatibility with the training code
+ self.nc = 1 # number of classes
+ self.detector_index = -1 # index of the detector within the model
+ self.names = ['vehicle'] # class names
+ self.model = nn.ModuleList([
+ self.backbone,
+ self.adapters,
+ self.neck,
+ self.detect_head,
+ self.drivable_seg_head,
+ self.lane_seg_head
+ ])
+ self.detector_index = 3 # detect_head is the 4th module in self.model
+ self.det_out_idx = 25
+
+ self.gr = 1.0 # giou loss ratio (obj loss ratio is 1-giou)
+
+ # Initialize the detection head biases
+ self._initialize_biases()
+
+ def freeze_backbone(self):
+ """冻结backbone和adapters的参数"""
+ logging.info("Freezing backbone parameters...")
+ for param in self.backbone.parameters():
+ param.requires_grad = False
+ for param in self.adapters.parameters():
+ param.requires_grad = False
+
+ # Verify the frozen state
+ frozen_count = sum(1 for p in self.backbone.parameters() if not p.requires_grad)
+ frozen_count += sum(1 for p in self.adapters.parameters() if not p.requires_grad)
+ total_count = sum(1 for _ in self.backbone.parameters())
+ total_count += sum(1 for _ in self.adapters.parameters())
+ logging.info(f"Frozen {frozen_count}/{total_count} backbone+adapter parameters")
+
+ def unfreeze_backbone(self):
+ """解冻backbone和adapters的参数"""
+ logging.info("Unfreezing backbone parameters...")
+ for param in self.backbone.parameters():
+ param.requires_grad = True
+ for param in self.adapters.parameters():
+ param.requires_grad = True
+
+ def _initialize_biases(self, cf=None):
+ """初始化检测头的偏置 (参考原始YOLOP实现)"""
+ # https://arxiv.org/abs/1708.02002 section 3.3
+ m = self.detect_head # Detect() module
+ for mi, s in zip(m.m, m.stride): # from
+ b = mi.bias.view(m.na, -1) # conv.bias(255) to (3,85)
+ b.data[:, 4] += math.log(8 / (640 / s) ** 2) # obj (8 objects per 640 image)
+ b.data[:, 5:] += math.log(0.6 / (m.nc - 0.99)) if cf is None else torch.log(cf / cf.sum()) # cls
+ mi.bias = torch.nn.Parameter(b.view(-1), requires_grad=True)
+
+ def load_yolov11_backbone_weights(self, weights_path, freeze_backbone=False):
+ """
+ Load backbone weights from a pretrained YOLOv11 model
+
+ Args:
+ weights_path: path to the YOLOv11 weights (.pt file)
+ freeze_backbone: whether to freeze the backbone parameters
+ """
+ try:
+ from ultralytics import YOLO
+ logging.info(f"Loading YOLOv11 weights from {weights_path}")
+
+ # Load the YOLOv11 model
+ yolo_model = YOLO(weights_path)
+ yolo_state_dict = yolo_model.model.state_dict()
+
+ # Map the YOLOv11 backbone weights onto our model
+ # YOLOv11 backbone layer indices: 0-10
+ backbone_mapping = {
+ # YOLOv11 layer -> our layer
+ 'model.0': 'backbone.layers.0', # Conv 3->64
+ 'model.1': 'backbone.layers.1', # Conv 64->128
+ 'model.2': 'backbone.layers.2', # C3k2 128->256
+ 'model.3': 'backbone.layers.3', # Conv 256->256
+ 'model.4': 'backbone.layers.4', # C3k2 256->512
+ 'model.5': 'backbone.layers.5', # Conv 512->512
+ 'model.6': 'backbone.layers.6', # C3k2 512->512
+ 'model.7': 'backbone.layers.7', # Conv 512->1024
+ 'model.8': 'backbone.layers.8', # C3k2 1024->1024
+ 'model.9': 'backbone.layers.9', # SPPF
+ 'model.10': 'backbone.layers.10', # C2PSA
+ }
+
+ # Build the new state dict
+ new_state_dict = {}
+ loaded_keys = []
+ for yolo_key, our_key in backbone_mapping.items():
+ for k, v in yolo_state_dict.items():
+ if k.startswith(yolo_key + '.'):
+ new_key = k.replace(yolo_key, our_key)
+ new_state_dict[new_key] = v
+ loaded_keys.append(new_key)
+
+ # Load the weights
+ model_dict = self.state_dict()
+ # Only keep keys that also exist in our model
+ new_state_dict = {k: v for k, v in new_state_dict.items() if k in model_dict}
+ model_dict.update(new_state_dict)
+ self.load_state_dict(model_dict)
+
+ logging.info(f"Successfully loaded {len(loaded_keys)} backbone parameters from YOLOv11")
+
+ # Freeze the backbone
+ if freeze_backbone:
+ self.freeze_backbone()
+ logging.info("Backbone frozen successfully")
+
+ except Exception as e:
+ logging.warning(f"Failed to load YOLOv11 weights: {e}")
+ logging.warning("Training will start from scratch")
+
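+ # Usage sketch (the weight path is an assumption, not shipped with the repo):
+ #
+ #   model = YOLOPWithYOLOv11(num_seg_class=2, yolo_scale='s')
+ #   model.load_yolov11_backbone_weights('weights/yolo11s.pt', freeze_backbone=True)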
+ def forward(self, x):
+ features = self.backbone(x) # YOLOv11 输出 [P3, P4, P5]
+ features = [adapter(f) for adapter, f in zip(self.adapters, features)] # 适配到 [128, 256, 512]
+ # Neck forward pass
+ x = features[-1] # P5 10
+ x = self.neck[0](x) # 11
+ x = self.neck[1](x) # 12
+ x = self.neck[2]([x, features[1]]) # 13
+ x = self.neck[3](x) # 14
+ x = self.neck[4](x) # 15
+ x = self.neck[5](x) # 16
+ p3_fpn = self.neck[6]([x, features[0]]) # 17 (P3, 256 channels)
+ p3 = self.neck[7](p3_fpn) # 18 (P3, 128 channels)
+ x = self.neck[8](p3) # 19
+ x = self.neck[9]([x, self.neck[4](features[1])]) # 20
+ p4 = self.neck[10](x) # 21
+ x = self.neck[11](p4) # 22
+ x = self.neck[12]([x, self.neck[0](features[2])]) # 23
+ p5 = self.neck[13](x) # 24
+ # Heads
+ detect_out = self.detect_head([p3, p4, p5]) # uses layers 17, 20, 23
+ drivable_out = p3_fpn # uses layer 16
+ for layer in self.drivable_seg_head:
+ drivable_out = layer(drivable_out)
+
+ lane_out = p3_fpn # uses layer 16
+ for layer in self.lane_seg_head:
+ lane_out = layer(lane_out)
+
+ drivable_out = torch.sigmoid(drivable_out)
+ lane_out = torch.sigmoid(lane_out)
+
+ return [detect_out, drivable_out, lane_out]
+
+
+def get_net_yolov11(cfg, **kwargs):
+ """
+ Build a YOLOP model with a YOLOv11 backbone
+
+ Args:
+ cfg: configuration object
+ **kwargs: additional parameters, including:
+ - yolov11_weights: path to pretrained YOLOv11 weights
+ - freeze_backbone: whether to freeze the backbone
+ - yolo_scale: YOLOv11 scale ('n', 's', 'm', 'l', 'x')
+ """
+ num_seg_class = cfg.num_seg_class if hasattr(cfg, 'num_seg_class') else 2
+ yolo_scale = kwargs.get('yolo_scale', 'n') # default to nano
+
+ # If a weight path is provided, initialize directly from the weights
+ yolov11_weights = kwargs.get('yolov11_weights', f'weights/yolo11{yolo_scale}.pt')
+ freeze_backbone = kwargs.get('freeze_backbone', False)
+
+ # Load the pretrained weights at initialization time
+ import os
+ if os.path.exists(yolov11_weights):
+ logging.info(f"Creating model with YOLOv11{yolo_scale} pretrained weights from {yolov11_weights}")
+ model = YOLOPWithYOLOv11(num_seg_class=num_seg_class, yolo_scale=yolo_scale, yolo_weights_path=yolov11_weights)
+ if freeze_backbone:
+ model.freeze_backbone()
+ else:
+ logging.warning(f"YOLOv11 weights not found at {yolov11_weights}, creating model from scratch")
+ model = YOLOPWithYOLOv11(num_seg_class=num_seg_class, yolo_scale=yolo_scale, yolo_weights_path=None)
+
+ return model
\ No newline at end of file
diff --git a/lib/models/__init__.py b/lib/models/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..33642eefaddb54c8405ad35edf87006c1dece8c4
--- /dev/null
+++ b/lib/models/__init__.py
@@ -0,0 +1,15 @@
+from .YOLOP import get_net as get_net_yolop
+from .YOLOP_YOLOv11 import get_net_yolov11
+
+def get_net(cfg, **kwargs):
+ """
+ Select the appropriate model according to the configuration
+
+ Args:
+ cfg: configuration object
+ **kwargs: extra parameters (used for YOLOv11)
+ """
+ if hasattr(cfg.MODEL, 'USE_YOLOV11') and cfg.MODEL.USE_YOLOV11:
+ return get_net_yolov11(cfg, **kwargs)
+ else:
+ return get_net_yolop(cfg, **kwargs)
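+# Usage sketch (cfg.MODEL.USE_YOLOV11 is the flag this dispatcher checks; the exact
+# cfg object comes from the project config and is assumed here):
+#
+#   from lib.models import get_net
+#   model = get_net(cfg)                     # original YOLOP (MCnet)
+#   model = get_net(cfg, yolo_scale='s')     # YOLOv11 backbone if cfg.MODEL.USE_YOLOV11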
diff --git a/lib/models/common.py b/lib/models/common.py
new file mode 100644
index 0000000000000000000000000000000000000000..af33103e2fadba0d0192046e544da7eeb9c9b09f
--- /dev/null
+++ b/lib/models/common.py
@@ -0,0 +1,274 @@
+import math
+import numpy as np
+import torch
+import torch.nn as nn
+from PIL import Image, ImageDraw
+import torch.nn.functional as F
+
+BN_MOMENTUM = 0.1  # assumed value (PyTorch default); referenced by DepthSeperabelConv2d but never defined in the original file
+
+
+def autopad(k, p=None): # kernel, padding
+ # Pad to 'same'
+ if p is None:
+ p = k // 2 if isinstance(k, int) else [x // 2 for x in k] # auto-pad
+ return p
+
+
+class DepthSeperabelConv2d(nn.Module):
+ """
+    Depthwise-separable 2D convolution (depthwise + pointwise, each followed by BatchNorm)
+ """
+
+ def __init__(self, inplanes, planes, kernel_size=3, stride=1, downsample=None, act=True):
+ super(DepthSeperabelConv2d, self).__init__()
+ self.depthwise = nn.Sequential(
+ nn.Conv2d(inplanes, inplanes, kernel_size, stride=stride, groups=inplanes, padding=kernel_size//2, bias=False),
+ nn.BatchNorm2d(inplanes, momentum=BN_MOMENTUM)
+ )
+ # self.depthwise = nn.Conv2d(inplanes, inplanes, kernel_size, stride=stride, groups=inplanes, padding=1, bias=False)
+ # self.pointwise = nn.Conv2d(inplanes, planes, 1, bias=False)
+
+ self.pointwise = nn.Sequential(
+ nn.Conv2d(inplanes, planes, 1, bias=False),
+ nn.BatchNorm2d(planes, momentum=BN_MOMENTUM)
+ )
+ self.downsample = downsample
+ self.stride = stride
+ try:
+ self.act = Hardswish() if act else nn.Identity()
+ except:
+ self.act = nn.Identity()
+
+ def forward(self, x):
+ #residual = x
+
+ out = self.depthwise(x)
+ out = self.act(out)
+ out = self.pointwise(out)
+
+ if self.downsample is not None:
+ residual = self.downsample(x)
+ out = self.act(out)
+
+ return out
+
+
+
+class SharpenConv(nn.Module):
+ # SharpenConv convolution
+ def __init__(self, c1, c2, k=3, s=1, p=None, g=1, act=True): # ch_in, ch_out, kernel, stride, padding, groups
+ super(SharpenConv, self).__init__()
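+        # Fixed 3x3 high-pass (edge-enhancing) kernel, tiled over all input/output channels and frozen below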
+ sobel_kernel = np.array([[-1, -1, -1], [-1, 8, -1], [-1, -1, -1]], dtype='float32')
+        kernel_weight = np.vstack([sobel_kernel] * c2 * c1).reshape(c2, c1, 3, 3)
+        self.conv = nn.Conv2d(c1, c2, k, s, autopad(k, p), groups=g, bias=False)
+        self.conv.weight.data = torch.from_numpy(kernel_weight)
+ self.conv.weight.requires_grad = False
+ self.bn = nn.BatchNorm2d(c2)
+ try:
+ self.act = Hardswish() if act else nn.Identity()
+ except:
+ self.act = nn.Identity()
+
+ def forward(self, x):
+ return self.act(self.bn(self.conv(x)))
+
+ def fuseforward(self, x):
+ return self.act(self.conv(x))
+
+
+class Hardswish(nn.Module): # export-friendly version of nn.Hardswish()
+ @staticmethod
+ def forward(x):
+ # return x * F.hardsigmoid(x) # for torchscript and CoreML
+ return x * F.hardtanh(x + 3, 0., 6.) / 6. # for torchscript, CoreML and ONNX
+
+
+class Conv(nn.Module):
+ # Standard convolution
+ def __init__(self, c1, c2, k=1, s=1, p=None, g=1, act=True): # ch_in, ch_out, kernel, stride, padding, groups
+ super(Conv, self).__init__()
+ self.conv = nn.Conv2d(c1, c2, k, s, autopad(k, p), groups=g, bias=False)
+ self.bn = nn.BatchNorm2d(c2)
+ try:
+ self.act = Hardswish() if act else nn.Identity()
+ except:
+ self.act = nn.Identity()
+
+ def forward(self, x):
+ return self.act(self.bn(self.conv(x)))
+
+ def fuseforward(self, x):
+ return self.act(self.conv(x))
+
+
+class Bottleneck(nn.Module):
+ # Standard bottleneck
+ def __init__(self, c1, c2, shortcut=True, g=1, e=0.5): # ch_in, ch_out, shortcut, groups, expansion
+ super(Bottleneck, self).__init__()
+ c_ = int(c2 * e) # hidden channels
+ self.cv1 = Conv(c1, c_, 1, 1)
+ self.cv2 = Conv(c_, c2, 3, 1, g=g)
+ self.add = shortcut and c1 == c2
+
+ def forward(self, x):
+ return x + self.cv2(self.cv1(x)) if self.add else self.cv2(self.cv1(x))
+
+
+class BottleneckCSP(nn.Module):
+ # CSP Bottleneck https://github.com/WongKinYiu/CrossStagePartialNetworks
+ def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5): # ch_in, ch_out, number, shortcut, groups, expansion
+ super(BottleneckCSP, self).__init__()
+ c_ = int(c2 * e) # hidden channels
+ self.cv1 = Conv(c1, c_, 1, 1)
+ self.cv2 = nn.Conv2d(c1, c_, 1, 1, bias=False)
+ self.cv3 = nn.Conv2d(c_, c_, 1, 1, bias=False)
+ self.cv4 = Conv(2 * c_, c2, 1, 1)
+ self.bn = nn.BatchNorm2d(2 * c_) # applied to cat(cv2, cv3)
+ self.act = nn.LeakyReLU(0.1, inplace=True)
+ self.m = nn.Sequential(*[Bottleneck(c_, c_, shortcut, g, e=1.0) for _ in range(n)])
+
+ def forward(self, x):
+ y1 = self.cv3(self.m(self.cv1(x)))
+ y2 = self.cv2(x)
+ return self.cv4(self.act(self.bn(torch.cat((y1, y2), dim=1))))
+
+
+class SPP(nn.Module):
+ # Spatial pyramid pooling layer used in YOLOv3-SPP
+ def __init__(self, c1, c2, k=(5, 9, 13)):
+ super(SPP, self).__init__()
+ c_ = c1 // 2 # hidden channels
+ self.cv1 = Conv(c1, c_, 1, 1)
+ self.cv2 = Conv(c_ * (len(k) + 1), c2, 1, 1)
+ self.m = nn.ModuleList([nn.MaxPool2d(kernel_size=x, stride=1, padding=x // 2) for x in k])
+
+ def forward(self, x):
+ x = self.cv1(x)
+ return self.cv2(torch.cat([x] + [m(x) for m in self.m], 1))
+
+
+class Focus(nn.Module):
+ # Focus wh information into c-space
+ # slice concat conv
+ def __init__(self, c1, c2, k=1, s=1, p=None, g=1, act=True): # ch_in, ch_out, kernel, stride, padding, groups
+ super(Focus, self).__init__()
+ self.conv = Conv(c1 * 4, c2, k, s, p, g, act)
+
+ def forward(self, x): # x(b,c,w,h) -> y(b,4c,w/2,h/2)
+ return self.conv(torch.cat([x[..., ::2, ::2], x[..., 1::2, ::2], x[..., ::2, 1::2], x[..., 1::2, 1::2]], 1))
+
+
+class Concat(nn.Module):
+ # Concatenate a list of tensors along dimension
+ def __init__(self, dimension=1):
+ super(Concat, self).__init__()
+ self.d = dimension
+
+ def forward(self, x):
+ """ print("***********************")
+ for f in x:
+ print(f.shape) """
+ return torch.cat(x, self.d)
+
+
+class Detect(nn.Module):
+ stride = None # strides computed during build
+
+ def __init__(self, nc=13, anchors=(), ch=()): # detection layer
+ super(Detect, self).__init__()
+ self.nc = nc # number of classes
+ self.no = nc + 5 # number of outputs per anchor 85
+ self.nl = len(anchors) # number of detection layers 3
+ self.na = len(anchors[0]) // 2 # number of anchors 3
+ self.grid = [torch.zeros(1)] * self.nl # init grid
+ a = torch.tensor(anchors).float().view(self.nl, -1, 2)
+ self.register_buffer('anchors', a) # shape(nl,na,2)
+ self.register_buffer('anchor_grid', a.clone().view(self.nl, 1, -1, 1, 1, 2)) # shape(nl,1,na,1,1,2)
+ self.m = nn.ModuleList(nn.Conv2d(x, self.no * self.na, 1) for x in ch) # output conv
+
+ def forward(self, x):
+ z = [] # inference output
+ for i in range(self.nl):
+ x[i] = self.m[i](x[i]) # conv
+ # print(str(i)+str(x[i].shape))
+ bs, _, ny, nx = x[i].shape # x(bs,255,w,w) to x(bs,3,w,w,85)
+ x[i]=x[i].view(bs, self.na, self.no, ny*nx).permute(0, 1, 3, 2).view(bs, self.na, ny, nx, self.no).contiguous()
+ # x[i] = x[i].view(bs, self.na, self.no, ny, nx).permute(0, 1, 3, 4, 2).contiguous()
+ # print(str(i)+str(x[i].shape))
+
+ if not self.training: # inference
+ if self.grid[i].shape[2:4] != x[i].shape[2:4]:
+ self.grid[i] = self._make_grid(nx, ny).to(x[i].device)
+ y = x[i].sigmoid()
+ #print("**")
+ #print(y.shape) #[1, 3, w, h, 85]
+ #print(self.grid[i].shape) #[1, 3, w, h, 2]
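+                # YOLOv5-style decoding: sigmoid offsets land in (-0.5, 1.5) around each grid cell and are
+                # scaled by the stride; widths/heights are (0, 4)x the matching anchor size (in input pixels)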
+ y[..., 0:2] = (y[..., 0:2] * 2. - 0.5 + self.grid[i].to(x[i].device)) * self.stride[i] # xy
+ y[..., 2:4] = (y[..., 2:4] * 2) ** 2 * self.anchor_grid[i] # wh
+ """print("**")
+ print(y.shape) #[1, 3, w, h, 85]
+ print(y.view(bs, -1, self.no).shape) #[1, 3*w*h, 85]"""
+ z.append(y.view(bs, -1, self.no))
+ return x if self.training else (torch.cat(z, 1), x)
+
+ @staticmethod
+ def _make_grid(nx=20, ny=20):
+
+ yv, xv = torch.meshgrid([torch.arange(ny), torch.arange(nx)])
+ return torch.stack((xv, yv), 2).view((1, 1, ny, nx, 2)).float()
+
+
+"""class Detections:
+ # detections class for YOLOv5 inference results
+ def __init__(self, imgs, pred, names=None):
+ super(Detections, self).__init__()
+ d = pred[0].device # device
+ gn = [torch.tensor([*[im.shape[i] for i in [1, 0, 1, 0]], 1., 1.], device=d) for im in imgs] # normalizations
+ self.imgs = imgs # list of images as numpy arrays
+ self.pred = pred # list of tensors pred[0] = (xyxy, conf, cls)
+ self.names = names # class names
+ self.xyxy = pred # xyxy pixels
+ self.xywh = [xyxy2xywh(x) for x in pred] # xywh pixels
+ self.xyxyn = [x / g for x, g in zip(self.xyxy, gn)] # xyxy normalized
+ self.xywhn = [x / g for x, g in zip(self.xywh, gn)] # xywh normalized
+ self.n = len(self.pred)
+
+ def display(self, pprint=False, show=False, save=False):
+ colors = color_list()
+ for i, (img, pred) in enumerate(zip(self.imgs, self.pred)):
+ str = f'Image {i + 1}/{len(self.pred)}: {img.shape[0]}x{img.shape[1]} '
+ if pred is not None:
+ for c in pred[:, -1].unique():
+ n = (pred[:, -1] == c).sum() # detections per class
+ str += f'{n} {self.names[int(c)]}s, ' # add to string
+ if show or save:
+ img = Image.fromarray(img.astype(np.uint8)) if isinstance(img, np.ndarray) else img # from np
+ for *box, conf, cls in pred: # xyxy, confidence, class
+ # str += '%s %.2f, ' % (names[int(cls)], conf) # label
+ ImageDraw.Draw(img).rectangle(box, width=4, outline=colors[int(cls) % 10]) # plot
+ if save:
+ f = f'results{i}.jpg'
+ str += f"saved to '{f}'"
+ img.save(f) # save
+ if show:
+ img.show(f'Image {i}') # show
+ if pprint:
+ print(str)
+
+ def print(self):
+ self.display(pprint=True) # print results
+
+ def show(self):
+ self.display(show=True) # show results
+
+ def save(self):
+ self.display(save=True) # save results
+
+ def __len__(self):
+ return self.n
+
+ def tolist(self):
+ # return a list of Detections objects, i.e. 'for result in results.tolist():'
+ x = [Detections([self.imgs[i]], [self.pred[i]], self.names) for i in range(self.n)]
+ for d in x:
+ for k in ['imgs', 'pred', 'xyxy', 'xyxyn', 'xywh', 'xywhn']:
+ setattr(d, k, getattr(d, k)[0]) # pop out of list"""
\ No newline at end of file
diff --git a/lib/models/common2.py b/lib/models/common2.py
new file mode 100644
index 0000000000000000000000000000000000000000..9984577924c0e155a3f77fc40bfaca9d83ab88ac
--- /dev/null
+++ b/lib/models/common2.py
@@ -0,0 +1,310 @@
+import math
+import numpy as np
+import torch
+import torch.nn as nn
+from PIL import Image, ImageDraw
+
+
+def autopad(k, p=None): # kernel, padding
+ # Pad to 'same'
+ if p is None:
+ p = k // 2 if isinstance(k, int) else [x // 2 for x in k] # auto-pad
+ return p
+
+
+class DepthSeperabelConv2d(nn.Module):
+ """
+    Depthwise-separable 2D convolution (depthwise + pointwise, each followed by BatchNorm)
+ """
+
+ def __init__(self, inplanes, planes, kernel_size=3, stride=1, downsample=None, act=True):
+ super(DepthSeperabelConv2d, self).__init__()
+ self.depthwise = nn.Sequential(
+ nn.Conv2d(inplanes, inplanes, kernel_size, stride=stride, groups=inplanes, padding=kernel_size // 2,
+ bias=False),
+ nn.BatchNorm2d(inplanes)
+ )
+ # self.depthwise = nn.Conv2d(inplanes, inplanes, kernel_size,
+ # stride=stride, groups=inplanes, padding=1, bias=False)
+ # self.pointwise = nn.Conv2d(inplanes, planes, 1, bias=False)
+
+ self.pointwise = nn.Sequential(
+ nn.Conv2d(inplanes, planes, 1, bias=False),
+ nn.BatchNorm2d(planes)
+ )
+ self.downsample = downsample
+ self.stride = stride
+ try:
+ self.act = nn.Hardswish() if act else nn.Identity()
+ except:
+ self.act = nn.Identity()
+
+ def forward(self, x):
+ # residual = x
+
+ out = self.depthwise(x)
+ out = self.act(out)
+ out = self.pointwise(out)
+
+ if self.downsample is not None:
+ residual = self.downsample(x)
+ out = self.act(out)
+
+ return out
+
+
+class SharpenConv(nn.Module):
+ # SharpenConv convolution
+ def __init__(self, c1, c2, k=3, s=1, p=None, g=1, act=True): # ch_in, ch_out, kernel, stride, padding, groups
+ super(SharpenConv, self).__init__()
+ sobel_kernel = np.array([[-1, -1, -1], [-1, 8, -1], [-1, -1, -1]], dtype='float32')
+        kernel_weight = np.vstack([sobel_kernel] * c2 * c1).reshape(c2, c1, 3, 3)
+        self.conv = nn.Conv2d(c1, c2, k, s, autopad(k, p), groups=g, bias=False)
+        self.conv.weight.data = torch.from_numpy(kernel_weight)
+ self.conv.weight.requires_grad = False
+ self.bn = nn.BatchNorm2d(c2)
+ try:
+ self.act = nn.Hardswish() if act else nn.Identity()
+ except:
+ self.act = nn.Identity()
+
+ def forward(self, x):
+ return self.act(self.bn(self.conv(x)))
+
+ def fuseforward(self, x):
+ return self.act(self.conv(x))
+
+
+class Conv(nn.Module):
+ # Standard convolution
+ def __init__(self, c1, c2, k=1, s=1, p=None, g=1, act=True): # ch_in, ch_out, kernel, stride, padding, groups
+ super(Conv, self).__init__()
+ self.conv = nn.Conv2d(c1, c2, k, s, autopad(k, p), groups=g, bias=False)
+ self.bn = nn.BatchNorm2d(c2)
+ try:
+ self.act = nn.Hardswish() if act else nn.Identity()
+ except:
+ self.act = nn.Identity()
+
+ def forward(self, x):
+ return self.act(self.bn(self.conv(x)))
+
+ def fuseforward(self, x):
+ return self.act(self.conv(x))
+
+
+class Bottleneck(nn.Module):
+ # Standard bottleneck
+ def __init__(self, c1, c2, shortcut=True, g=1, e=0.5): # ch_in, ch_out, shortcut, groups, expansion
+ super(Bottleneck, self).__init__()
+ c_ = int(c2 * e) # hidden channels
+ self.cv1 = Conv(c1, c_, 1, 1)
+ self.cv2 = Conv(c_, c2, 3, 1, g=g)
+ self.add = shortcut and c1 == c2
+
+ def forward(self, x):
+ return x + self.cv2(self.cv1(x)) if self.add else self.cv2(self.cv1(x))
+
+
+class BottleneckCSP(nn.Module):
+ # CSP Bottleneck https://github.com/WongKinYiu/CrossStagePartialNetworks
+ def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5): # ch_in, ch_out, number, shortcut, groups, expansion
+ super(BottleneckCSP, self).__init__()
+ c_ = int(c2 * e) # hidden channels
+ self.cv1 = Conv(c1, c_, 1, 1)
+ self.cv2 = nn.Conv2d(c1, c_, 1, 1, bias=False)
+ self.cv3 = nn.Conv2d(c_, c_, 1, 1, bias=False)
+ self.cv4 = Conv(2 * c_, c2, 1, 1)
+ self.bn = nn.BatchNorm2d(2 * c_) # applied to cat(cv2, cv3)
+ self.act = nn.LeakyReLU(0.1, inplace=True)
+ self.m = nn.Sequential(*[Bottleneck(c_, c_, shortcut, g, e=1.0) for _ in range(n)])
+
+ def forward(self, x):
+ y1 = self.cv3(self.m(self.cv1(x)))
+ y2 = self.cv2(x)
+ return self.cv4(self.act(self.bn(torch.cat((y1, y2), dim=1))))
+
+
+class SPP(nn.Module):
+ # Spatial pyramid pooling layer used in YOLOv3-SPP
+ def __init__(self, c1, c2, k=(5, 9, 13)):
+ super(SPP, self).__init__()
+ c_ = c1 // 2 # hidden channels
+ self.cv1 = Conv(c1, c_, 1, 1)
+ self.cv2 = Conv(c_ * (len(k) + 1), c2, 1, 1)
+ self.m = nn.ModuleList([nn.MaxPool2d(kernel_size=x, stride=1, padding=x // 2) for x in k])
+
+ def forward(self, x):
+ x = self.cv1(x)
+ return self.cv2(torch.cat([x] + [m(x) for m in self.m], 1))
+
+
+class Contract(nn.Module):
+ # Contract width-height into channels, i.e. x(1,64,80,80) to x(1,256,40,40)
+ def __init__(self, gain=2):
+ super().__init__()
+ self.gain = gain
+
+ def forward(self, x):
+        N, C, H, W = x.size()  # assert (H % s == 0) and (W % s == 0), 'Indivisible gain'
+ s = self.gain
+ x = x.view(N, C, H // s, s, W // s, s) # x(1,64,40,2,40,2)
+ x = x.permute(0, 3, 5, 1, 2, 4).contiguous() # x(1,2,2,64,40,40)
+ return x.view(N, C * s * s, H // s, W // s) # x(1,256,40,40)
+
+
+class Focus(nn.Module):
+ # Focus wh information into c-space
+ # slice concat conv
+ def __init__(self, c1, c2, k=1, s=1, p=None, g=1, act=True): # ch_in, ch_out, kernel, stride, padding, groups
+ super(Focus, self).__init__()
+ self.conv = Conv(c1 * 4, c2, k, s, p, g, act)
+ self.contract = Contract(gain=2)
+
+ def forward(self, x): # x(b,c,w,h) -> y(b,4c,w/2,h/2)
+ return self.conv(torch.cat([x[..., ::2, ::2], x[..., 1::2, ::2], x[..., ::2, 1::2], x[..., 1::2, 1::2]], 1))
+ # return self.conv(self.contract(x))
+
+
+class Concat(nn.Module):
+ # Concatenate a list of tensors along dimension
+ def __init__(self, dimension=1):
+ super(Concat, self).__init__()
+ self.d = dimension
+
+ def forward(self, x):
+ """ print("***********************")
+ for f in x:
+ print(f.shape) """
+ return torch.cat(x, self.d)
+
+
+class Detect(nn.Module):
+ stride = None # strides computed during build
+
+ def __init__(self, nc=13, anchors=(), ch=()): # detection layer
+ super(Detect, self).__init__()
+ self.nc = nc # number of classes
+ self.no = nc + 5 # number of outputs per anchor
+ self.nl = len(anchors) # number of detection layers 3
+ self.na = len(anchors[0]) // 2 # number of anchors 3
+ self.grid = [torch.zeros(1)] * self.nl # init grid
+ a = torch.tensor(anchors).float().view(self.nl, -1, 2) # (nl=3,na=3,2)
+ self.register_buffer('anchors', a) # shape(nl,na,2)
+ self.register_buffer('anchor_grid', a.clone().view(self.nl, 1, -1, 1, 1, 2)) # shape(nl=3,1,na=3,1,1,2)
+ self.m = nn.ModuleList(nn.Conv2d(x, self.no * self.na, 1) for x in ch) # output conv
+ self.inplace = True # use in-place ops (e.g. slice assignment)
+
+ # def forward(self, x):
+ # z = [] # inference output
+ # for i in range(self.nl):
+ # x[i] = self.m[i](x[i]) # conv
+ # # print(str(i)+str(x[i].shape))
+ # bs, _, ny, nx = x[i].shape # x(bs,255,w,w) to x(bs,3,w,w,85)
+ # x[i] = x[i].view(bs, self.na, self.no, ny, nx).permute(0, 1, 3, 4, 2).contiguous()
+ # # print(str(i)+str(x[i].shape))
+ #
+ # if not self.training: # inference
+ # if self.grid[i].shape[2:4] != x[i].shape[2:4]:
+ # self.grid[i] = self._make_grid(nx, ny).to(x[i].device)
+ # y = x[i].sigmoid()
+ # # print("**")
+ # # print(y.shape) #[1, 3, w, h, 85]
+ # # print(self.grid[i].shape) #[1, 3, w, h, 2]
+ # y[..., 0:2] = (y[..., 0:2] * 2. - 0.5 + self.grid[i].to(x[i].device)) * self.stride[i] # xy
+ # y[..., 2:4] = (y[..., 2:4] * 2) ** 2 * self.anchor_grid[i] # wh
+ # """print("**")
+ # print(y.shape) #[1, 3, w, h, 85]
+ # print(y.view(bs, -1, self.no).shape) #[1, 3*w*h, 85]"""
+ # z.append(y.view(bs, -1, self.no))
+ # return x if self.training else (torch.cat(z, 1), x)
+
+ def forward(self, x):
+ z = [] # inference output
+ for i in range(self.nl):
+ x[i] = self.m[i](x[i]) # conv (bs,na*no,ny,nx)
+ bs, _, ny, nx = x[i].shape
+
+ # x(bs,255,20,20) to x(bs,3,20,20,nc+5) (bs,na,ny,nx,no=nc+5=4+1+nc)
+
+ x[i] = x[i].view(bs, self.na, self.no, ny, nx).permute(0, 1, 3, 4, 2).contiguous()
+
+ if not self.training: # inference
+ # if self.grid[i].shape[2:4] != x[i].shape[2:4] or self.onnx_dynamic:
+ # self.grid[i] = self._make_grid(nx, ny).to(x[i].device)
+
+ self.grid[i] = self._make_grid(nx, ny).to(x[i].device)
+
+ y = x[i].sigmoid() # (bs,na,ny,nx,no=nc+5=4+1+nc)
+
+ xy = (y[..., 0:2] * 2. - 0.5 + self.grid[i]) * self.stride[i] # xy (bs,na,ny,nx,2)
+ wh = (y[..., 2:4] * 2) ** 2 * self.anchor_grid[i].view(1, self.na, 1, 1, 2) # wh (bs,na,ny,nx,2)
+ y = torch.cat((xy, wh, y[..., 4:]), -1) # (bs,na,ny,nx,2+2+1+nc=xy+wh+conf+cls_prob)
+
+ z.append(y.view(bs, -1, self.no)) # y (bs,na*ny*nx,no=2+2+1+nc=xy+wh+conf+cls_prob)
+
+ return x if self.training else (torch.cat(z, 1), x)
+ # torch.cat(z, 1) (bs,na*ny*nx*nl,no=2+2+1+nc=xy+wh+conf+cls_prob)
+
+ @staticmethod
+ def _make_grid(nx=20, ny=20):
+
+ yv, xv = torch.meshgrid([torch.arange(ny), torch.arange(nx)])
+ return torch.stack((xv, yv), 2).view((1, 1, ny, nx, 2)).float()
+
+
+"""class Detections:
+ # detections class for YOLOv5 inference results
+ def __init__(self, imgs, pred, names=None):
+ super(Detections, self).__init__()
+ d = pred[0].device # device
+ gn = [torch.tensor([*[im.shape[i] for i in [1, 0, 1, 0]], 1., 1.], device=d) for im in imgs] # normalizations
+ self.imgs = imgs # list of images as numpy arrays
+ self.pred = pred # list of tensors pred[0] = (xyxy, conf, cls)
+ self.names = names # class names
+ self.xyxy = pred # xyxy pixels
+ self.xywh = [xyxy2xywh(x) for x in pred] # xywh pixels
+ self.xyxyn = [x / g for x, g in zip(self.xyxy, gn)] # xyxy normalized
+ self.xywhn = [x / g for x, g in zip(self.xywh, gn)] # xywh normalized
+ self.n = len(self.pred)
+
+ def display(self, pprint=False, show=False, save=False):
+ colors = color_list()
+ for i, (img, pred) in enumerate(zip(self.imgs, self.pred)):
+ str = f'Image {i + 1}/{len(self.pred)}: {img.shape[0]}x{img.shape[1]} '
+ if pred is not None:
+ for c in pred[:, -1].unique():
+ n = (pred[:, -1] == c).sum() # detections per class
+ str += f'{n} {self.names[int(c)]}s, ' # add to string
+ if show or save:
+ img = Image.fromarray(img.astype(np.uint8)) if isinstance(img, np.ndarray) else img # from np
+ for *box, conf, cls in pred: # xyxy, confidence, class
+ # str += '%s %.2f, ' % (names[int(cls)], conf) # label
+ ImageDraw.Draw(img).rectangle(box, width=4, outline=colors[int(cls) % 10]) # plot
+ if save:
+ f = f'results{i}.jpg'
+ str += f"saved to '{f}'"
+ img.save(f) # save
+ if show:
+ img.show(f'Image {i}') # show
+ if pprint:
+ print(str)
+
+ def print(self):
+ self.display(pprint=True) # print results
+
+ def show(self):
+ self.display(show=True) # show results
+
+ def save(self):
+ self.display(save=True) # save results
+
+ def __len__(self):
+ return self.n
+
+ def tolist(self):
+ # return a list of Detections objects, i.e. 'for result in results.tolist():'
+ x = [Detections([self.imgs[i]], [self.pred[i]], self.names) for i in range(self.n)]
+ for d in x:
+ for k in ['imgs', 'pred', 'xyxy', 'xyxyn', 'xywh', 'xywhn']:
+ setattr(d, k, getattr(d, k)[0]) # pop out of list"""
diff --git a/lib/models/light.py b/lib/models/light.py
new file mode 100644
index 0000000000000000000000000000000000000000..13a96ad894e7d840b9375d41d6ba37e194617225
--- /dev/null
+++ b/lib/models/light.py
@@ -0,0 +1,496 @@
+import torch
+from torch import tensor
+import torch.nn as nn
+import sys, os
+import math
+import time  # used by the timing prints in CSPDarknet.forward
+sys.path.append(os.getcwd())
+from lib.utils import initialize_weights
+# from lib.models.common2 import DepthSeperabelConv2d as Conv
+# from lib.models.common2 import SPP, Bottleneck, BottleneckCSP, Focus, Concat, Detect
+from lib.models.common import Conv, SPP, Bottleneck, BottleneckCSP, Focus, Concat, Detect
+from torch.nn import Upsample
+from lib.utils import check_anchor_order
+from lib.core.evaluate import SegmentationMetric
+from lib.utils.utils import time_synchronized
+
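+# Each row of a network config below is [from, block, args]:
+#   from  - index of the input layer (-1 = previous layer, a list = layers fed to Concat/Detect)
+#   block - module class to instantiate
+#   args  - positional arguments passed to the block constructor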
+CSPDarknet_s = [
+[ -1, Focus, [3, 32, 3]],
+[ -1, Conv, [32, 64, 3, 2]],
+[ -1, BottleneckCSP, [64, 64, 1]],
+[ -1, Conv, [64, 128, 3, 2]],
+[ -1, BottleneckCSP, [128, 128, 3]],
+[ -1, Conv, [128, 256, 3, 2]],
+[ -1, BottleneckCSP, [256, 256, 3]],
+[ -1, Conv, [256, 512, 3, 2]],
+[ -1, SPP, [512, 512, [5, 9, 13]]],
+[ -1, BottleneckCSP, [512, 512, 1, False]]
+]
+
+# MCnet = [
+# [ -1, Focus, [3, 32, 3]],
+# [ -1, Conv, [32, 64, 3, 2]],
+# [ -1, BottleneckCSP, [64, 64, 1]],
+# [ -1, Conv, [64, 128, 3, 2]],
+# [ -1, BottleneckCSP, [128, 128, 3]],
+# [ -1, Conv, [128, 256, 3, 2]],
+# [ -1, BottleneckCSP, [256, 256, 3]],
+# [ -1, Conv, [256, 512, 3, 2]],
+# [ -1, SPP, [512, 512, [5, 9, 13]]],
+# [ -1, BottleneckCSP, [512, 512, 1, False]],
+# [ -1, Conv,[512, 256, 1, 1]],
+# [ -1, Upsample, [None, 2, 'nearest']],
+# [ [-1, 6], Concat, [1]],
+# [ -1, BottleneckCSP, [512, 256, 1, False]],
+# [ -1, Conv, [256, 128, 1, 1]],
+# [ -1, Upsample, [None, 2, 'nearest']],
+# [ [-1,4], Concat, [1]],
+# [ -1, BottleneckCSP, [256, 128, 1, False]],
+# [ -1, Conv, [128, 128, 3, 2]],
+# [ [-1, 14], Concat, [1]],
+# [ -1, BottleneckCSP, [256, 256, 1, False]],
+# [ -1, Conv, [256, 256, 3, 2]],
+# [ [-1, 10], Concat, [1]],
+# [ -1, BottleneckCSP, [512, 512, 1, False]],
+# [ [17, 20, 23], Detect, [1, [[3,9,5,11,4,20], [7,18,6,39,12,31], [19,50,38,81,68,157]], [128, 256, 512]]],
+# [ 17, Conv, [128, 64, 3, 1]],
+# [ -1, Upsample, [None, 2, 'nearest']],
+# [ [-1,2], Concat, [1]],
+# [ -1, BottleneckCSP, [128, 64, 1, False]],
+# [ -1, Conv, [64, 32, 3, 1]],
+# [ -1, Upsample, [None, 2, 'nearest']],
+# [ -1, Conv, [32, 16, 3, 1]],
+# [ -1, BottleneckCSP, [16, 8, 1, False]],
+# [ -1, Upsample, [None, 2, 'nearest']],
+# [ -1, Conv, [8, 2, 3, 1]] #segmentation output
+# ]
+
+MCnet_SPP = [
+[ -1, Focus, [3, 32, 3]],
+[ -1, Conv, [32, 64, 3, 2]],
+[ -1, BottleneckCSP, [64, 64, 1]],
+[ -1, Conv, [64, 128, 3, 2]],
+[ -1, BottleneckCSP, [128, 128, 3]],
+[ -1, Conv, [128, 256, 3, 2]],
+[ -1, BottleneckCSP, [256, 256, 3]],
+[ -1, Conv, [256, 512, 3, 2]],
+[ -1, SPP, [512, 512, [5, 9, 13]]],
+[ -1, BottleneckCSP, [512, 512, 1, False]],
+[ -1, Conv,[512, 256, 1, 1]],
+[ -1, Upsample, [None, 2, 'nearest']],
+[ [-1, 6], Concat, [1]],
+[ -1, BottleneckCSP, [512, 256, 1, False]],
+[ -1, Conv, [256, 128, 1, 1]],
+[ -1, Upsample, [None, 2, 'nearest']],
+[ [-1,4], Concat, [1]],
+[ -1, BottleneckCSP, [256, 128, 1, False]],
+[ -1, Conv, [128, 128, 3, 2]],
+[ [-1, 14], Concat, [1]],
+[ -1, BottleneckCSP, [256, 256, 1, False]],
+[ -1, Conv, [256, 256, 3, 2]],
+[ [-1, 10], Concat, [1]],
+[ -1, BottleneckCSP, [512, 512, 1, False]],
+# [ [17, 20, 23], Detect, [1, [[3,9,5,11,4,20], [7,18,6,39,12,31], [19,50,38,81,68,157]], [128, 256, 512]]],
+[ [17, 20, 23], Detect, [13, [[3,9,5,11,4,20], [7,18,6,39,12,31], [19,50,38,81,68,157]], [128, 256, 512]]],
+[ 17, Conv, [128, 64, 3, 1]],
+[ -1, Upsample, [None, 2, 'nearest']],
+[ [-1,2], Concat, [1]],
+[ -1, BottleneckCSP, [128, 64, 1, False]],
+[ -1, Conv, [64, 32, 3, 1]],
+[ -1, Upsample, [None, 2, 'nearest']],
+[ -1, Conv, [32, 16, 3, 1]],
+[ -1, BottleneckCSP, [16, 8, 1, False]],
+[ -1, Upsample, [None, 2, 'nearest']],
+[ -1, SPP, [8, 2, [5, 9, 13]]] #segmentation output
+]
+# [2,6,3,9,5,13], [7,19,11,26,17,39], [28,64,44,103,61,183]
+MCnet_fast = [
+[ -1, Focus, [3, 32, 3]],#0
+[ -1, Conv, [32, 64, 3, 2]],#1
+[ -1, BottleneckCSP, [64, 128, 1, True, True]],#2
+[ -1, BottleneckCSP, [128, 256, 1, True, True]],#4
+[ -1, BottleneckCSP, [256, 512, 1, True, True]],#6
+[ -1, SPP, [512, 512, [5, 9, 13]]],#8
+[ -1, BottleneckCSP, [512, 512, 1, False]],#9
+[ -1, Conv,[512, 256, 1, 1]],#10
+[ -1, Upsample, [None, 2, 'nearest']],#11
+[ [-1, 6], Concat, [1]],#12
+[ -1, BottleneckCSP, [512, 256, 1, False]],#13
+[ -1, Conv, [256, 128, 1, 1]],#14
+[ -1, Upsample, [None, 2, 'nearest']],#15
+[ [-1,4], Concat, [1]],#16
+[ -1, BottleneckCSP, [256, 128, 1, False, True]],#17
+[ [-1, 14], Concat, [1]],#19
+[ -1, BottleneckCSP, [256, 256, 1, False, True]],#20
+[ [-1, 10], Concat, [1]],#22
+[ -1, BottleneckCSP, [512, 512, 1, False]],#23
+[ [17, 20, 23], Detect, [1, [[3,9,5,11,4,20], [7,18,6,39,12,31], [19,50,38,81,68,157]], [128, 256, 512]]], #Detect output 24
+
+[ 16, Conv, [256, 64, 3, 1]],#25
+[ -1, Upsample, [None, 2, 'nearest']],#26
+[ [-1,2], Concat, [1]],#27
+[ -1, BottleneckCSP, [128, 32, 1, False]],#28
+# [ -1, Conv, [64, 32, 1, 1]],#29
+[ -1, Upsample, [None, 2, 'nearest']],#30
+# [ -1, Conv, [32, 16, 1, 1]],#31
+[ -1, BottleneckCSP, [32, 8, 1, False]],#32
+[ -1, Upsample, [None, 2, 'nearest']],#33
+[ -1, Conv, [8, 2, 1, 1]], #Driving area segmentation output#34
+
+[ 16, Conv, [256, 64, 3, 1]],
+[ -1, Upsample, [None, 2, 'nearest']],
+[ [-1,2], Concat, [1]],
+[ -1, BottleneckCSP, [128, 32, 1, False]],
+# [ -1, Conv, [64, 32, 1, 1]],
+[ -1, Upsample, [None, 2, 'nearest']],
+# [ -1, Conv, [32, 16, 1, 1]],
+[ 31, BottleneckCSP, [32, 8, 1, False]],#35
+[ -1, Upsample, [None, 2, 'nearest']],#36
+[ -1, Conv, [8, 2, 1, 1]], #Lane line segmentation output #37
+]
+
+MCnet_light = [
+[ -1, Focus, [3, 32, 3]],#0
+[ -1, Conv, [32, 64, 3, 2]],#1
+[ -1, BottleneckCSP, [64, 64, 1]],#2
+[ -1, Conv, [64, 128, 3, 2]],#3
+[ -1, BottleneckCSP, [128, 128, 3]],#4
+[ -1, Conv, [128, 256, 3, 2]],#5
+[ -1, BottleneckCSP, [256, 256, 3]],#6
+[ -1, Conv, [256, 512, 3, 2]],#7
+[ -1, SPP, [512, 512, [5, 9, 13]]],#8
+[ -1, BottleneckCSP, [512, 512, 1, False]],#9
+[ -1, Conv,[512, 256, 1, 1]],#10
+[ -1, Upsample, [None, 2, 'nearest']],#11
+[ [-1, 6], Concat, [1]],#12
+[ -1, BottleneckCSP, [512, 256, 1, False]],#13
+[ -1, Conv, [256, 128, 1, 1]],#14
+[ -1, Upsample, [None, 2, 'nearest']],#15
+[ [-1,4], Concat, [1]],#16
+[ -1, BottleneckCSP, [256, 128, 1, False]],#17
+[ -1, Conv, [128, 128, 3, 2]],#18
+[ [-1, 14], Concat, [1]],#19
+[ -1, BottleneckCSP, [256, 256, 1, False]],#20
+[ -1, Conv, [256, 256, 3, 2]],#21
+[ [-1, 10], Concat, [1]],#22
+[ -1, BottleneckCSP, [512, 512, 1, False]],#23
+[ [17, 20, 23], Detect, [1, [[4,12,6,18,10,27], [15,38,24,59,39,78], [51,125,73,168,97,292]], [128, 256, 512]]], #Detect output 24
+
+[ 16, Conv, [256, 128, 3, 1]],#25
+[ -1, Upsample, [None, 2, 'nearest']],#26
+# [ [-1,2], Concat, [1]],#27
+[ -1, BottleneckCSP, [128, 64, 1, False]],#27
+[ -1, Conv, [64, 32, 3, 1]],#28
+[ -1, Upsample, [None, 2, 'nearest']],#29
+[ -1, Conv, [32, 16, 3, 1]],#30
+[ -1, BottleneckCSP, [16, 8, 1, False]],#31
+[ -1, Upsample, [None, 2, 'nearest']],#32
+[ -1, Conv, [8, 3, 3, 1]], #Driving area segmentation output#33
+
+# [ 16, Conv, [128, 64, 3, 1]],
+# [ -1, Upsample, [None, 2, 'nearest']],
+# [ [-1,2], Concat, [1]],
+# [ -1, BottleneckCSP, [128, 64, 1, False]],
+# [ -1, Conv, [64, 32, 3, 1]],
+# [ -1, Upsample, [None, 2, 'nearest']],
+# [ -1, Conv, [32, 16, 3, 1]],
+[ 30, BottleneckCSP, [16, 8, 1, False]],#34
+[ -1, Upsample, [None, 2, 'nearest']],#35
+[ -1, Conv, [8, 2, 3, 1]], #Lane line segmentation output #36
+]
+
+
+# The lane line and the driving area segment branches share information with each other
+MCnet_share = [
+[ -1, Focus, [3, 32, 3]], #0
+[ -1, Conv, [32, 64, 3, 2]], #1
+[ -1, BottleneckCSP, [64, 64, 1]], #2
+[ -1, Conv, [64, 128, 3, 2]], #3
+[ -1, BottleneckCSP, [128, 128, 3]], #4
+[ -1, Conv, [128, 256, 3, 2]], #5
+[ -1, BottleneckCSP, [256, 256, 3]], #6
+[ -1, Conv, [256, 512, 3, 2]], #7
+[ -1, SPP, [512, 512, [5, 9, 13]]], #8
+[ -1, BottleneckCSP, [512, 512, 1, False]], #9
+[ -1, Conv,[512, 256, 1, 1]], #10
+[ -1, Upsample, [None, 2, 'nearest']], #11
+[ [-1, 6], Concat, [1]], #12
+[ -1, BottleneckCSP, [512, 256, 1, False]], #13
+[ -1, Conv, [256, 128, 1, 1]], #14
+[ -1, Upsample, [None, 2, 'nearest']], #15
+[ [-1,4], Concat, [1]], #16
+[ -1, BottleneckCSP, [256, 128, 1, False]], #17
+[ -1, Conv, [128, 128, 3, 2]], #18
+[ [-1, 14], Concat, [1]], #19
+[ -1, BottleneckCSP, [256, 256, 1, False]], #20
+[ -1, Conv, [256, 256, 3, 2]], #21
+[ [-1, 10], Concat, [1]], #22
+[ -1, BottleneckCSP, [512, 512, 1, False]], #23
+[ [17, 20, 23], Detect, [1, [[3,9,5,11,4,20], [7,18,6,39,12,31], [19,50,38,81,68,157]], [128, 256, 512]]], #Detect output 24
+
+[ 16, Conv, [256, 64, 3, 1]], #25
+[ -1, Upsample, [None, 2, 'nearest']], #26
+[ [-1,2], Concat, [1]], #27
+[ -1, BottleneckCSP, [128, 64, 1, False]], #28
+[ -1, Conv, [64, 32, 3, 1]], #29
+[ -1, Upsample, [None, 2, 'nearest']], #30
+[ -1, Conv, [32, 16, 3, 1]], #31
+[ -1, BottleneckCSP, [16, 8, 1, False]], #32 driving area segment neck
+
+[ 16, Conv, [256, 64, 3, 1]], #33
+[ -1, Upsample, [None, 2, 'nearest']], #34
+[ [-1,2], Concat, [1]], #35
+[ -1, BottleneckCSP, [128, 64, 1, False]], #36
+[ -1, Conv, [64, 32, 3, 1]], #37
+[ -1, Upsample, [None, 2, 'nearest']], #38
+[ -1, Conv, [32, 16, 3, 1]], #39
+[ -1, BottleneckCSP, [16, 8, 1, False]], #40 lane line segment neck
+
+[ [31,39], Concat, [1]], #41
+[ -1, Conv, [32, 8, 3, 1]], #42 Share_Block
+
+
+[ [32,42], Concat, [1]], #43
+[ -1, Upsample, [None, 2, 'nearest']], #44
+[ -1, Conv, [16, 2, 3, 1]], #45 Driving area segmentation output
+
+
+[ [40,42], Concat, [1]], #46
+[ -1, Upsample, [None, 2, 'nearest']], #47
+[ -1, Conv, [16, 2, 3, 1]] #48 Lane line segmentation output
+]
+
+# The lane line and the driving area segment branches without share information with each other
+MCnet_no_share = [
+[ -1, Focus, [3, 32, 3]], #0
+[ -1, Conv, [32, 64, 3, 2]], #1
+[ -1, BottleneckCSP, [64, 64, 1]], #2
+[ -1, Conv, [64, 128, 3, 2]], #3
+[ -1, BottleneckCSP, [128, 128, 3]], #4
+[ -1, Conv, [128, 256, 3, 2]], #5
+[ -1, BottleneckCSP, [256, 256, 3]], #6
+[ -1, Conv, [256, 512, 3, 2]], #7
+[ -1, SPP, [512, 512, [5, 9, 13]]], #8
+[ -1, BottleneckCSP, [512, 512, 1, False]], #9
+[ -1, Conv,[512, 256, 1, 1]], #10
+[ -1, Upsample, [None, 2, 'nearest']], #11
+[ [-1, 6], Concat, [1]], #12
+[ -1, BottleneckCSP, [512, 256, 1, False]], #13
+[ -1, Conv, [256, 128, 1, 1]], #14
+[ -1, Upsample, [None, 2, 'nearest']], #15
+[ [-1,4], Concat, [1]], #16
+[ -1, BottleneckCSP, [256, 128, 1, False]], #17
+[ -1, Conv, [128, 128, 3, 2]], #18
+[ [-1, 14], Concat, [1]], #19
+[ -1, BottleneckCSP, [256, 256, 1, False]], #20
+[ -1, Conv, [256, 256, 3, 2]], #21
+[ [-1, 10], Concat, [1]], #22
+[ -1, BottleneckCSP, [512, 512, 1, False]], #23
+[ [17, 20, 23], Detect, [13, [[3,9,5,11,4,20], [7,18,6,39,12,31], [19,50,38,81,68,157]], [128, 256, 512]]], #Detect output 24
+
+[ 16, Conv, [256, 64, 3, 1]], #25
+[ -1, Upsample, [None, 2, 'nearest']], #26
+[ [-1,2], Concat, [1]], #27
+[ -1, BottleneckCSP, [128, 64, 1, False]], #28
+[ -1, Conv, [64, 32, 3, 1]], #29
+[ -1, Upsample, [None, 2, 'nearest']], #30
+[ -1, Conv, [32, 16, 3, 1]], #31
+[ -1, BottleneckCSP, [16, 8, 1, False]], #32 driving area segment neck
+[ -1, Upsample, [None, 2, 'nearest']], #33
+[ -1, Conv, [8, 3, 3, 1]], #34 Driving area segmentation output
+
+[ 16, Conv, [256, 64, 3, 1]], #35
+[ -1, Upsample, [None, 2, 'nearest']], #36
+[ [-1,2], Concat, [1]], #37
+[ -1, BottleneckCSP, [128, 64, 1, False]], #38
+[ -1, Conv, [64, 32, 3, 1]], #39
+[ -1, Upsample, [None, 2, 'nearest']], #40
+[ -1, Conv, [32, 16, 3, 1]], #41
+[ -1, BottleneckCSP, [16, 8, 1, False]], #42 lane line segment neck
+[ -1, Upsample, [None, 2, 'nearest']], #43
+[ -1, Conv, [8, 2, 3, 1]] #44 Lane line segmentation output
+]
+
+
+
+class MCnet(nn.Module):
+ def __init__(self, block_cfg, **kwargs):
+ super(MCnet, self).__init__()
+ layers, save= [], []
+ self.nc = 13
+ self.detector_index = -1
+ self.Da_out_idx = 45 if len(block_cfg)==49 else 34
+ # self.Da_out_idx = 37
+
+ # Build model
+ # print(block_cfg)
+ for i, (from_, block, args) in enumerate(block_cfg):
+ block = eval(block) if isinstance(block, str) else block # eval strings
+ if block is Detect:
+ self.detector_index = i
+ block_ = block(*args)
+ block_.index, block_.from_ = i, from_
+ layers.append(block_)
+ save.extend(x % i for x in ([from_] if isinstance(from_, int) else from_) if x != -1) # append to savelist
+ self.model, self.save = nn.Sequential(*layers), sorted(save)
+ self.names = [str(i) for i in range(self.nc)]
+
+        # set stride and anchors for the detector
+ Detector = self.model[self.detector_index] # detector
+ if isinstance(Detector, Detect):
+ s = 128 # 2x min stride
+ # for x in self.forward(torch.zeros(1, 3, s, s)):
+ # print (x.shape)
+ with torch.no_grad():
+ detects, _, _= self.forward(torch.zeros(1, 3, s, s))
+ Detector.stride = torch.tensor([s / x.shape[-2] for x in detects]) # forward
+ # print("stride"+str(Detector.stride ))
+ Detector.anchors /= Detector.stride.view(-1, 1, 1) # Set the anchors for the corresponding scale
+ check_anchor_order(Detector)
+ self.stride = Detector.stride
+ self._initialize_biases()
+
+ initialize_weights(self)
+
+ def forward(self, x):
+ cache = []
+ out = []
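+        # outputs are collected as [detection, drivable-area segmentation, lane-line segmentation]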
+ #times = []
+ for i, block in enumerate(self.model):
+ #t0 = time_synchronized()
+ if block.from_ != -1:
+ x = cache[block.from_] if isinstance(block.from_, int) else [x if j == -1 else cache[j] for j in block.from_] #calculate concat detect
+ x = block(x)
+ if isinstance(block, Detect): # save detector result
+ out.append(x)
+ if i == self.Da_out_idx: #save driving area segment result
+ m=nn.Sigmoid()
+ out.append(m(x))
+ cache.append(x if block.index in self.save else None)
+ """t1 = time_synchronized()
+ print(str(i) + " : " + str(t1-t0))
+ times.append(t1-t0)
+ print(sum(times[:25]))
+ print(sum(times[25:33]))
+ print(sum(times[33:41]))
+ print(sum(times[41:43]))
+ print(sum(times[43:46]))
+ print(sum(times[46:]))"""
+ m=nn.Sigmoid()
+ out.append(m(x))
+ return out
+
+ def _initialize_biases(self, cf=None): # initialize biases into Detect(), cf is class frequency
+ # https://arxiv.org/abs/1708.02002 section 3.3
+ # cf = torch.bincount(torch.tensor(np.concatenate(dataset.labels, 0)[:, 0]).long(), minlength=nc) + 1.
+ # m = self.model[-1] # Detect() module
+ m = self.model[self.detector_index] # Detect() module
+ for mi, s in zip(m.m, m.stride): # from
+ b = mi.bias.view(m.na, -1) # conv.bias(255) to (3,85)
+ b[:, 4] += math.log(8 / (640 / s) ** 2) # obj (8 objects per 640 image)
+ b[:, 5:] += math.log(0.6 / (m.nc - 0.99)) if cf is None else torch.log(cf / cf.sum()) # cls
+ mi.bias = torch.nn.Parameter(b.view(-1), requires_grad=True)
+
+class CSPDarknet(nn.Module):
+ def __init__(self, block_cfg, **kwargs):
+ super(CSPDarknet, self).__init__()
+ layers, save= [], []
+ # self.nc = 13 #output category num
+ self.nc = 1
+ self.detector_index = -1
+
+ # Build model
+ for i, (from_, block, args) in enumerate(block_cfg):
+ block = eval(block) if isinstance(block, str) else block # eval strings
+ if block is Detect:
+ self.detector_index = i
+ block_ = block(*args)
+ block_.index, block_.from_ = i, from_
+ layers.append(block_)
+ save.extend(x % i for x in ([from_] if isinstance(from_, int) else from_) if x != -1) # append to savelist
+ self.model, self.save = nn.Sequential(*layers), sorted(save)
+ self.names = [str(i) for i in range(self.nc)]
+
+        # set stride and anchors for the detector
+ Detector = self.model[self.detector_index] # detector
+ if isinstance(Detector, Detect):
+ s = 128 # 2x min stride
+ # for x in self.forward(torch.zeros(1, 3, s, s)):
+ # print (x.shape)
+ with torch.no_grad():
+ detects, _ = self.forward(torch.zeros(1, 3, s, s))
+ Detector.stride = torch.tensor([s / x.shape[-2] for x in detects]) # forward
+ # print("stride"+str(Detector.stride ))
+ Detector.anchors /= Detector.stride.view(-1, 1, 1) # Set the anchors for the corresponding scale
+ check_anchor_order(Detector)
+ self.stride = Detector.stride
+ self._initialize_biases()
+
+ initialize_weights(self)
+
+ def forward(self, x):
+ cache = []
+ out = []
+ for i, block in enumerate(self.model):
+ if block.from_ != -1:
+ x = cache[block.from_] if isinstance(block.from_, int) else [x if j == -1 else cache[j] for j in block.from_] #calculate concat detect
+            start = time.time()
+            x = block(x)
+            end = time.time()
+            print(end - start)  # per-block forward time in seconds
+ """y = None if isinstance(x, list) else x.shape"""
+ if isinstance(block, Detect): # save detector result
+ out.append(x)
+ cache.append(x if block.index in self.save else None)
+ m=nn.Sigmoid()
+ out.append(m(x))
+ # out.append(x)
+ # print(out[0][0].shape, out[0][1].shape, out[0][2].shape)
+ return out
+
+ def _initialize_biases(self, cf=None): # initialize biases into Detect(), cf is class frequency
+ # https://arxiv.org/abs/1708.02002 section 3.3
+ # cf = torch.bincount(torch.tensor(np.concatenate(dataset.labels, 0)[:, 0]).long(), minlength=nc) + 1.
+ # m = self.model[-1] # Detect() module
+ m = self.model[self.detector_index] # Detect() module
+ for mi, s in zip(m.m, m.stride): # from
+ b = mi.bias.view(m.na, -1) # conv.bias(255) to (3,85)
+ b[:, 4] += math.log(8 / (640 / s) ** 2) # obj (8 objects per 640 image)
+ b[:, 5:] += math.log(0.6 / (m.nc - 0.99)) if cf is None else torch.log(cf / cf.sum()) # cls
+ mi.bias = torch.nn.Parameter(b.view(-1), requires_grad=True)
+
+
+def get_net(cfg, **kwargs):
+ # m_block_cfg = MCnet_share if cfg.MODEL.STRU_WITHSHARE else MCnet_no_share
+ m_block_cfg = MCnet_no_share
+ model = MCnet(m_block_cfg, **kwargs)
+ return model
+
+
+if __name__ == "__main__":
+ from torch.utils.tensorboard import SummaryWriter
+ model = get_net(False)
+ input_ = torch.randn((1, 3, 256, 256))
+ gt_ = torch.rand((1, 2, 256, 256))
+ metric = SegmentationMetric(2)
+
+    detects, driving_area_seg, lane_line_seg = model(input_)
+    for det in detects:
+        print(det.shape)
+    print(driving_area_seg.shape)
+    print(driving_area_seg.view(-1).shape)
+    _, predict = torch.max(driving_area_seg, 1)
+    print(predict.shape)
+    print(lane_line_seg.shape)
+
+    _, lane_line_pred = torch.max(lane_line_seg, 1)
+    _, lane_line_gt = torch.max(gt_, 1)
+ metric.reset()
+ metric.addBatch(lane_line_pred.cpu(), lane_line_gt.cpu())
+ acc = metric.pixelAccuracy()
+ meanAcc = metric.meanPixelAccuracy()
+ mIoU = metric.meanIntersectionOverUnion()
+ FWIoU = metric.Frequency_Weighted_Intersection_over_Union()
+ IoU = metric.IntersectionOverUnion()
+ print(IoU)
+ print(mIoU)
\ No newline at end of file
diff --git a/lib/utils/__init__.py b/lib/utils/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..fd59e9977011221ab0466eb7ca75afe9634f4ec8
--- /dev/null
+++ b/lib/utils/__init__.py
@@ -0,0 +1,4 @@
+from .utils import initialize_weights, xyxy2xywh, is_parallel, DataLoaderX, torch_distributed_zero_first, clean_str
+from .autoanchor import check_anchor_order, run_anchor, kmean_anchors
+from .augmentations import augment_hsv, random_perspective, cutout, letterbox, letterbox_for_img
+from .plot import plot_img_and_mask, plot_one_box, show_seg_result
diff --git a/lib/utils/augmentations.py b/lib/utils/augmentations.py
new file mode 100644
index 0000000000000000000000000000000000000000..dc5bf0d0e653aedbf7ec30de88c6e21f17620c48
--- /dev/null
+++ b/lib/utils/augmentations.py
@@ -0,0 +1,256 @@
+# -*- coding: utf-8 -*-
+
+import numpy as np
+import cv2
+import random
+import math
+
+
+def augment_hsv(img, hgain=0.5, sgain=0.5, vgain=0.5):
+ """change color hue, saturation, value"""
+ r = np.random.uniform(-1, 1, 3) * [hgain, sgain, vgain] + 1 # random gains
+ hue, sat, val = cv2.split(cv2.cvtColor(img, cv2.COLOR_BGR2HSV))
+ dtype = img.dtype # uint8
+
+ x = np.arange(0, 256, dtype=np.int16)
+ lut_hue = ((x * r[0]) % 180).astype(dtype)
+ lut_sat = np.clip(x * r[1], 0, 255).astype(dtype)
+ lut_val = np.clip(x * r[2], 0, 255).astype(dtype)
+
+ img_hsv = cv2.merge((cv2.LUT(hue, lut_hue), cv2.LUT(sat, lut_sat), cv2.LUT(val, lut_val))).astype(dtype)
+ cv2.cvtColor(img_hsv, cv2.COLOR_HSV2BGR, dst=img) # no return needed
+
+ # Histogram equalization
+ # if random.random() < 0.2:
+ # for i in range(3):
+ # img[:, :, i] = cv2.equalizeHist(img[:, :, i])
+
+
+def random_perspective(combination, targets=(), degrees=10, translate=.1, scale=.1, shear=10, perspective=0.0, border=(0, 0)):
+ """combination of img transform"""
+ # torchvision.transforms.RandomAffine(degrees=(-10, 10), translate=(.1, .1), scale=(.9, 1.1), shear=(-10, 10))
+ # targets = [cls, xyxy]
+ img, gray, line = combination
+ height = img.shape[0] + border[0] * 2 # shape(h,w,c)
+ width = img.shape[1] + border[1] * 2
+
+ # Center
+ C = np.eye(3)
+ C[0, 2] = -img.shape[1] / 2 # x translation (pixels)
+ C[1, 2] = -img.shape[0] / 2 # y translation (pixels)
+
+ # Perspective
+ P = np.eye(3)
+ P[2, 0] = random.uniform(-perspective, perspective) # x perspective (about y)
+ P[2, 1] = random.uniform(-perspective, perspective) # y perspective (about x)
+
+ # Rotation and Scale
+ R = np.eye(3)
+ a = random.uniform(-degrees, degrees)
+ # a += random.choice([-180, -90, 0, 90]) # add 90deg rotations to small rotations
+ s = random.uniform(1 - scale, 1 + scale)
+ # s = 2 ** random.uniform(-scale, scale)
+ R[:2] = cv2.getRotationMatrix2D(angle=a, center=(0, 0), scale=s)
+
+ # Shear
+ S = np.eye(3)
+ S[0, 1] = math.tan(random.uniform(-shear, shear) * math.pi / 180) # x shear (deg)
+ S[1, 0] = math.tan(random.uniform(-shear, shear) * math.pi / 180) # y shear (deg)
+
+ # Translation
+ T = np.eye(3)
+ T[0, 2] = random.uniform(0.5 - translate, 0.5 + translate) * width # x translation (pixels)
+ T[1, 2] = random.uniform(0.5 - translate, 0.5 + translate) * height # y translation (pixels)
+
+ # Combined rotation matrix
+ M = T @ S @ R @ P @ C # order of operations (right to left) is IMPORTANT
+ if (border[0] != 0) or (border[1] != 0) or (M != np.eye(3)).any(): # image changed
+ if perspective:
+ img = cv2.warpPerspective(img, M, dsize=(width, height), borderValue=(114, 114, 114))
+ gray = cv2.warpPerspective(gray, M, dsize=(width, height), borderValue=0)
+ line = cv2.warpPerspective(line, M, dsize=(width, height), borderValue=0)
+ else: # affine
+ img = cv2.warpAffine(img, M[:2], dsize=(width, height), borderValue=(114, 114, 114))
+ gray = cv2.warpAffine(gray, M[:2], dsize=(width, height), borderValue=0)
+ line = cv2.warpAffine(line, M[:2], dsize=(width, height), borderValue=0)
+
+ # Visualize
+ # import matplotlib.pyplot as plt
+ # ax = plt.subplots(1, 2, figsize=(12, 6))[1].ravel()
+ # ax[0].imshow(img[:, :, ::-1]) # base
+ # ax[1].imshow(img2[:, :, ::-1]) # warped
+
+ # Transform label coordinates
+ n = len(targets)
+ if n:
+ # warp points
+ xy = np.ones((n * 4, 3))
+ xy[:, :2] = targets[:, [1, 2, 3, 4, 1, 4, 3, 2]].reshape(n * 4, 2) # x1y1, x2y2, x1y2, x2y1
+ xy = xy @ M.T # transform
+ if perspective:
+ xy = (xy[:, :2] / xy[:, 2:3]).reshape(n, 8) # rescale
+ else: # affine
+ xy = xy[:, :2].reshape(n, 8)
+
+ # create new boxes
+ x = xy[:, [0, 2, 4, 6]]
+ y = xy[:, [1, 3, 5, 7]]
+ xy = np.concatenate((x.min(1), y.min(1), x.max(1), y.max(1))).reshape(4, n).T
+
+ # # apply angle-based reduction of bounding boxes
+ # radians = a * math.pi / 180
+ # reduction = max(abs(math.sin(radians)), abs(math.cos(radians))) ** 0.5
+ # x = (xy[:, 2] + xy[:, 0]) / 2
+ # y = (xy[:, 3] + xy[:, 1]) / 2
+ # w = (xy[:, 2] - xy[:, 0]) * reduction
+ # h = (xy[:, 3] - xy[:, 1]) * reduction
+ # xy = np.concatenate((x - w / 2, y - h / 2, x + w / 2, y + h / 2)).reshape(4, n).T
+
+ # clip boxes
+ xy[:, [0, 2]] = xy[:, [0, 2]].clip(0, width)
+ xy[:, [1, 3]] = xy[:, [1, 3]].clip(0, height)
+
+ # filter candidates
+ i = _box_candidates(box1=targets[:, 1:5].T * s, box2=xy.T)
+ targets = targets[i]
+ targets[:, 1:5] = xy[i]
+
+ combination = (img, gray, line)
+ return combination, targets
+
+
+def cutout(combination, labels):
+ # Applies image cutout augmentation https://arxiv.org/abs/1708.04552
+ image, gray = combination
+ h, w = image.shape[:2]
+
+ def bbox_ioa(box1, box2):
+ # Returns the intersection over box2 area given box1, box2. box1 is 4, box2 is nx4. boxes are x1y1x2y2
+ box2 = box2.transpose()
+
+ # Get the coordinates of bounding boxes
+ b1_x1, b1_y1, b1_x2, b1_y2 = box1[0], box1[1], box1[2], box1[3]
+ b2_x1, b2_y1, b2_x2, b2_y2 = box2[0], box2[1], box2[2], box2[3]
+
+ # Intersection area
+ inter_area = (np.minimum(b1_x2, b2_x2) - np.maximum(b1_x1, b2_x1)).clip(0) * \
+ (np.minimum(b1_y2, b2_y2) - np.maximum(b1_y1, b2_y1)).clip(0)
+
+ # box2 area
+ box2_area = (b2_x2 - b2_x1) * (b2_y2 - b2_y1) + 1e-16
+
+ # Intersection over box2 area
+ return inter_area / box2_area
+
+ # create random masks
+ scales = [0.5] * 1 + [0.25] * 2 + [0.125] * 4 + [0.0625] * 8 + [0.03125] * 16 # image size fraction
+ for s in scales:
+ mask_h = random.randint(1, int(h * s))
+ mask_w = random.randint(1, int(w * s))
+
+ # box
+ xmin = max(0, random.randint(0, w) - mask_w // 2)
+ ymin = max(0, random.randint(0, h) - mask_h // 2)
+ xmax = min(w, xmin + mask_w)
+ ymax = min(h, ymin + mask_h)
+ # print('xmin:{},ymin:{},xmax:{},ymax:{}'.format(xmin,ymin,xmax,ymax))
+
+ # apply random color mask
+ image[ymin:ymax, xmin:xmax] = [random.randint(64, 191) for _ in range(3)]
+ gray[ymin:ymax, xmin:xmax] = -1
+
+ # return unobscured labels
+ if len(labels) and s > 0.03:
+ box = np.array([xmin, ymin, xmax, ymax], dtype=np.float32)
+ ioa = bbox_ioa(box, labels[:, 1:5]) # intersection over area
+ labels = labels[ioa < 0.60] # remove >60% obscured labels
+
+ return image, gray, labels
+
+
+def letterbox(combination, new_shape=(640, 640), color=(114, 114, 114), auto=True, scaleFill=False, scaleup=True):
+ """Resize the input image and automatically padding to suitable shape :https://zhuanlan.zhihu.com/p/172121380"""
+ # Resize image to a 32-pixel-multiple rectangle https://github.com/ultralytics/yolov3/issues/232
+ img, gray, line = combination
+ shape = img.shape[:2] # current shape [height, width]
+ if isinstance(new_shape, int):
+ new_shape = (new_shape, new_shape)
+
+ # Scale ratio (new / old)
+ r = min(new_shape[0] / shape[0], new_shape[1] / shape[1])
+ if not scaleup: # only scale down, do not scale up (for better test mAP)
+ r = min(r, 1.0)
+
+ # Compute padding
+ ratio = r, r # width, height ratios
+ new_unpad = int(round(shape[1] * r)), int(round(shape[0] * r))
+ dw, dh = new_shape[1] - new_unpad[0], new_shape[0] - new_unpad[1] # wh padding
+ if auto: # minimum rectangle
+ dw, dh = np.mod(dw, 32), np.mod(dh, 32) # wh padding
+ elif scaleFill: # stretch
+ dw, dh = 0.0, 0.0
+ new_unpad = (new_shape[1], new_shape[0])
+ ratio = new_shape[1] / shape[1], new_shape[0] / shape[0] # width, height ratios
+
+ dw /= 2 # divide padding into 2 sides
+ dh /= 2
+
+ if shape[::-1] != new_unpad: # resize
+ img = cv2.resize(img, new_unpad, interpolation=cv2.INTER_LINEAR)
+ gray = cv2.resize(gray, new_unpad, interpolation=cv2.INTER_LINEAR)
+ line = cv2.resize(line, new_unpad, interpolation=cv2.INTER_LINEAR)
+
+ top, bottom = int(round(dh - 0.1)), int(round(dh + 0.1))
+ left, right = int(round(dw - 0.1)), int(round(dw + 0.1))
+
+ img = cv2.copyMakeBorder(img, top, bottom, left, right, cv2.BORDER_CONSTANT, value=color) # add border
+ gray = cv2.copyMakeBorder(gray, top, bottom, left, right, cv2.BORDER_CONSTANT, value=0) # add border
+ line = cv2.copyMakeBorder(line, top, bottom, left, right, cv2.BORDER_CONSTANT, value=0) # add border
+ # print(img.shape)
+
+ combination = (img, gray, line)
+ return combination, ratio, (dw, dh)
+
+def letterbox_for_img(img, new_shape=(640, 640), color=(114, 114, 114), auto=True, scaleFill=False, scaleup=True):
+ # Resize image to a 32-pixel-multiple rectangle https://github.com/ultralytics/yolov3/issues/232
+ shape = img.shape[:2] # current shape [height, width]
+ if isinstance(new_shape, int):
+ new_shape = (new_shape, new_shape)
+
+ # Scale ratio (new / old)
+ r = min(new_shape[0] / shape[0], new_shape[1] / shape[1])
+ if not scaleup: # only scale down, do not scale up (for better test mAP)
+ r = min(r, 1.0)
+
+ # Compute padding
+ ratio = r, r # width, height ratios
+ new_unpad = int(round(shape[1] * r)), int(round(shape[0] * r))
+
+
+ dw, dh = new_shape[1] - new_unpad[0], new_shape[0] - new_unpad[1] # wh padding
+
+ if auto: # minimum rectangle
+ dw, dh = np.mod(dw, 32), np.mod(dh, 32) # wh padding
+
+ elif scaleFill: # stretch
+ dw, dh = 0.0, 0.0
+ new_unpad = (new_shape[1], new_shape[0])
+ ratio = new_shape[1] / shape[1], new_shape[0] / shape[0] # width, height ratios
+
+ dw /= 2 # divide padding into 2 sides
+ dh /= 2
+ if shape[::-1] != new_unpad: # resize
+ img = cv2.resize(img, new_unpad, interpolation=cv2.INTER_AREA)
+
+ top, bottom = int(round(dh - 0.1)), int(round(dh + 0.1))
+ left, right = int(round(dw - 0.1)), int(round(dw + 0.1))
+ img = cv2.copyMakeBorder(img, top, bottom, left, right, cv2.BORDER_CONSTANT, value=color) # add border
+ return img, ratio, (dw, dh)
+
+
+def _box_candidates(box1, box2, wh_thr=2, ar_thr=20, area_thr=0.1): # box1(4,n), box2(4,n)
+ # Compute candidate boxes: box1 before augment, box2 after augment, wh_thr (pixels), aspect_ratio_thr, area_ratio
+ w1, h1 = box1[2] - box1[0], box1[3] - box1[1]
+ w2, h2 = box2[2] - box2[0], box2[3] - box2[1]
+ ar = np.maximum(w2 / (h2 + 1e-16), h2 / (w2 + 1e-16)) # aspect ratio
+ return (w2 > wh_thr) & (h2 > wh_thr) & (w2 * h2 / (w1 * h1 + 1e-16) > area_thr) & (ar < ar_thr) # candidates
diff --git a/lib/utils/autoanchor.py b/lib/utils/autoanchor.py
new file mode 100644
index 0000000000000000000000000000000000000000..7820153eacb1ca1cc86f881ab1cddd1e601f0856
--- /dev/null
+++ b/lib/utils/autoanchor.py
@@ -0,0 +1,134 @@
+# Auto-anchor utils
+
+import numpy as np
+import torch
+import yaml
+from scipy.cluster.vq import kmeans
+from tqdm import tqdm
+from lib.utils import is_parallel
+
+
+def check_anchor_order(m):
+ # Check anchor order against stride order for YOLOv5 Detect() module m, and correct if necessary
+ a = m.anchor_grid.prod(-1).view(-1) # anchor area
+ da = a[-1] - a[0] # delta a
+ ds = m.stride[-1] - m.stride[0] # delta s
+ if da.sign() != ds.sign(): # same order
+ print('Reversing anchor order')
+ m.anchors[:] = m.anchors.flip(0)
+ m.anchor_grid[:] = m.anchor_grid.flip(0)
+
+
+def run_anchor(logger, dataset, model, thr=4.0, imgsz=640):
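+    # Re-estimate detection anchors from the dataset labels (k-means + genetic evolution)
+    # and write them back into the model's Detect head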
+ det = model.module.model[model.module.detector_index] if is_parallel(model) \
+ else model.model[model.detector_index]
+ anchor_num = det.na * det.nl
+ new_anchors = kmean_anchors(dataset, n=anchor_num, img_size=imgsz, thr=thr, gen=1000, verbose=False)
+ new_anchors = torch.tensor(new_anchors, device=det.anchors.device).type_as(det.anchors)
+ det.anchor_grid[:] = new_anchors.clone().view_as(det.anchor_grid) # for inference
+ det.anchors[:] = new_anchors.clone().view_as(det.anchors) / det.stride.to(det.anchors.device).view(-1, 1, 1) # loss
+ check_anchor_order(det)
+ logger.info(str(det.anchors))
+ print('New anchors saved to model. Update model config to use these anchors in the future.')
+
+
+def kmean_anchors(path='./data/coco128.yaml', n=9, img_size=640, thr=4.0, gen=1000, verbose=True):
+ """ Creates kmeans-evolved anchors from training dataset
+
+ Arguments:
+ path: path to dataset *.yaml, or a loaded dataset
+ n: number of anchors
+ img_size: image size used for training
+ thr: anchor-label wh ratio threshold hyperparameter hyp['anchor_t'] used for training, default=4.0
+ gen: generations to evolve anchors using genetic algorithm
+ verbose: print all results
+
+ Return:
+ k: kmeans evolved anchors
+
+ Usage:
+ from utils.autoanchor import *; _ = kmean_anchors()
+ """
+ thr = 1. / thr
+
+ def metric(k, wh): # compute metrics
+ r = wh[:, None] / k[None]
+ x = torch.min(r, 1. / r).min(2)[0] # ratio metric
+ # x = wh_iou(wh, torch.tensor(k)) # iou metric
+ return x, x.max(1)[0] # x, best_x
+
+ def anchor_fitness(k): # mutation fitness
+ _, best = metric(torch.tensor(k, dtype=torch.float32), wh)
+ return (best * (best > thr).float()).mean() # fitness
+
+ def print_results(k):
+ k = k[np.argsort(k.prod(1))] # sort small to large
+ x, best = metric(k, wh0)
+ bpr, aat = (best > thr).float().mean(), (x > thr).float().mean() * n # best possible recall, anch > thr
+ print('thr=%.2f: %.4f best possible recall, %.2f anchors past thr' % (thr, bpr, aat))
+ print('n=%g, img_size=%s, metric_all=%.3f/%.3f-mean/best, past_thr=%.3f-mean: ' %
+ (n, img_size, x.mean(), best.mean(), x[x > thr].mean()), end='')
+ for i, x in enumerate(k):
+ print('%i,%i' % (round(x[0]), round(x[1])), end=', ' if i < len(k) - 1 else '\n') # use in *.cfg
+ return k
+
+ if isinstance(path, str): # not class
+ raise TypeError('Dataset must be class, but found str')
+ else:
+ dataset = path # dataset
+
+ labels = [db['label'] for db in dataset.db]
+ labels = np.vstack(labels)
+ if not (labels[:, 1:] <= 1).all():
+ # normalize label
+ labels[:, [2, 4]] /= dataset.shapes[0]
+ labels[:, [1, 3]] /= dataset.shapes[1]
+ # Get label wh
+ shapes = img_size * dataset.shapes / dataset.shapes.max()
+ # wh0 = np.concatenate([l[:, 3:5] * shapes for l in labels]) # wh
+ wh0 = labels[:, 3:5] * shapes
+ # Filter
+ i = (wh0 < 3.0).any(1).sum()
+ if i:
+ print('WARNING: Extremely small objects found. '
+ '%g of %g labels are < 3 pixels in width or height.' % (i, len(wh0)))
+ wh = wh0[(wh0 >= 2.0).any(1)] # filter > 2 pixels
+
+ # Kmeans calculation
+ print('Running kmeans for %g anchors on %g points...' % (n, len(wh)))
+ s = wh.std(0) # sigmas for whitening
+ k, dist = kmeans(wh / s, n, iter=30) # points, mean distance
+ k *= s
+ wh = torch.tensor(wh, dtype=torch.float32) # filtered
+ wh0 = torch.tensor(wh0, dtype=torch.float32) # unfiltered
+ k = print_results(k)
+
+ # Plot
+ # k, d = [None] * 20, [None] * 20
+ # for i in tqdm(range(1, 21)):
+ # k[i-1], d[i-1] = kmeans(wh / s, i) # points, mean distance
+ # fig, ax = plt.subplots(1, 2, figsize=(14, 7), tight_layout=True)
+ # ax = ax.ravel()
+ # ax[0].plot(np.arange(1, 21), np.array(d) ** 2, marker='.')
+ # fig, ax = plt.subplots(1, 2, figsize=(14, 7)) # plot wh
+ # ax[0].hist(wh[wh[:, 0]<100, 0],400)
+ # ax[1].hist(wh[wh[:, 1]<100, 1],400)
+ # fig.savefig('wh.png', dpi=200)
+
+ # Evolve
+ npr = np.random
+ f, sh, mp, s = anchor_fitness(k), k.shape, 0.9, 0.1 # fitness, anchor shape, mutation prob, sigma
+ pbar = tqdm(range(gen), desc='Evolving anchors with Genetic Algorithm') # progress bar
+ for _ in pbar:
+ v = np.ones(sh)
+ while (v == 1).all(): # mutate until a change occurs (prevent duplicates)
+ v = ((npr.random(sh) < mp) * npr.random() * npr.randn(*sh) * s + 1).clip(0.3, 3.0)
+ kg = (k.copy() * v).clip(min=2.0)
+ fg = anchor_fitness(kg)
+ if fg > f:
+ f, k = fg, kg.copy()
+ pbar.desc = 'Evolving anchors with Genetic Algorithm: fitness = %.4f' % f
+ if verbose:
+ print_results(k)
+
+ return print_results(k)
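+
+
+if __name__ == "__main__":
+    # Minimal sketch of the ratio metric used above, with toy numbers (not part of the
+    # anchor pipeline): r = wh / k is folded with min(r, 1/r), so a perfect match scores
+    # 1.0 and anything outside the thr=4.0 band falls below 1/4.
+    demo_wh = torch.tensor([[30., 60.]])                 # one 30x60 px label
+    demo_k = torch.tensor([[32., 64.], [10., 10.]])      # two candidate anchors
+    demo_r = demo_wh[:, None] / demo_k[None]
+    demo_x = torch.min(demo_r, 1. / demo_r).min(2)[0]    # ~[[0.94, 0.17]]
+    print(demo_x, demo_x.max(1)[0] > 1. / 4.0)           # best anchor passes the threshold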
diff --git a/lib/utils/plot.py b/lib/utils/plot.py
new file mode 100644
index 0000000000000000000000000000000000000000..b759ef4f731bd8fee1aa8894dab31508adafd34e
--- /dev/null
+++ b/lib/utils/plot.py
@@ -0,0 +1,113 @@
+## Process prediction result .json files and draw the figures
+import matplotlib.pyplot as plt
+import cv2
+import numpy as np
+import random
+
+
+def plot_img_and_mask(img, mask, index,epoch,save_dir):
+ classes = mask.shape[2] if len(mask.shape) > 2 else 1
+ fig, ax = plt.subplots(1, classes + 1)
+ ax[0].set_title('Input image')
+ ax[0].imshow(img)
+ if classes > 1:
+ for i in range(classes):
+ ax[i+1].set_title(f'Output mask (class {i+1})')
+ ax[i+1].imshow(mask[:, :, i])
+ else:
+ ax[1].set_title(f'Output mask')
+ ax[1].imshow(mask)
+ plt.xticks([]), plt.yticks([])
+ # plt.show()
+ plt.savefig(save_dir+"/batch_{}_{}_seg.png".format(epoch,index))
+
+def show_seg_result(img, result, index, epoch, save_dir=None, is_ll=False,palette=None,is_demo=False,is_gt=False):
+ # img = mmcv.imread(img)
+ # img = img.copy()
+ # seg = result[0]
+ if palette is None:
+ palette = np.random.randint(
+ 0, 255, size=(3, 3))
+ palette[0] = [0, 0, 0]
+ palette[1] = [0, 255, 0]
+ palette[2] = [255, 0, 0]
+ palette = np.array(palette)
+ assert palette.shape[0] == 3 # len(classes)
+ assert palette.shape[1] == 3
+ assert len(palette.shape) == 2
+
+ if not is_demo:
+ color_seg = np.zeros((result.shape[0], result.shape[1], 3), dtype=np.uint8)
+ for label, color in enumerate(palette):
+ color_seg[result == label, :] = color
+ else:
+ color_area = np.zeros((result[0].shape[0], result[0].shape[1], 3), dtype=np.uint8)
+
+ # for label, color in enumerate(palette):
+ # color_area[result[0] == label, :] = color
+
+ color_area[result[0] == 1] = [0, 255, 0]
+ color_area[result[1] ==1] = [255, 0, 0]
+ color_seg = color_area
+
+ # convert to BGR
+ color_seg = color_seg[..., ::-1]
+ # print(color_seg.shape)
+ color_mask = np.mean(color_seg, 2)
+ img[color_mask != 0] = img[color_mask != 0] * 0.5 + color_seg[color_mask != 0] * 0.5
+ # img = img * 0.5 + color_seg * 0.5
+ img = img.astype(np.uint8)
+ img = cv2.resize(img, (1280,720), interpolation=cv2.INTER_LINEAR)
+
+ if not is_demo:
+ if not is_gt:
+ if not is_ll:
+ cv2.imwrite(save_dir+"/batch_{}_{}_da_segresult.png".format(epoch,index), img)
+ else:
+ cv2.imwrite(save_dir+"/batch_{}_{}_ll_segresult.png".format(epoch,index), img)
+ else:
+ if not is_ll:
+ cv2.imwrite(save_dir+"/batch_{}_{}_da_seg_gt.png".format(epoch,index), img)
+ else:
+ cv2.imwrite(save_dir+"/batch_{}_{}_ll_seg_gt.png".format(epoch,index), img)
+ return img
+
+def plot_one_box(x, img, color=None, label=None, line_thickness=None):
+ # Plots one bounding box on image img
+ tl = line_thickness or round(0.0001 * (img.shape[0] + img.shape[1]) / 2) + 1 # line/font thickness
+ color = color or [random.randint(0, 255) for _ in range(3)]
+ c1, c2 = (int(x[0]), int(x[1])), (int(x[2]), int(x[3]))
+ cv2.rectangle(img, c1, c2, color, thickness=tl, lineType=cv2.LINE_AA)
+ # if label:
+ # tf = max(tl - 1, 1) # font thickness
+ # t_size = cv2.getTextSize(label, 0, fontScale=tl / 3, thickness=tf)[0]
+ # c2 = c1[0] + t_size[0], c1[1] - t_size[1] - 3
+ # cv2.rectangle(img, c1, c2, color, -1, cv2.LINE_AA) # filled
+ # print(label)
+ # cv2.putText(img, label, (c1[0], c1[1] - 2), 0, tl / 3, [225, 255, 255], thickness=tf, lineType=cv2.LINE_AA)
+
+
+if __name__ == "__main__":
+ pass
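+    # Minimal usage sketch (made-up box and output name): draw one xyxy box on a
+    # blank canvas and save it next to this script.
+    demo = np.zeros((720, 1280, 3), dtype=np.uint8)
+    plot_one_box([100, 200, 400, 500], demo, color=[0, 255, 0], label='car')
+    cv2.imwrite("plot_one_box_demo.png", demo)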
+# def plot():
+# cudnn.benchmark = cfg.CUDNN.BENCHMARK
+# torch.backends.cudnn.deterministic = cfg.CUDNN.DETERMINISTIC
+# torch.backends.cudnn.enabled = cfg.CUDNN.ENABLED
+
+# device = select_device(logger, batch_size=cfg.TRAIN.BATCH_SIZE_PER_GPU) if not cfg.DEBUG \
+# else select_device(logger, 'cpu')
+
+# if args.local_rank != -1:
+# assert torch.cuda.device_count() > args.local_rank
+# torch.cuda.set_device(args.local_rank)
+# device = torch.device('cuda', args.local_rank)
+# dist.init_process_group(backend='nccl', init_method='env://') # distributed backend
+
+# model = get_net(cfg).to(device)
+# model_file = '/home/zwt/DaChuang/weights/epoch--2.pth'
+# checkpoint = torch.load(model_file)
+# model.load_state_dict(checkpoint['state_dict'])
+# if rank == -1 and torch.cuda.device_count() > 1:
+# model = torch.nn.DataParallel(model, device_ids=cfg.GPUS).cuda()
+# if rank != -1:
+# model = DDP(model, device_ids=[args.local_rank], output_device=args.local_rank)
\ No newline at end of file
diff --git a/lib/utils/split_dataset.py b/lib/utils/split_dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..aeb7432916542c78fbad632f30c285e3bd03a1d0
--- /dev/null
+++ b/lib/utils/split_dataset.py
@@ -0,0 +1,30 @@
+import random
+import shutil
+import os
+
+def split(path, mask_path, lane_path):
+ os.mkdir(path + 'train')
+ os.mkdir(path + 'val')
+ os.mkdir(mask_path + 'train')
+ os.mkdir(mask_path + 'val')
+ os.mkdir(lane_path + 'train')
+ os.mkdir(lane_path + 'val')
+ val_index = random.sample(range(660), 200)
+ for i in range(660):
+ if i in val_index:
+ shutil.move(path+'{}.png'.format(i), path + 'val')
+ shutil.move(mask_path+'{}.png'.format(i), mask_path + 'val')
+ shutil.move(lane_path+'{}.png'.format(i), lane_path + 'val')
+ else:
+ shutil.move(path+'{}.png'.format(i), path + 'train')
+ shutil.move(mask_path+'{}.png'.format(i), mask_path + 'train')
+ shutil.move(lane_path+'{}.png'.format(i), lane_path + 'train')
+
+
+if __name__ == '__main__':
+ path = "/home/wqm/bdd/data_hust/"
+ mask_path = "/home/wqm/bdd/hust_area/"
+ lane_path = "/home/wqm/bdd/hust_lane/"
+ split(path, mask_path, lane_path)
+
+
diff --git a/lib/utils/utils.py b/lib/utils/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..529eb815bf0a0f7db9fa4834cdc926db8630ab6a
--- /dev/null
+++ b/lib/utils/utils.py
@@ -0,0 +1,164 @@
+import os
+import logging
+import time
+from collections import namedtuple
+from pathlib import Path
+
+import torch
+import torch.optim as optim
+import torch.nn as nn
+import numpy as np
+from torch.utils.data import DataLoader
+from prefetch_generator import BackgroundGenerator
+from contextlib import contextmanager
+import re
+
+def clean_str(s):
+ # Cleans a string by replacing special characters with underscore _
+ return re.sub(pattern="[|@#!¡·$€%&()=?¿^*;:,¨´><+]", repl="_", string=s)
+
+def create_logger(cfg, cfg_path, phase='train', rank=-1):
+ # set up logger dir
+ dataset = cfg.DATASET.DATASET
+ dataset = dataset.replace(':', '_')
+ model = cfg.MODEL.NAME
+ cfg_path = os.path.basename(cfg_path).split('.')[0]
+
+ if rank in [-1, 0]:
+ time_str = time.strftime('%Y-%m-%d-%H-%M')
+ log_file = '{}_{}_{}.log'.format(cfg_path, time_str, phase)
+ # set up tensorboard_log_dir
+ tensorboard_log_dir = Path(cfg.LOG_DIR) / dataset / model / \
+ (cfg_path + '_' + time_str)
+ final_output_dir = tensorboard_log_dir
+ if not tensorboard_log_dir.exists():
+ print('=> creating {}'.format(tensorboard_log_dir))
+ tensorboard_log_dir.mkdir(parents=True)
+
+ final_log_file = tensorboard_log_dir / log_file
+ head = '%(asctime)-15s %(message)s'
+ logging.basicConfig(filename=str(final_log_file),
+ format=head)
+ logger = logging.getLogger()
+ logger.setLevel(logging.INFO)
+ console = logging.StreamHandler()
+ logging.getLogger('').addHandler(console)
+
+ return logger, str(final_output_dir), str(tensorboard_log_dir)
+ else:
+ return None, None, None
+
+
+def select_device(logger=None, device='', batch_size=None):
+ # device = 'cpu' or '0' or '0,1,2,3'
+ cpu_request = device.lower() == 'cpu'
+ if device and not cpu_request: # if device requested other than 'cpu'
+ os.environ['CUDA_VISIBLE_DEVICES'] = device # set environment variable
+ assert torch.cuda.is_available(), 'CUDA unavailable, invalid device %s requested' % device # check availablity
+
+ cuda = False if cpu_request else torch.cuda.is_available()
+ if cuda:
+ c = 1024 ** 2 # bytes to MB
+ ng = torch.cuda.device_count()
+ if ng > 1 and batch_size: # check that batch_size is compatible with device_count
+ assert batch_size % ng == 0, 'batch-size %g not multiple of GPU count %g' % (batch_size, ng)
+ x = [torch.cuda.get_device_properties(i) for i in range(ng)]
+ s = f'Using torch {torch.__version__} '
+ for i in range(0, ng):
+ if i == 1:
+ s = ' ' * len(s)
+ if logger:
+ logger.info("%sCUDA:%g (%s, %dMB)" % (s, i, x[i].name, x[i].total_memory / c))
+ else:
+ if logger:
+ logger.info(f'Using torch {torch.__version__} CPU')
+
+ if logger:
+ logger.info('') # skip a line
+ return torch.device('cuda:0' if cuda else 'cpu')
+
+
+def get_optimizer(cfg, model):
+ optimizer = None
+ if cfg.TRAIN.OPTIMIZER == 'sgd':
+ optimizer = optim.SGD(
+ filter(lambda p: p.requires_grad, model.parameters()),
+ lr=cfg.TRAIN.LR0,
+ momentum=cfg.TRAIN.MOMENTUM,
+ weight_decay=cfg.TRAIN.WD,
+ nesterov=cfg.TRAIN.NESTEROV
+ )
+ elif cfg.TRAIN.OPTIMIZER == 'adam':
+ optimizer = optim.Adam(
+ filter(lambda p: p.requires_grad, model.parameters()),
+ #model.parameters(),
+ lr=cfg.TRAIN.LR0,
+ betas=(cfg.TRAIN.MOMENTUM, 0.999)
+ )
+
+ return optimizer
+
+
+def save_checkpoint(epoch, name, model, optimizer, output_dir, filename, is_best=False):
+ model_state = model.module.state_dict() if is_parallel(model) else model.state_dict()
+ checkpoint = {
+ 'epoch': epoch,
+ 'model': name,
+ 'state_dict': model_state,
+ # 'best_state_dict': model.module.state_dict(),
+ # 'perf': perf_indicator,
+ 'optimizer': optimizer.state_dict(),
+ }
+ torch.save(checkpoint, os.path.join(output_dir, filename))
+ if is_best and 'state_dict' in checkpoint:
+ torch.save(checkpoint['state_dict'],
+ os.path.join(output_dir, 'model_best.pth'))
+
+
+def initialize_weights(model):
+ for m in model.modules():
+ t = type(m)
+ if t is nn.Conv2d:
+ pass # nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
+ elif t is nn.BatchNorm2d:
+ m.eps = 1e-3
+ m.momentum = 0.03
+ elif t in [nn.Hardswish, nn.LeakyReLU, nn.ReLU, nn.ReLU6]:
+ # elif t in [nn.LeakyReLU, nn.ReLU, nn.ReLU6]:
+ m.inplace = True
+
+
+def xyxy2xywh(x):
+ # Convert nx4 boxes from [x1, y1, x2, y2] to [x, y, w, h] where xy1=top-left, xy2=bottom-right
+ y = x.clone() if isinstance(x, torch.Tensor) else np.copy(x)
+ y[:, 0] = (x[:, 0] + x[:, 2]) / 2 # x center
+ y[:, 1] = (x[:, 1] + x[:, 3]) / 2 # y center
+ y[:, 2] = x[:, 2] - x[:, 0] # width
+ y[:, 3] = x[:, 3] - x[:, 1] # height
+ return y
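+# Worked example (illustrative): xyxy2xywh(np.array([[100., 200., 300., 400.]]))
+# -> array([[200., 300., 200., 200.]]), i.e. (x_center, y_center, width, height).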
+
+
+def is_parallel(model):
+ return type(model) in (nn.parallel.DataParallel, nn.parallel.DistributedDataParallel)
+
+
+def time_synchronized():
+ torch.cuda.synchronize() if torch.cuda.is_available() else None
+ return time.time()
+
+
+class DataLoaderX(DataLoader):
+ """prefetch dataloader"""
+ def __iter__(self):
+ return BackgroundGenerator(super().__iter__())
+
+@contextmanager
+def torch_distributed_zero_first(local_rank: int):
+ """
+ Decorator to make all processes in distributed training wait for each local_master to do something.
+ """
+ if local_rank not in [-1, 0]:
+ torch.distributed.barrier()
+ yield
+ if local_rank == 0:
+ torch.distributed.barrier()
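+
+
+# Usage sketch (illustrative; `build_dataset` is a placeholder, not a function in this
+# repo): rank -1/0 enters the body first, e.g. to build a cache, while the other ranks
+# wait at the barrier and only run it afterwards.
+#
+#   with torch_distributed_zero_first(local_rank):
+#       dataset = build_dataset(cfg)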
diff --git a/pictures/da.png b/pictures/da.png
new file mode 100644
index 0000000000000000000000000000000000000000..cf15284335ff6327ac4909b44f4e72d76121c789
--- /dev/null
+++ b/pictures/da.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3b34697fc4aa4d9edc0649c89b746f41ff576d1821f0824609e5574bad1f5275
+size 678354
diff --git a/pictures/da_onnx.jpg b/pictures/da_onnx.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..d9ebe95d3b58aa53ce6e629e4dab1b7c20eb7208
Binary files /dev/null and b/pictures/da_onnx.jpg differ
diff --git a/pictures/detect.png b/pictures/detect.png
new file mode 100644
index 0000000000000000000000000000000000000000..53089ee335d0d9c4b05e1063d89ffd44bd7c14d5
--- /dev/null
+++ b/pictures/detect.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e391c9ea8236f695a19862dcb5f9928512091bfc6b706ee9d6f51cb309b0bd1c
+size 703389
diff --git a/pictures/detect_onnx.jpg b/pictures/detect_onnx.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..82fd22fbf57610552581c9e3a0bab24835ab4042
--- /dev/null
+++ b/pictures/detect_onnx.jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cc4677b934ce1fa0df22ec03316732655e9e1bef8cf189571060eb6855772e83
+size 104482
diff --git a/pictures/input1.gif b/pictures/input1.gif
new file mode 100644
index 0000000000000000000000000000000000000000..7c1ffdd406cf80d2ff4c32d1f18ca1e87155e1ad
--- /dev/null
+++ b/pictures/input1.gif
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:654f61e6067b922ef5675d2a1414038aae5252d5282a8247aef9300936bdad21
+size 3708060
diff --git a/pictures/input2.gif b/pictures/input2.gif
new file mode 100644
index 0000000000000000000000000000000000000000..d3e39318b8f9bd58fc2746875a500ff296eb8009
--- /dev/null
+++ b/pictures/input2.gif
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a395dc377605d48d0bf61e79b6dde140a251088309d98435c4b7549b5b9fc2db
+size 3272798
diff --git a/pictures/ll.png b/pictures/ll.png
new file mode 100644
index 0000000000000000000000000000000000000000..38b9d7672d85e203b6b96645423a81805ec34af3
--- /dev/null
+++ b/pictures/ll.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2eb2f43fd97be1d8ef29509299b6eb586b0b01b116ac908b9c3debf0bff1238b
+size 711221
diff --git a/pictures/ll_onnx.jpg b/pictures/ll_onnx.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..9d9906bb0283d64a8c7b62730e9b2933f71930a7
Binary files /dev/null and b/pictures/ll_onnx.jpg differ
diff --git a/pictures/output1.gif b/pictures/output1.gif
new file mode 100644
index 0000000000000000000000000000000000000000..3a2efe36ea6f9a7c3111cf825b10934123159faf
--- /dev/null
+++ b/pictures/output1.gif
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:67164a2c548e373bfa45592c78c326432242c1b791fd5fe3e9743a8d9c32120e
+size 3578418
diff --git a/pictures/output2.gif b/pictures/output2.gif
new file mode 100644
index 0000000000000000000000000000000000000000..b9a2775ab73185e808274ec4f9cd46545e81393a
--- /dev/null
+++ b/pictures/output2.gif
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:65714c5a0452b7067b662871473e238b1c0a212079cf90c6234b89c6768e2e33
+size 3318360
diff --git a/pictures/output_onnx.jpg b/pictures/output_onnx.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..ebe09d208c7ccad51788a87791251aea52969a77
--- /dev/null
+++ b/pictures/output_onnx.jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:866ed08ea4a6055f2abf97011b39caaae70a569d4731cc1bbe3040c948532dba
+size 135432
diff --git a/pictures/yolop.png b/pictures/yolop.png
new file mode 100644
index 0000000000000000000000000000000000000000..acf6facb386f64ac00fe0ab3c8d374735b25ad49
--- /dev/null
+++ b/pictures/yolop.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b10de231967cbe15442c4b2fdd7f1037f8f94cac12824442ddd8945545f19548
+size 150600
diff --git a/quick_train.py b/quick_train.py
new file mode 100644
index 0000000000000000000000000000000000000000..a33b0d087c49abc2617202137353bbbae525c30e
--- /dev/null
+++ b/quick_train.py
@@ -0,0 +1,284 @@
+"""
+快速训练脚本 - 用于测试和调试
+只使用数据集的前100个样本进行快速多 epoch 测试
+"""
+import argparse
+import os, sys
+import math
+BASE_DIR = os.path.dirname(os.path.abspath(__file__))
+sys.path.append(BASE_DIR)
+
+import pprint
+import time
+import torch
+import torch.nn.parallel
+from torch.cuda import amp
+import torch.backends.cudnn as cudnn
+import torch.optim
+import torch.utils.data
+import torchvision.transforms as transforms
+import numpy as np
+from tensorboardX import SummaryWriter
+
+import lib.dataset as dataset
+from lib.config import cfg
+from lib.config import update_config
+from lib.core.loss import get_loss
+from lib.core.function import train
+from lib.core.function import validate
+from lib.core.general import fitness
+from lib.models import get_net
+from lib.utils.utils import get_optimizer
+from lib.utils.utils import save_checkpoint
+from lib.utils.utils import create_logger, select_device
+
+
+def parse_args():
+ parser = argparse.ArgumentParser(description='Quick train for testing')
+
+ parser.add_argument('--config', type=str, default='yolov11',
+ help='config to use: default or yolov11')
+ parser.add_argument('--samples', type=int, default=100,
+ help='number of samples to use for quick test')
+ parser.add_argument('--epochs', type=int, default=10,
+ help='number of epochs for quick test')
+ parser.add_argument('--batch-size', type=int, default=4,
+ help='batch size for quick test')
+ parser.add_argument('--yolo-scale', type=str, default='s',
+ choices=['n', 's', 'm', 'l', 'x'],
+ help='YOLOv11 scale (only used if config=yolov11)')
+ parser.add_argument('--freeze-backbone', action='store_true',
+ help='freeze YOLOv11 backbone')
+ parser.add_argument('--workers', type=int, default=0,
+ help='number of data loading workers')
+
+ args = parser.parse_args()
+ return args
+
+
+class SubsetDataset(torch.utils.data.Dataset):
+ """数据集子集包装器"""
+ def __init__(self, dataset, num_samples):
+ self.dataset = dataset
+ self.num_samples = min(num_samples, len(dataset))
+
+ def __len__(self):
+ return self.num_samples
+
+ def __getitem__(self, idx):
+ if idx >= self.num_samples:
+ raise IndexError
+ return self.dataset[idx]
+
+
+def main():
+ args = parse_args()
+
+ # Load config
+ if args.config == 'yolov11':
+ from lib.config.yolov11 import cfg
+ # Override config values
+ cfg.MODEL.YOLOV11_SCALE = args.yolo_scale
+ cfg.MODEL.YOLOV11_WEIGHTS = f'weights/yolo11{args.yolo_scale}.pt'
+ cfg.MODEL.FREEZE_BACKBONE = args.freeze_backbone
+ else:
+ from lib.config.default import _C as cfg
+
+ # Override settings for the quick test
+ cfg.TRAIN.BEGIN_EPOCH = 0
+ cfg.TRAIN.END_EPOCH = args.epochs
+ cfg.TRAIN.BATCH_SIZE_PER_GPU = args.batch_size
+ cfg.WORKERS = args.workers
+ cfg.PRINT_FREQ = 5 # print more frequently
+
+ # Set up logging
+ logger, final_output_dir, tb_log_dir = create_logger(
+ cfg, cfg.LOG_DIR, 'quick_train'
+ )
+
+ logger.info("="*80)
+ logger.info("QUICK TRAIN MODE - Testing Configuration")
+ logger.info("="*80)
+ logger.info(f"Config: {args.config}")
+ logger.info(f"Samples: {args.samples}")
+ logger.info(f"Epochs: {args.epochs}")
+ logger.info(f"Batch size: {args.batch_size}")
+ if args.config == 'yolov11':
+ logger.info(f"YOLOv11 scale: {args.yolo_scale}")
+ logger.info(f"Freeze backbone: {args.freeze_backbone}")
+ logger.info("="*80)
+
+ writer_dict = {
+ 'writer': SummaryWriter(log_dir=tb_log_dir),
+ 'train_global_steps': 0,
+ 'valid_global_steps': 0,
+ }
+
+ # cuDNN settings
+ cudnn.benchmark = cfg.CUDNN.BENCHMARK
+ torch.backends.cudnn.deterministic = cfg.CUDNN.DETERMINISTIC
+ torch.backends.cudnn.enabled = cfg.CUDNN.ENABLED
+
+ # Build the model
+ logger.info("Building model...")
+ device = select_device(logger, batch_size=cfg.TRAIN.BATCH_SIZE_PER_GPU)
+
+ if hasattr(cfg.MODEL, 'USE_YOLOV11') and cfg.MODEL.USE_YOLOV11:
+ model = get_net(
+ cfg,
+ yolo_scale=cfg.MODEL.YOLOV11_SCALE,
+ yolo_weights_path=cfg.MODEL.YOLOV11_WEIGHTS,
+ freeze_backbone=cfg.MODEL.FREEZE_BACKBONE
+ ).to(device)
+ else:
+ model = get_net(cfg).to(device)
+
+ logger.info("Model created successfully")
+
+ print("++++++++++++++++++++++")
+ print(model.model[model.detector_index])
+
+ # Count parameters
+ total_params = sum(p.numel() for p in model.parameters())
+ trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
+ logger.info(f"Total parameters: {total_params:,}")
+ logger.info(f"Trainable parameters: {trainable_params:,}")
+ logger.info(f"Frozen parameters: {total_params - trainable_params:,}")
+
+ # Loss function and optimizer
+ criterion = get_loss(cfg, device=device)
+ optimizer = get_optimizer(cfg, model)
+
+ # Learning-rate scheduler
+ lf = lambda x: ((1 + math.cos(x * math.pi / cfg.TRAIN.END_EPOCH)) / 2) * \
+ (1 - cfg.TRAIN.LRF) + cfg.TRAIN.LRF
+ lr_scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lf)
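+    # Shape of the schedule (illustrative numbers): lf(0) = 1.0, so training starts at
+    # LR0, and lf decays along a half cosine toward LRF, so the final learning rate is
+    # roughly LR0 * LRF (e.g. 0.01 * 0.2 = 0.002 when LR0=0.01 and LRF=0.2).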
+
+ # Build the datasets
+ logger.info("Loading dataset...")
+ normalize = transforms.Normalize(
+ mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]
+ )
+
+ train_dataset = eval('dataset.' + cfg.DATASET.DATASET)(
+ cfg=cfg,
+ is_train=True,
+ inputsize=cfg.MODEL.IMAGE_SIZE,
+ transform=transforms.Compose([
+ transforms.ToTensor(),
+ normalize,
+ ])
+ )
+
+ # Use only a subset of the data
+ train_dataset = SubsetDataset(train_dataset, args.samples)
+ logger.info(f"Using {len(train_dataset)} training samples")
+
+ train_loader = torch.utils.data.DataLoader(
+ train_dataset,
+ batch_size=cfg.TRAIN.BATCH_SIZE_PER_GPU,
+ shuffle=True,
+ num_workers=cfg.WORKERS,
+ pin_memory=cfg.PIN_MEMORY,
+ collate_fn=dataset.AutoDriveDataset.collate_fn
+ )
+
+ # Validation set
+ valid_dataset = eval('dataset.' + cfg.DATASET.DATASET)(
+ cfg=cfg,
+ is_train=False,
+ inputsize=cfg.MODEL.IMAGE_SIZE,
+ transform=transforms.Compose([
+ transforms.ToTensor(),
+ normalize,
+ ])
+ )
+ valid_dataset = SubsetDataset(valid_dataset, args.samples // 2)
+ logger.info(f"Using {len(valid_dataset)} validation samples")
+
+ valid_loader = torch.utils.data.DataLoader(
+ valid_dataset,
+ batch_size=cfg.TEST.BATCH_SIZE_PER_GPU,
+ shuffle=False,
+ num_workers=cfg.WORKERS,
+ pin_memory=cfg.PIN_MEMORY,
+ collate_fn=dataset.AutoDriveDataset.collate_fn
+ )
+
+ # Mixed-precision training
+ scaler = amp.GradScaler(enabled=device.type != 'cpu')
+
+ # Training loop
+ logger.info("Starting training...")
+ logger.info("="*80)
+
+ best_fitness = 0.0
+ num_batch = len(train_loader)
+ num_warmup = max(round(cfg.TRAIN.WARMUP_EPOCHS * num_batch), 1000)
+
+ for epoch in range(cfg.TRAIN.BEGIN_EPOCH, cfg.TRAIN.END_EPOCH):
+ logger.info(f"\n{'='*80}")
+ logger.info(f"Epoch {epoch}/{cfg.TRAIN.END_EPOCH-1}")
+ logger.info(f"{'='*80}")
+
+ # Train for one epoch
+ train(
+ cfg, train_loader, model, criterion, optimizer,
+ scaler, epoch, num_batch, num_warmup,
+ writer_dict, logger, device
+ )
+
+ # Update the learning rate
+ lr_scheduler.step()
+
+ # Validate
+ if (epoch % cfg.TRAIN.VAL_FREQ == 0 or epoch == cfg.TRAIN.END_EPOCH - 1):
+ logger.info("\nValidating...")
+ da_segment_results, ll_segment_results, detect_results, total_loss, maps, times = validate(
+ epoch, cfg, valid_loader, valid_dataset, model, criterion,
+ final_output_dir, tb_log_dir, writer_dict, logger, device
+ )
+
+ # Compute fitness
+ fi = fitness(np.array(detect_results).reshape(1, -1))
+ logger.info(f"Fitness: {fi.item():.4f}")
+
+ # Save the best model
+ if fi > best_fitness:
+ best_fitness = fi
+ # logger.info(f"Fitness: {float(fi):.4f}")
+ # logger.info(f"Fitness: {fi.item():.4f}")
+ logger.info(f"New best fitness: {best_fitness.item():.4f}")
+ save_checkpoint(
+ epoch= epoch + 1,
+ name='111',
+ model=model,
+ optimizer=optimizer,
+ output_dir=final_output_dir,
+ filename='checkpoint_best.pth',
+ is_best=True
+ )
+
+ # Save a checkpoint
+ save_checkpoint(
+ epoch=epoch,
+ name=cfg.MODEL.NAME,
+ model=model,
+ optimizer=optimizer,
+ output_dir=final_output_dir,
+ filename=f'epoch-{epoch}.pth'
+ )
+
+ logger.info("\n" + "="*80)
+ logger.info("Training completed!")
+ logger.info(f"Best fitness: {best_fitness.item():.4f}")
+ logger.info(f"Results saved to: {final_output_dir}")
+ logger.info("="*80)
+
+ writer_dict['writer'].close()
+
+
+if __name__ == '__main__':
+ main()
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..527c1d1b72148a05daf347cd0c6703ede5bb6ea5
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,16 @@
+tqdm
+yacs
+Cython
+matplotlib>=3.2.2
+numpy>=1.18.5
+opencv-python>=4.1.2
+Pillow
+PyYAML>=5.3
+scipy>=1.4.1
+tensorboardX
+seaborn
+prefetch_generator
+imageio
+scikit-learn
+ultralytics
diff --git a/script/check_dataset_format.py b/script/check_dataset_format.py
new file mode 100644
index 0000000000000000000000000000000000000000..c42eaf93033b1fb25548c6d1060859842b778826
--- /dev/null
+++ b/script/check_dataset_format.py
@@ -0,0 +1,178 @@
+"""
+数据集格式验证脚本
+用于验证 train_loader 加载的 input 和 target 格式
+特别是验证 target[0] 是否为 [image_idx, class_id, x_center, y_center, width, height]
+"""
+import os
+import sys
+BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+sys.path.append(BASE_DIR)
+
+import torch
+import torchvision.transforms as transforms
+from lib.config import cfg
+import lib.dataset as dataset
+from lib.utils import DataLoaderX
+
+def check_dataset_format():
+ """验证数据集加载格式"""
+
+ print("="*80)
+ print("开始验证数据集加载格式...")
+ print("="*80)
+
+ # 数据预处理
+ normalize = transforms.Normalize(
+ mean=[0.485, 0.456, 0.406],
+ std=[0.229, 0.224, 0.225]
+ )
+
+ # 创建训练数据集
+ print("\n1. 创建数据集...")
+ train_dataset = eval('dataset.' + cfg.DATASET.DATASET)(
+ cfg=cfg,
+ is_train=True,
+ inputsize=cfg.MODEL.IMAGE_SIZE,
+ transform=transforms.Compose([
+ transforms.ToTensor(),
+ normalize,
+ ])
+ )
+ print(f" 数据集类型: {cfg.DATASET.DATASET}")
+ print(f" 数据集大小: {len(train_dataset)}")
+
+ # Print the dataset class names and class count
+ if hasattr(train_dataset, 'names'):
+ print(f" 数据集类别: {train_dataset.names}")
+ print(f" 类别数量: {len(train_dataset.names)}")
+ else:
+ print(" 数据集没有 names 属性")
+
+ # 创建 DataLoader
+ print("\n2. 创建 DataLoader...")
+ train_loader = DataLoaderX(
+ train_dataset,
+ batch_size=4, # 使用小 batch_size 方便查看
+ shuffle=False,
+ num_workers=0, # Windows 上使用 0
+ pin_memory=False,
+ collate_fn=dataset.AutoDriveDataset.collate_fn
+ )
+ print(f" Batch size: ")
+ print(f" Total batches: {len(train_loader)}")
+
+ # 获取第一个 batch
+ print("\n3. 加载第一个 batch...")
+ for i, (input, target, paths, shapes) in enumerate(train_loader):
+ print("\n" + "="*80)
+ print(f"Batch {i} 数据格式分析:")
+ print("="*80)
+
+ # 分析 input
+ print("\n[INPUT - 图像数据]")
+ print(f" 类型: {type(input)}")
+ print(f" 形状: {input.shape}")
+ print(f" dtype: {input.dtype}")
+ print(f" 值范围: [{input.min():.3f}, {input.max():.3f}]")
+
+ # 分析 target
+ print("\n[TARGET - 标注数据]")
+ print(f" 类型: {type(target)}")
+ print(f" 长度: {len(target)} (包含 3 个元素: det, da_seg, ll_seg)")
+
+ # target[0] - 检测标签 (最重要)
+ print(f"\n target[0] - 检测标签 (Detection Labels):")
+ print(f" 类型: {type(target[0])}")
+ print(f" 形状: {target[0].shape}")
+ print(f" dtype: {target[0].dtype}")
+ print(f" 说明: [N, 6] 其中 N 是所有图片的目标总数,6 维度为:")
+ print(f" [image_idx, class_id, x_center, y_center, width, height]")
+
+ # 打印前几个样本
+ if target[0].shape[0] > 0:
+ print(f"\n 前 5 个目标样本:")
+ print(f" {'索引':<6} {'img_idx':<10} {'class_id':<10} {'x_center':<12} {'y_center':<12} {'width':<12} {'height':<12}")
+ print(f" {'-'*76}")
+ for idx in range(min(5, target[0].shape[0])):
+ obj = target[0][idx]
+ print(f" {idx:<6} {obj[0].item():<10.0f} {obj[1].item():<10.0f} {obj[2].item():<12.6f} {obj[3].item():<12.6f} {obj[4].item():<12.6f} {obj[5].item():<12.6f}")
+
+ # 验证归一化
+ print(f"\n 验证坐标是否归一化到 [0, 1]:")
+ xywh_data = target[0][:, 2:] # 提取 xywh 坐标
+ print(f" x_center 范围: [{xywh_data[:, 0].min():.6f}, {xywh_data[:, 0].max():.6f}]")
+ print(f" y_center 范围: [{xywh_data[:, 1].min():.6f}, {xywh_data[:, 1].max():.6f}]")
+ print(f" width 范围: [{xywh_data[:, 2].min():.6f}, {xywh_data[:, 2].max():.6f}]")
+ print(f" height 范围: [{xywh_data[:, 3].min():.6f}, {xywh_data[:, 3].max():.6f}]")
+
+ # 检查是否归一化
+ is_normalized = (xywh_data >= 0).all() and (xywh_data <= 1).all()
+ if is_normalized:
+ print(f" ✓ 坐标已归一化到 [0, 1]")
+ else:
+ print(f" ✗ 警告: 坐标未完全归一化!")
+
+ # 统计每张图片的目标数量
+ print(f"\n 每张图片的目标数量:")
+ for img_idx in range(input.shape[0]):
+ count = (target[0][:, 0] == img_idx).sum().item()
+ print(f" 图片 {img_idx}: {count} 个目标")
+ else:
+ print(f" (该 batch 没有检测目标)")
+
+ # target[1] - 驾驶区域分割标签
+ print(f"\n target[1] - 驾驶区域分割标签 (Drivable Area Segmentation):")
+ print(f" 类型: {type(target[1])}")
+ print(f" 形状: {target[1].shape}")
+ print(f" dtype: {target[1].dtype}")
+ print(f" 值范围: [{target[1].min():.3f}, {target[1].max():.3f}]")
+ print(f" 说明: [batch_size, num_classes, H, W]")
+
+ # target[2] - 车道线分割标签
+ print(f"\n target[2] - 车道线分割标签 (Lane Line Segmentation):")
+ print(f" 类型: {type(target[2])}")
+ print(f" 形状: {target[2].shape}")
+ print(f" dtype: {target[2].dtype}")
+ print(f" 值范围: [{target[2].min():.3f}, {target[2].max():.3f}]")
+ print(f" 说明: [batch_size, num_classes, H, W]")
+
+ # 分析 paths
+ print(f"\n[PATHS - 图像路径]")
+ print(f" 类型: {type(paths)}")
+ print(f" 长度: {len(paths)}")
+ if len(paths) > 0:
+ print(f" 示例路径:")
+ for idx, path in enumerate(paths):
+ print(f" [{idx}] {path}")
+
+ # 分析 shapes
+ print(f"\n[SHAPES - 图像尺寸信息]")
+ print(f" 类型: {type(shapes)}")
+ print(f" 长度: {len(shapes)}")
+ if len(shapes) > 0:
+ print(f" 示例 (原始尺寸, ((缩放比例), (padding))):")
+ for idx, shape in enumerate(shapes[:2]): # 只显示前2个
+ print(f" [{idx}] {shape}")
+
+ print("\n" + "="*80)
+ print("验证结论:")
+ print("="*80)
+ print("✓ target[0] 格式为: [image_idx, class_id, x_center, y_center, width, height]")
+ print("✓ xywh 坐标已归一化到 [0, 1]")
+ print("✓ image_idx 用于区分 batch 中不同图片的目标")
+ print("✓ class_id 表示目标类别")
+ print("="*80)
+
+ # 只查看第一个 batch
+ break
+
+ print("\n验证完成!")
+
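+
+# Illustrative helper (a sketch, not used by check_dataset_format above): convert one
+# target[0] row with normalized [image_idx, class_id, x_center, y_center, width, height]
+# into pixel-space [x1, y1, x2, y2] for a given image size, e.g.
+# target_row_to_xyxy([0, 2, 0.5, 0.5, 0.25, 0.5], 1280, 720) -> (480.0, 180.0, 800.0, 540.0).
+def target_row_to_xyxy(row, img_w, img_h):
+    _, _, xc, yc, w, h = [float(v) for v in row]
+    return ((xc - w / 2) * img_w, (yc - h / 2) * img_h,
+            (xc + w / 2) * img_w, (yc + h / 2) * img_h)
+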
+
+if __name__ == '__main__':
+ check_dataset_format()
diff --git a/script/check_yolov11_model_architecture.py b/script/check_yolov11_model_architecture.py
new file mode 100644
index 0000000000000000000000000000000000000000..9c626c6ffa07084c3f55e4bc0f69680b733a42f5
--- /dev/null
+++ b/script/check_yolov11_model_architecture.py
@@ -0,0 +1,320 @@
+"""
+检查YOLOv11 backbone的YOLOP模型架构
+验证backbone和neck的结构是否与预期一致
+"""
+import sys
+import os
+BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+sys.path.append(BASE_DIR)
+
+import torch
+import torch.nn as nn
+from lib.models.YOLOP_YOLOv11 import YOLOPWithYOLOv11
+from lib.config import cfg
+from lib.config import update_config
+
+
+def count_parameters(model):
+ """统计模型参数数量"""
+ total_params = sum(p.numel() for p in model.parameters())
+ trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
+ return total_params, trainable_params
+
+
+def check_module_structure(module, name, depth=0):
+ """递归检查模块结构"""
+ indent = " " * depth
+ print(f"{indent}{name}:")
+
+ if isinstance(module, nn.ModuleList):
+ for i, sub_module in enumerate(module):
+ sub_name = f"[{i}] {type(sub_module).__name__}"
+ if depth < 2: # 限制递归深度
+ check_module_structure(sub_module, sub_name, depth + 1)
+ else:
+ print(f"{indent} {sub_name}")
+ elif isinstance(module, nn.Sequential):
+ for i, sub_module in enumerate(module):
+ sub_name = f"[{i}] {type(sub_module).__name__}"
+ print(f"{indent} {sub_name}")
+ else:
+ # 打印模块的子模块
+ children = list(module.named_children())
+ if children and depth < 2:
+ for child_name, child_module in children:
+ print(f"{indent} {child_name}: {type(child_module).__name__}")
+
+
+def test_forward_pass(model, input_size=(1, 3, 640, 640)):
+ """测试前向传播"""
+ print("\n" + "="*80)
+ print("Testing forward pass...")
+ print("="*80)
+
+ model.eval()
+ with torch.no_grad():
+ dummy_input = torch.randn(input_size)
+ try:
+ det_out, da_out, ll_out = model(dummy_input)
+
+ print(f"\n✓ Forward pass successful!")
+ print(f"\nInput shape: {dummy_input.shape}")
+
+ # 检测输出
+ if isinstance(det_out, tuple):
+ print(f"\nDetection output (tuple with {len(det_out)} elements):")
+ for i, out in enumerate(det_out):
+ if isinstance(out, list):
+ print(f" Element {i} (list with {len(out)} items):")
+ for j, item in enumerate(out):
+ print(f" [{j}] shape: {item.shape}")
+ else:
+ print(f" Element {i} shape: {out.shape}")
+ else:
+ print(f"\nDetection output shape: {det_out.shape}")
+
+ # 分割输出
+ print(f"\nDrivable area segmentation output shape: {da_out.shape}")
+ print(f"Lane line segmentation output shape: {ll_out.shape}")
+
+ return True
+ except Exception as e:
+ print(f"\n✗ Forward pass failed with error:")
+ print(f" {type(e).__name__}: {e}")
+ import traceback
+ traceback.print_exc()
+ return False
+
+
+def check_backbone_structure(model):
+ """检查backbone结构"""
+ print("\n" + "="*80)
+ print("Checking Backbone Structure (YOLOv11)")
+ print("="*80)
+
+ backbone = model.backbone
+ print(f"\nBackbone type: {type(backbone).__name__}")
+ print(f"Number of layers: {len(backbone.layers)}")
+ print(f"Output indices (P3, P4, P5): {backbone.out_indices}")
+
+ print("\nBackbone layers:")
+ for i, layer in enumerate(backbone.layers):
+ layer_type = type(layer).__name__
+
+ # 获取输入输出通道数
+ if hasattr(layer, 'conv') and hasattr(layer.conv, 'in_channels'):
+ # Conv 层
+ in_ch = layer.conv.in_channels
+ out_ch = layer.conv.out_channels
+ print(f" [{i:2d}] {layer_type:15s} - {in_ch:4d} -> {out_ch:4d} channels")
+ elif hasattr(layer, 'cv1') and hasattr(layer, 'cv2'):
+ # C3k2, C2PSA 等层
+ in_ch = layer.cv1.conv.in_channels
+ out_ch = layer.cv2.conv.out_channels
+ print(f" [{i:2d}] {layer_type:15s} - {in_ch:4d} -> {out_ch:4d} channels")
+ elif hasattr(layer, 'cv1'):
+ # SPPF 层
+ in_ch = layer.cv1.conv.in_channels
+ out_ch = layer.cv2.conv.out_channels if hasattr(layer, 'cv2') else 'N/A'
+ if isinstance(out_ch, int):
+ print(f" [{i:2d}] {layer_type:15s} - {in_ch:4d} -> {out_ch:4d} channels")
+ else:
+ print(f" [{i:2d}] {layer_type:15s} - {in_ch:4d} -> {out_ch} channels")
+ else:
+ print(f" [{i:2d}] {layer_type}")
+
+ # 测试backbone输出
+ print("\nTesting backbone forward pass...")
+ test_input = torch.randn(1, 3, 384, 640)
+ with torch.no_grad():
+ features = backbone(test_input)
+ print(f"Backbone outputs {len(features)} feature maps:")
+ for i, feat in enumerate(features):
+ print(f" P{i+3} shape: {feat.shape}")
+
+
+def check_adapters(model):
+ """检查通道适配器"""
+ print("\n" + "="*80)
+ print("Checking Channel Adapters")
+ print("="*80)
+
+ for i, adapter in enumerate(model.adapters):
+ in_ch = adapter.conv.in_channels
+ out_ch = adapter.conv.out_channels
+ print(f" Adapter {i}: {in_ch} -> {out_ch} channels")
+
+
+def check_neck_structure(model):
+ """检查neck结构"""
+ print("\n" + "="*80)
+ print("Checking Neck Structure (YOLOP FPN+PAN)")
+ print("="*80)
+
+ print(f"\nNeck has {len(model.neck)} layers:")
+ for i, layer in enumerate(model.neck):
+ layer_type = type(layer).__name__
+
+ if hasattr(layer, 'conv'):
+ if hasattr(layer.conv, 'in_channels'):
+ in_ch = layer.conv.in_channels
+ out_ch = layer.conv.out_channels
+ print(f" [{i:2d}] {layer_type:15s} - {in_ch:4d} -> {out_ch:4d} channels")
+ elif hasattr(layer, 'Upsample'):
+ in_ch = layer.Upsample.in_channels
+ out_ch = layer.Upsample.out_channels
+ print(f" [{i:2d}] {layer_type:15s} - {in_ch:4d} -> {out_ch:4d} channels")
+ else:
+ print(f" [{i:2d}] {layer_type}")
+ else:
+ print(f" [{i:2d}] {layer_type}")
+
+
+def check_heads(model):
+ """检查检测和分割头"""
+ print("\n" + "="*80)
+ print("Checking Detection and Segmentation Heads")
+ print("="*80)
+
+ # 检测头
+ print("\nDetection Head:")
+ print(f" Type: {type(model.detect_head).__name__}")
+ print(f" Number of classes: {model.detect_head.nc}")
+ print(f" Number of detection layers: {model.detect_head.nl}")
+ print(f" Number of anchors per layer: {model.detect_head.na}")
+ print(f" Anchors shape: {model.detect_head.anchors.shape}")
+ print(f" Strides: {model.detect_head.stride}")
+
+ # 可驾驶区域分割头
+ print(f"\nDrivable Area Segmentation Head:")
+ print(f" Number of layers: {len(model.drivable_seg_head)}")
+ for i, layer in enumerate(model.drivable_seg_head):
+ layer_type = type(layer).__name__
+ if hasattr(layer, 'conv'):
+ if hasattr(layer.conv, 'in_channels'):
+ in_ch = layer.conv.in_channels
+ out_ch = layer.conv.out_channels
+ print(f" [{i}] {layer_type:15s} - {in_ch:4d} -> {out_ch:4d} channels")
+ else:
+ print(f" [{i}] {layer_type}")
+ else:
+ print(f" [{i}] {layer_type}")
+
+ # 车道线分割头
+ print(f"\nLane Line Segmentation Head:")
+ print(f" Number of layers: {len(model.lane_seg_head)}")
+ for i, layer in enumerate(model.lane_seg_head):
+ layer_type = type(layer).__name__
+ if hasattr(layer, 'conv'):
+ if hasattr(layer.conv, 'in_channels'):
+ in_ch = layer.conv.in_channels
+ out_ch = layer.conv.out_channels
+ print(f" [{i}] {layer_type:15s} - {in_ch:4d} -> {out_ch:4d} channels")
+ else:
+ print(f" [{i}] {layer_type}")
+ else:
+ print(f" [{i}] {layer_type}")
+
+
+def check_frozen_parameters(model):
+ """检查哪些参数被冻结"""
+ print("\n" + "="*80)
+ print("Checking Frozen Parameters")
+ print("="*80)
+
+ frozen_params = sum(p.numel() for p in model.parameters() if not p.requires_grad)
+ trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
+ total_params = frozen_params + trainable_params
+
+ print(f"\nTotal parameters: {total_params:,}")
+ print(f"Trainable parameters: {trainable_params:,}")
+ print(f"Frozen parameters: {frozen_params:,}")
+ print(f"Frozen percentage: {frozen_params/total_params*100:.2f}%")
+
+ # 检查各部分的冻结状态
+ parts = {
+ 'Backbone': model.backbone,
+ 'Adapters': model.adapters,
+ 'Neck': model.neck,
+ 'Detection Head': model.detect_head,
+ 'Drivable Seg Head': model.drivable_seg_head,
+ 'Lane Seg Head': model.lane_seg_head
+ }
+
+ print("\nParameter status by component:")
+ for name, module in parts.items():
+ total = sum(p.numel() for p in module.parameters())
+ frozen = sum(p.numel() for p in module.parameters() if not p.requires_grad)
+ trainable = total - frozen
+ status = "FROZEN" if frozen == total else ("TRAINABLE" if frozen == 0 else "PARTIAL")
+ print(f" {name:20s}: {trainable:>10,} trainable, {frozen:>10,} frozen [{status}]")
+
+
+def main():
+ print("="*80)
+ print("YOLOP with YOLOv11 Backbone - Architecture Check")
+ print("="*80)
+
+ # 创建模型 - 使用 nano 版本
+ yolo_scale = 'n' # 可以改为 'n', 's', 'm', 'l', 'x'
+ # yolov11_weights = '' # f'weights/yolo11{yolo_scale}.pt'
+ yolov11_weights = f'weights/yolo11{yolo_scale}.pt'
+
+ print(f"\n1. Creating model with YOLOv11{yolo_scale} backbone...")
+ if os.path.exists(yolov11_weights):
+ print(f" Loading pretrained weights from: {yolov11_weights}")
+ model = YOLOPWithYOLOv11(num_seg_class=2, yolo_scale=yolo_scale, yolo_weights_path=yolov11_weights)
+ print("✓ Model created with pretrained backbone")
+ else:
+ print(f" ⚠ Weights not found at: {yolov11_weights}")
+ print(" Creating model from scratch...")
+ model = YOLOPWithYOLOv11(num_seg_class=2, yolo_scale=yolo_scale, yolo_weights_path=None)
+ print("✓ Model created from scratch")
+
+ # 统计参数
+ print("\n2. Counting parameters...")
+ total_params, trainable_params = count_parameters(model)
+ print(f" Total parameters: {total_params:,}")
+ print(f" Trainable parameters: {trainable_params:,}")
+
+ # 检查各部分结构
+ check_backbone_structure(model)
+ check_adapters(model)
+ check_neck_structure(model)
+ check_heads(model)
+
+ # 测试前向传播
+ success = test_forward_pass(model, input_size=(1, 3, 384, 640))
+
+ # 测试冻结backbone功能
+ if os.path.exists(yolov11_weights):
+ print("\n" + "="*80)
+ print(f"Testing Backbone Freezing")
+ print("="*80)
+ print("\nFreezing backbone parameters...")
+ model.freeze_backbone()
+ check_frozen_parameters(model)
+ else:
+ print("\n" + "="*80)
+ print("Checking Parameters (without pretrained weights)")
+ print("="*80)
+ check_frozen_parameters(model)
+
+ # 总结
+ print("\n" + "="*80)
+ print("Summary")
+ print("="*80)
+ if success:
+ print("✓ All checks passed!")
+ print("✓ Model architecture is correct and ready for training")
+ else:
+ print("✗ Some checks failed. Please review the errors above.")
+
+ print("\nModel attributes:")
+ print(f" model.nc = {model.nc}")
+ print(f" model.detector_index = {model.detector_index}")
+ print(f" model.names = {model.names}")
+
+
+if __name__ == '__main__':
+ main()
diff --git a/script/compare_backbones.py b/script/compare_backbones.py
new file mode 100644
index 0000000000000000000000000000000000000000..c13293da126577e0ef5dd91be7d9f9490ba8664c
--- /dev/null
+++ b/script/compare_backbones.py
@@ -0,0 +1,212 @@
+"""
+对比YOLOv11 backbone和原始YOLOP backbone的差异
+"""
+import sys
+import os
+BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+sys.path.append(BASE_DIR)
+
+import torch
+from lib.models.YOLOP_YOLOv11 import YOLOPWithYOLOv11, YOLOv11Backbone
+from lib.models.YOLOP import get_net
+from lib.config import cfg
+from lib.config import update_config
+
+
+def count_parameters(model):
+ """统计参数数量"""
+ total = sum(p.numel() for p in model.parameters())
+ trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
+ return total, trainable
+
+
+def get_model_size(model):
+ """估算模型大小(MB)"""
+ param_size = sum(p.numel() * p.element_size() for p in model.parameters())
+ buffer_size = sum(b.numel() * b.element_size() for b in model.buffers())
+ size_mb = (param_size + buffer_size) / (1024 ** 2)
+ return size_mb
+
+
+def test_inference_speed(model, input_size=(1, 3, 640, 640), num_runs=100, warmup=10):
+ """测试推理速度"""
+ import time
+ model.eval()
+ device = next(model.parameters()).device
+
+ # 准备输入
+ dummy_input = torch.randn(input_size).to(device)
+
+ # 预热
+ with torch.no_grad():
+ for _ in range(warmup):
+ _ = model(dummy_input)
+
+ # 测试
+ torch.cuda.synchronize() if device.type == 'cuda' else None
+ start_time = time.time()
+
+ with torch.no_grad():
+ for _ in range(num_runs):
+ _ = model(dummy_input)
+ torch.cuda.synchronize() if device.type == 'cuda' else None
+
+ end_time = time.time()
+ avg_time = (end_time - start_time) / num_runs * 1000 # ms
+ fps = 1000 / avg_time
+
+ return avg_time, fps
+
+
+def main():
+ print("="*80)
+ print("Backbone Comparison: YOLOv11 vs Original YOLOP")
+ print("="*80)
+
+ # 创建模型
+ print("\n1. Creating models...")
+
+ print(" Creating YOLOv11-based YOLOP...")
+ model_yolov11 = YOLOPWithYOLOv11(num_seg_class=2)
+
+ print(" Creating original YOLOP...")
+ try:
+ model_original = get_net(cfg)
+ except Exception as e:
+ print(f" Failed to create original YOLOP: {e}")
+ model_original = None
+
+ print("✓ Models created")
+
+ # 对比参数数量
+ print("\n" + "="*80)
+ print("Parameter Comparison")
+ print("="*80)
+
+ # YOLOv11 backbone
+ yolov11_backbone_params, _ = count_parameters(model_yolov11.backbone)
+ yolov11_total_params, _ = count_parameters(model_yolov11)
+ yolov11_size = get_model_size(model_yolov11)
+
+ print("\nYOLOv11-based YOLOP:")
+ print(f" Backbone parameters: {yolov11_backbone_params:,}")
+ print(f" Total parameters: {yolov11_total_params:,}")
+ print(f" Model size: {yolov11_size:.2f} MB")
+
+ if model_original:
+ original_total_params, _ = count_parameters(model_original)
+ original_size = get_model_size(model_original)
+
+ print("\nOriginal YOLOP:")
+ print(f" Total parameters: {original_total_params:,}")
+ print(f" Model size: {original_size:.2f} MB")
+
+ print("\nDifference:")
+ param_diff = yolov11_total_params - original_total_params
+ size_diff = yolov11_size - original_size
+ print(f" Parameters: {param_diff:+,} ({param_diff/original_total_params*100:+.2f}%)")
+ print(f" Size: {size_diff:+.2f} MB ({size_diff/original_size*100:+.2f}%)")
+
+ # 对比输出形状
+ print("\n" + "="*80)
+ print("Output Shape Comparison")
+ print("="*80)
+
+ test_input = torch.randn(1, 3, 640, 640)
+
+ print("\nYOLOv11-based YOLOP:")
+ with torch.no_grad():
+ det_out, da_out, ll_out = model_yolov11(test_input)
+ print(f" Detection output: {type(det_out)}")
+ if isinstance(det_out, tuple):
+ for i, out in enumerate(det_out):
+ if isinstance(out, list):
+ print(f" Element {i}: list with {len(out)} items")
+ for j, item in enumerate(out):
+ print(f" [{j}] {item.shape}")
+ else:
+ print(f" Element {i}: {out.shape}")
+ print(f" Drivable area output: {da_out.shape}")
+ print(f" Lane line output: {ll_out.shape}")
+
+ if model_original:
+ print("\nOriginal YOLOP:")
+ with torch.no_grad():
+ det_out_orig, da_out_orig, ll_out_orig = model_original(test_input)
+ print(f" Detection output: {type(det_out_orig)}")
+ if isinstance(det_out_orig, tuple):
+ for i, out in enumerate(det_out_orig):
+ if isinstance(out, list):
+ print(f" Element {i}: list with {len(out)} items")
+ for j, item in enumerate(out):
+ print(f" [{j}] {item.shape}")
+ else:
+ print(f" Element {i}: {out.shape}")
+ print(f" Drivable area output: {da_out_orig.shape}")
+ print(f" Lane line output: {ll_out_orig.shape}")
+
+ # 对比推理速度
+ print("\n" + "="*80)
+ print("Inference Speed Comparison (CPU)")
+ print("="*80)
+
+ print("\nTesting YOLOv11-based YOLOP...")
+ try:
+ avg_time_yolov11, fps_yolov11 = test_inference_speed(model_yolov11, num_runs=50, warmup=5)
+ print(f" Average time: {avg_time_yolov11:.2f} ms")
+ print(f" FPS: {fps_yolov11:.2f}")
+ except Exception as e:
+ print(f" Failed: {e}")
+ avg_time_yolov11, fps_yolov11 = None, None
+
+ if model_original:
+ print("\nTesting original YOLOP...")
+ try:
+ avg_time_original, fps_original = test_inference_speed(model_original, num_runs=50, warmup=5)
+ print(f" Average time: {avg_time_original:.2f} ms")
+ print(f" FPS: {fps_original:.2f}")
+
+ if avg_time_yolov11 and avg_time_original:
+ time_diff = avg_time_yolov11 - avg_time_original
+ print(f"\nSpeed difference: {time_diff:+.2f} ms ({time_diff/avg_time_original*100:+.2f}%)")
+ except Exception as e:
+ print(f" Failed: {e}")
+
+ # Backbone架构对比
+ print("\n" + "="*80)
+ print("Backbone Architecture Comparison")
+ print("="*80)
+
+ print("\nYOLOv11 Backbone:")
+ print(f" Number of layers: {len(model_yolov11.backbone.layers)}")
+ print(f" Output feature indices: {model_yolov11.backbone.out_indices}")
+ print(" Layer types:")
+ for i, layer in enumerate(model_yolov11.backbone.layers):
+ print(f" [{i:2d}] {type(layer).__name__}")
+
+ if model_original and hasattr(model_original, 'model'):
+ print("\nOriginal YOLOP Backbone:")
+ print(f" Number of layers: {len(model_original.model)}")
+ print(" Layer types (first 10):")
+ for i in range(min(10, len(model_original.model))):
+ layer = model_original.model[i]
+ print(f" [{i:2d}] {type(layer).__name__}")
+
+ # 总结
+ print("\n" + "="*80)
+ print("Summary")
+ print("="*80)
+ print("\nKey differences:")
+ print(" ✓ YOLOv11 uses newer modules: C3k2, C2PSA, SPPF")
+ print(" ✓ Original YOLOP uses: Focus, BottleneckCSP, SPP")
+ print(" ✓ YOLOv11 backbone can be initialized with pre-trained weights")
+ print(" ✓ Both models have the same output format (compatible with training code)")
+
+ print("\nRecommendations:")
+ print(" • Use YOLOv11 backbone with pre-trained weights for better initial performance")
+ print(" • Consider freezing backbone for faster training of detection/segmentation heads")
+ print(" • Monitor both training speed and accuracy during experiments")
+
+
+if __name__ == '__main__':
+ main()
diff --git a/test.jpg b/test.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..8981747b7db21888ed464e9780f8887e3c2af5e0
Binary files /dev/null and b/test.jpg differ
diff --git a/test_onnx.py b/test_onnx.py
new file mode 100644
index 0000000000000000000000000000000000000000..430cad11bb79b1e16a4944dcc50546a6180e5877
--- /dev/null
+++ b/test_onnx.py
@@ -0,0 +1,172 @@
+import os
+import cv2
+import torch
+import argparse
+import onnxruntime as ort
+import numpy as np
+from lib.core.general import non_max_suppression
+
+
+def resize_unscale(img, new_shape=(640, 640), color=114):
+ shape = img.shape[:2] # current shape [height, width]
+ if isinstance(new_shape, int):
+ new_shape = (new_shape, new_shape)
+
+ canvas = np.zeros((new_shape[0], new_shape[1], 3))
+ canvas.fill(color)
+ # Scale ratio (new / old) new_shape(h,w)
+ r = min(new_shape[0] / shape[0], new_shape[1] / shape[1])
+
+ # Compute padding
+ new_unpad = int(round(shape[1] * r)), int(round(shape[0] * r)) # w,h
+ new_unpad_w = new_unpad[0]
+ new_unpad_h = new_unpad[1]
+ pad_w, pad_h = new_shape[1] - new_unpad_w, new_shape[0] - new_unpad_h # wh padding
+
+ dw = pad_w // 2 # divide padding into 2 sides
+ dh = pad_h // 2
+
+ if shape[::-1] != new_unpad: # resize
+ img = cv2.resize(img, new_unpad, interpolation=cv2.INTER_AREA)
+
+ canvas[dh:dh + new_unpad_h, dw:dw + new_unpad_w, :] = img
+
+ return canvas, r, dw, dh, new_unpad_w, new_unpad_h # (dw,dh)
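+
+# Worked example (illustrative numbers): a 720x1280 frame resized into a 640x640 canvas
+# gives r = min(640/720, 640/1280) = 0.5, new_unpad = (640, 360), dw = 0 and
+# dh = (640 - 360) // 2 = 140, i.e. 140 px of gray padding above and below the image,
+# which the crops further down (dh:dh+new_unpad_h, dw:dw+new_unpad_w) undo.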
+
+
+def infer_yolop(weight="yolop-640-640.onnx",
+ img_path="./inference/images/7dd9ef45-f197db95.jpg"):
+
+ ort.set_default_logger_severity(4)
+ onnx_path = f"./weights/{weight}"
+ ort_session = ort.InferenceSession(onnx_path)
+ print(f"Load {onnx_path} done!")
+
+ outputs_info = ort_session.get_outputs()
+ inputs_info = ort_session.get_inputs()
+
+ for ii in inputs_info:
+ print("Input: ", ii)
+ for oo in outputs_info:
+ print("Output: ", oo)
+
+ print("num outputs: ", len(outputs_info))
+
+ save_det_path = f"./pictures/detect_onnx.jpg"
+ save_da_path = f"./pictures/da_onnx.jpg"
+ save_ll_path = f"./pictures/ll_onnx.jpg"
+ save_merge_path = f"./pictures/output_onnx.jpg"
+
+ img_bgr = cv2.imread(img_path)
+ height, width, _ = img_bgr.shape
+
+ # convert to RGB
+ img_rgb = img_bgr[:, :, ::-1].copy()
+
+ # resize & normalize
+ canvas, r, dw, dh, new_unpad_w, new_unpad_h = resize_unscale(img_rgb, (640, 640))
+
+ img = canvas.copy().astype(np.float32) # (640,640,3) RGB, float32
+ img /= 255.0
+ img[:, :, 0] -= 0.485
+ img[:, :, 1] -= 0.456
+ img[:, :, 2] -= 0.406
+ img[:, :, 0] /= 0.229
+ img[:, :, 1] /= 0.224
+ img[:, :, 2] /= 0.225
+
+ img = img.transpose(2, 0, 1)
+
+ img = np.expand_dims(img, 0) # (1, 3,640,640)
+
+ # inference: (1,n,6) (1,2,640,640) (1,2,640,640)
+ det_out, da_seg_out, ll_seg_out = ort_session.run(
+ ['det_out', 'drive_area_seg', 'lane_line_seg'],
+ input_feed={"images": img}
+ )
+
+ det_out = torch.from_numpy(det_out).float()
+ boxes = non_max_suppression(det_out)[0] # [n,6] [x1,y1,x2,y2,conf,cls]
+ boxes = boxes.cpu().numpy().astype(np.float32)
+
+ if boxes.shape[0] == 0:
+ print("no bounding boxes detected.")
+ return
+
+ # scale coords to original size.
+ boxes[:, 0] -= dw
+ boxes[:, 1] -= dh
+ boxes[:, 2] -= dw
+ boxes[:, 3] -= dh
+ boxes[:, :4] /= r
+
+ print(f"detect {boxes.shape[0]} bounding boxes.")
+
+ img_det = img_rgb[:, :, ::-1].copy()
+ for i in range(boxes.shape[0]):
+ x1, y1, x2, y2, conf, label = boxes[i]
+ x1, y1, x2, y2, label = int(x1), int(y1), int(x2), int(y2), int(label)
+ img_det = cv2.rectangle(img_det, (x1, y1), (x2, y2), (0, 255, 0), 2, 2)
+
+ cv2.imwrite(save_det_path, img_det)
+
+ # select da & ll segment area.
+ da_seg_out = da_seg_out[:, :, dh:dh + new_unpad_h, dw:dw + new_unpad_w]
+ ll_seg_out = ll_seg_out[:, :, dh:dh + new_unpad_h, dw:dw + new_unpad_w]
+
+ da_seg_mask = np.argmax(da_seg_out, axis=1)[0] # (?,?) (0|1)
+ ll_seg_mask = np.argmax(ll_seg_out, axis=1)[0] # (?,?) (0|1)
+ print(da_seg_mask.shape)
+ print(ll_seg_mask.shape)
+
+ color_area = np.zeros((new_unpad_h, new_unpad_w, 3), dtype=np.uint8)
+ color_area[da_seg_mask == 1] = [0, 255, 0]
+ color_area[ll_seg_mask == 1] = [255, 0, 0]
+ color_seg = color_area
+
+ # convert to BGR
+ color_seg = color_seg[..., ::-1]
+ color_mask = np.mean(color_seg, 2)
+ img_merge = canvas[dh:dh + new_unpad_h, dw:dw + new_unpad_w, :]
+ img_merge = img_merge[:, :, ::-1]
+
+ # merge: resize to original size
+ img_merge[color_mask != 0] = \
+ img_merge[color_mask != 0] * 0.5 + color_seg[color_mask != 0] * 0.5
+ img_merge = img_merge.astype(np.uint8)
+ img_merge = cv2.resize(img_merge, (width, height),
+ interpolation=cv2.INTER_LINEAR)
+ for i in range(boxes.shape[0]):
+ x1, y1, x2, y2, conf, label = boxes[i]
+ x1, y1, x2, y2, label = int(x1), int(y1), int(x2), int(y2), int(label)
+ img_merge = cv2.rectangle(img_merge, (x1, y1), (x2, y2), (0, 255, 0), 2, 2)
+
+ # da: resize to original size
+ da_seg_mask = da_seg_mask * 255
+ da_seg_mask = da_seg_mask.astype(np.uint8)
+ da_seg_mask = cv2.resize(da_seg_mask, (width, height),
+ interpolation=cv2.INTER_LINEAR)
+
+ # ll: resize to original size
+ ll_seg_mask = ll_seg_mask * 255
+ ll_seg_mask = ll_seg_mask.astype(np.uint8)
+ ll_seg_mask = cv2.resize(ll_seg_mask, (width, height),
+ interpolation=cv2.INTER_LINEAR)
+
+ cv2.imwrite(save_merge_path, img_merge)
+ cv2.imwrite(save_da_path, da_seg_mask)
+ cv2.imwrite(save_ll_path, ll_seg_mask)
+
+ print("detect done.")
+
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser()
+ parser.add_argument('--weight', type=str, default="yolop-640-640.onnx")
+ parser.add_argument('--img', type=str, default="./inference/images/9aa94005-ff1d4c9a.jpg")
+ args = parser.parse_args()
+
+ infer_yolop(weight=args.weight, img_path=args.img)
+ """
+ PYTHONPATH=. python3 ./test_onnx.py --weight yolop-640-640.onnx --img test.jpg
+ """
diff --git a/toolkits/datasetpre/gen_bdd_seglabel.py b/toolkits/datasetpre/gen_bdd_seglabel.py
new file mode 100644
index 0000000000000000000000000000000000000000..c28922534eba43993cdb82868910968c584e09f0
--- /dev/null
+++ b/toolkits/datasetpre/gen_bdd_seglabel.py
@@ -0,0 +1,110 @@
+from matplotlib.path import Path
+import matplotlib.pyplot as plt
+import matplotlib.patches as mpatches
+import matplotlib.image as mpimg
+import os
+import json
+import numpy as np
+
+from tqdm import tqdm
+
+
+def poly2patch(poly2d, closed=False, alpha=1., color=None):
+ moves = {'L': Path.LINETO,
+ 'C': Path.CURVE4}
+ points = [p[:2] for p in poly2d]
+ codes = [moves[p[2]] for p in poly2d]
+ codes[0] = Path.MOVETO
+
+ if closed:
+ points.append(points[0])
+ codes.append(Path.CLOSEPOLY)
+
+
+ return mpatches.PathPatch(
+ Path(points, codes),
+ facecolor=color if closed else 'none',
+ edgecolor=color, # if not closed else 'none',
+ lw=1 if closed else 2 * 1, alpha=alpha,
+ antialiased=False, snap=True)
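+
+
+# Minimal usage sketch (made-up coordinates): a closed white triangle built the same
+# way draw_drivable() adds drivable-area patches below.
+#   ax.add_patch(poly2patch(
+#       [[100, 600, 'L'], [640, 200, 'L'], [1180, 600, 'L']],
+#       closed=True, alpha=1.0, color=(1, 1, 1)))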
+
+
+def get_areas_v0(objects):
+ # print(objects['category'])
+ return [o for o in objects
+ if 'poly2d' in o and o['category'].startswith('area')]
+
+
+def draw_drivable(objects, ax):
+ plt.draw()
+
+ objects = get_areas_v0(objects)
+ for obj in objects:
+ if obj['category'] == 'area/drivable':
+ color = (1, 1, 1)
+ # elif obj['category'] == 'area/alternative':
+ # color = (0, 1, 0)
+ else:
+ if obj['category'] != 'area/alternative':
+ print(obj['category'])
+ color = (0, 0, 0)
+ # alpha = 0.5
+ alpha = 1.0
+ poly2d = obj['poly2d']
+ ax.add_patch(poly2patch(
+ poly2d, closed=True,
+ alpha=alpha, color=color))
+
+ ax.axis('off')
+
+def filter_pic(data):
+ for obj in data:
+ if obj['category'].startswith('area'):
+ return True
+ else:
+ pass
+ return False
+
+def main(mode="train"):
+ image_dir = "bdd/bdd100k/images/100k/{}".format(mode)
+ val_dir = "bdd/bdd100k/labels/100k/{}".format(mode)
+ out_dir = 'bdd_seg_gt/{}'.format(mode)
+ if not os.path.exists(out_dir):
+ os.mkdir(out_dir)
+ val_list = os.listdir(val_dir)
+ # val_pd = pd.read_json(open(val_json))
+ # print(val_pd.head())
+
+ for val_json in tqdm(val_list):
+ val_json = os.path.join(val_dir, val_json)
+ val_pd = json.load(open(val_json))
+ data = val_pd['frames'][0]['objects']
+ img_name = val_pd['name']
+
+ remain = filter_pic(data)
+ # if remain:
+ dpi = 80
+ w = 16
+ h = 9
+ image_width = 1280
+ image_height = 720
+ fig = plt.figure(figsize=(w, h), dpi=dpi)
+ ax = fig.add_axes([0.0, 0.0, 1.0, 1.0], frameon=False)
+ out_path = os.path.join(out_dir, img_name+'.png')
+ ax.set_xlim(0, image_width - 1)
+ ax.set_ylim(0, image_height - 1)
+ ax.invert_yaxis()
+ ax.add_patch(poly2patch(
+ [[0, 0, 'L'], [0, image_height - 1, 'L'],
+ [image_width - 1, image_height - 1, 'L'],
+ [image_width - 1, 0, 'L']],
+ closed=True, alpha=1., color=(0, 0, 0)))
+ if remain:
+ draw_drivable(data, ax)
+ fig.savefig(out_path, dpi=dpi)
+ plt.close()
+ else:
+ pass
+
+if __name__ == '__main__':
+ main(mode='train')
diff --git a/toolkits/deploy/CMakeLists.txt b/toolkits/deploy/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..ec9af2a9a036f964411d6df05a1498d94eee8d91
--- /dev/null
+++ b/toolkits/deploy/CMakeLists.txt
@@ -0,0 +1,45 @@
+cmake_minimum_required(VERSION 2.6)
+
+project(yolop)
+
+add_definitions(-std=c++11)
+
+option(CUDA_USE_STATIC_CUDA_RUNTIME OFF)
+set(CMAKE_CXX_STANDARD 11)
+set(CMAKE_BUILD_TYPE Release)
+
+
+find_package(ZED 3 REQUIRED)
+find_package(CUDA ${ZED_CUDA_VERSION} EXACT REQUIRED)
+
+include_directories(${PROJECT_SOURCE_DIR}/include)
+
+# cuda
+include_directories(/usr/local/cuda-10.2/include)
+link_directories(/usr/local/cuda-10.2/lib64)
+# tensorrt
+include_directories(/usr/include/aarch64-linux-gnu/)
+link_directories(/usr/lib/aarch64-linux-gnu/)
+# zed
+include_directories(/usr/local/zed/include)
+link_directories(/usr/local/zed/lib)
+
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -Wall -Ofast -Wfatal-errors -D_MWAITXINTRIN_H_INCLUDED")
+
+set(ZED_LIBS ${ZED_LIBRARIES} ${CUDA_CUDA_LIBRARY} ${CUDA_CUDART_LIBRARY})
+
+cuda_add_library(myplugins SHARED ${PROJECT_SOURCE_DIR}/yololayer.cu)
+target_link_libraries(myplugins nvinfer cudart)
+
+find_package(OpenCV REQUIRED)
+include_directories(${OpenCV_INCLUDE_DIRS})
+
+add_executable(yolop ${PROJECT_SOURCE_DIR}/main.cpp)
+target_link_libraries(yolop nvinfer)
+target_link_libraries(yolop ${ZED_LIBS})
+target_link_libraries(yolop cudart)
+target_link_libraries(yolop myplugins)
+target_link_libraries(yolop ${OpenCV_LIBS})
+
+add_definitions(-O3 -pthread)
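+
+# Typical out-of-source build on the target device (illustrative; the CUDA, TensorRT
+# and ZED paths above must match your installation):
+#   mkdir -p build && cd build
+#   cmake ..
+#   make -j$(nproc)
+# This produces the `yolop` executable and the `myplugins` shared library.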
+
diff --git a/toolkits/deploy/common.hpp b/toolkits/deploy/common.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..78b75e36aeb67aa1685d7336e4f226a8a9ae4314
--- /dev/null
+++ b/toolkits/deploy/common.hpp
@@ -0,0 +1,359 @@
+#ifndef YOLOV5_COMMON_H_
+#define YOLOV5_COMMON_H_
+
+#include
+#include