{ "architectures": [ "OneFormerForUniversalSegmentation" ], "backbone_config": { "attention_probs_dropout_prob": 0.0, "depths": [ 2, 2, 18, 2 ], "drop_path_rate": 0.3, "embed_dim": 192, "encoder_stride": 32, "feature_channels": [ 192, 384, 768, 1536 ], "hidden_act": "gelu", "hidden_dropout_prob": 0.0, "image_size": 384, "mlp_ratio": 4.0, "num_channels": 3, "num_heads": [ 6, 12, 24, 48 ], "patch_norm": true, "patch_size": 4, "qkv_bias": true, "strides": [ 4, 8, 16, 32 ], "use_absolute_embeddings": false, "window_size": 12 }, "decoder_config": { "common_stride": 4, "conv_dim": 256, "decoder_layers": 10, "dim_feedforward": 2048, "dropout": 0.1, "encoder_feedforward_dim": 1024, "encoder_layers": 6, "enforce_input_proj": false, "hidden_dim": 256, "mask_dim": 256, "norm": "GN", "num_heads": 8, "pre_norm": false, "query_dec_layers": 2, "use_task_norm": true }, "general_config": { "backbone_type": "swin", "class_weight": 2.0, "contrastive_temperature": 0.07, "contrastive_weight": 0.5, "deep_supervision": true, "dice_weight": 5.0, "ignore_value": 255, "importance_sample_ratio": 0.75, "init_std": 0.02, "init_xavier_std": 1.0, "is_train": false, "layer_norm_eps": 1e-05, "mask_weight": 5.0, "no_object_weight": 0.1, "num_classes": 19, "num_queries": 250, "output_auxiliary_logits": true, "oversample_ratio": 3.0, "train_num_points": 12544, "use_auxiliary_loss": true }, "hidden_size": 256, "id2label": { "0": "road", "1": "sidewalk", "2": "building", "3": "wall", "4": "fence", "5": "pole", "6": "traffic light", "7": "traffic sign", "8": "vegetation", "9": "terrain", "10": "sky", "11": "person", "12": "rider", "13": "car", "14": "truck", "15": "bus", "16": "train", "17": "motorcycle", "18": "bicycle" }, "init_std": 0.02, "init_xavier_std": 1.0, "label2id": { "bicycle": 18, "building": 2, "bus": 15, "car": 13, "fence": 4, "motorcycle": 17, "person": 11, "pole": 5, "rider": 12, "road": 0, "sidewalk": 1, "sky": 10, "terrain": 9, "traffic light": 6, "traffic sign": 7, "train": 16, "truck": 14, "vegetation": 8, "wall": 3 }, "model_type": "oneformer", "num_attention_heads": 8, "num_hidden_layers": 10, "output_attentions": true, "output_hidden_states": true, "text_encoder_config": { "max_seq_len": 77, "task_seq_len": 77, "text_encoder_context_length": 77, "text_encoder_n_ctx": 16, "text_encoder_num_layers": 6, "text_encoder_proj_layers": 2, "text_encoder_vocab_size": 49408, "text_encoder_width": 256 }, "torch_dtype": "float32", "transformers_version": "4.25.0.dev0" }