Jaime García Villena committed on
Commit 9b0616b
1 Parent(s): a2798f2

Copy from apple/deeplabv3-mobilevit-small

.gitattributes CHANGED
@@ -2,34 +2,26 @@
  *.arrow filter=lfs diff=lfs merge=lfs -text
  *.bin filter=lfs diff=lfs merge=lfs -text
  *.bz2 filter=lfs diff=lfs merge=lfs -text
- *.ckpt filter=lfs diff=lfs merge=lfs -text
  *.ftz filter=lfs diff=lfs merge=lfs -text
  *.gz filter=lfs diff=lfs merge=lfs -text
  *.h5 filter=lfs diff=lfs merge=lfs -text
  *.joblib filter=lfs diff=lfs merge=lfs -text
  *.lfs.* filter=lfs diff=lfs merge=lfs -text
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
  *.model filter=lfs diff=lfs merge=lfs -text
  *.msgpack filter=lfs diff=lfs merge=lfs -text
- *.npy filter=lfs diff=lfs merge=lfs -text
- *.npz filter=lfs diff=lfs merge=lfs -text
  *.onnx filter=lfs diff=lfs merge=lfs -text
  *.ot filter=lfs diff=lfs merge=lfs -text
  *.parquet filter=lfs diff=lfs merge=lfs -text
  *.pb filter=lfs diff=lfs merge=lfs -text
- *.pickle filter=lfs diff=lfs merge=lfs -text
- *.pkl filter=lfs diff=lfs merge=lfs -text
  *.pt filter=lfs diff=lfs merge=lfs -text
  *.pth filter=lfs diff=lfs merge=lfs -text
  *.rar filter=lfs diff=lfs merge=lfs -text
- *.safetensors filter=lfs diff=lfs merge=lfs -text
  saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.tar.* filter=lfs diff=lfs merge=lfs -text
- *.tar filter=lfs diff=lfs merge=lfs -text
  *.tflite filter=lfs diff=lfs merge=lfs -text
  *.tgz filter=lfs diff=lfs merge=lfs -text
  *.wasm filter=lfs diff=lfs merge=lfs -text
  *.xz filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
- *.zst filter=lfs diff=lfs merge=lfs -text
+ *.zstandard filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,2 @@
+ .DS_Store
+ */.DS_Store
MobileViT_DeepLabV3.mlpackage/Data/com.apple.CoreML/model.mlmodel ADDED
The diff for this file is too large to render. See raw diff
 
MobileViT_DeepLabV3.mlpackage/Data/com.apple.CoreML/weights/weight.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e50a89dd6be1e3ba7e4df23be4f2d79a081d443c1e498536377d30b8e5fb3a29
+ size 25418432
MobileViT_DeepLabV3.mlpackage/Manifest.json ADDED
@@ -0,0 +1,18 @@
+ {
+   "fileFormatVersion": "1.0.0",
+   "itemInfoEntries": {
+     "4D7D9A73-AEEC-412D-A20C-7AA2C0F806EF": {
+       "author": "com.apple.CoreML",
+       "description": "CoreML Model Specification",
+       "name": "model.mlmodel",
+       "path": "com.apple.CoreML/model.mlmodel"
+     },
+     "FBABE180-594F-4894-9881-F3B3D807D27D": {
+       "author": "com.apple.CoreML",
+       "description": "CoreML Model Weights",
+       "name": "weights",
+       "path": "com.apple.CoreML/weights"
+     }
+   },
+   "rootModelIdentifier": "4D7D9A73-AEEC-412D-A20C-7AA2C0F806EF"
+ }
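
The manifest above is the standard Core ML package index: it maps item identifiers to the model specification (model.mlmodel) and its weight blob. For orientation, here is a minimal sketch of opening the copied .mlpackage with coremltools; the input name "image" and the example file path are assumptions, so read the real interface from the spec as shown rather than relying on this snippet.

```python
import coremltools as ct
from PIL import Image

# Open the copied Core ML package; the manifest tells coremltools where the
# spec and weights live inside the .mlpackage directory.
mlmodel = ct.models.MLModel("MobileViT_DeepLabV3.mlpackage")

# Read the declared inputs/outputs instead of guessing their names.
spec = mlmodel.get_spec()
print([inp.name for inp in spec.description.input])
print([out.name for out in spec.description.output])

# Hypothetical prediction call: assumes a single 512x512 image input named
# "image" and that this runs on macOS, where Core ML prediction is available.
img = Image.open("example.jpg").resize((512, 512))
prediction = mlmodel.predict({"image": img})
```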
README.md ADDED
@@ -0,0 +1,86 @@
+ ---
+ license: other
+ tags:
+ - vision
+ - image-segmentation
+ datasets:
+ - pascal-voc
+ widget:
+ - src: https://huggingface.co/datasets/mishig/sample_images/resolve/main/cat-2.jpg
+   example_title: Cat
+ ---
+
+ # MobileViT + DeepLabV3 (small-sized model)
+
+ MobileViT model pre-trained on PASCAL VOC at resolution 512x512. It was introduced in [MobileViT: Light-weight, General-purpose, and Mobile-friendly Vision Transformer](https://arxiv.org/abs/2110.02178) by Sachin Mehta and Mohammad Rastegari, and first released in [this repository](https://github.com/apple/ml-cvnets). The license used is the [Apple sample code license](https://github.com/apple/ml-cvnets/blob/main/LICENSE).
+
+ Disclaimer: The team releasing MobileViT did not write a model card for this model, so this model card has been written by the Hugging Face team.
+
+ ## Model description
+
+ MobileViT is a light-weight, low-latency convolutional neural network that combines MobileNetV2-style layers with a new block that replaces local processing in convolutions with global processing using transformers. As with ViT (Vision Transformer), the image data is converted into flattened patches before it is processed by the transformer layers. Afterwards, the patches are "unflattened" back into feature maps. This allows the MobileViT block to be placed anywhere inside a CNN. MobileViT does not require any positional embeddings.
+
+ The model in this repo adds a [DeepLabV3](https://arxiv.org/abs/1706.05587) head to the MobileViT backbone for semantic segmentation.
+
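To illustrate the flatten/unflatten step described above, here is a toy, shape-only sketch using patch size 2 (matching patch_size in config.json); it is not the actual MobileViT block, just a demonstration that folding the patches back exactly recovers the feature map:

```python
import torch

# Toy illustration of the MobileViT fold/unfold idea (shapes only, no learned weights).
feature_map = torch.randn(1, 96, 32, 32)                 # (batch, channels, H, W)

# "Flatten" 2x2 patches into tokens that a transformer could attend over.
patches = torch.nn.functional.unfold(feature_map, kernel_size=2, stride=2)
print(patches.shape)                                      # torch.Size([1, 384, 256])

# "Unflatten" the tokens back into a feature map of the original size.
restored = torch.nn.functional.fold(
    patches, output_size=(32, 32), kernel_size=2, stride=2
)
print(torch.allclose(feature_map, restored))              # True
```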
+ ## Intended uses & limitations
+
+ You can use the raw model for semantic segmentation. See the [model hub](https://huggingface.co/models?search=mobilevit) to look for fine-tuned versions on a task that interests you.
+
+ ### How to use
+
+ Here is how to use this model:
+
+ ```python
+ from transformers import MobileViTFeatureExtractor, MobileViTForSemanticSegmentation
+ from PIL import Image
+ import requests
+
+ url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+ image = Image.open(requests.get(url, stream=True).raw)
+
+ feature_extractor = MobileViTFeatureExtractor.from_pretrained("apple/deeplabv3-mobilevit-small")
+ model = MobileViTForSemanticSegmentation.from_pretrained("apple/deeplabv3-mobilevit-small")
+
+ inputs = feature_extractor(images=image, return_tensors="pt")
+
+ outputs = model(**inputs)
+ logits = outputs.logits
+ predicted_mask = logits.argmax(1).squeeze(0)
+ ```
+
+ Currently, both the feature extractor and model support PyTorch.
+
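The predicted_mask produced by the snippet above is at the model's (downsampled) output resolution. A possible post-processing step, not part of the original card, is to upsample the logits back to the preprocessed input size before taking the argmax; the sketch below assumes the variable names from the snippet above.

```python
import torch

# Upsample the logits to the preprocessed input resolution (512x512 after the
# center crop), then take the per-pixel argmax so the mask aligns with the
# image that was fed to the model. Variable names follow the snippet above.
upsampled = torch.nn.functional.interpolate(
    logits,
    size=inputs["pixel_values"].shape[-2:],
    mode="bilinear",
    align_corners=False,
)
mask = upsampled.argmax(dim=1).squeeze(0)  # (512, 512) tensor of class indices

# Map the class index at the center pixel to its PASCAL VOC label.
print(model.config.id2label[int(mask[256, 256])])
```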
+ ## Training data
+
+ The MobileViT + DeepLabV3 model was pretrained on [ImageNet-1k](https://huggingface.co/datasets/imagenet-1k), a dataset consisting of 1 million images and 1,000 classes, and then fine-tuned on the [PASCAL VOC2012](http://host.robots.ox.ac.uk/pascal/VOC/) dataset.
+
+ ## Training procedure
+
+ ### Preprocessing
+
+ At inference time, images are center-cropped at 512x512. Pixels are normalized to the range [0, 1]. Images are expected to be in BGR pixel order, not RGB.
+
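When bypassing MobileViTFeatureExtractor (for example when feeding the Core ML package directly), the same preprocessing has to be reproduced by hand. Here is a hedged sketch of an equivalent pipeline, assuming the resize-to-544 / crop-to-512 values from the preprocessor_config.json added later in this commit:

```python
import numpy as np
from PIL import Image

def preprocess(image: Image.Image) -> np.ndarray:
    """Approximate MobileViT preprocessing: resize the shorter side to 544,
    center-crop to 512x512, scale pixels to [0, 1], and flip RGB to BGR.
    Illustrative only; MobileViTFeatureExtractor remains the reference."""
    # Resize so the shorter side is 544 pixels (resample=2 is bilinear).
    w, h = image.size
    scale = 544 / min(w, h)
    image = image.resize((round(w * scale), round(h * scale)), Image.BILINEAR)

    # Center-crop to 512x512.
    w, h = image.size
    left, top = (w - 512) // 2, (h - 512) // 2
    image = image.crop((left, top, left + 512, top + 512))

    # Scale to [0, 1] and flip the channel order to BGR.
    pixels = np.asarray(image.convert("RGB"), dtype=np.float32) / 255.0
    return pixels[..., ::-1]
```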
+ ### Pretraining
+
+ The MobileViT networks are trained from scratch for 300 epochs on ImageNet-1k on 8 NVIDIA GPUs, with an effective batch size of 1024 and learning-rate warmup for 3k steps followed by cosine annealing. Training also uses label-smoothing cross-entropy loss and L2 weight decay. The training resolution varies from 160x160 to 320x320, using multi-scale sampling.
+
+ To obtain the DeepLabV3 model, MobileViT was fine-tuned on the PASCAL VOC dataset using 4 NVIDIA A100 GPUs.
+
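To make the schedule described above concrete, here is a small illustrative sketch of a warmup-plus-cosine learning-rate curve; the peak and minimum learning rates are placeholders, since the card only specifies the warmup length (3k steps) and the annealing shape:

```python
import math

def lr_at_step(step: int, total_steps: int, warmup_steps: int = 3_000,
               peak_lr: float = 2e-3, min_lr: float = 2e-4) -> float:
    """Warmup followed by cosine annealing, matching the shape described in
    the card; peak_lr and min_lr are illustrative placeholders."""
    if step < warmup_steps:
        # Linear warmup from 0 up to the peak learning rate.
        return peak_lr * step / warmup_steps
    # Cosine decay from peak_lr down to min_lr over the remaining steps.
    progress = (step - warmup_steps) / max(1, total_steps - warmup_steps)
    return min_lr + 0.5 * (peak_lr - min_lr) * (1 + math.cos(math.pi * progress))
```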
+ ## Evaluation results
+
+ | Model | PASCAL VOC mIOU | # params | URL |
+ |------------------|-----------------|-----------|-----------------------------------------------------------|
+ | MobileViT-XXS | 73.6 | 1.9 M | https://huggingface.co/apple/deeplabv3-mobilevit-xx-small |
+ | MobileViT-XS | 77.1 | 2.9 M | https://huggingface.co/apple/deeplabv3-mobilevit-x-small |
+ | **MobileViT-S** | **79.1** | **6.4 M** | https://huggingface.co/apple/deeplabv3-mobilevit-small |
+
+ ### BibTeX entry and citation info
+
+ ```bibtex
+ @inproceedings{vision-transformer,
+   title = {MobileViT: Light-weight, General-purpose, and Mobile-friendly Vision Transformer},
+   author = {Sachin Mehta and Mohammad Rastegari},
+   year = {2022},
+   URL = {https://arxiv.org/abs/2110.02178}
+ }
+ ```
config.json ADDED
@@ -0,0 +1,91 @@
+ {
+   "architectures": [
+     "MobileViTForSemanticSegmentation"
+   ],
+   "aspp_dropout_prob": 0.1,
+   "aspp_out_channels": 256,
+   "atrous_rates": [
+     6,
+     12,
+     18
+   ],
+   "attention_probs_dropout_prob": 0.0,
+   "classifier_dropout_prob": 0.1,
+   "conv_kernel_size": 3,
+   "expand_ratio": 4.0,
+   "hidden_act": "silu",
+   "hidden_dropout_prob": 0.1,
+   "hidden_sizes": [
+     144,
+     192,
+     240
+   ],
+   "id2label": {
+     "0": "background",
+     "1": "aeroplane",
+     "2": "bicycle",
+     "3": "bird",
+     "4": "boat",
+     "5": "bottle",
+     "6": "bus",
+     "7": "car",
+     "8": "cat",
+     "9": "chair",
+     "10": "cow",
+     "11": "diningtable",
+     "12": "dog",
+     "13": "horse",
+     "14": "motorbike",
+     "15": "person",
+     "16": "pottedplant",
+     "17": "sheep",
+     "18": "sofa",
+     "19": "train",
+     "20": "tvmonitor"
+   },
+   "image_size": 512,
+   "initializer_range": 0.02,
+   "label2id": {
+     "aeroplane": 1,
+     "background": 0,
+     "bicycle": 2,
+     "bird": 3,
+     "boat": 4,
+     "bottle": 5,
+     "bus": 6,
+     "car": 7,
+     "cat": 8,
+     "chair": 9,
+     "cow": 10,
+     "diningtable": 11,
+     "dog": 12,
+     "horse": 13,
+     "motorbike": 14,
+     "person": 15,
+     "pottedplant": 16,
+     "sheep": 17,
+     "sofa": 18,
+     "train": 19,
+     "tvmonitor": 20
+   },
+   "layer_norm_eps": 1e-05,
+   "mlp_ratio": 2.0,
+   "model_type": "mobilevit",
+   "neck_hidden_sizes": [
+     16,
+     32,
+     64,
+     96,
+     128,
+     160,
+     640
+   ],
+   "num_attention_heads": 4,
+   "num_channels": 3,
+   "output_stride": 16,
+   "patch_size": 2,
+   "qkv_bias": true,
+   "semantic_loss_ignore_index": 255,
+   "torch_dtype": "float32",
+   "transformers_version": "4.20.0.dev0"
+ }
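
The config above defines the 21 PASCAL VOC classes and the hyperparameters from which the transformers model is rebuilt. A short sketch of reading it locally without instantiating the weights, assuming the repository has been cloned into the current directory:

```python
from transformers import MobileViTConfig

# Load config.json from a local checkout of this repository (assumed to be
# the current directory).
config = MobileViTConfig.from_pretrained(".")

print(config.image_size)         # 512
print(config.neck_hidden_sizes)  # [16, 32, 64, 96, 128, 160, 640]
print(config.id2label[15])       # "person"
```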
preprocessor_config.json ADDED
@@ -0,0 +1,9 @@
+ {
+   "crop_size": 512,
+   "do_center_crop": true,
+   "do_flip_channels": true,
+   "do_resize": true,
+   "feature_extractor_type": "MobileViTFeatureExtractor",
+   "resample": 2,
+   "size": 544
+ }
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5e68a534df237d8b89aa9209c815976b4b34f49a4e8107f630fd799697e98291
+ size 25615631
tf_model.h5 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5e14ab532bd4b573c60e4f4c6639de6176db4c35c803cc7c0ba05fdb16e5b3de
+ size 25943848