timm / hiera_base_224.mae

Image Classification · timm · PyTorch · Safetensors

rwightman committed
Commit 7b4a1d5
1 Parent(s): b86ee10

Files changed (4)
  1. README.md +146 -0
  2. config.json +33 -0
  3. model.safetensors +3 -0
  4. pytorch_model.bin +3 -0
README.md ADDED
@@ -0,0 +1,146 @@
---
tags:
- image-classification
- timm
library_name: timm
license: cc-by-nc-4.0
datasets:
- imagenet-1k
---
# Model card for hiera_base_224.mae

A Hiera image feature model. Pretrained on ImageNet-1k with the self-supervised Masked Autoencoder (MAE) method by the paper authors.

## Model Details
- **Model Type:** Image classification / feature backbone
- **Model Stats:** (a quick cross-check follows this list)
  - Params (M): 50.8
  - GMACs: 9.4
  - Activations (M): 30.4
  - Image size: 224 x 224
- **Dataset:** ImageNet-1k
- **Papers:**
  - Hiera: A Hierarchical Vision Transformer without the Bells-and-Whistles: https://arxiv.org/abs/2306.00989
  - Masked Autoencoders Are Scalable Vision Learners: https://arxiv.org/abs/2111.06377
- **Original:** https://github.com/facebookresearch/hiera
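
The parameter count above can be reproduced from the instantiated model. A minimal sketch, assuming the `.mae` checkpoint is loaded as a headless backbone (`num_classes=0`, matching the `config.json` in this commit):

```python
import timm

model = timm.create_model('hiera_base_224.mae', pretrained=True, num_classes=0)

# sum over all parameter tensors; expect roughly 50.8 (millions)
print(sum(p.numel() for p in model.parameters()) / 1e6)
```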

## Model Usage
### Image Classification
```python
from urllib.request import urlopen
from PIL import Image
import timm
import torch

img = Image.open(urlopen(
    'https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/beignets-task-guide.png'
))

model = timm.create_model('hiera_base_224.mae', pretrained=True)
model = model.eval()

# get model specific transforms (normalization, resize)
data_config = timm.data.resolve_model_data_config(model)
transforms = timm.data.create_transform(**data_config, is_training=False)

output = model(transforms(img).unsqueeze(0))  # unsqueeze single image into batch of 1

top5_probabilities, top5_class_indices = torch.topk(output.softmax(dim=1) * 100, k=5)
```
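
Note that the `.mae` tag is a self-supervised checkpoint: per the `config.json` in this commit, the pretrained weights carry no classifier (`num_classes: 0`), so the head instantiated above is randomly initialized and the top-5 indices are not meaningful labels; for classification, prefer the `.mae_in1k_ft_in1k` weights listed in the comparison table below. A minimal sketch of inspecting the top-5 output either way:

```python
# print index/probability pairs for the single image in the batch
for prob, idx in zip(top5_probabilities[0], top5_class_indices[0]):
    print(f"class {idx.item()}: {prob.item():.2f}%")
```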

### Feature Map Extraction
```python
from urllib.request import urlopen
from PIL import Image
import timm

img = Image.open(urlopen(
    'https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/beignets-task-guide.png'
))

model = timm.create_model(
    'hiera_base_224.mae',
    pretrained=True,
    features_only=True,
)
model = model.eval()

# get model specific transforms (normalization, resize)
data_config = timm.data.resolve_model_data_config(model)
transforms = timm.data.create_transform(**data_config, is_training=False)

output = model(transforms(img).unsqueeze(0))  # unsqueeze single image into batch of 1

for o in output:
    # print shape of each feature map in output
    # e.g.:
    #  torch.Size([1, 96, 56, 56])
    #  torch.Size([1, 192, 28, 28])
    #  torch.Size([1, 384, 14, 14])
    #  torch.Size([1, 768, 7, 7])
    print(o.shape)
```
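
If only some feature levels are needed, the standard timm `features_only` interface lets you pick stages and query their metadata. A minimal sketch, assuming recent timm where `out_indices` and `feature_info` work for hiera as for other backbones (expected values inferred from the shapes printed above):

```python
import timm

# keep only the two deepest stages
model = timm.create_model(
    'hiera_base_224.mae',
    pretrained=True,
    features_only=True,
    out_indices=(2, 3),
)

print(model.feature_info.channels())   # e.g. [384, 768]
print(model.feature_info.reduction())  # e.g. [16, 32]
```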

### Image Embeddings
```python
from urllib.request import urlopen
from PIL import Image
import timm

img = Image.open(urlopen(
    'https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/beignets-task-guide.png'
))

model = timm.create_model(
    'hiera_base_224.mae',
    pretrained=True,
    num_classes=0,  # remove classifier nn.Linear
)
model = model.eval()

# get model specific transforms (normalization, resize)
data_config = timm.data.resolve_model_data_config(model)
transforms = timm.data.create_transform(**data_config, is_training=False)

output = model(transforms(img).unsqueeze(0))  # output is (batch_size, num_features) shaped tensor

# or equivalently (without needing to set num_classes=0)
output = model.forward_features(transforms(img).unsqueeze(0))
# output is unpooled, a (1, 49, 768) shaped tensor

output = model.forward_head(output, pre_logits=True)
# output is a (1, num_features) shaped tensor
```
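
Embeddings like these are typically compared with cosine similarity, e.g. for retrieval or de-duplication. A minimal sketch reusing `model` and `transforms` from above, with two hypothetical PIL images `img_a` and `img_b`:

```python
import torch.nn.functional as F

# (1, 768) pooled embeddings for a hypothetical pair of images
emb_a = model(transforms(img_a).unsqueeze(0))
emb_b = model(transforms(img_b).unsqueeze(0))

similarity = F.cosine_similarity(emb_a, emb_b)  # shape (1,), values in [-1, 1]
print(similarity.item())
```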

## Model Comparison
### By Top-1

| model                                | top1   | top1_err | top5   | top5_err | param_count |
|--------------------------------------|--------|----------|--------|----------|-------------|
| hiera_huge_224.mae_in1k_ft_in1k      | 86.834 | 13.166   | 98.01  | 1.99     | 672.78      |
| hiera_large_224.mae_in1k_ft_in1k     | 86.042 | 13.958   | 97.648 | 2.352    | 213.74      |
| hiera_base_plus_224.mae_in1k_ft_in1k | 85.134 | 14.866   | 97.158 | 2.842    | 69.9        |
| hiera_base_224.mae_in1k_ft_in1k      | 84.49  | 15.51    | 97.032 | 2.968    | 51.52       |
| hiera_small_224.mae_in1k_ft_in1k     | 83.884 | 16.116   | 96.684 | 3.316    | 35.01       |
| hiera_tiny_224.mae_in1k_ft_in1k      | 82.786 | 17.214   | 96.204 | 3.796    | 27.91       |
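
Any of the fine-tuned variants above can be created the same way as this checkpoint. A minimal sketch; the exact list returned depends on your timm version:

```python
import timm

# enumerate hiera checkpoints with pretrained weights available
print(timm.list_models('hiera*', pretrained=True))

# e.g. swap in the classifier fine-tuned on ImageNet-1k from the table above
model = timm.create_model('hiera_base_224.mae_in1k_ft_in1k', pretrained=True).eval()
```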
130
+ ## Citation
131
+ ```bibtex
132
+ @article{ryali2023hiera,
133
+ title={Hiera: A Hierarchical Vision Transformer without the Bells-and-Whistles},
134
+ author={Ryali, Chaitanya and Hu, Yuan-Ting and Bolya, Daniel and Wei, Chen and Fan, Haoqi and Huang, Po-Yao and Aggarwal, Vaibhav and Chowdhury, Arkabandhu and Poursaeed, Omid and Hoffman, Judy and Malik, Jitendra and Li, Yanghao and Feichtenhofer, Christoph},
135
+ journal={ICML},
136
+ year={2023}
137
+ }
138
+ ```
139
+ ```bibtex
140
+ @Article{MaskedAutoencoders2021,
141
+ author = {Kaiming He and Xinlei Chen and Saining Xie and Yanghao Li and Piotr Doll{'a}r and Ross Girshick},
142
+ journal = {arXiv:2111.06377},
143
+ title = {Masked Autoencoders Are Scalable Vision Learners},
144
+ year = {2021},
145
+ }
146
+ ```
config.json ADDED
@@ -0,0 +1,33 @@
{
    "architecture": "hiera_base_224",
    "num_classes": 0,
    "num_features": 768,
    "pretrained_cfg": {
        "tag": "mae",
        "custom_load": false,
        "input_size": [
            3,
            224,
            224
        ],
        "fixed_input_size": true,
        "interpolation": "bicubic",
        "crop_pct": 0.9,
        "crop_mode": "center",
        "mean": [
            0.485,
            0.456,
            0.406
        ],
        "std": [
            0.229,
            0.224,
            0.225
        ],
        "num_classes": 0,
        "pool_size": null,
        "first_conv": "patch_embed.proj",
        "classifier": "head.fc",
        "license": "cc-by-nc-4.0"
    }
}
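
The `pretrained_cfg` block is what drives the preprocessing resolved in the usage examples above. As a sketch, `timm.data.resolve_model_data_config` is expected to echo these fields for the instantiated model:

```python
import timm

model = timm.create_model('hiera_base_224.mae', pretrained=True)
data_config = timm.data.resolve_model_data_config(model)
print(data_config)
# expected to reflect the config above, e.g. input_size (3, 224, 224),
# bicubic interpolation, crop_pct 0.9, ImageNet mean/std
```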
model.safetensors ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:302ef85170526899e273554580ebdea1e5ef36bf2114d99aa7281d58ffa3f8b7
size 203051448
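
This is a Git LFS pointer file; the `oid` is the SHA-256 of the actual weight blob. A minimal sketch of loading the tensors once the real file is downloaded, using the safetensors library:

```python
from safetensors.torch import load_file

# assumes model.safetensors was fetched (e.g. via git lfs pull or huggingface_hub)
state_dict = load_file('model.safetensors')
print(len(state_dict), 'tensors')
```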
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:434c2e9f52b06b8a7e1a7a63cf1eedfef2391c1ffb0d60e0d9333fb1420af234
size 203129910
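
A downloaded file can be checked against this pointer's `oid` and `size`. A minimal sketch:

```python
import hashlib
import os

# hash in chunks to avoid loading ~200 MB at once
h = hashlib.sha256()
with open('pytorch_model.bin', 'rb') as f:
    for chunk in iter(lambda: f.read(1 << 20), b''):
        h.update(chunk)

print(os.path.getsize('pytorch_model.bin'))  # should be 203129910
print(h.hexdigest())  # should match the oid above
```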