muqtadar committed
Commit e63695e
1 Parent(s): a03b03b

Upload 6 files

Files changed (6)
  1. .gitattributes +2 -9
  2. README.md +104 -1
  3. config.json +209 -0
  4. model.safetensors +3 -0
  5. preprocessor_config.json +18 -0
  6. pytorch_model.bin +3 -0
.gitattributes CHANGED
@@ -2,34 +2,27 @@
  *.arrow filter=lfs diff=lfs merge=lfs -text
  *.bin filter=lfs diff=lfs merge=lfs -text
  *.bz2 filter=lfs diff=lfs merge=lfs -text
- *.ckpt filter=lfs diff=lfs merge=lfs -text
  *.ftz filter=lfs diff=lfs merge=lfs -text
  *.gz filter=lfs diff=lfs merge=lfs -text
  *.h5 filter=lfs diff=lfs merge=lfs -text
  *.joblib filter=lfs diff=lfs merge=lfs -text
  *.lfs.* filter=lfs diff=lfs merge=lfs -text
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
  *.model filter=lfs diff=lfs merge=lfs -text
  *.msgpack filter=lfs diff=lfs merge=lfs -text
- *.npy filter=lfs diff=lfs merge=lfs -text
- *.npz filter=lfs diff=lfs merge=lfs -text
  *.onnx filter=lfs diff=lfs merge=lfs -text
  *.ot filter=lfs diff=lfs merge=lfs -text
  *.parquet filter=lfs diff=lfs merge=lfs -text
  *.pb filter=lfs diff=lfs merge=lfs -text
- *.pickle filter=lfs diff=lfs merge=lfs -text
- *.pkl filter=lfs diff=lfs merge=lfs -text
  *.pt filter=lfs diff=lfs merge=lfs -text
  *.pth filter=lfs diff=lfs merge=lfs -text
  *.rar filter=lfs diff=lfs merge=lfs -text
- *.safetensors filter=lfs diff=lfs merge=lfs -text
  saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.tar.* filter=lfs diff=lfs merge=lfs -text
- *.tar filter=lfs diff=lfs merge=lfs -text
  *.tflite filter=lfs diff=lfs merge=lfs -text
  *.tgz filter=lfs diff=lfs merge=lfs -text
  *.wasm filter=lfs diff=lfs merge=lfs -text
  *.xz filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
- *.zst filter=lfs diff=lfs merge=lfs -text
+ *.zstandard filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ model.safetensors filter=lfs diff=lfs merge=lfs -text
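These patterns use gitignore-style globbing to decide which files are stored through Git LFS. As a rough sketch of the matching semantics, Python's `fnmatch` approximates them (real gitattributes matching has additional rules, and the sample file names below are hypothetical):

```python
# Rough approximation of .gitattributes LFS pattern matching via fnmatch.
from fnmatch import fnmatch

lfs_patterns = ["*.bin", "*.onnx", "*.zstandard", "model.safetensors"]

for name in ["pytorch_model.bin", "model.safetensors", "README.md"]:
    tracked = any(fnmatch(name, pat) for pat in lfs_patterns)
    print(f"{name}: {'LFS' if tracked else 'regular git'}")
```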
README.md CHANGED
@@ -1,3 +1,106 @@
  ---
- license: mit
+ license: apache-2.0
+ tags:
+ - object-detection
+ - vision
+ datasets:
+ - coco
+ widget:
+ - src: https://huggingface.co/datasets/mishig/sample_images/resolve/main/savanna.jpg
+   example_title: Savanna
+ - src: https://huggingface.co/datasets/mishig/sample_images/resolve/main/football-match.jpg
+   example_title: Football Match
+ - src: https://huggingface.co/datasets/mishig/sample_images/resolve/main/airport.jpg
+   example_title: Airport
  ---
+
+ # YOLOS (tiny-sized) model
+
+ YOLOS model fine-tuned on COCO 2017 object detection (118k annotated images). It was introduced in the paper [You Only Look at One Sequence: Rethinking Transformer in Vision through Object Detection](https://arxiv.org/abs/2106.00666) by Fang et al. and first released in [this repository](https://github.com/hustvl/YOLOS).
+
+ Disclaimer: The team releasing YOLOS did not write a model card for this model, so this model card has been written by the Hugging Face team.
+
+ ## Model description
+
+ YOLOS is a Vision Transformer (ViT) trained using the DETR loss. Despite its simplicity, a base-sized YOLOS model achieves 42 AP on COCO 2017 validation (similar to DETR and to more complex frameworks such as Faster R-CNN).
+
+ The model is trained using a "bipartite matching loss": the predicted classes and bounding boxes of each of the N = 100 object queries are compared to the ground-truth annotations, padded up to the same length N (so if an image contains only 4 objects, the remaining 96 annotations simply get "no object" as class and "no bounding box" as bounding box). The Hungarian matching algorithm is used to create an optimal one-to-one mapping between each of the N queries and each of the N annotations. Next, standard cross-entropy (for the classes) and a linear combination of the L1 and generalized IoU losses (for the bounding boxes) are used to optimize the parameters of the model.
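To make the matching step concrete, here is a minimal sketch of such a bipartite assignment using SciPy's Hungarian solver. The weights mirror this checkpoint's config.json (`class_cost` = 1, `bbox_cost` = 5, `giou_cost` = 2), the generalized-IoU term is omitted for brevity, and `hungarian_match` is an illustrative name rather than the library's API:

```python
# Sketch of DETR/YOLOS-style bipartite matching (GIoU term omitted).
# Boxes are assumed normalized (cx, cy, w, h), as in the model's outputs.
import torch
from scipy.optimize import linear_sum_assignment

def hungarian_match(pred_logits, pred_boxes, gt_labels, gt_boxes):
    # pred_logits: (N, num_classes), pred_boxes: (N, 4) for the N = 100 queries
    # gt_labels: (M,), gt_boxes: (M, 4) for the M annotated objects, M <= N
    prob = pred_logits.softmax(-1)                      # class probabilities
    cost_class = -prob[:, gt_labels]                    # (N, M): high prob -> low cost
    cost_bbox = torch.cdist(pred_boxes, gt_boxes, p=1)  # (N, M): L1 box distance
    cost = 1.0 * cost_class + 5.0 * cost_bbox           # weights from config.json
    rows, cols = linear_sum_assignment(cost.detach().numpy())
    return rows, cols  # optimal one-to-one pairing of queries and annotations
```

Queries left unmatched are trained toward the "no object" class by the cross-entropy term, down-weighted by `eos_coefficient` (0.1 in this config).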
+
+ ## Intended uses & limitations
+
+ You can use the raw model for object detection. See the [model hub](https://huggingface.co/models?search=hustvl/yolos) for all available YOLOS models.
+
+ ### How to use
+
+ Here is how to use this model:
+
+ ```python
+ from transformers import YolosImageProcessor, YolosForObjectDetection
+ from PIL import Image
+ import torch
+ import requests
+
+ url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+ image = Image.open(requests.get(url, stream=True).raw)
+
+ model = YolosForObjectDetection.from_pretrained("hustvl/yolos-tiny")
+ image_processor = YolosImageProcessor.from_pretrained("hustvl/yolos-tiny")
+
+ inputs = image_processor(images=image, return_tensors="pt")
+ outputs = model(**inputs)
+
+ # model predicts bounding boxes and corresponding COCO classes
+ logits = outputs.logits
+ bboxes = outputs.pred_boxes
+
+
+ # print results
+ target_sizes = torch.tensor([image.size[::-1]])
+ results = image_processor.post_process_object_detection(outputs, threshold=0.9, target_sizes=target_sizes)[0]
+ for score, label, box in zip(results["scores"], results["labels"], results["boxes"]):
+     box = [round(i, 2) for i in box.tolist()]
+     print(
+         f"Detected {model.config.id2label[label.item()]} with confidence "
+         f"{round(score.item(), 3)} at location {box}"
+     )
+ ```
+
+ Currently, both the image processor and the model support only PyTorch.
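The post-processed boxes come back as absolute (xmin, ymin, xmax, ymax) pixel coordinates, so they can be drawn directly onto the input image. A small sketch with PIL's `ImageDraw`, reusing `image`, `model`, and `results` from the snippet above (colors and output path are arbitrary choices):

```python
# Sketch: render the post-processed detections with PIL.
from PIL import ImageDraw

draw = ImageDraw.Draw(image)
for score, label, box in zip(results["scores"], results["labels"], results["boxes"]):
    x0, y0, x1, y1 = box.tolist()  # absolute pixel coordinates
    draw.rectangle([x0, y0, x1, y1], outline="red", width=2)
    draw.text((x0, y0), model.config.id2label[label.item()], fill="red")
image.save("detections.png")
```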
+
+ ## Training data
+
+ The YOLOS model was pre-trained on [ImageNet-1k](https://huggingface.co/datasets/imagenet2012) and fine-tuned on [COCO 2017 object detection](https://cocodataset.org/#download), a dataset consisting of 118k/5k annotated images for training/validation respectively.
+
+ ### Training
+
+ The model was pre-trained for 300 epochs on ImageNet-1k and fine-tuned for 300 epochs on COCO.
+
+ ## Evaluation results
+
+ This model achieves an AP (average precision) of **28.7** on COCO 2017 validation. For more details regarding evaluation results, we refer to the original paper.
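For reference, COCO-style AP (averaged over IoU thresholds 0.50:0.95) can be reproduced with an off-the-shelf metric. A minimal sketch assuming the `torchmetrics` package is installed; the single hand-built prediction/target pair is purely illustrative:

```python
# Sketch: COCO-style mAP with torchmetrics on one illustrative image.
import torch
from torchmetrics.detection.mean_ap import MeanAveragePrecision

metric = MeanAveragePrecision(box_format="xyxy")
preds = [{"boxes": torch.tensor([[10.0, 20.0, 100.0, 200.0]]),
          "scores": torch.tensor([0.95]),
          "labels": torch.tensor([17])}]   # 17 = "cat" in this config's id2label
target = [{"boxes": torch.tensor([[12.0, 18.0, 98.0, 205.0]]),
           "labels": torch.tensor([17])}]
metric.update(preds, target)
print(metric.compute()["map"])
```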
+
+ ### BibTeX entry and citation info
+
+ ```bibtex
+ @article{DBLP:journals/corr/abs-2106-00666,
+   author     = {Yuxin Fang and
+                 Bencheng Liao and
+                 Xinggang Wang and
+                 Jiemin Fang and
+                 Jiyang Qi and
+                 Rui Wu and
+                 Jianwei Niu and
+                 Wenyu Liu},
+   title      = {You Only Look at One Sequence: Rethinking Transformer in Vision through
+                 Object Detection},
+   journal    = {CoRR},
+   volume     = {abs/2106.00666},
+   year       = {2021},
+   url        = {https://arxiv.org/abs/2106.00666},
+   eprinttype = {arXiv},
+   eprint     = {2106.00666},
+   timestamp  = {Fri, 29 Apr 2022 19:49:16 +0200},
+   biburl     = {https://dblp.org/rec/journals/corr/abs-2106-00666.bib},
+   bibsource  = {dblp computer science bibliography, https://dblp.org}
+ }
+ ```
config.json ADDED
@@ -0,0 +1,209 @@
+ {
+   "architectures": [
+     "YolosForObjectDetection"
+   ],
+   "attention_probs_dropout_prob": 0.0,
+   "auxiliary_loss": false,
+   "bbox_cost": 5,
+   "bbox_loss_coefficient": 5,
+   "class_cost": 1,
+   "eos_coefficient": 0.1,
+   "giou_cost": 2,
+   "giou_loss_coefficient": 2,
+   "hidden_act": "gelu",
+   "hidden_dropout_prob": 0.0,
+   "hidden_size": 192,
+   "id2label": {
+     "0": "N/A",
+     "1": "person",
+     "2": "bicycle",
+     "3": "car",
+     "4": "motorcycle",
+     "5": "airplane",
+     "6": "bus",
+     "7": "train",
+     "8": "truck",
+     "9": "boat",
+     "10": "traffic light",
+     "11": "fire hydrant",
+     "12": "N/A",
+     "13": "stop sign",
+     "14": "parking meter",
+     "15": "bench",
+     "16": "bird",
+     "17": "cat",
+     "18": "dog",
+     "19": "horse",
+     "20": "sheep",
+     "21": "cow",
+     "22": "elephant",
+     "23": "bear",
+     "24": "zebra",
+     "25": "giraffe",
+     "26": "N/A",
+     "27": "backpack",
+     "28": "umbrella",
+     "29": "N/A",
+     "30": "N/A",
+     "31": "handbag",
+     "32": "tie",
+     "33": "suitcase",
+     "34": "frisbee",
+     "35": "skis",
+     "36": "snowboard",
+     "37": "sports ball",
+     "38": "kite",
+     "39": "baseball bat",
+     "40": "baseball glove",
+     "41": "skateboard",
+     "42": "surfboard",
+     "43": "tennis racket",
+     "44": "bottle",
+     "45": "N/A",
+     "46": "wine glass",
+     "47": "cup",
+     "48": "fork",
+     "49": "knife",
+     "50": "spoon",
+     "51": "bowl",
+     "52": "banana",
+     "53": "apple",
+     "54": "sandwich",
+     "55": "orange",
+     "56": "broccoli",
+     "57": "carrot",
+     "58": "hot dog",
+     "59": "pizza",
+     "60": "donut",
+     "61": "cake",
+     "62": "chair",
+     "63": "couch",
+     "64": "potted plant",
+     "65": "bed",
+     "66": "N/A",
+     "67": "dining table",
+     "68": "N/A",
+     "69": "N/A",
+     "70": "toilet",
+     "71": "N/A",
+     "72": "tv",
+     "73": "laptop",
+     "74": "mouse",
+     "75": "remote",
+     "76": "keyboard",
+     "77": "cell phone",
+     "78": "microwave",
+     "79": "oven",
+     "80": "toaster",
+     "81": "sink",
+     "82": "refrigerator",
+     "83": "N/A",
+     "84": "book",
+     "85": "clock",
+     "86": "vase",
+     "87": "scissors",
+     "88": "teddy bear",
+     "89": "hair drier",
+     "90": "toothbrush"
+   },
+   "image_size": [
+     800,
+     1333
+   ],
+   "initializer_range": 0.02,
+   "intermediate_size": 768,
+   "label2id": {
+     "N/A": 83,
+     "airplane": 5,
+     "apple": 53,
+     "backpack": 27,
+     "banana": 52,
+     "baseball bat": 39,
+     "baseball glove": 40,
+     "bear": 23,
+     "bed": 65,
+     "bench": 15,
+     "bicycle": 2,
+     "bird": 16,
+     "boat": 9,
+     "book": 84,
+     "bottle": 44,
+     "bowl": 51,
+     "broccoli": 56,
+     "bus": 6,
+     "cake": 61,
+     "car": 3,
+     "carrot": 57,
+     "cat": 17,
+     "cell phone": 77,
+     "chair": 62,
+     "clock": 85,
+     "couch": 63,
+     "cow": 21,
+     "cup": 47,
+     "dining table": 67,
+     "dog": 18,
+     "donut": 60,
+     "elephant": 22,
+     "fire hydrant": 11,
+     "fork": 48,
+     "frisbee": 34,
+     "giraffe": 25,
+     "hair drier": 89,
+     "handbag": 31,
+     "horse": 19,
+     "hot dog": 58,
+     "keyboard": 76,
+     "kite": 38,
+     "knife": 49,
+     "laptop": 73,
+     "microwave": 78,
+     "motorcycle": 4,
+     "mouse": 74,
+     "orange": 55,
+     "oven": 79,
+     "parking meter": 14,
+     "person": 1,
+     "pizza": 59,
+     "potted plant": 64,
+     "refrigerator": 82,
+     "remote": 75,
+     "sandwich": 54,
+     "scissors": 87,
+     "sheep": 20,
+     "sink": 81,
+     "skateboard": 41,
+     "skis": 35,
+     "snowboard": 36,
+     "spoon": 50,
+     "sports ball": 37,
+     "stop sign": 13,
+     "suitcase": 33,
+     "surfboard": 42,
+     "teddy bear": 88,
+     "tennis racket": 43,
+     "tie": 32,
+     "toaster": 80,
+     "toilet": 70,
+     "toothbrush": 90,
+     "traffic light": 10,
+     "train": 7,
+     "truck": 8,
+     "tv": 72,
+     "umbrella": 28,
+     "vase": 86,
+     "wine glass": 46,
+     "zebra": 24
+   },
+   "layer_norm_eps": 1e-12,
+   "model_type": "yolos",
+   "num_attention_heads": 3,
+   "num_channels": 3,
+   "num_detection_tokens": 100,
+   "num_hidden_layers": 12,
+   "patch_size": 16,
+   "qkv_bias": true,
+   "torch_dtype": "float32",
+   "transformers_version": "4.19.0.dev0",
+   "use_mid_position_embeddings": false
+ }
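A quick way to sanity-check these values is to load the config through the transformers API. A short sketch against the upstream hustvl/yolos-tiny checkpoint, which this config mirrors:

```python
# Sketch: load and inspect the YOLOS-tiny configuration.
from transformers import YolosConfig

config = YolosConfig.from_pretrained("hustvl/yolos-tiny")
print(config.hidden_size)           # 192 for the tiny variant
print(config.num_detection_tokens)  # 100 object queries
print(config.id2label[17])          # "cat" (the COCO id space includes "N/A" gaps)
```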
model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ba9f8216bc16184052023a12f371c94af912535b3a265f6132b1885f9a0004fe
+ size 133
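What is stored in the repository here is a Git LFS pointer file rather than the binary itself: three `key value` lines giving the spec version, the SHA-256 object id, and the size in bytes of the object the pointer refers to. A small sketch of parsing one (the local path is hypothetical):

```python
# Sketch: parse a Git LFS pointer file into a dict.
def read_lfs_pointer(path):
    with open(path) as f:
        return dict(line.strip().split(" ", 1) for line in f if line.strip())

pointer = read_lfs_pointer("model.safetensors")  # hypothetical local path
print(pointer["oid"])   # "sha256:ba9f8216..."
print(pointer["size"])  # declared size of the referenced object, in bytes
```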
preprocessor_config.json ADDED
@@ -0,0 +1,18 @@
+ {
+   "do_normalize": true,
+   "do_resize": true,
+   "feature_extractor_type": "YolosFeatureExtractor",
+   "format": "coco_detection",
+   "image_mean": [
+     0.485,
+     0.456,
+     0.406
+   ],
+   "image_std": [
+     0.229,
+     0.224,
+     0.225
+   ],
+   "max_size": 1333,
+   "size": 512
+ }
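`image_mean` and `image_std` are the standard ImageNet statistics, applied per channel after resizing (shorter side scaled toward `size` = 512, longer side capped at `max_size` = 1333). A tiny sketch of the normalization step on a stand-in tensor:

```python
# Sketch: the per-channel normalization the preprocessor applies.
import torch

image_mean = torch.tensor([0.485, 0.456, 0.406]).view(3, 1, 1)
image_std = torch.tensor([0.229, 0.224, 0.225]).view(3, 1, 1)

pixels = torch.rand(3, 512, 682)  # stand-in for a resized RGB image in [0, 1]
normalized = (pixels - image_mean) / image_std
print(normalized.mean(dim=(1, 2)))  # roughly zero-centered per channel
```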
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5c1f644db234c987a4abaa672267015f5b277f2228639c260c68505b00758e43
+ size 133