JB committed
Commit 6812d99
1 Parent(s): 954ff6e

Upload 13 files

README.md CHANGED
@@ -1,3 +1,104 @@
  ---
- license: mit
+ tags:
+ - image-to-text
+ widget:
+ - src: https://huggingface.co/datasets/mishig/sample_images/resolve/main/football-match.jpg
+   example_title: Football Match
+ - src: https://huggingface.co/datasets/mishig/sample_images/resolve/main/dog-cat.jpg
+   example_title: Dog & Cat
  ---
+
+ ## Example
+
+ The model is by no means state-of-the-art, but it nevertheless
+ produces reasonable image-captioning results. It was mainly fine-tuned
+ as a proof-of-concept for the 🤗 FlaxVisionEncoderDecoder Framework.
+
+ The model can be used as follows:
+
+ **In PyTorch**
+ ```python
+ import torch
+ import requests
+ from PIL import Image
+ from transformers import ViTFeatureExtractor, AutoTokenizer, VisionEncoderDecoderModel
+
+
+ loc = "ydshieh/vit-gpt2-coco-en"
+
+ feature_extractor = ViTFeatureExtractor.from_pretrained(loc)
+ tokenizer = AutoTokenizer.from_pretrained(loc)
+ model = VisionEncoderDecoderModel.from_pretrained(loc)
+ model.eval()
+
+
+ def predict(image):
+     pixel_values = feature_extractor(images=image, return_tensors="pt").pixel_values
+
+     with torch.no_grad():
+         output_ids = model.generate(pixel_values, max_length=16, num_beams=4, return_dict_in_generate=True).sequences
+
+     preds = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
+     preds = [pred.strip() for pred in preds]
+
+     return preds
+
+
+ # We will verify our results on an image of cute cats
+ url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+ with Image.open(requests.get(url, stream=True).raw) as image:
+     preds = predict(image)
+
+ print(preds)
+ # should produce
+ # ['a cat laying on top of a couch next to another cat']
+ ```
+
+ **In Flax**
+ ```python
+ import jax
+ import requests
+ from PIL import Image
+ from transformers import ViTFeatureExtractor, AutoTokenizer, FlaxVisionEncoderDecoderModel
+
+
+ loc = "ydshieh/vit-gpt2-coco-en"
+
+ feature_extractor = ViTFeatureExtractor.from_pretrained(loc)
+ tokenizer = AutoTokenizer.from_pretrained(loc)
+ model = FlaxVisionEncoderDecoderModel.from_pretrained(loc)
+
+ gen_kwargs = {"max_length": 16, "num_beams": 4}
+
+
+ # Compilation takes some time on the first call, but subsequent inference will be much faster
+ @jax.jit
+ def generate(pixel_values):
+     output_ids = model.generate(pixel_values, **gen_kwargs).sequences
+     return output_ids
+
+
+ def predict(image):
+     pixel_values = feature_extractor(images=image, return_tensors="np").pixel_values
+     output_ids = generate(pixel_values)
+     preds = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
+     preds = [pred.strip() for pred in preds]
+
+     return preds
+
+
+ # We will verify our results on an image of cute cats
+ url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+ with Image.open(requests.get(url, stream=True).raw) as image:
+     preds = predict(image)
+
+ print(preds)
+ # should produce
+ # ['a cat laying on top of a couch next to another cat']
+ ```
config.json ADDED
@@ -0,0 +1,169 @@
+ {
+   "architectures": [
+     "VisionEncoderDecoderModel"
+   ],
+   "bos_token_id": 50256,
+   "decoder": {
+     "_name_or_path": "",
+     "activation_function": "gelu_new",
+     "add_cross_attention": true,
+     "architectures": [
+       "GPT2LMHeadModel"
+     ],
+     "attn_pdrop": 0.1,
+     "bad_words_ids": null,
+     "bos_token_id": 50256,
+     "chunk_size_feed_forward": 0,
+     "decoder_start_token_id": 50256,
+     "diversity_penalty": 0.0,
+     "do_sample": false,
+     "early_stopping": false,
+     "embd_pdrop": 0.1,
+     "encoder_no_repeat_ngram_size": 0,
+     "eos_token_id": 50256,
+     "finetuning_task": null,
+     "forced_bos_token_id": null,
+     "forced_eos_token_id": null,
+     "id2label": {
+       "0": "LABEL_0",
+       "1": "LABEL_1"
+     },
+     "initializer_range": 0.02,
+     "is_decoder": true,
+     "is_encoder_decoder": false,
+     "label2id": {
+       "LABEL_0": 0,
+       "LABEL_1": 1
+     },
+     "layer_norm_epsilon": 1e-05,
+     "length_penalty": 1.0,
+     "max_length": 20,
+     "min_length": 0,
+     "model_type": "gpt2",
+     "n_ctx": 1024,
+     "n_embd": 768,
+     "n_head": 12,
+     "n_inner": null,
+     "n_layer": 12,
+     "n_positions": 1024,
+     "no_repeat_ngram_size": 0,
+     "num_beam_groups": 1,
+     "num_beams": 1,
+     "num_return_sequences": 1,
+     "output_attentions": false,
+     "output_hidden_states": false,
+     "output_scores": false,
+     "pad_token_id": 50256,
+     "prefix": null,
+     "problem_type": null,
+     "pruned_heads": {},
+     "remove_invalid_values": false,
+     "repetition_penalty": 1.0,
+     "resid_pdrop": 0.1,
+     "return_dict": true,
+     "return_dict_in_generate": false,
+     "scale_attn_weights": true,
+     "sep_token_id": null,
+     "summary_activation": null,
+     "summary_first_dropout": 0.1,
+     "summary_proj_to_labels": true,
+     "summary_type": "cls_index",
+     "summary_use_proj": true,
+     "task_specific_params": {
+       "text-generation": {
+         "do_sample": true,
+         "max_length": 50
+       }
+     },
+     "temperature": 1.0,
+     "tie_encoder_decoder": false,
+     "tie_word_embeddings": true,
+     "tokenizer_class": null,
+     "top_k": 50,
+     "top_p": 1.0,
+     "torch_dtype": null,
+     "torchscript": false,
+     "transformers_version": "4.11.0.dev0",
+     "use_bfloat16": false,
+     "use_cache": true,
+     "vocab_size": 50257
+   },
+   "decoder_start_token_id": 50256,
+   "encoder": {
+     "_name_or_path": "",
+     "add_cross_attention": false,
+     "architectures": [
+       "ViTModel"
+     ],
+     "attention_probs_dropout_prob": 0.0,
+     "bad_words_ids": null,
+     "bos_token_id": null,
+     "chunk_size_feed_forward": 0,
+     "decoder_start_token_id": null,
+     "diversity_penalty": 0.0,
+     "do_sample": false,
+     "early_stopping": false,
+     "encoder_no_repeat_ngram_size": 0,
+     "eos_token_id": null,
+     "finetuning_task": null,
+     "forced_bos_token_id": null,
+     "forced_eos_token_id": null,
+     "hidden_act": "gelu",
+     "hidden_dropout_prob": 0.0,
+     "hidden_size": 768,
+     "id2label": {
+       "0": "LABEL_0",
+       "1": "LABEL_1"
+     },
+     "image_size": 224,
+     "initializer_range": 0.02,
+     "intermediate_size": 3072,
+     "is_decoder": false,
+     "is_encoder_decoder": false,
+     "label2id": {
+       "LABEL_0": 0,
+       "LABEL_1": 1
+     },
+     "layer_norm_eps": 1e-12,
+     "length_penalty": 1.0,
+     "max_length": 20,
+     "min_length": 0,
+     "model_type": "vit",
+     "no_repeat_ngram_size": 0,
+     "num_attention_heads": 12,
+     "num_beam_groups": 1,
+     "num_beams": 1,
+     "num_channels": 3,
+     "num_hidden_layers": 12,
+     "num_return_sequences": 1,
+     "output_attentions": false,
+     "output_hidden_states": false,
+     "output_scores": false,
+     "pad_token_id": null,
+     "patch_size": 16,
+     "prefix": null,
+     "problem_type": null,
+     "pruned_heads": {},
+     "remove_invalid_values": false,
+     "repetition_penalty": 1.0,
+     "return_dict": true,
+     "return_dict_in_generate": false,
+     "sep_token_id": null,
+     "task_specific_params": null,
+     "temperature": 1.0,
+     "tie_encoder_decoder": false,
+     "tie_word_embeddings": true,
+     "tokenizer_class": null,
+     "top_k": 50,
+     "top_p": 1.0,
+     "torch_dtype": null,
+     "torchscript": false,
+     "transformers_version": "4.11.0.dev0",
+     "use_bfloat16": false
+   },
+   "eos_token_id": 50256,
+   "is_encoder_decoder": true,
+   "model_type": "vision-encoder-decoder",
+   "pad_token_id": 50256,
+   "transformers_version": null
+ }
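Taken together, config.json is the standard composite layout for a `VisionEncoderDecoderModel`: a ViT encoder config nested beside a GPT-2 decoder config with cross-attention enabled, and GPT-2's `<|endoftext|>` id (50256) reused as BOS/EOS/PAD and as `decoder_start_token_id` at the top level. A minimal sketch of inspecting, or illustratively rebuilding, such a composite config with the stock 🤗 Transformers API (the from-scratch composition at the end is for illustration only, not how this checkpoint was created):

```python
from transformers import GPT2Config, ViTConfig, VisionEncoderDecoderConfig

# Inspect the composite config shipped with this repository
config = VisionEncoderDecoderConfig.from_pretrained("ydshieh/vit-gpt2-coco-en")
print(config.encoder.model_type)      # "vit"
print(config.decoder.model_type)      # "gpt2"
print(config.decoder_start_token_id)  # 50256, i.e. GPT-2's <|endoftext|>

# Illustration only: the same nested structure can be built from two sub-configs
encoder_config = ViTConfig(image_size=224, patch_size=16)
decoder_config = GPT2Config(add_cross_attention=True)
composite = VisionEncoderDecoderConfig.from_encoder_decoder_configs(encoder_config, decoder_config)
```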
generation_eval.json ADDED
The diff for this file is too large to render. See raw diff
 
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
pipeline.py ADDED
@@ -0,0 +1,62 @@
+ import os
+ from typing import Dict, List, Any
+ from PIL import Image
+ import jax
+ from transformers import ViTFeatureExtractor, AutoTokenizer, FlaxVisionEncoderDecoderModel, VisionEncoderDecoderModel
+ import torch
+
+
+ class PreTrainedPipeline():
+
+     def __init__(self, path=""):
+
+         model_dir = path
+
+         # self.model = FlaxVisionEncoderDecoderModel.from_pretrained(model_dir)
+         self.model = VisionEncoderDecoderModel.from_pretrained(model_dir)
+         self.feature_extractor = ViTFeatureExtractor.from_pretrained(model_dir)
+         self.tokenizer = AutoTokenizer.from_pretrained(model_dir)
+
+         max_length = 16
+         num_beams = 4
+         # self.gen_kwargs = {"max_length": max_length, "num_beams": num_beams}
+         self.gen_kwargs = {"max_length": max_length, "num_beams": num_beams, "return_dict_in_generate": True, "output_scores": True}
+
+         self.model.to("cpu")
+         self.model.eval()
+
+         # @jax.jit
+         def _generate(pixel_values):
+
+             with torch.no_grad():
+
+                 outputs = self.model.generate(pixel_values, **self.gen_kwargs)
+                 output_ids = outputs.sequences
+                 sequences_scores = outputs.sequences_scores
+
+             return output_ids, sequences_scores
+
+         self.generate = _generate
+
+         # warm up the model on the sample image bundled with the repository
+         image_path = os.path.join(path, 'val_000000039769.jpg')
+         image = Image.open(image_path)
+         self(image)
+         image.close()
+
+     def __call__(self, inputs: "Image.Image") -> List[Dict[str, Any]]:
+         """
+         Args:
+             inputs (PIL.Image.Image): the image to caption.
+         Return:
+             A list with a single dict: {"label": <generated caption>, "score": <beam sequence score>}.
+         """
+
+         # pixel_values = self.feature_extractor(images=inputs, return_tensors="np").pixel_values
+         pixel_values = self.feature_extractor(images=inputs, return_tensors="pt").pixel_values
+
+         output_ids, sequences_scores = self.generate(pixel_values)
+         preds = self.tokenizer.batch_decode(output_ids, skip_special_tokens=True)
+         preds = [pred.strip() for pred in preds]
+
+         preds = [{"label": preds[0], "score": float(sequences_scores[0])}]
+
+         return preds
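pipeline.py is the custom handler for the hosted inference widget: it loads the PyTorch weights once, warms up on the bundled `val_000000039769.jpg`, and returns a single dict with "label" and "score" keys per request. A rough sketch of exercising it locally, assuming the repository has been cloned and the script is run from the repository root (the image filename is purely illustrative):

```python
from PIL import Image

from pipeline import PreTrainedPipeline  # the module added in this commit

# Assumes a local clone of this repository, so the weights and the bundled
# warm-up image val_000000039769.jpg are found under this path.
pipe = PreTrainedPipeline(path=".")

with Image.open("cats.jpg") as image:  # any local image; the name is illustrative
    preds = pipe(image)

print(preds)
# -> [{"label": "<generated caption>", "score": <beam sequence score>}]
```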
preprocessor_config.json ADDED
@@ -0,0 +1,16 @@
+ {
+   "do_normalize": true,
+   "do_resize": true,
+   "feature_extractor_type": "ViTFeatureExtractor",
+   "image_mean": [
+     0.5,
+     0.5,
+     0.5
+   ],
+   "image_std": [
+     0.5,
+     0.5,
+     0.5
+   ],
+   "size": 224
+ }
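These settings mean every input image is resized to 224×224 and each channel is normalized with mean 0.5 and std 0.5, which maps pixel values from [0, 1] to [-1, 1]. A quick sketch of the resulting tensor using the stock `ViTFeatureExtractor` API (the dummy image is just for illustration):

```python
from PIL import Image
from transformers import ViTFeatureExtractor

feature_extractor = ViTFeatureExtractor.from_pretrained("ydshieh/vit-gpt2-coco-en")

# Dummy RGB image, purely for illustration; any PIL image works.
image = Image.new("RGB", (640, 480), color=(128, 128, 128))

pixel_values = feature_extractor(images=image, return_tensors="np").pixel_values
print(pixel_values.shape)  # (1, 3, 224, 224): resized to "size": 224
# Each channel is rescaled to [0, 1] and then normalized as (x - 0.5) / 0.5,
# so values end up in [-1, 1].
```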
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e42892c4e6b58884705d4e66e97f2dcc5059eb114278d3b7c088f6ae99615575
+ size 982135145
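The three lines above are a Git LFS pointer: only this stub lives in the git history, while the actual ~982 MB weight file is stored in LFS. A small sketch of checking a downloaded copy against the pointer's oid and size (the local path is an assumption):

```python
import hashlib
from pathlib import Path

weights = Path("pytorch_model.bin")  # path to a locally downloaded copy

sha256 = hashlib.sha256()
with weights.open("rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):
        sha256.update(chunk)

print(weights.stat().st_size)  # should be 982135145
print(sha256.hexdigest())      # should match the oid sha256 in the pointer above
```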
report.txt ADDED
The diff for this file is too large to render. See raw diff
 
requirements.txt ADDED
@@ -0,0 +1,4 @@
+ Pillow
+ jax[cpu]
+ flax
+ git+https://github.com/ydshieh/transformers.git@flax_vision_encoder_decoder
special_tokens_map.json ADDED
@@ -0,0 +1 @@
+ {"bos_token": "<|endoftext|>", "eos_token": "<|endoftext|>", "unk_token": "<|endoftext|>", "pad_token": "<|endoftext|>"}
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1 @@
+ {"unk_token": "<|endoftext|>", "bos_token": "<|endoftext|>", "eos_token": "<|endoftext|>", "add_prefix_space": false, "model_max_length": 1024, "special_tokens_map_file": null, "name_or_path": "gpt2", "tokenizer_class": "GPT2Tokenizer"}
vocab.json ADDED
The diff for this file is too large to render. See raw diff