ydshieh committed on
Commit c1c837b
1 Parent(s): 9d3e38f
README.md ADDED
@@ -0,0 +1,46 @@
+ ---
+ tags:
+ - image-classification
+ library_name: generic
+ ---
+
+ ## Example
+
+ The model is by no means a state-of-the-art model, but it nevertheless
+ produces reasonable image captioning results. It was fine-tuned mainly
+ as a proof of concept for the 🤗 FlaxVisionEncoderDecoder framework.
+
+ The model can be used as follows:
+
+ ```python
+
+ import requests
+ from PIL import Image
+ from transformers import ViTFeatureExtractor, AutoTokenizer, FlaxVisionEncoderDecoderModel
+
+ loc = "ydshieh/vit-gpt2-coco-en"
+
+ feature_extractor = ViTFeatureExtractor.from_pretrained(loc)
+ tokenizer = AutoTokenizer.from_pretrained(loc)
+ model = FlaxVisionEncoderDecoderModel.from_pretrained(loc)
+
+ # We will verify our results on an image of cute cats
+ url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+ with Image.open(requests.get(url, stream=True).raw) as img:
+     pixel_values = feature_extractor(images=img, return_tensors="np").pixel_values
+
+ def generate_step(pixel_values):
+     # beam-search decode, then strip special tokens and surrounding whitespace
+     output_ids = model.generate(pixel_values, max_length=16, num_beams=4).sequences
+     preds = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
+     preds = [pred.strip() for pred in preds]
+
+     return preds
+
+ preds = generate_step(pixel_values)
+ print(preds)
+
+ # should produce
+ # ['a cat laying on top of a couch next to another cat']
+
+ ```
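For repeated inference, the decoding step above can be wrapped in `jax.jit` so the forward pass is compiled once and reused on later calls, which is also how `pipeline.py` in this repository handles it. A minimal sketch building on the variables defined in the README example (the function name is just illustrative; the first call pays the compilation cost):

```python
import jax

@jax.jit
def jitted_generate(pixel_values):
    # max_length and num_beams are plain Python ints, so they are treated as
    # static values at trace time; only the image tensor is traced.
    return model.generate(pixel_values, max_length=16, num_beams=4).sequences

output_ids = jitted_generate(pixel_values)
print(tokenizer.batch_decode(output_ids, skip_special_tokens=True))
```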
config.json ADDED
@@ -0,0 +1,169 @@
+ {
+   "architectures": [
+     "VisionEncoderDecoderModel"
+   ],
+   "bos_token_id": 50256,
+   "decoder": {
+     "_name_or_path": "",
+     "activation_function": "gelu_new",
+     "add_cross_attention": true,
+     "architectures": [
+       "GPT2LMHeadModel"
+     ],
+     "attn_pdrop": 0.1,
+     "bad_words_ids": null,
+     "bos_token_id": 50256,
+     "chunk_size_feed_forward": 0,
+     "decoder_start_token_id": 50256,
+     "diversity_penalty": 0.0,
+     "do_sample": false,
+     "early_stopping": false,
+     "embd_pdrop": 0.1,
+     "encoder_no_repeat_ngram_size": 0,
+     "eos_token_id": 50256,
+     "finetuning_task": null,
+     "forced_bos_token_id": null,
+     "forced_eos_token_id": null,
+     "id2label": {
+       "0": "LABEL_0",
+       "1": "LABEL_1"
+     },
+     "initializer_range": 0.02,
+     "is_decoder": true,
+     "is_encoder_decoder": false,
+     "label2id": {
+       "LABEL_0": 0,
+       "LABEL_1": 1
+     },
+     "layer_norm_epsilon": 1e-05,
+     "length_penalty": 1.0,
+     "max_length": 20,
+     "min_length": 0,
+     "model_type": "gpt2",
+     "n_ctx": 1024,
+     "n_embd": 768,
+     "n_head": 12,
+     "n_inner": null,
+     "n_layer": 12,
+     "n_positions": 1024,
+     "no_repeat_ngram_size": 0,
+     "num_beam_groups": 1,
+     "num_beams": 1,
+     "num_return_sequences": 1,
+     "output_attentions": false,
+     "output_hidden_states": false,
+     "output_scores": false,
+     "pad_token_id": 50256,
+     "prefix": null,
+     "problem_type": null,
+     "pruned_heads": {},
+     "remove_invalid_values": false,
+     "repetition_penalty": 1.0,
+     "resid_pdrop": 0.1,
+     "return_dict": true,
+     "return_dict_in_generate": false,
+     "scale_attn_weights": true,
+     "sep_token_id": null,
+     "summary_activation": null,
+     "summary_first_dropout": 0.1,
+     "summary_proj_to_labels": true,
+     "summary_type": "cls_index",
+     "summary_use_proj": true,
+     "task_specific_params": {
+       "text-generation": {
+         "do_sample": true,
+         "max_length": 50
+       }
+     },
+     "temperature": 1.0,
+     "tie_encoder_decoder": false,
+     "tie_word_embeddings": true,
+     "tokenizer_class": null,
+     "top_k": 50,
+     "top_p": 1.0,
+     "torch_dtype": null,
+     "torchscript": false,
+     "transformers_version": "4.11.0.dev0",
+     "use_bfloat16": false,
+     "use_cache": true,
+     "vocab_size": 50257
+   },
+   "decoder_start_token_id": 50256,
+   "encoder": {
+     "_name_or_path": "",
+     "add_cross_attention": false,
+     "architectures": [
+       "ViTModel"
+     ],
+     "attention_probs_dropout_prob": 0.0,
+     "bad_words_ids": null,
+     "bos_token_id": null,
+     "chunk_size_feed_forward": 0,
+     "decoder_start_token_id": null,
+     "diversity_penalty": 0.0,
+     "do_sample": false,
+     "early_stopping": false,
+     "encoder_no_repeat_ngram_size": 0,
+     "eos_token_id": null,
+     "finetuning_task": null,
+     "forced_bos_token_id": null,
+     "forced_eos_token_id": null,
+     "hidden_act": "gelu",
+     "hidden_dropout_prob": 0.0,
+     "hidden_size": 768,
+     "id2label": {
+       "0": "LABEL_0",
+       "1": "LABEL_1"
+     },
+     "image_size": 224,
+     "initializer_range": 0.02,
+     "intermediate_size": 3072,
+     "is_decoder": false,
+     "is_encoder_decoder": false,
+     "label2id": {
+       "LABEL_0": 0,
+       "LABEL_1": 1
+     },
+     "layer_norm_eps": 1e-12,
+     "length_penalty": 1.0,
+     "max_length": 20,
+     "min_length": 0,
+     "model_type": "vit",
+     "no_repeat_ngram_size": 0,
+     "num_attention_heads": 12,
+     "num_beam_groups": 1,
+     "num_beams": 1,
+     "num_channels": 3,
+     "num_hidden_layers": 12,
+     "num_return_sequences": 1,
+     "output_attentions": false,
+     "output_hidden_states": false,
+     "output_scores": false,
+     "pad_token_id": null,
+     "patch_size": 16,
+     "prefix": null,
+     "problem_type": null,
+     "pruned_heads": {},
+     "remove_invalid_values": false,
+     "repetition_penalty": 1.0,
+     "return_dict": true,
+     "return_dict_in_generate": false,
+     "sep_token_id": null,
+     "task_specific_params": null,
+     "temperature": 1.0,
+     "tie_encoder_decoder": false,
+     "tie_word_embeddings": true,
+     "tokenizer_class": null,
+     "top_k": 50,
+     "top_p": 1.0,
+     "torch_dtype": null,
+     "torchscript": false,
+     "transformers_version": "4.11.0.dev0",
+     "use_bfloat16": false
+   },
+   "eos_token_id": 50256,
+   "is_encoder_decoder": true,
+   "model_type": "vision-encoder-decoder",
+   "pad_token_id": 50256,
+   "transformers_version": null
+ }
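The nested `decoder` and `encoder` objects above are the sub-configurations that `VisionEncoderDecoderModel` composes. A minimal sketch of inspecting them, assuming the published checkpoint `ydshieh/vit-gpt2-coco-en` and a `transformers` version that ships the vision-encoder-decoder classes:

```python
from transformers import AutoConfig

# Rebuilds the composite config from this config.json on the Hub.
config = AutoConfig.from_pretrained("ydshieh/vit-gpt2-coco-en")

print(config.model_type)          # vision-encoder-decoder
print(config.encoder.model_type)  # vit  (image_size 224, patch_size 16, 12 layers)
print(config.decoder.model_type)  # gpt2 (12 layers, n_embd 768, vocab_size 50257)

# Generation defaults: decoder_start/eos/pad token ids are all 50256,
# i.e. GPT-2's <|endoftext|> token.
print(config.decoder_start_token_id, config.eos_token_id, config.pad_token_id)
```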
events.out.tfevents.1633443513.t1v-n-bb5dfd23-w-0.8655.0.v2 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:04007cf6b329171749452c6fd460a130288f72a6679059781ec25c8a7157111a
+ size 8279933
flax_model.msgpack ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:df01c05234220e7fea4e92598162988c48315bb6fa2475b5d66f002875710be9
+ size 956799284
generation_eval.json ADDED
The diff for this file is too large to render. See raw diff
merges.txt ADDED
The diff for this file is too large to render. See raw diff
pipeline.py ADDED
@@ -0,0 +1,48 @@
+ import os
+ from typing import Dict, List, Any
+ from PIL import Image
+ import jax
+ from transformers import ViTFeatureExtractor, AutoTokenizer, FlaxVisionEncoderDecoderModel
+
+
+ class PreTrainedPipeline():
+
+     def __init__(self, path=""):
+
+         model_dir = os.path.join(path, "ckpt_epoch_3_step_6900")
+
+         self.model = FlaxVisionEncoderDecoderModel.from_pretrained(model_dir)
+         self.feature_extractor = ViTFeatureExtractor.from_pretrained(model_dir)
+         self.tokenizer = AutoTokenizer.from_pretrained(model_dir)
+
+         max_length = 16
+         num_beams = 4
+         self.gen_kwargs = {"max_length": max_length, "num_beams": num_beams}
+
+         @jax.jit
+         def _generate(pixel_values):
+             # beam-search generation; max_length and num_beams are closed over as static values
+             output_ids = self.model.generate(pixel_values, **self.gen_kwargs).sequences
+             return output_ids
+
+         self.generate = _generate
+
+         # warm-up call on the bundled sample image so jax.jit compilation happens at load time
+         image_path = os.path.join(path, 'val_000000039769.jpg')
+         image = Image.open(image_path)
+         self(image)
+         image.close()
+
+     def __call__(self, inputs: "Image.Image") -> List[str]:
+         """
+         Args: inputs (`PIL.Image.Image`) - the image to caption.
+         Return: a list containing one generated caption string.
+         """
+
+         pixel_values = self.feature_extractor(images=inputs, return_tensors="np").pixel_values
+
+         output_ids = self.generate(pixel_values)
+         preds = self.tokenizer.batch_decode(output_ids, skip_special_tokens=True)
+         preds = [pred.strip() for pred in preds]
+
+         return preds
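`pipeline.py` is the custom entry point used by the Hub's generic inference pipeline (matching `library_name: generic` in the README metadata). A rough usage sketch, assuming a local clone of this repository laid out the way the class expects, i.e. containing the `ckpt_epoch_3_step_6900` checkpoint folder and `val_000000039769.jpg`:

```python
from PIL import Image
from pipeline import PreTrainedPipeline

# `path` is the repository root; __init__ loads the checkpoint from its
# subfolder and runs one warm-up call so jax.jit compilation happens up front.
pipe = PreTrainedPipeline(path=".")

with Image.open("val_000000039769.jpg") as img:
    print(pipe(img))  # a list with one caption string, as in the README example
```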
preprocessor_config.json ADDED
@@ -0,0 +1,15 @@
+ {
+   "do_normalize": true,
+   "do_resize": true,
+   "image_mean": [
+     0.5,
+     0.5,
+     0.5
+   ],
+   "image_std": [
+     0.5,
+     0.5,
+     0.5
+   ],
+   "size": 224
+ }
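These settings tell `ViTFeatureExtractor` to resize every image to 224×224 and to normalize each channel with mean 0.5 and std 0.5, so pixel values end up roughly in [-1, 1]. A small sanity-check sketch, assuming the published checkpoint and the sample image bundled in this repository:

```python
from PIL import Image
from transformers import ViTFeatureExtractor

# Re-creates the extractor from this preprocessor_config.json on the Hub.
feature_extractor = ViTFeatureExtractor.from_pretrained("ydshieh/vit-gpt2-coco-en")

img = Image.open("val_000000039769.jpg").convert("RGB")
pixel_values = feature_extractor(images=img, return_tensors="np").pixel_values

print(pixel_values.shape)                      # (1, 3, 224, 224) after do_resize
print(pixel_values.min(), pixel_values.max())  # roughly -1.0 .. 1.0 after (x - 0.5) / 0.5
```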
report.txt ADDED
The diff for this file is too large to render. See raw diff
requirements.txt ADDED
@@ -0,0 +1,4 @@
+ Pillow
+ jax[cpu]
+ flax
+ git+https://github.com/ydshieh/transformers.git@flax_vision_encoder_decoder
special_tokens_map.json ADDED
@@ -0,0 +1 @@
+ {"bos_token": "<|endoftext|>", "eos_token": "<|endoftext|>", "unk_token": "<|endoftext|>", "pad_token": "<|endoftext|>"}
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
tokenizer_config.json ADDED
@@ -0,0 +1 @@
+ {"unk_token": "<|endoftext|>", "bos_token": "<|endoftext|>", "eos_token": "<|endoftext|>", "add_prefix_space": false, "model_max_length": 1024, "special_tokens_map_file": null, "name_or_path": "gpt2", "tokenizer_class": "GPT2Tokenizer"}
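GPT-2 has no dedicated padding token, so `special_tokens_map.json` maps bos/eos/unk/pad all to `<|endoftext|>`, consistent with the 50256 token ids used in `config.json`. A quick check, assuming the published checkpoint:

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("ydshieh/vit-gpt2-coco-en")

# All four special tokens resolve to <|endoftext|>, whose id is 50256.
print(tokenizer.bos_token, tokenizer.eos_token, tokenizer.unk_token, tokenizer.pad_token)
print(tokenizer.eos_token_id, tokenizer.pad_token_id)  # 50256 50256
```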
val_000000039769.jpg ADDED
vocab.json ADDED
The diff for this file is too large to render. See raw diff