Fairseq -> Transformers conversion

#1
by mys - opened
README.md CHANGED
@@ -1,6 +1,64 @@
1
- # OFA-Base-Caption
2
- This is the official checkpoint (adaptive to the official code instead of Huggingface Transformers) of OFA-Base finetuned on the MSCOCO Caption dataset for image captioning. Specifically, the model was first trained with cross-entropy loss and then with CIDEr optimization.
 
3
 
4
- For more information, please refer to the official github ([https://github.com/OFA-Sys/OFA](https://github.com/OFA-Sys/OFA))
 
5
 
6
- Temporarily, we only provide the finetuned checkpoints based on the official code.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: apache-2.0
3
+ ---
4
 
5
+ # OFA-base-caption
6
+ This is the **base** version of the OFA model finetuned for the image captioning task. OFA is a unified multimodal pretrained model that unifies modalities (i.e., cross-modality, vision, language) and tasks (e.g., image generation, visual grounding, image captioning, image classification, text generation, etc.) into a simple sequence-to-sequence learning framework.
7
 
8
+ The directory includes 4 files, namely `config.json` which consists of model configuration, `vocab.json` and `merges.txt` for our OFA tokenizer, and lastly `pytorch_model.bin` which consists of model weights. There is no need to worry about the mismatch between Fairseq and transformers, since we have already addressed the issue.
9
+
10
+ To use it in transformers, please refer to https://github.com/OFA-Sys/OFA/tree/feature/add_transformers. Install the transformers and download the models as shown below.
11
+
12
+ ```
13
+ git clone --single-branch --branch feature/add_transformers https://github.com/OFA-Sys/OFA.git
14
+ pip install OFA/transformers/
15
+ ```
16
+
17
+ Afterwards, prepare an image for the testing example below. Also, ensure that you have pillow and torchvision in your environment.
18
+
19
+ ```
20
+ import re
21
+ import time
22
+ from PIL import Image
23
+ from torchvision import transforms
24
+ from transformers import OFATokenizer, OFAModel
25
+
26
+ model_name = "OFA-sys/OFA-base-caption"
27
+
28
+ mean, std = [0.5, 0.5, 0.5], [0.5, 0.5, 0.5]
29
+ resolution = 256
30
+
31
+ patch_resize_transform = transforms.Compose([
32
+ lambda image: image.convert("RGB"),
33
+ transforms.Resize((resolution, resolution), interpolation=Image.BICUBIC),
34
+ transforms.ToTensor(),
35
+ transforms.Normalize(mean=mean, std=std)
36
+ ])
37
+
38
+ start = time.time()
39
+ tokenizer = OFATokenizer.from_pretrained(model_name)
40
+ model = OFAModel.from_pretrained(model_name, use_cache=False)
41
+ alapsed = time.time() - start
42
+ print(f"Loaded in {alapsed} secs")
43
+
44
+
45
+ def caption_image(txt, img):
46
+ inputs = tokenizer([txt], return_tensors="pt").input_ids
47
+ patch_img = patch_resize_transform(img).unsqueeze(0)
48
+
49
+ gen = model.generate(inputs, patch_images=patch_img, num_beams=5, no_repeat_ngram_size=3)
50
+ results = tokenizer.batch_decode(gen, skip_special_tokens=True)
51
+
52
+ result = results[0].strip()
53
+ result = re.sub(r'[^\w\s]', '', result)
54
+
55
+ return result
56
+
57
+
58
+ if __name__ == "__main__":
59
+ txt = "What does the image describe?"
60
+ img = Image.open('/path/to/input/image.jpg')
61
+ caption = caption_image(txt, img)
62
+ print(caption)
63
+
64
+ ```
config.json ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "activation_dropout": 0.0,
3
+ "activation_function": "gelu",
4
+ "add_type_embedding": true,
5
+ "architectures": [
6
+ "OFAModel"
7
+ ],
8
+ "attention_dropout": 0.0,
9
+ "attn_scale_factor": 2.0,
10
+ "bos_token_id": 0,
11
+ "classifier_dropout": 0.0,
12
+ "code_image_size": 128,
13
+ "code_layernorm_embedding": true,
14
+ "d_model": 768,
15
+ "decoder_attention_heads": 12,
16
+ "decoder_drop_path_rate": 0.0,
17
+ "decoder_ffn_dim": 3072,
18
+ "decoder_layerdrop": 0.0,
19
+ "decoder_layers": 6,
20
+ "decoder_normalize_before": true,
21
+ "decoder_start_token_id": 0,
22
+ "dropout": 0.1,
23
+ "encoder_attention_heads": 12,
24
+ "encoder_drop_path_rate": 0.0,
25
+ "encoder_ffn_dim": 3072,
26
+ "encoder_layerdrop": 0.0,
27
+ "encoder_layers": 6,
28
+ "encoder_normalize_before": true,
29
+ "entangle_position_embedding": false,
30
+ "eos_token_id": 2,
31
+ "forced_eos_token_id": 2,
32
+ "image_bucket_size": 42,
33
+ "init_std": 0.02,
34
+ "is_encoder_decoder": true,
35
+ "layernorm_embedding": true,
36
+ "max_position_embeddings": 1024,
37
+ "model_type": "ofa",
38
+ "normformer": true,
39
+ "num_hidden_layers": 6,
40
+ "pad_token_id": 1,
41
+ "patch_layernorm_embedding": true,
42
+ "resnet_drop_path_rate": 0.0,
43
+ "resnet_model_path": null,
44
+ "resnet_type": "resnet101",
45
+ "scale_embedding": false,
46
+ "share_decoder_input_output_embed": true,
47
+ "token_bucket_size": 256,
48
+ "torch_dtype": "float32",
49
+ "transformers_version": "4.15.0",
50
+ "use_cache": false,
51
+ "vocab_size": 59457
52
+ }
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
caption_base_best.pt → pytorch_model.bin RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0a243bed55b82bf6596255edae716d6b4262d7a2175d4e24ab6db372a97ed2d1
3
- size 2254237467
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:521abbc85015e110be39ca7158579966b6e41101d012b961a5ea6aff18b3fe66
3
+ size 1161554935
vocab.json ADDED
The diff for this file is too large to render. See raw diff