File size: 2,300 Bytes
f338d56
 
 
 
 
a99072f
 
 
f338d56
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a99072f
 
f338d56
 
 
a99072f
 
 
 
 
 
f338d56
 
 
 
 
a99072f
 
 
f338d56
a99072f
f338d56
 
 
 
 
 
 
 
 
a99072f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f338d56
a99072f
 
 
 
 
 
 
f338d56
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
import sys, os

current_path = os.path.dirname(os.path.abspath(__file__))
sys.path.append(current_path)

# jax
import jax

# Main model -  ViTGPT2LM
from vit_gpt2.modeling_flax_vit_gpt2_lm import FlaxViTGPT2LMForConditionalGeneration

# ViT - as encoder
from transformers import ViTFeatureExtractor
from PIL import Image
import requests
import numpy as np

# GPT2 / GPT2LM - as decoder
from transformers import ViTFeatureExtractor, GPT2Tokenizer

# Decoding options forwarded to `generate` on every prediction.
max_length = 64
num_beams = 16
gen_kwargs = dict(max_length=max_length, num_beams=num_beams)

# Where each component is loaded from: the captioning model comes from a
# path relative to the working directory, the image preprocessor and the
# tokenizer from named pretrained checkpoints.
model_name_or_path = './outputs/ckpt_2/'
vit_model_name = 'google/vit-base-patch16-224-in21k'
gpt2_model_name = 'asi/gpt-fr-cased-small'

# Loaded once at import time and shared by `predict_fn` / `predict`.
flax_vit_gpt2_lm = FlaxViTGPT2LMForConditionalGeneration.from_pretrained(
    model_name_or_path
)
feature_extractor = ViTFeatureExtractor.from_pretrained(vit_model_name)
tokenizer = GPT2Tokenizer.from_pretrained(gpt2_model_name)


@jax.jit
def predict_fn(pixel_values):
    """Run caption generation on already-preprocessed pixel values.

    The call is wrapped in ``jax.jit`` and forwards ``pixel_values`` to the
    module-level model's ``generate`` together with the module-level
    ``gen_kwargs``.

    Args:
        pixel_values: Model-ready image tensor, as produced by the feature
            extractor.

    Returns:
        Whatever ``generate`` returns; callers read its ``sequences``
        attribute.
    """
    generated = flax_vit_gpt2_lm.generate(pixel_values, **gen_kwargs)
    return generated

def predict(image, pxs=None):
    """Generate a caption for one image.

    Args:
        image: Image handed to the module-level ``feature_extractor``. Only
            used when ``pxs`` is None.
        pxs: Optional precomputed pixel values. When given, they are fed to
            the model directly and feature extraction is skipped entirely.

    Returns:
        A ``(caption, token_ids)`` tuple: the decoded text and the NumPy
        array of token ids for the first generated sequence.
    """
    if pxs is not None:
        # Caller supplied model-ready inputs; running the feature extractor
        # here would only produce a result that is thrown away.
        pixel_values = pxs
    else:
        # batch dim is added automatically
        encoder_inputs = feature_extractor(images=image, return_tensors="jax")
        pixel_values = encoder_inputs.pixel_values

    # generation
    generation = predict_fn(pixel_values)

    # Keep only the first sequence of the generated batch.
    token_ids = np.array(generation.sequences)[0]
    caption = tokenizer.decode(token_ids)

    return caption, token_ids


if __name__ == '__main__':
    # Manual smoke test: caption one training image twice and print timings.

    from datetime import datetime

    idx = 11
    image_path = f'./wit_data_dir/train/images/{idx}.jpg'
    image = Image.open(image_path)

    # Print the total absolute difference between freshly extracted pixel
    # values and the array stored on disk for the same index.
    fresh_pixels = feature_extractor(images=image, return_tensors="np").pixel_values
    stored_pixels = np.load(f'./wit_data_dir/train/numpy/{idx}.npy')
    abs_diff = np.abs(fresh_pixels - stored_pixels)
    print(np.sum(abs_diff))

    # First prediction: feed the stored pixel values as the model input.
    started = datetime.now()
    caption, token_ids = predict(image, pxs=stored_pixels)
    finished = datetime.now()
    elapsed = (finished - started).total_seconds()
    print(elapsed)

    print(f'token_ids: {token_ids}')
    print(f'caption: {caption}')

    # Second prediction: let `predict` derive the pixel values from the image.
    for _ in range(1):
        started = datetime.now()
        caption, token_ids = predict(image, pxs=None)
        finished = datetime.now()
        elapsed = (finished - started).total_seconds()
        print(elapsed)
        print('-' * 20)

    print(f'token_ids: {token_ids}')
    print(f'caption: {caption}')