amgadhasan
committed on
Commit
•
386e8e5
1
Parent(s):
142e171
Upload folder using huggingface_hub
Browse files- .gitattributes +6 -0
- image_captioner.py +114 -0
- model/cnn/fingerprint.pb +3 -0
- model/cnn/keras_metadata.pb +3 -0
- model/cnn/saved_model.pb +3 -0
- model/cnn/variables/variables.data-00000-of-00001 +3 -0
- model/cnn/variables/variables.index +0 -0
- model/cnn_projector/fingerprint.pb +3 -0
- model/cnn_projector/keras_metadata.pb +3 -0
- model/cnn_projector/saved_model.pb +3 -0
- model/cnn_projector/variables/variables.data-00000-of-00001 +3 -0
- model/cnn_projector/variables/variables.index +0 -0
- model/decoder/fingerprint.pb +3 -0
- model/decoder/keras_metadata.pb +3 -0
- model/decoder/saved_model.pb +3 -0
- model/decoder/variables/variables.data-00000-of-00001 +3 -0
- model/decoder/variables/variables.index +0 -0
- model/model_config.json +5 -0
- tokenizer/tokenizer.json +0 -0
- tokenizer/tokenizer_config.json +6 -0
.gitattributes
CHANGED
@@ -33,3 +33,9 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
36 |
+
cnn/variables/variables.data-00000-of-00001 filter=lfs diff=lfs merge=lfs -text
|
37 |
+
cnn_encoder/variables/variables.data-00000-of-00001 filter=lfs diff=lfs merge=lfs -text
|
38 |
+
decoder/variables/variables.data-00000-of-00001 filter=lfs diff=lfs merge=lfs -text
|
39 |
+
model/cnn/variables/variables.data-00000-of-00001 filter=lfs diff=lfs merge=lfs -text
|
40 |
+
model/cnn_projector/variables/variables.data-00000-of-00001 filter=lfs diff=lfs merge=lfs -text
|
41 |
+
model/decoder/variables/variables.data-00000-of-00001 filter=lfs diff=lfs merge=lfs -text
|
image_captioner.py
ADDED
@@ -0,0 +1,114 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
|
3 |
+
import tensorflow as tf
|
4 |
+
from utils.constants import MAX_LENGTH, IMAGE_SIZE, HIDDEN_UNITS
|
5 |
+
import json
|
6 |
+
import io
|
7 |
+
|
8 |
+
|
9 |
+
class ImageCaptioner():
    """
    Assembles the full image-captioning pipeline from three sub-models.

    The pipeline is: a CNN that extracts image features, a CNN encoder that
    projects those features, and an RNN decoder that produces next-token
    logits from the previous token, the encoded features and its hidden state.
    """

    def __init__(self, cnn, cnn_encoder, rnn_decoder, **kwargs):
        """
        Store the sub-models and the decoding constants.

        Args:
            cnn: Callable that maps an image batch to a feature map.
            cnn_encoder: Callable that encodes the flattened image features.
            rnn_decoder: Callable invoked as
                ``rnn_decoder(decoder_input, encoded_features, hidden_state, training=False)``.
            **kwargs: Accepted for compatibility; not used.
        """
        self.cnn = cnn
        self.cnn_encoder = cnn_encoder
        self.rnn_decoder = rnn_decoder
        # Decoding limits come from utils.constants rather than from arguments.
        self.MAX_LENGTH = MAX_LENGTH
        # These ids match bos_token_id / eos_token_id in
        # tokenizer/tokenizer_config.json.
        self.START_TOKEN_INDEX = 1
        self.END_TOKEN_INDEX = 2
        self.HIDDEN_UNITS = HIDDEN_UNITS

    def __call__(self, inputs):
        """
        Run one decoder step in inference mode.

        Args:
            inputs: A sequence of exactly three items:
                ``[decoder_input, encoded_features, hidden_state]``.

        Returns:
            Whatever the RNN decoder returns; ``predict`` unpacks it as
            ``(logits, hidden_state, attention_weights)``.
        """
        [decoder_input, encoded_features, hidden_state] = inputs
        return self.rnn_decoder(decoder_input, encoded_features, hidden_state, training=False)

    def _trim_at_end_token(self, token_ids):
        """Return ``token_ids`` up to (excluding) the first end token, or all of them if none is present."""
        try:
            eos_loc = token_ids.index(self.END_TOKEN_INDEX)
        except ValueError:
            # No end token was sampled within MAX_LENGTH steps: keep everything
            # instead of crashing.
            return token_ids
        return token_ids[:eos_loc]

    def predict(self, image):
        """
        Generate a caption for the given image by sampling.

        Several candidate captions are sampled in parallel and the one with
        the highest total log-probability is returned. Requires eager
        execution, because the result is converted with ``.numpy()``.

        Args:
            image: Input image tensor fed to the CNN.
                NOTE(review): the CNN output is indexed as a 4-D feature map;
                presumably the input is a batch containing a single image.

        Returns:
            A tuple ``(token_ids, None)``. ``token_ids`` is the list of
            sampled token indices up to (excluding) the end token. The second
            element is a placeholder for attention weights and is always
            ``None``.
        """
        image_features = self.cnn(image)
        # Flatten the spatial dimensions: (batch, H, W, C) -> (batch, H*W, C).
        reshaped_features = tf.reshape(
            image_features,
            (tf.shape(image_features)[0], -1, image_features.shape[3]),
        )
        encoded_features = self.cnn_encoder(reshaped_features)

        # Number of candidate captions sampled in parallel.
        # NOTE(review): relies on the decoder accepting encoded_features whose
        # batch size differs from n_captions - confirm against the decoder.
        n_captions = 2
        # Running sum of log-probabilities per candidate (higher is better).
        scores = tf.zeros(shape=(n_captions,))
        finished = tf.zeros(shape=(n_captions,), dtype=tf.bool)
        hiddens = tf.zeros((n_captions, self.HIDDEN_UNITS))
        dec_inputs = tf.fill(dims=(n_captions, 1), value=self.START_TOKEN_INDEX)
        batch_indices = tf.range(n_captions)
        sampled_steps = []

        for _ in range(self.MAX_LENGTH):
            logits, hiddens, _attention_weights = self([dec_inputs, encoded_features, hiddens])
            predicted_ids = tf.random.categorical(logits, num_samples=1, dtype=tf.int32)
            predicted_ids = tf.squeeze(predicted_ids, axis=-1)  # shape (n_captions,)
            sampled_steps.append(predicted_ids)

            # Score with normalised log-probabilities. Raw logits are
            # unnormalised and may be negative, so multiplying them does not
            # yield a likelihood.
            log_probs = tf.nn.log_softmax(logits, axis=-1)
            step_indices = tf.stack([batch_indices, predicted_ids], axis=1)
            step_log_probs = tf.gather_nd(log_probs, indices=step_indices)
            # Tokens sampled after a candidate's end token are discarded later,
            # so they must not influence its score.
            scores += tf.where(finished, tf.zeros_like(step_log_probs), step_log_probs)
            finished = tf.logical_or(finished, tf.equal(predicted_ids, self.END_TOKEN_INDEX))
            if bool(tf.reduce_all(finished)):
                break

            dec_inputs = tf.expand_dims(predicted_ids, 1)

        if not sampled_steps:
            # MAX_LENGTH <= 0: nothing was generated.
            return [], None

        results = tf.stack(sampled_steps, axis=1)  # shape (n_captions, steps)
        most_probable_sequence_id = int(tf.math.argmax(scores))
        best_caption = list(results[most_probable_sequence_id].numpy())
        return self._trim_at_end_token(best_caption), None
|
model/cnn/fingerprint.pb
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:b06f0abc26074bdd2f680bf3fc77ad130ef4c5ee087b1727384a1e2c21c0283e
|
3 |
+
size 59
|
model/cnn/keras_metadata.pb
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:bc361c72100de8ac123d591862a6c27a59bf6b690ba37203bd8b315c1f9f36fb
|
3 |
+
size 668989
|
model/cnn/saved_model.pb
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:43261b6b23f0472a9a833121d7c0c0ad10656992ea8187fbff3b56c8d2cdc38f
|
3 |
+
size 5193848
|
model/cnn/variables/variables.data-00000-of-00001
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:7a20dffd4d4ecadb4f07ad80c9706105c77f44331bca9c9766eca53bcba516c3
|
3 |
+
size 87456823
|
model/cnn/variables/variables.index
ADDED
Binary file (24.5 kB). View file
|
|
model/cnn_projector/fingerprint.pb
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:74511d4056302d6009de08b7f24a59457579f079adda15f81014a484b68cccdd
|
3 |
+
size 56
|
model/cnn_projector/keras_metadata.pb
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:3c8d76f6d44bb65e3723d61ef410cef6bf45e5f6388c84fe06fa88e8a74d7a0a
|
3 |
+
size 1968
|
model/cnn_projector/saved_model.pb
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:d16d4c3203b1f2101452773ee7b616809910d99d684f8d16222aada39be1fc99
|
3 |
+
size 41207
|
model/cnn_projector/variables/variables.data-00000-of-00001
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:e8fd15616021f0c39d77d73fc3fa4e6d551a1a7d90b73a0724c370007c1643a6
|
3 |
+
size 2099602
|
model/cnn_projector/variables/variables.index
ADDED
Binary file (269 Bytes). View file
|
|
model/decoder/fingerprint.pb
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:92658694177f3b40a5224bf1c1831fdeba388860440803f7e397a54568d86487
|
3 |
+
size 57
|
model/decoder/keras_metadata.pb
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:e6f4c8078d0a60c54547d3f7b04a996db1d1f5a28a9a9947c3ec699a5ac013ea
|
3 |
+
size 11019
|
model/decoder/saved_model.pb
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:06a59924ab529ecbcce4ea9a50eb18706613305dd8b071fc3de1fd988d50827e
|
3 |
+
size 814528
|
model/decoder/variables/variables.data-00000-of-00001
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:ae8b870808c53f8429e7f1caaa340016eb023439175eebc12c25ca2cb5bb4dc4
|
3 |
+
size 38347590
|
model/decoder/variables/variables.index
ADDED
Binary file (951 Bytes). View file
|
|
model/model_config.json
ADDED
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"max_length": 26,
|
3 |
+
"image_size": [299, 299],
|
4 |
+
"num_hidden_units": 512
|
5 |
+
}
|
tokenizer/tokenizer.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
tokenizer/tokenizer_config.json
ADDED
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"bos_token": "بب",
|
3 |
+
"bos_token_id": 1,
|
4 |
+
"eos_token": "نه",
|
5 |
+
"eos_token_id": 2
|
6 |
+
}
|