amgadhasan committed on
Commit
386e8e5
1 Parent(s): 142e171

Upload folder using huggingface_hub

Browse files
.gitattributes CHANGED
@@ -33,3 +33,9 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ cnn/variables/variables.data-00000-of-00001 filter=lfs diff=lfs merge=lfs -text
37
+ cnn_encoder/variables/variables.data-00000-of-00001 filter=lfs diff=lfs merge=lfs -text
38
+ decoder/variables/variables.data-00000-of-00001 filter=lfs diff=lfs merge=lfs -text
39
+ model/cnn/variables/variables.data-00000-of-00001 filter=lfs diff=lfs merge=lfs -text
40
+ model/cnn_projector/variables/variables.data-00000-of-00001 filter=lfs diff=lfs merge=lfs -text
41
+ model/decoder/variables/variables.data-00000-of-00001 filter=lfs diff=lfs merge=lfs -text
image_captioner.py ADDED
@@ -0,0 +1,114 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
3
+ import tensorflow as tf
4
+ from utils.constants import MAX_LENGTH, IMAGE_SIZE, HIDDEN_UNITS
5
+ import json
6
+ import io
7
+
8
+
9
class ImageCaptioner:
    """Assemble the captioning sub-models into one inference pipeline.

    The pipeline chains three models: a CNN for feature extraction, a CNN
    encoder that projects those features to a suitable dimension, and an RNN
    decoder (attention layer + RNN layer) that generates text from the last
    predicted token and the encoded image features.
    """

    def __init__(self, cnn, cnn_encoder, rnn_decoder, **kwargs):
        """Store the sub-models and the decoding constants.

        Args:
            cnn: Model used to extract features from images.
            cnn_encoder: Model that encodes the extracted image features.
            rnn_decoder: Decoder called as
                ``rnn_decoder(decoder_input, encoded_features, hidden_state,
                training=False)``.
            **kwargs: Accepted for compatibility; not used.

        The maximum caption length and the hidden-state size are not
        arguments: they are read from the module-level ``MAX_LENGTH`` and
        ``HIDDEN_UNITS`` constants.
        """
        self.cnn = cnn
        self.cnn_encoder = cnn_encoder
        self.rnn_decoder = rnn_decoder
        self.MAX_LENGTH = MAX_LENGTH
        self.START_TOKEN_INDEX = 1
        self.END_TOKEN_INDEX = 2
        self.HIDDEN_UNITS = HIDDEN_UNITS

    def __call__(self, inputs):
        """Run one decoder step in inference mode.

        Args:
            inputs: A three-item sequence
                ``[decoder_input, encoded_features, hidden_state]``.

        Returns:
            Whatever ``rnn_decoder`` returns; ``predict`` unpacks it as
            ``(logits, hidden_state, attention_weights)``.
        """
        decoder_input, encoded_features, hidden_state = inputs
        return self.rnn_decoder(
            decoder_input, encoded_features, hidden_state, training=False
        )

    def predict(self, image, n_captions=2):
        """Sample several candidate captions and return the most likely one.

        ``n_captions`` candidates are sampled token by token in parallel.
        Each candidate is scored by the sum of the log-probabilities of its
        sampled tokens up to and including the end token, and the
        best-scoring candidate is returned. Summed log-probabilities are not
        length-normalised, so shorter candidates are mildly favoured.

        Args:
            image: Input image tensor passed directly to ``cnn``.
            n_captions: Number of candidates to sample. Defaults to 2.

        Returns:
            A tuple ``(token_indices, None)``. ``token_indices`` is the list
            of sampled token ids of the best candidate, truncated before the
            end token (or the full sequence when no end token was sampled).
            The second element is always ``None``: attention weights are not
            collected by this sampling procedure.

        Raises:
            ValueError: If ``n_captions`` is smaller than 1.
        """
        if n_captions < 1:
            raise ValueError("n_captions must be at least 1")

        image_features = self.cnn(image)
        # Collapse the two middle axes of the rank-4 feature map so that it
        # becomes (batch, positions, channels).
        reshaped_features = tf.reshape(
            image_features,
            (tf.shape(image_features)[0], -1, image_features.shape[3]),
        )
        encoded_features = self.cnn_encoder(reshaped_features)
        # NOTE(review): the decoder receives n_captions rows of tokens/state
        # but encoded_features keeps the image batch size; this presumably
        # relies on broadcasting inside the decoder -- confirm.

        hiddens = tf.zeros((n_captions, self.HIDDEN_UNITS))
        dec_inputs = tf.fill(dims=(n_captions, 1), value=self.START_TOKEN_INDEX)
        batch_indices = tf.range(n_captions, dtype=tf.int32)

        # Log-domain scores: start at 0 and add per-token log-probabilities.
        scores = tf.zeros(shape=(n_captions,))
        finished = tf.zeros(shape=(n_captions,), dtype=tf.bool)
        sampled_steps = []

        for _ in range(self.MAX_LENGTH):
            logits, hiddens, _attention_weights = self(
                [dec_inputs, encoded_features, hiddens]
            )
            # Shape (n_captions, 1) -> (n_captions,)
            predicted_ids = tf.squeeze(
                tf.random.categorical(logits, num_samples=1, dtype=tf.int32),
                axis=-1,
            )

            # Raw logits are unnormalised and may be negative, so they cannot
            # be multiplied together as if they were probabilities; normalise
            # them and accumulate in the log domain instead.
            log_probs = tf.nn.log_softmax(logits, axis=-1)
            step_scores = tf.gather_nd(
                log_probs, tf.stack([batch_indices, predicted_ids], axis=1)
            )
            # Tokens sampled after a candidate's end token are discarded from
            # the result, so they must not influence its score either.
            scores += tf.where(finished, tf.zeros_like(step_scores), step_scores)

            sampled_steps.append(predicted_ids)
            finished = tf.logical_or(
                finished, tf.equal(predicted_ids, self.END_TOKEN_INDEX)
            )
            if bool(tf.reduce_all(finished)):
                break
            dec_inputs = tf.expand_dims(predicted_ids, 1)

        if not sampled_steps:
            return [], None

        results = tf.stack(sampled_steps, axis=1)  # (n_captions, steps)
        most_probable_sequence_id = int(tf.math.argmax(scores))
        best_caption = list(results[most_probable_sequence_id].numpy())

        # The end token may never be sampled within MAX_LENGTH steps; in that
        # case return the whole sequence instead of raising.
        try:
            eos_loc = best_caption.index(self.END_TOKEN_INDEX)
        except ValueError:
            eos_loc = len(best_caption)

        return best_caption[:eos_loc], None
model/cnn/fingerprint.pb ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b06f0abc26074bdd2f680bf3fc77ad130ef4c5ee087b1727384a1e2c21c0283e
3
+ size 59
model/cnn/keras_metadata.pb ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bc361c72100de8ac123d591862a6c27a59bf6b690ba37203bd8b315c1f9f36fb
3
+ size 668989
model/cnn/saved_model.pb ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:43261b6b23f0472a9a833121d7c0c0ad10656992ea8187fbff3b56c8d2cdc38f
3
+ size 5193848
model/cnn/variables/variables.data-00000-of-00001 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7a20dffd4d4ecadb4f07ad80c9706105c77f44331bca9c9766eca53bcba516c3
3
+ size 87456823
model/cnn/variables/variables.index ADDED
Binary file (24.5 kB). View file
 
model/cnn_projector/fingerprint.pb ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:74511d4056302d6009de08b7f24a59457579f079adda15f81014a484b68cccdd
3
+ size 56
model/cnn_projector/keras_metadata.pb ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3c8d76f6d44bb65e3723d61ef410cef6bf45e5f6388c84fe06fa88e8a74d7a0a
3
+ size 1968
model/cnn_projector/saved_model.pb ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d16d4c3203b1f2101452773ee7b616809910d99d684f8d16222aada39be1fc99
3
+ size 41207
model/cnn_projector/variables/variables.data-00000-of-00001 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e8fd15616021f0c39d77d73fc3fa4e6d551a1a7d90b73a0724c370007c1643a6
3
+ size 2099602
model/cnn_projector/variables/variables.index ADDED
Binary file (269 Bytes). View file
 
model/decoder/fingerprint.pb ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:92658694177f3b40a5224bf1c1831fdeba388860440803f7e397a54568d86487
3
+ size 57
model/decoder/keras_metadata.pb ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e6f4c8078d0a60c54547d3f7b04a996db1d1f5a28a9a9947c3ec699a5ac013ea
3
+ size 11019
model/decoder/saved_model.pb ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:06a59924ab529ecbcce4ea9a50eb18706613305dd8b071fc3de1fd988d50827e
3
+ size 814528
model/decoder/variables/variables.data-00000-of-00001 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ae8b870808c53f8429e7f1caaa340016eb023439175eebc12c25ca2cb5bb4dc4
3
+ size 38347590
model/decoder/variables/variables.index ADDED
Binary file (951 Bytes). View file
 
model/model_config.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "max_length": 26,
3
+ "image_size": [299, 299],
4
+ "num_hidden_units": 512
5
+ }
tokenizer/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer/tokenizer_config.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": "بب",
3
+ "bos_token_id": 1,
4
+ "eos_token": "نه",
5
+ "eos_token_id": 2
6
+ }