amgadhasan committed on Jan 6

Commit

386e8e5

•

1 Parent(s): 142e171

Upload folder using huggingface_hub

Browse files

Files changed (20) hide show

.gitattributes +6 -0
image_captioner.py +114 -0
model/cnn/fingerprint.pb +3 -0
model/cnn/keras_metadata.pb +3 -0
model/cnn/saved_model.pb +3 -0
model/cnn/variables/variables.data-00000-of-00001 +3 -0
model/cnn/variables/variables.index +0 -0
model/cnn_projector/fingerprint.pb +3 -0
model/cnn_projector/keras_metadata.pb +3 -0
model/cnn_projector/saved_model.pb +3 -0
model/cnn_projector/variables/variables.data-00000-of-00001 +3 -0
model/cnn_projector/variables/variables.index +0 -0
model/decoder/fingerprint.pb +3 -0
model/decoder/keras_metadata.pb +3 -0
model/decoder/saved_model.pb +3 -0
model/decoder/variables/variables.data-00000-of-00001 +3 -0
model/decoder/variables/variables.index +0 -0
model/model_config.json +5 -0
tokenizer/tokenizer.json +0 -0
tokenizer/tokenizer_config.json +6 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,9 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+cnn/variables/variables.data-00000-of-00001 filter=lfs diff=lfs merge=lfs -text
+cnn_encoder/variables/variables.data-00000-of-00001 filter=lfs diff=lfs merge=lfs -text
+decoder/variables/variables.data-00000-of-00001 filter=lfs diff=lfs merge=lfs -text
+model/cnn/variables/variables.data-00000-of-00001 filter=lfs diff=lfs merge=lfs -text
+model/cnn_projector/variables/variables.data-00000-of-00001 filter=lfs diff=lfs merge=lfs -text
+model/decoder/variables/variables.data-00000-of-00001 filter=lfs diff=lfs merge=lfs -text

image_captioner.py ADDED Viewed

	@@ -0,0 +1,114 @@

+import os
+os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
+import tensorflow as tf
+from utils.constants import MAX_LENGTH, IMAGE_SIZE, HIDDEN_UNITS
+import json
+import io
+class ImageCaptioner():
+    """
+    A custom class that builds the full model from the smaller sub models. It contains a cnn for feature extraction, a cnn_encoder to encode the features to a suitable dimension,
+    an RNN decoder that contains an attention layer and RNN layer to generate text from the last predicted token + encoded image features.
+    """
+    def __init__(self, cnn, cnn_encoder, rnn_decoder, **kwargs):
+        """
+        Initializes the ImageCaptioner class with the given arguments.
+        Args:
+        cnn: A convolutional neural network that is used to extract features from images.
+        cnn_encoder: A model that encodes the image features into a lower-dimensional space.
+        rnn_decoder: A recurrent neural network that generates captions for the input images.
+        max_length: The maximum length of the captions that the model generates.
+        **kwargs: Additional keyword arguments that are not used in this implementation.
+        """
+        self.cnn = cnn
+        self.cnn_encoder = cnn_encoder
+        self.rnn_decoder = rnn_decoder
+        self.MAX_LENGTH = MAX_LENGTH
+        self.START_TOKEN_INDEX = 1
+        self.END_TOKEN_INDEX = 2
+        self.HIDDEN_UNITS = HIDDEN_UNITS
+    def __call__(self, inputs):
+        """
+        Calls the MyCustomModel instance with the given inputs.
+        Args:
+        inputs: A list of input tensors containing the decoder input, encoded features, and hidden state.
+        Returns:
+        The output tensor of the RNN decoder.
+        """
+        [decoder_input, encoded_features, hidden_state] = inputs
+        return self.rnn_decoder(decoder_input, encoded_features, hidden_state, training=False)
+    def predict(self, image):
+        """
+        Generates a caption for the given image.
+        Args:
+        image: An input image tensor that the model generates a caption for.
+        Returns:
+        A tuple containing the indices of the predicted tokens and the attention weights sequence.
+        """
+        image_features = self.cnn(image)
+        reshaped_features = tf.reshape(image_features, (tf.shape(image_features)[0], -1, image_features.shape[3]))
+        encoded_features = self.cnn_encoder(reshaped_features)
+        # Get the RNN's initial state and start token for each new sample
+        # hidden_state = tf.zeros((1, 512))
+        # decoder_input = tf.expand_dims([self.START_TOKEN_INDEX],0)
+        # decoder_input = tf.cast(decoder_input, tf.int32)
+        # caption_probability = 1
+        # predicted_tokens_indices = []
+        # attention_weights_sequence = []
+        n_captions = 2
+        results = tf.Variable(tf.zeros(shape=(n_captions, self.MAX_LENGTH),dtype='int32'), )
+        scores = tf.ones(shape=(n_captions,))
+        #hidden = decoder.get_initial_state(batch_size=1)
+        #hiddens = self.rnn_decoder.get_initial_state(batch_size=n_captions)
+        hiddens = tf.zeros((n_captions, self.HIDDEN_UNITS))
+        #hiddens = [hidden for _ in range(n)]
+        #dec_input = tf.expand_dims([tokenizer.word_index['بب']], 0)
+        dec_inputs = tf.fill(dims=(n_captions,1), value=self.START_TOKEN_INDEX)
+        batch_indices = list(range(n_captions)) # batch size
+        for i in range(self.MAX_LENGTH):
+            logits, hiddens, attention_weights = self.__call__([dec_inputs, encoded_features, hiddens])
+            predicted_ids = tf.random.categorical(logits, num_samples=1, dtype=tf.int32)  # shape (batch_size,num_samples)
+            predicted_ids = tf.squeeze(predicted_ids, axis=-1)
+            #predicted_ids = tf.convert_to_tensor(predicted_ids, dtype=tf.int32)#tf.cast(predicted_ids, tf.int32)
+            #probabilities = tf.nn.softmax(logits, axis=-1)
+            element_indices = predicted_ids
+            indices = tf.stack([batch_indices, element_indices], axis=1)
+            scores *= tf.gather_nd(logits ,indices = indices)
+            #predicted_id = tf.argmax(predictions, axis=-1, output_type=tf.int64).numpy()[0]
+            #print(predicted_id)
+            #print(predicted_ids)
+            results[:,i].assign(predicted_ids)
+            # if tokenizer.index_word[predicted_id] == 'نه':
+            #     break
+            dec_inputs = tf.expand_dims(predicted_ids, 1)
+            #dec_input = tf.expand_dims([predicted_id], 0)
+            #print(probs)
+        most_probable_sequence_id = int(tf.math.argmax(scores))
+        best_caption = list(results[most_probable_sequence_id].numpy())
+        print(best_caption)
+        eos_loc = best_caption.index(self.END_TOKEN_INDEX)
+        #caption_text = tokenizer.sequences_to_texts([best_caption[:eos_loc]])
+        return best_caption[:eos_loc], None
+        # Generate the caption token by token
+        # for i in range(self.MAX_LENGTH):
+        #     logits, hidden_state, attention_weights = self.__call__([decoder_input, encoded_features, hidden_state])
+        #     predicted_token_index = tf.cast(tf.random.categorical(logits, 1)[0][0], tf.int64)
+        #     predicted_tokens_indices.append(tf.get_static_value(predicted_token_index))
+        #     attention_weights_sequence.append(attention_weights)
+        #     if predicted_token_index == self.END_TOKEN_INDEX:
+        #         break
+        #     decoder_input = tf.expand_dims([tf.cast(predicted_token_index, tf.int32)], 0)
+        # return predicted_tokens_indices, attention_weights_sequence

model/cnn/fingerprint.pb ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b06f0abc26074bdd2f680bf3fc77ad130ef4c5ee087b1727384a1e2c21c0283e
+size 59

model/cnn/keras_metadata.pb ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:bc361c72100de8ac123d591862a6c27a59bf6b690ba37203bd8b315c1f9f36fb
+size 668989

model/cnn/saved_model.pb ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:43261b6b23f0472a9a833121d7c0c0ad10656992ea8187fbff3b56c8d2cdc38f
+size 5193848

model/cnn/variables/variables.data-00000-of-00001 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7a20dffd4d4ecadb4f07ad80c9706105c77f44331bca9c9766eca53bcba516c3
+size 87456823

model/cnn/variables/variables.index ADDED Viewed

Binary file (24.5 kB). View file

model/cnn_projector/fingerprint.pb ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:74511d4056302d6009de08b7f24a59457579f079adda15f81014a484b68cccdd
+size 56

model/cnn_projector/keras_metadata.pb ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3c8d76f6d44bb65e3723d61ef410cef6bf45e5f6388c84fe06fa88e8a74d7a0a
+size 1968

model/cnn_projector/saved_model.pb ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d16d4c3203b1f2101452773ee7b616809910d99d684f8d16222aada39be1fc99
+size 41207

model/cnn_projector/variables/variables.data-00000-of-00001 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e8fd15616021f0c39d77d73fc3fa4e6d551a1a7d90b73a0724c370007c1643a6
+size 2099602

model/cnn_projector/variables/variables.index ADDED Viewed

Binary file (269 Bytes). View file

model/decoder/fingerprint.pb ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:92658694177f3b40a5224bf1c1831fdeba388860440803f7e397a54568d86487
+size 57

model/decoder/keras_metadata.pb ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e6f4c8078d0a60c54547d3f7b04a996db1d1f5a28a9a9947c3ec699a5ac013ea
+size 11019

model/decoder/saved_model.pb ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:06a59924ab529ecbcce4ea9a50eb18706613305dd8b071fc3de1fd988d50827e
+size 814528

model/decoder/variables/variables.data-00000-of-00001 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ae8b870808c53f8429e7f1caaa340016eb023439175eebc12c25ca2cb5bb4dc4
+size 38347590

model/decoder/variables/variables.index ADDED Viewed

Binary file (951 Bytes). View file

model/model_config.json ADDED Viewed

	@@ -0,0 +1,5 @@

+{
+  "max_length": 26,
+  "image_size": [299, 299],
+  "num_hidden_units": 512
+}

tokenizer/tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

tokenizer/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,6 @@

+{
+"bos_token": "بب",
+"bos_token_id": 1,
+"eos_token": "نه",
+"eos_token_id": 2
+}