Spaces: Runtime error
sonamsherpa committed
Commit 27afece
Parent(s): dbebdf9
Update app.py

app.py CHANGED
@@ -1,3 +1,686 @@
-print('Hello World!')
# -*- coding: utf-8 -*-
"""Copy of Copy of Imagecaption_generator_AIML.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1Thp1MpIDt-AnhXifbSu-AeGQRI8iR3-E
"""

import os

# "!pip install wget" in the original Colab notebook; shelled out here so the
# script also runs outside IPython.
os.system("pip install wget")

import re
import numpy as np
import matplotlib.pyplot as plt
import requests
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import shutil
from tensorflow.keras.applications import efficientnet
import wget
from tensorflow.keras.layers import TextVectorization


seed = 111
np.random.seed(seed)
tf.random.set_seed(seed)

# Download and unpack the Flickr8k images and captions
# (originally "!wget", "!unzip" and "!rm" Colab shell magics).
os.system("wget -q https://github.com/jbrownlee/Datasets/releases/download/Flickr8k/Flickr8k_Dataset.zip")
os.system("wget -q https://github.com/jbrownlee/Datasets/releases/download/Flickr8k/Flickr8k_text.zip")
os.system("unzip -qq Flickr8k_Dataset.zip")
os.system("unzip -qq Flickr8k_text.zip")
os.system("rm Flickr8k_Dataset.zip Flickr8k_text.zip")

# Desired image dimensions
image_size = (299, 299)

# Vocabulary size
vocabulary_size = 10000

# Fixed length allowed for any sequence
sequence_length = 25

# Dimension for the image embeddings and token embeddings
# Per-layer units in the feed-forward network
embedded_dimension = feed_forward_dimension = EMBED_DIM = 512

# Other training parameters
batch_size = 64
epochs = 30
autotune = tf.data.AUTOTUNE

def map_image_caption(filename):
    '''
    Loads the captions and maps each caption to its respective image.
    Returns: a dictionary of image name -> captions and a list containing all the captions.
    '''

    with open(filename) as caption_file:
        caption_data = caption_file.readlines()
        mapped_captions = {}
        text_data = []
        skip_these_images = set()

        for c_data in caption_data:
            # The image name and the caption are separated by a tab, so split them into separate variables
            image_name, caption = c_data.strip("\n").split("\t")
            caption = caption.strip()

            # There are 5 captions for each image and each image name carries the suffix '#(caption_number)',
            # so drop everything after '#' and strip any whitespace
            image_name = os.path.join('Flicker8k_Dataset', image_name.split("#")[0].strip())

            # Remove captions that are either too short or too long
            tokens = caption.strip().split()

            if len(tokens) < 5 or len(tokens) > sequence_length:
                skip_these_images.add(image_name)
                continue

            if image_name.endswith("jpg") and image_name not in skip_these_images:
                # Add start and end tags to mark the beginning and end of each caption
                text_data.append("<start> " + caption + " <end>")

                if image_name in mapped_captions:
                    mapped_captions[image_name].append(caption)
                else:
                    mapped_captions[image_name] = [caption]

        for image_name in skip_these_images:
            if image_name in mapped_captions:
                del mapped_captions[image_name]

        return mapped_captions, text_data

def train_val_split(caption_data):
    '''
    Split the data into training and validation sets.
    '''
    train_size = 0.8

    # Get the list of image names
    list_of_images = list(caption_data.keys())

    # Shuffle for randomness
    np.random.shuffle(list_of_images)

    # Split data into training and testing
    train_size = int(len(caption_data) * train_size)

    train_data = {
        name: caption_data[name] for name in list_of_images[:train_size]
    }
    test_data = {
        name: caption_data[name] for name in list_of_images[train_size:]
    }

    return train_data, test_data

# Load the dataset
captions_mapping, text_data = map_image_caption("Flickr8k.token.txt")

# Split the dataset into training and validation sets
training_data, validation_data = train_val_split(captions_mapping)
print("Number of training samples here: ", len(training_data))
print("Number of validation samples here: ", len(validation_data))

def standardize(input_string):
    # Lower-case the text and strip punctuation, keeping "<" and ">" so the <start>/<end> tokens survive
    strip_chars = "!\"#$%&'()*+,-./:;<=>?@[\]^_`{|}~".replace("<", "").replace(">", "")
    return tf.strings.regex_replace(tf.strings.lower(input_string), "[%s]" % re.escape(strip_chars), "")

vectorization = TextVectorization(
    max_tokens=vocabulary_size,
    output_mode="int",
    output_sequence_length=sequence_length,
    standardize=standardize,
)
vectorization.adapt(text_data)
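
# For example (illustrative): standardize(tf.constant("<start> A dog runs! <end>")) lower-cases the
# text and strips the punctuation while keeping the angle brackets, giving "<start> a dog runs <end>",
# which vectorization then maps to a fixed-length sequence of 25 integer token ids.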

# Data augmentation for image data
image_augmentation = keras.Sequential(
    [
        layers.RandomFlip("horizontal"),
        layers.RandomRotation(0.2),
        layers.RandomContrast(0.3),
    ]
)

def decoder_to_resizer(img_path):
    '''
    Decodes a JPEG, resizes it and converts it to float for processing.
    '''
    image = tf.io.read_file(img_path)
    decoded_image = tf.image.decode_jpeg(image, channels=3)
    resized_image = tf.image.resize(decoded_image, image_size)
    return tf.image.convert_image_dtype(resized_image, tf.float32)

def process_input(img_path, captions):
    '''
    Returns the decoded, resized image as floats together with the vectorized captions.
    '''
    return decoder_to_resizer(img_path), vectorization(captions)

def prepare_dataset(images, captions):
    dataset = tf.data.Dataset.from_tensor_slices((images, captions))
    return dataset.shuffle(batch_size * 8).map(process_input, num_parallel_calls=autotune).batch(batch_size).prefetch(autotune)


training_dataset = prepare_dataset(list(training_data.keys()), list(training_data.values()))
validation_dataset = prepare_dataset(list(validation_data.keys()), list(validation_data.values()))

training_dataset

validation_dataset
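
# Rough sketch of what these pipelines yield (shapes assume the settings above): each batch should be
# a pair (images, captions) with images of shape (batch_size, 299, 299, 3) and captions of shape
# (batch_size, 5, 25), i.e. five vectorized captions of length 25 per image.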

def prepare_cnn_model():
    base_model = efficientnet.EfficientNetB0(
        input_shape=(*image_size, 3), include_top=False, weights="imagenet",
    )
    # We freeze our feature extractor
    base_model.trainable = False
    base_model_out = base_model.output
    base_model_out = layers.Reshape((-1, base_model_out.shape[-1]))(base_model_out)
    cnn_model = keras.models.Model(base_model.input, base_model_out)
    return cnn_model

class EncoderClass(layers.Layer):
    ''' Encoder block: a Keras layer that applies self-attention to the image features.

    '''
    def __init__(self, embedded_dimension, dense_dimension, number_of_heads, **kwargs):
        super().__init__(**kwargs)
        self.embedded_dimension = embedded_dimension
        self.dense_dimension = dense_dimension
        self.number_of_heads = number_of_heads

        # A multi-headed self-attention layer with no dropout
        self.mh_attention_layer = layers.MultiHeadAttention(
            num_heads=number_of_heads,
            key_dim=embedded_dimension,
            dropout=0.0
        )

        # Normalization layers
        # These layers normalize the input; they can be compared to a StandardScaler in traditional machine learning
        self.normalization_layer_1 = layers.LayerNormalization()
        self.normalization_layer_2 = layers.LayerNormalization()

        # Dense layer with relu activation
        self.dense_layer = layers.Dense(embedded_dimension, activation="relu")

    def call(self, inputs, training):
        # The inputs are first normalized and projected, then fed through the multi-headed attention layer
        inputs = self.dense_layer(self.normalization_layer_1(inputs))

        attention_output_1 = self.mh_attention_layer(
            query=inputs,
            value=inputs,
            key=inputs,
            attention_mask=None,
            training=training,
        )

        # After applying attention to the input, the residual sum is passed through another normalization layer
        return self.normalization_layer_2(inputs + attention_output_1)
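
# Shape sketch (assuming the 299x299 inputs above): EfficientNetB0 without its top produces a
# 10x10x1280 feature map, which prepare_cnn_model() reshapes to (batch, 100, 1280); EncoderClass
# then projects it to (batch, 100, 512) and applies self-attention over the 100 spatial positions.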

class EmbedTokenAndPostionClass(layers.Layer):
    ''' This class embeds each token together with its position, giving every token both semantic and positional information.
    '''
    def __init__(self, sequence_length, vocabulary_size, embedded_dimension, **kwargs):
        super().__init__(**kwargs)

        # Embedding layer for the tokens: the input dimension is the vocabulary size and the output dimension is the embedding dimension.
        # This layer captures the semantic meaning of each token, which helps the model relate words to one another.
        self.token_embeddings = layers.Embedding(
            input_dim=vocabulary_size,
            output_dim=embedded_dimension
        )

        # Embedding layer for the positions: the input dimension is the sequence length and the output dimension is the embedding dimension.
        # This simply captures where in the sequence a particular token sits.
        self.position_embeddings = layers.Embedding(
            input_dim=sequence_length,
            output_dim=embedded_dimension
        )
        self.sequence_length = sequence_length
        self.vocabulary_size = vocabulary_size
        self.embedded_dimension = embedded_dimension

        # The square root of the embedding dimension, as float32.
        # The token embeddings are scaled by this factor (as in the original Transformer) before the positional embeddings are added.
        self.embedded_scale = tf.math.sqrt(tf.cast(embedded_dimension, tf.float32))

    def call(self, inputs):

        # Get all the positions
        positions = tf.range(start=0, limit=tf.shape(inputs)[-1], delta=1)

        # Pass the input through the token embedding.
        # This generates a continuous vector for each token.
        embedded_tokens = self.token_embeddings(inputs) * self.embedded_scale
        embedded_positions = self.position_embeddings(positions)

        # Combine the token vectors with their position vectors, capturing both the semantic meaning of the words and their order.
        return embedded_tokens + embedded_positions

    def compute_mask(self, inputs, mask=None):
        return tf.math.not_equal(inputs, 0)
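
# Tensor-shape sketch for this layer (with the defaults above): an int input of shape (batch, seq_len)
# becomes token embeddings of shape (batch, seq_len, 512), scaled by sqrt(512); position embeddings of
# shape (seq_len, 512) are broadcast over the batch and added, so the output is (batch, seq_len, 512).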

class DecoderClass(layers.Layer):
    '''This is the decoder component of the model. It decodes the embedded target sequence against the encoded image features,
    using self-attention and cross-attention together with a feed-forward layer to produce the output sequence.
    '''

    def __init__(self, embedded_dimension, feed_forward_dimension, number_of_heads, **kwargs):
        super().__init__(**kwargs)
        self.embed_dim = embedded_dimension
        self.feed_forward_dimension = feed_forward_dimension
        self.number_of_heads = number_of_heads

        self.first_attention_layer = layers.MultiHeadAttention(
            num_heads=number_of_heads,
            key_dim=embedded_dimension,
            dropout=0.1
        )

        self.second_attention_layer = layers.MultiHeadAttention(
            num_heads=number_of_heads,
            key_dim=embedded_dimension,
            dropout=0.1
        )

        self.first_feed_forward_layer = layers.Dense(feed_forward_dimension, activation="relu")
        self.second_feed_forward_layer = layers.Dense(embedded_dimension)

        self.first_normalization_layer = layers.LayerNormalization()
        self.second_normalization_layer = layers.LayerNormalization()
        self.third_normalization_layer = layers.LayerNormalization()

        self.embedding = EmbedTokenAndPostionClass(
            embedded_dimension=embedded_dimension,
            sequence_length=sequence_length,
            vocabulary_size=vocabulary_size
        )

        self.out = layers.Dense(vocabulary_size, activation="softmax")

        self.first_dropout_layer = layers.Dropout(0.3)
        self.second_dropout_layer = layers.Dropout(0.5)
        self.supports_masking = True

    def call(self, inputs, encoder_outputs, training, mask=None):
        inputs = self.embedding(inputs)
        causal_mask = self.get_causal_attention_mask(inputs)

        # Combine the padding mask with the causal mask; without a padding mask,
        # fall back to the causal mask alone.
        padding_mask = None
        combined_mask = causal_mask
        if mask is not None:
            padding_mask = tf.cast(mask[:, :, tf.newaxis], dtype=tf.int32)
            combined_mask = tf.cast(mask[:, tf.newaxis, :], dtype=tf.int32)
            combined_mask = tf.minimum(combined_mask, causal_mask)

        first_attention_output = self.first_attention_layer(
            query=inputs,
            value=inputs,
            key=inputs,
            attention_mask=combined_mask,
            training=training,
        )
        first_normalization_output = self.first_normalization_layer(inputs + first_attention_output)

        second_attention_output = self.second_attention_layer(
            query=first_normalization_output,
            value=encoder_outputs,
            key=encoder_outputs,
            attention_mask=padding_mask,
            training=training,
        )
        second_normalization_output = self.second_normalization_layer(first_normalization_output + second_attention_output)

        output = self.first_feed_forward_layer(second_normalization_output)
        output = self.first_dropout_layer(output, training=training)
        output = self.second_feed_forward_layer(output)

        output = self.third_normalization_layer(output + second_normalization_output, training=training)
        output = self.second_dropout_layer(output, training=training)
        return self.out(output)

    def get_causal_attention_mask(self, inputs):
        input_shape = tf.shape(inputs)
        batch_size, sequence_length = input_shape[0], input_shape[1]
        i = tf.range(sequence_length)[:, tf.newaxis]
        j = tf.range(sequence_length)
        mask = tf.cast(i >= j, dtype="int32")
        mask = tf.reshape(mask, (1, input_shape[1], input_shape[1]))
        mult = tf.concat(
            [tf.expand_dims(batch_size, -1), tf.constant([1, 1], dtype=tf.int32)],
            axis=0,
        )
        return tf.tile(mask, mult)
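
# For example, for a sequence of length 4 the causal mask computed above is (per batch element)
#   [[1, 0, 0, 0],
#    [1, 1, 0, 0],
#    [1, 1, 1, 0],
#    [1, 1, 1, 1]]
# so each position may attend only to itself and to earlier positions.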

class ImageCaptionClass(keras.Model):
    def __init__(
        self, efficient_net_model, encoder_class, decoder_class, image_augmentation=None,
    ):
        super().__init__()
        self.efficient_net_model = efficient_net_model
        self.encoder_class = encoder_class
        self.decoder_class = decoder_class
        self.loss_tracker = keras.metrics.Mean(name="loss")
        self.acc_tracker = keras.metrics.Mean(name="accuracy")
        self.caption_to_image_ratio = 5
        self.image_augmentation = image_augmentation

    def calculate_loss(self, y_actual_value, y_predicted_value, mask):
        loss = self.loss(y_actual_value, y_predicted_value)
        mask = tf.cast(mask, dtype=loss.dtype)
        loss *= mask
        return tf.reduce_sum(loss) / tf.reduce_sum(mask)

    def calculate_accuracy(self, y_actual_value, y_predicted_value, mask):
        accuracy = tf.equal(y_actual_value, tf.argmax(y_predicted_value, axis=2))
        accuracy = tf.math.logical_and(mask, accuracy)
        accuracy = tf.cast(accuracy, dtype=tf.float32)
        mask = tf.cast(mask, dtype=tf.float32)
        return tf.reduce_sum(accuracy) / tf.reduce_sum(mask)

    def get_caption_loss_and_accuracy(self, image_embedded, batch_sequence, calculate_for_train=True):
        encoder_class_out = self.encoder_class(image_embedded, training=calculate_for_train)
        batch_sequence_input = batch_sequence[:, :-1]
        batch_sequence_actual = batch_sequence[:, 1:]
        mask = tf.math.not_equal(batch_sequence_actual, 0)
        batch_sequence_predicted = self.decoder_class(
            batch_sequence_input, encoder_class_out, training=calculate_for_train, mask=mask
        )
        loss = self.calculate_loss(batch_sequence_actual, batch_sequence_predicted, mask)
        acc = self.calculate_accuracy(batch_sequence_actual, batch_sequence_predicted, mask)
        return loss, acc

    def train_step(self, data):
        batch_image, batch_sequence = data
        batch_loss = 0
        batch_accuracy = 0

        if self.image_augmentation:
            batch_image = self.image_augmentation(batch_image)

        # 1. Get image embeddings
        image_embedded = self.efficient_net_model(batch_image)

        # 2. Pass each of the five captions one by one to the decoder_class
        # along with the encoder_class outputs and compute the loss as well as accuracy
        # for each caption.
        for i in range(self.caption_to_image_ratio):
            with tf.GradientTape() as gradient_tape:
                loss, acc = self.get_caption_loss_and_accuracy(
                    image_embedded, batch_sequence[:, i, :], calculate_for_train=True
                )

                # 3. Update loss and accuracy
                batch_loss += loss
                batch_accuracy += acc

            # 4. Get the list of all the trainable weights
            training_weights = (
                self.encoder_class.trainable_variables + self.decoder_class.trainable_variables
            )

            # 5. Get the gradients
            gradient_lists = gradient_tape.gradient(loss, training_weights)

            # 6. Update the trainable weights
            self.optimizer.apply_gradients(zip(gradient_lists, training_weights))

        # 7. Update the trackers
        batch_accuracy /= float(self.caption_to_image_ratio)
        self.loss_tracker.update_state(batch_loss)
        self.acc_tracker.update_state(batch_accuracy)

        # 8. Return the loss and accuracy values
        return {"loss": self.loss_tracker.result(), "acc": self.acc_tracker.result()}

    def test_step(self, data):
        batch_image, batch_sequence = data
        batch_loss = 0
        batch_accuracy = 0

        # 1. Get image embeddings
        image_embedded = self.efficient_net_model(batch_image)

        # 2. Pass each of the five captions one by one to the decoder_class
        # along with the encoder_class outputs and compute the loss as well as accuracy
        # for each caption.
        for i in range(self.caption_to_image_ratio):
            loss, acc = self.get_caption_loss_and_accuracy(
                image_embedded, batch_sequence[:, i, :], calculate_for_train=False
            )

            # 3. Update batch loss and batch accuracy
            batch_loss += loss
            batch_accuracy += acc

        batch_accuracy /= float(self.caption_to_image_ratio)

        # 4. Update the trackers
        self.loss_tracker.update_state(batch_loss)
        self.acc_tracker.update_state(batch_accuracy)

        # 5. Return the loss and accuracy values
        return {"loss": self.loss_tracker.result(), "acc": self.acc_tracker.result()}

    @property
    def metrics(self):
        # We need to list our metrics here so `reset_states()` can be
        # called automatically.
        return [self.loss_tracker, self.acc_tracker]

    def get_config(self):
        # Return a dictionary containing the configuration of the model.
        config = {
            "efficient_net_model": self.efficient_net_model,
            "encoder_class": self.encoder_class,
            "decoder_class": self.decoder_class,
            "caption_to_image_ratio": self.caption_to_image_ratio,
            "image_augmentation": self.image_augmentation,
        }
        return config


    def call(self, data):
        batch_image, batch_sequence = data
        batch_loss = 0
        batch_accuracy = 0

        if self.image_augmentation:
            batch_image = self.image_augmentation(batch_image)

        # 1. Get image embeddings
        image_embedded = self.efficient_net_model(batch_image)

        # 2. Pass each of the five captions one by one to the decoder_class
        # along with the encoder_class outputs and compute the loss as well as accuracy
        # for each caption.
        for i in range(self.caption_to_image_ratio):
            loss, acc = self.get_caption_loss_and_accuracy(
                image_embedded, batch_sequence[:, i, :], calculate_for_train=True
            )

            # 3. Update batch loss and batch accuracy
            batch_loss += loss
            batch_accuracy += acc

        batch_accuracy /= float(self.caption_to_image_ratio)

        # 4. Update the trackers
        self.loss_tracker.update_state(batch_loss)
        self.acc_tracker.update_state(batch_accuracy)

        # 5. Return the loss and accuracy values
        return {"loss": self.loss_tracker.result(), "acc": self.acc_tracker.result()}



cnn_model = prepare_cnn_model()
encoder = EncoderClass(embedded_dimension=embedded_dimension, dense_dimension=feed_forward_dimension, number_of_heads=1)
decoder = DecoderClass(embedded_dimension=embedded_dimension, feed_forward_dimension=feed_forward_dimension, number_of_heads=2)
caption_model = ImageCaptionClass(
    efficient_net_model=cnn_model, encoder_class=encoder, decoder_class=decoder, image_augmentation=image_augmentation,
)
caption_model

cross_entropy_loss_f = keras.losses.SparseCategoricalCrossentropy(reduction="none")

early_stopping_function = keras.callbacks.EarlyStopping(patience=3, restore_best_weights=True)

# A learning-rate schedule that subclasses Keras' LearningRateSchedule;
# it controls how quickly the model adjusts its parameters during the warm-up phase.

class LRSClass(keras.optimizers.schedules.LearningRateSchedule):
    def __init__(self, learning_rate_post_warmup, steps):
        super().__init__()
        self.learning_rate_post_warmup = learning_rate_post_warmup
        self.steps = steps

    def __call__(self, step):
        global_step = tf.cast(step, tf.float32)
        steps = tf.cast(self.steps, tf.float32)
        progress = global_step / steps
        learning_rate = self.learning_rate_post_warmup * progress
        return tf.cond(
            global_step < steps,
            lambda: learning_rate,
            lambda: self.learning_rate_post_warmup,
        )
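
# Worked example (illustrative numbers): with learning_rate_post_warmup=1e-4 and warmup_steps=100,
# this schedule gives 1e-6 at step 1, 5e-5 at step 50, and a constant 1e-4 from step 100 onwards.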

# Number of optimization steps required
num_train_steps = len(training_dataset) * epochs

# Number of steps over which the learning rate is gradually increased.
warmup_steps = num_train_steps // 15

lr_schedule = LRSClass(learning_rate_post_warmup=1e-4, steps=warmup_steps)

# Compile the model
caption_model.compile(optimizer=keras.optimizers.Adam(lr_schedule), loss=cross_entropy_loss_f)

# Fit the model
caption_model.fit(
    training_dataset,
    epochs=epochs,
    validation_data=validation_dataset,
    callbacks=[early_stopping_function],
)



caption_model.summary()


test_vocabulary = vectorization.get_vocabulary()
index_lookup = dict(zip(range(len(test_vocabulary)), test_vocabulary))
max_decoded_sentence_length = sequence_length - 1
valid_images = list(validation_data.keys())


def generate_caption():
    # Select a random image from the validation dataset
    validate_image = np.random.choice(valid_images)

    # Get the sample image and decode/resize it
    validate_image = decoder_to_resizer(validate_image)
    image = validate_image.numpy().clip(0, 255).astype(np.uint8)
    plt.imshow(image)
    plt.show()

    # Prepare the image and send it to the EfficientNet model
    image = tf.expand_dims(validate_image, 0)
    image = caption_model.efficient_net_model(image)

    # Pass the image features to the Transformer encoder
    encoded_img = caption_model.encoder_class(image, training=False)

    # Generate the caption using the Transformer decoder
    decoded_caption = "<start> "
    for i in range(max_decoded_sentence_length):
        tokenized_caption = vectorization([decoded_caption])[:, :-1]
        mask = tf.math.not_equal(tokenized_caption, 0)
        predictions = caption_model.decoder_class(
            tokenized_caption, encoded_img, training=False, mask=mask
        )
        sampled_token_index = np.argmax(predictions[0, i, :])
        sampled_token = index_lookup[sampled_token_index]
        if sampled_token == "<end>":
            break
        decoded_caption += " " + sampled_token

    decoded_caption = decoded_caption.replace("<start> ", "")
    decoded_caption = decoded_caption.replace(" <end>", "").strip()
    print("Predicted Caption: ", decoded_caption)

generate_caption()

generate_caption()

generate_caption()
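
# The loop inside generate_caption() is plain greedy decoding: at step i the caption generated so far
# is fed back through the decoder and the argmax token at position i is appended, stopping at "<end>"
# or after max_decoded_sentence_length tokens.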

def generate_caption_custom(img_path):
    # Caption a user-supplied image instead of one from the validation dataset

    validate_image = img_path
    print(validate_image)
    # Get the sample image and decode/resize it
    validate_image = decoder_to_resizer(validate_image)
    image = validate_image.numpy().clip(0, 255).astype(np.uint8)
    plt.imshow(image)
    plt.show()

    # Prepare the image and send it to the EfficientNet model
    image = tf.expand_dims(validate_image, 0)
    image = caption_model.efficient_net_model(image)

    # Pass the image features to the Transformer encoder
    encoded_img = caption_model.encoder_class(image, training=False)

    # Generate the caption using the Transformer decoder
    decoded_caption = "<start> "
    for i in range(max_decoded_sentence_length):
        tokenized_caption = vectorization([decoded_caption])[:, :-1]
        mask = tf.math.not_equal(tokenized_caption, 0)
        predictions = caption_model.decoder_class(
            tokenized_caption, encoded_img, training=False, mask=mask
        )
        sampled_token_index = np.argmax(predictions[0, i, :])
        sampled_token = index_lookup[sampled_token_index]
        if sampled_token == "<end>":
            break
        decoded_caption += " " + sampled_token

    decoded_caption = decoded_caption.replace("<start> ", "")
    decoded_caption = decoded_caption.replace(" <end>", "").strip()
    print("Predicted Caption: ", decoded_caption)

# generate_caption_custom("./image2.jpg")

# generate_caption_custom("./Document.jpeg")

def generate_with_link(url):
    file_name = wget.download(url)
    generate_caption_custom(file_name)

link = 'https://media.istockphoto.com/id/1346503960/photo/school-children-with-a-parachute.jpg?s=1024x1024&w=is&k=20&c=HNOFWi02yU4NB_98iIWKHbzpGlWPYcfQagnPthD2eOo='
generate_with_link(link)

caption_model.save('path/to/location', save_format='tf')

image_shape = (*image_size, 3)  # Assuming RGB images
caption_shape = (5, sequence_length)  # For 5 captions with max sequence length

caption_model.build(input_shape=[(None, *image_shape), (None, *caption_shape)])

# Save the model
path_to_save = 'path_to_save'
caption_model.save(path_to_save)
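
# Note (not part of the original commit): ImageCaptionClass is a subclassed Keras model, so these
# save() calls rely on the TensorFlow SavedModel format; reloading it later would typically use
# keras.models.load_model() with the custom classes importable, and may need a from_config()
# counterpart to get_config() above.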