vvd2003 committed on
Commit
bbc5e76
1 Parent(s): c78dd05

Upload 15 files

app.py ADDED
@@ -0,0 +1,86 @@
+ # -*- coding: utf-8 -*-
+ """
+ @author: Van Duc <vvduc03@gmail.com>
+ """
+ """Import necessary packages"""
+ import os
+ import argparse
+ import config
+ import gradio as gr
+
+ from model import ImgCaption_Model
+ from dataset import Vocabulary
+ from timeit import default_timer as timer
+ from utils import load_check_point_to_use
+
+ # Initialize and parse command-line arguments
+ def get_args():
+     parse = argparse.ArgumentParser()
+     parse.add_argument('--save-path', '-s', type=str, default=config.save_path, help='directory containing the saved model checkpoints')
+     parse.add_argument('--transform', default=config.transform, help='Compose transform applied to input images')
+     parse.add_argument('--embed-size', default=config.embed_size, help='size of the embedding vectors')
+     parse.add_argument('--hidden-size', default=config.hidden_size, help='number of hidden units in the RNN')
+     parse.add_argument('--num-layer', default=config.num_layer, help='number of stacked LSTM layers')
+     parse.add_argument('--num-workers', default=config.num_workers, help='number of CPU workers used to load data')
+     args = parse.parse_args()
+     return args
+
+ # Load vocabulary file
+ vocab = Vocabulary()
+ vocab.read_vocab()
+
+ # Load arguments
+ args = get_args()
+
+ # Build model
+ model = ImgCaption_Model(args.embed_size, args.hidden_size, len(vocab), args.num_layer)
+
+ # Load saved weights
+ load_check_point_to_use(args.save_path + '/best.pt', model, 'cpu')
+
+ def caption(img):
+     """Transform the image, generate a caption, and return the caption and the time taken."""
+     # Start the timer
+     start_time = timer()
+
+     # Transform the target image
+     img = args.transform(img)
+
+     # Put the model into evaluation mode and caption the image
+     model.eval()
+     prompt = " ".join(model.caption_image(img.unsqueeze(0), vocab))
+
+     # Calculate the prediction time
+     pred_time = round(timer() - start_time, 5)
+
+     # Return the caption and prediction time
+     return prompt, pred_time
+
+
+ # Create title, description and article strings
+ def main():
+     title = "Image Captioning 🖼➡️🆎"
+     description = "A model that describes the content of a picture"
+     article = "Created on [GITHUB](https://github.com/vvduc1803/Image-Captioning)."
+
+     # Create examples list from the "examples/" directory
+     example_list = [["examples/" + example] for example in os.listdir("examples")]
+
+     # Create the Gradio demo
+     demo = gr.Interface(fn=caption,                            # mapping function from input to output
+                         inputs=gr.Image(type="pil"),           # input: a PIL image
+                         outputs=[gr.Textbox(label="Caption"),  # two outputs, because caption() returns two values
+                                  gr.Number(label="Prediction time (s)")],
+                         examples=example_list,
+                         title=title,
+                         description=description,
+                         article=article)
+
+     # Launch the demo!
+     demo.launch(server_name="127.0.0.1", server_port=1234, share=True)
+
+ if __name__ == '__main__':
+     main()
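Note that app.py and dataset.py both import a `config` module that is not among the 15 uploaded files, so the demo will not run until it is supplied. Below is a minimal sketch of what it might contain, inferred from the attributes accessed above; every concrete value is an assumption, not something taken from this commit.

# config.py -- a minimal sketch; this module is imported by app.py and dataset.py
# but is not part of this commit, so every value below is an assumption.
import torchvision.transforms as transforms

save_path = 'savedir'      # directory holding best.pt / last.pt
embed_size = 256           # must match the checkpoint that is loaded
hidden_size = 256
num_layer = 1
num_workers = 2
batch_size = 16

# Paths used by dataset.CoCoDataset (COCO-style layout, assumed)
train = 'data/train'
images = 'images'
captions = 'captions.json'

# Preprocessing applied to every input image (assumed; Resize + ToTensor at minimum)
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
])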
dataset.py ADDED
@@ -0,0 +1,191 @@
+ # -*- coding: utf-8 -*-
+ """
+ @author: Van Duc <vvduc03@gmail.com>
+ """
+ """Import necessary packages"""
+ import os
+ import spacy  # for the tokenizer
+ import torch
+ import config
+ import json
+
+ from torch.nn.utils.rnn import pad_sequence  # pad batch
+ from torch.utils.data import DataLoader, Dataset
+ from PIL import Image
+
+ # Download with: python -m spacy download en_core_web_sm
+ spacy_eng = spacy.load("en_core_web_sm")
+
+ class Vocabulary:
+     def __init__(self, freq_threshold=5):
+         # Initialize two dictionaries: index-to-string and string-to-index
+         self.itos = {0: "<PAD>", 1: "<SOS>", 2: "<EOS>", 3: "<UNK>"}
+         self.stoi = {"<PAD>": 0, "<SOS>": 1, "<EOS>": 2, "<UNK>": 3}
+
+         # Minimum frequency for a word to be added to the vocabulary
+         self.freq_threshold = freq_threshold
+
+     def __len__(self):
+         return len(self.itos)
+
+     @staticmethod
+     def tokenizer_eng(text):
+         return [tok.text.lower() for tok in spacy_eng.tokenizer(text)]
+
+     def build_vocabulary(self, sentence_list):
+         frequencies = {}
+         idx = 4
+
+         for sentence in sentence_list:
+             for word in self.tokenizer_eng(sentence):
+                 if word not in frequencies:
+                     frequencies[word] = 1
+                 else:
+                     frequencies[word] += 1
+
+                 # Add the word once it reaches the frequency threshold
+                 if frequencies[word] == self.freq_threshold:
+                     self.stoi[word] = idx
+                     self.itos[idx] = word
+                     idx += 1
+
+     def read_vocab(self, file_name='vocab.json'):
+         """
+         Load the saved vocabulary file and replace the 'index to string' and 'string to index' dictionaries
+         """
+         with open(file_name, 'r') as vocab_file:
+             vocab = json.load(vocab_file)
+
+         # JSON keys are strings, so cast the itos keys back to int
+         self.itos = {int(key): value for key, value in vocab['itos'].items()}
+         self.stoi = vocab['stoi']
+
+     def create_vocab(self, file_name='vocab.json'):
+         # Serialize both dictionaries to JSON and write them to the file
+         vocab = json.dumps({'itos': self.itos,
+                             'stoi': self.stoi})
+         with open(file_name, "w") as f:
+             f.write(vocab)
+
+     def numericalize(self, text):
+         tokenized_text = self.tokenizer_eng(text)
+
+         return [
+             self.stoi[token] if token in self.stoi else self.stoi["<UNK>"]
+             for token in tokenized_text
+         ]
+
+ class CoCoDataset(Dataset):
+     def __init__(self, root_dir, transform=None, freq_threshold=5):
+         self.root_dir = root_dir
+         self.freq_threshold = freq_threshold
+         self.transform = transform
+
+         with open(os.path.join(self.root_dir, config.captions), 'r') as captions_path:
+             captions_file = json.load(captions_path)
+
+         # Get the image-id and caption columns
+         self.imageID_list = [captions['image_id'] for captions in captions_file['annotations']]
+         self.captions_list = [captions['caption'] for captions in captions_file['annotations']]
+
+         # Load the pre-built vocabulary file
+         # (it can be rebuilt from the captions with build_vocabulary + create_vocab)
+         self.vocab = Vocabulary(self.freq_threshold)
+         self.vocab.read_vocab()
+
+     def __len__(self):
+         return len(self.imageID_list)
+
+     def __getitem__(self, index):
+         # Load the caption and image at this index
+         caption = self.captions_list[index]
+         img_id = str(self.imageID_list[index]).zfill(12) + '.jpg'
+         self.img = Image.open(os.path.join(self.root_dir, config.images, img_id)).convert("RGB")
+
+         # Transform the image
+         img = self.img
+         if self.transform:
+             img = self.transform(self.img)
+
+         # Numericalize the caption: <SOS> + tokens + <EOS>
+         numericalized_caption = [self.vocab.stoi["<SOS>"]]
+         numericalized_caption += self.vocab.numericalize(caption)
+         numericalized_caption.append(self.vocab.stoi["<EOS>"])
+
+         return img, torch.tensor(numericalized_caption)
+
+ class MyCollate:
+     def __init__(self, pad_idx):
+         self.pad_idx = pad_idx
+
+     def __call__(self, batch):
+         imgs = [item[0].unsqueeze(0) for item in batch]
+         imgs = torch.cat(imgs, dim=0)
+         targets = [item[1] for item in batch]
+         targets = pad_sequence(targets, batch_first=False, padding_value=self.pad_idx)
+
+         return imgs, targets
+
+
+ def get_loader(
+     root_folder,
+     transform,
+     batch_size=16,
+     num_workers=4,
+     shuffle=True,
+     pin_memory=True
+ ):
+     dataset = CoCoDataset(root_folder, transform=transform)
+
+     pad_idx = dataset.vocab.stoi["<PAD>"]
+
+     loader = DataLoader(
+         dataset=dataset,
+         batch_size=batch_size,
+         num_workers=num_workers,
+         shuffle=shuffle,
+         pin_memory=pin_memory,
+         collate_fn=MyCollate(pad_idx=pad_idx),
+     )
+     return dataset, loader
+
+
+ if __name__ == "__main__":
+     from utils import plot_examples
+     from model import ImgCaption_Model
+
+     train_dataset, train_loader = get_loader(root_folder=config.train,
+                                              transform=config.transform,
+                                              batch_size=config.batch_size,
+                                              num_workers=config.num_workers,
+                                              shuffle=True)
+
+     model = ImgCaption_Model(256, 256, len(train_dataset.vocab), 1)
+     plot_examples(model, 'cuda', train_dataset, train_dataset.vocab)
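Vocabulary.read_vocab() expects a pre-built vocab.json, which is included in this commit. For reference, here is a one-off sketch of how such a file could be regenerated from the training captions using build_vocabulary and create_vocab; the annotation path and the config attributes it uses are assumptions.

# build_vocab.py -- one-off sketch for regenerating vocab.json; the file layout
# and config attributes used here are assumptions, not part of this commit.
import json
import os

import config
from dataset import Vocabulary

# Load the COCO-style annotation file (assumed location: config.train/config.captions)
with open(os.path.join(config.train, config.captions), 'r') as f:
    annotations = json.load(f)['annotations']
captions = [entry['caption'] for entry in annotations]

# Build the vocabulary from the training captions and write vocab.json
vocab = Vocabulary(freq_threshold=5)
vocab.build_vocabulary(captions)
vocab.create_vocab('vocab.json')
print(f"Saved vocab.json with {len(vocab)} tokens")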
examples/000000000139.jpg ADDED
examples/000000000785.jpg ADDED
examples/000000005477.jpg ADDED
examples/good1.png ADDED
examples/good2.png ADDED
examples/good3.png ADDED
examples/good6.png ADDED
model.py ADDED
@@ -0,0 +1,86 @@
+ # -*- coding: utf-8 -*-
+ """
+ @author: Van Duc <vvduc03@gmail.com>
+ """
+ """Import necessary packages"""
+ import torch
+ import torch.nn as nn
+ import torchvision.models as models
+
+ class CNN(nn.Module):
+     def __init__(self, embed_size=256, train_model=False):
+         super().__init__()
+
+         # Load a pretrained EfficientNet-B2 backbone
+         self.model = models.efficientnet_b2(weights=models.EfficientNet_B2_Weights.DEFAULT)
+
+         # Freeze all layers of the backbone
+         if not train_model:
+             for param in self.model.parameters():
+                 param.requires_grad = False
+
+         # Replace the head of the model (the new head is trainable by default)
+         self.model.classifier = nn.Sequential(nn.Linear(1408, embed_size),
+                                               nn.ReLU(),
+                                               nn.Dropout(0.5))
+
+     def forward(self, x):
+         return self.model(x)
+
+ class RNN(nn.Module):
+     def __init__(self, hidden_size, vocab_size, num_layers, embed_size=256):
+         super().__init__()
+         # Caption embedding
+         self.embed = nn.Embedding(vocab_size, embed_size)
+
+         # Initialize the remaining layers
+         self.lstm = nn.LSTM(embed_size, hidden_size, num_layers)
+         self.linear = nn.Linear(hidden_size, vocab_size)
+         self.drop_out = nn.Dropout(0.5)
+
+     def forward(self, features, captions):
+         embeddings = self.drop_out(self.embed(captions))
+         # Prepend the image feature as the first step of the sequence
+         embeddings = torch.cat((features.unsqueeze(0), embeddings), dim=0)
+         hidden, _ = self.lstm(embeddings)
+         outputs = self.linear(hidden)
+
+         return outputs
+
+ class ImgCaption_Model(nn.Module):
+     def __init__(self, embed_size, hidden_size, vocab_size, num_layers):
+         super().__init__()
+         self.CNN = CNN(embed_size)
+         self.RNN = RNN(hidden_size, vocab_size, num_layers, embed_size)
+
+     def forward(self, images, captions):
+         features = self.CNN(images)
+         outputs = self.RNN(features, captions)
+
+         return outputs
+
+     def caption_image(self, image, vocab, max_length=50):
+         """Greedily generate a caption for a single image (batch size 1)."""
+         result = []
+
+         with torch.inference_mode():
+             features = self.CNN(image)
+             state = None
+             for _ in range(max_length):
+                 hidden, state = self.RNN.lstm(features, state)
+                 output = self.RNN.linear(hidden)
+                 predict = output.argmax(dim=1)
+
+                 if vocab.itos[predict.item()] == "<EOS>":
+                     break
+
+                 result.append(predict.item())
+                 features = self.RNN.embed(predict)
+
+         # The first prediction corresponds to the image-feature step (i.e. "<SOS>"), so skip it
+         return [vocab.itos[idx] for idx in result[1:]]
+
+ if __name__ == '__main__':
+     pass
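A minimal shape check for ImgCaption_Model, under the assumption of a 1000-token vocabulary and 224x224 inputs (illustrative values, not taken from this repo); instantiating CNN downloads the pretrained EfficientNet-B2 weights on first use.

# smoke_test.py -- a minimal shape check for ImgCaption_Model; the sizes below
# are assumptions chosen for illustration only.
import torch
from model import ImgCaption_Model

vocab_size = 1000
model = ImgCaption_Model(embed_size=256, hidden_size=256, vocab_size=vocab_size, num_layers=1)

images = torch.randn(4, 3, 224, 224)              # batch of 4 RGB images
captions = torch.randint(0, vocab_size, (12, 4))  # (seq_len, batch) token indices

# Training-style forward pass: one logit vector per time step and batch element,
# with the image feature prepended as an extra first step
outputs = model(images, captions)
print(outputs.shape)  # torch.Size([13, 4, 1000])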
requirement.txt ADDED
@@ -0,0 +1,4 @@
+ gradio
+ torch
+ spacy
+ torchvision
savedir/best.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:cdac83fb7cfc485259434762661b38b749cfca2df720fd583e36bac929a3a968
+ size 104784308
savedir/last.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:bdef3c9612113193c97196cebd5ef3b9115aceb3ba60b572e8897217b3c973cf
+ size 104784308
utils.py ADDED
@@ -0,0 +1,105 @@
+ # -*- coding: utf-8 -*-
+ """
+ @author: Van Duc <vvduc03@gmail.com>
+ """
+ """Import necessary packages"""
+ import os
+ import torch
+ import random
+ import matplotlib.pyplot as plt
+
+ def read_caption(num_caption, vocab):
+     """
+     Convert a caption from token indices to words
+     Args:
+         num_caption: caption as a tensor of token indices
+         vocab: vocabulary object
+     Returns:
+         A list of words (e.g. ['a', 'dog', 'in', 'the', 'sky'])
+     """
+     str_caption = []
+     for cap in num_caption[1:]:  # skip the leading <SOS> token
+         if vocab.itos[cap.item()] == "<EOS>":
+             break
+         str_caption.append(cap)
+
+     return [vocab.itos[idx.item()] for idx in str_caption]
+
+ def plot_examples(model, device, dataset, vocab, num_examples=20):
+     """
+     Plot images together with their ground-truth and predicted captions
+
+     Args:
+         model: pretrained model used to predict captions
+         device: target device ('cpu' or 'cuda')
+         dataset: dataset to sample examples from
+         vocab: vocabulary object
+         num_examples: number of examples to plot
+
+     Returns:
+         None (prints both captions and shows one figure per example)
+     """
+     model.eval()
+     model.to(device)
+
+     # Loop over examples
+     for example in range(num_examples):
+         # Take a random example from the dataset
+         image, caption = dataset[random.randint(0, len(dataset) - 1)]
+         image = image.to(device)
+
+         # Print output
+         correct = f"Example {example+1} CORRECT: " + " ".join(read_caption(caption, vocab))
+         output = f"Example {example+1} OUTPUT: " + " ".join(model.caption_image(image.unsqueeze(0), vocab))
+         print(correct)
+         print(output)
+         print('----------------------------------------------')
+
+         # Plot the image and both captions
+         fig, ax = plt.subplots()
+         ax.imshow(dataset.img)
+         ax.axis('off')
+         fig.text(0.5, 0.05,
+                  correct + '\n' + output,
+                  ha="center")
+
+         plt.show()
+
+     model.train()
+
+
+ def save_checkpoint(model, optimizer, epoch, save_path, last_loss, best_loss):
+     print("=> Saving checkpoint")
+     checkpoint = {
+         "epoch": epoch + 1,
+         "model": model.state_dict(),
+         "optimizer": optimizer.state_dict()
+     }
+
+     torch.save(checkpoint, os.path.join(save_path, "last.pt"))
+     if last_loss < best_loss:
+         best_loss = last_loss
+         torch.save(checkpoint, os.path.join(save_path, "best.pt"))
+
+     return best_loss
+
+ def load_check_point_to_use(checkpoint_file, model, device):
+     print("=> Loading checkpoint")
+     checkpoint = torch.load(checkpoint_file, map_location=device)
+     model.load_state_dict(checkpoint["model"])
+
+     return model
+
+ def load_checkpoint_to_continue(checkpoint_file, model, optimizer, lr, device):
+     print("=> Loading checkpoint")
+     checkpoint = torch.load(checkpoint_file + '/last.pt', map_location=device)
+     model.load_state_dict(checkpoint["model"])
+     optimizer.load_state_dict(checkpoint["optimizer"])
+     epoch = checkpoint["epoch"]
+
+     # Reset the learning rate; otherwise the optimizer keeps the old checkpoint's lr,
+     # which can lead to many hours of debugging :\
+     for param_group in optimizer.param_groups:
+         param_group["lr"] = lr
+
+     return model, epoch
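save_checkpoint and load_checkpoint_to_continue are written for a training script that is not part of this commit. The sketch below shows how they could be wired into a loop; the loss setup, hyperparameters, and resume policy are assumptions, not the author's actual training code.

# train_loop.py -- a sketch of how the checkpoint helpers could be used; the
# training script is not in this commit, so the loop body is an assumption.
import torch
import config
from dataset import get_loader
from model import ImgCaption_Model
from utils import save_checkpoint, load_checkpoint_to_continue

device = 'cuda' if torch.cuda.is_available() else 'cpu'
dataset, loader = get_loader(config.train, config.transform, batch_size=config.batch_size)

model = ImgCaption_Model(config.embed_size, config.hidden_size, len(dataset.vocab), config.num_layer).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=3e-4)
criterion = torch.nn.CrossEntropyLoss(ignore_index=dataset.vocab.stoi["<PAD>"])

start_epoch, best_loss = 0, float('inf')
# Optionally resume from savedir/last.pt (assumed resume policy):
# model, start_epoch = load_checkpoint_to_continue(config.save_path, model, optimizer, lr=3e-4, device=device)

for epoch in range(start_epoch, 10):
    total_loss = 0.0
    for imgs, captions in loader:
        imgs, captions = imgs.to(device), captions.to(device)
        outputs = model(imgs, captions[:-1])  # predict each next token given the previous ones
        loss = criterion(outputs.reshape(-1, outputs.shape[2]), captions.reshape(-1))
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    # Save last.pt every epoch, and best.pt whenever the epoch loss improves
    best_loss = save_checkpoint(model, optimizer, epoch, config.save_path, total_loss, best_loss)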
vocab.json ADDED
The diff for this file is too large to render. See raw diff