ovi054 committed
Commit
00abfdc
1 Parent(s): b1fd26f

first commit

Files changed (8)
  1. app.py +98 -0
  2. data_loader.py +152 -0
  3. model.py +64 -0
  4. models/decoder-3.pkl +3 -0
  5. models/encoder-3.pkl +3 -0
  6. models/vocab.pkl +3 -0
  7. requirements.txt +1 -0
  8. vocabulary.py +95 -0
app.py ADDED
@@ -0,0 +1,98 @@
+ from PIL import Image
+ import numpy as np
+ from torchvision import transforms
+ import torch
+
+ from data_loader import get_loader
+
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+ import os
+ from model import EncoderCNN, DecoderRNN
+
+ # The saved models to load.
+ encoder_file = 'encoder-3.pkl'
+ decoder_file = 'decoder-3.pkl'
+
+ # Embedding and hidden sizes used when the models were trained.
+ embed_size = 256
+ hidden_size = 512
+
+ # The size of the vocabulary.
+ vocab_size = 8855
+
+ # Initialize the encoder and decoder, and set each to inference mode.
+ encoder = EncoderCNN(embed_size)
+ encoder.eval()
+ decoder = DecoderRNN(embed_size, hidden_size, vocab_size)
+ decoder.eval()
+
+ # Load the trained weights from the repo's models/ folder.
+ encoder.load_state_dict(torch.load(os.path.join('models', encoder_file), map_location=torch.device('cpu')))
+ decoder.load_state_dict(torch.load(os.path.join('models', decoder_file), map_location=torch.device('cpu')))
+
+ # Move models to GPU if CUDA is available.
+ encoder.to(device)
+ decoder.to(device)
+
+
+ def process_image(image):
+     '''Scales, crops, and normalizes a PIL image for a PyTorch model.'''
+     transformation = transforms.Compose([
+         transforms.Resize(256),                      # smaller edge of image resized to 256
+         transforms.RandomCrop(224),                  # get 224x224 crop from random location
+         transforms.ToTensor(),                       # convert the PIL Image to a tensor
+         transforms.Normalize((0.485, 0.456, 0.406),  # normalize image for pre-trained model
+                              (0.229, 0.224, 0.225))])
+     return transformation(image)
+
+
+ def function(img_np):
+     '''Converts a numpy image into an RGB PIL image plus its pre-processed tensor.'''
+     PIL_image = Image.fromarray(img_np).convert('RGB')
+     orig_image = np.array(PIL_image)
+     image = process_image(PIL_image)
+
+     # return original image and pre-processed image tensor
+     return orig_image, image
+
+
+ def clean_sentence(output):
+     '''Turns a list of predicted word ids into a caption string.'''
+     sentence = ''
+     for i in output:
+         word = data_loader.dataset.vocab.idx2word[i]
+         if i == 0:    # <start> token: skip it
+             continue
+         if i == 1:    # <end> token: stop
+             break
+         if i == 18:   # this id is appended without a leading space (likely punctuation)
+             sentence = sentence + word
+         else:
+             sentence = sentence + ' ' + word
+
+     return sentence.strip()
+
+
+ # The test-mode loader is only needed here for its vocabulary (idx2word).
+ data_loader = get_loader(transform=transforms.ToTensor(), mode='test')
+
+
+ def get_caption(image):
+     orig_image, image = function(image)
+     image = image.unsqueeze(0)
+     # Optional debug display (requires matplotlib):
+     # plt.imshow(np.squeeze(orig_image))
+     # plt.title('Sample Image')
+     # plt.show()
+     image = image.to(device)
+     features = encoder(image).unsqueeze(1)
+     output = decoder.sample(features)
+     sentence = clean_sentence(output)
+     return sentence
+
+
+ import gradio as gr
+
+ # The interface takes an uploaded image and returns the generated caption text.
+ demo = gr.Interface(fn=get_caption, inputs="image", outputs="text")
+ demo.launch()
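
For reference, a minimal sketch of what the pre-processing above produces, assuming any local RGB image (sample.jpg is a placeholder name); CenterCrop is used here instead of RandomCrop only to make the check deterministic:

from PIL import Image
from torchvision import transforms

# Same pre-processing as process_image(), but deterministic (CenterCrop).
transformation = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))])

img = Image.open('sample.jpg').convert('RGB')   # placeholder path
tensor = transformation(img)
print(tensor.shape)                # torch.Size([3, 224, 224])
print(tensor.unsqueeze(0).shape)   # torch.Size([1, 3, 224, 224]), the batched shape the encoder expects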
data_loader.py ADDED
@@ -0,0 +1,152 @@
+ import nltk
+ import os
+ import torch
+ import torch.utils.data as data
+ from vocabulary import Vocabulary
+ from PIL import Image
+ from pycocotools.coco import COCO
+ import numpy as np
+ from tqdm import tqdm
+ import random
+ import json
+
+ def get_loader(transform,
+                mode='train',
+                batch_size=1,
+                vocab_threshold=None,
+                vocab_file='models/vocab.pkl',
+                start_word="<start>",
+                end_word="<end>",
+                unk_word="<unk>",
+                vocab_from_file=True,
+                num_workers=0,
+                cocoapi_loc='/opt'):
+     """Returns the data loader.
+     Args:
+       transform: Image transform.
+       mode: One of 'train' or 'test'.
+       batch_size: Batch size (if in testing mode, must have batch_size=1).
+       vocab_threshold: Minimum word count threshold.
+       vocab_file: File containing the vocabulary.
+       start_word: Special word denoting sentence start.
+       end_word: Special word denoting sentence end.
+       unk_word: Special word denoting unknown words.
+       vocab_from_file: If False, create vocab from scratch & override any existing vocab_file.
+                        If True, load vocab from existing vocab_file, if it exists.
+       num_workers: Number of subprocesses to use for data loading.
+       cocoapi_loc: The location of the folder containing the COCO API: https://github.com/cocodataset/cocoapi
+     """
+
+     assert mode in ['train', 'test'], "mode must be one of 'train' or 'test'."
+     if not vocab_from_file:
+         assert mode == 'train', "To generate vocab from captions file, must be in training mode (mode='train')."
+
+     # Based on mode (train, test), obtain img_folder and annotations_file.
+     if mode == 'train':
+         if vocab_from_file:
+             assert os.path.exists(vocab_file), "vocab_file does not exist. Change vocab_from_file to False to create vocab_file."
+         img_folder = os.path.join(cocoapi_loc, 'cocoapi/images/train2014/')
+         annotations_file = os.path.join(cocoapi_loc, 'cocoapi/annotations/captions_train2014.json')
+     if mode == 'test':
+         assert batch_size == 1, "Please change batch_size to 1 if testing your model."
+         assert os.path.exists(vocab_file), "Must first generate vocab.pkl from training data."
+         assert vocab_from_file, "Change vocab_from_file to True."
+         # Hard-coded paths from the original Colab training environment.
+         img_folder = '/content/opt/cocoapi/images/test2014'
+         annotations_file = '/content/gdrive/MyDrive/image_info_test2014.json'
+
+     # COCO caption dataset.
+     dataset = CoCoDataset(transform=transform,
+                           mode=mode,
+                           batch_size=batch_size,
+                           vocab_threshold=vocab_threshold,
+                           vocab_file=vocab_file,
+                           start_word=start_word,
+                           end_word=end_word,
+                           unk_word=unk_word,
+                           annotations_file=annotations_file,
+                           vocab_from_file=vocab_from_file,
+                           img_folder=img_folder)
+
+     if mode == 'train':
+         # Randomly sample a caption length, and sample indices with that length.
+         indices = dataset.get_train_indices()
+         # Create and assign a batch sampler to retrieve a batch with the sampled indices.
+         initial_sampler = data.sampler.SubsetRandomSampler(indices=indices)
+         # Data loader for the COCO dataset.
+         data_loader = data.DataLoader(dataset=dataset,
+                                       num_workers=num_workers,
+                                       batch_sampler=data.sampler.BatchSampler(sampler=initial_sampler,
+                                                                               batch_size=dataset.batch_size,
+                                                                               drop_last=False))
+     else:
+         data_loader = data.DataLoader(dataset=dataset,
+                                       batch_size=dataset.batch_size,
+                                       shuffle=True,
+                                       num_workers=num_workers)
+
+     return data_loader
+
+
+ class CoCoDataset(data.Dataset):
+
+     def __init__(self, transform, mode, batch_size, vocab_threshold, vocab_file, start_word,
+                  end_word, unk_word, annotations_file, vocab_from_file, img_folder):
+         self.transform = transform
+         self.mode = mode
+         self.batch_size = batch_size
+         self.vocab = Vocabulary(vocab_threshold, vocab_file, start_word,
+                                 end_word, unk_word, annotations_file, vocab_from_file)
+         self.img_folder = img_folder
+         if self.mode == 'train':
+             self.coco = COCO(annotations_file)
+             self.ids = list(self.coco.anns.keys())
+             print('Obtaining caption lengths...')
+             all_tokens = [nltk.tokenize.word_tokenize(str(self.coco.anns[self.ids[index]]['caption']).lower()) for index in tqdm(np.arange(len(self.ids)))]
+             self.caption_lengths = [len(token) for token in all_tokens]
+         else:
+             test_info = json.loads(open(annotations_file).read())
+             self.paths = [item['file_name'] for item in test_info['images']]
+
+     def __getitem__(self, index):
+         # Obtain image and caption if in training mode.
+         if self.mode == 'train':
+             ann_id = self.ids[index]
+             caption = self.coco.anns[ann_id]['caption']
+             img_id = self.coco.anns[ann_id]['image_id']
+             path = self.coco.loadImgs(img_id)[0]['file_name']
+
+             # Convert image to tensor and pre-process using transform.
+             image = Image.open(os.path.join(self.img_folder, path)).convert('RGB')
+             image = self.transform(image)
+
+             # Convert caption to tensor of word ids.
+             tokens = nltk.tokenize.word_tokenize(str(caption).lower())
+             caption = []
+             caption.append(self.vocab(self.vocab.start_word))
+             caption.extend([self.vocab(token) for token in tokens])
+             caption.append(self.vocab(self.vocab.end_word))
+             caption = torch.Tensor(caption).long()
+
+             # Return pre-processed image and caption tensors.
+             return image, caption
+
+         # Obtain image only if in test mode.
+         else:
+             path = self.paths[index]
+
+             # Convert image to tensor and pre-process using transform.
+             PIL_image = Image.open(os.path.join(self.img_folder, path)).convert('RGB')
+             orig_image = np.array(PIL_image)
+             image = self.transform(PIL_image)
+
+             # Return original image and pre-processed image tensor.
+             return orig_image, image
+
+     def get_train_indices(self):
+         # Sample one caption length, then sample batch_size caption indices with that length.
+         sel_length = np.random.choice(self.caption_lengths)
+         all_indices = np.where([self.caption_lengths[i] == sel_length for i in np.arange(len(self.caption_lengths))])[0]
+         indices = list(np.random.choice(all_indices, size=self.batch_size))
+         return indices
+
+     def __len__(self):
+         if self.mode == 'train':
+             return len(self.ids)
+         else:
+             return len(self.paths)
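
As a usage note, a sketch of how get_loader is typically called in training mode, assuming the COCO images and annotations are laid out under cocoapi_loc as the docstring describes and that NLTK's punkt tokenizer data is available; batch_size=64 and vocab_threshold=5 are illustrative values, not taken from this commit:

import nltk
from torchvision import transforms
from data_loader import get_loader

nltk.download('punkt')  # tokenizer data needed by word_tokenize

transform_train = transforms.Compose([
    transforms.Resize(256),
    transforms.RandomCrop(224),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))])

loader = get_loader(transform=transform_train,
                    mode='train',
                    batch_size=64,
                    vocab_threshold=5,       # keep words that appear at least 5 times
                    vocab_from_file=False)   # build vocab.pkl from the captions

images, captions = next(iter(loader))
print(images.shape)    # torch.Size([64, 3, 224, 224])
print(captions.shape)  # torch.Size([64, L]); all captions in a batch share one length L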
model.py ADDED
@@ -0,0 +1,64 @@
+ import torch
+ import torch.nn as nn
+ import torchvision.models as models
+
+
+ class EncoderCNN(nn.Module):
+     def __init__(self, embed_size):
+         super(EncoderCNN, self).__init__()
+         # Pre-trained ResNet-50 backbone, frozen, with the classification head replaced
+         # by a linear layer that projects the features to the embedding size.
+         resnet = models.resnet50(pretrained=True)
+         for param in resnet.parameters():
+             param.requires_grad_(False)
+
+         modules = list(resnet.children())[:-1]
+         self.resnet = nn.Sequential(*modules)
+         self.embed = nn.Linear(resnet.fc.in_features, embed_size)
+
+     def forward(self, images):
+         features = self.resnet(images)
+         features = features.view(features.size(0), -1)
+         features = self.embed(features)
+         return features
+
+
+ class DecoderRNN(nn.Module):
+     def __init__(self, embed_size, hidden_size, vocab_size, num_layers=1):
+         super(DecoderRNN, self).__init__()
+
+         self.hidden_dim = hidden_size
+
+         self.embed = nn.Embedding(vocab_size, embed_size)
+         self.lstm = nn.LSTM(embed_size, hidden_size, num_layers, batch_first=True)
+         self.linear = nn.Linear(hidden_size, vocab_size)
+         self.hidden = (torch.zeros(1, 1, hidden_size), torch.zeros(1, 1, hidden_size))
+
+     def forward(self, features, captions):
+         # Embed the captions (dropping the final <end> token) and prepend the image
+         # features as the first step of the LSTM input sequence.
+         cap_embedding = self.embed(captions[:, :-1])
+         embeddings = torch.cat((features.unsqueeze(1), cap_embedding), 1)
+         lstm_out, self.hidden = self.lstm(embeddings)
+         outputs = self.linear(lstm_out)
+         return outputs
+
+     def sample(self, inputs, hidden=None, max_len=20):
+         """Accepts a pre-processed image tensor (inputs) and returns a predicted sentence (list of tensor ids of length max_len)."""
+         res = []
+         for i in range(max_len):
+             # Run one LSTM step, pick the most likely word id, and feed its embedding back in.
+             outputs, hidden = self.lstm(inputs, hidden)
+             outputs = self.linear(outputs.squeeze(1))
+             target_index = outputs.max(1)[1]
+             res.append(target_index.item())
+             inputs = self.embed(target_index).unsqueeze(1)
+         return res
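
To show how the two modules fit together at inference time, a small sketch using a random tensor in place of a real pre-processed image (the first run downloads the ResNet-50 weights; with an untrained decoder the ids are meaningless, so this only illustrates shapes and the greedy sampling loop):

import torch
from model import EncoderCNN, DecoderRNN

embed_size, hidden_size, vocab_size = 256, 512, 8855  # values used in app.py

encoder = EncoderCNN(embed_size).eval()
decoder = DecoderRNN(embed_size, hidden_size, vocab_size).eval()

with torch.no_grad():
    image = torch.randn(1, 3, 224, 224)   # stand-in for one pre-processed image
    features = encoder(image)             # shape: [1, embed_size]
    features = features.unsqueeze(1)      # shape: [1, 1, embed_size], one LSTM time step
    word_ids = decoder.sample(features)   # greedy decoding, list of up to 20 word ids
print(word_ids)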
models/decoder-3.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d3bb7e576b01e41b358e9b43a823a89e71331cc3854733eae590fcbb631e3b0f
+ size 33546937
models/encoder-3.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7ab1ce1da221b8eb5ab980ba434c7ffb52beb5a18fa569aa1223ade37e8e752b
+ size 96387105
models/vocab.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b4217bb73c18c91e7056154c08ee5c24f14adc9d91c334d63fb370b443f3eaa3
+ size 242231
requirements.txt ADDED
@@ -0,0 +1 @@
+ git+https://github.com/philferriere/cocoapi.git#egg=pycocotools&subdirectory=PythonAPI
vocabulary.py ADDED
@@ -0,0 +1,95 @@
+ import nltk
+ import pickle
+ import os.path
+ from pycocotools.coco import COCO
+ from collections import Counter
+
+ class Vocabulary(object):
+
+     def __init__(self,
+                  vocab_threshold,
+                  vocab_file='models/vocab.pkl',
+                  start_word="<start>",
+                  end_word="<end>",
+                  unk_word="<unk>",
+                  annotations_file='../cocoapi/annotations/captions_train2014.json',
+                  vocab_from_file=False):
+         """Initialize the vocabulary.
+         Args:
+           vocab_threshold: Minimum word count threshold.
+           vocab_file: File containing the vocabulary.
+           start_word: Special word denoting sentence start.
+           end_word: Special word denoting sentence end.
+           unk_word: Special word denoting unknown words.
+           annotations_file: Path for train annotation file.
+           vocab_from_file: If False, create vocab from scratch & override any existing vocab_file.
+                            If True, load vocab from existing vocab_file, if it exists.
+         """
+         self.vocab_threshold = vocab_threshold
+         self.vocab_file = vocab_file
+         self.start_word = start_word
+         self.end_word = end_word
+         self.unk_word = unk_word
+         self.annotations_file = annotations_file
+         self.vocab_from_file = vocab_from_file
+         self.get_vocab()
+
+     def get_vocab(self):
+         """Load the vocabulary from file OR build the vocabulary from scratch."""
+         if os.path.exists(self.vocab_file) and self.vocab_from_file:
+             with open(self.vocab_file, 'rb') as f:
+                 vocab = pickle.load(f)
+                 self.word2idx = vocab.word2idx
+                 self.idx2word = vocab.idx2word
+             print('Vocabulary successfully loaded from vocab.pkl file!')
+         else:
+             self.build_vocab()
+             with open(self.vocab_file, 'wb') as f:
+                 pickle.dump(self, f)
+
+     def build_vocab(self):
+         """Populate the dictionaries for converting tokens to integers (and vice-versa)."""
+         self.init_vocab()
+         self.add_word(self.start_word)
+         self.add_word(self.end_word)
+         self.add_word(self.unk_word)
+         self.add_captions()
+
+     def init_vocab(self):
+         """Initialize the dictionaries for converting tokens to integers (and vice-versa)."""
+         self.word2idx = {}
+         self.idx2word = {}
+         self.idx = 0
+
+     def add_word(self, word):
+         """Add a token to the vocabulary."""
+         if word not in self.word2idx:
+             self.word2idx[word] = self.idx
+             self.idx2word[self.idx] = word
+             self.idx += 1
+
+     def add_captions(self):
+         """Loop over training captions and add all tokens to the vocabulary that meet or exceed the threshold."""
+         coco = COCO(self.annotations_file)
+         counter = Counter()
+         ids = coco.anns.keys()
+         for i, id in enumerate(ids):
+             caption = str(coco.anns[id]['caption'])
+             tokens = nltk.tokenize.word_tokenize(caption.lower())
+             counter.update(tokens)
+
+             if i % 100000 == 0:
+                 print("[%d/%d] Tokenizing captions..." % (i, len(ids)))
+
+         words = [word for word, cnt in counter.items() if cnt >= self.vocab_threshold]
+
+         for i, word in enumerate(words):
+             self.add_word(word)
+
+     def __call__(self, word):
+         if word not in self.word2idx:
+             return self.word2idx[self.unk_word]
+         return self.word2idx[word]
+
+     def __len__(self):
+         return len(self.word2idx)
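
As a usage sketch, loading the pickled vocabulary shipped under models/ and mapping words to ids; this assumes the pickle was produced by this Vocabulary class so it can be un-pickled here. Because build_vocab() registers the start, end, and unknown words first, they receive ids 0, 1, and 2, which is what clean_sentence() in app.py relies on:

from vocabulary import Vocabulary

# Load the existing models/vocab.pkl instead of rebuilding from COCO captions.
vocab = Vocabulary(vocab_threshold=None,
                   vocab_file='models/vocab.pkl',
                   vocab_from_file=True)

print(len(vocab))                                         # vocabulary size (8855 in app.py)
print(vocab('<start>'), vocab('<end>'), vocab('<unk>'))   # 0 1 2 by construction
print(vocab('a'), vocab.idx2word[vocab('a')])             # id of 'a' and back again; unknown words map to <unk>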