Spaces:
Build error
Build error
File size: 3,065 Bytes
00abfdc |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 |
from PIL import Image
import numpy as np
from torchvision import transforms
import torch
from data_loader import get_loader
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Watch for any changes in model.py, and re-load it automatically.
# The `%load_ext` / `%autoreload` line magics are IPython-only syntax and raise
# a SyntaxError when this file is run as a plain Python script, so they are
# issued through the IPython API when a shell is present and skipped otherwise.
try:
    _ipython = get_ipython()  # noqa: F821 - injected by IPython when available
except NameError:
    _ipython = None
if _ipython is not None:
    _ipython.run_line_magic("load_ext", "autoreload")
    _ipython.run_line_magic("autoreload", "2")
import os
import torch
from model import EncoderCNN, DecoderRNN
# Checkpoint files holding the trained encoder / decoder weights.
encoder_file, decoder_file = 'encoder-3.pkl', 'decoder-3.pkl'

# Model dimensions passed to the constructors below.
# NOTE(review): presumably these must match the values used when the
# checkpoints were trained - confirm before changing them.
embed_size, hidden_size = 256, 512
# The size of the vocabulary.
vocab_size = 8855

# Build both networks and switch each one to inference mode.
encoder = EncoderCNN(embed_size)
encoder.eval()
decoder = DecoderRNN(embed_size, hidden_size, vocab_size)
decoder.eval()

# Restore the trained weights. Tensors are mapped onto the CPU while loading,
# so the checkpoints can be read on a machine without CUDA.
# NOTE(review): '/models' is an absolute path under the filesystem root -
# confirm that is where the checkpoints are deployed.
for _net, _checkpoint in ((encoder, encoder_file), (decoder, decoder_file)):
    _state = torch.load(os.path.join('/models', _checkpoint),
                        map_location=torch.device('cpu'))
    _net.load_state_dict(_state)
del _net, _checkpoint, _state

# Move both models to the selected device (GPU if CUDA is available).
for _net in (encoder, decoder):
    _net.to(device)
del _net
def process_image(image):
    """Scale, crop, and normalize a PIL image for a PyTorch model.

    Args:
        image: PIL image to preprocess.

    Returns:
        The output of the transform pipeline: the image resized so its
        smaller edge is 256, center-cropped to 224x224, converted to a
        tensor, and normalized per channel.
    """
    transformation = transforms.Compose([
        transforms.Resize(256),      # smaller edge of image resized to 256
        # Center crop rather than a random crop: this runs at inference time,
        # and a random crop would make the caption for the same input image
        # vary from call to call.
        transforms.CenterCrop(224),
        transforms.ToTensor(),       # convert the PIL Image to a tensor
        transforms.Normalize((0.485, 0.456, 0.406),   # normalize image for pre-trained model
                             (0.229, 0.224, 0.225)),
    ])
    return transformation(image)
def function(img_np):
    """Turn an image array into an RGB array and a pre-processed tensor.

    Args:
        img_np: Image data accepted by ``Image.fromarray``.

    Returns:
        tuple: ``(orig_image, image)`` where ``orig_image`` is the RGB image
        as a NumPy array and ``image`` is the result of ``process_image`` on
        the same RGB image.
    """
    rgb_image = Image.fromarray(img_np).convert('RGB')
    # Original (RGB) pixels first, then the model-ready version.
    return np.array(rgb_image), process_image(rgb_image)
def clean_sentence(output):
    """Convert a sequence of vocabulary indices into a caption string.

    Each index is looked up in ``data_loader.dataset.vocab.idx2word``.
    Index 0 is skipped, index 1 ends the sentence, and the word for index 18
    is attached without a leading space.
    NOTE(review): presumably 0/1 are the start/end tokens and 18 is a
    punctuation token such as '.' - confirm against the vocabulary.

    Args:
        output: Iterable of vocabulary indices.

    Returns:
        str: The assembled sentence with surrounding whitespace stripped.
    """
    pieces = []
    for token_id in output:
        # Looked up before the checks below, exactly as the sentence is built
        # word by word; every index must therefore exist in the vocabulary.
        token = data_loader.dataset.vocab.idx2word[token_id]
        if token_id == 0:
            continue
        if token_id == 1:
            break
        pieces.append(token if token_id == 18 else ' ' + token)
    return ''.join(pieces).strip()
# Test-mode data loader. clean_sentence() reads its vocabulary
# (data_loader.dataset.vocab.idx2word) to map predicted indices to words.
# NOTE(review): `transforms` here is the torchvision.transforms module itself,
# not a transform pipeline - confirm get_loader never applies it to an image.
data_loader = get_loader(transform=transforms, mode='test')
def get_caption(image):
    """Generate a caption for an image.

    Args:
        image: Image data accepted by ``function`` (which passes it to
            ``Image.fromarray``). NOTE(review): presumably the NumPy array
            Gradio supplies for an "image" input - confirm.

    Returns:
        str: The caption produced by the decoder, cleaned by
        ``clean_sentence``.
    """
    # Pass the actual argument; the string literal 'image' was passed before,
    # which cannot be converted into a picture.
    _, image_tensor = function(image)

    # The matplotlib preview was removed: `plt` is never imported in this
    # file, so those calls raised NameError before any caption was produced.

    # Add a leading (batch) dimension and move the tensor to the model device.
    image_tensor = image_tensor.unsqueeze(0).to(device)

    # Inference only: skip gradient tracking.
    with torch.no_grad():
        features = encoder(image_tensor).unsqueeze(1)
        output = decoder.sample(features)

    return clean_sentence(output)
import gradio as gr

# get_caption returns the caption as a string, so the output component must be
# text; declaring it as "image" makes Gradio try to render the sentence as a
# picture.
demo = gr.Interface(fn=get_caption, inputs="image", outputs="text")
demo.launch()