# app.py -- author: SameerR007; commit ff01337 ("Update app.py")
import streamlit as st
import pickle
# ---------------------------------------------------------------------------
# Start-up: page header, pickled artifacts, tokenizer and captioning model.
# All of this runs at module top level, i.e. on every execution of the script.
# ---------------------------------------------------------------------------
st.header("Image Captioner")
st.markdown("Building the framework may take upto a minute. Please be patient. Thank you!")

# NOTE(security): pickle can execute arbitrary code while loading. These two
# files must be trusted artifacts shipped with the app, never user uploads.
# The files are opened with `with` so the handles are closed deterministically
# instead of being left for the garbage collector.
with open("features.pkl", "rb") as features_file:
    # NOTE(review): `features` is not read anywhere in the visible code;
    # kept because other code may rely on the name -- confirm before removing.
    features = pickle.load(features_file)
with open("all_captions.pkl", "rb") as captions_file:
    all_captions = pickle.load(captions_file)

from tensorflow.keras.preprocessing.text import Tokenizer
# Fit the tokenizer on the loaded captions so that word <-> index mappings
# are available for encoding prompts and decoding predictions.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(all_captions)
vocab_size = len(tokenizer.word_index) + 1
# Longest caption, measured in whitespace-separated words; used both as the
# padding length and as the upper bound on generated words.
max_length = max(len(caption.split()) for caption in all_captions)

from tensorflow import keras
model = keras.models.load_model("best_model.h5")
from tensorflow.keras.preprocessing.sequence import pad_sequences
def idx_to_word(integer, tokenizer):
    """Return the word that *tokenizer* maps to index *integer*.

    Args:
        integer: Word index to look up (a plain int or a NumPy integer).
        tokenizer: Object exposing a ``word_index`` mapping of word -> index.
            If it also exposes a non-empty ``index_word`` mapping of
            index -> word, that mapping is used for a direct lookup.

    Returns:
        The matching word, or ``None`` when no word has that index.
    """
    # Prefer the reverse mapping when the tokenizer provides one: this
    # function is called once per generated word, and scanning the whole
    # vocabulary each time is needless work.
    reverse_map = getattr(tokenizer, "index_word", None)
    if reverse_map:
        return reverse_map.get(integer)

    # Fallback for tokenizer-like objects that only carry `word_index`.
    for word, index in tokenizer.word_index.items():
        if index == integer:
            return word
    return None
# generate caption for an image
import numpy as np
def predict_caption(model, image, tokenizer, max_length):
    """Generate a caption one word at a time, always taking the top prediction.

    Args:
        model: Object whose ``predict([image, sequence], verbose=0)`` returns
            scores over word indices for the next word.
        image: Image input passed through unchanged as the first model input.
        tokenizer: Tokenizer used to encode the text so far and, via
            ``idx_to_word``, to decode the predicted index.
        max_length: Padding length for the encoded text and the maximum
            number of words to generate.

    Returns:
        The generated text, beginning with ``startseq``. It ends with
        ``endseq`` only if the model produced that token before the loop
        ran out or an unknown index was predicted.
    """
    # The caption is accumulated as a list and joined on demand.
    words = ['startseq']
    for _ in range(max_length):
        # Encode everything generated so far and pad it to a fixed length.
        encoded = tokenizer.texts_to_sequences([" ".join(words)])[0]
        padded = pad_sequences([encoded], max_length)

        # Index with the highest score becomes the next word.
        scores = model.predict([image, padded], verbose=0)
        next_word = idx_to_word(np.argmax(scores), tokenizer)

        # An index with no vocabulary entry ends generation.
        if next_word is None:
            break
        words.append(next_word)
        # The end marker is kept in the output and ends generation.
        if next_word == 'endseq':
            break
    return " ".join(words)
from tensorflow.keras.applications.vgg16 import VGG16, preprocess_input
from tensorflow.keras.models import Model
from gtts import gTTS
from io import BytesIO
# In-memory buffer that the text-to-speech output is written into and that
# is later handed to the audio player.
sound_file = BytesIO()

# Image feature extractor: a VGG16 network re-wired so that its output is the
# output of the second-to-last layer (i.e. the final layer is left out).
_vgg_full = VGG16()
vgg_model = Model(inputs=_vgg_full.inputs, outputs=_vgg_full.layers[-2].output)
from tensorflow.keras.preprocessing.image import img_to_array
from PIL import Image
# ---------------------------------------------------------------------------
# UI flow: upload an image, show it, and on request caption it and read the
# caption aloud.
# ---------------------------------------------------------------------------
uploaded_image = st.file_uploader(
    "Upload image to be captioned", type=["jpg", "png", "jpeg", "webp"]
)
# NOTE(review): `image_path` is not read anywhere in the visible code; kept
# because other code may rely on the name -- confirm before removing.
image_path = "bushman.jpeg"

if uploaded_image is not None:
    display_image = Image.open(uploaded_image)
    st.image(display_image)

    if st.button("Caption"):
        st.text("Please be patient...")

        # Force three colour channels before building the model input:
        # PNG/WebP uploads can be RGBA, palette or grayscale, which would
        # otherwise produce an array with the wrong channel count.
        display_image = display_image.convert("RGB").resize((224, 224))
        image = img_to_array(display_image)
        # Add a leading batch dimension of size 1.
        image = image.reshape((1, image.shape[0], image.shape[1], image.shape[2]))
        image = preprocess_input(image)
        feature = vgg_model.predict(image, verbose=0)

        final = predict_caption(model, feature, tokenizer, max_length)

        # Strip only the actual sequence markers. Generation can stop
        # without emitting 'endseq' (length limit reached or unknown index),
        # in which case the last word is real caption text and must stay.
        caption_words = final.split()
        if caption_words and caption_words[0] == "startseq":
            caption_words = caption_words[1:]
        if caption_words and caption_words[-1] == "endseq":
            caption_words = caption_words[:-1]
        final_output = " ".join(caption_words)

        # Show the caption first so it is visible even if speech synthesis
        # fails afterwards.
        st.text("Output:")
        if final_output:
            st.markdown(final_output)
            tts = gTTS(final_output, lang='en')
            tts.write_to_fp(sound_file)
            st.audio(sound_file)
        else:
            # gTTS refuses empty text, so skip audio when nothing was generated.
            st.markdown("No caption could be generated for this image.")