import streamlit as st
import pickle

st.header("Image Captioner")
st.markdown("Loading the model may take up to a minute. Please be patient. Thank you!")
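
# Load the precomputed image features and the caption set used for training.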
features = pickle.load(open("features.pkl", "rb"))
all_captions = pickle.load(open("all_captions.pkl", "rb"))

from tensorflow.keras.preprocessing.text import Tokenizer
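
# Rebuild the tokenizer from the captions so its word indices match the ones used in training.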
tokenizer = Tokenizer()
tokenizer.fit_on_texts(all_captions)
vocab_size = len(tokenizer.word_index) + 1
max_length = max(len(caption.split()) for caption in all_captions)
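
# Load the trained caption-generation model.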
from tensorflow import keras
model = keras.models.load_model("best_model.h5")

from tensorflow.keras.preprocessing.sequence import pad_sequences
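
# Map a predicted token index back to its word; return None if it is unknown.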
def idx_to_word(integer, tokenizer):
    for word, index in tokenizer.word_index.items():
        if index == integer:
            return word
    return None

import numpy as np
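
# Greedy decoding: starting from 'startseq', repeatedly predict the most likely
# next word and append it until 'endseq' is produced or the caption reaches max_length.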
def predict_caption(model, image, tokenizer, max_length):
    in_text = 'startseq'
    for i in range(max_length):
        # Encode the caption generated so far and pad it to max_length.
        sequence = tokenizer.texts_to_sequences([in_text])[0]
        sequence = pad_sequences([sequence], maxlen=max_length)
        # Predict the next word from the image features and the partial caption.
        yhat = model.predict([image, sequence], verbose=0)
        yhat = np.argmax(yhat)
        word = idx_to_word(yhat, tokenizer)
        if word is None:
            break
        in_text += " " + word
        if word == 'endseq':
            break
    return in_text

from tensorflow.keras.applications.vgg16 import VGG16, preprocess_input
from tensorflow.keras.models import Model
from gtts import gTTS
from io import BytesIO

sound_file = BytesIO()
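
# VGG16 without its classification head: the second-to-last layer's output
# serves as the image feature vector fed to the captioning model.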
vgg_model = VGG16()
vgg_model = Model(inputs=vgg_model.inputs, outputs=vgg_model.layers[-2].output)

from tensorflow.keras.preprocessing.image import img_to_array
from PIL import Image

uploaded_image = st.file_uploader("Upload image to be captioned", type=["jpg", "png", "jpeg", "webp"])

if uploaded_image is not None:
    display_image = Image.open(uploaded_image)
    st.image(display_image)
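    # When "Caption" is clicked, extract image features and generate the caption.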
    if st.button("Caption"):
        st.text("Please be patient...")
        # Resize (and force RGB) to the 224x224 input that VGG16 expects.
        display_image = display_image.convert("RGB").resize((224, 224))
        image = img_to_array(display_image)
        image = image.reshape((1, image.shape[0], image.shape[1], image.shape[2]))
        image = preprocess_input(image)
        feature = vgg_model.predict(image, verbose=0)
        final = predict_caption(model, feature, tokenizer, max_length)
        # Drop the 'startseq' and 'endseq' tokens from the generated caption.
        final_output = " ".join(final.split(" ")[1:-1])
        # Convert the caption to speech and present both the text and the audio.
        tts = gTTS(final_output, lang='en')
        tts.write_to_fp(sound_file)
        sound_file.seek(0)
        st.text("Output:")
        st.markdown(final_output)
        st.audio(sound_file)