import streamlit as st
import pickle

st.header("Image Captioner")
st.markdown("Loading the model may take up to a minute. Please be patient. Thank you!")
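
# Load the precomputed image features and the caption set used for training.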
features = pickle.load(open("features.pkl", "rb"))
all_captions = pickle.load(open("all_captions.pkl", "rb"))

from tensorflow.keras.preprocessing.text import Tokenizer
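
# Rebuild the tokenizer from the captions so its word indices match the ones used in training.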
tokenizer = Tokenizer()
tokenizer.fit_on_texts(all_captions)
vocab_size = len(tokenizer.word_index) + 1
max_length = max(len(caption.split()) for caption in all_captions)
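
# Load the trained caption-generation model.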
from tensorflow import keras
model = keras.models.load_model("best_model.h5")

from tensorflow.keras.preprocessing.sequence import pad_sequences
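
# Map a predicted token index back to its word; return None if it is unknown.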
def idx_to_word(integer, tokenizer):
    for word, index in tokenizer.word_index.items():
        if index == integer:
            return word
    return None

import numpy as np
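
# Greedy decoding: starting from 'startseq', repeatedly predict the most likely
# next word and append it until 'endseq' is produced or the caption reaches max_length.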
def predict_caption(model, image, tokenizer, max_length):
    in_text = 'startseq'
    for i in range(max_length):
        # Encode the caption generated so far and pad it to max_length.
        sequence = tokenizer.texts_to_sequences([in_text])[0]
        sequence = pad_sequences([sequence], maxlen=max_length)
        # Predict the next word from the image features and the partial caption.
        yhat = model.predict([image, sequence], verbose=0)
        yhat = np.argmax(yhat)
        word = idx_to_word(yhat, tokenizer)
        if word is None:
            break
        in_text += " " + word
        if word == 'endseq':
            break
    return in_text

from tensorflow.keras.applications.vgg16 import VGG16, preprocess_input
from tensorflow.keras.models import Model
from gtts import gTTS
from io import BytesIO

sound_file = BytesIO()
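
# VGG16 without its classification head: the second-to-last layer's output
# serves as the image feature vector fed to the captioning model.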
vgg_model = VGG16()
vgg_model = Model(inputs=vgg_model.inputs, outputs=vgg_model.layers[-2].output)

from tensorflow.keras.preprocessing.image import img_to_array
from PIL import Image

uploaded_image = st.file_uploader("Upload image to be captioned", type=["jpg", "png", "jpeg", "webp"])

if uploaded_image is not None:
    display_image = Image.open(uploaded_image)
    st.image(display_image)
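    # When "Caption" is clicked, extract image features and generate the caption.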
    if st.button("Caption"):
        st.text("Please be patient...")
        # Resize (and force RGB) to the 224x224 input that VGG16 expects.
        display_image = display_image.convert("RGB").resize((224, 224))
        image = img_to_array(display_image)
        image = image.reshape((1, image.shape[0], image.shape[1], image.shape[2]))
        image = preprocess_input(image)
        feature = vgg_model.predict(image, verbose=0)
        final = predict_caption(model, feature, tokenizer, max_length)
        # Drop the 'startseq' and 'endseq' tokens from the generated caption.
        final_output = " ".join(final.split(" ")[1:-1])
        # Convert the caption to speech and present both the text and the audio.
        tts = gTTS(final_output, lang='en')
        tts.write_to_fp(sound_file)
        sound_file.seek(0)
        st.text("Output:")
        st.markdown(final_output)
        st.audio(sound_file)