"""Streamlit app that generates captions for an uploaded image with ViT-GPT2."""

import streamlit as st
from PIL import Image
from transformers import (
    AutoProcessor,
    AutoTokenizer,
    AutoImageProcessor,
    AutoModelForCausalLM,
    BlipForConditionalGeneration,
    VisionEncoderDecoderModel,
)
import torch

VITGPT_CHECKPOINT = "nlpconnect/vit-gpt2-image-captioning"

device = "cuda" if torch.cuda.is_available() else "cpu"


@st.cache_resource
def _load_vitgpt():
    """Load the ViT-GPT2 image processor, model and tokenizer once.

    Streamlit re-runs the whole script on every widget interaction; caching
    the loaded objects avoids re-instantiating the checkpoint on each rerun.

    Returns:
        A ``(processor, model, tokenizer)`` tuple, with the model already
        moved to ``device``.
    """
    processor = AutoImageProcessor.from_pretrained(VITGPT_CHECKPOINT)
    model = VisionEncoderDecoderModel.from_pretrained(VITGPT_CHECKPOINT)
    tokenizer = AutoTokenizer.from_pretrained(VITGPT_CHECKPOINT)
    model.to(device)
    return processor, model, tokenizer


vitgpt_processor, vitgpt_model, vitgpt_tokenizer = _load_vitgpt()


def generate_caption(processor, model, image, num_seq, tokenizer=None):
    """Generate ``num_seq`` captions for ``image``.

    Args:
        processor: Image processor called as
            ``processor(images=..., return_tensors="pt")``; also used for
            decoding when ``tokenizer`` is ``None``.
        model: Model exposing ``generate(pixel_values=...)``.
        image: Image to caption. PIL images that are not in RGB mode are
            converted to RGB first; other inputs are passed through as-is.
        num_seq: Number of sequences requested from ``generate`` via
            ``num_return_sequences``.
        tokenizer: Optional tokenizer used to decode the generated ids.

    Returns:
        The decoded captions as returned by ``batch_decode`` with special
        tokens stripped.
    """
    # Uploaded files may be RGBA, grayscale or palette images.
    # NOTE(review): the ViT preprocessor is assumed to expect 3 channels.
    if isinstance(image, Image.Image) and image.mode != "RGB":
        image = image.convert("RGB")

    inputs = processor(images=image, return_tensors="pt").to(device)
    # Sampling combined with beam search. NOTE(review): num_seq presumably
    # must not exceed num_beams (5) -- the UI slider is capped at 5.
    generated_ids = model.generate(
        pixel_values=inputs.pixel_values,
        max_length=50,
        num_beams=5,
        do_sample=True,
        temperature=2.,
        top_k=20,
        no_repeat_ngram_size=5,
        num_return_sequences=num_seq,
    )

    decoder = tokenizer if tokenizer is not None else processor
    return decoder.batch_decode(generated_ids, skip_special_tokens=True)


def generate_captions(image, num_seq):
    """Generate ``num_seq`` captions for ``image`` with the ViT-GPT2 model."""
    return generate_caption(
        vitgpt_processor, vitgpt_model, image, num_seq, vitgpt_tokenizer
    )


st.title('Generate text to your image')

uploaded_file = st.file_uploader("Upload your image")
num_seq = st.slider('Return sequences quantity', 1, 5, 2)

if uploaded_file is not None:
    if st.button('Generate!'):
        try:
            image = Image.open(uploaded_file)
            # Image.open is lazy; force decoding here so a corrupt file is
            # reported below instead of failing later during rendering.
            image.load()
        except OSError:
            # PIL signals unreadable/unsupported image data with OSError
            # (including its UnidentifiedImageError subclass).
            st.error("Could not read the uploaded file as an image.")
        else:
            col1, col2 = st.columns(2)
            with col1:
                st.image(image)
            with col2:
                for caption in generate_captions(image, num_seq):
                    st.write(caption)