"""Gradio demo: caption an uploaded image with a ViT encoder / GPT-2 decoder.

The script loads the ``nlpconnect/vit-gpt2-image-captioning`` checkpoint once
at import time and exposes a small ``gr.Blocks`` UI with an image input, a
"Describe" button and a caption textbox.
"""

import os

# Must be set before TensorFlow is imported (directly, or indirectly by
# another library), otherwise the flag is read too late to have any effect.
os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0'

import re  # noqa: E402

import gradio as gr  # noqa: E402
import streamlit as st  # noqa: E402
import tensorflow as tf  # noqa: E402
import torch  # noqa: E402
from transformers import (  # noqa: E402
    AutoTokenizer,
    ViTFeatureExtractor,
    VisionEncoderDecoderModel,
)

device = 'cpu'

# All three components are loaded from the same Hub repository.
encoder_checkpoint = "nlpconnect/vit-gpt2-image-captioning"
decoder_checkpoint = "nlpconnect/vit-gpt2-image-captioning"
model_checkpoint = "nlpconnect/vit-gpt2-image-captioning"

feature_extractor = ViTFeatureExtractor.from_pretrained(encoder_checkpoint)
tokenizer = AutoTokenizer.from_pretrained(decoder_checkpoint)
model = VisionEncoderDecoderModel.from_pretrained(model_checkpoint).to(device)


def _clean_caption(text):
    """Drop the ``<|endoftext|>`` marker and keep only the first line."""
    return text.replace('<|endoftext|>', '').split('\n')[0]


def predict(image, max_length=64, num_beams=4):
    """Generate a one-line caption for ``image``.

    Args:
        image: The uploaded picture. The code calls ``image.convert('RGB')``,
            so a PIL image is expected (the UI below requests ``type="pil"``).
            ``None`` (nothing uploaded) is tolerated.
        max_length: Maximum length forwarded to ``model.generate``.
        num_beams: Beam count forwarded to ``model.generate``.

    Returns:
        The decoded caption with the end-of-text marker removed and anything
        after the first newline discarded; an empty string when ``image`` is
        ``None``.
    """
    # Gradio passes None when the user clicks the button without an image.
    if image is None:
        return ""

    rgb_image = image.convert('RGB')
    pixel_values = feature_extractor(
        rgb_image, return_tensors="pt"
    ).pixel_values.to(device)

    # num_beams was previously accepted but silently ignored.
    caption_ids = model.generate(
        pixel_values,
        max_length=max_length,
        num_beams=num_beams,
    )[0]
    return _clean_caption(tokenizer.decode(caption_ids))


# Kept for a possible gr.Examples / gr.Interface setup; not used by the
# Blocks layout below.
examples = ["example1.jpg"]
print("------------------------- 6 -------------------------\n")
title = "Image to Text ViT with LORA"

# Introductory text shown at the top of the page.
INTRO_HTML = """

ViT Image-to-Text with LORA

In the field of large language models, the challenge of fine-tuning has long perplexed researchers. Microsoft, however, has unveiled an innovative solution called Low-Rank Adaptation (LoRA). With the emergence of behemoth models like GPT-3 boasting billions of parameters, the cost of fine-tuning them for specific tasks or domains has become exorbitant. LoRA offers a groundbreaking approach by freezing the weights of pre-trained models and introducing trainable layers known as rank-decomposition matrices in each transformer block. This ingenious technique significantly reduces the number of trainable parameters and minimizes GPU memory requirements, as gradients no longer need to be computed for the majority of model weights.

You can find more info here: Linkedin article

"""

with gr.Blocks() as demo:
    gr.HTML(INTRO_HTML)
    with gr.Row():
        with gr.Column(scale=1):
            image_input = gr.Image(label="Upload any Image", type='pil')
            button = gr.Button(value="Describe")
        with gr.Column(scale=1):
            caption_output = gr.Textbox(label="Captions")
    # Wire the button to the components that are actually on the page; the
    # handler must receive the uploaded image, not a text value.
    button.click(predict, inputs=image_input, outputs=caption_output)

if __name__ == "__main__":
    demo.launch(debug=True)