from PIL import Image
from transformers import ViTImageProcessor, AutoTokenizer, VisionEncoderDecoderModel
import pickle
import gradio as gr
import os

'''
Sample images:
Image by bristekjegor on Freepik
Image by halayalex on Freepik
Image by halayalex on Freepik
'''

# APP_ROOT = os.path.dirname(os.path.abspath(__file__))
# MODEL = os.path.join(APP_ROOT, 'finalized_model.pkl')

# Load the pre-pickled fine-tuned captioning model.
model = pickle.load(open('finalized_model_hm.pkl', 'rb'))
# model = VisionEncoderDecoderModel.from_pretrained("google/vit-base-patch16-224-in21k")

# Image processor and tokenizer from the base ViT-GPT2 captioning checkpoint.
feature_extractor = ViTImageProcessor.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
tokenizer = AutoTokenizer.from_pretrained("nlpconnect/vit-gpt2-image-captioning")


def predict_step(img):
    max_length = 30
    gen_kwargs = {"max_length": max_length}

    # convert() returns a new image, so the result must be assigned back.
    img = img.convert('RGB')
    pixel_values = feature_extractor(images=img, return_tensors="pt").pixel_values
    # pixel_values = pixel_values.to(device)

    output_ids = model.generate(pixel_values, **gen_kwargs)
    preds = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
    preds = [pred.strip() for pred in preds]
    return preds[0]


# gr.inputs.Image was removed in Gradio 3.x+; use gr.Image directly.
iface = gr.Interface(
    fn=predict_step,
    inputs=gr.Image(type="pil"),
    outputs="text",
    examples=[["m3.jpeg"], ["5.jpg"], ["6.jpg"]],
)
iface.launch()
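
# ---------------------------------------------------------------------------
# A minimal sketch of how a pickle such as 'finalized_model_hm.pkl' could be
# produced. The base checkpoint name below is an assumption; the actual
# fine-tuned weights used by this app are not part of this file.
#
#     from transformers import VisionEncoderDecoderModel
#     import pickle
#
#     model = VisionEncoderDecoderModel.from_pretrained(
#         "nlpconnect/vit-gpt2-image-captioning"  # assumed base checkpoint
#     )
#     with open('finalized_model_hm.pkl', 'wb') as f:
#         pickle.dump(model, f)
#
# Note that model.save_pretrained(...) / from_pretrained(...) is the more
# portable alternative to pickling a transformers model.
# ---------------------------------------------------------------------------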