Spaces:

team-indain-image-caption
/

Hindi-image-captioning

Runtime error

File size: 1,922 Bytes

797b64f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
52ccae8
 
797b64f
 
58572c5
797b64f
 
ecda335
797b64f
52ccae8
ecda335
 
797b64f
52ccae8
797b64f
 
cafa65e
 
797b64f
 
 
 
 
 
dc7d2f7
797b64f
 
308a342

import torch
import re
import gradio as gr
from pathlib import Path
from transformers import AutoTokenizer, AutoFeatureExtractor, VisionEncoderDecoderModel
# Pattern matching a run of two or more consecutive full stops; post_process()
# keeps only the text that precedes the first such run.
regex_pattern = r"[.]{2,}"


def post_process(text):
    """Clean up a decoded caption.

    Surrounding whitespace is stripped, then everything from the first run of
    two or more full stops onwards is discarded.

    Args:
        text: The decoded caption string.

    Returns:
        The cleaned caption. Processing is best-effort: if ``text`` is not a
        string (for example ``None``), the error is printed and the value is
        returned as far as it was processed instead of raising.
    """
    try:
        text = text.strip()
        text = re.split(regex_pattern, text)[0]
    except (AttributeError, TypeError) as e:
        # Only the errors caused by non-string input are tolerated here;
        # anything else indicates a real bug and is allowed to propagate.
        print(e)
    return text
def predict(image, max_length=64, num_beams=4):
    """Generate a caption for a single image.

    Args:
        image: The image to caption, passed to the feature extractor as
            ``images=``. May be ``None`` when no image is present.
        max_length: Forwarded to ``model.generate`` as ``max_length``.
        num_beams: Forwarded to ``model.generate`` as ``num_beams``.

    Returns:
        The post-processed caption decoded from the first generated sequence,
        or an empty string when ``image`` is ``None``.
    """
    if image is None:
        # A live interface can invoke the callback with no image (e.g. after
        # the input is cleared); return an empty caption instead of failing
        # inside the feature extractor.
        return ""

    encoded = feature_extractor(images=image, return_tensors="pt")
    pixel_values = encoded.pixel_values.to(device)

    # Inference only: gradients are not needed.
    with torch.no_grad():
        generated = model.generate(
            pixel_values,
            max_length=max_length,
            num_beams=num_beams,
            return_dict_in_generate=True,
        )
    output_ids = generated.sequences

    captions = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
    return post_process(captions[0])
    
model_path = "team-indain-image-caption/hindi-image-captioning"
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Load the captioning model and move it to the selected device.
model = VisionEncoderDecoderModel.from_pretrained(model_path)
model.to(device)
print("Loaded model")

# The feature extractor is loaded from the ViT checkpoint, not from model_path.
feature_extractor = AutoFeatureExtractor.from_pretrained("google/vit-base-patch16-224-in21k")
print("Loaded feature_extractor")

tokenizer = AutoTokenizer.from_pretrained(model_path)
#if model.decoder.name_or_path == "gpt2":
 #   tokenizer.pad_token = tokenizer.bos_token
print("Loaded tokenizer")

title = "Hindi Image Captioning"
description = ""

# Prefer the top-level Image component and fall back to the legacy
# ``gr.inputs`` namespace, so the script works on Gradio releases that have
# only one of the two. The variable is deliberately not called ``input`` to
# avoid shadowing the builtin.
if hasattr(gr, "Image"):
    image_input = gr.Image(type="pil")
else:
    image_input = gr.inputs.Image(type="pil")

#example_images = sorted([f.as_posix() for f in Path("examples").glob("*.jpg")])
#print(f"Loaded {len(example_images)} example images")
interface = gr.Interface(
    fn=predict,
    inputs=image_input,
    outputs="textbox",
    title=title,
    description=description,
    #examples=example_images,
    live=True,  # re-run predict whenever the input changes
)
# NOTE(review): share=True requests a public share link; hosted environments
# may ignore it -- confirm whether it is still needed.
interface.launch(share=True)