File size: 1,695 Bytes
e470706 0cdd8d0 e470706 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 |
import gradio as gr
from PIL import Image, ImageOps
from transformers import VisionEncoderDecoderModel, GPT2Tokenizer, AutoFeatureExtractor
# Hub checkpoint shared by the feature extractor and the encoder-decoder model.
_CHECKPOINT = "yuewu/toc_titler"

# NOTE(review): the checkpoint's own tokenizer
# (AutoTokenizer.from_pretrained("yuewu/toc_titler")) was previously tried and
# is left disabled; presumably the stock GPT-2 vocabulary plus an explicit pad
# token is what the decoder expects -- confirm before switching.
text_processor = GPT2Tokenizer.from_pretrained("gpt2", pad_token="<|pad|>")
image_processor = AutoFeatureExtractor.from_pretrained(_CHECKPOINT)
model = VisionEncoderDecoderModel.from_pretrained(_CHECKPOINT)
def array_to_square_image(image):
    """Convert an image array to a square PIL image padded with white.

    The shorter dimension is padded on both sides so the content stays
    centred. When the width/height difference is odd, the extra pixel is
    added on the bottom or right side, so the result is exactly square and
    the content is never rescaled.

    Args:
        image: Array accepted by ``PIL.Image.fromarray``. The fill colour is
            an RGB triple, so a three-channel image is assumed -- TODO
            confirm behaviour for other modes.

    Returns:
        A ``PIL.Image.Image`` whose width and height both equal the larger
        dimension of the input.
    """
    image = Image.fromarray(image)
    width, height = image.size
    if width == height:
        return image

    # Split the shortfall across both sides. ``divmod`` keeps the odd pixel
    # that ``delta // 2`` on each side would silently drop.
    near, extra = divmod(abs(width - height), 2)
    far = near + extra

    # Border order is (left, top, right, bottom).
    if width > height:
        border = (0, near, 0, far)
    else:
        border = (near, 0, far, 0)
    return ImageOps.expand(image, border, fill=(255, 255, 255))
def greet(image):
    """Generate a title guess for a table-of-contents image.

    Args:
        image: Image array delivered by the Gradio "image" input, or ``None``
            when the form is submitted without an image.

    Returns:
        The decoded text of the first generated sequence, or an empty string
        when no image was supplied.
    """
    # Nothing to caption: return early instead of crashing while converting
    # a missing image.
    if image is None:
        return ""

    square = array_to_square_image(image)
    pixel_values = image_processor(square, return_tensors="pt").pixel_values
    generated_ids = model.generate(pixel_values)
    titles = text_processor.batch_decode(generated_ids, skip_special_tokens=True)
    return titles[0]
# Build the web UI: one image input, one text output, served by ``greet``.
demo = gr.Interface(
    fn=greet,
    inputs="image",
    outputs="text",
    title="Table of Contents Image Title Generator",
    description=(
        "Upload a chemistry paper table of contents image and the model "
        "will guess what the title of the paper should be."
    ),
)

# Start the Gradio server.
demo.launch()