|
import gradio as gr |
|
from PIL import Image, ImageOps |
|
from transformers import VisionEncoderDecoderModel, GPT2Tokenizer, AutoFeatureExtractor |
|
|
|
# Hub repository that provides both the image preprocessing config and the
# encoder-decoder weights used below.
_CHECKPOINT = "yuewu/toc_titler"

# Tokenizer used to turn generated token ids back into text. It is the stock
# GPT-2 tokenizer with an explicit pad token supplied.
text_processor = GPT2Tokenizer.from_pretrained("gpt2", pad_token="<|pad|>")

# Converts an input image into the `pixel_values` tensor passed to the model.
image_processor = AutoFeatureExtractor.from_pretrained(_CHECKPOINT)

# Image-to-text model whose `generate` output is decoded by `text_processor`.
model = VisionEncoderDecoderModel.from_pretrained(_CHECKPOINT)
|
|
|
def array_to_square_image(image):
    """Convert an image array to a PIL image padded with white to a square.

    The shorter dimension is padded on both sides so the original content
    stays centred. When the size difference is odd, the extra pixel goes to
    the bottom/right edge so the result is exactly square without any
    resampling.

    Args:
        image: Array-like object accepted by ``PIL.Image.fromarray``.
            NOTE(review): the white fill is given as an RGB triple, so this
            presumably expects a 3-channel image - confirm against callers.

    Returns:
        A ``PIL.Image.Image`` whose width equals its height. An input that
        is already square is returned without padding.
    """
    image = Image.fromarray(image)
    width, height = image.size

    if width == height:
        return image

    # Split the missing pixels between the two sides; ``after`` absorbs the
    # remainder so that before + after == delta even when delta is odd.
    delta = abs(width - height)
    before = delta // 2
    after = delta - before

    # ImageOps.expand takes the border as (left, top, right, bottom).
    if width > height:
        padding = (0, before, 0, after)
    else:
        padding = (before, 0, after, 0)

    return ImageOps.expand(image, padding, fill=(255, 255, 255))
|
|
|
def greet(image):
    """Generate a title guess for an uploaded table-of-contents image.

    The image is padded to a square, preprocessed into pixel values, passed
    through the model's ``generate`` method, and the generated ids are
    decoded to text with special tokens removed.

    Args:
        image: Image array supplied by the Gradio image input, or ``None``
            when the user submitted without uploading anything.

    Returns:
        The decoded text of the first generated sequence, or a short prompt
        asking for an image when no image was provided.
    """
    # Gradio passes None for an empty image input; without this guard the
    # call fails inside Image.fromarray and the UI only shows a generic error.
    if image is None:
        return "Please upload an image first."

    square_image = array_to_square_image(image)
    pixel_values = image_processor(square_image, return_tensors="pt").pixel_values
    generated_ids = model.generate(pixel_values)
    generated_text = text_processor.batch_decode(
        generated_ids, skip_special_tokens=True
    )

    # A single image was passed in, so only the first decoded entry is returned.
    return generated_text[0]
|
|
|
# Web UI: one image input wired to `greet`, with the returned text shown as
# the output.
_interface_options = {
    "fn": greet,
    "inputs": "image",
    "outputs": "text",
    "title": "Table of Contents Image Title Generator",
    "description": (
        "Upload a chemistry paper table of contents image and the model "
        "will guess what the title of the paper should be."
    ),
}
demo = gr.Interface(**_interface_options)

# Start serving the interface as soon as the module is executed.
demo.launch()