"""FastAPI service that runs the Donut CORD-v2 model on an uploaded image.

The processor and model are loaded once at import time and shared by every
request. ``POST /ocr/`` accepts an image upload and returns the JSON
structure that ``processor.token2json`` builds from the generated sequence.
"""

import re
from io import BytesIO

import torch
from fastapi import FastAPI, File, HTTPException, UploadFile
from PIL import Image, UnidentifiedImageError
from transformers import DonutProcessor, VisionEncoderDecoderModel

MODEL_NAME = "naver-clova-ix/donut-base-finetuned-cord-v2"
CACHE_DIR = "new_cache_dir/"

app = FastAPI()

processor = DonutProcessor.from_pretrained(
    MODEL_NAME, use_fast=False, cache_dir=CACHE_DIR
)
model = VisionEncoderDecoderModel.from_pretrained(MODEL_NAME, cache_dir=CACHE_DIR)

device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

# Decoder start prompt for the CORD-v2 fine-tuned checkpoint. An empty prompt
# produces an empty ``decoder_input_ids`` tensor, leaving generation without a
# start token; the post-processing in ``analyze_image`` also expects a leading
# task tag to strip.
task_prompt = "<s_cord-v2>"
decoder_input_ids = processor.tokenizer(
    task_prompt, add_special_tokens=False, return_tensors="pt"
).input_ids


def generateOutput(fileData):
    """Run Donut generation on raw image bytes.

    Args:
        fileData: Encoded image bytes (any format Pillow can open).

    Returns:
        The ``model.generate`` result (``return_dict_in_generate=True``), whose
        ``sequences`` attribute holds the generated token ids.

    Raises:
        PIL.UnidentifiedImageError: If ``fileData`` is not a readable image.
    """
    # Normalise to 3-channel RGB so RGBA, palette and grayscale uploads are
    # handed to the processor in a single, consistent layout.
    pil_image = Image.open(BytesIO(fileData)).convert("RGB")

    # NOTE: the previous ``pil_image.resize((800, 600))`` discarded its result
    # (``Image.resize`` returns a new image), so it never had any effect. It is
    # dropped rather than assigned: a forced 800x600 would distort the aspect
    # ratio, and the processor is expected to apply the model's own resizing
    # (TODO confirm against the processor config).
    pixel_values = processor(pil_image, return_tensors="pt").pixel_values

    return model.generate(
        pixel_values.to(device),
        decoder_input_ids=decoder_input_ids.to(device),
        max_length=model.decoder.config.max_position_embeddings,
        pad_token_id=processor.tokenizer.pad_token_id,
        eos_token_id=processor.tokenizer.eos_token_id,
        use_cache=True,
        bad_words_ids=[[processor.tokenizer.unk_token_id]],
        return_dict_in_generate=True,
    )


@app.post("/ocr/")
async def analyze_image(file: UploadFile = File(...)):
    """Parse an uploaded image and return the extracted fields as JSON.

    Responds with HTTP 400 when the upload cannot be decoded as an image.
    """
    content = await file.read()

    # NOTE(review): generation is a blocking call made from an async handler,
    # so the event loop is held for the duration of inference. Consider
    # offloading it once concurrent use of the shared model is confirmed safe.
    try:
        outputs = generateOutput(content)
    except UnidentifiedImageError as err:
        raise HTTPException(
            status_code=400, detail="Uploaded file is not a valid image."
        ) from err

    sequence = processor.batch_decode(outputs.sequences)[0]
    sequence = sequence.replace(processor.tokenizer.eos_token, "").replace(
        processor.tokenizer.pad_token, ""
    )
    # Remove the first task start token.
    sequence = re.sub(r"<.*?>", "", sequence, count=1).strip()
    return processor.token2json(sequence)