Spaces:
Runtime error
Runtime error
import re

import streamlit as st
import torch
from PIL import Image
from transformers import VisionEncoderDecoderModel, VisionEncoderDecoderConfig, DonutProcessor
def run_prediction(sample):
    """Run Donut inference on one sample and return ``(prediction, target)``.

    ``sample`` is either a dict holding precomputed ``"pixel_values"`` and a
    ``"target_sequence"``, or an image object that ``processor`` can encode.
    The module-level ``processor``, ``pretrained_model``, ``device`` and
    ``task_prompt`` are looked up at call time, so they must be defined before
    this function is invoked.

    Returns:
        A 2-tuple. The first item is the decoded model output passed through
        ``processor.token2json``. The second is the reference parsed from
        ``sample["target_sequence"]`` for dict samples, or the literal string
        ``"<not_provided>"`` when ``sample`` is an image.
    """
    has_reference = isinstance(sample, dict)

    # prepare encoder inputs
    if has_reference:
        pixel_values = torch.tensor(sample["pixel_values"]).unsqueeze(0)
    else:
        # Encode the image that was actually passed in; the previous code read
        # the global ``image`` here and silently ignored the argument.
        pixel_values = processor(sample, return_tensors="pt").pixel_values

    # The task prompt seeds the decoder; special tokens are not added around it.
    decoder_input_ids = processor.tokenizer(
        task_prompt, add_special_tokens=False, return_tensors="pt"
    ).input_ids

    # run inference
    outputs = pretrained_model.generate(
        pixel_values.to(device),
        decoder_input_ids=decoder_input_ids.to(device),
        max_length=pretrained_model.decoder.config.max_position_embeddings,
        early_stopping=True,
        pad_token_id=processor.tokenizer.pad_token_id,
        eos_token_id=processor.tokenizer.eos_token_id,
        use_cache=True,
        num_beams=1,
        bad_words_ids=[[processor.tokenizer.unk_token_id]],
        return_dict_in_generate=True,
    )

    # process output: only the first (and only) sequence of the batch is used
    prediction = processor.batch_decode(outputs.sequences)[0]

    # post-processing
    if "cord" in task_prompt:
        prediction = prediction.replace(processor.tokenizer.eos_token, "").replace(
            processor.tokenizer.pad_token, ""
        )
        # remove first task start token
        prediction = re.sub(r"<.*?>", "", prediction, count=1).strip()
    prediction = processor.token2json(prediction)

    # load reference target
    if has_reference:
        target = processor.token2json(sample["target_sequence"])
    else:
        target = "<not_provided>"
    return prediction, target
# Decoder start prompt that run_prediction tokenizes to seed generation.
task_prompt = "<s>"

st.text('''
This is OCR-free Document Understanding Transformer nicknamed 🍩. It was fine-tuned with 1000 receipt images -> SROIE dataset.
The original 🍩 implementation can be found on: https://github.com/clovaai/donut
''')

with st.sidebar:
    information = st.radio(
        "What information inside the receipt are you interested in?",
        ('Receipt Summary', 'Receipt Menu Details', 'Extract all!'))
    receipt = st.selectbox('Pick one receipt', ['1', '2', '3', '4', '5', '6'], index=5)

# Build the path once so the status message always names the file that is
# actually opened (the message used to say .png while the code opened .jpg).
image_path = f"./img/receipt-{receipt}.jpg"
st.text(f'{information} mode is ON!\nTarget receipt: {receipt}\n(opening image @:{image_path})')

image = Image.open(image_path)
st.image(image, caption='Your target receipt')

st.text('baking the 🍩...')

processor = DonutProcessor.from_pretrained("unstructuredio/donut-base-sroie")
pretrained_model = VisionEncoderDecoderModel.from_pretrained("unstructuredio/donut-base-sroie")

device = "cuda" if torch.cuda.is_available() else "cpu"
pretrained_model.to(device)
# The model is kept in its loaded precision. Casting only the encoder to
# bfloat16 left the float32 pixel values and the float32 decoder out of step
# with it, and PyTorch layers raise on mismatched input/weight dtypes.
pretrained_model.eval()

st.text('parsing receipt..')

# run_prediction returns a (prediction, target) tuple; it is displayed as-is.
parsed_receipt_info = run_prediction(image)
st.text(f'\nRaw output:\n{parsed_receipt_info}')