# Hugging Face Space status when this file was captured: Runtime error
import torch | |
import streamlit as st | |
from PIL import Image | |
from io import BytesIO | |
from transformers import VisionEncoderDecoderModel, VisionEncoderDecoderConfig , DonutProcessor | |
def run_prediction(sample):
    """Run the loaded Donut model on one sample and parse its output.

    Relies on the module-level names ``processor``, ``pretrained_model``,
    ``device`` and ``task_prompt``; they must be assigned before this
    function is called.

    Args:
        sample: Either a dict holding ``"pixel_values"`` (and
            ``"target_sequence"`` for the reference), or an image object
            that ``processor`` can encode.

    Returns:
        A ``(prediction, target)`` tuple. ``prediction`` is whatever
        ``processor.token2json`` returns for the generated sequence.
        ``target`` is ``processor.token2json(sample["target_sequence"])``
        for dict samples and the string ``"<not_provided>"`` otherwise.
    """
    if isinstance(sample, dict):
        # Pre-computed pixel values: add the batch dimension.
        pixel_values = torch.tensor(sample["pixel_values"]).unsqueeze(0)
    else:
        # Encode the image that was actually passed in. The previous code
        # read the module-level `image` here and ignored the argument.
        pixel_values = processor(sample, return_tensors="pt").pixel_values

    # The decoder is seeded with the task prompt tokens.
    decoder_input_ids = processor.tokenizer(
        task_prompt, add_special_tokens=False, return_tensors="pt"
    ).input_ids

    # Run inference.
    outputs = pretrained_model.generate(
        pixel_values.to(device),
        decoder_input_ids=decoder_input_ids.to(device),
        max_length=pretrained_model.decoder.config.max_position_embeddings,
        early_stopping=True,
        pad_token_id=processor.tokenizer.pad_token_id,
        eos_token_id=processor.tokenizer.eos_token_id,
        use_cache=True,
        num_beams=1,
        bad_words_ids=[[processor.tokenizer.unk_token_id]],
        return_dict_in_generate=True,
    )

    # Decode the first (only) generated sequence back to text.
    prediction = processor.batch_decode(outputs.sequences)[0]

    # Post-processing: EOS/PAD tokens are stripped only for "cord" prompts.
    if "cord" in task_prompt:
        prediction = prediction.replace(
            processor.tokenizer.eos_token, ""
        ).replace(processor.tokenizer.pad_token, "")
    prediction = processor.token2json(prediction)

    # Load the reference target when the sample carries one.
    if isinstance(sample, dict):
        target = processor.token2json(sample["target_sequence"])
    else:
        target = "<not_provided>"
    return prediction, target
# Decoder start prompt; run_prediction() reads this module-level name.
task_prompt = "<s>"

# logo = Image.open("./img/rsz_unstructured_logo.png")
# st.image(logo)

# Header text rendered at the top of the page.
page_intro = '''
### Donut Common Crawl
Experimental OCR-free Document Understanding Vision Transformer nicknamed π©, fine-tuned with few samples of the common-crawl with some specific document elements.
'''
st.markdown(page_intro)

# Sidebar controls: which predictor to run and which sample to parse.
predictor_labels = ('Base Common-Crawl π©', 'Hierarchical Common-Crawl π©')
sample_ids = ['1', '2', '3']
with st.sidebar:
    information = st.radio("Choose one predictor:?", predictor_labels)
    image_choice = st.selectbox('Pick one π', sample_ids, index=1)

# Echo the current selection.
# NOTE(review): the original indentation was lost; this call is assumed to
# sit outside the sidebar block -- confirm.
selection_summary = f'{information} mode is ON!\nTarget π: {image_choice}'
st.text(selection_summary)

# Two side-by-side columns used by the rest of the script.
col1, col2 = st.columns(2)

# Sample id -> image file name (opened relative to the working directory).
image_choice_map = {
    '1': 'commoncrawl_amandalacombznewspolice-bust-man-sawed-oal_1.jpg',
    '2': 'commoncrawl_canyonhillschroniclecomtagwomens-basketbll_0.png',
    '3': 'commoncrawl_celstuttgartdeideaa-different-stort-of-nfe_0.png',
}
sample_path = image_choice_map[image_choice]
image = Image.open(sample_path)
# Show the selected sample in the left column.
with col1:
    st.image(image, caption='Your target sample')

if st.button('Parse sample! π'):
    # Normalise to 3-channel RGB in memory. The previous round-trip through
    # './target_image.jpg' re-encoded the sample lossily, required a writable
    # working directory and used one fixed path for every session.
    # `image` is rebound at module level so code reading the global sees
    # the converted image as well.
    image = image.convert('RGB')

    with st.spinner('baking the π©s...'):
        if information == 'Base Common-Crawl π©':
            # These four names are module-level on purpose:
            # run_prediction() reads them as globals.
            model_id = "laverdes/donut-commoncrawl-mid"  # laverdes/donut-commoncrawl
            processor = DonutProcessor.from_pretrained(model_id)
            pretrained_model = VisionEncoderDecoderModel.from_pretrained(model_id)
            task_prompt = "<s>"
            device = "cuda" if torch.cuda.is_available() else "cpu"
            pretrained_model.to(device)

            # Only predict once a model is loaded. Previously this ran for
            # every predictor choice and raised NameError when no model
            # had been loaded.
            with col2:
                st.info('parsing π...')
                parsed_info, _ = run_prediction(image)
                st.text(f'\n{information}')
                st.json(parsed_info)
        else:
            # 'Hierarchical Common-Crawl' has no model wired up yet.
            st.info("Not implemented yet...")