import torch import streamlit as st from PIL import Image from io import BytesIO from transformers import VisionEncoderDecoderModel, VisionEncoderDecoderConfig , DonutProcessor def run_prediction(sample): global pretrained_model, processor, task_prompt if isinstance(sample, dict): # prepare inputs pixel_values = torch.tensor(sample["pixel_values"]).unsqueeze(0) else: # sample is an image # prepare encoder inputs pixel_values = processor(image, return_tensors="pt").pixel_values decoder_input_ids = processor.tokenizer(task_prompt, add_special_tokens=False, return_tensors="pt").input_ids # run inference outputs = pretrained_model.generate( pixel_values.to(device), decoder_input_ids=decoder_input_ids.to(device), max_length=pretrained_model.decoder.config.max_position_embeddings, early_stopping=True, pad_token_id=processor.tokenizer.pad_token_id, eos_token_id=processor.tokenizer.eos_token_id, use_cache=True, num_beams=1, bad_words_ids=[[processor.tokenizer.unk_token_id]], return_dict_in_generate=True, ) # process output prediction = processor.batch_decode(outputs.sequences)[0] # post-processing if "cord" in task_prompt: prediction = prediction.replace(processor.tokenizer.eos_token, "").replace(processor.tokenizer.pad_token, "") # prediction = re.sub(r"<.*?>", "", prediction, count=1).strip() # remove first task start token prediction = processor.token2json(prediction) # load reference target if isinstance(sample, dict): target = processor.token2json(sample["target_sequence"]) else: target = "" return prediction, target task_prompt = f"" logo = Image.open("./img/rsz_unstructured_logo.png") st.image(logo) st.markdown(''' ### Receipt Parser This is an OCR-free Document Understanding Transformer nicknamed 🍩. It was fine-tuned with 1000 receipt images -> SROIE dataset. The original 🍩 implementation can be found on [here](https://github.com/clovaai/donut). At [Unstructured.io](https://github.com/Unstructured-IO/unstructured) we are on a mission to build custom preprocessing pipelines for labeling, training, or production ML-ready pipelines 🤩. Come and join us in our public repos and contribute! Each of your contributions and feedback holds great value and is very significant to the community 😊. ''') image_upload = None photo = None with st.sidebar: information = st.radio( "What information inside the 🧾s are you interested in extracting?", ('Receipt Summary', 'Receipt Menu Details', 'Extract all', 'Unstructured.io Parser')) receipt = st.selectbox('Pick one 🧾', ['1', '2', '3', '4', '5', '6'], index=1) # file upload uploaded_file = st.file_uploader("Upload a 🧾") if uploaded_file is not None: # To read file as bytes: image_bytes_data = uploaded_file.getvalue() image_upload = Image.open(BytesIO(image_bytes_data)) #.frombytes('RGBA', (128,128), image_bytes_data, 'raw') # st.write(bytes_data) camera_click = st.button('Use my camera') img_file_buffer = None if camera_click: img_file_buffer = st.camera_input("Take a picture of your receipt!") if img_file_buffer: # To read image file buffer as a PIL Image: photo = Image.open(img_file_buffer) st.info("picture taken!") st.text(f'{information} mode is ON!\nTarget 🧾: {receipt}') # \n(opening image @:./img/receipt-{receipt}.png)') col1, col2 = st.columns(2) if photo: image = photo st.info("photo loaded to image") elif image_upload: image = image_upload else: image = Image.open(f"./img/receipt-{receipt}.jpg") with col1: st.image(image, caption='Your target receipt') if st.button('Parse receipt! 🐍'): with st.spinner(f'baking the 🍩s...'): if information == 'Receipt Summary': processor = DonutProcessor.from_pretrained("unstructuredio/donut-base-sroie") pretrained_model = VisionEncoderDecoderModel.from_pretrained("unstructuredio/donut-base-sroie") task_prompt = f"" device = "cuda" if torch.cuda.is_available() else "cpu" pretrained_model.to(device) elif information == 'Receipt Menu Details': processor = DonutProcessor.from_pretrained("naver-clova-ix/donut-base-finetuned-cord-v2") pretrained_model = VisionEncoderDecoderModel.from_pretrained("naver-clova-ix/donut-base-finetuned-cord-v2") task_prompt = f"" device = "cuda" if torch.cuda.is_available() else "cpu" pretrained_model.to(device) elif information == 'Unstructured.io Parser': processor = DonutProcessor.from_pretrained("unstructuredio/donut-base-labelstudio-A1.0") pretrained_model = VisionEncoderDecoderModel.from_pretrained("unstructuredio/donut-base-labelstudio-A1.0") task_prompt = f"" device = "cuda" if torch.cuda.is_available() else "cpu" pretrained_model.to(device) else: # Extract all processor_a = DonutProcessor.from_pretrained("unstructuredio/donut-base-sroie") processor_b = DonutProcessor.from_pretrained("naver-clova-ix/donut-base-finetuned-cord-v2") pretrained_model_a = VisionEncoderDecoderModel.from_pretrained("unstructuredio/donut-base-sroie") pretrained_model_b = VisionEncoderDecoderModel.from_pretrained("naver-clova-ix/donut-base-finetuned-cord-v2") device = "cuda" if torch.cuda.is_available() else "cpu" with col2: if information == 'Extract all': st.info(f'parsing 🧾 (extracting all)...') pretrained_model, processor, task_prompt = pretrained_model_a, processor_a, f"" pretrained_model.to(device) parsed_receipt_info_a, _ = run_prediction(image) pretrained_model, processor, task_prompt = pretrained_model_b, processor_b, f"" pretrained_model.to(device) parsed_receipt_info_b, _ = run_prediction(image) st.text(f'\nReceipt Summary:') st.json(parsed_receipt_info_a) st.text(f'\nReceipt Menu Details:') st.json(parsed_receipt_info_b) else: st.info(f'parsing 🧾...') parsed_receipt_info, _ = run_prediction(image) st.text(f'\n{information}') st.json(parsed_receipt_info)