adbcode committed on
Commit
e05a1d1
1 Parent(s): 267443f

first draft

.gitattributes CHANGED
@@ -33,3 +33,22 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ images/belgium_2.PNG filter=lfs diff=lfs merge=lfs -text
+ images/estonia.PNG filter=lfs diff=lfs merge=lfs -text
+ images/guiana.PNG filter=lfs diff=lfs merge=lfs -text
+ images/iraq.PNG filter=lfs diff=lfs merge=lfs -text
+ images/ireland.PNG filter=lfs diff=lfs merge=lfs -text
+ images/mali_2.PNG filter=lfs diff=lfs merge=lfs -text
+ images/newzealand_4.PNG filter=lfs diff=lfs merge=lfs -text
+ images/poland_3.PNG filter=lfs diff=lfs merge=lfs -text
+ images/portugal_3.PNG filter=lfs diff=lfs merge=lfs -text
+ images/singapore_3.PNG filter=lfs diff=lfs merge=lfs -text
+ images/spain_3.PNG filter=lfs diff=lfs merge=lfs -text
+ images/spain.PNG filter=lfs diff=lfs merge=lfs -text
+ images/suriname.PNG filter=lfs diff=lfs merge=lfs -text
+ images/switzerland_2.PNG filter=lfs diff=lfs merge=lfs -text
+ images/switzerland_4.PNG filter=lfs diff=lfs merge=lfs -text
+ images/thailand_5.PNG filter=lfs diff=lfs merge=lfs -text
+ images/togo_2.PNG filter=lfs diff=lfs merge=lfs -text
+ images/uk_3.PNG filter=lfs diff=lfs merge=lfs -text
+ images/uk.PNG filter=lfs diff=lfs merge=lfs -text
added_tokens.json ADDED
@@ -0,0 +1 @@
+ {"<sep/>": 57522, "<s_iitcdip>": 57523, "<s_synthdog>": 57524, "<-1/>": 57525, "</s_MachineReadableZone>": 57526, "<s_MachineReadableZone>": 57527, "<s_INPUT_data>": 57528}
app.py ADDED
@@ -0,0 +1,58 @@
+ import argparse
+ import gradio as gr
+ import os
+ import torch
+
+ from donut import DonutModel
+ from PIL import Image
+
+
+ def demo_process_vqa(input_img, question):
+     global pretrained_model, task_prompt, task_name
+     input_img = Image.fromarray(input_img)
+     user_prompt = task_prompt.replace("{user_input}", question)
+     return pretrained_model.inference(input_img, prompt=user_prompt)["predictions"][0]
+
+
+ def demo_process(input_img):
+     global pretrained_model, task_prompt, task_name
+     input_img = Image.fromarray(input_img)
+     best_output = pretrained_model.inference(image=input_img, prompt=task_prompt)["predictions"][0]
+     return best_output["text_sequence"].split(" </s_MachineReadableZone>")[0]
+
+
+ if __name__ == "__main__":
+     parser = argparse.ArgumentParser()
+     parser.add_argument("--task", type=str, default="s_passport")
+     parser.add_argument("--pretrained_path", type=str, default=os.getcwd())
+     parser.add_argument("--port", type=int, default=12345)
+     parser.add_argument("--url", type=str, default="0.0.0.0")
+     parser.add_argument("--sample_img_path", type=str)
+     args, left_argv = parser.parse_known_args()
+
+     task_name = args.task
+     if "docvqa" == task_name:
+         task_prompt = "<s_docvqa><s_question>{user_input}</s_question><s_answer>"
+     else:  # rvlcdip, cord, ...
+         task_prompt = f"<s_{task_name}>"
+
+     example_sample = [os.path.join("images", image) for image in os.listdir("images")]
+     if args.sample_img_path:
+         example_sample.append(args.sample_img_path)
+
+     pretrained_model = DonutModel.from_pretrained(args.pretrained_path)
+
+     if torch.cuda.is_available():
+         pretrained_model.half()
+         device = torch.device("cuda")
+         pretrained_model.to(device)
+
+     pretrained_model.eval()
+
+     gr.Interface(
+         fn=demo_process_vqa if task_name == "docvqa" else demo_process,
+         inputs=["image", "text"] if task_name == "docvqa" else "image",
+         outputs="text",
+         title="Demo of MRZ Extraction model based on 🍩 architecture",
+         examples=example_sample if example_sample else None
+     ).launch()
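
For reference, a minimal sketch (not part of this commit) of running the same inference path without the Gradio UI. The local checkpoint path and the "<s_MachineReadableZone>" task prompt are assumptions based on app.py and added_tokens.json and may need adjusting:

import torch
from PIL import Image

from donut import DonutModel

# Load the fine-tuned checkpoint from the repository root, as app.py does by default.
model = DonutModel.from_pretrained(".")
if torch.cuda.is_available():
    model.half()
    model.to(torch.device("cuda"))
model.eval()

# Run one sample document through the encoder-decoder and print the parsed prediction.
image = Image.open("images/uk.PNG")
result = model.inference(image=image, prompt="<s_MachineReadableZone>")["predictions"][0]
print(result)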
config.json ADDED
@@ -0,0 +1,13 @@
+ {
+   "_name_or_path": ".",
+   "align_long_axis": false,
+   "architectures": [
+     "DonutModel"
+   ],
+   "input_size": [1280, 960],
+   "max_length": 768,
+   "model_type": "donut",
+   "torch_dtype": "float32",
+   "transformers_version": "4.11.3",
+   "window_size": 10
+ }
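
As a rough sketch (not part of this commit), these values map onto the DonutConfig class defined in donut/model.py; the field names match the JSON keys, and max_position_embeddings falls back to max_length when not given:

from donut import DonutConfig

# Mirror config.json in code; encoder_layer and decoder_layer keep their defaults here.
config = DonutConfig(
    input_size=[1280, 960],   # canvas size used by the Swin encoder
    align_long_axis=False,
    window_size=10,
    max_length=768,           # maximum decoded sequence length
)
print(config.max_position_embeddings)  # 768, inherited from max_length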
donut/__init__.py ADDED
@@ -0,0 +1,16 @@
+ """
+ Donut
+ Copyright (c) 2022-present NAVER Corp.
+ MIT License
+ """
+ from .model import DonutConfig, DonutModel
+ from .util import DonutDataset, JSONParseEvaluator, load_json, save_json
+
+ __all__ = [
+     "DonutConfig",
+     "DonutModel",
+     "DonutDataset",
+     "JSONParseEvaluator",
+     "load_json",
+     "save_json",
+ ]
donut/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (565 Bytes).

donut/__pycache__/model.cpython-311.pyc ADDED
Binary file (31.3 kB).

donut/__pycache__/util.cpython-311.pyc ADDED
Binary file (18.1 kB).
donut/_version.py ADDED
@@ -0,0 +1,6 @@
+ """
+ Donut
+ Copyright (c) 2022-present NAVER Corp.
+ MIT License
+ """
+ __version__ = "1.0.9"
donut/model.py ADDED
@@ -0,0 +1,609 @@
+ """
+ Donut
+ Copyright (c) 2022-present NAVER Corp.
+ MIT License
+ """
+ import math
+ import os
+ import re
+ from typing import Any, List, Optional, Union
+
+ import numpy as np
+ import PIL
+ import timm
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+ from PIL import ImageOps
+ from timm.data.constants import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
+ from timm.models.swin_transformer import SwinTransformer
+ from torchvision import transforms
+ from torchvision.transforms.functional import resize, rotate
+ from transformers import MBartConfig, MBartForCausalLM, XLMRobertaTokenizer
+ from transformers.file_utils import ModelOutput
+ from transformers.modeling_utils import PretrainedConfig, PreTrainedModel
+
+
+ class SwinEncoder(nn.Module):
+     r"""
+     Donut encoder based on SwinTransformer
+     Set the initial weights and configuration with a pretrained SwinTransformer and then
+     modify the detailed configurations as a Donut Encoder
+
+     Args:
+         input_size: Input image size (width, height)
+         align_long_axis: Whether to rotate image if height is greater than width
+         window_size: Window size(=patch size) of SwinTransformer
+         encoder_layer: Number of layers of SwinTransformer encoder
+         name_or_path: Name of a pretrained model name either registered in huggingface.co. or saved in local.
+                       otherwise, `swin_base_patch4_window12_384` will be set (using `timm`).
+     """
+
+     def __init__(
+         self,
+         input_size: List[int],
+         align_long_axis: bool,
+         window_size: int,
+         encoder_layer: List[int],
+         name_or_path: Union[str, bytes, os.PathLike] = None,
+     ):
+         super().__init__()
+         self.input_size = input_size
+         self.align_long_axis = align_long_axis
+         self.window_size = window_size
+         self.encoder_layer = encoder_layer
+
+         self.to_tensor = transforms.Compose(
+             [
+                 transforms.ToTensor(),
+                 transforms.Normalize(IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD),
+             ]
+         )
+
+         self.model = SwinTransformer(
+             img_size=self.input_size,
+             depths=self.encoder_layer,
+             window_size=self.window_size,
+             patch_size=4,
+             embed_dim=128,
+             num_heads=[4, 8, 16, 32],
+             num_classes=0,
+         )
+
+         # weight init with swin
+         if not name_or_path:
+             swin_state_dict = timm.create_model("swin_base_patch4_window12_384", pretrained=True).state_dict()
+             new_swin_state_dict = self.model.state_dict()
+             for x in new_swin_state_dict:
+                 if x.endswith("relative_position_index") or x.endswith("attn_mask"):
+                     pass
+                 elif (
+                     x.endswith("relative_position_bias_table")
+                     and self.model.layers[0].blocks[0].attn.window_size[0] != 12
+                 ):
+                     pos_bias = swin_state_dict[x].unsqueeze(0)[0]
+                     old_len = int(math.sqrt(len(pos_bias)))
+                     new_len = int(2 * window_size - 1)
+                     pos_bias = pos_bias.reshape(1, old_len, old_len, -1).permute(0, 3, 1, 2)
+                     pos_bias = F.interpolate(pos_bias, size=(new_len, new_len), mode="bicubic", align_corners=False)
+                     new_swin_state_dict[x] = pos_bias.permute(0, 2, 3, 1).reshape(1, new_len ** 2, -1).squeeze(0)
+                 else:
+                     new_swin_state_dict[x] = swin_state_dict[x]
+             self.model.load_state_dict(new_swin_state_dict)
+
+     def forward(self, x: torch.Tensor) -> torch.Tensor:
+         """
+         Args:
+             x: (batch_size, num_channels, height, width)
+         """
+         x = self.model.patch_embed(x)
+         x = self.model.pos_drop(x)
+         x = self.model.layers(x)
+         return x
+
+     def prepare_input(self, img: PIL.Image.Image, random_padding: bool = False) -> torch.Tensor:
+         """
+         Convert PIL Image to tensor according to specified input_size after following steps below:
+             - resize
+             - rotate (if align_long_axis is True and image is not aligned longer axis with canvas)
+             - pad
+         """
+         img = img.convert("RGB")
+         if self.align_long_axis and (
+             (self.input_size[0] > self.input_size[1] and img.width > img.height)
+             or (self.input_size[0] < self.input_size[1] and img.width < img.height)
+         ):
+             img = rotate(img, angle=-90, expand=True)
+         img = resize(img, min(self.input_size))
+         img.thumbnail((self.input_size[1], self.input_size[0]))
+         delta_width = self.input_size[1] - img.width
+         delta_height = self.input_size[0] - img.height
+         if random_padding:
+             pad_width = np.random.randint(low=0, high=delta_width + 1)
+             pad_height = np.random.randint(low=0, high=delta_height + 1)
+         else:
+             pad_width = delta_width // 2
+             pad_height = delta_height // 2
+         padding = (
+             pad_width,
+             pad_height,
+             delta_width - pad_width,
+             delta_height - pad_height,
+         )
+         return self.to_tensor(ImageOps.expand(img, padding))
+
+
+ class BARTDecoder(nn.Module):
+     """
+     Donut Decoder based on Multilingual BART
+     Set the initial weights and configuration with a pretrained multilingual BART model,
+     and modify the detailed configurations as a Donut decoder
+
+     Args:
+         decoder_layer:
+             Number of layers of BARTDecoder
+         max_position_embeddings:
+             The maximum sequence length to be trained
+         name_or_path:
+             Name of a pretrained model name either registered in huggingface.co. or saved in local,
+             otherwise, `hyunwoongko/asian-bart-ecjk` will be set (using `transformers`)
+     """
+
+     def __init__(
+         self, decoder_layer: int, max_position_embeddings: int, name_or_path: Union[str, bytes, os.PathLike] = None
+     ):
+         super().__init__()
+         self.decoder_layer = decoder_layer
+         self.max_position_embeddings = max_position_embeddings
+
+         self.tokenizer = XLMRobertaTokenizer.from_pretrained(
+             "hyunwoongko/asian-bart-ecjk" if not name_or_path else name_or_path
+         )
+
+         self.model = MBartForCausalLM(
+             config=MBartConfig(
+                 is_decoder=True,
+                 is_encoder_decoder=False,
+                 add_cross_attention=True,
+                 decoder_layers=self.decoder_layer,
+                 max_position_embeddings=self.max_position_embeddings,
+                 vocab_size=len(self.tokenizer),
+                 scale_embedding=True,
+                 add_final_layer_norm=True,
+             )
+         )
+         self.model.forward = self.forward  # to get cross attentions and utilize `generate` function
+
+         self.model.config.is_encoder_decoder = True  # to get cross-attention
+         self.add_special_tokens(["<sep/>"])  # <sep/> is used for representing a list in a JSON
+         self.model.model.decoder.embed_tokens.padding_idx = self.tokenizer.pad_token_id
+         self.model.prepare_inputs_for_generation = self.prepare_inputs_for_inference
+
+         # weight init with asian-bart
+         if not name_or_path:
+             bart_state_dict = MBartForCausalLM.from_pretrained("hyunwoongko/asian-bart-ecjk").state_dict()
+             new_bart_state_dict = self.model.state_dict()
+             for x in new_bart_state_dict:
+                 if x.endswith("embed_positions.weight") and self.max_position_embeddings != 1024:
+                     new_bart_state_dict[x] = torch.nn.Parameter(
+                         self.resize_bart_abs_pos_emb(
+                             bart_state_dict[x],
+                             self.max_position_embeddings
+                             + 2,  # https://github.com/huggingface/transformers/blob/v4.11.3/src/transformers/models/mbart/modeling_mbart.py#L118-L119
+                         )
+                     )
+                 elif x.endswith("embed_tokens.weight") or x.endswith("lm_head.weight"):
+                     new_bart_state_dict[x] = bart_state_dict[x][: len(self.tokenizer), :]
+                 else:
+                     new_bart_state_dict[x] = bart_state_dict[x]
+             self.model.load_state_dict(new_bart_state_dict)
+
+     def add_special_tokens(self, list_of_tokens: List[str]):
+         """
+         Add special tokens to tokenizer and resize the token embeddings
+         """
+         newly_added_num = self.tokenizer.add_special_tokens({"additional_special_tokens": sorted(set(list_of_tokens))})
+         if newly_added_num > 0:
+             self.model.resize_token_embeddings(len(self.tokenizer))
+
+     def prepare_inputs_for_inference(self, input_ids: torch.Tensor, encoder_outputs: torch.Tensor, past=None, use_cache: bool = None, attention_mask: torch.Tensor = None):
+         """
+         Args:
+             input_ids: (batch_size, sequence_length)
+         Returns:
+             input_ids: (batch_size, sequence_length)
+             attention_mask: (batch_size, sequence_length)
+             encoder_hidden_states: (batch_size, sequence_length, embedding_dim)
+         """
+         attention_mask = input_ids.ne(self.tokenizer.pad_token_id).long()
+         if past is not None:
+             input_ids = input_ids[:, -1:]
+         output = {
+             "input_ids": input_ids,
+             "attention_mask": attention_mask,
+             "past_key_values": past,
+             "use_cache": use_cache,
+             "encoder_hidden_states": encoder_outputs.last_hidden_state,
+         }
+         return output
+
+     def forward(
+         self,
+         input_ids,
+         attention_mask: Optional[torch.Tensor] = None,
+         encoder_hidden_states: Optional[torch.Tensor] = None,
+         past_key_values: Optional[torch.Tensor] = None,
+         labels: Optional[torch.Tensor] = None,
+         use_cache: bool = None,
+         output_attentions: Optional[torch.Tensor] = None,
+         output_hidden_states: Optional[torch.Tensor] = None,
+         return_dict: bool = None,
+     ):
+         """
+         A forward function to get cross attentions and utilize `generate` function
+
+         Source:
+             https://github.com/huggingface/transformers/blob/v4.11.3/src/transformers/models/mbart/modeling_mbart.py#L1669-L1810
+
+         Args:
+             input_ids: (batch_size, sequence_length)
+             attention_mask: (batch_size, sequence_length)
+             encoder_hidden_states: (batch_size, sequence_length, hidden_size)
+
+         Returns:
+             loss: (1, )
+             logits: (batch_size, sequence_length, hidden_dim)
+             hidden_states: (batch_size, sequence_length, hidden_size)
+             decoder_attentions: (batch_size, num_heads, sequence_length, sequence_length)
+             cross_attentions: (batch_size, num_heads, sequence_length, sequence_length)
+         """
+         output_attentions = output_attentions if output_attentions is not None else self.model.config.output_attentions
+         output_hidden_states = (
+             output_hidden_states if output_hidden_states is not None else self.model.config.output_hidden_states
+         )
+         return_dict = return_dict if return_dict is not None else self.model.config.use_return_dict
+         outputs = self.model.model.decoder(
+             input_ids=input_ids,
+             attention_mask=attention_mask,
+             encoder_hidden_states=encoder_hidden_states,
+             past_key_values=past_key_values,
+             use_cache=use_cache,
+             output_attentions=output_attentions,
+             output_hidden_states=output_hidden_states,
+             return_dict=return_dict,
+         )
+
+         logits = self.model.lm_head(outputs[0])
+
+         loss = None
+         if labels is not None:
+             loss_fct = nn.CrossEntropyLoss(ignore_index=-100)
+             loss = loss_fct(logits.view(-1, self.model.config.vocab_size), labels.view(-1))
+
+         if not return_dict:
+             output = (logits,) + outputs[1:]
+             return (loss,) + output if loss is not None else output
+
+         return ModelOutput(
+             loss=loss,
+             logits=logits,
+             past_key_values=outputs.past_key_values,
+             hidden_states=outputs.hidden_states,
+             decoder_attentions=outputs.attentions,
+             cross_attentions=outputs.cross_attentions,
+         )
+
+     @staticmethod
+     def resize_bart_abs_pos_emb(weight: torch.Tensor, max_length: int) -> torch.Tensor:
+         """
+         Resize position embeddings
+         Truncate if sequence length of Bart backbone is greater than given max_length,
+         else interpolate to max_length
+         """
+         if weight.shape[0] > max_length:
+             weight = weight[:max_length, ...]
+         else:
+             weight = (
+                 F.interpolate(
+                     weight.permute(1, 0).unsqueeze(0),
+                     size=max_length,
+                     mode="linear",
+                     align_corners=False,
+                 )
+                 .squeeze(0)
+                 .permute(1, 0)
+             )
+         return weight
+
+
+ class DonutConfig(PretrainedConfig):
+     r"""
+     This is the configuration class to store the configuration of a [`DonutModel`]. It is used to
+     instantiate a Donut model according to the specified arguments, defining the model architecture
+
+     Args:
+         input_size:
+             Input image size (canvas size) of Donut.encoder, SwinTransformer in this codebase
+         align_long_axis:
+             Whether to rotate image if height is greater than width
+         window_size:
+             Window size of Donut.encoder, SwinTransformer in this codebase
+         encoder_layer:
+             Depth of each Donut.encoder Encoder layer, SwinTransformer in this codebase
+         decoder_layer:
+             Number of hidden layers in the Donut.decoder, such as BART
+         max_position_embeddings:
+             Trained max position embeddings in the Donut decoder,
+             if not specified, it will have same value with max_length
+         max_length:
+             Max position embeddings(=maximum sequence length) you want to train
+         name_or_path:
+             Name of a pretrained model name either registered in huggingface.co. or saved in local
+     """
+
+     model_type = "donut"
+
+     def __init__(
+         self,
+         input_size: List[int] = [2560, 1920],
+         align_long_axis: bool = False,
+         window_size: int = 10,
+         encoder_layer: List[int] = [2, 2, 14, 2],
+         decoder_layer: int = 4,
+         max_position_embeddings: int = None,
+         max_length: int = 1536,
+         name_or_path: Union[str, bytes, os.PathLike] = "",
+         **kwargs,
+     ):
+         super().__init__()
+         self.input_size = input_size
+         self.align_long_axis = align_long_axis
+         self.window_size = window_size
+         self.encoder_layer = encoder_layer
+         self.decoder_layer = decoder_layer
+         self.max_position_embeddings = max_length if max_position_embeddings is None else max_position_embeddings
+         self.max_length = max_length
+         self.name_or_path = name_or_path
+
+
+ class DonutModel(PreTrainedModel):
+     r"""
+     Donut: an E2E OCR-free Document Understanding Transformer.
+     The encoder maps an input document image into a set of embeddings,
+     the decoder predicts a desired token sequence, that can be converted to a structured format,
+     given a prompt and the encoder output embeddings
+     """
+     config_class = DonutConfig
+     base_model_prefix = "donut"
+
+     def __init__(self, config: DonutConfig):
+         super().__init__(config)
+         self.config = config
+         self.encoder = SwinEncoder(
+             input_size=self.config.input_size,
+             align_long_axis=self.config.align_long_axis,
+             window_size=self.config.window_size,
+             encoder_layer=self.config.encoder_layer,
+             name_or_path=self.config.name_or_path,
+         )
+         self.decoder = BARTDecoder(
+             max_position_embeddings=self.config.max_position_embeddings,
+             decoder_layer=self.config.decoder_layer,
+             name_or_path=self.config.name_or_path,
+         )
+
+     def forward(self, image_tensors: torch.Tensor, decoder_input_ids: torch.Tensor, decoder_labels: torch.Tensor):
+         """
+         Calculate a loss given an input image and a desired token sequence,
+         the model will be trained in a teacher-forcing manner
+
+         Args:
+             image_tensors: (batch_size, num_channels, height, width)
+             decoder_input_ids: (batch_size, sequence_length)
+             decoder_labels: (batch_size, sequence_length)
+         """
+         encoder_outputs = self.encoder(image_tensors)
+         decoder_outputs = self.decoder(
+             input_ids=decoder_input_ids,
+             encoder_hidden_states=encoder_outputs,
+             labels=decoder_labels,
+         )
+         return decoder_outputs
+
+     def inference(
+         self,
+         image: PIL.Image = None,
+         prompt: str = None,
+         image_tensors: Optional[torch.Tensor] = None,
+         prompt_tensors: Optional[torch.Tensor] = None,
+         return_json: bool = True,
+         return_attentions: bool = False,
+     ):
+         """
+         Generate a token sequence in an auto-regressive manner,
+         the generated token sequence is converted into an ordered JSON format
+
+         Args:
+             image: input document image (PIL.Image)
+             prompt: task prompt (string) to guide Donut Decoder generation
+             image_tensors: (1, num_channels, height, width)
+                 convert image to tensor if image_tensors is not fed
+             prompt_tensors: (1, sequence_length)
+                 convert prompt to tensor if prompt_tensors is not fed
+         """
+         # prepare backbone inputs (image and prompt)
+         if image is None and image_tensors is None:
+             raise ValueError("Expected either image or image_tensors")
+         if all(v is None for v in {prompt, prompt_tensors}):
+             raise ValueError("Expected either prompt or prompt_tensors")
+
+         if image_tensors is None:
+             image_tensors = self.encoder.prepare_input(image).unsqueeze(0)
+
+         if self.device.type == "cuda":  # half is not compatible in cpu implementation.
+             image_tensors = image_tensors.half()
+             image_tensors = image_tensors.to(self.device)
+
+         if prompt_tensors is None:
+             prompt_tensors = self.decoder.tokenizer(prompt, add_special_tokens=False, return_tensors="pt")["input_ids"]
+
+         prompt_tensors = prompt_tensors.to(self.device)
+
+         last_hidden_state = self.encoder(image_tensors)
+         if self.device.type != "cuda":
+             last_hidden_state = last_hidden_state.to(torch.float32)
+
+         encoder_outputs = ModelOutput(last_hidden_state=last_hidden_state, attentions=None)
+
+         if len(encoder_outputs.last_hidden_state.size()) == 1:
+             encoder_outputs.last_hidden_state = encoder_outputs.last_hidden_state.unsqueeze(0)
+         if len(prompt_tensors.size()) == 1:
+             prompt_tensors = prompt_tensors.unsqueeze(0)
+
+         # get decoder output
+         decoder_output = self.decoder.model.generate(
+             decoder_input_ids=prompt_tensors,
+             encoder_outputs=encoder_outputs,
+             max_length=self.config.max_length,
+             early_stopping=True,
+             pad_token_id=self.decoder.tokenizer.pad_token_id,
+             eos_token_id=self.decoder.tokenizer.eos_token_id,
+             use_cache=True,
+             num_beams=1,
+             bad_words_ids=[[self.decoder.tokenizer.unk_token_id]],
+             return_dict_in_generate=True,
+             output_attentions=return_attentions,
+         )
+
+         output = {"predictions": list()}
+         for seq in self.decoder.tokenizer.batch_decode(decoder_output.sequences):
+             seq = seq.replace(self.decoder.tokenizer.eos_token, "").replace(self.decoder.tokenizer.pad_token, "")
+             seq = re.sub(r"<.*?>", "", seq, count=1).strip()  # remove first task start token
+             if return_json:
+                 output["predictions"].append(self.token2json(seq))
+             else:
+                 output["predictions"].append(seq)
+
+         if return_attentions:
+             output["attentions"] = {
+                 "self_attentions": decoder_output.decoder_attentions,
+                 "cross_attentions": decoder_output.cross_attentions,
+             }
+
+         return output
+
+     def json2token(self, obj: Any, update_special_tokens_for_json_key: bool = True, sort_json_key: bool = True):
+         """
+         Convert an ordered JSON object into a token sequence
+         """
+         if type(obj) == dict:
+             if len(obj) == 1 and "text_sequence" in obj:
+                 return obj["text_sequence"]
+             else:
+                 output = ""
+                 if sort_json_key:
+                     keys = sorted(obj.keys(), reverse=True)
+                 else:
+                     keys = obj.keys()
+                 for k in keys:
+                     if update_special_tokens_for_json_key:
+                         self.decoder.add_special_tokens([fr"<s_{k}>", fr"</s_{k}>"])
+                     output += (
+                         fr"<s_{k}>"
+                         + self.json2token(obj[k], update_special_tokens_for_json_key, sort_json_key)
+                         + fr"</s_{k}>"
+                     )
+                 return output
+         elif type(obj) == list:
+             return r"<sep/>".join(
+                 [self.json2token(item, update_special_tokens_for_json_key, sort_json_key) for item in obj]
+             )
+         else:
+             obj = str(obj)
+             if f"<{obj}/>" in self.decoder.tokenizer.all_special_tokens:
+                 obj = f"<{obj}/>"  # for categorical special tokens
+             return obj
+
+     def token2json(self, tokens, is_inner_value=False):
+         """
+         Convert a (generated) token sequence into an ordered JSON format
+         """
+         output = dict()
+
+         while tokens:
+             start_token = re.search(r"<s_(.*?)>", tokens, re.IGNORECASE)
+             if start_token is None:
+                 break
+             key = start_token.group(1)
+             end_token = re.search(fr"</s_{key}>", tokens, re.IGNORECASE)
+             start_token = start_token.group()
+             if end_token is None:
+                 tokens = tokens.replace(start_token, "")
+             else:
+                 end_token = end_token.group()
+                 start_token_escaped = re.escape(start_token)
+                 end_token_escaped = re.escape(end_token)
+                 content = re.search(f"{start_token_escaped}(.*?){end_token_escaped}", tokens, re.IGNORECASE)
+                 if content is not None:
+                     content = content.group(1).strip()
+                     if r"<s_" in content and r"</s_" in content:  # non-leaf node
+                         value = self.token2json(content, is_inner_value=True)
+                         if value:
+                             if len(value) == 1:
+                                 value = value[0]
+                             output[key] = value
+                     else:  # leaf nodes
+                         output[key] = []
+                         for leaf in content.split(r"<sep/>"):
+                             leaf = leaf.strip()
+                             if (
+                                 leaf in self.decoder.tokenizer.get_added_vocab()
+                                 and leaf[0] == "<"
+                                 and leaf[-2:] == "/>"
+                             ):
+                                 leaf = leaf[1:-2]  # for categorical special tokens
+                             output[key].append(leaf)
+                         if len(output[key]) == 1:
+                             output[key] = output[key][0]
+
+                 tokens = tokens[tokens.find(end_token) + len(end_token) :].strip()
+                 if tokens[:6] == r"<sep/>":  # non-leaf nodes
+                     return [output] + self.token2json(tokens[6:], is_inner_value=True)
+
+         if len(output):
+             return [output] if is_inner_value else output
+         else:
+             return [] if is_inner_value else {"text_sequence": tokens}
+
+     @classmethod
+     def from_pretrained(
+         cls,
+         pretrained_model_name_or_path: Union[str, bytes, os.PathLike],
+         *model_args,
+         **kwargs,
+     ):
+         r"""
+         Instantiate a pretrained donut model from a pre-trained model configuration
+
+         Args:
+             pretrained_model_name_or_path:
+                 Name of a pretrained model name either registered in huggingface.co. or saved in local,
+                 e.g., `naver-clova-ix/donut-base`, or `naver-clova-ix/donut-base-finetuned-rvlcdip`
+         """
+         model = super(DonutModel, cls).from_pretrained(pretrained_model_name_or_path, revision="official", *model_args, **kwargs)
+
+         # truncate or interpolate position embeddings of donut decoder
+         max_length = kwargs.get("max_length", model.config.max_position_embeddings)
+         if (
+             max_length != model.config.max_position_embeddings
+         ):  # if the trained model's max_length differs from the max_length you want to train
+             model.decoder.model.model.decoder.embed_positions.weight = torch.nn.Parameter(
+                 model.decoder.resize_bart_abs_pos_emb(
+                     model.decoder.model.model.decoder.embed_positions.weight,
+                     max_length
+                     + 2,  # https://github.com/huggingface/transformers/blob/v4.11.3/src/transformers/models/mbart/modeling_mbart.py#L118-L119
+                 )
+             )
+             model.config.max_position_embeddings = max_length
+
+         return model
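
A minimal sketch (not part of this commit) of the json2token / token2json round trip the model relies on; the "MachineReadableZone" field name is assumed from added_tokens.json and the MRZ string is invented:

from donut import DonutModel

model = DonutModel.from_pretrained(".")  # local checkpoint, as in app.py

# Ground truth as an ordered JSON object (the MRZ value here is made up).
gt = {"MachineReadableZone": "P<UTOEXAMPLE<<GIVEN<NAME<<<<<<<<<<<<<<<<<<<<"}

# json2token wraps each key in <s_key> ... </s_key> markers.
seq = model.json2token(gt, update_special_tokens_for_json_key=False)
print(seq)  # <s_MachineReadableZone>P<UTO...</s_MachineReadableZone>

# token2json reverses the mapping back into a dict.
print(model.token2json(seq))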
donut/util.py ADDED
@@ -0,0 +1,344 @@
+ """
+ Donut
+ Copyright (c) 2022-present NAVER Corp.
+ MIT License
+ """
+ import json
+ import os
+ import random
+ from collections import defaultdict
+ from typing import Any, Dict, List, Tuple, Union
+
+ import torch
+ import zss
+ from datasets import load_dataset
+ from nltk import edit_distance
+ from torch.utils.data import Dataset
+ from transformers.modeling_utils import PreTrainedModel
+ from zss import Node
+
+
+ def save_json(write_path: Union[str, bytes, os.PathLike], save_obj: Any):
+     with open(write_path, "w") as f:
+         json.dump(save_obj, f)
+
+
+ def load_json(json_path: Union[str, bytes, os.PathLike]):
+     with open(json_path, "r") as f:
+         return json.load(f)
+
+
+ class DonutDataset(Dataset):
+     """
+     DonutDataset which is saved in huggingface datasets format. (see details in https://huggingface.co/docs/datasets)
+     Each row consists of an image path (png/jpg/jpeg) and gt data (json/jsonl/txt),
+     and it will be converted into input_tensor (vectorized image) and input_ids (tokenized string)
+
+     Args:
+         dataset_name_or_path: name of dataset (available at huggingface.co/datasets) or the path containing image files and metadata.jsonl
+         ignore_id: ignore_index for torch.nn.CrossEntropyLoss
+         task_start_token: the special token to be fed to the decoder to conduct the target task
+     """
+
+     def __init__(
+         self,
+         dataset_name_or_path: str,
+         donut_model: PreTrainedModel,
+         max_length: int,
+         split: str = "train",
+         ignore_id: int = -100,
+         task_start_token: str = "<s>",
+         prompt_end_token: str = None,
+         sort_json_key: bool = True,
+     ):
+         super().__init__()
+
+         self.donut_model = donut_model
+         self.max_length = max_length
+         self.split = split
+         self.ignore_id = ignore_id
+         self.task_start_token = task_start_token
+         self.prompt_end_token = prompt_end_token if prompt_end_token else task_start_token
+         self.sort_json_key = sort_json_key
+
+         self.dataset = load_dataset(dataset_name_or_path, split=self.split)
+         self.dataset_length = len(self.dataset)
+
+         self.gt_token_sequences = []
+         # print(self.dataset)
+         for sample in self.dataset:
+             # print(sample)
+             # print(sample['ground_truth'])
+             ground_truth = json.loads(sample["ground_truth"])
+             # print(ground_truth)
+             if "gt_parses" in ground_truth:  # when multiple ground truths are available, e.g., docvqa
+                 assert isinstance(ground_truth["gt_parses"], list)
+                 gt_jsons = ground_truth["gt_parses"]
+             else:
+                 assert "gt_parse" in ground_truth and isinstance(ground_truth["gt_parse"], dict)
+                 gt_jsons = [ground_truth["gt_parse"]]
+
+             self.gt_token_sequences.append(
+                 [
+                     task_start_token
+                     + self.donut_model.json2token(
+                         gt_json,
+                         update_special_tokens_for_json_key=self.split == "train",
+                         sort_json_key=self.sort_json_key,
+                     )
+                     + self.donut_model.decoder.tokenizer.eos_token
+                     for gt_json in gt_jsons  # load json from list of json
+                 ]
+             )
+
+         self.donut_model.decoder.add_special_tokens([self.task_start_token, self.prompt_end_token])
+         self.prompt_end_token_id = self.donut_model.decoder.tokenizer.convert_tokens_to_ids(self.prompt_end_token)
+
+     def __len__(self) -> int:
+         return self.dataset_length
+
+     def __getitem__(self, idx: int) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+         """
+         Load image from image_path of given dataset_path and convert into input_tensor and labels.
+         Convert gt data into input_ids (tokenized string)
+
+         Returns:
+             input_tensor : preprocessed image
+             input_ids : tokenized gt_data
+             labels : masked labels (model doesn't need to predict prompt and pad token)
+         """
+         sample = self.dataset[idx]
+
+         # input_tensor
+         input_tensor = self.donut_model.encoder.prepare_input(sample["image"], random_padding=self.split == "train")
+
+         # input_ids
+         processed_parse = random.choice(self.gt_token_sequences[idx])  # can be more than one, e.g., DocVQA Task 1
+         input_ids = self.donut_model.decoder.tokenizer(
+             processed_parse,
+             add_special_tokens=False,
+             max_length=self.max_length,
+             padding="max_length",
+             truncation=True,
+             return_tensors="pt",
+         )["input_ids"].squeeze(0)
+
+         if self.split == "train":
+             labels = input_ids.clone()
+             labels[
+                 labels == self.donut_model.decoder.tokenizer.pad_token_id
+             ] = self.ignore_id  # model doesn't need to predict pad token
+             labels[
+                 : torch.nonzero(labels == self.prompt_end_token_id).sum() + 1
+             ] = self.ignore_id  # model doesn't need to predict prompt (for VQA)
+             return input_tensor, input_ids, labels
+         else:
+             prompt_end_index = torch.nonzero(
+                 input_ids == self.prompt_end_token_id
+             ).sum()  # return prompt end index instead of target output labels
+             return input_tensor, input_ids, prompt_end_index, processed_parse
+
+
+ class JSONParseEvaluator:
+     """
+     Calculate n-TED (Normalized Tree Edit Distance) based accuracy and F1 accuracy score
+     """
+
+     @staticmethod
+     def flatten(data: dict):
+         """
+         Convert Dictionary into Non-nested Dictionary
+         Example:
+             input(dict)
+                 {
+                     "menu": [
+                         {"name" : ["cake"], "count" : ["2"]},
+                         {"name" : ["juice"], "count" : ["1"]},
+                     ]
+                 }
+             output(list)
+                 [
+                     ("menu.name", "cake"),
+                     ("menu.count", "2"),
+                     ("menu.name", "juice"),
+                     ("menu.count", "1"),
+                 ]
+         """
+         flatten_data = list()
+
+         def _flatten(value, key=""):
+             if type(value) is dict:
+                 for child_key, child_value in value.items():
+                     _flatten(child_value, f"{key}.{child_key}" if key else child_key)
+             elif type(value) is list:
+                 for value_item in value:
+                     _flatten(value_item, key)
+             else:
+                 flatten_data.append((key, value))
+
+         _flatten(data)
+         return flatten_data
+
+     @staticmethod
+     def update_cost(node1: Node, node2: Node):
+         """
+         Update cost for tree edit distance.
+         If both are leaf node, calculate string edit distance between two labels (special token '<leaf>' will be ignored).
+         If one of them is leaf node, cost is length of string in leaf node + 1.
+         If neither are leaf node, cost is 0 if label1 is same with label2 otherwise 1
+         """
+         label1 = node1.label
+         label2 = node2.label
+         label1_leaf = "<leaf>" in label1
+         label2_leaf = "<leaf>" in label2
+         if label1_leaf == True and label2_leaf == True:
+             return edit_distance(label1.replace("<leaf>", ""), label2.replace("<leaf>", ""))
+         elif label1_leaf == False and label2_leaf == True:
+             return 1 + len(label2.replace("<leaf>", ""))
+         elif label1_leaf == True and label2_leaf == False:
+             return 1 + len(label1.replace("<leaf>", ""))
+         else:
+             return int(label1 != label2)
+
+     @staticmethod
+     def insert_and_remove_cost(node: Node):
+         """
+         Insert and remove cost for tree edit distance.
+         If leaf node, cost is length of label name.
+         Otherwise, 1
+         """
+         label = node.label
+         if "<leaf>" in label:
+             return len(label.replace("<leaf>", ""))
+         else:
+             return 1
+
+     def normalize_dict(self, data: Union[Dict, List, Any]):
+         """
+         Sort by value, while iterate over element if data is list
+         """
+         if not data:
+             return {}
+
+         if isinstance(data, dict):
+             new_data = dict()
+             for key in sorted(data.keys(), key=lambda k: (len(k), k)):
+                 value = self.normalize_dict(data[key])
+                 if value:
+                     if not isinstance(value, list):
+                         value = [value]
+                     new_data[key] = value
+
+         elif isinstance(data, list):
+             if all(isinstance(item, dict) for item in data):
+                 new_data = []
+                 for item in data:
+                     item = self.normalize_dict(item)
+                     if item:
+                         new_data.append(item)
+             else:
+                 new_data = [str(item).strip() for item in data if type(item) in {str, int, float} and str(item).strip()]
+         else:
+             new_data = [str(data).strip()]
+
+         return new_data
+
+     def cal_f1(self, preds: List[dict], answers: List[dict]):
+         """
+         Calculate global F1 accuracy score (field-level, micro-averaged) by counting all true positives, false negatives and false positives
+         """
+         total_tp, total_fn_or_fp = 0, 0
+         for pred, answer in zip(preds, answers):
+             pred, answer = self.flatten(self.normalize_dict(pred)), self.flatten(self.normalize_dict(answer))
+             for field in pred:
+                 if field in answer:
+                     total_tp += 1
+                     answer.remove(field)
+                 else:
+                     total_fn_or_fp += 1
+             total_fn_or_fp += len(answer)
+         return total_tp / (total_tp + total_fn_or_fp / 2)
+
+     def construct_tree_from_dict(self, data: Union[Dict, List], node_name: str = None):
+         """
+         Convert Dictionary into Tree
+
+         Example:
+             input(dict)
+
+                 {
+                     "menu": [
+                         {"name" : ["cake"], "count" : ["2"]},
+                         {"name" : ["juice"], "count" : ["1"]},
+                     ]
+                 }
+
+             output(tree)
+                                 <root>
+                                   |
+                                  menu
+                                 /    \
+                        <subtree>      <subtree>
+                        /      |        |      \
+                     name    count    name    count
+                      /        |        |        \
+                 <leaf>cake  <leaf>2  <leaf>juice  <leaf>1
+         """
+         if node_name is None:
+             node_name = "<root>"
+
+         node = Node(node_name)
+
+         if isinstance(data, dict):
+             for key, value in data.items():
+                 kid_node = self.construct_tree_from_dict(value, key)
+                 node.addkid(kid_node)
+         elif isinstance(data, list):
+             if all(isinstance(item, dict) for item in data):
+                 for item in data:
+                     kid_node = self.construct_tree_from_dict(
+                         item,
+                         "<subtree>",
+                     )
+                     node.addkid(kid_node)
+             else:
+                 for item in data:
+                     node.addkid(Node(f"<leaf>{item}"))
+         else:
+             raise Exception(data, node_name)
+         return node
+
+     def cal_acc(self, pred: dict, answer: dict):
+         """
+         Calculate normalized tree edit distance (nTED) based accuracy.
+         1) Construct tree from dict,
+         2) Get tree distance with insert/remove/update cost,
+         3) Divide distance with GT tree size (i.e., nTED),
+         4) Calculate nTED-based accuracy (= max(1 - nTED, 0)).
+         """
+         pred = self.construct_tree_from_dict(self.normalize_dict(pred))
+         answer = self.construct_tree_from_dict(self.normalize_dict(answer))
+         return max(
+             0,
+             1
+             - (
+                 zss.distance(
+                     pred,
+                     answer,
+                     get_children=zss.Node.get_children,
+                     insert_cost=self.insert_and_remove_cost,
+                     remove_cost=self.insert_and_remove_cost,
+                     update_cost=self.update_cost,
+                     return_operations=False,
+                 )
+                 / zss.distance(
+                     self.construct_tree_from_dict(self.normalize_dict({})),
+                     answer,
+                     get_children=zss.Node.get_children,
+                     insert_cost=self.insert_and_remove_cost,
+                     remove_cost=self.insert_and_remove_cost,
+                     update_cost=self.update_cost,
+                     return_operations=False,
+                 )
+             ),
+         )
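
A minimal sketch (not part of this commit) of scoring a prediction against ground truth with JSONParseEvaluator; the field name and values are hypothetical:

from donut import JSONParseEvaluator

evaluator = JSONParseEvaluator()

pred = {"MachineReadableZone": "P<UTOEXAMPLE<<GIVEN<NAME"}
answer = {"MachineReadableZone": "P<UTOEXAMPLE<<GIVEN<NAME"}

# Tree-edit-distance based accuracy in [0, 1]; identical dicts score 1.0.
print(evaluator.cal_acc(pred, answer))

# Field-level, micro-averaged F1 over a batch of predictions.
print(evaluator.cal_f1([pred], [answer]))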
images/belgium_2.PNG ADDED

Git LFS Details

  • SHA256: a32cbc9e525058ea8e69d2bfb333800109bc455a0b5c93f87266308cf198b467
  • Pointer size: 132 Bytes
  • Size of remote file: 1.22 MB
images/denmark_2.jpeg ADDED
images/estonia.PNG ADDED

Git LFS Details

  • SHA256: e4bfb2b8aec225e6a0948656a36a7fa88c74e4812549ed239fcf507910722289
  • Pointer size: 132 Bytes
  • Size of remote file: 2.29 MB
images/guiana.PNG ADDED

Git LFS Details

  • SHA256: 75745e82e312388f3fa4b74b821ef1bf152cea71abab00ee47232440ccac0ce3
  • Pointer size: 132 Bytes
  • Size of remote file: 3.21 MB
images/iraq.PNG ADDED

Git LFS Details

  • SHA256: 635cda83bf311f63b3d142074d6c0e60ad7456c327c6c0ebb13f208941ceedd2
  • Pointer size: 132 Bytes
  • Size of remote file: 3.04 MB
images/ireland.PNG ADDED

Git LFS Details

  • SHA256: 6297002ae1bb56b08c5056b38cda9481fc30c76129f1c5ce8fe83f564a051079
  • Pointer size: 132 Bytes
  • Size of remote file: 1.55 MB
images/mali_2.PNG ADDED

Git LFS Details

  • SHA256: eebf7ab32c68fde48cd47cf29737a717f44b78c0fe59cca44f80514255c2ecc3
  • Pointer size: 132 Bytes
  • Size of remote file: 2.88 MB
images/newzealand_4.PNG ADDED

Git LFS Details

  • SHA256: d7914516d87e16f124280c9f228d39ba040f015bf0a7deb63ee5875cb3bc7248
  • Pointer size: 132 Bytes
  • Size of remote file: 2.26 MB
images/poland_3.PNG ADDED

Git LFS Details

  • SHA256: 8198fe99ac94721e5a356a91bd468a25046dbe72ad57fd0479d48f5d98c89300
  • Pointer size: 132 Bytes
  • Size of remote file: 2.84 MB
images/portugal_3.PNG ADDED

Git LFS Details

  • SHA256: 33722b34eb5a9d90a6ba8b8197fa140dee30794cb4fe72b41c236c9157fa9512
  • Pointer size: 132 Bytes
  • Size of remote file: 2.5 MB
images/singapore_3.PNG ADDED

Git LFS Details

  • SHA256: e21e503f13a7a5617b62c2c3b3fa2c01ac4a0e88768fbb46d758b78df92e36b5
  • Pointer size: 132 Bytes
  • Size of remote file: 2.29 MB
images/spain.PNG ADDED

Git LFS Details

  • SHA256: 11f06c0e606160d2bdf6d2fc9dbf9bff92bc2007c9fd29b61f50a847fadb1f63
  • Pointer size: 132 Bytes
  • Size of remote file: 1.63 MB
images/spain_3.PNG ADDED

Git LFS Details

  • SHA256: 43da95c507550f19ec6bb454e094450cda49f3f9ba223b7f6fecefce10b29b35
  • Pointer size: 132 Bytes
  • Size of remote file: 1.51 MB
images/suriname.PNG ADDED

Git LFS Details

  • SHA256: ee2b410ed207089f1c4c1f93a0ea9be787883085a4241eeb33a332d6775e66c8
  • Pointer size: 132 Bytes
  • Size of remote file: 3.24 MB
images/switzerland_2.PNG ADDED

Git LFS Details

  • SHA256: 2422bf27ebb84709343fcee203c908e8f53ff790bc9d02b685a6ae9cb880b2c8
  • Pointer size: 132 Bytes
  • Size of remote file: 2.76 MB
images/switzerland_4.PNG ADDED

Git LFS Details

  • SHA256: 2be5228a1d78c48e569a61c2d048a59ea7aec5dfe668ffc111ceca2d57e9eb18
  • Pointer size: 132 Bytes
  • Size of remote file: 2.77 MB
images/thailand_5.PNG ADDED

Git LFS Details

  • SHA256: 744a0e9bea866a1ec409a42c6401e43bf9f56854881eb33b9bdbef96b74ed6ae
  • Pointer size: 132 Bytes
  • Size of remote file: 2.92 MB
images/togo_2.PNG ADDED

Git LFS Details

  • SHA256: bfa721e36d28fffa7f815139b22b7cbebd7dfb93328f503a75121b4e903e7453
  • Pointer size: 132 Bytes
  • Size of remote file: 2.44 MB
images/uk.PNG ADDED

Git LFS Details

  • SHA256: d43ca4910764f0a96fc6d5d240c968b96ae82bbe60e3db1ae7b01b218962ca04
  • Pointer size: 132 Bytes
  • Size of remote file: 2.07 MB
images/uk_3.PNG ADDED

Git LFS Details

  • SHA256: 8470ce35a28c327cb1bd4a043bd6154d31f4acdd5a98b75c737c8e32bf73c32d
  • Pointer size: 132 Bytes
  • Size of remote file: 2.14 MB
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:8d1ac44133e20fc3af447c27598b241669b8aa475954bec445b37c85eae9c88a
+ size 858374659
requirements.txt ADDED
@@ -0,0 +1,6 @@
+ torch
+ donut-python
+ timm==0.6.13
+ transformers==4.25.1
+ gradio
+ Pillow
sentencepiece.bpe.model ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:cb9e3dce4c326195d08fc3dd0f7e2eee1da8595c847bf4c1a9c78b7a82d47e2d
+ size 1296245
special_tokens_map.json ADDED
@@ -0,0 +1 @@
+ {"bos_token": "<s>", "eos_token": "</s>", "unk_token": "<unk>", "sep_token": "</s>", "pad_token": "<pad>", "cls_token": "<s>", "mask_token": {"content": "<mask>", "single_word": false, "lstrip": true, "rstrip": false, "normalized": true}, "additional_special_tokens": ["<s_INPUT_data>"]}
tokenizer_config.json ADDED
@@ -0,0 +1 @@
+ {"bos_token": "<s>", "eos_token": "</s>", "unk_token": "<unk>", "sep_token": "</s>", "cls_token": "<s>", "pad_token": "<pad>", "mask_token": {"content": "<mask>", "single_word": false, "lstrip": true, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "sp_model_kwargs": {}, "from_slow": true, "name_or_path": "naver-clova-ix/donut-base", "processor_class": "DonutProcessor", "special_tokens_map_file": null, "tokenizer_file": "/root/.cache/huggingface/transformers/8dff5958cbdb9de4188d643398d5d92bebb82976ce97e6f741b4793e21600485.01bf49938a78cb9ef1792abc3a5829ec39a7887935548bf42fd8d76bf07f15d8", "tokenizer_class": "XLMRobertaTokenizer"}