ragavsachdeva
committed on
Commit
•
b36f7c2
1
Parent(s):
7bb1e4c
Upload model
Browse files- config.json +475 -0
- configuration_magi.py +38 -0
- modelling_magi.py +486 -0
- processing_magi.py +274 -0
- pytorch_model.bin +3 -0
- utils.py +391 -0
config.json
ADDED
@@ -0,0 +1,475 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"_name_or_path": "to_push",
|
3 |
+
"architectures": [
|
4 |
+
"MagiModel"
|
5 |
+
],
|
6 |
+
"auto_map": {
|
7 |
+
"AutoConfig": "configuration_magi.MagiConfig",
|
8 |
+
"AutoModel": "modelling_magi.MagiModel"
|
9 |
+
},
|
10 |
+
"crop_embedding_image_preprocessing_config": {
|
11 |
+
"_processor_class": null,
|
12 |
+
"do_normalize": true,
|
13 |
+
"do_rescale": true,
|
14 |
+
"do_resize": true,
|
15 |
+
"image_mean": [
|
16 |
+
0.485,
|
17 |
+
0.456,
|
18 |
+
0.406
|
19 |
+
],
|
20 |
+
"image_processor_type": "ViTImageProcessor",
|
21 |
+
"image_std": [
|
22 |
+
0.229,
|
23 |
+
0.224,
|
24 |
+
0.225
|
25 |
+
],
|
26 |
+
"resample": 2,
|
27 |
+
"rescale_factor": 0.00392156862745098,
|
28 |
+
"size": {
|
29 |
+
"height": 224,
|
30 |
+
"width": 224
|
31 |
+
}
|
32 |
+
},
|
33 |
+
"crop_embedding_model_config": {
|
34 |
+
"_name_or_path": "facebook/vit-mae-base",
|
35 |
+
"add_cross_attention": false,
|
36 |
+
"architectures": [
|
37 |
+
"ViTMAEForPreTraining"
|
38 |
+
],
|
39 |
+
"attention_probs_dropout_prob": 0.0,
|
40 |
+
"bad_words_ids": null,
|
41 |
+
"begin_suppress_tokens": null,
|
42 |
+
"bos_token_id": null,
|
43 |
+
"chunk_size_feed_forward": 0,
|
44 |
+
"cross_attention_hidden_size": null,
|
45 |
+
"decoder_hidden_size": 512,
|
46 |
+
"decoder_intermediate_size": 2048,
|
47 |
+
"decoder_num_attention_heads": 16,
|
48 |
+
"decoder_num_hidden_layers": 8,
|
49 |
+
"decoder_start_token_id": null,
|
50 |
+
"diversity_penalty": 0.0,
|
51 |
+
"do_sample": false,
|
52 |
+
"early_stopping": false,
|
53 |
+
"encoder_no_repeat_ngram_size": 0,
|
54 |
+
"eos_token_id": null,
|
55 |
+
"exponential_decay_length_penalty": null,
|
56 |
+
"finetuning_task": null,
|
57 |
+
"forced_bos_token_id": null,
|
58 |
+
"forced_eos_token_id": null,
|
59 |
+
"hidden_act": "gelu",
|
60 |
+
"hidden_dropout_prob": 0.0,
|
61 |
+
"hidden_size": 768,
|
62 |
+
"id2label": {
|
63 |
+
"0": "LABEL_0",
|
64 |
+
"1": "LABEL_1"
|
65 |
+
},
|
66 |
+
"image_size": 224,
|
67 |
+
"initializer_range": 0.02,
|
68 |
+
"intermediate_size": 3072,
|
69 |
+
"is_decoder": false,
|
70 |
+
"is_encoder_decoder": false,
|
71 |
+
"label2id": {
|
72 |
+
"LABEL_0": 0,
|
73 |
+
"LABEL_1": 1
|
74 |
+
},
|
75 |
+
"layer_norm_eps": 1e-12,
|
76 |
+
"length_penalty": 1.0,
|
77 |
+
"mask_ratio": 0.75,
|
78 |
+
"max_length": 20,
|
79 |
+
"min_length": 0,
|
80 |
+
"model_type": "",
|
81 |
+
"no_repeat_ngram_size": 0,
|
82 |
+
"norm_pix_loss": false,
|
83 |
+
"num_attention_heads": 12,
|
84 |
+
"num_beam_groups": 1,
|
85 |
+
"num_beams": 1,
|
86 |
+
"num_channels": 3,
|
87 |
+
"num_hidden_layers": 12,
|
88 |
+
"num_return_sequences": 1,
|
89 |
+
"output_attentions": false,
|
90 |
+
"output_hidden_states": false,
|
91 |
+
"output_scores": false,
|
92 |
+
"pad_token_id": null,
|
93 |
+
"patch_size": 16,
|
94 |
+
"prefix": null,
|
95 |
+
"problem_type": null,
|
96 |
+
"pruned_heads": {},
|
97 |
+
"qkv_bias": true,
|
98 |
+
"remove_invalid_values": false,
|
99 |
+
"repetition_penalty": 1.0,
|
100 |
+
"return_dict": true,
|
101 |
+
"return_dict_in_generate": false,
|
102 |
+
"sep_token_id": null,
|
103 |
+
"suppress_tokens": null,
|
104 |
+
"task_specific_params": null,
|
105 |
+
"temperature": 1.0,
|
106 |
+
"tf_legacy_loss": false,
|
107 |
+
"tie_encoder_decoder": false,
|
108 |
+
"tie_word_embeddings": true,
|
109 |
+
"tokenizer_class": null,
|
110 |
+
"top_k": 50,
|
111 |
+
"top_p": 1.0,
|
112 |
+
"torch_dtype": "float32",
|
113 |
+
"torchscript": false,
|
114 |
+
"typical_p": 1.0,
|
115 |
+
"use_bfloat16": false
|
116 |
+
},
|
117 |
+
"detection_image_preprocessing_config": {
|
118 |
+
"_processor_class": null,
|
119 |
+
"do_normalize": true,
|
120 |
+
"do_pad": true,
|
121 |
+
"do_rescale": true,
|
122 |
+
"do_resize": true,
|
123 |
+
"format": "coco_detection",
|
124 |
+
"image_mean": [
|
125 |
+
0.485,
|
126 |
+
0.456,
|
127 |
+
0.406
|
128 |
+
],
|
129 |
+
"image_processor_type": "ConditionalDetrImageProcessor",
|
130 |
+
"image_std": [
|
131 |
+
0.229,
|
132 |
+
0.224,
|
133 |
+
0.225
|
134 |
+
],
|
135 |
+
"resample": 2,
|
136 |
+
"rescale_factor": 0.00392156862745098,
|
137 |
+
"size": {
|
138 |
+
"longest_edge": 1333,
|
139 |
+
"shortest_edge": 800
|
140 |
+
}
|
141 |
+
},
|
142 |
+
"detection_model_config": {
|
143 |
+
"_name_or_path": "microsoft/conditional-detr-resnet-50",
|
144 |
+
"activation_dropout": 0.0,
|
145 |
+
"activation_function": "relu",
|
146 |
+
"add_cross_attention": false,
|
147 |
+
"architectures": [
|
148 |
+
"ConditionalDETRForObjectDetection"
|
149 |
+
],
|
150 |
+
"attention_dropout": 0.0,
|
151 |
+
"auxiliary_loss": false,
|
152 |
+
"backbone": "resnet50",
|
153 |
+
"backbone_config": null,
|
154 |
+
"bad_words_ids": null,
|
155 |
+
"bbox_cost": 5,
|
156 |
+
"bbox_loss_coefficient": 5,
|
157 |
+
"begin_suppress_tokens": null,
|
158 |
+
"bos_token_id": null,
|
159 |
+
"chunk_size_feed_forward": 0,
|
160 |
+
"class_cost": 2,
|
161 |
+
"cls_loss_coefficient": 2,
|
162 |
+
"cross_attention_hidden_size": null,
|
163 |
+
"d_model": 256,
|
164 |
+
"decoder_attention_heads": 8,
|
165 |
+
"decoder_ffn_dim": 2048,
|
166 |
+
"decoder_layerdrop": 0.0,
|
167 |
+
"decoder_layers": 6,
|
168 |
+
"decoder_start_token_id": null,
|
169 |
+
"dice_loss_coefficient": 1,
|
170 |
+
"dilation": false,
|
171 |
+
"diversity_penalty": 0.0,
|
172 |
+
"do_sample": false,
|
173 |
+
"dropout": 0.1,
|
174 |
+
"early_stopping": false,
|
175 |
+
"encoder_attention_heads": 8,
|
176 |
+
"encoder_ffn_dim": 2048,
|
177 |
+
"encoder_layerdrop": 0.0,
|
178 |
+
"encoder_layers": 6,
|
179 |
+
"encoder_no_repeat_ngram_size": 0,
|
180 |
+
"eos_token_id": null,
|
181 |
+
"exponential_decay_length_penalty": null,
|
182 |
+
"finetuning_task": null,
|
183 |
+
"focal_alpha": 0.25,
|
184 |
+
"forced_bos_token_id": null,
|
185 |
+
"forced_eos_token_id": null,
|
186 |
+
"giou_cost": 2,
|
187 |
+
"giou_loss_coefficient": 2,
|
188 |
+
"id2label": {
|
189 |
+
"0": "LABEL_0",
|
190 |
+
"1": "LABEL_1",
|
191 |
+
"2": "LABEL_2"
|
192 |
+
},
|
193 |
+
"init_std": 0.02,
|
194 |
+
"init_xavier_std": 1.0,
|
195 |
+
"is_decoder": false,
|
196 |
+
"is_encoder_decoder": true,
|
197 |
+
"label2id": {
|
198 |
+
"LABEL_0": 0,
|
199 |
+
"LABEL_1": 1,
|
200 |
+
"LABEL_2": 2
|
201 |
+
},
|
202 |
+
"length_penalty": 1.0,
|
203 |
+
"mask_loss_coefficient": 1,
|
204 |
+
"max_length": 20,
|
205 |
+
"max_position_embeddings": 1024,
|
206 |
+
"min_length": 0,
|
207 |
+
"model_type": "",
|
208 |
+
"no_repeat_ngram_size": 0,
|
209 |
+
"num_beam_groups": 1,
|
210 |
+
"num_beams": 1,
|
211 |
+
"num_channels": 3,
|
212 |
+
"num_hidden_layers": 6,
|
213 |
+
"num_queries": 305,
|
214 |
+
"num_return_sequences": 1,
|
215 |
+
"output_attentions": false,
|
216 |
+
"output_hidden_states": false,
|
217 |
+
"output_scores": false,
|
218 |
+
"pad_token_id": null,
|
219 |
+
"position_embedding_type": "sine",
|
220 |
+
"prefix": null,
|
221 |
+
"problem_type": null,
|
222 |
+
"pruned_heads": {},
|
223 |
+
"remove_invalid_values": false,
|
224 |
+
"repetition_penalty": 1.0,
|
225 |
+
"return_dict": true,
|
226 |
+
"return_dict_in_generate": false,
|
227 |
+
"scale_embedding": false,
|
228 |
+
"sep_token_id": null,
|
229 |
+
"suppress_tokens": null,
|
230 |
+
"task_specific_params": null,
|
231 |
+
"temperature": 1.0,
|
232 |
+
"tf_legacy_loss": false,
|
233 |
+
"tie_encoder_decoder": false,
|
234 |
+
"tie_word_embeddings": true,
|
235 |
+
"tokenizer_class": null,
|
236 |
+
"top_k": 50,
|
237 |
+
"top_p": 1.0,
|
238 |
+
"torch_dtype": "float32",
|
239 |
+
"torchscript": false,
|
240 |
+
"typical_p": 1.0,
|
241 |
+
"use_bfloat16": false,
|
242 |
+
"use_pretrained_backbone": true,
|
243 |
+
"use_timm_backbone": true
|
244 |
+
},
|
245 |
+
"disable_crop_embeddings": false,
|
246 |
+
"disable_detections": false,
|
247 |
+
"disable_ocr": false,
|
248 |
+
"model_type": "magi",
|
249 |
+
"ocr_model_config": {
|
250 |
+
"_name_or_path": "/work/rs/logs/manga_ocr/nt8rn2ul/",
|
251 |
+
"add_cross_attention": false,
|
252 |
+
"architectures": [
|
253 |
+
"VisionEncoderDecoderModel"
|
254 |
+
],
|
255 |
+
"bad_words_ids": null,
|
256 |
+
"begin_suppress_tokens": null,
|
257 |
+
"bos_token_id": null,
|
258 |
+
"chunk_size_feed_forward": 0,
|
259 |
+
"cross_attention_hidden_size": null,
|
260 |
+
"decoder": {
|
261 |
+
"_name_or_path": "",
|
262 |
+
"activation_dropout": 0.0,
|
263 |
+
"activation_function": "gelu",
|
264 |
+
"add_cross_attention": true,
|
265 |
+
"architectures": null,
|
266 |
+
"attention_dropout": 0.0,
|
267 |
+
"bad_words_ids": null,
|
268 |
+
"begin_suppress_tokens": null,
|
269 |
+
"bos_token_id": 0,
|
270 |
+
"chunk_size_feed_forward": 0,
|
271 |
+
"classifier_dropout": 0.0,
|
272 |
+
"cross_attention_hidden_size": 768,
|
273 |
+
"d_model": 1024,
|
274 |
+
"decoder_attention_heads": 16,
|
275 |
+
"decoder_ffn_dim": 4096,
|
276 |
+
"decoder_layerdrop": 0.0,
|
277 |
+
"decoder_layers": 12,
|
278 |
+
"decoder_start_token_id": 2,
|
279 |
+
"diversity_penalty": 0.0,
|
280 |
+
"do_sample": false,
|
281 |
+
"dropout": 0.1,
|
282 |
+
"early_stopping": false,
|
283 |
+
"encoder_no_repeat_ngram_size": 0,
|
284 |
+
"eos_token_id": 2,
|
285 |
+
"exponential_decay_length_penalty": null,
|
286 |
+
"finetuning_task": null,
|
287 |
+
"forced_bos_token_id": null,
|
288 |
+
"forced_eos_token_id": null,
|
289 |
+
"id2label": {
|
290 |
+
"0": "LABEL_0",
|
291 |
+
"1": "LABEL_1"
|
292 |
+
},
|
293 |
+
"init_std": 0.02,
|
294 |
+
"is_decoder": true,
|
295 |
+
"is_encoder_decoder": false,
|
296 |
+
"label2id": {
|
297 |
+
"LABEL_0": 0,
|
298 |
+
"LABEL_1": 1
|
299 |
+
},
|
300 |
+
"layernorm_embedding": true,
|
301 |
+
"length_penalty": 1.0,
|
302 |
+
"max_length": 20,
|
303 |
+
"max_position_embeddings": 512,
|
304 |
+
"min_length": 0,
|
305 |
+
"model_type": "trocr",
|
306 |
+
"no_repeat_ngram_size": 0,
|
307 |
+
"num_beam_groups": 1,
|
308 |
+
"num_beams": 1,
|
309 |
+
"num_return_sequences": 1,
|
310 |
+
"output_attentions": false,
|
311 |
+
"output_hidden_states": false,
|
312 |
+
"output_scores": false,
|
313 |
+
"pad_token_id": 1,
|
314 |
+
"prefix": null,
|
315 |
+
"problem_type": null,
|
316 |
+
"pruned_heads": {},
|
317 |
+
"remove_invalid_values": false,
|
318 |
+
"repetition_penalty": 1.0,
|
319 |
+
"return_dict": true,
|
320 |
+
"return_dict_in_generate": false,
|
321 |
+
"scale_embedding": false,
|
322 |
+
"sep_token_id": null,
|
323 |
+
"suppress_tokens": null,
|
324 |
+
"task_specific_params": null,
|
325 |
+
"temperature": 1.0,
|
326 |
+
"tf_legacy_loss": false,
|
327 |
+
"tie_encoder_decoder": false,
|
328 |
+
"tie_word_embeddings": true,
|
329 |
+
"tokenizer_class": null,
|
330 |
+
"top_k": 50,
|
331 |
+
"top_p": 1.0,
|
332 |
+
"torch_dtype": null,
|
333 |
+
"torchscript": false,
|
334 |
+
"typical_p": 1.0,
|
335 |
+
"use_bfloat16": false,
|
336 |
+
"use_cache": false,
|
337 |
+
"use_learned_position_embeddings": true,
|
338 |
+
"vocab_size": 50265
|
339 |
+
},
|
340 |
+
"decoder_start_token_id": 0,
|
341 |
+
"diversity_penalty": 0.0,
|
342 |
+
"do_sample": false,
|
343 |
+
"early_stopping": true,
|
344 |
+
"encoder": {
|
345 |
+
"_name_or_path": "",
|
346 |
+
"add_cross_attention": false,
|
347 |
+
"architectures": null,
|
348 |
+
"attention_probs_dropout_prob": 0.0,
|
349 |
+
"bad_words_ids": null,
|
350 |
+
"begin_suppress_tokens": null,
|
351 |
+
"bos_token_id": null,
|
352 |
+
"chunk_size_feed_forward": 0,
|
353 |
+
"cross_attention_hidden_size": null,
|
354 |
+
"decoder_start_token_id": null,
|
355 |
+
"diversity_penalty": 0.0,
|
356 |
+
"do_sample": false,
|
357 |
+
"early_stopping": false,
|
358 |
+
"encoder_no_repeat_ngram_size": 0,
|
359 |
+
"encoder_stride": 16,
|
360 |
+
"eos_token_id": null,
|
361 |
+
"exponential_decay_length_penalty": null,
|
362 |
+
"finetuning_task": null,
|
363 |
+
"forced_bos_token_id": null,
|
364 |
+
"forced_eos_token_id": null,
|
365 |
+
"hidden_act": "gelu",
|
366 |
+
"hidden_dropout_prob": 0.0,
|
367 |
+
"hidden_size": 768,
|
368 |
+
"id2label": {
|
369 |
+
"0": "LABEL_0",
|
370 |
+
"1": "LABEL_1"
|
371 |
+
},
|
372 |
+
"image_size": 384,
|
373 |
+
"initializer_range": 0.02,
|
374 |
+
"intermediate_size": 3072,
|
375 |
+
"is_decoder": false,
|
376 |
+
"is_encoder_decoder": false,
|
377 |
+
"label2id": {
|
378 |
+
"LABEL_0": 0,
|
379 |
+
"LABEL_1": 1
|
380 |
+
},
|
381 |
+
"layer_norm_eps": 1e-12,
|
382 |
+
"length_penalty": 1.0,
|
383 |
+
"max_length": 20,
|
384 |
+
"min_length": 0,
|
385 |
+
"model_type": "vit",
|
386 |
+
"no_repeat_ngram_size": 0,
|
387 |
+
"num_attention_heads": 12,
|
388 |
+
"num_beam_groups": 1,
|
389 |
+
"num_beams": 1,
|
390 |
+
"num_channels": 3,
|
391 |
+
"num_hidden_layers": 12,
|
392 |
+
"num_return_sequences": 1,
|
393 |
+
"output_attentions": false,
|
394 |
+
"output_hidden_states": false,
|
395 |
+
"output_scores": false,
|
396 |
+
"pad_token_id": null,
|
397 |
+
"patch_size": 16,
|
398 |
+
"prefix": null,
|
399 |
+
"problem_type": null,
|
400 |
+
"pruned_heads": {},
|
401 |
+
"qkv_bias": false,
|
402 |
+
"remove_invalid_values": false,
|
403 |
+
"repetition_penalty": 1.0,
|
404 |
+
"return_dict": true,
|
405 |
+
"return_dict_in_generate": false,
|
406 |
+
"sep_token_id": null,
|
407 |
+
"suppress_tokens": null,
|
408 |
+
"task_specific_params": null,
|
409 |
+
"temperature": 1.0,
|
410 |
+
"tf_legacy_loss": false,
|
411 |
+
"tie_encoder_decoder": false,
|
412 |
+
"tie_word_embeddings": true,
|
413 |
+
"tokenizer_class": null,
|
414 |
+
"top_k": 50,
|
415 |
+
"top_p": 1.0,
|
416 |
+
"torch_dtype": null,
|
417 |
+
"torchscript": false,
|
418 |
+
"typical_p": 1.0,
|
419 |
+
"use_bfloat16": false
|
420 |
+
},
|
421 |
+
"encoder_no_repeat_ngram_size": 0,
|
422 |
+
"eos_token_id": 2,
|
423 |
+
"exponential_decay_length_penalty": null,
|
424 |
+
"finetuning_task": null,
|
425 |
+
"forced_bos_token_id": null,
|
426 |
+
"forced_eos_token_id": null,
|
427 |
+
"id2label": {
|
428 |
+
"0": "LABEL_0",
|
429 |
+
"1": "LABEL_1"
|
430 |
+
},
|
431 |
+
"is_decoder": false,
|
432 |
+
"is_encoder_decoder": true,
|
433 |
+
"label2id": {
|
434 |
+
"LABEL_0": 0,
|
435 |
+
"LABEL_1": 1
|
436 |
+
},
|
437 |
+
"length_penalty": 2.0,
|
438 |
+
"max_length": 300,
|
439 |
+
"min_length": 0,
|
440 |
+
"model_type": "vision-encoder-decoder",
|
441 |
+
"no_repeat_ngram_size": 3,
|
442 |
+
"num_beam_groups": 1,
|
443 |
+
"num_beams": 4,
|
444 |
+
"num_return_sequences": 1,
|
445 |
+
"output_attentions": false,
|
446 |
+
"output_hidden_states": false,
|
447 |
+
"output_scores": false,
|
448 |
+
"pad_token_id": 1,
|
449 |
+
"prefix": null,
|
450 |
+
"problem_type": null,
|
451 |
+
"pruned_heads": {},
|
452 |
+
"remove_invalid_values": false,
|
453 |
+
"repetition_penalty": 1.0,
|
454 |
+
"return_dict": true,
|
455 |
+
"return_dict_in_generate": false,
|
456 |
+
"sep_token_id": null,
|
457 |
+
"suppress_tokens": null,
|
458 |
+
"task_specific_params": null,
|
459 |
+
"temperature": 1.0,
|
460 |
+
"tf_legacy_loss": false,
|
461 |
+
"tie_encoder_decoder": false,
|
462 |
+
"tie_word_embeddings": false,
|
463 |
+
"tokenizer_class": null,
|
464 |
+
"top_k": 50,
|
465 |
+
"top_p": 1.0,
|
466 |
+
"torch_dtype": "float32",
|
467 |
+
"torchscript": false,
|
468 |
+
"typical_p": 1.0,
|
469 |
+
"use_bfloat16": false,
|
470 |
+
"vocab_size": 50265
|
471 |
+
},
|
472 |
+
"ocr_pretrained_processor_path": "microsoft/trocr-base-printed",
|
473 |
+
"torch_dtype": "float32",
|
474 |
+
"transformers_version": "4.34.0.dev0"
|
475 |
+
}
|
configuration_magi.py
ADDED
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from transformers import PretrainedConfig, VisionEncoderDecoderConfig
|
2 |
+
from typing import List
|
3 |
+
|
4 |
+
|
5 |
+
class MagiConfig(PretrainedConfig):
    """Configuration for the Magi model.

    Bundles the three sub-model configurations (detection transformer,
    OCR encoder-decoder, crop-embedding ViTMAE) together with their
    image-preprocessing settings, plus flags that disable each sub-model
    independently.
    """
    model_type = "magi"

    def __init__(
        self,
        disable_ocr: bool = False,
        disable_crop_embeddings: bool = False,
        disable_detections: bool = False,
        detection_model_config: dict = None,
        ocr_model_config: dict = None,
        crop_embedding_model_config: dict = None,
        detection_image_preprocessing_config: dict = None,
        ocr_pretrained_processor_path: str = None,
        crop_embedding_image_preprocessing_config: dict = None,
        **kwargs,
    ):
        self.disable_ocr = disable_ocr
        self.disable_crop_embeddings = disable_crop_embeddings
        self.disable_detections = disable_detections

        # Sub-model configs arrive as plain dicts (e.g. deserialized from
        # config.json) and are re-hydrated into config objects here.
        # A missing config stays None.
        self.detection_model_config = (
            PretrainedConfig.from_dict(detection_model_config)
            if detection_model_config is not None
            else None
        )
        self.ocr_model_config = (
            VisionEncoderDecoderConfig.from_dict(ocr_model_config)
            if ocr_model_config is not None
            else None
        )
        self.crop_embedding_model_config = (
            PretrainedConfig.from_dict(crop_embedding_model_config)
            if crop_embedding_model_config is not None
            else None
        )

        # Preprocessing configs are kept as plain dicts; the processor
        # consumes them directly.
        self.detection_image_preprocessing_config = detection_image_preprocessing_config
        self.ocr_pretrained_processor_path = ocr_pretrained_processor_path
        self.crop_embedding_image_preprocessing_config = crop_embedding_image_preprocessing_config
        super().__init__(**kwargs)
|
modelling_magi.py
ADDED
@@ -0,0 +1,486 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from transformers import PreTrainedModel, VisionEncoderDecoderModel, ViTMAEModel, ConditionalDetrModel
|
2 |
+
from transformers.models.conditional_detr.modeling_conditional_detr import (
|
3 |
+
ConditionalDetrMLPPredictionHead,
|
4 |
+
ConditionalDetrModelOutput,
|
5 |
+
ConditionalDetrHungarianMatcher,
|
6 |
+
inverse_sigmoid,
|
7 |
+
)
|
8 |
+
from .configuration_magi import MagiConfig
|
9 |
+
from .processing_magi import MagiProcessor
|
10 |
+
from torch import nn
|
11 |
+
from typing import Optional, List
|
12 |
+
import torch
|
13 |
+
from einops import rearrange, repeat, einsum
|
14 |
+
from .utils import move_to_device, visualise_single_image_prediction, sort_panels, sort_text_boxes_in_reading_order
|
15 |
+
|
16 |
+
class MagiModel(PreTrainedModel):
|
17 |
+
config_class = MagiConfig
|
18 |
+
|
19 |
+
    def __init__(self, config):
        """Build the Magi model from a MagiConfig.

        Each sub-model (OCR, crop embeddings, detection) is only
        instantiated when its `disable_*` flag in the config is False,
        so a partially-disabled model has no attributes for the
        disabled parts.
        """
        super().__init__(config)
        self.config = config
        self.processor = MagiProcessor(config)
        if not config.disable_ocr:
            self.ocr_model = VisionEncoderDecoderModel(config.ocr_model_config)
        if not config.disable_crop_embeddings:
            self.crop_embedding_model = ViTMAEModel(config.crop_embedding_model_config)
        if not config.disable_detections:
            # Number of query tokens that are not object queries; consumed by
            # the _get_predicted_* helpers (not visible here) — presumably the
            # c2c/t2c matching tokens. TODO(review): confirm against those helpers.
            self.num_non_obj_tokens = 5
            self.detection_transformer = ConditionalDetrModel(config.detection_model_config)
            # Regresses a 4-dim box per query token.
            self.bbox_predictor = ConditionalDetrMLPPredictionHead(
                input_dim=config.detection_model_config.d_model,
                hidden_dim=config.detection_model_config.d_model,
                output_dim=4, num_layers=3
            )
            # Binary head: is a given text box a dialogue?
            self.is_this_text_a_dialogue = ConditionalDetrMLPPredictionHead(
                input_dim=config.detection_model_config.d_model,
                hidden_dim=config.detection_model_config.d_model,
                output_dim=1,
                num_layers=3
            )
            # Input is two object tokens plus a c2c token (3 * d_model),
            # optionally concatenated with the two characters' crop embeddings
            # when crop embeddings are enabled.
            self.character_character_matching_head = ConditionalDetrMLPPredictionHead(
                input_dim = 3 * config.detection_model_config.d_model + (2 * config.crop_embedding_model_config.hidden_size if not config.disable_crop_embeddings else 0),
                hidden_dim=config.detection_model_config.d_model,
                output_dim=1, num_layers=3
            )
            # Input is a text token, a character token, and a t2c token.
            self.text_character_matching_head = ConditionalDetrMLPPredictionHead(
                input_dim = 3 * config.detection_model_config.d_model,
                hidden_dim=config.detection_model_config.d_model,
                output_dim=1, num_layers=3
            )
            self.class_labels_classifier = nn.Linear(
                config.detection_model_config.d_model, config.detection_model_config.num_labels
            )
            # Hungarian matcher used for training-time assignment of
            # predictions to targets.
            self.matcher = ConditionalDetrHungarianMatcher(
                class_cost=config.detection_model_config.class_cost,
                bbox_cost=config.detection_model_config.bbox_cost,
                giou_cost=config.detection_model_config.giou_cost
            )
|
59 |
+
|
60 |
+
    def move_to_device(self, input):
        # Thin wrapper around utils.move_to_device, bound to this model's device.
        return move_to_device(input, self.device)
|
62 |
+
|
63 |
+
    def predict_detections_and_associations(
        self,
        images,
        move_to_device_fn=None,
        character_detection_threshold=0.3,
        panel_detection_threshold=0.2,
        text_detection_threshold=0.25,
        character_character_matching_threshold=0.7,
        text_character_matching_threshold=0.4,
    ):
        """Detect panels, characters and text boxes and associate them.

        Runs the detection transformer once over `images`, then delegates to
        the processor's postprocessing, handing it three lazy callbacks so the
        matching heads are only evaluated on boxes that survive thresholding.

        Args:
            images: list of images as numpy arrays (`img.shape[:2]` is taken
                as the original (height, width)).
            move_to_device_fn: optional callable that moves tensors to the
                target device; defaults to this model's device.
            *_threshold: score cutoffs forwarded to postprocessing.

        Returns:
            Whatever `processor.postprocess_detections_and_associations`
            produces (per-image detections and associations).
        """
        assert not self.config.disable_detections
        move_to_device_fn = self.move_to_device if move_to_device_fn is None else move_to_device_fn

        inputs_to_detection_transformer = self.processor.preprocess_inputs_for_detection(images)
        inputs_to_detection_transformer = move_to_device_fn(inputs_to_detection_transformer)

        # Single forward pass; the closures below all reuse this output.
        detection_transformer_output = self._get_detection_transformer_output(**inputs_to_detection_transformer)
        predicted_class_scores, predicted_bboxes = self._get_predicted_bboxes_and_classes(detection_transformer_output)

        # create callback fn: character<->character affinity, computed from
        # object tokens, c2c tokens and (if enabled) crop embeddings.
        def get_character_character_matching_scores(batch_character_indices, batch_bboxes):
            predicted_obj_tokens_for_batch = self._get_predicted_obj_tokens(detection_transformer_output)
            predicted_c2c_tokens_for_batch = self._get_predicted_c2c_tokens(detection_transformer_output)
            # Crop each image at the selected character boxes and embed them.
            crop_bboxes = [batch_bboxes[i][batch_character_indices[i]] for i in range(len(batch_character_indices))]
            crop_embeddings_for_batch = self.predict_crop_embeddings(images, crop_bboxes, move_to_device_fn)
            character_obj_tokens_for_batch = []
            c2c_tokens_for_batch = []
            for predicted_obj_tokens, predicted_c2c_tokens, character_indices in zip(predicted_obj_tokens_for_batch, predicted_c2c_tokens_for_batch, batch_character_indices):
                character_obj_tokens_for_batch.append(predicted_obj_tokens[character_indices])
                c2c_tokens_for_batch.append(predicted_c2c_tokens)
            return self._get_character_character_affinity_matrices(
                character_obj_tokens_for_batch=character_obj_tokens_for_batch,
                crop_embeddings_for_batch=crop_embeddings_for_batch,
                c2c_tokens_for_batch=c2c_tokens_for_batch,
                apply_sigmoid=True,
            )

        # create callback fn: text<->character affinity from object tokens
        # and t2c tokens.
        def get_text_character_matching_scores(batch_text_indices, batch_character_indices):
            predicted_obj_tokens_for_batch = self._get_predicted_obj_tokens(detection_transformer_output)
            predicted_t2c_tokens_for_batch = self._get_predicted_t2c_tokens(detection_transformer_output)
            text_obj_tokens_for_batch = []
            character_obj_tokens_for_batch = []
            t2c_tokens_for_batch = []
            for predicted_obj_tokens, predicted_t2c_tokens, text_indices, character_indices in zip(predicted_obj_tokens_for_batch, predicted_t2c_tokens_for_batch, batch_text_indices, batch_character_indices):
                text_obj_tokens_for_batch.append(predicted_obj_tokens[text_indices])
                character_obj_tokens_for_batch.append(predicted_obj_tokens[character_indices])
                t2c_tokens_for_batch.append(predicted_t2c_tokens)
            return self._get_text_character_affinity_matrices(
                character_obj_tokens_for_batch=character_obj_tokens_for_batch,
                text_obj_tokens_for_this_batch=text_obj_tokens_for_batch,
                t2c_tokens_for_batch=t2c_tokens_for_batch,
                apply_sigmoid=True,
            )

        # create callback fn: per-text-box confidence that the text is dialogue.
        def get_dialog_confidence_scores(batch_text_indices):
            predicted_obj_tokens_for_batch = self._get_predicted_obj_tokens(detection_transformer_output)
            dialog_confidence = []
            for predicted_obj_tokens, text_indices in zip(predicted_obj_tokens_for_batch, batch_text_indices):
                confidence = self.is_this_text_a_dialogue(predicted_obj_tokens[text_indices]).sigmoid()
                # Squeeze the trailing singleton dim: (i, 1) -> (i,)
                dialog_confidence.append(rearrange(confidence, "i 1 -> i"))
            return dialog_confidence

        return self.processor.postprocess_detections_and_associations(
            predicted_bboxes=predicted_bboxes,
            predicted_class_scores=predicted_class_scores,
            original_image_sizes=torch.stack([torch.tensor(img.shape[:2]) for img in images], dim=0).to(predicted_bboxes.device),
            get_character_character_matching_scores=get_character_character_matching_scores,
            get_text_character_matching_scores=get_text_character_matching_scores,
            get_dialog_confidence_scores=get_dialog_confidence_scores,
            character_detection_threshold=character_detection_threshold,
            panel_detection_threshold=panel_detection_threshold,
            text_detection_threshold=text_detection_threshold,
            character_character_matching_threshold=character_character_matching_threshold,
            text_character_matching_threshold=text_character_matching_threshold,
        )
|
140 |
+
|
141 |
+
def predict_crop_embeddings(self, images, crop_bboxes, move_to_device_fn=None, mask_ratio=0.0, batch_size=256):
|
142 |
+
if self.config.disable_crop_embeddings:
|
143 |
+
return None
|
144 |
+
|
145 |
+
assert isinstance(crop_bboxes, List), "please provide a list of bboxes for each image to get embeddings for"
|
146 |
+
|
147 |
+
move_to_device_fn = self.move_to_device if move_to_device_fn is None else move_to_device_fn
|
148 |
+
|
149 |
+
# temporarily change the mask ratio from default to the one specified
|
150 |
+
old_mask_ratio = self.crop_embedding_model.embeddings.config.mask_ratio
|
151 |
+
self.crop_embedding_model.embeddings.config.mask_ratio = mask_ratio
|
152 |
+
|
153 |
+
crops_per_image = []
|
154 |
+
num_crops_per_batch = [len(bboxes) for bboxes in crop_bboxes]
|
155 |
+
for image, bboxes, num_crops in zip(images, crop_bboxes, num_crops_per_batch):
|
156 |
+
crops = self.processor.crop_image(image, bboxes)
|
157 |
+
assert len(crops) == num_crops
|
158 |
+
crops_per_image.extend(crops)
|
159 |
+
|
160 |
+
if len(crops_per_image) == 0:
|
161 |
+
return [[] for _ in crop_bboxes]
|
162 |
+
|
163 |
+
crops_per_image = self.processor.preprocess_inputs_for_crop_embeddings(crops_per_image)
|
164 |
+
crops_per_image = move_to_device_fn(crops_per_image)
|
165 |
+
|
166 |
+
# process the crops in batches to avoid OOM
|
167 |
+
embeddings = []
|
168 |
+
for i in range(0, len(crops_per_image), batch_size):
|
169 |
+
crops = crops_per_image[i:i+batch_size]
|
170 |
+
embeddings_per_batch = self.crop_embedding_model(crops).last_hidden_state[:, 0]
|
171 |
+
embeddings.append(embeddings_per_batch)
|
172 |
+
embeddings = torch.cat(embeddings, dim=0)
|
173 |
+
|
174 |
+
crop_embeddings_for_batch = []
|
175 |
+
for num_crops in num_crops_per_batch:
|
176 |
+
crop_embeddings_for_batch.append(embeddings[:num_crops])
|
177 |
+
embeddings = embeddings[num_crops:]
|
178 |
+
|
179 |
+
# restore the mask ratio to the default
|
180 |
+
self.crop_embedding_model.embeddings.config.mask_ratio = old_mask_ratio
|
181 |
+
|
182 |
+
return crop_embeddings_for_batch
|
183 |
+
|
184 |
+
def predict_ocr(self, images, crop_bboxes, move_to_device_fn=None, use_tqdm=False, batch_size=32):
    """Run OCR over the given text boxes; returns one list of strings per image."""
    assert not self.config.disable_ocr
    if move_to_device_fn is None:
        move_to_device_fn = self.move_to_device

    # flatten every crop from every image into a single list
    counts = [len(bboxes) for bboxes in crop_bboxes]
    flat_crops = []
    for image, bboxes, expected_count in zip(images, crop_bboxes, counts):
        image_crops = self.processor.crop_image(image, bboxes)
        assert len(image_crops) == expected_count
        flat_crops.extend(image_crops)

    if not flat_crops:
        return [[] for _ in crop_bboxes]

    flat_crops = self.processor.preprocess_inputs_for_ocr(flat_crops)
    flat_crops = move_to_device_fn(flat_crops)

    # process the crops in batches to avoid OOM
    batch_starts = range(0, len(flat_crops), batch_size)
    if use_tqdm:
        from tqdm import tqdm
        batch_starts = tqdm(batch_starts)
    all_generated_texts = []
    for start in batch_starts:
        token_ids = self.ocr_model.generate(flat_crops[start:start + batch_size])
        all_generated_texts.extend(self.processor.postprocess_ocr_tokens(token_ids))

    # split the flat text list back into per-image lists, stripping newlines
    texts_for_images = []
    cursor = 0
    for count in counts:
        chunk = all_generated_texts[cursor:cursor + count]
        texts_for_images.append([text.replace("\n", "") for text in chunk])
        cursor += count

    return texts_for_images
def visualise_single_image_prediction(
    self, image_as_np_array, predictions, filename=None
):
    """Delegate to the module-level visualisation helper of the same name."""
    return visualise_single_image_prediction(
        image_as_np_array, predictions, filename
    )
def generate_transcript_for_single_image(
    self, predictions, ocr_results, filename=None
):
    """Build a "<speaker>: text" transcript; optionally write it to *filename*.

    Texts with no matched character are attributed to the unknown
    speaker "<?>".
    """
    character_clusters = predictions["character_cluster_labels"]
    # pairs of (text index, character index) -> lookup dict
    text_to_character = dict(predictions["text_character_associations"])

    pieces = [" ### Transcript ###\n"]
    for index, text in enumerate(ocr_results):
        if index in text_to_character:
            speaker = f"<{character_clusters[text_to_character[index]]}>"
        else:
            speaker = "<?>"
        pieces.append(f"{speaker}: {text}\n")
    transript = "".join(pieces)

    if filename is not None:
        with open(filename, "w") as file:
            file.write(transript)
    return transript
def get_text_character_affinity_matrices_given_annotations(
    self, images, annotations, move_to_device_fn=None, apply_sigmoid=True
):
    """Compute text-to-character affinity matrices for ground-truth boxes.

    The annotated boxes are Hungarian-matched (via ``self.matcher``) to the
    detection transformer's object queries, and the matched object tokens —
    rather than thresholded predictions — are scored by the text-character
    matching head.

    Args:
        images: list of images as numpy arrays.
        annotations: per-image dicts containing "bboxes_as_x1y1x2y2" and
            class labels (0 = character, 1 = text), in the format expected
            by the processor.
        move_to_device_fn: callable moving tensors to the model's device;
            defaults to ``self.move_to_device``.
        apply_sigmoid: if True, affinities are returned as probabilities.

    Returns:
        dict with per-image (text x character) affinity matrices and the
        annotated text/character boxes in the order used by the matrices.
    """
    assert not self.config.disable_detections
    move_to_device_fn = self.move_to_device if move_to_device_fn is None else move_to_device_fn

    inputs_to_detection_transformer = self.processor.preprocess_inputs_for_detection(images, annotations)
    inputs_to_detection_transformer = move_to_device_fn(inputs_to_detection_transformer)
    # the preprocessor packs the annotations into "labels"; pop them so the
    # remaining keys can be fed directly to the transformer
    processed_targets = inputs_to_detection_transformer.pop("labels")

    detection_transformer_output = self._get_detection_transformer_output(**inputs_to_detection_transformer)
    predicted_obj_tokens_for_batch = self._get_predicted_obj_tokens(detection_transformer_output)
    predicted_t2c_tokens_for_batch = self._get_predicted_t2c_tokens(detection_transformer_output)

    predicted_class_scores, predicted_bboxes = self._get_predicted_bboxes_and_classes(detection_transformer_output)
    matching_dict = {
        "logits": predicted_class_scores,
        "pred_boxes": predicted_bboxes,
    }
    # Hungarian matching between predicted queries and annotated boxes
    indices = self.matcher(matching_dict, processed_targets)

    matched_char_obj_tokens_for_batch = []
    matched_text_obj_tokens_for_batch = []
    t2c_tokens_for_batch = []

    text_bboxes_for_batch = []
    character_bboxes_for_batch = []

    for j, (pred_idx, tgt_idx) in enumerate(indices):
        target_idx_to_pred_idx = {tgt.item(): pred.item() for pred, tgt in zip(pred_idx, tgt_idx)}
        targets_for_this_image = processed_targets[j]
        # class label 1 = text, 0 = character
        indices_of_text_boxes_in_annotation = [i for i, label in enumerate(targets_for_this_image["class_labels"]) if label == 1]
        indices_of_char_boxes_in_annotation = [i for i, label in enumerate(targets_for_this_image["class_labels"]) if label == 0]
        predicted_text_indices = [target_idx_to_pred_idx[i] for i in indices_of_text_boxes_in_annotation]
        predicted_char_indices = [target_idx_to_pred_idx[i] for i in indices_of_char_boxes_in_annotation]

        # keep the annotated boxes in the same order as the matched tokens
        text_bboxes_for_batch.append(
            [annotations[j]["bboxes_as_x1y1x2y2"][k] for k in indices_of_text_boxes_in_annotation]
        )
        character_bboxes_for_batch.append(
            [annotations[j]["bboxes_as_x1y1x2y2"][k] for k in indices_of_char_boxes_in_annotation]
        )

        matched_char_obj_tokens_for_batch.append(predicted_obj_tokens_for_batch[j][predicted_char_indices])
        matched_text_obj_tokens_for_batch.append(predicted_obj_tokens_for_batch[j][predicted_text_indices])
        t2c_tokens_for_batch.append(predicted_t2c_tokens_for_batch[j])

    text_character_affinity_matrices = self._get_text_character_affinity_matrices(
        character_obj_tokens_for_batch=matched_char_obj_tokens_for_batch,
        text_obj_tokens_for_this_batch=matched_text_obj_tokens_for_batch,
        t2c_tokens_for_batch=t2c_tokens_for_batch,
        apply_sigmoid=apply_sigmoid,
    )

    return {
        "text_character_affinity_matrices": text_character_affinity_matrices,
        "text_bboxes_for_batch": text_bboxes_for_batch,
        "character_bboxes_for_batch": character_bboxes_for_batch,
    }
def get_obj_embeddings_corresponding_to_given_annotations(
    self, images, annotations, move_to_device_fn=None
):
    """Return the detection-transformer tokens matched to annotated boxes.

    Like ``get_text_character_affinity_matrices_given_annotations``, the
    annotations are Hungarian-matched to the object queries, but instead
    of computing affinities this returns the raw matched tokens, split by
    class, along with the special t2c/c2c tokens.

    Args:
        images: list of images as numpy arrays.
        annotations: per-image dicts with "bboxes_as_x1y1x2y2" and class
            labels (0 = character, 1 = text, 2 = panel).
        move_to_device_fn: callable moving tensors to the model's device;
            defaults to ``self.move_to_device``.

    Returns:
        dict with per-image token tensors under keys "character", "text",
        "panel", "t2c" and "c2c".
    """
    assert not self.config.disable_detections
    move_to_device_fn = self.move_to_device if move_to_device_fn is None else move_to_device_fn

    inputs_to_detection_transformer = self.processor.preprocess_inputs_for_detection(images, annotations)
    inputs_to_detection_transformer = move_to_device_fn(inputs_to_detection_transformer)
    processed_targets = inputs_to_detection_transformer.pop("labels")

    detection_transformer_output = self._get_detection_transformer_output(**inputs_to_detection_transformer)
    predicted_obj_tokens_for_batch = self._get_predicted_obj_tokens(detection_transformer_output)
    predicted_t2c_tokens_for_batch = self._get_predicted_t2c_tokens(detection_transformer_output)
    predicted_c2c_tokens_for_batch = self._get_predicted_c2c_tokens(detection_transformer_output)

    predicted_class_scores, predicted_bboxes = self._get_predicted_bboxes_and_classes(detection_transformer_output)
    matching_dict = {
        "logits": predicted_class_scores,
        "pred_boxes": predicted_bboxes,
    }
    # Hungarian matching between predicted queries and annotated boxes
    indices = self.matcher(matching_dict, processed_targets)

    matched_char_obj_tokens_for_batch = []
    matched_text_obj_tokens_for_batch = []
    matched_panel_obj_tokens_for_batch = []
    t2c_tokens_for_batch = []
    c2c_tokens_for_batch = []

    for j, (pred_idx, tgt_idx) in enumerate(indices):
        target_idx_to_pred_idx = {tgt.item(): pred.item() for pred, tgt in zip(pred_idx, tgt_idx)}
        targets_for_this_image = processed_targets[j]
        # class label 0 = character, 1 = text, 2 = panel
        indices_of_char_boxes_in_annotation = [i for i, label in enumerate(targets_for_this_image["class_labels"]) if label == 0]
        indices_of_text_boxes_in_annotation = [i for i, label in enumerate(targets_for_this_image["class_labels"]) if label == 1]
        indices_of_panel_boxes_in_annotation = [i for i, label in enumerate(targets_for_this_image["class_labels"]) if label == 2]
        predicted_text_indices = [target_idx_to_pred_idx[i] for i in indices_of_text_boxes_in_annotation]
        predicted_char_indices = [target_idx_to_pred_idx[i] for i in indices_of_char_boxes_in_annotation]
        predicted_panel_indices = [target_idx_to_pred_idx[i] for i in indices_of_panel_boxes_in_annotation]

        matched_char_obj_tokens_for_batch.append(predicted_obj_tokens_for_batch[j][predicted_char_indices])
        matched_text_obj_tokens_for_batch.append(predicted_obj_tokens_for_batch[j][predicted_text_indices])
        matched_panel_obj_tokens_for_batch.append(predicted_obj_tokens_for_batch[j][predicted_panel_indices])
        t2c_tokens_for_batch.append(predicted_t2c_tokens_for_batch[j])
        c2c_tokens_for_batch.append(predicted_c2c_tokens_for_batch[j])

    return {
        "character": matched_char_obj_tokens_for_batch,
        "text": matched_text_obj_tokens_for_batch,
        "panel": matched_panel_obj_tokens_for_batch,
        "t2c": t2c_tokens_for_batch,
        "c2c": c2c_tokens_for_batch,
    }
def sort_panels_and_text_bboxes_in_reading_order(
    self,
    batch_panel_bboxes,
    batch_text_bboxes,
):
    """Compute, per image, the reading-order permutation of panels and texts.

    Returns two lists (one entry per image): the sorted panel indices and
    the sorted text-box indices.
    """
    batch_sorted_panel_indices = []
    batch_sorted_text_indices = []
    for panel_bboxes, text_bboxes in zip(batch_panel_bboxes, batch_text_bboxes):
        # panels first; texts are then ordered within the sorted panels
        panel_order = sort_panels(panel_bboxes)
        ordered_panels = [panel_bboxes[i] for i in panel_order]
        text_order = sort_text_boxes_in_reading_order(text_bboxes, ordered_panels)
        batch_sorted_panel_indices.append(panel_order)
        batch_sorted_text_indices.append(text_order)
    return batch_sorted_panel_indices, batch_sorted_text_indices
def _get_detection_transformer_output(
    self,
    pixel_values: torch.FloatTensor,
    pixel_mask: Optional[torch.LongTensor] = None
):
    """Forward the images through the conditional-DETR detection transformer."""
    if self.config.disable_detections:
        raise ValueError("Detection model is disabled. Set disable_detections=False in the config.")
    return self.detection_transformer(
        pixel_values=pixel_values, pixel_mask=pixel_mask, return_dict=True
    )
def _get_predicted_obj_tokens(
    self,
    detection_transformer_output: ConditionalDetrModelOutput
):
    """Return the per-query object tokens (everything before the trailing non-object tokens)."""
    num_special = self.num_non_obj_tokens
    return detection_transformer_output.last_hidden_state[:, :-num_special]
def _get_predicted_c2c_tokens(
    self,
    detection_transformer_output: ConditionalDetrModelOutput
):
    """Return the character-character matching token (first of the trailing non-object tokens)."""
    hidden = detection_transformer_output.last_hidden_state
    return hidden[:, -self.num_non_obj_tokens]
def _get_predicted_t2c_tokens(
    self,
    detection_transformer_output: ConditionalDetrModelOutput
):
    """Return the text-character matching token (second of the trailing non-object tokens)."""
    hidden = detection_transformer_output.last_hidden_state
    return hidden[:, -self.num_non_obj_tokens + 1]
def _get_predicted_bboxes_and_classes(
    self,
    detection_transformer_output: ConditionalDetrModelOutput,
):
    """Decode class logits and normalized (cx, cy, w, h) boxes from the transformer output.

    Returns:
        (predicted_class_scores, predicted_boxes) — per-query class logits
        and boxes in [0, 1] after the sigmoid below.

    Raises:
        ValueError: if detections are disabled in the config.
    """
    if self.config.disable_detections:
        raise ValueError("Detection model is disabled. Set disable_detections=False in the config.")

    obj = self._get_predicted_obj_tokens(detection_transformer_output)

    predicted_class_scores = self.class_labels_classifier(obj)
    # Conditional-DETR style decoding: the bbox head predicts an offset for
    # the centre coordinates relative to the inverse-sigmoid reference points.
    # NOTE(review): the dim-0 slice plus the transpose below suggests
    # reference_points is (num_queries, batch, 2) — confirm upstream.
    reference = detection_transformer_output.reference_points[:-self.num_non_obj_tokens]
    reference_before_sigmoid = inverse_sigmoid(reference).transpose(0, 1)
    predicted_boxes = self.bbox_predictor(obj)
    predicted_boxes[..., :2] += reference_before_sigmoid
    predicted_boxes = predicted_boxes.sigmoid()

    return predicted_class_scores, predicted_boxes
def _get_character_character_affinity_matrices(
|
424 |
+
self,
|
425 |
+
character_obj_tokens_for_batch: List[torch.FloatTensor] = None,
|
426 |
+
crop_embeddings_for_batch: List[torch.FloatTensor] = None,
|
427 |
+
c2c_tokens_for_batch: List[torch.FloatTensor] = None,
|
428 |
+
apply_sigmoid=True,
|
429 |
+
):
|
430 |
+
assert self.config.disable_detections or (character_obj_tokens_for_batch is not None and c2c_tokens_for_batch is not None)
|
431 |
+
assert self.config.disable_crop_embeddings or crop_embeddings_for_batch is not None
|
432 |
+
assert not self.config.disable_detections or not self.config.disable_crop_embeddings
|
433 |
+
|
434 |
+
if self.config.disable_detections:
|
435 |
+
affinity_matrices = []
|
436 |
+
for crop_embeddings in crop_embeddings_for_batch:
|
437 |
+
crop_embeddings = crop_embeddings / crop_embeddings.norm(dim=-1, keepdim=True)
|
438 |
+
affinity_matrix = einsum("i d, j d -> i j", affinity_matrix)
|
439 |
+
affinity_matrices.append(affinity_matrix)
|
440 |
+
return affinity_matrices
|
441 |
+
affinity_matrices = []
|
442 |
+
for batch_index, (character_obj_tokens, c2c) in enumerate(zip(character_obj_tokens_for_batch, c2c_tokens_for_batch)):
|
443 |
+
if character_obj_tokens.shape[0] == 0:
|
444 |
+
affinity_matrices.append(torch.zeros(0, 0).type_as(character_obj_tokens))
|
445 |
+
continue
|
446 |
+
if not self.config.disable_crop_embeddings:
|
447 |
+
crop_embeddings = crop_embeddings_for_batch[batch_index]
|
448 |
+
assert character_obj_tokens.shape[0] == crop_embeddings.shape[0]
|
449 |
+
character_obj_tokens = torch.cat([character_obj_tokens, crop_embeddings], dim=-1)
|
450 |
+
char_i = repeat(character_obj_tokens, "i d -> i repeat d", repeat=character_obj_tokens.shape[0])
|
451 |
+
char_j = repeat(character_obj_tokens, "j d -> repeat j d", repeat=character_obj_tokens.shape[0])
|
452 |
+
char_ij = rearrange([char_i, char_j], "two i j d -> (i j) (two d)")
|
453 |
+
c2c = repeat(c2c, "d -> repeat d", repeat = char_ij.shape[0])
|
454 |
+
char_ij_c2c = torch.cat([char_ij, c2c], dim=-1)
|
455 |
+
character_character_affinities = self.character_character_matching_head(char_ij_c2c)
|
456 |
+
character_character_affinities = rearrange(character_character_affinities, "(i j) 1 -> i j", i=char_i.shape[0])
|
457 |
+
if apply_sigmoid:
|
458 |
+
character_character_affinities = character_character_affinities.sigmoid()
|
459 |
+
affinity_matrices.append(character_character_affinities)
|
460 |
+
return affinity_matrices
|
461 |
+
|
462 |
+
def _get_text_character_affinity_matrices(
    self,
    character_obj_tokens_for_batch: List[torch.FloatTensor] = None,
    text_obj_tokens_for_this_batch: List[torch.FloatTensor] = None,
    t2c_tokens_for_batch: List[torch.FloatTensor] = None,
    apply_sigmoid=True,
):
    """Score every (text, character) pair per image with the t2c matching head.

    Args:
        character_obj_tokens_for_batch: per-image character object tokens.
        text_obj_tokens_for_this_batch: per-image text object tokens.
        t2c_tokens_for_batch: per-image text-character special token.
        apply_sigmoid: if True, return probabilities instead of logits.

    Returns:
        list (one per image) of (num_texts x num_characters) affinity
        matrices; empty-shaped images yield a zero-size matrix.
    """
    assert not self.config.disable_detections
    assert character_obj_tokens_for_batch is not None and text_obj_tokens_for_this_batch is not None and t2c_tokens_for_batch is not None
    affinity_matrices = []
    for character_obj_tokens, text_obj_tokens, t2c in zip(character_obj_tokens_for_batch, text_obj_tokens_for_this_batch, t2c_tokens_for_batch):
        if character_obj_tokens.shape[0] == 0 or text_obj_tokens.shape[0] == 0:
            # nothing to match for this image
            affinity_matrices.append(torch.zeros(text_obj_tokens.shape[0], character_obj_tokens.shape[0]).type_as(character_obj_tokens))
            continue
        # build all (text i, character j) pairs and append the t2c token
        text_i = repeat(text_obj_tokens, "i d -> i repeat d", repeat=character_obj_tokens.shape[0])
        char_j = repeat(character_obj_tokens, "j d -> repeat j d", repeat=text_obj_tokens.shape[0])
        text_char = rearrange([text_i, char_j], "two i j d -> (i j) (two d)")
        t2c = repeat(t2c, "d -> repeat d", repeat = text_char.shape[0])
        text_char_t2c = torch.cat([text_char, t2c], dim=-1)
        text_character_affinities = self.text_character_matching_head(text_char_t2c)
        text_character_affinities = rearrange(text_character_affinities, "(i j) 1 -> i j", i=text_i.shape[0])
        if apply_sigmoid:
            text_character_affinities = text_character_affinities.sigmoid()
        affinity_matrices.append(text_character_affinities)
    return affinity_matrices
processing_magi.py
ADDED
@@ -0,0 +1,274 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from transformers import ConditionalDetrImageProcessor, TrOCRProcessor, ViTImageProcessor
|
2 |
+
from transformers.image_transforms import center_to_corners_format
|
3 |
+
import torch
|
4 |
+
from typing import List
|
5 |
+
from shapely.geometry import box
|
6 |
+
from .utils import UnionFind, sort_panels, sort_text_boxes_in_reading_order, x1y1x2y2_to_xywh
|
7 |
+
import numpy as np
|
8 |
+
|
9 |
+
class MagiProcessor():
    """Pre- and post-processing companion for the Magi model.

    Wraps three HuggingFace processors (detection, OCR, crop embeddings),
    each of which is only instantiated when the corresponding component is
    enabled in the config.
    """

    def __init__(self, config):
        self.config = config
        # each sub-processor stays None when its component is disabled
        self.detection_image_preprocessor = None
        self.ocr_preprocessor = None
        self.crop_embedding_image_preprocessor = None
        if not config.disable_detections:
            assert config.detection_image_preprocessing_config is not None
            self.detection_image_preprocessor = ConditionalDetrImageProcessor.from_dict(config.detection_image_preprocessing_config)
        if not config.disable_ocr:
            assert config.ocr_pretrained_processor_path is not None
            self.ocr_preprocessor = TrOCRProcessor.from_pretrained(config.ocr_pretrained_processor_path)
        if not config.disable_crop_embeddings:
            assert config.crop_embedding_image_preprocessing_config is not None
            self.crop_embedding_image_preprocessor = ViTImageProcessor.from_dict(config.crop_embedding_image_preprocessing_config)

    def preprocess_inputs_for_detection(self, images, annotations=None):
        """Convert numpy images (and optional annotations) into detection-model inputs."""
        images = list(images)
        assert isinstance(images[0], np.ndarray)
        annotations = self._convert_annotations_to_coco_format(annotations)
        inputs = self.detection_image_preprocessor(images, annotations=annotations, return_tensors="pt")
        return inputs

    def preprocess_inputs_for_ocr(self, images):
        """Convert numpy crop images into OCR-model pixel values."""
        images = list(images)
        assert isinstance(images[0], np.ndarray)
        return self.ocr_preprocessor(images, return_tensors="pt").pixel_values

    def preprocess_inputs_for_crop_embeddings(self, images):
        """Convert numpy crop images into crop-embedding-model pixel values."""
        images = list(images)
        assert isinstance(images[0], np.ndarray)
        return self.crop_embedding_image_preprocessor(images, return_tensors="pt").pixel_values

    def postprocess_detections_and_associations(
        self,
        predicted_bboxes,
        predicted_class_scores,
        original_image_sizes,
        get_character_character_matching_scores,
        get_text_character_matching_scores,
        get_dialog_confidence_scores,
        character_detection_threshold=0.3,
        panel_detection_threshold=0.2,
        text_detection_threshold=0.25,
        character_character_matching_threshold=0.7,
        text_character_matching_threshold=0.4,
    ):
        """Turn raw model outputs into per-image dicts of boxes and associations.

        Thresholds select which detections to keep per class; the three
        ``get_*`` arguments are callbacks that the model supplies to score
        matchings lazily, given the kept indices. Panels and texts are
        reordered in reading order (mutating the batch tensors in place)
        before the final result dicts are built.

        Returns a list of dicts per image with box lists, scores, pairwise
        associations, character cluster labels and dialog confidences.
        """
        assert self.config.disable_detections is False
        # per-query best class and its (sigmoid) confidence
        batch_scores, batch_labels = predicted_class_scores.max(-1)
        batch_scores = batch_scores.sigmoid()
        batch_labels = batch_labels.long()
        batch_bboxes = center_to_corners_format(predicted_bboxes)

        # scale the bboxes back to the original image size
        if isinstance(original_image_sizes, List):
            img_h = torch.Tensor([i[0] for i in original_image_sizes])
            img_w = torch.Tensor([i[1] for i in original_image_sizes])
        else:
            img_h, img_w = original_image_sizes.unbind(1)
        scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1).to(batch_bboxes.device)
        batch_bboxes = batch_bboxes * scale_fct[:, None, :]

        batch_panel_indices = self._get_indices_of_panels_to_keep(batch_scores, batch_labels, batch_bboxes, panel_detection_threshold)
        batch_character_indices = self._get_indices_of_characters_to_keep(batch_scores, batch_labels, batch_bboxes, character_detection_threshold)
        batch_text_indices = self._get_indices_of_texts_to_keep(batch_scores, batch_labels, batch_bboxes, text_detection_threshold)

        batch_character_character_matching_scores = get_character_character_matching_scores(batch_character_indices, batch_bboxes)
        batch_text_character_matching_scores = get_text_character_matching_scores(batch_text_indices, batch_character_indices)
        batch_dialog_confidence_scores = get_dialog_confidence_scores(batch_text_indices)

        # sort panels and texts in the reading order
        # NOTE: this block mutates batch_bboxes/batch_scores in place so the
        # kept indices now point at reading-order-sorted entries
        for batch_index in range(len(batch_scores)):
            panel_bboxes = batch_bboxes[batch_index][batch_panel_indices[batch_index]]
            panel_scores = batch_scores[batch_index][batch_panel_indices[batch_index]]
            text_bboxes = batch_bboxes[batch_index][batch_text_indices[batch_index]]
            text_scores = batch_scores[batch_index][batch_text_indices[batch_index]]

            sorted_panel_indices = sort_panels(panel_bboxes)
            batch_bboxes[batch_index][batch_panel_indices[batch_index]] = panel_bboxes[sorted_panel_indices]
            batch_scores[batch_index][batch_panel_indices[batch_index]] = panel_scores[sorted_panel_indices]
            sorted_panels = batch_bboxes[batch_index][batch_panel_indices[batch_index]]

            sorted_text_indices = sort_text_boxes_in_reading_order(text_bboxes, sorted_panels)
            batch_bboxes[batch_index][batch_text_indices[batch_index]] = text_bboxes[sorted_text_indices]
            batch_scores[batch_index][batch_text_indices[batch_index]] = text_scores[sorted_text_indices]
            # keep the matching/confidence rows aligned with the new text order
            batch_text_character_matching_scores[batch_index] = batch_text_character_matching_scores[batch_index][sorted_text_indices]
            batch_dialog_confidence_scores[batch_index] = batch_dialog_confidence_scores[batch_index][sorted_text_indices]

        results = []
        for batch_index in range(len(batch_scores)):
            panel_bboxes = batch_bboxes[batch_index][batch_panel_indices[batch_index]]
            panel_scores = batch_scores[batch_index][batch_panel_indices[batch_index]]
            text_bboxes = batch_bboxes[batch_index][batch_text_indices[batch_index]]
            text_scores = batch_scores[batch_index][batch_text_indices[batch_index]]
            character_bboxes = batch_bboxes[batch_index][batch_character_indices[batch_index]]
            character_scores = batch_scores[batch_index][batch_character_indices[batch_index]]
            # all character pairs whose affinity clears the threshold
            char_i, char_j = torch.where(batch_character_character_matching_scores[batch_index] > character_character_matching_threshold)
            character_character_associations = torch.stack([char_i, char_j], dim=1)
            # only texts confident enough to be dialog get a speaker
            text_boxes_to_match = batch_dialog_confidence_scores[batch_index] > text_character_matching_threshold
            if 0 in batch_text_character_matching_scores[batch_index].shape:
                text_character_associations = torch.zeros((0, 2), dtype=torch.long)
            else:
                most_likely_speaker_for_each_text = torch.argmax(batch_text_character_matching_scores[batch_index], dim=1)[text_boxes_to_match]
                text_indices = torch.arange(len(text_bboxes)).type_as(most_likely_speaker_for_each_text)[text_boxes_to_match]
                text_character_associations = torch.stack([text_indices, most_likely_speaker_for_each_text], dim=1)

            # cluster characters via union-find over the thresholded affinities
            character_ufds = UnionFind.from_adj_matrix(
                batch_character_character_matching_scores[batch_index] > character_character_matching_threshold
            )
            results.append({
                "panels": panel_bboxes.tolist(),
                "panel_scores": panel_scores.tolist(),
                "texts": text_bboxes.tolist(),
                "text_scores": text_scores.tolist(),
                "characters": character_bboxes.tolist(),
                "character_scores": character_scores.tolist(),
                "character_character_associations": character_character_associations.tolist(),
                "text_character_associations": text_character_associations.tolist(),
                "character_cluster_labels": character_ufds.get_labels_for_connected_components(),
                "dialog_confidences": batch_dialog_confidence_scores[batch_index].tolist(),
            })
        return results

    def postprocess_ocr_tokens(self, generated_ids, skip_special_tokens=True):
        """Decode generated OCR token ids into strings."""
        return self.ocr_preprocessor.batch_decode(generated_ids, skip_special_tokens=skip_special_tokens)

    def crop_image(self, image, bboxes):
        """Crop x1y1x2y2 boxes out of a (H, W, ...) numpy image.

        Boxes are sanitised first: coordinates are cast to int, re-ordered,
        clamped to the image bounds and expanded to at least 10px per side.
        """
        crops_for_image = []
        for bbox in bboxes:
            x1, y1, x2, y2 = bbox

            # fix the bounding box in case it is out of bounds or too small
            x1, y1, x2, y2 = int(x1), int(y1), int(x2), int(y2)
            x1, y1, x2, y2 = min(x1, x2), min(y1, y2), max(x1, x2), max(y1, y2) # ensure x1 <= x2 and y1 <= y2
            x1, y1 = max(0, x1), max(0, y1)
            x1, y1 = min(image.shape[1], x1), min(image.shape[0], y1)
            x2, y2 = max(0, x2), max(0, y2)
            x2, y2 = min(image.shape[1], x2), min(image.shape[0], y2)
            # enforce a minimum crop size of 10px in each dimension
            if x2 - x1 < 10:
                if image.shape[1] - x1 > 10:
                    x2 = x1 + 10
                else:
                    x1 = x2 - 10
            if y2 - y1 < 10:
                if image.shape[0] - y1 > 10:
                    y2 = y1 + 10
                else:
                    y1 = y2 - 10

            crop = image[y1:y2, x1:x2]
            crops_for_image.append(crop)
        return crops_for_image

    def _get_indices_of_characters_to_keep(self, batch_scores, batch_labels, batch_bboxes, character_detection_threshold):
        """Per image: indices of character detections (label 0) above threshold."""
        indices_of_characters_to_keep = []
        for scores, labels, _ in zip(batch_scores, batch_labels, batch_bboxes):
            indices = torch.where((labels == 0) & (scores > character_detection_threshold))[0]
            indices_of_characters_to_keep.append(indices)
        return indices_of_characters_to_keep

    def _get_indices_of_panels_to_keep(self, batch_scores, batch_labels, batch_bboxes, panel_detection_threshold):
        """Per image: panel detections (label 2) kept by score with overlap suppression.

        Panels are processed best-score first; a panel is dropped when more
        than half of its area is already covered by the union of previously
        kept panels.
        """
        indices_of_panels_to_keep = []
        for scores, labels, bboxes in zip(batch_scores, batch_labels, batch_bboxes):
            indices = torch.where(labels == 2)[0]
            bboxes = bboxes[indices]
            scores = scores[indices]
            labels = labels[indices]
            if len(indices) == 0:
                indices_of_panels_to_keep.append([])
                continue
            # sort by descending score (tuples compare element-wise, score first)
            scores, labels, indices, bboxes = zip(*sorted(zip(scores, labels, indices, bboxes), reverse=True))
            panels_to_keep = []
            union_of_panels_so_far = box(0, 0, 0, 0)
            for ps, pb, pl, pi in zip(scores, bboxes, labels, indices):
                panel_polygon = box(pb[0], pb[1], pb[2], pb[3])
                if ps < panel_detection_threshold:
                    continue
                # drop panels mostly covered by already-accepted panels
                if union_of_panels_so_far.intersection(panel_polygon).area / panel_polygon.area > 0.5:
                    continue
                panels_to_keep.append((ps, pl, pb, pi))
                union_of_panels_so_far = union_of_panels_so_far.union(panel_polygon)
            indices_of_panels_to_keep.append([p[3].item() for p in panels_to_keep])
        return indices_of_panels_to_keep

    def _get_indices_of_texts_to_keep(self, batch_scores, batch_labels, batch_bboxes, text_detection_threshold):
        """Per image: text detections (label 1) above threshold, with IoU > 0.5 duplicates suppressed."""
        indices_of_texts_to_keep = []
        for scores, labels, bboxes in zip(batch_scores, batch_labels, batch_bboxes):
            indices = torch.where((labels == 1) & (scores > text_detection_threshold))[0]
            bboxes = bboxes[indices]
            scores = scores[indices]
            labels = labels[indices]
            if len(indices) == 0:
                indices_of_texts_to_keep.append([])
                continue
            # sort by descending score (tuples compare element-wise, score first)
            scores, labels, indices, bboxes = zip(*sorted(zip(scores, labels, indices, bboxes), reverse=True))
            texts_to_keep = []
            texts_to_keep_as_shapely_objects = []
            for ts, tb, tl, ti in zip(scores, bboxes, labels, indices):
                text_polygon = box(tb[0], tb[1], tb[2], tb[3])
                should_append = True
                # suppress boxes with IoU > 0.5 against any kept text box
                for t in texts_to_keep_as_shapely_objects:
                    if t.intersection(text_polygon).area / t.union(text_polygon).area > 0.5:
                        should_append = False
                        break
                if should_append:
                    texts_to_keep.append((ts, tl, tb, ti))
                    texts_to_keep_as_shapely_objects.append(text_polygon)
            indices_of_texts_to_keep.append([t[3].item() for t in texts_to_keep])
        return indices_of_texts_to_keep

    def _convert_annotations_to_coco_format(self, annotations):
        """Convert the project's annotation dicts into COCO-style dicts for the HF image processor."""
        if annotations is None:
            return None
        self._verify_annotations_are_in_correct_format(annotations)
        coco_annotations = []
        for annotation in annotations:
            coco_annotation = {
                "image_id": annotation["image_id"],
                "annotations": [],
            }
            for bbox, label in zip(annotation["bboxes_as_x1y1x2y2"], annotation["labels"]):
                coco_annotation["annotations"].append({
                    "bbox": x1y1x2y2_to_xywh(bbox),
                    "category_id": label,
                    "area": (bbox[2] - bbox[0]) * (bbox[3] - bbox[1]),
                })
            coco_annotations.append(coco_annotation)
        return coco_annotations

    def _verify_annotations_are_in_correct_format(self, annotations):
        """Raise ValueError with a usage example if the annotation structure is malformed."""
        error_msg = """
        Annotations must be in the following format:
        [
            {
                "image_id": 0,
                "bboxes_as_x1y1x2y2": [[0, 0, 10, 10], [10, 10, 20, 20], [20, 20, 30, 30]],
                "labels": [0, 1, 2],
            },
            ...
        ]
        Labels: 0 for characters, 1 for text, 2 for panels.
        """
        if annotations is None:
            return
        if not isinstance(annotations, List) and not isinstance(annotations, tuple):
            raise ValueError(
                f"{error_msg} Expected a List/Tuple, found {type(annotations)}."
            )
        if len(annotations) == 0:
            return
        if not isinstance(annotations[0], dict):
            raise ValueError(
                f"{error_msg} Expected a List[Dict], found {type(annotations[0])}."
            )
        if "image_id" not in annotations[0]:
            raise ValueError(
                f"{error_msg} Dict must contain 'image_id'."
            )
        if "bboxes_as_x1y1x2y2" not in annotations[0]:
            raise ValueError(
                f"{error_msg} Dict must contain 'bboxes_as_x1y1x2y2'."
            )
        if "labels" not in annotations[0]:
            raise ValueError(
                f"{error_msg} Dict must contain 'labels'."
            )
pytorch_model.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:219c2b80e741b1d02e92f22701a38358a5606d6460ad8b6335091e909b212011
|
3 |
+
size 2063428286
|
utils.py
ADDED
@@ -0,0 +1,391 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch
|
2 |
+
import numpy as np
|
3 |
+
import random
|
4 |
+
import matplotlib.pyplot as plt
|
5 |
+
import matplotlib.patches as patches
|
6 |
+
from shapely.geometry import Point, box
|
7 |
+
import networkx as nx
|
8 |
+
from copy import deepcopy
|
9 |
+
from itertools import groupby
|
10 |
+
|
11 |
+
def move_to_device(inputs, device):
    """Recursively transfer every tensor in a nested structure to *device*.

    Mappings, lists and tuples are rebuilt with their contents moved;
    numpy arrays are converted to torch tensors first; any other value is
    assumed to expose a ``.to(device)`` method (e.g. a torch tensor).
    """
    if hasattr(inputs, "keys"):
        return {key: move_to_device(value, device) for key, value in inputs.items()}
    if isinstance(inputs, list):
        return [move_to_device(item, device) for item in inputs]
    if isinstance(inputs, tuple):
        return tuple(move_to_device(item, device) for item in inputs)
    if isinstance(inputs, np.ndarray):
        return torch.from_numpy(inputs).to(device)
    return inputs.to(device)
class UnionFind:
    """Disjoint-set (union-find) structure with union-by-size and path compression.

    Tracks the partition of ``n`` nodes (0..n-1) into connected components.
    """

    def __init__(self, n):
        # Every node starts as the root of its own singleton component.
        self.parent = list(range(n))
        self.size = [1] * n
        self.num_components = n

    @classmethod
    def from_adj_matrix(cls, adj_matrix):
        """Build from a square adjacency matrix; any entry > 0 is an edge."""
        ufds = cls(adj_matrix.shape[0])
        for i in range(adj_matrix.shape[0]):
            for j in range(adj_matrix.shape[1]):
                if adj_matrix[i, j] > 0:
                    ufds.unite(i, j)
        return ufds

    @classmethod
    def from_adj_list(cls, adj_list):
        """Build from an adjacency list (sequence of neighbour sequences)."""
        ufds = cls(len(adj_list))
        for i in range(len(adj_list)):
            for j in adj_list[i]:
                ufds.unite(i, j)
        return ufds

    @classmethod
    def from_edge_list(cls, edge_list, num_nodes):
        """Build from an explicit iterable of (u, v) edges over *num_nodes* nodes."""
        ufds = cls(num_nodes)
        for edge in edge_list:
            ufds.unite(edge[0], edge[1])
        return ufds

    def find(self, x):
        """Return the representative (root) of x's component.

        Iterative with full path compression: the recursive version could
        hit Python's recursion limit on long parent chains; results are
        identical.
        """
        root = x
        while self.parent[root] != root:
            root = self.parent[root]
        # Second pass: re-point every node on the walked path straight at the root.
        while self.parent[x] != root:
            self.parent[x], x = root, self.parent[x]
        return root

    def unite(self, x, y):
        """Merge the components containing x and y (union by size)."""
        x = self.find(x)
        y = self.find(y)
        if x != y:
            # Attach the smaller tree under the larger one to keep trees shallow.
            if self.size[x] < self.size[y]:
                x, y = y, x
            self.parent[y] = x
            self.size[x] += self.size[y]
            self.num_components -= 1

    def get_components_of(self, x):
        """Return all node indices in the same component as x."""
        x = self.find(x)
        return [i for i in range(len(self.parent)) if self.find(i) == x]

    def are_connected(self, x, y):
        """True when x and y belong to the same component."""
        return self.find(x) == self.find(y)

    def get_size(self, x):
        """Return the number of nodes in x's component."""
        return self.size[self.find(x)]

    def get_num_components(self):
        """Return the current number of disjoint components."""
        return self.num_components

    def get_labels_for_connected_components(self):
        """Return a per-node component label, numbered by order of first appearance."""
        map_parent_to_label = {}
        labels = []
        for i in range(len(self.parent)):
            parent = self.find(i)
            if parent not in map_parent_to_label:
                map_parent_to_label[parent] = len(map_parent_to_label)
            labels.append(map_parent_to_label[parent])
        return labels
def visualise_single_image_prediction(image_as_np_array, predictions, filename):
    """Render detections and associations for one page onto the image.

    Draws bounding boxes (panels green, texts red with indices, characters
    blue), links character-cluster members with coloured lines, and links
    each text box to its speaking character with a dashed red line whose
    opacity encodes dialog confidence. Saves to *filename* if given, and
    returns the rendered figure as a numpy array.
    """
    figure, subplot = plt.subplots(1, 1, figsize=(10, 10))
    subplot.imshow(image_as_np_array)
    plot_bboxes(subplot, predictions["panels"], color="green")
    plot_bboxes(subplot, predictions["texts"], color="red", add_index=True)
    plot_bboxes(subplot, predictions["characters"], color="blue")

    COLOURS = [
        "#b7ff51", # green
        "#f50a8f", # pink
        "#4b13b6", # purple
        "#ddaa34", # orange
        "#bea2a2", # brown
    ]
    colour_index = 0
    character_cluster_labels = predictions["character_cluster_labels"]
    # Assign the fixed palette to the most frequent clusters first.
    unique_label_sorted_by_frequency = sorted(list(set(character_cluster_labels)), key=lambda x: character_cluster_labels.count(x), reverse=True)
    for label in unique_label_sorted_by_frequency:
        # First member of the cluster acts as the hub ("root"); every other
        # member is connected to it by a line.
        root = None
        others = []
        for i in range(len(predictions["characters"])):
            if character_cluster_labels[i] == label:
                if root is None:
                    root = i
                else:
                    others.append(i)
        if colour_index >= len(COLOURS):
            # Palette exhausted: pick a random hex colour outside the palette.
            # NOTE(review): random colours are not recorded, so two overflow
            # clusters could receive the same colour — confirm acceptable.
            random_colour = COLOURS[0]
            while random_colour in COLOURS:
                random_colour = "#" + "".join([random.choice("0123456789ABCDEF") for j in range(6)])
        else:
            random_colour = COLOURS[colour_index]
            colour_index += 1
        # Mark the root character's centre.
        bbox_i = predictions["characters"][root]
        x1 = bbox_i[0] + (bbox_i[2] - bbox_i[0]) / 2
        y1 = bbox_i[1] + (bbox_i[3] - bbox_i[1]) / 2
        subplot.plot([x1], [y1], color=random_colour, marker="o", markersize=5)
        for j in others:
            # draw line from centre of bbox i to centre of bbox j
            bbox_j = predictions["characters"][j]
            x1 = bbox_i[0] + (bbox_i[2] - bbox_i[0]) / 2
            y1 = bbox_i[1] + (bbox_i[3] - bbox_i[1]) / 2
            x2 = bbox_j[0] + (bbox_j[2] - bbox_j[0]) / 2
            y2 = bbox_j[1] + (bbox_j[3] - bbox_j[1]) / 2
            subplot.plot([x1, x2], [y1, y2], color=random_colour, linewidth=2)
            subplot.plot([x2], [y2], color=random_colour, marker="o", markersize=5)

    # Text-to-character association lines; alpha = dialog confidence of the text.
    for (i, j) in predictions["text_character_associations"]:
        score = predictions["dialog_confidences"][i]
        bbox_i = predictions["texts"][i]
        bbox_j = predictions["characters"][j]
        x1 = bbox_i[0] + (bbox_i[2] - bbox_i[0]) / 2
        y1 = bbox_i[1] + (bbox_i[3] - bbox_i[1]) / 2
        x2 = bbox_j[0] + (bbox_j[2] - bbox_j[0]) / 2
        y2 = bbox_j[1] + (bbox_j[3] - bbox_j[1]) / 2
        subplot.plot([x1, x2], [y1, y2], color="red", linewidth=2, linestyle="dashed", alpha=score)

    subplot.axis("off")
    if filename is not None:
        plt.savefig(filename, bbox_inches="tight", pad_inches=0)

    figure.canvas.draw()
    # NOTE(review): relies on the private `_renderer` attribute of the
    # backend's renderer — may break on non-Agg backends or newer
    # matplotlib versions; confirm.
    image = np.array(figure.canvas.renderer._renderer)
    plt.close()
    return image
def plot_bboxes(subplot, bboxes, color="red", add_index=False):
    """Draw each [x1, y1, x2, y2] box on *subplot*; optionally label it with its index."""
    for index, bbox in enumerate(bboxes):
        width = bbox[2] - bbox[0]
        height = bbox[3] - bbox[1]
        rectangle = patches.Rectangle(
            bbox[:2], width, height, linewidth=1, edgecolor=color, facecolor="none", linestyle="solid"
        )
        subplot.add_patch(rectangle)
        if add_index:
            # Centre the index label inside the box.
            centre_x = bbox[0] + width / 2
            centre_y = bbox[1] + height / 2
            subplot.text(centre_x, centre_y, str(index), color=color, fontsize=10, ha="center", va="center")
def sort_panels(rects):
    """Return the indices of panel boxes in reading order.

    Builds a complete tournament digraph over panels — edge i -> j means
    "panel i is read before panel j" — breaks inconsistencies by deleting
    the longest-distance edge of each remaining cycle, then returns a
    topological order of the resulting DAG.
    """
    before_rects = convert_to_list_of_lists(rects)
    # slightly erode all rectangles initially to account for imperfect detections
    rects = [erode_rectangle(rect, 0.05) for rect in before_rects]
    G = nx.DiGraph()
    G.add_nodes_from(range(len(rects)))
    # Orient one edge per ordered pair by pairwise reading precedence;
    # weight is the spatial gap between the two panels.
    for i in range(len(rects)):
        for j in range(len(rects)):
            if i == j:
                continue
            if is_there_a_directed_edge(i, j, rects):
                G.add_edge(i, j, weight=get_distance(rects[i], rects[j]))
            else:
                G.add_edge(j, i, weight=get_distance(rects[i], rects[j]))
    # Pairwise decisions may be globally inconsistent: repeatedly remove the
    # heaviest edge of the first multi-node cycle until the graph is acyclic.
    while True:
        cycles = sorted(nx.simple_cycles(G))
        cycles = [cycle for cycle in cycles if len(cycle) > 1]
        if len(cycles) == 0:
            break
        cycle = cycles[0]
        edges = [e for e in zip(cycle, cycle[1:] + cycle[:1])]
        max_cyclic_edge = max(edges, key=lambda x: G.edges[x]["weight"])
        G.remove_edge(*max_cyclic_edge)
    return list(nx.topological_sort(G))
def is_strictly_above(rectA, rectB):
    """True when rectA's bottom edge is above rectB's top edge (no vertical overlap)."""
    return rectA[3] < rectB[1]
def is_strictly_below(rectA, rectB):
    """True when rectA's top edge is below rectB's bottom edge (no vertical overlap)."""
    return rectB[3] < rectA[1]
def is_strictly_left_of(rectA, rectB):
    """True when rectA's right edge is left of rectB's left edge (no horizontal overlap)."""
    return rectA[2] < rectB[0]
def is_strictly_right_of(rectA, rectB):
    """True when rectA's left edge is right of rectB's right edge (no horizontal overlap)."""
    return rectB[2] < rectA[0]
def intersects(rectA, rectB):
    """True when the two axis-aligned boxes overlap or touch.

    Pure arithmetic replacement for the shapely ``box(...).intersects``
    call: treats boxes as closed intervals, so boundary contact counts as
    an intersection (matching shapely's behaviour) without allocating
    geometry objects.
    """
    return not (
        rectA[2] < rectB[0]  # A entirely left of B
        or rectB[2] < rectA[0]  # A entirely right of B
        or rectA[3] < rectB[1]  # A entirely above B
        or rectB[3] < rectA[1]  # A entirely below B
    )
def is_there_a_directed_edge(a, b, rects):
    """Decide reading precedence between panels *a* and *b*.

    A truthy return means a is read before b; falsy means b before a.
    Precedence follows manga order: above beats below, right beats left.
    When neither relation is clear, the joint layout is analysed with
    page "cuts", or the boxes are eroded until a relation emerges.
    """
    rectA = rects[a]
    rectB = rects[b]
    centre_of_A = [rectA[0] + (rectA[2] - rectA[0]) / 2, rectA[1] + (rectA[3] - rectA[1]) / 2]
    centre_of_B = [rectB[0] + (rectB[2] - rectB[0]) / 2, rectB[1] + (rectB[3] - rectB[1]) / 2]
    if np.allclose(np.array(centre_of_A), np.array(centre_of_B)):
        # Concentric panels: the larger one is read first.
        return box(*rectA).area > (box(*rectB)).area
    copy_A = [rectA[0], rectA[1], rectA[2], rectA[3]]
    copy_B = [rectB[0], rectB[1], rectB[2], rectB[3]]
    # Erode the copies until one of the directional rules fires.
    # NOTE(review): no iteration cap — assumes erosion eventually separates
    # the boxes (centres differ, so they shrink apart); confirm termination
    # for degenerate inputs.
    while True:
        # "Above and not to the left" => read first.
        if is_strictly_above(copy_A, copy_B) and not is_strictly_left_of(copy_A, copy_B):
            return 1
        if is_strictly_above(copy_B, copy_A) and not is_strictly_left_of(copy_B, copy_A):
            return 0
        # "To the right and not below" => read first (right-to-left order).
        if is_strictly_right_of(copy_A, copy_B) and not is_strictly_below(copy_A, copy_B):
            return 1
        if is_strictly_right_of(copy_B, copy_A) and not is_strictly_below(copy_B, copy_A):
            return 0
        # Diagonal (below-right) cases are ambiguous: fall back to cut analysis.
        if is_strictly_below(copy_A, copy_B) and is_strictly_right_of(copy_A, copy_B):
            return use_cuts_to_determine_edge_from_a_to_b(a, b, rects)
        if is_strictly_below(copy_B, copy_A) and is_strictly_right_of(copy_B, copy_A):
            return use_cuts_to_determine_edge_from_a_to_b(a, b, rects)
        # otherwise they intersect
        copy_A = erode_rectangle(copy_A, 0.05)
        copy_B = erode_rectangle(copy_B, 0.05)
def get_distance(rectA, rectB):
    """Euclidean gap between two axis-aligned boxes (0.0 when they touch or overlap).

    Arithmetic replacement for shapely's ``box(...).distance(box(...))``:
    for axis-aligned rectangles the minimum distance is the hypotenuse of
    the per-axis gaps, so no geometry objects need to be built.
    """
    # Per-axis gap; clamped to 0 when the projections overlap.
    dx = max(rectB[0] - rectA[2], rectA[0] - rectB[2], 0)
    dy = max(rectB[1] - rectA[3], rectA[1] - rectB[3], 0)
    return (dx * dx + dy * dy) ** 0.5
def use_cuts_to_determine_edge_from_a_to_b(a, b, rects):
    """Order panels *a* and *b* by finding a page "cut" (gutter) that separates them.

    Considers every panel overlapping the joint bounding box of a and b,
    groups them into horizontal bands (top-to-bottom) and, failing that,
    vertical bands (right-to-left). Returns True when a's band precedes
    b's band in reading order. Boxes are eroded each round until a
    separating cut appears.
    """
    rects = deepcopy(rects)
    while True:
        # Joint bounding box of the two panels under comparison.
        xmin, ymin, xmax, ymax = min(rects[a][0], rects[b][0]), min(rects[a][1], rects[b][1]), max(rects[a][2], rects[b][2]), max(rects[a][3], rects[b][3])
        # Indices and boxes of all panels that intrude into that region
        # (same filter, so rect_index[i] corresponds to rects_copy[i]).
        rect_index = [i for i in range(len(rects)) if intersects(rects[i], [xmin, ymin, xmax, ymax])]
        rects_copy = [rect for rect in rects if intersects(rect, [xmin, ymin, xmax, ymax])]

        # try to split the panels using a "horizontal" lines
        overlapping_y_ranges = merge_overlapping_ranges([(y1, y2) for x1, y1, x2, y2 in rects_copy])
        panel_index_to_split = {}
        for split_index, (y1, y2) in enumerate(overlapping_y_ranges):
            for i, index in enumerate(rect_index):
                # A panel belongs to a band when it fits entirely inside it.
                if y1 <= rects_copy[i][1] <= rects_copy[i][3] <= y2:
                    panel_index_to_split[index] = split_index

        if panel_index_to_split[a] != panel_index_to_split[b]:
            # Lower band index = higher on the page = read first.
            return panel_index_to_split[a] < panel_index_to_split[b]

        # try to split the panels using a "vertical" lines
        overlapping_x_ranges = merge_overlapping_ranges([(x1, x2) for x1, y1, x2, y2 in rects_copy])
        panel_index_to_split = {}
        # Reversed: the right-most band comes first in manga reading order.
        for split_index, (x1, x2) in enumerate(overlapping_x_ranges[::-1]):
            for i, index in enumerate(rect_index):
                if x1 <= rects_copy[i][0] <= rects_copy[i][2] <= x2:
                    panel_index_to_split[index] = split_index
        if panel_index_to_split[a] != panel_index_to_split[b]:
            return panel_index_to_split[a] < panel_index_to_split[b]

        # otherwise, erode the rectangles and try again
        # NOTE(review): no iteration cap — assumes erosion eventually yields
        # a separating cut; confirm termination for pathological layouts.
        rects = [erode_rectangle(rect, 0.05) for rect in rects]
def erode_rectangle(bbox, erosion_factor):
|
279 |
+
x1, y1, x2, y2 = bbox
|
280 |
+
w, h = x2 - x1, y2 - y1
|
281 |
+
cx, cy = x1 + w / 2, y1 + h / 2
|
282 |
+
if w < h:
|
283 |
+
aspect_ratio = w / h
|
284 |
+
erosion_factor_width = erosion_factor * aspect_ratio
|
285 |
+
erosion_factor_height = erosion_factor
|
286 |
+
else:
|
287 |
+
aspect_ratio = h / w
|
288 |
+
erosion_factor_width = erosion_factor
|
289 |
+
erosion_factor_height = erosion_factor * aspect_ratio
|
290 |
+
w = w - w * erosion_factor_width
|
291 |
+
h = h - h * erosion_factor_height
|
292 |
+
x1, y1, x2, y2 = cx - w / 2, cy - h / 2, cx + w / 2, cy + h / 2
|
293 |
+
return [x1, y1, x2, y2]
|
294 |
+
|
295 |
+
def merge_overlapping_ranges(ranges):
    """Merge a list of (start, end) ranges into disjoint ranges.

    Ranges that overlap or merely touch (next start == current end) are
    coalesced. Returns a list of tuples sorted by start.
    """
    if not ranges:
        return []
    ordered = sorted(ranges, key=lambda r: r[0])
    merged = []
    current_start, current_end = ordered[0]
    for start, end in ordered[1:]:
        if start > current_end:
            # Gap found: close off the current range and begin a new one.
            merged.append((current_start, current_end))
            current_start, current_end = start, end
        else:
            current_end = max(current_end, end)
    merged.append((current_start, current_end))
    return merged
def sort_text_boxes_in_reading_order(text_bboxes, sorted_panel_bboxes):
    """Return indices of text boxes in overall reading order.

    Texts are first grouped by the panel they belong to (panels are given
    already in reading order), then each panel's texts are ordered
    internally.
    """
    text_bboxes = convert_to_list_of_lists(text_bboxes)
    sorted_panel_bboxes = convert_to_list_of_lists(sorted_panel_bboxes)

    if len(text_bboxes) == 0:
        return []

    def indices_of_same_elements(nums):
        # Groups *consecutive* equal values — nums must already be sorted,
        # which holds because panel ids are sorted below.
        groups = groupby(range(len(nums)), key=lambda i: nums[i])
        return [list(indices) for _, indices in groups]

    panel_id_for_text = get_text_to_panel_mapping(text_bboxes, sorted_panel_bboxes)
    indices_of_texts = list(range(len(text_bboxes)))
    # Sort text indices by panel id so texts of earlier panels come first.
    indices_of_texts, panel_id_for_text = zip(*sorted(zip(indices_of_texts, panel_id_for_text), key=lambda x: x[1]))
    indices_of_texts = list(indices_of_texts)
    grouped_indices = indices_of_same_elements(panel_id_for_text)
    # Each group occupies a contiguous slice of indices_of_texts; re-sort
    # that slice into within-panel reading order in place.
    for group in grouped_indices:
        subset_of_text_indices = [indices_of_texts[i] for i in group]
        text_bboxes_of_subset = [text_bboxes[i] for i in subset_of_text_indices]
        sorted_subset_indices = sort_texts_within_panel(text_bboxes_of_subset)
        indices_of_texts[group[0] : group[-1] + 1] = [subset_of_text_indices[i] for i in sorted_subset_indices]
    return indices_of_texts
def get_text_to_panel_mapping(text_bboxes, sorted_panel_bboxes):
    """Map each text box to the index of the panel it belongs to.

    A text is assigned to the panel it overlaps the most; if it overlaps
    no panel, to the nearest one. Returns -1 for every text when there
    are no panels at all.
    """
    mapping = []
    for text_bbox in text_bboxes:
        if len(sorted_panel_bboxes) == 0:
            mapping.append(-1)
            continue
        text_polygon = box(*text_bbox)
        overlaps = []
        distances = []
        for panel_idx, panel_bbox in enumerate(sorted_panel_bboxes):
            panel_polygon = box(*panel_bbox)
            if text_polygon.intersects(panel_polygon):
                overlaps.append((text_polygon.intersection(panel_polygon).area, panel_idx))
            distances.append((text_polygon.distance(panel_polygon), panel_idx))
        if overlaps:
            # Largest overlap wins.
            mapping.append(max(overlaps, key=lambda item: item[0])[1])
        else:
            # No overlap: nearest panel wins.
            mapping.append(min(distances, key=lambda item: item[0])[1])
    return mapping
def sort_texts_within_panel(rects):
    """Order text boxes inside one panel by distance to its top-right corner.

    Approximates manga reading order (right-to-left, top-to-bottom) by
    sorting boxes by proximity to the point (max right edge, min top edge).
    Returns the permutation of indices into *rects*.
    """
    top = min((rect[1] for rect in rects), default=float("inf"))
    right = max((rect[2] for rect in rects), default=float("-inf"))
    reference_point = Point(right, top)

    indexed_polygons = [(box(*rect), idx) for idx, rect in enumerate(rects)]
    # Closest to the top-right reference point comes first.
    indexed_polygons.sort(key=lambda pair: reference_point.distance(pair[0]))
    return [idx for _, idx in indexed_polygons]
def x1y1wh_to_x1y1x2y2(bbox):
    """Convert a [x, y, width, height] box to corner format [x1, y1, x2, y2]."""
    x, y, width, height = bbox
    return [x, y, x + width, y + height]
def x1y1x2y2_to_xywh(bbox):
    """Convert a corner-format box [x1, y1, x2, y2] to [x, y, width, height]."""
    left, top, right, bottom = bbox
    return [left, top, right - left, bottom - top]
def convert_to_list_of_lists(rects):
    """Normalise boxes to a plain list of [x1, y1, x2, y2] lists.

    Accepts a torch tensor, a numpy array, or any iterable of 4-element
    sequences.
    """
    # Tensors and arrays share the same .tolist() conversion — one check
    # instead of two duplicated branches.
    if isinstance(rects, (torch.Tensor, np.ndarray)):
        return rects.tolist()
    return [[a, b, c, d] for a, b, c, d in rects]