# Spaces: yumikimi381 / alps
# Build error
# File size: 8,693 Bytes
# daf0288

from typing import Any, List, Literal, Mapping, Optional, Tuple
import time 

from PIL import Image
# Numpy image type
import numpy.typing as npt
from numpy import uint8
ImageType = npt.NDArray[uint8]

import numpy as np
import uuid

from doctrfiles import DoctrWordDetector,DoctrTextRecognizer,Wordboxes
from deepdoc import RagFlow
from utils import LineAnnotation,WordAnnotation,getlogger,cropImageExtraMargin,crop_an_Image,cropImages,get_new_coord
from numpy.typing import NDArray

# Scale factor for the padded square canvas of a "wide" text line. It is passed
# to cropImageExtraMargin() in predict() and re-used in convert_coordinates()
# to undo that padding.
MARGIN_FACTOR = 1.4


class OCRComponent1():
    """
    OCR pipeline made of three stages:

    * RagFlow as the text line detector,
    * DocTR's word detector, run on every cropped text line,
    * DocTR's text recognizer, run on every cropped word.
    """

    def __init__(self, englishflag=False):
        """
        Load the three models and log how long each one takes to initialize.

        Args:
            englishflag: when False (default) the recognizer is built with the
                "parseq" architecture and the multilingual weights; when True
                it is built with the "master" architecture and its weights.
        """
        logger = getlogger("1")

        start_time = time.time()
        self.textlineDetector = RagFlow()
        execution_time = time.time() - start_time
        logger.info(f"time to initialize Ragflow: {execution_time} seconds")

        start_time = time.time()
        self.wordDetector = DoctrWordDetector(
            architecture="db_resnet50",
            path_weights="doctrfiles/models/db_resnet50-79bd7d70.pt",
            path_config_json="doctrfiles/models/db_resnet50_config.json",
        )
        execution_time = time.time() - start_time
        logger.info(f"time to initialize DoctrWordDetectorDebug: {execution_time} seconds")

        start_time = time.time()
        if not englishflag:
            self.textRecognizer = DoctrTextRecognizer(
                architecture="parseq",
                path_weights="doctrfiles/models/doctr-multilingual-parseq.bin",
                path_config_json="doctrfiles/models/multilingual-parseq-config.json",
            )
        else:
            self.textRecognizer = DoctrTextRecognizer(
                architecture="master",
                path_weights="doctrfiles/models/master-fde31e4a.pt",
                path_config_json="doctrfiles/models/master.json",
            )
        execution_time = time.time() - start_time
        logger.info(f"time to initialize DoctrTextRecognizer: {execution_time} seconds")

    @staticmethod
    def save_detection(detected_lines_images: List[ImageType], prefix='./res/test1/res_'):
        """
        Debug helper: write every image to ``<prefix><i>.png``.

        Args:
            detected_lines_images: images as arrays accepted by
                ``PIL.Image.fromarray``.
            prefix: path prefix of the output files; the target directory is
                not created here and must already exist.
        """
        for i, img in enumerate(detected_lines_images):
            Image.fromarray(img).save(prefix + str(i) + '.png')

    @staticmethod
    def _box_corners(xmin, ymin, xmax, ymax) -> NDArray[np.float32]:
        """Return the 4x2 float32 corner array of an axis-aligned box."""
        return np.array([
            [xmin, ymin],
            [xmax, ymin],
            [xmax, ymax],
            [xmin, ymax],
        ], dtype=np.float32)

    @staticmethod
    def convert_coordinates(original_coord: NDArray[np.float32], detection_res: NDArray[np.float32]) -> NDArray[np.float32]:
        """
        Map a word box found in a padded line crop back to page coordinates.

        The line crop is assumed to be centred on a square canvas whose side
        is ``max(height, width)`` of the line box times 3 (aspect ratio
        below 1.6) or times MARGIN_FACTOR (otherwise).
        NOTE(review): this presumably mirrors the padding applied by
        cropImageExtraMargin(); confirm the two stay in sync.

        Args:
            original_coord: 4x2 corners of the line box in the page, ordered
                [[xmin, ymin], [xmax, ymin], [xmax, ymax], [xmin, ymax]].
            detection_res: 4x2 corners of the word box inside the padded crop.
                Only the x of corners 0 and 1 and the y of corners 0 and 2
                are used, i.e. the box is treated as axis-aligned.

        Returns:
            1-D array ``[x0, y0, x1, y1]`` of the word box in page coordinates.
        """
        height = original_coord[3][1] - original_coord[0][1]
        width = original_coord[1][0] - original_coord[0][0]

        bigger = max(height, width)
        # A zero-height box is handled like a wide one. NumPy scalars already
        # behaved that way (inf/nan comparisons are False); plain Python
        # numbers used to raise ZeroDivisionError here.
        if height != 0 and width / height < 1.6:
            side = int(bigger * 3)
        else:
            side = int(bigger * MARGIN_FACTOR)

        # Offset of the un-padded line image inside the square canvas.
        y_offset = (side - height) // 2
        x_offset = (side - width) // 2

        # Word box corners relative to the un-padded line image.
        rel = np.array([
            [detection_res[i][0] - x_offset, detection_res[i][1] - y_offset]
            for i in range(4)
        ])

        xmin = original_coord[0][0]
        ymin = original_coord[0][1]
        # Collapsed to [x0, y0, x1, y1]; this used to be a 4x2 array.
        return np.array([xmin + rel[0][0], ymin + rel[0][1], xmin + rel[1][0], ymin + rel[2][1]])

    def predict(self, img: ImageType) -> Mapping[Any, LineAnnotation]:
        """
        Run line detection, word detection and word recognition on one page.

        Args:
            img: page image; must expose ``.shape`` as (height, width, ...).

        Returns:
            Dict mapping each LineAnnotation's ``index`` to the annotation.
            Every detected word is appended to its line's ``words`` list as
            a WordAnnotation whose box is in page coordinates.
            NOTE(review): the recognizer's return value is discarded, so the
            recognized text is presumably written into the WordAnnotation
            objects in place; confirm in DoctrTextRecognizer.predict.
        """
        logger = getlogger("1")

        start_time = time.time()
        # Each detected line is a 4x2 float32 array of corner points, e.g.
        # [[90., 98.], [313., 100.], [312., 129.], [90., 127.]].
        bxs: List[NDArray[np.float32]] = self.textlineDetector.predict(img=np.array(img))
        execution_time = time.time() - start_time
        logger.info(f"time to detecttextline: {execution_time} seconds")

        line_annotations = {}
        straightboxs = []
        for points in bxs:
            xmin, ymin, xmax, ymax = get_new_coord(img.shape[1], img.shape[0], points)
            straightboxs.append(self._box_corners(xmin, ymin, xmax, ymax))
            ann = LineAnnotation(box=[xmin, ymin, xmax, ymax])
            line_annotations[ann.index] = ann

        # Cropped (and padded) image of every detected line, in the same
        # order as line_annotations.
        detected_lines_images: List[ImageType] = cropImageExtraMargin(
            straightboxs, img, margin=MARGIN_FACTOR, straight=True
        )

        start_time = time.time()
        for (line_id, line_ann), lineimg in zip(line_annotations.items(), detected_lines_images):
            # The corners are rebuilt from the stored box, as before, in case
            # LineAnnotation changes the box it was given.
            xmin, ymin, xmax, ymax = line_ann.box
            original_coord_b = self._box_corners(xmin, ymin, xmax, ymax)

            detection_results: List[Wordboxes] = self.wordDetector.predict(lineimg)

            # word annotation index -> [cropped word image, WordAnnotation]
            input_Word_recog = {}
            for wordbox in detection_results:
                cropped_image = crop_an_Image(wordbox.box, lineimg)
                # wordbox.box is relative to the padded line crop; convert it
                # to page coordinates ([x0, y0, x1, y1]).
                coord_in_page = self.convert_coordinates(original_coord_b, wordbox.box)

                wordAnn = WordAnnotation(box=coord_in_page, text=None)
                input_Word_recog[wordAnn.index] = [cropped_image, wordAnn]
                line_ann.words.append(wordAnn)

            self.textRecognizer.predict(input_Word_recog)

        execution_time = time.time() - start_time
        logger.info(f"Entire DocTR pipeline: {execution_time} seconds")
        return line_annotations