File size: 2,647 Bytes
7e24b41
 
 
 
 
 
 
 
 
 
 
1802405
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7e24b41
 
 
 
 
 
 
 
 
1802405
 
 
 
 
 
 
 
 
 
 
7e24b41
1802405
 
 
 
 
 
 
 
7e24b41
1802405
 
 
7e24b41
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
from src.utils.functions import cleanText, getConfig
from concurrent.futures import ThreadPoolExecutor
from src.utils.exceptions import CustomException
from pdf2image import convert_from_path
from src.utils.logging import logger
import numpy as np
import pymupdf
import easyocr

class PdfLoader:
    """Extract text from PDFs, either from the embedded text layer or via OCR.

    ``searchablePdf`` reads the text layer with PyMuPDF; ``scannablePdf``
    rasterizes each page with pdf2image and runs EasyOCR over the images.
    """

    def __init__(self) -> None:
        """
        Initialize the PdfLoader with configuration settings and an EasyOCR reader.

        Reads ``config.ini`` through ``getConfig`` and builds an English EasyOCR
        reader whose GPU usage is controlled by the ``[EASYOCR] gpu`` option.
        """
        self.config = getConfig(path="config.ini")
        self.reader = easyocr.Reader(['en'], gpu=self.config.getboolean("EASYOCR", "gpu"))

    def extractTextFromPage(self, page) -> str:
        """
        Extract and clean text from a PDF page.

        Args:
            page: A PyMuPDF page object.

        Returns:
            str: The page text after being passed through ``cleanText``.
        """
        return cleanText(text=page.get_text())

    def searchablePdf(self, pdfPath: str) -> str:
        """
        Extract text from a searchable PDF.

        Args:
            pdfPath (str): The file path to the searchable PDF.

        Returns:
            str: The cleaned text of every page, joined with newlines.
            If anything fails, the error is logged and ``None`` is returned.
        """
        try:
            logger.info("Text Extraction Started from Searchable PDF")
            # The context manager closes the document even when extraction
            # raises, so the file handle is never leaked.
            with pymupdf.open(pdfPath) as doc:
                # Pages are processed sequentially on purpose: PyMuPDF does not
                # support using one document from multiple threads.
                texts = [self.extractTextFromPage(page) for page in doc]
            return "\n".join(texts)
        except Exception as e:
            # Best-effort contract kept for existing callers: log, return None.
            logger.error(CustomException(e))
            return None

    def getText(self, image) -> str:
        """
        Extract and clean text from an image using EasyOCR.

        Args:
            image: An image; it is converted with ``np.array`` before OCR.

        Returns:
            str: The recognized paragraphs joined with newlines and passed
            through ``cleanText``.
        """
        detections = self.reader.readtext(np.array(image), paragraph=True)
        # Index 1 of each detection holds the recognized string.
        text = "\n".join(detection[1] for detection in detections)
        return cleanText(text=text)

    def scannablePdf(self, pdfPath: str) -> str:
        """
        Extract text from a scannable PDF using OCR.

        Args:
            pdfPath (str): The file path to the scannable PDF.

        Returns:
            str: The OCR text of every page, joined with newlines.
            If anything fails, the error is logged and ``None`` is returned.
        """
        try:
            logger.info("Text Extraction Started from Scannable PDF")
            allImages = convert_from_path(pdfPath)
            texts = [self.getText(image) for image in allImages]
            return "\n".join(texts)
        except Exception as e:
            # Best-effort contract kept for existing callers: log, return None.
            logger.error(CustomException(e))
            return None