Spaces:
Sleeping
Sleeping
File size: 2,647 Bytes
7e24b41 1802405 7e24b41 1802405 7e24b41 1802405 7e24b41 1802405 7e24b41 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 |
from concurrent.futures import ThreadPoolExecutor
from typing import Optional

import easyocr
import numpy as np
import pymupdf
from pdf2image import convert_from_path

from src.utils.exceptions import CustomException
from src.utils.functions import cleanText, getConfig
from src.utils.logging import logger
class PdfLoader:
    """Extract text from PDF files, via the embedded text layer or via OCR."""

    def __init__(self) -> None:
        """
        Initialize the PdfLoader with configuration settings and an EasyOCR reader.

        Reads "config.ini" through getConfig and builds an English EasyOCR
        reader whose GPU usage is controlled by the boolean option "gpu" in
        the "EASYOCR" section.
        """
        self.config = getConfig(path="config.ini")
        self.reader = easyocr.Reader(['en'], gpu=self.config.getboolean("EASYOCR", "gpu"))

    def extractTextFromPage(self, page) -> str:
        """
        Extract and clean text from a PDF page.

        Args:
            page: A PyMuPDF page object (anything exposing get_text()).

        Returns:
            str: The page text after being passed through cleanText.
        """
        return cleanText(text=page.get_text())

    def searchablePdf(self, pdfPath: str) -> Optional[str]:
        """
        Extract text from a searchable PDF.

        Args:
            pdfPath (str): The file path to the searchable PDF.

        Returns:
            Optional[str]: The cleaned text of every page joined with newlines
            (an empty string for a document without pages), or None when
            extraction fails; the failure is logged rather than raised.
        """
        try:
            logger.info("Text Extraction Started from Searchable PDF")
            # The context manager closes the document even when a page fails
            # to extract. Pages are processed sequentially because PyMuPDF
            # does not support use of a document from multiple threads.
            with pymupdf.open(pdfPath) as doc:
                texts = [self.extractTextFromPage(page) for page in doc]
            return "\n".join(texts)
        except Exception as e:
            # Best-effort boundary: log the wrapped error and signal failure
            # to the caller with None instead of propagating.
            logger.error(CustomException(e))
            return None

    def getText(self, image) -> str:
        """
        Extract and clean text from an image using EasyOCR.

        Args:
            image: An image; it is converted with np.array before OCR, so a
                numpy array or any array-convertible image object works.

        Returns:
            str: The recognized paragraphs joined with newlines and passed
            through cleanText.
        """
        detections = self.reader.readtext(np.array(image), paragraph=True)
        # Each detection is indexed at position 1 to obtain its text.
        paragraphs = [detection[1] for detection in detections]
        return cleanText(text="\n".join(paragraphs))

    def scannablePdf(self, pdfPath: str) -> Optional[str]:
        """
        Extract text from a scannable PDF using OCR.

        Args:
            pdfPath (str): The file path to the scannable PDF.

        Returns:
            Optional[str]: The OCR text of every page image joined with
            newlines, or None when conversion or OCR fails; the failure is
            logged rather than raised.
        """
        try:
            logger.info("Text Extraction Started from Scannable PDF")
            pageImages = convert_from_path(pdfPath)
            texts = [self.getText(pageImage) for pageImage in pageImages]
            return "\n".join(texts)
        except Exception as e:
            # Best-effort boundary: log the wrapped error and signal failure
            # to the caller with None instead of propagating.
            logger.error(CustomException(e))
            return None