Spaces:

techconsptr
/

ConversAI

Sleeping

App Files Files Community

ConversAI / src /components /loaders /pdfLoader.py

techconsptrs

UPDATE: code update

1802405 9 months ago

raw

history blame contribute delete

2.65 kB

	from src.utils.functions import cleanText, getConfig
	from concurrent.futures import ThreadPoolExecutor
	from src.utils.exceptions import CustomException
	from pdf2image import convert_from_path
	from src.utils.logging import logger
	import numpy as np
	import pymupdf
	import easyocr

	class PdfLoader:
	    """
	    Extract text from PDF files.

	    Two strategies are offered: reading the embedded text layer with PyMuPDF
	    (``searchablePdf``) and rasterizing the pages and running EasyOCR on them
	    (``scannablePdf``).
	    """

	    def __init__(self) -> None:
	        """
	        Initialize the PdfLoader with configuration settings and an EasyOCR reader.

	        The configuration is read from ``config.ini``; the ``gpu`` option of its
	        ``EASYOCR`` section decides whether the English EasyOCR reader uses the GPU.
	        """
	        self.config = getConfig(path="config.ini")
	        self.reader = easyocr.Reader(['en'], gpu=self.config.getboolean("EASYOCR", "gpu"))

	    def extractTextFromPage(self, page) -> str:
	        """
	        Extract and clean text from a PDF page.

	        Args:
	            page: A PyMuPDF page object.

	        Returns:
	            str: Cleaned text extracted from the page.
	        """
	        return cleanText(text=page.get_text())

	    def searchablePdf(self, pdfPath: str) -> str:
	        """
	        Extract text from a searchable PDF.

	        Args:
	            pdfPath (str): The file path to the searchable PDF.

	        Returns:
	            str: The cleaned text of every page, in page order, joined by newlines.
	            If anything fails, the error is logged and ``None`` is returned
	            implicitly (best-effort behaviour; no exception reaches the caller).
	        """
	        try:
	            logger.info("Text Extraction Started from Searchable PDF")
	            # The context manager closes the document even when a page fails to
	            # extract, so the file handle is never leaked.
	            with pymupdf.open(pdfPath) as doc:
	                # Pages are processed sequentially on purpose: PyMuPDF does not
	                # support being driven from multiple threads, so fanning pages of
	                # one document out to a thread pool risks corrupt output or a crash.
	                pageTexts = [self.extractTextFromPage(page) for page in doc]
	            return "\n".join(pageTexts)
	        except Exception as e:
	            logger.error(CustomException(e))

	    def getText(self, image) -> str:
	        """
	        Extract and clean text from an image using EasyOCR.

	        Args:
	            image: An image accepted by ``np.array`` (``scannablePdf`` passes the
	                page images produced by ``convert_from_path``).

	        Returns:
	            str: Cleaned text extracted from the image.
	        """
	        detections = self.reader.readtext(np.array(image), paragraph=True)
	        # Index 1 of each result entry is the recognized text.
	        rawText = "\n".join(detection[1] for detection in detections)
	        return cleanText(text=rawText)

	    def scannablePdf(self, pdfPath: str) -> str:
	        """
	        Extract text from a scannable PDF using OCR.

	        Args:
	            pdfPath (str): The file path to the scannable PDF.

	        Returns:
	            str: The OCR text of every page, in page order, joined by newlines.
	            If anything fails, the error is logged and ``None`` is returned
	            implicitly (best-effort behaviour; no exception reaches the caller).
	        """
	        try:
	            logger.info("Text Extraction Started from Scannable PDF")
	            pageImages = convert_from_path(pdfPath)
	            pageTexts = [self.getText(pageImage) for pageImage in pageImages]
	            return "\n".join(pageTexts)
	        except Exception as e:
	            logger.error(CustomException(e))