Spaces:

Rivalcoder
/

Smart-Contract-Analyzer

Sleeping

Smart-Contract-Analyzer / app /services /text_extractor.py

Rivalcoder

Add Files

4b022af 3 months ago

1.59 kB

	import fitz # PyMuPDF
	import easyocr
	from pdf2image import convert_from_path
	from typing import Optional
	import tempfile
	import os


	def extract_text_from_pdf(file_path: str) -> str:
	    """
	    Extract text from a PDF using a hybrid approach.

	    1. Read the embedded text layer with PyMuPDF (searchable PDFs).
	    2. If that yields 100 characters or fewer, or PyMuPDF raises, render
	       the pages to images with pdf2image and run EasyOCR (English and
	       Hindi) on each page, keeping fragments with confidence above 0.5.

	    Args:
	        file_path: Path to the PDF file on disk.

	    Returns:
	        The extracted text with surrounding whitespace stripped. When the
	        OCR pass recognises nothing, the short embedded text (possibly an
	        empty string) is returned instead.

	    Raises:
	        Exception: If the OCR fallback fails. The underlying error is
	            chained as ``__cause__``.
	    """
	    min_native_chars = 100  # at or below this, fall back to OCR
	    min_confidence = 0.5  # OCR fragments at or below this are dropped

	    # Step 1: try the embedded text layer.
	    native_text = ""
	    try:
	        # The context manager closes the document even when a page fails
	        # to load, which a trailing doc.close() would not.
	        with fitz.open(file_path) as doc:
	            native_text = "".join(page.get_text() for page in doc).strip()
	    except Exception as e:
	        # Best effort: a PyMuPDF failure should not prevent the OCR attempt.
	        print(f"PyMuPDF extraction failed: {e}")

	    if len(native_text) > min_native_chars:
	        return native_text

	    # Step 2: fall back to OCR for scanned PDFs.
	    try:
	        # Imported here so this function does not depend on a module-level
	        # numpy import being present in the file.
	        import numpy as np

	        # Convert PDF pages to PIL images.
	        images = convert_from_path(file_path)

	        # Initialize EasyOCR for English and Hindi.
	        reader = easyocr.Reader(['en', 'hi'])

	        fragments = []
	        for image in images:
	            # EasyOCR documents file paths/URLs, bytes and numpy arrays as
	            # inputs, so hand it an array rather than the PIL image.
	            results = reader.readtext(np.array(image))

	            # Keep only high-confidence fragments.
	            fragments.extend(
	                fragment
	                for _bbox, fragment, confidence in results
	                if confidence > min_confidence
	            )

	        ocr_text = " ".join(fragments).strip()

	    except Exception as e:
	        print(f"OCR extraction failed: {e}")
	        raise Exception(f"Failed to extract text from PDF: {e}") from e

	    # Prefer the OCR result; if OCR found nothing, a short embedded text
	    # layer is still better than returning an empty string.
	    return ocr_text or native_text