File size: 4,270 Bytes
71c6bbf |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 |
import logging
import os
import platform
import re

import docx
import fitz
import pytesseract
from odf import teletype
from odf.opendocument import load as load_odt
from odf.text import P
from pdf2image import convert_from_path
from PIL import Image
# Path to the tesseract executable (on Windows it must point to tesseract.exe).
# The TESSERACT_CMD environment variable, when set to a non-empty value,
# overrides the per-platform default so that an install in a non-standard
# location does not require a code change.
if platform.system() == "Windows":
    _default_tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'
else:
    # For Hugging Face Spaces or other Linux environments
    _default_tesseract_cmd = r'/usr/bin/tesseract'
pytesseract.pytesseract.tesseract_cmd = (
    os.getenv('TESSERACT_CMD') or _default_tesseract_cmd
)
# # Set up logging
# logging.basicConfig(
# level=logging.DEBUG,
# format='%(asctime)s - %(levelname)s - %(message)s',
# handlers=[logging.StreamHandler()]
# )
# # Path to Tesseract executable
# tesseract_path = os.getenv('TESSERACT_CMD', '/usr/bin/tesseract')
# pytesseract.pytesseract.tesseract_cmd = tesseract_path
# Function to extract text from PDF using PyMuPDF
def extract_text_from_pdf(file_path):
    """Extract the text and the hyperlink targets of a PDF file.

    Pages that carry a text layer are read with PyMuPDF. A page whose text
    layer is empty or whitespace-only is rasterized at 300 dpi with
    pdf2image and run through Tesseract OCR instead.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        A ``(text, hyperlinks)`` tuple. ``text`` is the concatenation of the
        per-page text in page order; ``hyperlinks`` is the list of distinct
        link URIs in first-seen order. If anything raises, the error is
        logged and ``("", [])`` is returned.
    """
    page_texts = []
    hyperlinks = []
    try:
        # The context manager closes the document even when a page fails.
        with fitz.open(file_path) as doc:
            for page_num in range(doc.page_count):
                page = doc.load_page(page_num)
                page_text = page.get_text("text")
                if page_text.strip():
                    page_texts.append(page_text)
                else:
                    # OCR only the current page. pdf2image numbers pages
                    # from 1, PyMuPDF from 0. Without the page bounds the
                    # whole document would be rasterized and OCR'd again
                    # for every empty page, duplicating text.
                    images = convert_from_path(
                        file_path,
                        dpi=300,
                        first_page=page_num + 1,
                        last_page=page_num + 1,
                    )
                    page_texts.extend(
                        pytesseract.image_to_string(image) for image in images
                    )
                for link in page.get_links():
                    uri = link.get("uri")
                    if uri:
                        hyperlinks.append(uri)
    except Exception as e:
        logging.error(f"Error extracting text or hyperlinks from PDF: {e}")
        return "", []
    # dict.fromkeys de-duplicates while keeping a deterministic order.
    return "".join(page_texts), list(dict.fromkeys(hyperlinks))
# Function to extract text from DOCX
def extract_text_from_docx(file_path):
    """Return the paragraph text of a DOCX file, one paragraph per line.

    Args:
        file_path: Path of the DOCX file to read.

    Returns:
        The text of every paragraph in the document joined with newlines,
        or ``""`` if anything raises (the error is logged).
    """
    try:
        paragraphs = docx.Document(file_path).paragraphs
        return "\n".join(paragraph.text for paragraph in paragraphs)
    except Exception as e:
        logging.error(f"Error extracting text from DOCX: {e}")
    return ""
# Function to extract text from RSF (assuming text-based format)
def extract_text_from_rsf(file_path):
    """Read an RSF file as UTF-8 text.

    Args:
        file_path: Path of the file to read.

    Returns:
        The full file contents decoded as UTF-8, or ``""`` if the file
        cannot be opened or decoded (the error is logged).
    """
    contents = ""
    try:
        with open(file_path, mode="r", encoding="utf-8") as handle:
            contents = handle.read()
    except Exception as e:
        logging.error(f"Error extracting text from RSF: {e}")
    return contents
# Function to extract text from ODT
def extract_text_from_odt(file_path):
    """Return the paragraph text of an ODT file, one paragraph per line.

    Args:
        file_path: Path of the ODT file to read.

    Returns:
        The text of every non-empty ``text:p`` element joined with newlines,
        or ``""`` if anything raises (the error is logged).
    """
    try:
        odt_doc = load_odt(file_path)
        paragraphs = odt_doc.getElementsByType(P)
        # teletype.extractText collects the text of the whole paragraph
        # subtree. Reading only ``firstChild.data`` dropped everything after
        # the first child and raised AttributeError when that child was an
        # element (e.g. a formatted span), which blanked the whole document.
        return "\n".join(
            teletype.extractText(paragraph)
            for paragraph in paragraphs
            if paragraph.firstChild  # skip paragraphs with no content
        )
    except Exception as e:
        logging.error(f"Error extracting text from ODT: {e}")
        return ""
# Function to extract text from images using Tesseract
def extract_text_from_image(file_path):
    """Run Tesseract OCR on an image file and return the recognized text.

    Args:
        file_path: Path of the image file to read.

    Returns:
        The text returned by ``pytesseract.image_to_string``, or ``""`` if
        anything raises (the error is logged).
    """
    try:
        # Open the image as a context manager so the underlying file handle
        # is released as soon as OCR finishes, even if OCR raises.
        with Image.open(file_path) as img:
            return pytesseract.image_to_string(img)
    except Exception as e:
        logging.error(f"Error extracting text from image: {e}")
        return ""
# Patterns used by preprocess_text, compiled once at import time.
_WHITESPACE_RE = re.compile(r'\s+')
# 3 digits, 3 digits, 4 digits, each pair optionally separated by '-', '.'
# or a whitespace character, delimited by word boundaries.
_PHONE_RE = re.compile(r'(\b\d{3}[-.\s]??\d{3}[-.\s]??\d{4}\b)')


# Function to clean and preprocess the extracted text
def preprocess_text(text):
    """Normalize whitespace and set phone numbers apart from adjacent text.

    Every run of whitespace (including newlines) becomes a single space, and
    each 3-3-4 digit phone number is separated from its neighbours by one
    space on either side.

    Args:
        text: The raw extracted text. ``None`` or ``""`` yields ``""``.

    Returns:
        The cleaned text without leading or trailing whitespace.
    """
    if not text:
        return ""
    # Collapse first so that digit groups split by several whitespace
    # characters still match the single-separator phone pattern.
    text = _WHITESPACE_RE.sub(' ', text)
    text = _PHONE_RE.sub(r' \1 ', text)
    # The padding above adds a second space next to numbers that were
    # already space-delimited; collapse again to keep spacing uniform.
    text = _WHITESPACE_RE.sub(' ', text)
    return text.strip()
# Function to automatically detect file format and extract text
def extract_text_based_on_format(file_path):
    """Pick an extractor from the file extension and run it.

    The extension is compared case-insensitively. Only PDF extraction yields
    hyperlinks; every other format returns an empty hyperlink list.

    Args:
        file_path: Path of the file to extract text from.

    Returns:
        A ``(text, hyperlinks)`` tuple.

    Raises:
        ValueError: If the extension is not one of .pdf, .docx, .rsf, .odt,
            .png, .jpg or .jpeg.
    """
    suffix = os.path.splitext(file_path)[1].lower()

    if suffix not in ('.pdf', '.docx', '.rsf', '.odt', '.png', '.jpg', '.jpeg'):
        raise ValueError("Unsupported file format")

    if suffix == '.pdf':
        text, hyperlinks = extract_text_from_pdf(file_path)
        return text, hyperlinks

    # Remaining formats produce text only.
    plain_extractors = {
        '.docx': extract_text_from_docx,
        '.rsf': extract_text_from_rsf,
        '.odt': extract_text_from_odt,
        '.png': extract_text_from_image,
        '.jpg': extract_text_from_image,
        '.jpeg': extract_text_from_image,
    }
    return plain_extractors[suffix](file_path), []
|