|
from PyPDF2 import PdfReader |
|
from io import BytesIO |
|
import logging |
|
|
|
logger = logging.getLogger(__name__) |
|
|
|
def parse_pdf(file_content: bytes) -> str:
    """Extract the text of every page of an in-memory PDF document.

    Args:
        file_content: Raw bytes of the PDF file.

    Returns:
        The extracted text of all pages, concatenated in page order with no
        separator inserted between pages. Pages whose extraction result is
        empty are skipped. If the document cannot be opened or any page fails
        to extract, the error is logged and an empty string is returned.
    """
    try:
        reader = PdfReader(BytesIO(file_content))

        page_texts = []
        for page in reader.pages:
            extracted = page.extract_text()
            # Skip pages whose extraction result is falsy (e.g. empty).
            if extracted:
                page_texts.append(extracted)

        # Join once at the end instead of growing a string with "+=" on every
        # page, which can degrade to quadratic time on large documents.
        return "".join(page_texts)
    except Exception as e:
        # Best-effort parser: any failure is logged and reported as "no text"
        # rather than propagated to the caller.
        logger.error(f"PDF parse error: {e}")
        return ""
|
|
|
def parse_text(file_content):
    """Decode raw file bytes into text.

    UTF-8 is attempted first. If the content is not valid UTF-8, the bytes
    are decoded as latin1 instead. Should that fallback decode raise, the
    error is logged and an empty string is returned.

    Only ``UnicodeDecodeError`` is handled for the UTF-8 attempt; any other
    exception raised by that first ``decode`` call propagates to the caller.
    """
    # First attempt: strict UTF-8.
    try:
        decoded = file_content.decode("utf-8")
    except UnicodeDecodeError:
        pass
    else:
        return decoded

    # Fallback: latin1, guarded by a catch-all that logs and yields "".
    try:
        decoded = file_content.decode("latin1")
    except Exception as e:
        logger.error(f"Text parse error: {e}")
        decoded = ""
    return decoded
|
|