ChatLM / parser.py
ah707
Initial deploy
e539f46
raw
history blame contribute delete
742 Bytes
from PyPDF2 import PdfReader
from io import BytesIO
import logging
logger = logging.getLogger(__name__)
def parse_pdf(file_content):
    """Extract the text of a PDF supplied as in-memory bytes.

    The text of each page is concatenated in page order with no separator;
    pages for which extraction yields nothing are skipped.

    Returns the combined text, or "" if any exception is raised while
    reading the document or extracting a page (the error is logged).
    """
    try:
        pdf = PdfReader(BytesIO(file_content))
        page_texts = (page.extract_text() for page in pdf.pages)
        # Drop empty/None results so only real text is joined.
        return "".join(chunk for chunk in page_texts if chunk)
    except Exception as e:
        logger.error(f"PDF parse error: {e}")
        return ""
def parse_text(file_content):
    """Decode the raw bytes of a text file into a string.

    UTF-8 is tried first, using the ``utf-8-sig`` codec so that a leading
    byte-order mark is dropped rather than ending up in the returned text
    as a stray U+FEFF character. Input without a BOM decodes exactly as
    with plain UTF-8. Bytes that are not valid UTF-8 are decoded as
    Latin-1 instead.

    Returns the decoded text, or "" if the Latin-1 fallback raises (the
    error is logged).
    """
    try:
        return file_content.decode("utf-8-sig")
    except UnicodeDecodeError:
        try:
            # Latin-1 maps every byte value to a character, so this is the
            # catch-all for non-UTF-8 input.
            return file_content.decode("latin1")
        except Exception as e:
            logger.error(f"Text parse error: {e}")
            return ""