cv / text_extractor.py
saherPervaiz's picture
Update text_extractor.py
7cc953c verified
raw
history blame contribute delete
631 Bytes
# text_extractor.py
import logging
import os

import docx2txt
import PyPDF2
def extract_text_from_file(file_path):
    """Extract plain text from a PDF or DOCX file.

    The file type is chosen from the extension of *file_path*, compared
    case-insensitively.

    Args:
        file_path: Path of the file to read.

    Returns:
        The extracted text as a string. For a PDF, the text of every page
        is joined with a single space; a page that yields no text
        contributes an empty string. This function is best-effort and does
        not raise on extraction failure: it returns
        "[Error extracting PDF text]" or "[Error extracting DOCX text]"
        when reading fails, and "[Unsupported file type]" for any other
        extension.
    """
    logger = logging.getLogger(__name__)
    ext = os.path.splitext(file_path)[1].lower()

    if ext == ".pdf":
        try:
            # Pages are read from the open handle, so the text must be
            # collected before the `with` block closes the file.
            with open(file_path, "rb") as f:
                reader = PyPDF2.PdfReader(f)
                return " ".join(
                    page.extract_text() or "" for page in reader.pages
                )
        except Exception:
            # `Exception` rather than a bare `except:` so KeyboardInterrupt
            # and SystemExit still propagate; log so failures are diagnosable.
            logger.exception("Failed to extract text from PDF %r", file_path)
            return "[Error extracting PDF text]"

    if ext == ".docx":
        try:
            return docx2txt.process(file_path)
        except Exception:
            logger.exception("Failed to extract text from DOCX %r", file_path)
            return "[Error extracting DOCX text]"

    return "[Unsupported file type]"