Spaces:

saherPervaiz
/

cv

Sleeping

cv / text_extractor.py

Update text_extractor.py

7cc953c verified 7 months ago

631 Bytes

	# text_extractor.py

	import os
	import docx2txt
	import PyPDF2

	def extract_text_from_file(file_path) -> str:
	    """Return the plain text of a ``.pdf`` or ``.docx`` file.

	    The format is chosen from the file extension, compared
	    case-insensitively.

	    Args:
	        file_path: Path of the file to read.

	    Returns:
	        The extracted text on success. For a PDF this is the text of
	        every page joined with a single space; a page for which
	        ``extract_text()`` yields nothing contributes an empty string.
	        On failure one of these placeholder strings is returned instead
	        of raising:

	        * ``"[Error extracting PDF text]"``
	        * ``"[Error extracting DOCX text]"``
	        * ``"[Unsupported file type]"`` for any other extension
	    """
	    ext = os.path.splitext(file_path)[1].lower()

	    if ext == ".pdf":
	        try:
	            with open(file_path, "rb") as f:
	                reader = PyPDF2.PdfReader(f)
	                return " ".join(
	                    page.extract_text() or "" for page in reader.pages
	                )
	        # `Exception`, not a bare `except`: extraction stays best-effort,
	        # but KeyboardInterrupt / SystemExit must still propagate.
	        except Exception:
	            return "[Error extracting PDF text]"

	    if ext == ".docx":
	        try:
	            return docx2txt.process(file_path)
	        except Exception:
	            return "[Error extracting DOCX text]"

	    return "[Unsupported file type]"