Spaces:

JAYASREESS
/

final_year

Running

first commit

9d21edd about 1 month ago

1.85 kB


	import pdfplumber
	import docx
	import io

	def extract_text_from_file(file_obj, file_type):
	    """Extract text from an uploaded PDF, DOCX, or TXT file with page tracking.

	    Args:
	        file_obj: The uploaded content, either raw ``bytes``/``bytearray`` or a
	            file-like object exposing ``read()``.
	        file_type: One of ``'pdf'``, ``'docx'``, or ``'txt'``.

	    Returns:
	        A list of ``{'text': str, 'page': int}`` dicts. PDF input yields one
	        entry per page that has extractable text (pages are numbered from 1);
	        DOCX and TXT input yield a single entry with ``page`` set to 1.
	        An empty list is returned when ``file_type`` is not recognised or when
	        extraction fails for any reason (the error is printed, not raised).
	    """
	    # The readers below need a seekable stream; accept the raw upload payload
	    # too by wrapping it in an in-memory buffer.
	    if isinstance(file_obj, (bytes, bytearray)):
	        file_obj = io.BytesIO(file_obj)

	    extracted_data = []
	    try:
	        if file_type == "pdf":
	            with pdfplumber.open(file_obj) as pdf:
	                for page_number, page in enumerate(pdf.pages, start=1):
	                    page_text = page.extract_text()
	                    # Pages without a text layer (e.g. scanned images)
	                    # produce no text and are skipped.
	                    if page_text:
	                        extracted_data.append({
	                            "text": page_text,
	                            "page": page_number,
	                        })

	        elif file_type == "docx":
	            doc = docx.Document(file_obj)
	            # Only the paragraph stream is read here, with no page boundaries,
	            # so the whole document is reported as a single page 1.
	            # Every paragraph is newline-terminated.
	            full_text = "".join(f"{para.text}\n" for para in doc.paragraphs)
	            extracted_data.append({
	                "text": full_text,
	                "page": 1,
	            })

	        elif file_type == "txt":
	            raw = file_obj.read()
	            # A text-mode handle already returns str; binary content is
	            # decoded as UTF-8, and "utf-8-sig" also drops a leading BOM so
	            # it does not leak into the extracted text.
	            text = raw if isinstance(raw, str) else bytes(raw).decode("utf-8-sig")
	            extracted_data.append({
	                "text": text,
	                "page": 1,
	            })

	    except Exception as e:
	        # Deliberate best-effort contract: report the problem and hand back an
	        # empty result instead of propagating parser/decoder errors to callers.
	        print(f"Error extracting text: {e}")
	        return []

	    return extracted_data