final_year/preprocessing/text_extractor.py
jayasrees's picture
first commit
9d21edd
import pdfplumber
import docx
import io
def extract_text_from_file(file_obj, file_type):
    """
    Extracts text from various file formats with page/location tracking.

    Args:
        file_obj: A binary file-like object (anything exposing ``read()``),
            or the raw ``bytes``/``bytearray`` content, which is wrapped in
            ``io.BytesIO`` before parsing.
        file_type: 'pdf', 'docx', or 'txt'. Matching ignores case,
            surrounding whitespace and a leading dot (e.g. '.PDF').

    Returns:
        List[Dict]: List of {'text': str, 'page': int}.
            - pdf:  one entry per page that yields text; 'page' is 1-based.
            - docx: a single entry (page 1) holding every paragraph, each
              followed by a newline.
            - txt:  a single entry (page 1) holding the decoded content.
        An empty list is returned when ``file_type`` is unsupported or when
        extraction fails; the problem is printed and never raised.
    """
    # Every branch below consumes a stream, so raw bytes are wrapped first.
    if isinstance(file_obj, (bytes, bytearray)):
        file_obj = io.BytesIO(file_obj)

    # Normalise the type tag so 'PDF', ' pdf ' and '.pdf' all select the
    # same branch; non-string values are left alone and fall through to the
    # "unsupported" case.
    if isinstance(file_type, str):
        kind = file_type.strip().lower().lstrip(".")
    else:
        kind = file_type

    extracted_data = []
    try:
        if kind == "pdf":
            with pdfplumber.open(file_obj) as pdf:
                for page_number, page in enumerate(pdf.pages, start=1):
                    page_text = page.extract_text()
                    # Pages with no extractable text are skipped.
                    if page_text:
                        extracted_data.append({
                            "text": page_text,
                            "page": page_number,
                        })
        elif kind == "docx":
            doc = docx.Document(file_obj)
            # DOCX has no fixed pagination, so the whole document is
            # returned as one continuous flow tagged as page 1.
            full_text = "".join(f"{para.text}\n" for para in doc.paragraphs)
            extracted_data.append({
                "text": full_text,
                "page": 1,
            })
        elif kind == "txt":
            raw = file_obj.read()
            # Text-mode streams already return str. Binary content is
            # assumed to be UTF-8; 'utf-8-sig' also drops a leading BOM so
            # it does not leak into the extracted text.
            text = raw if isinstance(raw, str) else raw.decode("utf-8-sig")
            extracted_data.append({
                "text": text,
                "page": 1,
            })
        else:
            print(f"Unsupported file type: {file_type!r}")
    except Exception as e:
        # Best-effort boundary: report the failure and return no data
        # rather than propagating parser errors to the caller.
        print(f"Error extracting text: {e}")
        return []
    return extracted_data