Spaces:

Soltane777
/

textgeneration

Sleeping

App Files Files Community

textgeneration / backend /utils.py

Soltane777

Update backend/utils.py

70cb71f verified 4 months ago

raw

history blame contribute delete

2.28 kB

	import fitz # pymupdf
	from docx import Document
	import pptx
	import os
	from typing import Optional

	def extract_text_from_pdf(file_path: str) -> Optional[str]:
	    """Extract the plain text of a PDF file using PyMuPDF.

	    Args:
	        file_path: Path of the PDF file to read.

	    Returns:
	        The text of all pages concatenated in page order with leading and
	        trailing whitespace stripped, or ``None`` when no text was found
	        or the file could not be read.
	    """
	    try:
	        # The context manager closes the document even when reading a
	        # page fails, so the underlying file handle is never leaked.
	        with fitz.open(file_path) as doc:
	            text = "".join(page.get_text() for page in doc)
	    except Exception as e:
	        # Best-effort extraction: report the problem and signal failure
	        # with None instead of propagating to the caller.
	        print(f"Error reading PDF: {e}")
	        return None
	    # Whitespace-only output is treated the same as "no text".
	    return text.strip() or None

	def extract_text_from_docx(file_path: str) -> Optional[str]:
	    """Extract the text of a Word (DOCX) file.

	    Only the entries of ``doc.paragraphs`` are read; paragraphs whose text
	    is empty or whitespace-only are skipped.

	    Args:
	        file_path: Path of the DOCX file to read.

	    Returns:
	        The non-blank paragraphs joined with newlines, or ``None`` when
	        there are none or the file could not be read.
	    """
	    try:
	        doc = Document(file_path)
	        paragraphs = [p.text for p in doc.paragraphs if p.text.strip()]
	    except Exception as e:
	        # Best-effort extraction: report the problem and signal failure
	        # with None instead of propagating to the caller.
	        print(f"Error reading DOCX: {e}")
	        return None
	    # Return None (not "") for a document without text, matching the
	    # PDF and PPTX extractors.
	    return "\n".join(paragraphs) if paragraphs else None

	def extract_text_from_pptx(file_path: str) -> Optional[str]:
	    """Extract the text of a PowerPoint (PPTX) file.

	    Visits every shape of every slide in order and collects the ``text``
	    attribute of the shapes that have one.

	    Args:
	        file_path: Path of the PPTX file to read.

	    Returns:
	        The non-blank shape texts joined with newlines, or ``None`` when
	        there are none or the file could not be read.
	    """
	    try:
	        presentation = pptx.Presentation(file_path)
	        text = []
	        for slide in presentation.slides:
	            for shape in slide.shapes:
	                # Not every shape exposes ``text``; skip those, and skip
	                # blank text so empty shapes do not add empty lines.
	                shape_text = getattr(shape, "text", None)
	                if shape_text and shape_text.strip():
	                    text.append(shape_text)
	    except Exception as e:
	        # Best-effort extraction: report the problem and signal failure
	        # with None instead of propagating to the caller.
	        print(f"Error reading PPTX: {e}")
	        return None
	    return "\n".join(text) if text else None

	def extract_text_from_document(file_path: str) -> Optional[str]:
	    """Extract text from a document, dispatching on its file extension.

	    Supported extensions (matched case-insensitively) are ``.pdf``,
	    ``.docx``, ``.pptx`` and ``.txt``.

	    Args:
	        file_path: Path of the document to read.

	    Returns:
	        The extracted text, or ``None`` when the path does not exist, the
	        extension is unsupported, the file has no text, or reading fails.
	    """
	    if not os.path.exists(file_path):
	        print(f"File not found: {file_path}")
	        return None
	    # Lower-case once so every extension check is case-insensitive.
	    lowered = file_path.lower()
	    if lowered.endswith('.pdf'):
	        return extract_text_from_pdf(file_path)
	    if lowered.endswith('.docx'):
	        return extract_text_from_docx(file_path)
	    if lowered.endswith('.pptx'):
	        return extract_text_from_pptx(file_path)
	    if lowered.endswith('.txt'):
	        try:
	            # utf-8-sig decodes plain UTF-8 identically but drops a
	            # leading byte-order mark instead of leaking it into the text.
	            with open(file_path, 'r', encoding='utf-8-sig') as f:
	                content = f.read()
	        except (OSError, UnicodeError) as e:
	            print(f"Error reading TXT: {e}")
	            return None
	        # A file with no visible text yields None rather than "".
	        return content if content.strip() else None
	    print(f"Unsupported file format: {file_path}")
	    return None