Spaces:

guardianrobot
/

screencomply_documents

Sleeping

screencomply_documents / backend /utils /file_parser.py

misakovhearst

Initial deploy

48c7fed 10 days ago

5.57 kB

	import os
	import re
	from typing import Tuple, Optional
	from pathlib import Path
	from pypdf import PdfReader
	from docx import Document

	# Matches a run of one or more consecutive code points taken from the emoji
	# blocks listed below. Characters outside these ranges (for example the
	# em dash U+2014 or the arrow U+2192) never match.
	_EMOJI_RE = re.compile(
	    "["
	    "\U0001F600-\U0001F64F"  # emoticons
	    "\U0001F300-\U0001F5FF"  # miscellaneous symbols and pictographs
	    "\U0001F680-\U0001F6FF"  # transport and map symbols
	    "\U0001F1E0-\U0001F1FF"  # regional indicator letters (flag pairs)
	    "\U0001F900-\U0001F9FF"  # supplemental symbols and pictographs
	    "\U0001FA00-\U0001FA6F"  # chess symbols / extended pictographic
	    "\U0001FA70-\U0001FAFF"  # symbols and pictographs extended-A
	    "]+"
	)

	class FileParser:
	    """
	    Parse supported document files and extract their text.

	    Supports: PDF, DOCX, TXT, and raw text input.

	    Every parser returns a ``(text, format, error)`` tuple instead of
	    raising: ``error`` is ``None`` on success; on failure ``text`` is the
	    empty string and ``error`` holds the exception that stopped parsing.
	    """

	    SUPPORTED_FORMATS = {".pdf", ".docx", ".doc", ".txt"}

	    @staticmethod
	    def parse_file(file_path: str) -> Tuple[str, str, Optional[Exception]]:
	        """
	        Parse a file and extract text, choosing the parser by extension.

	        The extension is compared case-insensitively.

	        Args:
	            file_path: Path to the file

	        Returns:
	            Tuple of (text, format, error)
	            - text: Extracted text content ("" on failure)
	            - format: "pdf", "docx" or "txt"; "" when the extension is
	              not supported
	            - error: Exception if parsing failed, None if successful
	        """
	        file_extension = Path(file_path).suffix.lower()

	        if file_extension not in FileParser.SUPPORTED_FORMATS:
	            return "", "", ValueError(f"Unsupported file format: {file_extension}")

	        if file_extension == ".pdf":
	            return FileParser.parse_pdf(file_path)
	        if file_extension in {".docx", ".doc"}:
	            # ".doc" goes through the DOCX parser as well; if the library
	            # cannot read the file, the failure comes back as the error.
	            return FileParser.parse_docx(file_path)
	        if file_extension == ".txt":
	            return FileParser.parse_txt(file_path)

	        # Only reachable when SUPPORTED_FORMATS lists an extension that has
	        # no parser wired up above.
	        return "", "", ValueError("Unknown error")

	    @staticmethod
	    def parse_pdf(file_path: str) -> Tuple[str, str, Optional[Exception]]:
	        """
	        Extract text from a PDF file.

	        The text of each page is joined with a newline and the result is
	        stripped of surrounding whitespace.

	        Returns:
	            (text, "pdf", None) on success, ("", "pdf", error) on failure.
	        """
	        try:
	            with open(file_path, "rb") as pdf_file:
	                pdf_reader = PdfReader(pdf_file)
	                # `or ""` keeps one page without extractable text (a None
	                # or empty result) from aborting the whole document.
	                page_texts = [page.extract_text() or "" for page in pdf_reader.pages]
	        except Exception as e:
	            return "", "pdf", e

	        return "\n".join(page_texts).strip(), "pdf", None

	    @staticmethod
	    def parse_docx(file_path: str) -> Tuple[str, str, Optional[Exception]]:
	        """
	        Extract text from a DOCX file.

	        Paragraph text comes first, followed by the text of every table
	        cell (row by row); pieces are separated by newlines.

	        Returns:
	            (text, "docx", None) on success, ("", "docx", error) on failure.
	        """
	        try:
	            doc = Document(file_path)

	            pieces = [paragraph.text for paragraph in doc.paragraphs]
	            # Table content is not part of doc.paragraphs, so collect it too.
	            for table in doc.tables:
	                for row in table.rows:
	                    pieces.extend(cell.text for cell in row.cells)
	        except Exception as e:
	            return "", "docx", e

	        return "\n".join(pieces).strip(), "docx", None

	    @staticmethod
	    def parse_txt(file_path: str) -> Tuple[str, str, Optional[Exception]]:
	        """
	        Extract text from a plain text file.

	        The file is decoded as UTF-8 first (a leading byte-order mark is
	        dropped); if that fails with a decode error it is re-read as
	        Latin-1.

	        Returns:
	            (text, "txt", None) on success, ("", "txt", error) on failure.
	        """
	        try:
	            # "utf-8-sig" reads plain UTF-8 exactly like "utf-8" but also
	            # removes a leading BOM, which str.strip() would leave in place.
	            with open(file_path, "r", encoding="utf-8-sig") as txt_file:
	                return txt_file.read().strip(), "txt", None
	        except UnicodeDecodeError:
	            pass  # not valid UTF-8: fall through to the Latin-1 attempt
	        except Exception as e:
	            return "", "txt", e

	        try:
	            with open(file_path, "r", encoding="latin-1") as txt_file:
	                return txt_file.read().strip(), "txt", None
	        except Exception as e:
	            return "", "txt", e

	    @staticmethod
	    def parse_raw_text(text: str) -> Tuple[str, str, Optional[Exception]]:
	        """
	        Process raw text input.

	        Returns:
	            (stripped_text, "raw", None) when the stripped text is
	            non-empty; ("", "raw", ValueError) when it is empty;
	            ("", "raw", error) when stripping itself fails (for example
	            the argument is not a string).
	        """
	        try:
	            cleaned_text = text.strip()
	        except Exception as e:
	            return "", "raw", e

	        if not cleaned_text:
	            return "", "raw", ValueError("Empty text provided")
	        return cleaned_text, "raw", None

	class TextCleaner:
	    """Clean and normalize extracted text"""

	    @staticmethod
	    def clean(text: str) -> str:
	        """
	        Collapse every run of whitespace into a single space.

	        Leading and trailing whitespace is removed. Line breaks count as
	        whitespace, so the result is always a single line.
	        """
	        # str.split() with no argument splits on any whitespace (including
	        # "\r" and "\n"), so no separate line-break normalization is needed.
	        return " ".join(text.split())

	    @staticmethod
	    def get_text_stats(text: str) -> dict:
	        """
	        Get statistics about text.

	        Returns a dict with:
	        - character_count: length of ``text``
	        - word_count: number of whitespace-separated tokens
	        - sentence_count: number of non-blank fragments between "."
	        - average_word_length: mean token length in characters
	          (0 when there are no words)
	        - average_sentence_length: words per counted sentence
	          (0 when there are no sentences)
	        - emoji_count: number of code points matched by ``_EMOJI_RE``
	        - em_dash_count: occurrences of U+2014
	        - arrow_count: occurrences of U+2192
	        """
	        words = text.split()
	        # Splitting on "." leaves empty or blank fragments (after a trailing
	        # period, between "..."); those are not sentences.
	        sentence_count = sum(1 for fragment in text.split(".") if fragment.strip())

	        word_count = len(words)
	        # Measure the words themselves so the whitespace between them does
	        # not inflate the average.
	        word_chars = sum(len(word) for word in words)

	        return {
	            "character_count": len(text),
	            "word_count": word_count,
	            "sentence_count": sentence_count,
	            "average_word_length": word_chars / word_count if word_count else 0,
	            # Same denominator as sentence_count, so the two figures agree.
	            "average_sentence_length": word_count / sentence_count if sentence_count else 0,
	            # findall() yields runs of adjacent emoji; count each code point.
	            "emoji_count": sum(len(run) for run in _EMOJI_RE.findall(text)),
	            "em_dash_count": text.count("\u2014"),
	            "arrow_count": text.count("\u2192"),
	        }