Spaces:
Running
on
Zero
Running
on
Zero
from abc import ABC, abstractmethod | |
from pathlib import Path | |
from typing import Dict, List, Optional, Any, Union, Set | |
import threading | |
from src.core.exceptions import ParserError, UnsupportedFileTypeError | |
class DocumentParser(ABC):
    """Base interface for all document parsers in the system.

    Concrete parsers must implement ``parse``, ``get_name`` and
    ``get_supported_ocr_methods``; a subclass that leaves any of them
    unimplemented cannot be instantiated. The remaining methods provide
    defaults that subclasses may override.
    """

    def __init__(self):
        """Initialize the parser with no cancellation flag attached."""
        self._cancellation_flag: Optional[threading.Event] = None

    def set_cancellation_flag(self, flag: Optional[threading.Event]) -> None:
        """Set the cancellation flag for this parser.

        Args:
            flag: Event polled by ``_check_cancellation``; ``None`` detaches
                any previously set flag.
        """
        self._cancellation_flag = flag

    def _check_cancellation(self) -> bool:
        """Return True when a flag is attached and it has been set."""
        flag = self._cancellation_flag
        return flag is not None and flag.is_set()

    @abstractmethod
    def parse(self, file_path: Union[str, Path], ocr_method: Optional[str] = None, **kwargs) -> str:
        """
        Parse a document and return its content.

        Args:
            file_path: Path to the document
            ocr_method: OCR method to use (if applicable)
            **kwargs: Additional parser-specific options

        Returns:
            str: The parsed content

        Raises:
            ParserError: For general parsing errors
            UnsupportedFileTypeError: For unsupported file types
        """

    # NOTE: ``@classmethod`` must be the outermost decorator when combined
    # with ``@abstractmethod``.
    @classmethod
    @abstractmethod
    def get_name(cls) -> str:
        """Return the display name of this parser."""

    @classmethod
    @abstractmethod
    def get_supported_ocr_methods(cls) -> List[Dict[str, Any]]:
        """
        Return a list of supported OCR methods.

        Returns:
            List of dictionaries with keys:
            - id: Unique identifier for the OCR method
            - name: Display name for the OCR method
            - default_params: Default parameters for this OCR method
        """

    @classmethod
    def get_description(cls) -> str:
        """Return a description of this parser, derived from its name."""
        return f"{cls.get_name()} document parser"

    @classmethod
    def get_supported_file_types(cls) -> Set[str]:
        """Return a set of supported file extensions (including the dot)."""
        return {".pdf", ".png", ".jpg", ".jpeg", ".tiff", ".bmp", ".webp"}

    @classmethod
    def is_available(cls) -> bool:
        """Check if this parser is available; the base default is True."""
        return True

    def validate_file(self, file_path: Union[str, Path]) -> None:
        """
        Validate that the file can be processed by this parser.

        Args:
            file_path: Path to the file to validate

        Raises:
            ParserError: If the path does not exist
            UnsupportedFileTypeError: If the file extension is not in
                ``get_supported_file_types()``
        """
        path = Path(file_path)
        if not path.exists():
            raise ParserError(f"File not found: {file_path}")

        # Extensions are compared case-insensitively (".PDF" matches ".pdf").
        if path.suffix.lower() not in self.get_supported_file_types():
            raise UnsupportedFileTypeError(
                f"File type '{path.suffix}' not supported by {self.get_name()}"
            )

    def get_metadata(self) -> Dict[str, Any]:
        """Return metadata about this parser instance.

        Returns:
            Dictionary with the keys ``name``, ``description``,
            ``supported_file_types`` (a sorted list), ``supported_ocr_methods``
            and ``available``.
        """
        return {
            "name": self.get_name(),
            "description": self.get_description(),
            # Sorted so the list order is deterministic; a set has no order.
            "supported_file_types": sorted(self.get_supported_file_types()),
            "supported_ocr_methods": self.get_supported_ocr_methods(),
            "available": self.is_available(),
        }