Markit_v2 / src /parsers /parser_interface.py
AnseMin's picture
Refactor and enhance application structure for Markit_v2
a773878
from abc import ABC, abstractmethod
from pathlib import Path
from typing import Dict, List, Optional, Any, Union, Set
import threading
from src.core.exceptions import ParserError, UnsupportedFileTypeError
class DocumentParser(ABC):
"""Base interface for all document parsers in the system."""
def __init__(self):
"""Initialize the parser."""
self._cancellation_flag: Optional[threading.Event] = None
def set_cancellation_flag(self, flag: Optional[threading.Event]) -> None:
"""Set the cancellation flag for this parser."""
self._cancellation_flag = flag
def _check_cancellation(self) -> bool:
"""Check if cancellation has been requested."""
return self._cancellation_flag is not None and self._cancellation_flag.is_set()
@abstractmethod
def parse(self, file_path: Union[str, Path], ocr_method: Optional[str] = None, **kwargs) -> str:
"""
Parse a document and return its content.
Args:
file_path: Path to the document
ocr_method: OCR method to use (if applicable)
**kwargs: Additional parser-specific options
Returns:
str: The parsed content
Raises:
ParserError: For general parsing errors
UnsupportedFileTypeError: For unsupported file types
"""
pass
@classmethod
@abstractmethod
def get_name(cls) -> str:
"""Return the display name of this parser"""
pass
@classmethod
@abstractmethod
def get_supported_ocr_methods(cls) -> List[Dict[str, Any]]:
"""
Return a list of supported OCR methods.
Returns:
List of dictionaries with keys:
- id: Unique identifier for the OCR method
- name: Display name for the OCR method
- default_params: Default parameters for this OCR method
"""
pass
@classmethod
def get_description(cls) -> str:
"""Return a description of this parser"""
return f"{cls.get_name()} document parser"
@classmethod
def get_supported_file_types(cls) -> Set[str]:
"""Return a set of supported file extensions (including the dot)."""
return {".pdf", ".png", ".jpg", ".jpeg", ".tiff", ".bmp", ".webp"}
@classmethod
def is_available(cls) -> bool:
"""Check if this parser is available with current configuration."""
return True
def validate_file(self, file_path: Union[str, Path]) -> None:
"""
Validate that the file can be processed by this parser.
Args:
file_path: Path to the file to validate
Raises:
UnsupportedFileTypeError: If file type is not supported
ParserError: For other validation errors
"""
path = Path(file_path)
if not path.exists():
raise ParserError(f"File not found: {file_path}")
if path.suffix.lower() not in self.get_supported_file_types():
raise UnsupportedFileTypeError(
f"File type '{path.suffix}' not supported by {self.get_name()}"
)
def get_metadata(self) -> Dict[str, Any]:
"""Return metadata about this parser instance."""
return {
"name": self.get_name(),
"description": self.get_description(),
"supported_file_types": list(self.get_supported_file_types()),
"supported_ocr_methods": self.get_supported_ocr_methods(),
"available": self.is_available()
}