File size: 2,768 Bytes
2197ab7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
"""Markdown parsing module for fabric-to-espanso."""
from typing import Tuple, List, Optional, Set
from pathlib import Path
import regex
import logging

from .exceptions import ParsingError
from .config import config

# Module-level logger, registered under the fixed name 'fabric_to_espanso'
# (not ``__name__``), so every getLogger call with that name shares it.
logger = logging.getLogger('fabric_to_espanso')

def create_section_pattern(keywords: Set[str]) -> regex.Pattern:
    """Build a regex that captures markdown sections selected by heading keyword.

    A section starts at a line of the form ``# <heading text>`` (a single
    ``#`` followed by spaces or tabs) whose text contains any of the keywords
    as a case-insensitive substring. It extends up to, but not including,
    the next line that begins with ``#``.

    Args:
        keywords: Literal strings to look for in headings. They are escaped,
            so regex metacharacters in them carry no special meaning. Empty
            strings are ignored.

    Returns:
        A compiled pattern without capture groups, so ``findall`` yields each
        section (heading plus body) as one string. If no usable keyword is
        supplied, the returned pattern never matches.
    """
    # Drop empty keywords (an empty alternative would match every heading)
    # and sort so the same keyword set always produces the same pattern text.
    alternatives = sorted(regex.escape(kw) for kw in keywords if kw)
    if not alternatives:
        # ``(?!)`` can never succeed: no keywords means no matching sections.
        return regex.compile(r'(?!)')

    keyword_pattern = '|'.join(alternatives)
    return regex.compile(
        # ``[ \t]+`` rather than ``\s+``: ``\s`` also matches newlines, which
        # let a bare ``#`` line absorb the following line as its heading text.
        rf'^#[ \t]+.*(?:{keyword_pattern}).*$\n?(?:(?!^#).*\n?)*',
        regex.MULTILINE | regex.IGNORECASE
    )

def parse_markdown_file(
    file_path: str | Path,
    keywords: Optional[Set[str]] = None
) -> Tuple[str, Optional[str]]:
    """Read a markdown file and pull out the sections selected by keyword.

    Sections are located with the pattern built by ``create_section_pattern``
    and concatenated with a blank line between them.

    Args:
        file_path: Location of the markdown file to read (decoded as UTF-8).
        keywords: Keywords to look for in headings. When ``None`` or empty,
            ``config.base_words`` is used instead.

    Returns:
        A ``(full_content, extracted_sections)`` tuple. ``extracted_sections``
        is ``None`` when no heading matched.

    Raises:
        ParsingError: If the file cannot be read, or if any other exception
            occurs while parsing (the original exception is chained).
    """
    try:
        # Fall back to the configured defaults when no keywords were given.
        active_keywords = keywords if keywords else set(config.base_words)
        matcher = create_section_pattern(active_keywords)

        source = Path(file_path)
        try:
            text = source.read_text(encoding='utf-8')
        except Exception as read_err:
            raise ParsingError(f"Failed to read {source}: {read_err}") from read_err

        sections = matcher.findall(text)
        if sections:
            # Separate consecutive sections with a blank line.
            combined = '\n\n'.join(sections)
            logger.debug(f"Extracted {len(sections)} sections from {source.name}")
            return text, combined

        # Nothing matched: hand back the content with no extracted part.
        logger.debug(f"No matching sections found in {source.name}")
        return text, None

    except Exception as err:
        logger.error(f"Error parsing {file_path}: {err}", exc_info=True)
        # Anything that is not already a ParsingError gets wrapped in one;
        # a ParsingError raised above is propagated unchanged.
        if not isinstance(err, ParsingError):
            raise ParsingError(f"Unexpected error parsing {file_path}: {err}") from err
        raise

def main():
    """Example run: parse ``document.md`` and print the resulting tuple.

    Any exception is caught and reported on stdout instead of propagating.
    """
    try:
        # Uses the default keywords from config. A custom set can be passed
        # as the second argument, e.g.
        # parse_markdown_file('document.md', {'Identity', 'Purpose', 'Scope'})
        print(parse_markdown_file('document.md'))
    except Exception as err:
        print(f"An error occurred: {err}")

# Run the example only when this file is executed as a script, not on import.
# NOTE(review): this module uses relative imports (``from .exceptions ...``),
# so it presumably has to be launched as part of its package (``python -m``).
if __name__ == '__main__':
    main()