Spaces:

blackopsrepl
/

yuga-planner

Paused

File size: 25,472 Bytes

3b9a6b5

"""
Markdown Analyzer Library

This library provides a comprehensive Markdown parsing and analysis system. It consists of three main components:

1. MarkdownParser: Converts Markdown text into a stream of tokens representing different structural elements
   (headers, paragraphs, lists, code blocks, etc.). It handles both block-level and inline elements.

2. InlineParser: Processes inline Markdown elements within block tokens, such as:
   - Links and images
   - Inline code
   - Emphasis (bold, italic)
   - Footnotes
   - HTML inline elements

3. MarkdownAnalyzer: The main interface that combines parsing and analysis. It:
   - Parses the input text into tokens
   - Processes inline elements within tokens
   - Provides methods to identify and analyze different Markdown elements
   - Generates statistics about the document structure

Usage:
    analyzer = MarkdownAnalyzer(markdown_text)
    analysis = analyzer.analyze()  # Get document statistics
    headers = analyzer.identify_headers()  # Get all headers
    links = analyzer.identify_links()  # Get all links
    # etc.

The library supports standard Markdown features including:
- Headers (ATX and Setext style)
- Lists (ordered, unordered, and task lists)
- Code blocks (fenced and inline)
- Blockquotes
- Tables
- Links and images
- Footnotes
- HTML blocks and inline elements
"""

import re
from collections import defaultdict


### MAIN INTERFACE ###
class MarkdownAnalyzer:
    # def __init__(self, file_path, encoding='utf-8'):
    def __init__(self, text):
        # with open(file_path, 'r', encoding=encoding) as f:
        #     self.text = f.read()
        self.text = text
        parser = MarkdownParser(self.text)
        self.tokens = parser.parse()
        self.references = parser.references
        self.footnotes = parser.footnotes
        self.inline_parser = InlineParser(
            references=self.references, footnotes=self.footnotes
        )

        self._parse_inline_tokens()

    def _parse_inline_tokens(self):
        inline_types = ("paragraph", "header", "blockquote")
        for token in self.tokens:
            if token.type in inline_types and token.content:
                inline_data = self.inline_parser.parse_inline(token.content)
                token.meta.update(inline_data)

    def identify_headers(self):
        result = defaultdict(list)
        for token in self.tokens:
            if token.type == "header":
                result["Header"].append(
                    {"line": token.line, "level": token.level, "text": token.content}
                )
        return dict(result)

    def identify_paragraphs(self):
        result = defaultdict(list)
        for token in self.tokens:
            if token.type == "paragraph":
                result["Paragraph"].append(token.content)
        return dict(result)

    def identify_blockquotes(self):
        result = defaultdict(list)
        for token in self.tokens:
            if token.type == "blockquote":
                result["Blockquote"].append(token.content)
        return dict(result)

    def identify_code_blocks(self):
        result = defaultdict(list)
        for token in self.tokens:
            if token.type == "code":
                result["Code block"].append(
                    {
                        "start_line": token.line,
                        "content": token.content,
                        "language": token.meta.get("language"),
                    }
                )
        return dict(result)

    def identify_lists(self):
        result = defaultdict(list)
        for token in self.tokens:
            if token.type == "ordered_list":
                result["Ordered list"].append(token.meta["items"])
            elif token.type == "unordered_list":
                result["Unordered list"].append(token.meta["items"])
        return dict(result)

    def identify_tables(self):
        result = defaultdict(list)
        for token in self.tokens:
            if token.type == "table":
                result["Table"].append(
                    {"header": token.meta["header"], "rows": token.meta["rows"]}
                )
        return dict(result)

    def identify_links(self):
        result = defaultdict(list)
        for token in self.tokens:
            if "text_links" in token.meta:
                for l in token.meta["text_links"]:
                    result["Text link"].append(
                        {"line": token.line, "text": l["text"], "url": l["url"]}
                    )
            if "image_links" in token.meta:
                for img in token.meta["image_links"]:
                    result["Image link"].append(
                        {
                            "line": token.line,
                            "alt_text": img["alt_text"],
                            "url": img["url"],
                        }
                    )
        return dict(result)

    def identify_footnotes(self):
        result = []
        seen = set()
        for token in self.tokens:
            if "footnotes_used" in token.meta:
                for fn in token.meta["footnotes_used"]:
                    key = (fn["id"], fn["content"])
                    if key not in seen:
                        seen.add(key)
                        result.append(
                            {
                                "line": token.line,
                                "id": fn["id"],
                                "content": fn["content"],
                            }
                        )
        return result

    def identify_inline_code(self):
        codes = []
        for token in self.tokens:
            if "inline_code" in token.meta:
                for c in token.meta["inline_code"]:
                    codes.append({"line": token.line, "code": c})
        return codes

    def identify_emphasis(self):
        ems = []
        for token in self.tokens:
            if "emphasis" in token.meta:
                for e in token.meta["emphasis"]:
                    ems.append({"line": token.line, "text": e})
        return ems

    def identify_task_items(self):
        tasks = []
        for token in self.tokens:
            if token.type in ("ordered_list", "unordered_list"):
                for it in token.meta["items"]:
                    if it.get("task_item"):
                        tasks.append(
                            {
                                "line": token.line,
                                "text": it["text"],
                                "checked": it["checked"],
                            }
                        )
        return tasks

    def identify_html_blocks(self):
        # Gets HTML blocks
        result = []
        for token in self.tokens:
            if token.type == "html_block":
                result.append({"line": token.line, "content": token.content})
        return result

    def identify_html_inline(self):
        # Gets HTML tags from inline tokens
        result = []
        inline_types = ("paragraph", "header", "blockquote")
        for token in self.tokens:
            if token.type in inline_types and "html_inline" in token.meta:
                for h in token.meta["html_inline"]:
                    result.append({"line": token.line, "html": h})
        return result

    def count_words(self):
        words = self.text.split()
        return len(words)

    def count_characters(self):
        characters = [char for char in self.text if not char.isspace()]
        return len(characters)

    def analyze(self):
        headers = self.identify_headers().get("Header", [])
        paragraphs = self.identify_paragraphs().get("Paragraph", [])
        blockquotes = self.identify_blockquotes().get("Blockquote", [])
        code_blocks = self.identify_code_blocks().get("Code block", [])
        lists = self.identify_lists()
        ordered_lists = lists.get("Ordered list", [])
        unordered_lists = lists.get("Unordered list", [])
        tables = self.identify_tables().get("Table", [])
        html_blocks = self.identify_html_blocks()
        html_inline = self.identify_html_inline()

        analysis = {
            "headers": len(headers),
            "paragraphs": len(paragraphs),
            "blockquotes": len(blockquotes),
            "code_blocks": len(code_blocks),
            "ordered_lists": sum(len(l) for l in ordered_lists),
            "unordered_lists": sum(len(l) for l in unordered_lists),
            "tables": len(tables),
            "html_blocks": len(html_blocks),
            "html_inline_count": len(html_inline),
            "words": self.count_words(),
            "characters": self.count_characters(),
        }
        return analysis


### PARSING CLASSES ###
class BlockToken:
    """Represents a block-level Markdown element with its type, content, and metadata."""

    def __init__(self, type_, content="", level=None, meta=None, line=None):
        self.type = type_  # Type of block (header, paragraph, code, etc.)
        self.content = content  # The actual content of the block
        self.level = level  # Used for headers (h1-h6) and list indentation
        self.meta = meta or {}  # Additional metadata (language for code blocks, etc.)
        self.line = line  # Line number in the original document


class InlineParser:
    # Regular expressions for matching inline Markdown elements
    IMAGE_OR_LINK_RE = re.compile(
        r"(!?\[([^\]]*)\])(\(([^\)]+)\)|\[([^\]]+)\])"
    )  # Matches [text](url) or ![alt](url)
    CODE_INLINE_RE = re.compile(r"`([^`]+)`")  # Matches `code`
    EMPHASIS_RE = re.compile(
        r"(\*\*|__)(.*?)\1|\*(.*?)\*|_(.*?)_"
    )  # Matches **bold**, *italic*, _underline_
    FOOTNOTE_RE = re.compile(r"\[\^([^\]]+)\]")  # Matches [^footnote]
    HTML_INLINE_RE = re.compile(r"<[a-zA-Z/][^>]*>")  # Matches HTML tags
    HTML_INLINE_BLOCK_RE = re.compile(
        r"<([a-zA-Z]+)([^>]*)>(.*?)</\1>", re.DOTALL
    )  # Matches HTML blocks with content

    def __init__(self, references=None, footnotes=None):
        # Initialize with optional reference links and footnotes from the document
        self.references = references or {}  # For [text][ref] style links
        self.footnotes = footnotes or {}  # For [^footnote] style references

    def parse_inline(self, text):
        """Parse inline Markdown elements within a block of text."""
        result = {
            "text_links": [],  # Regular [text](url) links
            "image_links": [],  # ![alt](url) images
            "inline_code": [],  # `code` blocks
            "emphasis": [],  # **bold**, *italic* text
            "footnotes_used": [],  # [^footnote] references
            "html_inline": [],  # HTML tags and blocks
        }

        # Process footnotes first to avoid conflicts with other patterns
        used_footnotes = set()
        for fm in self.FOOTNOTE_RE.finditer(text):
            fid = fm.group(1)
            if fid in self.footnotes and fid not in used_footnotes:
                used_footnotes.add(fid)
                result["footnotes_used"].append(
                    {"id": fid, "content": self.footnotes[fid]}
                )

        # Find inline code blocks
        for cm in self.CODE_INLINE_RE.finditer(text):
            code = cm.group(1)
            result["inline_code"].append(code)

        # Find emphasized text (bold, italic, underline)
        for em_match in self.EMPHASIS_RE.finditer(text):
            emphasized_text = (
                em_match.group(2) or em_match.group(3) or em_match.group(4)
            )
            if emphasized_text:
                result["emphasis"].append(emphasized_text)

        # Process HTML blocks first to avoid conflicts with other patterns
        temp_text = text
        for block_match in self.HTML_INLINE_BLOCK_RE.finditer(text):
            html_content = block_match.group(0)
            result["html_inline"].append(html_content)
            temp_text = temp_text.replace(html_content, "")

        # Process links and images
        for mm in self.IMAGE_OR_LINK_RE.finditer(temp_text):
            prefix = mm.group(1)  # The [text] or ![alt] part
            inner_text = mm.group(2)  # The text inside []
            url = mm.group(4)  # The (url) part
            ref_id = mm.group(5)  # The [ref] part for reference-style links

            is_image = prefix.startswith("!")
            final_url = url
            if ref_id and ref_id.lower() in self.references:
                final_url = self.references[ref_id.lower()]

            if is_image:
                if final_url:
                    result["image_links"].append(
                        {"alt_text": inner_text, "url": final_url}
                    )
            else:
                if final_url:
                    result["text_links"].append({"text": inner_text, "url": final_url})
        return result


class MarkdownParser:
    # Regular expressions for matching block-level Markdown elements
    FRONTMATTER_RE = re.compile(r"^---\s*$")  # Matches YAML frontmatter delimiters
    ATX_HEADER_RE = re.compile(r"^(#{1,6})\s+(.*)$")  # Matches # Header style
    SETEXT_H1_RE = re.compile(r"^=+\s*$")  # Matches ==== style h1
    SETEXT_H2_RE = re.compile(r"^-+\s*$")  # Matches ---- style h2
    FENCE_RE = re.compile(r"^```([^`]*)$")  # Matches code fence start
    BLOCKQUOTE_RE = re.compile(r"^(>\s?)(.*)$")  # Matches > quote style
    ORDERED_LIST_RE = re.compile(r"^\s*\d+\.\s+(.*)$")  # Matches 1. list style
    UNORDERED_LIST_RE = re.compile(r"^\s*[-+*]\s+(.*)$")  # Matches - list style
    HR_RE = re.compile(r"^(\*{3,}|-{3,}|_{3,})\s*$")  # Matches horizontal rules
    TABLE_SEPARATOR_RE = re.compile(
        r"^\|?(\s*:?-+:?\s*\|)+\s*:?-+:?\s*\|?\s*$"
    )  # Matches table separators
    REFERENCE_DEF_RE = re.compile(
        r"^\[([^\]]+)\]:\s+(.*?)\s*$", re.MULTILINE
    )  # Matches [ref]: url definitions
    FOOTNOTE_DEF_RE = re.compile(
        r"^\[\^([^\]]+)\]:\s+(.*?)\s*$", re.MULTILINE
    )  # Matches [^footnote]: content
    HTML_BLOCK_START = re.compile(
        r"^(<([a-zA-Z]+)([^>]*)>|<!--)"
    )  # Matches HTML block start
    HTML_BLOCK_END_COMMENT = re.compile(r"-->\s*$")  # Matches HTML comment end

    def __init__(self, text):
        """Initialize parser with the Markdown text to parse."""
        self.lines = text.split("\n")
        self.length = len(self.lines)
        self.pos = 0  # Current position in the text
        self.tokens = []  # List of parsed tokens
        self.text = text
        self.references = {}  # Reference-style link definitions
        self.footnotes = {}  # Footnote definitions
        self.extract_references_and_footnotes()

    def extract_references_and_footnotes(self):
        """Extract all reference-style links and footnotes from the document."""
        for m in self.REFERENCE_DEF_RE.finditer(self.text):
            rid, url = m.groups()
            self.references[rid.lower()] = url

        for m in self.FOOTNOTE_DEF_RE.finditer(self.text):
            fid, content = m.groups()
            self.footnotes[fid] = content

    def parse(self):
        """Main parsing method that processes the entire document."""
        # Check for frontmatter at the start
        if self.pos < self.length and self.FRONTMATTER_RE.match(
            self.lines[self.pos].strip()
        ):
            self.parse_frontmatter()

        # Process the document line by line
        while self.pos < self.length:
            if self.pos >= self.length:
                break
            line = self.lines[self.pos]
            if not line.strip():
                self.pos += 1
                continue

            # Check for table start
            if self.is_table_start():
                self.parse_table()
                continue

            # Check for HTML block
            if self.is_html_block_start(line):
                self.parse_html_block()
                continue

            # Check for ATX-style headers (# Header)
            m = self.ATX_HEADER_RE.match(line)
            if m:
                level = len(m.group(1))
                text = m.group(2).strip()
                self.tokens.append(
                    BlockToken("header", content=text, level=level, line=self.pos + 1)
                )
                self.pos += 1
                continue

            # Check for Setext-style headers (=== or ---)
            if self.pos + 1 < self.length:
                next_line = self.lines[self.pos + 1].strip()
                if self.SETEXT_H1_RE.match(next_line):
                    text = line.strip()
                    self.tokens.append(
                        BlockToken("header", content=text, level=1, line=self.pos + 1)
                    )
                    self.pos += 2
                    continue
                if self.SETEXT_H2_RE.match(next_line):
                    text = line.strip()
                    self.tokens.append(
                        BlockToken("header", content=text, level=2, line=self.pos + 1)
                    )
                    self.pos += 2
                    continue

            # Check for horizontal rule
            if self.HR_RE.match(line.strip()):
                self.tokens.append(BlockToken("hr", line=self.pos + 1))
                self.pos += 1
                continue

            # Check for fenced code block
            fm = self.FENCE_RE.match(line.strip())
            if fm:
                lang = fm.group(1).strip()
                self.parse_fenced_code_block(lang)
                continue

            # Check for blockquote
            bm = self.BLOCKQUOTE_RE.match(line)
            if bm:
                self.parse_blockquote()
                continue

            # Check for lists
            om = self.ORDERED_LIST_RE.match(line)
            um = self.UNORDERED_LIST_RE.match(line)
            if om or um:
                self.parse_list(ordered=bool(om))
                continue

            # If no other block type matches, treat as paragraph
            self.parse_paragraph()

        return self.tokens

    def is_html_block_start(self, line):
        # Verify if line seems to be HTML
        return self.HTML_BLOCK_START.match(line.strip()) is not None

    def parse_html_block(self):
        start = self.pos
        lines = []
        first_line = self.lines[self.pos].strip()
        comment_mode = first_line.startswith("<!--")

        # Read HTML block until empty line / eof
        while self.pos < self.length:
            line = self.lines[self.pos]
            lines.append(line)
            self.pos += 1

            if comment_mode and self.HTML_BLOCK_END_COMMENT.search(line):
                break
            else:
                # If next line is empty or doesn't exist, stop
                if self.pos < self.length:
                    nxt_line = self.lines[self.pos]
                    if not nxt_line.strip():
                        # End of block
                        break
                else:
                    # End of file
                    break

        content = "\n".join(lines)
        self.tokens.append(BlockToken("html_block", content=content, line=start + 1))

    def starts_new_block_peek(self):
        # Check next line without advancing
        if self.pos < self.length:
            nxt = self.lines[self.pos].strip()
            return self.starts_new_block(nxt)
        return False

    def is_table_start(self):
        if self.pos + 1 < self.length:
            line = self.lines[self.pos].strip()
            next_line = self.lines[self.pos + 1].strip()
            if (
                "|" in line
                and "|" in next_line
                and self.TABLE_SEPARATOR_RE.match(next_line)
            ):
                return True
        return False

    def parse_table(self):
        start = self.pos
        header_line = self.lines[self.pos].strip()
        separator_line = self.lines[self.pos + 1].strip()
        self.pos += 2
        rows = []
        while self.pos < self.length:
            line = self.lines[self.pos].strip()
            if not line or self.starts_new_block(line):
                break
            rows.append(line)
            self.pos += 1

        header_cells = [
            h.strip() for h in header_line.strip("|").split("|") if h.strip()
        ]
        data_rows = []
        for r in rows:
            data_rows.append([c.strip() for c in r.strip("|").split("|") if c.strip()])

        self.tokens.append(
            BlockToken(
                "table",
                meta={"header": header_cells, "rows": data_rows},
                line=start + 1,
            )
        )

    def starts_new_block(self, line):
        return (
            self.ATX_HEADER_RE.match(line)
            or self.FRONTMATTER_RE.match(line)
            or self.FENCE_RE.match(line)
            or self.BLOCKQUOTE_RE.match(line)
            or self.ORDERED_LIST_RE.match(line)
            or self.UNORDERED_LIST_RE.match(line)
            or self.HR_RE.match(line)
            or self.SETEXT_H1_RE.match(line)
            or self.SETEXT_H2_RE.match(line)
            or self.HTML_BLOCK_START.match(line)
        )

    def parse_frontmatter(self):
        self.pos += 1
        start = self.pos
        while self.pos < self.length:
            if self.FRONTMATTER_RE.match(self.lines[self.pos].strip()):
                content = "\n".join(self.lines[start : self.pos])
                self.tokens.append(BlockToken("frontmatter", content=content))
                self.pos += 1
                return
            self.pos += 1
        content = "\n".join(self.lines[start:])
        self.tokens.append(BlockToken("frontmatter", content=content))
        self.pos = self.length

    def parse_fenced_code_block(self, lang):
        self.pos += 1
        start = self.pos
        while self.pos < self.length:
            line = self.lines[self.pos]
            if line.strip().startswith("```"):
                content = "\n".join(self.lines[start : self.pos])
                self.tokens.append(
                    BlockToken(
                        "code", content=content, meta={"language": lang}, line=start + 1
                    )
                )
                self.pos += 1
                return
            self.pos += 1
        content = "\n".join(self.lines[start:])
        self.tokens.append(
            BlockToken("code", content=content, meta={"language": lang}, line=start + 1)
        )
        self.pos = self.length

    def parse_blockquote(self):
        start = self.pos
        lines = []
        while self.pos < self.length:
            line = self.lines[self.pos]
            bm = self.BLOCKQUOTE_RE.match(line)
            if bm:
                lines.append(bm.group(2))
                self.pos += 1
            else:
                break
        content = "\n".join(lines)
        self.tokens.append(BlockToken("blockquote", content=content, line=start + 1))

    def parse_list(self, ordered):
        start = self.pos
        items = []
        current_item = []
        list_pattern = self.ORDERED_LIST_RE if ordered else self.UNORDERED_LIST_RE

        while self.pos < self.length:
            line = self.lines[self.pos]
            if not line.strip():
                if current_item:
                    items.append("\n".join(current_item).strip())
                    current_item = []
                self.pos += 1
                continue

            if self.starts_new_block(line.strip()) and not (
                self.ORDERED_LIST_RE.match(line.strip())
                or self.UNORDERED_LIST_RE.match(line.strip())
            ):
                break

            lm = list_pattern.match(line)
            if lm:
                if current_item:
                    items.append("\n".join(current_item).strip())
                    current_item = []
                current_item.append(lm.group(1))
                self.pos += 1
            else:
                current_item.append(line.strip())
                self.pos += 1

        if current_item:
            items.append("\n".join(current_item).strip())

        task_re = re.compile(r"^\[( |x)\]\s+(.*)$")
        final_items = []
        for it in items:
            lines = it.split("\n")
            first_line = lines[0].strip()
            m = task_re.match(first_line)
            if m:
                state = m.group(1)
                text = m.group(2)
                task_checked = state == "x"
                final_items.append(
                    {"text": text, "task_item": True, "checked": task_checked}
                )
            else:
                final_items.append({"text": it, "task_item": False})

        list_type = "ordered_list" if ordered else "unordered_list"
        self.tokens.append(
            BlockToken(list_type, meta={"items": final_items}, line=start + 1)
        )

    def parse_paragraph(self):
        start = self.pos
        lines = []
        while self.pos < self.length:
            line = self.lines[self.pos]
            if not line.strip():
                self.pos += 1
                break
            if self.starts_new_block(line.strip()):
                break
            lines.append(line)
            self.pos += 1

        content = "\n".join(lines).strip()
        if content:
            self.tokens.append(BlockToken("paragraph", content=content, line=start + 1))