# yuga-planner / src / utils / markdown_analyzer.py
# blackopsrepl's picture
# chore: reboot project versioning
# 3b9a6b5
"""
Markdown Analyzer Library
This library provides a comprehensive Markdown parsing and analysis system. It consists of three main components:
1. MarkdownParser: Converts Markdown text into a stream of tokens representing different structural elements
(headers, paragraphs, lists, code blocks, etc.). It handles both block-level and inline elements.
2. InlineParser: Processes inline Markdown elements within block tokens, such as:
- Links and images
- Inline code
- Emphasis (bold, italic)
- Footnotes
- HTML inline elements
3. MarkdownAnalyzer: The main interface that combines parsing and analysis. It:
- Parses the input text into tokens
- Processes inline elements within tokens
- Provides methods to identify and analyze different Markdown elements
- Generates statistics about the document structure
Usage:
analyzer = MarkdownAnalyzer(markdown_text)
analysis = analyzer.analyze() # Get document statistics
headers = analyzer.identify_headers() # Get all headers
links = analyzer.identify_links() # Get all links
# etc.
The library supports standard Markdown features including:
- Headers (ATX and Setext style)
- Lists (ordered, unordered, and task lists)
- Code blocks (fenced and inline)
- Blockquotes
- Tables
- Links and images
- Footnotes
- HTML blocks and inline elements
"""
import re
from collections import defaultdict
### MAIN INTERFACE ###
class MarkdownAnalyzer:
    """Parse a Markdown string once and answer structural queries about it.

    On construction the text is tokenized by ``MarkdownParser`` and the
    content of paragraph, header and blockquote tokens is scanned by
    ``InlineParser``; the inline findings are merged into each token's
    ``meta`` dict. List items and table cells are *not* scanned for inline
    elements, so the ``identify_links``-style queries only see the three
    block types above.
    """

    # Block types whose ``content`` is handed to the inline parser.
    _INLINE_BLOCK_TYPES = ("paragraph", "header", "blockquote")

    def __init__(self, text):
        """Tokenize *text* (a Markdown string) and annotate inline elements."""
        self.text = text
        block_parser = MarkdownParser(self.text)
        self.tokens = block_parser.parse()
        self.references = block_parser.references
        self.footnotes = block_parser.footnotes
        self.inline_parser = InlineParser(
            references=self.references, footnotes=self.footnotes
        )
        self._parse_inline_tokens()

    def _parse_inline_tokens(self):
        """Merge inline-parse results into ``meta`` of every text-bearing token."""
        for block in self.tokens:
            if block.type not in self._INLINE_BLOCK_TYPES or not block.content:
                continue
            block.meta.update(self.inline_parser.parse_inline(block.content))

    def identify_headers(self):
        """Return ``{"Header": [{line, level, text}, ...]}``, or ``{}`` if none."""
        found = [
            {"line": block.line, "level": block.level, "text": block.content}
            for block in self.tokens
            if block.type == "header"
        ]
        return {"Header": found} if found else {}

    def identify_paragraphs(self):
        """Return ``{"Paragraph": [content, ...]}``, or ``{}`` if none."""
        found = [block.content for block in self.tokens if block.type == "paragraph"]
        return {"Paragraph": found} if found else {}

    def identify_blockquotes(self):
        """Return ``{"Blockquote": [content, ...]}``, or ``{}`` if none."""
        found = [block.content for block in self.tokens if block.type == "blockquote"]
        return {"Blockquote": found} if found else {}

    def identify_code_blocks(self):
        """Return ``{"Code block": [{start_line, content, language}, ...]}`` or ``{}``."""
        found = [
            {
                "start_line": block.line,
                "content": block.content,
                "language": block.meta.get("language"),
            }
            for block in self.tokens
            if block.type == "code"
        ]
        return {"Code block": found} if found else {}

    def identify_lists(self):
        """Group list tokens by kind.

        Returns a dict with up to two keys, ``"Ordered list"`` and
        ``"Unordered list"``; each value holds one entry per list, that entry
        being the list's ``items``. Keys appear in first-seen order.
        """
        labels = {"ordered_list": "Ordered list", "unordered_list": "Unordered list"}
        grouped = {}
        for block in self.tokens:
            label = labels.get(block.type)
            if label is not None:
                grouped.setdefault(label, []).append(block.meta["items"])
        return grouped

    def identify_tables(self):
        """Return ``{"Table": [{header, rows}, ...]}``, or ``{}`` if none."""
        found = [
            {"header": block.meta["header"], "rows": block.meta["rows"]}
            for block in self.tokens
            if block.type == "table"
        ]
        return {"Table": found} if found else {}

    def identify_links(self):
        """Collect links recorded in token metadata.

        Returns a dict with up to two keys: ``"Text link"`` (entries of
        ``{line, text, url}``) and ``"Image link"`` (entries of
        ``{line, alt_text, url}``). ``line`` is the line of the owning token.
        """
        grouped = {}
        for block in self.tokens:
            for link in block.meta.get("text_links", ()):
                grouped.setdefault("Text link", []).append(
                    {"line": block.line, "text": link["text"], "url": link["url"]}
                )
            for image in block.meta.get("image_links", ()):
                grouped.setdefault("Image link", []).append(
                    {
                        "line": block.line,
                        "alt_text": image["alt_text"],
                        "url": image["url"],
                    }
                )
        return grouped

    def identify_footnotes(self):
        """Return each distinct ``(id, content)`` footnote once, at its first use."""
        unique = []
        seen = set()
        for block in self.tokens:
            for note in block.meta.get("footnotes_used", ()):
                key = (note["id"], note["content"])
                if key in seen:
                    continue
                seen.add(key)
                unique.append(
                    {"line": block.line, "id": note["id"], "content": note["content"]}
                )
        return unique

    def identify_inline_code(self):
        """Return ``[{line, code}, ...]`` for every recorded inline code span."""
        return [
            {"line": block.line, "code": snippet}
            for block in self.tokens
            for snippet in block.meta.get("inline_code", ())
        ]

    def identify_emphasis(self):
        """Return ``[{line, text}, ...]`` for every recorded emphasized span."""
        return [
            {"line": block.line, "text": span}
            for block in self.tokens
            for span in block.meta.get("emphasis", ())
        ]

    def identify_task_items(self):
        """Return ``[{line, text, checked}, ...]`` for task-list items.

        ``line`` is the first line of the list that contains the item.
        """
        return [
            {"line": block.line, "text": item["text"], "checked": item["checked"]}
            for block in self.tokens
            if block.type in ("ordered_list", "unordered_list")
            for item in block.meta["items"]
            if item.get("task_item")
        ]

    def identify_html_blocks(self):
        """Return ``[{line, content}, ...]`` for block-level HTML tokens."""
        return [
            {"line": block.line, "content": block.content}
            for block in self.tokens
            if block.type == "html_block"
        ]

    def identify_html_inline(self):
        """Return ``[{line, html}, ...]`` for inline HTML found in text blocks."""
        return [
            {"line": block.line, "html": fragment}
            for block in self.tokens
            if block.type in self._INLINE_BLOCK_TYPES and "html_inline" in block.meta
            for fragment in block.meta["html_inline"]
        ]

    def count_words(self):
        """Count whitespace-separated words in the raw text (markup included)."""
        return len(self.text.split())

    def count_characters(self):
        """Count the non-whitespace characters of the raw text."""
        return sum(1 for ch in self.text if not ch.isspace())

    def analyze(self):
        """Return a dict of document statistics.

        Note that ``ordered_lists`` and ``unordered_lists`` are the total
        number of *items* across lists of that kind, not the number of lists.
        """
        list_groups = self.identify_lists()
        return {
            "headers": len(self.identify_headers().get("Header", [])),
            "paragraphs": len(self.identify_paragraphs().get("Paragraph", [])),
            "blockquotes": len(self.identify_blockquotes().get("Blockquote", [])),
            "code_blocks": len(self.identify_code_blocks().get("Code block", [])),
            "ordered_lists": sum(
                len(items) for items in list_groups.get("Ordered list", [])
            ),
            "unordered_lists": sum(
                len(items) for items in list_groups.get("Unordered list", [])
            ),
            "tables": len(self.identify_tables().get("Table", [])),
            "html_blocks": len(self.identify_html_blocks()),
            "html_inline_count": len(self.identify_html_inline()),
            "words": self.count_words(),
            "characters": self.count_characters(),
        }
### PARSING CLASSES ###
class BlockToken:
    """One block-level Markdown element: its kind, text, and extra metadata.

    Attributes:
        type: Kind of block, e.g. ``"header"``, ``"paragraph"``, ``"code"``.
        content: Text of the block (empty string when the block has none).
        level: Header level; ``None`` when not supplied.
        meta: Dict of additional data (e.g. ``language`` for code blocks).
        line: 1-based line number in the source document, or ``None``.
    """

    def __init__(self, type_, content="", level=None, meta=None, line=None):
        self.type, self.content, self.level = type_, content, level
        # A falsy ``meta`` (None or an empty mapping) is replaced by a fresh
        # dict, so tokens never share a default metadata object.
        self.meta = meta if meta else {}
        self.line = line
class InlineParser:
    """Regex-based extractor for inline Markdown constructs in a block of text.

    The parser is heuristic: each pattern is applied to the raw text
    independently, so constructs nested inside inline code or URLs may still
    be reported (e.g. underscores in a URL can register as emphasis).
    """

    # [text](url), ![alt](url), [text][ref] or ![alt][ref]
    IMAGE_OR_LINK_RE = re.compile(r"(!?\[([^\]]*)\])(\(([^\)]+)\)|\[([^\]]+)\])")
    # `code`
    CODE_INLINE_RE = re.compile(r"`([^`]+)`")
    # **bold** / __bold__, *italic*, _italic_
    EMPHASIS_RE = re.compile(r"(\*\*|__)(.*?)\1|\*(.*?)\*|_(.*?)_")
    # [^footnote]
    FOOTNOTE_RE = re.compile(r"\[\^([^\]]+)\]")
    # A single opening, closing or self-closing HTML tag. The tag name must be
    # followed by whitespace, "/" or ">" so that autolinks such as
    # <https://example.com> or <user@example.com> are not mistaken for tags.
    HTML_INLINE_RE = re.compile(r"</?[a-zA-Z][a-zA-Z0-9-]*(?=[\s/>])[^>]*>")
    # A paired element <tag ...>content</tag>. The lookahead pins the end of
    # the tag name; without it "<br>" could backtrack into tag "b" with
    # attributes "r" and pair up with an unrelated "</b>" later in the text.
    HTML_INLINE_BLOCK_RE = re.compile(
        r"<([a-zA-Z][a-zA-Z0-9-]*)(?=[\s/>])([^>]*)>(.*?)</\1>", re.DOTALL
    )

    def __init__(self, references=None, footnotes=None):
        """Store the document-level definitions used to resolve inline refs.

        Args:
            references: Mapping of lower-cased reference id -> URL, used for
                ``[text][ref]`` links. ``None`` means no references.
            footnotes: Mapping of footnote id -> footnote text, used for
                ``[^id]`` markers. ``None`` means no footnotes.
        """
        self.references = references or {}
        self.footnotes = footnotes or {}

    def parse_inline(self, text):
        """Scan *text* and return the inline elements found in it.

        Returns:
            A dict with these keys, each holding a list in order of discovery:

            * ``text_links``: ``{"text", "url"}`` dicts for links.
            * ``image_links``: ``{"alt_text", "url"}`` dicts for images.
            * ``inline_code``: contents of backtick code spans.
            * ``emphasis``: non-empty emphasized texts.
            * ``footnotes_used``: ``{"id", "content"}`` for each distinct
              footnote marker whose id has a known definition.
            * ``html_inline``: paired HTML elements first, followed by the
              standalone tags found outside those elements.

            Reference-style links whose id is unknown are omitted because no
            URL can be resolved for them.
        """
        result = {
            "text_links": [],
            "image_links": [],
            "inline_code": [],
            "emphasis": [],
            "footnotes_used": [],
            "html_inline": [],
        }

        # Footnote markers: report each defined id once per text.
        reported_ids = set()
        for marker in self.FOOTNOTE_RE.finditer(text):
            note_id = marker.group(1)
            if note_id in self.footnotes and note_id not in reported_ids:
                reported_ids.add(note_id)
                result["footnotes_used"].append(
                    {"id": note_id, "content": self.footnotes[note_id]}
                )

        # Inline code spans.
        result["inline_code"].extend(
            span.group(1) for span in self.CODE_INLINE_RE.finditer(text)
        )

        # Emphasis: exactly one of groups 2-4 carries the text, depending on
        # which alternative matched; empty matches (e.g. "****") are skipped.
        for span in self.EMPHASIS_RE.finditer(text):
            emphasized = span.group(2) or span.group(3) or span.group(4)
            if emphasized:
                result["emphasis"].append(emphasized)

        # Paired HTML elements. They are cut out of the working copy so that
        # Markdown-looking text inside them is not reported as a link below.
        remaining = text
        for element in self.HTML_INLINE_BLOCK_RE.finditer(text):
            html = element.group(0)
            result["html_inline"].append(html)
            remaining = remaining.replace(html, "")

        # Standalone tags (<br>, <img .../>, stray closing tags) that were not
        # part of a paired element.
        result["html_inline"].extend(
            tag.group(0) for tag in self.HTML_INLINE_RE.finditer(remaining)
        )

        # Links and images, inline or reference style.
        for link in self.IMAGE_OR_LINK_RE.finditer(remaining):
            label = link.group(1)  # "[text]" or "![alt]"
            inner_text = link.group(2)
            target = link.group(4)  # inline "(url)" form, else None
            ref_id = link.group(5)  # "[ref]" form, else None
            if ref_id and ref_id.lower() in self.references:
                target = self.references[ref_id.lower()]
            if not target:
                continue
            if label.startswith("!"):
                result["image_links"].append({"alt_text": inner_text, "url": target})
            else:
                result["text_links"].append({"text": inner_text, "url": target})

        return result
class MarkdownParser:
    """Line-oriented block parser that turns Markdown text into ``BlockToken`` s.

    Construction splits the text into lines and collects reference-link and
    footnote definitions. :meth:`parse` then walks the lines once, dispatching
    each non-blank line to the first block type that claims it; anything
    unclaimed becomes a paragraph. Every dispatch path consumes at least one
    line, so parsing always terminates.

    Definition lines (``[ref]: url`` / ``[^id]: text``) are recorded in
    ``references`` / ``footnotes`` but are not removed from the line stream;
    they still go through normal block parsing.
    """

    FRONTMATTER_RE = re.compile(r"^---\s*$")  # YAML frontmatter delimiter
    ATX_HEADER_RE = re.compile(r"^(#{1,6})\s+(.*)$")  # "# Header"
    SETEXT_H1_RE = re.compile(r"^=+\s*$")  # "====" underline
    SETEXT_H2_RE = re.compile(r"^-+\s*$")  # "----" underline
    FENCE_RE = re.compile(r"^```([^`]*)$")  # opening code fence (+ language)
    BLOCKQUOTE_RE = re.compile(r"^(>\s?)(.*)$")  # "> quote"
    ORDERED_LIST_RE = re.compile(r"^\s*\d+\.\s+(.*)$")  # "1. item"
    UNORDERED_LIST_RE = re.compile(r"^\s*[-+*]\s+(.*)$")  # "- item"
    HR_RE = re.compile(r"^(\*{3,}|-{3,}|_{3,})\s*$")  # horizontal rule
    TABLE_SEPARATOR_RE = re.compile(
        r"^\|?(\s*:?-+:?\s*\|)+\s*:?-+:?\s*\|?\s*$"
    )  # "| --- | :-: |" row under a table header
    REFERENCE_DEF_RE = re.compile(
        r"^\[([^\]]+)\]:\s+(.*?)\s*$", re.MULTILINE
    )  # "[ref]: url"
    FOOTNOTE_DEF_RE = re.compile(
        r"^\[\^([^\]]+)\]:\s+(.*?)\s*$", re.MULTILINE
    )  # "[^id]: text"
    HTML_BLOCK_START = re.compile(r"^(<([a-zA-Z]+)([^>]*)>|<!--)")  # tag or comment
    HTML_BLOCK_END_COMMENT = re.compile(r"-->\s*$")  # end of an HTML comment
    TASK_ITEM_RE = re.compile(r"^\[( |x|X)\]\s+(.*)$")  # "[ ] todo" / "[x] done"

    def __init__(self, text):
        """Prepare to parse *text* and pre-collect its link/footnote definitions."""
        self.lines = text.split("\n")
        self.length = len(self.lines)
        self.pos = 0  # index of the next unparsed line
        self.tokens = []  # tokens produced so far
        self.text = text
        self.references = {}  # lower-cased reference id -> url
        self.footnotes = {}  # footnote id -> text
        self.extract_references_and_footnotes()

    def extract_references_and_footnotes(self):
        """Fill ``self.references`` and ``self.footnotes`` from the whole text."""
        for m in self.REFERENCE_DEF_RE.finditer(self.text):
            rid, url = m.groups()
            # "[^id]: text" also fits the reference pattern; it is a footnote
            # definition and must not be registered as a link target.
            if rid.startswith("^"):
                continue
            self.references[rid.lower()] = url
        for m in self.FOOTNOTE_DEF_RE.finditer(self.text):
            fid, content = m.groups()
            self.footnotes[fid] = content

    def parse(self):
        """Tokenize the document and return the list of ``BlockToken`` objects."""
        # Frontmatter is only recognized on the line parsing starts at.
        if self.pos < self.length and self.FRONTMATTER_RE.match(
            self.lines[self.pos].strip()
        ):
            self.parse_frontmatter()

        while self.pos < self.length:
            line = self.lines[self.pos]
            stripped = line.strip()
            if not stripped:
                self.pos += 1
                continue

            if self.is_table_start():
                self.parse_table()
                continue

            if self.is_html_block_start(line):
                self.parse_html_block()
                continue

            # ATX header ("# Header")
            m = self.ATX_HEADER_RE.match(line)
            if m:
                self.tokens.append(
                    BlockToken(
                        "header",
                        content=m.group(2).strip(),
                        level=len(m.group(1)),
                        line=self.pos + 1,
                    )
                )
                self.pos += 1
                continue

            # Setext header: a text line underlined by "===" or "---". Only a
            # plain text line may be promoted; otherwise a fence, list item or
            # quote followed by "---" (e.g. a YAML document inside a code
            # fence) would be swallowed as a header.
            if self.pos + 1 < self.length and not self.starts_new_block(stripped):
                underline = self.lines[self.pos + 1].strip()
                if self.SETEXT_H1_RE.match(underline):
                    level = 1
                elif self.SETEXT_H2_RE.match(underline):
                    level = 2
                else:
                    level = None
                if level is not None:
                    self.tokens.append(
                        BlockToken(
                            "header", content=stripped, level=level, line=self.pos + 1
                        )
                    )
                    self.pos += 2
                    continue

            # Horizontal rule
            if self.HR_RE.match(stripped):
                self.tokens.append(BlockToken("hr", line=self.pos + 1))
                self.pos += 1
                continue

            # Fenced code block
            fm = self.FENCE_RE.match(stripped)
            if fm:
                self.parse_fenced_code_block(fm.group(1).strip())
                continue

            # Blockquote
            if self.BLOCKQUOTE_RE.match(line):
                self.parse_blockquote()
                continue

            # Lists
            om = self.ORDERED_LIST_RE.match(line)
            um = self.UNORDERED_LIST_RE.match(line)
            if om or um:
                self.parse_list(ordered=bool(om))
                continue

            # Nothing else claimed the line: paragraph.
            self.parse_paragraph()
        return self.tokens

    def is_html_block_start(self, line):
        """Return True if *line* (ignoring indentation) opens an HTML block."""
        return self.HTML_BLOCK_START.match(line.strip()) is not None

    def parse_html_block(self):
        """Consume an HTML block starting at the current line.

        The block ends before the first blank line or at end of input. A block
        that starts with ``<!--`` additionally ends at the first line that
        ends with ``-->``.
        """
        start = self.pos
        lines = []
        comment_mode = self.lines[self.pos].strip().startswith("<!--")
        while self.pos < self.length:
            line = self.lines[self.pos]
            lines.append(line)
            self.pos += 1
            if comment_mode and self.HTML_BLOCK_END_COMMENT.search(line):
                break
            # Stop at end of input or when the next line is blank.
            if self.pos >= self.length or not self.lines[self.pos].strip():
                break
        content = "\n".join(lines)
        self.tokens.append(BlockToken("html_block", content=content, line=start + 1))

    def starts_new_block_peek(self):
        """Return a truthy value if the line at ``self.pos`` starts a new block."""
        if self.pos < self.length:
            nxt = self.lines[self.pos].strip()
            return self.starts_new_block(nxt)
        return False

    def is_table_start(self):
        """Return True if the current line is a table header row.

        That is: both this line and the next contain ``|`` and the next line
        is a separator row.
        """
        if self.pos + 1 < self.length:
            line = self.lines[self.pos].strip()
            next_line = self.lines[self.pos + 1].strip()
            if (
                "|" in line
                and "|" in next_line
                and self.TABLE_SEPARATOR_RE.match(next_line)
            ):
                return True
        return False

    @staticmethod
    def _split_table_row(row):
        """Split one table row into stripped cell strings.

        At most one leading and one trailing pipe are treated as the row
        border; empty cells in between are kept so columns stay aligned.
        """
        row = row.strip()
        if row.startswith("|"):
            row = row[1:]
        if row.endswith("|"):
            row = row[:-1]
        return [cell.strip() for cell in row.split("|")]

    def parse_table(self):
        """Consume a table (header, separator, data rows) into a ``table`` token.

        Data rows run until a blank line or a line that starts another block.
        """
        start = self.pos
        header_line = self.lines[self.pos].strip()
        self.pos += 2  # skip the header and the separator row
        rows = []
        while self.pos < self.length:
            line = self.lines[self.pos].strip()
            if not line or self.starts_new_block(line):
                break
            rows.append(line)
            self.pos += 1
        self.tokens.append(
            BlockToken(
                "table",
                meta={
                    "header": self._split_table_row(header_line),
                    "rows": [self._split_table_row(r) for r in rows],
                },
                line=start + 1,
            )
        )

    def starts_new_block(self, line):
        """Return a truthy value if *line* (already stripped) opens a block.

        Used to decide where paragraphs, tables and lists end.
        """
        return (
            self.ATX_HEADER_RE.match(line)
            or self.FRONTMATTER_RE.match(line)
            or self.FENCE_RE.match(line)
            or self.BLOCKQUOTE_RE.match(line)
            or self.ORDERED_LIST_RE.match(line)
            or self.UNORDERED_LIST_RE.match(line)
            or self.HR_RE.match(line)
            or self.SETEXT_H1_RE.match(line)
            or self.SETEXT_H2_RE.match(line)
            or self.HTML_BLOCK_START.match(line)
        )

    def parse_frontmatter(self):
        """Consume frontmatter after an opening ``---`` into a ``frontmatter`` token.

        If no closing ``---`` exists, the rest of the document is taken as
        frontmatter.
        """
        self.pos += 1
        start = self.pos
        while self.pos < self.length:
            if self.FRONTMATTER_RE.match(self.lines[self.pos].strip()):
                content = "\n".join(self.lines[start : self.pos])
                self.tokens.append(BlockToken("frontmatter", content=content))
                self.pos += 1
                return
            self.pos += 1
        content = "\n".join(self.lines[start:])
        self.tokens.append(BlockToken("frontmatter", content=content))
        self.pos = self.length

    def parse_fenced_code_block(self, lang):
        """Consume a fenced code block whose opening fence is the current line.

        The token's ``line`` is the first line *after* the opening fence. An
        unterminated fence takes the rest of the document.
        """
        self.pos += 1
        start = self.pos
        while self.pos < self.length:
            if self.lines[self.pos].strip().startswith("```"):
                content = "\n".join(self.lines[start : self.pos])
                self.tokens.append(
                    BlockToken(
                        "code", content=content, meta={"language": lang}, line=start + 1
                    )
                )
                self.pos += 1
                return
            self.pos += 1
        content = "\n".join(self.lines[start:])
        self.tokens.append(
            BlockToken("code", content=content, meta={"language": lang}, line=start + 1)
        )
        self.pos = self.length

    def parse_blockquote(self):
        """Consume consecutive ``>`` lines into one ``blockquote`` token."""
        start = self.pos
        lines = []
        while self.pos < self.length:
            bm = self.BLOCKQUOTE_RE.match(self.lines[self.pos])
            if not bm:
                break
            lines.append(bm.group(2))
            self.pos += 1
        content = "\n".join(lines)
        self.tokens.append(BlockToken("blockquote", content=content, line=start + 1))

    def parse_list(self, ordered):
        """Consume a list starting at the current line into a list token.

        Args:
            ordered: True to collect ``1.``-style items, False for bullets.

        The token's ``meta["items"]`` is a list of dicts: ``{"text",
        "task_item": False}`` for plain items, or ``{"text", "task_item":
        True, "checked"}`` when the item starts with ``[ ]`` / ``[x]`` /
        ``[X]``. The list ends at a line that opens another kind of block, or
        at unindented non-item text following a blank line.
        """
        start = self.pos
        items = []
        current_item = []
        after_blank = False
        list_pattern = self.ORDERED_LIST_RE if ordered else self.UNORDERED_LIST_RE
        while self.pos < self.length:
            line = self.lines[self.pos]
            stripped = line.strip()
            if not stripped:
                # A blank line closes the item being collected.
                if current_item:
                    items.append("\n".join(current_item).strip())
                    current_item = []
                after_blank = True
                self.pos += 1
                continue
            # Test for an item marker first: an empty bullet ("- ") strips
            # down to "-", which also looks like a setext underline, and must
            # still be consumed as an item rather than ending the list.
            lm = list_pattern.match(line)
            if lm:
                if current_item:
                    items.append("\n".join(current_item).strip())
                current_item = [lm.group(1)]
                after_blank = False
                self.pos += 1
                continue
            is_list_line = self.ORDERED_LIST_RE.match(
                stripped
            ) or self.UNORDERED_LIST_RE.match(stripped)
            if self.starts_new_block(stripped) and not is_list_line:
                break
            # Unindented text after a blank line is a new block (paragraph or
            # a list of the other kind), not part of this list.
            if after_blank and not line[0].isspace():
                break
            current_item.append(stripped)
            self.pos += 1
        if current_item:
            items.append("\n".join(current_item).strip())

        final_items = []
        for it in items:
            first_line = it.split("\n")[0].strip()
            m = self.TASK_ITEM_RE.match(first_line)
            if m:
                final_items.append(
                    {
                        "text": m.group(2),
                        "task_item": True,
                        "checked": m.group(1).lower() == "x",
                    }
                )
            else:
                final_items.append({"text": it, "task_item": False})
        list_type = "ordered_list" if ordered else "unordered_list"
        self.tokens.append(
            BlockToken(list_type, meta={"items": final_items}, line=start + 1)
        )

    def parse_paragraph(self):
        """Consume a paragraph starting at the current line.

        The paragraph ends at a blank line (which is consumed) or before a
        line that opens another block. The first line is always taken: the
        dispatcher has already decided it belongs to no other block, and
        re-testing it here could leave the position unchanged and make
        ``parse`` loop forever (e.g. on a lone ``===`` or an indented ``# x``).
        """
        start = self.pos
        lines = []
        while self.pos < self.length:
            line = self.lines[self.pos]
            if not line.strip():
                self.pos += 1
                break
            if lines and self.starts_new_block(line.strip()):
                break
            lines.append(line)
            self.pos += 1
        content = "\n".join(lines).strip()
        if content:
            self.tokens.append(BlockToken("paragraph", content=content, line=start + 1))