# shadow/authoring/parse.py
# garywelz's picture
# Add reader, versioning, and export tools
# b15141d
from __future__ import annotations
from dataclasses import dataclass
import re
from typing import Iterable
_ZWSP_RE = re.compile(r"[\u200b\u200c\u200d\ufeff]")
def _clean(s: str) -> str:
return _ZWSP_RE.sub("", s).strip()
# A chapter header must occupy the whole (cleaned) line and be one of:
#   * "chapter" + whitespace + digits, with nothing after the digits;
#   * "new" + whitespace + "chapter", optionally followed by more text;
#   * a line starting with the word "kungur" that later contains the word
#     "chapter", optionally followed by more text.
# Matching is case-insensitive.
_CHAPTER_HEADER_RE = re.compile(
    r"^(chapter\s+\d+|new\s+chapter\b.*|kungur\b.*chapter\b.*)$",
    re.IGNORECASE,
)


def is_chapter_header(line: str) -> bool:
    """Return True if *line* is a chapter header.

    The line is passed through ``_clean`` before matching, so surrounding
    whitespace and zero-width characters are ignored. Because the pattern is
    anchored at the start of the line, a header preceded by other characters
    (for example Markdown ``#`` heading markers) does not match, and neither
    does ``"Chapter 3: Title"`` since nothing may follow the digits.
    """
    return bool(_CHAPTER_HEADER_RE.match(_clean(line)))
# A divider is a line made only of three or more asterisks (trailing
# whitespace tolerated).
_DIVIDER_RE = re.compile(r"^\*{3,}\s*$")


def is_segment_divider(line: str) -> bool:
    """Return True if *line* is a segment divider such as ``***``.

    The line is passed through ``_clean`` first, so leading/trailing
    whitespace and zero-width characters do not prevent a match. Spaced
    variants such as ``* * *`` are not recognised.
    """
    return bool(_DIVIDER_RE.match(_clean(line)))
@dataclass(frozen=True)
class Segment:
    """One block of chapter text, delimited by segment dividers."""

    # Identifier; the parser in this module assigns "CCC-SSS"
    # (zero-padded chapter index, then segment index).
    id: str
    # Segment body.
    text: str
@dataclass(frozen=True)
class Chapter:
    """A titled chapter holding an ordered list of segments."""

    # Identifier; the parser in this module assigns "ch-NNN-<slug>".
    id: str
    # Header line the chapter was created from (or a placeholder title).
    title: str
    # Segments in reading order. The dataclass is frozen, but the list itself
    # remains mutable.
    segments: list[Segment]
@dataclass(frozen=True)
class Manuscript:
    """A whole manuscript: a title plus its chapters in order."""

    # Display title of the manuscript.
    title: str
    # Chapters in reading order.
    chapters: list[Chapter]
    # Where the manuscript text came from, if known.
    source_path: str | None = None
def _slugify(value: str) -> str:
    """Return a lowercase, hyphen-separated ASCII slug for *value*.

    Every run of characters other than ``a-z`` and ``0-9`` (which includes
    non-ASCII letters) collapses to a single hyphen, and leading/trailing
    hyphens are removed. If nothing is left, ``"untitled"`` is returned.
    """
    v = _clean(value).lower()
    v = re.sub(r"[^a-z0-9]+", "-", v).strip("-")
    return v or "untitled"
def split_segments(lines: Iterable[str]) -> list[str]:
    """Split *lines* into segment texts at divider lines.

    Divider lines (see ``is_segment_divider``) are dropped. The lines between
    dividers are joined with newlines and stripped; segments that end up
    empty are omitted, so consecutive, leading, or trailing dividers never
    produce empty entries.
    """
    segments: list[list[str]] = [[]]
    for line in lines:
        if is_segment_divider(line):
            # Only open a new bucket when the current one has content, so a
            # run of dividers does not pile up empty buckets.
            if segments[-1]:
                segments.append([])
            continue
        segments[-1].append(line)

    # A bucket may still hold only blank lines; drop those after stripping.
    texts = ("\n".join(block).strip() for block in segments)
    return [text for text in texts if text]
def parse_chapters_from_markdown(text: str, *, manuscript_title: str = "The Shadow of Lillya") -> Manuscript:
    """Parse manuscript *text* into chapters and segments.

    Lines recognised by ``is_chapter_header`` start a new chapter whose title
    is the cleaned header line. When at least one header exists, any text
    before the first header is discarded. Each chapter body is split into
    segments with ``split_segments``.

    Chapters get ids ``ch-NNN-<slug>`` (slug truncated to 40 characters) and
    segments get ids ``NNN-SSS``, both numbered from 1. If the text has no
    chapter headers at all, the whole text becomes a single chapter with id
    ``ch-001-draft`` and title ``Draft``.

    Args:
        text: The full manuscript text.
        manuscript_title: Title stored on the returned manuscript.

    Returns:
        A ``Manuscript`` whose ``source_path`` is left unset.
    """
    lines = text.splitlines()

    # First pass: group body lines under the header that precedes them.
    chapters: list[tuple[str, list[str]]] = []
    current_title: str | None = None
    current_lines: list[str] = []
    for raw in lines:
        if is_chapter_header(raw):
            # Flush the chapter collected so far before starting a new one.
            if current_title is not None:
                chapters.append((current_title, current_lines))
            current_title = _clean(raw)
            current_lines = []
            continue
        if current_title is None:
            # Ignore preamble until the first chapter marker.
            continue
        current_lines.append(raw)
    if current_title is not None:
        chapters.append((current_title, current_lines))

    # Second pass: split each chapter body into segments and assign ids.
    parsed: list[Chapter] = []
    for idx, (title, body_lines) in enumerate(chapters, start=1):
        segs = [
            Segment(id=f"{idx:03d}-{sidx:03d}", text=seg_text)
            for sidx, seg_text in enumerate(split_segments(body_lines), start=1)
        ]
        chap_id = f"ch-{idx:03d}-{_slugify(title)[:40]}"
        parsed.append(Chapter(id=chap_id, title=title, segments=segs))

    # If no explicit chapter headers were found, treat the whole text as one chapter.
    if not parsed:
        seg_texts = split_segments(lines)
        segs = [Segment(id=f"001-{i:03d}", text=t) for i, t in enumerate(seg_texts, start=1)]
        parsed = [Chapter(id="ch-001-draft", title="Draft", segments=segs)]

    return Manuscript(title=manuscript_title, chapters=parsed)
def manuscript_to_dict(m: Manuscript) -> dict:
    """Convert *m* into a plain nested dict of strings, lists, and dicts.

    The result has the keys ``title``, ``source_path``, and ``chapters``;
    each chapter has ``id``, ``title``, and ``segments``; each segment has
    ``id`` and ``text``.
    """
    return {
        "title": m.title,
        "source_path": m.source_path,
        "chapters": [
            {
                "id": c.id,
                "title": c.title,
                "segments": [{"id": s.id, "text": s.text} for s in c.segments],
            }
            for c in m.chapters
        ],
    }
def dict_to_manuscript(data: dict) -> Manuscript:
    """Build a ``Manuscript`` from a dict shaped like ``manuscript_to_dict`` output.

    Missing optional keys fall back to defaults: no chapters, no segments,
    empty chapter title, empty segment text, manuscript title
    ``"Manuscript"``, and no source path.

    Raises:
        KeyError: If a chapter or segment entry has no ``"id"``.
    """
    chapters: list[Chapter] = []
    for c in data.get("chapters", []):
        segs = [Segment(id=s["id"], text=s.get("text", "")) for s in c.get("segments", [])]
        chapters.append(Chapter(id=c["id"], title=c.get("title", ""), segments=segs))
    return Manuscript(title=data.get("title", "Manuscript"), chapters=chapters, source_path=data.get("source_path"))
def manuscript_to_markdown(m: Manuscript) -> str:
    """Render *m* as Markdown text.

    The manuscript title becomes a level-1 heading and each chapter a level-2
    heading (``Chapter <n>`` when the chapter title is empty). Every segment
    is written as its stripped text followed by a ``***`` divider line, with
    blank lines in between. The result ends with exactly one newline.
    """
    parts: list[str] = [f"# {m.title}".strip(), ""]
    for i, c in enumerate(m.chapters, start=1):
        parts.append(f"## {c.title or f'Chapter {i}'}".strip())
        parts.append("")
        for s in c.segments:
            # Text, blank line, divider, blank line.
            parts.extend([s.text.strip(), "", "***", ""])
    # Collapse trailing blank lines down to a single final newline.
    return "\n".join(parts).rstrip() + "\n"