Spaces:

garywelz
/

shadow

Sleeping

App Files Files Community

shadow / authoring /parse.py

garywelz

Add reader, versioning, and export tools

b15141d 22 days ago

raw

history blame contribute delete

4.31 kB

	from __future__ import annotations

	from dataclasses import dataclass
	import re
	from typing import Iterable


	_ZWSP_RE = re.compile(r"[\u200b\u200c\u200d\ufeff]")


	def _clean(s: str) -> str:
	return _ZWSP_RE.sub("", s).strip()


	_CHAPTER_HEADER_RE = re.compile(
	r"^(chapter\s+\d+\|new\s+chapter\b.\|kungur\b.chapter\b.*)$",
	re.IGNORECASE,
	)


	def is_chapter_header(line: str) -> bool:
	return bool(_CHAPTER_HEADER_RE.match(_clean(line)))


	_DIVIDER_RE = re.compile(r"^\{3,}\s$")


	def is_segment_divider(line: str) -> bool:
	return bool(_DIVIDER_RE.match(_clean(line)))


	@dataclass(frozen=True)
	class Segment:
	id: str
	text: str


	@dataclass(frozen=True)
	class Chapter:
	id: str
	title: str
	segments: list[Segment]


	@dataclass(frozen=True)
	class Manuscript:
	title: str
	chapters: list[Chapter]
	source_path: str \| None = None


	def _slugify(value: str) -> str:
	v = _clean(value).lower()
	v = re.sub(r"[^a-z0-9]+", "-", v).strip("-")
	return v or "untitled"


	def split_segments(lines: Iterable[str]) -> list[str]:
	segments: list[list[str]] = [[]]
	for line in lines:
	if is_segment_divider(line):
	if segments[-1]:
	segments.append([])
	continue
	segments[-1].append(line)
	out: list[str] = []
	for block in segments:
	text = "\n".join(block).strip()
	if text:
	out.append(text)
	return out


	def parse_chapters_from_markdown(text: str, *, manuscript_title: str = "The Shadow of Lillya") -> Manuscript:
	lines = text.splitlines()

	chapters: list[tuple[str, list[str]]] = []
	current_title: str \| None = None
	current_lines: list[str] = []

	for raw in lines:
	if is_chapter_header(raw):
	# flush previous
	if current_title is not None:
	chapters.append((current_title, current_lines))
	current_title = _clean(raw)
	current_lines = []
	continue
	if current_title is None:
	# ignore preamble until first chapter marker
	continue
	current_lines.append(raw)

	if current_title is not None:
	chapters.append((current_title, current_lines))

	parsed: list[Chapter] = []
	for idx, (title, body_lines) in enumerate(chapters, start=1):
	seg_texts = split_segments(body_lines)
	segs: list[Segment] = []
	for sidx, seg_text in enumerate(seg_texts, start=1):
	seg_id = f"{idx:03d}-{sidx:03d}"
	segs.append(Segment(id=seg_id, text=seg_text))
	chap_id = f"ch-{idx:03d}-{_slugify(title)[:40]}"
	parsed.append(Chapter(id=chap_id, title=title, segments=segs))

	# If no explicit chapter headers were found, treat the whole text as one chapter.
	if not parsed:
	seg_texts = split_segments(lines)
	segs = [Segment(id=f"001-{i:03d}", text=t) for i, t in enumerate(seg_texts, start=1)]
	parsed = [Chapter(id="ch-001-draft", title="Draft", segments=segs)]

	return Manuscript(title=manuscript_title, chapters=parsed)


	def manuscript_to_dict(m: Manuscript) -> dict:
	return {
	"title": m.title,
	"source_path": m.source_path,
	"chapters": [
	{
	"id": c.id,
	"title": c.title,
	"segments": [{"id": s.id, "text": s.text} for s in c.segments],
	}
	for c in m.chapters
	],
	}


	def dict_to_manuscript(data: dict) -> Manuscript:
	chapters: list[Chapter] = []
	for c in data.get("chapters", []):
	segs = [Segment(id=s["id"], text=s.get("text", "")) for s in c.get("segments", [])]
	chapters.append(Chapter(id=c["id"], title=c.get("title", ""), segments=segs))
	return Manuscript(title=data.get("title", "Manuscript"), chapters=chapters, source_path=data.get("source_path"))


	def manuscript_to_markdown(m: Manuscript) -> str:
	parts: list[str] = [f"# {m.title}".strip(), ""]
	for i, c in enumerate(m.chapters, start=1):
	parts.append(f"## {c.title or f'Chapter {i}'}".strip())
	parts.append("")
	for s in c.segments:
	parts.append(s.text.strip())
	parts.append("")
	parts.append("***")
	parts.append("")
	return "\n".join(parts).rstrip() + "\n"