""" transcript_parser.py -------------------- Parses Zoom transcripts (TXT or VTT format) into structured speaker turns. Each turn is the atomic unit for downstream segmentation by Agent 1. Supported formats: - Zoom TXT export: "Speaker Name\tHH:MM:SS\nUtterance text\n" - Zoom VTT export: WebVTT with timestamp blocks and speaker labels Output: List of Turn dicts, ready to be saved as parsed JSON. """ import re import json from pathlib import Path from dataclasses import dataclass, asdict from typing import Optional, Union # --------------------------------------------------------------------------- # Data structure # --------------------------------------------------------------------------- @dataclass class Turn: turn_id: int # sequential index across the transcript speaker: str # normalised speaker name start_time: str # HH:MM:SS end_time: Optional[str] # HH:MM:SS — available in VTT, None in TXT start_seconds: float # for windowing arithmetic end_seconds: Optional[float] text: str # cleaned utterance # --------------------------------------------------------------------------- # Helpers # --------------------------------------------------------------------------- def _hhmmss_to_seconds(ts: str) -> float: """Convert HH:MM:SS or HH:MM:SS.mmm to total seconds.""" ts = ts.strip() # strip milliseconds if present (VTT uses HH:MM:SS.mmm) ts = ts.split(".")[0] parts = ts.split(":") try: if len(parts) == 3: h, m, s = parts return int(h) * 3600 + int(m) * 60 + int(s) elif len(parts) == 2: m, s = parts return int(m) * 60 + int(s) except ValueError: pass return 0.0 def _seconds_to_hhmmss(seconds: float) -> str: seconds = int(seconds) h = seconds // 3600 m = (seconds % 3600) // 60 s = seconds % 60 return f"{h:02d}:{m:02d}:{s:02d}" def _clean_text(text: str) -> str: """Strip VTT positioning tags, extra whitespace, and common artefacts.""" text = re.sub(r"<[^>]+>", "", text) # remove <00:00:00.000> tags text = re.sub(r"\[.*?\]", "", text) # remove [Music] [Applause] etc text = re.sub(r"\s+", " ", text) # collapse whitespace return text.strip() # --------------------------------------------------------------------------- # TXT parser # --------------------------------------------------------------------------- # Zoom TXT export format: # # Speaker Name\tHH:MM:SS # Utterance text (may span multiple lines until next speaker block) # # Some exports use a tab separator, others a newline between name+time and text. # We handle both. _TXT_HEADER_RE = re.compile( r"^(?P.+?)\t(?P