Spaces:
Runtime error
Runtime error
| """ | |
| File I/O utilities | |
| - safe_paths(input_path, out_dir) -> dict of deterministic output paths | |
| - write_jsonl(path, rows) -> str | |
| - write_csv(path, rows, field_order) -> str | |
| - write_txt_transcript(path, paragraphs) -> str | |
| - write_rttm(path, segments) -> str | |
| """ | |
| from __future__ import annotations | |
| from typing import Any, Dict, List | |
| from pathlib import Path | |
| import hashlib | |
| import json | |
| import csv | |
| import os | |
def safe_paths(input_path: str, out_dir: str) -> Dict[str, str]:
    """
    Build the set of output file paths that belong to one input file.

    Every path has the form ``<out_dir>/<stem>_<hash>.<ext>`` where ``<stem>``
    is the input filename without its last suffix and ``<hash>`` is the first
    six hex digits of the MD5 of the input's resolved absolute path, so the
    same input always maps to the same names.

    Side effect: ``out_dir`` is created (including parents) if it is missing.

    Returns a dict with the keys "jsonl", "csv", "txt" and "rttm", each mapped
    to the corresponding path as a string.
    """
    source = Path(input_path)
    target_dir = Path(out_dir)
    target_dir.mkdir(parents=True, exist_ok=True)

    # The digest is only used to disambiguate equal filenames from different
    # directories; six hex digits keep the names short.
    absolute = str(source.resolve())
    digest = hashlib.md5(absolute.encode("utf-8")).hexdigest()
    prefix = f"{source.stem}_{digest[:6]}"

    return {
        ext: str(target_dir / f"{prefix}.{ext}")
        for ext in ("jsonl", "csv", "txt", "rttm")
    }
def write_jsonl(path: str, rows: List[Dict[str, Any]]) -> str:
    """
    Serialize ``rows`` to ``path`` as JSON Lines and return ``path``.

    One JSON document is written per line, each terminated by a newline. The
    file is UTF-8 encoded and non-ASCII characters are written unescaped
    (``ensure_ascii=False``). Missing parent directories are created, and an
    existing file at ``path`` is overwritten.
    """
    destination = Path(path)
    destination.parent.mkdir(parents=True, exist_ok=True)
    with destination.open("w", encoding="utf-8") as handle:
        handle.writelines(
            json.dumps(record, ensure_ascii=False) + "\n" for record in rows
        )
    return path
def write_csv(path: str, rows: List[Dict[str, Any]], field_order: List[str]) -> str:
    """
    Write ``rows`` to ``path`` as a UTF-8 CSV file and return ``path``.

    The header and the column order come from ``field_order``. Keys of a row
    that are not listed in ``field_order`` are dropped silently
    (``extrasaction="ignore"``). Missing parent directories are created, and
    an existing file at ``path`` is overwritten.
    """
    destination = Path(path)
    destination.parent.mkdir(parents=True, exist_ok=True)
    # newline="" hands line-ending control to the csv module.
    with destination.open("w", newline="", encoding="utf-8") as handle:
        table = csv.DictWriter(handle, fieldnames=field_order, extrasaction="ignore")
        table.writeheader()
        table.writerows(rows)
    return path
def write_txt_transcript(path: str, paragraphs: List[str]) -> str:
    """
    Write ``paragraphs`` to ``path`` as a plain-text transcript and return ``path``.

    Each paragraph is stripped of leading/trailing whitespace and written on
    its own line; consecutive paragraphs are separated by exactly one blank
    line. An empty list produces an empty file. Missing parent directories are
    created, and an existing file at ``path`` is overwritten.
    """
    destination = Path(path)
    destination.parent.mkdir(parents=True, exist_ok=True)
    with destination.open("w", encoding="utf-8") as handle:
        for position, paragraph in enumerate(paragraphs):
            # The blank separator goes in front of every paragraph but the
            # first, so the file never ends with a stray empty line.
            if position > 0:
                handle.write("\n")
            handle.write(paragraph.strip() + "\n")
    return path
def _rttm_field(value: Any, fallback: str) -> str:
    """Return ``value`` as a single whitespace-free RTTM token, or ``fallback`` if empty."""
    if value is None:
        return fallback
    # RTTM records are whitespace-delimited, so any run of whitespace inside a
    # token would be read as a field boundary; collapse it to an underscore.
    token = "_".join(str(value).split())
    return token or fallback


def write_rttm(path: str, segments: List[Dict[str, Any]]) -> str:
    """
    Write diarization output to RTTM format and return ``path``.

    Each line:
    SPEAKER <file-id> 1 <start> <dur> <NA> <NA> <speaker> <NA>
    Where:
    - <file-id>: stem of ``path`` (filename without extension)
    - <start>: segment "start" in seconds, 3 decimals (defaults to 0.0)
    - <dur>: "end" - "start" in seconds, 3 decimals, clamped to be >= 0
      ("end" defaults to "start", i.e. zero duration)
    - <speaker>: segment "speaker" label (e.g., S1); "Unknown" when the key is
      missing, None or empty

    Because RTTM fields are separated by whitespace, whitespace inside the
    file id or the speaker label is replaced by "_" so every record keeps a
    fixed number of fields (e.g. "Speaker 1" -> "Speaker_1").

    Missing parent directories are created, and an existing file at ``path``
    is overwritten.

    Raises:
        TypeError / ValueError: if a segment's "start" or "end" cannot be
            converted with ``float()``.
    """
    destination = Path(path)
    destination.parent.mkdir(parents=True, exist_ok=True)
    file_id = _rttm_field(destination.stem, "Unknown")
    with open(path, "w", encoding="utf-8") as f:
        for seg in segments:
            start = float(seg.get("start", 0.0))
            end = float(seg.get("end", start))
            # Guard against inverted segments: a negative duration is not a
            # valid RTTM value.
            duration = max(0.0, end - start)
            speaker = _rttm_field(seg.get("speaker"), "Unknown")
            f.write(
                f"SPEAKER {file_id} 1 {start:.3f} {duration:.3f} "
                f"<NA> <NA> {speaker} <NA>\n"
            )
    return path