| | |
import io
import re
import tempfile
import zipfile
from typing import List, Tuple

import gradio as gr
import numpy as np
import soundfile as sf

from synthesis import preload_model, synthesize
| |
|
# Sample rate handed to the synthesizer (``sr=``), reported alongside every
# audio array returned to the UI, and used when encoding the WAV files.
SR = 24_000
# Value passed as ``dist_m`` on every synthesis call; presumably the source
# distance in metres — confirm against synthesis.synthesize.
DIST_M = 1.0
# Azimuth (passed as ``az_deg``) used for each speaker side named in a script.
AZ_LOOKUP = {"left": -45, "right": 45}
| |
|
| | |
| | |
| | |
def _tts(text: str, az_deg: float) -> np.ndarray:
    """Synthesize one utterance at the requested azimuth.

    Thin wrapper around ``synthesize`` that pins the distance and sample
    rate to the module-level ``DIST_M`` and ``SR`` so callers only pick the
    text and the direction.

    Args:
        text: The words to speak.
        az_deg: Azimuth forwarded unchanged as ``az_deg``.

    Returns:
        Whatever ``synthesize`` produces. The rest of this module indexes
        ``shape[1]`` as the sample axis, so it presumably is a
        (channels, samples) array — confirm against synthesis.synthesize.
    """
    fixed_settings = {"dist_m": DIST_M, "sr": SR}
    return synthesize(text, az_deg=az_deg, **fixed_settings)
| |
|
| | |
| | |
| | |
# One script line: a speaker tag ``[S<n>]``, a side tag ``[left]``/``[right]``
# and the text to speak. Matching is case-insensitive. Whitespace is allowed
# inside the side brackets because the UI advertises the form ``[S1][ left]``;
# without that tolerance the documented example itself would be rejected.
# Group 1 is the side, group 2 is the spoken text.
LINE_RE = re.compile(r"\[S\d+\]\s*\[\s*(left|right)\s*\]\s*(.+)", re.I)
| |
|
def parse_script(script: str) -> List[Tuple[str, np.ndarray]]:
    """Turn a multi-line script into synthesized, side-tagged tracks.

    Each line is stripped and matched against ``LINE_RE``; lines that do not
    match are skipped without complaint. For every matching line the text is
    synthesized at the azimuth that ``AZ_LOOKUP`` assigns to its side.

    Args:
        script: Raw text from the UI, one utterance per line.

    Returns:
        ``(side, waveform)`` pairs in script order, where ``side`` is the
        lower-cased ``"left"`` or ``"right"``.

    Raises:
        gr.Error: If no line in the script matches the expected format.
    """
    candidates = (
        LINE_RE.match(raw.strip()) for raw in script.strip().splitlines()
    )

    tracks = []
    for hit in candidates:
        if hit:
            side = hit.group(1).lower()
            spoken = hit.group(2).strip()
            tracks.append((side, _tts(spoken, AZ_LOOKUP[side])))

    if tracks:
        return tracks
    raise gr.Error("No valid lines found. Format: [S1][ left] Hello …")
| |
|
| | |
| | |
| | |
def _pad(pcm: np.ndarray, T: int) -> np.ndarray:
    """Zero-pad a 2-D array at the end of axis 1 until it is ``T`` long.

    Args:
        pcm: 2-D array whose second axis is extended; the first axis is
            left untouched.
        T: Target length of axis 1. Must not be smaller than the current
            length, otherwise ``np.pad`` rejects the negative width.

    Returns:
        A padded array with shape ``(pcm.shape[0], T)``.
    """
    missing = T - pcm.shape[1]
    widths = ((0, 0), (0, missing))
    return np.pad(pcm, widths, mode="constant")
| |
|
def render(script: str):
    """Synthesize *script* and return the three mixes plus a zip download.

    The script is parsed into per-line tracks, the tracks of each side are
    summed into one mix per speaker, and the two speaker mixes are summed
    into the dialog mix.

    Args:
        script: Raw script text, one ``[S<n>][left|right] text`` per line.

    Returns:
        A 4-tuple matching the UI outputs: ``(SR, left)``, ``(SR, right)``,
        ``(SR, dialog)`` with each array transposed so the sample axis comes
        first, and the path of a zip file holding the three mixes as WAVs.

    Raises:
        gr.Error: Propagated from ``parse_script`` when no line is valid.
    """
    tracks = parse_script(script)

    def combine(wavs):
        # A side with no lines becomes a single silent sample so that it can
        # still be returned and encoded like a real track.
        if not wavs:
            return np.zeros((2, 1), dtype=np.float32)
        longest = max(w.shape[1] for w in wavs)
        # NOTE(review): every line of one side is overlaid starting at
        # sample 0, so several lines for the same speaker play at once
        # rather than one after another — confirm this is intended.
        return sum(_pad(w, longest) for w in wavs)

    left_mix = combine([w for side, w in tracks if side == "left"])
    right_mix = combine([w for side, w in tracks if side == "right"])

    # The two speaker mixes generally differ in length; adding them directly
    # raises a broadcasting error, so bring both to the longer length first.
    total = max(left_mix.shape[1], right_mix.shape[1])
    dialog = _pad(left_mix, total) + _pad(right_mix, total)

    archive = _zip_bytes({
        "left_speaker.wav": left_mix.T,
        "right_speaker.wav": right_mix.T,
        "dialog_mix.wav": dialog.T,
    })
    # The file output component takes a path rather than raw bytes, so the
    # archive is written to a temporary file that outlives this call.
    with tempfile.NamedTemporaryFile(
        prefix="spatial_dialog_", suffix=".zip", delete=False
    ) as tmp:
        tmp.write(archive)
        zip_path = tmp.name

    return (
        (SR, left_mix.T),
        (SR, right_mix.T),
        (SR, dialog.T),
        zip_path,
    )
| |
|
| | |
| | |
| | |
def _zip_bytes(files: dict) -> bytes:
    """Encode audio arrays as 16-bit PCM WAVs and bundle them into a zip.

    Args:
        files: Mapping of archive member name to the audio data to encode.
            Each value is handed to ``sf.write`` as-is at sample rate ``SR``.

    Returns:
        The bytes of a deflate-compressed zip archive with one WAV per entry.
    """
    archive = io.BytesIO()
    with zipfile.ZipFile(archive, "w", zipfile.ZIP_DEFLATED) as zf:
        for fname, data in files.items():
            wav_buf = io.BytesIO()
            # An in-memory buffer has no file name to infer the container
            # from, so the format must be given explicitly or the write fails.
            sf.write(wav_buf, data, SR, format="WAV", subtype="PCM_16")
            zf.writestr(fname, wav_buf.getvalue())
    return archive.getvalue()
| |
|
| | |
| | |
| | |
# UI layout: script entry, trigger button and zip download in the first
# column; one audio player per mix in the second column.
with gr.Blocks(title="Spatial Dialog Synth (Dia)") as demo:
    gr.Markdown(
        "### Spatial Dialog Synth\n"
        "Enter lines in the format `[S1][ left] Hello …` / `[S2][ right] …`"
    )

    with gr.Row():
        with gr.Column(scale=1):
            script_in = gr.Textbox(
                label="Script",
                placeholder="[S1][ left] Hello world…",
                lines=8,
            )
            gen_btn = gr.Button("Generate", variant="primary")
            zip_output = gr.File(label="Download all (zip)")

        with gr.Column(scale=1):
            left_audio = gr.Audio(label="Left speaker")
            right_audio = gr.Audio(label="Right speaker")
            mix_audio = gr.Audio(label="Dialog mix")

    # The order of outputs must match the tuple returned by render().
    gen_btn.click(
        fn=render,
        inputs=script_in,
        outputs=[left_audio, right_audio, mix_audio, zip_output],
    )
| | |
| | |
| | |
| | |
# Runs at module load, before the server starts; presumably this loads the
# synthesis model up front so the first request does not pay for it —
# confirm against synthesis.preload_model.
preload_model()

# Start the Gradio server for the ``demo`` app with default launch settings.
demo.launch()
| |
|