import os

# Pin the Gradio version before importing it; re-installing the package after
# the import would have no effect on the module already loaded in this process.
os.system("pip uninstall -y gradio")
os.system("pip install gradio==3.2")

import gradio as gr
import numpy as np
import torch
from pathlib import Path

from demo_inference.demo_tts import DemoTTS
from demo_inference.demo_asr import DemoASR
from demo_inference.demo_anonymization import DemoAnonymizer


def pcm2float(sig, dtype='float32'):
    """Convert an integer PCM signal to floating point in [-1.0, 1.0).

    https://gist.github.com/HudsonHuang/fbdf8e9af7993fe2a91620d3fb86a182
    """
    sig = np.asarray(sig)
    if sig.dtype.kind not in 'iu':
        raise TypeError("'sig' must be an array of integers")
    dtype = np.dtype(dtype)
    if dtype.kind != 'f':
        raise TypeError("'dtype' must be a floating point type")

    i = np.iinfo(sig.dtype)
    abs_max = 2 ** (i.bits - 1)
    offset = i.min + abs_max
    return (sig.astype(dtype) - offset) / abs_max


def float2pcm(sig, dtype='int16'):
    """Convert a floating-point signal in [-1.0, 1.0] to integer PCM.

    https://gist.github.com/HudsonHuang/fbdf8e9af7993fe2a91620d3fb86a182
    """
    sig = np.asarray(sig)
    if sig.dtype.kind != 'f':
        raise TypeError("'sig' must be a float array")
    dtype = np.dtype(dtype)
    if dtype.kind not in 'iu':
        raise TypeError("'dtype' must be an integer type")

    i = np.iinfo(dtype)
    abs_max = 2 ** (i.bits - 1)
    offset = i.min + abs_max
    return (sig * abs_max + offset).clip(i.min, i.max).astype(dtype)


class VPInterface:
    """Chains the ASR, anonymization and TTS demo models behind the Gradio UI."""

    def __init__(self):
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.path_to_tts_models = Path('models', 'tts')
        self.path_to_asr_model = Path('models', 'asr')
        self.path_to_anon_model = Path('models', 'anonymization')

        self.synthesis_model = DemoTTS(model_paths=self.path_to_tts_models,
                                       model_tag='Libri100',
                                       device=self.device)
        self.asr_model = DemoASR(model_path=self.path_to_asr_model,
                                 model_tag='phones',
                                 device=self.device)
        self.anon_model = DemoAnonymizer(model_path=self.path_to_anon_model,
                                         model_tag='pool',
                                         device=self.device)

    def read(self, recording, asr_model_tag, anon_model_tag, tts_model_tag):
        # Gradio's numpy audio input arrives as a (sample_rate, int16 samples) tuple.
        sr, audio = recording
        audio = pcm2float(audio)

        self._check_models(asr_model_tag, anon_model_tag, tts_model_tag)

        text_is_phonemes = (self.asr_model.model_tag == 'phones')
        text = self.asr_model.recognize_speech(audio, sr)
        print(text)

        speaker_embedding = self.anon_model.anonymize_embedding(audio, sr)
        print(speaker_embedding)

        syn_audio = self.synthesis_model.read_text(transcription=text,
                                                   speaker_embedding=speaker_embedding,
                                                   text_is_phonemes=text_is_phonemes)

        # The synthesizer produces 48 kHz audio; convert back to int16 PCM for Gradio.
        return 48000, float2pcm(syn_audio.cpu().numpy())

    def _check_models(self, asr_model_tag, anon_model_tag, tts_model_tag):
        # Reload a sub-model only if the tag selected in the UI differs from
        # the one currently loaded.
        if asr_model_tag != self.asr_model.model_tag:
            self.asr_model = DemoASR(model_path=self.path_to_asr_model,
                                     model_tag=asr_model_tag,
                                     device=self.device)

        if anon_model_tag != self.anon_model.model_tag:
            self.anon_model = DemoAnonymizer(model_path=self.path_to_anon_model,
                                             model_tag=anon_model_tag,
                                             device=self.device)

        if tts_model_tag != self.synthesis_model.model_tag:
            self.synthesis_model = DemoTTS(model_paths=self.path_to_tts_models,
                                           model_tag=tts_model_tag,
                                           device=self.device)


model = VPInterface()
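
# Illustrative sanity check for the two PCM converters above. This is a
# sketch added for documentation and `_pcm_roundtrip_check` is not called
# anywhere by the demo: int16 -> float32 -> int16 round-trips exactly, since
# every int16 value is representable in a float32 mantissa.
def _pcm_roundtrip_check():
    probe = np.arange(-32768, 32768).astype(np.int16)  # every int16 value
    assert np.array_equal(float2pcm(pcm2float(probe)), probe)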
# ── UI copy & look-and-feel for “anamedi Ghost” ────────────────────────────────

article = """
**anamedi Ghost** lets you **anonymize _and_ pseudonymize** short speech samples in a single click.

Under the hood the demo chains three models:

* **ASR engine** → turns speech into text
* **Voice-privacy module** → scrambles the speaker embedding (pool / random / pool raw)
* **TTS synthesizer** → renders the same words with a surrogate voice

Every time you hit **Submit** you’ll get a new, privacy-safe version of your input.

> **Heads-up**
> • This Space runs on CPU, so inference can take ~20 s.
> • The “pool” setting uses an ad-hoc scaler (`StandardScaler`) because we process just one sample at a time; in production Ghost uses a batch-optimised `MinMaxScaler`.
> • Quality is still work in progress; please report glitches!

🔗 Source & docs:
"""

description = """
## anamedi Ghost – Voice Privacy Demo (v0.1)

Try it out: record a short English sentence, pick your privacy mode, then listen to the anonymized result.
"""

css = """
.gr-button-primary {
    background-color: #00b7c2 !important;  /* anamedi teal */
    border-color: #00b7c2 !important;
}
"""

iface = gr.Interface(
    fn=model.read,
    inputs=[
        gr.inputs.Audio(source="microphone", type="numpy",
                        label="🎙️ Speak a sentence (English)"),
        gr.inputs.Dropdown(["phones", "STT", "TTS"], type="value",
                           default="phones", label="ASR engine"),
        gr.inputs.Dropdown(["pool", "random", "pool raw"], type="value",
                           default="pool", label="Privacy mode"),
        gr.inputs.Dropdown(["Libri100", "Libri100 + finetuned",
                            "Libri600", "Libri600 + finetuned"],
                           type="value", default="Libri100", label="TTS voice")
    ],
    outputs=gr.outputs.Audio(type="numpy", label="🔊 Anonymized audio"),
    layout="vertical",
    title="anamedi Ghost – Voice Privacy Demo",
    description=description,
    article=article,
    css=css,
    theme="default",
    allow_flagging="never",
    allow_screenshot=False
)

iface.launch(enable_queue=True)
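
# For reference, the pipeline can also be driven without the UI. A sketch
# only (it assumes the checkpoints under models/ are present, and it is kept
# as a comment because iface.launch() above blocks until the server stops):
#
#   sr, pcm = 16000, np.zeros(16000, dtype=np.int16)  # one second of silence
#   out_sr, out_pcm = model.read((sr, pcm), 'phones', 'pool', 'Libri100')
#   # out_sr == 48000; out_pcm is int16 PCM ready to write to a WAV file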