Spaces:
Sleeping
Sleeping
| import os | |
| import gradio as gr | |
| import numpy as np | |
| import torch | |
| from pathlib import Path | |
| os.system("pip uninstall -y gradio") | |
| os.system("pip install gradio==3.2") | |
| from demo_inference.demo_tts import DemoTTS | |
| from demo_inference.demo_asr import DemoASR | |
| from demo_inference.demo_anonymization import DemoAnonymizer | |
def pcm2float(sig, dtype='float32'):
    """Convert integer PCM samples to a floating-point signal.

    The integer type's minimum maps to -1.0 and the result is scaled by
    2 ** (bits - 1), so signed and unsigned input both end up centred on 0.

    Adapted from
    https://gist.github.com/HudsonHuang/fbdf8e9af7993fe2a91620d3fb86a182

    Args:
        sig: Array-like of signed or unsigned integer samples.
        dtype: Floating-point dtype of the returned array.

    Returns:
        Array of ``dtype`` with the same shape as ``sig``.

    Raises:
        TypeError: If ``sig`` is not integer-typed, or ``dtype`` is not a
            floating-point type.
    """
    samples = np.asarray(sig)
    if samples.dtype.kind not in ('i', 'u'):
        raise TypeError("'sig' must be an array of integers")

    target = np.dtype(dtype)
    if target.kind != 'f':
        raise TypeError("'dtype' must be a floating point type")

    info = np.iinfo(samples.dtype)
    half_range = 2 ** (info.bits - 1)
    # Centre of the integer range: 0 for signed types, half_range for unsigned.
    midpoint = info.min + half_range
    centred = samples.astype(target) - midpoint
    return centred / half_range
def float2pcm(sig, dtype='int16'):
    """Convert a floating-point signal to integer PCM samples.

    Samples are scaled by 2 ** (bits - 1), shifted to the centre of the
    target integer range, clipped to that range and then cast (the cast
    truncates toward zero).  Input outside [-1.0, 1.0) therefore saturates
    instead of wrapping around.

    Adapted from
    https://gist.github.com/HudsonHuang/fbdf8e9af7993fe2a91620d3fb86a182

    Args:
        sig: Array-like of floating-point samples.
        dtype: Integer dtype of the returned array.

    Returns:
        Array of ``dtype`` with the same shape as ``sig``.

    Raises:
        TypeError: If ``sig`` is not floating-point, or ``dtype`` is not an
            integer type.
    """
    sig = np.asarray(sig)
    if sig.dtype.kind != 'f':
        raise TypeError("'sig' must be a float array")
    dtype = np.dtype(dtype)
    if dtype.kind not in 'iu':
        raise TypeError("'dtype' must be an integer type")

    i = np.iinfo(dtype)
    abs_max = 2 ** (i.bits - 1)
    offset = i.min + abs_max

    # Scale and clip in at least float64.  In float32 the upper bound of a
    # 32-bit target (2**31 - 1) is not representable and rounds up to 2**31,
    # so a full-scale sample survived the clip and overflowed on the cast.
    # float64 holds every integer limit up to 32 bits exactly.
    # NOTE(review): 64-bit targets still cannot represent i.max exactly.
    work_dtype = np.result_type(sig.dtype, np.float64)
    scaled = sig.astype(work_dtype) * abs_max + offset
    return scaled.clip(i.min, i.max).astype(dtype)
class VPInterface:
    """Voice-privacy demo pipeline: recognize speech, anonymize the speaker, re-synthesize.

    Holds one ASR model, one anonymizer and one TTS model at a time.  Each is
    replaced lazily when a request asks for a different model tag.
    """

    def __init__(self):
        """Select the compute device and load the default model for each stage."""
        if torch.cuda.is_available():
            self.device = 'cuda'
        else:
            self.device = 'cpu'

        models_root = Path('models')
        self.path_to_tts_models = models_root / 'tts'
        self.path_to_asr_model = models_root / 'asr'
        self.path_to_anon_model = models_root / 'anonymization'

        self.synthesis_model = DemoTTS(
            model_paths=self.path_to_tts_models,
            model_tag='Libri100',
            device=self.device,
        )
        self.asr_model = DemoASR(
            model_path=self.path_to_asr_model,
            model_tag='phones',
            device=self.device,
        )
        self.anon_model = DemoAnonymizer(
            model_path=self.path_to_anon_model,
            model_tag='pool',
            device=self.device,
        )

    def read(self, recording, asr_model_tag, anon_model_tag, tts_model_tag):
        """Run the full pipeline on one recording.

        Args:
            recording: ``(sample_rate, samples)`` pair; ``samples`` must be
                integer PCM because it is passed through ``pcm2float``.
            asr_model_tag: Tag of the ASR model to use.
            anon_model_tag: Tag of the anonymization model to use.
            tts_model_tag: Tag of the TTS model to use.

        Returns:
            ``(48000, pcm)`` where ``pcm`` is the synthesized audio converted
            with ``float2pcm``.
        """
        sample_rate, samples = recording
        waveform = pcm2float(samples)

        # Swap in the requested models before anything reads their tags.
        self._check_models(asr_model_tag, anon_model_tag, tts_model_tag)
        phoneme_input = self.asr_model.model_tag == 'phones'

        transcription = self.asr_model.recognize_speech(waveform, sample_rate)
        print(transcription)

        embedding = self.anon_model.anonymize_embedding(waveform, sample_rate)
        print(embedding)

        # presumably a torch tensor, given the .cpu().numpy() below — TODO confirm
        synthesized = self.synthesis_model.read_text(
            transcription=transcription,
            speaker_embedding=embedding,
            text_is_phonemes=phoneme_input,
        )
        return 48000, float2pcm(synthesized.cpu().numpy())

    def _check_models(self, asr_model_tag, anon_model_tag, tts_model_tag):
        """Re-instantiate any stage whose loaded model tag differs from the requested one."""
        if asr_model_tag != self.asr_model.model_tag:
            self.asr_model = DemoASR(
                model_path=self.path_to_asr_model,
                model_tag=asr_model_tag,
                device=self.device,
            )

        if anon_model_tag != self.anon_model.model_tag:
            self.anon_model = DemoAnonymizer(
                model_path=self.path_to_anon_model,
                model_tag=anon_model_tag,
                device=self.device,
            )

        if tts_model_tag != self.synthesis_model.model_tag:
            self.synthesis_model = DemoTTS(
                model_paths=self.path_to_tts_models,
                model_tag=tts_model_tag,
                device=self.device,
            )
# Build the pipeline once at import time; every request goes through this instance.
model = VPInterface()

# ── UI copy & look-and-feel for “anamedi Ghost” ──────────────────────────────
# NOTE(review): the non-ASCII glyphs below were restored from mis-decoded UTF-8.
# Quotes, apostrophe and bullets are certain; the em dashes and the two emoji
# in the audio labels are best guesses — confirm against the intended copy.
article = """
**anamedi Ghost** lets you **anonymize _and_ pseudonymize** short speech samples in a single click.
Under the hood the demo chains three micro-models:
* **ASR engine** — turns speech into text
* **Voice-privacy module** — scrambles the speaker embedding (pool/random/pool-raw)
* **TTS synthesizer** — renders the same words with a surrogate voice
Every time you hit **Submit** you’ll get a new, privacy-safe version of your input.
> **Heads-up**
> • This Space runs on CPU, so inference can take ~20 s.
> • The “pool” setting uses an ad-hoc scaler (`StandardScaler`) because we process just one sample at a time; in production Ghost uses a batch-optimised `MinMaxScaler`.
> • Quality is still work-in-progress—please report glitches!
"""

description = """
## anamedi Ghost — Voice Privacy Demo (v0.1)
Try it out: record a short English sentence, pick your privacy mode, then listen to the anonymized result.
"""

css = """
.gr-button-primary {
background-color: #00b7c2 !important; /* anamedi teal */
border-color: #00b7c2 !important;
}
"""

iface = gr.Interface(
    fn=model.read,
    # Input order must match the positional parameters of VPInterface.read.
    inputs=[
        gr.inputs.Audio(source="microphone", type="numpy",
                        label="🎙️ Speak a sentence (English)"),
        gr.inputs.Dropdown(
            ["phones", "STT", "TTS"],
            type="value", default="phones",
            label="ASR engine"),
        gr.inputs.Dropdown(
            ["pool", "random", "pool raw"],
            type="value", default="random",
            label="Privacy mode"),
        gr.inputs.Dropdown(
            ["Libri100", "Libri100 + finetuned", "Libri600",
             "Libri600 + finetuned"],
            type="value", default="Libri100",
            label="TTS voice"),
    ],
    outputs=gr.outputs.Audio(type="numpy", label="🔊 Anonymized audio"),
    layout="vertical",
    title="anamedi Ghost — Voice Privacy Demo",
    description=description,
    theme="default",
    # The stylesheet above was defined but never handed to the interface.
    css=css,
    allow_flagging="never",
    article=article,
    allow_screenshot=False,
)

iface.launch(enable_queue=True)