# Page header captured with this file (Hugging Face Spaces status): "Spaces: Running"
import os
import subprocess
import sys
from pathlib import Path

import numpy as np
import torch

# Pin Gradio to the version this app is written against *before* importing it.
# Importing gradio first and reinstalling afterwards would leave the already
# loaded (unpinned) version running in this process. `sys.executable -m pip`
# guarantees the package lands in the interpreter that is running this script.
# check=False keeps the original best-effort behaviour: a failed pip call does
# not abort here; a missing package surfaces at the import below instead.
subprocess.run([sys.executable, "-m", "pip", "uninstall", "-y", "gradio"], check=False)
subprocess.run([sys.executable, "-m", "pip", "install", "gradio==3.2"], check=False)

import gradio as gr  # noqa: E402  (must come after the version pin above)

from demo_inference.demo_tts import DemoTTS  # noqa: E402
from demo_inference.demo_asr import DemoASR  # noqa: E402
from demo_inference.demo_anonymization import DemoAnonymizer  # noqa: E402
def pcm2float(sig, dtype='float32'):
    """Convert integer PCM samples to floating point in the range [-1.0, 1.0).

    Adapted from
    https://gist.github.com/HudsonHuang/fbdf8e9af7993fe2a91620d3fb86a182

    Args:
        sig: Array-like of signed or unsigned integer samples.
        dtype: Target floating point dtype (anything ``np.dtype`` accepts).

    Returns:
        ``numpy.ndarray`` of ``dtype`` with the same shape as ``sig``. The
        smallest representable integer maps to -1.0.

    Raises:
        TypeError: If ``sig`` does not hold integers or ``dtype`` is not a
            floating point type.
    """
    sig = np.asarray(sig)
    if sig.dtype.kind not in 'iu':
        raise TypeError("'sig' must be an array of integers")
    dtype = np.dtype(dtype)
    if dtype.kind != 'f':
        raise TypeError("'dtype' must be a floating point type")

    info = np.iinfo(sig.dtype)
    abs_max = 2 ** (info.bits - 1)
    # Zero for signed types; re-centres unsigned types around their midpoint.
    offset = info.min + abs_max
    return (sig.astype(dtype) - offset) / abs_max
def float2pcm(sig, dtype='int16'):
    """Convert floating point samples in [-1.0, 1.0] to integer PCM.

    Adapted from
    https://gist.github.com/HudsonHuang/fbdf8e9af7993fe2a91620d3fb86a182

    Values outside the representable range of ``dtype`` are clipped, so
    +1.0 maps to the largest integer rather than overflowing.

    Args:
        sig: Array-like of floating point samples.
        dtype: Target integer dtype (anything ``np.dtype`` accepts).

    Returns:
        ``numpy.ndarray`` of ``dtype`` with the same shape as ``sig``.

    Raises:
        TypeError: If ``sig`` is not a float array or ``dtype`` is not an
            integer type.
    """
    sig = np.asarray(sig)
    if sig.dtype.kind != 'f':
        raise TypeError("'sig' must be a float array")
    dtype = np.dtype(dtype)
    if dtype.kind not in 'iu':
        raise TypeError("'dtype' must be an integer type")

    info = np.iinfo(dtype)
    abs_max = 2 ** (info.bits - 1)
    # Zero for signed targets; shifts unsigned targets to their midpoint.
    offset = info.min + abs_max
    # Scale in (at least) float64: in float32 the upper clip bound of a 32-bit
    # target rounds up to 2**31, and the final cast would then wrap around.
    wide = sig.astype(np.promote_types(sig.dtype, np.float64), copy=False)
    return (wide * abs_max + offset).clip(info.min, info.max).astype(dtype)
class VPInterface:
    """Speech anonymization pipeline that backs the demo UI.

    Holds one speech recognition model (``DemoASR``), one speaker anonymization
    model (``DemoAnonymizer``) and one synthesis model (``DemoTTS``). A model is
    re-created only when a request asks for a tag other than the loaded one.
    """

    # Sample rate returned alongside the synthesized audio.
    # NOTE(review): assumes DemoTTS always produces 48 kHz audio — confirm.
    OUTPUT_SAMPLE_RATE = 48000

    def __init__(self):
        """Load the default models ('phones' ASR, 'pool' anonymizer, 'Libri100' TTS)."""
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'

        self.path_to_tts_models = Path('models', 'tts')
        self.path_to_asr_model = Path('models', 'asr')
        self.path_to_anon_model = Path('models', 'anonymization')

        self.synthesis_model = DemoTTS(model_paths=self.path_to_tts_models, model_tag='Libri100',
                                       device=self.device)
        self.asr_model = DemoASR(model_path=self.path_to_asr_model, model_tag='phones', device=self.device)
        self.anon_model = DemoAnonymizer(model_path=self.path_to_anon_model, model_tag='pool',
                                         device=self.device)

    def read(self, recording, asr_model_tag, anon_model_tag, tts_model_tag):
        """Anonymize one recording and return the re-synthesized audio.

        Args:
            recording: ``(sample_rate, samples)`` pair with integer PCM samples.
            asr_model_tag: Tag of the speech recognition model to use.
            anon_model_tag: Tag of the anonymization model to use.
            tts_model_tag: Tag of the synthesis model to use.

        Returns:
            ``(sample_rate, samples)`` with the synthesized audio as int16 PCM.

        Raises:
            ValueError: If ``recording`` is ``None`` (nothing was recorded).
            TypeError: If the samples are not integers (raised by ``pcm2float``).
        """
        if recording is None:
            # Without this guard the tuple unpacking below fails with an
            # unhelpful "cannot unpack non-iterable NoneType" error.
            raise ValueError("No audio received; please record a sentence before submitting.")

        sr, audio = recording
        audio = pcm2float(audio)

        self._check_models(asr_model_tag, anon_model_tag, tts_model_tag)

        # Evaluated after _check_models so it reflects the ASR model that is
        # actually used for this request.
        text_is_phonemes = (self.asr_model.model_tag == 'phones')
        text = self.asr_model.recognize_speech(audio, sr)
        print(text)
        speaker_embedding = self.anon_model.anonymize_embedding(audio, sr)
        print(speaker_embedding)
        syn_audio = self.synthesis_model.read_text(transcription=text, speaker_embedding=speaker_embedding,
                                                   text_is_phonemes=text_is_phonemes)

        return self.OUTPUT_SAMPLE_RATE, float2pcm(syn_audio.cpu().numpy())

    def _check_models(self, asr_model_tag, anon_model_tag, tts_model_tag):
        """Replace any model whose loaded tag differs from the requested one."""
        if asr_model_tag != self.asr_model.model_tag:
            self.asr_model = DemoASR(model_path=self.path_to_asr_model, model_tag=asr_model_tag, device=self.device)
        if anon_model_tag != self.anon_model.model_tag:
            self.anon_model = DemoAnonymizer(model_path=self.path_to_anon_model, model_tag=anon_model_tag,
                                             device=self.device)
        if tts_model_tag != self.synthesis_model.model_tag:
            self.synthesis_model = DemoTTS(model_paths=self.path_to_tts_models, model_tag=tts_model_tag,
                                           device=self.device)
# Models are loaded once, at start-up, and shared by every request.
model = VPInterface()

# ── UI copy & look-and-feel for “anamedi Ghost” ────────────────────────────────
# Markdown rendered below the interface.
article = """
**anamedi Ghost** lets you **anonymize _and_ pseudonymize** short speech samples in a single click.
Under the hood the demo chains three micro-models:
* **ASR engine** — turns speech into text
* **Voice-privacy module** — scrambles the speaker embedding (pool/random/pool-raw)
* **TTS synthesizer** — renders the same words with a surrogate voice
Every time you hit **Submit** you’ll get a new, privacy-safe version of your input.
> **Heads-up**
> • This Space runs on CPU, so inference can take ~20 s.
> • The “pool” setting uses an ad-hoc scaler (`StandardScaler`) because we process just one sample at a time; in production Ghost uses a batch-optimised `MinMaxScaler`.
> • Quality is still work-in-progress—please report glitches!
🔗 Source & docs: <https://github.com/anamedi/ghost>
"""

# Markdown rendered below the title.
description = """
## anamedi Ghost — Voice Privacy Demo (v0.1)
Try it out: record a short English sentence, pick your privacy mode, then listen to the anonymized result.
"""

# Custom styling; only takes effect because it is passed to gr.Interface below.
css = """
.gr-button-primary {
    background-color: #00b7c2 !important; /* anamedi teal */
    border-color: #00b7c2 !important;
}
"""

# The dropdown values are passed to model.read as the model tags, in order.
iface = gr.Interface(
    fn=model.read,
    inputs=[
        gr.inputs.Audio(source="microphone", type="numpy",
                        label="🎙️ Speak a sentence (English)"),
        gr.inputs.Dropdown(
            ["phones", "STT", "TTS"],
            type="value", default="phones",
            label="ASR engine"),
        gr.inputs.Dropdown(
            ["pool", "random", "pool raw"],
            type="value", default="pool",
            label="Privacy mode"),
        gr.inputs.Dropdown(
            ["Libri100", "Libri100 + finetuned", "Libri600",
             "Libri600 + finetuned"],
            type="value", default="Libri100",
            label="TTS voice")
    ],
    outputs=gr.outputs.Audio(type="numpy", label="🔊 Anonymized audio"),
    layout="vertical",
    title="anamedi Ghost — Voice Privacy Demo",
    description=description,
    theme="default",
    allow_flagging="never",
    article=article,
    css=css,
    allow_screenshot=False
)

iface.launch(enable_queue=True)