# updated the UI — jempf (commit fd30b9c)
import os

# Pin gradio to 3.2 BEFORE importing it: re-installing after `import gradio`
# has no effect on the module already loaded into this process.
os.system("pip uninstall -y gradio")
os.system("pip install gradio==3.2")

import gradio as gr
import numpy as np
import torch
from pathlib import Path

from demo_inference.demo_tts import DemoTTS
from demo_inference.demo_asr import DemoASR
from demo_inference.demo_anonymization import DemoAnonymizer
def pcm2float(sig, dtype='float32'):
    """Convert integer PCM samples to floats in [-1.0, 1.0).

    Adapted from
    https://gist.github.com/HudsonHuang/fbdf8e9af7993fe2a91620d3fb86a182

    Raises TypeError if *sig* is not integer-typed or *dtype* is not a
    floating-point type.
    """
    sig = np.asarray(sig)
    if sig.dtype.kind not in 'iu':
        raise TypeError("'sig' must be an array of integers")
    target = np.dtype(dtype)
    if target.kind != 'f':
        raise TypeError("'dtype' must be a floating point type")

    info = np.iinfo(sig.dtype)
    # Full-scale magnitude of the signed integer type, e.g. 32768 for int16.
    scale = 2 ** (info.bits - 1)
    # Zero for signed types; recenters unsigned types around zero.
    shift = info.min + scale
    return (sig.astype(target) - shift) / scale
def float2pcm(sig, dtype='int16'):
    """Convert floats in [-1.0, 1.0] to integer PCM samples (with clipping).

    Adapted from
    https://gist.github.com/HudsonHuang/fbdf8e9af7993fe2a91620d3fb86a182

    Raises TypeError if *sig* is not float-typed or *dtype* is not an
    integer type.
    """
    sig = np.asarray(sig)
    if sig.dtype.kind != 'f':
        raise TypeError("'sig' must be a float array")
    target = np.dtype(dtype)
    if target.kind not in 'iu':
        raise TypeError("'dtype' must be an integer type")

    info = np.iinfo(target)
    # Full-scale magnitude of the target integer type, e.g. 32768 for int16.
    scale = 2 ** (info.bits - 1)
    shift = info.min + scale
    scaled = sig * scale + shift
    # Clip to the representable range before the (truncating) cast.
    return scaled.clip(info.min, info.max).astype(target)
class VPInterface:
    """Glue object wiring the ASR → anonymization → TTS pipeline to the UI."""

    def __init__(self):
        # Prefer GPU when available; all sub-models share one device.
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'

        self.path_to_tts_models = Path('models', 'tts')
        self.path_to_asr_model = Path('models', 'asr')
        self.path_to_anon_model = Path('models', 'anonymization')

        # Default model variants; swapped lazily in _check_models().
        self.synthesis_model = DemoTTS(model_paths=self.path_to_tts_models,
                                       model_tag='Libri100', device=self.device)
        self.asr_model = DemoASR(model_path=self.path_to_asr_model,
                                 model_tag='phones', device=self.device)
        self.anon_model = DemoAnonymizer(model_path=self.path_to_anon_model,
                                         model_tag='pool', device=self.device)

    def read(self, recording, asr_model_tag, anon_model_tag, tts_model_tag):
        """Anonymize one (sample_rate, int_pcm_array) recording.

        Returns a (48000, int16_pcm_array) tuple for the Gradio audio output.
        """
        sr, audio = recording
        audio = pcm2float(audio)

        # Reload any sub-model whose UI selection changed since the last call.
        self._check_models(asr_model_tag, anon_model_tag, tts_model_tag)
        text_is_phonemes = (self.asr_model.model_tag == 'phones')

        text = self.asr_model.recognize_speech(audio, sr)
        print(text)  # surfaced in the Space logs for debugging

        speaker_embedding = self.anon_model.anonymize_embedding(audio, sr)
        print(speaker_embedding)  # surfaced in the Space logs for debugging

        syn_audio = self.synthesis_model.read_text(
            transcription=text,
            speaker_embedding=speaker_embedding,
            text_is_phonemes=text_is_phonemes,
        )
        # NOTE(review): output rate is hard-coded; presumably the TTS model
        # synthesizes at 48 kHz — confirm against DemoTTS.
        return 48000, float2pcm(syn_audio.cpu().numpy())

    def _check_models(self, asr_model_tag, anon_model_tag, tts_model_tag):
        """Re-instantiate each sub-model whose tag differs from the request."""
        if asr_model_tag != self.asr_model.model_tag:
            self.asr_model = DemoASR(model_path=self.path_to_asr_model,
                                     model_tag=asr_model_tag,
                                     device=self.device)
        if anon_model_tag != self.anon_model.model_tag:
            self.anon_model = DemoAnonymizer(model_path=self.path_to_anon_model,
                                             model_tag=anon_model_tag,
                                             device=self.device)
        if tts_model_tag != self.synthesis_model.model_tag:
            self.synthesis_model = DemoTTS(model_paths=self.path_to_tts_models,
                                           model_tag=tts_model_tag,
                                           device=self.device)
model = VPInterface()
# ── UI copy & look-and-feel for β€œanamedi Ghost” ────────────────────────────────
article = """
**anamedi Ghost** lets you **anonymize _and_ pseudonymize** short speech samples in a single click.
Under the hood the demo chains three micro-models:
* **ASR engine** β†’ turns speech into text
* **Voice-privacy module** β†’ scrambles the speaker embedding (pool/random/pool-raw)
* **TTS synthesizer** β†’ renders the same words with a surrogate voice
Every time you hit **Submit** you’ll get a new, privacy-safe version of your input.
> **Heads-up**
> β€’ This Space runs on CPU, so inference can take ~20 s.
> β€’ The β€œpool” setting uses an ad-hoc scaler (`StandardScaler`) because we process just one sample at a time; in production Ghost uses a batch-optimised `MinMaxScaler`.
> β€’ Quality is still work-in-progressβ€”please report glitches!
πŸ”— Source & docs: <https://github.com/anamedi/ghost>
"""
description = """
## anamedi Ghost – Voice Privacy Demo (v0.1)
Try it out: record a short English sentence, pick your privacy mode, then listen to the anonymized result.
"""
css = """
.gr-button-primary {
background-color: #00b7c2 !important; /* anamedi teal */
border-color: #00b7c2 !important;
}
"""
# Dropdown choices for the three pipeline stages (gradio 3.2 API:
# components live under the gr.inputs / gr.outputs namespaces).
asr_choices = ["phones", "STT", "TTS"]
privacy_choices = ["pool", "random", "pool raw"]
tts_choices = ["Libri100", "Libri100 + finetuned",
               "Libri600", "Libri600 + finetuned"]

iface = gr.Interface(
    fn=model.read,
    inputs=[
        gr.inputs.Audio(source="microphone", type="numpy",
                        label="🎙️ Speak a sentence (English)"),
        gr.inputs.Dropdown(asr_choices, type="value", default="phones",
                           label="ASR engine"),
        gr.inputs.Dropdown(privacy_choices, type="value", default="pool",
                           label="Privacy mode"),
        gr.inputs.Dropdown(tts_choices, type="value", default="Libri100",
                           label="TTS voice"),
    ],
    outputs=gr.outputs.Audio(type="numpy", label="🔊 Anonymized audio"),
    layout="vertical",
    title="anamedi Ghost – Voice Privacy Demo",
    description=description,
    theme="default",
    allow_flagging="never",
    article=article,
    allow_screenshot=False,
)

# Queue requests so concurrent users are served one at a time.
iface.launch(enable_queue=True)