File size: 5,799 Bytes
574ab7e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fd30b9c
574ab7e
fd30b9c
 
574ab7e
fd30b9c
 
 
574ab7e
fd30b9c
574ab7e
fd30b9c
 
 
 
 
 
574ab7e
 
 
fd30b9c
 
574ab7e
 
 
fd30b9c
 
 
 
574ab7e
 
fd30b9c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
import os
import gradio as gr
import numpy as np
import torch
from pathlib import Path

# HACK: force-pin gradio to 3.2 at runtime (the legacy gr.inputs/gr.outputs API
# used below was removed in later gradio versions).
# NOTE(review): gradio was already imported above, so this reinstall only takes
# effect on the NEXT process start — the current run keeps the preinstalled
# version. Confirm the hosting platform restarts the app after first boot.
os.system("pip uninstall -y gradio")
os.system("pip install gradio==3.2")

from demo_inference.demo_tts import DemoTTS
from demo_inference.demo_asr import DemoASR
from demo_inference.demo_anonymization import DemoAnonymizer


def pcm2float(sig, dtype='float32'):
    """Convert integer PCM samples to floats in [-1.0, 1.0).

    Adapted from
    https://gist.github.com/HudsonHuang/fbdf8e9af7993fe2a91620d3fb86a182

    Raises TypeError if `sig` is not an integer array or `dtype` is not a
    floating point type.
    """
    samples = np.asarray(sig)
    if samples.dtype.kind not in 'iu':
        raise TypeError("'sig' must be an array of integers")
    target = np.dtype(dtype)
    if target.kind != 'f':
        raise TypeError("'dtype' must be a floating point type")

    info = np.iinfo(samples.dtype)
    # Half of the integer range; signed types map min..max onto -1.0..<1.0,
    # unsigned types are first re-centered around zero via `shift`.
    half_range = 2 ** (info.bits - 1)
    shift = info.min + half_range
    return (samples.astype(target) - shift) / half_range


def float2pcm(sig, dtype='int16'):
    """Convert float samples in [-1.0, 1.0] to integer PCM, clipping overflow.

    Adapted from
    https://gist.github.com/HudsonHuang/fbdf8e9af7993fe2a91620d3fb86a182

    Raises TypeError if `sig` is not a float array or `dtype` is not an
    integer type.
    """
    samples = np.asarray(sig)
    if samples.dtype.kind != 'f':
        raise TypeError("'sig' must be a float array")
    target = np.dtype(dtype)
    if target.kind not in 'iu':
        raise TypeError("'dtype' must be an integer type")

    info = np.iinfo(target)
    half_range = 2 ** (info.bits - 1)
    shift = info.min + half_range
    # Scale up, re-center for unsigned targets, then clip so +1.0 (which would
    # overflow a signed type by one) lands on the max representable value.
    scaled = samples * half_range + shift
    return scaled.clip(info.min, info.max).astype(target)


class VPInterface:
    """Glue object chaining ASR -> speaker anonymization -> TTS for the demo."""

    def __init__(self):
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'

        # Checkpoint directories, relative to the working directory.
        self.path_to_tts_models = Path('models', 'tts')
        self.path_to_asr_model = Path('models', 'asr')
        self.path_to_anon_model = Path('models', 'anonymization')

        # Default model variants; swapped lazily in _check_models when the
        # user picks a different tag in the UI.
        self.asr_model = DemoASR(model_path=self.path_to_asr_model,
                                 model_tag='phones', device=self.device)
        self.anon_model = DemoAnonymizer(model_path=self.path_to_anon_model,
                                         model_tag='pool', device=self.device)
        self.synthesis_model = DemoTTS(model_paths=self.path_to_tts_models,
                                       model_tag='Libri100', device=self.device)

    def read(self, recording, asr_model_tag, anon_model_tag, tts_model_tag):
        """Recognize, anonymize and re-synthesize one recording.

        `recording` is a (sample_rate, int PCM ndarray) pair as delivered by
        the gradio microphone widget; returns a (48000, int16 ndarray) pair.
        """
        sr, pcm = recording
        waveform = pcm2float(pcm)

        # Reload any model whose dropdown selection changed since last call.
        self._check_models(asr_model_tag, anon_model_tag, tts_model_tag)

        # The 'phones' ASR variant emits phoneme strings, which the TTS must
        # be told about explicitly.
        text_is_phonemes = (self.asr_model.model_tag == 'phones')
        transcription = self.asr_model.recognize_speech(waveform, sr)
        print(transcription)

        anon_embedding = self.anon_model.anonymize_embedding(waveform, sr)
        print(anon_embedding)

        syn_audio = self.synthesis_model.read_text(
            transcription=transcription,
            speaker_embedding=anon_embedding,
            text_is_phonemes=text_is_phonemes,
        )
        return 48000, float2pcm(syn_audio.cpu().numpy())

    def _check_models(self, asr_model_tag, anon_model_tag, tts_model_tag):
        """Reload only the models whose requested tag differs from the loaded one."""
        if self.asr_model.model_tag != asr_model_tag:
            self.asr_model = DemoASR(model_path=self.path_to_asr_model,
                                     model_tag=asr_model_tag, device=self.device)
        if self.anon_model.model_tag != anon_model_tag:
            self.anon_model = DemoAnonymizer(model_path=self.path_to_anon_model,
                                             model_tag=anon_model_tag,
                                             device=self.device)
        if self.synthesis_model.model_tag != tts_model_tag:
            self.synthesis_model = DemoTTS(model_paths=self.path_to_tts_models,
                                           model_tag=tts_model_tag,
                                           device=self.device)


model = VPInterface()

# ── UI copy & look-and-feel for β€œanamedi Ghost” ────────────────────────────────
article = """
**anamedi Ghost** lets you **anonymize _and_ pseudonymize** short speech samples in a single click.  
Under the hood the demo chains three micro-models:

* **ASR engine** β†’ turns speech into text  
* **Voice-privacy module** β†’ scrambles the speaker embedding (pool/random/pool-raw)  
* **TTS synthesizer** β†’ renders the same words with a surrogate voice

Every time you hit **Submit** you’ll get a new, privacy-safe version of your input.

> **Heads-up**  
> β€’ This Space runs on CPU, so inference can take ~20 s.  
> β€’ The β€œpool” setting uses an ad-hoc scaler (`StandardScaler`) because we process just one sample at a time; in production Ghost uses a batch-optimised `MinMaxScaler`.  
> β€’ Quality is still work-in-progressβ€”please report glitches!

πŸ”— Source & docs: <https://github.com/anamedi/ghost>
"""

description = """
## anamedi Ghost – Voice Privacy Demo (v0.1)
Try it out: record a short English sentence, pick your privacy mode, then listen to the anonymized result.
"""

css = """
.gr-button-primary {
    background-color: #00b7c2 !important;  /* anamedi teal */
    border-color: #00b7c2 !important;
}
"""

iface = gr.Interface(
    fn=model.read,
    inputs=[
        gr.inputs.Audio(source="microphone", type="numpy",
                        label="πŸŽ™οΈ Speak a sentence (English)"),
        gr.inputs.Dropdown(
            ["phones", "STT", "TTS"],
            type="value", default="phones",
            label="ASR engine"),
        gr.inputs.Dropdown(
            ["pool", "random", "pool raw"],
            type="value", default="pool",
            label="Privacy mode"),
        gr.inputs.Dropdown(
            ["Libri100", "Libri100 + finetuned", "Libri600",
             "Libri600 + finetuned"],
            type="value", default="Libri100",
            label="TTS voice")
    ],
    outputs=gr.outputs.Audio(type="numpy", label="πŸ”Š Anonymized audio"),
    layout="vertical",
    title="anamedi Ghost – Voice Privacy Demo",
    description=description,
    theme="default",
    allow_flagging="never",
    article=article,
    allow_screenshot=False
)
iface.launch(enable_queue=True)