File size: 5,799 Bytes
574ab7e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fd30b9c
574ab7e
fd30b9c
 
574ab7e
fd30b9c
 
 
574ab7e
fd30b9c
574ab7e
fd30b9c
 
 
 
 
 
574ab7e
 
 
fd30b9c
 
574ab7e
 
 
fd30b9c
 
 
 
574ab7e
 
fd30b9c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
import os
import gradio as gr
import numpy as np
import torch
from pathlib import Path

# HACK: force-pin gradio to 3.2 at runtime (the legacy gr.inputs/gr.outputs API
# used below was removed in later gradio versions).
# NOTE(review): gradio was already imported above, so this reinstall only takes
# effect on the NEXT process start — the current run keeps the preinstalled
# version. Confirm the hosting platform restarts the app after first boot.
os.system("pip uninstall -y gradio")
os.system("pip install gradio==3.2")

from demo_inference.demo_tts import DemoTTS
from demo_inference.demo_asr import DemoASR
from demo_inference.demo_anonymization import DemoAnonymizer


def pcm2float(sig, dtype='float32'):
    """Convert integer PCM samples to floats in [-1.0, 1.0).

    Adapted from
    https://gist.github.com/HudsonHuang/fbdf8e9af7993fe2a91620d3fb86a182

    Raises TypeError if `sig` is not an integer array or `dtype` is not a
    floating point type.
    """
    samples = np.asarray(sig)
    if samples.dtype.kind not in 'iu':
        raise TypeError("'sig' must be an array of integers")
    target = np.dtype(dtype)
    if target.kind != 'f':
        raise TypeError("'dtype' must be a floating point type")

    info = np.iinfo(samples.dtype)
    # Half of the integer range; signed types map min..max onto -1.0..<1.0,
    # unsigned types are first re-centered around zero via `shift`.
    half_range = 2 ** (info.bits - 1)
    shift = info.min + half_range
    return (samples.astype(target) - shift) / half_range


def float2pcm(sig, dtype='int16'):
    """Convert float samples in [-1.0, 1.0] to integer PCM, clipping overflow.

    Adapted from
    https://gist.github.com/HudsonHuang/fbdf8e9af7993fe2a91620d3fb86a182

    Raises TypeError if `sig` is not a float array or `dtype` is not an
    integer type.
    """
    samples = np.asarray(sig)
    if samples.dtype.kind != 'f':
        raise TypeError("'sig' must be a float array")
    target = np.dtype(dtype)
    if target.kind not in 'iu':
        raise TypeError("'dtype' must be an integer type")

    info = np.iinfo(target)
    half_range = 2 ** (info.bits - 1)
    shift = info.min + half_range
    # Scale up, re-center for unsigned targets, then clip so +1.0 (which would
    # overflow a signed type by one) lands on the max representable value.
    scaled = samples * half_range + shift
    return scaled.clip(info.min, info.max).astype(target)


class VPInterface:
    """Glue object chaining ASR -> speaker anonymization -> TTS for the demo."""

    def __init__(self):
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'

        # Checkpoint directories, relative to the working directory.
        self.path_to_tts_models = Path('models', 'tts')
        self.path_to_asr_model = Path('models', 'asr')
        self.path_to_anon_model = Path('models', 'anonymization')

        # Default model variants; swapped lazily in _check_models when the
        # user picks a different tag in the UI.
        self.asr_model = DemoASR(model_path=self.path_to_asr_model,
                                 model_tag='phones', device=self.device)
        self.anon_model = DemoAnonymizer(model_path=self.path_to_anon_model,
                                         model_tag='pool', device=self.device)
        self.synthesis_model = DemoTTS(model_paths=self.path_to_tts_models,
                                       model_tag='Libri100', device=self.device)

    def read(self, recording, asr_model_tag, anon_model_tag, tts_model_tag):
        """Recognize, anonymize and re-synthesize one recording.

        `recording` is a (sample_rate, int PCM ndarray) pair as delivered by
        the gradio microphone widget; returns a (48000, int16 ndarray) pair.
        """
        sr, pcm = recording
        waveform = pcm2float(pcm)

        # Reload any model whose dropdown selection changed since last call.
        self._check_models(asr_model_tag, anon_model_tag, tts_model_tag)

        # The 'phones' ASR variant emits phoneme strings, which the TTS must
        # be told about explicitly.
        text_is_phonemes = (self.asr_model.model_tag == 'phones')
        transcription = self.asr_model.recognize_speech(waveform, sr)
        print(transcription)

        anon_embedding = self.anon_model.anonymize_embedding(waveform, sr)
        print(anon_embedding)

        syn_audio = self.synthesis_model.read_text(
            transcription=transcription,
            speaker_embedding=anon_embedding,
            text_is_phonemes=text_is_phonemes,
        )
        return 48000, float2pcm(syn_audio.cpu().numpy())

    def _check_models(self, asr_model_tag, anon_model_tag, tts_model_tag):
        """Reload only the models whose requested tag differs from the loaded one."""
        if self.asr_model.model_tag != asr_model_tag:
            self.asr_model = DemoASR(model_path=self.path_to_asr_model,
                                     model_tag=asr_model_tag, device=self.device)
        if self.anon_model.model_tag != anon_model_tag:
            self.anon_model = DemoAnonymizer(model_path=self.path_to_anon_model,
                                             model_tag=anon_model_tag,
                                             device=self.device)
        if self.synthesis_model.model_tag != tts_model_tag:
            self.synthesis_model = DemoTTS(model_paths=self.path_to_tts_models,
                                           model_tag=tts_model_tag,
                                           device=self.device)


model = VPInterface()

# ── UI copy & look-and-feel for β€œanamedi Ghost” ────────────────────────────────
article = """
**anamedi Ghost** lets you **anonymize _and_ pseudonymize** short speech samples in a single click.  
Under the hood the demo chains three micro-models:

* **ASR engine** β†’ turns speech into text  
* **Voice-privacy module** β†’ scrambles the speaker embedding (pool/random/pool-raw)  
* **TTS synthesizer** β†’ renders the same words with a surrogate voice

Every time you hit **Submit** you’ll get a new, privacy-safe version of your input.

> **Heads-up**  
> β€’ This Space runs on CPU, so inference can take ~20 s.  
> β€’ The β€œpool” setting uses an ad-hoc scaler (`StandardScaler`) because we process just one sample at a time; in production Ghost uses a batch-optimised `MinMaxScaler`.  
> β€’ Quality is still work-in-progressβ€”please report glitches!

πŸ”— Source & docs: <https://github.com/anamedi/ghost>
"""

description = """
## anamedi Ghost – Voice Privacy Demo (v0.1)
Try it out: record a short English sentence, pick your privacy mode, then listen to the anonymized result.
"""

css = """
.gr-button-primary {
    background-color: #00b7c2 !important;  /* anamedi teal */
    border-color: #00b7c2 !important;
}
"""

iface = gr.Interface(
    fn=model.read,
    inputs=[
        gr.inputs.Audio(source="microphone", type="numpy",
                        label="πŸŽ™οΈ Speak a sentence (English)"),
        gr.inputs.Dropdown(
            ["phones", "STT", "TTS"],
            type="value", default="phones",
            label="ASR engine"),
        gr.inputs.Dropdown(
            ["pool", "random", "pool raw"],
            type="value", default="pool",
            label="Privacy mode"),
        gr.inputs.Dropdown(
            ["Libri100", "Libri100 + finetuned", "Libri600",
             "Libri600 + finetuned"],
            type="value", default="Libri100",
            label="TTS voice")
    ],
    outputs=gr.outputs.Audio(type="numpy", label="πŸ”Š Anonymized audio"),
    layout="vertical",
    title="anamedi Ghost – Voice Privacy Demo",
    description=description,
    theme="default",
    allow_flagging="never",
    article=article,
    allow_screenshot=False
)
iface.launch(enable_queue=True)