# Page-scrape residue (not Python) -- commented out so the file parses:
#   Spaces: Running / Running
#   File size: 5,799 Bytes
#   574ab7e fd30b9c ... (commit hashes and a line-number ruler from the hosting page)
import os
from pathlib import Path

# NOTE: the gradio version pin must run BEFORE gradio is imported --
# re-installing after `import gradio` has no effect on the module already
# loaded in this process.  (Shelling out to pip is a Hugging Face Spaces
# hack; pinning gradio==3.2 in requirements.txt would be the cleaner fix.)
os.system("pip uninstall -y gradio")
os.system("pip install gradio==3.2")

import gradio as gr
import numpy as np
import torch

# Project-local demo wrappers around the TTS / ASR / anonymization models.
from demo_inference.demo_tts import DemoTTS
from demo_inference.demo_asr import DemoASR
from demo_inference.demo_anonymization import DemoAnonymizer
def pcm2float(sig, dtype='float32'):
    """Convert an integer PCM sample array to floating point in [-1.0, 1.0).

    Normalization scheme taken from
    https://gist.github.com/HudsonHuang/fbdf8e9af7993fe2a91620d3fb86a182

    Args:
        sig: array-like of signed or unsigned integer samples.
        dtype: target floating-point dtype (default ``'float32'``).

    Returns:
        ``np.ndarray`` of ``dtype`` with full integer range mapped onto
        [-1.0, 1.0).

    Raises:
        TypeError: if ``sig`` is not integer-typed, or ``dtype`` is not a
            floating-point type.
    """
    samples = np.asarray(sig)
    if samples.dtype.kind not in 'iu':
        raise TypeError("'sig' must be an array of integers")
    target = np.dtype(dtype)
    if target.kind != 'f':
        raise TypeError("'dtype' must be a floating point type")
    info = np.iinfo(samples.dtype)
    half_range = 2 ** (info.bits - 1)
    # For signed ints the offset is 0; for unsigned it re-centres around zero.
    zero_point = info.min + half_range
    return (samples.astype(target) - zero_point) / half_range
def float2pcm(sig, dtype='int16'):
    """Convert a floating-point signal in [-1.0, 1.0) to integer PCM.

    Inverse of :func:`pcm2float`; scheme taken from
    https://gist.github.com/HudsonHuang/fbdf8e9af7993fe2a91620d3fb86a182

    Args:
        sig: array-like of floating-point samples.
        dtype: target integer dtype (default ``'int16'``).

    Returns:
        ``np.ndarray`` of ``dtype``, with out-of-range values clipped to the
        integer type's limits.

    Raises:
        TypeError: if ``sig`` is not float-typed, or ``dtype`` is not an
            integer type.
    """
    samples = np.asarray(sig)
    if samples.dtype.kind != 'f':
        raise TypeError("'sig' must be a float array")
    target = np.dtype(dtype)
    if target.kind not in 'iu':
        raise TypeError("'dtype' must be an integer type")
    info = np.iinfo(target)
    half_range = 2 ** (info.bits - 1)
    zero_point = info.min + half_range
    scaled = samples * half_range + zero_point
    # Clip before the cast so overflow saturates instead of wrapping.
    return scaled.clip(info.min, info.max).astype(target)
class VPInterface:
    """Wires the ASR, anonymization and TTS demo models into one pipeline."""

    def __init__(self):
        # All three models share one device; prefer GPU when available.
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.path_to_tts_models = Path('models', 'tts')
        self.path_to_asr_model = Path('models', 'asr')
        self.path_to_anon_model = Path('models', 'anonymization')
        # Default model selection; swapped lazily by _check_models().
        self.synthesis_model = DemoTTS(model_paths=self.path_to_tts_models,
                                       model_tag='Libri100', device=self.device)
        self.asr_model = DemoASR(model_path=self.path_to_asr_model,
                                 model_tag='phones', device=self.device)
        self.anon_model = DemoAnonymizer(model_path=self.path_to_anon_model,
                                         model_tag='pool', device=self.device)

    def read(self, recording, asr_model_tag, anon_model_tag, tts_model_tag):
        """Anonymize one recording: ASR -> anonymized embedding -> TTS.

        ``recording`` is a ``(sample_rate, int16_pcm)`` pair as produced by
        the gradio microphone input; returns a ``(48000, int16_pcm)`` pair.
        """
        sample_rate, pcm = recording
        waveform = pcm2float(pcm)
        self._check_models(asr_model_tag, anon_model_tag, tts_model_tag)
        # The TTS front-end must know whether the transcript is phonemic.
        phonemic = self.asr_model.model_tag == 'phones'
        transcript = self.asr_model.recognize_speech(waveform, sample_rate)
        print(transcript)
        anon_embedding = self.anon_model.anonymize_embedding(waveform, sample_rate)
        print(anon_embedding)
        synthesized = self.synthesis_model.read_text(
            transcription=transcript,
            speaker_embedding=anon_embedding,
            text_is_phonemes=phonemic)
        return 48000, float2pcm(synthesized.cpu().numpy())

    def _check_models(self, asr_model_tag, anon_model_tag, tts_model_tag):
        """Re-instantiate any model whose requested tag differs from the loaded one."""
        if self.asr_model.model_tag != asr_model_tag:
            self.asr_model = DemoASR(model_path=self.path_to_asr_model,
                                     model_tag=asr_model_tag, device=self.device)
        if self.anon_model.model_tag != anon_model_tag:
            self.anon_model = DemoAnonymizer(model_path=self.path_to_anon_model,
                                             model_tag=anon_model_tag,
                                             device=self.device)
        if self.synthesis_model.model_tag != tts_model_tag:
            self.synthesis_model = DemoTTS(model_paths=self.path_to_tts_models,
                                           model_tag=tts_model_tag,
                                           device=self.device)
# Single shared pipeline instance; gradio invokes model.read for every request.
model = VPInterface()
# ── UI copy & look-and-feel for “anamedi Ghost” ────────────────────────────────
# NOTE(review): the original text contained mojibake ("β…" sequences) from a
# bad encoding round-trip; the punctuation/emoji below were reconstructed from
# context -- confirm against the original copy.
article = """
**anamedi Ghost** lets you **anonymize _and_ pseudonymize** short speech samples in a single click.

Under the hood the demo chains three micro-models:

* **ASR engine** – turns speech into text
* **Voice-privacy module** – scrambles the speaker embedding (pool/random/pool-raw)
* **TTS synthesizer** – renders the same words with a surrogate voice

Every time you hit **Submit** you’ll get a new, privacy-safe version of your input.

> **Heads-up**
> • This Space runs on CPU, so inference can take ~20 s.
> • The “pool” setting uses an ad-hoc scaler (`StandardScaler`) because we process just one sample at a time; in production Ghost uses a batch-optimised `MinMaxScaler`.
> • Quality is still work-in-progress—please report glitches!

🔗 Source & docs: <https://github.com/anamedi/ghost>
"""

description = """
## anamedi Ghost – Voice Privacy Demo (v0.1)
Try it out: record a short English sentence, pick your privacy mode, then listen to the anonymized result.
"""

# Brand accent color for the primary (Submit) button.
css = """
.gr-button-primary {
    background-color: #00b7c2 !important; /* anamedi teal */
    border-color: #00b7c2 !important;
}
"""
# Gradio UI definition (gradio 3.x `gr.inputs`/`gr.outputs` API).  The four
# inputs map positionally onto
# VPInterface.read(recording, asr_model_tag, anon_model_tag, tts_model_tag).
# NOTE(review): the emoji in the labels/title were mojibake in the original
# ("ποΈ", "π", "β"); plausible replacements chosen -- confirm.
iface = gr.Interface(
    fn=model.read,
    inputs=[
        gr.inputs.Audio(source="microphone", type="numpy",
                        label="🎙️ Speak a sentence (English)"),
        gr.inputs.Dropdown(
            ["phones", "STT", "TTS"],
            type="value", default="phones",
            label="ASR engine"),
        gr.inputs.Dropdown(
            ["pool", "random", "pool raw"],
            type="value", default="pool",
            label="Privacy mode"),
        gr.inputs.Dropdown(
            ["Libri100", "Libri100 + finetuned", "Libri600",
             "Libri600 + finetuned"],
            type="value", default="Libri100",
            label="TTS voice")
    ],
    outputs=gr.outputs.Audio(type="numpy", label="🔊 Anonymized audio"),
    layout="vertical",
    title="anamedi Ghost – Voice Privacy Demo",
    description=description,
    theme="default",
    css=css,  # fix: `css` was defined above but never passed to the UI
    allow_flagging="never",
    article=article,
    allow_screenshot=False
)

# fix: stray trailing "|" removed (it was a syntax error in the original)
iface.launch(enable_queue=True)