# updated the UI — jempf (commit fd30b9c)
import os

# Pin gradio to 3.2 BEFORE importing it: re-installing after `import gradio`
# has no effect on the module already loaded into this process.
os.system("pip uninstall -y gradio")
os.system("pip install gradio==3.2")

import gradio as gr
import numpy as np
import torch
from pathlib import Path

from demo_inference.demo_tts import DemoTTS
from demo_inference.demo_asr import DemoASR
from demo_inference.demo_anonymization import DemoAnonymizer
def pcm2float(sig, dtype='float32'):
    """Convert integer PCM samples to floats in [-1.0, 1.0).

    Adapted from
    https://gist.github.com/HudsonHuang/fbdf8e9af7993fe2a91620d3fb86a182

    Raises TypeError if *sig* is not integer-typed or *dtype* is not a
    floating-point type.
    """
    sig = np.asarray(sig)
    if sig.dtype.kind not in 'iu':
        raise TypeError("'sig' must be an array of integers")
    target = np.dtype(dtype)
    if target.kind != 'f':
        raise TypeError("'dtype' must be a floating point type")

    info = np.iinfo(sig.dtype)
    # Full-scale magnitude of the signed integer type, e.g. 32768 for int16.
    scale = 2 ** (info.bits - 1)
    # Zero for signed types; recenters unsigned types around zero.
    shift = info.min + scale
    return (sig.astype(target) - shift) / scale
def float2pcm(sig, dtype='int16'):
    """Convert floats in [-1.0, 1.0] to integer PCM samples (with clipping).

    Adapted from
    https://gist.github.com/HudsonHuang/fbdf8e9af7993fe2a91620d3fb86a182

    Raises TypeError if *sig* is not float-typed or *dtype* is not an
    integer type.
    """
    sig = np.asarray(sig)
    if sig.dtype.kind != 'f':
        raise TypeError("'sig' must be a float array")
    target = np.dtype(dtype)
    if target.kind not in 'iu':
        raise TypeError("'dtype' must be an integer type")

    info = np.iinfo(target)
    # Full-scale magnitude of the target integer type, e.g. 32768 for int16.
    scale = 2 ** (info.bits - 1)
    shift = info.min + scale
    scaled = sig * scale + shift
    # Clip to the representable range before the (truncating) cast.
    return scaled.clip(info.min, info.max).astype(target)
class VPInterface:
    """Glue object wiring the ASR → anonymization → TTS pipeline to the UI."""

    def __init__(self):
        # Prefer GPU when available; all sub-models share one device.
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'

        self.path_to_tts_models = Path('models', 'tts')
        self.path_to_asr_model = Path('models', 'asr')
        self.path_to_anon_model = Path('models', 'anonymization')

        # Default model variants; swapped lazily in _check_models().
        self.synthesis_model = DemoTTS(model_paths=self.path_to_tts_models,
                                       model_tag='Libri100', device=self.device)
        self.asr_model = DemoASR(model_path=self.path_to_asr_model,
                                 model_tag='phones', device=self.device)
        self.anon_model = DemoAnonymizer(model_path=self.path_to_anon_model,
                                         model_tag='pool', device=self.device)

    def read(self, recording, asr_model_tag, anon_model_tag, tts_model_tag):
        """Anonymize one (sample_rate, int_pcm_array) recording.

        Returns a (48000, int16_pcm_array) tuple for the Gradio audio output.
        """
        sr, audio = recording
        audio = pcm2float(audio)

        # Reload any sub-model whose UI selection changed since the last call.
        self._check_models(asr_model_tag, anon_model_tag, tts_model_tag)
        text_is_phonemes = (self.asr_model.model_tag == 'phones')

        text = self.asr_model.recognize_speech(audio, sr)
        print(text)  # surfaced in the Space logs for debugging

        speaker_embedding = self.anon_model.anonymize_embedding(audio, sr)
        print(speaker_embedding)  # surfaced in the Space logs for debugging

        syn_audio = self.synthesis_model.read_text(
            transcription=text,
            speaker_embedding=speaker_embedding,
            text_is_phonemes=text_is_phonemes,
        )
        # NOTE(review): output rate is hard-coded; presumably the TTS model
        # synthesizes at 48 kHz — confirm against DemoTTS.
        return 48000, float2pcm(syn_audio.cpu().numpy())

    def _check_models(self, asr_model_tag, anon_model_tag, tts_model_tag):
        """Re-instantiate each sub-model whose tag differs from the request."""
        if asr_model_tag != self.asr_model.model_tag:
            self.asr_model = DemoASR(model_path=self.path_to_asr_model,
                                     model_tag=asr_model_tag,
                                     device=self.device)
        if anon_model_tag != self.anon_model.model_tag:
            self.anon_model = DemoAnonymizer(model_path=self.path_to_anon_model,
                                             model_tag=anon_model_tag,
                                             device=self.device)
        if tts_model_tag != self.synthesis_model.model_tag:
            self.synthesis_model = DemoTTS(model_paths=self.path_to_tts_models,
                                           model_tag=tts_model_tag,
                                           device=self.device)
model = VPInterface()
# ── UI copy & look-and-feel for β€œanamedi Ghost” ────────────────────────────────
article = """
**anamedi Ghost** lets you **anonymize _and_ pseudonymize** short speech samples in a single click.
Under the hood the demo chains three micro-models:
* **ASR engine** β†’ turns speech into text
* **Voice-privacy module** β†’ scrambles the speaker embedding (pool/random/pool-raw)
* **TTS synthesizer** β†’ renders the same words with a surrogate voice
Every time you hit **Submit** you’ll get a new, privacy-safe version of your input.
> **Heads-up**
> β€’ This Space runs on CPU, so inference can take ~20 s.
> β€’ The β€œpool” setting uses an ad-hoc scaler (`StandardScaler`) because we process just one sample at a time; in production Ghost uses a batch-optimised `MinMaxScaler`.
> β€’ Quality is still work-in-progressβ€”please report glitches!
πŸ”— Source & docs: <https://github.com/anamedi/ghost>
"""
description = """
## anamedi Ghost – Voice Privacy Demo (v0.1)
Try it out: record a short English sentence, pick your privacy mode, then listen to the anonymized result.
"""
css = """
.gr-button-primary {
background-color: #00b7c2 !important; /* anamedi teal */
border-color: #00b7c2 !important;
}
"""
# Dropdown choices for the three pipeline stages (gradio 3.2 API:
# components live under the gr.inputs / gr.outputs namespaces).
asr_choices = ["phones", "STT", "TTS"]
privacy_choices = ["pool", "random", "pool raw"]
tts_choices = ["Libri100", "Libri100 + finetuned",
               "Libri600", "Libri600 + finetuned"]

iface = gr.Interface(
    fn=model.read,
    inputs=[
        gr.inputs.Audio(source="microphone", type="numpy",
                        label="🎙️ Speak a sentence (English)"),
        gr.inputs.Dropdown(asr_choices, type="value", default="phones",
                           label="ASR engine"),
        gr.inputs.Dropdown(privacy_choices, type="value", default="pool",
                           label="Privacy mode"),
        gr.inputs.Dropdown(tts_choices, type="value", default="Libri100",
                           label="TTS voice"),
    ],
    outputs=gr.outputs.Audio(type="numpy", label="🔊 Anonymized audio"),
    layout="vertical",
    title="anamedi Ghost – Voice Privacy Demo",
    description=description,
    theme="default",
    allow_flagging="never",
    article=article,
    allow_screenshot=False,
)

# Queue requests so concurrent users are served one at a time.
iface.launch(enable_queue=True)