--- language: "en" inference: false tags: - Vocoder - HiFIGAN - speech-synthesis - speechbrain license: "apache-2.0" datasets: - LJSpeech ---

# Vocoder with HiFIGAN Unit ## Work In Progress .... ```python import pathlib as pl import numpy as np import matplotlib.pyplot as plt import torch import torchaudio from speechbrain.inference.vocoders import UnitHIFIGAN from speechbrain.lobes.models.huggingface_transformers import ( hubert, wav2vec2, wavlm, ) from speechbrain.lobes.models.huggingface_transformers.discrete_ssl import ( DiscreteSSL, ) ENCODER_CLASSES = { "HuBERT": hubert.HuBERT, "Wav2Vec2": wav2vec2.Wav2Vec2, "WavLM": wavlm.WavLM, } kmeans_folder = "poonehmousavi/SSL_Quantization" kmeans_dataset = "LJSpeech" # LibriSpeech-100-360-500 num_clusters = 1000 encoder_type = "HuBERT" # one of [HuBERT, Wav2Vec2, WavLM] encoder_source = "facebook/hubert-large-ll60k" layer = [3, 7, 12, 18, 23] vocoder_source = ( "chaanks/hifigan-unit-hubert-ll60k-l3-7-12-18-23-k1000-ljspeech-ljspeech" ) save_path = pl.Path(".tmpdir") device = "cuda" sample_rate = 16000 wav = "chaanks/hifigan-unit-hubert-ll60k-l3-7-12-18-23-k1000-ljspeech-ljspeech/test.wav" encoder_class = ENCODER_CLASSES[encoder_type] encoder = encoder_class( source=encoder_source, save_path=(save_path / "encoder").as_posix(), output_norm=False, freeze=True, freeze_feature_extractor=True, apply_spec_augment=False, output_all_hiddens=True, ).to(device) discrete_encoder = DiscreteSSL( save_path=(save_path / "discrete_encoder").as_posix(), ssl_model=encoder, kmeans_dataset=kmeans_dataset, kmeans_repo_id=kmeans_folder, num_clusters=num_clusters, ) vocoder = UnitHIFIGAN.from_hparams( source=vocoder_source, run_opts={"device": str(device)}, savedir=(save_path / "vocoder").as_posix(), ) audio = vocoder.load_audio(wav) audio = audio.unsqueeze(0).to(device) deduplicates = [False for _ in layer] bpe_tokenizers = [None for _ in layer] tokens, _, _ = discrete_encoder( audio, SSL_layers=layer, deduplicates=deduplicates, bpe_tokenizers=bpe_tokenizers, ) tokens = tokens.cpu().squeeze(0) num_layer = len(layer) offsets = torch.arange(num_layer) * num_clusters tokens = tokens + offsets waveform = vocoder.decode_unit(tokens) torchaudio.save("pred.wav", waveform.cpu(), sample_rate=sample_rate) ```