# Demo for 🎛️ EmoKnob


import os
import subprocess
import sys
import spaces

def install(package):
    """Force-reinstall ``package`` with pip for the running interpreter.

    Any installed copy is uninstalled first, then ``package`` is installed
    exactly as given.

    Args:
        package: A pip requirement string such as ``"gradio"``,
            ``"gradio==4.44.0"`` or ``"gradio>=4.0"``.

    Raises:
        subprocess.CalledProcessError: If the ``pip install`` step fails.
    """
    import re

    # ``pip uninstall`` only needs the distribution name. Cut the requirement at
    # the first character that can start a version specifier, extras, marker or
    # URL, so that forms other than ``name==version`` no longer fail to parse.
    package_name = re.split(r"[=<>!~\[;@\s]", package.strip(), maxsplit=1)[0]

    pip = [sys.executable, "-m", "pip"]
    try:
        subprocess.check_call(pip + ["uninstall", "-y", package_name])
        print(f"Successfully uninstalled {package}")
    except subprocess.CalledProcessError:
        # Best effort: a failed uninstall must not block the installation.
        print(f"Package {package} was not installed, proceeding with installation")
    subprocess.check_call(pip + ["install", package])

# install('pydantic==2.0.0')
# install('gradio==4.44.0')
# install('spacy==3.7')

# Production is the default; exporting PROD_MODE=local selects the local setup.
is_prod = os.environ.get('PROD_MODE') != 'local'

import pickle

import gradio as gr
import os

if not is_prod:
    import os

    # Local runs only: point every Hugging Face cache variable at the same
    # directory and make the locally built ffmpeg binaries reachable via PATH.
    os.environ.update(
        dict.fromkeys(
            (
                'HF_HOME',
                'TRANSFORMERS_CACHE',
                'HF_DATASETS_CACHE',
                'HF_METRICS_CACHE',
                'HF_MODULES_CACHE',
            ),
            '/proj/afosr/metavoice/cache',
        )
    )
    ffmpeg_path = '/home/hc3295/ffmpegg_build/bin'
    os.environ['PATH'] = os.pathsep.join((os.environ['PATH'], ffmpeg_path))


import shutil
import tempfile
import time
from pathlib import Path

import librosa
import torch
from huggingface_hub import snapshot_download

from fam.llm.adapters import FlattenedInterleavedEncodec2Codebook
from fam.llm.decoders import EncodecDecoder
from fam.llm.fast_inference_utils import build_model, main
from fam.llm.inference import (
    EncodecDecoder,
    InferenceConfig,
    Model,
    TiltedEncodec,
    TrainedBPETokeniser,
    get_cached_embedding,
    get_cached_file,    
    get_enhancer,
)
from fam.llm.utils import (
    check_audio_file,
    get_default_dtype,
    get_device,
    normalize_text,
)

# When True, the model-loading section below is skipped even if CUDA is present.
debug = False

# HTML notice built at import time; it only gets content when CUDA is missing.
# NOTE(review): DESCRIPTION is not referenced elsewhere in the visible file.
DESCRIPTION = ""
if not torch.cuda.is_available():
    DESCRIPTION += "\n<p>Running on CPU 🥶 This demo does not work on CPU.</p>"
# Everything below defines module-level globals (`_device`, `precision`, `smodel`,
# `model`, `tokenizer`, `model_size`, `first_stage_adapter`, `llm_second_stage`,
# `enhancer`) that `generate_sample` reads. Without CUDA (or with `debug` set)
# none of them exist, so `generate_sample` would fail with a NameError.
if torch.cuda.is_available():
    if not debug:
        model_name = "metavoiceio/metavoice-1B-v0.1"
        seed = 1337
        output_dir = "outputs"
        _dtype = get_default_dtype()
        _device = 'cuda:0'
        # Local directory of the model snapshot; the checkpoint paths below are
        # built relative to it.
        _model_dir = snapshot_download(repo_id=model_name)
        first_stage_adapter = FlattenedInterleavedEncodec2Codebook(end_of_audio_token=1024)
        # No-op self-assignment (kept as in the original).
        output_dir = output_dir
        os.makedirs(output_dir, exist_ok=True)

        # Second stage: configured without compilation and resumed from the
        # `second_stage.pt` checkpoint inside the snapshot directory.
        second_stage_ckpt_path = f"{_model_dir}/second_stage.pt"
        config_second_stage = InferenceConfig(
            ckpt_path=second_stage_ckpt_path,
            num_samples=1,
            seed=seed,
            device=_device,
            dtype=_dtype,
            compile=False,
            init_from="resume",
            output_dir=output_dir,
        )
        data_adapter_second_stage = TiltedEncodec(end_of_audio_token=1024)
        llm_second_stage = Model(
            config_second_stage, TrainedBPETokeniser, EncodecDecoder, data_adapter_fn=data_adapter_second_stage.decode
        )
        # Used by `generate_sample` to post-process the generated wav file.
        enhancer = get_enhancer("df")

        # Map the dtype name to a torch dtype. Only "float16" and "bfloat16" are
        # handled; any other value of `_dtype` raises KeyError here.
        precision = {"float16": torch.float16, "bfloat16": torch.bfloat16}[_dtype]
        # First stage plus speaker encoder (`smodel`), built with compilation on.
        model, tokenizer, smodel, model_size = build_model(
            precision=precision,
            checkpoint_path=Path(f"{_model_dir}/first_stage.pt"),
            spk_emb_ckpt_path=Path(f"{_model_dir}/speaker_encoder.pt"),
            device=_device,
            compile=True,
            compile_prefill=True,
        )

def _embed_reference(audio_path):
    """Return the speaker embedding of ``audio_path`` on the inference device/dtype."""
    local_path = get_cached_file(audio_path)
    check_audio_file(local_path)
    return get_cached_embedding(local_path, smodel).to(device=_device, dtype=precision)


@spaces.GPU
def generate_sample(
    text,
    emo_dir=None,
    source_path=None,
    emo_path=None,
    neutral_path=None,
    strength=0.1,
    top_p=0.95,
    guidance_scale=3.0,
    preset_dropdown=None,
    toggle=None,
):
    """Synthesise ``text`` with a speaker embedding shifted along an emotion direction.

    The speaker embedding of the chosen voice is moved by ``strength`` times an
    emotion direction, then fed to the first and second stage models. The
    direction is either a preset from ``ALL_EMO_DIRS`` or the normalised
    difference between the embeddings of an emotional and a neutral sample.

    Args:
        text: Text to speak.
        emo_dir: Emotion name. ``EMO_NAMES[0]`` means "derive the direction from
            ``emo_path`` / ``neutral_path``"; any other value must be a key of
            ``ALL_EMO_DIRS``.
        source_path: Audio of the voice to clone; ignored when ``toggle`` selects
            a preset voice.
        emo_path: Emotional reference audio (custom emotion only).
        neutral_path: Neutral reference audio (custom emotion only).
        strength: Scale applied to the emotion direction.
        top_p: Passed to the first stage as ``top_p``.
        guidance_scale: Passed to the first stage as ``guidance_scale``.
        preset_dropdown: Key into ``PRESET_VOICES`` when a preset voice is used.
        toggle: Voice mode; ``RADIO_CHOICES[0]`` selects the preset voice.

    Returns:
        Path of the generated ``.wav`` file, as a string.

    Raises:
        gr.Error: If a required input is missing, the emotion name is unknown,
            or the two custom emotion samples give identical embeddings.
    """
    print('text', text)
    print('emo_dir', emo_dir)
    print('source_path', source_path)
    print('emo_path', emo_path)
    print('neutral_path', neutral_path)
    print('strength', strength)
    print('top_p', top_p)
    print('guidance_scale', guidance_scale)

    # Fail early with a readable message instead of crashing inside the helpers
    # on a None path (the default UI state has no uploads at all).
    if not text:
        raise gr.Error("Please provide text to synthesise")

    if toggle == RADIO_CHOICES[0]:
        source_path = PRESET_VOICES[preset_dropdown]
    if not source_path:
        raise gr.Error("Please upload a voice sample to clone, or choose a preset voice.")
    source_emb = _embed_reference(source_path)

    if emo_dir == EMO_NAMES[0]:
        if not emo_path or not neutral_path:
            raise gr.Error(
                "Please upload both an emotional and a neutral sample, or choose a preset emotion."
            )
        direction = _embed_reference(emo_path) - _embed_reference(neutral_path)
        direction_norm = torch.norm(direction, p=2)
        # Identical embeddings would make the normalisation divide by zero.
        if direction_norm == 0:
            raise gr.Error("The emotional and neutral samples must differ to compute an emotion direction.")
        direction = direction / direction_norm
    elif emo_dir in ALL_EMO_DIRS:
        direction = torch.tensor(ALL_EMO_DIRS[emo_dir], device=_device, dtype=precision)
    else:
        raise gr.Error(f"Unknown emotion: {emo_dir!r}")

    edited_emb = (source_emb + strength * direction).to(device=_device, dtype=precision)

    temperature = 1.0
    text = normalize_text(text)

    # first stage LLM
    tokens = main(
        model=model,
        tokenizer=tokenizer,
        model_size=model_size,
        prompt=text,
        spk_emb=edited_emb,
        top_p=torch.tensor(top_p, device=_device, dtype=precision),
        guidance_scale=torch.tensor(guidance_scale, device=_device, dtype=precision),
        temperature=torch.tensor(temperature, device=_device, dtype=precision),
    )
    text_ids, extracted_audio_ids = first_stage_adapter.decode([tokens])

    b_speaker_embs = edited_emb.unsqueeze(0)

    # second stage LLM + multi-band diffusion model
    wav_files = llm_second_stage(
        texts=[text],
        encodec_tokens=[torch.tensor(extracted_audio_ids, dtype=torch.int32, device=_device).unsqueeze(0)],
        speaker_embs=b_speaker_embs,
        batch_size=1,
        guidance_scale=None,
        top_p=None,
        top_k=200,
        temperature=1.0,
        max_new_tokens=None,
    )

    wav_file = wav_files[0]
    output_path = str(wav_file) + ".wav"
    # Enhance into a temporary file, then overwrite the generated wav with it.
    with tempfile.NamedTemporaryFile(suffix=".wav") as enhanced_tmp:
        enhancer(output_path, enhanced_tmp.name)
        shutil.copy2(enhanced_tmp.name, output_path)
        print(f"\nSaved audio to {wav_file}.wav")

    return output_path


# Preset emotion directions keyed by emotion name; each value is converted with
# `torch.tensor(...)` in `generate_sample`.
# NOTE: unpickling can execute arbitrary code. `all_emo_dirs.pkl` must be a
# trusted file shipped with the app, never user-supplied data.
with open('all_emo_dirs.pkl', 'rb') as _emo_dirs_file:  # close the handle after loading
    ALL_EMO_DIRS = pickle.load(_emo_dirs_file)
# First entry is the "custom samples" option, followed by the preset emotions.
EMO_NAMES = ['Upload your own sample'] + list(ALL_EMO_DIRS)

# Character limit shown in the text box label and used when validating input.
MAX_CHARS = 220

# Voice modes; the first entry selects a preset voice, the second an upload.
RADIO_CHOICES = ["Preset voices", "Upload your voice"]

# Preset speaker name -> URL of its reference audio.
PRESET_VOICES = dict(
    Bria="https://cdn.themetavoice.xyz/speakers%2Fbria.mp3",  # female
    Alex="https://cdn.themetavoice.xyz/speakers/alex.mp3",  # male
    Jacob="https://cdn.themetavoice.xyz/speakers/jacob.wav",  # male
)


def denormalise_top_p(top_p):
    """Convert a slider value to a top_p, e.g. 0 -> 0.9, 5 -> 0.95, 10 -> 1.0."""
    base = 0.9
    shifted = top_p / 100 + base
    return round(shifted, 2)


def denormalise_guidance(guidance):
    """Linearly rescale a guidance value from the span 1..5 onto 1.0..3.0."""
    in_lo, in_hi = 1, 5
    out_lo, out_hi = 1, 3
    scaled = (guidance - in_lo) * (out_hi - out_lo)
    return out_lo + scaled / (in_hi - in_lo)


def _check_file_size(path, max_size_mb=50):
    """Reject uploads whose size reaches ``max_size_mb`` megabytes.

    Args:
        path: File to check; a falsy value (``None`` or ``""``) is skipped.
        max_size_mb: Size limit in MB (1 MB = 1024 * 1024 bytes). Files at or
            above this size are rejected.

    Raises:
        gr.Error: If the file is ``max_size_mb`` MB or larger.
    """
    if not path:
        return
    filesize_mb = os.path.getsize(path) / 1024 / 1024
    if filesize_mb >= max_size_mb:
        # The message reports the limit actually enforced; it used to say 20MB
        # while the check was against 50.
        raise gr.Error(
            f"Please upload a sample less than {max_size_mb}MB for voice cloning. Provided: {round(filesize_mb)} MB"
        )


def _handle_edge_cases(to_say, upload_target):
    """Validate the text to speak and, when given, the uploaded voice sample.

    Empty text raises ``gr.Error``. Text longer than ``MAX_CHARS`` only issues
    a ``gr.Warning``. A truthy ``upload_target`` is passed to
    ``check_audio_file`` and then ``_check_file_size``.
    """
    if not to_say:
        raise gr.Error("Please provide text to synthesise")

    n_chars = len(to_say)
    if n_chars > MAX_CHARS:
        gr.Warning(
            f"Max {MAX_CHARS} characters allowed. Provided: {n_chars} characters. Truncating and generating speech...Result at the end can be unstable as a result."
        )

    if upload_target:
        # The original note says this checks the duration is at least 30s;
        # that rule lives in check_audio_file — TODO confirm.
        check_audio_file(upload_target)
        _check_file_size(upload_target)


def tts(to_say, top_p, guidance, toggle, preset_dropdown, upload_target):
    """Synthesise ``to_say`` with a preset or uploaded voice via ``TTS_MODEL``.

    The slider values are denormalised, the inputs validated, and the text cut
    to ``MAX_CHARS`` characters before synthesis. Any exception is re-raised as
    ``gr.Error``.

    NOTE(review): ``TTS_MODEL`` is not defined in the visible part of this file
    and the UI wires ``generate_sample`` instead, so this looks like leftover
    code — confirm before relying on it.
    """
    try:
        nucleus_p = denormalise_top_p(top_p)
        guidance_scale = denormalise_guidance(guidance)

        _handle_edge_cases(to_say, upload_target)

        # Slicing leaves shorter text unchanged and truncates longer text.
        clipped = to_say[:MAX_CHARS]

        return TTS_MODEL.synthesise(
            text=clipped,
            spk_ref_path=PRESET_VOICES[preset_dropdown] if toggle == RADIO_CHOICES[0] else upload_target,
            top_p=nucleus_p,
            guidance_scale=guidance_scale,
        )
    except Exception as err:
        raise gr.Error(f"Something went wrong. Reason: {err}")


def change_voice_selection_layout(choice):
    """Return visibility updates for the preset row and the upload row.

    Exactly one of the two is visible: the first when ``choice`` is
    ``RADIO_CHOICES[0]``, otherwise the second.
    """
    use_preset = choice == RADIO_CHOICES[0]
    return [gr.update(visible=use_preset), gr.update(visible=not use_preset)]

def change_emotion_selection_layout(choice):
    """Return a one-item visibility update, visible only when ``choice`` is ``EMO_NAMES[0]``."""
    show_uploads = choice == EMO_NAMES[0]
    return [gr.update(visible=show_uploads)]

# Page heading rendered through gr.Markdown. The stray ``</style>`` closing tag
# that used to precede the heading had no matching opening tag and was removed.
title = """
<h1 style="margin-top: 10px;" class="page-title">Demo for <span style="margin-left: 10px;background-color: #E0FEE4;padding: 15px;border-radius: 10px;">🎛️ EmoKnob</span></h1>
"""

# Introductory markdown shown below the heading.
description = """
- While existing TTS services do not allow fine-grained control over emotions, EmoKnob allows users to control emotion in speech with few-shot samples.
- In this demo, you can select from a few preset voices and upload your own emotional samples to clone.
- You can then use preset emotion or upload your own emotional-neutral sample pair to control emotions.
- You can adjust the strength of the emotion by using the slider.


EmoKnob uses [MetaVoice](https://github.com/metavoiceio/metavoice-src) as its voice cloning backbone.
"""

# Gradio UI: inputs in the left column, generated audio in the right column.
with gr.Blocks(title="EmoKnob Demo") as demo:
    gr.Markdown(title)
    # The description is rendered once; it used to be added a second time in a
    # row below the teaser image.
    gr.Markdown(description)
    gr.Image("emo-knob-teaser-1.svg", show_label=False, container=False)

    with gr.Row():
        with gr.Column():
            to_say = gr.TextArea(
                label=f"What should I say!? (max {MAX_CHARS} characters).",
                lines=4,
                value="To be or not to be, that is the question.",
            )

            with gr.Row(), gr.Column():
                # voice settings
                # The slider value goes to `generate_sample` unchanged as `top_p`,
                # so it must be in real units. The previous 0-10 range with
                # step 1 produced 0 or values >= 1 as soon as it was moved. The
                # range matches the one documented on `denormalise_top_p`.
                top_p = gr.Slider(
                    value=0.95,
                    minimum=0.9,
                    maximum=1.0,
                    step=0.01,
                    label="Speech Stability - improves text following for a challenging speaker",
                )
                guidance = gr.Slider(
                    value=3.0,
                    minimum=1.0,
                    maximum=5.0,
                    step=1.0,
                    label="Speaker similarity - How closely to match speaker identity and speech style.",
                )

                strength = gr.Slider(
                    value=0.1,
                    minimum=0.0,
                    maximum=5.0,
                    step=0.01,
                    label="Strength - how strong the emotion is. Setting it to too large a value may result in unstable output.",
                )

                # voice select
                toggle = gr.Radio(choices=RADIO_CHOICES, label="Choose voice", value=RADIO_CHOICES[0])

            # Preset voice picker with previews (shown by default).
            with gr.Row(visible=True) as row_1:
                preset_dropdown = gr.Dropdown(
                    PRESET_VOICES.keys(), label="Preset voices", value=list(PRESET_VOICES.keys())[0]
                )
                with gr.Accordion("Preview: Preset voices", open=False):
                    for label, path in PRESET_VOICES.items():
                        gr.Audio(value=path, label=label)

            # Voice upload (hidden until "Upload your voice" is selected).
            with gr.Row(visible=False) as row_2:
                upload_target = gr.Audio(
                    sources=["upload"],
                    type="filepath",
                    label="Upload a clean sample to clone.",
                )
            with gr.Row():
                emotion_name = gr.Radio(choices=EMO_NAMES, label="Emotion", value=EMO_NAMES[0])
            # Sample pair used when the emotion is "Upload your own sample".
            with gr.Row(visible=True) as row_3:
                upload_neutral = gr.Audio(
                    sources=["upload"],
                    type="filepath",
                    label="Upload a neutral sample to compute the emotion direction. Should be same speaker as the emotional sample.",
                )

                upload_emo = gr.Audio(
                    sources=["upload"],
                    type="filepath",
                    label="Upload an emotional sample to compute the emotion direction. Should be same speaker as the neutral sample.",
                )

            # Swap the preset / upload rows when the voice mode changes.
            toggle.change(
                change_voice_selection_layout,
                inputs=toggle,
                outputs=[row_1, row_2],
            )

            # Disabled in the original: would toggle row_3 with the emotion choice.
            # emotion_name.change(
            #     change_emotion_selection_layout,
            #     inputs=emotion_name,
            #     outputs=[row_3],
            # )

        with gr.Column():
            speech = gr.Audio(
                type="filepath",
                label="Model says...",
            )

    submit = gr.Button("Generate Speech")
    # The input order must match the positional parameters of `generate_sample`.
    submit.click(
        fn=generate_sample,
        inputs=[to_say, emotion_name, upload_target, upload_emo, upload_neutral, strength, top_p, guidance, preset_dropdown, toggle],
        outputs=speech,
    )


demo.launch()