EchoLLaMA: 3D-to-Speech with Multimodal AI
Collection
This collection contains the models and datasets used in the EchoLLaMA: 3D-to-Speech with Multimodal AI paper.
This Llama model was trained 2x faster with Unsloth and Hugging Face's TRL library.
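For context, below is a minimal LoRA fine-tuning sketch with Unsloth and TRL's SFTTrainer. The base-model name matches this card, but the dataset file, LoRA settings, and hyperparameters are illustrative placeholders, not the authors' training recipe (see the paper for the actual setup).

import torch
from unsloth import FastLanguageModel
from trl import SFTTrainer
from transformers import TrainingArguments
from datasets import load_dataset

# Load the base model in 4-bit and attach LoRA adapters (illustrative settings)
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="meta-llama/Llama-3.2-3B-Instruct",
    max_seq_length=2048,
    load_in_4bit=True,
)
model = FastLanguageModel.get_peft_model(
    model,
    r=16,
    lora_alpha=16,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                    "gate_proj", "up_proj", "down_proj"],
)

# Hypothetical dataset with a pre-formatted "text" column
dataset = load_dataset("json", data_files="train.jsonl", split="train")

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=2048,
    args=TrainingArguments(
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,
        num_train_epochs=1,
        learning_rate=2e-4,
        fp16=not torch.cuda.is_bf16_supported(),
        bf16=torch.cuda.is_bf16_supported(),
        logging_steps=10,
        output_dir="outputs",
    ),
)
trainer.train()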
import os
if "COLAB_" not in "".join(os.environ.keys()):
    !pip install unsloth
else:
    !pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3 peft trl triton cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf datasets huggingface_hub hf_transfer
    !pip install --no-deps unsloth
!pip install snac
import torch
from unsloth import FastLanguageModel
from snac import SNAC
from IPython.display import Audio, display
import numpy as np
# Token IDs used to frame the prompt and to post-process the generated audio codes
TOKENISER_LENGTH = 128256
START_OF_TEXT = 128000
END_OF_TEXT = 128009
START_OF_HUMAN = TOKENISER_LENGTH + 3
END_OF_HUMAN = TOKENISER_LENGTH + 4
START_OF_AI = TOKENISER_LENGTH + 5
END_OF_AI = TOKENISER_LENGTH + 6
GEN_START_TOKEN = 128259
GEN_EOS_TOKEN = 128258
GEN_END_EXTRA_TOKEN = 128260
GEN_REMOVE_TOKEN = 128258
CODE_OFFSET = 128266  # generated audio-code tokens are offset by this amount
def load_models(HF_TOKEN):
    # Load the fine-tuned Orpheus TTS model with Unsloth
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name="AquaLabs/Orpheus-3B-0.1-ft-Elise",
        max_seq_length=2048,
        token=HF_TOKEN
    )
    FastLanguageModel.for_inference(model)
    # Load the SNAC codec that decodes generated audio codes into a waveform
    snac_model = SNAC.from_pretrained("hubertsiuzdak/snac_24khz", token=HF_TOKEN)
    device = "cuda" if torch.cuda.is_available() else "cpu"
    snac_model = snac_model.to(device)
    return model, tokenizer, snac_model, device
def redistribute_codes(code_list, snac_model, device):
    # Split the flat 7-token groups back into SNAC's three codebook layers
    layer_1, layer_2, layer_3 = [], [], []
    num_groups = len(code_list) // 7
    for i in range(num_groups):
        group = code_list[7 * i: 7 * i + 7]
        layer_1.append(group[0])
        layer_2.append(group[1] - 4096)
        layer_3.append(group[2] - (2 * 4096))
        layer_3.append(group[3] - (3 * 4096))
        layer_2.append(group[4] - (4 * 4096))
        layer_3.append(group[5] - (5 * 4096))
        layer_3.append(group[6] - (6 * 4096))
    codes = [
        torch.tensor(layer_1).unsqueeze(0).to(device),
        torch.tensor(layer_2).unsqueeze(0).to(device),
        torch.tensor(layer_3).unsqueeze(0).to(device)
    ]
    # Decode the three code layers into a 24 kHz waveform
    audio_waveform = snac_model.decode(codes)
    return audio_waveform
def tts_pipeline(prompt, model, tokenizer, snac_model, device):
    # Frame the text prompt with the generation start/end tokens expected by the model
    input_ids_tensor = tokenizer(prompt, return_tensors="pt").input_ids.to(device)
    start_token = torch.tensor([[GEN_START_TOKEN]], dtype=torch.int64, device=device)
    end_tokens = torch.tensor([[END_OF_TEXT, GEN_END_EXTRA_TOKEN]], dtype=torch.int64, device=device)
    modified_input_ids = torch.cat([start_token, input_ids_tensor, end_tokens], dim=1)
    attention_mask = torch.ones_like(modified_input_ids, device=device)
    generated_ids = model.generate(
        input_ids=modified_input_ids,
        attention_mask=attention_mask,
        max_new_tokens=1200,
        do_sample=True,
        temperature=0.6,
        top_p=0.95,
        repetition_penalty=1.1,
        num_return_sequences=1,
        eos_token_id=GEN_EOS_TOKEN,
        use_cache=True
    )
    # Keep only the tokens after the last marker that starts the audio-code stream
    marker_token = 128257
    token_indices = (generated_ids == marker_token).nonzero(as_tuple=True)
    if len(token_indices[1]) > 0:
        last_marker = token_indices[1][-1].item()
        cropped_tensor = generated_ids[:, last_marker + 1:]
    else:
        cropped_tensor = generated_ids
    # Drop the EOS token, trim to a multiple of 7, and shift IDs back into SNAC code space
    processed_tokens = cropped_tensor[cropped_tensor != GEN_REMOVE_TOKEN]
    row_length = processed_tokens.size(0)
    new_length = (row_length // 7) * 7
    trimmed_tokens = processed_tokens[:new_length]
    code_list = (trimmed_tokens - CODE_OFFSET).tolist()
    audio_waveform = redistribute_codes(code_list, snac_model, device)
    return audio_waveform
if __name__ == "__main__":
    HF_TOKEN = "YOUR_TOKEN"  # replace with your Hugging Face access token
    model, tokenizer, snac_model, device = load_models(HF_TOKEN)
    prompt = "In the image, there is 2 man riding bike."
    audio_output = tts_pipeline(prompt, model, tokenizer, snac_model, device)
    # Convert the waveform tensor to a NumPy array and play it inline
    audio_array = audio_output.detach().cpu().numpy()
    audio_array = np.squeeze(audio_array)
    if audio_array.ndim not in [1, 2]:
        raise ValueError("Array audio input must be a 1D or 2D array, but got shape: " + str(audio_array.shape))
    display(Audio(audio_array, rate=24000))
    print("Audio generation complete.")
Details are provided in the paper.
Base model: meta-llama/Llama-3.2-3B-Instruct