EchoLLaMA: 3D-to-Speech with Multimodal AI
Collection
This collection contains the models and datasets used in the EchoLLaMA: 3D-to-Speech with Multimodal AI paper.
This Llama model was trained 2x faster with Unsloth and Hugging Face's TRL library.
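For context, below is a minimal LoRA fine-tuning sketch with Unsloth and TRL's SFTTrainer. The base-model name matches this card, but the dataset file, LoRA settings, and hyperparameters are illustrative placeholders, not the authors' training recipe (see the paper for the actual setup).

import torch
from unsloth import FastLanguageModel
from trl import SFTTrainer
from transformers import TrainingArguments
from datasets import load_dataset

# Load the base model in 4-bit and attach LoRA adapters (illustrative settings)
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="meta-llama/Llama-3.2-3B-Instruct",
    max_seq_length=2048,
    load_in_4bit=True,
)
model = FastLanguageModel.get_peft_model(
    model,
    r=16,
    lora_alpha=16,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                    "gate_proj", "up_proj", "down_proj"],
)

# Hypothetical dataset with a pre-formatted "text" column
dataset = load_dataset("json", data_files="train.jsonl", split="train")

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=2048,
    args=TrainingArguments(
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,
        num_train_epochs=1,
        learning_rate=2e-4,
        fp16=not torch.cuda.is_bf16_supported(),
        bf16=torch.cuda.is_bf16_supported(),
        logging_steps=10,
        output_dir="outputs",
    ),
)
trainer.train()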
import os
if "COLAB_" not in "".join(os.environ.keys()):
    !pip install unsloth
else:
    !pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3 peft trl triton cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf datasets huggingface_hub hf_transfer
    !pip install --no-deps unsloth
!pip install snac
import torch
from unsloth import FastLanguageModel
from snac import SNAC
from IPython.display import Audio, display
import numpy as np
# Token IDs used to frame the prompt and to post-process the generated audio codes
TOKENISER_LENGTH = 128256
START_OF_TEXT = 128000
END_OF_TEXT = 128009
START_OF_HUMAN = TOKENISER_LENGTH + 3
END_OF_HUMAN = TOKENISER_LENGTH + 4
START_OF_AI = TOKENISER_LENGTH + 5
END_OF_AI = TOKENISER_LENGTH + 6
GEN_START_TOKEN = 128259
GEN_EOS_TOKEN = 128258
GEN_END_EXTRA_TOKEN = 128260
GEN_REMOVE_TOKEN = 128258
CODE_OFFSET = 128266  # generated audio-code tokens are offset by this amount
def load_models(HF_TOKEN):
    # Load the fine-tuned Orpheus TTS model with Unsloth
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name="AquaLabs/Orpheus-3B-0.1-ft-Elise",
        max_seq_length=2048,
        token=HF_TOKEN
    )
    FastLanguageModel.for_inference(model)
    # Load the SNAC codec that decodes generated audio codes into a waveform
    snac_model = SNAC.from_pretrained("hubertsiuzdak/snac_24khz", token=HF_TOKEN)
    device = "cuda" if torch.cuda.is_available() else "cpu"
    snac_model = snac_model.to(device)
    return model, tokenizer, snac_model, device
def redistribute_codes(code_list, snac_model, device):
    # Split the flat 7-token groups back into SNAC's three codebook layers
    layer_1, layer_2, layer_3 = [], [], []
    num_groups = len(code_list) // 7
    for i in range(num_groups):
        group = code_list[7 * i: 7 * i + 7]
        layer_1.append(group[0])
        layer_2.append(group[1] - 4096)
        layer_3.append(group[2] - (2 * 4096))
        layer_3.append(group[3] - (3 * 4096))
        layer_2.append(group[4] - (4 * 4096))
        layer_3.append(group[5] - (5 * 4096))
        layer_3.append(group[6] - (6 * 4096))
    codes = [
        torch.tensor(layer_1).unsqueeze(0).to(device),
        torch.tensor(layer_2).unsqueeze(0).to(device),
        torch.tensor(layer_3).unsqueeze(0).to(device)
    ]
    # Decode the three code layers into a 24 kHz waveform
    audio_waveform = snac_model.decode(codes)
    return audio_waveform
def tts_pipeline(prompt, model, tokenizer, snac_model, device):
    # Frame the text prompt with the generation start/end tokens expected by the model
    input_ids_tensor = tokenizer(prompt, return_tensors="pt").input_ids.to(device)
    start_token = torch.tensor([[GEN_START_TOKEN]], dtype=torch.int64, device=device)
    end_tokens = torch.tensor([[END_OF_TEXT, GEN_END_EXTRA_TOKEN]], dtype=torch.int64, device=device)
    modified_input_ids = torch.cat([start_token, input_ids_tensor, end_tokens], dim=1)
    attention_mask = torch.ones_like(modified_input_ids, device=device)
    generated_ids = model.generate(
        input_ids=modified_input_ids,
        attention_mask=attention_mask,
        max_new_tokens=1200,
        do_sample=True,
        temperature=0.6,
        top_p=0.95,
        repetition_penalty=1.1,
        num_return_sequences=1,
        eos_token_id=GEN_EOS_TOKEN,
        use_cache=True
    )
    # Keep only the tokens after the last marker that starts the audio-code stream
    marker_token = 128257
    token_indices = (generated_ids == marker_token).nonzero(as_tuple=True)
    if len(token_indices[1]) > 0:
        last_marker = token_indices[1][-1].item()
        cropped_tensor = generated_ids[:, last_marker + 1:]
    else:
        cropped_tensor = generated_ids
    # Drop the EOS token, trim to a multiple of 7, and shift IDs back into SNAC code space
    processed_tokens = cropped_tensor[cropped_tensor != GEN_REMOVE_TOKEN]
    row_length = processed_tokens.size(0)
    new_length = (row_length // 7) * 7
    trimmed_tokens = processed_tokens[:new_length]
    code_list = (trimmed_tokens - CODE_OFFSET).tolist()
    audio_waveform = redistribute_codes(code_list, snac_model, device)
    return audio_waveform
if __name__ == "__main__":
    HF_TOKEN = "YOUR_TOKEN"  # replace with your Hugging Face access token
    model, tokenizer, snac_model, device = load_models(HF_TOKEN)
    prompt = "In the image, there is 2 man riding bike."
    audio_output = tts_pipeline(prompt, model, tokenizer, snac_model, device)
    # Convert the waveform tensor to a NumPy array and play it inline
    audio_array = audio_output.detach().cpu().numpy()
    audio_array = np.squeeze(audio_array)
    if audio_array.ndim not in [1, 2]:
        raise ValueError("Array audio input must be a 1D or 2D array, but got shape: " + str(audio_array.shape))
    display(Audio(audio_array, rate=24000))
    print("Audio generation complete.")
Details are provided in the paper.
Base model: meta-llama/Llama-3.2-3B-Instruct