---
language:
- ja
base_model:
- google/gemma-2-2b-jpn-it
pipeline_tag: audio-text-to-text
license: gemma
datasets:
- fixie-ai/common_voice_17_0
---

```py
import transformers
import librosa
import torch
import numpy as np
from typing import Dict, Any

MODEL_ID = "neody/ultravox-gemma-2-2b-jpn-it"
# Rate the audio is resampled to on load, and the fallback used by
# `preprocess` when the caller does not say what rate the audio has.
DEFAULT_SAMPLING_RATE = 16000

# Fall back to CPU (in float32) so the example still runs without a GPU;
# on CUDA machines this is the original "cuda" / bfloat16 configuration.
device = "cuda" if torch.cuda.is_available() else "cpu"
dtype = torch.bfloat16 if device == "cuda" else torch.float32

model = transformers.AutoModel.from_pretrained(MODEL_ID, trust_remote_code=True)
model.to(device, dtype=dtype)
processor = transformers.AutoProcessor.from_pretrained(
    MODEL_ID, trust_remote_code=True
)

path = "record.wav"
audio, sr = librosa.load(path, sr=DEFAULT_SAMPLING_RATE)


def preprocess(inputs: Dict[str, Any], device, dtype):
    """Turn a chat history plus an audio clip into model-ready inputs.

    Args:
        inputs: Request dictionary. Recognized keys:
            "turns": list of ``{"role": ..., "content": ...}`` chat messages
                (optional; a missing or ``None`` value means no history).
                The caller's list is never modified.
            "audio": waveform, typically a NumPy array (optional).
                float64 is cast to float32; int16 / int32 samples are cast
                to float32 and divided by 32768 / 2147483648 respectively.
            "prompt": text of the user turn that is added when audio is
                given and the history does not already end with a user
                turn (defaults to "<|audio|>").
            "sampling_rate": sampling rate of "audio" (defaults to 16000,
                with a printed warning when audio is present).
        device: Device the processor output is moved to.
        dtype: Dtype passed along with ``device`` to the output's ``.to``.

    Returns:
        The processor output after ``.to(device, dtype)``.
    """
    # Copy the history: appending the synthesized user turn to the caller's
    # own list would silently change their conversation state.
    turns = list(inputs.get("turns") or [])

    waveform = inputs.get("audio", None)
    # Convert to float32 if needed.
    if isinstance(waveform, np.ndarray):
        if waveform.dtype == np.float64:
            waveform = waveform.astype(np.float32)
        elif waveform.dtype == np.int16:
            waveform = waveform.astype(np.float32) / np.float32(32768.0)
        elif waveform.dtype == np.int32:
            waveform = waveform.astype(np.float32) / np.float32(2147483648.0)

    # Audio needs a user turn carrying the "<|audio|>" placeholder; add one
    # unless the history already ends with a user message.
    if waveform is not None and (not turns or turns[-1]["role"] != "user"):
        prompt = inputs.get("prompt", "<|audio|>")
        if "<|audio|>" not in prompt:
            print(
                "Prompt does not contain '<|audio|>', appending '<|audio|>' to the end of the prompt."
            )
            prompt += " <|audio|>"
        turns.append({"role": "user", "content": prompt})

    text = processor.tokenizer.apply_chat_template(
        turns, add_generation_prompt=True, tokenize=False
    )

    if "sampling_rate" not in inputs and waveform is not None:
        print(
            "No sampling rate provided, using default of 16kHz. We highly recommend providing the correct sampling rate."
        )

    output = processor(
        text=text,
        audio=waveform,
        sampling_rate=inputs.get("sampling_rate", DEFAULT_SAMPLING_RATE),
    )
    # Cast the audio features explicitly before moving the whole batch.
    if "audio_values" in output:
        output["audio_values"] = output["audio_values"].to(device, dtype)
    return output.to(device, dtype)


turns = []
model_inputs = preprocess(
    {"audio": audio, "turns": turns, "sampling_rate": sr},
    device,
    dtype,
)
generated = model.generate(**model_inputs, max_new_tokens=300)
# NOTE(review): `generate` presumably returns the prompt tokens followed by
# the new tokens, so the printed text may include the prompt - confirm.
print(processor.tokenizer.decode(generated.squeeze(), skip_special_tokens=True))
```