File size: 2,434 Bytes
aaf7d72 02cd5c3 d68a6c2 aaf7d72 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 |
---
language:
- ja
base_model:
- google/gemma-2-2b-jpn-it
pipeline_tag: audio-text-to-text
license: gemma
datasets:
- fixie-ai/common_voice_17_0
---
```py
import transformers
import librosa
import torch
import numpy as np
from typing import Dict, Any
# The model and its processor are fetched from the same Hub repository.
# trust_remote_code=True is passed to both loaders, as the repository ships
# its own model/processor code.
repo_id = "neody/ultravox-gemma-2-2b-jpn-it"
loader_kwargs = {"trust_remote_code": True}

model = transformers.AutoModel.from_pretrained(repo_id, **loader_kwargs)
model.to("cuda", dtype=torch.bfloat16)
processor = transformers.AutoProcessor.from_pretrained(repo_id, **loader_kwargs)

# Read the recording, asking librosa for a 16 kHz sampling rate; `sr` holds
# the rate that is later forwarded to preprocess().
path = "record.wav"
audio, sr = librosa.load(path, sr=16000)
def preprocess(inputs: Dict[str, Any], device, dtype):
    """Build the processor output for one request and move it to ``device``.

    Keys read from ``inputs``:
        "audio": optional waveform. NumPy arrays of dtype float64, int16 or
            int32 are converted to float32 (integer samples are divided by
            32768 / 2147483648 respectively); anything else is passed through.
        "turns": list of chat messages (dicts with "role" and "content").
            NOTE: when a user turn is added, it is appended to this list in
            place, so the caller's list is modified.
        "prompt": text of the user turn that is added when audio is present
            and the last turn is not already a user turn. Defaults to
            "<|audio|>"; the placeholder is appended if it is missing.
        "sampling_rate": rate forwarded to the processor (16000 if absent).

    Uses the module-level ``processor`` for both the chat template and the
    feature extraction.

    Args:
        inputs: request dictionary as described above.
        device: first argument forwarded to ``.to(...)`` on the output.
        dtype: second argument forwarded to ``.to(...)`` on the output.

    Returns:
        The processor output after ``.to(device, dtype)``.
    """
    turns: list = inputs.get("turns", [])
    audio = inputs.get("audio", None)

    # Normalise the sample format to float32 where the dtype is recognised.
    if isinstance(audio, np.ndarray):
        if audio.dtype == np.float64:
            audio = audio.astype(np.float32)
        else:
            integer_full_scale = (
                (np.int16, 32768.0),
                (np.int32, 2147483648.0),
            )
            for int_dtype, full_scale in integer_full_scale:
                if audio.dtype == int_dtype:
                    audio = audio.astype(np.float32) / np.float32(full_scale)
                    break

    # Audio needs a user turn to attach to; add one unless the conversation
    # already ends with a user message.
    if audio is not None and not (len(turns) > 0 and turns[-1]["role"] == "user"):
        prompt = inputs.get("prompt", "<|audio|>")
        if "<|audio|>" not in prompt:
            print(
                "Prompt does not contain '<|audio|>', appending '<|audio|>' to the end of the prompt."
            )
            prompt = prompt + " <|audio|>"
        turns.append({"role": "user", "content": prompt})

    text = processor.tokenizer.apply_chat_template(
        turns, add_generation_prompt=True, tokenize=False
    )

    sampling_rate = inputs.get("sampling_rate", 16000)
    if audio is not None and "sampling_rate" not in inputs:
        print(
            "No sampling rate provided, using default of 16kHz. We highly recommend providing the correct sampling rate."
        )

    output = processor(text=text, audio=audio, sampling_rate=sampling_rate)

    # The audio features are converted explicitly before the whole output is
    # moved with the same device/dtype arguments.
    if "audio_values" in output:
        output["audio_values"] = output["audio_values"].to(device, dtype)
    return output.to(device, dtype)
# Run one request: no prior conversation, just the loaded recording.
turns = []
model_inputs = preprocess(
    {"audio": audio, "turns": turns, "sampling_rate": sr},
    "cuda",
    torch.bfloat16,
)
generated_ids = model.generate(**model_inputs, max_new_tokens=300)

# squeeze() removes size-1 dimensions so decode() receives a flat id
# sequence (assumes generate() returned a single sequence — TODO confirm).
decoded = processor.tokenizer.decode(
    generated_ids.squeeze(),
    skip_special_tokens=True,
)
print(decoded)
```