Working Example Code + Correct Transformers version.

#2
by tintwotin - opened

Spent a couple of hours getting this working. This post was a big help: https://huggingface.co/syvai/hviske-v5.3/discussions/1

This is my working code:

# Use: transformers==4.57.6

import time
import torch
import torchaudio.functional as F
import numpy as np
import soundfile as sf
from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq

# Configuration
# Model repository identifier handed to both `from_pretrained` calls below.
MODEL_ID = "syvai/hviske-v5.3"
# Sample rate (Hz) the audio is resampled to before it reaches the processor.
TARGET_SR = 16000
# Path of the audio file to transcribe (read with soundfile).
AUDIO_PATH = "my_audio.wav"

# 1. Load processor and model
# NOTE: trust_remote_code=True lets transformers execute Python code shipped
# with the model repository, so only use it with repositories you trust.
processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)

# Prefer the GPU, but fall back to the CPU so the script still runs on
# machines without CUDA instead of failing in `.to()`.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load the weights in float32, move them to the chosen device and switch the
# model to evaluation mode.
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    MODEL_ID,
    trust_remote_code=True,
    torch_dtype=torch.float32,
).to(device=device).eval()

# 2. Load and preprocess audio
def _load_mono_audio(path, target_sr):
    """Read *path* and return ``(samples, sample_rate)``.

    The samples are a float32 array; multi-dimensional input is averaged
    over axis 1, and the signal is resampled when the file's rate differs
    from *target_sr*.
    """
    samples, rate = sf.read(path)
    samples = np.asarray(samples, dtype=np.float32)

    # More than one dimension means several channels: average them to mono
    if samples.ndim > 1:
        samples = samples.mean(axis=1)

    # Already at the requested rate: nothing left to do
    if rate == target_sr:
        return samples, rate

    # torchaudio works on tensors, so round-trip through torch for resampling
    resampled = F.resample(
        torch.from_numpy(samples),
        orig_freq=rate,
        new_freq=target_sr,
    )
    return resampled.numpy(), target_sr


audio, sr = _load_mono_audio(AUDIO_PATH, TARGET_SR)

# 3. Prepare model inputs
inputs = processor(
    audio=audio,
    sampling_rate=sr,
    return_tensors="pt",
    language="da",
)

# Take the chunk index out before the device/dtype move (the move is reported
# to fail on this non-tensor entry); it is passed back in when decoding.
audio_chunk_index = inputs.pop("audio_chunk_index", None)
inputs = inputs.to(model.device, dtype=model.dtype)

# 4. Transformers 4.x Workaround
# Generation is reported to crash without a decoder attention mask, so supply
# an all-ones mask whenever the processor did not provide one.
if "decoder_attention_mask" not in inputs:
    if "decoder_input_ids" in inputs:
        # Same shape, dtype and device as the decoder prompt ids
        decoder_mask = torch.ones_like(inputs["decoder_input_ids"])
    else:
        # No decoder prompt: one mask position per batch item
        decoder_mask = torch.ones(
            (inputs["input_features"].shape[0], 1),
            dtype=torch.long,
            device=model.device,
        )
    inputs["decoder_attention_mask"] = decoder_mask

# 5. Generate transcription
print("Generating transcription...")
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# durations; time.time() follows the wall clock and can jump if the system
# time is adjusted while the model is running.
start_time = time.perf_counter()

# Inference only: no gradients needed. Output is capped at 256 new tokens.
with torch.no_grad():
    outputs = model.generate(**inputs, max_new_tokens=256)

# Decode only the first sequence in the batch (outputs[0]), handing back the
# chunk index that was taken out of the processor output earlier.
transcription = processor.decode(
    outputs[0],
    skip_special_tokens=True,
    audio_chunk_index=audio_chunk_index,
    language="da",
)

# The reported duration covers both generation and decoding.
elapsed = time.perf_counter() - start_time
print(f"\n--- TRANSCRIPTION (Took {elapsed:.1f}s) ---")
print(transcription)

Thank you for the script. I used it as inspiration for an OpenAI-compatible STT server running hviske in a Docker container:

https://github.com/osos/hviske-stt-server

Sign up or log in to comment