# tajweed-ai-fastapi / src/model_processing.py
# Author: Arwaaaa — commit 0845b4d
import os
import torch
import soundfile as sf
import librosa
from src.model import load_model
# -----------------------------
# CONFIGURATION
# -----------------------------
# Pick the GPU when one is visible; otherwise fall back to CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
# Load the model and its processor once at import time so every call to
# transcribe() below reuses the same instances (no per-request reload).
# NOTE(review): load_model() is project code — assumed to return a
# (model, processor) pair compatible with .generate()/.batch_decode(); confirm in src/model.py.
model, processor = load_model()
model.to(device)
def transcribe(audio_path: str, target_sr: int = 16000) -> str:
    """Transcribe a single audio file with the module-level model.

    Args:
        audio_path: Path to an audio file readable by ``soundfile``.
        target_sr: Sample rate the model expects. Defaults to 16 kHz,
            which matches the original hard-coded behavior.

    Returns:
        The decoded transcription string for the input clip.
    """
    audio, sr = sf.read(audio_path)
    # Down-mix multi-channel audio to mono by averaging the channels.
    if audio.ndim > 1:
        audio = audio.mean(axis=1)
    # Convert to float32 once, *before* resampling: soundfile returns
    # float64 by default, so this halves the work librosa does and is the
    # dtype the feature extractor expects anyway.
    audio = audio.astype("float32")
    if sr != target_sr:
        audio = librosa.resample(audio, orig_sr=sr, target_sr=target_sr)
        sr = target_sr
    # Feature extraction -> model input features, moved to the model's device.
    inputs = processor(audio, sampling_rate=sr, return_tensors="pt").input_features.to(device)
    # Inference only — no gradients needed.
    with torch.no_grad():
        predicted_ids = model.generate(
            inputs,
            suppress_tokens=None,
            max_new_tokens=400,
        )
    # Decode token ids back to text; batch size is 1, so return element 0.
    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
    return transcription[0]