Add custom handler.py for Inference Endpoints
handler.py ADDED (+76 −0)
@@ -0,0 +1,76 @@
from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq
from optimum.bettertransformer import BetterTransformer
import torch
import torchaudio  # Needed for explicit resampling in preprocess()
import io
import soundfile as sf

class InferenceHandler:
    def __init__(self):
        self.processor = None
        self.model = None
        self.device = None

    def load(self, model_path):
        """
        Loads the model and processor from the specified path.
        """
        self.device = "cuda:0" if torch.cuda.is_available() else "cpu"
        print(f"Loading model on device: {self.device}")

        self.processor = AutoProcessor.from_pretrained(model_path)
        self.model = AutoModelForSpeechSeq2Seq.from_pretrained(
            model_path,
            torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
        )

        if torch.cuda.is_available():
            self.model = BetterTransformer.transform(self.model)  # Optimize for faster inference on GPU
        self.model.to(self.device)
        self.model.eval()  # Set model to evaluation mode

        # Set generation parameters
        self.model.config.forced_decoder_ids = self.processor.get_decoder_prompt_ids(
            language="arabic", task="transcribe"
        )
        self.model.config.suppress_tokens = []  # Allow all tokens to be generated

        print("Model and processor loaded successfully.")

    def preprocess(self, input_data):
        """
        Preprocesses the incoming audio data.
        input_data will be bytes (audio file content).
        """
        # Read audio from bytes using soundfile
        audio_bytes_io = io.BytesIO(input_data)
        audio, original_sampling_rate = sf.read(audio_bytes_io, dtype="float32")

        # Ensure it's a 1-D array
        if audio.ndim > 1:
            audio = audio.mean(axis=1)  # Convert to mono if stereo

        # Whisper's feature extractor expects 16 kHz audio and does not resample:
        # it raises a ValueError for any other rate, so resample explicitly here.
        target_sampling_rate = self.processor.feature_extractor.sampling_rate  # 16000 for Whisper
        if original_sampling_rate != target_sampling_rate:
            audio = torchaudio.functional.resample(
                torch.from_numpy(audio), original_sampling_rate, target_sampling_rate
            ).numpy()
            original_sampling_rate = target_sampling_rate

        return audio, original_sampling_rate

    def predict(self, preprocessed_data):
        """
        Performs inference using the loaded model.
        """
        audio_array, sampling_rate = preprocessed_data

        # Use the processor to create log-mel input features
        input_features = self.processor.feature_extractor(
            audio_array,
            sampling_rate=sampling_rate,  # Already 16 kHz after preprocess()
            return_tensors="pt",
        ).input_features.to(self.device, dtype=self.model.dtype)  # Match fp16 weights on GPU

        with torch.no_grad():
            generated_ids = self.model.generate(inputs=input_features, max_new_tokens=225)

        transcription = self.processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
        return transcription

+
def postprocess(self, prediction_output):
|
| 72 |
+
"""
|
| 73 |
+
Postprocesses the prediction output.
|
| 74 |
+
"""
|
| 75 |
+
# For ASR, prediction output is already the string transcription
|
| 76 |
+
return {"transcription": prediction_output}
|
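
To sanity-check the handler before deploying, the four methods can be chained by hand. A minimal local sketch, assuming this file is importable as handler.py; the checkpoint path "./whisper-checkpoint" and the audio file "sample.wav" are placeholders, not part of this commit:

from handler import InferenceHandler

handler = InferenceHandler()
handler.load("./whisper-checkpoint")  # placeholder: a Whisper checkpoint dir or Hub model ID

with open("sample.wav", "rb") as f:  # placeholder audio file
    audio_bytes = f.read()

audio, sampling_rate = handler.preprocess(audio_bytes)
text = handler.predict((audio, sampling_rate))
print(handler.postprocess(text))  # e.g. {"transcription": "..."}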
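
Once deployed, clients would send the raw audio bytes in the request body. A hedged sketch with requests, assuming the endpoint's serving stack routes requests through this handler; the endpoint URL and token below are placeholders to be taken from your own deployment:

import requests

ENDPOINT_URL = "https://<your-endpoint>.endpoints.huggingface.cloud"  # placeholder
HF_TOKEN = "hf_..."  # placeholder: your Hugging Face access token

with open("sample.wav", "rb") as f:
    audio_bytes = f.read()

response = requests.post(
    ENDPOINT_URL,
    headers={"Authorization": f"Bearer {HF_TOKEN}", "Content-Type": "audio/wav"},
    data=audio_bytes,
)
print(response.json())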