Delete handler.py
handler.py +0 -75
handler.py
DELETED
@@ -1,75 +0,0 @@
import io

import numpy as np
import soundfile as sf
import torch
from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq


# Renamed class from InferenceHandler to EndpointHandler
class EndpointHandler:
    def __init__(self):
        self.processor = None
        self.model = None
        self.device = None

    def load(self, model_path):
        """
        Loads the model and processor from the specified path.
        """
        self.device = "cuda:0" if torch.cuda.is_available() else "cpu"
        print(f"Loading model on device: {self.device}")

        self.processor = AutoProcessor.from_pretrained(model_path)
        self.model = AutoModelForSpeechSeq2Seq.from_pretrained(
            model_path,
            torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
        )
        self.model.to(self.device)  # Only move to device; no BetterTransformer
        self.model.eval()  # Set model to evaluation mode

        # Set generation parameters: force Arabic transcription and
        # allow all tokens to be generated
        self.model.config.forced_decoder_ids = self.processor.get_decoder_prompt_ids(
            language="arabic", task="transcribe"
        )
        self.model.config.suppress_tokens = []

        print("Model and processor loaded successfully.")

    def preprocess(self, input_data):
        """
        Preprocesses the incoming audio data.
        input_data is bytes (audio file content).
        """
        # Read audio from bytes using soundfile
        audio, original_sampling_rate = sf.read(io.BytesIO(input_data), dtype="float32")

        # Ensure it's a 1-D array (convert to mono if stereo)
        if audio.ndim > 1:
            audio = audio.mean(axis=1)

        # Whisper expects 16 kHz audio. The feature extractor only validates
        # the sampling rate, it does not resample, so resample here if needed
        # (linear interpolation keeps this dependency-free)
        target_sampling_rate = self.processor.feature_extractor.sampling_rate
        if original_sampling_rate != target_sampling_rate:
            target_length = int(len(audio) * target_sampling_rate / original_sampling_rate)
            audio = np.interp(
                np.linspace(0.0, len(audio), num=target_length, endpoint=False),
                np.arange(len(audio)),
                audio,
            ).astype("float32")

        return audio, target_sampling_rate

    def predict(self, preprocessed_data):
        """
        Performs inference using the loaded model.
        """
        audio_array, sampling_rate = preprocessed_data

        # Use the processor to create log-mel input features; cast to the
        # model's dtype so fp16 weights on GPU receive fp16 inputs
        input_features = self.processor.feature_extractor(
            audio_array,
            sampling_rate=sampling_rate,
            return_tensors="pt",
        ).input_features.to(self.device, dtype=self.model.dtype)

        with torch.no_grad():
            generated_ids = self.model.generate(input_features, max_new_tokens=225)

        transcription = self.processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
        return transcription

    def postprocess(self, prediction_output):
        """
        Postprocesses the prediction output.
        """
        # For ASR, prediction output is already the string transcription
        return {"transcription": prediction_output}