gautamtata
/

colab_test_model

Transformers

PyTorch

wav2vec2

Inference Endpoints

Model card Files Files and versions Community

gautamtata committed on Dec 13, 2023

Commit

c645001

•

1 Parent(s): 96fd59e

Update handler.py

Browse files

Files changed (1) hide show

handler.py +40 -45

handler.py CHANGED Viewed

@@ -1,55 +1,50 @@
-import torchaudio
 import torch
-from transformers import Wav2Vec2Processor, Wav2Vec2ForSpeechClassification, AutoConfig
-from torch.nn.functional import softmax
 from typing import Dict, List, Any
-# Suppose this handler is for a speech classification model
 class EndpointHandler():
-    def __init__(self, path=""):
-        # Assuming that the path contains all the necessary files for model and processor.
-        config = AutoConfig.from_pretrained(path)
-        self.processor = Wav2Vec2Processor.from_pretrained(path)
-        self.model = Wav2Vec2ForSpeechClassification.from_pretrained(path)
-        self.sampling_rate = self.processor.feature_extractor.sampling_rate
-        self.model.to('cuda' if torch.cuda.is_available() else 'cpu')
-    def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
-        """
-        Overriding call method to handle speech input and return classification result.
-        """
-        # Extract 'inputs' key from the data dictionary. This should be a path to the audio file.
-        audio_path = data.get('inputs', None)
-        if audio_path is None:
-            raise ValueError("Invalid input, 'inputs' key with path to the audio file is required.")
-        # Load and preprocess the audio file, and run prediction
-        outputs = self.predict(audio_path)
-        return outputs
     def predict(self, path):
-        """
-        Runs prediction on the provided audio file path.
-        """
-        # Load audio file
-        speech_array, _sampling_rate = torchaudio.load(path)
-        # Resample if necessary
-        if _sampling_rate != self.sampling_rate:
-            resampler = torchaudio.transforms.Resample(_sampling_rate, self.sampling_rate)
-            speech_array = resampler(speech_array)
-        speech_array = speech_array.squeeze().numpy()
-        # Preprocess audio input
-        inputs = self.processor(speech_array, sampling_rate=self.sampling_rate, return_tensors="pt", padding=True)
-        input_values = inputs.input_values.to('cuda' if torch.cuda.is_available() else 'cpu')
-        attention_mask = inputs.attention_mask.to('cuda' if torch.cuda.is_available() else 'cpu')
-        # Model inference
         with torch.no_grad():
             logits = self.model(input_values, attention_mask=attention_mask).logits
-        # Postprocessing
-        scores = softmax(logits, dim=1).detach().cpu().numpy()[0]
-        predictions = [{"label": self.config.id2label[i], "score": float(score)} for i, score in enumerate(scores)]
-        return predictions

+from transformers import AutoConfig, Wav2Vec2Processor, Wav2Vec2ForSpeechClassification
+from torch import nn
 import torch
+import torchaudio
+import torch.nn.functional as F
 from typing import Dict, List, Any
+# Custom inference handler for a wav2vec2 speech-classification model
 class EndpointHandler():
     """Serve a wav2vec2 speech-classification model behind an endpoint.

     The handler loads the config, processor and model once at construction
     time and, per request, classifies the audio file whose path is given
     under the ``"path"`` key of the request payload.
     """

     def __init__(self, model_path=""):
         """Load config, processor and model from ``model_path``.

         Args:
             model_path: Directory holding ``config.json`` together with the
                 processor and model files.
         """
         self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
         self.config = AutoConfig.from_pretrained(f"{model_path}/config.json")
         self.processor = Wav2Vec2Processor.from_pretrained(model_path)
         # NOTE(review): confirm `Wav2Vec2ForSpeechClassification` is really
         # importable from the installed transformers package; the stock
         # classification head is `Wav2Vec2ForSequenceClassification`.
         self.model = Wav2Vec2ForSpeechClassification.from_pretrained(model_path).to(self.device)

     def speech_file_to_array_fn(self, path):
         """Load an audio file as a mono 1-D NumPy array at the model's rate.

         Args:
             path: Location of the audio file, as accepted by
                 ``torchaudio.load``.

         Returns:
             A 1-D NumPy array of samples at the processor's sampling rate.
         """
         target_rate = self.processor.feature_extractor.sampling_rate
         waveform, source_rate = torchaudio.load(path)
         # A multi-channel clip would otherwise reach the processor as a 2-D
         # array and be handled as a batch, so only its first channel would be
         # scored. Average the channels into one instead.
         if waveform.dim() > 1 and waveform.size(0) > 1:
             waveform = waveform.mean(dim=0, keepdim=True)
         # Only build a resampler when the rates actually differ.
         if source_rate != target_rate:
             resampler = torchaudio.transforms.Resample(source_rate, target_rate)
             waveform = resampler(waveform)
         return waveform.squeeze().numpy()

     def _format_predictions(self, scores):
         """Pair each score with its label from ``self.config.id2label``.

         Scores are converted to built-in ``float`` so that the result can be
         JSON-serialised (NumPy scalars cannot).
         """
         id2label = self.config.id2label
         return [
             {"label": id2label[index], "score": float(score)}
             for index, score in enumerate(scores)
         ]

     def predict(self, path):
         """Classify the audio file at ``path``.

         Returns:
             One ``{"label": ..., "score": ...}`` dict per class, in class-index
             order, where ``score`` is the softmax probability as a ``float``.
         """
         speech = self.speech_file_to_array_fn(path)
         features = self.processor(
             speech,
             sampling_rate=self.processor.feature_extractor.sampling_rate,
             return_tensors="pt",
             padding=True,
         )
         input_values = features.input_values.to(self.device)
         attention_mask = features.attention_mask.to(self.device)
         with torch.no_grad():
             logits = self.model(input_values, attention_mask=attention_mask).logits
         # A single clip is submitted, so row 0 holds its class probabilities.
         scores = F.softmax(logits, dim=1).detach().cpu().numpy()[0]
         return self._format_predictions(scores)

     def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
         """Handle one inference request.

         Args:
             data: Request payload; must carry the audio file location under
                 the ``"path"`` key.

         Returns:
             The list produced by ``predict``. When ``"path"`` is missing or
             empty, a single ``{"error": ...}`` dict is returned instead (note
             that this case does not match the annotated list type).
         """
         path = data.get("path")
         if not path:
             return {"error": "Path to the audio file is required."}
         return self.predict(path)