Add model

Browse files

Files changed (4) hide show

.gitattributes +1 -0
char.dict +28 -0
hubert_asr.ckpt +3 -0
model.py +89 -0

.gitattributes CHANGED Viewed

@@ -15,3 +15,4 @@
 *.pt filter=lfs diff=lfs merge=lfs -text
 *.pth filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.pt filter=lfs diff=lfs merge=lfs -text
 *.pth filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+*.ckpt* filter=lfs diff=lfs merge=lfs -text

char.dict ADDED Viewed

	@@ -0,0 +1,28 @@

+| 1980202
+E 1091870
+T 789572
+A 689048
+O 647720
+N 591778
+I 585614
+H 557204
+S 545238
+R 499568
+D 380912
+L 344952
+U 242014
+M 217730
+C 210734
+W 204598
+F 195086
+G 174098
+Y 168548
+P 146722
+B 129608
+V 81496
+K 65070
+' 19660
+X 12530
+J 12062
+Q 8164
+Z 4916

hubert_asr.ckpt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:fff3c43e5ecff336801d7c436282780e9c0ea4dfbaf3fa3df55267190d5d5fd2
+size 513967537

model.py ADDED Viewed

	@@ -0,0 +1,89 @@

+"""
+This is just an example of what people would submit for
+inference.
+"""
+from s3prl.downstream.runner import Runner
+from typing import Dict
+import torch
+from datasets import load_dataset
+class PreTrainedModel(Runner):
+    def __init__(self):
+        """
+        Loads the model checkpoint and character dictionary from the local directory
+        """
+        ckp_file = "hubert_asr.ckpt"
+        ckp = torch.load(ckp_file, map_location='cpu')
+        ckp["Args"].init_ckpt = ckp_file
+        ckp["Args"].mode = "inference"
+        ckp["Args"].device = "cpu" # Just to try in my computer
+        ckp["Config"]["downstream_expert"]["datarc"]["dict_path"]='char.dict'
+        Runner.__init__(self, ckp["Args"], ckp["Config"])
+    def __call__(self, inputs)-> Dict[str, str]:
+        """
+        Args:
+            inputs (:obj:`np.array`):
+                The raw waveform of the received audio, sampled at 16 kHz by default.
+        Return:
+            A :obj:`dict`: The returned object should look like {"text": "XXX"}, containing
+            the text detected in the input audio.
+        """
+        for entry in self.all_entries:
+            entry.model.eval()
+        inputs = [torch.FloatTensor(inputs)]
+        with torch.no_grad():
+            features = self.upstream.model(inputs)
+            features = self.featurizer.model(inputs, features)
+            preds = self.downstream.model.inference(features, [])
+        return preds[0]
+"""
+import subprocess
+import numpy as np
+# This is already done in the Inference API
+def ffmpeg_read(bpayload: bytes, sampling_rate: int) -> np.array:
+    ar = f"{sampling_rate}"
+    ac = "1"
+    format_for_conversion = "f32le"
+    ffmpeg_command = [
+        "ffmpeg",
+        "-i",
+        "pipe:0",
+        "-ac",
+        ac,
+        "-ar",
+        ar,
+        "-f",
+        format_for_conversion,
+        "-hide_banner",
+        "-loglevel",
+        "quiet",
+        "pipe:1",
+    ]
+    ffmpeg_process = subprocess.Popen(
+        ffmpeg_command, stdin=subprocess.PIPE, stdout=subprocess.PIPE
+    )
+    output_stream = ffmpeg_process.communicate(bpayload)
+    out_bytes = output_stream[0]
+    audio = np.frombuffer(out_bytes, np.float32).copy()
+    if audio.shape[0] == 0:
+        raise ValueError("Malformed soundfile")
+    return audio
+model = PreTrainedModel()
+ds = load_dataset("patrickvonplaten/librispeech_asr_dummy", "clean", split="validation")
+filename = ds[0]["file"]
+with open(filename, "rb") as f:
+    data = ffmpeg_read(f.read(), 16000)
+    print(model(data))
+"""