lewtun HF staff committed on
Commit
9402a36
1 Parent(s): 6b77bc4

commit files to HF hub

Browse files
Files changed (4) hide show
  1. .gitattributes +1 -0
  2. README.md +11 -0
  3. model.ckpt +3 -0
  4. model.py +90 -0
.gitattributes CHANGED
@@ -15,3 +15,4 @@
15
  *.pt filter=lfs diff=lfs merge=lfs -text
16
  *.pth filter=lfs diff=lfs merge=lfs -text
17
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
15
  *.pt filter=lfs diff=lfs merge=lfs -text
16
  *.pth filter=lfs diff=lfs merge=lfs -text
17
  *tfevents* filter=lfs diff=lfs merge=lfs -text
18
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: superb
3
+ tags:
4
+ - superb
5
+ - automatic-speech-recognition
6
+ widget:
7
+ - label: Librispeech sample 1
8
+ src: https://cdn-media.huggingface.co/speech_samples/sample1.flac
9
+ ---
10
+
11
+ # Test for s3prl push to hub after fine-tuning
model.ckpt ADDED
@@ -0,0 +1,3 @@
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:36c592258dc00fcdff4c828b38415a567db4f48402fa940789e6e4b91840cb45
3
+ size 513965647
model.py ADDED
@@ -0,0 +1,90 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ This is just an example of what people would submit for
3
+ inference.
4
+ """
5
+
6
+ from s3prl.downstream.runner import Runner
7
+ from typing import Dict
8
+ import torch
9
+ import os
10
+
11
+
12
class PreTrainedModel(Runner):
    """s3prl downstream ASR model wrapped for Inference API usage."""

    def __init__(self, path=""):
        """Load the fine-tuned checkpoint found under *path* and build the runner.

        Args:
            path: Directory expected to contain ``model.ckpt`` and ``char.dict``.
        """
        checkpoint_path = os.path.join(path, "model.ckpt")
        # NOTE(review): torch.load unpickles arbitrary objects — only load
        # checkpoints from trusted sources.
        checkpoint = torch.load(checkpoint_path, map_location='cpu')

        # Patch the saved Args/Config in place so the Runner restores this
        # exact checkpoint in inference mode on CPU.
        args = checkpoint["Args"]
        args.init_ckpt = checkpoint_path
        args.mode = "inference"
        args.device = "cpu"  # Just to try in my computer

        config = checkpoint["Config"]
        config["downstream_expert"]["datarc"]["dict_path"] = os.path.join(path, 'char.dict')

        Runner.__init__(self, args, config)

    def __call__(self, inputs) -> Dict[str, str]:
        """Transcribe a raw waveform.

        Args:
            inputs (:obj:`np.array`):
                The raw waveform of audio received. By default at 16KHz.

        Return:
            A :obj:`dict` like ``{"text": "XXX"}`` containing the text
            detected in the input audio.
        """
        # Put every registered sub-model in eval mode before inference.
        for entry in self.all_entries:
            entry.model.eval()

        wavs = [torch.FloatTensor(inputs)]

        with torch.no_grad():
            hidden = self.upstream.model(wavs)
            hidden = self.featurizer.model(wavs, hidden)
            predictions = self.downstream.model.inference(hidden, [])

        return {"text": predictions[0]}
45
+
46
+
47
+ """
48
+ import subprocess
49
+ import numpy as np
50
+ from datasets import load_dataset
51
+ # This is already done in the Inference API
52
+ def ffmpeg_read(bpayload: bytes, sampling_rate: int) -> np.array:
53
+ ar = f"{sampling_rate}"
54
+ ac = "1"
55
+ format_for_conversion = "f32le"
56
+ ffmpeg_command = [
57
+ "ffmpeg",
58
+ "-i",
59
+ "pipe:0",
60
+ "-ac",
61
+ ac,
62
+ "-ar",
63
+ ar,
64
+ "-f",
65
+ format_for_conversion,
66
+ "-hide_banner",
67
+ "-loglevel",
68
+ "quiet",
69
+ "pipe:1",
70
+ ]
71
+
72
+ ffmpeg_process = subprocess.Popen(
73
+ ffmpeg_command, stdin=subprocess.PIPE, stdout=subprocess.PIPE
74
+ )
75
+ output_stream = ffmpeg_process.communicate(bpayload)
76
+ out_bytes = output_stream[0]
77
+
78
+ audio = np.frombuffer(out_bytes, np.float32).copy()
79
+ if audio.shape[0] == 0:
80
+ raise ValueError("Malformed soundfile")
81
+ return audio
82
+
83
+
84
+ model = PreTrainedModel()
85
+ ds = load_dataset("patrickvonplaten/librispeech_asr_dummy", "clean", split="validation")
86
+ filename = ds[0]["file"]
87
+ with open(filename, "rb") as f:
88
+ data = ffmpeg_read(f.read(), 16000)
89
+ print(model(data))
90
+ """