commit files to HF hub
Browse files- .gitattributes +1 -0
- README.md +13 -0
- char.dict +28 -0
- model.ckpt +3 -0
- model.py +90 -0
.gitattributes
CHANGED
@@ -15,3 +15,4 @@
|
|
15 |
*.pt filter=lfs diff=lfs merge=lfs -text
|
16 |
*.pth filter=lfs diff=lfs merge=lfs -text
|
17 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
15 |
*.pt filter=lfs diff=lfs merge=lfs -text
|
16 |
*.pth filter=lfs diff=lfs merge=lfs -text
|
17 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
18 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
README.md
ADDED
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
---
|
2 |
+
library_name: superb
|
3 |
+
benchmark: superb
|
4 |
+
task: asr
|
5 |
+
tags:
|
6 |
+
- superb
|
7 |
+
- automatic-speech-recognition
|
8 |
+
widget:
|
9 |
+
- label: Librispeech sample 1
|
10 |
+
src: https://cdn-media.huggingface.co/speech_samples/sample1.flac
|
11 |
+
---
|
12 |
+
|
13 |
+
# Test for s3prl push to hub after fine-tuning
|
char.dict
ADDED
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
| 1980202
|
2 |
+
E 1091870
|
3 |
+
T 789572
|
4 |
+
A 689048
|
5 |
+
O 647720
|
6 |
+
N 591778
|
7 |
+
I 585614
|
8 |
+
H 557204
|
9 |
+
S 545238
|
10 |
+
R 499568
|
11 |
+
D 380912
|
12 |
+
L 344952
|
13 |
+
U 242014
|
14 |
+
M 217730
|
15 |
+
C 210734
|
16 |
+
W 204598
|
17 |
+
F 195086
|
18 |
+
G 174098
|
19 |
+
Y 168548
|
20 |
+
P 146722
|
21 |
+
B 129608
|
22 |
+
V 81496
|
23 |
+
K 65070
|
24 |
+
' 19660
|
25 |
+
X 12530
|
26 |
+
J 12062
|
27 |
+
Q 8164
|
28 |
+
Z 4916
|
model.ckpt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:db19fda354b289dfba72cf17ccbd3c8d69e351036ed2f91f9167fa005f92d014
|
3 |
+
size 513965711
|
model.py
ADDED
@@ -0,0 +1,90 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
This is just an example of what people would submit for
|
3 |
+
inference.
|
4 |
+
"""
|
5 |
+
|
6 |
+
from s3prl.downstream.runner import Runner
|
7 |
+
from typing import Dict
|
8 |
+
import torch
|
9 |
+
import os
|
10 |
+
|
11 |
+
|
12 |
+
class PreTrainedModel(Runner):
    """CPU-only inference wrapper around a fine-tuned s3prl downstream ASR model."""

    def __init__(self, path=""):
        """Load ``model.ckpt`` from *path* and initialize the underlying Runner.

        Args:
            path: Directory that contains ``model.ckpt`` and ``char.dict``.
                  Defaults to the current working directory.
        """
        checkpoint_path = os.path.join(path, "model.ckpt")
        # NOTE(review): torch.load unpickles arbitrary objects; only load
        # checkpoints from a trusted source.
        checkpoint = torch.load(checkpoint_path, map_location='cpu')

        # Re-point the saved arguments at this local copy of the checkpoint
        # and force CPU inference.
        checkpoint["Args"].init_ckpt = checkpoint_path
        checkpoint["Args"].mode = "inference"
        checkpoint["Args"].device = "cpu"
        # The character vocabulary ships next to the checkpoint, so override
        # whatever path was recorded at training time.
        checkpoint["Config"]["downstream_expert"]["datarc"]["dict_path"] = os.path.join(path, 'char.dict')

        Runner.__init__(self, checkpoint["Args"], checkpoint["Config"])

    def __call__(self, inputs) -> Dict[str, str]:
        """Transcribe one utterance.

        Args:
            inputs: Raw waveform samples (array-like of floats); assumed to be
                    sampled at 16 kHz — TODO confirm against the upstream model.

        Returns:
            A dict of the form ``{"text": <transcription>}``.
        """
        # Ensure every sub-module (upstream, featurizer, downstream) is in
        # evaluation mode before running inference.
        for module_entry in self.all_entries:
            module_entry.model.eval()

        # s3prl models consume a batch (list) of 1-D float tensors.
        wavs = [torch.FloatTensor(inputs)]

        with torch.no_grad():
            upstream_features = self.upstream.model(wavs)
            downstream_features = self.featurizer.model(wavs, upstream_features)
            predictions = self.downstream.model.inference(downstream_features, [])

        return {"text": predictions[0]}
|
45 |
+
|
46 |
+
|
47 |
+
"""
|
48 |
+
import subprocess
|
49 |
+
import numpy as np
|
50 |
+
from datasets import load_dataset
|
51 |
+
# This is already done in the Inference API
|
52 |
+
def ffmpeg_read(bpayload: bytes, sampling_rate: int) -> np.array:
|
53 |
+
ar = f"{sampling_rate}"
|
54 |
+
ac = "1"
|
55 |
+
format_for_conversion = "f32le"
|
56 |
+
ffmpeg_command = [
|
57 |
+
"ffmpeg",
|
58 |
+
"-i",
|
59 |
+
"pipe:0",
|
60 |
+
"-ac",
|
61 |
+
ac,
|
62 |
+
"-ar",
|
63 |
+
ar,
|
64 |
+
"-f",
|
65 |
+
format_for_conversion,
|
66 |
+
"-hide_banner",
|
67 |
+
"-loglevel",
|
68 |
+
"quiet",
|
69 |
+
"pipe:1",
|
70 |
+
]
|
71 |
+
|
72 |
+
ffmpeg_process = subprocess.Popen(
|
73 |
+
ffmpeg_command, stdin=subprocess.PIPE, stdout=subprocess.PIPE
|
74 |
+
)
|
75 |
+
output_stream = ffmpeg_process.communicate(bpayload)
|
76 |
+
out_bytes = output_stream[0]
|
77 |
+
|
78 |
+
audio = np.frombuffer(out_bytes, np.float32).copy()
|
79 |
+
if audio.shape[0] == 0:
|
80 |
+
raise ValueError("Malformed soundfile")
|
81 |
+
return audio
|
82 |
+
|
83 |
+
|
84 |
+
model = PreTrainedModel()
|
85 |
+
ds = load_dataset("patrickvonplaten/librispeech_asr_dummy", "clean", split="validation")
|
86 |
+
filename = ds[0]["file"]
|
87 |
+
with open(filename, "rb") as f:
|
88 |
+
data = ffmpeg_read(f.read(), 16000)
|
89 |
+
print(model(data))
|
90 |
+
"""
|