osanseviero
HF staff
committed on
Commit
7cfa43d
1 Parent(s): f93f7f8

Add first model

Browse files
Files changed (5) hide show
  1. .gitattributes +1 -0
  2. README.md +12 -0
  3. char.dict +28 -0
  4. hubert_asr.ckpt +3 -0
  5. model.py +91 -0
.gitattributes CHANGED
@@ -15,3 +15,4 @@
15
  *.pt filter=lfs diff=lfs merge=lfs -text
16
  *.pth filter=lfs diff=lfs merge=lfs -text
17
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
15
  *.pt filter=lfs diff=lfs merge=lfs -text
16
  *.pth filter=lfs diff=lfs merge=lfs -text
17
  *tfevents* filter=lfs diff=lfs merge=lfs -text
18
+ *.ckpt* filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ tags:
3
+ - superb
4
+ - automatic-speech-recognition
5
+ - benchmark:superb
6
+ library_name: superb
7
+ widget:
8
+ - label: Librispeech sample 1
9
+ src: https://cdn-media.huggingface.co/speech_samples/sample1.flac
10
+ ---
11
+
12
+ # Test for superb using hubert downstream ASR
char.dict ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ | 1980202
2
+ E 1091870
3
+ T 789572
4
+ A 689048
5
+ O 647720
6
+ N 591778
7
+ I 585614
8
+ H 557204
9
+ S 545238
10
+ R 499568
11
+ D 380912
12
+ L 344952
13
+ U 242014
14
+ M 217730
15
+ C 210734
16
+ W 204598
17
+ F 195086
18
+ G 174098
19
+ Y 168548
20
+ P 146722
21
+ B 129608
22
+ V 81496
23
+ K 65070
24
+ ' 19660
25
+ X 12530
26
+ J 12062
27
+ Q 8164
28
+ Z 4916
hubert_asr.ckpt ADDED
@@ -0,0 +1,3 @@
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fff3c43e5ecff336801d7c436282780e9c0ea4dfbaf3fa3df55267190d5d5fd2
3
+ size 513967537
model.py ADDED
@@ -0,0 +1,91 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ This is just an example of what people would submit for
3
+ inference.
4
+ """
5
+
6
+ from s3prl.downstream.runner import Runner
7
+ from typing import Dict
8
+ import torch
9
+ import os
10
+
11
+
12
class PreTrainedModel(Runner):
    """
    Inference wrapper that loads a downstream ASR checkpoint through the
    s3prl ``Runner`` and turns a raw waveform into text.

    NOTE(review): ``self.all_entries``, ``self.upstream``, ``self.featurizer``
    and ``self.downstream`` are used below but not defined in this class; they
    are presumably created by ``Runner.__init__`` -- confirm against s3prl.
    """

    def __init__(self, path="", device="cpu"):
        """
        Initialize downstream model.

        Args:
            path (:obj:`str`):
                Directory that contains ``hubert_asr.ckpt`` and ``char.dict``.
                The default ``""`` resolves both files relative to the
                current working directory.
            device (:obj:`str`):
                Value written to ``Args.device`` before the runner is built,
                and the device the input waveform is moved to in
                ``__call__``. Defaults to ``"cpu"``, which was previously
                hard-coded.
        """
        ckp_file = os.path.join(path, "hubert_asr.ckpt")
        # NOTE(security): torch.load unpickles the file, which can execute
        # arbitrary code. Only load checkpoints that come from a trusted source.
        ckp = torch.load(ckp_file, map_location="cpu")

        args = ckp["Args"]
        config = ckp["Config"]

        # Override the arguments stored at training time so the runner is
        # configured for inference from this local checkpoint.
        args.init_ckpt = ckp_file
        args.mode = "inference"
        args.upstream = "osanseviero/hubert_base"
        args.from_hf_hub = True
        args.device = device
        # Point the downstream expert at the dictionary shipped next to the
        # checkpoint rather than the path recorded in the saved config.
        config["downstream_expert"]["datarc"]["dict_path"] = os.path.join(
            path, "char.dict"
        )

        super().__init__(args, config)

        # Remembered so __call__ can place the waveform on the same device.
        self._device = device

    def __call__(self, inputs) -> Dict[str, str]:
        """
        Args:
            inputs (:obj:`np.array`):
                The raw waveform of audio received. By default at 16KHz.
        Return:
            A :obj:`dict`. The returned object should look like
            {"text": "XXX"}, containing the detected text from the input audio.
        """
        # Put every sub-model in evaluation mode before running inference.
        for entry in self.all_entries:
            entry.model.eval()

        # The models are called with a list of waveforms; wrap the single
        # input. ``.to`` is a no-op for the default "cpu" device.
        wavs = [torch.FloatTensor(inputs).to(self._device)]

        with torch.no_grad():
            features = self.upstream.model(wavs)
            features = self.featurizer.model(wavs, features)
            preds = self.downstream.model.inference(features, [])
        return {"text": preds[0]}
47
+
48
+ """
49
+ import subprocess
50
+ import numpy as np
51
+ from datasets import load_dataset
52
+ # This is already done in the Inference API
53
+ def ffmpeg_read(bpayload: bytes, sampling_rate: int) -> np.array:
54
+ ar = f"{sampling_rate}"
55
+ ac = "1"
56
+ format_for_conversion = "f32le"
57
+ ffmpeg_command = [
58
+ "ffmpeg",
59
+ "-i",
60
+ "pipe:0",
61
+ "-ac",
62
+ ac,
63
+ "-ar",
64
+ ar,
65
+ "-f",
66
+ format_for_conversion,
67
+ "-hide_banner",
68
+ "-loglevel",
69
+ "quiet",
70
+ "pipe:1",
71
+ ]
72
+
73
+ ffmpeg_process = subprocess.Popen(
74
+ ffmpeg_command, stdin=subprocess.PIPE, stdout=subprocess.PIPE
75
+ )
76
+ output_stream = ffmpeg_process.communicate(bpayload)
77
+ out_bytes = output_stream[0]
78
+
79
+ audio = np.frombuffer(out_bytes, np.float32).copy()
80
+ if audio.shape[0] == 0:
81
+ raise ValueError("Malformed soundfile")
82
+ return audio
83
+
84
+
85
+ model = PreTrainedModel()
86
+ ds = load_dataset("patrickvonplaten/librispeech_asr_dummy", "clean", split="validation")
87
+ filename = ds[0]["file"]
88
+ with open(filename, "rb") as f:
89
+ data = ffmpeg_read(f.read(), 16000)
90
+ print(model(data))
91
+ """