geninhu committed
Commit 0a9ff22
Parent: 39461c3

Upload lm-boosted decoder

alphabet.json ADDED
@@ -0,0 +1 @@
+{"labels": ["", "<s>", "</s>", "\u2047", " ", "a", "\u00e1", "\u00e0", "\u1ea3", "\u00e3", "\u1ea1", "e", "\u00e9", "\u00e8", "\u1ebb", "\u1ebd", "\u1eb9", "\u00ea", "\u1ebf", "\u1ec1", "\u1ec3", "\u1ec5", "\u1ec7", "i", "\u00ed", "\u00ec", "\u1ec9", "\u0129", "\u1ecb", "o", "\u00f3", "\u00f2", "\u1ecf", "\u00f5", "\u1ecd", "\u01a1", "\u1edb", "\u1edd", "\u1edf", "\u1ee1", "\u1ee3", "\u00f4", "\u1ed1", "\u1ed3", "\u1ed5", "\u1ed7", "\u1ed9", "u", "\u00fa", "\u00f9", "\u1ee7", "\u0169", "\u1ee5", "\u01b0", "\u1ee9", "\u1eeb", "\u1eed", "\u1eef", "\u1ef1", "y", "\u1ef3", "\u00fd", "\u1ef7", "\u1ef9", "\u1ef5", "\u0103", "\u1eaf", "\u1eb1", "\u1eb3", "\u1eb5", "\u1eb7", "\u00e2", "\u1ea5", "\u1ea7", "\u1ea9", "\u1eab", "\u1ead", "\u0111", "q", "w", "r", "t", "p", "s", "d", "f", "g", "h", "j", "k", "l", "z", "x", "c", "v", "b", "n", "m", "th", "ch", "kh", "ph", "nh", "gh", "qu", "ng", "ngh", "tr", "\u00e1c", "\u1ea1c", "ai", "\u00e1i", "\u00e0i", "\u1ea3i", "\u00e3i", "\u1ea1i", "am", "\u00e1m", "\u00e0m", "\u1ea3m", "\u00e3m", "\u1ea1m", "an", "\u00e1n", "\u00e0n", "\u1ea3n", "\u00e3n", "\u1ea1n", "ao", "\u00e1o", "\u00e0o", "\u1ea3o", "\u00e3o", "\u1ea1o", "au", "\u00e1u", "\u00e0u", "\u1ea3u", "\u00e3u", "\u1ea1u", "\u00e1p", "\u1ea1p", "\u00e1t", "\u1ea1t", "ay", "\u00e1y", "\u00e0y", "\u1ea3y", "\u00e3y", "\u1ea1y", "\u1eafc", "\u1eb7c", "\u0103m", "\u1eb1m", "\u1eafm", "\u1eb3m", "\u1eb5m", "\u1eb7m", "\u0103n", "\u1eafn", "\u1eb1n", "\u1eb3n", "\u1eb5n", "\u1eb7n", "\u1eafp", "\u1eb7p", "\u1eaft", "\u1eb7t", "\u1ea5c", "\u1eadc", "\u00e2m", "\u1ea5m", "\u1ea7m", "\u1ea9m", "\u1eabm", "\u1eadm", "\u00e2n", "\u1ea5n", "\u1ea7n", "\u1ea9n", "\u1eabn", "\u1eadn", "\u1ea5p", "\u1eadp", "\u1ea5t", "\u1eadt", "\u00e2u", "\u1ea5u", "\u1ea7u", "\u1ea9u", "\u1eabu", "\u1eadu", "\u00e2y", "\u1ea5y", "\u1ea7y", "\u1ea9y", "\u1eaby", "\u1eady", "\u00e9c", "\u1eb9c", "em", "\u00e9m", "\u00e8m", "\u1ebbm", "\u1ebdm", "\u1eb9m", "en", "\u00e9n", "\u00e8n", "\u1ebbn", "\u1ebdn", "\u1eb9n", "eo", "\u00e9o", "\u00e8o", "\u1ebbo", "\u1ebdo", "\u1eb9o", "\u00e9p", "\u1eb9p", "\u00e9t", "\u1eb9t", "\u00eam", "\u1ebfm", "\u1ec1m", "\u1ec5m", "\u1ec7m", "\u00ean", "\u1ebfn", "\u1ec1n", "\u1ec3n", "\u1ec7n", "\u1ebfp", "\u1ec7p", "\u1ebft", "\u1ec7t", "\u00eau", "\u1ebfu", "\u1ec1u", "\u1ec3u", "\u1ec5u", "\u1ec7u", "ia", "\u00eda", "\u00eca", "\u1ec9a", "\u0129a", "\u1ecba", "im", "\u00edm", "\u00ecm", "\u1ec9m", "\u0129m", "\u1ecbm", "in", "\u00edn", "\u00ecn", "\u1ec9n", "\u1ecbn", "\u00edp", "\u1ecbp", "\u00edt", "\u1ecbt", "iu", "\u00edu", "\u00ecu", "\u1ec9u", "\u0129u", "\u1ecbu", "oa", "\u00f3a", "\u00f2a", "\u1ecfa", "\u00f5a", "\u1ecda", "o\u00e0", "\u00f3c", "\u1ecdc", "oe", "\u00f3e", "\u00f2e", "\u1ecfe", "\u1ecde", "o\u1eb9", "oi", "\u00f3i", "\u00f2i", "\u1ecfi", "\u00f5i", "\u1ecdi", "om", "\u00f3m", "\u00f2m", "\u1ecfm", "\u00f5m", "\u1ecdm", "on", "\u00f3n", "\u00f2n", "\u1ecfn", "\u00f5n", "\u1ecdn", "\u00f3p", "\u1ecdp", "\u00f3t", "\u1ecdt", "\u1ed1c", "\u1ed9c", "\u00f4i", "\u1ed1i", "\u1ed3i", "\u1ed5i", "\u1ed7i", "\u1ed9i", "\u00f4m", "\u1ed1m", "\u1ed3m", "\u1ed5m", "\u1ed7m", "\u1ed9m", "\u00f4n", "\u1ed1n", "\u1ed3n", "\u1ed5n", "\u1ed7n", "\u1ed9n", "\u1ed1p", "\u1ed9p", "\u1ed1t", "\u1ed9t", "\u01a1i", "\u1edbi", "\u1eddi", "\u1edfi", "\u1ee1i", "\u1ee3i", "\u01a1m", "\u1edbm", "\u1eddm", "\u1edfm", "\u1ee1m", "\u1ee3m", "\u01a1n", "\u1edbn", "\u1eddn", "\u1edfn", "\u1ee1n", "\u1ee3n", "\u1edbp", "\u1ee3p", "\u1edbt", "\u1ee3t", "ua", "\u00faa", "\u00f9a", "\u1ee7a", "\u0169a", "\u1ee5a", "\u00fac", "\u1ee5c", "u\u00ea", "u\u1ebf", "u\u1ec1", "u\u1ec3", "u\u1ec7", "ui", "\u00fai", "\u00f9i", "\u1ee7i", "\u0169i", "\u1ee5i", "um", "\u00fam", "\u00f9m", "\u1ee7m", "\u0169m", "\u1ee5m", "un", "\u00fan", "\u00f9n", "\u1ee7n", "\u0169n", "\u1ee5n", "\u00fap", "\u1ee5p", "\u00fat", "\u1ee5t", "uy", "\u00fay", "\u00f9y", "\u1ee7y", "\u0169y", "\u1ee5y", "\u01b0a", "\u1ee9a", "\u1eeba", "\u1eeda", "\u1eefa", "\u1ef1a", "\u1ee9c", "\u1ef1c", "\u1eedi", "\u1eebm", "u\u01a1", "u\u1edf", "\u1ee9t", "\u1ef1t", "\u01b0u", "\u1ee9u", "\u1eebu", "\u1eedu", "\u1eefu", "\u1ef1u", "sh", "aw", "ee", "ea", "ei", "ew", "eu", "ie", "oo", "ou", "ow", "oy", "ue", "io", "\u00e1ch", "\u1ea1ch", "ang", "\u00e1ng", "\u00e0ng", "\u1ea3ng", "\u00e3ng", "\u1ea1ng", "anh", "\u00e1nh", "\u00e0nh", "\u1ea3nh", "\u00e3nh", "\u1ea1nh", "\u0103ng", "\u1eafng", "\u1eb1ng", "\u1eb3ng", "\u1eb5ng", "\u1eb7ng", "\u00e2ng", "\u1ea5ng", "\u1ea7ng", "\u1ea9ng", "\u1eabng", "\u1eadng", "eng", "\u00e9ng", "\u00e8ng", "\u1ebbng", "\u1ebfch", "\u1ec7ch", "\u00eanh", "\u1ebfnh", "\u1ec1nh", "\u1ec3nh", "\u1ec5nh", "\u1ec7nh", "\u00edch", "\u1ecbch", "i\u1ebfc", "i\u1ec7c", "i\u00eam", "i\u1ebfm", "i\u1ec1m", "i\u1ec3m", "i\u1ec5m", "i\u1ec7m", "i\u00ean", "i\u1ebfn", "i\u1ec1n", "i\u1ec3n", "i\u1ec5n", "i\u1ec7n", "i\u1ebfp", "i\u1ec7p", "i\u1ebft", "i\u1ec7t", "i\u00eau", "i\u1ebfu", "i\u1ec1u", "i\u1ec3u", "i\u1ec5u", "i\u1ec7u", "inh", "\u00ednh", "\u00ecnh", "\u1ec9nh", "\u0129nh", "\u1ecbnh", "o\u00e1c", "o\u1ea1c", "oai", "o\u00e1i", "o\u00e0i", "o\u1ea3i", "o\u00e3i", "o\u1ea1i", "o\u00e0m", "oan", "o\u00e1n", "o\u00e0n", "o\u1ea3n", "o\u00e3n", "o\u1ea1n", "oao", "o\u00e1o", "o\u00e1p", "o\u1ea1p", "o\u00e1t", "o\u1ea1t", "oay", "o\u00e1y", "o\u1ea3y", "o\u1eafc", "o\u1eb7c", "o\u0103m", "o\u0103n", "o\u1eb3n", "o\u1eafn", "o\u1eb1n", "o\u1eaft", "o\u1eb7t", "oen", "o\u1ebbn", "oeo", "o\u00e9o", "o\u00e8o", "o\u1ebbo", "o\u00e9t", "o\u1eb9t", "ong", "\u00f3ng", "\u00f2ng", "\u1ecfng", "\u00f5ng", "\u1ecdng", "o\u00f3c", "o\u1ecdc", "\u00f4ng", "\u1ed1ng", "\u1ed3ng", "\u1ed5ng", "\u1ed7ng", "\u1ed9ng", "u\u00e2n", "u\u1ea5n", "u\u1ea7n", "u\u1ea9n", "u\u1eabn", "u\u1eadn", "u\u1ea5t", "u\u1eadt", "u\u00e2y", "u\u1ea5y", "u\u1ea7y", "ung", "\u00fang", "\u00f9ng", "\u1ee7ng", "\u0169ng", "\u1ee5ng", "u\u1ed1c", "u\u1ed9c", "u\u00f4i", "u\u1ed1i", "u\u1ed3i", "u\u1ed5i", "u\u1ed7i", "u\u1ed9i", "u\u00f4m", "u\u1ed1m", "u\u1ed3m", "u\u1ed7m", "u\u1ed9m", "u\u00f4n", "u\u1ed1n", "u\u1ed3n", "u\u1ed7n", "u\u1ed9n", "u\u1ed1t", "u\u1ed9t", "u\u00fdt", "u\u1ef5t", "uya", "u\u1ef7u", "\u01b0ng", "\u1ee9ng", "\u1eebng", "\u1eedng", "\u1eefng", "\u1ef1ng", "\u01b0\u1edbc", "\u01b0\u1ee3c", "\u01b0\u01a1i", "\u01b0\u1edbi", "\u01b0\u1eddi", "\u01b0\u1edfi", "\u01b0\u1ee1i", "\u01b0\u1ee3i", "\u01b0\u01a1m", "\u01b0\u1edbm", "\u01b0\u1eddm", "\u01b0\u1ee3m", "\u01b0\u01a1n", "\u01b0\u1edbn", "\u01b0\u1eddn", "\u01b0\u1ee1n", "\u01b0\u1ee3n", "\u01b0\u1edbp", "\u01b0\u1ee3p", "\u01b0\u1edbt", "\u01b0\u1ee3t", "\u01b0\u01a1u", "\u01b0\u1edbu", "\u01b0\u1ee3u", "y\u00eam", "y\u1ebfm", "y\u1ec3m", "y\u00ean", "y\u1ebfn", "y\u00eau", "y\u1ebfu", "y\u1ec3u", "y\u1ebft", "i\u00eang", "i\u1ebfng", "i\u1ec1ng", "i\u1ec3ng", "i\u1ec5ng", "i\u1ec7ng", "o\u00e1ch", "o\u1ea1ch", "oang", "o\u00e1ng", "o\u00e0ng", "o\u1ea3ng", "o\u00e3ng", "o\u1ea1ng", "oanh", "o\u00e1nh", "o\u00e0nh", "o\u1ea1nh", "o\u1ea3nh", "o\u0103ng", "o\u1eafng", "o\u1eb1ng", "o\u1eb3ng", "oong", "u\u1ebfch", "u\u00eanh", "u\u00f4ng", "u\u1ed1ng", "u\u1ed3ng", "u\u1ed5ng", "u\u1ed7ng", "u\u1ed9ng", "u\u00fdch", "u\u1ef5ch", "uy\u00ean", "uy\u1ebfn", "uy\u1ec1n", "uy\u1ec3n", "uy\u1ec5n", "uy\u1ec7n", "uy\u1ebft", "uy\u1ec7t", "uynh", "u\u1ef3nh", "u\u00fdnh", "u\u1ef7nh", "\u01b0\u01a1ng", "\u01b0\u1edbng", "\u01b0\u1eddng", "\u01b0\u1edfng", "\u01b0\u1ee1ng", "\u01b0\u1ee3ng", "op", "ot", "gi", "ap", "at", "ac", "it", "ip", "ic", "ep", "et", "ec"], "is_bpe": false}
eval.py CHANGED
@@ -5,9 +5,85 @@ from typing import Dict
 
 from datasets import Audio, Dataset, load_dataset, load_metric
 
-from transformers import AutoFeatureExtractor, pipeline
-
+from transformers import AutoFeatureExtractor, pipeline, AutomaticSpeechRecognitionPipeline
+
+from transformers import Wav2Vec2CTCTokenizer
+
+class Wav2Vec2WordpieceTokenizer(Wav2Vec2CTCTokenizer):
+    def __init__(
+        self,
+        vocab_file,
+        bos_token="<s>",
+        eos_token="</s>",
+        unk_token="<unk>",
+        pad_token="<pad>",
+        word_delimiter_token="|",
+        do_lower_case=False,
+        **kwargs
+    ):
+        super().__init__(
+            vocab_file=vocab_file,
+            unk_token=unk_token,
+            bos_token=bos_token,
+            eos_token=eos_token,
+            pad_token=pad_token,
+            do_lower_case=do_lower_case,
+            word_delimiter_token=word_delimiter_token,
+            **kwargs,
+        )
 
+        self._create_trie(self.all_special_tokens_extended)
+
+    def _tokenize(self, text, **kwargs):
+        """
+        Converts a string into a sequence of tokens (string), using the tokenizer.
+        """
+        special_cases = set(['gia', 'qui', 'quy', 'que', 'qua'])
+        output_tokens = []
+        for token_idx, token in enumerate(text.split()):
+            if token in special_cases:
+                sub_tokens = [token[:2], token[2:]]
+            else:
+                end = len(token)
+                sub_tokens = []
+                while end > 0:
+                    start = 0
+                    cur_substr = None
+                    while start < end:
+                        substr = token[start:end]
+                        if substr in self.encoder:
+                            cur_substr = substr
+                            break
+                        start += 1
+                    if cur_substr is None:
+                        sub_tokens.insert(0, self.unk_token)
+                        end = start - 1
+                    else:
+                        sub_tokens.insert(0, cur_substr)
+                        end = start
+
+            if token_idx > 0:
+                output_tokens.append(self.word_delimiter_token)
+            output_tokens.extend(sub_tokens)
+        return output_tokens
+
+    def decode_ids(
+        self,
+        token_ids,
+        skip_special_tokens=False,
+        clean_up_tokenization_spaces=True,
+        group_tokens: bool = True,
+        spaces_between_special_tokens: bool = False,
+    ) -> str:
+        # for compatibility with SpeechBrain interfaces
+        return self.decode(
+            token_ids,
+            skip_special_tokens=skip_special_tokens,
+            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
+            group_tokens=group_tokens,
+            spaces_between_special_tokens=spaces_between_special_tokens
+        )
+
 def log_results(result: Dataset, args: Dict[str, str]):
     """DO NOT CHANGE. This function computes and logs the result metrics."""
 
@@ -74,12 +150,21 @@ def main(args):
     feature_extractor = AutoFeatureExtractor.from_pretrained(args.model_id)
     sampling_rate = feature_extractor.sampling_rate
 
+    # load tokenizer ("/" added so the path resolves to <model_id>/vocab.json)
+    tokenizer = Wav2Vec2WordpieceTokenizer(
+        vocab_file=args.model_id + '/vocab.json',
+    )
+
     # resample audio
     dataset = dataset.cast_column("audio", Audio(sampling_rate=sampling_rate))
 
     # load eval pipeline
-    asr = pipeline("automatic-speech-recognition", model=args.model_id)
-
+    asr = pipeline(
+        "automatic-speech-recognition",
+        model=args.model_id,
+        tokenizer=tokenizer,
+    )
+
     # map function to decode audio
     def map_to_pred(batch):
         prediction = asr(
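
To make the new tokenizer's behaviour concrete, here is a standalone sketch of the greedy loop in _tokenize on a toy vocabulary. TOY_VOCAB is hypothetical; the real self.encoder is loaded from the repo's vocab.json, whose entries mirror the labels in alphabet.json above:

# Standalone sketch of Wav2Vec2WordpieceTokenizer._tokenize's inner loop.
TOY_VOCAB = {"ngh", "ng", "n", "g", "h", "q", "qu", "a", "ua", "i\u00eang"}

def wordpiece(token, vocab, unk="<unk>"):
    """Segment `token` by repeatedly taking the longest suffix found in `vocab`."""
    end, sub_tokens = len(token), []
    while end > 0:
        start, cur_substr = 0, None
        while start < end:
            if token[start:end] in vocab:  # widest window first, shrinking from the left
                cur_substr = token[start:end]
                break
            start += 1
        if cur_substr is None:             # no match: emit <unk> and
            sub_tokens.insert(0, unk)      # back off by one character
            end = start - 1
        else:
            sub_tokens.insert(0, cur_substr)
            end = start
    return sub_tokens

print(wordpiece("nghi\u00eang", TOY_VOCAB))  # ['ngh', 'iêng']
print(wordpiece("qua", TOY_VOCAB))           # ['q', 'ua']

The second print shows why _tokenize special-cases 'gia', 'qui', 'quy', 'que', and 'qua': greedy suffix matching would split "qua" as "q" + "ua" (both are labels above), whereas the special-case branch forces the phonologically sensible onset/rime split "qu" + "a".
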
language_model/4gram.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c705d2e8fbf970a16797873649d3753d2738e4a1e3879415b7534bd677e94188
+size 227715819
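
This is a Git LFS pointer; the 4-gram KenLM binary itself is about 228 MB. Once pulled locally, it can be inspected directly with the kenlm Python bindings. A minimal sketch; the test sentence is an arbitrary Vietnamese example, not taken from this commit:

# Requires `pip install kenlm` and a local LFS checkout of 4gram.bin.
import kenlm

lm = kenlm.Model("language_model/4gram.bin")
print(lm.order)  # expected: 4 for a 4-gram model

# log10 probability of a sentence, with <s>/</s> boundary tokens added
print(lm.score("xin chào", bos=True, eos=True))
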
language_model/attrs.json ADDED
@@ -0,0 +1 @@
+{"alpha": 0.5, "beta": 1.5, "unk_score_offset": -10.0, "score_boundary": true}
language_model/unigrams.txt ADDED
The diff for this file is too large to render.
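
Putting the pieces together: alphabet.json holds the decoder's label set, language_model/4gram.bin the KenLM model, attrs.json the decoding hyperparameters (alpha is the language-model weight, beta the word-insertion bonus), and unigrams.txt the LM's word list. Below is a sketch of building the beam-search decoder by hand with pyctcdecode, roughly what Wav2Vec2ProcessorWithLM does when loading this repo; mapping "score_boundary" to pyctcdecode's lm_score_boundary argument is my assumption:

# Sketch: build the CTC beam-search decoder from this commit's files.
import json
from pyctcdecode import build_ctcdecoder

with open("alphabet.json", encoding="utf-8") as f:
    labels = json.load(f)["labels"]
with open("language_model/attrs.json", encoding="utf-8") as f:
    attrs = json.load(f)
with open("language_model/unigrams.txt", encoding="utf-8") as f:
    unigrams = [line.strip() for line in f if line.strip()]

decoder = build_ctcdecoder(
    labels,
    kenlm_model_path="language_model/4gram.bin",
    unigrams=unigrams,
    alpha=attrs["alpha"],                        # LM weight (0.5)
    beta=attrs["beta"],                          # word-insertion bonus (1.5)
    unk_score_offset=attrs["unk_score_offset"],  # penalty for out-of-vocabulary tokens
    lm_score_boundary=attrs["score_boundary"],   # assumed mapping, see note above
)
# decoder.decode(logits) then turns a (time, vocab) numpy array of CTC logits into text.
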