nshmyrevgmail committed
Commit f9a3fb3
Parent(s): 3ec7591

Import initial model
Files changed:
- am/jit_script.pt +3 -0
- decode.py +695 -0
- lang/bpe.model +3 -0
- lang/tokens.txt +502 -0
- lang/unigram_500.vocab +500 -0
- lm/2gram.fst.txt +0 -0
- lm/epoch-99.pt +3 -0
- test.wav +0 -0
am/jit_script.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d5b64964bad8c24fe48f5d9c0ffe98c4787495ea991a186f00b059fc3fa549c9
+size 264940286
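(Note: the three lines above are a Git LFS pointer, not the checkpoint itself. After cloning, the ~265 MB TorchScript model referenced by the oid is materialized with: git lfs pull)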
decode.py
ADDED
@@ -0,0 +1,695 @@
+#!/usr/bin/env python3
+# Copyright 2021-2023 Xiaomi Corporation (Author: Fangjun Kuang, Zengwei Yao)
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import sys
+import argparse
+import logging
+import math
+import warnings
+from dataclasses import dataclass, field
+from typing import Dict, List, Optional, Tuple, Union
+
+import kaldifeat
+import sentencepiece as spm
+import torch
+import torchaudio
+
+from torch.nn.utils.rnn import pad_sequence
+
+from icefall import NgramLm, NgramLmStateCost
+from icefall.decode import Nbest, one_best_decoding
+from icefall.lm_wrapper import LmScorer
+from icefall.rnn_lm.model import RnnLmModel
+from icefall.transformer_lm.model import TransformerLM
+from icefall.utils import AttributeDict
+from icefall.lexicon import Lexicon
+
+import k2
+
+
+def read_sound_files(
+    filenames: List[str], expected_sample_rate: float = 16000
+) -> List[torch.Tensor]:
+    """Read a list of sound files into a list of 1-D float32 torch tensors.
+    Args:
+      filenames:
+        A list of sound filenames.
+      expected_sample_rate:
+        The expected sample rate of the sound files.
+    Returns:
+      Return a list of 1-D float32 torch tensors.
+    """
+    ans = []
+    for f in filenames:
+        wave, sample_rate = torchaudio.load(f)
+        resampler = torchaudio.transforms.Resample(sample_rate, 16_000)
+        wav = resampler(wave[0])
+        ans.append(wav)
+    return ans
+
+@dataclass
+class Hypothesis:
+    # The predicted tokens so far.
+    # Newly predicted tokens are appended to `ys`.
+    ys: List[int]
+
+    # The log prob of ys.
+    # It contains only one entry.
+    log_prob: torch.Tensor
+
+    # timestamp[i] is the frame index after subsampling
+    # on which ys[i] is decoded
+    timestamp: List[int] = field(default_factory=list)
+
+    # the lm score for next token given the current ys
+    lm_score: Optional[torch.Tensor] = None
+
+    # the RNNLM states (h and c in LSTM)
+    state: Optional[Tuple[torch.Tensor, torch.Tensor]] = None
+
+    # N-gram LM state
+    state_cost: Optional[NgramLmStateCost] = None
+
+    @property
+    def key(self) -> str:
+        """Return a string representation of self.ys"""
+        return "_".join(map(str, self.ys))
+
+
+class HypothesisList(object):
+    def __init__(self, data: Optional[Dict[str, Hypothesis]] = None) -> None:
+        """
+        Args:
+          data:
+            A dict of Hypotheses. Its key is its `value.key`.
+        """
+        if data is None:
+            self._data = {}
+        else:
+            self._data = data
+
+    @property
+    def data(self) -> Dict[str, Hypothesis]:
+        return self._data
+
+    def add(self, hyp: Hypothesis) -> None:
+        """Add a Hypothesis to `self`.
+
+        If `hyp` already exists in `self`, its probability is updated using
+        `log-sum-exp` with the existing one.
+
+        Args:
+          hyp:
+            The hypothesis to be added.
+        """
+        key = hyp.key
+        if key in self:
+            old_hyp = self._data[key]  # shallow copy
+            torch.logaddexp(old_hyp.log_prob, hyp.log_prob, out=old_hyp.log_prob)
+        else:
+            self._data[key] = hyp
+
+    def get_most_probable(self, length_norm: bool = False) -> Hypothesis:
+        """Get the most probable hypothesis, i.e., the one with
+        the largest `log_prob`.
+
+        Args:
+          length_norm:
+            If True, the `log_prob` of a hypothesis is normalized by the
+            number of tokens in it.
+        Returns:
+          Return the hypothesis that has the largest `log_prob`.
+        """
+        if length_norm:
+            return max(self._data.values(), key=lambda hyp: hyp.log_prob / len(hyp.ys))
+        else:
+            return max(self._data.values(), key=lambda hyp: hyp.log_prob)
+
+    def remove(self, hyp: Hypothesis) -> None:
+        """Remove a given hypothesis.
+
+        Caution:
+          `self` is modified **in-place**.
+
+        Args:
+          hyp:
+            The hypothesis to be removed from `self`.
+            Note: It must be contained in `self`. Otherwise,
+            an exception is raised.
+        """
+        key = hyp.key
+        assert key in self, f"{key} does not exist"
+        del self._data[key]
+
+    def filter(self, threshold: torch.Tensor) -> "HypothesisList":
+        """Remove all Hypotheses whose log_prob is less than threshold.
+
+        Caution:
+          `self` is not modified. Instead, a new HypothesisList is returned.
+
+        Returns:
+          Return a new HypothesisList containing all hypotheses from `self`
+          with `log_prob` being greater than the given `threshold`.
+        """
+        ans = HypothesisList()
+        for _, hyp in self._data.items():
+            if hyp.log_prob > threshold:
+                ans.add(hyp)  # shallow copy
+        return ans
+
+    def topk(self, k: int, length_norm: bool = False) -> "HypothesisList":
+        """Return the top-k hypotheses.
+
+        Args:
+          length_norm:
+            If True, the `log_prob` of a hypothesis is normalized by the
+            number of tokens in it.
+        """
+        hyps = list(self._data.items())
+
+        if length_norm:
+            hyps = sorted(
+                hyps, key=lambda h: h[1].log_prob / len(h[1].ys), reverse=True
+            )[:k]
+        else:
+            hyps = sorted(hyps, key=lambda h: h[1].log_prob, reverse=True)[:k]
+
+        ans = HypothesisList(dict(hyps))
+        return ans
+
+    def __contains__(self, key: str):
+        return key in self._data
+
+    def __iter__(self):
+        return iter(self._data.values())
+
+    def __len__(self) -> int:
+        return len(self._data)
+
+    def __str__(self) -> str:
+        s = []
+        for key in self:
+            s.append(key)
+        return ", ".join(s)
+
+
+def get_hyps_shape(hyps: List[HypothesisList]) -> k2.RaggedShape:
+    """Return a ragged shape with axes [utt][num_hyps].
+
+    Args:
+      hyps:
+        len(hyps) == batch_size. It contains the current hypothesis for
+        each utterance in the batch.
+    Returns:
+      Return a ragged shape with 2 axes [utt][num_hyps]. Note that
+      the shape is on CPU.
+    """
+    num_hyps = [len(h) for h in hyps]
+
+    # torch.cumsum() is inclusive sum, so we put a 0 at the beginning
+    # to get exclusive sum later.
+    num_hyps.insert(0, 0)
+
+    num_hyps = torch.tensor(num_hyps)
+    row_splits = torch.cumsum(num_hyps, dim=0, dtype=torch.int32)
+    ans = k2.ragged.create_ragged_shape2(
+        row_splits=row_splits, cached_tot_size=row_splits[-1].item()
+    )
+    return ans
+
+
+def modified_beam_search_LODR(
+    model,
+    encoder_out: torch.Tensor,
+    encoder_out_lens: torch.Tensor,
+    LODR_lm: NgramLm,
+    LODR_lm_scale: float,
+    LM: LmScorer,
+    beam: int = 4,
+) -> List[List[int]]:
+    """This function implements LODR (https://arxiv.org/abs/2203.16776) with
+    `modified_beam_search`. It uses a bi-gram language model as the estimate
+    of the internal language model and subtracts its score during shallow fusion
+    with an external language model. This implementation uses an RNN LM as the
+    external language model.
+
+    Args:
+      model (Transducer):
+        The transducer model.
+      encoder_out (torch.Tensor):
+        Encoder output in (N,T,C)
+      encoder_out_lens (torch.Tensor):
+        A 1-D tensor of shape (N,), containing the number of
+        valid frames in encoder_out before padding.
+      LODR_lm:
+        A low order n-gram LM, whose score will be subtracted during shallow fusion
+      LODR_lm_scale:
+        The scale of the LODR_lm
+      LM:
+        A neural net LM, e.g., an RNN LM or transformer LM
+      beam (int, optional):
+        Beam size. Defaults to 4.
+
+    Returns:
+      Return a list-of-list of token IDs. ans[i] is the decoding results
+      for the i-th utterance.
+
+    """
+    assert encoder_out.ndim == 3, encoder_out.shape
+    assert encoder_out.size(0) >= 1, encoder_out.size(0)
+    assert LM is not None
+    lm_scale = LM.lm_scale
+
+    packed_encoder_out = torch.nn.utils.rnn.pack_padded_sequence(
+        input=encoder_out,
+        lengths=encoder_out_lens.cpu(),
+        batch_first=True,
+        enforce_sorted=False,
+    )
+
+    blank_id = model.decoder.blank_id
+    sos_id = getattr(LM, "sos_id", 1)
+    unk_id = getattr(model, "unk_id", blank_id)
+    context_size = model.decoder.context_size
+    device = next(model.parameters()).device
+
+    batch_size_list = packed_encoder_out.batch_sizes.tolist()
+    N = encoder_out.size(0)
+    assert torch.all(encoder_out_lens > 0), encoder_out_lens
+    assert N == batch_size_list[0], (N, batch_size_list)
+
+    # get initial lm score and lm state by scoring the "sos" token
+    sos_token = torch.tensor([[sos_id]]).to(torch.int64).to(device)
+    lens = torch.tensor([1]).to(device)
+    init_score, init_states = LM.score_token(sos_token, lens)
+
+    B = [HypothesisList() for _ in range(N)]
+    for i in range(N):
+        B[i].add(
+            Hypothesis(
+                ys=[blank_id] * context_size,
+                log_prob=torch.zeros(1, dtype=torch.float32, device=device),
+                state=init_states,  # state of the NN LM
+                lm_score=init_score.reshape(-1),
+                state_cost=NgramLmStateCost(
+                    LODR_lm
+                ),  # state of the source domain ngram
+            )
+        )
+
+    encoder_out = model.joiner.encoder_proj(packed_encoder_out.data)
+
+    offset = 0
+    finalized_B = []
+    for batch_size in batch_size_list:
+        start = offset
+        end = offset + batch_size
+        current_encoder_out = encoder_out.data[start:end]  # get batch
+        current_encoder_out = current_encoder_out.unsqueeze(1).unsqueeze(1)
+        # current_encoder_out's shape is (batch_size, 1, 1, encoder_out_dim)
+        offset = end
+
+        finalized_B = B[batch_size:] + finalized_B
+        B = B[:batch_size]
+
+        hyps_shape = get_hyps_shape(B).to(device)
+
+        A = [list(b) for b in B]
+        B = [HypothesisList() for _ in range(batch_size)]
+
+        ys_log_probs = torch.cat(
+            [hyp.log_prob.reshape(1, 1) for hyps in A for hyp in hyps]
+        )
+
+        decoder_input = torch.tensor(
+            [hyp.ys[-context_size:] for hyps in A for hyp in hyps],
+            device=device,
+            dtype=torch.int64,
+        )  # (num_hyps, context_size)
+
+        decoder_out = model.decoder(decoder_input, need_pad=False).unsqueeze(1)
+        decoder_out = model.joiner.decoder_proj(decoder_out)
+
+        current_encoder_out = torch.index_select(
+            current_encoder_out,
+            dim=0,
+            index=hyps_shape.row_ids(1).to(torch.int64),
+        )  # (num_hyps, 1, 1, encoder_out_dim)
+
+        logits = model.joiner(
+            current_encoder_out,
+            decoder_out,
+            project_input=False,
+        )  # (num_hyps, 1, 1, vocab_size)
+
+        logits = logits.squeeze(1).squeeze(1)  # (num_hyps, vocab_size)
+
+        log_probs = logits.log_softmax(dim=-1)  # (num_hyps, vocab_size)
+
+        log_probs.add_(ys_log_probs)
+
+        vocab_size = log_probs.size(-1)
+
+        log_probs = log_probs.reshape(-1)
+
+        row_splits = hyps_shape.row_splits(1) * vocab_size
+        log_probs_shape = k2.ragged.create_ragged_shape2(
+            row_splits=row_splits, cached_tot_size=log_probs.numel()
+        )
+        ragged_log_probs = k2.RaggedTensor(shape=log_probs_shape, value=log_probs)
+        """
+        for all hyps with a non-blank new token, score this token.
+        It is a little confusing here because this for-loop
+        looks very similar to the one below. Here, we go through all
+        top-k tokens and only add the non-blank ones to the token_list.
+        LM will score those tokens given the LM states. Note that
+        the variable `scores` is the LM score after seeing the new
+        non-blank token.
+        """
+        token_list = []
+        hs = []
+        cs = []
+        for i in range(batch_size):
+            topk_log_probs, topk_indexes = ragged_log_probs[i].topk(beam)
+
+            with warnings.catch_warnings():
+                warnings.simplefilter("ignore")
+                topk_hyp_indexes = (topk_indexes // vocab_size).tolist()
+                topk_token_indexes = (topk_indexes % vocab_size).tolist()
+            for k in range(len(topk_hyp_indexes)):
+                hyp_idx = topk_hyp_indexes[k]
+                hyp = A[i][hyp_idx]
+
+                new_token = topk_token_indexes[k]
+                if new_token not in (blank_id, unk_id):
+                    if LM.lm_type == "rnn":
+                        token_list.append([new_token])
+                        # store the LSTM states
+                        hs.append(hyp.state[0])
+                        cs.append(hyp.state[1])
+                    else:
+                        # for transformer LM
+                        token_list.append(
+                            [sos_id] + hyp.ys[context_size:] + [new_token]
+                        )
+
+        # forward NN LM to get new states and scores
+        if len(token_list) != 0:
+            x_lens = torch.tensor([len(tokens) for tokens in token_list]).to(device)
+            if LM.lm_type == "rnn":
+                tokens_to_score = (
+                    torch.tensor(token_list).to(torch.int64).to(device).reshape(-1, 1)
+                )
+                hs = torch.cat(hs, dim=1).to(device)
+                cs = torch.cat(cs, dim=1).to(device)
+                state = (hs, cs)
+            else:
+                # for transformer LM
+                tokens_list = [torch.tensor(tokens) for tokens in token_list]
+                tokens_to_score = (
+                    torch.nn.utils.rnn.pad_sequence(
+                        tokens_list, batch_first=True, padding_value=0.0
+                    )
+                    .to(device)
+                    .to(torch.int64)
+                )
+
+                state = None
+
+            scores, lm_states = LM.score_token(tokens_to_score, x_lens, state)
+
+        count = 0  # index, used to locate score and lm states
+        for i in range(batch_size):
+            topk_log_probs, topk_indexes = ragged_log_probs[i].topk(beam)
+
+            with warnings.catch_warnings():
+                warnings.simplefilter("ignore")
+                topk_hyp_indexes = (topk_indexes // vocab_size).tolist()
+                topk_token_indexes = (topk_indexes % vocab_size).tolist()
+
+            for k in range(len(topk_hyp_indexes)):
+                hyp_idx = topk_hyp_indexes[k]
+                hyp = A[i][hyp_idx]
+
+                ys = hyp.ys[:]
+
+                # current score of hyp
+                lm_score = hyp.lm_score
+                state = hyp.state
+
+                hyp_log_prob = topk_log_probs[k]  # get score of current hyp
+                new_token = topk_token_indexes[k]
+                if new_token not in (blank_id, unk_id):
+
+                    ys.append(new_token)
+                    state_cost = hyp.state_cost.forward_one_step(new_token)
+
+                    # calculate the score of the latest token
+                    current_ngram_score = state_cost.lm_score - hyp.state_cost.lm_score
+
+                    assert current_ngram_score <= 0.0, (
+                        state_cost.lm_score,
+                        hyp.state_cost.lm_score,
+                    )
+                    # score = score + TDLM_score - LODR_score
+                    # LODR_LM_scale should be a negative number here
+                    hyp_log_prob += (
+                        lm_score[new_token] * lm_scale
+                        + LODR_lm_scale * current_ngram_score
+                    )  # add the lm score
+
+                    lm_score = scores[count]
+                    if LM.lm_type == "rnn":
+                        state = (
+                            lm_states[0][:, count, :].unsqueeze(1),
+                            lm_states[1][:, count, :].unsqueeze(1),
+                        )
+                    count += 1
+                else:
+                    state_cost = hyp.state_cost
+
+                new_hyp = Hypothesis(
+                    ys=ys,
+                    log_prob=hyp_log_prob,
+                    state=state,
+                    lm_score=lm_score,
+                    state_cost=state_cost,
+                )
+                B[i].add(new_hyp)
+
+    B = B + finalized_B
+    best_hyps = [b.get_most_probable(length_norm=True) for b in B]
+
+    sorted_ans = [h.ys[context_size:] for h in best_hyps]
+    ans = []
+    unsorted_indices = packed_encoder_out.unsorted_indices.tolist()
+    for i in range(N):
+        ans.append(sorted_ans[unsorted_indices[i]])
+
+    return ans
+
+
+def greedy_search(
+    model: torch.jit.ScriptModule,
+    encoder_out: torch.Tensor,
+    encoder_out_lens: torch.Tensor,
+) -> List[List[int]]:
+    """Greedy search in batch mode. It hardcodes --max-sym-per-frame=1.
+    Args:
+      model:
+        The transducer model.
+      encoder_out:
+        A 3-D tensor of shape (N, T, C)
+      encoder_out_lens:
+        A 1-D tensor of shape (N,).
+    Returns:
+      Return the decoded results for each utterance.
+    """
+    assert encoder_out.ndim == 3
+    assert encoder_out.size(0) >= 1, encoder_out.size(0)
+
+    packed_encoder_out = torch.nn.utils.rnn.pack_padded_sequence(
+        input=encoder_out,
+        lengths=encoder_out_lens.cpu(),
+        batch_first=True,
+        enforce_sorted=False,
+    )
+
+    device = encoder_out.device
+    blank_id = 0  # hard-code to 0
+
+    batch_size_list = packed_encoder_out.batch_sizes.tolist()
+    N = encoder_out.size(0)
+
+    assert torch.all(encoder_out_lens > 0), encoder_out_lens
+    assert N == batch_size_list[0], (N, batch_size_list)
+
+    context_size = model.decoder.context_size
+    hyps = [[blank_id] * context_size for _ in range(N)]
+
+    decoder_input = torch.tensor(
+        hyps,
+        device=device,
+        dtype=torch.int64,
+    )  # (N, context_size)
+
+    decoder_out = model.decoder(
+        decoder_input,
+        need_pad=torch.tensor([False]),
+    ).squeeze(1)
+
+    offset = 0
+    for batch_size in batch_size_list:
+        start = offset
+        end = offset + batch_size
+        current_encoder_out = packed_encoder_out.data[start:end]
+        # current_encoder_out's shape: (batch_size, encoder_out_dim)
+        offset = end
+
+        decoder_out = decoder_out[:batch_size]
+
+        logits = model.joiner(
+            current_encoder_out,
+            decoder_out,
+        )
+        # logits' shape: (batch_size, vocab_size)
+
+        assert logits.ndim == 2, logits.shape
+        y = logits.argmax(dim=1).tolist()
+        emitted = False
+        for i, v in enumerate(y):
+            if v != blank_id:
+                hyps[i].append(v)
+                emitted = True
+        if emitted:
+            # update decoder output
+            decoder_input = [h[-context_size:] for h in hyps[:batch_size]]
+            decoder_input = torch.tensor(
+                decoder_input,
+                device=device,
+                dtype=torch.int64,
+            )
+            decoder_out = model.decoder(
+                decoder_input,
+                need_pad=torch.tensor([False]),
+            )
+            decoder_out = decoder_out.squeeze(1)
+
+    sorted_ans = [h[context_size:] for h in hyps]
+    ans = []
+    unsorted_indices = packed_encoder_out.unsorted_indices.tolist()
+    for i in range(N):
+        ans.append(sorted_ans[unsorted_indices[i]])
+
+    return ans
+
+
+@torch.no_grad()
+def main():
+    torch.set_num_threads(8)
+
+    device = torch.device("cpu")
+    if torch.cuda.is_available():
+        device = torch.device("cuda", 0)
+
+    model = torch.jit.load("am/jit_script.pt")
+
+    model.eval()
+
+    model.to(device)
+
+    sp = spm.SentencePieceProcessor()
+    sp.load("lang/bpe.model")
+
+    opts = kaldifeat.FbankOptions()
+    opts.device = device
+    opts.frame_opts.dither = 0
+    opts.frame_opts.snip_edges = False
+    opts.frame_opts.samp_freq = 16000
+    opts.mel_opts.num_bins = 80
+
+    fbank = kaldifeat.Fbank(opts)
+
+    all_filenames = sys.argv[1:]
+
+    params = AttributeDict()
+    params.vocab_size = 500
+    params.rnn_lm_embedding_dim = 2048
+    params.rnn_lm_hidden_dim = 2048
+    params.rnn_lm_num_layers = 3
+    params.rnn_lm_tie_weights = True
+    params.lm_epoch = 99
+    params.lm_exp_dir = "lm"
+    params.lm_avg = 1
+
+    LM = LmScorer(
+        lm_type="rnn",
+        params=params,
+        device=device,
+        lm_scale=0.2,
+    )
+    LM.to(device)
+    LM.eval()
+
+    ngram_lm = NgramLm(
+        "lm/2gram.fst.txt",
+        backoff_id=500,
+        is_binary=False,
+    )
+    ngram_lm_scale = -0.1
+
+    for i in range(0, len(all_filenames), 16):
+        filenames = all_filenames[i:i+16]
+        waves = read_sound_files(
+            filenames=filenames,
+        )
+        waves = [w.to(device) for w in waves]
+
+        features = fbank(waves)
+        feature_lengths = [f.size(0) for f in features]
+
+        features = pad_sequence(
+            features,
+            batch_first=True,
+            padding_value=math.log(1e-10),
+        )
+
+        feature_lengths = torch.tensor(feature_lengths, device=device)
+
+        encoder_out, encoder_out_lens = model.encoder(
+            features=features,
+            feature_lengths=feature_lengths,
+        )
+
+        hyps = modified_beam_search_LODR(
+            model=model,
+            encoder_out=encoder_out,
+            encoder_out_lens=encoder_out_lens,
+            beam=20,
+            LODR_lm=ngram_lm,
+            LODR_lm_scale=ngram_lm_scale,
+            LM=LM,
+        )
+
+        for f, hyp in zip(filenames, hyps):
+            words = sp.decode(hyp)
+            print(f"{f.split('/')[-1][0:-4]} {words}")
+
+if __name__ == "__main__":
+    formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
+
+    logging.basicConfig(format=formatter, level=logging.INFO)
+    main()
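(For reference: decode.py reads WAV paths from its command line, batches them 16 at a time, and prints one line per file, the file stem followed by the decoded words. A minimal run against the bundled sample, assuming it is launched from the repository root so the hard-coded am/, lang/ and lm/ paths resolve:

    python3 decode.py test.wav
)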
lang/bpe.model
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c7a756aeb3550417d6b2ed3efde9a7aa3eea54787d4eac011e9cce6090c9c64a
+size 246184
lang/tokens.txt
ADDED
@@ -0,0 +1,502 @@
+<blk> 0
+<sos/eos> 1
+<unk> 2
+е 3
+▁с 4
+т 5
+▁в 6
+▁ 7
+с 8
+н 9
+м 10
+▁и 11
+и 12
+р 13
+▁на 14
+▁по 15
+д 16
+в 17
+у 18
+л 19
+▁не 20
+к 21
+й 22
+я 23
+а 24
+ра 25
+▁у 26
+ли 27
+но 28
+ла 29
+ка 30
+х 31
+ч 32
+о 33
+▁за 34
+з 35
+ю 36
+ло 37
+ни 38
+▁а 39
+то 40
+ст 41
+ва 42
+г 43
+ы 44
+▁что 45
+ш 46
+ть 47
+ь 48
+ль 49
+▁к 50
+▁я 51
+го 52
+▁это 53
+ж 54
+на 55
+ро 56
+▁о 57
+ти 58
+▁то 59
+ле 60
+во 61
+ре 62
+ки 63
+п 64
+да 65
+▁вы 66
+б 67
+та 68
+▁до 69
+ри 70
+▁при 71
+▁как 72
+▁от 73
+▁но 74
+▁про 75
+▁так 76
+ко 77
+▁да 78
+▁д 79
+ви 80
+не 81
+те 82
+ет 83
+ру 84
+▁со 85
+▁об 86
+ди 87
+▁все 88
+▁ко 89
+ми 90
+ой 91
+ве 92
+▁мо 93
+чи 94
+ля 95
+- 96
+▁он 97
+ё 98
+ку 99
+ов 100
+ом 101
+до 102
+че 103
+▁п 104
+ма 105
+ча 106
+▁го 107
+ту 108
+де 109
+ся 110
+▁ма 111
+лу 112
+же 113
+ц 114
+▁т 115
+▁г 116
+▁м 117
+▁из 118
+ем 119
+ан 120
+▁вот 121
+▁во 122
+▁раз 123
+жи 124
+ста 125
+по 126
+ши 127
+ный 128
+▁есть 129
+▁б 130
+ше 131
+▁ли 132
+за 133
+▁з 134
+бо 135
+га 136
+▁ка 137
+▁мы 138
+мо 139
+▁де 140
+сти 141
+▁те 142
+ну 143
+▁под 144
+тор 145
+ить 146
+▁бы 147
+▁ни 148
+▁ф 149
+▁э 150
+▁ре 151
+ные 152
+ать 153
+ры 154
+мен 155
+▁мне 156
+ение 157
+ей 158
+ня 159
+па 160
+▁же 161
+му 162
+пе 163
+ения 164
+ду 165
+ги 166
+ф 167
+сь 168
+об 169
+ны 170
+ется 171
+ты 172
+▁пере 173
+со 174
+ую 175
+ал 176
+▁его 177
+ого 178
+лю 179
+це 180
+▁ну 181
+вер 182
+ной 183
+ци 184
+жа 185
+им 186
+щи 187
+▁па 188
+би 189
+ят 190
+▁ро 191
+▁если 192
+ного 193
+ёт 194
+ск 195
+ных 196
+ход 197
+щ 198
+ще 199
+ая 200
+ров 201
+▁меня 202
+▁ш 203
+▁она 204
+жд 205
+▁ты 206
+бе 207
+▁ку 208
+▁рас 209
+мер 210
+▁для 211
+ца 212
+ря 213
+ень 214
+▁бо 215
+▁бе 216
+си 217
+тель 218
+ная 219
+ным 220
+он 221
+▁ещё 222
+▁будет 223
+▁хо 224
+вы 225
+▁ми 226
+са 227
+▁там 228
+лы 229
+ша 230
+▁ле 231
+▁они 232
+пу 233
+зна 234
+нов 235
+пи 236
+ое 237
+ба 238
+ил 239
+дел 240
+▁был 241
+ско 242
+▁ба 243
+ите 244
+▁друг 245
+▁че 246
+▁вам 247
+▁х 248
+ый 249
+▁вид 250
+лов 251
+▁час 252
+▁было 253
+▁только 254
+▁вас 255
+ке 256
+тер 257
+э 258
+▁или 259
+▁когда 260
+▁сто 261
+▁уже 262
+▁нас 263
+вать 264
+▁пред 265
+▁может 266
+▁включи 267
+▁очень 268
+хо 269
+лись 270
+ха 271
+▁ра 272
+сть 273
+ство 274
+▁пре 275
+ное 276
+ков 277
+▁кон 278
+▁ис 279
+ами 280
+▁оп 281
+▁е 282
+▁тебя 283
+лась 284
+▁сам 285
+ания 286
+раз 287
+ник 288
+зы 289
+▁фильм 290
+▁один 291
+▁эти 292
+ним 293
+▁чтобы 294
+ание 295
+ции 296
+чно 297
+▁сейчас 298
+▁бу 299
+▁нет 300
+ком 301
+▁просто 302
+ём 303
+ён 304
+лось 305
+фи 306
+▁три 307
+из 308
+аться 309
+гу 310
+смотр 311
+▁воз 312
+вор 313
+▁ком 314
+▁пер 315
+зи 316
+▁ж 317
+▁ос 318
+▁можно 319
+▁её 320
+пол 321
+▁два 322
+▁оста 323
+▁вер 324
+нул 325
+▁сер 326
+ент 327
+▁люб 328
+ых 329
+ную 330
+пис 331
+▁чем 332
+▁после 333
+▁сказал 334
+▁пу 335
+тов 336
+айте 337
+▁му 338
+▁человек 339
+▁кар 340
+кой 341
+ешь 342
+ится 343
+▁сезон 344
+ают 345
+бы 346
+▁тв 347
+ился 348
+ально 349
+▁могу 350
+▁мест 351
+▁две 352
+тро 353
+пра 354
+▁боль 355
+ался 356
+▁где 357
+сси 358
+▁пи 359
+ща 360
+форм 361
+▁потому 362
+▁быть 363
+▁двадцать 364
+▁ваш 365
+ёл 366
+про 367
+ность 368
+▁без 369
+▁дела 370
+▁та 371
+ъ 372
+ически 373
+▁пе 374
+▁которые 375
+▁пожалуйста 376
+▁само 377
+▁одно 378
+▁работ 379
+▁пять 380
+ский 381
+▁прав 382
+▁время 383
+▁даже 384
+▁смотрешке 385
+ция 386
+▁здесь 387
+▁четыре 388
+ской 389
+ственно 390
+▁этого 391
+▁гр 392
+▁нужно 393
+▁билет 394
+▁рук 395
+ности 396
+иться 397
+▁чи 398
+▁какой 399
+▁тре 400
+▁ему 401
+ются 402
+ительно 403
+▁стан 404
+▁который 405
+▁свои 406
+▁семь 407
+▁стар 408
+▁себя 409
+▁хочу 410
+▁сколько 411
+▁теперь 412
+▁буду 413
+▁какие 414
+▁салют 415
+▁больше 416
+▁кино 417
+▁потом 418
+▁восемь 419
+▁ничего 420
+▁след 421
+▁минут 422
+ская 423
+▁поезд 424
+▁сериал 425
+▁вопрос 426
+▁спасибо 427
+▁канал 428
+▁говорит 429
+▁хорошо 430
+▁жив 431
+▁девять 432
+▁через 433
+▁шесть 434
+▁конечно 435
+▁тогда 436
+▁покажи 437
+▁стал 438
+▁вообще 439
+ывает 440
+▁интерес 441
+▁себе 442
+▁крас 443
+▁тридцать 444
+▁сегодня 445
+▁действ 446
+▁сторон 447
+▁чу 448
+ении 449
+▁почему 450
+▁более 451
+▁поэтому 452
+▁десят 453
+▁отправ 454
+▁знаю 455
+ских 456
+▁тысяч 457
+▁всё 458
+▁можешь 459
+▁значит 460
+▁против 461
+▁ответ 462
+▁найди 463
+▁люди 464
+▁одна 465
+▁начал 466
+▁серия 467
+▁между 468
+▁свет 469
+▁именно 470
+▁телефон 471
+нибудь 472
+▁пятьдесят 473
+очка 474
+▁времени 475
+▁цел 476
+▁возможно 477
+▁сказать 478
+▁глаза 479
+▁называ 480
+▁хотел 481
+▁сорок 482
+▁поставь 483
+▁первый 484
+▁связ 485
+▁которая 486
+▁электр 487
+▁например 488
+▁ведь 489
+▁сделать 490
+▁говорил 491
+▁благо 492
+ировать 493
+▁посмотреть 494
+▁всегда 495
+▁несколько 496
+▁людей 497
+▁случае 498
+▁гарри 499
+#0 500
+#1 501
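(The table above maps BPE pieces to integer IDs; the trailing #0 and #1 are disambiguation symbols for the FST side and are never emitted by the network. A minimal sketch of how decode.py's final sp.decode(hyp) step consumes these IDs, assuming sentencepiece is installed and that piece IDs in bpe.model follow this table, which the matching order of lang/unigram_500.vocab suggests:

    import sentencepiece as spm

    sp = spm.SentencePieceProcessor()
    sp.load("lang/bpe.model")

    # IDs follow lang/tokens.txt, e.g. 45 -> "▁что", 53 -> "▁это".
    print(sp.decode([45, 53]))  # -> "что это"
)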
lang/unigram_500.vocab
ADDED
@@ -0,0 +1,500 @@
+<blk> 0
+<sos/eos> 0
+<unk> 0
+е -3.72421
+▁с -4.00711
+т -4.08971
+▁в -4.14234
+▁ -4.14364
+с -4.15149
+н -4.16546
+м -4.20176
+▁и -4.21221
+и -4.29997
+р -4.37977
+▁на -4.3817
+▁по -4.38887
+д -4.41525
+в -4.41704
+у -4.43597
+л -4.53253
+▁не -4.56287
+к -4.58558
+й -4.60495
+я -4.64027
+а -4.64466
+ра -4.75414
+▁у -4.7655
+ли -4.81635
+но -4.86806
+ла -4.87179
+ка -4.92484
+х -4.9503
+ч -4.95321
+о -4.98385
+▁за -5.05821
+з -5.05928
+ю -5.08551
+ло -5.11068
+ни -5.1306
+▁а -5.13288
+то -5.13924
+ст -5.14875
+ва -5.15182
+г -5.16538
+ы -5.18352
+▁что -5.22316
+ш -5.22707
+ть -5.23793
+ь -5.2509
+ль -5.25799
+▁к -5.26308
+▁я -5.26591
+го -5.27411
+▁это -5.29806
+ж -5.3083
+на -5.30963
+ро -5.3109
+▁о -5.34496
+ти -5.3955
+▁то -5.40496
+ле -5.41254
+во -5.4165
+ре -5.4188
+ки -5.47916
+п -5.48377
+да -5.52795
+▁вы -5.52823
+б -5.52901
+та -5.54602
+▁до -5.55857
+ри -5.56377
+▁при -5.58046
+▁как -5.58207
+▁от -5.63345
+▁но -5.63407
+▁про -5.63691
+▁так -5.6499
+ко -5.65805
+▁да -5.67491
+▁д -5.67721
+ви -5.68739
+не -5.6981
+те -5.74268
+ет -5.77888
+ру -5.78523
+▁со -5.80155
+▁об -5.80862
+ди -5.85919
+▁все -5.88436
+▁ко -5.90038
+ми -5.91224
+ой -5.91948
+ве -5.92044
+▁мо -5.9268
+чи -5.93594
+ля -5.94037
+- -5.95904
+▁он -5.97629
+ё -5.99857
+ку -6.00185
+ов -6.01007
+ом -6.0288
+до -6.04131
+че -6.0471
+▁п -6.05797
+ма -6.06525
+ча -6.07659
+▁го -6.08481
+ту -6.0907
+де -6.09127
+ся -6.10754
+▁ма -6.12254
+лу -6.12261
+же -6.15441
+ц -6.17019
+▁т -6.17707
+▁г -6.18064
+▁м -6.187
+▁из -6.18745
+ем -6.18764
+ан -6.1898
+▁вот -6.20622
+▁во -6.21312
+▁раз -6.21391
+жи -6.21908
+ста -6.2236
+по -6.23241
+ши -6.23353
+ный -6.23429
+▁есть -6.25535
+▁б -6.25862
+ше -6.26684
+▁ли -6.2692
+за -6.27292
+▁з -6.27702
+бо -6.29133
+га -6.29361
+▁ка -6.30692
+▁мы -6.30719
+мо -6.30974
+▁де -6.31045
+сти -6.32652
+▁те -6.34079
+ну -6.36912
+▁под -6.37186
+тор -6.37209
+ить -6.37813
+▁бы -6.38675
+▁ни -6.38974
+▁ф -6.395
+▁э -6.39689
+▁ре -6.4069
+ные -6.40992
+ать -6.41185
+ры -6.41376
+мен -6.42197
+▁мне -6.42556
+ение -6.43243
+ей -6.44644
+ня -6.45298
+па -6.45865
+▁же -6.46201
+му -6.48738
+пе -6.49161
+ения -6.49489
+ду -6.50284
+ги -6.50349
+ф -6.50672
+сь -6.50802
+об -6.51477
+ны -6.51839
+ется -6.5246
+ты -6.52819
+▁пере -6.53089
+со -6.53688
+ую -6.53716
+ал -6.53947
+▁его -6.54683
+ого -6.5558
+лю -6.57673
+це -6.60805
+▁ну -6.61118
+вер -6.61359
+ной -6.61434
+ци -6.61518
+жа -6.62145
+им -6.62822
+щи -6.6424
+▁па -6.65636
+би -6.65941
+ят -6.67541
+▁ро -6.69099
+▁если -6.70234
+ного -6.71022
+ёт -6.71453
+ск -6.71786
+ных -6.71869
+ход -6.72616
+щ -6.7273
+ще -6.73349
+ая -6.73537
+ров -6.75237
+▁меня -6.75302
+▁ш -6.75798
+▁она -6.76858
+жд -6.77673
+▁ты -6.77867
+бе -6.79111
+▁ку -6.79506
+▁рас -6.79763
+мер -6.79882
+▁для -6.8128
+ца -6.82069
+ря -6.83478
+ень -6.83977
+▁бо -6.84372
+▁бе -6.84714
+си -6.85532
+тель -6.85941
+ная -6.86527
+ным -6.86943
+он -6.88114
+▁ещё -6.8825
+▁будет -6.88757
+▁хо -6.89723
+вы -6.89731
+▁ми -6.90201
+са -6.90676
+▁там -6.91816
+лы -6.92175
+ша -6.92869
+▁ле -6.93238
+▁они -6.9348
+пу -6.93718
+зна -6.93926
+нов -6.94786
+пи -6.95597
+ое -6.95774
+ба -6.9586
+ил -6.96984
+дел -6.98119
+▁был -6.98238
+ско -6.98854
+▁ба -6.98951
+ите -6.99367
+▁друг -6.9943
+▁че -6.99802
+▁вам -7.01029
+▁х -7.01148
+ый -7.01826
+▁вид -7.01874
+лов -7.02428
+▁час -7.02801
+▁было -7.03338
+▁только -7.03819
+▁вас -7.04398
+ке -7.05017
+тер -7.06037
+э -7.06244
+▁или -7.06254
+▁когда -7.06466
+▁сто -7.07688
+▁уже -7.0945
+▁нас -7.0947
+вать -7.09646
+▁пред -7.1066
+▁может -7.10866
+▁включи -7.1111
+▁очень -7.11544
+хо -7.11665
+лись -7.12812
+ха -7.13242
+▁ра -7.13898
+сть -7.13935
+ство -7.14754
+▁пре -7.15134
+ное -7.15238
+ков -7.15586
+▁кон -7.15816
+▁ис -7.1604
+ами -7.16615
+▁оп -7.18563
+▁е -7.18591
+▁тебя -7.18813
+лась -7.19185
+▁сам -7.19982
+ания -7.19995
+раз -7.20058
+ник -7.20603
+зы -7.20843
+▁фильм -7.21124
+▁один -7.21558
+▁эти -7.21559
+ним -7.21944
+▁чтобы -7.22289
+ание -7.23062
+ции -7.23383
+чно -7.23714
+▁сейчас -7.24093
+▁бу -7.24279
+▁нет -7.24342
+ком -7.24996
+▁просто -7.2506
+ём -7.2565
+ён -7.26084
+лось -7.26338
+фи -7.26581
+▁три -7.27362
+из -7.28159
+аться -7.2843
+гу -7.28869
+смотр -7.2938
+▁воз -7.30375
+вор -7.3041
+▁ком -7.31052
+▁пер -7.3106
+зи -7.3194
+▁ж -7.32187
+▁ос -7.3302
+▁можно -7.3382
+▁её -7.33991
+пол -7.34408
+▁два -7.34513
+▁оста -7.3556
+▁вер -7.36032
+нул -7.36142
+▁сер -7.36736
+ент -7.37146
+▁люб -7.37199
+ых -7.37646
+ную -7.37825
+пис -7.37833
+▁чем -7.38971
+▁после -7.39552
+▁сказал -7.3956
+▁пу -7.39903
+тов -7.39953
+айте -7.40175
+▁му -7.41591
+▁человек -7.42485
+▁кар -7.427
+кой -7.42935
+ешь -7.43681
+ится -7.44416
+▁сезон -7.45472
+ают -7.46059
+бы -7.46086
+▁тв -7.46131
+ился -7.46134
+ально -7.46924
+▁могу -7.47201
+▁мест -7.47594
+▁две -7.48553
+тро -7.49256
+пра -7.49279
+▁боль -7.49293
+ался -7.49695
+▁где -7.50004
+сси -7.50442
+▁пи -7.50621
+ща -7.52085
+форм -7.54043
+▁потому -7.54055
+▁быть -7.55272
+▁двадцать -7.55597
+▁ваш -7.5597
+ёл -7.56701
+про -7.57489
+ность -7.57697
+▁без -7.57787
+▁дела -7.58268
+▁та -7.58835
+ъ -7.59162
+ически -7.59388
+▁пе -7.59514
+▁которые -7.59823
+▁пожалуйста -7.60476
+▁само -7.62166
+▁одно -7.62488
+▁работ -7.62587
+▁пять -7.63438
+ский -7.63578
+▁прав -7.64631
+▁время -7.64938
+▁даже -7.65259
+▁смотрешке -7.65371
+ция -7.65981
+▁здесь -7.6756
+▁четыре -7.67719
+ской -7.68614
+ственно -7.68683
+▁этого -7.68729
+▁гр -7.68949
+▁нужно -7.68976
+▁билет -7.69119
+▁рук -7.69124
+ности -7.70224
+иться -7.71736
+▁чи -7.74136
+▁какой -7.74534
+▁тре -7.75798
+▁ему -7.76136
+ются -7.76285
+ительно -7.76457
+▁стан -7.77955
+▁который -7.78311
+▁свои -7.79485
+▁семь -7.80293
+▁стар -7.82062
+▁себя -7.829
+▁хочу -7.82951
+▁сколько -7.84939
+▁теперь -7.86712
+▁буду -7.88652
+▁какие -7.89257
+▁салют -7.89721
+▁больше -7.90363
+▁кино -7.9076
+▁потом -7.90802
+▁восемь -7.92117
+▁ничего -7.92557
+▁след -7.93644
+▁минут -7.93963
+ская -7.94281
+▁поезд -7.94517
+▁сериал -7.95059
+▁вопрос -7.96073
+▁спасибо -7.96309
+▁канал -7.9692
+▁говорит -7.97006
+▁хорошо -7.9883
+▁жив -7.99152
+▁девять -7.99548
+▁через -8.00708
+▁шесть -8.01468
+▁конечно -8.01557
+▁тогда -8.03135
+▁покажи -8.0337
+▁стал -8.03473
+▁вообще -8.05943
+ывает -8.06989
+▁интерес -8.07217
+▁себе -8.09189
+▁крас -8.10057
+▁тридцать -8.10802
+▁сегодня -8.11505
+▁действ -8.12242
+▁сторон -8.12496
+▁чу -8.13269
+ении -8.14634
+▁почему -8.15047
+▁более -8.15271
+▁поэтому -8.16289
+▁десят -8.16801
+▁отправ -8.16928
+▁знаю -8.17321
+ских -8.18712
+▁тысяч -8.18932
+▁всё -8.20101
+▁можешь -8.2129
+▁значит -8.214
+▁против -8.22679
+▁ответ -8.22794
+▁найди -8.23782
+▁люди -8.23857
+▁одна -8.24067
+▁начал -8.24422
+▁серия -8.25009
+▁между -8.26
+▁свет -8.2639
+▁именно -8.2677
+▁телефон -8.27473
+нибудь -8.29168
+▁пятьдесят -8.30322
+очка -8.30543
+▁времени -8.31148
+▁цел -8.33721
+▁возможно -8.35705
+▁сказать -8.35962
+▁глаза -8.37791
+▁называ -8.3883
+▁хотел -8.39147
+▁сорок -8.39892
+▁поставь -8.41498
+▁первый -8.41767
+▁связ -8.4299
+▁которая -8.43202
+▁электр -8.43714
+▁например -8.45837
+▁ведь -8.45906
+▁сделать -8.46666
+▁говорил -8.47077
+▁благо -8.4716
+ировать -8.48783
+▁посмотреть -8.48969
+▁всегда -8.49153
+▁несколько -8.49574
+▁людей -8.49624
+▁случае -8.5061
+▁гарри -8.5218
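(The second column above is the SentencePiece unigram score, i.e. the natural-log probability of each piece learned in training; the three specials carry a placeholder 0. A quick sanity check one could run from the repository root — if the scores are log-probabilities, their exponentials should sum to roughly 1:

    import math

    total = 0.0
    with open("lang/unigram_500.vocab", encoding="utf-8") as f:
        for line in f:
            piece, score = line.rsplit(maxsplit=1)
            if not piece.startswith("<"):  # skip <blk>, <sos/eos>, <unk>
                total += math.exp(float(score))
    print(total)  # expected to be close to 1.0
)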
lm/2gram.fst.txt
ADDED
The diff for this file is too large to render.
lm/epoch-99.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0fb6cbccb971a9cfc50c1dc8d2311eee51095ceff68e9bc03810e4176a816feb
+size 406952299
test.wav
ADDED
Binary file (227 kB)