File size: 9,323 Bytes
1ce325b |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 |
import os
cimport _kenlm
cdef bytes as_str(data):
if isinstance(data, bytes):
return data
elif isinstance(data, unicode):
return data.encode('utf8')
raise TypeError('Cannot convert %s to string' % type(data))
cdef class FullScoreReturn:
"""
Wrapper around FullScoreReturn.
Notes:
`prob` has been renamed to `log_prob`
`oov` has been added to flag whether the word is OOV
"""
cdef float log_prob
cdef int ngram_length
cdef bint oov
def __cinit__(self, log_prob, ngram_length, oov):
self.log_prob = log_prob
self.ngram_length = ngram_length
self.oov = oov
def __repr__(self):
return '{0}({1}, {2}, {3})'.format(self.__class__.__name__, repr(self.log_prob), repr(self.ngram_length), repr(self.oov))
property log_prob:
def __get__(self):
return self.log_prob
property ngram_length:
def __get__(self):
return self.ngram_length
property oov:
def __get__(self):
return self.oov
cdef class State:
"""
Wrapper around lm::ngram::State so that python code can make incremental queries.
Notes:
* rich comparisons
* hashable
"""
cdef _kenlm.State _c_state
def __richcmp__(State qa, State qb, int op):
r = qa._c_state.Compare(qb._c_state)
if op == 0: # <
return r < 0
elif op == 1: # <=
return r <= 0
elif op == 2: # ==
return r == 0
elif op == 3: # !=
return r != 0
elif op == 4: # >
return r > 0
else: # >=
return r >= 0
def __hash__(self):
return _kenlm.hash_value(self._c_state)
def __copy__(self):
ret = State()
ret._c_state = self._c_state
return ret
def __deepcopy__(self):
return self.__copy__()
class LoadMethod:
LAZY = _kenlm.LAZY
POPULATE_OR_LAZY = _kenlm.POPULATE_OR_LAZY
POPULATE_OR_READ = _kenlm.POPULATE_OR_READ
READ = _kenlm.READ
PARALLEL_READ = _kenlm.PARALLEL_READ
class ARPALoadComplain:
ALL = _kenlm.ALL
EXPENSIVE = _kenlm.EXPENSIVE
NONE = _kenlm.NONE
cdef class Config:
"""
Wrapper around lm::ngram::Config.
Pass this to Model's constructor to set configuration options.
"""
cdef _kenlm.Config _c_config
def __init__(self):
self._c_config = _kenlm.Config()
property load_method:
def __get__(self):
return self._c_config.load_method
def __set__(self, to):
self._c_config.load_method = to
property show_progress:
def __get__(self):
return self._c_config.show_progress
def __set__(self, to):
self._c_config.show_progress = to
property arpa_complain:
def __get__(self):
return self._c_config.arpa_complain
def __set__(self, to):
self._c_config.arpa_complain = to
cdef class Model:
"""
Wrapper around lm::ngram::Model.
"""
cdef _kenlm.Model* model
cdef public bytes path
cdef _kenlm.const_Vocabulary* vocab
def __init__(self, path, Config config = Config()):
"""
Load the language model.
:param path: path to an arpa file or a kenlm binary file.
:param config: configuration options (see lm/config.hh for documentation)
"""
self.path = os.path.abspath(as_str(path))
try:
self.model = _kenlm.LoadVirtual(self.path, config._c_config)
except RuntimeError as exception:
exception_message = str(exception).replace('\n', ' ')
raise IOError('Cannot read model \'{}\' ({})'.format(path, exception_message))\
from exception
self.vocab = &self.model.BaseVocabulary()
def __dealloc__(self):
del self.model
property order:
def __get__(self):
return self.model.Order()
def score(self, sentence, bos = True, eos = True):
"""
Return the log10 probability of a string. By default, the string is
treated as a sentence.
return log10 p(sentence </s> | <s>)
If you do not want to condition on the beginning of sentence, pass
bos = False
Never include <s> as part of the string. That would be predicting the
beginning of sentence. Language models are only supposed to condition
on it as context.
Similarly, the end of sentence token </s> can be omitted with
eos = False
Since language models explicitly predict </s>, it can be part of the
string.
Examples:
#Good: returns log10 p(this is a sentence . </s> | <s>)
model.score("this is a sentence .")
#Good: same as the above but more explicit
model.score("this is a sentence .", bos = True, eos = True)
#Bad: never include <s>
model.score("<s> this is a sentence")
#Bad: never include <s>, even if bos = False.
model.score("<s> this is a sentence", bos = False)
#Good: returns log10 p(a fragment)
model.score("a fragment", bos = False, eos = False)
#Good: returns log10 p(a fragment </s>)
model.score("a fragment", bos = False, eos = True)
#Ok, but bad practice: returns log10 p(a fragment </s>)
#Unlike <s>, the end of sentence token </s> can appear explicitly.
model.score("a fragment </s>", bos = False, eos = False)
"""
if bos and eos:
return _kenlm.ScoreSentence(self.model, as_str(sentence))
cdef list words = as_str(sentence).split()
cdef _kenlm.State state
if bos:
self.model.BeginSentenceWrite(&state)
else:
self.model.NullContextWrite(&state)
cdef _kenlm.State out_state
cdef float total = 0
for word in words:
total += self.model.BaseScore(&state, self.vocab.Index(word), &out_state)
state = out_state
if eos:
total += self.model.BaseScore(&state, self.vocab.EndSentence(), &out_state)
return total
def perplexity(self, sentence):
"""
Compute perplexity of a sentence.
@param sentence One full sentence to score. Do not include <s> or </s>.
"""
words = len(as_str(sentence).split()) + 1 # For </s>
return 10.0**(-self.score(sentence) / words)
def full_scores(self, sentence, bos = True, eos = True):
"""
full_scores(sentence, bos = True, eos = True) -> generate full scores (prob, ngram length, oov)
@param sentence is a string (do not use boundary symbols)
@param bos should kenlm add a bos state
@param eos should kenlm add an eos state
"""
cdef list words = as_str(sentence).split()
cdef _kenlm.State state
if bos:
self.model.BeginSentenceWrite(&state)
else:
self.model.NullContextWrite(&state)
cdef _kenlm.State out_state
cdef _kenlm.FullScoreReturn ret
cdef float total = 0
cdef _kenlm.WordIndex wid
for word in words:
wid = self.vocab.Index(word)
ret = self.model.BaseFullScore(&state, wid, &out_state)
yield (ret.prob, ret.ngram_length, wid == 0)
state = out_state
if eos:
ret = self.model.BaseFullScore(&state,
self.vocab.EndSentence(), &out_state)
yield (ret.prob, ret.ngram_length, False)
def BeginSentenceWrite(self, State state):
"""Change the given state to a BOS state."""
self.model.BeginSentenceWrite(&state._c_state)
def NullContextWrite(self, State state):
"""Change the given state to a NULL state."""
self.model.NullContextWrite(&state._c_state)
def BaseScore(self, State in_state, str word, State out_state):
"""
Return p(word|in_state) and update the output state.
Wrapper around model.BaseScore(in_state, Index(word), out_state)
:param word: the suffix
:param state: the context (defaults to NullContext)
:returns: p(word|state)
"""
cdef float total = self.model.BaseScore(&in_state._c_state, self.vocab.Index(as_str(word)), &out_state._c_state)
return total
def BaseFullScore(self, State in_state, str word, State out_state):
"""
Wrapper around model.BaseFullScore(in_state, Index(word), out_state)
:param word: the suffix
:param state: the context (defaults to NullContext)
:returns: FullScoreReturn(word|state)
"""
cdef _kenlm.WordIndex wid = self.vocab.Index(as_str(word))
cdef _kenlm.FullScoreReturn ret = self.model.BaseFullScore(&in_state._c_state, wid, &out_state._c_state)
return FullScoreReturn(ret.prob, ret.ngram_length, wid == 0)
def __contains__(self, word):
cdef bytes w = as_str(word)
return (self.vocab.Index(w) != 0)
def __repr__(self):
return '<Model from {0}>'.format(os.path.basename(self.path))
def __reduce__(self):
return (Model, (self.path,))
class LanguageModel(Model):
"""Backwards compatability stub. Use Model."""
|