acecalisto3 committed on
Commit
1bd0df4
·
verified ·
1 Parent(s): 735e695

Create gensim/corpora/dictionary.py

Files changed (1)
  1. gensim/corpora/dictionary.py +781 -0
gensim/corpora/dictionary.py ADDED
@@ -0,0 +1,781 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright (C) 2010 Radim Rehurek <radimrehurek@seznam.cz>
# Licensed under the GNU LGPL v2.1 - https://www.gnu.org/licenses/old-licenses/lgpl-2.1.en.html

"""This module implements the concept of a Dictionary -- a mapping between words and their integer ids."""

from collections import defaultdict
from collections.abc import Mapping
import logging
import itertools
from typing import Optional, List, Tuple

from gensim import utils


logger = logging.getLogger(__name__)


class Dictionary(utils.SaveLoad, Mapping):
    """Dictionary encapsulates the mapping between normalized words and their integer ids.

    Notable instance attributes:

    Attributes
    ----------
    token2id : dict of (str, int)
        token -> token_id. I.e. the reverse mapping to `self[token_id]`.
    cfs : dict of (int, int)
        Collection frequencies: token_id -> how many instances of this token are contained in the documents.
    dfs : dict of (int, int)
        Document frequencies: token_id -> how many documents contain this token.
    num_docs : int
        Number of documents processed.
    num_pos : int
        Total number of corpus positions (number of processed words).
    num_nnz : int
        Total number of non-zeroes in the BOW matrix (sum of the number of unique
        words per document over the entire corpus).

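    Examples
    --------
    A short, illustrative sketch of how these statistics fill in (the ids and counts
    below are just what this toy corpus happens to produce):

    .. sourcecode:: pycon

        >>> from gensim.corpora import Dictionary
        >>>
        >>> dct = Dictionary([["a", "a", "b"], ["a", "c"]])
        >>> dct.num_docs, dct.num_pos, dct.num_nnz
        (2, 5, 4)
        >>> dct.dfs[dct.token2id['a']], dct.cfs[dct.token2id['a']]
        (2, 3)
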
    """
    def __init__(self, documents=None, prune_at=2000000):
        """

        Parameters
        ----------
        documents : iterable of iterable of str, optional
            Documents to be used to initialize the mapping and collect corpus statistics.
        prune_at : int, optional
            Dictionary will try to keep no more than `prune_at` words in its mapping, to limit its RAM
            footprint; the correctness is not guaranteed.
            Use :meth:`~gensim.corpora.dictionary.Dictionary.filter_extremes` to perform proper filtering.

        Examples
        --------
        .. sourcecode:: pycon

            >>> from gensim.corpora import Dictionary
            >>>
            >>> texts = [['human', 'interface', 'computer']]
            >>> dct = Dictionary(texts)  # initialize a Dictionary
            >>> dct.add_documents([["cat", "say", "meow"], ["dog"]])  # add more documents (extend the vocabulary)
            >>> dct.doc2bow(["dog", "computer", "non_existent_word"])
            [(0, 1), (6, 1)]

        """
        self.token2id = {}
        self.id2token = {}
        self.cfs = {}
        self.dfs = {}

        self.num_docs = 0
        self.num_pos = 0
        self.num_nnz = 0

        if documents is not None:
            self.add_documents(documents, prune_at=prune_at)
            self.add_lifecycle_event(
                "created",
                msg=f"built {self} from {self.num_docs} documents (total {self.num_pos} corpus positions)",
            )

    def __getitem__(self, tokenid):
        """Get the string token that corresponds to `tokenid`.

        Parameters
        ----------
        tokenid : int
            Id of token.

        Returns
        -------
        str
            Token corresponding to `tokenid`.

        Raises
        ------
        KeyError
            If this Dictionary doesn't contain such `tokenid`.

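        Examples
        --------
        A minimal sketch of the id -> token lookup:

        .. sourcecode:: pycon

            >>> from gensim.corpora import Dictionary
            >>>
            >>> dct = Dictionary([["a", "b"]])
            >>> dct[0]
            'a'
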
        """
        if len(self.id2token) != len(self.token2id):
            # the word->id mapping has changed (presumably via add_documents);
            # recompute id->word accordingly
            self.id2token = utils.revdict(self.token2id)
        return self.id2token[tokenid]  # will throw for non-existent ids

    def __iter__(self):
        """Iterate over all tokens."""
        return iter(self.keys())

    # restore Py2-style dict API
    iterkeys = __iter__

    def iteritems(self):
        return self.items()

    def itervalues(self):
        return self.values()

    def keys(self):
        """Get all stored ids.

        Returns
        -------
        list of int
            List of all token ids.

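        Examples
        --------
        A short sketch:

        .. sourcecode:: pycon

            >>> from gensim.corpora import Dictionary
            >>>
            >>> dct = Dictionary([["a", "b"]])
            >>> sorted(dct.keys())
            [0, 1]
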
        """
        return list(self.token2id.values())

    def __len__(self):
        """Get number of stored tokens.

        Returns
        -------
        int
            Number of stored tokens.

        """
        return len(self.token2id)

    def __str__(self):
        """Return a human-readable representation, previewing up to 5 tokens."""
        some_keys = list(itertools.islice(self.token2id.keys(), 5))
        return "%s<%i unique tokens: %s%s>" % (
            self.__class__.__name__, len(self), some_keys, '...' if len(self) > 5 else ''
        )

    @staticmethod
    def from_documents(documents):
        """Create :class:`~gensim.corpora.dictionary.Dictionary` from `documents`.

        Equivalent to `Dictionary(documents=documents)`.

        Parameters
        ----------
        documents : iterable of iterable of str
            Input corpus.

        Returns
        -------
        :class:`~gensim.corpora.dictionary.Dictionary`
            Dictionary initialized from `documents`.

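        Examples
        --------
        A minimal sketch:

        .. sourcecode:: pycon

            >>> from gensim.corpora import Dictionary
            >>>
            >>> dct = Dictionary.from_documents([["a", "b"], ["b", "c"]])
            >>> len(dct)
            3
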
        """
        return Dictionary(documents=documents)

    def add_documents(self, documents, prune_at=2000000):
        """Update dictionary from a collection of `documents`.

        Parameters
        ----------
        documents : iterable of iterable of str
            Input corpus. All tokens should be already **tokenized and normalized**.
        prune_at : int, optional
            Dictionary will try to keep no more than `prune_at` words in its mapping, to limit its RAM
            footprint; the correctness is not guaranteed.
            Use :meth:`~gensim.corpora.dictionary.Dictionary.filter_extremes` to perform proper filtering.

        Examples
        --------
        .. sourcecode:: pycon

            >>> from gensim.corpora import Dictionary
            >>>
            >>> corpus = ["máma mele maso".split(), "ema má máma".split()]
            >>> dct = Dictionary(corpus)
            >>> len(dct)
            5
            >>> dct.add_documents([["this", "is", "sparta"], ["just", "joking"]])
            >>> len(dct)
            10

        """
        for docno, document in enumerate(documents):
            # log progress & run a regular check for pruning, once every 10k docs
            if docno % 10000 == 0:
                if prune_at is not None and len(self) > prune_at:
                    self.filter_extremes(no_below=0, no_above=1.0, keep_n=prune_at)
                logger.info("adding document #%i to %s", docno, self)

            # update Dictionary with the document
            self.doc2bow(document, allow_update=True)  # ignore the result, here we only care about updating token ids

        logger.info("built %s from %i documents (total %i corpus positions)", self, self.num_docs, self.num_pos)

    def doc2bow(self, document, allow_update=False, return_missing=False):
        """Convert `document` into the bag-of-words (BoW) format = list of `(token_id, token_count)` tuples.

        Parameters
        ----------
        document : list of str
            Input document.
        allow_update : bool, optional
            Update self, by adding new tokens from `document` and updating internal corpus statistics.
        return_missing : bool, optional
            Return missing tokens (tokens present in `document` but not in self) with frequencies?

        Returns
        -------
        list of (int, int)
            BoW representation of `document`.
        list of (int, int), dict of (str, int)
            If `return_missing` is True, return BoW representation of `document` + dictionary with missing
            tokens and their frequencies.

        Examples
        --------
        .. sourcecode:: pycon

            >>> from gensim.corpora import Dictionary
            >>> dct = Dictionary(["máma mele maso".split(), "ema má máma".split()])
            >>> dct.doc2bow(["this", "is", "máma"])
            [(2, 1)]
            >>> dct.doc2bow(["this", "is", "máma"], return_missing=True)
            ([(2, 1)], {'is': 1, 'this': 1})
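            >>> # a sketch of `allow_update=True`: unseen tokens get fresh ids and corpus statistics are updated
            >>> dct.doc2bow(["this", "is", "máma"], allow_update=True)
            [(2, 1), (5, 1), (6, 1)]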

        """
        if isinstance(document, str):
            raise TypeError("doc2bow expects an array of unicode tokens on input, not a single string")

        # Construct (word, frequency) mapping.
        counter = defaultdict(int)
        for w in document:
            counter[w if isinstance(w, str) else str(w, 'utf-8')] += 1

        token2id = self.token2id
        if allow_update or return_missing:
            missing = sorted(x for x in counter.items() if x[0] not in token2id)
            if allow_update:
                for w, _ in missing:
                    # new id = number of ids made so far;
                    # NOTE this assumes there are no gaps in the id sequence!
                    token2id[w] = len(token2id)
        result = {token2id[w]: freq for w, freq in counter.items() if w in token2id}

        if allow_update:
            self.num_docs += 1
            self.num_pos += sum(counter.values())
            self.num_nnz += len(result)
            # keep track of document and collection frequencies
            for tokenid, freq in result.items():
                self.cfs[tokenid] = self.cfs.get(tokenid, 0) + freq
                self.dfs[tokenid] = self.dfs.get(tokenid, 0) + 1

        # return tokenids, in ascending id order
        result = sorted(result.items())
        if return_missing:
            return result, dict(missing)
        else:
            return result

    def doc2idx(self, document, unknown_word_index=-1):
        """Convert `document` (a list of words) into a list of indexes = list of `token_id`.
        Replace all unknown words, i.e. words not in the dictionary, with the index set via `unknown_word_index`.

        Parameters
        ----------
        document : list of str
            Input document.
        unknown_word_index : int, optional
            Index to use for words not in the dictionary.

        Returns
        -------
        list of int
            Token ids for tokens in `document`, in the same order.

        Examples
        --------
        .. sourcecode:: pycon

            >>> from gensim.corpora import Dictionary
            >>>
            >>> corpus = [["a", "a", "b"], ["a", "c"]]
            >>> dct = Dictionary(corpus)
            >>> dct.doc2idx(["a", "a", "c", "not_in_dictionary", "c"])
            [0, 0, 2, -1, 2]
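            >>> # a sketch with a custom `unknown_word_index`, here one past the vocabulary
            >>> dct.doc2idx(["a", "not_in_dictionary"], unknown_word_index=len(dct))
            [0, 3]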

        """
        if isinstance(document, str):
            raise TypeError("doc2idx expects an array of unicode tokens on input, not a single string")

        document = [word if isinstance(word, str) else str(word, 'utf-8') for word in document]
        return [self.token2id.get(word, unknown_word_index) for word in document]

    def filter_extremes(self, no_below=5, no_above=0.5, keep_n=100000, keep_tokens=None):
        """Filter out tokens in the dictionary by their frequency.

        Parameters
        ----------
        no_below : int, optional
            Keep tokens which are contained in at least `no_below` documents.
        no_above : float, optional
            Keep tokens which are contained in no more than `no_above` documents
            (fraction of total corpus size, not an absolute number).
        keep_n : int, optional
            Keep only the first `keep_n` most frequent tokens.
        keep_tokens : iterable of str, optional
            Iterable of tokens that **must** stay in dictionary after filtering.

        Notes
        -----
        This removes all tokens in the dictionary that are:

        #. Less frequent than `no_below` documents (absolute number, e.g. `5`), or
        #. More frequent than `no_above` documents (fraction of the total corpus size, e.g. `0.3`).
        #. After (1) and (2), keep only the first `keep_n` most frequent tokens (or keep all if `keep_n=None`).

        After the pruning, resulting gaps in word ids are shrunk.
        Due to this gap shrinking, **the same word may have a different word id before and after the call
        to this function!** See :class:`gensim.models.VocabTransform` and the
        `dedicated FAQ entry <https://github.com/RaRe-Technologies/gensim/wiki/Recipes-&-FAQ#q8-how-can-i-filter-a-saved-corpus-and-its-corresponding-dictionary>`_ on how  # noqa
        to transform a corpus built with a dictionary before pruning.

        Examples
        --------
        .. sourcecode:: pycon

            >>> from gensim.corpora import Dictionary
            >>>
            >>> corpus = [["máma", "mele", "maso"], ["ema", "má", "máma"]]
            >>> dct = Dictionary(corpus)
            >>> len(dct)
            5
            >>> dct.filter_extremes(no_below=1, no_above=0.5, keep_n=1)
            >>> len(dct)
            1

        """
        no_above_abs = int(no_above * self.num_docs)  # convert fractional threshold to absolute threshold

        # determine which tokens to keep
        if keep_tokens:
            keep_ids = {self.token2id[v] for v in keep_tokens if v in self.token2id}
            good_ids = [
                v for v in self.token2id.values()
                if no_below <= self.dfs.get(v, 0) <= no_above_abs or v in keep_ids
            ]
            good_ids.sort(key=lambda x: self.num_docs if x in keep_ids else self.dfs.get(x, 0), reverse=True)
        else:
            good_ids = [
                v for v in self.token2id.values()
                if no_below <= self.dfs.get(v, 0) <= no_above_abs
            ]
            good_ids.sort(key=self.dfs.get, reverse=True)
        if keep_n is not None:
            good_ids = good_ids[:keep_n]
        bad_words = [(self[idx], self.dfs.get(idx, 0)) for idx in set(self).difference(good_ids)]
        logger.info("discarding %i tokens: %s...", len(self) - len(good_ids), bad_words[:10])
        logger.info(
            "keeping %i tokens which were in no less than %i and no more than %i (=%.1f%%) documents",
            len(good_ids), no_below, no_above_abs, 100.0 * no_above
        )

        # do the actual filtering, then rebuild dictionary to remove gaps in ids
        self.filter_tokens(good_ids=good_ids)
        logger.info("resulting dictionary: %s", self)

    def filter_n_most_frequent(self, remove_n):
        """Filter out the `remove_n` most frequent tokens that appear in the documents.

        Parameters
        ----------
        remove_n : int
            Number of the most frequent tokens that will be removed.

        Examples
        --------
        .. sourcecode:: pycon

            >>> from gensim.corpora import Dictionary
            >>>
            >>> corpus = [["máma", "mele", "maso"], ["ema", "má", "máma"]]
            >>> dct = Dictionary(corpus)
            >>> len(dct)
            5
            >>> dct.filter_n_most_frequent(2)
            >>> len(dct)
            3

        """
        # determine which tokens to keep
        most_frequent_ids = (v for v in self.token2id.values())
        most_frequent_ids = sorted(most_frequent_ids, key=self.dfs.get, reverse=True)
        most_frequent_ids = most_frequent_ids[:remove_n]
        # do the actual filtering, then rebuild dictionary to remove gaps in ids
        most_frequent_words = [(self[idx], self.dfs.get(idx, 0)) for idx in most_frequent_ids]
        logger.info("discarding %i tokens: %s...", len(most_frequent_ids), most_frequent_words[:10])

        self.filter_tokens(bad_ids=most_frequent_ids)
        logger.info("resulting dictionary: %s", self)

    def filter_tokens(self, bad_ids=None, good_ids=None):
        """Remove the selected `bad_ids` tokens from :class:`~gensim.corpora.dictionary.Dictionary`.

        Alternatively, keep selected `good_ids` in :class:`~gensim.corpora.dictionary.Dictionary` and remove the rest.

        Parameters
        ----------
        bad_ids : iterable of int, optional
            Collection of word ids to be removed.
        good_ids : collection of int, optional
            Keep selected collection of word ids and remove the rest.

        Examples
        --------
        .. sourcecode:: pycon

            >>> from gensim.corpora import Dictionary
            >>>
            >>> corpus = [["máma", "mele", "maso"], ["ema", "má", "máma"]]
            >>> dct = Dictionary(corpus)
            >>> 'ema' in dct.token2id
            True
            >>> dct.filter_tokens(bad_ids=[dct.token2id['ema']])
            >>> 'ema' in dct.token2id
            False
            >>> len(dct)
            4
            >>> dct.filter_tokens(good_ids=[dct.token2id['maso']])
            >>> len(dct)
            1

        """
        if bad_ids is not None:
            bad_ids = set(bad_ids)
            self.token2id = {token: tokenid for token, tokenid in self.token2id.items() if tokenid not in bad_ids}
            self.cfs = {tokenid: freq for tokenid, freq in self.cfs.items() if tokenid not in bad_ids}
            self.dfs = {tokenid: freq for tokenid, freq in self.dfs.items() if tokenid not in bad_ids}
        if good_ids is not None:
            good_ids = set(good_ids)
            self.token2id = {token: tokenid for token, tokenid in self.token2id.items() if tokenid in good_ids}
            self.cfs = {tokenid: freq for tokenid, freq in self.cfs.items() if tokenid in good_ids}
            self.dfs = {tokenid: freq for tokenid, freq in self.dfs.items() if tokenid in good_ids}
        self.compactify()

    def compactify(self):
        """Assign new word ids to all words, shrinking any gaps."""
        logger.debug("rebuilding dictionary, shrinking gaps")

        # build mapping from old id -> new id
        idmap = dict(zip(sorted(self.token2id.values()), range(len(self.token2id))))

        # reassign mappings to new ids
        self.token2id = {token: idmap[tokenid] for token, tokenid in self.token2id.items()}
        self.id2token = {}
        self.dfs = {idmap[tokenid]: freq for tokenid, freq in self.dfs.items()}
        self.cfs = {idmap[tokenid]: freq for tokenid, freq in self.cfs.items()}

    def save_as_text(self, fname, sort_by_word=True):
        """Save :class:`~gensim.corpora.dictionary.Dictionary` to a text file.

        Parameters
        ----------
        fname : str
            Path to output file.
        sort_by_word : bool, optional
            Sort words in lexicographical order before writing them out?

        Notes
        -----
        Format::

            num_docs
            id_1[TAB]word_1[TAB]document_frequency_1[NEWLINE]
            id_2[TAB]word_2[TAB]document_frequency_2[NEWLINE]
            ....
            id_k[TAB]word_k[TAB]document_frequency_k[NEWLINE]

        This text format is great for corpus inspection and debugging. As plaintext, it's also easily portable
        to other tools and frameworks. For better performance and to store the entire object state,
        including collected corpus statistics, use :meth:`~gensim.corpora.dictionary.Dictionary.save` and
        :meth:`~gensim.corpora.dictionary.Dictionary.load` instead.

        See Also
        --------
        :meth:`~gensim.corpora.dictionary.Dictionary.load_from_text`
            Load :class:`~gensim.corpora.dictionary.Dictionary` from text file.

        Examples
        --------
        .. sourcecode:: pycon

            >>> from gensim.corpora import Dictionary
            >>> from gensim.test.utils import get_tmpfile
            >>>
            >>> tmp_fname = get_tmpfile("dictionary")
            >>> corpus = [["máma", "mele", "maso"], ["ema", "má", "máma"]]
            >>>
            >>> dct = Dictionary(corpus)
            >>> dct.save_as_text(tmp_fname)
            >>>
            >>> loaded_dct = Dictionary.load_from_text(tmp_fname)
            >>> assert dct.token2id == loaded_dct.token2id

        """
        logger.info("saving dictionary mapping to %s", fname)
        with utils.open(fname, 'wb') as fout:
            numdocs_line = "%d\n" % self.num_docs
            fout.write(utils.to_utf8(numdocs_line))
            if sort_by_word:
                for token, tokenid in sorted(self.token2id.items()):
                    line = "%i\t%s\t%i\n" % (tokenid, token, self.dfs.get(tokenid, 0))
                    fout.write(utils.to_utf8(line))
            else:
                for tokenid, freq in sorted(self.dfs.items(), key=lambda item: -item[1]):
                    line = "%i\t%s\t%i\n" % (tokenid, self[tokenid], freq)
                    fout.write(utils.to_utf8(line))

    def merge_with(self, other):
        """Merge another dictionary into this dictionary, mapping the same tokens to the same ids
        and new tokens to new ids.

        Notes
        -----
        The purpose is to merge two corpora created using two different dictionaries: `self` and `other`.
        `other` can be any id=>word mapping (a dict, a Dictionary object, ...).

        Return a transformation object which, when accessed as `result[doc_from_other_corpus]`, will convert documents
        from a corpus built using the `other` dictionary into a document using the new, merged dictionary.

        Parameters
        ----------
        other : {dict, :class:`~gensim.corpora.dictionary.Dictionary`}
            Other dictionary.

        Returns
        -------
        :class:`gensim.models.VocabTransform`
            Transformation object.

        Examples
        --------
        .. sourcecode:: pycon

            >>> from gensim.corpora import Dictionary
            >>>
            >>> corpus_1, corpus_2 = [["a", "b", "c"]], [["a", "f", "f"]]
            >>> dct_1, dct_2 = Dictionary(corpus_1), Dictionary(corpus_2)
            >>> dct_1.doc2bow(corpus_2[0])
            [(0, 1)]
            >>> transformer = dct_1.merge_with(dct_2)
            >>> dct_1.doc2bow(corpus_2[0])
            [(0, 1), (3, 2)]

        """
        old2new = {}
        for other_id, other_token in other.items():
            if other_token in self.token2id:
                new_id = self.token2id[other_token]
            else:
                new_id = len(self.token2id)
                self.token2id[other_token] = new_id
                self.dfs[new_id] = 0
            old2new[other_id] = new_id
            try:
                self.dfs[new_id] += other.dfs[other_id]
            except Exception:
                # `other` isn't a Dictionary (probably just a dict) => ignore dfs, keep going
                pass
        try:
            self.num_docs += other.num_docs
            self.num_nnz += other.num_nnz
            self.num_pos += other.num_pos
        except Exception:
            pass

        import gensim.models
        return gensim.models.VocabTransform(old2new)

    def patch_with_special_tokens(self, special_token_dict):
        """Patch token2id and id2token using a dictionary of special tokens.

        **Use case:** when doing sequence modeling (e.g. named entity recognition), one may want to specify
        special tokens that behave differently than others.
        One example is the "unknown" token, and another is the padding token.
        It is usual to set the padding token to have index `0`, and patching the dictionary with `{'<PAD>': 0}`
        would be one way to specify this.

        Parameters
        ----------
        special_token_dict : dict of (str, int)
            dict containing the special tokens as keys and their wanted indices as values.

        Examples
        --------
        .. sourcecode:: pycon

            >>> from gensim.corpora import Dictionary
            >>>
            >>> corpus = [["máma", "mele", "maso"], ["ema", "má", "máma"]]
            >>> dct = Dictionary(corpus)
            >>>
            >>> special_tokens = {'pad': 0, 'space': 1}
            >>> print(dct.token2id)
            {'maso': 0, 'mele': 1, 'máma': 2, 'ema': 3, 'má': 4}
            >>>
            >>> dct.patch_with_special_tokens(special_tokens)
            >>> print(dct.token2id)
            {'maso': 5, 'mele': 6, 'máma': 2, 'ema': 3, 'má': 4, 'pad': 0, 'space': 1}

        """
        possible_ids = []
        for token, idx in special_token_dict.items():
            if token in self.token2id and self.token2id[token] == idx:
                continue
            if token in self.token2id and self.token2id[token] != idx:
                possible_ids.append(self.token2id[token])
                del self.token2id[token]
            old_token = self[idx]
            self.token2id[token] = idx
            self.token2id[old_token] = possible_ids.pop() if \
                len(possible_ids) > 0 else len(self.token2id) - 1
        self.id2token = {}  # Make sure that id2token is updated according to special tokens.

    @staticmethod
    def load_from_text(fname):
        """Load a previously stored :class:`~gensim.corpora.dictionary.Dictionary` from a text file.

        Mirror function to :meth:`~gensim.corpora.dictionary.Dictionary.save_as_text`.

        Parameters
        ----------
        fname: str
            Path to a file produced by :meth:`~gensim.corpora.dictionary.Dictionary.save_as_text`.

        See Also
        --------
        :meth:`~gensim.corpora.dictionary.Dictionary.save_as_text`
            Save :class:`~gensim.corpora.dictionary.Dictionary` to text file.

        Examples
        --------
        .. sourcecode:: pycon

            >>> from gensim.corpora import Dictionary
            >>> from gensim.test.utils import get_tmpfile
            >>>
            >>> tmp_fname = get_tmpfile("dictionary")
            >>> corpus = [["máma", "mele", "maso"], ["ema", "má", "máma"]]
            >>>
            >>> dct = Dictionary(corpus)
            >>> dct.save_as_text(tmp_fname)
            >>>
            >>> loaded_dct = Dictionary.load_from_text(tmp_fname)
            >>> assert dct.token2id == loaded_dct.token2id

        """
        result = Dictionary()
        with utils.open(fname, 'rb') as f:
            for lineno, line in enumerate(f):
                line = utils.to_unicode(line)
                if lineno == 0:
                    if line.strip().isdigit():
                        # Older versions of save_as_text may not write num_docs on first line.
                        result.num_docs = int(line.strip())
                        continue
                    else:
                        logger.warning("Text does not contain num_docs on the first line.")
                try:
                    wordid, word, docfreq = line[:-1].split('\t')
                except Exception:
                    raise ValueError("invalid line in dictionary file %s: %s" % (fname, line.strip()))
                wordid = int(wordid)
                if word in result.token2id:
                    raise KeyError('token %s is defined as ID %d and as ID %d' % (word, wordid, result.token2id[word]))
                result.token2id[word] = wordid
                result.dfs[wordid] = int(docfreq)
        return result

    def most_common(self, n: Optional[int] = None) -> List[Tuple[str, int]]:
        """Return a list of the n most common words and their counts from the most common to the least.

        Words with equal counts are ordered in the increasing order of their ids.

        Parameters
        ----------
        n : int or None, optional
            The number of most common words to be returned. If `None`, all words in the dictionary
            will be returned. Default is `None`.

        Returns
        -------
        most_common : list of (str, int)
            The n most common words and their counts from the most common to the least.

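        Examples
        --------
        A minimal sketch (counts come from `cfs`, the collection frequencies):

        .. sourcecode:: pycon

            >>> from gensim.corpora import Dictionary
            >>>
            >>> dct = Dictionary([["a", "a", "b"], ["a", "c"]])
            >>> dct.most_common(2)
            [('a', 3), ('b', 1)]
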
711
+ """
712
+ most_common = [
713
+ (self[word], count)
714
+ for word, count
715
+ in sorted(self.cfs.items(), key=lambda x: (-x[1], x[0]))[:n]
716
+ ]
717
+ return most_common
718
+
    @staticmethod
    def from_corpus(corpus, id2word=None):
        """Create :class:`~gensim.corpora.dictionary.Dictionary` from an existing corpus.

        Parameters
        ----------
        corpus : iterable of iterable of (int, number)
            Corpus in BoW format.
        id2word : dict of (int, object)
            Mapping id -> word. If None, the mapping `id2word[word_id] = str(word_id)` will be used.

        Notes
        -----
        This can be useful if you only have a term-document BOW matrix (represented by `corpus`), but not the original
        text corpus. This method will scan the term-document count matrix for all word ids that appear in it,
        then construct :class:`~gensim.corpora.dictionary.Dictionary` which maps each `word_id -> id2word[word_id]`.
        `id2word` is an optional dictionary that maps the `word_id` to a token.
        In case `id2word` isn't specified the mapping `id2word[word_id] = str(word_id)` will be used.

        Returns
        -------
        :class:`~gensim.corpora.dictionary.Dictionary`
            Inferred dictionary from corpus.

        Examples
        --------
        .. sourcecode:: pycon

            >>> from gensim.corpora import Dictionary
            >>>
            >>> corpus = [[(1, 1.0)], [], [(0, 5.0), (2, 1.0)], []]
            >>> dct = Dictionary.from_corpus(corpus)
            >>> len(dct)
            3

        """
        result = Dictionary()
        max_id = -1
        for docno, document in enumerate(corpus):
            if docno % 10000 == 0:
                logger.info("adding document #%i to %s", docno, result)
            result.num_docs += 1
            result.num_nnz += len(document)
            for wordid, word_freq in document:
                max_id = max(wordid, max_id)
                result.num_pos += word_freq
                result.dfs[wordid] = result.dfs.get(wordid, 0) + 1

        if id2word is None:
            # make sure length(result) == get_max_id(corpus) + 1
            result.token2id = {str(i): i for i in range(max_id + 1)}
        else:
            # id=>word mapping given: simply copy it
            result.token2id = {utils.to_unicode(token): idx for idx, token in id2word.items()}
        for idx in result.token2id.values():
            # make sure all token ids have a valid `dfs` entry
            result.dfs[idx] = result.dfs.get(idx, 0)

        logger.info(
            "built %s from %i documents (total %i corpus positions)",
            result, result.num_docs, result.num_pos
        )
        return result