Create gensim/corpora/dictionary.py
gensim/corpora/dictionary.py
ADDED
@@ -0,0 +1,781 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright (C) 2010 Radim Rehurek <radimrehurek@seznam.cz>
# Licensed under the GNU LGPL v2.1 - https://www.gnu.org/licenses/old-licenses/lgpl-2.1.en.html

"""This module implements the concept of a Dictionary -- a mapping between words and their integer ids."""

from collections import defaultdict
from collections.abc import Mapping
import logging
import itertools
from typing import Optional, List, Tuple

from gensim import utils


logger = logging.getLogger(__name__)


class Dictionary(utils.SaveLoad, Mapping):
    """Dictionary encapsulates the mapping between normalized words and their integer ids.

    Notable instance attributes:

    Attributes
    ----------
    token2id : dict of (str, int)
        token -> token_id. I.e. the reverse mapping to `self[token_id]`.
    cfs : dict of (int, int)
        Collection frequencies: token_id -> how many instances of this token are contained in the documents.
    dfs : dict of (int, int)
        Document frequencies: token_id -> how many documents contain this token.
    num_docs : int
        Number of documents processed.
    num_pos : int
        Total number of corpus positions (number of processed words).
    num_nnz : int
        Total number of non-zeroes in the BOW matrix (sum of the number of unique
        words per document over the entire corpus).

    """
    def __init__(self, documents=None, prune_at=2000000):
        """

        Parameters
        ----------
        documents : iterable of iterable of str, optional
            Documents to be used to initialize the mapping and collect corpus statistics.
        prune_at : int, optional
            Dictionary will try to keep no more than `prune_at` words in its mapping, to limit its RAM
            footprint; the correctness is not guaranteed.
            Use :meth:`~gensim.corpora.dictionary.Dictionary.filter_extremes` to perform proper filtering.

        Examples
        --------
        .. sourcecode:: pycon

            >>> from gensim.corpora import Dictionary
            >>>
            >>> texts = [['human', 'interface', 'computer']]
            >>> dct = Dictionary(texts)  # initialize a Dictionary
            >>> dct.add_documents([["cat", "say", "meow"], ["dog"]])  # add more documents (extend the vocabulary)
            >>> dct.doc2bow(["dog", "computer", "non_existent_word"])
            [(0, 1), (6, 1)]

        """
        self.token2id = {}
        self.id2token = {}
        self.cfs = {}
        self.dfs = {}

        self.num_docs = 0
        self.num_pos = 0
        self.num_nnz = 0

        if documents is not None:
            self.add_documents(documents, prune_at=prune_at)
            self.add_lifecycle_event(
                "created",
                msg=f"built {self} from {self.num_docs} documents (total {self.num_pos} corpus positions)",
            )
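    # Illustrative sketch (not in the original module), using only the public
    # attributes documented above: the corpus statistics are plain ints/dicts,
    # inspectable right after construction.
    #
    #     dct = Dictionary([["a", "b"], ["a", "a", "c"]])
    #     dct.num_docs  # 2 documents processed
    #     dct.num_pos   # 5 corpus positions (tokens, with repeats)
    #     dct.num_nnz   # 4 unique token/document pairs
    #     dct.cfs       # {0: 3, 1: 1, 2: 1} -- collection frequencies by id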

    def __getitem__(self, tokenid):
        """Get the string token that corresponds to `tokenid`.

        Parameters
        ----------
        tokenid : int
            Id of token.

        Returns
        -------
        str
            Token corresponding to `tokenid`.

        Raises
        ------
        KeyError
            If this Dictionary doesn't contain such `tokenid`.

        """
        if len(self.id2token) != len(self.token2id):
            # the word->id mapping has changed (presumably via add_documents);
            # recompute id->word accordingly
            self.id2token = utils.revdict(self.token2id)
        return self.id2token[tokenid]  # will throw for non-existent ids
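    # Illustrative sketch (not in the original module): `id2token` is rebuilt
    # lazily, so the first lookup after an update pays the `utils.revdict`
    # cost, and later lookups are plain dict accesses.
    #
    #     dct = Dictionary([["human", "interface"]])
    #     dct.id2token   # {} -- not materialized yet
    #     dct[0]         # 'human' (triggers the rebuild above)
    #     dct.id2token   # {0: 'human', 1: 'interface'}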

    def __iter__(self):
        """Iterate over all tokens."""
        return iter(self.keys())

    # restore Py2-style dict API
    iterkeys = __iter__

    def iteritems(self):
        return self.items()

    def itervalues(self):
        return self.values()

    def keys(self):
        """Get all stored ids.

        Returns
        -------
        list of int
            List of all token ids.

        """
        return list(self.token2id.values())

    def __len__(self):
        """Get number of stored tokens.

        Returns
        -------
        int
            Number of stored tokens.

        """
        return len(self.token2id)

    def __str__(self):
        some_keys = list(itertools.islice(self.token2id.keys(), 5))
        return "%s<%i unique tokens: %s%s>" % (
            self.__class__.__name__, len(self), some_keys, '...' if len(self) > 5 else ''
        )

    @staticmethod
    def from_documents(documents):
        """Create :class:`~gensim.corpora.dictionary.Dictionary` from `documents`.

        Equivalent to `Dictionary(documents=documents)`.

        Parameters
        ----------
        documents : iterable of iterable of str
            Input corpus.

        Returns
        -------
        :class:`~gensim.corpora.dictionary.Dictionary`
            Dictionary initialized from `documents`.

        """
        return Dictionary(documents=documents)

    def add_documents(self, documents, prune_at=2000000):
        """Update dictionary from a collection of `documents`.

        Parameters
        ----------
        documents : iterable of iterable of str
            Input corpus. All tokens should be already **tokenized and normalized**.
        prune_at : int, optional
            Dictionary will try to keep no more than `prune_at` words in its mapping, to limit its RAM
            footprint; the correctness is not guaranteed.
            Use :meth:`~gensim.corpora.dictionary.Dictionary.filter_extremes` to perform proper filtering.

        Examples
        --------
        .. sourcecode:: pycon

            >>> from gensim.corpora import Dictionary
            >>>
            >>> corpus = ["máma mele maso".split(), "ema má máma".split()]
            >>> dct = Dictionary(corpus)
            >>> len(dct)
            5
            >>> dct.add_documents([["this", "is", "sparta"], ["just", "joking"]])
            >>> len(dct)
            10

        """
        for docno, document in enumerate(documents):
            # log progress & run a regular check for pruning, once every 10k docs
            if docno % 10000 == 0:
                if prune_at is not None and len(self) > prune_at:
                    self.filter_extremes(no_below=0, no_above=1.0, keep_n=prune_at)
                logger.info("adding document #%i to %s", docno, self)

            # update Dictionary with the document
            self.doc2bow(document, allow_update=True)  # ignore the result, here we only care about updating token ids

        logger.info("built %s from %i documents (total %i corpus positions)", self, self.num_docs, self.num_pos)
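    # Illustrative sketch (not in the original module): `documents` only needs
    # to be an iterable of token lists, so a generator can stream a corpus
    # through `add_documents` without loading it into RAM. The file path and
    # its one-document-per-line layout are assumptions of this example.
    #
    #     def stream_corpus(path="corpus.txt"):
    #         with open(path, encoding="utf-8") as fin:
    #             for line in fin:
    #                 yield line.lower().split()
    #
    #     dct = Dictionary()
    #     dct.add_documents(stream_corpus())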

    def doc2bow(self, document, allow_update=False, return_missing=False):
        """Convert `document` into the bag-of-words (BoW) format = list of `(token_id, token_count)` tuples.

        Parameters
        ----------
        document : list of str
            Input document.
        allow_update : bool, optional
            Update self, by adding new tokens from `document` and updating internal corpus statistics.
        return_missing : bool, optional
            Return missing tokens (tokens present in `document` but not in self) with frequencies?

        Returns
        -------
        list of (int, int)
            BoW representation of `document`.
        list of (int, int), dict of (str, int)
            If `return_missing` is True, return BoW representation of `document` + dictionary with missing
            tokens and their frequencies.

        Examples
        --------
        .. sourcecode:: pycon

            >>> from gensim.corpora import Dictionary
            >>> dct = Dictionary(["máma mele maso".split(), "ema má máma".split()])
            >>> dct.doc2bow(["this", "is", "máma"])
            [(2, 1)]
            >>> dct.doc2bow(["this", "is", "máma"], return_missing=True)
            ([(2, 1)], {'this': 1, 'is': 1})

        """
        if isinstance(document, str):
            raise TypeError("doc2bow expects an array of unicode tokens on input, not a single string")

        # Construct (word, frequency) mapping.
        counter = defaultdict(int)
        for w in document:
            counter[w if isinstance(w, str) else str(w, 'utf-8')] += 1

        token2id = self.token2id
        if allow_update or return_missing:
            missing = sorted(x for x in counter.items() if x[0] not in token2id)
            if allow_update:
                for w, _ in missing:
                    # new id = number of ids made so far;
                    # NOTE this assumes there are no gaps in the id sequence!
                    token2id[w] = len(token2id)
        result = {token2id[w]: freq for w, freq in counter.items() if w in token2id}

        if allow_update:
            self.num_docs += 1
            self.num_pos += sum(counter.values())
            self.num_nnz += len(result)
            # keep track of document and collection frequencies
            for tokenid, freq in result.items():
                self.cfs[tokenid] = self.cfs.get(tokenid, 0) + freq
                self.dfs[tokenid] = self.dfs.get(tokenid, 0) + 1

        # return tokenids, in ascending id order
        result = sorted(result.items())
        if return_missing:
            return result, dict(missing)
        else:
            return result
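    # Illustrative sketch (not in the original module): with `allow_update=True`,
    # tokens unseen so far receive fresh ids and `dfs`/`cfs` are updated;
    # `return_missing=True` still reports them as they were at call time.
    #
    #     dct = Dictionary([["máma", "mele", "maso"]])  # ids: maso=0, mele=1, máma=2
    #     dct.doc2bow(["máma", "nové", "nové"], allow_update=True, return_missing=True)
    #     # ([(2, 1), (3, 2)], {'nové': 2}) -- 'nové' was new and got id 3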

    def doc2idx(self, document, unknown_word_index=-1):
        """Convert `document` (a list of words) into a list of indexes = list of `token_id`.
        Replace all unknown words, i.e. words not in the dictionary, with the index as set via `unknown_word_index`.

        Parameters
        ----------
        document : list of str
            Input document.
        unknown_word_index : int, optional
            Index to use for words not in the dictionary.

        Returns
        -------
        list of int
            Token ids for tokens in `document`, in the same order.

        Examples
        --------
        .. sourcecode:: pycon

            >>> from gensim.corpora import Dictionary
            >>>
            >>> corpus = [["a", "a", "b"], ["a", "c"]]
            >>> dct = Dictionary(corpus)
            >>> dct.doc2idx(["a", "a", "c", "not_in_dictionary", "c"])
            [0, 0, 2, -1, 2]

        """
        if isinstance(document, str):
            raise TypeError("doc2idx expects an array of unicode tokens on input, not a single string")

        document = [word if isinstance(word, str) else str(word, 'utf-8') for word in document]
        return [self.token2id.get(word, unknown_word_index) for word in document]

    def filter_extremes(self, no_below=5, no_above=0.5, keep_n=100000, keep_tokens=None):
        """Filter out tokens in the dictionary by their frequency.

        Parameters
        ----------
        no_below : int, optional
            Keep tokens which are contained in at least `no_below` documents.
        no_above : float, optional
            Keep tokens which are contained in no more than `no_above` documents
            (fraction of total corpus size, not an absolute number).
        keep_n : int, optional
            Keep only the first `keep_n` most frequent tokens.
        keep_tokens : iterable of str
            Iterable of tokens that **must** stay in dictionary after filtering.

        Notes
        -----
        This removes all tokens in the dictionary that are:

        #. Less frequent than `no_below` documents (absolute number, e.g. `5`) or
        #. More frequent than `no_above` documents (fraction of the total corpus size, e.g. `0.3`).
        #. After (1) and (2), keep only the first `keep_n` most frequent tokens (or keep all if `keep_n=None`).

        After the pruning, resulting gaps in word ids are shrunk.
        Due to this gap shrinking, **the same word may have a different word id before and after the call
        to this function!** See :class:`gensim.models.VocabTransform` and the
        `dedicated FAQ entry <https://github.com/RaRe-Technologies/gensim/wiki/Recipes-&-FAQ#q8-how-can-i-filter-a-saved-corpus-and-its-corresponding-dictionary>`_ on how  # noqa
        to transform a corpus built with a dictionary before pruning.

        Examples
        --------
        .. sourcecode:: pycon

            >>> from gensim.corpora import Dictionary
            >>>
            >>> corpus = [["máma", "mele", "maso"], ["ema", "má", "máma"]]
            >>> dct = Dictionary(corpus)
            >>> len(dct)
            5
            >>> dct.filter_extremes(no_below=1, no_above=0.5, keep_n=1)
            >>> len(dct)
            1

        """
        no_above_abs = int(no_above * self.num_docs)  # convert fractional threshold to absolute threshold

        # determine which tokens to keep
        if keep_tokens:
            keep_ids = {self.token2id[v] for v in keep_tokens if v in self.token2id}
            good_ids = [
                v for v in self.token2id.values()
                if no_below <= self.dfs.get(v, 0) <= no_above_abs or v in keep_ids
            ]
            good_ids.sort(key=lambda x: self.num_docs if x in keep_ids else self.dfs.get(x, 0), reverse=True)
        else:
            good_ids = [
                v for v in self.token2id.values()
                if no_below <= self.dfs.get(v, 0) <= no_above_abs
            ]
            good_ids.sort(key=self.dfs.get, reverse=True)
        if keep_n is not None:
            good_ids = good_ids[:keep_n]
        bad_words = [(self[idx], self.dfs.get(idx, 0)) for idx in set(self).difference(good_ids)]
        logger.info("discarding %i tokens: %s...", len(self) - len(good_ids), bad_words[:10])
        logger.info(
            "keeping %i tokens which were in no less than %i and no more than %i (=%.1f%%) documents",
            len(good_ids), no_below, no_above_abs, 100.0 * no_above
        )

        # do the actual filtering, then rebuild dictionary to remove gaps in ids
        self.filter_tokens(good_ids=good_ids)
        logger.info("resulting dictionary: %s", self)
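    # Illustrative sketch (not in the original module): `keep_tokens` shields
    # the listed words from both frequency thresholds, and those words sort as
    # if maximally frequent, so the `keep_n` cutoff keeps them first. Ids are
    # remapped by the pruning either way.
    #
    #     dct = Dictionary(corpus)  # `corpus`: any iterable of token lists
    #     dct.filter_extremes(no_below=5, no_above=0.3, keep_tokens=["human"])
    #     assert "human" in dct.token2id  # kept even if rare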

    def filter_n_most_frequent(self, remove_n):
        """Filter out the `remove_n` most frequent tokens that appear in the documents.

        Parameters
        ----------
        remove_n : int
            Number of the most frequent tokens that will be removed.

        Examples
        --------
        .. sourcecode:: pycon

            >>> from gensim.corpora import Dictionary
            >>>
            >>> corpus = [["máma", "mele", "maso"], ["ema", "má", "máma"]]
            >>> dct = Dictionary(corpus)
            >>> len(dct)
            5
            >>> dct.filter_n_most_frequent(2)
            >>> len(dct)
            3

        """
        # determine which tokens to keep
        most_frequent_ids = (v for v in self.token2id.values())
        most_frequent_ids = sorted(most_frequent_ids, key=self.dfs.get, reverse=True)
        most_frequent_ids = most_frequent_ids[:remove_n]
        # do the actual filtering, then rebuild dictionary to remove gaps in ids
        most_frequent_words = [(self[idx], self.dfs.get(idx, 0)) for idx in most_frequent_ids]
        logger.info("discarding %i tokens: %s...", len(most_frequent_ids), most_frequent_words[:10])

        self.filter_tokens(bad_ids=most_frequent_ids)
        logger.info("resulting dictionary: %s", self)

    def filter_tokens(self, bad_ids=None, good_ids=None):
        """Remove the selected `bad_ids` tokens from :class:`~gensim.corpora.dictionary.Dictionary`.

        Alternatively, keep selected `good_ids` in :class:`~gensim.corpora.dictionary.Dictionary` and remove the rest.

        Parameters
        ----------
        bad_ids : iterable of int, optional
            Collection of word ids to be removed.
        good_ids : collection of int, optional
            Keep selected collection of word ids and remove the rest.

        Examples
        --------
        .. sourcecode:: pycon

            >>> from gensim.corpora import Dictionary
            >>>
            >>> corpus = [["máma", "mele", "maso"], ["ema", "má", "máma"]]
            >>> dct = Dictionary(corpus)
            >>> 'ema' in dct.token2id
            True
            >>> dct.filter_tokens(bad_ids=[dct.token2id['ema']])
            >>> 'ema' in dct.token2id
            False
            >>> len(dct)
            4
            >>> dct.filter_tokens(good_ids=[dct.token2id['maso']])
            >>> len(dct)
            1

        """
        if bad_ids is not None:
            bad_ids = set(bad_ids)
            self.token2id = {token: tokenid for token, tokenid in self.token2id.items() if tokenid not in bad_ids}
            self.cfs = {tokenid: freq for tokenid, freq in self.cfs.items() if tokenid not in bad_ids}
            self.dfs = {tokenid: freq for tokenid, freq in self.dfs.items() if tokenid not in bad_ids}
        if good_ids is not None:
            good_ids = set(good_ids)
            self.token2id = {token: tokenid for token, tokenid in self.token2id.items() if tokenid in good_ids}
            self.cfs = {tokenid: freq for tokenid, freq in self.cfs.items() if tokenid in good_ids}
            self.dfs = {tokenid: freq for tokenid, freq in self.dfs.items() if tokenid in good_ids}
        self.compactify()

    def compactify(self):
        """Assign new word ids to all words, shrinking any gaps."""
        logger.debug("rebuilding dictionary, shrinking gaps")

        # build mapping from old id -> new id
        idmap = dict(zip(sorted(self.token2id.values()), range(len(self.token2id))))

        # reassign mappings to new ids
        self.token2id = {token: idmap[tokenid] for token, tokenid in self.token2id.items()}
        self.id2token = {}
        self.dfs = {idmap[tokenid]: freq for tokenid, freq in self.dfs.items()}
        self.cfs = {idmap[tokenid]: freq for tokenid, freq in self.cfs.items()}
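    # Illustrative sketch (not in the original module): `filter_tokens` calls
    # `compactify` automatically, so surviving ids always end up contiguous.
    #
    #     dct = Dictionary([["a", "b", "c"]])             # ids 0, 1, 2
    #     dct.filter_tokens(bad_ids=[dct.token2id["b"]])
    #     sorted(dct.token2id.values())                   # [0, 1] -- gap closed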

    def save_as_text(self, fname, sort_by_word=True):
        """Save :class:`~gensim.corpora.dictionary.Dictionary` to a text file.

        Parameters
        ----------
        fname : str
            Path to output file.
        sort_by_word : bool, optional
            Sort words in lexicographical order before writing them out?

        Notes
        -----
        Format::

            num_docs
            id_1[TAB]word_1[TAB]document_frequency_1[NEWLINE]
            id_2[TAB]word_2[TAB]document_frequency_2[NEWLINE]
            ....
            id_k[TAB]word_k[TAB]document_frequency_k[NEWLINE]

        This text format is great for corpus inspection and debugging. As plaintext, it's also easily portable
        to other tools and frameworks. For better performance and to store the entire object state,
        including collected corpus statistics, use :meth:`~gensim.corpora.dictionary.Dictionary.save` and
        :meth:`~gensim.corpora.dictionary.Dictionary.load` instead.

        See Also
        --------
        :meth:`~gensim.corpora.dictionary.Dictionary.load_from_text`
            Load :class:`~gensim.corpora.dictionary.Dictionary` from text file.

        Examples
        --------
        .. sourcecode:: pycon

            >>> from gensim.corpora import Dictionary
            >>> from gensim.test.utils import get_tmpfile
            >>>
            >>> tmp_fname = get_tmpfile("dictionary")
            >>> corpus = [["máma", "mele", "maso"], ["ema", "má", "máma"]]
            >>>
            >>> dct = Dictionary(corpus)
            >>> dct.save_as_text(tmp_fname)
            >>>
            >>> loaded_dct = Dictionary.load_from_text(tmp_fname)
            >>> assert dct.token2id == loaded_dct.token2id

        """
        logger.info("saving dictionary mapping to %s", fname)
        with utils.open(fname, 'wb') as fout:
            numdocs_line = "%d\n" % self.num_docs
            fout.write(utils.to_utf8(numdocs_line))
            if sort_by_word:
                for token, tokenid in sorted(self.token2id.items()):
                    line = "%i\t%s\t%i\n" % (tokenid, token, self.dfs.get(tokenid, 0))
                    fout.write(utils.to_utf8(line))
            else:
                for tokenid, freq in sorted(self.dfs.items(), key=lambda item: -item[1]):
                    line = "%i\t%s\t%i\n" % (tokenid, self[tokenid], freq)
                    fout.write(utils.to_utf8(line))

    def merge_with(self, other):
        """Merge another dictionary into this dictionary, mapping the same tokens to the same ids
        and new tokens to new ids.

        Notes
        -----
        The purpose is to merge two corpora created using two different dictionaries: `self` and `other`.
        `other` can be any id=>word mapping (a dict, a Dictionary object, ...).

        Return a transformation object which, when accessed as `result[doc_from_other_corpus]`, will convert documents
        from a corpus built using the `other` dictionary into a document using the new, merged dictionary.

        Parameters
        ----------
        other : {dict, :class:`~gensim.corpora.dictionary.Dictionary`}
            Other dictionary.

        Returns
        -------
        :class:`gensim.models.VocabTransform`
            Transformation object.

        Examples
        --------
        .. sourcecode:: pycon

            >>> from gensim.corpora import Dictionary
            >>>
            >>> corpus_1, corpus_2 = [["a", "b", "c"]], [["a", "f", "f"]]
            >>> dct_1, dct_2 = Dictionary(corpus_1), Dictionary(corpus_2)
            >>> dct_1.doc2bow(corpus_2[0])
            [(0, 1)]
            >>> transformer = dct_1.merge_with(dct_2)
            >>> dct_1.doc2bow(corpus_2[0])
            [(0, 1), (3, 2)]

        """
        old2new = {}
        for other_id, other_token in other.items():
            if other_token in self.token2id:
                new_id = self.token2id[other_token]
            else:
                new_id = len(self.token2id)
                self.token2id[other_token] = new_id
                self.dfs[new_id] = 0
            old2new[other_id] = new_id
            try:
                self.dfs[new_id] += other.dfs[other_id]
            except Exception:
                # `other` isn't a Dictionary (probably just a dict) => ignore dfs, keep going
                pass
        try:
            self.num_docs += other.num_docs
            self.num_nnz += other.num_nnz
            self.num_pos += other.num_pos
        except Exception:
            pass

        import gensim.models
        return gensim.models.VocabTransform(old2new)
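    # Illustrative sketch (not in the original module), continuing the example
    # above: the returned transformer remaps documents from `dct_2`'s id space
    # into the merged one, one BoW document at a time.
    #
    #     transformer = dct_1.merge_with(dct_2)
    #     doc = dct_2.doc2bow(corpus_2[0])  # ids valid only in dct_2
    #     transformer[doc]                  # same document, in dct_1's merged ids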

    def patch_with_special_tokens(self, special_token_dict):
        """Patch token2id and id2token using a dictionary of special tokens.

        **Use case:** when doing sequence modeling (e.g. named entity recognition), one may want to specify
        special tokens that behave differently than others.
        One example is the "unknown" token, and another is the padding token.
        It is usual to set the padding token to have index `0`, and patching the dictionary with `{'<PAD>': 0}`
        would be one way to specify this.

        Parameters
        ----------
        special_token_dict : dict of (str, int)
            dict containing the special tokens as keys and their wanted indices as values.

        Examples
        --------
        .. sourcecode:: pycon

            >>> from gensim.corpora import Dictionary
            >>>
            >>> corpus = [["máma", "mele", "maso"], ["ema", "má", "máma"]]
            >>> dct = Dictionary(corpus)
            >>>
            >>> special_tokens = {'pad': 0, 'space': 1}
            >>> print(dct.token2id)
            {'maso': 0, 'mele': 1, 'máma': 2, 'ema': 3, 'má': 4}
            >>>
            >>> dct.patch_with_special_tokens(special_tokens)
            >>> print(dct.token2id)
            {'maso': 6, 'mele': 7, 'máma': 2, 'ema': 3, 'má': 4, 'pad': 0, 'space': 1}

        """
        possible_ids = []
        for token, idx in special_token_dict.items():
            if token in self.token2id and self.token2id[token] == idx:
                continue
            if token in self.token2id and self.token2id[token] != idx:
                possible_ids.append(self.token2id[token])
                del self.token2id[token]
            old_token = self[idx]
            self.token2id[token] = idx
            self.token2id[old_token] = possible_ids.pop() if \
                len(possible_ids) > 0 else len(self.token2id) - 1
        self.id2token = {}  # Make sure that id2token is updated according to special tokens.

    @staticmethod
    def load_from_text(fname):
        """Load a previously stored :class:`~gensim.corpora.dictionary.Dictionary` from a text file.

        Mirror function to :meth:`~gensim.corpora.dictionary.Dictionary.save_as_text`.

        Parameters
        ----------
        fname: str
            Path to a file produced by :meth:`~gensim.corpora.dictionary.Dictionary.save_as_text`.

        See Also
        --------
        :meth:`~gensim.corpora.dictionary.Dictionary.save_as_text`
            Save :class:`~gensim.corpora.dictionary.Dictionary` to text file.

        Examples
        --------
        .. sourcecode:: pycon

            >>> from gensim.corpora import Dictionary
            >>> from gensim.test.utils import get_tmpfile
            >>>
            >>> tmp_fname = get_tmpfile("dictionary")
            >>> corpus = [["máma", "mele", "maso"], ["ema", "má", "máma"]]
            >>>
            >>> dct = Dictionary(corpus)
            >>> dct.save_as_text(tmp_fname)
            >>>
            >>> loaded_dct = Dictionary.load_from_text(tmp_fname)
            >>> assert dct.token2id == loaded_dct.token2id

        """
        result = Dictionary()
        with utils.open(fname, 'rb') as f:
            for lineno, line in enumerate(f):
                line = utils.to_unicode(line)
                if lineno == 0:
                    if line.strip().isdigit():
                        # Older versions of save_as_text may not write num_docs on first line.
                        result.num_docs = int(line.strip())
                        continue
                    else:
                        logging.warning("Text does not contain num_docs on the first line.")
                try:
                    wordid, word, docfreq = line[:-1].split('\t')
                except Exception:
                    raise ValueError("invalid line in dictionary file %s: %s"
                                     % (fname, line.strip()))
                wordid = int(wordid)
                if word in result.token2id:
                    raise KeyError('token %s is defined as ID %d and as ID %d' % (word, wordid, result.token2id[word]))
                result.token2id[word] = wordid
                result.dfs[wordid] = int(docfreq)
        return result

    def most_common(self, n: Optional[int] = None) -> List[Tuple[str, int]]:
        """Return a list of the n most common words and their counts from the most common to the least.

        Words with equal counts are ordered in the increasing order of their ids.

        Parameters
        ----------
        n : int or None, optional
            The number of most common words to be returned. If `None`, all words in the dictionary
            will be returned. Default is `None`.

        Returns
        -------
        most_common : list of (str, int)
            The n most common words and their counts from the most common to the least.

        """
        most_common = [
            (self[word], count)
            for word, count
            in sorted(self.cfs.items(), key=lambda x: (-x[1], x[0]))[:n]
        ]
        return most_common
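    # Illustrative sketch (not in the original module): `most_common` mirrors
    # `collections.Counter.most_common`, reading the collection frequencies
    # accumulated in `self.cfs`.
    #
    #     dct = Dictionary([["máma", "mele", "maso"], ["ema", "má", "máma"]])
    #     dct.most_common(2)
    #     # [('máma', 2), ('maso', 1)] -- ties broken by ascending id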

    @staticmethod
    def from_corpus(corpus, id2word=None):
        """Create :class:`~gensim.corpora.dictionary.Dictionary` from an existing corpus.

        Parameters
        ----------
        corpus : iterable of iterable of (int, number)
            Corpus in BoW format.
        id2word : dict of (int, object)
            Mapping id -> word. If None, the mapping `id2word[word_id] = str(word_id)` will be used.

        Notes
        -----
        This can be useful if you only have a term-document BOW matrix (represented by `corpus`), but not the original
        text corpus. This method will scan the term-document count matrix for all word ids that appear in it,
        then construct :class:`~gensim.corpora.dictionary.Dictionary` which maps each `word_id -> id2word[word_id]`.
        `id2word` is an optional dictionary that maps the `word_id` to a token.
        In case `id2word` isn't specified the mapping `id2word[word_id] = str(word_id)` will be used.

        Returns
        -------
        :class:`~gensim.corpora.dictionary.Dictionary`
            Inferred dictionary from corpus.

        Examples
        --------
        .. sourcecode:: pycon

            >>> from gensim.corpora import Dictionary
            >>>
            >>> corpus = [[(1, 1.0)], [], [(0, 5.0), (2, 1.0)], []]
            >>> dct = Dictionary.from_corpus(corpus)
            >>> len(dct)
            3

        """
        result = Dictionary()
        max_id = -1
        for docno, document in enumerate(corpus):
            if docno % 10000 == 0:
                logger.info("adding document #%i to %s", docno, result)
            result.num_docs += 1
            result.num_nnz += len(document)
            for wordid, word_freq in document:
                max_id = max(wordid, max_id)
                result.num_pos += word_freq
                result.dfs[wordid] = result.dfs.get(wordid, 0) + 1

        if id2word is None:
            # make sure len(result) == get_max_id(corpus) + 1
            result.token2id = {str(i): i for i in range(max_id + 1)}
        else:
            # id=>word mapping given: simply copy it
            result.token2id = {utils.to_unicode(token): idx for idx, token in id2word.items()}
        for idx in result.token2id.values():
            # make sure all token ids have a valid `dfs` entry
            result.dfs[idx] = result.dfs.get(idx, 0)

        logger.info(
            "built %s from %i documents (total %i corpus positions)",
            result, result.num_docs, result.num_pos
        )
        return result
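
# Illustrative usage sketch, not part of the original module: a persistence
# round trip through the binary API inherited from `utils.SaveLoad` and the
# plaintext format implemented above. The /tmp paths are hypothetical.
if __name__ == '__main__':
    corpus = [["human", "interface", "computer"], ["survey", "user", "computer"]]
    dct = Dictionary(corpus)

    # Binary persistence: save()/load() come from utils.SaveLoad and store the
    # full object state, including `cfs` and the corpus statistics.
    dct.save('/tmp/example.dict')
    same_dct = Dictionary.load('/tmp/example.dict')
    assert same_dct.token2id == dct.token2id

    # Plaintext persistence: portable and easy to inspect, but only `dfs` and
    # `num_docs` survive the round trip (see `save_as_text` above).
    dct.save_as_text('/tmp/example_dict.txt')
    text_dct = Dictionary.load_from_text('/tmp/example_dict.txt')
    assert text_dct.token2id == dct.token2id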