|
from __future__ import absolute_import, unicode_literals |
|
|
|
__version__ = '0.42.1' |
|
__license__ = 'MIT' |
|
|
|
import marshal |
|
import re |
|
import tempfile |
|
import threading |
|
import time |
|
from hashlib import md5 |
|
from math import log |
|
|
|
from . import finalseg |
|
from ._compat import * |
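
# The wildcard import above also provides os, sys and logging, plus the
# Python 2/3 helpers used throughout this module (xrange, iteritems, strdecode,
# text_type, string_types, resolve_filename, get_module_res,
# check_paddle_install), which is why they appear below without explicit imports.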
|
|
|
if os.name == 'nt': |
|
from shutil import move as _replace_file |
|
else: |
|
_replace_file = os.rename |
|
|
|
_get_abs_path = lambda path: os.path.normpath(os.path.join(os.getcwd(), path)) |
|
|
|
DEFAULT_DICT = None |
|
DEFAULT_DICT_NAME = "dict.txt" |
|
|
|
log_console = logging.StreamHandler(sys.stderr) |
|
default_logger = logging.getLogger(__name__) |
|
default_logger.setLevel(logging.DEBUG) |
|
default_logger.addHandler(log_console) |
|
|
|
DICT_WRITING = {} |
|
|
|
pool = None |
|
|
|
re_userdict = re.compile('^(.+?)( [0-9]+)?( [a-z]+)?$', re.U) |
|
|
|
re_eng = re.compile('[a-zA-Z0-9]', re.U) |
|
|
|
|
|
|
|
|
|
|
|
re_han_default = re.compile("([\u4E00-\u9FD5a-zA-Z0-9+#&\\._%\\-]+)", re.U)
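# re_han_default matches maximal runs of "word" characters: CJK ideographs
# (U+4E00..U+9FD5) plus ASCII letters, digits and a few joining symbols
# (+ # & . _ % -); re_skip_default matches the whitespace between such runs.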
|
|
|
re_skip_default = re.compile("(\r\n|\\s)", re.U)
|
|
|
|
|
def setLogLevel(log_level): |
|
default_logger.setLevel(log_level) |
|
|
|
|
|
class Tokenizer(object): |
|
|
|
def __init__(self, dictionary=DEFAULT_DICT): |
|
self.lock = threading.RLock() |
|
if dictionary == DEFAULT_DICT: |
|
self.dictionary = dictionary |
|
else: |
|
self.dictionary = _get_abs_path(dictionary) |
|
self.FREQ = {} |
|
self.total = 0 |
|
self.user_word_tag_tab = {} |
|
self.initialized = False |
|
self.tmp_dir = None |
|
self.cache_file = None |
|
|
|
def __repr__(self): |
|
return '<Tokenizer dictionary=%r>' % self.dictionary |
|
|
|
@staticmethod |
|
def gen_pfdict(f): |
|
lfreq = {} |
|
ltotal = 0 |
|
f_name = resolve_filename(f) |
|
for lineno, line in enumerate(f, 1): |
|
try: |
|
line = line.strip().decode('utf-8') |
|
word, freq = line.split(' ')[:2] |
|
freq = int(freq) |
|
lfreq[word] = freq |
|
ltotal += freq |
|
for ch in xrange(len(word)): |
|
wfrag = word[:ch + 1] |
|
if wfrag not in lfreq: |
|
lfreq[wfrag] = 0 |
|
except ValueError: |
|
raise ValueError( |
|
'invalid dictionary entry in %s at Line %s: %s' % (f_name, lineno, line)) |
|
f.close() |
|
return lfreq, ltotal |
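
    # gen_pfdict expects one "word freq [pos_tag]" entry per line, e.g.
    # "清华大学 2053 nt" (the frequency here is illustrative; any trailing tag
    # is ignored by this method).  Every prefix of every word is also present
    # in the map, with frequency 0 unless it is itself a word, so that
    # get_DAG() can stop scanning as soon as a fragment is no longer a known
    # prefix.  A minimal sketch of the result for a one-word dictionary file:
    #
    #   lfreq, ltotal = Tokenizer.gen_pfdict(open('mini.txt', 'rb'))  # hypothetical file
    #   # lfreq == {'清': 0, '清华': 0, '清华大': 0, '清华大学': 2053}; ltotal == 2053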
|
|
|
def initialize(self, dictionary=None): |
|
if dictionary: |
|
abs_path = _get_abs_path(dictionary) |
|
if self.dictionary == abs_path and self.initialized: |
|
return |
|
else: |
|
self.dictionary = abs_path |
|
self.initialized = False |
|
else: |
|
abs_path = self.dictionary |
|
|
|
with self.lock: |
|
try: |
|
with DICT_WRITING[abs_path]: |
|
pass |
|
except KeyError: |
|
pass |
|
if self.initialized: |
|
return |
|
|
|
default_logger.debug("Building prefix dict from %s ..." % (abs_path or 'the default dictionary')) |
|
t1 = time.time() |
|
if self.cache_file: |
|
cache_file = self.cache_file |
|
|
|
elif abs_path == DEFAULT_DICT: |
|
cache_file = "jieba.cache" |
|
|
|
else: |
|
cache_file = "jieba.u%s.cache" % md5( |
|
abs_path.encode('utf-8', 'replace')).hexdigest() |
|
cache_file = os.path.join( |
|
self.tmp_dir or tempfile.gettempdir(), cache_file) |
|
|
|
tmpdir = os.path.dirname(cache_file) |
|
|
|
load_from_cache_fail = True |
|
if os.path.isfile(cache_file) and (abs_path == DEFAULT_DICT or |
|
os.path.getmtime(cache_file) > os.path.getmtime(abs_path)): |
|
default_logger.debug( |
|
"Loading model from cache %s" % cache_file) |
|
try: |
|
with open(cache_file, 'rb') as cf: |
|
self.FREQ, self.total = marshal.load(cf) |
|
load_from_cache_fail = False |
|
except Exception: |
|
load_from_cache_fail = True |
|
|
|
if load_from_cache_fail: |
|
wlock = DICT_WRITING.get(abs_path, threading.RLock()) |
|
DICT_WRITING[abs_path] = wlock |
|
with wlock: |
|
self.FREQ, self.total = self.gen_pfdict(self.get_dict_file()) |
|
default_logger.debug( |
|
"Dumping model to file cache %s" % cache_file) |
|
try: |
|
|
|
fd, fpath = tempfile.mkstemp(dir=tmpdir) |
|
with os.fdopen(fd, 'wb') as temp_cache_file: |
|
marshal.dump( |
|
(self.FREQ, self.total), temp_cache_file) |
|
_replace_file(fpath, cache_file) |
|
except Exception: |
|
default_logger.exception("Dump cache file failed.") |
|
|
|
try: |
|
del DICT_WRITING[abs_path] |
|
except KeyError: |
|
pass |
|
|
|
self.initialized = True |
|
default_logger.debug( |
|
"Loading model cost %.3f seconds." % (time.time() - t1)) |
|
default_logger.debug("Prefix dict has been built successfully.") |
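
    # The prefix dict is marshal-dumped to a cache file ("jieba.cache" for the
    # default dictionary, an md5-keyed name for custom ones) under
    # tempfile.gettempdir(), and reloaded on later runs unless the dictionary
    # file is newer than the cache.  A sketch of redirecting the cache,
    # assuming the directory exists and is writable:
    #
    #   import jieba
    #   jieba.dt.tmp_dir = '/tmp/jieba_cache'   # hypothetical directory
    #   jieba.initialize()                       # builds or loads the cache there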
|
|
|
def check_initialized(self): |
|
if not self.initialized: |
|
self.initialize() |
|
|
|
def calc(self, sentence, DAG, route): |
|
N = len(sentence) |
|
route[N] = (0, 0) |
|
logtotal = log(self.total) |
|
for idx in xrange(N - 1, -1, -1): |
|
route[idx] = max((log(self.FREQ.get(sentence[idx:x + 1]) or 1) - |
|
logtotal + route[x + 1][0], x) for x in DAG[idx]) |
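
    # calc() fills `route` right to left with the standard max-probability
    # dynamic programme: route[i] = (best log P(sentence[i:]), j), where j is
    # the end index of the first word on that best path; unknown fragments
    # fall back to a count of 1 before the log(total) normalisation.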
|
|
|
def get_DAG(self, sentence): |
|
self.check_initialized() |
|
DAG = {} |
|
N = len(sentence) |
|
for k in xrange(N): |
|
tmplist = [] |
|
i = k |
|
frag = sentence[k] |
|
while i < N and frag in self.FREQ: |
|
if self.FREQ[frag]: |
|
tmplist.append(i) |
|
i += 1 |
|
frag = sentence[k:i + 1] |
|
if not tmplist: |
|
tmplist.append(k) |
|
DAG[k] = tmplist |
|
return DAG |
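
    # The DAG maps each start position k to the list of end positions i for
    # which sentence[k:i+1] is a dictionary word (k itself is always kept as a
    # single-character fallback).  For illustration only, since the exact
    # lists depend on the loaded dictionary:
    #
    #   dt.get_DAG('去北京')   # -> {0: [0], 1: [1, 2], 2: [2]} if 北京 is a word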
|
|
|
def __cut_all(self, sentence): |
|
dag = self.get_DAG(sentence) |
|
old_j = -1 |
|
eng_scan = 0 |
|
eng_buf = u'' |
|
for k, L in iteritems(dag): |
|
if eng_scan == 1 and not re_eng.match(sentence[k]): |
|
eng_scan = 0 |
|
yield eng_buf |
|
if len(L) == 1 and k > old_j: |
|
word = sentence[k:L[0] + 1] |
|
if re_eng.match(word): |
|
if eng_scan == 0: |
|
eng_scan = 1 |
|
eng_buf = word |
|
else: |
|
eng_buf += word |
|
if eng_scan == 0: |
|
yield word |
|
old_j = L[0] |
|
else: |
|
for j in L: |
|
if j > k: |
|
yield sentence[k:j + 1] |
|
old_j = j |
|
if eng_scan == 1: |
|
yield eng_buf |
|
|
|
def __cut_DAG_NO_HMM(self, sentence): |
|
DAG = self.get_DAG(sentence) |
|
route = {} |
|
self.calc(sentence, DAG, route) |
|
x = 0 |
|
N = len(sentence) |
|
buf = '' |
|
while x < N: |
|
y = route[x][1] + 1 |
|
l_word = sentence[x:y] |
|
if re_eng.match(l_word) and len(l_word) == 1: |
|
buf += l_word |
|
x = y |
|
else: |
|
if buf: |
|
yield buf |
|
buf = '' |
|
yield l_word |
|
x = y |
|
if buf: |
|
yield buf |
|
buf = '' |
|
|
|
def __cut_DAG(self, sentence): |
|
DAG = self.get_DAG(sentence) |
|
route = {} |
|
self.calc(sentence, DAG, route) |
|
x = 0 |
|
buf = '' |
|
N = len(sentence) |
|
while x < N: |
|
y = route[x][1] + 1 |
|
l_word = sentence[x:y] |
|
if y - x == 1: |
|
buf += l_word |
|
else: |
|
if buf: |
|
if len(buf) == 1: |
|
yield buf |
|
buf = '' |
|
else: |
|
if not self.FREQ.get(buf): |
|
recognized = finalseg.cut(buf) |
|
for t in recognized: |
|
yield t |
|
else: |
|
for elem in buf: |
|
yield elem |
|
buf = '' |
|
yield l_word |
|
x = y |
|
|
|
if buf: |
|
if len(buf) == 1: |
|
yield buf |
|
elif not self.FREQ.get(buf): |
|
recognized = finalseg.cut(buf) |
|
for t in recognized: |
|
yield t |
|
else: |
|
for elem in buf: |
|
yield elem |
|
|
|
def cut(self, sentence, cut_all=False, HMM=True, use_paddle=False): |
|
""" |
|
        The main function that segments an entire sentence containing
        Chinese characters into separate words.

        Parameter:
            - sentence: The str(unicode) to be segmented.
            - cut_all: Model type. True for full pattern, False for accurate pattern.
            - HMM: Whether to use the Hidden Markov Model.
            - use_paddle: Whether to use paddle mode (only takes effect after a
              successful jieba.enable_paddle() call).
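
        Example (illustrative; the exact segmentation depends on the dictionary):

            import jieba
            print('/'.join(jieba.cut('我来到北京清华大学')))                 # accurate mode
            print('/'.join(jieba.cut('我来到北京清华大学', cut_all=True)))   # full mode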
|
""" |
|
is_paddle_installed = check_paddle_install['is_paddle_installed'] |
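        # use_paddle only takes effect if jieba.enable_paddle() has succeeded
        # beforehand (it checks that the paddle package can be imported and
        # flips this flag); otherwise the call silently falls back to the
        # DAG/HMM segmenter below.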
|
sentence = strdecode(sentence) |
|
if use_paddle and is_paddle_installed: |
|
|
|
if sentence is None or len(sentence) == 0: |
|
return |
|
import jieba.lac_small.predict as predict |
|
results = predict.get_sent(sentence) |
|
for sent in results: |
|
if sent is None: |
|
continue |
|
yield sent |
|
return |
|
re_han = re_han_default |
|
re_skip = re_skip_default |
|
if cut_all: |
|
cut_block = self.__cut_all |
|
elif HMM: |
|
cut_block = self.__cut_DAG |
|
else: |
|
cut_block = self.__cut_DAG_NO_HMM |
|
blocks = re_han.split(sentence) |
|
for blk in blocks: |
|
if not blk: |
|
continue |
|
if re_han.match(blk): |
|
for word in cut_block(blk): |
|
yield word |
|
else: |
|
tmp = re_skip.split(blk) |
|
for x in tmp: |
|
if re_skip.match(x): |
|
yield x |
|
elif not cut_all: |
|
for xx in x: |
|
yield xx |
|
else: |
|
yield x |
|
|
|
def cut_for_search(self, sentence, HMM=True): |
|
""" |
|
Finer segmentation for search engines. |
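
        Example (illustrative; output depends on the dictionary):

            import jieba
            print(', '.join(jieba.cut_for_search('小明硕士毕业于中国科学院计算所')))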
|
""" |
|
words = self.cut(sentence, HMM=HMM) |
|
for w in words: |
|
if len(w) > 2: |
|
for i in xrange(len(w) - 1): |
|
gram2 = w[i:i + 2] |
|
if self.FREQ.get(gram2): |
|
yield gram2 |
|
if len(w) > 3: |
|
for i in xrange(len(w) - 2): |
|
gram3 = w[i:i + 3] |
|
if self.FREQ.get(gram3): |
|
yield gram3 |
|
yield w |
|
|
|
def lcut(self, *args, **kwargs): |
|
return list(self.cut(*args, **kwargs)) |
|
|
|
def lcut_for_search(self, *args, **kwargs): |
|
return list(self.cut_for_search(*args, **kwargs)) |
|
|
|
_lcut = lcut |
|
_lcut_for_search = lcut_for_search |
|
|
|
def _lcut_no_hmm(self, sentence): |
|
return self.lcut(sentence, False, False) |
|
|
|
def _lcut_all(self, sentence): |
|
return self.lcut(sentence, True) |
|
|
|
def _lcut_for_search_no_hmm(self, sentence): |
|
return self.lcut_for_search(sentence, False) |
|
|
|
def get_dict_file(self): |
|
if self.dictionary == DEFAULT_DICT: |
|
return get_module_res(DEFAULT_DICT_NAME) |
|
else: |
|
return open(self.dictionary, 'rb') |
|
|
|
def load_userdict(self, f): |
|
''' |
|
        Load a personalized dictionary to improve the detection rate.

        Parameter:
            - f : A plain text file that contains words and their occurrences.
                  Can be a file-like object, or the path of the dictionary file,
                  whose encoding must be utf-8.

        Structure of the dict file:
        word1 freq1 word_type1
        word2 freq2 word_type2
        ...
        The frequency and the word type may be omitted.
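
        Example (a sketch; 'userdict.txt' is a hypothetical utf-8 file):

            # contents of userdict.txt:
            #   云计算 5
            #   创新办 3 i
            #   凱特琳 nz
            import jieba
            jieba.load_userdict('userdict.txt')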
|
''' |
|
self.check_initialized() |
|
if isinstance(f, string_types): |
|
f_name = f |
|
f = open(f, 'rb') |
|
else: |
|
f_name = resolve_filename(f) |
|
for lineno, ln in enumerate(f, 1): |
|
line = ln.strip() |
|
if not isinstance(line, text_type): |
|
try: |
|
line = line.decode('utf-8').lstrip('\ufeff') |
|
except UnicodeDecodeError: |
|
raise ValueError('dictionary file %s must be utf-8' % f_name) |
|
if not line: |
|
continue |
|
|
|
word, freq, tag = re_userdict.match(line).groups() |
|
if freq is not None: |
|
freq = freq.strip() |
|
if tag is not None: |
|
tag = tag.strip() |
|
self.add_word(word, freq, tag) |
|
|
|
def add_word(self, word, freq=None, tag=None): |
|
""" |
|
        Add a word to the dictionary.

        freq and tag can be omitted; freq defaults to a calculated value
        that ensures the word can be cut out.
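
        Example (illustrative frequency and tag values):

            import jieba
            jieba.add_word('石墨烯')                   # freq computed via suggest_freq
            jieba.add_word('台中', freq=69, tag='ns')  # explicit frequency and POS tag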
|
""" |
|
self.check_initialized() |
|
word = strdecode(word) |
|
freq = int(freq) if freq is not None else self.suggest_freq(word, False) |
|
self.FREQ[word] = freq |
|
self.total += freq |
|
if tag: |
|
self.user_word_tag_tab[word] = tag |
|
for ch in xrange(len(word)): |
|
wfrag = word[:ch + 1] |
|
if wfrag not in self.FREQ: |
|
self.FREQ[wfrag] = 0 |
|
if freq == 0: |
|
finalseg.add_force_split(word) |
|
|
|
def del_word(self, word): |
|
""" |
|
        Convenience function for deleting a word (equivalent to add_word(word, 0)).
|
""" |
|
self.add_word(word, 0) |
|
|
|
def suggest_freq(self, segment, tune=False): |
|
""" |
|
        Suggest a word frequency that forces the characters in a word to be
        joined or split.

        Parameter:
            - segment : The segments that the word is expected to be cut into.
                        If the word should be treated as a whole, use a str.
            - tune : If True, tune the word frequency.

        Note that HMM may affect the final result. If the result doesn't change,
        set HMM=False.
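
        Example (typical tuning calls; results depend on the loaded dictionary):

            import jieba
            jieba.suggest_freq(('中', '将'), True)   # encourage splitting into 中 / 将
            jieba.suggest_freq('台中', True)         # encourage keeping 台中 as one word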
|
""" |
|
self.check_initialized() |
|
ftotal = float(self.total) |
|
freq = 1 |
|
if isinstance(segment, string_types): |
|
word = segment |
|
for seg in self.cut(word, HMM=False): |
|
freq *= self.FREQ.get(seg, 1) / ftotal |
|
freq = max(int(freq * self.total) + 1, self.FREQ.get(word, 1)) |
|
else: |
|
segment = tuple(map(strdecode, segment)) |
|
word = ''.join(segment) |
|
for seg in segment: |
|
freq *= self.FREQ.get(seg, 1) / ftotal |
|
freq = min(int(freq * self.total), self.FREQ.get(word, 0)) |
|
if tune: |
|
self.add_word(word, freq) |
|
return freq |
|
|
|
def tokenize(self, unicode_sentence, mode="default", HMM=True): |
|
""" |
|
        Tokenize a sentence and yield tuples of (word, start, end).

        Parameter:
            - unicode_sentence: the str(unicode) to be segmented.
            - mode: "default" or "search"; "search" is for finer segmentation.
            - HMM: whether to use the Hidden Markov Model.
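
        Example (illustrative):

            import jieba
            for tk in jieba.tokenize('永和服装饰品有限公司'):
                print('word %s\t\t start: %d \t\t end: %d' % tk)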
|
""" |
|
if not isinstance(unicode_sentence, text_type): |
|
raise ValueError("jieba: the input parameter should be unicode.") |
|
start = 0 |
|
if mode == 'default': |
|
for w in self.cut(unicode_sentence, HMM=HMM): |
|
width = len(w) |
|
yield (w, start, start + width) |
|
start += width |
|
else: |
|
for w in self.cut(unicode_sentence, HMM=HMM): |
|
width = len(w) |
|
if len(w) > 2: |
|
for i in xrange(len(w) - 1): |
|
gram2 = w[i:i + 2] |
|
if self.FREQ.get(gram2): |
|
yield (gram2, start + i, start + i + 2) |
|
if len(w) > 3: |
|
for i in xrange(len(w) - 2): |
|
gram3 = w[i:i + 3] |
|
if self.FREQ.get(gram3): |
|
yield (gram3, start + i, start + i + 3) |
|
yield (w, start, start + width) |
|
start += width |
|
|
|
def set_dictionary(self, dictionary_path): |
|
with self.lock: |
|
abs_path = _get_abs_path(dictionary_path) |
|
if not os.path.isfile(abs_path): |
|
raise Exception("jieba: file does not exist: " + abs_path) |
|
self.dictionary = abs_path |
|
self.initialized = False |
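
    # set_dictionary() only records the new path; the prefix dict is rebuilt
    # lazily on the next call that needs it.  A sketch, with a hypothetical
    # path to a larger dictionary file:
    #
    #   import jieba
    #   jieba.set_dictionary('extra_dict/dict.txt.big')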
|
|
|
|
|
|
|
|
|
dt = Tokenizer() |
|
|
|
|
|
|
|
get_FREQ = lambda k, d=None: dt.FREQ.get(k, d) |
|
add_word = dt.add_word |
|
calc = dt.calc |
|
cut = dt.cut |
|
lcut = dt.lcut |
|
cut_for_search = dt.cut_for_search |
|
lcut_for_search = dt.lcut_for_search |
|
del_word = dt.del_word |
|
get_DAG = dt.get_DAG |
|
get_dict_file = dt.get_dict_file |
|
initialize = dt.initialize |
|
load_userdict = dt.load_userdict |
|
set_dictionary = dt.set_dictionary |
|
suggest_freq = dt.suggest_freq |
|
tokenize = dt.tokenize |
|
user_word_tag_tab = dt.user_word_tag_tab |
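
# The names above are the bound methods of the global default Tokenizer `dt`,
# so jieba.cut(...) and jieba.dt.cut(...) are the same call.  A sketch of an
# independent tokenizer with its own dictionary (the path is hypothetical):
#
#   import jieba
#   other = jieba.Tokenizer(dictionary='extra_dict/dict.txt.big')
#   words = other.lcut('我来到北京清华大学')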
|
|
|
|
|
def _lcut_all(s): |
|
return dt._lcut_all(s) |
|
|
|
|
|
def _lcut(s): |
|
return dt._lcut(s) |
|
|
|
|
|
def _lcut_no_hmm(s): |
|
return dt._lcut_no_hmm(s) |
|
|
|
|
|
|
|
|
|
|
def _lcut_for_search(s): |
|
return dt._lcut_for_search(s) |
|
|
|
|
|
def _lcut_for_search_no_hmm(s): |
|
return dt._lcut_for_search_no_hmm(s) |
|
|
|
|
|
def _pcut(sentence, cut_all=False, HMM=True): |
|
parts = strdecode(sentence).splitlines(True) |
|
if cut_all: |
|
result = pool.map(_lcut_all, parts) |
|
elif HMM: |
|
result = pool.map(_lcut, parts) |
|
else: |
|
result = pool.map(_lcut_no_hmm, parts) |
|
for r in result: |
|
for w in r: |
|
yield w |
|
|
|
|
|
def _pcut_for_search(sentence, HMM=True): |
|
parts = strdecode(sentence).splitlines(True) |
|
if HMM: |
|
result = pool.map(_lcut_for_search, parts) |
|
else: |
|
result = pool.map(_lcut_for_search_no_hmm, parts) |
|
for r in result: |
|
for w in r: |
|
yield w |
|
|
|
|
|
def enable_parallel(processnum=None): |
|
""" |
|
Change the module's `cut` and `cut_for_search` functions to the |
|
parallel version. |
|
|
|
    Note that this only works with the default Tokenizer `dt`; custom
    Tokenizer instances are not supported.
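
    Example (POSIX only; worker count and file path are illustrative):

        import jieba
        jieba.enable_parallel(4)                       # start a 4-process pool
        words = '/'.join(jieba.cut(open('big_file.txt').read()))
        jieba.disable_parallel()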
|
""" |
|
global pool, dt, cut, cut_for_search |
|
from multiprocessing import cpu_count |
|
if os.name == 'nt': |
|
        raise NotImplementedError(
            "jieba: parallel mode only supports POSIX systems")
|
else: |
|
from multiprocessing import Pool |
|
dt.check_initialized() |
|
if processnum is None: |
|
processnum = cpu_count() |
|
pool = Pool(processnum) |
|
cut = _pcut |
|
cut_for_search = _pcut_for_search |
|
|
|
|
|
def disable_parallel(): |
|
global pool, dt, cut, cut_for_search |
|
if pool: |
|
pool.close() |
|
pool = None |
|
cut = dt.cut |
|
cut_for_search = dt.cut_for_search |
|
|