from __future__ import absolute_import, unicode_literals |
__version__ = '0.42.1' |
__license__ = 'MIT' |
import marshal |
import re |
import tempfile |
import threading |
import time |
from hashlib import md5 |
from math import log |
from . import finalseg |
from ._compat import * |
# Pick the function used to move a freshly written cache file into place.
# NOTE(review): presumably shutil.move is used on Windows because os.rename
# refuses to overwrite an existing destination there -- confirm.
if os.name == 'nt':
    from shutil import move as _replace_file
else:
    _replace_file = os.rename


def _get_abs_path(path):
    """Return *path* joined onto the current working directory, normalised."""
    return os.path.normpath(os.path.join(os.getcwd(), path))


# Sentinel meaning "use the dictionary bundled with the package".
# NOTE(review): this name is used by Tokenizer but was not defined anywhere in
# this file; it is defined here on the assumption that ._compat does not
# already export it -- confirm.
DEFAULT_DICT = None
DEFAULT_DICT_NAME = "dict.txt"

# Module logger: DEBUG level, written to stderr.
log_console = logging.StreamHandler(sys.stderr)
default_logger = logging.getLogger(__name__)
default_logger.setLevel(logging.DEBUG)
default_logger.addHandler(log_console)

# Maps a dictionary path to the lock held while its prefix dict is rebuilt.
# NOTE(review): like DEFAULT_DICT, this was referenced but never defined in
# this file -- confirm it is not meant to come from ._compat.
DICT_WRITING = {}

# multiprocessing pool installed by enable_parallel(); None when disabled.
pool = None

# One user-dictionary line: word, optional " <digits>", optional " <a-z tag>".
re_userdict = re.compile('^(.+?)( [0-9]+)?( [a-z]+)?$', re.U)

# A single ASCII letter or digit.
re_eng = re.compile('[a-zA-Z0-9]', re.U)

# Runs of characters that are handed to the segmenter.  Backslashes are
# doubled so the literal contains no invalid escape sequences; the resulting
# pattern text is unchanged.
re_han_default = re.compile("([\u4E00-\u9FD5a-zA-Z0-9+#&\\._%\\-]+)", re.U)

# Whitespace separators, emitted as tokens of their own by Tokenizer.cut.
re_skip_default = re.compile("(\r\n|\\s)", re.U)
def setLogLevel(log_level):
    """Set the threshold of the module logger ``default_logger`` to *log_level*."""
    logger = default_logger
    logger.setLevel(log_level)
class Tokenizer(object):
    """Dictionary-backed word segmenter.

    A tokenizer owns a *prefix dictionary* (``FREQ``) built from a
    word-frequency file.  The dictionary is loaded lazily on first use (see
    ``initialize``) and cached on disk with ``marshal`` so later start-ups can
    skip parsing the text file.
    """

    def __init__(self, dictionary=DEFAULT_DICT):
        """Create an uninitialised tokenizer.

        Parameter:
            - dictionary: path of the word-frequency file.  ``DEFAULT_DICT``
              selects the dictionary returned by ``get_dict_file`` for the
              default case; any other value is stored as an absolute path.
        """
        # Serialises dictionary (re)loading for this instance.
        self.lock = threading.RLock()
        if dictionary == DEFAULT_DICT:
            self.dictionary = dictionary
        else:
            self.dictionary = _get_abs_path(dictionary)
        # word -> frequency; every proper prefix of a word is present too,
        # with frequency 0 unless it is a word itself.
        self.FREQ = {}
        # Sum of all word frequencies.
        self.total = 0
        # word -> tag, for words added with a tag via add_word/load_userdict.
        self.user_word_tag_tab = {}
        self.initialized = False
        # Optional overrides for where the marshal cache is stored.
        self.tmp_dir = None
        self.cache_file = None

    def __repr__(self):
        return '<Tokenizer dictionary=%r>' % self.dictionary

    @staticmethod
    def gen_pfdict(f):
        """Build the prefix dictionary from an open dictionary file.

        Parameter:
            - f: binary file object whose lines are utf-8 encoded
              ``word freq ...`` entries separated by single spaces.  It is
              always closed before this function returns or raises.

        Returns ``(lfreq, ltotal)``: the word -> frequency mapping (including
        zero-frequency prefixes) and the sum of all frequencies.

        Raises ValueError naming the file and line when an entry cannot be
        decoded or parsed.
        """
        lfreq = {}
        ltotal = 0
        f_name = resolve_filename(f)
        try:
            for lineno, line in enumerate(f, 1):
                try:
                    line = line.strip().decode('utf-8')
                    word, freq = line.split(' ')[:2]
                    freq = int(freq)
                    lfreq[word] = freq
                    ltotal += freq
                    # Register every prefix so get_DAG can stop scanning as
                    # soon as a fragment is not a prefix of any word.
                    for ch in xrange(len(word)):
                        wfrag = word[:ch + 1]
                        if wfrag not in lfreq:
                            lfreq[wfrag] = 0
                except ValueError:
                    raise ValueError(
                        'invalid dictionary entry in %s at Line %s: %s' % (f_name, lineno, line))
        finally:
            # Close even when an invalid entry aborts the scan.
            f.close()
        return lfreq, ltotal

    def initialize(self, dictionary=None):
        """Load the prefix dictionary, from the marshal cache when possible.

        Parameter:
            - dictionary: optional path that replaces the current dictionary.
              When it equals the current one and loading already happened,
              the call returns immediately.

        A cache file is used when it exists and, for a custom dictionary, is
        newer than the dictionary file; otherwise the dictionary is parsed
        and the cache is rewritten.  A failure to write the cache is logged
        and does not prevent initialisation.
        """
        if dictionary:
            abs_path = _get_abs_path(dictionary)
            if self.dictionary == abs_path and self.initialized:
                return
            else:
                self.dictionary = abs_path
                self.initialized = False
        else:
            abs_path = self.dictionary

        with self.lock:
            # Wait for any rebuild of the same dictionary that is in progress.
            try:
                with DICT_WRITING[abs_path]:
                    pass
            except KeyError:
                pass
            if self.initialized:
                return

            default_logger.debug("Building prefix dict from %s ..." % (abs_path or 'the default dictionary'))
            t1 = time.time()
            if self.cache_file:
                cache_file = self.cache_file
            elif abs_path == DEFAULT_DICT:
                cache_file = "jieba.cache"
            else:
                cache_file = "jieba.u%s.cache" % md5(
                    abs_path.encode('utf-8', 'replace')).hexdigest()
            cache_file = os.path.join(
                self.tmp_dir or tempfile.gettempdir(), cache_file)
            # Derived from the joined path so an absolute self.cache_file
            # still gets its temp file created next to it.
            tmpdir = os.path.dirname(cache_file)

            load_from_cache_fail = True
            if os.path.isfile(cache_file) and (abs_path == DEFAULT_DICT or
                    os.path.getmtime(cache_file) > os.path.getmtime(abs_path)):
                default_logger.debug(
                    "Loading model from cache %s" % cache_file)
                try:
                    # NOTE(review): marshal.load trusts the file content; the
                    # cache lives in a (possibly shared) temp directory.
                    with open(cache_file, 'rb') as cf:
                        self.FREQ, self.total = marshal.load(cf)
                    load_from_cache_fail = False
                except Exception:
                    # Any unreadable cache simply triggers a rebuild.
                    load_from_cache_fail = True

            if load_from_cache_fail:
                wlock = DICT_WRITING.get(abs_path, threading.RLock())
                DICT_WRITING[abs_path] = wlock
                try:
                    with wlock:
                        self.FREQ, self.total = self.gen_pfdict(self.get_dict_file())
                        default_logger.debug(
                            "Dumping model to file cache %s" % cache_file)
                        fpath = None
                        try:
                            # Write beside the target, then move into place.
                            fd, fpath = tempfile.mkstemp(dir=tmpdir)
                            with os.fdopen(fd, 'wb') as temp_cache_file:
                                marshal.dump(
                                    (self.FREQ, self.total), temp_cache_file)
                            _replace_file(fpath, cache_file)
                        except Exception:
                            default_logger.exception("Dump cache file failed.")
                            # Do not leave a half-written temp file behind.
                            if fpath is not None:
                                try:
                                    os.remove(fpath)
                                except OSError:
                                    pass
                finally:
                    # Drop the lock entry even if building the dict raised.
                    DICT_WRITING.pop(abs_path, None)

            self.initialized = True
            default_logger.debug(
                "Loading model cost %.3f seconds." % (time.time() - t1))
            default_logger.debug("Prefix dict has been built successfully.")

    def check_initialized(self):
        """Load the dictionary if that has not happened yet."""
        if not self.initialized:
            self.initialize()

    def calc(self, sentence, DAG, route):
        """Fill *route* with the best segmentation of every suffix.

        After the call ``route[i]`` is ``(score, end)``: the highest total
        log-probability obtainable for ``sentence[i:]`` and the inclusive end
        index of the first word on that path.  ``route[len(sentence)]`` is
        the ``(0, 0)`` base case.  Words with a missing or zero frequency
        count as frequency 1.
        """
        N = len(sentence)
        route[N] = (0, 0)
        logtotal = log(self.total)
        # Right to left, so route[x + 1] is known when position idx needs it.
        for idx in xrange(N - 1, -1, -1):
            route[idx] = max((log(self.FREQ.get(sentence[idx:x + 1]) or 1) -
                              logtotal + route[x + 1][0], x) for x in DAG[idx])

    def get_DAG(self, sentence):
        """Return the word graph of *sentence*.

        The result maps every start index ``k`` to the list of inclusive end
        indices ``i`` for which ``sentence[k:i + 1]`` has a non-zero
        frequency; when there is none the list is ``[k]``.
        """
        self.check_initialized()
        DAG = {}
        N = len(sentence)
        for k in xrange(N):
            tmplist = []
            i = k
            frag = sentence[k]
            # FREQ holds every prefix of every word, so the scan can stop at
            # the first fragment that is not a key.
            while i < N and frag in self.FREQ:
                if self.FREQ[frag]:
                    tmplist.append(i)
                i += 1
                frag = sentence[k:i + 1]
            if not tmplist:
                tmplist.append(k)
            DAG[k] = tmplist
        return DAG

    def __cut_all(self, sentence):
        """Yield every dictionary word found in *sentence* (full mode).

        Positions whose only candidate matches ``re_eng`` are accumulated and
        emitted as a single token.
        """
        dag = self.get_DAG(sentence)
        old_j = -1
        eng_scan = 0
        eng_buf = u''
        # get_DAG has an entry for every position; walking the positions
        # explicitly avoids depending on dict iteration order.
        for k in xrange(len(sentence)):
            L = dag[k]
            if eng_scan == 1 and not re_eng.match(sentence[k]):
                eng_scan = 0
                yield eng_buf
            if len(L) == 1 and k > old_j:
                word = sentence[k:L[0] + 1]
                if re_eng.match(word):
                    if eng_scan == 0:
                        eng_scan = 1
                        eng_buf = word
                    else:
                        eng_buf += word
                if eng_scan == 0:
                    yield word
                old_j = L[0]
            else:
                for j in L:
                    if j > k:
                        yield sentence[k:j + 1]
                        old_j = j
        if eng_scan == 1:
            yield eng_buf

    def __cut_DAG_NO_HMM(self, sentence):
        """Yield the best-route segmentation of *sentence* without the HMM.

        Consecutive single characters matching ``re_eng`` are merged into one
        token.
        """
        DAG = self.get_DAG(sentence)
        route = {}
        self.calc(sentence, DAG, route)
        x = 0
        N = len(sentence)
        buf = ''
        while x < N:
            y = route[x][1] + 1
            l_word = sentence[x:y]
            if re_eng.match(l_word) and len(l_word) == 1:
                buf += l_word
                x = y
            else:
                if buf:
                    yield buf
                    buf = ''
                yield l_word
                x = y
        if buf:
            yield buf
            buf = ''

    def _cut_buffered(self, buf):
        """Yield the tokens for *buf*, a non-empty run of single characters.

        A one-character run is yielded as is.  A longer run with no non-zero
        frequency in ``FREQ`` is segmented by ``finalseg.cut``; otherwise it
        is yielded character by character.
        """
        if len(buf) == 1:
            yield buf
        elif not self.FREQ.get(buf):
            for t in finalseg.cut(buf):
                yield t
        else:
            for elem in buf:
                yield elem

    def __cut_DAG(self, sentence):
        """Yield the best-route segmentation of *sentence*, using the HMM.

        Runs of single-character words on the best route are collected and
        re-segmented by ``_cut_buffered``.
        """
        DAG = self.get_DAG(sentence)
        route = {}
        self.calc(sentence, DAG, route)
        x = 0
        buf = ''
        N = len(sentence)
        while x < N:
            y = route[x][1] + 1
            l_word = sentence[x:y]
            if y - x == 1:
                buf += l_word
            else:
                if buf:
                    for t in self._cut_buffered(buf):
                        yield t
                    buf = ''
                yield l_word
            x = y

        if buf:
            for t in self._cut_buffered(buf):
                yield t

    def cut(self, sentence, cut_all=False, HMM=True, use_paddle=False):
        """
        The main function that segments an entire sentence that contains
        Chinese characters into separated words.

        Parameter:
            - sentence: The str(unicode) to be segmented.
            - cut_all: Model type. True for full pattern, False for accurate pattern.
            - HMM: Whether to use the Hidden Markov Model.
            - use_paddle: When True and ``check_paddle_install`` reports that
              paddle is installed, delegate to ``jieba.lac_small.predict``
              and ignore ``cut_all`` and ``HMM``.

        Yields the tokens in order.  Whitespace matched by
        ``re_skip_default`` is yielded as tokens too.
        """
        is_paddle_installed = check_paddle_install['is_paddle_installed']
        sentence = strdecode(sentence)
        if use_paddle and is_paddle_installed:
            if sentence is None or len(sentence) == 0:
                return
            # Imported lazily so the dependency is only needed in this mode.
            import jieba.lac_small.predict as predict
            results = predict.get_sent(sentence)
            for sent in results:
                if sent is None:
                    continue
                yield sent
            return
        re_han = re_han_default
        re_skip = re_skip_default
        if cut_all:
            cut_block = self.__cut_all
        elif HMM:
            cut_block = self.__cut_DAG
        else:
            cut_block = self.__cut_DAG_NO_HMM
        # The pattern has a capturing group, so split() keeps the matched
        # blocks and interleaves them with the text in between.
        blocks = re_han.split(sentence)
        for blk in blocks:
            if not blk:
                continue
            if re_han.match(blk):
                for word in cut_block(blk):
                    yield word
            else:
                tmp = re_skip.split(blk)
                for x in tmp:
                    if re_skip.match(x):
                        yield x
                    elif not cut_all:
                        for xx in x:
                            yield xx
                    else:
                        yield x

    def cut_for_search(self, sentence, HMM=True):
        """
        Finer segmentation for search engines.

        For every word produced by ``cut``, first yields its 2-character
        (words longer than 2) and 3-character (words longer than 3)
        substrings that have a non-zero frequency, then the word itself.
        """
        words = self.cut(sentence, HMM=HMM)
        for w in words:
            if len(w) > 2:
                for i in xrange(len(w) - 1):
                    gram2 = w[i:i + 2]
                    if self.FREQ.get(gram2):
                        yield gram2
            if len(w) > 3:
                for i in xrange(len(w) - 2):
                    gram3 = w[i:i + 3]
                    if self.FREQ.get(gram3):
                        yield gram3
            yield w

    def lcut(self, *args, **kwargs):
        """Same as ``cut`` but returns a list."""
        return list(self.cut(*args, **kwargs))

    def lcut_for_search(self, *args, **kwargs):
        """Same as ``cut_for_search`` but returns a list."""
        return list(self.cut_for_search(*args, **kwargs))

    _lcut = lcut
    _lcut_for_search = lcut_for_search

    def _lcut_no_hmm(self, sentence):
        """``lcut`` in accurate mode with the HMM disabled."""
        return self.lcut(sentence, False, False)

    def _lcut_all(self, sentence):
        """``lcut`` in full mode."""
        return self.lcut(sentence, True)

    def _lcut_for_search_no_hmm(self, sentence):
        """``lcut_for_search`` with the HMM disabled."""
        return self.lcut_for_search(sentence, False)

    def get_dict_file(self):
        """Return an open binary file object for the current dictionary."""
        if self.dictionary == DEFAULT_DICT:
            return get_module_res(DEFAULT_DICT_NAME)
        else:
            return open(self.dictionary, 'rb')

    def load_userdict(self, f):
        '''
        Load personalized dict to improve detect rate.

        Parameter:
            - f : A plain text file contains words and their occurrences.
                  Can be a file-like object, or the path of the dictionary file,
                  whose encoding must be utf-8.

        Structure of dict file:
        word1 freq1 word_type1
        word2 freq2 word_type2
        ...
        Word type may be ignored

        Raises ValueError when a line is not valid utf-8.  A file opened here
        from a path is closed before returning; a file object passed in by
        the caller is left open.
        '''
        self.check_initialized()
        opened_here = isinstance(f, string_types)
        if opened_here:
            f_name = f
            f = open(f, 'rb')
        else:
            f_name = resolve_filename(f)
        try:
            for ln in f:
                line = ln.strip()
                if not isinstance(line, text_type):
                    try:
                        # Drop a leading byte-order mark if present.
                        line = line.decode('utf-8').lstrip('\ufeff')
                    except UnicodeDecodeError:
                        raise ValueError('dictionary file %s must be utf-8' % f_name)
                if not line:
                    continue
                # match won't be None because there's at least one character
                word, freq, tag = re_userdict.match(line).groups()
                if freq is not None:
                    freq = freq.strip()
                if tag is not None:
                    tag = tag.strip()
                self.add_word(word, freq, tag)
        finally:
            if opened_here:
                f.close()

    def add_word(self, word, freq=None, tag=None):
        """
        Add a word to dictionary.

        freq and tag can be omitted, freq defaults to be a calculated value
        that ensures the word can be cut out.

        A frequency of 0 additionally registers the word with
        ``finalseg.add_force_split``.
        """
        self.check_initialized()
        word = strdecode(word)
        freq = int(freq) if freq is not None else self.suggest_freq(word, False)
        self.FREQ[word] = freq
        self.total += freq
        if tag:
            self.user_word_tag_tab[word] = tag
        # Keep the prefix invariant that get_DAG relies on.
        for ch in xrange(len(word)):
            wfrag = word[:ch + 1]
            if wfrag not in self.FREQ:
                self.FREQ[wfrag] = 0
        if freq == 0:
            finalseg.add_force_split(word)

    def del_word(self, word):
        """
        Convenient function for deleting a word.

        Implemented as ``add_word(word, 0)``: the entry stays in ``FREQ``
        with frequency 0.
        """
        self.add_word(word, 0)

    def suggest_freq(self, segment, tune=False):
        """
        Suggest word frequency to force the characters in a word to be
        joined or split.

        Parameter:
            - segment : The segments that the word is expected to be cut into,
                        If the word should be treated as a whole, use a str.
            - tune : If True, tune the word frequency.

        Returns the suggested frequency.

        Note that HMM may affect the final result. If the result doesn't change,
        set HMM=False.
        """
        self.check_initialized()
        ftotal = float(self.total)
        freq = 1
        if isinstance(segment, string_types):
            # Join: the word must outscore its current segmentation.
            word = segment
            for seg in self.cut(word, HMM=False):
                freq *= self.FREQ.get(seg, 1) / ftotal
            freq = max(int(freq * self.total) + 1, self.FREQ.get(word, 1))
        else:
            # Split: the joined word must not outscore the given segments.
            segment = tuple(map(strdecode, segment))
            word = ''.join(segment)
            for seg in segment:
                freq *= self.FREQ.get(seg, 1) / ftotal
            freq = min(int(freq * self.total), self.FREQ.get(word, 0))
        if tune:
            self.add_word(word, freq)
        return freq

    def tokenize(self, unicode_sentence, mode="default", HMM=True):
        """
        Tokenize a sentence and yields tuples of (word, start, end)

        Parameter:
            - unicode_sentence: the str(unicode) to be segmented.
            - mode: "default" or "search", "search" is for finer segmentation.
                    Any value other than "default" selects the search behaviour.
            - HMM: whether to use the Hidden Markov Model.

        Raises ValueError when *unicode_sentence* is not a text string.
        """
        if not isinstance(unicode_sentence, text_type):
            raise ValueError("jieba: the input parameter should be unicode.")
        start = 0
        if mode == 'default':
            for w in self.cut(unicode_sentence, HMM=HMM):
                width = len(w)
                yield (w, start, start + width)
                start += width
        else:
            for w in self.cut(unicode_sentence, HMM=HMM):
                width = len(w)
                if len(w) > 2:
                    for i in xrange(len(w) - 1):
                        gram2 = w[i:i + 2]
                        if self.FREQ.get(gram2):
                            yield (gram2, start + i, start + i + 2)
                if len(w) > 3:
                    for i in xrange(len(w) - 2):
                        gram3 = w[i:i + 3]
                        if self.FREQ.get(gram3):
                            yield (gram3, start + i, start + i + 3)
                yield (w, start, start + width)
                start += width

    def set_dictionary(self, dictionary_path):
        """Switch to another dictionary file; it is loaded on next use.

        Raises Exception when *dictionary_path* is not an existing file.
        """
        with self.lock:
            abs_path = _get_abs_path(dictionary_path)
            if not os.path.isfile(abs_path):
                raise Exception("jieba: file does not exist: " + abs_path)
            self.dictionary = abs_path
            self.initialized = False
# Default tokenizer shared by the module-level API below.
dt = Tokenizer()


def get_FREQ(k, d=None):
    """Return the frequency of *k* in the default tokenizer, or *d* if absent."""
    return dt.FREQ.get(k, d)


# Segmentation entry points.
cut = dt.cut
lcut = dt.lcut
cut_for_search = dt.cut_for_search
lcut_for_search = dt.lcut_for_search
tokenize = dt.tokenize

# Dictionary management.
initialize = dt.initialize
set_dictionary = dt.set_dictionary
get_dict_file = dt.get_dict_file
load_userdict = dt.load_userdict
add_word = dt.add_word
del_word = dt.del_word
suggest_freq = dt.suggest_freq
user_word_tag_tab = dt.user_word_tag_tab

# Lower-level helpers.
calc = dt.calc
get_DAG = dt.get_DAG
# Plain module-level wrappers around the default tokenizer, used as the
# worker functions handed to pool.map by _pcut and _pcut_for_search.
# NOTE(review): presumably these exist so the workers can be pickled by
# name -- confirm.

def _lcut_all(s):
    """Full-mode ``lcut`` on the default tokenizer."""
    return dt._lcut_all(s)


def _lcut(s):
    """Accurate-mode ``lcut`` on the default tokenizer."""
    return dt._lcut(s)


def _lcut_no_hmm(s):
    """Accurate-mode ``lcut`` without the HMM on the default tokenizer."""
    return dt._lcut_no_hmm(s)


def _lcut_for_search(s):
    """``lcut_for_search`` on the default tokenizer."""
    return dt._lcut_for_search(s)


def _lcut_for_search_no_hmm(s):
    """``lcut_for_search`` without the HMM on the default tokenizer."""
    return dt._lcut_for_search_no_hmm(s)
def _pcut(sentence, cut_all=False, HMM=True):
    """Parallel counterpart of ``cut``.

    The decoded sentence is split into lines (line endings kept), each line
    is segmented through ``pool.map`` and the tokens are yielded in order.
    """
    if cut_all:
        worker = _lcut_all
    elif HMM:
        worker = _lcut
    else:
        worker = _lcut_no_hmm
    lines = strdecode(sentence).splitlines(True)
    for tokens in pool.map(worker, lines):
        for token in tokens:
            yield token
def _pcut_for_search(sentence, HMM=True):
    """Parallel counterpart of ``cut_for_search``.

    The decoded sentence is split into lines (line endings kept), each line
    is segmented through ``pool.map`` and the tokens are yielded in order.
    """
    worker = _lcut_for_search if HMM else _lcut_for_search_no_hmm
    lines = strdecode(sentence).splitlines(True)
    for tokens in pool.map(worker, lines):
        for token in tokens:
            yield token
def enable_parallel(processnum=None):
    """
    Change the module's `cut` and `cut_for_search` functions to the
    parallel version.

    Note that this only works using dt, custom Tokenizer
    instances are not supported.

    Parameter:
        - processnum: number of worker processes; defaults to
          ``multiprocessing.cpu_count()``.

    Raises NotImplementedError on Windows (``os.name == 'nt'``).
    Calling it again replaces the pool; the previous pool is closed.
    """
    global pool, cut, cut_for_search
    if os.name == 'nt':
        raise NotImplementedError(
            "jieba: parallel mode only supports posix system")
    from multiprocessing import Pool, cpu_count
    # Load the dictionary before the workers are created.
    # NOTE(review): presumably so forked workers inherit it -- confirm.
    dt.check_initialized()
    if processnum is None:
        processnum = cpu_count()
    new_pool = Pool(processnum)
    # Close a pool left over from an earlier call instead of leaking its
    # workers; done only after the new pool was created successfully.
    if pool is not None:
        pool.close()
    pool = new_pool
    cut = _pcut
    cut_for_search = _pcut_for_search
def disable_parallel():
    """Close the worker pool, if any, and restore the serial `cut` functions."""
    global pool, dt, cut, cut_for_search
    active_pool = pool
    if active_pool:
        active_pool.close()
        pool = None
    # Rebind the module-level entry points to the default tokenizer.
    cut = dt.cut
    cut_for_search = dt.cut_for_search