from __future__ import absolute_import, unicode_literals

__version__ = '0.38'
__license__ = 'MIT'

import re
import os
import sys
import time
import logging
import marshal
import tempfile
import threading
from math import log
from hashlib import md5
from ._compat import *
from . import finalseg

if os.name == 'nt':
    from shutil import move as _replace_file
else:
    _replace_file = os.rename

_get_abs_path = lambda path: os.path.normpath(os.path.join(os.getcwd(), path))

DEFAULT_DICT = None
DEFAULT_DICT_NAME = "dict.txt"

log_console = logging.StreamHandler(sys.stderr)
default_logger = logging.getLogger(__name__)
default_logger.setLevel(logging.DEBUG)
default_logger.addHandler(log_console)

DICT_WRITING = {}

pool = None

re_userdict = re.compile('^(.+?)( [0-9]+)?( [a-z]+)?$', re.U)

re_eng = re.compile('[a-zA-Z0-9]', re.U)

# \u4E00-\u9FD5a-zA-Z0-9+#&\._ : All non-space characters. Will be handled with re_han
# \r\n|\s : whitespace characters. Will not be handled.
re_han_default = re.compile("([\u4E00-\u9FD5a-zA-Z0-9+#&\._]+)", re.U)
re_skip_default = re.compile("(\r\n|\s)", re.U)
re_han_cut_all = re.compile("([\u4E00-\u9FD5]+)", re.U)
re_skip_cut_all = re.compile("[^a-zA-Z0-9+#\n]", re.U)


def setLogLevel(log_level):
    default_logger.setLevel(log_level)


class Tokenizer(object):

    def __init__(self, dictionary=DEFAULT_DICT):
        self.lock = threading.RLock()
        if dictionary == DEFAULT_DICT:
            self.dictionary = dictionary
        else:
            self.dictionary = _get_abs_path(dictionary)
        self.FREQ = {}
        self.total = 0
        self.user_word_tag_tab = {}
        self.initialized = False
        self.tmp_dir = None
        self.cache_file = None

    def __repr__(self):
        return '<Tokenizer dictionary=%r>' % self.dictionary

    def gen_pfdict(self, f):
        lfreq = {}
        ltotal = 0
        f_name = resolve_filename(f)
        for lineno, line in enumerate(f, 1):
            try:
                line = line.strip().decode('utf-8')
                word, freq = line.split(' ')[:2]
                freq = int(freq)
                lfreq[word] = freq
                ltotal += freq
                for ch in xrange(len(word)):
                    wfrag = word[:ch + 1]
                    if wfrag not in lfreq:
                        lfreq[wfrag] = 0
            except ValueError:
                raise ValueError(
                    'invalid dictionary entry in %s at Line %s: %s' % (f_name, lineno, line))
        f.close()
        return lfreq, ltotal
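
    # Worked example (illustrative, not executed): for a dictionary file with
    # the two lines "AB 3" and "ABC 2", gen_pfdict returns
    # ({'A': 0, 'AB': 3, 'ABC': 2}, 5).  Every proper prefix of every word is
    # kept with frequency 0, so get_DAG below can stop scanning a fragment as
    # soon as it is no longer a known prefix.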

    def initialize(self, dictionary=None):
        if dictionary:
            abs_path = _get_abs_path(dictionary)
            if self.dictionary == abs_path and self.initialized:
                return
            else:
                self.dictionary = abs_path
                self.initialized = False
        else:
            abs_path = self.dictionary

        with self.lock:
            try:
                with DICT_WRITING[abs_path]:
                    pass
            except KeyError:
                pass
            if self.initialized:
                return

            default_logger.debug("Building prefix dict from %s ..."
                                 % (abs_path or 'the default dictionary'))
            t1 = time.time()
            if self.cache_file:
                cache_file = self.cache_file
            # default dictionary
            elif abs_path == DEFAULT_DICT:
                cache_file = "jieba.cache"
            # custom dictionary
            else:
                cache_file = "jieba.u%s.cache" % md5(
                    abs_path.encode('utf-8', 'replace')).hexdigest()
            cache_file = os.path.join(
                self.tmp_dir or tempfile.gettempdir(), cache_file)
            # prevent absolute path in self.cache_file
            tmpdir = os.path.dirname(cache_file)

            load_from_cache_fail = True
            if os.path.isfile(cache_file) and (abs_path == DEFAULT_DICT or
                    os.path.getmtime(cache_file) > os.path.getmtime(abs_path)):
                default_logger.debug(
                    "Loading model from cache %s" % cache_file)
                try:
                    with open(cache_file, 'rb') as cf:
                        self.FREQ, self.total = marshal.load(cf)
                    load_from_cache_fail = False
                except Exception:
                    load_from_cache_fail = True

            if load_from_cache_fail:
                wlock = DICT_WRITING.get(abs_path, threading.RLock())
                DICT_WRITING[abs_path] = wlock
                with wlock:
                    self.FREQ, self.total = self.gen_pfdict(self.get_dict_file())
                    default_logger.debug(
                        "Dumping model to file cache %s" % cache_file)
                    try:
                        # prevent moving across different filesystems
                        fd, fpath = tempfile.mkstemp(dir=tmpdir)
                        with os.fdopen(fd, 'wb') as temp_cache_file:
                            marshal.dump(
                                (self.FREQ, self.total), temp_cache_file)
                        _replace_file(fpath, cache_file)
                    except Exception:
                        default_logger.exception("Dump cache file failed.")

                try:
                    del DICT_WRITING[abs_path]
                except KeyError:
                    pass

            self.initialized = True
            default_logger.debug(
                "Loading model cost %.3f seconds." % (time.time() - t1))
            default_logger.debug("Prefix dict has been built successfully.")

    def check_initialized(self):
        if not self.initialized:
            self.initialize()

    def calc(self, sentence, DAG, route):
        N = len(sentence)
        route[N] = (0, 0)
        logtotal = log(self.total)
        for idx in xrange(N - 1, -1, -1):
            route[idx] = max((log(self.FREQ.get(sentence[idx:x + 1]) or 1) -
                              logtotal + route[x + 1][0], x) for x in DAG[idx])

    def get_DAG(self, sentence):
        self.check_initialized()
        DAG = {}
        N = len(sentence)
        for k in xrange(N):
            tmplist = []
            i = k
            frag = sentence[k]
            while i < N and frag in self.FREQ:
                if self.FREQ[frag]:
                    tmplist.append(i)
                i += 1
                frag = sentence[k:i + 1]
            if not tmplist:
                tmplist.append(k)
            DAG[k] = tmplist
        return DAG

    def __cut_all(self, sentence):
        dag = self.get_DAG(sentence)
        old_j = -1
        for k, L in iteritems(dag):
            if len(L) == 1 and k > old_j:
                yield sentence[k:L[0] + 1]
                old_j = L[0]
            else:
                for j in L:
                    if j > k:
                        yield sentence[k:j + 1]
                        old_j = j

    def __cut_DAG_NO_HMM(self, sentence):
        DAG = self.get_DAG(sentence)
        route = {}
        self.calc(sentence, DAG, route)
        x = 0
        N = len(sentence)
        buf = ''
        while x < N:
            y = route[x][1] + 1
            l_word = sentence[x:y]
            if re_eng.match(l_word) and len(l_word) == 1:
                buf += l_word
                x = y
            else:
                if buf:
                    yield buf
                    buf = ''
                yield l_word
                x = y
        if buf:
            yield buf
            buf = ''

    def __cut_DAG(self, sentence):
        DAG = self.get_DAG(sentence)
        route = {}
        self.calc(sentence, DAG, route)
        x = 0
        buf = ''
        N = len(sentence)
        while x < N:
            y = route[x][1] + 1
            l_word = sentence[x:y]
            if y - x == 1:
                buf += l_word
            else:
                if buf:
                    if len(buf) == 1:
                        yield buf
                        buf = ''
                    else:
                        if not self.FREQ.get(buf):
                            recognized = finalseg.cut(buf)
                            for t in recognized:
                                yield t
                        else:
                            for elem in buf:
                                yield elem
                        buf = ''
                yield l_word
            x = y

        if buf:
            if len(buf) == 1:
                yield buf
            elif not self.FREQ.get(buf):
                recognized = finalseg.cut(buf)
                for t in recognized:
                    yield t
            else:
                for elem in buf:
                    yield elem
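
    # How the pieces above fit together (summary; the sample values are
    # illustrative): get_DAG maps each start index k to the list of end
    # indexes i for which sentence[k:i+1] is a dictionary word, with k itself
    # always kept as a fallback, e.g. {0: [0, 1], 1: [1], 2: [2, 3], 3: [3]}
    # for a four-character sentence.  calc then walks the DAG right-to-left
    # and records, for every position, the end index that maximises the
    # accumulated log frequency.  __cut_DAG and __cut_DAG_NO_HMM read that
    # best path back out; the HMM variant additionally hands runs of single
    # characters to finalseg.cut for new-word discovery.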

    def cut(self, sentence, cut_all=False, HMM=True):
        '''
        The main function that segments an entire sentence that contains
        Chinese characters into separated words.

        Parameter:
            - sentence: The str(unicode) to be segmented.
            - cut_all: Model type. True for full pattern, False for accurate pattern.
            - HMM: Whether to use the Hidden Markov Model.
        '''
        sentence = strdecode(sentence)

        if cut_all:
            re_han = re_han_cut_all
            re_skip = re_skip_cut_all
        else:
            re_han = re_han_default
            re_skip = re_skip_default
        if cut_all:
            cut_block = self.__cut_all
        elif HMM:
            cut_block = self.__cut_DAG
        else:
            cut_block = self.__cut_DAG_NO_HMM
        blocks = re_han.split(sentence)
        for blk in blocks:
            if not blk:
                continue
            if re_han.match(blk):
                for word in cut_block(blk):
                    yield word
            else:
                tmp = re_skip.split(blk)
                for x in tmp:
                    if re_skip.match(x):
                        yield x
                    elif not cut_all:
                        for xx in x:
                            yield xx
                    else:
                        yield x

    def cut_for_search(self, sentence, HMM=True):
        """
        Finer segmentation for search engines.
        """
        words = self.cut(sentence, HMM=HMM)
        for w in words:
            if len(w) > 2:
                for i in xrange(len(w) - 1):
                    gram2 = w[i:i + 2]
                    if self.FREQ.get(gram2):
                        yield gram2
            if len(w) > 3:
                for i in xrange(len(w) - 2):
                    gram3 = w[i:i + 3]
                    if self.FREQ.get(gram3):
                        yield gram3
            yield w

    def lcut(self, *args, **kwargs):
        return list(self.cut(*args, **kwargs))

    def lcut_for_search(self, *args, **kwargs):
        return list(self.cut_for_search(*args, **kwargs))

    _lcut = lcut
    _lcut_for_search = lcut_for_search

    def _lcut_no_hmm(self, sentence):
        return self.lcut(sentence, False, False)

    def _lcut_all(self, sentence):
        return self.lcut(sentence, True)

    def _lcut_for_search_no_hmm(self, sentence):
        return self.lcut_for_search(sentence, False)

    def get_dict_file(self):
        if self.dictionary == DEFAULT_DICT:
            return get_module_res(DEFAULT_DICT_NAME)
        else:
            return open(self.dictionary, 'rb')

    def load_userdict(self, f):
        '''
        Load personalized dict to improve detection rate.

        Parameter:
            - f : A plain text file that contains words and their occurrences.
                  Can be a file-like object, or the path of the dictionary file,
                  whose encoding must be utf-8.

        Structure of dict file:
        word1 freq1 word_type1
        word2 freq2 word_type2
        ...
        Word type may be ignored
        '''
        self.check_initialized()
        if isinstance(f, string_types):
            f_name = f
            f = open(f, 'rb')
        else:
            f_name = resolve_filename(f)
        for lineno, ln in enumerate(f, 1):
            line = ln.strip()
            if not isinstance(line, text_type):
                try:
                    line = line.decode('utf-8').lstrip('\ufeff')
                except UnicodeDecodeError:
                    raise ValueError('dictionary file %s must be utf-8' % f_name)
            if not line:
                continue
            # match won't be None because there's at least one character
            word, freq, tag = re_userdict.match(line).groups()
            if freq is not None:
                freq = freq.strip()
            if tag is not None:
                tag = tag.strip()
            self.add_word(word, freq, tag)

    def add_word(self, word, freq=None, tag=None):
        """
        Add a word to dictionary.

        freq and tag can be omitted, freq defaults to be a calculated value
        that ensures the word can be cut out.
        """
        self.check_initialized()
        word = strdecode(word)
        freq = int(freq) if freq is not None else self.suggest_freq(word, False)
        self.FREQ[word] = freq
        self.total += freq
        if tag:
            self.user_word_tag_tab[word] = tag
        for ch in xrange(len(word)):
            wfrag = word[:ch + 1]
            if wfrag not in self.FREQ:
                self.FREQ[wfrag] = 0

    def del_word(self, word):
        """
        Convenient function for deleting a word.
        """
        self.add_word(word, 0)
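
    # Usage sketch (the word, tag and file name below are illustrative):
    #     dt.add_word('台中', tag='ns')     # freq omitted: suggest_freq picks a
    #                                       # value large enough to keep it whole
    #     dt.del_word('台中')               # freq 0: the word can be split again
    #     dt.load_userdict('userdict.txt')  # same "word freq word_type" format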

    def suggest_freq(self, segment, tune=False):
        """
        Suggest word frequency to force the characters in a word to be
        joined or split.

        Parameter:
            - segment : The segments that the word is expected to be cut into.
                        If the word should be treated as a whole, use a str.
            - tune : If True, tune the word frequency.

        Note that HMM may affect the final result. If the result doesn't
        change, set HMM=False.
        """
        self.check_initialized()
        ftotal = float(self.total)
        freq = 1
        if isinstance(segment, string_types):
            word = segment
            for seg in self.cut(word, HMM=False):
                freq *= self.FREQ.get(seg, 1) / ftotal
            freq = max(int(freq * self.total) + 1, self.FREQ.get(word, 1))
        else:
            segment = tuple(map(strdecode, segment))
            word = ''.join(segment)
            for seg in segment:
                freq *= self.FREQ.get(seg, 1) / ftotal
            freq = min(int(freq * self.total), self.FREQ.get(word, 0))
        if tune:
            self.add_word(word, freq)
        return freq

    def tokenize(self, unicode_sentence, mode="default", HMM=True):
        """
        Tokenize a sentence and yield tuples of (word, start, end)

        Parameter:
            - sentence: the str(unicode) to be segmented.
            - mode: "default" or "search", "search" is for finer segmentation.
            - HMM: whether to use the Hidden Markov Model.
        """
        if not isinstance(unicode_sentence, text_type):
            raise ValueError("jieba: the input parameter should be unicode.")
        start = 0
        if mode == 'default':
            for w in self.cut(unicode_sentence, HMM=HMM):
                width = len(w)
                yield (w, start, start + width)
                start += width
        else:
            for w in self.cut(unicode_sentence, HMM=HMM):
                width = len(w)
                if len(w) > 2:
                    for i in xrange(len(w) - 1):
                        gram2 = w[i:i + 2]
                        if self.FREQ.get(gram2):
                            yield (gram2, start + i, start + i + 2)
                if len(w) > 3:
                    for i in xrange(len(w) - 2):
                        gram3 = w[i:i + 3]
                        if self.FREQ.get(gram3):
                            yield (gram3, start + i, start + i + 3)
                yield (w, start, start + width)
                start += width

    def set_dictionary(self, dictionary_path):
        with self.lock:
            abs_path = _get_abs_path(dictionary_path)
            if not os.path.isfile(abs_path):
                raise Exception("jieba: file does not exist: " + abs_path)
            self.dictionary = abs_path
            self.initialized = False


# default Tokenizer instance
dt = Tokenizer()

# global functions
get_FREQ = lambda k, d=None: dt.FREQ.get(k, d)
add_word = dt.add_word
calc = dt.calc
cut = dt.cut
lcut = dt.lcut
cut_for_search = dt.cut_for_search
lcut_for_search = dt.lcut_for_search
del_word = dt.del_word
get_DAG = dt.get_DAG
get_dict_file = dt.get_dict_file
initialize = dt.initialize
load_userdict = dt.load_userdict
set_dictionary = dt.set_dictionary
suggest_freq = dt.suggest_freq
tokenize = dt.tokenize
user_word_tag_tab = dt.user_word_tag_tab


def _lcut_all(s):
    return dt._lcut_all(s)


def _lcut(s):
    return dt._lcut(s)


def _lcut_no_hmm(s):
    return dt._lcut_no_hmm(s)


def _lcut_for_search(s):
    return dt._lcut_for_search(s)


def _lcut_for_search_no_hmm(s):
    return dt._lcut_for_search_no_hmm(s)


def _pcut(sentence, cut_all=False, HMM=True):
    parts = strdecode(sentence).splitlines(True)
    if cut_all:
        result = pool.map(_lcut_all, parts)
    elif HMM:
        result = pool.map(_lcut, parts)
    else:
        result = pool.map(_lcut_no_hmm, parts)
    for r in result:
        for w in r:
            yield w


def _pcut_for_search(sentence, HMM=True):
    parts = strdecode(sentence).splitlines(True)
    if HMM:
        result = pool.map(_lcut_for_search, parts)
    else:
        result = pool.map(_lcut_for_search_no_hmm, parts)
    for r in result:
        for w in r:
            yield w
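

# Usage sketch for the parallel helpers below (POSIX only; the process count
# and variable name are illustrative):
#     jieba.enable_parallel(4)            # rebinds jieba.cut / jieba.cut_for_search
#     words = list(jieba.cut(long_text))  # lines are split across the pool
#     jieba.disable_parallel()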
""" global pool, dt, cut, cut_for_search from multiprocessing import cpu_count if os.name == 'nt': raise NotImplementedError( "jieba: parallel mode only supports posix system") else: from multiprocessing import Pool dt.check_initialized() if processnum is None: processnum = cpu_count() pool = Pool(processnum) cut = _pcut cut_for_search = _pcut_for_search def disable_parallel(): global pool, dt, cut, cut_for_search if pool: pool.close() pool = None cut = dt.cut cut_for_search = dt.cut_for_search