Migrate to HF Space
- .DS_Store +0 -0
- LICENSE +21 -0
- README.md +6 -11
- app.py +14 -0
- jieba/__init__.py +581 -0
- jieba/__main__.py +61 -0
- jieba/_compat.py +46 -0
- jieba/analyse/__init__.py +18 -0
- jieba/analyse/analyzer.py +37 -0
- jieba/analyse/idf.txt +0 -0
- jieba/analyse/textrank.py +110 -0
- jieba/analyse/tfidf.py +116 -0
- jieba/dict.txt +0 -0
- jieba/finalseg/__init__.py +92 -0
- jieba/finalseg/prob_emit.p +0 -0
- jieba/finalseg/prob_emit.py +0 -0
- jieba/finalseg/prob_start.p +14 -0
- jieba/finalseg/prob_start.py +4 -0
- jieba/finalseg/prob_trans.p +30 -0
- jieba/finalseg/prob_trans.py +4 -0
- jieba/posseg/__init__.py +294 -0
- jieba/posseg/char_state_tab.p +0 -0
- jieba/posseg/char_state_tab.py +0 -0
- jieba/posseg/prob_emit.p +0 -0
- jieba/posseg/prob_emit.py +0 -0
- jieba/posseg/prob_start.p +1094 -0
- jieba/posseg/prob_start.py +256 -0
- jieba/posseg/prob_trans.p +0 -0
- jieba/posseg/prob_trans.py +0 -0
- jieba/posseg/viterbi.py +53 -0
- models.json +5 -0
- pages/01_🍊Mandarin.py +209 -0
- pages/02_🍣Japanese.py +183 -0
- pages/03_🍔English.py +206 -0
- requirements.txt +27 -0
- tocfl_wordlist.csv +0 -0
- update_data.py +63 -0
.DS_Store
ADDED
Binary file (6.15 kB).
LICENSE
ADDED
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2020 ExplosionAI GmbH
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
README.md
CHANGED
@@ -1,12 +1,7 @@
-
-title: Spacy Streamlit Haowenchiang
-emoji: 🦀
-colorFrom: purple
-colorTo: yellow
-sdk: streamlit
-sdk_version: 1.10.0
-app_file: app.py
-pinned: false
----
+[![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/howard-haowen/spacy-streamlit/HEAD)
 
-
+This repo hosts a [Streamlit Web APP](https://share.streamlit.io/howard-haowen/spacy-streamlit/app.py) that leverages the power of [spaCy](https://spacy.io/) to assist language learning. It currently supports the following languages:
+
+- Mandarin
+- English
+- Japanese
app.py
ADDED
@@ -0,0 +1,14 @@
+import streamlit as st
+
+st.markdown("""
+
+# AI模型輔助語言學習
+
+## Language Learning Assisted by AI Models
+
+- 開啟左側選單可選擇語言,目前支援華語、日語和英語。
+- Select a language from the sidebar. Supported languages include Mandarin, Japanese, and English.
+- 選單自動隱藏時,點選左上角 > 符號以開啟選單。
+- If the sidebar is hidden, click on the > symbol in the upper left corner to open it.
+
+""")
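
Run locally, this entry point starts with `streamlit run app.py`; on the Space itself the Streamlit SDK launches it automatically.
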
jieba/__init__.py
ADDED
@@ -0,0 +1,581 @@
+from __future__ import absolute_import, unicode_literals
+__version__ = '0.38'
+__license__ = 'MIT'
+
+import re
+import os
+import sys
+import time
+import logging
+import marshal
+import tempfile
+import threading
+from math import log
+from hashlib import md5
+from ._compat import *
+from . import finalseg
+
+if os.name == 'nt':
+    from shutil import move as _replace_file
+else:
+    _replace_file = os.rename
+
+_get_abs_path = lambda path: os.path.normpath(os.path.join(os.getcwd(), path))
+
+DEFAULT_DICT = None
+DEFAULT_DICT_NAME = "dict.txt"
+
+log_console = logging.StreamHandler(sys.stderr)
+default_logger = logging.getLogger(__name__)
+default_logger.setLevel(logging.DEBUG)
+default_logger.addHandler(log_console)
+
+DICT_WRITING = {}
+
+pool = None
+
+re_userdict = re.compile('^(.+?)( [0-9]+)?( [a-z]+)?$', re.U)
+
+re_eng = re.compile('[a-zA-Z0-9]', re.U)
+
+# \u4E00-\u9FD5a-zA-Z0-9+#&\._ : All non-space characters. Will be handled with re_han
+# \r\n|\s : whitespace characters. Will not be handled.
+re_han_default = re.compile("([\u4E00-\u9FD5a-zA-Z0-9+#&\._]+)", re.U)
+re_skip_default = re.compile("(\r\n|\s)", re.U)
+re_han_cut_all = re.compile("([\u4E00-\u9FD5]+)", re.U)
+re_skip_cut_all = re.compile("[^a-zA-Z0-9+#\n]", re.U)
+
+def setLogLevel(log_level):
+    global logger
+    default_logger.setLevel(log_level)
+
+class Tokenizer(object):
+
+    def __init__(self, dictionary=DEFAULT_DICT):
+        self.lock = threading.RLock()
+        if dictionary == DEFAULT_DICT:
+            self.dictionary = dictionary
+        else:
+            self.dictionary = _get_abs_path(dictionary)
+        self.FREQ = {}
+        self.total = 0
+        self.user_word_tag_tab = {}
+        self.initialized = False
+        self.tmp_dir = None
+        self.cache_file = None
+
+    def __repr__(self):
+        return '<Tokenizer dictionary=%r>' % self.dictionary
+
+    def gen_pfdict(self, f):
+        lfreq = {}
+        ltotal = 0
+        f_name = resolve_filename(f)
+        for lineno, line in enumerate(f, 1):
+            try:
+                line = line.strip().decode('utf-8')
+                word, freq = line.split(' ')[:2]
+                freq = int(freq)
+                lfreq[word] = freq
+                ltotal += freq
+                for ch in xrange(len(word)):
+                    wfrag = word[:ch + 1]
+                    if wfrag not in lfreq:
+                        lfreq[wfrag] = 0
+            except ValueError:
+                raise ValueError(
+                    'invalid dictionary entry in %s at Line %s: %s' % (f_name, lineno, line))
+        f.close()
+        return lfreq, ltotal
+
+    def initialize(self, dictionary=None):
+        if dictionary:
+            abs_path = _get_abs_path(dictionary)
+            if self.dictionary == abs_path and self.initialized:
+                return
+            else:
+                self.dictionary = abs_path
+                self.initialized = False
+        else:
+            abs_path = self.dictionary
+
+        with self.lock:
+            try:
+                with DICT_WRITING[abs_path]:
+                    pass
+            except KeyError:
+                pass
+            if self.initialized:
+                return
+
+            default_logger.debug("Building prefix dict from %s ..." % (abs_path or 'the default dictionary'))
+            t1 = time.time()
+            if self.cache_file:
+                cache_file = self.cache_file
+            # default dictionary
+            elif abs_path == DEFAULT_DICT:
+                cache_file = "jieba.cache"
+            # custom dictionary
+            else:
+                cache_file = "jieba.u%s.cache" % md5(
+                    abs_path.encode('utf-8', 'replace')).hexdigest()
+            cache_file = os.path.join(
+                self.tmp_dir or tempfile.gettempdir(), cache_file)
+            # prevent absolute path in self.cache_file
+            tmpdir = os.path.dirname(cache_file)
+
+            load_from_cache_fail = True
+            if os.path.isfile(cache_file) and (abs_path == DEFAULT_DICT or
+                    os.path.getmtime(cache_file) > os.path.getmtime(abs_path)):
+                default_logger.debug(
+                    "Loading model from cache %s" % cache_file)
+                try:
+                    with open(cache_file, 'rb') as cf:
+                        self.FREQ, self.total = marshal.load(cf)
+                    load_from_cache_fail = False
+                except Exception:
+                    load_from_cache_fail = True
+
+            if load_from_cache_fail:
+                wlock = DICT_WRITING.get(abs_path, threading.RLock())
+                DICT_WRITING[abs_path] = wlock
+                with wlock:
+                    self.FREQ, self.total = self.gen_pfdict(self.get_dict_file())
+                    default_logger.debug(
+                        "Dumping model to file cache %s" % cache_file)
+                    try:
+                        # prevent moving across different filesystems
+                        fd, fpath = tempfile.mkstemp(dir=tmpdir)
+                        with os.fdopen(fd, 'wb') as temp_cache_file:
+                            marshal.dump(
+                                (self.FREQ, self.total), temp_cache_file)
+                        _replace_file(fpath, cache_file)
+                    except Exception:
+                        default_logger.exception("Dump cache file failed.")
+
+                try:
+                    del DICT_WRITING[abs_path]
+                except KeyError:
+                    pass
+
+            self.initialized = True
+            default_logger.debug(
+                "Loading model cost %.3f seconds." % (time.time() - t1))
+            default_logger.debug("Prefix dict has been built successfully.")
+
+    def check_initialized(self):
+        if not self.initialized:
+            self.initialize()
+
+    def calc(self, sentence, DAG, route):
+        N = len(sentence)
+        route[N] = (0, 0)
+        logtotal = log(self.total)
+        for idx in xrange(N - 1, -1, -1):
+            route[idx] = max((log(self.FREQ.get(sentence[idx:x + 1]) or 1) -
+                              logtotal + route[x + 1][0], x) for x in DAG[idx])
+
+    def get_DAG(self, sentence):
+        self.check_initialized()
+        DAG = {}
+        N = len(sentence)
+        for k in xrange(N):
+            tmplist = []
+            i = k
+            frag = sentence[k]
+            while i < N and frag in self.FREQ:
+                if self.FREQ[frag]:
+                    tmplist.append(i)
+                i += 1
+                frag = sentence[k:i + 1]
+            if not tmplist:
+                tmplist.append(k)
+            DAG[k] = tmplist
+        return DAG
+
+    def __cut_all(self, sentence):
+        dag = self.get_DAG(sentence)
+        old_j = -1
+        for k, L in iteritems(dag):
+            if len(L) == 1 and k > old_j:
+                yield sentence[k:L[0] + 1]
+                old_j = L[0]
+            else:
+                for j in L:
+                    if j > k:
+                        yield sentence[k:j + 1]
+                        old_j = j
+
+    def __cut_DAG_NO_HMM(self, sentence):
+        DAG = self.get_DAG(sentence)
+        route = {}
+        self.calc(sentence, DAG, route)
+        x = 0
+        N = len(sentence)
+        buf = ''
+        while x < N:
+            y = route[x][1] + 1
+            l_word = sentence[x:y]
+            if re_eng.match(l_word) and len(l_word) == 1:
+                buf += l_word
+                x = y
+            else:
+                if buf:
+                    yield buf
+                    buf = ''
+                yield l_word
+                x = y
+        if buf:
+            yield buf
+            buf = ''
+
+    def __cut_DAG(self, sentence):
+        DAG = self.get_DAG(sentence)
+        route = {}
+        self.calc(sentence, DAG, route)
+        x = 0
+        buf = ''
+        N = len(sentence)
+        while x < N:
+            y = route[x][1] + 1
+            l_word = sentence[x:y]
+            if y - x == 1:
+                buf += l_word
+            else:
+                if buf:
+                    if len(buf) == 1:
+                        yield buf
+                        buf = ''
+                    else:
+                        if not self.FREQ.get(buf):
+                            recognized = finalseg.cut(buf)
+                            for t in recognized:
+                                yield t
+                        else:
+                            for elem in buf:
+                                yield elem
+                        buf = ''
+                yield l_word
+            x = y
+
+        if buf:
+            if len(buf) == 1:
+                yield buf
+            elif not self.FREQ.get(buf):
+                recognized = finalseg.cut(buf)
+                for t in recognized:
+                    yield t
+            else:
+                for elem in buf:
+                    yield elem
+
+    def cut(self, sentence, cut_all=False, HMM=True):
+        '''
+        The main function that segments an entire sentence that contains
+        Chinese characters into separated words.
+        Parameter:
+            - sentence: The str(unicode) to be segmented.
+            - cut_all: Model type. True for full pattern, False for accurate pattern.
+            - HMM: Whether to use the Hidden Markov Model.
+        '''
+        sentence = strdecode(sentence)
+
+        if cut_all:
+            re_han = re_han_cut_all
+            re_skip = re_skip_cut_all
+        else:
+            re_han = re_han_default
+            re_skip = re_skip_default
+        if cut_all:
+            cut_block = self.__cut_all
+        elif HMM:
+            cut_block = self.__cut_DAG
+        else:
+            cut_block = self.__cut_DAG_NO_HMM
+        blocks = re_han.split(sentence)
+        for blk in blocks:
+            if not blk:
+                continue
+            if re_han.match(blk):
+                for word in cut_block(blk):
+                    yield word
+            else:
+                tmp = re_skip.split(blk)
+                for x in tmp:
+                    if re_skip.match(x):
+                        yield x
+                    elif not cut_all:
+                        for xx in x:
+                            yield xx
+                    else:
+                        yield x
+
+    def cut_for_search(self, sentence, HMM=True):
+        """
+        Finer segmentation for search engines.
+        """
+        words = self.cut(sentence, HMM=HMM)
+        for w in words:
+            if len(w) > 2:
+                for i in xrange(len(w) - 1):
+                    gram2 = w[i:i + 2]
+                    if self.FREQ.get(gram2):
+                        yield gram2
+            if len(w) > 3:
+                for i in xrange(len(w) - 2):
+                    gram3 = w[i:i + 3]
+                    if self.FREQ.get(gram3):
+                        yield gram3
+            yield w
+
+    def lcut(self, *args, **kwargs):
+        return list(self.cut(*args, **kwargs))
+
+    def lcut_for_search(self, *args, **kwargs):
+        return list(self.cut_for_search(*args, **kwargs))
+
+    _lcut = lcut
+    _lcut_for_search = lcut_for_search
+
+    def _lcut_no_hmm(self, sentence):
+        return self.lcut(sentence, False, False)
+
+    def _lcut_all(self, sentence):
+        return self.lcut(sentence, True)
+
+    def _lcut_for_search_no_hmm(self, sentence):
+        return self.lcut_for_search(sentence, False)
+
+    def get_dict_file(self):
+        if self.dictionary == DEFAULT_DICT:
+            return get_module_res(DEFAULT_DICT_NAME)
+        else:
+            return open(self.dictionary, 'rb')
+
+    def load_userdict(self, f):
+        '''
+        Load personalized dict to improve detect rate.
+        Parameter:
+            - f : A plain text file that contains words and their occurrences.
+                  Can be a file-like object, or the path of the dictionary file,
+                  whose encoding must be utf-8.
+        Structure of dict file:
+        word1 freq1 word_type1
+        word2 freq2 word_type2
+        ...
+        Word type may be ignored
+        '''
+        self.check_initialized()
+        if isinstance(f, string_types):
+            f_name = f
+            f = open(f, 'rb')
+        else:
+            f_name = resolve_filename(f)
+        for lineno, ln in enumerate(f, 1):
+            line = ln.strip()
+            if not isinstance(line, text_type):
+                try:
+                    line = line.decode('utf-8').lstrip('\ufeff')
+                except UnicodeDecodeError:
+                    raise ValueError('dictionary file %s must be utf-8' % f_name)
+            if not line:
+                continue
+            # match won't be None because there's at least one character
+            word, freq, tag = re_userdict.match(line).groups()
+            if freq is not None:
+                freq = freq.strip()
+            if tag is not None:
+                tag = tag.strip()
+            self.add_word(word, freq, tag)
+
+    def add_word(self, word, freq=None, tag=None):
+        """
+        Add a word to dictionary.
+        freq and tag can be omitted, freq defaults to be a calculated value
+        that ensures the word can be cut out.
+        """
+        self.check_initialized()
+        word = strdecode(word)
+        freq = int(freq) if freq is not None else self.suggest_freq(word, False)
+        self.FREQ[word] = freq
+        self.total += freq
+        if tag:
+            self.user_word_tag_tab[word] = tag
+        for ch in xrange(len(word)):
+            wfrag = word[:ch + 1]
+            if wfrag not in self.FREQ:
+                self.FREQ[wfrag] = 0
+
+    def del_word(self, word):
+        """
+        Convenient function for deleting a word.
+        """
+        self.add_word(word, 0)
+
+    def suggest_freq(self, segment, tune=False):
+        """
+        Suggest word frequency to force the characters in a word to be
+        joined or split.
+        Parameter:
+            - segment : The segments that the word is expected to be cut into,
+                        If the word should be treated as a whole, use a str.
+            - tune : If True, tune the word frequency.
+        Note that HMM may affect the final result. If the result doesn't change,
+        set HMM=False.
+        """
+        self.check_initialized()
+        ftotal = float(self.total)
+        freq = 1
+        if isinstance(segment, string_types):
+            word = segment
+            for seg in self.cut(word, HMM=False):
+                freq *= self.FREQ.get(seg, 1) / ftotal
+            freq = max(int(freq * self.total) + 1, self.FREQ.get(word, 1))
+        else:
+            segment = tuple(map(strdecode, segment))
+            word = ''.join(segment)
+            for seg in segment:
+                freq *= self.FREQ.get(seg, 1) / ftotal
+            freq = min(int(freq * self.total), self.FREQ.get(word, 0))
+        if tune:
+            add_word(word, freq)
+        return freq
+
+    def tokenize(self, unicode_sentence, mode="default", HMM=True):
+        """
+        Tokenize a sentence and yields tuples of (word, start, end)
+        Parameter:
+            - sentence: the str(unicode) to be segmented.
+            - mode: "default" or "search", "search" is for finer segmentation.
+            - HMM: whether to use the Hidden Markov Model.
+        """
+        if not isinstance(unicode_sentence, text_type):
+            raise ValueError("jieba: the input parameter should be unicode.")
+        start = 0
+        if mode == 'default':
+            for w in self.cut(unicode_sentence, HMM=HMM):
+                width = len(w)
+                yield (w, start, start + width)
+                start += width
+        else:
+            for w in self.cut(unicode_sentence, HMM=HMM):
+                width = len(w)
+                if len(w) > 2:
+                    for i in xrange(len(w) - 1):
+                        gram2 = w[i:i + 2]
+                        if self.FREQ.get(gram2):
+                            yield (gram2, start + i, start + i + 2)
+                if len(w) > 3:
+                    for i in xrange(len(w) - 2):
+                        gram3 = w[i:i + 3]
+                        if self.FREQ.get(gram3):
+                            yield (gram3, start + i, start + i + 3)
+                yield (w, start, start + width)
+                start += width
+
+    def set_dictionary(self, dictionary_path):
+        with self.lock:
+            abs_path = _get_abs_path(dictionary_path)
+            if not os.path.isfile(abs_path):
+                raise Exception("jieba: file does not exist: " + abs_path)
+            self.dictionary = abs_path
+            self.initialized = False
+
+
+# default Tokenizer instance
+
+dt = Tokenizer()
+
+# global functions
+
+get_FREQ = lambda k, d=None: dt.FREQ.get(k, d)
+add_word = dt.add_word
+calc = dt.calc
+cut = dt.cut
+lcut = dt.lcut
+cut_for_search = dt.cut_for_search
+lcut_for_search = dt.lcut_for_search
+del_word = dt.del_word
+get_DAG = dt.get_DAG
+get_dict_file = dt.get_dict_file
+initialize = dt.initialize
+load_userdict = dt.load_userdict
+set_dictionary = dt.set_dictionary
+suggest_freq = dt.suggest_freq
+tokenize = dt.tokenize
+user_word_tag_tab = dt.user_word_tag_tab
+
+
+def _lcut_all(s):
+    return dt._lcut_all(s)
+
+
+def _lcut(s):
+    return dt._lcut(s)
+
+
+def _lcut_no_hmm(s):
+    return dt._lcut_no_hmm(s)
+
+
+def _lcut_for_search(s):
+    return dt._lcut_for_search(s)
+
+
+def _lcut_for_search_no_hmm(s):
+    return dt._lcut_for_search_no_hmm(s)
+
+
+def _pcut(sentence, cut_all=False, HMM=True):
+    parts = strdecode(sentence).splitlines(True)
+    if cut_all:
+        result = pool.map(_lcut_all, parts)
+    elif HMM:
+        result = pool.map(_lcut, parts)
+    else:
+        result = pool.map(_lcut_no_hmm, parts)
+    for r in result:
+        for w in r:
+            yield w
+
+
+def _pcut_for_search(sentence, HMM=True):
+    parts = strdecode(sentence).splitlines(True)
+    if HMM:
+        result = pool.map(_lcut_for_search, parts)
+    else:
+        result = pool.map(_lcut_for_search_no_hmm, parts)
+    for r in result:
+        for w in r:
+            yield w
+
+
+def enable_parallel(processnum=None):
+    """
+    Change the module's `cut` and `cut_for_search` functions to the
+    parallel version.
+    Note that this only works using dt, custom Tokenizer
+    instances are not supported.
+    """
+    global pool, dt, cut, cut_for_search
+    from multiprocessing import cpu_count
+    if os.name == 'nt':
+        raise NotImplementedError(
+            "jieba: parallel mode only supports posix system")
+    else:
+        from multiprocessing import Pool
+        dt.check_initialized()
+        if processnum is None:
+            processnum = cpu_count()
+        pool = Pool(processnum)
+        cut = _pcut
+        cut_for_search = _pcut_for_search
+
+
+def disable_parallel():
+    global pool, dt, cut, cut_for_search
+    if pool:
+        pool.close()
+        pool = None
+    cut = dt.cut
+    cut_for_search = dt.cut_for_search
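
A minimal usage sketch for the tokenizer module above (illustrative only: the sample sentence is jieba's canonical demo, and the output shown assumes the bundled default dictionary):

import jieba

# accurate mode: shortest path over the word DAG, HMM for unseen words
print(jieba.lcut("我来到北京清华大学"))
# -> ['我', '来到', '北京', '清华大学']

# full mode enumerates every dictionary word found in the span
print(jieba.lcut("我来到北京清华大学", cut_all=True))

# module-level helpers delegate to the default Tokenizer instance `dt`
jieba.add_word("清华大学")                   # pin a word in the dictionary
jieba.suggest_freq(("中", "将"), tune=True)  # tune frequencies to force a split
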
jieba/__main__.py
ADDED
@@ -0,0 +1,61 @@
+"""Jieba command line interface."""
+import sys
+import jieba
+from argparse import ArgumentParser
+from ._compat import *
+
+parser = ArgumentParser(usage="%s -m jieba [options] filename" % sys.executable, description="Jieba command line interface.", epilog="If no filename specified, use STDIN instead.")
+parser.add_argument("-d", "--delimiter", metavar="DELIM", default=' / ',
+                    nargs='?', const=' ',
+                    help="use DELIM instead of ' / ' for word delimiter; or a space if it is used without DELIM")
+parser.add_argument("-p", "--pos", metavar="DELIM", nargs='?', const='_',
+                    help="enable POS tagging; if DELIM is specified, use DELIM instead of '_' for POS delimiter")
+parser.add_argument("-D", "--dict", help="use DICT as dictionary")
+parser.add_argument("-u", "--user-dict",
+                    help="use USER_DICT together with the default dictionary or DICT (if specified)")
+parser.add_argument("-a", "--cut-all",
+                    action="store_true", dest="cutall", default=False,
+                    help="full pattern cutting (ignored with POS tagging)")
+parser.add_argument("-n", "--no-hmm", dest="hmm", action="store_false",
+                    default=True, help="don't use the Hidden Markov Model")
+parser.add_argument("-q", "--quiet", action="store_true", default=False,
+                    help="don't print loading messages to stderr")
+parser.add_argument("-V", '--version', action='version',
+                    version="Jieba " + jieba.__version__)
+parser.add_argument("filename", nargs='?', help="input file")
+
+args = parser.parse_args()
+
+if args.quiet:
+    jieba.setLogLevel(60)
+if args.pos:
+    import jieba.posseg
+    posdelim = args.pos
+    def cutfunc(sentence, _, HMM=True):
+        for w, f in jieba.posseg.cut(sentence, HMM):
+            yield w + posdelim + f
+else:
+    cutfunc = jieba.cut
+
+delim = text_type(args.delimiter)
+cutall = args.cutall
+hmm = args.hmm
+fp = open(args.filename, 'r') if args.filename else sys.stdin
+
+if args.dict:
+    jieba.initialize(args.dict)
+else:
+    jieba.initialize()
+if args.user_dict:
+    jieba.load_userdict(args.user_dict)
+
+ln = fp.readline()
+while ln:
+    l = ln.rstrip('\r\n')
+    result = delim.join(cutfunc(ln.rstrip('\r\n'), cutall, hmm))
+    if PY2:
+        result = result.encode(default_encoding)
+    print(result)
+    ln = fp.readline()
+
+fp.close()
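
With the package importable, the CLI above can be exercised as `python -m jieba -d ' ' somefile.txt`, or as `python -m jieba -p somefile.txt` for POS-tagged output; with no filename it reads STDIN (the file names here are placeholders).
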
jieba/_compat.py
ADDED
@@ -0,0 +1,46 @@
+# -*- coding: utf-8 -*-
+import os
+import sys
+
+try:
+    import pkg_resources
+    get_module_res = lambda *res: pkg_resources.resource_stream(__name__,
+                                                                os.path.join(*res))
+except ImportError:
+    get_module_res = lambda *res: open(os.path.normpath(os.path.join(
+        os.getcwd(), os.path.dirname(__file__), *res)), 'rb')
+
+PY2 = sys.version_info[0] == 2
+
+default_encoding = sys.getfilesystemencoding()
+
+if PY2:
+    text_type = unicode
+    string_types = (str, unicode)
+
+    iterkeys = lambda d: d.iterkeys()
+    itervalues = lambda d: d.itervalues()
+    iteritems = lambda d: d.iteritems()
+
+else:
+    text_type = str
+    string_types = (str,)
+    xrange = range
+
+    iterkeys = lambda d: iter(d.keys())
+    itervalues = lambda d: iter(d.values())
+    iteritems = lambda d: iter(d.items())
+
+def strdecode(sentence):
+    if not isinstance(sentence, text_type):
+        try:
+            sentence = sentence.decode('utf-8')
+        except UnicodeDecodeError:
+            sentence = sentence.decode('gbk', 'ignore')
+    return sentence
+
+def resolve_filename(f):
+    try:
+        return f.name
+    except AttributeError:
+        return repr(f)
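
A quick sketch of what this compatibility shim guarantees (byte values chosen for illustration):

import io
from jieba._compat import strdecode, resolve_filename

# bytes are decoded as UTF-8 first, falling back to GBK with errors ignored
assert strdecode(b"\xe4\xb8\xad\xe6\x96\x87") == "中文"
assert strdecode("already text") == "already text"  # text passes through unchanged

# resolve_filename prefers f.name and falls back to repr() for nameless objects
print(resolve_filename(io.BytesIO(b"")))
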
jieba/analyse/__init__.py
ADDED
@@ -0,0 +1,18 @@
+from __future__ import absolute_import
+from .tfidf import TFIDF
+from .textrank import TextRank
+try:
+    from .analyzer import ChineseAnalyzer
+except ImportError:
+    pass
+
+default_tfidf = TFIDF()
+default_textrank = TextRank()
+
+extract_tags = tfidf = default_tfidf.extract_tags
+set_idf_path = default_tfidf.set_idf_path
+textrank = default_textrank.extract_tags
+
+def set_stop_words(stop_words_path):
+    default_tfidf.set_stop_words(stop_words_path)
+    default_textrank.set_stop_words(stop_words_path)
jieba/analyse/analyzer.py
ADDED
@@ -0,0 +1,37 @@
+# encoding=utf-8
+from __future__ import unicode_literals
+from whoosh.analysis import RegexAnalyzer, LowercaseFilter, StopFilter, StemFilter
+from whoosh.analysis import Tokenizer, Token
+from whoosh.lang.porter import stem
+
+import jieba
+import re
+
+STOP_WORDS = frozenset(('a', 'an', 'and', 'are', 'as', 'at', 'be', 'by', 'can',
+                        'for', 'from', 'have', 'if', 'in', 'is', 'it', 'may',
+                        'not', 'of', 'on', 'or', 'tbd', 'that', 'the', 'this',
+                        'to', 'us', 'we', 'when', 'will', 'with', 'yet',
+                        'you', 'your', '的', '了', '和'))
+
+accepted_chars = re.compile(r"[\u4E00-\u9FD5]+")
+
+
+class ChineseTokenizer(Tokenizer):
+
+    def __call__(self, text, **kargs):
+        words = jieba.tokenize(text, mode="search")
+        token = Token()
+        for (w, start_pos, stop_pos) in words:
+            if not accepted_chars.match(w) and len(w) <= 1:
+                continue
+            token.original = token.text = w
+            token.pos = start_pos
+            token.startchar = start_pos
+            token.endchar = stop_pos
+            yield token
+
+
+def ChineseAnalyzer(stoplist=STOP_WORDS, minsize=1, stemfn=stem, cachesize=50000):
+    return (ChineseTokenizer() | LowercaseFilter() |
+            StopFilter(stoplist=stoplist, minsize=minsize) |
+            StemFilter(stemfn=stemfn, ignore=None, cachesize=cachesize))
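
A hedged sketch of how this Whoosh integration is typically used (requires the optional `whoosh` dependency; the sample text is made up):

from jieba.analyse import ChineseAnalyzer

analyzer = ChineseAnalyzer()
# a Whoosh analyzer is a callable pipeline that yields Token objects
for token in analyzer("我的好朋友是李明"):
    print(token.text)
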
jieba/analyse/idf.txt
ADDED
The diff for this file is too large to render.
jieba/analyse/textrank.py
ADDED
@@ -0,0 +1,110 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+from __future__ import absolute_import, unicode_literals
+import sys
+from operator import itemgetter
+from collections import defaultdict
+import jieba.posseg
+from .tfidf import KeywordExtractor
+from .._compat import *
+
+
+class UndirectWeightedGraph:
+    d = 0.85
+
+    def __init__(self):
+        self.graph = defaultdict(list)
+
+    def addEdge(self, start, end, weight):
+        # use a tuple (start, end, weight) instead of an Edge object
+        self.graph[start].append((start, end, weight))
+        self.graph[end].append((end, start, weight))
+
+    def rank(self):
+        ws = defaultdict(float)
+        outSum = defaultdict(float)
+
+        wsdef = 1.0 / (len(self.graph) or 1.0)
+        for n, out in self.graph.items():
+            ws[n] = wsdef
+            outSum[n] = sum((e[2] for e in out), 0.0)
+
+        # sort keys to build a stable iteration order
+        sorted_keys = sorted(self.graph.keys())
+        for x in xrange(10):  # 10 iters
+            for n in sorted_keys:
+                s = 0
+                for e in self.graph[n]:
+                    s += e[2] / outSum[e[1]] * ws[e[1]]
+                ws[n] = (1 - self.d) + self.d * s
+
+        (min_rank, max_rank) = (sys.float_info[0], sys.float_info[3])
+
+        for w in itervalues(ws):
+            if w < min_rank:
+                min_rank = w
+            if w > max_rank:
+                max_rank = w
+
+        for n, w in ws.items():
+            # to unify the weights, don't *100.
+            ws[n] = (w - min_rank / 10.0) / (max_rank - min_rank / 10.0)
+
+        return ws
+
+
+class TextRank(KeywordExtractor):
+
+    def __init__(self):
+        self.tokenizer = self.postokenizer = jieba.posseg.dt
+        self.stop_words = self.STOP_WORDS.copy()
+        self.pos_filt = frozenset(('ns', 'n', 'vn', 'v'))
+        self.span = 5
+
+    def pairfilter(self, wp):
+        return (wp.flag in self.pos_filt and len(wp.word.strip()) >= 2
+                and wp.word.lower() not in self.stop_words)
+
+    def textrank(self, sentence, topK=20, withWeight=False, allowPOS=('ns', 'n', 'vn', 'v'), withFlag=False):
+        """
+        Extract keywords from sentence using TextRank algorithm.
+        Parameter:
+            - topK: return how many top keywords. `None` for all possible words.
+            - withWeight: if True, return a list of (word, weight);
+                          if False, return a list of words.
+            - allowPOS: the allowed POS list eg. ['ns', 'n', 'vn', 'v'].
+                        if the POS of w is not in this list, it will be filtered.
+            - withFlag: if True, return a list of pair(word, weight) like posseg.cut
+                        if False, return a list of words
+        """
+        self.pos_filt = frozenset(allowPOS)
+        g = UndirectWeightedGraph()
+        cm = defaultdict(int)
+        words = tuple(self.tokenizer.cut(sentence))
+        for i, wp in enumerate(words):
+            if self.pairfilter(wp):
+                for j in xrange(i + 1, i + self.span):
+                    if j >= len(words):
+                        break
+                    if not self.pairfilter(words[j]):
+                        continue
+                    if allowPOS and withFlag:
+                        cm[(wp, words[j])] += 1
+                    else:
+                        cm[(wp.word, words[j].word)] += 1
+
+        for terms, w in cm.items():
+            g.addEdge(terms[0], terms[1], w)
+        nodes_rank = g.rank()
+        if withWeight:
+            tags = sorted(nodes_rank.items(), key=itemgetter(1), reverse=True)
+        else:
+            tags = sorted(nodes_rank, key=nodes_rank.__getitem__, reverse=True)
+
+        if topK:
+            return tags[:topK]
+        else:
+            return tags
+
+    extract_tags = textrank
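
Usage sketch for the TextRank extractor (sample text made up; scores depend on the POS tokenizer above):

import jieba.analyse

text = "线程是操作系统能够进行运算调度的最小单位。"
for keyword, weight in jieba.analyse.textrank(text, topK=5, withWeight=True):
    print(keyword, weight)
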
jieba/analyse/tfidf.py
ADDED
@@ -0,0 +1,116 @@
+# encoding=utf-8
+from __future__ import absolute_import
+import os
+import jieba
+import jieba.posseg
+from operator import itemgetter
+
+_get_module_path = lambda path: os.path.normpath(os.path.join(os.getcwd(),
+                                                 os.path.dirname(__file__), path))
+_get_abs_path = jieba._get_abs_path
+
+DEFAULT_IDF = _get_module_path("idf.txt")
+
+
+class KeywordExtractor(object):
+
+    STOP_WORDS = set((
+        "the", "of", "is", "and", "to", "in", "that", "we", "for", "an", "are",
+        "by", "be", "as", "on", "with", "can", "if", "from", "which", "you", "it",
+        "this", "then", "at", "have", "all", "not", "one", "has", "or", "that"
+    ))
+
+    def set_stop_words(self, stop_words_path):
+        abs_path = _get_abs_path(stop_words_path)
+        if not os.path.isfile(abs_path):
+            raise Exception("jieba: file does not exist: " + abs_path)
+        content = open(abs_path, 'rb').read().decode('utf-8')
+        for line in content.splitlines():
+            self.stop_words.add(line)
+
+    def extract_tags(self, *args, **kwargs):
+        raise NotImplementedError
+
+
+class IDFLoader(object):
+
+    def __init__(self, idf_path=None):
+        self.path = ""
+        self.idf_freq = {}
+        self.median_idf = 0.0
+        if idf_path:
+            self.set_new_path(idf_path)
+
+    def set_new_path(self, new_idf_path):
+        if self.path != new_idf_path:
+            self.path = new_idf_path
+            content = open(new_idf_path, 'rb').read().decode('utf-8')
+            self.idf_freq = {}
+            for line in content.splitlines():
+                word, freq = line.strip().split(' ')
+                self.idf_freq[word] = float(freq)
+            self.median_idf = sorted(
+                self.idf_freq.values())[len(self.idf_freq) // 2]
+
+    def get_idf(self):
+        return self.idf_freq, self.median_idf
+
+
+class TFIDF(KeywordExtractor):
+
+    def __init__(self, idf_path=None):
+        self.tokenizer = jieba.dt
+        self.postokenizer = jieba.posseg.dt
+        self.stop_words = self.STOP_WORDS.copy()
+        self.idf_loader = IDFLoader(idf_path or DEFAULT_IDF)
+        self.idf_freq, self.median_idf = self.idf_loader.get_idf()
+
+    def set_idf_path(self, idf_path):
+        new_abs_path = _get_abs_path(idf_path)
+        if not os.path.isfile(new_abs_path):
+            raise Exception("jieba: file does not exist: " + new_abs_path)
+        self.idf_loader.set_new_path(new_abs_path)
+        self.idf_freq, self.median_idf = self.idf_loader.get_idf()
+
+    def extract_tags(self, sentence, topK=20, withWeight=False, allowPOS=(), withFlag=False):
+        """
+        Extract keywords from sentence using TF-IDF algorithm.
+        Parameter:
+            - topK: return how many top keywords. `None` for all possible words.
+            - withWeight: if True, return a list of (word, weight);
+                          if False, return a list of words.
+            - allowPOS: the allowed POS list eg. ['ns', 'n', 'vn', 'v', 'nr'].
+                        if the POS of w is not in this list, it will be filtered.
+            - withFlag: only works when allowPOS is not empty.
+                        if True, return a list of pair(word, weight) like posseg.cut
+                        if False, return a list of words
+        """
+        if allowPOS:
+            allowPOS = frozenset(allowPOS)
+            words = self.postokenizer.cut(sentence)
+        else:
+            words = self.tokenizer.cut(sentence)
+        freq = {}
+        for w in words:
+            if allowPOS:
+                if w.flag not in allowPOS:
+                    continue
+                elif not withFlag:
+                    w = w.word
+            wc = w.word if allowPOS and withFlag else w
+            if len(wc.strip()) < 2 or wc.lower() in self.stop_words:
+                continue
+            freq[w] = freq.get(w, 0.0) + 1.0
+        total = sum(freq.values())
+        for k in freq:
+            kw = k.word if allowPOS and withFlag else k
+            freq[k] *= self.idf_freq.get(kw, self.median_idf) / total
+
+        if withWeight:
+            tags = sorted(freq.items(), key=itemgetter(1), reverse=True)
+        else:
+            tags = sorted(freq, key=freq.__getitem__, reverse=True)
+        if topK:
+            return tags[:topK]
+        else:
+            return tags
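
And the TF-IDF counterpart (same caveats: made-up sample text, bundled default idf.txt):

import jieba.analyse

text = "今天的天气很好,我们一起去动物园看动物吧。"
print(jieba.analyse.extract_tags(text, topK=5))
# restrict candidates to nouns; this switches to the POS tokenizer internally
print(jieba.analyse.extract_tags(text, topK=5, allowPOS=('n', 'ns')))
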
jieba/dict.txt
ADDED
The diff for this file is too large to render.
jieba/finalseg/__init__.py
ADDED
@@ -0,0 +1,92 @@
+from __future__ import absolute_import, unicode_literals
+import re
+import os
+import sys
+import pickle
+from .._compat import *
+
+MIN_FLOAT = -3.14e100
+
+PROB_START_P = "prob_start.p"
+PROB_TRANS_P = "prob_trans.p"
+PROB_EMIT_P = "prob_emit.p"
+
+
+PrevStatus = {
+    'B': 'ES',
+    'M': 'MB',
+    'S': 'SE',
+    'E': 'BM'
+}
+
+
+def load_model():
+    start_p = pickle.load(get_module_res("finalseg", PROB_START_P))
+    trans_p = pickle.load(get_module_res("finalseg", PROB_TRANS_P))
+    emit_p = pickle.load(get_module_res("finalseg", PROB_EMIT_P))
+    return start_p, trans_p, emit_p
+
+if sys.platform.startswith("java"):
+    start_P, trans_P, emit_P = load_model()
+else:
+    from .prob_start import P as start_P
+    from .prob_trans import P as trans_P
+    from .prob_emit import P as emit_P
+
+
+def viterbi(obs, states, start_p, trans_p, emit_p):
+    V = [{}]  # tabular
+    path = {}
+    for y in states:  # init
+        V[0][y] = start_p[y] + emit_p[y].get(obs[0], MIN_FLOAT)
+        path[y] = [y]
+    for t in xrange(1, len(obs)):
+        V.append({})
+        newpath = {}
+        for y in states:
+            em_p = emit_p[y].get(obs[t], MIN_FLOAT)
+            (prob, state) = max(
+                [(V[t - 1][y0] + trans_p[y0].get(y, MIN_FLOAT) + em_p, y0) for y0 in PrevStatus[y]])
+            V[t][y] = prob
+            newpath[y] = path[state] + [y]
+        path = newpath
+
+    (prob, state) = max((V[len(obs) - 1][y], y) for y in 'ES')
+
+    return (prob, path[state])
+
+
+def __cut(sentence):
+    global emit_P
+    prob, pos_list = viterbi(sentence, 'BMES', start_P, trans_P, emit_P)
+    begin, nexti = 0, 0
+    # print pos_list, sentence
+    for i, char in enumerate(sentence):
+        pos = pos_list[i]
+        if pos == 'B':
+            begin = i
+        elif pos == 'E':
+            yield sentence[begin:i + 1]
+            nexti = i + 1
+        elif pos == 'S':
+            yield char
+            nexti = i + 1
+    if nexti < len(sentence):
+        yield sentence[nexti:]
+
+re_han = re.compile("([\u4E00-\u9FD5]+)")
+re_skip = re.compile("(\d+\.\d+|[a-zA-Z0-9]+)")
+
+
+def cut(sentence):
+    sentence = strdecode(sentence)
+    blocks = re_han.split(sentence)
+    for blk in blocks:
+        if re_han.match(blk):
+            for word in __cut(blk):
+                yield word
+        else:
+            tmp = re_skip.split(blk)
+            for x in tmp:
+                if x:
+                    yield x
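
The HMM fallback can also be driven directly, bypassing the dictionary entirely; a small sketch:

from jieba import finalseg

# character-level B/M/E/S Viterbi decoding only, no prefix dictionary
print(list(finalseg.cut("小明硕士毕业于中国科学院计算所")))
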
jieba/finalseg/prob_emit.p
ADDED
The diff for this file is too large to render.
jieba/finalseg/prob_emit.py
ADDED
The diff for this file is too large to render.
jieba/finalseg/prob_start.p
ADDED
@@ -0,0 +1,14 @@
+(dp0
+S'E'
+p1
+F-3.14e+100
+sS'S'
+p2
+F-1.0490863400100874
+sS'B'
+p3
+F-0.4311793320941878
+sS'M'
+p4
+F-3.14e+100
+s.
jieba/finalseg/prob_start.py
ADDED
@@ -0,0 +1,4 @@
+P={'B': -0.4311793320941878,
+ 'E': -3.14e+100,
+ 'M': -3.14e+100,
+ 'S': -1.0490863400100874}
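
These values are natural-log probabilities, with -3.14e+100 standing in for log(0): a word can start only in state B (first character of a multi-character word) or S (a single-character word), never mid-word (M) or word-final (E).
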
jieba/finalseg/prob_trans.p
ADDED
@@ -0,0 +1,30 @@
+(dp0
+S'M'
+p1
+(dp2
+g1
+F-1.776721924369053
+sS'E'
+p3
+F-0.18535639277522836
+ssS'S'
+p4
+(dp5
+g4
+F-0.8149794471455989
+sS'B'
+p6
+F-0.5845590441999979
+ssg6
+(dp7
+g1
+F-1.9405006828418647
+sg3
+F-0.15505510933264552
+ssg3
+(dp8
+g4
+F-1.0069624262712982
+sg6
+F-0.4546453789910586
+ss.
jieba/finalseg/prob_trans.py
ADDED
@@ -0,0 +1,4 @@
+P={'B': {'E': -0.15505510933264552, 'M': -1.9405006828418647},
+ 'E': {'B': -0.4546453789910586, 'S': -1.0069624262712982},
+ 'M': {'E': -0.18535639277522836, 'M': -1.776721924369053},
+ 'S': {'B': -0.5845590441999979, 'S': -0.8149794471455989}}
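
The `.p` pickle (loaded under Jython) and the `.py` literal (imported under CPython) encode the same table; a quick consistency check, assuming the repo layout above:

import pickle

from jieba.finalseg.prob_trans import P as literal_table

with open("jieba/finalseg/prob_trans.p", "rb") as f:
    pickled_table = pickle.load(f)

assert pickled_table == literal_table  # identical log-probabilities either way
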
jieba/posseg/__init__.py
ADDED
@@ -0,0 +1,294 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from __future__ import absolute_import, unicode_literals
|
2 |
+
import os
|
3 |
+
import re
|
4 |
+
import sys
|
5 |
+
import jieba
|
6 |
+
import pickle
|
7 |
+
from .._compat import *
|
8 |
+
from .viterbi import viterbi
|
9 |
+
|
10 |
+
PROB_START_P = "prob_start.p"
|
11 |
+
PROB_TRANS_P = "prob_trans.p"
|
12 |
+
PROB_EMIT_P = "prob_emit.p"
|
13 |
+
CHAR_STATE_TAB_P = "char_state_tab.p"
|
14 |
+
|
15 |
+
re_han_detail = re.compile("([\u4E00-\u9FD5]+)")
|
16 |
+
re_skip_detail = re.compile("([\.0-9]+|[a-zA-Z0-9]+)")
|
17 |
+
re_han_internal = re.compile("([\u4E00-\u9FD5a-zA-Z0-9+#&\._]+)")
|
18 |
+
re_skip_internal = re.compile("(\r\n|\s)")
|
19 |
+
|
20 |
+
re_eng = re.compile("[a-zA-Z0-9]+")
|
21 |
+
re_num = re.compile("[\.0-9]+")
|
22 |
+
|
23 |
+
re_eng1 = re.compile('^[a-zA-Z0-9]$', re.U)
|
24 |
+
|
25 |
+
|
26 |
+
def load_model():
|
27 |
+
# For Jython
|
28 |
+
start_p = pickle.load(get_module_res("posseg", PROB_START_P))
|
29 |
+
trans_p = pickle.load(get_module_res("posseg", PROB_TRANS_P))
|
30 |
+
emit_p = pickle.load(get_module_res("posseg", PROB_EMIT_P))
|
31 |
+
state = pickle.load(get_module_res("posseg", CHAR_STATE_TAB_P))
|
32 |
+
return state, start_p, trans_p, emit_p
|
33 |
+
|
34 |
+
|
35 |
+
if sys.platform.startswith("java"):
|
36 |
+
char_state_tab_P, start_P, trans_P, emit_P = load_model()
|
37 |
+
else:
|
38 |
+
from .char_state_tab import P as char_state_tab_P
|
39 |
+
from .prob_start import P as start_P
|
40 |
+
from .prob_trans import P as trans_P
|
41 |
+
from .prob_emit import P as emit_P
|
42 |
+
|
43 |
+
|
44 |
+
class pair(object):
|
45 |
+
|
46 |
+
def __init__(self, word, flag):
|
47 |
+
self.word = word
|
48 |
+
self.flag = flag
|
49 |
+
|
50 |
+
def __unicode__(self):
|
51 |
+
return '%s/%s' % (self.word, self.flag)
|
52 |
+
|
53 |
+
def __repr__(self):
|
54 |
+
return 'pair(%r, %r)' % (self.word, self.flag)
|
55 |
+
|
56 |
+
def __str__(self):
|
57 |
+
if PY2:
|
58 |
+
return self.__unicode__().encode(default_encoding)
|
59 |
+
else:
|
60 |
+
return self.__unicode__()
|
61 |
+
|
62 |
+
def __iter__(self):
|
63 |
+
return iter((self.word, self.flag))
|
64 |
+
|
65 |
+
def __lt__(self, other):
|
66 |
+
return self.word < other.word
|
67 |
+
|
68 |
+
def __eq__(self, other):
|
69 |
+
return isinstance(other, pair) and self.word == other.word and self.flag == other.flag
|
70 |
+
|
71 |
+
def __hash__(self):
|
72 |
+
return hash(self.word)
|
73 |
+
|
74 |
+
def encode(self, arg):
|
75 |
+
return self.__unicode__().encode(arg)
|
76 |
+
|
77 |
+
|
78 |
+
class POSTokenizer(object):
|
79 |
+
|
80 |
+
def __init__(self, tokenizer=None):
|
81 |
+
self.tokenizer = tokenizer or jieba.Tokenizer()
|
82 |
+
self.load_word_tag(self.tokenizer.get_dict_file())
|
83 |
+
|
84 |
+
def __repr__(self):
|
85 |
+
return '<POSTokenizer tokenizer=%r>' % self.tokenizer
|
86 |
+
|
87 |
+
def __getattr__(self, name):
|
88 |
+
if name in ('cut_for_search', 'lcut_for_search', 'tokenize'):
|
89 |
+
# may be possible?
|
90 |
+
raise NotImplementedError
|
91 |
+
return getattr(self.tokenizer, name)
|
92 |
+
|
93 |
+
def initialize(self, dictionary=None):
|
94 |
+
self.tokenizer.initialize(dictionary)
|
95 |
+
self.load_word_tag(self.tokenizer.get_dict_file())
|
96 |
+
|
97 |
+
def load_word_tag(self, f):
|
98 |
+
self.word_tag_tab = {}
|
99 |
+
f_name = resolve_filename(f)
|
100 |
+
for lineno, line in enumerate(f, 1):
|
101 |
+
try:
|
102 |
+
line = line.strip().decode("utf-8")
|
103 |
+
if not line:
|
104 |
+
continue
|
105 |
+
word, _, tag = line.split(" ")
|
106 |
+
self.word_tag_tab[word] = tag
|
107 |
+
except Exception:
|
108 |
+
raise ValueError(
|
109 |
+
'invalid POS dictionary entry in %s at Line %s: %s' % (f_name, lineno, line))
|
110 |
+
f.close()
|
111 |
+
|
112 |
+
def makesure_userdict_loaded(self):
|
113 |
+
if self.tokenizer.user_word_tag_tab:
|
114 |
+
self.word_tag_tab.update(self.tokenizer.user_word_tag_tab)
|
115 |
+
self.tokenizer.user_word_tag_tab = {}
|
116 |
+
|
117 |
+
def __cut(self, sentence):
|
118 |
+
prob, pos_list = viterbi(
|
119 |
+
sentence, char_state_tab_P, start_P, trans_P, emit_P)
|
120 |
+
begin, nexti = 0, 0
|
121 |
+
|
122 |
+
for i, char in enumerate(sentence):
|
123 |
+
pos = pos_list[i][0]
|
124 |
+
if pos == 'B':
|
125 |
+
begin = i
|
126 |
+
elif pos == 'E':
|
127 |
+
yield pair(sentence[begin:i + 1], pos_list[i][1])
|
128 |
+
nexti = i + 1
|
129 |
+
elif pos == 'S':
|
130 |
+
yield pair(char, pos_list[i][1])
|
131 |
+
nexti = i + 1
|
132 |
+
if nexti < len(sentence):
|
133 |
+
yield pair(sentence[nexti:], pos_list[nexti][1])
|
134 |
+
|
135 |
+
def __cut_detail(self, sentence):
|
136 |
+
blocks = re_han_detail.split(sentence)
|
137 |
+
for blk in blocks:
|
138 |
+
if re_han_detail.match(blk):
|
139 |
+
for word in self.__cut(blk):
|
140 |
+
yield word
|
141 |
+
else:
|
142 |
+
tmp = re_skip_detail.split(blk)
|
143 |
+
for x in tmp:
|
144 |
+
if x:
|
145 |
+
if re_num.match(x):
|
146 |
+
yield pair(x, 'm')
|
147 |
+
elif re_eng.match(x):
|
148 |
+
yield pair(x, 'eng')
|
149 |
+
else:
|
150 |
+
yield pair(x, 'x')
|
151 |
+
|
152 |
+
def __cut_DAG_NO_HMM(self, sentence):
|
153 |
+
DAG = self.tokenizer.get_DAG(sentence)
|
154 |
+
route = {}
|
155 |
+
self.tokenizer.calc(sentence, DAG, route)
|
156 |
+
x = 0
|
157 |
+
N = len(sentence)
|
158 |
+
buf = ''
|
159 |
+
while x < N:
|
160 |
+
            y = route[x][1] + 1
            l_word = sentence[x:y]
            if re_eng1.match(l_word):
                buf += l_word
                x = y
            else:
                if buf:
                    yield pair(buf, 'eng')
                    buf = ''
                yield pair(l_word, self.word_tag_tab.get(l_word, 'x'))
                x = y
        if buf:
            yield pair(buf, 'eng')
            buf = ''

    def __cut_DAG(self, sentence):
        # DAG-based segmentation; runs of out-of-vocabulary single
        # characters are re-segmented with the HMM in __cut_detail.
        DAG = self.tokenizer.get_DAG(sentence)
        route = {}

        self.tokenizer.calc(sentence, DAG, route)

        x = 0
        buf = ''
        N = len(sentence)
        while x < N:
            y = route[x][1] + 1
            l_word = sentence[x:y]
            if y - x == 1:
                buf += l_word
            else:
                if buf:
                    if len(buf) == 1:
                        yield pair(buf, self.word_tag_tab.get(buf, 'x'))
                    elif not self.tokenizer.FREQ.get(buf):
                        recognized = self.__cut_detail(buf)
                        for t in recognized:
                            yield t
                    else:
                        for elem in buf:
                            yield pair(elem, self.word_tag_tab.get(elem, 'x'))
                    buf = ''
                yield pair(l_word, self.word_tag_tab.get(l_word, 'x'))
                x = y

        if buf:
            if len(buf) == 1:
                yield pair(buf, self.word_tag_tab.get(buf, 'x'))
            elif not self.tokenizer.FREQ.get(buf):
                recognized = self.__cut_detail(buf)
                for t in recognized:
                    yield t
            else:
                for elem in buf:
                    yield pair(elem, self.word_tag_tab.get(elem, 'x'))

    def __cut_internal(self, sentence, HMM=True):
        # Split the input into han and non-han blocks; non-han runs are
        # tagged 'm' (numeral), 'eng' (Latin) or 'x' (other).
        self.makesure_userdict_loaded()
        sentence = strdecode(sentence)
        blocks = re_han_internal.split(sentence)
        if HMM:
            cut_blk = self.__cut_DAG
        else:
            cut_blk = self.__cut_DAG_NO_HMM

        for blk in blocks:
            if re_han_internal.match(blk):
                for word in cut_blk(blk):
                    yield word
            else:
                tmp = re_skip_internal.split(blk)
                for x in tmp:
                    if re_skip_internal.match(x):
                        yield pair(x, 'x')
                    else:
                        for xx in x:
                            if re_num.match(xx):
                                yield pair(xx, 'm')
                            elif re_eng.match(x):
                                yield pair(xx, 'eng')
                            else:
                                yield pair(xx, 'x')

    def _lcut_internal(self, sentence):
        return list(self.__cut_internal(sentence))

    def _lcut_internal_no_hmm(self, sentence):
        return list(self.__cut_internal(sentence, False))

    def cut(self, sentence, HMM=True):
        for w in self.__cut_internal(sentence, HMM=HMM):
            yield w

    def lcut(self, *args, **kwargs):
        return list(self.cut(*args, **kwargs))


# default Tokenizer instance

dt = POSTokenizer(jieba.dt)

# global functions

initialize = dt.initialize


def _lcut_internal(s):
    return dt._lcut_internal(s)


def _lcut_internal_no_hmm(s):
    return dt._lcut_internal_no_hmm(s)


def cut(sentence, HMM=True):
    """
    Global `cut` function that supports parallel processing.

    Note that this only works with the default POSTokenizer `dt`;
    custom POSTokenizer instances are not supported.
    """
    global dt
    if jieba.pool is None:
        for w in dt.cut(sentence, HMM=HMM):
            yield w
    else:
        parts = strdecode(sentence).splitlines(True)
        if HMM:
            result = jieba.pool.map(_lcut_internal, parts)
        else:
            result = jieba.pool.map(_lcut_internal_no_hmm, parts)
        for r in result:
            for w in r:
                yield w


def lcut(sentence, HMM=True):
    return list(cut(sentence, HMM))
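For orientation, a minimal usage sketch of the module above, assuming the jieba package vendored in this commit is importable. `posseg.cut` yields `pair` objects carrying `.word` and `.flag` (the POS tag), and the global `cut` only routes through `jieba.pool` after `jieba.enable_parallel()` has been called.

import jieba.posseg as pseg

# cut() returns a generator of pair(word, flag) objects
for p in pseg.cut("我愛自然語言處理"):
    print(p.word, p.flag)

# lcut() is simply list(cut(...)); HMM=False uses __cut_DAG_NO_HMM instead
print(pseg.lcut("我愛自然語言處理", HMM=False))

# jieba.enable_parallel(4)  # would set jieba.pool so cut() maps over lines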
jieba/posseg/char_state_tab.p
ADDED
The diff for this file is too large to render.
See raw diff
jieba/posseg/char_state_tab.py
ADDED
The diff for this file is too large to render.
See raw diff
jieba/posseg/prob_emit.p
ADDED
The diff for this file is too large to render.
See raw diff
jieba/posseg/prob_emit.py
ADDED
The diff for this file is too large to render.
See raw diff
jieba/posseg/prob_start.p
ADDED
The diff for this file is too large to render.
See raw diff
jieba/posseg/prob_start.py
ADDED
@@ -0,0 +1,256 @@
P = {('B', 'a'): -4.762305214596967,
     ('B', 'ad'): -6.680066036784177,
     ('B', 'ag'): -3.14e+100,
     ('B', 'an'): -8.697083223018778,
     ('B', 'b'): -5.018374362109218,
     ('B', 'bg'): -3.14e+100,
     ('B', 'c'): -3.423880184954888,
     ('B', 'd'): -3.9750475297585357,
     ('B', 'df'): -8.888974230828882,
     ('B', 'dg'): -3.14e+100,
     ('B', 'e'): -8.563551830394255,
     ('B', 'en'): -3.14e+100,
     ('B', 'f'): -5.491630418482717,
     ('B', 'g'): -3.14e+100,
     ('B', 'h'): -13.533365129970255,
     ('B', 'i'): -6.1157847275557105,
     ('B', 'in'): -3.14e+100,
     ('B', 'j'): -5.0576191284681915,
     ('B', 'jn'): -3.14e+100,
     ('B', 'k'): -3.14e+100,
     ('B', 'l'): -4.905883584659895,
     ('B', 'ln'): -3.14e+100,
     ('B', 'm'): -3.6524299819046386,
     ('B', 'mg'): -3.14e+100,
     ('B', 'mq'): -6.78695300139688,
     ('B', 'n'): -1.6966257797548328,
     ('B', 'ng'): -3.14e+100,
     ('B', 'nr'): -2.2310495913769506,
     ('B', 'nrfg'): -5.873722175405573,
     ('B', 'nrt'): -4.985642733519195,
     ('B', 'ns'): -2.8228438314969213,
     ('B', 'nt'): -4.846091668182416,
     ('B', 'nz'): -3.94698846057672,
     ('B', 'o'): -8.433498702146057,
     ('B', 'p'): -4.200984132085048,
     ('B', 'q'): -6.998123858956596,
     ('B', 'qe'): -3.14e+100,
     ('B', 'qg'): -3.14e+100,
     ('B', 'r'): -3.4098187790818413,
     ('B', 'rg'): -3.14e+100,
     ('B', 'rr'): -12.434752841302146,
     ('B', 'rz'): -7.946116471570005,
     ('B', 's'): -5.522673590839954,
     ('B', 't'): -3.3647479094528574,
     ('B', 'tg'): -3.14e+100,
     ('B', 'u'): -9.163917277503234,
     ('B', 'ud'): -3.14e+100,
     ('B', 'ug'): -3.14e+100,
     ('B', 'uj'): -3.14e+100,
     ('B', 'ul'): -3.14e+100,
     ('B', 'uv'): -3.14e+100,
     ('B', 'uz'): -3.14e+100,
     ('B', 'v'): -2.6740584874265685,
     ('B', 'vd'): -9.044728760238115,
     ('B', 'vg'): -3.14e+100,
     ('B', 'vi'): -12.434752841302146,
     ('B', 'vn'): -4.3315610890163585,
     ('B', 'vq'): -12.147070768850364,
     ('B', 'w'): -3.14e+100,
     ('B', 'x'): -3.14e+100,
     ('B', 'y'): -9.844485675856319,
     ('B', 'yg'): -3.14e+100,
     ('B', 'z'): -7.045681111485645,
     ('B', 'zg'): -3.14e+100,
     ('E', 'a'): -3.14e+100,
     ('E', 'ad'): -3.14e+100,
     ('E', 'ag'): -3.14e+100,
     ('E', 'an'): -3.14e+100,
     ('E', 'b'): -3.14e+100,
     ('E', 'bg'): -3.14e+100,
     ('E', 'c'): -3.14e+100,
     ('E', 'd'): -3.14e+100,
     ('E', 'df'): -3.14e+100,
     ('E', 'dg'): -3.14e+100,
     ('E', 'e'): -3.14e+100,
     ('E', 'en'): -3.14e+100,
     ('E', 'f'): -3.14e+100,
     ('E', 'g'): -3.14e+100,
     ('E', 'h'): -3.14e+100,
     ('E', 'i'): -3.14e+100,
     ('E', 'in'): -3.14e+100,
     ('E', 'j'): -3.14e+100,
     ('E', 'jn'): -3.14e+100,
     ('E', 'k'): -3.14e+100,
     ('E', 'l'): -3.14e+100,
     ('E', 'ln'): -3.14e+100,
     ('E', 'm'): -3.14e+100,
     ('E', 'mg'): -3.14e+100,
     ('E', 'mq'): -3.14e+100,
     ('E', 'n'): -3.14e+100,
     ('E', 'ng'): -3.14e+100,
     ('E', 'nr'): -3.14e+100,
     ('E', 'nrfg'): -3.14e+100,
     ('E', 'nrt'): -3.14e+100,
     ('E', 'ns'): -3.14e+100,
     ('E', 'nt'): -3.14e+100,
     ('E', 'nz'): -3.14e+100,
     ('E', 'o'): -3.14e+100,
     ('E', 'p'): -3.14e+100,
     ('E', 'q'): -3.14e+100,
     ('E', 'qe'): -3.14e+100,
     ('E', 'qg'): -3.14e+100,
     ('E', 'r'): -3.14e+100,
     ('E', 'rg'): -3.14e+100,
     ('E', 'rr'): -3.14e+100,
     ('E', 'rz'): -3.14e+100,
     ('E', 's'): -3.14e+100,
     ('E', 't'): -3.14e+100,
     ('E', 'tg'): -3.14e+100,
     ('E', 'u'): -3.14e+100,
     ('E', 'ud'): -3.14e+100,
     ('E', 'ug'): -3.14e+100,
     ('E', 'uj'): -3.14e+100,
     ('E', 'ul'): -3.14e+100,
     ('E', 'uv'): -3.14e+100,
     ('E', 'uz'): -3.14e+100,
     ('E', 'v'): -3.14e+100,
     ('E', 'vd'): -3.14e+100,
     ('E', 'vg'): -3.14e+100,
     ('E', 'vi'): -3.14e+100,
     ('E', 'vn'): -3.14e+100,
     ('E', 'vq'): -3.14e+100,
     ('E', 'w'): -3.14e+100,
     ('E', 'x'): -3.14e+100,
     ('E', 'y'): -3.14e+100,
     ('E', 'yg'): -3.14e+100,
     ('E', 'z'): -3.14e+100,
     ('E', 'zg'): -3.14e+100,
     ('M', 'a'): -3.14e+100,
     ('M', 'ad'): -3.14e+100,
     ('M', 'ag'): -3.14e+100,
     ('M', 'an'): -3.14e+100,
     ('M', 'b'): -3.14e+100,
     ('M', 'bg'): -3.14e+100,
     ('M', 'c'): -3.14e+100,
     ('M', 'd'): -3.14e+100,
     ('M', 'df'): -3.14e+100,
     ('M', 'dg'): -3.14e+100,
     ('M', 'e'): -3.14e+100,
     ('M', 'en'): -3.14e+100,
     ('M', 'f'): -3.14e+100,
     ('M', 'g'): -3.14e+100,
     ('M', 'h'): -3.14e+100,
     ('M', 'i'): -3.14e+100,
     ('M', 'in'): -3.14e+100,
     ('M', 'j'): -3.14e+100,
     ('M', 'jn'): -3.14e+100,
     ('M', 'k'): -3.14e+100,
     ('M', 'l'): -3.14e+100,
     ('M', 'ln'): -3.14e+100,
     ('M', 'm'): -3.14e+100,
     ('M', 'mg'): -3.14e+100,
     ('M', 'mq'): -3.14e+100,
     ('M', 'n'): -3.14e+100,
     ('M', 'ng'): -3.14e+100,
     ('M', 'nr'): -3.14e+100,
     ('M', 'nrfg'): -3.14e+100,
     ('M', 'nrt'): -3.14e+100,
     ('M', 'ns'): -3.14e+100,
     ('M', 'nt'): -3.14e+100,
     ('M', 'nz'): -3.14e+100,
     ('M', 'o'): -3.14e+100,
     ('M', 'p'): -3.14e+100,
     ('M', 'q'): -3.14e+100,
     ('M', 'qe'): -3.14e+100,
     ('M', 'qg'): -3.14e+100,
     ('M', 'r'): -3.14e+100,
     ('M', 'rg'): -3.14e+100,
     ('M', 'rr'): -3.14e+100,
     ('M', 'rz'): -3.14e+100,
     ('M', 's'): -3.14e+100,
     ('M', 't'): -3.14e+100,
     ('M', 'tg'): -3.14e+100,
     ('M', 'u'): -3.14e+100,
     ('M', 'ud'): -3.14e+100,
     ('M', 'ug'): -3.14e+100,
     ('M', 'uj'): -3.14e+100,
     ('M', 'ul'): -3.14e+100,
     ('M', 'uv'): -3.14e+100,
     ('M', 'uz'): -3.14e+100,
     ('M', 'v'): -3.14e+100,
     ('M', 'vd'): -3.14e+100,
     ('M', 'vg'): -3.14e+100,
     ('M', 'vi'): -3.14e+100,
     ('M', 'vn'): -3.14e+100,
     ('M', 'vq'): -3.14e+100,
     ('M', 'w'): -3.14e+100,
     ('M', 'x'): -3.14e+100,
     ('M', 'y'): -3.14e+100,
     ('M', 'yg'): -3.14e+100,
     ('M', 'z'): -3.14e+100,
     ('M', 'zg'): -3.14e+100,
     ('S', 'a'): -3.9025396831295227,
     ('S', 'ad'): -11.048458480182255,
     ('S', 'ag'): -6.954113917960154,
     ('S', 'an'): -12.84021794941031,
     ('S', 'b'): -6.472888763970454,
     ('S', 'bg'): -3.14e+100,
     ('S', 'c'): -4.786966795861212,
     ('S', 'd'): -3.903919764181873,
     ('S', 'df'): -3.14e+100,
     ('S', 'dg'): -8.948397651299683,
     ('S', 'e'): -5.942513006281674,
     ('S', 'en'): -3.14e+100,
     ('S', 'f'): -5.194820249981676,
     ('S', 'g'): -6.507826815331734,
     ('S', 'h'): -8.650563207383884,
     ('S', 'i'): -3.14e+100,
     ('S', 'in'): -3.14e+100,
     ('S', 'j'): -4.911992119644354,
     ('S', 'jn'): -3.14e+100,
     ('S', 'k'): -6.940320595827818,
     ('S', 'l'): -3.14e+100,
     ('S', 'ln'): -3.14e+100,
     ('S', 'm'): -3.269200652116097,
     ('S', 'mg'): -10.825314928868044,
     ('S', 'mq'): -3.14e+100,
     ('S', 'n'): -3.8551483897645107,
     ('S', 'ng'): -4.913434861102905,
     ('S', 'nr'): -4.483663103956885,
     ('S', 'nrfg'): -3.14e+100,
     ('S', 'nrt'): -3.14e+100,
     ('S', 'ns'): -3.14e+100,
     ('S', 'nt'): -12.147070768850364,
     ('S', 'nz'): -3.14e+100,
     ('S', 'o'): -8.464460927750023,
     ('S', 'p'): -2.9868401813596317,
     ('S', 'q'): -4.888658618255058,
     ('S', 'qe'): -3.14e+100,
     ('S', 'qg'): -3.14e+100,
     ('S', 'r'): -2.7635336784127853,
     ('S', 'rg'): -10.275268591948773,
     ('S', 'rr'): -3.14e+100,
     ('S', 'rz'): -3.14e+100,
     ('S', 's'): -3.14e+100,
     ('S', 't'): -3.14e+100,
     ('S', 'tg'): -6.272842531880403,
     ('S', 'u'): -6.940320595827818,
     ('S', 'ud'): -7.728230161053767,
     ('S', 'ug'): -7.5394037026636855,
     ('S', 'uj'): -6.85251045118004,
     ('S', 'ul'): -8.4153713175535,
     ('S', 'uv'): -8.15808672228609,
     ('S', 'uz'): -9.299258625372996,
     ('S', 'v'): -3.053292303412302,
     ('S', 'vd'): -3.14e+100,
     ('S', 'vg'): -5.9430181843676895,
     ('S', 'vi'): -3.14e+100,
     ('S', 'vn'): -11.453923588290419,
     ('S', 'vq'): -3.14e+100,
     ('S', 'w'): -3.14e+100,
     ('S', 'x'): -8.427419656069674,
     ('S', 'y'): -6.1970794699489575,
     ('S', 'yg'): -13.533365129970255,
     ('S', 'z'): -3.14e+100,
     ('S', 'zg'): -3.14e+100}
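Reading the table: these appear to be natural-log start probabilities for the hidden states (B/M/E/S crossed with a POS tag), with -3.14e+100 standing in for log(0), i.e. a (state, tag) pair that can never begin a word. A quick sanity check, assuming the package layout of this commit:

import math
from jieba.posseg.prob_start import P

print(math.exp(P[('B', 'n')]))  # roughly 0.18: words very often start as nouns
print(P[('M', 'n')])            # -3.14e+100: a middle state never starts a word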
jieba/posseg/prob_trans.p
ADDED
The diff for this file is too large to render.
See raw diff
jieba/posseg/prob_trans.py
ADDED
The diff for this file is too large to render.
See raw diff
jieba/posseg/viterbi.py
ADDED
@@ -0,0 +1,53 @@
import sys
import operator
MIN_FLOAT = -3.14e100
MIN_INF = float("-inf")

if sys.version_info[0] > 2:
    xrange = range


def get_top_states(t_state_v, K=4):
    return sorted(t_state_v, key=t_state_v.__getitem__, reverse=True)[:K]


def viterbi(obs, states, start_p, trans_p, emit_p):
    V = [{}]  # tabular
    mem_path = [{}]
    all_states = trans_p.keys()
    for y in states.get(obs[0], all_states):  # init
        V[0][y] = start_p[y] + emit_p[y].get(obs[0], MIN_FLOAT)
        mem_path[0][y] = ''
    for t in xrange(1, len(obs)):
        V.append({})
        mem_path.append({})
        #prev_states = get_top_states(V[t-1])
        prev_states = [
            x for x in mem_path[t - 1].keys() if len(trans_p[x]) > 0]

        prev_states_expect_next = set(
            (y for x in prev_states for y in trans_p[x].keys()))
        obs_states = set(
            states.get(obs[t], all_states)) & prev_states_expect_next

        if not obs_states:
            obs_states = prev_states_expect_next if prev_states_expect_next else all_states

        for y in obs_states:
            prob, state = max((V[t - 1][y0] + trans_p[y0].get(y, MIN_INF) +
                               emit_p[y].get(obs[t], MIN_FLOAT), y0) for y0 in prev_states)
            V[t][y] = prob
            mem_path[t][y] = state

    last = [(V[-1][y], y) for y in mem_path[-1].keys()]
    # if len(last)==0:
    #     print obs
    prob, state = max(last)

    route = [None] * len(obs)
    i = len(obs) - 1
    while i >= 0:
        route[i] = state
        state = mem_path[i][state]
        i -= 1
    return (prob, route)
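A toy run of viterbi() with made-up log probabilities; the real tables live in prob_start.py, prob_trans.py and prob_emit.py, whose state keys are (position, POS) tuples rather than the plain strings used in this sketch.

from jieba.posseg.viterbi import viterbi  # path per this commit's layout

obs = "AB"
states = {"A": ["X", "Y"], "B": ["X", "Y"]}      # candidate states per symbol
start_p = {"X": -1.0, "Y": -2.0}                 # log P(first state)
trans_p = {"X": {"X": -1.0, "Y": -2.0},
           "Y": {"X": -2.0, "Y": -1.0}}          # log P(next state | state)
emit_p = {"X": {"A": -1.0, "B": -3.0},
          "Y": {"A": -3.0, "B": -1.0}}           # log P(symbol | state)

prob, route = viterbi(obs, states, start_p, trans_p, emit_p)
print(prob, route)  # -5.0 ['X', 'Y']: start in X (cheap 'A'), switch to Y (cheap 'B')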
models.json
ADDED
@@ -0,0 +1,5 @@
{
    "zh_core_web_sm": "Chinese (zh_core_web_sm)",
    "en_core_web_sm": "English (en_core_web_sm)",
    "ja_core_news_sm": "Japanese (ja_core_news_sm)"
}
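The pages below hard-code their MODEL_NAME, so nothing shown here consumes this mapping directly; a hypothetical loader for it would look like this:

import json
import spacy

with open("models.json", encoding="utf-8") as f:
    models = json.load(f)  # model package name -> display label

for package, label in models.items():
    nlp = spacy.load(package)  # works only if the model package is installed
    print(label, "->", nlp.pipe_names)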
pages/01_🍊Mandarin.py
ADDED
@@ -0,0 +1,209 @@
from collections import Counter
from dragonmapper import hanzi, transcriptions
import jieba
import pandas as pd
import plotly.express as px
import re
import requests
import spacy
from spacy_streamlit import visualize_ner, visualize_tokens
#from spacy.language import Language
from spacy.tokens import Doc
import streamlit as st

# Global variables
DEFAULT_TEXT = "我如此的過著孤單的生活,我沒有一個可以真正跟他談話的人,一直到六年前,我在撒哈拉沙漠飛機故障的時候。我的發動機裡有些東西壞了。而由於我身邊沒有機械師,也沒有乘客,我準備獨自去嘗試一次困難的修理。這對我是生死問題。我連足夠喝八天的水都沒有。頭一天晚上我在離開有人居住的地方一千英里的沙地上睡覺。我比一位漂流在汪洋大海裡的木筏上面的遇難者更孤單。當天剛破曉的時候,我被一種奇異的小聲音叫醒,你可以想像到,這時我是多麼的驚訝。那聲音說:「請你﹒﹒﹒給我畫一隻綿羊!」「哪!」「給我畫一隻綿羊!」《小王子》"
DESCRIPTION = "AI模型輔助語言學習:華語"
TOK_SEP = " | "
PUNCT_SYM = ["PUNCT", "SYM"]
MODEL_NAME = "zh_core_web_sm"

# External API callers
def moedict_caller(word):
    st.write(f"### {word}")
    req = requests.get(f"https://www.moedict.tw/uni/{word}.json")
    try:
        definitions = req.json().get('heteronyms')[0].get('definitions')
        df = pd.DataFrame(definitions)
        df.fillna("---", inplace=True)
        if 'example' not in df.columns:
            df['example'] = '---'
        if 'synonyms' not in df.columns:
            df['synonyms'] = '---'
        if 'antonyms' not in df.columns:
            df['antonyms'] = '---'
        cols = ['def', 'example', 'synonyms', 'antonyms']
        df = df[cols]
        df.rename(columns={
            'def': '解釋',
            'example': '例句',
            'synonyms': '同義詞',
            'antonyms': '反義詞',
        }, inplace=True)
        with st.expander("點擊 + 查看結果"):
            st.table(df)
    except:
        st.write("查無結果")

# Custom tokenizer class
class JiebaTokenizer:
    def __init__(self, vocab):
        self.vocab = vocab

    def __call__(self, text):
        words = jieba.cut(text)  # returns a generator
        tokens = list(words)  # convert the generator to a list
        spaces = [False] * len(tokens)
        doc = Doc(self.vocab, words=tokens, spaces=spaces)
        return doc

# Utility functions
def filter_tokens(doc):
    clean_tokens = [tok for tok in doc if tok.pos_ not in PUNCT_SYM]
    clean_tokens = (
        [tok for tok in clean_tokens if
         not tok.like_email and
         not tok.like_num and
         not tok.like_url and
         not tok.is_space]
    )
    return clean_tokens

def get_vocab(doc):
    clean_tokens = filter_tokens(doc)
    alphanum_pattern = re.compile(r"[a-zA-Z0-9]")
    clean_tokens_text = [tok.text for tok in clean_tokens if not alphanum_pattern.search(tok.text)]
    vocab = list(set(clean_tokens_text))
    return vocab

def get_counter(doc):
    clean_tokens = filter_tokens(doc)
    tokens = [token.text for token in clean_tokens]
    counter = Counter(tokens)
    return counter

def get_freq_fig(doc):
    counter = get_counter(doc)
    counter_df = (
        pd.DataFrame.from_dict(counter, orient='index').
        reset_index().
        rename(columns={
            0: 'count',
            'index': 'word'
        }).
        sort_values(by='count', ascending=False)
    )
    fig = px.bar(counter_df, x='word', y='count')
    return fig

def get_level_pie(tocfl_result):
    level = tocfl_result['詞條分級'].value_counts()
    fig = px.pie(tocfl_result,
                 values=level.values,
                 names=level.index,
                 title='詞彙分級圓餅圖')
    return fig

@st.cache
def load_tocfl_table(filename="./tocfl_wordlist.csv"):
    table = pd.read_csv(filename)
    cols = "詞彙 漢語拼音 注音 任務領域 詞條分級".split()
    table = table[cols]
    return table

# Page setting
st.set_page_config(
    page_icon="🤠",
    layout="wide",
    initial_sidebar_state="auto",
)
st.markdown(f"# {DESCRIPTION}")

# Load the model
nlp = spacy.load(MODEL_NAME)

# Add pipelines to spaCy
# nlp.add_pipe("yake") # keyword extraction
# nlp.add_pipe("merge_entities") # Merge entity spans to tokens

# Select a tokenizer if the Chinese model is chosen
selected_tokenizer = st.radio("請選擇斷詞模型", ["jieba-TW", "spaCy"])
if selected_tokenizer == "jieba-TW":
    nlp.tokenizer = JiebaTokenizer(nlp.vocab)

# Page starts from here
st.markdown("## 待分析文本")
st.info("請在下面的文字框輸入文本並按下Ctrl + Enter以更新分析結果")
text = st.text_area("", DEFAULT_TEXT, height=200)
doc = nlp(text)
st.markdown("---")

st.info("請勾選以下至少一項功能")
# keywords_extraction = st.sidebar.checkbox("關鍵詞分析", False) # YAKE doesn't work for Chinese texts
analyzed_text = st.checkbox("增強文本", True)
defs_examples = st.checkbox("單詞解析", True)
# morphology = st.sidebar.checkbox("詞形變化", True)
freq_count = st.checkbox("詞頻統計", True)
ner_viz = st.checkbox("命名實體", True)
tok_table = st.checkbox("斷詞特徵", False)

if analyzed_text:
    st.markdown("## 增強文本")
    pronunciation = st.radio("請選擇輔助發音類型", ["漢語拼音", "注音符號", "國際音標"])
    for idx, sent in enumerate(doc.sents):
        tokens_text = [tok.text for tok in sent if tok.pos_ not in PUNCT_SYM]
        pinyins = [hanzi.to_pinyin(word) for word in tokens_text]
        sounds = pinyins
        if pronunciation == "注音符號":
            zhuyins = [transcriptions.pinyin_to_zhuyin(word) for word in pinyins]
            sounds = zhuyins
        elif pronunciation == "國際音標":
            ipas = [transcriptions.pinyin_to_ipa(word) for word in pinyins]
            sounds = ipas

        display = []
        for text, sound in zip(tokens_text, sounds):
            res = f"{text} [{sound}]"
            display.append(res)
        if display:
            display_text = TOK_SEP.join(display)
            st.write(f"{idx+1} >>> {display_text}")
        else:
            st.write(f"{idx+1} >>> EMPTY LINE")

if defs_examples:
    st.markdown("## 單詞解析")
    vocab = get_vocab(doc)
    if vocab:
        tocfl_table = load_tocfl_table()
        filt = tocfl_table['詞彙'].isin(vocab)
        tocfl_res = tocfl_table[filt]
        st.markdown("### 華語詞彙分級")
        fig = get_level_pie(tocfl_res)
        st.plotly_chart(fig, use_container_width=True)

        with st.expander("點擊 + 查看結果"):
            st.table(tocfl_res)
        st.markdown("---")
        st.markdown("### 單詞解釋與例句")
        selected_words = st.multiselect("請選擇要查詢的單詞: ", vocab, vocab[-1])
        for w in selected_words:
            moedict_caller(w)

if freq_count:
    st.markdown("## 詞頻統計")
    counter = get_counter(doc)
    topK = st.slider('請選擇前K個高頻詞', 1, len(counter), 5)
    most_common = counter.most_common(topK)
    st.write(most_common)
    st.markdown("---")

    fig = get_freq_fig(doc)
    st.plotly_chart(fig, use_container_width=True)

if ner_viz:
    ner_labels = nlp.get_pipe("ner").labels
    visualize_ner(doc, labels=ner_labels, show_table=False, title="命名實體")

if tok_table:
    visualize_tokens(doc, attrs=["text", "pos_", "tag_", "dep_", "head"], title="斷詞特徵")
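The JiebaTokenizer class above follows spaCy's custom-tokenizer contract: any callable mapping a string to a Doc built from the pipeline's vocab can be assigned to nlp.tokenizer. A standalone sketch of the same pattern, with whitespace splitting standing in for jieba.cut:

import spacy
from spacy.tokens import Doc

class WhitespaceTokenizer:
    def __init__(self, vocab):
        self.vocab = vocab

    def __call__(self, text):
        words = text.split(" ")  # stand-in for jieba.cut(text)
        return Doc(self.vocab, words=words, spaces=[False] * len(words))

nlp = spacy.load("zh_core_web_sm")
nlp.tokenizer = WhitespaceTokenizer(nlp.vocab)
doc = nlp("我 愛 自然 語言 處理")
print([tok.text for tok in doc])  # ['我', '愛', '自然', '語言', '處理']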
pages/02_🍣Japanese.py
ADDED
@@ -0,0 +1,183 @@
from jisho_api.word import Word
from jisho_api.sentence import Sentence
import pandas as pd
import re
import requests
import spacy
from spacy_streamlit import visualize_ner, visualize_tokens
#from spacy.language import Language
from spacy.tokens import Doc
import spacy_ke
import streamlit as st

# Global variables
DEFAULT_TEXT = """それまで、ぼくはずっとひとりぼっちだった。だれともうちとけられないまま、6年まえ、ちょっとおかしくなって、サハラさばくに下りた。ぼくのエンジンのなかで、なにかがこわれていた。ぼくには、みてくれるひとも、おきゃくさんもいなかったから、なおすのはむずかしいけど、ぜんぶひとりでなんとかやってみることにした。それでぼくのいのちがきまってしまう。のみ水は、たった7日ぶんしかなかった。
1日めの夜、ぼくはすなの上でねむった。ひとのすむところは、はるかかなただった。海のどまんなか、いかだでさまよっているひとよりも、もっとひとりぼっち。だから、ぼくがびっくりしたのも、みんなわかってくれるとおもう。じつは、あさ日がのぼるころ、ぼくは、ふしぎなかわいいこえでおこされたんだ。
「ごめんください……ヒツジの絵をかいて!」
「えっ?」
「ぼくにヒツジの絵をかいて……」
『星の王子さま』"""
DESCRIPTION = "AI模型輔助語言學習:日語"
TOK_SEP = " | "
MODEL_NAME = "ja_ginza"

# External API callers
def parse_jisho_senses(word):
    res = Word.request(word)
    response = res.dict()
    if response["meta"]["status"] == 200:
        data = response["data"]
        commons = [d for d in data if d["is_common"]]
        if commons:
            common = commons[0]  # Only get the first entry that is common
            senses = common["senses"]
            if len(senses) > 3:
                senses = senses[:3]
            with st.container():
                for idx, sense in enumerate(senses):
                    eng_def = "; ".join(sense["english_definitions"])
                    pos = "/".join(sense["parts_of_speech"])
                    st.write(f"Sense {idx+1}: {eng_def} ({pos})")
        else:
            st.info("Found no common words on Jisho!")
    else:
        st.error("Can't get response from Jisho!")


def parse_jisho_sentences(word):
    res = Sentence.request(word)
    try:
        response = res.dict()
        data = response["data"]
        if len(data) > 3:
            sents = data[:3]
        else:
            sents = data
        with st.container():
            for idx, sent in enumerate(sents):
                eng = sent["en_translation"]
                jap = sent["japanese"]
                st.write(f"Sentence {idx+1}: {jap}")
                st.write(f"({eng})")
    except:
        st.info("Found no results on Jisho!")

# Utility functions
def create_jap_df(tokens):
    seen_texts = []
    filtered_tokens = []
    for tok in tokens:
        if tok.text not in seen_texts:
            filtered_tokens.append(tok)
            seen_texts.append(tok.text)  # remember the text so later duplicates are skipped

    df = pd.DataFrame(
        {
            "單詞": [tok.text for tok in filtered_tokens],
            "發音": ["/".join(tok.morph.get("Reading")) for tok in filtered_tokens],
            "詞形變化": ["/".join(tok.morph.get("Inflection")) for tok in filtered_tokens],
            "原形": [tok.lemma_ for tok in filtered_tokens],
            #"正規形": [tok.norm_ for tok in verbs],
        }
    )
    st.dataframe(df)
    csv = df.to_csv().encode('utf-8')
    st.download_button(
        label="下載表格",
        data=csv,
        file_name='jap_forms.csv',
    )

def filter_tokens(doc):
    clean_tokens = [tok for tok in doc if tok.pos_ not in ["PUNCT", "SYM"]]
    clean_tokens = [tok for tok in clean_tokens if not tok.like_email]
    clean_tokens = [tok for tok in clean_tokens if not tok.like_url]
    clean_tokens = [tok for tok in clean_tokens if not tok.like_num]
    clean_tokens = [tok for tok in clean_tokens if not tok.is_punct]
    clean_tokens = [tok for tok in clean_tokens if not tok.is_space]
    return clean_tokens

def create_kw_section(doc):
    st.markdown("## 關鍵詞分析")
    kw_num = st.slider("請選擇關鍵詞數量", 1, 10, 3)
    kws2scores = {keyword: score for keyword, score in doc._.extract_keywords(n=kw_num)}
    kws2scores = sorted(kws2scores.items(), key=lambda x: x[1], reverse=True)
    count = 1
    for keyword, score in kws2scores:
        rounded_score = round(score, 3)
        st.write(f"{count} >>> {keyword} ({rounded_score})")
        count += 1

# Page setting
st.set_page_config(
    page_icon="🤠",
    layout="wide",
    initial_sidebar_state="auto",
)
st.markdown(f"# {DESCRIPTION}")

# Load the model
nlp = spacy.load(MODEL_NAME)

# Add pipelines to spaCy
nlp.add_pipe("yake") # keyword extraction
# nlp.add_pipe("merge_entities") # Merge entity spans to tokens

# Page starts from here
st.markdown("## 待分析文本")
st.info("請在下面的文字框輸入文本並按下Ctrl + Enter以更新分析結果")
text = st.text_area("", DEFAULT_TEXT, height=200)
doc = nlp(text)
st.markdown("---")

st.info("請勾選以下至少一項功能")
keywords_extraction = st.checkbox("關鍵詞分析", False)
analyzed_text = st.checkbox("增強文本", True)
defs_examples = st.checkbox("單詞解析", True)
morphology = st.checkbox("詞形變化", False)
ner_viz = st.checkbox("命名實體", True)
tok_table = st.checkbox("斷詞特徵", False)

if keywords_extraction:
    create_kw_section(doc)

if analyzed_text:
    st.markdown("## 分析後文本")
    for idx, sent in enumerate(doc.sents):
        clean_tokens = [tok for tok in sent if tok.pos_ not in ["PUNCT", "SYM"]]
        tokens_text = [tok.text for tok in clean_tokens]
        readings = ["/".join(tok.morph.get("Reading")) for tok in clean_tokens]
        display = [f"{text} [{reading}]" for text, reading in zip(tokens_text, readings)]
        if display:
            display_text = TOK_SEP.join(display)
            st.write(f"{idx+1} >>> {display_text}")
        else:
            st.write(f"{idx+1} >>> EMPTY LINE")

if defs_examples:
    st.markdown("## 單詞解釋與例句")
    clean_tokens = filter_tokens(doc)
    alphanum_pattern = re.compile(r"[a-zA-Z0-9]")
    clean_lemmas = [tok.lemma_ for tok in clean_tokens if not alphanum_pattern.search(tok.lemma_)]
    vocab = list(set(clean_lemmas))
    if vocab:
        selected_words = st.multiselect("請選擇要查詢的單詞: ", vocab, vocab[0:3])
        for w in selected_words:
            st.write(f"### {w}")
            with st.expander("點擊 + 檢視結果"):
                parse_jisho_senses(w)
                parse_jisho_sentences(w)

if morphology:
    st.markdown("## 詞形變化")
    # Collect inflected forms
    inflected_forms = [tok for tok in doc if tok.tag_.startswith("動詞") or tok.tag_.startswith("形")]
    if inflected_forms:
        create_jap_df(inflected_forms)

if ner_viz:
    ner_labels = nlp.get_pipe("ner").labels
    visualize_ner(doc, labels=ner_labels, show_table=False, title="命名實體")

if tok_table:
    visualize_tokens(doc, attrs=["text", "pos_", "tag_", "dep_", "head"], title="斷詞特徵")
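The page above relies on the token-level morph features that GiNZA attaches ("Reading", "Inflection"). A minimal check of that lookup, assuming ja_ginza is installed:

import spacy

nlp = spacy.load("ja_ginza")
for tok in nlp("ヒツジの絵をかいて"):
    # morph.get returns a list of values for the named feature ([] if absent)
    print(tok.text, tok.morph.get("Reading"), tok.lemma_)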
pages/03_🍔English.py
ADDED
@@ -0,0 +1,206 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import pandas as pd
import re
import requests
import spacy
from spacy_streamlit import visualize_ner, visualize_tokens
#from spacy.language import Language
from spacy.tokens import Doc
import spacy_ke
import streamlit as st

# Global variables
DEFAULT_TEXT = """So I lived my life alone, without anyone that I could really talk to, until I had an accident with my plane in the Desert of Sahara, six years ago. Something was broken in my engine. And as I had with me neither a mechanic nor any passengers, I set myself to attempt the difficult repairs all alone. It was a question of life or death for me: I had scarcely enough drinking water to last a week. The first night, then, I went to sleep on the sand, a thousand miles from any human habitation. I was more isolated than a shipwrecked sailor on a raft in the middle of the ocean. Thus you can imagine my amazement, at sunrise, when I was awakened by an odd little voice. It said:

"If you please−− draw me a sheep!"

"What!"

"Draw me a sheep!"

The Little Prince
"""
DESCRIPTION = "AI模型輔助語言學習:英語"
TOK_SEP = " | "
MODEL_NAME = "en_core_web_sm"
API_LOOKUP = {}
MAX_SYM_NUM = 5

# External API caller
def free_dict_caller(word):
    req = requests.get(f"https://api.dictionaryapi.dev/api/v2/entries/en/{word}")
    try:
        result = req.json()[0]
        if word not in API_LOOKUP:
            API_LOOKUP[word] = result
    except (ValueError, KeyError, IndexError):
        # No entry found or malformed response; leave the cache untouched
        pass

def show_definitions_and_examples(word, pos):
    if word not in API_LOOKUP:
        free_dict_caller(word)

    result = API_LOOKUP.get(word)
    if result:
        meanings = result.get('meanings')
        if meanings:
            definitions = []
            for meaning in meanings:
                if meaning['partOfSpeech'] == pos.lower():
                    definitions = meaning.get('definitions')

            if len(definitions) > 3:
                definitions = definitions[:3]

            for definition in definitions:
                df = definition.get("definition")
                ex = definition.get("example")
                st.markdown(f" - {df}")
                if ex:  # not every definition carries an example sentence
                    st.markdown(f" Example: *{ex}*")
                st.markdown("---")

    else:
        st.info("Found no matching result on Free Dictionary!")

def get_synonyms(word, pos):
    if word not in API_LOOKUP:
        free_dict_caller(word)

    result = API_LOOKUP.get(word)
    if result:
        meanings = result.get('meanings')
        if meanings:
            synonyms = []
            for meaning in meanings:
                if meaning['partOfSpeech'] == pos.lower():
                    synonyms = meaning.get('synonyms')
            return synonyms

# Utility functions
def create_eng_df(tokens):
    seen_texts = []
    filtered_tokens = []
    for tok in tokens:
        if tok.lemma_ not in seen_texts:
            filtered_tokens.append(tok)
            seen_texts.append(tok.lemma_)  # remember the lemma so duplicates are skipped

    df = pd.DataFrame(
        {
            "單詞": [tok.text.lower() for tok in filtered_tokens],
            "詞類": [tok.pos_ for tok in filtered_tokens],
            "原形": [tok.lemma_ for tok in filtered_tokens],
        }
    )
    st.dataframe(df)
    csv = df.to_csv().encode('utf-8')
    st.download_button(
        label="下載表格",
        data=csv,
        file_name='eng_forms.csv',
    )

def filter_tokens(doc):
    clean_tokens = [tok for tok in doc if tok.pos_ not in ["PUNCT", "SYM"]]
    clean_tokens = [tok for tok in clean_tokens if not tok.like_email]
    clean_tokens = [tok for tok in clean_tokens if not tok.like_url]
    clean_tokens = [tok for tok in clean_tokens if not tok.like_num]
    clean_tokens = [tok for tok in clean_tokens if not tok.is_punct]
    clean_tokens = [tok for tok in clean_tokens if not tok.is_space]
    return clean_tokens

def create_kw_section(doc):
    st.markdown("## 關鍵詞分析")
    kw_num = st.slider("請選擇關鍵詞數量", 1, 10, 3)
    kws2scores = {keyword: score for keyword, score in doc._.extract_keywords(n=kw_num)}
    kws2scores = sorted(kws2scores.items(), key=lambda x: x[1], reverse=True)
    count = 1
    for keyword, score in kws2scores:
        rounded_score = round(score, 3)
        st.write(f"{count} >>> {keyword} ({rounded_score})")
        count += 1

# Page setting
st.set_page_config(
    page_icon="🤠",
    layout="wide",
    initial_sidebar_state="auto",
)
st.markdown(f"# {DESCRIPTION}")

# Load the language model
nlp = spacy.load(MODEL_NAME)

# Add pipelines to spaCy
nlp.add_pipe("yake")  # keyword extraction
# nlp.add_pipe("merge_entities")  # Merge entity spans to tokens

# Page starts from here
st.markdown("## 待分析文本")
st.info("請在下面的文字框輸入文本並按下Ctrl + Enter以更新分析結果")
text = st.text_area("", DEFAULT_TEXT, height=200)
doc = nlp(text)
st.markdown("---")

st.info("請勾選以下至少一項功能")
keywords_extraction = st.checkbox("關鍵詞分析", False)
analyzed_text = st.checkbox("增強文本", True)
defs_examples = st.checkbox("單詞解析", True)
morphology = st.checkbox("詞形變化", False)
ner_viz = st.checkbox("命名實體", True)
tok_table = st.checkbox("斷詞特徵", False)

if keywords_extraction:
    create_kw_section(doc)

if analyzed_text:
    st.markdown("## 分析後文本")
    for idx, sent in enumerate(doc.sents):
        enriched_sentence = []
        for tok in sent:
            if tok.pos_ != "VERB":
                enriched_sentence.append(tok.text)
            else:
                synonyms = get_synonyms(tok.text, tok.pos_)
                if synonyms:
                    if len(synonyms) > MAX_SYM_NUM:
                        synonyms = synonyms[:MAX_SYM_NUM]
                    added_verbs = " | ".join(synonyms)
                    enriched_tok = f"{tok.text} (cf. {added_verbs})"
                    enriched_sentence.append(enriched_tok)
                else:
                    enriched_sentence.append(tok.text)

        display_text = " ".join(enriched_sentence)
        st.write(f"{idx+1} >>> {display_text}")

if defs_examples:
    st.markdown("## 單詞解釋與例句")
    clean_tokens = filter_tokens(doc)
    num_pattern = re.compile(r"[0-9]")
    clean_tokens = [tok for tok in clean_tokens if not num_pattern.search(tok.lemma_)]
    selected_pos = ["VERB", "NOUN", "ADJ", "ADV"]
    clean_tokens = [tok for tok in clean_tokens if tok.pos_ in selected_pos]
    tokens_lemma_pos = [tok.lemma_ + " | " + tok.pos_ for tok in clean_tokens]
    vocab = list(set(tokens_lemma_pos))
    if vocab:
        selected_words = st.multiselect("請選擇要查詢的單詞: ", vocab, vocab[0:3])
        for w in selected_words:
            word_pos = w.split("|")
            word = word_pos[0].strip()
            pos = word_pos[1].strip()
            st.write(f"### {w}")
            with st.expander("點擊 + 檢視結果"):
                show_definitions_and_examples(word, pos)

if morphology:
    st.markdown("## 詞形變化")
    # Collect inflected forms
    inflected_forms = [tok for tok in doc if tok.text.lower() != tok.lemma_.lower()]
    if inflected_forms:
        create_eng_df(inflected_forms)

if ner_viz:
    ner_labels = nlp.get_pipe("ner").labels
    visualize_ner(doc, labels=ner_labels, show_table=False, title="命名實體")

if tok_table:
    visualize_tokens(doc, attrs=["text", "pos_", "tag_", "dep_", "head"], title="斷詞特徵")
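For context on the two API helpers above: dictionaryapi.dev returns a JSON list of entries, each carrying a "meanings" list keyed by partOfSpeech, and free_dict_caller() caches the first entry per word in API_LOOKUP so the sentence-enrichment loop hits the network at most once per word. A minimal standalone sketch of that response shape, as served at the time of writing; the lookup word is illustrative and assumed to exist in the dictionary:

import requests

word = "draw"  # hypothetical lookup, not taken from the app
resp = requests.get(f"https://api.dictionaryapi.dev/api/v2/entries/en/{word}")
entry = resp.json()[0]  # the app keeps only the first entry per word
for meaning in entry["meanings"]:
    pos = meaning["partOfSpeech"]                     # e.g. "verb", "noun"
    first_def = meaning["definitions"][0]["definition"]
    synonyms = meaning.get("synonyms", [])            # what get_synonyms() returns
    print(f"{pos}: {first_def} | synonyms: {synonyms[:3]}")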
requirements.txt
ADDED
@@ -0,0 +1,27 @@
# ja_ginza is a Japanese model with a lemmatizer and a morphologizer more fine-grained than spaCy's defaults
ginza
ja_ginza

# ja_ginza depends on spacy>=3.2.0,<3.3.0
spacy>=3.2.0,<3.3.0
spacy-streamlit>=1.0.0rc1,<1.1.0
spacy-wordnet

# spacy-wordnet depends on nltk
nltk

# spaCy models
https://github.com/explosion/spacy-models/releases/download/zh_core_web_sm-3.2.0/zh_core_web_sm-3.2.0.tar.gz#egg=zh_core_web_sm
https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.2.0/en_core_web_sm-3.2.0.tar.gz#egg=en_core_web_sm

# conversion between hanzi and transcriptions
dragonmapper

# Jisho online Japanese dictionary
jisho_api

# YAKE keyword extraction
spacy-ke

# interactive plotting
plotly
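A quick smoke test, sketched under the assumption that the pins above installed cleanly, confirming that the three models the pages rely on can be loaded:

import spacy

# zh_core_web_sm and en_core_web_sm come from the release URLs above;
# ja_ginza is installed as a regular package.
for model in ("zh_core_web_sm", "en_core_web_sm", "ja_ginza"):
    nlp = spacy.load(model)
    print(model, "->", nlp.pipe_names)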
tocfl_wordlist.csv
ADDED
The diff for this file is too large to render.
See raw diff
update_data.py
ADDED
@@ -0,0 +1,63 @@
"""A little helper script to generate the requirements.txt and models.json with
the latest supported model versions based on the compatibility.json."""
from spacy.about import __compatibility__ as COMPAT_URL
from spacy.util import get_lang_class, is_compatible_version
from pathlib import Path
import requests
import typer
import srsly


URL_TEMPLATE = "https://github.com/explosion/spacy-models/releases/download/{name}-{version}/{name}-{version}.tar.gz#egg={name}=={version}"


def main(
    # fmt: off
    spacy_version: str = typer.Argument(">=3.0.0,<3.1.0", help="The spaCy version range"),
    spacy_streamlit_version: str = typer.Argument(">=1.0.0rc1,<1.1.0", help="The version range of spacy-streamlit"),
    req_path: Path = typer.Option(Path(__file__).parent / "requirements.txt", "--requirements-path", "-rp", help="Path to requirements.txt"),
    desc_path: Path = typer.Option(Path(__file__).parent / "models.json", "--models-json-path", "-mp", help="Path to models.json with model details for dropdown"),
    package: str = typer.Option("spacy", "--package", "-p", help="The parent package (spacy, spacy-nightly, etc.)"),
    exclude: str = typer.Option("en_vectors_web_lg", "--exclude", "-e", help="Comma-separated model names to exclude"),
    # fmt: on
):
    exclude = [name.strip() for name in exclude.split(",")]
    r = requests.get(COMPAT_URL)
    r.raise_for_status()
    compat = r.json()["spacy"]
    data = None
    for version_option in compat:
        if is_compatible_version(version_option, spacy_version):
            data = compat[version_option]
            break
    if data is None:
        raise ValueError(f"No compatible models found for {spacy_version}")
    reqs = [
        f"# Auto-generated by {Path(__file__).name}",
        f"{package}{spacy_version}",
        f"spacy-streamlit{spacy_streamlit_version}",
    ]
    models = {}
    for model_name, model_versions in data.items():
        if model_name not in exclude and model_versions:
            url = URL_TEMPLATE.format(name=model_name, version=model_versions[0])
            # We do a quick check if the URL exists
            r = requests.get(url, headers={"Range": "bytes=0"})
            if r.status_code == 404:
                print(f"Invalid package URL (skipping): {url}")
                continue
            reqs.append(url)
            lang = model_name.split("_", 1)[0]
            lang_name = get_lang_class(lang).__name__
            models[model_name] = f"{lang_name} ({model_name})"
    # Sort by human-readable language name, then by model size
    sort_key = lambda x: f"{x[1].split(' ')[0]}_{['sm', 'md', 'lg', 'trf'].index(x[0].split('_')[-1])}"
    models = {name: desc for name, desc in sorted(models.items(), key=sort_key)}
    with Path(req_path).open("w", encoding="utf8") as f:
        f.write("\n".join(reqs))
    srsly.write_json(desc_path, models)
    print(f"Generated requirements.txt and models.json for {len(reqs) - 1} models")


if __name__ == "__main__":
    typer.run(main)
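Usage note: because typer.run(main) turns the two version ranges into positional arguments and everything else into options, regenerating the files with this Space's own pins would look roughly like

    python update_data.py ">=3.2.0,<3.3.0" ">=1.0.0rc1,<1.1.0" --package spacy --exclude en_vectors_web_lg

which rewrites requirements.txt and models.json next to the script.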