jeffeux committed on
Commit
d825710
1 Parent(s): 1886cbc

Migrate to HF Space

.DS_Store ADDED
Binary file (6.15 kB).
 
LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2020 ExplosionAI GmbH
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
README.md CHANGED
@@ -1,12 +1,7 @@
1
- ---
2
- title: Spacy Streamlit Haowenchiang
3
- emoji: 🦀
4
- colorFrom: purple
5
- colorTo: yellow
6
- sdk: streamlit
7
- sdk_version: 1.10.0
8
- app_file: app.py
9
- pinned: false
10
- ---
11
 
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
1
+ [![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/howard-haowen/spacy-streamlit/HEAD)
2
 
3
+ This repo hosts a [Streamlit web app](https://share.streamlit.io/howard-haowen/spacy-streamlit/app.py) that leverages the power of [spaCy](https://spacy.io/) to assist language learning. It currently supports the following languages:
4
+
5
+ - Mandarin
6
+ - English
7
+ - Japanese
app.py ADDED
@@ -0,0 +1,14 @@
1
+ import streamlit as st
2
+
3
+ st.markdown("""
4
+
5
+ # AI模型輔助語言學習
6
+
7
+ ## Language Learning Assisted by AI Models
8
+
9
+ - 開啟左側選單可選擇語言,目前支援華語、日語和英語。
10
+ - Select a language from the sidebar. Supported languages include Mandarin, Japanese, and English.
11
+ - 選單自動隱藏時,點選左上角 > 符號以開啟選單。
12
+ - If the sidebar is hidden, click on the > symbol in the upper left corner to open it.
13
+
14
+ """)
jieba/__init__.py ADDED
@@ -0,0 +1,581 @@
1
+ from __future__ import absolute_import, unicode_literals
2
+ __version__ = '0.38'
3
+ __license__ = 'MIT'
4
+
5
+ import re
6
+ import os
7
+ import sys
8
+ import time
9
+ import logging
10
+ import marshal
11
+ import tempfile
12
+ import threading
13
+ from math import log
14
+ from hashlib import md5
15
+ from ._compat import *
16
+ from . import finalseg
17
+
18
+ if os.name == 'nt':
19
+ from shutil import move as _replace_file
20
+ else:
21
+ _replace_file = os.rename
22
+
23
+ _get_abs_path = lambda path: os.path.normpath(os.path.join(os.getcwd(), path))
24
+
25
+ DEFAULT_DICT = None
26
+ DEFAULT_DICT_NAME = "dict.txt"
27
+
28
+ log_console = logging.StreamHandler(sys.stderr)
29
+ default_logger = logging.getLogger(__name__)
30
+ default_logger.setLevel(logging.DEBUG)
31
+ default_logger.addHandler(log_console)
32
+
33
+ DICT_WRITING = {}
34
+
35
+ pool = None
36
+
37
+ re_userdict = re.compile('^(.+?)( [0-9]+)?( [a-z]+)?$', re.U)
38
+
39
+ re_eng = re.compile('[a-zA-Z0-9]', re.U)
40
+
41
+ # \u4E00-\u9FD5a-zA-Z0-9+#&\._ : All non-space characters. Will be handled with re_han
42
+ # \r\n|\s : whitespace characters. Will not be handled.
43
+ re_han_default = re.compile("([\u4E00-\u9FD5a-zA-Z0-9+#&\._]+)", re.U)
44
+ re_skip_default = re.compile("(\r\n|\s)", re.U)
45
+ re_han_cut_all = re.compile("([\u4E00-\u9FD5]+)", re.U)
46
+ re_skip_cut_all = re.compile("[^a-zA-Z0-9+#\n]", re.U)
47
+
48
+ def setLogLevel(log_level):
49
+ global logger
50
+ default_logger.setLevel(log_level)
51
+
52
+ class Tokenizer(object):
53
+
54
+ def __init__(self, dictionary=DEFAULT_DICT):
55
+ self.lock = threading.RLock()
56
+ if dictionary == DEFAULT_DICT:
57
+ self.dictionary = dictionary
58
+ else:
59
+ self.dictionary = _get_abs_path(dictionary)
60
+ self.FREQ = {}
61
+ self.total = 0
62
+ self.user_word_tag_tab = {}
63
+ self.initialized = False
64
+ self.tmp_dir = None
65
+ self.cache_file = None
66
+
67
+ def __repr__(self):
68
+ return '<Tokenizer dictionary=%r>' % self.dictionary
69
+
70
+ def gen_pfdict(self, f):
71
+ lfreq = {}
72
+ ltotal = 0
73
+ f_name = resolve_filename(f)
74
+ for lineno, line in enumerate(f, 1):
75
+ try:
76
+ line = line.strip().decode('utf-8')
77
+ word, freq = line.split(' ')[:2]
78
+ freq = int(freq)
79
+ lfreq[word] = freq
80
+ ltotal += freq
81
+ for ch in xrange(len(word)):
82
+ wfrag = word[:ch + 1]
83
+ if wfrag not in lfreq:
84
+ lfreq[wfrag] = 0
85
+ except ValueError:
86
+ raise ValueError(
87
+ 'invalid dictionary entry in %s at Line %s: %s' % (f_name, lineno, line))
88
+ f.close()
89
+ return lfreq, ltotal
90
+
91
+ def initialize(self, dictionary=None):
92
+ if dictionary:
93
+ abs_path = _get_abs_path(dictionary)
94
+ if self.dictionary == abs_path and self.initialized:
95
+ return
96
+ else:
97
+ self.dictionary = abs_path
98
+ self.initialized = False
99
+ else:
100
+ abs_path = self.dictionary
101
+
102
+ with self.lock:
103
+ try:
104
+ with DICT_WRITING[abs_path]:
105
+ pass
106
+ except KeyError:
107
+ pass
108
+ if self.initialized:
109
+ return
110
+
111
+ default_logger.debug("Building prefix dict from %s ..." % (abs_path or 'the default dictionary'))
112
+ t1 = time.time()
113
+ if self.cache_file:
114
+ cache_file = self.cache_file
115
+ # default dictionary
116
+ elif abs_path == DEFAULT_DICT:
117
+ cache_file = "jieba.cache"
118
+ # custom dictionary
119
+ else:
120
+ cache_file = "jieba.u%s.cache" % md5(
121
+ abs_path.encode('utf-8', 'replace')).hexdigest()
122
+ cache_file = os.path.join(
123
+ self.tmp_dir or tempfile.gettempdir(), cache_file)
124
+ # prevent absolute path in self.cache_file
125
+ tmpdir = os.path.dirname(cache_file)
126
+
127
+ load_from_cache_fail = True
128
+ if os.path.isfile(cache_file) and (abs_path == DEFAULT_DICT or
129
+ os.path.getmtime(cache_file) > os.path.getmtime(abs_path)):
130
+ default_logger.debug(
131
+ "Loading model from cache %s" % cache_file)
132
+ try:
133
+ with open(cache_file, 'rb') as cf:
134
+ self.FREQ, self.total = marshal.load(cf)
135
+ load_from_cache_fail = False
136
+ except Exception:
137
+ load_from_cache_fail = True
138
+
139
+ if load_from_cache_fail:
140
+ wlock = DICT_WRITING.get(abs_path, threading.RLock())
141
+ DICT_WRITING[abs_path] = wlock
142
+ with wlock:
143
+ self.FREQ, self.total = self.gen_pfdict(self.get_dict_file())
144
+ default_logger.debug(
145
+ "Dumping model to file cache %s" % cache_file)
146
+ try:
147
+ # prevent moving across different filesystems
148
+ fd, fpath = tempfile.mkstemp(dir=tmpdir)
149
+ with os.fdopen(fd, 'wb') as temp_cache_file:
150
+ marshal.dump(
151
+ (self.FREQ, self.total), temp_cache_file)
152
+ _replace_file(fpath, cache_file)
153
+ except Exception:
154
+ default_logger.exception("Dump cache file failed.")
155
+
156
+ try:
157
+ del DICT_WRITING[abs_path]
158
+ except KeyError:
159
+ pass
160
+
161
+ self.initialized = True
162
+ default_logger.debug(
163
+ "Loading model cost %.3f seconds." % (time.time() - t1))
164
+ default_logger.debug("Prefix dict has been built succesfully.")
165
+
166
+ def check_initialized(self):
167
+ if not self.initialized:
168
+ self.initialize()
169
+
170
+ def calc(self, sentence, DAG, route):
171
+ N = len(sentence)
172
+ route[N] = (0, 0)
173
+ logtotal = log(self.total)
174
+ for idx in xrange(N - 1, -1, -1):
175
+ route[idx] = max((log(self.FREQ.get(sentence[idx:x + 1]) or 1) -
176
+ logtotal + route[x + 1][0], x) for x in DAG[idx])
177
+
178
+ def get_DAG(self, sentence):
179
+ self.check_initialized()
180
+ DAG = {}
181
+ N = len(sentence)
182
+ for k in xrange(N):
183
+ tmplist = []
184
+ i = k
185
+ frag = sentence[k]
186
+ while i < N and frag in self.FREQ:
187
+ if self.FREQ[frag]:
188
+ tmplist.append(i)
189
+ i += 1
190
+ frag = sentence[k:i + 1]
191
+ if not tmplist:
192
+ tmplist.append(k)
193
+ DAG[k] = tmplist
194
+ return DAG
195
+
196
+ def __cut_all(self, sentence):
197
+ dag = self.get_DAG(sentence)
198
+ old_j = -1
199
+ for k, L in iteritems(dag):
200
+ if len(L) == 1 and k > old_j:
201
+ yield sentence[k:L[0] + 1]
202
+ old_j = L[0]
203
+ else:
204
+ for j in L:
205
+ if j > k:
206
+ yield sentence[k:j + 1]
207
+ old_j = j
208
+
209
+ def __cut_DAG_NO_HMM(self, sentence):
210
+ DAG = self.get_DAG(sentence)
211
+ route = {}
212
+ self.calc(sentence, DAG, route)
213
+ x = 0
214
+ N = len(sentence)
215
+ buf = ''
216
+ while x < N:
217
+ y = route[x][1] + 1
218
+ l_word = sentence[x:y]
219
+ if re_eng.match(l_word) and len(l_word) == 1:
220
+ buf += l_word
221
+ x = y
222
+ else:
223
+ if buf:
224
+ yield buf
225
+ buf = ''
226
+ yield l_word
227
+ x = y
228
+ if buf:
229
+ yield buf
230
+ buf = ''
231
+
232
+ def __cut_DAG(self, sentence):
233
+ DAG = self.get_DAG(sentence)
234
+ route = {}
235
+ self.calc(sentence, DAG, route)
236
+ x = 0
237
+ buf = ''
238
+ N = len(sentence)
239
+ while x < N:
240
+ y = route[x][1] + 1
241
+ l_word = sentence[x:y]
242
+ if y - x == 1:
243
+ buf += l_word
244
+ else:
245
+ if buf:
246
+ if len(buf) == 1:
247
+ yield buf
248
+ buf = ''
249
+ else:
250
+ if not self.FREQ.get(buf):
251
+ recognized = finalseg.cut(buf)
252
+ for t in recognized:
253
+ yield t
254
+ else:
255
+ for elem in buf:
256
+ yield elem
257
+ buf = ''
258
+ yield l_word
259
+ x = y
260
+
261
+ if buf:
262
+ if len(buf) == 1:
263
+ yield buf
264
+ elif not self.FREQ.get(buf):
265
+ recognized = finalseg.cut(buf)
266
+ for t in recognized:
267
+ yield t
268
+ else:
269
+ for elem in buf:
270
+ yield elem
271
+
272
+ def cut(self, sentence, cut_all=False, HMM=True):
273
+ '''
274
+ The main function that segments an entire sentence that contains
275
+ Chinese characters into separated words.
276
+ Parameter:
277
+ - sentence: The str(unicode) to be segmented.
278
+ - cut_all: Model type. True for full pattern, False for accurate pattern.
279
+ - HMM: Whether to use the Hidden Markov Model.
280
+ '''
281
+ sentence = strdecode(sentence)
282
+
283
+ if cut_all:
284
+ re_han = re_han_cut_all
285
+ re_skip = re_skip_cut_all
286
+ else:
287
+ re_han = re_han_default
288
+ re_skip = re_skip_default
289
+ if cut_all:
290
+ cut_block = self.__cut_all
291
+ elif HMM:
292
+ cut_block = self.__cut_DAG
293
+ else:
294
+ cut_block = self.__cut_DAG_NO_HMM
295
+ blocks = re_han.split(sentence)
296
+ for blk in blocks:
297
+ if not blk:
298
+ continue
299
+ if re_han.match(blk):
300
+ for word in cut_block(blk):
301
+ yield word
302
+ else:
303
+ tmp = re_skip.split(blk)
304
+ for x in tmp:
305
+ if re_skip.match(x):
306
+ yield x
307
+ elif not cut_all:
308
+ for xx in x:
309
+ yield xx
310
+ else:
311
+ yield x
312
+
313
+ def cut_for_search(self, sentence, HMM=True):
314
+ """
315
+ Finer segmentation for search engines.
316
+ """
317
+ words = self.cut(sentence, HMM=HMM)
318
+ for w in words:
319
+ if len(w) > 2:
320
+ for i in xrange(len(w) - 1):
321
+ gram2 = w[i:i + 2]
322
+ if self.FREQ.get(gram2):
323
+ yield gram2
324
+ if len(w) > 3:
325
+ for i in xrange(len(w) - 2):
326
+ gram3 = w[i:i + 3]
327
+ if self.FREQ.get(gram3):
328
+ yield gram3
329
+ yield w
330
+
331
+ def lcut(self, *args, **kwargs):
332
+ return list(self.cut(*args, **kwargs))
333
+
334
+ def lcut_for_search(self, *args, **kwargs):
335
+ return list(self.cut_for_search(*args, **kwargs))
336
+
337
+ _lcut = lcut
338
+ _lcut_for_search = lcut_for_search
339
+
340
+ def _lcut_no_hmm(self, sentence):
341
+ return self.lcut(sentence, False, False)
342
+
343
+ def _lcut_all(self, sentence):
344
+ return self.lcut(sentence, True)
345
+
346
+ def _lcut_for_search_no_hmm(self, sentence):
347
+ return self.lcut_for_search(sentence, False)
348
+
349
+ def get_dict_file(self):
350
+ if self.dictionary == DEFAULT_DICT:
351
+ return get_module_res(DEFAULT_DICT_NAME)
352
+ else:
353
+ return open(self.dictionary, 'rb')
354
+
355
+ def load_userdict(self, f):
356
+ '''
357
+ Load a personalized dictionary to improve the detection rate of new words.
358
+ Parameter:
359
+ - f : A plain text file that contains words and their occurrences.
360
+ Can be a file-like object, or the path of the dictionary file,
361
+ whose encoding must be utf-8.
362
+ Structure of dict file:
363
+ word1 freq1 word_type1
364
+ word2 freq2 word_type2
365
+ ...
366
+ Word type may be ignored
367
+ '''
368
+ self.check_initialized()
369
+ if isinstance(f, string_types):
370
+ f_name = f
371
+ f = open(f, 'rb')
372
+ else:
373
+ f_name = resolve_filename(f)
374
+ for lineno, ln in enumerate(f, 1):
375
+ line = ln.strip()
376
+ if not isinstance(line, text_type):
377
+ try:
378
+ line = line.decode('utf-8').lstrip('\ufeff')
379
+ except UnicodeDecodeError:
380
+ raise ValueError('dictionary file %s must be utf-8' % f_name)
381
+ if not line:
382
+ continue
383
+ # match won't be None because there's at least one character
384
+ word, freq, tag = re_userdict.match(line).groups()
385
+ if freq is not None:
386
+ freq = freq.strip()
387
+ if tag is not None:
388
+ tag = tag.strip()
389
+ self.add_word(word, freq, tag)
390
+
391
+ def add_word(self, word, freq=None, tag=None):
392
+ """
393
+ Add a word to dictionary.
394
+ freq and tag can be omitted, freq defaults to a calculated value
395
+ that ensures the word can be cut out.
396
+ """
397
+ self.check_initialized()
398
+ word = strdecode(word)
399
+ freq = int(freq) if freq is not None else self.suggest_freq(word, False)
400
+ self.FREQ[word] = freq
401
+ self.total += freq
402
+ if tag:
403
+ self.user_word_tag_tab[word] = tag
404
+ for ch in xrange(len(word)):
405
+ wfrag = word[:ch + 1]
406
+ if wfrag not in self.FREQ:
407
+ self.FREQ[wfrag] = 0
408
+
409
+ def del_word(self, word):
410
+ """
411
+ Convenient function for deleting a word.
412
+ """
413
+ self.add_word(word, 0)
414
+
415
+ def suggest_freq(self, segment, tune=False):
416
+ """
417
+ Suggest word frequency to force the characters in a word to be
418
+ joined or split.
419
+ Parameter:
420
+ - segment : The segments that the word is expected to be cut into.
421
+ If the word should be treated as a whole, use a str.
422
+ - tune : If True, tune the word frequency.
423
+ Note that HMM may affect the final result. If the result doesn't change,
424
+ set HMM=False.
425
+ """
426
+ self.check_initialized()
427
+ ftotal = float(self.total)
428
+ freq = 1
429
+ if isinstance(segment, string_types):
430
+ word = segment
431
+ for seg in self.cut(word, HMM=False):
432
+ freq *= self.FREQ.get(seg, 1) / ftotal
433
+ freq = max(int(freq * self.total) + 1, self.FREQ.get(word, 1))
434
+ else:
435
+ segment = tuple(map(strdecode, segment))
436
+ word = ''.join(segment)
437
+ for seg in segment:
438
+ freq *= self.FREQ.get(seg, 1) / ftotal
439
+ freq = min(int(freq * self.total), self.FREQ.get(word, 0))
440
+ if tune:
441
+ self.add_word(word, freq)
442
+ return freq
443
+
444
+ def tokenize(self, unicode_sentence, mode="default", HMM=True):
445
+ """
446
+ Tokenize a sentence and yield tuples of (word, start, end)
447
+ Parameter:
448
+ - sentence: the str(unicode) to be segmented.
449
+ - mode: "default" or "search", "search" is for finer segmentation.
450
+ - HMM: whether to use the Hidden Markov Model.
451
+ """
452
+ if not isinstance(unicode_sentence, text_type):
453
+ raise ValueError("jieba: the input parameter should be unicode.")
454
+ start = 0
455
+ if mode == 'default':
456
+ for w in self.cut(unicode_sentence, HMM=HMM):
457
+ width = len(w)
458
+ yield (w, start, start + width)
459
+ start += width
460
+ else:
461
+ for w in self.cut(unicode_sentence, HMM=HMM):
462
+ width = len(w)
463
+ if len(w) > 2:
464
+ for i in xrange(len(w) - 1):
465
+ gram2 = w[i:i + 2]
466
+ if self.FREQ.get(gram2):
467
+ yield (gram2, start + i, start + i + 2)
468
+ if len(w) > 3:
469
+ for i in xrange(len(w) - 2):
470
+ gram3 = w[i:i + 3]
471
+ if self.FREQ.get(gram3):
472
+ yield (gram3, start + i, start + i + 3)
473
+ yield (w, start, start + width)
474
+ start += width
475
+
476
+ def set_dictionary(self, dictionary_path):
477
+ with self.lock:
478
+ abs_path = _get_abs_path(dictionary_path)
479
+ if not os.path.isfile(abs_path):
480
+ raise Exception("jieba: file does not exist: " + abs_path)
481
+ self.dictionary = abs_path
482
+ self.initialized = False
483
+
484
+
485
+ # default Tokenizer instance
486
+
487
+ dt = Tokenizer()
488
+
489
+ # global functions
490
+
491
+ get_FREQ = lambda k, d=None: dt.FREQ.get(k, d)
492
+ add_word = dt.add_word
493
+ calc = dt.calc
494
+ cut = dt.cut
495
+ lcut = dt.lcut
496
+ cut_for_search = dt.cut_for_search
497
+ lcut_for_search = dt.lcut_for_search
498
+ del_word = dt.del_word
499
+ get_DAG = dt.get_DAG
500
+ get_dict_file = dt.get_dict_file
501
+ initialize = dt.initialize
502
+ load_userdict = dt.load_userdict
503
+ set_dictionary = dt.set_dictionary
504
+ suggest_freq = dt.suggest_freq
505
+ tokenize = dt.tokenize
506
+ user_word_tag_tab = dt.user_word_tag_tab
507
+
508
+
509
+ def _lcut_all(s):
510
+ return dt._lcut_all(s)
511
+
512
+
513
+ def _lcut(s):
514
+ return dt._lcut(s)
515
+
516
+
517
+ def _lcut_no_hmm(s):
518
+ return dt._lcut_no_hmm(s)
519
+
520
+
521
+ def _lcut_for_search(s):
522
+ return dt._lcut_for_search(s)
523
+
524
+
525
+ def _lcut_for_search_no_hmm(s):
526
+ return dt._lcut_for_search_no_hmm(s)
527
+
528
+
529
+ def _pcut(sentence, cut_all=False, HMM=True):
530
+ parts = strdecode(sentence).splitlines(True)
531
+ if cut_all:
532
+ result = pool.map(_lcut_all, parts)
533
+ elif HMM:
534
+ result = pool.map(_lcut, parts)
535
+ else:
536
+ result = pool.map(_lcut_no_hmm, parts)
537
+ for r in result:
538
+ for w in r:
539
+ yield w
540
+
541
+
542
+ def _pcut_for_search(sentence, HMM=True):
543
+ parts = strdecode(sentence).splitlines(True)
544
+ if HMM:
545
+ result = pool.map(_lcut_for_search, parts)
546
+ else:
547
+ result = pool.map(_lcut_for_search_no_hmm, parts)
548
+ for r in result:
549
+ for w in r:
550
+ yield w
551
+
552
+
553
+ def enable_parallel(processnum=None):
554
+ """
555
+ Change the module's `cut` and `cut_for_search` functions to the
556
+ parallel version.
557
+ Note that this only works using dt, custom Tokenizer
558
+ instances are not supported.
559
+ """
560
+ global pool, dt, cut, cut_for_search
561
+ from multiprocessing import cpu_count
562
+ if os.name == 'nt':
563
+ raise NotImplementedError(
564
+ "jieba: parallel mode only supports posix system")
565
+ else:
566
+ from multiprocessing import Pool
567
+ dt.check_initialized()
568
+ if processnum is None:
569
+ processnum = cpu_count()
570
+ pool = Pool(processnum)
571
+ cut = _pcut
572
+ cut_for_search = _pcut_for_search
573
+
574
+
575
+ def disable_parallel():
576
+ global pool, dt, cut, cut_for_search
577
+ if pool:
578
+ pool.close()
579
+ pool = None
580
+ cut = dt.cut
581
+ cut_for_search = dt.cut_for_search
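
The module-level names above (`cut`, `lcut`, `cut_for_search`, `add_word`, `suggest_freq`, `tokenize`, ...) are thin wrappers around the default `Tokenizer` instance `dt`. A short usage sketch; the sample sentences are illustrative and the printed segmentations are not asserted:

```python
# Illustrative use of the module-level API defined in jieba/__init__.py above.
import jieba

jieba.initialize()                                        # optional: cut() initializes lazily
print(jieba.lcut("我来到北京清华大学"))                      # accurate mode, HMM on by default
print(jieba.lcut("我来到北京清华大学", cut_all=True))         # full mode
print(jieba.lcut_for_search("小明硕士毕业于中国科学院计算所"))  # finer cuts for search engines

jieba.add_word("自定义词", freq=10, tag="n")                # extend the prefix dict at runtime
for word, start, end in jieba.tokenize("自定义词很有用"):
    print(word, start, end)
```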
jieba/__main__.py ADDED
@@ -0,0 +1,61 @@
1
+ """Jieba command line interface."""
2
+ import sys
3
+ import jieba
4
+ from argparse import ArgumentParser
5
+ from ._compat import *
6
+
7
+ parser = ArgumentParser(usage="%s -m jieba [options] filename" % sys.executable, description="Jieba command line interface.", epilog="If no filename specified, use STDIN instead.")
8
+ parser.add_argument("-d", "--delimiter", metavar="DELIM", default=' / ',
9
+ nargs='?', const=' ',
10
+ help="use DELIM instead of ' / ' for word delimiter; or a space if it is used without DELIM")
11
+ parser.add_argument("-p", "--pos", metavar="DELIM", nargs='?', const='_',
12
+ help="enable POS tagging; if DELIM is specified, use DELIM instead of '_' for POS delimiter")
13
+ parser.add_argument("-D", "--dict", help="use DICT as dictionary")
14
+ parser.add_argument("-u", "--user-dict",
15
+ help="use USER_DICT together with the default dictionary or DICT (if specified)")
16
+ parser.add_argument("-a", "--cut-all",
17
+ action="store_true", dest="cutall", default=False,
18
+ help="full pattern cutting (ignored with POS tagging)")
19
+ parser.add_argument("-n", "--no-hmm", dest="hmm", action="store_false",
20
+ default=True, help="don't use the Hidden Markov Model")
21
+ parser.add_argument("-q", "--quiet", action="store_true", default=False,
22
+ help="don't print loading messages to stderr")
23
+ parser.add_argument("-V", '--version', action='version',
24
+ version="Jieba " + jieba.__version__)
25
+ parser.add_argument("filename", nargs='?', help="input file")
26
+
27
+ args = parser.parse_args()
28
+
29
+ if args.quiet:
30
+ jieba.setLogLevel(60)
31
+ if args.pos:
32
+ import jieba.posseg
33
+ posdelim = args.pos
34
+ def cutfunc(sentence, _, HMM=True):
35
+ for w, f in jieba.posseg.cut(sentence, HMM):
36
+ yield w + posdelim + f
37
+ else:
38
+ cutfunc = jieba.cut
39
+
40
+ delim = text_type(args.delimiter)
41
+ cutall = args.cutall
42
+ hmm = args.hmm
43
+ fp = open(args.filename, 'r') if args.filename else sys.stdin
44
+
45
+ if args.dict:
46
+ jieba.initialize(args.dict)
47
+ else:
48
+ jieba.initialize()
49
+ if args.user_dict:
50
+ jieba.load_userdict(args.user_dict)
51
+
52
+ ln = fp.readline()
53
+ while ln:
54
+ l = ln.rstrip('\r\n')
55
+ result = delim.join(cutfunc(l, cutall, hmm))
56
+ if PY2:
57
+ result = result.encode(default_encoding)
58
+ print(result)
59
+ ln = fp.readline()
60
+
61
+ fp.close()
jieba/_compat.py ADDED
@@ -0,0 +1,46 @@
1
+ # -*- coding: utf-8 -*-
2
+ import os
3
+ import sys
4
+
5
+ try:
6
+ import pkg_resources
7
+ get_module_res = lambda *res: pkg_resources.resource_stream(__name__,
8
+ os.path.join(*res))
9
+ except ImportError:
10
+ get_module_res = lambda *res: open(os.path.normpath(os.path.join(
11
+ os.getcwd(), os.path.dirname(__file__), *res)), 'rb')
12
+
13
+ PY2 = sys.version_info[0] == 2
14
+
15
+ default_encoding = sys.getfilesystemencoding()
16
+
17
+ if PY2:
18
+ text_type = unicode
19
+ string_types = (str, unicode)
20
+
21
+ iterkeys = lambda d: d.iterkeys()
22
+ itervalues = lambda d: d.itervalues()
23
+ iteritems = lambda d: d.iteritems()
24
+
25
+ else:
26
+ text_type = str
27
+ string_types = (str,)
28
+ xrange = range
29
+
30
+ iterkeys = lambda d: iter(d.keys())
31
+ itervalues = lambda d: iter(d.values())
32
+ iteritems = lambda d: iter(d.items())
33
+
34
+ def strdecode(sentence):
35
+ if not isinstance(sentence, text_type):
36
+ try:
37
+ sentence = sentence.decode('utf-8')
38
+ except UnicodeDecodeError:
39
+ sentence = sentence.decode('gbk', 'ignore')
40
+ return sentence
41
+
42
+ def resolve_filename(f):
43
+ try:
44
+ return f.name
45
+ except AttributeError:
46
+ return repr(f)
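
`strdecode` above decodes bytes as UTF-8 first and falls back to GBK (ignoring errors) when that fails, while text passes through unchanged. A small demonstration of that behaviour (a sketch, not part of the commit):

```python
# strdecode behaviour as defined above: UTF-8 first, then a GBK fallback.
from jieba._compat import strdecode

print(strdecode("已经是文本"))                 # text objects are returned unchanged
print(strdecode("中文".encode("utf-8")))       # UTF-8 bytes decode cleanly
print(strdecode("中文".encode("gbk")))         # invalid UTF-8 triggers the GBK fallback
```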
jieba/analyse/__init__.py ADDED
@@ -0,0 +1,18 @@
1
+ from __future__ import absolute_import
2
+ from .tfidf import TFIDF
3
+ from .textrank import TextRank
4
+ try:
5
+ from .analyzer import ChineseAnalyzer
6
+ except ImportError:
7
+ pass
8
+
9
+ default_tfidf = TFIDF()
10
+ default_textrank = TextRank()
11
+
12
+ extract_tags = tfidf = default_tfidf.extract_tags
13
+ set_idf_path = default_tfidf.set_idf_path
14
+ textrank = default_textrank.extract_tags
15
+
16
+ def set_stop_words(stop_words_path):
17
+ default_tfidf.set_stop_words(stop_words_path)
18
+ default_textrank.set_stop_words(stop_words_path)
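
`jieba.analyse` exposes the two default extractors created above as `extract_tags` (TF-IDF) and `textrank`. A brief hedged example; the sample text, `topK`, and POS filter are illustrative:

```python
# Keyword extraction with the default TF-IDF and TextRank extractors above.
import jieba.analyse

text = "结巴分词是一个用于中文分词的 Python 库,也支持关键词提取。"
print(jieba.analyse.extract_tags(text, topK=5, withWeight=True))              # TF-IDF
print(jieba.analyse.textrank(text, topK=5, allowPOS=("ns", "n", "vn", "v")))  # TextRank
```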
jieba/analyse/analyzer.py ADDED
@@ -0,0 +1,37 @@
1
+ # encoding=utf-8
2
+ from __future__ import unicode_literals
3
+ from whoosh.analysis import RegexAnalyzer, LowercaseFilter, StopFilter, StemFilter
4
+ from whoosh.analysis import Tokenizer, Token
5
+ from whoosh.lang.porter import stem
6
+
7
+ import jieba
8
+ import re
9
+
10
+ STOP_WORDS = frozenset(('a', 'an', 'and', 'are', 'as', 'at', 'be', 'by', 'can',
11
+ 'for', 'from', 'have', 'if', 'in', 'is', 'it', 'may',
12
+ 'not', 'of', 'on', 'or', 'tbd', 'that', 'the', 'this',
13
+ 'to', 'us', 'we', 'when', 'will', 'with', 'yet',
14
+ 'you', 'your', '的', '了', '和'))
15
+
16
+ accepted_chars = re.compile(r"[\u4E00-\u9FD5]+")
17
+
18
+
19
+ class ChineseTokenizer(Tokenizer):
20
+
21
+ def __call__(self, text, **kargs):
22
+ words = jieba.tokenize(text, mode="search")
23
+ token = Token()
24
+ for (w, start_pos, stop_pos) in words:
25
+ if not accepted_chars.match(w) and len(w) <= 1:
26
+ continue
27
+ token.original = token.text = w
28
+ token.pos = start_pos
29
+ token.startchar = start_pos
30
+ token.endchar = stop_pos
31
+ yield token
32
+
33
+
34
+ def ChineseAnalyzer(stoplist=STOP_WORDS, minsize=1, stemfn=stem, cachesize=50000):
35
+ return (ChineseTokenizer() | LowercaseFilter() |
36
+ StopFilter(stoplist=stoplist, minsize=minsize) |
37
+ StemFilter(stemfn=stemfn, ignore=None, cachesize=cachesize))
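
`ChineseAnalyzer` chains the jieba tokenizer with Whoosh's lowercase, stop-word, and stemming filters. A minimal sketch of plugging it into a Whoosh schema; the index directory and field names are hypothetical:

```python
# Hypothetical sketch: indexing and searching Chinese text with ChineseAnalyzer.
import os
from whoosh.fields import Schema, TEXT, ID
from whoosh.index import create_in
from whoosh.qparser import QueryParser
from jieba.analyse import ChineseAnalyzer

schema = Schema(path=ID(stored=True), content=TEXT(analyzer=ChineseAnalyzer()))
os.makedirs("indexdir", exist_ok=True)                  # hypothetical index location
ix = create_in("indexdir", schema)
with ix.writer() as writer:
    writer.add_document(path="/doc1", content="我们正在测试中文全文检索")
with ix.searcher() as searcher:
    hits = searcher.search(QueryParser("content", ix.schema).parse("中文"))
    print([hit["path"] for hit in hits])
```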
jieba/analyse/idf.txt ADDED
The diff for this file is too large to render. See raw diff
 
jieba/analyse/textrank.py ADDED
@@ -0,0 +1,110 @@
1
+ #!/usr/bin/env python
2
+ # -*- coding: utf-8 -*-
3
+
4
+ from __future__ import absolute_import, unicode_literals
5
+ import sys
6
+ from operator import itemgetter
7
+ from collections import defaultdict
8
+ import jieba.posseg
9
+ from .tfidf import KeywordExtractor
10
+ from .._compat import *
11
+
12
+
13
+ class UndirectWeightedGraph:
14
+ d = 0.85
15
+
16
+ def __init__(self):
17
+ self.graph = defaultdict(list)
18
+
19
+ def addEdge(self, start, end, weight):
20
+ # use a tuple (start, end, weight) instead of an Edge object
21
+ self.graph[start].append((start, end, weight))
22
+ self.graph[end].append((end, start, weight))
23
+
24
+ def rank(self):
25
+ ws = defaultdict(float)
26
+ outSum = defaultdict(float)
27
+
28
+ wsdef = 1.0 / (len(self.graph) or 1.0)
29
+ for n, out in self.graph.items():
30
+ ws[n] = wsdef
31
+ outSum[n] = sum((e[2] for e in out), 0.0)
32
+
33
+ # sort the keys to make the iteration order stable
34
+ sorted_keys = sorted(self.graph.keys())
35
+ for x in xrange(10): # 10 iters
36
+ for n in sorted_keys:
37
+ s = 0
38
+ for e in self.graph[n]:
39
+ s += e[2] / outSum[e[1]] * ws[e[1]]
40
+ ws[n] = (1 - self.d) + self.d * s
41
+
42
+ (min_rank, max_rank) = (sys.float_info[0], sys.float_info[3])
43
+
44
+ for w in itervalues(ws):
45
+ if w < min_rank:
46
+ min_rank = w
47
+ if w > max_rank:
48
+ max_rank = w
49
+
50
+ for n, w in ws.items():
51
+ # normalize the weights (no need to multiply by 100)
52
+ ws[n] = (w - min_rank / 10.0) / (max_rank - min_rank / 10.0)
53
+
54
+ return ws
55
+
56
+
57
+ class TextRank(KeywordExtractor):
58
+
59
+ def __init__(self):
60
+ self.tokenizer = self.postokenizer = jieba.posseg.dt
61
+ self.stop_words = self.STOP_WORDS.copy()
62
+ self.pos_filt = frozenset(('ns', 'n', 'vn', 'v'))
63
+ self.span = 5
64
+
65
+ def pairfilter(self, wp):
66
+ return (wp.flag in self.pos_filt and len(wp.word.strip()) >= 2
67
+ and wp.word.lower() not in self.stop_words)
68
+
69
+ def textrank(self, sentence, topK=20, withWeight=False, allowPOS=('ns', 'n', 'vn', 'v'), withFlag=False):
70
+ """
71
+ Extract keywords from sentence using TextRank algorithm.
72
+ Parameter:
73
+ - topK: return how many top keywords. `None` for all possible words.
74
+ - withWeight: if True, return a list of (word, weight);
75
+ if False, return a list of words.
76
+ - allowPOS: the allowed POS list, e.g. ['ns', 'n', 'vn', 'v'].
77
+ if the POS of w is not in this list, it will be filtered.
78
+ - withFlag: if True, return a list of pair(word, weight) like posseg.cut
79
+ if False, return a list of words
80
+ """
81
+ self.pos_filt = frozenset(allowPOS)
82
+ g = UndirectWeightedGraph()
83
+ cm = defaultdict(int)
84
+ words = tuple(self.tokenizer.cut(sentence))
85
+ for i, wp in enumerate(words):
86
+ if self.pairfilter(wp):
87
+ for j in xrange(i + 1, i + self.span):
88
+ if j >= len(words):
89
+ break
90
+ if not self.pairfilter(words[j]):
91
+ continue
92
+ if allowPOS and withFlag:
93
+ cm[(wp, words[j])] += 1
94
+ else:
95
+ cm[(wp.word, words[j].word)] += 1
96
+
97
+ for terms, w in cm.items():
98
+ g.addEdge(terms[0], terms[1], w)
99
+ nodes_rank = g.rank()
100
+ if withWeight:
101
+ tags = sorted(nodes_rank.items(), key=itemgetter(1), reverse=True)
102
+ else:
103
+ tags = sorted(nodes_rank, key=nodes_rank.__getitem__, reverse=True)
104
+
105
+ if topK:
106
+ return tags[:topK]
107
+ else:
108
+ return tags
109
+
110
+ extract_tags = textrank
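
`rank()` above is a weighted, undirected PageRank with damping factor d = 0.85, run for a fixed 10 iterations and then min-max scaled. The graph class can be exercised on its own; the edge weights here are made up:

```python
# Toy run of the TextRank graph defined above (co-occurrence weights are invented).
from jieba.analyse.textrank import UndirectWeightedGraph

g = UndirectWeightedGraph()
g.addEdge("分词", "中文", 3)
g.addEdge("中文", "检索", 1)
print(sorted(g.rank().items(), key=lambda kv: kv[1], reverse=True))
```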
jieba/analyse/tfidf.py ADDED
@@ -0,0 +1,116 @@
1
+ # encoding=utf-8
2
+ from __future__ import absolute_import
3
+ import os
4
+ import jieba
5
+ import jieba.posseg
6
+ from operator import itemgetter
7
+
8
+ _get_module_path = lambda path: os.path.normpath(os.path.join(os.getcwd(),
9
+ os.path.dirname(__file__), path))
10
+ _get_abs_path = jieba._get_abs_path
11
+
12
+ DEFAULT_IDF = _get_module_path("idf.txt")
13
+
14
+
15
+ class KeywordExtractor(object):
16
+
17
+ STOP_WORDS = set((
18
+ "the", "of", "is", "and", "to", "in", "that", "we", "for", "an", "are",
19
+ "by", "be", "as", "on", "with", "can", "if", "from", "which", "you", "it",
20
+ "this", "then", "at", "have", "all", "not", "one", "has", "or", "that"
21
+ ))
22
+
23
+ def set_stop_words(self, stop_words_path):
24
+ abs_path = _get_abs_path(stop_words_path)
25
+ if not os.path.isfile(abs_path):
26
+ raise Exception("jieba: file does not exist: " + abs_path)
27
+ content = open(abs_path, 'rb').read().decode('utf-8')
28
+ for line in content.splitlines():
29
+ self.stop_words.add(line)
30
+
31
+ def extract_tags(self, *args, **kwargs):
32
+ raise NotImplementedError
33
+
34
+
35
+ class IDFLoader(object):
36
+
37
+ def __init__(self, idf_path=None):
38
+ self.path = ""
39
+ self.idf_freq = {}
40
+ self.median_idf = 0.0
41
+ if idf_path:
42
+ self.set_new_path(idf_path)
43
+
44
+ def set_new_path(self, new_idf_path):
45
+ if self.path != new_idf_path:
46
+ self.path = new_idf_path
47
+ content = open(new_idf_path, 'rb').read().decode('utf-8')
48
+ self.idf_freq = {}
49
+ for line in content.splitlines():
50
+ word, freq = line.strip().split(' ')
51
+ self.idf_freq[word] = float(freq)
52
+ self.median_idf = sorted(
53
+ self.idf_freq.values())[len(self.idf_freq) // 2]
54
+
55
+ def get_idf(self):
56
+ return self.idf_freq, self.median_idf
57
+
58
+
59
+ class TFIDF(KeywordExtractor):
60
+
61
+ def __init__(self, idf_path=None):
62
+ self.tokenizer = jieba.dt
63
+ self.postokenizer = jieba.posseg.dt
64
+ self.stop_words = self.STOP_WORDS.copy()
65
+ self.idf_loader = IDFLoader(idf_path or DEFAULT_IDF)
66
+ self.idf_freq, self.median_idf = self.idf_loader.get_idf()
67
+
68
+ def set_idf_path(self, idf_path):
69
+ new_abs_path = _get_abs_path(idf_path)
70
+ if not os.path.isfile(new_abs_path):
71
+ raise Exception("jieba: file does not exist: " + new_abs_path)
72
+ self.idf_loader.set_new_path(new_abs_path)
73
+ self.idf_freq, self.median_idf = self.idf_loader.get_idf()
74
+
75
+ def extract_tags(self, sentence, topK=20, withWeight=False, allowPOS=(), withFlag=False):
76
+ """
77
+ Extract keywords from sentence using TF-IDF algorithm.
78
+ Parameter:
79
+ - topK: return how many top keywords. `None` for all possible words.
80
+ - withWeight: if True, return a list of (word, weight);
81
+ if False, return a list of words.
82
+ - allowPOS: the allowed POS list eg. ['ns', 'n', 'vn', 'v','nr'].
83
+ if the POS of w is not in this list,it will be filtered.
84
+ - withFlag: only work with allowPOS is not empty.
85
+ if True, return a list of pair(word, weight) like posseg.cut
86
+ if False, return a list of words
87
+ """
88
+ if allowPOS:
89
+ allowPOS = frozenset(allowPOS)
90
+ words = self.postokenizer.cut(sentence)
91
+ else:
92
+ words = self.tokenizer.cut(sentence)
93
+ freq = {}
94
+ for w in words:
95
+ if allowPOS:
96
+ if w.flag not in allowPOS:
97
+ continue
98
+ elif not withFlag:
99
+ w = w.word
100
+ wc = w.word if allowPOS and withFlag else w
101
+ if len(wc.strip()) < 2 or wc.lower() in self.stop_words:
102
+ continue
103
+ freq[w] = freq.get(w, 0.0) + 1.0
104
+ total = sum(freq.values())
105
+ for k in freq:
106
+ kw = k.word if allowPOS and withFlag else k
107
+ freq[k] *= self.idf_freq.get(kw, self.median_idf) / total
108
+
109
+ if withWeight:
110
+ tags = sorted(freq.items(), key=itemgetter(1), reverse=True)
111
+ else:
112
+ tags = sorted(freq, key=freq.__getitem__, reverse=True)
113
+ if topK:
114
+ return tags[:topK]
115
+ else:
116
+ return tags
jieba/dict.txt ADDED
The diff for this file is too large to render. See raw diff
 
jieba/finalseg/__init__.py ADDED
@@ -0,0 +1,92 @@
1
+ from __future__ import absolute_import, unicode_literals
2
+ import re
3
+ import os
4
+ import sys
5
+ import pickle
6
+ from .._compat import *
7
+
8
+ MIN_FLOAT = -3.14e100
9
+
10
+ PROB_START_P = "prob_start.p"
11
+ PROB_TRANS_P = "prob_trans.p"
12
+ PROB_EMIT_P = "prob_emit.p"
13
+
14
+
15
+ PrevStatus = {
16
+ 'B': 'ES',
17
+ 'M': 'MB',
18
+ 'S': 'SE',
19
+ 'E': 'BM'
20
+ }
21
+
22
+
23
+ def load_model():
24
+ start_p = pickle.load(get_module_res("finalseg", PROB_START_P))
25
+ trans_p = pickle.load(get_module_res("finalseg", PROB_TRANS_P))
26
+ emit_p = pickle.load(get_module_res("finalseg", PROB_EMIT_P))
27
+ return start_p, trans_p, emit_p
28
+
29
+ if sys.platform.startswith("java"):
30
+ start_P, trans_P, emit_P = load_model()
31
+ else:
32
+ from .prob_start import P as start_P
33
+ from .prob_trans import P as trans_P
34
+ from .prob_emit import P as emit_P
35
+
36
+
37
+ def viterbi(obs, states, start_p, trans_p, emit_p):
38
+ V = [{}] # tabular
39
+ path = {}
40
+ for y in states: # init
41
+ V[0][y] = start_p[y] + emit_p[y].get(obs[0], MIN_FLOAT)
42
+ path[y] = [y]
43
+ for t in xrange(1, len(obs)):
44
+ V.append({})
45
+ newpath = {}
46
+ for y in states:
47
+ em_p = emit_p[y].get(obs[t], MIN_FLOAT)
48
+ (prob, state) = max(
49
+ [(V[t - 1][y0] + trans_p[y0].get(y, MIN_FLOAT) + em_p, y0) for y0 in PrevStatus[y]])
50
+ V[t][y] = prob
51
+ newpath[y] = path[state] + [y]
52
+ path = newpath
53
+
54
+ (prob, state) = max((V[len(obs) - 1][y], y) for y in 'ES')
55
+
56
+ return (prob, path[state])
57
+
58
+
59
+ def __cut(sentence):
60
+ global emit_P
61
+ prob, pos_list = viterbi(sentence, 'BMES', start_P, trans_P, emit_P)
62
+ begin, nexti = 0, 0
63
+ # print pos_list, sentence
64
+ for i, char in enumerate(sentence):
65
+ pos = pos_list[i]
66
+ if pos == 'B':
67
+ begin = i
68
+ elif pos == 'E':
69
+ yield sentence[begin:i + 1]
70
+ nexti = i + 1
71
+ elif pos == 'S':
72
+ yield char
73
+ nexti = i + 1
74
+ if nexti < len(sentence):
75
+ yield sentence[nexti:]
76
+
77
+ re_han = re.compile("([\u4E00-\u9FD5]+)")
78
+ re_skip = re.compile("(\d+\.\d+|[a-zA-Z0-9]+)")
79
+
80
+
81
+ def cut(sentence):
82
+ sentence = strdecode(sentence)
83
+ blocks = re_han.split(sentence)
84
+ for blk in blocks:
85
+ if re_han.match(blk):
86
+ for word in __cut(blk):
87
+ yield word
88
+ else:
89
+ tmp = re_skip.split(blk)
90
+ for x in tmp:
91
+ if x:
92
+ yield x
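
`finalseg` is the character-level HMM that `__cut_DAG` falls back to for fragments missing from the prefix dictionary: Viterbi decoding assigns each character a B/M/E/S state (begin, middle, end, single), and consecutive states are folded back into words. A tiny hedged example; the resulting segmentation depends on the trained tables and is not asserted:

```python
# Direct use of the HMM segmenter defined above; output is illustrative only.
from jieba import finalseg

for word in finalseg.cut("他来到了网易杭研大厦"):
    print(word)
```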
jieba/finalseg/prob_emit.p ADDED
The diff for this file is too large to render. See raw diff
 
jieba/finalseg/prob_emit.py ADDED
The diff for this file is too large to render. See raw diff
 
jieba/finalseg/prob_start.p ADDED
@@ -0,0 +1,14 @@
1
+ (dp0
2
+ S'E'
3
+ p1
4
+ F-3.14e+100
5
+ sS'S'
6
+ p2
7
+ F-1.0490863400100874
8
+ sS'B'
9
+ p3
10
+ F-0.4311793320941878
11
+ sS'M'
12
+ p4
13
+ F-3.14e+100
14
+ s.
jieba/finalseg/prob_start.py ADDED
@@ -0,0 +1,4 @@
1
+ P={'B': -0.4311793320941878,
2
+ 'E': -3.14e+100,
3
+ 'M': -3.14e+100,
4
+ 'S': -1.0490863400100874}
jieba/finalseg/prob_trans.p ADDED
@@ -0,0 +1,30 @@
1
+ (dp0
2
+ S'M'
3
+ p1
4
+ (dp2
5
+ g1
6
+ F-1.776721924369053
7
+ sS'E'
8
+ p3
9
+ F-0.18535639277522836
10
+ ssS'S'
11
+ p4
12
+ (dp5
13
+ g4
14
+ F-0.8149794471455989
15
+ sS'B'
16
+ p6
17
+ F-0.5845590441999979
18
+ ssg6
19
+ (dp7
20
+ g1
21
+ F-1.9405006828418647
22
+ sg3
23
+ F-0.15505510933264552
24
+ ssg3
25
+ (dp8
26
+ g4
27
+ F-1.0069624262712982
28
+ sg6
29
+ F-0.4546453789910586
30
+ ss.
jieba/finalseg/prob_trans.py ADDED
@@ -0,0 +1,4 @@
1
+ P={'B': {'E': -0.15505510933264552, 'M': -1.9405006828418647},
2
+ 'E': {'B': -0.4546453789910586, 'S': -1.0069624262712982},
3
+ 'M': {'E': -0.18535639277522836, 'M': -1.776721924369053},
4
+ 'S': {'B': -0.5845590441999979, 'S': -0.8149794471455989}}
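
The values in prob_start.py and prob_trans.py above are natural-log probabilities, with -3.14e+100 standing in for log(0), i.e. an impossible state or transition. A quick arithmetic check, separate from the library, that each table exponentiates back to a distribution summing to roughly 1:

```python
# Sanity check: the start and transition tables above are log-probabilities.
from math import exp

start = {'B': -0.4311793320941878, 'E': -3.14e+100,
         'M': -3.14e+100, 'S': -1.0490863400100874}
trans_from_B = {'E': -0.15505510933264552, 'M': -1.9405006828418647}

print(sum(exp(v) for v in start.values()))         # ~1.0 (E and M contribute ~0)
print(sum(exp(v) for v in trans_from_B.values()))  # ~1.0
```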
jieba/posseg/__init__.py ADDED
@@ -0,0 +1,294 @@
1
+ from __future__ import absolute_import, unicode_literals
2
+ import os
3
+ import re
4
+ import sys
5
+ import jieba
6
+ import pickle
7
+ from .._compat import *
8
+ from .viterbi import viterbi
9
+
10
+ PROB_START_P = "prob_start.p"
11
+ PROB_TRANS_P = "prob_trans.p"
12
+ PROB_EMIT_P = "prob_emit.p"
13
+ CHAR_STATE_TAB_P = "char_state_tab.p"
14
+
15
+ re_han_detail = re.compile("([\u4E00-\u9FD5]+)")
16
+ re_skip_detail = re.compile("([\.0-9]+|[a-zA-Z0-9]+)")
17
+ re_han_internal = re.compile("([\u4E00-\u9FD5a-zA-Z0-9+#&\._]+)")
18
+ re_skip_internal = re.compile("(\r\n|\s)")
19
+
20
+ re_eng = re.compile("[a-zA-Z0-9]+")
21
+ re_num = re.compile("[\.0-9]+")
22
+
23
+ re_eng1 = re.compile('^[a-zA-Z0-9]$', re.U)
24
+
25
+
26
+ def load_model():
27
+ # For Jython
28
+ start_p = pickle.load(get_module_res("posseg", PROB_START_P))
29
+ trans_p = pickle.load(get_module_res("posseg", PROB_TRANS_P))
30
+ emit_p = pickle.load(get_module_res("posseg", PROB_EMIT_P))
31
+ state = pickle.load(get_module_res("posseg", CHAR_STATE_TAB_P))
32
+ return state, start_p, trans_p, emit_p
33
+
34
+
35
+ if sys.platform.startswith("java"):
36
+ char_state_tab_P, start_P, trans_P, emit_P = load_model()
37
+ else:
38
+ from .char_state_tab import P as char_state_tab_P
39
+ from .prob_start import P as start_P
40
+ from .prob_trans import P as trans_P
41
+ from .prob_emit import P as emit_P
42
+
43
+
44
+ class pair(object):
45
+
46
+ def __init__(self, word, flag):
47
+ self.word = word
48
+ self.flag = flag
49
+
50
+ def __unicode__(self):
51
+ return '%s/%s' % (self.word, self.flag)
52
+
53
+ def __repr__(self):
54
+ return 'pair(%r, %r)' % (self.word, self.flag)
55
+
56
+ def __str__(self):
57
+ if PY2:
58
+ return self.__unicode__().encode(default_encoding)
59
+ else:
60
+ return self.__unicode__()
61
+
62
+ def __iter__(self):
63
+ return iter((self.word, self.flag))
64
+
65
+ def __lt__(self, other):
66
+ return self.word < other.word
67
+
68
+ def __eq__(self, other):
69
+ return isinstance(other, pair) and self.word == other.word and self.flag == other.flag
70
+
71
+ def __hash__(self):
72
+ return hash(self.word)
73
+
74
+ def encode(self, arg):
75
+ return self.__unicode__().encode(arg)
76
+
77
+
78
+ class POSTokenizer(object):
79
+
80
+ def __init__(self, tokenizer=None):
81
+ self.tokenizer = tokenizer or jieba.Tokenizer()
82
+ self.load_word_tag(self.tokenizer.get_dict_file())
83
+
84
+ def __repr__(self):
85
+ return '<POSTokenizer tokenizer=%r>' % self.tokenizer
86
+
87
+ def __getattr__(self, name):
88
+ if name in ('cut_for_search', 'lcut_for_search', 'tokenize'):
89
+ # may be possible?
90
+ raise NotImplementedError
91
+ return getattr(self.tokenizer, name)
92
+
93
+ def initialize(self, dictionary=None):
94
+ self.tokenizer.initialize(dictionary)
95
+ self.load_word_tag(self.tokenizer.get_dict_file())
96
+
97
+ def load_word_tag(self, f):
98
+ self.word_tag_tab = {}
99
+ f_name = resolve_filename(f)
100
+ for lineno, line in enumerate(f, 1):
101
+ try:
102
+ line = line.strip().decode("utf-8")
103
+ if not line:
104
+ continue
105
+ word, _, tag = line.split(" ")
106
+ self.word_tag_tab[word] = tag
107
+ except Exception:
108
+ raise ValueError(
109
+ 'invalid POS dictionary entry in %s at Line %s: %s' % (f_name, lineno, line))
110
+ f.close()
111
+
112
+ def makesure_userdict_loaded(self):
113
+ if self.tokenizer.user_word_tag_tab:
114
+ self.word_tag_tab.update(self.tokenizer.user_word_tag_tab)
115
+ self.tokenizer.user_word_tag_tab = {}
116
+
117
+ def __cut(self, sentence):
118
+ prob, pos_list = viterbi(
119
+ sentence, char_state_tab_P, start_P, trans_P, emit_P)
120
+ begin, nexti = 0, 0
121
+
122
+ for i, char in enumerate(sentence):
123
+ pos = pos_list[i][0]
124
+ if pos == 'B':
125
+ begin = i
126
+ elif pos == 'E':
127
+ yield pair(sentence[begin:i + 1], pos_list[i][1])
128
+ nexti = i + 1
129
+ elif pos == 'S':
130
+ yield pair(char, pos_list[i][1])
131
+ nexti = i + 1
132
+ if nexti < len(sentence):
133
+ yield pair(sentence[nexti:], pos_list[nexti][1])
134
+
135
+ def __cut_detail(self, sentence):
136
+ blocks = re_han_detail.split(sentence)
137
+ for blk in blocks:
138
+ if re_han_detail.match(blk):
139
+ for word in self.__cut(blk):
140
+ yield word
141
+ else:
142
+ tmp = re_skip_detail.split(blk)
143
+ for x in tmp:
144
+ if x:
145
+ if re_num.match(x):
146
+ yield pair(x, 'm')
147
+ elif re_eng.match(x):
148
+ yield pair(x, 'eng')
149
+ else:
150
+ yield pair(x, 'x')
151
+
152
+ def __cut_DAG_NO_HMM(self, sentence):
153
+ DAG = self.tokenizer.get_DAG(sentence)
154
+ route = {}
155
+ self.tokenizer.calc(sentence, DAG, route)
156
+ x = 0
157
+ N = len(sentence)
158
+ buf = ''
159
+ while x < N:
160
+ y = route[x][1] + 1
161
+ l_word = sentence[x:y]
162
+ if re_eng1.match(l_word):
163
+ buf += l_word
164
+ x = y
165
+ else:
166
+ if buf:
167
+ yield pair(buf, 'eng')
168
+ buf = ''
169
+ yield pair(l_word, self.word_tag_tab.get(l_word, 'x'))
170
+ x = y
171
+ if buf:
172
+ yield pair(buf, 'eng')
173
+ buf = ''
174
+
175
+ def __cut_DAG(self, sentence):
176
+ DAG = self.tokenizer.get_DAG(sentence)
177
+ route = {}
178
+
179
+ self.tokenizer.calc(sentence, DAG, route)
180
+
181
+ x = 0
182
+ buf = ''
183
+ N = len(sentence)
184
+ while x < N:
185
+ y = route[x][1] + 1
186
+ l_word = sentence[x:y]
187
+ if y - x == 1:
188
+ buf += l_word
189
+ else:
190
+ if buf:
191
+ if len(buf) == 1:
192
+ yield pair(buf, self.word_tag_tab.get(buf, 'x'))
193
+ elif not self.tokenizer.FREQ.get(buf):
194
+ recognized = self.__cut_detail(buf)
195
+ for t in recognized:
196
+ yield t
197
+ else:
198
+ for elem in buf:
199
+ yield pair(elem, self.word_tag_tab.get(elem, 'x'))
200
+ buf = ''
201
+ yield pair(l_word, self.word_tag_tab.get(l_word, 'x'))
202
+ x = y
203
+
204
+ if buf:
205
+ if len(buf) == 1:
206
+ yield pair(buf, self.word_tag_tab.get(buf, 'x'))
207
+ elif not self.tokenizer.FREQ.get(buf):
208
+ recognized = self.__cut_detail(buf)
209
+ for t in recognized:
210
+ yield t
211
+ else:
212
+ for elem in buf:
213
+ yield pair(elem, self.word_tag_tab.get(elem, 'x'))
214
+
215
+ def __cut_internal(self, sentence, HMM=True):
216
+ self.makesure_userdict_loaded()
217
+ sentence = strdecode(sentence)
218
+ blocks = re_han_internal.split(sentence)
219
+ if HMM:
220
+ cut_blk = self.__cut_DAG
221
+ else:
222
+ cut_blk = self.__cut_DAG_NO_HMM
223
+
224
+ for blk in blocks:
225
+ if re_han_internal.match(blk):
226
+ for word in cut_blk(blk):
227
+ yield word
228
+ else:
229
+ tmp = re_skip_internal.split(blk)
230
+ for x in tmp:
231
+ if re_skip_internal.match(x):
232
+ yield pair(x, 'x')
233
+ else:
234
+ for xx in x:
235
+ if re_num.match(xx):
236
+ yield pair(xx, 'm')
237
+ elif re_eng.match(x):
238
+ yield pair(xx, 'eng')
239
+ else:
240
+ yield pair(xx, 'x')
241
+
242
+ def _lcut_internal(self, sentence):
243
+ return list(self.__cut_internal(sentence))
244
+
245
+ def _lcut_internal_no_hmm(self, sentence):
246
+ return list(self.__cut_internal(sentence, False))
247
+
248
+ def cut(self, sentence, HMM=True):
249
+ for w in self.__cut_internal(sentence, HMM=HMM):
250
+ yield w
251
+
252
+ def lcut(self, *args, **kwargs):
253
+ return list(self.cut(*args, **kwargs))
254
+
255
+ # default Tokenizer instance
256
+
257
+ dt = POSTokenizer(jieba.dt)
258
+
259
+ # global functions
260
+
261
+ initialize = dt.initialize
262
+
263
+
264
+ def _lcut_internal(s):
265
+ return dt._lcut_internal(s)
266
+
267
+
268
+ def _lcut_internal_no_hmm(s):
269
+ return dt._lcut_internal_no_hmm(s)
270
+
271
+
272
+ def cut(sentence, HMM=True):
273
+ """
274
+ Global `cut` function that supports parallel processing.
275
+ Note that this only works using dt, custom POSTokenizer
276
+ instances are not supported.
277
+ """
278
+ global dt
279
+ if jieba.pool is None:
280
+ for w in dt.cut(sentence, HMM=HMM):
281
+ yield w
282
+ else:
283
+ parts = strdecode(sentence).splitlines(True)
284
+ if HMM:
285
+ result = jieba.pool.map(_lcut_internal, parts)
286
+ else:
287
+ result = jieba.pool.map(_lcut_internal_no_hmm, parts)
288
+ for r in result:
289
+ for w in r:
290
+ yield w
291
+
292
+
293
+ def lcut(sentence, HMM=True):
294
+ return list(cut(sentence, HMM))
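
`posseg.cut` yields `pair(word, flag)` objects; the flag comes from the dictionary's tag column or, for unknown fragments, from the joint word/POS HMM decoded in `__cut`. A short usage sketch; the sentence and the tags it produces are illustrative:

```python
# POS tagging with the default POSTokenizer instance defined above.
import jieba.posseg as pseg

for w in pseg.cut("我爱北京天安门"):
    print(w.word, w.flag)          # pair objects also unpack as (word, flag)

print(pseg.lcut("我爱北京天安门", HMM=False))
```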
jieba/posseg/char_state_tab.p ADDED
The diff for this file is too large to render. See raw diff
 
jieba/posseg/char_state_tab.py ADDED
The diff for this file is too large to render. See raw diff
 
jieba/posseg/prob_emit.p ADDED
The diff for this file is too large to render. See raw diff
 
jieba/posseg/prob_emit.py ADDED
The diff for this file is too large to render. See raw diff
 
jieba/posseg/prob_start.p ADDED
@@ -0,0 +1,1094 @@
1
+ (dp0
2
+ (S'B'
3
+ p1
4
+ S'a'
5
+ p2
6
+ tp3
7
+ F-4.762305214596967
8
+ s(g1
9
+ S'ad'
10
+ p4
11
+ tp5
12
+ F-6.680066036784177
13
+ s(g1
14
+ S'ag'
15
+ p6
16
+ tp7
17
+ F-3.14e+100
18
+ s(g1
19
+ S'an'
20
+ p8
21
+ tp9
22
+ F-8.697083223018778
23
+ s(g1
24
+ S'b'
25
+ p10
26
+ tp11
27
+ F-5.018374362109218
28
+ s(g1
29
+ S'bg'
30
+ p12
31
+ tp13
32
+ F-3.14e+100
33
+ s(g1
34
+ S'c'
35
+ p14
36
+ tp15
37
+ F-3.423880184954888
38
+ s(g1
39
+ S'd'
40
+ p16
41
+ tp17
42
+ F-3.9750475297585357
43
+ s(g1
44
+ S'df'
45
+ p18
46
+ tp19
47
+ F-8.888974230828882
48
+ s(g1
49
+ S'dg'
50
+ p20
51
+ tp21
52
+ F-3.14e+100
53
+ s(g1
54
+ S'e'
55
+ p22
56
+ tp23
57
+ F-8.563551830394255
58
+ s(g1
59
+ S'en'
60
+ p24
61
+ tp25
62
+ F-3.14e+100
63
+ s(g1
64
+ S'f'
65
+ p26
66
+ tp27
67
+ F-5.491630418482717
68
+ s(g1
69
+ S'g'
70
+ p28
71
+ tp29
72
+ F-3.14e+100
73
+ s(g1
74
+ S'h'
75
+ p30
76
+ tp31
77
+ F-13.533365129970255
78
+ s(g1
79
+ S'i'
80
+ p32
81
+ tp33
82
+ F-6.1157847275557105
83
+ s(g1
84
+ S'in'
85
+ p34
86
+ tp35
87
+ F-3.14e+100
88
+ s(g1
89
+ S'j'
90
+ p36
91
+ tp37
92
+ F-5.0576191284681915
93
+ s(g1
94
+ S'jn'
95
+ p38
96
+ tp39
97
+ F-3.14e+100
98
+ s(g1
99
+ S'k'
100
+ p40
101
+ tp41
102
+ F-3.14e+100
103
+ s(g1
104
+ S'l'
105
+ p42
106
+ tp43
107
+ F-4.905883584659895
108
+ s(g1
109
+ S'ln'
110
+ p44
111
+ tp45
112
+ F-3.14e+100
113
+ s(g1
114
+ S'm'
115
+ p46
116
+ tp47
117
+ F-3.6524299819046386
118
+ s(g1
119
+ S'mg'
120
+ p48
121
+ tp49
122
+ F-3.14e+100
123
+ s(g1
124
+ S'mq'
125
+ p50
126
+ tp51
127
+ F-6.78695300139688
128
+ s(g1
129
+ S'n'
130
+ p52
131
+ tp53
132
+ F-1.6966257797548328
133
+ s(g1
134
+ S'ng'
135
+ p54
136
+ tp55
137
+ F-3.14e+100
138
+ s(g1
139
+ S'nr'
140
+ p56
141
+ tp57
142
+ F-2.2310495913769506
143
+ s(g1
144
+ S'nrfg'
145
+ p58
146
+ tp59
147
+ F-5.873722175405573
148
+ s(g1
149
+ S'nrt'
150
+ p60
151
+ tp61
152
+ F-4.985642733519195
153
+ s(g1
154
+ S'ns'
155
+ p62
156
+ tp63
157
+ F-2.8228438314969213
158
+ s(g1
159
+ S'nt'
160
+ p64
161
+ tp65
162
+ F-4.846091668182416
163
+ s(g1
164
+ S'nz'
165
+ p66
166
+ tp67
167
+ F-3.94698846057672
168
+ s(g1
169
+ S'o'
170
+ p68
171
+ tp69
172
+ F-8.433498702146057
173
+ s(g1
174
+ S'p'
175
+ p70
176
+ tp71
177
+ F-4.200984132085048
178
+ s(g1
179
+ S'q'
180
+ p72
181
+ tp73
182
+ F-6.998123858956596
183
+ s(g1
184
+ S'qe'
185
+ p74
186
+ tp75
187
+ F-3.14e+100
188
+ s(g1
189
+ S'qg'
190
+ p76
191
+ tp77
192
+ F-3.14e+100
193
+ s(g1
194
+ S'r'
195
+ p78
196
+ tp79
197
+ F-3.4098187790818413
198
+ s(g1
199
+ S'rg'
200
+ p80
201
+ tp81
202
+ F-3.14e+100
203
+ s(g1
204
+ S'rr'
205
+ p82
206
+ tp83
207
+ F-12.434752841302146
208
+ s(g1
209
+ S'rz'
210
+ p84
211
+ tp85
212
+ F-7.946116471570005
213
+ s(g1
214
+ S's'
215
+ p86
216
+ tp87
217
+ F-5.522673590839954
218
+ s(g1
219
+ S't'
220
+ p88
221
+ tp89
222
+ F-3.3647479094528574
223
+ s(g1
224
+ S'tg'
225
+ p90
226
+ tp91
227
+ F-3.14e+100
228
+ s(g1
229
+ S'u'
230
+ p92
231
+ tp93
232
+ F-9.163917277503234
233
+ s(g1
234
+ S'ud'
235
+ p94
236
+ tp95
237
+ F-3.14e+100
238
+ s(g1
239
+ S'ug'
240
+ p96
241
+ tp97
242
+ F-3.14e+100
243
+ s(g1
244
+ S'uj'
245
+ p98
246
+ tp99
247
+ F-3.14e+100
248
+ s(g1
249
+ S'ul'
250
+ p100
251
+ tp101
252
+ F-3.14e+100
253
+ s(g1
254
+ S'uv'
255
+ p102
256
+ tp103
257
+ F-3.14e+100
258
+ s(g1
259
+ S'uz'
260
+ p104
261
+ tp105
262
+ F-3.14e+100
263
+ s(g1
264
+ S'v'
265
+ p106
266
+ tp107
267
+ F-2.6740584874265685
268
+ s(g1
269
+ S'vd'
270
+ p108
271
+ tp109
272
+ F-9.044728760238115
273
+ s(g1
274
+ S'vg'
275
+ p110
276
+ tp111
277
+ F-3.14e+100
278
+ s(g1
279
+ S'vi'
280
+ p112
281
+ tp113
282
+ F-12.434752841302146
283
+ s(g1
284
+ S'vn'
285
+ p114
286
+ tp115
287
+ F-4.3315610890163585
288
+ s(g1
289
+ S'vq'
290
+ p116
291
+ tp117
292
+ F-12.147070768850364
293
+ s(g1
294
+ S'w'
295
+ p118
296
+ tp119
297
+ F-3.14e+100
298
+ s(g1
299
+ S'x'
300
+ p120
301
+ tp121
302
+ F-3.14e+100
303
+ s(g1
304
+ S'y'
305
+ p122
306
+ tp123
307
+ F-9.844485675856319
308
+ s(g1
309
+ S'yg'
310
+ p124
311
+ tp125
312
+ F-3.14e+100
313
+ s(g1
314
+ S'z'
315
+ p126
316
+ tp127
317
+ F-7.045681111485645
318
+ s(g1
319
+ S'zg'
320
+ p128
321
+ tp129
322
+ F-3.14e+100
323
+ s(S'E'
324
+ p130
325
+ g2
326
+ tp131
327
+ F-3.14e+100
328
+ s(g130
329
+ g4
330
+ tp132
331
+ F-3.14e+100
332
+ s(g130
333
+ g6
334
+ tp133
335
+ F-3.14e+100
336
+ s(g130
337
+ g8
338
+ tp134
339
+ F-3.14e+100
340
+ s(g130
341
+ g10
342
+ tp135
343
+ F-3.14e+100
344
+ s(g130
345
+ g12
346
+ tp136
347
+ F-3.14e+100
348
+ s(g130
349
+ g14
350
+ tp137
351
+ F-3.14e+100
352
+ s(g130
353
+ g16
354
+ tp138
355
+ F-3.14e+100
356
+ s(g130
357
+ g18
358
+ tp139
359
+ F-3.14e+100
360
+ s(g130
361
+ g20
362
+ tp140
363
+ F-3.14e+100
364
+ s(g130
365
+ g22
366
+ tp141
367
+ F-3.14e+100
368
+ s(g130
369
+ g24
370
+ tp142
371
+ F-3.14e+100
372
+ s(g130
373
+ g26
374
+ tp143
375
+ F-3.14e+100
376
+ s(g130
377
+ g28
378
+ tp144
379
+ F-3.14e+100
380
+ s(g130
381
+ g30
382
+ tp145
383
+ F-3.14e+100
384
+ s(g130
385
+ g32
386
+ tp146
387
+ F-3.14e+100
388
+ s(g130
389
+ g34
390
+ tp147
391
+ F-3.14e+100
392
+ s(g130
393
+ g36
394
+ tp148
395
+ F-3.14e+100
396
+ s(g130
397
+ g38
398
+ tp149
399
+ F-3.14e+100
400
+ s(g130
401
+ g40
402
+ tp150
403
+ F-3.14e+100
404
+ s(g130
405
+ g42
406
+ tp151
407
+ F-3.14e+100
408
+ s(g130
409
+ g44
410
+ tp152
411
+ F-3.14e+100
412
+ s(g130
413
+ g46
414
+ tp153
415
+ F-3.14e+100
416
+ s(g130
417
+ g48
418
+ tp154
419
+ F-3.14e+100
420
+ s(g130
421
+ g50
422
+ tp155
423
+ F-3.14e+100
424
+ s(g130
425
+ g52
426
+ tp156
427
+ F-3.14e+100
428
+ s(g130
429
+ g54
430
+ tp157
431
+ F-3.14e+100
432
+ s(g130
433
+ g56
434
+ tp158
435
+ F-3.14e+100
436
+ s(g130
437
+ g58
438
+ tp159
439
+ F-3.14e+100
440
+ s(g130
441
+ g60
442
+ tp160
443
+ F-3.14e+100
444
+ s(g130
445
+ g62
446
+ tp161
447
+ F-3.14e+100
448
+ s(g130
449
+ g64
450
+ tp162
451
+ F-3.14e+100
452
+ s(g130
453
+ g66
454
+ tp163
455
+ F-3.14e+100
456
+ s(g130
457
+ g68
458
+ tp164
459
+ F-3.14e+100
460
+ s(g130
461
+ g70
462
+ tp165
463
+ F-3.14e+100
464
+ s(g130
465
+ g72
466
+ tp166
467
+ F-3.14e+100
468
+ s(g130
469
+ g74
470
+ tp167
471
+ F-3.14e+100
472
+ s(g130
473
+ g76
474
+ tp168
475
+ F-3.14e+100
476
+ s(g130
477
+ g78
478
+ tp169
479
+ F-3.14e+100
480
+ s(g130
481
+ g80
482
+ tp170
483
+ F-3.14e+100
484
+ s(g130
485
+ g82
486
+ tp171
487
+ F-3.14e+100
488
+ s(g130
489
+ g84
490
+ tp172
491
+ F-3.14e+100
492
+ s(g130
493
+ g86
494
+ tp173
495
+ F-3.14e+100
496
+ s(g130
497
+ g88
498
+ tp174
499
+ F-3.14e+100
500
+ s(g130
501
+ g90
502
+ tp175
503
+ F-3.14e+100
504
+ s(g130
505
+ g92
506
+ tp176
507
+ F-3.14e+100
508
+ s(g130
509
+ g94
510
+ tp177
511
+ F-3.14e+100
512
+ s(g130
513
+ g96
514
+ tp178
515
+ F-3.14e+100
516
+ s(g130
517
+ g98
518
+ tp179
519
+ F-3.14e+100
520
+ s(g130
521
+ g100
522
+ tp180
523
+ F-3.14e+100
524
+ s(g130
525
+ g102
526
+ tp181
527
+ F-3.14e+100
528
+ s(g130
529
+ g104
530
+ tp182
531
+ F-3.14e+100
532
+ s(g130
533
+ g106
534
+ tp183
535
+ F-3.14e+100
536
+ s(g130
537
+ g108
538
+ tp184
539
+ F-3.14e+100
540
+ s(g130
541
+ g110
542
+ tp185
543
+ F-3.14e+100
544
+ s(g130
545
+ g112
546
+ tp186
547
+ F-3.14e+100
548
+ s(g130
549
+ g114
550
+ tp187
551
+ F-3.14e+100
552
+ s(g130
553
+ g116
554
+ tp188
555
+ F-3.14e+100
556
+ s(g130
557
+ g118
558
+ tp189
559
+ F-3.14e+100
560
+ s(g130
561
+ g120
562
+ tp190
563
+ F-3.14e+100
564
+ s(g130
565
+ g122
566
+ tp191
567
+ F-3.14e+100
568
+ s(g130
569
+ g124
570
+ tp192
571
+ F-3.14e+100
572
+ s(g130
573
+ g126
574
+ tp193
575
+ F-3.14e+100
576
+ s(g130
577
+ g128
578
+ tp194
579
+ F-3.14e+100
580
+ s(S'M'
581
+ p195
582
+ g2
583
+ tp196
584
+ F-3.14e+100
585
+ s(g195
586
+ g4
587
+ tp197
588
+ F-3.14e+100
589
+ s(g195
590
+ g6
591
+ tp198
592
+ F-3.14e+100
593
+ s(g195
594
+ g8
595
+ tp199
596
+ F-3.14e+100
597
+ s(g195
598
+ g10
599
+ tp200
600
+ F-3.14e+100
601
+ s(g195
602
+ g12
603
+ tp201
604
+ F-3.14e+100
605
+ s(g195
606
+ g14
607
+ tp202
608
+ F-3.14e+100
609
+ s(g195
610
+ g16
611
+ tp203
612
+ F-3.14e+100
613
+ s(g195
614
+ g18
615
+ tp204
616
+ F-3.14e+100
617
+ s(g195
618
+ g20
619
+ tp205
620
+ F-3.14e+100
621
+ s(g195
622
+ g22
623
+ tp206
624
+ F-3.14e+100
625
+ s(g195
626
+ g24
627
+ tp207
628
+ F-3.14e+100
629
+ s(g195
630
+ g26
631
+ tp208
632
+ F-3.14e+100
633
+ s(g195
634
+ g28
635
+ tp209
636
+ F-3.14e+100
637
+ s(g195
638
+ g30
639
+ tp210
640
+ F-3.14e+100
641
+ s(g195
642
+ g32
643
+ tp211
644
+ F-3.14e+100
645
+ s(g195
646
+ g34
647
+ tp212
648
+ F-3.14e+100
649
+ s(g195
650
+ g36
651
+ tp213
652
+ F-3.14e+100
653
+ s(g195
654
+ g38
655
+ tp214
656
+ F-3.14e+100
657
+ s(g195
658
+ g40
659
+ tp215
660
+ F-3.14e+100
661
+ s(g195
662
+ g42
663
+ tp216
664
+ F-3.14e+100
665
+ s(g195
666
+ g44
667
+ tp217
668
+ F-3.14e+100
669
+ s(g195
670
+ g46
671
+ tp218
672
+ F-3.14e+100
673
+ s(g195
674
+ g48
675
+ tp219
676
+ F-3.14e+100
677
+ s(g195
678
+ g50
679
+ tp220
680
+ F-3.14e+100
681
+ s(g195
682
+ g52
683
+ tp221
684
+ F-3.14e+100
685
+ s(g195
686
+ g54
687
+ tp222
688
+ F-3.14e+100
689
+ s(g195
690
+ g56
691
+ tp223
692
+ F-3.14e+100
693
+ s(g195
694
+ g58
695
+ tp224
696
+ F-3.14e+100
697
+ s(g195
698
+ g60
699
+ tp225
700
+ F-3.14e+100
701
+ s(g195
702
+ g62
703
+ tp226
704
+ F-3.14e+100
705
+ s(g195
706
+ g64
707
+ tp227
708
+ F-3.14e+100
709
+ s(g195
710
+ g66
711
+ tp228
712
+ F-3.14e+100
713
+ s(g195
714
+ g68
715
+ tp229
716
+ F-3.14e+100
717
+ s(g195
718
+ g70
719
+ tp230
720
+ F-3.14e+100
721
+ s(g195
722
+ g72
723
+ tp231
724
+ F-3.14e+100
725
+ s(g195
726
+ g74
727
+ tp232
728
+ F-3.14e+100
729
+ s(g195
730
+ g76
731
+ tp233
732
+ F-3.14e+100
733
+ s(g195
734
+ g78
735
+ tp234
736
+ F-3.14e+100
737
+ s(g195
738
+ g80
739
+ tp235
740
+ F-3.14e+100
741
+ s(g195
742
+ g82
743
+ tp236
744
+ F-3.14e+100
745
+ s(g195
746
+ g84
747
+ tp237
748
+ F-3.14e+100
749
+ s(g195
750
+ g86
751
+ tp238
752
+ F-3.14e+100
753
+ s(g195
754
+ g88
755
+ tp239
756
+ F-3.14e+100
757
+ s(g195
758
+ g90
759
+ tp240
760
+ F-3.14e+100
761
+ s(g195
762
+ g92
763
+ tp241
764
+ F-3.14e+100
765
+ s(g195
766
+ g94
767
+ tp242
768
+ F-3.14e+100
769
+ s(g195
770
+ g96
771
+ tp243
772
+ F-3.14e+100
773
+ s(g195
774
+ g98
775
+ tp244
776
+ F-3.14e+100
777
+ s(g195
778
+ g100
779
+ tp245
780
+ F-3.14e+100
781
+ s(g195
782
+ g102
783
+ tp246
784
+ F-3.14e+100
785
+ s(g195
786
+ g104
787
+ tp247
788
+ F-3.14e+100
789
+ s(g195
790
+ g106
791
+ tp248
792
+ F-3.14e+100
793
+ s(g195
794
+ g108
795
+ tp249
796
+ F-3.14e+100
797
+ s(g195
798
+ g110
799
+ tp250
800
+ F-3.14e+100
801
+ s(g195
802
+ g112
803
+ tp251
804
+ F-3.14e+100
805
+ s(g195
806
+ g114
807
+ tp252
808
+ F-3.14e+100
809
+ s(g195
810
+ g116
811
+ tp253
812
+ F-3.14e+100
813
+ s(g195
814
+ g118
815
+ tp254
816
+ F-3.14e+100
817
+ s(g195
818
+ g120
819
+ tp255
820
+ F-3.14e+100
821
+ s(g195
822
+ g122
823
+ tp256
824
+ F-3.14e+100
825
+ s(g195
826
+ g124
827
+ tp257
828
+ F-3.14e+100
829
+ s(g195
830
+ g126
831
+ tp258
832
+ F-3.14e+100
833
+ s(g195
834
+ g128
835
+ tp259
836
+ F-3.14e+100
837
+ s(S'S'
838
+ p260
839
+ g2
840
+ tp261
841
+ F-3.9025396831295227
842
+ s(g260
843
+ g4
844
+ tp262
845
+ F-11.048458480182255
846
+ s(g260
847
+ g6
848
+ tp263
849
+ F-6.954113917960154
850
+ s(g260
851
+ g8
852
+ tp264
853
+ F-12.84021794941031
854
+ s(g260
855
+ g10
856
+ tp265
857
+ F-6.472888763970454
858
+ s(g260
859
+ g12
860
+ tp266
861
+ F-3.14e+100
862
+ s(g260
863
+ g14
864
+ tp267
865
+ F-4.786966795861212
866
+ s(g260
867
+ g16
868
+ tp268
869
+ F-3.903919764181873
870
+ s(g260
871
+ g18
872
+ tp269
873
+ F-3.14e+100
874
+ s(g260
875
+ g20
876
+ tp270
877
+ F-8.948397651299683
878
+ s(g260
879
+ g22
880
+ tp271
881
+ F-5.942513006281674
882
+ s(g260
883
+ g24
884
+ tp272
885
+ F-3.14e+100
886
+ s(g260
887
+ g26
888
+ tp273
889
+ F-5.194820249981676
890
+ s(g260
891
+ g28
892
+ tp274
893
+ F-6.507826815331734
894
+ s(g260
895
+ g30
896
+ tp275
897
+ F-8.650563207383884
898
+ s(g260
899
+ g32
900
+ tp276
901
+ F-3.14e+100
902
+ s(g260
903
+ g34
904
+ tp277
905
+ F-3.14e+100
906
+ s(g260
907
+ g36
908
+ tp278
909
+ F-4.911992119644354
910
+ s(g260
911
+ g38
912
+ tp279
913
+ F-3.14e+100
914
+ s(g260
915
+ g40
916
+ tp280
917
+ F-6.940320595827818
918
+ s(g260
919
+ g42
920
+ tp281
921
+ F-3.14e+100
922
+ s(g260
923
+ g44
924
+ tp282
925
+ F-3.14e+100
926
+ s(g260
927
+ g46
928
+ tp283
929
+ F-3.269200652116097
930
+ s(g260
931
+ g48
932
+ tp284
933
+ F-10.825314928868044
934
+ s(g260
935
+ g50
936
+ tp285
937
+ F-3.14e+100
938
+ s(g260
939
+ g52
940
+ tp286
941
+ F-3.8551483897645107
942
+ s(g260
943
+ g54
944
+ tp287
945
+ F-4.913434861102905
946
+ s(g260
947
+ g56
948
+ tp288
949
+ F-4.483663103956885
950
+ s(g260
951
+ g58
952
+ tp289
953
+ F-3.14e+100
954
+ s(g260
955
+ g60
956
+ tp290
957
+ F-3.14e+100
958
+ s(g260
959
+ g62
960
+ tp291
961
+ F-3.14e+100
962
+ s(g260
963
+ g64
964
+ tp292
965
+ F-12.147070768850364
966
+ s(g260
967
+ g66
968
+ tp293
969
+ F-3.14e+100
970
+ s(g260
971
+ g68
972
+ tp294
973
+ F-8.464460927750023
974
+ s(g260
975
+ g70
976
+ tp295
977
+ F-2.9868401813596317
978
+ s(g260
979
+ g72
980
+ tp296
981
+ F-4.888658618255058
982
+ s(g260
983
+ g74
984
+ tp297
985
+ F-3.14e+100
986
+ s(g260
987
+ g76
988
+ tp298
989
+ F-3.14e+100
990
+ s(g260
991
+ g78
992
+ tp299
993
+ F-2.7635336784127853
994
+ s(g260
995
+ g80
996
+ tp300
997
+ F-10.275268591948773
998
+ s(g260
999
+ g82
1000
+ tp301
1001
+ F-3.14e+100
1002
+ s(g260
1003
+ g84
1004
+ tp302
1005
+ F-3.14e+100
1006
+ s(g260
1007
+ g86
1008
+ tp303
1009
+ F-3.14e+100
1010
+ s(g260
1011
+ g88
1012
+ tp304
1013
+ F-3.14e+100
1014
+ s(g260
1015
+ g90
1016
+ tp305
1017
+ F-6.272842531880403
1018
+ s(g260
1019
+ g92
1020
+ tp306
1021
+ F-6.940320595827818
1022
+ s(g260
1023
+ g94
1024
+ tp307
1025
+ F-7.728230161053767
1026
+ s(g260
1027
+ g96
1028
+ tp308
1029
+ F-7.5394037026636855
1030
+ s(g260
1031
+ g98
1032
+ tp309
1033
+ F-6.85251045118004
1034
+ s(g260
1035
+ g100
1036
+ tp310
1037
+ F-8.4153713175535
1038
+ s(g260
1039
+ g102
1040
+ tp311
1041
+ F-8.15808672228609
1042
+ s(g260
1043
+ g104
1044
+ tp312
1045
+ F-9.299258625372996
1046
+ s(g260
1047
+ g106
1048
+ tp313
1049
+ F-3.053292303412302
1050
+ s(g260
1051
+ g108
1052
+ tp314
1053
+ F-3.14e+100
1054
+ s(g260
1055
+ g110
1056
+ tp315
1057
+ F-5.9430181843676895
1058
+ s(g260
1059
+ g112
1060
+ tp316
1061
+ F-3.14e+100
1062
+ s(g260
1063
+ g114
1064
+ tp317
1065
+ F-11.453923588290419
1066
+ s(g260
1067
+ g116
1068
+ tp318
1069
+ F-3.14e+100
1070
+ s(g260
1071
+ g118
1072
+ tp319
1073
+ F-3.14e+100
1074
+ s(g260
1075
+ g120
1076
+ tp320
1077
+ F-8.427419656069674
1078
+ s(g260
1079
+ g122
1080
+ tp321
1081
+ F-6.1970794699489575
1082
+ s(g260
1083
+ g124
1084
+ tp322
1085
+ F-13.533365129970255
1086
+ s(g260
1087
+ g126
1088
+ tp323
1089
+ F-3.14e+100
1090
+ s(g260
1091
+ g128
1092
+ tp324
1093
+ F-3.14e+100
1094
+ s.
jieba/posseg/prob_start.py ADDED
@@ -0,0 +1,256 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ P={('B', 'a'): -4.762305214596967,
2
+ ('B', 'ad'): -6.680066036784177,
3
+ ('B', 'ag'): -3.14e+100,
4
+ ('B', 'an'): -8.697083223018778,
5
+ ('B', 'b'): -5.018374362109218,
6
+ ('B', 'bg'): -3.14e+100,
7
+ ('B', 'c'): -3.423880184954888,
8
+ ('B', 'd'): -3.9750475297585357,
9
+ ('B', 'df'): -8.888974230828882,
10
+ ('B', 'dg'): -3.14e+100,
11
+ ('B', 'e'): -8.563551830394255,
12
+ ('B', 'en'): -3.14e+100,
13
+ ('B', 'f'): -5.491630418482717,
14
+ ('B', 'g'): -3.14e+100,
15
+ ('B', 'h'): -13.533365129970255,
16
+ ('B', 'i'): -6.1157847275557105,
17
+ ('B', 'in'): -3.14e+100,
18
+ ('B', 'j'): -5.0576191284681915,
19
+ ('B', 'jn'): -3.14e+100,
20
+ ('B', 'k'): -3.14e+100,
21
+ ('B', 'l'): -4.905883584659895,
22
+ ('B', 'ln'): -3.14e+100,
23
+ ('B', 'm'): -3.6524299819046386,
24
+ ('B', 'mg'): -3.14e+100,
25
+ ('B', 'mq'): -6.78695300139688,
26
+ ('B', 'n'): -1.6966257797548328,
27
+ ('B', 'ng'): -3.14e+100,
28
+ ('B', 'nr'): -2.2310495913769506,
29
+ ('B', 'nrfg'): -5.873722175405573,
30
+ ('B', 'nrt'): -4.985642733519195,
31
+ ('B', 'ns'): -2.8228438314969213,
32
+ ('B', 'nt'): -4.846091668182416,
33
+ ('B', 'nz'): -3.94698846057672,
34
+ ('B', 'o'): -8.433498702146057,
35
+ ('B', 'p'): -4.200984132085048,
36
+ ('B', 'q'): -6.998123858956596,
37
+ ('B', 'qe'): -3.14e+100,
38
+ ('B', 'qg'): -3.14e+100,
39
+ ('B', 'r'): -3.4098187790818413,
40
+ ('B', 'rg'): -3.14e+100,
41
+ ('B', 'rr'): -12.434752841302146,
42
+ ('B', 'rz'): -7.946116471570005,
43
+ ('B', 's'): -5.522673590839954,
44
+ ('B', 't'): -3.3647479094528574,
45
+ ('B', 'tg'): -3.14e+100,
46
+ ('B', 'u'): -9.163917277503234,
47
+ ('B', 'ud'): -3.14e+100,
48
+ ('B', 'ug'): -3.14e+100,
49
+ ('B', 'uj'): -3.14e+100,
50
+ ('B', 'ul'): -3.14e+100,
51
+ ('B', 'uv'): -3.14e+100,
52
+ ('B', 'uz'): -3.14e+100,
53
+ ('B', 'v'): -2.6740584874265685,
54
+ ('B', 'vd'): -9.044728760238115,
55
+ ('B', 'vg'): -3.14e+100,
56
+ ('B', 'vi'): -12.434752841302146,
57
+ ('B', 'vn'): -4.3315610890163585,
58
+ ('B', 'vq'): -12.147070768850364,
59
+ ('B', 'w'): -3.14e+100,
60
+ ('B', 'x'): -3.14e+100,
61
+ ('B', 'y'): -9.844485675856319,
62
+ ('B', 'yg'): -3.14e+100,
63
+ ('B', 'z'): -7.045681111485645,
64
+ ('B', 'zg'): -3.14e+100,
65
+ ('E', 'a'): -3.14e+100,
66
+ ('E', 'ad'): -3.14e+100,
67
+ ('E', 'ag'): -3.14e+100,
68
+ ('E', 'an'): -3.14e+100,
69
+ ('E', 'b'): -3.14e+100,
70
+ ('E', 'bg'): -3.14e+100,
71
+ ('E', 'c'): -3.14e+100,
72
+ ('E', 'd'): -3.14e+100,
73
+ ('E', 'df'): -3.14e+100,
74
+ ('E', 'dg'): -3.14e+100,
75
+ ('E', 'e'): -3.14e+100,
76
+ ('E', 'en'): -3.14e+100,
77
+ ('E', 'f'): -3.14e+100,
78
+ ('E', 'g'): -3.14e+100,
79
+ ('E', 'h'): -3.14e+100,
80
+ ('E', 'i'): -3.14e+100,
81
+ ('E', 'in'): -3.14e+100,
82
+ ('E', 'j'): -3.14e+100,
83
+ ('E', 'jn'): -3.14e+100,
84
+ ('E', 'k'): -3.14e+100,
85
+ ('E', 'l'): -3.14e+100,
86
+ ('E', 'ln'): -3.14e+100,
87
+ ('E', 'm'): -3.14e+100,
88
+ ('E', 'mg'): -3.14e+100,
89
+ ('E', 'mq'): -3.14e+100,
90
+ ('E', 'n'): -3.14e+100,
91
+ ('E', 'ng'): -3.14e+100,
92
+ ('E', 'nr'): -3.14e+100,
93
+ ('E', 'nrfg'): -3.14e+100,
94
+ ('E', 'nrt'): -3.14e+100,
95
+ ('E', 'ns'): -3.14e+100,
96
+ ('E', 'nt'): -3.14e+100,
97
+ ('E', 'nz'): -3.14e+100,
98
+ ('E', 'o'): -3.14e+100,
99
+ ('E', 'p'): -3.14e+100,
100
+ ('E', 'q'): -3.14e+100,
101
+ ('E', 'qe'): -3.14e+100,
102
+ ('E', 'qg'): -3.14e+100,
103
+ ('E', 'r'): -3.14e+100,
104
+ ('E', 'rg'): -3.14e+100,
105
+ ('E', 'rr'): -3.14e+100,
106
+ ('E', 'rz'): -3.14e+100,
107
+ ('E', 's'): -3.14e+100,
108
+ ('E', 't'): -3.14e+100,
109
+ ('E', 'tg'): -3.14e+100,
110
+ ('E', 'u'): -3.14e+100,
111
+ ('E', 'ud'): -3.14e+100,
112
+ ('E', 'ug'): -3.14e+100,
113
+ ('E', 'uj'): -3.14e+100,
114
+ ('E', 'ul'): -3.14e+100,
115
+ ('E', 'uv'): -3.14e+100,
116
+ ('E', 'uz'): -3.14e+100,
117
+ ('E', 'v'): -3.14e+100,
118
+ ('E', 'vd'): -3.14e+100,
119
+ ('E', 'vg'): -3.14e+100,
120
+ ('E', 'vi'): -3.14e+100,
121
+ ('E', 'vn'): -3.14e+100,
122
+ ('E', 'vq'): -3.14e+100,
123
+ ('E', 'w'): -3.14e+100,
124
+ ('E', 'x'): -3.14e+100,
125
+ ('E', 'y'): -3.14e+100,
126
+ ('E', 'yg'): -3.14e+100,
127
+ ('E', 'z'): -3.14e+100,
128
+ ('E', 'zg'): -3.14e+100,
129
+ ('M', 'a'): -3.14e+100,
130
+ ('M', 'ad'): -3.14e+100,
131
+ ('M', 'ag'): -3.14e+100,
132
+ ('M', 'an'): -3.14e+100,
133
+ ('M', 'b'): -3.14e+100,
134
+ ('M', 'bg'): -3.14e+100,
135
+ ('M', 'c'): -3.14e+100,
136
+ ('M', 'd'): -3.14e+100,
137
+ ('M', 'df'): -3.14e+100,
138
+ ('M', 'dg'): -3.14e+100,
139
+ ('M', 'e'): -3.14e+100,
140
+ ('M', 'en'): -3.14e+100,
141
+ ('M', 'f'): -3.14e+100,
142
+ ('M', 'g'): -3.14e+100,
143
+ ('M', 'h'): -3.14e+100,
144
+ ('M', 'i'): -3.14e+100,
145
+ ('M', 'in'): -3.14e+100,
146
+ ('M', 'j'): -3.14e+100,
147
+ ('M', 'jn'): -3.14e+100,
148
+ ('M', 'k'): -3.14e+100,
149
+ ('M', 'l'): -3.14e+100,
150
+ ('M', 'ln'): -3.14e+100,
151
+ ('M', 'm'): -3.14e+100,
152
+ ('M', 'mg'): -3.14e+100,
153
+ ('M', 'mq'): -3.14e+100,
154
+ ('M', 'n'): -3.14e+100,
155
+ ('M', 'ng'): -3.14e+100,
156
+ ('M', 'nr'): -3.14e+100,
157
+ ('M', 'nrfg'): -3.14e+100,
158
+ ('M', 'nrt'): -3.14e+100,
159
+ ('M', 'ns'): -3.14e+100,
160
+ ('M', 'nt'): -3.14e+100,
161
+ ('M', 'nz'): -3.14e+100,
162
+ ('M', 'o'): -3.14e+100,
163
+ ('M', 'p'): -3.14e+100,
164
+ ('M', 'q'): -3.14e+100,
165
+ ('M', 'qe'): -3.14e+100,
166
+ ('M', 'qg'): -3.14e+100,
167
+ ('M', 'r'): -3.14e+100,
168
+ ('M', 'rg'): -3.14e+100,
169
+ ('M', 'rr'): -3.14e+100,
170
+ ('M', 'rz'): -3.14e+100,
171
+ ('M', 's'): -3.14e+100,
172
+ ('M', 't'): -3.14e+100,
173
+ ('M', 'tg'): -3.14e+100,
174
+ ('M', 'u'): -3.14e+100,
175
+ ('M', 'ud'): -3.14e+100,
176
+ ('M', 'ug'): -3.14e+100,
177
+ ('M', 'uj'): -3.14e+100,
178
+ ('M', 'ul'): -3.14e+100,
179
+ ('M', 'uv'): -3.14e+100,
180
+ ('M', 'uz'): -3.14e+100,
181
+ ('M', 'v'): -3.14e+100,
182
+ ('M', 'vd'): -3.14e+100,
183
+ ('M', 'vg'): -3.14e+100,
184
+ ('M', 'vi'): -3.14e+100,
185
+ ('M', 'vn'): -3.14e+100,
186
+ ('M', 'vq'): -3.14e+100,
187
+ ('M', 'w'): -3.14e+100,
188
+ ('M', 'x'): -3.14e+100,
189
+ ('M', 'y'): -3.14e+100,
190
+ ('M', 'yg'): -3.14e+100,
191
+ ('M', 'z'): -3.14e+100,
192
+ ('M', 'zg'): -3.14e+100,
193
+ ('S', 'a'): -3.9025396831295227,
194
+ ('S', 'ad'): -11.048458480182255,
195
+ ('S', 'ag'): -6.954113917960154,
196
+ ('S', 'an'): -12.84021794941031,
197
+ ('S', 'b'): -6.472888763970454,
198
+ ('S', 'bg'): -3.14e+100,
199
+ ('S', 'c'): -4.786966795861212,
200
+ ('S', 'd'): -3.903919764181873,
201
+ ('S', 'df'): -3.14e+100,
202
+ ('S', 'dg'): -8.948397651299683,
203
+ ('S', 'e'): -5.942513006281674,
204
+ ('S', 'en'): -3.14e+100,
205
+ ('S', 'f'): -5.194820249981676,
206
+ ('S', 'g'): -6.507826815331734,
207
+ ('S', 'h'): -8.650563207383884,
208
+ ('S', 'i'): -3.14e+100,
209
+ ('S', 'in'): -3.14e+100,
210
+ ('S', 'j'): -4.911992119644354,
211
+ ('S', 'jn'): -3.14e+100,
212
+ ('S', 'k'): -6.940320595827818,
213
+ ('S', 'l'): -3.14e+100,
214
+ ('S', 'ln'): -3.14e+100,
215
+ ('S', 'm'): -3.269200652116097,
216
+ ('S', 'mg'): -10.825314928868044,
217
+ ('S', 'mq'): -3.14e+100,
218
+ ('S', 'n'): -3.8551483897645107,
219
+ ('S', 'ng'): -4.913434861102905,
220
+ ('S', 'nr'): -4.483663103956885,
221
+ ('S', 'nrfg'): -3.14e+100,
222
+ ('S', 'nrt'): -3.14e+100,
223
+ ('S', 'ns'): -3.14e+100,
224
+ ('S', 'nt'): -12.147070768850364,
225
+ ('S', 'nz'): -3.14e+100,
226
+ ('S', 'o'): -8.464460927750023,
227
+ ('S', 'p'): -2.9868401813596317,
228
+ ('S', 'q'): -4.888658618255058,
229
+ ('S', 'qe'): -3.14e+100,
230
+ ('S', 'qg'): -3.14e+100,
231
+ ('S', 'r'): -2.7635336784127853,
232
+ ('S', 'rg'): -10.275268591948773,
233
+ ('S', 'rr'): -3.14e+100,
234
+ ('S', 'rz'): -3.14e+100,
235
+ ('S', 's'): -3.14e+100,
236
+ ('S', 't'): -3.14e+100,
237
+ ('S', 'tg'): -6.272842531880403,
238
+ ('S', 'u'): -6.940320595827818,
239
+ ('S', 'ud'): -7.728230161053767,
240
+ ('S', 'ug'): -7.5394037026636855,
241
+ ('S', 'uj'): -6.85251045118004,
242
+ ('S', 'ul'): -8.4153713175535,
243
+ ('S', 'uv'): -8.15808672228609,
244
+ ('S', 'uz'): -9.299258625372996,
245
+ ('S', 'v'): -3.053292303412302,
246
+ ('S', 'vd'): -3.14e+100,
247
+ ('S', 'vg'): -5.9430181843676895,
248
+ ('S', 'vi'): -3.14e+100,
249
+ ('S', 'vn'): -11.453923588290419,
250
+ ('S', 'vq'): -3.14e+100,
251
+ ('S', 'w'): -3.14e+100,
252
+ ('S', 'x'): -8.427419656069674,
253
+ ('S', 'y'): -6.1970794699489575,
254
+ ('S', 'yg'): -13.533365129970255,
255
+ ('S', 'z'): -3.14e+100,
256
+ ('S', 'zg'): -3.14e+100}
jieba/posseg/prob_trans.p ADDED
The diff for this file is too large to render. See raw diff
 
jieba/posseg/prob_trans.py ADDED
The diff for this file is too large to render. See raw diff
 
jieba/posseg/viterbi.py ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sys
2
+ import operator
3
+ MIN_FLOAT = -3.14e100
4
+ MIN_INF = float("-inf")
5
+
6
+ if sys.version_info[0] > 2:
7
+ xrange = range
8
+
9
+
10
+ def get_top_states(t_state_v, K=4):
11
+ return sorted(t_state_v, key=t_state_v.__getitem__, reverse=True)[:K]
12
+
13
+
14
+ def viterbi(obs, states, start_p, trans_p, emit_p):
15
+ V = [{}] # tabular
16
+ mem_path = [{}]
17
+ all_states = trans_p.keys()
18
+ for y in states.get(obs[0], all_states): # init
19
+ V[0][y] = start_p[y] + emit_p[y].get(obs[0], MIN_FLOAT)
20
+ mem_path[0][y] = ''
21
+ for t in xrange(1, len(obs)):
22
+ V.append({})
23
+ mem_path.append({})
24
+ #prev_states = get_top_states(V[t-1])
25
+ prev_states = [
26
+ x for x in mem_path[t - 1].keys() if len(trans_p[x]) > 0]
27
+
28
+ prev_states_expect_next = set(
29
+ (y for x in prev_states for y in trans_p[x].keys()))
30
+ obs_states = set(
31
+ states.get(obs[t], all_states)) & prev_states_expect_next
32
+
33
+ if not obs_states:
34
+ obs_states = prev_states_expect_next if prev_states_expect_next else all_states
35
+
36
+ for y in obs_states:
37
+ prob, state = max((V[t - 1][y0] + trans_p[y0].get(y, MIN_INF) +
38
+ emit_p[y].get(obs[t], MIN_FLOAT), y0) for y0 in prev_states)
39
+ V[t][y] = prob
40
+ mem_path[t][y] = state
41
+
42
+ last = [(V[-1][y], y) for y in mem_path[-1].keys()]
43
+ # if len(last)==0:
44
+ # print obs
45
+ prob, state = max(last)
46
+
47
+ route = [None] * len(obs)
48
+ i = len(obs) - 1
49
+ while i >= 0:
50
+ route[i] = state
51
+ state = mem_path[i][state]
52
+ i -= 1
53
+ return (prob, route)
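For reference, viterbi() above is a generic Viterbi decoder: it takes an observation sequence plus start/transition/emission log-probability tables (like the prob_start / prob_trans data added in this commit) and returns the best-scoring state path. Below is a minimal toy sketch of a call, assuming the package is importable as jieba.posseg.viterbi; the states and log-probabilities are made-up illustrative values, not the shipped model tables.

```python
from jieba.posseg.viterbi import viterbi

# Toy HMM over the two-character observation "ab".
# All numbers are illustrative log-probabilities, not values from the real tables.
states = {}  # empty mapping -> every state is considered for every character
start_p = {('B', 'n'): -1.0, ('S', 'n'): -2.0}
trans_p = {('B', 'n'): {('S', 'n'): -0.5},
           ('S', 'n'): {('B', 'n'): -0.7}}
emit_p = {('B', 'n'): {'a': -0.1, 'b': -2.0},
          ('S', 'n'): {'a': -1.5, 'b': -0.2}}

prob, route = viterbi("ab", states, start_p, trans_p, emit_p)
print(prob, route)  # best total log-probability and the decoded (state, tag) path
```

In the real tagger, obs is the input sentence and the tables are loaded from data files such as prob_start.py and prob_trans.py above.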
models.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "zh_core_web_sm":"Chinese (zh_core_web_sm)",
3
+ "en_core_web_sm":"English (en_core_web_sm)",
4
+ "ja_core_news_sm":"Japanese (ja_core_news_sm)"
5
+ }
pages/01_🍊Mandarin.py ADDED
@@ -0,0 +1,209 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from collections import Counter
2
+ from dragonmapper import hanzi, transcriptions
3
+ import jieba
4
+ import pandas as pd
5
+ import plotly.express as px
6
+ import re
7
+ import requests
8
+ import spacy
9
+ from spacy_streamlit import visualize_ner, visualize_tokens
10
+ #from spacy.language import Language
11
+ from spacy.tokens import Doc
12
+ import streamlit as st
13
+
14
+ # Global variables
15
+ DEFAULT_TEXT = "我如此的過著孤單的生活,我沒有一個可以真正跟他談話的人,一直到六年前,我在撒哈拉沙漠飛機故障的時候。我的發動機裡有些東西壞了。而由於我身邊沒有機械師,也沒有乘客,我準備獨自去嘗試一次困難的修理。這對我是生死問題。我連足夠喝八天的水都沒有。頭一天晚上我在離開有人居住的地方一千英里的沙地上睡覺。我比一位漂流在汪洋大海裡的木筏上面的遇難者更孤單。當天剛破曉的時候,我被一種奇異的小聲音叫醒,你可以想像到,這時我是多麼的驚訝。那聲音說:「請你﹒﹒﹒給我畫一隻綿羊!」「哪!」「給我畫一隻綿羊!」《小王子》"
16
+ DESCRIPTION = "AI模型輔助語言學習:華語"
17
+ TOK_SEP = " | "
18
+ PUNCT_SYM = ["PUNCT", "SYM"]
19
+ MODEL_NAME = "zh_core_web_sm"
20
+
21
+ # External API callers
22
+ def moedict_caller(word):
23
+ st.write(f"### {word}")
24
+ req = requests.get(f"https://www.moedict.tw/uni/{word}.json")
25
+ try:
26
+ definitions = req.json().get('heteronyms')[0].get('definitions')
27
+ df = pd.DataFrame(definitions)
28
+ df.fillna("---", inplace=True)
29
+ if 'example' not in df.columns:
30
+ df['example'] = '---'
31
+ if 'synonyms' not in df.columns:
32
+ df['synonyms'] = '---'
33
+ if 'antonyms' not in df.columns:
34
+ df['antonyms'] = '---'
35
+ cols = ['def', 'example', 'synonyms', 'antonyms']
36
+ df = df[cols]
37
+ df.rename(columns={
38
+ 'def': '解釋',
39
+ 'example': '例句',
40
+ 'synonyms': '同義詞',
41
+ 'antonyms': '反義詞',
42
+ }, inplace=True)
43
+ with st.expander("點擊 + 查看結果"):
44
+ st.table(df)
45
+ except:
46
+ st.write("查無結果")
47
+
48
+ # Custom tokenizer class
49
+ class JiebaTokenizer:
50
+ def __init__(self, vocab):
51
+ self.vocab = vocab
52
+
53
+ def __call__(self, text):
54
+ words = jieba.cut(text) # returns a generator
55
+ tokens = list(words) # convert the generator to a list
56
+ spaces = [False] * len(tokens)
57
+ doc = Doc(self.vocab, words=tokens, spaces=spaces)
58
+ return doc
59
+
60
+ # Utility functions
61
+ def filter_tokens(doc):
62
+ clean_tokens = [tok for tok in doc if tok.pos_ not in PUNCT_SYM]
63
+ clean_tokens = (
64
+ [tok for tok in clean_tokens if
65
+ not tok.like_email and
66
+ not tok.like_num and
67
+ not tok.like_url and
68
+ not tok.is_space]
69
+ )
70
+ return clean_tokens
71
+
72
+ def get_vocab(doc):
73
+ clean_tokens = filter_tokens(doc)
74
+ alphanum_pattern = re.compile(r"[a-zA-Z0-9]")
75
+ clean_tokens_text = [tok.text for tok in clean_tokens if not alphanum_pattern.search(tok.text)]
76
+ vocab = list(set(clean_tokens_text))
77
+ return vocab
78
+
79
+ def get_counter(doc):
80
+ clean_tokens = filter_tokens(doc)
81
+ tokens = [token.text for token in clean_tokens]
82
+ counter = Counter(tokens)
83
+ return counter
84
+
85
+ def get_freq_fig(doc):
86
+ counter = get_counter(doc)
87
+ counter_df = (
88
+ pd.DataFrame.from_dict(counter, orient='index').
89
+ reset_index().
90
+ rename(columns={
91
+ 0: 'count',
92
+ 'index': 'word'
93
+ }).
94
+ sort_values(by='count', ascending=False)
95
+ )
96
+ fig = px.bar(counter_df, x='word', y='count')
97
+ return fig
98
+
99
+ def get_level_pie(tocfl_result):
100
+ level = tocfl_result['詞條分級'].value_counts()
101
+ fig = px.pie(tocfl_result,
102
+ values=level.values,
103
+ names=level.index,
104
+ title='詞彙分級圓餅圖')
105
+ return fig
106
+
107
+ @st.cache
108
+ def load_tocfl_table(filename="./tocfl_wordlist.csv"):
109
+ table = pd.read_csv(filename)
110
+ cols = "詞彙 漢語拼音 注音 任務領域 詞條分級".split()
111
+ table = table[cols]
112
+ return table
113
+
114
+ # Page setting
115
+ st.set_page_config(
116
+ page_icon="🤠",
117
+ layout="wide",
118
+ initial_sidebar_state="auto",
119
+ )
120
+ st.markdown(f"# {DESCRIPTION}")
121
+
122
+ # Load the model
123
+ nlp = spacy.load(MODEL_NAME)
124
+
125
+ # Add pipelines to spaCy
126
+ # nlp.add_pipe("yake") # keyword extraction
127
+ # nlp.add_pipe("merge_entities") # Merge entity spans to tokens
128
+
129
+ # Select a tokenizer for the Chinese model
130
+ selected_tokenizer = st.radio("請選擇斷詞模型", ["jieba-TW", "spaCy"])
131
+ if selected_tokenizer == "jieba-TW":
132
+ nlp.tokenizer = JiebaTokenizer(nlp.vocab)
133
+
134
+ # Page starts from here
135
+ st.markdown("## 待分析文本")
136
+ st.info("請在下面的文字框輸入文本並按下Ctrl + Enter以更新分析結果")
137
+ text = st.text_area("", DEFAULT_TEXT, height=200)
138
+ doc = nlp(text)
139
+ st.markdown("---")
140
+
141
+ st.info("���勾選以下至少一項功能")
142
+ # keywords_extraction = st.sidebar.checkbox("關鍵詞分析", False) # YAKE doesn't work for Chinese texts
143
+ analyzed_text = st.checkbox("增強文本", True)
144
+ defs_examples = st.checkbox("單詞解析", True)
145
+ # morphology = st.sidebar.checkbox("詞形變化", True)
146
+ freq_count = st.checkbox("詞頻統計", True)
147
+ ner_viz = st.checkbox("命名實體", True)
148
+ tok_table = st.checkbox("斷詞特徵", False)
149
+
150
+ if analyzed_text:
151
+ st.markdown("## 增強文本")
152
+ pronunciation = st.radio("請選擇輔助發音類型", ["漢語拼音", "注音符號", "國際音標"])
153
+ for idx, sent in enumerate(doc.sents):
154
+ tokens_text = [tok.text for tok in sent if tok.pos_ not in PUNCT_SYM]
155
+ pinyins = [hanzi.to_pinyin(word) for word in tokens_text]
156
+ sounds = pinyins
157
+ if pronunciation == "注音符號":
158
+ zhuyins = [transcriptions.pinyin_to_zhuyin(word) for word in pinyins]
159
+ sounds = zhuyins
160
+ elif pronunciation == "國際音標":
161
+ ipas = [transcriptions.pinyin_to_ipa(word) for word in pinyins]
162
+ sounds = ipas
163
+
164
+ display = []
165
+ for text, sound in zip(tokens_text, sounds):
166
+ res = f"{text} [{sound}]"
167
+ display.append(res)
168
+ if display:
169
+ display_text = TOK_SEP.join(display)
170
+ st.write(f"{idx+1} >>> {display_text}")
171
+ else:
172
+ st.write(f"{idx+1} >>> EMPTY LINE")
173
+
174
+ if defs_examples:
175
+ st.markdown("## 單詞解析")
176
+ vocab = get_vocab(doc)
177
+ if vocab:
178
+ tocfl_table = load_tocfl_table()
179
+ filt = tocfl_table['詞彙'].isin(vocab)
180
+ tocfl_res = tocfl_table[filt]
181
+ st.markdown("### 華語詞彙分級")
182
+ fig = get_level_pie(tocfl_res)
183
+ st.plotly_chart(fig, use_container_width=True)
184
+
185
+ with st.expander("點擊 + 查看結果"):
186
+ st.table(tocfl_res)
187
+ st.markdown("---")
188
+ st.markdown("### 單詞解釋與例句")
189
+ selected_words = st.multiselect("請選擇要查詢的單詞: ", vocab, vocab[-1])
190
+ for w in selected_words:
191
+ moedict_caller(w)
192
+
193
+ if freq_count:
194
+ st.markdown("## 詞頻統計")
195
+ counter = get_counter(doc)
196
+ topK = st.slider('請選擇前K個高頻詞', 1, len(counter), 5)
197
+ most_common = counter.most_common(topK)
198
+ st.write(most_common)
199
+ st.markdown("---")
200
+
201
+ fig = get_freq_fig(doc)
202
+ st.plotly_chart(fig, use_container_width=True)
203
+
204
+ if ner_viz:
205
+ ner_labels = nlp.get_pipe("ner").labels
206
+ visualize_ner(doc, labels=ner_labels, show_table=False, title="命名實體")
207
+
208
+ if tok_table:
209
+ visualize_tokens(doc, attrs=["text", "pos_", "tag_", "dep_", "head"], title="斷詞特徵")
pages/02_🍣Japanese.py ADDED
@@ -0,0 +1,183 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from jisho_api.word import Word
2
+ from jisho_api.sentence import Sentence
3
+ import pandas as pd
4
+ import re
5
+ import requests
6
+ import spacy
7
+ from spacy_streamlit import visualize_ner, visualize_tokens
8
+ #from spacy.language import Language
9
+ from spacy.tokens import Doc
10
+ import spacy_ke
11
+ import streamlit as st
12
+
13
+ # Global variables
14
+ DEFAULT_TEXT = """それまで、ぼくはずっとひとりぼっちだった。だれともうちとけられないまま、6年まえ、ちょっとおかしくなって、サハラさばくに下りた。ぼくのエンジンのなかで、なにかがこわれていた。ぼくには、みてくれるひとも、おきゃくさんもいなかったから、なおすのはむずかしいけど、ぜんぶひとりでなんとかやってみることにした。それでぼくのいのちがきまってしまう。のみ水は、たった7日ぶんしかなかった。
15
+  1日めの夜、ぼくはすなの上でねむった。ひとのすむところは、はるかかなただった。海のどまんなか、いかだでさまよっているひとよりも、もっとひとりぼっち。だから、ぼくがびっくりしたのも、みんなわかってくれるとおもう。じつは、あさ日がのぼるころ、ぼくは、ふしぎなかわいいこえでおこされたんだ。
16
+ 「ごめんください……ヒツジの絵をかいて!」
17
+ 「えっ?」
18
+ 「ぼくにヒツジの絵をかいて……」
19
+ 『星の王子さま』"""
20
+ DESCRIPTION = "AI模型輔助語言學習:日語"
21
+ TOK_SEP = " | "
22
+ MODEL_NAME = "ja_ginza"
23
+
24
+ # External API callers
25
+ def parse_jisho_senses(word):
26
+ res = Word.request(word)
27
+ response = res.dict()
28
+ if response["meta"]["status"] == 200:
29
+ data = response["data"]
30
+ commons = [d for d in data if d["is_common"]]
31
+ if commons:
32
+ common = commons[0] # Only get the first entry that is common
33
+ senses = common["senses"]
34
+ if len(senses) > 3:
35
+ senses = senses[:3]
36
+ with st.container():
37
+ for idx, sense in enumerate(senses):
38
+ eng_def = "; ".join(sense["english_definitions"])
39
+ pos = "/".join(sense["parts_of_speech"])
40
+ st.write(f"Sense {idx+1}: {eng_def} ({pos})")
41
+ else:
42
+ st.info("Found no common words on Jisho!")
43
+ else:
44
+ st.error("Can't get response from Jisho!")
45
+
46
+
47
+ def parse_jisho_sentences(word):
48
+ res = Sentence.request(word)
49
+ try:
50
+ response = res.dict()
51
+ data = response["data"]
52
+ if len(data) > 3:
53
+ sents = data[:3]
54
+ else:
55
+ sents = data
56
+ with st.container():
57
+ for idx, sent in enumerate(sents):
58
+ eng = sent["en_translation"]
59
+ jap = sent["japanese"]
60
+ st.write(f"Sentence {idx+1}: {jap}")
61
+ st.write(f"({eng})")
62
+ except:
63
+ st.info("Found no results on Jisho!")
64
+
65
+ # Utility functions
66
+ def create_jap_df(tokens):
67
+ seen_texts = []
68
+ filtered_tokens = []
69
+ for tok in tokens:
70
+ if tok.text not in seen_texts:
71
+ filtered_tokens.append(tok)
+ seen_texts.append(tok.text)
72
+
73
+ df = pd.DataFrame(
74
+ {
75
+ "單詞": [tok.text for tok in filtered_tokens],
76
+ "發音": ["/".join(tok.morph.get("Reading")) for tok in filtered_tokens],
77
+ "詞形變化": ["/".join(tok.morph.get("Inflection")) for tok in filtered_tokens],
78
+ "原形": [tok.lemma_ for tok in filtered_tokens],
79
+ #"正規形": [tok.norm_ for tok in verbs],
80
+ }
81
+ )
82
+ st.dataframe(df)
83
+ csv = df.to_csv().encode('utf-8')
84
+ st.download_button(
85
+ label="下載表格",
86
+ data=csv,
87
+ file_name='jap_forms.csv',
88
+ )
89
+
90
+ def filter_tokens(doc):
91
+ clean_tokens = [tok for tok in doc if tok.pos_ not in ["PUNCT", "SYM"]]
92
+ clean_tokens = [tok for tok in clean_tokens if not tok.like_email]
93
+ clean_tokens = [tok for tok in clean_tokens if not tok.like_url]
94
+ clean_tokens = [tok for tok in clean_tokens if not tok.like_num]
95
+ clean_tokens = [tok for tok in clean_tokens if not tok.is_punct]
96
+ clean_tokens = [tok for tok in clean_tokens if not tok.is_space]
97
+ return clean_tokens
98
+
99
+ def create_kw_section(doc):
100
+ st.markdown("## 關鍵詞分析")
101
+ kw_num = st.slider("請選擇關鍵詞數量", 1, 10, 3)
102
+ kws2scores = {keyword: score for keyword, score in doc._.extract_keywords(n=kw_num)}
103
+ kws2scores = sorted(kws2scores.items(), key=lambda x: x[1], reverse=True)
104
+ count = 1
105
+ for keyword, score in kws2scores:
106
+ rounded_score = round(score, 3)
107
+ st.write(f"{count} >>> {keyword} ({rounded_score})")
108
+ count += 1
109
+
110
+ # Page setting
111
+ st.set_page_config(
112
+ page_icon="🤠",
113
+ layout="wide",
114
+ initial_sidebar_state="auto",
115
+ )
116
+ st.markdown(f"# {DESCRIPTION}")
117
+
118
+ # Load the model
119
+ nlp = spacy.load(MODEL_NAME)
120
+
121
+ # Add pipelines to spaCy
122
+ nlp.add_pipe("yake") # keyword extraction
123
+ # nlp.add_pipe("merge_entities") # Merge entity spans to tokens
124
+
125
+ # Page starts from here
126
+ st.markdown("## 待分析文本")
127
+ st.info("請在下面的文字框輸入文本並按下Ctrl + Enter以更新分析結果")
128
+ text = st.text_area("", DEFAULT_TEXT, height=200)
129
+ doc = nlp(text)
130
+ st.markdown("---")
131
+
132
+ st.info("請勾選以下至少一項功能")
133
+ keywords_extraction = st.checkbox("關鍵詞分析", False)
134
+ analyzed_text = st.checkbox("增強文本", True)
135
+ defs_examples = st.checkbox("單詞解析", True)
136
+ morphology = st.checkbox("詞形變化", False)
137
+ ner_viz = st.checkbox("命名實體", True)
138
+ tok_table = st.checkbox("斷詞特徵", False)
139
+
140
+ if keywords_extraction:
141
+ create_kw_section(doc)
142
+
143
+ if analyzed_text:
144
+ st.markdown("## 分析後文本")
145
+ for idx, sent in enumerate(doc.sents):
146
+ clean_tokens = [tok for tok in sent if tok.pos_ not in ["PUNCT", "SYM"]]
147
+ tokens_text = [tok.text for tok in clean_tokens]
148
+ readings = ["/".join(tok.morph.get("Reading")) for tok in clean_tokens]
149
+ display = [f"{text} [{reading}]" for text, reading in zip(tokens_text, readings)]
150
+ if display:
151
+ display_text = TOK_SEP.join(display)
152
+ st.write(f"{idx+1} >>> {display_text}")
153
+ else:
154
+ st.write(f"{idx+1} >>> EMPTY LINE")
155
+
156
+ if defs_examples:
157
+ st.markdown("## 單詞解釋與例句")
158
+ clean_tokens = filter_tokens(doc)
159
+ alphanum_pattern = re.compile(r"[a-zA-Z0-9]")
160
+ clean_lemmas = [tok.lemma_ for tok in clean_tokens if not alphanum_pattern.search(tok.lemma_)]
161
+ vocab = list(set(clean_lemmas))
162
+ if vocab:
163
+ selected_words = st.multiselect("請選擇要查詢的單詞: ", vocab, vocab[0:3])
164
+ for w in selected_words:
165
+ st.write(f"### {w}")
166
+ with st.expander("點擊 + 檢視結果"):
167
+ parse_jisho_senses(w)
168
+ parse_jisho_sentences(w)
169
+
170
+ if morphology:
171
+ st.markdown("## 詞形變化")
172
+ # Collect inflected forms
173
+ inflected_forms = [tok for tok in doc if tok.tag_.startswith("動詞") or tok.tag_.startswith("形")]
174
+ if inflected_forms:
175
+ create_jap_df(inflected_forms)
176
+
177
+ if ner_viz:
178
+ ner_labels = nlp.get_pipe("ner").labels
179
+ visualize_ner(doc, labels=ner_labels, show_table=False, title="命名實體")
180
+
181
+ if tok_table:
182
+ visualize_tokens(doc, attrs=["text", "pos_", "tag_", "dep_", "head"], title="斷詞特徵")
183
+
pages/03_🍔English.py ADDED
@@ -0,0 +1,206 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import re
3
+ import requests
4
+ import spacy
5
+ from spacy_streamlit import visualize_ner, visualize_tokens
6
+ #from spacy.language import Language
7
+ from spacy.tokens import Doc
8
+ import spacy_ke
9
+ import streamlit as st
10
+
11
+ # Global variables
12
+ DEFAULT_TEXT = """So I lived my life alone, without anyone that I could really talk to, until I had an accident with my plane in the Desert of Sahara, six years ago. Something was broken in my engine. And as I had with me neither a mechanic nor any passengers, I set myself to attempt the difficult repairs all alone. It was a question of life or death for me: I had scarcely enough drinking water to last a week. The first night, then, I went to sleep on the sand, a thousand miles from any human habitation. I was more isolated than a shipwrecked sailor on a raft in the middle of the ocean. Thus you can imagine my amazement, at sunrise, when I was awakened by an odd little voice. It said:
13
+
14
+ "If you please−− draw me a sheep!"
15
+
16
+ "What!"
17
+
18
+ "Draw me a sheep!"
19
+
20
+ The Little Prince
21
+ """
22
+ DESCRIPTION = "AI模型輔助語言學習:英語"
23
+ TOK_SEP = " | "
24
+ MODEL_NAME = "en_core_web_sm"
25
+ API_LOOKUP = {}
26
+ MAX_SYM_NUM = 5
27
+
28
+ # External API caller
29
+ def free_dict_caller(word):
30
+ req = requests.get(f"https://api.dictionaryapi.dev/api/v2/entries/en/{word}")
31
+ try:
32
+ result = req.json()[0]
33
+ if word not in API_LOOKUP:
34
+ API_LOOKUP[word] = result
35
+ except:
36
+ pass
37
+
38
+ def show_definitions_and_examples(word, pos):
39
+ if word not in API_LOOKUP:
40
+ free_dict_caller(word)
41
+
42
+ result = API_LOOKUP.get(word)
43
+ if result:
44
+ meanings = result.get('meanings')
45
+ if meanings:
46
+ definitions = []
47
+ for meaning in meanings:
48
+ if meaning['partOfSpeech'] == pos.lower():
49
+ definitions = meaning.get('definitions')
50
+
51
+ if len(definitions) > 3:
52
+ definitions = definitions[:3]
53
+
54
+ for definition in definitions:
55
+ df = definition.get("definition")
56
+ ex = definition.get("example")
57
+ st.markdown(f" - {df}")
58
+ st.markdown(f" Example: *{ex}*")
59
+ st.markdown("---")
60
+
61
+ else:
62
+ st.info("Found no matching result on Free Dictionary!")
63
+
64
+ def get_synonyms(word, pos):
65
+ if word not in API_LOOKUP:
66
+ free_dict_caller(word)
67
+
68
+ result = API_LOOKUP.get(word)
69
+ if result:
70
+ meanings = result.get('meanings')
71
+ if meanings:
72
+ synonyms = []
73
+ for meaning in meanings:
74
+ if meaning['partOfSpeech'] == pos.lower():
75
+ synonyms = meaning.get('synonyms')
76
+ return synonyms
77
+
78
+ # Utility functions
79
+ def create_eng_df(tokens):
80
+ seen_texts = []
81
+ filtered_tokens = []
82
+ for tok in tokens:
83
+ if tok.lemma_ not in seen_texts:
84
+ filtered_tokens.append(tok)
85
+
86
+ df = pd.DataFrame(
87
+ {
88
+ "單詞": [tok.text.lower() for tok in filtered_tokens],
89
+ "詞類": [tok.pos_ for tok in filtered_tokens],
90
+ "原形": [tok.lemma_ for tok in filtered_tokens],
91
+ }
92
+ )
93
+ st.dataframe(df)
94
+ csv = df.to_csv().encode('utf-8')
95
+ st.download_button(
96
+ label="下載表格",
97
+ data=csv,
98
+ file_name='eng_forms.csv',
99
+ )
100
+
101
+ def filter_tokens(doc):
102
+ clean_tokens = [tok for tok in doc if tok.pos_ not in ["PUNCT", "SYM"]]
103
+ clean_tokens = [tok for tok in clean_tokens if not tok.like_email]
104
+ clean_tokens = [tok for tok in clean_tokens if not tok.like_url]
105
+ clean_tokens = [tok for tok in clean_tokens if not tok.like_num]
106
+ clean_tokens = [tok for tok in clean_tokens if not tok.is_punct]
107
+ clean_tokens = [tok for tok in clean_tokens if not tok.is_space]
108
+ return clean_tokens
109
+
110
+ def create_kw_section(doc):
111
+ st.markdown("## 關鍵詞分析")
112
+ kw_num = st.slider("請選擇關鍵詞數量", 1, 10, 3)
113
+ kws2scores = {keyword: score for keyword, score in doc._.extract_keywords(n=kw_num)}
114
+ kws2scores = sorted(kws2scores.items(), key=lambda x: x[1], reverse=True)
115
+ count = 1
116
+ for keyword, score in kws2scores:
117
+ rounded_score = round(score, 3)
118
+ st.write(f"{count} >>> {keyword} ({rounded_score})")
119
+ count += 1
120
+
121
+ # Page setting
122
+ st.set_page_config(
123
+ page_icon="🤠",
124
+ layout="wide",
125
+ initial_sidebar_state="auto",
126
+ )
127
+ st.markdown(f"# {DESCRIPTION}")
128
+
129
+ # Load the language model
130
+ nlp = spacy.load(MODEL_NAME)
131
+
132
+ # Add pipelines to spaCy
133
+ nlp.add_pipe("yake") # keyword extraction
134
+ # nlp.add_pipe("merge_entities") # Merge entity spans to tokens
135
+
136
+ # Page starts from here
137
+ st.markdown("## 待分析文本")
138
+ st.info("請在下面的文字框輸入文本並按下Ctrl + Enter以更新分析結果")
139
+ text = st.text_area("", DEFAULT_TEXT, height=200)
140
+ doc = nlp(text)
141
+ st.markdown("---")
142
+
143
+ st.info("請勾選以下至少一項功能")
144
+ keywords_extraction = st.checkbox("關鍵詞分析", False)
145
+ analyzed_text = st.checkbox("增強文本", True)
146
+ defs_examples = st.checkbox("單詞解析", True)
147
+ morphology = st.checkbox("詞形變化", False)
148
+ ner_viz = st.checkbox("命名實體", True)
149
+ tok_table = st.checkbox("斷詞特徵", False)
150
+
151
+ if keywords_extraction:
152
+ create_kw_section(doc)
153
+
154
+ if analyzed_text:
155
+ st.markdown("## 分析後文本")
156
+ for idx, sent in enumerate(doc.sents):
157
+ enriched_sentence = []
158
+ for tok in sent:
159
+ if tok.pos_ != "VERB":
160
+ enriched_sentence.append(tok.text)
161
+ else:
162
+ synonyms = get_synonyms(tok.text, tok.pos_)
163
+ if synonyms:
164
+ if len(synonyms) > MAX_SYM_NUM:
165
+ synonyms = synonyms[:MAX_SYM_NUM]
166
+ added_verbs = " | ".join(synonyms)
167
+ enriched_tok = f"{tok.text} (cf. {added_verbs})"
168
+ enriched_sentence.append(enriched_tok)
169
+ else:
170
+ enriched_sentence.append(tok.text)
171
+
172
+ display_text = " ".join(enriched_sentence)
173
+ st.write(f"{idx+1} >>> {display_text}")
174
+
175
+ if defs_examples:
176
+ st.markdown("## 單詞解釋與例句")
177
+ clean_tokens = filter_tokens(doc)
178
+ num_pattern = re.compile(r"[0-9]")
179
+ clean_tokens = [tok for tok in clean_tokens if not num_pattern.search(tok.lemma_)]
180
+ selected_pos = ["VERB", "NOUN", "ADJ", "ADV"]
181
+ clean_tokens = [tok for tok in clean_tokens if tok.pos_ in selected_pos]
182
+ tokens_lemma_pos = [tok.lemma_ + " | " + tok.pos_ for tok in clean_tokens]
183
+ vocab = list(set(tokens_lemma_pos))
184
+ if vocab:
185
+ selected_words = st.multiselect("請選擇要查詢的單詞: ", vocab, vocab[0:3])
186
+ for w in selected_words:
187
+ word_pos = w.split("|")
188
+ word = word_pos[0].strip()
189
+ pos = word_pos[1].strip()
190
+ st.write(f"### {w}")
191
+ with st.expander("點擊 + 檢視結果"):
192
+ show_definitions_and_examples(word, pos)
193
+
194
+ if morphology:
195
+ st.markdown("## 詞形變化")
196
+ # Collect inflected forms
197
+ inflected_forms = [tok for tok in doc if tok.text.lower() != tok.lemma_.lower()]
198
+ if inflected_forms:
199
+ create_eng_df(inflected_forms)
200
+
201
+ if ner_viz:
202
+ ner_labels = nlp.get_pipe("ner").labels
203
+ visualize_ner(doc, labels=ner_labels, show_table=False, title="命名實體")
204
+
205
+ if tok_table:
206
+ visualize_tokens(doc, attrs=["text", "pos_", "tag_", "dep_", "head"], title="斷詞特徵")
requirements.txt ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ja_ginza is a Japanese model with a lemmatizer and a morphologizer more fine-grained than the default one in spaCy
2
+ ginza
3
+ ja_ginza
4
+
5
+ # ja_ginza depends on spacy>=3.2.0,<3.3.0
6
+ spacy>=3.2.0,<3.3.0
7
+ spacy-streamlit>=1.0.0rc1,<1.1.0
8
+ spacy-wordnet
9
+
10
+ # spacy-wordnet depends on nltk
11
+ nltk
12
+
13
+ # sapCy models
14
+ https://github.com/explosion/spacy-models/releases/download/zh_core_web_sm-3.2.0/zh_core_web_sm-3.2.0.tar.gz#egg=zh_core_web_sm
15
+ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.2.0/en_core_web_sm-3.2.0.tar.gz#egg=en_core_web_sm
16
+
17
+ # conversion between hanzi and transcriptions
18
+ dragonmapper
19
+
20
+ # Jisho online Japanese dictionary
21
+ jisho_api
22
+
23
+ # YAKE keyword extraction
24
+ spacy-ke
25
+
26
+ # interactive plotting
27
+ plotly
tocfl_wordlist.csv ADDED
The diff for this file is too large to render. See raw diff
 
update_data.py ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """A little helper scripts to generate the requirements.txt and models.json with
2
+ the latest supported model versions based on the compatibility.json."""
3
+ from spacy.about import __compatibility__ as COMPAT_URL
4
+ from spacy.util import get_lang_class, is_compatible_version
5
+ from pathlib import Path
6
+ import requests
7
+ import typer
8
+ import srsly
9
+
10
+
11
+ URL_TEMPLATE = "https://github.com/explosion/spacy-models/releases/download/{name}-{version}/{name}-{version}.tar.gz#egg={name}=={version}"
12
+
13
+
14
+ def main(
15
+ # fmt: off
16
+ spacy_version: str = typer.Argument(">=3.0.0,<3.1.0", help="The spaCy version range"),
17
+ spacy_streamlit_version: str = typer.Argument(">=1.0.0rc1,<1.1.0", help="The version range of spacy-streamlit"),
18
+ req_path: Path = typer.Option(Path(__file__).parent / "requirements.txt", "--requirements-path", "-rp", help="Path to requirements.txt"),
19
+ desc_path: Path = typer.Option(Path(__file__).parent / "models.json", "--models-json-path", "-mp", help="Path to models.json with model details for dropdown"),
20
+ package: str = typer.Option("spacy", "--package", "-p", help="The parent package (spacy, spacy-nightly, etc.)"),
21
+ exclude: str = typer.Option("en_vectors_web_lg", "--exclude", "-e", help="Comma-separated model names to exclude"),
22
+ # fmt: on
23
+ ):
24
+ exclude = [name.strip() for name in exclude.split(",")]
25
+ r = requests.get(COMPAT_URL)
26
+ r.raise_for_status()
27
+ compat = r.json()["spacy"]
28
+ data = None
29
+ for version_option in compat:
30
+ if is_compatible_version(version_option, spacy_version):
31
+ data = compat[version_option]
32
+ break
33
+ if data is None:
34
+ raise ValueError(f"No compatible models found for {spacy_version}")
35
+ reqs = [
36
+ f"# Auto-generated by {Path(__file__).name}",
37
+ f"{package}{spacy_version}",
38
+ f"spacy-streamlit{spacy_streamlit_version}",
39
+ ]
40
+ models = {}
41
+ for model_name, model_versions in data.items():
42
+ if model_name not in exclude and model_versions:
43
+ url = URL_TEMPLATE.format(name=model_name, version=model_versions[0])
44
+ # We do a quick check if the URL exists
45
+ r = requests.get(url, headers={"Range": "bytes=0"})
46
+ if r.status_code == 404:
47
+ print(f"Invalid package URL (skipping): {url}")
48
+ continue
49
+ reqs.append(url)
50
+ lang = model_name.split("_", 1)[0]
51
+ lang_name = get_lang_class(lang).__name__
52
+ models[model_name] = f"{lang_name} ({model_name})"
53
+ # Sort by human-readable language name, then by model size
54
+ sort_key = lambda x: f"{x[1].split(' ')[0]}_{['sm', 'md', 'lg', 'trf'].index(x[0].split('_')[-1])}"
55
+ models = {name: desc for name, desc in sorted(models.items(), key=sort_key)}
56
+ with Path(req_path).open("w", encoding="utf8") as f:
57
+ f.write("\n".join(reqs))
58
+ srsly.write_json(desc_path, models)
59
+ print(f"Generated requirements.txt and models.json for {len(reqs) - 1} models")
60
+
61
+
62
+ if __name__ == "__main__":
63
+ typer.run(main)