Upload 183 files
This view is limited to 50 files because it contains too many changes.
- nltk_data/corpora/cmudict/README +76 -0
- nltk_data/corpora/cmudict/cmudict +0 -0
- nltk_data/taggers/averaged_perceptron_tagger/averaged_perceptron_tagger.pickle +3 -0
- nltk_data/tokenizers/punkt/PY3/README +98 -0
- nltk_data/tokenizers/punkt/PY3/czech.pickle +3 -0
- nltk_data/tokenizers/punkt/PY3/danish.pickle +3 -0
- nltk_data/tokenizers/punkt/PY3/dutch.pickle +3 -0
- nltk_data/tokenizers/punkt/PY3/english.pickle +3 -0
- nltk_data/tokenizers/punkt/PY3/estonian.pickle +3 -0
- nltk_data/tokenizers/punkt/PY3/finnish.pickle +3 -0
- nltk_data/tokenizers/punkt/PY3/french.pickle +3 -0
- nltk_data/tokenizers/punkt/PY3/german.pickle +3 -0
- nltk_data/tokenizers/punkt/PY3/greek.pickle +3 -0
- nltk_data/tokenizers/punkt/PY3/italian.pickle +3 -0
- nltk_data/tokenizers/punkt/PY3/malayalam.pickle +3 -0
- nltk_data/tokenizers/punkt/PY3/norwegian.pickle +3 -0
- nltk_data/tokenizers/punkt/PY3/polish.pickle +3 -0
- nltk_data/tokenizers/punkt/PY3/portuguese.pickle +3 -0
- nltk_data/tokenizers/punkt/PY3/russian.pickle +3 -0
- nltk_data/tokenizers/punkt/PY3/slovene.pickle +3 -0
- nltk_data/tokenizers/punkt/PY3/spanish.pickle +3 -0
- nltk_data/tokenizers/punkt/PY3/swedish.pickle +3 -0
- nltk_data/tokenizers/punkt/PY3/turkish.pickle +3 -0
- nltk_data/tokenizers/punkt/README +98 -0
- nltk_data/tokenizers/punkt/czech.pickle +3 -0
- nltk_data/tokenizers/punkt/danish.pickle +3 -0
- nltk_data/tokenizers/punkt/dutch.pickle +3 -0
- nltk_data/tokenizers/punkt/english.pickle +3 -0
- nltk_data/tokenizers/punkt/estonian.pickle +3 -0
- nltk_data/tokenizers/punkt/finnish.pickle +3 -0
- nltk_data/tokenizers/punkt/french.pickle +3 -0
- nltk_data/tokenizers/punkt/german.pickle +3 -0
- nltk_data/tokenizers/punkt/greek.pickle +3 -0
- nltk_data/tokenizers/punkt/italian.pickle +3 -0
- nltk_data/tokenizers/punkt/malayalam.pickle +3 -0
- nltk_data/tokenizers/punkt/norwegian.pickle +3 -0
- nltk_data/tokenizers/punkt/polish.pickle +3 -0
- nltk_data/tokenizers/punkt/portuguese.pickle +3 -0
- nltk_data/tokenizers/punkt/russian.pickle +3 -0
- nltk_data/tokenizers/punkt/slovene.pickle +3 -0
- nltk_data/tokenizers/punkt/spanish.pickle +3 -0
- nltk_data/tokenizers/punkt/swedish.pickle +3 -0
- nltk_data/tokenizers/punkt/turkish.pickle +3 -0
- test/models/test_vicuna_chain_agent.py +95 -0
- test/textsplitter/test_zh_title_enhance.py +21 -0
- textsplitter/__init__.py +3 -0
- textsplitter/__pycache__/__init__.cpython-310.pyc +0 -0
- textsplitter/__pycache__/ali_text_splitter.cpython-310.pyc +0 -0
- textsplitter/__pycache__/chinese_text_splitter.cpython-310.pyc +0 -0
- textsplitter/__pycache__/zh_title_enhance.cpython-310.pyc +0 -0
nltk_data/corpora/cmudict/README
ADDED
@@ -0,0 +1,76 @@
The Carnegie Mellon Pronouncing Dictionary [cmudict.0.7a]

ftp://ftp.cs.cmu.edu/project/speech/dict/
https://cmusphinx.svn.sourceforge.net/svnroot/cmusphinx/trunk/cmudict/cmudict.0.7a

Copyright (C) 1993-2008 Carnegie Mellon University. All rights reserved.

File Format: Each line consists of an uppercased word,
a counter (for alternative pronunciations), and a transcription.
Vowels are marked for stress (1=primary, 2=secondary, 0=no stress).
E.g.: NATURAL 1 N AE1 CH ER0 AH0 L

The dictionary contains 127069 entries. Of these, 119400 words are assigned
a unique pronunciation, 6830 words have two pronunciations, and 839 words have
three or more pronunciations. Many of these are fast-speech variants.

Phonemes: There are 39 phonemes, as shown below:

Phoneme Example Translation    Phoneme Example Translation
------- ------- -----------    ------- ------- -----------
AA      odd     AA D           AE      at      AE T
AH      hut     HH AH T        AO      ought   AO T
AW      cow     K AW           AY      hide    HH AY D
B       be      B IY           CH      cheese  CH IY Z
D       dee     D IY           DH      thee    DH IY
EH      Ed      EH D           ER      hurt    HH ER T
EY      ate     EY T           F       fee     F IY
G       green   G R IY N       HH      he      HH IY
IH      it      IH T           IY      eat     IY T
JH      gee     JH IY          K       key     K IY
L       lee     L IY           M       me      M IY
N       knee    N IY           NG      ping    P IH NG
OW      oat     OW T           OY      toy     T OY
P       pee     P IY           R       read    R IY D
S       sea     S IY           SH      she     SH IY
T       tea     T IY           TH      theta   TH EY T AH
UH      hood    HH UH D        UW      two     T UW
V       vee     V IY           W       we      W IY
Y       yield   Y IY L D       Z       zee     Z IY
ZH      seizure S IY ZH ER

(For NLTK, entries have been sorted so that, e.g. FIRE 1 and FIRE 2
are contiguous, and not separated by FIRE'S 1.)

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:

1. Redistributions of source code must retain the above copyright
   notice, this list of conditions and the following disclaimer.
   The contents of this file are deemed to be source code.

2. Redistributions in binary form must reproduce the above copyright
   notice, this list of conditions and the following disclaimer in
   the documentation and/or other materials provided with the
   distribution.

This work was supported in part by funding from the Defense Advanced
Research Projects Agency, the Office of Naval Research and the National
Science Foundation of the United States of America, and by member
companies of the Carnegie Mellon Sphinx Speech Consortium. We acknowledge
the contributions of many volunteers to the expansion and improvement of
this dictionary.

THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
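Given the entry format described above (word, counter, transcription), a minimal parsing sketch; the parse_entry helper is illustrative only, not part of NLTK, which instead exposes this corpus through nltk.corpus.cmudict:

# Illustrative parser for the format documented in the README above:
# WORD COUNTER PHONE1 PHONE2 ..., with stress digits on the vowels.
def parse_entry(line: str):
    word, counter, transcription = line.split(" ", 2)
    return word, int(counter), transcription.split()

word, variant, phones = parse_entry("NATURAL 1 N AE1 CH ER0 AH0 L")
print(word, variant, phones)
# NATURAL 1 ['N', 'AE1', 'CH', 'ER0', 'AH0', 'L']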
nltk_data/corpora/cmudict/cmudict
ADDED
The diff for this file is too large to render.
nltk_data/taggers/averaged_perceptron_tagger/averaged_perceptron_tagger.pickle
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:25a5a19c7ced7b2bac3831da5bc0afcc2c34e5dd01cd4f361bb799949a696238
size 6138625
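The .pickle files in this upload are stored with Git LFS, so each diff shows only the three-line pointer (spec version, sha256 oid, byte size) rather than the binary itself. A minimal sketch of checking a fetched object against such a pointer; parse_lfs_pointer and verify_object are illustrative helpers, not part of git-lfs:

import hashlib

def parse_lfs_pointer(text: str) -> dict:
    # Each pointer line is "key value"; oid is "sha256:<hex digest>".
    fields = dict(line.split(" ", 1) for line in text.strip().splitlines())
    algo, digest = fields["oid"].split(":", 1)
    return {"version": fields["version"], "algo": algo,
            "digest": digest, "size": int(fields["size"])}

def verify_object(pointer: dict, blob: bytes) -> bool:
    # A fetched object must match both the declared size and the sha256.
    return (len(blob) == pointer["size"]
            and hashlib.sha256(blob).hexdigest() == pointer["digest"])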
nltk_data/tokenizers/punkt/PY3/README
ADDED
@@ -0,0 +1,98 @@
Pretrained Punkt Models -- Jan Strunk (New version trained after issues 313 and 514 had been corrected)

Most models were prepared using the test corpora from Kiss and Strunk (2006). Additional models have
been contributed by various people using NLTK for sentence boundary detection.

For information about how to use these models, please confer the tokenization HOWTO:
http://nltk.googlecode.com/svn/trunk/doc/howto/tokenize.html
and chapter 3.8 of the NLTK book:
http://nltk.googlecode.com/svn/trunk/doc/book/ch03.html#sec-segmentation

There are pretrained tokenizers for the following languages:

File               Language      Source                             Contents                       Size of training corpus (in tokens)   Model contributed by
=============================================================================================================================================================
czech.pickle       Czech         Multilingual Corpus 1 (ECI)        Lidove Noviny                  ~345,000                              Jan Strunk / Tibor Kiss
                                                                    Literarni Noviny
-------------------------------------------------------------------------------------------------------------------------------------------------------------
danish.pickle      Danish        Avisdata CD-Rom Ver. 1.1. 1995     Berlingske Tidende             ~550,000                              Jan Strunk / Tibor Kiss
                                 (Berlingske Avisdata, Copenhagen)  Weekend Avisen
-------------------------------------------------------------------------------------------------------------------------------------------------------------
dutch.pickle       Dutch         Multilingual Corpus 1 (ECI)        De Limburger                   ~340,000                              Jan Strunk / Tibor Kiss
-------------------------------------------------------------------------------------------------------------------------------------------------------------
english.pickle     English       Penn Treebank (LDC)                Wall Street Journal            ~469,000                              Jan Strunk / Tibor Kiss
                   (American)
-------------------------------------------------------------------------------------------------------------------------------------------------------------
estonian.pickle    Estonian      University of Tartu, Estonia       Eesti Ekspress                 ~359,000                              Jan Strunk / Tibor Kiss
-------------------------------------------------------------------------------------------------------------------------------------------------------------
finnish.pickle     Finnish       Finnish Parole Corpus, Finnish     Books and major national       ~364,000                              Jan Strunk / Tibor Kiss
                                 Text Bank (Suomen Kielen           newspapers
                                 Tekstipankki)
                                 Finnish Center for IT Science
                                 (CSC)
-------------------------------------------------------------------------------------------------------------------------------------------------------------
french.pickle      French        Multilingual Corpus 1 (ECI)        Le Monde                       ~370,000                              Jan Strunk / Tibor Kiss
                   (European)
-------------------------------------------------------------------------------------------------------------------------------------------------------------
german.pickle      German        Neue Zürcher Zeitung AG            Neue Zürcher Zeitung           ~847,000                              Jan Strunk / Tibor Kiss
                   (Switzerland)                                    CD-ROM
                   (Uses "ss"
                   instead of "ß")
-------------------------------------------------------------------------------------------------------------------------------------------------------------
greek.pickle       Greek         Efstathios Stamatatos              To Vima (TO BHMA)              ~227,000                              Jan Strunk / Tibor Kiss
-------------------------------------------------------------------------------------------------------------------------------------------------------------
italian.pickle     Italian       Multilingual Corpus 1 (ECI)        La Stampa, Il Mattino          ~312,000                              Jan Strunk / Tibor Kiss
-------------------------------------------------------------------------------------------------------------------------------------------------------------
norwegian.pickle   Norwegian     Centre for Humanities              Bergens Tidende                ~479,000                              Jan Strunk / Tibor Kiss
                   (Bokmål and   Information Technologies,
                   Nynorsk)      Bergen
-------------------------------------------------------------------------------------------------------------------------------------------------------------
polish.pickle      Polish        Polish National Corpus             Literature, newspapers, etc.   ~1,000,000                            Krzysztof Langner
                                 (http://www.nkjp.pl/)
-------------------------------------------------------------------------------------------------------------------------------------------------------------
portuguese.pickle  Portuguese    CETENFolha Corpus                  Folha de São Paulo             ~321,000                              Jan Strunk / Tibor Kiss
                   (Brazilian)   (Linguateca)
-------------------------------------------------------------------------------------------------------------------------------------------------------------
slovene.pickle     Slovene       TRACTOR                            Delo                           ~354,000                              Jan Strunk / Tibor Kiss
                                 Slovene Academy for Arts
                                 and Sciences
-------------------------------------------------------------------------------------------------------------------------------------------------------------
spanish.pickle     Spanish       Multilingual Corpus 1 (ECI)        Sur                            ~353,000                              Jan Strunk / Tibor Kiss
                   (European)
-------------------------------------------------------------------------------------------------------------------------------------------------------------
swedish.pickle     Swedish       Multilingual Corpus 1 (ECI)        Dagens Nyheter                 ~339,000                              Jan Strunk / Tibor Kiss
                                                                    (and some other texts)
-------------------------------------------------------------------------------------------------------------------------------------------------------------
turkish.pickle     Turkish       METU Turkish Corpus                Milliyet                       ~333,000                              Jan Strunk / Tibor Kiss
                                 (Türkçe Derlem Projesi)
                                 University of Ankara
-------------------------------------------------------------------------------------------------------------------------------------------------------------

The corpora contained about 400,000 tokens on average and mostly consisted of newspaper text converted to
Unicode using the codecs module.

Kiss, Tibor and Strunk, Jan (2006): Unsupervised Multilingual Sentence Boundary Detection.
Computational Linguistics 32: 485-525.

---- Training Code ----

# import punkt
import nltk.tokenize.punkt

# Make a new Tokenizer
tokenizer = nltk.tokenize.punkt.PunktSentenceTokenizer()

# Read in training corpus (one example: Slovene)
import codecs
text = codecs.open("slovene.plain", "Ur", "iso-8859-2").read()

# Train tokenizer
tokenizer.train(text)

# Dump pickled tokenizer
import pickle
out = open("slovene.pickle", "wb")
pickle.dump(tokenizer, out)
out.close()

---------
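To consume one of the pickled models shipped in this upload rather than train a new one, a minimal sketch (assuming nltk is installed and this repo's nltk_data directory is put on NLTK's search path, as the test files below arrange via NLTK_DATA_PATH):

import nltk

# Put this repo's nltk_data directory ahead of the default search path.
nltk.data.path.insert(0, "nltk_data")

# Load the pretrained English Punkt model and segment a short text.
tokenizer = nltk.data.load("tokenizers/punkt/english.pickle")
print(tokenizer.tokenize("Dr. Smith arrived at 9 a.m. He left early."))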
nltk_data/tokenizers/punkt/PY3/czech.pickle
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:64b0734b6fbe8e8d7cac79f48d1dd9f853824e57c4e3594dadd74ba2c1d97f50
size 1119050
nltk_data/tokenizers/punkt/PY3/danish.pickle
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:6189c7dd254e29e2bd406a7f6a4336297c8953214792466a790ea4444223ceb3
size 1191710
nltk_data/tokenizers/punkt/PY3/dutch.pickle
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:fda0d6a13f02e8898daec7fe923da88e25abe081bcfa755c0e015075c215fe4c
size 693759
nltk_data/tokenizers/punkt/PY3/english.pickle
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:5cad3758596392364e3be9803dbd7ebeda384b68937b488a01365f5551bb942c
size 406697
nltk_data/tokenizers/punkt/PY3/estonian.pickle
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:b364f72538d17b146a98009ad239a8096ce6c0a8b02958c0bc776ecd0c58a25f
size 1499502
nltk_data/tokenizers/punkt/PY3/finnish.pickle
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:6a4b5ff5500ee851c456f9dd40d5fc0d8c1859c88eb3178de1317d26b7d22833
size 1852226
nltk_data/tokenizers/punkt/PY3/french.pickle
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:28e3a4cd2971989b3cb9fd3433a6f15d17981e464db2be039364313b5de94f29
size 553575
nltk_data/tokenizers/punkt/PY3/german.pickle
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:ddcbbe85e2042a019b1a6e37fd8c153286c38ba201fae0f5bfd9a3f74abae25c
size 1463575
nltk_data/tokenizers/punkt/PY3/greek.pickle
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:85dabc44ab90a5f208ef37ff6b4892ebe7e740f71fb4da47cfd95417ca3e22fd
size 876006
nltk_data/tokenizers/punkt/PY3/italian.pickle
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:68a94007b1e4ffdc4d1a190185ca5442c3dafeb17ab39d30329e84cd74a43947
size 615089
nltk_data/tokenizers/punkt/PY3/malayalam.pickle
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:1f8cf58acbdb7f472ac40affc13663be42dafb47c15030c11ade0444c9e0e53d
size 221207
nltk_data/tokenizers/punkt/PY3/norwegian.pickle
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:4ff7a46d1438b311457d15d7763060b8d3270852c1850fd788c5cee194dc4a1d
size 1181271
nltk_data/tokenizers/punkt/PY3/polish.pickle
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:624900ae3ddfb4854a98c5d3b8b1c9bb719975f33fee61ce1441dab9f8a00718
size 1738386
nltk_data/tokenizers/punkt/PY3/portuguese.pickle
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:02a0b7b25c3c7471e1791b66a31bbb530afbb0160aee4fcecf0107652067b4a1
size 611919
nltk_data/tokenizers/punkt/PY3/russian.pickle
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:549762f8190024d89b511472df21a3a135eee5d9233e63ac244db737c2c61d7e
size 33020
nltk_data/tokenizers/punkt/PY3/slovene.pickle
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:52ef2cc0ed27d79b3aa635cbbc40ad811883a75a4b8a8be1ae406972870fd864
size 734444
nltk_data/tokenizers/punkt/PY3/spanish.pickle
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:164a50fadc5a49f8ec7426eae11d3111ee752b48a3ef373d47745011192a5984
size 562337
nltk_data/tokenizers/punkt/PY3/swedish.pickle
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:b0f7d538bfd5266633b09e842cd92e9e0ac10f1d923bf211e1497972ddc47318
size 979681
nltk_data/tokenizers/punkt/PY3/turkish.pickle
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:ae68ef5863728ac5332e87eb1f6bae772ff32a13a4caa2b01a5c68103e853c5b
size 1017038
nltk_data/tokenizers/punkt/README
ADDED
@@ -0,0 +1,98 @@
Pretrained Punkt Models -- Jan Strunk (New version trained after issues 313 and 514 had been corrected)

Most models were prepared using the test corpora from Kiss and Strunk (2006). Additional models have
been contributed by various people using NLTK for sentence boundary detection.

For information about how to use these models, please confer the tokenization HOWTO:
http://nltk.googlecode.com/svn/trunk/doc/howto/tokenize.html
and chapter 3.8 of the NLTK book:
http://nltk.googlecode.com/svn/trunk/doc/book/ch03.html#sec-segmentation

There are pretrained tokenizers for the following languages:

File               Language      Source                             Contents                       Size of training corpus (in tokens)   Model contributed by
=============================================================================================================================================================
czech.pickle       Czech         Multilingual Corpus 1 (ECI)        Lidove Noviny                  ~345,000                              Jan Strunk / Tibor Kiss
                                                                    Literarni Noviny
-------------------------------------------------------------------------------------------------------------------------------------------------------------
danish.pickle      Danish        Avisdata CD-Rom Ver. 1.1. 1995     Berlingske Tidende             ~550,000                              Jan Strunk / Tibor Kiss
                                 (Berlingske Avisdata, Copenhagen)  Weekend Avisen
-------------------------------------------------------------------------------------------------------------------------------------------------------------
dutch.pickle       Dutch         Multilingual Corpus 1 (ECI)        De Limburger                   ~340,000                              Jan Strunk / Tibor Kiss
-------------------------------------------------------------------------------------------------------------------------------------------------------------
english.pickle     English       Penn Treebank (LDC)                Wall Street Journal            ~469,000                              Jan Strunk / Tibor Kiss
                   (American)
-------------------------------------------------------------------------------------------------------------------------------------------------------------
estonian.pickle    Estonian      University of Tartu, Estonia       Eesti Ekspress                 ~359,000                              Jan Strunk / Tibor Kiss
-------------------------------------------------------------------------------------------------------------------------------------------------------------
finnish.pickle     Finnish       Finnish Parole Corpus, Finnish     Books and major national       ~364,000                              Jan Strunk / Tibor Kiss
                                 Text Bank (Suomen Kielen           newspapers
                                 Tekstipankki)
                                 Finnish Center for IT Science
                                 (CSC)
-------------------------------------------------------------------------------------------------------------------------------------------------------------
french.pickle      French        Multilingual Corpus 1 (ECI)        Le Monde                       ~370,000                              Jan Strunk / Tibor Kiss
                   (European)
-------------------------------------------------------------------------------------------------------------------------------------------------------------
german.pickle      German        Neue Zürcher Zeitung AG            Neue Zürcher Zeitung           ~847,000                              Jan Strunk / Tibor Kiss
                   (Switzerland)                                    CD-ROM
                   (Uses "ss"
                   instead of "ß")
-------------------------------------------------------------------------------------------------------------------------------------------------------------
greek.pickle       Greek         Efstathios Stamatatos              To Vima (TO BHMA)              ~227,000                              Jan Strunk / Tibor Kiss
-------------------------------------------------------------------------------------------------------------------------------------------------------------
italian.pickle     Italian       Multilingual Corpus 1 (ECI)        La Stampa, Il Mattino          ~312,000                              Jan Strunk / Tibor Kiss
-------------------------------------------------------------------------------------------------------------------------------------------------------------
norwegian.pickle   Norwegian     Centre for Humanities              Bergens Tidende                ~479,000                              Jan Strunk / Tibor Kiss
                   (Bokmål and   Information Technologies,
                   Nynorsk)      Bergen
-------------------------------------------------------------------------------------------------------------------------------------------------------------
polish.pickle      Polish        Polish National Corpus             Literature, newspapers, etc.   ~1,000,000                            Krzysztof Langner
                                 (http://www.nkjp.pl/)
-------------------------------------------------------------------------------------------------------------------------------------------------------------
portuguese.pickle  Portuguese    CETENFolha Corpus                  Folha de São Paulo             ~321,000                              Jan Strunk / Tibor Kiss
                   (Brazilian)   (Linguateca)
-------------------------------------------------------------------------------------------------------------------------------------------------------------
slovene.pickle     Slovene       TRACTOR                            Delo                           ~354,000                              Jan Strunk / Tibor Kiss
                                 Slovene Academy for Arts
                                 and Sciences
-------------------------------------------------------------------------------------------------------------------------------------------------------------
spanish.pickle     Spanish       Multilingual Corpus 1 (ECI)        Sur                            ~353,000                              Jan Strunk / Tibor Kiss
                   (European)
-------------------------------------------------------------------------------------------------------------------------------------------------------------
swedish.pickle     Swedish       Multilingual Corpus 1 (ECI)        Dagens Nyheter                 ~339,000                              Jan Strunk / Tibor Kiss
                                                                    (and some other texts)
-------------------------------------------------------------------------------------------------------------------------------------------------------------
turkish.pickle     Turkish       METU Turkish Corpus                Milliyet                       ~333,000                              Jan Strunk / Tibor Kiss
                                 (Türkçe Derlem Projesi)
                                 University of Ankara
-------------------------------------------------------------------------------------------------------------------------------------------------------------

The corpora contained about 400,000 tokens on average and mostly consisted of newspaper text converted to
Unicode using the codecs module.

Kiss, Tibor and Strunk, Jan (2006): Unsupervised Multilingual Sentence Boundary Detection.
Computational Linguistics 32: 485-525.

---- Training Code ----

# import punkt
import nltk.tokenize.punkt

# Make a new Tokenizer
tokenizer = nltk.tokenize.punkt.PunktSentenceTokenizer()

# Read in training corpus (one example: Slovene)
import codecs
text = codecs.open("slovene.plain", "Ur", "iso-8859-2").read()

# Train tokenizer
tokenizer.train(text)

# Dump pickled tokenizer
import pickle
out = open("slovene.pickle", "wb")
pickle.dump(tokenizer, out)
out.close()

---------
nltk_data/tokenizers/punkt/czech.pickle
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:1c085f6283bed0f1390d36a55d126ccc29c9b4dfcd2705e862b1711b7c6bb5ab
size 1424691
nltk_data/tokenizers/punkt/danish.pickle
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:df8366ad67db22b1f838cd63fcc589a6006faf66d7a46be5312d9c487ce2c811
size 1427491
nltk_data/tokenizers/punkt/dutch.pickle
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:12f46024d3c840529b56ac2a3118b80b8dc77705734bcdd71ff7c46f5808395e
size 839761
nltk_data/tokenizers/punkt/english.pickle
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:5e2d25d5adc3ee51ac192ce611bdc5378acae7136af5d3c52c2903c669f9aff0
size 495006
nltk_data/tokenizers/punkt/estonian.pickle
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:9083ef6ef3d5b9992a8a4ea09e889a87be75e2122ad25648307178960634cd8d
size 1803082
nltk_data/tokenizers/punkt/finnish.pickle
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:ce1b4dbe72e400e902220061457f9bd5f491ec37f7af468bc4694980c9623817
size 2192034
nltk_data/tokenizers/punkt/french.pickle
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:7e0be48e38a54232ea88c817cf34c1f1f8f44954e21f118c65af9f2d6a43cdbd
size 664010
nltk_data/tokenizers/punkt/german.pickle
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:251c2f4bde61ab3fc1cabc2158c62e6ab285fddd16267d2d3885f71e3ed61c7f
size 1708012
nltk_data/tokenizers/punkt/greek.pickle
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:8b3a6da093ed2df084ded6dc49c88f101d47a0c69398f19ae50af6785d93b1c5
size 2042362
nltk_data/tokenizers/punkt/italian.pickle
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:41e6aaf554e696703b3d41890973368b9b2f17c342745c07369742928d363731
size 748532
nltk_data/tokenizers/punkt/malayalam.pickle
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:1f8cf58acbdb7f472ac40affc13663be42dafb47c15030c11ade0444c9e0e53d
size 221207
nltk_data/tokenizers/punkt/norwegian.pickle
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:45828b0d57da9a66f107ea277752f6c1cbde51b9f9feba173b2c6e2edb28af21
size 1422756
nltk_data/tokenizers/punkt/polish.pickle
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:79d09a9406f90dbf20f8cbb0a04a7aa0bdb4b71604eda31e97c3df2de5cd2837
size 2287622
nltk_data/tokenizers/punkt/portuguese.pickle
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:c09561e770b6f17e3d85112f83007ff1397dec66c23acb15b9fe046eaefd2e86
size 739845
nltk_data/tokenizers/punkt/russian.pickle
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:bc984432fbe31f7000014f8047502476889169c60f09be5413ca09276b16c909
size 33027
nltk_data/tokenizers/punkt/slovene.pickle
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:2dc83b900e347c16ed0123868369107cd19d1a6125d099e26889580c4dbba277
size 939791
nltk_data/tokenizers/punkt/spanish.pickle
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:61afae663cb2968148e0e27d5a3fcd4a5f19648688800caf8e7f998eaa75f4a7
size 680466
nltk_data/tokenizers/punkt/swedish.pickle
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:e5794208b223b2a54bd4ed565045172f9c6ef80b5bead94f71a5499455cda955
size 1168214
nltk_data/tokenizers/punkt/turkish.pickle
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:d2abb5d7ec4e80aeeb994407254a2e1a0928520727cc25f7bd3fc9ce0b5a78c1
size 1363199
test/models/test_vicuna_chain_agent.py
ADDED
@@ -0,0 +1,95 @@
import sys
import os

sys.path.append(os.path.dirname(os.path.abspath(__file__)) + '/../../')
import asyncio
from argparse import Namespace
from models.loader.args import parser
from models.loader import LoaderCheckPoint

import models.shared as shared

from langchain.chains import LLMChain
from langchain.memory import ConversationBufferMemory, ReadOnlySharedMemory
from langchain.prompts import PromptTemplate
from langchain.agents import ZeroShotAgent, Tool, AgentExecutor
from typing import List, Set


class CustomLLMSingleActionAgent(ZeroShotAgent):
    allowed_tools: List[str]

    def __init__(self, *args, **kwargs):
        super(CustomLLMSingleActionAgent, self).__init__(*args, **kwargs)
        self.allowed_tools = kwargs['allowed_tools']

    def get_allowed_tools(self) -> Set[str]:
        return set(self.allowed_tools)


async def dispatch(args: Namespace):
    args_dict = vars(args)

    shared.loaderCheckPoint = LoaderCheckPoint(args_dict)
    llm_model_ins = shared.loaderLLM()

    template = """This is a conversation between a human and a bot:

{chat_history}

Write a summary of the conversation for {input}:
"""

    prompt = PromptTemplate(
        input_variables=["input", "chat_history"],
        template=template
    )
    memory = ConversationBufferMemory(memory_key="chat_history")
    readonlymemory = ReadOnlySharedMemory(memory=memory)
    summary_chain = LLMChain(
        llm=llm_model_ins,
        prompt=prompt,
        verbose=True,
        memory=readonlymemory,  # use the read-only memory to prevent the tool from modifying the memory
    )

    tools = [
        Tool(
            name="Summary",
            func=summary_chain.run,
            description="useful for when you summarize a conversation. The input to this tool should be a string, representing who will read this summary."
        )
    ]

    prefix = """Have a conversation with a human, answering the following questions as best you can. You have access to the following tools:"""
    suffix = """Begin!

Question: {input}
{agent_scratchpad}"""

    prompt = CustomLLMSingleActionAgent.create_prompt(
        tools,
        prefix=prefix,
        suffix=suffix,
        input_variables=["input", "agent_scratchpad"]
    )
    tool_names = [tool.name for tool in tools]
    llm_chain = LLMChain(llm=llm_model_ins, prompt=prompt)
    agent = CustomLLMSingleActionAgent(llm_chain=llm_chain, tools=tools, allowed_tools=tool_names)
    agent_chain = AgentExecutor.from_agent_and_tools(agent=agent, tools=tools)

    agent_chain.run(input="你好")               # "Hello"
    agent_chain.run(input="你是谁?")            # "Who are you?"
    agent_chain.run(input="我们之前聊了什么?")   # "What did we talk about earlier?"


if __name__ == '__main__':
    args = parser.parse_args(args=['--model-dir', '/media/checkpoint/', '--model', 'vicuna-13b-hf', '--no-remote-model', '--load-in-8bit'])

    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    loop.run_until_complete(dispatch(args))
test/textsplitter/test_zh_title_enhance.py
ADDED
@@ -0,0 +1,21 @@
from configs.model_config import *  # provides os, NLTK_DATA_PATH, EMBEDDING_MODEL, EMBEDDING_DEVICE, embedding_model_dict
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
import nltk
from vectorstores import MyFAISS
from chains.local_doc_qa import load_file


nltk.data.path = [NLTK_DATA_PATH] + nltk.data.path

if __name__ == "__main__":
    filepath = os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(__file__))),
                            "knowledge_base", "samples", "content", "test.txt")
    embeddings = HuggingFaceEmbeddings(model_name=embedding_model_dict[EMBEDDING_MODEL],
                                       model_kwargs={'device': EMBEDDING_DEVICE})

    docs = load_file(filepath, using_zh_title_enhance=True)
    vector_store = MyFAISS.from_documents(docs, embeddings)
    query = "指令提示技术有什么示例"  # "What are some examples of instruction prompting techniques?"
    search_result = vector_store.similarity_search(query)
    print(search_result)
textsplitter/__init__.py
ADDED
@@ -0,0 +1,3 @@
from .chinese_text_splitter import ChineseTextSplitter
from .ali_text_splitter import AliTextSplitter
from .zh_title_enhance import zh_title_enhance
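A minimal usage sketch for the splitters exported here. Assumptions: ChineseTextSplitter follows LangChain's text-splitter interface with a split_text method, and the pdf and sentence_size keyword arguments match this repo's implementation; they may differ.

from textsplitter import ChineseTextSplitter

# sentence_size caps the length of each returned chunk (assumed parameter).
splitter = ChineseTextSplitter(pdf=False, sentence_size=100)
for chunk in splitter.split_text("这是第一句。这是第二句!这是第三句?"):  # three short Chinese sentences
    print(chunk)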
textsplitter/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (319 Bytes).
textsplitter/__pycache__/ali_text_splitter.cpython-310.pyc
ADDED
Binary file (1.39 kB).
textsplitter/__pycache__/chinese_text_splitter.cpython-310.pyc
ADDED
Binary file (2.82 kB).
textsplitter/__pycache__/zh_title_enhance.cpython-310.pyc
ADDED
Binary file (2.86 kB).