Spaces:

XuBailing
/

CongMa2

Runtime error

App Files Files Community

XuBailing committed on Jul 29, 2023

Commit

aeee1e2

•

1 Parent(s): 5ffec70

Upload 183 files

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

nltk_data/corpora/cmudict/README +76 -0
nltk_data/corpora/cmudict/cmudict +0 -0
nltk_data/taggers/averaged_perceptron_tagger/averaged_perceptron_tagger.pickle +3 -0
nltk_data/tokenizers/punkt/PY3/README +98 -0
nltk_data/tokenizers/punkt/PY3/czech.pickle +3 -0
nltk_data/tokenizers/punkt/PY3/danish.pickle +3 -0
nltk_data/tokenizers/punkt/PY3/dutch.pickle +3 -0
nltk_data/tokenizers/punkt/PY3/english.pickle +3 -0
nltk_data/tokenizers/punkt/PY3/estonian.pickle +3 -0
nltk_data/tokenizers/punkt/PY3/finnish.pickle +3 -0
nltk_data/tokenizers/punkt/PY3/french.pickle +3 -0
nltk_data/tokenizers/punkt/PY3/german.pickle +3 -0
nltk_data/tokenizers/punkt/PY3/greek.pickle +3 -0
nltk_data/tokenizers/punkt/PY3/italian.pickle +3 -0
nltk_data/tokenizers/punkt/PY3/malayalam.pickle +3 -0
nltk_data/tokenizers/punkt/PY3/norwegian.pickle +3 -0
nltk_data/tokenizers/punkt/PY3/polish.pickle +3 -0
nltk_data/tokenizers/punkt/PY3/portuguese.pickle +3 -0
nltk_data/tokenizers/punkt/PY3/russian.pickle +3 -0
nltk_data/tokenizers/punkt/PY3/slovene.pickle +3 -0
nltk_data/tokenizers/punkt/PY3/spanish.pickle +3 -0
nltk_data/tokenizers/punkt/PY3/swedish.pickle +3 -0
nltk_data/tokenizers/punkt/PY3/turkish.pickle +3 -0
nltk_data/tokenizers/punkt/README +98 -0
nltk_data/tokenizers/punkt/czech.pickle +3 -0
nltk_data/tokenizers/punkt/danish.pickle +3 -0
nltk_data/tokenizers/punkt/dutch.pickle +3 -0
nltk_data/tokenizers/punkt/english.pickle +3 -0
nltk_data/tokenizers/punkt/estonian.pickle +3 -0
nltk_data/tokenizers/punkt/finnish.pickle +3 -0
nltk_data/tokenizers/punkt/french.pickle +3 -0
nltk_data/tokenizers/punkt/german.pickle +3 -0
nltk_data/tokenizers/punkt/greek.pickle +3 -0
nltk_data/tokenizers/punkt/italian.pickle +3 -0
nltk_data/tokenizers/punkt/malayalam.pickle +3 -0
nltk_data/tokenizers/punkt/norwegian.pickle +3 -0
nltk_data/tokenizers/punkt/polish.pickle +3 -0
nltk_data/tokenizers/punkt/portuguese.pickle +3 -0
nltk_data/tokenizers/punkt/russian.pickle +3 -0
nltk_data/tokenizers/punkt/slovene.pickle +3 -0
nltk_data/tokenizers/punkt/spanish.pickle +3 -0
nltk_data/tokenizers/punkt/swedish.pickle +3 -0
nltk_data/tokenizers/punkt/turkish.pickle +3 -0
test/models/test_vicuna_chain_agent.py +95 -0
test/textsplitter/test_zh_title_enhance.py +21 -0
textsplitter/__init__.py +3 -0
textsplitter/__pycache__/__init__.cpython-310.pyc +0 -0
textsplitter/__pycache__/ali_text_splitter.cpython-310.pyc +0 -0
textsplitter/__pycache__/chinese_text_splitter.cpython-310.pyc +0 -0
textsplitter/__pycache__/zh_title_enhance.cpython-310.pyc +0 -0

nltk_data/corpora/cmudict/README ADDED Viewed

	@@ -0,0 +1,76 @@

+The Carnegie Mellon Pronouncing Dictionary [cmudict.0.7a]
+ftp://ftp.cs.cmu.edu/project/speech/dict/
+https://cmusphinx.svn.sourceforge.net/svnroot/cmusphinx/trunk/cmudict/cmudict.0.7a
+Copyright (C) 1993-2008 Carnegie Mellon University. All rights reserved.
+File Format: Each line consists of an uppercased word,
+a counter (for alternative pronunciations), and a transcription.
+Vowels are marked for stress (1=primary, 2=secondary, 0=no stress).
+E.g.: NATURAL 1 N AE1 CH ER0 AH0 L
+The dictionary contains 127069 entries.  Of these, 119400 words are assigned
+a unique pronunciation, 6830 words have two pronunciations, and 839 words have
+three or more pronunciations.  Many of these are fast-speech variants.
+Phonemes: There are 39 phonemes, as shown below:
+    Phoneme Example Translation    Phoneme Example Translation
+    ------- ------- -----------    ------- ------- -----------
+    AA      odd     AA D           AE      at      AE T
+    AH      hut     HH AH T        AO      ought   AO T
+    AW      cow     K AW           AY      hide    HH AY D
+    B       be      B IY           CH      cheese  CH IY Z
+    D       dee     D IY           DH      thee    DH IY
+    EH      Ed      EH D           ER      hurt    HH ER T
+    EY      ate     EY T           F       fee     F IY
+    G       green   G R IY N       HH      he      HH IY
+    IH      it      IH T           IY      eat     IY T
+    JH      gee     JH IY          K       key     K IY
+    L       lee     L IY           M       me      M IY
+    N       knee    N IY           NG      ping    P IH NG
+    OW      oat     OW T           OY      toy     T OY
+    P       pee     P IY           R       read    R IY D
+    S       sea     S IY           SH      she     SH IY
+    T       tea     T IY           TH      theta   TH EY T AH
+    UH      hood    HH UH D        UW      two     T UW
+    V       vee     V IY           W       we      W IY
+    Y       yield   Y IY L D       Z       zee     Z IY
+    ZH      seizure S IY ZH ER
+(For NLTK, entries have been sorted so that, e.g. FIRE 1 and FIRE 2
+are contiguous, and not separated by FIRE'S 1.)
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+1. Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+   The contents of this file are deemed to be source code.
+2. Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in
+   the documentation and/or other materials provided with the
+   distribution.
+This work was supported in part by funding from the Defense Advanced
+Research Projects Agency, the Office of Naval Research and the National
+Science Foundation of the United States of America, and by member
+companies of the Carnegie Mellon Sphinx Speech Consortium. We acknowledge
+the contributions of many volunteers to the expansion and improvement of
+this dictionary.
+THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
+ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
+NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

nltk_data/corpora/cmudict/cmudict ADDED Viewed

The diff for this file is too large to render. See raw diff

nltk_data/taggers/averaged_perceptron_tagger/averaged_perceptron_tagger.pickle ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:25a5a19c7ced7b2bac3831da5bc0afcc2c34e5dd01cd4f361bb799949a696238
+size 6138625

nltk_data/tokenizers/punkt/PY3/README ADDED Viewed

	@@ -0,0 +1,98 @@

+Pretrained Punkt Models -- Jan Strunk (New version trained after issues 313 and 514 had been corrected)
+Most models were prepared using the test corpora from Kiss and Strunk (2006). Additional models have
+been contributed by various people using NLTK for sentence boundary detection.
+For information about how to use these models, please consult the tokenization HOWTO:
+http://nltk.googlecode.com/svn/trunk/doc/howto/tokenize.html
+and chapter 3.8 of the NLTK book:
+http://nltk.googlecode.com/svn/trunk/doc/book/ch03.html#sec-segmentation
+There are pretrained tokenizers for the following languages:
+File                Language            Source                             Contents                Size of training corpus(in tokens)           Model contributed by
+=======================================================================================================================================================================
+czech.pickle        Czech               Multilingual Corpus 1 (ECI)        Lidove Noviny                   ~345,000                             Jan Strunk / Tibor Kiss
+                                                                           Literarni Noviny
+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
+danish.pickle       Danish              Avisdata CD-Rom Ver. 1.1. 1995     Berlingske Tidende              ~550,000                             Jan Strunk / Tibor Kiss
+                                        (Berlingske Avisdata, Copenhagen)  Weekend Avisen
+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
+dutch.pickle        Dutch               Multilingual Corpus 1 (ECI)        De Limburger                    ~340,000                             Jan Strunk / Tibor Kiss
+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
+english.pickle      English             Penn Treebank (LDC)                Wall Street Journal             ~469,000                             Jan Strunk / Tibor Kiss
+                    (American)
+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
+estonian.pickle     Estonian            University of Tartu, Estonia       Eesti Ekspress                  ~359,000                             Jan Strunk / Tibor Kiss
+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
+finnish.pickle      Finnish             Finnish Parole Corpus, Finnish     Books and major national        ~364,000                             Jan Strunk / Tibor Kiss
+                                        Text Bank (Suomen Kielen           newspapers
+                                        Tekstipankki)
+                                        Finnish Center for IT Science
+                                        (CSC)
+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
+french.pickle       French              Multilingual Corpus 1 (ECI)        Le Monde                        ~370,000                             Jan Strunk / Tibor Kiss
+                    (European)
+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
+german.pickle       German              Neue Zürcher Zeitung AG            Neue Zürcher Zeitung            ~847,000                             Jan Strunk / Tibor Kiss
+                    (Switzerland)       CD-ROM
+                    (Uses "ss"
+                     instead of "ß")
+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
+greek.pickle        Greek               Efstathios Stamatatos              To Vima (TO BHMA)               ~227,000                             Jan Strunk / Tibor Kiss
+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
+italian.pickle      Italian             Multilingual Corpus 1 (ECI)        La Stampa, Il Mattino           ~312,000                             Jan Strunk / Tibor Kiss
+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
+norwegian.pickle    Norwegian           Centre for Humanities              Bergens Tidende                 ~479,000                             Jan Strunk / Tibor Kiss
+                    (Bokmål and         Information Technologies,
+                     Nynorsk)           Bergen
+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
+polish.pickle       Polish              Polish National Corpus             Literature, newspapers, etc.  ~1,000,000                             Krzysztof Langner
+                                        (http://www.nkjp.pl/)
+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
+portuguese.pickle   Portuguese          CETENFolha Corpus                  Folha de São Paulo              ~321,000                             Jan Strunk / Tibor Kiss
+                    (Brazilian)         (Linguateca)
+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
+slovene.pickle      Slovene             TRACTOR                            Delo                            ~354,000                             Jan Strunk / Tibor Kiss
+                                        Slovene Academy for Arts
+                                        and Sciences
+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
+spanish.pickle      Spanish             Multilingual Corpus 1 (ECI)        Sur                             ~353,000                             Jan Strunk / Tibor Kiss
+                    (European)
+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
+swedish.pickle      Swedish             Multilingual Corpus 1 (ECI)        Dagens Nyheter                  ~339,000                             Jan Strunk / Tibor Kiss
+                                                                           (and some other texts)
+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
+turkish.pickle      Turkish             METU Turkish Corpus                Milliyet                        ~333,000                             Jan Strunk / Tibor Kiss
+                                        (Türkçe Derlem Projesi)
+                                        University of Ankara
+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
+The corpora contained about 400,000 tokens on average and mostly consisted of newspaper text converted to
+Unicode using the codecs module.
+Kiss, Tibor and Strunk, Jan (2006): Unsupervised Multilingual Sentence Boundary Detection.
+Computational Linguistics 32: 485-525.
+---- Training Code ----
+# import punkt
+import nltk.tokenize.punkt
+# Make a new Tokenizer
+tokenizer = nltk.tokenize.punkt.PunktSentenceTokenizer()
+# Read in training corpus (one example: Slovene)
+import codecs
+text = codecs.open("slovene.plain","Ur","iso-8859-2").read()
+# Train tokenizer
+tokenizer.train(text)
+# Dump pickled tokenizer
+import pickle
+out = open("slovene.pickle","wb")
+pickle.dump(tokenizer, out)
+out.close()
+---------

nltk_data/tokenizers/punkt/PY3/czech.pickle ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:64b0734b6fbe8e8d7cac79f48d1dd9f853824e57c4e3594dadd74ba2c1d97f50
+size 1119050

nltk_data/tokenizers/punkt/PY3/danish.pickle ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:6189c7dd254e29e2bd406a7f6a4336297c8953214792466a790ea4444223ceb3
+size 1191710

nltk_data/tokenizers/punkt/PY3/dutch.pickle ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:fda0d6a13f02e8898daec7fe923da88e25abe081bcfa755c0e015075c215fe4c
+size 693759

nltk_data/tokenizers/punkt/PY3/english.pickle ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:5cad3758596392364e3be9803dbd7ebeda384b68937b488a01365f5551bb942c
+size 406697

nltk_data/tokenizers/punkt/PY3/estonian.pickle ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b364f72538d17b146a98009ad239a8096ce6c0a8b02958c0bc776ecd0c58a25f
+size 1499502

nltk_data/tokenizers/punkt/PY3/finnish.pickle ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:6a4b5ff5500ee851c456f9dd40d5fc0d8c1859c88eb3178de1317d26b7d22833
+size 1852226

nltk_data/tokenizers/punkt/PY3/french.pickle ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:28e3a4cd2971989b3cb9fd3433a6f15d17981e464db2be039364313b5de94f29
+size 553575

nltk_data/tokenizers/punkt/PY3/german.pickle ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ddcbbe85e2042a019b1a6e37fd8c153286c38ba201fae0f5bfd9a3f74abae25c
+size 1463575

nltk_data/tokenizers/punkt/PY3/greek.pickle ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:85dabc44ab90a5f208ef37ff6b4892ebe7e740f71fb4da47cfd95417ca3e22fd
+size 876006

nltk_data/tokenizers/punkt/PY3/italian.pickle ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:68a94007b1e4ffdc4d1a190185ca5442c3dafeb17ab39d30329e84cd74a43947
+size 615089

nltk_data/tokenizers/punkt/PY3/malayalam.pickle ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1f8cf58acbdb7f472ac40affc13663be42dafb47c15030c11ade0444c9e0e53d
+size 221207

nltk_data/tokenizers/punkt/PY3/norwegian.pickle ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4ff7a46d1438b311457d15d7763060b8d3270852c1850fd788c5cee194dc4a1d
+size 1181271

nltk_data/tokenizers/punkt/PY3/polish.pickle ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:624900ae3ddfb4854a98c5d3b8b1c9bb719975f33fee61ce1441dab9f8a00718
+size 1738386

nltk_data/tokenizers/punkt/PY3/portuguese.pickle ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:02a0b7b25c3c7471e1791b66a31bbb530afbb0160aee4fcecf0107652067b4a1
+size 611919

nltk_data/tokenizers/punkt/PY3/russian.pickle ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:549762f8190024d89b511472df21a3a135eee5d9233e63ac244db737c2c61d7e
+size 33020

nltk_data/tokenizers/punkt/PY3/slovene.pickle ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:52ef2cc0ed27d79b3aa635cbbc40ad811883a75a4b8a8be1ae406972870fd864
+size 734444

nltk_data/tokenizers/punkt/PY3/spanish.pickle ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:164a50fadc5a49f8ec7426eae11d3111ee752b48a3ef373d47745011192a5984
+size 562337

nltk_data/tokenizers/punkt/PY3/swedish.pickle ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b0f7d538bfd5266633b09e842cd92e9e0ac10f1d923bf211e1497972ddc47318
+size 979681

nltk_data/tokenizers/punkt/PY3/turkish.pickle ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ae68ef5863728ac5332e87eb1f6bae772ff32a13a4caa2b01a5c68103e853c5b
+size 1017038

nltk_data/tokenizers/punkt/README ADDED Viewed

	@@ -0,0 +1,98 @@

+Pretrained Punkt Models -- Jan Strunk (New version trained after issues 313 and 514 had been corrected)
+Most models were prepared using the test corpora from Kiss and Strunk (2006). Additional models have
+been contributed by various people using NLTK for sentence boundary detection.
+For information about how to use these models, please consult the tokenization HOWTO:
+http://nltk.googlecode.com/svn/trunk/doc/howto/tokenize.html
+and chapter 3.8 of the NLTK book:
+http://nltk.googlecode.com/svn/trunk/doc/book/ch03.html#sec-segmentation
+There are pretrained tokenizers for the following languages:
+File                Language            Source                             Contents                Size of training corpus(in tokens)           Model contributed by
+=======================================================================================================================================================================
+czech.pickle        Czech               Multilingual Corpus 1 (ECI)        Lidove Noviny                   ~345,000                             Jan Strunk / Tibor Kiss
+                                                                           Literarni Noviny
+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
+danish.pickle       Danish              Avisdata CD-Rom Ver. 1.1. 1995     Berlingske Tidende              ~550,000                             Jan Strunk / Tibor Kiss
+                                        (Berlingske Avisdata, Copenhagen)  Weekend Avisen
+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
+dutch.pickle        Dutch               Multilingual Corpus 1 (ECI)        De Limburger                    ~340,000                             Jan Strunk / Tibor Kiss
+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
+english.pickle      English             Penn Treebank (LDC)                Wall Street Journal             ~469,000                             Jan Strunk / Tibor Kiss
+                    (American)
+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
+estonian.pickle     Estonian            University of Tartu, Estonia       Eesti Ekspress                  ~359,000                             Jan Strunk / Tibor Kiss
+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
+finnish.pickle      Finnish             Finnish Parole Corpus, Finnish     Books and major national        ~364,000                             Jan Strunk / Tibor Kiss
+                                        Text Bank (Suomen Kielen           newspapers
+                                        Tekstipankki)
+                                        Finnish Center for IT Science
+                                        (CSC)
+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
+french.pickle       French              Multilingual Corpus 1 (ECI)        Le Monde                        ~370,000                             Jan Strunk / Tibor Kiss
+                    (European)
+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
+german.pickle       German              Neue Zürcher Zeitung AG            Neue Zürcher Zeitung            ~847,000                             Jan Strunk / Tibor Kiss
+                    (Switzerland)       CD-ROM
+                    (Uses "ss"
+                     instead of "ß")
+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
+greek.pickle        Greek               Efstathios Stamatatos              To Vima (TO BHMA)               ~227,000                             Jan Strunk / Tibor Kiss
+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
+italian.pickle      Italian             Multilingual Corpus 1 (ECI)        La Stampa, Il Mattino           ~312,000                             Jan Strunk / Tibor Kiss
+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
+norwegian.pickle    Norwegian           Centre for Humanities              Bergens Tidende                 ~479,000                             Jan Strunk / Tibor Kiss
+                    (Bokmål and         Information Technologies,
+                     Nynorsk)           Bergen
+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
+polish.pickle       Polish              Polish National Corpus             Literature, newspapers, etc.  ~1,000,000                             Krzysztof Langner
+                                        (http://www.nkjp.pl/)
+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
+portuguese.pickle   Portuguese          CETENFolha Corpus                  Folha de São Paulo              ~321,000                             Jan Strunk / Tibor Kiss
+                    (Brazilian)         (Linguateca)
+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
+slovene.pickle      Slovene             TRACTOR                            Delo                            ~354,000                             Jan Strunk / Tibor Kiss
+                                        Slovene Academy for Arts
+                                        and Sciences
+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
+spanish.pickle      Spanish             Multilingual Corpus 1 (ECI)        Sur                             ~353,000                             Jan Strunk / Tibor Kiss
+                    (European)
+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
+swedish.pickle      Swedish             Multilingual Corpus 1 (ECI)        Dagens Nyheter                  ~339,000                             Jan Strunk / Tibor Kiss
+                                                                           (and some other texts)
+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
+turkish.pickle      Turkish             METU Turkish Corpus                Milliyet                        ~333,000                             Jan Strunk / Tibor Kiss
+                                        (Türkçe Derlem Projesi)
+                                        University of Ankara
+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
+The corpora contained about 400,000 tokens on average and mostly consisted of newspaper text converted to
+Unicode using the codecs module.
+Kiss, Tibor and Strunk, Jan (2006): Unsupervised Multilingual Sentence Boundary Detection.
+Computational Linguistics 32: 485-525.
+---- Training Code ----
+# import punkt
+import nltk.tokenize.punkt
+# Make a new Tokenizer
+tokenizer = nltk.tokenize.punkt.PunktSentenceTokenizer()
+# Read in training corpus (one example: Slovene)
+import codecs
+text = codecs.open("slovene.plain","Ur","iso-8859-2").read()
+# Train tokenizer
+tokenizer.train(text)
+# Dump pickled tokenizer
+import pickle
+out = open("slovene.pickle","wb")
+pickle.dump(tokenizer, out)
+out.close()
+---------

nltk_data/tokenizers/punkt/czech.pickle ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1c085f6283bed0f1390d36a55d126ccc29c9b4dfcd2705e862b1711b7c6bb5ab
+size 1424691

nltk_data/tokenizers/punkt/danish.pickle ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:df8366ad67db22b1f838cd63fcc589a6006faf66d7a46be5312d9c487ce2c811
+size 1427491

nltk_data/tokenizers/punkt/dutch.pickle ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:12f46024d3c840529b56ac2a3118b80b8dc77705734bcdd71ff7c46f5808395e
+size 839761

nltk_data/tokenizers/punkt/english.pickle ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:5e2d25d5adc3ee51ac192ce611bdc5378acae7136af5d3c52c2903c669f9aff0
+size 495006

nltk_data/tokenizers/punkt/estonian.pickle ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9083ef6ef3d5b9992a8a4ea09e889a87be75e2122ad25648307178960634cd8d
+size 1803082

nltk_data/tokenizers/punkt/finnish.pickle ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ce1b4dbe72e400e902220061457f9bd5f491ec37f7af468bc4694980c9623817
+size 2192034

nltk_data/tokenizers/punkt/french.pickle ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7e0be48e38a54232ea88c817cf34c1f1f8f44954e21f118c65af9f2d6a43cdbd
+size 664010

nltk_data/tokenizers/punkt/german.pickle ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:251c2f4bde61ab3fc1cabc2158c62e6ab285fddd16267d2d3885f71e3ed61c7f
+size 1708012

nltk_data/tokenizers/punkt/greek.pickle ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:8b3a6da093ed2df084ded6dc49c88f101d47a0c69398f19ae50af6785d93b1c5
+size 2042362

nltk_data/tokenizers/punkt/italian.pickle ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:41e6aaf554e696703b3d41890973368b9b2f17c342745c07369742928d363731
+size 748532

nltk_data/tokenizers/punkt/malayalam.pickle ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1f8cf58acbdb7f472ac40affc13663be42dafb47c15030c11ade0444c9e0e53d
+size 221207

nltk_data/tokenizers/punkt/norwegian.pickle ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:45828b0d57da9a66f107ea277752f6c1cbde51b9f9feba173b2c6e2edb28af21
+size 1422756

nltk_data/tokenizers/punkt/polish.pickle ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:79d09a9406f90dbf20f8cbb0a04a7aa0bdb4b71604eda31e97c3df2de5cd2837
+size 2287622

nltk_data/tokenizers/punkt/portuguese.pickle ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c09561e770b6f17e3d85112f83007ff1397dec66c23acb15b9fe046eaefd2e86
+size 739845

nltk_data/tokenizers/punkt/russian.pickle ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:bc984432fbe31f7000014f8047502476889169c60f09be5413ca09276b16c909
+size 33027

nltk_data/tokenizers/punkt/slovene.pickle ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:2dc83b900e347c16ed0123868369107cd19d1a6125d099e26889580c4dbba277
+size 939791

nltk_data/tokenizers/punkt/spanish.pickle ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:61afae663cb2968148e0e27d5a3fcd4a5f19648688800caf8e7f998eaa75f4a7
+size 680466

nltk_data/tokenizers/punkt/swedish.pickle ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e5794208b223b2a54bd4ed565045172f9c6ef80b5bead94f71a5499455cda955
+size 1168214

nltk_data/tokenizers/punkt/turkish.pickle ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d2abb5d7ec4e80aeeb994407254a2e1a0928520727cc25f7bd3fc9ce0b5a78c1
+size 1363199

test/models/test_vicuna_chain_agent.py ADDED Viewed

	@@ -0,0 +1,95 @@

+import sys
+import os
+sys.path.append(os.path.dirname(os.path.abspath(__file__)) + '/../../')
+import asyncio
+from argparse import Namespace
+from models.loader.args import parser
+from models.loader import LoaderCheckPoint
+import models.shared as shared
+from langchain.chains import LLMChain
+from langchain.memory import ConversationBufferMemory, ReadOnlySharedMemory
+from langchain.prompts import PromptTemplate
+from langchain.agents import ZeroShotAgent, Tool, AgentExecutor
+from typing import List, Set
+class CustomLLMSingleActionAgent(ZeroShotAgent):
+    allowed_tools: List[str]
+    def __init__(self, *args, **kwargs):
+        super(CustomLLMSingleActionAgent, self).__init__(*args, **kwargs)
+        self.allowed_tools = kwargs['allowed_tools']
+    def get_allowed_tools(self) -> Set[str]:
+        return set(self.allowed_tools)
+async def dispatch(args: Namespace):
+    args_dict = vars(args)
+    shared.loaderCheckPoint = LoaderCheckPoint(args_dict)
+    llm_model_ins = shared.loaderLLM()
+    template = """This is a conversation between a human and a bot:
+{chat_history}
+Write a summary of the conversation for {input}:
+"""
+    prompt = PromptTemplate(
+        input_variables=["input", "chat_history"],
+        template=template
+    )
+    memory = ConversationBufferMemory(memory_key="chat_history")
+    readonlymemory = ReadOnlySharedMemory(memory=memory)
+    summry_chain = LLMChain(
+        llm=llm_model_ins,
+        prompt=prompt,
+        verbose=True,
+        memory=readonlymemory,  # use the read-only memory to prevent the tool from modifying the memory
+    )
+    tools = [
+        Tool(
+            name="Summary",
+            func=summry_chain.run,
+            description="useful for when you summarize a conversation. The input to this tool should be a string, representing who will read this summary."
+        )
+    ]
+    prefix = """Have a conversation with a human, answering the following questions as best you can. You have access to the following tools:"""
+    suffix = """Begin!
+Question: {input}
+{agent_scratchpad}"""
+    prompt = CustomLLMSingleActionAgent.create_prompt(
+        tools,
+        prefix=prefix,
+        suffix=suffix,
+        input_variables=["input",   "agent_scratchpad"]
+    )
+    tool_names = [tool.name for tool in tools]
+    llm_chain = LLMChain(llm=llm_model_ins, prompt=prompt)
+    agent = CustomLLMSingleActionAgent(llm_chain=llm_chain, tools=tools, allowed_tools=tool_names)
+    agent_chain = AgentExecutor.from_agent_and_tools(agent=agent, tools=tools)
+    agent_chain.run(input="你好")
+    agent_chain.run(input="你是谁?")
+    agent_chain.run(input="我们之前聊了什么?")
+if __name__ == '__main__':
+    args = None
+    args = parser.parse_args(args=['--model-dir', '/media/checkpoint/',  '--model', 'vicuna-13b-hf', '--no-remote-model', '--load-in-8bit'])
+    loop = asyncio.new_event_loop()
+    asyncio.set_event_loop(loop)
+    loop.run_until_complete(dispatch(args))

test/textsplitter/test_zh_title_enhance.py ADDED Viewed

	@@ -0,0 +1,21 @@

+from configs.model_config import *
+from langchain.embeddings.huggingface import HuggingFaceEmbeddings
+import nltk
+from vectorstores import MyFAISS
+from chains.local_doc_qa import load_file
+nltk.data.path = [NLTK_DATA_PATH] + nltk.data.path
+if __name__ == "__main__":
+    filepath = os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(__file__))),
+                            "knowledge_base", "samples", "content", "test.txt")
+    embeddings = HuggingFaceEmbeddings(model_name=embedding_model_dict[EMBEDDING_MODEL],
+                                       model_kwargs={'device': EMBEDDING_DEVICE})
+    docs = load_file(filepath, using_zh_title_enhance=True)
+    vector_store = MyFAISS.from_documents(docs, embeddings)
+    query = "指令提示技术有什么示例"
+    search_result = vector_store.similarity_search(query)
+    print(search_result)
+    pass

textsplitter/__init__.py ADDED Viewed

	@@ -0,0 +1,3 @@

+from .chinese_text_splitter import ChineseTextSplitter
+from .ali_text_splitter import AliTextSplitter
+from .zh_title_enhance import zh_title_enhance

textsplitter/__pycache__/__init__.cpython-310.pyc ADDED Viewed

Binary file (319 Bytes). View file

textsplitter/__pycache__/ali_text_splitter.cpython-310.pyc ADDED Viewed

Binary file (1.39 kB). View file

textsplitter/__pycache__/chinese_text_splitter.cpython-310.pyc ADDED Viewed

Binary file (2.82 kB). View file

textsplitter/__pycache__/zh_title_enhance.cpython-310.pyc ADDED Viewed

Binary file (2.86 kB). View file