wannaphong commited on
Commit
0903ed1
•
1 Parent(s): 0f5b3a5
.DS_Store ADDED
Binary file (6.15 kB). View file
 
app.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import streamlit as st

# Landing page of the PyThaiNLP demo app. It renders one static Markdown
# introduction; the text is kept in a named constant so the rendering call
# below stays a one-liner.
HOME_PAGE_MARKDOWN = """
# PyThaiNLP Demo 🎈

Welcome to PyThaiNLP Demo. This website will give you a example NLP function from PyThaiNLP.

You can see PyThaiNLP at [PyThaiNLP.github.io](https://pythainlp.github.io)


**About PyThaiNLP**

The PyThaiNLP Project is a Thai Natural Language Processing project. We build softwares and datasets for Thai language. Our Main Project is PyThaiNLP.

PyThaiNLP is a Python package for text processing and linguistic analysis, similar to nltk, with focus on Thai language.


We build Thai NLP.

PyThaiNLP
"""

st.markdown(HOME_PAGE_MARKDOWN)
#st.sidebar.markdown("# Homepage 🎈")
pages/Name Entity Recognition.py ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import streamlit as st
import time
from pythainlp.tag import NER

# Streamlit demo page for PyThaiNLP named-entity recognition.
# The user enters text, picks an engine and a training corpus, and the
# tagged text returned by NER.tag(..., tag=True) is written to the page.
st.markdown("""
# Name Entity Recognition

PyThaiNLP support Name Entity Recognition. We have
- thainer - Thai NER engine
- wangchanberta* - wangchanberta model
- tltk - wrapper for TLTK

and trained with corpus

- thainer - Thai NER corpus
- lst20 - lst20 corpus (wangchanberta only)

**Note**: for tltk engine, It's support ner model from tltk only.
""")

with st.form("my_form"):
    st.write("Inside the form")
    text = st.text_area("text")
    # Each widget needs its own key: the two selectboxes previously shared
    # key=1, which Streamlit rejects as a duplicate widget key.
    engine = st.selectbox(
        'Select engine',
        ['thainer', 'wangchanberta', 'tltk'],
        key="ner_engine",
        index=0,
    )
    corpus = st.selectbox(
        'Select corpus',
        ['thainer', 'lst20'],
        key="ner_corpus",
        index=0,
    )

    # Every form must have a submit button.
    submitted = st.form_submit_button("Submit")
    if submitted:
        st.subheader("Tags: ")
        # The tagger is built on demand from the submitted selections.
        tagger = NER(engine=str(engine), corpus=str(corpus))
        st.write(tagger.tag(text, tag=True))

st.write("See the documentation at [NER | PyThaiNLP](https://pythainlp.github.io/docs/3.0/api/tag.html#pythainlp.tag.NER).")
pages/pos_tag.py ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import streamlit as st
import time
from pythainlp.tag import pos_tag
from pythainlp.tokenize import word_tokenize

# Streamlit demo page for PyThaiNLP part-of-speech tagging.
# The submitted text is tokenized with the chosen word tokenizer, tagged
# with the chosen corpus/engine, and shown as "word|TAG" pairs together
# with the elapsed wall-clock time.
st.markdown("""
# Part of speech tagging 🎉

PyThaiNLP support part-of-speech tagging for analysis text. We have
- perceptron - perceptron tagger (default)
- unigram - unigram tagger
- tltk - TLTK: Thai Language Toolkit (support TNC corpus only. if you choose other corpus, It’s change to TNC corpus.)

and trained with corpus:
- lst20 - LST20 corpus by National Electronics and Computer Technology Center, Thailand
- orchid - ORCHID corpus, text from Thai academic articles
- pud - Parallel Universal Dependencies (PUD) treebanks, natively use Universal POS tags
- lst20_ud - LST20 text, with tags mapped to Universal POS tag from Universal Dependencies
- orchid_ud - ORCHID text, with tags mapped to Universal POS tags

for this demo page.
""")

with st.form("my_form"):
    st.write("Input text")
    text = st.text_area("text")
    # Each widget needs its own key: all three selectboxes previously
    # shared key=1, which Streamlit rejects as a duplicate widget key.
    word_engine = st.selectbox(
        'Select word tokenize',
        ['newmm', 'mm', 'longest', 'tltk'],
        key="pos_word_engine",
        index=0,
    )
    pos_corpus = st.selectbox(
        'Select POS corpus',
        ['lst20', 'orchid', 'pud', 'lst20_ud', 'orchid_ud'],
        key="pos_corpus",
        index=0,
    )
    pos_engine = st.selectbox(
        'Select Postag engine',
        ['perceptron', 'unigram', 'tltk'],
        key="pos_engine",
        index=0,
    )

    # Every form must have a submit button.
    submitted = st.form_submit_button("Submit")
    if submitted:
        st.subheader("Pos: ")
        start = time.time()
        words = word_tokenize(str(text), engine=str(word_engine))
        tagged = pos_tag(words, corpus=str(pos_corpus), engine=str(pos_engine))
        # One "word|TAG " chunk per token, trailing space included.
        rendered = "".join(f"{word}|{tag} " for word, tag in tagged)
        end = time.time()
        st.write(rendered)
        st.write()
        st.write("Running times: "+str(end - start))

st.write("See the documentation at [pos_tag | PyThaiNLP](https://pythainlp.github.io/docs/3.0/api/tag.html#pythainlp.tag.pos_tag).")
pages/sent_tokenize.py ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import streamlit as st
import time
from pythainlp.tokenize import sent_tokenize

# Streamlit demo page for PyThaiNLP sentence tokenization.
# The submitted text is split with the chosen engine and every sentence is
# written on its own line, followed by the elapsed wall-clock time.
st.markdown("""
# Sentence tokenization 🎉

PyThaiNLP support Sentence tokenization for NLP pipelines. We have

- crfcut - (default) split by CRF trained on TED dataset.
- whitespace+newline - split by whitespaces and newline.
- whitespace - split by whitespaces. Specifically, with regex pattern r" +"
- tltk - split by TLTK.

for this demo page.
""")

with st.form("my_form"):
    st.write("Input text")
    text = st.text_area("text")
    engine = st.selectbox(
        'Select sentence tokenizer',
        ['crfcut', 'whitespace+newline', 'whitespace', 'tltk'],
        key=1,
        index=0,
    )

    # Every form must have a submit button.
    submitted = st.form_submit_button("Submit")
    if submitted:
        st.subheader("Sentences: ")
        start = time.time()
        sentences = sent_tokenize(str(text), engine=str(engine))
        for sentence in sentences:
            st.write(sentence)
        # The measured span includes writing the sentences to the page.
        end = time.time()
        st.write()
        st.write("Running times: "+str(end - start))

st.write("See the documentation at [sent_tokenize | PyThaiNLP](https://pythainlp.github.io/docs/3.0/api/tokenize.html#pythainlp.tokenize.sent_tokenize).")
pages/soundex.py ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import streamlit as st
import time
from pythainlp.soundex import soundex

# Streamlit demo page for PyThaiNLP soundex.
# Each line of the submitted text is encoded separately with the chosen
# engine and the codes are written one per line, followed by the elapsed
# wall-clock time.
# NOTE(review): the udom83 / lk82 attributions below were swapped relative to
# the original text (udom83 -> Udompanich 1983, lk82 -> Lorchirachoonkul
# 1982); confirm against the PyThaiNLP soundex references.
st.markdown("""
# Soundex 🎉

PyThaiNLP support Soundex for searching or indexing. We have

- udom83 (default) - Thai soundex algorithm proposed by Wannee Udompanich.
- lk82 - Thai soundex algorithm proposed by Vichit Lorchirachoonkul.
- metasound - Thai soundex algorithm based on a combination of Metaphone and Soundex proposed by Snae & Brückner.

for this demo page.
""")

with st.form("my_form"):
    st.write("Inside the form")
    text = st.text_area("text")
    engine = st.selectbox(
        'Select soundex',
        ['udom83', 'lk82', 'metasound'],
        key=1,
        index=0,
    )

    # Every form must have a submit button.
    submitted = st.form_submit_button("Submit")
    if submitted:
        st.subheader("Soundex: ")
        start = time.time()
        # One soundex code per input line.
        for line in str(text).splitlines():
            st.write(soundex(str(line), engine=str(engine)))
        end = time.time()
        st.write()
        st.write("Running times: "+str(end - start))

st.markdown("See the documentation at [https://pythainlp.github.io/docs/3.0/api/soundex.html](https://pythainlp.github.io/docs/3.0/api/soundex.html).")
pages/subword_tokenize.py ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import streamlit as st
import time
from pythainlp.tokenize import subword_tokenize

# Streamlit demo page for PyThaiNLP subword tokenization.
# The submitted text is split with the chosen engine and the subwords are
# shown space-separated, followed by the elapsed wall-clock time.
st.markdown("""
# Subword tokenization 🎉

PyThaiNLP support Subword tokenization for NLP pipelines. We have

- tcc (default) - Thai Character Cluster (Theeramunkong et al. 2000)
- etcc - Enhanced Thai Character Cluster (Inrut et al. 2001)
- dict - newmm word tokenizer with a syllable dictionary
- ssg - CRF syllable segmenter for Thai
- tltk - syllable tokenizer from tltk

for this demo page.
""")
with st.form("my_form"):
    st.write("Input text")
    _text = st.text_area("text")
    # The label previously read "Select word tokenizition", copied from the
    # word tokenizer page.
    engine = st.selectbox(
        'Select subword tokenization',
        ['tcc', 'etcc', 'dict', 'ssg', 'tltk'],
        key=1,
        index=0,
    )

    # Every form must have a submit button.
    submitted = st.form_submit_button("Submit")
    if submitted:
        st.subheader("Subwords: ")
        start = time.time()
        subwords = subword_tokenize(str(_text), engine=str(engine))
        st.write(' '.join(subwords))
        end = time.time()
        st.write()
        st.write("Running times: "+str(end - start))

st.write("See the documentation at [subword_tokenize | PyThaiNLP](https://pythainlp.github.io/docs/3.0/api/tokenize.html#pythainlp.tokenize.subword_tokenize).")
pages/translation.py ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import streamlit as st
import time
from pythainlp.translate import Translate

# Streamlit demo page for PyThaiNLP machine translation.
# The selected direction is mapped to a (source, target) language pair, a
# Translate object is built for it, and the translated text is written to
# the page together with the elapsed wall-clock time.
st.markdown("""
# Translation 🎉

PyThaiNLP support machine translation for translate text. We have

- th2en - (default) Thai to English translation
- en2th - English to Thai translation
- th2zh - Thai to Chinese translation
- zh2th - Chinese to Thai translation
- th2fr - Thai to French translation

for this demo page. It will use many times for running model.
""")

# Selectbox option -> (source language, target language) for Translate().
_LANGUAGE_PAIRS = {
    "th2en": ("th", "en"),
    "en2th": ("en", "th"),
    "zh2th": ("zh", "th"),
    "th2zh": ("th", "zh"),
    "th2fr": ("th", "fr"),
}
# Directions for which the en_th models are downloaded before translating.
_NEEDS_EN_TH_MODELS = ("th2en", "en2th")

with st.form("my_form"):
    st.write("Inside the form")
    text = st.text_area("text")
    engine = st.selectbox(
        'Select',
        ['th2en', 'en2th', 'zh2th', 'th2zh', 'th2fr'],
        key=1,
        index=0,
    )

    # Every form must have a submit button.
    submitted = st.form_submit_button("Submit")
    if submitted:
        st.subheader("Text: ")
        start = time.time()
        if engine in _NEEDS_EN_TH_MODELS:
            # Imported lazily so the other directions do not load en_th.
            from pythainlp.translate.en_th import download_model_all
            download_model_all()
        source_lang, target_lang = _LANGUAGE_PAIRS[str(engine)]
        translator = Translate(source_lang, target_lang)
        st.write(translator.translate(str(text)))
        end = time.time()
        st.write()
        st.write("Running times: "+str(end - start))

st.write("See the documentation at [translate | PyThaiNLP](https://pythainlp.github.io/docs/3.0/api/translate.html).")
pages/transliteration.py ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import streamlit as st
import time
from pythainlp.transliterate import transliterate

# Streamlit demo page for PyThaiNLP transliteration.
# The submitted text is transliterated with the chosen engine and the
# result is written to the page with the elapsed wall-clock time.
# The page title previously read "Translation" (copied from the translation
# page) and the tltk_g2p bullet carried leftover reST link markup.
st.markdown("""
# Transliteration 🎉

PyThaiNLP support transliterate text for NLP pipelines. We have

- thaig2p - (default) Thai Grapheme-to-Phoneme, output is IPA (require PyTorch)
- tltk_g2p - Thai Grapheme-to-Phoneme from [TLTK](https://pypi.org/project/tltk/)
- tltk_ipa - tltk, output is International Phonetic Alphabet (IPA)

for this demo page. You can custom dictionary for some word tokenizer engine. (Python only)
""")

with st.form("my_form"):
    st.write("Inside the form")
    text = st.text_input("text")
    engine = st.selectbox(
        'Select transliterate',
        ['thaig2p', 'tltk_g2p', 'tltk_ipa'],
        key=1,
        index=0,
    )

    # Every form must have a submit button.
    submitted = st.form_submit_button("Submit")
    if submitted:
        st.subheader("Words: ")
        start = time.time()
        st.write(transliterate(str(text), engine=str(engine)))
        end = time.time()
        st.write()
        st.write("Running times: "+str(end - start))

st.write("See the documentation at [transliterate | PyThaiNLP](https://pythainlp.github.io/docs/3.0/api/transliterate.html).")
pages/word_tokenize.py ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import streamlit as st
import time
from pythainlp.tokenize import word_tokenize

# Streamlit demo page for PyThaiNLP word tokenization.
# The submitted text is tokenized with the chosen engine and the words are
# shown space-separated, followed by the elapsed wall-clock time.
st.markdown("""
# Word tokenization 🎉

PyThaiNLP support Word tokenization for NLP pipelines. We have

- newmm (default) - dictionary-based, Maximum Matching + Thai Character Cluster
- mm - dictionary-based, Maximum Matching
- longest - dictionary-based, Longest Matching
- tltk - wrapper for TLTK.

for this demo page. You can custom dictionary for some word tokenizer engine. (Python only)
""")
with st.form("my_form"):
    st.write("Input text")
    text = st.text_area("text")
    engine = st.selectbox(
        'Select word tokenization',
        ['newmm', 'mm', 'longest', 'tltk'],
        key=1,
        index=0,
    )

    # Every form must have a submit button.
    submitted = st.form_submit_button("Submit")
    if submitted:
        st.subheader("Words: ")
        start = time.time()
        words = word_tokenize(str(text), engine=str(engine))
        st.write(' '.join(words))
        end = time.time()
        st.write()
        st.write("Running times: "+str(end - start))

st.write("See the documentation at [word_tokenize | PyThaiNLP](https://pythainlp.github.io/docs/3.0/api/tokenize.html#pythainlp.tokenize.word_tokenize).")
requirements.txt ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ PyYAML
2
+ numpy
3
+ python-crfsuite
4
+ requests
5
+ tinydb
6
+ nltk
7
+ emoji
8
+ sacremoses
9
+ sentencepiece
10
+ ssg
11
+ torch
12
+ transformers
13
+ fairseq==0.10.2
14
+ tensorflow
15
+ pandas
16
+ tltk
17
+ streamlit
18
+ pythainlp==3.0.8