Spaces:
Build error
Build error
wannaphong
committed on
Commit
•
0903ed1
1
Parent(s):
0f5b3a5
Add file
Browse files- .DS_Store +0 -0
- app.py +22 -0
- pages/Name Entity Recognition.py +35 -0
- pages/pos_tag.py +44 -0
- pages/sent_tokenize.py +35 -0
- pages/soundex.py +34 -0
- pages/subword_tokenize.py +32 -0
- pages/translation.py +48 -0
- pages/transliteration.py +32 -0
- pages/word_tokenize.py +33 -0
- requirements.txt +18 -0
.DS_Store
ADDED
Binary file (6.15 kB). View file
|
|
app.py
ADDED
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import streamlit as st

# Home page of the PyThaiNLP demo site: renders a static introduction only.
_INTRO = """
# PyThaiNLP Demo π

Welcome to PyThaiNLP Demo. This website will give you a example NLP function from PyThaiNLP.

You can see PyThaiNLP at [PyThaiNLP.github.io](https://pythainlp.github.io)


**About PyThaiNLP**

The PyThaiNLP Project is a Thai Natural Language Processing project. We build softwares and datasets for Thai language. Our Main Project is PyThaiNLP.

PyThaiNLP is a Python package for text processing and linguistic analysis, similar to nltk, with focus on Thai language.


We build Thai NLP.

PyThaiNLP
"""

st.markdown(_INTRO)
#st.sidebar.markdown("# Homepage π")
pages/Name Entity Recognition.py
ADDED
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import streamlit as st
import time
from pythainlp.tag import NER

# Demo page: Named Entity Recognition with a user-selected engine and corpus.
st.markdown("""
# Name Entity Recognition

PyThaiNLP support Name Entity Recognition. We have
- thainer - Thai NER engine
- wangchanberta* - wangchanberta model
- tltk - wrapper for TLTK

and trained with corpus

- thainer - Thai NER corpus
- lst20 - lst20 corpus (wangchanberta only)

**Note**: for tltk engine, It's support ner model from tltk only.
""")

_engine = None
with st.form("my_form"):
    st.write("Inside the form")
    text = st.text_area("text")
    # BUG FIX: both selectboxes previously used key=1; duplicate widget keys
    # make Streamlit raise DuplicateWidgetID and the page fails to render.
    # Each widget now gets its own unique key.
    engine = st.selectbox('Select engine', ['thainer', 'wangchanberta', 'tltk'], key="ner_engine", index=0)
    corpus = st.selectbox('Select corpus', ['thainer', 'lst20'], key="ner_corpus", index=0)

    # Every form must have a submit button.
    submitted = st.form_submit_button("Submit")
    if submitted:
        st.subheader("Tags: ")
        # Build the tagger only on submit so the model is loaded lazily.
        _engine = NER(engine=str(engine), corpus=str(corpus))
        st.write(_engine.tag(text, tag=True))

st.write("See the documentation at [NER | PyThaiNLP](https://pythainlp.github.io/docs/3.0/api/tag.html#pythainlp.tag.NER).")
pages/pos_tag.py
ADDED
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import streamlit as st
import time
from pythainlp.tag import pos_tag
from pythainlp.tokenize import word_tokenize

# Demo page: word-tokenize input text, then POS-tag it and show word|tag pairs.
st.markdown("""
# Part of speech tagging π

PyThaiNLP support part-of-speech tagging for analysis text. We have
- perceptron - perceptron tagger (default)
- unigram - unigram tagger
- tltk - TLTK: Thai Language Toolkit (support TNC corpus only. if you choose other corpus, Itβs change to TNC corpus.)
and trained with corpus:
- lst20 - LST20 corpus by National Electronics and Computer Technology Center, Thailand
- orchid - ORCHID corpus, text from Thai academic articles
- pud - Parallel Universal Dependencies (PUD) treebanks, natively use Universal POS tags
- lst20_ud - LST20 text, with tags mapped to Universal POS tag from Universal Dependencies
- orchid_ud - ORCHID text, with tags mapped to Universal POS tags

for this demo page.
""")

with st.form("my_form"):
    st.write("Input text")
    text = st.text_area("text")
    # BUG FIX: all three selectboxes previously used key=1; duplicate widget
    # keys make Streamlit raise DuplicateWidgetID and the page fails to
    # render. Each widget now gets its own unique key.
    word_engine = st.selectbox('Select word tokenize', ['newmm', 'mm', 'longest', 'tltk'], key="pos_word_engine", index=0)
    pos_corpus = st.selectbox('Select POS corpus', ['lst20', 'orchid', 'pud', 'lst20_ud', 'orchid_ud'], key="pos_corpus", index=0)
    pos_engine = st.selectbox('Select Postag engine', ['perceptron', 'unigram', 'tltk'], key="pos_engine", index=0)

    # Every form must have a submit button.
    submitted = st.form_submit_button("Submit")
    if submitted:
        st.subheader("Pos: ")
        start = time.time()
        # Tokenize first; pos_tag expects a list of words, not raw text.
        _list_words = word_tokenize(str(text), engine=str(word_engine))
        _pos = pos_tag(_list_words, corpus=str(pos_corpus), engine=str(pos_engine))
        # Render as "word|TAG word|TAG ..." in a single line.
        _text = ""
        for i, j in _pos:
            _text += str(i) + "|" + str(j) + " "
        end = time.time()
        st.write(_text)
        st.write()
        st.write("Running times: " + str(end - start))

st.write("See the documentation at [pos_tag | PyThaiNLP](https://pythainlp.github.io/docs/3.0/api/tag.html#pythainlp.tag.pos_tag).")
pages/sent_tokenize.py
ADDED
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import streamlit as st
import time
from pythainlp.tokenize import sent_tokenize

# Demo page: split input text into sentences with a user-selected engine.
st.markdown("""
# Sentence tokenization π

PyThaiNLP support Sentence tokenization for NLP piplines. We have

- crfcut - (default) split by CRF trained on TED dataset.
- whitespace+newline - split by whitespaces and newline.
- whitespace - split by whitespaces. Specifiaclly, with regex pattern r" +"
- tltk - split by TLTK.

for this demo page.
""")

with st.form("my_form"):
    st.write("Input text")
    text = st.text_area("text")
    engine = st.selectbox('Select sentence tokenizer', ['crfcut', 'whitespace+newline', 'whitespace', 'tltk'], key=1, index=0)

    # Streamlit forms require an explicit submit button.
    submitted = st.form_submit_button("Submit")
    if submitted:
        st.subheader("Sentences: ")
        start = time.time()
        # One output row per detected sentence.
        for sentence in sent_tokenize(str(text), engine=str(engine)):
            st.write(sentence)
        end = time.time()
        st.write()
        st.write("Running times: " + str(end - start))

st.write("See the documentation at [sent_tokenize | PyThaiNLP](https://pythainlp.github.io/docs/3.0/api/tokenize.html#pythainlp.tokenize.sent_tokenize).")
#st.sidebar.markdown("# Word tokenize π")
pages/soundex.py
ADDED
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import streamlit as st
import time
from pythainlp.soundex import soundex

# Demo page: compute a Thai soundex code for each input line.
# FIX: the algorithm attributions were swapped. Per the PyThaiNLP docs,
# lk82 was proposed by Vichit Lorchirachoonkul (1982) and udom83 by
# Wannee Udompanich (1983).
st.markdown("""
# Soundex π

PyThaiNLP support Soundex for searching or indexing. We have

- udom83 (default) - Thai soundex algorithm proposed by Wannee Udompanich.
- lk82 - Thai soundex algorithm proposed by Vichit Lorchirachoonkul.
- metasound - Thai soundex algorithm based on a combination of Metaphone and Soundex proposed by Snae & BrΓΌckner.

for this demo page.
""")

with st.form("my_form"):
    st.write("Inside the form")
    text = st.text_area("text")
    engine = st.selectbox('Select soundex', ['udom83', 'lk82', 'metasound'], key=1, index=0)

    # Every form must have a submit button.
    submitted = st.form_submit_button("Submit")
    if submitted:
        st.subheader("Soundex: ")
        start = time.time()
        # Soundex works on single words/lines, so encode each line separately.
        for i in str(text).splitlines():
            _temp = soundex(str(i), engine=str(engine))
            st.write(_temp)
        end = time.time()
        st.write()
        st.write("Running times: " + str(end - start))

st.markdown("See the documentation at [https://pythainlp.github.io/docs/3.0/api/soundex.html](https://pythainlp.github.io/docs/3.0/api/soundex.html).")
# st.sidebar.markdown("# Soundex π")
pages/subword_tokenize.py
ADDED
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import streamlit as st
import time
from pythainlp.tokenize import subword_tokenize

# Demo page: split input text into subword units with a user-selected engine.
st.markdown("""
# Subword tokenization π

PyThaiNLP support Subword tokenization for NLP piplines. We have

- tcc (default) - Thai Character Cluster (Theeramunkong et al. 2000)
- etcc - Enhanced Thai Character Cluster (Inrut et al. 2001)
- dict - newmm word tokenizer with a syllable dictionary
- ssg - CRF syllable segmenter for Thai
- tltk - syllable tokenizer from tltk

for this demo page.
""")
with st.form("my_form"):
    st.write("Input text")
    raw_text = st.text_area("text")
    engine = st.selectbox('Select word tokenizition', ['tcc', 'etcc', 'dict', 'ssg', 'tltk'], key=1, index=0)

    # Streamlit forms require an explicit submit button.
    submitted = st.form_submit_button("Submit")
    if submitted:
        st.subheader("Subwords: ")
        start = time.time()
        # Show the subword units space-separated on one line.
        subwords = subword_tokenize(str(raw_text), engine=str(engine))
        st.write(" ".join(subwords))
        end = time.time()
        st.write()
        st.write("Running times: " + str(end - start))

st.write("See the documentation at [subword_tokenize | PyThaiNLP](https://pythainlp.github.io/docs/3.0/api/tokenize.html#pythainlp.tokenize.subword_tokenize).")
pages/translation.py
ADDED
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import streamlit as st
import time
from pythainlp.translate import Translate

# Demo page: machine translation between the supported language pairs.
st.markdown("""
# Translation π

PyThaiNLP support machine translation for translate text. We have

- th2en - (default) Thai to English translation
- en2th - English to Thai translation
- th2zh - Thai to Chinese translation
- zh2th - Chinese to Thai translation
- th2fr - Thai to French translation

for this demo page. It will use many times for running model.
""")

# UI choice -> (source language, target language) for Translate.
_LANG_PAIRS = {
    "th2en": ("th", "en"),
    "en2th": ("en", "th"),
    "zh2th": ("zh", "th"),
    "th2zh": ("th", "zh"),
    "th2fr": ("th", "fr"),
}

_engine = None
with st.form("my_form"):
    st.write("Inside the form")
    text = st.text_area("text")
    engine = st.selectbox('Select', ['th2en', 'en2th', 'zh2th', 'th2zh', 'th2fr'], key=1, index=0)

    # Streamlit forms require an explicit submit button.
    submitted = st.form_submit_button("Submit")
    if submitted:
        st.subheader("Text: ")
        start = time.time()
        if engine in ("th2en", "en2th"):
            # The English<->Thai models are downloaded explicitly before use.
            from pythainlp.translate.en_th import download_model_all
            download_model_all()
        src, tgt = _LANG_PAIRS[engine]
        _engine = Translate(src, tgt)
        st.write(_engine.translate(str(text)))
        end = time.time()
        st.write()
        st.write("Running times: " + str(end - start))

st.write("See the documentation at [translate | PyThaiNLP](https://pythainlp.github.io/docs/3.0/api/translate.html).")
pages/transliteration.py
ADDED
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import streamlit as st
import time
from pythainlp.transliterate import transliterate

# Demo page: grapheme-to-phoneme transliteration of Thai text.
# FIX: the page title previously said "Translation" (copy-pasted from the
# translation page); this page is about transliteration.
st.markdown("""
# Transliteration π

PyThaiNLP support transliterate text for NLP piplines. We have

- thaig2p - (default) Thai Grapheme-to-Phoneme, output is IPA (require PyTorch)
- tltk_g2p - Thai Grapheme-to-Phoneme from TLTK <https://pypi.org/project/tltk/>_.,
- tltk_ipa - tltk, output is International Phonetic Alphabet (IPA)

for this demo page. You can custom dictionary for some word tokenizer engine. (Python only)
""")

with st.form("my_form"):
    st.write("Inside the form")
    text = st.text_input("text")
    engine = st.selectbox('Select transliterate', ['thaig2p', 'tltk_g2p', 'tltk_ipa'], key=1, index=0)

    # Every form must have a submit button.
    submitted = st.form_submit_button("Submit")
    if submitted:
        st.subheader("Words: ")
        start = time.time()
        st.write(transliterate(str(text), engine=str(engine)))
        end = time.time()
        st.write()
        st.write("Running times: " + str(end - start))

st.write("See the documentation at [transliterate | PyThaiNLP](https://pythainlp.github.io/docs/3.0/api/transliterate.html).")
pages/word_tokenize.py
ADDED
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import streamlit as st
import time
from pythainlp.tokenize import word_tokenize

# Demo page: split input text into words with a user-selected engine.
st.markdown("""
# Word tokenization π

PyThaiNLP support Word tokenization for NLP piplines. We have

- newmm (default) - dictionary-based, Maximum Matching + Thai Character Cluster
- mm - dictionary-based, Maximum Matching
- longest - dictionary-based, Longest Matching
- tltk - wrapper for TLTK.

for this demo page. You can custom dictionary for some word tokenizer engine. (Python only)
""")
with st.form("my_form"):
    st.write("Input text")
    text = st.text_area("text")
    engine = st.selectbox('Select word tokenizition', ['newmm', 'mm', 'longest', 'tltk'], key=1, index=0)

    # Streamlit forms require an explicit submit button.
    submitted = st.form_submit_button("Submit")
    if submitted:
        st.subheader("Words: ")
        start = time.time()
        # Show the tokens space-separated on one line.
        tokens = word_tokenize(str(text), engine=str(engine))
        st.write(" ".join(tokens))
        end = time.time()
        st.write()
        st.write("Running times: " + str(end - start))

st.write("See the documentation at [word_tokenize | PyThaiNLP](https://pythainlp.github.io/docs/3.0/api/tokenize.html#pythainlp.tokenize.word_tokenize).")
#st.sidebar.markdown("# Word tokenize π")
requirements.txt
ADDED
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
PyYAML
|
2 |
+
numpy
|
3 |
+
python-crfsuite
|
4 |
+
requests
|
5 |
+
tinydb
|
6 |
+
nltk
|
7 |
+
emoji
|
8 |
+
sacremoses
|
9 |
+
sentencepiece
|
10 |
+
ssg
|
11 |
+
torch
|
12 |
+
transformers
|
13 |
+
fairseq==0.10.2
|
14 |
+
tensorflow
|
15 |
+
pandas
|
16 |
+
tltk
|
17 |
+
streamlit
|
18 |
+
pythainlp==3.0.8
|