Spaces:
Build error
Build error
wannaphong
committed on
Commit
•
0903ed1
1
Parent(s):
0f5b3a5
Add file
Browse files- .DS_Store +0 -0
- app.py +22 -0
- pages/Name Entity Recognition.py +35 -0
- pages/pos_tag.py +44 -0
- pages/sent_tokenize.py +35 -0
- pages/soundex.py +34 -0
- pages/subword_tokenize.py +32 -0
- pages/translation.py +48 -0
- pages/transliteration.py +32 -0
- pages/word_tokenize.py +33 -0
- requirements.txt +18 -0
.DS_Store
ADDED
Binary file (6.15 kB). View file
|
|
app.py
ADDED
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import streamlit as st

# Home page of the PyThaiNLP demo site: renders a static introduction only.
_INTRO = """
# PyThaiNLP Demo π

Welcome to PyThaiNLP Demo. This website will give you a example NLP function from PyThaiNLP.

You can see PyThaiNLP at [PyThaiNLP.github.io](https://pythainlp.github.io)


**About PyThaiNLP**

The PyThaiNLP Project is a Thai Natural Language Processing project. We build softwares and datasets for Thai language. Our Main Project is PyThaiNLP.

PyThaiNLP is a Python package for text processing and linguistic analysis, similar to nltk, with focus on Thai language.


We build Thai NLP.

PyThaiNLP
"""

st.markdown(_INTRO)
#st.sidebar.markdown("# Homepage π")
pages/Name Entity Recognition.py
ADDED
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import streamlit as st
import time
from pythainlp.tag import NER

# Demo page: Named Entity Recognition with a user-selected engine and corpus.
st.markdown("""
# Name Entity Recognition

PyThaiNLP support Name Entity Recognition. We have
- thainer - Thai NER engine
- wangchanberta* - wangchanberta model
- tltk - wrapper for TLTK

and trained with corpus

- thainer - Thai NER corpus
- lst20 - lst20 corpus (wangchanberta only)

**Note**: for tltk engine, It's support ner model from tltk only.
""")

_engine = None
with st.form("my_form"):
    st.write("Inside the form")
    text = st.text_area("text")
    # BUG FIX: both selectboxes previously used key=1; duplicate widget keys
    # make Streamlit raise DuplicateWidgetID and the page fails to render.
    # Each widget now gets its own unique key.
    engine = st.selectbox('Select engine', ['thainer', 'wangchanberta', 'tltk'], key="ner_engine", index=0)
    corpus = st.selectbox('Select corpus', ['thainer', 'lst20'], key="ner_corpus", index=0)

    # Every form must have a submit button.
    submitted = st.form_submit_button("Submit")
    if submitted:
        st.subheader("Tags: ")
        # Build the tagger only on submit so the model is loaded lazily.
        _engine = NER(engine=str(engine), corpus=str(corpus))
        st.write(_engine.tag(text, tag=True))

st.write("See the documentation at [NER | PyThaiNLP](https://pythainlp.github.io/docs/3.0/api/tag.html#pythainlp.tag.NER).")
pages/pos_tag.py
ADDED
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import streamlit as st
import time
from pythainlp.tag import pos_tag
from pythainlp.tokenize import word_tokenize

# Demo page: word-tokenize input text, then POS-tag it and show word|tag pairs.
st.markdown("""
# Part of speech tagging π

PyThaiNLP support part-of-speech tagging for analysis text. We have
- perceptron - perceptron tagger (default)
- unigram - unigram tagger
- tltk - TLTK: Thai Language Toolkit (support TNC corpus only. if you choose other corpus, Itβs change to TNC corpus.)
and trained with corpus:
- lst20 - LST20 corpus by National Electronics and Computer Technology Center, Thailand
- orchid - ORCHID corpus, text from Thai academic articles
- pud - Parallel Universal Dependencies (PUD) treebanks, natively use Universal POS tags
- lst20_ud - LST20 text, with tags mapped to Universal POS tag from Universal Dependencies
- orchid_ud - ORCHID text, with tags mapped to Universal POS tags

for this demo page.
""")

with st.form("my_form"):
    st.write("Input text")
    text = st.text_area("text")
    # BUG FIX: all three selectboxes previously used key=1; duplicate widget
    # keys make Streamlit raise DuplicateWidgetID and the page fails to
    # render. Each widget now gets its own unique key.
    word_engine = st.selectbox('Select word tokenize', ['newmm', 'mm', 'longest', 'tltk'], key="pos_word_engine", index=0)
    pos_corpus = st.selectbox('Select POS corpus', ['lst20', 'orchid', 'pud', 'lst20_ud', 'orchid_ud'], key="pos_corpus", index=0)
    pos_engine = st.selectbox('Select Postag engine', ['perceptron', 'unigram', 'tltk'], key="pos_engine", index=0)

    # Every form must have a submit button.
    submitted = st.form_submit_button("Submit")
    if submitted:
        st.subheader("Pos: ")
        start = time.time()
        # Tokenize first; pos_tag expects a list of words, not raw text.
        _list_words = word_tokenize(str(text), engine=str(word_engine))
        _pos = pos_tag(_list_words, corpus=str(pos_corpus), engine=str(pos_engine))
        # Render as "word|TAG word|TAG ..." in a single line.
        _text = ""
        for i, j in _pos:
            _text += str(i) + "|" + str(j) + " "
        end = time.time()
        st.write(_text)
        st.write()
        st.write("Running times: " + str(end - start))

st.write("See the documentation at [pos_tag | PyThaiNLP](https://pythainlp.github.io/docs/3.0/api/tag.html#pythainlp.tag.pos_tag).")
pages/sent_tokenize.py
ADDED
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import streamlit as st
import time
from pythainlp.tokenize import sent_tokenize

# Demo page: split input text into sentences with a user-selected engine.
st.markdown("""
# Sentence tokenization π

PyThaiNLP support Sentence tokenization for NLP piplines. We have

- crfcut - (default) split by CRF trained on TED dataset.
- whitespace+newline - split by whitespaces and newline.
- whitespace - split by whitespaces. Specifiaclly, with regex pattern r" +"
- tltk - split by TLTK.

for this demo page.
""")

with st.form("my_form"):
    st.write("Input text")
    text = st.text_area("text")
    engine = st.selectbox('Select sentence tokenizer', ['crfcut', 'whitespace+newline', 'whitespace', 'tltk'], key=1, index=0)

    # Streamlit forms require an explicit submit button.
    submitted = st.form_submit_button("Submit")
    if submitted:
        st.subheader("Sentences: ")
        start = time.time()
        # One output row per detected sentence.
        for sentence in sent_tokenize(str(text), engine=str(engine)):
            st.write(sentence)
        end = time.time()
        st.write()
        st.write("Running times: " + str(end - start))

st.write("See the documentation at [sent_tokenize | PyThaiNLP](https://pythainlp.github.io/docs/3.0/api/tokenize.html#pythainlp.tokenize.sent_tokenize).")
#st.sidebar.markdown("# Word tokenize π")
pages/soundex.py
ADDED
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import streamlit as st
import time
from pythainlp.soundex import soundex

# Demo page: compute a Thai soundex code for each input line.
# FIX: the algorithm attributions were swapped. Per the PyThaiNLP docs,
# lk82 was proposed by Vichit Lorchirachoonkul (1982) and udom83 by
# Wannee Udompanich (1983).
st.markdown("""
# Soundex π

PyThaiNLP support Soundex for searching or indexing. We have

- udom83 (default) - Thai soundex algorithm proposed by Wannee Udompanich.
- lk82 - Thai soundex algorithm proposed by Vichit Lorchirachoonkul.
- metasound - Thai soundex algorithm based on a combination of Metaphone and Soundex proposed by Snae & BrΓΌckner.

for this demo page.
""")

with st.form("my_form"):
    st.write("Inside the form")
    text = st.text_area("text")
    engine = st.selectbox('Select soundex', ['udom83', 'lk82', 'metasound'], key=1, index=0)

    # Every form must have a submit button.
    submitted = st.form_submit_button("Submit")
    if submitted:
        st.subheader("Soundex: ")
        start = time.time()
        # Soundex works on single words/lines, so encode each line separately.
        for i in str(text).splitlines():
            _temp = soundex(str(i), engine=str(engine))
            st.write(_temp)
        end = time.time()
        st.write()
        st.write("Running times: " + str(end - start))

st.markdown("See the documentation at [https://pythainlp.github.io/docs/3.0/api/soundex.html](https://pythainlp.github.io/docs/3.0/api/soundex.html).")
# st.sidebar.markdown("# Soundex π")
pages/subword_tokenize.py
ADDED
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import streamlit as st
import time
from pythainlp.tokenize import subword_tokenize

# Demo page: split input text into subword units with a user-selected engine.
st.markdown("""
# Subword tokenization π

PyThaiNLP support Subword tokenization for NLP piplines. We have

- tcc (default) - Thai Character Cluster (Theeramunkong et al. 2000)
- etcc - Enhanced Thai Character Cluster (Inrut et al. 2001)
- dict - newmm word tokenizer with a syllable dictionary
- ssg - CRF syllable segmenter for Thai
- tltk - syllable tokenizer from tltk

for this demo page.
""")
with st.form("my_form"):
    st.write("Input text")
    raw_text = st.text_area("text")
    engine = st.selectbox('Select word tokenizition', ['tcc', 'etcc', 'dict', 'ssg', 'tltk'], key=1, index=0)

    # Streamlit forms require an explicit submit button.
    submitted = st.form_submit_button("Submit")
    if submitted:
        st.subheader("Subwords: ")
        start = time.time()
        # Show the subword units space-separated on one line.
        subwords = subword_tokenize(str(raw_text), engine=str(engine))
        st.write(" ".join(subwords))
        end = time.time()
        st.write()
        st.write("Running times: " + str(end - start))

st.write("See the documentation at [subword_tokenize | PyThaiNLP](https://pythainlp.github.io/docs/3.0/api/tokenize.html#pythainlp.tokenize.subword_tokenize).")
pages/translation.py
ADDED
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import streamlit as st
import time
from pythainlp.translate import Translate

# Demo page: machine translation between the supported language pairs.
st.markdown("""
# Translation π

PyThaiNLP support machine translation for translate text. We have

- th2en - (default) Thai to English translation
- en2th - English to Thai translation
- th2zh - Thai to Chinese translation
- zh2th - Chinese to Thai translation
- th2fr - Thai to French translation

for this demo page. It will use many times for running model.
""")

# UI choice -> (source language, target language) for Translate.
_LANG_PAIRS = {
    "th2en": ("th", "en"),
    "en2th": ("en", "th"),
    "zh2th": ("zh", "th"),
    "th2zh": ("th", "zh"),
    "th2fr": ("th", "fr"),
}

_engine = None
with st.form("my_form"):
    st.write("Inside the form")
    text = st.text_area("text")
    engine = st.selectbox('Select', ['th2en', 'en2th', 'zh2th', 'th2zh', 'th2fr'], key=1, index=0)

    # Streamlit forms require an explicit submit button.
    submitted = st.form_submit_button("Submit")
    if submitted:
        st.subheader("Text: ")
        start = time.time()
        if engine in ("th2en", "en2th"):
            # The English<->Thai models are downloaded explicitly before use.
            from pythainlp.translate.en_th import download_model_all
            download_model_all()
        src, tgt = _LANG_PAIRS[engine]
        _engine = Translate(src, tgt)
        st.write(_engine.translate(str(text)))
        end = time.time()
        st.write()
        st.write("Running times: " + str(end - start))

st.write("See the documentation at [translate | PyThaiNLP](https://pythainlp.github.io/docs/3.0/api/translate.html).")
pages/transliteration.py
ADDED
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import streamlit as st
import time
from pythainlp.transliterate import transliterate

# Demo page: grapheme-to-phoneme transliteration of Thai text.
# FIX: the page title previously said "Translation" (copy-pasted from the
# translation page); this page is about transliteration.
st.markdown("""
# Transliteration π

PyThaiNLP support transliterate text for NLP piplines. We have

- thaig2p - (default) Thai Grapheme-to-Phoneme, output is IPA (require PyTorch)
- tltk_g2p - Thai Grapheme-to-Phoneme from TLTK <https://pypi.org/project/tltk/>_.,
- tltk_ipa - tltk, output is International Phonetic Alphabet (IPA)

for this demo page. You can custom dictionary for some word tokenizer engine. (Python only)
""")

with st.form("my_form"):
    st.write("Inside the form")
    text = st.text_input("text")
    engine = st.selectbox('Select transliterate', ['thaig2p', 'tltk_g2p', 'tltk_ipa'], key=1, index=0)

    # Every form must have a submit button.
    submitted = st.form_submit_button("Submit")
    if submitted:
        st.subheader("Words: ")
        start = time.time()
        st.write(transliterate(str(text), engine=str(engine)))
        end = time.time()
        st.write()
        st.write("Running times: " + str(end - start))

st.write("See the documentation at [transliterate | PyThaiNLP](https://pythainlp.github.io/docs/3.0/api/transliterate.html).")
pages/word_tokenize.py
ADDED
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import streamlit as st
import time
from pythainlp.tokenize import word_tokenize

# Demo page: split input text into words with a user-selected engine.
st.markdown("""
# Word tokenization π

PyThaiNLP support Word tokenization for NLP piplines. We have

- newmm (default) - dictionary-based, Maximum Matching + Thai Character Cluster
- mm - dictionary-based, Maximum Matching
- longest - dictionary-based, Longest Matching
- tltk - wrapper for TLTK.

for this demo page. You can custom dictionary for some word tokenizer engine. (Python only)
""")
with st.form("my_form"):
    st.write("Input text")
    text = st.text_area("text")
    engine = st.selectbox('Select word tokenizition', ['newmm', 'mm', 'longest', 'tltk'], key=1, index=0)

    # Streamlit forms require an explicit submit button.
    submitted = st.form_submit_button("Submit")
    if submitted:
        st.subheader("Words: ")
        start = time.time()
        # Show the tokens space-separated on one line.
        tokens = word_tokenize(str(text), engine=str(engine))
        st.write(" ".join(tokens))
        end = time.time()
        st.write()
        st.write("Running times: " + str(end - start))

st.write("See the documentation at [word_tokenize | PyThaiNLP](https://pythainlp.github.io/docs/3.0/api/tokenize.html#pythainlp.tokenize.word_tokenize).")
#st.sidebar.markdown("# Word tokenize π")
requirements.txt
ADDED
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
PyYAML
|
2 |
+
numpy
|
3 |
+
python-crfsuite
|
4 |
+
requests
|
5 |
+
tinydb
|
6 |
+
nltk
|
7 |
+
emoji
|
8 |
+
sacremoses
|
9 |
+
sentencepiece
|
10 |
+
ssg
|
11 |
+
torch
|
12 |
+
transformers
|
13 |
+
fairseq==0.10.2
|
14 |
+
tensorflow
|
15 |
+
pandas
|
16 |
+
tltk
|
17 |
+
streamlit
|
18 |
+
pythainlp==3.0.8
|