# app.py (1.64 kB) -- file-viewer header converted to a comment so the file parses.
# Last commit: 07dd827 "Update app.py" by akuysal.
from sklearn.feature_extraction.text import TfidfVectorizer
from TurkishStemmer import TurkishStemmer
import nltk
import string
# import for loading python objects (scikit-learn models)
import pickle
import streamlit as st
import sklearn
def custom_tokenizer_with_Turkish_stemmer(text):
    """Tokenize ``text`` and reduce every token to its lower-cased Turkish stem.

    Processing steps:

    1. Delete every character that the module-level ``trans_table`` maps to
       ``None`` (``str.translate`` drops such characters).
    2. Split the cleaned text into tokens with ``nltk.word_tokenize``.
    3. Lower-case each token and stem it with the module-level ``stemmerTR``.

    NOTE(review): the pickled TF-IDF vectorizer presumably refers to this
    function by name as its tokenizer, so the name has to stay resolvable
    when the pickle is loaded -- confirm before renaming or moving it.

    Args:
        text: The raw document to tokenize.

    Returns:
        A list holding one stem per token, in token order.
    """
    cleaned = text.translate(trans_table)
    return [stemmerTR.stem(token.lower()) for token in nltk.word_tokenize(cleaned)]
def predictSMSdata(test_text):
    """Classify one SMS text as ``"legitimate"`` or ``"spam"``.

    The trained classifier and the fitted TF-IDF vectorizer are read from
    ``LinearSVC_SMS_spam_TR.pickle`` and ``tfidf_vectorizer_TR.pickle`` in
    the current working directory on every call. The text is vectorized,
    classified, and the predicted class id is mapped to its label.

    The label is returned (previously it was only printed and the function
    returned ``None``, so callers had nothing to display). It is wrapped in
    a dict so the result is directly JSON-serialisable, e.g. for
    ``st.json``, which would otherwise try to parse a bare string as JSON.

    Args:
        test_text: The message to classify.

    Returns:
        ``{"prediction": label}`` where ``label`` is ``"legitimate"`` or
        ``"spam"``.

    Raises:
        OSError: If either pickle file cannot be opened.
    """
    # Position i in this sorted list is the label for predicted class id i.
    # NOTE(review): assumes the model was trained with integer class ids
    # assigned in alphabetical label order -- confirm against training code.
    categories = sorted(["legitimate", "spam"])

    # NOTE(security): pickle.load can execute arbitrary code. These files
    # must only ever be trusted artifacts shipped with the app, never
    # user-supplied data.
    # `with` guarantees the handles are closed even if unpickling fails.
    with open("LinearSVC_SMS_spam_TR.pickle", "rb") as model_file:
        classifier = pickle.load(model_file)
    with open("tfidf_vectorizer_TR.pickle", "rb") as vectorizer_file:
        tfidf_vectorizer = pickle.load(vectorizer_file)

    # The vectorizer expects an iterable of documents, hence the 1-item list.
    features = tfidf_vectorizer.transform([test_text])
    predicted = classifier.predict(features)
    label = categories[predicted[0]]

    # Kept so the label still appears on stdout exactly as before.
    print(label)
    return {"prediction": label}
# Deletion table for str.translate: every ASCII punctuation character and
# digit is mapped to None, i.e. removed. Read by
# custom_tokenizer_with_Turkish_stemmer at call time.
trans_table = str.maketrans("", "", string.punctuation + string.digits)

# Shared stemmer instance, also read by the tokenizer at call time.
stemmerTR = TurkishStemmer()

# Streamlit UI: ask for a message and, once something has been entered,
# run the classifier on it and render whatever predictSMSdata returns.
text = st.text_area("enter some text!")
if text:
    out = predictSMSdata(text)
    st.json(out)