akuysal committed on
Commit
a8dbb61
0 Parent(s):

Duplicate from akuysal/SMS-spam-Turkish-sklearn

Browse files
.gitattributes ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tflite filter=lfs diff=lfs merge=lfs -text
29
+ *.tgz filter=lfs diff=lfs merge=lfs -text
30
+ *.wasm filter=lfs diff=lfs merge=lfs -text
31
+ *.xz filter=lfs diff=lfs merge=lfs -text
32
+ *.zip filter=lfs diff=lfs merge=lfs -text
33
+ *.zst filter=lfs diff=lfs merge=lfs -text
34
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
LinearSVC_SMS_spam_TR.pickle ADDED
@@ -0,0 +1,3 @@
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2e1b32d1f4716a7c48facea2b8630b897be52618b461cbb2bb4f20f34b9df52f
3
+ size 23303
README.md ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: SMS Spam Turkish Scikit-Learn
3
+ emoji: 🌖
4
+ colorFrom: gray
5
+ colorTo: green
6
+ sdk: streamlit
7
+ sdk_version: 1.17.0
8
+ app_file: app.py
9
+ pinned: false
10
+ license: openrail
11
+ duplicated_from: akuysal/SMS-spam-Turkish-sklearn
12
+ ---
13
+
14
+ ENGLISH
15
+ The model is trained on the dataset used in the study "Uysal, A. K., Gunal, S., Ergin, S., & Gunal, E. S. (2013). The impact of feature extraction and selection on SMS spam filtering. Elektronika ir Elektrotechnika, 19(5), 67-72." The Linear SVM classifier achieves a Macro-F1 score of 0.9880 when 10% of the dataset is held out for testing.
16
+ The dataset consists of SPAM and LEGITIMATE SMS messages.
17
+
18
+ TÜRKÇE
19
+ Bu çalışmada "Uysal, A. K., Gunal, S., Ergin, S., & Gunal, E. S. (2013). The impact of feature extraction and selection on SMS spam filtering. Elektronika ir Elektrotechnika, 19(5), 67-72." başlıklı çalışmadaki veri seti kullanılmıştır. Linear SVM sınıflandırıcı için başarı oranı, veri setinin %10'u test için kullanıldığında Makro-F1 açısından 0,9880'dir.
20
+ Veri seti, SPAM ve LEGITIMATE kısa mesaj verilerinden oluşmaktadır.
21
+
22
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from sklearn.feature_extraction.text import TfidfVectorizer
2
+ from TurkishStemmer import TurkishStemmer
3
+ import string
4
+ # import for loading python objects (scikit-learn models)
5
+ import pickle
6
+ import nltk
7
+ from nltk.data import load
8
+ import streamlit as st
9
+ import sklearn
10
+
11
# Make sure the tokenizer data needed by nltk.word_tokenize is installed.
# The data is looked up locally first, so a download is only requested when
# the resource is actually missing instead of on every run of this script.
# NOTE(review): recent NLTK releases load 'punkt_tab' rather than 'punkt' for
# word tokenization; both are requested here because requirements.txt does not
# pin the nltk version -- confirm against the installed release.
for _resource in ('punkt', 'punkt_tab'):
    try:
        nltk.data.find('tokenizers/' + _resource)
    except LookupError:
        nltk.download(_resource)
12
# Translation table for str.translate: every ASCII punctuation character and
# every decimal digit maps to None, i.e. is deleted from the text.
trans_table = str.maketrans('', '', string.punctuation + string.digits)
13
+
14
def custom_tokenizer_with_Turkish_stemmer(text):
    """Tokenize *text* and return the lowercased stem of every token.

    Punctuation and digits are removed with the module-level ``trans_table``
    before NLTK word tokenization; each token is then lowercased and stemmed
    with the module-level ``stemmerTR``.

    Args:
        text: Raw message text.

    Returns:
        A list with one stem per token, in token order.
    """
    # NOTE(review): presumably the pickled TF-IDF vectorizer refers to this
    # function by name, so its name and output must stay as they were at
    # training time -- confirm before changing either.
    tokens = nltk.word_tokenize(text.translate(trans_table))
    # Deliberately no debug printing of the tokens: it would write message
    # content to stdout on every call.
    return [stemmerTR.stem(token.lower()) for token in tokens]
21
+
22
def _load_pickle(path):
    """Return the object unpickled from the file at *path*."""
    # The context manager closes the file even when unpickling raises.
    # NOTE: pickle can execute arbitrary code while loading; only load
    # artifacts that come from a trusted source.
    with open(path, "rb") as handle:
        return pickle.load(handle)


def predictSMSdata(test_text):
    """Classify one SMS text and return its category name.

    The classifier is read from ``LinearSVC_SMS_spam_TR.pickle`` and the
    TF-IDF vectorizer from ``tfidf_vectorizer_TR.pickle``; both paths are
    relative to the current working directory and are re-read on every call.

    Args:
        test_text: The message text to classify.

    Returns:
        ``"legitimate"`` or ``"spam"``.

    Raises:
        OSError: If either pickle file cannot be opened.
    """
    # Alphabetical order, so index 0 is "legitimate" and index 1 is "spam".
    # NOTE(review): assumes the classifier predicts these integer indices --
    # confirm against the training script.
    categories = sorted(["legitimate", "spam"])

    classifier = _load_pickle("LinearSVC_SMS_spam_TR.pickle")
    tfidf_vectorizer = _load_pickle("tfidf_vectorizer_TR.pickle")

    # The vectorizer is given a one-element list, so the single prediction
    # is the first element of the result.
    features = tfidf_vectorizer.transform([test_text])
    predicted = classifier.predict(features)

    label = categories[predicted[0]]
    print(label)
    return label
43
+
44
# Stemmer instance used by custom_tokenizer_with_Turkish_stemmer.
stemmerTR = TurkishStemmer()

# Sample message that is pre-filled in the text box.
default_value = "Aveadan SUPER bir Muzik Paketi! MAXI yaz, 5555e gonder"
text = st.text_area("enter some text!", default_value)

# Classify only when the box is non-empty and show the label in upper case.
if text:
    out = predictSMSdata(text)
    st.write(f"The category of SMS = {out.upper()}")
52
+
requirements.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
1
+ TurkishStemmer==1.3
2
+ scikit-learn>=1.1
3
+ nltk
tfidf_vectorizer_TR.pickle ADDED
@@ -0,0 +1,3 @@
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fd7ad6fcbd377d3025072502492b36208d32dba87ba4d73bd86171c48b74ba33
3
+ size 82481