preprocessssssss
Browse files- .gitignore +2 -1
- main.py +3 -3
- new_kamusalay.csv +0 -0
- preprocess/preprocess.py +51 -0
- requirements.txt +2 -0
.gitignore
CHANGED
@@ -2,4 +2,5 @@ venv
|
|
2 |
__pycache__
|
3 |
*.csv
|
4 |
models/
|
5 |
-
.cache/
|
|
|
|
2 |
__pycache__
|
3 |
*.csv
|
4 |
models/
|
5 |
+
.cache/
|
6 |
+
!new_kamusalay.csv
|
main.py
CHANGED
@@ -3,8 +3,7 @@ from type.request.predict import PredictRequest
|
|
3 |
from type.response.predict import PredictResponse
|
4 |
from hugging_face import model, dataset
|
5 |
from transformer import transformer
|
6 |
-
|
7 |
-
import sys
|
8 |
|
9 |
hate_speech_model = model.load_hugging_face_model('model_rf.pkl')
|
10 |
hate_speech_dataset = dataset.load_dataset('data_clean.csv')
|
@@ -22,7 +21,8 @@ def healthz():
|
|
22 |
|
23 |
@app.post("/predict")
|
24 |
def predict(request: PredictRequest):
|
25 |
-
|
|
|
26 |
|
27 |
predict_text = tfidf.transform(predict_text)
|
28 |
prediction = hate_speech_model.predict(predict_text)
|
|
|
3 |
from type.response.predict import PredictResponse
|
4 |
from hugging_face import model, dataset
|
5 |
from transformer import transformer
|
6 |
+
from preprocess import preprocess
|
|
|
7 |
|
8 |
hate_speech_model = model.load_hugging_face_model('model_rf.pkl')
|
9 |
hate_speech_dataset = dataset.load_dataset('data_clean.csv')
|
|
|
21 |
|
22 |
@app.post("/predict")
|
23 |
def predict(request: PredictRequest):
|
24 |
+
preprocessed_text = preprocess.preprocess(request.predict_text)
|
25 |
+
predict_text = [preprocessed_text]
|
26 |
|
27 |
predict_text = tfidf.transform(predict_text)
|
28 |
prediction = hate_speech_model.predict(predict_text)
|
new_kamusalay.csv
ADDED
The diff for this file is too large to render.
See raw diff
|
|
preprocess/preprocess.py
ADDED
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import re
|
2 |
+
import pandas as pd
|
3 |
+
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
|
4 |
+
import nltk
|
5 |
+
from nltk.corpus import stopwords
|
6 |
+
|
7 |
+
# Download the NLTK stopword corpus only when it is not already installed,
# so importing this module does not hit the network on every start-up.
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')

# Two-column lookup table read without a header row; column 0 is renamed to
# 'original' and column 1 to 'replacement'.
# NOTE(review): the path is relative to the process working directory, so the
# app presumably has to be started from the repository root - confirm.
alay_dict = pd.read_csv('./new_kamusalay.csv', encoding='latin-1', header=None)
alay_dict = alay_dict.rename(columns={0: 'original', 1: 'replacement'})

# One Sastrawi stemmer instance is created at import time and reused.
factory = StemmerFactory()
stemmer = factory.create_stemmer()
|
13 |
+
|
14 |
+
def lowercase(text: str) -> str:
    """Return *text* converted to lower case."""
    return text.lower()
|
16 |
+
|
17 |
+
# Matches "www." hosts and http(s):// links up to the next whitespace.
# Kept as a raw string so the regex escapes (\. and \s) are not parsed as
# (invalid) Python string escapes.
_URL_PATTERN = re.compile(r'((www\.[^\s]+)|(https?://[^\s]+)|(http?://[^\s]+))')


def remove_unnecessary_char(text: str) -> str:
    """Replace newlines, 'rt'/'user' markers and URLs with single spaces.

    Each removed fragment becomes a space and runs of spaces are collapsed
    afterwards; leading/trailing spaces are NOT stripped.

    Args:
        text: Input string (the markers are matched in lower case only).

    Returns:
        The cleaned string.
    """
    text = text.replace('\n', ' ')  # Remove every '\n'
    # NOTE(review): 'rt' and 'user' are removed as plain substrings, so they
    # are also cut out of longer words (e.g. 'partai' -> 'pa ai'). Behaviour is
    # kept as-is because the training data may have been cleaned the same
    # way - confirm before tightening this to whole-word matches.
    text = text.replace('rt', ' ')  # Remove every retweet symbol
    text = text.replace('user', ' ')  # Remove every username
    text = _URL_PATTERN.sub(' ', text)  # Remove every URL
    text = re.sub(' +', ' ', text)  # Remove extra spaces
    return text
|
24 |
+
|
25 |
+
def remove_nonaplhanumeric(text: str) -> str:
    """Replace every run of characters outside 0-9, a-z, A-Z with one space.

    The (misspelled) function name is kept unchanged for existing callers.
    """
    return re.sub(r'[^0-9a-zA-Z]+', ' ', text)
|
28 |
+
|
29 |
+
# Plain dict built once at import time: 'original' value -> 'replacement' value.
alay_dict_map = dict(zip(alay_dict['original'], alay_dict['replacement']))


def normalize_alay(text: str) -> str:
    """Replace each space-separated word found in ``alay_dict_map``.

    Words without an entry are kept unchanged. The text is split on single
    spaces, so the original spacing is preserved when re-joining.
    """
    return ' '.join(alay_dict_map.get(word, word) for word in text.split(' '))
|
32 |
+
|
33 |
+
# Filled on first use so the corpus is read once instead of on every call.
_INDONESIAN_STOPWORDS = None


def _get_indonesian_stopwords():
    """Return the NLTK Indonesian stopwords as a cached frozenset."""
    global _INDONESIAN_STOPWORDS
    if _INDONESIAN_STOPWORDS is None:
        _INDONESIAN_STOPWORDS = frozenset(stopwords.words('indonesian'))
    return _INDONESIAN_STOPWORDS


def remove_stopword(text: str) -> str:
    """Drop Indonesian stopwords from *text*.

    Words are taken by splitting on single spaces. Each stopword is blanked
    out, then runs of spaces are collapsed and the result is stripped.

    Args:
        text: Space-separated input string.

    Returns:
        The text without stopwords and without leading/trailing spaces.
    """
    stopword_set = _get_indonesian_stopwords()
    text = ' '.join('' if word in stopword_set else word for word in text.split(' '))

    text = re.sub(' +', ' ', text)  # Remove extra spaces
    return text.strip()
|
40 |
+
|
41 |
+
def stemming(text: str) -> str:
    """Return *text* passed through the module-level Sastrawi ``stemmer``."""
    return stemmer.stem(text)
|
43 |
+
|
44 |
+
def preprocess(text: str) -> str:
    """Run the full text-cleaning pipeline on a single string.

    Steps, in order: lower-casing, non-alphanumeric removal, marker/URL
    removal, dictionary-based word replacement, stemming, stopword removal.

    Args:
        text: Raw input text.

    Returns:
        The cleaned text.
    """
    text = lowercase(text)  # 1
    text = remove_nonaplhanumeric(text)  # 2
    # NOTE(review): step 2 has already turned '\n', '.', ':' and '/' into
    # spaces, so the newline and URL patterns in step 3 can no longer match
    # here; only the 'rt'/'user' removal and space collapsing take effect.
    # Order is kept because the training data was presumably cleaned with the
    # same sequence - confirm before swapping steps 2 and 3.
    text = remove_unnecessary_char(text)  # 3
    text = normalize_alay(text)  # 4
    text = stemming(text)  # 5
    text = remove_stopword(text)  # 6
    return text
|
requirements.txt
CHANGED
@@ -49,3 +49,5 @@ watchfiles==0.21.0
|
|
49 |
websockets==12.0
|
50 |
xxhash==3.4.1
|
51 |
yarl==1.9.4
|
|
|
|
|
|
49 |
websockets==12.0
|
50 |
xxhash==3.4.1
|
51 |
yarl==1.9.4
|
52 |
+
Sastrawi==1.0.1
|
53 |
+
nltk==3.8.1
|