kompiangg committed
Commit ae8ae26
1 parent: 61dee9d

preprocess

Files changed (5)
  1. .gitignore +2 -1
  2. main.py +3 -3
  3. new_kamusalay.csv +0 -0
  4. preprocess/preprocess.py +51 -0
  5. requirements.txt +2 -0
.gitignore CHANGED
@@ -2,4 +2,5 @@ venv
 __pycache__
 *.csv
 models/
-.cache/
+.cache/
+!new_kamusalay.csv
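Since *.csv is ignored globally, the new !new_kamusalay.csv rule re-includes only the normalization dictionary; without it, git would refuse to track the file added in this commit.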
main.py CHANGED
@@ -3,8 +3,7 @@ from type.request.predict import PredictRequest
 from type.response.predict import PredictResponse
 from hugging_face import model, dataset
 from transformer import transformer
-
-import sys
+from preprocess import preprocess
 
 hate_speech_model = model.load_hugging_face_model('model_rf.pkl')
 hate_speech_dataset = dataset.load_dataset('data_clean.csv')
@@ -22,7 +21,8 @@ def healthz():
 
 @app.post("/predict")
 def predict(request: PredictRequest):
-    predict_text = [request.predict_text]
+    preprocessed_text = preprocess.preprocess(request.predict_text)
+    predict_text = [preprocessed_text]
 
     predict_text = tfidf.transform(predict_text)
     prediction = hate_speech_model.predict(predict_text)
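For reference, a minimal sketch of calling the updated endpoint. The predict_text field name comes from PredictRequest above; the host, port, and sample tweet are assumptions:

import requests

resp = requests.post(
    'http://localhost:8000/predict',  # port is an assumption (uvicorn default)
    json={'predict_text': 'gue gak suka bgt sama lo'},  # made-up alay sample
)
print(resp.json())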
new_kamusalay.csv ADDED
The diff for this file is too large to render.
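The file is a headerless two-column word map (original, replacement), which is how preprocess/preprocess.py loads it below. A quick inspection sketch; no actual rows are reproduced here:

import pandas as pd

# Load the dictionary exactly the way preprocess.py does.
alay_dict = pd.read_csv('./new_kamusalay.csv', encoding='latin-1', header=None)
alay_dict = alay_dict.rename(columns={0: 'original', 1: 'replacement'})
print(alay_dict.head())  # first few (original, replacement) pairs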
 
preprocess/preprocess.py ADDED
@@ -0,0 +1,51 @@
+import re
+import pandas as pd
+from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
+import nltk
+from nltk.corpus import stopwords
+
+nltk.download('stopwords')
+alay_dict = pd.read_csv('./new_kamusalay.csv', encoding='latin-1', header=None)
+alay_dict = alay_dict.rename(columns={0: 'original', 1: 'replacement'})
+
+factory = StemmerFactory()
+stemmer = factory.create_stemmer()
+
+def lowercase(text):
+    return text.lower()
+
+def remove_unnecessary_char(text):
+    text = re.sub(r'\n', ' ', text)  # Remove every newline
+    text = re.sub(r'\brt\b', ' ', text)  # Remove the retweet marker (whole word only)
+    text = re.sub(r'\buser\b', ' ', text)  # Remove the anonymized username token (whole word only)
+    text = re.sub(r'(www\.[^\s]+)|(https?://[^\s]+)', ' ', text)  # Remove every URL
+    text = re.sub(' +', ' ', text)  # Collapse repeated spaces
+    return text
+
+def remove_nonalphanumeric(text):
+    text = re.sub('[^0-9a-zA-Z]+', ' ', text)
+    return text
+
+alay_dict_map = dict(zip(alay_dict['original'], alay_dict['replacement']))
+def normalize_alay(text):
+    return ' '.join([alay_dict_map[word] if word in alay_dict_map else word for word in text.split(' ')])
+
+def remove_stopword(text):
+    list_stopwords = stopwords.words('indonesian')
+    text = ' '.join(['' if word in list_stopwords else word for word in text.split(' ')])
+
+    text = re.sub(' +', ' ', text)  # Collapse repeated spaces
+    text = text.strip()
+    return text
+
+def stemming(text):
+    return stemmer.stem(text)
+
+def preprocess(text):
+    text = lowercase(text)                # 1
+    text = remove_unnecessary_char(text)  # 2: strip rt/user tokens and URLs before punctuation is removed
+    text = remove_nonalphanumeric(text)   # 3
+    text = normalize_alay(text)           # 4
+    text = stemming(text)                 # 5
+    text = remove_stopword(text)          # 6
+    return text
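A hypothetical end-to-end check of the new pipeline; the input tweet is invented, and the exact output depends on new_kamusalay.csv and the Sastrawi stemmer:

from preprocess import preprocess

raw = 'RT USER: gw gak suka bgt sama lo!!! https://example.com'
print(preprocess.preprocess(raw))  # lowercased, normalized, stemmed, stopword-free text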
requirements.txt CHANGED
@@ -49,3 +49,5 @@ watchfiles==0.21.0
 websockets==12.0
 xxhash==3.4.1
 yarl==1.9.4
+Sastrawi==1.0.1
+nltk==3.8.1
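Sastrawi supplies the Indonesian stemmer and nltk the Indonesian stopword list that preprocess.py relies on; after pulling this commit, reinstall with pip install -r requirements.txt so both are available.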