ashishabraham22 committed on
Commit
e82eef0
1 Parent(s): a5a950d

Upload prepro.py

Browse files
Files changed (1) hide show
  1. prepro.py +29 -0
prepro.py ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #preprocessing
2
+ from sklearn.preprocessing import OrdinalEncoder
3
+ from nltk.corpus import stopwords
4
+ from tensorflow.keras.preprocessing.sequence import pad_sequences
5
+ from tensorflow.keras.preprocessing.text import one_hot
6
+ from nltk.stem.porter import PorterStemmer
7
+ import re
8
+
9
+ #stem words
10
def stemm(data):
    """Clean, stop-word-filter and stem a single text document.

    Every character that is not an ASCII letter is replaced by a space,
    the text is lower-cased and split on whitespace, English stop words
    and the token ``br`` are dropped, and each remaining token is reduced
    to its Porter stem.

    Args:
        data: The raw text of one document (a ``str``).

    Returns:
        A one-element list holding the cleaned document as a single
        space-joined string. The list wrapper is kept so callers can
        iterate over the result as a corpus.
    """
    ps = PorterStemmer()

    # Build the exclusion set once per call: evaluating
    # stopwords.words('english') inside the comprehension would rebuild
    # the stop-word list for every token, and a set gives O(1) lookups.
    # 'br' is what remains of "<br>" HTML tags once the non-letter
    # characters have been replaced by spaces.
    excluded = set(stopwords.words('english'))
    excluded.add('br')

    letters_only = re.sub(r'[^a-zA-Z]', ' ', data)
    tokens = letters_only.lower().split()
    stemmed = [ps.stem(word) for word in tokens if word not in excluded]
    return [' '.join(stemmed)]
21
+
22
+ #one hot encoding and padding
23
def preprocess(data, vocab_size=10000, sent_length=2470):
    """Turn one raw text document into a padded sequence of word indices.

    The text is cleaned and stemmed with ``stemm``, each word is hashed to
    an integer index with Keras ``one_hot``, and the resulting sequence is
    left-padded to ``sent_length`` with ``pad_sequences``.

    Args:
        data: The raw text of one document (a ``str``).
        vocab_size: Size of the hashing space passed to ``one_hot``.
            Defaults to 10000, the value previously hard-coded here.
        sent_length: Length every sequence is padded to. Defaults to
            2470, the value previously hard-coded here.

    Returns:
        The value returned by ``pad_sequences`` for the single-document
        corpus, padded at the front (``padding='pre'``).
    """
    corpus = stemm(data)

    # NOTE(review): Keras documents that one_hot relies on Python's
    # built-in hash by default, which is not stable across interpreter
    # runs unless PYTHONHASHSEED is fixed -- confirm the indices produced
    # here match the ones the downstream model was trained on.
    onehot_corpus = [one_hot(words, vocab_size) for words in corpus]

    # Only `padding` is set explicitly; per the Keras docs, sequences
    # longer than `maxlen` are truncated from the front by default.
    return pad_sequences(onehot_corpus, padding='pre', maxlen=sent_length)
29
+