henry2024 committed on
Commit
3ca727e
1 Parent(s): f0c31b4

Upload nltk_utils.py

Browse files
Files changed (1) hide show
  1. nltk_utils.py +24 -0
nltk_utils.py ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Import Libraries
2
+ import nltk
3
+ from nltk.tokenize import word_tokenize
4
+ nltk.download('punkt')
5
+ from sklearn.feature_extraction.text import TfidfVectorizer
6
+ from nltk.stem import SnowballStemmer
7
+ stemmer= SnowballStemmer(language= 'english')
8
+ from nltk.corpus import stopwords
9
+ nltk.download('stopwords')
10
+
11
+ # Tokenize and stem text, i.e. turn a string into a list of stemmed tokens, e.g. "I am sick" -> ['i', 'am', 'sick']
12
def tokenize(text):
    """Split ``text`` into word tokens and return the stem of each one.

    Args:
        text: The string to tokenize.

    Returns:
        list: One stemmed token per token produced by ``word_tokenize``,
        in the original order.
    """
    stem = stemmer.stem
    return list(map(stem, word_tokenize(text)))
14
+
15
# English stop words from NLTK, used to reduce noise in the data.
english_stopwords = stopwords.words("english")
17
+
18
+ # Create a vectorizer to learn all words in order to convert them into numbers
19
def vectorizer():
    """Build a new, unfitted TF-IDF vectorizer that uses ``tokenize``.

    The vectorizer tokenizes and stems documents with ``tokenize`` and
    drops English stop words.

    Returns:
        TfidfVectorizer: A fresh vectorizer; the caller is responsible for
        fitting it.
    """
    # scikit-learn removes stop words *after* the custom tokenizer has run,
    # so the tokens it compares are already stemmed. Push the stop words
    # through the same tokenizer so that, e.g., "because" (stemmed to
    # "becaus") is actually filtered instead of slipping through.
    normalized_stopwords = {
        token for word in english_stopwords for token in tokenize(word)
    }
    # Keep the raw forms as well so that nothing that was filtered before
    # this normalization stops being filtered.
    stop_word_list = sorted(normalized_stopwords.union(english_stopwords))

    return TfidfVectorizer(
        tokenizer=tokenize,
        stop_words=stop_word_list,
        # token_pattern is ignored when a tokenizer is supplied; passing None
        # avoids scikit-learn's "token_pattern will not be used" warning.
        token_pattern=None,
    )
24
+