OnabajoMonsurat committed on
Commit
ca49d23
1 Parent(s): 047b9ea

Upload nltk_utils.py

Browse files
Files changed (1) hide show
  1. nltk_utils.py +6 -0
nltk_utils.py CHANGED
@@ -1,3 +1,4 @@
 
1
  import nltk
2
  from nltk.tokenize import word_tokenize
3
  nltk.download('punkt')
@@ -7,12 +8,17 @@ stemmer= SnowballStemmer(language= 'english')
7
  from nltk.corpus import stopwords
8
  nltk.download('stopwords')
9
 
 
10
  def tokenize(text):
11
  return [stemmer.stem(token) for token in word_tokenize(text)]
12
 
 
13
  english_stopwords= stopwords.words('english')
 
 
14
  def vectorizer():
15
  vectorizer= TfidfVectorizer(tokenizer=tokenize,
16
  stop_words=english_stopwords,
17
  )
18
  return vectorizer
 
 
1
+ # Import Libraries
2
  import nltk
3
  from nltk.tokenize import word_tokenize
4
  nltk.download('punkt')
 
8
  from nltk.corpus import stopwords
9
  nltk.download('stopwords')
10
 
11
+ # Tokenize text, i.e. split it into a list of stemmed tokens, e.g. "I am sick" -> ['i', 'am', 'sick']
12
  def tokenize(text):
13
  return [stemmer.stem(token) for token in word_tokenize(text)]
14
 
15
+ # Load the English stopword list to reduce noise in the data
16
  english_stopwords= stopwords.words('english')
17
+
18
+ # Create a TF-IDF vectorizer that learns all words in order to convert them into numbers
19
  def vectorizer():
20
  vectorizer= TfidfVectorizer(tokenizer=tokenize,
21
  stop_words=english_stopwords,
22
  )
23
  return vectorizer
24
+