ShAnSantosh committed on
Commit
b289e03
1 Parent(s): 099c56e

Upload nltk_utils.py

Browse files
Files changed (1) hide show
  1. nltk_utils.py +43 -0
nltk_utils.py ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import nltk
import numpy as np
from nltk.stem.porter import PorterStemmer

# Download the 'punkt' resource when this module is imported
# (presumably required by nltk.word_tokenize -- confirm against the
# installed NLTK version).
nltk.download('punkt')

# Single module-level stemmer instance shared by the helpers in this file.
stemmer = PorterStemmer()
6
+
7
def tokenize(sentence):
    """
    Break a sentence into its individual tokens.

    Each token can be a word, a punctuation character, or a number.
    The result is whatever nltk.word_tokenize returns for the sentence.
    """
    tokens = nltk.word_tokenize(sentence)
    return tokens
13
+
14
+
15
def stem(word):
    """
    Reduce a word to its root form (its stem).

    The word is lower-cased first and then passed to the module-level
    Porter stemmer.

    examples:
    words = ["organize", "organizes", "organizing"]
    words = [stem(w) for w in words]
    -> ["organ", "organ", "organ"]
    """
    lowered = word.lower()
    return stemmer.stem(lowered)
24
+
25
+
26
def bag_of_words(tokenized_sentence, words):
    """
    return bag of words array:
    1 for each known word that exists in the sentence, 0 otherwise

    tokenized_sentence: iterable of raw tokens; each token is stemmed here
    words: sequence of known words, compared as-is against the stemmed
           tokens (no stemming is applied to them in this function)
    returns: float32 numpy array with one entry per element of words

    example:
    sentence = ["hello", "how", "are", "you"]
    words = ["hi", "hello", "I", "you", "bye", "thank", "cool"]
    bag   = [  0 ,    1 ,    0 ,   1 ,    0 ,    0 ,      0]
    """
    # stem each token once; a set makes every membership test below O(1)
    # instead of rescanning a list for each known word
    sentence_words = {stem(word) for word in tokenized_sentence}
    # initialize bag with 0 for each known word
    bag = np.zeros(len(words), dtype=np.float32)
    for idx, w in enumerate(words):
        if w in sentence_words:
            bag[idx] = 1

    return bag