File size: 835 Bytes
40ee52c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
# Libraries used for tokenizing, stemming and TF-IDF vectorization
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer

# Fetch the NLTK data packages at import time: 'punkt' (presumably the
# tokenizer models behind word_tokenize) and the 'stopwords' corpus.
nltk.download('punkt')
nltk.download('stopwords')

# Single English Snowball stemmer instance shared by the whole module
stemmer = SnowballStemmer(language='english')

# Break text into word tokens and stem each one, e.g. "I am sick" -> ['i', 'am', 'sick']
def tokenize(text):
    """Return the stemmed word tokens of ``text`` as a list.

    The text is split with ``word_tokenize`` and every resulting token is
    passed through the module-level ``stemmer``.
    """
    return list(map(stemmer.stem, word_tokenize(text)))

# English stop-word list from the NLTK corpus; handed to the TF-IDF
# vectorizer so that these high-frequency words are filtered out.
english_stopwords = stopwords.words('english')

# Create a vectorizer that learns the vocabulary in order to convert text into numbers
def vectorizer():
    """Return a new, unfitted ``TfidfVectorizer`` using the stemming tokenizer.

    scikit-learn removes stop words *after* the custom tokenizer has run, so
    the tokens being compared are already stemmed and contractions are already
    split. The raw NLTK list alone therefore misses forms such as "becaus"
    (from "because") or the pieces of "don't". To keep the two consistent, the
    stop-word list is extended with the result of tokenizing each stop word.

    Returns:
        TfidfVectorizer: configured with ``tokenize`` as tokenizer and the
        normalized English stop-word list.
    """
    # Run every stop word through the same tokenizer applied to documents.
    normalized_stopwords = {
        token for word in english_stopwords for token in tokenize(word)
    }
    # Keep the original entries too; sort so the resulting list is deterministic.
    stop_words = sorted(normalized_stopwords.union(english_stopwords))

    return TfidfVectorizer(
        tokenizer=tokenize,
        # token_pattern is ignored when a tokenizer is supplied; passing None
        # avoids scikit-learn's "token_pattern will not be used" warning.
        token_pattern=None,
        stop_words=stop_words,
    )