Spaces:
Running
Running
# Import libraries
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer

# Fetch the NLTK data this module relies on.
# word_tokenize loads the Punkt sentence tokenizer: older NLTK releases read it
# from the 'punkt' resource, while NLTK 3.9 and later read it from 'punkt_tab'.
# Requesting both keeps tokenization working on either side of that change.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

# Shared English Snowball stemmer used by the tokenizer.
stemmer = SnowballStemmer(language='english')
# Tokenize text, i.e. turn a string into a list of stemmed word tokens, e.g. "I am sick" -> ['i', 'am', 'sick']
def tokenize(text):
    """Split ``text`` into word tokens and return each token's stem.

    Tokens come from NLTK's ``word_tokenize``; every token is then passed
    through the module-level Snowball ``stemmer``. The result is a list in
    the same order as the tokens appear in ``text``.
    """
    words = word_tokenize(text)
    return list(map(stemmer.stem, words))
# English stop-word list from the NLTK corpus; handed to the vectorizer so
# that these common words are dropped as noise.
english_stopwords = stopwords.words('english')
# Create a vectorizer that learns the vocabulary so that text can be converted into numbers
def vectorizer():
    """Build a new, unfitted TF-IDF vectorizer for English text.

    The vectorizer splits documents with ``tokenize`` (word tokens reduced
    to their stems) and filters out the entries of ``english_stopwords``.

    Returns:
        TfidfVectorizer: a fresh instance on every call; the caller is
        responsible for fitting it.
    """
    # token_pattern is ignored whenever a custom tokenizer is supplied.
    # Passing None states that explicitly and avoids scikit-learn's
    # "token_pattern will not be used" UserWarning.
    # NOTE(review): tokens are stemmed but the stop words are not, so
    # scikit-learn may warn that stop_words is inconsistent with the
    # preprocessing -- confirm whether the stop words should be stemmed too.
    tfidf = TfidfVectorizer(
        tokenizer=tokenize,
        token_pattern=None,
        stop_words=english_stopwords,
    )
    return tfidf