import pickle

import pandas as pd
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

# Load the dataset; the 'full_post' column holds the post text.
url = 'https://huggingface.co/spaces/yxmauw/subreddit-clf-app/raw/main/tts.csv'
df = pd.read_csv(url, header=0)

# Train-test split.
X = df['full_post']  # keep as a pd.Series: word vectorizers expect an iterable of strings, not a DataFrame
y = df['subreddit']
# Stratify so the target classes are equally represented in both train and test sets.
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=.2,
                                                    stratify=y,
                                                    random_state=42)

# Lemmatizing. WordNetLemmatizer needs the NLTK 'wordnet' corpus;
# run nltk.download('wordnet') once if it is not already installed.
def lemmatize_join(text):
    tokenizer = RegexpTokenizer('[a-z]+', gaps=False)  # instantiate tokenizer
    lemmer = WordNetLemmatizer()  # instantiate lemmatizer
    # Lowercase, lemmatize each token, then join back together with spaces
    # so that word vectorizers can still operate on cell contents as strings.
    return ' '.join([lemmer.lemmatize(w) for w in tokenizer.tokenize(text.lower())])

Z_train = X_train.apply(lemmatize_join)

# Model instantiation.
pipe_cvec_nb = Pipeline([
    ('cvec', CountVectorizer()),
    ('nb', MultinomialNB())
])

# Word vectorizer parameters.
features = [1000]
min_df = [3]
max_df = [.6]
ngrams = [(1, 2)]
stop_words = ['english']
accent = ['unicode']

# Naive Bayes classifier parameter.
alphas = [.5]

cvec_nb_params = [{'cvec__max_features': features,
                   'cvec__min_df': min_df,
                   'cvec__max_df': max_df,
                   'cvec__ngram_range': ngrams,
                   'cvec__lowercase': [False],  # text is already lowercased in lemmatize_join
                   'cvec__stop_words': stop_words,
                   'cvec__strip_accents': accent,
                   'nb__alpha': alphas}]

cvec_nb_gs = GridSearchCV(pipe_cvec_nb,
                          cvec_nb_params,
                          scoring='accuracy',
                          cv=5,
                          verbose=1,
                          n_jobs=-2)

cvec_nb_gs.fit(Z_train, y_train)

# Persist the fitted grid search (pipeline + best estimator) to disk.
with open('final_model.sav', 'wb') as f:
    pickle.dump(cvec_nb_gs, f)
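
# A minimal evaluation sketch (an assumption, not part of the original script):
# reload the pickled grid search and score it on the held-out test set. The
# test posts must pass through the same lemmatize_join preprocessing, since
# the vectorizer was fitted on lemmatized text.
with open('final_model.sav', 'rb') as f:
    model = pickle.load(f)

Z_test = X_test.apply(lemmatize_join)  # mirror the training-time preprocessing
print(model.best_params_)              # best hyperparameters found by the grid search
print(model.score(Z_test, y_test))     # mean accuracy on the held-out test set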