###
# - Author: Jaelin Lee
# - Date: Mar 23, 2024
# - Description: XGBoost mental health classifier [depression, adhd, anxiety, social_isolation,
#   cyberbullying, social_media_addiction]. Incorporates the updated code from Aleksandra Śledziewska
#   that fixed the token size issue. The model is loaded from pickle if it has already been saved,
#   which avoids retraining the model before each prediction.
###
import os.path
import pickle

import pandas as pd
from transformers import AutoTokenizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier

from clean_text_model import TextCleaner


class MentalHealthClassifier:
    def __init__(self, data_path, model_path):
        self.data = pd.read_csv(data_path, skip_blank_lines=True)
        # Normalize the misspelled 'axienty' label that appears in the source data.
        self.data['category'] = ['anxiety' if x == 'axienty' else x for x in self.data['category']]
        # self.data.dropna(subset=['text'], inplace=True)
        self.data.dropna(subset=['clean_text'], inplace=True)
        self.data_selected = self.data[['clean_text', 'category']]
        self.df = pd.DataFrame(self.data_selected)
        self.label_encoder = LabelEncoder()
        self.df['category_encoded'] = self.label_encoder.fit_transform(self.df['category'])
        self.tokenizer = None
        self.vectorizer = CountVectorizer()
        self.text_cleaner = TextCleaner()
        self.model_path = model_path
        # Load the model from pickle if it exists; otherwise start with a fresh classifier.
        self.model = self.load_model() if os.path.exists(model_path) else XGBClassifier()

    def preprocess_data(self):
        tokenized_texts = [
            self.tokenizer.tokenize(text, padding=True, truncation=True)
            for text in self.df['clean_text']
        ]
        X = self.vectorizer.fit_transform([' '.join(tokens) for tokens in tokenized_texts]).toarray()
        return X, self.df['category_encoded']

    def train_model(self, X, y):
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
        self.model.fit(X_train, y_train)
        y_pred = self.model.predict(X_test)
        return y_test, y_pred

    def predict_category(self, raw_input_text):
        if self.tokenizer is None:
            raise ValueError("Tokenizer not initialized. Call 'initialize_tokenizer' first.")
        input_text = self.text_cleaner.cleaning_text(raw_input_text)
        # Tokenize the cleaned text (not the raw input) so prediction matches the training pipeline.
        tokenized_input = self.tokenizer.tokenize(input_text, padding=True, truncation=True)
        input_feature_vector = self.vectorizer.transform([' '.join(tokenized_input)]).toarray()
        predicted_category_encoded = self.model.predict(input_feature_vector)
        predicted_category = self.label_encoder.inverse_transform(predicted_category_encoded)
        return predicted_category[0]

    def initialize_tokenizer(self, model_name):
        self.model_name = model_name
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)

    def save_model(self):
        print("saving model...to pickle...")
        with open(self.model_path, 'wb') as f:
            pickle.dump(self.model, f)

    def load_model(self):
        print("loading model...from pickle...")
        with open(self.model_path, 'rb') as f:
            return pickle.load(f)


if __name__ == "__main__":
    tokenizer_model_name = "nlptown/bert-base-multilingual-uncased-sentiment"
    data_path = 'data/data.csv'
    model_path = 'mental_health_model.pkl'

    mental_classifier = MentalHealthClassifier(data_path, model_path)

    if not os.path.exists(model_path):
        mental_classifier.initialize_tokenizer(tokenizer_model_name)
        X, y = mental_classifier.preprocess_data()
        y_test, y_pred = mental_classifier.train_model(X, y)
        mental_classifier.save_model()
    else:
        mental_classifier.load_model()
        mental_classifier.initialize_tokenizer(tokenizer_model_name)  # Ensure tokenizer is initialized when the model is loaded from pickle
        mental_classifier.preprocess_data()  # Refit the vectorizer so transform() is usable at prediction time

    input_text = "I feel bullied online."
    predicted_category = mental_classifier.predict_category(input_text)
    print("Predicted mental health condition:", predicted_category)