MindfulMedia_Mentor

Sleeping

App Files Files Community

MindfulMedia_Mentor / xgb_mental_health.py

jaelin215

updated path

2c17cd5 verified 8 months ago

raw

history blame

4.21 kB

	###
	# - Author: Jaelin Lee
	# - Date: Mar 23, 2024
	# - Description: XGBoost mental health classifier [depression, adhd, anxiety, social_isolation, cyberbullying, social_media_addiction]. Incorporates the updated code from Aleksandra Śledziewska that fixed the token size issue. The model is loaded from a pickle file if one has already been saved, which avoids retraining the model for each prediction.
	###

	import os.path
	import pickle
	import pandas as pd
	from transformers import AutoTokenizer
	from sklearn.model_selection import train_test_split
	from sklearn.feature_extraction.text import CountVectorizer
	from xgboost import XGBClassifier
	from sklearn.preprocessing import LabelEncoder
	from clean_text_model import TextCleaner

	class MentalHealthClassifier:
	    """Text classifier built on token counts and an XGBoost model.

	    Training rows are read from the ``clean_text`` and ``category`` columns of
	    a CSV file. Text is split with a Hugging Face tokenizer, the tokens are
	    re-joined with spaces and counted by a ``CountVectorizer``, and the
	    resulting count matrix is fed to the model.

	    Only the model is pickled. The vectorizer and label encoder live on the
	    instance, so ``preprocess_data`` must be called on every new instance
	    (even when the model was restored from pickle) before ``predict_category``
	    can transform input text.
	    """

	    def __init__(self, data_path: str, model_path: str):
	        """Load the dataset, fit the label encoder and restore or create the model.

	        Args:
	            data_path: CSV file containing ``clean_text`` and ``category`` columns.
	            model_path: Pickle file of a previously saved model. If the file
	                exists the model is loaded from it, otherwise a fresh
	                ``XGBClassifier`` is created.
	        """
	        self.data = pd.read_csv(data_path, skip_blank_lines=True)
	        # Normalize the misspelled label 'axienty' to 'anxiety'.
	        self.data['category'] = ['anxiety' if x == 'axienty' else x for x in self.data['category']]
	        self.data.dropna(subset=['clean_text'], inplace=True)
	        self.data_selected = self.data[['clean_text', 'category']]
	        # Explicit copy: the encoded column is added to an independent frame.
	        self.df = self.data_selected.copy()
	        self.label_encoder = LabelEncoder()
	        self.df['category_encoded'] = self.label_encoder.fit_transform(self.df['category'])
	        self.tokenizer = None  # set by initialize_tokenizer()
	        self.vectorizer = CountVectorizer()
	        self.text_cleaner = TextCleaner()
	        self.model_path = model_path
	        self.model = self.load_model() if os.path.exists(model_path) else XGBClassifier()

	    def _require_tokenizer(self) -> None:
	        """Raise ``ValueError`` if ``initialize_tokenizer`` has not been called."""
	        if self.tokenizer is None:
	            raise ValueError("Tokenizer not initialized. Call 'initialize_tokenizer' first.")

	    def preprocess_data(self):
	        """Tokenize the training text and fit the count vectorizer on it.

	        Returns:
	            A pair ``(X, y)``: the dense token-count matrix produced by the
	            vectorizer and the ``category_encoded`` column of the training frame.

	        Raises:
	            ValueError: If the tokenizer has not been initialized.
	        """
	        self._require_tokenizer()
	        documents = [
	            ' '.join(self.tokenizer.tokenize(text, padding=True, truncation=True))
	            for text in self.df['clean_text']
	        ]
	        X = self.vectorizer.fit_transform(documents).toarray()
	        return X, self.df['category_encoded']

	    def train_model(self, X, y):
	        """Fit the model on an 80/20 train/test split of ``X`` and ``y``.

	        Args:
	            X: Feature matrix, as returned by ``preprocess_data``.
	            y: Encoded labels, as returned by ``preprocess_data``.

	        Returns:
	            A pair ``(y_test, y_pred)``: the held-out labels and the model's
	            predictions for the held-out rows.
	        """
	        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
	        self.model.fit(X_train, y_train)
	        y_pred = self.model.predict(X_test)
	        return y_test, y_pred

	    def predict_category(self, raw_input_text):
	        """Predict the category label for a piece of raw text.

	        The text is cleaned with the instance's ``TextCleaner`` first so that
	        it goes through the same kind of input the model was trained on
	        (the ``clean_text`` column), then tokenized and vectorized.

	        Args:
	            raw_input_text: Uncleaned input text.

	        Returns:
	            The decoded (original) label of the predicted category.

	        Raises:
	            ValueError: If the tokenizer has not been initialized.
	        """
	        self._require_tokenizer()
	        input_text = self.text_cleaner.cleaning_text(raw_input_text)
	        # Tokenize the cleaned text; the raw text was tokenized here before,
	        # which silently discarded the cleaning step.
	        tokenized_input = self.tokenizer.tokenize(input_text, padding=True, truncation=True)
	        input_feature_vector = self.vectorizer.transform([' '.join(tokenized_input)]).toarray()
	        predicted_category_encoded = self.model.predict(input_feature_vector)
	        predicted_category = self.label_encoder.inverse_transform(predicted_category_encoded)
	        return predicted_category[0]

	    def initialize_tokenizer(self, model_name: str) -> None:
	        """Load the pretrained Hugging Face tokenizer named ``model_name``."""
	        self.model_name = model_name
	        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)

	    def save_model(self) -> None:
	        """Pickle the current model to ``self.model_path``."""
	        print("saving model...to pickle...")
	        with open(self.model_path, 'wb') as f:
	            pickle.dump(self.model, f)

	    def load_model(self):
	        """Unpickle the model from ``self.model_path``, store it and return it.

	        The loaded model is also assigned to ``self.model`` so that calling
	        this method without using its return value still takes effect.

	        Returns:
	            The unpickled model object.
	        """
	        print("loading model...from pickle...")
	        # SECURITY: unpickling can execute arbitrary code. Only load model
	        # files that come from a trusted source.
	        with open(self.model_path, 'rb') as f:
	            self.model = pickle.load(f)
	        return self.model

	if __name__ == "__main__":
	    # Demo entry point: train (or reuse) the model, then classify one sentence.
	    tokenizer_model_name = "nlptown/bert-base-multilingual-uncased-sentiment"
	    data_path = 'data/data.csv'
	    model_path = 'mental_health_model.pkl'

	    # The constructor already loads the pickled model when model_path exists,
	    # so no separate load_model() call is needed afterwards.
	    mental_classifier = MentalHealthClassifier(data_path, model_path)

	    # The tokenizer is required for both training and prediction.
	    mental_classifier.initialize_tokenizer(tokenizer_model_name)

	    # Only the model is pickled, not the vectorizer, so the vectorizer must be
	    # refit on the training text on every run, even when the model is reused.
	    X, y = mental_classifier.preprocess_data()

	    if not os.path.exists(model_path):
	        mental_classifier.train_model(X, y)
	        mental_classifier.save_model()

	    input_text = "I feel bullied online."
	    predicted_category = mental_classifier.predict_category(input_text)
	    print("Predicted mental health condition:", predicted_category)