Spaces:
Sleeping
Sleeping
###
# - Author: Jaelin Lee
# - Date: Mar 23, 2024
# - Description: XGBoost mental health classifier [depression, adhd, anxiety, social_isolation, cyberbullying, social_media_addiction]. Incorporates the updated code from Aleksandra Śledziewska that fixed the token size issue. The model is loaded from pickle if it has already been saved there, which saves time on each prediction because the model does not have to be retrained.
###
import os.path | |
import pickle | |
import pandas as pd | |
from transformers import AutoTokenizer | |
from sklearn.model_selection import train_test_split | |
from sklearn.feature_extraction.text import CountVectorizer | |
from xgboost import XGBClassifier | |
from sklearn.preprocessing import LabelEncoder | |
from clean_text_model import TextCleaner | |
class MentalHealthClassifier:
    """Bag-of-words XGBoost classifier over the dataset's ``category`` labels.

    The constructor reads the CSV at ``data_path``, label-encodes the
    ``category`` column, and either unpickles an existing model from
    ``model_path`` or creates an untrained ``XGBClassifier``.

    Only the model is pickled. The ``CountVectorizer`` and ``LabelEncoder``
    are rebuilt from the CSV on every run, so ``preprocess_data`` must be
    called (after ``initialize_tokenizer``) before ``predict_category`` even
    when the model was loaded from disk.
    NOTE(review): a loaded model only lines up with the rebuilt vocabulary and
    label encoding if the CSV is the same one it was trained on -- confirm.
    """

    def __init__(self, data_path, model_path):
        """Load the dataset and prepare (or load) the model.

        Args:
            data_path: Path of a CSV file with ``clean_text`` and
                ``category`` columns.
            model_path: Path of the pickled model. If the file exists, the
                model is loaded from it; otherwise a fresh ``XGBClassifier``
                is created.
        """
        self.data = pd.read_csv(data_path, skip_blank_lines=True)
        # The dataset contains a misspelled label; fold it into 'anxiety'.
        self.data['category'] = [
            'anxiety' if label == 'axienty' else label
            for label in self.data['category']
        ]
        self.data.dropna(subset=['clean_text'], inplace=True)
        self.data_selected = self.data[['clean_text', 'category']]
        # Explicit copy so the encoded column is added to an independent frame.
        self.df = self.data_selected.copy()
        self.label_encoder = LabelEncoder()
        self.df['category_encoded'] = self.label_encoder.fit_transform(
            self.df['category']
        )
        self.tokenizer = None
        self.vectorizer = CountVectorizer()
        self.text_cleaner = TextCleaner()
        self.model_path = model_path
        if os.path.exists(model_path):
            self.model = self.load_model()
        else:
            self.model = XGBClassifier()

    def _require_tokenizer(self):
        """Raise ``ValueError`` if ``initialize_tokenizer`` has not been called."""
        if self.tokenizer is None:
            raise ValueError("Tokenizer not initialized. Call 'initialize_tokenizer' first.")

    def _tokens_as_text(self, text):
        """Tokenize ``text`` and re-join the tokens with single spaces."""
        tokens = self.tokenizer.tokenize(text, padding=True, truncation=True)
        return ' '.join(tokens)

    def preprocess_data(self):
        """Fit the vectorizer on the tokenized ``clean_text`` column.

        Returns:
            A tuple ``(X, y)``: the dense count matrix and the encoded
            category column.

        Raises:
            ValueError: If the tokenizer has not been initialized.
        """
        self._require_tokenizer()
        documents = [self._tokens_as_text(text) for text in self.df['clean_text']]
        X = self.vectorizer.fit_transform(documents).toarray()
        return X, self.df['category_encoded']

    def train_model(self, X, y):
        """Fit the model on an 80/20 split and predict the held-out part.

        Args:
            X: Feature matrix, as returned by ``preprocess_data``.
            y: Encoded labels, as returned by ``preprocess_data``.

        Returns:
            A tuple ``(y_test, y_pred)`` for the held-out 20 %.
        """
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42
        )
        self.model.fit(X_train, y_train)
        y_pred = self.model.predict(X_test)
        return y_test, y_pred

    def predict_category(self, raw_input_text):
        """Predict the category label for one raw input string.

        The text is cleaned before tokenization so that inference features
        are built the same way as the ``clean_text`` training column.
        NOTE(review): assumes ``TextCleaner.cleaning_text`` returns a string
        in the same form as the dataset's ``clean_text`` -- confirm.

        Args:
            raw_input_text: Uncleaned user text.

        Returns:
            The decoded category label of the prediction.

        Raises:
            ValueError: If the tokenizer has not been initialized.
        """
        self._require_tokenizer()
        input_text = self.text_cleaner.cleaning_text(raw_input_text)
        # Tokenize the cleaned text; the raw text was previously used here,
        # which left the cleaning step without effect.
        document = self._tokens_as_text(input_text)
        input_feature_vector = self.vectorizer.transform([document]).toarray()
        predicted_category_encoded = self.model.predict(input_feature_vector)
        predicted_category = self.label_encoder.inverse_transform(
            predicted_category_encoded
        )
        return predicted_category[0]

    def initialize_tokenizer(self, model_name):
        """Load the pretrained tokenizer identified by ``model_name``."""
        self.model_name = model_name
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)

    def save_model(self):
        """Pickle the model to ``self.model_path``."""
        print("saving model...to pickle...")
        with open(self.model_path, 'wb') as f:
            pickle.dump(self.model, f)

    def load_model(self):
        """Unpickle and return the model stored at ``self.model_path``.

        The loaded object is returned, not assigned to ``self.model``.
        """
        print("loading model...from pickle...")
        # SECURITY: unpickling can execute arbitrary code; only load model
        # files that come from a trusted source.
        with open(self.model_path, 'rb') as f:
            return pickle.load(f)
if __name__ == "__main__":
    # Demo entry point: train and save the model on first run, reuse the
    # pickled model on later runs, then classify one sample sentence.
    tokenizer_model_name = "nlptown/bert-base-multilingual-uncased-sentiment"
    data_path = 'data/data.csv'
    model_path = 'mental_health_model.pkl'

    # The constructor already loads the pickled model when model_path exists,
    # so no separate load_model() call is needed here.
    mental_classifier = MentalHealthClassifier(data_path, model_path)

    # Both paths need the tokenizer and a fitted vectorizer: the vectorizer is
    # not stored in the pickle, so it is re-fitted from the CSV on every run.
    mental_classifier.initialize_tokenizer(tokenizer_model_name)
    X, y = mental_classifier.preprocess_data()

    if not os.path.exists(model_path):
        mental_classifier.train_model(X, y)
        mental_classifier.save_model()

    input_text = "I feel bullied online."
    predicted_category = mental_classifier.predict_category(input_text)
    print("Predicted mental health condition:", predicted_category)