import os.path
import pickle

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from transformers import AutoTokenizer
from xgboost import XGBClassifier


class MentalHealthClassifier:
    """Bag-of-tokens text classifier backed by an XGBoost model.

    The training CSV must provide the columns ``text``, ``clean_text`` and
    ``category``. ``clean_text`` is tokenized with a Hugging Face tokenizer,
    the tokens are counted with a ``CountVectorizer`` and the resulting
    count matrix is fed to an ``XGBClassifier``.

    Only the XGBoost model is persisted (via pickle). The label encoder is
    refitted from the CSV in ``__init__`` and the vectorizer is refitted by
    ``preprocess_data``, so ``preprocess_data`` has to run before
    ``predict_category`` even when the model was loaded from disk.
    """

    def __init__(self, data_path, model_path):
        """Load the dataset, encode the labels and load or create the model.

        Args:
            data_path: Path of the CSV file to read with ``pandas.read_csv``.
            model_path: Path of the pickled model. If the file exists it is
                loaded, otherwise a fresh ``XGBClassifier`` is created.
        """
        data = pd.read_csv(data_path, skip_blank_lines=True)
        # The dataset contains the misspelled label 'axienty'; fold it into
        # the correctly spelled class.
        data['category'] = data['category'].replace({'axienty': 'anxiety'})
        data.dropna(subset=['text', 'clean_text'], inplace=True)
        self.data = data

        self.data_selected = self.data[['clean_text', 'category']]
        # Work on an explicit copy so that adding 'category_encoded' below
        # can never alias or modify the selection it was built from.
        self.df = self.data_selected.copy()

        self.label_encoder = LabelEncoder()
        self.df['category_encoded'] = self.label_encoder.fit_transform(self.df['category'])

        self.tokenizer = None  # set by initialize_tokenizer()
        self.vectorizer = CountVectorizer()
        self.model_path = model_path
        self.model = self.load_model() if os.path.exists(model_path) else XGBClassifier()

    def _require_tokenizer(self):
        """Return the tokenizer, raising ValueError if it was never initialized."""
        if self.tokenizer is None:
            raise ValueError(
                "Tokenizer not initialized. "
                "Call 'initialize_tokenizer' first."
            )
        return self.tokenizer

    def preprocess_data(self):
        """Tokenize every ``clean_text`` row and fit the count vectorizer.

        Returns:
            A tuple ``(X, y)`` where ``X`` is the dense token-count matrix and
            ``y`` is the ``category_encoded`` column of the working frame.

        Raises:
            ValueError: If ``initialize_tokenizer`` has not been called.
        """
        tokenizer = self._require_tokenizer()
        tokenized_texts = [
            tokenizer.tokenize(text, padding=True, truncation=True)
            for text in self.df['clean_text']
        ]
        # CountVectorizer expects strings, so the tokens are re-joined with
        # spaces before counting.
        documents = [' '.join(tokens) for tokens in tokenized_texts]
        X = self.vectorizer.fit_transform(documents).toarray()
        return X, self.df['category_encoded']

    def train_model(self, X, y):
        """Fit the model on an 80/20 split and predict the held-out part.

        Args:
            X: Feature matrix as returned by ``preprocess_data``.
            y: Encoded labels as returned by ``preprocess_data``.

        Returns:
            A tuple ``(y_test, y_pred)`` with the held-out labels and the
            model's predictions for them.
        """
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42
        )
        self.model.fit(X_train, y_train)
        y_pred = self.model.predict(X_test)
        return y_test, y_pred

    def predict_category(self, input_text):
        """Predict the category label for a single piece of text.

        The vectorizer must already be fitted, i.e. ``preprocess_data`` must
        have been called on this instance.

        Args:
            input_text: The text to classify.

        Returns:
            The predicted category, decoded back to its original label.

        Raises:
            ValueError: If ``initialize_tokenizer`` has not been called.
        """
        tokenizer = self._require_tokenizer()
        tokenized_input = tokenizer.tokenize(input_text, padding=True, truncation=True)
        input_feature_vector = self.vectorizer.transform([' '.join(tokenized_input)]).toarray()
        predicted_category_encoded = self.model.predict(input_feature_vector)
        predicted_category = self.label_encoder.inverse_transform(predicted_category_encoded)
        return predicted_category[0]

    def initialize_tokenizer(self, model_name):
        """Load the pretrained tokenizer identified by ``model_name``."""
        self.model_name = model_name
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)

    def save_model(self):
        """Pickle the current model to ``self.model_path``."""
        print("saving model...to pickle...")
        with open(self.model_path, 'wb') as f:
            pickle.dump(self.model, f)

    def load_model(self):
        """Unpickle and return the model stored at ``self.model_path``.

        The loaded object is returned, not assigned; ``__init__`` stores it
        in ``self.model``.
        """
        print("loading model...from pickle...")
        # NOTE(security): unpickling can execute arbitrary code. Only load
        # model files that come from a trusted source.
        with open(self.model_path, 'rb') as f:
            return pickle.load(f)


if __name__ == "__main__":
    tokenizer_model_name = "nlptown/bert-base-multilingual-uncased-sentiment"
    data_path = 'data/data.csv'
    model_path = 'mental_health_model.pkl'

    # The constructor already loads the pickled model when it exists, so no
    # separate load_model() call is needed afterwards.
    mental_classifier = MentalHealthClassifier(data_path, model_path)

    # Both the training and the loading path need the tokenizer and a fitted
    # vectorizer: the vectorizer is not persisted with the model, so it is
    # refitted from the dataset on every run.
    mental_classifier.initialize_tokenizer(tokenizer_model_name)
    X, y = mental_classifier.preprocess_data()

    if not os.path.exists(model_path):
        y_test, y_pred = mental_classifier.train_model(X, y)
        mental_classifier.save_model()

    # Example usage:
    # input_text = "I feel anxiety whenever i am doing nothing."
    # predicted_category = mental_classifier.predict_category(input_text)
    # print("Predicted mental health condition:", predicted_category)