import os.path
import pickle
import pandas as pd
from transformers import AutoTokenizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder

class MentalHealthClassifier:
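    """Text classifier for mental health categories.

    A Hugging Face tokenizer splits each cleaned text into subword tokens,
    a CountVectorizer turns the rejoined tokens into bag-of-words counts,
    and an XGBClassifier predicts the label-encoded category.
    """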
    def __init__(self, data_path, model_path):
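        """Load the CSV dataset, clean it, and label-encode the categories."""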
        self.data = pd.read_csv(data_path, skip_blank_lines=True)
        # Normalize the misspelled 'axienty' label to 'anxiety'.
        self.data['category'] = ['anxiety' if x == 'axienty' else x for x in self.data['category']]
        # Drop rows with missing raw or cleaned text.
        self.data.dropna(subset=['text', 'clean_text'], inplace=True)
        self.data_selected = self.data[['clean_text', 'category']]
        self.df = pd.DataFrame(self.data_selected)
        self.label_encoder = LabelEncoder()
        self.df['category_encoded'] = self.label_encoder.fit_transform(self.df['category'])
        self.tokenizer = None
        self.vectorizer = CountVectorizer()
        self.model_path = model_path
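        # Reuse a previously pickled model if one exists; otherwise start with a fresh XGBClassifier.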
        self.model = self.load_model() if os.path.exists(model_path) else XGBClassifier()

    def preprocess_data(self):
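        """Tokenize the cleaned texts and fit a bag-of-words feature matrix.

        The subword tokens are rejoined with spaces before vectorization, so the
        CountVectorizer counts subword occurrences rather than whole words.
        """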
        tokenized_texts = [self.tokenizer.tokenize(text, padding=True, truncation=True) for text in self.df['clean_text']]
        X = self.vectorizer.fit_transform([' '.join(tokens) for tokens in tokenized_texts]).toarray()
        return X, self.df['category_encoded']

    def train_model(self, X, y):
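        """Fit the model on an 80/20 train/test split and return the test labels and predictions."""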
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
        self.model.fit(X_train, y_train)
        y_pred = self.model.predict(X_test)
        return y_test, y_pred

    def predict_category(self, input_text):
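        """Tokenize and vectorize a single text, then return the decoded predicted category."""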
        if self.tokenizer is None:
            raise ValueError("Tokenizer not initialized. Call 'initialize_tokenizer' first.")
        tokenized_input = self.tokenizer.tokenize(input_text, padding=True, truncation=True)
        input_feature_vector = self.vectorizer.transform([' '.join(tokenized_input)]).toarray()
        predicted_category_encoded = self.model.predict(input_feature_vector)
        predicted_category = self.label_encoder.inverse_transform(predicted_category_encoded)
        return predicted_category[0]

    def initialize_tokenizer(self, model_name):
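        """Load the pretrained Hugging Face tokenizer by name."""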
        self.model_name = model_name
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)

    def save_model(self):
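        """Pickle the trained model; the vectorizer and label encoder are not persisted."""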
        print("saving model...to pickle...")
        with open(self.model_path, 'wb') as f:
            pickle.dump(self.model, f)

    def load_model(self):
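        """Unpickle a previously saved model from disk."""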
        print("loading model...from pickle...")
        with open(self.model_path, 'rb') as f:
            return pickle.load(f)

if __name__ == "__main__":
    tokenizer_model_name = "nlptown/bert-base-multilingual-uncased-sentiment"
    data_path = 'data/data.csv'
    model_path = 'mental_health_model.pkl'
    mental_classifier = MentalHealthClassifier(data_path, model_path)

    mental_classifier.initialize_tokenizer(tokenizer_model_name)
    if not os.path.exists(model_path):
        X, y = mental_classifier.preprocess_data()
        y_test, y_pred = mental_classifier.train_model(X, y)
        print("Test accuracy:", accuracy_score(y_test, y_pred))
        mental_classifier.save_model()
    else:
        # The constructor already unpickled the model; re-run preprocessing so the
        # CountVectorizer is fitted again (it is not saved with the model).
        mental_classifier.preprocess_data()

    # input_text = "I feel anxiety whenever i am doing nothing."
    # predicted_category = mental_classifier.predict_category(input_text)
    # print("Predicted mental health condition:", predicted_category)