The-Adnan-Syed committed on
Commit
c5f4573
1 Parent(s): d26ed64

Upload stress_categorization_using_bert_transformer.py

stress_categorization_using_bert_transformer.py ADDED
@@ -0,0 +1,185 @@
+ # -*- coding: utf-8 -*-
+ """Stress Categorization Using BERT Transformer.ipynb
+
+ Automatically generated by Colaboratory.
+
+ Original file is located at
+ https://colab.research.google.com/drive/1JZTLCAUBN6XkcQpAWukUsx7dJ5VyC_KR
+ """
+
+ import numpy as np
+ import pandas as pd
+ from sklearn.model_selection import train_test_split
+ from sklearn.preprocessing import LabelEncoder
+
+ !pip install transformers
+
+ import pandas as pd
+ import re
+ from sklearn.model_selection import train_test_split
+ from transformers import BertTokenizer, TFBertForSequenceClassification
+ from transformers import InputExample, InputFeatures
+ import tensorflow as tf
+
+ # 1. Load and inspect the data
+ data = pd.read_excel('stress_data.xlsx')
+
+ # 2. Clean and preprocess the data
+ def clean_text(text):
+     text = text.lower()
+     # Strip URLs, then collapse digits and non-word characters into spaces
+     text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
+     text = re.sub(r'\d+|\W+', ' ', text)
+     return text
+
+ data['Cleaned_Posts'] = data['Posts'].apply(clean_text)
+
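+ # Illustrative check of the cleaning step (the input string is made up, not
+ # taken from the dataset): URLs are removed, digits and punctuation collapse
+ # into spaces.
+ # clean_text("Lost my job!!! See https://example.com, owe $2000")
+ # -> roughly "lost my job see owe"
+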
+ # Convert string labels to integer indices
+ label_encoder = LabelEncoder()
+ data['LabelIndices'] = label_encoder.fit_transform(data['Labels'])
+
+ # 3. Tokenize data using BERT's tokenizer
+ tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", do_lower_case=True)
+
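+ # Illustrative check (not in the original notebook): encode one made-up
+ # sentence to see the structure BERT consumes.
+ _enc = tokenizer.encode_plus("i feel stressed", add_special_tokens=True,
+                              max_length=8, padding='max_length',
+                              return_attention_mask=True)
+ print(_enc['input_ids'])       # [CLS] and [SEP] ids around the tokens, then padding ids
+ print(_enc['attention_mask'])  # 1s over real tokens, 0s over the padding
+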
+ # Split the data into train and test
+ train, test = train_test_split(data, test_size=0.2, random_state=42)
+
+ # Convert data to InputExample format
+ def convert_data_to_input_example(data):
+     return data.apply(lambda x: InputExample(guid=None, text_a=x['Cleaned_Posts'], text_b=None, label=x['LabelIndices']), axis=1)
+
+ train_InputExamples = convert_data_to_input_example(train)
+ test_InputExamples = convert_data_to_input_example(test)
+
+ # Convert to features for BERT input
+ def convert_input_example_to_feature(example):
+     # padding='max_length' with truncation=True replaces the deprecated pad_to_max_length=True
+     return tokenizer.encode_plus(example.text_a, add_special_tokens=True, max_length=128, padding='max_length', truncation=True, return_attention_mask=True, return_token_type_ids=False)
+
+ train_features = train_InputExamples.apply(convert_input_example_to_feature)
+ test_features = test_InputExamples.apply(convert_input_example_to_feature)
+
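+ # Illustrative sanity check: each feature is a dict of fixed-length sequences.
+ print(len(train_features.iloc[0]['input_ids']))  # -> 128
+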
+ # Convert features to a tensorflow dataset
+ def convert_features_to_tf_dataset(features, labels):
+     def gen():
+         for f, l in zip(features, labels):
+             yield ({'input_ids': f['input_ids'], 'attention_mask': f['attention_mask']}, l)
+     return tf.data.Dataset.from_generator(
+         gen,
+         ({'input_ids': tf.int32, 'attention_mask': tf.int32}, tf.int64),
+         ({'input_ids': tf.TensorShape([None]), 'attention_mask': tf.TensorShape([None])}, tf.TensorShape([])))
+
+ def decode_predictions(predictions):
+     # Expects pipeline-style output: a list of dicts whose 'label' values look like 'LABEL_3'
+     predicted_indices = [int(pred['label'].split('_')[-1]) for pred in predictions]
+     # Decode the integer indices back to the original string labels
+     decoded_labels = label_encoder.inverse_transform(predicted_indices)
+     return decoded_labels
+
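+ # Illustrative check (not in the original notebook): peek at one small batch
+ # to confirm the shapes the model will receive.
+ for _inputs, _labels in convert_features_to_tf_dataset(train_features, train['LabelIndices']).batch(2).take(1):
+     print(_inputs['input_ids'].shape)  # (2, 128)
+     print(_labels)                     # two integer label indices
+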
+ train_dataset = convert_features_to_tf_dataset(train_features, train['LabelIndices']).shuffle(100).batch(32).repeat(2)
+ test_dataset = convert_features_to_tf_dataset(test_features, test['LabelIndices']).batch(32)
+
+ # 4. Fine-tune BERT on the dataset
+ model_new = TFBertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=len(data['Labels'].unique()))
+ model_new.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0),
+                   loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
+                   metrics=[tf.keras.metrics.SparseCategoricalAccuracy('accuracy')])
+ model_new.fit(train_dataset, epochs=1, validation_data=test_dataset)
+
+ # Predict on a single sentence: the model expects tokenized tensors rather
+ # than a raw string, so encode first and take the argmax over the logits
+ sample = tokenizer("I am financially broken", return_tensors="tf", padding='max_length', truncation=True, max_length=128)
+ sample_logits = model_new(sample).logits
+ print(label_encoder.inverse_transform([int(tf.argmax(sample_logits, axis=-1)[0])]))
+
+ # 5. Evaluate the model
+ loss, accuracy = model_new.evaluate(test_dataset)
+ print(f"Test accuracy: {accuracy}")
+
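+ # Saving the fine-tuned weights is not shown in the original notebook, but the
+ # pipeline below loads "NeuEraAI/Stress_Classifier_BERT" from the Hub, so the
+ # model was presumably uploaded. A minimal local-save sketch:
+ model_new.save_pretrained('stress_classifier_bert')
+ tokenizer.save_pretrained('stress_classifier_bert')
+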
+
+ """# New Section"""
+
+ !pip install transformers
+
+ import pandas as pd
+ import re
+ from sklearn.model_selection import train_test_split
+ from sklearn.preprocessing import LabelEncoder
+ from transformers import BertTokenizer, TFBertForSequenceClassification
+ from transformers import InputExample, InputFeatures
+ import tensorflow as tf
+
+ # 1. Load and inspect the data
+ data = pd.read_excel('stress_data.xlsx')
+
+ # 2. Clean and preprocess the data
+ def clean_text(text):
+     text = text.lower()
+     text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
+     text = re.sub(r'\d+|\W+', ' ', text)
+     return text
+
+ data['Cleaned_Posts'] = data['Posts'].apply(clean_text)
+
+ # Convert string labels to integer indices
+ label_encoder = LabelEncoder()
+ data['LabelIndices'] = label_encoder.fit_transform(data['Labels'])
+
+ # 3. Tokenize data using BERT's tokenizer
+ tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", do_lower_case=True)
+
+ tokenizer  # notebook cell: display the tokenizer configuration
+
+ data  # notebook cell: display the dataframe
+
+ import pandas as pd
+ import re
+ from sklearn.model_selection import train_test_split
+ from sklearn.preprocessing import LabelEncoder
+ from transformers import BertTokenizer, TFBertForSequenceClassification
+ from transformers import InputExample, InputFeatures
+ import tensorflow as tf
+
+ # 1. Load and inspect the data
+ data = pd.read_excel('stress_data.xlsx')
+
+ # 2. Clean and preprocess the data
+ def clean_text(text):
+     text = text.lower()
+     text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
+     text = re.sub(r'\d+|\W+', ' ', text)
+     return text
+
+ data['Cleaned_Posts'] = data['Posts'].apply(clean_text)
+
+ # Convert string labels to integer indices
+ label_encoder = LabelEncoder()
+ data['LabelIndices'] = label_encoder.fit_transform(data['Labels'])
+
+ data["Labels"]  # notebook cell: display the raw label column
+
+ # Build a {label string: integer index} mapping for reference
+ le_name_mapping = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))
+ print(le_name_mapping)
+
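+ # The printed mapping depends on the dataset's label strings; LabelEncoder
+ # sorts classes alphabetically before assigning indices, so hypothetically it
+ # might look like {'Anxiety': 0, 'Depression': 1, 'Financial': 2, ...}
+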
+ import joblib
+
+ # Persist the fitted LabelEncoder so predictions can be decoded later
+ joblib.dump(label_encoder, 'label_encoder.joblib')
+
+ label_encoder = joblib.load("/content/label_encoder.joblib")
+
+ def decode_predictions(predictions):
+     # Expects pipeline-style output: a list of dicts whose 'label' values look like 'LABEL_3'
+     predicted_indices = [int(pred['label'].split('_')[-1]) for pred in predictions]
+     # Decode the integer indices back to the original string labels
+     decoded_labels = label_encoder.inverse_transform(predicted_indices)
+     return decoded_labels
+
+ # Use a pipeline as a high-level helper
+ from transformers import pipeline
+
+ pipe = pipeline("text-classification", model="NeuEraAI/Stress_Classifier_BERT")
+
+ decode_predictions(pipe.predict("I am in huge debts. I have taken huge loans and I can't repay."))
+
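+ # For reference, a text-classification pipeline returns a list of dicts such
+ # as [{'label': 'LABEL_2', 'score': 0.97}] (values illustrative), which is
+ # exactly the shape decode_predictions expects.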