|
|
|
"""Stress Categorization Using BERT Transformer.ipynb |
|
|
|
Automatically generated by Colaboratory. |
|
|
|
Original file is located at |
|
https://colab.research.google.com/drive/1JZTLCAUBN6XkcQpAWukUsx7dJ5VyC_KR |
|
""" |
|
|
|
import numpy as np |
|
import pandas as pd |
|
from tensorflow.keras.preprocessing.text import Tokenizer |
|
from tensorflow.keras.preprocessing.sequence import pad_sequences |
|
from tensorflow.keras.models import Sequential |
|
from tensorflow.keras.layers import Embedding, Flatten, Dense, LSTM, Dropout |
|
from tensorflow.keras.utils import to_categorical |
|
from sklearn.model_selection import train_test_split |
|
from sklearn.preprocessing import LabelEncoder |
|
|
|
# Install the Hugging Face library with the interpreter that runs this file.
# The notebook form (`!pip ...`) is IPython-only syntax and is a SyntaxError
# in a plain Python module, so the same step is done through `python -m pip`.
import subprocess
import sys

subprocess.check_call([sys.executable, "-m", "pip", "install", "transformers"])
|
|
|
import pandas as pd |
|
import re |
|
from sklearn.model_selection import train_test_split |
|
from transformers import BertTokenizer, TFBertForSequenceClassification |
|
from transformers import InputExample, InputFeatures |
|
import tensorflow as tf |
|
|
|
|
|
# Load the spreadsheet (resolved against the current working directory) into
# a DataFrame; later cells read its 'Posts' and 'Labels' columns.
data = pd.read_excel(io='stress_data.xlsx')
|
|
|
|
|
def clean_text(text):
    """Normalise a raw post before tokenisation.

    The text is lower-cased, URLs are removed, and every run of digits or of
    non-word characters is replaced by a single space.

    Args:
        text: The raw post. ``None`` and float NaN (what pandas yields for
            empty spreadsheet cells) produce an empty string; any other
            non-string value is converted with ``str`` first.

    Returns:
        The cleaned string. Leading and trailing spaces are not stripped.
    """
    if not isinstance(text, str):
        # Missing cells would otherwise crash on ``.lower()``; NaN is the
        # only float that is not equal to itself.
        if text is None or (isinstance(text, float) and text != text):
            return ''
        text = str(text)

    text = text.lower()
    # ``http\S+`` already matches ``https`` links, so two alternatives are
    # enough; no anchors are used, so ``re.MULTILINE`` is not needed either.
    text = re.sub(r'http\S+|www\S+', '', text)
    return re.sub(r'\d+|\W+', ' ', text)
|
|
|
# Normalise every post and store the result next to the raw text.
data['Cleaned_Posts'] = data['Posts'].apply(clean_text)

# Learn the label vocabulary, then map each label to its integer class id.
label_encoder = LabelEncoder()
label_encoder.fit(data['Labels'])
data['LabelIndices'] = label_encoder.transform(data['Labels'])

# Tokenizer that belongs to the uncased BERT checkpoint fine-tuned below.
tokenizer = BertTokenizer.from_pretrained(
    "bert-base-uncased",
    do_lower_case=True,
)

# 80/20 split; the fixed seed makes the partition reproducible.
train, test = train_test_split(data, test_size=0.2, random_state=42)
|
|
|
|
|
def convert_data_to_input_example(data):
    """Wrap each row of *data* in a transformers ``InputExample``.

    Args:
        data: DataFrame providing the ``Cleaned_Posts`` (text) and
            ``LabelIndices`` (label) columns.

    Returns:
        The result of the row-wise ``DataFrame.apply``: one ``InputExample``
        per row, with ``guid`` and ``text_b`` left as ``None``.
    """
    def _row_to_example(row):
        return InputExample(
            guid=None,
            text_a=row['Cleaned_Posts'],
            text_b=None,
            label=row['LabelIndices'],
        )

    return data.apply(_row_to_example, axis=1)
|
|
|
# Convert both splits (train first, then test) into InputExample collections.
train_InputExamples, test_InputExamples = (
    convert_data_to_input_example(split) for split in (train, test)
)
|
|
|
|
|
def convert_input_example_to_feature(example):
    """Tokenise one example into fixed-length model inputs.

    Args:
        example: Object whose ``text_a`` attribute holds the text to encode.

    Returns:
        The encoding produced by the module-level ``tokenizer``, holding
        ``input_ids`` and ``attention_mask`` padded or truncated to 128
        tokens (token type ids are not requested).
    """
    return tokenizer.encode_plus(
        example.text_a,
        add_special_tokens=True,
        max_length=128,
        # ``pad_to_max_length`` is deprecated; state padding and truncation
        # explicitly so long posts are cut to ``max_length`` on purpose
        # instead of through the library's backward-compatibility fallback.
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_token_type_ids=False,
    )
|
|
|
# Tokenise every example of both splits (train first, then test).
train_features, test_features = (
    examples.apply(convert_input_example_to_feature)
    for examples in (train_InputExamples, test_InputExamples)
)
|
|
|
|
|
def convert_features_to_tf_dataset(features, labels):
    """Expose tokenised features and their labels as a ``tf.data.Dataset``.

    Args:
        features: Iterable of encodings indexable by ``'input_ids'`` and
            ``'attention_mask'``.
        labels: Iterable of labels, paired with *features* by position.

    Returns:
        An unbatched dataset of ``(inputs, label)`` elements, where
        ``inputs`` is a dict of two variable-length int32 vectors and
        ``label`` is an int64 scalar.
    """
    def _pairs():
        for encoding, label in zip(features, labels):
            model_inputs = {
                'input_ids': encoding['input_ids'],
                'attention_mask': encoding['attention_mask'],
            }
            yield (model_inputs, label)

    output_types = (
        {'input_ids': tf.int32, 'attention_mask': tf.int32},
        tf.int64,
    )
    output_shapes = (
        {
            'input_ids': tf.TensorShape([None]),
            'attention_mask': tf.TensorShape([None]),
        },
        tf.TensorShape([]),
    )
    return tf.data.Dataset.from_generator(_pairs, output_types, output_shapes)
|
|
|
def decode_predictions(predictions):
    """Map classifier outputs back to the original label names.

    Args:
        predictions: Iterable of mappings whose ``'label'`` value is a string
            ending in ``_<index>`` (presumably the ``LABEL_<n>`` names a
            text-classification pipeline emits; confirm against the model
            config).

    Returns:
        Whatever the module-level ``label_encoder.inverse_transform`` returns
        for the parsed integer indices.
    """
    predicted_indices = []
    for prediction in predictions:
        # Keep only the text after the last underscore.
        index_text = prediction['label'].rpartition('_')[2]
        predicted_indices.append(int(index_text))

    return label_encoder.inverse_transform(predicted_indices)
|
|
|
|
|
# Input pipelines. The training set is shuffled (buffer of 100), batched and
# repeated twice, so the single Keras epoch below passes over it two times.
train_dataset = (
    convert_features_to_tf_dataset(train_features, train['LabelIndices'])
    .shuffle(100)
    .batch(32)
    .repeat(2)
)
test_dataset = convert_features_to_tf_dataset(test_features, test['LabelIndices']).batch(32)

# Fine-tune BERT with one output unit per distinct label.
model_new = TFBertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(data['Labels'].unique()),
)
model_new.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0),
    # The model outputs raw logits, hence from_logits=True.
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=[tf.keras.metrics.SparseCategoricalAccuracy('accuracy')],
)
model_new.fit(train_dataset, epochs=1, validation_data=test_dataset)

# Sanity-check one sentence. The Keras model cannot consume a raw string and
# does not return pipeline-style {'label': ...} dicts, so the text is cleaned
# and tokenised like the training data, and the arg-max logit is decoded
# directly with the label encoder.
sample_inputs = tokenizer(
    clean_text("I am financially broken"),
    max_length=128,
    padding='max_length',
    truncation=True,
    return_token_type_ids=False,
    return_tensors='tf',
)
sample_logits = model_new(dict(sample_inputs), training=False).logits
sample_indices = tf.argmax(sample_logits, axis=-1).numpy()
print(label_encoder.inverse_transform(sample_indices))

# Final score on the held-out split.
loss, accuracy = model_new.evaluate(test_dataset)
print(f"Test accuracy: {accuracy}")
|
|
|
|
|
|
|
|
|
|
|
"""# New Section""" |
|
|
|
|
|
|
|
# Install the Hugging Face library with the interpreter that runs this file.
# A bare `pip install ...` line only works through IPython's automagic and is
# a SyntaxError in a plain Python module, so `python -m pip` is used instead.
import subprocess
import sys

subprocess.check_call([sys.executable, "-m", "pip", "install", "transformers"])
|
|
|
import pandas as pd |
|
import re |
|
from sklearn.model_selection import train_test_split |
|
from transformers import BertTokenizer, TFBertForSequenceClassification |
|
from transformers import InputExample, InputFeatures |
|
import tensorflow as tf |
|
|
|
|
|
# Re-read the spreadsheet from the working directory for this section.
data = pd.read_excel(io='stress_data.xlsx')
|
|
|
|
|
def clean_text(text):
    """Normalise a raw post before tokenisation.

    The text is lower-cased, URLs are removed, and every run of digits or of
    non-word characters is replaced by a single space.

    Args:
        text: The raw post. ``None`` and float NaN (what pandas yields for
            empty spreadsheet cells) produce an empty string; any other
            non-string value is converted with ``str`` first.

    Returns:
        The cleaned string. Leading and trailing spaces are not stripped.
    """
    if not isinstance(text, str):
        # Missing cells would otherwise crash on ``.lower()``; NaN is the
        # only float that is not equal to itself.
        if text is None or (isinstance(text, float) and text != text):
            return ''
        text = str(text)

    text = text.lower()
    # ``http\S+`` already matches ``https`` links, so two alternatives are
    # enough; no anchors are used, so ``re.MULTILINE`` is not needed either.
    text = re.sub(r'http\S+|www\S+', '', text)
    return re.sub(r'\d+|\W+', ' ', text)
|
|
|
# Normalise every post and keep the result in its own column.
data['Cleaned_Posts'] = data['Posts'].apply(clean_text)

# Fit the encoder on the label column, then store the integer class ids.
label_encoder = LabelEncoder().fit(data['Labels'])
data['LabelIndices'] = label_encoder.transform(data['Labels'])

# Tokenizer that belongs to the uncased BERT checkpoint.
tokenizer = BertTokenizer.from_pretrained(
    "bert-base-uncased",
    do_lower_case=True,
)

# Bare expressions: a notebook echoes these values; run as a script they
# have no effect.
tokenizer

data
|
|
|
import pandas as pd |
|
import re |
|
from sklearn.model_selection import train_test_split |
|
from sklearn.preprocessing import LabelEncoder |
|
from transformers import BertTokenizer, TFBertForSequenceClassification |
|
from transformers import InputExample, InputFeatures |
|
import tensorflow as tf |
|
|
|
|
|
# Load the spreadsheet once more; the cells below rebuild the label encoder
# from its 'Labels' column.
data = pd.read_excel(io='stress_data.xlsx')
|
|
|
|
|
def clean_text(text):
    """Normalise a raw post before tokenisation.

    The text is lower-cased, URLs are removed, and every run of digits or of
    non-word characters is replaced by a single space.

    Args:
        text: The raw post. ``None`` and float NaN (what pandas yields for
            empty spreadsheet cells) produce an empty string; any other
            non-string value is converted with ``str`` first.

    Returns:
        The cleaned string. Leading and trailing spaces are not stripped.
    """
    if not isinstance(text, str):
        # Missing cells would otherwise crash on ``.lower()``; NaN is the
        # only float that is not equal to itself.
        if text is None or (isinstance(text, float) and text != text):
            return ''
        text = str(text)

    text = text.lower()
    # ``http\S+`` already matches ``https`` links, so two alternatives are
    # enough; no anchors are used, so ``re.MULTILINE`` is not needed either.
    text = re.sub(r'http\S+|www\S+', '', text)
    return re.sub(r'\d+|\W+', ' ', text)
|
|
|
# Normalise every post and keep the result in its own column.
data['Cleaned_Posts'] = data['Posts'].apply(clean_text)

# Encode the string labels as integer class ids.
label_encoder = LabelEncoder()
data['LabelIndices'] = label_encoder.fit_transform(data['Labels'])

# Bare expression: echoed by a notebook, a no-op when run as a script.
data["Labels"]

# Show which integer id each label name was assigned.
le_name_mapping = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))
print(le_name_mapping)

import joblib

# Save and reload through ONE path. The original wrote to the working
# directory but read from a hard-coded '/content/...' location, which only
# matched when the working directory happened to be /content.
LABEL_ENCODER_PATH = 'label_encoder.joblib'

joblib.dump(label_encoder, LABEL_ENCODER_PATH)

# NOTE: joblib files are pickle-based; only load files you created yourself.
label_encoder = joblib.load(LABEL_ENCODER_PATH)
|
|
|
def decode_predictions(predictions):
    """Turn classifier outputs into the original label names.

    Args:
        predictions: Iterable of mappings whose ``'label'`` value is a string
            ending in ``_<index>`` (presumably the ``LABEL_<n>`` names a
            text-classification pipeline emits; confirm against the model
            config).

    Returns:
        Whatever the module-level ``label_encoder.inverse_transform`` returns
        for the parsed integer indices.
    """
    # Only the text after the final underscore carries the class index.
    index_texts = (item['label'].rsplit('_', 1)[-1] for item in predictions)
    class_indices = list(map(int, index_texts))
    return label_encoder.inverse_transform(class_indices)
|
|
|
|
|
from transformers import pipeline

# Text-classification pipeline backed by the published checkpoint.
pipe = pipeline(
    task="text-classification",
    model="NeuEraAI/Stress_Classifier_BERT",
)

# Classify one sentence and decode its label. The value is only echoed in a
# notebook; as a script the result is discarded.
decode_predictions(
    pipe.predict("I am in huge debts. I have taken huge loans and I can't repay.")
)
|
|
|
|