File size: 6,684 Bytes
c5f4573 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 |
# -*- coding: utf-8 -*-
"""Stress Categorization Using BERT Transformer.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1JZTLCAUBN6XkcQpAWukUsx7dJ5VyC_KR
"""
import numpy as np
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Flatten, Dense, LSTM, Dropout
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
# Notebook shell magic ("!pip ...") is not valid Python in a .py module, so it
# is kept here only as a setup note. Install the dependency from a shell first:
#     pip install transformers
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, TFBertForSequenceClassification
from transformers import InputExample, InputFeatures
import tensorflow as tf
# 1. Load the data
# Reads the workbook 'stress_data.xlsx' (path relative to the working directory) into `data`.
# NOTE(review): the statements that follow read the 'Posts' and 'Labels' columns — confirm the sheet provides both.
data = pd.read_excel('stress_data.xlsx')
# 2. Clean and preprocess the data
def clean_text(text):
    """Normalize a raw post before tokenization.

    Lower-cases the text, removes URLs, and replaces every run of digits or
    non-word characters with a single space.

    Args:
        text: The raw post. ``None`` and float NaN (e.g. a missing cell in a
            loaded spreadsheet) are treated as an empty post; any other
            non-string value is converted with ``str()`` first.

    Returns:
        The cleaned string. Whitespace is not collapsed or trimmed, so
        adjacent replacements can leave consecutive or trailing spaces.
    """
    # NaN is the only float that compares unequal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    text = str(text).lower()
    # Drop anything that looks like a link before stripping punctuation,
    # otherwise the URL would be broken into stray word fragments.
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    text = re.sub(r'\d+|\W+', ' ', text)
    return text
# Store a normalized copy of every post alongside the raw text.
data['Cleaned_Posts'] = data['Posts'].apply(clean_text)

# Fit the encoder on the label strings, then map each label to its integer class id.
label_encoder = LabelEncoder().fit(data['Labels'])
data['LabelIndices'] = label_encoder.transform(data['Labels'])

# 3. Load the pretrained lower-casing BERT tokenizer.
tokenizer = BertTokenizer.from_pretrained(
    "bert-base-uncased",
    do_lower_case=True,
)

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
train, test = train_test_split(data, random_state=42, test_size=0.2)
# Convert data to InputExample format
def convert_data_to_input_example(data):
    """Build one ``InputExample`` per row of ``data``.

    Each row's 'Cleaned_Posts' value becomes ``text_a`` and its
    'LabelIndices' value becomes ``label``; ``guid`` and ``text_b`` are left
    as ``None``. Returns the result of the row-wise ``apply``.
    """
    def _row_to_example(row):
        return InputExample(
            guid=None,
            text_a=row['Cleaned_Posts'],
            text_b=None,
            label=row['LabelIndices'],
        )

    return data.apply(_row_to_example, axis=1)
# Run the same row-to-example conversion over both splits.
train_InputExamples, test_InputExamples = (
    convert_data_to_input_example(split) for split in (train, test)
)
# Convert to features for BERT input
def convert_input_example_to_feature(example, max_length=128):
    """Tokenize one example into fixed-length BERT inputs.

    Args:
        example: Object whose ``text_a`` attribute holds the text to encode.
        max_length: Number of token ids every encoding is padded or truncated
            to. Defaults to 128, the value previously hard-coded here.

    Returns:
        The encoding produced by the module-level ``tokenizer``, requested
        with an attention mask and without token type ids.
    """
    # `pad_to_max_length` is deprecated. Ask for padding explicitly and enable
    # truncation so over-long posts are cut to `max_length` deliberately
    # rather than through the tokenizer's implicit fallback.
    return tokenizer.encode_plus(
        example.text_a,
        add_special_tokens=True,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_token_type_ids=False,
    )
# Tokenize every example in both splits with the same settings.
train_features, test_features = (
    examples.apply(convert_input_example_to_feature)
    for examples in (train_InputExamples, test_InputExamples)
)
# Convert features to tensorflow dataset
def convert_features_to_tf_dataset(features, labels):
    """Wrap tokenized features and their labels in a ``tf.data.Dataset``.

    Args:
        features: Iterable of encodings, each indexable by ``'input_ids'``
            and ``'attention_mask'``.
        labels: Iterable of integer class ids, aligned with ``features``.

    Returns:
        An unbatched dataset yielding ``(inputs, label)`` pairs, where
        ``inputs`` is a dict of two variable-length int32 vectors and
        ``label`` is an int64 scalar.
    """
    def gen():
        # zip stops at the shorter iterable, so any surplus items are ignored.
        for feature, label in zip(features, labels):
            yield (
                {
                    'input_ids': feature['input_ids'],
                    'attention_mask': feature['attention_mask'],
                },
                label,
            )

    # `output_signature` replaces the deprecated output_types/output_shapes
    # pair and declares the same dtypes and shapes in one place.
    output_signature = (
        {
            'input_ids': tf.TensorSpec(shape=(None,), dtype=tf.int32),
            'attention_mask': tf.TensorSpec(shape=(None,), dtype=tf.int32),
        },
        tf.TensorSpec(shape=(), dtype=tf.int64),
    )
    return tf.data.Dataset.from_generator(gen, output_signature=output_signature)
def decode_predictions(predictions):
    """Translate predictions back into the original label names.

    Every item of ``predictions`` must be indexable by 'label', and that
    value must end in ``_<class index>``. The collected indices are decoded
    with the module-level ``label_encoder``.
    """
    class_ids = []
    for prediction in predictions:
        # The class index is whatever follows the last underscore.
        suffix = prediction['label'].split('_')[-1]
        class_ids.append(int(suffix))
    return label_encoder.inverse_transform(class_ids)
# Batches of 32; repeating the batched training set twice means the single
# Keras epoch below makes two passes over the data.
train_dataset = (
    convert_features_to_tf_dataset(train_features, train['LabelIndices'])
    .shuffle(100)
    .batch(32)
    .repeat(2)
)
test_dataset = convert_features_to_tf_dataset(test_features, test['LabelIndices']).batch(32)

# 4. Fine-tune BERT on the dataset
model_new = TFBertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(data['Labels'].unique()),
)
model_new.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0),
    # The model outputs raw logits, hence from_logits=True.
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=[tf.keras.metrics.SparseCategoricalAccuracy('accuracy')],
)
model_new.fit(train_dataset, epochs=1, validation_data=test_dataset)

# Spot-check a single sentence. The Keras model cannot consume a raw string
# and its output is not the pipeline-style list of dicts that
# decode_predictions() parses, so the text is cleaned and tokenized the same
# way as the training posts and the arg-max class id is decoded directly.
sample_encoding = tokenizer(
    clean_text("I am financially broken"),
    add_special_tokens=True,
    max_length=128,
    padding='max_length',
    truncation=True,
    return_attention_mask=True,
    return_token_type_ids=False,
    return_tensors='tf',
)
sample_logits = model_new(dict(sample_encoding), training=False).logits
sample_class_ids = tf.argmax(sample_logits, axis=-1).numpy()
print(label_encoder.inverse_transform(sample_class_ids))

# 5. Evaluate the model
loss, accuracy = model_new.evaluate(test_dataset)
print(f"Test accuracy: {accuracy}")
"""# New Section"""
# "pip install ..." is a shell command, not a Python statement, so it is kept
# here only as a setup note. Install the dependency from a shell first:
#     pip install transformers
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, TFBertForSequenceClassification
from transformers import InputExample, InputFeatures
import tensorflow as tf
# 1. Load the data
# Reads the workbook 'stress_data.xlsx' (path relative to the working directory) into `data`.
# NOTE(review): the statements that follow read the 'Posts' and 'Labels' columns — confirm the sheet provides both.
data = pd.read_excel('stress_data.xlsx')
# 2. Clean and preprocess the data
def clean_text(text):
    """Normalize a raw post before tokenization.

    Lower-cases the text, removes URLs, and replaces every run of digits or
    non-word characters with a single space.

    Args:
        text: The raw post. ``None`` and float NaN (e.g. a missing cell in a
            loaded spreadsheet) are treated as an empty post; any other
            non-string value is converted with ``str()`` first.

    Returns:
        The cleaned string. Whitespace is not collapsed or trimmed, so
        adjacent replacements can leave consecutive or trailing spaces.
    """
    # NaN is the only float that compares unequal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    text = str(text).lower()
    # Drop anything that looks like a link before stripping punctuation,
    # otherwise the URL would be broken into stray word fragments.
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    text = re.sub(r'\d+|\W+', ' ', text)
    return text
# Store a normalized copy of every post alongside the raw text.
data['Cleaned_Posts'] = data['Posts'].apply(clean_text)

# Fit the encoder on the label strings, then map each label to its integer class id.
label_encoder = LabelEncoder().fit(data['Labels'])
data['LabelIndices'] = label_encoder.transform(data['Labels'])

# 3. Load the pretrained lower-casing BERT tokenizer.
tokenizer = BertTokenizer.from_pretrained(
    "bert-base-uncased",
    do_lower_case=True,
)

# Bare names left over from notebook cell outputs; as script statements they
# only evaluate the name and display nothing.
tokenizer
data
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from transformers import BertTokenizer, TFBertForSequenceClassification
from transformers import InputExample, InputFeatures
import tensorflow as tf
# 1. Load the data
# Reads the workbook 'stress_data.xlsx' (path relative to the working directory) into `data`.
# NOTE(review): the statements that follow read the 'Posts' and 'Labels' columns — confirm the sheet provides both.
data = pd.read_excel('stress_data.xlsx')
# 2. Clean and preprocess the data
def clean_text(text):
    """Normalize a raw post before tokenization.

    Lower-cases the text, removes URLs, and replaces every run of digits or
    non-word characters with a single space.

    Args:
        text: The raw post. ``None`` and float NaN (e.g. a missing cell in a
            loaded spreadsheet) are treated as an empty post; any other
            non-string value is converted with ``str()`` first.

    Returns:
        The cleaned string. Whitespace is not collapsed or trimmed, so
        adjacent replacements can leave consecutive or trailing spaces.
    """
    # NaN is the only float that compares unequal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    text = str(text).lower()
    # Drop anything that looks like a link before stripping punctuation,
    # otherwise the URL would be broken into stray word fragments.
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    text = re.sub(r'\d+|\W+', ' ', text)
    return text
# Store a normalized copy of every post alongside the raw text.
data['Cleaned_Posts'] = data['Posts'].apply(clean_text)
# Convert string labels to integer indices
label_encoder = LabelEncoder()
data['LabelIndices'] = label_encoder.fit_transform(data['Labels'])
# Bare expression left over from a notebook cell output; it displays nothing in a script.
data["Labels"]
# Show which integer index each original label string was assigned.
le_name_mapping = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))
print(le_name_mapping)
import joblib
# Persist the fitted encoder and reload it from the SAME location. The dump
# used a relative path while the load was hard-coded to '/content/...', which
# only resolves when the working directory happens to be /content.
label_encoder_path = 'label_encoder.joblib'
joblib.dump(label_encoder, label_encoder_path)
label_encoder = joblib.load(label_encoder_path)
def decode_predictions(predictions):
    """Translate predictions back into the original label names.

    Every item of ``predictions`` must be indexable by 'label', and that
    value must end in ``_<class index>``. The collected indices are decoded
    with the module-level ``label_encoder``.
    """
    class_ids = []
    for prediction in predictions:
        # The class index is whatever follows the last underscore.
        suffix = prediction['label'].split('_')[-1]
        class_ids.append(int(suffix))
    return label_encoder.inverse_transform(class_ids)
# Run inference through the high-level text-classification pipeline helper.
from transformers import pipeline

pipe = pipeline(
    task="text-classification",
    model="NeuEraAI/Stress_Classifier_BERT",
)
# Classify one sentence and map the predicted label id back to its label name.
# The decoded result is not stored or printed.
decode_predictions(
    pipe.predict("I am in huge debts. I have taken huge loans and I can't repay.")
)
|