import pickle

import pandas as pd
import tensorflow as tf
import tensorflow.keras.backend as K
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import (LSTM, Bidirectional, Dense, Dropout,
                                     Embedding, Input, Layer, SpatialDropout1D)
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
# Custom additive (Bahdanau-style) attention layer: scores every timestep of
# the Bi-LSTM output and returns the attention-weighted sum over time.
class Attention(Layer):
    def __init__(self, **kwargs):
        super(Attention, self).__init__(**kwargs)

    def build(self, input_shape):
        # input_shape: (batch, timesteps, features)
        self.W = self.add_weight(name='attention_weight',
                                 shape=(input_shape[-1], 1),
                                 initializer='random_normal',
                                 trainable=True)
        self.b = self.add_weight(name='attention_bias',
                                 shape=(input_shape[1], 1),
                                 initializer='zeros',
                                 trainable=True)
        super(Attention, self).build(input_shape)

    def call(self, x):
        # Unnormalized scores e: (batch, timesteps, 1); softmax over the
        # time axis yields the attention distribution a.
        e = K.tanh(K.dot(x, self.W) + self.b)
        a = K.softmax(e, axis=1)
        output = x * a
        return K.sum(output, axis=1)  # (batch, features)
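    def get_config(self):
        # No extra constructor arguments to serialize, so the base config is
        # enough. Note (an addition, not in the original script): reloading
        # the .h5 checkpoint still requires
        # tf.keras.models.load_model(..., custom_objects={'Attention': Attention}).
        return super(Attention, self).get_config()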
def train_advanced_model(file_path):
    print("Loading data for advanced model...")
    df = pd.read_csv(file_path)

    # Fill missing facts
    df['related_facts'] = df['related_facts'].fillna("No context provided.")

    # Advanced preprocessing: combine facts, question, and response into one
    # string: "[FACTS] facts [QUERY] question [RES] response".
    # Note: the default Tokenizer filters strip '[' and ']', so the markers
    # survive tokenization as the plain tokens 'facts', 'query', and 'res'.
    df['text'] = "[FACTS] " + df['related_facts'].astype(str) + \
                 " [QUERY] " + df['question'].astype(str) + \
                 " [RES] " + df['engine_response'].astype(str)

    y = df['best'].astype(int).values
    X_text = df['text'].astype(str).str.lower().values

    max_words = 15000
    max_len = 300

    tokenizer = Tokenizer(num_words=max_words, lower=True, split=' ')
    tokenizer.fit_on_texts(X_text)
    X_seq = tokenizer.texts_to_sequences(X_text)
    X_pad = pad_sequences(X_seq, maxlen=max_len)

    X_train, X_test, y_train, y_test = train_test_split(
        X_pad, y, test_size=0.15, random_state=42, stratify=y)

    # Save the tokenizer immediately so it is available as soon as the model
    # starts saving checkpoints.
    with open('tokenizer_advanced.pickle', 'wb') as handle:
        pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)
    print("Tokenizer saved.")
    # Advanced architecture: Bi-LSTM + attention
    inputs = Input(shape=(max_len,))
    embed = Embedding(max_words, 128)(inputs)
    drop1 = SpatialDropout1D(0.3)(embed)
    lstm = Bidirectional(LSTM(64, return_sequences=True))(drop1)
    attn = Attention()(lstm)
    dense1 = Dense(64, activation='relu')(attn)
    drop2 = Dropout(0.4)(dense1)
    outputs = Dense(1, activation='sigmoid')(drop2)

    model = Model(inputs=inputs, outputs=outputs)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.summary()  # summary() prints directly and returns None
    # Training with checkpointing
    batch_size = 128
    epochs = 2

    # Inverse-frequency weighting for the positive class, e.g. with 9,000
    # negatives and 1,000 positives the positive class gets weight 9.0.
    class_weight = {0: 1.0, 1: len(y[y == 0]) / len(y[y == 1])}

    checkpoint = tf.keras.callbacks.ModelCheckpoint(
        'chatbot_performance_advanced.h5',
        monitor='val_accuracy',
        save_best_only=True,
        mode='max',
        verbose=1
    )

    print("Training advanced model with Attention...")
    model.fit(
        X_train, y_train,
        epochs=epochs,
        batch_size=batch_size,
        validation_split=0.1,
        class_weight=class_weight,
        callbacks=[checkpoint],
        verbose=1
    )
    print("Training complete.")

if __name__ == "__main__":
    train_advanced_model('BP_MHS_V1.csv')
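
# Illustrative inference helper (an addition, not part of the original
# script). It assumes the checkpoint and tokenizer files written above exist
# on disk, mirrors the training-time "[FACTS] ... [QUERY] ... [RES] ..."
# template and max_len=300, and reloads the model on every call for
# simplicity.
def predict_best(facts, question, response,
                 model_path='chatbot_performance_advanced.h5',
                 tokenizer_path='tokenizer_advanced.pickle',
                 max_len=300):
    # compile=False: only the forward pass is needed, and skipping
    # compilation avoids restoring optimizer state from the checkpoint.
    model = tf.keras.models.load_model(
        model_path, custom_objects={'Attention': Attention}, compile=False)
    with open(tokenizer_path, 'rb') as handle:
        tokenizer = pickle.load(handle)
    text = f"[FACTS] {facts} [QUERY] {question} [RES] {response}".lower()
    seq = tokenizer.texts_to_sequences([text])
    pad = pad_sequences(seq, maxlen=max_len)
    return float(model.predict(pad, verbose=0)[0][0])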