# ExampleHostedChatBot / ChatAssessmentAnalysis.py
# AjithKSenthil's picture
# Upload 5 files
# d2e169c verified
# ChatAssessmentAnalysis.py
# Purpose: Script for analyzing chat data using machine learning models, including training, validation, and testing.
import pandas as pd
import numpy as np
import pickle
import xgboost as xgb
from sklearn.multioutput import MultiOutputRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
# Load the chat-transcript feature table from disk.
datafile_path = "data/chat_transcripts_with_features.csv" # Update this path as necessary
df = pd.read_csv(datafile_path)


def _parse_embedding(text):
    """Turn a serialized "[v1, v2, ...]" string into a 1-D float array."""
    inner = text.strip('[]')
    return np.array([float(piece) for piece in inner.split(',')])


# Each CSV cell holds the embedding as text; replace it with a numeric vector.
df['embedding'] = df['embedding'].apply(_parse_embedding)

# Features (X): one row per transcript, stacked from the per-row vectors.
# Labels (y): the score columns - adjust the names to match your dataset.
embedding_rows = df['embedding'].tolist()
X = np.array(embedding_rows)
score_columns = ['score1', 'score2', 'score3'] # Replace with your actual score columns
y = df[score_columns].values
# Partition the data: 70% for training, then the remaining 30% is cut in half
# to give equally sized validation and test sets. Both splits share one seed
# so the partition is reproducible.
split_seed = 42
X_train, X_val_test, y_train, y_val_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=split_seed,
)
X_val, X_test, y_val, y_test = train_test_split(
    X_val_test,
    y_val_test,
    test_size=0.5,
    random_state=split_seed,
)
# Train the regression model
# Note: You can replace XGBRegressor with any other regression model as per your requirement.
# For instance, you might use RandomForestRegressor or a neural network model from Keras.
#
# Regularization is configured through the scikit-learn wrapper's documented
# keywords: reg_alpha (L1) and reg_lambda (L2). A keyword such as `lam` is not
# an XGBoost parameter, so passing it leaves L2 regularization at the library
# default instead of the intended 0.5.
xg_reg = xgb.XGBRegressor(
    objective='reg:squarederror',
    colsample_bytree=0.3,
    learning_rate=0.05,
    max_depth=4,
    reg_alpha=0,
    reg_lambda=0.5,
    n_estimators=200,
)
# XGBRegressor is wrapped so that one independent regressor is fitted per
# target column of y_train.
multioutput_reg = MultiOutputRegressor(xg_reg)
multioutput_reg.fit(X_train, y_train)
# Save the trained model
# The fitted multi-output regressor is serialized with pickle so it can be
# reloaded later for prediction. Only unpickle model files that come from a
# trusted source: loading a pickle can execute arbitrary code.
model_filename = 'trained_model.pkl'
with open(model_filename, 'wb') as file:
    # The dump must sit inside the `with` body so the handle is open while
    # writing and is closed (and flushed) as soon as the block exits.
    pickle.dump(multioutput_reg, file)
print(f"Model trained and saved as {model_filename}")
# Validate the model
# Note: MSE and MAE are reported here; pick other metrics if they fit your
# needs better (for classification tasks, for instance, precision, recall,
# F1-score, or ROC-AUC).
val_preds = multioutput_reg.predict(X_val)
# Evaluate both error metrics against the same validation predictions.
val_mse, val_mae = (
    metric(y_val, val_preds)
    for metric in (mean_squared_error, mean_absolute_error)
)
print(f"Validation MSE: {val_mse:.2f}, Validation MAE: {val_mae:.2f}")
# Test the model
# Final check on the held-out test split, which was not used for fitting.
test_preds = multioutput_reg.predict(X_test)
test_mse = mean_squared_error(y_true=y_test, y_pred=test_preds)
test_mae = mean_absolute_error(y_true=y_test, y_pred=test_preds)
print(f"Test MSE: {test_mse:.2f}, Test MAE: {test_mae:.2f}")
# Note to Users:
# - Make sure to adjust the data paths and column names to match your dataset.
# - Feel free to experiment with different machine learning models and parameters to find the best fit for your data.
# - The trained model can be used to make predictions on new chat transcript data.
# - Consider re-training the model periodically with new data to keep it updated and improve its accuracy.