AjithKSenthil committed on
Commit
76aaad4
1 Parent(s): 16e3b84

Upload 2 files

Browse files
ChatAttachmentAnalysisWithValidation.py ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Fit a random forest that predicts ten score columns from chat embeddings.

The script reads a CSV with an ``embedding`` column (one text-encoded vector
per row) and ten ``avoid*`` / ``anxiety*`` target columns, splits the rows
70% / 15% / 15% into training, validation and test sets, fits a multi-output
``RandomForestRegressor`` and prints MSE and MAE for the validation and test
sets.
"""
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Read your data file
datafile_path = "data/chat_transcripts_with_embeddings_and_scores.csv"

df = pd.read_csv(datafile_path)

# Each cell holds an embedding serialised as text, e.g. "[0.1, -0.2, ...]".
# Parse it into a float array so the column really contains numpy arrays.
df['embedding'] = df['embedding'].apply(
    lambda x: np.array([float(num) for num in x.strip('[]').split(',')])
)

# Split the data into features (X) and labels (y).
# Stacking gives one (n_rows, n_dims) matrix and fails loudly if any row has
# an embedding of a different length.
X = np.stack(df['embedding'].tolist())
y = df[['avoide', 'avoida', 'avoidb', 'avoidc', 'avoidd', 'anxietye', 'anxietya', 'anxietyb', 'anxietyc', 'anxietyd']].values

# Split data into training, validation, and testing sets:
# 30% is held out first, then halved into validation and test (15% each).
X_train, X_val_test, y_train, y_val_test = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_val_test, y_val_test, test_size=0.5, random_state=42)

# Train the regression model. The forest is seeded like the splits so the
# metrics printed below are reproducible from run to run.
rfr = RandomForestRegressor(n_estimators=100, random_state=42)
rfr.fit(X_train, y_train)

# Score the validation set; use these numbers when tuning hyper-parameters.
# With several target columns the metrics are averaged across the columns.
val_preds = rfr.predict(X_val)
val_mse = mean_squared_error(y_val, val_preds)
val_mae = mean_absolute_error(y_val, val_preds)
print(f"Validation MSE: {val_mse:.2f}, Validation MAE: {val_mae:.2f}")

# Once tuning is finished, score the held-out test set.
test_preds = rfr.predict(X_test)

# Evaluate your model
test_mse = mean_squared_error(y_test, test_preds)
test_mae = mean_absolute_error(y_test, test_preds)
print(f"Test MSE: {test_mse:.2f}, Test MAE: {test_mae:.2f}")
ChatAttachmentAnalysisXGWithValidation.py ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Fit per-target XGBoost regressors that predict ten score columns.

The script reads a CSV with an ``embedding`` column (one text-encoded vector
per row) and ten ``avoid*`` / ``anxiety*`` target columns, splits the rows
70% / 15% / 15% into training, validation and test sets, fits one
``XGBRegressor`` per target through ``MultiOutputRegressor`` and prints MSE
and MAE for the validation and test sets.
"""
import ast

import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.multioutput import MultiOutputRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Read your data file
datafile_path = "data/chat_transcripts_with_embeddings_and_scores.csv"

df = pd.read_csv(datafile_path)

# Convert embeddings to numpy arrays.
# NOTE: this used to call eval() on the cell text, which executes whatever
# code the CSV contains. ast.literal_eval only accepts Python literals, so a
# tampered data file raises an error instead of running code.
df['embedding'] = df['embedding'].apply(lambda x: np.array(ast.literal_eval(x), dtype=float))

# Split the data into features (X) and labels (y).
# Stacking gives one (n_rows, n_dims) matrix and fails loudly if any row has
# an embedding of a different length.
X = np.stack(df['embedding'].tolist())
y = df[['avoide', 'avoida', 'avoidb', 'avoidc', 'avoidd', 'anxietye', 'anxietya', 'anxietyb', 'anxietyc', 'anxietyd']].values

# Split data into training, validation, and testing sets:
# 30% is held out first, then halved into validation and test (15% each).
X_train, X_val_test, y_train, y_val_test = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_val_test, y_val_test, test_size=0.5, random_state=42)

# Train your regression model. MultiOutputRegressor clones the base estimator
# and fits an independent copy for every target column.
xg_reg = xgb.XGBRegressor(objective ='reg:squarederror', colsample_bytree = 0.3, learning_rate = 0.1, max_depth = 5, alpha = 10, n_estimators = 10)
multioutput_reg = MultiOutputRegressor(xg_reg)
# The feature matrices are already 2-D arrays, so they are passed as-is.
multioutput_reg.fit(X_train, y_train)

# Score the validation set; use these numbers when tuning hyper-parameters.
# With several target columns the metrics are averaged across the columns.
val_preds = multioutput_reg.predict(X_val)
val_mse = mean_squared_error(y_val, val_preds)
val_mae = mean_absolute_error(y_val, val_preds)
print(f"Validation MSE: {val_mse:.2f}, Validation MAE: {val_mae:.2f}")

# Once tuning is finished, score the held-out test set.
test_preds = multioutput_reg.predict(X_test)

# Evaluate your model
test_mse = mean_squared_error(y_test, test_preds)
test_mae = mean_absolute_error(y_test, test_preds)
print(f"Test MSE: {test_mse:.2f}, Test MAE: {test_mae:.2f}")