AjithKSenthil committed on
Commit 16e3b84
1 parent: 5aab893

Added data visualizations; don't forget to add a data folder

ChatAttachmentAnalysis.py CHANGED
@@ -1,6 +1,11 @@
 import pandas as pd
 import numpy as np
 
+# for data visualization:
+import matplotlib.pyplot as plt
+import seaborn as sns
+
+# for regression:
 from sklearn.ensemble import RandomForestRegressor
 from sklearn.model_selection import train_test_split
 from sklearn.metrics import mean_squared_error, mean_absolute_error
@@ -73,3 +78,32 @@ print(f"Chat transcript embeddings performance: mse={mse:.2f}, mae={mae:.2f}")
 # X = df[['embedding', 'text_length', 'sentiment']].values
 
 # Always be sure to check your data after adding new features to make sure everything looks correct.
+
+
+column_names = ['avoide', 'avoida', 'avoidb', 'avoidc', 'avoidd', 'anxietye', 'anxietya', 'anxietyb', 'anxietyc', 'anxietyd']
+
+# Create a DataFrame for the predictions
+preds_df = pd.DataFrame(preds, columns=column_names)
+
+# Create a DataFrame for the actual values
+y_test_df = pd.DataFrame(y_test, columns=column_names)
+
+# Create a 2x5 subplot grid
+fig, axes = plt.subplots(2, 5, figsize=(20, 10))
+
+# Loop over each column
+for idx, col in enumerate(column_names):
+    # Plot the actual values
+    sns.histplot(y_test_df[col], bins=10, ax=axes[idx//5, idx%5], color='blue', kde=True, label='actual')
+
+    # Overlay the predicted values on the same axes
+    sns.histplot(preds_df[col], bins=10, ax=axes[idx//5, idx%5], color='red', kde=True, label='predicted')
+
+    # Set the title of the subplot
+    axes[idx//5, idx%5].set_title(f"{col} - actual vs predicted")
+
+    # Add a legend to this subplot
+    axes[idx//5, idx%5].legend()
+
+# Show the plot
+plt.show()
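
The plotting block above assumes `preds` and `y_test` already exist from the training code earlier in the script, which this diff does not show. A minimal self-contained sketch of that upstream step, with randomly generated stand-in data in place of the real embeddings and attachment scores (shapes and hyperparameters here are assumptions, not taken from the repo):

import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

# Hypothetical stand-in data: 200 samples, 1536-dim embeddings (ada-002 size),
# 10 attachment-style scores per sample
rng = np.random.default_rng(0)
X = rng.normal(size=(200, 1536))
y = rng.uniform(1, 7, size=(200, 10))

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# RandomForestRegressor handles multi-output targets natively
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)
preds = model.predict(X_test)  # feeds preds_df and y_test_df in the plotting code above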
ChatAttachmentAnalysisWithXG.py CHANGED
@@ -2,6 +2,12 @@ import pandas as pd
 import numpy as np
 import xgboost as xgb
 
+# for data visualization:
+import matplotlib.pyplot as plt
+import seaborn as sns
+
+
+# for regression:
 from sklearn.multioutput import MultiOutputRegressor
 from sklearn.model_selection import train_test_split
 from sklearn.metrics import mean_squared_error, mean_absolute_error
@@ -33,4 +39,33 @@ print(f"ada-002 embedding performance on chat transcripts: mse={mse:.2f}, mae={m
 
 # MSE is calculated by taking the average of the squared differences between the predicted and actual values. It gives more weight to larger errors because they are squared in the calculation. This means that a model could have a relatively high MSE due to a few large errors, even if it made smaller errors on a majority of the instances.
 
-# MAE, on the other hand, is calculated by taking the average of the absolute differences between the predicted and actual values. This metric gives equal weight to all errors and is less sensitive to outliers than MSE.
+# MAE, on the other hand, is calculated by taking the average of the absolute differences between the predicted and actual values. This metric gives equal weight to all errors and is less sensitive to outliers than MSE.
+
+
+column_names = ['avoide', 'avoida', 'avoidb', 'avoidc', 'avoidd', 'anxietye', 'anxietya', 'anxietyb', 'anxietyc', 'anxietyd']
+
+# Create a DataFrame for the predictions
+preds_df = pd.DataFrame(preds, columns=column_names)
+
+# Create a DataFrame for the actual values
+y_test_df = pd.DataFrame(y_test, columns=column_names)
+
+# Create a 2x5 subplot grid
+fig, axes = plt.subplots(2, 5, figsize=(20, 10))
+
+# Loop over each column
+for idx, col in enumerate(column_names):
+    # Plot the actual values
+    sns.histplot(y_test_df[col], bins=10, ax=axes[idx//5, idx%5], color='blue', kde=True, label='actual')
+
+    # Overlay the predicted values on the same axes
+    sns.histplot(preds_df[col], bins=10, ax=axes[idx//5, idx%5], color='red', kde=True, label='predicted')
+
+    # Set the title of the subplot
+    axes[idx//5, idx%5].set_title(f"{col} - actual vs predicted")
+
+    # Add a legend to this subplot
+    axes[idx//5, idx%5].legend()
+
+# Show the plot
+plt.show()
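
To make the MSE/MAE contrast in the comments above concrete, a toy computation with invented numbers (not from this project): one error of 4.0 among four near-zero errors pushes MSE to 3.21 while MAE stays at 0.86.

import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error

actual    = np.array([3.0, 4.0, 5.0, 4.0, 3.0])
predicted = np.array([3.1, 3.9, 5.1, 4.0, 7.0])  # last prediction is off by 4.0

# The single large error contributes 16 of the 16.03 total squared error
print(mean_squared_error(actual, predicted))   # 3.21
print(mean_absolute_error(actual, predicted))  # 0.86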
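Likewise, this script's plotting block relies on `preds` and `y_test` from training code outside the diff. A minimal sketch of how the XGBoost variant presumably produces them, given the imports added above; the stand-in data and hyperparameters are assumptions, and MultiOutputRegressor fits one clone of the base estimator per target column:

import numpy as np
import xgboost as xgb
from sklearn.multioutput import MultiOutputRegressor
from sklearn.model_selection import train_test_split

# Hypothetical stand-in data, same shapes as the RandomForest sketch above
rng = np.random.default_rng(0)
X = rng.normal(size=(200, 1536))
y = rng.uniform(1, 7, size=(200, 10))

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Wrap the single-output XGBRegressor so each of the 10 scores gets its own booster
model = MultiOutputRegressor(xgb.XGBRegressor(n_estimators=100, random_state=42))
model.fit(X_train, y_train)
preds = model.predict(X_test)  # shape (40, 10), feeds the plotting code above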