AjithKSenthil committed on
Commit 16e3b84
1 parent: 5aab893

Added data visualizations; don't forget to add a data folder

ChatAttachmentAnalysis.py CHANGED
@@ -1,6 +1,11 @@
 import pandas as pd
 import numpy as np
 
+# for data visualization:
+import matplotlib.pyplot as plt
+import seaborn as sns
+
+# for regression:
 from sklearn.ensemble import RandomForestRegressor
 from sklearn.model_selection import train_test_split
 from sklearn.metrics import mean_squared_error, mean_absolute_error
@@ -73,3 +78,32 @@ print(f"Chat transcript embeddings performance: mse={mse:.2f}, mae={mae:.2f}")
 # X = df[['embedding', 'text_length', 'sentiment']].values
 
 # Always be sure to check your data after adding new features to make sure everything looks correct.
+
+
+column_names = ['avoide', 'avoida', 'avoidb', 'avoidc', 'avoidd', 'anxietye', 'anxietya', 'anxietyb', 'anxietyc', 'anxietyd']
+
+# Create a DataFrame for the predictions
+preds_df = pd.DataFrame(preds, columns=column_names)
+
+# Create a DataFrame for the actual values
+y_test_df = pd.DataFrame(y_test, columns=column_names)
+
+# Create a 2x5 subplot grid
+fig, axes = plt.subplots(2, 5, figsize=(20, 10))
+
+# Loop over each column
+for idx, col in enumerate(column_names):
+    # Plot the actual values
+    sns.histplot(y_test_df[col], bins=10, ax=axes[idx//5, idx%5], color='blue', kde=True, label='actual')
+
+    # Overlay the predicted values on the same axes
+    sns.histplot(preds_df[col], bins=10, ax=axes[idx//5, idx%5], color='red', kde=True, label='predicted')
+
+    # Set the title of the subplot
+    axes[idx//5, idx%5].set_title(f"{col} - actual vs predicted")
+
+    # Add a legend to this subplot
+    axes[idx//5, idx%5].legend()
+
+# Show the plot
+plt.show()
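
The plotting block above assumes `preds` and `y_test` already exist from the training code earlier in the script, which this diff does not show. A minimal self-contained sketch of that upstream step, with randomly generated stand-in data in place of the real embeddings and attachment scores (shapes and hyperparameters here are assumptions, not taken from the repo):

import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

# Hypothetical stand-in data: 200 samples, 1536-dim embeddings (ada-002 size),
# 10 attachment-style scores per sample
rng = np.random.default_rng(0)
X = rng.normal(size=(200, 1536))
y = rng.uniform(1, 7, size=(200, 10))

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# RandomForestRegressor handles multi-output targets natively
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)
preds = model.predict(X_test)  # feeds preds_df and y_test_df in the plotting code above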
ChatAttachmentAnalysisWithXG.py CHANGED
@@ -2,6 +2,12 @@ import pandas as pd
 import numpy as np
 import xgboost as xgb
 
+# for data visualization:
+import matplotlib.pyplot as plt
+import seaborn as sns
+
+
+# for regression:
 from sklearn.multioutput import MultiOutputRegressor
 from sklearn.model_selection import train_test_split
 from sklearn.metrics import mean_squared_error, mean_absolute_error
@@ -33,4 +39,33 @@ print(f"ada-002 embedding performance on chat transcripts: mse={mse:.2f}, mae={m
 
 # MSE is calculated by taking the average of the squared differences between the predicted and actual values. It gives more weight to larger errors because they are squared in the calculation. This means that a model could have a relatively high MSE due to a few large errors, even if it made smaller errors on a majority of the instances.
 
-# MAE, on the other hand, is calculated by taking the average of the absolute differences between the predicted and actual values. This metric gives equal weight to all errors and is less sensitive to outliers than MSE.
+# MAE, on the other hand, is calculated by taking the average of the absolute differences between the predicted and actual values. This metric gives equal weight to all errors and is less sensitive to outliers than MSE.
+
+
+column_names = ['avoide', 'avoida', 'avoidb', 'avoidc', 'avoidd', 'anxietye', 'anxietya', 'anxietyb', 'anxietyc', 'anxietyd']
+
+# Create a DataFrame for the predictions
+preds_df = pd.DataFrame(preds, columns=column_names)
+
+# Create a DataFrame for the actual values
+y_test_df = pd.DataFrame(y_test, columns=column_names)
+
+# Create a 2x5 subplot grid
+fig, axes = plt.subplots(2, 5, figsize=(20, 10))
+
+# Loop over each column
+for idx, col in enumerate(column_names):
+    # Plot the actual values
+    sns.histplot(y_test_df[col], bins=10, ax=axes[idx//5, idx%5], color='blue', kde=True, label='actual')
+
+    # Overlay the predicted values on the same axes
+    sns.histplot(preds_df[col], bins=10, ax=axes[idx//5, idx%5], color='red', kde=True, label='predicted')
+
+    # Set the title of the subplot
+    axes[idx//5, idx%5].set_title(f"{col} - actual vs predicted")
+
+    # Add a legend to this subplot
+    axes[idx//5, idx%5].legend()
+
+# Show the plot
+plt.show()
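
To make the MSE/MAE contrast in the comments above concrete, a toy computation with invented numbers (not from this project): one error of 4.0 among four near-zero errors pushes MSE to 3.21 while MAE stays at 0.86.

import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error

actual    = np.array([3.0, 4.0, 5.0, 4.0, 3.0])
predicted = np.array([3.1, 3.9, 5.1, 4.0, 7.0])  # last prediction is off by 4.0

# The single large error contributes 16 of the 16.03 total squared error
print(mean_squared_error(actual, predicted))   # 3.21
print(mean_absolute_error(actual, predicted))  # 0.86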
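Likewise, this script's plotting block relies on `preds` and `y_test` from training code outside the diff. A minimal sketch of how the XGBoost variant presumably produces them, given the imports added above; the stand-in data and hyperparameters are assumptions, and MultiOutputRegressor fits one clone of the base estimator per target column:

import numpy as np
import xgboost as xgb
from sklearn.multioutput import MultiOutputRegressor
from sklearn.model_selection import train_test_split

# Hypothetical stand-in data, same shapes as the RandomForest sketch above
rng = np.random.default_rng(0)
X = rng.normal(size=(200, 1536))
y = rng.uniform(1, 7, size=(200, 10))

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Wrap the single-output XGBRegressor so each of the 10 scores gets its own booster
model = MultiOutputRegressor(xgb.XGBRegressor(n_estimators=100, random_state=42))
model.fit(X_train, y_train)
preds = model.predict(X_test)  # shape (40, 10), feeds the plotting code above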