AjithKSenthil commited on
Commit
5aab893
1 Parent(s): 840f6e0

added guide in comments for adding additional features

Browse files
Files changed (1) hide show
  1. ChatAttachmentAnalysis.py +24 -0
ChatAttachmentAnalysis.py CHANGED
@@ -13,6 +13,7 @@ df = pd.read_csv(datafile_path)
13
  # Convert embeddings to numpy arrays
14
  df['embedding'] = df['embedding'].apply(lambda x: [float(num) for num in x.strip('[]').split(',')])
15
 
 
16
  # Split the data into features (X) and labels (y)
17
  X = list(df.embedding.values)
18
  y = df[['avoide', 'avoida', 'avoidb', 'avoidc', 'avoidd', 'anxietye', 'anxietya', 'anxietyb', 'anxietyc', 'anxietyd']].values
@@ -49,3 +50,26 @@ print(f"Chat transcript embeddings performance: mse={mse:.2f}, mae={mae:.2f}")
49
 
50
  # Both MSE and MAE are loss functions that we want to minimize. Lower values for both indicate better model performance.
51
  # In general, the lower these values, the better the model's predictions are.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13
  # Convert embeddings to numpy arrays
14
  df['embedding'] = df['embedding'].apply(lambda x: [float(num) for num in x.strip('[]').split(',')])
15
 
16
+
17
  # Split the data into features (X) and labels (y)
18
  X = list(df.embedding.values)
19
  y = df[['avoide', 'avoida', 'avoidb', 'avoidc', 'avoidd', 'anxietye', 'anxietya', 'anxietyb', 'anxietyc', 'anxietyd']].values
 
50
 
51
  # Both MSE and MAE are loss functions that we want to minimize. Lower values for both indicate better model performance.
52
  # In general, the lower these values, the better the model's predictions are.
53
+
54
+
55
+ # Guide for adding additional features to improve performance:
56
+ # Additional Features Extraction
57
+ # To add new features to the data, you will need to create new columns in the DataFrame
58
+ # Each new feature will be a new column, which can be created by applying a function to the text data
59
+
60
+ # For example, if you were adding a feature for the length of the chat, you would do something like this:
61
+ # df['text_length'] = df['ChatTranscript'].apply(len)
62
+
63
+ # If you are using an external library to compute a feature (like NLTK for tokenization or sentiment analysis), you would need to import that library and use its functions.
64
+ # For example, to compute a sentiment score with TextBlob, you might do something like this:
65
+ # from textblob import TextBlob
66
+ # df['sentiment'] = df['ChatTranscript'].apply(lambda text: TextBlob(text).sentiment.polarity)
67
+
68
+ # Make sure to handle any potential errors or exceptions in your function.
69
+ # For example, if a chat is empty, trying to compute its length or sentiment might cause an error.
70
+
71
+ # After you've added your new features, you can include them in your model by adding them to your features array when you split the data into training and testing sets.
72
+ # For example, if 'text_length' and 'sentiment' are new features, you might do this:
118
+ # (note: 'embedding' holds a list per row, so selecting it with .values would give a ragged
+ # object array — concatenate each embedding with the scalar features instead):
119
+ # X = [emb + [length, sent] for emb, length, sent in zip(df.embedding, df.text_length, df.sentiment)]
74
+
75
+ # Always be sure to check your data after adding new features to make sure everything looks correct.