AjithKSenthil committed
Commit 8b16ee5
1 Parent(s): c77f143

modified it to use our data now

ChatAttachmentAnalysis.py CHANGED
@@ -6,16 +6,16 @@ from sklearn.model_selection import train_test_split
 from sklearn.metrics import mean_squared_error, mean_absolute_error
 
 # Read your data file
-datafile_path = "data/chat_transcripts_with_embeddings.csv"
+datafile_path = "data/chat_transcripts_with_embeddings_and_scores.csv"
 
 df = pd.read_csv(datafile_path)
 
 # Convert embeddings to numpy arrays
-df["embedding"] = df.embedding.apply(eval).apply(np.array)
+df['embedding'] = df['embedding'].apply(lambda x: [float(num) for num in x.strip('[]').split(',')])
 
 # Split the data into features (X) and labels (y)
 X = list(df.embedding.values)
-y = df[['attachment', 'avoidance']]  # Assuming your attachment scores are in these two columns
+y = df[['avoide', 'avoida', 'avoidb', 'avoidc', 'avoidd', 'anxietye', 'anxietya', 'anxietyb', 'anxietyc', 'anxietyd']]  # select the score columns themselves, not just their names
 
 # Split data into training and testing sets
 X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
@@ -32,3 +32,5 @@ mse = mean_squared_error(y_test, preds)
 mae = mean_absolute_error(y_test, preds)
 
 print(f"Chat transcript embeddings performance: mse={mse:.2f}, mae={mae:.2f}")
+
+
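For reference, a minimal sketch of what the new parsing line does (not part of the commit; the sample vector is made up). Writing a Python list to CSV stores it as a bracketed string, and the lambda turns that string back into floats:

# Hypothetical example: round-trip of one stored embedding cell.
cell = "[0.1, -0.2, 0.3]"  # how pandas writes a list column to CSV
parsed = [float(num) for num in cell.strip('[]').split(',')]
print(parsed)  # [0.1, -0.2, 0.3]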
ChatAttachmentAnalysisWithXG.py CHANGED
@@ -1,27 +1,36 @@
 import pandas as pd
 import numpy as np
-from sklearn.multioutput import MultiOutputRegressor
 import xgboost as xgb
+
+from sklearn.multioutput import MultiOutputRegressor
 from sklearn.model_selection import train_test_split
 from sklearn.metrics import mean_squared_error, mean_absolute_error
 
-datafile_path = "data/chat_transcripts_with_embeddings.csv"
+datafile_path = "data/chat_transcripts_with_embeddings_and_scores.csv"
 
 df = pd.read_csv(datafile_path)
-df["embedding"] = df.embedding.apply(eval).apply(np.array)
+df['embedding'] = df['embedding'].apply(lambda x: [float(num) for num in x.strip('[]').split(',')])
 
-X = np.array(df.embedding.tolist())
-y = df[["Attachment", "Avoidance"]]
+y_columns = ['avoide', 'avoida', 'avoidb', 'avoidc', 'avoidd', 'anxietye', 'anxietya', 'anxietyb', 'anxietyc', 'anxietyd']
+y = df[y_columns].values
 
-X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
+X_train, X_test, y_train, y_test = train_test_split(list(df.embedding.values), y, test_size=0.2, random_state=42)
 
 xg_reg = xgb.XGBRegressor(objective='reg:squarederror', colsample_bytree=0.3, learning_rate=0.1, max_depth=5, alpha=10, n_estimators=10)
 
-multioutputregressor = MultiOutputRegressor(xg_reg).fit(X_train, y_train)
+multioutput_reg = MultiOutputRegressor(xg_reg)
+
+multioutput_reg.fit(np.array(X_train).tolist(), y_train)
 
-preds = multioutputregressor.predict(X_test)
+preds = multioutput_reg.predict(np.array(X_test).tolist())
 
 mse = mean_squared_error(y_test, preds)
 mae = mean_absolute_error(y_test, preds)
 
 print(f"ada-002 embedding performance on chat transcripts: mse={mse:.2f}, mae={mae:.2f}")
+
+# The mean squared error (MSE) and mean absolute error (MAE) are both metrics for assessing the performance of our regression model.
+
+# MSE is the average of the squared differences between the predicted and actual values. It gives more weight to larger errors because they are squared, so a model can have a relatively high MSE from a few large errors even if most of its errors were small.
+
+# MAE is the average of the absolute differences between the predicted and actual values. It gives equal weight to all errors and is less sensitive to outliers than MSE.
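To make those closing comments concrete, here is a small illustrative calculation (not from the repo) showing how a single large error dominates MSE while MAE weights all errors equally:

import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error

y_true = np.array([1.0, 2.0, 3.0, 4.0])
y_pred = np.array([1.0, 2.0, 3.0, 8.0])    # three exact predictions, one miss of 4.0
print(mean_squared_error(y_true, y_pred))   # (0 + 0 + 0 + 16) / 4 = 4.0
print(mean_absolute_error(y_true, y_pred))  # (0 + 0 + 0 + 4) / 4  = 1.0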
ObtainDataEmbedding.py CHANGED
@@ -1,7 +1,13 @@
+# ObtainDataEmbedding.py
 # imports
+import openai
 import pandas as pd
+
 import tiktoken
 from openai.embeddings_utils import get_embedding
+import config
+# set your API key
+openai.api_key = "your openai api key"
 
 # embedding model parameters
 embedding_model = "text-embedding-ada-002"
@@ -9,24 +15,24 @@ embedding_encoding = "cl100k_base"  # this is the encoding for text-embedding-ada-0
 max_tokens = 8000  # the maximum for text-embedding-ada-002 is 8191
 
 # load & inspect dataset
-input_datapath = "data/chat_transcripts.csv"
+input_datapath = "data/attachmentchatdata-formated.csv"
 df = pd.read_csv(input_datapath, index_col=0)
-df = df[["ChatTranscript", "Attachment", "Avoidance"]]
+df = df[["userid", "chathistory", "avoide", "avoida", "avoidb", "avoidc", "avoidd", "anxietye", "anxietya", "anxietyb", "anxietyc", "anxietyd"]]
 df = df.dropna()
 df.head(2)
 
 # Filter out chat transcripts that are too long to embed; a rough estimate for the maximum is around 1638 words (8191 tokens / 5).
 encoding = tiktoken.get_encoding(embedding_encoding)
 
-df["n_tokens"] = df.ChatTranscript.apply(lambda x: len(encoding.encode(x)))
+df["n_tokens"] = df.chathistory.apply(lambda x: len(encoding.encode(x)))
 df = df[df.n_tokens <= max_tokens]
 len(df)
 
 # Ensure you have your API key set in your environment per the README: https://github.com/openai/openai-python#usage
 
 # This may take a few minutes
-df["embedding"] = df.ChatTranscript.apply(lambda x: get_embedding(x, engine=embedding_model))
-df.to_csv("data/chat_transcripts_with_embeddings.csv")
+df["embedding"] = df.chathistory.apply(lambda x: get_embedding(x, engine=embedding_model))
+df.to_csv("data/chat_transcripts_with_embeddings_and_scores.csv")
 
 
 # Please replace "data/chat_transcripts.csv" with the path to your actual data file. Also, replace 'ChatTranscript', 'Attachment', 'Avoidance' with the actual column names of your chat transcripts and attachment scores in your data file.
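Taken together, the three scripts form a pipeline: ObtainDataEmbedding.py writes data/chat_transcripts_with_embeddings_and_scores.csv, and the two analysis scripts read it back. A quick sanity check of that hand-off, sketched here as a suggestion (the assert is not in the repo), is to confirm every stored embedding parses back to the same dimension:

import pandas as pd

df = pd.read_csv("data/chat_transcripts_with_embeddings_and_scores.csv")
df["embedding"] = df["embedding"].apply(lambda x: [float(num) for num in x.strip('[]').split(',')])
# Every row should parse to the same length; text-embedding-ada-002 vectors have 1536 dimensions.
assert df["embedding"].apply(len).nunique() == 1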