AjithKSenthil committed on
Commit
4247f5a
1 Parent(s): 379ec43

This will probably be the way we do it

Browse files
Files changed (2) hide show
  1. ChatAttachmentAnalysis.py +34 -0
  2. ObtainDataEmbedding.py +34 -0
ChatAttachmentAnalysis.py ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Predict attachment scores from chat-transcript embeddings.

Loads the CSV written by ObtainDataEmbedding.py, fits a random-forest
regressor that maps each embedding vector to the two score columns, and
prints the mean squared / mean absolute error on a held-out 20% split.
"""
import ast

import numpy as np
import pandas as pd

# Read your data file
datafile_path = "data/chat_transcripts_with_embeddings.csv"

# Column names exactly as ObtainDataEmbedding.py writes them (capitalized).
# The earlier lowercase 'attachment' / 'avoidance' lookup raised KeyError
# against that file.
EMBEDDING_COLUMN = "embedding"
TARGET_COLUMNS = ["Attachment", "Avoidance"]


def parse_embedding(text):
    """Turn the CSV text form of one embedding into a 1-D float array.

    ``ast.literal_eval`` only accepts Python literals, so a malformed or
    malicious cell raises ``ValueError``/``SyntaxError`` instead of being
    executed, which is what ``eval`` would have done.
    """
    return np.array(ast.literal_eval(text), dtype=float)


def load_dataset(path):
    """Load features and targets from the embeddings CSV at *path*.

    Returns:
        (X, y): ``X`` is a 2-D array with one row per transcript, ``y`` is a
        DataFrame holding the ``TARGET_COLUMNS``.

    Raises:
        KeyError: if the embedding column or a target column is absent.
    """
    df = pd.read_csv(path)

    required = [EMBEDDING_COLUMN, *TARGET_COLUMNS]
    missing = [name for name in required if name not in df.columns]
    if missing:
        raise KeyError(f"{path} is missing required column(s): {missing}")

    # Stack the per-row vectors into a single (n_samples, n_dims) matrix.
    X = np.vstack(df[EMBEDDING_COLUMN].map(parse_embedding).tolist())
    y = df[TARGET_COLUMNS]
    return X, y


def main():
    """Train on 80% of the rows and print test-set error on the other 20%."""
    # Only the training step needs scikit-learn, so it is imported here;
    # the helpers above stay importable without it.
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.metrics import mean_absolute_error, mean_squared_error
    from sklearn.model_selection import train_test_split

    X, y = load_dataset(datafile_path)

    # Split data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # Seed the forest as well as the split so repeated runs print the
    # same numbers.
    rfr = RandomForestRegressor(n_estimators=100, random_state=42)
    rfr.fit(X_train, y_train)

    # Make predictions on the test data
    preds = rfr.predict(X_test)

    # With two target columns scikit-learn averages the per-column errors
    # (its default multioutput behaviour), giving one number per metric.
    mse = mean_squared_error(y_test, preds)
    mae = mean_absolute_error(y_test, preds)

    print(f"Chat transcript embeddings performance: mse={mse:.2f}, mae={mae:.2f}")


if __name__ == "__main__":
    main()
ObtainDataEmbedding.py ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Embed chat transcripts with the OpenAI API and save them with their scores.

Reads ``input_datapath``, keeps the transcript and score columns, drops
incomplete rows and transcripts longer than ``max_tokens`` tokens, requests
one embedding per remaining transcript and writes everything to
``output_datapath``.

Please replace "data/chat_transcripts.csv" with the path to your actual data
file. Also, replace 'ChatTranscript', 'Attachment', 'Avoidance' with the
actual column names of your chat transcripts and attachment scores in your
data file.

Also, remember to set the API key for OpenAI in your environment before
running this script; see the README:
https://github.com/openai/openai-python#usage
"""
# imports
import pandas as pd

# embedding model parameters
embedding_model = "text-embedding-ada-002"
embedding_encoding = "cl100k_base"  # this is the encoding for text-embedding-ada-002
max_tokens = 8000  # the maximum for text-embedding-ada-002 is 8191

# dataset locations
input_datapath = "data/chat_transcripts.csv"
output_datapath = "data/chat_transcripts_with_embeddings.csv"

# dataset columns
TEXT_COLUMN = "ChatTranscript"
SCORE_COLUMNS = ["Attachment", "Avoidance"]


def select_embeddable_rows(df, count_tokens, token_limit=max_tokens):
    """Return the complete rows whose transcript is short enough to embed.

    Args:
        df: frame containing at least ``TEXT_COLUMN`` and ``SCORE_COLUMNS``.
        count_tokens: callable mapping a transcript string to its token count.
        token_limit: largest token count that is still kept (inclusive).

    Returns:
        A new frame with the text and score columns plus an ``n_tokens``
        column; *df* itself is not modified.
    """
    # .copy() gives an independent frame, so adding a column below does not
    # write into a slice of the caller's data (pandas SettingWithCopy hazard).
    kept = df[[TEXT_COLUMN, *SCORE_COLUMNS]].dropna().copy()
    kept["n_tokens"] = kept[TEXT_COLUMN].apply(count_tokens)
    return kept[kept["n_tokens"] <= token_limit].copy()


def main():
    """Run the load -> filter -> embed -> save pipeline."""
    # Imported here so the filtering helper can be used without the
    # tokenizer or the OpenAI client installed.
    import tiktoken
    # NOTE(review): openai.embeddings_utils is only shipped with openai<1.0 —
    # confirm the pinned client version before upgrading.
    from openai.embeddings_utils import get_embedding

    # load dataset
    df = pd.read_csv(input_datapath, index_col=0)

    # Filter out chat transcripts that are too long to embed. The original
    # author's rough estimate for the limit is about 1638 words
    # (8191 tokens / 5).
    encoding = tiktoken.get_encoding(embedding_encoding)
    df = select_embeddable_rows(df, lambda text: len(encoding.encode(text)))

    # The bare `df.head(2)` / `len(df)` expressions only display something
    # in a notebook; print the count so a script run shows it too.
    print(f"{len(df)} transcripts within the {max_tokens}-token limit")

    # This may take a few minutes: one API request per transcript.
    df["embedding"] = df[TEXT_COLUMN].apply(
        lambda x: get_embedding(x, engine=embedding_model)
    )
    df.to_csv(output_datapath)


if __name__ == "__main__":
    main()