Spaces:

ACRLab
/

FraleyLabAttachmentBot

Sleeping

App Files Files Community

AjithKSenthil committed on May 10, 2023

Commit

4247f5a

•

1 Parent(s): 379ec43

This will probably be the way we do it

Browse files

Files changed (2) hide show

ChatAttachmentAnalysis.py +34 -0
ObtainDataEmbedding.py +34 -0

ChatAttachmentAnalysis.py ADDED Viewed

	@@ -0,0 +1,34 @@

+import pandas as pd
+import numpy as np
+from sklearn.ensemble import RandomForestRegressor
+from sklearn.model_selection import train_test_split
+from sklearn.metrics import mean_squared_error, mean_absolute_error
+# Read your data file
+datafile_path = "data/chat_transcripts_with_embeddings.csv"
+df = pd.read_csv(datafile_path)
+# Convert embeddings to numpy arrays
+df["embedding"] = df.embedding.apply(eval).apply(np.array)
+# Split the data into features (X) and labels (y)
+X = list(df.embedding.values)
+y = df[['attachment', 'avoidance']] # Assuming your attachment scores are in these two columns
+# Split data into training and testing sets
+X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
+# Train your regression model
+rfr = RandomForestRegressor(n_estimators=100)
+rfr.fit(X_train, y_train)
+# Make predictions on the test data
+preds = rfr.predict(X_test)
+# Evaluate your model
+mse = mean_squared_error(y_test, preds)
+mae = mean_absolute_error(y_test, preds)
+print(f"Chat transcript embeddings performance: mse={mse:.2f}, mae={mae:.2f}")

ObtainDataEmbedding.py ADDED Viewed

	@@ -0,0 +1,34 @@

+# imports
+import pandas as pd
+import tiktoken
+from openai.embeddings_utils import get_embedding
+# embedding model parameters
+embedding_model = "text-embedding-ada-002"
+embedding_encoding = "cl100k_base"  # this the encoding for text-embedding-ada-002
+max_tokens = 8000  # the maximum for text-embedding-ada-002 is 8191
+# load & inspect dataset
+input_datapath = "data/chat_transcripts.csv"
+df = pd.read_csv(input_datapath, index_col=0)
+df = df[["ChatTranscript", "Attachment", "Avoidance"]]
+df = df.dropna()
+df.head(2)
+# Filter out chat transcripts that are too long to embed, estimate for the maximum number of words would be around 1638 words (8191 tokens / 5).
+encoding = tiktoken.get_encoding(embedding_encoding)
+df["n_tokens"] = df.ChatTranscript.apply(lambda x: len(encoding.encode(x)))
+df = df[df.n_tokens <= max_tokens]
+len(df)
+# Ensure you have your API key set in your environment per the README: https://github.com/openai/openai-python#usage
+# This may take a few minutes
+df["embedding"] = df.ChatTranscript.apply(lambda x: get_embedding(x, engine=embedding_model))
+df.to_csv("data/chat_transcripts_with_embeddings.csv")
+# Please replace "data/chat_transcripts.csv" with the path to your actual data file. Also, replace 'ChatTranscript', 'Attachment', 'Avoidance' with the actual column names of your chat transcripts and attachment scores in your data file.
+# Also, remember to set the API key for OpenAI in your environment before running the get_embedding function.