Spaces:
Sleeping
Sleeping
Commit
•
4247f5a
1
Parent(s):
379ec43
This will probably be the way we do it
Browse files- ChatAttachmentAnalysis.py +34 -0
- ObtainDataEmbedding.py +34 -0
ChatAttachmentAnalysis.py
ADDED
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""Train and evaluate a regressor that predicts attachment scores from chat
transcript embeddings.

Reads the CSV produced by ObtainDataEmbedding.py, fits a random forest on the
embedding vectors to predict the two score columns, and prints the mean squared
error and mean absolute error on a held-out 20% test split.
"""
import ast

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split

# Read the data file written by ObtainDataEmbedding.py
datafile_path = "data/chat_transcripts_with_embeddings.csv"

df = pd.read_csv(datafile_path)

# Convert embeddings to numpy arrays. Each CSV cell holds the text of a Python
# list; ast.literal_eval parses literals only, so (unlike eval) a malformed or
# malicious cell cannot execute code.
df["embedding"] = df.embedding.apply(ast.literal_eval).apply(np.array)

# Split the data into features (X) and labels (y).
# The label column names must match the ones ObtainDataEmbedding.py keeps
# ("Attachment" and "Avoidance"); the lowercase names used previously are not
# present in that CSV and raised a KeyError.
X = list(df.embedding.values)
y = df[["Attachment", "Avoidance"]]

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Train the regression model. The forest is seeded like the split so that the
# reported metrics are reproducible from run to run.
rfr = RandomForestRegressor(n_estimators=100, random_state=42)
rfr.fit(X_train, y_train)

# Make predictions on the test data
preds = rfr.predict(X_test)

# Evaluate the model (errors are computed over both score columns)
mse = mean_squared_error(y_test, preds)
mae = mean_absolute_error(y_test, preds)

print(f"Chat transcript embeddings performance: mse={mse:.2f}, mae={mae:.2f}")
|
ObtainDataEmbedding.py
ADDED
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""Compute an embedding for every chat transcript and save the result as CSV.

Loads the transcripts with their Attachment/Avoidance scores, drops rows with
missing values and transcripts longer than ``max_tokens`` tokens, requests an
embedding for each remaining transcript, and writes everything to
data/chat_transcripts_with_embeddings.csv.
"""
# imports
import pandas as pd
import tiktoken
# NOTE(review): openai.embeddings_utils appears to exist only in openai-python
# releases before 1.0 - confirm the pinned version before running.
from openai.embeddings_utils import get_embedding

# embedding model parameters
embedding_model = "text-embedding-ada-002"
embedding_encoding = "cl100k_base"  # this is the encoding for text-embedding-ada-002
max_tokens = 8000  # the maximum for text-embedding-ada-002 is 8191

# load & inspect dataset
input_datapath = "data/chat_transcripts.csv"
df = pd.read_csv(input_datapath, index_col=0)
df = df[["ChatTranscript", "Attachment", "Avoidance"]]
df = df.dropna()
# A bare `df.head(2)` only displays output in a notebook; print it so the
# inspection also works when this file is run as a script.
print(df.head(2))

# Filter out chat transcripts that are too long to embed. As a rough,
# conservative rule of thumb, 8191 tokens / 5 is about 1638 words; the filter
# itself uses the exact token count computed below.
encoding = tiktoken.get_encoding(embedding_encoding)

df["n_tokens"] = df.ChatTranscript.apply(lambda x: len(encoding.encode(x)))
# .copy() makes the filtered frame independent, so adding the "embedding"
# column below does not trigger pandas' SettingWithCopyWarning.
df = df[df.n_tokens <= max_tokens].copy()
print(len(df))

# Ensure you have your API key set in your environment per the README: https://github.com/openai/openai-python#usage

# This may take a few minutes
df["embedding"] = df.ChatTranscript.apply(lambda x: get_embedding(x, engine=embedding_model))
df.to_csv("data/chat_transcripts_with_embeddings.csv")


# Replace "data/chat_transcripts.csv" with the path to your actual data file, and replace 'ChatTranscript', 'Attachment', 'Avoidance' with the actual column names of the chat transcripts and attachment scores in that file.

# Also remember to set the OpenAI API key in your environment before running the get_embedding call.
|