import pandas as pd import openai import tiktoken import os import config from openai import OpenAI from dotenv import load_dotenv load_dotenv(override=True) client = OpenAI( api_key=os.getenv("OPENAI_API_KEY") ) # Set your OpenAI API key # Embedding model parameters embedding_model = "text-embedding-ada-002" embedding_encoding = "cl100k_base" max_tokens = 8000 # Function to get embeddings def get_embedding(text, model="text-embedding-3-small"): text = text.replace("\n", " ") return client.embeddings.create(input = [text], model=model).data[0].embedding # Load preprocessed chat transcript data input_datapath = "../data/processed_chat_data.csv" output_datapath = "../data/chat_transcripts_with_embeddings.csv" df = pd.read_csv(input_datapath) # Ensure your chat transcripts are within the token limit for embedding encoding = tiktoken.get_encoding(embedding_encoding) df["n_tokens"] = df["transcript"].apply(lambda x: len(encoding.encode(x))) df = df[df["n_tokens"] <= max_tokens] # Extract embeddings for each chat transcript print("Extracting embeddings...") df["embedding"] = df["transcript"].apply(lambda x: get_embedding(x, embedding_model)) # Save the data with embeddings df.to_csv(output_datapath, index=False) print(f"Data with embeddings saved to {output_datapath}")