ExampleHostedChatBot / EmbeddingExtraction.py
AjithKSenthil's picture
Upload 5 files
d2e169c verified
import pandas as pd
import openai
import tiktoken
import os
import config
from openai import OpenAI
from dotenv import load_dotenv
# Embedding model parameters.
# embedding_model is the model name passed to get_embedding() further down;
# embedding_encoding is the tiktoken encoding used to count tokens; and
# max_tokens is the per-transcript cap applied before embedding.
embedding_model = "text-embedding-ada-002"
embedding_encoding = "cl100k_base"
max_tokens = 8000

# Read variables from a local .env file. override=True lets values from the
# .env file replace ones already present in the process environment.
load_dotenv(override=True)

# Create the OpenAI client using the key stored in OPENAI_API_KEY.
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
# Function to get embeddings
def get_embedding(text, model="text-embedding-3-small"):
    """Return the embedding for ``text`` from the OpenAI embeddings endpoint.

    Newlines in ``text`` are replaced with spaces before the request is made.

    Args:
        text: The string to embed.
        model: Name of the embedding model to request.

    Returns:
        The ``embedding`` attribute of the first item in the response data.
    """
    single_line = text.replace("\n", " ")
    response = client.embeddings.create(input=[single_line], model=model)
    first_item = response.data[0]
    return first_item.embedding
# Load preprocessed chat transcript data
input_datapath = "../data/processed_chat_data.csv"
output_datapath = "../data/chat_transcripts_with_embeddings.csv"
df = pd.read_csv(input_datapath)

# Rows with a missing transcript (read_csv turns empty cells into NaN) can
# be neither tokenized nor embedded, so drop them up front instead of
# crashing inside the tokenizer. .copy() gives an independent frame so the
# column assignment below is unambiguous.
df = df.dropna(subset=["transcript"]).copy()

# Ensure your chat transcripts are within the token limit for embedding.
# disallowed_special=() makes tiktoken count special-token text such as
# "<|endoftext|>" as ordinary text; by default encode() raises ValueError
# when a transcript happens to contain such a string.
encoding = tiktoken.get_encoding(embedding_encoding)
df["n_tokens"] = df["transcript"].apply(
    lambda x: len(encoding.encode(x, disallowed_special=()))
)
# .copy() again: the "embedding" column is added to the filtered frame, and
# assigning onto a filtered view can trigger SettingWithCopyWarning.
df = df[df["n_tokens"] <= max_tokens].copy()

# Extract embeddings for each chat transcript (one API request per row)
print("Extracting embeddings...")
df["embedding"] = df["transcript"].apply(
    lambda x: get_embedding(x, embedding_model)
)

# Save the data with embeddings
df.to_csv(output_datapath, index=False)
print(f"Data with embeddings saved to {output_datapath}")