import openai
import pandas as pd
from deeplake.core.vectorstore import VectorStore

from utils import zip_contents


def embedding_function(texts, model="text-embedding-ada-002"):
    """Embed one or more texts with the OpenAI embeddings API.

    Args:
        texts: A single string or a list of strings to embed.
        model: OpenAI embedding model name.

    Returns:
        A list of embedding vectors, one per input text.
    """
    if isinstance(texts, str):
        texts = [texts]
    # Strip embedded newlines before sending to the API.
    texts = [t.replace("\n", " ") for t in texts]
    return [
        data["embedding"]
        for data in openai.Embedding.create(input=texts, model=model)["data"]
    ]


def extract_metadata(df: pd.DataFrame) -> list:
    """Extract per-row metadata from the dataframe in deeplake dict format.

    Args:
        df: DataFrame with at least 'url', 'source', and 'title' columns.

    Returns:
        A list of metadata dicts, one per dataframe row.
        (Fix: annotated as `list` — the original said `dict`, but
        `.to_list()` returns a list.)
    """
    return df.apply(
        lambda x: {
            "url": x.url,
            "source": x.source,
            "title": x.title,
        },
        axis=1,
    ).to_list()


if __name__ == "__main__":
    vector_store_path = "deeplake_store"
    chunk_file = "data/chunks_preprocessed.csv"
    overwrite = True

    df = pd.read_csv(chunk_file)

    # Validate the expected schema up front. `assert` is stripped under
    # `python -O`, so raise explicitly and report every missing column.
    missing = [c for c in ("url", "source", "title", "content") if c not in df.columns]
    if missing:
        raise ValueError(f"{chunk_file} is missing required columns: {missing}")

    # extract the text + metadata
    metadata = extract_metadata(df)
    chunked_text = df.content.to_list()

    # init the vector store
    # Fix: honor the `overwrite` flag instead of hard-coding True.
    vector_store = VectorStore(
        path=vector_store_path,
        overwrite=overwrite,
    )

    # add the embeddings (VectorStore calls `embedding_function` on
    # `embedding_data` to produce the vectors it stores)
    vector_store.add(
        text=chunked_text,
        embedding_function=embedding_function,
        embedding_data=chunked_text,
        metadata=metadata,
    )

    # save the deeplake folder to a zip file
    zipped_file_path = zip_contents(input_path=vector_store_path, output_path=".")
    print(f"Contents zipped to: {zipped_file_path}")