import pandas as pd
from tqdm.auto import tqdm
from uuid import uuid4
from langchain.text_splitter import RecursiveCharacterTextSplitter


def inject(index, embedder, data_file):
    data = pd.read_csv(data_file)
    print(data.head())

    BATCH_LIMIT = 100

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,      # the character length of each chunk
        chunk_overlap=100,    # the character overlap between consecutive chunks
        length_function=len,  # measure length in characters (the built-in len())
    )

    texts = []
    metadatas = []

    for i in tqdm(range(len(data))):
        record = data.iloc[i]

        # Metadata shared by every chunk produced from this review.
        metadata = {
            'review-url': str(record["Review_Url"]),
            'review-date': str(record["Review_Date"]),
            'author': str(record["Author"]),
            'rating': str(record["Rating"]),
            'review-title': str(record["Review_Title"]),
        }

        # Split the review into chunks and attach per-chunk metadata,
        # including the chunk index and the chunk's own text.
        record_texts = text_splitter.split_text(record["Review"])
        record_metadatas = [
            {"chunk": j, "text": text, **metadata}
            for j, text in enumerate(record_texts)
        ]
        texts.extend(record_texts)
        metadatas.extend(record_metadatas)

        # Once enough chunks have accumulated, embed and upsert them as a batch.
        if len(texts) >= BATCH_LIMIT:
            ids = [str(uuid4()) for _ in range(len(texts))]
            embeds = embedder.embed_documents(texts)
            index.upsert(vectors=list(zip(ids, embeds, metadatas)))
            texts = []
            metadatas = []

    # Flush any remaining chunks that didn't fill a full batch.
    if len(texts) > 0:
        ids = [str(uuid4()) for _ in range(len(texts))]
        embeds = embedder.embed_documents(texts)
        index.upsert(vectors=list(zip(ids, embeds, metadatas)))
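
# --- Usage sketch (an assumption, not part of the original) ---
# A minimal example of calling inject(), assuming the pinecone-client v2 API
# (pinecone.init / pinecone.Index) and langchain's OpenAIEmbeddings as the
# embedder. The index name, environment, and CSV path below are hypothetical
# placeholders; substitute your own.
#
# import os
# import pinecone
# from langchain.embeddings import OpenAIEmbeddings
#
# pinecone.init(
#     api_key=os.environ["PINECONE_API_KEY"],
#     environment="us-east-1-aws",          # hypothetical environment
# )
# index = pinecone.Index("movie-reviews")   # hypothetical index name
# embedder = OpenAIEmbeddings()             # reads OPENAI_API_KEY from the environment
# inject(index, embedder, "reviews.csv")    # hypothetical CSV path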