Spaces:
Sleeping
Sleeping
import pandas as pd | |
from tqdm.auto import tqdm | |
from uuid import uuid4 | |
from langchain.text_splitter import RecursiveCharacterTextSplitter | |
def _upsert_batch(index, embedder, texts, metadatas):
    """Embed ``texts`` and upsert them into ``index`` under fresh UUID4 ids."""
    ids = [str(uuid4()) for _ in texts]
    embeds = embedder.embed_documents(texts)
    # Pass a concrete list rather than a one-shot ``zip`` iterator: a list is
    # valid wherever the iterator was, and can also be measured or iterated
    # more than once by the receiving client.
    index.upsert(vectors=list(zip(ids, embeds, metadatas)))


def inject(index, embedder, data_file, *, batch_limit=100):
    """Chunk the reviews in a CSV file, embed the chunks and upsert them.

    The CSV is read with ``pandas.read_csv`` and must contain the columns
    ``Review_Url``, ``Review_Date``, ``Author``, ``Rating``, ``Review_Title``
    and ``Review``. Each review is split into chunks of at most 1000
    characters with a 100-character overlap. Every chunk is stored with a
    random UUID4 id and a metadata dict holding the chunk number (``chunk``),
    the chunk text (``text``) and the stringified review fields
    (``review-url``, ``review-date``, ``author``, ``rating``,
    ``review-title``).

    Rows whose ``Review`` value is not a string (for example an empty CSV
    cell, which pandas reads as NaN) are skipped, because the text splitter
    can only handle strings.

    Args:
        index: Vector index exposing ``upsert(vectors=...)``
            (presumably a Pinecone index -- confirm against the caller).
        embedder: Object exposing ``embed_documents(texts)`` that returns one
            embedding per text (presumably a LangChain embeddings model --
            confirm against the caller).
        data_file: Anything accepted by ``pandas.read_csv``.
        batch_limit: Pending-chunk count at which a batch is flushed. The
            check runs after each review, so a batch can exceed this value
            by the number of chunks in the last review added.

    Returns:
        None. The function works through side effects on ``index``.
    """
    data = pd.read_csv(data_file)
    # Preview of the loaded frame, kept from the original as a sanity check.
    print(data.head())

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,      # maximum chunk length, in characters
        chunk_overlap=100,    # characters shared by consecutive chunks
        length_function=len,  # measure length in characters
    )

    texts = []
    metadatas = []
    for _, record in tqdm(data.iterrows(), total=len(data)):
        review = record["Review"]
        if not isinstance(review, str):
            # Missing review text (NaN/NA): nothing to split or embed.
            continue

        metadata = {
            'review-url': str(record["Review_Url"]),
            'review-date': str(record["Review_Date"]),
            'author': str(record["Author"]),
            'rating': str(record["Rating"]),
            'review-title': str(record["Review_Title"]),
        }
        record_texts = text_splitter.split_text(review)
        texts.extend(record_texts)
        metadatas.extend(
            {"chunk": j, "text": text, **metadata}
            for j, text in enumerate(record_texts)
        )

        if len(texts) >= batch_limit:
            _upsert_batch(index, embedder, texts, metadatas)
            texts = []
            metadatas = []

    # Flush whatever is left after the last full batch.
    if texts:
        _upsert_batch(index, embedder, texts, metadatas)