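"""Ingest customer reviews from a CSV file into a vector index.

Each review is split into overlapping character chunks, embedded, and
upserted in batches of up to BATCH_LIMIT vectors with per-review metadata.
"""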
import pandas as pd
from tqdm.auto import tqdm
from uuid import uuid4
from langchain.text_splitter import RecursiveCharacterTextSplitter

def inject(index, embedder, data_file):
    """Read reviews from data_file, chunk and embed them, and upsert into index."""
    data = pd.read_csv(data_file)
    print(data.head())  # quick sanity check on the parsed columns

    BATCH_LIMIT = 100  # upsert at most this many vectors per request

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,      # the character length of each chunk
        chunk_overlap=100,    # the character overlap between consecutive chunks
        length_function=len,  # measure length in characters (Python's len())
    )

    texts = []      # chunk texts accumulated for the current batch
    metadatas = []  # matching metadata dicts, one per chunk

    for i in tqdm(range(len(data))):

        record = data.iloc[i]

        # per-review metadata, attached to every chunk produced from this review
        metadata = {
            'review-url': str(record["Review_Url"]),
            'review-date': str(record["Review_Date"]),
            'author': str(record["Author"]),
            'rating': str(record["Rating"]),
            'review-title': str(record["Review_Title"]),
        }

        # split the review body into overlapping chunks
        record_texts = text_splitter.split_text(record["Review"])

        # tag each chunk with its index and text alongside the review metadata
        record_metadatas = [{
            "chunk": j, "text": text, **metadata
        } for j, text in enumerate(record_texts)]
        texts.extend(record_texts)
        metadatas.extend(record_metadatas)
        # once a full batch has accumulated, embed it and upsert
        # (id, vector, metadata) tuples, then reset the accumulators
        if len(texts) >= BATCH_LIMIT:
            ids = [str(uuid4()) for _ in range(len(texts))]
            embeds = embedder.embed_documents(texts)
            index.upsert(vectors=zip(ids, embeds, metadatas))
            texts = []
            metadatas = []

    # flush whatever remains after the last full batch
    if len(texts) > 0:
        ids = [str(uuid4()) for _ in range(len(texts))]
        embeds = embedder.embed_documents(texts)
        index.upsert(vectors=zip(ids, embeds, metadatas))
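
# Usage sketch (assumptions, not part of the original file): `index` is expected
# to behave like a Pinecone index and `embedder` like a LangChain embeddings
# model; the index name and CSV path below are illustrative only.
#
#   import pinecone
#   from langchain.embeddings import OpenAIEmbeddings
#
#   pinecone.init(api_key="...", environment="...")  # legacy pinecone-client API
#   index = pinecone.Index("reviews")
#   embedder = OpenAIEmbeddings()
#   inject(index, embedder, "reviews.csv")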