File size: 2,005 Bytes
e7ebc48
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61


import pinecone
# Name of the Pinecone index that stores the passage embeddings
index_name = "abstractive-question-answering"

# First run only: create the index when Pinecone does not list it yet.
# NOTE(review): dimension=768 presumably has to match the size of the vectors
# produced by `retriever` further down — confirm against the encoder in use.
if index_name not in pinecone.list_indexes():
    pinecone.create_index(index_name, dimension=768, metric="cosine")

# Handle used for every upsert and stats call against this index
index = pinecone.Index(index_name)

# Number of passages encoded and upserted per request
batch_size = 64

total_rows = len(df)
for start in tqdm(range(0, total_rows, batch_size)):
    # the last batch may be shorter, so clamp it to the end of the dataframe
    stop = min(start + batch_size, total_rows)
    rows = df.iloc[start:stop]
    # encode the passage text of every row in this batch
    passages = rows["passage_text"].tolist()
    vectors = retriever.encode(passages).tolist()
    # every column of a row is stored alongside its vector as metadata
    records = rows.to_dict(orient="records")
    # ids are the positional row numbers rendered as strings
    row_ids = [str(position) for position in range(start, stop)]
    # upsert (id, vector, metadata) triples into the Pinecone index
    _ = index.upsert(vectors=list(zip(row_ids, vectors, records)))

# Check that all vectors made it into the index by reporting its statistics.
# A bare expression is only echoed in an interactive session; print the result
# so the check is also visible when this file is run as a script.
print(index.describe_index_stats())

# from transformers import BartTokenizer, BartForConditionalGeneration

# # load bart tokenizer and model from huggingface
# tokenizer = BartTokenizer.from_pretrained('vblagoje/bart_lfqa')
# generator = BartForConditionalGeneration.from_pretrained('vblagoje/bart_lfqa')

# def query_pinecone(query, top_k):
#     # generate embeddings for the query
#     xq = retriever.encode([query]).tolist()
#     # search pinecone index for context passage with the answer
#     xc = index.query(xq, top_k=top_k, include_metadata=True)
#     return xc

# def format_query(query, context):
#     # extract passage_text from Pinecone search result and add the <P> tag
#     context = [f"<P> {m['metadata']['passage_text']}" for m in context]
#     # concatenate all context passages
#     context = " ".join(context)
#     # concatenate the query and context passages
#     query = f"question: {query} context: {context}"
#     return query