import pinecone

# NOTE: `tqdm`, `df` and `retriever` are used below but are not defined in
# this section; they are expected to be provided elsewhere in the file.

index_name = "abstractive-question-answering"

# Create the index only when it is missing, so that re-running this script
# does not attempt to create an index that already exists.
if index_name not in pinecone.list_indexes():
    pinecone.create_index(
        index_name,
        # presumably the embedding size produced by `retriever` -- TODO confirm
        dimension=768,
        metric="cosine",
    )

# connect to the abstractive-question-answering index
index = pinecone.Index(index_name)

# embed and upsert the passages in batches of 64
batch_size = 64

for i in tqdm(range(0, len(df), batch_size)):
    # find end of batch (the final batch may be shorter than batch_size)
    i_end = min(i + batch_size, len(df))
    # extract batch
    batch = df.iloc[i:i_end]
    # generate embeddings for batch
    emb = retriever.encode(batch["passage_text"].tolist()).tolist()
    # get metadata: one dict per row of the batch
    meta = batch.to_dict(orient="records")
    # create unique IDs from the positional row numbers, as strings
    ids = [str(idx) for idx in range(i, i_end)]
    # add all to upsert list as (id, embedding, metadata) tuples
    to_upsert = list(zip(ids, emb, meta))
    # upsert/insert these records to pinecone
    index.upsert(vectors=to_upsert)

# check that we have all vectors in index
# (a bare expression: its value is only displayed in an interactive session)
index.describe_index_stats()

# --- Disabled generator / query helpers, kept for reference ---------------
# from transformers import BartTokenizer, BartForConditionalGeneration
#
# # load bart tokenizer and model from huggingface
# tokenizer = BartTokenizer.from_pretrained('vblagoje/bart_lfqa')
# generator = BartForConditionalGeneration.from_pretrained('vblagoje/bart_lfqa')
#
# def query_pinecone(query, top_k):
#     # generate embeddings for the query
#     xq = retriever.encode([query]).tolist()
#     # search pinecone index for context passage with the answer
#     xc = index.query(xq, top_k=top_k, include_metadata=True)
#     return xc
#
# def format_query(query, context):
#     # extract passage_text from Pinecone search result and add the tag
#     # NOTE(review): the f-string below only prepends a space; a markup tag
#     # may have been lost from it -- confirm before re-enabling this code.
#     context = [f" {m['metadata']['passage_text']}" for m in context]
#     # concatenate all context passages
#     context = " ".join(context)
#     # concatenate the query and context passages
#     query = f"question: {query} context: {context}"
#     return query