abhi001vj committed on
Commit
2c560b7
1 Parent(s): c98aa7a

added packages for linux

Browse files
Files changed (2) hide show
  1. packages.txt +2 -0
  2. search.py +60 -0
packages.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ poppler-utils
2
+ xpdf
search.py ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+
3
+ import pinecone
4
# Name of the Pinecone index this script works against.
index_name = "abstractive-question-answering"

# Create the index only when it is not already listed, so re-running the
# script does not attempt to create it a second time.
# NOTE(review): dimension=768 presumably matches the retriever's embedding
# width — confirm against the model that produces the vectors.
if index_name not in pinecone.list_indexes():
    pinecone.create_index(index_name, dimension=768, metric="cosine")

# Handle used for all subsequent operations on the index.
index = pinecone.Index(index_name)
17
+
18
# NOTE(review): `tqdm`, `df` and `retriever` are used below but are not
# imported or defined in the visible part of this file — confirm they are
# provided before this code runs.

# Number of rows embedded and upserted per request.
batch_size = 64

for i in tqdm(range(0, len(df), batch_size)):
    # Clamp the end so the final (possibly shorter) batch stops at the last row.
    i_end = min(i + batch_size, len(df))
    batch = df.iloc[i:i_end]
    # Generate embeddings for the passages in this batch.
    emb = retriever.encode(batch["passage_text"].tolist()).tolist()
    # Each row, as a dict of its columns, is stored as the vector's metadata.
    meta = batch.to_dict(orient="records")
    # IDs are the row positions rendered as strings.
    ids = [str(idx) for idx in range(i, i_end)]
    # (id, vector, metadata) triples, in the order zip() pairs them.
    to_upsert = list(zip(ids, emb, meta))
    index.upsert(vectors=to_upsert)

# Print the stats: a bare expression is only displayed in a notebook, so in a
# script the check would otherwise produce no visible output.
print(index.describe_index_stats())
39
+
40
+ # from transformers import BartTokenizer, BartForConditionalGeneration
41
+
42
+ # # load bart tokenizer and model from huggingface
43
+ # tokenizer = BartTokenizer.from_pretrained('vblagoje/bart_lfqa')
44
+ # generator = BartForConditionalGeneration.from_pretrained('vblagoje/bart_lfqa')
45
+
46
+ # def query_pinecone(query, top_k):
47
+ # # generate embeddings for the query
48
+ # xq = retriever.encode([query]).tolist()
49
+ # # search pinecone index for context passage with the answer
50
+ # xc = index.query(xq, top_k=top_k, include_metadata=True)
51
+ # return xc
52
+
53
+ # def format_query(query, context):
54
+ # # extract passage_text from Pinecone search result and add the <P> tag
55
+ # context = [f"<P> {m['metadata']['passage_text']}" for m in context]
56
+ # # concatenate all context passages
57
+ # context = " ".join(context)
58
+ # # concatenate the query and context passages
59
+ # query = f"question: {query} context: {context}"
60
+ # return query