xpsychted committed on
Commit
5758a81
1 Parent(s): b1c3977

Update pinecone_integration.py

Browse files
Files changed (1) hide show
  1. pinecone_integration.py +115 -0
pinecone_integration.py CHANGED
@@ -0,0 +1,115 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import os
3
+ from tqdm.auto import tqdm
4
+ import pinecone
5
+ from sentence_transformers import SentenceTransformer
6
+ import torch
7
+
8
class PineconeIndex:
    """Semantic search over disease Q&A text using Pinecone and a SentenceTransformer.

    The encoder is loaded eagerly in ``__init__``; the Pinecone connection is
    opened lazily by ``init_pinecone`` (directly, or via ``build_index`` /
    ``search``).

    Attributes:
        sm: the SentenceTransformer used to embed both contexts and queries.
        index_name: name of the Pinecone index to create or connect to.
        index: the connected Pinecone index handle, or ``None`` until
            ``init_pinecone`` has run.
    """

    # Question template for each spreadsheet column; ``{}`` receives the
    # disease name so every indexed passage names the disease it is about.
    QUESTION_TEMPLATES = {
        'About': 'What is {}?',
        'Symptoms': 'What are symptoms of {}?',
        'Causes': 'What are causes of {}?',
        'Diagnosis': 'What are diagnosis for {}?',
        'Risk Factors': 'What are the risk factors for {}?',
        'Treatment Options': 'What are the treatment options for {}?',
        'Prognosis and Complications': 'What are the prognosis and complications of {}?',
    }

    def __init__(self, model_name='all-MiniLM-L6-v2',
                 index_name='semantic-search-fast-med'):
        """Load the sentence encoder on GPU when available, otherwise on CPU.

        Args:
            model_name: SentenceTransformer model identifier to load.
            index_name: name of the Pinecone index used by this instance.
        """
        device = 'cuda' if torch.cuda.is_available() else 'cpu'

        self.sm = SentenceTransformer(model_name, device=device)
        self.index_name = index_name
        self.index = None

    def init_pinecone(self):
        """Connect to Pinecone, creating the index if it does not exist yet.

        Credentials are read from the environment rather than embedded in the
        source: ``PINECONE_API_KEY`` is required, ``PINECONE_ENV`` is optional
        and defaults to ``us-west4-gcp``.

        Returns:
            The connected index handle (also stored on ``self.index``).

        Raises:
            RuntimeError: if ``PINECONE_API_KEY`` is unset or empty.
        """
        # Never hard-code the key: anything committed to source control must
        # be treated as compromised and rotated.
        api_key = os.environ.get('PINECONE_API_KEY')
        if not api_key:
            raise RuntimeError(
                "PINECONE_API_KEY environment variable is not set; "
                "get an API key from app.pinecone.io"
            )
        environment = os.environ.get('PINECONE_ENV') or 'us-west4-gcp'

        pinecone.init(api_key=api_key, environment=environment)

        # Only create the index if it doesn't exist; its dimension must match
        # the encoder's embedding size.
        if self.index_name not in pinecone.list_indexes():
            pinecone.create_index(
                name=self.index_name,
                dimension=self.sm.get_sentence_embedding_dimension(),
                metric='cosine',
            )

        self.index = pinecone.GRPCIndex(self.index_name)
        return self.index

    @classmethod
    def _build_contexts(cls, frame):
        """Turn a disease table into a list of "question\\n\\nanswer" passages.

        The first column of ``frame`` is read as the disease name; every other
        non-null cell whose column has an entry in ``QUESTION_TEMPLATES``
        yields one passage.

        Rows are skipped when the name is not a string (blank/NaN cell) or
        when the same name, compared case-insensitively after stripping, was
        already seen. Columns without a template are skipped.
        """
        contexts = []
        seen = set()

        for _, row in frame.iterrows():
            disease = row.iloc[0]
            if not isinstance(disease, str):
                continue

            name = disease.strip()
            key = name.lower()
            if key in seen:
                continue
            seen.add(key)

            for section, answer in row.iloc[1:].dropna().items():
                template = cls.QUESTION_TEMPLATES.get(section)
                if template is None:
                    continue
                contexts.append(f"{template.format(name)}\n\n{answer}")

        return contexts

    def build_index(self, data_path='/kaggle/input/drug-p/Diseases_data_W.xlsx',
                    batch_size=128):
        """Embed the disease passages and upsert them, unless already indexed.

        Connects to Pinecone first when needed. If the index already reports
        a non-zero ``total_vector_count`` nothing is uploaded.

        Args:
            data_path: Excel file read with ``pandas.read_excel``.
            batch_size: number of passages encoded and upserted per request.
        """
        index = self.index if self.index is not None else self.init_pinecone()

        if index.describe_index_stats()['total_vector_count']:
            print("Index already built")
            return

        contexts = self._build_contexts(pd.read_excel(data_path))

        for start in tqdm(range(0, len(contexts), batch_size)):
            batch = contexts[start:start + batch_size]
            # IDs are the passage positions, so a re-run overwrites in place.
            ids = [str(n) for n in range(start, start + len(batch))]
            # The passage text is stored as metadata under the 'text' key.
            metadatas = [{'text': text} for text in batch]
            embeddings = self.sm.encode(batch).tolist()
            index.upsert(vectors=list(zip(ids, embeddings, metadatas)))

    def search(self, query: str = "medicines for fever", top_k=3):
        """Return the ``top_k`` indexed passages closest to ``query``.

        Builds (and connects to) the index on first use.

        Args:
            query: free-text question to embed and look up.
            top_k: number of matches to request.

        Returns:
            The Pinecone query response, with metadata included.
        """
        if self.index is None:
            self.build_index()

        xq = self.sm.encode(query).tolist()

        return self.index.query(vector=xq, top_k=top_k, include_metadata=True)