Spaces:

xpsychted
/

MedQA-Assistant

Runtime error

App Files Files Community

xpsychted committed on Oct 6, 2023

Commit

5758a81

•

1 Parent(s): b1c3977

Update pinecone_integration.py

Browse files

Files changed (1) hide show

pinecone_integration.py +115 -0

pinecone_integration.py CHANGED Viewed

	@@ -0,0 +1,115 @@

+import pandas as pd
+import os
+from tqdm.auto import tqdm
+import pinecone
+from sentence_transformers import SentenceTransformer
+import torch
class PineconeIndex:
    """Semantic search over disease Q&A passages stored in a Pinecone index.

    Passages are embedded with the ``all-MiniLM-L6-v2`` sentence-transformer
    model and stored in the Pinecone index named by ``self.index_name``.

    Attributes:
        sm: The ``SentenceTransformer`` model used for every embedding.
        index_name: Name of the Pinecone index to create / connect to.
        index: Connected index handle, or ``None`` until ``init_pinecone``
            has been called.
    """

    # Spreadsheet column name -> question template; ``{}`` receives the
    # disease name taken from the first column of the row.
    _QUESTION_TEMPLATES = {
        'About': 'What is {}?',
        'Symptoms': 'What are symptoms of {}?',
        'Causes': 'What are causes of {}?',
        'Diagnosis': 'What are diagnosis for {}?',
        'Risk Factors': 'What are the risk factors for {}?',
        'Treatment Options': 'What are the treatment options for {}?',
        # The original template had no placeholder, so the disease name was
        # silently dropped from every prognosis passage.
        'Prognosis and Complications':
            'What are the prognosis and complications of {}?',
    }

    def __init__(self):
        device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.sm = SentenceTransformer('all-MiniLM-L6-v2', device=device)
        self.index_name = 'semantic-search-fast-med'
        self.index = None

    def init_pinecone(self):
        """Connect to the Pinecone index, creating it first if it is missing.

        Credentials are read from the environment so that no secret lives in
        the source tree:

        * ``PINECONE_API_KEY`` -- required.
        * ``PINECONE_ENV`` -- optional, defaults to ``"us-west4-gcp"``.

        Returns:
            The connected index handle (also stored on ``self.index``).

        Raises:
            RuntimeError: If ``PINECONE_API_KEY`` is unset or empty.
        """
        # SECURITY: a key was previously hard-coded here; any key that was
        # committed to the repository should be revoked in the Pinecone console.
        api_key = os.environ.get('PINECONE_API_KEY')
        if not api_key:
            raise RuntimeError(
                "PINECONE_API_KEY environment variable is not set; "
                "get an API key from app.pinecone.io and export it."
            )
        environment = os.environ.get('PINECONE_ENV') or 'us-west4-gcp'

        pinecone.init(api_key=api_key, environment=environment)

        # Only create the index if it doesn't exist yet.
        if self.index_name not in pinecone.list_indexes():
            pinecone.create_index(
                name=self.index_name,
                dimension=self.sm.get_sentence_embedding_dimension(),
                metric='cosine',
            )

        self.index = pinecone.GRPCIndex(self.index_name)
        return self.index

    @classmethod
    def _build_contexts(cls, frame):
        """Turn the disease spreadsheet into ``"question\\n\\nanswer"`` passages.

        The first column of ``frame`` holds the disease name; each remaining
        column whose cell is not null yields one passage. Only the first row
        for a given disease (compared stripped and lower-cased) is used, and
        rows whose disease cell is not a string (e.g. blank / NaN) are skipped.

        Raises:
            KeyError: If a non-null column has no entry in the template table.
        """
        contexts = []
        seen = set()
        for _, row in frame.iterrows():
            disease = row.iloc[0]
            if not isinstance(disease, str):
                # Blank / NaN name cell: nothing sensible to index.
                continue
            key = disease.strip().lower()
            if key in seen:
                continue
            seen.add(key)
            for column, answer in row.iloc[1:].dropna().items():
                question = cls._QUESTION_TEMPLATES[column].format(disease)
                contexts.append(f"{question}\n\n{answer}")
        return contexts

    def build_index(self,
                    data_path='/kaggle/input/drug-p/Diseases_data_W.xlsx',
                    batch_size=128):
        """Populate the Pinecone index from the disease spreadsheet.

        Connects to Pinecone first if needed. Does nothing when the index
        already reports a non-zero vector count.

        Args:
            data_path: Excel file with the disease name in the first column
                and one column per question category.
            batch_size: Number of passages embedded and upserted per request.
        """
        index = self.index if self.index is not None else self.init_pinecone()

        if index.describe_index_stats()['total_vector_count']:
            # Index already built.
            return

        contexts = self._build_contexts(pd.read_excel(data_path))

        for start in tqdm(range(0, len(contexts), batch_size)):
            batch = contexts[start:start + batch_size]
            # IDs are the passages' positions in ``contexts``, as strings.
            ids = [str(n) for n in range(start, start + len(batch))]
            metadatas = [{'text': text} for text in batch]
            embeddings = self.sm.encode(batch).tolist()
            index.upsert(vectors=list(zip(ids, embeddings, metadatas)))

    def search(self, query: str = "medicines for fever", top_k: int = 3):
        """Return the ``top_k`` indexed passages closest to ``query``.

        Builds (and, if necessary, connects to) the index on first use.

        Args:
            query: Free-text question to embed and look up.
            top_k: Number of matches to request from Pinecone.

        Returns:
            The response of the index ``query`` call, with metadata included.
        """
        if self.index is None:
            self.build_index()
        query_vector = self.sm.encode(query).tolist()
        return self.index.query(query_vector, top_k=top_k, include_metadata=True)