Spaces:

bioscan-ml
/

browser-backend

Sleeping

App Files Community

atwang committed on Dec 4, 2024

Commit

1f788b3

1 Parent(s): 51e3825

update app to use indexes directly to get embeddings

Browse files

Files changed (3) hide show

app.py +32 -31
big_id_to_image_emb_dict.pickle +0 -3
big_indx_to_id_dict.pickle +2 -2

app.py CHANGED Viewed

@@ -11,11 +11,11 @@ import click
 def getRandID():
-    indx = random.randrange(0, 396503)
-    return indx_to_id_dict[indx], indx
-def chooseImageIndex(indexType):
     if indexType == "FlatIP(default)":
         return image_index_IP
     elif indexType == "FlatL2":
@@ -32,7 +32,7 @@ def chooseImageIndex(indexType):
         return image_index_LSH
-def chooseDNAIndex(indexType):
     if indexType == "FlatIP(default)":
         return dna_index_IP
     elif indexType == "FlatL2":
@@ -49,35 +49,36 @@ def chooseDNAIndex(indexType):
         return dna_index_LSH
-def searchEmbeddings(id, key_type, query_type, index_type):
-    # variable and index initialization
-    dim = 768
-    count = 0
-    num_neighbors = 10
-    index = faiss.IndexFlatIP(dim)
     # get index
     if query_type == "Image":
-        index = chooseImageIndex(index_type)
     elif query_type == "DNA":
-        index = chooseDNAIndex(index_type)
     # search for query
     if key_type == "Image":
-        query = id_to_image_emb_dict[id]
     elif key_type == "DNA":
-        query = id_to_dna_emb_dict[id]
-    query = query.astype(np.float32)
-    D, I = index.search(query, num_neighbors)
-    id_list = []
-    i = 1
     for indx in I[0]:
-        id = indx_to_id_dict[indx]
-        id_list.append(id)
-    return id_list
 with gr.Blocks() as demo:
@@ -102,14 +103,8 @@ with gr.Blocks() as demo:
     # with open("processid_to_index.pickle", "rb") as f:
     #     processid_to_index = pickle.load(f)
     with open("big_indx_to_id_dict.pickle", "rb") as f:
-        indx_to_id_dict = pickle.load(f)
-    # initialize both possible dicts
-    with open("big_id_to_image_emb_dict.pickle", "rb") as f:
-        id_to_image_emb_dict = pickle.load(f)
-    # with open("big_id_to_dna_emb_dict.pickle", "rb") as f:
-    #     id_to_dna_emb_dict = pickle.load(f)
-    id_to_dna_emb_dict = None
     with gr.Column():
         with gr.Row():
@@ -124,12 +119,18 @@ with gr.Blocks() as demo:
         index_type = gr.Radio(
             choices=["FlatIP(default)", "FlatL2", "HNSWFlat", "IVFFlat", "LSH"], label="Index:", value="FlatIP(default)"
         )
         process_id = gr.Textbox(label="ID:", info="Enter a sample ID to search for")
         process_id_list = gr.Textbox(label="Closest 10 matches:")
         search_btn = gr.Button("Search")
         id_btn.click(fn=getRandID, inputs=[], outputs=[rand_id, rand_id_indx])
-    search_btn.click(fn=searchEmbeddings, inputs=[process_id, key_type, query_type, index_type], outputs=[process_id_list])
 demo.launch()

 def getRandID():
+    indx = random.randrange(0, len(index_to_id_dict))
+    return index_to_id_dict[indx], indx
+def get_image_index(indexType):
     if indexType == "FlatIP(default)":
         return image_index_IP
     elif indexType == "FlatL2":
         return image_index_LSH
+def get_dna_index(indexType):
     if indexType == "FlatIP(default)":
         return dna_index_IP
     elif indexType == "FlatL2":
         return dna_index_LSH
+def searchEmbeddings(id, key_type, query_type, index_type, num_results: int = 10):
+    image_index = get_image_index(index_type)
+    dna_index = get_dna_index(index_type)
     # get index
     if query_type == "Image":
+        query = image_index.reconstruct(id_to_index_dict[id])
     elif query_type == "DNA":
+        query = dna_index.reconstruct(id_to_index_dict[id])
+    else:
+        raise ValueError(f"Invalid query type: {query_type}")
+    query = query.astype(np.float32)
+    query = np.expand_dims(query, axis=0)
     # search for query
     if key_type == "Image":
+        index = image_index
     elif key_type == "DNA":
+        index = dna_index
+    else:
+        raise ValueError(f"Invalid key type: {key_type}")
+    _, I = index.search(query, num_results)
+    closest_ids = []
     for indx in I[0]:
+        id = index_to_id_dict[indx]
+        closest_ids.append(id)
+    return closest_ids
 with gr.Blocks() as demo:
     # with open("processid_to_index.pickle", "rb") as f:
     #     processid_to_index = pickle.load(f)
     with open("big_indx_to_id_dict.pickle", "rb") as f:
+        index_to_id_dict = pickle.load(f)
+    id_to_index_dict = {v: k for k, v in index_to_id_dict.items()}
     with gr.Column():
         with gr.Row():
         index_type = gr.Radio(
             choices=["FlatIP(default)", "FlatL2", "HNSWFlat", "IVFFlat", "LSH"], label="Index:", value="FlatIP(default)"
         )
+        num_results = gr.Number(label="Number of Results:", value=10, precision=0)
         process_id = gr.Textbox(label="ID:", info="Enter a sample ID to search for")
         process_id_list = gr.Textbox(label="Closest 10 matches:")
         search_btn = gr.Button("Search")
         id_btn.click(fn=getRandID, inputs=[], outputs=[rand_id, rand_id_indx])
+    search_btn.click(
+        fn=searchEmbeddings,
+        inputs=[process_id, key_type, query_type, index_type, num_results],
+        outputs=[process_id_list],
+    )
 demo.launch()

big_id_to_image_emb_dict.pickle DELETED Viewed

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:4fb3f21f2d38a91cb2cad8f40449f31c12d481944d93e9c61def2d3e8e6b78eb
-size 274402415

big_indx_to_id_dict.pickle CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:a192bfb968d669f59ad0c5438751f1585094a67d2130b80c56db9731d4406e10
-size 7861755

 version https://git-lfs.github.com/spec/v1
+oid sha256:ee0a9044e054f640b704247a2fa2e74219180b78ded6ba07f551bfc222657fc5
+size 885457