thankrandomness committed · Commit b8e33aa · Parent: 84cf34d

change data ingestion logic
app.py CHANGED
@@ -11,14 +11,7 @@ import gradio as gr
 
 # Load the private dataset using the token
 #dataset = load_dataset("thankrandomness/mimic-iii", token=hf_token)
-dataset = load_dataset("thankrandomness/mimic-iii")
-
-# Debug: Print the keys of the first row to understand the structure
-#print(dataset['train'][0].keys())
-
-# Load model and tokenizer
-#tokenizer = AutoTokenizer.from_pretrained("NeuML/pubmedbert-base-embeddings-matryoshka")
-#model = AutoModel.from_pretrained("NeuML/pubmedbert-base-embeddings-matryoshka")
+dataset = load_dataset("thankrandomness/mimic-iii-sample")
 
 # Load PubMedBERT model and tokenizer
 tokenizer = AutoTokenizer.from_pretrained("microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext")
@@ -41,45 +34,50 @@ def embed_text(text, max_length=512):
 
 client = chromadb.Client()
 collection = client.create_collection(name="pubmedbert_embeddings")
 
-#
-ids = []
-embeddings = []
-metadatas = []
-
-# Process the dataset and load into ChromaDB
+# Process the dataset and upsert into ChromaDB
 for i, row in enumerate(dataset['train']):
-    for note_index, note in enumerate(row['notes']):
-        text = note.get('text', '')
-        annotations = note.get('annotations', [])
-
-        if text and annotations:
-
-            # Generate embedding
-            embedding = embed_text(text)
-
-            # Append to lists
-            ids.append(f"doc_{i}_{note_index}")
-            embeddings.append(embedding)
-            metadatas.append({"text": text, "annotations": annotations})
-        else:
-            print(f"Skipping note in row {i} due to missing 'text' or 'annotations'")
+    for note in row['notes']:
+        text = note.get('text', '')
+        annotations_list = []
+
+        for annotation in note.get('annotations', []):
+            try:
+                code = annotation['code']
+                code_system = annotation['code_system']
+                description = annotation['description']
+                #annotations_list.append(f"{code}: {code_system}: {description}")
+                annotations_list.append({"code": code, "code_system": code_system, "description": description})
+            except KeyError as e:
+                print(f"Skipping annotation due to missing key: {e}")
 
-# Upsert
-collection.upsert(ids=ids, embeddings=embeddings, metadatas=metadatas)
+        print(f"Processed annotations for note {note['note_id']}: {annotations_list}")
+
+        if text and annotations_list:
+            embeddings = embed_text([text])[0]
+
+            # Upsert data, embeddings, and annotations into ChromaDB
+            for j, annotation in enumerate(annotations_list):
+                collection.upsert(
+                    ids=[f"note_{note['note_id']}_{j}"],
+                    embeddings=[embeddings],
+                    metadatas=[annotation]
+                )
+        else:
+            print(f"Skipping note {note['note_id']} due to missing 'text' or 'annotations'")
 
 # Define retrieval function
 def retrieve_relevant_text(input_text):
-    input_embedding = embed_text(input_text)
-    results = collection.query(
-        query_embeddings=[input_embedding], n_results=5)
-    # Extract
+    input_embedding = embed_text([input_text])[0]  # Get the embedding for the single input text
+    results = collection.query(query_embeddings=[input_embedding], n_results=5)
+    print(results)
+    # Extract code and similarity scores
     output = []
-    for result in results:
+    for result in results['results']:
+        print(result)
         for annotation in result["metadata"]["annotations"]:
             output.append({
-                "similarity_score": result["distance"],
-                "code": annotation["code"],
-                "description": annotation["description"]
+                "similarity_score": result["distances"],
+                "annotation": annotation
            })
     return output
 
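On the retrieval side, note that the standard chromadb Python client's `query()` does not return a `results` key; it returns parallel lists under `ids`, `metadatas`, and `distances`, with one inner list per query embedding. A sketch of unpacking under that assumption, where `unpack_query_results` is a hypothetical helper and each metadata is one annotation dict as upserted above:

from typing import Any

def unpack_query_results(collection: Any, input_embedding: list[float], n_results: int = 5) -> list[dict]:
    # Assumes the standard chromadb QueryResult shape: parallel lists keyed by
    # "ids", "metadatas", "distances", each holding one inner list per query embedding.
    results = collection.query(query_embeddings=[input_embedding], n_results=n_results)
    output = []
    for metadata, distance in zip(results["metadatas"][0], results["distances"][0]):
        # Each metadata is one annotation dict: {"code": ..., "code_system": ..., "description": ...}
        output.append({"similarity_score": distance, "annotation": metadata})
    return output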