thankrandomness committed · Commit b8e33aa · Parent: 84cf34d

change data ingestion logic
app.py CHANGED
@@ -11,14 +11,7 @@ import gradio as gr
 
 # Load the private dataset using the token
 #dataset = load_dataset("thankrandomness/mimic-iii", token=hf_token)
-dataset = load_dataset("thankrandomness/mimic-iii")
-
-# Debug: Print the keys of the first row to understand the structure
-#print(dataset['train'][0].keys())
-
-# Load model and tokenizer
-#tokenizer = AutoTokenizer.from_pretrained("NeuML/pubmedbert-base-embeddings-matryoshka")
-#model = AutoModel.from_pretrained("NeuML/pubmedbert-base-embeddings-matryoshka")
+dataset = load_dataset("thankrandomness/mimic-iii-sample")
 
 # Load PubMedBERT model and tokenizer
 tokenizer = AutoTokenizer.from_pretrained("microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext")
@@ -41,45 +34,50 @@ def embed_text(text, max_length=512):
 
 client = chromadb.Client()
 collection = client.create_collection(name="pubmedbert_embeddings")
 
-#
-ids = []
-embeddings = []
-metadatas = []
-
-# Process the dataset and load into ChromaDB
+# Process the dataset and upsert into ChromaDB
 for i, row in enumerate(dataset['train']):
-    for note_index, note in enumerate(row['notes']):
-        text = note.get('text', '')
-        annotations = note.get('annotations', [])
-
-        if text and annotations:
-
-            # Generate embedding
-            embedding = embed_text(text)
-
-            # Append to lists
-            ids.append(f"doc_{i}_{note_index}")
-            embeddings.append(embedding)
-            metadatas.append({"text": text, "annotations": annotations})
-        else:
-            print(f"Skipping note in row {i} due to missing 'text' or 'annotations'")
+    for note in row['notes']:
+        text = note.get('text', '')
+        annotations_list = []
+
+        for annotation in note.get('annotations', []):
+            try:
+                code = annotation['code']
+                code_system = annotation['code_system']
+                description = annotation['description']
+                #annotations_list.append(f"{code}: {code_system}: {description}")
+                annotations_list.append({"code": code, "code_system": code_system, "description": description})
+            except KeyError as e:
+                print(f"Skipping annotation due to missing key: {e}")
 
-# Upsert
-collection.upsert(ids=ids, embeddings=embeddings, metadatas=metadatas)
+        print(f"Processed annotations for note {note['note_id']}: {annotations_list}")
+
+        if text and annotations_list:
+            embeddings = embed_text([text])[0]
+
+            # Upsert data, embeddings, and annotations into ChromaDB
+            for j, annotation in enumerate(annotations_list):
+                collection.upsert(
+                    ids=[f"note_{note['note_id']}_{j}"],
+                    embeddings=[embeddings],
+                    metadatas=[annotation]
+                )
+        else:
+            print(f"Skipping note {note['note_id']} due to missing 'text' or 'annotations'")
 
 # Define retrieval function
 def retrieve_relevant_text(input_text):
-    input_embedding = embed_text(input_text)
-    results = collection.query(
-        query_embeddings=[input_embedding], n_results=5)
-    # Extract
+    input_embedding = embed_text([input_text])[0]  # Get the embedding for the single input text
+    results = collection.query(query_embeddings=[input_embedding], n_results=5)
+    print(results)
+    # Extract code and similarity scores
     output = []
-    for result in results:
+    for result in results['results']:
+        print(result)
         for annotation in result["metadata"]["annotations"]:
             output.append({
-                "similarity_score": result["distance"],
-                "code": annotation["code"],
-                "description": annotation["description"]
+                "similarity_score": result["distances"],
+                "annotation": annotation
            })
     return output
 
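On the retrieval side, note that the standard chromadb Python client's `query()` does not return a `results` key; it returns parallel lists under `ids`, `metadatas`, and `distances`, with one inner list per query embedding. A sketch of unpacking under that assumption, where `unpack_query_results` is a hypothetical helper and each metadata is one annotation dict as upserted above:

from typing import Any

def unpack_query_results(collection: Any, input_embedding: list[float], n_results: int = 5) -> list[dict]:
    # Assumes the standard chromadb QueryResult shape: parallel lists keyed by
    # "ids", "metadatas", "distances", each holding one inner list per query embedding.
    results = collection.query(query_embeddings=[input_embedding], n_results=n_results)
    output = []
    for metadata, distance in zip(results["metadatas"][0], results["distances"][0]):
        # Each metadata is one annotation dict: {"code": ..., "code_system": ..., "description": ...}
        output.append({"similarity_score": distance, "annotation": metadata})
    return output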