thankrandomness committed
Commit b8e33aa · Parent: 84cf34d

change data ingestion logic

Files changed (1): app.py +36 -38
app.py CHANGED
@@ -11,14 +11,7 @@ import gradio as gr
 
 # Load the private dataset using the token
 #dataset = load_dataset("thankrandomness/mimic-iii", token=hf_token)
-dataset = load_dataset("thankrandomness/mimic-iii")
-
-# Debug: Print the keys of the first row to understand the structure
-#print(dataset['train'][0].keys())
-
-# Load model and tokenizer
-#tokenizer = AutoTokenizer.from_pretrained("NeuML/pubmedbert-base-embeddings-matryoshka")
-#model = AutoModel.from_pretrained("NeuML/pubmedbert-base-embeddings-matryoshka")
+dataset = load_dataset("thankrandomness/mimic-iii-sample")
 
 # Load PubMedBERT model and tokenizer
 tokenizer = AutoTokenizer.from_pretrained("microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext")
@@ -41,45 +34,50 @@ def embed_text(text, max_length=512):
 client = chromadb.Client()
 collection = client.create_collection(name="pubmedbert_embeddings")
 
-# Prepare lists for ids, embeddings, and metadatas
-ids = []
-embeddings = []
-metadatas = []
-
-# Process the dataset and load into ChromaDB
+# Process the dataset and upsert into ChromaDB
 for i, row in enumerate(dataset['train']):
-    # Iterate over the notes array
-    for note_index, note in enumerate(row['notes']):
-        text = note.get('text')
-        annotations = note.get('annotations')
+    for note in row['notes']:
+        text = note.get('text', '')
+        annotations_list = []
+
+        for annotation in note.get('annotations', []):
+            try:
+                code = annotation['code']
+                code_system = annotation['code_system']
+                description = annotation['description']
+                #annotations_list.append(f"{code}: {code_system}: {description}")
+                annotations_list.append({"code": code, "code_system": code_system, "description": description})
+            except KeyError as e:
+                print(f"Skipping annotation due to missing key: {e}")
 
-        # Compute the embedding if text and annotations are present
-        if text and annotations:
-            embedding = embed_text(text)
-
-            # Append to lists
-            ids.append(f"doc_{i}_{note_index}")
-            embeddings.append(embedding)
-            metadatas.append({"text": text, "annotations": annotations})
-        else:
-            print(f"Skipping note in row {i} due to missing 'text' or 'annotations'")
+        print(f"Processed annotations for note {note['note_id']}: {annotations_list}")
+
+        if text and annotations_list:
+            embeddings = embed_text([text])[0]
 
-# Upsert all documents
-collection.upsert(ids=ids, embeddings=embeddings, metadatas=metadatas)
+            # Upsert data, embeddings, and annotations into ChromaDB
+            for j, annotation in enumerate(annotations_list):
+                collection.upsert(
+                    ids=[f"note_{note['note_id']}_{j}"],
+                    embeddings=[embeddings],
+                    metadatas=[annotation]
+                )
+        else:
+            print(f"Skipping note {note['note_id']} due to missing 'text' or 'annotations'")
 
 # Define retrieval function
 def retrieve_relevant_text(input_text):
-    input_embedding = embed_text(input_text)
-    results = collection.query(query_embedding=input_embedding, top_k=5)
-
-    # Extract annotations and similarity scores
+    input_embedding = embed_text([input_text])[0]  # Get the embedding for the single input text
+    results = collection.query(query_embeddings=[input_embedding], n_results=5)
+    print(results)
+    # Extract code and similarity scores
    output = []
-    for result in results:
+    for result in results['results']:
+        print(result)
        for annotation in result["metadata"]["annotations"]:
            output.append({
-                "similarity_score": result["similarity"],
-                "code": annotation["code"],
-                "description": annotation["description"]
+                "similarity_score": result["distances"],
+                "annotation": annotation
            })
    return output
 
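Note: embed_text itself is not touched by this commit, but the new call sites (embed_text([text])[0] and embed_text([input_text])[0]) assume it accepts a list of strings and returns one vector per input. A minimal sketch consistent with that interface, reusing the module-level tokenizer and model; the mean-pooling strategy is an assumption, not something shown in the diff:

import torch

def embed_text(texts, max_length=512):
    # Tokenize a batch of strings, truncating/padding to max_length.
    inputs = tokenizer(texts, padding=True, truncation=True,
                       max_length=max_length, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)
    # Mean-pool token embeddings, ignoring padding positions.
    mask = inputs["attention_mask"].unsqueeze(-1).float()
    summed = (outputs.last_hidden_state * mask).sum(dim=1)
    counts = mask.sum(dim=1).clamp(min=1e-9)
    # One embedding (plain Python list of floats) per input string.
    return (summed / counts).tolist()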
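Note: chromadb's Collection.query returns a dict keyed by 'ids', 'distances', 'metadatas', and so on (each holding one inner list per query embedding); it has no 'results' key, and the metadatas written by the ingestion loop above are flat per-annotation dicts without an 'annotations' field. A hedged sketch of how the retrieval loop could be adapted to that layout, with field names taken from the ingestion loop and everything else illustrative:

def retrieve_relevant_text(input_text):
    input_embedding = embed_text([input_text])[0]
    results = collection.query(query_embeddings=[input_embedding], n_results=5)

    output = []
    # One inner list per query embedding; exactly one query was sent.
    for distance, metadata in zip(results["distances"][0], results["metadatas"][0]):
        output.append({
            "similarity_score": distance,  # smaller distance = more similar
            "code": metadata["code"],
            "code_system": metadata["code_system"],
            "description": metadata["description"],
        })
    return output

With this layout, each returned item carries one annotation together with the distance between the query embedding and the embedding of the note that annotation came from.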