awacke1 committed on
Commit
d58378b
1 Parent(s): 26e5ae2

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +86 -2
app.py CHANGED
@@ -66,6 +66,30 @@ def profile_dataset(dataset=datasetSNOMED, username="awacke1", token=HF_TOKEN, d
66
  #JSONOBJ_MAP=datasetLOINC.map(lowercase_title)
67
  #JSONOBJ_MAP=datasetLOINC.filter(lambda example: example["Description"].startswith("Mental health"))
68
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
69
  def fn( text1, text2, num, slider1, slider2, single_checkbox, checkboxes, radio, dropdown, im1, im2, im3, im4,
70
  video, audio1, audio2, file, df1, df2,):
71
  #def fn( text1, text2, single_checkbox, checkboxes, radio, im4, file, df1, df2,):
@@ -73,7 +97,10 @@ def fn( text1, text2, num, slider1, slider2, single_checkbox,
73
  searchTerm = text1
74
  searchTermSentence = text2
75
 
76
- start_with_searchTermLOINC = datasetLOINC.filter(lambda example: example["Description"].startswith('Allergy')) #Allergy
 
 
 
77
  columns = start_with_searchTermLOINC.column_names
78
  columns_to_keep = ["Value Set Name", "Code", "Description", "Purpose: Clinical Focus", "Code System OID"]
79
  columns_to_remove = set(columns_to_keep).symmetric_difference(columns)
@@ -81,11 +108,68 @@ def fn( text1, text2, num, slider1, slider2, single_checkbox,
81
  start_with_searchTermLOINC
82
  start_with_searchTermLOINC.set_format("pandas")
83
  df = start_with_searchTermLOINC[:]
84
- #df["Purpose: Clinical Focus"][0].tolist()
85
  df["Purpose: Clinical Focus"][0]
 
86
  df4 = df.explode("Purpose: Clinical Focus", ignore_index=True)
87
  df4.head(4)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
88
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
89
  start_with_searchTermSNOMED = datasetSNOMED.filter(lambda example: example["Description"].startswith('Hospital')) #Hospital
90
  start_with_searchTermCQM = dataseteCQM.filter(lambda example: example["Description"].startswith('Telephone')) #Telephone
91
 
 
66
  #JSONOBJ_MAP=datasetLOINC.map(lowercase_title)
67
  #JSONOBJ_MAP=datasetLOINC.filter(lambda example: example["Description"].startswith("Mental health"))
68
 
69
+
70
+
71
+
72
+ def concatenate_text(examples):
73
+ return {
74
+ "text": examples["Code"]
75
+ + " \n "
76
+ + examples["Description"]
77
+ + " \n "
78
+ + examples["Purpose: Clinical Focus"]
79
+ }
80
+
81
+ def cls_pooling(model_output):
82
+ return model_output.last_hidden_state[:, 0]
83
+
84
+ def get_embeddings(text_list):
85
+ encoded_input = tokenizer(
86
+ text_list, padding=True, truncation=True, return_tensors="tf"
87
+ )
88
+ encoded_input = {k: v for k, v in encoded_input.items()}
89
+ model_output = model(**encoded_input)
90
+ return cls_pooling(model_output)
91
+
92
+
93
  def fn( text1, text2, num, slider1, slider2, single_checkbox, checkboxes, radio, dropdown, im1, im2, im3, im4,
94
  video, audio1, audio2, file, df1, df2,):
95
  #def fn( text1, text2, single_checkbox, checkboxes, radio, im4, file, df1, df2,):
 
97
  searchTerm = text1
98
  searchTermSentence = text2
99
 
100
+ start_with_searchTermLOINC = datasetLOINC.filter(lambda example:example["Description"].startswith('Allergy')) #Allergy
101
+
102
+
103
+ # FAISS
104
  columns = start_with_searchTermLOINC.column_names
105
  columns_to_keep = ["Value Set Name", "Code", "Description", "Purpose: Clinical Focus", "Code System OID"]
106
  columns_to_remove = set(columns_to_keep).symmetric_difference(columns)
 
108
  start_with_searchTermLOINC
109
  start_with_searchTermLOINC.set_format("pandas")
110
  df = start_with_searchTermLOINC[:]
111
+
112
  df["Purpose: Clinical Focus"][0]
113
+
114
  df4 = df.explode("Purpose: Clinical Focus", ignore_index=True)
115
  df4.head(4)
116
+
117
+ from datasets import Dataset
118
+ clinical_dataset = Dataset.from_pandas(df4)
119
+ clinical_dataset
120
+
121
+ clinical_dataset = clinical_dataset.map(lambda x: {"c_length": len(x["Description"].split())})
122
+
123
+ clinical_dataset = clinical_dataset.filter(lambda x: x["c_length"] > 15)
124
+ clinical_dataset
125
+
126
+
127
+ clinical_dataset = clinical_dataset.map(concatenate_text)
128
+ embedding = get_embeddings(comments_dataset["text"][0])
129
+ embedding.shape
130
+
131
+ from transformers import AutoTokenizer, TFAutoModel
132
+
133
+ model_ckpt = "sentence-transformers/multi-qa-mpnet-base-dot-v1"
134
+ tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
135
+ model = TFAutoModel.from_pretrained(model_ckpt, from_pt=True)
136
+
137
+ TensorShape([1, 768])
138
+
139
+ embeddings_dataset = comments_dataset.map(
140
+ lambda x: {"embeddings": get_embeddings(x["text"]).numpy()[0]})
141
+
142
+ embeddings_dataset.add_faiss_index(column="embeddings")
143
 
144
+ question = "How can I load a dataset offline?"
145
+ question_embedding = get_embeddings([question]).numpy()
146
+ question_embedding.shape
147
+
148
+ scores, samples = embeddings_dataset.get_nearest_examples("embeddings", question_embedding, k=5)
149
+
150
+ import pandas as pd
151
+
152
+ samples_df = pd.DataFrame.from_dict(samples)
153
+ samples_df["scores"] = scores
154
+ samples_df.sort_values("scores", ascending=False, inplace=True)
155
+
156
+
157
+ # "text": examples["Code"]
158
+ # + " \n "
159
+ # + examples["Description"]
160
+ # + " \n "
161
+ # + examples["Purpose: Clinical Focus"]
162
+
163
+
164
+ for _, row in samples_df.iterrows():
165
+ print(f"Code: {row.Code}")
166
+ print(f"Description: {row.Description}")
167
+ #print(f"Purpose: Clinical Focus: {row.Purpose: Clinical Focus}")
168
+ #print(f"URL: {row.html_url}")
169
+ print("=" * 50)
170
+ print()
171
+
172
+ # SNOMED and CQM ---------------
173
  start_with_searchTermSNOMED = datasetSNOMED.filter(lambda example: example["Description"].startswith('Hospital')) #Hospital
174
  start_with_searchTermCQM = dataseteCQM.filter(lambda example: example["Description"].startswith('Telephone')) #Telephone
175