Spaces:
Paused
Paused
Update app.py
Browse files
app.py
CHANGED
@@ -66,6 +66,30 @@ def profile_dataset(dataset=datasetSNOMED, username="awacke1", token=HF_TOKEN, d
|
|
66 |
#JSONOBJ_MAP=datasetLOINC.map(lowercase_title)
|
67 |
#JSONOBJ_MAP=datasetLOINC.filter(lambda example: example["Description"].startswith("Mental health"))
|
68 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
69 |
def fn( text1, text2, num, slider1, slider2, single_checkbox, checkboxes, radio, dropdown, im1, im2, im3, im4,
|
70 |
video, audio1, audio2, file, df1, df2,):
|
71 |
#def fn( text1, text2, single_checkbox, checkboxes, radio, im4, file, df1, df2,):
|
@@ -73,7 +97,10 @@ def fn( text1, text2, num, slider1, slider2, single_checkbox,
|
|
73 |
searchTerm = text1
|
74 |
searchTermSentence = text2
|
75 |
|
76 |
-
start_with_searchTermLOINC = datasetLOINC.filter(lambda example:
|
|
|
|
|
|
|
77 |
columns = start_with_searchTermLOINC.column_names
|
78 |
columns_to_keep = ["Value Set Name", "Code", "Description", "Purpose: Clinical Focus", "Code System OID"]
|
79 |
columns_to_remove = set(columns_to_keep).symmetric_difference(columns)
|
@@ -81,11 +108,68 @@ def fn( text1, text2, num, slider1, slider2, single_checkbox,
|
|
81 |
start_with_searchTermLOINC
|
82 |
start_with_searchTermLOINC.set_format("pandas")
|
83 |
df = start_with_searchTermLOINC[:]
|
84 |
-
|
85 |
df["Purpose: Clinical Focus"][0]
|
|
|
86 |
df4 = df.explode("Purpose: Clinical Focus", ignore_index=True)
|
87 |
df4.head(4)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
88 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
89 |
start_with_searchTermSNOMED = datasetSNOMED.filter(lambda example: example["Description"].startswith('Hospital')) #Hospital
|
90 |
start_with_searchTermCQM = dataseteCQM.filter(lambda example: example["Description"].startswith('Telephone')) #Telephone
|
91 |
|
|
|
66 |
#JSONOBJ_MAP=datasetLOINC.map(lowercase_title)
|
67 |
#JSONOBJ_MAP=datasetLOINC.filter(lambda example: example["Description"].startswith("Mental health"))
|
68 |
|
69 |
+
|
70 |
+
|
71 |
+
|
72 |
+
def concatenate_text(examples):
    """Build the combined ``"text"`` field for one record.

    Reads the ``"Code"``, ``"Description"`` and ``"Purpose: Clinical Focus"``
    entries of ``examples`` (in that order) and glues them together with
    ``" \\n "`` between consecutive parts.

    Args:
        examples: Mapping that provides the three keys above.
            # presumably a single dataset row with string values, since this
            # is passed to ``Dataset.map`` — confirm against the caller

    Returns:
        A one-entry dict ``{"text": <joined string>}``.
    """
    field_names = ("Code", "Description", "Purpose: Clinical Focus")
    joined = " \n ".join(examples[name] for name in field_names)
    return {"text": joined}
|
80 |
+
|
81 |
+
def cls_pooling(model_output):
    """Select index 0 along the second axis of the model's last hidden state.

    Args:
        model_output: Object exposing a ``last_hidden_state`` attribute that
            supports ``[:, 0]`` indexing.
            # presumably (batch, tokens, hidden) with the [CLS] token at
            # position 0 — TODO confirm for the checkpoint in use

    Returns:
        ``model_output.last_hidden_state[:, 0]``.
    """
    hidden_states = model_output.last_hidden_state
    return hidden_states[:, 0]
|
83 |
+
|
84 |
+
def get_embeddings(text_list):
    """Tokenize ``text_list``, run the model, and return the pooled output.

    The text is passed to ``tokenizer`` with padding and truncation enabled
    and ``return_tensors="tf"``; the resulting items are copied into a plain
    dict, unpacked as keyword arguments to ``model``, and the model output is
    reduced with ``cls_pooling``.

    NOTE(review): ``tokenizer`` and ``model`` are neither parameters nor
    locals here, so they are resolved from the enclosing/global scope at call
    time — confirm they are defined at module level before this is called.

    Args:
        text_list: Text input forwarded unchanged to ``tokenizer``.

    Returns:
        Whatever ``cls_pooling`` returns for the model output.
    """
    batch = tokenizer(text_list, padding=True, truncation=True, return_tensors="tf")
    # Copy into a plain dict so the entries can be unpacked as keyword args.
    model_inputs = dict(batch.items())
    return cls_pooling(model(**model_inputs))
|
91 |
+
|
92 |
+
|
93 |
def fn( text1, text2, num, slider1, slider2, single_checkbox, checkboxes, radio, dropdown, im1, im2, im3, im4,
|
94 |
video, audio1, audio2, file, df1, df2,):
|
95 |
#def fn( text1, text2, single_checkbox, checkboxes, radio, im4, file, df1, df2,):
|
|
|
97 |
searchTerm = text1
|
98 |
searchTermSentence = text2
|
99 |
|
100 |
+
start_with_searchTermLOINC = datasetLOINC.filter(lambda example:example["Description"].startswith('Allergy')) #Allergy
|
101 |
+
|
102 |
+
|
103 |
+
# FAISS
|
104 |
columns = start_with_searchTermLOINC.column_names
|
105 |
columns_to_keep = ["Value Set Name", "Code", "Description", "Purpose: Clinical Focus", "Code System OID"]
|
106 |
columns_to_remove = set(columns_to_keep).symmetric_difference(columns)
|
|
|
108 |
start_with_searchTermLOINC
|
109 |
start_with_searchTermLOINC.set_format("pandas")
|
110 |
df = start_with_searchTermLOINC[:]
|
111 |
+
|
112 |
df["Purpose: Clinical Focus"][0]
|
113 |
+
|
114 |
df4 = df.explode("Purpose: Clinical Focus", ignore_index=True)
|
115 |
df4.head(4)
|
116 |
+
|
117 |
+
from datasets import Dataset
|
118 |
+
clinical_dataset = Dataset.from_pandas(df4)
|
119 |
+
clinical_dataset
|
120 |
+
|
121 |
+
clinical_dataset = clinical_dataset.map(lambda x: {"c_length": len(x["Description"].split())})
|
122 |
+
|
123 |
+
clinical_dataset = clinical_dataset.filter(lambda x: x["c_length"] > 15)
|
124 |
+
clinical_dataset
|
125 |
+
|
126 |
+
|
127 |
+
clinical_dataset = clinical_dataset.map(concatenate_text)
|
128 |
+
embedding = get_embeddings(comments_dataset["text"][0])
|
129 |
+
embedding.shape
|
130 |
+
|
131 |
+
from transformers import AutoTokenizer, TFAutoModel
|
132 |
+
|
133 |
+
model_ckpt = "sentence-transformers/multi-qa-mpnet-base-dot-v1"
|
134 |
+
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
|
135 |
+
model = TFAutoModel.from_pretrained(model_ckpt, from_pt=True)
|
136 |
+
|
137 |
+
TensorShape([1, 768])
|
138 |
+
|
139 |
+
embeddings_dataset = comments_dataset.map(
|
140 |
+
lambda x: {"embeddings": get_embeddings(x["text"]).numpy()[0]})
|
141 |
+
|
142 |
+
embeddings_dataset.add_faiss_index(column="embeddings")
|
143 |
|
144 |
+
question = "How can I load a dataset offline?"
|
145 |
+
question_embedding = get_embeddings([question]).numpy()
|
146 |
+
question_embedding.shape
|
147 |
+
|
148 |
+
scores, samples = embeddings_dataset.get_nearest_examples("embeddings", question_embedding, k=5)
|
149 |
+
|
150 |
+
import pandas as pd
|
151 |
+
|
152 |
+
samples_df = pd.DataFrame.from_dict(samples)
|
153 |
+
samples_df["scores"] = scores
|
154 |
+
samples_df.sort_values("scores", ascending=False, inplace=True)
|
155 |
+
|
156 |
+
|
157 |
+
# "text": examples["Code"]
|
158 |
+
# + " \n "
|
159 |
+
# + examples["Description"]
|
160 |
+
# + " \n "
|
161 |
+
# + examples["Purpose: Clinical Focus"]
|
162 |
+
|
163 |
+
|
164 |
+
for _, row in samples_df.iterrows():
|
165 |
+
print(f"Code: {row.Code}")
|
166 |
+
print(f"Description: {row.Description}")
|
167 |
+
#print(f"Purpose: Clinical Focus: {row.Purpose: Clinical Focus}")
|
168 |
+
#print(f"URL: {row.html_url}")
|
169 |
+
print("=" * 50)
|
170 |
+
print()
|
171 |
+
|
172 |
+
# SNOMED and CQM ---------------
|
173 |
start_with_searchTermSNOMED = datasetSNOMED.filter(lambda example: example["Description"].startswith('Hospital')) #Hospital
|
174 |
start_with_searchTermCQM = dataseteCQM.filter(lambda example: example["Description"].startswith('Telephone')) #Telephone
|
175 |
|