Spaces:
Runtime error
Runtime error
Upload 3 files
Browse files- README.txt +13 -0
- app.py +327 -0
- requirements.txt +7 -0
README.txt
ADDED
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
---
|
2 |
+
title: 🧬CTMap - Clinical Terminology AutoMap AI
|
3 |
+
emoji: ⚗️🧠🔬🧬
|
4 |
+
colorFrom: yellow
|
5 |
+
colorTo: green
|
6 |
+
sdk: gradio
|
7 |
+
sdk_version: 3.5
|
8 |
+
app_file: app.py
|
9 |
+
pinned: false
|
10 |
+
license: mit
|
11 |
+
---
|
12 |
+
|
13 |
+
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
app.py
ADDED
@@ -0,0 +1,327 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pandas_profiling as pp
|
2 |
+
import pandas as pd
|
3 |
+
import tensorflow as tf
|
4 |
+
|
5 |
+
from datasets import load_dataset
|
6 |
+
from tensorflow.python.framework import tensor_shape
|
7 |
+
|
8 |
+
# --- Clinical terminology code sets, fetched from the Hugging Face Hub at import time ---
# LOINC
datasetLOINC = load_dataset("awacke1/LOINC-CodeSet-Value-Description.csv", split="train")
# SNOMED
datasetSNOMED = load_dataset("awacke1/SNOMED-CT-Code-Value-Semantic-Set.csv", split="train")
# eCQM
dataseteCQM = load_dataset("awacke1/eCQM-Code-Value-Semantic-Set.csv", split="train")

# Start-up smoke test: tokenize every LOINC description and show the first row.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
dataset = datasetLOINC.map(lambda examples: tokenizer(examples["Description"]), batched=True)
JSONOBJ2 = dataset[0]
print(JSONOBJ2)

# Sample prefix filter; the result is only printed for inspection.
sw = datasetLOINC.filter(lambda example: example["Description"].startswith("Allergy"))
print(sw)
print(datasetLOINC)
print(datasetSNOMED)
print(dataseteCQM)
|
28 |
+
|
29 |
+
# play with some dataset tools before the show:
|
30 |
+
|
31 |
+
#print(start_with_ar["Description"])
|
32 |
+
|
33 |
+
#---
|
34 |
+
#Main Stage - Begin!
|
35 |
+
#---
|
36 |
+
|
37 |
+
import os
|
38 |
+
import json
|
39 |
+
import numpy as np
|
40 |
+
import gradio as gr
|
41 |
+
|
42 |
+
# Hub access token read from the process environment; None when the variable is unset.
HF_TOKEN = os.environ.get("HF_TOKEN")

# Terminology names offered by the CheckboxGroup, Radio and Dropdown inputs.
CHOICES = ["SNOMED", "LOINC", "CQM"]

# Sample JSON payload; in the visible part of this file it is referenced only
# from a commented-out `json.loads(JSONOBJ)` output.
JSONOBJ = """{"items":{"item":[{"id": "0001","type": null,"is_good": false,"ppu": 0.55,"batters":{"batter":[{ "id": "1001", "type": "Regular" },{ "id": "1002", "type": "Chocolate" },{ "id": "1003", "type": "Blueberry" },{ "id": "1004", "type": "Devil's Food" }]},"topping":[{ "id": "5001", "type": "None" },{ "id": "5002", "type": "Glazed" },{ "id": "5005", "type": "Sugar" },{ "id": "5007", "type": "Powdered Sugar" },{ "id": "5006", "type": "Chocolate with Sprinkles" },{ "id": "5003", "type": "Chocolate" },{ "id": "5004", "type": "Maple" }]}]}}"""
|
45 |
+
|
46 |
+
|
47 |
+
def profile_dataset(dataset=datasetSNOMED, username="awacke1", token=HF_TOKEN, dataset_name="awacke1/SNOMED-CT-Code-Value-Semantic-Set.csv"):
    """Build a pandas-profiling report for ``dataset`` and publish it as a static Space.

    Args:
        dataset: The data to profile. Anything exposing ``to_pandas()`` is
            converted with it; any other value is passed to ``pd.DataFrame``.
        username: Hub namespace that will own the report Space.
        token: Hub access token used for repo creation and uploads.
        dataset_name: Name used in the report title. Its last ``/``-separated
            component is used as the Space name.

    Returns:
        A message containing the value returned by ``create_repo``.
    """
    # Local imports: the module's top-level imports never bring these names in,
    # so the original body raised NameError on `create_repo` / `upload_file`.
    import tempfile

    from huggingface_hub import create_repo, upload_file

    # The data is already loaded in memory; it is not a CSV path for read_csv.
    df = dataset.to_pandas() if hasattr(dataset, "to_pandas") else pd.DataFrame(dataset)

    # Tables with more than 15 columns get the cheaper "minimal" profile.
    profile = pp.ProfileReport(
        df,
        title=f"{dataset_name} Report",
        minimal=len(df.columns) > 15,
    )

    # `dataset_name` may already carry a namespace ("user/name"); prefixing the
    # username again would give a three-part id, so keep only the final component.
    repo_id = f"{username}/{dataset_name.rsplit('/', 1)[-1]}"
    repo_url = create_repo(repo_id, repo_type="space", token=token, space_sdk="static", private=False)

    # Render into a scratch directory so nothing is left behind in the app's
    # working directory.
    with tempfile.TemporaryDirectory() as workdir:
        report_path = os.path.join(workdir, "index.html")
        profile.to_file(report_path)
        upload_file(path_or_fileobj=report_path, path_in_repo="index.html", repo_id=repo_id, repo_type="space", token=token)

    readme = f"---\ntitle: {dataset_name}\nemoji: ✨\ncolorFrom: green\ncolorTo: red\nsdk: static\npinned: false\ntags:\n- dataset-report\n---"
    # Upload the README as UTF-8 bytes: no temp file, and the emoji cannot trip
    # a non-UTF-8 default locale encoding.
    upload_file(path_or_fileobj=readme.encode("utf-8"), path_in_repo="README.md", repo_id=repo_id, repo_type="space", token=token)

    return f"Your dataset report will be ready at {repo_url}"
|
64 |
+
|
65 |
+
#def lowercase_title(example):
|
66 |
+
# return {"Description": example[title].lower()}
|
67 |
+
|
68 |
+
# demonstrate map function of dataset
|
69 |
+
#JSONOBJ_MAP=datasetLOINC.map(lowercase_title)
|
70 |
+
#JSONOBJ_MAP=datasetLOINC.filter(lambda example: example["Description"].startswith("Mental health"))
|
71 |
+
|
72 |
+
|
73 |
+
|
74 |
+
|
75 |
+
def concatenate_text(examples):
    """Merge the code, description and clinical-focus fields into one text field.

    Args:
        examples: Mapping with the keys ``"Code"``, ``"Description"`` and
            ``"Purpose: Clinical Focus"``.

    Returns:
        ``{"text": ...}`` where the three values are joined with ``" \\n "``.
        A ``None`` value contributes an empty string and non-string values are
        converted with ``str`` (the original ``+`` concatenation raised
        TypeError for both).

    Raises:
        KeyError: If one of the three keys is missing.
    """
    fields = (
        examples["Code"],
        examples["Description"],
        examples["Purpose: Clinical Focus"],
    )
    return {"text": " \n ".join("" if value is None else str(value) for value in fields)}
|
83 |
+
|
84 |
+
def cls_pooling(model_output):
    """Return ``last_hidden_state[:, 0]`` of ``model_output``.

    Selects index 0 along the second axis for every row of the first axis
    (presumably the leading [CLS]-style token of each sequence; confirm
    against the model in use).
    """
    return model_output.last_hidden_state[:, 0]


# Checkpoint used when get_embeddings() is called without an explicit backend.
_EMBEDDING_CHECKPOINT = "sentence-transformers/multi-qa-mpnet-base-dot-v1"
# Lazily filled cache so the checkpoint is loaded at most once per process.
_EMBEDDING_BACKEND = {}


def _load_embedding_backend():
    """Return the default ``(tokenizer, model)`` pair, loading it on first use."""
    published_model = globals().get("model")
    if published_model is not None:
        # A module-level `model` already exists; reuse it with the module-level
        # `tokenizer` instead of loading a second copy.
        # NOTE(review): assumes that tokenizer belongs to that model.
        return globals()["tokenizer"], published_model
    if not _EMBEDDING_BACKEND:
        from transformers import AutoTokenizer, TFAutoModel

        _EMBEDDING_BACKEND["tokenizer"] = AutoTokenizer.from_pretrained(_EMBEDDING_CHECKPOINT)
        _EMBEDDING_BACKEND["model"] = TFAutoModel.from_pretrained(_EMBEDDING_CHECKPOINT, from_pt=True)
    return _EMBEDDING_BACKEND["tokenizer"], _EMBEDDING_BACKEND["model"]


def get_embeddings(text_list, tokenizer=None, model=None):
    """Encode ``text_list`` and return the pooled model output.

    Args:
        text_list: A string or list of strings handed to the tokenizer.
        tokenizer: Optional callable used to encode the text. When omitted,
            the default backend's tokenizer is used.
        model: Optional callable invoked with the encoded inputs as keyword
            arguments. When omitted, the default backend's model is used.

    Returns:
        ``cls_pooling`` applied to the model output.

    The original body read a module-level ``model`` that no module-level code
    defines, and encoded with whichever ``tokenizer`` happened to be bound at
    module level. Resolving both from one backend keeps them matched.
    """
    if tokenizer is None or model is None:
        default_tokenizer, default_model = _load_embedding_backend()
        if tokenizer is None:
            tokenizer = default_tokenizer
        if model is None:
            model = default_model

    encoded_input = tokenizer(
        text_list, padding=True, truncation=True, return_tensors="tf"
    )
    # Copy into a plain dict so it can be splatted as keyword arguments.
    encoded_input = dict(encoded_input.items())
    model_output = model(**encoded_input)
    return cls_pooling(model_output)
|
94 |
+
|
95 |
+
|
96 |
+
def fn(text1, text2, num, slider1, slider2, single_checkbox, checkboxes, radio, dropdown, im1, im2, im3, im4,
       video, audio1, audio2, file, df1, df2,):
    """Gradio callback: look up LOINC rows by prefix and echo the demo widgets.

    Args:
        text1: Prefix searched in the LOINC ``Description`` column; a blank
            value falls back to ``"Allergy"``.
        text2: Second text value; echoed when ``single_checkbox`` is falsy.
        num, slider1, slider2: Numbers turned into the three Label scores.
        single_checkbox: Selects ``text1`` (truthy) or ``text2`` for the text output.
        checkboxes: Selected terminology names, joined into the text output.
        radio: Selected terminology, embedded in the HTML output.
        dropdown, im2, im3, im4, audio2, file: Accepted but not used.
        im1, video, audio1: Media inputs; bundled sample files are returned
            when they are ``None``.
        df1, df2: Passed through to the Dataframe / Timeseries outputs.

    Returns:
        A 13-tuple, one value for each output component of the interface.
    """
    # Publish the loaded pair at module level: get_embeddings() looks the
    # names up there, and this also avoids reloading on every request.
    global tokenizer, model

    from datasets import Dataset

    search_term = text1 or "Allergy"

    # LOINC rows whose description starts with the search term.
    loinc_matches = datasetLOINC.filter(
        lambda example: (example["Description"] or "").startswith(search_term)
    )

    # Drop only columns that exist and are not wanted. The original symmetric
    # difference also listed wanted-but-absent columns for removal.
    columns_to_keep = ["Value Set Name", "Code", "Description", "Purpose: Clinical Focus", "Code System OID"]
    columns_to_remove = [name for name in loinc_matches.column_names if name not in columns_to_keep]
    if columns_to_remove:
        loinc_matches = loinc_matches.remove_columns(columns_to_remove)
    loinc_matches.set_format("pandas")
    df = loinc_matches[:]

    # One row per clinical-focus entry, then keep descriptions longer than 15 words.
    df4 = df.explode("Purpose: Clinical Focus", ignore_index=True)
    clinical_dataset = Dataset.from_pandas(df4)
    clinical_dataset = clinical_dataset.map(lambda x: {"c_length": len((x["Description"] or "").split())})
    clinical_dataset = clinical_dataset.filter(lambda x: x["c_length"] > 15)
    clinical_dataset = clinical_dataset.map(concatenate_text)

    # Load the sentence-embedding checkpoint once and reuse it afterwards.
    if globals().get("model") is None:
        from transformers import AutoTokenizer, TFAutoModel

        model_ckpt = "sentence-transformers/multi-qa-mpnet-base-dot-v1"
        tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
        model = TFAutoModel.from_pretrained(model_ckpt, from_pt=True)

    # TODO: embeddings are computed but not consumed yet; the FAISS index and
    # nearest-example lookup are still to be wired in.
    embeddings_dataset = clinical_dataset.map(
        lambda x: {"embeddings": get_embeddings(x["text"]).numpy()[0]})

    # SNOMED and CQM: fixed demo prefixes, results are only printed.
    snomed_matches = datasetSNOMED.filter(lambda example: (example["Description"] or "").startswith('Hospital'))
    cqm_matches = dataseteCQM.filter(lambda example: (example["Description"] or "").startswith('Telephone'))

    print(loinc_matches)
    print(snomed_matches)
    print(cqm_matches)

    # Guard the Label scores against a zero total.
    total = num + slider1 + slider2
    if total:
        scores = {
            "positive": num / total,
            "negative": slider1 / total,
            "neutral": slider2 / total,
        }
    else:
        scores = {"positive": 0.0, "negative": 0.0, "neutral": 0.0}

    here = os.path.dirname(__file__)

    return (
        (text1 if single_checkbox else text2)
        + ", selected:"
        + ", ".join(checkboxes),  # Text
        scores,  # Label
        (audio1[0], np.flipud(audio1[1]))
        if audio1 is not None else os.path.join(here, "files/cantina.wav"),  # Audio
        np.flipud(im1)
        if im1 is not None else os.path.join(here, "files/cheetah1.jpg"),  # Image
        video
        if video is not None else os.path.join(here, "files/world.mp4"),  # Video
        [
            ("The", "art"),
            ("quick brown", "adj"),
            ("fox", "nn"),
            ("jumped", "vrb"),
            ("testing testing testing", None),
            ("over", "prp"),
            ("the", "art"),
            ("testing", None),
            ("lazy", "adj"),
            ("dogs", "nn"),
            (".", "punc"),
        ] + [(f"test {x}", f"test {x}") for x in range(10)],  # HighlightedText
        [
            ("The testing testing testing", None),
            ("over", 0.6),
            ("the", 0.2),
            ("testing", None),
            ("lazy", -0.1),
            ("dogs", 0.4),
            (".", 0),
        ] + [(f"test", x / 10) for x in range(-10, 10)],  # HighlightedText
        # JSON: the matched LOINC rows as records. The original call passed
        # path_or_buf="None", which names an output file rather than returning JSON.
        json.loads(df.to_json(orient="records")),
        "<button style='background-color: red'>Click Me: " + radio + "</button>",  # HTML
        os.path.join(here, "files/titanic.csv"),  # File
        df1,  # Dataframe
        np.random.randint(0, 10, (4, 4)),  # Dataframe
        df2,  # Timeseries
    )
|
253 |
+
|
254 |
+
|
255 |
+
|
256 |
+
# Directory of this script; bundled sample media is expected under "files/".
_APP_DIR = os.path.dirname(__file__)


def _asset(relative_path):
    """Return the path of a bundled sample file relative to this script."""
    return os.path.join(_APP_DIR, relative_path)


# One example row; the values line up one-to-one with the `inputs` list below.
_EXAMPLE_ROW = [
    "Allergy",
    "Admission",
    10,
    12,
    4,
    True,
    ["SNOMED", "LOINC", "CQM"],
    "SNOMED",
    "LOINC",  # was "bar", which is not one of the Dropdown choices
    _asset("files/cheetah1.jpg"),
    _asset("files/cheetah1.jpg"),
    _asset("files/cheetah1.jpg"),
    _asset("files/cheetah1.jpg"),
    _asset("files/world.mp4"),
    _asset("files/cantina.wav"),
    _asset("files/cantina.wav"),
    _asset("files/titanic.csv"),
    [[1, 2, 3], [3, 4, 5]],
    _asset("files/time.csv"),
]

demo = gr.Interface(
    fn,
    inputs=[
        gr.Textbox(value="Allergy", label="Textbox"),
        gr.Textbox(lines=3, value="Bathing", placeholder="Type here..", label="Textbox 2"),
        gr.Number(label="Number", value=42),
        gr.Slider(10, 20, value=15, label="Slider: 10 - 20"),
        gr.Slider(maximum=20, step=0.04, label="Slider: step @ 0.04"),
        gr.Checkbox(label="Check for NER Match on Submit"),
        gr.CheckboxGroup(label="Clinical Terminology to Check", choices=CHOICES, value=CHOICES[0:2]),
        gr.Radio(label="Preferred Terminology Output", choices=CHOICES, value=CHOICES[2]),
        gr.Dropdown(label="Dropdown", choices=CHOICES),
        gr.Image(label="Image"),
        gr.Image(label="Image w/ Cropper", tool="select"),
        gr.Image(label="Sketchpad", source="canvas"),
        gr.Image(label="Webcam", source="webcam"),
        gr.Video(label="Video"),
        gr.Audio(label="Audio"),
        gr.Audio(label="Microphone", source="microphone"),
        gr.File(label="File"),
        gr.Dataframe(label="Filters", headers=["Name", "Age", "Gender"]),
        gr.Timeseries(x="time", y=["price", "value"], colors=["pink", "purple"]),
    ],
    outputs=[
        gr.Textbox(label="Textbox"),
        gr.Label(label="Label"),
        gr.Audio(label="Audio"),
        gr.Image(label="Image"),
        gr.Video(label="Video"),
        gr.HighlightedText(label="HighlightedText", color_map={"punc": "pink", "test 0": "blue"}),
        gr.HighlightedText(label="HighlightedText", show_legend=True),
        gr.JSON(label="JSON"),
        gr.HTML(label="HTML"),
        gr.File(label="File"),
        gr.Dataframe(label="Dataframe"),
        gr.Dataframe(label="Numpy"),
        gr.Timeseries(x="time", y=["price", "value"], label="Timeseries"),
    ],
    # The same example row is listed three times, as in the original.
    examples=[_EXAMPLE_ROW] * 3,
    theme="default",
    title="⚗️🧠🔬🧬 Clinical Terminology Auto Mapper AI 👩⚕️🩺⚕️🙋",
    cache_examples=False,
    description="Clinical Terminology Auto Mapper AI",
    article="Learn more at [Yggdrasil](https://github.com/AaronCWacker/Yggdrasil)",
    # live=True,
)

if __name__ == "__main__":
    demo.launch(debug=True)
|
requirements.txt
ADDED
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
datasets
|
2 |
+
transformers
|
3 |
+
pandas-profiling
|
4 |
+
huggingface-hub
|
5 |
+
gradio
|
6 |
+
Tensorflow
|
7 |
+
torch
|