"""Gradio demo: clinical terminology lookup over LOINC, SNOMED and eCQM code sets.

Importing this module loads three Hugging Face datasets, tokenizes the LOINC
descriptions once as a warm-up, and builds the ``demo`` Gradio interface.
"""
import functools
import json
import os

import gradio as gr
import numpy as np
import pandas as pd
import pandas_profiling as pp
import tensorflow as tf
from datasets import Dataset, load_dataset
from tensorflow.python.framework import tensor_shape
from transformers import AutoTokenizer, TFAutoModel

# LOINC
datasetLOINC = load_dataset("awacke1/LOINC-CodeSet-Value-Description.csv", split="train")
# SNOMED
datasetSNOMED = load_dataset("awacke1/SNOMED-CT-Code-Value-Semantic-Set.csv", split="train")
# eCQM
dataseteCQM = load_dataset("awacke1/eCQM-Code-Value-Semantic-Set.csv", split="train")

# Demonstrate Dataset.map by tokenizing every LOINC description.
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
dataset = datasetLOINC.map(lambda examples: tokenizer(examples["Description"]), batched=True)
JSONOBJ2 = dataset[0]
print(JSONOBJ2)

# Demonstrate Dataset.filter before the main app is built.
sw = datasetLOINC.filter(lambda example: example["Description"].startswith("Allergy"))
print(sw)
print(datasetLOINC)
print(datasetSNOMED)
print(dataseteCQM)

# ---
# Main stage
# ---
HF_TOKEN = os.environ.get("HF_TOKEN")
CHOICES = ["SNOMED", "LOINC", "CQM"]
JSONOBJ = """{"items":{"item":[{"id": "0001","type": null,"is_good": false,"ppu": 0.55,"batters":{"batter":[{ "id": "1001", "type": "Regular" },{ "id": "1002", "type": "Chocolate" },{ "id": "1003", "type": "Blueberry" },{ "id": "1004", "type": "Devil's Food" }]},"topping":[{ "id": "5001", "type": "None" },{ "id": "5002", "type": "Glazed" },{ "id": "5005", "type": "Sugar" },{ "id": "5007", "type": "Powdered Sugar" },{ "id": "5006", "type": "Chocolate with Sprinkles" },{ "id": "5003", "type": "Chocolate" },{ "id": "5004", "type": "Maple" }]}]}}"""

# Checkpoint used for sentence embeddings; tokenizer and model must come from
# the same checkpoint, so they are always loaded together.
EMBEDDING_CHECKPOINT = "sentence-transformers/multi-qa-mpnet-base-dot-v1"

# Columns of the LOINC matches that are kept for display and embedding.
LOINC_COLUMNS_TO_KEEP = [
    "Value Set Name",
    "Code",
    "Description",
    "Purpose: Clinical Focus",
    "Code System OID",
]


def _demo_file(name):
    """Return the path of a bundled demo asset under ``files/`` next to this script."""
    return os.path.join(os.path.dirname(__file__), "files/" + name)


def profile_dataset(
    dataset=datasetSNOMED,
    username="awacke1",
    token=HF_TOKEN,
    dataset_name="awacke1/SNOMED-CT-Code-Value-Semantic-Set.csv",
):
    """Build a pandas-profiling report for ``dataset`` and publish it as a static Space.

    Args:
        dataset: A ``datasets.Dataset`` to profile.
        username: Hub account that will own the Space.
        token: Hub access token passed to the upload helpers.
        dataset_name: Name used in the report title, the README title and,
            with any ``owner/`` prefix dropped, as the Space name.

    Returns:
        A message containing the URL returned by ``create_repo``.
    """
    # A Dataset has no ``Description`` attribute to feed to ``pd.read_csv``;
    # convert the in-memory table directly instead.
    df = dataset.to_pandas()

    report_options = {"title": f"{dataset_name} Report"}
    if len(df.columns) > 15:
        # Wide tables get the cheaper "minimal" report.
        report_options["minimal"] = True
    profile = pp.ProfileReport(df, **report_options)

    # ``dataset_name`` may already carry an ``owner/`` prefix; keep only the
    # last component so the repo id has a single namespace separator.
    repo_id = f"{username}/{dataset_name.rsplit('/', 1)[-1]}"

    # NOTE(review): ``create_repo`` and ``upload_file`` are not imported anywhere
    # in this file; they look like the huggingface_hub helpers. Import them
    # before re-enabling the (currently commented-out) call to this function.
    repo_url = create_repo(repo_id, repo_type="space", token=token, space_sdk="static", private=False)

    profile.to_file("./index.html")
    upload_file(
        path_or_fileobj="./index.html",
        path_in_repo="index.html",
        repo_id=repo_id,
        repo_type="space",
        token=token,
    )

    readme = f"---\ntitle: {dataset_name}\nemoji: ✨\ncolorFrom: green\ncolorTo: red\nsdk: static\npinned: false\ntags:\n- dataset-report\n---"
    # The README contains an emoji, so the encoding must be explicit.
    with open("README.md", "w", encoding="utf-8") as f:
        f.write(readme)
    upload_file(
        path_or_fileobj="./README.md",
        path_in_repo="README.md",
        repo_id=repo_id,
        repo_type="space",
        token=token,
    )
    return f"Your dataset report will be ready at {repo_url}"


def concatenate_text(examples):
    """Join code, description and clinical focus of one row into a ``text`` field."""
    parts = [
        examples["Code"],
        examples["Description"],
        examples["Purpose: Clinical Focus"],
    ]
    return {"text": " \n ".join(parts)}


def cls_pooling(model_output):
    """Return the last hidden state at position 0 of every sequence in the batch."""
    return model_output.last_hidden_state[:, 0]


@functools.lru_cache(maxsize=2)
def _load_embedding_model(model_ckpt=EMBEDDING_CHECKPOINT):
    """Load the tokenizer/model pair for ``model_ckpt`` once and reuse it afterwards."""
    embedding_tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
    embedding_model = TFAutoModel.from_pretrained(model_ckpt, from_pt=True)
    return embedding_tokenizer, embedding_model


def get_embeddings(text_list):
    """Embed ``text_list`` with the sentence-embedding checkpoint.

    Args:
        text_list: A string or list of strings accepted by the tokenizer.

    Returns:
        The pooled model output produced by ``cls_pooling``.
    """
    # Use the tokenizer that belongs to the embedding model, not the module-level
    # BERT tokenizer, and do not depend on a global ``model`` that was never defined.
    embedding_tokenizer, embedding_model = _load_embedding_model()
    encoded_input = embedding_tokenizer(
        text_list, padding=True, truncation=True, return_tensors="tf"
    )
    model_output = embedding_model(**dict(encoded_input))
    return cls_pooling(model_output)


def _keep_columns(table, columns_to_keep):
    """Return ``table`` without the columns that are not listed in ``columns_to_keep``."""
    # Only names that actually exist are removed; a symmetric difference would also
    # yield wanted-but-missing names and make ``remove_columns`` raise.
    extra_columns = [name for name in table.column_names if name not in columns_to_keep]
    return table.remove_columns(extra_columns)


def fn(
    text1,
    text2,
    num,
    slider1,
    slider2,
    single_checkbox,
    checkboxes,
    radio,
    dropdown,
    im1,
    im2,
    im3,
    im4,
    video,
    audio1,
    audio2,
    file,
    df1,
    df2,
):
    """Gradio callback: look up terminology matches and echo the demo inputs.

    The parameters map one-to-one, in order, onto the ``inputs`` of ``demo``;
    the returned tuple maps one-to-one, in order, onto its ``outputs``.

    Returns:
        A 13-tuple: text, label confidences, audio, image, video, two
        highlighted-text lists, the LOINC matches as JSON records, HTML,
        a file path, the input dataframe, a random 4x4 array and the
        timeseries input.
    """
    # --- LOINC -----------------------------------------------------------
    # The prefixes are fixed for now; the text inputs do not drive the search yet.
    loinc_matches = datasetLOINC.filter(
        lambda example: example["Description"].startswith("Allergy")
    )
    loinc_matches = _keep_columns(loinc_matches, LOINC_COLUMNS_TO_KEEP)
    loinc_matches.set_format("pandas")
    loinc_df = loinc_matches[:]

    # Prepare the rows for semantic search: one row per clinical focus, only
    # descriptions longer than 15 words, with a combined ``text`` field.
    exploded_df = loinc_df.explode("Purpose: Clinical Focus", ignore_index=True)
    clinical_dataset = Dataset.from_pandas(exploded_df)
    clinical_dataset = clinical_dataset.map(
        lambda x: {"c_length": len(x["Description"].split())}
    )
    clinical_dataset = clinical_dataset.filter(lambda x: x["c_length"] > 15)
    clinical_dataset = clinical_dataset.map(concatenate_text)

    # Embeddings are computed but not consumed yet: the FAISS step
    # (``add_faiss_index`` + ``get_nearest_examples`` on a question embedding)
    # is still disabled.
    embeddings_dataset = clinical_dataset.map(
        lambda x: {"embeddings": get_embeddings(x["text"]).numpy()[0]}
    )

    # --- SNOMED and CQM --------------------------------------------------
    snomed_matches = datasetSNOMED.filter(
        lambda example: example["Description"].startswith("Hospital")
    )
    cqm_matches = dataseteCQM.filter(
        lambda example: example["Description"].startswith("Telephone")
    )
    print(loinc_matches)
    print(snomed_matches)
    print(cqm_matches)

    # ``Dataset.to_json`` writes a file and returns a byte count, so build the
    # JSON payload from the pandas frame instead.
    loinc_records = json.loads(loinc_df.to_json(orient="records"))

    # Guard the normalisation: the three numbers can sum to zero.
    total = num + slider1 + slider2
    if total:
        confidences = {
            "positive": num / total,
            "negative": slider1 / total,
            "neutral": slider2 / total,
        }
    else:
        confidences = {"positive": 0.0, "negative": 0.0, "neutral": 0.0}

    if audio1 is not None:
        audio_out = (audio1[0], np.flipud(audio1[1]))
    else:
        audio_out = _demo_file("cantina.wav")
    image_out = np.flipud(im1) if im1 is not None else _demo_file("cheetah1.jpg")
    video_out = video if video is not None else _demo_file("world.mp4")

    tagged_text = [
        ("The", "art"),
        ("quick brown", "adj"),
        ("fox", "nn"),
        ("jumped", "vrb"),
        ("testing testing testing", None),
        ("over", "prp"),
        ("the", "art"),
        ("testing", None),
        ("lazy", "adj"),
        ("dogs", "nn"),
        (".", "punc"),
    ] + [(f"test {x}", f"test {x}") for x in range(10)]
    scored_text = [
        ("The testing testing testing", None),
        ("over", 0.6),
        ("the", 0.2),
        ("testing", None),
        ("lazy", -0.1),
        ("dogs", 0.4),
        (".", 0),
    ] + [("test", x / 10) for x in range(-10, 10)]

    return (
        (text1 if single_checkbox else text2) + ", selected:" + ", ".join(checkboxes),  # Text
        confidences,  # Label
        audio_out,  # Audio
        image_out,  # Image
        video_out,  # Video
        tagged_text,  # HighlightedText
        scored_text,  # HighlightedText
        loinc_records,  # JSON
        "",  # HTML
        _demo_file("titanic.csv"),  # File
        df1,  # Dataframe
        np.random.randint(0, 10, (4, 4)),  # Dataframe
        df2,  # Timeseries
    )


demo = gr.Interface(
    fn,
    inputs=[
        gr.Textbox(value="Allergy", label="Textbox"),
        gr.Textbox(lines=3, value="Bathing", placeholder="Type here..", label="Textbox 2"),
        gr.Number(label="Number", value=42),
        gr.Slider(10, 20, value=15, label="Slider: 10 - 20"),
        gr.Slider(maximum=20, step=0.04, label="Slider: step @ 0.04"),
        gr.Checkbox(label="Check for NER Match on Submit"),
        gr.CheckboxGroup(label="Clinical Terminology to Check", choices=CHOICES, value=CHOICES[0:2]),
        gr.Radio(label="Preferred Terminology Output", choices=CHOICES, value=CHOICES[2]),
        gr.Dropdown(label="Dropdown", choices=CHOICES),
        gr.Image(label="Image"),
        gr.Image(label="Image w/ Cropper", tool="select"),
        gr.Image(label="Sketchpad", source="canvas"),
        gr.Image(label="Webcam", source="webcam"),
        gr.Video(label="Video"),
        gr.Audio(label="Audio"),
        gr.Audio(label="Microphone", source="microphone"),
        gr.File(label="File"),
        gr.Dataframe(label="Filters", headers=["Name", "Age", "Gender"]),
        gr.Timeseries(x="time", y=["price", "value"], colors=["pink", "purple"]),
    ],
    outputs=[
        gr.Textbox(label="Textbox"),
        gr.Label(label="Label"),
        gr.Audio(label="Audio"),
        gr.Image(label="Image"),
        gr.Video(label="Video"),
        gr.HighlightedText(label="HighlightedText", color_map={"punc": "pink", "test 0": "blue"}),
        gr.HighlightedText(label="HighlightedText", show_legend=True),
        gr.JSON(label="JSON"),
        gr.HTML(label="HTML"),
        gr.File(label="File"),
        gr.Dataframe(label="Dataframe"),
        gr.Dataframe(label="Numpy"),
        gr.Timeseries(x="time", y=["price", "value"], label="Timeseries"),
    ],
    examples=[
        [
            "Allergy",
            "Admission",
            10,
            12,
            4,
            True,
            ["SNOMED", "LOINC", "CQM"],
            "SNOMED",
            "bar",
            _demo_file("cheetah1.jpg"),
            _demo_file("cheetah1.jpg"),
            _demo_file("cheetah1.jpg"),
            _demo_file("cheetah1.jpg"),
            _demo_file("world.mp4"),
            _demo_file("cantina.wav"),
            _demo_file("cantina.wav"),
            _demo_file("titanic.csv"),
            [[1, 2, 3], [3, 4, 5]],
            _demo_file("time.csv"),
        ]
    ]
    * 3,
    theme="default",
    title="⚗️🧠🔬🧬 Clinical Terminology Auto Mapper AI 👩‍⚕️🩺⚕️🙋",
    cache_examples=False,
    description="Clinical Terminology Auto Mapper AI",
    article="Learn more at [Yggdrasil](https://github.com/AaronCWacker/Yggdrasil)",
    # live=True,
)

if __name__ == "__main__":
    demo.launch(debug=True)