Spaces:

HengJay
/

snomed-ct-assistant

Sleeping

App Files Files Community

HengJay committed on May 22, 2024

Commit

45a019e

1 Parent(s): 693a89d

first commit to HF spaces.

Browse files

Files changed (13) hide show

.gitattributes +1 -0
.gitignore +140 -0
README.md +4 -4
SNOMED-CT_Assistant.py +150 -0
pages/Vector DB of SNOMED-CT.py +59 -0
requirements.txt +8 -0
snomed-entity-challenge.csv +0 -0
snomed_ct_id_term_1410k/c8390385-a5b9-4ff6-89cd-f8bf8a760fbb/data_level0.bin +3 -0
snomed_ct_id_term_1410k/c8390385-a5b9-4ff6-89cd-f8bf8a760fbb/header.bin +3 -0
snomed_ct_id_term_1410k/c8390385-a5b9-4ff6-89cd-f8bf8a760fbb/index_metadata.pickle +3 -0
snomed_ct_id_term_1410k/c8390385-a5b9-4ff6-89cd-f8bf8a760fbb/length.bin +3 -0
snomed_ct_id_term_1410k/c8390385-a5b9-4ff6-89cd-f8bf8a760fbb/link_lists.bin +3 -0
snomed_ct_id_term_1410k/chroma.sqlite3 +3 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+*.sqlite3 filter=lfs diff=lfs merge=lfs -text

.gitignore ADDED Viewed

	@@ -0,0 +1,140 @@

+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+# C extensions
+*.so
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+# Translations
+*.mo
+*.pot
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+# Flask stuff:
+instance/
+.webassets-cache
+# Scrapy stuff:
+.scrapy
+# Sphinx documentation
+docs/_build/
+# PyBuilder
+.pybuilder/
+target/
+# Jupyter Notebook
+.ipynb_checkpoints
+# IPython
+profile_default/
+ipython_config.py
+# pyenv
+#   For a library or package, you might want to ignore these files since the code is
+#   intended to run in multiple environments; otherwise, check them in:
+# .python-version
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow
+__pypackages__/
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+# SageMath parsed files
+*.sage.py
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+# Spyder project settings
+.spyderproject
+.spyproject
+# Rope project settings
+.ropeproject
+# mkdocs documentation
+/site
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+# Pyre type checker
+.pyre/
+# pytype static type analyzer
+.pytype/
+# Cython debug symbols
+cython_debug/
+# End of https://mrkandreev.name/snippets/gitignore-generator/#Python

README.md CHANGED Viewed

@@ -1,11 +1,11 @@
 ---
 title: Snomed Ct Assistant
-emoji: 👀
-colorFrom: gray
-colorTo: indigo
 sdk: streamlit
 sdk_version: 1.34.0
-app_file: app.py
 pinned: false
 ---

 ---
 title: Snomed Ct Assistant
+emoji: 🏥
+colorFrom: yellow
+colorTo: pink
 sdk: streamlit
 sdk_version: 1.34.0
+app_file: SNOMED-CT_Assistant.py
 pinned: false
 ---

SNOMED-CT_Assistant.py ADDED Viewed

	@@ -0,0 +1,150 @@

+import os
+import random
+import streamlit as st
+from openai import OpenAI
+from dotenv import load_dotenv
+import pandas as pd
+# configure sqlite3
+# __import__('pysqlite3')
+# import sys
+# sys.modules['sqlite3'] = sys.modules.pop('pysqlite3')
+st.set_page_config(layout="wide")
+# Key source switch: True reads the key from Streamlit secrets, False reads it
+# from the process environment after loading a local .env file.
+remote = True
+# Bind the name up front so it exists even when no key is configured;
+# previously a missing secret left `openai_api_key` undefined and the later
+# client construction failed with a NameError.
+openai_api_key = None
+if remote:
+    with st.sidebar:
+        if 'OPENAI_API_TOKEN' in st.secrets:
+            st.success('API key already provided!', icon='✅')
+            openai_api_key = st.secrets['OPENAI_API_TOKEN']
+else:
+    load_dotenv()
+    # Environment variable names are case-sensitive on POSIX systems: keep the
+    # original spelling first, then fall back to the conventional upper-case one.
+    openai_api_key = (os.environ.get("OpenAI_API_KEY")
+                      or os.environ.get("OPENAI_API_KEY"))
+if not openai_api_key:
+    # Halt this script run here instead of crashing further down when the
+    # OpenAI client is created without a key.
+    st.info("Please add your OpenAI API key to continue.")
+    st.stop()
+st.title("🏥 SNOMED-CT Assistant")
+st.caption("👩‍⚕️ A smart medical assistant with SNOMED-CT knowledge.")
+# System prompt
+system_prompt = """You are a medical expert with rich experience in SNOMED-CT professional knowledge.
+You are skilled at assisting medical professionals and answering questions in the medical field. You are patient, helpful and professional.
+Please refuse to answer inquiries and requests unrelated to the medical field, in order to maintain professionalism in medicine.
+As an experienced professional, you possess deep expertise in the field of SNOMED CT Entity Linking.
+You have a thorough understanding of the relevant workflows and critical aspects involved, encompassing:
+- Processing electronic medical records (EHRs), Adept handling of electronic medical record (EMR) data processing
+- Entity Identification, Proficient entity recognition capabilities, identifying and extracting relevant medical concepts from unstructured text
+- Skilled Entity Mapping, accurately linking identified entities to their corresponding SNOMED CT concepts
+- Seamless integration and output of clinical terminology, ensuring the accurate representation and utilization of standardized medical language
+- Patiently and professionally respond to all SNOMED CT related inquiries, even if the user repeats questions.
+- Demonstrate deep expertise in the standard SNOMED CT Entity Linking workflow, which involves:
+  1. Performing Entity Identification to extract relevant medical terminology from the input.
+  2. Conducting Entity Mapping to link the identified entities to their corresponding SNOMED CT concepts.
+- Present the results in a tabular format only with the following 3 columns: "Identified Entity", "SNOMED CT Concept IDs", "SNOMED CT Descriptions".
+Here is the practical entity linking process example:
+- the input text in EHRs: "Patient referred for a biopsy to investigate potential swelling in upper larynx."
+- the identified entity: "biopsy", "larynx"
+- the mapped SNOMED CT concepts id & descriptions: "274317003 | Laryngoscopic biopsy larynx (procedure)", "4596009 | Laryngeal structure (body structure)"
+List out as many potential SNOMED entities as possible from the original medical text description,
+including Diseases, Diagnoses, Clinical Findings (like Signs and Symptoms),
+Procedures (Surgical, Therapeutic, Diagnostic, Nursing), Specimen Types, Living Organisms,
+Observables (for example heart rate), Physical Objects and Forces,
+Chemicals (including the chemicals used in drug preparations), Drugs (pharmaceutical products),
+Human Anatomy (body structures, organisms), Physiological Processes and Functions,
+Patients' Occupations, Patients' Social Contexts (e.g., religion and ethnicity), and various other types from the SNOMED CT standard.
+Numbers or units related symbols are not included in this range and can be ignored.
+Output Format Requirements (Must follow):
+- Present the results in a tabular format with the following 3 columns only: "Identified Entity", "SNOMED CT Concept IDs", and "SNOMED CT Descriptions". Do not arbitrarily replace the column names, as that would lead to unclear output.
+- The table should be easy to read and understand, with each row displaying the identified medical entity, its corresponding SNOMED CT concept ID, and the full SNOMED CT description.
+- Ensure the formatting and organization of the table is clean and professional, optimized for the user's ease of reference.
+Your comprehensive knowledge and mastery of these key components make you an invaluable asset in the realm of biomedical natural language processing and knowledge extraction.
+With your specialized expertise, you are able to navigate the complexities of SNOMED CT Entity Linking with ease, delivering accurate and reliable results that support various healthcare and research applications.
+When answering questions, except for the use of English for medical-related terminology,  always respond in Traditional Chinese (zh-TW).
+If there are any SNOMED-CT related medical professional terms, please provide the original text in parentheses afterwards."""
+# Func: generate random med text
+raw_text_df = pd.read_csv('snomed-entity-challenge.csv')
+def random_med_text(text_df):
+    """Pick one random row of `text_df` and split its text into three parts.
+
+    The chosen ``text`` value is split on the ``###TEXT:`` marker and the part
+    after it is split again on the ``###RESPONSE:`` marker.
+
+    Args:
+        text_df: pandas DataFrame with a ``text`` column of strings.
+
+    Returns:
+        A tuple ``(index, human, med_text, response)``: the position of the
+        chosen row, the text before ``###TEXT:``, the text between the two
+        markers, and the text after ``###RESPONSE:``.
+
+    Raises:
+        ValueError: if `text_df` has no rows.
+        IndexError: if the chosen text lacks one of the two markers.
+    """
+    rows = len(text_df['text'])
+    # randrange() excludes the upper bound. random.randint(0, rows) is
+    # inclusive on both ends and could return `rows`, one past the last row.
+    index = random.randrange(rows)
+    # The index is a position in [0, rows), so look it up positionally.
+    raw_text = text_df["text"].iloc[index]
+    text_parts = raw_text.split('###TEXT:')
+    response_parts = text_parts[1].split('###RESPONSE:')
+    human = text_parts[0]
+    med_text = response_parts[0]
+    response = response_parts[1]
+    return index, human, med_text, response
+# Func: wrap a medical text in the entity-linking request prompt
+def generate_med_prompt(medical_text):
+    """Return the user prompt that embeds `medical_text` for entity linking.
+
+    Args:
+        medical_text: the medical record text to insert into the prompt.
+
+    Returns:
+        The prompt string, with `medical_text` placed on its own line.
+    """
+    med_prompt = f"""請協助我做電子病歷 (Electronic Health Record, EHR) 的 SNOMED-CT Entity Linking 的處理， 這是原本的病歷文字:  \n {medical_text} \n """
+    return med_prompt
+# test_prompt = """請協助我做 EHR 的 SNOMED CT Entity Linking 的處理， 這是原本的病歷文字:
+# "Patient referred for a biopsy to investigate potential swelling in upper larynx."
+# ，首先做 Entity Identification，列出醫學相關術語片段，接著做 Entity Mapping，將對應的 SNOMED CT 術語列出。
+# 輸出格式用表格，欄位是 "identified entity", "SNOMED CT concept ids", "SNOMED CT descriptions"。"""
+# OpenAI client and the chat model used for every completion request.
+client = OpenAI(api_key=openai_api_key)
+model_tag = "gpt-3.5-turbo"
+def chat_input(prompt):
+    """Send `prompt` to the chat model and render both sides of the exchange.
+
+    The user message is appended to ``st.session_state.messages`` and shown,
+    then the whole message history is sent to the model; the reply is
+    appended to the history and shown as the assistant message.
+
+    Args:
+        prompt: the user's message text.
+    """
+    history = st.session_state.messages
+    history.append({"role": "user", "content": prompt})
+    st.chat_message("user").write(prompt)
+    # The spinner stays visible until the reply has been stored and rendered.
+    with st.spinner("Thinking..."):
+        completion = client.chat.completions.create(
+            model=model_tag,
+            messages=history,
+            temperature=0.5,
+        )
+        reply = completion.choices[0].message.content
+        history.append({"role": "assistant", "content": reply})
+        st.chat_message("assistant").write(reply)
+# Seed the conversation once per session: the system prompt plus an assistant
+# greeting.
+if "messages" not in st.session_state:
+    st.session_state["messages"] = [{"role": "system", "content": system_prompt},
+                                    {"role": "assistant", "content": "👩‍⚕️ 您好，我是您的專業醫學助理。請問有任何我可以協助你的地方嗎?"}]
+# Re-draw the stored conversation on this run; the system prompt is kept in
+# the history but never displayed.
+for msg in st.session_state.messages:
+    if msg["role"] == "system":
+        continue
+    st.chat_message(msg["role"]).write(msg["content"])
+# Handle a newly submitted chat message, if any.
+if prompt := st.chat_input():
+    # Without an API key, show a notice and end this script run.
+    if not openai_api_key:
+        st.info("Please add your OpenAI API key to continue.")
+        st.stop()
+    chat_input(prompt)
+    # st.session_state.messages.append({"role": "user", "content": prompt})
+    # st.chat_message("user").write(prompt)
+    # with st.spinner("Thinking..."):
+    #     response = client.chat.completions.create(model="gpt-3.5-turbo", messages=st.session_state.messages)
+    #     msg = response.choices[0].message.content
+    #     st.session_state.messages.append({"role": "assistant", "content": msg})
+    #     st.chat_message("assistant").write(msg)
+# "Example Input": send a fixed sample sentence through the entity-linking prompt.
+if st.sidebar.button("Example Input",type="primary"):
+    med_prompt = generate_med_prompt("Patient referred for a biopsy to investigate potential swelling in upper larynx.")
+    chat_input(med_prompt)
+# "Random Input": send a random text from the loaded CSV and show its
+# reference response in the sidebar.
+if st.sidebar.button("Random Input",type="primary"):
+    index, human, med_text, response = random_med_text(raw_text_df)
+    # Replace each comma with two spaces + newline (a Markdown line break) so
+    # the reference response is rendered one item per line.
+    response = response.replace(",","  \n")
+    med_prompt = generate_med_prompt(med_text)
+    chat_input(med_prompt)
+    st.sidebar.write(f"[Random Text](https://huggingface.co/datasets/JaimeML/snomed-entity-challenge) Index: {index}")
+    st.sidebar.markdown(f"Ref Entity:  \n  {response}")
+# model_tag = st.sidebar.selectbox(
+#     "Which model do you want to chat with?",
+#     ("gpt-4o", "gpt-3.5-turbo")
+# )

pages/Vector DB of SNOMED-CT.py ADDED Viewed

	@@ -0,0 +1,59 @@

+from timeit import default_timer as timer
+import streamlit as st
+import chromadb
+import pandas as pd
+import numpy as np
+# configure sqlite3
+# __import__('pysqlite3')
+# import sys
+# sys.modules['sqlite3'] = sys.modules.pop('pysqlite3')
+st.set_page_config(layout="wide")
+# App title and caption (caption spelling corrected: "related description").
+st.title("📚 Semantic Search with Vector Database of SNOMED-CT 💡")
+st.caption("🔍 Search any SNOMED-CT related description & concept with natural language.")
+# Sidebar: number of results to request (min 5, max 30, default 10).
+st.sidebar.title("🔍 Search Setting")
+query_number = st.sidebar.slider("Query Numbers", 5, 30, 10)
+# Main area: free-text query box, pre-filled with a sample query.
+st.markdown("##### ➡️⌨️ Please input some medical description here, e.g. \"insomnia two nights a week.\", \"COPD\", \"Degenerative Joint Disease\"")
+query_text = st.text_input("Input: any medical description snippet","Type-2 Diabetes")
+# Chroma DB client: open the on-disk database directory and fetch the
+# collection by name (get_or_create_collection creates it if it is absent).
+chroma_client = chromadb.PersistentClient(path="snomed_ct_id_term_1410k")
+collection = chroma_client.get_or_create_collection(name="snomed_ct_id_term")
+# The placeholder `start` / `end` assignments were removed: both names are
+# assigned from timer() around the query before they are first read.
+# collection.count() is the number of entries stored in this one collection,
+# so the message says "entries" rather than "collections".
+st.markdown("##### ➡️Chroma DB will return " + str(query_number)
+            + " related instances from " + str(collection.count())
+            + " entries in the collection.")
+# st.warning("Due to the SQLite [file size limit on GitHub](https://docs.github.com/en/repositories/working-with-files/managing-large-files/about-git-large-file-storage), this testing only query from 500k SNOMED-CT instances.", icon="🚨")
+# Func: query chroma_db
+def query_chroma_db(query_text, query_number):
+    """Query the module-level `collection` for entries similar to `query_text`.
+
+    Args:
+        query_text: the text to search for.
+        query_number: how many results to request.
+
+    Returns:
+        Whatever ``collection.query`` returns, requested with distances,
+        metadatas and documents included.
+    """
+    wanted_fields = ["distances", "metadatas", "documents"]
+    return collection.query(
+        query_texts=[query_text],
+        n_results=query_number,
+        include=wanted_fields,
+    )
+# Func: chroma_db result to df
+def get_df_from_chroma_results(results):
+    """Flatten the first query's results into a pandas DataFrame.
+
+    Args:
+        results: mapping with ``ids``, ``metadatas``, ``distances`` and
+            ``documents`` keys, each holding one list per query; every
+            metadata entry must have a ``concept_id`` key.
+
+    Returns:
+        DataFrame with columns ``ids``, ``concept_ids`` (as strings),
+        ``distances`` and ``documents``, one row per hit of the first query.
+    """
+    # Only the first query's hits are used.
+    first_metadatas = results['metadatas'][0]
+    concept_ids = []
+    for metadata in first_metadatas:
+        concept_ids.append(str(metadata['concept_id']))
+    columns = {
+        'ids': results['ids'][0],
+        'concept_ids': concept_ids,
+        'distances': results['distances'][0],
+        'documents': results['documents'][0],
+    }
+    return pd.DataFrame(columns)
+# Time the vector search.
+start = timer()
+results = query_chroma_db(query_text, query_number)
+end = timer()
+elapsed = end - start
+st.markdown("###### ➡️ Query Time : {: .6f} seconds.".format(elapsed))
+st.divider()
+# Show the hits as an interactive table, 1000 wide by 500 high.
+results_df = get_df_from_chroma_results(results)
+st.markdown("### 📊 Similar Search Results from Chroma Vector DB")
+st.dataframe(results_df, width=1000, height=500)

requirements.txt ADDED Viewed

	@@ -0,0 +1,8 @@

+streamlit
+pandas
+openai
+numpy
+chromadb
+python-dotenv
+pysqlite3-binary

snomed-entity-challenge.csv ADDED Viewed

The diff for this file is too large to render. See raw diff

snomed_ct_id_term_1410k/c8390385-a5b9-4ff6-89cd-f8bf8a760fbb/data_level0.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e6052af3bc565baf830088dd4c367f3e260ddbb2cf7dfac904fb483aa64f6b31
+size 2363160000

snomed_ct_id_term_1410k/c8390385-a5b9-4ff6-89cd-f8bf8a760fbb/header.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1dc4275c3ac7eb47b6540b51430e9f85f50a3ebda23a824a9afa7906a02946db
+size 100

snomed_ct_id_term_1410k/c8390385-a5b9-4ff6-89cd-f8bf8a760fbb/index_metadata.pickle ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:25b66beb13495b59f604b58531f4b2ca7a4407ee9555c6d33a8faf2913dc420b
+size 52473273

snomed_ct_id_term_1410k/c8390385-a5b9-4ff6-89cd-f8bf8a760fbb/length.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:2582aa1cc6e61c9b0b3da6575206c81c03377e13cf96fa0eb7ca509bbd1f2692
+size 5640000

snomed_ct_id_term_1410k/c8390385-a5b9-4ff6-89cd-f8bf8a760fbb/link_lists.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:91a660d0f12b9111f4217c2024c4b75f810fbf4c6beae03cd9576891096b06a4
+size 12018944

snomed_ct_id_term_1410k/chroma.sqlite3 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:5dbcfc18f1d97ee8184c664105863bc8be1d8b6c376aca94dea6cdb5e9b81bf1
+size 3590983680