HengJay committed on
Commit
45a019e
1 Parent(s): 693a89d

first commit to HF spaces.

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ *.sqlite3 filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,140 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+ # PyInstaller
30
+ # Usually these files are written by a python script from a template
31
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
32
+ *.manifest
33
+ *.spec
34
+
35
+ # Installer logs
36
+ pip-log.txt
37
+ pip-delete-this-directory.txt
38
+
39
+ # Unit test / coverage reports
40
+ htmlcov/
41
+ .tox/
42
+ .nox/
43
+ .coverage
44
+ .coverage.*
45
+ .cache
46
+ nosetests.xml
47
+ coverage.xml
48
+ *.cover
49
+ *.py,cover
50
+ .hypothesis/
51
+ .pytest_cache/
52
+ cover/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ .pybuilder/
76
+ target/
77
+
78
+ # Jupyter Notebook
79
+ .ipynb_checkpoints
80
+
81
+ # IPython
82
+ profile_default/
83
+ ipython_config.py
84
+
85
+ # pyenv
86
+ # For a library or package, you might want to ignore these files since the code is
87
+ # intended to run in multiple environments; otherwise, check them in:
88
+ # .python-version
89
+
90
+ # pipenv
91
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
93
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
94
+ # install all needed dependencies.
95
+ #Pipfile.lock
96
+
97
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow
98
+ __pypackages__/
99
+
100
+ # Celery stuff
101
+ celerybeat-schedule
102
+ celerybeat.pid
103
+
104
+ # SageMath parsed files
105
+ *.sage.py
106
+
107
+ # Environments
108
+ .env
109
+ .venv
110
+ env/
111
+ venv/
112
+ ENV/
113
+ env.bak/
114
+ venv.bak/
115
+
116
+ # Spyder project settings
117
+ .spyderproject
118
+ .spyproject
119
+
120
+ # Rope project settings
121
+ .ropeproject
122
+
123
+ # mkdocs documentation
124
+ /site
125
+
126
+ # mypy
127
+ .mypy_cache/
128
+ .dmypy.json
129
+ dmypy.json
130
+
131
+ # Pyre type checker
132
+ .pyre/
133
+
134
+ # pytype static type analyzer
135
+ .pytype/
136
+
137
+ # Cython debug symbols
138
+ cython_debug/
139
+
140
+ # End of https://mrkandreev.name/snippets/gitignore-generator/#Python
README.md CHANGED
@@ -1,11 +1,11 @@
1
  ---
2
  title: Snomed Ct Assistant
3
- emoji: 👀
4
- colorFrom: gray
5
- colorTo: indigo
6
  sdk: streamlit
7
  sdk_version: 1.34.0
8
- app_file: app.py
9
  pinned: false
10
  ---
11
 
 
1
  ---
2
  title: Snomed Ct Assistant
3
+ emoji: 🏥
4
+ colorFrom: yellow
5
+ colorTo: pink
6
  sdk: streamlit
7
  sdk_version: 1.34.0
8
+ app_file: SNOMED-CT_Assistant.py
9
  pinned: false
10
  ---
11
 
SNOMED-CT_Assistant.py ADDED
@@ -0,0 +1,150 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import random
3
+ import streamlit as st
4
+ from openai import OpenAI
5
+ from dotenv import load_dotenv
6
+ import pandas as pd
7
+
8
+
9
+ # configure sqlite3
10
+ # __import__('pysqlite3')
11
+ # import sys
12
+ # sys.modules['sqlite3'] = sys.modules.pop('pysqlite3')
13
+
14
+ st.set_page_config(layout="wide")
15
+
16
+ remote = True
17
+
18
# Resolve the OpenAI API key.
# Start from None so the name is always bound: previously, when the secret was
# absent on the remote path, `openai_api_key` was never assigned and the later
# `OpenAI(api_key=openai_api_key)` call failed with a NameError.
openai_api_key = None

if remote:
    with st.sidebar:
        # Preferred source when deployed: the Streamlit secrets store.
        if 'OPENAI_API_TOKEN' in st.secrets:
            st.success('API key already provided!', icon='✅')
            openai_api_key = st.secrets['OPENAI_API_TOKEN']

if not openai_api_key:
    # Fallback: read the key from the process environment / a local .env file.
    # NOTE(review): the variable name is mixed-case ("OpenAI_API_KEY") and
    # differs from the secret name above — confirm this is intentional.
    load_dotenv()
    openai_api_key = os.environ.get("OpenAI_API_KEY")
26
+
27
+ st.title("🏥 SNOMED-CT Assistant")
28
+ st.caption("👩‍⚕️ A smart medical assistant with SNOMED-CT knowledge.")
29
+
30
+ # System prompt
31
+ system_prompt = """You are a medical expert with rich experience in SNOMED-CT professional knowledge.
32
+ You are skilled at assisting medical professionals and answering questions in the medical field. You are patient, helpful and professional.
33
+ Please refuse to answer inquiries and requests unrelated to the medical field, in order to maintain professionalism in medicine.
34
+ As an experienced professional, you possess deep expertise in the field of SNOMED CT Entity Linking.
35
+ You have a thorough understanding of the relevant workflows and critical aspects involved, encompassing:
36
+ - Processing electronic medical records (EHRs), Adept handling of electronic medical record (EMR) data processing
37
+ - Entity Identification, Proficient entity recognition capabilities, identifying and extracting relevant medical concepts from unstructured text
38
+ - Skilled Entity Mapping, accurately linking identified entities to their corresponding SNOMED CT concepts
39
+ - Seamless integration and output of clinical terminology, ensuring the accurate representation and utilization of standardized medical language
40
+ - Patiently and professionally respond to all SNOMED CT related inquiries, even if the user repeats questions.
41
+ - Demonstrate deep expertise in the standard SNOMED CT Entity Linking workflow, which involves:
42
+ 1. Performing Entity Identification to extract relevant medical terminology from the input.
43
+ 2. Conducting Entity Mapping to link the identified entities to their corresponding SNOMED CT concepts.
44
+ - Present the results in a tabular format only with the following 3 columns: "Identified Entity", "SNOMED CT Concept IDs", "SNOMED CT Descriptions".
45
+
46
+ Here is the practical entity linking process example:
47
+ - the input text in EHRs: "Patient referred for a biopsy to investigate potential swelling in upper larynx."
48
+ - the identified entity: "biopsy", "larynx"
49
+ - the mapped SNOMED CT concepts id & descriptions: "274317003 | Laryngoscopic biopsy larynx (procedure)", "4596009 | Laryngeal structure (body structure)"
50
+
51
+ List out as many potential SNOMED entities as possible from the original medical text description,
52
+ including Diseases, Diagnoses, Clinical Findings (like Signs and Symptoms),
53
+ Procedures (Surgical, Therapeutic, Diagnostic, Nursing), Specimen Types, Living Organisms,
54
+ Observables (for example heart rate), Physical Objects and Forces,
55
+ Chemicals (including the chemicals used in drug preparations), Drugs (pharmaceutical products),
56
+ Human Anatomy (body structures, organisms), Physiological Processes and Functions,
57
+ Patients' Occupations, Patients' Social Contexts (e.g., religion and ethnicity), and various other types from the SNOMED CT standard.
58
+ Numbers or units related symbols are not included in this range and can be ignored.
59
+
60
+ Output Format Requirements (Must follow):
61
+ - Present the results in a tabular format with the following 3 columns only: "Identified Entity", "SNOMED CT Concept IDs", and "SNOMED CT Descriptions". Do not arbitrarily replace the column names, as that would lead to unclear output.
62
+ - The table should be easy to read and understand, with each row displaying the identified medical entity, its corresponding SNOMED CT concept ID, and the full SNOMED CT description.
63
+ - Ensure the formatting and organization of the table is clean and professional, optimized for the user's ease of reference.
64
+
65
+ Your comprehensive knowledge and mastery of these key components make you an invaluable asset in the realm of biomedical natural language processing and knowledge extraction.
66
+ With your specialized expertise, you are able to navigate the complexities of SNOMED CT Entity Linking with ease, delivering accurate and reliable results that support various healthcare and research applications.
67
+ When answering questions, except for the use of English for medical-related terminology, always respond in Traditional Chinese (zh-TW).
68
+ If there are any SNOMED-CT related medical professional terms, please provide the original text in parentheses afterwards."""
69
+
70
+
71
+ # Func: generate random med text
72
+ raw_text_df = pd.read_csv('snomed-entity-challenge.csv')
73
+
74
def random_med_text(text_df):
    """Pick one random sample from *text_df* and split it into its sections.

    Each entry of the ``text`` column is split on the ``###TEXT:`` marker and
    then on the ``###RESPONSE:`` marker, i.e. it is read as
    ``<human>###TEXT:<medical text>###RESPONSE:<response>``.

    Args:
        text_df: DataFrame with a ``text`` column of strings in the layout
            described above.

    Returns:
        A tuple ``(index, human, med_text, response)`` where ``index`` is the
        zero-based row position that was drawn.

    Raises:
        ValueError: If ``text_df`` has no rows.
        IndexError: If the drawn text lacks one of the two markers.
    """
    rows = len(text_df['text'])
    if not rows:
        raise ValueError("text_df has no rows to sample from")

    # randrange excludes the upper bound. The previous random.randint(0, rows)
    # could return `rows` itself, one past the last valid row.
    index = random.randrange(rows)

    # Select by position so a frame with a non-default index still works.
    raw_text = text_df['text'].iloc[index]

    raw_text_split = raw_text.split('###TEXT:')
    text_and_response = raw_text_split[1].split('###RESPONSE:')

    human = raw_text_split[0]
    med_text = text_and_response[0]
    response = text_and_response[1]
    return index, human, med_text, response
84
+
85
+
86
+ # Func: Gen Medical Prompt Example
87
def generate_med_prompt(medical_text):
    """Wrap *medical_text* in the fixed SNOMED-CT entity-linking request.

    Args:
        medical_text: Raw medical record text to embed in the prompt.

    Returns:
        The Traditional Chinese instruction followed by ``medical_text`` on
        its own line.
    """
    med_prompt = (
        f"""請協助我做電子病歷 (Electronic Health Record, EHR) 的 SNOMED-CT Entity Linking 的處理, 這是原本的病歷文字: \n {medical_text} \n """
    )
    return med_prompt
89
+
90
+ # test_prompt = """請協助我做 EHR 的 SNOMED CT Entity Linking 的處理, 這是原本的病歷文字:
91
+ # "Patient referred for a biopsy to investigate potential swelling in upper larynx."
92
+ # ,首先做 Entity Identification,列出醫學相關術語片段,接著做 Entity Mapping,將對應的 SNOMED CT 術語列出。
93
+ # 輸出格式用表格,欄位是 "identified entity", "SNOMED CT concept ids", "SNOMED CT descriptions"。"""
94
+
95
+ client = OpenAI(api_key=openai_api_key)
96
+ model_tag = "gpt-3.5-turbo"
97
+
98
def chat_input(prompt):
    """Send *prompt* to the chat model and render both sides of the exchange.

    The user message is appended to ``st.session_state.messages`` and shown
    in the chat area. The whole history is then sent to the model named by
    ``model_tag`` with temperature 0.5, and the first choice of the reply is
    appended to the history and shown as the assistant message.

    Args:
        prompt: Text of the user message.
    """
    history = st.session_state.messages

    # Record and show the user's turn before calling the API.
    history.append({"role": "user", "content": prompt})
    st.chat_message("user").write(prompt)

    with st.spinner("Thinking..."):
        completion = client.chat.completions.create(
            model=model_tag,
            messages=history,
            temperature=0.5,
        )
        reply = completion.choices[0].message.content
        history.append({"role": "assistant", "content": reply})
        st.chat_message("assistant").write(reply)
109
+
110
# First run of a session: seed the history with the system prompt and a greeting.
if "messages" not in st.session_state:
    st.session_state["messages"] = [
        {"role": "system", "content": system_prompt},
        {"role": "assistant", "content": "👩‍⚕️ 您好,我是您的專業醫學助理。請問有任何我可以協助你的地方嗎?"},
    ]

# Replay the stored conversation; the system prompt is never shown to the user.
for msg in st.session_state.messages:
    if msg["role"] != "system":
        st.chat_message(msg["role"]).write(msg["content"])

# Handle a newly typed chat message.
if prompt := st.chat_input():
    if not openai_api_key:
        # Without a key there is nothing to call; halt this script run.
        st.info("Please add your OpenAI API key to continue.")
        st.stop()

    chat_input(prompt)
125
+ # st.session_state.messages.append({"role": "user", "content": prompt})
126
+ # st.chat_message("user").write(prompt)
127
+ # with st.spinner("Thinking..."):
128
+ # response = client.chat.completions.create(model="gpt-3.5-turbo", messages=st.session_state.messages)
129
+ # msg = response.choices[0].message.content
130
+ # st.session_state.messages.append({"role": "assistant", "content": msg})
131
+ # st.chat_message("assistant").write(msg)
132
+
133
# Sidebar shortcut: submit a fixed example sentence.
if st.sidebar.button("Example Input",type="primary"):
    med_prompt = generate_med_prompt(
        "Patient referred for a biopsy to investigate potential swelling in upper larynx."
    )
    chat_input(med_prompt)


# Sidebar shortcut: submit a random dataset sample, then show its row index
# and its reference entities (one per line) in the sidebar.
if st.sidebar.button("Random Input",type="primary"):
    index, human, med_text, response = random_med_text(raw_text_df)
    med_prompt = generate_med_prompt(med_text)
    response = response.replace(","," \n")
    chat_input(med_prompt)
    st.sidebar.write(f"[Random Text](https://huggingface.co/datasets/JaimeML/snomed-entity-challenge) Index: {index}")
    st.sidebar.markdown(f"Ref Entity: \n {response}")
145
+
146
+
147
+ # model_tag = st.sidebar.selectbox(
148
+ # "Which model do you want to chat with?",
149
+ # ("gpt-4o", "gpt-3.5-turbo")
150
+ # )
pages/Vector DB of SNOMED-CT.py ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from timeit import default_timer as timer
2
+
3
+ import streamlit as st
4
+ import chromadb
5
+ import pandas as pd
6
+ import numpy as np
7
+
8
+ # configure sqlite3
9
+ # __import__('pysqlite3')
10
+ # import sys
11
+ # sys.modules['sqlite3'] = sys.modules.pop('pysqlite3')
12
+
13
st.set_page_config(layout="wide")

# Page title and search controls.
st.title("📚 Semantic Search with Vector Database of SNOMED-CT 💡")
# Caption typo fixed: "relate decription" -> "related description".
st.caption("🔍 Search any SNOMED-CT related description & concept with natural language.")
st.sidebar.title("🔍 Search Setting")
# Slider arguments: minimum 5, maximum 30, default 10.
query_number = st.sidebar.slider("Query Numbers", 5, 30, 10)
st.markdown("##### ➡️⌨️ Please input some medical description here, e.g. \"insomnia two nights a week.\", \"COPD\", \"Degenerative Joint Disease\"")
query_text = st.text_input("Input: any medical description snippet","Type-2 Diabetes")

# Chroma DB client, opened from the on-disk store shipped with the app.
chroma_client = chromadb.PersistentClient(path="snomed_ct_id_term_1410k")
collection = chroma_client.get_or_create_collection(name="snomed_ct_id_term")

# The placeholder `start = 1.0` / `end = 1.1` assignments were removed: both
# names are reassigned from timer() before they are first read.
# The status line now says "indexed instances": the number comes from
# collection.count() on a single collection, not from a count of collections.
st.markdown("##### ➡️Chroma DB will return " + str(query_number)
            + " related instances from " + str(collection.count()) + " indexed instances.")
30
+ # st.warning("Due to the SQLite [file size limit on GitHub](https://docs.github.com/en/repositories/working-with-files/managing-large-files/about-git-large-file-storage), this testing only query from 500k SNOMED-CT instances.", icon="🚨")
31
+
32
+
33
+ # Func: query chroma_db
34
def query_chroma_db(query_text, query_number):
    """Run a single-text query against the module-level Chroma ``collection``.

    Args:
        query_text: Text to search for; sent as a one-element query list.
        query_number: Number of results requested (``n_results``).

    Returns:
        Whatever ``collection.query`` returns, with distances, metadatas and
        documents requested via ``include``.
    """
    return collection.query(
        query_texts=[query_text],
        n_results=query_number,
        include=["distances", "metadatas", "documents"],
    )
41
+
42
+ # Func: chroma_db_result to df
43
def get_df_from_chroma_results(results):
    """Flatten the first query's results into a DataFrame.

    Only element ``[0]`` of each result list is used, i.e. the hits of the
    first query text.

    Args:
        results: Mapping with ``ids``, ``metadatas``, ``distances`` and
            ``documents`` entries, each a list whose first item holds the hits
            of the first query. Every metadata entry must have a
            ``concept_id`` key.

    Returns:
        DataFrame with columns ``ids``, ``concept_ids`` (as strings),
        ``distances`` and ``documents``.
    """
    hit_ids = results['ids'][0]
    # Concept ids are converted to str, as in the original column.
    hit_concept_ids = list(map(str, (meta['concept_id'] for meta in results['metadatas'][0])))
    hit_distances = results['distances'][0]
    hit_documents = results['documents'][0]

    return pd.DataFrame({
        'ids': hit_ids,
        'concept_ids': hit_concept_ids,
        'distances': hit_distances,
        'documents': hit_documents,
    })
47
+
48
# Time only the vector search itself.
start = timer()
results = query_chroma_db(query_text, query_number)
end = timer()

query_time_line = "###### ➡️ Query Time : {: .6f} seconds.".format(end - start)
st.markdown(query_time_line)
st.divider()

# Convert the raw query result into a table.
results_df = get_df_from_chroma_results(results)

# Show the table as an interactive dataframe (positional args: width 1000, height 500).
st.markdown("### 📊 Similar Search Results from Chroma Vector DB")
st.dataframe(results_df, 1000, 500)
59
+
requirements.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ streamlit
2
+ pandas
3
+ openai
4
+ numpy
5
+ chromadb
6
+ python-dotenv
7
+ pysqlite3-binary
8
+
snomed-entity-challenge.csv ADDED
The diff for this file is too large to render. See raw diff
 
snomed_ct_id_term_1410k/c8390385-a5b9-4ff6-89cd-f8bf8a760fbb/data_level0.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e6052af3bc565baf830088dd4c367f3e260ddbb2cf7dfac904fb483aa64f6b31
3
+ size 2363160000
snomed_ct_id_term_1410k/c8390385-a5b9-4ff6-89cd-f8bf8a760fbb/header.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1dc4275c3ac7eb47b6540b51430e9f85f50a3ebda23a824a9afa7906a02946db
3
+ size 100
snomed_ct_id_term_1410k/c8390385-a5b9-4ff6-89cd-f8bf8a760fbb/index_metadata.pickle ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:25b66beb13495b59f604b58531f4b2ca7a4407ee9555c6d33a8faf2913dc420b
3
+ size 52473273
snomed_ct_id_term_1410k/c8390385-a5b9-4ff6-89cd-f8bf8a760fbb/length.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2582aa1cc6e61c9b0b3da6575206c81c03377e13cf96fa0eb7ca509bbd1f2692
3
+ size 5640000
snomed_ct_id_term_1410k/c8390385-a5b9-4ff6-89cd-f8bf8a760fbb/link_lists.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:91a660d0f12b9111f4217c2024c4b75f810fbf4c6beae03cd9576891096b06a4
3
+ size 12018944
snomed_ct_id_term_1410k/chroma.sqlite3 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5dbcfc18f1d97ee8184c664105863bc8be1d8b6c376aca94dea6cdb5e9b81bf1
3
+ size 3590983680