import streamlit as st
import sparknlp
import os
import pandas as pd
from sparknlp.base import *
from sparknlp.annotator import *
from pyspark.ml import Pipeline
from sparknlp.pretrained import PretrainedPipeline
from annotated_text import annotated_text
# Page configuration
st.set_page_config(layout="wide", initial_sidebar_state="auto")
# Initialize Spark NLP
@st.cache_resource
def init_spark():
    return sparknlp.start()
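# Note: sparknlp.start() creates (or returns an existing) SparkSession with the
# Spark NLP jars attached. st.cache_resource keeps a single session alive for
# the whole Streamlit server process, so reruns triggered by widget interaction
# reuse it instead of starting Spark again.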
# Create the NER pipeline
@st.cache_resource
def create_pipeline(model, context_dict):
    documentAssembler = DocumentAssembler() \
        .setInputCol("text") \
        .setOutputCol("document")

    sentenceDetector = SentenceDetector() \
        .setInputCols(["document"]) \
        .setOutputCol("sentence")

    tokenizer = Tokenizer() \
        .setInputCols(["sentence"]) \
        .setOutputCol("token")

    zero_shot_ner = ZeroShotNerModel.pretrained(model, "en") \
        .setInputCols(["sentence", "token"]) \
        .setOutputCol("zero_shot_ner") \
        .setEntityDefinitions(context_dict)

    ner_converter = NerConverter() \
        .setInputCols(["sentence", "token", "zero_shot_ner"]) \
        .setOutputCol("ner_chunk")

    pipeline = Pipeline(stages=[documentAssembler, sentenceDetector, tokenizer, zero_shot_ner, ner_converter])
    return pipeline
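# How the zero-shot stage works: setEntityDefinitions takes a dict mapping each
# label to a list of natural-language questions. The pretrained model is an
# extractive-QA RoBERTa, so it asks every question against every sentence and
# turns answer spans into NER tags for that label; NerConverter then merges the
# tagged tokens into chunks. A minimal sketch of the expected input shape
# (these labels and questions are illustrative, not required by the API):
#
#   definitions = {
#       "DRUG": ["Which medication was prescribed?"],
#       "PROBLEM": ["What is the patient's diagnosis?"],
#   }
#   ZeroShotNerModel.pretrained("zero_shot_ner_roberta", "en") \
#       .setEntityDefinitions(definitions)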
# Fit the pipeline and annotate the input text
def fit_data(pipeline, data):
    empty_df = spark.createDataFrame([['']]).toDF('text')
    pipeline_model = pipeline.fit(empty_df)
    light_model = LightPipeline(pipeline_model)
    result = light_model.fullAnnotate(data)
    return result
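# Note: none of the stages above are trainable, so fitting on an empty
# DataFrame only materializes a PipelineModel. LightPipeline then annotates
# plain Python strings on the driver (no per-request Spark job), and
# fullAnnotate returns one dict per input mapping each output column to its
# Annotation objects, roughly:
#   [{'document': [...], 'sentence': [...], 'token': [...],
#     'zero_shot_ner': [...], 'ner_chunk': [...]}]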
# Annotate the text with NER results
def annotate(data):
    document, chunks, labels = data["Document"], data["NER Chunk"], data["NER Label"]
    annotated_words = []
    for chunk, label in zip(chunks, labels):
        parts = document.split(chunk, 1)
        if parts[0]:
            annotated_words.append(parts[0])
        annotated_words.append((chunk, label))
        document = parts[1]
    if document:
        annotated_words.append(document)
    annotated_text(*annotated_words)
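# Note: annotated_text (from the st-annotated-text package) renders plain
# strings as ordinary text and (text, label) tuples as highlighted chips. The
# loop above splits the document on each chunk once, left to right, so repeated
# chunks are highlighted in order of appearance.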
def df_to_dict(df):
    context_dict = {}
    for col in df.columns:
        values = df[col].dropna().tolist()
        if values:
            context_dict[col] = values
    return context_dict
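# Example (illustrative values): a column "DRUG" with two filled cells and a
# column "NEW" with none becomes
#   {"DRUG": ["Which medication was prescribed?", "What medication does he take daily?"]}
# All-empty columns are dropped, so a freshly added label with no questions
# never reaches setEntityDefinitions.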
# Sidebar content
model = st.sidebar.selectbox(
    "Choose the pretrained model",
    ["zero_shot_ner_roberta"],
    help="For more info about the models visit: https://sparknlp.org/models"
)
# Set up the page layout
st.markdown("<h1>Zero-Shot Named Entity Recognition (NER)</h1>", unsafe_allow_html=True)
st.markdown(
    "<p>Explore Zero-Shot Named Entity Recognition (NER), a technique that detects and "
    "classifies named entities in text without task-specific training on annotated datasets. "
    "Edit the context DataFrame to define custom entity types and the questions that describe "
    "them, then enter your own text or pick a predefined example to see how the model "
    "identifies and categorizes entities in real time.</p>",
    unsafe_allow_html=True,
)
# Reference notebook link in sidebar
link = """
"""
st.sidebar.markdown('Reference notebook:')
st.sidebar.markdown(link, unsafe_allow_html=True)
# Default entity definitions: each label maps to prompt questions that describe it
data = {
    "PROBLEM": [
        "What is the disease?",
        "What are the symptoms of the condition?",
        "What is the patient's diagnosis?",
        "What kind of disease is he suffering from?",
        "What specific medical issue does she have?",
        "What is the main problem the patient is facing?",
        "What were the reasons for the patient's hospitalization?"
    ],
    "DRUG": [
        "Which medication was prescribed?",
        "What is the name of the drug used for treatment?",
        "Which drug is administered for this condition?",
        "What medication does he take daily?",
        "What drugs are used to manage his symptoms?",
        "Which medicine is recommended for this illness?",
        "What is the prescription for this medical condition?"
    ],
    "SYMPTOM": [
        "What symptoms is the patient experiencing?",
        "What are the signs of the disease?",
        "Which symptoms did the patient report?",
        "What were the initial symptoms observed?",
        "What specific symptoms are present?"
    ]
}
# Pad shorter lists with None
max_length = max(len(v) for v in data.values())
for key in data.keys():
    while len(data[key]) < max_length:
        data[key].append(None)
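# Note: pandas requires every column of a DataFrame to have the same length,
# hence the None padding above; df_to_dict later calls dropna(), so the
# padding never leaks into the entity definitions.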
# Create DataFrame
df = pd.DataFrame(data)
df.index += 1
# Optionally add a new label column (all values start as None)
new_key = st.text_input("Add Label:")
if new_key:
    data = {new_key.upper(): [None] * max_length, **data}
    df = pd.DataFrame(data)
    df.index += 1
    st.success(f"Added '{new_key.upper()}' to the context DataFrame")

st.write("Context DataFrame (click to edit)")
edited_df = st.data_editor(df)
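# Note: st.data_editor returns the edited copy of the grid and leaves `df`
# unchanged; the edited version is converted back into entity definitions via
# df_to_dict below.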
# Example sentences
examples = [
    "The patient is experiencing severe headache and nausea. The doctor has prescribed Ibuprofen to alleviate the symptoms. The patient has been complaining about these symptoms for the last three days.",
    "The patient's main problem is chronic back pain. This issue has been affecting their daily activities significantly. The doctor recommended a series of physical therapy sessions to address the problem.",
    "After the diagnosis of diabetes, the patient was given Metformin as a part of their treatment plan. The medication is intended to help manage blood sugar levels effectively.",
    "The symptoms reported by the patient include persistent cough and shortness of breath. The doctor has advised some tests to identify the underlying cause of these symptoms.",
    "The patient has been prescribed Prednisone to manage their severe inflammation. This medication is part of the treatment plan for their chronic condition."
]
selected_text = st.selectbox("Select an example", examples)
custom_input = st.text_input("Try it with your own sentence!")
text_to_analyze = custom_input if custom_input else selected_text
context_dict = df_to_dict(edited_df)
# Display example text
st.subheader('Full Example Text')
HTML_WRAPPER = """{}
"""
st.markdown(HTML_WRAPPER.format(text_to_analyze), unsafe_allow_html=True)
# Initialize Spark and create pipeline
spark = init_spark()
pipeline = create_pipeline(model, context_dict)
output = fit_data(pipeline, text_to_analyze)
# Display processed output
st.subheader("Processed Output:")
results = {
    'Document': output[0]['document'][0].result,
    'NER Chunk': [n.result for n in output[0]['ner_chunk']],
    'NER Label': [n.metadata['entity'] for n in output[0]['ner_chunk']]
}
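# Note: each element of output[0]['ner_chunk'] is a Spark NLP Annotation whose
# .result holds the chunk text and whose .metadata carries the predicted
# 'entity' label (and typically a 'confidence' score as well).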
annotate(results)
with st.expander("View DataFrame"):
    df = pd.DataFrame({'NER Chunk': results['NER Chunk'], 'NER Label': results['NER Label']})
    df.index += 1
    st.dataframe(df)