import streamlit as st
import sparknlp
import os
import pandas as pd
from sparknlp.base import *
from sparknlp.annotator import *
from pyspark.ml import Pipeline
from sparknlp.pretrained import PretrainedPipeline
from annotated_text import annotated_text
# Page configuration
st.set_page_config(layout="wide", initial_sidebar_state="auto")
# Initialize Spark NLP
@st.cache_resource
def init_spark():
    return sparknlp.start()
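# Note: sparknlp.start() creates (or returns an existing) SparkSession with the
# Spark NLP jars attached. st.cache_resource keeps a single session alive for
# the whole Streamlit server process, so reruns triggered by widget interaction
# reuse it instead of starting Spark again.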
# Create the NER pipeline
@st.cache_resource
def create_pipeline(model, context_dict):
    documentAssembler = DocumentAssembler() \
        .setInputCol("text") \
        .setOutputCol("document")

    sentenceDetector = SentenceDetector() \
        .setInputCols(["document"]) \
        .setOutputCol("sentence")

    tokenizer = Tokenizer() \
        .setInputCols(["sentence"]) \
        .setOutputCol("token")

    zero_shot_ner = ZeroShotNerModel.pretrained(model, "en") \
        .setInputCols(["sentence", "token"]) \
        .setOutputCol("zero_shot_ner") \
        .setEntityDefinitions(context_dict)

    ner_converter = NerConverter() \
        .setInputCols(["sentence", "token", "zero_shot_ner"]) \
        .setOutputCol("ner_chunk")

    pipeline = Pipeline(stages=[documentAssembler, sentenceDetector, tokenizer, zero_shot_ner, ner_converter])
    return pipeline
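# How the zero-shot stage works: setEntityDefinitions takes a dict mapping each
# label to a list of natural-language questions. The pretrained model is an
# extractive-QA RoBERTa, so it asks every question against every sentence and
# turns answer spans into NER tags for that label; NerConverter then merges the
# tagged tokens into chunks. A minimal sketch of the expected input shape
# (these labels and questions are illustrative, not required by the API):
#
#   definitions = {
#       "DRUG": ["Which medication was prescribed?"],
#       "PROBLEM": ["What is the patient's diagnosis?"],
#   }
#   ZeroShotNerModel.pretrained("zero_shot_ner_roberta", "en") \
#       .setEntityDefinitions(definitions)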
# Fit the pipeline and annotate the input text
def fit_data(pipeline, data):
    empty_df = spark.createDataFrame([['']]).toDF('text')
    pipeline_model = pipeline.fit(empty_df)
    light_model = LightPipeline(pipeline_model)
    result = light_model.fullAnnotate(data)
    return result
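# Note: none of the stages above are trainable, so fitting on an empty
# DataFrame only materializes a PipelineModel. LightPipeline then annotates
# plain Python strings on the driver (no per-request Spark job), and
# fullAnnotate returns one dict per input mapping each output column to its
# Annotation objects, roughly:
#   [{'document': [...], 'sentence': [...], 'token': [...],
#     'zero_shot_ner': [...], 'ner_chunk': [...]}]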
# Annotate the text with NER results
def annotate(data):
    document, chunks, labels = data["Document"], data["NER Chunk"], data["NER Label"]
    annotated_words = []
    for chunk, label in zip(chunks, labels):
        parts = document.split(chunk, 1)
        if parts[0]:
            annotated_words.append(parts[0])
        annotated_words.append((chunk, label))
        document = parts[1]
    if document:
        annotated_words.append(document)
    annotated_text(*annotated_words)
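# Note: annotated_text (from the st-annotated-text package) renders plain
# strings as ordinary text and (text, label) tuples as highlighted chips. The
# loop above splits the document on each chunk once, left to right, so repeated
# chunks are highlighted in order of appearance.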
def df_to_dict(df):
    context_dict = {}
    for col in df.columns:
        values = df[col].dropna().tolist()
        if values:
            context_dict[col] = values
    return context_dict
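# Example (illustrative values): a column "DRUG" with two filled cells and a
# column "NEW" with none becomes
#   {"DRUG": ["Which medication was prescribed?", "What medication does he take daily?"]}
# All-empty columns are dropped, so a freshly added label with no questions
# never reaches setEntityDefinitions.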
# Sidebar content
model = st.sidebar.selectbox(
    "Choose the pretrained model",
    ["zero_shot_ner_roberta"],
    help="For more info about the models visit: https://sparknlp.org/models"
)
# Set up the page layout
st.markdown("<h1>Zero-Shot Named Entity Recognition (NER)</h1>", unsafe_allow_html=True)
st.markdown(
    "<p>Explore Zero-Shot Named Entity Recognition (NER), a technique that detects and "
    "classifies named entities in text without task-specific training on annotated datasets. "
    "Edit the context DataFrame to define custom entity types and the questions that describe "
    "them, then enter your own text or pick a predefined example to see how the model "
    "identifies and categorizes entities in real time.</p>",
    unsafe_allow_html=True,
)
# Reference notebook link in sidebar
link = """
"""
st.sidebar.markdown('Reference notebook:')
st.sidebar.markdown(link, unsafe_allow_html=True)
# Default entity definitions: each label maps to prompt questions that describe it
data = {
    "PROBLEM": [
        "What is the disease?",
        "What are the symptoms of the condition?",
        "What is the patient's diagnosis?",
        "What kind of disease is he suffering from?",
        "What specific medical issue does she have?",
        "What is the main problem the patient is facing?",
        "What were the reasons for the patient's hospitalization?"
    ],
    "DRUG": [
        "Which medication was prescribed?",
        "What is the name of the drug used for treatment?",
        "Which drug is administered for this condition?",
        "What medication does he take daily?",
        "What drugs are used to manage his symptoms?",
        "Which medicine is recommended for this illness?",
        "What is the prescription for this medical condition?"
    ],
    "SYMPTOM": [
        "What symptoms is the patient experiencing?",
        "What are the signs of the disease?",
        "Which symptoms did the patient report?",
        "What were the initial symptoms observed?",
        "What specific symptoms are present?"
    ]
}
# Pad shorter lists with None
max_length = max(len(v) for v in data.values())
for key in data.keys():
    while len(data[key]) < max_length:
        data[key].append(None)
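# Note: pandas requires every column of a DataFrame to have the same length,
# hence the None padding above; df_to_dict later calls dropna(), so the
# padding never leaks into the entity definitions.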
# Create DataFrame
df = pd.DataFrame(data)
df.index += 1
# Optionally add a new label column (all values start as None)
new_key = st.text_input("Add Label:")
if new_key:
    data = {new_key.upper(): [None] * max_length, **data}
    df = pd.DataFrame(data)
    df.index += 1
    st.success(f"Added '{new_key.upper()}' to the context DataFrame")

st.write("Context DataFrame (click to edit)")
edited_df = st.data_editor(df)
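# Note: st.data_editor returns the edited copy of the grid and leaves `df`
# unchanged; the edited version is converted back into entity definitions via
# df_to_dict below.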
# Example sentences
examples = [
    "The patient is experiencing severe headache and nausea. The doctor has prescribed Ibuprofen to alleviate the symptoms. The patient has been complaining about these symptoms for the last three days.",
    "The patient's main problem is chronic back pain. This issue has been affecting their daily activities significantly. The doctor recommended a series of physical therapy sessions to address the problem.",
    "After the diagnosis of diabetes, the patient was given Metformin as a part of their treatment plan. The medication is intended to help manage blood sugar levels effectively.",
    "The symptoms reported by the patient include persistent cough and shortness of breath. The doctor has advised some tests to identify the underlying cause of these symptoms.",
    "The patient has been prescribed Prednisone to manage their severe inflammation. This medication is part of the treatment plan for their chronic condition."
]
selected_text = st.selectbox("Select an example", examples)
custom_input = st.text_input("Try it with your own sentence!")
text_to_analyze = custom_input if custom_input else selected_text
context_dict = df_to_dict(edited_df)
# Display example text
st.subheader('Full Example Text')
HTML_WRAPPER = """{}
"""
st.markdown(HTML_WRAPPER.format(text_to_analyze), unsafe_allow_html=True)
# Initialize Spark and create pipeline
spark = init_spark()
pipeline = create_pipeline(model, context_dict)
output = fit_data(pipeline, text_to_analyze)
# Display processed output
st.subheader("Processed Output:")
results = {
    'Document': output[0]['document'][0].result,
    'NER Chunk': [n.result for n in output[0]['ner_chunk']],
    'NER Label': [n.metadata['entity'] for n in output[0]['ner_chunk']]
}
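# Note: each element of output[0]['ner_chunk'] is a Spark NLP Annotation whose
# .result holds the chunk text and whose .metadata carries the predicted
# 'entity' label (and typically a 'confidence' score as well).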
annotate(results)
with st.expander("View DataFrame"):
    df = pd.DataFrame({'NER Chunk': results['NER Chunk'], 'NER Label': results['NER Label']})
    df.index += 1
    st.dataframe(df)