abdullahmubeen10
commited on
Commit
โข
38becfe
1
Parent(s):
c7bee95
Upload 10 files
Browse files- .streamlit/config.toml +3 -0
- Demo.py +228 -0
- Dockerfile +72 -0
- inputs/bert_token_classifier_hi_en_ner/Example1.txt +2 -0
- inputs/bert_token_classifier_hi_en_ner/Example2.txt +2 -0
- inputs/bert_token_classifier_hi_en_ner/Example3.txt +2 -0
- inputs/bert_token_classifier_hi_en_ner/Example4.txt +2 -0
- inputs/bert_token_classifier_hi_en_ner/Example5.txt +2 -0
- pages/Workflow & Model Overview.py +255 -0
- requirements.txt +6 -0
.streamlit/config.toml
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
[theme]
|
2 |
+
base="light"
|
3 |
+
primaryColor="#29B4E8"
|
Demo.py
ADDED
@@ -0,0 +1,228 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
import sparknlp
|
3 |
+
import os
|
4 |
+
import pandas as pd
|
5 |
+
from sparknlp.base import *
|
6 |
+
from sparknlp.annotator import *
|
7 |
+
from pyspark.ml import Pipeline
|
8 |
+
from sparknlp.pretrained import PretrainedPipeline
|
9 |
+
from annotated_text import annotated_text
|
10 |
+
|
11 |
+
# Page configuration
|
12 |
+
st.set_page_config(layout="wide", initial_sidebar_state="auto")
|
13 |
+
|
14 |
+
# CSS for styling
|
15 |
+
st.markdown("""
|
16 |
+
<style>
|
17 |
+
.main-title {
|
18 |
+
font-size: 36px;
|
19 |
+
color: #4A90E2;
|
20 |
+
font-weight: bold;
|
21 |
+
text-align: center;
|
22 |
+
}
|
23 |
+
.section {
|
24 |
+
background-color: #f9f9f9;
|
25 |
+
padding: 10px;
|
26 |
+
border-radius: 10px;
|
27 |
+
margin-top: 10px;
|
28 |
+
}
|
29 |
+
.section p, .section ul {
|
30 |
+
color: #666666;
|
31 |
+
}
|
32 |
+
</style>
|
33 |
+
""", unsafe_allow_html=True)
|
34 |
+
|
35 |
+
# Initialize Spark NLP
@st.cache_resource
def init_spark():
    """Start (or reuse) a Spark NLP session, cached across reruns."""
    session = sparknlp.start()
    return session
|
39 |
+
|
40 |
+
# Create the NER pipeline
@st.cache_resource
def create_pipeline(model, context_dict):
    """Build the zero-shot NER Spark NLP pipeline.

    Args:
        model: Name of the pretrained ZeroShotNerModel to load (e.g.
            "zero_shot_ner_roberta").
        context_dict: Mapping of entity label -> list of prompt questions,
            passed to setEntityDefinitions() so entities are defined at
            inference time.

    Returns:
        An unfitted pyspark.ml Pipeline: document assembly -> sentence
        detection -> tokenization -> zero-shot NER -> chunk conversion.
    """
    document_assembler = DocumentAssembler() \
        .setInputCol("text") \
        .setOutputCol("document")

    sentence_detector = SentenceDetector() \
        .setInputCols(["document"]) \
        .setOutputCol("sentence")

    tokenizer = Tokenizer() \
        .setInputCols(["sentence"]) \
        .setOutputCol("token")

    zero_shot_ner = ZeroShotNerModel.pretrained(model, "en") \
        .setInputCols(["sentence", "token"]) \
        .setOutputCol("zero_shot_ner") \
        .setEntityDefinitions(context_dict)

    # The original chain ended with a stray trailing backslash before a
    # blank line; removed, as that continuation is fragile.
    ner_converter = NerConverter() \
        .setInputCols(["sentence", "token", "zero_shot_ner"]) \
        .setOutputCol("ner_chunk")

    pipeline = Pipeline(stages=[
        document_assembler,
        sentence_detector,
        tokenizer,
        zero_shot_ner,
        ner_converter,
    ])
    return pipeline
|
66 |
+
|
67 |
+
# Fit data using the pipeline
def fit_data(pipeline, data):
    """Fit *pipeline* and annotate *data* with a LightPipeline.

    Relies on the module-level ``spark`` session created at startup —
    TODO confirm callers always initialize it before calling this.

    Args:
        pipeline: Unfitted pyspark.ml Pipeline from create_pipeline().
        data: Text (string) to annotate.

    Returns:
        The fullAnnotate() result (list of annotation dicts).
    """
    # An empty one-row DataFrame is enough to materialize the stages.
    empty_df = spark.createDataFrame([['']]).toDF('text')
    pipeline_model = pipeline.fit(empty_df)
    # Renamed from `model` so it no longer shadows the module-level
    # `model` (the sidebar-selected model name).
    light_pipeline = LightPipeline(pipeline_model)
    result = light_pipeline.fullAnnotate(data)
    return result
|
74 |
+
|
75 |
+
# Annotate the text with NER results
def annotate(data):
    """Render the document with entity chunks highlighted via annotated_text.

    Args:
        data: Dict with keys "Document" (full text), "NER Chunk" (list of
            chunk strings), and "NER Label" (labels parallel to the chunks).
    """
    document = data["Document"]
    chunks = data["NER Chunk"]
    labels = data["NER Label"]
    annotated_words = []
    for chunk, label in zip(chunks, labels):
        # partition() is safe when the chunk is absent; the original
        # split(chunk, 1)[1] raised IndexError in that case.
        before, found, after = document.partition(chunk)
        if not found:
            continue  # chunk not present in the remaining text; skip it
        if before:
            annotated_words.append(before)
        annotated_words.append((chunk, label))
        document = after
    if document:
        annotated_words.append(document)
    annotated_text(*annotated_words)
|
88 |
+
|
89 |
+
def df_to_dict(df):
    """Turn a DataFrame of prompts into an entity-definition mapping.

    Each column name becomes a key whose value is the list of that
    column's non-null cells; columns with no non-null cells are dropped.
    """
    columns_to_values = {name: df[name].dropna().tolist() for name in df.columns}
    return {name: values for name, values in columns_to_values.items() if values}
|
96 |
+
|
97 |
+
# Sidebar content
|
98 |
+
model = st.sidebar.selectbox(
|
99 |
+
"Choose the pretrained model",
|
100 |
+
["zero_shot_ner_roberta"],
|
101 |
+
help="For more info about the models visit: https://sparknlp.org/models"
|
102 |
+
)
|
103 |
+
|
104 |
+
# Set up the page layout
|
105 |
+
st.markdown('<div class="main-title">Zero-Shot Named Entity Recognition (NER)</div>', unsafe_allow_html=True)
|
106 |
+
st.markdown('<div class="section"><p>Explore Zero-Shot Named Entity Recognition (NER)โa state-of-the-art technique that detects and classifies named entities in text without needing specific training on annotated datasets. With our interactive interface, you can modify the context by editing the DataFrame to define custom entity types and examples. Then, input your own text or select from predefined examples to see how the model identifies and categorizes entities in real time.</p></div>', unsafe_allow_html=True)
|
107 |
+
|
108 |
+
# Reference notebook link in sidebar
|
109 |
+
link = """
|
110 |
+
<a href="https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Healthcare/1.6.ZeroShot_Clinical_NER.ipynb">
|
111 |
+
<img src="https://colab.research.google.com/assets/colab-badge.svg" style="zoom: 1.3" alt="Open In Colab"/>
|
112 |
+
</a>
|
113 |
+
"""
|
114 |
+
st.sidebar.markdown('Reference notebook:')
|
115 |
+
st.sidebar.markdown(link, unsafe_allow_html=True)
|
116 |
+
|
117 |
+
# Set examples and create DataFrame
|
118 |
+
data = {
|
119 |
+
"PROBLEM": [
|
120 |
+
"What is the disease?",
|
121 |
+
"What are the symptoms of the condition?",
|
122 |
+
"What is the patient's diagnosis?",
|
123 |
+
"What kind of disease is he suffering from?",
|
124 |
+
"What specific medical issue does she have?",
|
125 |
+
"What is the main problem the patient is facing?",
|
126 |
+
"What were the reasons for the patient's hospitalization?"
|
127 |
+
],
|
128 |
+
"DRUG": [
|
129 |
+
"Which medication was prescribed?",
|
130 |
+
"What is the name of the drug used for treatment?",
|
131 |
+
"Which drug is administered for this condition?",
|
132 |
+
"What medication does he take daily?",
|
133 |
+
"What drugs are used to manage his symptoms?",
|
134 |
+
"Which medicine is recommended for this illness?",
|
135 |
+
"What is the prescription for this medical condition?"
|
136 |
+
],
|
137 |
+
"ADMISSION_DATE": [
|
138 |
+
"When was the patient admitted to the hospital?",
|
139 |
+
"What is the date of the patient's admission?",
|
140 |
+
"On which date did the patient enter the clinic?",
|
141 |
+
"When did the patient check into the hospital?",
|
142 |
+
"What is the admission date for the patient?"
|
143 |
+
],
|
144 |
+
"PATIENT_AGE": [
|
145 |
+
"How old is the patient?",
|
146 |
+
"What is the patient's age?",
|
147 |
+
"At what age was the patient diagnosed?",
|
148 |
+
"Can you tell me the age of the patient?",
|
149 |
+
"What is the age of the person receiving treatment?"
|
150 |
+
],
|
151 |
+
"SYMPTOM": [
|
152 |
+
"What symptoms is the patient experiencing?",
|
153 |
+
"What are the signs of the disease?",
|
154 |
+
"Which symptoms did the patient report?",
|
155 |
+
"What were the initial symptoms observed?",
|
156 |
+
"What specific symptoms are present?"
|
157 |
+
],
|
158 |
+
"TREATMENT": [
|
159 |
+
"What treatment plan was recommended?",
|
160 |
+
"Which therapies are being used?",
|
161 |
+
"What is the current treatment protocol?",
|
162 |
+
"What type of treatment is the patient undergoing?",
|
163 |
+
"What are the options for treating this condition?"
|
164 |
+
],
|
165 |
+
"DOCTOR": [
|
166 |
+
"Who is the treating physician?",
|
167 |
+
"Which doctor is handling the case?",
|
168 |
+
"What is the name of the attending doctor?",
|
169 |
+
"Who is the specialist for this illness?",
|
170 |
+
"Can you provide the name of the doctor overseeing the treatment?"
|
171 |
+
]
|
172 |
+
}
|
173 |
+
|
174 |
+
# Pad the shorter prompt lists with None so every column has equal
# length (required to build a rectangular DataFrame below).
max_length = max(len(v) for v in data.values())
for values in data.values():
    # One extend() instead of appending None in a while-loop.
    values.extend([None] * (max_length - len(values)))
|
179 |
+
|
180 |
+
# Create DataFrame and display
|
181 |
+
df = pd.DataFrame(data)
|
182 |
+
df.index += 1
|
183 |
+
st.write("Context DataFrame (Click To Edit)")
|
184 |
+
edited_df = st.data_editor(df)
|
185 |
+
|
186 |
+
# Example sentences
|
187 |
+
examples = [
|
188 |
+
"Dr. Taylor prescribed Lisinopril to a 68-year-old patient with high blood pressure. The patient was admitted to the hospital on April 15, 2024, after experiencing severe hypertension symptoms.",
|
189 |
+
"The 50-year-old male patient reported persistent back pain. The treatment plan includes physical therapy and a medication called Flexeril to alleviate the discomfort.",
|
190 |
+
"The patient was admitted on June 12, 2024, with symptoms of severe abdominal pain. Dr. Kim diagnosed acute appendicitis and recommended immediate surgical intervention.",
|
191 |
+
"A 25-year-old female patient with a history of asthma was treated with Albuterol. Dr. Patel noted that the patient's symptoms of shortness of breath improved significantly after starting the medication.",
|
192 |
+
"The 72-year-old patient underwent surgery for cataracts on May 30, 2024. Dr. Martinez prescribed eye drops and scheduled a follow-up appointment for post-operative care.",
|
193 |
+
"The patient, aged 40, presented with symptoms of chronic fatigue and joint pain. Dr. Nguyen recommended a combination of lifestyle changes and a new drug called Imunorix.",
|
194 |
+
"Dr. Wilson observed that the 34-year-old patientโs condition, diagnosed as lupus, was managed with hydroxychloroquine and a tailored treatment regimen.",
|
195 |
+
"The 56-year-old patient experienced symptoms of severe dehydration due to gastroenteritis. Dr. Rogers provided intravenous fluids and anti-nausea medication as treatment.",
|
196 |
+
"On July 8, 2024, the patient was admitted with acute respiratory distress. Dr. Green prescribed a regimen including corticosteroids and bronchodilators to manage the symptoms.",
|
197 |
+
"The 29-year-old patient, diagnosed with chronic migraines, was treated with a new medication called MigraRelief. The attending physician, Dr. Lewis, also recommended cognitive behavioral therapy."
|
198 |
+
]
|
199 |
+
|
200 |
+
selected_text = st.selectbox("Select an example", examples)
|
201 |
+
custom_input = st.text_input("Try it with your own Sentence!")
|
202 |
+
|
203 |
+
text_to_analyze = custom_input if custom_input else selected_text
|
204 |
+
context_dict = df_to_dict(edited_df)
|
205 |
+
|
206 |
+
# Display example text
|
207 |
+
st.subheader('Full Example Text')
|
208 |
+
HTML_WRAPPER = """<div class="scroll entities" style="overflow-x: auto; border: 1px solid #e6e9ef; border-radius: 0.25rem; padding: 1rem; margin-bottom: 2.5rem; white-space:pre-wrap">{}</div>"""
|
209 |
+
st.markdown(HTML_WRAPPER.format(text_to_analyze), unsafe_allow_html=True)
|
210 |
+
|
211 |
+
# Initialize Spark and create pipeline
|
212 |
+
spark = init_spark()
|
213 |
+
pipeline = create_pipeline(model, context_dict)
|
214 |
+
output = fit_data(pipeline, text_to_analyze)
|
215 |
+
|
216 |
+
# Display processed output
|
217 |
+
st.subheader("Processed Output:")
|
218 |
+
results = {
|
219 |
+
'Document': output[0]['document'][0].result,
|
220 |
+
'NER Chunk': [n.result for n in output[0]['ner_chunk']],
|
221 |
+
"NER Label": [n.metadata['entity'] for n in output[0]['ner_chunk']]
|
222 |
+
}
|
223 |
+
annotate(results)
|
224 |
+
|
225 |
+
with st.expander("View DataFrame"):
|
226 |
+
df = pd.DataFrame({'NER Chunk': results['NER Chunk'], 'NER Label': results['NER Label']})
|
227 |
+
df.index += 1
|
228 |
+
st.dataframe(df)
|
Dockerfile
ADDED
@@ -0,0 +1,72 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Download base image ubuntu 18.04
|
2 |
+
FROM ubuntu:18.04
|
3 |
+
|
4 |
+
# Set environment variables
|
5 |
+
ENV NB_USER jovyan
|
6 |
+
ENV NB_UID 1000
|
7 |
+
ENV HOME /home/${NB_USER}
|
8 |
+
ENV JAVA_HOME /usr/lib/jvm/java-8-openjdk-amd64/
|
9 |
+
|
10 |
+
# Install required packages
|
11 |
+
RUN apt-get update && apt-get install -y \
|
12 |
+
tar \
|
13 |
+
wget \
|
14 |
+
bash \
|
15 |
+
rsync \
|
16 |
+
gcc \
|
17 |
+
libfreetype6-dev \
|
18 |
+
libhdf5-serial-dev \
|
19 |
+
libpng-dev \
|
20 |
+
libzmq3-dev \
|
21 |
+
python3 \
|
22 |
+
python3-dev \
|
23 |
+
python3-pip \
|
24 |
+
unzip \
|
25 |
+
pkg-config \
|
26 |
+
software-properties-common \
|
27 |
+
graphviz \
|
28 |
+
openjdk-8-jdk \
|
29 |
+
ant \
|
30 |
+
ca-certificates-java \
|
31 |
+
&& apt-get clean \
|
32 |
+
&& update-ca-certificates -f
|
33 |
+
|
34 |
+
# Install Python 3.8 and pip
|
35 |
+
RUN add-apt-repository ppa:deadsnakes/ppa \
|
36 |
+
&& apt-get update \
|
37 |
+
&& apt-get install -y python3.8 python3-pip \
|
38 |
+
&& apt-get clean
|
39 |
+
|
40 |
+
# Set up JAVA_HOME
|
41 |
+
RUN echo "export JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64/" >> /etc/profile \
|
42 |
+
&& echo "export PATH=\$JAVA_HOME/bin:\$PATH" >> /etc/profile
|
43 |
+
# Create a new user named "jovyan" with user ID 1000
|
44 |
+
RUN useradd -m -u ${NB_UID} ${NB_USER}
|
45 |
+
|
46 |
+
# Switch to the "jovyan" user
|
47 |
+
USER ${NB_USER}
|
48 |
+
|
49 |
+
# Set home and path variables for the user
|
50 |
+
ENV HOME=/home/${NB_USER} \
|
51 |
+
PATH=/home/${NB_USER}/.local/bin:$PATH
|
52 |
+
|
53 |
+
# Set up PySpark to use Python 3.8 for both driver and workers
|
54 |
+
ENV PYSPARK_PYTHON=/usr/bin/python3.8
|
55 |
+
ENV PYSPARK_DRIVER_PYTHON=/usr/bin/python3.8
|
56 |
+
|
57 |
+
# Set the working directory to the user's home directory
|
58 |
+
WORKDIR ${HOME}
|
59 |
+
|
60 |
+
# Upgrade pip and install Python dependencies
|
61 |
+
RUN python3.8 -m pip install --upgrade pip
|
62 |
+
COPY requirements.txt /tmp/requirements.txt
|
63 |
+
RUN python3.8 -m pip install -r /tmp/requirements.txt
|
64 |
+
|
65 |
+
# Copy the application code into the container at /home/jovyan
|
66 |
+
COPY --chown=${NB_USER}:${NB_USER} . ${HOME}
|
67 |
+
|
68 |
+
# Expose port for Streamlit
|
69 |
+
EXPOSE 7860
|
70 |
+
|
71 |
+
# Define the entry point for the container
|
72 |
+
ENTRYPOINT ["streamlit", "run", "Demo.py", "--server.port=7860", "--server.address=0.0.0.0"]
|
inputs/bert_token_classifier_hi_en_ner/Example1.txt
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
เคเคถเคฟเคฏเคจ-เคชเฅเคเคเฅเคธ เคฒเคฟเคฎเคฟเคเฅเคก (Asian Paints Limited) เคเค เคญเคพเคฐเคคเฅเคฏ เคฌเคนเฅเคฐเคพเคทเฅเคเฅเคฐเฅเคฏ เคตเฅเคฏเคพเคชเคพเคฐ เคนเฅ เคเคฟเคธเคเคพ เคฎเฅเคเฅเคฏเคพเคฒเคฏ เคฎเฅเคเคฌเค (...
|
2 |
+
เคเคถเคฟเคฏเคจ-เคชเฅเคเคเฅเคธ เคฒเคฟเคฎเคฟเคเฅเคก (Asian Paints Limited) เคเค เคญเคพเคฐเคคเฅเคฏ เคฌเคนเฅเคฐเคพเคทเฅเคเฅเคฐเฅเคฏ เคตเฅเคฏเคพเคชเคพเคฐ เคนเฅ เคเคฟเคธเคเคพ เคฎเฅเคเฅเคฏเคพเคฒเคฏ เคฎเฅเคเคฌเค (Mumbai), เคฎเคนเคพเคฐเคพเคทเฅเคเฅเคฐ (Maharashtra) เคฎเฅเค เคนเฅเฅค เคฏเฅ เคตเฅเคฏเคพเคชเคพเคฐ, เคฐเคเค, เคเคฐ เคเฅ เคธเคเคพเคตเค, เคซเคฟเคเคฟเคเค เคธเฅ เคธเคเคฌเคเคงเคฟเคค เคเคคเฅเคชเคพเคฆเฅเค เคเคฐ เคธเคเคฌเคเคงเคฟเคค เคธเฅเคตเคพเคเค เคชเฅเคฐเคฆเคพเคจ เคเคฐเคจเฅ, เคจเคฟเคฐเฅเคฎเคพเคฃ, เคฌเคฟเคเฅเคฐเฅ เคเคฐ เคตเคฟเคคเคฐเคฃ เคเฅ เคตเฅเคฏเคตเคธเคพเคฏ เคฎเฅเค เคฒเคเฅ เคนเฅเค เคนเฅเฅค
|
inputs/bert_token_classifier_hi_en_ner/Example2.txt
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
เคฐเคฟเคฒเคพเคฏเคเคธ เคเคเคกเคธเฅเคเฅเคฐเฅเคเคผ เคฒเคฟเคฎเคฟเคเฅเคก (Reliance Industries Limited) เคเค เคญเคพเคฐเคคเฅเคฏ เคธเคเคเฅเคเคฟเคเคพ เคจเคฟเคฏเคเคคเฅเคฐเค เคเคเคชเคจเฅ เคนเฅ, เคเคฟเคธเค...
|
2 |
+
เคฐเคฟเคฒเคพเคฏเคเคธ เคเคเคกเคธเฅเคเฅเคฐเฅเคเคผ เคฒเคฟเคฎเคฟเคเฅเคก (Reliance Industries Limited) เคเค เคญเคพเคฐเคคเฅเคฏ เคธเคเคเฅเคเคฟเคเคพ เคจเคฟเคฏเคเคคเฅเคฐเค เคเคเคชเคจเฅ เคนเฅ, เคเคฟเคธเคเคพ เคฎเฅเคเฅเคฏเคพเคฒเคฏ เคฎเฅเคเคฌเค, เคฎเคนเคพเคฐเคพเคทเฅเคเฅเคฐ (Maharashtra) เคฎเฅเค เคธเฅเคฅเคฟเคค เคนเฅเฅคเคฐเคคเคจ เคจเคตเคฒ เคเคพเคเคพ (28 เคฆเคฟเคธเคเคฌเคฐ 1937, เคเฅ เคฎเฅเคฎเฅเคฌเค , เคฎเฅเค เคเคจเฅเคฎเฅ) เคเคพเคเคพ เคธเคฎเฅเคน เคเฅ เคตเคฐเฅเคคเคฎเคพเคจ เค
เคงเฅเคฏเคเฅเคท, เคเฅ เคญเคพเคฐเคค เคเฅ เคธเคฌเคธเฅ เคฌเคกเคผเฅ เคตเฅเคฏเคพเคชเคพเคฐเคฟเค เคธเคฎเฅเคน เคนเฅ, เคเคฟเคธเคเฅ เคธเฅเคฅเคพเคชเคจเคพ เคเคฎเคถเฅเคฆเคเฅ เคเคพเคเคพ เคจเฅ เคเฅ เคเคฐ เคเคจเคเฅ เคชเคฐเคฟเคตเคพเคฐ เคเฅ เคชเฅเคขเคฟเคฏเฅเค เคจเฅ เคเคธเคเคพ เคตเคฟเคธเฅเคคเคพเคฐ เคเคฟเคฏเคพ เคเคฐ เคเคธเฅ เคฆเฅเคขเคผ เคฌเคจเคพเคฏเคพเฅค
|
inputs/bert_token_classifier_hi_en_ner/Example3.txt
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
เคฒเคฟเคฏเฅเคจเฅเคฒ เคฎเฅเคธเฅเคธเฅ (Lionel Messi); (เคเคจเฅเคฎ 24 เคเฅเคจ 1987) เค
เคฐเฅเคเฅเคเคเฅเคจเคพ (Argentina) เคเฅ เคซเคผเฅเคเคฌเฅเคฒ เคเคฟเคฒเคพเคกเคผเฅ เคนเฅเค, เคเฅ ...
|
2 |
+
เคฒเคฟเคฏเฅเคจเฅเคฒ เคฎเฅเคธเฅเคธเฅ (Lionel Messi); (เคเคจเฅเคฎ 24 เคเฅเคจ 1987) เค
เคฐเฅเคเฅเคเคเฅเคจเคพ (Argentina) เคเฅ เคซเคผเฅเคเคฌเฅเคฒ เคเคฟเคฒเคพเคกเคผเฅ เคนเฅเค, เคเฅ เคเคธ เคธเคฎเคฏ เคชเฅ.เคเคธ.เคเฅ เคเฅเคฎ เคชเฅเคฐเคฟเคธ เคธเฅเคเค-เคเคฐเฅเคฎเฅเคจ (Paris Saint-Germain Football Club) เคเคฐ เค
เคฐเฅเคเฅเคเคเฅเคจเคพ (Argentina) เคเฅ เคฐเคพเคทเฅเคเฅเคฐเฅเคฏ เคเฅเคฎ เคเฅ เคฒเคฟเค เคเฅเคฒเคคเฅ เคนเฅเคเฅค'
|
inputs/bert_token_classifier_hi_en_ner/Example4.txt
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
เคเฅเคฐเคฟเคธเฅเคเคฟเคฏเคพเคจเฅ เคฐเฅเคจเคพเคฒเฅเคกเฅ (Cristiano Ronaldo), (เคเคจเฅเคฎ: 5 เคซเคฐเคตเคฐเฅ 1985), เคเค เคชเฅเคฐเฅเคคเคเคพเคฒเฅ (Portugal) เคชเฅเคถเฅเคตเคฐ เคซเฅเคเคฌ...
|
2 |
+
เคเฅเคฐเคฟเคธเฅเคเคฟเคฏเคพเคจเฅ เคฐเฅเคจเคพเคฒเฅเคกเฅ (Cristiano Ronaldo), (เคเคจเฅเคฎ: 5 เคซเคฐเคตเคฐเฅ 1985), เคเค เคชเฅเคฐเฅเคคเคเคพเคฒเฅ (Portugal) เคชเฅเคถเฅเคตเคฐ เคซเฅเคเคฌเฅเคฒ เคเคฟเคฒเคพเคกเคผเฅ เคนเฅ, เคเฅ เคฎเฅเคจเคเฅเคธเฅเคเคฐ เคฏเฅเคจเคพเคเคเฅเคก (Manchester United) เคเฅ เคฒเคฟเค เคเคฐ เคชเฅเคฐเฅเคคเคเคพเคฒ (Portugal) เคฐเคพเคทเฅเคเฅเคฐเฅเคฏ เคซเฅเคเคฌเฅเคฒ เคเฅ เคเคชเฅเคคเคพเคจ เคนเฅเคเฅค
|
inputs/bert_token_classifier_hi_en_ner/Example5.txt
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
เคตเฅเคฐเฅเคจ เคเคกเคตเคฐเฅเคก เคฌเคซเฅเค (Warren Buffet) (เค
เคเคธเฅเคค 30 (August 30), 1930 เคเฅ เคเคฎเคพเคนเคพ (Omaha), เคจเฅเคฌเฅเคฐเคพเคธเฅเคเคพ (Nebraska...
|
2 |
+
เคตเฅเคฐเฅเคจ เคเคกเคตเคฐเฅเคก เคฌเคซเฅเค (Warren Buffet) (เค
เคเคธเฅเคค 30 (August 30), 1930 เคเฅ เคเคฎเคพเคนเคพ (Omaha), เคจเฅเคฌเฅเคฐเคพเคธเฅเคเคพ (Nebraska) เคฎเฅเค เคชเฅเคฆเคพ เคนเฅเค) เคเค เค
เคฎเฅเคฐเคฟเคเฅ เคจเคฟเคตเฅเคถเค (investor), เคตเฅเคฏเคตเคธเคพเคฏเฅ เคเคฐ เคชเคฐเฅเคชเคเคพเคฐเฅ (philanthropist) เคตเฅเคฏเคเฅเคคเคฟเคคเฅเคต เคนเฅเคเฅค
|
pages/Workflow & Model Overview.py
ADDED
@@ -0,0 +1,255 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
import pandas as pd
|
3 |
+
|
4 |
+
# Custom CSS for better styling
|
5 |
+
st.markdown("""
|
6 |
+
<style>
|
7 |
+
.main-title {
|
8 |
+
font-size: 36px;
|
9 |
+
color: #4A90E2;
|
10 |
+
font-weight: bold;
|
11 |
+
text-align: center;
|
12 |
+
}
|
13 |
+
.sub-title {
|
14 |
+
font-size: 24px;
|
15 |
+
color: #4A90E2;
|
16 |
+
margin-top: 20px;
|
17 |
+
}
|
18 |
+
.section {
|
19 |
+
background-color: #f9f9f9;
|
20 |
+
padding: 15px;
|
21 |
+
border-radius: 10px;
|
22 |
+
margin-top: 20px;
|
23 |
+
}
|
24 |
+
.section p, .section ul {
|
25 |
+
color: #666666;
|
26 |
+
}
|
27 |
+
.link {
|
28 |
+
color: #4A90E2;
|
29 |
+
text-decoration: none;
|
30 |
+
}
|
31 |
+
h2 {
|
32 |
+
color: #4A90E2;
|
33 |
+
font-size: 28px;
|
34 |
+
font-weight: bold;
|
35 |
+
margin-top: 30px;
|
36 |
+
}
|
37 |
+
h3 {
|
38 |
+
color: #4A90E2;
|
39 |
+
font-size: 22px;
|
40 |
+
font-weight: bold;
|
41 |
+
margin-top: 20px;
|
42 |
+
}
|
43 |
+
h4 {
|
44 |
+
color: #4A90E2;
|
45 |
+
font-size: 18px;
|
46 |
+
font-weight: bold;
|
47 |
+
margin-top: 15px;
|
48 |
+
}
|
49 |
+
</style>
|
50 |
+
""", unsafe_allow_html=True)
|
51 |
+
|
52 |
+
# Main Title
|
53 |
+
st.markdown('<div class="main-title">Zero-Shot Named Entity Recognition (NER) with Spark NLP</div>', unsafe_allow_html=True)
|
54 |
+
|
55 |
+
# Overview Section
|
56 |
+
st.markdown("""
|
57 |
+
<div class="section">
|
58 |
+
<p>Named Entity Recognition (NER) is a crucial task in Natural Language Processing (NLP) that involves identifying entities such as names, places, dates, and other types of information within text. Traditional NER models require extensive labeled data to train on specific entity types, which can be time-consuming and expensive to acquire.</p>
|
59 |
+
<p>Zero-Shot NER, however, is a game-changing approach that enables you to recognize new entities without the need for labeled training data.</p>
|
60 |
+
</div>
|
61 |
+
""", unsafe_allow_html=True)
|
62 |
+
|
63 |
+
# What is Zero-Shot NER?
|
64 |
+
st.markdown("""
|
65 |
+
<div class="section">
|
66 |
+
<h3>What is Zero-Shot NER?</h3>
|
67 |
+
<p>Zero-Shot Named Entity Recognition is a technique that allows models to identify and classify entities in text without having been explicitly trained on those specific categories. Instead of relying on labeled datasets, Zero-Shot NER models use pretrained language models, such as RoBERTa, and a set of entity definitions provided at inference time.</p>
|
68 |
+
<p>This means you can define new entity types on the fly, making the model highly adaptable to different domains and tasks.</p>
|
69 |
+
</div>
|
70 |
+
""", unsafe_allow_html=True)
|
71 |
+
|
72 |
+
# Why Use Zero-Shot NER?
|
73 |
+
st.markdown("""
|
74 |
+
<div class="section">
|
75 |
+
<h3>Why Use Zero-Shot NER?</h3>
|
76 |
+
<p>Zero-Shot NER is particularly useful when:</p>
|
77 |
+
<ul>
|
78 |
+
<li>You need to recognize entities that were not included in the training data.</li>
|
79 |
+
<li>You are working with specialized or domain-specific texts where predefined NER models may not perform well.</li>
|
80 |
+
<li>Speed is critical, and you want to avoid the time-consuming process of labeling and training a new model.</li>
|
81 |
+
</ul>
|
82 |
+
<p>By using Zero-Shot NER, you can easily extend the model to recognize new entities by simply providing relevant prompts or questions.</p>
|
83 |
+
</div>
|
84 |
+
""", unsafe_allow_html=True)
|
85 |
+
|
86 |
+
# Use Cases
|
87 |
+
st.markdown("""
|
88 |
+
<div class="section">
|
89 |
+
<h3>Where to Use Zero-Shot NER?</h3>
|
90 |
+
<p>Zero-Shot NER can be applied in various scenarios, including:</p>
|
91 |
+
<ul>
|
92 |
+
<li><b>Custom Entity Recognition</b>: Quickly adapt to new types of entities as they emerge in your data.</li>
|
93 |
+
<li><b>Legal and Medical Texts</b>: Recognize domain-specific entities without needing a domain-specific training set.</li>
|
94 |
+
<li><b>Multilingual Applications</b>: Use Zero-Shot NER for languages with limited annotated data.</li>
|
95 |
+
<li><b>Rapid Prototyping</b>: Experiment with different entity types without waiting for data annotation and model training.</li>
|
96 |
+
</ul>
|
97 |
+
</div>
|
98 |
+
""", unsafe_allow_html=True)
|
99 |
+
|
100 |
+
# Pipeline and Results
|
101 |
+
st.markdown('<div class="sub-title">Pipeline and Results</div>', unsafe_allow_html=True)
|
102 |
+
|
103 |
+
st.markdown("""
|
104 |
+
<div class="section">
|
105 |
+
<p>In this section, weโll build a Spark NLP pipeline to perform Zero-Shot NER using a pretrained RoBERTa model. We'll define two entity types, "NAME" and "CITY," and demonstrate how the model identifies these entities in text.</p>
|
106 |
+
</div>
|
107 |
+
""", unsafe_allow_html=True)
|
108 |
+
|
109 |
+
# Step 1: Creating the Data
|
110 |
+
st.markdown("""
|
111 |
+
<div class="section">
|
112 |
+
<h4>Step 1: Creating the Data</h4>
|
113 |
+
<p>We'll start by creating a Spark DataFrame that includes a few sample sentences for testing Zero-Shot NER.</p>
|
114 |
+
""", unsafe_allow_html=True)
|
115 |
+
|
116 |
+
st.code("""
|
117 |
+
data = spark.createDataFrame([
|
118 |
+
"Hellen works in London, Paris and Berlin. My name is Clara, I live in New York and Hellen lives in Paris.",
|
119 |
+
"John is a man who works in London, London and London."
|
120 |
+
], StringType()).toDF("text")
|
121 |
+
""", language="python")
|
122 |
+
|
123 |
+
# Step 2: Assembling the Pipeline
|
124 |
+
st.markdown("""
|
125 |
+
<div class="section">
|
126 |
+
<h4>Step 2: Assembling the Pipeline</h4>
|
127 |
+
<p>Next, we'll set up the pipeline with the necessary annotators, including the ZeroShotNerModel.</p>
|
128 |
+
""", unsafe_allow_html=True)
|
129 |
+
|
130 |
+
st.code("""
|
131 |
+
from sparknlp.annotator import ZeroShotNerModel, SentenceDetector, Tokenizer
|
132 |
+
from sparknlp.base import DocumentAssembler, NerConverter
|
133 |
+
from pyspark.ml import Pipeline
|
134 |
+
|
135 |
+
# Step 1: Document Assembler
|
136 |
+
documentAssembler = DocumentAssembler() \\
|
137 |
+
.setInputCol("text") \\
|
138 |
+
.setOutputCol("document")
|
139 |
+
|
140 |
+
# Step 2: Sentence Detection
|
141 |
+
sentenceDetector = SentenceDetector() \\
|
142 |
+
.setInputCols(["document"]) \\
|
143 |
+
.setOutputCol("sentence")
|
144 |
+
|
145 |
+
# Step 3: Tokenization
|
146 |
+
tokenizer = Tokenizer() \\
|
147 |
+
.setInputCols(["sentence"]) \\
|
148 |
+
.setOutputCol("token")
|
149 |
+
|
150 |
+
# Step 4: Zero-Shot NER Model
|
151 |
+
zero_shot_ner = ZeroShotNerModel.pretrained("zero_shot_ner_roberta", "en")\\
|
152 |
+
.setInputCols(["sentence", "token"])\\
|
153 |
+
.setOutputCol("zero_shot_ner")\\
|
154 |
+
.setEntityDefinitions(
|
155 |
+
{
|
156 |
+
"NAME": ["What is his name?", "What is my name?", "What is her name?"],
|
157 |
+
"CITY": ["Which city?", "Which is the city?"]
|
158 |
+
})
|
159 |
+
|
160 |
+
# Step 5: NER Converter
|
161 |
+
ner_converter = NerConverter()\\
|
162 |
+
.setInputCols(["sentence", "token", "zero_shot_ner"])\\
|
163 |
+
.setOutputCol("ner_chunk")
|
164 |
+
|
165 |
+
# Define the pipeline
|
166 |
+
pipeline = Pipeline(stages=[
|
167 |
+
documentAssembler,
|
168 |
+
sentenceDetector,
|
169 |
+
tokenizer,
|
170 |
+
zero_shot_ner,
|
171 |
+
ner_converter
|
172 |
+
])
|
173 |
+
|
174 |
+
# Fit and transform data
|
175 |
+
result = pipeline.fit(data).transform(data)
|
176 |
+
""", language="python")
|
177 |
+
|
178 |
+
# Step 3: Viewing the Results
|
179 |
+
st.markdown("""
|
180 |
+
<div class="section">
|
181 |
+
<h4>Step 3: Viewing the Results</h4>
|
182 |
+
<p>After processing the data through the pipeline, you can inspect the recognized entities:</p>
|
183 |
+
""", unsafe_allow_html=True)
|
184 |
+
|
185 |
+
st.code("""
|
186 |
+
# View NER Results:
|
187 |
+
|
188 |
+
from pyspark.sql.functions import explode, expr, col
|
189 |
+
|
190 |
+
result.select(
|
191 |
+
expr("explode(ner_chunk) as ner_chunk")
|
192 |
+
).select(
|
193 |
+
col("ner_chunk.result").alias("chunk"),
|
194 |
+
col("ner_chunk.metadata").getItem("entity").alias("ner_label")
|
195 |
+
).show(truncate=False)
|
196 |
+
""", language="python")
|
197 |
+
|
198 |
+
st.text("""
|
199 |
+
+--------+---------+
|
200 |
+
|chunk |ner_label|
|
201 |
+
+--------+---------+
|
202 |
+
|Hellen |NAME |
|
203 |
+
|London |CITY |
|
204 |
+
|Paris |CITY |
|
205 |
+
|Berlin |CITY |
|
206 |
+
|Clara |NAME |
|
207 |
+
|New York|CITY |
|
208 |
+
|Hellen |NAME |
|
209 |
+
|Paris |CITY |
|
210 |
+
|John |NAME |
|
211 |
+
|London |CITY |
|
212 |
+
|London |CITY |
|
213 |
+
|London |CITY |
|
214 |
+
+--------+---------+
|
215 |
+
""")
|
216 |
+
|
217 |
+
# Model Information and Use Cases
|
218 |
+
st.markdown("""
|
219 |
+
<div class="section">
|
220 |
+
<h4>Model Information and Use Cases</h4>
|
221 |
+
<p>The model used in this example is <code>zero_shot_ner_roberta</code>, which is compatible with Spark NLP 4.3.0+ and is trained with RoBERTa embeddings. This model is capable of recognizing any entity defined at runtime, making it versatile for various applications.</p>
|
222 |
+
<ul>
|
223 |
+
<li><b>Model Name:</b> zero_shot_ner_roberta</li>
|
224 |
+
<li><b>Language:</b> English</li>
|
225 |
+
<li><b>Size:</b> 463.8 MB</li>
|
226 |
+
<li><b>Case Sensitive:</b> true</li>
|
227 |
+
<li><b>Supported Entities:</b> Defined at runtime</li>
|
228 |
+
<li><b>Task:</b> NER</li>
|
229 |
+
</ul>
|
230 |
+
<p>More models and embeddings can be found at <a href="https://nlp.johnsnowlabs.com/models" class="link">John Snow Labs' Model Hub</a>.</p>
|
231 |
+
</div>
|
232 |
+
""", unsafe_allow_html=True)
|
233 |
+
|
234 |
+
# Conclusion
|
235 |
+
st.markdown('<div class="sub-title">Conclusion</div>', unsafe_allow_html=True)
|
236 |
+
|
237 |
+
st.markdown("""
|
238 |
+
<div class="section">
|
239 |
+
<p>Zero-Shot NER is a powerful tool that allows you to recognize new entities without the need for labeled training data. By leveraging pretrained models and defining entity types at runtime, you can quickly adapt to new domains and tasks.</p>
|
240 |
+
<p>Try implementing Zero-Shot NER in your own projects to see how it can simplify your workflow and extend the capabilities of your NLP models.</p>
|
241 |
+
</div>
|
242 |
+
""", unsafe_allow_html=True)
|
243 |
+
|
244 |
+
# Additional Resources
|
245 |
+
st.markdown("""
|
246 |
+
<div class="section">
|
247 |
+
<h4>Additional Resources</h4>
|
248 |
+
<ul>
|
249 |
+
<li><a href="https://nlp.johnsnowlabs.com/" class="link">Spark NLP Official Website</a></li>
|
250 |
+
<li><a href="https://nlp.johnsnowlabs.com/docs/en/licensed/models" class="link">Spark NLP Models</a></li>
|
251 |
+
<li><a href="https://github.com/JohnSnowLabs/spark-nlp" class="link">GitHub Repository</a></li>
|
252 |
+
<li><a href="https://colab.research.google.com/drive/1-9x0hSoVhHb3Fq8UqMeZrbHMYaK0YHj7" class="link">Zero-Shot NER with Spark NLP - Google Colab Notebook</a></li>
|
253 |
+
</ul>
|
254 |
+
</div>
|
255 |
+
""", unsafe_allow_html=True)
|
requirements.txt
ADDED
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
streamlit
|
2 |
+
st-annotated-text
|
3 |
+
pandas
|
4 |
+
numpy
|
5 |
+
spark-nlp
|
6 |
+
pyspark
|