abdullahmubeen10 commited on
Commit
38becfe
โ€ข
1 Parent(s): c7bee95

Upload 10 files

Browse files
.streamlit/config.toml ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ [theme]
2
+ base="light"
3
+ primaryColor="#29B4E8"
Demo.py ADDED
@@ -0,0 +1,228 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import sparknlp
3
+ import os
4
+ import pandas as pd
5
+ from sparknlp.base import *
6
+ from sparknlp.annotator import *
7
+ from pyspark.ml import Pipeline
8
+ from sparknlp.pretrained import PretrainedPipeline
9
+ from annotated_text import annotated_text
10
+
11
+ # Page configuration
12
+ st.set_page_config(layout="wide", initial_sidebar_state="auto")
13
+
14
+ # CSS for styling
15
+ st.markdown("""
16
+ <style>
17
+ .main-title {
18
+ font-size: 36px;
19
+ color: #4A90E2;
20
+ font-weight: bold;
21
+ text-align: center;
22
+ }
23
+ .section {
24
+ background-color: #f9f9f9;
25
+ padding: 10px;
26
+ border-radius: 10px;
27
+ margin-top: 10px;
28
+ }
29
+ .section p, .section ul {
30
+ color: #666666;
31
+ }
32
+ </style>
33
+ """, unsafe_allow_html=True)
34
+
35
# Initialize Spark NLP (cached so the session starts only once per app run)
@st.cache_resource
def init_spark():
    """Start the Spark NLP session and reuse it across Streamlit reruns."""
    return sparknlp.start()
39
+
40
# Create the NER pipeline
@st.cache_resource
def create_pipeline(model, context_dict):
    """Build the zero-shot NER Spark pipeline.

    Args:
        model: Name of the pretrained ``ZeroShotNerModel`` to load
            (e.g. ``"zero_shot_ner_roberta"``).
        context_dict: Mapping of entity label -> list of prompt questions,
            passed to ``setEntityDefinitions``.

    Returns:
        An unfitted pyspark ``Pipeline`` with document assembly, sentence
        detection, tokenization, zero-shot NER and chunk-conversion stages.
    """
    documentAssembler = DocumentAssembler() \
        .setInputCol("text") \
        .setOutputCol("document")

    sentenceDetector = SentenceDetector() \
        .setInputCols(["document"]) \
        .setOutputCol("sentence")

    tokenizer = Tokenizer() \
        .setInputCols(["sentence"]) \
        .setOutputCol("token")

    zero_shot_ner = ZeroShotNerModel.pretrained(model, "en") \
        .setInputCols(["sentence", "token"]) \
        .setOutputCol("zero_shot_ner") \
        .setEntityDefinitions(context_dict)

    # Fix: the original call chain ended with a stray trailing backslash
    # after setOutputCol, silently continuing the statement onto the next
    # (blank) line — a latent syntax hazard removed here.
    ner_converter = NerConverter() \
        .setInputCols(["sentence", "token", "zero_shot_ner"]) \
        .setOutputCol("ner_chunk")

    pipeline = Pipeline(stages=[documentAssembler, sentenceDetector,
                                tokenizer, zero_shot_ner, ner_converter])
    return pipeline
66
+
67
# Fit data using the pipeline
def fit_data(pipeline, data):
    """Fit `pipeline` on an empty bootstrap frame, then annotate `data`.

    A LightPipeline is used so a single string can be annotated quickly
    without launching a full Spark job.
    """
    bootstrap_df = spark.createDataFrame([['']]).toDF('text')
    fitted_model = pipeline.fit(bootstrap_df)
    light_pipeline = LightPipeline(fitted_model)
    return light_pipeline.fullAnnotate(data)
74
+
75
# Annotate the text with NER results
def build_annotation_segments(document, chunks, labels):
    """Split `document` into plain-text segments and (chunk, label) tuples.

    Internal helper for :func:`annotate`. Chunks are matched left-to-right
    on their first occurrence in the remaining text.

    Fix: the original used ``document.split(chunk, 1)`` and indexed
    ``parts[1]`` unconditionally, which raises IndexError whenever a chunk
    is not found (e.g. after the user edits the text, or when chunks
    overlap). A missing chunk is now skipped instead.
    """
    segments = []
    for chunk, label in zip(chunks, labels):
        before, match, after = document.partition(chunk)
        if not match:
            # Chunk not present in the remaining text — skip, don't crash.
            continue
        if before:
            segments.append(before)
        segments.append((chunk, label))
        document = after
    if document:
        segments.append(document)
    return segments


def annotate(data):
    """Render the NER result with inline highlighted entities.

    Args:
        data: Dict with keys "Document" (full text), "NER Chunk" (entity
            strings) and "NER Label" (labels aligned with the chunks).
    """
    segments = build_annotation_segments(
        data["Document"], data["NER Chunk"], data["NER Label"]
    )
    annotated_text(*segments)
88
+
89
def df_to_dict(df):
    """Collect each column's non-null values into a dict.

    Columns whose values are all null are dropped. The result maps
    column name -> list of values, the shape expected by
    ``ZeroShotNerModel.setEntityDefinitions``.
    """
    return {
        column: values
        for column in df.columns
        if (values := df[column].dropna().tolist())
    }
96
+
97
+ # Sidebar content
98
+ model = st.sidebar.selectbox(
99
+ "Choose the pretrained model",
100
+ ["zero_shot_ner_roberta"],
101
+ help="For more info about the models visit: https://sparknlp.org/models"
102
+ )
103
+
104
+ # Set up the page layout
105
+ st.markdown('<div class="main-title">Zero-Shot Named Entity Recognition (NER)</div>', unsafe_allow_html=True)
106
+ st.markdown('<div class="section"><p>Explore Zero-Shot Named Entity Recognition (NER)โ€”a state-of-the-art technique that detects and classifies named entities in text without needing specific training on annotated datasets. With our interactive interface, you can modify the context by editing the DataFrame to define custom entity types and examples. Then, input your own text or select from predefined examples to see how the model identifies and categorizes entities in real time.</p></div>', unsafe_allow_html=True)
107
+
108
+ # Reference notebook link in sidebar
109
+ link = """
110
+ <a href="https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Healthcare/1.6.ZeroShot_Clinical_NER.ipynb">
111
+ <img src="https://colab.research.google.com/assets/colab-badge.svg" style="zoom: 1.3" alt="Open In Colab"/>
112
+ </a>
113
+ """
114
+ st.sidebar.markdown('Reference notebook:')
115
+ st.sidebar.markdown(link, unsafe_allow_html=True)
116
+
117
+ # Set examples and create DataFrame
118
+ data = {
119
+ "PROBLEM": [
120
+ "What is the disease?",
121
+ "What are the symptoms of the condition?",
122
+ "What is the patient's diagnosis?",
123
+ "What kind of disease is he suffering from?",
124
+ "What specific medical issue does she have?",
125
+ "What is the main problem the patient is facing?",
126
+ "What were the reasons for the patient's hospitalization?"
127
+ ],
128
+ "DRUG": [
129
+ "Which medication was prescribed?",
130
+ "What is the name of the drug used for treatment?",
131
+ "Which drug is administered for this condition?",
132
+ "What medication does he take daily?",
133
+ "What drugs are used to manage his symptoms?",
134
+ "Which medicine is recommended for this illness?",
135
+ "What is the prescription for this medical condition?"
136
+ ],
137
+ "ADMISSION_DATE": [
138
+ "When was the patient admitted to the hospital?",
139
+ "What is the date of the patient's admission?",
140
+ "On which date did the patient enter the clinic?",
141
+ "When did the patient check into the hospital?",
142
+ "What is the admission date for the patient?"
143
+ ],
144
+ "PATIENT_AGE": [
145
+ "How old is the patient?",
146
+ "What is the patient's age?",
147
+ "At what age was the patient diagnosed?",
148
+ "Can you tell me the age of the patient?",
149
+ "What is the age of the person receiving treatment?"
150
+ ],
151
+ "SYMPTOM": [
152
+ "What symptoms is the patient experiencing?",
153
+ "What are the signs of the disease?",
154
+ "Which symptoms did the patient report?",
155
+ "What were the initial symptoms observed?",
156
+ "What specific symptoms are present?"
157
+ ],
158
+ "TREATMENT": [
159
+ "What treatment plan was recommended?",
160
+ "Which therapies are being used?",
161
+ "What is the current treatment protocol?",
162
+ "What type of treatment is the patient undergoing?",
163
+ "What are the options for treating this condition?"
164
+ ],
165
+ "DOCTOR": [
166
+ "Who is the treating physician?",
167
+ "Which doctor is handling the case?",
168
+ "What is the name of the attending doctor?",
169
+ "Who is the specialist for this illness?",
170
+ "Can you provide the name of the doctor overseeing the treatment?"
171
+ ]
172
+ }
173
+
174
+ # Pad shorter lists with None so every column has the same length
175
+ max_length = max(len(v) for v in data.values())
176
+ for key in data.keys():
177
+ while len(data[key]) < max_length:
178
+ data[key].append(None)
179
+
180
+ # Create DataFrame and display
181
+ df = pd.DataFrame(data)
182
+ df.index += 1
183
+ st.write("Context DataFrame (Click To Edit)")
184
+ edited_df = st.data_editor(df)
185
+
186
+ # Example sentences
187
+ examples = [
188
+ "Dr. Taylor prescribed Lisinopril to a 68-year-old patient with high blood pressure. The patient was admitted to the hospital on April 15, 2024, after experiencing severe hypertension symptoms.",
189
+ "The 50-year-old male patient reported persistent back pain. The treatment plan includes physical therapy and a medication called Flexeril to alleviate the discomfort.",
190
+ "The patient was admitted on June 12, 2024, with symptoms of severe abdominal pain. Dr. Kim diagnosed acute appendicitis and recommended immediate surgical intervention.",
191
+ "A 25-year-old female patient with a history of asthma was treated with Albuterol. Dr. Patel noted that the patient's symptoms of shortness of breath improved significantly after starting the medication.",
192
+ "The 72-year-old patient underwent surgery for cataracts on May 30, 2024. Dr. Martinez prescribed eye drops and scheduled a follow-up appointment for post-operative care.",
193
+ "The patient, aged 40, presented with symptoms of chronic fatigue and joint pain. Dr. Nguyen recommended a combination of lifestyle changes and a new drug called Imunorix.",
194
+ "Dr. Wilson observed that the 34-year-old patientโ€™s condition, diagnosed as lupus, was managed with hydroxychloroquine and a tailored treatment regimen.",
195
+ "The 56-year-old patient experienced symptoms of severe dehydration due to gastroenteritis. Dr. Rogers provided intravenous fluids and anti-nausea medication as treatment.",
196
+ "On July 8, 2024, the patient was admitted with acute respiratory distress. Dr. Green prescribed a regimen including corticosteroids and bronchodilators to manage the symptoms.",
197
+ "The 29-year-old patient, diagnosed with chronic migraines, was treated with a new medication called MigraRelief. The attending physician, Dr. Lewis, also recommended cognitive behavioral therapy."
198
+ ]
199
+
200
+ selected_text = st.selectbox("Select an example", examples)
201
+ custom_input = st.text_input("Try it with your own Sentence!")
202
+
203
+ text_to_analyze = custom_input if custom_input else selected_text
204
+ context_dict = df_to_dict(edited_df)
205
+
206
+ # Display example text
207
+ st.subheader('Full Example Text')
208
+ HTML_WRAPPER = """<div class="scroll entities" style="overflow-x: auto; border: 1px solid #e6e9ef; border-radius: 0.25rem; padding: 1rem; margin-bottom: 2.5rem; white-space:pre-wrap">{}</div>"""
209
+ st.markdown(HTML_WRAPPER.format(text_to_analyze), unsafe_allow_html=True)
210
+
211
+ # Initialize Spark and create pipeline
212
+ spark = init_spark()
213
+ pipeline = create_pipeline(model, context_dict)
214
+ output = fit_data(pipeline, text_to_analyze)
215
+
216
+ # Display processed output
217
+ st.subheader("Processed Output:")
218
+ results = {
219
+ 'Document': output[0]['document'][0].result,
220
+ 'NER Chunk': [n.result for n in output[0]['ner_chunk']],
221
+ "NER Label": [n.metadata['entity'] for n in output[0]['ner_chunk']]
222
+ }
223
+ annotate(results)
224
+
225
+ with st.expander("View DataFrame"):
226
+ df = pd.DataFrame({'NER Chunk': results['NER Chunk'], 'NER Label': results['NER Label']})
227
+ df.index += 1
228
+ st.dataframe(df)
Dockerfile ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Download base image ubuntu 18.04
FROM ubuntu:18.04

# Set environment variables
ENV NB_USER jovyan
ENV NB_UID 1000
ENV HOME /home/${NB_USER}
ENV JAVA_HOME /usr/lib/jvm/java-8-openjdk-amd64/

# Install required packages
# (Java 8 is required by Spark; the dev libraries support building Python
# wheels that lack prebuilt binaries on 18.04.)
RUN apt-get update && apt-get install -y \
    tar \
    wget \
    bash \
    rsync \
    gcc \
    libfreetype6-dev \
    libhdf5-serial-dev \
    libpng-dev \
    libzmq3-dev \
    python3 \
    python3-dev \
    python3-pip \
    unzip \
    pkg-config \
    software-properties-common \
    graphviz \
    openjdk-8-jdk \
    ant \
    ca-certificates-java \
    && apt-get clean \
    && update-ca-certificates -f

# Install Python 3.8 and pip
# NOTE(review): python3-pip here is the Ubuntu 18.04 system pip (built for
# the 3.6 system Python); the `python3.8 -m pip` invocations below assume it
# is usable under 3.8 — confirm, or install pip for 3.8 explicitly.
RUN add-apt-repository ppa:deadsnakes/ppa \
    && apt-get update \
    && apt-get install -y python3.8 python3-pip \
    && apt-get clean

# Set up JAVA_HOME
RUN echo "export JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64/" >> /etc/profile \
    && echo "export PATH=\$JAVA_HOME/bin:\$PATH" >> /etc/profile
# Create a new user named "jovyan" with user ID 1000
RUN useradd -m -u ${NB_UID} ${NB_USER}

# Switch to the "jovyan" user
USER ${NB_USER}

# Set home and path variables for the user
ENV HOME=/home/${NB_USER} \
    PATH=/home/${NB_USER}/.local/bin:$PATH

# Set up PySpark to use Python 3.8 for both driver and workers
ENV PYSPARK_PYTHON=/usr/bin/python3.8
ENV PYSPARK_DRIVER_PYTHON=/usr/bin/python3.8

# Set the working directory to the user's home directory
WORKDIR ${HOME}

# Upgrade pip and install Python dependencies
RUN python3.8 -m pip install --upgrade pip
COPY requirements.txt /tmp/requirements.txt
RUN python3.8 -m pip install -r /tmp/requirements.txt

# Copy the application code into the container at /home/jovyan
COPY --chown=${NB_USER}:${NB_USER} . ${HOME}

# Expose port for Streamlit
# (7860 — presumably the port expected by the Hugging Face Spaces runtime;
# verify against the deployment target.)
EXPOSE 7860

# Define the entry point for the container
ENTRYPOINT ["streamlit", "run", "Demo.py", "--server.port=7860", "--server.address=0.0.0.0"]
inputs/bert_token_classifier_hi_en_ner/Example1.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ เคเคถเคฟเคฏเคจ-เคชเฅ‡เค‚เคŸเฅเคธ เคฒเคฟเคฎเคฟเคŸเฅ‡เคก (Asian Paints Limited) เคเค• เคญเคพเคฐเคคเฅ€เคฏ เคฌเคนเฅเคฐเคพเคทเฅเคŸเฅเคฐเฅ€เคฏ เคตเฅเคฏเคพเคชเคพเคฐ เคนเฅˆ เคœเคฟเคธเค•เคพ เคฎเฅเค–เฅเคฏเคพเคฒเคฏ เคฎเฅเค‚เคฌเคˆ (...
2
+ เคเคถเคฟเคฏเคจ-เคชเฅ‡เค‚เคŸเฅเคธ เคฒเคฟเคฎเคฟเคŸเฅ‡เคก (Asian Paints Limited) เคเค• เคญเคพเคฐเคคเฅ€เคฏ เคฌเคนเฅเคฐเคพเคทเฅเคŸเฅเคฐเฅ€เคฏ เคตเฅเคฏเคพเคชเคพเคฐ เคนเฅˆ เคœเคฟเคธเค•เคพ เคฎเฅเค–เฅเคฏเคพเคฒเคฏ เคฎเฅเค‚เคฌเคˆ (Mumbai), เคฎเคนเคพเคฐเคพเคทเฅเคŸเฅเคฐ (Maharashtra) เคฎเฅ‡เค‚ เคนเฅˆเฅค เคฏเฅ‡ เคตเฅเคฏเคพเคชเคพเคฐ, เคฐเค‚เค—, เค˜เคฐ เค•เฅ€ เคธเคœเคพเคตเคŸ, เคซเคฟเคŸเคฟเค‚เค— เคธเฅ‡ เคธเค‚เคฌเค‚เคงเคฟเคค เค‰เคคเฅเคชเคพเคฆเฅ‹เค‚ เค”เคฐ เคธเค‚เคฌเค‚เคงเคฟเคค เคธเฅ‡เคตเคพเคเค‚ เคชเฅเคฐเคฆเคพเคจ เค•เคฐเคจเฅ‡, เคจเคฟเคฐเฅเคฎเคพเคฃ, เคฌเคฟเค•เฅเคฐเฅ€ เค”เคฐ เคตเคฟเคคเคฐเคฃ เค•เฅ‡ เคตเฅเคฏเคตเคธเคพเคฏ เคฎเฅ‡เค‚ เคฒเค—เฅ€ เคนเฅเคˆ เคนเฅˆเฅค
inputs/bert_token_classifier_hi_en_ner/Example2.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ เคฐเคฟเคฒเคพเคฏเค‚เคธ เค‡เค‚เคกเคธเฅเคŸเฅเคฐเฅ€เคœเคผ เคฒเคฟเคฎเคฟเคŸเฅ‡เคก (Reliance Industries Limited) เคเค• เคญเคพเคฐเคคเฅ€เคฏ เคธเค‚เค—เฅเคŸเคฟเค•เคพ เคจเคฟเคฏเค‚เคคเฅเคฐเค• เค•เค‚เคชเคจเฅ€ เคนเฅˆ, เคœเคฟเคธเค•...
2
+ เคฐเคฟเคฒเคพเคฏเค‚เคธ เค‡เค‚เคกเคธเฅเคŸเฅเคฐเฅ€เคœเคผ เคฒเคฟเคฎเคฟเคŸเฅ‡เคก (Reliance Industries Limited) เคเค• เคญเคพเคฐเคคเฅ€เคฏ เคธเค‚เค—เฅเคŸเคฟเค•เคพ เคจเคฟเคฏเค‚เคคเฅเคฐเค• เค•เค‚เคชเคจเฅ€ เคนเฅˆ, เคœเคฟเคธเค•เคพ เคฎเฅเค–เฅเคฏเคพเคฒเคฏ เคฎเฅเค‚เคฌเคˆ, เคฎเคนเคพเคฐเคพเคทเฅเคŸเฅเคฐ (Maharashtra) เคฎเฅ‡เค‚ เคธเฅเคฅเคฟเคค เคนเฅˆเฅคเคฐเคคเคจ เคจเคตเคฒ เคŸเคพเคŸเคพ (28 เคฆเคฟเคธเค‚เคฌเคฐ 1937, เค•เฅ‹ เคฎเฅเคฎเฅเคฌเคˆ , เคฎเฅ‡เค‚ เคœเคจเฅเคฎเฅ‡) เคŸเคพเคŸเคพ เคธเคฎเฅเคน เค•เฅ‡ เคตเคฐเฅเคคเคฎเคพเคจ เค…เคงเฅเคฏเค•เฅเคท, เคœเฅ‹ เคญเคพเคฐเคค เค•เฅ€ เคธเคฌเคธเฅ‡ เคฌเคกเคผเฅ€ เคตเฅเคฏเคพเคชเคพเคฐเคฟเค• เคธเคฎเฅ‚เคน เคนเฅˆ, เคœเคฟเคธเค•เฅ€ เคธเฅเคฅเคพเคชเคจเคพ เคœเคฎเคถเฅ‡เคฆเคœเฅ€ เคŸเคพเคŸเคพ เคจเฅ‡ เค•เฅ€ เค”เคฐ เค‰เคจเค•เฅ‡ เคชเคฐเคฟเคตเคพเคฐ เค•เฅ€ เคชเฅ€เคขเคฟเคฏเฅ‹เค‚ เคจเฅ‡ เค‡เคธเค•เคพ เคตเคฟเคธเฅเคคเคพเคฐ เค•เคฟเคฏเคพ เค”เคฐ เค‡เคธเฅ‡ เคฆเฅƒเคขเคผ เคฌเคจเคพเคฏเคพเฅค
inputs/bert_token_classifier_hi_en_ner/Example3.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ เคฒเคฟเคฏเฅ‹เคจเฅ‡เคฒ เคฎเฅ‡เคธเฅเคธเฅ€ (Lionel Messi); (เคœเคจเฅเคฎ 24 เคœเฅ‚เคจ 1987) เค…เคฐเฅเคœเฅ‡เค‚เคŸเฅ€เคจเคพ (Argentina) เค•เฅ‡ เคซเคผเฅเคŸเคฌเฅ‰เคฒ เค–เคฟเคฒเคพเคกเคผเฅ€ เคนเฅˆเค‚, เคœเฅ‹ ...
2
+ เคฒเคฟเคฏเฅ‹เคจเฅ‡เคฒ เคฎเฅ‡เคธเฅเคธเฅ€ (Lionel Messi); (เคœเคจเฅเคฎ 24 เคœเฅ‚เคจ 1987) เค…เคฐเฅเคœเฅ‡เค‚เคŸเฅ€เคจเคพ (Argentina) เค•เฅ‡ เคซเคผเฅเคŸเคฌเฅ‰เคฒ เค–เคฟเคฒเคพเคกเคผเฅ€ เคนเฅˆเค‚, เคœเฅ‹ เค‡เคธ เคธเคฎเคฏ เคชเฅ€.เคเคธ.เคœเฅ€ เคŸเฅ€เคฎ เคชเฅ‡เคฐเคฟเคธ เคธเฅ‡เค‚เคŸ-เคœเคฐเฅเคฎเฅ‡เคจ (Paris Saint-Germain Football Club) เค”เคฐ เค…เคฐเฅเคœเฅ‡เค‚เคŸเฅ€เคจเคพ (Argentina) เค•เฅ€ เคฐเคพเคทเฅเคŸเฅเคฐเฅ€เคฏ เคŸเฅ€เคฎ เค•เฅ‡ เคฒเคฟเค เค–เฅ‡เคฒเคคเฅ‡ เคนเฅˆเค‚เฅค'
inputs/bert_token_classifier_hi_en_ner/Example4.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ เค•เฅเคฐเคฟเคธเฅเคŸเคฟเคฏเคพเคจเฅ‹ เคฐเฅ‹เคจเคพเคฒเฅเคกเฅ‹ (Cristiano Ronaldo), (เคœเคจเฅเคฎ: 5 เคซเคฐเคตเคฐเฅ€ 1985), เคเค• เคชเฅเคฐเฅเคคเค—เคพเคฒเฅ€ (Portugal) เคชเฅ‡เคถเฅ‡เคตเคฐ เคซเฅเคŸเคฌ...
2
+ เค•เฅเคฐเคฟเคธเฅเคŸเคฟเคฏเคพเคจเฅ‹ เคฐเฅ‹เคจเคพเคฒเฅเคกเฅ‹ (Cristiano Ronaldo), (เคœเคจเฅเคฎ: 5 เคซเคฐเคตเคฐเฅ€ 1985), เคเค• เคชเฅเคฐเฅเคคเค—เคพเคฒเฅ€ (Portugal) เคชเฅ‡เคถเฅ‡เคตเคฐ เคซเฅเคŸเคฌเฅ‰เคฒ เค–เคฟเคฒเคพเคกเคผเฅ€ เคนเฅˆ, เคœเฅ‹ เคฎเฅ‡เคจเคšเฅ‡เคธเฅเคŸเคฐ เคฏเฅ‚เคจเคพเค‡เคŸเฅ‡เคก (Manchester United) เค•เฅ‡ เคฒเคฟเค เค”เคฐ เคชเฅเคฐเฅเคคเค—เคพเคฒ (Portugal) เคฐเคพเคทเฅเคŸเฅเคฐเฅ€เคฏ เคซเฅเคŸเคฌเฅ‰เคฒ เค•เฅ‡ เค•เคชเฅเคคเคพเคจ เคนเฅˆเค‚เฅค
inputs/bert_token_classifier_hi_en_ner/Example5.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ เคตเฅ‰เคฐเฅ‡เคจ เคเคกเคตเคฐเฅเคก เคฌเคซเฅ‡เคŸ (Warren Buffet) (เค…เค—เคธเฅเคค 30 (August 30), 1930 เค•เฅ‹ เค“เคฎเคพเคนเคพ (Omaha), เคจเฅ‡เคฌเฅเคฐเคพเคธเฅเค•เคพ (Nebraska...
2
+ เคตเฅ‰เคฐเฅ‡เคจ เคเคกเคตเคฐเฅเคก เคฌเคซเฅ‡เคŸ (Warren Buffet) (เค…เค—เคธเฅเคค 30 (August 30), 1930 เค•เฅ‹ เค“เคฎเคพเคนเคพ (Omaha), เคจเฅ‡เคฌเฅเคฐเคพเคธเฅเค•เคพ (Nebraska) เคฎเฅ‡เค‚ เคชเฅˆเคฆเคพ เคนเฅเค) เคเค• เค…เคฎเฅ‡เคฐเคฟเค•เฅ€ เคจเคฟเคตเฅ‡เคถเค• (investor), เคตเฅเคฏเคตเคธเคพเคฏเฅ€ เค”เคฐ เคชเคฐเฅ‹เคชเค•เคพเคฐเฅ€ (philanthropist) เคตเฅเคฏเค•เฅเคคเคฟเคคเฅเคต เคนเฅˆเค‚เฅค
pages/Workflow & Model Overview.py ADDED
@@ -0,0 +1,255 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+
4
+ # Custom CSS for better styling
5
+ st.markdown("""
6
+ <style>
7
+ .main-title {
8
+ font-size: 36px;
9
+ color: #4A90E2;
10
+ font-weight: bold;
11
+ text-align: center;
12
+ }
13
+ .sub-title {
14
+ font-size: 24px;
15
+ color: #4A90E2;
16
+ margin-top: 20px;
17
+ }
18
+ .section {
19
+ background-color: #f9f9f9;
20
+ padding: 15px;
21
+ border-radius: 10px;
22
+ margin-top: 20px;
23
+ }
24
+ .section p, .section ul {
25
+ color: #666666;
26
+ }
27
+ .link {
28
+ color: #4A90E2;
29
+ text-decoration: none;
30
+ }
31
+ h2 {
32
+ color: #4A90E2;
33
+ font-size: 28px;
34
+ font-weight: bold;
35
+ margin-top: 30px;
36
+ }
37
+ h3 {
38
+ color: #4A90E2;
39
+ font-size: 22px;
40
+ font-weight: bold;
41
+ margin-top: 20px;
42
+ }
43
+ h4 {
44
+ color: #4A90E2;
45
+ font-size: 18px;
46
+ font-weight: bold;
47
+ margin-top: 15px;
48
+ }
49
+ </style>
50
+ """, unsafe_allow_html=True)
51
+
52
+ # Main Title
53
+ st.markdown('<div class="main-title">Zero-Shot Named Entity Recognition (NER) with Spark NLP</div>', unsafe_allow_html=True)
54
+
55
+ # Overview Section
56
+ st.markdown("""
57
+ <div class="section">
58
+ <p>Named Entity Recognition (NER) is a crucial task in Natural Language Processing (NLP) that involves identifying entities such as names, places, dates, and other types of information within text. Traditional NER models require extensive labeled data to train on specific entity types, which can be time-consuming and expensive to acquire.</p>
59
+ <p>Zero-Shot NER, however, is a game-changing approach that enables you to recognize new entities without the need for labeled training data.</p>
60
+ </div>
61
+ """, unsafe_allow_html=True)
62
+
63
+ # What is Zero-Shot NER?
64
+ st.markdown("""
65
+ <div class="section">
66
+ <h3>What is Zero-Shot NER?</h3>
67
+ <p>Zero-Shot Named Entity Recognition is a technique that allows models to identify and classify entities in text without having been explicitly trained on those specific categories. Instead of relying on labeled datasets, Zero-Shot NER models use pretrained language models, such as RoBERTa, and a set of entity definitions provided at inference time.</p>
68
+ <p>This means you can define new entity types on the fly, making the model highly adaptable to different domains and tasks.</p>
69
+ </div>
70
+ """, unsafe_allow_html=True)
71
+
72
+ # Why Use Zero-Shot NER?
73
+ st.markdown("""
74
+ <div class="section">
75
+ <h3>Why Use Zero-Shot NER?</h3>
76
+ <p>Zero-Shot NER is particularly useful when:</p>
77
+ <ul>
78
+ <li>You need to recognize entities that were not included in the training data.</li>
79
+ <li>You are working with specialized or domain-specific texts where predefined NER models may not perform well.</li>
80
+ <li>Speed is critical, and you want to avoid the time-consuming process of labeling and training a new model.</li>
81
+ </ul>
82
+ <p>By using Zero-Shot NER, you can easily extend the model to recognize new entities by simply providing relevant prompts or questions.</p>
83
+ </div>
84
+ """, unsafe_allow_html=True)
85
+
86
+ # Use Cases
87
+ st.markdown("""
88
+ <div class="section">
89
+ <h3>Where to Use Zero-Shot NER?</h3>
90
+ <p>Zero-Shot NER can be applied in various scenarios, including:</p>
91
+ <ul>
92
+ <li><b>Custom Entity Recognition</b>: Quickly adapt to new types of entities as they emerge in your data.</li>
93
+ <li><b>Legal and Medical Texts</b>: Recognize domain-specific entities without needing a domain-specific training set.</li>
94
+ <li><b>Multilingual Applications</b>: Use Zero-Shot NER for languages with limited annotated data.</li>
95
+ <li><b>Rapid Prototyping</b>: Experiment with different entity types without waiting for data annotation and model training.</li>
96
+ </ul>
97
+ </div>
98
+ """, unsafe_allow_html=True)
99
+
100
+ # Pipeline and Results
101
+ st.markdown('<div class="sub-title">Pipeline and Results</div>', unsafe_allow_html=True)
102
+
103
+ st.markdown("""
104
+ <div class="section">
105
+ <p>In this section, weโ€™ll build a Spark NLP pipeline to perform Zero-Shot NER using a pretrained RoBERTa model. We'll define two entity types, "NAME" and "CITY," and demonstrate how the model identifies these entities in text.</p>
106
+ </div>
107
+ """, unsafe_allow_html=True)
108
+
109
+ # Step 1: Creating the Data
110
+ st.markdown("""
111
+ <div class="section">
112
+ <h4>Step 1: Creating the Data</h4>
113
+ <p>We'll start by creating a Spark DataFrame that includes a few sample sentences for testing Zero-Shot NER.</p>
114
+ """, unsafe_allow_html=True)
115
+
116
+ st.code("""
117
+ data = spark.createDataFrame([
118
+ "Hellen works in London, Paris and Berlin. My name is Clara, I live in New York and Hellen lives in Paris.",
119
+ "John is a man who works in London, London and London."
120
+ ], StringType()).toDF("text")
121
+ """, language="python")
122
+
123
+ # Step 2: Assembling the Pipeline
124
+ st.markdown("""
125
+ <div class="section">
126
+ <h4>Step 2: Assembling the Pipeline</h4>
127
+ <p>Next, we'll set up the pipeline with the necessary annotators, including the ZeroShotNerModel.</p>
128
+ """, unsafe_allow_html=True)
129
+
130
+ st.code("""
131
+ from sparknlp.annotator import ZeroShotNerModel, SentenceDetector, Tokenizer
132
+ from sparknlp.base import DocumentAssembler, NerConverter
133
+ from pyspark.ml import Pipeline
134
+
135
+ # Step 1: Document Assembler
136
+ documentAssembler = DocumentAssembler() \\
137
+ .setInputCol("text") \\
138
+ .setOutputCol("document")
139
+
140
+ # Step 2: Sentence Detection
141
+ sentenceDetector = SentenceDetector() \\
142
+ .setInputCols(["document"]) \\
143
+ .setOutputCol("sentence")
144
+
145
+ # Step 3: Tokenization
146
+ tokenizer = Tokenizer() \\
147
+ .setInputCols(["sentence"]) \\
148
+ .setOutputCol("token")
149
+
150
+ # Step 4: Zero-Shot NER Model
151
+ zero_shot_ner = ZeroShotNerModel.pretrained("zero_shot_ner_roberta", "en")\\
152
+ .setInputCols(["sentence", "token"])\\
153
+ .setOutputCol("zero_shot_ner")\\
154
+ .setEntityDefinitions(
155
+ {
156
+ "NAME": ["What is his name?", "What is my name?", "What is her name?"],
157
+ "CITY": ["Which city?", "Which is the city?"]
158
+ })
159
+
160
+ # Step 5: NER Converter
161
+ ner_converter = NerConverter()\\
162
+ .setInputCols(["sentence", "token", "zero_shot_ner"])\\
163
+ .setOutputCol("ner_chunk")
164
+
165
+ # Define the pipeline
166
+ pipeline = Pipeline(stages=[
167
+ documentAssembler,
168
+ sentenceDetector,
169
+ tokenizer,
170
+ zero_shot_ner,
171
+ ner_converter
172
+ ])
173
+
174
+ # Fit and transform data
175
+ result = pipeline.fit(data).transform(data)
176
+ """, language="python")
177
+
178
+ # Step 3: Viewing the Results
179
+ st.markdown("""
180
+ <div class="section">
181
+ <h4>Step 3: Viewing the Results</h4>
182
+ <p>After processing the data through the pipeline, you can inspect the recognized entities:</p>
183
+ """, unsafe_allow_html=True)
184
+
185
+ st.code("""
186
+ # View NER Results:
187
+
188
+ from pyspark.sql.functions import explode, expr, col
189
+
190
+ result.select(
191
+ expr("explode(ner_chunk) as ner_chunk")
192
+ ).select(
193
+ col("ner_chunk.result").alias("chunk"),
194
+ col("ner_chunk.metadata").getItem("entity").alias("ner_label")
195
+ ).show(truncate=False)
196
+ """, language="python")
197
+
198
+ st.text("""
199
+ +--------+---------+
200
+ |chunk |ner_label|
201
+ +--------+---------+
202
+ |Hellen |NAME |
203
+ |London |CITY |
204
+ |Paris |CITY |
205
+ |Berlin |CITY |
206
+ |Clara |NAME |
207
+ |New York|CITY |
208
+ |Hellen |NAME |
209
+ |Paris |CITY |
210
+ |John |NAME |
211
+ |London |CITY |
212
+ |London |CITY |
213
+ |London |CITY |
214
+ +--------+---------+
215
+ """)
216
+
217
+ # Model Information and Use Cases
218
+ st.markdown("""
219
+ <div class="section">
220
+ <h4>Model Information and Use Cases</h4>
221
+ <p>The model used in this example is <code>zero_shot_ner_roberta</code>, which is compatible with Spark NLP 4.3.0+ and is trained with RoBERTa embeddings. This model is capable of recognizing any entity defined at runtime, making it versatile for various applications.</p>
222
+ <ul>
223
+ <li><b>Model Name:</b> zero_shot_ner_roberta</li>
224
+ <li><b>Language:</b> English</li>
225
+ <li><b>Size:</b> 463.8 MB</li>
226
+ <li><b>Case Sensitive:</b> true</li>
227
+ <li><b>Supported Entities:</b> Defined at runtime</li>
228
+ <li><b>Task:</b> NER</li>
229
+ </ul>
230
+ <p>More models and embeddings can be found at <a href="https://nlp.johnsnowlabs.com/models" class="link">John Snow Labs' Model Hub</a>.</p>
231
+ </div>
232
+ """, unsafe_allow_html=True)
233
+
234
+ # Conclusion
235
+ st.markdown('<div class="sub-title">Conclusion</div>', unsafe_allow_html=True)
236
+
237
+ st.markdown("""
238
+ <div class="section">
239
+ <p>Zero-Shot NER is a powerful tool that allows you to recognize new entities without the need for labeled training data. By leveraging pretrained models and defining entity types at runtime, you can quickly adapt to new domains and tasks.</p>
240
+ <p>Try implementing Zero-Shot NER in your own projects to see how it can simplify your workflow and extend the capabilities of your NLP models.</p>
241
+ </div>
242
+ """, unsafe_allow_html=True)
243
+
244
+ # Additional Resources
245
+ st.markdown("""
246
+ <div class="section">
247
+ <h4>Additional Resources</h4>
248
+ <ul>
249
+ <li><a href="https://nlp.johnsnowlabs.com/" class="link">Spark NLP Official Website</a></li>
250
+ <li><a href="https://nlp.johnsnowlabs.com/docs/en/licensed/models" class="link">Spark NLP Models</a></li>
251
+ <li><a href="https://github.com/JohnSnowLabs/spark-nlp" class="link">GitHub Repository</a></li>
252
+ <li><a href="https://colab.research.google.com/drive/1-9x0hSoVhHb3Fq8UqMeZrbHMYaK0YHj7" class="link">Zero-Shot NER with Spark NLP - Google Colab Notebook</a></li>
253
+ </ul>
254
+ </div>
255
+ """, unsafe_allow_html=True)
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ streamlit
2
+ st-annotated-text
3
+ pandas
4
+ numpy
5
+ spark-nlp
6
+ pyspark