abdullahmubeen10 committed · Commit 3c504d6 · Parent(s): 9343bc7
Upload 6 files
Browse files:
- .gitattributes (+1 -0)
- .streamlit/config.toml (+3 -0)
- Demo.py (+184 -0)
- Dockerfile (+76 -0)
- images/text preprocessing visual.jpg (+3 -0)
- pages/Workflow & Model Overview.py (+380 -0)
- requirements.txt (+5 -0)
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+images/text[[:space:]]preprocessing[[:space:]]visual.jpg filter=lfs diff=lfs merge=lfs -text
.streamlit/config.toml ADDED
@@ -0,0 +1,3 @@
+[theme]
+base="light"
+primaryColor="#29B4E8"
Demo.py ADDED
@@ -0,0 +1,184 @@
+import streamlit as st
+import sparknlp
+import os
+import pandas as pd
+import numpy as np
+import pyspark.sql.functions as F
+from pyspark.ml import Pipeline
+from pyspark.sql import SparkSession
+from sparknlp.annotator import *
+from sparknlp.base import *
+from sparknlp.pretrained import PretrainedPipeline
+from pyspark.sql.types import StringType, IntegerType
+
+# Configure Streamlit page
+st.set_page_config(
+    layout="wide",
+    page_title="Spark NLP Demos App",
+    initial_sidebar_state="auto"
+)
+
+# Custom CSS for better styling
+st.markdown("""
+    <style>
+        .main-title {
+            font-size: 36px;
+            color: #4A90E2;
+            font-weight: bold;
+            text-align: center;
+        }
+        .section p, .section ul {
+            color: #666666;
+        }
+    </style>
+""", unsafe_allow_html=True)
+
+@st.cache_resource
+def init_spark():
+    spark = sparknlp.start()
+    return spark
+
+@st.cache_resource
+def create_pipeline():
+    documentAssembler = DocumentAssembler() \
+        .setInputCol("text") \
+        .setOutputCol("document")
+
+    sentenceDetector = SentenceDetector() \
+        .setInputCols(["document"]) \
+        .setOutputCol("sentences")
+
+    tokenizer = Tokenizer() \
+        .setInputCols(["document"]) \
+        .setOutputCol("token")
+
+    normalizer = Normalizer() \
+        .setInputCols(["token"]) \
+        .setOutputCol("normalized") \
+        .setLowercase(True) \
+        .setCleanupPatterns([r"[^\w\d\s]"])
+
+    stopwords_cleaner = StopWordsCleaner() \
+        .setInputCols(["token"]) \
+        .setOutputCol("removed_stopwords") \
+        .setCaseSensitive(False)
+
+    stemmer = Stemmer() \
+        .setInputCols(["token"]) \
+        .setOutputCol("stem")
+
+    lemmatizer = Lemmatizer() \
+        .setInputCols(["token"]) \
+        .setOutputCol("lemma") \
+        .setDictionary("./AntBNC_lemmas_ver_001.txt", value_delimiter="\t", key_delimiter="->")
+
+    nlpPipeline = Pipeline(stages=[documentAssembler,
+                                   sentenceDetector,
+                                   tokenizer,
+                                   normalizer,
+                                   stopwords_cleaner,
+                                   stemmer,
+                                   lemmatizer])
+
+    return nlpPipeline
+
+def fit_data(pipeline, data):
+    empty_df = spark.createDataFrame([['']]).toDF('text')
+    pipeline_model = pipeline.fit(empty_df)
+    model = LightPipeline(pipeline_model)
+    result = model.fullAnnotate(data)
+    return result
+
+def extract_annotations(output, annotation_type):
+    return [anno.result for anno in output[0][annotation_type]]
+
+st.markdown('<div class="main-title">State-of-the-Art Text Preprocessing with Spark NLP</div>', unsafe_allow_html=True)
+st.write("")
+st.write("")
+st.markdown("""
+<div class="section">
+    <p>This demo runs a comprehensive text preprocessing pipeline built with Spark NLP. The pipeline includes several stages: document assembly, sentence detection, tokenization, normalization, stopword removal, stemming, and lemmatization. These steps are essential for preparing text data for downstream NLP tasks, ensuring the text is clean and standardized for effective model training and evaluation.</p>
+</div>
+""", unsafe_allow_html=True)
+
+model_name = "SentenceDetector|Tokenizer|Stemmer|Lemmatizer|Normalizer|Stop Words Remover"
+
+st.sidebar.title("Filter Annotator Outputs")
+selected_models = []
+for model in model_name.split('|'):
+    check = st.sidebar.checkbox(model, value=True, key=model)
+    selected_models.append(check)
+
+# Colab link for the reference notebook
+link = """<a href="https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/streamlit_notebooks/T5TRANSFORMER.ipynb"><img src="https://colab.research.google.com/assets/colab-badge.svg" style="zoom: 1.3" alt="Open In Colab"/></a>"""
+st.sidebar.title('')
+st.sidebar.markdown('Reference notebook:')
+st.sidebar.markdown(link, unsafe_allow_html=True)
+
+examples = [
+    "The Geneva Motor Show, the first major car show of the year, opens tomorrow with U.S. Car makers hoping to make new inroads into European markets due to the cheap dollar, automobile executives said. Ford Motor Co and General Motors Corp sell cars in Europe, where about 10.5 mln new cars a year are bought. GM also makes a few thousand in North American plants for European export.",
+    "Demonicus is a movie turned into a video game! I just love the story and the things that goes on in the film.It is a B-film ofcourse but that doesn`t bother one bit because its made just right and the music was rad! Horror and sword fight freaks,buy this movie now!",
+    "Quantum computing is the use of quantum-mechanical phenomena such as superposition and entanglement to perform computation. Computers that perform quantum computations are known as quantum computers. Quantum computers are believed to be able to solve certain computational problems, such as integer factorization (which underlies RSA encryption), substantially faster than classical computers. The study of quantum computing is a subfield of quantum information science. Quantum computing began in the early 1980s, when physicist Paul Benioff proposed a quantum mechanical model of the Turing machine.",
+    "Titanic is a 1997 American epic romance and disaster film directed, written, co-produced, and co-edited by James Cameron. Incorporating both historical and fictionalized aspects, it is based on accounts of the sinking of the RMS Titanic, and stars Leonardo DiCaprio and Kate Winslet as members of different social classes who fall in love aboard the ship during its ill-fated maiden voyage.",
+    "William Henry Gates III (born October 28, 1955) is an American business magnate, software developer, investor, and philanthropist. He is best known as the co-founder of Microsoft Corporation. During his career at Microsoft, Gates held the positions of chairman, chief executive officer (CEO), president and chief software architect, while also being the largest individual shareholder until May 2014. He is one of the best-known entrepreneurs and pioneers of the microcomputer revolution of the 1970s and 1980s. Born and raised in Seattle, Washington, Gates co-founded Microsoft with childhood friend Paul Allen in 1975, in Albuquerque, New Mexico; it went on to become the world's largest personal computer software company. Gates led the company as chairman and CEO until stepping down as CEO in January 2000, but he remained chairman and became chief software architect. During the late 1990s, Gates had been criticized for his business tactics, which have been considered anti-competitive. This opinion has been upheld by numerous court rulings. In June 2006, Gates announced that he would be transitioning to a part-time role at Microsoft and full-time work at the Bill & Melinda Gates Foundation, the private charitable foundation that he and his wife, Melinda Gates, established in 2000. He gradually transferred his duties to Ray Ozzie and Craig Mundie. He stepped down as chairman of Microsoft in February 2014 and assumed a new post as technology adviser to support the newly appointed CEO Satya Nadella.",
+    """World War II (often abbreviated as WWII or WW2), also known as the Second World War, was a global war that lasted from 1939 to 1945. The vast majority of the world's countries—including all the great powers—eventually formed two opposing military alliances: the Allies and the Axis. A state of total war emerged, directly involving more than 100 million people from more than 30 countries. The major participants threw their entire economic, industrial, and scientific capabilities behind the war effort, blurring the distinction between civilian and military resources. World War II was the deadliest conflict in human history, marked by 70 to 85 million fatalities, most of whom were civilians in the Soviet Union and China. Tens of millions of people died during the conflict due to genocides (including the Holocaust), premeditated death from starvation, massacres, and disease. Aircraft played a major role in the conflict which included the use of terror bombing, strategic bombing and the only use of nuclear weapons in war.""",
+    "Disney Channel (originally called The Disney Channel from 1983 to 1997 and commonly shortened to Disney from 1997 to 2002) is an American pay television channel that serves as the flagship property of owner Disney Channels Worldwide unit of the Walt Disney Television subsidiary of The Walt Disney Company. Disney Channel's programming consists of original first-run television series, theatrically released and original made-for-TV movies and select other third-party programming. Disney Channel – which formerly operated as a premium service – originally marketed its programs towards families during the 1980s, and later at younger children by the 2000s.",
+    "For several hundred thousand years, the Sahara has alternated between desert and savanna grassland in a 20,000 year cycle caused by the precession of the Earth's axis as it rotates around the Sun, which changes the location of the North African Monsoon. The area is next expected to become green in about 15,000 years (17,000 ACE).",
+    "Elon Musk is an engineer, industrial designer, technology entrepreneur and philanthropist. He is a citizen of South Africa, Canada, and the United States. He is the founder, CEO and chief engineer/designer of SpaceX; early investor, CEO and product architect of Tesla, Inc.; founder of The Boring Company; co-founder of Neuralink; and co-founder and initial co-chairman of OpenAI. He was elected a Fellow of the Royal Society (FRS) in 2018. In December 2016, he was ranked 21st on the Forbes list of The World's Most Powerful People, and was ranked joint-first on the Forbes list of the Most Innovative Leaders of 2019. A self-made billionaire, as of June 2020 his net worth was estimated at $38.8 billion and he is listed by Forbes as the 31st-richest person in the world. He is the longest tenured CEO of any automotive manufacturer globally.",
+    "Born and raised in the Austrian Empire, Tesla studied engineering and physics in the 1870s without receiving a degree, and gained practical experience in the early 1880s working in telephony and at Continental Edison in the new electric power industry. In 1884 he emigrated to the United States, where he became a naturalized citizen. He worked for a short time at the Edison Machine Works in New York City before he struck out on his own. With the help of partners to finance and market his ideas, Tesla set up laboratories and companies in New York to develop a range of electrical and mechanical devices. His alternating current (AC) induction motor and related polyphase AC patents, licensed by Westinghouse Electric in 1888, earned him a considerable amount of money and became the cornerstone of the polyphase system which that company eventually marketed."
+]
+
+st.subheader("Split and clean text")
+
+selected_text = st.selectbox("Select an example", examples)
+
+custom_input = st.text_input("Try it for yourself!")
+
+if custom_input:
+    selected_text = custom_input
+
+st.subheader('Selected Text')
+st.write(selected_text)
+
+spark = init_spark()
+pipeline = create_pipeline()
+output = fit_data(pipeline, selected_text)
+
+data_dict = {}
+
+if selected_models[0]:
+    sentences = extract_annotations(output, 'sentences')
+    data_dict['sentences'] = sentences
+
+if selected_models[1]:
+    tokens = extract_annotations(output, 'token')
+    data_dict['token'] = tokens
+
+if selected_models[2]:
+    stems = extract_annotations(output, 'stem')
+    data_dict['stem'] = stems
+
+if selected_models[3]:
+    lemmas = extract_annotations(output, 'lemma')
+    data_dict['lemma'] = lemmas
+
+if selected_models[4]:
+    normalized = extract_annotations(output, 'normalized')
+    data_dict['normalized'] = normalized
+
+if selected_models[5]:
+    removed_stopwords = extract_annotations(output, 'removed_stopwords')
+    data_dict['removed_stopwords'] = removed_stopwords
+
+if data_dict:
+    df = pd.DataFrame(dict([(k, pd.Series(v)) for k, v in data_dict.items()]))
+    st.subheader("Annotation Results:")
+    st.dataframe(df)
Dockerfile
ADDED
@@ -0,0 +1,76 @@
+# Download base image ubuntu 18.04
+FROM ubuntu:18.04
+
+ENV NB_USER jovyan
+ENV NB_UID 1000
+ENV HOME /home/${NB_USER}
+
+ENV PYSPARK_PYTHON=python3
+ENV PYSPARK_DRIVER_PYTHON=python3
+
+RUN apt-get update && apt-get install -y \
+    tar \
+    wget \
+    bash \
+    rsync \
+    gcc \
+    libfreetype6-dev \
+    libhdf5-serial-dev \
+    libpng-dev \
+    libzmq3-dev \
+    python3 \
+    python3-dev \
+    python3-pip \
+    unzip \
+    pkg-config \
+    software-properties-common \
+    graphviz
+
+RUN adduser --disabled-password \
+    --gecos "Default user" \
+    --uid ${NB_UID} \
+    ${NB_USER}
+
+# Install OpenJDK-8
+RUN apt-get update && \
+    apt-get install -y openjdk-8-jdk && \
+    apt-get install -y ant && \
+    apt-get clean;
+
+# Fix certificate issues
+RUN apt-get update && \
+    apt-get install -y ca-certificates-java && \
+    apt-get clean && \
+    update-ca-certificates -f;
+# Setup JAVA_HOME -- useful for docker commandline
+ENV JAVA_HOME /usr/lib/jvm/java-8-openjdk-amd64/
+RUN export JAVA_HOME
+
+RUN echo "export JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64/" >> ~/.bashrc
+
+RUN apt-get clean && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
+
+RUN apt-get update
+RUN apt-get install -y software-properties-common
+RUN add-apt-repository ppa:deadsnakes/ppa
+RUN apt-get install -y python3.8 python3-pip
+
+ENV PYSPARK_PYTHON=python3.8
+ENV PYSPARK_DRIVER_PYTHON=python3.8
+
+COPY . .
+
+RUN python3.8 -m pip install --upgrade pip
+RUN python3.8 -m pip install -r requirements.txt
+
+USER root
+RUN chown -R ${NB_UID} ${HOME}
+USER ${NB_USER}
+
+WORKDIR ${HOME}
+
+COPY . .
+
+EXPOSE 7860
+
+ENTRYPOINT ["streamlit", "run", "Demo.py", "--server.port=7860", "--server.address=0.0.0.0"]
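Usage note: the Dockerfile EXPOSEs port 7860 and the ENTRYPOINT starts Streamlit on the same port (the Hugging Face Spaces convention), so the image should be buildable and runnable locally with standard Docker commands, e.g. `docker build -t preprocessing-demo .` followed by `docker run -p 7860:7860 preprocessing-demo` (the image tag here is illustrative).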
images/text preprocessing visual.jpg
ADDED
Git LFS Details
pages/Workflow & Model Overview.py
ADDED
@@ -0,0 +1,380 @@
+import streamlit as st
+
+# Custom CSS for better styling
+st.markdown("""
+    <style>
+        .main-title {
+            font-size: 36px;
+            color: #4A90E2;
+            font-weight: bold;
+            text-align: center;
+        }
+        .sub-title {
+            font-size: 24px;
+            color: #333333;
+            margin-top: 20px;
+        }
+        .section {
+            background-color: #f9f9f9;
+            padding: 15px;
+            border-radius: 10px;
+            margin-top: 20px;
+        }
+        .section h2 {
+            font-size: 22px;
+            color: #4A90E2;
+        }
+        .section p, .section ul {
+            color: #666666;
+        }
+        .link {
+            color: #4A90E2;
+            text-decoration: none;
+        }
+    </style>
+""", unsafe_allow_html=True)
+
+# Introduction
+st.markdown('<div class="main-title">Text Preprocessing with Spark NLP</div>', unsafe_allow_html=True)
+
+st.markdown("""
+<div class="section">
+    <p>Welcome to the Spark NLP Text Preprocessing Demo App! In Natural Language Processing (NLP), preprocessing is a crucial step that ensures text data is clean and suitable for modeling. Effective preprocessing can significantly enhance the performance of NLP models.</p>
+    <p>Spark NLP is a leading library for text preprocessing, offering a range of tools and models within an easy-to-use pipeline design that runs natively on Apache Spark. This page shows how to use Spark NLP to preprocess your text data efficiently.</p>
+</div>
+""", unsafe_allow_html=True)
+
+# About Text Preprocessing
+st.markdown('<div class="sub-title">About Text Preprocessing</div>', unsafe_allow_html=True)
+st.markdown("""
+<div class="section">
+    <p>Text preprocessing is a series of steps that clean and normalize text data. Common tasks include tokenization, stopword removal, stemming, and lemmatization. These steps are essential for preparing raw text for downstream NLP tasks.</p>
+    <p>In Spark NLP, text preprocessing is handled by annotators that can be combined into a preprocessing pipeline. Below we demonstrate how to use these annotators in Python to preprocess text data effectively.</p>
+</div>
+""", unsafe_allow_html=True)
+
+st.image('https://www.johnsnowlabs.com/wp-content/uploads/2023/05/img_blog_2-4.jpg', caption='Text preprocessing pipeline visual', use_column_width='auto')
+
+# How to Use the Preprocessing Tools
+st.markdown('<div class="sub-title">How to Use Spark NLP for Text Preprocessing</div>', unsafe_allow_html=True)
+st.markdown("""
+<div class="section">
+    <p>To preprocess text with Spark NLP, we create a pipeline of preprocessing annotators. These annotators transform the input text through steps such as tokenization, normalization, and stopword removal.</p>
+</div>
+""", unsafe_allow_html=True)
+
+st.markdown('<div class="sub-title">Installation</div>', unsafe_allow_html=True)
+st.code('!pip install spark-nlp', language='python')
+
+# Import Libraries and Read Data
+st.markdown('<div class="sub-title">Importing Libraries and Reading Data</div>', unsafe_allow_html=True)
+st.markdown("""
+<div class="section">
+    <p>First, we import Spark NLP and the necessary libraries, read the data from a local file, and convert it into a Spark DataFrame.</p>
+</div>
+""", unsafe_allow_html=True)
+
+st.code("""
+import sparknlp
+from sparknlp.base import *
+from sparknlp.annotator import *
+
+spark = sparknlp.start()
+
+df = spark.read\\
+    .option("header", True)\\
+    .csv("spam_text_messages.csv")\\
+    .toDF("category", "text")
+df.show(5, truncate=30)
+
+>>>
++--------+------------------------------+
+|category|                          text|
++--------+------------------------------+
+|     ham|Go until jurong point, craz...|
+|     ham| Ok lar... Joking wif u oni...|
+|    spam|Free entry in 2 a wkly comp...|
+|     ham|U dun say so early hor... U...|
+|     ham|Nah I don't think he goes t...|
++--------+------------------------------+
+only showing top 5 rows
+""", language='python')
+
+st.markdown("""
+<div class="section">
+    <p>The dataset contains two columns: <strong>category</strong> and <strong>text</strong>. The text column holds the messages, and the category column indicates whether a message is spam or ham.</p>
+</div>
+""", unsafe_allow_html=True)
+
+# Document Assembler
+st.markdown('<div class="sub-title">Document Assembler</div>', unsafe_allow_html=True)
+st.markdown("""
+<div class="section">
+    <p>The DocumentAssembler is the entry point of every Spark NLP pipeline. It creates the first annotation, of type Document, which downstream annotators consume. We use it as follows:</p>
+</div>
+""", unsafe_allow_html=True)
+
+st.code("""
+documentAssembler = DocumentAssembler() \\
+    .setInputCol("text") \\
+    .setOutputCol("document") \\
+    .setCleanupMode("shrink")
+
+df_doc = documentAssembler.transform(df)
+df_doc.printSchema()
+""", language='python')
+
+st.markdown("""
+<div class="section">
+    <p>The <code>shrink</code> cleanup mode removes new lines and tabs, merging multiple spaces and blank lines into a single space. The schema after transformation includes the new <strong>document</strong> column.</p>
+</div>
+""", unsafe_allow_html=True)
+
+# Tokenizer
+st.markdown('<div class="sub-title">Tokenizer</div>', unsafe_allow_html=True)
+st.markdown("""
+<div class="section">
+    <p>The Tokenizer splits the text into tokens:</p>
+</div>
+""", unsafe_allow_html=True)
+
+st.code("""
+tokenizer = Tokenizer() \\
+    .setInputCols(["document"]) \\
+    .setOutputCol("token")
+""", language='python')
+
+# Sentence Detector
+st.markdown('<div class="sub-title">Sentence Detector</div>', unsafe_allow_html=True)
+st.markdown("""
+<div class="section">
+    <p>The SentenceDetector finds sentence boundaries in raw text:</p>
+</div>
+""", unsafe_allow_html=True)
+
+st.code("""
+sentenceDetector = SentenceDetector() \\
+    .setInputCols(["document"]) \\
+    .setOutputCol("sentence")
+""", language='python')
+
+# Normalizer
+st.markdown('<div class="sub-title">Normalizer</div>', unsafe_allow_html=True)
+st.markdown("""
+<div class="section">
+    <p>The Normalizer cleans tokens by applying regex cleanup patterns, and can also remove words listed in a given dictionary:</p>
+</div>
+""", unsafe_allow_html=True)
+
+st.code("""
+normalizer = Normalizer() \\
+    .setInputCols(["token"]) \\
+    .setOutputCol("normalized") \\
+    .setLowercase(True) \\
+    .setCleanupPatterns([r"[^\\w\\d\\s]"])
+""", language='python')
+
+# Stopwords Cleaner
+st.markdown('<div class="sub-title">Stopwords Cleaner</div>', unsafe_allow_html=True)
+st.markdown("""
+<div class="section">
+    <p>The StopWordsCleaner removes stopwords from the token stream:</p>
+</div>
+""", unsafe_allow_html=True)
+
+st.code("""
+stopwordsCleaner = StopWordsCleaner() \\
+    .setInputCols(["token"]) \\
+    .setOutputCol("cleaned_tokens") \\
+    .setCaseSensitive(True)
+""", language='python')
+
+# Token Assembler
+st.markdown('<div class="sub-title">Token Assembler</div>', unsafe_allow_html=True)
+st.markdown("""
+<div class="section">
+    <p>The TokenAssembler reassembles the cleaned tokens back into sentences:</p>
+</div>
+""", unsafe_allow_html=True)
+
+st.code("""
+tokenAssembler = TokenAssembler() \\
+    .setInputCols(["sentence", "cleaned_tokens"]) \\
+    .setOutputCol("assembled")
+""", language='python')
+
+# Stemmer
+st.markdown('<div class="sub-title">Stemmer</div>', unsafe_allow_html=True)
+st.markdown("""
+<div class="section">
+    <p>The Stemmer reduces inflectional forms, and sometimes derivationally related forms, of a word to a common base form:</p>
+</div>
+""", unsafe_allow_html=True)
+
+st.code("""
+stemmer = Stemmer() \\
+    .setInputCols(["token"]) \\
+    .setOutputCol("stem")
+""", language='python')
+
+# Lemmatizer
+st.markdown('<div class="sub-title">Lemmatizer</div>', unsafe_allow_html=True)
+st.markdown("""
+<div class="section">
+    <p>The Lemmatizer reduces words to their base or root form, looking each word up in a dictionary to resolve its lemma:</p>
+</div>
+""", unsafe_allow_html=True)
+
+st.code("""
+lemmatizer = Lemmatizer() \\
+    .setInputCols(["token"]) \\
+    .setOutputCol("lemma") \\
+    .setDictionary("AntBNC_lemmas_ver_001.txt", value_delimiter="\\t", key_delimiter="->")
+""", language='python')
+
+# Pipeline
+st.markdown('<div class="sub-title">Putting All Processes into a Spark ML Pipeline</div>', unsafe_allow_html=True)
+st.markdown("""
+<div class="section">
+    <p>Now we put all the preprocessing stages into a Spark ML Pipeline and apply it to our dataset.</p>
+</div>
+""", unsafe_allow_html=True)
+
+st.code("""
+from pyspark.ml import Pipeline
+
+nlpPipeline = Pipeline(stages=[
+    documentAssembler,
+    tokenizer,
+    sentenceDetector,
+    normalizer,
+    stopwordsCleaner,
+    tokenAssembler,
+    stemmer,
+    lemmatizer
+])
+
+empty_df = spark.createDataFrame([[""]]).toDF("text")
+model = nlpPipeline.fit(empty_df)
+result = model.transform(df)
+""", language='python')
+
+# Showcase/Example
+st.markdown('<div class="sub-title">Showcase/Example</div>', unsafe_allow_html=True)
+st.markdown("""
+<div class="section">
+    <p>Let's examine the results of our preprocessing pipeline, starting with the tokens and normalized tokens:</p>
+</div>
+""", unsafe_allow_html=True)
+
+st.code("""
+from pyspark.sql import functions as F
+
+result.select("token.result", "normalized.result").show(5, truncate=30)
+
+>>>
++------------------------------+------------------------------+
+|                        result|                        result|
++------------------------------+------------------------------+
+|[Go, until, jurong, point, ...|[go, until, jurong, point, ...|
+|[Ok, lar, ..., Joking, wif,...|[ok, lar, joking, wif, u, oni]|
+|[Free, entry, in, 2, a, wkl...|[free, entry, in, 2, a, wkl...|
+|[U, dun, say, so, early, ho...|[u, dun, say, so, early, ho...|
+|[Nah, I, don't, think, he, ...|[nah, ı, dont, think, he, g...|
++------------------------------+------------------------------+
+only showing top 5 rows
+""", language='python')
+
+st.markdown("""
+<div class="section">
+    <p>Next, we compare the original tokens with the stopword-cleaned tokens:</p>
+</div>
+""", unsafe_allow_html=True)
+
+st.code("""
+result.select(F.explode(F.arrays_zip("token.result", "cleaned_tokens.result")).alias("col")) \\
+    .select(F.expr("col['0']").alias("token"), F.expr("col['1']").alias("cleaned_sw")).show(10)
+""", language='python')
+
+st.markdown("""
+<div class="section">
+    <p>Finally, we compare the sentence detector output with the token assembler output:</p>
+</div>
+""", unsafe_allow_html=True)
+
+st.code("""
+result.select(F.explode(F.arrays_zip("sentence.result", "assembled.result")).alias("col")) \\
+    .select(F.expr("col['0']").alias("sentence"), F.expr("col['1']").alias("assembled")).show(5, truncate=30)
+
+>>>
++------------------------------+------------------------------+
+|                      sentence|                     assembled|
++------------------------------+------------------------------+
+| Go until jurong point, crazy.|        Go jurong point, crazy|
+|                             .|                              |
+|Available only in bugis n g...|Available bugis n great wor...|
+|     Cine there got amore wat.|            Cine got amore wat|
+|                             .|                              |
++------------------------------+------------------------------+
+only showing top 5 rows
+
+
+result.withColumn("tmp", F.explode("assembled")) \\
+    .select("tmp.*").select("begin", "end", "result", "metadata.sentence").show(5, truncate=30)
+
+>>>
++-----+---+------------------------------+--------+
+|begin|end|                        result|sentence|
++-----+---+------------------------------+--------+
+|    0| 21|        Go jurong point, crazy|       0|
+|   29| 28|                              |       1|
+|   31| 74|Available bugis n great wor...|       2|
+|   84|101|            Cine got amore wat|       3|
+|  109|108|                              |       4|
++-----+---+------------------------------+--------+
+only showing top 5 rows
+""", language='python')
+
+st.markdown("""
+<div class="section">
+    <p>In this example, we cleaned and preprocessed text data using various annotators and transformers in Spark NLP. A preprocessing pipeline like this is essential for preparing data for further NLP tasks, ensuring that the text is clean and normalized.</p>
+</div>
+""", unsafe_allow_html=True)
+
+st.markdown('<div class="sub-title">Additional Resources and References</div>', unsafe_allow_html=True)
+st.markdown("""
+<div class="section">
+    <ul>
+        <li><a class="link" href="https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/1hr_workshop/SparkNLP_openSource_workshop_1hr.ipynb" target="_blank">Intro to Spark NLP workflow</a></li>
+        <li><a class="link" href="https://sparknlp.org/docs/en/quickstart" target="_blank">Getting Started with Spark NLP</a></li>
+        <li><a class="link" href="https://nlp.johnsnowlabs.com/models" target="_blank">Pretrained Models</a></li>
+        <li><a class="link" href="https://github.com/JohnSnowLabs/spark-nlp/tree/master/examples/python/annotation/text/english" target="_blank">Example Notebooks</a></li>
+        <li><a class="link" href="https://sparknlp.org/docs/en/install" target="_blank">Installation Guide</a></li>
+    </ul>
+</div>
+""", unsafe_allow_html=True)
+
+st.markdown('<div class="sub-title">Community & Support</div>', unsafe_allow_html=True)
+st.markdown("""
+<div class="section">
+    <ul>
+        <li><a class="link" href="https://sparknlp.org/" target="_blank">Official Website</a>: Documentation and examples</li>
+        <li><a class="link" href="https://join.slack.com/t/spark-nlp/shared_invite/zt-198dipu77-L3UWNe_AJ8xqDk0ivmih5Q" target="_blank">Slack</a>: Live discussion with the community and team</li>
+        <li><a class="link" href="https://github.com/JohnSnowLabs/spark-nlp" target="_blank">GitHub</a>: Bug reports, feature requests, and contributions</li>
+        <li><a class="link" href="https://medium.com/spark-nlp" target="_blank">Medium</a>: Spark NLP articles</li>
+        <li><a class="link" href="https://www.youtube.com/channel/UCmFOjlpYEhxf_wJUDuz6xxQ/videos" target="_blank">YouTube</a>: Video tutorials</li>
+    </ul>
+</div>
+""", unsafe_allow_html=True)
requirements.txt
ADDED
@@ -0,0 +1,5 @@
+streamlit
+pandas
+numpy
+spark-nlp
+pyspark