abdullahmubeen10 commited on
Commit
884e39e
1 Parent(s): 3888679

Upload 8 files

Browse files
AntBNC_lemmas_ver_001.txt ADDED
The diff for this file is too large to render. See raw diff
 
Dockerfile ADDED
@@ -0,0 +1,76 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Base image: Ubuntu 18.04
FROM ubuntu:18.04

# Unprivileged runtime user (uid 1000) and its home directory.
ENV NB_USER=jovyan
ENV NB_UID=1000
ENV HOME=/home/${NB_USER}

# Interpreter used by the PySpark driver and workers (Python 3.8 is installed below).
ENV PYSPARK_PYTHON=python3.8
ENV PYSPARK_DRIVER_PYTHON=python3.8

# Build-time only: stop apt from blocking on interactive prompts.
ARG DEBIAN_FRONTEND=noninteractive

# Base system packages.
RUN apt-get update && apt-get install -y \
    tar \
    wget \
    bash \
    rsync \
    gcc \
    libfreetype6-dev \
    libhdf5-serial-dev \
    libpng-dev \
    libzmq3-dev \
    python3 \
    python3-dev \
    python3-pip \
    unzip \
    pkg-config \
    software-properties-common \
    graphviz

RUN adduser --disabled-password \
    --gecos "Default user" \
    --uid ${NB_UID} \
    ${NB_USER}

# Install OpenJDK-8
RUN apt-get update && \
    apt-get install -y openjdk-8-jdk ant && \
    apt-get clean

# Fix certificate issues (-y is required: the build has no terminal to confirm on)
RUN apt-get update && \
    apt-get install -y ca-certificates-java && \
    apt-get clean && \
    update-ca-certificates -f

# Setup JAVA_HOME -- useful for docker commandline.
# ENV already exports the variable to every later layer and to the running
# container, so no separate `RUN export` is needed.
ENV JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64/
RUN echo "export JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64/" >> ~/.bashrc

# Python 3.8 from the deadsnakes PPA. `apt-get update` runs in the same layer
# as the installs so a cached, stale package index is never reused.
RUN apt-get update && \
    apt-get install -y software-properties-common && \
    add-apt-repository -y ppa:deadsnakes/ppa && \
    apt-get update && \
    apt-get install -y python3.8 python3-pip && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*

# Copy only the dependency list for this step: the pip layer stays cached when
# application code changes, and the build context is not dumped into `/`.
COPY requirements.txt /tmp/requirements.txt
RUN python3.8 -m pip install --upgrade pip && \
    python3.8 -m pip install -r /tmp/requirements.txt

USER root
RUN chown -R ${NB_UID} ${HOME}
USER ${NB_USER}

WORKDIR ${HOME}

# Application code, owned by the runtime user.
COPY --chown=${NB_UID} . .

EXPOSE 7860

ENTRYPOINT ["streamlit", "run", "Home.py", "--server.port=7860", "--server.address=0.0.0.0"]
Home.py ADDED
@@ -0,0 +1,104 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+
3
+ # Configure Streamlit page
4
+ st.set_page_config(
5
+ layout="wide",
6
+ page_title="Spark NLP Demos App",
7
+ initial_sidebar_state="auto"
8
+ )
9
+
10
+ # Custom CSS for better styling
11
+ st.markdown("""
12
+ <style>
13
+ .main-title {
14
+ font-size: 36px;
15
+ color: #4A90E2;
16
+ font-weight: bold;
17
+ text-align: center;
18
+ }
19
+ .sub-title {
20
+ font-size: 24px;
21
+ color: #333333;
22
+ margin-top: 20px;
23
+ }
24
+ .section {
25
+ background-color: #f9f9f9;
26
+ padding: 15px;
27
+ border-radius: 10px;
28
+ margin-top: 20px;
29
+ }
30
+ .section h2 {
31
+ font-size: 22px;
32
+ color: #4A90E2;
33
+ }
34
+ .section p, .section ul {
35
+ color: #666666;
36
+ }
37
+ .link {
38
+ color: #4A90E2;
39
+ text-decoration: none;
40
+ }
41
+ </style>
42
+ """, unsafe_allow_html=True)
43
+
44
+ # Home page content
45
+ st.markdown('<div class="main-title">Spark NLP: State-of-the-Art Natural Language Processing</div>', unsafe_allow_html=True)
46
+
47
+ st.markdown("""
48
+ <div class="section">
49
+ <p>Spark NLP is a state-of-the-art Natural Language Processing library built on top of Apache Spark. It provides performant & accurate NLP annotations for machine learning pipelines that scale in a distributed environment. With 36,000+ pretrained models and pipelines in over 200 languages, Spark NLP supports a wide range of NLP tasks including:</p>
50
+ <ul>
51
+ <li>Tokenization</li>
52
+ <li>Named Entity Recognition</li>
53
+ <li>Sentiment Analysis</li>
54
+ <li>Text Classification</li>
55
+ <li>Machine Translation</li>
56
+ <li>Summarization</li>
57
+ <li>Question Answering</li>
58
+ <li>Text Generation</li>
59
+ <li>Image Classification</li>
60
+ <li>Automatic Speech Recognition</li>
61
+ <li>And many more</li>
62
+ </ul>
63
+ </div>
64
+ """, unsafe_allow_html=True)
65
+
66
+ st.markdown('<div class="sub-title">Key Features</div>', unsafe_allow_html=True)
67
+
68
+ st.markdown("""
69
+ <div class="section">
70
+ <ul>
71
+ <li>Integration with popular transformers like BERT, RoBERTa, ALBERT, and more</li>
72
+ <li>Support for Python, R, and JVM (Java, Scala, Kotlin)</li>
73
+ <li>GPU support for accelerated processing</li>
74
+ <li>Easy integration with Spark ML functions</li>
75
+ </ul>
76
+ </div>
77
+ """, unsafe_allow_html=True)
78
+
79
+ st.markdown('<div class="sub-title">Community & Support</div>', unsafe_allow_html=True)
80
+
81
+ st.markdown("""
82
+ <div class="section">
83
+ <ul>
84
+ <li><a class="link" href="https://sparknlp.org/" target="_blank">Official Website</a>: Documentation and examples</li>
85
+ <li><a class="link" href="https://join.slack.com/t/spark-nlp/shared_invite/zt-198dipu77-L3UWNe_AJ8xqDk0ivmih5Q" target="_blank">Slack</a>: Live discussion with the community and team</li>
86
+ <li><a class="link" href="https://github.com/JohnSnowLabs/spark-nlp" target="_blank">GitHub</a>: Bug reports, feature requests, and contributions</li>
87
+ <li><a class="link" href="https://medium.com/spark-nlp" target="_blank">Medium</a>: Spark NLP articles</li>
88
+ <li><a class="link" href="https://www.youtube.com/channel/UCmFOjlpYEhxf_wJUDuz6xxQ/videos" target="_blank">YouTube</a>: Video tutorials</li>
89
+ </ul>
90
+ </div>
91
+ """, unsafe_allow_html=True)
92
+
93
+ st.markdown('<div class="sub-title">Quick Links</div>', unsafe_allow_html=True)
94
+
95
+ st.markdown("""
96
+ <div class="section">
97
+ <ul>
98
+ <li><a class="link" href="https://sparknlp.org/docs/en/quickstart" target="_blank">Getting Started</a></li>
99
+ <li><a class="link" href="https://nlp.johnsnowlabs.com/models" target="_blank">Pretrained Models</a></li>
100
+ <li><a class="link" href="https://github.com/JohnSnowLabs/spark-nlp/tree/master/examples/python/annotation/text/english" target="_blank">Example Notebooks</a></li>
101
+ <li><a class="link" href="https://sparknlp.org/docs/en/install" target="_blank">Installation Guide</a></li>
102
+ </ul>
103
+ </div>
104
+ """, unsafe_allow_html=True)
pages/Named Entity Recognition.py ADDED
@@ -0,0 +1,272 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import os
3
+ import json
4
+ import pandas as pd
5
+ import numpy as np
6
+ import random
7
+ import base64
8
+ import ast
9
+ import sparknlp
10
+ import pyspark.sql.functions as F
11
+ from pyspark.ml import Pipeline
12
+ from pyspark.sql import SparkSession
13
+ from sparknlp.annotator import *
14
+ from sparknlp.base import *
15
+ from sparknlp.pretrained import PretrainedPipeline
16
+ from pyspark.sql.types import StringType, IntegerType
17
+
18
@st.cache_resource
def init_spark():
    """Start a Spark NLP session.

    The ``st.cache_resource`` decorator memoizes the returned object, so the
    session is created once and reused instead of being rebuilt on every
    Streamlit rerun.

    Returns:
        The session object returned by ``sparknlp.start()``.
    """
    return sparknlp.start()
22
+
23
@st.cache_resource
def create_pipeline(model):
    """Build the (unfitted) NER pipeline for the selected pretrained model.

    Stages, in order: document assembler -> tokenizer -> word embeddings ->
    NerDL tagger -> NER converter (emits the ``ner_chunk`` column).

    Args:
        model: Name of the English ``NerDLModel`` to load, e.g. ``"ner_dl"``
            or ``"ner_dl_bert"``.

    Returns:
        A ``pyspark.ml.Pipeline`` containing the five stages.
    """
    document_assembler = (
        DocumentAssembler()
        .setInputCol('text')
        .setOutputCol('document')
    )

    tokenizer = (
        Tokenizer()
        .setInputCols(['document'])
        .setOutputCol('token')
    )

    # A NerDL model has to be fed the embeddings it was trained with.
    # ``ner_dl`` uses 100-d GloVe vectors; ``ner_dl_bert`` was trained on BERT
    # vectors, so GloVe cannot be reused for it.
    # NOTE(review): ``bert_base_cased`` is taken from the ner_dl_bert model
    # card -- confirm against the Models Hub if the model is ever swapped.
    if 'bert' in model.lower():
        embeddings = BertEmbeddings.pretrained('bert_base_cased', 'en')
    else:
        embeddings = WordEmbeddingsModel.pretrained('glove_100d')
    embeddings = (
        embeddings
        .setInputCols(["document", 'token'])
        .setOutputCol("embeddings")
    )

    ner_model = (
        NerDLModel.pretrained(model, 'en')
        .setInputCols(['document', 'token', 'embeddings'])
        .setOutputCol('ner')
    )

    ner_converter = (
        NerConverter()
        .setInputCols(['document', 'token', 'ner'])
        .setOutputCol('ner_chunk')
    )

    return Pipeline(stages=[
        document_assembler,
        tokenizer,
        embeddings,
        ner_model,
        ner_converter,
    ])
54
+
55
def fit_data(pipeline, data):
    """Annotate ``data`` with ``pipeline`` through a ``LightPipeline``.

    Relies on the module-level ``spark`` session being defined before the
    first call.

    Args:
        pipeline: The pipeline to fit and run.
        data: The text passed to ``LightPipeline.fullAnnotate``.

    Returns:
        Whatever ``fullAnnotate`` returns for ``data``.
    """
    # The pipeline is fitted on a single empty-string row named 'text'; the
    # frame only serves as the input to ``fit``.
    placeholder_df = spark.createDataFrame([['']]).toDF('text')
    light_model = LightPipeline(pipeline.fit(placeholder_df))
    return light_model.fullAnnotate(data)
61
+
62
def get_color(l):
    """Return the hex colour used to highlight entity label ``l``.

    Labels whose lower-cased name is a key of the module-level
    ``LABEL_COLORS`` get that fixed colour. Any other label gets a random
    ``#RRGGBB`` colour with each channel drawn from 0..200, so the colour of
    an unknown label differs between calls.
    """
    if str(l).lower() not in LABEL_COLORS:
        red = random.randint(0, 200)
        green = random.randint(0, 200)
        blue = random.randint(0, 200)
        return '#%02X%02X%02X' % (red, green, blue)
    return LABEL_COLORS[l.lower()]
68
+
69
def simplified_display_annotations(text, annotations, labels):
    """Render ``text`` as HTML with the annotated entity chunks highlighted.

    Args:
        text: The original input text.
        annotations: Iterable of chunk rows indexed as
            ``[type, begin, end, chunk_text, metadata]``; ``begin``/``end``
            are inclusive character offsets into ``text`` and
            ``metadata['entity']`` is the label. Rows are consumed in the
            order given.
        labels: Entity labels to colour. A chunk whose label is not listed
            is emitted without a background colour or label suffix.

    Returns:
        A string of ``<span>`` elements covering the text.
    """
    # Local import: the result is rendered with ``unsafe_allow_html=True``
    # and ``text`` can be typed by the user, so every text fragment has to be
    # escaped before it is placed into markup.
    from html import escape

    def get_html_for_span(span_text, background_color=None, entity_name=None):
        # ``str()`` first: chunk text may have been parsed into an int.
        safe_text = escape(str(span_text), quote=False)
        if background_color:
            style = f'style="background-color: {background_color};"'
        else:
            style = ""
        if entity_name:
            safe_name = escape(str(entity_name), quote=False)
            return f'<span {style}>{safe_text} <span>({safe_name})</span></span>'
        return f'<span {style}>{safe_text}</span>'

    label_color = {label: get_color(label) for label in labels}
    pieces = []  # joined once at the end instead of repeated string +=
    pos = 0

    for chunk in annotations:
        begin, end = chunk[1], chunk[2]
        entity_type = chunk[4]['entity']

        # Plain text between the previous chunk and this one.
        if pos < begin:
            pieces.append(get_html_for_span(text[pos:begin]))
        pieces.append(
            get_html_for_span(
                chunk[3],
                label_color.get(entity_type),
                entity_type if entity_type in label_color else None,
            )
        )
        pos = end + 1  # ``end`` is inclusive

    # Trailing text after the last chunk.
    if pos < len(text):
        pieces.append(get_html_for_span(text[pos:]))

    return "".join(pieces)
95
+
96
def parse_text_to_complex_list(text):
    """Parse the ``str()`` form of an annotation into a list of fields.

    The first 11 characters and the final character of the stripped input
    are dropped (the ``Annotation(`` wrapper and its closing ``)``). The rest
    is split on commas that are not nested inside ``{}``/``[]``.

    Each field is then converted:
      * all-digit fields become ``int``;
      * fields starting with a quote have surrounding quotes stripped;
      * fields starting with ``{`` or ``[`` are evaluated with
        ``ast.literal_eval``; if that fails (e.g. bracketed entity text that
        is not a Python literal) the raw string is kept;
      * anything else is kept as a string.

    Args:
        text: String such as
            ``"Annotation(chunk, 0, 3, John, {'entity': 'PER'}, [])"``.

    Returns:
        List of parsed fields in their original order.
    """
    text = text.strip()[11:-1]

    elements = []
    current = ''
    depth_stack = []
    for char in text:
        if char in '{[':
            depth_stack.append(char)
        elif char in ']}':
            # An unmatched closer can occur inside free text; ignore it
            # rather than popping an empty stack (IndexError).
            if depth_stack:
                depth_stack.pop()
        elif char == ',' and not depth_stack:
            elements.append(current.strip())
            current = ''
            continue
        current += char
    elements.append(current.strip())

    # Convert elements to the appropriate type
    parsed_elements = []
    for element in elements:
        element = element.strip()
        if element.isdigit():
            parsed_elements.append(int(element))
        elif element.startswith(('\'', '"')):
            parsed_elements.append(element.strip('\'"'))
        elif element.startswith(('{', '[')):
            try:
                parsed_elements.append(ast.literal_eval(element))
            except (ValueError, SyntaxError):
                parsed_elements.append(element)
        else:
            parsed_elements.append(element)

    return parsed_elements
140
+
141
+ ############ SETTING UP THE PAGE LAYOUT ############
142
+ ### SIDEBAR CONTENT ###
143
+
144
+ language_info = {
145
+ "EN": {
146
+ "title": "Recognize entities in text",
147
+ "description": "Recognize Persons, Locations, Organizations and Misc entities using out of the box pretrained Deep Learning models based on BERT (ner_dl_bert) word embeddings.",
148
+ }
149
+ }
150
+
151
+ model = st.sidebar.selectbox("Choose the pretrained model", ["ner_dl_bert", "ner_dl"], help="For more info about the models visit: https://sparknlp.org/models")
152
+
153
+ st.title(language_info["EN"]["title"])
154
+
155
+ link = """<a href="https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/streamlit_notebooks/NER_EN.ipynb">
156
+ <img src="https://colab.research.google.com/assets/colab-badge.svg" style="zoom: 1.3" alt="Open In Colab"/></a>"""
157
+ st.sidebar.markdown('Reference notebook:')
158
+ st.sidebar.markdown(link, unsafe_allow_html=True)
159
+
160
+ ### MAIN CONTENT ###
161
+
162
+ examples = [
163
+ "William Henry Gates III (born October 28, 1955) is an American business magnate, software developer, investor, and philanthropist. He is best known as the co-founder of Microsoft Corporation. During his career at Microsoft, Gates held the positions of chairman, chief executive officer (CEO), president and chief software architect, while also being the largest individual shareholder until May 2014. He is one of the best-known entrepreneurs and pioneers of the microcomputer revolution of the 1970s and 1980s. Born and raised in Seattle, Washington, Gates co-founded Microsoft with childhood friend Paul Allen in 1975, in Albuquerque, New Mexico; it went on to become the world's largest personal computer software company. Gates led the company as chairman and CEO until stepping down as CEO in January 2000, but he remained chairman and became chief software architect. During the late 1990s, Gates had been criticized for his business tactics, which have been considered anti-competitive. This opinion has been upheld by numerous court rulings. In June 2006, Gates announced that he would be transitioning to a part-time role at Microsoft and full-time work at the Bill & Melinda Gates Foundation, the private charitable foundation that he and his wife, Melinda Gates, established in 2000.[9] He gradually transferred his duties to Ray Ozzie and Craig Mundie. He stepped down as chairman of Microsoft in February 2014 and assumed a new post as technology adviser to support the newly appointed CEO Satya Nadella.",
164
+ "The Mona Lisa is a 16th century oil painting created by Leonardo. It's held at the Louvre in Paris.",
165
+ "When Sebastian Thrun started working on self-driving cars at Google in 2007, few people outside of the company took him seriously. “I can tell you very senior CEOs of major American car companies would shake my hand and turn away because I wasn’t worth talking to,” said Thrun, now the co-founder and CEO of online higher education startup Udacity, in an interview with Recode earlier this week.",
166
+ "Facebook is a social networking service launched as TheFacebook on February 4, 2004. It was founded by Mark Zuckerberg with his college roommates and fellow Harvard University students Eduardo Saverin, Andrew McCollum, Dustin Moskovitz and Chris Hughes. The website's membership was initially limited by the founders to Harvard students, but was expanded to other colleges in the Boston area, the Ivy League, and gradually most universities in the United States and Canada.",
167
+ "The history of natural language processing generally started in the 1950s, although work can be found from earlier periods. In 1950, Alan Turing published an article titled 'Computing Machinery and Intelligence' which proposed what is now called the Turing test as a criterion of intelligence",
168
+ "Geoffrey Everest Hinton is an English Canadian cognitive psychologist and computer scientist, most noted for his work on artificial neural networks. Since 2013 he divides his time working for Google and the University of Toronto. In 2017, he cofounded and became the Chief Scientific Advisor of the Vector Institute in Toronto.",
169
+ "When I told John that I wanted to move to Alaska, he warned me that I'd have trouble finding a Starbucks there.",
170
+ "Steven Paul Jobs was an American business magnate, industrial designer, investor, and media proprietor. He was the chairman, chief executive officer (CEO), and co-founder of Apple Inc., the chairman and majority shareholder of Pixar, a member of The Walt Disney Company's board of directors following its acquisition of Pixar, and the founder, chairman, and CEO of NeXT. Jobs is widely recognized as a pioneer of the personal computer revolution of the 1970s and 1980s, along with Apple co-founder Steve Wozniak. Jobs was born in San Francisco, California, and put up for adoption. He was raised in the San Francisco Bay Area. He attended Reed College in 1972 before dropping out that same year, and traveled through India in 1974 seeking enlightenment and studying Zen Buddhism.",
171
+ "Titanic is a 1997 American epic romance and disaster film directed, written, co-produced, and co-edited by James Cameron. Incorporating both historical and fictionalized aspects, it is based on accounts of the sinking of the RMS Titanic, and stars Leonardo DiCaprio and Kate Winslet as members of different social classes who fall in love aboard the ship during its ill-fated maiden voyage.",
172
+ "Other than being the king of the north, John Snow is a an english physician and a leader in the development of anaesthesia and medical hygiene. He is considered for being the first one using data to cure cholera outbreak in 1834."
173
+ ]
174
+
175
+ st.subheader(language_info["EN"]["description"])
176
+ selected_text = st.selectbox("Select an example", examples)
177
+ custom_input = st.text_input("Try it for yourself!")
178
+
179
+ if custom_input:
180
+ selected_text = custom_input
181
+
182
+ st.subheader('Selected Text')
183
+ st.write(selected_text)
184
+
185
# Start Spark, build the pipeline for the chosen model and annotate the text.
# The pipeline is bound to a lower-case name so it does not shadow the
# imported ``pyspark.ml.Pipeline`` class.
spark = init_spark()
pipeline = create_pipeline(model)
output = fit_data(pipeline, selected_text)
188
+
189
+ ner_mapping = {
190
+ 'PERSON': 'People, including fictional.',
191
+ 'PER': 'People, including fictional.',
192
+ 'HUM': 'Humans',
193
+ 'IND': 'Individuals',
194
+ 'NORP': 'Nationalities or religious or political groups.',
195
+ 'FAC': 'Buildings, airports, highways, bridges, etc.',
196
+ 'FACILITY': 'Buildings, airports, highways, bridges, etc.',
197
+ 'STRUCTURE': 'Structures like buildings, bridges, etc.',
198
+ 'ORG': 'Companies, agencies, institutions, etc.',
199
+ 'ORGANIZATION': 'Companies, agencies, institutions, etc.',
200
+ 'INSTITUTION': 'Educational, governmental, and other organizations.',
201
+ 'LOC': 'Countries, cities, states, mountain ranges, bodies of water.',
202
+ 'LOCATION': 'Countries, cities, states, mountain ranges, bodies of water.',
203
+ 'PLACE': 'Specific locations.',
204
+ 'GPE': 'Geopolitical entities, such as countries, cities, states.',
205
+ 'PRODUCT': 'Objects, vehicles, foods, etc. (Not services.)',
206
+ 'PROD': 'Product',
207
+ 'GOOD': 'Goods and products.',
208
+ 'EVENT': 'Named hurricanes, battles, wars, sports events, etc.',
209
+ 'OCCURRENCE': 'Occurrences and events.',
210
+ 'WORK_OF_ART': 'Titles of books, songs, etc.',
211
+ 'ART': 'Works of art, including books, paintings, songs, etc.',
212
+ 'LAW': 'Named documents made into laws.',
213
+ 'LEGISLATION': 'Laws and legal documents.',
214
+ 'LANGUAGE': 'Any named language.',
215
+ 'DATE': 'Absolute or relative dates or periods.',
216
+ 'TIME': 'Times smaller than a day.',
217
+ 'PERCENT': 'Percentage, including ”%“.',
218
+ 'MONEY': 'Monetary values, including unit.',
219
+ 'CURRENCY': 'Monetary values, including unit.',
220
+ 'QUANTITY': 'Measurements, as of weight or distance.',
221
+ 'MEASURE': 'Measurements and quantities.',
222
+ 'ORDINAL': '“first”, “second”, etc.',
223
+ 'CARDINAL': 'Numerals that do not fall under another type.',
224
+ 'NUMBER': 'Numbers and numerals.',
225
+ 'MISC': 'Miscellaneous entities, e.g. events, nationalities, products or works of art.',
226
+ 'MISCELLANEOUS': 'Miscellaneous entities.',
227
+ 'ENT': 'Entity (generic label).',
228
+ 'GPE_LOC': 'Geopolitical Entity',
229
+ 'ANIMAL': 'Animals, including fictional.',
230
+ 'PLANT': 'Plants, including fictional.',
231
+ 'SUBSTANCE': 'Substances and materials.',
232
+ 'DISEASE': 'Diseases and medical conditions.',
233
+ 'SYMPTOM': 'Symptoms and medical signs.',
234
+ 'MEDICAL': 'Medical terms and conditions.',
235
+ 'FOOD': 'Food items.',
236
+ 'DRINK': 'Drinks and beverages.',
237
+ 'VEHICLE': 'Types of vehicles.',
238
+ 'WEAPON': 'Weapons and armaments.',
239
+ 'TECHNOLOGY': 'Technological terms and devices.',
240
+ 'GAME': 'Games and sports.',
241
+ 'HOBBY': 'Hobbies and recreational activities.',
242
+ 'RELIGION': 'Religious terms and entities.',
243
+ 'MYTH': 'Mythological entities.',
244
+ 'ASTRONOMICAL': 'Astronomical entities (e.g., planets, stars).',
245
+ 'NATURAL_PHENOMENON': 'Natural phenomena (e.g., earthquakes, storms).',
246
+ 'CELESTIAL_BODY': 'Celestial bodies (e.g., stars, planets).',
247
+ 'DRV': 'Driver'
248
+ }
249
+
250
# One table row per detected chunk. A label missing from ``ner_mapping`` gets
# an empty explanation instead of raising KeyError and aborting the page.
entities = [
    {
        'text': ent.result,
        'start': ent.begin,
        'end': ent.end,
        'label': ent.metadata['entity'],
        'Explain NER Labels': ner_mapping.get(ent.metadata['entity'], ''),
    }
    for ent in output[0]['ner_chunk']
]
251
+
252
+ LABEL_COLORS = {
253
+ 'per': '#0C8888', 'pers': '#0C8888', 'person': '#0C8888',
254
+ 'org': '#FF33C1',
255
+ 'misc': '#3196D4', 'mis': '#3196D4',
256
+ 'loc': '#5B00A3', 'location': '#5B00A3'
257
+ }
258
+
259
+ HTML_WRAPPER = """<div class="scroll entities" style="overflow-x: auto; border: 1px solid #e6e9ef; border-radius: 0.25rem; padding: 1rem; margin-bottom: 2.5rem; white-space:pre-wrap">{}</div>"""
260
+
261
# Rows in the shape simplified_display_annotations indexes:
# [type, begin, end, chunk_text, metadata]. The fields are read straight from
# each annotation rather than re-parsed from its str() form, which mis-splits
# chunk text containing commas or brackets. Index 0 is not read by the
# display helper; 'chunk' is kept there as a fixed placeholder.
chunk_list = [
    ['chunk', n.begin, n.end, n.result, n.metadata]
    for n in output[0]['ner_chunk']
]

st.subheader("Text annotated with identified Named Entities")

labels = [n.metadata['entity'] for n in output[0]['ner_chunk']]
ner = simplified_display_annotations(selected_text, chunk_list, labels)
st.markdown(HTML_WRAPPER.format(ner), unsafe_allow_html=True)
st.write(pd.DataFrame(entities))
pages/Sentiment Analysis.py ADDED
@@ -0,0 +1,117 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import sparknlp
3
+ import os
4
+ from sparknlp.base import *
5
+ from sparknlp.common import *
6
+ from sparknlp.annotator import *
7
+ from pyspark.ml import Pipeline
8
+ from sparknlp.pretrained import PretrainedPipeline
9
+
10
@st.cache_resource
def init_spark():
    """Create the Spark NLP session used by this page.

    Decorated with ``st.cache_resource`` so the session is memoized rather
    than started again on every Streamlit rerun.

    Returns:
        The session object returned by ``sparknlp.start()``.
    """
    session = sparknlp.start()
    return session
14
+
15
@st.cache_resource
def create_pipeline(model):
    """Build the (unfitted) sentiment-classification pipeline.

    Stages, in order: document assembler -> Universal Sentence Encoder
    (``tfhub_use``) -> ``SentimentDLModel``; the label is written to the
    ``sentiment`` column.

    Args:
        model: Name of the English ``SentimentDLModel`` to load.

    Returns:
        A ``pyspark.ml.Pipeline`` containing the three stages.
    """
    assembler = (
        DocumentAssembler()
        .setInputCol("text")
        .setOutputCol("document")
    )

    sentence_encoder = (
        UniversalSentenceEncoder.pretrained("tfhub_use", "en")
        .setInputCols(["document"])
        .setOutputCol("sentence_embeddings")
    )

    classifier = (
        SentimentDLModel.pretrained(model, "en")
        .setInputCols(["sentence_embeddings"])
        .setOutputCol("sentiment")
    )

    stages = [assembler, sentence_encoder, classifier]
    return Pipeline(stages=stages)
32
+
33
def fit_data(pipeline, data):
    """Run ``pipeline`` on ``data`` and return the predicted sentiment label.

    Relies on the module-level ``spark`` session being defined before the
    first call.

    Args:
        pipeline: The pipeline to fit and run.
        data: The text passed to ``LightPipeline.fullAnnotate``.

    Returns:
        The ``result`` of the first annotation in the ``sentiment`` column
        of the first ``fullAnnotate`` result.
    """
    # The pipeline is fitted on a single empty-string row named 'text'; the
    # frame only serves as the input to ``fit``.
    placeholder_df = spark.createDataFrame([['']]).toDF('text')
    light_model = LightPipeline(pipeline.fit(placeholder_df))
    first_result = light_model.fullAnnotate(data)[0]
    sentiment_annotations = first_result['sentiment']
    return sentiment_annotations[0].result
40
+
41
+ ############ SETTING UP THE PAGE LAYOUT ############
42
+
43
+ st.title("Analyze sentiment in movie reviews and tweets")
44
+
45
+ ### SIDEBAR CONTENT ###
46
+
47
+ model_list = ["sentimentdl_use_imdb", "sentimentdl_use_twitter"]
48
+ model = st.sidebar.selectbox(
49
+ "Choose the pretrained model",
50
+ model_list,
51
+ help="For more info about the models visit: https://sparknlp.org/models"
52
+ )
53
+
54
+ # Let's add the colab link for the notebook.
55
+
56
+ link = """<a href="https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/streamlit_notebooks/SENTIMENT_EN.ipynb">
57
+ <img src="https://colab.research.google.com/assets/colab-badge.svg" style="zoom: 1.3" alt="Open In Colab"/></a>"""
58
+ st.sidebar.title('')
59
+ st.sidebar.markdown('Reference notebook:')
60
+ st.sidebar.markdown(link, unsafe_allow_html=True)
61
+
62
+ ### MAIN CONTENT ###
63
+
64
+ examples = {
65
+ "IMDB": [
66
+ """Demonicus is a movie turned into a video game! I just love the story and the things that goes on in the film.It is a B-film ofcourse but that doesn`t bother one bit because its made just right and the music was rad! Horror and sword fight freaks,buy this movie now!""",
67
+ """Back when Alec Baldwin and Kim Basinger were a mercurial, hot-tempered, high-powered Hollywood couple they filmed this (nearly) scene-for-scene remake of the 1972 Steve McQueen-Ali MacGraw action-thriller about a fugitive twosome. It almost worked the first time because McQueen was such a vital presence on the screen--even stone silent and weary, you could sense his clock ticking, his cagey magnetism. Baldwin is not in Steve McQueen's league, but he has his charms and is probably a more versatile actor--if so, this is not a showcase for his attributes. Basinger does well and certainly looks good, but James Woods is artificially hammy in a silly mob-magnet role. A sub-plot involving another couple taken hostage by Baldwin's ex-partner was unbearable in the '72 film and plays even worse here. As for the action scenes, they're pretty old hat, which causes one to wonder: why even remake the original?""",
68
+ """Despite a tight narrative, Johnnie To's Election feels at times like it was once a longer picture, with many characters and plot strands abandoned or ultimately unresolved. Some of these are dealt with in the truly excellent and far superior sequel, Election 2: Harmony is a Virtue, but it's still a dependably enthralling thriller about a contested Triad election that bypasses the usual shootouts and explosions (though not the violence) in favour of constantly shifting alliances that can turn in the time it takes to make a phone call. It's also a film where the most ruthless character isn't always the most threatening one, as the chilling ending makes only too clear: one can imagine a lifetime of psychological counselling being necessary for all the trauma that one inflicts on one unfortunate bystander. Simon Yam, all too often a variable actor but always at his best under To's direction, has possibly never been better in the lead, not least because Tony Leung's much more extrovert performance makes his stillness more the powerful.""",
69
+ """This movie has successfully proved what we all already know, that professional basket-ball players suck at everything besides playing basket-ball. Especially rapping and acting. I can not even begin to describe how bad this movie truly is. First of all, is it just me, or is that the ugliest kid you have ever seen? I mean, his teeth could be used as a can-opener. Secondly, why would a genie want to pursue a career in the music industry when, even though he has magical powers, he sucks horribly at making music? Third, I have read the Bible. In no way shape or form did it say that Jesus made genies. Fourth, what was the deal with all the crappy special effects? I assure you that any acne-addled nerdy teenager with a computer could make better effects than that. Fifth, why did the ending suck so badly? And what the hell is a djin? And finally, whoever created the nightmare known as Kazaam needs to be thrown off of a plane and onto the Eiffel Tower, because this movie take the word "suck" to an entirely new level.""",
70
+ """The fluttering of butterfly wings in the Atlantic can unleash a hurricane in the Pacific. According to this theory (somehow related to the Chaos Theory, I'm not sure exactly how), every action, no matter how small or insignificant, will start a chain reaction that can lead to big events. This small jewel of a film shows us a series of seemingly-unrelated characters, most of them in Paris, whose actions will affect each others' lives. (The six-degrees-of-separation theory can be applied as well.) Each story is a facet of the jewel that is this film. The acting is finely-tuned and nuanced (Audrey Tautou is luminous), the stories mesh plausibly, the humor is just right, and the viewer leaves the theatre nodding in agreement.""",
71
+ """There have been very few films I have not been able to sit through. I made it through Battle Field Earth no problem. But this, This is one of the single worst films EVER to be made. I understand Whoopi Goldberg tried to get of acting in it. I do not blame her. I would feel ashamed to have this on a resume. I belive it is a rare occasion when almost every gag in a film falls flat on it's face. Well it happens here. Not to mention the SFX, look for the dino with the control cables hanging out of it rear end!!!!!! Halfway through the film I was still looking for a plot. I never found one. Save yourself the trouble of renting this and save 90 minutes of your life.""",
72
+ """After a long hard week behind the desk making all those dam serious decisions this movie is a great way to relax. Like Wells and the original radio broadcast this movie will take you away to a land of alien humor and sci-fi paraday. 'Captain Zippo died in the great charge of the Buick. He was a brave man.' The Jack Nicholson impressions shine right through that alien face with the dark sun glasses and leather jacket. And always remember to beware of the 'doughnut of death!' Keep in mind the number one rule of this movie - suspension of disbelief - sit back and relax - and 'Prepare to die Earth Scum!' You just have to see it for yourself.""",
73
+ """When Ritchie first burst on to movie scene his films were hailed as funny, witty, well directed and original. If one could compare the hype he had generated with his first two attempts and the almost universal loathing his last two outings have created one should consider - has Ritchie been found out? Is he really that talented? Does he really have any genuine original ideas? Or is he simply a pretentious and egotistical director who really wants to be Fincher, Tarantino and Leone all rolled into one colossal and disorganised heap? After watching Revolver one could be excused for thinking were did it all go wrong? What happened to his great sense of humour? Where did he get all these mixed and convoluted ideas from? Revolver tries to be clever, philosophical and succinct, it tries to be an intelligent psychoanalysis, it tries to be an intricate and complicated thriller. Ritchie does make a gargantuan effort to fulfil all these many objectives and invests great chunks of a script into existential musings and numerous plot twists. However, in the end all it serves is to construct a severely disjointed, unstructured and ultimately unfriendly film to the audience. Its plagiarism is so sinful and blatant that although Ritchie does at least attempt to give his own spin he should be punished for even trying to pass it off as his own work. So what the audience gets ultimately is a terrible screenplay intertwined with many pretentious oneliners and clumsy setpieces.<br /><br />Revolver is ultimately an unoriginal and bland movie that has stolen countless themes from masterpieces like Fight Club, Usual Suspects and Pulp Fiction. It aims high, but inevitably shots blanks aplenty.<br /><br />Revolver deserves to be lambasted, it is a truly poor film masquerading as a wannabe masterpiece from a wannabe auteur. However, it falls flat on its farcical face and just fails at everything it wants to be and achieve.""",
74
+ """I always thought this would be a long and boring Talking-Heads flick full of static interior takes, dude, I was wrong. "Election" is a highly fascinating and thoroughly captivating thriller-drama, taking a deep and realistic view behind the origins of Triads-Rituals. Characters are constantly on the move, and although as a viewer you kinda always remain an outsider, it's still possible to feel the suspense coming from certain decisions and ambitions of the characters. Furthermore Johnnie To succeeds in creating some truly opulent images due to meticulously composed lighting and atmospheric light-shadow contrasts. Although there's hardly any action, the ending is still shocking in it's ruthless depicting of brutality. Cool movie that deserves more attention, and I came to like the minimalistic acoustic guitar score quite a bit.""",
75
+ """This is to the Zatoichi movies as the "Star Trek" movies were to "Star Trek"--except that in this case every one of the originals was more entertaining and interesting than this big, shiny re-do, and also better made, if substance is more important than surface. Had I never seen them, I would have thought this good-looking but empty; since I had, I thought its style inappropriate and its content insufficient. The idea of reviving the character in a bigger, slicker production must have sounded good, but there was no point in it, other than the hope of making money; it's just a show, which mostly fails to capture the atmosphere of the character's world and wholly fails to take the character anywhere he hasn't been already (also, the actor wasn't at his best). I'd been hoping to see Ichi at a late stage of life, in a story that would see him out gracefully and draw some conclusion from his experience overall; this just rehashes bits and pieces from the other movies, seasoned with more sex and sfx violence. Not the same experience at all."""
76
+ ],
77
+
78
+ "Twitter": [
79
+ """@Mbjthegreat i really dont want AT&amp;T phone service..they suck when it comes to having a signal""",
80
+ """holy crap. I take a nap for 4 hours and Pitchfork blows up my twitter dashboard. I wish I was at Coachella.""",
81
+ """@Susy412 he is working today ive tried that still not working..... hmmmm!! im rubbish with computers haha!""",
82
+ """Brand New Canon EOS 50D 15MP DSLR Camera Canon 17-85mm IS Lens ...: Web Technology Thread, Brand New Canon EOS 5.. http://u.mavrev.com/5a3t""",
83
+ """Watching a programme about the life of Hitler, its only enhancing my geekiness of history.""",
84
+ """GM says expects announcment on sale of Hummer soon - Reuters: WDSUGM says expects announcment on sale of Hummer .. http://bit.ly/4E1Fv""",
85
+ """@accannis @edog1203 Great Stanford course. Thanks for making it available to the public! Really helpful and informative for starting off!""",
86
+ """@the_real_usher LeBron is cool. I like his personality...he has good character.""",
87
+ """@sketchbug Lebron is a hometown hero to me, lol I love the Lakers but let's go Cavs, lol""",
88
+ """@PDubyaD right!!! LOL we'll get there!! I have high expectations, Warren Buffet style."""
89
+ ]
90
+ }
91
+
92
+ st.subheader("Detect the general sentiment expressed in a movie review or tweet by using our pretrained Spark NLP DL classifier.")
93
+
94
+ selected_text = None
95
+ result_type = 'tweet'
96
+ if 'imdb' in model.lower():
97
+ selected_text = st.selectbox("Select a sample IMDB review", examples['IMDB'])
98
+ result_type = 'review'
99
+ else:
100
+ selected_text = st.selectbox("Select a sample Tweet", examples['Twitter'])
101
+
102
+ custom_input = st.text_input("Try it for yourself!")
103
+
104
+ if custom_input:
105
+ selected_text = custom_input
106
+
107
+ st.subheader('Selected Text')
108
+ st.write(selected_text)
109
+
110
# Start Spark, build the pipeline for the chosen model and classify the text.
spark = init_spark()
pipeline = create_pipeline(model)
output = fit_data(pipeline, selected_text)

# Compare case-insensitively so every spelling of the label is covered.
sentiment = str(output).lower()

if sentiment in ('pos', 'positive'):
    st.markdown(f"""<h3>This seems like a <span style="color: green">positive</span> {result_type}. <span style="font-size:35px;">&#128515;</span></h3>""", unsafe_allow_html=True)
elif sentiment in ('neg', 'negative'):
    st.markdown(f"""<h3>This seems like a <span style="color: red">negative</span> {result_type}. <span style="font-size:35px;">&#128544;</span></h3>""", unsafe_allow_html=True)
else:
    # Any other label (e.g. a neutral prediction) used to render nothing,
    # leaving the page without a result.
    st.markdown(f"""<h3>This seems like a <span style="color: gray">neutral</span> {result_type}. <span style="font-size:35px;">&#128528;</span></h3>""", unsafe_allow_html=True)
pages/Summarization.py ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import sparknlp
3
+ import os
4
+ from sparknlp.base import *
5
+ from sparknlp.common import *
6
+ from sparknlp.annotator import *
7
+ from pyspark.ml import Pipeline
8
+ from sparknlp.pretrained import PretrainedPipeline
9
+
10
@st.cache_resource
def init_spark():
    """Start a Spark NLP session via ``sparknlp.start()`` and return it.

    The ``st.cache_resource`` decorator lets Streamlit hand back the same
    session object on later script reruns.
    """
    return sparknlp.start()
14
+
15
@st.cache_resource
def create_pipeline(model):
    """Build the (unfitted) summarization pipeline.

    Args:
        model: Name of the pretrained English T5 model to load.

    Returns:
        A ``Pipeline`` with two stages: a document assembler reading the
        ``text`` column and a T5 transformer writing ``summaries``.
    """
    to_document = (
        DocumentAssembler()
        .setInputCol("text")
        .setOutputCol("documents")
    )

    summarizer = (
        T5Transformer()
        .pretrained(model, 'en')
        .setTask("summarize:")
        .setMaxOutputLength(200)
        .setInputCols(["documents"])
        .setOutputCol("summaries")
    )

    stages = [to_document, summarizer]
    return Pipeline().setStages(stages)
30
+
31
def fit_data(pipeline, data):
    """Summarize ``data`` with ``pipeline`` and return the summary text.

    Reads the module-level ``spark`` session, so that name must be bound
    before this function is called.

    Args:
        pipeline: Unfitted pipeline whose output includes a ``summaries`` column.
        data: Text to summarize.

    Returns:
        The ``result`` of the first ``summaries`` annotation for ``data``.
    """
    # The pipeline is fitted on a one-row frame holding an empty string.
    seed_df = spark.createDataFrame([['']]).toDF('text')
    light_model = LightPipeline(pipeline.fit(seed_df))
    first_annotation_set = light_model.fullAnnotate(data)[0]
    return first_annotation_set['summaries'][0].result
37
+
38
############ PAGE LAYOUT ############

st.title("Summarize Text")

### SIDEBAR ###

# Pretrained T5 variants offered to the user.
available_models = ['t5_base', 't5_small']
model = st.sidebar.selectbox(
    "Choose the pretrained model",
    available_models,
    help="For more info about the models visit: https://sparknlp.org/models",
)

# Badge linking to the reference notebook.
notebook_badge = """<a href="https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/streamlit_notebooks/T5TRANSFORMER.ipynb">
<img src="https://colab.research.google.com/assets/colab-badge.svg" style="zoom: 1.3" alt="Open In Colab"/>
</a>"""
st.sidebar.title('')
st.sidebar.markdown('Reference notebook:')
st.sidebar.markdown(notebook_badge, unsafe_allow_html=True)
58
+
59
+ ### MAIN CONTENT ###
60
+
61
+ # Sample text options
62
+ options = [
63
+ "Mount Tai is a mountain of historical and cultural significance located north of the city of Tai'an, in Shandong province, China. The tallest peak is the Jade Emperor Peak, which is commonly reported as being 1,545 meters tall, but is officially described by the PRC government as 1,532.7 meters tall. It is associated with sunrise, birth, and renewal, and is often regarded the foremost of the five. Mount Tai has been a place of worship for at least 3,000 years and served as one of the most important ceremonial centers of China during large portions of this period.",
64
+ "The Guadeloupe amazon (Amazona violacea) is a hypothetical extinct species of parrot that is thought to have been endemic to the Lesser Antillean island region of Guadeloupe. Described by 17th- and 18th-century writers, it is thought to have been related to, or possibly the same as, the extant imperial amazon. A tibiotarsus and an ulna bone from the island of Marie-Galante may belong to the Guadeloupe amazon. According to contemporary descriptions, its head, neck and underparts were mainly violet or slate, mixed with green and black; the back was brownish green; and the wings were green, yellow and red. It had iridescent feathers, and was able to raise a \"ruff\" of feathers around its neck. It fed on fruits and nuts, and the male and female took turns sitting on the nest. French settlers ate the birds and destroyed their habitat. Rare by 1779, the species appears to have become extinct by the end of the 18th century.",
65
+ "Pierre-Simon, marquis de Laplace (23 March 1749 – 5 March 1827) was a French scholar and polymath whose work was important to the development of engineering, mathematics, statistics, physics, astronomy, and philosophy. He summarized and extended the work of his predecessors in his five-volume Mécanique Céleste (Celestial Mechanics) (1799–1825). This work translated the geometric study of classical mechanics to one based on calculus, opening up a broader range of problems. In statistics, the Bayesian interpretation of probability was developed mainly by Laplace.",
66
+ "John Snow (15 March 1813 – 16 June 1858) was an English physician and a leader in the development of anaesthesia and medical hygiene. He is considered one of the founders of modern epidemiology, in part because of his work in tracing the source of a cholera outbreak in Soho, London, in 1854, which he curtailed by removing the handle of a water pump. Snow's findings inspired the adoption of anaesthesia as well as fundamental changes in the water and waste systems of London, which led to similar changes in other cities, and a significant improvement in general public health around the world.",
67
+ "The Mona Lisa is a half-length portrait painting by Italian artist Leonardo da Vinci. Considered an archetypal masterpiece of the Italian Renaissance, it has been described as \"the best known, the most visited, the most written about, the most sung about, the most parodied work of art in the world\". The painting's novel qualities include the subject's enigmatic expression, the monumentality of the composition, the subtle modelling of forms, and the atmospheric illusionism.",
68
+ """Calculus, originally called infinitesimal calculus or "the calculus of infinitesimals", is the mathematical study of continuous change, in the same way that geometry is the study of shape and algebra is the study of generalizations of arithmetic operations. It has two major branches, differential calculus and integral calculus; the former concerns instantaneous rates of change, and the slopes of curves, while integral calculus concerns accumulation of quantities, and areas under or between curves. These two branches are related to each other by the fundamental theorem of calculus, and they make use of the fundamental notions of convergence of infinite sequences and infinite series to a well-defined limit.[1] Infinitesimal calculus was developed independently in the late 17th century by Isaac Newton and Gottfried Wilhelm Leibniz.[2][3] Today, calculus has widespread uses in science, engineering, and economics.[4] In mathematics education, calculus denotes courses of elementary mathematical analysis, which are mainly devoted to the study of functions and limits. The word calculus (plural calculi) is a Latin word, meaning originally "small pebble" (this meaning is kept in medicine – see Calculus (medicine)). Because such pebbles were used for calculation, the meaning of the word has evolved and today usually means a method of computation. It is therefore used for naming specific methods of calculation and related theories, such as propositional calculus, Ricci calculus, calculus of variations, lambda calculus, and process calculus.""",
69
+ ]
70
+
71
st.subheader("Summarize text to make it shorter while retaining meaning.")

# Offer a canned example and a free-text box; typed text wins when present.
example_choice = st.selectbox("Select an example", options)
custom_input = st.text_input("Try it for yourself!")
selected_text = custom_input if custom_input else example_choice

st.subheader('Text')
st.write(selected_text)

st.subheader("Summary")

# Run the summarizer and show its output.
spark = init_spark()
pipeline = create_pipeline(model)
output = fit_data(pipeline, selected_text)
st.write(output)
pages/Text Preprocessing.py ADDED
@@ -0,0 +1,164 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ############ IMPORTING LIBRARIES ############
2
+
3
+ # Import streamlit, Spark NLP, pyspark
4
+
5
+ import streamlit as st
6
+ import sparknlp
7
+ import os
8
+ import pandas as pd
9
+
10
+ import pyspark.sql.functions as F
11
+ from pyspark.ml import Pipeline
12
+ from pyspark.sql import SparkSession
13
+ from sparknlp.annotator import *
14
+ from sparknlp.base import *
15
+ from sparknlp.pretrained import PretrainedPipeline
16
+ from pyspark.sql.types import StringType, IntegerType
17
+
18
@st.cache_resource
def init_spark():
    """Start a Spark NLP session via ``sparknlp.start()`` and return it.

    The ``st.cache_resource`` decorator lets Streamlit hand back the same
    session object on later script reruns.
    """
    return sparknlp.start()
22
+
23
@st.cache_resource
def create_pipeline(model):
    """Build the (unfitted) text-preprocessing pipeline.

    Stages, in order: document assembler, sentence detector, tokenizer,
    normalizer, stop-words cleaner, stemmer and dictionary lemmatizer.
    The normalizer, stop-words cleaner, stemmer and lemmatizer all read the
    ``token`` column, so their outputs are independent of each other.

    Args:
        model: Not used when building the stages; kept so existing callers
            (which pass one positional argument) keep working.

    Returns:
        A ``pyspark.ml.Pipeline`` containing the seven stages.
    """
    document_assembler = (
        DocumentAssembler()
        .setInputCol("text")
        .setOutputCol("document")
    )

    sentence_detector = (
        SentenceDetector()
        .setInputCols(['document'])
        .setOutputCol('sentences')
    )

    tokenizer = (
        Tokenizer()
        .setInputCols(["document"])
        .setOutputCol("token")
    )

    # Raw string: the previous non-raw literal relied on invalid escape
    # sequences (\w, \d, \s), which Python warns about. The value is unchanged.
    normalizer = (
        Normalizer()
        .setInputCols(["token"])
        .setOutputCol("normalized")
        .setLowercase(True)
        .setCleanupPatterns([r"[^\w\d\s]"])
    )

    stopwords_cleaner = (
        StopWordsCleaner()
        .setInputCols("token")
        .setOutputCol("removed_stopwords")
        .setCaseSensitive(False)
    )

    stemmer = (
        Stemmer()
        .setInputCols(["token"])
        .setOutputCol("stem")
    )

    # The lemma dictionary is loaded from a path relative to the working directory.
    lemmatizer = (
        Lemmatizer()
        .setInputCols(["token"])
        .setOutputCol("lemma")
        .setDictionary("./AntBNC_lemmas_ver_001.txt", value_delimiter="\t", key_delimiter="->")
    )

    return Pipeline(stages=[
        document_assembler,
        sentence_detector,
        tokenizer,
        normalizer,
        stopwords_cleaner,
        stemmer,
        lemmatizer,
    ])
68
+
69
def fit_data(pipeline, data):
    """Annotate ``data`` with ``pipeline`` and return the raw annotations.

    Reads the module-level ``spark`` session, so that name must be bound
    before this function is called.

    Args:
        pipeline: Unfitted pipeline to run.
        data: Text to annotate.

    Returns:
        Whatever ``LightPipeline.fullAnnotate(data)`` returns for the fitted
        pipeline.
    """
    # The pipeline is fitted on a one-row frame holding an empty string.
    seed_df = spark.createDataFrame([['']]).toDF('text')
    light_model = LightPipeline(pipeline.fit(seed_df))
    return light_model.fullAnnotate(data)
77
+
78
############ SETTING UP THE PAGE LAYOUT ############

# Title now matches this page (it previously read "Typo Detector").
st.title("Text Preprocessing")

### SIDEBAR CONTENT ###

# Pipe-separated display names of the annotators. Order matters: the results
# section looks up ``selected_models`` by position.
model_name = "SentenceDetector|Tokenizer|Stemmer|Lemmatizer|Normalizer|Stop Words Remover"

st.sidebar.title("Filter Annotator Outputs")
selected_models = []
# ``model`` stays bound to the last label after the loop; the name is kept
# unchanged in case code further down the script relies on it.
for model in model_name.split('|'):
    check = st.sidebar.checkbox(model, value=True, key=model)
    selected_models.append(check)

# Colab badge linking to the reference notebook.
# NOTE(review): the URL points at the T5 transformer notebook; confirm this is
# the intended reference for the preprocessing page.
link= """<a href="https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/streamlit_notebooks/T5TRANSFORMER.ipynb"><img src="https://colab.research.google.com/assets/colab-badge.svg" style="zoom: 1.3" alt="Open In Colab"/></a>"""
st.sidebar.title('')
st.sidebar.markdown('Reference notebook:')
st.sidebar.markdown(link, unsafe_allow_html=True)
99
+
100
+ ### MAIN CONTENT ###
101
+
102
+ examples = [
103
+
104
+ "The Geneva Motor Show, the first major car show of the year, opens tomorrow with U.S. Car makers hoping to make new inroads into European markets due to the cheap dollar, automobile executives said. Ford Motor Co and General Motors Corp sell cars in Europe, where about 10.5 mln new cars a year are bought. GM also makes a few thousand in North American plants for European export.",
105
+ "Demonicus is a movie turned into a video game! I just love the story and the things that goes on in the film.It is a B-film ofcourse but that doesn`t bother one bit because its made just right and the music was rad! Horror and sword fight freaks,buy this movie now!",
106
+ "Quantum computing is the use of quantum-mechanical phenomena such as superposition and entanglement to perform computation. Computers that perform quantum computations are known as quantum computers. Quantum computers are believed to be able to solve certain computational problems, such as integer factorization (which underlies RSA encryption), substantially faster than classical computers. The study of quantum computing is a subfield of quantum information science. Quantum computing began in the early 1980s, when physicist Paul Benioff proposed a quantum mechanical model of the Turing machine.",
107
+ "Titanic is a 1997 American epic romance and disaster film directed, written, co-produced, and co-edited by James Cameron. Incorporating both historical and fictionalized aspects, it is based on accounts of the sinking of the RMS Titanic, and stars Leonardo DiCaprio and Kate Winslet as members of different social classes who fall in love aboard the ship during its ill-fated maiden voyage.",
108
+ "William Henry Gates III (born October 28, 1955) is an American business magnate, software developer, investor, and philanthropist. He is best known as the co-founder of Microsoft Corporation. During his career at Microsoft, Gates held the positions of chairman, chief executive officer (CEO), president and chief software architect, while also being the largest individual shareholder until May 2014. He is one of the best-known entrepreneurs and pioneers of the microcomputer revolution of the 1970s and 1980s. Born and raised in Seattle, Washington, Gates co-founded Microsoft with childhood friend Paul Allen in 1975, in Albuquerque, New Mexico; it went on to become the world's largest personal computer software company. Gates led the company as chairman and CEO until stepping down as CEO in January 2000, but he remained chairman and became chief software architect. During the late 1990s, Gates had been criticized for his business tactics, which have been considered anti-competitive. This opinion has been upheld by numerous court rulings. In June 2006, Gates announced that he would be transitioning to a part-time role at Microsoft and full-time work at the Bill & Melinda Gates Foundation, the private charitable foundation that he and his wife, Melinda Gates, established in 2000.[9] He gradually transferred his duties to Ray Ozzie and Craig Mundie. He stepped down as chairman of Microsoft in February 2014 and assumed a new post as technology adviser to support the newly appointed CEO Satya Nadella.",
109
+ """World War II (often abbreviated as WWII or WW2), also known as the Second World War, was a global war that lasted from 1939 to 1945. The vast majority of the world's countries—including all the great powers—eventually formed two opposing military alliances: the Allies and the Axis. A state of total war emerged, directly involving more than 100 million people from more than 30 countries. The major participants threw their entire economic, industrial, and scientific capabilities behind the war effort, blurring the distinction between civilian and military resources. World War II was the deadliest conflict in human history, marked by 70 to 85 million fatalities, most of whom were civilians in the Soviet Union and China. Tens of millions of people died during the conflict due to genocides (including the Holocaust), premeditated death from starvation, massacres, and disease. Aircraft played a major role in the conflict which included the use of terror bombing, strategic bombing and the only use of nuclear weapons in war.""",
110
+ "Disney Channel (originally called The Disney Channel from 1983 to 1997 and commonly shortened to Disney from 1997 to 2002) is an American pay television channel that serves as the flagship property of owner Disney Channels Worldwide unit of the Walt Disney Television subsidiary of The Walt Disney Company. Disney Channel's programming consists of original first-run television series, theatrically released and original made-for-TV movies and select other third-party programming. Disney Channel – which formerly operated as a premium service – originally marketed its programs towards families during the 1980s, and later at younger children by the 2000s.",
111
+ "For several hundred thousand years, the Sahara has alternated between desert and savanna grassland in a 20,000 year cycle[8] caused by the precession of the Earth's axis as it rotates around the Sun, which changes the location of the North African Monsoon. The area is next expected to become green in about 15,000 years (17,000 ACE).",
112
+ "Elon Musk is an engineer, industrial designer, technology entrepreneur and philanthropist. He is a citizen of South Africa, Canada, and the United States. He is the founder, CEO and chief engineer/designer of SpaceX; early investor, CEO and product architect of Tesla, Inc.; founder of The Boring Company; co-founder of Neuralink; and co-founder and initial co-chairman of OpenAI. He was elected a Fellow of the Royal Society (FRS) in 2018. In December 2016, he was ranked 21st on the Forbes list of The World's Most Powerful People, and was ranked joint-first on the Forbes list of the Most Innovative Leaders of 2019. A self-made billionaire, as of June 2020 his net worth was estimated at $38.8 billion and he is listed by Forbes as the 31st-richest person in the world. He is the longest tenured CEO of any automotive manufacturer globally.",
113
+ "Born and raised in the Austrian Empire, Tesla studied engineering and physics in the 1870s without receiving a degree, and gained practical experience in the early 1880s working in telephony and at Continental Edison in the new electric power industry. In 1884 he emigrated to the United States, where he became a naturalized citizen. He worked for a short time at the Edison Machine Works in New York City before he struck out on his own. With the help of partners to finance and market his ideas, Tesla set up laboratories and companies in New York to develop a range of electrical and mechanical devices. His alternating current (AC) induction motor and related polyphase AC patents, licensed by Westinghouse Electric in 1888, earned him a considerable amount of money and became the cornerstone of the polyphase system which that company eventually marketed."
114
+
115
+ ]
116
+
117
st.subheader("Split and clean text")

selected_text = st.selectbox("Select an example", examples)

custom_input = st.text_input("Try it for yourself!")

# Free-text input, when provided, takes precedence over the selected example.
if custom_input:
    selected_text = custom_input

st.subheader('Selected Text')
st.write(selected_text)

spark = init_spark()
# Lower-case name so the imported ``Pipeline`` class is not rebound, and the
# explicit ``model_name`` constant instead of a leaked loop variable.
pipeline = create_pipeline(model_name)
output = fit_data(pipeline, selected_text)

# ``fit_data`` returns the output of ``LightPipeline.fullAnnotate``: a list
# with one dict per input text, mapping each output column to its annotations.
# It is not a Spark DataFrame, so there is no ``toPandas()`` to call.
annotations = output[0]


def _results(column):
    """Return the ``result`` string of every annotation in ``column``."""
    return [annotation.result for annotation in annotations[column]]


### Displaying sentences
if selected_models[0]:
    st.subheader("Detected sentences in text:")
    st.dataframe(pd.DataFrame({'sentences': _results('sentences')}))

## Displaying tokens, stem, lemma
# Checkboxes 1..3 correspond to the Tokenizer, Stemmer and Lemmatizer entries.
token_columns = ['token', 'stem', 'lemma']
token_labels = ["Basic", 'Stemmed', 'Lemmatized']
chosen = [
    (column, label)
    for position, (column, label) in enumerate(zip(token_columns, token_labels), start=1)
    if selected_models[position]
]
tdf = pd.DataFrame({column: _results(column) for column, _ in chosen})
if tdf.shape[0] >= 1:
    st.subheader("\t" + ",\t".join(label for _, label in chosen) + "\tTokens:")
    st.dataframe(tdf)

if selected_models[4]:
    st.subheader("Tokens after removing punctuations:")
    st.dataframe(pd.DataFrame({'normalized': _results('normalized')}))

if selected_models[5]:
    st.subheader("Tokens after removing Stop Words:")
    st.dataframe(pd.DataFrame({'removed_stopwords': _results('removed_stopwords')}))
requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ streamlit
2
+ pandas
3
+ numpy
4
+ spark-nlp
5
+ pyspark
6
+
7
+