abdullahmubeen10 committed on
Commit
c55f1da
1 Parent(s): 2e04710

Update pages/Text Preprocessing.py

Browse files
Files changed (1) hide show
  1. pages/Text Preprocessing.py +155 -164
pages/Text Preprocessing.py CHANGED
@@ -1,164 +1,155 @@
1
- ############ IMPORTING LIBRARIES ############
2
-
3
- # Import streamlit, Spark NLP, pyspark
4
-
5
- import streamlit as st
6
- import sparknlp
7
- import os
8
- import pandas as pd
9
-
10
- import pyspark.sql.functions as F
11
- from pyspark.ml import Pipeline
12
- from pyspark.sql import SparkSession
13
- from sparknlp.annotator import *
14
- from sparknlp.base import *
15
- from sparknlp.pretrained import PretrainedPipeline
16
- from pyspark.sql.types import StringType, IntegerType
17
-
18
- @st.cache_resource
19
- def init_spark():
20
- spark = sparknlp.start()
21
- return spark
22
-
23
- @st.cache_resource
24
- def create_pipeline(model):
25
-
26
- documentAssembler = DocumentAssembler()\
27
- .setInputCol("text")\
28
- .setOutputCol("document")
29
-
30
- sentenceDetector = SentenceDetector()\
31
- .setInputCols(['document'])\
32
- .setOutputCol('sentences')
33
-
34
- tokenizer = Tokenizer() \
35
- .setInputCols(["document"]) \
36
- .setOutputCol("token")
37
-
38
- normalizer = Normalizer() \
39
- .setInputCols(["token"]) \
40
- .setOutputCol("normalized")\
41
- .setLowercase(True)\
42
- .setCleanupPatterns(["[^\w\d\s]"])
43
-
44
- stopwords_cleaner = StopWordsCleaner()\
45
- .setInputCols("token")\
46
- .setOutputCol("removed_stopwords")\
47
- .setCaseSensitive(False)\
48
-
49
- stemmer = Stemmer() \
50
- .setInputCols(["token"]) \
51
- .setOutputCol("stem")
52
-
53
-
54
- lemmatizer = Lemmatizer() \
55
- .setInputCols(["token"]) \
56
- .setOutputCol("lemma") \
57
- .setDictionary("./AntBNC_lemmas_ver_001.txt", value_delimiter ="\t", key_delimiter = "->")
58
-
59
- nlpPipeline = Pipeline(stages=[documentAssembler,
60
- sentenceDetector,
61
- tokenizer,
62
- normalizer,
63
- stopwords_cleaner,
64
- stemmer,
65
- lemmatizer])
66
-
67
- return nlpPipeline
68
-
69
- def fit_data(pipeline, data):
70
-
71
- empty_df = spark.createDataFrame([['']]).toDF('text')
72
- pipeline_model = pipeline.fit(empty_df)
73
- model = LightPipeline(pipeline_model)
74
- result = model.fullAnnotate(data)
75
-
76
- return result
77
-
78
- ############ SETTING UP THE PAGE LAYOUT ############
79
-
80
- st.title("Typo Detector")
81
-
82
- ### SIDEBAR CONTENT ###
83
-
84
- model_name = "SentenceDetector|Tokenizer|Stemmer|Lemmatizer|Normalizer|Stop Words Remover"
85
- #model = st.sidebar.selectbox("Choose the pretrained model", model_name, help="For more info about the models visit: https://sparknlp.org/models",)
86
-
87
- st.sidebar.title("Filter Annotator Outputs")
88
- selected_models = []
89
- for model in model_name.split('|'):
90
- check = st.sidebar.checkbox(model, value=True, key=model)
91
- selected_models.append(check)
92
-
93
- # # Let's add the colab link for the notebook.
94
-
95
- link= """<a href="https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/streamlit_notebooks/T5TRANSFORMER.ipynb"><img src="https://colab.research.google.com/assets/colab-badge.svg" style="zoom: 1.3" alt="Open In Colab"/></a>"""
96
- st.sidebar.title('')
97
- st.sidebar.markdown('Reference notebook:')
98
- st.sidebar.markdown(link, unsafe_allow_html=True)
99
-
100
- ### MAIN CONTENT ###
101
-
102
- examples = [
103
-
104
- "The Geneva Motor Show, the first major car show of the year, opens tomorrow with U.S. Car makers hoping to make new inroads into European markets due to the cheap dollar, automobile executives said. Ford Motor Co and General Motors Corp sell cars in Europe, where about 10.5 mln new cars a year are bought. GM also makes a few thousand in North American plants for European export.",
105
- "Demonicus is a movie turned into a video game! I just love the story and the things that goes on in the film.It is a B-film ofcourse but that doesn`t bother one bit because its made just right and the music was rad! Horror and sword fight freaks,buy this movie now!",
106
- "Quantum computing is the use of quantum-mechanical phenomena such as superposition and entanglement to perform computation. Computers that perform quantum computations are known as quantum computers. Quantum computers are believed to be able to solve certain computational problems, such as integer factorization (which underlies RSA encryption), substantially faster than classical computers. The study of quantum computing is a subfield of quantum information science. Quantum computing began in the early 1980s, when physicist Paul Benioff proposed a quantum mechanical model of the Turing machine.",
107
- "Titanic is a 1997 American epic romance and disaster film directed, written, co-produced, and co-edited by James Cameron. Incorporating both historical and fictionalized aspects, it is based on accounts of the sinking of the RMS Titanic, and stars Leonardo DiCaprio and Kate Winslet as members of different social classes who fall in love aboard the ship during its ill-fated maiden voyage.",
108
- "William Henry Gates III (born October 28, 1955) is an American business magnate, software developer, investor, and philanthropist. He is best known as the co-founder of Microsoft Corporation. During his career at Microsoft, Gates held the positions of chairman, chief executive officer (CEO), president and chief software architect, while also being the largest individual shareholder until May 2014. He is one of the best-known entrepreneurs and pioneers of the microcomputer revolution of the 1970s and 1980s. Born and raised in Seattle, Washington, Gates co-founded Microsoft with childhood friend Paul Allen in 1975, in Albuquerque, New Mexico; it went on to become the world's largest personal computer software company. Gates led the company as chairman and CEO until stepping down as CEO in January 2000, but he remained chairman and became chief software architect. During the late 1990s, Gates had been criticized for his business tactics, which have been considered anti-competitive. This opinion has been upheld by numerous court rulings. In June 2006, Gates announced that he would be transitioning to a part-time role at Microsoft and full-time work at the Bill & Melinda Gates Foundation, the private charitable foundation that he and his wife, Melinda Gates, established in 2000.[9] He gradually transferred his duties to Ray Ozzie and Craig Mundie. He stepped down as chairman of Microsoft in February 2014 and assumed a new post as technology adviser to support the newly appointed CEO Satya Nadella.",
109
- """World War II (often abbreviated as WWII or WW2), also known as the Second World War, was a global war that lasted from 1939 to 1945. The vast majority of the world's countries—including all the great powers—eventually formed two opposing military alliances: the Allies and the Axis. A state of total war emerged, directly involving more than 100 million people from more than 30 countries. The major participants threw their entire economic, industrial, and scientific capabilities behind the war effort, blurring the distinction between civilian and military resources. World War II was the deadliest conflict in human history, marked by 70 to 85 million fatalities, most of whom were civilians in the Soviet Union and China. Tens of millions of people died during the conflict due to genocides (including the Holocaust), premeditated death from starvation, massacres, and disease. Aircraft played a major role in the conflict which included the use of terror bombing, strategic bombing and the only use of nuclear weapons in war.""",
110
- "Disney Channel (originally called The Disney Channel from 1983 to 1997 and commonly shortened to Disney from 1997 to 2002) is an American pay television channel that serves as the flagship property of owner Disney Channels Worldwide unit of the Walt Disney Television subsidiary of The Walt Disney Company. Disney Channel's programming consists of original first-run television series, theatrically released and original made-for-TV movies and select other third-party programming. Disney Channel – which formerly operated as a premium service – originally marketed its programs towards families during the 1980s, and later at younger children by the 2000s.",
111
- "For several hundred thousand years, the Sahara has alternated between desert and savanna grassland in a 20,000 year cycle[8] caused by the precession of the Earth's axis as it rotates around the Sun, which changes the location of the North African Monsoon. The area is next expected to become green in about 15,000 years (17,000 ACE).",
112
- "Elon Musk is an engineer, industrial designer, technology entrepreneur and philanthropist. He is a citizen of South Africa, Canada, and the United States. He is the founder, CEO and chief engineer/designer of SpaceX; early investor, CEO and product architect of Tesla, Inc.; founder of The Boring Company; co-founder of Neuralink; and co-founder and initial co-chairman of OpenAI. He was elected a Fellow of the Royal Society (FRS) in 2018. In December 2016, he was ranked 21st on the Forbes list of The World's Most Powerful People, and was ranked joint-first on the Forbes list of the Most Innovative Leaders of 2019. A self-made billionaire, as of June 2020 his net worth was estimated at $38.8 billion and he is listed by Forbes as the 31st-richest person in the world. He is the longest tenured CEO of any automotive manufacturer globally.",
113
- "Born and raised in the Austrian Empire, Tesla studied engineering and physics in the 1870s without receiving a degree, and gained practical experience in the early 1880s working in telephony and at Continental Edison in the new electric power industry. In 1884 he emigrated to the United States, where he became a naturalized citizen. He worked for a short time at the Edison Machine Works in New York City before he struck out on his own. With the help of partners to finance and market his ideas, Tesla set up laboratories and companies in New York to develop a range of electrical and mechanical devices. His alternating current (AC) induction motor and related polyphase AC patents, licensed by Westinghouse Electric in 1888, earned him a considerable amount of money and became the cornerstone of the polyphase system which that company eventually marketed."
114
-
115
- ]
116
-
117
- st.subheader("Split and clean text")
118
-
119
- selected_text = st.selectbox("Select an example", examples)
120
-
121
- custom_input = st.text_input("Try it for yourself!")
122
-
123
- if custom_input:
124
- selected_text = custom_input
125
- elif selected_text:
126
- selected_text = selected_text
127
-
128
- st.subheader('Selected Text')
129
- st.write(selected_text)
130
-
131
- spark = init_spark()
132
- Pipeline = create_pipeline(model)
133
- output = fit_data(Pipeline, selected_text)
134
-
135
- df = output.toPandas()
136
-
137
- ### Displaying sentences
138
- if selected_models[0] is True:
139
- st.subheader("Detected sentences in text:")
140
- st.dataframe(pd.DataFrame({'sentences': np.asarray(df['sentences'].values[0])[:,3]}))
141
-
142
- ## Displaying tokens, stem, lemma
143
- equal_columns = ['token', 'stem', 'lemma']
144
- sbhrds = ["Basic", 'Stemmed', 'Lemmatized']
145
- tdf = pd.DataFrame()
146
- sbhdr = ''
147
- for index, col in enumerate(equal_columns):
148
- if selected_models[index+1] is True:
149
- tcol_arr = np.asarray(df[col].values[0])
150
- tdf[col] = tcol_arr[:,3]
151
- sbhdr += (',\t'+sbhrds[index])
152
- if tdf.shape[0]>=1:
153
- st.subheader(sbhdr[1:]+"\tTokens:")
154
- st.dataframe(tdf)
155
-
156
- if selected_models[4] is True:
157
- tcol_arr = np.asarray(df['normalized'].values[0])[:,3]
158
- st.subheader("Tokens after removing punctuations:")
159
- st.dataframe(pd.DataFrame({'normalized':tcol_arr}))
160
-
161
- if selected_models[5] is True:
162
- tcol_arr = np.asarray(df['removed_stopwords'].values[0])[:,3]
163
- st.subheader("Tokens after removing Stop Words:")
164
- st.dataframe(pd.DataFrame({'removed_stopwords':tcol_arr}))
 
1
+ import streamlit as st
2
+ import sparknlp
3
+ import os
4
+ import pandas as pd
5
+ import numpy as np
6
+ import pyspark.sql.functions as F
7
+ from pyspark.ml import Pipeline
8
+ from pyspark.sql import SparkSession
9
+ from sparknlp.annotator import *
10
+ from sparknlp.base import *
11
+ from sparknlp.pretrained import PretrainedPipeline
12
+ from pyspark.sql.types import StringType, IntegerType
13
+
14
@st.cache_resource
def init_spark():
    """Start Spark NLP and return the object produced by ``sparknlp.start()``.

    Decorated with ``st.cache_resource`` so that Streamlit reuses the same
    object across script reruns instead of calling ``sparknlp.start()``
    every time a widget changes.
    """
    return sparknlp.start()
18
+
19
@st.cache_resource
def create_pipeline():
    """Build the (unfitted) Spark NLP text-preprocessing pipeline.

    Stages, in order:

    * ``DocumentAssembler``: ``text`` -> ``document``
    * ``SentenceDetector``: ``document`` -> ``sentences``
    * ``Tokenizer``: ``document`` -> ``token``
    * ``Normalizer``: ``token`` -> ``normalized`` (lower-cased, with
      characters matching the cleanup pattern removed)
    * ``StopWordsCleaner``: ``token`` -> ``removed_stopwords``
    * ``Stemmer``: ``token`` -> ``stem``
    * ``Lemmatizer``: ``token`` -> ``lemma``

    The last four stages all read the raw ``token`` column, so their outputs
    are parallel views of the same tokens rather than a chain.

    Returns:
        A ``pyspark.ml.Pipeline`` holding the stages above. It still has to be
        fitted before it can annotate text.
    """
    document_assembler = (
        DocumentAssembler()
        .setInputCol("text")
        .setOutputCol("document")
    )

    sentence_detector = (
        SentenceDetector()
        .setInputCols(["document"])
        .setOutputCol("sentences")
    )

    tokenizer = (
        Tokenizer()
        .setInputCols(["document"])
        .setOutputCol("token")
    )

    normalizer = (
        Normalizer()
        .setInputCols(["token"])
        .setOutputCol("normalized")
        .setLowercase(True)
        # Raw string: "\w", "\d" and "\s" are regex classes, not Python
        # escapes. A plain literal triggers an invalid-escape warning.
        .setCleanupPatterns([r"[^\w\d\s]"])
    )

    stopwords_cleaner = (
        StopWordsCleaner()
        .setInputCols(["token"])
        .setOutputCol("removed_stopwords")
        .setCaseSensitive(False)
    )

    stemmer = (
        Stemmer()
        .setInputCols(["token"])
        .setOutputCol("stem")
    )

    lemmatizer = (
        Lemmatizer()
        .setInputCols(["token"])
        .setOutputCol("lemma")
        # The dictionary path is relative, so it is resolved against the
        # working directory of the running process.
        .setDictionary(
            "./AntBNC_lemmas_ver_001.txt",
            value_delimiter="\t",
            key_delimiter="->",
        )
    )

    return Pipeline(stages=[
        document_assembler,
        sentence_detector,
        tokenizer,
        normalizer,
        stopwords_cleaner,
        stemmer,
        lemmatizer,
    ])
63
+
64
def fit_data(pipeline, data):
    """Fit ``pipeline`` and annotate ``data`` through a ``LightPipeline``.

    Reads the module-level ``spark`` object, which therefore must be assigned
    before this function is called.

    Args:
        pipeline: The unfitted pipeline to fit.
        data: The text handed to ``LightPipeline.fullAnnotate``.

    Returns:
        Whatever ``LightPipeline.fullAnnotate(data)`` returns.
    """
    # The pipeline is fitted on a single empty-string row; presumably none of
    # the stages needs real training data -- TODO confirm.
    placeholder_df = spark.createDataFrame([['']]).toDF('text')
    light_model = LightPipeline(pipeline.fit(placeholder_df))
    return light_model.fullAnnotate(data)
70
+
71
def extract_annotations(output, annotation_type):
    """Collect the ``result`` attribute of every annotation in one column.

    Args:
        output: Sequence of per-document mappings; only the first document
            is read.
        annotation_type: Key of the annotation column to read, for example
            ``'token'``.

    Returns:
        A list with the ``result`` of each annotation under
        ``annotation_type`` in the first document, or an empty list when
        ``output`` itself is empty.

    Raises:
        KeyError: If the first document has no ``annotation_type`` entry.
    """
    # An empty result set has no first document to index into.
    if not output:
        return []
    return [annotation.result for annotation in output[0][annotation_type]]
73
+
74
+
75
# NOTE(review): this page lives in "Text Preprocessing.py" but is titled
# "Typo Detector" -- possibly copied from another demo; confirm the title.
st.title("Typo Detector")

# Pipe-separated labels; each one becomes a sidebar checkbox.
model_name = "SentenceDetector|Tokenizer|Stemmer|Lemmatizer|Normalizer|Stop Words Remover"

st.sidebar.title("Filter Annotator Outputs")
# One boolean per label, in label order; every box starts ticked.
selected_models = [
    st.sidebar.checkbox(label, value=True, key=label)
    for label in model_name.split('|')
]

# Colab badge for the reference notebook.
# NOTE(review): the URL points at a T5TRANSFORMER notebook -- confirm it is
# the intended reference for this page.
link = """<a href="https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/streamlit_notebooks/T5TRANSFORMER.ipynb"><img src="https://colab.research.google.com/assets/colab-badge.svg" style="zoom: 1.3" alt="Open In Colab"/></a>"""
st.sidebar.title('')
st.sidebar.markdown('Reference notebook:')
st.sidebar.markdown(link, unsafe_allow_html=True)
92
+
93
# Sample passages offered in the "Select an example" dropdown.
examples = [

    "The Geneva Motor Show, the first major car show of the year, opens tomorrow with U.S. Car makers hoping to make new inroads into European markets due to the cheap dollar, automobile executives said. Ford Motor Co and General Motors Corp sell cars in Europe, where about 10.5 mln new cars a year are bought. GM also makes a few thousand in North American plants for European export.",
    "Demonicus is a movie turned into a video game! I just love the story and the things that goes on in the film.It is a B-film ofcourse but that doesn`t bother one bit because its made just right and the music was rad! Horror and sword fight freaks,buy this movie now!",
    "Quantum computing is the use of quantum-mechanical phenomena such as superposition and entanglement to perform computation. Computers that perform quantum computations are known as quantum computers. Quantum computers are believed to be able to solve certain computational problems, such as integer factorization (which underlies RSA encryption), substantially faster than classical computers. The study of quantum computing is a subfield of quantum information science. Quantum computing began in the early 1980s, when physicist Paul Benioff proposed a quantum mechanical model of the Turing machine.",
    "Titanic is a 1997 American epic romance and disaster film directed, written, co-produced, and co-edited by James Cameron. Incorporating both historical and fictionalized aspects, it is based on accounts of the sinking of the RMS Titanic, and stars Leonardo DiCaprio and Kate Winslet as members of different social classes who fall in love aboard the ship during its ill-fated maiden voyage.",
    "William Henry Gates III (born October 28, 1955) is an American business magnate, software developer, investor, and philanthropist. He is best known as the co-founder of Microsoft Corporation. During his career at Microsoft, Gates held the positions of chairman, chief executive officer (CEO), president and chief software architect, while also being the largest individual shareholder until May 2014. He is one of the best-known entrepreneurs and pioneers of the microcomputer revolution of the 1970s and 1980s. Born and raised in Seattle, Washington, Gates co-founded Microsoft with childhood friend Paul Allen in 1975, in Albuquerque, New Mexico; it went on to become the world's largest personal computer software company. Gates led the company as chairman and CEO until stepping down as CEO in January 2000, but he remained chairman and became chief software architect. During the late 1990s, Gates had been criticized for his business tactics, which have been considered anti-competitive. This opinion has been upheld by numerous court rulings. In June 2006, Gates announced that he would be transitioning to a part-time role at Microsoft and full-time work at the Bill & Melinda Gates Foundation, the private charitable foundation that he and his wife, Melinda Gates, established in 2000.[9] He gradually transferred his duties to Ray Ozzie and Craig Mundie. He stepped down as chairman of Microsoft in February 2014 and assumed a new post as technology adviser to support the newly appointed CEO Satya Nadella.",
    """World War II (often abbreviated as WWII or WW2), also known as the Second World War, was a global war that lasted from 1939 to 1945. The vast majority of the world's countries—including all the great powers—eventually formed two opposing military alliances: the Allies and the Axis. A state of total war emerged, directly involving more than 100 million people from more than 30 countries. The major participants threw their entire economic, industrial, and scientific capabilities behind the war effort, blurring the distinction between civilian and military resources. World War II was the deadliest conflict in human history, marked by 70 to 85 million fatalities, most of whom were civilians in the Soviet Union and China. Tens of millions of people died during the conflict due to genocides (including the Holocaust), premeditated death from starvation, massacres, and disease. Aircraft played a major role in the conflict which included the use of terror bombing, strategic bombing and the only use of nuclear weapons in war.""",
    "Disney Channel (originally called The Disney Channel from 1983 to 1997 and commonly shortened to Disney from 1997 to 2002) is an American pay television channel that serves as the flagship property of owner Disney Channels Worldwide unit of the Walt Disney Television subsidiary of The Walt Disney Company. Disney Channel's programming consists of original first-run television series, theatrically released and original made-for-TV movies and select other third-party programming. Disney Channel – which formerly operated as a premium service – originally marketed its programs towards families during the 1980s, and later at younger children by the 2000s.",
    "For several hundred thousand years, the Sahara has alternated between desert and savanna grassland in a 20,000 year cycle[8] caused by the precession of the Earth's axis as it rotates around the Sun, which changes the location of the North African Monsoon. The area is next expected to become green in about 15,000 years (17,000 ACE).",
    "Elon Musk is an engineer, industrial designer, technology entrepreneur and philanthropist. He is a citizen of South Africa, Canada, and the United States. He is the founder, CEO and chief engineer/designer of SpaceX; early investor, CEO and product architect of Tesla, Inc.; founder of The Boring Company; co-founder of Neuralink; and co-founder and initial co-chairman of OpenAI. He was elected a Fellow of the Royal Society (FRS) in 2018. In December 2016, he was ranked 21st on the Forbes list of The World's Most Powerful People, and was ranked joint-first on the Forbes list of the Most Innovative Leaders of 2019. A self-made billionaire, as of June 2020 his net worth was estimated at $38.8 billion and he is listed by Forbes as the 31st-richest person in the world. He is the longest tenured CEO of any automotive manufacturer globally.",
    "Born and raised in the Austrian Empire, Tesla studied engineering and physics in the 1870s without receiving a degree, and gained practical experience in the early 1880s working in telephony and at Continental Edison in the new electric power industry. In 1884 he emigrated to the United States, where he became a naturalized citizen. He worked for a short time at the Edison Machine Works in New York City before he struck out on his own. With the help of partners to finance and market his ideas, Tesla set up laboratories and companies in New York to develop a range of electrical and mechanical devices. His alternating current (AC) induction motor and related polyphase AC patents, licensed by Westinghouse Electric in 1888, earned him a considerable amount of money and became the cornerstone of the polyphase system which that company eventually marketed."

]
107
+
108
st.subheader("Split and clean text")

selected_text = st.selectbox("Select an example", examples)
custom_input = st.text_input("Try it for yourself!")

# Free-text input, when present, takes priority over the dropdown example.
if custom_input:
    selected_text = custom_input

st.subheader('Selected Text')
st.write(selected_text)

# `spark` has to stay a module-level name: fit_data() reads it as a global.
spark = init_spark()
# Lower-case name so the imported pyspark.ml.Pipeline class is not shadowed.
pipeline = create_pipeline()
output = fit_data(pipeline, selected_text)

# Output column for each sidebar checkbox, in checkbox order:
# SentenceDetector, Tokenizer, Stemmer, Lemmatizer, Normalizer,
# Stop Words Remover.
annotation_columns = (
    'sentences',
    'token',
    'stem',
    'lemma',
    'normalized',
    'removed_stopwords',
)

# Keep only the columns whose checkbox is ticked.
data_dict = {
    column: extract_annotations(output, column)
    for is_selected, column in zip(selected_models, annotation_columns)
    if is_selected
}

if data_dict:
    # Columns can differ in length (e.g. sentences vs. tokens); wrapping each
    # in a Series lets pandas pad the shorter ones with NaN.
    df = pd.DataFrame(
        {column: pd.Series(values) for column, values in data_dict.items()}
    )
    st.subheader("Annotation Results:")
    st.dataframe(df)