abdullahmubeen10 committed on
Commit
d6e48fd
1 Parent(s): 3473964

Upload 6 files

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ images/T5_model_diagram.jpg filter=lfs diff=lfs merge=lfs -text
.streamlit/config.toml ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ [theme]
2
+ base="light"
3
+ primaryColor="#29B4E8"
Demo.py ADDED
@@ -0,0 +1,130 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import sparknlp
3
+ import os
4
+ from sparknlp.base import *
5
+ from sparknlp.common import *
6
+ from sparknlp.annotator import *
7
+ from pyspark.ml import Pipeline
8
+ from sparknlp.pretrained import PretrainedPipeline
9
+
10
+ # Configure Streamlit page
11
+ st.set_page_config(
12
+ layout="wide",
13
+ page_title="Spark NLP Demos App",
14
+ initial_sidebar_state="auto"
15
+ )
16
+
17
+ # Custom CSS for better styling
18
+ st.markdown("""
19
+ <style>
20
+ .main-title {
21
+ font-size: 36px;
22
+ color: #4A90E2;
23
+ font-weight: bold;
24
+ text-align: center;
25
+ }
26
+ .sub-title {
27
+ font-size: 24px;
28
+ color: #333333;
29
+ margin-top: 20px;
30
+ }
31
+ .section {
32
+ background-color: #f9f9f9;
33
+ padding: 15px;
34
+ border-radius: 10px;
35
+ margin-top: 20px;
36
+ }
37
+ .section h2 {
38
+ font-size: 22px;
39
+ color: #4A90E2;
40
+ }
41
+ .section p, .section ul {
42
+ color: #666666;
43
+ }
44
+ </style>
45
+ """, unsafe_allow_html=True)
46
+
47
@st.cache_resource
def init_spark():
    """Start a Spark NLP session and return it.

    The function is decorated with ``st.cache_resource`` so Streamlit can
    reuse the returned session object instead of starting a new one each
    time the script reruns.

    Returns:
        The session object produced by ``sparknlp.start()``.
    """
    return sparknlp.start()
51
+
52
@st.cache_resource
def create_pipeline(model):
    """Build the two-stage summarization pipeline for a pretrained T5 model.

    Args:
        model: Name of the pretrained English T5 model to load
            (the sidebar offers ``'t5_base'`` and ``'t5_small'``).

    Returns:
        An unfitted ``Pipeline`` whose stages are a ``DocumentAssembler``
        (``text`` -> ``documents``) followed by the T5 annotator
        (``documents`` -> ``summaries``).
    """
    document_assembler = (
        DocumentAssembler()
        .setInputCol("text")
        .setOutputCol("documents")
    )

    # `pretrained` is a static loader: call it on the class rather than on a
    # freshly constructed T5Transformer that would be discarded immediately.
    t5 = (
        T5Transformer.pretrained(model, 'en')
        .setTask("summarize:")
        .setMaxOutputLength(200)
        .setInputCols(["documents"])
        .setOutputCol("summaries")
    )

    return Pipeline().setStages([document_assembler, t5])
67
+
68
def fit_data(pipeline, data):
    """Fit the pipeline and return the summary text for ``data``.

    Args:
        pipeline: Unfitted pipeline whose last stage writes to the
            ``summaries`` output column (see ``create_pipeline``).
        data: The text to summarize.

    Returns:
        The ``result`` of the first ``summaries`` annotation, or an empty
        string when the model produced no summary annotation.
    """
    # Obtain the session explicitly instead of relying on a module-level
    # `spark` name that is only bound further down the script.
    session = init_spark()

    # Fitting on a dummy one-row frame just materialises a PipelineModel.
    empty_df = session.createDataFrame([['']]).toDF('text')
    light_model = LightPipeline(pipeline.fit(empty_df))

    annotations = light_model.fullAnnotate(data)[0]
    summaries = annotations['summaries']
    # Guard against an empty annotation list rather than raising IndexError.
    return summaries[0].result if summaries else ''
74
+
75
+ ############ SETTING UP THE PAGE LAYOUT ############
76
+
77
+ ### SIDEBAR CONTENT ###
78
+
79
+ # Model selection in sidebar
80
+ model = st.sidebar.selectbox(
81
+ "Choose the pretrained model",
82
+ ['t5_base', 't5_small'],
83
+ help="For more info about the models visit: https://sparknlp.org/models"
84
+ )
85
+
86
# Colab link for the notebook. The badge reads "Open In Colab", so the href
# goes through Colab's GitHub loader instead of the GitHub file viewer.
link = """<a href="https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/streamlit_notebooks/T5TRANSFORMER.ipynb">
<img src="https://colab.research.google.com/assets/colab-badge.svg" style="zoom: 1.3" alt="Open In Colab"/>
</a>"""
st.sidebar.title('')
st.sidebar.markdown('Reference notebook:')
st.sidebar.markdown(link, unsafe_allow_html=True)
93
+
94
+ ### MAIN CONTENT ###
95
+
96
+ # st.title("Summarize Text")
97
+ st.markdown('<div class="main-title">State-of-the-Art Text Summarization with Spark NLP</div>', unsafe_allow_html=True)
98
+ st.write("")
99
+ st.write("")
100
+ st.markdown("""<p>This demo utilizes the <strong>Text-to-Text Transformer (T5)</strong>, introduced by Google researchers in 2019. T5 achieves remarkable results by utilizing a <strong>unique design</strong> that allows it to perform multiple NLP tasks with simple prefixes. For text summarization, the input text is prefixed with <strong>"summarize:"</strong>.</p>""", unsafe_allow_html=True)
101
+
102
+ # Sample text options
103
+ options = [
104
+ "Mount Tai is a mountain of historical and cultural significance located north of the city of Tai'an, in Shandong province, China. The tallest peak is the Jade Emperor Peak, which is commonly reported as being 1,545 meters tall, but is officially described by the PRC government as 1,532.7 meters tall. It is associated with sunrise, birth, and renewal, and is often regarded the foremost of the five. Mount Tai has been a place of worship for at least 3,000 years and served as one of the most important ceremonial centers of China during large portions of this period.",
105
+ "The Guadeloupe amazon (Amazona violacea) is a hypothetical extinct species of parrot that is thought to have been endemic to the Lesser Antillean island region of Guadeloupe. Described by 17th- and 18th-century writers, it is thought to have been related to, or possibly the same as, the extant imperial amazon. A tibiotarsus and an ulna bone from the island of Marie-Galante may belong to the Guadeloupe amazon. According to contemporary descriptions, its head, neck and underparts were mainly violet or slate, mixed with green and black; the back was brownish green; and the wings were green, yellow and red. It had iridescent feathers, and was able to raise a \"ruff\" of feathers around its neck. It fed on fruits and nuts, and the male and female took turns sitting on the nest. French settlers ate the birds and destroyed their habitat. Rare by 1779, the species appears to have become extinct by the end of the 18th century.",
106
+ "Pierre-Simon, marquis de Laplace (23 March 1749 – 5 March 1827) was a French scholar and polymath whose work was important to the development of engineering, mathematics, statistics, physics, astronomy, and philosophy. He summarized and extended the work of his predecessors in his five-volume Mécanique Céleste (Celestial Mechanics) (1799–1825). This work translated the geometric study of classical mechanics to one based on calculus, opening up a broader range of problems. In statistics, the Bayesian interpretation of probability was developed mainly by Laplace.",
107
+ "John Snow (15 March 1813 – 16 June 1858) was an English physician and a leader in the development of anaesthesia and medical hygiene. He is considered one of the founders of modern epidemiology, in part because of his work in tracing the source of a cholera outbreak in Soho, London, in 1854, which he curtailed by removing the handle of a water pump. Snow's findings inspired the adoption of anaesthesia as well as fundamental changes in the water and waste systems of London, which led to similar changes in other cities, and a significant improvement in general public health around the world.",
108
+ "The Mona Lisa is a half-length portrait painting by Italian artist Leonardo da Vinci. Considered an archetypal masterpiece of the Italian Renaissance, it has been described as \"the best known, the most visited, the most written about, the most sung about, the most parodied work of art in the world\". The painting's novel qualities include the subject's enigmatic expression, the monumentality of the composition, the subtle modelling of forms, and the atmospheric illusionism.",
109
+ """Calculus, originally called infinitesimal calculus or "the calculus of infinitesimals", is the mathematical study of continuous change, in the same way that geometry is the study of shape and algebra is the study of generalizations of arithmetic operations. It has two major branches, differential calculus and integral calculus; the former concerns instantaneous rates of change, and the slopes of curves, while integral calculus concerns accumulation of quantities, and areas under or between curves. These two branches are related to each other by the fundamental theorem of calculus, and they make use of the fundamental notions of convergence of infinite sequences and infinite series to a well-defined limit.[1] Infinitesimal calculus was developed independently in the late 17th century by Isaac Newton and Gottfried Wilhelm Leibniz.[2][3] Today, calculus has widespread uses in science, engineering, and economics.[4] In mathematics education, calculus denotes courses of elementary mathematical analysis, which are mainly devoted to the study of functions and limits. The word calculus (plural calculi) is a Latin word, meaning originally "small pebble" (this meaning is kept in medicine – see Calculus (medicine)). Because such pebbles were used for calculation, the meaning of the word has evolved and today usually means a method of computation. It is therefore used for naming specific methods of calculation and related theories, such as propositional calculus, Ricci calculus, calculus of variations, lambda calculus, and process calculus.""",
110
+ ]
111
+
112
+ st.subheader("Summarize text to make it shorter while retaining meaning.")
113
+
114
+ # Text input options
115
+ selected_text = st.selectbox("Select an example", options)
116
+ custom_input = st.text_input("Try it for yourself!")
117
+
118
+ if custom_input:
119
+ selected_text = custom_input
120
+
121
+ st.subheader('Text')
122
+ st.write(selected_text)
123
+
124
+ st.subheader("Summary")
125
+
126
+ # Generate summary
127
+ spark = init_spark()
128
+ pipeline = create_pipeline(model)
129
+ output = fit_data(pipeline, selected_text)
130
+ st.write(output)
Dockerfile ADDED
@@ -0,0 +1,76 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Base image: Ubuntu 18.04
FROM ubuntu:18.04

# Unprivileged user that will run the app
ENV NB_USER jovyan
ENV NB_UID 1000
ENV HOME /home/${NB_USER}

ENV PYSPARK_PYTHON=python3
ENV PYSPARK_DRIVER_PYTHON=python3

# System packages and build tooling
RUN apt-get update && apt-get install -y \
    tar \
    wget \
    bash \
    rsync \
    gcc \
    libfreetype6-dev \
    libhdf5-serial-dev \
    libpng-dev \
    libzmq3-dev \
    python3 \
    python3-dev \
    python3-pip \
    unzip \
    pkg-config \
    software-properties-common \
    graphviz

RUN adduser --disabled-password \
    --gecos "Default user" \
    --uid ${NB_UID} \
    ${NB_USER}

# Install OpenJDK-8
RUN apt-get update && \
    apt-get install -y openjdk-8-jdk && \
    apt-get install -y ant && \
    apt-get clean;

# Fix certificate issues (-y keeps the install non-interactive during build)
RUN apt-get update && \
    apt-get install -y ca-certificates-java && \
    apt-get clean && \
    update-ca-certificates -f;

# Setup JAVA_HOME -- useful for docker commandline.
# ENV already exports the variable to every later layer and to the running
# container, so no separate `RUN export` step is needed.
ENV JAVA_HOME /usr/lib/jvm/java-8-openjdk-amd64/

RUN echo "export JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64/" >> ~/.bashrc

RUN apt-get clean && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*

# Python 3.8 from the deadsnakes PPA
RUN apt-get update
RUN apt-get install -y software-properties-common
RUN add-apt-repository -y ppa:deadsnakes/ppa
RUN apt-get install -y python3.8 python3-pip

ENV PYSPARK_PYTHON=python3.8
ENV PYSPARK_DRIVER_PYTHON=python3.8

# Copy the build context so requirements.txt is available for pip
COPY . .

RUN python3.8 -m pip install --upgrade pip
RUN python3.8 -m pip install -r requirements.txt

USER root
RUN chown -R ${NB_UID} ${HOME}
USER ${NB_USER}

WORKDIR ${HOME}

# Copy the application into the user's home, where it is run from
COPY . .

EXPOSE 7860

# Demo.py is the Streamlit entry script shipped with this app
ENTRYPOINT ["streamlit", "run", "Demo.py", "--server.port=7860", "--server.address=0.0.0.0"]
images/T5_model_diagram.jpg ADDED

Git LFS Details

  • SHA256: ecdc448c0c71610fa26d4063fd82edb1b6e879d3cb0e17fd2e8d29565a1ccbc4
  • Pointer size: 132 Bytes
  • Size of remote file: 3.15 MB
pages/Workflow & Model Overview.py ADDED
@@ -0,0 +1,173 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+
3
+ # Custom CSS for better styling
4
+ st.markdown("""
5
+ <style>
6
+ .main-title {
7
+ font-size: 36px;
8
+ color: #4A90E2;
9
+ font-weight: bold;
10
+ text-align: center;
11
+ }
12
+ .sub-title {
13
+ font-size: 24px;
14
+ color: #333333;
15
+ margin-top: 20px;
16
+ }
17
+ .section {
18
+ background-color: #f9f9f9;
19
+ padding: 15px;
20
+ border-radius: 10px;
21
+ margin-top: 20px;
22
+ }
23
+ .section h2 {
24
+ font-size: 22px;
25
+ color: #4A90E2;
26
+ }
27
+ .section p, .section ul {
28
+ color: #666666;
29
+ }
30
+ .link {
31
+ color: #4A90E2;
32
+ text-decoration: none;
33
+ }
34
+ </style>
35
+ """, unsafe_allow_html=True)
36
+
37
+ # Introduction
38
+ st.markdown('<div class="main-title">State-of-the-Art Text Summarization with Spark NLP</div>', unsafe_allow_html=True)
39
+
40
+ st.markdown("""
41
+ <div class="section">
42
+ <p>Welcome to the Spark NLP Demos App! In the rapidly evolving field of Natural Language Processing (NLP), the combination of powerful models and scalable frameworks is crucial. One such resource-intensive task is Text Summarization, which benefits immensely from the efficient implementation of machine learning models on distributed systems like Apache Spark.</p>
43
+ <p>Spark NLP stands out as the leading choice for enterprises building NLP solutions. This open-source library, built in Scala with a Python wrapper, offers state-of-the-art machine learning models within an easy-to-use pipeline design compatible with Spark ML.</p>
44
+ </div>
45
+ """, unsafe_allow_html=True)
46
+
47
+ # About the T5 Model
48
+ st.markdown('<div class="sub-title">About the T5 Model</div>', unsafe_allow_html=True)
49
+ st.markdown("""
50
+ <div class="section">
51
+ <p>A standout model for text summarization is the Text-to-Text Transformer (T5), introduced by Google researchers in 2019. T5 achieves remarkable results by utilizing a unique design that allows it to perform multiple NLP tasks with simple prefixes. For text summarization, the input text is prefixed with "summarize:".</p>
52
+ <p>In Spark NLP, the T5 model is available through the T5Transformer annotator. We'll show you how to use Spark NLP in Python to perform text summarization using the T5 model.</p>
53
+ </div>
54
+ """, unsafe_allow_html=True)
55
+
56
+ st.image('https://www.johnsnowlabs.com/wp-content/uploads/2023/09/img_blog_2.jpg', caption='Diagram of the T5 model, from the original paper', use_column_width='auto')
57
+
58
+ # How to Use the Model
59
+ st.markdown('<div class="sub-title">How to Use the T5 Model with Spark NLP</div>', unsafe_allow_html=True)
60
+ st.markdown("""
61
+ <div class="section">
62
+ <p>To use the T5Transformer annotator in Spark NLP to perform text summarization, we need to create a pipeline with two stages: the first transforms the input text into an annotation object, and the second stage contains the T5 model.</p>
63
+ </div>
64
+ """, unsafe_allow_html=True)
65
+
66
+ st.markdown('### Installation')
67
+ st.code('!pip install spark-nlp', language='python')
68
+
69
+ st.markdown('### Import Libraries and Start Spark Session')
70
+ st.code("""
71
+ import sparknlp
72
+ from sparknlp.base import DocumentAssembler, PipelineModel
73
+ from sparknlp.annotator import T5Transformer
74
+
75
+ # Start the Spark Session
76
+ spark = sparknlp.start()
77
+ """, language='python')
78
+
79
+ st.markdown("""
80
+ <div class="section">
81
+ <p>Now we can define the pipeline to use the T5 model. We'll use the PipelineModel object since we are using the pretrained model and don’t need to train any stage of the pipeline.</p>
82
+ </div>
83
+ """, unsafe_allow_html=True)
84
+
85
+ st.markdown('### Define the Pipeline')
86
+ st.code("""
87
+ # Transforms raw texts into `document` annotation
88
+ document_assembler = (
89
+ DocumentAssembler().setInputCol("text").setOutputCol("documents")
90
+ )
91
+ # The T5 model
92
+ t5 = (
93
+ T5Transformer.pretrained("t5_small")
94
+ .setTask("summarize:")
95
+ .setInputCols(["documents"])
96
+ .setMaxOutputLength(200)
97
+ .setOutputCol("t5")
98
+ )
99
+ # Define the Spark pipeline
100
+ pipeline = PipelineModel(stages = [document_assembler, t5])
101
+ """, language='python')
102
+
103
+ st.markdown("""
104
+ <div class="section">
105
+ <p>To use the model, create a Spark DataFrame containing the input data. In this example, we'll work with a single sentence, but the framework can handle multiple texts for simultaneous processing. The input column from the DocumentAssembler annotator requires a column named “text.”</p>
106
+ </div>
107
+ """, unsafe_allow_html=True)
108
+
109
+ st.markdown('### Create Example DataFrame')
110
# Tutorial snippet shown to the reader. The DataFrame column is named "text"
# because the DocumentAssembler defined earlier reads from that column.
st.code("""
example = \"""
Transfer learning, where a model is first pre-trained on a data-rich task
before being fine-tuned on a downstream task, has emerged as a powerful
technique in natural language processing (NLP). The effectiveness of transfer
learning has given rise to a diversity of approaches, methodology, and
practice. In this paper, we explore the landscape of transfer learning
techniques for NLP by introducing a unified framework that converts all
text-based language problems into a text-to-text format.
Our systematic study compares pre-training objectives, architectures,
unlabeled data sets, transfer approaches, and other factors on dozens of
language understanding tasks. By combining the insights from our exploration
with scale and our new Colossal Clean Crawled Corpus, we achieve
state-of-the-art results on many benchmarks covering summarization,
question answering, text classification, and more. To facilitate future
work on transfer learning for NLP, we release our data set, pre-trained
models, and code.
\"""

spark_df = spark.createDataFrame([[example]]).toDF("text")
""", language='python')
131
+
132
+ st.markdown('### Apply the Pipeline')
133
+ st.code("""
134
+ result = pipeline.transform(spark_df)
135
+ result.select("t5.result").show(truncate=False)
136
+ """, language='python')
137
+
138
+ st.markdown('<div class="sub-title">Output</div>', unsafe_allow_html=True)
139
+ st.markdown("""
140
+ <div class="section">
141
+ <p>The summarization output will look something like this:</p>
142
+ <pre>transfer learning has emerged as a powerful technique in natural language processing (NLP) the effectiveness of transfer learning has given rise to a diversity of approaches, methodologies, and practice.</pre>
143
+ <p>Note: We defined the maximum output length to 200. Depending on the length of the original text, this parameter should be adapted.</p>
144
+ </div>
145
+ """, unsafe_allow_html=True)
146
+
147
+ # Additional Resources and References
148
+ st.markdown('<div class="sub-title">Additional Resources and References</div>', unsafe_allow_html=True)
149
+ st.markdown("""
150
+ <div class="section">
151
+ <ul>
152
+ <li><a class="link" href="https://sparknlp.org/docs/en/transformers#t5transformer" target="_blank">T5Transformer documentation page</a></li>
153
+ <li><a class="link" href="https://arxiv.org/abs/1910.10683" target="_blank">T5 paper</a></li>
154
+ <li><a class="link" href="https://sparknlp.org/docs/en/quickstart" target="_blank">Getting Started with Spark NLP</a></li>
155
+ <li><a class="link" href="https://nlp.johnsnowlabs.com/models" target="_blank">Pretrained Models</a></li>
156
+ <li><a class="link" href="https://github.com/JohnSnowLabs/spark-nlp/tree/master/examples/python/annotation/text/english" target="_blank">Example Notebooks</a></li>
157
+ <li><a class="link" href="https://sparknlp.org/docs/en/install" target="_blank">Installation Guide</a></li>
158
+ </ul>
159
+ </div>
160
+ """, unsafe_allow_html=True)
161
+
162
+ st.markdown('<div class="sub-title">Community & Support</div>', unsafe_allow_html=True)
163
+ st.markdown("""
164
+ <div class="section">
165
+ <ul>
166
+ <li><a class="link" href="https://sparknlp.org/" target="_blank">Official Website</a>: Documentation and examples</li>
167
+ <li><a class="link" href="https://join.slack.com/t/spark-nlp/shared_invite/zt-198dipu77-L3UWNe_AJ8xqDk0ivmih5Q" target="_blank">Slack</a>: Live discussion with the community and team</li>
168
+ <li><a class="link" href="https://github.com/JohnSnowLabs/spark-nlp" target="_blank">GitHub</a>: Bug reports, feature requests, and contributions</li>
169
+ <li><a class="link" href="https://medium.com/spark-nlp" target="_blank">Medium</a>: Spark NLP articles</li>
170
+ <li><a class="link" href="https://www.youtube.com/channel/UCmFOjlpYEhxf_wJUDuz6xxQ/videos" target="_blank">YouTube</a>: Video tutorials</li>
171
+ </ul>
172
+ </div>
173
+ """, unsafe_allow_html=True)
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ streamlit
2
+ pandas
3
+ numpy
4
+ spark-nlp
5
+ pyspark