abdullahmubeen10 commited on
Commit
8ef089c
·
verified ·
1 Parent(s): d6f8fc0

Upload 16 files

Browse files
.streamlit/config.toml ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ [theme]
2
+ base="light"
3
+ primaryColor="#29B4E8"
Demo.py ADDED
@@ -0,0 +1,124 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import sparknlp
3
+ import os
4
+ import pandas as pd
5
+
6
+ from sparknlp.base import *
7
+ from sparknlp.annotator import *
8
+ from pyspark.ml import Pipeline
9
+ from sparknlp.pretrained import PretrainedPipeline
10
+ from streamlit_tags import st_tags
11
+
12
+ # Page configuration
13
+ st.set_page_config(
14
+ layout="wide",
15
+ initial_sidebar_state="auto"
16
+ )
17
+
18
+ # CSS for styling
19
+ st.markdown("""
20
+ <style>
21
+ .main-title {
22
+ font-size: 36px;
23
+ color: #4A90E2;
24
+ font-weight: bold;
25
+ text-align: center;
26
+ }
27
+ .section {
28
+ background-color: #f9f9f9;
29
+ padding: 10px;
30
+ border-radius: 10px;
31
+ margin-top: 10px;
32
+ }
33
+ .section p, .section ul {
34
+ color: #666666;
35
+ }
36
+ </style>
37
+ """, unsafe_allow_html=True)
38
+
39
@st.cache_resource
def init_spark():
    """Start Spark NLP and return the resulting Spark session.

    Decorated with ``st.cache_resource`` so the returned session object is
    cached by Streamlit rather than being started again on every call.
    """
    spark_session = sparknlp.start()
    return spark_session
42
+
43
@st.cache_resource
def create_pipeline(model):
    """Build the (unfitted) image-captioning pipeline for a pretrained model.

    Args:
        model: Name of the pretrained ``VisionEncoderDecoderForImageCaptioning``
            model to download, e.g. ``"image_captioning_vit_gpt2"``.

    Returns:
        A ``pyspark.ml.Pipeline`` with two stages: an ``ImageAssembler`` that
        reads the ``image`` column and the captioning annotator, which writes
        its annotations to the ``caption`` column.
    """
    image_assembler = (
        ImageAssembler()
        .setInputCol("image")
        .setOutputCol("image_assembler")
    )

    # Load the model that was actually requested. Previously the name was
    # hard-coded, so the ``model`` argument (the sidebar choice) was ignored.
    image_captioning = (
        VisionEncoderDecoderForImageCaptioning.pretrained(model)
        .setBeamSize(2)
        .setDoSample(False)
        .setInputCols(["image_assembler"])
        .setOutputCol("caption")
    )

    return Pipeline(stages=[image_assembler, image_captioning])
58
+
59
def fit_data(pipeline, data):
    """Run the captioning pipeline on one image and return its caption text.

    Args:
        pipeline: The unfitted pipeline returned by ``create_pipeline``.
        data: Path of the image file to caption.

    Returns:
        The ``result`` string of the first annotation in the ``caption``
        output column for the image.

    Note:
        Relies on the module-level ``spark`` session being initialised
        before this function is called.
    """
    # The stages are pretrained, so fitting on a placeholder DataFrame only
    # turns the Pipeline into a PipelineModel that LightPipeline can wrap.
    empty_df = spark.createDataFrame([['']]).toDF('text')
    model = pipeline.fit(empty_df)
    light_pipeline = LightPipeline(model)
    annotations_result = light_pipeline.fullAnnotateImage(data)
    # Results are keyed by output column name. The captioning stage writes to
    # "caption"; the former "class" key does not exist and raised KeyError.
    return annotations_result[0]['caption'][0].result
65
+
66
def save_uploadedfile(uploadedfile):
    """Write an uploaded file into ``IMAGE_FILE_PATH`` and return its path.

    Args:
        uploadedfile: File-like upload object exposing ``name`` and either
            ``getbuffer()`` or ``read()``.

    Returns:
        The path the file was written to. Earlier versions returned ``None``;
        callers that ignore the return value are unaffected.
    """
    # The name comes from the client: keep only its final component so a
    # value such as "../../x" cannot escape the upload directory.
    safe_name = os.path.basename(uploadedfile.name)
    filepath = os.path.join(IMAGE_FILE_PATH, safe_name)
    with open(filepath, "wb") as f:
        if hasattr(uploadedfile, 'getbuffer'):
            f.write(uploadedfile.getbuffer())
        else:
            f.write(uploadedfile.read())
    return filepath
73
+
74
+ # Sidebar content
75
+ model_list = ['image_captioning_vit_gpt2']
76
+ model = st.sidebar.selectbox(
77
+ "Choose the pretrained model",
78
+ model_list,
79
+ help="For more info about the models visit: https://sparknlp.org/models"
80
+ )
81
+
82
+ # Set up the page layout
83
+ st.markdown(f'<div class="main-title">VisionEncoderDecoder For Image Captioning</div>', unsafe_allow_html=True)
84
+ # st.markdown(f'<div class="section"><p>{sub_title}</p></div>', unsafe_allow_html=True)
85
+
86
+ # Reference notebook link in sidebar
87
+ link = """
88
+ <a href="https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/annotation/image/VisionEncoderDecoderForImageCaptioning.ipynb">
89
+ <img src="https://colab.research.google.com/assets/colab-badge.svg" style="zoom: 1.3" alt="Open In Colab"/>
90
+ </a>
91
+ """
92
+ st.sidebar.markdown('Reference notebook:')
93
+ st.sidebar.markdown(link, unsafe_allow_html=True)
94
+
95
# Load examples: bundled sample images live in the "inputs" folder.
IMAGE_FILE_PATH = "inputs"
IMAGE_EXTENSIONS = ('png', 'jpg', 'JPEG', 'jpeg')
image_files = sorted(
    file
    for file in os.listdir(IMAGE_FILE_PATH)
    if file.split('.')[-1] in IMAGE_EXTENSIONS
)

img_options = st.selectbox("Select an image", image_files)
uploadedfile = st.file_uploader("Try it for yourself!")

# An uploaded file takes precedence over the sample selected in the dropdown.
selected_image = None
if uploadedfile:
    save_uploadedfile(uploadedfile)
    selected_image = f"{IMAGE_FILE_PATH}/{uploadedfile.name}"
elif img_options:
    selected_image = f"{IMAGE_FILE_PATH}/{img_options}"

# Without an upload or any sample image there is nothing to caption; stop
# the script run here instead of failing later on an undefined name.
if selected_image is None:
    st.warning("No image available. Upload an image to generate a caption.")
    st.stop()

st.subheader('Image')

image_size = st.slider('Image Size', 400, 1000, value=400, step=100)

# selected_image already contains the IMAGE_FILE_PATH prefix, so it is shown
# directly (the old code first tried a doubled "inputs/inputs/..." path and
# only worked through a bare except).
st.image(selected_image, width=image_size)

st.subheader('Caption')

spark = init_spark()
# Lower-case name so the imported pyspark ``Pipeline`` class is not shadowed.
pipeline = create_pipeline(model)
output = fit_data(pipeline, selected_image)

st.markdown(f'Generated caption: **{output}**')
Dockerfile ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Download base image ubuntu 18.04
2
+ FROM ubuntu:18.04
3
+
4
+ # Set environment variables
5
+ ENV NB_USER jovyan
6
+ ENV NB_UID 1000
7
+ ENV HOME /home/${NB_USER}
8
+
9
+ # Install required packages
10
+ RUN apt-get update && apt-get install -y \
11
+ tar \
12
+ wget \
13
+ bash \
14
+ rsync \
15
+ gcc \
16
+ libfreetype6-dev \
17
+ libhdf5-serial-dev \
18
+ libpng-dev \
19
+ libzmq3-dev \
20
+ python3 \
21
+ python3-dev \
22
+ python3-pip \
23
+ unzip \
24
+ pkg-config \
25
+ software-properties-common \
26
+ graphviz \
27
+ openjdk-8-jdk \
28
+ ant \
29
+ ca-certificates-java \
30
+ && apt-get clean \
31
+ && update-ca-certificates -f;
32
+
33
+ # Install Python 3.8 and pip
34
+ RUN add-apt-repository ppa:deadsnakes/ppa \
35
+ && apt-get update \
36
+ && apt-get install -y python3.8 python3-pip \
37
+ && apt-get clean;
38
+
39
+ # Set up JAVA_HOME
40
+ ENV JAVA_HOME /usr/lib/jvm/java-8-openjdk-amd64/
41
+ RUN mkdir -p ${HOME} \
42
+ && echo "export JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64/" >> ${HOME}/.bashrc \
43
+ && chown -R ${NB_UID}:${NB_UID} ${HOME}
44
+
45
+ # Create a new user named "jovyan" with user ID 1000
46
+ RUN useradd -m -u ${NB_UID} ${NB_USER}
47
+
48
+ # Switch to the "jovyan" user
49
+ USER ${NB_USER}
50
+
51
+ # Set home and path variables for the user
52
+ ENV HOME=/home/${NB_USER} \
53
+ PATH=/home/${NB_USER}/.local/bin:$PATH
54
+
55
+ # Set the working directory to the user's home directory
56
+ WORKDIR ${HOME}
57
+
58
+ # Upgrade pip and install Python dependencies
59
+ RUN python3.8 -m pip install --upgrade pip
60
+ COPY requirements.txt /tmp/requirements.txt
61
+ RUN python3.8 -m pip install -r /tmp/requirements.txt
62
+
63
+ # Copy the application code into the container at /home/jovyan
64
+ COPY --chown=${NB_USER}:${NB_USER} . ${HOME}
65
+
66
+ # Expose port for Streamlit
67
+ EXPOSE 7860
68
+
69
+ # Define the entry point for the container
70
+ ENTRYPOINT ["streamlit", "run", "Demo.py", "--server.port=7860", "--server.address=0.0.0.0"]
inputs/image-1.png ADDED
inputs/image-10.png ADDED
inputs/image-11.png ADDED
inputs/image-2.png ADDED
inputs/image-3.png ADDED
inputs/image-4.png ADDED
inputs/image-5.png ADDED
inputs/image-6.png ADDED
inputs/image-7.png ADDED
inputs/image-8.png ADDED
inputs/image-9.png ADDED
pages/Workflow & Model Overview.py ADDED
@@ -0,0 +1,273 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+
3
+ # Custom CSS for better styling
4
+ st.markdown("""
5
+ <style>
6
+ .main-title {
7
+ font-size: 36px;
8
+ color: #4A90E2;
9
+ font-weight: bold;
10
+ text-align: center;
11
+ }
12
+ .sub-title {
13
+ font-size: 24px;
14
+ color: #4A90E2;
15
+ margin-top: 20px;
16
+ }
17
+ .section {
18
+ background-color: #f9f9f9;
19
+ padding: 15px;
20
+ border-radius: 10px;
21
+ margin-top: 20px;
22
+ }
23
+ .section h2 {
24
+ font-size: 22px;
25
+ color: #4A90E2;
26
+ }
27
+ .section p, .section ul {
28
+ color: #666666;
29
+ }
30
+ .link {
31
+ color: #4A90E2;
32
+ text-decoration: none;
33
+ }
34
+ .benchmark-table {
35
+ width: 100%;
36
+ border-collapse: collapse;
37
+ margin-top: 20px;
38
+ }
39
+ .benchmark-table th, .benchmark-table td {
40
+ border: 1px solid #ddd;
41
+ padding: 8px;
42
+ text-align: left;
43
+ }
44
+ .benchmark-table th {
45
+ background-color: #4A90E2;
46
+ color: white;
47
+ }
48
+ .benchmark-table td {
49
+ background-color: #f2f2f2;
50
+ }
51
+ </style>
52
+ """, unsafe_allow_html=True)
53
+
54
+ # Main Title
55
+ st.markdown('<div class="main-title">Image Captioning with VisionEncoderDecoderModel</div>', unsafe_allow_html=True)
56
+
57
+ # Description
58
+ st.markdown("""
59
+ <div class="section">
60
+ <p><strong>VisionEncoderDecoderModel</strong> allows you to initialize an image-to-text model using any pretrained Transformer-based vision model (e.g., ViT, BEiT, DeiT, Swin) as the encoder and any pretrained language model (e.g., RoBERTa, GPT2, BERT, DistilBERT) as the decoder.</p>
61
+ <p>This approach has been demonstrated to be effective in models like TrOCR: <a class="link" href="https://arxiv.org/abs/2109.10282" target="_blank">Transformer-based Optical Character Recognition with Pre-trained Models by Minghao Li et al.</a></p>
62
+ <p>After training or fine-tuning a VisionEncoderDecoderModel, it can be saved and loaded just like any other model. Examples are provided below.</p>
63
+ </div>
64
+ """, unsafe_allow_html=True)
65
+
66
+ # Image Captioning Overview
67
+ st.markdown('<div class="sub-title">What is Image Captioning?</div>', unsafe_allow_html=True)
68
+ st.markdown("""
69
+ <div class="section">
70
+ <p><strong>Image Captioning</strong> is the task of generating a textual description of an image. It uses a model to encode the image into a feature representation, which is then decoded by a language model to produce a natural language description.</p>
71
+ <h2>How It Works</h2>
72
+ <p>Image captioning typically involves the following steps:</p>
73
+ <ul>
74
+ <li><strong>Image Encoding</strong>: The image is passed through a vision model (e.g., ViT) to produce a feature representation.</li>
75
+ <li><strong>Caption Generation</strong>: The feature representation is fed into a language model (e.g., GPT2) to generate a caption for the image.</li>
76
+ </ul>
77
+ <h2>Why Use Image Captioning?</h2>
78
+ <p>Image captioning is useful for:</p>
79
+ <ul>
80
+ <li>Automatically generating descriptions for images, enhancing accessibility.</li>
81
+ <li>Improving search engine capabilities by allowing images to be indexed with textual content.</li>
82
+ <li>Supporting content management systems with automated tagging and description generation.</li>
83
+ </ul>
84
+ <h2>Where to Use It</h2>
85
+ <p>Applications of image captioning span various domains:</p>
86
+ <ul>
87
+ <li><strong>Social Media</strong>: Automatically generating captions for user-uploaded images.</li>
88
+ <li><strong>Digital Libraries</strong>: Creating descriptive metadata for image collections.</li>
89
+ <li><strong>Accessibility</strong>: Assisting visually impaired individuals by describing visual content.</li>
90
+ </ul>
91
+ <h2>Importance</h2>
92
+ <p>Image captioning is essential for bridging the gap between visual and textual information, enabling better interaction between machines and users by providing context and meaning to images.</p>
93
+ </div>
94
+ """, unsafe_allow_html=True)
95
+
96
+ # How to Use
97
+ st.markdown('<div class="sub-title">How to Use the Model</div>', unsafe_allow_html=True)
98
+ st.code('''
99
+ import sparknlp
100
+ from sparknlp.base import *
101
+ from sparknlp.annotator import *
102
+ from pyspark.ml import Pipeline
103
+
104
+ # Load image data
105
+ imageDF = spark.read \\
106
+ .format("image") \\
107
+ .option("dropInvalid", value = True) \\
108
+ .load("src/test/resources/image/")
109
+
110
+ # Define Image Assembler
111
+ imageAssembler = ImageAssembler() \\
112
+ .setInputCol("image") \\
113
+ .setOutputCol("image_assembler")
114
+
115
+ # Define VisionEncoderDecoder for image captioning
116
+ imageCaptioning = VisionEncoderDecoderForImageCaptioning \\
117
+ .pretrained() \\
118
+ .setBeamSize(2) \\
119
+ .setDoSample(False) \\
120
+ .setInputCols(["image_assembler"]) \\
121
+ .setOutputCol("caption")
122
+
123
+ # Create pipeline
124
+ pipeline = Pipeline().setStages([imageAssembler, imageCaptioning])
125
+
126
+ # Apply pipeline to image data
127
+ pipelineDF = pipeline.fit(imageDF).transform(imageDF)
128
+
129
+ # Show results
130
+ pipelineDF \\
131
+ .selectExpr("reverse(split(image.origin, '/'))[0] as image_name", "caption.result") \\
132
+ .show(truncate = False)
133
+ ''', language='python')
134
+
135
+ # Results
136
+ st.markdown('<div class="sub-title">Results</div>', unsafe_allow_html=True)
137
+ st.markdown("""
138
+ <div class="section">
139
+ <table class="benchmark-table">
140
+ <tr>
141
+ <th>Image Name</th>
142
+ <th>Result</th>
143
+ </tr>
144
+ <tr>
145
+ <td>palace.JPEG</td>
146
+ <td>[a large room filled with furniture and a large window]</td>
147
+ </tr>
148
+ <tr>
149
+ <td>egyptian_cat.jpeg</td>
150
+ <td>[a cat laying on a couch next to another cat]</td>
151
+ </tr>
152
+ <tr>
153
+ <td>hippopotamus.JPEG</td>
154
+ <td>[a brown bear in a body of water]</td>
155
+ </tr>
156
+ <tr>
157
+ <td>hen.JPEG</td>
158
+ <td>[a flock of chickens standing next to each other]</td>
159
+ </tr>
160
+ <tr>
161
+ <td>ostrich.JPEG</td>
162
+ <td>[a large bird standing on top of a lush green field]</td>
163
+ </tr>
164
+ <tr>
165
+ <td>junco.JPEG</td>
166
+ <td>[a small bird standing on a wet ground]</td>
167
+ </tr>
168
+ <tr>
169
+ <td>bluetick.jpg</td>
170
+ <td>[a small dog standing on a wooden floor]</td>
171
+ </tr>
172
+ <tr>
173
+ <td>chihuahua.jpg</td>
174
+ <td>[a small brown dog wearing a blue sweater]</td>
175
+ </tr>
176
+ <tr>
177
+ <td>tractor.JPEG</td>
178
+ <td>[a man is standing in a field with a tractor]</td>
179
+ </tr>
180
+ <tr>
181
+ <td>ox.JPEG</td>
182
+ <td>[a large brown cow standing on top of a lush green field]</td>
183
+ </tr>
184
+ </table>
185
+ </div>
186
+ """, unsafe_allow_html=True)
187
+
188
+ # Model Information
189
+ st.markdown('<div class="sub-title">Model Information</div>', unsafe_allow_html=True)
190
+ st.markdown("""
191
+ <div class="section">
192
+ <table class="benchmark-table">
193
+ <tr>
194
+ <th>Attribute</th>
195
+ <th>Description</th>
196
+ </tr>
197
+ <tr>
198
+ <td><strong>Model Name</strong></td>
199
+ <td>image_captioning_vit_gpt2</td>
200
+ </tr>
201
+ <tr>
202
+ <td><strong>Compatibility</strong></td>
203
+ <td>Spark NLP 5.1.2+</td>
204
+ </tr>
205
+ <tr>
206
+ <td><strong>License</strong></td>
207
+ <td>Open Source</td>
208
+ </tr>
209
+ <tr>
210
+ <td><strong>Edition</strong></td>
211
+ <td>Official</td>
212
+ </tr>
213
+ <tr>
214
+ <td><strong>Input Labels</strong></td>
215
+ <td>[image_assembler]</td>
216
+ </tr>
217
+ <tr>
218
+ <td><strong>Output Labels</strong></td>
219
+ <td>[caption]</td>
220
+ </tr>
221
+ <tr>
222
+ <td><strong>Language</strong></td>
223
+ <td>en</td>
224
+ </tr>
225
+ <tr>
226
+ <td><strong>Size</strong></td>
227
+ <td>890.3 MB</td>
228
+ </tr>
229
+ </table>
230
+ </div>
231
+ """, unsafe_allow_html=True)
232
+
233
+ # Data Source Section
234
+ st.markdown('<div class="sub-title">Data Source</div>', unsafe_allow_html=True)
235
+ st.markdown("""
236
+ <div class="section">
237
+ <p>The image captioning model is available on <a class="link" href="https://huggingface.co/nlpconnect/vit-gpt2-image-captioning" target="_blank">Hugging Face</a>. This model uses ViT for image encoding and GPT2 for generating captions.</p>
238
+ </div>
239
+ """, unsafe_allow_html=True)
240
+
241
+ # Conclusion
242
+ st.markdown('<div class="sub-title">Conclusion</div>', unsafe_allow_html=True)
243
+ st.markdown("""
244
+ <div class="section">
245
+ <p>The <strong>VisionEncoderDecoderModel</strong> represents a powerful approach for bridging the gap between visual and textual information. By leveraging pretrained models for both image encoding and text generation, it effectively captures the nuances of both domains, resulting in high-quality outputs such as detailed image captions and accurate text-based interpretations of visual content.</p>
246
+ </div>
247
+ """, unsafe_allow_html=True)
248
+
249
+ # References
250
+ st.markdown('<div class="sub-title">References</div>', unsafe_allow_html=True)
251
+ st.markdown("""
252
+ <div class="section">
253
+ <ul>
254
+ <li><a class="link" href="https://sparknlp.org/2023/09/20/image_captioning_vit_gpt2_en.html" target="_blank" rel="noopener">Image Captioning Model on Spark NLP</a></li>
255
+ <li><a class="link" href="https://huggingface.co/nlpconnect/vit-gpt2-image-captioning" target="_blank">Image Captioning Model on Hugging Face</a></li>
256
+ <li><a class="link" href="https://arxiv.org/abs/2109.10282" target="_blank">TrOCR Paper</a></li>
257
+ </ul>
258
+ </div>
259
+ """, unsafe_allow_html=True)
260
+
261
+ # Community & Support
262
+ st.markdown('<div class="sub-title">Community & Support</div>', unsafe_allow_html=True)
263
+ st.markdown("""
264
+ <div class="section">
265
+ <ul>
266
+ <li><a class="link" href="https://sparknlp.org/" target="_blank">Official Website</a>: Documentation and examples</li>
267
+ <li><a class="link" href="https://join.slack.com/t/spark-nlp/shared_invite/zt-198dipu77-L3UWNe_AJ8xqDk0ivmih5Q" target="_blank">Slack</a>: Live discussion with the community and team</li>
268
+ <li><a class="link" href="https://github.com/JohnSnowLabs/spark-nlp" target="_blank">GitHub</a>: Bug reports, feature requests, and contributions</li>
269
+ <li><a class="link" href="https://medium.com/spark-nlp" target="_blank">Medium</a>: Spark NLP articles</li>
270
+ <li><a class="link" href="https://www.youtube.com/channel/UCmFOjlpYEhxf_wJUDuz6xxQ/videos" target="_blank">YouTube</a>: Video tutorials</li>
271
+ </ul>
272
+ </div>
273
+ """, unsafe_allow_html=True)
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ streamlit
2
+ streamlit-tags
3
+ pandas
4
+ numpy
5
+ spark-nlp
6
+ pyspark