abdullahmubeen10
committed on
Update pages/Workflow & Model Overview.py
Browse files- pages/Workflow & Model Overview.py +609 -609
pages/Workflow & Model Overview.py
CHANGED
@@ -1,610 +1,610 @@
|
|
1 |
-
import streamlit as st

# Page-wide CSS: title styles, grey "section" cards, and the benchmark table theme.
# Injected once via st.markdown with unsafe_allow_html so later raw-HTML blocks pick it up.
st.markdown("""
<style>
    .main-title {
        font-size: 36px;
        color: #4A90E2;
        font-weight: bold;
        text-align: center;
    }
    .sub-title {
        font-size: 24px;
        color: #4A90E2;
        margin-top: 20px;
    }
    .section {
        background-color: #f9f9f9;
        padding: 15px;
        border-radius: 10px;
        margin-top: 20px;
    }
    .section h2 {
        font-size: 22px;
        color: #4A90E2;
    }
    .section p, .section ul {
        color: #666666;
    }
    .link {
        color: #4A90E2;
        text-decoration: none;
    }
    .benchmark-table {
        width: 100%;
        border-collapse: collapse;
        margin-top: 20px;
    }
    .benchmark-table th, .benchmark-table td {
        border: 1px solid #ddd;
        padding: 8px;
        text-align: left;
    }
    .benchmark-table th {
        background-color: #4A90E2;
        color: white;
    }
    .benchmark-table td {
        background-color: #f2f2f2;
    }
</style>
""", unsafe_allow_html=True)

# Page title.
st.markdown('<div class="main-title">Introduction to ALBERT Annotators in Spark NLP</div>', unsafe_allow_html=True)

# Short introduction card.
st.markdown("""
<div class="section">
    <p>ALBERT (A Lite BERT) offers a more efficient alternative to BERT by implementing two parameter-reduction techniques: splitting the embedding matrix and using repeating layers. It maintains high performance while being more memory-efficient. Below, we provide an overview of the ALBERT annotator for token classification:</p>
</div>
""", unsafe_allow_html=True)

# One tab per ALBERT task covered on this page.
tab1, tab2, tab3 = st.tabs(["ALBERT for Token Classification", "ALBERT for Sequence Classification", "ALBERT for Question Answering"])
# Tab 1: AlbertForTokenClassification (NER on CoNLL-03).
# Fix: the References list previously linked to the QA model card
# (albert_base_qa_squad2) — a copy-paste error; it now links to the
# token-classifier model card used by this tab.
with tab1:
    st.markdown("""
    <div class="section">
        <h2>ALBERT for Token Classification</h2>
        <p>The <strong>AlbertForTokenClassification</strong> annotator is designed for Named Entity Recognition (NER) tasks using ALBERT. This model efficiently handles token classification, enabling the identification and classification of entities in text. The ALBERT model, with its parameter-reduction techniques, achieves state-of-the-art performance while being more lightweight compared to BERT.</p>
        <p>Token classification with ALBERT enables:</p>
        <ul>
            <li><strong>Named Entity Recognition (NER):</strong> Identifying and classifying entities such as names, organizations, locations, and other predefined categories.</li>
            <li><strong>Information Extraction:</strong> Extracting key information from unstructured text for further analysis.</li>
            <li><strong>Text Categorization:</strong> Enhancing document retrieval and categorization based on entity recognition.</li>
        </ul>
        <p>Here is an example of how ALBERT token classification works:</p>
        <table class="benchmark-table">
            <tr><th>Entity</th><th>Label</th></tr>
            <tr><td>Google</td><td>ORG</td></tr>
            <tr><td>Satya Nadella</td><td>PER</td></tr>
            <tr><td>Seattle</td><td>LOC</td></tr>
        </table>
    </div>
    """, unsafe_allow_html=True)

    # ALBERT Token Classification - NER CoNLL
    st.markdown('<div class="sub-title">ALBERT Token Classification - NER CoNLL</div>', unsafe_allow_html=True)
    st.markdown("""
    <div class="section">
        <p>The <strong>albert_base_token_classifier_conll03</strong> is a fine-tuned ALBERT model for token classification tasks, specifically adapted for Named Entity Recognition (NER) on the CoNLL-03 dataset. It recognizes four types of entities: location (LOC), organizations (ORG), person (PER), and Miscellaneous (MISC).</p>
    </div>
    """, unsafe_allow_html=True)

    # How to Use the Model - Token Classification
    # NOTE: the snippet below is display-only (rendered via st.code); it assumes an
    # active Spark session named `spark` in the reader's environment.
    st.markdown('<div class="sub-title">How to Use the Model</div>', unsafe_allow_html=True)
    st.code('''
from sparknlp.base import *
from sparknlp.annotator import *
from pyspark.ml import Pipeline
from pyspark.sql.functions import col, expr

document_assembler = DocumentAssembler() \\
    .setInputCol('text') \\
    .setOutputCol('document')

tokenizer = Tokenizer() \\
    .setInputCols(['document']) \\
    .setOutputCol('token')

tokenClassifier = AlbertForTokenClassification \\
    .pretrained('albert_base_token_classifier_conll03', 'en') \\
    .setInputCols(['token', 'document']) \\
    .setOutputCol('ner') \\
    .setCaseSensitive(True) \\
    .setMaxSentenceLength(512)

# Convert NER labels to entities
ner_converter = NerConverter() \\
    .setInputCols(['document', 'token', 'ner']) \\
    .setOutputCol('entities')

pipeline = Pipeline(stages=[
    document_assembler,
    tokenizer,
    tokenClassifier,
    ner_converter
])

example = spark.createDataFrame([["My name is John!"]]).toDF("text")
result = pipeline.fit(example).transform(example)

result.select(
    expr("explode(entities) as ner_chunk")
).select(
    col("ner_chunk.result").alias("chunk"),
    col("ner_chunk.metadata.entity").alias("ner_label")
).show(truncate=False)
''', language='python')

    # Expected output of the snippet above.
    st.text("""
+-----+---------+
|chunk|ner_label|
+-----+---------+
|John |PER      |
+-----+---------+
""")

    # Performance Metrics
    st.markdown('<div class="sub-title">Performance Metrics</div>', unsafe_allow_html=True)
    st.markdown("""
    <div class="section">
        <p>Here are the detailed performance metrics for the ALBERT token classification model:</p>
        <table class="benchmark-table">
            <tr><th>Entity</th><th>Precision</th><th>Recall</th><th>F1-Score</th><th>Support</th></tr>
            <tr><td>B-LOC</td><td>0.95</td><td>0.97</td><td>0.96</td><td>1837</td></tr>
            <tr><td>B-MISC</td><td>0.87</td><td>0.86</td><td>0.87</td><td>922</td></tr>
            <tr><td>B-ORG</td><td>0.90</td><td>0.91</td><td>0.90</td><td>1341</td></tr>
            <tr><td>B-PER</td><td>0.91</td><td>0.97</td><td>0.94</td><td>1842</td></tr>
            <tr><td>I-LOC</td><td>0.88</td><td>0.86</td><td>0.87</td><td>257</td></tr>
            <tr><td>I-MISC</td><td>0.78</td><td>0.76</td><td>0.77</td><td>346</td></tr>
            <tr><td>I-ORG</td><td>0.84</td><td>0.85</td><td>0.85</td><td>751</td></tr>
            <tr><td>I-PER</td><td>0.97</td><td>0.92</td><td>0.94</td><td>1307</td></tr>
            <tr><td>O</td><td>0.99</td><td>0.99</td><td>0.99</td><td>42759</td></tr>
            <tr><td>average</td><td>0.92</td><td>0.92</td><td>0.92</td><td>52000</td></tr>
        </table>
    </div>
    """, unsafe_allow_html=True)

    # Model Info Section
    st.markdown('<div class="sub-title">Model Info</div>', unsafe_allow_html=True)
    st.markdown("""
    <div class="section">
        <ul>
            <li><strong>Model Name:</strong> ALBERT for Token Classification</li>
            <li><strong>Pretrained Model:</strong> albert_base_token_classifier_conll03</li>
            <li><strong>Training Dataset:</strong> CoNLL-03</li>
            <li><strong>Languages Supported:</strong> English</li>
            <li><strong>Use Cases:</strong>
                <ul>
                    <li>Named Entity Recognition (NER)</li>
                    <li>Information Extraction</li>
                    <li>Text Categorization</li>
                </ul>
            </li>
            <li><strong>Performance:</strong> High accuracy with a focus on memory efficiency</li>
            <li><strong>Implementation:</strong> Spark NLP</li>
            <li><strong>Resource Requirements:</strong> Moderate computational resources; suitable for production environments with optimization</li>
        </ul>
    </div>
    """, unsafe_allow_html=True)

    # References Section
    st.markdown('<div class="sub-title">References</div>', unsafe_allow_html=True)
    st.markdown("""
    <div class="section">
        <ul>
            <li><a class="link" href="https://arxiv.org/abs/1909.11942" target="_blank">Lan, Z., Chen, J., Goodman, S., Gimpel, K., Sharma, P., & Soricut, R. (2019). ALBERT: A Lite BERT for Self-supervised Learning of Language Representations. arXiv preprint arXiv:1909.11942.</a></li>
            <li><a class="link" href="https://github.com/google-research/albert" target="_blank">Google Research's ALBERT GitHub Repository</a></li>
            <li><a class="link" href="https://sparknlp.org/2021/09/26/albert_base_token_classifier_conll03_en.html" target="_blank">Spark NLP Model - albert_base_token_classifier_conll03</a></li>
            <li><a class="link" href="https://nlp.stanford.edu/projects/conll2003/" target="_blank">CoNLL-03 Named Entity Recognition Dataset</a></li>
        </ul>
    </div>
    """, unsafe_allow_html=True)
# Tab 2: AlbertForSequenceClassification (text classification on AG News).
with tab2:
    st.markdown("""
    <div class="section">
        <h2>ALBERT for Sequence Classification</h2>
        <p>The <strong>AlbertForSequenceClassification</strong> annotator is tailored for tasks like sentiment analysis or multi-class text classification using the ALBERT model. This model efficiently handles sequence classification, achieving state-of-the-art performance with reduced parameters compared to BERT.</p>
        <p>Sequence classification with ALBERT enables:</p>
        <ul>
            <li><strong>Sentiment Analysis:</strong> Determining the sentiment expressed in text, such as positive, negative, or neutral.</li>
            <li><strong>Multi-Class Text Classification:</strong> Categorizing text into predefined classes, such as news categories or topics.</li>
            <li><strong>Document Classification:</strong> Enhancing search and categorization of documents based on content classification.</li>
        </ul>
        <p>Here is an example of how ALBERT sequence classification works:</p>
        <table class="benchmark-table">
            <tr><th>Text</th><th>Label</th></tr>
            <tr><td>Disney Comics was a comic book publishing company operated by The Walt Disney Company which ran from 1990 to 1993.</td><td>Business</td></tr>
        </table>
    </div>
    """, unsafe_allow_html=True)

    # ALBERT Sequence Classification - AG News
    st.markdown('<div class="sub-title">ALBERT Sequence Classification - AG News</div>', unsafe_allow_html=True)
    st.markdown("""
    <div class="section">
        <p>The <strong>albert_base_sequence_classifier_ag_news</strong> is a fine-tuned ALBERT model for sequence classification tasks, specifically adapted for text classification on the AG News dataset. It recognizes four categories: Business, Sci/Tech, Sports, and World.</p>
    </div>
    """, unsafe_allow_html=True)

    # How to Use the Model - Sequence Classification
    # Display-only snippet; assumes an active `spark` session on the reader's side.
    st.markdown('<div class="sub-title">How to Use the Model</div>', unsafe_allow_html=True)
    st.code('''
from sparknlp.base import *
from sparknlp.annotator import *
from pyspark.ml import Pipeline
from pyspark.sql.functions import col, expr

document_assembler = DocumentAssembler() \\
    .setInputCol('text') \\
    .setOutputCol('document')

tokenizer = Tokenizer() \\
    .setInputCols(['document']) \\
    .setOutputCol('token')

sequenceClassifier = AlbertForSequenceClassification \\
    .pretrained('albert_base_sequence_classifier_ag_news', 'en') \\
    .setInputCols(['token', 'document']) \\
    .setOutputCol('class') \\
    .setCaseSensitive(False) \\
    .setMaxSentenceLength(512)

pipeline = Pipeline(stages=[
    document_assembler,
    tokenizer,
    sequenceClassifier
])

example = spark.createDataFrame([["Disney Comics was a comic book publishing company operated by The Walt Disney Company which ran from 1990 to 1993."]]).toDF("text")
result = pipeline.fit(example).transform(example)

result.select(
    expr("explode(class) as classification_result")
).select(
    col("classification_result.result").alias("category")
).show(truncate=False)
''', language='python')

    # Expected output of the snippet above.
    st.text("""
+---------+
|category |
+---------+
|Business |
+---------+
""")

    # Performance Metrics
    st.markdown('<div class="sub-title">Performance Metrics</div>', unsafe_allow_html=True)
    st.markdown("""
    <div class="section">
        <p>Here are the detailed performance metrics for the ALBERT sequence classification model on the AG News dataset:</p>
        <table class="benchmark-table">
            <tr><th>Metric</th><th>Score</th></tr>
            <tr><td>Accuracy</td><td>0.9472</td></tr>
            <tr><td>F1-Score</td><td>0.9472</td></tr>
            <tr><td>Precision</td><td>0.9472</td></tr>
            <tr><td>Recall</td><td>0.9472</td></tr>
            <tr><td>Evaluation Loss</td><td>0.1882</td></tr>
        </table>
    </div>
    """, unsafe_allow_html=True)

    # Model Info Section
    st.markdown('<div class="sub-title">Model Info</div>', unsafe_allow_html=True)
    st.markdown("""
    <div class="section">
        <ul>
            <li><strong>Model Name:</strong> ALBERT for Sequence Classification</li>
            <li><strong>Pretrained Model:</strong> albert_base_sequence_classifier_ag_news</li>
            <li><strong>Training Dataset:</strong> AG News</li>
            <li><strong>Languages Supported:</strong> English</li>
            <li><strong>Use Cases:</strong>
                <ul>
                    <li>Sentiment Analysis</li>
                    <li>Multi-Class Text Classification</li>
                    <li>Document Classification</li>
                </ul>
            </li>
            <li><strong>Performance:</strong> High accuracy with a focus on memory efficiency</li>
            <li><strong>Implementation:</strong> Spark NLP</li>
            <li><strong>Resource Requirements:</strong> Moderate computational resources; suitable for production environments with optimization</li>
        </ul>
    </div>
    """, unsafe_allow_html=True)

    # References Section
    st.markdown('<div class="sub-title">References</div>', unsafe_allow_html=True)
    st.markdown("""
    <div class="section">
        <ul>
            <li><a class="link" href="https://arxiv.org/abs/1909.11942" target="_blank">Lan, Z., Chen, J., Goodman, S., Gimpel, K., Sharma, P., & Soricut, R. (2019). ALBERT: A Lite BERT for Self-supervised Learning of Language Representations. arXiv preprint arXiv:1909.11942.</a></li>
            <li><a class="link" href="https://github.com/google-research/albert" target="_blank">Google Research's ALBERT GitHub Repository</a></li>
            <li><a class="link" href="https://sparknlp.org/2021/12/16/albert_base_sequence_classifier_ag_news_en.html" target="_blank">Spark NLP Model - albert_base_sequence_classifier_ag_news</a></li>
            <li><a class="link" href="https://huggingface.co/datasets/ag_news" target="_blank">AG News Dataset</a></li>
        </ul>
    </div>
    """, unsafe_allow_html=True)
# Tab 3: AlbertForQuestionAnswering (extractive QA on SQuAD2), followed by the
# page-level Community & Support footer.
# Fix: the Community & Support block opened with a truncated `<div class="` —
# an unclosed attribute producing broken HTML; it now opens with
# `<div class="section">`, matching every other section card on this page.
with tab3:
    st.markdown("""
    <div class="section">
        <h2>ALBERT for Question Answering</h2>
        <p>The <strong>AlbertForQuestionAnswering</strong> annotator is specialized for tasks involving Question Answering (QA) using the ALBERT model. This model efficiently processes question-context pairs to provide accurate answers, making it ideal for QA systems and information retrieval applications.</p>
        <p>Question Answering with ALBERT enables:</p>
        <ul>
            <li><strong>Information Retrieval:</strong> Extracting precise answers from large text corpora based on user queries.</li>
            <li><strong>Knowledge Management:</strong> Enhancing customer support and information systems by providing accurate answers.</li>
            <li><strong>Contextual Understanding:</strong> Leveraging ALBERT’s capabilities to understand the context of questions and provide relevant answers.</li>
        </ul>
        <p>Here is an example of how ALBERT question answering works:</p>
        <table class="benchmark-table">
            <tr><th>Question</th><th>Context</th><th>Answer</th></tr>
            <tr><td>What is my name?</td><td>My name is Clara and I live in Berkeley.</td><td>Clara</td></tr>
        </table>
    </div>
    """, unsafe_allow_html=True)

    # ALBERT Question Answering - SQuAD2
    st.markdown('<div class="sub-title">ALBERT Question Answering - SQuAD2</div>', unsafe_allow_html=True)
    st.markdown("""
    <div class="section">
        <p>The <strong>albert_base_qa_squad2</strong> is a fine-tuned ALBERT model for Question Answering tasks, specifically adapted for the SQuAD2 dataset. It is capable of answering questions based on the provided context with high accuracy.</p>
    </div>
    """, unsafe_allow_html=True)

    # How to Use the Model - Question Answering
    # Display-only snippet; assumes an active `spark` session on the reader's side.
    st.markdown('<div class="sub-title">How to Use the Model</div>', unsafe_allow_html=True)
    st.code('''
from sparknlp.base import *
from sparknlp.annotator import *
from pyspark.ml import Pipeline
from pyspark.sql.functions import col, expr

documentAssembler = MultiDocumentAssembler() \\
    .setInputCols(["question", "context"]) \\
    .setOutputCols(["document_question", "document_context"])

spanClassifier = AlbertForQuestionAnswering.pretrained("albert_base_qa_squad2","en") \\
    .setInputCols(["document_question", "document_context"]) \\
    .setOutputCol("answer") \\
    .setCaseSensitive(False)

pipeline = Pipeline(stages=[documentAssembler, spanClassifier])

data = spark.createDataFrame([["What is my name?", "My name is Clara and I live in Berkeley."]]).toDF("question", "context")
result = pipeline.fit(data).transform(data)

result.select(
    col("answer.result").alias("predicted_answer")
).show(truncate=False)
''', language='python')

    # Expected output of the snippet above.
    st.text("""
+----------------+
|predicted_answer|
+----------------+
|[clara]         |
+----------------+
""")

    # Performance Metrics
    st.markdown('<div class="sub-title">Performance Metrics</div>', unsafe_allow_html=True)
    st.markdown("""
    <div class="section">
        <p>The performance metrics of the ALBERT question answering model on a development subset of the SQuAD2 dataset are:</p>
        <table class="benchmark-table">
            <tr><th>Metric</th><th>Score</th></tr>
            <tr><td>Exact Match</td><td>78.71%</td></tr>
            <tr><td>F1 Score</td><td>81.89%</td></tr>
            <tr><td>Total</td><td>6078</td></tr>
            <tr><td>HasAns Exact Match</td><td>75.40%</td></tr>
            <tr><td>HasAns F1 Score</td><td>82.04%</td></tr>
            <tr><td>HasAns Total</td><td>2910</td></tr>
            <tr><td>NoAns Exact Match</td><td>81.76%</td></tr>
            <tr><td>NoAns F1 Score</td><td>81.76%</td></tr>
            <tr><td>NoAns Total</td><td>3168</td></tr>
            <tr><td>Best Exact Match</td><td>78.73%</td></tr>
            <tr><td>Best F1 Score</td><td>81.91%</td></tr>
        </table>
    </div>
    """, unsafe_allow_html=True)

    # Model Info Section
    st.markdown('<div class="sub-title">Model Info</div>', unsafe_allow_html=True)
    st.markdown("""
    <div class="section">
        <ul>
            <li><strong>Model Name:</strong> ALBERT for Question Answering</li>
            <li><strong>Pretrained Model:</strong> albert_base_qa_squad2</li>
            <li><strong>Training Dataset:</strong> SQuAD2</li>
            <li><strong>Languages Supported:</strong> English</li>
            <li><strong>Use Cases:</strong>
                <ul>
                    <li>Information Retrieval</li>
                    <li>Knowledge Management</li>
                    <li>Contextual Understanding</li>
                </ul>
            </li>
            <li><strong>Performance:</strong> High accuracy with optimized resource usage</li>
            <li><strong>Implementation:</strong> Spark NLP</li>
            <li><strong>Resource Requirements:</strong> Moderate computational resources; suitable for production environments</li>
        </ul>
    </div>
    """, unsafe_allow_html=True)

    # References Section
    st.markdown('<div class="sub-title">References</div>', unsafe_allow_html=True)
    st.markdown("""
    <div class="section">
        <ul>
            <li><a class="link" href="https://arxiv.org/abs/1909.11942" target="_blank">Lan, Z., Chen, J., Goodman, S., Gimpel, K., Sharma, P., & Soricut, R. (2019). ALBERT: A Lite BERT for Self-supervised Learning of Language Representations. arXiv preprint arXiv:1909.11942.</a></li>
            <li><a class="link" href="https://sparknlp.org/2022/06/15/albert_base_qa_squad2_en_3_0.html" target="_blank">Spark NLP Model - albert_base_qa_squad2</a></li>
        </ul>
    </div>
    """, unsafe_allow_html=True)

# Community & Support footer.
# NOTE(review): the paste this was reconstructed from lost indentation — this
# footer may originally have lived inside `with tab3:`; placed at top level here
# so it renders regardless of the selected tab. Confirm against the deployed page.
st.markdown('<div class="sub-title">Community & Support</div>', unsafe_allow_html=True)
st.markdown("""
<div class="section">
    <ul>
        <li><a class="link" href="https://sparknlp.org/" target="_blank">Official Website</a>: Documentation and examples</li>
        <li><a class="link" href="https://join.slack.com/t/spark-nlp/shared_invite/zt-198dipu77-L3UWNe_AJ8xqDk0ivmih5Q" target="_blank">Slack</a>: Live discussion with the community and team</li>
        <li><a class="link" href="https://github.com/JohnSnowLabs/spark-nlp" target="_blank">GitHub</a>: Bug reports, feature requests, and contributions</li>
    </ul>
</div>
""", unsafe_allow_html=True)
|
|
|
1 |
+
import streamlit as st
|
2 |
+
|
3 |
+
# Custom CSS for better styling
|
4 |
+
st.markdown("""
|
5 |
+
<style>
|
6 |
+
.main-title {
|
7 |
+
font-size: 36px;
|
8 |
+
color: #4A90E2;
|
9 |
+
font-weight: bold;
|
10 |
+
text-align: center;
|
11 |
+
}
|
12 |
+
.sub-title {
|
13 |
+
font-size: 24px;
|
14 |
+
color: #4A90E2;
|
15 |
+
margin-top: 20px;
|
16 |
+
}
|
17 |
+
.section {
|
18 |
+
background-color: #f9f9f9;
|
19 |
+
padding: 15px;
|
20 |
+
border-radius: 10px;
|
21 |
+
margin-top: 20px;
|
22 |
+
}
|
23 |
+
.section h2 {
|
24 |
+
font-size: 22px;
|
25 |
+
color: #4A90E2;
|
26 |
+
}
|
27 |
+
.section p, .section ul {
|
28 |
+
color: #666666;
|
29 |
+
}
|
30 |
+
.link {
|
31 |
+
color: #4A90E2;
|
32 |
+
text-decoration: none;
|
33 |
+
}
|
34 |
+
.benchmark-table {
|
35 |
+
width: 100%;
|
36 |
+
border-collapse: collapse;
|
37 |
+
margin-top: 20px;
|
38 |
+
}
|
39 |
+
.benchmark-table th, .benchmark-table td {
|
40 |
+
border: 1px solid #ddd;
|
41 |
+
padding: 8px;
|
42 |
+
text-align: left;
|
43 |
+
}
|
44 |
+
.benchmark-table th {
|
45 |
+
background-color: #4A90E2;
|
46 |
+
color: white;
|
47 |
+
}
|
48 |
+
.benchmark-table td {
|
49 |
+
background-color: #f2f2f2;
|
50 |
+
}
|
51 |
+
</style>
|
52 |
+
""", unsafe_allow_html=True)
|
53 |
+
|
54 |
+
# Title
|
55 |
+
st.markdown('<div class="main-title">Introduction to ALBERT Annotators in Spark NLP</div>', unsafe_allow_html=True)
|
56 |
+
|
57 |
+
# Subtitle
|
58 |
+
st.markdown("""
|
59 |
+
<div class="section">
|
60 |
+
<p>ALBERT (A Lite BERT) offers a more efficient alternative to BERT by implementing two parameter-reduction techniques: splitting the embedding matrix and using repeating layers. It maintains high performance while being more memory-efficient. Below, we provide an overview of the ALBERT annotator for token classification:</p>
|
61 |
+
</div>
|
62 |
+
""", unsafe_allow_html=True)
|
63 |
+
|
64 |
+
tab1, tab2, tab3 = st.tabs(["ALBERT for Token Classification", "ALBERT for Sequence Classification", "ALBERT for Question Answering"])
|
65 |
+
|
66 |
+
with tab1:
|
67 |
+
st.markdown("""
|
68 |
+
<div class="section">
|
69 |
+
<h2>ALBERT for Token Classification</h2>
|
70 |
+
<p>The <strong>AlbertForTokenClassification</strong> annotator is designed for Named Entity Recognition (NER) tasks using ALBERT. This model efficiently handles token classification, enabling the identification and classification of entities in text. The ALBERT model, with its parameter-reduction techniques, achieves state-of-the-art performance while being more lightweight compared to BERT.</p>
|
71 |
+
<p>Token classification with ALBERT enables:</p>
|
72 |
+
<ul>
|
73 |
+
<li><strong>Named Entity Recognition (NER):</strong> Identifying and classifying entities such as names, organizations, locations, and other predefined categories.</li>
|
74 |
+
<li><strong>Information Extraction:</strong> Extracting key information from unstructured text for further analysis.</li>
|
75 |
+
<li><strong>Text Categorization:</strong> Enhancing document retrieval and categorization based on entity recognition.</li>
|
76 |
+
</ul>
|
77 |
+
<p>Here is an example of how ALBERT token classification works:</p>
|
78 |
+
<table class="benchmark-table">
|
79 |
+
<tr>
|
80 |
+
<th>Entity</th>
|
81 |
+
<th>Label</th>
|
82 |
+
</tr>
|
83 |
+
<tr>
|
84 |
+
<td>Google</td>
|
85 |
+
<td>ORG</td>
|
86 |
+
</tr>
|
87 |
+
<tr>
|
88 |
+
<td>Satya Nadella</td>
|
89 |
+
<td>PER</td>
|
90 |
+
</tr>
|
91 |
+
<tr>
|
92 |
+
<td>Seattle</td>
|
93 |
+
<td>LOC</td>
|
94 |
+
</tr>
|
95 |
+
</table>
|
96 |
+
</div>
|
97 |
+
""", unsafe_allow_html=True)
# --- ALBERT Token Classification - NER CoNLL -------------------------------
# Renders the token-classification (NER) demo section: model description,
# usage pipeline, sample output, benchmark table, model info and references.
# NOTE(review): the sibling sections below live under `with tab2:`/`with tab3:`;
# this run of statements is presumably inside a `with tab1:` context opened
# earlier in the file — confirm indentation against the full script.
st.markdown('<div class="sub-title">ALBERT Token Classification - NER CoNLL</div>', unsafe_allow_html=True)
st.markdown("""
<div class="section">
<p>The <strong>albert_base_token_classifier_conll03</strong> is a fine-tuned ALBERT model for token classification tasks, specifically adapted for Named Entity Recognition (NER) on the CoNLL-03 dataset. It recognizes four types of entities: location (LOC), organizations (ORG), person (PER), and Miscellaneous (MISC).</p>
</div>
""", unsafe_allow_html=True)

# How to Use the Model - Token Classification
st.markdown('<div class="sub-title">How to Use the Model</div>', unsafe_allow_html=True)
st.code('''
from sparknlp.base import *
from sparknlp.annotator import *
from pyspark.ml import Pipeline
from pyspark.sql.functions import col, expr

document_assembler = DocumentAssembler() \\
    .setInputCol('text') \\
    .setOutputCol('document')

tokenizer = Tokenizer() \\
    .setInputCols(['document']) \\
    .setOutputCol('token')

tokenClassifier = AlbertForTokenClassification \\
    .pretrained('albert_base_token_classifier_conll03', 'en') \\
    .setInputCols(['token', 'document']) \\
    .setOutputCol('ner') \\
    .setCaseSensitive(True) \\
    .setMaxSentenceLength(512)

# Convert NER labels to entities
ner_converter = NerConverter() \\
    .setInputCols(['document', 'token', 'ner']) \\
    .setOutputCol('entities')

pipeline = Pipeline(stages=[
    document_assembler,
    tokenizer,
    tokenClassifier,
    ner_converter
])

example = spark.createDataFrame([["My name is John!"]]).toDF("text")
result = pipeline.fit(example).transform(example)

result.select(
    expr("explode(entities) as ner_chunk")
).select(
    col("ner_chunk.result").alias("chunk"),
    col("ner_chunk.metadata.entity").alias("ner_label")
).show(truncate=False)
''', language='python')

# Results (column padding restored to match the header widths)
st.text("""
+-----+---------+
|chunk|ner_label|
+-----+---------+
|John |PER      |
+-----+---------+
""")

# Performance Metrics
st.markdown('<div class="sub-title">Performance Metrics</div>', unsafe_allow_html=True)
st.markdown("""
<div class="section">
<p>Here are the detailed performance metrics for the ALBERT token classification model:</p>
<table class="benchmark-table">
<tr>
<th>Entity</th>
<th>Precision</th>
<th>Recall</th>
<th>F1-Score</th>
<th>Support</th>
</tr>
<tr>
<td>B-LOC</td>
<td>0.95</td>
<td>0.97</td>
<td>0.96</td>
<td>1837</td>
</tr>
<tr>
<td>B-MISC</td>
<td>0.87</td>
<td>0.86</td>
<td>0.87</td>
<td>922</td>
</tr>
<tr>
<td>B-ORG</td>
<td>0.90</td>
<td>0.91</td>
<td>0.90</td>
<td>1341</td>
</tr>
<tr>
<td>B-PER</td>
<td>0.91</td>
<td>0.97</td>
<td>0.94</td>
<td>1842</td>
</tr>
<tr>
<td>I-LOC</td>
<td>0.88</td>
<td>0.86</td>
<td>0.87</td>
<td>257</td>
</tr>
<tr>
<td>I-MISC</td>
<td>0.78</td>
<td>0.76</td>
<td>0.77</td>
<td>346</td>
</tr>
<tr>
<td>I-ORG</td>
<td>0.84</td>
<td>0.85</td>
<td>0.85</td>
<td>751</td>
</tr>
<tr>
<td>I-PER</td>
<td>0.97</td>
<td>0.92</td>
<td>0.94</td>
<td>1307</td>
</tr>
<tr>
<td>O</td>
<td>0.99</td>
<td>0.99</td>
<td>0.99</td>
<td>42759</td>
</tr>
<tr>
<td>average</td>
<td>0.92</td>
<td>0.92</td>
<td>0.92</td>
<td>52000</td>
</tr>
</table>
</div>
""", unsafe_allow_html=True)

# Model Info Section
st.markdown('<div class="sub-title">Model Info</div>', unsafe_allow_html=True)
st.markdown("""
<div class="section">
<ul>
<li><strong>Model Name:</strong> ALBERT for Token Classification</li>
<li><strong>Pretrained Model:</strong> albert_base_token_classifier_conll03</li>
<li><strong>Training Dataset:</strong> CoNLL-03</li>
<li><strong>Languages Supported:</strong> English</li>
<li><strong>Use Cases:</strong>
<ul>
<li>Named Entity Recognition (NER)</li>
<li>Information Extraction</li>
<li>Text Categorization</li>
</ul>
</li>
<li><strong>Performance:</strong> High accuracy with a focus on memory efficiency</li>
<li><strong>Implementation:</strong> Spark NLP</li>
<li><strong>Resource Requirements:</strong> Moderate computational resources; suitable for production environments with optimization</li>
</ul>
</div>
""", unsafe_allow_html=True)

# References Section
# Fixed: the model link previously pointed at the unrelated QA model page
# (albert_base_qa_squad2); it now points at the token-classifier model used above.
st.markdown('<div class="sub-title">References</div>', unsafe_allow_html=True)
st.markdown("""
<div class="section">
<ul>
<li><a class="link" href="https://arxiv.org/abs/1909.11942" target="_blank">Lan, Z., Chen, J., Goodman, S., Gimpel, K., Sharma, P., & Soricut, R. (2019). ALBERT: A Lite BERT for Self-supervised Learning of Language Representations. arXiv preprint arXiv:1909.11942.</a></li>
<li><a class="link" href="https://github.com/google-research/albert" target="_blank">Google Research's ALBERT GitHub Repository</a></li>
<li><a class="link" href="https://sparknlp.org/2021/12/16/albert_base_token_classifier_conll03_en.html" target="_blank">Spark NLP Model - albert_base_token_classifier_conll03</a></li>
<li><a class="link" href="https://nlp.stanford.edu/projects/conll2003/" target="_blank">CoNLL-03 Named Entity Recognition Dataset</a></li>
</ul>
</div>
""", unsafe_allow_html=True)
# --- Tab 2: ALBERT for Sequence Classification (AG News) -------------------
# Renders the sequence-classification demo: overview, usage pipeline,
# sample output, benchmark metrics, model info and references.
with tab2:
    st.markdown("""
<div class="section">
<h2>ALBERT for Sequence Classification</h2>
<p>The <strong>AlbertForSequenceClassification</strong> annotator is tailored for tasks like sentiment analysis or multi-class text classification using the ALBERT model. This model efficiently handles sequence classification, achieving state-of-the-art performance with reduced parameters compared to BERT.</p>
<p>Sequence classification with ALBERT enables:</p>
<ul>
<li><strong>Sentiment Analysis:</strong> Determining the sentiment expressed in text, such as positive, negative, or neutral.</li>
<li><strong>Multi-Class Text Classification:</strong> Categorizing text into predefined classes, such as news categories or topics.</li>
<li><strong>Document Classification:</strong> Enhancing search and categorization of documents based on content classification.</li>
</ul>
<p>Here is an example of how ALBERT sequence classification works:</p>
<table class="benchmark-table">
<tr>
<th>Text</th>
<th>Label</th>
</tr>
<tr>
<td>Disney Comics was a comic book publishing company operated by The Walt Disney Company which ran from 1990 to 1993.</td>
<td>Business</td>
</tr>
</table>
</div>
""", unsafe_allow_html=True)

    # ALBERT Sequence Classification - AG News
    st.markdown('<div class="sub-title">ALBERT Sequence Classification - AG News</div>', unsafe_allow_html=True)
    st.markdown("""
<div class="section">
<p>The <strong>albert_base_sequence_classifier_ag_news</strong> is a fine-tuned ALBERT model for sequence classification tasks, specifically adapted for text classification on the AG News dataset. It recognizes four categories: Business, Sci/Tech, Sports, and World.</p>
</div>
""", unsafe_allow_html=True)

    # How to Use the Model - Sequence Classification
    st.markdown('<div class="sub-title">How to Use the Model</div>', unsafe_allow_html=True)
    st.code('''
from sparknlp.base import *
from sparknlp.annotator import *
from pyspark.ml import Pipeline
from pyspark.sql.functions import col, expr

document_assembler = DocumentAssembler() \\
    .setInputCol('text') \\
    .setOutputCol('document')

tokenizer = Tokenizer() \\
    .setInputCols(['document']) \\
    .setOutputCol('token')

sequenceClassifier = AlbertForSequenceClassification \\
    .pretrained('albert_base_sequence_classifier_ag_news', 'en') \\
    .setInputCols(['token', 'document']) \\
    .setOutputCol('class') \\
    .setCaseSensitive(False) \\
    .setMaxSentenceLength(512)

pipeline = Pipeline(stages=[
    document_assembler,
    tokenizer,
    sequenceClassifier
])

example = spark.createDataFrame([["Disney Comics was a comic book publishing company operated by The Walt Disney Company which ran from 1990 to 1993."]]).toDF("text")
result = pipeline.fit(example).transform(example)

result.select(
    expr("explode(class) as classification_result")
).select(
    col("classification_result.result").alias("category")
).show(truncate=False)
''', language='python')

    # Results
    st.text("""
+---------+
|category |
+---------+
|Business |
+---------+
""")

    # Performance Metrics
    st.markdown('<div class="sub-title">Performance Metrics</div>', unsafe_allow_html=True)
    st.markdown("""
<div class="section">
<p>Here are the detailed performance metrics for the ALBERT sequence classification model on the AG News dataset:</p>
<table class="benchmark-table">
<tr>
<th>Metric</th>
<th>Score</th>
</tr>
<tr>
<td>Accuracy</td>
<td>0.9472</td>
</tr>
<tr>
<td>F1-Score</td>
<td>0.9472</td>
</tr>
<tr>
<td>Precision</td>
<td>0.9472</td>
</tr>
<tr>
<td>Recall</td>
<td>0.9472</td>
</tr>
<tr>
<td>Evaluation Loss</td>
<td>0.1882</td>
</tr>
</table>
</div>
""", unsafe_allow_html=True)

    # Model Info Section
    st.markdown('<div class="sub-title">Model Info</div>', unsafe_allow_html=True)
    st.markdown("""
<div class="section">
<ul>
<li><strong>Model Name:</strong> ALBERT for Sequence Classification</li>
<li><strong>Pretrained Model:</strong> albert_base_sequence_classifier_ag_news</li>
<li><strong>Training Dataset:</strong> AG News</li>
<li><strong>Languages Supported:</strong> English</li>
<li><strong>Use Cases:</strong>
<ul>
<li>Sentiment Analysis</li>
<li>Multi-Class Text Classification</li>
<li>Document Classification</li>
</ul>
</li>
<li><strong>Performance:</strong> High accuracy with a focus on memory efficiency</li>
<li><strong>Implementation:</strong> Spark NLP</li>
<li><strong>Resource Requirements:</strong> Moderate computational resources; suitable for production environments with optimization</li>
</ul>
</div>
""", unsafe_allow_html=True)

    # References Section
    st.markdown('<div class="sub-title">References</div>', unsafe_allow_html=True)
    st.markdown("""
<div class="section">
<ul>
<li><a class="link" href="https://arxiv.org/abs/1909.11942" target="_blank">Lan, Z., Chen, J., Goodman, S., Gimpel, K., Sharma, P., & Soricut, R. (2019). ALBERT: A Lite BERT for Self-supervised Learning of Language Representations. arXiv preprint arXiv:1909.11942.</a></li>
<li><a class="link" href="https://github.com/google-research/albert" target="_blank">Google Research's ALBERT GitHub Repository</a></li>
<li><a class="link" href="https://sparknlp.org/2021/12/16/albert_base_sequence_classifier_ag_news_en.html" target="_blank">Spark NLP Model - albert_base_sequence_classifier_ag_news</a></li>
<li><a class="link" href="https://huggingface.co/datasets/ag_news" target="_blank">AG News Dataset</a></li>
</ul>
</div>
""", unsafe_allow_html=True)
# --- Tab 3: ALBERT for Question Answering (SQuAD2) -------------------------
# Renders the QA demo: overview, usage pipeline, sample output,
# benchmark metrics, model info and references.
with tab3:
    st.markdown("""
<div class="section">
<h2>ALBERT for Question Answering</h2>
<p>The <strong>AlbertForQuestionAnswering</strong> annotator is specialized for tasks involving Question Answering (QA) using the ALBERT model. This model efficiently processes question-context pairs to provide accurate answers, making it ideal for QA systems and information retrieval applications.</p>
<p>Question Answering with ALBERT enables:</p>
<ul>
<li><strong>Information Retrieval:</strong> Extracting precise answers from large text corpora based on user queries.</li>
<li><strong>Knowledge Management:</strong> Enhancing customer support and information systems by providing accurate answers.</li>
<li><strong>Contextual Understanding:</strong> Leveraging ALBERT’s capabilities to understand the context of questions and provide relevant answers.</li>
</ul>
<p>Here is an example of how ALBERT question answering works:</p>
<table class="benchmark-table">
<tr>
<th>Question</th>
<th>Context</th>
<th>Answer</th>
</tr>
<tr>
<td>What is my name?</td>
<td>My name is Clara and I live in Berkeley.</td>
<td>Clara</td>
</tr>
</table>
</div>
""", unsafe_allow_html=True)

    # ALBERT Question Answering - SQuAD2
    st.markdown('<div class="sub-title">ALBERT Question Answering - SQuAD2</div>', unsafe_allow_html=True)
    st.markdown("""
<div class="section">
<p>The <strong>albert_base_qa_squad2</strong> is a fine-tuned ALBERT model for Question Answering tasks, specifically adapted for the SQuAD2 dataset. It is capable of answering questions based on the provided context with high accuracy.</p>
</div>
""", unsafe_allow_html=True)

    # How to Use the Model - Question Answering
    st.markdown('<div class="sub-title">How to Use the Model</div>', unsafe_allow_html=True)
    st.code('''
from sparknlp.base import *
from sparknlp.annotator import *
from pyspark.ml import Pipeline
from pyspark.sql.functions import col, expr

documentAssembler = MultiDocumentAssembler() \\
    .setInputCols(["question", "context"]) \\
    .setOutputCols(["document_question", "document_context"])

spanClassifier = AlbertForQuestionAnswering.pretrained("albert_base_qa_squad2","en") \\
    .setInputCols(["document_question", "document_context"]) \\
    .setOutputCol("answer") \\
    .setCaseSensitive(False)

pipeline = Pipeline(stages=[documentAssembler, spanClassifier])

data = spark.createDataFrame([["What is my name?", "My name is Clara and I live in Berkeley."]]).toDF("question", "context")
result = pipeline.fit(data).transform(data)

result.select(
    col("answer.result").alias("predicted_answer")
).show(truncate=False)
''', language='python')

    # Results (column padding restored to match the header width)
    st.text("""
+----------------+
|predicted_answer|
+----------------+
|[clara]         |
+----------------+
""")

    # Performance Metrics
    st.markdown('<div class="sub-title">Performance Metrics</div>', unsafe_allow_html=True)
    st.markdown("""
<div class="section">
<p>The performance metrics of the ALBERT question answering model on a development subset of the SQuAD2 dataset are:</p>
<table class="benchmark-table">
<tr>
<th>Metric</th>
<th>Score</th>
</tr>
<tr>
<td>Exact Match</td>
<td>78.71%</td>
</tr>
<tr>
<td>F1 Score</td>
<td>81.89%</td>
</tr>
<tr>
<td>Total</td>
<td>6078</td>
</tr>
<tr>
<td>HasAns Exact Match</td>
<td>75.40%</td>
</tr>
<tr>
<td>HasAns F1 Score</td>
<td>82.04%</td>
</tr>
<tr>
<td>HasAns Total</td>
<td>2910</td>
</tr>
<tr>
<td>NoAns Exact Match</td>
<td>81.76%</td>
</tr>
<tr>
<td>NoAns F1 Score</td>
<td>81.76%</td>
</tr>
<tr>
<td>NoAns Total</td>
<td>3168</td>
</tr>
<tr>
<td>Best Exact Match</td>
<td>78.73%</td>
</tr>
<tr>
<td>Best F1 Score</td>
<td>81.91%</td>
</tr>
</table>
</div>
""", unsafe_allow_html=True)

    # Model Info Section
    st.markdown('<div class="sub-title">Model Info</div>', unsafe_allow_html=True)
    st.markdown("""
<div class="section">
<ul>
<li><strong>Model Name:</strong> ALBERT for Question Answering</li>
<li><strong>Pretrained Model:</strong> albert_base_qa_squad2</li>
<li><strong>Training Dataset:</strong> SQuAD2</li>
<li><strong>Languages Supported:</strong> English</li>
<li><strong>Use Cases:</strong>
<ul>
<li>Information Retrieval</li>
<li>Knowledge Management</li>
<li>Contextual Understanding</li>
</ul>
</li>
<li><strong>Performance:</strong> High accuracy with optimized resource usage</li>
<li><strong>Implementation:</strong> Spark NLP</li>
<li><strong>Resource Requirements:</strong> Moderate computational resources; suitable for production environments</li>
</ul>
</div>
""", unsafe_allow_html=True)

    # References Section
    st.markdown('<div class="sub-title">References</div>', unsafe_allow_html=True)
    st.markdown("""
<div class="section">
<ul>
<li><a class="link" href="https://arxiv.org/abs/1909.11942" target="_blank">Lan, Z., Chen, J., Goodman, S., Gimpel, K., Sharma, P., & Soricut, R. (2019). ALBERT: A Lite BERT for Self-supervised Learning of Language Representations. arXiv preprint arXiv:1909.11942.</a></li>
<li><a class="link" href="https://sparknlp.org/2022/06/15/albert_base_qa_squad2_en_3_0.html" target="_blank">Spark NLP Model - albert_base_qa_squad2</a></li>
</ul>
</div>
""", unsafe_allow_html=True)
# --- Community & Support footer --------------------------------------------
# Links to Spark NLP's website, Slack and GitHub.
# NOTE(review): indentation was lost in the source; this section is emitted at
# top level — confirm against the original whether it belongs inside tab3.
st.markdown('<div class="sub-title">Community & Support</div>', unsafe_allow_html=True)
st.markdown("""
<div class="section">
<ul>
<li><a class="link" href="https://sparknlp.org/" target="_blank">Official Website</a>: Documentation and examples</li>
<li><a class="link" href="https://join.slack.com/t/spark-nlp/shared_invite/zt-198dipu77-L3UWNe_AJ8xqDk0ivmih5Q" target="_blank">Slack</a>: Live discussion with the community and team</li>
<li><a class="link" href="https://github.com/JohnSnowLabs/spark-nlp" target="_blank">GitHub</a>: Bug reports, feature requests, and contributions</li>
</ul>
</div>
""", unsafe_allow_html=True)