import streamlit as st
import sparknlp
import os
import pandas as pd

from sparknlp.base import *
from sparknlp.annotator import *
from pyspark.ml import Pipeline
from sparknlp.pretrained import PretrainedPipeline

# Page Configuration
st.set_page_config(
    layout="wide", 
    initial_sidebar_state="auto"
)

# Custom CSS for Styling
st.markdown("""

    <style>

        .main-title {

            font-size: 36px;

            color: #4A90E2;

            font-weight: bold;

            text-align: center;

        }

        .section-content {

            background-color: #f9f9f9;

            padding: 10px;

            border-radius: 10px;

            margin-top: 10px;

        }

        .section-content p, .section-content ul {

            color: #666666;

        }

    </style>

""", unsafe_allow_html=True)

# Initialize Spark Session
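# sparknlp.start() creates (or reuses) a SparkSession with the Spark NLP dependencies attached;
# st.cache_resource keeps a single session alive for the lifetime of the app.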
@st.cache_resource
def start_spark_session():
    return sparknlp.start()

# Create NLP Pipeline
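# Two stages: a DocumentAssembler that wraps raw text into document annotations,
# and a pretrained T5Transformer that applies the chosen task prefix (e.g. "summarize:") to each input.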
@st.cache_resource
def build_nlp_pipeline(model_name, task):
    document_assembler = DocumentAssembler()\
        .setInputCol("text")\
        .setOutputCol("document")
    
    t5_transformer = T5Transformer.pretrained(model_name, 'en') \
        .setTask(task) \
        .setInputCols(["document"]) \
        .setOutputCol("output")

    pipeline = Pipeline().setStages([document_assembler, t5_transformer])
    return pipeline

# Apply Pipeline to Text Data
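# Wrap the input string in a single-row DataFrame, run the pipeline, and collect
# the T5 output column. Uses the global `spark` session created further below,
# before this function is first called.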
def process_text(pipeline, text):
    df = spark.createDataFrame([[text]]).toDF("text")
    result = pipeline.fit(df).transform(df)
    return result.select('output.result').collect()

# Model and Task Information
model_info = [
    {
        "model_name": "t5_small",
        "title": "Multi-Task NLP Model",
        "description": "The T5 model performs 18 different NLP tasks including summarization, question answering, and grammatical correctness detection."
    },
    {
        "model_name": "t5_base",
        "title": "Multi-Task NLP Model",
        "description": "A larger variant of the T5 model, capable of performing a variety of NLP tasks with improved accuracy."
    },
    {
        "model_name": "google_t5_small_ssm_nq",
        "title": "Question Answering Model",
        "description": "This model is fine-tuned for answering questions based on the Natural Questions dataset, leveraging pre-training on large text corpora."
    }
]

task_descriptions = {
  'Sentence Classification - cola': "Classify whether a sentence is grammatically correct.",
  'Natural Language Inference - rte': "Given two text fragments, recognize whether the meaning of one can be inferred (entailed) from the other.",
  'Natural Language Inference - mnli': "Classify whether a premise entails, contradicts, or is neutral with respect to a hypothesis (3 classes).",
  'Natural Language Inference - qnli': "Classify whether the answer to a question can be deduced from an answer candidate.",
  'Natural Language Inference - cb': "Classify whether a premise and a hypothesis contradict each other or not (binary).",
  'Coreference Resolution - mrpc': "Classify whether a pair of sentences are paraphrases of each other (semantically equivalent).",
  'Coreference Resolution - qqp': "Classify whether a pair of questions are paraphrases of each other (semantically equivalent).",
  'Sentiment Analysis - sst2': "Classify the sentiment of a sentence as positive or negative.",
  'Sentiment Analysis - stsb': "Measure how similar two sentences are on a scale from 0 to 5.",
  'Question Answering - copa': "Given a question, a premise, and two choices, classify which choice is correct (binary).",
  'Question Answering  - multirc': "Given a question, a paragraph of text, and an answer candidate, classify whether the answer is correct (binary).",
  'Question Answering - squad': "Answer a question for a given context.",
  'Word Sense Disambiguation - wic': "Given a pair of sentences and an ambiguous word, classify whether the word has the same meaning in both sentences.",
  'Text - summarization': "Summarize text into a shorter representation.",
  'Translation - wmt1': "Translate text from English to German.",
  'Translation - wmt2': "Translate text from English to French.",
  'Translation - wmt3': "Translate text from English to Romanian."
}

# Sidebar: Task and Model Selection
selected_task = st.sidebar.selectbox("Choose an NLP Task", list(task_descriptions.keys()))
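# T5 expects a task prefix such as "cola:" or "squad:"; it is taken from the part of the label after " - ".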
task_for_pipeline = f"{selected_task.split(' - ')[-1]}:"

available_models = ['google_t5_small_ssm_nq'] if "Question Answering" in selected_task else ['t5_base', 't5_small']
selected_model = st.sidebar.selectbox("Choose a Model", available_models)

# Get Model Info
model_details = next((info for info in model_info if info['model_name'] == selected_model), None)
app_title = model_details['title'] if model_details else "Unknown Model"
app_description = model_details['description'] if model_details else "No description available."

# Display Model Info
st.markdown(f'<div class="main-title">{app_title}</div>', unsafe_allow_html=True)
st.markdown(f'<div class="section-content"><p>{app_description}</p></div>', unsafe_allow_html=True)
st.subheader(task_descriptions[selected_task])

# Load Example Texts
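# Example sentences are expected as .txt files under inputs/<task>/<model>/, one example per line.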
example_folder = f"inputs/{selected_task}/{selected_model}"
example_texts = []
for file in os.listdir(example_folder):
    if file.endswith('.txt'):
        with open(os.path.join(example_folder, file), 'r', encoding='utf-8') as f:
            example_texts.extend(line.strip() for line in f)

# User Input: Select or Enter Text
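# Custom text, when provided, takes precedence over the selected example.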
selected_example = st.selectbox("Select an Example", example_texts)
custom_input = st.text_input("Or enter your own text:")

text_to_process = custom_input if custom_input else selected_example

# Display Selected Text
st.subheader('Selected Text')
st.markdown(f'<div class="section-content">{text_to_process}</div>', unsafe_allow_html=True)

# Sidebar: Reference Notebook
st.sidebar.markdown('Reference notebook:')
st.sidebar.markdown("""

<a href="https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/streamlit_notebooks/T5TRANSFORMER.ipynb">

    <img src="https://colab.research.google.com/assets/colab-badge.svg" style="zoom: 1.3" alt="Open In Colab"/>

</a>

""", unsafe_allow_html=True)

# Special Cases for Translation Tasks
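# Remap the generic "wmtN:" prefixes to the full translation prompts the T5 checkpoints were trained with.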
task_for_pipeline = {
    'wmt1:': 'translate English to German:',
    'wmt2:': 'translate English to French:',
    'wmt3:': 'translate English to Romanian:'
}.get(task_for_pipeline, task_for_pipeline)

# Initialize Spark, Build Pipeline, and Process Text
spark = start_spark_session()
nlp_pipeline = build_nlp_pipeline(selected_model, task_for_pipeline)
processed_output = process_text(nlp_pipeline, text_to_process)

# Display Processed Output
st.subheader("Processed Output")
output_text = "".join(processed_output[0][0])
st.markdown(f'<div class="section-content">{output_text}</div>', unsafe_allow_html=True)