import streamlit as st
import sparknlp
from sparknlp.base import *
from sparknlp.annotator import *
from sparknlp.pretrained import *
from pyspark.ml import Pipeline
import pandas as pd
# Page configuration
st.set_page_config(
    layout="wide",
    page_title="Spark NLP Demos App",
    initial_sidebar_state="auto"
)
# CSS for styling
st.markdown("""
""", unsafe_allow_html=True)
# Initialize Spark NLP
@st.cache_resource
def init_spark():
    # Start (or attach to) a Spark session with Spark NLP loaded; cached so it runs only once.
    return sparknlp.start()
# Create Spark NLP pipeline
@st.cache_resource
def create_pipeline(n):
    # DocumentAssembler -> Tokenizer -> NGramGenerator: produces n-grams of size n from the tokens.
    document_assembler = DocumentAssembler().setInputCol("text").setOutputCol("document")
    tokenizer = Tokenizer().setInputCols(["document"]).setOutputCol("token")
    ngram = NGramGenerator().setN(n).setInputCols(["token"]).setOutputCol("ngrams")
    pipeline = Pipeline(stages=[document_assembler, tokenizer, ngram])
    # Fit on an empty DataFrame (no trainable stages) and wrap the model in a
    # LightPipeline so single texts can be annotated in memory.
    empty_df = spark.createDataFrame([[""]]).toDF("text")
    model = pipeline.fit(empty_df)
    light_pipeline = LightPipeline(model)
    return light_pipeline
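# Illustration only (not executed by the app), assuming the NGramGenerator's default
# behaviour of joining consecutive tokens with a space: with n=2, the sentence
# "Spark NLP is scalable" would yield the bigrams "Spark NLP", "NLP is", "is scalable".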
# Function to fit data to the pipeline and get results
@st.cache_resource
def fit_data(_light_pipeline, data):
    # The leading underscore tells Streamlit not to hash the (unhashable) pipeline argument.
    return _light_pipeline.fullAnnotate(data)
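# fullAnnotate returns one dict per input text, keyed by each stage's output column;
# e.g. output[0]['ngrams'] is a list of Annotation objects whose .result holds the n-gram string.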
# Set up the page layout
st.markdown('<h1>State-of-the-Art NGram Generation with Spark NLP</h1>', unsafe_allow_html=True)
st.markdown(
    "<p>Generate meaningful n-grams from text data using Spark NLP's efficient and scalable "
    "NGramGenerator, capturing context and identifying key phrases even in large-scale, noisy datasets.</p>",
    unsafe_allow_html=True
)
st.write("")
# Sidebar configuration
ngram_selection_list = {"Unigram": 1, "Bigram": 2, "Trigram": 3}
ngram_choice = st.sidebar.selectbox(
    "Choose an n-gram size",
    list(ngram_selection_list.keys()),
    index=1,
    help="For more info about the models visit: https://sparknlp.org/models"
)
# Add the Colab link for the notebook
colab_link = """
"""
st.sidebar.title('Reference notebook:')
st.sidebar.markdown(colab_link, unsafe_allow_html=True)
# Sample texts for n-gram generation
examples = [
"Brexit: U.K to ban more EU citizens with criminal records. In a recent Home Office meeting a consensus has been made to tighten border policies. However a no-deal Brexit could make it harder to identify foreign criminals; With the U.K in a transition period since it formally left the EU in January, an EU citizen can currently only be refused entry if they present a genuine, present and serious threat.",
"Harry Harding on the US, China, and a ‘Cold War 2.0’. “Calling it a second Cold War is misleading, but to deny that it’s a Cold War is also disingenuous.”, Harding is a specialist on Asia and U.S.-Asian relations. His major publications include Organizing China: The Problem of Bureaucracy, 1949-1966.The phrase “new Cold War” is an example of the use of analogies in understanding the world.The world is a very complicated place.People like to find ways of coming to a clearer and simpler understanding.",
"Tesla’s latest quarterly numbers beat analyst expectations on both revenue and earnings per share, bringing in $8.77 billion in revenues for the third quarter.That’s up 39% from the year-ago period.Wall Street had expected $8.36 billion in revenue for the quarter, according to estimates published by CNBC. Revenue grew 30% year-on-year, something the company attributed to substantial growth in vehicle deliveries, and operating income also grew to $809 million, showing improving operating margins to 9.2%.",
"2020 is another year that is consistent with a rapidly changing Arctic.Without a systematic reduction in greenhouse gases, the likelihood of our first ‘ice-free’ summer will continue to increase by the mid-21st century;It is already well known that a smaller ice sheet means less of a white area to reflect the sun’s heat back into space. But this is not the only reason the Arctic is warming more than twice as fast as the global average",
"HBR: The world is changing in rapid, unprecedented ways, but one thing remains certain: as businesses look to embed lessons learned in recent months and to build enterprise resilience for the future, they are due for even more transformation.As such, most organizations are voraciously evaluating existing and future technologies to see if they’ll be able to deliver the innovation at scale that they’ll need to survive and thrive.However, technology should not be central to these transformation efforts; people should."
]
# User input for text selection
selected_text = st.selectbox("Select a sample text", examples)
custom_input = st.text_input("Try it for yourself!")
if custom_input:
    selected_text = custom_input
st.subheader('Selected Text')
st.write(selected_text)
# Run the pipeline and display results
spark = init_spark()
pipeline = create_pipeline(ngram_selection_list[ngram_choice])
output = fit_data(pipeline, selected_text)
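# Note: the LightPipeline annotates the string in driver memory (no distributed Spark jobs),
# which keeps the demo responsive for short interactive inputs.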
# Display the generated n-grams
st.subheader('Generated N-Grams')
data = [ngram.result for ngram in output[0]['ngrams']]
df = pd.DataFrame(data)
df.index = df.index + 1
df.columns = ["N-Grams"]
st.dataframe(df)