|
import streamlit as st |
|
import sparknlp |
|
from sparknlp.base import * |
|
from sparknlp.annotator import * |
|
from sparknlp.pretrained import * |
|
from pyspark.ml import Pipeline |
|
import pandas as pd |
|
|
|
|
|
# Page-level configuration: browser-tab title, wide layout, automatic sidebar state.
st.set_page_config(
    page_title="Spark NLP Demos App",
    layout="wide",
    initial_sidebar_state="auto",
)
|
|
|
|
|
# Inject the custom stylesheet used by the HTML snippets rendered below
# (``.main-title`` for the page title, plus ``.section``, ``.box`` and ``h3`` rules).
# The ``h3`` rule uses ``text-align: center``; the previous value ``centre`` is not a
# valid CSS keyword, so that declaration had no effect.
st.markdown("""
<style>
    .main-title {
        font-size: 36px;
        color: #4A90E2;
        font-weight: bold;
        text-align: center;
    }
    .section p, .section ul {
        color: #666666;
    }
    .box {
        text-align: left;
        font-family: "IBM Plex Sans", sans-serif;
        font-weight: normal;
        width: 100%;
        box-sizing: border-box;
        position: relative;
        font-size: 14px !important;
        line-height: 26px !important;
        color: #536B76 !important;
    }
    h3 {
        text-align: center;
        box-sizing: border-box;
        padding: 0;
        margin: 25px 0 0 !important;
        font-family: 'Montserrat', sans-serif !important;
        font-weight: 500 !important;
        font-size: 18px !important;
        line-height: 22px;
        color: #1E77B7 !important;
    }
</style>
""", unsafe_allow_html=True)
|
|
|
|
|
@st.cache_resource
def init_spark():
    """Start Spark NLP and return whatever ``sparknlp.start()`` produces.

    The script uses the returned object as its Spark session. The function is
    wrapped in ``st.cache_resource`` and takes no arguments.
    """
    spark_session = sparknlp.start()
    return spark_session
|
|
|
|
|
@st.cache_resource
def create_pipeline(n):
    """Build a fitted n-gram pipeline and wrap it in a ``LightPipeline``.

    Stages, in order:
      1. ``DocumentAssembler``: column "text" -> "document"
      2. ``Tokenizer``: "document" -> "token"
      3. ``NGramGenerator``: "token" -> "ngrams", configured with ``setN(n)``

    Args:
        n: Value passed to ``NGramGenerator.setN``.

    Returns:
        A ``LightPipeline`` built from the fitted pipeline model.
    """
    document_assembler = (
        DocumentAssembler()
        .setInputCol("text")
        .setOutputCol("document")
    )
    tokenizer = (
        Tokenizer()
        .setInputCols(["document"])
        .setOutputCol("token")
    )
    ngram = (
        NGramGenerator()
        .setN(n)
        .setInputCols(["token"])
        .setOutputCol("ngrams")
    )
    pipeline = Pipeline(stages=[document_assembler, tokenizer, ngram])

    # Take the session from the cached initializer instead of relying on a
    # module-level ``spark`` name having been assigned before the first call.
    spark_session = init_spark()

    # Fit on a one-row placeholder frame holding an empty "text" value.
    placeholder_df = spark_session.createDataFrame([[""]]).toDF("text")
    model = pipeline.fit(placeholder_df)

    return LightPipeline(model)
|
|
|
|
|
def fit_data(_light_pipeline, data):
    """Annotate ``data`` with the given light pipeline.

    This function is deliberately not wrapped in a Streamlit cache. Streamlit
    excludes parameters whose names start with an underscore from the cache
    key, so a cached version would be keyed on ``data`` alone and would return
    the result of a previously used pipeline (for example a different n-gram
    size) whenever the same text is annotated again.

    Args:
        _light_pipeline: Object exposing ``fullAnnotate``.
        data: Input forwarded unchanged to ``fullAnnotate``.

    Returns:
        Whatever ``_light_pipeline.fullAnnotate(data)`` returns.
    """
    return _light_pipeline.fullAnnotate(data)
|
|
|
|
|
# Page header: the main title, then a one-sentence description rendered as an <h3>.
st.markdown(
    '<div class="main-title">State-of-the-Art NGram Generation with Spark NLP</div>',
    unsafe_allow_html=True,
)
st.markdown(
    "<h3>Generate meaningful n-grams from text data using Spark NLP's efficient and scalable NGramGenerator, capturing context and identifying key phrases even in large-scale, noisy datasets.</h3>",
    unsafe_allow_html=True,
)
# Empty element after the header (presumably for vertical spacing).
st.write("")
|
|
|
|
|
# Sidebar selector for the n-gram size: display label -> n.
NGram_selection_list = {"Unigram": 1, "bigram": 2, "trigram": 3}
NGram = st.sidebar.selectbox(
    label="Choose an NGram specification",
    options=list(NGram_selection_list),
    index=1,  # preselect the second label ("bigram")
    help="For more info about the models visit: https://sparknlp.org/models",
)
|
|
|
|
|
# Sidebar section linking to a reference notebook via an "Open In Colab" badge.
# NOTE(review): the URL points at a SentenceDetectorDL notebook, which looks like a
# leftover from another demo; confirm the intended notebook for this n-gram page.
colab_link = """
<a href="https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Public/9.SentenceDetectorDL.ipynb">
<img src="https://colab.research.google.com/assets/colab-badge.svg" style="zoom: 1.3" alt="Open In Colab"/>
</a>
"""
st.sidebar.title('Reference notebook:')
st.sidebar.markdown(colab_link, unsafe_allow_html=True)
|
|
|
|
|
# Sample passages offered in the "Select a sample text" dropdown.
examples = [
    "Brexit: U.K to ban more EU citizens with criminal records. In a recent Home Office meeting a consensus has been made to tighten border policies. However a no-deal Brexit could make it harder to identify foreign criminals; With the U.K in a transition period since it formally left the EU in January, an EU citizen can currently only be refused entry if they present a genuine, present and serious threat.",
    "Harry Harding on the US, China, and a ‘Cold War 2.0’. “Calling it a second Cold War is misleading, but to deny that it’s a Cold War is also disingenuous.”, Harding is a specialist on Asia and U.S.-Asian relations. His major publications include Organizing China: The Problem of Bureaucracy, 1949-1966.The phrase “new Cold War” is an example of the use of analogies in understanding the world.The world is a very complicated place.People like to find ways of coming to a clearer and simpler understanding.",
    "Tesla’s latest quarterly numbers beat analyst expectations on both revenue and earnings per share, bringing in $8.77 billion in revenues for the third quarter.That’s up 39% from the year-ago period.Wall Street had expected $8.36 billion in revenue for the quarter, according to estimates published by CNBC. Revenue grew 30% year-on-year, something the company attributed to substantial growth in vehicle deliveries, and operating income also grew to $809 million, showing improving operating margins to 9.2%.",
    "2020 is another year that is consistent with a rapidly changing Arctic.Without a systematic reduction in greenhouse gases, the likelihood of our first ‘ice-free’ summer will continue to increase by the mid-21st century;It is already well known that a smaller ice sheet means less of a white area to reflect the sun’s heat back into space. But this is not the only reason the Arctic is warming more than twice as fast as the global average",
    "HBR: The world is changing in rapid, unprecedented ways, but one thing remains certain: as businesses look to embed lessons learned in recent months and to build enterprise resilience for the future, they are due for even more transformation.As such, most organizations are voraciously evaluating existing and future technologies to see if they’ll be able to deliver the innovation at scale that they’ll need to survive and thrive.However, technology should not be central to these transformation efforts; people should.",
]
|
|
|
|
|
# Let the user pick one of the bundled samples or type their own text.
sample_text = st.selectbox("Select a sample text", examples)
custom_input = st.text_input("Try it for yourself!")

# Typed text takes precedence over the sample. Blank or whitespace-only input
# is treated as "nothing typed" so the sample is used instead of an input with
# no words in it.
if (custom_input or "").strip():
    selected_text = custom_input
else:
    selected_text = sample_text

st.subheader('Selected Text')
st.write(selected_text)
|
|
|
|
|
# Get the cached Spark session, build the pipeline for the chosen n-gram size,
# and annotate the selected text.
spark = init_spark()
# Named ``light_pipeline`` rather than ``Pipeline`` so the imported
# ``pyspark.ml.Pipeline`` class is not shadowed at module level.
light_pipeline = create_pipeline(NGram_selection_list[NGram])
output = fit_data(light_pipeline, selected_text)
|
|
|
|
|
st.subheader('Generated NGrams')

# Only one text was annotated, so its n-gram annotations are read from the
# first entry of ``output`` under the "ngrams" key.
ngram_texts = [annotation.result for annotation in output[0]['ngrams']]

# Name the column at construction time. Building an unnamed frame from an empty
# list gives zero columns, and assigning a one-element column list to it raises
# ValueError, which used to crash the page whenever no n-grams were returned.
# The index starts at 1 so rows are numbered from one.
df = pd.DataFrame(
    {"N-Grams": ngram_texts},
    index=pd.RangeIndex(start=1, stop=len(ngram_texts) + 1),
)
st.dataframe(df)
|
|