import streamlit as st
import sparknlp
import os
import pandas as pd
from sparknlp.base import *
from sparknlp.annotator import *
from pyspark.ml import Pipeline
from sparknlp.pretrained import PretrainedPipeline
# Page configuration — collected in one mapping so the settings read as a unit.
_PAGE_SETTINGS = {
    "layout": "wide",
    "page_title": "Spark NLP Demos App",
    "initial_sidebar_state": "auto",
}
st.set_page_config(**_PAGE_SETTINGS)

# CSS for styling (placeholder — no custom styles are injected yet)
st.markdown("""
""", unsafe_allow_html=True)
# Initialize Spark NLP
@st.cache_resource
def init_spark():
    """Start (or reuse) the Spark NLP session; cached across Streamlit reruns."""
    session = sparknlp.start()
    return session
# Create Spark NLP pipeline
@st.cache_resource
def create_pipeline(year, month, day):
    """Build a date-matching pipeline anchored at the given reference date.

    The anchor date is what relative expressions ("next Friday", "2 days ago")
    are resolved against. Streamlit caches one pipeline per (year, month, day).
    """
    doc_stage = (
        DocumentAssembler()
        .setInputCol("text")
        .setOutputCol("document")
    )
    sentence_stage = (
        SentenceDetector()
        .setInputCols(["document"])
        .setOutputCol("sentence")
    )
    date_stage = (
        DateMatcher()
        .setInputCols(['sentence'])
        .setOutputCol("date")
        .setAnchorDateYear(year)
        .setAnchorDateMonth(month)
        .setAnchorDateDay(day)
    )
    return Pipeline(stages=[doc_stage, sentence_stage, date_stage])
# Fit data and get results
def fit_data(pipeline, data):
    """Fit *pipeline* on an empty frame and annotate *data* in-memory.

    NOTE(review): relies on the module-level `spark` session created later in
    this script — the script assigns `spark` before calling this function.
    """
    placeholder = spark.createDataFrame([['']]).toDF('text')
    fitted = pipeline.fit(placeholder)
    light = LightPipeline(fitted)
    return light.fullAnnotate(data)
# Set up the page layout
st.markdown('
State-of-the-Art Date Detecting and normalization with Spark NLP
', unsafe_allow_html=True)
st.write("")
# Sidebar content
date = st.sidebar.date_input('Select reference date')
# Reference notebook link in sidebar
link = """
"""
st.sidebar.title('')
st.sidebar.markdown('Reference notebook:')
st.sidebar.markdown(link, unsafe_allow_html=True)
# Load examples from files
folder_path = f"inputs/date_matcher"
examples = [
lines[1].strip()
for filename in os.listdir(folder_path)
if filename.endswith('.txt')
for lines in [open(os.path.join(folder_path, filename), 'r', encoding='utf-8').readlines()]
if len(lines) >= 2
]
st.subheader("Automatically detect phrases expressing dates and normalize them with respect to a reference date.")
selected_text = st.selectbox("Select an example", examples)
custom_input = st.text_input("Try it with your own Sentence!")
text_to_analyze = custom_input if custom_input else selected_text
st.subheader('Full example text')
st.write(text_to_analyze)
# Initialize Spark and create pipeline
spark = init_spark()
pipeline = create_pipeline(date.year, date.month, date.day)
output = fit_data(pipeline, text_to_analyze)
# Display matched sentence
st.subheader("Dates matched:")
# Extracting the results into a list
data = []
for result in output:
sentences = result['sentence']
dates = result['date']
for date in dates:
sent = sentences[int(date.metadata['sentence'])]
data.append({
'text/chunk': sent.result[date.begin:date.end+1],
'mapped_date': date.result
})
df = pd.DataFrame(data)
df.index += 1
st.dataframe(df)