"""Streamlit demo: detect and normalize date expressions with Spark NLP's
DateMatcher, anchored at a user-selected reference date."""

import os

import pandas as pd
import streamlit as st

import sparknlp
from sparknlp.base import *
from sparknlp.annotator import *
from pyspark.ml import Pipeline
from sparknlp.pretrained import PretrainedPipeline

# Page configuration
st.set_page_config(
    layout="wide",
    page_title="Spark NLP Demos App",
    initial_sidebar_state="auto",
)

# CSS for styling (content was stripped during extraction — restore the
# original <style> rules here if available).
st.markdown("""
""", unsafe_allow_html=True)


@st.cache_resource
def init_spark():
    """Start (or attach to) the Spark session once per server process."""
    return sparknlp.start()


@st.cache_resource
def create_pipeline(year, month, day):
    """Build a document -> sentence -> DateMatcher pipeline.

    The pipeline is cached keyed on (year, month, day), so changing the
    sidebar reference date rebuilds the DateMatcher stage. The anchor
    date is what relative expressions ("next Friday", "in 2 days") are
    resolved against.

    Returns:
        An unfitted pyspark.ml Pipeline.
    """
    document_assembler = DocumentAssembler() \
        .setInputCol("text") \
        .setOutputCol("document")

    sentence_detector = SentenceDetector() \
        .setInputCols(["document"]) \
        .setOutputCol("sentence")

    date_matcher = DateMatcher() \
        .setInputCols(['sentence']) \
        .setOutputCol("date") \
        .setAnchorDateYear(year) \
        .setAnchorDateMonth(month) \
        .setAnchorDateDay(day)

    return Pipeline(stages=[document_assembler, sentence_detector, date_matcher])


def fit_data(pipeline, data):
    """Fit `pipeline` on an empty frame and annotate `data` locally.

    Relies on the module-level `spark` session created below. Returns the
    list produced by LightPipeline.fullAnnotate.
    """
    empty_df = spark.createDataFrame([['']]).toDF('text')
    pipeline_model = pipeline.fit(empty_df)
    model = LightPipeline(pipeline_model)
    return model.fullAnnotate(data)


def _load_examples(folder):
    """Return the second line of every .txt file in `folder`.

    Files are opened with a context manager so handles are closed
    deterministically (the original comprehension leaked them), and
    filenames are sorted so the example order is stable across reruns.
    """
    examples = []
    for filename in sorted(os.listdir(folder)):
        if not filename.endswith('.txt'):
            continue
        with open(os.path.join(folder, filename), 'r', encoding='utf-8') as fh:
            lines = fh.readlines()
        if len(lines) >= 2:
            examples.append(lines[1].strip())
    return examples


# Page title (original HTML wrapper was stripped during extraction).
st.markdown(
    'State-of-the-Art Date Detecting and normalization with Spark NLP',
    unsafe_allow_html=True,
)
st.write("")

# Sidebar: reference (anchor) date for resolving relative date expressions.
# Renamed from `date` so the results loop below cannot shadow it.
ref_date = st.sidebar.date_input('Select reference date')

# Reference notebook link in sidebar (anchor/badge HTML was stripped during
# extraction — restore the Colab <a href=...> markup if available).
link = """
Open In Colab
"""
st.sidebar.title('')
st.sidebar.markdown('Reference notebook:')
st.sidebar.markdown(link, unsafe_allow_html=True)

# Load examples from files (plain string: the original f-string had no
# placeholders).
folder_path = "inputs/date_matcher"
examples = _load_examples(folder_path)

st.subheader("Automatically detect phrases expressing dates and normalize them with respect to a reference date.")

selected_text = st.selectbox("Select an example", examples)
custom_input = st.text_input("Try it with your own Sentence!")
text_to_analyze = custom_input if custom_input else selected_text

st.subheader('Full example text')
st.write(text_to_analyze)

# Initialize Spark and create pipeline
spark = init_spark()
pipeline = create_pipeline(ref_date.year, ref_date.month, ref_date.day)
output = fit_data(pipeline, text_to_analyze)

# Display matched sentences
st.subheader("Dates matched:")

rows = []
for result in output:
    sentences = result['sentence']
    for date_ann in result['date']:
        sent = sentences[int(date_ann.metadata['sentence'])]
        # NOTE(review): Spark NLP annotation begin/end are offsets into the
        # original document, not the sentence. The original sliced the
        # sentence text with document offsets, which is only correct for a
        # sentence starting at offset 0; re-base onto the sentence start so
        # chunks in later sentences are extracted correctly.
        start = date_ann.begin - sent.begin
        stop = date_ann.end - sent.begin + 1
        rows.append({
            'text/chunk': sent.result[start:stop],
            'mapped_date': date_ann.result,
        })

df = pd.DataFrame(rows)
df.index += 1  # 1-based row numbering for display
st.dataframe(df)