"""Streamlit demo page that compares two passages of text.

The page highlights the tokens the two passages share and reports three
similarity scores: textual (token sequence), linguistic (part-of-speech
sequence) and semantic (spaCy document vectors).
"""
import difflib

import spacy
import streamlit as st


def _cache_model_loader(func):
    """Cache *func* with ``st.cache_resource``, falling back to ``st.cache``.

    ``st.cache`` is deprecated in newer Streamlit releases in favour of
    ``st.cache_resource``; releases that predate ``st.cache_resource`` only
    offer ``st.cache``, so both are supported here.
    """
    cache_resource = getattr(st, 'cache_resource', None)
    if cache_resource is not None:
        return cache_resource(func)
    return st.cache(allow_output_mutation=True)(func)


@_cache_model_loader
def load_model():
    """Load the spaCy ``en_core_web_md`` pipeline (cached across reruns)."""
    return spacy.load('en_core_web_md')


# Layout: page configuration must come before any other rendering call.
st.set_page_config(
    page_title="Compare Demo",
    page_icon="🔗",
    layout="wide",
    initial_sidebar_state="expanded",
    menu_items={
        'Get Help': 'mailto:hello@simplexico.ai',
        'Report a bug': None,
        'About': "## This a demo showcasing different Legal AI Actions"
    }
)

st.title('🔗 Compare Demo')
st.write("""
This demo shows how AI can be used to compare passages of text.
""")
st.write("**👈 Enter two passages of text on the left** and hit the button **Compare** to see the demo in action")

with st.spinner('⚙️ Loading model...'):
    nlp = load_model()

EXAMPLE_TEXT_1 = """This Agreement shall be governed by and interpreted under the laws of the State of Delaware without regard to its conflicts of law provisions."""
EXAMPLE_TEXT_2 = """This agreement will be governed by and must be construed in accordance with the laws of the State of Israel."""

text_1 = st.sidebar.text_area('Enter a passage of text', value=EXAMPLE_TEXT_1, height=150, key='input1')
text_2 = st.sidebar.text_area('Enter a second passage of text', value=EXAMPLE_TEXT_2, height=150, key='input2')
button = st.sidebar.button('Compare', type='primary', use_container_width=True)

# Characters that carry meaning inside Streamlit markdown / the ``:color[...]``
# directive; each is backslash-escaped so user text is rendered literally.
_MD_ESCAPES = str.maketrans({char: '\\' + char for char in '\\`*_[]$~'})


def get_tokens(doc):
    """Return a case-insensitive key for every token in *doc*.

    ``token.lower`` is spaCy's integer ID of the lowercased token text (the
    string form is ``token.lower_``); the IDs are hashable and compare equal
    exactly when the lowercased texts do, which is all SequenceMatcher needs.
    """
    return [token.lower for token in doc]


def add_md_color(text, match):
    """Wrap *text* in Streamlit's coloured-markdown directive.

    Args:
        text: Token text to display; markdown-significant characters are
            escaped so that e.g. a ``]`` token cannot close the directive.
        match: Truthy renders the text green, falsy renders it red.

    Returns:
        A string of the form ``:green[...]`` or ``:red[...]``.
    """
    color = 'green' if match else 'red'
    return f":{color}[{text.translate(_MD_ESCAPES)}]"


def create_str_output(doc, matching_idxs):
    """Render *doc* as space-joined coloured-markdown tokens.

    Args:
        doc: Iterable of spaCy tokens exposing ``.i`` and ``.text``.
        matching_idxs: Iterable of ``(start, end)`` half-open token index
            ranges that should be highlighted as matches.

    Returns:
        One markdown string with matched tokens green and the rest red.
    """
    # Flatten the ranges once so each token needs a single set lookup
    # instead of a scan over every range.
    matched = set()
    for start, end in matching_idxs:
        matched.update(range(start, end))
    return ' '.join(add_md_color(token.text, token.i in matched) for token in doc)


if button:
    # Empty passages would yield a meaningless 100% textual match and a
    # vector-less semantic similarity, so ask for input instead.
    if not text_1.strip() or not text_2.strip():
        st.warning('Please enter text in both passages before comparing.')
        st.stop()

    with st.spinner('⚙️ Comparing Texts...'):
        doc_1 = nlp(text_1)
        doc_2 = nlp(text_2)

    st.header('🧪 Comparison')
    st.markdown('We can highlight the :green[similarities] and :red[differences] across the two texts')

    # One token-level matcher serves both the highlighting and the
    # textual-similarity metric below.
    token_matcher = difflib.SequenceMatcher(None, get_tokens(doc_1), get_tokens(doc_2))

    # Each matching block is (start_in_doc_1, start_in_doc_2, length); the
    # trailing zero-length sentinel block produces an empty range.
    doc_1_matching_idxs = []
    doc_2_matching_idxs = []
    for a, b, n in token_matcher.get_matching_blocks():
        doc_1_matching_idxs.append((a, a + n))
        doc_2_matching_idxs.append((b, b + n))

    col1, col2 = st.columns(2)
    with col1:
        st.markdown(create_str_output(doc_1, doc_1_matching_idxs))
    with col2:
        st.markdown(create_str_output(doc_2, doc_2_matching_idxs))

    col1, col2, col3 = st.columns(3)

    with col1:
        # Simple sequence matching over the lowercased tokens.
        st.subheader('📑 Textual Similarity')
        st.markdown('We can measure the similarity based on the *wording* of the two texts.')
        st.metric(label='Textual Similarity', value=f"{token_matcher.ratio() * 100:.1f}%")

    with col2:
        # NOTE(review): this emoji's final byte was lost in the garbled
        # source; 📝 is a best guess -- confirm against the original file.
        st.subheader('📝 Linguistic Similarity')
        st.markdown(
            'We can measure the similarity based on the *linguistic features* of the two texts.')
        # Same sequence matching, but over coarse part-of-speech tags.
        postags_1 = [token.pos_ for token in doc_1]
        postags_2 = [token.pos_ for token in doc_2]
        pos_matcher = difflib.SequenceMatcher(None, postags_1, postags_2)
        st.metric(label='Linguistic Similarity', value=f"{pos_matcher.ratio() * 100:.1f}%")

    with col3:
        st.subheader('💭 Semantic Similarity')
        st.markdown('We can measure the similarity based on the *meaning* of the two texts.')
        st.metric(label='Semantic Similarity', value=f"{doc_1.similarity(doc_2) * 100:.1f}%")