Spaces:

simplexico
/

legal-ai-actions

Runtime error

App Files Community

Uwais committed on Apr 15, 2023

Commit

aec7e41

1 Parent(s): 2df6c47

Update styling for the Find demo and add support for uploading PDFs

Browse files

Files changed (2) hide show

pages/5_🗂_Organise_Demo.py +2 -4
pages/6_🔎_Find_Demo.py +56 -57

pages/5_🗂_Organise_Demo.py CHANGED Viewed

@@ -46,7 +46,7 @@ Similar documents are grouped by color.
 \n**TIP:** Hover over each point to see the filename of the contract. Groups can be added or removed by clicking on the symbol in the plot legend.
 """)
-st.info("👈 Upload your own documents on the left (as .txt or .pdf files) to see how your own documents can be organised using AI.")
 @st.cache(allow_output_mutation=True)
@@ -127,7 +127,7 @@ def prepare_page():
 uploaded_files = st.sidebar.file_uploader("Upload your documents", accept_multiple_files=True,
                                           type=['pdf', 'txt'],
-                                          help="Upload your own documents. Don't worry we don't store any data.")
 # button = st.sidebar.button('Organise Contracts', type='primary', use_container_width=True)
@@ -176,8 +176,6 @@ if uploaded_files:
         st.plotly_chart(fig, use_container_width=True)
-add_email_signup_form()
 add_footer()
 streamlit_analytics.stop_tracking(unsafe_password=os.environ["ANALYTICS_PASSWORD"])

 \n**TIP:** Hover over each point to see the filename of the contract. Groups can be added or removed by clicking on the symbol in the plot legend.
 """)
+st.info("**👈 Upload your own documents on the left (as .txt or .pdf files)**")
 @st.cache(allow_output_mutation=True)
 uploaded_files = st.sidebar.file_uploader("Upload your documents", accept_multiple_files=True,
                                           type=['pdf', 'txt'],
+                                          help="Upload a set of .pdf or .txt files")
 # button = st.sidebar.button('Organise Contracts', type='primary', use_container_width=True)
         st.plotly_chart(fig, use_container_width=True)
 add_footer()
 streamlit_analytics.stop_tracking(unsafe_password=os.environ["ANALYTICS_PASSWORD"])

pages/6_🔎_Find_Demo.py CHANGED Viewed

@@ -7,7 +7,8 @@ import pandas as pd
 import streamlit as st
 import streamlit_analytics
-import  streamlit_toggle as tog
 from utils import add_logo_to_sidebar, add_footer, add_email_signup_form
@@ -29,12 +30,14 @@ EXAMPLE_TEXT = "the governing law is the State of Texas"
 streamlit_analytics.start_tracking()
 @st.cache(allow_output_mutation=True)
 def load_dataset():
     snapshot_download(repo_id=DATA_REPO_ID, token=HF_TOKEN, local_dir='./', repo_type='dataset')
     df = pd.read_json(DATA_FILENAME)
     return df
 @st.cache(allow_output_mutation=True)
 def generate_document_store(df):
     """Create haystack document store using contract clause data
@@ -46,7 +49,7 @@ def generate_document_store(df):
             {
                 'content': row['paragraph'],
                 'meta': {'contract_title': row['contract_title']}
-            }
         )
     document_store = InMemoryDocumentStore(use_bm25=True, embedding_dim=EMBEDDING_DIM, similarity='cosine')
@@ -55,25 +58,33 @@ def generate_document_store(df):
     return document_store
 def files_to_dataframe(uploaded_files, limit=10):
     texts = []
     titles = []
     for uploaded_file in uploaded_files[:limit]:
-        stringio = StringIO(uploaded_file.getvalue().decode("utf-8"))
-        text = stringio.read().strip()
-        paragraphs = text.split("\n\n")
         paragraphs = [p.strip() for p in paragraphs if len(p.split()) > 10]
         texts.extend(paragraphs)
-        titles.extend([uploaded_file.name]*len(paragraphs))
     return pd.DataFrame({'paragraph': texts, 'contract_title': titles})
 @st.cache(allow_output_mutation=True)
 def generate_bm25_retriever(document_store):
     return BM25Retriever(document_store)
 @st.cache(allow_output_mutation=True)
 def generate_embeddings(embedding_model, document_store):
     embedding_retriever = EmbeddingRetriever(
@@ -85,6 +96,7 @@ def generate_embeddings(embedding_model, document_store):
     document_store.update_embeddings(embedding_retriever)
     return embedding_retriever
 def process_query(query, retriever):
     """Generates dataframe with top ten results"""
     texts = []
@@ -95,22 +107,23 @@ def process_query(query, retriever):
         query=query,
         top_k=10,
     )
     for idx, document in enumerate(candidate_documents):
         texts.append(document.content)
         contract_titles.append(document.meta["contract_title"])
         scores.append(str(round(document.score, 2)))
         ranking.append(idx + 1)
     return pd.DataFrame(
         {
-            "Ranking": ranking,
             "Text": texts,
-            "Source Contract": contract_titles,
-            "Similarity": scores
         }
     )
 st.set_page_config(
     page_title="Find Demo",
     page_icon="🔎",
@@ -124,37 +137,37 @@ st.set_page_config(
 )
 add_logo_to_sidebar()
-st.sidebar.success("👆 Select a demo above.")
 st.title('🔎 Find Demo')
 st.write("""
-This demo shows how a set of clauses can be searched.
-Upload a set of contracts on the left and the paragraphs can be searched using **keywords** or using **semantic search**.
-Semantic search leverages an AI model which matches on clauses with a similar meaning to the input text.
 """)
-st.write("**👈 Upload a set of contracts on the left** to start the demo")
-#df = load_dataset()
-#document_store = generate_document_store(df)
-#bm25_retriever = generate_bm25_retriever(document_store)
-#embedding_retriever = generate_embeddings(EMBEDDING_MODEL, document_store)
-col1, col2, col3, col4, col5 = st.columns(5)
-uploaded_files = st.sidebar.file_uploader("Select contracts to search **(upload up to 10 files)**", accept_multiple_files=True)
 if uploaded_files:
     with col1:
-        st.write("Toggle between **keyword** or **semantic** search:")
         value = tog.st_toggle_switch(
-            label="Keyword/Semantic",
-            label_after=True,
-            inactive_color='#D3D3D3',
-            active_color="#11567f",
             track_color="#29B5E8"
         )
         if value:
@@ -162,14 +175,7 @@ if uploaded_files:
         else:
             search_type = "keyword"
-    print(value)
-    df = files_to_dataframe(uploaded_files)
-    document_store = generate_document_store(df)
-    bm25_retriever = generate_bm25_retriever(document_store)
-    st.write("**👇 Enter search query below** and hit the button **Find Clauses** to see the demo in action")
-    query = st.text_area(label='Enter Search Query', value=EXAMPLE_TEXT, height=250)
-    button = st.button('**Find Clauses**', type='primary', use_container_width=True)
     if button:
@@ -180,29 +186,22 @@ if uploaded_files:
             </style>
             """
-        st.subheader(f'Search Results ({search_type}):')
         # Inject CSS with Markdown
         st.markdown(hide_dataframe_row_index, unsafe_allow_html=True)
         if search_type == "keyword":
-            df_bm25 = process_query(query, bm25_retriever)
             st.table(df_bm25)
         if search_type == "semantic":
-            embedding_retriever = generate_embeddings(EMBEDDING_MODEL, document_store)
-            df_embed = process_query(query, embedding_retriever)
             st.table(df_embed)
-        # with col2:
-        #     st.subheader('Semantic Search Results:')
-        #     # Inject CSS with Markdown
-        #     st.markdown(hide_dataframe_row_index, unsafe_allow_html=True)
-        #     df_embed = process_query(query, embedding_retriever)
-        #     st.table(df_embed)
-add_email_signup_form()
-add_footer()
-streamlit_analytics.stop_tracking(unsafe_password=os.environ["ANALYTICS_PASSWORD"])

 import streamlit as st
 import streamlit_analytics
+import streamlit_toggle as tog
+from pypdf import PdfReader
 from utils import add_logo_to_sidebar, add_footer, add_email_signup_form
 streamlit_analytics.start_tracking()
 @st.cache(allow_output_mutation=True)
 def load_dataset():
     snapshot_download(repo_id=DATA_REPO_ID, token=HF_TOKEN, local_dir='./', repo_type='dataset')
     df = pd.read_json(DATA_FILENAME)
     return df
 @st.cache(allow_output_mutation=True)
 def generate_document_store(df):
     """Create haystack document store using contract clause data
             {
                 'content': row['paragraph'],
                 'meta': {'contract_title': row['contract_title']}
+            }
         )
     document_store = InMemoryDocumentStore(use_bm25=True, embedding_dim=EMBEDDING_DIM, similarity='cosine')
     return document_store
 def files_to_dataframe(uploaded_files, limit=10):
     texts = []
     titles = []
     for uploaded_file in uploaded_files[:limit]:
+        if '.pdf' in uploaded_file.name.lower():
+            reader = PdfReader(uploaded_file)
+            page_texts = [page.extract_text() for page in reader.pages]
+            text = "\n".join(page_texts).strip()
+        if '.txt' in uploaded_file.name.lower():
+            stringio = StringIO(uploaded_file.getvalue().decode("utf-8"))
+            text = stringio.read().strip()
+        paragraphs = text.split("\n")
         paragraphs = [p.strip() for p in paragraphs if len(p.split()) > 10]
         texts.extend(paragraphs)
+        titles.extend([uploaded_file.name] * len(paragraphs))
     return pd.DataFrame({'paragraph': texts, 'contract_title': titles})
 @st.cache(allow_output_mutation=True)
 def generate_bm25_retriever(document_store):
     return BM25Retriever(document_store)
 @st.cache(allow_output_mutation=True)
 def generate_embeddings(embedding_model, document_store):
     embedding_retriever = EmbeddingRetriever(
     document_store.update_embeddings(embedding_retriever)
     return embedding_retriever
 def process_query(query, retriever):
     """Generates dataframe with top ten results"""
     texts = []
         query=query,
         top_k=10,
     )
     for idx, document in enumerate(candidate_documents):
         texts.append(document.content)
         contract_titles.append(document.meta["contract_title"])
         scores.append(str(round(document.score, 2)))
         ranking.append(idx + 1)
     return pd.DataFrame(
         {
+            "Rank": ranking,
             "Text": texts,
+            "Source Document": contract_titles,
+            "Similarity Score": scores
         }
     )
 st.set_page_config(
     page_title="Find Demo",
     page_icon="🔎",
 )
 add_logo_to_sidebar()
 st.title('🔎 Find Demo')
 st.write("""
+This demo shows how a set of documents can be searched.
+Upload a set of documents on the left and the paragraphs can be searched using **keyword** or using **semantic** search.
+Semantic search leverages an AI model which matches on paragraphs with a similar meaning to the input text.
 """)
+st.info("**👈 Upload a set of documents on the left**")
+uploaded_files = st.sidebar.file_uploader("Upload a set of documents **(upload up to 10 files)**",
+                                          type=['pdf', 'txt'],
+                                          help='Upload a set of .pdf or .txt files',
+                                          accept_multiple_files=True)
 if uploaded_files:
+    with st.spinner('🔺 Uploading files...'):
+        df = files_to_dataframe(uploaded_files)
+        document_store = generate_document_store(df)
+    st.write("**👇 Enter a search query below** and toggle keyword/semantic mode and hit **Search**")
+    col1, col2 = st.columns([3, 1])
     with col1:
+        query = st.text_input(label='Enter Search Query', label_visibility='collapsed', value=EXAMPLE_TEXT)
+    with col2:
         value = tog.st_toggle_switch(
+            label="Semantic Mode",
+            label_after=False,
+            inactive_color='#D3D3D3',
+            active_color="#11567f",
             track_color="#29B5E8"
         )
         if value:
         else:
             search_type = "keyword"
+    button = st.button('Search', type='primary', use_container_width=True)
     if button:
             </style>
             """
+        st.subheader(f'✅ {search_type.capitalize()} Search Results')
         # Inject CSS with Markdown
         st.markdown(hide_dataframe_row_index, unsafe_allow_html=True)
         if search_type == "keyword":
+            with st.spinner('⚙️ Running search...'):
+                bm25_retriever = generate_bm25_retriever(document_store)
+                df_bm25 = process_query(query, bm25_retriever)
             st.table(df_bm25)
         if search_type == "semantic":
+            with st.spinner('⚙️ Running search...'):
+                embedding_retriever = generate_embeddings(EMBEDDING_MODEL, document_store)
+                df_embed = process_query(query, embedding_retriever)
             st.table(df_embed)
+        add_footer()
+streamlit_analytics.stop_tracking(unsafe_password=os.environ["ANALYTICS_PASSWORD"])