Uwais committed on
Commit
aec7e41
‒
1 Parent(s): 2df6c47

updating some styling changes for Find and now can upload pdfs

Browse files
pages/5_🗂_Organise_Demo.py CHANGED
@@ -46,7 +46,7 @@ Similar documents are grouped by color.
46
  \n**TIP:** Hover over each point to see the filename of the contract. Groups can be added or removed by clicking on the symbol in the plot legend.
47
  """)
48
 
49
- st.info("👈 Upload your own documents on the left (as .txt or .pdf files) to see how your own documents can be organised using AI.")
50
 
51
 
52
  @st.cache(allow_output_mutation=True)
@@ -127,7 +127,7 @@ def prepare_page():
127
 
128
  uploaded_files = st.sidebar.file_uploader("Upload your documents", accept_multiple_files=True,
129
  type=['pdf', 'txt'],
130
- help="Upload your own documents. Don't worry we don't store any data.")
131
 
132
  # button = st.sidebar.button('Organise Contracts', type='primary', use_container_width=True)
133
 
@@ -176,8 +176,6 @@ if uploaded_files:
176
  st.plotly_chart(fig, use_container_width=True)
177
 
178
 
179
- add_email_signup_form()
180
-
181
  add_footer()
182
 
183
  streamlit_analytics.stop_tracking(unsafe_password=os.environ["ANALYTICS_PASSWORD"])
 
46
  \n**TIP:** Hover over each point to see the filename of the contract. Groups can be added or removed by clicking on the symbol in the plot legend.
47
  """)
48
 
49
+ st.info("**👈 Upload your own documents on the left (as .txt or .pdf files)**")
50
 
51
 
52
  @st.cache(allow_output_mutation=True)
 
127
 
128
  uploaded_files = st.sidebar.file_uploader("Upload your documents", accept_multiple_files=True,
129
  type=['pdf', 'txt'],
130
+ help="Upload a set of .pdf or .txt files")
131
 
132
  # button = st.sidebar.button('Organise Contracts', type='primary', use_container_width=True)
133
 
 
176
  st.plotly_chart(fig, use_container_width=True)
177
 
178
 
 
 
179
  add_footer()
180
 
181
  streamlit_analytics.stop_tracking(unsafe_password=os.environ["ANALYTICS_PASSWORD"])
pages/6_🔎_Find_Demo.py CHANGED
@@ -7,7 +7,8 @@ import pandas as pd
7
  import streamlit as st
8
  import streamlit_analytics
9
 
10
- import streamlit_toggle as tog
 
11
 
12
  from utils import add_logo_to_sidebar, add_footer, add_email_signup_form
13
 
@@ -29,12 +30,14 @@ EXAMPLE_TEXT = "the governing law is the State of Texas"
29
 
30
  streamlit_analytics.start_tracking()
31
 
 
32
  @st.cache(allow_output_mutation=True)
33
  def load_dataset():
34
  snapshot_download(repo_id=DATA_REPO_ID, token=HF_TOKEN, local_dir='./', repo_type='dataset')
35
  df = pd.read_json(DATA_FILENAME)
36
  return df
37
 
 
38
  @st.cache(allow_output_mutation=True)
39
  def generate_document_store(df):
40
  """Create haystack document store using contract clause data
@@ -46,7 +49,7 @@ def generate_document_store(df):
46
  {
47
  'content': row['paragraph'],
48
  'meta': {'contract_title': row['contract_title']}
49
- }
50
  )
51
 
52
  document_store = InMemoryDocumentStore(use_bm25=True, embedding_dim=EMBEDDING_DIM, similarity='cosine')
@@ -55,25 +58,33 @@ def generate_document_store(df):
55
 
56
  return document_store
57
 
 
58
  def files_to_dataframe(uploaded_files, limit=10):
59
  texts = []
60
  titles = []
61
  for uploaded_file in uploaded_files[:limit]:
 
 
 
 
62
 
63
- stringio = StringIO(uploaded_file.getvalue().decode("utf-8"))
 
 
64
 
65
- text = stringio.read().strip()
66
- paragraphs = text.split("\n\n")
67
  paragraphs = [p.strip() for p in paragraphs if len(p.split()) > 10]
68
  texts.extend(paragraphs)
69
- titles.extend([uploaded_file.name]*len(paragraphs))
70
-
71
  return pd.DataFrame({'paragraph': texts, 'contract_title': titles})
72
 
 
73
  @st.cache(allow_output_mutation=True)
74
  def generate_bm25_retriever(document_store):
75
  return BM25Retriever(document_store)
76
 
 
77
  @st.cache(allow_output_mutation=True)
78
  def generate_embeddings(embedding_model, document_store):
79
  embedding_retriever = EmbeddingRetriever(
@@ -85,6 +96,7 @@ def generate_embeddings(embedding_model, document_store):
85
  document_store.update_embeddings(embedding_retriever)
86
  return embedding_retriever
87
 
 
88
  def process_query(query, retriever):
89
  """Generates dataframe with top ten results"""
90
  texts = []
@@ -95,22 +107,23 @@ def process_query(query, retriever):
95
  query=query,
96
  top_k=10,
97
  )
98
-
99
  for idx, document in enumerate(candidate_documents):
100
  texts.append(document.content)
101
  contract_titles.append(document.meta["contract_title"])
102
  scores.append(str(round(document.score, 2)))
103
  ranking.append(idx + 1)
104
-
105
  return pd.DataFrame(
106
  {
107
- "Ranking": ranking,
108
  "Text": texts,
109
- "Source Contract": contract_titles,
110
- "Similarity": scores
111
  }
112
  )
113
 
 
114
  st.set_page_config(
115
  page_title="Find Demo",
116
  page_icon="🔎",
@@ -124,37 +137,37 @@ st.set_page_config(
124
  )
125
 
126
  add_logo_to_sidebar()
127
- st.sidebar.success("👆 Select a demo above.")
128
 
129
  st.title('🔎 Find Demo')
130
 
131
  st.write("""
132
- This demo shows how a set of clauses can be searched.
133
- Upload a set of contracts on the left and the paragraphs can be searched using **keywords** or using **semantic search**.
134
- Semantic search leverages an AI model which matches on clauses with a similar meaning to the input text.
135
  """)
136
- st.write("**👈 Upload a set of contracts on the left** to start the demo")
137
 
 
138
 
139
- #df = load_dataset()
140
-
141
- #document_store = generate_document_store(df)
142
-
143
- #bm25_retriever = generate_bm25_retriever(document_store)
144
-
145
- #embedding_retriever = generate_embeddings(EMBEDDING_MODEL, document_store)
146
- col1, col2, col3, col4, col5 = st.columns(5)
147
-
148
- uploaded_files = st.sidebar.file_uploader("Select contracts to search **(upload up to 10 files)**", accept_multiple_files=True)
149
 
150
  if uploaded_files:
 
 
 
 
 
 
151
  with col1:
152
- st.write("Toggle between **keyword** or **semantic** search:")
 
153
  value = tog.st_toggle_switch(
154
- label="Keyword/Semantic",
155
- label_after=True,
156
- inactive_color='#D3D3D3',
157
- active_color="#11567f",
158
  track_color="#29B5E8"
159
  )
160
  if value:
@@ -162,14 +175,7 @@ if uploaded_files:
162
  else:
163
  search_type = "keyword"
164
 
165
- print(value)
166
-
167
- df = files_to_dataframe(uploaded_files)
168
- document_store = generate_document_store(df)
169
- bm25_retriever = generate_bm25_retriever(document_store)
170
- st.write("**👇 Enter search query below** and hit the button **Find Clauses** to see the demo in action")
171
- query = st.text_area(label='Enter Search Query', value=EXAMPLE_TEXT, height=250)
172
- button = st.button('**Find Clauses**', type='primary', use_container_width=True)
173
 
174
  if button:
175
 
@@ -180,29 +186,22 @@ if uploaded_files:
180
  </style>
181
  """
182
 
183
- st.subheader(f'Search Results ({search_type}):')
184
  # Inject CSS with Markdown
185
  st.markdown(hide_dataframe_row_index, unsafe_allow_html=True)
186
 
187
  if search_type == "keyword":
188
- df_bm25 = process_query(query, bm25_retriever)
 
 
189
  st.table(df_bm25)
190
-
191
  if search_type == "semantic":
192
- embedding_retriever = generate_embeddings(EMBEDDING_MODEL, document_store)
193
- df_embed = process_query(query, embedding_retriever)
 
194
  st.table(df_embed)
195
-
196
- # with col2:
197
-
198
- # st.subheader('Semantic Search Results:')
199
- # # Inject CSS with Markdown
200
- # st.markdown(hide_dataframe_row_index, unsafe_allow_html=True)
201
- # df_embed = process_query(query, embedding_retriever)
202
- # st.table(df_embed)
203
-
204
- add_email_signup_form()
205
 
206
- add_footer()
207
 
208
- streamlit_analytics.stop_tracking(unsafe_password=os.environ["ANALYTICS_PASSWORD"])
 
7
  import streamlit as st
8
  import streamlit_analytics
9
 
10
+ import streamlit_toggle as tog
11
+ from pypdf import PdfReader
12
 
13
  from utils import add_logo_to_sidebar, add_footer, add_email_signup_form
14
 
 
30
 
31
  streamlit_analytics.start_tracking()
32
 
33
+
34
  @st.cache(allow_output_mutation=True)
35
  def load_dataset():
36
  snapshot_download(repo_id=DATA_REPO_ID, token=HF_TOKEN, local_dir='./', repo_type='dataset')
37
  df = pd.read_json(DATA_FILENAME)
38
  return df
39
 
40
+
41
  @st.cache(allow_output_mutation=True)
42
  def generate_document_store(df):
43
  """Create haystack document store using contract clause data
 
49
  {
50
  'content': row['paragraph'],
51
  'meta': {'contract_title': row['contract_title']}
52
+ }
53
  )
54
 
55
  document_store = InMemoryDocumentStore(use_bm25=True, embedding_dim=EMBEDDING_DIM, similarity='cosine')
 
58
 
59
  return document_store
60
 
61
+
62
  def files_to_dataframe(uploaded_files, limit=10):
63
  texts = []
64
  titles = []
65
  for uploaded_file in uploaded_files[:limit]:
66
+ if '.pdf' in uploaded_file.name.lower():
67
+ reader = PdfReader(uploaded_file)
68
+ page_texts = [page.extract_text() for page in reader.pages]
69
+ text = "\n".join(page_texts).strip()
70
 
71
+ if '.txt' in uploaded_file.name.lower():
72
+ stringio = StringIO(uploaded_file.getvalue().decode("utf-8"))
73
+ text = stringio.read().strip()
74
 
75
+ paragraphs = text.split("\n")
 
76
  paragraphs = [p.strip() for p in paragraphs if len(p.split()) > 10]
77
  texts.extend(paragraphs)
78
+ titles.extend([uploaded_file.name] * len(paragraphs))
79
+
80
  return pd.DataFrame({'paragraph': texts, 'contract_title': titles})
81
 
82
+
83
  @st.cache(allow_output_mutation=True)
84
  def generate_bm25_retriever(document_store):
85
  return BM25Retriever(document_store)
86
 
87
+
88
  @st.cache(allow_output_mutation=True)
89
  def generate_embeddings(embedding_model, document_store):
90
  embedding_retriever = EmbeddingRetriever(
 
96
  document_store.update_embeddings(embedding_retriever)
97
  return embedding_retriever
98
 
99
+
100
  def process_query(query, retriever):
101
  """Generates dataframe with top ten results"""
102
  texts = []
 
107
  query=query,
108
  top_k=10,
109
  )
110
+
111
  for idx, document in enumerate(candidate_documents):
112
  texts.append(document.content)
113
  contract_titles.append(document.meta["contract_title"])
114
  scores.append(str(round(document.score, 2)))
115
  ranking.append(idx + 1)
116
+
117
  return pd.DataFrame(
118
  {
119
+ "Rank": ranking,
120
  "Text": texts,
121
+ "Source Document": contract_titles,
122
+ "Similarity Score": scores
123
  }
124
  )
125
 
126
+
127
  st.set_page_config(
128
  page_title="Find Demo",
129
  page_icon="🔎",
 
137
  )
138
 
139
  add_logo_to_sidebar()
 
140
 
141
  st.title('🔎 Find Demo')
142
 
143
  st.write("""
144
+ This demo shows how a set of documents can be searched.
145
+ Upload a set of documents on the left and the paragraphs can be searched using **keyword** or using **semantic** search.
146
+ Semantic search leverages an AI model which matches on paragraphs with a similar meaning to the input text.
147
  """)
 
148
 
149
+ st.info("**👈 Upload a set of documents on the left**")
150
 
151
+ uploaded_files = st.sidebar.file_uploader("Upload a set of documents **(upload up to 10 files)**",
152
+ type=['pdf', 'txt'],
153
+ help='Upload a set of .pdf or .txt files',
154
+ accept_multiple_files=True)
 
 
 
 
 
 
155
 
156
  if uploaded_files:
157
+ with st.spinner('🔺 Uploading files...'):
158
+ df = files_to_dataframe(uploaded_files)
159
+ document_store = generate_document_store(df)
160
+
161
+ st.write("**👇 Enter a search query below** and toggle keyword/semantic mode and hit **Search**")
162
+ col1, col2 = st.columns([3, 1])
163
  with col1:
164
+ query = st.text_input(label='Enter Search Query', label_visibility='collapsed', value=EXAMPLE_TEXT)
165
+ with col2:
166
  value = tog.st_toggle_switch(
167
+ label="Semantic Mode",
168
+ label_after=False,
169
+ inactive_color='#D3D3D3',
170
+ active_color="#11567f",
171
  track_color="#29B5E8"
172
  )
173
  if value:
 
175
  else:
176
  search_type = "keyword"
177
 
178
+ button = st.button('Search', type='primary', use_container_width=True)
 
 
 
 
 
 
 
179
 
180
  if button:
181
 
 
186
  </style>
187
  """
188
 
189
+ st.subheader(f'✅ {search_type.capitalize()} Search Results')
190
  # Inject CSS with Markdown
191
  st.markdown(hide_dataframe_row_index, unsafe_allow_html=True)
192
 
193
  if search_type == "keyword":
194
+ with st.spinner('⚙️ Running search...'):
195
+ bm25_retriever = generate_bm25_retriever(document_store)
196
+ df_bm25 = process_query(query, bm25_retriever)
197
  st.table(df_bm25)
198
+
199
  if search_type == "semantic":
200
+ with st.spinner('⚙️ Running search...'):
201
+ embedding_retriever = generate_embeddings(EMBEDDING_MODEL, document_store)
202
+ df_embed = process_query(query, embedding_retriever)
203
  st.table(df_embed)
 
 
 
 
 
 
 
 
 
 
204
 
205
+ add_footer()
206
 
207
+ streamlit_analytics.stop_tracking(unsafe_password=os.environ["ANALYTICS_PASSWORD"])