Update styling for the Find demo; PDFs can now be uploaded
Files changed:
- pages/5_π_Organise_Demo.py (+2, -4)
- pages/6_π_Find_Demo.py (+56, -57)
pages/5_π_Organise_Demo.py (CHANGED)

@@ -46,7 +46,7 @@ Similar documents are grouped by color.
 \n**TIP:** Hover over each point to see the filename of the contract. Groups can be added or removed by clicking on the symbol in the plot legend.
 """)
 
-st.info("
+st.info("**π Upload your own documents on the left (as .txt or .pdf files)**")
 
 
 @st.cache(allow_output_mutation=True)
@@ -127,7 +127,7 @@ def prepare_page():
 
 uploaded_files = st.sidebar.file_uploader("Upload your documents", accept_multiple_files=True,
                                           type=['pdf', 'txt'],
-                                          help="Upload
+                                          help="Upload a set of .pdf or .txt files")
 
 # button = st.sidebar.button('Organise Contracts', type='primary', use_container_width=True)
 
@@ -176,8 +176,6 @@ if uploaded_files:
     st.plotly_chart(fig, use_container_width=True)
 
 
-add_email_signup_form()
-
 add_footer()
 
 streamlit_analytics.stop_tracking(unsafe_password=os.environ["ANALYTICS_PASSWORD"])
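For reference, the sidebar-upload pattern this change converges on can be sketched as a standalone Streamlit snippet. This is a minimal sketch, not the committed file: the banner text, accepted types, and help text are taken from the diff above, while the final st.write line is purely illustrative.

```python
import streamlit as st

# Banner prompting the user to upload, as added in the Organise demo.
st.info("**Upload your own documents on the left (as .txt or .pdf files)**")

# Sidebar uploader restricted to the two supported formats, with a tooltip.
uploaded_files = st.sidebar.file_uploader(
    "Upload your documents",
    accept_multiple_files=True,
    type=['pdf', 'txt'],
    help="Upload a set of .pdf or .txt files",
)

if uploaded_files:
    # Illustrative only: confirm how many files were received.
    st.write(f"{len(uploaded_files)} file(s) uploaded")
```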
pages/6_π_Find_Demo.py (CHANGED)

@@ -7,7 +7,8 @@ import pandas as pd
 import streamlit as st
 import streamlit_analytics
 
-import
+import streamlit_toggle as tog
+from pypdf import PdfReader
 
 from utils import add_logo_to_sidebar, add_footer, add_email_signup_form
 
@@ -29,12 +30,14 @@ EXAMPLE_TEXT = "the governing law is the State of Texas"
 
 streamlit_analytics.start_tracking()
 
+
 @st.cache(allow_output_mutation=True)
 def load_dataset():
     snapshot_download(repo_id=DATA_REPO_ID, token=HF_TOKEN, local_dir='./', repo_type='dataset')
     df = pd.read_json(DATA_FILENAME)
     return df
 
+
 @st.cache(allow_output_mutation=True)
 def generate_document_store(df):
     """Create haystack document store using contract clause data
@@ -46,7 +49,7 @@ def generate_document_store(df):
             {
                 'content': row['paragraph'],
                 'meta': {'contract_title': row['contract_title']}
-            }
+            }
         )
 
     document_store = InMemoryDocumentStore(use_bm25=True, embedding_dim=EMBEDDING_DIM, similarity='cosine')
@@ -55,25 +58,33 @@ def generate_document_store(df):
 
     return document_store
 
+
 def files_to_dataframe(uploaded_files, limit=10):
     texts = []
     titles = []
     for uploaded_file in uploaded_files[:limit]:
+        if '.pdf' in uploaded_file.name.lower():
+            reader = PdfReader(uploaded_file)
+            page_texts = [page.extract_text() for page in reader.pages]
+            text = "\n".join(page_texts).strip()
 
-
+        if '.txt' in uploaded_file.name.lower():
+            stringio = StringIO(uploaded_file.getvalue().decode("utf-8"))
+            text = stringio.read().strip()
 
-
-        paragraphs = text.split("\n\n")
+        paragraphs = text.split("\n")
         paragraphs = [p.strip() for p in paragraphs if len(p.split()) > 10]
         texts.extend(paragraphs)
-        titles.extend([uploaded_file.name]*len(paragraphs))
-
+        titles.extend([uploaded_file.name] * len(paragraphs))
+
     return pd.DataFrame({'paragraph': texts, 'contract_title': titles})
 
+
 @st.cache(allow_output_mutation=True)
 def generate_bm25_retriever(document_store):
     return BM25Retriever(document_store)
 
+
 @st.cache(allow_output_mutation=True)
 def generate_embeddings(embedding_model, document_store):
     embedding_retriever = EmbeddingRetriever(
@@ -85,6 +96,7 @@ def generate_embeddings(embedding_model, document_store):
     document_store.update_embeddings(embedding_retriever)
     return embedding_retriever
 
+
 def process_query(query, retriever):
     """Generates dataframe with top ten results"""
     texts = []
@@ -95,22 +107,23 @@ def process_query(query, retriever):
         query=query,
         top_k=10,
     )
-
+
    for idx, document in enumerate(candidate_documents):
        texts.append(document.content)
        contract_titles.append(document.meta["contract_title"])
        scores.append(str(round(document.score, 2)))
        ranking.append(idx + 1)
-
+
    return pd.DataFrame(
        {
-            "
+            "Rank": ranking,
            "Text": texts,
-            "Source
-            "Similarity": scores
+            "Source Document": contract_titles,
+            "Similarity Score": scores
        }
    )
 
+
 st.set_page_config(
     page_title="Find Demo",
     page_icon="π",
@@ -124,37 +137,37 @@ st.set_page_config(
 )
 
 add_logo_to_sidebar()
-st.sidebar.success("π Select a demo above.")
 
 st.title('π Find Demo')
 
 st.write("""
-This demo shows how a set of
-Upload a set of
-Semantic search leverages an AI model which matches on
+This demo shows how a set of documents can be searched.
+Upload a set of documents on the left and the paragraphs can be searched using **keyword** or using **semantic** search.
+Semantic search leverages an AI model which matches on paragraphs with a similar meaning to the input text.
 """)
-st.write("**π Upload a set of contracts on the left** to start the demo")
 
+st.info("**π Upload a set of documents on the left**")
 
-
-
-
-
-#bm25_retriever = generate_bm25_retriever(document_store)
-
-#embedding_retriever = generate_embeddings(EMBEDDING_MODEL, document_store)
-col1, col2, col3, col4, col5 = st.columns(5)
-
-uploaded_files = st.sidebar.file_uploader("Select contracts to search **(upload up to 10 files)**", accept_multiple_files=True)
+uploaded_files = st.sidebar.file_uploader("Upload a set of documents **(upload up to 10 files)**",
+                                          type=['pdf', 'txt'],
+                                          help='Upload a set of .pdf or .txt files',
+                                          accept_multiple_files=True)
 
 if uploaded_files:
+    with st.spinner('πΊ Uploading files...'):
+        df = files_to_dataframe(uploaded_files)
+        document_store = generate_document_store(df)
+
+    st.write("**π Enter a search query below** and toggle keyword/semantic mode and hit **Search**")
+    col1, col2 = st.columns([3, 1])
     with col1:
-        st.
+        query = st.text_input(label='Enter Search Query', label_visibility='collapsed', value=EXAMPLE_TEXT)
+    with col2:
         value = tog.st_toggle_switch(
-            label="
-            label_after=
-            inactive_color='#D3D3D3',
-            active_color="#11567f",
+            label="Semantic Mode",
+            label_after=False,
+            inactive_color='#D3D3D3',
+            active_color="#11567f",
             track_color="#29B5E8"
         )
    if value:
@@ -162,14 +175,7 @@ if uploaded_files:
    else:
        search_type = "keyword"
 
-
-
-    df = files_to_dataframe(uploaded_files)
-    document_store = generate_document_store(df)
-    bm25_retriever = generate_bm25_retriever(document_store)
-    st.write("**π Enter search query below** and hit the button **Find Clauses** to see the demo in action")
-    query = st.text_area(label='Enter Search Query', value=EXAMPLE_TEXT, height=250)
-    button = st.button('**Find Clauses**', type='primary', use_container_width=True)
+    button = st.button('Search', type='primary', use_container_width=True)
 
    if button:
 
@@ -180,29 +186,22 @@ if uploaded_files:
        </style>
        """
 
-        st.subheader(f'
+        st.subheader(f'β {search_type.capitalize()} Search Results')
        # Inject CSS with Markdown
        st.markdown(hide_dataframe_row_index, unsafe_allow_html=True)
 
        if search_type == "keyword":
-
+            with st.spinner('βοΈ Running search...'):
+                bm25_retriever = generate_bm25_retriever(document_store)
+                df_bm25 = process_query(query, bm25_retriever)
            st.table(df_bm25)
-
+
        if search_type == "semantic":
-
-
+            with st.spinner('βοΈ Running search...'):
+                embedding_retriever = generate_embeddings(EMBEDDING_MODEL, document_store)
+                df_embed = process_query(query, embedding_retriever)
            st.table(df_embed)
-
-        # with col2:
-
-        # st.subheader('Semantic Search Results:')
-        # # Inject CSS with Markdown
-        # st.markdown(hide_dataframe_row_index, unsafe_allow_html=True)
-        # df_embed = process_query(query, embedding_retriever)
-        # st.table(df_embed)
-
-add_email_signup_form()
 
-add_footer()
+add_footer()
 
-streamlit_analytics.stop_tracking(unsafe_password=os.environ["ANALYTICS_PASSWORD"])
+streamlit_analytics.stop_tracking(unsafe_password=os.environ["ANALYTICS_PASSWORD"])
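To make the new upload path easier to follow outside the diff, here is a self-contained sketch of how files_to_dataframe now turns uploaded .pdf and .txt files into searchable paragraphs. It mirrors the added lines above and assumes pypdf is installed and that each uploaded_file is a Streamlit UploadedFile; the endswith checks and the else guard are small tidy-ups of mine, not part of the committed code.

```python
from io import StringIO

import pandas as pd
from pypdf import PdfReader


def files_to_dataframe(uploaded_files, limit=10):
    """Split up to `limit` uploaded .pdf/.txt files into one paragraph per row."""
    texts, titles = [], []
    for uploaded_file in uploaded_files[:limit]:
        name = uploaded_file.name.lower()
        if name.endswith('.pdf'):
            # pypdf reads the uploaded file object directly; join page texts with newlines.
            reader = PdfReader(uploaded_file)
            page_texts = [page.extract_text() for page in reader.pages]
            text = "\n".join(page_texts).strip()
        elif name.endswith('.txt'):
            # Text uploads arrive as bytes; decode before reading.
            text = StringIO(uploaded_file.getvalue().decode("utf-8")).read().strip()
        else:
            continue  # the uploader already filters to .pdf/.txt, so this is just a guard

        # Keep only paragraphs longer than ten words, as in the diff.
        paragraphs = [p.strip() for p in text.split("\n") if len(p.split()) > 10]
        texts.extend(paragraphs)
        titles.extend([uploaded_file.name] * len(paragraphs))

    return pd.DataFrame({'paragraph': texts, 'contract_title': titles})
```

One design point visible in the diff: the new code splits paragraphs on single newlines, whereas the removed line split on blank lines ("\n\n"), so PDFs whose extracted text carries one newline per visual line will be chunked more finely than before.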