leavoigt committed on
Commit
127c047
1 Parent(s): 5cfddb1

Delete appStore/keyword_search.py

Browse files
Files changed (1) hide show
  1. appStore/keyword_search.py +0 -176
appStore/keyword_search.py DELETED
@@ -1,176 +0,0 @@
1
- # set path
2
- import glob, os, sys;
3
- sys.path.append('../utils')
4
-
5
- import streamlit as st
6
- import json
7
- import logging
8
- from utils.lexical_search import runLexicalPreprocessingPipeline, lexical_search
9
- from utils.semantic_search import runSemanticPreprocessingPipeline, semantic_keywordsearch
10
- from utils.checkconfig import getconfig
11
- from utils.streamlitcheck import checkbox_without_preselect
12
-
13
# ---------------------------------------------------------------------------
# Search parameters, read once at import time from paramconfig.cfg.
# NOTE(review): assumes getconfig() returns a configparser.ConfigParser
# (the fallback= keyword below relies on that) — confirm in utils.checkconfig.
# ---------------------------------------------------------------------------
config = getconfig('paramconfig.cfg')

# Semantic search: paragraph splitting + dense retriever/reader settings.
split_by = config.get('semantic_search', 'SPLIT_BY')
split_length = int(config.get('semantic_search', 'SPLIT_LENGTH'))
split_overlap = int(config.get('semantic_search', 'SPLIT_OVERLAP'))
split_respect_sentence_boundary = bool(int(config.get('semantic_search',
                                    'RESPECT_SENTENCE_BOUNDARY')))
remove_punc = bool(int(config.get('semantic_search', 'REMOVE_PUNC')))
embedding_model = config.get('semantic_search', 'RETRIEVER')
embedding_model_format = config.get('semantic_search', 'RETRIEVER_FORMAT')
embedding_layer = int(config.get('semantic_search', 'RETRIEVER_EMB_LAYER'))
embedding_dim = int(config.get('semantic_search', 'EMBEDDING_DIM'))
max_seq_len = int(config.get('semantic_search', 'MAX_SEQ_LENGTH'))
retriever_top_k = int(config.get('semantic_search', 'RETRIEVER_TOP_K'))
reader_model = config.get('semantic_search', 'READER')
# BUG FIX: this previously read RETRIEVER_TOP_K (copy-paste error), so the
# reader's top-k could never be configured independently. Prefer the
# dedicated READER_TOP_K key; fall back to the old behaviour if it is absent
# from the config file, keeping existing deployments working unchanged.
reader_top_k = int(config.get('semantic_search', 'READER_TOP_K',
                              fallback=config.get('semantic_search',
                                                  'RETRIEVER_TOP_K')))
top_k_per_candidate = int(config.get('semantic_search', 'READER_TOP_K_PER_CANDIDATE'))

# Lexical (TF-IDF) search: splitting + result-count settings.
lexical_split_by = config.get('lexical_search', 'SPLIT_BY')
lexical_split_length = int(config.get('lexical_search', 'SPLIT_LENGTH'))
lexical_split_overlap = int(config.get('lexical_search', 'SPLIT_OVERLAP'))
lexical_remove_punc = bool(int(config.get('lexical_search', 'REMOVE_PUNC')))
lexical_top_k = int(config.get('lexical_search', 'TOP_K'))
36
def app():
    """Render the Search page.

    Builds the Streamlit UI: a title, an "about" expander with usage notes
    and runtime metrics, a sidebar for picking a pre-defined keyword set,
    and a query box. On "Find them" it runs either an exact lexical search
    or a semantic search over the document stored in
    ``st.session_state['filepath']`` / ``['filename']``.

    Side effects only (Streamlit widgets, file read, logging); returns None.
    """

    with st.container():
        st.markdown("<h1 style='text-align: center; \
                     color: black;'> Search</h1>",
                    unsafe_allow_html=True)
        st.write(' ')
        st.write(' ')

    with st.expander("ℹ️ - About this app", expanded=False):
        st.write(
            """
            The *Search* app is an interface \
            for doing contextual and keyword searches in \
            policy documents. \
            """)
        st.write("")
        # FIX: "defualt" -> "default" in the user-facing help text.
        st.write(""" The application allows its user to perform a search\
        based on two options: a lexical search([TFIDF](https://en.wikipedia.org/wiki/Tf%E2%80%93idf))\
        and semantic search. [bi-encoder](https://www.sbert.net/examples/applications/retrieve_rerank/README.html)\
        The lexical search only \
        displays paragraphs in the document with exact matching results, \
        the semantic search shows paragraphs with meaningful connections \
        (e.g., synonyms) based on the search context. Both \
        methods employ a probabilistic retrieval framework in its identification\
        of relevant paragraphs. By default the search is performed using \
        'Semantic Search', and to find 'Exact/Lexical Matches' please tick the \
        checkbox provided which will by-pass semantic search. Furthermore,\
        the application allows the user to search for pre-defined keywords \
        from different thematic buckets present in sidebar.""")
        st.write("")
        st.write(""" The Exact Matches gives back top {} findings, and Semantic
        search provides with top {} answers.""".format(lexical_top_k, retriever_top_k))
        st.write("")
        st.write("")
        st.markdown("Some runtime metrics tested with cpu: Intel(R) Xeon(R) CPU @ 2.20GHz, memory: 13GB")
        col1, col2, col3 = st.columns([2, 4, 4])
        with col1:
            st.caption("OCR File processing")
            st.write("50 sec")

        with col2:
            st.caption("Lexical Search on 200 paragraphs(~ 35 pages)")
            st.write("15 sec")

        with col3:
            st.caption("Semantic search on 200 paragraphs(~ 35 pages)")
            # FIX: "emebedding" -> "embedding" in the displayed metric.
            st.write("120 sec(including embedding creation)")

    with st.sidebar:
        # Pre-defined keyword sets, keyed by thematic category name.
        with open('docStore/sample/keywordexample.json', 'r') as json_file:
            keywordexample = json.load(json_file)

        st.caption("Select Keyword Category")
        genre = checkbox_without_preselect(list(keywordexample.keys()))
        if genre:
            keywordList = keywordexample[genre]
        else:
            keywordList = None

        st.markdown("---")

    with st.container():
        type_hinting = "Please enter here your question and we \
                        will look for an answer in the document\
                        OR enter the keyword you are looking \
                        for and we will look for similar\
                        context in the document.\
                        You can also explore predefined sets of keywords from sidebar. "
        # Pre-fill the query box with the selected keyword set, if any.
        if keywordList is not None:
            queryList = st.text_input(type_hinting,
                                      value="{}".format(keywordList))
        else:
            queryList = st.text_input(type_hinting,
                                      placeholder="Enter keyword/query here")

        # Ticked => lexical (exact-match) search, bypassing semantic search.
        searchtype = st.checkbox("Show only Exact Matches")
        if st.button("Find them"):

            if queryList == "":
                # FIX: "dont" -> "don't" in the user-facing message.
                st.info("🤔 No keyword provided, if you don't have any, \
                please try example sets from sidebar!")
                logging.warning("Terminated as no keyword provided")
            else:
                if 'filepath' in st.session_state:

                    if searchtype:
                        # Exact-match path: TF-IDF over lexically split paragraphs.
                        all_documents = runLexicalPreprocessingPipeline(
                            file_name=st.session_state['filename'],
                            file_path=st.session_state['filepath'],
                            split_by=lexical_split_by,
                            split_length=lexical_split_length,
                            split_overlap=lexical_split_overlap,
                            remove_punc=lexical_remove_punc)
                        logging.info("performing lexical search")
                        with st.spinner("Performing Exact matching search \
                        (Lexical search) for you"):
                            lexical_search(query=queryList,
                                           documents=all_documents['documents'],
                                           top_k=lexical_top_k)
                    else:
                        # Semantic path: dense retriever + reader over
                        # semantically split paragraphs.
                        all_documents = runSemanticPreprocessingPipeline(
                            file_path=st.session_state['filepath'],
                            file_name=st.session_state['filename'],
                            split_by=split_by,
                            split_length=split_length,
                            split_overlap=split_overlap,
                            remove_punc=remove_punc,
                            split_respect_sentence_boundary=split_respect_sentence_boundary)
                        # Warn the user about long runtimes on big documents.
                        if len(all_documents['documents']) > 100:
                            warning_msg = ": This might take sometime, please sit back and relax."
                        else:
                            warning_msg = ""

                        logging.info("starting semantic search")
                        with st.spinner("Performing Similar/Contextual search{}".format(warning_msg)):
                            semantic_keywordsearch(query=queryList,
                                                   documents=all_documents['documents'],
                                                   embedding_model=embedding_model,
                                                   embedding_layer=embedding_layer,
                                                   embedding_model_format=embedding_model_format,
                                                   reader_model=reader_model, reader_top_k=reader_top_k,
                                                   retriever_top_k=retriever_top_k, embedding_dim=embedding_dim,
                                                   max_seq_len=max_seq_len,
                                                   top_k_per_candidate=top_k_per_candidate)

                else:
                    st.info("🤔 No document found, please try to upload it at the sidebar!")
                    logging.warning("Terminated as no document provided")