Commit d7ce857, committed by prashant
1 Parent(s): 7af394d

update

Files changed:
- appStore/keyword_search.py +10 -9
- appStore/sdg_analysis.py +7 -7
- utils/keyword_extraction.py +3 -3
- utils/lexical_search.py +14 -3
- utils/preprocessing.py +8 -8
- utils/sdg_classifier.py +10 -10
- utils/semantic_search.py +38 -25
appStore/keyword_search.py CHANGED

@@ -6,7 +6,7 @@ import streamlit as st
 import json
 import logging
 from utils.lexical_search import runLexicalPreprocessingPipeline, lexical_search
-from utils.semantic_search import runSemanticPreprocessingPipeline, …
+from utils.semantic_search import runSemanticPreprocessingPipeline, semantic_keywordsearch
 from utils.checkconfig import getconfig

 # Declare all the necessary variables
@@ -21,6 +21,7 @@ embedding_model = config.get('semantic_search','RETRIEVER')
 embedding_model_format = config.get('semantic_search','RETRIEVER_FORMAT')
 embedding_layer = int(config.get('semantic_search','RETRIEVER_EMB_LAYER'))
 embedding_dim = int(config.get('semantic_search','EMBEDDING_DIM'))
+max_seq_len = int(config.get('semantic_search','MAX_SEQ_LENGTH'))
 retriever_top_k = int(config.get('semantic_search','RETRIEVER_TOP_K'))
 reader_model = config.get('semantic_search','READER')
 reader_top_k = int(config.get('semantic_search','RETRIEVER_TOP_K'))
@@ -100,7 +101,7 @@ def app():
     if 'filepath' in st.session_state:

         if searchtype:
-            …
+            all_documents = runLexicalPreprocessingPipeline(
                                 file_name=st.session_state['filename'],
                                 file_path=st.session_state['filepath'],
                                 split_by=lexical_split_by,
@@ -110,13 +111,12 @@ def app():
             logging.info("performing lexical search")
             with st.spinner("Performing Exact matching search \
                             (Lexical search) for you"):
-                st.markdown("##### Top few lexical search (TFIDF) hits #####")
                 lexical_search(
                     query=queryList,
-                    documents = …
+                    documents = all_documents['documents'],
                     top_k = lexical_top_k )
         else:
-            …
+            all_documents = runSemanticPreprocessingPipeline(
                                 file_path= st.session_state['filepath'],
                                 file_name = st.session_state['filename'],
                                 split_by=split_by,
@@ -124,20 +124,21 @@ def app():
                                 split_overlap=split_overlap,
                                 removePunc= remove_punc,
                                 split_respect_sentence_boundary=split_respect_sentence_boundary)
-            if len(…
+            if len(all_documents['documents']) > 100:
                 warning_msg = ": This might take sometime, please sit back and relax."
             else:
                 warning_msg = ""

             logging.info("starting semantic search")
             with st.spinner("Performing Similar/Contextual search{}".format(warning_msg)):
-                …
-                    documents = …
+                semantic_keywordsearch(query = queryList,
+                    documents = all_documents['documents'],
                     embedding_model=embedding_model,
                     embedding_layer=embedding_layer,
                     embedding_model_format=embedding_model_format,
                     reader_model=reader_model,reader_top_k=reader_top_k,
-                    retriever_top_k=retriever_top_k, embedding_dim=embedding_dim…
+                    retriever_top_k=retriever_top_k, embedding_dim=embedding_dim,
+                    max_seq_len=max_seq_len)

     else:
         st.info("🤔 No document found, please try to upload it at the sidebar!")
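Taken together, the keyword_search.py changes thread the preprocessing output (now a dict with a 'documents' key) into both search paths and pass the new MAX_SEQ_LENGTH config value through to the retriever. A minimal sketch of the new call flow; the model names and numbers below are illustrative stand-ins for the app's actual paramconfig entries, not the committed values:

```python
from utils.semantic_search import (runSemanticPreprocessingPipeline,
                                   semantic_keywordsearch)

# Stand-in values; the app reads these from paramconfig's
# [semantic_search] section.
embedding_model = "sentence-transformers/all-mpnet-base-v2"   # assumption
embedding_model_format = "sentence_transformers"              # assumption
embedding_layer = None
embedding_dim = 768
max_seq_len = 384
retriever_top_k = 10
reader_model = "deepset/tinyroberta-squad2"                   # assumption
reader_top_k = 10

# Preprocessing now returns a dict; the haystack Documents live
# under the 'documents' key.
all_documents = runSemanticPreprocessingPipeline(
    file_path="sample.pdf", file_name="sample.pdf",
    split_by="sentence", split_length=2, remove_punc=False)

semantic_keywordsearch(
    query="water scarcity",
    documents=all_documents['documents'],
    embedding_model=embedding_model,
    embedding_layer=embedding_layer,
    embedding_model_format=embedding_model_format,
    reader_model=reader_model, reader_top_k=reader_top_k,
    retriever_top_k=retriever_top_k,
    embedding_dim=embedding_dim,
    max_seq_len=max_seq_len)
```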
appStore/sdg_analysis.py CHANGED

@@ -93,31 +93,31 @@ def app():
             file_path = st.session_state['filepath']
             classifier = load_sdgClassifier(classifier_name=model_name)
             st.session_state['sdg_classifier'] = classifier
-            …
+            all_documents = runSDGPreprocessingPipeline(fileName= file_name,
                             filePath= file_path, split_by= split_by,
                             split_length= split_length,
                             split_overlap= split_overlap,
                             split_respect_sentence_boundary= split_respect_sentence_boundary,
-                            …
+                            remove_punc= remove_punc)

-            if len(…
+            if len(all_documents['documents']) > 100:
                 warning_msg = ": This might take sometime, please sit back and relax."
             else:
                 warning_msg = ""

             with st.spinner("Running SDG Classification{}".format(warning_msg)):

-                df, x = sdg_classification(…
+                df, x = sdg_classification(haystack_doc=all_documents['documents'],
                                            threshold= threshold)
                 df = df.drop(['Relevancy'], axis = 1)
                 sdg_labels = x.SDG.unique()[::-1]
-                …
+                textrank_keyword_list = []
                 for label in sdg_labels:
                     sdgdata = " ".join(df[df.SDG == label].text.to_list())
                     textranklist_ = textrank(textdata=sdgdata, words= top_n)
                     if len(textranklist_) > 0:
-                        …
-                        tRkeywordsDf = pd.DataFrame(…
+                        textrank_keyword_list.append({'SDG':label, 'TextRank Keywords':",".join(textranklist_)})
+                tRkeywordsDf = pd.DataFrame(textrank_keyword_list)


                 plt.rcParams['font.size'] = 25
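The sdg_analysis.py hunk replaces per-iteration DataFrame construction with an accumulate-then-build pattern: TextRank keywords for each SDG label are collected as dicts and turned into a DataFrame once. A self-contained sketch of that pattern; the labels and keywords here are illustrative, not app output:

```python
import pandas as pd

# Accumulate one dict per SDG label, then build the frame once.
textrank_keyword_list = []
for label, keywords in [('SDG 6 - Clean water and sanitation', ['water', 'sanitation']),
                        ('SDG 13 - Climate action', ['climate', 'emissions'])]:
    textrank_keyword_list.append({'SDG': label,
                                  'TextRank Keywords': ",".join(keywords)})

tRkeywordsDf = pd.DataFrame(textrank_keyword_list)
print(tRkeywordsDf)
```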
utils/keyword_extraction.py CHANGED

@@ -58,7 +58,7 @@ def extract_topn_from_vector(feature_names, sorted_items, top_n=10):
     return results


-def tfidfKeyword(textdata, vectorizer, tfidfmodel, top_n):
+def tfidf_keyword(textdata, vectorizer, tfidfmodel, top_n):
     """
     TFIDF based keywords extraction

@@ -81,7 +81,7 @@ def tfidfKeyword(textdata, vectorizer, tfidfmodel, top_n):
     keywords = [keyword for keyword in results]
     return keywords

-def keywordExtraction(sdg:int,sdgdata:List[Text]):
+def keyword_extraction(sdg:int,sdgdata:List[Text], top_n:int=10):
     """
     TFIDF based keywords extraction

@@ -102,7 +102,7 @@ def keywordExtraction(sdg:int,sdgdata:List[Text]):
     features = vectorizer.get_feature_names_out()
     tf_idf_vector=tfidfmodel.transform(vectorizer.transform(sdgdata))
     sorted_items=sort_coo(tf_idf_vector.tocoo())
-    top_n = …
+    top_n = top_n
     results=extract_topn_from_vector(features,sorted_items,top_n)
     keywords = [keyword for keyword in results]
     return keywords
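The keyword_extraction.py changes are renames to snake_case (tfidfKeyword to tfidf_keyword, keywordExtraction to keyword_extraction) plus an explicit top_n parameter; note the `top_n = top_n` line the commit leaves behind is a no-op self-assignment. For orientation, a standalone sketch of the TF-IDF top-n idea these functions implement, written against plain scikit-learn with an illustrative corpus:

```python
from sklearn.feature_extraction.text import TfidfVectorizer

corpus = ["Access to clean water and sanitation for all",
          "Urgent action to combat climate change and its impacts"]

vectorizer = TfidfVectorizer(stop_words="english")
tfidf = vectorizer.fit_transform(corpus)
features = vectorizer.get_feature_names_out()

top_n = 3
for row in tfidf:                      # one sparse row per input text
    coo = row.tocoo()                  # (column index, score) pairs
    ranked = sorted(zip(coo.col, coo.data), key=lambda x: x[1], reverse=True)
    print([features[idx] for idx, _ in ranked[:top_n]])
```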
utils/lexical_search.py CHANGED

@@ -25,7 +25,7 @@ except ImportError:

 def runLexicalPreprocessingPipeline(file_path,file_name,
                 split_by: Literal["sentence", "word"] = 'word',
-                split_length:int = 80,
+                split_length:int = 80, remove_punc:bool = False,
                 split_overlap:int = 0 )->List[Document]:
     """
     creates the pipeline and runs the preprocessing pipeline,
@@ -61,7 +61,7 @@ def runLexicalPreprocessingPipeline(file_path,file_name,
     output_lexical_pre = lexical_processing_pipeline.run(file_paths = file_path,
                             params= {"FileConverter": {"file_path": file_path, \
                                         "file_name": file_name},
-                                     "UdfPreProcessor": {"…
+                                     "UdfPreProcessor": {"remove_punc": remove_punc, \
                                         "split_by": split_by, \
                                         "split_length":split_length,\
                                         "split_overlap": split_overlap}})
@@ -223,12 +223,23 @@ def lexical_search(query:Text,top_k:int, documents:List[Document]):
     retriever = TfidfRetriever(document_store)
     results = retriever.retrieve(query=query, top_k = top_k)
     query_tokens = tokenize_lexical_query(query)
+    flag = True
     for count, result in enumerate(results):
         matches, doc = runSpacyMatcher(query_tokens,result.content)
+
         if len(matches) != 0:
+            if flag:
+                flag = False
+                if check_streamlit:
+                    st.markdown("##### Top few lexical search (TFIDF) hits #####")
+                else:
+                    print("Top few lexical search (TFIDF) hits")
+
             if check_streamlit():
                 st.write("Result {}".format(count+1))
             else:
                 print("Results {}".format(count +1))
             spacyAnnotator(matches, doc)
-            …
+
+    if flag:
+        st.info("🤔 No relevant result found. Please try another keyword.")
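The lexical_search hunk introduces a first-hit flag: the heading prints once before the first match, and a fallback message shows when nothing matched at all. A minimal standalone sketch of the pattern; the results list and print calls are stand-ins for the retriever output and the Streamlit/annotator rendering:

```python
results = ["first matching passage", "second matching passage"]  # stand-ins

flag = True                      # True until the first hit is rendered
for count, result in enumerate(results):
    if flag:
        flag = False
        print("Top few lexical search (TFIDF) hits")
    print("Result {}".format(count + 1), result)

if flag:                         # no iteration ever cleared the flag
    print("No relevant result found. Please try another keyword.")
```

One observation on the committed block: the heading branch tests `check_streamlit` without parentheses, so the function object itself (always truthy) is evaluated rather than its return value, whereas the later `check_streamlit()` in the same function uses the parenthesized call.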
utils/preprocessing.py CHANGED

@@ -120,7 +120,7 @@ class FileConverter(BaseComponent):
         return


-def basic(s, removePunc:bool = False):
+def basic(s, remove_punc:bool = False):

     """
     Performs basic cleaning of text.
@@ -141,7 +141,7 @@ def basic(s, removePunc:bool = False):
     s = re.sub('\n', ' ', s)

     # Remove punctuations
-    if …
+    if remove_punc == True:
         translator = str.maketrans(' ', ' ', string.punctuation)
         s = s.translate(translator)
     # Remove distracting single quotes and dotted pattern
@@ -164,7 +164,7 @@ class UdfPreProcessor(BaseComponent):
    """
    outgoing_edges = 1

-    def run(self, documents:List[Document], …
+    def run(self, documents:List[Document], remove_punc:bool,
             split_by: Literal["sentence", "word"] = 'sentence',
             split_respect_sentence_boundary = False,
             split_length:int = 2, split_overlap:int = 0):
@@ -220,7 +220,7 @@ class UdfPreProcessor(BaseComponent):
             # i = basic(i)
             docs_processed = preprocessor.process([i])
             for item in docs_processed:
-                item.content = basic(item.content, …
+                item.content = basic(item.content, remove_punc= remove_punc)

         df = pd.DataFrame(docs_processed)
         all_text = " ".join(df.content.to_list())
@@ -248,12 +248,12 @@ def processingpipeline():
     """

     preprocessing_pipeline = Pipeline()
-    …
-    …
+    file_converter = FileConverter()
+    custom_preprocessor = UdfPreProcessor()

-    preprocessing_pipeline.add_node(component=…
+    preprocessing_pipeline.add_node(component=file_converter,
                                     name="FileConverter", inputs=["File"])
-    preprocessing_pipeline.add_node(component = …
+    preprocessing_pipeline.add_node(component = custom_preprocessor,
                                     name ='UdfPreProcessor', inputs=["FileConverter"])

     return preprocessing_pipeline
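In preprocessing.py the commit standardizes the parameter name to remove_punc and wires it from UdfPreProcessor.run down into basic(). A self-contained sketch of the punctuation-stripping step itself; the function name and sample string are illustrative:

```python
import re
import string

def clean(s: str, remove_punc: bool = False) -> str:
    s = re.sub('\n', ' ', s)   # collapse hard line breaks, as basic() does
    if remove_punc:
        # str.maketrans's third argument lists characters to delete
        s = s.translate(str.maketrans(' ', ' ', string.punctuation))
    return s

print(clean("Goal 6: clean water, for all!\n", remove_punc=True))
# -> 'Goal 6 clean water for all '
```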
utils/sdg_classifier.py CHANGED

@@ -34,7 +34,7 @@ _lab_dict = {0: 'no_cat',
               17:'SDG 17 - Partnership for the goals',}

 @st.cache(allow_output_mutation=True)
-def load_sdgClassifier(configFile = None, classifier_name = None):
+def load_sdgClassifier(config_file = None, classifier_name = None):
     """
     loads the document classifier using haystack, where the name/path of model
     in HF-hub as string is used to fetch the model object.Either configfile or
@@ -52,11 +52,11 @@ def load_sdgClassifier(configFile = None, classifier_name = None):
     Return: document classifier model
     """
     if not classifier_name:
-        if not …
+        if not config_file:
             logging.warning("Pass either model name or config file")
             return
         else:
-            config = getconfig(…
+            config = getconfig(config_file)
             classifier_name = config.get('sdg','MODEL')

     logging.info("Loading classifier")
@@ -68,8 +68,8 @@ def load_sdgClassifier(configFile = None, classifier_name = None):


 @st.cache(allow_output_mutation=True)
-def sdg_classification(haystackdoc:List[Document],
-                       threshold:float, …
+def sdg_classification(haystack_doc:List[Document],
+                       threshold:float, classifier_model= None)->Tuple[DataFrame,Series]:
     """
     Text-Classification on the list of texts provided. Classifier provides the
     most appropriate label for each text. these labels are in terms of if text
@@ -93,14 +93,14 @@ def sdg_classification(haystackdoc:List[Document],

     """
     logging.info("Working on SDG Classification")
-    if not …
+    if not classifier_model:
         if check_streamlit:
-            …
+            classifier_model = st.session_state['sdg_classifier']
         else:
             logging.warning("No streamlit envinornment found, Pass the classifier")
             return

-    results = …
+    results = classifier_model.predict(haystack_doc)


     labels_= [(l.meta['classification']['label'],
@@ -130,7 +130,7 @@ def runSDGPreprocessingPipeline(filePath, fileName,
                         split_by: Literal["sentence", "word"] = 'sentence',
                         split_respect_sentence_boundary = False,
                         split_length:int = 2, split_overlap = 0,
-                        …
+                        remove_punc = False)->List[Document]:
     """
     creates the pipeline and runs the preprocessing pipeline,
     the params for pipeline are fetched from paramconfig
@@ -163,7 +163,7 @@ def runSDGPreprocessingPipeline(filePath, fileName,
     output_sdg_pre = sdg_processing_pipeline.run(file_paths = filePath,
                             params= {"FileConverter": {"file_path": filePath, \
                                         "file_name": fileName},
-                                     "UdfPreProcessor": {"…
+                                     "UdfPreProcessor": {"remove_punc": remove_punc, \
                                         "split_by": split_by, \
                                         "split_length":split_length,\
                                         "split_overlap": split_overlap, \
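sdg_classifier.py completes the same rename sweep (configFile to config_file, haystackdoc to haystack_doc) and makes the classifier an explicit argument with a Streamlit session-state fallback. A sketch of the name-resolution fallback inside load_sdgClassifier; the helper name and the 'paramconfig.cfg' path are hypothetical:

```python
import logging
from utils.checkconfig import getconfig   # repo helper used in the diff

def resolve_classifier_name(config_file=None, classifier_name=None):
    # Mirror of load_sdgClassifier's guard: need a model name or a config file.
    if not classifier_name:
        if not config_file:
            logging.warning("Pass either model name or config file")
            return None
        config = getconfig(config_file)
        classifier_name = config.get('sdg', 'MODEL')
    return classifier_name

name = resolve_classifier_name(config_file='paramconfig.cfg')  # hypothetical path
```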
utils/semantic_search.py CHANGED

@@ -37,8 +37,8 @@ class QueryCheck(BaseComponent):
     Uses Query Classifier from Haystack, process the query based on query type.
     Ability to determine the statements is not so good, therefore the chances
     statement also get modified. Ex: "List water related issues" will be
-    identified by the model as keywords, and therefore it be processed as "…
-    …
+    identified by the model as keywords, and therefore it be processed as "what are
+    the 'list all water related issues' related issues and discussions?". This is one shortcoming
     but is igonred for now, as semantic search will not get affected a lot, by this.

     1. https://docs.haystack.deepset.ai/docs/query_classifier
@@ -61,7 +61,7 @@ class QueryCheck(BaseComponent):
             output = {"query":query,
                       "query_type": 'question/statement'}
         else:
-            output = {"query": "…
+            output = {"query": "what are the {} related issues and discussions?".format(query),
                       "query_type": 'statements/keyword'}
         logging.info(output)
         return output, "output_1"
@@ -74,7 +74,7 @@ def runSemanticPreprocessingPipeline(file_path, file_name,
                 split_by: Literal["sentence", "word"] = 'sentence',
                 split_respect_sentence_boundary = False,
                 split_length:int = 2, split_overlap = 0,
-                …
+                remove_punc = False)->List[Document]:
     """
     creates the pipeline and runs the preprocessing pipeline.

@@ -106,7 +106,7 @@ def runSemanticPreprocessingPipeline(file_path, file_name,
     output_semantic_pre = semantic_processing_pipeline.run(file_paths = file_path,
                             params= {"FileConverter": {"file_path": file_path, \
                                         "file_name": file_name},
-                                     "UdfPreProcessor": {"…
+                                     "UdfPreProcessor": {"remove_punc": remove_punc, \
                                         "split_by": split_by, \
                                         "split_length":split_length,\
                                         "split_overlap": split_overlap,
@@ -118,7 +118,7 @@ def runSemanticPreprocessingPipeline(file_path, file_name,
 @st.cache(hash_funcs={"builtins.SwigPyObject": lambda _: None},allow_output_mutation=True)
 def loadRetriever(embedding_model:Text = None, embedding_model_format:Text = None,
                   embedding_layer:int = None, retriever_top_k:int = 10,
-                  document_store:InMemoryDocumentStore = None):
+                  max_seq_len:int = 512, document_store:InMemoryDocumentStore = None):
     """
     Returns the Retriever model based on params provided.
     1. https://docs.haystack.deepset.ai/docs/retriever#embedding-retrieval-recommended
@@ -133,6 +133,8 @@ def loadRetriever(embedding_model:Text = None, embedding_model_format:Text = No
     embedding_model_format: check the github link of Haystack provided in documentation
     embedding_layer: check the github link of Haystack provided in documentation
     retriever_top_k: Number of Top results to be returned by retriever
+    max_seq_len: everymodel has max seq len it can handle, check in model card.
+                 Needed to hanlde the edge cases.
     document_store: InMemoryDocumentStore, write haystack Document list to DocumentStore
                     and pass the same to function call. Can be done using createDocumentStore from utils.

@@ -149,14 +151,15 @@ def loadRetriever(embedding_model:Text = None, embedding_model_format:Text = No
                             embedding_model=embedding_model,top_k = retriever_top_k,
                             document_store = document_store,
                             emb_extraction_layer=embedding_layer, scale_score =True,
-                            model_format=embedding_model_format, use_gpu = True…
+                            model_format=embedding_model_format, use_gpu = True,
+                            max_seq_len = max_seq_len )
     if check_streamlit:
         st.session_state['retriever'] = retriever
     return retriever

 @st.cache(hash_funcs={"builtins.SwigPyObject": lambda _: None},allow_output_mutation=True)
 def createDocumentStore(documents:List[Document], similarity:str = 'dot_product',
-                        …
+                        embedding_dim:int = 768):
     """
     Creates the InMemory Document Store from haystack list of Documents.
     It is mandatory component for Retriever to work in Haystack frame work.
@@ -185,15 +188,20 @@ def createDocumentStore(documents:List[Document], similarity:str = 'dot_product'
 @st.cache(hash_funcs={"builtins.SwigPyObject": lambda _: None},allow_output_mutation=True)
 def semanticSearchPipeline(documents:List[Document], embedding_model:Text = None,
                            useQueryCheck = True, embedding_model_format:Text = None,
+                           max_seq_len:int =512,embedding_dim:int = 768,
                            embedding_layer:int = None, retriever_top_k:int = 10,
-                           reader_model:str = None, reader_top_k:int = 10…
-                           …
+                           reader_model:str = None, reader_top_k:int = 10
+                           ):
     """
     creates the semantic search pipeline and document Store object from the
     list of haystack documents. The top_k for the Reader and Retirever are kept
     same, so that all the results returned by Retriever are used, however the
     context is extracted by Reader for each retrieved result. The querycheck is
-    added as node to process the query.…
+    added as node to process the query. This pipeline is suited for keyword search,
+    and to some extent extractive QA purpose. The purpose of Reader is strictly to
+    highlight the context for retrieved result and not for QA, however as stated
+    it can work for QA too in limited sense.
+
     1. https://docs.haystack.deepset.ai/docs/retriever#embedding-retrieval-recommended
     2. https://www.sbert.net/examples/applications/semantic-search/README.html
     3. https://github.com/deepset-ai/haystack/blob/main/haystack/nodes/retriever/dense.py
@@ -218,6 +226,8 @@ def semanticSearchPipeline(documents:List[Document], embedding_model:Text = Non
     embedding_dim: Document store has default value of embedding size = 768, and
                    update_embeddings method of Docstore cannot infer the embedding size of
                    retiever automaticallu, therefore set this value as per the model card.
+    max_seq_len:everymodel has max seq len it can handle, check in model card.
+                Needed to hanlde the edge cases


     Return
@@ -237,27 +247,28 @@ def semanticSearchPipeline(documents:List[Document], embedding_model:Text = Non
                               embedding_model_format=embedding_model_format,
                               embedding_layer=embedding_layer,
                               retriever_top_k= retriever_top_k,
-                              document_store = document_store…
+                              document_store = document_store,
+                              max_seq_len=max_seq_len)

     document_store.update_embeddings(retriever)
     reader = FARMReader(model_name_or_path=reader_model,
                         top_k = reader_top_k, use_gpu=True)
-    …
+    semantic_search_pipeline = Pipeline()
     if useQueryCheck:
         querycheck = QueryCheck()
-        …
+        semantic_search_pipeline.add_node(component = querycheck, name = "QueryCheck",
                                           inputs = ["Query"])
-        …
+        semantic_search_pipeline.add_node(component = retriever, name = "EmbeddingRetriever",
                                           inputs = ["QueryCheck.output_1"])
-        …
+        semantic_search_pipeline.add_node(component = reader, name = "FARMReader",
                                           inputs= ["EmbeddingRetriever"])
     else:
-        …
+        semantic_search_pipeline.add_node(component = retriever, name = "EmbeddingRetriever",
                                           inputs = ["Query"])
-        …
+        semantic_search_pipeline.add_node(component = reader, name = "FARMReader",
                                           inputs= ["EmbeddingRetriever"])

-    return …
+    return semantic_search_pipeline, document_store


 def semanticsearchAnnotator(matches: List[List[int]], document):
@@ -296,11 +307,12 @@ def semanticsearchAnnotator(matches: List[List[int]], document):
     print(annotated_text)


-def semantic_search(query:Text,documents:List[Document],embedding_model:Text,
+def semantic_keywordsearch(query:Text,documents:List[Document],embedding_model:Text,
                     embedding_model_format:Text,
                     embedding_layer:int, reader_model:str,
                     retriever_top_k:int = 10, reader_top_k:int = 10,
-                    return_results:bool = False, embedding_dim:int = 768…
+                    return_results:bool = False, embedding_dim:int = 768,
+                    max_seq_len:int = 512):
     """
     Performs the Semantic search on the List of haystack documents which is
     returned by preprocessing Pipeline.
@@ -316,7 +328,8 @@ def semantic_search(query:Text,documents:List[Document],embedding_model:Text,
                                      embedding_layer= embedding_layer,
                                      embedding_model_format= embedding_model_format,
                                      reader_model= reader_model, retriever_top_k= retriever_top_k,
-                                     reader_top_k= reader_top_k, embedding_dim=embedding_dim…
+                                     reader_top_k= reader_top_k, embedding_dim=embedding_dim,
+                                     max_seq_len=max_seq_len)

     results = semanticsearch_pipeline.run(query = query)
     if return_results:
@@ -328,10 +341,10 @@ def semantic_search(query:Text,documents:List[Document],embedding_model:Text,
         print("Top few semantic search results")
         for i,answer in enumerate(results['answers']):
             temp = answer.to_dict()
-            start_idx = temp['offsets_in_document'][0]['start']
-            end_idx = temp['offsets_in_document'][0]['end']
-            match = [[start_idx,end_idx]]
             doc = doc_store.get_document_by_id(temp['document_id']).content
+            start_idx = doc.find(temp['context'])
+            end_idx = start_idx + len(temp['context'])
+            match = [[start_idx,end_idx]]
             if check_streamlit:
                 st.write("Result {}".format(i+1))
             else:
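The last semantic_search.py hunk changes how highlight offsets are computed: instead of reading the Reader's offsets_in_document, the answer's context string is located in the full document text. A small sketch of the new computation; the document and context strings are illustrative:

```python
# Locate the Reader's context span inside the full document text.
doc = "Flooding has increased. Water scarcity affects agriculture badly."
context = "Water scarcity affects agriculture"

start_idx = doc.find(context)            # str.find returns -1 on a miss
end_idx = start_idx + len(context)
match = [[start_idx, end_idx]]
print(match, "->", doc[start_idx:end_idx])
```

Since find returns -1 when the context is absent (for example after whitespace normalization), a guard on start_idx before annotating would make both the sketch and the committed loop more robust.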