prashant committed on
Commit
f9949bb
·
1 Parent(s): fb4cce0

lexical update

Browse files
appStore/keyword_search.py CHANGED
@@ -47,12 +47,9 @@ def app():
47
  else:
48
  keywordList = None
49
 
50
- searchtype = st.selectbox("Do you want to find exact macthes or similar meaning/context",
 
51
  ['Exact Matches', 'Similar context/meaning'])
52
- # if searchtype == 'Similar context/meaning':
53
- # show_answers = st.sidebar.checkbox("Show context")
54
-
55
-
56
 
57
 
58
  with st.container():
@@ -61,33 +58,38 @@ def app():
61
  will look for these keywords in document".format(genre),
62
  value="{}".format(keywordList))
63
  else:
64
- queryList = st.text_input("Please enter here your question and we will look \
65
- for an answer in the document OR enter the keyword you \
66
- are looking for and we will \
67
- we will look for similar context \
68
- in the document.",
69
  placeholder="Enter keyword here")
70
 
71
  if st.button("Find them"):
72
 
73
  if queryList == "":
74
- st.info("🤔 No keyword provided, if you dont have any, please try example sets from sidebar!")
 
75
  logging.warning("Terminated as no keyword provided")
76
  else:
77
  if 'filepath' in st.session_state:
78
 
 
79
  if searchtype == 'Exact Matches':
80
- paraList = runLexicalPreprocessingPipeline()
 
 
81
  logging.info("performing lexical search")
82
- with st.spinner("Performing Exact matching search (Lexical search) for you"):
 
83
  st.markdown("##### Top few lexical search (TFIDF) hits #####")
84
- lexical_search(queryList,paraList)
85
  else:
86
-
87
- paraList = runSemanticPreprocessingPipeline()
88
- logging.info("starting semantic search")
89
- with st.spinner("Performing Similar/Contextual search"):
90
- semantic_search(queryList,paraList)
91
 
92
  else:
93
  st.info("🤔 No document found, please try to upload it at the sidebar!")
 
47
  else:
48
  keywordList = None
49
 
50
+ searchtype = st.selectbox("Do you want to find exact macthes or similar \
51
+ meaning/context",
52
  ['Exact Matches', 'Similar context/meaning'])
 
 
 
 
53
 
54
 
55
  with st.container():
 
58
  will look for these keywords in document".format(genre),
59
  value="{}".format(keywordList))
60
  else:
61
+ queryList = st.text_input("Please enter here your question and we \
62
+ will look for an answer in the document\
63
+ OR enter the keyword you are looking \
64
+ for and we will we will look for similar\
65
+ context in the document.",
66
  placeholder="Enter keyword here")
67
 
68
  if st.button("Find them"):
69
 
70
  if queryList == "":
71
+ st.info("🤔 No keyword provided, if you dont have any, \
72
+ please try example sets from sidebar!")
73
  logging.warning("Terminated as no keyword provided")
74
  else:
75
  if 'filepath' in st.session_state:
76
 
77
+
78
  if searchtype == 'Exact Matches':
79
+ allDocuments = runLexicalPreprocessingPipeline(
80
+ st.session_state['filepath'],
81
+ st.session_state['filename'])
82
  logging.info("performing lexical search")
83
+ with st.spinner("Performing Exact matching search \
84
+ (Lexical search) for you"):
85
  st.markdown("##### Top few lexical search (TFIDF) hits #####")
86
+ lexical_search(queryList,allDocuments['documents'])
87
  else:
88
+ pass
89
+ # paraList = runSemanticPreprocessingPipeline()
90
+ # logging.info("starting semantic search")
91
+ # with st.spinner("Performing Similar/Contextual search"):
92
+ # semantic_search(queryList,paraList)
93
 
94
  else:
95
  st.info("🤔 No document found, please try to upload it at the sidebar!")
utils/lexical_search.py CHANGED
@@ -1,20 +1,67 @@
1
  from haystack.nodes import TfidfRetriever
2
- from haystack.nodes.base import BaseComponent
3
  from haystack.document_stores import InMemoryDocumentStore
4
- import configparser
5
  import spacy
6
  import re
7
  from spacy.matcher import Matcher
 
8
  import streamlit as st
9
  from markdown import markdown
10
  from annotated_text import annotation
11
  from haystack.schema import Document
12
  from typing import List, Text
13
  from utils.preprocessing import processingpipeline
14
- from haystack.pipelines import Pipeline
 
 
15
 
 
 
 
 
16
  config = configparser.ConfigParser()
17
- config.read_file(open('paramconfig.cfg'))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
18
 
19
 
20
  def tokenize_lexical_query(query:str)-> List[str]:
@@ -100,61 +147,56 @@ def runRegexMatcher(token_list:List[str], document:Text):
100
 
101
  return matches, document
102
 
103
- def lexicalsearchAnnotator(matches: List[List[int]], document):
104
  """
 
105
  Annotates the text in the document defined by list of [start index, end index]
106
  Example: "How are you today", if document type is text, matches = [[0,3]]
107
  will give answer = "How", however in case we used the spacy matcher then the
108
  matches = [[0,3]] will give answer = "How are you". However if spacy is used
109
  to find "How" then the matches = [[0,1]] for the string defined above.
110
 
 
 
 
 
 
 
 
 
 
 
 
111
  """
112
  start = 0
113
  annotated_text = ""
114
  for match in matches:
115
  start_idx = match[0]
116
  end_idx = match[1]
117
- annotated_text = (annotated_text + document[start:start_idx].text
118
- + str(annotation(body=document[start_idx:end_idx].text,
119
- label="ANSWER", background="#964448", color='#ffffff')))
 
 
 
 
 
 
 
 
120
  start = end_idx
121
 
122
  annotated_text = annotated_text + document[end_idx:].text
123
-
124
- st.write(
125
- markdown(annotated_text),
126
- unsafe_allow_html=True,
127
- )
128
 
129
- def runLexicalPreprocessingPipeline()->List[Document]:
130
- """
131
- creates the pipeline and runs the preprocessing pipeline,
132
- the params for pipeline are fetched from paramconfig
133
 
134
- Return
135
- --------------
136
- List[Document]: When preprocessing pipeline is run, the output dictionary
137
- has four objects. For the lexicaal search using TFIDFRetriever we
138
- need to use the List of Haystack Document, which can be fetched by
139
- key = 'documents' on output.
140
-
141
- """
142
- file_path = st.session_state['filepath']
143
- file_name = st.session_state['filename']
144
- lexical_processing_pipeline = processingpipeline()
145
- split_by = config.get('lexical_search','SPLIT_BY')
146
- split_length = int(config.get('lexical_search','SPLIT_LENGTH'))
147
- split_overlap = int(config.get('lexical_search','SPLIT_OVERLAP'))
148
-
149
- output_lexical_pre = lexical_processing_pipeline.run(file_paths = file_path,
150
- params= {"FileConverter": {"file_path": file_path, \
151
- "file_name": file_name},
152
- "UdfPreProcessor": {"removePunc": False, \
153
- "split_by": split_by, \
154
- "split_length":split_length,\
155
- "split_overlap": split_overlap}})
156
 
157
- return output_lexical_pre['documents']
 
 
 
 
 
158
 
159
  def lexical_search(query:Text,documents:List[Document]):
160
  """
@@ -164,7 +206,7 @@ def lexical_search(query:Text,documents:List[Document]):
164
  Params
165
  -------
166
  query: Keywords that need to be searche in documents.
167
- documents: List fo Haystack documents returned by preprocessing pipeline.
168
 
169
  """
170
 
@@ -177,9 +219,11 @@ def lexical_search(query:Text,documents:List[Document]):
177
  top_k= int(config.get('lexical_search','TOP_K')))
178
  query_tokens = tokenize_lexical_query(query)
179
  for count, result in enumerate(results):
180
- # if result.content != "":
181
  matches, doc = runSpacyMatcher(query_tokens,result.content)
182
  if len(matches) != 0:
183
- st.write("Result {}".format(count+1))
184
- lexicalsearchAnnotator(matches, doc)
 
 
 
185
 
 
1
  from haystack.nodes import TfidfRetriever
 
2
  from haystack.document_stores import InMemoryDocumentStore
 
3
  import spacy
4
  import re
5
  from spacy.matcher import Matcher
6
+ from termcolor import colored
7
  import streamlit as st
8
  from markdown import markdown
9
  from annotated_text import annotation
10
  from haystack.schema import Document
11
  from typing import List, Text
12
  from utils.preprocessing import processingpipeline
13
+ from utils.streamlitcheck import check_streamlit
14
+ import configparser
15
+ import logging
16
 
17
# Streamlit is optional: when it is missing, `st` is bound to None so the
# error handling below can test for it instead of raising NameError.
try:
    import streamlit as st
except ImportError:
    st = None
    logging.info("Streamlit not installed")

# Parameters (e.g. section 'lexical_search') are read from paramconfig.cfg,
# resolved relative to the current working directory.
config = configparser.ConfigParser()
try:
    # `with` guarantees the file handle is closed even if parsing fails.
    with open('paramconfig.cfg') as config_file:
        config.read_file(config_file)
except (OSError, configparser.Error):
    # Missing/unreadable file or malformed content: keep the module importable
    # and tell the user; later config.get() calls will fail explicitly.
    logging.info("paramconfig file not found")
    if st is not None:
        st.info("Please place the paramconfig file in the same directory as app.py")
27
+
28
+
29
def runLexicalPreprocessingPipeline(file_path, file_name) -> dict:
    """
    Creates the preprocessing pipeline and runs it on the given file; the
    splitting params for the pipeline are fetched from paramconfig
    (section 'lexical_search').

    Params
    ------------

    file_name: filename, in case of streamlit application use
               st.session_state['filename']
    file_path: filepath, in case of streamlit application use
               st.session_state['filepath']

    Return
    --------------
    dict: the complete output dictionary of the preprocessing pipeline run.
        For the lexical search using TFIDFRetriever the List of Haystack
        Document is needed, which can be fetched by key = 'documents' on
        this output.

    """

    lexical_processing_pipeline = processingpipeline()
    split_by = config.get('lexical_search', 'SPLIT_BY')
    split_length = config.getint('lexical_search', 'SPLIT_LENGTH')
    split_overlap = config.getint('lexical_search', 'SPLIT_OVERLAP')

    # Per-node parameters, keyed by the pipeline node names.
    params = {
        "FileConverter": {
            "file_path": file_path,
            "file_name": file_name,
        },
        "UdfPreProcessor": {
            # Punctuation is kept for the lexical search.
            "removePunc": False,
            "split_by": split_by,
            "split_length": split_length,
            "split_overlap": split_overlap,
        },
    }

    output_lexical_pre = lexical_processing_pipeline.run(
        file_paths=file_path, params=params)

    # The whole output dict is returned (not only the documents); callers
    # pick the entry they need, e.g. output['documents'].
    return output_lexical_pre
65
 
66
 
67
  def tokenize_lexical_query(query:str)-> List[str]:
 
147
 
148
  return matches, document
149
 
150
def spacyAnnotator(matches: List[List[int]], document: "spacy.tokens.Doc"):
    """
    This is spacy Annotator and needs a spacy Doc.
    Annotates the text in the document defined by list of [start index, end index]
    Example: "How are you today", if document type is text, matches = [[0,3]]
    will give answer = "How", however in case we used the spacy matcher then the
    matches = [[0,3]] will give answer = "How are you". However if spacy is used
    to find "How" then the matches = [[0,1]] for the string defined above.

    Params
    -----------
    matches: As mentioned its list of list. Example [[0,1],[10,13]].
        The spans are consumed in the given order; an empty list results in
        the un-annotated document text being written out.
    document: spacy Doc which needs to be indexed (token based slicing).


    Return
    --------
    Nothing is returned; the output is sent either to the app front end
    using streamlit or written directly to the output screen (print).

    """
    # Decide the output channel once; it cannot change during a single call.
    use_streamlit = check_streamlit()

    start = 0
    pieces = []
    for match in matches:
        start_idx = match[0]
        end_idx = match[1]

        # Plain text between the previous match and this one.
        pieces.append(document[start:start_idx].text)

        matched_text = document[start_idx:end_idx].text
        if use_streamlit:
            pieces.append(str(annotation(body=matched_text,
                        label="ANSWER", background="#964448", color='#ffffff')))
        else:
            pieces.append(colored(matched_text, "green", attrs=['bold']))

        start = end_idx

    # `start` equals the end of the last match (or 0 when there were no
    # matches), so the tail is well defined even for an empty `matches`.
    pieces.append(document[start:].text)
    annotated_text = "".join(pieces)

    if use_streamlit:
        st.write(
            markdown(annotated_text),
            unsafe_allow_html=True,
        )
    else:
        print(annotated_text)
200
 
201
  def lexical_search(query:Text,documents:List[Document]):
202
  """
 
206
  Params
207
  -------
208
  query: Keywords that need to be searche in documents.
209
+ documents: List of Haystack documents returned by preprocessing pipeline.
210
 
211
  """
212
 
 
219
  top_k= int(config.get('lexical_search','TOP_K')))
220
  query_tokens = tokenize_lexical_query(query)
221
  for count, result in enumerate(results):
 
222
  matches, doc = runSpacyMatcher(query_tokens,result.content)
223
  if len(matches) != 0:
224
+ if check_streamlit():
225
+ st.write("Result {}".format(count+1))
226
+ else:
227
+ print("Results {}".format(count +1))
228
+ spacyAnnotator(matches, doc)
229
 
utils/sdg_classifier.py CHANGED
@@ -56,7 +56,7 @@ def sdg_classification(haystackdoc:List[Document])->Tuple[DataFrame,Series]:
56
  the number of times it is covered/discussed/count_of_paragraphs.
57
 
58
  """
59
- logging.info("running SDG classifiication")
60
  threshold = float(config.get('sdg','THRESHOLD'))
61
 
62
 
@@ -83,7 +83,7 @@ def runSDGPreprocessingPipeline(file_path, file_name)->List[Document]:
83
  creates the pipeline and runs the preprocessing pipeline,
84
  the params for pipeline are fetched from paramconfig
85
 
86
- Param
87
  ------------
88
 
89
  file_name: filename, in case of streamlit application use
 
56
  the number of times it is covered/discussed/count_of_paragraphs.
57
 
58
  """
59
+ logging.info("Working on SDG Classification")
60
  threshold = float(config.get('sdg','THRESHOLD'))
61
 
62
 
 
83
  creates the pipeline and runs the preprocessing pipeline,
84
  the params for pipeline are fetched from paramconfig
85
 
86
+ Params
87
  ------------
88
 
89
  file_name: filename, in case of streamlit application use
utils/streamlitcheck.py ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
def check_streamlit():
    """
    Function to check whether python code is run within streamlit

    The script-run-context accessor has lived in different modules across
    streamlit releases, so the known locations are tried in turn.

    Returns
    -------
    use_streamlit : boolean
        True if code is run within streamlit, else False (also False when
        streamlit is not installed or the accessor cannot be located).
    """
    import importlib

    # Newest location first, then the older one.
    candidate_modules = (
        "streamlit.runtime.scriptrunner",
        "streamlit.scriptrunner.script_run_context",
    )

    get_script_run_ctx = None
    for module_name in candidate_modules:
        try:
            module = importlib.import_module(module_name)
        except ImportError:
            # streamlit missing, or this layout is not the installed one.
            continue
        get_script_run_ctx = getattr(module, "get_script_run_ctx", None)
        if get_script_run_ctx is not None:
            break

    if get_script_run_ctx is None:
        return False

    # A falsy context means there is no active streamlit script run.
    use_streamlit = bool(get_script_run_ctx())
    return use_streamlit
19
+