prashant committed on
Commit
4a20529
·
1 Parent(s): 570b6e4

lexical search haystack plus spacy

Browse files
Files changed (3) hide show
  1. requirements.txt +2 -1
  2. utils/search.py +58 -135
  3. ver0.1 scripts/search.py +141 -0
requirements.txt CHANGED
@@ -11,6 +11,7 @@ pdfplumber==0.6.2
11
  Pillow==9.1.1
12
  seaborn==0.11.2
13
  transformers==4.21.2
14
- rank_bm25
 
15
  python-docx
16
  streamlit_option_menu
 
11
  Pillow==9.1.1
12
  seaborn==0.11.2
13
  transformers==4.21.2
14
+ st-annotated-text
15
+ markdown
16
  python-docx
17
  streamlit_option_menu
utils/search.py CHANGED
@@ -1,145 +1,68 @@
1
- import glob, os, sys; sys.path.append('../udfPreprocess')
2
-
3
- #import helper
4
- import udfPreprocess.docPreprocessing as pre
5
- import udfPreprocess.cleaning as clean
6
-
7
- #import needed libraries
8
- import seaborn as sns
9
- from pandas import DataFrame
10
- from sentence_transformers import SentenceTransformer, CrossEncoder, util
11
- # from keybert import KeyBERT
12
- from transformers import pipeline
13
- import matplotlib.pyplot as plt
14
- import numpy as np
15
- import streamlit as st
16
- import pandas as pd
17
- from rank_bm25 import BM25Okapi
18
- from sklearn.feature_extraction import _stop_words
19
- import string
20
- from tqdm.autonotebook import tqdm
21
- import numpy as np
22
- import docx
23
- from docx.shared import Inches
24
- from docx.shared import Pt
25
- from docx.enum.style import WD_STYLE_TYPE
26
- import logging
27
- logger = logging.getLogger(__name__)
28
- import tempfile
29
- import sqlite3
30
  import configparser
 
 
 
 
 
 
31
 
32
- ### These are lexcial search related functions/methods#####
33
-
34
- def bm25_tokenizer(text):
35
- tokenized_doc = []
36
- for token in text.lower().split():
37
- token = token.strip(string.punctuation)
38
-
39
- if len(token) > 0 and token not in _stop_words.ENGLISH_STOP_WORDS:
40
- tokenized_doc.append(token)
41
- return tokenized_doc
42
-
43
- def bm25TokenizeDoc(paraList):
44
- tokenized_corpus = []
45
- ##########Commenting this for now########### will incorporate paragrpah splitting later.
46
- # for passage in tqdm(paraList):
47
- # if len(passage.split()) >256:
48
- # # st.write("Splitting")
49
- # temp = " ".join(passage.split()[:256])
50
- # tokenized_corpus.append(bm25_tokenizer(temp))
51
- # temp = " ".join(passage.split()[256:])
52
- # tokenized_corpus.append(bm25_tokenizer(temp))
53
- # else:
54
- # tokenized_corpus.append(bm25_tokenizer(passage))
55
- ######################################################################################33333
56
- for passage in tqdm(paraList):
57
- tokenized_corpus.append(bm25_tokenizer(passage))
58
-
59
- return tokenized_corpus
60
 
61
- def lexical_search(keyword, document_bm25):
62
- config = configparser.ConfigParser()
63
- config.read_file(open('udfPreprocess/paramconfig.cfg'))
64
- top_k = int(config.get('lexical_search','TOP_K'))
65
- bm25_scores = document_bm25.get_scores(bm25_tokenizer(keyword))
66
- top_n = np.argpartition(bm25_scores, -top_k)[-top_k:]
67
- bm25_hits = [{'corpus_id': idx, 'score': bm25_scores[idx]} for idx in top_n]
68
- bm25_hits = sorted(bm25_hits, key=lambda x: x['score'], reverse=True)
69
- return bm25_hits
70
 
71
- @st.cache(allow_output_mutation=True)
72
- def load_sentenceTransformer(name):
73
- return SentenceTransformer(name)
 
74
 
 
 
 
 
 
 
 
75
 
76
- def semantic_search(keywordlist,paraList):
 
 
77
 
78
- ##### Sematic Search #####
79
- #query = "Does document contain {} issues ?".format(keyword)
80
- config = configparser.ConfigParser()
81
- config.read_file(open('udfPreprocess/paramconfig.cfg'))
82
- model_name = config.get('semantic_search','MODEL_NAME')
83
 
84
- bi_encoder = load_sentenceTransformer(model_name)
85
- bi_encoder.max_seq_length = int(config.get('semantic_search','MAX_SEQ_LENGTH')) #Truncate long passages to 256 tokens
86
- top_k = int(config.get('semantic_search','TOP_K'))
87
- document_embeddings = bi_encoder.encode(paraList, convert_to_tensor=True, show_progress_bar=False)
88
- question_embedding = bi_encoder.encode(keywordlist, convert_to_tensor=True)
89
-
90
- hits = util.semantic_search(question_embedding, document_embeddings, top_k=top_k)
91
 
92
- return hits
93
-
94
- def show_results(keywordList):
95
- document = docx.Document()
96
- # document.add_heading('Document name:{}'.format(file_name), 2)
97
- section = document.sections[0]
98
-
99
- # Calling the footer
100
- footer = section.footer
101
-
102
- # Calling the paragraph already present in
103
- # the footer section
104
- footer_para = footer.paragraphs[0]
105
-
106
- font_styles = document.styles
107
- font_charstyle = font_styles.add_style('CommentsStyle', WD_STYLE_TYPE.CHARACTER)
108
- font_object = font_charstyle.font
109
- font_object.size = Pt(7)
110
- # Adding the centered zoned footer
111
- footer_para.add_run('''\tPowered by GIZ Data and the Sustainable Development Solution Network hosted at Hugging-Face spaces: https://huggingface.co/spaces/ppsingh/streamlit_dev''', style='CommentsStyle')
112
- document.add_heading('Your Seacrhed for {}'.format(keywordList), level=1)
113
- for keyword in keywordList:
114
-
115
- st.write("Results for Query: {}".format(keyword))
116
- para = document.add_paragraph().add_run("Results for Query: {}".format(keyword))
117
- para.font.size = Pt(12)
118
- bm25_hits, hits = search(keyword)
119
-
120
- st.markdown("""
121
- We will provide with 2 kind of results. The 'lexical search' and the semantic search.
122
- """)
123
- # In the semantic search part we provide two kind of results one with only Retriever (Bi-Encoder) and other the ReRanker (Cross Encoder)
124
- st.markdown("Top few lexical search (BM25) hits")
125
- document.add_paragraph("Top few lexical search (BM25) hits")
126
-
127
- for hit in bm25_hits[0:5]:
128
- if hit['score'] > 0.00:
129
- st.write("\t Score: {:.3f}: \t{}".format(hit['score'], paraList[hit['corpus_id']].replace("\n", " ")))
130
- document.add_paragraph("\t Score: {:.3f}: \t{}".format(hit['score'], paraList[hit['corpus_id']].replace("\n", " ")))
131
-
132
-
133
-
134
- # st.table(bm25_hits[0:3])
135
-
136
- st.markdown("\n-------------------------\n")
137
- st.markdown("Top few Bi-Encoder Retrieval hits")
138
- document.add_paragraph("\n-------------------------\n")
139
- document.add_paragraph("Top few Bi-Encoder Retrieval hits")
140
 
141
- hits = sorted(hits, key=lambda x: x['score'], reverse=True)
142
- for hit in hits[0:5]:
143
- # if hit['score'] > 0.45:
144
- st.write("\t Score: {:.3f}: \t{}".format(hit['score'], paraList[hit['corpus_id']].replace("\n", " ")))
145
- document.add_paragraph("\t Score: {:.3f}: \t{}".format(hit['score'], paraList[hit['corpus_id']].replace("\n", " ")))
 
1
+ from haystack.nodes import TfidfRetriever
2
+ from haystack.document_stores import InMemoryDocumentStore
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
  import configparser
4
+ import spacy
5
+ import re
6
+ from spacy.matcher import Matcher
7
+ import streamlit as st
8
+ from markdown import markdown
9
+ from annotated_text import annotation
10
 
11
# Load the runtime parameters once, at import time.  The path is relative to
# the process working directory and the file is parsed by configparser (INI
# syntax) even though it carries a ``.py`` extension.
config = configparser.ConfigParser()
with open('paramconfig.py') as config_file:
    # The context manager closes the handle that a bare open() would leak.
    config.read_file(config_file)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13
 
 
 
 
 
 
 
 
 
 
14
 
15
def tokenize_lexical_query(query):
    """Split a free-text query into lower-cased keyword tokens.

    The query is tokenized with spaCy's ``en_core_web_sm`` pipeline.  Stop
    words are discarded, and so are punctuation and whitespace tokens:
    otherwise a query such as ``"what is adaptation?"`` would yield ``"?"``
    as a keyword and every question mark in a document would be treated as
    a hit.

    :param query: the search string typed by the user.
    :return: list of lower-cased token texts, in query order.
    """
    nlp = spacy.load("en_core_web_sm")
    token_list = [
        token.text.lower()
        for token in nlp(query)
        if not (token.is_stop or token.is_punct or token.is_space)
    ]
    return token_list
19
 
20
def runSpacyMatcher(token_list, document):
    """Locate every occurrence of the query tokens in ``document`` with spaCy.

    Each token becomes a single-token ``LOWER`` pattern, so matching is
    case-insensitive and happens on whole spaCy tokens.

    :param token_list: lower-cased keywords to look for.
    :param document: raw text of the document to scan.
    :return: ``(matches, spacydoc)`` where ``matches`` is a list of
        ``[start, end]`` *token* indices (end exclusive) into ``spacydoc``,
        the parsed spaCy ``Doc`` of ``document``.
    """
    # lexical_search calls this once per retrieved hit; loading the model is
    # by far the most expensive step, so keep the pipeline on the function
    # object and reuse it on later calls.
    nlp = getattr(runSpacyMatcher, "_nlp", None)
    if nlp is None:
        nlp = spacy.load("en_core_web_sm")
        runSpacyMatcher._nlp = nlp

    spacydoc = nlp(document)

    # Nothing to search for (e.g. the query consisted only of stop words).
    if not token_list:
        return [], spacydoc

    matcher = Matcher(nlp.vocab)
    token_pattern = [[{"LOWER": token}] for token in token_list]
    matcher.add(",".join(token_list), token_pattern)

    # The match id is irrelevant here: all patterns share a single key.
    matches = [[start, end] for _match_id, start, end in matcher(spacydoc)]

    return matches, spacydoc
 
 
 
 
33
 
34
def runRegexMatcher(token_list, document):
    """Locate every occurrence of the query tokens in ``document`` by substring search.

    Tokens are matched literally (regex metacharacters are escaped) and
    case-insensitively, because the tokens are lower-cased while the
    document keeps its original casing.  Empty tokens are skipped, since an
    empty pattern would match at every position.

    :param token_list: keywords to look for.
    :param document: raw text of the document to scan.
    :return: ``(matches, document)`` where ``matches`` is a list of
        ``[start, end]`` *character* offsets (end exclusive) sorted by
        position, and ``document`` is the unchanged input text.
    """
    matches = []
    for token in token_list:
        if not token:
            continue
        pattern = re.compile(re.escape(token), re.IGNORECASE)
        matches.extend([hit.start(), hit.end()] for hit in pattern.finditer(document))

    # Return the hits in reading order instead of grouped per token, so that
    # they can be consumed left to right.
    matches.sort()

    return matches, document
40
+
41
def searchAnnotator(matches, document):
    """Render ``document`` in Streamlit with every match highlighted.

    Works with both matcher flavours found in this module: character offsets
    into a plain ``str`` and token offsets into a spaCy ``Doc``.

    :param matches: ``[start, end]`` pairs (end exclusive) indexing into
        ``document``, expected in ascending order.
    :param document: the text, or the spaCy ``Doc``, that the offsets refer to.
    :return: None; the annotated text is written to the Streamlit page.
    """

    def _as_text(segment):
        # Slicing a spaCy Doc gives a Span, which cannot be concatenated to a
        # str; text_with_ws keeps the spacing between neighbouring pieces.
        # Plain str slices have no such attribute and are used unchanged.
        return getattr(segment, "text_with_ws", segment)

    pieces = []
    start = 0
    for start_idx, end_idx in matches:
        # An overlapping or out-of-order match would repeat text that has
        # already been emitted, so skip it.
        if start_idx < start:
            continue
        hit = _as_text(document[start_idx:end_idx])
        body = hit.rstrip()
        pieces.append(_as_text(document[start:start_idx]))
        pieces.append(str(annotation(body=body, label="ANSWER", background="#964448", color='#ffffff')))
        # Put back the whitespace that followed the highlighted word(s).
        pieces.append(hit[len(body):])
        start = end_idx

    # Text after the last match; the whole document when nothing matched.
    pieces.append(_as_text(document[start:]))
    annotated_text = "".join(pieces)

    # NOTE(review): the document text is emitted as raw HTML; confirm that
    # uploaded documents are trusted or escape them before rendering.
    st.write(
        markdown(annotated_text),
        unsafe_allow_html=True,
    )
54
+
55
def lexical_search(query, documents):
    """Run a TF-IDF search for ``query`` and display the highlighted hits.

    The documents are written into a fresh in-memory haystack store, the
    best ``TOP_K`` of them (``lexical_search`` section of the module-level
    config) are retrieved, and each hit is rendered with the query keywords
    annotated.

    :param query: the search string typed by the user.
    :param documents: documents accepted by
        ``InMemoryDocumentStore.write_documents``.
    :return: None; results are written to the Streamlit page.
    """
    store = InMemoryDocumentStore()
    store.write_documents(documents)

    tfidf = TfidfRetriever(store)
    limit = int(config.get('lexical_search','TOP_K'))
    hits = tfidf.retrieve(query=query, top_k=limit)

    keywords = tokenize_lexical_query(query)
    for hit in hits:
        spans, parsed = runSpacyMatcher(keywords, hit.content)
        searchAnnotator(spans, parsed)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
66
 
67
+
68
+
 
 
 
ver0.1 scripts/search.py ADDED
@@ -0,0 +1,141 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import glob, os, sys; sys.path.append('../utils')
2
+
3
+ #import needed libraries
4
+ import seaborn as sns
5
+ from pandas import DataFrame
6
+ from sentence_transformers import SentenceTransformer, CrossEncoder, util
7
+ # from keybert import KeyBERT
8
+ from transformers import pipeline
9
+ import matplotlib.pyplot as plt
10
+ import numpy as np
11
+ import streamlit as st
12
+ import pandas as pd
13
+ from rank_bm25 import BM25Okapi
14
+ from sklearn.feature_extraction import _stop_words
15
+ import string
16
+ from tqdm.autonotebook import tqdm
17
+ import numpy as np
18
+ import docx
19
+ from docx.shared import Inches
20
+ from docx.shared import Pt
21
+ from docx.enum.style import WD_STYLE_TYPE
22
+ import logging
23
+ logger = logging.getLogger(__name__)
24
+ import tempfile
25
+ import sqlite3
26
+ import configparser
27
+
28
+ ### These are lexical search related functions #####
29
+
30
def bm25_tokenizer(text):
    """Turn ``text`` into the token list used for BM25 scoring.

    The text is lower-cased and split on whitespace; leading and trailing
    punctuation is stripped from every piece, and pieces that end up empty
    or that are English stop words are dropped.

    :param text: passage or query string.
    :return: list of cleaned tokens, in their original order.
    """
    stripped = (piece.strip(string.punctuation) for piece in text.lower().split())
    return [
        piece
        for piece in stripped
        if piece and piece not in _stop_words.ENGLISH_STOP_WORDS
    ]
38
+
39
def bm25TokenizeDoc(paraList):
    """Tokenize every passage of a document for BM25 indexing.

    :param paraList: iterable of passage strings.
    :return: list holding one ``bm25_tokenizer`` token list per passage,
        in the same order as ``paraList``.
    """
    # TODO: split passages longer than 256 words into separate chunks before
    # tokenizing; planned but not implemented yet.
    return [bm25_tokenizer(passage) for passage in tqdm(paraList)]
56
+
57
def lexical_search(keyword, document_bm25):
    """Score all passages against ``keyword`` with BM25 and return the best ones.

    :param keyword: query string; tokenized with ``bm25_tokenizer``.
    :param document_bm25: BM25 index exposing ``get_scores(tokens)``.
    :return: list of ``{'corpus_id': index, 'score': value}`` dicts sorted by
        descending score.  It holds at most ``TOP_K`` entries
        (``lexical_search`` section of ``udfPreprocess/paramconfig.cfg``)
        and never more than the number of scored passages.
    """
    config = configparser.ConfigParser()
    with open('udfPreprocess/paramconfig.cfg') as config_file:
        # Close the handle instead of leaking it on every search.
        config.read_file(config_file)
    top_k = int(config.get('lexical_search','TOP_K'))

    bm25_scores = document_bm25.get_scores(bm25_tokenizer(keyword))

    # np.argpartition raises ValueError when asked for more elements than
    # there are scores, which happens for documents with fewer passages
    # than TOP_K; clamp the request instead.
    top_k = min(top_k, len(bm25_scores))
    if top_k <= 0:
        return []

    top_n = np.argpartition(bm25_scores, -top_k)[-top_k:]
    bm25_hits = [{'corpus_id': idx, 'score': bm25_scores[idx]} for idx in top_n]
    # argpartition only separates the top group; order it by score here.
    bm25_hits.sort(key=lambda hit: hit['score'], reverse=True)
    return bm25_hits
66
+
67
@st.cache(allow_output_mutation=True)
def load_sentenceTransformer(name):
    """Build the ``SentenceTransformer`` called ``name``.

    The function is wrapped in ``st.cache``; ``allow_output_mutation=True``
    lets callers modify the returned model object.

    :param name: model identifier handed to ``SentenceTransformer``.
    :return: the model instance.
    """
    model = SentenceTransformer(name)
    return model
70
+
71
+
72
def semantic_search(keywordlist,paraList):
    """Rank passages against the queries by bi-encoder embedding similarity.

    Model name, maximum sequence length and ``TOP_K`` are read from the
    ``semantic_search`` section of ``udfPreprocess/paramconfig.cfg``.

    :param keywordlist: query string(s) to encode.
    :param paraList: passages to search in.
    :return: the result of ``sentence_transformers.util.semantic_search``
        for the encoded queries and passages.
    """
    config = configparser.ConfigParser()
    with open('udfPreprocess/paramconfig.cfg') as config_file:
        # Close the handle instead of leaking it on every search.
        config.read_file(config_file)
    model_name = config.get('semantic_search','MODEL_NAME')

    bi_encoder = load_sentenceTransformer(model_name)
    # Longer inputs are truncated to the configured MAX_SEQ_LENGTH.
    bi_encoder.max_seq_length = int(config.get('semantic_search','MAX_SEQ_LENGTH'))
    top_k = int(config.get('semantic_search','TOP_K'))

    document_embeddings = bi_encoder.encode(paraList, convert_to_tensor=True, show_progress_bar=False)
    question_embedding = bi_encoder.encode(keywordlist, convert_to_tensor=True)

    hits = util.semantic_search(question_embedding, document_embeddings, top_k=top_k)

    return hits
89
+
90
def show_results(keywordList):
    """Show lexical and semantic search results for each query.

    For every keyword the top hits are written to the Streamlit page and
    mirrored into an in-memory docx document with a small-print footer.

    NOTE(review): ``search`` and ``paraList`` are not defined anywhere in the
    visible part of this file; they presumably come from the importing
    script. Confirm before reuse.
    NOTE(review): the docx document is built but neither saved nor returned.

    :param keywordList: iterable of query strings.
    :return: None.
    """
    hit_template = "\t Score: {:.3f}: \t{}"

    def report_hit(hit):
        # Same line goes to the page and to the docx report.
        line = hit_template.format(hit['score'], paraList[hit['corpus_id']].replace("\n", " "))
        st.write(line)
        document.add_paragraph(line)

    def report_heading(text):
        # Section headings go to the page as markdown and to the docx as text.
        st.markdown(text)
        document.add_paragraph(text)

    document = docx.Document()
    # The footer of the first section already contains one paragraph.
    footer_para = document.sections[0].footer.paragraphs[0]

    # Dedicated 7pt character style for the footer credit line.
    comments_style = document.styles.add_style('CommentsStyle', WD_STYLE_TYPE.CHARACTER)
    comments_style.font.size = Pt(7)
    footer_para.add_run('''\tPowered by GIZ Data and the Sustainable Development Solution Network hosted at Hugging-Face spaces: https://huggingface.co/spaces/ppsingh/streamlit_dev''', style='CommentsStyle')
    document.add_heading('Your Seacrhed for {}'.format(keywordList), level=1)

    for keyword in keywordList:
        query_title = "Results for Query: {}".format(keyword)
        st.write(query_title)
        title_run = document.add_paragraph().add_run(query_title)
        title_run.font.size = Pt(12)
        bm25_hits, hits = search(keyword)

        st.markdown("""
        We will provide with 2 kind of results. The 'lexical search' and the semantic search.
        """)

        report_heading("Top few lexical search (BM25) hits")
        for hit in bm25_hits[0:5]:
            # Zero-score passages share no term with the query.
            if hit['score'] > 0.00:
                report_hit(hit)

        st.markdown("\n-------------------------\n")
        st.markdown("Top few Bi-Encoder Retrieval hits")
        document.add_paragraph("\n-------------------------\n")
        document.add_paragraph("Top few Bi-Encoder Retrieval hits")

        ranked = sorted(hits, key=lambda entry: entry['score'], reverse=True)
        for hit in ranked[0:5]:
            report_hit(hit)