prashant committed
Commit: 4a20529
Parent(s): 570b6e4

lexical search haystack plus spacy

Files changed:
- requirements.txt +2 -1
- utils/search.py +58 -135
- ver0.1 scripts/search.py +141 -0
requirements.txt CHANGED
@@ -11,6 +11,7 @@ pdfplumber==0.6.2
 Pillow==9.1.1
 seaborn==0.11.2
 transformers==4.21.2
-
+st-annotated-text
+markdown
 python-docx
 streamlit_option_menu
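The new entries cover the Streamlit-side rendering added in utils/search.py: st-annotated-text supplies the highlight markup and markdown converts it to HTML. The rewritten module also imports haystack and spacy and loads the en_core_web_sm pipeline, which is a downloadable model rather than a pinned pip requirement; if it is not installed elsewhere, a startup helper along these lines could fetch it. This is a hypothetical sketch, not part of the commit.

# Hypothetical startup helper (not in this commit): make sure the spaCy model
# used by utils/search.py is available before the app calls spacy.load().
import spacy
from spacy.cli import download

def ensure_spacy_model(name: str = "en_core_web_sm"):
    try:
        return spacy.load(name)          # model already installed
    except OSError:
        download(name)                   # fetch the model package once
        return spacy.load(name)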
utils/search.py CHANGED
@@ -1,145 +1,68 @@
-
-
-#import helper
-import udfPreprocess.docPreprocessing as pre
-import udfPreprocess.cleaning as clean
-
-#import needed libraries
-import seaborn as sns
-from pandas import DataFrame
-from sentence_transformers import SentenceTransformer, CrossEncoder, util
-# from keybert import KeyBERT
-from transformers import pipeline
-import matplotlib.pyplot as plt
-import numpy as np
-import streamlit as st
-import pandas as pd
-from rank_bm25 import BM25Okapi
-from sklearn.feature_extraction import _stop_words
-import string
-from tqdm.autonotebook import tqdm
-import numpy as np
-import docx
-from docx.shared import Inches
-from docx.shared import Pt
-from docx.enum.style import WD_STYLE_TYPE
-import logging
-logger = logging.getLogger(__name__)
-import tempfile
-import sqlite3
 import configparser

-
-
-def bm25_tokenizer(text):
-    tokenized_doc = []
-    for token in text.lower().split():
-        token = token.strip(string.punctuation)
-
-        if len(token) > 0 and token not in _stop_words.ENGLISH_STOP_WORDS:
-            tokenized_doc.append(token)
-    return tokenized_doc
-
-def bm25TokenizeDoc(paraList):
-    tokenized_corpus = []
-    ##########Commenting this for now########### will incorporate paragrpah splitting later.
-    # for passage in tqdm(paraList):
-    #     if len(passage.split()) >256:
-    #         # st.write("Splitting")
-    #         temp = " ".join(passage.split()[:256])
-    #         tokenized_corpus.append(bm25_tokenizer(temp))
-    #         temp = " ".join(passage.split()[256:])
-    #         tokenized_corpus.append(bm25_tokenizer(temp))
-    #     else:
-    #         tokenized_corpus.append(bm25_tokenizer(passage))
-    ######################################################################################33333
-    for passage in tqdm(paraList):
-        tokenized_corpus.append(bm25_tokenizer(passage))
-
-    return tokenized_corpus

-def lexical_search(keyword, document_bm25):
-    config = configparser.ConfigParser()
-    config.read_file(open('udfPreprocess/paramconfig.cfg'))
-    top_k = int(config.get('lexical_search','TOP_K'))
-    bm25_scores = document_bm25.get_scores(bm25_tokenizer(keyword))
-    top_n = np.argpartition(bm25_scores, -top_k)[-top_k:]
-    bm25_hits = [{'corpus_id': idx, 'score': bm25_scores[idx]} for idx in top_n]
-    bm25_hits = sorted(bm25_hits, key=lambda x: x['score'], reverse=True)
-    return bm25_hits

-
-@st.cache(allow_output_mutation=True)
-def load_sentenceTransformer(name):
-    return SentenceTransformer(name)


-def semantic_search(keywordlist,paraList):

-    ##### Sematic Search #####
-    #query = "Does document contain {} issues ?".format(keyword)
-    config = configparser.ConfigParser()
-    config.read_file(open('udfPreprocess/paramconfig.cfg'))
-    model_name = config.get('semantic_search','MODEL_NAME')

-    bi_encoder = load_sentenceTransformer(model_name)
-    bi_encoder.max_seq_length = int(config.get('semantic_search','MAX_SEQ_LENGTH'))     #Truncate long passages to 256 tokens
-    top_k = int(config.get('semantic_search','TOP_K'))
-    document_embeddings = bi_encoder.encode(paraList, convert_to_tensor=True, show_progress_bar=False)
-    question_embedding = bi_encoder.encode(keywordlist, convert_to_tensor=True)
-
-    hits = util.semantic_search(question_embedding, document_embeddings, top_k=top_k)

-    return hits
-
-def show_results(keywordList):
-    document = docx.Document()
-    # document.add_heading('Document name:{}'.format(file_name), 2)
-    section = document.sections[0]
-
-    # Calling the footer
-    footer = section.footer
-
-    # Calling the paragraph already present in
-    # the footer section
-    footer_para = footer.paragraphs[0]
-
-    font_styles = document.styles
-    font_charstyle = font_styles.add_style('CommentsStyle', WD_STYLE_TYPE.CHARACTER)
-    font_object = font_charstyle.font
-    font_object.size = Pt(7)
-    # Adding the centered zoned footer
-    footer_para.add_run('''\tPowered by GIZ Data and the Sustainable Development Solution Network hosted at Hugging-Face spaces: https://huggingface.co/spaces/ppsingh/streamlit_dev''', style='CommentsStyle')
-    document.add_heading('Your Seacrhed for {}'.format(keywordList), level=1)
-    for keyword in keywordList:
-
-        st.write("Results for Query: {}".format(keyword))
-        para = document.add_paragraph().add_run("Results for Query: {}".format(keyword))
-        para.font.size = Pt(12)
-        bm25_hits, hits = search(keyword)
-
-        st.markdown("""
-                    We will provide with 2 kind of results. The 'lexical search' and the semantic search.
-                    """)
-        # In the semantic search part we provide two kind of results one with only Retriever (Bi-Encoder) and other the ReRanker (Cross Encoder)
-        st.markdown("Top few lexical search (BM25) hits")
-        document.add_paragraph("Top few lexical search (BM25) hits")
-
-        for hit in bm25_hits[0:5]:
-            if hit['score'] > 0.00:
-                st.write("\t Score: {:.3f}: \t{}".format(hit['score'], paraList[hit['corpus_id']].replace("\n", " ")))
-                document.add_paragraph("\t Score: {:.3f}: \t{}".format(hit['score'], paraList[hit['corpus_id']].replace("\n", " ")))
-
-
-
-        # st.table(bm25_hits[0:3])
-
-        st.markdown("\n-------------------------\n")
-        st.markdown("Top few Bi-Encoder Retrieval hits")
-        document.add_paragraph("\n-------------------------\n")
-        document.add_paragraph("Top few Bi-Encoder Retrieval hits")

-        hits = sorted(hits, key=lambda x: x['score'], reverse=True)
-        for hit in hits[0:5]:
-            # if hit['score'] > 0.45:
-            st.write("\t Score: {:.3f}: \t{}".format(hit['score'], paraList[hit['corpus_id']].replace("\n", " ")))
-            document.add_paragraph("\t Score: {:.3f}: \t{}".format(hit['score'], paraList[hit['corpus_id']].replace("\n", " ")))
+from haystack.nodes import TfidfRetriever
+from haystack.document_stores import InMemoryDocumentStore
 import configparser
+import spacy
+import re
+from spacy.matcher import Matcher
+import streamlit as st
+from markdown import markdown
+from annotated_text import annotation

+config = configparser.ConfigParser()
+config.read_file(open('paramconfig.py'))


+def tokenize_lexical_query(query):
+    nlp = spacy.load("en_core_web_sm")
+    token_list = [token.text.lower() for token in nlp(query) if not token.is_stop]
+    return token_list

+def runSpacyMatcher(token_list, document):
+    nlp = spacy.load("en_core_web_sm")
+    spacydoc = nlp(document)
+    matcher = Matcher(nlp.vocab)
+    token_pattern = [[{"LOWER":token}] for token in token_list]
+    matcher.add(",".join(token_list), token_pattern)
+    spacymatches = matcher(spacydoc)

+    matches = []
+    for match_id, start, end in spacymatches:
+        matches = matches + [[start, end]]

+    return matches, spacydoc

+def runRegexMatcher(token_list, document):
+    matches = []
+    for token in token_list:
+        matches = matches + [[val.start(), val.start() + len(token)] for val in re.finditer(token, document)]

+    return matches, document
+
+def searchAnnotator(matches, document):
+    start = 0
+    annotated_text = ""
+    for match in matches:
+        start_idx = match[0]
+        end_idx = match[1]
+        annotated_text = annotated_text + document[start:start_idx] + str(annotation(body=document[start_idx:end_idx], label="ANSWER", background="#964448", color='#ffffff'))
+        start = end_idx
+
+    st.write(
+        markdown(annotated_text),
+        unsafe_allow_html=True,
+    )
+
+def lexical_search(query, documents):
+
+    document_store = InMemoryDocumentStore()
+    document_store.write_documents(documents)
+    retriever = TfidfRetriever(document_store)
+    results = retriever.retrieve(query=query,
+                                 top_k=int(config.get('lexical_search', 'TOP_K')))
+    query_tokens = tokenize_lexical_query(query)
+    for result in results:
+        matches, doc = runSpacyMatcher(query_tokens, result.content)
+        searchAnnotator(matches, doc)
+
+
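The rewritten lexical_search indexes the incoming documents with haystack's in-memory TfidfRetriever, re-tokenizes the query with spaCy (dropping stop words), and highlights the matched tokens in each retrieved passage via annotated-text markup rendered through st.write. The sketch below is not part of the commit; it assumes haystack 1.x (where write_documents accepts plain dicts with a "content" field), that the module is importable as utils.search, and that the paramconfig.py read above defines TOP_K in a [lexical_search] section.

# Hypothetical Streamlit snippet (not in this commit) driving the new
# lexical_search; the paragraphs, import path, and config are assumptions.
import streamlit as st
from utils.search import lexical_search   # assumed import path

paragraphs = [
    "Adaptation to climate change is a stated national priority.",
    "The plan sets mitigation targets for the energy sector.",
]
# TfidfRetriever works on raw text, so plain dicts with a 'content' field suffice.
documents = [{"content": para} for para in paragraphs]

query = st.text_input("Search term", value="climate adaptation")
if query:
    # Retrieves TOP_K passages and renders them with the query tokens highlighted.
    lexical_search(query, documents)

One design note: runSpacyMatcher reloads the en_core_web_sm pipeline for every retrieved passage, so caching the loaded pipeline once per session would avoid repeated model loads.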
ver0.1 scripts/search.py ADDED
@@ -0,0 +1,141 @@
+import glob, os, sys; sys.path.append('../utils')
+
+#import needed libraries
+import seaborn as sns
+from pandas import DataFrame
+from sentence_transformers import SentenceTransformer, CrossEncoder, util
+# from keybert import KeyBERT
+from transformers import pipeline
+import matplotlib.pyplot as plt
+import numpy as np
+import streamlit as st
+import pandas as pd
+from rank_bm25 import BM25Okapi
+from sklearn.feature_extraction import _stop_words
+import string
+from tqdm.autonotebook import tqdm
+import numpy as np
+import docx
+from docx.shared import Inches
+from docx.shared import Pt
+from docx.enum.style import WD_STYLE_TYPE
+import logging
+logger = logging.getLogger(__name__)
+import tempfile
+import sqlite3
+import configparser
+
+### These are lexcial search related functions #####
+
+def bm25_tokenizer(text):
+    tokenized_doc = []
+    for token in text.lower().split():
+        token = token.strip(string.punctuation)
+
+        if len(token) > 0 and token not in _stop_words.ENGLISH_STOP_WORDS:
+            tokenized_doc.append(token)
+    return tokenized_doc
+
+def bm25TokenizeDoc(paraList):
+    tokenized_corpus = []
+    ##########Commenting this for now########### will incorporate paragrpah splitting later.
+    # for passage in tqdm(paraList):
+    #     if len(passage.split()) >256:
+    #         # st.write("Splitting")
+    #         temp = " ".join(passage.split()[:256])
+    #         tokenized_corpus.append(bm25_tokenizer(temp))
+    #         temp = " ".join(passage.split()[256:])
+    #         tokenized_corpus.append(bm25_tokenizer(temp))
+    #     else:
+    #         tokenized_corpus.append(bm25_tokenizer(passage))
+    ######################################################################################33333
+    for passage in tqdm(paraList):
+        tokenized_corpus.append(bm25_tokenizer(passage))
+
+    return tokenized_corpus
+
+def lexical_search(keyword, document_bm25):
+    config = configparser.ConfigParser()
+    config.read_file(open('udfPreprocess/paramconfig.cfg'))
+    top_k = int(config.get('lexical_search','TOP_K'))
+    bm25_scores = document_bm25.get_scores(bm25_tokenizer(keyword))
+    top_n = np.argpartition(bm25_scores, -top_k)[-top_k:]
+    bm25_hits = [{'corpus_id': idx, 'score': bm25_scores[idx]} for idx in top_n]
+    bm25_hits = sorted(bm25_hits, key=lambda x: x['score'], reverse=True)
+    return bm25_hits
+
+@st.cache(allow_output_mutation=True)
+def load_sentenceTransformer(name):
+    return SentenceTransformer(name)
+
+
+def semantic_search(keywordlist, paraList):
+
+    ##### Sematic Search #####
+    #query = "Does document contain {} issues ?".format(keyword)
+    config = configparser.ConfigParser()
+    config.read_file(open('udfPreprocess/paramconfig.cfg'))
+    model_name = config.get('semantic_search','MODEL_NAME')
+
+    bi_encoder = load_sentenceTransformer(model_name)
+    bi_encoder.max_seq_length = int(config.get('semantic_search','MAX_SEQ_LENGTH'))     #Truncate long passages to 256 tokens
+    top_k = int(config.get('semantic_search','TOP_K'))
+    document_embeddings = bi_encoder.encode(paraList, convert_to_tensor=True, show_progress_bar=False)
+    question_embedding = bi_encoder.encode(keywordlist, convert_to_tensor=True)
+
+    hits = util.semantic_search(question_embedding, document_embeddings, top_k=top_k)
+
+    return hits
+
+def show_results(keywordList):
+    document = docx.Document()
+    # document.add_heading('Document name:{}'.format(file_name), 2)
+    section = document.sections[0]
+
+    # Calling the footer
+    footer = section.footer
+
+    # Calling the paragraph already present in
+    # the footer section
+    footer_para = footer.paragraphs[0]
+
+    font_styles = document.styles
+    font_charstyle = font_styles.add_style('CommentsStyle', WD_STYLE_TYPE.CHARACTER)
+    font_object = font_charstyle.font
+    font_object.size = Pt(7)
+    # Adding the centered zoned footer
+    footer_para.add_run('''\tPowered by GIZ Data and the Sustainable Development Solution Network hosted at Hugging-Face spaces: https://huggingface.co/spaces/ppsingh/streamlit_dev''', style='CommentsStyle')
+    document.add_heading('Your Seacrhed for {}'.format(keywordList), level=1)
+    for keyword in keywordList:
+
+        st.write("Results for Query: {}".format(keyword))
+        para = document.add_paragraph().add_run("Results for Query: {}".format(keyword))
+        para.font.size = Pt(12)
+        bm25_hits, hits = search(keyword)
+
+        st.markdown("""
+                    We will provide with 2 kind of results. The 'lexical search' and the semantic search.
+                    """)
+        # In the semantic search part we provide two kind of results one with only Retriever (Bi-Encoder) and other the ReRanker (Cross Encoder)
+        st.markdown("Top few lexical search (BM25) hits")
+        document.add_paragraph("Top few lexical search (BM25) hits")
+
+        for hit in bm25_hits[0:5]:
+            if hit['score'] > 0.00:
+                st.write("\t Score: {:.3f}: \t{}".format(hit['score'], paraList[hit['corpus_id']].replace("\n", " ")))
+                document.add_paragraph("\t Score: {:.3f}: \t{}".format(hit['score'], paraList[hit['corpus_id']].replace("\n", " ")))
+
+
+
+        # st.table(bm25_hits[0:3])
+
+        st.markdown("\n-------------------------\n")
+        st.markdown("Top few Bi-Encoder Retrieval hits")
+        document.add_paragraph("\n-------------------------\n")
+        document.add_paragraph("Top few Bi-Encoder Retrieval hits")
+
+        hits = sorted(hits, key=lambda x: x['score'], reverse=True)
+        for hit in hits[0:5]:
+            # if hit['score'] > 0.45:
+            st.write("\t Score: {:.3f}: \t{}".format(hit['score'], paraList[hit['corpus_id']].replace("\n", " ")))
+            document.add_paragraph("\t Score: {:.3f}: \t{}".format(hit['score'], paraList[hit['corpus_id']].replace("\n", " ")))