Spaces:
Runtime error
Runtime error
File size: 5,040 Bytes
b951bdb cbd24a5 b951bdb cbd24a5 09f0fe7 cbd24a5 63ffd0c cbd24a5 09f0fe7 cbd24a5 63ffd0c cbd24a5 09f0fe7 cbd24a5 63ffd0c cbd24a5 09f0fe7 cbd24a5 63ffd0c cbd24a5 09f0fe7 cbd24a5 63ffd0c cbd24a5 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 |
import streamlit as st
import json
import time
import faiss
from sentence_transformers import SentenceTransformer
from sentence_transformers.cross_encoder import CrossEncoder
class DocumentSearch:
    """Two-stage semantic document search.

    A query is embedded with an SBERT bi-encoder, candidate documents are
    retrieved from a pre-built FAISS index, and the candidates are then
    re-ranked with an SBERT cross-encoder.

    Attributes are populated in ``__init__``:
        docs: sequence loaded from the labels JSON file; each entry is
            indexed as ``entry[0]`` (document text) and ``entry[1]`` (url).
        encoder: ``SentenceTransformer`` used to embed the query.
        index: FAISS index read from disk.
        cross_encoder: ``CrossEncoder`` used to score (query, document) pairs.
    """

    def __init__(self, labels_path: str, encoder_path: str,
                 index_path: str, cross_encoder_path: str):
        """Load the documents, both models and the FAISS index.

        Args:
            labels_path: path to the JSON file with documents and urls.
            encoder_path: name or path of the SBERT bi-encoder model.
            index_path: path to the serialized FAISS index.
            cross_encoder_path: name or path of the SBERT cross-encoder model.
        """
        # JSON is UTF-8 by specification, so do not rely on the platform's
        # default text encoding when reading it.
        with open(labels_path, 'r', encoding='utf-8') as json_file:
            self.docs = json.load(json_file)
        self.encoder = SentenceTransformer(encoder_path)
        self.index = faiss.read_index(index_path)
        self.cross_encoder = CrossEncoder(cross_encoder_path)

    def search(self, query: str, k: int) -> list:
        """Return up to ``k`` documents most relevant to ``query``.

        Args:
            query: free-text search query.
            k: maximum number of results to return.

        Returns:
            A list of dicts with keys ``'doc'``, ``'url'`` and ``'score'``,
            sorted by descending cross-encoder score. The list is shorter
            than ``k`` (possibly empty) when the index yields fewer
            candidates, and empty when ``k`` is not positive.
        """
        if k <= 0:
            return []

        # Embed the query and over-fetch 10x candidates so the cross-encoder
        # has a wider pool to re-rank than the final top-k.
        query_vector = self.encoder.encode([query])
        _, indices = self.index.search(query_vector, k * 10)

        # FAISS pads the result with -1 when the index holds fewer vectors
        # than requested; without this filter docs[-1] would silently leak
        # the last document into the results as a bogus hit.
        candidates = [self.docs[i] for i in indices[0] if i >= 0]
        if not candidates:
            return []

        # Score every (query, document text) pair with the cross-encoder.
        model_inputs = [[query, pair[0]] for pair in candidates]
        scores = self.cross_encoder.predict(model_inputs, batch_size=1)

        results = [
            {'doc': pair[0], 'url': pair[1], 'score': score}
            for pair, score in zip(candidates, scores)
        ]
        # Best cross-encoder score first, truncated to the requested size.
        return sorted(results, key=lambda item: item['score'], reverse=True)[:k]
if __name__ == "__main__":
    # Model identifiers and local artifact paths handed to DocumentSearch.
    enc_path = "ivan-savchuk/msmarco-distilbert-dot-v5-tuned-full-v1"
    idx_path = "idx_vectors.index"
    cross_enc_path = "ivan-savchuk/cross-encoder-ms-marco-MiniLM-L-12-v2-tuned_mediqa-v1"
    docs_path = "docs.json"

    # Number of answers to show and how many characters of each to preview.
    top_k = 5
    preview_chars = 150

    # Build the search engine (loads the documents, both models and the index).
    surfer = DocumentSearch(
        labels_path=docs_path,
        encoder_path=enc_path,
        index_path=idx_path,
        cross_encoder_path=cross_enc_path
    )

    # Page title and the query input box.
    st.title('Medical Search')
    query = st.text_input("Enter any query about our data",
                          placeholder="Type query here...")

    # Run the search only when the user presses the button.
    if st.button("Search"):
        # Time the search so the latency can be shown next to the results.
        stt = time.time()
        results = surfer.search(query, k=top_k)
        ent = time.time()
        elapsed_time = round(ent - stt, 2)

        with st.container():
            # Echo the query together with the measured search time.
            st.write(f"**Results Related to:** {query} ({elapsed_time} sec.)")
            # Render whatever was returned: iterating instead of indexing
            # results[0]..results[4] avoids an IndexError when the search
            # yields fewer than top_k documents.
            for rank, result in enumerate(results, start=1):
                st.subheader(f"Answer {rank}")
                # Cropped preview of the answer and the link to the full text.
                doc = result["doc"][:preview_chars] + "..."
                url = result["url"]
                # NOTE(review): the inline <span> is passed without
                # unsafe_allow_html, so Streamlit presumably shows the tag
                # as literal text - confirm against the deployed version.
                st.markdown(f"{doc}\n[<span style=\"color:Blue\">**Read More**</span>]({url})\n")

    # Page footer.
    st.markdown("---")
    st.markdown("Author: Ivan Savchuk. 2022")
|