import streamlit as st
from PIL import Image
import json
from sentence_transformers import SentenceTransformer, CrossEncoder, util
import pickle
import pandas as pd
############
## Main page
############
st.write("# Demonstration for User Query Expansion(QE)")
st.markdown("***Idea is to build a model which will take query as inputs and generate expansion information as outputs.***")
image = Image.open('top.png')
st.image(image)
st.sidebar.write("# Top-N Selection")
maxtags_sidebar = st.sidebar.slider('Number of query allowed?', 1, 20, 1, key='ehikwegrjifbwreuk')
#user_query = st_tags(
# label='# Enter Query:',
# text='Press enter to add more',
# value=['Mother'],
# suggestions=['gift', 'nike', 'wool'],
# maxtags=maxtags_sidebar,
# key="aljnf")
user_query = st.text_input("Enter a query to expand, e.g., gift, home decoration ...")
# Add select boxes to the sidebar
option1 = st.sidebar.selectbox(
    'Which bi-encoder model would you like to use?',
    ('multi-qa-MiniLM-L6-cos-v1', 'null', 'null'))
option2 = st.sidebar.selectbox(
    'Which cross-encoder model would you like to use?',
    ('cross-encoder/ms-marco-MiniLM-L-6-v2', 'null', 'null'))
st.sidebar.success("Models loaded successfully!")
#if not torch.cuda.is_available():
# print("Warning: No GPU found. Please add GPU to your notebook")
# We use the bi-encoder to encode all passages, so that we can use it with semantic search
@st.cache_resource
def load_encoders(sentence_enc, cross_enc):
    return SentenceTransformer(sentence_enc, device='cpu'), CrossEncoder(cross_enc, device='cpu')
bi_encoder, cross_encoder = load_encoders(option1, option2)
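# Division of labour (sketch): the bi-encoder embeds texts independently for
# fast vector search, while the cross-encoder scores (query, passage) pairs
# jointly for slower but more accurate re-ranking, e.g. (illustrative strings):
#   emb = bi_encoder.encode("wool blanket", convert_to_tensor=True)
#   score = cross_encoder.predict([["wool blanket", "hand-knit wool throw"]])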
bi_encoder.max_seq_length = 256 #Truncate long passages to 256 tokens
top_k = 32 #Number of passages we want to retrieve with the bi-encoder
passages = []
# Load the pre-computed embeddings file
@st.cache_resource
def load_pickle(path):
    with open(path, "rb") as fIn:
        cache_data = pickle.load(fIn)
        passages = cache_data['sentences']
        corpus_embeddings = cache_data['embeddings']
    print("Loaded pre-computed embeddings from disk")
    return passages, corpus_embeddings
embedding_cache_path = 'etsy-embeddings-cpu.pkl'
passages,corpus_embeddings = load_pickle(embedding_cache_path)
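# The pickle is assumed to hold {'sentences': [...], 'embeddings': tensor}
# (see load_pickle above). A minimal sketch of producing such a cache offline,
# where all_passages is a hypothetical placeholder for the passage list:
#   cache = {'sentences': all_passages,
#            'embeddings': bi_encoder.encode(all_passages, convert_to_tensor=True)}
#   with open(embedding_cache_path, 'wb') as fOut:
#       pickle.dump(cache, fOut)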
from rank_bm25 import BM25Okapi
from sklearn.feature_extraction import _stop_words
import string
from tqdm.autonotebook import tqdm
import numpy as np
import re
import yake
@st.cache_resource
def load_model():
    language = "en"
    max_ngram_size = 3
    deduplication_threshold = 0.9
    deduplication_algo = 'seqm'
    windowSize = 3
    numOfKeywords = 3
    return yake.KeywordExtractor(lan=language, n=max_ngram_size,
                                 dedupLim=deduplication_threshold,
                                 dedupFunc=deduplication_algo,
                                 windowsSize=windowSize,
                                 top=numOfKeywords, features=None)
custom_kw_extractor = load_model()
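# yake's extract_keywords returns (keyphrase, score) tuples, where a lower
# score means a more relevant phrase; illustrative call (scores are made up):
#   custom_kw_extractor.extract_keywords("handmade ceramic coffee mug gift")
#   -> [('handmade ceramic coffee', 0.02), ('coffee mug gift', 0.04), ...]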
# Load query GMS information
@st.cache_resource
def load_json(path):
    with open(path, 'r') as file:
        query_gms_dict = json.load(file)
    return query_gms_dict
query_gms_dict = load_json('query_gms_mock_2M.json')
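# Assumed JSON structure: a flat mapping from query string to GMS value, e.g.
#   {"gift": 120000, "home decoration": 45000, ...}
# (only .get(query, 0) lookups are performed on it below)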
# We lowercase the text and remove stop words before indexing
def bm25_tokenizer(text):
    tokenized_doc = []
    for token in text.lower().split():
        token = token.strip(string.punctuation)
        if len(token) > 0 and token not in _stop_words.ENGLISH_STOP_WORDS:
            tokenized_doc.append(token)
    return tokenized_doc
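# Illustrative tokenizer behaviour: stop words and surrounding punctuation
# are dropped, e.g.
#   bm25_tokenizer("A handmade gift for the home!") -> ['handmade', 'gift', 'home']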
@st.cache_resource
def get_tokenized_corpus(passages, _tokenizer):
    tokenized_corpus = []
    for passage in passages:
        tokenized_corpus.append(_tokenizer(passage))
    return tokenized_corpus
tokenized_corpus = get_tokenized_corpus(passages, bm25_tokenizer)
bm25 = BM25Okapi(tokenized_corpus)
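# BM25 usage sketch: get_scores returns one lexical relevance score per
# passage, aligned with the corpus order, e.g.
#   scores = bm25.get_scores(bm25_tokenizer("wool blanket"))  # len(scores) == len(passages)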
def word_len(s):
    return len([i for i in s.split(' ') if i])
# Score assigned to a candidate when a retriever did not return it
DEFAULT_SCORE = -100.0
def clean_string(input_string):
    # Replace every character outside the digit/ASCII-letter range with a space
    string_sub1 = re.sub("([^\u0030-\u0039\u0041-\u007a])", ' ', input_string)
    # Double spaces mark boundaries between fragments
    string_sub2 = re.sub("\x20\x20", "\n", string_sub1)
    string_strip = string_sub2.strip().lower()
    output_string = []
    if len(string_strip) > 20:
        # For longer passages, keep only the multi-word keyphrases extracted by yake
        keywords = custom_kw_extractor.extract_keywords(string_strip)
        for tokens in keywords:
            string_clean = tokens[0]
            if word_len(string_clean) > 1:
                output_string.append(string_clean)
    else:
        output_string.append(string_strip)
    return output_string
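# Illustrative behaviour: short strings pass through (lowercased), long ones
# are reduced to multi-word yake keyphrases, e.g.
#   clean_string("Wool Blanket") -> ['wool blanket']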
# def add_gms_score_for_candidates(candidates, query_gms_dict):
# for query_candidate in candidates:
# value = candidates[query_candidate]
# value['gms'] = query_gms_dict.get(query_candidate, 0)
# candidates[query_candidate] = value
# return candidates
def generate_query_expansion_candidates(query):
    print("Input query:", query)
    ##### BM25 search (lexical search) #####
    bm25_scores = bm25.get_scores(bm25_tokenizer(query))
    # Find the indices of the top-5 scores (argpartition leaves them unsorted)
    top_n_indices = np.argpartition(bm25_scores, -5)[-5:]
    bm25_hits = [{'corpus_id': idx, 'bm25_score': bm25_scores[idx]} for idx in top_n_indices]
    # bm25_hits = sorted(bm25_hits, key=lambda x: x['score'], reverse=True)
    ##### Semantic search #####
    # Encode the query using the bi-encoder and find potentially relevant passages
    query_embedding = bi_encoder.encode(query, convert_to_tensor=True)
    # query_embedding = query_embedding.cuda()
    # Get the hits for the first query
    encoder_hits = util.semantic_search(query_embedding, corpus_embeddings, top_k=top_k)[0]
    # For all retrieved passages, add the cross-encoder scores
    cross_inp = [[query, passages[hit['corpus_id']]] for hit in encoder_hits]
    cross_scores = cross_encoder.predict(cross_inp)
    for idx in range(len(cross_scores)):
        encoder_hits[idx]['cross_score'] = cross_scores[idx]
    # Merge BM25 and bi-encoder hits into one candidate dict keyed by corpus id,
    # filling missing scores with DEFAULT_SCORE
    candidates = {}
    for hit in bm25_hits:
        corpus_id = hit['corpus_id']
        if corpus_id not in candidates:
            candidates[corpus_id] = {'bm25_score': hit['bm25_score'], 'bi_score': DEFAULT_SCORE, 'cross_score': DEFAULT_SCORE}
    for hit in encoder_hits:
        corpus_id = hit['corpus_id']
        if corpus_id not in candidates:
            candidates[corpus_id] = {'bm25_score': DEFAULT_SCORE, 'bi_score': hit['score'], 'cross_score': hit['cross_score']}
        else:
            bm25_score = candidates[corpus_id]['bm25_score']
            candidates[corpus_id].update({'bm25_score': bm25_score, 'bi_score': hit['score'], 'cross_score': hit['cross_score']})
    # Turn each retrieved passage into keyphrase candidates, keeping its scores
    final_candidates = {}
    for key, value in candidates.items():
        input_string = passages[key].replace("\n", "")
        string_set = set(clean_string(input_string))
        for item in string_set:
            final_candidates[item.replace("\n", " ")] = value
    # Remove the query itself from the candidates
    if query in final_candidates:
        del final_candidates[query]
    # print(final_candidates)
    # Add a GMS column
    df = pd.DataFrame(final_candidates).T
    df['gms'] = [query_gms_dict.get(i, 0) for i in df.index]
    # Return every candidate with its full score record
    return df.to_dict('index')
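# Illustrative shape of the returned dict (values are made up):
#   {'wool throw': {'bm25_score': 12.3, 'bi_score': 0.71, 'cross_score': 4.2, 'gms': 1500}, ...}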
def re_rank_candidates(query, candidates, method):  # note: query is currently unused
    if method == 'bm25':
        # Filter and sort by bm25_score
        filtered_sorted_result = sorted(
            [(k, v) for k, v in candidates.items() if v['bm25_score'] > DEFAULT_SCORE],
            key=lambda x: x[1]['bm25_score'],
            reverse=True
        )
    elif method == 'bi_encoder':
        # Filter and sort by bi_score
        filtered_sorted_result = sorted(
            [(k, v) for k, v in candidates.items() if v['bi_score'] > DEFAULT_SCORE],
            key=lambda x: x[1]['bi_score'],
            reverse=True
        )
    elif method == 'cross_encoder':
        # Filter and sort by cross_score
        filtered_sorted_result = sorted(
            [(k, v) for k, v in candidates.items() if v['cross_score'] > DEFAULT_SCORE],
            key=lambda x: x[1]['cross_score'],
            reverse=True
        )
    elif method == 'gms':
        # First sort by cross_score + bi_score ...
        filtered_sorted_by_encoder = sorted(
            [(k, v) for k, v in candidates.items() if (v['cross_score'] > DEFAULT_SCORE) & (v['bi_score'] > DEFAULT_SCORE)],
            key=lambda x: x[1]['cross_score'] + x[1]['bi_score'],
            reverse=True
        )
        # ... then re-sort by GMS
        filtered_sorted_result = sorted(filtered_sorted_by_encoder, key=lambda x: x[1]['gms'], reverse=True)
    else:
        # Default method: filter and sort by cross_score + bi_score
        filtered_sorted_result = sorted(
            [(k, v) for k, v in candidates.items() if (v['cross_score'] > DEFAULT_SCORE) & (v['bi_score'] > DEFAULT_SCORE)],
            key=lambda x: x[1]['cross_score'] + x[1]['bi_score'],
            reverse=True
        )
    data_dicts = [{'query': item[0], **item[1]} for item in filtered_sorted_result]
    # Convert the list of dictionaries into a DataFrame
    df = pd.DataFrame(data_dicts)
    return df
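# Usage sketch with a hypothetical query, ranking by cross-encoder score only:
#   df = re_rank_candidates('gift', generate_query_expansion_candidates('gift'), method='cross_encoder')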
# st.write("## Raw Candidates:")
if st.button('Generated Expansion'):
col1, col2 = st.columns(2)
candidates = generate_query_expansion_candidates(query = user_query)
with col1:
st.subheader('Original Ranking')
ranking_cross = re_rank_candidates(user_query, candidates, method='cross_encoder')
ranking_cross.index = ranking_cross.index+1
st.table(ranking_cross['query'][:maxtags_sidebar])
with col2:
st.subheader('GMS-sorted Ranking')
ranking_gms = re_rank_candidates(user_query, candidates, method='gms')
ranking_gms.index = ranking_gms.index + 1
st.table(ranking_gms[['query', 'gms']][:maxtags_sidebar])
## convert into dataframe
# data_dicts = [{'query': key, **values} for key, values in candidates.items()]
# df = pd.DataFrame(data_dicts)
# st.write(list(candidates.keys())[0:maxtags_sidebar])
# st.write(df)
# st.dataframe(df)
# st.success(raw_candidates)
#if st.button('Rerank By GMS'):
#candidates = generate_query_expansion_candidates(query = user_query)
#df = re_rank_candidates(user_query, candidates, method='gms')
#st.dataframe(df[['query', 'gms']][:maxtags_sidebar])