import streamlit as st

from PIL import Image

import json
from sentence_transformers import SentenceTransformer, CrossEncoder, util
import pickle
import pandas as pd
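
# To try the demo locally (assuming this script is saved as app.py and the model and
# data files referenced below are available): streamlit run app.py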

############
## Main page
############

st.write("# Demonstration for User Query Expansion(QE)")

st.markdown("***Idea is to build a model which will take query as inputs and generate expansion information as outputs.***")
image = Image.open('top.png')
st.image(image)

st.sidebar.write("# Top-N Selection")
maxtags_sidebar = st.sidebar.slider('Number of expansions to show?', 1, 20, 1, key='ehikwegrjifbwreuk')
#user_query = st_tags(
#    label='# Enter Query:',
#    text='Press enter to add more',
#    value=['Mother'],
#    suggestions=['gift', 'nike', 'wool'],
#    maxtags=maxtags_sidebar,
#    key="aljnf")

user_query = st.text_input("Enter a query to expand, e.g., gift, home decoration ...")

# Add selectbox in streamlit
option1 = st.sidebar.selectbox(
     'Which bi-encoder model would you like to use?',
     ('multi-qa-MiniLM-L6-cos-v1','null','null'))

option2 = st.sidebar.selectbox(
     'Which cross-encoder model would you like to use?',
     ('cross-encoder/ms-marco-MiniLM-L-6-v2','null','null'))

st.sidebar.success("Loaded successfully!")

#if not torch.cuda.is_available():
#    print("Warning: No GPU found. Please add GPU to your notebook")

# We use the Bi-Encoder to encode all passages, so that we can use it with semantic search
@st.cache_resource
def load_encoders(sentence_enc, cross_enc):
    return SentenceTransformer(sentence_enc,device='cpu'), CrossEncoder(cross_enc,device='cpu')
bi_encoder, cross_encoder = load_encoders(option1,option2)
bi_encoder.max_seq_length = 256    #Truncate long passages to 256 tokens
top_k = 32                          #Number of passages we want to retrieve with the bi-encoder
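
# Retrieval follows a retrieve-and-rerank scheme: BM25 (lexical, built below) and the
# bi-encoder (dense) each retrieve candidate passages cheaply, then the cross-encoder
# re-scores every (query, passage) pair for a more precise ranking.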

passages = []

# Load the pre-computed embedding cache from disk
@st.cache_resource
def load_pickle(path):
    with open(path, "rb") as fIn:
        cache_data = pickle.load(fIn)
        passages = cache_data['sentences']
        corpus_embeddings = cache_data['embeddings']
    print("Load pre-computed embeddings from disc")
    return passages,corpus_embeddings

embedding_cache_path = 'etsy-embeddings-cpu.pkl'
passages,corpus_embeddings = load_pickle(embedding_cache_path)
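# The cache file is expected to hold a dict with 'sentences' and 'embeddings' entries
# (see load_pickle above). A minimal sketch of how such a cache could be produced
# offline (illustrative only; `raw_passages` is a hypothetical list of passages):
#
#     raw_passages = ["wool blanket", "gift for mom", "home decoration"]
#     embeddings = bi_encoder.encode(raw_passages, convert_to_tensor=True)
#     with open(embedding_cache_path, "wb") as fOut:
#         pickle.dump({'sentences': raw_passages, 'embeddings': embeddings}, fOut)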


from rank_bm25 import BM25Okapi
from sklearn.feature_extraction import _stop_words
import string
from tqdm.autonotebook import tqdm
import numpy as np
import re

import yake

@st.cache_resource
def load_model():
    language = "en"
    max_ngram_size = 3
    deduplication_threshold = 0.9
    deduplication_algo = 'seqm'
    windowSize = 3
    numOfKeywords = 3
    return yake.KeywordExtractor(lan=language, n=max_ngram_size, dedupLim=deduplication_threshold, dedupFunc=deduplication_algo, windowsSize=windowSize, top=numOfKeywords, features=None)
custom_kw_extractor = load_model()
# load query GMS information
@st.cache_resource
def load_json(path):
    with open(path, 'r') as file:
        query_gms_dict = json.load(file)
    return query_gms_dict

query_gms_dict = load_json('query_gms_mock_2M.json')
# We lowercase the text and remove stop-words before indexing
def bm25_tokenizer(text):
    tokenized_doc = []
    for token in text.lower().split():
        token = token.strip(string.punctuation)

        if len(token) > 0 and token not in _stop_words.ENGLISH_STOP_WORDS:
            tokenized_doc.append(token)
    return tokenized_doc
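
# Build the BM25 index: tokenize every passage once (cached below) and feed the token
# lists to BM25Okapi. This provides the lexical-search leg that complements the dense
# bi-encoder retrieval.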

@st.cache_resource
def get_tokenized_corpus(passages,_tokenizer):
    tokenized_corpus = []
    for passage in passages:
        tokenized_corpus.append(_tokenizer(passage))
    return tokenized_corpus

tokenized_corpus = get_tokenized_corpus(passages,bm25_tokenizer)
bm25 = BM25Okapi(tokenized_corpus)

def word_len(s):
    return len([i for i in s.split(' ') if i])


# Score assigned to a candidate when a given retriever did not return it
DEFAULT_SCORE = -100.0
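# clean_string replaces most non-alphanumeric characters with spaces, lowercases the
# result, and returns a list of candidate strings: multi-word YAKE keyphrases for
# inputs longer than 20 characters, or the cleaned string itself for short inputs.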
def clean_string(input_string):
    string_sub1 = re.sub("([^\u0030-\u0039\u0041-\u007a])", ' ', input_string)
    string_sub2 = re.sub("\x20\x20", "\n", string_sub1)
    string_strip = string_sub2.strip().lower()
    output_string = []
    if len(string_strip) > 20:
        keywords = custom_kw_extractor.extract_keywords(string_strip)
        for tokens in keywords:
            string_clean = tokens[0]
            if word_len(string_clean) > 1:
                output_string.append(string_clean)
    else:
        output_string.append(string_strip)
    return output_string

# def add_gms_score_for_candidates(candidates, query_gms_dict):
#     for query_candidate in candidates:
#         value = candidates[query_candidate]
#         value['gms'] = query_gms_dict.get(query_candidate, 0)
#         candidates[query_candidate] = value
#     return candidates

def generate_query_expansion_candidates(query):
    print("Input query:", query)
    expanded_query_set = {}

    ##### BM25 search (lexical search) #####
    bm25_scores = bm25.get_scores(bm25_tokenizer(query))
    # finds the indices of the top n scores
    top_n_indices = np.argpartition(bm25_scores, -5)[-5:]
    bm25_hits = [{'corpus_id': idx, 'bm25_score': bm25_scores[idx]} for idx in top_n_indices]
    # bm25_hits = sorted(bm25_hits, key=lambda x: x['score'], reverse=True)


    ##### Semantic Search #####
    # Encode the query using the bi-encoder and find potentially relevant passages
    query_embedding = bi_encoder.encode(query, convert_to_tensor=True)
    # query_embedding = query_embedding.cuda()
    # Get the hits for the first query
    encoder_hits = util.semantic_search(query_embedding, corpus_embeddings, top_k=top_k)[0]

    # For all retrieved passages, add the cross_encoder scores
    cross_inp = [[query, passages[hit['corpus_id']]] for hit in encoder_hits]
    cross_scores = cross_encoder.predict(cross_inp)
    for idx in range(len(cross_scores)):
        encoder_hits[idx]['cross_score'] = cross_scores[idx]

    candidates = {}
    for hit in bm25_hits:
        corpus_id = hit['corpus_id']
        if corpus_id not in candidates:
            candidates[corpus_id] = {'bm25_score': hit['bm25_score'], 'bi_score': DEFAULT_SCORE, 'cross_score': DEFAULT_SCORE}
    for hit in encoder_hits:
        corpus_id = hit['corpus_id']
        if corpus_id not in candidates:
            candidates[corpus_id] = {'bm25_score': DEFAULT_SCORE, 'bi_score': hit['score'], 'cross_score': hit['cross_score']}
        else:
            bm25_score = candidates[corpus_id]['bm25_score']
            candidates[corpus_id].update({'bm25_score': bm25_score, 'bi_score': hit['score'], 'cross_score': hit['cross_score']})

    final_candidates = {}
    for key, value in candidates.items():
        input_string = passages[key].replace("\n", "")
        string_set = set(clean_string(input_string))
        for item in string_set:
            final_candidates[item.replace("\n", " ")] = value
    # remove the query itself from candidates
    if query in final_candidates:
        del final_candidates[query]
    # print(final_candidates)
    # add gms column
    df = pd.DataFrame(final_candidates).T
    df['gms'] = [query_gms_dict.get(i,0) for i in df.index]
    # Total Results

    return df.to_dict('index')
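
# Illustrative shape of the dict returned above (keys are expansion candidates;
# all numeric values here are hypothetical):
# {'wool blanket': {'bm25_score': 12.3, 'bi_score': 0.61, 'cross_score': 4.2, 'gms': 1500}, ...}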

def re_rank_candidates(query, candidates, method):
    if method == 'bm25':
        # Filter and sort by bm25_score
        filtered_sorted_result = sorted(
            [(k, v) for k, v in candidates.items() if v['bm25_score'] > DEFAULT_SCORE],
            key=lambda x: x[1]['bm25_score'],
            reverse=True
        )
    elif method == 'bi_encoder':
        # Filter and sort by bi_score
        filtered_sorted_result = sorted(
            [(k, v) for k, v in candidates.items() if v['bi_score'] > DEFAULT_SCORE],
            key=lambda x: x[1]['bi_score'],
            reverse=True
        )
    elif method == 'cross_encoder':
        # Filter and sort by cross_score
        filtered_sorted_result = sorted(
            [(k, v) for k, v in candidates.items() if v['cross_score'] > DEFAULT_SCORE],
            key=lambda x: x[1]['cross_score'],
            reverse=True
        )
    elif method == 'gms':
        # First filter and sort by cross_score + bi_score, then re-sort by GMS
        filtered_sorted_by_encoder = sorted(
            [(k, v) for k, v in candidates.items() if (v['cross_score'] > DEFAULT_SCORE) & (v['bi_score'] > DEFAULT_SCORE)],
            key=lambda x: x[1]['cross_score'] + x[1]['bi_score'],
            reverse=True
        )
        filtered_sorted_result = sorted(filtered_sorted_by_encoder, key=lambda x: x[1]['gms'], reverse=True)
    else:
        # use default method cross_score + bi_score
        # Filter and sort by cross_score + bi_score
        filtered_sorted_result = sorted(
            [(k, v) for k, v in candidates.items() if (v['cross_score'] > DEFAULT_SCORE) & (v['bi_score'] > DEFAULT_SCORE)],
            key=lambda x: x[1]['cross_score'] + x[1]['bi_score'],
            reverse=True
        )
    data_dicts = [{'query': item[0], **item[1]} for item in filtered_sorted_result]
    # Convert the list of dictionaries into a DataFrame
    df = pd.DataFrame(data_dicts)
    return df
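
# Example usage (illustrative): re_rank_candidates(user_query, candidates, method='gms')
# returns a DataFrame with one row per surviving candidate and columns roughly
# ['query', 'bm25_score', 'bi_score', 'cross_score', 'gms'].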


# st.write("## Raw Candidates:")
if st.button('Generate Expansion'):
    col1, col2 = st.columns(2)
    candidates = generate_query_expansion_candidates(query = user_query)

    with col1:
        st.subheader('Original Ranking')
        ranking_cross = re_rank_candidates(user_query, candidates, method='cross_encoder')
        ranking_cross.index = ranking_cross.index+1
        st.table(ranking_cross['query'][:maxtags_sidebar])

    with col2:
        st.subheader('GMS-sorted Ranking')
        ranking_gms = re_rank_candidates(user_query, candidates, method='gms')
        ranking_gms.index = ranking_gms.index + 1
        st.table(ranking_gms[['query', 'gms']][:maxtags_sidebar])

    ## convert into dataframe
    # data_dicts = [{'query': key, **values} for key, values in candidates.items()]
    # df = pd.DataFrame(data_dicts)
    # st.write(list(candidates.keys())[0:maxtags_sidebar])
    # st.write(df)
    # st.dataframe(df)
    # st.success(raw_candidates)

#if st.button('Rerank By GMS'):
    #candidates = generate_query_expansion_candidates(query = user_query)
    #df = re_rank_candidates(user_query, candidates, method='gms')
    #st.dataframe(df[['query', 'gms']][:maxtags_sidebar])