Spaces:
Sleeping
Sleeping
from collections import Counter, defaultdict

import joblib
import numpy as np
from gensim.utils import simple_preprocess
from tqdm import tqdm
def get_tf_query(query):
    """Compute length-normalised term frequencies for a tokenised query.

    Args:
        query: Sized iterable of hashable tokens (for example a list of
            strings).

    Returns:
        A ``defaultdict`` mapping each distinct token to
        ``occurrences / len(query)``, in order of first occurrence. Looking
        up a token that is not in the query yields ``0``. An empty query
        produces an empty mapping.
    """
    total = len(query)
    # ``int`` as the default factory gives the same 0 default as a lambda
    # would, but keeps the returned mapping picklable.
    tf_query = defaultdict(int)
    # Counter keeps first-occurrence order, so key order matches a manual
    # left-to-right count. The loop body never runs for an empty query, so
    # there is no division by zero.
    for token, count in Counter(query).items():
        tf_query[token] = count / total
    return tf_query
def get_tf_idf_query(query, idf_dict):
    """Build the TF-IDF weights of a raw query string.

    The query is tokenised with gensim's ``simple_preprocess``; each token's
    normalised term frequency (from ``get_tf_query``) is then multiplied by
    its inverse document frequency.

    Args:
        query: Raw query text.
        idf_dict: Mapping from token to its IDF value.

    Returns:
        A ``defaultdict`` mapping each query token to ``tf * idf``. Tokens
        that are not in the mapping read as ``0``.
    """
    tokens = simple_preprocess(query)
    tf_query = get_tf_query(tokens)

    tf_idf_query = defaultdict(int)
    for token, term_frequency in tf_query.items():
        # A query may contain words that have no IDF entry. Give them weight
        # 0 instead of letting a plain dict raise KeyError and abort the
        # whole search. ``.get`` also avoids inserting keys into a
        # defaultdict-based ``idf_dict``.
        tf_idf_query[token] = term_frequency * idf_dict.get(token, 0)
    return tf_idf_query
def get_tf_idf_vector(tf_idf_instance, vocab):
    """Lay out TF-IDF weights as a list ordered by the vocabulary's keys.

    Args:
        tf_idf_instance: Mapping from token to weight. It is indexed with
            every vocabulary key, so it must either contain all of them or
            supply a default for missing keys (e.g. a ``defaultdict``).
        vocab: Mapping whose ``keys()`` order defines the order of the
            returned weights.

    Returns:
        A list with one weight per vocabulary key, in ``vocab.keys()`` order.
    """
    return [tf_idf_instance[term] for term in vocab.keys()]
def tf_idf_rankings(query, idf_dict, tf_idf_dict, vocab, document_matrix, k):
    """Rank documents by cosine similarity to a query in TF-IDF space.

    Args:
        query: Raw query text.
        idf_dict: Mapping from token to its IDF value.
        tf_idf_dict: Not used by this function; kept so existing callers'
            positional arguments keep working.
        vocab: Mapping whose key order defines the components of the query
            vector. Assumed to match the column order of
            ``document_matrix`` -- TODO confirm against the code that builds
            the matrix.
        document_matrix: Dense 2-D array with one row per document.
        k: Maximum number of results to return.

    Returns:
        A ``(rankings, scores)`` tuple. ``rankings`` is an array of up to
        ``k`` row indices into ``document_matrix``, best match first.
        ``scores`` is a list of the corresponding cosine similarities.
        Documents with a zero norm, and every document when the query vector
        is all zeros, get a score of ``0``.
    """
    query_weights = get_tf_idf_vector(get_tf_idf_query(query, idf_dict), vocab)
    query_vector = np.reshape(np.array(query_weights), (1, -1))

    dot_products = document_matrix @ query_vector.T
    query_norm = np.linalg.norm(query_vector)
    doc_norms = np.linalg.norm(document_matrix, axis=1, keepdims=True)

    # A zero norm (empty document row, or a query with no vocabulary terms)
    # would make the plain division produce NaN. argsort places NaN last, so
    # the reversed order below would put those documents at the TOP of the
    # ranking. Score them 0 instead.
    denominators = doc_norms * query_norm
    cosine_similarities = np.divide(
        dot_products,
        denominators,
        out=np.zeros(denominators.shape, dtype=float),
        where=denominators != 0,
    ).flatten()

    # Ascending argsort reversed gives descending similarity; keep the top k.
    rankings = np.argsort(cosine_similarities)[::-1][:k]
    scores = [cosine_similarities[rank] for rank in rankings]
    return rankings, scores
def tf_idf_pipeline(
    query,
    idf_dict_path="Retrieval/savedModels/idf.pkl",
    tf_idf_dict_path="Retrieval/savedModels/tf_idf_dict.pkl",
    vocab_path="Retrieval/savedModels/vocab.pkl",
    document_matrix_path="Retrieval/savedModels/document_matrix.pkl",
    ids_path="Retrieval/savedModels/ids.pkl",
    k=100,
):
    """Load the saved retrieval artifacts and return the top-``k`` document ids.

    Every artifact is re-read from disk with ``joblib.load`` on each call,
    and a progress message is printed after each successful load.
    ``joblib.load`` unpickles its input, so the paths should only point at
    trusted files.

    Args:
        query: Raw query text.
        idf_dict_path: Path of the saved IDF mapping.
        tf_idf_dict_path: Path of the saved TF-IDF dictionary. It is loaded
            and forwarded to ``tf_idf_rankings``.
        vocab_path: Path of the saved vocabulary.
        document_matrix_path: Path of the saved document matrix.
        ids_path: Path of the saved id lookup, indexed by ranking position.
        k: Maximum number of documents to return.

    Returns:
        A list of entries from the loaded ``ids`` object, one per ranked
        document, best match first.
    """

    def _load(path, message):
        # Load first and announce second, so a message only appears once the
        # artifact has actually been read.
        artifact = joblib.load(path)
        print(message)
        return artifact

    idf_dict = _load(idf_dict_path, "idf loaded...")
    tf_idf_dict = _load(tf_idf_dict_path, "tf-idf loaded...")
    vocab = _load(vocab_path, "vocab loaded...")
    document_matrix = _load(document_matrix_path, "document_matrix loaded...")
    ids = _load(ids_path, "ids loaded")

    # The similarity scores are computed but only the ordering is used here.
    rankings, _ = tf_idf_rankings(
        query, idf_dict, tf_idf_dict, vocab, document_matrix, k
    )
    return [ids[position] for position in tqdm(rankings)]