# CoUL-document-search / src/tfidf_search.py
# This script defines functions that search the corpus for blocks of text that are similar to a query.
# Loading the query embeddings had to be changed for deployment because the CSVs took up too much
# space for the free tier of Hugging Face Spaces.
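#
# Overview of the pipeline implemented below:
#   1. Embed each query term with fastText and unit-normalize the vectors.
#   2. Softly match each query term to the corpus vocabulary via a softmax over cosine
#      similarities, weighted by the vocabulary IDF values, giving one pseudo TF-IDF row per term.
#   3. Project those rows onto the truncated-SVD "pseudo-topic" space fitted on the corpus DTM
#      and average them.
#   4. Rank documents by cosine similarity between the query projection and each document's projection.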
import polars as pl
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity
from huggingface_hub import hf_hub_download
import numpy as np
from numpy.typing import NDArray
from joblib import load
import scipy
import fasttext
from collections.abc import Callable


def query_worker(
    query: str,
    rownames: list[str],
    fasttext_model: fasttext.FastText._FastText,
    idf: NDArray[np.float64],
    dtm_svd: TruncatedSVD,
    dtm_svd_mat: NDArray[np.float64],
    vocab_norm: NDArray[np.float64],
    concentration: float = 10,
) -> pl.DataFrame:
"""
Calculate the cosine similarity of the query to each block of text from the corpus.
Parameters:
query (str): Search query
fasttext_model (fasttext.FastText._FastText):
idf (numpy.ndarray):
dtm_svd (numpy.ndarray):
dtm_svd_mat (numpy.ndarray):
vocab_norm (numpy.ndarray):
concentration (float):
Returns:
polars.DataFrame: Results sorted so that the best matches (according to column `score-tfidf`) are listed first.
"""
    # Embed each query term with fastText:
    query_embeddings = np.array([fasttext_model.get_word_vector(term) for term in query.split()])
    # Normalize the rows to unit length:
    query_norm = query_embeddings / np.linalg.norm(query_embeddings, axis=1, keepdims=True)
    # Cosine similarity of each query term to each vocabulary term:
    query_similarities = np.dot(query_norm, vocab_norm.T)
    # Soft-match each query term to the vocabulary (softmax sharpened by `concentration`), weighted by IDF:
    query_tfidf = idf * scipy.special.softmax(query_similarities * concentration, axis=1)
    # Project the per-term TF-IDF rows onto the pseudo-topic space and average them:
    query_weights = np.mean(dtm_svd.transform(query_tfidf), axis=0)
    # Cosine similarity of the averaged query projection to each document's projection:
    mean_query_score = np.reshape(cosine_similarity(np.reshape(query_weights, (1, -1)), dtm_svd_mat), -1)
    # Sort the documents by score and attach a 1-based rank column:
    sorted_df = pl.DataFrame(
        {
            'score-tfidf': mean_query_score,
            'file': rownames
        }
    ).sort("score-tfidf", descending=True).with_columns(
        pl.Series("rank-tfidf", [i + 1 for i in range(len(mean_query_score))])
    )
    # Return the sorted results:
    return sorted_df


def query_factory(
    rownames: list[str],
    fasttext_model: fasttext.FastText._FastText,
    idf: NDArray[np.float64],
    dtm_svd: TruncatedSVD,
    dtm_svd_mat: NDArray[np.float64],
    vocab_norm: NDArray[np.float64],
    concentration: float = 10,
) -> Callable[[str], pl.DataFrame]:
"""
Create a function that will compare query text to the documents in the corpus.
Parameters:
dtm_svd (np.ndarray):
"""
def do_query(query: str) -> pl.DataFrame:
"""
Call the worker that compares the query term distribution to the documents in the corpus
Parameters:
query (str): Text to compare to the documents
Returns:
polars.DataFrame: Results sorted so that the best matches (according to column `score-tfidf`) are listed first.
"""
return query_worker(query, rownames, fasttext_model, idf, dtm_svd, dtm_svd_mat, vocab_norm, concentration)
return do_query


def create_tfidf_search_function(
    dtm_df_path: str,
    vectorizer_path: str,
    model_name: str = "facebook/fasttext-en-vectors",
) -> Callable[[str], pl.DataFrame]:
"""
Create a function that compares the word distribution in a query to each document in the corpus.
Parameters:
dtm_df_path (str): Path to a TF-IDF document-term matrix (DTM) for the corpus in parquet format.
vectorizer_path (str): Path to the saved vectorizer that generated the DTM saved at `csv_path`. We expect that the vectorizer was dumped to disk by `joblib`.
model_name (str): Name of a model on HuggingFace that generates word embeddings (default is 'facebook/fasttext-en-vectors'.)"
Returns:
callable: Function that compares the query string to the corpus.
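
    Example (a minimal sketch; the paths and query below are hypothetical placeholders):
        >>> search = create_tfidf_search_function("data/dtm.parquet", "data/vectorizer.joblib")
        >>> results = search("example search query")  # polars.DataFrame sorted by `score-tfidf`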
"""
    # Load the fastText model from the Hugging Face Hub:
    fasttext_model = fasttext.load_model(hf_hub_download(model_name, "model.bin"))
    # Load the TF-IDF document-term matrix and the fitted vectorizer:
    my_df = pl.read_parquet(dtm_df_path)
    my_vectorizer = load(vectorizer_path)
    # Embed the vocabulary terms with fastText:
    my_vocabulary = my_vectorizer.get_feature_names_out()
    vocab_embeddings = np.array([fasttext_model.get_word_vector(term) for term in my_vocabulary])
    # Drop terms that have no embedding in the fastText model (all-zero vectors):
    keep_terms = np.any(vocab_embeddings != 0, axis=1).tolist()
    vocab_embeddings = vocab_embeddings[keep_terms, :]
    my_vocabulary = my_vocabulary[keep_terms]
    # Get the IDF weights of the retained terms as a single-row matrix:
    my_idf = np.reshape(my_vectorizer.idf_[keep_terms], (-1, vocab_embeddings.shape[0]))
    # Normalize each vocabulary embedding to unit length:
    vocab_norm = vocab_embeddings / np.linalg.norm(vocab_embeddings, axis=1, keepdims=True)
    # Get the document-term matrix and project it onto 300 pseudo-topics:
    filenames = my_df["file"].to_list()
    doc_term_mat = my_df.select(pl.exclude(["file"]))[:, keep_terms]
    dtm_svd = TruncatedSVD(n_components=300)
    X_svd = dtm_svd.fit_transform(doc_term_mat)
    return query_factory(
        rownames=filenames,
        fasttext_model=fasttext_model,
        idf=my_idf,
        dtm_svd=dtm_svd,
        dtm_svd_mat=X_svd,
        vocab_norm=vocab_norm,
        concentration=30,
    )
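

# A minimal usage sketch (not part of the original module): the parquet and joblib paths below
# are hypothetical placeholders for wherever the corpus DTM and vectorizer are stored.
if __name__ == "__main__":
    # Build the search function once (downloads the fastText model and fits the SVD projection),
    # then call it with free-text queries.
    search = create_tfidf_search_function(
        dtm_df_path="data/dtm.parquet",            # hypothetical path to the TF-IDF DTM
        vectorizer_path="data/vectorizer.joblib",  # hypothetical path to the joblib-dumped vectorizer
    )
    results = search("example search query")
    print(results.head(10))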