# CoUL-document-search / src/tfidf_search.py
# This script defines functions that search the corpus for blocks of text that are similar to a query.
# Loading the query embeddings had to be changed for deployment because the CSVs took up too much
# space for the free tier of Hugging Face Spaces.
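#
# Overview of the pipeline implemented below:
#   1. Embed each query term with fastText and unit-normalize the vectors.
#   2. Softly match each query term to the corpus vocabulary via a softmax over cosine
#      similarities, weighted by the vocabulary IDF values, giving one pseudo TF-IDF row per term.
#   3. Project those rows onto the truncated-SVD "pseudo-topic" space fitted on the corpus DTM
#      and average them.
#   4. Rank documents by cosine similarity between the query projection and each document's projection.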
import polars as pl
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity
from huggingface_hub import hf_hub_download
import numpy as np
from numpy.typing import NDArray
from joblib import load
import scipy
import fasttext
from collections.abc import Callable


def query_worker(
    query: str,
    rownames: list[str],
    fasttext_model: fasttext.FastText._FastText,
    idf: NDArray[np.float64],
    dtm_svd: TruncatedSVD,
    dtm_svd_mat: NDArray[np.float64],
    vocab_norm: NDArray[np.float64],
    concentration: float = 10,
) -> pl.DataFrame:
"""
Calculate the cosine similarity of the query to each block of text from the corpus.
Parameters:
query (str): Search query
fasttext_model (fasttext.FastText._FastText):
idf (numpy.ndarray):
dtm_svd (numpy.ndarray):
dtm_svd_mat (numpy.ndarray):
vocab_norm (numpy.ndarray):
concentration (float):
Returns:
polars.DataFrame: Results sorted so that the best matches (according to column `score-tfidf`) are listed first.
"""
    # Embed each query term with fastText:
    query_embeddings = np.array([fasttext_model.get_word_vector(term) for term in query.split()])
    # Normalize the rows to unit length:
    query_norm = query_embeddings / np.linalg.norm(query_embeddings, axis=1, keepdims=True)
    # Cosine similarity of each query term to each vocabulary term:
    query_similarities = np.dot(query_norm, vocab_norm.T)
    # Soft-match each query term to the vocabulary (softmax sharpened by `concentration`), weighted by IDF:
    query_tfidf = idf * scipy.special.softmax(query_similarities * concentration, axis=1)
    # Project the per-term TF-IDF rows onto the pseudo-topic space and average them:
    query_weights = np.mean(dtm_svd.transform(query_tfidf), axis=0)
    # Cosine similarity of the averaged query projection to each document's projection:
    mean_query_score = np.reshape(cosine_similarity(np.reshape(query_weights, (1, -1)), dtm_svd_mat), -1)
    # Sort the documents by score and attach a 1-based rank column:
    sorted_df = pl.DataFrame(
        {
            'score-tfidf': mean_query_score,
            'file': rownames
        }
    ).sort("score-tfidf", descending=True).with_columns(
        pl.Series("rank-tfidf", [i + 1 for i in range(len(mean_query_score))])
    )
    # Return the sorted results:
    return sorted_df


def query_factory(
    rownames: list[str],
    fasttext_model: fasttext.FastText._FastText,
    idf: NDArray[np.float64],
    dtm_svd: TruncatedSVD,
    dtm_svd_mat: NDArray[np.float64],
    vocab_norm: NDArray[np.float64],
    concentration: float = 10,
) -> Callable[[str], pl.DataFrame]:
"""
Create a function that will compare query text to the documents in the corpus.
Parameters:
dtm_svd (np.ndarray):
"""
def do_query(query: str) -> pl.DataFrame:
"""
Call the worker that compares the query term distribution to the documents in the corpus
Parameters:
query (str): Text to compare to the documents
Returns:
polars.DataFrame: Results sorted so that the best matches (according to column `score-tfidf`) are listed first.
"""
return query_worker(query, rownames, fasttext_model, idf, dtm_svd, dtm_svd_mat, vocab_norm, concentration)
return do_query


def create_tfidf_search_function(
    dtm_df_path: str,
    vectorizer_path: str,
    model_name: str = "facebook/fasttext-en-vectors",
) -> Callable[[str], pl.DataFrame]:
"""
Create a function that compares the word distribution in a query to each document in the corpus.
Parameters:
dtm_df_path (str): Path to a TF-IDF document-term matrix (DTM) for the corpus in parquet format.
vectorizer_path (str): Path to the saved vectorizer that generated the DTM saved at `csv_path`. We expect that the vectorizer was dumped to disk by `joblib`.
model_name (str): Name of a model on HuggingFace that generates word embeddings (default is 'facebook/fasttext-en-vectors'.)"
Returns:
callable: Function that compares the query string to the corpus.
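
    Example (a minimal sketch; the paths and query below are hypothetical placeholders):
        >>> search = create_tfidf_search_function("data/dtm.parquet", "data/vectorizer.joblib")
        >>> results = search("example search query")  # polars.DataFrame sorted by `score-tfidf`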
"""
    # Load the fastText model from the Hugging Face Hub:
    fasttext_model = fasttext.load_model(hf_hub_download(model_name, "model.bin"))
    # Load the TF-IDF document-term matrix and the fitted vectorizer:
    my_df = pl.read_parquet(dtm_df_path)
    my_vectorizer = load(vectorizer_path)
    # Embed the vocabulary terms with fastText:
    my_vocabulary = my_vectorizer.get_feature_names_out()
    vocab_embeddings = np.array([fasttext_model.get_word_vector(term) for term in my_vocabulary])
    # Drop terms that have no embedding in the fastText model (all-zero vectors):
    keep_terms = np.any(vocab_embeddings != 0, axis=1).tolist()
    vocab_embeddings = vocab_embeddings[keep_terms, :]
    my_vocabulary = my_vocabulary[keep_terms]
    # Get the IDF weights of the retained terms as a single-row matrix:
    my_idf = np.reshape(my_vectorizer.idf_[keep_terms], (-1, vocab_embeddings.shape[0]))
    # Normalize each vocabulary embedding to unit length:
    vocab_norm = vocab_embeddings / np.linalg.norm(vocab_embeddings, axis=1, keepdims=True)
    # Get the document-term matrix and project it onto 300 pseudo-topics:
    filenames = my_df["file"].to_list()
    doc_term_mat = my_df.select(pl.exclude(["file"]))[:, keep_terms]
    dtm_svd = TruncatedSVD(n_components=300)
    X_svd = dtm_svd.fit_transform(doc_term_mat)
    return query_factory(
        rownames=filenames,
        fasttext_model=fasttext_model,
        idf=my_idf,
        dtm_svd=dtm_svd,
        dtm_svd_mat=X_svd,
        vocab_norm=vocab_norm,
        concentration=30,
    )
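

# A minimal usage sketch (not part of the original module): the parquet and joblib paths below
# are hypothetical placeholders for wherever the corpus DTM and vectorizer are stored.
if __name__ == "__main__":
    # Build the search function once (downloads the fastText model and fits the SVD projection),
    # then call it with free-text queries.
    search = create_tfidf_search_function(
        dtm_df_path="data/dtm.parquet",            # hypothetical path to the TF-IDF DTM
        vectorizer_path="data/vectorizer.joblib",  # hypothetical path to the joblib-dumped vectorizer
    )
    results = search("example search query")
    print(results.head(10))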