Spaces:

wbrooks
/

CoUL-document-search

Running

App Files Files Community

CoUL-document-search / app.py

wbrooks

removed debugging messages now that search is working

b6127b6 1 day ago

raw

history blame contribute delete

1.99 kB

	from fastapi import FastAPI, Query
	from fastapi.responses import JSONResponse
	from src.embeddings_search import create_embeddings_search_function_from_embeddings_df
	from src.tfidf_search import create_tfidf_search_function

	import polars as pl
	#from jinja2 import Template

	# Absolute prefix stripped from every "file" value before results are returned.
	path_prefix = "/Users/wes/Google Drive/Shared drives/datalab/projects/2025_coul_aisearch/data/original_box_download/"

	# Locations of the precomputed artifacts that the two search functions use.
	block_embeddings_df_path = "block_embeddings/block-embeddings.parquet"
	doc_tfidf_df_path = "block_tfidf/TF-IDF-doc-text.parquet"
	tfidf_vectorizer_path = "block_tfidf/tfidf_vectorizer_doc_text.joblib"

	# Build both query callables once at import time so requests can reuse them.
	sbert_query_docs = create_embeddings_search_function_from_embeddings_df(
	    model_name="sentence-transformers/all-MiniLM-L6-v2",
	    embeddings_df_path=block_embeddings_df_path,
	    device="cpu",
	)
	tfidf_query_docs = create_tfidf_search_function(
	    dtm_df_path=doc_tfidf_df_path,
	    vectorizer_path=tfidf_vectorizer_path,
	    model_name="facebook/fasttext-en-vectors",
	)

	# The ASGI application object that the route decorators below attach to.
	app = FastAPI()


	@app.get("/")
	def default():
	    """Return a fixed status payload with the service version."""
	    return dict(status="ok", version=0.1)


	@app.get("/search", response_class=JSONResponse)
	def search(q: str = Query(..., description="Search query")):
	    """Run both search functions for `q` and return a blended ranking.

	    The two result tables are inner-joined on "file", so only files
	    returned by both searches appear in the output. The combined rank is
	    0.7 * "rank-sbert" + 0.3 * "rank-tfidf"; rows are sorted by it and a
	    "confidence" of 20 / combined rank (rounded to 2 decimals) is attached.

	    Args:
	        q: The search query string (required query parameter).

	    Returns:
	        The result rows as dictionaries with "file" and "confidence" keys,
	        where "file" has `path_prefix` stripped.
	    """
	    # NOTE(review): both callables presumably return polars DataFrames with a
	    # "file" column and their own rank column -- confirm against src/.
	    tfidf_hits = tfidf_query_docs(q)
	    sbert_hits = sbert_query_docs(q)

	    merged = sbert_hits.join(tfidf_hits, on="file", how="inner")

	    # Weighted blend of the two rank columns.
	    blended_rank = 0.7 * pl.col("rank-sbert") + 0.3 * pl.col("rank-tfidf")
	    relative_path = pl.col("file").str.strip_prefix(path_prefix)
	    ranked = merged.with_columns(
	        blended_rank.alias("rank-combined"),
	        relative_path.alias("file"),
	    ).sort("rank-combined")

	    # Confidence is inversely proportional to the combined rank.
	    confidence = (20.0 / pl.col("rank-combined")).round(2)
	    scored = ranked.with_columns(confidence.alias("confidence"))

	    return scored.select(["file", "confidence"]).to_dicts()




	@app.get("/test")
	def echo(query: str):
	    """Return the received `query` value unchanged under the "echo" key."""
	    return dict(echo=query)