Spaces:

Abdul-Ib
/

Full-text-Search

Runtime error

App Files Files Community

Abdul-Ib committed on Feb 29

Commit

abe481f

•

1 Parent(s): be900e5

Update helper_functions.py

Browse files

Files changed (1) hide show

helper_functions.py +28 -16

helper_functions.py CHANGED Viewed

@@ -13,16 +13,15 @@ from fastapi import HTTPException
 from optimum.onnxruntime import ORTModelForFeatureExtraction
 from sentenceTranformer import SentenceEmbeddingPipeline
 from transformers import AutoTokenizer
 # Initialize
 # model_path = "Abdul-Ib/all-MiniLM-L6-v2-2024"
 # semantic_model = SentenceTransformer(model_path, cache_folder="./assets")
 try:
     # Load the semantic model
-    tokenizer = AutoTokenizer.from_pretrained("./assets/onnx")
     model = ORTModelForFeatureExtraction.from_pretrained(
-        "./assets/onnx", file_name="model_quantized.onnx"
     )
     semantic_model = SentenceEmbeddingPipeline(model=model, tokenizer=tokenizer)
 except Exception as e:
@@ -34,8 +33,8 @@ except Exception as e:
 # Initialization
 try:
     normalizer = Normalizer()
-    categorizer = fasttext.load_model("./assets/categorization_pipeline.ftz")
-    category_map = np.load("./assets/category_map.npy", allow_pickle=True).item()
 except Exception as e:
     raise HTTPException(
         status_code=500,
@@ -72,7 +71,7 @@ def make_request(url: str) -> dict:
         )
-def full_text_search(query: str, keyword_search: BM25L) -> np.ndarray:
     """
     Perform full-text search using the given query and BM25L model.
@@ -84,7 +83,8 @@ def full_text_search(query: str, keyword_search: BM25L) -> np.ndarray:
     - np.ndarray: The scores of the search results.
     """
     try:
-        tokenized_query = normalizer.translate_text(query).split(" ")
         ft_scores = keyword_search.get_scores(tokenized_query)
         return ft_scores
     except Exception as e:
@@ -95,7 +95,7 @@ def full_text_search(query: str, keyword_search: BM25L) -> np.ndarray:
         )
-def semantic_search(query: str, doc_embeddings: torch.Tensor) -> torch.Tensor:
     """
     Perform semantic search using the given query and document embeddings.
@@ -107,7 +107,8 @@ def semantic_search(query: str, doc_embeddings: torch.Tensor) -> torch.Tensor:
     - np.ndarray: The cosine similarity scores of the search results.
     """
     try:
-        query_embedding = semantic_model(normalizer.translate_text(query))[0]
         cos_sim = torch.nn.functional.cosine_similarity(
             query_embedding, doc_embeddings, dim=-1
         )
@@ -180,6 +181,9 @@ def calculate_interrelations(
     - doc_embeddings (np.ndarray): The document embeddings for products.
     - interrelation_threshold (float): How similar two products are.
     Returns:
     - None
     """
@@ -190,6 +194,11 @@ def calculate_interrelations(
         cos_sim_matrix = torch.mm(
             doc_embeddings_norm, doc_embeddings_norm.transpose(0, 1)
         )
         for i in range(num_products):
             related_indices = np.where(cos_sim_matrix[i] > interrelation_threshold)[0]
@@ -205,7 +214,7 @@ def calculate_interrelations(
         )
-def check_validity(query: str, keyword_search: BM25L) -> np.ndarray:
     """
     Check the validity of the input query against keyword match search.
@@ -228,7 +237,7 @@ def check_validity(query: str, keyword_search: BM25L) -> np.ndarray:
     """
     try:
         # Step 1: Perform keyword match search on the original query
-        keyword_scores = full_text_search(query, keyword_search)
         # Step 2: If any matches found in step 1, return the search scores
         if max(keyword_scores) != 0.0:
@@ -236,7 +245,7 @@ def check_validity(query: str, keyword_search: BM25L) -> np.ndarray:
         # Step 3: Generate a modified query by keeping only one character and perform a keyword match search
         one_char_query = normalizer.keep_one_char(query)
-        one_char_scores = full_text_search(one_char_query, keyword_search)
         # Step 4: If any matches found in step 3, return the search scores
         if max(one_char_scores) != 0.0:
             return one_char_scores
@@ -245,7 +254,7 @@ def check_validity(query: str, keyword_search: BM25L) -> np.ndarray:
         spelled_query = normalizer.check_spelling(query)
         # Step 6: If any matches found in step 5, return the search scores
         if spelled_query is not None:
-            spelled_scores = full_text_search(spelled_query, keyword_search)
             if max(spelled_scores) != 0.0:
                 return spelled_scores
@@ -258,6 +267,7 @@ def check_validity(query: str, keyword_search: BM25L) -> np.ndarray:
             detail=f"An error occurred during query validity check: {e}",
         )
 def is_cheapest(query: str, request_json: list) -> list:
     """
     Check which product is the cheapest within the same category as
@@ -300,7 +310,8 @@ def is_cheapest(query: str, request_json: list) -> list:
             status_code=500,
             detail=f"An error occurred during cheapest product identification: {e}",
         )
 def check_keys(request_json: List[dict], required_keys: list):
     """
     Check if each dictionary in a list contains all the required keys.
@@ -314,5 +325,6 @@ def check_keys(request_json: List[dict], required_keys: list):
     """
     for item in request_json:
         if not all(key in item for key in required_keys):
-            raise HTTPException(status_code=400, detail=f"Missing keys in dictionary: {item}")

 from optimum.onnxruntime import ORTModelForFeatureExtraction
 from sentenceTranformer import SentenceEmbeddingPipeline
 from transformers import AutoTokenizer
 # Initialize
 # model_path = "Abdul-Ib/all-MiniLM-L6-v2-2024"
 # semantic_model = SentenceTransformer(model_path, cache_folder="./assets")
 try:
     # Load the semantic model
+    tokenizer = AutoTokenizer.from_pretrained("./app/assets/onnx")
     model = ORTModelForFeatureExtraction.from_pretrained(
+        "./app/assets/onnx", file_name="model_quantized.onnx"
     )
     semantic_model = SentenceEmbeddingPipeline(model=model, tokenizer=tokenizer)
 except Exception as e:
 # Initialization
 try:
     normalizer = Normalizer()
+    categorizer = fasttext.load_model("./app/assets/categorization_pipeline.ftz")
+    category_map = np.load("./app/assets/category_map.npy", allow_pickle=True).item()
 except Exception as e:
     raise HTTPException(
         status_code=500,
         )
+async def full_text_search(query: str, keyword_search: BM25L) -> np.ndarray:
     """
     Perform full-text search using the given query and BM25L model.
     - np.ndarray: The scores of the search results.
     """
     try:
+        translated_query = await normalizer.translate_text(query)
+        tokenized_query = translated_query.split(" ")
         ft_scores = keyword_search.get_scores(tokenized_query)
         return ft_scores
     except Exception as e:
         )
+async def semantic_search(query: str, doc_embeddings: torch.Tensor) -> torch.Tensor:
     """
     Perform semantic search using the given query and document embeddings.
     - np.ndarray: The cosine similarity scores of the search results.
     """
     try:
+        translated_query = await normalizer.translate_text(query)
+        query_embedding = semantic_model(translated_query)[0]
         cos_sim = torch.nn.functional.cosine_similarity(
             query_embedding, doc_embeddings, dim=-1
         )
     - doc_embeddings (np.ndarray): The document embeddings for products.
     - interrelation_threshold (float): How similar two products are.
+    Raises:
+    - HTTPException: If an error occurs during interrelation calculation.
     Returns:
     - None
     """
         cos_sim_matrix = torch.mm(
             doc_embeddings_norm, doc_embeddings_norm.transpose(0, 1)
         )
+        # cos_sim_matrix = torch.nn.functional.cosine_similarity(
+        #     doc_embeddings, doc_embeddings, dim=1
+        # )
+        # logger.info(f"sentransformers.utils. {util.cos_sim(doc_embeddings, doc_embeddings)}")
+        # logger.warning(f"cos_sim_matrix: {cos_sim_matrix}")
         for i in range(num_products):
             related_indices = np.where(cos_sim_matrix[i] > interrelation_threshold)[0]
         )
+async def check_validity(query: str, keyword_search: BM25L) -> np.ndarray:
     """
     Check the validity of the input query against keyword match search.
     """
     try:
         # Step 1: Perform keyword match search on the original query
+        keyword_scores = await full_text_search(query, keyword_search)
         # Step 2: If any matches found in step 1, return the search scores
         if max(keyword_scores) != 0.0:
         # Step 3: Generate a modified query by keeping only one character and perform a keyword match search
         one_char_query = normalizer.keep_one_char(query)
+        one_char_scores = await full_text_search(one_char_query, keyword_search)
         # Step 4: If any matches found in step 3, return the search scores
         if max(one_char_scores) != 0.0:
             return one_char_scores
         spelled_query = normalizer.check_spelling(query)
         # Step 6: If any matches found in step 5, return the search scores
         if spelled_query is not None:
+            spelled_scores = await full_text_search(spelled_query, keyword_search)
             if max(spelled_scores) != 0.0:
                 return spelled_scores
             detail=f"An error occurred during query validity check: {e}",
         )
 def is_cheapest(query: str, request_json: list) -> list:
     """
     Check which product is the cheapest within the same category as
             status_code=500,
             detail=f"An error occurred during cheapest product identification: {e}",
         )
 def check_keys(request_json: List[dict], required_keys: list):
     """
     Check if each dictionary in a list contains all the required keys.
     """
     for item in request_json:
         if not all(key in item for key in required_keys):
+            raise HTTPException(
+                status_code=400, detail=f"Missing keys in dictionary: {item}"
+            )