Update app.py
app.py CHANGED
@@ -96,9 +96,9 @@ from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
 #####
 from huggingface_hub import InferenceClient
 
-repo_id = "meta-llama/Llama-3.2-1B-Instruct"
+#repo_id = "meta-llama/Llama-3.2-1B-Instruct"
 
-llm = InferenceClient(model=repo_id, timeout=120)
+#llm = InferenceClient(model=repo_id, timeout=120)
 
 # Test your LLM client
 #llm_client.text_generation(prompt="How are you today?", max_new_tokens=20)
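Not part of the diff: a minimal sketch of how the now-commented-out client would be exercised. The repo ID comes from the diff; running this needs a Hugging Face token (e.g. via HF_TOKEN) and access to the gated meta-llama repo.

    from huggingface_hub import InferenceClient

    repo_id = "meta-llama/Llama-3.2-1B-Instruct"  # gated; any hosted text model works
    llm = InferenceClient(model=repo_id, timeout=120)

    # Same smoke test as the commented-out line above:
    print(llm.text_generation(prompt="How are you today?", max_new_tokens=20))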
@@ -108,7 +108,7 @@ def download_nltk_resources():
     resources = ['punkt', 'stopwords', 'snowball_data']
     for resource in resources:
         try:
-            nltk.download(resource, quiet=True)
+            nltk.download(resource, quiet=False)
         except Exception as e:
             print(f"Failed to download {resource}: {str(e)}")
 
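Since quiet=False now surfaces download progress and errors on every run, a variant that only downloads missing resources may be worth considering; a sketch assuming the same resource list (the lookup paths are the standard NLTK data locations):

    import nltk

    resources = {
        'punkt': 'tokenizers/punkt',
        'stopwords': 'corpora/stopwords',
        'snowball_data': 'stemmers/snowball_data',
    }
    for name, path in resources.items():
        try:
            nltk.data.find(path)   # skip if already installed
        except LookupError:
            nltk.download(name, quiet=False)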
@@ -331,21 +331,25 @@ import nltk
 
 def optimize_query(
     query: str,
+    query_optimization_model: str,  # Added to match the calling signature (e.g. "google/flan-t5-small")
     chunks: List[str],
     embedding_model: str,
+    vector_store_type: str,  # Added to match the calling signature
+    search_type: str,  # Added to match the calling signature
     top_k: int = 3,
-    model_name: str = "google/flan-t5-small",
-    use_gpu: bool = False  # Default to CPU
+    use_gpu: bool = False
 ) -> str:
     """
     CPU-optimized version of query expansion using a small language model.
 
     Args:
         query: Original search query
+        query_optimization_model: Name or path of the model to use for optimization
         chunks: List of text chunks to search through
         embedding_model: Name of the embedding model being used
+        vector_store_type: Type of vector store being used
+        search_type: Type of search being performed
         top_k: Number of expansion terms to add
-        model_name: Name of the small language model to use
         use_gpu: Whether to use GPU if available (defaults to False for CPU)
 
     Returns:
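A usage sketch for the widened signature; every argument value below is illustrative only and should mirror whatever compare_embeddings actually passes:

    expanded = optimize_query(
        query="solar panel efficiency",
        query_optimization_model="google/flan-t5-small",
        chunks=["sample text chunk 1", "sample text chunk 2"],
        embedding_model="all-MiniLM-L6-v2",   # illustrative
        vector_store_type="faiss",            # illustrative
        search_type="similarity",             # illustrative
        top_k=3,
        use_gpu=False,
    )
    print(expanded)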
@@ -367,42 +371,42 @@ def optimize_query(
                 # Limit number of lemmas
                 expanded_terms.update([lemma.name() for lemma in syn.lemmas()[:2]])
 
-        # 3. Use
+        # 3. Use provided model with reduced complexity
         try:
             # Load model with reduced memory footprint
             tokenizer = AutoTokenizer.from_pretrained(
-                model_name,
-                model_max_length=128,
-                cache_dir="./model_cache"
+                query_optimization_model,  # Use the provided model name
+                model_max_length=128,
+                cache_dir="./model_cache"
             )
-            model =
-                model_name,
-                low_cpu_mem_usage=True,
-                device_map="cpu"
+            model = AutoModelForSeq2SeqLM.from_pretrained(
+                query_optimization_model,  # Use the provided model name
+                low_cpu_mem_usage=True,
+                device_map="cpu"
             )
 
             # Move model to CPU and eval mode
             model = model.to(device)
-            model.eval()
+            model.eval()
 
             # Prepare input with reduced length
             prompt = f"Enhance this search query with relevant terms: {query}"
             inputs = tokenizer(
                 prompt,
                 return_tensors="pt",
-                max_length=64,
+                max_length=64,
                 truncation=True,
                 padding=True
             )
 
             # Generate with minimal parameters
-            with torch.no_grad():
+            with torch.no_grad():
                 outputs = model.generate(
                     inputs.input_ids.to(device),
-                    max_length=32,
+                    max_length=32,
                     num_return_sequences=1,
                     temperature=0.7,
-                    do_sample=False,
+                    do_sample=False,
                     early_stopping=True
                 )
 
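Note that transformers provides no AutoModelForSeq2Gen; for T5-family checkpoints the auto class is AutoModelForSeq2SeqLM, which also has to be imported at the top of app.py, and temperature has no effect while do_sample=False. A self-contained sketch of this hunk's load-and-generate path, assuming a google/flan-t5-small checkpoint:

    import torch
    from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

    model_id = "google/flan-t5-small"  # assumed; any small seq2seq model works
    tokenizer = AutoTokenizer.from_pretrained(model_id, model_max_length=128)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_id, low_cpu_mem_usage=True)
    model.eval()

    inputs = tokenizer(
        "Enhance this search query with relevant terms: solar power",
        return_tensors="pt", max_length=64, truncation=True,
    )
    with torch.no_grad():
        # Greedy decoding; temperature would only matter with do_sample=True
        outputs = model.generate(inputs.input_ids, max_length=32, num_return_sequences=1)
    print(tokenizer.decode(outputs[0], skip_special_tokens=True))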
@@ -414,12 +418,12 @@ def optimize_query(
 
         except Exception as model_error:
             print(f"Model-based expansion failed: {str(model_error)}")
-            enhanced_query = query
+            enhanced_query = query
 
         # 4. Combine original and expanded terms
         final_terms = set(tokens)
         final_terms.update(expanded_terms)
-        if enhanced_query != query:
+        if enhanced_query != query:
             final_terms.update(word_tokenize(enhanced_query.lower()))
 
         # 5. Remove stopwords and select top_k most relevant terms
@@ -434,13 +438,15 @@ def optimize_query(
             del tokenizer
             if device == "cuda":
                 torch.cuda.empty_cache()
-
+
         return expanded_query.strip()
 
     except Exception as e:
         print(f"Query optimization failed: {str(e)}")
         return query  # Return original query if optimization fails
 
+
+
 # Example usage
 """
 chunks = ["sample text chunk 1", "sample text chunk 2"]
@@ -843,6 +849,7 @@ def compare_embeddings(file, query, embedding_models, custom_embedding_model, sp
         "apply_phonetic": apply_phonetic,
         "phonetic_weight": phonetic_weight,
         "use_query_optimization": use_query_optimization,
+        "query_optimization_model": query_optimization_model,
         "use_reranking": use_reranking
     }
 
@@ -1337,7 +1344,7 @@ def launch_interface(share=True):
         'apply_phonetic': [False],  # Default phonetic settings
         'phonetic_weight': [0.5],
         'custom_separators': [None],
-        'query_optimization_model': ['
+        'query_optimization_model': ['google/flan-t5-base']  # Default query optimization model
     }
 
     # Run automated tests
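If the automated tests expand this dict into per-combination configs (a common pattern, assumed here rather than confirmed by the diff), itertools.product does the job:

    import itertools

    test_params = {
        'apply_phonetic': [False],
        'phonetic_weight': [0.5],
        'custom_separators': [None],
        'query_optimization_model': ['google/flan-t5-base'],
    }
    keys = list(test_params)
    for values in itertools.product(*test_params.values()):
        config = dict(zip(keys, values))
        print(config)  # each config is one test case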