Chris4K committed
Commit 9c4e039
1 Parent(s): 928b41f

Update app.py

Files changed (1)
  1. app.py +134 -42
app.py CHANGED
@@ -320,48 +320,138 @@ def phonetic_match(text, query, method='levenshtein_distance', apply_phonetic=Fa
         return jellyfish.levenshtein_distance(text_phonetic, query_phonetic)
     return 0
 
-def optimize_query(
-    query: str,
-    llm_model: str = "meta-llama/Llama-3.2-1B",
-    chunks: List[str] = None,
-    embedding_model: str = "sentence-transformers/all-MiniLM-L6-v2",
-    vector_store_type: str = "faiss",
-    search_type: str = "similarity",
-    top_k: int = 3  # Reduce top_k for quicker test
-) -> List[str]:
-    # Initialize the language model
-    #llm = HuggingFacePipeline(pipeline(model=llm_model))
-
-    print('---- optimize query ----')
-    # Create a temporary vector store for query optimization
-    temp_vector_store = get_vector_store(vector_store_type, chunks, embedding_model)
-
-    print('---- optimize query 2 ----')
-
-    # Create a retriever with the temporary vector store
-    temp_retriever = get_retriever(temp_vector_store, search_type, {"k": top_k})
-
-    print('---- optimize query 3 ----')
-
-    # Initialize MultiQueryRetriever with the temporary retriever and the language model
-    multi_query_retriever = MultiQueryRetriever.from_llm(
-        retriever=temp_retriever,
-        llm=llm
-    )
-
-    print('---- optimize query 4 ----')
-    #print(llm.invoke('Hello'))
-    # Limit max time or set a timeout for the LLM to avoid endless execution
-    try:
-        optimized_queries = multi_query_retriever.invoke(query, max_time=30)  # Timeout in seconds
-    except Exception as err:
-        print(f"Unexpected {err=}, {type(err)=}")
-
-    print(optimized_queries)
-    print('---- optimize query 5 ----')
-
-    return optimized_queries
-
+from typing import List, Union
+import torch
+from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
+import numpy as np
+from nltk.tokenize import word_tokenize
+from nltk.corpus import wordnet
+import nltk
+
+def optimize_query(
+    query: str,
+    chunks: List[str],
+    embedding_model: str,
+    top_k: int = 3,
+    model_name: str = "google/flan-t5-small",  # Small model (only 80M parameters)
+    use_gpu: bool = False  # Default to CPU
+) -> str:
+    """
+    CPU-optimized version of query expansion using a small language model.
+
+    Args:
+        query: Original search query
+        chunks: List of text chunks to search through
+        embedding_model: Name of the embedding model being used
+        top_k: Number of expansion terms to add
+        model_name: Name of the small language model to use
+        use_gpu: Whether to use GPU if available (defaults to False for CPU)
+
+    Returns:
+        Expanded query string
+    """
+    try:
+        # Set device
+        device = "cuda" if use_gpu and torch.cuda.is_available() else "cpu"
+
+        # 1. Basic text preprocessing (CPU-based)
+        tokens = word_tokenize(query.lower())
+
+        # 2. WordNet synonym expansion (CPU-based)
+        expanded_terms = set()
+        for token in tokens:
+            # Limit synonym lookup to save CPU resources
+            synsets = wordnet.synsets(token)[:1]  # Take only the top synset per word
+            for syn in synsets:
+                # Limit the number of lemmas
+                expanded_terms.update([lemma.name() for lemma in syn.lemmas()[:2]])
+
+        # 3. Use a small T5 model with reduced complexity
+        try:
+            # Load the model with a reduced memory footprint
+            tokenizer = AutoTokenizer.from_pretrained(
+                model_name,
+                model_max_length=128,  # Limit maximum sequence length
+                cache_dir="./model_cache"  # Cache models locally
+            )
+            model = AutoModelForSeq2SeqLM.from_pretrained(
+                model_name,
+                low_cpu_mem_usage=True,  # Enable low memory usage
+                device_map="cpu"  # Explicitly set to CPU
+            )
+
+            # Move the model to the target device and set eval mode
+            model = model.to(device)
+            model.eval()  # Evaluation mode reduces memory usage
+
+            # Prepare input with reduced length
+            prompt = f"Enhance this search query with relevant terms: {query}"
+            inputs = tokenizer(
+                prompt,
+                return_tensors="pt",
+                max_length=64,  # Reduced from 128
+                truncation=True,
+                padding=True
+            )
+
+            # Generate with minimal parameters
+            with torch.no_grad():  # Disable gradient calculation
+                outputs = model.generate(
+                    inputs.input_ids.to(device),
+                    max_length=32,  # Reduced from 64
+                    num_return_sequences=1,
+                    temperature=0.7,
+                    do_sample=False,  # Disable sampling for faster generation
+                    early_stopping=True
+                )
+
+            enhanced_query = tokenizer.decode(outputs[0], skip_special_tokens=True)
+
+            # Clear the CUDA cache if the GPU was used
+            if device == "cuda":
+                torch.cuda.empty_cache()
+
+        except Exception as model_error:
+            print(f"Model-based expansion failed: {str(model_error)}")
+            enhanced_query = query  # Fall back to the original query
+
+        # 4. Combine original and expanded terms
+        final_terms = set(tokens)
+        final_terms.update(expanded_terms)
+        if enhanced_query != query:  # Only add if model expansion worked
+            final_terms.update(word_tokenize(enhanced_query.lower()))
+
+        # 5. Remove stopwords and select the top_k most relevant terms
+        stopwords = set(['the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to'])
+        final_terms = [term for term in final_terms if term not in stopwords]
+
+        # Combine with the original query
+        expanded_query = f"{query} {' '.join(list(final_terms)[:top_k])}"
+
+        # Clean up
+        del model
+        del tokenizer
+        if device == "cuda":
+            torch.cuda.empty_cache()
+        print(expanded_query.strip())
+        return expanded_query.strip()
+
+    except Exception as e:
+        print(f"Query optimization failed: {str(e)}")
+        return query  # Return the original query if optimization fails
+
+# Example usage
+"""
+chunks = ["sample text chunk 1", "sample text chunk 2"]
+query = "machine learning algorithms"
+optimized_query = optimize_query(
+    query=query,
+    chunks=chunks,
+    embedding_model="sentence-transformers/all-MiniLM-L6-v2",
+    use_gpu=False  # Explicitly use CPU
+)
+"""
 
 
  def create_custom_embedding(texts, model_type='word2vec', vector_size=100, window=5, min_count=1):
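
A practical note on the added imports above: word_tokenize and wordnet.synsets depend on NLTK data packages that are not installed together with the nltk library itself. A minimal one-time setup, assuming a standard NLTK installation, would be:

import nltk

# One-time downloads; without them, word_tokenize() and wordnet.synsets()
# raise LookupError at runtime.
nltk.download('punkt')    # tokenizer models used by word_tokenize
nltk.download('wordnet')  # WordNet corpus used for the synonym expansion
# Recent NLTK releases may additionally require nltk.download('punkt_tab').
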
 
@@ -781,24 +871,26 @@ def compare_embeddings(file, query, embedding_models, custom_embedding_model, sp
     if optimize_vocab:
         tokenizer, optimized_chunks = optimize_vocabulary(chunks)
         chunks = optimized_chunks
-
+
+    search_query = query
+
     if use_query_optimization:
         optimized_queries = optimize_query(query, query_optimization_model, chunks, embedding_model, vector_store_type, search_type, top_k)
         #query = " ".join(optimized_queries)
-        query = " ".join([doc.page_content for doc in optimized_queries])  # Extract text from Document objects
+        search_query = " ".join([doc.page_content for doc in optimized_queries])  # Extract text from Document objects
 
     results, search_time, vector_store, results_raw = search_embeddings(
         chunks,
         embedding_model,
         vector_store_type,
         search_type,
-        query,
+        search_query,
         top_k,
         expected_result,
         lang,
         apply_phonetic,
         phonetic_weight
-    )
+    )
 
     if use_reranking:
         reranker = pipeline("text-classification", model="cross-encoder/ms-marco-MiniLM-L-12-v2")
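
Note that the rewritten optimize_query above returns the expanded query as a plain string (see its docstring), while the updated call site still joins doc.page_content fields as if a list of Document objects were returned. A minimal caller sketch under the new return type, using only names that appear in this diff, would be:

search_query = query
if use_query_optimization:
    # The new helper returns the expanded query directly as a string,
    # so no Document unpacking is needed before passing it to search_embeddings.
    search_query = optimize_query(
        query=query,
        chunks=chunks,
        embedding_model=embedding_model,
        top_k=top_k,
    )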