bhlewis committed
Commit b661953 · verified · 1 Parent(s): 5df2c8a

Update app.py

Files changed (1): app.py +26 -4
app.py CHANGED
@@ -49,10 +49,32 @@ model = SentenceTransformer('all-mpnet-base-v2')
 tfidf_vectorizer = TfidfVectorizer(stop_words='english')
 tfidf_matrix = tfidf_vectorizer.fit_transform(texts)
 
+# Synonym dictionary for query expansion
+synonyms = {
+    "slidable": ["detachable", "removable"],
+    "flexible": ["elastic", "deformable"],
+    "aerosol": ["vapor"],
+    "device": ["generator"]
+}
+
+def expand_query(query):
+    words = query.split()
+    expanded_query = []
+    for word in words:
+        if word in synonyms:
+            expanded_query.append(f"({word} OR {' OR '.join(synonyms[word])})")
+        else:
+            expanded_query.append(word)
+    return " ".join(expanded_query)
+
 def hybrid_search(query, top_k=5):
-    print(f"Searching for: {query}")
+    print(f"Original query: {query}")
+
+    # Expand the query using synonyms
+    expanded_query = expand_query(query)
+    print(f"Expanded query: {expanded_query}")
 
-    # Encode the query using the transformer model
+    # Encode the original query using the transformer model
     query_embedding = model.encode([query])[0]
     query_embedding = query_embedding / np.linalg.norm(query_embedding)
 
@@ -60,7 +82,7 @@ def hybrid_search(query, top_k=5):
     semantic_distances, semantic_indices = index.search(np.array([query_embedding]), top_k * 2)
 
     # Perform TF-IDF based search
-    query_tfidf = tfidf_vectorizer.transform([query])
+    query_tfidf = tfidf_vectorizer.transform([expanded_query])
     tfidf_similarities = cosine_similarity(query_tfidf, tfidf_matrix).flatten()
     tfidf_indices = tfidf_similarities.argsort()[-top_k * 2:][::-1]
 
@@ -68,7 +90,7 @@ def hybrid_search(query, top_k=5):
     combined_results = {}
     for i, idx in enumerate(semantic_indices[0]):
         patent_number = patent_numbers[idx].decode('utf-8')
-        combined_results[patent_number] = semantic_distances[0][i]
+        combined_results[patent_number] = semantic_distances[0][i] * 1.5  # Increase weight for semantic similarity
 
     for idx in tfidf_indices:
         patent_number = patent_numbers[idx].decode('utf-8')
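
For reference, a minimal standalone sketch of the query-expansion step added in this commit. The synonyms dictionary and expand_query are copied from the diff above; the sample query is hypothetical and only illustrates the expected output.

# Sketch of the query-expansion step from app.py (sample query is hypothetical)
synonyms = {
    "slidable": ["detachable", "removable"],
    "flexible": ["elastic", "deformable"],
    "aerosol": ["vapor"],
    "device": ["generator"]
}

def expand_query(query):
    words = query.split()
    expanded_query = []
    for word in words:
        if word in synonyms:
            # Wrap the word and its synonyms in an OR group
            expanded_query.append(f"({word} OR {' OR '.join(synonyms[word])})")
        else:
            expanded_query.append(word)
    return " ".join(expanded_query)

print(expand_query("slidable aerosol device"))
# -> (slidable OR detachable OR removable) (aerosol OR vapor) (device OR generator)

Note that only the TF-IDF branch sees the expanded string; the dense embedding still encodes the original query. With TfidfVectorizer's default tokenizer and stop_words='english', the parentheses are stripped and "OR" is dropped as a stop word, so the expansion effectively just adds the synonym terms as extra lexical matches.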