Update app.py
Browse files
app.py
CHANGED
@@ -49,10 +49,32 @@ model = SentenceTransformer('all-mpnet-base-v2')
|
|
49 |
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
|
50 |
tfidf_matrix = tfidf_vectorizer.fit_transform(texts)
|
51 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
52 |
def hybrid_search(query, top_k=5):
|
53 |
-
print(f"
|
|
|
|
|
|
|
|
|
54 |
|
55 |
-
# Encode the query using the transformer model
|
56 |
query_embedding = model.encode([query])[0]
|
57 |
query_embedding = query_embedding / np.linalg.norm(query_embedding)
|
58 |
|
@@ -60,7 +82,7 @@ def hybrid_search(query, top_k=5):
|
|
60 |
semantic_distances, semantic_indices = index.search(np.array([query_embedding]), top_k * 2)
|
61 |
|
62 |
# Perform TF-IDF based search
|
63 |
-
query_tfidf = tfidf_vectorizer.transform([
|
64 |
tfidf_similarities = cosine_similarity(query_tfidf, tfidf_matrix).flatten()
|
65 |
tfidf_indices = tfidf_similarities.argsort()[-top_k * 2:][::-1]
|
66 |
|
@@ -68,7 +90,7 @@ def hybrid_search(query, top_k=5):
|
|
68 |
combined_results = {}
|
69 |
for i, idx in enumerate(semantic_indices[0]):
|
70 |
patent_number = patent_numbers[idx].decode('utf-8')
|
71 |
-
combined_results[patent_number] = semantic_distances[0][i]
|
72 |
|
73 |
for idx in tfidf_indices:
|
74 |
patent_number = patent_numbers[idx].decode('utf-8')
|
|
|
49 |
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
|
50 |
tfidf_matrix = tfidf_vectorizer.fit_transform(texts)
|
51 |
|
52 |
+
# Synonym dictionary for query expansion.
# Keys are stored lowercase; lookups are case-insensitive (see expand_query).
synonyms = {
    "slidable": ["detachable", "removable"],
    "flexible": ["elastic", "deformable"],
    "aerosol": ["vapor"],
    "device": ["generator"]
}

def expand_query(query):
    """Expand a whitespace-tokenized query with synonyms.

    Each word found in the ``synonyms`` table is replaced by a group of the
    form ``(word OR syn1 OR syn2)``; other words pass through unchanged.
    Lookup is case-insensitive (fixes the original case-sensitive match,
    which silently skipped capitalized words like "Flexible"), while the
    original word's casing is preserved in the output.

    NOTE: the ``OR``/parentheses syntax is purely informational for display —
    a default TfidfVectorizer lowercases input and drops "or" as an English
    stop word, so downstream TF-IDF scoring only sees the extra synonym
    tokens. Punctuation attached to a word (e.g. "device,") still prevents a
    match — TODO: confirm whether queries are pre-tokenized upstream.

    :param query: free-text search query
    :return: expanded query string, words joined by single spaces
    """
    expanded_query = []
    for word in query.split():
        key = word.lower()  # case-insensitive synonym lookup
        if key in synonyms:
            expanded_query.append(f"({word} OR {' OR '.join(synonyms[key])})")
        else:
            expanded_query.append(word)
    return " ".join(expanded_query)
|
69 |
+
|
70 |
def hybrid_search(query, top_k=5):
|
71 |
+
print(f"Original query: {query}")
|
72 |
+
|
73 |
+
# Expand the query using synonyms
|
74 |
+
expanded_query = expand_query(query)
|
75 |
+
print(f"Expanded query: {expanded_query}")
|
76 |
|
77 |
+
# Encode the original query using the transformer model
|
78 |
query_embedding = model.encode([query])[0]
|
79 |
query_embedding = query_embedding / np.linalg.norm(query_embedding)
|
80 |
|
|
|
82 |
semantic_distances, semantic_indices = index.search(np.array([query_embedding]), top_k * 2)
|
83 |
|
84 |
# Perform TF-IDF based search
|
85 |
+
query_tfidf = tfidf_vectorizer.transform([expanded_query])
|
86 |
tfidf_similarities = cosine_similarity(query_tfidf, tfidf_matrix).flatten()
|
87 |
tfidf_indices = tfidf_similarities.argsort()[-top_k * 2:][::-1]
|
88 |
|
|
|
90 |
combined_results = {}
|
91 |
for i, idx in enumerate(semantic_indices[0]):
|
92 |
patent_number = patent_numbers[idx].decode('utf-8')
|
93 |
+
combined_results[patent_number] = semantic_distances[0][i] * 1.5 # Increase weight for semantic similarity
|
94 |
|
95 |
for idx in tfidf_indices:
|
96 |
patent_number = patent_numbers[idx].decode('utf-8')
|