Update app.py
Browse files
app.py
CHANGED
@@ -6,7 +6,9 @@ import json
|
|
6 |
from transformers import AutoTokenizer, AutoModel
|
7 |
from sentence_transformers import SentenceTransformer, models
|
8 |
from sklearn.feature_extraction.text import TfidfVectorizer
|
|
|
9 |
import re
|
|
|
10 |
import spacy
|
11 |
import joblib
|
12 |
|
@@ -50,7 +52,9 @@ embeddings, patent_numbers, metadata, texts = load_data()
|
|
50 |
try:
|
51 |
tokenizer = AutoTokenizer.from_pretrained('anferico/bert-for-patents')
|
52 |
bert_model = AutoModel.from_pretrained('anferico/bert-for-patents')
|
53 |
-
|
|
|
|
|
54 |
except Exception as e:
|
55 |
print(f"Error loading anferico/bert-for-patents: {e}")
|
56 |
print("Falling back to a general-purpose model.")
|
@@ -99,28 +103,4 @@ def hybrid_search(query, top_k=5):
|
|
99 |
query_embedding = query_embedding / np.linalg.norm(query_embedding)
|
100 |
|
101 |
# Perform semantic similarity search
|
102 |
-
semantic_distances,
|
103 |
-
|
104 |
-
results = []
|
105 |
-
for i in range(top_k):
|
106 |
-
patent_number = patent_numbers[semantic_indices[0][i]]
|
107 |
-
patent_data = metadata[patent_number]
|
108 |
-
patent_features = extract_key_features(patent_data['text'])
|
109 |
-
common_features, similarity_score = compare_features(query_features, patent_features)
|
110 |
-
|
111 |
-
results.append({
|
112 |
-
'patent_number': patent_number,
|
113 |
-
'common_features': common_features,
|
114 |
-
'similarity_score': similarity_score,
|
115 |
-
'semantic_score': semantic_distances[0][i]
|
116 |
-
})
|
117 |
-
|
118 |
-
return results
|
119 |
-
|
120 |
-
iface = gr.Interface(
|
121 |
-
fn=hybrid_search,
|
122 |
-
inputs=gr.inputs.Textbox(label="Enter your search query"),
|
123 |
-
outputs=gr.outputs.JSON(label="Search Results")
|
124 |
-
)
|
125 |
-
|
126 |
-
iface.launch()
|
|
|
6 |
from transformers import AutoTokenizer, AutoModel
|
7 |
from sentence_transformers import SentenceTransformer, models
|
8 |
from sklearn.feature_extraction.text import TfidfVectorizer
|
9 |
+
from sklearn.metrics.pairwise import cosine_similarity
|
10 |
import re
|
11 |
+
from collections import Counter
|
12 |
import spacy
|
13 |
import joblib
|
14 |
|
|
|
52 |
try:
|
53 |
tokenizer = AutoTokenizer.from_pretrained('anferico/bert-for-patents')
|
54 |
bert_model = AutoModel.from_pretrained('anferico/bert-for-patents')
|
55 |
+
word_embedding_model = models.Transformer(model_name='anferico/bert-for-patents', tokenizer=tokenizer, model=bert_model)
|
56 |
+
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
|
57 |
+
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
|
58 |
except Exception as e:
|
59 |
print(f"Error loading anferico/bert-for-patents: {e}")
|
60 |
print("Falling back to a general-purpose model.")
|
|
|
103 |
query_embedding = query_embedding / np.linalg.norm(query_embedding)
|
104 |
|
105 |
# Perform semantic similarity search
|
106 |
+
semantic_distances, semantic
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|