bhlewis's picture
Update app.py
cf6f1f1 verified
raw
history blame
6.34 kB
import gradio as gr
import numpy as np
import h5py
import faiss
import json
from transformers import AutoModel, AutoTokenizer
from sentence_transformers import SentenceTransformer, models
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import re
from collections import Counter
import spacy
# Load the spaCy English pipeline used below for noun-chunk and sentence
# extraction. If the model package is not installed, download it once and
# retry the load.
_SPACY_MODEL_NAME = "en_core_web_sm"
try:
    nlp = spacy.load(_SPACY_MODEL_NAME)
except OSError:
    print("Downloading spacy model...")
    spacy.cli.download(_SPACY_MODEL_NAME)
    nlp = spacy.load(_SPACY_MODEL_NAME)
def load_data():
    """Load the precomputed patent embeddings and their metadata from disk.

    Reads ``patent_embeddings.h5`` (datasets ``embeddings`` and
    ``patent_numbers``) and ``patent_metadata.jsonl`` (one JSON object per
    line, each with at least ``patent_number`` and ``text`` keys). Both
    paths are relative, so they resolve against the working directory.

    Returns:
        A tuple ``(embeddings, patent_numbers, metadata, texts)`` where
        ``metadata`` maps each patent number to its full JSON record and
        ``texts`` holds the ``text`` field of every record in file order.

    Raises:
        FileNotFoundError: If an input file is missing (logged, then
            re-raised).
        Exception: Any other failure while reading or parsing is logged and
            re-raised unchanged.
    """
    try:
        with h5py.File('patent_embeddings.h5', 'r') as f:
            embeddings = f['embeddings'][:]
            patent_numbers = f['patent_numbers'][:]

        metadata = {}
        texts = []
        # Decode explicitly as UTF-8 so parsing does not depend on the
        # platform's default locale encoding.
        with open('patent_metadata.jsonl', 'r', encoding='utf-8') as f:
            for line in f:
                # Tolerate blank lines (e.g. a trailing newline at EOF),
                # which json.loads would otherwise reject.
                if not line.strip():
                    continue
                data = json.loads(line)
                metadata[data['patent_number']] = data
                texts.append(data['text'])

        # NOTE(review): `texts` follows the metadata file order while
        # `embeddings`/`patent_numbers` follow the HDF5 order; downstream
        # code indexes both with the same row ids, so the two files are
        # presumably written in the same order -- confirm.
        print(f"Embedding shape: {embeddings.shape}")
        print(f"Number of patent numbers: {len(patent_numbers)}")
        print(f"Number of metadata entries: {len(metadata)}")
        return embeddings, patent_numbers, metadata, texts
    except FileNotFoundError as e:
        print(f"Error: Could not find file. {e}")
        raise
    except Exception as e:
        print(f"An unexpected error occurred while loading data: {e}")
        raise
embeddings, patent_numbers, metadata, texts = load_data()

# Load the patent-specific BERT encoder used to embed search queries.
# models.Transformer takes a model name or path as its first argument and
# loads the weights and tokenizer itself. Passing already-instantiated
# model/tokenizer objects does not match that signature, so that form is
# expected to raise and drop straight into the fallback branch below.
try:
    word_embedding_model = models.Transformer('anferico/bert-for-patents')
    pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
    model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
except Exception as e:
    # Deliberate best-effort: keep the app usable with a general-purpose
    # encoder if the patent model cannot be loaded.
    print(f"Error loading anferico/bert-for-patents: {e}")
    print("Falling back to a general-purpose model.")
    model = SentenceTransformer('all-MiniLM-L6-v2')
# The stored embeddings are only usable if their width matches the output
# dimension of the encoder that will embed the queries.
if embeddings.shape[1] != model.get_sentence_embedding_dimension():
    print("Embedding dimensions do not match. Rebuilding FAISS index.")
    # Re-encode every text with the active model in one batched call rather
    # than one encode() call per text.
    # NOTE(review): rebuilt rows follow `texts` order; lookups later index
    # `patent_numbers` with the same row ids, so both are presumably in the
    # same order -- confirm.
    embeddings = np.asarray(model.encode(texts))

# FAISS operates on C-contiguous float32 matrices; the stored array's dtype
# is not guaranteed, so convert once up front.
embeddings = np.ascontiguousarray(embeddings, dtype='float32')

# Normalize rows to unit length (exactly once) so that inner product equals
# cosine similarity. Zero-length rows are left as zeros instead of becoming
# NaN through a division by zero.
row_norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
row_norms[row_norms == 0] = 1.0
embeddings = embeddings / row_norms

# Create FAISS index for cosine similarity (inner product on unit vectors)
index = faiss.IndexFlatIP(embeddings.shape[1])
index.add(embeddings)

# Create TF-IDF vectorizer and the document-term matrix for all patent texts
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(texts)
def extract_key_features(text):
    """Return the distinct key phrases found in ``text``.

    The phrases are the lower-cased noun chunks reported by the spaCy
    pipeline, plus every lower-cased sentence that contains "comprising",
    "including" or "consisting of" (case-insensitive). Duplicates are
    removed; the order of the returned list is unspecified.
    """
    parsed = nlp(text)
    features = set()

    # Noun chunks first, then the sentences carrying the claim-style keywords.
    for chunk in parsed.noun_chunks:
        features.add(chunk.text.lower())
    for sentence in parsed.sents:
        if re.search(r'(comprising|including|consisting of)', sentence.text, re.IGNORECASE):
            features.add(sentence.text.lower())

    return list(features)
def compare_features(query_features, patent_features):
    """Compare two feature collections and score their overlap.

    Args:
        query_features: Features extracted from the query.
        patent_features: Features extracted from a patent text.

    Returns:
        A tuple ``(common_features, similarity_score)``: the set of features
        present in both inputs, and the size of that set divided by the
        length of the longer input. The score is ``0.0`` when both inputs
        are empty.
    """
    common_features = set(query_features) & set(patent_features)
    longest = max(len(query_features), len(patent_features))
    # Two empty inputs share nothing; report zero overlap instead of
    # dividing by zero.
    if longest == 0:
        return common_features, 0.0
    similarity_score = len(common_features) / longest
    return common_features, similarity_score
def _score_patent(patent_number, base_score, query_features):
    """Build the result record for one candidate patent.

    Looks up the patent text, measures its key-feature overlap with the
    query, and adds that overlap score to ``base_score``.
    """
    text = metadata[patent_number]['text']
    patent_features = extract_key_features(text)
    common_features, feature_similarity = compare_features(query_features, patent_features)
    return {
        'score': base_score + feature_similarity,
        'common_features': common_features,
        'text': text,
    }


def hybrid_search(query, top_k=5):
    """Search the patent corpus with combined semantic, TF-IDF and feature scores.

    Candidates come from two sources, each contributing up to ``top_k * 2``
    hits: the FAISS index (query embedding) and the TF-IDF matrix. A
    semantic hit scores ``1.5 * similarity + feature overlap``; a hit found
    only by TF-IDF scores ``tfidf similarity + feature overlap``. Patents
    found by both keep their semantic score.

    Args:
        query: Free-text patent description to search for.
        top_k: Number of results to return.

    Returns:
        A single string with one formatted entry per result (patent number,
        first 200 characters of text, combined score, shared key features),
        ordered by descending score.
    """
    print(f"Original query: {query}")
    query_features = extract_key_features(query)

    # Encode the query using the transformer model and scale it to unit
    # length so the inner-product index yields cosine similarity.
    query_embedding = model.encode([query])[0]
    query_embedding = query_embedding / np.linalg.norm(query_embedding)

    # Perform semantic similarity search
    semantic_distances, semantic_indices = index.search(
        np.array([query_embedding]).astype('float32'), top_k * 2
    )

    # Perform TF-IDF based search
    query_tfidf = tfidf_vectorizer.transform([query])
    tfidf_similarities = cosine_similarity(query_tfidf, tfidf_matrix).flatten()
    tfidf_indices = tfidf_similarities.argsort()[-top_k * 2:][::-1]

    # Combine and rank results
    combined_results = {}
    for similarity, idx in zip(semantic_distances[0], semantic_indices[0]):
        # FAISS pads the result with label -1 when the index holds fewer
        # vectors than requested; such a label would otherwise be treated
        # as "last row" by negative indexing.
        if idx < 0:
            continue
        patent_number = patent_numbers[idx].decode('utf-8')
        combined_results[patent_number] = _score_patent(
            patent_number, similarity * 1.5, query_features
        )

    for idx in tfidf_indices:
        patent_number = patent_numbers[idx].decode('utf-8')
        # Semantic hits take precedence; TF-IDF only adds new candidates.
        if patent_number not in combined_results:
            combined_results[patent_number] = _score_patent(
                patent_number, tfidf_similarities[idx], query_features
            )

    # Sort and get top results
    top_results = sorted(
        combined_results.items(), key=lambda x: x[1]['score'], reverse=True
    )[:top_k]

    results = []
    for patent_number, data in top_results:
        # Sort the shared features so the output is stable between runs
        # (set iteration order is not).
        shared = ', '.join(sorted(data['common_features']))
        results.append(
            f"Patent Number: {patent_number}\n"
            f"Text: {data['text'][:200]}...\n"
            f"Combined Score: {data['score']:.4f}\n"
            f"Common Key Features: {shared}\n\n"
        )
    return "\n".join(results)
# Build the Gradio UI: one free-text query box wired to hybrid_search, and
# one text box that shows the formatted results.
_query_box = gr.Textbox(lines=2, placeholder="Enter your patent query here...")
_results_box = gr.Textbox(lines=10, label="Search Results")

iface = gr.Interface(
    fn=hybrid_search,
    inputs=_query_box,
    outputs=_results_box,
    title="Patent Similarity Search",
    description="Enter a patent description to find similar patents based on key features.",
)

# Start the web app only when this file is run as a script.
if __name__ == "__main__":
    iface.launch()