BERT-for-Patents_Semantic-Patent-Finder-v2

Running

App Files Files Community

bhlewis committed on Aug 2

Commit

86b10f3

•

1 Parent(s): f05d8cd

Update app.py

Browse files

Files changed (1) hide show

app.py +80 -92

app.py CHANGED Viewed

@@ -3,80 +3,33 @@ import numpy as np
 import h5py
 import faiss
 import json
 import re
 from collections import Counter
 import torch
-from nltk.corpus import stopwords
-from nltk.tokenize import word_tokenize
 import nltk
-from sentence_transformers import SentenceTransformer
-from sklearn.feature_extraction.text import TfidfVectorizer
-from sklearn.metrics.pairwise import cosine_similarity
-# Download necessary NLTK data
-nltk.download('stopwords', quiet=True)
-nltk.download('punkt', quiet=True)
-# Load SentenceTransformer model
-model = SentenceTransformer('anferico/bert-for-patents')
-def preprocess_query(text):
-    # Remove "[EN]" label and claim numbers
-    text = re.sub(r'\[EN\]\s*', '', text)
-    text = re.sub(r'^\d+\.\s*', '', text, flags=re.MULTILINE)
-    # Convert to lowercase while preserving acronyms and units
-    words = text.split()
-    text = ' '.join(word if word.isupper() or re.match(r'^\d+(\.\d+)?[a-zA-Z]+$', word) else word.lower() for word in words)
-    # Remove special characters except hyphens and periods in numbers
-    text = re.sub(r'[^\w\s\-.]', ' ', text)
-    text = re.sub(r'(?<!\d)\.(?!\d)', ' ', text)  # Remove periods not in numbers
-    # Normalize spaces
-    text = re.sub(r'\s+', ' ', text).strip()
-    # Tokenize
-    tokens = word_tokenize(text)
-    # Remove stopwords
-    stop_words = set(stopwords.words('english'))
-    tokens = [word for word in tokens if word.lower() not in stop_words]
-    # Join tokens back into text
-    text = ' '.join(tokens)
-    # Preserve numerical values with units
-    text = re.sub(r'(\d+(\.\d+)?)([a-zA-Z]+)', r'\1_\3', text)
-    # Handle ranges and measurements
-    text = re.sub(r'(\d+(\.\d+)?)(\s*to\s*)(\d+(\.\d+)?)(\s*[a-zA-Z]+)', r'\1_to_\4_\6', text)
-    text = re.sub(r'between\s*(\d+(\.\d+)?)(\s*and\s*)(\d+(\.\d+)?)\s*([a-zA-Z]+)', r'between_\1_and_\4_\5', text)
-    # Preserve chemical formulas
-    text = re.sub(r'\b([A-Z][a-z]?\d*)+\b', lambda m: m.group().replace(' ', ''), text)
-    return text
-def extract_key_features(text):
-    # For queries, we'll just preprocess and return all non-stopword terms
-    processed_text = preprocess_query(text)
-    # Split the processed text into individual terms
-    features = processed_text.split()
-    # Remove duplicates while preserving order
-    features = list(dict.fromkeys(features))
-    return features
-def encode_texts(texts):
-    embeddings = model.encode(texts, show_progress_bar=True)
-    return embeddings
 def load_data():
     try:
         with h5py.File('patent_embeddings.h5', 'r') as f:
             embeddings = f['embeddings'][:]
             patent_numbers = f['patent_numbers'][:]
         metadata = {}
         texts = []
         with open('patent_metadata.jsonl', 'r') as f:
@@ -84,15 +37,63 @@ def load_data():
                 data = json.loads(line)
                 metadata[data['patent_number']] = data
                 texts.append(data['text'])
         print(f"Embedding shape: {embeddings.shape}")
         print(f"Number of patent numbers: {len(patent_numbers)}")
         print(f"Number of metadata entries: {len(metadata)}")
         return embeddings, patent_numbers, metadata, texts
     except Exception as e:
-        print(f"An error occurred while loading data: {e}")
         raise
 def compare_features(query_features, patent_features):
     common_features = set(query_features) & set(patent_features)
     similarity_score = len(common_features) / max(len(query_features), len(patent_features))
@@ -100,21 +101,21 @@ def compare_features(query_features, patent_features):
 def hybrid_search(query, top_k=5):
     print(f"Original query: {query}")
-    processed_query = preprocess_query(query)
-    query_features = extract_key_features(processed_query)
-    # Encode the processed query using the SentenceTransformer model
-    query_embedding = encode_texts([processed_query])[0]
     query_embedding = query_embedding / np.linalg.norm(query_embedding)
     # Perform semantic similarity search
     semantic_distances, semantic_indices = index.search(np.array([query_embedding]).astype('float32'), top_k * 2)
     # Perform TF-IDF based search
-    query_tfidf = tfidf_vectorizer.transform([processed_query])
     tfidf_similarities = cosine_similarity(query_tfidf, tfidf_matrix).flatten()
     tfidf_indices = tfidf_similarities.argsort()[-top_k * 2:][::-1]
     # Combine and rank results
     combined_results = {}
     for i, idx in enumerate(semantic_indices[0]):
@@ -127,7 +128,7 @@ def hybrid_search(query, top_k=5):
             'common_features': common_features,
             'text': text
         }
     for idx in tfidf_indices:
         patent_number = patent_numbers[idx].decode('utf-8')
         if patent_number not in combined_results:
@@ -139,9 +140,10 @@ def hybrid_search(query, top_k=5):
                 'common_features': common_features,
                 'text': text
             }
     # Sort and get top results
     top_results = sorted(combined_results.items(), key=lambda x: x[1]['score'], reverse=True)[:top_k]
     results = []
     for patent_number, data in top_results:
         result = f"Patent Number: {patent_number}\n"
@@ -149,24 +151,10 @@ def hybrid_search(query, top_k=5):
         result += f"Combined Score: {data['score']:.4f}\n"
         result += f"Common Key Features: {', '.join(data['common_features'])}\n\n"
         results.append(result)
     return "\n".join(results)
-# Load data and prepare the FAISS index
-embeddings, patent_numbers, metadata, texts = load_data()
-# Normalize embeddings for cosine similarity
-embeddings = embeddings / np.linalg.norm(embeddings, axis=1, keepdims=True)
-# Create FAISS index for cosine similarity
-index = faiss.IndexFlatIP(embeddings.shape[1])
-index.add(embeddings)
-# Create TF-IDF vectorizer
-tfidf_vectorizer = TfidfVectorizer(stop_words='english')
-tfidf_matrix = tfidf_vectorizer.fit_transform(texts)
-# Create Gradio interface
 iface = gr.Interface(
     fn=hybrid_search,
     inputs=[
@@ -179,4 +167,4 @@ iface = gr.Interface(
 )
 if __name__ == "__main__":
-    iface.launch()

 import h5py
 import faiss
 import json
+from transformers import AutoTokenizer, AutoModel
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.metrics.pairwise import cosine_similarity
 import re
 from collections import Counter
+import spacy
 import torch
+from nltk.corpus import wordnet
 import nltk
+# Download WordNet data
+nltk.download('wordnet')
+# Load Spacy model for advanced NLP
+try:
+    nlp = spacy.load("en_core_web_sm")
+except IOError:
+    print("Downloading spacy model...")
+    spacy.cli.download("en_core_web_sm")
+    nlp = spacy.load("en_core_web_sm")
 def load_data():
     try:
         with h5py.File('patent_embeddings.h5', 'r') as f:
             embeddings = f['embeddings'][:]
             patent_numbers = f['patent_numbers'][:]
         metadata = {}
         texts = []
         with open('patent_metadata.jsonl', 'r') as f:
                 data = json.loads(line)
                 metadata[data['patent_number']] = data
                 texts.append(data['text'])
         print(f"Embedding shape: {embeddings.shape}")
         print(f"Number of patent numbers: {len(patent_numbers)}")
         print(f"Number of metadata entries: {len(metadata)}")
         return embeddings, patent_numbers, metadata, texts
+    except FileNotFoundError as e:
+        print(f"Error: Could not find file. {e}")
+        raise
     except Exception as e:
+        print(f"An unexpected error occurred while loading data: {e}")
         raise
+embeddings, patent_numbers, metadata, texts = load_data()
+# Load BERT model for encoding search queries
+tokenizer = AutoTokenizer.from_pretrained('anferico/bert-for-patents')
+bert_model = AutoModel.from_pretrained('anferico/bert-for-patents')
+def encode_texts(texts, max_length=512):
+    """Embed texts with the module-level BERT model using masked mean pooling.
+
+    Args:
+        texts: A list of strings to embed.
+        max_length: Maximum number of tokens per text; longer inputs are
+            truncated by the tokenizer.
+
+    Returns:
+        A 2-D NumPy array with one embedding row per input text.
+    """
+    inputs = tokenizer(texts, padding=True, truncation=True, max_length=max_length, return_tensors='pt')
+    with torch.no_grad():
+        outputs = bert_model(**inputs)
+    hidden = outputs.last_hidden_state
+    # Average over real tokens only. With padding=True, shorter texts in a
+    # batch are padded to the longest one, and a plain mean over dim=1 would
+    # blend pad-token states into their embeddings. For a single text there
+    # is no padding, so the result matches the plain mean up to rounding.
+    mask = inputs['attention_mask'].unsqueeze(-1).to(hidden.dtype)
+    token_counts = mask.sum(dim=1).clamp(min=1)  # guard against division by zero
+    embeddings = (hidden * mask).sum(dim=1) / token_counts
+    return embeddings.numpy()
+# Check if the stored embeddings have the same width as the query encoder output
+if embeddings.shape[1] != encode_texts(["test"]).shape[1]:
+    print("Embedding dimensions do not match. Rebuilding FAISS index.")
+    # Rebuild embeddings using the new model
+    embeddings = encode_texts(texts)
+# Normalize embeddings for cosine similarity; doing it once here covers both
+# the loaded and the rebuilt embeddings
+embeddings = embeddings / np.linalg.norm(embeddings, axis=1, keepdims=True)
+# Create FAISS index for cosine similarity (inner product on unit vectors)
+index = faiss.IndexFlatIP(embeddings.shape[1])
+# Cast to contiguous float32 before adding, matching the explicit float32
+# cast applied to the query vector at search time
+index.add(np.ascontiguousarray(embeddings, dtype='float32'))
+# Create TF-IDF vectorizer
+tfidf_vectorizer = TfidfVectorizer(stop_words='english')
+tfidf_matrix = tfidf_vectorizer.fit_transform(texts)
+def extract_key_features(text):
+    """Return a de-duplicated list of lowercase feature strings found in text.
+
+    Three kinds of features are gathered from the spaCy parse: tokens that
+    are adjectival/compound modifiers or carry one of a few entity labels,
+    noun chunks, and whole sentences containing a claim-style keyword.
+    The order of the returned list is not meaningful.
+    """
+    parsed = nlp(text)
+    modifier_deps = ('amod', 'compound')
+    entity_labels = ('PRODUCT', 'ORG', 'GPE', 'NORP')
+    # Single tokens: modifiers and selected named-entity tokens
+    collected = [
+        tok.text.lower()
+        for tok in parsed
+        if tok.dep_ in modifier_deps or tok.ent_type_ in entity_labels
+    ]
+    # Multi-word noun phrases
+    collected.extend(chunk.text.lower() for chunk in parsed.noun_chunks)
+    # Sentences that mention a claim-style keyword (case-insensitive)
+    for sentence in parsed.sents:
+        if re.search(r'(comprising|including|consisting of|deformable|insulation|heat-resistant|memory foam|high-temperature)', sentence.text, re.IGNORECASE):
+            collected.append(sentence.text.lower())
+    return list(set(collected))
 def compare_features(query_features, patent_features):
     common_features = set(query_features) & set(patent_features)
     similarity_score = len(common_features) / max(len(query_features), len(patent_features))
 def hybrid_search(query, top_k=5):
     print(f"Original query: {query}")
+    query_features = extract_key_features(query)
+    # Encode the query using the transformer model
+    query_embedding = encode_texts([query])[0]
     query_embedding = query_embedding / np.linalg.norm(query_embedding)
     # Perform semantic similarity search
     semantic_distances, semantic_indices = index.search(np.array([query_embedding]).astype('float32'), top_k * 2)
     # Perform TF-IDF based search
+    query_tfidf = tfidf_vectorizer.transform([query])
     tfidf_similarities = cosine_similarity(query_tfidf, tfidf_matrix).flatten()
     tfidf_indices = tfidf_similarities.argsort()[-top_k * 2:][::-1]
     # Combine and rank results
     combined_results = {}
     for i, idx in enumerate(semantic_indices[0]):
             'common_features': common_features,
             'text': text
         }
     for idx in tfidf_indices:
         patent_number = patent_numbers[idx].decode('utf-8')
         if patent_number not in combined_results:
                 'common_features': common_features,
                 'text': text
             }
     # Sort and get top results
     top_results = sorted(combined_results.items(), key=lambda x: x[1]['score'], reverse=True)[:top_k]
     results = []
     for patent_number, data in top_results:
         result = f"Patent Number: {patent_number}\n"
         result += f"Combined Score: {data['score']:.4f}\n"
         result += f"Common Key Features: {', '.join(data['common_features'])}\n\n"
         results.append(result)
     return "\n".join(results)
+# Create Gradio interface with additional input fields
 iface = gr.Interface(
     fn=hybrid_search,
     inputs=[
 )
 if __name__ == "__main__":
+    iface.launch()