File size: 5,388 Bytes
663d1ad
 
 
 
 
 
5c704be
 
2a70269
 
eaf6036
 
 
8b7ba6b
 
 
 
 
 
663d1ad
 
6744e1a
 
 
 
 
 
5c704be
6744e1a
 
 
 
5c704be
6744e1a
 
 
 
 
5c704be
6744e1a
 
 
 
 
 
663d1ad
5c704be
663d1ad
4a2057c
 
 
 
 
663d1ad
 
 
e0eddce
74523b8
5c704be
 
 
 
2a70269
eaf6036
 
 
 
2a70269
 
eaf6036
b661953
2a70269
 
 
 
b661953
5c704be
b661953
 
2a70269
4a2057c
2a70269
663d1ad
4a2057c
663d1ad
5c704be
8b7ba6b
74523b8
5c704be
2a70269
5c704be
 
663d1ad
5c704be
 
 
 
2a70269
 
 
 
 
 
 
 
086c46f
5c704be
13d437f
2a70269
 
 
 
 
 
 
 
 
5c704be
 
2a70269
5c704be
 
2a70269
663d1ad
2a70269
 
 
663d1ad
 
5c704be
663d1ad
 
 
5c704be
2a70269
663d1ad
 
2a70269
663d1ad
 
74523b8
e0eddce
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
import gradio as gr
import numpy as np
import h5py
import faiss
import json
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import re
from collections import Counter
import spacy

# Load the spaCy English pipeline that extract_key_features() relies on.
# A failed load is treated as "model not installed": download it once, then
# retry the load.
_SPACY_MODEL_NAME = "en_core_web_sm"
try:
    nlp = spacy.load(_SPACY_MODEL_NAME)
except OSError:  # IOError is an alias of OSError on Python 3
    print("Downloading spacy model...")
    spacy.cli.download(_SPACY_MODEL_NAME)
    nlp = spacy.load(_SPACY_MODEL_NAME)

def load_data():
    """Load patent embeddings, patent numbers, metadata and texts from disk.

    Reads ``patent_embeddings.h5`` (datasets ``embeddings`` and
    ``patent_numbers``) and ``patent_metadata.jsonl`` (one JSON object per
    line, each with at least ``patent_number`` and ``text`` keys) from the
    current working directory.

    Returns:
        tuple: ``(embeddings, patent_numbers, metadata, texts)`` where
        ``metadata`` maps each patent number to its full JSON record and
        ``texts`` holds every record's ``text`` in file order.

    Raises:
        FileNotFoundError: If an input file is missing (logged, then
            re-raised).
        Exception: Any other error raised while reading or parsing the
            files (logged, then re-raised).
    """
    try:
        with h5py.File('patent_embeddings.h5', 'r') as f:
            embeddings = f['embeddings'][:]
            patent_numbers = f['patent_numbers'][:]

        metadata = {}
        texts = []
        # Read as UTF-8 explicitly so parsing does not depend on the
        # platform's locale default encoding.
        with open('patent_metadata.jsonl', 'r', encoding='utf-8') as f:
            for line in f:
                if not line.strip():
                    # Tolerate blank lines (e.g. a trailing newline at EOF).
                    continue
                data = json.loads(line)
                metadata[data['patent_number']] = data
                texts.append(data['text'])

        print(f"Embedding shape: {embeddings.shape}")
        print(f"Number of patent numbers: {len(patent_numbers)}")
        print(f"Number of metadata entries: {len(metadata)}")

        # hybrid_search() maps TF-IDF row i (built from texts) back to
        # patent_numbers[i]; a length mismatch means the two files disagree.
        if len(texts) != len(patent_numbers):
            print(
                f"Warning: {len(texts)} metadata texts but "
                f"{len(patent_numbers)} patent numbers; "
                "TF-IDF results may be misaligned."
            )

        return embeddings, patent_numbers, metadata, texts
    except FileNotFoundError as e:
        print(f"Error: Could not find file. {e}")
        raise
    except Exception as e:
        print(f"An unexpected error occurred while loading data: {e}")
        raise

# Load all data once at import time; hybrid_search() reads these globals.
embeddings, patent_numbers, metadata, texts = load_data()

# FAISS works on C-contiguous float32 matrices; convert up front so that
# embeddings stored with another dtype (e.g. float64) are still accepted.
embeddings = np.ascontiguousarray(embeddings, dtype='float32')

# Normalize embeddings for cosine similarity. Rows with zero norm are divided
# by 1 instead of 0 so they stay all-zero rather than turning into NaN.
_row_norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
_row_norms[_row_norms == 0] = 1.0
embeddings = embeddings / _row_norms

# Create FAISS index for cosine similarity (inner product on unit vectors)
index = faiss.IndexFlatIP(embeddings.shape[1])
index.add(embeddings)

# Load BERT model for encoding search queries
model = SentenceTransformer('anferico/bert-for-patents')

# Create TF-IDF vectorizer and fit it on the patent texts
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(texts)

def extract_key_features(text):
    """Return the distinct key phrases found in ``text``.

    The result combines every lower-cased spaCy noun chunk with every
    lower-cased sentence that contains "comprising", "including" or
    "consisting of" (case-insensitive). Duplicates are removed; the order of
    the returned list is unspecified.
    """
    parsed = nlp(text)

    # Start from the noun chunks, then add the claim-style sentences.
    features = {chunk.text.lower() for chunk in parsed.noun_chunks}
    for sentence in parsed.sents:
        if re.search(r'(comprising|including|consisting of)', sentence.text, re.IGNORECASE):
            features.add(sentence.text.lower())

    return list(features)

def compare_features(query_features, patent_features):
    """Compare two feature collections and score their overlap.

    Args:
        query_features: Features extracted from the query.
        patent_features: Features extracted from a patent text.

    Returns:
        tuple: ``(common_features, similarity_score)`` where
        ``common_features`` is the set of features present in both inputs and
        ``similarity_score`` is ``len(common) / max(len(query), len(patent))``.
        The score is ``0.0`` when both inputs are empty.
    """
    common_features = set(query_features) & set(patent_features)
    largest = max(len(query_features), len(patent_features))
    if largest == 0:
        # Nothing to compare: avoid dividing by zero.
        return common_features, 0.0
    similarity_score = len(common_features) / largest
    return common_features, similarity_score

def _decode_patent_number(raw):
    """Return a patent number read from the HDF5 array as ``str``."""
    return raw.decode('utf-8') if isinstance(raw, bytes) else str(raw)


def _build_candidate(patent_number, base_score, query_features):
    """Build one result record: base score plus key-feature overlap."""
    text = metadata[patent_number]['text']
    patent_features = extract_key_features(text)
    if query_features or patent_features:
        common_features, feature_similarity = compare_features(query_features, patent_features)
    else:
        # Neither side produced any features; there is nothing to compare.
        common_features, feature_similarity = set(), 0.0
    return {
        'score': base_score + feature_similarity,
        'common_features': common_features,
        'text': text,
    }


def hybrid_search(query, top_k=5):
    """Search patents by combining semantic, TF-IDF and key-feature scores.

    Candidates come from two sources: the FAISS inner-product index (queried
    with the normalized sentence-transformer embedding of ``query``) and a
    TF-IDF cosine-similarity ranking. Each source contributes up to
    ``top_k * 2`` candidates. Semantic candidates score
    ``1.5 * inner_product + feature_similarity``; TF-IDF-only candidates score
    ``tfidf_similarity + feature_similarity``.

    Args:
        query: Free-text patent description to search for.
        top_k: Number of results to return (must be >= 1).

    Returns:
        str: A formatted block per result (patent number, first 200
        characters of text, combined score, shared key features), or a short
        prompt when ``query`` is empty or blank.

    Raises:
        ValueError: If ``top_k`` is less than 1.
    """
    if top_k < 1:
        # A non-positive top_k would make the [-k:] slice below select every
        # document instead of none.
        raise ValueError("top_k must be a positive integer")
    if not query or not query.strip():
        return "Please enter a query."

    print(f"Original query: {query}")

    query_features = extract_key_features(query)
    n_candidates = top_k * 2

    # Encode the query using the transformer model
    query_embedding = model.encode([query])[0]
    query_embedding = query_embedding / np.linalg.norm(query_embedding)

    # Perform semantic similarity search
    semantic_distances, semantic_indices = index.search(
        np.array([query_embedding]).astype('float32'), n_candidates
    )

    # Perform TF-IDF based search
    query_tfidf = tfidf_vectorizer.transform([query])
    tfidf_similarities = cosine_similarity(query_tfidf, tfidf_matrix).flatten()
    tfidf_indices = tfidf_similarities.argsort()[-n_candidates:][::-1]

    # Combine and rank results
    combined_results = {}
    for distance, idx in zip(semantic_distances[0], semantic_indices[0]):
        if idx < 0:
            # FAISS pads with -1 when the index holds fewer than k vectors;
            # indexing with it would silently pick the last patent.
            continue
        patent_number = _decode_patent_number(patent_numbers[idx])
        combined_results[patent_number] = _build_candidate(
            patent_number, distance * 1.5, query_features
        )

    for idx in tfidf_indices:
        if idx >= len(patent_numbers):
            # TF-IDF row without a matching patent number entry.
            continue
        patent_number = _decode_patent_number(patent_numbers[idx])
        if patent_number not in combined_results:
            combined_results[patent_number] = _build_candidate(
                patent_number, tfidf_similarities[idx], query_features
            )

    # Sort and get top results
    top_results = sorted(combined_results.items(), key=lambda x: x[1]['score'], reverse=True)[:top_k]

    results = []
    for patent_number, data in top_results:
        result = f"Patent Number: {patent_number}\n"
        result += f"Text: {data['text'][:200]}...\n"
        result += f"Combined Score: {data['score']:.4f}\n"
        # Sort the set so the feature listing is stable between runs.
        result += f"Common Key Features: {', '.join(sorted(data['common_features']))}\n\n"
        results.append(result)

    return "\n".join(results)

# Build the Gradio UI: a two-line query box feeding hybrid_search(), whose
# formatted text output is shown in a ten-line results box.
_query_box = gr.Textbox(lines=2, placeholder="Enter your patent query here...")
_results_box = gr.Textbox(lines=10, label="Search Results")

iface = gr.Interface(
    fn=hybrid_search,
    inputs=_query_box,
    outputs=_results_box,
    title="Patent Similarity Search",
    description="Enter a patent description to find similar patents based on key features.",
)

# Launch the app only when this file is executed as a script.
if __name__ == "__main__":
    iface.launch()