Zeitstaub committed on
Commit
e6c2600
1 Parent(s): f257665

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +80 -0
app.py ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import numpy as np
3
+ import pandas as pd
4
+ from sentence_transformers import SentenceTransformer
5
+ from sklearn.metrics.pairwise import euclidean_distances
6
+
7
# Patent abstracts together with their embedding columns; the parquet file is
# read a single time when the module is imported.
text_embeddings = pd.read_parquet('text_embeddings_abstract_all.parquet')

# Instantiate every Sentence-Transformer checkpoint the app offers.
model_all_Mini = SentenceTransformer('all-MiniLM-L6-v2')
model_e5_large_v2 = SentenceTransformer('intfloat/e5-large-v2')
model_e5_small_v2 = SentenceTransformer('intfloat/e5-small-v2')
model_gte_large = SentenceTransformer('thenlper/gte-large')
model_GIST_large = SentenceTransformer('avsolatorio/GIST-large-Embedding-v0')

# Checkpoint name -> loaded model. The insertion order is the order in which
# the names are listed in the model drop-down.
model_options = dict([
    ('all-MiniLM-L6-v2', model_all_Mini),
    ('intfloat/e5-large-v2', model_e5_large_v2),
    ('intfloat/e5-small-v2', model_e5_small_v2),
    ('thenlper/gte-large', model_gte_large),
    ('avsolatorio/GIST-large-Embedding-v0', model_GIST_large),
])
25
+
26
+ # Main function for the Gradio interface
27
def find_similar_texts(model_name, input_text):
    """Return the five patent abstracts closest to ``input_text``.

    The query is first embedded with the all-MiniLM-L6-v2 model and compared
    against the stored MiniLM embeddings. Only when at least one abstract lies
    within a Euclidean distance of 1.05 is the query ranked with the model
    selected by ``model_name``.

    Args:
        model_name: Key of ``model_options`` naming the model to rank with.
            The stored embeddings are read from the column
            ``'embedding_' + model_name``.
        input_text: Free-text description entered by the user.

    Returns:
        A display string: the five nearest abstracts with their patent
        numbers (nearest first), or a short message when the input is empty,
        no model is selected, or no abstract is close enough.
    """
    # Nothing to embed: answer directly instead of searching for "".
    if not input_text or not input_text.strip():
        return "Please enter a description to search for."

    # The drop-down may deliver None (nothing selected); avoid a KeyError.
    selected_model = model_options.get(model_name)
    if selected_model is None:
        return "Please choose a Sentence-Transformer model first."

    # Relevance gate: is there any abstract close to the query at all?
    input_embedding_mini = model_all_Mini.encode(input_text).reshape(1, -1)
    embedding_matrix_mini = np.vstack(text_embeddings['embedding_all-MiniLM-L6-v2'])
    distances_mini = euclidean_distances(embedding_matrix_mini, input_embedding_mini).flatten()

    if not (distances_mini < 1.05).any():
        return "It seems there is no patent abstract close to your description."

    if selected_model is model_all_Mini:
        # Same model and same column as the gate: reuse the distances.
        distances = distances_mini
    else:
        embedding_column = 'embedding_' + model_name
        input_embedding = selected_model.encode(input_text).reshape(1, -1)
        embedding_matrix = np.vstack(text_embeddings[embedding_column])
        distances = euclidean_distances(embedding_matrix, input_embedding).flatten()

    # Rank by position instead of writing a distance column into the shared
    # module-level DataFrame, so each request leaves the global state untouched.
    nearest_positions = np.argsort(distances)[:5]
    top_five = text_embeddings.iloc[nearest_positions][['abstract', 'patent no']]

    return '\n\n'.join(
        f"Patent No: {patent_no}\nAbstract: {abstract}\n"
        for patent_no, abstract in zip(top_five['patent no'], top_five['abstract'])
    )
47
+
48
# Create Gradio interface using Blocks.
# Layout: a two-column row (inputs on the left, results on the right) followed
# by a static description. The search runs on button click, on a model change
# and when the text box is submitted.
with gr.Blocks() as demo:
    gr.Markdown("## Sentence-Transformer based Patent-Abstract Search")
    with gr.Row():
        with gr.Column():
            model_choices = list(model_options.keys())
            # Pre-select the first model so a search never runs without one.
            model_selector = gr.Dropdown(
                choices=model_choices,
                value=model_choices[0],
                label="Choose Sentence-Transformer",
            )
            text_input = gr.Textbox(lines=2, placeholder="input_text", label="input_text (your description)")
            submit_button = gr.Button("search")

        with gr.Column():
            output = gr.Textbox(label="top 5 patent abstracts (if available)")

    submit_button.click(find_similar_texts, inputs=[model_selector, text_input], outputs=output)

    gr.Markdown("""
    ### Description
    This demo app leverages several Sentence Transformer models to compute the semantic distance between user input and a small number of patent abstracts in the field of machine learning and AI.

    - 'all-MiniLM-L6-v2': embedding size is 384. [More info](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2) and [here](https://sbert.net/).
    - 'intfloat/e5-large-v2': Text Embeddings by Weakly-Supervised Contrastive Pre-training, embedding size is 1024. [More info](https://huggingface.co/intfloat/e5-large-v2).
    - 'intfloat/e5-small-v2': Text Embeddings by Weakly-Supervised Contrastive Pre-training, embedding size is 384. [More info](https://huggingface.co/intfloat/e5-small-v2).
    - 'thenlper/gte-large': General Text Embeddings (GTE) model, embedding size is 1024. [More info](https://huggingface.co/thenlper/gte-large) and [here](https://arxiv.org/abs/2308.03281).
    - 'avsolatorio/GIST-large-Embedding-v0': Fine-tuned on top of the BAAI/bge-large-en-v1.5 using the MEDI dataset augmented with mined triplets from the MTEB Classification training dataset, embedding size is 1024. [More info](https://huggingface.co/avsolatorio/GIST-large-Embedding-v0) and [here](https://arxiv.org/abs/2402.16829).

    The patents can be viewed at [Espacenet](https://worldwide.espacenet.com/?locale=en_EP), the free online service by the European Patent Office.

    Please note: The data used in this demo contains only a very limited subset of patent abstracts and is intended only for demonstration purposes. It by no means covers all patents or their complete data.
    """)
    model_selector.change(find_similar_texts, inputs=[model_selector, text_input], outputs=output)
    text_input.submit(find_similar_texts, inputs=[model_selector, text_input], outputs=output)

demo.launch()