Spaces:
Sleeping
Sleeping
import gradio as gr | |
import numpy as np | |
import pandas as pd | |
from sentence_transformers import SentenceTransformer | |
from sklearn.metrics.pairwise import euclidean_distances | |
# Load DataFrame
# find_similar_texts reads its 'abstract' and 'patent no' columns plus one
# 'embedding_<model name>' column per entry of model_options.
text_embeddings = pd.read_parquet('text_embeddings_abstract_all.parquet')

# Initialize models
# Model selection drop-down list: the keys double as the choices shown in the
# UI and as the suffix of the matching embedding column, so they must stay
# exactly as spelled here. Dict order is the load order and the display order.
model_options = {
    model_id: SentenceTransformer(model_id)
    for model_id in (
        'all-MiniLM-L6-v2',
        'intfloat/e5-large-v2',
        'intfloat/e5-small-v2',
        'thenlper/gte-large',
        'avsolatorio/GIST-large-Embedding-v0',
    )
}

# Individual handles onto the loaded models (model_all_Mini is also used
# directly by find_similar_texts for its relevance pre-check).
model_all_Mini = model_options['all-MiniLM-L6-v2']
model_e5_large_v2 = model_options['intfloat/e5-large-v2']
model_e5_small_v2 = model_options['intfloat/e5-small-v2']
model_gte_large = model_options['thenlper/gte-large']
model_GIST_large = model_options['avsolatorio/GIST-large-Embedding-v0']
# Main function for the Gradio interface
def find_similar_texts(model_name, input_text):
    """Return the five patent abstracts closest to ``input_text``.

    The query is first embedded with the all-MiniLM-L6-v2 model and compared
    against the 'embedding_all-MiniLM-L6-v2' column as a relevance gate: the
    search only proceeds if at least one abstract lies within a Euclidean
    distance of 1.05. The ranking itself then uses the model selected by the
    user and the matching 'embedding_<model_name>' column.

    Args:
        model_name: Key of ``model_options`` chosen in the drop-down; a falsy
            value means nothing was selected.
        input_text: Free-text description to search for.

    Returns:
        A string for the output textbox: either the formatted top-five
        results ("Patent No: ...\\nAbstract: ...\\n" blocks separated by blank
        lines) or a short message explaining why no search result is shown.

    Raises:
        KeyError: If ``model_name`` is not a key of ``model_options`` or the
            DataFrame has no 'embedding_<model_name>' column.
    """
    # Largest all-MiniLM distance at which an abstract still counts as related.
    max_prefilter_distance = 1.05
    top_k = 5
    prefilter_column = 'embedding_all-MiniLM-L6-v2'

    # Check whether model has been selected
    if not model_name:
        return "You forgot to choose a sentence-transformer."

    # The drop-down's change event also triggers a search, possibly before
    # anything was typed; skip the encoder calls when there is nothing to embed.
    if not input_text or not input_text.strip():
        return "Please enter a description to search for."

    # Check whether there are abstracts matching the text input
    input_embedding_mini = model_all_Mini.encode(input_text).reshape(1, -1)
    embedding_matrix_mini = np.vstack(text_embeddings[prefilter_column])
    distances_mini = euclidean_distances(embedding_matrix_mini, input_embedding_mini).flatten()

    # Only continue if similar abstract found
    if not (distances_mini < max_prefilter_distance).any():
        return "It seems there is no patent abstract close to your description."

    selected_model = model_options[model_name]
    embedding_column = 'embedding_' + model_name
    if selected_model is model_all_Mini and embedding_column == prefilter_column:
        # Same model and same column as the gate: reuse its distances instead
        # of encoding and scoring the query a second time.
        distances = distances_mini
    else:
        input_embedding = selected_model.encode(input_text).reshape(1, -1)
        embedding_matrix = np.vstack(text_embeddings[embedding_column])
        distances = euclidean_distances(embedding_matrix, input_embedding).flatten()

    # Rank through a local index array rather than writing a distance column
    # into the shared module-level DataFrame, so calls never interfere with
    # each other through shared state. The stable sort keeps tied rows in
    # their original DataFrame order.
    nearest = np.argsort(distances, kind='stable')[:top_k]
    top_rows = text_embeddings.iloc[nearest]

    return '\n\n'.join(
        f"Patent No: {patent_no}\nAbstract: {abstract}\n"
        for patent_no, abstract in zip(top_rows['patent no'], top_rows['abstract'])
    )
# Create Gradio interface using Blocks
# NOTE(review): the nesting below was reconstructed from statement order only;
# confirm that the description belongs below the two-column row rather than
# inside the right-hand column.

# Description shown under the search form. Blank lines separate the intro,
# the bullet list and the closing paragraphs so the paragraphs are not
# rendered as a continuation of the last bullet.
_DESCRIPTION_MD = """
### Description

This demo app leverages several Sentence Transformer models to compute the semantic distance between user input and a small number of patent abstracts in the field of machine learning and AI.

- 'all-MiniLM-L6-v2': embedding size is 384. [More info](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2) and [here](https://sbert.net/).
- 'intfloat/e5-large-v2': Text Embeddings by Weakly-Supervised Contrastive Pre-training, embedding size is 1024. [More info](https://huggingface.co/intfloat/e5-large-v2).
- 'intfloat/e5-small-v2': Text Embeddings by Weakly-Supervised Contrastive Pre-training, embedding size is 384. [More info](https://huggingface.co/intfloat/e5-small-v2).
- 'thenlper/gte-large': General Text Embeddings (GTE) model, embedding size is 1024. [More info](https://huggingface.co/thenlper/gte-large) and [here](https://arxiv.org/abs/2308.03281).
- 'avsolatorio/GIST-large-Embedding-v0': Fine-tuned on top of the BAAI/bge-large-en-v1.5 using the MEDI dataset augmented with mined triplets from the MTEB Classification training dataset, embedding size is 1024. [More info](https://huggingface.co/avsolatorio/GIST-large-Embedding-v0) and [here](https://arxiv.org/abs/2402.16829).

The patents can be viewed at [Espacenet](https://worldwide.espacenet.com/?locale=en_EP), the free online service by the European Patent Office.

Please note: The data used in this demo contains only a very limited subset of patent abstracts and is intended only for demonstration purposes. It does by far not cover all patents or their complete data.
"""

with gr.Blocks() as demo:
    gr.Markdown("## Sentence-Transformer based Patent-Abstract Search")

    with gr.Row():
        # Left column: search form.
        with gr.Column():
            model_selector = gr.Dropdown(
                choices=list(model_options),
                label="Choose Sentence-Transformer",
            )
            text_input = gr.Textbox(
                lines=2,
                placeholder="machine learning for drug dosing",
                label="input_text (like <<machine learning for drug dosing>>. Remember, this is only a small selection of machine learning patents!)",
            )
            submit_button = gr.Button("search")
        # Right column: results.
        with gr.Column():
            output = gr.Textbox(label="top 5 patent abstracts (if available)")

    gr.Markdown(_DESCRIPTION_MD)

    # The same search runs when the button is clicked, when another model is
    # picked, and when Enter is pressed in the text box.
    for register_event in (submit_button.click, model_selector.change, text_input.submit):
        register_event(find_similar_texts, inputs=[model_selector, text_input], outputs=output)

demo.launch()