Spaces:

flax-sentence-embeddings
/

sentence-embeddings

Runtime error

Trent committed on Jul 19, 2021

Commit

75c3a89

1 Parent(s): fa5d8a4

Search function

Files changed (8) hide show

.gitattributes CHANGED Viewed

@@ -14,3 +14,5 @@
 *.pb filter=lfs diff=lfs merge=lfs -text
 *.pt filter=lfs diff=lfs merge=lfs -text
 *.pth filter=lfs diff=lfs merge=lfs -text

 *.pb filter=lfs diff=lfs merge=lfs -text
 *.pt filter=lfs diff=lfs merge=lfs -text
 *.pth filter=lfs diff=lfs merge=lfs -text
+*.jsonl.gz filter=lfs diff=lfs merge=lfs -text
+*.csv filter=lfs diff=lfs merge=lfs -text

app.py CHANGED Viewed

@@ -2,12 +2,12 @@ import streamlit as st
 import pandas as pd
 from backend import inference
-from backend.config import MODELS_ID, QA_MODELS_ID
 st.title('Demo using Flax-Sentence-Tranformers')
 st.sidebar.title('Tasks')
-menu = st.sidebar.radio("", options=["Sentence Similarity", "Asymmetric QA", "Search", "Clustering"], index=0)
 st.markdown('''
@@ -71,7 +71,7 @@ For more cool information on sentence embeddings, see the [sBert project](https:
     n_texts = st.number_input(
         f'''How many answers you want to compare with: '{anchor}'?''',
-        value=3,
         min_value=2)
     inputs = []
@@ -97,7 +97,25 @@ For more cool information on sentence embeddings, see the [sBert project](https:
         st.line_chart(df_total)
 elif menu == "Search":
-    select_models = st.multiselect("Choose models", options=list(MODELS_ID), default=list(MODELS_ID)[0])
-elif menu == "Clustering":
-    select_models = st.multiselect("Choose models", options=list(MODELS_ID), default=list(MODELS_ID)[0])

 import pandas as pd
 from backend import inference
+from backend.config import MODELS_ID, QA_MODELS_ID, SEARCH_MODELS_ID
 st.title('Demo using Flax-Sentence-Tranformers')
 st.sidebar.title('Tasks')
+menu = st.sidebar.radio("", options=["Sentence Similarity", "Asymmetric QA", "Search"], index=0)
 st.markdown('''
     n_texts = st.number_input(
         f'''How many answers you want to compare with: '{anchor}'?''',
+        value=10,
         min_value=2)
     inputs = []
         st.line_chart(df_total)
 elif menu == "Search":
+    st.header('SEARCH')
+    st.markdown('''
+**Instructions**: Make a query for anything related to "Python" and the model you choose will return you similar queries.
+For more cool information on sentence embeddings, see the [sBert project](https://www.sbert.net/examples/applications/computing-embeddings/README.html).
+''')
+    select_models = st.multiselect("Choose models", options=list(SEARCH_MODELS_ID), default=list(SEARCH_MODELS_ID)[0])
+    anchor = st.text_input(
+        'Please enter here your query about "Python", we will look for similar ones:',
+        value="How do I sort a dataframe by column"
+    )
+    n_texts = st.number_input(
+        f'''How many similar queries you want?''',
+        value=3,
+        min_value=2)
+    if st.button('Give me my search.'):
+        results = {model: inference.text_search(anchor, n_texts, model, QA_MODELS_ID) for model in select_models}
+        st.table(pd.DataFrame(results[select_models[0]]).T)

backend/config.py CHANGED Viewed

@@ -7,4 +7,8 @@ QA_MODELS_ID = dict(
                            'flax-sentence-embeddings/multi-QA_v1-mpnet-asymmetric-A'],
     mpnet_qa='flax-sentence-embeddings/mpnet_stackexchange_v1',
     distilbert_qa = 'flax-sentence-embeddings/multi-qa_v1-distilbert-cls_dot'
 )

                            'flax-sentence-embeddings/multi-QA_v1-mpnet-asymmetric-A'],
     mpnet_qa='flax-sentence-embeddings/mpnet_stackexchange_v1',
     distilbert_qa = 'flax-sentence-embeddings/multi-qa_v1-distilbert-cls_dot'
+)
+SEARCH_MODELS_ID = dict(
+    mpnet_qa='flax-sentence-embeddings/mpnet_stackexchange_v1'
 )

backend/inference.py CHANGED Viewed

@@ -1,11 +1,16 @@
 import pandas as pd
 import jax.numpy as jnp
 from typing import List, Union
-# Defining cosine similarity using flax.
-from backend.config import MODELS_ID
-from backend.utils import load_model
 def cos_sim(a, b):
@@ -35,3 +40,34 @@ def text_similarity(anchor: str, inputs: List[str], model_name: str, model_dict:
     df = pd.DataFrame(d, columns=['inputs', 'score'])
     return df

+import gzip
+import json
 import pandas as pd
+import numpy as np
 import jax.numpy as jnp
+import tqdm
+from sentence_transformers import util
 from typing import List, Union
+import torch
+from backend.utils import load_model, filter_questions, load_embeddings
 def cos_sim(a, b):
     df = pd.DataFrame(d, columns=['inputs', 'score'])
     return df
+# Search
+def text_search(anchor: str, n_answers: int, model_name: str, model_dict: dict):
+    """Return the posts whose stored title embeddings score highest for ``anchor``.
+
+    The query is encoded with the model selected by ``model_name`` and scored
+    against the tensor returned by ``load_embeddings()`` using
+    ``util.dot_score``. Each hit's ``corpus_id`` is then used as an index into
+    ``filter_questions("python")`` to recover the post's title and id.
+
+    Args:
+        anchor: Query text to search for.
+        n_answers: Number of hits to request (forwarded as ``top_k``).
+        model_name: Key of the model to load; only ``"mpnet_qa"`` is accepted.
+        model_dict: Mapping passed to ``load_model`` together with ``model_name``.
+
+    Returns:
+        A tuple ``(titles, scores, urls)`` of three parallel lists: the post
+        titles, the scores formatted with three decimals, and the
+        ``https://stackoverflow.com/q/<id>`` URL of each post.
+
+    Raises:
+        ValueError: If ``model_name`` is not ``"mpnet_qa"``.
+    """
+    # Proceeding with model
+    print(model_name)
+    # Explicit check instead of `assert`, which is stripped under `python -O`.
+    # Presumably the stored embeddings were produced with this model, so any
+    # other model would give meaningless scores -- TODO confirm.
+    if model_name != "mpnet_qa":
+        raise ValueError(
+            f"text_search only supports model 'mpnet_qa', got {model_name!r}"
+        )
+    model = load_model(model_name, model_dict)
+    # Creating embeddings; [None, :] adds a leading axis so the single query
+    # is passed to semantic_search as a batch of one.
+    query_emb = model.encode(anchor, convert_to_tensor=True)[None, :]
+    print("loading embeddings")
+    corpus_emb = load_embeddings()
+    # Getting hits; [0] selects the result list of the only query.
+    hits = util.semantic_search(query_emb, corpus_emb, score_function=util.dot_score, top_k=n_answers)[0]
+    filtered_posts = filter_questions("python")
+    print(f"{len(filtered_posts)} posts found with tag: python")
+    hits_titles = []
+    hits_scores = []
+    urls = []
+    for hit in hits:
+        # NOTE(review): assumes row i of the embeddings corresponds to
+        # filtered_posts[i]; an IndexError here means the two are out of
+        # sync -- confirm how the CSV was generated.
+        post = filtered_posts[hit['corpus_id']]
+        hits_titles.append(post['title'])
+        hits_scores.append(f"{hit['score']:.3f}")
+        urls.append(f"https://stackoverflow.com/q/{post['id']}")
+    return hits_titles, hits_scores, urls

backend/utils.py CHANGED Viewed

@@ -1,4 +1,10 @@
 import streamlit as st
 from sentence_transformers import SentenceTransformer
@@ -13,3 +19,28 @@ def load_model(model_name, model_dict):
         output = [SentenceTransformer(name) for name in model_ids]
     return output

+import gzip
+import json
+import numpy as np
 import streamlit as st
+import torch
+import tqdm
 from sentence_transformers import SentenceTransformer
         output = [SentenceTransformer(name) for name in model_ids]
     return output
+@st.cache(allow_output_mutation=True)
+def load_embeddings():
+    """Load the pre-generated corpus embeddings from disk as a torch tensor.
+
+    At most the first 10000 rows of
+    ``./data/stackoverflow-titles-mpnet-emb.csv`` are read with ``np.loadtxt``.
+    The function is wrapped in ``st.cache``.
+
+    Returns:
+        The loaded rows as a torch tensor converted with ``.float()``.
+    """
+    emb_path = './data/stackoverflow-titles-mpnet-emb.csv'
+    # embedding pre-generated
+    rows = np.loadtxt(emb_path, max_rows=10000)
+    tensor = torch.from_numpy(rows)
+    return tensor.float()
+@st.cache(allow_output_mutation=True)
+def filter_questions(tag, max_questions=10000):
+    """Collect posts from the gzipped JSONL dump whose ``"tags"`` contain ``tag``.
+
+    The file ``./data/stackoverflow-titles.jsonl.gz`` is read line by line and
+    each line is parsed as JSON. Posts are filtered while streaming, so only
+    the matching posts are kept in memory, and reading stops as soon as
+    ``max_questions`` matches have been collected or 6,000,000 lines have been
+    read, whichever comes first. The function is wrapped in ``st.cache``.
+
+    Args:
+        tag: Value tested with ``tag in post["tags"]``.
+            NOTE(review): if ``"tags"`` is a string rather than a list this is
+            a substring match -- confirm the dump's schema.
+        max_questions: Stop after this many matching posts have been collected.
+
+    Returns:
+        A list of the matching post dicts, in file order.
+    """
+    max_posts = 6_000_000  # upper bound on the number of lines read
+    filtered_posts = []
+    # Explicit encoding so decoding does not depend on the platform default.
+    with gzip.open("./data/stackoverflow-titles.jsonl.gz", "rt", encoding="utf-8") as fIn:
+        progress = tqdm.auto.tqdm(fIn, total=max_posts, desc="Load data")
+        for n_read, line in enumerate(progress, start=1):
+            post = json.loads(line)
+            if tag in post["tags"]:
+                filtered_posts.append(post)
+                if len(filtered_posts) >= max_questions:
+                    break
+            if n_read >= max_posts:
+                break
+    return filtered_posts

data/.DS_Store ADDED Viewed

Binary file (6.15 kB). View file

data/__init__.py ADDED Viewed

File without changes

requirements.txt CHANGED Viewed

@@ -3,3 +3,5 @@ pandas
 jax
 jaxlib
 streamlit

 jax
 jaxlib
 streamlit
+numpy
+torch