Spaces:

flax-sentence-embeddings
/

sentence-embeddings

Runtime error

App Files Files Community

Trent committed on Jul 19, 2021

Commit

883e41e

1 Parent(s): 75c3a89

Clustering function

Browse files

Files changed (3) hide show

app.py +5 -1
backend/inference.py +68 -1
requirements.txt +2 -0

app.py CHANGED Viewed

@@ -118,4 +118,8 @@ For more cool information on sentence embeddings, see the [sBert project](https:
     if st.button('Give me my search.'):
         results = {model: inference.text_search(anchor, n_texts, model, QA_MODELS_ID) for model in select_models}
-        st.table(pd.DataFrame(results[select_models[0]]).T)

     if st.button('Give me my search.'):
         results = {model: inference.text_search(anchor, n_texts, model, QA_MODELS_ID) for model in select_models}
+        st.table(pd.DataFrame(results[select_models[0]]).T)
+    if st.button('3D Clustering of search result (new window)'):
+        fig = inference.text_cluster(anchor, 1000, select_models[0], QA_MODELS_ID)
+        fig.show()

backend/inference.py CHANGED Viewed

@@ -1,5 +1,6 @@
 import gzip
 import json
 import pandas as pd
 import numpy as np
@@ -11,7 +12,7 @@ from typing import List, Union
 import torch
 from backend.utils import load_model, filter_questions, load_embeddings
 def cos_sim(a, b):
     """Return the product of ``a`` and ``b`` transposed, scaled by their norms.

     NOTE(review): both norms are computed without an ``axis`` argument, so
     each one is taken over the whole input array -- presumably ``a`` and
     ``b`` are meant to be single vectors; confirm against the callers.
     """
     dot_product = jnp.matmul(a, jnp.transpose(b))
     norm_product = jnp.linalg.norm(a) * jnp.linalg.norm(b)
     return dot_product / norm_product
@@ -71,3 +72,69 @@ def text_search(anchor: str, n_answers: int, model_name: str, model_dict: dict):
         urls.append(f"https://stackoverflow.com/q/{post['id']}")
     return hits_titles, hits_scores, urls

 import gzip
 import json
+from collections import Counter
 import pandas as pd
 import numpy as np
 import torch
 from backend.utils import load_model, filter_questions, load_embeddings
+from MulticoreTSNE import MulticoreTSNE as TSNE
 def cos_sim(a, b):
     """Return the product of ``a`` and ``b`` transposed, scaled by their norms.

     NOTE(review): both norms are computed without an ``axis`` argument, so
     each one is taken over the whole input array -- presumably ``a`` and
     ``b`` are meant to be single vectors; confirm against the callers.
     """
     numerator = jnp.matmul(a, jnp.transpose(b))
     denominator = jnp.linalg.norm(a) * jnp.linalg.norm(b)
     return numerator / denominator
         urls.append(f"https://stackoverflow.com/q/{post['id']}")
     return hits_titles, hits_scores, urls
+def text_cluster(anchor: str, n_answers: int, model_name: str, model_dict: dict):
+    # Proceeding with model
+    print(model_name)
+    assert model_name == "mpnet_qa"
+    model = load_model(model_name, model_dict)
+    # Creating embeddings
+    query_emb = model.encode(anchor, convert_to_tensor=True)[None, :]
+    print("loading embeddings")
+    corpus_emb = load_embeddings()
+    # Getting hits
+    hits = util.semantic_search(query_emb, corpus_emb, score_function=util.dot_score, top_k=n_answers)[0]
+    filtered_posts = filter_questions("python")
+    hits_dict = [filtered_posts[hit['corpus_id']] for hit in hits]
+    hits_dict.append(dict(id = '1', title = anchor, tags = ['']))
+    hits_emb = torch.stack([corpus_emb[hit['corpus_id']] for hit in hits])
+    hits_emb = torch.cat((hits_emb, query_emb))
+    # Dimensionality reduction with t-SNE
+    tsne = TSNE(n_components=3, verbose=1, perplexity=15, n_iter=1000)
+    tsne_results = tsne.fit_transform(hits_emb.cpu())
+    df = pd.DataFrame(hits_dict)
+    tags = list(df['tags'])
+    counter = Counter(tags[0])
+    for i in tags[1:]:
+        counter.update(i)
+    df_tags = pd.DataFrame(counter.most_common(), columns=['Tag', 'Mentions'])
+    most_common_tags = list(df_tags['Tag'])[1:5]
+    labels = []
+    for tags_list in list(df['tags']):
+        for common_tag in most_common_tags:
+            if common_tag in tags_list:
+                labels.append(common_tag)
+                break
+            elif common_tag != most_common_tags[-1]:
+                continue
+            else:
+                labels.append('others')
+    df['title'] = [post['title'] for post in hits_dict]
+    df['labels'] = labels
+    df['tsne_x'] = tsne_results[:, 0]
+    df['tsne_y'] = tsne_results[:, 1]
+    df['tsne_z'] = tsne_results[:, 2]
+    df['size'] = [2 for i in range(len(df))]
+    # Making the query bigger than the rest of the observations
+    df['size'][len(df) - 1] = 10
+    df['labels'][len(df) - 1] = 'QUERY'
+    import plotly.express as px
+    fig = px.scatter_3d(df, x='tsne_x', y='tsne_y', z='tsne_z', color='labels', size='size',
+                        color_discrete_sequence=px.colors.qualitative.D3, hover_data=[df.title])
+    return fig

requirements.txt CHANGED Viewed

@@ -5,3 +5,5 @@ jaxlib
 streamlit
 numpy
 torch

 streamlit
 numpy
 torch
+MulticoreTSNE
+plotly