Spaces:
Sleeping
Sleeping
gabriel lopez
committed on
Commit
•
6afa13b
1
Parent(s):
b1709c2
remove shared link
Browse files- arxiv_tool/app.py +23 -12
- arxiv_tool/core.py +60 -53
- arxiv_tool/plot.py +30 -19
arxiv_tool/app.py
CHANGED
@@ -4,7 +4,12 @@ from plot import EmbeddingPlotter
|
|
4 |
|
5 |
TITLE = "Search tool for ArXiv papers"
|
6 |
DESCRIPTION = "<center>Find your most beloved ArXiv papers!</center>"
|
7 |
-
EXAMPLES=[
|
|
|
|
|
|
|
|
|
|
|
8 |
ARTICLE = r"<center>Done by dr. Gabriel Lopez<br> For more please visit: <a href='https://sites.google.com/view/dr-gabriel-lopez/home'>My Page</a></center>"
|
9 |
|
10 |
# interface function
|
@@ -14,21 +19,27 @@ def search_and_plot(querry):
|
|
14 |
df, result = SentenceEncoder().transform(df, querry, model, embeddings)
|
15 |
# plot
|
16 |
fig1, fig2 = EmbeddingPlotter().transform(df, embeddings)
|
17 |
-
return result[[
|
|
|
18 |
|
19 |
# gradio elements
|
20 |
-
in_textbox = gr.Textbox(
|
|
|
|
|
21 |
# in_examples = gr.Examples(examples=["BERT optimization", "Gradient descent", "Black hole information theory"], inputs=in_textbox)
|
22 |
out_dataframe = gr.DataFrame(label="Most similar papers on ArXiv:")
|
23 |
out_plot_sphere = gr.Plot(label="Embedding projection over a unit sphere")
|
24 |
-
out_plot_projected_sphere = gr.Plot(
|
|
|
|
|
25 |
|
26 |
# launch interface
|
27 |
-
gr.Interface(
|
28 |
-
|
29 |
-
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
-
|
|
|
|
4 |
|
5 |
TITLE = "Search tool for ArXiv papers"
|
6 |
DESCRIPTION = "<center>Find your most beloved ArXiv papers!</center>"
|
7 |
+
EXAMPLES = [
|
8 |
+
"RoBERTa optimisation",
|
9 |
+
"Permutation invariant AI models",
|
10 |
+
"Gradient descent",
|
11 |
+
"Black hole information theory",
|
12 |
+
]
|
13 |
ARTICLE = r"<center>Done by dr. Gabriel Lopez<br> For more please visit: <a href='https://sites.google.com/view/dr-gabriel-lopez/home'>My Page</a></center>"
|
14 |
|
15 |
# interface function
|
|
|
19 |
df, result = SentenceEncoder().transform(df, querry, model, embeddings)
|
20 |
# plot
|
21 |
fig1, fig2 = EmbeddingPlotter().transform(df, embeddings)
|
22 |
+
return result[["title", "similarity"]], fig1, fig2
|
23 |
+
|
24 |
|
25 |
# gradio elements
|
26 |
+
in_textbox = gr.Textbox(
|
27 |
+
label="Search on ArXiv:", placeholder="what do you want to learn today?...", lines=1
|
28 |
+
)
|
29 |
# in_examples = gr.Examples(examples=["BERT optimization", "Gradient descent", "Black hole information theory"], inputs=in_textbox)
|
30 |
out_dataframe = gr.DataFrame(label="Most similar papers on ArXiv:")
|
31 |
out_plot_sphere = gr.Plot(label="Embedding projection over a unit sphere")
|
32 |
+
out_plot_projected_sphere = gr.Plot(
|
33 |
+
label="Lambert-conformal projection over a plane", visible=False
|
34 |
+
)
|
35 |
|
36 |
# launch interface
|
37 |
+
gr.Interface(
|
38 |
+
inputs=in_textbox,
|
39 |
+
outputs=[out_dataframe, out_plot_sphere, out_plot_projected_sphere],
|
40 |
+
examples=EXAMPLES,
|
41 |
+
fn=search_and_plot,
|
42 |
+
title=TITLE,
|
43 |
+
description=DESCRIPTION,
|
44 |
+
article=ARTICLE,
|
45 |
+
).launch()
|
arxiv_tool/core.py
CHANGED
@@ -2,73 +2,80 @@ import pandas as pd
|
|
2 |
import numpy as np
|
3 |
import nmslib
|
4 |
from sentence_transformers import SentenceTransformer
|
|
|
5 |
# TODO: Use pipe, remove embeddings
|
6 |
|
7 |
|
8 |
class SentenceEncoder:
|
9 |
-
"""
|
10 |
|
11 |
def load_and_encode(self):
|
12 |
-
|
13 |
-
|
14 |
-
|
15 |
-
|
16 |
-
|
17 |
|
18 |
def transform(self, df, querry, model, embeddings):
|
19 |
-
|
20 |
-
|
21 |
-
|
22 |
-
|
23 |
-
|
24 |
-
|
25 |
-
|
26 |
|
27 |
def _load(self):
|
28 |
-
|
29 |
-
|
30 |
-
|
31 |
|
32 |
-
def _encode_papers(self,df):
|
33 |
-
|
34 |
-
|
35 |
-
|
36 |
-
|
37 |
-
|
38 |
-
|
39 |
-
|
40 |
|
41 |
-
def _econde_querry(self,querry, model):
|
42 |
-
|
43 |
-
|
44 |
-
|
45 |
|
46 |
def _make_search(self, df, emb_querry, embeddings):
|
47 |
-
|
48 |
-
|
49 |
-
|
50 |
-
|
51 |
-
|
52 |
-
|
53 |
-
|
54 |
|
55 |
-
def _extract_search_result(self,index, emb_querry, df, k):
|
56 |
-
|
57 |
-
|
58 |
-
|
59 |
-
|
60 |
-
|
61 |
-
|
62 |
-
|
63 |
-
|
|
|
|
|
|
|
|
|
64 |
|
65 |
def _add_relevant_columns(self, df, result):
|
66 |
-
|
67 |
-
|
68 |
-
|
69 |
-
|
70 |
-
|
71 |
-
|
72 |
-
|
73 |
-
|
74 |
-
|
|
|
|
|
|
2 |
import numpy as np
|
3 |
import nmslib
|
4 |
from sentence_transformers import SentenceTransformer
|
5 |
+
|
6 |
# TODO: Use pipe, remove embeddings
|
7 |
|
8 |
|
9 |
class SentenceEncoder:
    """Encode the query and the papers data set and find the closest papers.

    Closeness is measured on sentence embeddings with an nmslib HNSW index in
    the ``cosinesimil`` space; the search returns the papers with the HIGHEST
    cosine similarity (i.e. the lowest cosine distance) to the query.
    """

    def load_and_encode(self):
        """Load the papers table and embed every paper title.

        Returns:
            tuple: ``(df, model, embeddings)`` -- the papers DataFrame (with an
            added ``embeddings`` column), the sentence-transformer model, and
            the title embeddings as returned by ``model.encode``.
        """
        # load
        df = self._load()
        # encode
        df, model, embeddings = self._encode_papers(df)
        return df, model, embeddings

    def transform(self, df, querry, model, embeddings):
        """Search the papers most similar to ``querry``.

        Args:
            df: Papers DataFrame; read for ``title``, ``abstract`` and
                ``categories`` and mutated in place with plotting columns.
            querry: Free-text search string.
            model: Object exposing ``encode`` (the model returned by
                ``load_and_encode``).
            embeddings: Title embeddings aligned with the rows of ``df``.

        Returns:
            tuple: ``(df, result)`` -- the augmented papers DataFrame and a
            DataFrame with one row per hit.
        """
        # encode the query
        emb_querry = self._econde_querry(querry, model)
        # search
        result = self._make_search(df, emb_querry, embeddings)
        # add_relevant_columns
        df = self._add_relevant_columns(df, result)
        return df, result

    def _load(self):
        """Read the papers table from ``data/arxiv.csv``."""
        return pd.read_csv("data/arxiv.csv")

    def _encode_papers(self, df):
        """Embed the ``title`` column and store the vectors on ``df``.

        Returns:
            tuple: ``(df, model, embeddings)``; ``df`` gains an ``embeddings``
            column holding one plain Python list per row.
        """
        checkpoint = "distilbert-base-uncased"
        model = SentenceTransformer(checkpoint)
        # Pass a plain list: ``encode`` is documented for str / list of str,
        # and a list is independent of the DataFrame's index labels.
        embeddings = model.encode(df["title"].tolist(), convert_to_tensor=True)
        # ``Tensor.tolist()`` works whatever device the tensor lives on,
        # unlike ``np.array(tensor)`` which requires a CPU tensor.
        df["embeddings"] = embeddings.tolist()
        return df, model, embeddings

    def _econde_querry(self, querry, model):
        """Embed the query string; it is encoded as a one-element batch."""
        emb_querry = model.encode([querry])
        return emb_querry

    def _make_search(self, df, emb_querry, embeddings):
        """Build an HNSW index over ``embeddings`` and return the 10 best hits.

        NOTE(review): the index is rebuilt on every call; if searches become
        slow, consider building it once and reusing it.
        """
        # initialize a new index, using a HNSW index on Cosine Similarity
        index = nmslib.init(method="hnsw", space="cosinesimil")
        index.addDataPointBatch(embeddings)
        index.createIndex({"post": 2}, print_progress=True)
        # search
        result = self._extract_search_result(index, emb_querry, df, k=10)
        return result

    def _extract_search_result(self, index, emb_querry, df, k):
        """Query ``index`` for the ``k`` nearest papers and tabulate them.

        Returns:
            pandas.DataFrame: columns ``index``, ``title``, ``abstract`` and
            ``similarity`` (``1.0 - distance``), one row per hit. The columns
            are present even when there are no hits, so callers can always
            select them.
        """
        columns = ["index", "title", "abstract", "similarity"]
        neighbour_ids, distances = index.knnQuery(emb_querry, k=k)
        rows = [
            {
                "index": paper_id,
                "title": df.title[paper_id],
                "abstract": df.abstract[paper_id],
                # the index reports a distance; turn it into a similarity
                "similarity": 1.0 - distance,
            }
            for paper_id, distance in zip(neighbour_ids, distances)
        ]
        return pd.DataFrame(rows, columns=columns)

    def _add_relevant_columns(self, df, result):
        """Add the columns used for plotting to ``df`` (mutated in place).

        Adds:
            ``categories_parsed``: the first whitespace-separated category,
                cut at its first ``"."`` (e.g. ``"cs.LG stat.ML"`` -> ``"cs"``).
            ``index_papers``: a copy of the DataFrame index.
            ``selected``: True for rows whose index appears in
                ``result["index"]``.

        Returns:
            pandas.DataFrame: the same ``df`` object.
        """
        # get categories; the vectorised ``.str`` accessors propagate missing
        # values instead of raising on them
        first_category = df.categories.str.split().str[0]
        df["categories_parsed"] = first_category.str.split(".").str[0]
        # create columns for plotting
        df["index_papers"] = df.index
        df["selected"] = df["index_papers"].isin(result["index"])
        return df
|
arxiv_tool/plot.py
CHANGED
@@ -4,37 +4,48 @@ import plotly.express as px
|
|
4 |
from pandas import DataFrame
|
5 |
import numpy as np
|
6 |
|
|
|
7 |
class EmbeddingPlotter:
|
8 |
-
"""
|
9 |
|
10 |
def transform(self, df, embeddings):
|
11 |
df = self.umap_embedding(df, embeddings)
|
12 |
fig1, fig2 = self.plot(df)
|
13 |
return fig1, fig2
|
14 |
|
15 |
-
|
16 |
def umap_embedding(self, df, embeddings):
|
17 |
# UMAP - Spherical
|
18 |
-
sphere_mapper = umap.UMAP(output_metric=
|
19 |
-
|
20 |
-
|
21 |
-
df[
|
|
|
|
|
|
|
|
|
|
|
|
|
22 |
# UMAP - Lambert Conformal
|
23 |
-
df[
|
24 |
-
|
|
|
|
|
25 |
return df
|
26 |
|
27 |
def plot(self, df):
|
28 |
# on the 3d sphere
|
29 |
-
fig1 = px.scatter_3d(
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
|
|
|
|
|
34 |
# on the projected spehre
|
35 |
-
fig2 = px.scatter(
|
36 |
-
|
37 |
-
|
38 |
-
|
39 |
-
|
40 |
-
|
|
|
|
4 |
from pandas import DataFrame
|
5 |
import numpy as np
|
6 |
|
7 |
+
|
8 |
class EmbeddingPlotter:
    """Lower the dimensionality of the representation from 768 -> 2, over the surface of the sphere.

    UMAP is fitted with a haversine output metric, so each paper gets two
    angles. These are converted to 3-D unit-sphere coordinates and to a 2-D
    planar mapping, both stored as new columns on the papers DataFrame.
    """

    def transform(self, df, embeddings):
        """Embed the papers on the sphere and build both figures.

        Args:
            df: Papers DataFrame; must already contain ``categories_parsed``
                (used to colour the points). Mutated in place.
            embeddings: Array-like of paper embeddings, one row per row of
                ``df``.

        Returns:
            tuple: ``(fig1, fig2)`` -- the 3-D sphere scatter and the 2-D
            projected scatter.
        """
        df = self.umap_embedding(df, embeddings)
        fig1, fig2 = self.plot(df)
        return fig1, fig2

    @staticmethod
    def _sphere_to_cartesian(theta, phi):
        """Convert the two sphere angles to unit-sphere ``(x, y, z)``."""
        sin_theta = np.sin(theta)
        return sin_theta * np.cos(phi), sin_theta * np.sin(phi), np.cos(theta)

    @staticmethod
    def _project_to_plane(x, y, z):
        """Map unit-sphere points to the plane as ``(arctan2(x, y), -arccos(z))``."""
        return np.arctan2(x, y), -np.arccos(z)

    def umap_embedding(self, df, embeddings):
        """Fit UMAP on ``embeddings`` and add the coordinate columns to ``df``.

        Adds ``spherical_emb_X/Y/Z`` and ``lambert_conformal_emb_x/y``.

        Returns:
            pandas.DataFrame: the same ``df`` object, with the new columns.
        """
        # UMAP - Spherical (fixed random_state keeps the layout reproducible)
        sphere_mapper = umap.UMAP(output_metric="haversine", random_state=42).fit(
            np.array(embeddings)
        )
        theta = sphere_mapper.embedding_[:, 0]
        phi = sphere_mapper.embedding_[:, 1]
        sphere_x, sphere_y, sphere_z = self._sphere_to_cartesian(theta, phi)
        df["spherical_emb_X"] = sphere_x
        df["spherical_emb_Y"] = sphere_y
        df["spherical_emb_Z"] = sphere_z
        # UMAP - Lambert Conformal
        # NOTE(review): the columns are named "lambert_conformal", but the
        # formula is a plain angle mapping (arctan2 / negative arccos) --
        # confirm the naming is intended.
        plane_x, plane_y = self._project_to_plane(sphere_x, sphere_y, sphere_z)
        df["lambert_conformal_emb_x"] = plane_x
        df["lambert_conformal_emb_y"] = plane_y
        return df

    def plot(self, df):
        """Build the two scatter figures, coloured by ``categories_parsed``.

        Returns:
            tuple: ``(fig1, fig2)`` -- 3-D scatter on the sphere, 2-D scatter
            on the projected plane.
        """
        # on the 3d sphere
        fig1 = px.scatter_3d(
            df,
            x="spherical_emb_X",
            y="spherical_emb_Y",
            z="spherical_emb_Z",
            color="categories_parsed",
        )
        # on the projected sphere
        fig2 = px.scatter(
            data_frame=df,
            x="lambert_conformal_emb_x",
            y="lambert_conformal_emb_y",
            color="categories_parsed",
        )
        return fig1, fig2
|