gabriel lopez committed on
Commit
6afa13b
1 Parent(s): b1709c2

remove shared link

Browse files
Files changed (3) hide show
  1. arxiv_tool/app.py +23 -12
  2. arxiv_tool/core.py +60 -53
  3. arxiv_tool/plot.py +30 -19
arxiv_tool/app.py CHANGED
@@ -4,7 +4,12 @@ from plot import EmbeddingPlotter
4
 
5
  TITLE = "Search tool for ArXiv papers"
6
  DESCRIPTION = "<center>Find your most beloved ArXiv papers!</center>"
7
- EXAMPLES=["RoBERTa optimisation", "Permutation invariant AI models", "Gradient descent", "Black hole information theory"]
 
 
 
 
 
8
  ARTICLE = r"<center>Done by dr. Gabriel Lopez<br> For more please visit: <a href='https://sites.google.com/view/dr-gabriel-lopez/home'>My Page</a></center>"
9
 
10
  # interface function
@@ -14,21 +19,27 @@ def search_and_plot(querry):
14
  df, result = SentenceEncoder().transform(df, querry, model, embeddings)
15
  # plot
16
  fig1, fig2 = EmbeddingPlotter().transform(df, embeddings)
17
- return result[['title', 'similarity']], fig1, fig2
 
18
 
19
  # gradio elements
20
- in_textbox = gr.Textbox(label="Search on ArXiv:", placeholder="what do you want to learn today?...", lines=1)
 
 
21
  # in_examples = gr.Examples(examples=["BERT optimization", "Gradient descent", "Black hole information theory"], inputs=in_textbox)
22
  out_dataframe = gr.DataFrame(label="Most similar papers on ArXiv:")
23
  out_plot_sphere = gr.Plot(label="Embedding projection over a unit sphere")
24
- out_plot_projected_sphere = gr.Plot(label="Lambert-conformal projection over a plane", visible=False)
 
 
25
 
26
  # launch interface
27
- gr.Interface(inputs=in_textbox,
28
- outputs=[out_dataframe,out_plot_sphere,out_plot_projected_sphere],
29
- examples=EXAMPLES,
30
- fn=search_and_plot,
31
- title=TITLE,
32
- description=DESCRIPTION,
33
- article=ARTICLE,
34
- ).launch(share=True)
 
 
4
 
5
  TITLE = "Search tool for ArXiv papers"
6
  DESCRIPTION = "<center>Find your most beloved ArXiv papers!</center>"
7
+ EXAMPLES = [
8
+ "RoBERTa optimisation",
9
+ "Permutation invariant AI models",
10
+ "Gradient descent",
11
+ "Black hole information theory",
12
+ ]
13
  ARTICLE = r"<center>Done by dr. Gabriel Lopez<br> For more please visit: <a href='https://sites.google.com/view/dr-gabriel-lopez/home'>My Page</a></center>"
14
 
15
  # interface function
 
19
  df, result = SentenceEncoder().transform(df, querry, model, embeddings)
20
  # plot
21
  fig1, fig2 = EmbeddingPlotter().transform(df, embeddings)
22
+ return result[["title", "similarity"]], fig1, fig2
23
+
24
 
25
  # gradio elements
26
+ in_textbox = gr.Textbox(
27
+ label="Search on ArXiv:", placeholder="what do you want to learn today?...", lines=1
28
+ )
29
  # in_examples = gr.Examples(examples=["BERT optimization", "Gradient descent", "Black hole information theory"], inputs=in_textbox)
30
  out_dataframe = gr.DataFrame(label="Most similar papers on ArXiv:")
31
  out_plot_sphere = gr.Plot(label="Embedding projection over a unit sphere")
32
+ out_plot_projected_sphere = gr.Plot(
33
+ label="Lambert-conformal projection over a plane", visible=False
34
+ )
35
 
36
  # launch interface
37
+ gr.Interface(
38
+ inputs=in_textbox,
39
+ outputs=[out_dataframe, out_plot_sphere, out_plot_projected_sphere],
40
+ examples=EXAMPLES,
41
+ fn=search_and_plot,
42
+ title=TITLE,
43
+ description=DESCRIPTION,
44
+ article=ARTICLE,
45
+ ).launch()
arxiv_tool/core.py CHANGED
@@ -2,73 +2,80 @@ import pandas as pd
2
  import numpy as np
3
  import nmslib
4
  from sentence_transformers import SentenceTransformer
 
5
  # TODO: Use pipe, remove embeddings
6
 
7
 
8
  class SentenceEncoder:
9
- """ Encodes the querry and papers data set and finds elements with the lowest cosine similarity """
10
 
11
  def load_and_encode(self):
12
- # load
13
- df = self._load()
14
- # encode
15
- df, model, embeddings = self._encode_papers(df)
16
- return df, model, embeddings
17
 
18
  def transform(self, df, querry, model, embeddings):
19
- # create_index
20
- emb_querry = self._econde_querry(querry, model)
21
- # search
22
- result = self._make_search(df,emb_querry, embeddings)
23
- # add_relevant_columns
24
- df = self._add_relevant_columns(df, result)
25
- return df, result
26
 
27
  def _load(self):
28
- # Load data
29
- df = pd.read_csv("data/arxiv.csv")
30
- return df
31
 
32
- def _encode_papers(self,df):
33
- # Encode the papers title
34
- checkpoint = 'distilbert-base-uncased'
35
- model = SentenceTransformer(checkpoint)
36
- embeddings = model.encode(df['title'], convert_to_tensor=True)
37
- # embeddings column
38
- df['embeddings'] = np.array(embeddings).tolist()
39
- return df, model, embeddings
40
 
41
- def _econde_querry(self,querry, model):
42
- # Encode the querry
43
- emb_querry = model.encode([querry])
44
- return emb_querry
45
 
46
  def _make_search(self, df, emb_querry, embeddings):
47
- # initialize a new index, using a HNSW index on Cosine Similarity
48
- index = nmslib.init(method='hnsw', space='cosinesimil')
49
- index.addDataPointBatch(embeddings)
50
- index.createIndex({'post': 2}, print_progress=True)
51
- # search
52
- result = self._extract_search_result(index, emb_querry, df, k=10)
53
- return result
54
 
55
- def _extract_search_result(self,index, emb_querry, df, k):
56
- data = []
57
- idx, distances = index.knnQuery(emb_querry, k=k)
58
- for i, j in zip(idx, distances):
59
- data.append({'index': i,
60
- 'title': df.title[i],
61
- 'abstract': df.abstract[i],
62
- 'similarity': 1.0 - j})
63
- return pd.DataFrame(data)
 
 
 
 
64
 
65
  def _add_relevant_columns(self, df, result):
66
- # get categories
67
- df['categories_parsed'] = df.categories.str.split().apply(lambda x: x[0]).str.split('.').apply(lambda x: x[0])
68
- # create columns for plotting
69
- df['index_papers'] = df.index
70
- df['selected'] = df.index_papers.apply(lambda x: x in list(result['index']) )
71
- return df
72
-
73
-
74
-
 
 
 
2
  import numpy as np
3
  import nmslib
4
  from sentence_transformers import SentenceTransformer
5
+
6
  # TODO: Use pipe, remove embeddings
7
 
8
 
9
  class SentenceEncoder:
10
+ """Encodes the query and the paper titles, then finds the papers with the highest cosine similarity to the query"""
11
 
12
  def load_and_encode(self):
13
+ # load
14
+ df = self._load()
15
+ # encode
16
+ df, model, embeddings = self._encode_papers(df)
17
+ return df, model, embeddings
18
 
19
  def transform(self, df, querry, model, embeddings):
20
+ # create_index
21
+ emb_querry = self._econde_querry(querry, model)
22
+ # search
23
+ result = self._make_search(df, emb_querry, embeddings)
24
+ # add_relevant_columns
25
+ df = self._add_relevant_columns(df, result)
26
+ return df, result
27
 
28
  def _load(self):
29
+ # Load data
30
+ df = pd.read_csv("data/arxiv.csv")
31
+ return df
32
 
33
+ def _encode_papers(self, df):
34
+ # Encode the papers title
35
+ checkpoint = "distilbert-base-uncased"
36
+ model = SentenceTransformer(checkpoint)
37
+ embeddings = model.encode(df["title"], convert_to_tensor=True)
38
+ # embeddings column
39
+ df["embeddings"] = np.array(embeddings).tolist()
40
+ return df, model, embeddings
41
 
42
+ def _econde_querry(self, querry, model):
43
+ # Encode the querry
44
+ emb_querry = model.encode([querry])
45
+ return emb_querry
46
 
47
  def _make_search(self, df, emb_querry, embeddings):
48
+ # initialize a new index, using a HNSW index on Cosine Similarity
49
+ index = nmslib.init(method="hnsw", space="cosinesimil")
50
+ index.addDataPointBatch(embeddings)
51
+ index.createIndex({"post": 2}, print_progress=True)
52
+ # search
53
+ result = self._extract_search_result(index, emb_querry, df, k=10)
54
+ return result
55
 
56
+ def _extract_search_result(self, index, emb_querry, df, k):
57
+ data = []
58
+ idx, distances = index.knnQuery(emb_querry, k=k)
59
+ for i, j in zip(idx, distances):
60
+ data.append(
61
+ {
62
+ "index": i,
63
+ "title": df.title[i],
64
+ "abstract": df.abstract[i],
65
+ "similarity": 1.0 - j,
66
+ }
67
+ )
68
+ return pd.DataFrame(data)
69
 
70
  def _add_relevant_columns(self, df, result):
71
+ # get categories
72
+ df["categories_parsed"] = (
73
+ df.categories.str.split()
74
+ .apply(lambda x: x[0])
75
+ .str.split(".")
76
+ .apply(lambda x: x[0])
77
+ )
78
+ # create columns for plotting
79
+ df["index_papers"] = df.index
80
+ df["selected"] = df.index_papers.apply(lambda x: x in list(result["index"]))
81
+ return df
arxiv_tool/plot.py CHANGED
@@ -4,37 +4,48 @@ import plotly.express as px
4
  from pandas import DataFrame
5
  import numpy as np
6
 
 
7
  class EmbeddingPlotter:
8
- """ Lower the dimensionality of the representation from 768 -> 2, over the surface of the sphere """
9
 
10
  def transform(self, df, embeddings):
11
  df = self.umap_embedding(df, embeddings)
12
  fig1, fig2 = self.plot(df)
13
  return fig1, fig2
14
 
15
-
16
  def umap_embedding(self, df, embeddings):
17
  # UMAP - Spherical
18
- sphere_mapper = umap.UMAP(output_metric='haversine', random_state=42).fit(np.array(embeddings))
19
- df['spherical_emb_X'] = np.sin(sphere_mapper.embedding_[:,0])*np.cos(sphere_mapper.embedding_[:,1])
20
- df['spherical_emb_Y'] = np.sin(sphere_mapper.embedding_[:,0])*np.sin(sphere_mapper.embedding_[:,1])
21
- df['spherical_emb_Z'] = np.cos(sphere_mapper.embedding_[:,0])
 
 
 
 
 
 
22
  # UMAP - Lambert Conformal
23
- df['lambert_conformal_emb_x'] = np.arctan2(df['spherical_emb_X'], df['spherical_emb_Y'])
24
- df['lambert_conformal_emb_y'] = -np.arccos(df['spherical_emb_Z'])
 
 
25
  return df
26
 
27
  def plot(self, df):
28
  # on the 3d sphere
29
- fig1 = px.scatter_3d(df,
30
- x='spherical_emb_X',
31
- y='spherical_emb_Y',
32
- z='spherical_emb_Z',
33
- color="categories_parsed")
 
 
34
  # on the projected sphere
35
- fig2 = px.scatter(data_frame=df ,
36
- x='lambert_conformal_emb_x',
37
- y='lambert_conformal_emb_y',
38
- color="categories_parsed",
39
- )
40
- return fig1, fig2
 
 
4
  from pandas import DataFrame
5
  import numpy as np
6
 
7
+
8
  class EmbeddingPlotter:
9
+ """Lower the dimensionality of the representation from 768 -> 2, over the surface of the sphere"""
10
 
11
  def transform(self, df, embeddings):
12
  df = self.umap_embedding(df, embeddings)
13
  fig1, fig2 = self.plot(df)
14
  return fig1, fig2
15
 
 
16
  def umap_embedding(self, df, embeddings):
17
  # UMAP - Spherical
18
+ sphere_mapper = umap.UMAP(output_metric="haversine", random_state=42).fit(
19
+ np.array(embeddings)
20
+ )
21
+ df["spherical_emb_X"] = np.sin(sphere_mapper.embedding_[:, 0]) * np.cos(
22
+ sphere_mapper.embedding_[:, 1]
23
+ )
24
+ df["spherical_emb_Y"] = np.sin(sphere_mapper.embedding_[:, 0]) * np.sin(
25
+ sphere_mapper.embedding_[:, 1]
26
+ )
27
+ df["spherical_emb_Z"] = np.cos(sphere_mapper.embedding_[:, 0])
28
  # UMAP - Lambert Conformal
29
+ df["lambert_conformal_emb_x"] = np.arctan2(
30
+ df["spherical_emb_X"], df["spherical_emb_Y"]
31
+ )
32
+ df["lambert_conformal_emb_y"] = -np.arccos(df["spherical_emb_Z"])
33
  return df
34
 
35
  def plot(self, df):
36
  # on the 3d sphere
37
+ fig1 = px.scatter_3d(
38
+ df,
39
+ x="spherical_emb_X",
40
+ y="spherical_emb_Y",
41
+ z="spherical_emb_Z",
42
+ color="categories_parsed",
43
+ )
44
  # on the projected sphere
45
+ fig2 = px.scatter(
46
+ data_frame=df,
47
+ x="lambert_conformal_emb_x",
48
+ y="lambert_conformal_emb_y",
49
+ color="categories_parsed",
50
+ )
51
+ return fig1, fig2