Michael-Geis committed on
Commit 3d2ca49 • 1 Parent(s): b2af341

Reorganized and created a pipeline for CBF (content-based filtering)

.gitignore CHANGED
@@ -7,4 +7,5 @@ env
 settings.json
 collection.ipynb
 testing.ipynb
-testnb.ipynb
+testnb.ipynb
+output
config/main.yaml ADDED
@@ -0,0 +1,13 @@
+
+
+input_id:
+  id: "1602.00730"
+
+
+
+
+
+paths:
+  path_to_library: ./data/libraries/APSP_50_allenai-specter
+  path_to_save_recs: ./output/first_recs.feather
+
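
Note that the id is quoted: unquoted, YAML would parse 1602.00730 as the float 1602.0073 and corrupt the arXiv id. With Hydra in the requirements, a config like this would typically be read through a @hydra.main entry point. A minimal sketch, assuming standard Hydra conventions; the run() function and decorator arguments are illustrative, not code from this commit:

import hydra
from omegaconf import DictConfig


@hydra.main(config_path="config", config_name="main", version_base=None)
def run(cfg: DictConfig) -> None:
    # Fields defined in config/main.yaml above
    arxiv_id = cfg.input_id.id               # "1602.00730"
    library = cfg.paths.path_to_library      # ./data/libraries/APSP_50_allenai-specter
    recs_path = cfg.paths.path_to_save_recs  # ./output/first_recs.feather
    print(arxiv_id, library, recs_path)


if __name__ == "__main__":
    run()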
requirements.txt β†’ config/requirements.txt RENAMED
@@ -4,4 +4,5 @@ numpy
 arxiv
 sentence_transformers
 regex
-sklearn
+scikit-learn
+hydra-core
prediction-flow-sample.ipynb CHANGED
@@ -37,11 +37,11 @@
    }
   ],
   "source": [
-    "import embedding\n",
-    "import storage\n",
-    "from storage import ArXivData\n",
-    "from cleaning import TextCleaner\n",
-    "from embedding import Embedder\n",
+    "import src.embedding as embedding\n",
+    "import src.storage as storage\n",
+    "from src.storage import ArXivData\n",
+    "from src.cleaning import TextCleaner\n",
+    "from src.embedding import Embedder\n",
     "from sentence_transformers import util\n",
     "\n",
     "importlib.reload(embedding)\n",
@@ -301,6 +301,208 @@
     "\n",
     "prepped_library._returned_metadata.iloc[indices]"
    ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 20,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "id_list = [\"1602.00730\"]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 46,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "<module 'search' from 'c:\\\\Users\\\\Leems\\\\Desktop\\\\Coding\\\\Projects\\\\Fritz\\\\search.py'>"
+      ]
+     },
+     "execution_count": 46,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "import src.embedding as embedding\n",
+    "import src.search as search\n",
+    "import importlib\n",
+    "from src.storage import Fetch\n",
+    "from src.cleaning import TextCleaner\n",
+    "from src.embedding import Embedder\n",
+    "from src.search import Search\n",
+    "\n",
+    "importlib.reload(embedding)\n",
+    "importlib.reload(search)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 47,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "## Fetch metadata of input\n",
+    "getter = Fetch()\n",
+    "into_cleaner = getter.transform(X=id_list)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 48,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "cleaner = TextCleaner()\n",
+    "\n",
+    "into_embedder = cleaner.transform(into_cleaner)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 49,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "embedder = Embedder(model_name=\"allenai-specter\")\n",
+    "into_search = embedder.transform(into_embedder)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 52,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "['1602.00730v1', '1411.0658v3', '1905.05136v3', '1103.1276v4', '2003.04597v2']"
+      ]
+     },
+     "execution_count": 52,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "search = Search(path_to_library=\"./data/libraries/APSP_50_allenai-specter/\")\n",
+    "\n",
+    "search.transform(X=into_search).id.to_list()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 54,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>title</th>\n",
+       "      <th>abstract</th>\n",
+       "      <th>id</th>\n",
+       "      <th>arxiv_subjects</th>\n",
+       "      <th>msc_tags</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>38787</th>\n",
+       "      <td>C-infinity Scaling Asymptotics for the Spectra...</td>\n",
+       "      <td>This article concerns new off-diagonal estimat...</td>\n",
+       "      <td>1602.00730v1</td>\n",
+       "      <td>[math.AP, math-ph, math.DG, math.FA, math.MP, ...</td>\n",
+       "      <td>None</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>39127</th>\n",
+       "      <td>Scaling Limit for the Kernel of the Spectral P...</td>\n",
+       "      <td>Let (M, g) be a compact smooth Riemannian mani...</td>\n",
+       "      <td>1411.0658v3</td>\n",
+       "      <td>[math.SP, math.AP, math.DG]</td>\n",
+       "      <td>None</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>9786</th>\n",
+       "      <td>A logarithmic improvement in the two-point Wey...</td>\n",
+       "      <td>In this paper, we study the two-point Weyl Law...</td>\n",
+       "      <td>1905.05136v3</td>\n",
+       "      <td>[math.AP, math.SP]</td>\n",
+       "      <td>[35P20]</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>49609</th>\n",
+       "      <td>The blowup along the diagonal of the spectral ...</td>\n",
+       "      <td>We formulate a precise conjecture about the un...</td>\n",
+       "      <td>1103.1276v4</td>\n",
+       "      <td>[math.DG, math-ph, math.AP, math.MP]</td>\n",
+       "      <td>[58J50, 35J15, 33C45, 32C05]</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>14857</th>\n",
+       "      <td>Growth of high $L^p$ norms for eigenfunctions:...</td>\n",
+       "      <td>This work concerns $L^p$ norms of high energy ...</td>\n",
+       "      <td>2003.04597v2</td>\n",
+       "      <td>[math.AP, math.SP]</td>\n",
+       "      <td>None</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                                                   title  ...                      msc_tags\n",
+       "38787  C-infinity Scaling Asymptotics for the Spectra...  ...                          None\n",
+       "39127  Scaling Limit for the Kernel of the Spectral P...  ...                          None\n",
+       "9786   A logarithmic improvement in the two-point Wey...  ...                       [35P20]\n",
+       "49609  The blowup along the diagonal of the spectral ...  ...  [58J50, 35J15, 33C45, 32C05]\n",
+       "14857  Growth of high $L^p$ norms for eigenfunctions:...  ...                          None\n",
+       "\n",
+       "[5 rows x 5 columns]"
+      ]
+     },
+     "execution_count": 54,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "from sklearn.pipeline import Pipeline\n",
+    "\n",
+    "pipe = Pipeline(\n",
+    "    [\n",
+    "        (\"fetch\", Fetch()),\n",
+    "        (\"clean\", TextCleaner()),\n",
+    "        (\"embed\", Embedder(model_name=\"allenai-specter\")),\n",
+    "        (\"search\", Search(path_to_library=\"./data/libraries/APSP_50_allenai-specter/\")),\n",
+    "    ]\n",
+    ")\n",
+    "\n",
+    "\n",
+    "pipe.transform(X=id_list)"
+   ]
   }
  ],
  "metadata": {
__init__.py β†’ src/__init__.py RENAMED
File without changes
cleaning.py β†’ src/cleaning.py RENAMED
@@ -9,6 +9,18 @@ from sklearn.base import BaseEstimator, TransformerMixin


 class TextCleaner(BaseEstimator, TransformerMixin):
+    def fit(self, X, y=None):
+        return self
+
+    def transform(self, X, y=None):
+        doc_strings = (
+            X.title.apply(cleanse) + " " + X.abstract.apply(cleanse)
+        ).to_list()
+
+        return doc_strings
+
+
+class FullTextCleaner(BaseEstimator, TransformerMixin):
     """Return ArXivData class object with its metadata attribute modified so that
     1. The 'title' and 'abstract' columns have been scrubbed of latex and accented characters
     2. The msc tag list has been translated to english.
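
The new TextCleaner.transform expects the metadata DataFrame produced by Fetch and emits one document string per paper. A small usage sketch; the toy row is taken from the notebook output above, and cleanse is the module's existing helper, whose exact output this diff does not show:

import pandas as pd
from src.cleaning import TextCleaner

# Toy input shaped like the DataFrame Fetch returns; only the columns
# TextCleaner touches are needed.
meta = pd.DataFrame(
    {
        "title": ["A logarithmic improvement in the two-point Weyl law"],
        "abstract": ["In this paper, we study the two-point Weyl Law..."],
    }
)

docs = TextCleaner().transform(meta)
# One "title abstract" string per row (scrubbed by cleanse); this list feeds Embedder.
print(len(docs))  # 1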
embedding.py β†’ src/embedding.py RENAMED
@@ -1,4 +1,4 @@
-import cleaning as clean
+import src.cleaning as clean
 from sentence_transformers import SentenceTransformer, util
 import pandas as pd
 import numpy as np
@@ -8,6 +8,23 @@ import os


 class Embedder(BaseEstimator, TransformerMixin):
+    """Takes a list of clean strings and outputs a numpy array of their embeddings generated by the ST model model_name."""
+
+    def __init__(self, model_name) -> None:
+        super().__init__()
+        self.model_name = model_name
+
+    def fit(self, X, y=None):
+        return self
+
+    def transform(self, X, y=None):
+        encoder = SentenceTransformer(self.model_name)
+        embedded_documents = encoder.encode(sentences=X)
+
+        return embedded_documents
+
+
+class FullEmbedder(BaseEstimator, TransformerMixin):
     """A class to handle creating sentence transformer embeddings from a clean arxiv dataset."""

     def fit(self, X, y=None):
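
A quick check of the new Embedder contract: a list of strings in, a numpy array out. The allenai-specter SentenceTransformer model produces 768-dimensional vectors, so one 768-long row per document. The input string below is illustrative:

import numpy as np
from src.embedding import Embedder

docs = ["a logarithmic improvement in the two-point weyl law ..."]  # TextCleaner output
vecs = Embedder(model_name="allenai-specter").transform(docs)

assert isinstance(vecs, np.ndarray)
print(vecs.shape)  # (1, 768)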
src/model.py ADDED
@@ -0,0 +1,33 @@
+import pandas as pd
+from sklearn.pipeline import Pipeline
+from src.storage import Fetch
+from src.cleaning import TextCleaner
+from src.embedding import Embedder
+from src.search import Search
+
+
+def main():
+    id_list = [id]
+    path_to_library = ""
+    save_recs = False
+    path_to_save_recs = ""
+
+    ## Create pipeline
+
+    model = Pipeline(
+        [
+            ("fetch", Fetch()),
+            ("clean", TextCleaner()),
+            ("embed", Embedder(model_name="allenai-specter")),
+            ("search", Search(path_to_library=path_to_library)),
+        ]
+    )
+
+    recommendation_df = model.transform(id_list)
+
+    if save_recs:
+        recommendation_df.to_feather(path_to_save_recs)
+
+
+if __name__ == "__main__":
+    main()
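
One caveat on the save step: Search.transform returns library_metadata.iloc[recommended_indices], which keeps the library's row labels (38787, 39127, ... in the notebook above), and pandas' to_feather raises a ValueError on any non-default index. If saving is enabled, the recommendations would need their index reset first, e.g.:

# feather requires a default RangeIndex, so drop the library row labels before writing
recommendation_df.reset_index(drop=True).to_feather(path_to_save_recs)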
src/search.py ADDED
@@ -0,0 +1,30 @@
+from sentence_transformers import util
+from sklearn.base import BaseEstimator, TransformerMixin
+import os
+import pandas as pd
+
+
+class Search(BaseEstimator, TransformerMixin):
+    def __init__(self, path_to_library) -> None:
+        super().__init__()
+
+        self.path_to_library = path_to_library
+
+    def fit(self, X, y=None):
+        return self
+
+    def transform(self, X, y=None):
+        library_metadata = pd.read_feather(
+            os.path.join(self.path_to_library, "metadata.feather")
+        )
+        library_embeddings = pd.read_feather(
+            os.path.join(self.path_to_library, "embeddings.feather")
+        ).values
+
+        matches = util.semantic_search(
+            query_embeddings=X, corpus_embeddings=library_embeddings, top_k=5
+        )
+
+        recommended_indices = [match["corpus_id"] for match in matches[0]]
+
+        return library_metadata.iloc[recommended_indices]
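
util.semantic_search returns one hit list per query row; each hit is a dict with corpus_id (a row index into the corpus) and score (cosine similarity by default), which is why transform reads matches[0] for the single input paper. A self-contained illustration with random stand-in embeddings:

import numpy as np
from sentence_transformers import util

query = np.random.rand(1, 768).astype(np.float32)     # one embedded input paper
corpus = np.random.rand(100, 768).astype(np.float32)  # stand-in for the library embeddings

matches = util.semantic_search(query_embeddings=query, corpus_embeddings=corpus, top_k=5)
print([hit["corpus_id"] for hit in matches[0]])  # five library row indices, best match first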
storage.py β†’ src/storage.py RENAMED
@@ -1,8 +1,16 @@
 import arxiv
 import pandas as pd
 import numpy as np
-import cleaning as clean
-from dataclasses import dataclass, astuple, asdict
+import src.cleaning as clean
+from sklearn.base import TransformerMixin, BaseEstimator
+
+
+class Fetch(BaseEstimator, TransformerMixin):
+    def fit(self, X, y=None):
+        return self
+
+    def transform(self, X, y=None):
+        return query_to_df(id_list=X)


 class ArXivData:
@@ -62,7 +70,7 @@ class ArXivData:
         self.metadata.to_feather(path_to_dataset)


-def query_to_df(query=None, id_list=None, max_results=None, offset=0):
+def query_to_df(query=None, id_list=None, max_results=10, offset=0):
     """Returns the results of an arxiv API query in a pandas dataframe.

     Args:
@@ -84,6 +92,7 @@ def query_to_df(query=None, id_list=None, max_results=None, offset=0):
     client = arxiv.Client(page_size=2000, num_retries=10)

     if id_list:
+        max_results = len(id_list)
         search = arxiv.Search(
             id_list=id_list,
             max_results=max_results,
@@ -102,8 +111,7 @@ def query_to_df(query=None, id_list=None, max_results=None, offset=0):
         sort_by=arxiv.SortCriterion.LastUpdatedDate,
     )

-    columns = ["title", "summary", "categories", "id"]
-    index = range(offset, max_results)
+    columns = ["title", "abstract", "authors", "categories", "id"]

     results = client.results(search, offset=offset)

@@ -111,14 +119,14 @@ def query_to_df(query=None, id_list=None, max_results=None, offset=0):
         (
             result.title,
             result.summary,
+            [author.name for author in result.authors],
             result.categories,
             result.entry_id.split("/")[-1],
         )
         for result in results
     )

-    returned_metadata = pd.DataFrame(metadata_generator, columns=columns, index=index)
-    returned_metadata = returned_metadata.rename(columns={"summary": "abstract"})
+    returned_metadata = pd.DataFrame(metadata_generator, columns=columns)
     return returned_metadata
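
End to end, the new Fetch front end reduces to a single query_to_df call. A usage sketch; note it hits the live arXiv API, and max_results is pinned to len(id_list) by the new branch above:

from src.storage import Fetch

df = Fetch().transform(X=["1602.00730"])
print(df.columns.to_list())  # ['title', 'abstract', 'authors', 'categories', 'id']
print(df.id.iloc[0])         # '1602.00730v1' -- the entry_id tail keeps the version suffix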