Michael-Geis committed on
Commit 3d2ca49 • 1 Parent(s): b2af341

Reorganized and created a pipeline for CBF (content-based filtering)

.gitignore CHANGED
@@ -7,4 +7,5 @@ env
 settings.json
 collection.ipynb
 testing.ipynb
-testnb.ipynb
+testnb.ipynb
+output
config/main.yaml ADDED
@@ -0,0 +1,13 @@
+
+
+input_id:
+  id: "1602.00730"
+
+
+
+
+
+paths:
+  path_to_library: ./data/libraries/APSP_50_allenai-specter
+  path_to_save_recs: ./output/first_recs.feather
+
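
Note that the id is quoted: unquoted, YAML would parse 1602.00730 as the float 1602.0073 and corrupt the arXiv id. With Hydra in the requirements, a config like this would typically be read through a @hydra.main entry point. A minimal sketch, assuming standard Hydra conventions; the run() function and decorator arguments are illustrative, not code from this commit:

import hydra
from omegaconf import DictConfig


@hydra.main(config_path="config", config_name="main", version_base=None)
def run(cfg: DictConfig) -> None:
    # Fields defined in config/main.yaml above
    arxiv_id = cfg.input_id.id               # "1602.00730"
    library = cfg.paths.path_to_library      # ./data/libraries/APSP_50_allenai-specter
    recs_path = cfg.paths.path_to_save_recs  # ./output/first_recs.feather
    print(arxiv_id, library, recs_path)


if __name__ == "__main__":
    run()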
requirements.txt β†’ config/requirements.txt RENAMED
@@ -4,4 +4,5 @@ numpy
 arxiv
 sentence_transformers
 regex
-sklearn
+scikit-learn
+hydra-core
prediction-flow-sample.ipynb CHANGED
@@ -37,11 +37,11 @@
    }
   ],
   "source": [
-    "import embedding\n",
-    "import storage\n",
-    "from storage import ArXivData\n",
-    "from cleaning import TextCleaner\n",
-    "from embedding import Embedder\n",
+    "import src.embedding as embedding\n",
+    "import src.storage as storage\n",
+    "from src.storage import ArXivData\n",
+    "from src.cleaning import TextCleaner\n",
+    "from src.embedding import Embedder\n",
     "from sentence_transformers import util\n",
     "\n",
     "importlib.reload(embedding)\n",
@@ -301,6 +301,208 @@
     "\n",
     "prepped_library._returned_metadata.iloc[indices]"
    ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 20,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "id_list = [\"1602.00730\"]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 46,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "<module 'search' from 'c:\\\\Users\\\\Leems\\\\Desktop\\\\Coding\\\\Projects\\\\Fritz\\\\search.py'>"
+      ]
+     },
+     "execution_count": 46,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "import src.embedding as embedding\n",
+    "import src.search as search\n",
+    "import importlib\n",
+    "from src.storage import Fetch\n",
+    "from src.cleaning import TextCleaner\n",
+    "from src.embedding import Embedder\n",
+    "from src.search import Search\n",
+    "\n",
+    "importlib.reload(embedding)\n",
+    "importlib.reload(search)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 47,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "## Fetch metadata of input\n",
+    "getter = Fetch()\n",
+    "into_cleaner = getter.transform(X=id_list)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 48,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "cleaner = TextCleaner()\n",
+    "\n",
+    "into_embedder = cleaner.transform(into_cleaner)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 49,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "embedder = Embedder(model_name=\"allenai-specter\")\n",
+    "into_search = embedder.transform(into_embedder)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 52,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "['1602.00730v1', '1411.0658v3', '1905.05136v3', '1103.1276v4', '2003.04597v2']"
+      ]
+     },
+     "execution_count": 52,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "search = Search(path_to_library=\"./data/libraries/APSP_50_allenai-specter/\")\n",
+    "\n",
+    "search.transform(X=into_search).id.to_list()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 54,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>title</th>\n",
+       "      <th>abstract</th>\n",
+       "      <th>id</th>\n",
+       "      <th>arxiv_subjects</th>\n",
+       "      <th>msc_tags</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>38787</th>\n",
+       "      <td>C-infinity Scaling Asymptotics for the Spectra...</td>\n",
+       "      <td>This article concerns new off-diagonal estimat...</td>\n",
+       "      <td>1602.00730v1</td>\n",
+       "      <td>[math.AP, math-ph, math.DG, math.FA, math.MP, ...</td>\n",
+       "      <td>None</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>39127</th>\n",
+       "      <td>Scaling Limit for the Kernel of the Spectral P...</td>\n",
+       "      <td>Let (M, g) be a compact smooth Riemannian mani...</td>\n",
+       "      <td>1411.0658v3</td>\n",
+       "      <td>[math.SP, math.AP, math.DG]</td>\n",
+       "      <td>None</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>9786</th>\n",
+       "      <td>A logarithmic improvement in the two-point Wey...</td>\n",
+       "      <td>In this paper, we study the two-point Weyl Law...</td>\n",
+       "      <td>1905.05136v3</td>\n",
+       "      <td>[math.AP, math.SP]</td>\n",
+       "      <td>[35P20]</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>49609</th>\n",
+       "      <td>The blowup along the diagonal of the spectral ...</td>\n",
+       "      <td>We formulate a precise conjecture about the un...</td>\n",
+       "      <td>1103.1276v4</td>\n",
+       "      <td>[math.DG, math-ph, math.AP, math.MP]</td>\n",
+       "      <td>[58J50, 35J15, 33C45, 32C05]</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>14857</th>\n",
+       "      <td>Growth of high $L^p$ norms for eigenfunctions:...</td>\n",
+       "      <td>This work concerns $L^p$ norms of high energy ...</td>\n",
+       "      <td>2003.04597v2</td>\n",
+       "      <td>[math.AP, math.SP]</td>\n",
+       "      <td>None</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                                                   title  ...                      msc_tags\n",
+       "38787  C-infinity Scaling Asymptotics for the Spectra...  ...                          None\n",
+       "39127  Scaling Limit for the Kernel of the Spectral P...  ...                          None\n",
+       "9786   A logarithmic improvement in the two-point Wey...  ...                       [35P20]\n",
+       "49609  The blowup along the diagonal of the spectral ...  ...  [58J50, 35J15, 33C45, 32C05]\n",
+       "14857  Growth of high $L^p$ norms for eigenfunctions:...  ...                          None\n",
+       "\n",
+       "[5 rows x 5 columns]"
+      ]
+     },
+     "execution_count": 54,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "from sklearn.pipeline import Pipeline\n",
+    "\n",
+    "pipe = Pipeline(\n",
+    "    [\n",
+    "        (\"fetch\", Fetch()),\n",
+    "        (\"clean\", TextCleaner()),\n",
+    "        (\"embed\", Embedder(model_name=\"allenai-specter\")),\n",
+    "        (\"search\", Search(path_to_library=\"./data/libraries/APSP_50_allenai-specter/\")),\n",
+    "    ]\n",
+    ")\n",
+    "\n",
+    "\n",
+    "pipe.transform(X=id_list)"
+   ]
   }
  ],
  "metadata": {
__init__.py β†’ src/__init__.py RENAMED
File without changes
cleaning.py β†’ src/cleaning.py RENAMED
@@ -9,6 +9,18 @@ from sklearn.base import BaseEstimator, TransformerMixin


 class TextCleaner(BaseEstimator, TransformerMixin):
+    def fit(self, X, y=None):
+        return self
+
+    def transform(self, X, y=None):
+        doc_strings = (
+            X.title.apply(cleanse) + " " + X.abstract.apply(cleanse)
+        ).to_list()
+
+        return doc_strings
+
+
+class FullTextCleaner(BaseEstimator, TransformerMixin):
     """Return ArXivData class object with its metadata attribute modified so that
     1. The 'title' and 'abstract' columns have been scrubbed of latex and accented characters
     2. The msc tag list has been translated to english.
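
The new TextCleaner.transform expects the metadata DataFrame produced by Fetch and emits one document string per paper. A small usage sketch; the toy row is taken from the notebook output above, and cleanse is the module's existing helper, whose exact output this diff does not show:

import pandas as pd
from src.cleaning import TextCleaner

# Toy input shaped like the DataFrame Fetch returns; only the columns
# TextCleaner touches are needed.
meta = pd.DataFrame(
    {
        "title": ["A logarithmic improvement in the two-point Weyl law"],
        "abstract": ["In this paper, we study the two-point Weyl Law..."],
    }
)

docs = TextCleaner().transform(meta)
# One "title abstract" string per row (scrubbed by cleanse); this list feeds Embedder.
print(len(docs))  # 1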
embedding.py β†’ src/embedding.py RENAMED
@@ -1,4 +1,4 @@
-import cleaning as clean
+import src.cleaning as clean
 from sentence_transformers import SentenceTransformer, util
 import pandas as pd
 import numpy as np
@@ -8,6 +8,23 @@ import os


 class Embedder(BaseEstimator, TransformerMixin):
+    """Takes a list of clean strings and outputs a numpy array of their embeddings generated by the ST model model_name."""
+
+    def __init__(self, model_name) -> None:
+        super().__init__()
+        self.model_name = model_name
+
+    def fit(self, X, y=None):
+        return self
+
+    def transform(self, X, y=None):
+        encoder = SentenceTransformer(self.model_name)
+        embedded_documents = encoder.encode(sentences=X)
+
+        return embedded_documents
+
+
+class FullEmbedder(BaseEstimator, TransformerMixin):
     """A class to handle creating sentence transformer embeddings from a clean arxiv dataset."""

     def fit(self, X, y=None):
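
A quick check of the new Embedder contract: a list of strings in, a numpy array out. The allenai-specter SentenceTransformer model produces 768-dimensional vectors, so one 768-long row per document. The input string below is illustrative:

import numpy as np
from src.embedding import Embedder

docs = ["a logarithmic improvement in the two-point weyl law ..."]  # TextCleaner output
vecs = Embedder(model_name="allenai-specter").transform(docs)

assert isinstance(vecs, np.ndarray)
print(vecs.shape)  # (1, 768)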
src/model.py ADDED
@@ -0,0 +1,33 @@
+import pandas as pd
+from sklearn.pipeline import Pipeline
+from src.storage import Fetch
+from src.cleaning import TextCleaner
+from src.embedding import Embedder
+from src.search import Search
+
+
+def main():
+    id_list = [id]
+    path_to_library = ""
+    save_recs = False
+    path_to_save_recs = ""
+
+    ## Create pipeline
+
+    model = Pipeline(
+        [
+            ("fetch", Fetch()),
+            ("clean", TextCleaner()),
+            ("embed", Embedder(model_name="allenai-specter")),
+            ("search", Search(path_to_library=path_to_library)),
+        ]
+    )
+
+    recommendation_df = model.transform(id_list)
+
+    if save_recs:
+        recommendation_df.to_feather(path_to_save_recs)
+
+
+if __name__ == "__main__":
+    main()
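
One caveat on the save step: Search.transform returns library_metadata.iloc[recommended_indices], which keeps the library's row labels (38787, 39127, ... in the notebook above), and pandas' to_feather raises a ValueError on any non-default index. If saving is enabled, the recommendations would need their index reset first, e.g.:

# feather requires a default RangeIndex, so drop the library row labels before writing
recommendation_df.reset_index(drop=True).to_feather(path_to_save_recs)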
src/search.py ADDED
@@ -0,0 +1,30 @@
+from sentence_transformers import util
+from sklearn.base import BaseEstimator, TransformerMixin
+import os
+import pandas as pd
+
+
+class Search(BaseEstimator, TransformerMixin):
+    def __init__(self, path_to_library) -> None:
+        super().__init__()
+
+        self.path_to_library = path_to_library
+
+    def fit(self, X, y=None):
+        return self
+
+    def transform(self, X, y=None):
+        library_metadata = pd.read_feather(
+            os.path.join(self.path_to_library, "metadata.feather")
+        )
+        library_embeddings = pd.read_feather(
+            os.path.join(self.path_to_library, "embeddings.feather")
+        ).values
+
+        matches = util.semantic_search(
+            query_embeddings=X, corpus_embeddings=library_embeddings, top_k=5
+        )
+
+        recommended_indices = [match["corpus_id"] for match in matches[0]]
+
+        return library_metadata.iloc[recommended_indices]
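
util.semantic_search returns one hit list per query row; each hit is a dict with corpus_id (a row index into the corpus) and score (cosine similarity by default), which is why transform reads matches[0] for the single input paper. A self-contained illustration with random stand-in embeddings:

import numpy as np
from sentence_transformers import util

query = np.random.rand(1, 768).astype(np.float32)     # one embedded input paper
corpus = np.random.rand(100, 768).astype(np.float32)  # stand-in for the library embeddings

matches = util.semantic_search(query_embeddings=query, corpus_embeddings=corpus, top_k=5)
print([hit["corpus_id"] for hit in matches[0]])  # five library row indices, best match first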
storage.py β†’ src/storage.py RENAMED
@@ -1,8 +1,16 @@
 import arxiv
 import pandas as pd
 import numpy as np
-import cleaning as clean
-from dataclasses import dataclass, astuple, asdict
+import src.cleaning as clean
+from sklearn.base import TransformerMixin, BaseEstimator
+
+
+class Fetch(BaseEstimator, TransformerMixin):
+    def fit(self, X, y=None):
+        return self
+
+    def transform(self, X, y=None):
+        return query_to_df(id_list=X)


 class ArXivData:
@@ -62,7 +70,7 @@ class ArXivData:
         self.metadata.to_feather(path_to_dataset)


-def query_to_df(query=None, id_list=None, max_results=None, offset=0):
+def query_to_df(query=None, id_list=None, max_results=10, offset=0):
     """Returns the results of an arxiv API query in a pandas dataframe.

     Args:
@@ -84,6 +92,7 @@ def query_to_df(query=None, id_list=None, max_results=None, offset=0):
     client = arxiv.Client(page_size=2000, num_retries=10)

     if id_list:
+        max_results = len(id_list)
         search = arxiv.Search(
             id_list=id_list,
             max_results=max_results,
@@ -102,8 +111,7 @@ def query_to_df(query=None, id_list=None, max_results=None, offset=0):
         sort_by=arxiv.SortCriterion.LastUpdatedDate,
     )

-    columns = ["title", "summary", "categories", "id"]
-    index = range(offset, max_results)
+    columns = ["title", "abstract", "authors", "categories", "id"]

     results = client.results(search, offset=offset)

@@ -111,14 +119,14 @@ def query_to_df(query=None, id_list=None, max_results=None, offset=0):
         (
             result.title,
             result.summary,
+            [author.name for author in result.authors],
             result.categories,
             result.entry_id.split("/")[-1],
         )
         for result in results
     )

-    returned_metadata = pd.DataFrame(metadata_generator, columns=columns, index=index)
-    returned_metadata = returned_metadata.rename(columns={"summary": "abstract"})
+    returned_metadata = pd.DataFrame(metadata_generator, columns=columns)
     return returned_metadata
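
End to end, the new Fetch front end reduces to a single query_to_df call. A usage sketch; note it hits the live arXiv API, and max_results is pinned to len(id_list) by the new branch above:

from src.storage import Fetch

df = Fetch().transform(X=["1602.00730"])
print(df.columns.to_list())  # ['title', 'abstract', 'authors', 'categories', 'id']
print(df.id.iloc[0])         # '1602.00730v1' -- the entry_id tail keeps the version suffix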