Michael-Geis committed on
Commit
fcfd917
1 Parent(s): cbdef5e

updated load_from_query in data_storage, added to data cleaning

Browse files
Files changed (3) hide show
  1. collection.ipynb +913 -92
  2. data_cleaning.py +16 -25
  3. data_storage.py +27 -15
collection.ipynb CHANGED
@@ -6,7 +6,7 @@
6
  "metadata": {},
7
  "outputs": [],
8
  "source": [
9
- "from util import format_query , query_to_df\n",
10
  "import pandas as pd\n",
11
  "import numpy as np"
12
  ]
@@ -19,10 +19,10 @@
19
  "source": [
20
  "## Try collecting data for PDE articles\n",
21
  "\n",
22
- "max_results=3e3\n",
23
- "cat='math.AP'\n",
24
  "query = format_query(cat=cat)\n",
25
- "pde = query_to_df(query=query,max_results=max_results)\n"
26
  ]
27
  },
28
  {
@@ -66,19 +66,18 @@
66
  "## Find the msc subject tags within the categories\n",
67
  "import regex\n",
68
  "\n",
 
69
  "def find_msc(cat_list):\n",
70
- " pattern = r'\\b\\d{2}[0-9a-zA-Z]{3}\\b'\n",
71
  " out = []\n",
72
  " for cat in cat_list:\n",
73
- " tags = regex.findall(pattern,cat)\n",
74
  " for tag in tags:\n",
75
  " out.append(tag)\n",
76
  " if out == []:\n",
77
  " return None\n",
78
  " else:\n",
79
- " return out\n",
80
- "\n",
81
- " "
82
  ]
83
  },
84
  {
@@ -89,7 +88,7 @@
89
  "source": [
90
  "## Now create a new column for msc tags\n",
91
  "\n",
92
- "pde['msc_tags'] = pde.categories.apply(find_msc)"
93
  ]
94
  },
95
  {
@@ -119,7 +118,7 @@
119
  "\n",
120
  "msc = pde.msc_tags.sample(10)\n",
121
  "for tag in msc:\n",
122
- " print(tag)\n"
123
  ]
124
  },
125
  {
@@ -139,9 +138,8 @@
139
  "## What fraction of these articles have at least one MSC tag?\n",
140
  "\n",
141
  "tagged = pde.msc_tags.count()\n",
142
- "fraction = tagged/len(pde)\n",
143
- "print(fraction)\n",
144
- "\n"
145
  ]
146
  },
147
  {
@@ -153,11 +151,11 @@
153
  "## Now we are going to see if we can extract the MSC codes using HTML parsing with BeautifulSoup\n",
154
  "\n",
155
  "from bs4 import BeautifulSoup\n",
156
- "import requests \n",
157
  "\n",
158
- "url = r'https://cran.r-project.org/web/classifications/MSC.html'\n",
159
  "\n",
160
- "source = requests.get(url)\n"
161
  ]
162
  },
163
  {
@@ -177,7 +175,7 @@
177
  }
178
  ],
179
  "source": [
180
- "source.headers['content-type']"
181
  ]
182
  },
183
  {
@@ -19291,7 +19289,7 @@
19291
  }
19292
  ],
19293
  "source": [
19294
- "soup = BeautifulSoup(document, 'html.parser')\n",
19295
  "print(soup.prettify())"
19296
  ]
19297
  },
@@ -19338,25 +19336,24 @@
19338
  "import PyPDF2\n",
19339
  "import regex\n",
19340
  "\n",
19341
- "with open('msc2020.pdf', 'rb') as file:\n",
19342
- "\n",
19343
  " reader = PyPDF2.PdfReader(file)\n",
19344
  " print(len(reader.pages))\n",
19345
  " page = reader.pages[0]\n",
19346
  " raw_text = page.extract_text()\n",
19347
  "\n",
19348
- " lines = raw_text.split('\\n')\n",
19349
  "\n",
19350
  " subject_dict = {}\n",
19351
  " for line in lines[2:]:\n",
19352
  " subject_dict[str(line[:2])] = line[2:]\n",
19353
  "\n",
19354
  "\n",
19355
- "subject_dict['44'] = 'Integral transforms, operational calculus'\n",
19356
- "subject_dict['45'] = 'Integral equations'\n",
19357
  "\n",
19358
  "for k in subject_dict.keys():\n",
19359
- " subject_dict[k] = regex.sub(r'\\x0b','ff',subject_dict[k])\n",
19360
  "\n",
19361
  "print(subject_dict)"
19362
  ]
@@ -19375,7 +19372,7 @@
19375
  }
19376
  ],
19377
  "source": [
19378
- "subject_dict.pop('1')\n",
19379
  "print(subject_dict)"
19380
  ]
19381
  },
@@ -19390,9 +19387,8 @@
19390
  "import json\n",
19391
  "\n",
19392
  "json_subjects = json.dumps(subject_dict)\n",
19393
- "with open('./data/msc_subjects.json','w+') as file:\n",
19394
- " file.write(json_subjects)\n",
19395
- " "
19396
  ]
19397
  },
19398
  {
@@ -19476,6 +19472,7 @@
19476
  "source": [
19477
  "import util\n",
19478
  "import importlib\n",
 
19479
  "importlib.reload(util)\n",
19480
  "\n",
19481
  "util.msc_subjects()"
@@ -19489,11 +19486,10 @@
19489
  "source": [
19490
  "## Next we make a dictionary consisting of all other subject tags\n",
19491
  "\n",
19492
- "with open('msc2020.pdf', 'rb') as file:\n",
19493
- "\n",
19494
  " reader = PyPDF2.PdfReader(file)\n",
19495
  " page = reader.pages[3]\n",
19496
- " raw_text = page.extract_text()\n"
19497
  ]
19498
  },
19499
  {
@@ -19552,17 +19548,17 @@
19552
  "source": [
19553
  "## Try splitting on a pattern \\d\\d[A-Z]xx\n",
19554
  "\n",
19555
- "pattern = r'\\b\\d\\d[A-Z]xx\\b'\n",
19556
- "splitting = regex.split(pattern,raw_text)\n",
19557
  "# for line in splitting:\n",
19558
- " # print(line + 'END')\n",
19559
  "\n",
19560
  "print(splitting[1])\n",
19561
  "\n",
19562
- "## Within each of these, find all text between two instances of the pattern \n",
19563
- "tag_pattern = r'(\\b\\d\\d[A-Z]\\d\\d)\\s(.*)(?!\\b\\d\\d[A-Z]\\d\\d\\b)'\n",
19564
  "\n",
19565
- "patterns = regex.findall(tag_pattern, splitting[1])\n"
19566
  ]
19567
  },
19568
  {
@@ -19648,7 +19644,7 @@
19648
  "## Turn this into a dict\n",
19649
  "dict = {}\n",
19650
  "for item in patterns:\n",
19651
- " k , v = item\n",
19652
  " dict[k] = v\n",
19653
  "\n",
19654
  "for item in dict.items():\n",
@@ -19661,24 +19657,22 @@
19661
  "metadata": {},
19662
  "outputs": [],
19663
  "source": [
19664
- "\n",
19665
- "## Within each of these, find all text between two instances of the pattern \n",
19666
- "tag_pattern = r'(\\b\\d\\d[A-Z]\\d\\d)\\s(.*)(?!\\b\\d\\d[A-Z]\\d\\d\\b)'\n",
19667
  "dict = {}\n",
19668
  "\n",
19669
- "with open('msc2020.pdf', 'rb') as file:\n",
19670
- "\n",
19671
  " reader = PyPDF2.PdfReader(file)\n",
19672
  " for page in reader.pages:\n",
19673
  " page_text = page.extract_text()\n",
19674
  "\n",
19675
- " ## Find all the msc tags \n",
19676
  "\n",
19677
  " tags = regex.findall(tag_pattern, page_text)\n",
19678
- " \n",
19679
  " for item in tags:\n",
19680
- " k , v = item\n",
19681
- " dict[k] = v\n"
19682
  ]
19683
  },
19684
  {
@@ -20737,15 +20731,15 @@
20737
  "source": [
20738
  "def clean_msc_dict(dict):\n",
20739
  " for item in dict.items():\n",
20740
- " k , v = item \n",
20741
- " v = regex.sub(r'\\x0c','fi',v)\n",
20742
- " v = regex.sub(r'\\x0b','ff',v)\n",
20743
- " v = regex.sub(r'\\r','fl',v)\n",
20744
- " v = regex.sub(r'\\xf7 ','',v)\n",
20745
- " v = regex.sub(r'\\x0e','ffi',v)\n",
20746
- " v = regex.sub(r'\\x13','',v)\n",
20747
  " dict[k] = v\n",
20748
- " return dict\n"
20749
  ]
20750
  },
20751
  {
@@ -31666,7 +31660,8 @@
31666
  "outputs": [],
31667
  "source": [
31668
  "import pandas as pd\n",
31669
- "codes = pd.read_csv('./data/MSC_2020.csv', encoding='windows-1252', on_bad_lines='skip')"
 
31670
  ]
31671
  },
31672
  {
@@ -31749,15 +31744,17 @@
31749
  "source": [
31750
  "## Look for all entries that start with a valid msc tag pattern\n",
31751
  "\n",
31752
- "pattern = '\\d\\d[A-Z]\\d\\d'\n",
 
31753
  "\n",
31754
  "def check_valid(entry):\n",
31755
- " if regex.match(pattern,entry):\n",
31756
  " return True\n",
31757
  " else:\n",
31758
  " return False\n",
31759
  "\n",
31760
- "valid_codes = codes.loc[codes['code\\ttext\\tdescription'].apply(check_valid)]\n"
 
31761
  ]
31762
  },
31763
  {
@@ -31873,11 +31870,11 @@
31873
  "source": [
31874
  "dict = {}\n",
31875
  "\n",
31876
- "for entry in valid_codes['code\\ttext\\tdescription']:\n",
31877
- " split = entry.split('\\t')\n",
31878
  " code = split[0]\n",
31879
  " desc = split[1][1:-1]\n",
31880
- " dict[code] = desc\n"
31881
  ]
31882
  },
31883
  {
@@ -33920,9 +33917,9 @@
33920
  "## Good, but we can improve it by removing the '\\\\(' and '\\\\)' characters\n",
33921
  "\n",
33922
  "for item in dict.items():\n",
33923
- " k , v = item\n",
33924
- " v = v.replace('\\\\(','')\n",
33925
- " v = v.replace('\\\\)','')\n",
33926
  " dict[k] = v\n",
33927
  "\n",
33928
  "dict"
@@ -33963,8 +33960,8 @@
33963
  "dict_stripped_accents = {}\n",
33964
  "\n",
33965
  "for item in dict.items():\n",
33966
- " k , v = item\n",
33967
- " dict_stripped_accents[k] = unidecode(v)\n"
33968
  ]
33969
  },
33970
  {
@@ -33973,7 +33970,7 @@
33973
  "metadata": {},
33974
  "outputs": [],
33975
  "source": [
33976
- "with open('./data/msc.json','w') as file:\n",
33977
  " json = json.dumps(dict_stripped_accents)\n",
33978
  " file.write(json)"
33979
  ]
@@ -35010,7 +35007,7 @@
35010
  "\n",
35011
  "lib = Library()\n",
35012
  "\n",
35013
- "lib.load_from_query(query_string='cat:math.AP',max_results=5000)"
35014
  ]
35015
  },
35016
  {
@@ -35386,6 +35383,7 @@
35386
  "outputs": [],
35387
  "source": [
35388
  "import importlib\n",
 
35389
  "importlib.reload(util)\n",
35390
  "\n",
35391
  "lib.clean_library()"
@@ -35659,9 +35657,7 @@
35659
  "from library_class import Library\n",
35660
  "\n",
35661
  "lib = Library()\n",
35662
- "lib.load_from_query(query_string='cat:math.AP OR math.SP',max_results=2e4)\n",
35663
- "\n",
35664
- "\n"
35665
  ]
35666
  },
35667
  {
@@ -35671,7 +35667,7 @@
35671
  "outputs": [],
35672
  "source": [
35673
  "raw_lib = lib.raw_lib\n",
35674
- "raw_lib.to_parquet('./data/APSP.parquet')"
35675
  ]
35676
  },
35677
  {
@@ -35682,7 +35678,7 @@
35682
  "source": [
35683
  "## Is the list information preserved?\n",
35684
  "\n",
35685
- "df = pd.read_parquet('./data/APSP.parquet')"
35686
  ]
35687
  },
35688
  {
@@ -35808,7 +35804,7 @@
35808
  "metadata": {},
35809
  "outputs": [],
35810
  "source": [
35811
- "pd.set_option('display.max_colwidth', 0)"
35812
  ]
35813
  },
35814
  {
@@ -35820,13 +35816,14 @@
35820
  "from cleaning import cleaning\n",
35821
  "import pandas as pd\n",
35822
  "import importlib\n",
 
35823
  "importlib.reload(cleaning)\n",
35824
  "\n",
35825
- "data = pd.read_parquet('./data/APSP.parquet')\n",
35826
  "\n",
35827
  "clean_data = cleaning.main(\n",
35828
- " raw_arxiv_results=data,path_to_embeddings='./data/APSP_mini_vec.parquet'\n",
35829
- ")\n"
35830
  ]
35831
  },
35832
  {
@@ -35950,7 +35947,7 @@
35950
  }
35951
  ],
35952
  "source": [
35953
- "pd.set_option('display.max_colwidth', 0)\n",
35954
  "clean_data.head()"
35955
  ]
35956
  },
@@ -36094,6 +36091,7 @@
36094
  "source": [
36095
  "import data_storage\n",
36096
  "import importlib\n",
 
36097
  "importlib.reload(data_storage)\n",
36098
  "\n",
36099
  "\n",
@@ -36101,10 +36099,11 @@
36101
  "\n",
36102
  "max_results = 20000\n",
36103
  "offset = 0\n",
36104
- "data.load_from_query(query_string='cat:math.AP',\n",
36105
- " max_results=max_results,\n",
36106
- " offset=offset,\n",
36107
- " )\n",
 
36108
  "data.data"
36109
  ]
36110
  },
@@ -36123,16 +36122,20 @@
36123
  ],
36124
  "source": [
36125
  "import arxiv\n",
36126
- "from datetime import datetime , timedelta , timezone\n",
36127
  "\n",
36128
  "\n",
36129
- "search = arxiv.Search(query='cat:math.AP', max_results=1e3,sort_by=arxiv.SortCriterion.LastUpdatedDate, sort_order=arxiv.SortOrder.Descending)\n",
 
 
 
 
 
36130
  "\n",
36131
  "for result in search.results():\n",
36132
  " if result.updated < datetime.now(timezone.utc) - timedelta(days=2):\n",
36133
- " print(result.title,result.updated)\n",
36134
- " break\n",
36135
- "\n"
36136
  ]
36137
  },
36138
  {
@@ -36151,7 +36154,7 @@
36151
  "source": [
36152
  "##\n",
36153
  "oldest = list(search.results())[-1]\n",
36154
- "print(oldest.updated)\n"
36155
  ]
36156
  },
36157
  {
@@ -36174,10 +36177,828 @@
36174
  },
36175
  {
36176
  "cell_type": "code",
36177
- "execution_count": null,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
36178
  "metadata": {},
36179
  "outputs": [],
36180
- "source": []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
36181
  }
36182
  ],
36183
  "metadata": {
 
6
  "metadata": {},
7
  "outputs": [],
8
  "source": [
9
+ "from util import format_query, query_to_df\n",
10
  "import pandas as pd\n",
11
  "import numpy as np"
12
  ]
 
19
  "source": [
20
  "## Try collecting data for PDE articles\n",
21
  "\n",
22
+ "max_results = 3e3\n",
23
+ "cat = \"math.AP\"\n",
24
  "query = format_query(cat=cat)\n",
25
+ "pde = query_to_df(query=query, max_results=max_results)"
26
  ]
27
  },
28
  {
 
66
  "## Find the msc subject tags within the categories\n",
67
  "import regex\n",
68
  "\n",
69
+ "\n",
70
  "def find_msc(cat_list):\n",
71
+ " pattern = r\"\\b\\d{2}[0-9a-zA-Z]{3}\\b\"\n",
72
  " out = []\n",
73
  " for cat in cat_list:\n",
74
+ " tags = regex.findall(pattern, cat)\n",
75
  " for tag in tags:\n",
76
  " out.append(tag)\n",
77
  " if out == []:\n",
78
  " return None\n",
79
  " else:\n",
80
+ " return out"
 
 
81
  ]
82
  },
83
  {
 
88
  "source": [
89
  "## Now create a new column for msc tags\n",
90
  "\n",
91
+ "pde[\"msc_tags\"] = pde.categories.apply(find_msc)"
92
  ]
93
  },
94
  {
 
118
  "\n",
119
  "msc = pde.msc_tags.sample(10)\n",
120
  "for tag in msc:\n",
121
+ " print(tag)"
122
  ]
123
  },
124
  {
 
138
  "## What fraction of these articles have at least one MSC tag?\n",
139
  "\n",
140
  "tagged = pde.msc_tags.count()\n",
141
+ "fraction = tagged / len(pde)\n",
142
+ "print(fraction)"
 
143
  ]
144
  },
145
  {
 
151
  "## Now we are going to see if we can extract the MSC codes using HTML parsing with BeautifulSoup\n",
152
  "\n",
153
  "from bs4 import BeautifulSoup\n",
154
+ "import requests\n",
155
  "\n",
156
+ "url = r\"https://cran.r-project.org/web/classifications/MSC.html\"\n",
157
  "\n",
158
+ "source = requests.get(url)"
159
  ]
160
  },
161
  {
 
175
  }
176
  ],
177
  "source": [
178
+ "source.headers[\"content-type\"]"
179
  ]
180
  },
181
  {
 
19289
  }
19290
  ],
19291
  "source": [
19292
+ "soup = BeautifulSoup(document, \"html.parser\")\n",
19293
  "print(soup.prettify())"
19294
  ]
19295
  },
 
19336
  "import PyPDF2\n",
19337
  "import regex\n",
19338
  "\n",
19339
+ "with open(\"msc2020.pdf\", \"rb\") as file:\n",
 
19340
  " reader = PyPDF2.PdfReader(file)\n",
19341
  " print(len(reader.pages))\n",
19342
  " page = reader.pages[0]\n",
19343
  " raw_text = page.extract_text()\n",
19344
  "\n",
19345
+ " lines = raw_text.split(\"\\n\")\n",
19346
  "\n",
19347
  " subject_dict = {}\n",
19348
  " for line in lines[2:]:\n",
19349
  " subject_dict[str(line[:2])] = line[2:]\n",
19350
  "\n",
19351
  "\n",
19352
+ "subject_dict[\"44\"] = \"Integral transforms, operational calculus\"\n",
19353
+ "subject_dict[\"45\"] = \"Integral equations\"\n",
19354
  "\n",
19355
  "for k in subject_dict.keys():\n",
19356
+ " subject_dict[k] = regex.sub(r\"\\x0b\", \"ff\", subject_dict[k])\n",
19357
  "\n",
19358
  "print(subject_dict)"
19359
  ]
 
19372
  }
19373
  ],
19374
  "source": [
19375
+ "subject_dict.pop(\"1\")\n",
19376
  "print(subject_dict)"
19377
  ]
19378
  },
 
19387
  "import json\n",
19388
  "\n",
19389
  "json_subjects = json.dumps(subject_dict)\n",
19390
+ "with open(\"./data/msc_subjects.json\", \"w+\") as file:\n",
19391
+ " file.write(json_subjects)"
 
19392
  ]
19393
  },
19394
  {
 
19472
  "source": [
19473
  "import util\n",
19474
  "import importlib\n",
19475
+ "\n",
19476
  "importlib.reload(util)\n",
19477
  "\n",
19478
  "util.msc_subjects()"
 
19486
  "source": [
19487
  "## Next we make a dictionary consisting of all other subject tags\n",
19488
  "\n",
19489
+ "with open(\"msc2020.pdf\", \"rb\") as file:\n",
 
19490
  " reader = PyPDF2.PdfReader(file)\n",
19491
  " page = reader.pages[3]\n",
19492
+ " raw_text = page.extract_text()"
19493
  ]
19494
  },
19495
  {
 
19548
  "source": [
19549
  "## Try splitting on a pattern \\d\\d[A-Z]xx\n",
19550
  "\n",
19551
+ "pattern = r\"\\b\\d\\d[A-Z]xx\\b\"\n",
19552
+ "splitting = regex.split(pattern, raw_text)\n",
19553
  "# for line in splitting:\n",
19554
+ "# print(line + 'END')\n",
19555
  "\n",
19556
  "print(splitting[1])\n",
19557
  "\n",
19558
+ "## Within each of these, find all text between two instances of the pattern\n",
19559
+ "tag_pattern = r\"(\\b\\d\\d[A-Z]\\d\\d)\\s(.*)(?!\\b\\d\\d[A-Z]\\d\\d\\b)\"\n",
19560
  "\n",
19561
+ "patterns = regex.findall(tag_pattern, splitting[1])"
19562
  ]
19563
  },
19564
  {
 
19644
  "## Turn this into a dict\n",
19645
  "dict = {}\n",
19646
  "for item in patterns:\n",
19647
+ " k, v = item\n",
19648
  " dict[k] = v\n",
19649
  "\n",
19650
  "for item in dict.items():\n",
 
19657
  "metadata": {},
19658
  "outputs": [],
19659
  "source": [
19660
+ "## Within each of these, find all text between two instances of the pattern\n",
19661
+ "tag_pattern = r\"(\\b\\d\\d[A-Z]\\d\\d)\\s(.*)(?!\\b\\d\\d[A-Z]\\d\\d\\b)\"\n",
 
19662
  "dict = {}\n",
19663
  "\n",
19664
+ "with open(\"msc2020.pdf\", \"rb\") as file:\n",
 
19665
  " reader = PyPDF2.PdfReader(file)\n",
19666
  " for page in reader.pages:\n",
19667
  " page_text = page.extract_text()\n",
19668
  "\n",
19669
+ " ## Find all the msc tags\n",
19670
  "\n",
19671
  " tags = regex.findall(tag_pattern, page_text)\n",
19672
+ "\n",
19673
  " for item in tags:\n",
19674
+ " k, v = item\n",
19675
+ " dict[k] = v"
19676
  ]
19677
  },
19678
  {
 
20731
  "source": [
20732
  "def clean_msc_dict(dict):\n",
20733
  " for item in dict.items():\n",
20734
+ " k, v = item\n",
20735
+ " v = regex.sub(r\"\\x0c\", \"fi\", v)\n",
20736
+ " v = regex.sub(r\"\\x0b\", \"ff\", v)\n",
20737
+ " v = regex.sub(r\"\\r\", \"fl\", v)\n",
20738
+ " v = regex.sub(r\"\\xf7 \", \"\", v)\n",
20739
+ " v = regex.sub(r\"\\x0e\", \"ffi\", v)\n",
20740
+ " v = regex.sub(r\"\\x13\", \"\", v)\n",
20741
  " dict[k] = v\n",
20742
+ " return dict"
20743
  ]
20744
  },
20745
  {
 
31660
  "outputs": [],
31661
  "source": [
31662
  "import pandas as pd\n",
31663
+ "\n",
31664
+ "codes = pd.read_csv(\"./data/MSC_2020.csv\", encoding=\"windows-1252\", on_bad_lines=\"skip\")"
31665
  ]
31666
  },
31667
  {
 
31744
  "source": [
31745
  "## Look for all entries that start with a valid msc tag pattern\n",
31746
  "\n",
31747
+ "pattern = \"\\d\\d[A-Z]\\d\\d\"\n",
31748
+ "\n",
31749
  "\n",
31750
  "def check_valid(entry):\n",
31751
+ " if regex.match(pattern, entry):\n",
31752
  " return True\n",
31753
  " else:\n",
31754
  " return False\n",
31755
  "\n",
31756
+ "\n",
31757
+ "valid_codes = codes.loc[codes[\"code\\ttext\\tdescription\"].apply(check_valid)]"
31758
  ]
31759
  },
31760
  {
 
31870
  "source": [
31871
  "dict = {}\n",
31872
  "\n",
31873
+ "for entry in valid_codes[\"code\\ttext\\tdescription\"]:\n",
31874
+ " split = entry.split(\"\\t\")\n",
31875
  " code = split[0]\n",
31876
  " desc = split[1][1:-1]\n",
31877
+ " dict[code] = desc"
31878
  ]
31879
  },
31880
  {
 
33917
  "## Good, but we can improve it by removing the '\\\\(' and '\\\\)' characters\n",
33918
  "\n",
33919
  "for item in dict.items():\n",
33920
+ " k, v = item\n",
33921
+ " v = v.replace(\"\\\\(\", \"\")\n",
33922
+ " v = v.replace(\"\\\\)\", \"\")\n",
33923
  " dict[k] = v\n",
33924
  "\n",
33925
  "dict"
 
33960
  "dict_stripped_accents = {}\n",
33961
  "\n",
33962
  "for item in dict.items():\n",
33963
+ " k, v = item\n",
33964
+ " dict_stripped_accents[k] = unidecode(v)"
33965
  ]
33966
  },
33967
  {
 
33970
  "metadata": {},
33971
  "outputs": [],
33972
  "source": [
33973
+ "with open(\"./data/msc.json\", \"w\") as file:\n",
33974
  " json = json.dumps(dict_stripped_accents)\n",
33975
  " file.write(json)"
33976
  ]
 
35007
  "\n",
35008
  "lib = Library()\n",
35009
  "\n",
35010
+ "lib.load_from_query(query_string=\"cat:math.AP\", max_results=5000)"
35011
  ]
35012
  },
35013
  {
 
35383
  "outputs": [],
35384
  "source": [
35385
  "import importlib\n",
35386
+ "\n",
35387
  "importlib.reload(util)\n",
35388
  "\n",
35389
  "lib.clean_library()"
 
35657
  "from library_class import Library\n",
35658
  "\n",
35659
  "lib = Library()\n",
35660
+ "lib.load_from_query(query_string=\"cat:math.AP OR math.SP\", max_results=2e4)"
 
 
35661
  ]
35662
  },
35663
  {
 
35667
  "outputs": [],
35668
  "source": [
35669
  "raw_lib = lib.raw_lib\n",
35670
+ "raw_lib.to_parquet(\"./data/APSP.parquet\")"
35671
  ]
35672
  },
35673
  {
 
35678
  "source": [
35679
  "## Is the list information preserved?\n",
35680
  "\n",
35681
+ "df = pd.read_parquet(\"./data/APSP.parquet\")"
35682
  ]
35683
  },
35684
  {
 
35804
  "metadata": {},
35805
  "outputs": [],
35806
  "source": [
35807
+ "pd.set_option(\"display.max_colwidth\", 0)"
35808
  ]
35809
  },
35810
  {
 
35816
  "from cleaning import cleaning\n",
35817
  "import pandas as pd\n",
35818
  "import importlib\n",
35819
+ "\n",
35820
  "importlib.reload(cleaning)\n",
35821
  "\n",
35822
+ "data = pd.read_parquet(\"./data/APSP.parquet\")\n",
35823
  "\n",
35824
  "clean_data = cleaning.main(\n",
35825
+ " raw_arxiv_results=data, path_to_embeddings=\"./data/APSP_mini_vec.parquet\"\n",
35826
+ ")"
35827
  ]
35828
  },
35829
  {
 
35947
  }
35948
  ],
35949
  "source": [
35950
+ "pd.set_option(\"display.max_colwidth\", 0)\n",
35951
  "clean_data.head()"
35952
  ]
35953
  },
 
36091
  "source": [
36092
  "import data_storage\n",
36093
  "import importlib\n",
36094
+ "\n",
36095
  "importlib.reload(data_storage)\n",
36096
  "\n",
36097
  "\n",
 
36099
  "\n",
36100
  "max_results = 20000\n",
36101
  "offset = 0\n",
36102
+ "data.load_from_query(\n",
36103
+ " query_string=\"cat:math.AP\",\n",
36104
+ " max_results=max_results,\n",
36105
+ " offset=offset,\n",
36106
+ ")\n",
36107
  "data.data"
36108
  ]
36109
  },
 
36122
  ],
36123
  "source": [
36124
  "import arxiv\n",
36125
+ "from datetime import datetime, timedelta, timezone\n",
36126
  "\n",
36127
  "\n",
36128
+ "search = arxiv.Search(\n",
36129
+ " query=\"cat:math.AP\",\n",
36130
+ " max_results=1e3,\n",
36131
+ " sort_by=arxiv.SortCriterion.LastUpdatedDate,\n",
36132
+ " sort_order=arxiv.SortOrder.Descending,\n",
36133
+ ")\n",
36134
  "\n",
36135
  "for result in search.results():\n",
36136
  " if result.updated < datetime.now(timezone.utc) - timedelta(days=2):\n",
36137
+ " print(result.title, result.updated)\n",
36138
+ " break"
 
36139
  ]
36140
  },
36141
  {
 
36154
  "source": [
36155
  "##\n",
36156
  "oldest = list(search.results())[-1]\n",
36157
+ "print(oldest.updated)"
36158
  ]
36159
  },
36160
  {
 
36177
  },
36178
  {
36179
  "cell_type": "code",
36180
+ "execution_count": 256,
36181
+ "metadata": {},
36182
+ "outputs": [],
36183
+ "source": [
36184
+ "import data_storage\n",
36185
+ "import data_cleaning\n",
36186
+ "from data_storage import ArXivData\n",
36187
+ "import importlib\n",
36188
+ "\n",
36189
+ "importlib.reload(data_storage)\n",
36190
+ "importlib.reload(data_cleaning)\n",
36191
+ "import numpy as np"
36192
+ ]
36193
+ },
36194
+ {
36195
+ "cell_type": "code",
36196
+ "execution_count": 257,
36197
+ "metadata": {},
36198
+ "outputs": [],
36199
+ "source": [
36200
+ "data = ArXivData()"
36201
+ ]
36202
+ },
36203
+ {
36204
+ "cell_type": "code",
36205
+ "execution_count": 258,
36206
+ "metadata": {},
36207
+ "outputs": [],
36208
+ "source": [
36209
+ "data.load_from_query(query=\"cat:math.AP\", max_results=100, raw=True)"
36210
+ ]
36211
+ },
36212
+ {
36213
+ "cell_type": "code",
36214
+ "execution_count": 259,
36215
+ "metadata": {},
36216
+ "outputs": [
36217
+ {
36218
+ "data": {
36219
+ "text/html": [
36220
+ "<div>\n",
36221
+ "<style scoped>\n",
36222
+ " .dataframe tbody tr th:only-of-type {\n",
36223
+ " vertical-align: middle;\n",
36224
+ " }\n",
36225
+ "\n",
36226
+ " .dataframe tbody tr th {\n",
36227
+ " vertical-align: top;\n",
36228
+ " }\n",
36229
+ "\n",
36230
+ " .dataframe thead th {\n",
36231
+ " text-align: right;\n",
36232
+ " }\n",
36233
+ "</style>\n",
36234
+ "<table border=\"1\" class=\"dataframe\">\n",
36235
+ " <thead>\n",
36236
+ " <tr style=\"text-align: right;\">\n",
36237
+ " <th></th>\n",
36238
+ " <th>title</th>\n",
36239
+ " <th>summary</th>\n",
36240
+ " <th>categories</th>\n",
36241
+ " <th>id</th>\n",
36242
+ " </tr>\n",
36243
+ " </thead>\n",
36244
+ " <tbody>\n",
36245
+ " <tr>\n",
36246
+ " <th>0</th>\n",
36247
+ " <td>Future stability of expanding spatially homoge...</td>\n",
36248
+ " <td>Spatially homogeneous FLRW solutions constitut...</td>\n",
36249
+ " <td>[gr-qc, math-ph, math.AP, math.DG, math.MP]</td>\n",
36250
+ " <td>2306.17774v1</td>\n",
36251
+ " </tr>\n",
36252
+ " <tr>\n",
36253
+ " <th>1</th>\n",
36254
+ " <td>Autonomous and asymptotically quasiconvex func...</td>\n",
36255
+ " <td>We obtain local regularity for minimizers of a...</td>\n",
36256
+ " <td>[math.AP, 35J47, 35B65, 46E30]</td>\n",
36257
+ " <td>2306.17768v1</td>\n",
36258
+ " </tr>\n",
36259
+ " <tr>\n",
36260
+ " <th>2</th>\n",
36261
+ " <td>A Coefficient Inverse Problem for the Mean Fie...</td>\n",
36262
+ " <td>A Coefficient Inverse Problem (CIP) of the det...</td>\n",
36263
+ " <td>[math.AP]</td>\n",
36264
+ " <td>2306.03349v2</td>\n",
36265
+ " </tr>\n",
36266
+ " <tr>\n",
36267
+ " <th>3</th>\n",
36268
+ " <td>Nonuniqueness results for constant sixth order...</td>\n",
36269
+ " <td>We prove nonuniqueness results for constant si...</td>\n",
36270
+ " <td>[math.DG, math.AP, 35J60, 35B09, 35J30, 35B40,...</td>\n",
36271
+ " <td>2306.00679v2</td>\n",
36272
+ " </tr>\n",
36273
+ " <tr>\n",
36274
+ " <th>4</th>\n",
36275
+ " <td>Asymptotic limits of the principal spectrum po...</td>\n",
36276
+ " <td>This work examines the limits of the principal...</td>\n",
36277
+ " <td>[math.AP, math.DS, 92D40, 92D50, 35P15, 35K57]</td>\n",
36278
+ " <td>2306.17734v1</td>\n",
36279
+ " </tr>\n",
36280
+ " <tr>\n",
36281
+ " <th>...</th>\n",
36282
+ " <td>...</td>\n",
36283
+ " <td>...</td>\n",
36284
+ " <td>...</td>\n",
36285
+ " <td>...</td>\n",
36286
+ " </tr>\n",
36287
+ " <tr>\n",
36288
+ " <th>95</th>\n",
36289
+ " <td>Quantization of the Energy for the inhomogeneo...</td>\n",
36290
+ " <td>We consider the varifold associated to the All...</td>\n",
36291
+ " <td>[math.DG, math.AP, 53E99]</td>\n",
36292
+ " <td>2302.00137v2</td>\n",
36293
+ " </tr>\n",
36294
+ " <tr>\n",
36295
+ " <th>96</th>\n",
36296
+ " <td>Second order estimates for transition layers a...</td>\n",
36297
+ " <td>The parabolic Allen-Cahn equation is a semilin...</td>\n",
36298
+ " <td>[math.DG, math.AP, 53E99]</td>\n",
36299
+ " <td>2003.11886v3</td>\n",
36300
+ " </tr>\n",
36301
+ " <tr>\n",
36302
+ " <th>97</th>\n",
36303
+ " <td>Well-Posedness and Stability Analysis of an Ep...</td>\n",
36304
+ " <td>A compartment epidemic model for infectious di...</td>\n",
36305
+ " <td>[math.AP]</td>\n",
36306
+ " <td>2212.10137v2</td>\n",
36307
+ " </tr>\n",
36308
+ " <tr>\n",
36309
+ " <th>98</th>\n",
36310
+ " <td>Multiple positive solutions for a double phase...</td>\n",
36311
+ " <td>In this paper, we study a class of double phas...</td>\n",
36312
+ " <td>[math.AP, math.FA, 05J50, 03H10, 35D30]</td>\n",
36313
+ " <td>2306.01319v2</td>\n",
36314
+ " </tr>\n",
36315
+ " <tr>\n",
36316
+ " <th>99</th>\n",
36317
+ " <td>Stabilization of the wave equation on larger-d...</td>\n",
36318
+ " <td>This paper deals with uniform stabilization of...</td>\n",
36319
+ " <td>[math.AP, 93C20 (Primary) 35A27 (Secondary)]</td>\n",
36320
+ " <td>2303.03733v3</td>\n",
36321
+ " </tr>\n",
36322
+ " </tbody>\n",
36323
+ "</table>\n",
36324
+ "<p>100 rows × 4 columns</p>\n",
36325
+ "</div>"
36326
+ ],
36327
+ "text/plain": [
36328
+ " title \\\n",
36329
+ "0 Future stability of expanding spatially homoge... \n",
36330
+ "1 Autonomous and asymptotically quasiconvex func... \n",
36331
+ "2 A Coefficient Inverse Problem for the Mean Fie... \n",
36332
+ "3 Nonuniqueness results for constant sixth order... \n",
36333
+ "4 Asymptotic limits of the principal spectrum po... \n",
36334
+ ".. ... \n",
36335
+ "95 Quantization of the Energy for the inhomogeneo... \n",
36336
+ "96 Second order estimates for transition layers a... \n",
36337
+ "97 Well-Posedness and Stability Analysis of an Ep... \n",
36338
+ "98 Multiple positive solutions for a double phase... \n",
36339
+ "99 Stabilization of the wave equation on larger-d... \n",
36340
+ "\n",
36341
+ " summary \\\n",
36342
+ "0 Spatially homogeneous FLRW solutions constitut... \n",
36343
+ "1 We obtain local regularity for minimizers of a... \n",
36344
+ "2 A Coefficient Inverse Problem (CIP) of the det... \n",
36345
+ "3 We prove nonuniqueness results for constant si... \n",
36346
+ "4 This work examines the limits of the principal... \n",
36347
+ ".. ... \n",
36348
+ "95 We consider the varifold associated to the All... \n",
36349
+ "96 The parabolic Allen-Cahn equation is a semilin... \n",
36350
+ "97 A compartment epidemic model for infectious di... \n",
36351
+ "98 In this paper, we study a class of double phas... \n",
36352
+ "99 This paper deals with uniform stabilization of... \n",
36353
+ "\n",
36354
+ " categories id \n",
36355
+ "0 [gr-qc, math-ph, math.AP, math.DG, math.MP] 2306.17774v1 \n",
36356
+ "1 [math.AP, 35J47, 35B65, 46E30] 2306.17768v1 \n",
36357
+ "2 [math.AP] 2306.03349v2 \n",
36358
+ "3 [math.DG, math.AP, 35J60, 35B09, 35J30, 35B40,... 2306.00679v2 \n",
36359
+ "4 [math.AP, math.DS, 92D40, 92D50, 35P15, 35K57] 2306.17734v1 \n",
36360
+ ".. ... ... \n",
36361
+ "95 [math.DG, math.AP, 53E99] 2302.00137v2 \n",
36362
+ "96 [math.DG, math.AP, 53E99] 2003.11886v3 \n",
36363
+ "97 [math.AP] 2212.10137v2 \n",
36364
+ "98 [math.AP, math.FA, 05J50, 03H10, 35D30] 2306.01319v2 \n",
36365
+ "99 [math.AP, 93C20 (Primary) 35A27 (Secondary)] 2303.03733v3 \n",
36366
+ "\n",
36367
+ "[100 rows x 4 columns]"
36368
+ ]
36369
+ },
36370
+ "execution_count": 259,
36371
+ "metadata": {},
36372
+ "output_type": "execute_result"
36373
+ }
36374
+ ],
36375
+ "source": [
36376
+ "data._returned_metadata"
36377
+ ]
36378
+ },
36379
+ {
36380
+ "cell_type": "code",
36381
+ "execution_count": 260,
36382
+ "metadata": {},
36383
+ "outputs": [
36384
+ {
36385
+ "data": {
36386
+ "text/html": [
36387
+ "<div>\n",
36388
+ "<style scoped>\n",
36389
+ " .dataframe tbody tr th:only-of-type {\n",
36390
+ " vertical-align: middle;\n",
36391
+ " }\n",
36392
+ "\n",
36393
+ " .dataframe tbody tr th {\n",
36394
+ " vertical-align: top;\n",
36395
+ " }\n",
36396
+ "\n",
36397
+ " .dataframe thead th {\n",
36398
+ " text-align: right;\n",
36399
+ " }\n",
36400
+ "</style>\n",
36401
+ "<table border=\"1\" class=\"dataframe\">\n",
36402
+ " <thead>\n",
36403
+ " <tr style=\"text-align: right;\">\n",
36404
+ " <th></th>\n",
36405
+ " <th>title</th>\n",
36406
+ " <th>summary</th>\n",
36407
+ " <th>id</th>\n",
36408
+ " <th>msc_tags</th>\n",
36409
+ " </tr>\n",
36410
+ " </thead>\n",
36411
+ " <tbody>\n",
36412
+ " <tr>\n",
36413
+ " <th>0</th>\n",
36414
+ " <td>Future stability of expanding spatially homoge...</td>\n",
36415
+ " <td>Spatially homogeneous FLRW solutions constitut...</td>\n",
36416
+ " <td>2306.17774v1</td>\n",
36417
+ " <td>NaN</td>\n",
36418
+ " </tr>\n",
36419
+ " <tr>\n",
36420
+ " <th>1</th>\n",
36421
+ " <td>Autonomous and asymptotically quasiconvex func...</td>\n",
36422
+ " <td>We obtain local regularity for minimizers of a...</td>\n",
36423
+ " <td>2306.17768v1</td>\n",
36424
+ " <td>[35J47, 35B65, 46E30]</td>\n",
36425
+ " </tr>\n",
36426
+ " <tr>\n",
36427
+ " <th>2</th>\n",
36428
+ " <td>A Coefficient Inverse Problem for the Mean Fie...</td>\n",
36429
+ " <td>A Coefficient Inverse Problem (CIP) of the det...</td>\n",
36430
+ " <td>2306.03349v2</td>\n",
36431
+ " <td>NaN</td>\n",
36432
+ " </tr>\n",
36433
+ " <tr>\n",
36434
+ " <th>3</th>\n",
36435
+ " <td>Nonuniqueness results for constant sixth order...</td>\n",
36436
+ " <td>We prove nonuniqueness results for constant si...</td>\n",
36437
+ " <td>2306.00679v2</td>\n",
36438
+ " <td>[35J60, 35B09, 35J30, 35B40, 53C18, 34C23, 58J55]</td>\n",
36439
+ " </tr>\n",
36440
+ " <tr>\n",
36441
+ " <th>4</th>\n",
36442
+ " <td>Asymptotic limits of the principal spectrum po...</td>\n",
36443
+ " <td>This work examines the limits of the principal...</td>\n",
36444
+ " <td>2306.17734v1</td>\n",
36445
+ " <td>[92D40, 92D50, 35P15, 35K57]</td>\n",
36446
+ " </tr>\n",
36447
+ " <tr>\n",
36448
+ " <th>...</th>\n",
36449
+ " <td>...</td>\n",
36450
+ " <td>...</td>\n",
36451
+ " <td>...</td>\n",
36452
+ " <td>...</td>\n",
36453
+ " </tr>\n",
36454
+ " <tr>\n",
36455
+ " <th>95</th>\n",
36456
+ " <td>Quantization of the Energy for the inhomogeneo...</td>\n",
36457
+ " <td>We consider the varifold associated to the All...</td>\n",
36458
+ " <td>2302.00137v2</td>\n",
36459
+ " <td>[53E99]</td>\n",
36460
+ " </tr>\n",
36461
+ " <tr>\n",
36462
+ " <th>96</th>\n",
36463
+ " <td>Second order estimates for transition layers a...</td>\n",
36464
+ " <td>The parabolic Allen-Cahn equation is a semilin...</td>\n",
36465
+ " <td>2003.11886v3</td>\n",
36466
+ " <td>[53E99]</td>\n",
36467
+ " </tr>\n",
36468
+ " <tr>\n",
36469
+ " <th>97</th>\n",
36470
+ " <td>Well-Posedness and Stability Analysis of an Ep...</td>\n",
36471
+ " <td>A compartment epidemic model for infectious di...</td>\n",
36472
+ " <td>2212.10137v2</td>\n",
36473
+ " <td>NaN</td>\n",
36474
+ " </tr>\n",
36475
+ " <tr>\n",
36476
+ " <th>98</th>\n",
36477
+ " <td>Multiple positive solutions for a double phase...</td>\n",
36478
+ " <td>In this paper, we study a class of double phas...</td>\n",
36479
+ " <td>2306.01319v2</td>\n",
36480
+ " <td>[05J50, 03H10, 35D30]</td>\n",
36481
+ " </tr>\n",
36482
+ " <tr>\n",
36483
+ " <th>99</th>\n",
36484
+ " <td>Stabilization of the wave equation on larger-d...</td>\n",
36485
+ " <td>This paper deals with uniform stabilization of...</td>\n",
36486
+ " <td>2303.03733v3</td>\n",
36487
+ " <td>[93C20, 35A27]</td>\n",
36488
+ " </tr>\n",
36489
+ " </tbody>\n",
36490
+ "</table>\n",
36491
+ "<p>100 rows × 4 columns</p>\n",
36492
+ "</div>"
36493
+ ],
36494
+ "text/plain": [
36495
+ " title \\\n",
36496
+ "0 Future stability of expanding spatially homoge... \n",
36497
+ "1 Autonomous and asymptotically quasiconvex func... \n",
36498
+ "2 A Coefficient Inverse Problem for the Mean Fie... \n",
36499
+ "3 Nonuniqueness results for constant sixth order... \n",
36500
+ "4 Asymptotic limits of the principal spectrum po... \n",
36501
+ ".. ... \n",
36502
+ "95 Quantization of the Energy for the inhomogeneo... \n",
36503
+ "96 Second order estimates for transition layers a... \n",
36504
+ "97 Well-Posedness and Stability Analysis of an Ep... \n",
36505
+ "98 Multiple positive solutions for a double phase... \n",
36506
+ "99 Stabilization of the wave equation on larger-d... \n",
36507
+ "\n",
36508
+ " summary id \\\n",
36509
+ "0 Spatially homogeneous FLRW solutions constitut... 2306.17774v1 \n",
36510
+ "1 We obtain local regularity for minimizers of a... 2306.17768v1 \n",
36511
+ "2 A Coefficient Inverse Problem (CIP) of the det... 2306.03349v2 \n",
36512
+ "3 We prove nonuniqueness results for constant si... 2306.00679v2 \n",
36513
+ "4 This work examines the limits of the principal... 2306.17734v1 \n",
36514
+ ".. ... ... \n",
36515
+ "95 We consider the varifold associated to the All... 2302.00137v2 \n",
36516
+ "96 The parabolic Allen-Cahn equation is a semilin... 2003.11886v3 \n",
36517
+ "97 A compartment epidemic model for infectious di... 2212.10137v2 \n",
36518
+ "98 In this paper, we study a class of double phas... 2306.01319v2 \n",
36519
+ "99 This paper deals with uniform stabilization of... 2303.03733v3 \n",
36520
+ "\n",
36521
+ " msc_tags \n",
36522
+ "0 NaN \n",
36523
+ "1 [35J47, 35B65, 46E30] \n",
36524
+ "2 NaN \n",
36525
+ "3 [35J60, 35B09, 35J30, 35B40, 53C18, 34C23, 58J55] \n",
36526
+ "4 [92D40, 92D50, 35P15, 35K57] \n",
36527
+ ".. ... \n",
36528
+ "95 [53E99] \n",
36529
+ "96 [53E99] \n",
36530
+ "97 NaN \n",
36531
+ "98 [05J50, 03H10, 35D30] \n",
36532
+ "99 [93C20, 35A27] \n",
36533
+ "\n",
36534
+ "[100 rows x 4 columns]"
36535
+ ]
36536
+ },
36537
+ "execution_count": 260,
36538
+ "metadata": {},
36539
+ "output_type": "execute_result"
36540
+ }
36541
+ ],
36542
+ "source": [
36543
+ "data.load_from_query(query=\"cat:math.AP\", max_results=100)\n",
36544
+ "data.metadata"
36545
+ ]
36546
+ },
36547
+ {
36548
+ "cell_type": "code",
36549
+ "execution_count": 261,
36550
+ "metadata": {},
36551
+ "outputs": [
36552
+ {
36553
+ "data": {
36554
+ "text/html": [
36555
+ "<div>\n",
36556
+ "<style scoped>\n",
36557
+ " .dataframe tbody tr th:only-of-type {\n",
36558
+ " vertical-align: middle;\n",
36559
+ " }\n",
36560
+ "\n",
36561
+ " .dataframe tbody tr th {\n",
36562
+ " vertical-align: top;\n",
36563
+ " }\n",
36564
+ "\n",
36565
+ " .dataframe thead th {\n",
36566
+ " text-align: right;\n",
36567
+ " }\n",
36568
+ "</style>\n",
36569
+ "<table border=\"1\" class=\"dataframe\">\n",
36570
+ " <thead>\n",
36571
+ " <tr style=\"text-align: right;\">\n",
36572
+ " <th></th>\n",
36573
+ " <th>Materials Science</th>\n",
36574
+ " <th>Soft Condensed Matter</th>\n",
36575
+ " <th>Numerical Analysis</th>\n",
36576
+ " <th>General Relativity and Quantum Cosmology</th>\n",
36577
+ " <th>Mathematical Physics</th>\n",
36578
+ " <th>Analysis of PDEs</th>\n",
36579
+ " <th>Classical Analysis and ODEs</th>\n",
36580
+ " <th>Differential Geometry</th>\n",
36581
+ " <th>Dynamical Systems</th>\n",
36582
+ " <th>Functional Analysis</th>\n",
36583
+ " <th>...</th>\n",
36584
+ " <th>Optimization and Control</th>\n",
36585
+ " <th>Probability</th>\n",
36586
+ " <th>Spectral Theory</th>\n",
36587
+ " <th>Pattern Formation and Solitons</th>\n",
36588
+ " <th>Biological Physics</th>\n",
36589
+ " <th>Fluid Dynamics</th>\n",
36590
+ " <th>Optics</th>\n",
36591
+ " <th>Cell Behavior</th>\n",
36592
+ " <th>Populations and Evolution</th>\n",
36593
+ " <th>Tissues and Organs</th>\n",
36594
+ " </tr>\n",
36595
+ " </thead>\n",
36596
+ " <tbody>\n",
36597
+ " <tr>\n",
36598
+ " <th>0</th>\n",
36599
+ " <td>0</td>\n",
36600
+ " <td>0</td>\n",
36601
+ " <td>0</td>\n",
36602
+ " <td>1</td>\n",
36603
+ " <td>1</td>\n",
36604
+ " <td>1</td>\n",
36605
+ " <td>0</td>\n",
36606
+ " <td>1</td>\n",
36607
+ " <td>0</td>\n",
36608
+ " <td>0</td>\n",
36609
+ " <td>...</td>\n",
36610
+ " <td>0</td>\n",
36611
+ " <td>0</td>\n",
36612
+ " <td>0</td>\n",
36613
+ " <td>0</td>\n",
36614
+ " <td>0</td>\n",
36615
+ " <td>0</td>\n",
36616
+ " <td>0</td>\n",
36617
+ " <td>0</td>\n",
36618
+ " <td>0</td>\n",
36619
+ " <td>0</td>\n",
36620
+ " </tr>\n",
36621
+ " <tr>\n",
36622
+ " <th>1</th>\n",
36623
+ " <td>0</td>\n",
36624
+ " <td>0</td>\n",
36625
+ " <td>0</td>\n",
36626
+ " <td>0</td>\n",
36627
+ " <td>0</td>\n",
36628
+ " <td>1</td>\n",
36629
+ " <td>0</td>\n",
36630
+ " <td>0</td>\n",
36631
+ " <td>0</td>\n",
36632
+ " <td>0</td>\n",
36633
+ " <td>...</td>\n",
36634
+ " <td>0</td>\n",
36635
+ " <td>0</td>\n",
36636
+ " <td>0</td>\n",
36637
+ " <td>0</td>\n",
36638
+ " <td>0</td>\n",
36639
+ " <td>0</td>\n",
36640
+ " <td>0</td>\n",
36641
+ " <td>0</td>\n",
36642
+ " <td>0</td>\n",
36643
+ " <td>0</td>\n",
36644
+ " </tr>\n",
36645
+ " <tr>\n",
36646
+ " <th>2</th>\n",
36647
+ " <td>0</td>\n",
36648
+ " <td>0</td>\n",
36649
+ " <td>0</td>\n",
36650
+ " <td>0</td>\n",
36651
+ " <td>0</td>\n",
36652
+ " <td>1</td>\n",
36653
+ " <td>0</td>\n",
36654
+ " <td>0</td>\n",
36655
+ " <td>0</td>\n",
36656
+ " <td>0</td>\n",
36657
+ " <td>...</td>\n",
36658
+ " <td>0</td>\n",
36659
+ " <td>0</td>\n",
36660
+ " <td>0</td>\n",
36661
+ " <td>0</td>\n",
36662
+ " <td>0</td>\n",
36663
+ " <td>0</td>\n",
36664
+ " <td>0</td>\n",
36665
+ " <td>0</td>\n",
36666
+ " <td>0</td>\n",
36667
+ " <td>0</td>\n",
36668
+ " </tr>\n",
36669
+ " <tr>\n",
36670
+ " <th>3</th>\n",
36671
+ " <td>0</td>\n",
36672
+ " <td>0</td>\n",
36673
+ " <td>0</td>\n",
36674
+ " <td>0</td>\n",
36675
+ " <td>0</td>\n",
36676
+ " <td>1</td>\n",
36677
+ " <td>0</td>\n",
36678
+ " <td>1</td>\n",
36679
+ " <td>0</td>\n",
36680
+ " <td>0</td>\n",
36681
+ " <td>...</td>\n",
36682
+ " <td>0</td>\n",
36683
+ " <td>0</td>\n",
36684
+ " <td>0</td>\n",
36685
+ " <td>0</td>\n",
36686
+ " <td>0</td>\n",
36687
+ " <td>0</td>\n",
36688
+ " <td>0</td>\n",
36689
+ " <td>0</td>\n",
36690
+ " <td>0</td>\n",
36691
+ " <td>0</td>\n",
36692
+ " </tr>\n",
36693
+ " <tr>\n",
36694
+ " <th>4</th>\n",
36695
+ " <td>0</td>\n",
36696
+ " <td>0</td>\n",
36697
+ " <td>0</td>\n",
36698
+ " <td>0</td>\n",
36699
+ " <td>0</td>\n",
36700
+ " <td>1</td>\n",
36701
+ " <td>0</td>\n",
36702
+ " <td>0</td>\n",
36703
+ " <td>1</td>\n",
36704
+ " <td>0</td>\n",
36705
+ " <td>...</td>\n",
36706
+ " <td>0</td>\n",
36707
+ " <td>0</td>\n",
36708
+ " <td>0</td>\n",
36709
+ " <td>0</td>\n",
36710
+ " <td>0</td>\n",
36711
+ " <td>0</td>\n",
36712
+ " <td>0</td>\n",
36713
+ " <td>0</td>\n",
36714
+ " <td>0</td>\n",
36715
+ " <td>0</td>\n",
36716
+ " </tr>\n",
36717
+ " <tr>\n",
36718
+ " <th>...</th>\n",
36719
+ " <td>...</td>\n",
36720
+ " <td>...</td>\n",
36721
+ " <td>...</td>\n",
36722
+ " <td>...</td>\n",
36723
+ " <td>...</td>\n",
36724
+ " <td>...</td>\n",
36725
+ " <td>...</td>\n",
36726
+ " <td>...</td>\n",
36727
+ " <td>...</td>\n",
36728
+ " <td>...</td>\n",
36729
+ " <td>...</td>\n",
36730
+ " <td>...</td>\n",
36731
+ " <td>...</td>\n",
36732
+ " <td>...</td>\n",
36733
+ " <td>...</td>\n",
36734
+ " <td>...</td>\n",
36735
+ " <td>...</td>\n",
36736
+ " <td>...</td>\n",
36737
+ " <td>...</td>\n",
36738
+ " <td>...</td>\n",
36739
+ " <td>...</td>\n",
36740
+ " </tr>\n",
36741
+ " <tr>\n",
36742
+ " <th>95</th>\n",
36743
+ " <td>0</td>\n",
36744
+ " <td>0</td>\n",
36745
+ " <td>0</td>\n",
36746
+ " <td>0</td>\n",
36747
+ " <td>0</td>\n",
36748
+ " <td>1</td>\n",
36749
+ " <td>0</td>\n",
36750
+ " <td>1</td>\n",
36751
+ " <td>0</td>\n",
36752
+ " <td>0</td>\n",
36753
+ " <td>...</td>\n",
36754
+ " <td>0</td>\n",
36755
+ " <td>0</td>\n",
36756
+ " <td>0</td>\n",
36757
+ " <td>0</td>\n",
36758
+ " <td>0</td>\n",
36759
+ " <td>0</td>\n",
36760
+ " <td>0</td>\n",
36761
+ " <td>0</td>\n",
36762
+ " <td>0</td>\n",
36763
+ " <td>0</td>\n",
36764
+ " </tr>\n",
36765
+ " <tr>\n",
36766
+ " <th>96</th>\n",
36767
+ " <td>0</td>\n",
36768
+ " <td>0</td>\n",
36769
+ " <td>0</td>\n",
36770
+ " <td>0</td>\n",
36771
+ " <td>0</td>\n",
36772
+ " <td>1</td>\n",
36773
+ " <td>0</td>\n",
36774
+ " <td>1</td>\n",
36775
+ " <td>0</td>\n",
36776
+ " <td>0</td>\n",
36777
+ " <td>...</td>\n",
36778
+ " <td>0</td>\n",
36779
+ " <td>0</td>\n",
36780
+ " <td>0</td>\n",
36781
+ " <td>0</td>\n",
36782
+ " <td>0</td>\n",
36783
+ " <td>0</td>\n",
36784
+ " <td>0</td>\n",
36785
+ " <td>0</td>\n",
36786
+ " <td>0</td>\n",
36787
+ " <td>0</td>\n",
36788
+ " </tr>\n",
36789
+ " <tr>\n",
36790
+ " <th>97</th>\n",
36791
+ " <td>0</td>\n",
36792
+ " <td>0</td>\n",
36793
+ " <td>0</td>\n",
36794
+ " <td>0</td>\n",
36795
+ " <td>0</td>\n",
36796
+ " <td>1</td>\n",
36797
+ " <td>0</td>\n",
36798
+ " <td>0</td>\n",
36799
+ " <td>0</td>\n",
36800
+ " <td>0</td>\n",
36801
+ " <td>...</td>\n",
36802
+ " <td>0</td>\n",
36803
+ " <td>0</td>\n",
36804
+ " <td>0</td>\n",
36805
+ " <td>0</td>\n",
36806
+ " <td>0</td>\n",
36807
+ " <td>0</td>\n",
36808
+ " <td>0</td>\n",
36809
+ " <td>0</td>\n",
36810
+ " <td>0</td>\n",
36811
+ " <td>0</td>\n",
36812
+ " </tr>\n",
36813
+ " <tr>\n",
36814
+ " <th>98</th>\n",
36815
+ " <td>0</td>\n",
36816
+ " <td>0</td>\n",
36817
+ " <td>0</td>\n",
36818
+ " <td>0</td>\n",
36819
+ " <td>0</td>\n",
36820
+ " <td>1</td>\n",
36821
+ " <td>0</td>\n",
36822
+ " <td>0</td>\n",
36823
+ " <td>0</td>\n",
36824
+ " <td>1</td>\n",
36825
+ " <td>...</td>\n",
36826
+ " <td>0</td>\n",
36827
+ " <td>0</td>\n",
36828
+ " <td>0</td>\n",
36829
+ " <td>0</td>\n",
36830
+ " <td>0</td>\n",
36831
+ " <td>0</td>\n",
36832
+ " <td>0</td>\n",
36833
+ " <td>0</td>\n",
36834
+ " <td>0</td>\n",
36835
+ " <td>0</td>\n",
36836
+ " </tr>\n",
36837
+ " <tr>\n",
36838
+ " <th>99</th>\n",
36839
+ " <td>0</td>\n",
36840
+ " <td>0</td>\n",
36841
+ " <td>0</td>\n",
36842
+ " <td>0</td>\n",
36843
+ " <td>0</td>\n",
36844
+ " <td>1</td>\n",
36845
+ " <td>0</td>\n",
36846
+ " <td>0</td>\n",
36847
+ " <td>0</td>\n",
36848
+ " <td>0</td>\n",
36849
+ " <td>...</td>\n",
36850
+ " <td>0</td>\n",
36851
+ " <td>0</td>\n",
36852
+ " <td>0</td>\n",
36853
+ " <td>0</td>\n",
36854
+ " <td>0</td>\n",
36855
+ " <td>0</td>\n",
36856
+ " <td>0</td>\n",
36857
+ " <td>0</td>\n",
36858
+ " <td>0</td>\n",
36859
+ " <td>0</td>\n",
36860
+ " </tr>\n",
36861
+ " </tbody>\n",
36862
+ "</table>\n",
36863
+ "<p>100 rows × 21 columns</p>\n",
36864
+ "</div>"
36865
+ ],
36866
+ "text/plain": [
36867
+ " Materials Science Soft Condensed Matter Numerical Analysis \\\n",
36868
+ "0 0 0 0 \n",
36869
+ "1 0 0 0 \n",
36870
+ "2 0 0 0 \n",
36871
+ "3 0 0 0 \n",
36872
+ "4 0 0 0 \n",
36873
+ ".. ... ... ... \n",
36874
+ "95 0 0 0 \n",
36875
+ "96 0 0 0 \n",
36876
+ "97 0 0 0 \n",
36877
+ "98 0 0 0 \n",
36878
+ "99 0 0 0 \n",
36879
+ "\n",
36880
+ " General Relativity and Quantum Cosmology Mathematical Physics \\\n",
36881
+ "0 1 1 \n",
36882
+ "1 0 0 \n",
36883
+ "2 0 0 \n",
36884
+ "3 0 0 \n",
36885
+ "4 0 0 \n",
36886
+ ".. ... ... \n",
36887
+ "95 0 0 \n",
36888
+ "96 0 0 \n",
36889
+ "97 0 0 \n",
36890
+ "98 0 0 \n",
36891
+ "99 0 0 \n",
36892
+ "\n",
36893
+ " Analysis of PDEs Classical Analysis and ODEs Differential Geometry \\\n",
36894
+ "0 1 0 1 \n",
36895
+ "1 1 0 0 \n",
36896
+ "2 1 0 0 \n",
36897
+ "3 1 0 1 \n",
36898
+ "4 1 0 0 \n",
36899
+ ".. ... ... ... \n",
36900
+ "95 1 0 1 \n",
36901
+ "96 1 0 1 \n",
36902
+ "97 1 0 0 \n",
36903
+ "98 1 0 0 \n",
36904
+ "99 1 0 0 \n",
36905
+ "\n",
36906
+ " Dynamical Systems Functional Analysis ... Optimization and Control \\\n",
36907
+ "0 0 0 ... 0 \n",
36908
+ "1 0 0 ... 0 \n",
36909
+ "2 0 0 ... 0 \n",
36910
+ "3 0 0 ... 0 \n",
36911
+ "4 1 0 ... 0 \n",
36912
+ ".. ... ... ... ... \n",
36913
+ "95 0 0 ... 0 \n",
36914
+ "96 0 0 ... 0 \n",
36915
+ "97 0 0 ... 0 \n",
36916
+ "98 0 1 ... 0 \n",
36917
+ "99 0 0 ... 0 \n",
36918
+ "\n",
36919
+ " Probability Spectral Theory Pattern Formation and Solitons \\\n",
36920
+ "0 0 0 0 \n",
36921
+ "1 0 0 0 \n",
36922
+ "2 0 0 0 \n",
36923
+ "3 0 0 0 \n",
36924
+ "4 0 0 0 \n",
36925
+ ".. ... ... ... \n",
36926
+ "95 0 0 0 \n",
36927
+ "96 0 0 0 \n",
36928
+ "97 0 0 0 \n",
36929
+ "98 0 0 0 \n",
36930
+ "99 0 0 0 \n",
36931
+ "\n",
36932
+ " Biological Physics Fluid Dynamics Optics Cell Behavior \\\n",
36933
+ "0 0 0 0 0 \n",
36934
+ "1 0 0 0 0 \n",
36935
+ "2 0 0 0 0 \n",
36936
+ "3 0 0 0 0 \n",
36937
+ "4 0 0 0 0 \n",
36938
+ ".. ... ... ... ... \n",
36939
+ "95 0 0 0 0 \n",
36940
+ "96 0 0 0 0 \n",
36941
+ "97 0 0 0 0 \n",
36942
+ "98 0 0 0 0 \n",
36943
+ "99 0 0 0 0 \n",
36944
+ "\n",
36945
+ " Populations and Evolution Tissues and Organs \n",
36946
+ "0 0 0 \n",
36947
+ "1 0 0 \n",
36948
+ "2 0 0 \n",
36949
+ "3 0 0 \n",
36950
+ "4 0 0 \n",
36951
+ ".. ... ... \n",
36952
+ "95 0 0 \n",
36953
+ "96 0 0 \n",
36954
+ "97 0 0 \n",
36955
+ "98 0 0 \n",
36956
+ "99 0 0 \n",
36957
+ "\n",
36958
+ "[100 rows x 21 columns]"
36959
+ ]
36960
+ },
36961
+ "execution_count": 261,
36962
+ "metadata": {},
36963
+ "output_type": "execute_result"
36964
+ }
36965
+ ],
36966
+ "source": [
36967
+ "data.arxiv_subjects"
36968
+ ]
36969
+ },
36970
+ {
36971
+ "cell_type": "code",
36972
+ "execution_count": 156,
36973
  "metadata": {},
36974
  "outputs": [],
36975
+ "source": [
36976
+ "x = []\n",
36977
+ "\n",
36978
+ "if x:\n",
36979
+ " y = x"
36980
+ ]
36981
+ },
36982
+ {
36983
+ "cell_type": "code",
36984
+ "execution_count": 157,
36985
+ "metadata": {},
36986
+ "outputs": [
36987
+ {
36988
+ "ename": "NameError",
36989
+ "evalue": "name 'y' is not defined",
36990
+ "output_type": "error",
36991
+ "traceback": [
36992
+ "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
36993
+ "\u001b[1;31mNameError\u001b[0m Traceback (most recent call last)",
36994
+ "Cell \u001b[1;32mIn[157], line 1\u001b[0m\n\u001b[1;32m----> 1\u001b[0m y\n",
36995
+ "\u001b[1;31mNameError\u001b[0m: name 'y' is not defined"
36996
+ ]
36997
+ }
36998
+ ],
36999
+ "source": [
37000
+ "y"
37001
+ ]
37002
  }
37003
  ],
37004
  "metadata": {
data_cleaning.py CHANGED
@@ -2,6 +2,7 @@ import regex
2
  import pandas as pd
3
  import json
4
  import sentence_transformers.util
 
5
  import os
6
 
7
 
@@ -196,35 +197,25 @@ def category_map():
196
  }
197
 
198
 
199
- def split_categories_by_row(raw_metadata_row):
200
- """Takes in row of a dataframe returned by an arxiv query search, returns a tuple with the list
201
- of arXiv subject tags in the first slot, msc_tags in the second slot.
 
202
 
203
- Args:
204
- raw_metadata_row: row of a dataframe returned by an arXiv query request
205
 
206
- Returns:
207
- (x , y): x and y are lists; x is a list of arxiv subjects, y is a list of msc_tags.
208
- """
209
- categories = raw_metadata_row.categories
210
- expanded_categories = pd.Series(categories)
211
- arxiv_subject_labels = category_map()
212
 
213
- if expanded_categories.isin(arxiv_subject_labels.keys()).all():
214
- return (raw_metadata_row.categories, None)
215
- else:
216
- msc_tags = find_msc(raw_metadata_row.categories[-1])
217
- return (raw_metadata_row.categories[:-2], msc_tags)
218
 
 
219
 
220
- def extract_tags(raw_metadata, arxiv_tag):
221
- split_categories = raw_metadata.apply(split_categories_by_row, axis=0)
222
 
223
- flag = 1
224
- if arxiv_tag:
225
- flag = 0
226
 
227
- return split_categories.apply(lambda x: x[flag])
228
 
229
 
230
  ## 1. Latin-ize latex accents enclosed in brackets
@@ -285,9 +276,9 @@ def find_hyph(text):
285
 
286
 
287
  def find_msc(msc_string):
288
- pattern = r"\b\d{2}[0-9a-zA-Z]{3}\b"
289
- tags = regex.findall(pattern, msc_string)
290
- return tags
291
 
292
 
293
  def msc_tags():
 
2
  import pandas as pd
3
  import json
4
  import sentence_transformers.util
5
+ import numpy as np
6
  import os
7
 
8
 
 
197
  }
198
 
199
 
200
def extract_arxiv_subjects(raw_metadata):
    """Keep, for every row, only the categories that are known arXiv subject tags.

    Args:
        raw_metadata: dataframe with a ``categories`` column whose entries are
            iterables of category strings.

    Returns:
        The result of applying the filter to ``raw_metadata.categories``: one
        list per row containing the entries that are keys of
        ``category_map()``, in their original order.
    """
    # The lookup table does not depend on the row, so build it once rather
    # than once per row inside the callback.
    arxiv_subject_labels = category_map()

    def get_arxiv_subjects_from_cats(categories):
        return [tag for tag in categories if tag in arxiv_subject_labels]

    return raw_metadata.categories.apply(get_arxiv_subjects_from_cats)
 
206
 
 
 
 
 
 
 
207
 
208
def extract_msc_tags(raw_metadata):
    """Extract five-character MSC tags from the last category entry of each row.

    Only the final entry of each row's ``categories`` is searched with
    ``find_msc``; earlier entries are ignored.

    Args:
        raw_metadata: dataframe with a ``categories`` column whose entries are
            sequences of category strings.

    Returns:
        The result of applying the extraction to ``raw_metadata.categories``:
        per row, the list of tags found, or ``np.nan`` when no tag is found or
        the row has no categories at all.
    """

    def get_msc_tags_from_cats(categories):
        # An empty category list has no last entry to inspect; treat it the
        # same as "no tags found" instead of raising IndexError.
        if len(categories) == 0:
            return np.nan

        tags = find_msc(categories[-1])
        return tags if tags else np.nan

    return raw_metadata.categories.apply(get_msc_tags_from_cats)
 
216
 
 
 
 
217
 
218
+ #### LATEX CLEANING UTILITIES
219
 
220
 
221
  ## 1. Latin-ize latex accents enclosed in brackets
 
276
 
277
 
278
  def find_msc(msc_string):
279
+ five_digit_pattern = r"\b\d{2}[0-9a-zA-Z]{3}\b"
280
+ five_digit_tags = regex.findall(five_digit_pattern, msc_string)
281
+ return five_digit_tags
282
 
283
 
284
  def msc_tags():
data_storage.py CHANGED
@@ -20,13 +20,19 @@ class ArXivData:
20
  self.arxiv_subjects = self.get_OHE_arxiv_subjects(self._returned_metadata)
21
  self.metadata = self._returned_metadata.drop(columns=["arxiv_subjects"])
22
 
23
- def load_from_query(self, query_string, max_results, offset=0):
24
- self._returned_metadata = query_to_df(
25
- query=query_string, max_results=max_results, offset=offset
26
- )
 
27
 
28
- self.metadata = self._returned_metadata.drop(columns="arxiv_subjects")
29
- self.arxiv_subjects = self.get_OHE_arxiv_subjects(self._returned_metadata)
 
 
 
 
 
30
 
31
  def clean(self, dataset):
32
  """Constructs this dataset by cleaning another one.
@@ -39,15 +45,20 @@ class ArXivData:
39
  self.raw = dataset.raw
40
  self.categories = dataset.categories
41
 
42
- def get_OHE_arxiv_subjects(returned_metadata):
43
  mlb = MultiLabelBinarizer()
44
 
45
  OHE_arxiv_subjects_array = mlb.fit_transform(returned_metadata.arxiv_subjects)
46
  arxiv_subject_labels = clean.category_map()
47
 
48
- return pd.DataFrame(OHE_arxiv_subjects_array, columns=mlb.classes_).rename(
49
- columns=arxiv_subject_labels
50
- )
 
 
 
 
 
51
 
52
 
53
  def format_query(author="", title="", cat="", abstract=""):
@@ -72,7 +83,7 @@ def format_query(author="", title="", cat="", abstract=""):
72
  return query
73
 
74
 
75
- def query_to_df(query, max_results, offset):
76
  """Returns the results of an arxiv API query in a pandas dataframe.
77
 
78
  Args:
@@ -116,9 +127,10 @@ def query_to_df(query, max_results, offset):
116
  raw_metadata = pd.DataFrame(metadata_generator, columns=columns, index=index)
117
 
118
  returned_metadata = raw_metadata.copy().drop(columns=["categories"])
119
- returned_metadata["arxiv_subjects"] = clean.extract_tags(
120
- raw_metadata, arxiv_tag=True
121
- )
122
- returned_metadata["msc_tags"] = clean.extract_tags(raw_metadata, arxiv_tag=False)
 
123
 
124
  return returned_metadata
 
20
  self.arxiv_subjects = self.get_OHE_arxiv_subjects(self._returned_metadata)
21
  self.metadata = self._returned_metadata.drop(columns=["arxiv_subjects"])
22
 
23
+ def load_from_query(self, query, max_results, offset=0, raw=False):
24
+ if raw:
25
+ self._returned_metadata = query_to_df(
26
+ query=query, max_results=max_results, offset=offset, raw=True
27
+ )
28
 
29
+ else:
30
+ self._returned_metadata = query_to_df(
31
+ query=query, max_results=max_results, offset=offset
32
+ )
33
+
34
+ self.metadata = self._returned_metadata.drop(columns="arxiv_subjects")
35
+ self.arxiv_subjects = self.get_OHE_arxiv_subjects(self._returned_metadata)
36
 
37
  def clean(self, dataset):
38
  """Constructs this dataset by cleaning another one.
 
45
  self.raw = dataset.raw
46
  self.categories = dataset.categories
47
 
48
+ def get_OHE_arxiv_subjects(self, returned_metadata):
49
  mlb = MultiLabelBinarizer()
50
 
51
  OHE_arxiv_subjects_array = mlb.fit_transform(returned_metadata.arxiv_subjects)
52
  arxiv_subject_labels = clean.category_map()
53
 
54
+ OHE_arxiv_subjects = pd.DataFrame(
55
+ OHE_arxiv_subjects_array, columns=mlb.classes_
56
+ ).rename(columns=arxiv_subject_labels)
57
+
58
+ ## Remove duplicated columns
59
+ return OHE_arxiv_subjects.loc[
60
+ :, ~OHE_arxiv_subjects.columns.duplicated()
61
+ ].copy()
62
 
63
 
64
  def format_query(author="", title="", cat="", abstract=""):
 
83
  return query
84
 
85
 
86
+ def query_to_df(query, max_results, offset, raw=False):
87
  """Returns the results of an arxiv API query in a pandas dataframe.
88
 
89
  Args:
 
127
  raw_metadata = pd.DataFrame(metadata_generator, columns=columns, index=index)
128
 
129
  returned_metadata = raw_metadata.copy().drop(columns=["categories"])
130
+ returned_metadata["arxiv_subjects"] = clean.extract_arxiv_subjects(raw_metadata)
131
+ returned_metadata["msc_tags"] = clean.extract_msc_tags(raw_metadata)
132
+
133
+ if raw:
134
+ return raw_metadata
135
 
136
  return returned_metadata