Spaces:
Runtime error
Runtime error
Michael-Geis
commited on
Commit
•
fcfd917
1
Parent(s):
cbdef5e
updated load_from_query in data_storage, added to data cleaning
Browse files- collection.ipynb +913 -92
- data_cleaning.py +16 -25
- data_storage.py +27 -15
collection.ipynb
CHANGED
@@ -6,7 +6,7 @@
|
|
6 |
"metadata": {},
|
7 |
"outputs": [],
|
8 |
"source": [
|
9 |
-
"from util import format_query
|
10 |
"import pandas as pd\n",
|
11 |
"import numpy as np"
|
12 |
]
|
@@ -19,10 +19,10 @@
|
|
19 |
"source": [
|
20 |
"## Try collection data for pde articles\n",
|
21 |
"\n",
|
22 |
-
"max_results=3e3\n",
|
23 |
-
"cat=
|
24 |
"query = format_query(cat=cat)\n",
|
25 |
-
"pde = query_to_df(query=query,max_results=max_results)
|
26 |
]
|
27 |
},
|
28 |
{
|
@@ -66,19 +66,18 @@
|
|
66 |
"## Find the msc subject tags within the categories\n",
|
67 |
"import regex\n",
|
68 |
"\n",
|
|
|
69 |
"def find_msc(cat_list):\n",
|
70 |
-
" pattern = r
|
71 |
" out = []\n",
|
72 |
" for cat in cat_list:\n",
|
73 |
-
" tags = regex.findall(pattern,cat)\n",
|
74 |
" for tag in tags:\n",
|
75 |
" out.append(tag)\n",
|
76 |
" if out == []:\n",
|
77 |
" return None\n",
|
78 |
" else:\n",
|
79 |
-
" return out
|
80 |
-
"\n",
|
81 |
-
" "
|
82 |
]
|
83 |
},
|
84 |
{
|
@@ -89,7 +88,7 @@
|
|
89 |
"source": [
|
90 |
"## Now create a new column for msc tags\n",
|
91 |
"\n",
|
92 |
-
"pde[
|
93 |
]
|
94 |
},
|
95 |
{
|
@@ -119,7 +118,7 @@
|
|
119 |
"\n",
|
120 |
"msc = pde.msc_tags.sample(10)\n",
|
121 |
"for tag in msc:\n",
|
122 |
-
" print(tag)
|
123 |
]
|
124 |
},
|
125 |
{
|
@@ -139,9 +138,8 @@
|
|
139 |
"## what fraction of these articles has non-zero msc tags?\n",
|
140 |
"\n",
|
141 |
"tagged = pde.msc_tags.count()\n",
|
142 |
-
"fraction = tagged/len(pde)\n",
|
143 |
-
"print(fraction)
|
144 |
-
"\n"
|
145 |
]
|
146 |
},
|
147 |
{
|
@@ -153,11 +151,11 @@
|
|
153 |
"## Now we are going to see if we can extract the MSC codes using xml parsing with beautifulsoup\n",
|
154 |
"\n",
|
155 |
"from bs4 import BeautifulSoup\n",
|
156 |
-
"import requests
|
157 |
"\n",
|
158 |
-
"url = r
|
159 |
"\n",
|
160 |
-
"source = requests.get(url)
|
161 |
]
|
162 |
},
|
163 |
{
|
@@ -177,7 +175,7 @@
|
|
177 |
}
|
178 |
],
|
179 |
"source": [
|
180 |
-
"source.headers[
|
181 |
]
|
182 |
},
|
183 |
{
|
@@ -19291,7 +19289,7 @@
|
|
19291 |
}
|
19292 |
],
|
19293 |
"source": [
|
19294 |
-
"soup = BeautifulSoup(document,
|
19295 |
"print(soup.prettify())"
|
19296 |
]
|
19297 |
},
|
@@ -19338,25 +19336,24 @@
|
|
19338 |
"import PyPDF2\n",
|
19339 |
"import regex\n",
|
19340 |
"\n",
|
19341 |
-
"with open(
|
19342 |
-
"\n",
|
19343 |
" reader = PyPDF2.PdfReader(file)\n",
|
19344 |
" print(len(reader.pages))\n",
|
19345 |
" page = reader.pages[0]\n",
|
19346 |
" raw_text = page.extract_text()\n",
|
19347 |
"\n",
|
19348 |
-
" lines = raw_text.split(
|
19349 |
"\n",
|
19350 |
" subject_dict = {}\n",
|
19351 |
" for line in lines[2:]:\n",
|
19352 |
" subject_dict[str(line[:2])] = line[2:]\n",
|
19353 |
"\n",
|
19354 |
"\n",
|
19355 |
-
"subject_dict[
|
19356 |
-
"subject_dict[
|
19357 |
"\n",
|
19358 |
"for k in subject_dict.keys():\n",
|
19359 |
-
" subject_dict[k] = regex.sub(r
|
19360 |
"\n",
|
19361 |
"print(subject_dict)"
|
19362 |
]
|
@@ -19375,7 +19372,7 @@
|
|
19375 |
}
|
19376 |
],
|
19377 |
"source": [
|
19378 |
-
"subject_dict.pop(
|
19379 |
"print(subject_dict)"
|
19380 |
]
|
19381 |
},
|
@@ -19390,9 +19387,8 @@
|
|
19390 |
"import json\n",
|
19391 |
"\n",
|
19392 |
"json_subjects = json.dumps(subject_dict)\n",
|
19393 |
-
"with open(
|
19394 |
-
" file.write(json_subjects)
|
19395 |
-
" "
|
19396 |
]
|
19397 |
},
|
19398 |
{
|
@@ -19476,6 +19472,7 @@
|
|
19476 |
"source": [
|
19477 |
"import util\n",
|
19478 |
"import importlib\n",
|
|
|
19479 |
"importlib.reload(util)\n",
|
19480 |
"\n",
|
19481 |
"util.msc_subjects()"
|
@@ -19489,11 +19486,10 @@
|
|
19489 |
"source": [
|
19490 |
"## Next we make a dictionary consisting of all other subject tags\n",
|
19491 |
"\n",
|
19492 |
-
"with open(
|
19493 |
-
"\n",
|
19494 |
" reader = PyPDF2.PdfReader(file)\n",
|
19495 |
" page = reader.pages[3]\n",
|
19496 |
-
" raw_text = page.extract_text()
|
19497 |
]
|
19498 |
},
|
19499 |
{
|
@@ -19552,17 +19548,17 @@
|
|
19552 |
"source": [
|
19553 |
"## Try splitting on a pattern \\d\\d[A-Z]xx\n",
|
19554 |
"\n",
|
19555 |
-
"pattern = r
|
19556 |
-
"splitting = regex.split(pattern,raw_text)\n",
|
19557 |
"# for line in splitting:\n",
|
19558 |
-
"
|
19559 |
"\n",
|
19560 |
"print(splitting[1])\n",
|
19561 |
"\n",
|
19562 |
-
"## Within each of these, find all text between two instances of the pattern
|
19563 |
-
"tag_pattern = r
|
19564 |
"\n",
|
19565 |
-
"patterns = regex.findall(tag_pattern, splitting[1])
|
19566 |
]
|
19567 |
},
|
19568 |
{
|
@@ -19648,7 +19644,7 @@
|
|
19648 |
"## Turn this into a dict\n",
|
19649 |
"dict = {}\n",
|
19650 |
"for item in patterns:\n",
|
19651 |
-
" k
|
19652 |
" dict[k] = v\n",
|
19653 |
"\n",
|
19654 |
"for item in dict.items():\n",
|
@@ -19661,24 +19657,22 @@
|
|
19661 |
"metadata": {},
|
19662 |
"outputs": [],
|
19663 |
"source": [
|
19664 |
-
"\n",
|
19665 |
-
"
|
19666 |
-
"tag_pattern = r'(\\b\\d\\d[A-Z]\\d\\d)\\s(.*)(?!\\b\\d\\d[A-Z]\\d\\d\\b)'\n",
|
19667 |
"dict = {}\n",
|
19668 |
"\n",
|
19669 |
-
"with open(
|
19670 |
-
"\n",
|
19671 |
" reader = PyPDF2.PdfReader(file)\n",
|
19672 |
" for page in reader.pages:\n",
|
19673 |
" page_text = page.extract_text()\n",
|
19674 |
"\n",
|
19675 |
-
" ## Find all the msc tags
|
19676 |
"\n",
|
19677 |
" tags = regex.findall(tag_pattern, page_text)\n",
|
19678 |
-
"
|
19679 |
" for item in tags:\n",
|
19680 |
-
" k
|
19681 |
-
" dict[k] = v
|
19682 |
]
|
19683 |
},
|
19684 |
{
|
@@ -20737,15 +20731,15 @@
|
|
20737 |
"source": [
|
20738 |
"def clean_msc_dict(dict):\n",
|
20739 |
" for item in dict.items():\n",
|
20740 |
-
" k
|
20741 |
-
" v = regex.sub(r
|
20742 |
-
" v = regex.sub(r
|
20743 |
-
" v = regex.sub(r
|
20744 |
-
" v = regex.sub(r
|
20745 |
-
" v = regex.sub(r
|
20746 |
-
" v = regex.sub(r
|
20747 |
" dict[k] = v\n",
|
20748 |
-
" return dict
|
20749 |
]
|
20750 |
},
|
20751 |
{
|
@@ -31666,7 +31660,8 @@
|
|
31666 |
"outputs": [],
|
31667 |
"source": [
|
31668 |
"import pandas as pd\n",
|
31669 |
-
"
|
|
|
31670 |
]
|
31671 |
},
|
31672 |
{
|
@@ -31749,15 +31744,17 @@
|
|
31749 |
"source": [
|
31750 |
"## Look for all entries that start with a valid msc tag pattern\n",
|
31751 |
"\n",
|
31752 |
-
"pattern =
|
|
|
31753 |
"\n",
|
31754 |
"def check_valid(entry):\n",
|
31755 |
-
" if regex.match(pattern,entry):\n",
|
31756 |
" return True\n",
|
31757 |
" else:\n",
|
31758 |
" return False\n",
|
31759 |
"\n",
|
31760 |
-
"
|
|
|
31761 |
]
|
31762 |
},
|
31763 |
{
|
@@ -31873,11 +31870,11 @@
|
|
31873 |
"source": [
|
31874 |
"dict = {}\n",
|
31875 |
"\n",
|
31876 |
-
"for entry in valid_codes[
|
31877 |
-
" split = entry.split(
|
31878 |
" code = split[0]\n",
|
31879 |
" desc = split[1][1:-1]\n",
|
31880 |
-
" dict[code] = desc
|
31881 |
]
|
31882 |
},
|
31883 |
{
|
@@ -33920,9 +33917,9 @@
|
|
33920 |
"## Good, but we can improve it by removing the '\\\\(' and '\\\\)' characters\n",
|
33921 |
"\n",
|
33922 |
"for item in dict.items():\n",
|
33923 |
-
" k
|
33924 |
-
" v = v.replace(
|
33925 |
-
" v = v.replace(
|
33926 |
" dict[k] = v\n",
|
33927 |
"\n",
|
33928 |
"dict"
|
@@ -33963,8 +33960,8 @@
|
|
33963 |
"dict_stripped_accents = {}\n",
|
33964 |
"\n",
|
33965 |
"for item in dict.items():\n",
|
33966 |
-
" k
|
33967 |
-
" dict_stripped_accents[k] = unidecode(v)
|
33968 |
]
|
33969 |
},
|
33970 |
{
|
@@ -33973,7 +33970,7 @@
|
|
33973 |
"metadata": {},
|
33974 |
"outputs": [],
|
33975 |
"source": [
|
33976 |
-
"with open(
|
33977 |
" json = json.dumps(dict_stripped_accents)\n",
|
33978 |
" file.write(json)"
|
33979 |
]
|
@@ -35010,7 +35007,7 @@
|
|
35010 |
"\n",
|
35011 |
"lib = Library()\n",
|
35012 |
"\n",
|
35013 |
-
"lib.load_from_query(query_string
|
35014 |
]
|
35015 |
},
|
35016 |
{
|
@@ -35386,6 +35383,7 @@
|
|
35386 |
"outputs": [],
|
35387 |
"source": [
|
35388 |
"import importlib\n",
|
|
|
35389 |
"importlib.reload(util)\n",
|
35390 |
"\n",
|
35391 |
"lib.clean_library()"
|
@@ -35659,9 +35657,7 @@
|
|
35659 |
"from library_class import Library\n",
|
35660 |
"\n",
|
35661 |
"lib = Library()\n",
|
35662 |
-
"lib.load_from_query(query_string
|
35663 |
-
"\n",
|
35664 |
-
"\n"
|
35665 |
]
|
35666 |
},
|
35667 |
{
|
@@ -35671,7 +35667,7 @@
|
|
35671 |
"outputs": [],
|
35672 |
"source": [
|
35673 |
"raw_lib = lib.raw_lib\n",
|
35674 |
-
"raw_lib.to_parquet(
|
35675 |
]
|
35676 |
},
|
35677 |
{
|
@@ -35682,7 +35678,7 @@
|
|
35682 |
"source": [
|
35683 |
"## Is the list information preserved?\n",
|
35684 |
"\n",
|
35685 |
-
"df = pd.read_parquet(
|
35686 |
]
|
35687 |
},
|
35688 |
{
|
@@ -35808,7 +35804,7 @@
|
|
35808 |
"metadata": {},
|
35809 |
"outputs": [],
|
35810 |
"source": [
|
35811 |
-
"pd.set_option(
|
35812 |
]
|
35813 |
},
|
35814 |
{
|
@@ -35820,13 +35816,14 @@
|
|
35820 |
"from cleaning import cleaning\n",
|
35821 |
"import pandas as pd\n",
|
35822 |
"import importlib\n",
|
|
|
35823 |
"importlib.reload(cleaning)\n",
|
35824 |
"\n",
|
35825 |
-
"data = pd.read_parquet(
|
35826 |
"\n",
|
35827 |
"clean_data = cleaning.main(\n",
|
35828 |
-
" raw_arxiv_results=data,path_to_embeddings
|
35829 |
-
")
|
35830 |
]
|
35831 |
},
|
35832 |
{
|
@@ -35950,7 +35947,7 @@
|
|
35950 |
}
|
35951 |
],
|
35952 |
"source": [
|
35953 |
-
"pd.set_option(
|
35954 |
"clean_data.head()"
|
35955 |
]
|
35956 |
},
|
@@ -36094,6 +36091,7 @@
|
|
36094 |
"source": [
|
36095 |
"import data_storage\n",
|
36096 |
"import importlib\n",
|
|
|
36097 |
"importlib.reload(data_storage)\n",
|
36098 |
"\n",
|
36099 |
"\n",
|
@@ -36101,10 +36099,11 @@
|
|
36101 |
"\n",
|
36102 |
"max_results = 20000\n",
|
36103 |
"offset = 0\n",
|
36104 |
-
"data.load_from_query(
|
36105 |
-
"
|
36106 |
-
"
|
36107 |
-
"
|
|
|
36108 |
"data.data"
|
36109 |
]
|
36110 |
},
|
@@ -36123,16 +36122,20 @@
|
|
36123 |
],
|
36124 |
"source": [
|
36125 |
"import arxiv\n",
|
36126 |
-
"from datetime import datetime
|
36127 |
"\n",
|
36128 |
"\n",
|
36129 |
-
"search = arxiv.Search(
|
|
|
|
|
|
|
|
|
|
|
36130 |
"\n",
|
36131 |
"for result in search.results():\n",
|
36132 |
" if result.updated < datetime.now(timezone.utc) - timedelta(days=2):\n",
|
36133 |
-
" print(result.title,result.updated)\n",
|
36134 |
-
" break
|
36135 |
-
"\n"
|
36136 |
]
|
36137 |
},
|
36138 |
{
|
@@ -36151,7 +36154,7 @@
|
|
36151 |
"source": [
|
36152 |
"##\n",
|
36153 |
"oldest = list(search.results())[-1]\n",
|
36154 |
-
"print(oldest.updated)
|
36155 |
]
|
36156 |
},
|
36157 |
{
|
@@ -36174,10 +36177,828 @@
|
|
36174 |
},
|
36175 |
{
|
36176 |
"cell_type": "code",
|
36177 |
-
"execution_count":
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
36178 |
"metadata": {},
|
36179 |
"outputs": [],
|
36180 |
-
"source": [
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
36181 |
}
|
36182 |
],
|
36183 |
"metadata": {
|
|
|
6 |
"metadata": {},
|
7 |
"outputs": [],
|
8 |
"source": [
|
9 |
+
"from util import format_query, query_to_df\n",
|
10 |
"import pandas as pd\n",
|
11 |
"import numpy as np"
|
12 |
]
|
|
|
19 |
"source": [
|
20 |
"## Try collection data for pde articles\n",
|
21 |
"\n",
|
22 |
+
"max_results = 3e3\n",
|
23 |
+
"cat = \"math.AP\"\n",
|
24 |
"query = format_query(cat=cat)\n",
|
25 |
+
"pde = query_to_df(query=query, max_results=max_results)"
|
26 |
]
|
27 |
},
|
28 |
{
|
|
|
66 |
"## Find the msc subject tags within the categories\n",
|
67 |
"import regex\n",
|
68 |
"\n",
|
69 |
+
"\n",
|
70 |
"def find_msc(cat_list):\n",
|
71 |
+
" pattern = r\"\\b\\d{2}[0-9a-zA-Z]{3}\\b\"\n",
|
72 |
" out = []\n",
|
73 |
" for cat in cat_list:\n",
|
74 |
+
" tags = regex.findall(pattern, cat)\n",
|
75 |
" for tag in tags:\n",
|
76 |
" out.append(tag)\n",
|
77 |
" if out == []:\n",
|
78 |
" return None\n",
|
79 |
" else:\n",
|
80 |
+
" return out"
|
|
|
|
|
81 |
]
|
82 |
},
|
83 |
{
|
|
|
88 |
"source": [
|
89 |
"## Now create a new column for msc tags\n",
|
90 |
"\n",
|
91 |
+
"pde[\"msc_tags\"] = pde.categories.apply(find_msc)"
|
92 |
]
|
93 |
},
|
94 |
{
|
|
|
118 |
"\n",
|
119 |
"msc = pde.msc_tags.sample(10)\n",
|
120 |
"for tag in msc:\n",
|
121 |
+
" print(tag)"
|
122 |
]
|
123 |
},
|
124 |
{
|
|
|
138 |
"## what fraction of these articles has non-zero msc tags?\n",
|
139 |
"\n",
|
140 |
"tagged = pde.msc_tags.count()\n",
|
141 |
+
"fraction = tagged / len(pde)\n",
|
142 |
+
"print(fraction)"
|
|
|
143 |
]
|
144 |
},
|
145 |
{
|
|
|
151 |
"## Now we are going to see if we can extract the MSC codes using xml parsing with beautifulsoup\n",
|
152 |
"\n",
|
153 |
"from bs4 import BeautifulSoup\n",
|
154 |
+
"import requests\n",
|
155 |
"\n",
|
156 |
+
"url = r\"https://cran.r-project.org/web/classifications/MSC.html\"\n",
|
157 |
"\n",
|
158 |
+
"source = requests.get(url)"
|
159 |
]
|
160 |
},
|
161 |
{
|
|
|
175 |
}
|
176 |
],
|
177 |
"source": [
|
178 |
+
"source.headers[\"content-type\"]"
|
179 |
]
|
180 |
},
|
181 |
{
|
|
|
19289 |
}
|
19290 |
],
|
19291 |
"source": [
|
19292 |
+
"soup = BeautifulSoup(document, \"html.parser\")\n",
|
19293 |
"print(soup.prettify())"
|
19294 |
]
|
19295 |
},
|
|
|
19336 |
"import PyPDF2\n",
|
19337 |
"import regex\n",
|
19338 |
"\n",
|
19339 |
+
"with open(\"msc2020.pdf\", \"rb\") as file:\n",
|
|
|
19340 |
" reader = PyPDF2.PdfReader(file)\n",
|
19341 |
" print(len(reader.pages))\n",
|
19342 |
" page = reader.pages[0]\n",
|
19343 |
" raw_text = page.extract_text()\n",
|
19344 |
"\n",
|
19345 |
+
" lines = raw_text.split(\"\\n\")\n",
|
19346 |
"\n",
|
19347 |
" subject_dict = {}\n",
|
19348 |
" for line in lines[2:]:\n",
|
19349 |
" subject_dict[str(line[:2])] = line[2:]\n",
|
19350 |
"\n",
|
19351 |
"\n",
|
19352 |
+
"subject_dict[\"44\"] = \"Integral transforms, operational calculus\"\n",
|
19353 |
+
"subject_dict[\"45\"] = \"Integral equations\"\n",
|
19354 |
"\n",
|
19355 |
"for k in subject_dict.keys():\n",
|
19356 |
+
" subject_dict[k] = regex.sub(r\"\\x0b\", \"ff\", subject_dict[k])\n",
|
19357 |
"\n",
|
19358 |
"print(subject_dict)"
|
19359 |
]
|
|
|
19372 |
}
|
19373 |
],
|
19374 |
"source": [
|
19375 |
+
"subject_dict.pop(\"1\")\n",
|
19376 |
"print(subject_dict)"
|
19377 |
]
|
19378 |
},
|
|
|
19387 |
"import json\n",
|
19388 |
"\n",
|
19389 |
"json_subjects = json.dumps(subject_dict)\n",
|
19390 |
+
"with open(\"./data/msc_subjects.json\", \"w+\") as file:\n",
|
19391 |
+
" file.write(json_subjects)"
|
|
|
19392 |
]
|
19393 |
},
|
19394 |
{
|
|
|
19472 |
"source": [
|
19473 |
"import util\n",
|
19474 |
"import importlib\n",
|
19475 |
+
"\n",
|
19476 |
"importlib.reload(util)\n",
|
19477 |
"\n",
|
19478 |
"util.msc_subjects()"
|
|
|
19486 |
"source": [
|
19487 |
"## Next we make a dictionary consisting of all other subject tags\n",
|
19488 |
"\n",
|
19489 |
+
"with open(\"msc2020.pdf\", \"rb\") as file:\n",
|
|
|
19490 |
" reader = PyPDF2.PdfReader(file)\n",
|
19491 |
" page = reader.pages[3]\n",
|
19492 |
+
" raw_text = page.extract_text()"
|
19493 |
]
|
19494 |
},
|
19495 |
{
|
|
|
19548 |
"source": [
|
19549 |
"## Try splitting on a pattern \\d\\d[A-Z]xx\n",
|
19550 |
"\n",
|
19551 |
+
"pattern = r\"\\b\\d\\d[A-Z]xx\\b\"\n",
|
19552 |
+
"splitting = regex.split(pattern, raw_text)\n",
|
19553 |
"# for line in splitting:\n",
|
19554 |
+
"# print(line + 'END')\n",
|
19555 |
"\n",
|
19556 |
"print(splitting[1])\n",
|
19557 |
"\n",
|
19558 |
+
"## Within each of these, find all text between two instances of the pattern\n",
|
19559 |
+
"tag_pattern = r\"(\\b\\d\\d[A-Z]\\d\\d)\\s(.*)(?!\\b\\d\\d[A-Z]\\d\\d\\b)\"\n",
|
19560 |
"\n",
|
19561 |
+
"patterns = regex.findall(tag_pattern, splitting[1])"
|
19562 |
]
|
19563 |
},
|
19564 |
{
|
|
|
19644 |
"## Turn this into a dict\n",
|
19645 |
"dict = {}\n",
|
19646 |
"for item in patterns:\n",
|
19647 |
+
" k, v = item\n",
|
19648 |
" dict[k] = v\n",
|
19649 |
"\n",
|
19650 |
"for item in dict.items():\n",
|
|
|
19657 |
"metadata": {},
|
19658 |
"outputs": [],
|
19659 |
"source": [
|
19660 |
+
"## Within each of these, find all text between two instances of the pattern\n",
|
19661 |
+
"tag_pattern = r\"(\\b\\d\\d[A-Z]\\d\\d)\\s(.*)(?!\\b\\d\\d[A-Z]\\d\\d\\b)\"\n",
|
|
|
19662 |
"dict = {}\n",
|
19663 |
"\n",
|
19664 |
+
"with open(\"msc2020.pdf\", \"rb\") as file:\n",
|
|
|
19665 |
" reader = PyPDF2.PdfReader(file)\n",
|
19666 |
" for page in reader.pages:\n",
|
19667 |
" page_text = page.extract_text()\n",
|
19668 |
"\n",
|
19669 |
+
" ## Find all the msc tags\n",
|
19670 |
"\n",
|
19671 |
" tags = regex.findall(tag_pattern, page_text)\n",
|
19672 |
+
"\n",
|
19673 |
" for item in tags:\n",
|
19674 |
+
" k, v = item\n",
|
19675 |
+
" dict[k] = v"
|
19676 |
]
|
19677 |
},
|
19678 |
{
|
|
|
20731 |
"source": [
|
20732 |
"def clean_msc_dict(dict):\n",
|
20733 |
" for item in dict.items():\n",
|
20734 |
+
" k, v = item\n",
|
20735 |
+
" v = regex.sub(r\"\\x0c\", \"fi\", v)\n",
|
20736 |
+
" v = regex.sub(r\"\\x0b\", \"ff\", v)\n",
|
20737 |
+
" v = regex.sub(r\"\\r\", \"fl\", v)\n",
|
20738 |
+
" v = regex.sub(r\"\\xf7 \", \"\", v)\n",
|
20739 |
+
" v = regex.sub(r\"\\x0e\", \"ffi\", v)\n",
|
20740 |
+
" v = regex.sub(r\"\\x13\", \"\", v)\n",
|
20741 |
" dict[k] = v\n",
|
20742 |
+
" return dict"
|
20743 |
]
|
20744 |
},
|
20745 |
{
|
|
|
31660 |
"outputs": [],
|
31661 |
"source": [
|
31662 |
"import pandas as pd\n",
|
31663 |
+
"\n",
|
31664 |
+
"codes = pd.read_csv(\"./data/MSC_2020.csv\", encoding=\"windows-1252\", on_bad_lines=\"skip\")"
|
31665 |
]
|
31666 |
},
|
31667 |
{
|
|
|
31744 |
"source": [
|
31745 |
"## Look for all entries that start with a valid msc tag pattern\n",
|
31746 |
"\n",
|
31747 |
+
"pattern = \"\\d\\d[A-Z]\\d\\d\"\n",
|
31748 |
+
"\n",
|
31749 |
"\n",
|
31750 |
"def check_valid(entry):\n",
|
31751 |
+
" if regex.match(pattern, entry):\n",
|
31752 |
" return True\n",
|
31753 |
" else:\n",
|
31754 |
" return False\n",
|
31755 |
"\n",
|
31756 |
+
"\n",
|
31757 |
+
"valid_codes = codes.loc[codes[\"code\\ttext\\tdescription\"].apply(check_valid)]"
|
31758 |
]
|
31759 |
},
|
31760 |
{
|
|
|
31870 |
"source": [
|
31871 |
"dict = {}\n",
|
31872 |
"\n",
|
31873 |
+
"for entry in valid_codes[\"code\\ttext\\tdescription\"]:\n",
|
31874 |
+
" split = entry.split(\"\\t\")\n",
|
31875 |
" code = split[0]\n",
|
31876 |
" desc = split[1][1:-1]\n",
|
31877 |
+
" dict[code] = desc"
|
31878 |
]
|
31879 |
},
|
31880 |
{
|
|
|
33917 |
"## Good, but we can improve it by removing the '\\\\(' and '\\\\)' characters\n",
|
33918 |
"\n",
|
33919 |
"for item in dict.items():\n",
|
33920 |
+
" k, v = item\n",
|
33921 |
+
" v = v.replace(\"\\\\(\", \"\")\n",
|
33922 |
+
" v = v.replace(\"\\\\)\", \"\")\n",
|
33923 |
" dict[k] = v\n",
|
33924 |
"\n",
|
33925 |
"dict"
|
|
|
33960 |
"dict_stripped_accents = {}\n",
|
33961 |
"\n",
|
33962 |
"for item in dict.items():\n",
|
33963 |
+
" k, v = item\n",
|
33964 |
+
" dict_stripped_accents[k] = unidecode(v)"
|
33965 |
]
|
33966 |
},
|
33967 |
{
|
|
|
33970 |
"metadata": {},
|
33971 |
"outputs": [],
|
33972 |
"source": [
|
33973 |
+
"with open(\"./data/msc.json\", \"w\") as file:\n",
|
33974 |
" json = json.dumps(dict_stripped_accents)\n",
|
33975 |
" file.write(json)"
|
33976 |
]
|
|
|
35007 |
"\n",
|
35008 |
"lib = Library()\n",
|
35009 |
"\n",
|
35010 |
+
"lib.load_from_query(query_string=\"cat:math.AP\", max_results=5000)"
|
35011 |
]
|
35012 |
},
|
35013 |
{
|
|
|
35383 |
"outputs": [],
|
35384 |
"source": [
|
35385 |
"import importlib\n",
|
35386 |
+
"\n",
|
35387 |
"importlib.reload(util)\n",
|
35388 |
"\n",
|
35389 |
"lib.clean_library()"
|
|
|
35657 |
"from library_class import Library\n",
|
35658 |
"\n",
|
35659 |
"lib = Library()\n",
|
35660 |
+
"lib.load_from_query(query_string=\"cat:math.AP OR math.SP\", max_results=2e4)"
|
|
|
|
|
35661 |
]
|
35662 |
},
|
35663 |
{
|
|
|
35667 |
"outputs": [],
|
35668 |
"source": [
|
35669 |
"raw_lib = lib.raw_lib\n",
|
35670 |
+
"raw_lib.to_parquet(\"./data/APSP.parquet\")"
|
35671 |
]
|
35672 |
},
|
35673 |
{
|
|
|
35678 |
"source": [
|
35679 |
"## Is the list information preserved?\n",
|
35680 |
"\n",
|
35681 |
+
"df = pd.read_parquet(\"./data/APSP.parquet\")"
|
35682 |
]
|
35683 |
},
|
35684 |
{
|
|
|
35804 |
"metadata": {},
|
35805 |
"outputs": [],
|
35806 |
"source": [
|
35807 |
+
"pd.set_option(\"display.max_colwidth\", 0)"
|
35808 |
]
|
35809 |
},
|
35810 |
{
|
|
|
35816 |
"from cleaning import cleaning\n",
|
35817 |
"import pandas as pd\n",
|
35818 |
"import importlib\n",
|
35819 |
+
"\n",
|
35820 |
"importlib.reload(cleaning)\n",
|
35821 |
"\n",
|
35822 |
+
"data = pd.read_parquet(\"./data/APSP.parquet\")\n",
|
35823 |
"\n",
|
35824 |
"clean_data = cleaning.main(\n",
|
35825 |
+
" raw_arxiv_results=data, path_to_embeddings=\"./data/APSP_mini_vec.parquet\"\n",
|
35826 |
+
")"
|
35827 |
]
|
35828 |
},
|
35829 |
{
|
|
|
35947 |
}
|
35948 |
],
|
35949 |
"source": [
|
35950 |
+
"pd.set_option(\"display.max_colwidth\", 0)\n",
|
35951 |
"clean_data.head()"
|
35952 |
]
|
35953 |
},
|
|
|
36091 |
"source": [
|
36092 |
"import data_storage\n",
|
36093 |
"import importlib\n",
|
36094 |
+
"\n",
|
36095 |
"importlib.reload(data_storage)\n",
|
36096 |
"\n",
|
36097 |
"\n",
|
|
|
36099 |
"\n",
|
36100 |
"max_results = 20000\n",
|
36101 |
"offset = 0\n",
|
36102 |
+
"data.load_from_query(\n",
|
36103 |
+
" query_string=\"cat:math.AP\",\n",
|
36104 |
+
" max_results=max_results,\n",
|
36105 |
+
" offset=offset,\n",
|
36106 |
+
")\n",
|
36107 |
"data.data"
|
36108 |
]
|
36109 |
},
|
|
|
36122 |
],
|
36123 |
"source": [
|
36124 |
"import arxiv\n",
|
36125 |
+
"from datetime import datetime, timedelta, timezone\n",
|
36126 |
"\n",
|
36127 |
"\n",
|
36128 |
+
"search = arxiv.Search(\n",
|
36129 |
+
" query=\"cat:math.AP\",\n",
|
36130 |
+
" max_results=1e3,\n",
|
36131 |
+
" sort_by=arxiv.SortCriterion.LastUpdatedDate,\n",
|
36132 |
+
" sort_order=arxiv.SortOrder.Descending,\n",
|
36133 |
+
")\n",
|
36134 |
"\n",
|
36135 |
"for result in search.results():\n",
|
36136 |
" if result.updated < datetime.now(timezone.utc) - timedelta(days=2):\n",
|
36137 |
+
" print(result.title, result.updated)\n",
|
36138 |
+
" break"
|
|
|
36139 |
]
|
36140 |
},
|
36141 |
{
|
|
|
36154 |
"source": [
|
36155 |
"##\n",
|
36156 |
"oldest = list(search.results())[-1]\n",
|
36157 |
+
"print(oldest.updated)"
|
36158 |
]
|
36159 |
},
|
36160 |
{
|
|
|
36177 |
},
|
36178 |
{
|
36179 |
"cell_type": "code",
|
36180 |
+
"execution_count": 256,
|
36181 |
+
"metadata": {},
|
36182 |
+
"outputs": [],
|
36183 |
+
"source": [
|
36184 |
+
"import data_storage\n",
|
36185 |
+
"import data_cleaning\n",
|
36186 |
+
"from data_storage import ArXivData\n",
|
36187 |
+
"import importlib\n",
|
36188 |
+
"\n",
|
36189 |
+
"importlib.reload(data_storage)\n",
|
36190 |
+
"importlib.reload(data_cleaning)\n",
|
36191 |
+
"import numpy as np"
|
36192 |
+
]
|
36193 |
+
},
|
36194 |
+
{
|
36195 |
+
"cell_type": "code",
|
36196 |
+
"execution_count": 257,
|
36197 |
+
"metadata": {},
|
36198 |
+
"outputs": [],
|
36199 |
+
"source": [
|
36200 |
+
"data = ArXivData()"
|
36201 |
+
]
|
36202 |
+
},
|
36203 |
+
{
|
36204 |
+
"cell_type": "code",
|
36205 |
+
"execution_count": 258,
|
36206 |
+
"metadata": {},
|
36207 |
+
"outputs": [],
|
36208 |
+
"source": [
|
36209 |
+
"data.load_from_query(query=\"cat:math.AP\", max_results=100, raw=True)"
|
36210 |
+
]
|
36211 |
+
},
|
36212 |
+
{
|
36213 |
+
"cell_type": "code",
|
36214 |
+
"execution_count": 259,
|
36215 |
+
"metadata": {},
|
36216 |
+
"outputs": [
|
36217 |
+
{
|
36218 |
+
"data": {
|
36219 |
+
"text/html": [
|
36220 |
+
"<div>\n",
|
36221 |
+
"<style scoped>\n",
|
36222 |
+
" .dataframe tbody tr th:only-of-type {\n",
|
36223 |
+
" vertical-align: middle;\n",
|
36224 |
+
" }\n",
|
36225 |
+
"\n",
|
36226 |
+
" .dataframe tbody tr th {\n",
|
36227 |
+
" vertical-align: top;\n",
|
36228 |
+
" }\n",
|
36229 |
+
"\n",
|
36230 |
+
" .dataframe thead th {\n",
|
36231 |
+
" text-align: right;\n",
|
36232 |
+
" }\n",
|
36233 |
+
"</style>\n",
|
36234 |
+
"<table border=\"1\" class=\"dataframe\">\n",
|
36235 |
+
" <thead>\n",
|
36236 |
+
" <tr style=\"text-align: right;\">\n",
|
36237 |
+
" <th></th>\n",
|
36238 |
+
" <th>title</th>\n",
|
36239 |
+
" <th>summary</th>\n",
|
36240 |
+
" <th>categories</th>\n",
|
36241 |
+
" <th>id</th>\n",
|
36242 |
+
" </tr>\n",
|
36243 |
+
" </thead>\n",
|
36244 |
+
" <tbody>\n",
|
36245 |
+
" <tr>\n",
|
36246 |
+
" <th>0</th>\n",
|
36247 |
+
" <td>Future stability of expanding spatially homoge...</td>\n",
|
36248 |
+
" <td>Spatially homogeneous FLRW solutions constitut...</td>\n",
|
36249 |
+
" <td>[gr-qc, math-ph, math.AP, math.DG, math.MP]</td>\n",
|
36250 |
+
" <td>2306.17774v1</td>\n",
|
36251 |
+
" </tr>\n",
|
36252 |
+
" <tr>\n",
|
36253 |
+
" <th>1</th>\n",
|
36254 |
+
" <td>Autonomous and asymptotically quasiconvex func...</td>\n",
|
36255 |
+
" <td>We obtain local regularity for minimizers of a...</td>\n",
|
36256 |
+
" <td>[math.AP, 35J47, 35B65, 46E30]</td>\n",
|
36257 |
+
" <td>2306.17768v1</td>\n",
|
36258 |
+
" </tr>\n",
|
36259 |
+
" <tr>\n",
|
36260 |
+
" <th>2</th>\n",
|
36261 |
+
" <td>A Coefficient Inverse Problem for the Mean Fie...</td>\n",
|
36262 |
+
" <td>A Coefficient Inverse Problem (CIP) of the det...</td>\n",
|
36263 |
+
" <td>[math.AP]</td>\n",
|
36264 |
+
" <td>2306.03349v2</td>\n",
|
36265 |
+
" </tr>\n",
|
36266 |
+
" <tr>\n",
|
36267 |
+
" <th>3</th>\n",
|
36268 |
+
" <td>Nonuniqueness results for constant sixth order...</td>\n",
|
36269 |
+
" <td>We prove nonuniqueness results for constant si...</td>\n",
|
36270 |
+
" <td>[math.DG, math.AP, 35J60, 35B09, 35J30, 35B40,...</td>\n",
|
36271 |
+
" <td>2306.00679v2</td>\n",
|
36272 |
+
" </tr>\n",
|
36273 |
+
" <tr>\n",
|
36274 |
+
" <th>4</th>\n",
|
36275 |
+
" <td>Asymptotic limits of the principal spectrum po...</td>\n",
|
36276 |
+
" <td>This work examines the limits of the principal...</td>\n",
|
36277 |
+
" <td>[math.AP, math.DS, 92D40, 92D50, 35P15, 35K57]</td>\n",
|
36278 |
+
" <td>2306.17734v1</td>\n",
|
36279 |
+
" </tr>\n",
|
36280 |
+
" <tr>\n",
|
36281 |
+
" <th>...</th>\n",
|
36282 |
+
" <td>...</td>\n",
|
36283 |
+
" <td>...</td>\n",
|
36284 |
+
" <td>...</td>\n",
|
36285 |
+
" <td>...</td>\n",
|
36286 |
+
" </tr>\n",
|
36287 |
+
" <tr>\n",
|
36288 |
+
" <th>95</th>\n",
|
36289 |
+
" <td>Quantization of the Energy for the inhomogeneo...</td>\n",
|
36290 |
+
" <td>We consider the varifold associated to the All...</td>\n",
|
36291 |
+
" <td>[math.DG, math.AP, 53E99]</td>\n",
|
36292 |
+
" <td>2302.00137v2</td>\n",
|
36293 |
+
" </tr>\n",
|
36294 |
+
" <tr>\n",
|
36295 |
+
" <th>96</th>\n",
|
36296 |
+
" <td>Second order estimates for transition layers a...</td>\n",
|
36297 |
+
" <td>The parabolic Allen-Cahn equation is a semilin...</td>\n",
|
36298 |
+
" <td>[math.DG, math.AP, 53E99]</td>\n",
|
36299 |
+
" <td>2003.11886v3</td>\n",
|
36300 |
+
" </tr>\n",
|
36301 |
+
" <tr>\n",
|
36302 |
+
" <th>97</th>\n",
|
36303 |
+
" <td>Well-Posedness and Stability Analysis of an Ep...</td>\n",
|
36304 |
+
" <td>A compartment epidemic model for infectious di...</td>\n",
|
36305 |
+
" <td>[math.AP]</td>\n",
|
36306 |
+
" <td>2212.10137v2</td>\n",
|
36307 |
+
" </tr>\n",
|
36308 |
+
" <tr>\n",
|
36309 |
+
" <th>98</th>\n",
|
36310 |
+
" <td>Multiple positive solutions for a double phase...</td>\n",
|
36311 |
+
" <td>In this paper, we study a class of double phas...</td>\n",
|
36312 |
+
" <td>[math.AP, math.FA, 05J50, 03H10, 35D30]</td>\n",
|
36313 |
+
" <td>2306.01319v2</td>\n",
|
36314 |
+
" </tr>\n",
|
36315 |
+
" <tr>\n",
|
36316 |
+
" <th>99</th>\n",
|
36317 |
+
" <td>Stabilization of the wave equation on larger-d...</td>\n",
|
36318 |
+
" <td>This paper deals with uniform stabilization of...</td>\n",
|
36319 |
+
" <td>[math.AP, 93C20 (Primary) 35A27 (Secondary)]</td>\n",
|
36320 |
+
" <td>2303.03733v3</td>\n",
|
36321 |
+
" </tr>\n",
|
36322 |
+
" </tbody>\n",
|
36323 |
+
"</table>\n",
|
36324 |
+
"<p>100 rows × 4 columns</p>\n",
|
36325 |
+
"</div>"
|
36326 |
+
],
|
36327 |
+
"text/plain": [
|
36328 |
+
" title \\\n",
|
36329 |
+
"0 Future stability of expanding spatially homoge... \n",
|
36330 |
+
"1 Autonomous and asymptotically quasiconvex func... \n",
|
36331 |
+
"2 A Coefficient Inverse Problem for the Mean Fie... \n",
|
36332 |
+
"3 Nonuniqueness results for constant sixth order... \n",
|
36333 |
+
"4 Asymptotic limits of the principal spectrum po... \n",
|
36334 |
+
".. ... \n",
|
36335 |
+
"95 Quantization of the Energy for the inhomogeneo... \n",
|
36336 |
+
"96 Second order estimates for transition layers a... \n",
|
36337 |
+
"97 Well-Posedness and Stability Analysis of an Ep... \n",
|
36338 |
+
"98 Multiple positive solutions for a double phase... \n",
|
36339 |
+
"99 Stabilization of the wave equation on larger-d... \n",
|
36340 |
+
"\n",
|
36341 |
+
" summary \\\n",
|
36342 |
+
"0 Spatially homogeneous FLRW solutions constitut... \n",
|
36343 |
+
"1 We obtain local regularity for minimizers of a... \n",
|
36344 |
+
"2 A Coefficient Inverse Problem (CIP) of the det... \n",
|
36345 |
+
"3 We prove nonuniqueness results for constant si... \n",
|
36346 |
+
"4 This work examines the limits of the principal... \n",
|
36347 |
+
".. ... \n",
|
36348 |
+
"95 We consider the varifold associated to the All... \n",
|
36349 |
+
"96 The parabolic Allen-Cahn equation is a semilin... \n",
|
36350 |
+
"97 A compartment epidemic model for infectious di... \n",
|
36351 |
+
"98 In this paper, we study a class of double phas... \n",
|
36352 |
+
"99 This paper deals with uniform stabilization of... \n",
|
36353 |
+
"\n",
|
36354 |
+
" categories id \n",
|
36355 |
+
"0 [gr-qc, math-ph, math.AP, math.DG, math.MP] 2306.17774v1 \n",
|
36356 |
+
"1 [math.AP, 35J47, 35B65, 46E30] 2306.17768v1 \n",
|
36357 |
+
"2 [math.AP] 2306.03349v2 \n",
|
36358 |
+
"3 [math.DG, math.AP, 35J60, 35B09, 35J30, 35B40,... 2306.00679v2 \n",
|
36359 |
+
"4 [math.AP, math.DS, 92D40, 92D50, 35P15, 35K57] 2306.17734v1 \n",
|
36360 |
+
".. ... ... \n",
|
36361 |
+
"95 [math.DG, math.AP, 53E99] 2302.00137v2 \n",
|
36362 |
+
"96 [math.DG, math.AP, 53E99] 2003.11886v3 \n",
|
36363 |
+
"97 [math.AP] 2212.10137v2 \n",
|
36364 |
+
"98 [math.AP, math.FA, 05J50, 03H10, 35D30] 2306.01319v2 \n",
|
36365 |
+
"99 [math.AP, 93C20 (Primary) 35A27 (Secondary)] 2303.03733v3 \n",
|
36366 |
+
"\n",
|
36367 |
+
"[100 rows x 4 columns]"
|
36368 |
+
]
|
36369 |
+
},
|
36370 |
+
"execution_count": 259,
|
36371 |
+
"metadata": {},
|
36372 |
+
"output_type": "execute_result"
|
36373 |
+
}
|
36374 |
+
],
|
36375 |
+
"source": [
|
36376 |
+
"data._returned_metadata"
|
36377 |
+
]
|
36378 |
+
},
|
36379 |
+
{
|
36380 |
+
"cell_type": "code",
|
36381 |
+
"execution_count": 260,
|
36382 |
+
"metadata": {},
|
36383 |
+
"outputs": [
|
36384 |
+
{
|
36385 |
+
"data": {
|
36386 |
+
"text/html": [
|
36387 |
+
"<div>\n",
|
36388 |
+
"<style scoped>\n",
|
36389 |
+
" .dataframe tbody tr th:only-of-type {\n",
|
36390 |
+
" vertical-align: middle;\n",
|
36391 |
+
" }\n",
|
36392 |
+
"\n",
|
36393 |
+
" .dataframe tbody tr th {\n",
|
36394 |
+
" vertical-align: top;\n",
|
36395 |
+
" }\n",
|
36396 |
+
"\n",
|
36397 |
+
" .dataframe thead th {\n",
|
36398 |
+
" text-align: right;\n",
|
36399 |
+
" }\n",
|
36400 |
+
"</style>\n",
|
36401 |
+
"<table border=\"1\" class=\"dataframe\">\n",
|
36402 |
+
" <thead>\n",
|
36403 |
+
" <tr style=\"text-align: right;\">\n",
|
36404 |
+
" <th></th>\n",
|
36405 |
+
" <th>title</th>\n",
|
36406 |
+
" <th>summary</th>\n",
|
36407 |
+
" <th>id</th>\n",
|
36408 |
+
" <th>msc_tags</th>\n",
|
36409 |
+
" </tr>\n",
|
36410 |
+
" </thead>\n",
|
36411 |
+
" <tbody>\n",
|
36412 |
+
" <tr>\n",
|
36413 |
+
" <th>0</th>\n",
|
36414 |
+
" <td>Future stability of expanding spatially homoge...</td>\n",
|
36415 |
+
" <td>Spatially homogeneous FLRW solutions constitut...</td>\n",
|
36416 |
+
" <td>2306.17774v1</td>\n",
|
36417 |
+
" <td>NaN</td>\n",
|
36418 |
+
" </tr>\n",
|
36419 |
+
" <tr>\n",
|
36420 |
+
" <th>1</th>\n",
|
36421 |
+
" <td>Autonomous and asymptotically quasiconvex func...</td>\n",
|
36422 |
+
" <td>We obtain local regularity for minimizers of a...</td>\n",
|
36423 |
+
" <td>2306.17768v1</td>\n",
|
36424 |
+
" <td>[35J47, 35B65, 46E30]</td>\n",
|
36425 |
+
" </tr>\n",
|
36426 |
+
" <tr>\n",
|
36427 |
+
" <th>2</th>\n",
|
36428 |
+
" <td>A Coefficient Inverse Problem for the Mean Fie...</td>\n",
|
36429 |
+
" <td>A Coefficient Inverse Problem (CIP) of the det...</td>\n",
|
36430 |
+
" <td>2306.03349v2</td>\n",
|
36431 |
+
" <td>NaN</td>\n",
|
36432 |
+
" </tr>\n",
|
36433 |
+
" <tr>\n",
|
36434 |
+
" <th>3</th>\n",
|
36435 |
+
" <td>Nonuniqueness results for constant sixth order...</td>\n",
|
36436 |
+
" <td>We prove nonuniqueness results for constant si...</td>\n",
|
36437 |
+
" <td>2306.00679v2</td>\n",
|
36438 |
+
" <td>[35J60, 35B09, 35J30, 35B40, 53C18, 34C23, 58J55]</td>\n",
|
36439 |
+
" </tr>\n",
|
36440 |
+
" <tr>\n",
|
36441 |
+
" <th>4</th>\n",
|
36442 |
+
" <td>Asymptotic limits of the principal spectrum po...</td>\n",
|
36443 |
+
" <td>This work examines the limits of the principal...</td>\n",
|
36444 |
+
" <td>2306.17734v1</td>\n",
|
36445 |
+
" <td>[92D40, 92D50, 35P15, 35K57]</td>\n",
|
36446 |
+
" </tr>\n",
|
36447 |
+
" <tr>\n",
|
36448 |
+
" <th>...</th>\n",
|
36449 |
+
" <td>...</td>\n",
|
36450 |
+
" <td>...</td>\n",
|
36451 |
+
" <td>...</td>\n",
|
36452 |
+
" <td>...</td>\n",
|
36453 |
+
" </tr>\n",
|
36454 |
+
" <tr>\n",
|
36455 |
+
" <th>95</th>\n",
|
36456 |
+
" <td>Quantization of the Energy for the inhomogeneo...</td>\n",
|
36457 |
+
" <td>We consider the varifold associated to the All...</td>\n",
|
36458 |
+
" <td>2302.00137v2</td>\n",
|
36459 |
+
" <td>[53E99]</td>\n",
|
36460 |
+
" </tr>\n",
|
36461 |
+
" <tr>\n",
|
36462 |
+
" <th>96</th>\n",
|
36463 |
+
" <td>Second order estimates for transition layers a...</td>\n",
|
36464 |
+
" <td>The parabolic Allen-Cahn equation is a semilin...</td>\n",
|
36465 |
+
" <td>2003.11886v3</td>\n",
|
36466 |
+
" <td>[53E99]</td>\n",
|
36467 |
+
" </tr>\n",
|
36468 |
+
" <tr>\n",
|
36469 |
+
" <th>97</th>\n",
|
36470 |
+
" <td>Well-Posedness and Stability Analysis of an Ep...</td>\n",
|
36471 |
+
" <td>A compartment epidemic model for infectious di...</td>\n",
|
36472 |
+
" <td>2212.10137v2</td>\n",
|
36473 |
+
" <td>NaN</td>\n",
|
36474 |
+
" </tr>\n",
|
36475 |
+
" <tr>\n",
|
36476 |
+
" <th>98</th>\n",
|
36477 |
+
" <td>Multiple positive solutions for a double phase...</td>\n",
|
36478 |
+
" <td>In this paper, we study a class of double phas...</td>\n",
|
36479 |
+
" <td>2306.01319v2</td>\n",
|
36480 |
+
" <td>[05J50, 03H10, 35D30]</td>\n",
|
36481 |
+
" </tr>\n",
|
36482 |
+
" <tr>\n",
|
36483 |
+
" <th>99</th>\n",
|
36484 |
+
" <td>Stabilization of the wave equation on larger-d...</td>\n",
|
36485 |
+
" <td>This paper deals with uniform stabilization of...</td>\n",
|
36486 |
+
" <td>2303.03733v3</td>\n",
|
36487 |
+
" <td>[93C20, 35A27]</td>\n",
|
36488 |
+
" </tr>\n",
|
36489 |
+
" </tbody>\n",
|
36490 |
+
"</table>\n",
|
36491 |
+
"<p>100 rows × 4 columns</p>\n",
|
36492 |
+
"</div>"
|
36493 |
+
],
|
36494 |
+
"text/plain": [
|
36495 |
+
" title \\\n",
|
36496 |
+
"0 Future stability of expanding spatially homoge... \n",
|
36497 |
+
"1 Autonomous and asymptotically quasiconvex func... \n",
|
36498 |
+
"2 A Coefficient Inverse Problem for the Mean Fie... \n",
|
36499 |
+
"3 Nonuniqueness results for constant sixth order... \n",
|
36500 |
+
"4 Asymptotic limits of the principal spectrum po... \n",
|
36501 |
+
".. ... \n",
|
36502 |
+
"95 Quantization of the Energy for the inhomogeneo... \n",
|
36503 |
+
"96 Second order estimates for transition layers a... \n",
|
36504 |
+
"97 Well-Posedness and Stability Analysis of an Ep... \n",
|
36505 |
+
"98 Multiple positive solutions for a double phase... \n",
|
36506 |
+
"99 Stabilization of the wave equation on larger-d... \n",
|
36507 |
+
"\n",
|
36508 |
+
" summary id \\\n",
|
36509 |
+
"0 Spatially homogeneous FLRW solutions constitut... 2306.17774v1 \n",
|
36510 |
+
"1 We obtain local regularity for minimizers of a... 2306.17768v1 \n",
|
36511 |
+
"2 A Coefficient Inverse Problem (CIP) of the det... 2306.03349v2 \n",
|
36512 |
+
"3 We prove nonuniqueness results for constant si... 2306.00679v2 \n",
|
36513 |
+
"4 This work examines the limits of the principal... 2306.17734v1 \n",
|
36514 |
+
".. ... ... \n",
|
36515 |
+
"95 We consider the varifold associated to the All... 2302.00137v2 \n",
|
36516 |
+
"96 The parabolic Allen-Cahn equation is a semilin... 2003.11886v3 \n",
|
36517 |
+
"97 A compartment epidemic model for infectious di... 2212.10137v2 \n",
|
36518 |
+
"98 In this paper, we study a class of double phas... 2306.01319v2 \n",
|
36519 |
+
"99 This paper deals with uniform stabilization of... 2303.03733v3 \n",
|
36520 |
+
"\n",
|
36521 |
+
" msc_tags \n",
|
36522 |
+
"0 NaN \n",
|
36523 |
+
"1 [35J47, 35B65, 46E30] \n",
|
36524 |
+
"2 NaN \n",
|
36525 |
+
"3 [35J60, 35B09, 35J30, 35B40, 53C18, 34C23, 58J55] \n",
|
36526 |
+
"4 [92D40, 92D50, 35P15, 35K57] \n",
|
36527 |
+
".. ... \n",
|
36528 |
+
"95 [53E99] \n",
|
36529 |
+
"96 [53E99] \n",
|
36530 |
+
"97 NaN \n",
|
36531 |
+
"98 [05J50, 03H10, 35D30] \n",
|
36532 |
+
"99 [93C20, 35A27] \n",
|
36533 |
+
"\n",
|
36534 |
+
"[100 rows x 4 columns]"
|
36535 |
+
]
|
36536 |
+
},
|
36537 |
+
"execution_count": 260,
|
36538 |
+
"metadata": {},
|
36539 |
+
"output_type": "execute_result"
|
36540 |
+
}
|
36541 |
+
],
|
36542 |
+
"source": [
|
36543 |
+
"data.load_from_query(query=\"cat:math.AP\", max_results=100)\n",
|
36544 |
+
"data.metadata"
|
36545 |
+
]
|
36546 |
+
},
|
36547 |
+
{
|
36548 |
+
"cell_type": "code",
|
36549 |
+
"execution_count": 261,
|
36550 |
+
"metadata": {},
|
36551 |
+
"outputs": [
|
36552 |
+
{
|
36553 |
+
"data": {
|
36554 |
+
"text/html": [
|
36555 |
+
"<div>\n",
|
36556 |
+
"<style scoped>\n",
|
36557 |
+
" .dataframe tbody tr th:only-of-type {\n",
|
36558 |
+
" vertical-align: middle;\n",
|
36559 |
+
" }\n",
|
36560 |
+
"\n",
|
36561 |
+
" .dataframe tbody tr th {\n",
|
36562 |
+
" vertical-align: top;\n",
|
36563 |
+
" }\n",
|
36564 |
+
"\n",
|
36565 |
+
" .dataframe thead th {\n",
|
36566 |
+
" text-align: right;\n",
|
36567 |
+
" }\n",
|
36568 |
+
"</style>\n",
|
36569 |
+
"<table border=\"1\" class=\"dataframe\">\n",
|
36570 |
+
" <thead>\n",
|
36571 |
+
" <tr style=\"text-align: right;\">\n",
|
36572 |
+
" <th></th>\n",
|
36573 |
+
" <th>Materials Science</th>\n",
|
36574 |
+
" <th>Soft Condensed Matter</th>\n",
|
36575 |
+
" <th>Numerical Analysis</th>\n",
|
36576 |
+
" <th>General Relativity and Quantum Cosmology</th>\n",
|
36577 |
+
" <th>Mathematical Physics</th>\n",
|
36578 |
+
" <th>Analysis of PDEs</th>\n",
|
36579 |
+
" <th>Classical Analysis and ODEs</th>\n",
|
36580 |
+
" <th>Differential Geometry</th>\n",
|
36581 |
+
" <th>Dynamical Systems</th>\n",
|
36582 |
+
" <th>Functional Analysis</th>\n",
|
36583 |
+
" <th>...</th>\n",
|
36584 |
+
" <th>Optimization and Control</th>\n",
|
36585 |
+
" <th>Probability</th>\n",
|
36586 |
+
" <th>Spectral Theory</th>\n",
|
36587 |
+
" <th>Pattern Formation and Solitons</th>\n",
|
36588 |
+
" <th>Biological Physics</th>\n",
|
36589 |
+
" <th>Fluid Dynamics</th>\n",
|
36590 |
+
" <th>Optics</th>\n",
|
36591 |
+
" <th>Cell Behavior</th>\n",
|
36592 |
+
" <th>Populations and Evolution</th>\n",
|
36593 |
+
" <th>Tissues and Organs</th>\n",
|
36594 |
+
" </tr>\n",
|
36595 |
+
" </thead>\n",
|
36596 |
+
" <tbody>\n",
|
36597 |
+
" <tr>\n",
|
36598 |
+
" <th>0</th>\n",
|
36599 |
+
" <td>0</td>\n",
|
36600 |
+
" <td>0</td>\n",
|
36601 |
+
" <td>0</td>\n",
|
36602 |
+
" <td>1</td>\n",
|
36603 |
+
" <td>1</td>\n",
|
36604 |
+
" <td>1</td>\n",
|
36605 |
+
" <td>0</td>\n",
|
36606 |
+
" <td>1</td>\n",
|
36607 |
+
" <td>0</td>\n",
|
36608 |
+
" <td>0</td>\n",
|
36609 |
+
" <td>...</td>\n",
|
36610 |
+
" <td>0</td>\n",
|
36611 |
+
" <td>0</td>\n",
|
36612 |
+
" <td>0</td>\n",
|
36613 |
+
" <td>0</td>\n",
|
36614 |
+
" <td>0</td>\n",
|
36615 |
+
" <td>0</td>\n",
|
36616 |
+
" <td>0</td>\n",
|
36617 |
+
" <td>0</td>\n",
|
36618 |
+
" <td>0</td>\n",
|
36619 |
+
" <td>0</td>\n",
|
36620 |
+
" </tr>\n",
|
36621 |
+
" <tr>\n",
|
36622 |
+
" <th>1</th>\n",
|
36623 |
+
" <td>0</td>\n",
|
36624 |
+
" <td>0</td>\n",
|
36625 |
+
" <td>0</td>\n",
|
36626 |
+
" <td>0</td>\n",
|
36627 |
+
" <td>0</td>\n",
|
36628 |
+
" <td>1</td>\n",
|
36629 |
+
" <td>0</td>\n",
|
36630 |
+
" <td>0</td>\n",
|
36631 |
+
" <td>0</td>\n",
|
36632 |
+
" <td>0</td>\n",
|
36633 |
+
" <td>...</td>\n",
|
36634 |
+
" <td>0</td>\n",
|
36635 |
+
" <td>0</td>\n",
|
36636 |
+
" <td>0</td>\n",
|
36637 |
+
" <td>0</td>\n",
|
36638 |
+
" <td>0</td>\n",
|
36639 |
+
" <td>0</td>\n",
|
36640 |
+
" <td>0</td>\n",
|
36641 |
+
" <td>0</td>\n",
|
36642 |
+
" <td>0</td>\n",
|
36643 |
+
" <td>0</td>\n",
|
36644 |
+
" </tr>\n",
|
36645 |
+
" <tr>\n",
|
36646 |
+
" <th>2</th>\n",
|
36647 |
+
" <td>0</td>\n",
|
36648 |
+
" <td>0</td>\n",
|
36649 |
+
" <td>0</td>\n",
|
36650 |
+
" <td>0</td>\n",
|
36651 |
+
" <td>0</td>\n",
|
36652 |
+
" <td>1</td>\n",
|
36653 |
+
" <td>0</td>\n",
|
36654 |
+
" <td>0</td>\n",
|
36655 |
+
" <td>0</td>\n",
|
36656 |
+
" <td>0</td>\n",
|
36657 |
+
" <td>...</td>\n",
|
36658 |
+
" <td>0</td>\n",
|
36659 |
+
" <td>0</td>\n",
|
36660 |
+
" <td>0</td>\n",
|
36661 |
+
" <td>0</td>\n",
|
36662 |
+
" <td>0</td>\n",
|
36663 |
+
" <td>0</td>\n",
|
36664 |
+
" <td>0</td>\n",
|
36665 |
+
" <td>0</td>\n",
|
36666 |
+
" <td>0</td>\n",
|
36667 |
+
" <td>0</td>\n",
|
36668 |
+
" </tr>\n",
|
36669 |
+
" <tr>\n",
|
36670 |
+
" <th>3</th>\n",
|
36671 |
+
" <td>0</td>\n",
|
36672 |
+
" <td>0</td>\n",
|
36673 |
+
" <td>0</td>\n",
|
36674 |
+
" <td>0</td>\n",
|
36675 |
+
" <td>0</td>\n",
|
36676 |
+
" <td>1</td>\n",
|
36677 |
+
" <td>0</td>\n",
|
36678 |
+
" <td>1</td>\n",
|
36679 |
+
" <td>0</td>\n",
|
36680 |
+
" <td>0</td>\n",
|
36681 |
+
" <td>...</td>\n",
|
36682 |
+
" <td>0</td>\n",
|
36683 |
+
" <td>0</td>\n",
|
36684 |
+
" <td>0</td>\n",
|
36685 |
+
" <td>0</td>\n",
|
36686 |
+
" <td>0</td>\n",
|
36687 |
+
" <td>0</td>\n",
|
36688 |
+
" <td>0</td>\n",
|
36689 |
+
" <td>0</td>\n",
|
36690 |
+
" <td>0</td>\n",
|
36691 |
+
" <td>0</td>\n",
|
36692 |
+
" </tr>\n",
|
36693 |
+
" <tr>\n",
|
36694 |
+
" <th>4</th>\n",
|
36695 |
+
" <td>0</td>\n",
|
36696 |
+
" <td>0</td>\n",
|
36697 |
+
" <td>0</td>\n",
|
36698 |
+
" <td>0</td>\n",
|
36699 |
+
" <td>0</td>\n",
|
36700 |
+
" <td>1</td>\n",
|
36701 |
+
" <td>0</td>\n",
|
36702 |
+
" <td>0</td>\n",
|
36703 |
+
" <td>1</td>\n",
|
36704 |
+
" <td>0</td>\n",
|
36705 |
+
" <td>...</td>\n",
|
36706 |
+
" <td>0</td>\n",
|
36707 |
+
" <td>0</td>\n",
|
36708 |
+
" <td>0</td>\n",
|
36709 |
+
" <td>0</td>\n",
|
36710 |
+
" <td>0</td>\n",
|
36711 |
+
" <td>0</td>\n",
|
36712 |
+
" <td>0</td>\n",
|
36713 |
+
" <td>0</td>\n",
|
36714 |
+
" <td>0</td>\n",
|
36715 |
+
" <td>0</td>\n",
|
36716 |
+
" </tr>\n",
|
36717 |
+
" <tr>\n",
|
36718 |
+
" <th>...</th>\n",
|
36719 |
+
" <td>...</td>\n",
|
36720 |
+
" <td>...</td>\n",
|
36721 |
+
" <td>...</td>\n",
|
36722 |
+
" <td>...</td>\n",
|
36723 |
+
" <td>...</td>\n",
|
36724 |
+
" <td>...</td>\n",
|
36725 |
+
" <td>...</td>\n",
|
36726 |
+
" <td>...</td>\n",
|
36727 |
+
" <td>...</td>\n",
|
36728 |
+
" <td>...</td>\n",
|
36729 |
+
" <td>...</td>\n",
|
36730 |
+
" <td>...</td>\n",
|
36731 |
+
" <td>...</td>\n",
|
36732 |
+
" <td>...</td>\n",
|
36733 |
+
" <td>...</td>\n",
|
36734 |
+
" <td>...</td>\n",
|
36735 |
+
" <td>...</td>\n",
|
36736 |
+
" <td>...</td>\n",
|
36737 |
+
" <td>...</td>\n",
|
36738 |
+
" <td>...</td>\n",
|
36739 |
+
" <td>...</td>\n",
|
36740 |
+
" </tr>\n",
|
36741 |
+
" <tr>\n",
|
36742 |
+
" <th>95</th>\n",
|
36743 |
+
" <td>0</td>\n",
|
36744 |
+
" <td>0</td>\n",
|
36745 |
+
" <td>0</td>\n",
|
36746 |
+
" <td>0</td>\n",
|
36747 |
+
" <td>0</td>\n",
|
36748 |
+
" <td>1</td>\n",
|
36749 |
+
" <td>0</td>\n",
|
36750 |
+
" <td>1</td>\n",
|
36751 |
+
" <td>0</td>\n",
|
36752 |
+
" <td>0</td>\n",
|
36753 |
+
" <td>...</td>\n",
|
36754 |
+
" <td>0</td>\n",
|
36755 |
+
" <td>0</td>\n",
|
36756 |
+
" <td>0</td>\n",
|
36757 |
+
" <td>0</td>\n",
|
36758 |
+
" <td>0</td>\n",
|
36759 |
+
" <td>0</td>\n",
|
36760 |
+
" <td>0</td>\n",
|
36761 |
+
" <td>0</td>\n",
|
36762 |
+
" <td>0</td>\n",
|
36763 |
+
" <td>0</td>\n",
|
36764 |
+
" </tr>\n",
|
36765 |
+
" <tr>\n",
|
36766 |
+
" <th>96</th>\n",
|
36767 |
+
" <td>0</td>\n",
|
36768 |
+
" <td>0</td>\n",
|
36769 |
+
" <td>0</td>\n",
|
36770 |
+
" <td>0</td>\n",
|
36771 |
+
" <td>0</td>\n",
|
36772 |
+
" <td>1</td>\n",
|
36773 |
+
" <td>0</td>\n",
|
36774 |
+
" <td>1</td>\n",
|
36775 |
+
" <td>0</td>\n",
|
36776 |
+
" <td>0</td>\n",
|
36777 |
+
" <td>...</td>\n",
|
36778 |
+
" <td>0</td>\n",
|
36779 |
+
" <td>0</td>\n",
|
36780 |
+
" <td>0</td>\n",
|
36781 |
+
" <td>0</td>\n",
|
36782 |
+
" <td>0</td>\n",
|
36783 |
+
" <td>0</td>\n",
|
36784 |
+
" <td>0</td>\n",
|
36785 |
+
" <td>0</td>\n",
|
36786 |
+
" <td>0</td>\n",
|
36787 |
+
" <td>0</td>\n",
|
36788 |
+
" </tr>\n",
|
36789 |
+
" <tr>\n",
|
36790 |
+
" <th>97</th>\n",
|
36791 |
+
" <td>0</td>\n",
|
36792 |
+
" <td>0</td>\n",
|
36793 |
+
" <td>0</td>\n",
|
36794 |
+
" <td>0</td>\n",
|
36795 |
+
" <td>0</td>\n",
|
36796 |
+
" <td>1</td>\n",
|
36797 |
+
" <td>0</td>\n",
|
36798 |
+
" <td>0</td>\n",
|
36799 |
+
" <td>0</td>\n",
|
36800 |
+
" <td>0</td>\n",
|
36801 |
+
" <td>...</td>\n",
|
36802 |
+
" <td>0</td>\n",
|
36803 |
+
" <td>0</td>\n",
|
36804 |
+
" <td>0</td>\n",
|
36805 |
+
" <td>0</td>\n",
|
36806 |
+
" <td>0</td>\n",
|
36807 |
+
" <td>0</td>\n",
|
36808 |
+
" <td>0</td>\n",
|
36809 |
+
" <td>0</td>\n",
|
36810 |
+
" <td>0</td>\n",
|
36811 |
+
" <td>0</td>\n",
|
36812 |
+
" </tr>\n",
|
36813 |
+
" <tr>\n",
|
36814 |
+
" <th>98</th>\n",
|
36815 |
+
" <td>0</td>\n",
|
36816 |
+
" <td>0</td>\n",
|
36817 |
+
" <td>0</td>\n",
|
36818 |
+
" <td>0</td>\n",
|
36819 |
+
" <td>0</td>\n",
|
36820 |
+
" <td>1</td>\n",
|
36821 |
+
" <td>0</td>\n",
|
36822 |
+
" <td>0</td>\n",
|
36823 |
+
" <td>0</td>\n",
|
36824 |
+
" <td>1</td>\n",
|
36825 |
+
" <td>...</td>\n",
|
36826 |
+
" <td>0</td>\n",
|
36827 |
+
" <td>0</td>\n",
|
36828 |
+
" <td>0</td>\n",
|
36829 |
+
" <td>0</td>\n",
|
36830 |
+
" <td>0</td>\n",
|
36831 |
+
" <td>0</td>\n",
|
36832 |
+
" <td>0</td>\n",
|
36833 |
+
" <td>0</td>\n",
|
36834 |
+
" <td>0</td>\n",
|
36835 |
+
" <td>0</td>\n",
|
36836 |
+
" </tr>\n",
|
36837 |
+
" <tr>\n",
|
36838 |
+
" <th>99</th>\n",
|
36839 |
+
" <td>0</td>\n",
|
36840 |
+
" <td>0</td>\n",
|
36841 |
+
" <td>0</td>\n",
|
36842 |
+
" <td>0</td>\n",
|
36843 |
+
" <td>0</td>\n",
|
36844 |
+
" <td>1</td>\n",
|
36845 |
+
" <td>0</td>\n",
|
36846 |
+
" <td>0</td>\n",
|
36847 |
+
" <td>0</td>\n",
|
36848 |
+
" <td>0</td>\n",
|
36849 |
+
" <td>...</td>\n",
|
36850 |
+
" <td>0</td>\n",
|
36851 |
+
" <td>0</td>\n",
|
36852 |
+
" <td>0</td>\n",
|
36853 |
+
" <td>0</td>\n",
|
36854 |
+
" <td>0</td>\n",
|
36855 |
+
" <td>0</td>\n",
|
36856 |
+
" <td>0</td>\n",
|
36857 |
+
" <td>0</td>\n",
|
36858 |
+
" <td>0</td>\n",
|
36859 |
+
" <td>0</td>\n",
|
36860 |
+
" </tr>\n",
|
36861 |
+
" </tbody>\n",
|
36862 |
+
"</table>\n",
|
36863 |
+
"<p>100 rows × 21 columns</p>\n",
|
36864 |
+
"</div>"
|
36865 |
+
],
|
36866 |
+
"text/plain": [
|
36867 |
+
" Materials Science Soft Condensed Matter Numerical Analysis \\\n",
|
36868 |
+
"0 0 0 0 \n",
|
36869 |
+
"1 0 0 0 \n",
|
36870 |
+
"2 0 0 0 \n",
|
36871 |
+
"3 0 0 0 \n",
|
36872 |
+
"4 0 0 0 \n",
|
36873 |
+
".. ... ... ... \n",
|
36874 |
+
"95 0 0 0 \n",
|
36875 |
+
"96 0 0 0 \n",
|
36876 |
+
"97 0 0 0 \n",
|
36877 |
+
"98 0 0 0 \n",
|
36878 |
+
"99 0 0 0 \n",
|
36879 |
+
"\n",
|
36880 |
+
" General Relativity and Quantum Cosmology Mathematical Physics \\\n",
|
36881 |
+
"0 1 1 \n",
|
36882 |
+
"1 0 0 \n",
|
36883 |
+
"2 0 0 \n",
|
36884 |
+
"3 0 0 \n",
|
36885 |
+
"4 0 0 \n",
|
36886 |
+
".. ... ... \n",
|
36887 |
+
"95 0 0 \n",
|
36888 |
+
"96 0 0 \n",
|
36889 |
+
"97 0 0 \n",
|
36890 |
+
"98 0 0 \n",
|
36891 |
+
"99 0 0 \n",
|
36892 |
+
"\n",
|
36893 |
+
" Analysis of PDEs Classical Analysis and ODEs Differential Geometry \\\n",
|
36894 |
+
"0 1 0 1 \n",
|
36895 |
+
"1 1 0 0 \n",
|
36896 |
+
"2 1 0 0 \n",
|
36897 |
+
"3 1 0 1 \n",
|
36898 |
+
"4 1 0 0 \n",
|
36899 |
+
".. ... ... ... \n",
|
36900 |
+
"95 1 0 1 \n",
|
36901 |
+
"96 1 0 1 \n",
|
36902 |
+
"97 1 0 0 \n",
|
36903 |
+
"98 1 0 0 \n",
|
36904 |
+
"99 1 0 0 \n",
|
36905 |
+
"\n",
|
36906 |
+
" Dynamical Systems Functional Analysis ... Optimization and Control \\\n",
|
36907 |
+
"0 0 0 ... 0 \n",
|
36908 |
+
"1 0 0 ... 0 \n",
|
36909 |
+
"2 0 0 ... 0 \n",
|
36910 |
+
"3 0 0 ... 0 \n",
|
36911 |
+
"4 1 0 ... 0 \n",
|
36912 |
+
".. ... ... ... ... \n",
|
36913 |
+
"95 0 0 ... 0 \n",
|
36914 |
+
"96 0 0 ... 0 \n",
|
36915 |
+
"97 0 0 ... 0 \n",
|
36916 |
+
"98 0 1 ... 0 \n",
|
36917 |
+
"99 0 0 ... 0 \n",
|
36918 |
+
"\n",
|
36919 |
+
" Probability Spectral Theory Pattern Formation and Solitons \\\n",
|
36920 |
+
"0 0 0 0 \n",
|
36921 |
+
"1 0 0 0 \n",
|
36922 |
+
"2 0 0 0 \n",
|
36923 |
+
"3 0 0 0 \n",
|
36924 |
+
"4 0 0 0 \n",
|
36925 |
+
".. ... ... ... \n",
|
36926 |
+
"95 0 0 0 \n",
|
36927 |
+
"96 0 0 0 \n",
|
36928 |
+
"97 0 0 0 \n",
|
36929 |
+
"98 0 0 0 \n",
|
36930 |
+
"99 0 0 0 \n",
|
36931 |
+
"\n",
|
36932 |
+
" Biological Physics Fluid Dynamics Optics Cell Behavior \\\n",
|
36933 |
+
"0 0 0 0 0 \n",
|
36934 |
+
"1 0 0 0 0 \n",
|
36935 |
+
"2 0 0 0 0 \n",
|
36936 |
+
"3 0 0 0 0 \n",
|
36937 |
+
"4 0 0 0 0 \n",
|
36938 |
+
".. ... ... ... ... \n",
|
36939 |
+
"95 0 0 0 0 \n",
|
36940 |
+
"96 0 0 0 0 \n",
|
36941 |
+
"97 0 0 0 0 \n",
|
36942 |
+
"98 0 0 0 0 \n",
|
36943 |
+
"99 0 0 0 0 \n",
|
36944 |
+
"\n",
|
36945 |
+
" Populations and Evolution Tissues and Organs \n",
|
36946 |
+
"0 0 0 \n",
|
36947 |
+
"1 0 0 \n",
|
36948 |
+
"2 0 0 \n",
|
36949 |
+
"3 0 0 \n",
|
36950 |
+
"4 0 0 \n",
|
36951 |
+
".. ... ... \n",
|
36952 |
+
"95 0 0 \n",
|
36953 |
+
"96 0 0 \n",
|
36954 |
+
"97 0 0 \n",
|
36955 |
+
"98 0 0 \n",
|
36956 |
+
"99 0 0 \n",
|
36957 |
+
"\n",
|
36958 |
+
"[100 rows x 21 columns]"
|
36959 |
+
]
|
36960 |
+
},
|
36961 |
+
"execution_count": 261,
|
36962 |
+
"metadata": {},
|
36963 |
+
"output_type": "execute_result"
|
36964 |
+
}
|
36965 |
+
],
|
36966 |
+
"source": [
|
36967 |
+
"data.arxiv_subjects"
|
36968 |
+
]
|
36969 |
+
},
|
36970 |
+
{
|
36971 |
+
"cell_type": "code",
|
36972 |
+
"execution_count": 156,
|
36973 |
"metadata": {},
|
36974 |
"outputs": [],
|
36975 |
+
"source": [
|
36976 |
+
"x = []\n",
|
36977 |
+
"\n",
|
36978 |
+
"if x:\n",
|
36979 |
+
" y = x"
|
36980 |
+
]
|
36981 |
+
},
|
36982 |
+
{
|
36983 |
+
"cell_type": "code",
|
36984 |
+
"execution_count": 157,
|
36985 |
+
"metadata": {},
|
36986 |
+
"outputs": [
|
36987 |
+
{
|
36988 |
+
"ename": "NameError",
|
36989 |
+
"evalue": "name 'y' is not defined",
|
36990 |
+
"output_type": "error",
|
36991 |
+
"traceback": [
|
36992 |
+
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
|
36993 |
+
"\u001b[1;31mNameError\u001b[0m Traceback (most recent call last)",
|
36994 |
+
"Cell \u001b[1;32mIn[157], line 1\u001b[0m\n\u001b[1;32m----> 1\u001b[0m y\n",
|
36995 |
+
"\u001b[1;31mNameError\u001b[0m: name 'y' is not defined"
|
36996 |
+
]
|
36997 |
+
}
|
36998 |
+
],
|
36999 |
+
"source": [
|
37000 |
+
"y"
|
37001 |
+
]
|
37002 |
}
|
37003 |
],
|
37004 |
"metadata": {
|
data_cleaning.py
CHANGED
@@ -2,6 +2,7 @@ import regex
|
|
2 |
import pandas as pd
|
3 |
import json
|
4 |
import sentence_transformers.util
|
|
|
5 |
import os
|
6 |
|
7 |
|
@@ -196,35 +197,25 @@ def category_map():
|
|
196 |
}
|
197 |
|
198 |
|
199 |
-
def
|
200 |
-
|
201 |
-
|
|
|
202 |
|
203 |
-
|
204 |
-
raw_metadata_row: row of a dataframe returned by an arXiv query request
|
205 |
|
206 |
-
Returns:
|
207 |
-
(x , y): x and y are lists; x is a list of arxiv subjects, y is a list of msc_tags.
|
208 |
-
"""
|
209 |
-
categories = raw_metadata_row.categories
|
210 |
-
expanded_categories = pd.Series(categories)
|
211 |
-
arxiv_subject_labels = category_map()
|
212 |
|
213 |
-
|
214 |
-
|
215 |
-
|
216 |
-
|
217 |
-
return (raw_metadata_row.categories[:-2], msc_tags)
|
218 |
|
|
|
219 |
|
220 |
-
|
221 |
-
split_categories = raw_metadata.apply(split_categories_by_row, axis=0)
|
222 |
|
223 |
-
flag = 1
|
224 |
-
if arxiv_tag:
|
225 |
-
flag = 0
|
226 |
|
227 |
-
|
228 |
|
229 |
|
230 |
## 1. Latin-ize latex accents enclosed in brackets
|
@@ -285,9 +276,9 @@ def find_hyph(text):
|
|
285 |
|
286 |
|
287 |
def find_msc(msc_string):
|
288 |
-
|
289 |
-
|
290 |
-
return
|
291 |
|
292 |
|
293 |
def msc_tags():
|
|
|
2 |
import pandas as pd
|
3 |
import json
|
4 |
import sentence_transformers.util
|
5 |
+
import numpy as np
|
6 |
import os
|
7 |
|
8 |
|
|
|
197 |
}
|
198 |
|
199 |
|
200 |
+
def extract_arxiv_subjects(raw_metadata):
    """Keep, for every row, only the category tags that are arXiv subject tags.

    Args:
        raw_metadata: dataframe with a ``categories`` column; each entry is
            an iterable of tag strings.

    Returns:
        A pandas Series, indexed like ``raw_metadata.categories``, whose
        entries are lists of the tags that appear as keys of
        ``category_map()``, in their original order.
    """
    # Build the tag lookup once rather than once per row.
    # NOTE(review): assumes category_map() returns the same mapping on every
    # call -- confirm against its definition.
    arxiv_subject_labels = category_map()

    def get_arxiv_subjects_from_cats(categories):
        return [tag for tag in categories if tag in arxiv_subject_labels]

    return raw_metadata.categories.apply(get_arxiv_subjects_from_cats)
|
|
|
206 |
|
|
|
|
|
|
|
|
|
|
|
|
|
207 |
|
208 |
+
def extract_msc_tags(raw_metadata):
    """Extract the five-character MSC tags listed with each row's categories.

    Args:
        raw_metadata: dataframe with a ``categories`` column; each entry is
            a sequence of category strings.

    Returns:
        A pandas Series, indexed like ``raw_metadata.categories``. Each entry
        is the list of tags returned by ``find_msc`` for the last category
        string, or ``np.nan`` when no tag was found.
    """

    def tags_from_categories(categories):
        # An empty category list has no last entry; report "no tags"
        # instead of raising IndexError.
        if len(categories) == 0:
            return np.nan

        ## Check the last entry for 5 digit msc tags only.
        tags = find_msc(categories[-1])
        return tags if tags else np.nan

    return raw_metadata.categories.apply(tags_from_categories)
|
|
|
216 |
|
|
|
|
|
|
|
217 |
|
218 |
+
#### LATEX CLEANING UTILITIES
|
219 |
|
220 |
|
221 |
## 1. Latin-ize latex accents enclosed in brackets
|
|
|
276 |
|
277 |
|
278 |
def find_msc(msc_string):
    """Return every five-character MSC-style tag found in ``msc_string``.

    A tag is a standalone word made of two digits followed by three
    alphanumeric characters, e.g. ``35J47``.

    Args:
        msc_string: text to scan.

    Returns:
        List of the matching substrings in order of appearance; empty when
        nothing matches.
    """
    return regex.findall(r"\b\d{2}[0-9a-zA-Z]{3}\b", msc_string)
|
282 |
|
283 |
|
284 |
def msc_tags():
|
data_storage.py
CHANGED
@@ -20,13 +20,19 @@ class ArXivData:
|
|
20 |
self.arxiv_subjects = self.get_OHE_arxiv_subjects(self._returned_metadata)
|
21 |
self.metadata = self._returned_metadata.drop(columns=["arxiv_subjects"])
|
22 |
|
23 |
-
def load_from_query(self,
|
24 |
-
|
25 |
-
|
26 |
-
|
|
|
27 |
|
28 |
-
|
29 |
-
|
|
|
|
|
|
|
|
|
|
|
30 |
|
31 |
def clean(self, dataset):
|
32 |
"""Constructs this dataset by cleaning another one.
|
@@ -39,15 +45,20 @@ class ArXivData:
|
|
39 |
self.raw = dataset.raw
|
40 |
self.categories = dataset.categories
|
41 |
|
42 |
-
def get_OHE_arxiv_subjects(returned_metadata):
|
43 |
mlb = MultiLabelBinarizer()
|
44 |
|
45 |
OHE_arxiv_subjects_array = mlb.fit_transform(returned_metadata.arxiv_subjects)
|
46 |
arxiv_subject_labels = clean.category_map()
|
47 |
|
48 |
-
|
49 |
-
columns=
|
50 |
-
)
|
|
|
|
|
|
|
|
|
|
|
51 |
|
52 |
|
53 |
def format_query(author="", title="", cat="", abstract=""):
|
@@ -72,7 +83,7 @@ def format_query(author="", title="", cat="", abstract=""):
|
|
72 |
return query
|
73 |
|
74 |
|
75 |
-
def query_to_df(query, max_results, offset):
|
76 |
"""Returns the results of an arxiv API query in a pandas dataframe.
|
77 |
|
78 |
Args:
|
@@ -116,9 +127,10 @@ def query_to_df(query, max_results, offset):
|
|
116 |
raw_metadata = pd.DataFrame(metadata_generator, columns=columns, index=index)
|
117 |
|
118 |
returned_metadata = raw_metadata.copy().drop(columns=["categories"])
|
119 |
-
returned_metadata["arxiv_subjects"] = clean.
|
120 |
-
|
121 |
-
|
122 |
-
|
|
|
123 |
|
124 |
return returned_metadata
|
|
|
20 |
self.arxiv_subjects = self.get_OHE_arxiv_subjects(self._returned_metadata)
|
21 |
self.metadata = self._returned_metadata.drop(columns=["arxiv_subjects"])
|
22 |
|
23 |
+
def load_from_query(self, query, max_results, offset=0, raw=False):
    """Populate this dataset from the results of an arXiv API query.

    Args:
        query: query string forwarded to ``query_to_df``.
        max_results: forwarded to ``query_to_df``.
        offset: forwarded to ``query_to_df``; defaults to 0.
        raw: when True, ask ``query_to_df`` for the unprocessed frame.

    Side effects:
        Sets ``self._returned_metadata``, ``self.metadata`` and
        ``self.arxiv_subjects``. In raw mode ``self.metadata`` is the raw
        frame itself and ``self.arxiv_subjects`` is ``None``.
    """
    self._returned_metadata = query_to_df(
        query=query, max_results=max_results, offset=offset, raw=raw
    )

    if raw:
        # The raw frame carries no "arxiv_subjects" column, so dropping that
        # column or one-hot encoding it would raise. Expose the raw frame
        # as-is and mark the derived subjects as unavailable.
        self.metadata = self._returned_metadata
        self.arxiv_subjects = None
        return

    self.metadata = self._returned_metadata.drop(columns="arxiv_subjects")
    self.arxiv_subjects = self.get_OHE_arxiv_subjects(self._returned_metadata)
|
36 |
|
37 |
def clean(self, dataset):
|
38 |
"""Constructs this dataset by cleaning another one.
|
|
|
45 |
self.raw = dataset.raw
|
46 |
self.categories = dataset.categories
|
47 |
|
48 |
+
def get_OHE_arxiv_subjects(self, returned_metadata):
    """One-hot encode the arXiv subject tags of every row.

    Args:
        returned_metadata: dataframe with an ``arxiv_subjects`` column whose
            entries are collections of subject tags.

    Returns:
        A new dataframe with one 0/1 column per subject. Columns are renamed
        through ``clean.category_map()``; when several columns end up with
        the same name only the first is kept.
    """
    binarizer = MultiLabelBinarizer()
    encoded = binarizer.fit_transform(returned_metadata.arxiv_subjects)

    # NOTE(review): the frame gets a default integer index, not the index of
    # returned_metadata -- confirm that callers do not rely on alignment.
    one_hot = pd.DataFrame(encoded, columns=binarizer.classes_)
    one_hot = one_hot.rename(columns=clean.category_map())

    ## Remove duplicated columns
    first_occurrence = ~one_hot.columns.duplicated()
    return one_hot.loc[:, first_occurrence].copy()
|
62 |
|
63 |
|
64 |
def format_query(author="", title="", cat="", abstract=""):
|
|
|
83 |
return query
|
84 |
|
85 |
|
86 |
+
def query_to_df(query, max_results, offset, raw=False):
|
87 |
"""Returns the results of an arxiv API query in a pandas dataframe.
|
88 |
|
89 |
Args:
|
|
|
127 |
raw_metadata = pd.DataFrame(metadata_generator, columns=columns, index=index)
|
128 |
|
129 |
returned_metadata = raw_metadata.copy().drop(columns=["categories"])
|
130 |
+
returned_metadata["arxiv_subjects"] = clean.extract_arxiv_subjects(raw_metadata)
|
131 |
+
returned_metadata["msc_tags"] = clean.extract_msc_tags(raw_metadata)
|
132 |
+
|
133 |
+
if raw:
|
134 |
+
return raw_metadata
|
135 |
|
136 |
return returned_metadata
|