Spaces:

mlgeis
/

ArXivRecommenderSystem

Runtime error

App Files Community

Michael-Geis committed on Jul 3, 2023

Commit

fcfd917

•

1 Parent(s): cbdef5e

updated load_from_query in data_storage, added to data cleaning

Browse files

Files changed (3) hide show

collection.ipynb +913 -92
data_cleaning.py +16 -25
data_storage.py +27 -15

collection.ipynb CHANGED Viewed

@@ -6,7 +6,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "from util import format_query , query_to_df\n",
     "import pandas as pd\n",
     "import numpy as np"
    ]
@@ -19,10 +19,10 @@
    "source": [
     "## Try collection data for pde articles\n",
     "\n",
-    "max_results=3e3\n",
-    "cat='math.AP'\n",
     "query = format_query(cat=cat)\n",
-    "pde = query_to_df(query=query,max_results=max_results)\n"
    ]
   },
   {
@@ -66,19 +66,18 @@
     "## Find the msc subject tags within the categories\n",
     "import regex\n",
     "\n",
     "def find_msc(cat_list):\n",
-    "    pattern = r'\\b\\d{2}[0-9a-zA-Z]{3}\\b'\n",
     "    out = []\n",
     "    for cat in cat_list:\n",
-    "        tags = regex.findall(pattern,cat)\n",
     "        for tag in tags:\n",
     "            out.append(tag)\n",
     "    if out == []:\n",
     "        return None\n",
     "    else:\n",
-    "        return out\n",
-    "\n",
-    "        "
    ]
   },
   {
@@ -89,7 +88,7 @@
    "source": [
     "## Now create a new column for msc tags\n",
     "\n",
-    "pde['msc_tags'] = pde.categories.apply(find_msc)"
    ]
   },
   {
@@ -119,7 +118,7 @@
     "\n",
     "msc = pde.msc_tags.sample(10)\n",
     "for tag in msc:\n",
-    "    print(tag)\n"
    ]
   },
   {
@@ -139,9 +138,8 @@
     "## what fraction of these articles has non-zero msc tags?\n",
     "\n",
     "tagged = pde.msc_tags.count()\n",
-    "fraction = tagged/len(pde)\n",
-    "print(fraction)\n",
-    "\n"
    ]
   },
   {
@@ -153,11 +151,11 @@
     "## Now we are going to see if we can extract the MSC codes using xml parsing with beautifulsoup\n",
     "\n",
     "from bs4 import BeautifulSoup\n",
-    "import requests \n",
     "\n",
-    "url = r'https://cran.r-project.org/web/classifications/MSC.html'\n",
     "\n",
-    "source = requests.get(url)\n"
    ]
   },
   {
@@ -177,7 +175,7 @@
     }
    ],
    "source": [
-    "source.headers['content-type']"
    ]
   },
   {
@@ -19291,7 +19289,7 @@
     }
    ],
    "source": [
-    "soup = BeautifulSoup(document, 'html.parser')\n",
     "print(soup.prettify())"
    ]
   },
@@ -19338,25 +19336,24 @@
     "import PyPDF2\n",
     "import regex\n",
     "\n",
-    "with open('msc2020.pdf', 'rb') as file:\n",
-    "\n",
     "    reader = PyPDF2.PdfReader(file)\n",
     "    print(len(reader.pages))\n",
     "    page = reader.pages[0]\n",
     "    raw_text = page.extract_text()\n",
     "\n",
-    "    lines = raw_text.split('\\n')\n",
     "\n",
     "    subject_dict = {}\n",
     "    for line in lines[2:]:\n",
     "        subject_dict[str(line[:2])] = line[2:]\n",
     "\n",
     "\n",
-    "subject_dict['44'] = 'Integral transforms, operational calculus'\n",
-    "subject_dict['45'] = 'Integral equations'\n",
     "\n",
     "for k in subject_dict.keys():\n",
-    "    subject_dict[k] = regex.sub(r'\\x0b','ff',subject_dict[k])\n",
     "\n",
     "print(subject_dict)"
    ]
@@ -19375,7 +19372,7 @@
     }
    ],
    "source": [
-    "subject_dict.pop('1')\n",
     "print(subject_dict)"
    ]
   },
@@ -19390,9 +19387,8 @@
     "import json\n",
     "\n",
     "json_subjects = json.dumps(subject_dict)\n",
-    "with open('./data/msc_subjects.json','w+') as file:\n",
-    "    file.write(json_subjects)\n",
-    "    "
    ]
   },
   {
@@ -19476,6 +19472,7 @@
    "source": [
     "import util\n",
     "import importlib\n",
     "importlib.reload(util)\n",
     "\n",
     "util.msc_subjects()"
@@ -19489,11 +19486,10 @@
    "source": [
     "## Next we make a dictionary consisting of all other subject tags\n",
     "\n",
-    "with open('msc2020.pdf', 'rb') as file:\n",
-    "\n",
     "    reader = PyPDF2.PdfReader(file)\n",
     "    page = reader.pages[3]\n",
-    "    raw_text = page.extract_text()\n"
    ]
   },
   {
@@ -19552,17 +19548,17 @@
    "source": [
     "## Try splitting on a pattern \\d\\d[A-Z]xx\n",
     "\n",
-    "pattern = r'\\b\\d\\d[A-Z]xx\\b'\n",
-    "splitting = regex.split(pattern,raw_text)\n",
     "# for line in splitting:\n",
-    "    # print(line + 'END')\n",
     "\n",
     "print(splitting[1])\n",
     "\n",
-    "## Within each of these, find all text between two instances of the pattern \n",
-    "tag_pattern = r'(\\b\\d\\d[A-Z]\\d\\d)\\s(.*)(?!\\b\\d\\d[A-Z]\\d\\d\\b)'\n",
     "\n",
-    "patterns = regex.findall(tag_pattern, splitting[1])\n"
    ]
   },
   {
@@ -19648,7 +19644,7 @@
     "## Turn this into a dict\n",
     "dict = {}\n",
     "for item in patterns:\n",
-    "    k , v = item\n",
     "    dict[k] = v\n",
     "\n",
     "for item in dict.items():\n",
@@ -19661,24 +19657,22 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "\n",
-    "## Within each of these, find all text between two instances of the pattern \n",
-    "tag_pattern = r'(\\b\\d\\d[A-Z]\\d\\d)\\s(.*)(?!\\b\\d\\d[A-Z]\\d\\d\\b)'\n",
     "dict = {}\n",
     "\n",
-    "with open('msc2020.pdf', 'rb') as file:\n",
-    "\n",
     "    reader = PyPDF2.PdfReader(file)\n",
     "    for page in reader.pages:\n",
     "        page_text = page.extract_text()\n",
     "\n",
-    "        ## Find all the msc tags \n",
     "\n",
     "        tags = regex.findall(tag_pattern, page_text)\n",
-    "        \n",
     "        for item in tags:\n",
-    "            k , v = item\n",
-    "            dict[k] = v\n"
    ]
   },
   {
@@ -20737,15 +20731,15 @@
    "source": [
     "def clean_msc_dict(dict):\n",
     "    for item in dict.items():\n",
-    "        k , v = item \n",
-    "        v = regex.sub(r'\\x0c','fi',v)\n",
-    "        v = regex.sub(r'\\x0b','ff',v)\n",
-    "        v = regex.sub(r'\\r','fl',v)\n",
-    "        v = regex.sub(r'\\xf7 ','',v)\n",
-    "        v = regex.sub(r'\\x0e','ffi',v)\n",
-    "        v = regex.sub(r'\\x13','',v)\n",
     "        dict[k] = v\n",
-    "        return dict\n"
    ]
   },
   {
@@ -31666,7 +31660,8 @@
    "outputs": [],
    "source": [
     "import pandas as pd\n",
-    "codes = pd.read_csv('./data/MSC_2020.csv', encoding='windows-1252', on_bad_lines='skip')"
    ]
   },
   {
@@ -31749,15 +31744,17 @@
    "source": [
     "## Look for all entries that start with a valid msc tag pattern\n",
     "\n",
-    "pattern = '\\d\\d[A-Z]\\d\\d'\n",
     "\n",
     "def check_valid(entry):\n",
-    "    if regex.match(pattern,entry):\n",
     "        return True\n",
     "    else:\n",
     "        return False\n",
     "\n",
-    "valid_codes = codes.loc[codes['code\\ttext\\tdescription'].apply(check_valid)]\n"
    ]
   },
   {
@@ -31873,11 +31870,11 @@
    "source": [
     "dict = {}\n",
     "\n",
-    "for entry in valid_codes['code\\ttext\\tdescription']:\n",
-    "    split = entry.split('\\t')\n",
     "    code = split[0]\n",
     "    desc = split[1][1:-1]\n",
-    "    dict[code] = desc\n"
    ]
   },
   {
@@ -33920,9 +33917,9 @@
     "## Good, but we can improve it by removing the '\\\\(' and '\\\\)' characters\n",
     "\n",
     "for item in dict.items():\n",
-    "    k , v = item\n",
-    "    v = v.replace('\\\\(','')\n",
-    "    v = v.replace('\\\\)','')\n",
     "    dict[k] = v\n",
     "\n",
     "dict"
@@ -33963,8 +33960,8 @@
     "dict_stripped_accents = {}\n",
     "\n",
     "for item in dict.items():\n",
-    "    k , v = item\n",
-    "    dict_stripped_accents[k] = unidecode(v)\n"
    ]
   },
   {
@@ -33973,7 +33970,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "with open('./data/msc.json','w') as file:\n",
     "    json = json.dumps(dict_stripped_accents)\n",
     "    file.write(json)"
    ]
@@ -35010,7 +35007,7 @@
     "\n",
     "lib = Library()\n",
     "\n",
-    "lib.load_from_query(query_string='cat:math.AP',max_results=5000)"
    ]
   },
   {
@@ -35386,6 +35383,7 @@
    "outputs": [],
    "source": [
     "import importlib\n",
     "importlib.reload(util)\n",
     "\n",
     "lib.clean_library()"
@@ -35659,9 +35657,7 @@
     "from library_class import Library\n",
     "\n",
     "lib = Library()\n",
-    "lib.load_from_query(query_string='cat:math.AP OR math.SP',max_results=2e4)\n",
-    "\n",
-    "\n"
    ]
   },
   {
@@ -35671,7 +35667,7 @@
    "outputs": [],
    "source": [
     "raw_lib = lib.raw_lib\n",
-    "raw_lib.to_parquet('./data/APSP.parquet')"
    ]
   },
   {
@@ -35682,7 +35678,7 @@
    "source": [
     "## Is the list information preserved?\n",
     "\n",
-    "df = pd.read_parquet('./data/APSP.parquet')"
    ]
   },
   {
@@ -35808,7 +35804,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "pd.set_option('display.max_colwidth', 0)"
    ]
   },
   {
@@ -35820,13 +35816,14 @@
     "from cleaning import cleaning\n",
     "import pandas as pd\n",
     "import importlib\n",
     "importlib.reload(cleaning)\n",
     "\n",
-    "data = pd.read_parquet('./data/APSP.parquet')\n",
     "\n",
     "clean_data = cleaning.main(\n",
-    "    raw_arxiv_results=data,path_to_embeddings='./data/APSP_mini_vec.parquet'\n",
-    ")\n"
    ]
   },
   {
@@ -35950,7 +35947,7 @@
     }
    ],
    "source": [
-    "pd.set_option('display.max_colwidth', 0)\n",
     "clean_data.head()"
    ]
   },
@@ -36094,6 +36091,7 @@
    "source": [
     "import data_storage\n",
     "import importlib\n",
     "importlib.reload(data_storage)\n",
     "\n",
     "\n",
@@ -36101,10 +36099,11 @@
     "\n",
     "max_results = 20000\n",
     "offset = 0\n",
-    "data.load_from_query(query_string='cat:math.AP',\n",
-    "                     max_results=max_results,\n",
-    "                     offset=offset,\n",
-    "                     )\n",
     "data.data"
    ]
   },
@@ -36123,16 +36122,20 @@
    ],
    "source": [
     "import arxiv\n",
-    "from datetime import datetime , timedelta , timezone\n",
     "\n",
     "\n",
-    "search = arxiv.Search(query='cat:math.AP', max_results=1e3,sort_by=arxiv.SortCriterion.LastUpdatedDate, sort_order=arxiv.SortOrder.Descending)\n",
     "\n",
     "for result in search.results():\n",
     "    if result.updated < datetime.now(timezone.utc) - timedelta(days=2):\n",
-    "        print(result.title,result.updated)\n",
-    "        break\n",
-    "\n"
    ]
   },
   {
@@ -36151,7 +36154,7 @@
    "source": [
     "##\n",
     "oldest = list(search.results())[-1]\n",
-    "print(oldest.updated)\n"
    ]
   },
   {
@@ -36174,10 +36177,828 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
    "metadata": {},
    "outputs": [],
-   "source": []
   }
  ],
  "metadata": {

    "metadata": {},
    "outputs": [],
    "source": [
+    "from util import format_query, query_to_df\n",
     "import pandas as pd\n",
     "import numpy as np"
    ]
    "source": [
     "## Try collection data for pde articles\n",
     "\n",
+    "max_results = 3e3\n",
+    "cat = \"math.AP\"\n",
     "query = format_query(cat=cat)\n",
+    "pde = query_to_df(query=query, max_results=max_results)"
    ]
   },
   {
     "## Find the msc subject tags within the categories\n",
     "import regex\n",
     "\n",
+    "\n",
     "def find_msc(cat_list):\n",
+    "    pattern = r\"\\b\\d{2}[0-9a-zA-Z]{3}\\b\"\n",
     "    out = []\n",
     "    for cat in cat_list:\n",
+    "        tags = regex.findall(pattern, cat)\n",
     "        for tag in tags:\n",
     "            out.append(tag)\n",
     "    if out == []:\n",
     "        return None\n",
     "    else:\n",
+    "        return out"
    ]
   },
   {
    "source": [
     "## Now create a new column for msc tags\n",
     "\n",
+    "pde[\"msc_tags\"] = pde.categories.apply(find_msc)"
    ]
   },
   {
     "\n",
     "msc = pde.msc_tags.sample(10)\n",
     "for tag in msc:\n",
+    "    print(tag)"
    ]
   },
   {
     "## what fraction of these articles has non-zero msc tags?\n",
     "\n",
     "tagged = pde.msc_tags.count()\n",
+    "fraction = tagged / len(pde)\n",
+    "print(fraction)"
    ]
   },
   {
     "## Now we are going to see if we can extract the MSC codes using xml parsing with beautifulsoup\n",
     "\n",
     "from bs4 import BeautifulSoup\n",
+    "import requests\n",
     "\n",
+    "url = r\"https://cran.r-project.org/web/classifications/MSC.html\"\n",
     "\n",
+    "source = requests.get(url)"
    ]
   },
   {
     }
    ],
    "source": [
+    "source.headers[\"content-type\"]"
    ]
   },
   {
     }
    ],
    "source": [
+    "soup = BeautifulSoup(document, \"html.parser\")\n",
     "print(soup.prettify())"
    ]
   },
     "import PyPDF2\n",
     "import regex\n",
     "\n",
+    "with open(\"msc2020.pdf\", \"rb\") as file:\n",
     "    reader = PyPDF2.PdfReader(file)\n",
     "    print(len(reader.pages))\n",
     "    page = reader.pages[0]\n",
     "    raw_text = page.extract_text()\n",
     "\n",
+    "    lines = raw_text.split(\"\\n\")\n",
     "\n",
     "    subject_dict = {}\n",
     "    for line in lines[2:]:\n",
     "        subject_dict[str(line[:2])] = line[2:]\n",
     "\n",
     "\n",
+    "subject_dict[\"44\"] = \"Integral transforms, operational calculus\"\n",
+    "subject_dict[\"45\"] = \"Integral equations\"\n",
     "\n",
     "for k in subject_dict.keys():\n",
+    "    subject_dict[k] = regex.sub(r\"\\x0b\", \"ff\", subject_dict[k])\n",
     "\n",
     "print(subject_dict)"
    ]
     }
    ],
    "source": [
+    "subject_dict.pop(\"1\")\n",
     "print(subject_dict)"
    ]
   },
     "import json\n",
     "\n",
     "json_subjects = json.dumps(subject_dict)\n",
+    "with open(\"./data/msc_subjects.json\", \"w+\") as file:\n",
+    "    file.write(json_subjects)"
    ]
   },
   {
    "source": [
     "import util\n",
     "import importlib\n",
+    "\n",
     "importlib.reload(util)\n",
     "\n",
     "util.msc_subjects()"
    "source": [
     "## Next we make a dictionary consisting of all other subject tags\n",
     "\n",
+    "with open(\"msc2020.pdf\", \"rb\") as file:\n",
     "    reader = PyPDF2.PdfReader(file)\n",
     "    page = reader.pages[3]\n",
+    "    raw_text = page.extract_text()"
    ]
   },
   {
    "source": [
     "## Try splitting on a pattern \\d\\d[A-Z]xx\n",
     "\n",
+    "pattern = r\"\\b\\d\\d[A-Z]xx\\b\"\n",
+    "splitting = regex.split(pattern, raw_text)\n",
     "# for line in splitting:\n",
+    "# print(line + 'END')\n",
     "\n",
     "print(splitting[1])\n",
     "\n",
+    "## Within each of these, find all text between two instances of the pattern\n",
+    "tag_pattern = r\"(\\b\\d\\d[A-Z]\\d\\d)\\s(.*)(?!\\b\\d\\d[A-Z]\\d\\d\\b)\"\n",
     "\n",
+    "patterns = regex.findall(tag_pattern, splitting[1])"
    ]
   },
   {
     "## Turn this into a dict\n",
     "dict = {}\n",
     "for item in patterns:\n",
+    "    k, v = item\n",
     "    dict[k] = v\n",
     "\n",
     "for item in dict.items():\n",
    "metadata": {},
    "outputs": [],
    "source": [
+    "## Within each of these, find all text between two instances of the pattern\n",
+    "tag_pattern = r\"(\\b\\d\\d[A-Z]\\d\\d)\\s(.*)(?!\\b\\d\\d[A-Z]\\d\\d\\b)\"\n",
     "dict = {}\n",
     "\n",
+    "with open(\"msc2020.pdf\", \"rb\") as file:\n",
     "    reader = PyPDF2.PdfReader(file)\n",
     "    for page in reader.pages:\n",
     "        page_text = page.extract_text()\n",
     "\n",
+    "        ## Find all the msc tags\n",
     "\n",
     "        tags = regex.findall(tag_pattern, page_text)\n",
+    "\n",
     "        for item in tags:\n",
+    "            k, v = item\n",
+    "            dict[k] = v"
    ]
   },
   {
    "source": [
     "def clean_msc_dict(dict):\n",
     "    for item in dict.items():\n",
+    "        k, v = item\n",
+    "        v = regex.sub(r\"\\x0c\", \"fi\", v)\n",
+    "        v = regex.sub(r\"\\x0b\", \"ff\", v)\n",
+    "        v = regex.sub(r\"\\r\", \"fl\", v)\n",
+    "        v = regex.sub(r\"\\xf7 \", \"\", v)\n",
+    "        v = regex.sub(r\"\\x0e\", \"ffi\", v)\n",
+    "        v = regex.sub(r\"\\x13\", \"\", v)\n",
     "        dict[k] = v\n",
+    "        return dict"
    ]
   },
   {
    "outputs": [],
    "source": [
     "import pandas as pd\n",
+    "\n",
+    "codes = pd.read_csv(\"./data/MSC_2020.csv\", encoding=\"windows-1252\", on_bad_lines=\"skip\")"
    ]
   },
   {
    "source": [
     "## Look for all entries that start with a valid msc tag pattern\n",
     "\n",
+    "pattern = \"\\d\\d[A-Z]\\d\\d\"\n",
+    "\n",
     "\n",
     "def check_valid(entry):\n",
+    "    if regex.match(pattern, entry):\n",
     "        return True\n",
     "    else:\n",
     "        return False\n",
     "\n",
+    "\n",
+    "valid_codes = codes.loc[codes[\"code\\ttext\\tdescription\"].apply(check_valid)]"
    ]
   },
   {
    "source": [
     "dict = {}\n",
     "\n",
+    "for entry in valid_codes[\"code\\ttext\\tdescription\"]:\n",
+    "    split = entry.split(\"\\t\")\n",
     "    code = split[0]\n",
     "    desc = split[1][1:-1]\n",
+    "    dict[code] = desc"
    ]
   },
   {
     "## Good, but we can improve it by removing the '\\\\(' and '\\\\)' characters\n",
     "\n",
     "for item in dict.items():\n",
+    "    k, v = item\n",
+    "    v = v.replace(\"\\\\(\", \"\")\n",
+    "    v = v.replace(\"\\\\)\", \"\")\n",
     "    dict[k] = v\n",
     "\n",
     "dict"
     "dict_stripped_accents = {}\n",
     "\n",
     "for item in dict.items():\n",
+    "    k, v = item\n",
+    "    dict_stripped_accents[k] = unidecode(v)"
    ]
   },
   {
    "metadata": {},
    "outputs": [],
    "source": [
+    "with open(\"./data/msc.json\", \"w\") as file:\n",
     "    json = json.dumps(dict_stripped_accents)\n",
     "    file.write(json)"
    ]
     "\n",
     "lib = Library()\n",
     "\n",
+    "lib.load_from_query(query_string=\"cat:math.AP\", max_results=5000)"
    ]
   },
   {
    "outputs": [],
    "source": [
     "import importlib\n",
+    "\n",
     "importlib.reload(util)\n",
     "\n",
     "lib.clean_library()"
     "from library_class import Library\n",
     "\n",
     "lib = Library()\n",
+    "lib.load_from_query(query_string=\"cat:math.AP OR math.SP\", max_results=2e4)"
    ]
   },
   {
    "outputs": [],
    "source": [
     "raw_lib = lib.raw_lib\n",
+    "raw_lib.to_parquet(\"./data/APSP.parquet\")"
    ]
   },
   {
    "source": [
     "## Is the list information preserved?\n",
     "\n",
+    "df = pd.read_parquet(\"./data/APSP.parquet\")"
    ]
   },
   {
    "metadata": {},
    "outputs": [],
    "source": [
+    "pd.set_option(\"display.max_colwidth\", 0)"
    ]
   },
   {
     "from cleaning import cleaning\n",
     "import pandas as pd\n",
     "import importlib\n",
+    "\n",
     "importlib.reload(cleaning)\n",
     "\n",
+    "data = pd.read_parquet(\"./data/APSP.parquet\")\n",
     "\n",
     "clean_data = cleaning.main(\n",
+    "    raw_arxiv_results=data, path_to_embeddings=\"./data/APSP_mini_vec.parquet\"\n",
+    ")"
    ]
   },
   {
     }
    ],
    "source": [
+    "pd.set_option(\"display.max_colwidth\", 0)\n",
     "clean_data.head()"
    ]
   },
    "source": [
     "import data_storage\n",
     "import importlib\n",
+    "\n",
     "importlib.reload(data_storage)\n",
     "\n",
     "\n",
     "\n",
     "max_results = 20000\n",
     "offset = 0\n",
+    "data.load_from_query(\n",
+    "    query_string=\"cat:math.AP\",\n",
+    "    max_results=max_results,\n",
+    "    offset=offset,\n",
+    ")\n",
     "data.data"
    ]
   },
    ],
    "source": [
     "import arxiv\n",
+    "from datetime import datetime, timedelta, timezone\n",
     "\n",
     "\n",
+    "search = arxiv.Search(\n",
+    "    query=\"cat:math.AP\",\n",
+    "    max_results=1e3,\n",
+    "    sort_by=arxiv.SortCriterion.LastUpdatedDate,\n",
+    "    sort_order=arxiv.SortOrder.Descending,\n",
+    ")\n",
     "\n",
     "for result in search.results():\n",
     "    if result.updated < datetime.now(timezone.utc) - timedelta(days=2):\n",
+    "        print(result.title, result.updated)\n",
+    "        break"
    ]
   },
   {
    "source": [
     "##\n",
     "oldest = list(search.results())[-1]\n",
+    "print(oldest.updated)"
    ]
   },
   {
   },
   {
    "cell_type": "code",
+   "execution_count": 256,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import data_storage\n",
+    "import data_cleaning\n",
+    "from data_storage import ArXivData\n",
+    "import importlib\n",
+    "\n",
+    "importlib.reload(data_storage)\n",
+    "importlib.reload(data_cleaning)\n",
+    "import numpy as np"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 257,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "data = ArXivData()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 258,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "data.load_from_query(query=\"cat:math.AP\", max_results=100, raw=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 259,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>title</th>\n",
+       "      <th>summary</th>\n",
+       "      <th>categories</th>\n",
+       "      <th>id</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>Future stability of expanding spatially homoge...</td>\n",
+       "      <td>Spatially homogeneous FLRW solutions constitut...</td>\n",
+       "      <td>[gr-qc, math-ph, math.AP, math.DG, math.MP]</td>\n",
+       "      <td>2306.17774v1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>Autonomous and asymptotically quasiconvex func...</td>\n",
+       "      <td>We obtain local regularity for minimizers of a...</td>\n",
+       "      <td>[math.AP, 35J47, 35B65, 46E30]</td>\n",
+       "      <td>2306.17768v1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>A Coefficient Inverse Problem for the Mean Fie...</td>\n",
+       "      <td>A Coefficient Inverse Problem (CIP) of the det...</td>\n",
+       "      <td>[math.AP]</td>\n",
+       "      <td>2306.03349v2</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>Nonuniqueness results for constant sixth order...</td>\n",
+       "      <td>We prove nonuniqueness results for constant si...</td>\n",
+       "      <td>[math.DG, math.AP, 35J60, 35B09, 35J30, 35B40,...</td>\n",
+       "      <td>2306.00679v2</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>Asymptotic limits of the principal spectrum po...</td>\n",
+       "      <td>This work examines the limits of the principal...</td>\n",
+       "      <td>[math.AP, math.DS, 92D40, 92D50, 35P15, 35K57]</td>\n",
+       "      <td>2306.17734v1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>...</th>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>95</th>\n",
+       "      <td>Quantization of the Energy for the inhomogeneo...</td>\n",
+       "      <td>We consider the varifold associated to the All...</td>\n",
+       "      <td>[math.DG, math.AP, 53E99]</td>\n",
+       "      <td>2302.00137v2</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>96</th>\n",
+       "      <td>Second order estimates for transition layers a...</td>\n",
+       "      <td>The parabolic Allen-Cahn equation is a semilin...</td>\n",
+       "      <td>[math.DG, math.AP, 53E99]</td>\n",
+       "      <td>2003.11886v3</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>97</th>\n",
+       "      <td>Well-Posedness and Stability Analysis of an Ep...</td>\n",
+       "      <td>A compartment epidemic model for infectious di...</td>\n",
+       "      <td>[math.AP]</td>\n",
+       "      <td>2212.10137v2</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>98</th>\n",
+       "      <td>Multiple positive solutions for a double phase...</td>\n",
+       "      <td>In this paper, we study a class of double phas...</td>\n",
+       "      <td>[math.AP, math.FA, 05J50, 03H10, 35D30]</td>\n",
+       "      <td>2306.01319v2</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>99</th>\n",
+       "      <td>Stabilization of the wave equation on larger-d...</td>\n",
+       "      <td>This paper deals with uniform stabilization of...</td>\n",
+       "      <td>[math.AP, 93C20 (Primary) 35A27 (Secondary)]</td>\n",
+       "      <td>2303.03733v3</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>100 rows × 4 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                                                title  \\\n",
+       "0   Future stability of expanding spatially homoge...   \n",
+       "1   Autonomous and asymptotically quasiconvex func...   \n",
+       "2   A Coefficient Inverse Problem for the Mean Fie...   \n",
+       "3   Nonuniqueness results for constant sixth order...   \n",
+       "4   Asymptotic limits of the principal spectrum po...   \n",
+       "..                                                ...   \n",
+       "95  Quantization of the Energy for the inhomogeneo...   \n",
+       "96  Second order estimates for transition layers a...   \n",
+       "97  Well-Posedness and Stability Analysis of an Ep...   \n",
+       "98  Multiple positive solutions for a double phase...   \n",
+       "99  Stabilization of the wave equation on larger-d...   \n",
+       "\n",
+       "                                              summary  \\\n",
+       "0   Spatially homogeneous FLRW solutions constitut...   \n",
+       "1   We obtain local regularity for minimizers of a...   \n",
+       "2   A Coefficient Inverse Problem (CIP) of the det...   \n",
+       "3   We prove nonuniqueness results for constant si...   \n",
+       "4   This work examines the limits of the principal...   \n",
+       "..                                                ...   \n",
+       "95  We consider the varifold associated to the All...   \n",
+       "96  The parabolic Allen-Cahn equation is a semilin...   \n",
+       "97  A compartment epidemic model for infectious di...   \n",
+       "98  In this paper, we study a class of double phas...   \n",
+       "99  This paper deals with uniform stabilization of...   \n",
+       "\n",
+       "                                           categories            id  \n",
+       "0         [gr-qc, math-ph, math.AP, math.DG, math.MP]  2306.17774v1  \n",
+       "1                      [math.AP, 35J47, 35B65, 46E30]  2306.17768v1  \n",
+       "2                                           [math.AP]  2306.03349v2  \n",
+       "3   [math.DG, math.AP, 35J60, 35B09, 35J30, 35B40,...  2306.00679v2  \n",
+       "4      [math.AP, math.DS, 92D40, 92D50, 35P15, 35K57]  2306.17734v1  \n",
+       "..                                                ...           ...  \n",
+       "95                          [math.DG, math.AP, 53E99]  2302.00137v2  \n",
+       "96                          [math.DG, math.AP, 53E99]  2003.11886v3  \n",
+       "97                                          [math.AP]  2212.10137v2  \n",
+       "98            [math.AP, math.FA, 05J50, 03H10, 35D30]  2306.01319v2  \n",
+       "99       [math.AP, 93C20 (Primary) 35A27 (Secondary)]  2303.03733v3  \n",
+       "\n",
+       "[100 rows x 4 columns]"
+      ]
+     },
+     "execution_count": 259,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "data._returned_metadata"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 260,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>title</th>\n",
+       "      <th>summary</th>\n",
+       "      <th>id</th>\n",
+       "      <th>msc_tags</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>Future stability of expanding spatially homoge...</td>\n",
+       "      <td>Spatially homogeneous FLRW solutions constitut...</td>\n",
+       "      <td>2306.17774v1</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>Autonomous and asymptotically quasiconvex func...</td>\n",
+       "      <td>We obtain local regularity for minimizers of a...</td>\n",
+       "      <td>2306.17768v1</td>\n",
+       "      <td>[35J47, 35B65, 46E30]</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>A Coefficient Inverse Problem for the Mean Fie...</td>\n",
+       "      <td>A Coefficient Inverse Problem (CIP) of the det...</td>\n",
+       "      <td>2306.03349v2</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>Nonuniqueness results for constant sixth order...</td>\n",
+       "      <td>We prove nonuniqueness results for constant si...</td>\n",
+       "      <td>2306.00679v2</td>\n",
+       "      <td>[35J60, 35B09, 35J30, 35B40, 53C18, 34C23, 58J55]</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>Asymptotic limits of the principal spectrum po...</td>\n",
+       "      <td>This work examines the limits of the principal...</td>\n",
+       "      <td>2306.17734v1</td>\n",
+       "      <td>[92D40, 92D50, 35P15, 35K57]</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>...</th>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>95</th>\n",
+       "      <td>Quantization of the Energy for the inhomogeneo...</td>\n",
+       "      <td>We consider the varifold associated to the All...</td>\n",
+       "      <td>2302.00137v2</td>\n",
+       "      <td>[53E99]</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>96</th>\n",
+       "      <td>Second order estimates for transition layers a...</td>\n",
+       "      <td>The parabolic Allen-Cahn equation is a semilin...</td>\n",
+       "      <td>2003.11886v3</td>\n",
+       "      <td>[53E99]</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>97</th>\n",
+       "      <td>Well-Posedness and Stability Analysis of an Ep...</td>\n",
+       "      <td>A compartment epidemic model for infectious di...</td>\n",
+       "      <td>2212.10137v2</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>98</th>\n",
+       "      <td>Multiple positive solutions for a double phase...</td>\n",
+       "      <td>In this paper, we study a class of double phas...</td>\n",
+       "      <td>2306.01319v2</td>\n",
+       "      <td>[05J50, 03H10, 35D30]</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>99</th>\n",
+       "      <td>Stabilization of the wave equation on larger-d...</td>\n",
+       "      <td>This paper deals with uniform stabilization of...</td>\n",
+       "      <td>2303.03733v3</td>\n",
+       "      <td>[93C20, 35A27]</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>100 rows × 4 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                                                title  \\\n",
+       "0   Future stability of expanding spatially homoge...   \n",
+       "1   Autonomous and asymptotically quasiconvex func...   \n",
+       "2   A Coefficient Inverse Problem for the Mean Fie...   \n",
+       "3   Nonuniqueness results for constant sixth order...   \n",
+       "4   Asymptotic limits of the principal spectrum po...   \n",
+       "..                                                ...   \n",
+       "95  Quantization of the Energy for the inhomogeneo...   \n",
+       "96  Second order estimates for transition layers a...   \n",
+       "97  Well-Posedness and Stability Analysis of an Ep...   \n",
+       "98  Multiple positive solutions for a double phase...   \n",
+       "99  Stabilization of the wave equation on larger-d...   \n",
+       "\n",
+       "                                              summary            id  \\\n",
+       "0   Spatially homogeneous FLRW solutions constitut...  2306.17774v1   \n",
+       "1   We obtain local regularity for minimizers of a...  2306.17768v1   \n",
+       "2   A Coefficient Inverse Problem (CIP) of the det...  2306.03349v2   \n",
+       "3   We prove nonuniqueness results for constant si...  2306.00679v2   \n",
+       "4   This work examines the limits of the principal...  2306.17734v1   \n",
+       "..                                                ...           ...   \n",
+       "95  We consider the varifold associated to the All...  2302.00137v2   \n",
+       "96  The parabolic Allen-Cahn equation is a semilin...  2003.11886v3   \n",
+       "97  A compartment epidemic model for infectious di...  2212.10137v2   \n",
+       "98  In this paper, we study a class of double phas...  2306.01319v2   \n",
+       "99  This paper deals with uniform stabilization of...  2303.03733v3   \n",
+       "\n",
+       "                                             msc_tags  \n",
+       "0                                                 NaN  \n",
+       "1                               [35J47, 35B65, 46E30]  \n",
+       "2                                                 NaN  \n",
+       "3   [35J60, 35B09, 35J30, 35B40, 53C18, 34C23, 58J55]  \n",
+       "4                        [92D40, 92D50, 35P15, 35K57]  \n",
+       "..                                                ...  \n",
+       "95                                            [53E99]  \n",
+       "96                                            [53E99]  \n",
+       "97                                                NaN  \n",
+       "98                              [05J50, 03H10, 35D30]  \n",
+       "99                                     [93C20, 35A27]  \n",
+       "\n",
+       "[100 rows x 4 columns]"
+      ]
+     },
+     "execution_count": 260,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "data.load_from_query(query=\"cat:math.AP\", max_results=100)\n",
+    "data.metadata"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 261,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>Materials Science</th>\n",
+       "      <th>Soft Condensed Matter</th>\n",
+       "      <th>Numerical Analysis</th>\n",
+       "      <th>General Relativity and Quantum Cosmology</th>\n",
+       "      <th>Mathematical Physics</th>\n",
+       "      <th>Analysis of PDEs</th>\n",
+       "      <th>Classical Analysis and ODEs</th>\n",
+       "      <th>Differential Geometry</th>\n",
+       "      <th>Dynamical Systems</th>\n",
+       "      <th>Functional Analysis</th>\n",
+       "      <th>...</th>\n",
+       "      <th>Optimization and Control</th>\n",
+       "      <th>Probability</th>\n",
+       "      <th>Spectral Theory</th>\n",
+       "      <th>Pattern Formation and Solitons</th>\n",
+       "      <th>Biological Physics</th>\n",
+       "      <th>Fluid Dynamics</th>\n",
+       "      <th>Optics</th>\n",
+       "      <th>Cell Behavior</th>\n",
+       "      <th>Populations and Evolution</th>\n",
+       "      <th>Tissues and Organs</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>1</td>\n",
+       "      <td>1</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>...</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>...</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>...</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>...</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "      <td>...</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>...</th>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>95</th>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>...</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>96</th>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>...</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>97</th>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>...</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>98</th>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>1</td>\n",
+       "      <td>...</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>99</th>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>...</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>100 rows × 21 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "    Materials Science  Soft Condensed Matter  Numerical Analysis  \\\n",
+       "0                   0                      0                   0   \n",
+       "1                   0                      0                   0   \n",
+       "2                   0                      0                   0   \n",
+       "3                   0                      0                   0   \n",
+       "4                   0                      0                   0   \n",
+       "..                ...                    ...                 ...   \n",
+       "95                  0                      0                   0   \n",
+       "96                  0                      0                   0   \n",
+       "97                  0                      0                   0   \n",
+       "98                  0                      0                   0   \n",
+       "99                  0                      0                   0   \n",
+       "\n",
+       "    General Relativity and Quantum Cosmology  Mathematical Physics  \\\n",
+       "0                                          1                     1   \n",
+       "1                                          0                     0   \n",
+       "2                                          0                     0   \n",
+       "3                                          0                     0   \n",
+       "4                                          0                     0   \n",
+       "..                                       ...                   ...   \n",
+       "95                                         0                     0   \n",
+       "96                                         0                     0   \n",
+       "97                                         0                     0   \n",
+       "98                                         0                     0   \n",
+       "99                                         0                     0   \n",
+       "\n",
+       "    Analysis of PDEs  Classical Analysis and ODEs  Differential Geometry  \\\n",
+       "0                  1                            0                      1   \n",
+       "1                  1                            0                      0   \n",
+       "2                  1                            0                      0   \n",
+       "3                  1                            0                      1   \n",
+       "4                  1                            0                      0   \n",
+       "..               ...                          ...                    ...   \n",
+       "95                 1                            0                      1   \n",
+       "96                 1                            0                      1   \n",
+       "97                 1                            0                      0   \n",
+       "98                 1                            0                      0   \n",
+       "99                 1                            0                      0   \n",
+       "\n",
+       "    Dynamical Systems  Functional Analysis  ...  Optimization and Control  \\\n",
+       "0                   0                    0  ...                         0   \n",
+       "1                   0                    0  ...                         0   \n",
+       "2                   0                    0  ...                         0   \n",
+       "3                   0                    0  ...                         0   \n",
+       "4                   1                    0  ...                         0   \n",
+       "..                ...                  ...  ...                       ...   \n",
+       "95                  0                    0  ...                         0   \n",
+       "96                  0                    0  ...                         0   \n",
+       "97                  0                    0  ...                         0   \n",
+       "98                  0                    1  ...                         0   \n",
+       "99                  0                    0  ...                         0   \n",
+       "\n",
+       "    Probability  Spectral Theory  Pattern Formation and Solitons  \\\n",
+       "0             0                0                               0   \n",
+       "1             0                0                               0   \n",
+       "2             0                0                               0   \n",
+       "3             0                0                               0   \n",
+       "4             0                0                               0   \n",
+       "..          ...              ...                             ...   \n",
+       "95            0                0                               0   \n",
+       "96            0                0                               0   \n",
+       "97            0                0                               0   \n",
+       "98            0                0                               0   \n",
+       "99            0                0                               0   \n",
+       "\n",
+       "    Biological Physics  Fluid Dynamics  Optics  Cell Behavior  \\\n",
+       "0                    0               0       0              0   \n",
+       "1                    0               0       0              0   \n",
+       "2                    0               0       0              0   \n",
+       "3                    0               0       0              0   \n",
+       "4                    0               0       0              0   \n",
+       "..                 ...             ...     ...            ...   \n",
+       "95                   0               0       0              0   \n",
+       "96                   0               0       0              0   \n",
+       "97                   0               0       0              0   \n",
+       "98                   0               0       0              0   \n",
+       "99                   0               0       0              0   \n",
+       "\n",
+       "    Populations and Evolution  Tissues and Organs  \n",
+       "0                           0                   0  \n",
+       "1                           0                   0  \n",
+       "2                           0                   0  \n",
+       "3                           0                   0  \n",
+       "4                           0                   0  \n",
+       "..                        ...                 ...  \n",
+       "95                          0                   0  \n",
+       "96                          0                   0  \n",
+       "97                          0                   0  \n",
+       "98                          0                   0  \n",
+       "99                          0                   0  \n",
+       "\n",
+       "[100 rows x 21 columns]"
+      ]
+     },
+     "execution_count": 261,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "data.arxiv_subjects"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 156,
    "metadata": {},
    "outputs": [],
+   "source": [
+    "x = []\n",
+    "\n",
+    "if x:\n",
+    "    y = x"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 157,
+   "metadata": {},
+   "outputs": [
+    {
+     "ename": "NameError",
+     "evalue": "name 'y' is not defined",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[1;31mNameError\u001b[0m                                 Traceback (most recent call last)",
+      "Cell \u001b[1;32mIn[157], line 1\u001b[0m\n\u001b[1;32m----> 1\u001b[0m y\n",
+      "\u001b[1;31mNameError\u001b[0m: name 'y' is not defined"
+     ]
+    }
+   ],
+   "source": [
+    "y"
+   ]
   }
  ],
  "metadata": {

data_cleaning.py CHANGED Viewed

@@ -2,6 +2,7 @@ import regex
 import pandas as pd
 import json
 import sentence_transformers.util
 import os
@@ -196,35 +197,25 @@ def category_map():
     }
-def split_categories_by_row(raw_metadata_row):
-    """Takes in row of a dataframe returned by an arxiv query search, returns a tuple with the list
-    of arXiv subject tags in the first slot, msc_tags in the second slot.
-    Args:
-        raw_metadata_row: row of a dataframe returned by an arXiv query request
-    Returns:
-        (x , y): x and y are lists; x is a list of arxiv subjects, y is a list of msc_tags.
-    """
-    categories = raw_metadata_row.categories
-    expanded_categories = pd.Series(categories)
-    arxiv_subject_labels = category_map()
-    if expanded_categories.isin(arxiv_subject_labels.keys()).all():
-        return (raw_metadata_row.categories, None)
-    else:
-        msc_tags = find_msc(raw_metadata_row.categories[-1])
-        return (raw_metadata_row.categories[:-2], msc_tags)
-def extract_tags(raw_metadata, arxiv_tag):
-    split_categories = raw_metadata.apply(split_categories_by_row, axis=0)
-    flag = 1
-    if arxiv_tag:
-        flag = 0
-    return split_categories.apply(lambda x: x[flag])
 ## 1. Latin-ize latex accents enclosed in brackets
@@ -285,9 +276,9 @@ def find_hyph(text):
 def find_msc(msc_string):
-    pattern = r"\b\d{2}[0-9a-zA-Z]{3}\b"
-    tags = regex.findall(pattern, msc_string)
-    return tags
 def msc_tags():

 import pandas as pd
 import json
 import sentence_transformers.util
+import numpy as np
 import os
     }
+def extract_arxiv_subjects(raw_metadata):
+    def get_arxiv_subjects_from_cats(categories):
+        arxiv_subject_labels = category_map()
+        return [tag for tag in categories if tag in arxiv_subject_labels.keys()]
+    return raw_metadata.categories.apply(get_arxiv_subjects_from_cats)
+def extract_msc_tags(raw_metadata):
+    ## Check the last entry for 5 digit msc tags only.
+    msc_tags = raw_metadata.categories.apply(lambda x: find_msc(x[-1]))
+    msc_tags = msc_tags.apply(lambda x: np.nan if len(x) == 0 else x)
+    return msc_tags
+#### LATEX CLEANING UTILITIES
 ## 1. Latin-ize latex accents enclosed in brackets
 def find_msc(msc_string):
+    five_digit_pattern = r"\b\d{2}[0-9a-zA-Z]{3}\b"
+    five_digit_tags = regex.findall(five_digit_pattern, msc_string)
+    return five_digit_tags
 def msc_tags():

data_storage.py CHANGED Viewed

@@ -20,13 +20,19 @@ class ArXivData:
         self.arxiv_subjects = self.get_OHE_arxiv_subjects(self._returned_metadata)
         self.metadata = self._returned_metadata.drop(columns=["arxiv_subjects"])
-    def load_from_query(self, query_string, max_results, offset=0):
-        self._returned_metadata = query_to_df(
-            query=query_string, max_results=max_results, offset=offset
-        )
-        self.metadata = self._returned_metadata.drop(columns="arxiv_subjects")
-        self.arxiv_subjects = self.get_OHE_arxiv_subjects(self._returned_metadata)
     def clean(self, dataset):
         """Constructs this dataset by cleaning another one.
@@ -39,15 +45,20 @@ class ArXivData:
         self.raw = dataset.raw
         self.categories = dataset.categories
-    def get_OHE_arxiv_subjects(returned_metadata):
         mlb = MultiLabelBinarizer()
         OHE_arxiv_subjects_array = mlb.fit_transform(returned_metadata.arxiv_subjects)
         arxiv_subject_labels = clean.category_map()
-        return pd.DataFrame(OHE_arxiv_subjects_array, columns=mlb.classes_).rename(
-            columns=arxiv_subject_labels
-        )
 def format_query(author="", title="", cat="", abstract=""):
@@ -72,7 +83,7 @@ def format_query(author="", title="", cat="", abstract=""):
     return query
-def query_to_df(query, max_results, offset):
     """Returns the results of an arxiv API query in a pandas dataframe.
     Args:
@@ -116,9 +127,10 @@ def query_to_df(query, max_results, offset):
     raw_metadata = pd.DataFrame(metadata_generator, columns=columns, index=index)
     returned_metadata = raw_metadata.copy().drop(columns=["categories"])
-    returned_metadata["arxiv_subjects"] = clean.extract_tags(
-        raw_metadata, arxiv_tag=True
-    )
-    returned_metadata["msc_tags"] = clean.extract_tags(raw_metadata, arxiv_tag=False)
     return returned_metadata

         self.arxiv_subjects = self.get_OHE_arxiv_subjects(self._returned_metadata)
         self.metadata = self._returned_metadata.drop(columns=["arxiv_subjects"])
+    def load_from_query(self, query, max_results, offset=0, raw=False):
+        if raw:
+            self._returned_metadata = query_to_df(
+                query=query, max_results=max_results, offset=offset, raw=True
+            )
+        else:
+            self._returned_metadata = query_to_df(
+                query=query, max_results=max_results, offset=offset
+            )
+            self.metadata = self._returned_metadata.drop(columns="arxiv_subjects")
+            self.arxiv_subjects = self.get_OHE_arxiv_subjects(self._returned_metadata)
     def clean(self, dataset):
         """Constructs this dataset by cleaning another one.
         self.raw = dataset.raw
         self.categories = dataset.categories
+    def get_OHE_arxiv_subjects(self, returned_metadata):
         mlb = MultiLabelBinarizer()
         OHE_arxiv_subjects_array = mlb.fit_transform(returned_metadata.arxiv_subjects)
         arxiv_subject_labels = clean.category_map()
+        OHE_arxiv_subjects = pd.DataFrame(
+            OHE_arxiv_subjects_array, columns=mlb.classes_
+        ).rename(columns=arxiv_subject_labels)
+        ## Remove duplicated columns
+        return OHE_arxiv_subjects.loc[
+            :, ~OHE_arxiv_subjects.columns.duplicated()
+        ].copy()
 def format_query(author="", title="", cat="", abstract=""):
     return query
+def query_to_df(query, max_results, offset, raw=False):
     """Returns the results of an arxiv API query in a pandas dataframe.
     Args:
     raw_metadata = pd.DataFrame(metadata_generator, columns=columns, index=index)
     returned_metadata = raw_metadata.copy().drop(columns=["categories"])
+    returned_metadata["arxiv_subjects"] = clean.extract_arxiv_subjects(raw_metadata)
+    returned_metadata["msc_tags"] = clean.extract_msc_tags(raw_metadata)
+    if raw:
+        return raw_metadata
     return returned_metadata