{ "cells": [ { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "2023-06-24 16:49:13.031488: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n", "To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n" ] } ], "source": [ "import pandas as pd\n", "import numpy\n", "from matplotlib import pyplot as plt\n", "from typing import List, Dict\n", "from collections import Counter\n", "from pprint import pprint\n", "\n", "import seaborn as sns\n", "sns.set_style(\"darkgrid\")\n", "sns.set_palette(\"mako\")\n", "\n", "import spacy\n", "from spacy.lang.en import English\n", "from nltk.corpus import stopwords\n", "\n", "nlp = spacy.load('en_core_web_sm')\n", "\n", "pd.set_option('display.float_format', '{:.2f}'.format)\n" ] }, { "cell_type": "code", "execution_count": 32, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "spacy.lang.en.English" ] }, "execution_count": 32, "metadata": {}, "output_type": "execute_result" } ], "source": [ "type(nlp)" ] }, { "cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [], "source": [ "df_raw = pd.read_parquet(\"/Users/luis.morales/Desktop/arxiv-paper-recommender/data/processed/arxiv_papers_raw.parquet.gzip\")" ] }, { "cell_type": "code", "execution_count": 31, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idsubmitterauthorstitlecommentsjournal-refdoireport-nocategorieslicenseabstractversionsupdate_dateauthors_parsed
00704.0001Pavel NadolskyC. Bal\\'azs, E. L. Berger, P. M. Nadolsky, C.-...Calculation of prompt diphoton production cros...37 pages, 15 figures; published versionPhys.Rev.D76:013009,200710.1103/PhysRevD.76.013009ANL-HEP-PR-07-12hep-phNoneA fully differential calculation in perturba...[{'created': 'Mon, 2 Apr 2007 19:18:42 GMT', '...2008-11-26[[Balázs, C., ], [Berger, E. L., ], [Nadolsky,...
10704.0002Louis TheranIleana Streinu and Louis TheranSparsity-certifying Graph DecompositionsTo appear in Graphs and CombinatoricsNoneNoneNonemath.CO cs.CGhttp://arxiv.org/licenses/nonexclusive-distrib...We describe a new algorithm, the $(k,\\ell)$-...[{'created': 'Sat, 31 Mar 2007 02:26:18 GMT', ...2008-12-13[[Streinu, Ileana, ], [Theran, Louis, ]]
20704.0003Hongjun PanHongjun PanThe evolution of the Earth-Moon system based o...23 pages, 3 figuresNoneNoneNonephysics.gen-phNoneThe evolution of Earth-Moon system is descri...[{'created': 'Sun, 1 Apr 2007 20:46:54 GMT', '...2008-01-13[[Pan, Hongjun, ]]
30704.0004David CallanDavid CallanA determinant of Stirling cycle numbers counts...11 pagesNoneNoneNonemath.CONoneWe show that a determinant of Stirling cycle...[{'created': 'Sat, 31 Mar 2007 03:16:14 GMT', ...2007-05-23[[Callan, David, ]]
40704.0005Alberto TorchinskyWael Abu-Shammala and Alberto TorchinskyFrom dyadic $\\Lambda_{\\alpha}$ to $\\Lambda_{\\a...NoneIllinois J. Math. 52 (2008) no.2, 681-689NoneNonemath.CA math.FANoneIn this paper we show how to compute the $\\L...[{'created': 'Mon, 2 Apr 2007 18:09:58 GMT', '...2013-10-15[[Abu-Shammala, Wael, ], [Torchinsky, Alberto, ]]
\n", "
" ], "text/plain": [ " id submitter \n", "0 0704.0001 Pavel Nadolsky \\\n", "1 0704.0002 Louis Theran \n", "2 0704.0003 Hongjun Pan \n", "3 0704.0004 David Callan \n", "4 0704.0005 Alberto Torchinsky \n", "\n", " authors \n", "0 C. Bal\\'azs, E. L. Berger, P. M. Nadolsky, C.-... \\\n", "1 Ileana Streinu and Louis Theran \n", "2 Hongjun Pan \n", "3 David Callan \n", "4 Wael Abu-Shammala and Alberto Torchinsky \n", "\n", " title \n", "0 Calculation of prompt diphoton production cros... \\\n", "1 Sparsity-certifying Graph Decompositions \n", "2 The evolution of the Earth-Moon system based o... \n", "3 A determinant of Stirling cycle numbers counts... \n", "4 From dyadic $\\Lambda_{\\alpha}$ to $\\Lambda_{\\a... \n", "\n", " comments \n", "0 37 pages, 15 figures; published version \\\n", "1 To appear in Graphs and Combinatorics \n", "2 23 pages, 3 figures \n", "3 11 pages \n", "4 None \n", "\n", " journal-ref doi \n", "0 Phys.Rev.D76:013009,2007 10.1103/PhysRevD.76.013009 \\\n", "1 None None \n", "2 None None \n", "3 None None \n", "4 Illinois J. Math. 52 (2008) no.2, 681-689 None \n", "\n", " report-no categories \n", "0 ANL-HEP-PR-07-12 hep-ph \\\n", "1 None math.CO cs.CG \n", "2 None physics.gen-ph \n", "3 None math.CO \n", "4 None math.CA math.FA \n", "\n", " license \n", "0 None \\\n", "1 http://arxiv.org/licenses/nonexclusive-distrib... \n", "2 None \n", "3 None \n", "4 None \n", "\n", " abstract \n", "0 A fully differential calculation in perturba... \\\n", "1 We describe a new algorithm, the $(k,\\ell)$-... \n", "2 The evolution of Earth-Moon system is descri... \n", "3 We show that a determinant of Stirling cycle... \n", "4 In this paper we show how to compute the $\\L... \n", "\n", " versions update_date \n", "0 [{'created': 'Mon, 2 Apr 2007 19:18:42 GMT', '... 2008-11-26 \\\n", "1 [{'created': 'Sat, 31 Mar 2007 02:26:18 GMT', ... 2008-12-13 \n", "2 [{'created': 'Sun, 1 Apr 2007 20:46:54 GMT', '... 2008-01-13 \n", "3 [{'created': 'Sat, 31 Mar 2007 03:16:14 GMT', ... 2007-05-23 \n", "4 [{'created': 'Mon, 2 Apr 2007 18:09:58 GMT', '... 2013-10-15 \n", "\n", " authors_parsed \n", "0 [[Balázs, C., ], [Berger, E. L., ], [Nadolsky,... \n", "1 [[Streinu, Ileana, ], [Theran, Louis, ]] \n", "2 [[Pan, Hongjun, ]] \n", "3 [[Callan, David, ]] \n", "4 [[Abu-Shammala, Wael, ], [Torchinsky, Alberto, ]] " ] }, "execution_count": 31, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_raw.head()" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "def cleanData(doc: pd.Series, stemming=False, nlp = spacy.load('en_core_web_sm')):\n", " \"\"\"\n", " TODO: Optimize NLP Object to only obtain stopwords, lemmas, and tokenize docs.\n", " \n", " Cleans and processes the input documents by performing various text cleaning operations.\n", "\n", " Args:\n", " doc (pd.Series): The documents to be cleaned, passed in a Series object.\n", " stemming (bool, optional): Specifies whether stemming should be applied. Defaults to False.\n", "\n", " Returns:\n", " str: The cleaned and processed document as a single string.\n", " \"\"\"\n", " doc = doc.lower()\n", " doc = nlp(doc)\n", " tokens = [tokens.lower_ for tokens in doc]\n", " tokens = [tokens for tokens in doc if (tokens.is_stop == False)]\n", " tokens = [tokens for tokens in tokens if (tokens.is_punct == False)]\n", " final_token = [token.lemma_ for token in tokens]\n", " \n", " return \" \".join(final_token)" ] }, { "cell_type": "code", "execution_count": 34, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " In this paper we show how to compute the $\\Lambda_{\\alpha}$ norm, $\\alpha\\ge\n", "0$, using the dyadic grid. This result is a consequence of the description of\n", "the Hardy spaces $H^p(R^N)$ in terms of dyadic and special atoms.\n", "\n" ] }, { "data": { "text/plain": [ "' paper compute $ \\\\lambda_{\\\\alpha}$ norm $ \\\\alpha\\\\ge \\n 0 $ dyadic grid result consequence description \\n hardy space $ h^p(r^n)$ term dyadic special atom \\n'" ] }, "execution_count": 34, "metadata": {}, "output_type": "execute_result" } ], "source": [ "print(df_raw['abstract'][4])\n", "test= cleanData(df_raw['abstract'][4])\n", "test" ] }, { "cell_type": "code", "execution_count": 35, "metadata": {}, "outputs": [], "source": [ "df_raw[\"cleaned_abstracts\"] = df_raw[\"abstract\"].map(lambda x: cleanData(x))" ] }, { "cell_type": "code", "execution_count": 36, "metadata": {}, "outputs": [], "source": [ "\n", "df_raw.to_parquet(\"/Users/luis.morales/Desktop/arxiv-paper-recommender/data/processed/arxiv_papers.parquet.gzip\")" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3.11.4 ('arxiv-env': venv)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.4" }, "orig_nbformat": 4, "vscode": { "interpreter": { "hash": "aae17c2ae2f38cc6f211be9b71a2aa280701d8462782cbc1f67caa83a1603363" } } }, "nbformat": 4, "nbformat_minor": 2 }