{ "cells": [ { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import spacy\n", "from typing import List\n", "from collections import Counter\n", "\n", "from matplotlib import pyplot as plt \n", "import seaborn as sns\n", "sns.set_style(\"darkgrid\")\n", "sns.set_palette(\"mako\")" ] },
{ "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "df = pd.read_parquet(\"/Users/luis.morales/Desktop/arxiv-paper-recommender/data/processed/arxiv_papers.parquet.gzip\")" ] },
{ "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [], "source": [ "# text_corpus = df['cleaned_abstracts'].to_list()" ] },
{ "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "def get_corpus_tokens(text_corpus: List[str]) -> List[str]:\n", "    \"\"\"\n", "    Extracts tokens from a given text corpus using spaCy's tokenizer.\n", "    Args:\n", "        text_corpus (List[str]): A list of strings, one per document in the corpus.\n", "    Returns:\n", "        List[str]: A flat list of the tokens extracted from every document.\n", "    \"\"\"\n", "    tokens = list()\n", "    # make_doc only runs the tokenizer, so the rest of the loaded pipeline is never applied\n", "    nlp = spacy.load('en_core_web_sm')\n", "\n", "    for doc in text_corpus:\n", "        nlp_doc = nlp.make_doc(doc)\n", "        tokens.extend([token.text for token in nlp_doc])\n", "\n", "    return tokens" ] },
{ "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [], "source": [ "# with open(\"/Users/luis.morales/Desktop/arxiv-paper-recommender/data/processed/corpus_tokens.txt\", \"w\") as file:\n", "#     list_string = '\\n'.join(str(item) for item in corpus_tokens)\n", "#     file.write(list_string)" ] },
{ "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "def read_list_from_file(file_path: str) -> List[str]:\n", "    \"\"\"\n", "    Reads a text file containing one item per line and converts it back to a Python list.\n", "    Args:\n", "        file_path (str): Path to the text file.\n", "    Returns:\n", "        List[str]: The non-empty lines of the file, in order.\n", "    Raises:\n", "        FileNotFoundError: If no file exists at the given path.\n", "    \"\"\"\n", "    try:\n", "        with open(file_path, 'r') as file:\n", "            # Read the contents of the file\n", "            file_contents = file.read()\n", "            lines = file_contents.split('\\n')\n", "            # Drop empty strings produced by blank lines or a trailing newline\n", "            plist = [item for item in lines if item]\n", "\n", "            return plist\n", "    except FileNotFoundError:\n", "        # Re-raise with the offending path so the caller can see what was missing\n", "        raise FileNotFoundError(f\"The specified file cannot be found: {file_path}\")" ] },
{ "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "corpus_tokens = read_list_from_file(\"/Users/luis.morales/Desktop/arxiv-paper-recommender/data/processed/corpus_tokens.txt\")" ] },
{ "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "count_words = Counter(corpus_tokens)" ] },
{ "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "collections.Counter" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "type(count_words)" ] },
{ "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "most_common_words = count_words.most_common(1000)" ] },
{ "cell_type": "code", "execution_count": 33, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", " | word | \n", "count | \n", "
---|---|---|
0 | \n", "\n", " | 26467421 | \n", "
1 | \n", "$ | \n", "4088943 | \n", "
2 | \n", "\n", " | 2459316 | \n", "
3 | \n", "model | \n", "1598013 | \n", "
4 | \n", "result | \n", "1064830 | \n", "
... | \n", "... | \n", "... | \n", "
995 | \n", "cavity | \n", "40578 | \n", "
996 | \n", "community | \n", "40478 | \n", "
997 | \n", "trace | \n", "40448 | \n", "
998 | \n", "relatively | \n", "40385 | \n", "
999 | \n", "electromagnetic | \n", "40358 | \n", "
1000 rows × 2 columns
\n", "