Spaces:
Runtime error
Runtime error
Michael-Geis
committed on
Commit
•
73994b7
1
Parent(s):
b0e8ca7
turned black on
Browse files- .vscode/settings.json +6 -0
- data_storage.py +45 -51
.vscode/settings.json
ADDED
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"[python]": {
|
3 |
+
"editor.defaultFormatter": "ms-python.black-formatter"
|
4 |
+
},
|
5 |
+
"python.formatting.provider": "none"
|
6 |
+
}
|
data_storage.py
CHANGED
@@ -3,16 +3,16 @@ import pandas as pd
|
|
3 |
import data_cleaning as clean
|
4 |
from sklearn.preprocessing import MultiLabelBinarizer
|
5 |
|
6 |
-
|
7 |
-
|
8 |
-
"""
|
9 |
|
10 |
def __init__(self):
|
11 |
"""
|
12 |
data: dataframe holding the metadata. Each row represents a paper and each column is
|
13 |
a separate piece of metadata.
|
14 |
-
|
15 |
-
query: A tuple of the form (query_string,max_results) where query_string is the formatted
|
16 |
string that produced the raw data and max_results is the value of that parameter passed to the
|
17 |
arXiv API.
|
18 |
|
@@ -26,15 +26,16 @@ class ArXivData():
|
|
26 |
self.categories = None
|
27 |
|
28 |
def load_from_file():
|
29 |
-
pass
|
30 |
-
|
31 |
-
def load_from_query(self,query_string,max_results,offset):
|
32 |
-
self.data = query_to_df(
|
33 |
-
|
34 |
-
|
35 |
-
|
36 |
-
|
37 |
-
|
|
|
38 |
"""Constructs this dataset by cleaning another one.
|
39 |
|
40 |
Args:
|
@@ -48,18 +49,15 @@ class ArXivData():
|
|
48 |
def get_OHE_cats(self):
|
49 |
mlb = MultiLabelBinarizer()
|
50 |
OHE_category_array = mlb.fit_transform(self.data.categories)
|
51 |
-
return pd.DataFrame(
|
52 |
-
|
53 |
-
|
54 |
-
|
55 |
|
56 |
|
57 |
-
|
58 |
-
|
59 |
-
def format_query(author='',title='',cat='',abstract=''):
|
60 |
"""Returns a formatted arxiv query string to handle simple queries of at most one instance each of these fields. To leave a field unspecified,
|
61 |
leave the corresponding argument blank.
|
62 |
-
|
63 |
e.g. format_query(cat='math.AP') will return the string used to pull all articles with the subject tag 'PDEs'.
|
64 |
|
65 |
Args:
|
@@ -73,19 +71,18 @@ def format_query(author='',title='',cat='',abstract=''):
|
|
73 |
properly formatted query string to return all results simultaneously matching all specified fields.
|
74 |
"""
|
75 |
|
76 |
-
tags = [f
|
77 |
-
query =
|
78 |
return query
|
79 |
|
80 |
|
81 |
-
|
82 |
-
def query_to_df(query,max_results,offset):
|
83 |
"""Returns the results of an arxiv API query in a pandas dataframe.
|
84 |
|
85 |
Args:
|
86 |
-
query: string defining an arxiv query formatted according to
|
87 |
https://info.arxiv.org/help/api/user-manual.html#51-details-of-query-construction
|
88 |
-
|
89 |
max_results: positive integer specifying the maximum number of results returned.
|
90 |
|
91 |
chunksize:
|
@@ -98,31 +95,28 @@ def query_to_df(query,max_results,offset):
|
|
98 |
The 'links' column is dropped and the authors column is a list of each author's name as a string.
|
99 |
The categories column is also a list of all tags appearing.
|
100 |
"""
|
101 |
-
client = arxiv.Client(page_size=2000,num_retries=3)
|
102 |
search = arxiv.Search(
|
103 |
-
|
104 |
-
|
105 |
-
|
106 |
-
|
107 |
-
|
108 |
-
columns = [
|
109 |
-
index = range(offset,max_results)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
110 |
|
111 |
-
|
112 |
-
results = client.results(search,offset=offset)
|
113 |
-
|
114 |
-
metadata_generator = ((result.title,result.summary,
|
115 |
-
result.categories,
|
116 |
-
result.entry_id.split('/')[-1]) for result in results)
|
117 |
-
|
118 |
metadata_dataframe = pd.DataFrame(metadata_generator, columns=columns, index=index)
|
119 |
|
120 |
-
|
121 |
return metadata_dataframe
|
122 |
-
|
123 |
-
|
124 |
-
|
125 |
-
|
126 |
-
|
127 |
-
|
128 |
-
|
|
|
3 |
import data_cleaning as clean
|
4 |
from sklearn.preprocessing import MultiLabelBinarizer
|
5 |
|
6 |
+
|
7 |
+
class ArXivData:
|
8 |
+
"""A light class for storing the metadata of a collection of arXiv papers."""
|
9 |
|
10 |
def __init__(self):
|
11 |
"""
|
12 |
data: dataframe holding the metadata. Each row represents a paper and each column is
|
13 |
a separate piece of metadata.
|
14 |
+
|
15 |
+
query: A tuple of the form (query_string,max_results) where query_string is the formatted
|
16 |
string that produced the raw data and max_results is the value of that parameter passed to the
|
17 |
arXiv API.
|
18 |
|
|
|
26 |
self.categories = None
|
27 |
|
28 |
def load_from_file():
|
29 |
+
pass
|
30 |
+
|
31 |
+
def load_from_query(self, query_string, max_results, offset):
    """Populate this dataset by running an arXiv API query.

    Stores the resulting metadata dataframe in ``self.data`` and records the
    originating query as the tuple ``self.query = (query_string, max_results)``.

    Args:
        query_string: formatted arXiv query string (see ``format_query``).
        max_results: maximum number of results to request from the API.
        offset: number of leading results to skip.
    """
    self.data = query_to_df(query=query_string, max_results=max_results, offset=offset)
    self.query = (query_string, max_results)
    # self.categories = self.get_OHE_cats()
37 |
+
|
38 |
+
def clean(self, dataset):
|
39 |
"""Constructs this dataset by cleaning another one.
|
40 |
|
41 |
Args:
|
|
|
49 |
def get_OHE_cats(self):
    """Return a one-hot encoding of the category tags of ``self.data``.

    Returns:
        pandas DataFrame with one row per paper and one 0/1 indicator column
        per category tag appearing in ``self.data.categories``, with columns
        relabeled via ``clean.category_map()``.
    """
    mlb = MultiLabelBinarizer()
    OHE_category_array = mlb.fit_transform(self.data.categories)
    # Fix: the original passed the map as the positional `mapper=` argument,
    # which renames the *index* (axis=0) by default — a no-op on the integer
    # RangeIndex. The category tags are the *columns* (mlb.classes_), so the
    # map must be applied with `columns=`.
    # NOTE(review): assumes clean.category_map() returns a dict mapping arXiv
    # tags to human-readable names — confirm against data_cleaning.py.
    return pd.DataFrame(OHE_category_array, columns=mlb.classes_).rename(
        columns=clean.category_map()
    )
|
|
55 |
|
56 |
|
57 |
+
def format_query(author="", title="", cat="", abstract=""):
    """Return a formatted arXiv query string for simple queries of at most one
    instance each of these fields. Leave an argument blank to leave that field
    unspecified.

    e.g. format_query(cat='math.AP') will return the string used to pull all
    articles with the subject tag 'PDEs'.

    Args:
        author: author name to match.
        title: title text to match.
        cat: arXiv subject tag to match.
        abstract: abstract text to match.

    Returns:
        properly formatted query string to return all results simultaneously
        matching all specified fields.
    """
    field_prefixes = (("au", author), ("ti", title), ("cat", cat), ("abs", abstract))
    # Only fields the caller actually supplied contribute a search term.
    search_terms = [f"{prefix}:{value}" for prefix, value in field_prefixes if value]
    return " AND ".join(search_terms)
|
77 |
|
78 |
|
79 |
+
def query_to_df(query, max_results, offset):
    """Return the results of an arXiv API query in a pandas dataframe.

    Args:
        query: string defining an arXiv query formatted according to
            https://info.arxiv.org/help/api/user-manual.html#51-details-of-query-construction
        max_results: positive integer specifying the maximum number of results
            returned.
        offset: number of results to skip from the start of the match list;
            also the first label of the returned dataframe's index.

    Returns:
        pandas dataframe with one row per paper and columns
        'title', 'summary', 'categories', 'id'. The 'id' column is the final
        path segment of the paper's entry URL; 'categories' is a list of all
        tags appearing.
    """
    api_client = arxiv.Client(page_size=2000, num_retries=3)
    api_search = arxiv.Search(
        query=query,
        max_results=max_results,
        sort_by=arxiv.SortCriterion.LastUpdatedDate,
    )

    fetched = api_client.results(api_search, offset=offset)
    # Lazily unpack each result into one row; rows are consumed by the
    # DataFrame constructor below.
    rows = (
        (paper.title, paper.summary, paper.categories, paper.entry_id.split("/")[-1])
        for paper in fetched
    )

    return pd.DataFrame(
        rows,
        columns=["title", "summary", "categories", "id"],
        index=range(offset, max_results),
    )
|
|
|
|
|
|
|
|
|
|
|
|
|
|