Spaces:
Runtime error
Runtime error
File size: 4,378 Bytes
458942a 9b818c8 b0e8ca7 9b818c8 b0e8ca7 9b818c8 458942a b0e8ca7 458942a b0e8ca7 458942a b0e8ca7 458942a b0e8ca7 458942a |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 |
import arxiv
import pandas as pd
import data_cleaning as clean
from sklearn.preprocessing import MultiLabelBinarizer
class ArXivData():
"""A light class for storing the metadata of a collection of arXiv papers.
"""
def __init__(self):
"""
data: dataframe holding the metadata. Each row represents a paper and each column is
a separate piece of metadata.
query: A tuple of the form (query_string,max_results) where query_string is the formatted
string that produced the raw data and max_results is the value of that parameter passed to the
arXiv API.
raw: The original, raw dataset as returned by the arXiv API, if current data is clean.
cats: A DataFrame containing one-hot-encoded categories of the self.data DataFrame.
"""
self.data = None
self.query = None
self.categories = None
def load_from_file():
pass
def load_from_query(self,query_string,max_results,offset):
self.data = query_to_df(query=query_string,max_results=max_results,offset=offset)
self.query = (query_string,max_results)
#self.categories = self.get_OHE_cats()
def clean(self,dataset):
"""Constructs this dataset by cleaning another one.
Args:
dataset: An ArXivData object containing data to be cleaned.
"""
self.data = clean.clean(dataset)
self.query = dataset.query
self.raw = dataset.raw
self.categories = dataset.categories
def get_OHE_cats(self):
mlb = MultiLabelBinarizer()
OHE_category_array = mlb.fit_transform(self.data.categories)
return pd.DataFrame(
OHE_category_array, columns = mlb.classes_).rename(
mapper=clean.category_map())
def format_query(author='',title='',cat='',abstract=''):
"""Returns a formatted arxiv query string to handle simple queries of at most one instance each of these fields. To leave a field unspecified,
leave the corresponding argument blank.
e.g. format_query(cat='math.AP') will return the string used to pull all articles with the subject tag 'PDEs'.
Args:
author: string to search for in the author field.
title: string to search for in the title field.
cat: A valid arxiv subject tag. See the full list of these at:
https://arxiv.org/category_taxonomy
abstract: string to search for in the abstract field.
Returns:
properly formatted query string to return all results simultaneously matching all specified fields.
"""
tags = [f'au:{author}', f'ti:{title}', f'cat:{cat}', f'abs:{abstract}']
query = ' AND '.join([tag for tag in tags if not tag.endswith(':')])
return query
def query_to_df(query,max_results,offset):
"""Returns the results of an arxiv API query in a pandas dataframe.
Args:
query: string defining an arxiv query formatted according to
https://info.arxiv.org/help/api/user-manual.html#51-details-of-query-construction
max_results: positive integer specifying the maximum number of results returned.
chunksize:
Returns:
pandas dataframe with one column for indivial piece of metadata of a returned result.
To see a list of these columns and their descriptions, see the documentation for the Results class of the arxiv package here:
http://lukasschwab.me/arxiv.py/index.html#Result
The 'links' column is dropped and the authors column is a list of each author's name as a string.
The categories column is also a list of all tags appearing.
"""
client = arxiv.Client(page_size=2000,num_retries=3)
search = arxiv.Search(
query = query,
max_results=max_results,
sort_by=arxiv.SortCriterion.LastUpdatedDate
)
columns = ['title','summary','categories','id']
index = range(offset,max_results)
results = client.results(search,offset=offset)
metadata_generator = ((result.title,result.summary,
result.categories,
result.entry_id.split('/')[-1]) for result in results)
metadata_dataframe = pd.DataFrame(metadata_generator, columns=columns, index=index)
return metadata_dataframe
|