"""
oia_metadata.py
authors: Matt Bierbaum and Colin Clement
date: 2019-02-25
This module interacts with the Open Archive Initiative API, downloading
the metadata for all Arxiv articles.
Usage
=====
python oia_metadata.py data/<savefile>.json
Notes
=====
The save file is not technically JSON, but individual streamed lines of JSON,
each of which is compressed by gzip. Use the helper function load_metadata
to be sure to open it without error.
Resources
=========
* http://www.openarchives.org/OAI/2.0/openarchivesprotocol.htm
* https://arxiv.org/help/oa/index
"""
import os
import gzip
import glob
import json
import time
import hashlib
import datetime
import requests
import xml.etree.ElementTree as ET
from arxiv_public_data.config import LOGGER, DIR_BASE
log = LOGGER.getChild('metadata')
URL_ARXIV_OAI = 'https://export.arxiv.org/oai2'
URL_CITESEER_OAI = 'http://citeseerx.ist.psu.edu/oai2'
OAI_XML_NAMESPACES = {
'OAI': 'http://www.openarchives.org/OAI/2.0/',
'arXiv': 'http://arxiv.org/OAI/arXivRaw/'
}


def get_list_record_chunk(resumptionToken=None, harvest_url=URL_ARXIV_OAI,
metadataPrefix='arXivRaw'):
"""
Query OIA API for the metadata of 1000 Arxiv article
Parameters
----------
resumptionToken : str
Token for the API which triggers the next 1000 articles
Returns
-------
record_chunks : str
metadata of 1000 arXiv articles as an XML string
"""
parameters = {'verb': 'ListRecords'}
if resumptionToken:
parameters['resumptionToken'] = resumptionToken
else:
parameters['metadataPrefix'] = metadataPrefix
response = requests.get(harvest_url, params=parameters)
if response.status_code == 200:
return response.text
if response.status_code == 503:
secs = int(response.headers.get('Retry-After', 20)) * 1.5
log.info('Requested to wait, waiting {} seconds until retry...'.format(secs))
time.sleep(secs)
        return get_list_record_chunk(resumptionToken=resumptionToken,
                                     harvest_url=harvest_url,
                                     metadataPrefix=metadataPrefix)
else:
raise Exception(
'Unknown error in HTTP request {}, status code: {}'.format(
response.url, response.status_code
)
)
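
# Illustrative only: the first request omits the token, and each follow-up
# request passes the resumptionToken recovered from the previous response
# (see parse_xml_listrecords below), e.g.
#
#   first_chunk = get_list_record_chunk()
#   later_chunk = get_list_record_chunk(resumptionToken=token)
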
def _record_element_text(elm, name):
""" XML helper function for extracting text from leaf (single-node) elements """
item = elm.find('arXiv:{}'.format(name), OAI_XML_NAMESPACES)
return item.text if item is not None else None


def _record_element_all(elm, name):
""" XML helper function for extracting text from queries with multiple nodes """
return elm.findall('arXiv:{}'.format(name), OAI_XML_NAMESPACES)


def parse_record(elm):
    """
    Parse the XML element of a single arXiv article into a dictionary of
    attributes

    Parameters
    ----------
    elm : xml.etree.ElementTree.Element
        Element of the record of a single arXiv article

    Returns
    -------
    output : dict
        Attributes of the arXiv article stored as a dict with the keys
        id, submitter, authors, title, comments, journal-ref, doi, abstract,
        report-no, categories, and versions
    """
text_keys = [
'id', 'submitter', 'authors', 'title', 'comments',
'journal-ref', 'doi', 'abstract', 'report-no'
]
output = {key: _record_element_text(elm, key) for key in text_keys}
output['categories'] = [
i.text for i in (_record_element_all(elm, 'categories') or [])
]
output['versions'] = [
i.attrib['version'] for i in _record_element_all(elm, 'version')
]
return output
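
# Illustrative shape of the dictionary returned by parse_record, with
# placeholder values (real values depend on the article):
#
#   {'id': '<arxiv-id>', 'submitter': '<name>', 'authors': '<author list>',
#    'title': '<title>', 'comments': '<comments>', 'journal-ref': '<ref>',
#    'doi': '<doi>', 'abstract': '<abstract>', 'report-no': '<report-no>',
#    'categories': ['<category string>'], 'versions': ['v1', 'v2']}
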
def parse_xml_listrecords(root):
    """
    Parse XML of one chunk of the metadata of 1000 arXiv articles
    into a list of dictionaries

    Parameters
    ----------
    root : xml.etree.ElementTree.Element
        Element containing the records of an entire chunk of arXiv queries

    Returns
    -------
    records, resumptionToken : list, str
        records is a list of 1000 dictionaries, each containing the
        attributes of a single arXiv article
        resumptionToken is a string which is fed into the subsequent query
    """
resumptionToken = root.find(
'OAI:ListRecords/OAI:resumptionToken',
OAI_XML_NAMESPACES
)
resumptionToken = resumptionToken.text if resumptionToken is not None else ''
records = root.findall(
'OAI:ListRecords/OAI:record/OAI:metadata/arXiv:arXivRaw',
OAI_XML_NAMESPACES
)
records = [parse_record(p) for p in records]
return records, resumptionToken


def check_xml_errors(root):
""" Check for, log, and raise any OAI service errors in the XML """
error = root.find('OAI:error', OAI_XML_NAMESPACES)
if error is not None:
raise RuntimeError(
'OAI service returned error: {}'.format(error.text)
)
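
# A minimal sketch of how the pieces above chain together for a single chunk
# (this mirrors the harvesting loop in all_of_arxiv below):
#
#   xml_text = get_list_record_chunk()
#   root = ET.fromstring(xml_text)
#   check_xml_errors(root)
#   records, token = parse_xml_listrecords(root)
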
def find_default_locations():
outfile = os.path.join(DIR_BASE, 'arxiv-metadata-oai-*.json.gz')
resume = os.path.join(
DIR_BASE, 'arxiv-metadata-oai-*.json.gz-resumptionToken.txt'
)
fn_outfile = sorted(glob.glob(outfile))
fn_resume = sorted(glob.glob(resume))
if len(fn_outfile) > 0:
return fn_outfile[-1]
return None


def all_of_arxiv(outfile=None, resumptionToken=None, autoresume=True):
    """
    Download the metadata for every article in the arXiv via the OAI API

    Parameters
    ----------
    outfile : str (default '<DIR_BASE>/arxiv-metadata-oai-<date>.json.gz')
        name of the file where data is stored, appending each chunk of 1000
        articles.
    resumptionToken : str (default None)
        token which instructs the OAI server to continue feeding the next
        chunk
    autoresume : bool
        If True, look for a saved resumptionToken in the file
        <outfile>-resumptionToken.txt
    """
date = str(datetime.datetime.now()).split(' ')[0]
outfile = (
outfile or # user-supplied
find_default_locations() or # already in progress
os.path.join(
DIR_BASE, 'arxiv-metadata-oai-{}.json.gz'.format(date)
) # new file
)
directory = os.path.split(outfile)[0]
if directory and not os.path.exists(directory):
os.makedirs(directory)
tokenfile = '{}-resumptionToken.txt'.format(outfile)
chunk_index = 0
total_records = 0
log.info('Saving metadata to "{}"'.format(outfile))
    if resumptionToken is None and autoresume:
        try:
            with open(tokenfile, 'r') as fin:
                resumptionToken = fin.read()
        except Exception:
            log.warning("No tokenfile found '{}'".format(tokenfile))
            log.info("Starting download from scratch...")
while True:
log.info('Index {:4d} | Records {:7d} | resumptionToken "{}"'.format(
chunk_index, total_records, resumptionToken)
)
xml_root = ET.fromstring(get_list_record_chunk(resumptionToken))
check_xml_errors(xml_root)
records, resumptionToken = parse_xml_listrecords(xml_root)
chunk_index = chunk_index + 1
total_records = total_records + len(records)
with gzip.open(outfile, 'at', encoding='utf-8') as fout:
for rec in records:
fout.write(json.dumps(rec) + '\n')
if resumptionToken:
with open(tokenfile, 'w') as fout:
fout.write(resumptionToken)
else:
log.info('No resumption token, query finished')
return
time.sleep(12) # OAI server usually requires a 10s wait
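
# Example calls (the explicit path is illustrative): resume or start a full
# harvest into the default location under DIR_BASE, or name the save file:
#
#   all_of_arxiv()
#   all_of_arxiv('data/arxiv-metadata-oai-2019-02-25.json.gz')
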
def load_metadata(infile=None):
"""
Load metadata saved by all_of_arxiv, as a list of lines of gzip compressed
json.
Parameters
----------
infile : str or None
name of file saved by gzip. If None, one is attempted to be found
in the expected location with the expected name.
Returns
-------
article_attributes : list
list of dicts, each of which contains the metadata attributes of
the ArXiv articles
"""
fname = infile or find_default_locations()
with gzip.open(fname, 'rt', encoding='utf-8') as fin:
return [json.loads(line) for line in fin.readlines()]
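
# Example (the filename is illustrative): read a finished harvest back into a
# list of metadata dictionaries:
#
#   records = load_metadata('data/arxiv-metadata-oai-2019-02-25.json.gz')
#   print(len(records), records[0]['title'])
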
def hash_abstracts(metadata):
""" Replace abstracts with their MD5 hash for legal distribution """
metadata_no_abstract = []
    for entry in metadata:
        m = entry.copy()
m['abstract_md5'] = hashlib.md5(m['abstract'].encode()).hexdigest()
del m['abstract']
metadata_no_abstract.append(m)
return metadata_no_abstract


def validate_abstract_hashes(metadata, metadata_no_abstract):
""" Validate that abstracts match the hashes """
for m, n in zip(metadata, metadata_no_abstract):
md5 = hashlib.md5(m['abstract'].encode()).hexdigest()
if not md5 == n['abstract_md5']:
return False
return True
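
# Round-trip check (illustrative): strip the abstracts down to MD5 hashes,
# then confirm the hashes still match the original abstracts:
#
#   stripped = hash_abstracts(metadata)
#   assert validate_abstract_hashes(metadata, stripped)


if __name__ == '__main__':
    # Minimal command-line entry point matching the Usage note in the module
    # docstring. This is a sketch; argument handling is deliberately simple.
    import sys
    OUTFILE = sys.argv[1] if len(sys.argv) > 1 else None
    all_of_arxiv(OUTFILE)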