sidphbot's picture
spaces init
a8d4e3d
raw history blame
No virus
4.91 kB
"""
util.py
author: Colin Clement
date: 2019-04-05
This module contains helper functions for loading embeddings and batch
loading the full text, since many computers cannot contain the whole
fulltext in memory.
"""
import os
import re
import numpy as np
import pickle
from arxiv_public_data.config import DIR_FULLTEXT, DIR_OUTPUT
from arxiv_public_data.oai_metadata import load_metadata
def id_to_pathname(aid):
    """
    Build the path of the plain-text file belonging to an arXiv article,
    using the same directory layout that `s3_bulk_download` uses when it
    creates the fulltext.

    Parameters
    ----------
        aid : str
            arXiv article id as it appears in the metadata, either new
            style ('1501.13851') or old style ('hep-ph/0001001')

    Returns
    -------
        pathname : str
            path below DIR_FULLTEXT of the article's text file

    Examples
    --------
    >>> id_to_pathname('hep-ph/0001001')  #doctest: +ELLIPSIS
    '.../hep-ph/0001/hep-ph0001001.txt'

    >>> id_to_pathname('1501.13851')  #doctest: +ELLIPSIS
    '.../arxiv/1501/1501.13851.txt'
    """
    if '.' not in aid:
        # Old style id: text before the first run of digits is the category
        # (it keeps its trailing '/'), the digit run starts with YYMM.
        pieces = re.split(r'(\d+)', aid)
        category, number = pieces[:2]
        subdirs = (category, number[:4])
        basename = aid.replace('/', '')
    else:
        # New style id: everything before the first '.' is YYMM.
        subdirs = ('arxiv', aid.split('.')[0])
        basename = aid
    return os.path.join(DIR_FULLTEXT, *subdirs, basename + '.txt')
def load_generator(paths, batchsize):
    """
    Creates a generator object for batch loading files from paths

    Because this is a generator function, the argument checks below are
    only executed when the first batch is requested, not when the
    generator object is created.

    Parameters
    ----------
        paths : list of filepaths
        batchsize : int
            number of files read into each batch, must be positive

    Yields
    ------
        file_contents : numpy.ndarray of dtype object
            contents of up to `batchsize` files, in the order of `paths`.
            Every batch has exactly `batchsize` entries except possibly the
            last one, which holds the remaining files. No empty batch is
            ever produced.
    """
    assert type(paths) is list, 'Requires a list of paths'
    assert type(batchsize) is int, 'batchsize must be an int'
    assert batchsize > 0, 'batchsize must be positive'

    batch = []
    for path in paths:
        with open(path, 'r') as fin:
            batch.append(fin.read())
        if len(batch) == batchsize:
            yield np.array(batch, dtype='object')
            batch = []
    # Emit the remainder only when there is one, and with the same type as
    # the full batches, so consumers never see an empty or list-typed batch.
    if batch:
        yield np.array(batch, dtype='object')
def batch_fulltext(batchsize=32, maxnum=None):
    """
    Match the metadata against the fulltext directory and prepare batched
    loading of the text files that are present

    Parameters
    ----------
        (optional)
        batchsize : int
            number of fulltext files to load into a batch
        maxnum : int
            upper limit on the number of paths handed to the generator,
            for testing purposes. It does not shorten `md_index`.

    Returns
    -------
        md_index, all_ids, load_gen : tuple of (list, list, generator)
            md_index lists, in metadata order, the metadata index of every
            article whose fulltext file exists. all_ids is the list of all
            arXiv IDs in the metadata. load_gen is the generator returned
            by `load_generator` over the existing files.
    """
    all_ids = [record['id'] for record in load_metadata()]
    candidate_paths = list(map(id_to_pathname, all_ids))

    md_index = []
    found_paths = []
    for position, path in enumerate(candidate_paths):
        if os.path.exists(path):
            md_index.append(position)
            found_paths.append(path)

    load_gen = load_generator(found_paths[:maxnum], batchsize)
    return md_index, all_ids, load_gen
def load_embeddings(filename, headers=0):
    """
    Read vector embeddings back from a file of consecutive pickles

    Parameters
    ----------
        filename : str
            path to vector embeddings saved by `create_save_embeddings`
        (optional)
        headers : int
            number of leading pickle records that hold metadata rather
            than embedding vectors

    Returns
    -------
        embeddings : dict
            key 'embeddings' holds a numpy array built from all vectors of
            the remaining records, key 'headers' holds the list of the
            metadata records
    """
    header_records = []
    vectors = []
    records_read = 0
    with open(filename, 'rb') as stream:
        while True:
            # The file is a sequence of pickles; EOFError marks its end.
            try:
                record = pickle.load(stream)
            except EOFError:
                break
            if records_read < headers:
                header_records.append(record)
            else:
                vectors.extend(record)
            records_read += 1
    return {'embeddings': np.array(vectors), 'headers': header_records}
def fill_zeros(loaded_embedding):
    """
    Expand the full-text embedding to one row per article, leaving rows of
    zeros for articles without full-text

    Parameters
    ----------
        loaded_embedding : dict
            dict as returned by `load_embeddings` with 2 headers: first the
            list of metadata indices the embedding vectors correspond to,
            second the list of all article ids

    Returns
    -------
        embeddings : array_like
            vector embeddings of shape (number of articles, embedding dimension)
    """
    headers = loaded_embedding['headers']
    row_targets, article_ids = headers[0], headers[1]
    embedded = loaded_embedding['embeddings']

    n_articles = len(article_ids)
    dimension = embedded.shape[1]
    filled = np.zeros((n_articles, dimension))
    # zip stops at the shorter input, so surplus indices are ignored.
    for row, vector in zip(row_targets, embedded):
        filled[row] = vector
    return filled