# NOTE: the hosting page showed a "Spaces: Build error" banner above this file;
# that banner text is page residue, not part of util.py.
"""
util.py

author: Colin Clement
date: 2019-04-05

This module contains helper functions for loading embeddings and batch
loading the full text, since many computers cannot hold the whole
fulltext in memory.
"""
import os
import pickle
import re

import numpy as np

from arxiv_public_data.config import DIR_FULLTEXT, DIR_OUTPUT
from arxiv_public_data.oai_metadata import load_metadata
def id_to_pathname(aid):
    """
    Make filename path for text document, matching the format of fulltext
    creation in `s3_bulk_download`

    Parameters
    ----------
        aid : str
            string of arXiv article id as found in metadata

    Returns
    -------
        pathname : str
            pathname in which to store the article's full text, rooted at
            `DIR_FULLTEXT`

    Examples
    --------
    >>> id_to_pathname('hep-ph/0001001')  #doctest: +ELLIPSIS
    '.../hep-ph/0001/hep-ph0001001.txt'

    >>> id_to_pathname('1501.13851')  #doctest: +ELLIPSIS
    '.../arxiv/1501/1501.13851.txt'
    """
    if '.' in aid:  # new style ArXiv ID, e.g. '1501.13851'
        yymm = aid.split('.')[0]
        return os.path.join(DIR_FULLTEXT, 'arxiv', yymm, aid + '.txt')

    # old style ArXiv ID, e.g. 'hep-ph/0001001': splitting on the first run
    # of digits gives the category prefix (including its trailing '/') and
    # the numeric part, whose first four characters are taken as yymm
    cat, arxiv_id = re.split(r'(\d+)', aid)[:2]
    yymm = arxiv_id[:4]
    return os.path.join(DIR_FULLTEXT, cat, yymm, aid.replace('/', '') + '.txt')
def load_generator(paths, batchsize):
    """
    Creates a generator object for batch loading files from paths

    Parameters
    ----------
        paths : list of filepaths
        batchsize : int

    Yields
    ------
        file_contents : numpy.ndarray of dtype object
            contents of up to `batchsize` files, in the order given by
            `paths`. Every batch, including a final partial one, is an
            object array; no empty batch is produced.

    Raises
    ------
        AssertionError
            on first iteration, if `paths` is not a list or `batchsize` is
            not a positive int
    """
    assert isinstance(paths, list), 'Requires a list of paths'
    # bool is a subclass of int, so exclude it explicitly
    assert isinstance(batchsize, int) and not isinstance(batchsize, bool), \
        'batchsize must be an int'
    assert batchsize > 0, 'batchsize must be positive'

    batch = []
    for path in paths:
        with open(path, 'r') as fin:
            batch.append(fin.read())
        if len(batch) == batchsize:
            yield np.array(batch, dtype='object')
            batch = []
    # Emit the leftover partial batch with the same type as full batches,
    # and skip it entirely when there is nothing left to emit.
    if batch:
        yield np.array(batch, dtype='object')
def batch_fulltext(batchsize=32, maxnum=None):
    """
    Read metadata and find corresponding files in the fulltext

    Parameters
    ----------
        (optional)
        batchsize : int
            number of fulltext files to load into a batch
        maxnum : int
            the maximum number of paths to feed the generator, for
            testing purposes

    Returns
    -------
        md_index, all_ids, load_gen : tuple of (list, list, generator)
            md_index lists, for each fulltext file that will be loaded (in
            order), the index of the corresponding metadata entry; it is
            limited by `maxnum` in the same way as the loaded files.
            all_ids is a list of all arXiv IDs in the metadata.
            load_gen is a generator which allows batched loading of the
            full-text, as defined by `load_generator`
    """
    all_ids = [m['id'] for m in load_metadata()]

    # Keep each metadata index paired with its path so that the `maxnum`
    # truncation applies to both and they cannot fall out of step.
    indexed_paths = enumerate(id_to_pathname(aid) for aid in all_ids)
    existing = [(i, p) for i, p in indexed_paths if os.path.exists(p)][:maxnum]

    md_index = [i for i, _ in existing]
    existing_paths = [p for _, p in existing]
    return md_index, all_ids, load_generator(existing_paths, batchsize)
def load_embeddings(filename, headers=0):
    """
    Loads vector embeddings

    The file is read as a sequence of consecutive pickles until end of file.

    Parameters
    ----------
        filename : str
            path to vector embeddings saved by `create_save_embeddings`
        (optional)
        headers : int
            number of leading pickles containing metadata, stored separately
            from the embedding vectors

    Returns
    -------
        embeddings : dict
            keys 'embeddings' containing vector embeddings (as a numpy
            array) and 'headers' containing the metadata objects
    """
    out = {'embeddings': [], 'headers': []}
    n_loaded = 0
    with open(filename, 'rb') as fin:
        while True:
            try:
                # NOTE: pickle can execute arbitrary code while loading;
                # only use this on embedding files from a trusted source.
                obj = pickle.load(fin)
            except EOFError:
                break  # no more pickles in the file
            if n_loaded < headers:
                out['headers'].append(obj)
            else:
                # each remaining pickle holds a batch of vectors
                out['embeddings'].extend(obj)
            n_loaded += 1
    out['embeddings'] = np.array(out['embeddings'])
    return out
def fill_zeros(loaded_embedding):
    """
    Fill out zeros in the full-text embedding where full-text is missing

    Parameters
    ----------
        loaded_embedding : dict
            dict as returned by `load_embeddings` with 2 headers: the list
            of the metadata index each embedding vector corresponds to, and
            the list of all article ids

    Returns
    -------
        embeddings : array_like
            vector embeddings of shape (number of articles, embedding
            dimension); rows with no embedding vector are left as zeros
    """
    md_index = loaded_embedding['headers'][0]
    all_ids = loaded_embedding['headers'][1]
    vectors = loaded_embedding['embeddings']

    output = np.zeros((len(all_ids), vectors.shape[1]))
    # Pair indices with vectors up to the shorter of the two, then place all
    # rows with a single indexed assignment instead of a Python-level loop.
    n_rows = min(len(md_index), len(vectors))
    rows = np.asarray(md_index[:n_rows], dtype=np.intp)
    output[rows] = vectors[:n_rows]
    return output