colbert-acl / parse.py
davidheineman's picture
fix filepaths
8b805bb
import bibtexparser, json
# Input: BibTeX file handed to parse_anthology_bibtex when run as a script.
ANTHOLOGY_PATH = 'anthology.bib'
# Output: JSON file receiving the list of abstracts (the "collection").
COLLECTION_PATH = 'collection.json'
# Output: JSON file receiving the list of parsed entries (the "dataset").
DATASET_PATH = 'dataset.json'
def parse_anthology_bibtex(anthology_path):
    """Parse an anthology BibTeX file into entries and their abstracts.

    A short preview (entry count, the first entry's field names, and the
    author/title/url of the first two entries) is printed to stdout.

    Entries without an ``abstract`` field are dropped so that the two
    returned lists stay index-aligned: ``collection[i]`` is always the
    abstract of ``dataset[i]``.

    Args:
        anthology_path: Path to the BibTeX file; it is read as UTF-8.

    Returns:
        A ``(dataset, collection)`` tuple. ``dataset`` is the list of parsed
        entries that have an abstract, and ``collection`` is the list of
        those abstracts in the same order.
    """
    with open(anthology_path, 'r', encoding='utf-8') as f:
        acl_bib = bibtexparser.load(f)

    entries = acl_bib.entries

    # Guard the preview against an empty bibliography (entries[0] would raise).
    first_keys = entries[0].keys() if entries else []
    print(f'Found {len(entries)} articles with keys: {first_keys}')

    for entry in entries[:2]:
        print(entry.get('author'))
        print(entry.get('title'))
        # Format instead of concatenating: a missing url is None, and
        # None + '\n' would raise TypeError.
        print(f"{entry.get('url')}\n")

    # Not every entry carries an abstract; indexing e['abstract'] on all of
    # them would raise KeyError. Filter once and derive both lists from the
    # filtered entries so their positions keep matching.
    dataset = [entry for entry in entries if 'abstract' in entry]
    skipped = len(entries) - len(dataset)
    if skipped:
        print(f'Skipped {skipped} entries without an abstract')

    collection = [entry['abstract'] for entry in dataset]
    return dataset, collection
if __name__ == '__main__':
    # Parse the anthology once, then dump each result to its own JSON file.
    dataset, collection = parse_anthology_bibtex(ANTHOLOGY_PATH)

    # Order matters only for parity with the on-disk write sequence:
    # the abstracts collection first, then the full dataset.
    outputs = (
        (COLLECTION_PATH, collection),
        (DATASET_PATH, dataset),
    )
    for out_path, payload in outputs:
        with open(out_path, 'w', encoding='utf-8') as out_file:
            out_file.write(json.dumps(payload, indent=4))