|
import bibtexparser, json |
|
|
|
# Input: BibTeX file that parse_anthology_bibtex() reads.
ANTHOLOGY_PATH = 'anthology.bib'
# Output: JSON list of abstract strings (the "collection").
COLLECTION_PATH = 'collection.json'
# Output: JSON list of the full parsed entries (the "dataset").
DATASET_PATH = 'dataset.json'
|
|
|
def parse_anthology_bibtex(anthology_path):
    """Load a BibTeX file and split it into parsed entries and abstracts.

    Prints the number of entries found, the field names of the first entry,
    and the author/title/url of the first two entries as a quick sanity check.

    Args:
        anthology_path: Path to the ``.bib`` file; it is read as UTF-8.

    Returns:
        A ``(dataset, collection)`` tuple. ``dataset`` is the list of entries
        produced by ``bibtexparser.load``. ``collection`` holds one abstract
        per entry, in the same order as ``dataset``; an entry without an
        ``abstract`` field contributes an empty string so that indices in
        the two lists keep matching.
    """
    with open(anthology_path, 'r', encoding='utf-8') as f:
        acl_bib = bibtexparser.load(f)

    dataset = acl_bib.entries

    # Guard the preview: an empty bibliography has no first entry to inspect.
    if dataset:
        print(f'Found {len(dataset)} articles with keys: {dataset[0].keys()}')
    else:
        print('Found 0 articles')

    for entry in dataset[:2]:
        print(entry.get('author'))
        print(entry.get('title'))
        # Format instead of concatenating so a missing url (None) cannot
        # raise TypeError; output is unchanged when the url is present.
        print(f"{entry.get('url')}\n")

    # Not every entry carries an abstract; fall back to '' rather than
    # raising KeyError, and keep one slot per entry to preserve alignment.
    collection = [entry.get('abstract', '') for entry in dataset]
    return dataset, collection
|
|
|
if __name__ == '__main__':
    # Parse the bibliography once, then persist both views of it as JSON.
    dataset, collection = parse_anthology_bibtex(ANTHOLOGY_PATH)

    # Abstracts only, one string per entry.
    with open(COLLECTION_PATH, 'w', encoding='utf-8') as f:
        json.dump(collection, f, indent=4)

    # Full parsed entries, in the same order as the abstracts above.
    with open(DATASET_PATH, 'w', encoding='utf-8') as f:
        json.dump(dataset, f, indent=4)