File size: 972 Bytes
7563fd5
 
8b805bb
 
 
7563fd5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
import bibtexparser, json

ANTHOLOGY_PATH  = 'anthology.bib'
COLLECTION_PATH = 'collection.json'
DATASET_PATH    = 'dataset.json'

def parse_anthology_bibtex(anthology_path):
    """Load a BibTeX anthology and split it into entries and abstracts.

    A short preview (entry count, the keys of the first entry, and the
    author/title/url of the first two entries) is printed to stdout.

    Args:
        anthology_path: Path to a UTF-8 encoded BibTeX file.

    Returns:
        A ``(dataset, collection)`` tuple. ``dataset`` is the list of
        entries returned by ``bibtexparser.load``; ``collection`` is a list
        with the ``'abstract'`` field of each entry, in the same order.
        Entries without an abstract contribute an empty string so that
        both lists stay index-aligned.
    """
    with open(anthology_path, 'r', encoding='utf-8') as f:
        acl_bib = bibtexparser.load(f)

    dataset = acl_bib.entries

    # Guard the preview: an empty bibliography has no first entry to inspect.
    if dataset:
        print(f'Found {len(dataset)} articles with keys: {dataset[0].keys()}')
    else:
        print('Found 0 articles')

    for entry in dataset[:2]:
        print(entry.get('author'))
        print(entry.get('title'))
        # Format instead of concatenating so a missing url (None) prints
        # like the missing author/title above rather than raising TypeError.
        print(f"{entry.get('url')}\n")

    # Not every entry carries an abstract; fall back to '' instead of
    # raising KeyError and aborting the whole export.
    collection = [entry.get('abstract', '') for entry in dataset]
    return dataset, collection

if __name__ == '__main__':
    # Parse the anthology, then dump the abstracts and the full entries
    # to their respective JSON files (collection first, dataset second).
    dataset, collection = parse_anthology_bibtex(ANTHOLOGY_PATH)
    outputs = (
        (COLLECTION_PATH, collection),
        (DATASET_PATH, dataset),
    )
    for target_path, payload in outputs:
        with open(target_path, 'w', encoding='utf-8') as out:
            json.dump(payload, out, indent=4)