"""Convert an anthology BibTeX file into two JSON files.

Running this module as a script parses ``ANTHOLOGY_PATH`` and writes:

* ``DATASET_PATH``    -- the parsed BibTeX entries (one dict per entry), and
* ``COLLECTION_PATH`` -- the abstracts of those entries, in the same order.
"""
import json

import bibtexparser

ANTHOLOGY_PATH = 'anthology.bib'
COLLECTION_PATH = 'collection.json'
DATASET_PATH = 'dataset.json'

# Number of entries echoed to stdout as a quick sanity check after parsing.
_PREVIEW_COUNT = 2


def parse_anthology_bibtex(anthology_path):
    """Parse a BibTeX file and return its entries together with their abstracts.

    Entries that have no ``abstract`` field are dropped from BOTH returned
    lists, so ``dataset[i]`` always corresponds to ``collection[i]``. (The
    previous implementation raised ``KeyError`` on the first such entry.)

    A short summary and a preview of the first entries are printed to stdout.

    Args:
        anthology_path: Path of the UTF-8 encoded ``.bib`` file to read.

    Returns:
        A ``(dataset, collection)`` tuple where ``dataset`` is the list of
        entry dicts that carry an abstract and ``collection`` is the list of
        the corresponding abstract values.

    Raises:
        OSError: If ``anthology_path`` cannot be opened.
    """
    with open(anthology_path, 'r', encoding='utf-8') as f:
        acl_bib = bibtexparser.load(f)

    entries = acl_bib.entries

    # Guard the empty case: indexing entries[0] on an empty bibliography
    # would raise IndexError before anything useful is reported.
    if entries:
        print(f'Found {len(entries)} articles with keys: {entries[0].keys()}')
    else:
        print('Found 0 articles')

    for entry in entries[:_PREVIEW_COUNT]:
        print(entry.get('author'))
        print(entry.get('title'))
        # Format instead of concatenating: a missing 'url' yields None, and
        # `None + '\n'` would raise TypeError. Output is unchanged when the
        # field is present.
        print(f"{entry.get('url')}\n")

    # Keep only entries that actually have an abstract so that the two
    # returned lists stay index-aligned.
    dataset = [entry for entry in entries if 'abstract' in entry]
    collection = [entry['abstract'] for entry in dataset]

    skipped = len(entries) - len(dataset)
    if skipped:
        print(f'Skipped {skipped} entries without an abstract')

    return dataset, collection


def main():
    """Parse the anthology and save the dataset and abstract collection as JSON."""
    # Parse and save the anthology dataset
    dataset, collection = parse_anthology_bibtex(ANTHOLOGY_PATH)

    with open(COLLECTION_PATH, 'w', encoding='utf-8') as f:
        json.dump(collection, f, indent=4)

    with open(DATASET_PATH, 'w', encoding='utf-8') as f:
        json.dump(dataset, f, indent=4)


if __name__ == '__main__':
    main()