import json

import mysql.connector

ACL_DB_NAME = 'acl_anthology'
DATASET_PATH = 'dataset.json'

# Loop-invariant INSERT statement: %s placeholders keep values parameterized
# (the connector handles quoting/escaping), and executemany() sends them in bulk.
_INSERT_SQL = (
    "INSERT INTO paper (pid, title, author, year, abstract, url, type, venue) "
    "VALUES (%s, %s, %s, %s, %s, %s, %s, %s)"
)


def create_database():
    """Create the ACL Anthology database and (re)load the `paper` table.

    Connects to a local MySQL server as root (empty password), creates the
    database if it does not already exist, drops and recreates the `paper`
    table, and bulk-inserts every paper from DATASET_PATH that has a
    non-empty abstract. The connection and cursor are always closed, even
    if an intermediate statement fails.
    """
    db = mysql.connector.connect(host="localhost", user="root", password="")
    cursor = db.cursor()
    try:
        # Create the database only when it is not already present.
        cursor.execute("SHOW DATABASES")
        if not any(row[0] == ACL_DB_NAME for row in cursor):
            print("Creating new database...")
            cursor.execute(f'CREATE DATABASE {ACL_DB_NAME}')
        cursor.execute(f'USE {ACL_DB_NAME}')

        # Rebuild the table from scratch on every run.
        cursor.execute('DROP TABLE IF EXISTS paper')
        cursor.execute(
            "CREATE TABLE paper (pid INT PRIMARY KEY, title VARCHAR(1024), "
            "author VARCHAR(2170), year INT, abstract TEXT(12800), "
            "url VARCHAR(150), type VARCHAR(100), venue VARCHAR(500))"
        )

        vals = []
        for pid, paper in enumerate(read_dataset()):
            abstract = paper.get('abstract', '')
            # read_dataset() already drops entries without an 'abstract' key;
            # this also skips entries whose abstract is present but empty.
            if not abstract:
                continue
            vals.append((
                pid,
                paper.get('title', ''),
                paper.get('author', ''),
                paper.get('year', ''),
                abstract,
                paper.get('url', ''),
                paper.get('ENTRYTYPE', ''),   # avoid shadowing builtin `type`
                paper.get('booktitle', ''),
            ))
        cursor.executemany(_INSERT_SQL, vals)
        db.commit()
    finally:
        # Always release the cursor and connection, even on failure.
        cursor.close()
        db.close()


def read_dataset():
    """Load DATASET_PATH (a JSON array of paper dicts) and return only the
    entries that carry an 'abstract' key."""
    print("Reading dataset...")
    with open(DATASET_PATH, 'r', encoding='utf-8') as f:
        dataset = json.load(f)
    return [d for d in dataset if 'abstract' in d]


def main():
    """Build and populate the database, then report completion."""
    create_database()
    print('Done!')


if __name__ == '__main__':
    main()