|
import mysql.connector
|
|
import json
|
|
|
|
|
|
# Name of the MySQL database that holds the ACL Anthology papers.
ACL_DB_NAME = 'acl_anthology'

# Path to the JSON dataset file consumed by read_dataset().
DATASET_PATH = 'dataset.json'
|
|
|
|
|
|
def create_database():
    """Create the ACL database (if missing) and rebuild the `paper` table.

    Side effects: connects to a local MySQL server as root (empty
    password — NOTE(review): fine for local dev, confirm before any
    shared deployment), creates the database when absent, drops and
    recreates the `paper` table, and inserts one row per dataset entry
    that has a non-empty abstract.
    """
    db = mysql.connector.connect(
        host="localhost",
        user="root",
        password="",
    )
    try:
        cursor = db.cursor()
        try:
            # Create the database only if it does not exist yet.
            cursor.execute("SHOW DATABASES")
            acl_db_exists = any(row[0] == ACL_DB_NAME for row in cursor)
            if not acl_db_exists:
                print("Creating new database...")
                cursor.execute(f'CREATE DATABASE {ACL_DB_NAME}')

            cursor.execute(f'USE {ACL_DB_NAME}')

            # Rebuild the paper table from scratch on every run.
            cursor.execute("DROP TABLE IF EXISTS paper")
            cursor.execute(
                "CREATE TABLE paper (pid INT PRIMARY KEY, title VARCHAR(1024), "
                "author VARCHAR(2170), year INT, abstract TEXT(12800), "
                "url VARCHAR(150), type VARCHAR(100), venue VARCHAR(500))"
            )

            acl_data = read_dataset()

            # Hoisted out of the loop: the statement text never changes.
            sql = (
                "INSERT INTO paper (pid, title, author, year, abstract, url, type, venue) "
                "VALUES (%s, %s, %s, %s, %s, %s, %s, %s)"
            )

            vals = []
            for pid, paper in enumerate(acl_data):
                abstract = paper.get('abstract', '')
                # read_dataset() already filters out abstract-less entries,
                # but keep the guard so this function is safe on its own.
                if not abstract:
                    continue
                vals.append((
                    pid,
                    paper.get('title', ''),
                    paper.get('author', ''),
                    paper.get('year', ''),
                    abstract,
                    paper.get('url', ''),
                    # Renamed local: the original shadowed the builtin `type`.
                    paper.get('ENTRYTYPE', ''),
                    paper.get('booktitle', ''),
                ))

            cursor.executemany(sql, vals)
            db.commit()
        finally:
            # Always release the cursor, even if an insert fails.
            cursor.close()
    finally:
        # Always close the connection (the original leaked it).
        db.close()
|
|
|
|
|
|
def read_dataset(path=None):
    """Load the paper dataset from a JSON file.

    Args:
        path: File to read; defaults to the module-level DATASET_PATH.
              (New optional parameter — backward compatible.)

    Returns:
        The list of dataset entries that contain an 'abstract' key.
    """
    if path is None:
        path = DATASET_PATH
    print("Reading dataset...")
    with open(path, 'r', encoding='utf-8') as f:
        # json.load reads the stream directly; no need for loads(f.read()).
        dataset = json.load(f)
    # `in d` is the idiomatic membership test; `.keys()` was redundant.
    return [d for d in dataset if 'abstract' in d]
|
|
|
|
|
|
def main():
    """Script entry point: build/populate the database, then report."""
    create_database()
    print('Done!')
|
|
|
|
|
|
# Run only when executed as a script, not when imported as a module.
if __name__ == '__main__':
    main()
|
|
|