colbert-acl / db_init.py
davidheineman's picture
bug fixes
adda926
import mysql.connector
import json
ACL_DB_NAME = 'acl_anthology'
DATASET_PATH = 'dataset.json'
def create_database():
db = mysql.connector.connect(
host = "localhost",
user = "root",
password = ""
)
cursor = db.cursor()
cursor.execute("SHOW DATABASES")
acl_db_exists = False
for x in cursor:
db_name = x[0]
if db_name == ACL_DB_NAME:
acl_db_exists = True
# Create database
if not acl_db_exists:
print("Creating new database...")
cursor.execute(f'CREATE DATABASE {ACL_DB_NAME}')
cursor.execute(f'USE {ACL_DB_NAME}')
# Create table
cursor.execute(f'DROP TABLE IF EXISTS paper')
cursor.execute("CREATE TABLE paper (pid INT PRIMARY KEY, title VARCHAR(1024), author VARCHAR(2170), year INT, abstract TEXT(12800), url VARCHAR(150), type VARCHAR(100), venue VARCHAR(500))")
acl_data = read_dataset()
vals = []
for pid, paper in enumerate(acl_data):
sql = "INSERT INTO paper (pid, title, author, year, abstract, url, type, venue) VALUES (%s, %s, %s, %s, %s, %s, %s, %s)"
title = paper.get('title', '')
author = paper.get('author', '')
year = paper.get('year', '')
abstract = paper.get('abstract', '')
url = paper.get('url', '')
type = paper.get('ENTRYTYPE', '')
venue = paper.get('booktitle', '')
if not abstract:
continue
vals += [(pid, title, author, year, abstract, url, type, venue)]
cursor.executemany(sql, vals)
db.commit()
def read_dataset():
print("Reading dataset...")
with open(DATASET_PATH, 'r', encoding='utf-8') as f:
dataset = json.loads(f.read())
dataset = [d for d in dataset if 'abstract' in d.keys()]
return dataset
def main():
create_database()
print('Done!')
if __name__ == '__main__':
main()