davidheineman committed
Commit adda926
1 Parent(s): 9027915
Files changed (5):
  1. db_init.py +12 -22
  2. db_search.py +12 -6
  3. search.py +1 -1
  4. server.py +19 -16
  5. templates/index.html +0 -1
db_init.py CHANGED
@@ -2,7 +2,8 @@ import mysql.connector
 import json
 
 
-ACL_DB_NAME = 'acl_anthology'
+ACL_DB_NAME = 'acl_anthology'
+DATASET_PATH = 'dataset.json'
 
 
 def create_database():
@@ -28,13 +29,13 @@ def create_database():
 
     # Create table
     cursor.execute(f'DROP TABLE IF EXISTS paper')
-    cursor.execute("CREATE TABLE paper (id INT AUTO_INCREMENT PRIMARY KEY, title VARCHAR(1024), author VARCHAR(2170), year INT, abstract TEXT(12800), url VARCHAR(150), type VARCHAR(100), venue VARCHAR(500))")
+    cursor.execute("CREATE TABLE paper (pid INT PRIMARY KEY, title VARCHAR(1024), author VARCHAR(2170), year INT, abstract TEXT(12800), url VARCHAR(150), type VARCHAR(100), venue VARCHAR(500))")
 
     acl_data = read_dataset()
 
     vals = []
-    for paper in acl_data:
-        sql = "INSERT INTO paper (title, author, year, abstract, url, type, venue) VALUES (%s, %s, %s, %s, %s, %s, %s)"
+    for pid, paper in enumerate(acl_data):
+        sql = "INSERT INTO paper (pid, title, author, year, abstract, url, type, venue) VALUES (%s, %s, %s, %s, %s, %s, %s, %s)"
 
         title = paper.get('title', '')
         author = paper.get('author', '')
@@ -47,34 +48,23 @@ def create_database():
 
         if not abstract:
             continue
 
-        vals += [(title, author, year, abstract, url, type, venue)]
+        vals += [(pid, title, author, year, abstract, url, type, venue)]
 
     cursor.executemany(sql, vals)
     db.commit()
 
 
 def read_dataset():
-    print("Reading dataset")
-    json_file = open('dataset.json')
-    data = json.load(json_file)
-    '''
-    namelen = 0
-    ablen = 0
-    for i in data:
-        if 'title' in i.keys():
-            al = len(i['title'])
-            if ablen < al:
-                print("------------")
-                print(i['title'])
-                ablen = al
-    print(ablen)
-    json_file.close()
-    '''
-    return data
+    print("Reading dataset...")
+    with open(DATASET_PATH, 'r', encoding='utf-8') as f:
+        dataset = json.loads(f.read())
+    dataset = [d for d in dataset if 'abstract' in d.keys()]
+    return dataset
 
 
 def main():
    create_database()
+    print('Done!')
 
 
 if __name__ == '__main__':
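
Note: replacing the AUTO_INCREMENT id with an explicit pid taken from enumerate() ties each row to its zero-based position in the abstract-filtered dataset, which is the same ordering ColBERT's passage ids follow (the old schema forced the i+1 shift in db_search.py). A quick alignment check could catch drift between dataset.json and the table; this is only a sketch, and the helper name and connection handling are assumptions, not part of the commit:

    import json
    import mysql.connector

    def check_pid_alignment(db, dataset_path='dataset.json', n=5):
        # Rebuild the filtered dataset exactly as read_dataset() does
        with open(dataset_path, 'r', encoding='utf-8') as f:
            dataset = [d for d in json.load(f) if 'abstract' in d]
        cursor = db.cursor()
        for pid in range(n):
            # The row stored under pid should match the dataset entry at pid
            cursor.execute("SELECT title FROM paper WHERE pid = %s", (pid,))
            row = cursor.fetchone()
            assert row is not None and row[0] == dataset[pid].get('title', '')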
db_search.py CHANGED
@@ -1,12 +1,12 @@
 import mysql.connector
 
 
-PAPER_QUERY = 'SELECT * FROM paper WHERE id IN ({query_arg_str}) AND year >= {year}'
+PAPER_QUERY = 'SELECT * FROM paper WHERE pid IN ({query_arg_str}) AND year >= {year}'
 
 
 def complete_request(colbert_response, year):
     pids = [r['pid'] for r in colbert_response["topk"]]
-    pids_inc = [i+1 for i in pids]
+    pids_inc = [i for i in pids]
 
     # Get data from DB
     db = mysql.connector.connect(
@@ -28,20 +28,26 @@ def complete_request(colbert_response, year):
     results = cursor.fetchall()
 
     if len(results) == 0: return []
+
+    parsed_results = parse_results(results)
+
+    # Restore original ordering of PIDs from ColBERT
+    results = [parsed_results[pid] for pid in pids_inc if pid in parsed_results.keys()]
+
     return results
 
 
 def parse_results(results):
-    parsed_results = []
+    parsed_results = {}
 
     for result in results:
-        _, title, authors, year, abstract, url, type, venue = result
+        pid, title, authors, year, abstract, url, type, venue = result
 
         title = title.replace("{", "").replace("}", "")
         authors = authors.replace("{", "").replace("}", "").replace('\\"', "")
         abstract = abstract.replace("{", "").replace("}", "").replace("\\", "")
 
-        parsed_results += [{
+        parsed_results[int(pid)] = {
             'title': title,
             'authors': authors,
             'year': year,
@@ -49,6 +55,6 @@ def parse_results(results):
             'url': url,
             'type': type,
             'venue': venue,
-        }]
+        }
 
     return parsed_results
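
Note: a SQL IN (...) clause returns rows in whatever order the engine chooses, so parse_results now returns a dict keyed by pid and complete_request re-sorts it by ColBERT's ranking (with zero-based pids, the old i+1 shift also becomes a plain copy). A self-contained sketch of the reordering trick, with made-up data:

    # ColBERT's ranking, best match first
    ranked_pids = [42, 7, 19]
    # Rows as parse_results returns them: keyed by pid, arbitrary order
    rows_by_pid = {7: {'title': 'B'}, 19: {'title': 'C'}, 42: {'title': 'A'}}
    # Re-emit rows in ranking order, skipping pids the year filter removed
    ordered = [rows_by_pid[pid] for pid in ranked_pids if pid in rows_by_pid]
    assert [r['title'] for r in ordered] == ['A', 'B', 'C']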
search.py CHANGED
@@ -33,7 +33,7 @@ searcher = Searcher(index=INDEX_NAME, collection=collection)
 QUERY_MAX_LEN = searcher.config.query_maxlen
 NCELLS = 1
 CENTROID_SCORE_THRESHOLD = 0.5  # How close a document has to be to a centroid to be considered
-NDOCS = 64  # Number of closest documents to consider
+NDOCS = 512  # Number of closest documents to consider
 
 
 def init_colbert(index_path=INDEX_PATH, load_index_with_mmap=False):
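
Note: raising NDOCS from 64 to 512 widens the candidate pool scored after centroid pruning, trading some latency for recall. This matters because server.py now requests up to k=100 results, and the pool must at least cover the requested top-k. A minimal sketch of that invariant (the exact interaction inside ColBERT's search is an assumption here):

    K_MAX = 100   # mirrors the k cap in server.py
    NDOCS = 512   # candidate documents scored per query
    assert K_MAX <= NDOCS, "candidate pool must cover the requested top-k"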
server.py CHANGED
@@ -11,22 +11,22 @@ app = Flask(__name__)
 
 counter = {"api" : 0}
 
-# Load data
-COLLECTION_PATH = 'collection.json'
-DATASET_PATH = 'dataset.json'
+# # Load data
+# COLLECTION_PATH = 'collection.json'
+# DATASET_PATH = 'dataset.json'
 
-with open(COLLECTION_PATH, 'r', encoding='utf-8') as f:
-    collection = json.loads(f.read())
-with open(DATASET_PATH, 'r', encoding='utf-8') as f:
-    dataset = json.loads(f.read())
-    dataset = [d for d in dataset if 'abstract' in d.keys()] # We only indexed the entries containing abstracts
+# with open(COLLECTION_PATH, 'r', encoding='utf-8') as f:
+#     collection = json.loads(f.read())
+# with open(DATASET_PATH, 'r', encoding='utf-8') as f:
+#     dataset = json.loads(f.read())
+#     dataset = [d for d in dataset if 'abstract' in d.keys()] # We only indexed the entries containing abstracts
 
 
 @lru_cache(maxsize=1000000)
-def api_search_query(query, k):
+def api_search_query(query, k=10):
     print(f"Query={query}")
 
-    k = 10 if k == None else min(int(k), 100)
+    k = min(int(k), 100)
 
     # Use ColBERT to find passages related to the query
     pids, ranks, scores = search_colbert(query, k)
@@ -39,12 +39,12 @@ def api_search_query(query, k):
     topk = []
     for pid, rank, score, prob in zip(pids, ranks, scores, probs):
         topk += [{
-            'text': collection[pid],
             'pid': pid,
             'rank': rank,
             'score': score,
             'prob': prob,
-            'entry': dataset[pid]
+            # 'text': collection[pid],
+            # 'entry': dataset[pid]
         }]
 
     topk = list(sorted(topk, key=lambda p: (-1 * p['score'], p['pid'])))
@@ -71,16 +71,19 @@ def query():
     if request.method == "POST":
         query, year = request.form['query'], request.form['year']
 
+        K = 100
+
         # Get top passage IDs from ColBERT
-        colbert_response = api_search_query(query, 10)
+        colbert_response = api_search_query(query, K)
 
         results = complete_request(colbert_response, year)
 
+        print(colbert_response)
+
         if results:
-            parsed_results = parse_results(results)
-            return render_template('results.html', query=query, year=year, results=parsed_results)
+            return render_template('results.html', query=query, year=year, results=results)
 
-        return render_template('no_results.html')
+        return render_template('no_results.html', query=query, year=year)
 
 
 if __name__ == "__main__":
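
Note: one subtlety with the new default argument: functools.lru_cache keys on the exact call signature, so relying on the default and passing k explicitly (as query() now does with K=100) create separate cache entries, and the min(int(k), 100) cap runs only after the cache lookup. A minimal illustration; the body is a stand-in for the real ColBERT search:

    from functools import lru_cache

    @lru_cache(maxsize=1000000)
    def api_search_query(query, k=10):
        k = min(int(k), 100)   # capped inside, after the cache key is formed
        return (query, k)      # stand-in for the real search

    api_search_query("bert")         # cached under ("bert",)
    api_search_query("bert", 10)     # separate entry: ("bert", 10)
    api_search_query("bert", 150)    # separate entry, though k caps at 100
    assert api_search_query.cache_info().currsize == 3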
templates/index.html CHANGED
@@ -3,7 +3,6 @@
     <link rel="stylesheet" type="text/css" class='welcome-form' href="{{ url_for('static', filename='css/styles.css') }}">
     <link href="https://fonts.googleapis.com/css?family=Droid+Serif" rel="stylesheet">
     <link href="https://fonts.googleapis.com/css?family=Droid+Sans" rel="stylesheet">
-
 </head>
 <body>
     <div id="welcome-message" class="welcome-message">