davidheineman committed
Commit fbce275
1 Parent(s): 00b3aaf

add MySQL backend

Files changed (10)
  1. README.md +17 -1
  2. db_init.py +80 -0
  3. db_search.py +55 -0
  4. search.py +17 -13
  5. server.py +27 -4
  6. static/css/styles.css +66 -0
  7. templates/index.html +26 -0
  8. templates/no_results.html +10 -0
  9. templates/results.html +20 -0
  10. utils.py +43 -0
README.md CHANGED
@@ -2,7 +2,7 @@
 license: apache-2.0
 ---
 
-## Setup
+## Setup ColBERT
 First, clone this repo and create a conda environment and install the dependencies:
 ```sh
 git clone https://huggingface.co/davidheineman/colbert-acl
@@ -17,6 +17,22 @@ gunzip anthology+abstracts.bib.gz
 mv anthology+abstracts.bib anthology.bib
 ```
 
+## Setup server
+Install the pip dependencies:
+```sh
+pip install mysql-connector-python flask
+```
+
+Set up a local MySQL server:
+```sh
+brew install mysql
+```
+
+Run the database setup to copy the ACL entries:
+```sh
+python db_init.py
+```
+
 ### (Optional) Step 1: Parse the Anthology
 
 Feel free to skip steps 1 and 2, since the parsed/indexed anthology is contained in this repo. To parse the `.bib` file into `.json`:
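Before running `db_init.py`, it can help to confirm the local MySQL server is reachable with the credentials the scripts assume (user `root` with an empty password on `localhost`, per `db_init.py`). A minimal sketch under those assumptions:

```python
# Minimal connectivity check, assuming the same credentials db_init.py uses
# (user "root", empty password, localhost). Adjust if your MySQL setup differs.
import mysql.connector

db = mysql.connector.connect(host="localhost", user="root", password="")
cursor = db.cursor()
cursor.execute("SHOW DATABASES")
for (name,) in cursor:
    print(name)
db.close()
```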
db_init.py ADDED
@@ -0,0 +1,80 @@
+import mysql.connector
+import json
+
+
+ACL_DB_NAME = 'acl_anthology'
+
+
+def create_database():
+    db = mysql.connector.connect(
+        host = "localhost",
+        user = "root",
+        password = ""
+    )
+    cursor = db.cursor()
+
+    # Check whether the ACL database already exists
+    cursor.execute("SHOW DATABASES")
+    acl_db_exists = False
+    for x in cursor:
+        db_name = x[0]
+        if db_name == ACL_DB_NAME:
+            acl_db_exists = True
+
+    if not acl_db_exists:
+        print("Creating new database...")
+        cursor.execute(f'CREATE DATABASE {ACL_DB_NAME}')
+
+    # Select the database before creating the table or inserting rows
+    cursor.execute(f'USE {ACL_DB_NAME}')
+
+    if not acl_db_exists:
+        cursor.execute("CREATE TABLE paper (id INT AUTO_INCREMENT PRIMARY KEY, title VARCHAR(1024), author VARCHAR(2170), year INT, abstract VARCHAR(12800))")
+
+    acl_data = read_dataset()
+
+    # Build one row per paper; entries without an abstract are skipped
+    sql = "INSERT INTO paper (title, author, year, abstract) VALUES (%s, %s, %s, %s)"
+    vals = []
+    for paper in acl_data:
+        title, author, abstract, year = '', '', '', ''
+        if 'title' in paper.keys():
+            title = paper['title']
+        if 'author' in paper.keys():
+            author = paper['author']
+        if 'year' in paper.keys():
+            year = paper['year']
+        if 'abstract' in paper.keys():
+            abstract = paper['abstract']
+        else:
+            continue
+        val = (title, author, year, abstract)
+        vals.append(val)
+
+    cursor.executemany(sql, vals)
+    db.commit()
+
+
+def read_dataset():
+    print("Reading dataset")
+    json_file = open('dataset.json')
+    data = json.load(json_file)
+    json_file.close()
+    '''
+    # Scratch: find the longest title in the dataset
+    namelen = 0
+    ablen = 0
+    for i in data:
+        if 'title' in i.keys():
+            al = len(i['title'])
+            if ablen < al:
+                print("------------")
+                print(i['title'])
+                ablen = al
+    print(ablen)
+    '''
+    return data
+
+
+def main():
+    create_database()
+
+
+if __name__ == '__main__':
+    main()
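A quick way to verify the import worked is to count the rows that `db_init.py` wrote to the `paper` table. A sketch, assuming the default local setup above:

```python
# Sketch: confirm db_init.py populated the acl_anthology database.
# Assumes the local root/empty-password setup used throughout this commit.
import mysql.connector

db = mysql.connector.connect(host="localhost", user="root", password="",
                             database="acl_anthology")
cursor = db.cursor()
cursor.execute("SELECT COUNT(*) FROM paper")
print("rows in paper:", cursor.fetchone()[0])
cursor.execute("SELECT id, title, year FROM paper LIMIT 1")
print(cursor.fetchone())
db.close()
```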
db_search.py ADDED
@@ -0,0 +1,55 @@
+import mysql.connector
+
+
+def complete_request(colbert_response, year):
+    NUM_ARTICLES = len(colbert_response["topk"])
+
+    # Get article IDs
+    article_ids = [None] * NUM_ARTICLES
+    for i in range(NUM_ARTICLES):
+        article_ids[i] = colbert_response["topk"][i]["pid"]
+
+    print(article_ids)
+
+    # Get data from DB
+    db = mysql.connector.connect(
+        host = "localhost",
+        user = "root",
+        password = "",
+        database = "acl_anthology"
+    )
+
+    cursor = db.cursor()
+
+    # Pass the article ids and the year cutoff as query parameters
+    query_arg_str = ', '.join(['%s'] * NUM_ARTICLES)
+    sql = f'SELECT * FROM paper WHERE id IN ({query_arg_str}) AND year >= %s'
+
+    print(sql)
+    print(article_ids)
+
+    # ColBERT passage ids are 0-indexed; the MySQL AUTO_INCREMENT ids start at 1
+    article_ids_inc = [x + 1 for x in article_ids]
+
+    cursor.execute(sql, article_ids_inc + [year])
+    res = cursor.fetchall()
+    if len(res) == 0:
+        return []
+
+    print(res[0])
+
+    return res
+
+
+def parse_results(results):
+    # Each row is (id, title, author, year, abstract); strip BibTeX markup
+    parsed_results = []
+    for result in results:
+        title = result[1]
+        authors = result[2]
+        year = result[3]
+        abstract = result[4]
+
+        title = title.replace("{", "").replace("}", "")
+        authors = authors.replace("{", "").replace("}", "").replace('\\"', "")
+        abstract = abstract.replace("{", "").replace("}", "").replace("\\", "")
+
+        parsed_result = {'title': title, 'authors': authors, 'year': year, 'abstract': abstract}
+        parsed_results.append(parsed_result)
+
+    return parsed_results
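`complete_request` only relies on the response carrying a `topk` list of entries with a `pid` field, which is the shape `api_search_query` in `server.py` passes through. A hedged example with a mocked response (the pid values below are placeholders, not real index ids):

```python
# Hypothetical usage of db_search with a mocked ColBERT response.
# The pids are placeholders; real ones come from api_search_query.
from db_search import complete_request, parse_results

mock_response = {"topk": [{"pid": 0}, {"pid": 41}, {"pid": 512}]}
rows = complete_request(mock_response, 2020)   # papers from 2020 onward
for paper in parse_results(rows):
    print(paper["year"], "-", paper["title"])
```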
search.py CHANGED
@@ -8,7 +8,7 @@ from colbert.search.strided_tensor import StridedTensor
 from colbert.indexing.codecs.residual_embeddings_strided import ResidualEmbeddingsStrided
 from colbert.indexing.codecs.residual import ResidualCodec
 
-from utils import filter_pids
+from utils import filter_pids, decompress_residuals
 
 INDEX_NAME = os.getenv("INDEX_NAME", 'index')
 INDEX_ROOT = os.getenv("INDEX_ROOT", '.')
@@ -30,6 +30,7 @@ with open(COLLECTION_PATH, 'r', encoding='utf-8') as f:
 
 searcher = Searcher(index=INDEX_NAME, collection=collection)
 
+QUERY_MAX_LEN = searcher.config.query_maxlen
 NCELLS = 1
 CENTROID_SCORE_THRESHOLD = 0.5 # How close a document has to be to a centroid to be considered
 NDOCS = 64 # Number of closest documents to consider
@@ -139,22 +140,20 @@ def _calculate_colbert(Q: torch.Tensor):
 
     # print(centroid_scores.shape) # (num_questions, 32, hidden_dim)
     # print(unfiltered_pids.shape) # (num_passage_candidates)
-    # ivf_1, ivf_2 = ivf.as_padded_tensor()
-    # print(ivf_1.shape)
-    # print(ivf_2.shape)
 
     # Stage 2 and 3 (Centroid Interaction with Pruning, then without Pruning)
     idx = centroid_scores.max(-1).values >= CENTROID_SCORE_THRESHOLD
-    pids = filter_pids(
-        unfiltered_pids, centroid_scores, embeddings.codes, doclens, offsets, idx, NDOCS
-    )
-
-    # C++ : Filter pids under the centroid score threshold
-    # pids_true = IndexScorer.filter_pids(
-    #     unfiltered_pids, centroid_scores, embeddings.codes, doclens, offsets, idx, NDOCS
-    # )
-    # assert torch.equal(pids_true, pids), f'\n{pids_true}\n{pids}'
-    # print('Stage 2 filtering:', unfiltered_pids.shape, '->', pids.shape) # (n_docs) -> (n_docs/4)
+    # pids = filter_pids(
+    #     unfiltered_pids, centroid_scores, embeddings.codes, doclens, offsets, idx, NDOCS
+    # )
+
+    # C++ : Filter pids under the centroid score threshold
+    pids_true = IndexScorer.filter_pids(
+        unfiltered_pids, centroid_scores, embeddings.codes, doclens, offsets, idx, NDOCS
+    )
+    pids = pids_true
+    assert torch.equal(pids_true, pids), f'\n{pids_true}\n{pids}'
+    print('Stage 2 filtering:', unfiltered_pids.shape, '->', pids.shape) # (n_docs) -> (n_docs/4)
 
     # Stage 3.5 (Decompression) - Get the true passage embeddings for calculating maxsim
     D_packed = IndexScorer.decompress_residuals(
@@ -162,6 +161,11 @@ def _calculate_colbert(Q: torch.Tensor):
         codec.decompression_lookup_table, embeddings.residuals, embeddings.codes,
         centroids, codec.dim, nbits
     )
+    # D_packed = decompress_residuals(
+    #     pids, doclens, offsets, bucket_weights, codec.reversed_bit_map,
+    #     codec.decompression_lookup_table, embeddings.residuals, embeddings.codes,
+    #     centroids, codec.dim, nbits
+    # )
     D_packed = F.normalize(D_packed.to(torch.float32), p=2, dim=-1)
     D_mask = doclens[pids.long()]
     D_padded, D_lengths = StridedTensor(D_packed, D_mask, use_gpu=False).as_padded_tensor()
@@ -180,7 +184,7 @@ def search_colbert(query, k):
     """
     # Encode query using ColBERT model, using the appropriate [Q], [D] tokens
    Q = searcher.encode(query)
-    Q = Q[:, :searcher.config.query_maxlen] # Cut off query to maxlen tokens
+    Q = Q[:, :QUERY_MAX_LEN] # Cut off query to maxlen tokens
 
     scores, pids = _calculate_colbert(Q)
 
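The Stage 2 path that this change switches on prunes centroids whose best score against any query token falls below `CENTROID_SCORE_THRESHOLD`, before the C++ `IndexScorer.filter_pids` call drops the corresponding candidate passages. A toy illustration of just the thresholding line (the tensor values here are invented and far smaller than a real `centroid_scores`):

```python
# Toy illustration of the Stage 2 pruning mask; the scores are made up.
import torch

CENTROID_SCORE_THRESHOLD = 0.5
centroid_scores = torch.tensor([[0.9, 0.2],
                                [0.4, 0.3],
                                [0.1, 0.7]])

# Keep a centroid if its best score across query tokens clears the threshold
idx = centroid_scores.max(-1).values >= CENTROID_SCORE_THRESHOLD
print(idx)  # tensor([ True, False,  True])
```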
server.py CHANGED
@@ -1,9 +1,10 @@
 import os, math, json
 
-from flask import Flask, request
+from flask import Flask, request, render_template
 from functools import lru_cache
 
 from search import init_colbert, search_colbert
+from db_search import complete_request, parse_results
 
 PORT = int(os.getenv("PORT", 8893))
 app = Flask(__name__)
@@ -56,8 +57,30 @@ def api_search():
         counter["api"] += 1
         print("API request count:", counter["api"])
         return api_search_query(request.args.get("query"), request.args.get("k"))
-    else:
-        return ('', 405)
+
+    return ('', 405)
+
+
+@app.route('/', methods=['POST', 'GET'])
+def index():
+    return render_template('index.html')
+
+
+@app.route('/query', methods=['POST', 'GET'])
+def query():
+    if request.method == "POST":
+        query, year = request.form['query'], request.form['year']
+
+        # Get top passage IDs from ColBERT
+        colbert_response = api_search_query(query, 10)
+
+        results = complete_request(colbert_response, year)
+
+        if results:
+            parsed_results = parse_results(results)
+            return render_template('results.html', query=query, year=year, results=parsed_results)
+
+        # Pass the query and year through so the template can display them
+        return render_template('no_results.html', query=query, year=year)
+
+    return ('', 405)
 
 
 if __name__ == "__main__":
@@ -69,5 +92,5 @@ if __name__ == "__main__":
     init_colbert()
     # test_response = api_search_query("What is NLP?", 2)
    # print(test_response)
-    print(f'Test it at: http://localhost:8893/api/search?k=25&query=How to extend context windows?')
+    # print(f'Test it at: http://localhost:8893/api/search?k=25&query=How to extend context windows?')
     app.run("0.0.0.0", PORT)
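With `server.py` running locally, the JSON endpoint (and, through the new form, the `/query` route) can be exercised directly, along the lines of the commented test URL above. A sketch using only the standard library, assuming the default port 8893:

```python
# Sketch: query the /api/search endpoint of a locally running server.py.
# Assumes the default PORT (8893); the query string is just an example.
import json
import urllib.parse
import urllib.request

params = urllib.parse.urlencode({"query": "How to extend context windows?", "k": 5})
with urllib.request.urlopen(f"http://localhost:8893/api/search?{params}") as resp:
    print(json.load(resp))
```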
static/css/styles.css ADDED
@@ -0,0 +1,66 @@
+/* Body background */
+body {
+    background-color: #f4f4f4;
+}
+
+/* Custom fonts */
+h1 {
+    text-align: center; /* Center the text */
+    font-family: 'Droid Serif', Georgia, Times, serif;
+}
+
+p {
+    font-family: 'Droid Sans', Helvetica, Arial, sans-serif;
+}
+
+/* Formatting welcome message */
+#welcome-message {
+    text-align: center; /* Center the text */
+    margin-bottom: 20px; /* Add some space below the message */
+}
+
+#welcome-message h1 {
+    font-size: 36px; /* Adjust font size */
+    color: #333; /* Text color */
+}
+
+#welcome-message p {
+    font-size: 18px; /* Adjust font size */
+    color: #666; /* Text color */
+}
+
+/* Style the form container */
+form {
+    margin: 20px auto; /* Center horizontally */
+    padding: 20px;
+    border: 1px solid #ccc;
+    border-radius: 5px;
+    width: 300px;
+    background-color: #fff; /* Form background color */
+    box-shadow: 0px 0px 10px rgba(0, 0, 0, 0.1); /* Add a subtle shadow */
+}
+
+/* Style the form input fields */
+input[type="text"] {
+    width: calc(100% - 22px); /* Adjust width to account for padding */
+    padding: 10px;
+    margin-bottom: 10px;
+    border: 1px solid #ccc;
+    border-radius: 5px;
+}
+
+/* Style the submit button */
+input[type="submit"] {
+    width: 100%;
+    padding: 10px;
+    background-color: #4CAF50;
+    color: white;
+    border: none;
+    border-radius: 5px;
+    cursor: pointer;
+}
+
+/* Change the submit button color on hover */
+input[type="submit"]:hover {
+    background-color: #45a049;
+}
templates/index.html ADDED
@@ -0,0 +1,26 @@
+<html>
+<head>
+    <link rel="stylesheet" type="text/css" href="{{ url_for('static', filename='css/styles.css') }}">
+    <link href="https://fonts.googleapis.com/css?family=Droid+Serif" rel="stylesheet">
+    <link href="https://fonts.googleapis.com/css?family=Droid+Sans" rel="stylesheet">
+</head>
+<body>
+    <div id="welcome-message">
+        <h1>Welcome!</h1>
+        <p>Please enter your search terms below</p>
+    </div>
+
+    <form action="http://localhost:8893/query" method="post">
+        <!-- Label and input field for Query -->
+        <label for="query">Query:</label>
+        <input type="text" id="query" name="query" placeholder="Enter your search query">
+
+        <!-- Label and input field for Year -->
+        <label for="year">Year:</label>
+        <input type="text" id="year" name="year" placeholder="Enter the year">
+
+        <input type="submit" value="Submit">
+    </form>
+</body>
+</html>
templates/no_results.html ADDED
@@ -0,0 +1,10 @@
+<!DOCTYPE html>
+<html>
+<head>
+    <title>Search Results</title>
+    <link rel="stylesheet" type="text/css" href="{{ url_for('static', filename='css/styles.css') }}">
+</head>
+<body>
+    <h1>Unfortunately, no papers seem to match your search for "{{ query }}" in {{ year }}</h1>
+</body>
+</html>
templates/results.html ADDED
@@ -0,0 +1,20 @@
+<!DOCTYPE html>
+<html>
+<head>
+    <title>Search Results</title>
+    <link rel="stylesheet" type="text/css" href="{{ url_for('static', filename='css/styles.css') }}">
+</head>
+<body>
+    <h1>Search Results for "{{ query }}" in {{ year }}</h1>
+    <ul>
+        {% for result in results %}
+        <li>
+            <p><strong>Title:</strong> {{ result.title }}</p>
+            <p><strong>Authors:</strong> {{ result.authors }}</p>
+            <p><strong>Year:</strong> {{ result.year }}</p>
+            <p><strong>Abstract:</strong> {{ result.abstract }}</p>
+        </li>
+        {% endfor %}
+    </ul>
+</body>
+</html>
utils.py CHANGED
@@ -50,3 +50,46 @@ def filter_pids(pids, centroid_scores, codes, doclens, offsets, idx, nfiltered_d
     print('Stage 3 filtering:', filtered_pids.shape, '->', final_filtered_pids.shape) # (n_docs) -> (n_docs/4)
 
     return final_filtered_pids
+
+
+def decompress_residuals(pids, doclens, offsets, bucket_weights, reversed_bit_map,
+                         bucket_weight_combinations, binary_residuals, codes,
+                         centroids, dim, nbits):
+    npacked_vals_per_byte = 8 // nbits
+    packed_dim = dim // npacked_vals_per_byte
+    cumulative_lengths = [0 for _ in range(len(pids) + 1)]
+    noutputs = 0
+    for i in range(len(pids)):
+        noutputs += doclens[pids[i]]
+        cumulative_lengths[i + 1] = cumulative_lengths[i] + doclens[pids[i]]
+
+    # Pre-allocate the flattened output, since it is written to by index below
+    output = [0] * int(noutputs * dim)
+
+    binary_residuals = binary_residuals.flatten()
+    centroids = centroids.flatten()
+
+    # Iterate over all documents
+    for i in range(len(pids)):
+        pid = pids[i]
+
+        # Offset into packed list of token vectors for the given document
+        offset = offsets[pid]
+
+        # For each document, iterate over all token vectors
+        for j in range(doclens[pid]):
+            code = codes[offset + j]
+
+            # For each token vector, iterate over the packed (8-bit) residual values
+            for k in range(packed_dim):
+                x = binary_residuals[(offset + j) * packed_dim + k]
+                x = reversed_bit_map[x]
+
+                # For each packed residual value, iterate over the bucket weight indices.
+                # If we use n-bit compression, that means there will be (8 / n) indices per packed value.
+                for l in range(npacked_vals_per_byte):
+                    output_dim_idx = k * npacked_vals_per_byte + l
+                    bucket_weight_idx = bucket_weight_combinations[x * npacked_vals_per_byte + l]
+                    output[(cumulative_lengths[i] + j) * dim + output_dim_idx] = \
+                        bucket_weights[bucket_weight_idx] + centroids[code * dim + output_dim_idx]
+
+    return output
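The index arithmetic in `decompress_residuals` follows from the bit packing: with `nbits`-bit residuals, each byte stores `8 // nbits` values, so a `dim`-dimensional vector occupies `dim // (8 // nbits)` packed bytes. A small worked check with illustrative numbers (the real `dim` and `nbits` come from the index config):

```python
# Illustrative check of the packing arithmetic used in decompress_residuals.
# dim and nbits here are example values, not read from a real ColBERT index.
for nbits in (1, 2, 4):
    dim = 128
    npacked_vals_per_byte = 8 // nbits
    packed_dim = dim // npacked_vals_per_byte
    print(f"nbits={nbits}: {npacked_vals_per_byte} residual values per byte, "
          f"{packed_dim} packed bytes per {dim}-dim vector")
```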