rahulkiitk committed on
Commit
632338f
1 Parent(s): 91d52f9

Updating Leaderboard code

app.py ADDED
@@ -0,0 +1,222 @@
+ import gradio as gr
+ import pandas as pd
+ import os
+ from apscheduler.schedulers.background import BackgroundScheduler
+ from huggingface_hub import HfApi
+ from uploads import add_new_eval
+
+ CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
+ CITATION_BUTTON_TEXT = r"""@inproceedings{kumar-etal-2024-booksql,
+     title = "BookSQL: A Large Scale Text-to-SQL Dataset for Accounting Domain",
+     author = "Kumar, Rahul and Raja, Amar and Harsola, Shrutendra and Subrahmaniam, Vignesh and Modi, Ashutosh",
+     booktitle = "Proceedings of the 2024 Conference of the North American Chapter of the Association for Computational Linguistics",
+     month = "june",
+     year = "2024",
+     address = "Mexico City, Mexico",
+     publisher = "Association for Computational Linguistics"
+ }"""
+
+ api = HfApi()
+ TOKEN = os.environ.get("TOKEN", None)
+ LEADERBOARD_PATH = "Exploration-Lab/BookSQL-Leaderboard"
+
+
+ def restart_space():
+     api.restart_space(repo_id=LEADERBOARD_PATH, token=TOKEN)
+
+
+ # Load the baseline leaderboard CSV and keep only the selected metric columns.
+ def baseline_load_data(tasks):
+     file_path = "submissions/baseline/baseline.csv"
+     df = pd.read_csv(file_path)
+
+     # We only want specific columns, in a specific order.
+     column_names = [
+         "Method",
+         "Submitted By",
+         "EMA",
+         "EX",
+         "BLEU-4",
+         "ROUGE-L",
+     ]
+     if tasks is None:
+         tasks = []  # no metrics selected; show only Method and Submitted By
+     # Based on the selected tasks, drop the metric columns that are not needed.
+     for metric in ["EMA", "EX", "BLEU-4", "ROUGE-L"]:
+         if metric not in tasks:
+             column_names.remove(metric)
+
+     df = df[column_names]
+     df = df.drop_duplicates(subset=["Method"], keep="first")
+
+     return df
+
+
+ def load_data(tasks):
+     baseline_df = baseline_load_data(tasks)
+     return baseline_df
+
+
+ # Filter the leaderboard by method name.
+ def search_leaderboard(df, query):
+     if query == "":
+         return df
+     return df[df["Method"].str.contains(query)]
+
+
+ # Reload the leaderboard when the selected metrics change.
+ def change_version(tasks):
+     new_df = load_data(tasks)
+     return new_df
+
+
+ # Initialize the Gradio app.
+ demo = gr.Blocks()
+
+ with demo:
+     gr.Markdown(
+         """
+         ## 🥇 BookSQL Leaderboard
+         Given the importance and wide prevalence of business databases across the world, BookSQL focuses on the finance and accounting domain. Accounting databases are used across a wide spectrum of industries, such as construction, healthcare, retail, educational services, insurance, restaurants, and real estate. Businesses in these industries arrange their financial transactions into their own sets of categories (called a chart of accounts in accounting terminology).
+         A Text-to-SQL system developed on BookSQL should therefore be robust at handling various types of accounting databases. The dataset comprises around 1 million transactions in total; the dataset's statistics are provided in the table below. It was prepared under the supervision of financial experts and covers 27 businesses, each with around 35k-40k transactions.
+         Read more at [https://exploration-lab.github.io/BookSQL/](https://exploration-lab.github.io/BookSQL/).
+         Please follow the format of the [sample prediction file](https://huggingface.co/spaces/Exploration-Lab/BookSQL/blob/main/sample_prediction.csv) when uploading predictions.
+         """
+     )
+
+     with gr.Row():
+         with gr.Accordion("📙 Citation", open=False):
+             citation_button = gr.Textbox(
+                 value=CITATION_BUTTON_TEXT,
+                 label=CITATION_BUTTON_LABEL,
+                 elem_id="citation-button",
+                 show_copy_button=True,
+             )
+
+     with gr.Tabs():
+         with gr.TabItem("Leaderboard"):
+             with gr.Row():
+                 tasks_checkbox = gr.CheckboxGroup(
+                     label="Select Tasks",
+                     choices=["EMA", "EX", "BLEU-4", "ROUGE-L"],
+                     value=["EMA", "EX", "BLEU-4", "ROUGE-L"],
+                 )
+
+             with gr.Row():
+                 search_bar = gr.Textbox(
+                     placeholder="Search for methods...",
+                     show_label=False,
+                 )
+
+             leaderboard_table = gr.components.Dataframe(
+                 value=load_data(["EMA", "EX", "BLEU-4", "ROUGE-L"]),
+                 interactive=True,
+                 visible=True,
+             )
+
+             search_bar.change(
+                 search_leaderboard,
+                 inputs=[
+                     leaderboard_table,
+                     search_bar,
+                 ],
+                 outputs=leaderboard_table,
+             )
+
+             tasks_checkbox.change(
+                 change_version,
+                 inputs=[tasks_checkbox],
+                 outputs=leaderboard_table,
+             )
+
+     with gr.Accordion("Submit a new model for evaluation"):
+         with gr.Row():
+             with gr.Column():
+                 method_name_textbox = gr.Textbox(label="Method name")
+                 url_textbox = gr.Textbox(label="URL to model information")
+             with gr.Column():
+                 organisation = gr.Textbox(label="Organisation")
+                 mail = gr.Textbox(label="Contact email")
+                 file_output = gr.File()
+
+         submit_button = gr.Button("Submit Eval")
+         submission_result = gr.Markdown()
+         submit_button.click(
+             add_new_eval,
+             [
+                 method_name_textbox,
+                 url_textbox,
+                 file_output,
+                 organisation,
+                 mail,
+             ],
+             submission_result,
+         )
+
+     gr.Markdown(
+         """
+         ## Quick Links
+
+         - [**GitHub Repository**](https://github.com/exploration-lab/BookSQL): Access the source code, fine-tuning scripts, and additional resources for the BookSQL dataset.
+         - [**arXiv Paper**](#): Detailed information about the BookSQL dataset and its significance in Text-to-SQL tasks.
+         - [**Dataset on Hugging Face**](https://huggingface.co/datasets/Exploration-Lab/BookSQL): Direct link to download the BookSQL dataset.
+         """
+     )
+
+ scheduler = BackgroundScheduler()
+ scheduler.add_job(restart_space, "interval", seconds=3600)
+ scheduler.start()
+ demo.launch(share=True)
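
For reference, the column filtering and search above can be checked outside of Gradio. A minimal sketch of what `baseline_load_data(["EMA", "EX"])` and `search_leaderboard(df, "SEDE")` compute, assuming the baseline CSV committed further below:

import pandas as pd

df = pd.read_csv("submissions/baseline/baseline.csv")
df = df[["Method", "Submitted By", "EMA", "EX"]]          # columns kept for tasks ["EMA", "EX"]
df = df.drop_duplicates(subset=["Method"], keep="first")
print(df[df["Method"].str.contains("SEDE")])              # the search filter for query "SEDE"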
requirements.txt ADDED
@@ -0,0 +1,10 @@
+ seaborn
+ scipy
+ datasets==2.14.5
+ gradio
+ huggingface-hub==0.18.0
+ numpy==1.24.2
+ APScheduler==3.10.1
+ evaluate
+ rouge_score
+ sqlparse
sample_prediction.csv ADDED
@@ -0,0 +1,10 @@
+ ,id,pred_sql
+ 0,0,SELECT * FROM employees WHERE department_id = 3
+ 1,1,UPDATE customers SET status = 'active' WHERE customer_id = 42
+ 2,2,"INSERT INTO orders (order_id, order_date, customer_id) VALUES (1001, '2023-06-01', 5)"
+ 3,3,DELETE FROM products WHERE product_id = 200
+ 4,4,"SELECT name, salary FROM employees WHERE salary > 50000"
+ 5,5,UPDATE products SET price = price * 1.1 WHERE category = 'electronics'
+ 6,6,"INSERT INTO users (user_id, username, email) VALUES (10, 'jdoe', 'jdoe@example.com')"
+ 7,7,DELETE FROM sessions WHERE last_active < '2023-01-01'
+ 8,8,SELECT DISTINCT category FROM products
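
A sketch of how a submission in this format might be produced with pandas; the leading unnamed column is simply the default pandas index, and `id` must match the ids used by the test set (the SQL strings here are hypothetical):

import pandas as pd

preds = pd.DataFrame({
    "id": [0, 1, 2],
    "pred_sql": [
        "SELECT * FROM employees WHERE department_id = 3",
        "UPDATE customers SET status = 'active' WHERE customer_id = 42",
        "SELECT DISTINCT category FROM products",
    ],
})
preds.to_csv("my_prediction.csv")  # index=True by default, reproducing the ",id,pred_sql" header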
script.py ADDED
@@ -0,0 +1,374 @@
+ from email.utils import parseaddr
+ from huggingface_hub import HfApi
+ import os
+ import datetime
+ import pandas as pd
+ import json
+
+ import evaluate as nlp_evaluate
+ import re
+ import sqlite3
+ import random
+ from tqdm import tqdm
+ import numpy as np
+
+ from get_exact_and_f1_score.ext_services.jsql_parser import JSQLParser
+ # Import under a distinct name so it is not shadowed by the local evaluate() below.
+ from get_exact_and_f1_score.metrics.partial_match_eval.evaluate import evaluate as pcm_evaluate
+
+ random.seed(10001)
+
+ bleu = nlp_evaluate.load("bleu")
+ rouge = nlp_evaluate.load("rouge")
+
+
+ LEADERBOARD_PATH = "Exploration-Lab/BookSQL-Leaderboard"
+ RESULTS_PATH = "Exploration-Lab/BookSQL-Leaderboard-results"
+ api = HfApi()
+ TOKEN = os.environ.get("TOKEN", None)
+ YEAR_VERSION = "2024"
+
+ sqlite_path = "accounting/accounting_for_testing.sqlite"
+
+
+ _jsql_parser = JSQLParser.create()
+
+
+ def format_error(msg):
+     return f"<p style='color: red; font-size: 20px; text-align: center;'>{msg}</p>"
+
+
+ def format_warning(msg):
+     return f"<p style='color: orange; font-size: 20px; text-align: center;'>{msg}</p>"
+
+
+ def format_log(msg):
+     return f"<p style='color: green; font-size: 20px; text-align: center;'>{msg}</p>"
+
+
+ def model_hyperlink(link, model_name):
+     return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
+
+
+ def input_verification(method_name, url, path_to_file, organisation, mail):
+     for field in [method_name, url, path_to_file, organisation, mail]:
+         if field == "":
+             return format_warning("Please fill all the fields.")
+
+     # Very basic email parsing
+     _, parsed_mail = parseaddr(mail)
+     if "@" not in parsed_mail:
+         return format_warning("Please provide a valid email address.")
+
+     if path_to_file is None:
+         return format_warning("Please attach a file.")
+
+     return parsed_mail
+
+
+ def replace_current_date_and_now(_sql, _date):
+     # Pin relative date expressions to a fixed date so results are reproducible.
+     _sql = _sql.replace("current_date", "'" + _date + "'")
+     _sql = _sql.replace(", now", ", '" + _date + "'")
+     return _sql
+
+
+ def replace_percent_symbol_y(_sql):
+     # SQLite strftime expects %Y for a four-digit year.
+     _sql = _sql.replace("%y", "%Y")
+     return _sql
+
+
+ def remove_gold_Non_exec(data, df1, sqlite_path):
+     # Mark which gold queries actually execute against the test database.
+     con = sqlite3.connect(sqlite_path)
+     cur = con.cursor()
+
+     out, non_exec = [], []
+     new_df = df1.copy()
+     new_df.loc[:, "Exec/Non-Exec"] = 0
+     for i, s in tqdm(enumerate(data)):
+         _sql = str(s).replace('"', "'").lower()
+         _sql = replace_current_date_and_now(_sql, "2022-06-01")
+         _sql = replace_percent_symbol_y(_sql)
+         try:
+             cur.execute(_sql)
+             res = cur.fetchall()
+             out.append(i)
+         except Exception:
+             non_exec.append(i)
+             print("_sql: ", _sql)
+
+     new_df.loc[out, "Exec/Non-Exec"] = 1
+     con.close()
+     return out, non_exec, new_df
+
+
+ def remove_data_from_index(data, ind_list):
+     new_data = []
+     for i in ind_list:
+         new_data.append(data[i])
+     return new_data
+
+
+ def get_exec_match_acc(gold, pred):
+     # Normalise whitespace, quotes and case before parsing.
+     assert len(gold) == len(pred)
+     goldd = [re.sub(" +", " ", str(g).replace("'", '"').lower()) for g in gold]
+     predd = [re.sub(" +", " ", str(p).replace("'", '"').lower()) for p in pred]
+
+     goldd = _jsql_parser.translate_batch(goldd)
+     predd = _jsql_parser.translate_batch(predd)
+     pcm_f1_scores = pcm_evaluate(goldd, predd)
+     pcm_em_scores = pcm_evaluate(goldd, predd, exact_match=True)
+
+     _pcm_f1_scores, _pcm_em_scores = [], []
+     for f1, em in zip(pcm_f1_scores, pcm_em_scores):
+         if type(f1) == float and type(em) == float:
+             _pcm_f1_scores.append(f1)
+             _pcm_em_scores.append(em)
+
+     assert len(_pcm_f1_scores) == len(_pcm_em_scores)
+
+     jsql_error_count = 0  # JSQLError: the parser returns a string instead of a score
+     for i, score in enumerate(pcm_f1_scores):
+         if type(score) == str:
+             jsql_error_count += 1
+
+     print("JSQLError in sql: ", jsql_error_count)
+
+     return sum(_pcm_em_scores) / len(_pcm_em_scores), sum(_pcm_f1_scores) / len(_pcm_f1_scores)
+
+
+ def get_exec_results(sqlite_path, scores, df, flag):
+     con = sqlite3.connect(sqlite_path)
+     cur = con.cursor()
+
+     i = 0
+     out, non_exec = {}, {}
+     new_df = df.copy()
+     for s in tqdm(scores):
+         _sql = str(s).replace('"', "'").lower()
+         _sql = replace_current_date_and_now(_sql, "2022-06-01")
+         _sql = replace_percent_symbol_y(_sql)
+         try:
+             cur.execute(_sql)
+             res = cur.fetchall()
+             out[i] = str(res)
+         except Exception as err:
+             non_exec[i] = err
+         i += 1
+
+     if flag == "g":
+         new_df.loc[list(out.keys()), "GOLD_res"] = list(out.values())
+     if flag == "p":
+         new_df.loc[list(out.keys()), "PRED_res"] = list(out.values())
+     if flag == "d":
+         new_df.loc[list(out.keys()), "DEBUG_res"] = list(out.values())
+
+     con.close()
+     return out, non_exec, new_df
+
+
+ def get_scores(gold_dict, pred_dict):
+     exec_count, non_exec_count = 0, 0
+     none_count = 0
+     correct_sql, incorrect_sql = [], []
+     for k, res in pred_dict.items():
+         if k in gold_dict:
+             if gold_dict[k] == str(None) or str(None) in gold_dict[k]:
+                 none_count += 1
+                 continue
+             if res == gold_dict[k]:
+                 exec_count += 1
+                 correct_sql.append(k)
+             else:
+                 non_exec_count += 1
+                 incorrect_sql.append(k)
+
+     return exec_count, non_exec_count, none_count, correct_sql, incorrect_sql
+
+
+ def get_total_gold_none_count(gold_dict):
+     none_count, ok_count = 0, 0
+     for k, res in gold_dict.items():
+         if res == str(None) or str(None) in res:
+             none_count += 1
+         else:
+             ok_count += 1
+     return ok_count, none_count
+
+
+ def evaluate(df):
+     # df - [id, pred_sql]
+     pred_sql = df["pred_sql"].to_list()
+     ids = df["id"].to_list()
+     with open("tests/test.json") as f:
+         questions_and_ids = json.load(f)
+     with open("tests/test_sql.json") as ts:
+         gold_sql = json.load(ts)
+
+     gold_sql_list = []
+     pred_sql_list = []
+     questions_list = []
+     for idx, pred in zip(ids, pred_sql):
+         ques = questions_and_ids[idx]["Query"]
+         gd_sql = gold_sql[idx]["SQL"]
+         gold_sql_list.append(gd_sql)
+         pred_sql_list.append(pred)  # was: pred_sql_list.append(pred_sql_list), which appended the list to itself
+         questions_list.append(ques)
+
+     df = pd.DataFrame({"NLQ": questions_list, "GOLD SQL": gold_sql_list, "PREDICTED SQL": pred_sql_list})
+
+     test_size = len(df)
+
+     pred_score = df["PREDICTED SQL"].str.lower().values
+     gold_score1 = df["GOLD SQL"].str.lower().values
+
+     print("Checking non-exec Gold sql query")
+     gold_exec, gold_not_exec, new_df = remove_gold_Non_exec(gold_score1, df, sqlite_path)
+     print("GOLD Total exec SQL query: {}/{}".format(len(gold_exec), test_size))
+     print("GOLD Total non-exec SQL query: {}/{}".format(len(gold_not_exec), test_size))
+
+     prev_non_exec_df = new_df[new_df["Exec/Non-Exec"] == 0]
+     new_df = new_df[new_df["Exec/Non-Exec"] == 1]
+
+     prev_non_exec_df.reset_index(inplace=True)
+     new_df.reset_index(inplace=True)
+
+     # Removing non-exec sql queries from the data
+     print(f"Removing {len(gold_not_exec)} non-exec sql query from all Gold/Pred")
+     gold_score1 = remove_data_from_index(gold_score1, gold_exec)
+     pred_score = remove_data_from_index(pred_score, gold_exec)
+     gold_score = [[x] for x in gold_score1]
+
+     assert len(gold_score) == len(pred_score)
+
+     pred_bleu_score = bleu.compute(predictions=pred_score, references=gold_score)
+     pred_rouge_score = rouge.compute(predictions=pred_score, references=gold_score)
+     pred_exact_match, pred_partial_f1_score = get_exec_match_acc(gold_score1, pred_score)
+
+     print("PREDICTED_vs_GOLD Final bleu_score: ", pred_bleu_score["bleu"])
+     print("PREDICTED_vs_GOLD Final rouge_score: ", pred_rouge_score["rougeL"])
+     print("PREDICTED_vs_GOLD Exact Match Accuracy: ", pred_exact_match)
+     print("PREDICTED_vs_GOLD Partial CM F1 score: ", pred_partial_f1_score)
+     print()
+
+     new_df.loc[:, "GOLD_res"] = str(None)
+     new_df.loc[:, "PRED_res"] = str(None)
+
+     print("Getting Gold results")
+     gout_res_dict, gnon_exec_err_dict, new_df = get_exec_results(sqlite_path, gold_score1, new_df, "g")
+
+     total_gold_ok_count, total_gold_none_count = get_total_gold_none_count(gout_res_dict)
+     print("Total Gold None count: ", total_gold_none_count)
+
+     print("Getting Pred results")
+     pout_res_dict, pnon_exec_err_dict, new_df = get_exec_results(sqlite_path, pred_score, new_df, "p")
+
+     print("GOLD Total exec SQL query: {}/{}".format(len(gold_exec), test_size))
+     print("GOLD Total non-exec SQL query: {}/{}".format(len(gold_not_exec), test_size))
+     print()
+     print("PRED Total exec SQL query: {}/{}".format(len(pout_res_dict), len(pred_score)))
+     print("PRED Total non-exec SQL query: {}/{}".format(len(pnon_exec_err_dict), len(pred_score)))
+     print()
+     pred_correct_exec_acc_count, pred_incorrect_exec_acc_count, pred_none_count, pred_correct_sql, pred_incorrect_sql = get_scores(gout_res_dict, pout_res_dict)
+     print("PRED_vs_GOLD Correct_Exec_count without None: ", pred_correct_exec_acc_count)
+     print("PRED_vs_GOLD Incorrect_Exec_count without None: ", pred_incorrect_exec_acc_count)
+     print("PRED_vs_GOLD Exec_Accuracy: ", pred_correct_exec_acc_count / total_gold_ok_count)
+     print()
+
+     return pred_exact_match, pred_correct_exec_acc_count / total_gold_ok_count, pred_partial_f1_score, pred_bleu_score["bleu"], pred_rouge_score["rougeL"]
+
+
+ def add_new_eval(
+     method_name: str,
+     url: str,
+     path_to_file: str,
+     organisation: str,
+     mail: str,
+ ):
+     parsed_mail = input_verification(
+         method_name,
+         url,
+         path_to_file,
+         organisation,
+         mail,
+     )
+     # input_verification returns an HTML warning string on failure;
+     # surface it to the user instead of evaluating a bad submission.
+     if isinstance(parsed_mail, str) and parsed_mail.startswith("<p"):
+         return parsed_mail
+
+     # load the submitted file
+     df = pd.read_csv(path_to_file)
+
+     # modify the df to include metadata
+     df["Method"] = method_name
+     df["url"] = url
+     df["organisation"] = organisation
+     df["mail"] = parsed_mail
+     df["timestamp"] = datetime.datetime.now()
+
+     submission_df = pd.read_csv(path_to_file)
+     submission_df["Method"] = method_name
+     submission_df["Submitted By"] = organisation
+
+     path_in_repo = f"submissions/{method_name}"
+     file_name = f"{method_name}-{organisation}-{datetime.datetime.now().strftime('%Y-%m-%d')}.csv"
+
+     EM, EX, PCM_F1, BLEU, ROUGE = evaluate(submission_df)
+     submission_df["EM"] = EM
+     submission_df["EX"] = EX
+     submission_df["BLEU"] = BLEU
+     submission_df["ROUGE"] = ROUGE
+
+     # upload the raw submission to the results dataset using the hf api
+     import io
+
+     buffer = io.BytesIO()
+     df.to_csv(buffer, index=False)  # write the DataFrame to a buffer in CSV format
+     buffer.seek(0)  # rewind the buffer to the beginning
+
+     api.upload_file(
+         repo_id=RESULTS_PATH,
+         path_in_repo=f"{path_in_repo}/{file_name}",
+         path_or_fileobj=buffer,
+         token=TOKEN,
+         repo_type="dataset",
+     )
+     # read the leaderboard and append the new submission
+     leaderboard_df = pd.read_csv("submissions/baseline/baseline.csv")
+     leaderboard_df = pd.concat([leaderboard_df, submission_df], ignore_index=True)
+
+     # save the new leaderboard back to the Space
+     leaderboard_buffer = io.BytesIO()
+     leaderboard_df.to_csv(leaderboard_buffer, index=False)
+     leaderboard_buffer.seek(0)
+     api.upload_file(
+         repo_id=LEADERBOARD_PATH,
+         path_in_repo="submissions/baseline/baseline.csv",
+         path_or_fileobj=leaderboard_buffer,
+         token=TOKEN,
+         repo_type="space",
+     )
+
+     return format_log(
+         f"Method {method_name} submitted by {organisation} successfully. \nPlease refresh the leaderboard, and wait a bit to see the score displayed"
+     )
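
Every query above is lower-cased, has its double quotes swapped to single quotes, and has its date placeholders pinned before execution. A small illustration of that rewriting chain, reusing the helpers defined in script.py:

sql = 'SELECT total FROM sales WHERE date = current_date'
sql = str(sql).replace('"', "'").lower()
sql = replace_current_date_and_now(sql, "2022-06-01")  # current_date -> '2022-06-01'
sql = replace_percent_symbol_y(sql)                    # %y -> %Y for SQLite strftime
print(sql)  # select total from sales where date = '2022-06-01'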
submissions/.DS_Store ADDED
Binary file (6.15 kB).
 
submissions/baseline/baseline.csv ADDED
@@ -0,0 +1,6 @@
+ Method,Submitted By,EMA,EX,BLEU-4,ROUGE-L
+ SEDE,IITK,0.43,0.443,0.69,0.83
+ UniSAr,IITK,0.43,0.47,0.72,0.8
+ RESDSQL,IITK,0.52,0.54,0.74,0.81
+ DIN-SQL + GPT4,IITK,0.09,0.08,0.43,0.68
+ Dfew + GPT4,IITK,0.48,0.67,0.86,0.9
submissions/modify.sh ADDED
@@ -0,0 +1,28 @@
+ #!/bin/bash
+
+ # Loop through each CSV file in the current directory
+ for csv_file in *.csv; do
+     # Check if the file is a regular file
+     if [ -f "$csv_file" ]; then
+         echo "Processing $csv_file..."
+
+         # Skip files that already have the column
+         if head -1 "$csv_file" | grep -q "Submitted By"; then
+             echo "The 'Submitted By' column already exists in $csv_file."
+             continue
+         fi
+
+         # Temporary file (created after the header check so skipped files do not leak temp files)
+         temp_file=$(mktemp)
+
+         # Add a 'Submitted By' column header and a 'Baseline' entry for each row
+         awk -v OFS="," 'NR==1 {print $0, "Submitted By"} NR>1 {print $0, "Baseline"}' "$csv_file" > "$temp_file"
+
+         # Move the temporary file over the original file
+         mv "$temp_file" "$csv_file"
+
+         echo "Column 'Submitted By' added successfully with a 'Baseline' entry in each row for $csv_file."
+     fi
+ done
+
+ echo "All CSV files processed."
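
For readers who prefer Python over awk, a rough equivalent of what this script does to a single file (hypothetical file name; unlike the awk one-liner, the csv module also copes with quoted commas):

import csv

with open("baseline.csv") as f:  # hypothetical input file
    rows = list(csv.reader(f))

if "Submitted By" not in rows[0]:
    rows[0].append("Submitted By")      # header
    for row in rows[1:]:
        row.append("Baseline")          # one 'Baseline' entry per data row
    with open("baseline.csv", "w", newline="") as f:
        csv.writer(f).writerows(rows)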
tests/test.json ADDED
The diff for this file is too large to render.
 
tests/test_sql.json ADDED
The diff for this file is too large to render.
 
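The two test files are too large to render, but the evaluation code below (Evaluate in uploads.py) indexes them as questions_and_ids[idx]["Query"] and gold_sql[idx]["SQL"]. A hypothetical miniature pair with that shape, purely for illustration:

import json

# Hypothetical miniature stand-ins for tests/test.json and tests/test_sql.json,
# indexable by the submission ids the same way the evaluator reads them.
test = [{"Query": "How many products are in the electronics category?"}]
test_sql = [{"SQL": "select count(*) from products where category = 'electronics'"}]

with open("tests/test.json", "w") as f:
    json.dump(test, f)
with open("tests/test_sql.json", "w") as f:
    json.dump(test_sql, f)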
uploads.py ADDED
@@ -0,0 +1,382 @@
+ from email.utils import parseaddr
+ from huggingface_hub import HfApi
+ import os
+ import datetime
+ import pandas as pd
+ import json
+
+ import evaluate as nlp_evaluate
+ import re
+ import sqlite3
+ import random
+ from tqdm import tqdm
+ import numpy as np
+
+ from sqlparse import parse
+
+ random.seed(10001)
+
+ bleu = nlp_evaluate.load("bleu")
+ rouge = nlp_evaluate.load("rouge")
+
+
+ LEADERBOARD_PATH = "Exploration-Lab/BookSQL-Leaderboard"
+ RESULTS_PATH = "Exploration-Lab/BookSQL-Leaderboard"
+ api = HfApi()
+ TOKEN = os.environ.get("TOKEN", None)
+ YEAR_VERSION = "2024"
+
+ sqlite_path = "accounting/accounting_for_testing.sqlite"
+
+
+ def format_error(msg):
+     return f"<p style='color: red; font-size: 20px; text-align: center;'>{msg}</p>"
+
+
+ def format_warning(msg):
+     return f"<p style='color: orange; font-size: 20px; text-align: center;'>{msg}</p>"
+
+
+ def format_log(msg):
+     return f"<p style='color: green; font-size: 20px; text-align: center;'>{msg}</p>"
+
+
+ def model_hyperlink(link, model_name):
+     return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
+
+
+ def input_verification(method_name, url, path_to_file, organisation, mail):
+     for field in [method_name, url, path_to_file, organisation, mail]:
+         if field == "":
+             return format_warning("Please fill all the fields.")
+
+     # Very basic email parsing
+     _, parsed_mail = parseaddr(mail)
+     if "@" not in parsed_mail:
+         return format_warning("Please provide a valid email address.")
+
+     if path_to_file is None:
+         return format_warning("Please attach a file.")
+
+     return parsed_mail
+
+
+ def replace_current_date_and_now(_sql, _date):
+     # Pin relative date expressions to a fixed date so results are reproducible.
+     _sql = _sql.replace("current_date", "'" + _date + "'")
+     _sql = _sql.replace(", now", ", '" + _date + "'")
+     return _sql
+
+
+ def replace_percent_symbol_y(_sql):
+     # SQLite strftime expects %Y for a four-digit year.
+     _sql = _sql.replace("%y", "%Y")
+     return _sql
+
+
+ def remove_gold_Non_exec(data, df1, sqlite_path):
+     # Mark which gold queries actually execute against the test database.
+     con = sqlite3.connect(sqlite_path)
+     cur = con.cursor()
+
+     out, non_exec = [], []
+     new_df = df1.copy()
+     new_df.loc[:, "Exec/Non-Exec"] = 0
+     for i, s in tqdm(enumerate(data)):
+         _sql = str(s).replace('"', "'").lower()
+         _sql = replace_current_date_and_now(_sql, "2022-06-01")
+         _sql = replace_percent_symbol_y(_sql)
+         try:
+             cur.execute(_sql)
+             res = cur.fetchall()
+             out.append(i)
+         except Exception:
+             non_exec.append(i)
+
+     new_df.loc[out, "Exec/Non-Exec"] = 1
+     con.close()
+     return out, non_exec, new_df
+
+
+ def remove_data_from_index(data, ind_list):
+     new_data = []
+     for i in ind_list:
+         new_data.append(data[i])
+     return new_data
+
+
+ def parse_query(query):
+     parsed = parse(query)[0]
+     return parsed
+
+
+ def normalize_query(query):
+     # Remove comments
+     query = re.sub(r"--.*", "", query)
+     query = re.sub(r"/\*.*?\*/", "", query, flags=re.DOTALL)
+
+     # Collapse extra whitespace and strip leading/trailing whitespace
+     query = re.sub(r"\s+", " ", query)
+     query = query.strip()
+
+     return query.lower()
+
+
+ def get_exec_match_acc(gold, pred):
+     # Exact-match accuracy via sqlparse: two queries count as equal if their
+     # normalised, parsed forms render to the same string.
+     assert len(gold) == len(pred)
+     correct_sql_count = 0
+     goldd = [re.sub(" +", " ", str(g).replace("'", '"').lower()) for g in gold]
+     predd = [re.sub(" +", " ", str(p).replace("'", '"').lower()) for p in pred]
+
+     for q1, q2 in zip(goldd, predd):
+         q1 = normalize_query(q1)
+         q2 = normalize_query(q2)
+
+         parsed_query1 = parse_query(q1)
+         parsed_query2 = parse_query(q2)
+
+         if str(parsed_query1) == str(parsed_query2):
+             correct_sql_count += 1
+
+     # The second value is a placeholder: the partial component-match F1 is not
+     # computed in this sqlparse-based variant.
+     return correct_sql_count / len(goldd), 0
+
+
+ def get_exec_results(sqlite_path, scores, df, flag):
+     con = sqlite3.connect(sqlite_path)
+     cur = con.cursor()
+
+     i = 0
+     out, non_exec = {}, {}
+     new_df = df.copy()
+     for s in tqdm(scores):
+         _sql = str(s).replace('"', "'").lower()
+         _sql = replace_current_date_and_now(_sql, "2022-06-01")
+         _sql = replace_percent_symbol_y(_sql)
+         try:
+             cur.execute(_sql)
+             res = cur.fetchall()
+             out[i] = str(res)
+         except Exception as err:
+             non_exec[i] = err
+         i += 1
+
+     if flag == "g":
+         new_df.loc[list(out.keys()), "GOLD_res"] = list(out.values())
+     if flag == "p":
+         new_df.loc[list(out.keys()), "PRED_res"] = list(out.values())
+     if flag == "d":
+         new_df.loc[list(out.keys()), "DEBUG_res"] = list(out.values())
+
+     con.close()
+     return out, non_exec, new_df
+
+
+ def get_scores(gold_dict, pred_dict):
+     exec_count, non_exec_count = 0, 0
+     none_count = 0
+     correct_sql, incorrect_sql = [], []
+     for k, res in pred_dict.items():
+         if k in gold_dict:
+             if gold_dict[k] == str(None) or str(None) in gold_dict[k]:
+                 none_count += 1
+                 continue
+             if res == gold_dict[k]:
+                 exec_count += 1
+                 correct_sql.append(k)
+             else:
+                 non_exec_count += 1
+                 incorrect_sql.append(k)
+
+     return exec_count, non_exec_count, none_count, correct_sql, incorrect_sql
+
+
+ def get_total_gold_none_count(gold_dict):
+     none_count, ok_count = 0, 0
+     for k, res in gold_dict.items():
+         if res == str(None) or str(None) in res:
+             none_count += 1
+         else:
+             ok_count += 1
+     return ok_count, none_count
+
+
+ def Evaluate(df):
+     # df - [id, pred_sql]
+     pred_sql = df["pred_sql"].to_list()
+     ids = df["id"].to_list()
+     with open("tests/test.json") as f:
+         questions_and_ids = json.load(f)
+     with open("tests/test_sql.json") as ts:
+         gold_sql = json.load(ts)
+
+     gold_sql_list = []
+     pred_sql_list = []
+     questions_list = []
+     for idx, pred in zip(ids, pred_sql):
+         ques = questions_and_ids[idx]["Query"]
+         gd_sql = gold_sql[idx]["SQL"]
+         gold_sql_list.append(gd_sql)
+         pred_sql_list.append(pred)
+         questions_list.append(ques)
+
+     df = pd.DataFrame({"NLQ": questions_list, "GOLD SQL": gold_sql_list, "PREDICTED SQL": pred_sql_list})
+
+     test_size = len(df)
+
+     pred_score = df["PREDICTED SQL"].str.lower().values
+     gold_score1 = df["GOLD SQL"].str.lower().values
+
+     print("Checking non-exec Gold sql query")
+     gold_exec, gold_not_exec, new_df = remove_gold_Non_exec(gold_score1, df, sqlite_path)
+     print("GOLD Total exec SQL query: {}/{}".format(len(gold_exec), test_size))
+     print("GOLD Total non-exec SQL query: {}/{}".format(len(gold_not_exec), test_size))
+
+     prev_non_exec_df = new_df[new_df["Exec/Non-Exec"] == 0]
+     new_df = new_df[new_df["Exec/Non-Exec"] == 1]
+
+     prev_non_exec_df.reset_index(inplace=True)
+     new_df.reset_index(inplace=True)
+
+     # Removing non-exec sql queries from the data
+     print(f"Removing {len(gold_not_exec)} non-exec sql query from all Gold/Pred")
+     gold_score1 = remove_data_from_index(gold_score1, gold_exec)
+     pred_score = remove_data_from_index(pred_score, gold_exec)
+     gold_score = [[x] for x in gold_score1]
+
+     assert len(gold_score) == len(pred_score)
+
+     pred_bleu_score = bleu.compute(predictions=pred_score, references=gold_score)
+     pred_rouge_score = rouge.compute(predictions=pred_score, references=gold_score)
+     pred_exact_match, pred_partial_f1_score = get_exec_match_acc(gold_score1, pred_score)
+
+     print("PREDICTED_vs_GOLD Final bleu_score: ", pred_bleu_score["bleu"])
+     print("PREDICTED_vs_GOLD Final rouge_score: ", pred_rouge_score["rougeL"])
+     print("PREDICTED_vs_GOLD Exact Match Accuracy: ", pred_exact_match)
+     print()
+
+     new_df.loc[:, "GOLD_res"] = str(None)
+     new_df.loc[:, "PRED_res"] = str(None)
+
+     print("Getting Gold results")
+     gout_res_dict, gnon_exec_err_dict, new_df = get_exec_results(sqlite_path, gold_score1, new_df, "g")
+
+     total_gold_ok_count, total_gold_none_count = get_total_gold_none_count(gout_res_dict)
+     print("Total Gold None count: ", total_gold_none_count)
+
+     print("Getting Pred results")
+     pout_res_dict, pnon_exec_err_dict, new_df = get_exec_results(sqlite_path, pred_score, new_df, "p")
+
+     print("GOLD Total exec SQL query: {}/{}".format(len(gold_exec), test_size))
+     print("GOLD Total non-exec SQL query: {}/{}".format(len(gold_not_exec), test_size))
+     print()
+     print("PRED Total exec SQL query: {}/{}".format(len(pout_res_dict), len(pred_score)))
+     print("PRED Total non-exec SQL query: {}/{}".format(len(pnon_exec_err_dict), len(pred_score)))
+     print()
+     pred_correct_exec_acc_count, pred_incorrect_exec_acc_count, pred_none_count, pred_correct_sql, pred_incorrect_sql = get_scores(gout_res_dict, pout_res_dict)
+     print("PRED_vs_GOLD Correct_Exec_count without None: ", pred_correct_exec_acc_count)
+     print("PRED_vs_GOLD Incorrect_Exec_count without None: ", pred_incorrect_exec_acc_count)
+     print("PRED_vs_GOLD Exec_Accuracy: ", pred_correct_exec_acc_count / total_gold_ok_count)
+     print()
+
+     return pred_exact_match, pred_correct_exec_acc_count / total_gold_ok_count, pred_partial_f1_score, pred_bleu_score["bleu"], pred_rouge_score["rougeL"]
+
+
+ def add_new_eval(
+     method_name: str,
+     url: str,
+     path_to_file: str,
+     organisation: str,
+     mail: str,
+ ):
+     parsed_mail = input_verification(
+         method_name,
+         url,
+         path_to_file,
+         organisation,
+         mail,
+     )
+     # input_verification returns an HTML warning string on failure;
+     # surface it to the user instead of evaluating a bad submission.
+     if isinstance(parsed_mail, str) and parsed_mail.startswith("<p"):
+         return parsed_mail
+
+     # load the submitted file
+     df = pd.read_csv(path_to_file)
+
+     # modify the df to include metadata
+     df["Method"] = method_name
+     df["url"] = url
+     df["organisation"] = organisation
+     df["mail"] = parsed_mail
+     df["timestamp"] = datetime.datetime.now()
+
+     submission_df = pd.read_csv(path_to_file)
+     submission_df["Method"] = method_name
+     submission_df["Submitted By"] = organisation
+
+     path_in_repo = f"submissions/{method_name}"
+     file_name = f"{method_name}-{organisation}-{datetime.datetime.now().strftime('%Y-%m-%d')}.csv"
+
+     EM, EX, PCM_F1, BLEU, ROUGE = Evaluate(submission_df)
+     # Build the leaderboard row explicitly. Assigning scalars to an empty
+     # DataFrame creates zero-length columns, so the row would be silently lost.
+     sub_df = pd.DataFrame(
+         [
+             {
+                 "Method": method_name,
+                 "Submitted By": organisation,
+                 "EMA": EM,
+                 "EX": EX,
+                 "BLEU-4": BLEU,
+                 "ROUGE-L": ROUGE,
+             }
+         ]
+     )
+
+     # upload the raw submission using the hf api
+     import io
+
+     buffer = io.BytesIO()
+     df.to_csv(buffer, index=False)  # write the DataFrame to a buffer in CSV format
+     buffer.seek(0)  # rewind the buffer to the beginning
+
+     api.upload_file(
+         repo_id=RESULTS_PATH,
+         path_in_repo=f"{path_in_repo}/{file_name}",
+         path_or_fileobj=buffer,
+         token=TOKEN,
+         repo_type="dataset",
+     )
+     # read the leaderboard and append the new submission
+     leaderboard_df = pd.read_csv("submissions/baseline/baseline.csv")
+     leaderboard_df = pd.concat([leaderboard_df, sub_df], ignore_index=True)
+
+     # save the new leaderboard back to the Space
+     leaderboard_buffer = io.BytesIO()
+     leaderboard_df.to_csv(leaderboard_buffer, index=False)
+     leaderboard_buffer.seek(0)
+     api.upload_file(
+         repo_id=LEADERBOARD_PATH,
+         path_in_repo="submissions/baseline/baseline.csv",
+         path_or_fileobj=leaderboard_buffer,
+         token=TOKEN,
+         repo_type="space",
+     )
+
+     return format_log(
+         f"Method {method_name} submitted by {organisation} successfully. \nPlease refresh the leaderboard, and wait a bit to see the score displayed"
+     )
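
The EMA computed in get_exec_match_acc above treats two queries as equal when their normalized, sqlparse-parsed forms render to the same string. A minimal sketch of that comparison under the same normalization:

import re
from sqlparse import parse

def normalize_query(query):
    query = re.sub(r"--.*", "", query)                         # strip line comments
    query = re.sub(r"/\*.*?\*/", "", query, flags=re.DOTALL)   # strip block comments
    query = re.sub(r"\s+", " ", query)                         # collapse whitespace
    return query.strip().lower()

q1 = "SELECT name  FROM products -- trailing comment"
q2 = "select name from products"
print(str(parse(normalize_query(q1))[0]) == str(parse(normalize_query(q2))[0]))  # True

Since the comparison is string-level after parsing, semantically equivalent but differently written queries (e.g. reordered WHERE clauses) still count as mismatches; the execution-accuracy metric (EX) covers those cases.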