Taejin committed on
Commit
bf24ae8
1 Parent(s): 30dd9ff

Adding files

Browse files

Signed-off-by: Taejin Park <tango4j@gmail.com>

__pycache__/app.cpython-310.pyc ADDED
Binary file (7.58 kB). View file
 
__pycache__/app.cpython-39.pyc ADDED
Binary file (7.96 kB). View file
 
__pycache__/app_new.cpython-310.pyc ADDED
Binary file (6.49 kB). View file
 
__pycache__/app_new.cpython-39.pyc ADDED
Binary file (7.45 kB). View file
 
__pycache__/content.cpython-310.pyc ADDED
Binary file (5.47 kB). View file
 
__pycache__/scorer.cpython-310.pyc ADDED
Binary file (1.94 kB). View file
 
app.py CHANGED
@@ -27,6 +27,8 @@ api = HfApi()
27
 
28
  YEAR_VERSION = "2024"
29
 
 
 
30
  def read_json_file(filepath):
31
  with open(filepath) as infile:
32
  data_dict = json.load(infile)
@@ -38,17 +40,17 @@ def save_json_file(filepath, data_dict):
38
 
39
  os.makedirs("scored", exist_ok=True)
40
 
41
- test_data_files = {"test": "contextual_test.csv"}
42
- test_dataset = load_dataset(TEST_DATASET, data_files=test_data_files , token=TOKEN, download_mode="force_redownload", ignore_verifications=True)
43
 
44
- val_data_files = {"val": "contextual_val.csv"}
45
- val_dataset = load_dataset(VAL_DATASET, data_files=val_data_files , token=TOKEN, download_mode="force_redownload", ignore_verifications=True)
46
 
47
- results_data_files = {"test": "contextual_test_results.csv", "val": "contextual_val_results.csv"}
48
- results = load_dataset(RESULTS_DATASET, data_files=results_data_files, token=TOKEN, download_mode="force_redownload", ignore_verifications=True)
49
 
50
- contacts_data_files = {"contacts": "contacts.csv"}
51
- contact_infos = load_dataset(CONTACT_DATASET, data_files=contacts_data_files, token=TOKEN, download_mode="force_redownload", ignore_verifications=True)
52
 
53
  def get_dataframe_from_results(results, split):
54
  df = results[split].to_pandas()
@@ -56,13 +58,13 @@ def get_dataframe_from_results(results, split):
56
  df = df.sort_values(by=["All"], ascending=False)
57
  return df
58
 
59
- test_dataset_dataframe = test_dataset["test"].to_pandas()
60
- val_dataset_dataframe = val_dataset["val"].to_pandas()
61
 
62
- contacts_dataframe = contact_infos["contacts"].to_pandas()
63
 
64
- val_results_dataframe = get_dataframe_from_results(results=results, split="val")
65
- test_results_dataframe = get_dataframe_from_results(results=results, split="test")
66
 
67
  def restart_space():
68
  api.restart_space(repo_id=LEADERBOARD_PATH, token=TOKEN)
 
27
 
28
  YEAR_VERSION = "2024"
29
 
30
+ results = {"dev": {"cpWER": 0, "WER": 0}}
31
+
32
  def read_json_file(filepath):
33
  with open(filepath) as infile:
34
  data_dict = json.load(infile)
 
40
 
41
  os.makedirs("scored", exist_ok=True)
42
 
43
+ # test_data_files = {"test": "contextual_test.csv"}
44
+ # test_dataset = load_dataset(TEST_DATASET, data_files=test_data_files , token=TOKEN, download_mode="force_redownload", ignore_verifications=True)
45
 
46
+ # val_data_files = {"val": "contextual_val.csv"}
47
+ # val_dataset = load_dataset(VAL_DATASET, data_files=val_data_files , token=TOKEN, download_mode="force_redownload", ignore_verifications=True)
48
 
49
+ # results_data_files = {"test": "contextual_test_results.csv", "val": "contextual_val_results.csv"}
50
+ # results = load_dataset(RESULTS_DATASET, data_files=results_data_files, token=TOKEN, download_mode="force_redownload", ignore_verifications=True)
51
 
52
+ # contacts_data_files = {"contacts": "contacts.csv"}
53
+ # contact_infos = load_dataset(CONTACT_DATASET, data_files=contacts_data_files, token=TOKEN, download_mode="force_redownload", ignore_verifications=True)
54
 
55
  def get_dataframe_from_results(results, split):
56
  df = results[split].to_pandas()
 
58
  df = df.sort_values(by=["All"], ascending=False)
59
  return df
60
 
61
+ # test_dataset_dataframe = test_dataset["test"].to_pandas()
62
+ # val_dataset_dataframe = val_dataset["val"].to_pandas()
63
 
64
+ # contacts_dataframe = contact_infos["contacts"].to_pandas()
65
 
66
+ # val_results_dataframe = get_dataframe_from_results(results=results, split="val")
67
+ # test_results_dataframe = get_dataframe_from_results(results=results, split="test")
68
 
69
  def restart_space():
70
  api.restart_space(repo_id=LEADERBOARD_PATH, token=TOKEN)
app_new.py ADDED
@@ -0,0 +1,301 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ import csv
4
+ import datetime
5
+ from email.utils import parseaddr
6
+
7
+ import gradio as gr
8
+ import pandas as pd
9
+ import numpy as np
10
+
11
+ from datasets import load_dataset
12
+ from apscheduler.schedulers.background import BackgroundScheduler
13
+ from huggingface_hub import HfApi
14
+
15
+ from scorer import instruction_scorer
16
+ from content import format_error, format_warning, format_log, TITLE, INTRODUCTION_TEXT, SUBMISSION_TEXT, CITATION_BUTTON_LABEL, CITATION_BUTTON_TEXT, model_hyperlink
17
+
18
+ TOKEN = os.environ.get("TOKEN", None)
19
+ # OWNER="ucla-contextual"
20
+ OWNER="Taejin"
21
+ # TEST_DATASET = f"{OWNER}/contextual_test"
22
+ # VAL_DATASET = f"{OWNER}/contextual_val"
23
+ # SUBMISSION_DATASET = f"{OWNER}/submissions_internal"
24
+ # CONTACT_DATASET = f"{OWNER}/contact_info"
25
+ # RESULTS_DATASET = f"{OWNER}/results"
26
+ # LEADERBOARD_PATH = f"{OWNER}/leaderboard"
27
+
28
+ RESULTS_DATASET = f"{OWNER}/spk_tag_results"
29
+ LEADERBOARD_PATH = f"{OWNER}/leaderboard"
30
+ SUBMISSION_DATASET = f"{OWNER}/submission_leaderboard"
31
+ api = HfApi()
32
+
33
+ YEAR_VERSION = "2024"
34
+
35
def read_json_file(filepath):
    """Parse *filepath* as JSON and return the resulting object."""
    with open(filepath) as fp:
        return json.load(fp)
39
+
40
def save_json_file(filepath, data_dict):
    """Serialize *data_dict* to *filepath* as JSON, replacing any existing file."""
    with open(filepath, "w") as sink:
        json.dump(data_dict, sink)
43
+
44
+ os.makedirs("scored", exist_ok=True)
45
+
46
+ # test_data_files = {"test": "contextual_test.csv"}
47
+ # test_dataset = load_dataset(TEST_DATASET, data_files=test_data_files , token=TOKEN, download_mode="force_redownload", ignore_verifications=True)
48
+
49
+ # val_data_files = {"val": "contextual_val.csv"}
50
+ # val_dataset = load_dataset(VAL_DATASET, data_files=val_data_files , token=TOKEN, download_mode="force_redownload", ignore_verifications=True)
51
+
52
+ # results_data_files = {"test": "contextual_test_results.csv", "val": "contextual_val_results.csv"}
53
+ # results = load_dataset(RESULTS_DATASET, data_files=results_data_files, token=TOKEN, download_mode="force_redownload", ignore_verifications=True)
54
+
55
+ # contacts_data_files = {"contacts": "contacts.csv"}
56
+ # contact_infos = load_dataset(CONTACT_DATASET, data_files=contacts_data_files, token=TOKEN, download_mode="force_redownload", ignore_verifications=True)
57
+
58
+ # BASE_PATH="entry_data"
59
+
60
+
61
+
62
+ # results_data_files = {"dev": f"{BASE_PATH}/dev_set_data.csv", "val": "contextual_val_results.csv"}
63
+ results_data_files = {"dev": "dev_set_data.csv"}
64
+ results = load_dataset(RESULTS_DATASET, data_files=results_data_files, token=TOKEN, download_mode="force_redownload", ignore_verifications=True)
65
+
66
+ # contacts_data_files = {"contacts": "contacts.csv"}
67
+ # contact_infos = load_dataset(CONTACT_DATASET, data_files=contacts_data_files, token=TOKEN, download_mode="force_redownload", ignore_verifications=True)
68
+
69
def get_dataframe_from_results(results, split):
    """Return the leaderboard table for one split of *results*.

    Args:
        results: Mapping from split name to a dataset object exposing
            ``to_pandas()`` (e.g. a ``datasets.DatasetDict``).
        split: Split key to extract (``"dev"`` for this leaderboard).

    Returns:
        pandas.DataFrame sorted by ``cpWER`` in ascending order, so the
        best (lowest error-rate) systems appear first.
    """
    df = results[split].to_pandas()
    # cpWER is an error rate: lower is better, so the leaderboard must sort
    # ascending. The previous descending sort was inherited from an
    # accuracy-based leaderboard (sorted by "All", where higher is better)
    # and ranked the worst system first.
    df = df.sort_values(by=["cpWER"], ascending=True)
    return df
74
+
75
+
76
+
77
+ # test_dataset_dataframe = test_dataset["test"].to_pandas()
78
+ # val_dataset_dataframe = val_dataset["val"].to_pandas()
79
+
80
+ # contacts_dataframe = contact_infos["contacts"].to_pandas()
81
+
82
+ # val_results_dataframe = get_dataframe_from_results(results=results, split="val")
83
+ # test_results_dataframe = get_dataframe_from_results(results=results, split="test")
84
+
85
def restart_space():
    """Restart the leaderboard Space via the HF Hub API (scheduled hourly below)."""
    api.restart_space(repo_id=LEADERBOARD_PATH, token=TOKEN)
87
+
88
# Gradio datatypes for the leaderboard table columns (markdown for text
# columns, number for metric columns).
# NOTE(review): six entries here but eval_entry below produces five columns
# (System_name, Method, Organisation, cpWER, WER) — confirm against the
# columns of dev_set_data.csv.
TYPES = ["markdown", "markdown", "markdown", "markdown", "number", "number"]

# Initial leaderboard contents rendered when the app starts.
dev_dataset_dataframe= get_dataframe_from_results(results=results, split="dev")
94
+
95
def add_new_eval(
    system_name: str,
    method: str,
    path_to_file: str,
    organisation: str,
    mail: str,
):
    """Validate, score, and publish a leaderboard submission.

    Steps: validate the form fields, upload the raw submission file to the
    submissions dataset, score it against the dev reference, upload the
    scored file, append the entry to the dev leaderboard CSV, and push the
    CSV back to the results dataset.

    Args:
        system_name: Display name of the submitted system.
        method: Short description of the method used.
        path_to_file: Uploaded gradio file object (``.name`` is the local path).
        organisation: Submitting organisation or team name.
        mail: Contact email (validated very loosely).

    Returns:
        A formatted log message for display in the UI.

    Raises:
        gr.Error: If any field is empty, the email lacks an ``@``, the file
            is missing, or the submission JSON does not contain exactly one
            top-level key with exactly 100 predictions.
    """
    print("printing all inputs:", system_name, method, path_to_file, organisation, mail)

    if len(system_name) == 0:
        print("system_name none")
        raise gr.Error("Please provide a system_name name. Field empty!")

    if len(method) == 0:
        print("method none")
        raise gr.Error("Please provide a method. Field empty!")

    if len(organisation) == 0:
        print("org none")
        raise gr.Error("Please provide organisation information. Field empty!")

    # Very basic email parsing
    _, parsed_mail = parseaddr(mail)
    if "@" not in parsed_mail:
        print("email here")
        raise gr.Error("Please provide a valid email address.")

    if path_to_file is None:
        print("file missing here")
        raise gr.Error("Please attach a file.")

    tmp_file_output = read_json_file(path_to_file.name)

    # Expected format: a single top-level key mapping to exactly 100 predictions.
    if len(tmp_file_output.keys()) != 1:
        print("file format wrong here")
        raise gr.Error("Submission file format incorrect. Please refer to the format description!")

    tmp_output_key = list(tmp_file_output.keys())[0]
    if len(tmp_file_output[tmp_output_key].keys()) != 100:
        print("file not 100 here")
        raise gr.Error("File must contain exactly 100 predictions.")

    # Save submitted file
    time_atm = datetime.datetime.today()
    api.upload_file(
        repo_id=SUBMISSION_DATASET,
        path_or_fileobj=path_to_file.name,
        path_in_repo=f"{organisation}/{system_name}/{YEAR_VERSION}_raw_{time_atm}.json",
        repo_type="dataset",
        token=TOKEN,
    )

    # Compute score against the dev-set reference transcript.
    ref_file_path = "seglst_files/err_dev.ref.seglst.json"
    scores = instruction_scorer(file_path_input=path_to_file.name, ref_file_path=ref_file_path, system_name=system_name)

    path_or_fileobj = f"scored/{organisation}_{system_name}.json"
    save_json_file(path_or_fileobj, scores)

    # Save scored file
    api.upload_file(
        repo_id=SUBMISSION_DATASET,
        path_or_fileobj=path_or_fileobj,
        path_in_repo=f"{organisation}/{system_name}/{YEAR_VERSION}_scored_{time_atm}.json",
        repo_type="dataset",
        token=TOKEN,
    )

    # Actual submission
    eval_entry = {
        "System_name": system_name,
        "Method": method,
        "Organisation": organisation,
        "cpWER": scores["cpWER"],
        "WER": scores["WER"],
    }

    dev_set_data_csv = "dev_set_data.csv"

    # BUG FIX: the module-level `results` dataset is loaded with only a
    # "dev" split (results_data_files = {"dev": "dev_set_data.csv"}), so the
    # previous split="val" lookup raised a KeyError on every submission.
    dev_results_dataframe = get_dataframe_from_results(results=results, split="dev")
    dev_results_dataframe = pd.concat([dev_results_dataframe, pd.DataFrame([eval_entry])], ignore_index=True)
    dev_results_dataframe.to_csv(dev_set_data_csv, index=False)

    api.upload_file(
        repo_id=RESULTS_DATASET,
        path_or_fileobj=dev_set_data_csv,
        path_in_repo=dev_set_data_csv,
        repo_type="dataset",
        token=TOKEN,
    )

    return format_log(f"System_name {system_name} submitted by {organisation} successfully! \nPlease refresh the val leaderboard, and wait a bit to see the score displayed")
214
+
215
+
216
+ # def refresh():
217
+ # results_data_files = {"test": "contextual_test_results.csv", "val": "contextual_val_results.csv"}
218
+ # results = load_dataset(RESULTS_DATASET, data_files=
219
+ # results_data_files, token=TOKEN, download_mode="force_redownload", ignore_verifications=True)
220
+ # val_results_dataframe = get_dataframe_from_results(results=results, split="val")
221
+ # test_results_dataframe = get_dataframe_from_results(results=results, split="test")
222
+ # return val_results_dataframe, test_results_dataframe
223
+
224
def refresh():
    """Re-download the dev results CSV from the Hub and return the refreshed leaderboard dataframe."""
    data_files = {"dev": "dev_set_data.csv"}
    refreshed = load_dataset(
        RESULTS_DATASET,
        data_files=data_files,
        token=TOKEN,
        download_mode="force_redownload",
        ignore_verifications=True,
    )
    return get_dataframe_from_results(results=refreshed, split="dev")
231
+
232
def upload_file(files):
    """Map each uploaded file object to its local filesystem path (``.name``)."""
    return [uploaded.name for uploaded in files]
235
+
236
+
237
+
238
+
239
# ---------------------------------------------------------------------------
# Gradio UI: leaderboard display plus a submission form.
# NOTE(review): the original indentation was lost in extraction; the
# submission widgets are assumed to sit inside the submission accordion —
# confirm against the deployed Space.
# ---------------------------------------------------------------------------
demo = gr.Blocks()
with demo:
    # Page header.
    gr.HTML(TITLE)
    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")

    # Collapsible information sections.
    with gr.Row():
        with gr.Accordion("🧐 Introduction", open=False):
            gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")

    with gr.Row():
        with gr.Accordion("🎯 Submission Guidelines", open=False):
            gr.Markdown(SUBMISSION_TEXT, elem_classes="markdown-text")

    with gr.Row():
        with gr.Accordion("📙 Citation", open=False):
            citation_button = gr.TextArea(
                value=CITATION_BUTTON_TEXT,
                label=CITATION_BUTTON_LABEL,
                elem_id="citation-button",
            )

    # Dev-split leaderboard table, seeded from the dataset loaded at startup.
    with gr.Tab("Results: Dev"):
        leaderboard_table_dev = gr.components.Dataframe(
            value=dev_dataset_dataframe, datatype=TYPES, interactive=False,
            column_widths=["20%"]
        )

    # Manual refresh re-downloads the results CSV and replaces the table.
    refresh_button = gr.Button("Refresh")
    refresh_button.click(
        refresh,
        inputs=[],
        outputs=[
            leaderboard_table_dev,
        ],
    )

    # Submission form: metadata fields plus the prediction-file upload.
    with gr.Accordion("Submit a new system_name for evaluation"):
        with gr.Row():
            with gr.Column():
                system_name_textbox = gr.Textbox(label="System name", type='text')
                method_textbox = gr.Textbox(label="Method (LLM with prompt, beam-search, etc)", type='text')
            with gr.Column():
                organisation = gr.Textbox(label="Organisation or Team Name", type='text')
                mail = gr.Textbox(label="Contact email (will be stored privately, & used if there is an issue with your submission)", type='email')
                file_output = gr.File()

        submit_button = gr.Button("Submit Eval")
        submission_result = gr.Markdown()
        submit_button.click(
            add_new_eval,
            [
                system_name_textbox,
                method_textbox,
                file_output,
                organisation,
                mail
            ],
            submission_result,
        )

# Restart the Space every hour so the leaderboard stays fresh.
scheduler = BackgroundScheduler()
scheduler.add_job(restart_space, "interval", seconds=3600)
scheduler.start()
demo.launch(debug=True)
app_old.py ADDED
@@ -0,0 +1,281 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ import csv
4
+ import datetime
5
+ from email.utils import parseaddr
6
+
7
+ import gradio as gr
8
+ import pandas as pd
9
+ import numpy as np
10
+
11
+ from datasets import load_dataset
12
+ from apscheduler.schedulers.background import BackgroundScheduler
13
+ from huggingface_hub import HfApi
14
+
15
+ from scorer import instruction_scorer
16
+ from content import format_error, format_warning, format_log, TITLE, INTRODUCTION_TEXT, SUBMISSION_TEXT, CITATION_BUTTON_LABEL, CITATION_BUTTON_TEXT, model_hyperlink
17
+
18
+ TOKEN = os.environ.get("TOKEN", None)
19
+ OWNER="ucla-contextual"
20
+ TEST_DATASET = f"{OWNER}/contextual_test"
21
+ VAL_DATASET = f"{OWNER}/contextual_val"
22
+ SUBMISSION_DATASET = f"{OWNER}/submissions_internal"
23
+ CONTACT_DATASET = f"{OWNER}/contact_info"
24
+ RESULTS_DATASET = f"{OWNER}/results"
25
+ LEADERBOARD_PATH = f"{OWNER}/leaderboard"
26
+ api = HfApi()
27
+
28
+ YEAR_VERSION = "2024"
29
+
30
def read_json_file(filepath):
    """Parse *filepath* as JSON and return the resulting object."""
    with open(filepath) as infile:
        data_dict = json.load(infile)
    return data_dict
34
+
35
def save_json_file(filepath, data_dict):
    """Serialize *data_dict* to *filepath* as JSON, replacing any existing file."""
    with open(filepath, "w") as outfile:
        json.dump(data_dict, outfile)
38
+
39
+ os.makedirs("scored", exist_ok=True)
40
+
41
+ # test_data_files = {"test": "contextual_test.csv"}
42
+ # test_dataset = load_dataset(TEST_DATASET, data_files=test_data_files , token=TOKEN, download_mode="force_redownload", ignore_verifications=True)
43
+
44
+ # val_data_files = {"val": "contextual_val.csv"}
45
+ # val_dataset = load_dataset(VAL_DATASET, data_files=val_data_files , token=TOKEN, download_mode="force_redownload", ignore_verifications=True)
46
+
47
+ # results_data_files = {"test": "contextual_test_results.csv", "val": "contextual_val_results.csv"}
48
+ # results = load_dataset(RESULTS_DATASET, data_files=results_data_files, token=TOKEN, download_mode="force_redownload", ignore_verifications=True)
49
+
50
+ # contacts_data_files = {"contacts": "contacts.csv"}
51
+ # contact_infos = load_dataset(CONTACT_DATASET, data_files=contacts_data_files, token=TOKEN, download_mode="force_redownload", ignore_verifications=True)
52
+
53
def get_dataframe_from_results(results, split):
    """Build the leaderboard table for *split*: drop the URL column and rank by overall score, highest first."""
    table = results[split].to_pandas()
    table.drop(columns=['URL'], inplace=True)
    return table.sort_values(by=["All"], ascending=False)
58
+
59
+ # test_dataset_dataframe = test_dataset["test"].to_pandas()
60
+ # val_dataset_dataframe = val_dataset["val"].to_pandas()
61
+
62
+ # contacts_dataframe = contact_infos["contacts"].to_pandas()
63
+
64
+ # val_results_dataframe = get_dataframe_from_results(results=results, split="val")
65
+ # test_results_dataframe = get_dataframe_from_results(results=results, split="test")
66
+
67
def restart_space():
    """Restart the leaderboard Space via the HF Hub API (scheduled hourly below)."""
    api.restart_space(repo_id=LEADERBOARD_PATH, token=TOKEN)
69
+
70
+ TYPES = ["markdown", "markdown", "markdown", "number", "number", "number","number", "number", "number", "number", "number", "number"]
71
+
72
def add_new_eval(
    model: str,
    method: str,
    url: str,
    path_to_file: str,
    organisation: str,
    mail: str,
):
    """Validate, score, and publish a model submission (legacy contextual leaderboard).

    Steps: validate the form fields, reject duplicate model/org pairs, upload
    the raw submission file, score it, upload the scored file, append the
    entry to the val results CSV, record the contact info, and push both CSVs
    back to their datasets.

    NOTE(review): this legacy function reads the module globals `results`,
    `val_dataset_dataframe`, and `contact_infos`, whose definitions are
    commented out at module level in this file — calling it as-is would raise
    NameError. Confirm this file is dead code superseded by app_new.py.

    Raises:
        gr.Error: on empty fields, invalid email, duplicate submission,
            missing file, or a malformed submission file.
    """
    print("printing all inputs:", model, method, url, path_to_file, organisation, mail)

    if len(model)==0:
        print("model none")
        raise gr.Error("Please provide a model name. Field empty!")

    if len(method)==0:
        print("method none")
        raise gr.Error("Please provide a method. Field empty!")

    if len(organisation)==0:
        print("org none")
        raise gr.Error("Please provide organisation information. Field empty!")

    # Very basic email parsing
    _, parsed_mail = parseaddr(mail)
    if not "@" in parsed_mail:
        print("email here")
        raise gr.Error("Please provide a valid email address.")

    # Check if the combination model/org already exists and prints a warning message if yes
    if model.lower() in set([m.lower() for m in results["val"]["Model"]]) and organisation.lower() in set([o.lower() for o in results["val"]["Organisation"]]):
        print("model org combo here")
        raise gr.Error("This model has been already submitted.")

    if path_to_file is None:
        print("file missing here")
        raise gr.Error("Please attach a file.")

    tmp_file_output = read_json_file(path_to_file.name)

    # Expected format: one top-level key mapping to exactly 100 predictions.
    if len(tmp_file_output.keys())!=1:
        print("file format wrong here")
        raise gr.Error("Submission file format incorrect. Please refer to the format description!")

    tmp_output_key = list(tmp_file_output.keys())[0]
    if len(tmp_file_output[tmp_output_key].keys())!=100:
        print("file not 100 here")
        raise gr.Error("File must contain exactly 100 predictions.")

    # Save submitted file
    time_atm = datetime.datetime.today()
    api.upload_file(
        repo_id=SUBMISSION_DATASET,
        path_or_fileobj=path_to_file.name,
        path_in_repo=f"{organisation}/{model}/{YEAR_VERSION}_raw_{time_atm}.json",
        repo_type="dataset",
        token=TOKEN
    )

    # Compute score
    file_path = path_to_file.name
    scores = instruction_scorer(val_dataset_dataframe, file_path , model)

    path_or_fileobj=f"scored/{organisation}_{model}.json"
    save_json_file(path_or_fileobj, scores)

    # Save scored file
    api.upload_file(
        repo_id=SUBMISSION_DATASET,
        path_or_fileobj=path_or_fileobj,
        path_in_repo=f"{organisation}/{model}/{YEAR_VERSION}_scored_{time_atm}.json",
        repo_type="dataset",
        token=TOKEN
    )

    # Actual submission: one leaderboard row built from the per-category scores.
    eval_entry = {
        "Model": model,
        "Method":method,
        "Organisation": organisation,
        "URL": url,
        "All":scores["average"],
        "Time":scores["time"],
        "Shopping":scores["shopping"],
        "Navigation":scores["navigation-transportation"],
        "Abstract":scores["abstract"],
        "Application Usage":scores["app"],
        "Web Usage":scores["web"],
        "Infographic":scores["infographics"],
        "Miscellaneous Natural Scenes": scores["misc"]
    }

    # Append the new row to the val leaderboard and push the CSV back.
    val_results_dataframe = get_dataframe_from_results(results=results, split="val")
    val_results_dataframe = pd.concat([val_results_dataframe, pd.DataFrame([eval_entry])], ignore_index=True)
    val_results_dataframe.to_csv('contextual_val_results.csv', index=False)

    api.upload_file(
        repo_id=RESULTS_DATASET,
        path_or_fileobj="contextual_val_results.csv",
        path_in_repo=f"contextual_val_results.csv",
        repo_type="dataset",
        token=TOKEN
    )

    # Record the submitter's contact details in the private contacts dataset.
    contact_info = {
        "Model": model,
        "URL": url,
        "Organisation": organisation,
        "Mail": mail,
    }

    contacts_dataframe = contact_infos["contacts"].to_pandas()
    contacts_dataframe = pd.concat([contacts_dataframe, pd.DataFrame([contact_info])], ignore_index=True)
    contacts_dataframe.to_csv('contacts.csv', index=False)

    api.upload_file(
        repo_id=CONTACT_DATASET,
        path_or_fileobj="contacts.csv",
        path_in_repo=f"contacts.csv",
        repo_type="dataset",
        token=TOKEN
    )

    return format_log(f"Model {model} submitted by {organisation} successfully! \nPlease refresh the val leaderboard, and wait a bit to see the score displayed")
196
+
197
+
198
def refresh():
    """Re-download the val/test results CSVs and return both leaderboard dataframes."""
    results_data_files = {"test": "contextual_test_results.csv", "val": "contextual_val_results.csv"}
    results = load_dataset(RESULTS_DATASET, data_files=
    results_data_files, token=TOKEN, download_mode="force_redownload", ignore_verifications=True)
    val_results_dataframe = get_dataframe_from_results(results=results, split="val")
    test_results_dataframe = get_dataframe_from_results(results=results, split="test")
    return val_results_dataframe, test_results_dataframe
205
+
206
def upload_file(files):
    """Return the local filesystem path (``.name``) of each uploaded file object."""
    file_paths = [file.name for file in files]
    return file_paths
209
+
210
+
211
+ demo = gr.Blocks()
212
+ with demo:
213
+ gr.HTML(TITLE)
214
+ # gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
215
+
216
+ with gr.Row():
217
+ with gr.Accordion("🧐 Introduction", open=False):
218
+ gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
219
+
220
+ with gr.Row():
221
+ with gr.Accordion("🎯 Submission Guidelines", open=False):
222
+ gr.Markdown(SUBMISSION_TEXT, elem_classes="markdown-text")
223
+
224
+ with gr.Row():
225
+ with gr.Accordion("📙 Citation", open=False):
226
+ citation_button = gr.TextArea(
227
+ value=CITATION_BUTTON_TEXT,
228
+ label=CITATION_BUTTON_LABEL,
229
+ elem_id="citation-button",
230
+ )
231
+ with gr.Tab("Results: Test"):
232
+ leaderboard_table_test = gr.components.Dataframe(
233
+ value=test_results_dataframe, datatype=TYPES, interactive=False,
234
+ column_widths=["20%"]
235
+ )
236
+ with gr.Tab("Results: Val"):
237
+ leaderboard_table_val = gr.components.Dataframe(
238
+ value=val_results_dataframe, datatype=TYPES, interactive=False,
239
+ column_widths=["20%"]
240
+ )
241
+
242
+ refresh_button = gr.Button("Refresh")
243
+ refresh_button.click(
244
+ refresh,
245
+ inputs=[],
246
+ outputs=[
247
+ leaderboard_table_val,
248
+ leaderboard_table_test,
249
+ ],
250
+ )
251
+ with gr.Accordion("Submit a new model for evaluation"):
252
+ with gr.Row():
253
+ with gr.Column():
254
+ model_name_textbox = gr.Textbox(label="Model name", type='text')
255
+ method_textbox = gr.Textbox(label="Method (LMM or Aug LLM or any other)", type='text')
256
+ url_textbox = gr.Textbox(label="URL to model information", type='text')
257
+ with gr.Column():
258
+ organisation = gr.Textbox(label="Organisation", type='text')
259
+ mail = gr.Textbox(label="Contact email (will be stored privately, & used if there is an issue with your submission)", type='email')
260
+ file_output = gr.File()
261
+
262
+
263
+ submit_button = gr.Button("Submit Eval")
264
+ submission_result = gr.Markdown()
265
+ submit_button.click(
266
+ add_new_eval,
267
+ [
268
+ model_name_textbox,
269
+ method_textbox,
270
+ url_textbox,
271
+ file_output,
272
+ organisation,
273
+ mail
274
+ ],
275
+ submission_result,
276
+ )
277
+
278
+ scheduler = BackgroundScheduler()
279
+ scheduler.add_job(restart_space, "interval", seconds=3600)
280
+ scheduler.start()
281
+ demo.launch(debug=True)
beam_search_utils.py ADDED
@@ -0,0 +1,339 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from tqdm import tqdm
2
+ from typing import Dict, List
3
+ from pydiardecode import build_diardecoder
4
+ import numpy as np
5
+ import copy
6
+ import os
7
+ import json
8
+ import concurrent.futures
9
+ import kenlm
10
+
11
+ __INFO_TAG__ = "[BeamSearchUtil INFO]"
12
+
13
class SpeakerTaggingBeamSearchDecoder:
    """Beam-search re-alignment of per-word speaker tags using a KenLM language model.

    Sessions are dicts keyed by a unique ID, each holding a ``'words'`` list of
    per-word dicts (at minimum ``'word'`` and ``'speaker'`` keys). Long word
    sequences can be chunked (``divide_chunks``) for parallel decoding and the
    chunk outputs merged back (``merge_div_inputs``).
    """

    def __init__(self, loaded_kenlm_model: kenlm, cfg: dict):
        # cfg supplies decoding parameters; 'beam_width' is read in
        # realign_words_with_lm.
        self.realigning_lm_params = cfg
        self.realigning_lm = self._load_realigning_LM(loaded_kenlm_model=loaded_kenlm_model)
        # Separator used to encode (uniq_id, chunk_index, chunk_count) into a
        # single chunk key, e.g. "sess1@0@4".
        self._SPLITSYM = "@"

    def _load_realigning_LM(self, loaded_kenlm_model: kenlm):
        """
        Load ARPA language model for realigning speaker labels for words.

        NOTE(review): currently returns None (no build_diardecoder call), so
        realign_words_with_lm would fail on ``None.decode_beams``. Confirm
        whether the decoder construction is intentionally stubbed out here.
        """
        diar_decoder = None
        return diar_decoder

    def realign_words_with_lm(self, word_dict_seq_list: List[Dict[str, float]], speaker_count: int = None, port_num=None) -> List[Dict[str, float]]:
        """Run beam-search decoding over one word sequence.

        The candidate speaker set is taken from the word dicts themselves when
        *speaker_count* is None, otherwise it is "speaker_0".."speaker_{n-1}".
        Returns the list of beams produced by ``decode_beams``.
        """
        if speaker_count is None:
            spk_list = []
            for k, line_dict in enumerate(word_dict_seq_list):
                _, spk_label = line_dict['word'], line_dict['speaker']
                spk_list.append(spk_label)
        else:
            spk_list = [ f"speaker_{k}" for k in range(speaker_count)]

        realigned_list = self.realigning_lm.decode_beams(beam_width=self.realigning_lm_params['beam_width'],
                                                         speaker_list=sorted(list(set(spk_list))),
                                                         word_dict_seq_list=word_dict_seq_list,
                                                         port_num=port_num)
        return realigned_list

    def beam_search_diarization(
        self,
        trans_info_dict: Dict[str, Dict[str, list]],
        port_num: List[int] = None,
    ) -> Dict[str, Dict[str, float]]:
        """
        Match the diarization result with the ASR output.
        The words and the timestamps for the corresponding words are matched in a for loop.

        Args:
            trans_info_dict: Session dicts keyed by unique ID; each must carry
                'words' and 'speaker_count'.
            port_num: Optional port list forwarded to the decoder.

        Returns:
            trans_info_dict (dict):
                Dictionary containing word timestamps, speaker labels and words from all sessions.
                Each session is indexed by a unique ID. Modified in place.
        """
        for uniq_id, session_dict in tqdm(trans_info_dict.items(), total=len(trans_info_dict), disable=True):
            word_dict_seq_list = session_dict['words']
            output_beams = self.realign_words_with_lm(word_dict_seq_list=word_dict_seq_list, speaker_count=session_dict['speaker_count'], port_num=port_num)
            # Keep only the best beam; index [2] of a beam holds the realigned
            # word-dict sequence.
            word_dict_seq_list = output_beams[0][2]
            trans_info_dict[uniq_id]['words'] = word_dict_seq_list
        return trans_info_dict

    def merge_div_inputs(self, div_trans_info_dict, org_trans_info_dict, win_len=250, word_window=16, limit_max_spks=8):
        """
        Merge the outputs of parallel processing.

        Chunk keys of the form "uniq@idx@count" are regrouped per session and
        re-concatenated in order; each chunk after the first drops its leading
        *word_window* words (the overlap added by divide_chunks). Sessions that
        were never chunked (e.g. skipped for exceeding the speaker limit) fall
        back to their original word list. Also writes the merged words back
        into org_trans_info_dict.
        """
        uniq_id_list = list(org_trans_info_dict.keys())
        sub_div_dict = {}
        for seq_id in div_trans_info_dict.keys():
            div_info = seq_id.split(self._SPLITSYM)
            uniq_id, sub_idx, total_count = div_info[0], int(div_info[1]), int(div_info[2])
            if uniq_id not in sub_div_dict:
                sub_div_dict[uniq_id] = [None] * total_count
            sub_div_dict[uniq_id][sub_idx] = div_trans_info_dict[seq_id]['words']

        processed_trans_info_dict = {}
        for uniq_id in uniq_id_list:
            processed_trans_info_dict[uniq_id] = {'words': []}

            if uniq_id in sub_div_dict:
                for k, div_words in enumerate(sub_div_dict[uniq_id]):
                    if k == 0:
                        # First chunk has no leading overlap; cap at win_len.
                        div_words = div_words[:win_len]
                    else:
                        # Later chunks drop the overlap words re-decoded for context.
                        div_words = div_words[word_window:]
                    processed_trans_info_dict[uniq_id]['words'].extend(div_words)

                org_trans_info_dict[uniq_id]['words'] = processed_trans_info_dict[uniq_id]['words']
            else:
                processed_trans_info_dict[uniq_id]['words'] = org_trans_info_dict[uniq_id]['words']
        return processed_trans_info_dict

    def divide_chunks(self, trans_info_dict, win_len, word_window, limit_max_spks, port):
        """
        Divide word sequence into chunks of length `win_len` for parallel processing.

        Each chunk after the first is extended backwards by *word_window* words
        of overlap. Sessions with more than *limit_max_spks* speakers are
        skipped. Mutates the per-session dicts (removes 'status',
        'transcription', 'sentences'). Returns a dict keyed by
        "uniq@idx@count" chunk IDs.

        Args:
            trans_info_dict: Session dicts keyed by unique ID.
            win_len (int, optional): Chunk length in words; when None it is
                derived from the worker count.
            word_window: Overlap (in words) prepended to each non-first chunk.
            limit_max_spks: Sessions above this speaker count are skipped.
            port: Worker-port list; its length sets the worker count when > 1.
        """
        if len(port) > 1:
            num_workers = len(port)
        else:
            num_workers = 25
        div_trans_info_dict = {}
        for uniq_id in trans_info_dict.keys():

            uniq_trans = trans_info_dict[uniq_id]
            # Strip fields the decoder does not consume before fan-out.
            if 'status' in uniq_trans:
                del uniq_trans['status']
            if 'transcription' in uniq_trans:
                del uniq_trans['transcription']
            if 'sentences' in uniq_trans:
                del uniq_trans['sentences']
            word_seq = uniq_trans['words']
            num_spks = len(set([x['speaker'] for x in word_seq]))
            if num_spks > limit_max_spks:
                continue

            div_word_seq = []
            if win_len is None:
                win_len = int(np.ceil(len(word_seq)/num_workers))
            n_chunks = int(np.ceil(len(word_seq)/win_len))

            for k in range(n_chunks):
                # Extend each non-first chunk backwards by word_window words.
                div_word_seq.append(word_seq[max(k*win_len - word_window, 0):(k+1)*win_len])

            total_count = len(div_word_seq)
            for k, w_seq in enumerate(div_word_seq):
                seq_id = uniq_id + f"{self._SPLITSYM}{k}{self._SPLITSYM}{total_count}"
                div_trans_info_dict[seq_id] = dict(uniq_trans)
                div_trans_info_dict[seq_id]['words'] = w_seq
        return div_trans_info_dict
138
+
139
def run_mp_beam_search_decoding(
    speaker_beam_search_decoder,
    loaded_kenlm_model,
    div_trans_info_dict,
    org_trans_info_dict,
    div_mp,
    win_len,
    word_window,
    limit_max_spks,
    port=None,
    use_ngram=False
):
    """
    Run beam-search diarization for every chunk in parallel worker processes.

    Each chunk is submitted as a single-session dict to
    `speaker_beam_search_decoder.beam_search_diarization`; ports are assigned
    round-robin. When `div_mp` is True the chunked results are merged back
    into whole sessions via `merge_div_inputs`.

    Note: `loaded_kenlm_model` is not used directly here; it is part of the
    decoder instance and kept in the signature for interface compatibility.

    Returns:
        dict: Decoded (and, if `div_mp`, merged) transcription info dict.
    """
    if len(port) > 1:
        port = [int(p) for p in port]
    if use_ngram:
        # n-gram scoring runs in-process; no LM server ports are needed.
        port = [None]
        num_workers = 24
    else:
        num_workers = len(port)
    uniq_id_list = sorted(div_trans_info_dict.keys())
    print(f"{__INFO_TAG__} Number of unique chunks to process: {len(uniq_id_list)}")

    output_trans_info_dict = {}
    # Context-manage the executor so worker processes are shut down even if a
    # submitted task (or result collection) raises.
    with concurrent.futures.ProcessPoolExecutor(max_workers=num_workers) as executor:
        futures = []
        for count, uniq_id in enumerate(uniq_id_list):
            print(f"{__INFO_TAG__} Running beam search decoding for {uniq_id}...")
            # Round-robin port assignment across workers.
            port_num = port[count % len(port)] if port is not None else None
            uniq_trans_info_dict = {uniq_id: div_trans_info_dict[uniq_id]}
            futures.append(
                executor.submit(
                    speaker_beam_search_decoder.beam_search_diarization,
                    uniq_trans_info_dict,
                    port_num=port_num,
                )
            )

        pbar = tqdm(total=len(uniq_id_list), desc="Running beam search decoding", unit="files")
        for done_future in concurrent.futures.as_completed(futures):
            pbar.update()
            output_trans_info_dict.update(done_future.result())
        pbar.close()

    if div_mp:
        output_trans_info_dict = speaker_beam_search_decoder.merge_div_inputs(
            div_trans_info_dict=output_trans_info_dict,
            org_trans_info_dict=org_trans_info_dict,
            win_len=win_len,
            word_window=word_window,
            limit_max_spks=limit_max_spks,
        )
    return output_trans_info_dict
190
+
191
def count_num_of_spks(json_trans_list):
    """
    Map each speaker label in a seglst-style sentence list to an integer index.

    The labels are sorted before enumeration so the mapping is deterministic
    across runs; iterating a plain set varies with string-hash randomization,
    which made downstream speaker indices irreproducible.

    Args:
        json_trans_list (list): Sentence dicts, each with a 'speaker' key.

    Returns:
        dict: {speaker_label: index} with indices 0..N-1 in sorted-label order.
    """
    spk_set = {sentence_dict['speaker'] for sentence_dict in json_trans_list}
    return {spk_str: idx for idx, spk_str in enumerate(sorted(spk_set))}
197
+
198
def add_placeholder_speaker_softmax(json_trans_list, peak_prob=0.94, max_spks=4):
    """
    Convert a seglst-style sentence list into a NeMo-style word-level dict,
    attaching a synthetic per-word speaker softmax: the labeled speaker gets
    `peak_prob` and the remaining mass is spread evenly over the other
    `max_spks - 1` slots.

    Args:
        json_trans_list (list): Sentence dicts with 'words' (space-joined
            string) and 'speaker' keys.
        peak_prob (float): Probability assigned to the labeled speaker.
        max_spks (int): Length of the softmax vector.

    Returns:
        dict: {'words', 'status', 'sentences', 'speaker_count', 'transcription'}.

    Raises:
        ValueError: If `peak_prob` is outside [0, 1], or the transcript
            contains more speakers than `max_spks` (previously a late
            IndexError).
    """
    if peak_prob > 1 or peak_prob < 0:
        raise ValueError(f"peak_prob must be between 0 and 1 but got {peak_prob}")
    speaker_map = count_num_of_spks(json_trans_list)
    if len(speaker_map) > max_spks:
        raise ValueError(
            f"Found {len(speaker_map)} speakers but max_spks is {max_spks}"
        )
    # Off-peak probability per slot; guard max_spks == 1 (would divide by zero).
    off_peak = (1 - peak_prob) / (max_spks - 1) if max_spks > 1 else 0.0
    base_array = np.ones(max_spks) * off_peak
    word_dict_seq_list = []
    stt_sec, end_sec = None, None  # word-level timestamps are unknown here
    for sentence_dict in json_trans_list:
        speaker_idx = speaker_map[sentence_dict['speaker']]
        for word in sentence_dict['words'].split():
            # ndarray.copy() is sufficient (and cheaper than deepcopy).
            speaker_softmax = base_array.copy()
            speaker_softmax[speaker_idx] = peak_prob
            word_dict_seq_list.append({'word': word,
                                       'start_time': stt_sec,
                                       'end_time': end_sec,
                                       'speaker': speaker_idx,
                                       'speaker_softmax': speaker_softmax}
                                      )
    return {'words': word_dict_seq_list,
            'status': "success",
            'sentences': json_trans_list,
            'speaker_count': len(speaker_map),
            'transcription': None}
225
+
226
def convert_nemo_json_to_seglst(trans_info_dict):
    """
    Turn NeMo-style word-level output into seglst segment lists, one list per
    session. Consecutive words carrying the same speaker label are joined into
    a single segment; start/end times are zeroed since none are tracked here.

    Args:
        trans_info_dict (dict): Session-indexed dict; each entry has a 'words'
            list of dicts with 'word' and 'speaker' keys.

    Returns:
        dict: {session_id: [segment dicts with session_id/words/start_time/
            end_time/speaker]}.
    """
    seg_lst_dict = {}
    spk_wise_trans_sessions = {}
    for session_id, session_entry in trans_info_dict.items():
        # Per-speaker running transcription (kept for parity with the original
        # implementation; it is a local side product and not returned).
        per_spk_text = {}
        spk_wise_trans_sessions[session_id] = per_spk_text

        segments = []
        word_seq = session_entry['words']
        n_words = len(word_seq)
        last_spk, running = None, ''
        for idx, word_entry in enumerate(word_seq):
            spk = word_entry['speaker']
            token = word_entry['word']

            # Speaker-wise transcription accumulation.
            if spk in per_spk_text:
                per_spk_text[spk] = f"{per_spk_text[spk]} {token}"
            else:
                per_spk_text[spk] = token

            # Segment-wise accumulation: flush on a speaker turn boundary.
            if last_spk is not None and spk != last_spk:
                segments.append({'session_id': session_id,
                                 'words': running.strip(),
                                 'start_time': 0.0,
                                 'end_time': 0.0,
                                 'speaker': last_spk,
                                 })
                running = token
            else:
                running = f"{running} {token}"
            last_spk = spk

            # Final word: flush whatever is accumulated for the current speaker.
            if idx == n_words - 1:
                segments.append({'session_id': session_id,
                                 'words': running.strip(),
                                 'start_time': 0.0,
                                 'end_time': 0.0,
                                 'speaker': spk,
                                 })
        seg_lst_dict[session_id] = segments
    return seg_lst_dict
269
+
270
def load_input_jsons(input_error_src_list_path, ext_str=".seglst.json", peak_prob=0.94, max_spks=4):
    """
    Load the error-source seglst JSON files listed (one path per line) in
    `input_error_src_list_path` and convert each into the NeMo-style word
    dict with placeholder speaker softmax values.

    Args:
        input_error_src_list_path (str): Text file listing seglst JSON paths.
        ext_str (str): Filename suffix stripped to obtain the session id.
        peak_prob (float): Peak probability for the placeholder softmax.
        max_spks (int): Softmax vector length.

    Returns:
        dict: {uniq_id: nemo-style session dict}.

    Raises:
        FileNotFoundError: If any listed JSON file does not exist.
    """
    trans_info_dict = {}
    # Context-manage the list file (the original `open(...).readlines()`
    # leaked the file handle).
    with open(input_error_src_list_path) as list_file:
        json_filepath_list = list_file.readlines()
    for json_path in json_filepath_list:
        json_path = json_path.strip()
        uniq_id = os.path.split(json_path)[-1].split(ext_str)[0]
        if not os.path.exists(json_path):
            raise FileNotFoundError(f"{json_path} does not exist. Aborting.")
        with open(json_path, "r") as file:
            json_trans = json.load(file)
        nemo_json_dict = add_placeholder_speaker_softmax(json_trans, peak_prob=peak_prob, max_spks=max_spks)
        trans_info_dict[uniq_id] = nemo_json_dict
    return trans_info_dict
284
+
285
def load_reference_jsons(reference_seglst_list_path, ext_str=".seglst.json"):
    """
    Load the reference seglst JSON files listed (one path per line) in
    `reference_seglst_list_path`, stamping each sentence dict with its
    session id.

    Args:
        reference_seglst_list_path (str): Text file listing seglst JSON paths.
        ext_str (str): Filename suffix stripped to obtain the session id.

    Returns:
        dict: {uniq_id: [sentence dicts with 'session_id' added]}.

    Raises:
        FileNotFoundError: If any listed JSON file does not exist.
    """
    reference_info_dict = {}
    # Context-manage the list file (the original `open(...).readlines()`
    # leaked the file handle).
    with open(reference_seglst_list_path) as list_file:
        json_filepath_list = list_file.readlines()
    for json_path in json_filepath_list:
        json_path = json_path.strip()
        uniq_id = os.path.split(json_path)[-1].split(ext_str)[0]
        if not os.path.exists(json_path):
            raise FileNotFoundError(f"{json_path} does not exist. Aborting.")
        with open(json_path, "r") as file:
            json_trans = json.load(file)
        json_trans_uniq_id = []
        for sentence_dict in json_trans:
            sentence_dict['session_id'] = uniq_id
            json_trans_uniq_id.append(sentence_dict)
        reference_info_dict[uniq_id] = json_trans_uniq_id
    return reference_info_dict
302
+
303
def write_seglst_jsons(
    seg_lst_sessions_dict: dict,
    input_error_src_list_path: str,
    diar_out_path: str,
    ext_str: str,
    write_individual_seglst_jsons=True
):
    """
    Writes the segment list (seglst) JSON files to the output directory.

    Parameters:
        seg_lst_sessions_dict (dict): A dictionary containing session IDs as keys and their corresponding segment lists as values.
        input_error_src_list_path (str): The path to the input error source list file.
        diar_out_path (str): The path to the output directory where the seglst JSON files will be written.
        ext_str (str): Tag for the combined seglst JSON filename (e.g. 'hyp' for hypothesis or 'ref' for reference); replaces 'src'/'ref' in the list-file name.
        write_individual_seglst_jsons (bool, optional): A flag indicating whether to write individual seglst JSON files for each session. Defaults to True.

    Returns:
        None
    """
    total_infer_list = []
    total_output_filename = os.path.split(input_error_src_list_path)[-1].replace(".list", "")
    for session_id, seg_lst_list in seg_lst_sessions_dict.items():
        total_infer_list.extend(seg_lst_list)
        if write_individual_seglst_jsons:
            print(f"{__INFO_TAG__} Writing {diar_out_path}/{session_id}.seglst.json")
            with open(f'{diar_out_path}/{session_id}.seglst.json', 'w') as file:
                json.dump(seg_lst_list, file, indent=4)  # indent=4 for pretty printing

    total_output_filename = total_output_filename.replace("src", ext_str).replace("ref", ext_str)
    write_fn = f"{diar_out_path}/{total_output_filename}.seglst.json"
    # Announce the combined file actually being written (the original message
    # repeated the last per-session path here, which was misleading).
    print(f"{__INFO_TAG__} Writing {write_fn}")
    if os.path.exists(write_fn):
        print(f"{__INFO_TAG__} {write_fn} already exists. Deleting it.")
        os.remove(write_fn)
    with open(write_fn, 'w') as file:
        json.dump(total_infer_list, file, indent=4)  # indent=4 for pretty printing
entry_data/dev_set_data.csv ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ system_name,method,organisation,mail,cpWER,WER
2
+ baseline_system,beam_search_ngram,SLT_Task2,tango4j@gmail.com,0.24536675570166427,0.21231591
entry_data/dev_set_data_1.csv ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ baseline_system,0.24536675570166427,0.21231591
2
+ baseline_system_2,0.01234,0.1234
hyper_optim.py ADDED
@@ -0,0 +1,196 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import optuna
2
+ import os
3
+ import tempfile
4
+ import time
5
+ import json
6
+ import subprocess
7
+ import logging
8
+ from beam_search_utils import (
9
+ write_seglst_jsons,
10
+ run_mp_beam_search_decoding,
11
+ convert_nemo_json_to_seglst,
12
+ SpeakerTaggingBeamSearchDecoder,
13
+ )
14
+
15
+ from speaker_tagging_cpwer_jsons import process_session_data
16
+
17
def evaluate(cfg, temp_out_dir, asrdiar_file_name, source_info_dict, hypothesis_sessions_dict, reference_info_dict):
    """
    Score the hypothesis speaker tagging against the reference with meeteval cpWER.

    Writes hyp/ref/src seglst JSON files into `temp_out_dir`, runs the
    `meeteval-wer cpwer` CLI on the hyp/ref pair, and returns the resulting
    cpWER read from the JSON file meeteval writes next to the hypothesis file.

    Returns:
        float: Hypothesis cpWER.

    Raises:
        subprocess.CalledProcessError: If the meeteval-wer command fails.
        FileNotFoundError: If the expected cpWER output JSON is missing.
    """
    write_seglst_jsons(hypothesis_sessions_dict, input_error_src_list_path=cfg.input_error_src_list_path, diar_out_path=temp_out_dir, ext_str='hyp')
    write_seglst_jsons(reference_info_dict, input_error_src_list_path=cfg.groundtruth_ref_list_path, diar_out_path=temp_out_dir, ext_str='ref')
    write_seglst_jsons(source_info_dict, input_error_src_list_path=cfg.groundtruth_ref_list_path, diar_out_path=temp_out_dir, ext_str='src')

    # Construct the file paths
    hyp_seglst_json = os.path.join(temp_out_dir, f"{asrdiar_file_name}.hyp.seglst.json")
    ref_seglst_json = os.path.join(temp_out_dir, f"{asrdiar_file_name}.ref.seglst.json")

    # meeteval-wer writes <hyp>.seglst_cpwer.json next to the hypothesis file.
    output_cpwer_hyp_json_file = os.path.join(temp_out_dir, f"{asrdiar_file_name}.hyp.seglst_cpwer.json")

    # check=True surfaces a meeteval failure immediately instead of the
    # confusing missing-output-file error below.
    cmd_hyp = [
        "meeteval-wer",
        "cpwer",
        "-h", hyp_seglst_json,
        "-r", ref_seglst_json
    ]
    subprocess.run(cmd_hyp, check=True)

    # Read the JSON file and report the cpWER.
    try:
        with open(output_cpwer_hyp_json_file, "r") as file:
            data_h = json.load(file)
    except FileNotFoundError:
        raise FileNotFoundError(f"Output JSON: {output_cpwer_hyp_json_file}\nfile not found.")
    print("Hypothesis cpWER:", data_h["error_rate"])
    cpwer = data_h["error_rate"]
    logging.info(f"-> HYPOTHESIS cpWER={cpwer:.4f}")
    return cpwer
51
+
52
def evaluate_diff(cfg, temp_out_dir, asrdiar_file_name, source_info_dict, hypothesis_sessions_dict, reference_info_dict):
    """
    Score both the hypothesis and the (uncorrected) source against the
    reference with meeteval cpWER and return the difference
    ``hyp_cpwer - src_cpwer`` (negative means the hypothesis improved over
    the error source).

    Also logs the average per-session cpWER difference computed by
    `process_session_data` from meeteval's per-recording outputs.

    Returns:
        float: hyp cpWER minus src cpWER.

    Raises:
        subprocess.CalledProcessError: If a meeteval-wer command fails.
        FileNotFoundError: If an expected cpWER output JSON is missing.
    """
    write_seglst_jsons(hypothesis_sessions_dict, input_error_src_list_path=cfg.input_error_src_list_path, diar_out_path=temp_out_dir, ext_str='hyp')
    write_seglst_jsons(reference_info_dict, input_error_src_list_path=cfg.groundtruth_ref_list_path, diar_out_path=temp_out_dir, ext_str='ref')
    write_seglst_jsons(source_info_dict, input_error_src_list_path=cfg.groundtruth_ref_list_path, diar_out_path=temp_out_dir, ext_str='src')

    # Construct the file paths
    src_seglst_json = os.path.join(temp_out_dir, f"{asrdiar_file_name}.src.seglst.json")
    hyp_seglst_json = os.path.join(temp_out_dir, f"{asrdiar_file_name}.hyp.seglst.json")
    ref_seglst_json = os.path.join(temp_out_dir, f"{asrdiar_file_name}.ref.seglst.json")

    # Run meeteval-wer on hyp vs ref and src vs ref; check=True surfaces a
    # CLI failure immediately instead of a missing-output-file error below.
    cmd_hyp = [
        "meeteval-wer",
        "cpwer",
        "-h", hyp_seglst_json,
        "-r", ref_seglst_json
    ]
    subprocess.run(cmd_hyp, check=True)

    cmd_src = [
        "meeteval-wer",
        "cpwer",
        "-h", src_seglst_json,
        "-r", ref_seglst_json
    ]
    subprocess.run(cmd_src, check=True)

    # meeteval writes these result JSONs next to the scored files.
    output_cpwer_hyp_json_file = os.path.join(temp_out_dir, f"{asrdiar_file_name}.hyp.seglst_cpwer.json")
    output_cpwer_src_json_file = os.path.join(temp_out_dir, f"{asrdiar_file_name}.src.seglst_cpwer.json")
    output_cpwer_hyp_json_file_per_reco = os.path.join(temp_out_dir, f"{asrdiar_file_name}.hyp.seglst_cpwer_per_reco.json")
    output_cpwer_src_json_file_per_reco = os.path.join(temp_out_dir, f"{asrdiar_file_name}.src.seglst_cpwer_per_reco.json")

    avg_cpwer_diff = process_session_data(output_cpwer_hyp_json_file_per_reco, output_cpwer_src_json_file_per_reco)

    try:
        with open(output_cpwer_hyp_json_file, "r") as file:
            data_h = json.load(file)
        hyp_cpwer = data_h["error_rate"]
        logging.info(f"-> HYPOTHESIS cpWER={hyp_cpwer:.4f}")
    except FileNotFoundError:
        raise FileNotFoundError(f"Output JSON: {output_cpwer_hyp_json_file}\nfile not found.")

    try:
        # data_s (not data_h again) to avoid shadowing the hypothesis payload.
        with open(output_cpwer_src_json_file, "r") as file:
            data_s = json.load(file)
        src_cpwer = data_s["error_rate"]
        logging.info(f"-> SOURCE cpWER={src_cpwer:.4f}")
    except FileNotFoundError:
        raise FileNotFoundError(f"Output JSON: {output_cpwer_src_json_file}\nfile not found.")
    diff_cpwer = (hyp_cpwer - src_cpwer)
    logging.info(f"-> Average cpWER DIFF={avg_cpwer_diff:.4f}")
    logging.info(f"-> HYPOTHESIS Improved cpWER={diff_cpwer:.4f}")
    return diff_cpwer
105
+
106
+
107
def optuna_suggest_params(cfg, trial):
    """Sample beam-search hyper-parameters from an Optuna `trial` onto `cfg`.

    `cfg` is mutated in place and also returned. The sampling order is part of
    the Optuna parameter stream, so keep it stable.
    """
    cfg.alpha = trial.suggest_float("alpha", 0.5, 1.5)
    cfg.beta = trial.suggest_float("beta", 0.02, 0.4)
    cfg.beam_width = trial.suggest_int("beam_width", 2, 12)
    cfg.word_window = trial.suggest_int("word_window", 10, 50, step=10)
    # n-gram LM rescoring is always enabled during the search.
    cfg.use_ngram = True
    cfg.parallel_chunk_word_len = trial.suggest_int("parallel_chunk_word_len", 50, 250, step=25)
    # Degenerate range: peak_prob is effectively pinned at 0.96 while still
    # being recorded as a trial parameter.
    cfg.peak_prob = trial.suggest_float("peak_prob", 0.96, 0.96)
    return cfg
116
+
117
def beamsearch_objective(
    trial,
    cfg,
    speaker_beam_search_decoder,
    loaded_kenlm_model,
    org_trans_info_dict,
    source_info_dict,
    reference_info_dict,
):
    """Optuna objective: run chunked beam-search speaker tagging and score it.

    When `trial` is not None, hyper-parameters are sampled onto `cfg` and a
    fresh decoder is built; otherwise the passed-in decoder and `cfg` are used
    as-is. Returns the value from `evaluate_diff` (hyp cpWER minus src cpWER;
    lower is better, so the study minimizes it). All intermediate seglst files
    go into a temporary directory that is removed afterwards.
    """
    with tempfile.TemporaryDirectory(dir=cfg.temp_out_dir, prefix="GenSEC_") as local_temp_out_dir:
        start_time2 = time.time()

        if trial is not None:
            cfg = optuna_suggest_params(cfg, trial)
            # Rebuild the decoder so it picks up the trial's hyper-parameters.
            speaker_beam_search_decoder = SpeakerTaggingBeamSearchDecoder(loaded_kenlm_model=loaded_kenlm_model, cfg=cfg)
        # Split sessions into overlapping word chunks for parallel decoding.
        div_trans_info_dict = speaker_beam_search_decoder.divide_chunks(trans_info_dict=org_trans_info_dict,
                                                                        win_len=cfg.parallel_chunk_word_len,
                                                                        word_window=cfg.word_window,
                                                                        limit_max_spks=cfg.limit_max_spks,
                                                                        port=cfg.port,)
        # Decode all chunks in parallel and merge them back (div_mp=True).
        result_trans_info_dict = run_mp_beam_search_decoding(speaker_beam_search_decoder,
                                                             loaded_kenlm_model=loaded_kenlm_model,
                                                             div_trans_info_dict=div_trans_info_dict,
                                                             org_trans_info_dict=org_trans_info_dict,
                                                             div_mp=True,
                                                             win_len=cfg.parallel_chunk_word_len,
                                                             word_window=cfg.word_window,
                                                             limit_max_spks=cfg.limit_max_spks,
                                                             port=cfg.port,
                                                             use_ngram=cfg.use_ngram,
                                                             )
        hypothesis_sessions_dict = convert_nemo_json_to_seglst(result_trans_info_dict)
        cpwer = evaluate_diff(cfg, local_temp_out_dir, cfg.asrdiar_file_name, source_info_dict, hypothesis_sessions_dict, reference_info_dict)
        logging.info(f"Beam Search time taken for trial {trial}: {(time.time() - start_time2)/60:.2f} mins")
        if trial is not None:
            logging.info(f"Trial: {trial.number}")
            logging.info(f"[ cpWER={cpwer:.4f} ]")
            logging.info("-----------------------------------------------")
        return cpwer
156
+
157
+
158
def optuna_hyper_optim(
    cfg,
    speaker_beam_search_decoder,
    loaded_kenlm_model,
    org_trans_info_dict,
    source_info_dict,
    reference_info_dict,
):
    """
    Optuna hyper-parameter optimization entry point.

    Creates (or resumes, via `load_if_exists=True`) the study named
    `cfg.optuna_study_name` in `cfg.storage`, routes logging to
    `cfg.output_log_file` (if set) and to the console, and minimizes the
    beam-search cpWER-difference objective for `cfg.optuna_n_trials` trials.

    Parameters:
        cfg (dict): A dictionary containing the configuration parameters.
    """
    def _objective(trial):
        # Each trial re-runs chunked beam search with trial-sampled params.
        return beamsearch_objective(
            trial=trial,
            cfg=cfg,
            speaker_beam_search_decoder=speaker_beam_search_decoder,
            loaded_kenlm_model=loaded_kenlm_model,
            org_trans_info_dict=org_trans_info_dict,
            source_info_dict=source_info_dict,
            reference_info_dict=reference_info_dict,
        )

    study = optuna.create_study(
        direction="minimize",
        study_name=cfg.optuna_study_name,
        storage=cfg.storage,
        load_if_exists=True
    )
    root_logger = logging.getLogger()
    root_logger.setLevel(logging.INFO)  # Setup the root logger.
    if cfg.output_log_file is not None:
        root_logger.addHandler(logging.FileHandler(cfg.output_log_file, mode="a"))
    root_logger.addHandler(logging.StreamHandler())
    optuna.logging.enable_propagation()  # Propagate logs to the root logger.
    study.optimize(_objective, n_trials=cfg.optuna_n_trials)
requirements.txt CHANGED
@@ -1,4 +1,5 @@
1
  datasets==2.14.5
 
2
  gradio==4.19.2
3
  huggingface-hub==0.19.3
4
  numpy==1.24.2
 
1
  datasets==2.14.5
2
+ meeteval
3
  gradio==4.19.2
4
  huggingface-hub==0.19.3
5
  numpy==1.24.2
scorer.py CHANGED
@@ -1,12 +1,35 @@
1
  import json
2
- import re
3
- import string
4
- import warnings
5
- import pandas as pd
6
- import numpy as np
7
  import os
8
 
9
- def instruction_scorer(data, judgment_file, model_name):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
 
11
  df = data
12
  img_dict = {}
 
1
  import json
2
+ import tempfile
3
+ import json
4
+ import subprocess
5
+ import logging
 
6
  import os
7
 
8
+
9
def instruction_scorer(file_path_input, ref_file_path, system_name):
    """
    Score a submitted hypothesis seglst file against the reference with
    meeteval cpWER.

    Args:
        file_path_input (str): Path to the hypothesis seglst JSON file.
        ref_file_path (str): Path to the reference seglst JSON file.
        system_name (str): Submitted system name (currently unused; kept for
            interface compatibility with the leaderboard app).

    Returns:
        dict: {"cpWER": float, "WER": float} — "WER" currently mirrors cpWER
        as a placeholder until a separate word-error metric is wired in.

    Raises:
        subprocess.CalledProcessError: If the meeteval-wer command fails.
    """
    cmd_hyp = [
        "meeteval-wer",
        "cpwer",
        "-h", file_path_input,
        "-r", ref_file_path,
    ]
    # check=True makes a meeteval failure raise here instead of surfacing
    # later as a missing-output-file error.
    subprocess.run(cmd_hyp, check=True)

    # Read the JSON file and print the cpWER.
    # NOTE(review): this assumes meeteval's result JSON lands as
    # "err_dev.hyp.seglst_cpwer.json" in the current working directory —
    # confirm against where the app invokes the scorer and the hyp file path.
    asrdiar_file_name = "err_dev"
    output_cpwer_hyp_json_file = os.path.join(f"{asrdiar_file_name}.hyp.seglst_cpwer.json")
    with open(output_cpwer_hyp_json_file, "r") as temp_file:
        data_h = json.load(temp_file)
    print("Hypothesis cpWER:", data_h["error_rate"])
    cpwer = data_h["error_rate"]
    logging.info(f"-> HYPOTHESIS cpWER={cpwer:.4f}")

    scores_dict = {"cpWER": cpwer, "WER": cpwer}
    return scores_dict
29
+
30
+
31
+
32
+ def __instruction_scorer(data, judgment_file, model_name):
33
 
34
  df = data
35
  img_dict = {}
seglst_files/err_dev.hyp.seglst.json ADDED
The diff for this file is too large to render. See raw diff
 
seglst_files/err_dev.ref.list ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /home/taejinp/projects/update_llm_speaker_tagging/llm_speaker_tagging/SLT-Task2-Post-ASR-Speaker-Tagging/ref_annotated_text/dev/session_e992c01d.seglst.json
2
+ /home/taejinp/projects/update_llm_speaker_tagging/llm_speaker_tagging/SLT-Task2-Post-ASR-Speaker-Tagging/ref_annotated_text/dev/session_17dba297.seglst.json
3
+ /home/taejinp/projects/update_llm_speaker_tagging/llm_speaker_tagging/SLT-Task2-Post-ASR-Speaker-Tagging/ref_annotated_text/dev/session_e6e6ca6b.seglst.json
4
+ /home/taejinp/projects/update_llm_speaker_tagging/llm_speaker_tagging/SLT-Task2-Post-ASR-Speaker-Tagging/ref_annotated_text/dev/session_197ddec4.seglst.json
5
+ /home/taejinp/projects/update_llm_speaker_tagging/llm_speaker_tagging/SLT-Task2-Post-ASR-Speaker-Tagging/ref_annotated_text/dev/session_ac417036.seglst.json
6
+ /home/taejinp/projects/update_llm_speaker_tagging/llm_speaker_tagging/SLT-Task2-Post-ASR-Speaker-Tagging/ref_annotated_text/dev/session_0edd751f.seglst.json
7
+ /home/taejinp/projects/update_llm_speaker_tagging/llm_speaker_tagging/SLT-Task2-Post-ASR-Speaker-Tagging/ref_annotated_text/dev/session_327770bf.seglst.json
8
+ /home/taejinp/projects/update_llm_speaker_tagging/llm_speaker_tagging/SLT-Task2-Post-ASR-Speaker-Tagging/ref_annotated_text/dev/session_1b20cec4.seglst.json
9
+ /home/taejinp/projects/update_llm_speaker_tagging/llm_speaker_tagging/SLT-Task2-Post-ASR-Speaker-Tagging/ref_annotated_text/dev/session_fa752d9e.seglst.json
10
+ /home/taejinp/projects/update_llm_speaker_tagging/llm_speaker_tagging/SLT-Task2-Post-ASR-Speaker-Tagging/ref_annotated_text/dev/session_ed8a6f55.seglst.json
11
+ /home/taejinp/projects/update_llm_speaker_tagging/llm_speaker_tagging/SLT-Task2-Post-ASR-Speaker-Tagging/ref_annotated_text/dev/session_75e7876e.seglst.json
12
+ /home/taejinp/projects/update_llm_speaker_tagging/llm_speaker_tagging/SLT-Task2-Post-ASR-Speaker-Tagging/ref_annotated_text/dev/session_405fe47b.seglst.json
13
+ /home/taejinp/projects/update_llm_speaker_tagging/llm_speaker_tagging/SLT-Task2-Post-ASR-Speaker-Tagging/ref_annotated_text/dev/session_7fe82ea3.seglst.json
seglst_files/err_dev.ref.seglst.json ADDED
The diff for this file is too large to render. See raw diff
 
seglst_files/err_dev.src.list ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /home/taejinp/projects/update_llm_speaker_tagging/llm_speaker_tagging/SLT-Task2-Post-ASR-Speaker-Tagging/err_source_text/dev/session_e992c01d.seglst.json
2
+ /home/taejinp/projects/update_llm_speaker_tagging/llm_speaker_tagging/SLT-Task2-Post-ASR-Speaker-Tagging/err_source_text/dev/session_17dba297.seglst.json
3
+ /home/taejinp/projects/update_llm_speaker_tagging/llm_speaker_tagging/SLT-Task2-Post-ASR-Speaker-Tagging/err_source_text/dev/session_e6e6ca6b.seglst.json
4
+ /home/taejinp/projects/update_llm_speaker_tagging/llm_speaker_tagging/SLT-Task2-Post-ASR-Speaker-Tagging/err_source_text/dev/session_197ddec4.seglst.json
5
+ /home/taejinp/projects/update_llm_speaker_tagging/llm_speaker_tagging/SLT-Task2-Post-ASR-Speaker-Tagging/err_source_text/dev/session_ac417036.seglst.json
6
+ /home/taejinp/projects/update_llm_speaker_tagging/llm_speaker_tagging/SLT-Task2-Post-ASR-Speaker-Tagging/err_source_text/dev/session_0edd751f.seglst.json
7
+ /home/taejinp/projects/update_llm_speaker_tagging/llm_speaker_tagging/SLT-Task2-Post-ASR-Speaker-Tagging/err_source_text/dev/session_327770bf.seglst.json
8
+ /home/taejinp/projects/update_llm_speaker_tagging/llm_speaker_tagging/SLT-Task2-Post-ASR-Speaker-Tagging/err_source_text/dev/session_1b20cec4.seglst.json
9
+ /home/taejinp/projects/update_llm_speaker_tagging/llm_speaker_tagging/SLT-Task2-Post-ASR-Speaker-Tagging/err_source_text/dev/session_fa752d9e.seglst.json
10
+ /home/taejinp/projects/update_llm_speaker_tagging/llm_speaker_tagging/SLT-Task2-Post-ASR-Speaker-Tagging/err_source_text/dev/session_ed8a6f55.seglst.json
11
+ /home/taejinp/projects/update_llm_speaker_tagging/llm_speaker_tagging/SLT-Task2-Post-ASR-Speaker-Tagging/err_source_text/dev/session_75e7876e.seglst.json
12
+ /home/taejinp/projects/update_llm_speaker_tagging/llm_speaker_tagging/SLT-Task2-Post-ASR-Speaker-Tagging/err_source_text/dev/session_405fe47b.seglst.json
13
+ /home/taejinp/projects/update_llm_speaker_tagging/llm_speaker_tagging/SLT-Task2-Post-ASR-Speaker-Tagging/err_source_text/dev/session_7fe82ea3.seglst.json
seglst_files/err_dev.src.seglst.json ADDED
The diff for this file is too large to render. See raw diff