shaipeerms committed • Commit aaef8e0 • 1 parent: f4289e9
init challenge code

Changed files:
- README.md +4 -4
- app.py +168 -0
- content.py +58 -0
- requirements.txt +7 -0
- server.py +149 -0
- validation.py +79 -0
README.md
CHANGED
@@ -1,12 +1,12 @@
 ---
 title: CHiME8Challenge
-emoji:
-colorFrom:
-colorTo:
+emoji: π
+colorFrom: blue
+colorTo: red
 sdk: gradio
 sdk_version: 4.21.0
 app_file: app.py
 pinned: false
 ---
 
-
+# CHiME8 Challenge
app.py
ADDED
@@ -0,0 +1,168 @@
import os
import logging

import pandas as pd
import gradio as gr
from gradio.themes.utils.sizes import text_md

from content import (HEADER_MARKDOWN, LEADERBOARD_TAB_TITLE_MARKDOWN, SUBMISSION_TAB_TITLE_MARKDOWN,
                     MY_SUBMISSIONS_TAB_TITLE_MARKDOWN)
from validation import validate_zip

from server import LeaderboardServer

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')

lb_server = LeaderboardServer()

LEADERBOARD_TYPES = ['NOTSOFAR-SC', 'NOTSOFAR-MC', 'DASR-Constrained-LM', 'DASR-Unconstrained-LM']
MAX_SUBMISSIONS_PER_24H = 5


with (gr.Blocks(theme=gr.themes.Soft(text_size=text_md), css="footer {visibility: hidden}") as main):
    app_state = gr.State({})

    with gr.Row():
        with gr.Row():
            gr.Markdown(HEADER_MARKDOWN)

    with gr.Row():
        # Leaderboards Tab #
        ####################
        with gr.Tab('Leaderboards') as leaderboards_tab:
            gr.Markdown(LEADERBOARD_TAB_TITLE_MARKDOWN)
            with gr.Row():
                def populate_leaderboard(leaderboard_type):
                    leaderboard_df = lb_server.get_leaderboard(submission_type=leaderboard_type)
                    if leaderboard_df.empty:
                        return pd.DataFrame(columns=['No submissions yet'])
                    return leaderboard_df

                for idx, tab_name in enumerate(LEADERBOARD_TYPES):
                    with gr.Tab(tab_name) as leaderboard_tab:
                        leaderboard_table = gr.DataFrame(populate_leaderboard(tab_name)) if idx == 0 else gr.DataFrame(pd.DataFrame(columns=['No submissions yet']))
                        leaderboard_tab.select(fn=populate_leaderboard,
                                               inputs=[gr.Text(tab_name, visible=False)],
                                               outputs=[leaderboard_table])
                        leaderboard_table.change(fn=populate_leaderboard, inputs=[gr.Text(tab_name, visible=False)],
                                                 outputs=[leaderboard_table])

        # Submission Tab #
        ##################
        with gr.Tab('Submission'):
            with gr.Column():
                def on_submit_pressed():
                    return gr.update(value='Processing submission...', interactive=False)

                def validate_submission_inputs(team_name, submission_zip, submission_type, token):
                    if not team_name or not submission_zip or not submission_type:
                        raise ValueError('Please fill in all fields')
                    if not os.path.exists(submission_zip):
                        raise ValueError('File does not exist')
                    if not submission_zip.endswith('.zip'):
                        raise ValueError('File must be a zip')
                    if not token:
                        raise ValueError('Please insert a valid Hugging Face token')

                def process_submission(team_name, submission_zip, submission_type, description,
                                       app_state, request: gr.Request):
                    logging.info(f'{team_name}: new submission for track: {submission_type}')
                    try:
                        token = app_state.get('hf_token')
                        validate_submission_inputs(team_name, submission_zip, submission_type, token)
                        validate_zip(submission_type, submission_zip)
                    except ValueError as err:
                        gr.Warning(str(err))
                        return

                    metadata = {'challenge_name': 'NOTSOFAR1',
                                'team_name': team_name,
                                'submission_type': submission_type,
                                'description': description,
                                'token': token,
                                'file_name': os.path.basename(submission_zip),
                                'file_size_mb': os.path.getsize(submission_zip) / 1024 / 1024,
                                'ip': request.client.host}
                    try:
                        gr.Info('Processing submission...')
                        response = lb_server.add_submission(token=token, file_path=submission_zip, metadata=metadata)
                        if 'error' in response:
                            gr.Warning(f'Failed to process submission - {response["error"]}')
                        else:
                            gr.Info('Done processing submission')
                    except Exception as e:
                        gr.Warning(f'Submission failed to upload - {e}')

                def on_submit_done():
                    return gr.update(value='Submit', interactive=True)

                gr.Markdown(SUBMISSION_TAB_TITLE_MARKDOWN)
                submission_team_name_tb = gr.Textbox(label='Team Name')
                submission_file_path = gr.File(label='Upload your results', type='filepath')
                submission_type_radio = gr.Radio(label='Submission Track', choices=LEADERBOARD_TYPES)
                with gr.Row():
                    hf_token_tb = gr.Textbox(label='Token', type='password')
                    submissions_24h_txt = gr.Textbox(label='Submissions 24h', value='')
                description_tb = gr.Textbox(label='Description', type='text')
                submission_btn = gr.Button(value='Submit')

                submission_btn.click(
                    fn=on_submit_pressed,
                    outputs=[submission_btn]
                ).then(
                    fn=process_submission,
                    inputs=[submission_team_name_tb, submission_file_path,
                            submission_type_radio, description_tb, app_state]
                ).then(
                    fn=on_submit_done,
                    outputs=[submission_btn]
                )

        # My Submissions Tab #
        ######################
        with gr.Tab('My Submissions') as my_submissions_tab:
            def on_my_submissions_tab_select(app_state):
                hf_token = app_state.get('hf_token')
                if not hf_token:
                    return pd.DataFrame(columns=['Please insert your Hugging Face token'])
                submissions = lb_server.get_submissions_by_hf_token(hf_token=hf_token)
                if submissions.empty:
                    submissions = pd.DataFrame(columns=['No submissions yet'])
                return submissions

            gr.Markdown(MY_SUBMISSIONS_TAB_TITLE_MARKDOWN)
            my_submissions_table = gr.DataFrame()

            my_submissions_tab.select(fn=on_my_submissions_tab_select, inputs=[app_state],
                                      outputs=[my_submissions_table])
            my_submissions_token_tb = gr.Textbox(label='Token', type='password')

    # Token Insertion #
    ###################
    with gr.Row():
        def on_token_insert(hf_token, app_state):
            gr.Info(f'Verifying token...')
            submission_count = lb_server.get_submission_count_last_24_hours(hf_token=hf_token)
            if submission_count is None:
                # Invalid token
                app_state['hf_token'] = None
                submissions_24h_str = ''
                team_submissions_df = pd.DataFrame(columns=['Invalid Token'])
                gr.Warning('Invalid token')

            else:
                app_state['hf_token'] = hf_token
                submissions_24h_str = f'{submission_count}/{MAX_SUBMISSIONS_PER_24H}'
                team_submissions_df = lb_server.get_submissions_by_hf_token(hf_token=hf_token)
                if team_submissions_df.empty:
                    team_submissions_df = pd.DataFrame(columns=['No submissions yet'])
                gr.Info('Token verified!')

            return app_state, team_submissions_df, submissions_24h_str

        hf_token_tb.change(fn=on_token_insert, inputs=[hf_token_tb, app_state],
                           outputs=[app_state, my_submissions_table, submissions_24h_txt])
        my_submissions_token_tb.change(fn=on_token_insert, inputs=[my_submissions_token_tb, app_state],
                                       outputs=[app_state, my_submissions_table, submissions_24h_txt])

main.launch()
content.py
ADDED
@@ -0,0 +1,58 @@
"""
This file contains the text content for the leaderboard client.
"""


HEADER_MARKDOWN = """
# CHiME-8 Leaderboard
In collaboration with the CHiME-8 Challenge, the NOTSOFAR team is proud to host the official leaderboard for the three tasks this year.\n
For details, visit:
1. [DASR](https://www.chimechallenge.org/current/task1/index)
2. [NOTSOFAR](https://www.chimechallenge.org/current/task2/index)
3. [MMCSG](https://www.chimechallenge.org/current/task3/index)


### DASR and NOTSOFAR - the scientific story
Both tasks focus on distant automatic speech recognition and speaker diarization, offering a fundamental comparison
among different system designs:
- Single-channel (SC), 1 device (NOTSOFAR-SC)
- Multi-channel (MC), known-geometry, 1 device (NOTSOFAR-MC)
- Multi-channel (MC), geometry-agnostic, multiple devices (DASR-Constrained-LM and DASR-Unconstrained-LM)

Featured in both tasks, the NOTSOFAR recorded meeting dataset is leveraged as a common benchmark:
each geometry-agnostic MC system submitted to DASR tracks (constrained or not) will also be **automatically submitted**
to the known-geometry NOTSOFAR-MC track. These entries will be marked with "DASR" to denote their origin.
"""


LEADERBOARD_TAB_TITLE_MARKDOWN = """
## Leaderboards for CHiME-8 Tracks
"""


SUBMISSION_TAB_TITLE_MARKDOWN = """
## Submission

To submit your results, please fill in the form below.

- *Team Name:* The name of your team, as it will appear on the leaderboard
- *Results:* Results zip file to submit
- *Submission track:* The track to submit results to
- *Token:* Your Hugging Face token
- *Description:* Short description of your submission (optional)

**Hugging Face tokens:** To create a token, go to your profile settings > Access Tokens > New Token.
Name the token and give it a write role, then copy the token and paste it in the field below.\n
**Team creation:** Upon the first submission, your team name is associated with your Hugging Face user account.
Any token generated by your account can be used. All team members should use this specific user's token for
future submissions.

New tokens can be created by the team member who initially linked the team to the token.
"""


MY_SUBMISSIONS_TAB_TITLE_MARKDOWN = """
## My Submissions

To view all submissions, please enter the Hugging Face token associated with your team in the field below
"""
requirements.txt
ADDED
@@ -0,0 +1,7 @@
gradio
pandas
azure-cosmos
huggingface_hub
requests
Pyarrow
tabulate
server.py
ADDED
@@ -0,0 +1,149 @@
import os
import logging
from typing import Optional

import pandas as pd
import requests


class LeaderboardServer:
    def __init__(self):
        self._LOG = logging.getLogger('leaderboard_server')
        self._server_address = os.environ['LEADERBOARD_SERVER_ADDRESS']

    def get_leaderboard(self, submission_type: str) -> pd.DataFrame:
        """
        Gets the leaderboard of the given submission type
        Args:
            submission_type: the type of the submission to get the leaderboard of:
                'SC' / 'MC-specific' / 'MC-agnostic' / 'MC-agnostic-all'
        """
        self._LOG.info(f'Getting leaderboard for submission type: {submission_type}')
        endpoint = f'{self._server_address}/leaderboard'
        submission_type = submission_type.lower().replace('-', '_')
        response = requests.get(endpoint, params={'submission_type': submission_type})
        if response.status_code != 200:
            self._LOG.error(f'Error while fetching leaderboard, status code: {response.status_code}, '
                            f'response: {response.text}, endpoint: {endpoint}')
            return pd.DataFrame()
        return pd.DataFrame(response.json())

    def get_submissions_by_hf_token(self, hf_token: str) -> pd.DataFrame:
        """
        Gets the submissions of the given hf token
        Args:
            hf_token: the hf token to get the submissions of
        """
        self._LOG.info(f'Getting submissions for hf token: {hf_token}')
        endpoint = f'{self._server_address}/submissions'
        response = requests.get(endpoint, params={'token': hf_token})
        if response.status_code != 200:
            self._LOG.error(f'Error while fetching submissions, status code: {response.status_code}, '
                            f'response: {response.text}, endpoint: {endpoint}')
            return pd.DataFrame()
        return pd.DataFrame(response.json())

    def is_hf_token_valid(self, hf_token: str) -> Optional[bool]:
        """
        Validates the given hf token
        Args:
            hf_token: the hf token to validate
        """
        self._LOG.info(f'Validating hf token: {hf_token}')
        endpoint = f'{self._server_address}/validate_hf_token'
        response = requests.get(endpoint, params={'token': hf_token})
        if response.status_code != 200:
            self._LOG.error(f'Error while validating hf token, status code: {response.status_code}, '
                            f'response: {response.text}, endpoint: {endpoint}')
            return None
        return response.json()['valid']

    def get_submission_count_last_24_hours(self, hf_token: str) -> Optional[int]:
        """
        Gets the number of submissions of the given hf token in the last 24 hours
        Args:
            hf_token: the hf token to get the submissions count of
        """
        self._LOG.info(f'Getting submissions count for hf token: {hf_token} in the last 24 hours')
        endpoint = f'{self._server_address}/submission_count_last_24_hours'
        response = requests.get(endpoint, params={'token': hf_token})
        if response.status_code != 200:
            self._LOG.error(f'Error while fetching submissions count, status code: {response.status_code}, '
                            f'response: {response.text}, endpoint: {endpoint}')
            return None
        return int(response.json()['count'])

    def add_submission(self, token: str, file_path: str, metadata: dict) -> dict:
        """
        Adds a submission to the leaderboard based on the given file and metadata
        Args:
            token: the token of the team
            file_path: the path of the file to submit
            metadata: the metadata of the submission, structure:
                {
                    'challenge_name': 'NOTSOFAR1',
                    'team_name': (str),
                    'submission_type': (str),
                    'token': (str),
                    'file_name': (str),
                    'file_size_mb': (int),
                    'ip': (str) xxx.xxx.xxx.xxx
                }
        """
        self._LOG.info(f'Adding submission for team: {metadata["team_name"]}, '
                       f'submission type: {metadata["submission_type"]}')
        endpoint = f'{self._server_address}/add_submission'
        metadata['token'] = token
        metadata['submission_type'] = metadata['submission_type'].lower().replace('-', '_')
        with open(file_path, 'rb') as payload_file:
            files = {'zip_file': payload_file}
            response = requests.post(endpoint, files=files, params=metadata, timeout=600)
        if response.status_code != 200:
            self._LOG.error(f'Error while adding submission, status code: {int(response.status_code)}, '
                            f'response: {response.text}, endpoint: {endpoint}')
            return dict(error=response.json()['message'])
        return response.json()


def test_server():
    """
    Basic server tests for the leaderboard server
    """
    logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')

    server = LeaderboardServer()
    hf_token = str(os.environ['HF_TOKEN'])
    print('leaderboard:\n', server.get_leaderboard('notsofar_mc'))
    print('submissions by hf token:\n', server.get_submissions_by_hf_token(hf_token))
    print('is hf token valid:\n', server.is_hf_token_valid(hf_token))
    print('is hf token valid:\n', server.is_hf_token_valid(hf_token + '1'))
    print('add_submission:\n', server.add_submission(
        token=hf_token,
        file_path=fr"C:\Users\shaipeer\Downloads\submissions\notsofar_submission.zip",
        metadata={
            'challenge_name': 'NOTSOFAR1',
            'team_name': 'NOTSOFAR Test Team',
            'submission_type': 'notsofar_mc',
            'description': 'Test NOTSOFAR submission',
            'token': hf_token,
            'file_name': 'notsofar_submission.zip',
            'file_size_mb': 10,
            'ip': '127.0.0.1'
        }))
    print('add_submission:\n', server.add_submission(
        token=hf_token,
        file_path=fr"C:\Users\shaipeer\Downloads\submissions\chime_submission.zip",
        metadata={
            'challenge_name': 'NOTSOFAR1',
            'team_name': 'Chime Test Team',
            'submission_type': 'dasr_unconstrained_lm',
            'description': 'Test chime submission',
            'token': hf_token,
            'file_name': 'chime_submission.zip',
            'file_size_mb': 10,
            'ip': '127.0.0.1'
        }))


if __name__ == '__main__':
    test_server()
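Note that LeaderboardServer reads the backend URL from the LEADERBOARD_SERVER_ADDRESS environment variable in __init__, so the variable must be set before the class is instantiated (app.py constructs it at import time). A minimal usage sketch, assuming validation has already passed and using a hypothetical placeholder URL rather than the real backend endpoint:

    # Sketch only: the backend address below is a hypothetical placeholder.
    import os
    os.environ['LEADERBOARD_SERVER_ADDRESS'] = 'https://example-leaderboard-backend.org'  # hypothetical

    from server import LeaderboardServer

    server = LeaderboardServer()
    # Track names are normalized internally ('NOTSOFAR-MC' -> 'notsofar_mc') before the request.
    leaderboard_df = server.get_leaderboard(submission_type='NOTSOFAR-MC')
    print(leaderboard_df.head())
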
validation.py
ADDED
@@ -0,0 +1,79 @@
import json
from pathlib import Path
from zipfile import ZipFile
from typing import List, Dict, Any
from tempfile import TemporaryDirectory


def validate_zip(submission_track: str, submission_zip: str):
    """
    Validates the submission format and contents
    Args:
        submission_track: the track of the submission
        submission_zip: path to the submission zip file
    Raises:
        ValueError: if the submission zip is invalid

    """
    with TemporaryDirectory() as temp_dir:
        with ZipFile(submission_zip, 'r') as submission_zip_file:
            submission_zip_file.extractall(temp_dir)
        submission_dir = Path(temp_dir)
        if submission_track in ['NOTSOFAR-SC', 'NOTSOFAR-MC']:
            validate_notsofar_submission(submission_dir=submission_dir)
        elif submission_track in ['DASR-Constrained-LM', 'DASR-Unconstrained-LM']:
            validate_dasr_submission(submission_dir=submission_dir)
        else:
            raise ValueError(f'Invalid submission track: {submission_track}')


def validate_notsofar_submission(submission_dir: Path):
    """
    Validates NOTSOFAR submission format and contents
    Args:
        submission_dir: path to the submission directory
    Raises:
        ValueError: if the submission zip is invalid
    """
    submission_file_names = ['tc_orc_wer_hyp.json', 'tcp_wer_hyp.json']
    fields = ['session_id', 'words', 'speaker', 'start_time', 'end_time']

    for file_name in submission_file_names:
        file_path = submission_dir / file_name
        if not file_path.exists():
            raise ValueError(f'Missing {file_name}')
        with open(file_path, 'r') as json_file:
            json_data: List[Dict[str, Any]] = json.load(json_file)
            if not isinstance(json_data, list):
                raise ValueError(f'Invalid `{file_name}` format, expecting a list of entries')
            for data in json_data:
                if not all(field in data for field in fields):
                    raise ValueError(f'Invalid `{file_name}` format, fields: {fields} are required in each entry')


def validate_dasr_submission(submission_dir: Path):
    """
    Validates DASR submission format and contents
    Args:
        submission_dir: path to the submission directory
    Raises:
        ValueError: if the submission zip is invalid

    """
    submission_file_names = ['chime6.json', 'dipco.json', 'mixer6.json', 'notsofar1.json']
    fields = ['session_id', 'words', 'speaker', 'start_time', 'end_time']

    if not (submission_dir / 'dev').exists():
        raise ValueError('Missing dev directory, expecting a directory named `dev` with the submission files in it.')

    for file_name in submission_file_names:
        file_path = submission_dir / 'dev' / file_name
        if not file_path.exists():
            raise ValueError(f'Missing {file_name}')
        with open(file_path, 'r') as json_file:
            json_data: List[Dict[str, Any]] = json.load(json_file)
            if not isinstance(json_data, list):
                raise ValueError(f'Invalid `{file_name}` format, expecting a list of entries')
            for data in json_data:
                if not all(field in data for field in fields):
                    raise ValueError(f'Invalid `{file_name}` format, fields: {fields} are required in each entry')
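The validators above only check that each expected JSON file is a list of entries containing the five required fields; they do not score the content. A minimal sketch of a NOTSOFAR-style zip that passes validate_zip, using illustrative (hypothetical) field values:

    # Sketch only: builds a throwaway zip with dummy entries and runs the validator on it.
    import json
    from pathlib import Path
    from tempfile import TemporaryDirectory
    from zipfile import ZipFile

    from validation import validate_zip

    entry = {'session_id': 'dev_session_0', 'words': 'hello world',
             'speaker': 'spk0', 'start_time': 0.0, 'end_time': 1.2}  # hypothetical values

    with TemporaryDirectory() as tmp:
        zip_path = Path(tmp) / 'submission.zip'
        with ZipFile(zip_path, 'w') as zf:
            # NOTSOFAR tracks expect both hypothesis files at the zip root.
            for name in ['tc_orc_wer_hyp.json', 'tcp_wer_hyp.json']:
                zf.writestr(name, json.dumps([entry]))
        validate_zip('NOTSOFAR-SC', str(zip_path))  # raises ValueError if the format is invalid

For the DASR tracks, the same entry format applies, but the four dataset files (chime6.json, dipco.json, mixer6.json, notsofar1.json) must sit inside a `dev` directory in the zip.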