s-conia committed on
Commit
3abad7e
·
1 Parent(s): 72447c4

Initial commit.

Browse files
Files changed (3) hide show
  1. app.py +460 -0
  2. data/scores.jsonl +53 -0
  3. requirements.txt +2 -0
app.py ADDED
@@ -0,0 +1,460 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ from typing import Dict, List
3
+
4
+ import pandas as pd
5
+ import gradio as gr
6
+
7
+
8
class LeaderboardData:
    """Container for leaderboard submissions loaded from a jsonl file.

    Attributes:
        df: Flattened submission data, one row per submission, plus a
            computed integer "rank" column (rank 1 = best overall score).
        column_categories: Mapping of the column groups ("team_info",
            "system_details", "scores") to the column names in ``df``.
    """

    def __init__(self, data_path: str):
        """Initialize leaderboard data from a jsonl file.

        Args:
            data_path: Path to the jsonl file containing submission data.
        """
        self.df = self._load_data(data_path)

        # Absolute ranking: higher overall scores get lower rank numbers
        # (rank 1 is best). Missing scores count as 0 for ranking purposes.
        main_overall = next(
            (c for c in self.df.columns if c.endswith("overall_overall_score")),
            None,
        )
        if main_overall is None:
            self.df["rank"] = None
        else:
            ranked = self.df[main_overall].fillna(0).rank(
                method="min", ascending=False
            )
            self.df["rank"] = ranked.astype(int)

        self.column_categories = self._categorize_columns()

    def _load_data(self, data_path: str) -> pd.DataFrame:
        """Load and flatten submission data from a jsonl file.

        Every score object is expanded into ``<lang>_meta_score``,
        ``<lang>_comet_score``, and ``<lang>_overall_score`` columns, each
        value rounded to 2 decimal digits (missing values stay None).

        Args:
            data_path: Path to the data file.

        Returns:
            Processed DataFrame with one row per submission.
        """
        records = []
        with open(data_path) as fh:
            for raw_line in fh:
                # Each jsonl line maps a submission id to its payload.
                for _submission_id, entry in json.loads(raw_line).items():
                    records.append(self._flatten_submission(entry))
        return pd.DataFrame.from_records(records)

    @staticmethod
    def _flatten_submission(entry: dict) -> dict:
        """Turn one submission payload into a flat record for the DataFrame."""
        meta = entry["metadata"]
        record = {
            "team_name": meta["team_name"],
            "system_name": meta["submission_name"],
            # Emoji markers: a non-empty cell means the system HAS the property.
            "uses_gold": "🟠" if meta["uses_gold"] else "",
            "uses_rag": "🔍" if meta["uses_rag"] else "",
            "uses_llm": "🤖" if meta["llm_name"] != "N/A" else "",
            "llm_name": meta["llm_name"],
            "is_finetuned": "📚" if meta["is_finetuned"] else "",
        }

        # Per-language columns in sorted name order, "overall" always last,
        # so every row produces the same column layout.
        languages = sorted(lang for lang in entry["scores"] if lang != "overall")
        languages.append("overall")

        for lang in languages:
            scores = entry["scores"].get(lang, {})
            for metric in ("meta_score", "comet_score", "overall_score"):
                value = scores.get(metric)
                record[f"{lang}_{metric}"] = (
                    None if value is None else round(value, 2)
                )
        return record

    def _categorize_columns(self) -> Dict[str, List[str]]:
        """Categorize DataFrame columns into team info, system details, scores.

        Returns:
            Dictionary mapping column categories to column names.
        """
        if self.df.empty:
            return {"team_info": [], "system_details": [], "scores": []}

        team_info = ["team_name", "system_name"]
        system_details = [
            "uses_gold",
            "uses_rag",
            "uses_llm",
            "llm_name",
            "is_finetuned",
        ]
        # Everything that is neither metadata nor the rank is a score column.
        known = set(team_info) | set(system_details) | {"rank"}
        scores = [c for c in self.df.columns if c not in known]
        return {
            "team_info": team_info,
            "system_details": system_details,
            "scores": scores,
        }

    def get_team_names(self) -> List[str]:
        """Get list of unique team names (first-appearance order)."""
        return self.df["team_name"].drop_duplicates().tolist()

    def get_system_names(self) -> List[str]:
        """Get list of unique system names (first-appearance order)."""
        return self.df["system_name"].drop_duplicates().tolist()

    def get_llm_names(self) -> List[str]:
        """Get list of unique LLM names (first-appearance order)."""
        return self.df["llm_name"].drop_duplicates().tolist()
136
+
137
+
138
def leaderboard_view(
    selected_team,
    selected_system,
    selected_llm,
    score_type,
    leaderboard_data,
    uses_gold,
    uses_rag,
    uses_llm,
    is_finetuned,
):
    """Build the DataFrame displayed in one leaderboard tab.

    Filters the leaderboard based on dropdown selections and checkbox values,
    keeps only the score columns matching ``score_type`` (meta, comet, or
    overall), and sorts the rows (descending) by the cross-language
    ``overall_<score_type>_score`` column when present. Score columns are then
    renamed to show only the language code, the other columns get
    user-friendly names, and the absolute ranking column (based on the
    overall score) is moved to the front.

    Args:
        selected_team: Team dropdown value; "All" disables the filter.
        selected_system: System dropdown value; "All" disables the filter.
        selected_llm: LLM dropdown value; "All" disables the filter.
        score_type: One of "meta", "comet", or "overall".
        leaderboard_data: Object exposing ``.df`` and ``.column_categories``
            (a LeaderboardData instance).
        uses_gold: If True, keep only systems that do NOT use gold data.
        uses_rag: If True, keep only systems that do NOT use RAG.
        uses_llm: If True, keep only systems that do NOT use an LLM.
        is_finetuned: If True, keep only systems that are NOT finetuned.

    Returns:
        The filtered, sorted, and renamed DataFrame to display.
    """
    df = leaderboard_data.df.copy()

    # Apply dropdown filtering (if not "All").
    if selected_team != "All":
        df = df[df["team_name"] == selected_team]
    if selected_system != "All":
        df = df[df["system_name"] == selected_system]
    if selected_llm != "All":
        df = df[df["llm_name"] == selected_llm]

    # Apply checkbox filtering: a checked box keeps only the systems whose
    # emoji marker cell is empty, i.e. those WITHOUT the property.
    if uses_gold:
        df = df[df["uses_gold"] == ""]  # Show only systems that do not use gold.
    if uses_rag:
        df = df[df["uses_rag"] == ""]  # Show only systems that do not use RAG.
    if uses_llm:
        df = df[df["uses_llm"] == ""]  # Show only systems that do not use LLM.
    if is_finetuned:
        df = df[df["is_finetuned"] == ""]  # Show only systems that are not finetuned.

    # Build columns: add the ranking column first, then team info and system details.
    cols = []
    if "rank" in df.columns:
        cols.append("rank")
    cols.extend(leaderboard_data.column_categories["team_info"])
    cols.extend(leaderboard_data.column_categories["system_details"])

    # Include only score columns ending with the desired score_type.
    score_cols = [
        col
        for col in leaderboard_data.column_categories["scores"]
        if col.endswith(f"_{score_type}_score")
    ]
    cols.extend(score_cols)

    df = df[cols]

    # Sort by the cross-language column for the requested score type.
    # Bug fix: this previously always matched "overall_overall_score", so the
    # meta and comet tabs were never sorted (their views do not contain that
    # column); for the overall tab the behavior is unchanged.
    sort_col = next(
        (col for col in df.columns if col.endswith(f"overall_{score_type}_score")),
        None,
    )
    if sort_col is not None:
        df = df.sort_values(by=sort_col, ascending=False)

    # Rename columns: for score columns, strip the suffix; for others, use friendly names.
    friendly_names = {
        "rank": "Rank",
        "team_name": "Team",
        "system_name": "System",
        "uses_gold": "Uses Gold",
        "uses_rag": "Uses RAG",
        "uses_llm": "Uses LLM",
        "llm_name": "LLM Name",
        "is_finetuned": "Finetuned",
    }
    new_columns = {}
    for col in df.columns:
        if col.endswith(f"_{score_type}_score"):
            new_columns[col] = col.replace(f"_{score_type}_score", "")
        elif col in friendly_names:
            new_columns[col] = friendly_names[col]
    df = df.rename(columns=new_columns)

    # Reorder columns to ensure "Rank" is the first column.
    if "Rank" in df.columns:
        new_order = ["Rank"] + [c for c in df.columns if c != "Rank"]
        df = df[new_order]

    return df
227
+
228
+
229
def update_all(
    selected_team,
    selected_system,
    selected_llm,
    leaderboard_data,
    uses_gold,
    uses_rag,
    uses_llm,
    is_finetuned,
):
    """Recompute all three leaderboard tabs with identical filters.

    Returns:
        A ``(meta_df, comet_df, overall_df)`` tuple, one filtered DataFrame
        per score type, each produced by :func:`leaderboard_view`.
    """
    # The three views differ only in the score type, so build them in one pass.
    meta_df, comet_df, overall_df = (
        leaderboard_view(
            selected_team,
            selected_system,
            selected_llm,
            score_type,
            leaderboard_data,
            uses_gold,
            uses_rag,
            uses_llm,
            is_finetuned,
        )
        for score_type in ("meta", "comet", "overall")
    )
    return meta_df, comet_df, overall_df
274
+
275
+
276
def leaderboard_app(data_path: str):
    """Create a leaderboard app with three tabs for meta, comet, and overall scores.

    Args:
        data_path: Path to the scores jsonl file passed to LeaderboardData.

    Returns:
        The assembled ``gr.Blocks`` demo (not yet launched).
    """
    leaderboard_data = LeaderboardData(data_path)

    # Calculate initial values first, with no filters active, so the tables
    # are populated on first render.
    initial_meta, initial_comet, initial_overall = update_all(
        "All",
        "All",
        "All",
        leaderboard_data,
        uses_gold=False,
        uses_rag=False,
        uses_llm=False,
        is_finetuned=False,
    )

    # Prepend "All" to dropdown choices ("All" disables that filter).
    team_choices = ["All"] + leaderboard_data.get_team_names()
    system_choices = ["All"] + leaderboard_data.get_system_names()
    llm_choices = ["All"] + leaderboard_data.get_llm_names()

    theme = gr.themes.Ocean(
        secondary_hue="indigo",
        neutral_hue="slate",
        spacing_size="lg",
        radius_size="md",
    )

    with gr.Blocks(theme=theme) as demo:

        # Static page header: overview, task description, scoring, legend.
        gr.Markdown(
            """
            # Entity-Aware Machine Translation Leaderboard

            ## Overview
            This leaderboard showcases the performance of various systems on the **EA-MT shared task**, which has been organized as part of the **[SemEval 2025](https://semeval.github.io/2025/)** workshop.
            * **The results are still provisional and subject to change.**

            ## Task Description
            The task is to translate a given input sentence from the source language (English) to the target language, where the input sentence contains named entities that may be challenging for machine translation systems to handle. The named entities may be entities that are rare, ambiguous, or unknown to the machine translation system. The task is to develop machine translation systems that can accurately translate such named entities in the input sentence to the target language.

            * Learn more about the task on the **[EA-MT shared task page](https://sapienzanlp.github.io/ea-mt/)**.

            ### Scoring
            The leaderboard is based on three main scores:
            - **M-ETA Score**: A score that evaluates the translation quality of named entities in the input sentence.
            - **COMET Score**: A score that evaluates the translation quality at the sentence level.
            - **Overall Score**: The harmonic mean of the M-ETA and COMET scores.

            ### Legend
            - 🟠: Uses *gold* data, i.e., the Wikidata ID or information derived from it.
            - 🔍: Uses *RAG* (Retrieval-Augmented Generation) for named entity translation.
            - 🤖: Uses an *LLM* (Large Language Model) for named entity translation.
            - 📚: The system (LLM and/or MT model) is *finetuned* on additional data.
            """
        )

        # Divider.
        gr.Markdown("---")

        # Filters and controls.
        gr.Markdown(
            """## Filters and Controls
            Use the dropdowns and checkboxes to filter the leaderboard scores."""
        )

        # Controls at the top.
        # NOTE: checking a box HIDES the systems that have the property
        # (the filtering in leaderboard_view keeps empty marker cells).
        uses_gold_checkbox = gr.Checkbox(
            label="🟠 : Display only those systems that do not use gold information.",
            value=False,
        )
        uses_rag_checkbox = gr.Checkbox(
            label="🔍 : Display only those systems that do not use RAG.",
            value=False,
        )
        uses_llm_checkbox = gr.Checkbox(
            label="🤖 : Display only those systems that do not use LLM.",
            value=False,
        )
        is_finetuned_checkbox = gr.Checkbox(
            label="📚 : Display only those systems that are not finetuned.",
            value=False,
        )

        # One dropdown per filter dimension, side by side.
        with gr.Row():
            with gr.Column(scale=1):
                team_dropdown = gr.Dropdown(
                    choices=team_choices, label="Team Name", value="All"
                )
            with gr.Column(scale=1):
                system_dropdown = gr.Dropdown(
                    choices=system_choices, label="System Name", value="All"
                )
            with gr.Column(scale=1):
                llm_dropdown = gr.Dropdown(
                    choices=llm_choices, label="LLM Name", value="All"
                )

        # Divider.
        gr.Markdown("---")

        # Tabs for meta, comet, and overall scores.
        gr.Markdown(
            """
            ## Leaderboard Scores
            You can view the leaderboard scores for each system based on the following metrics:
            - **M-ETA Score**: A score that evaluates the translation quality of named entities in the input sentence.
            - **COMET Score**: A score that evaluates the translation quality at the sentence level.
            - **Overall Score**: The harmonic mean of the M-ETA and COMET scores.
            Switch between the tabs to view the scores for each metric.

            > **Note**: You can sort the leaderboard by clicking on the column headers. For example, click on the "it_IT" column to sort by the Italian language scores.
            """
        )

        # Three result tables, one per metric; the "Overall Score" tab is first.
        # NOTE(review): interactive=True lets users edit cells in the browser;
        # presumably intended only to enable client-side sorting — confirm.
        with gr.Tabs() as tabs:
            with gr.TabItem("Overall Score"):
                overall_table = gr.Dataframe(
                    value=initial_overall,  # Set initial value here.
                    label="Overall Score Leaderboard",
                    interactive=True,
                )
            with gr.TabItem("M-ETA Score"):
                meta_table = gr.Dataframe(
                    value=initial_meta,  # Set initial value here.
                    label="M-ETA Score Leaderboard",
                    interactive=True,
                )
            with gr.TabItem("COMET Score"):
                comet_table = gr.Dataframe(
                    value=initial_comet,  # Set initial value here.
                    label="Comet Score Leaderboard",
                    interactive=True,
                )

        def update_callback(
            selected_team,
            selected_system,
            selected_llm,
            uses_gold,
            uses_rag,
            uses_llm,
            is_finetuned,
        ):
            # Closure over leaderboard_data: adapts the UI inputs to update_all.
            return update_all(
                selected_team,
                selected_system,
                selected_llm,
                leaderboard_data,
                uses_gold,
                uses_rag,
                uses_llm,
                is_finetuned,
            )

        # Wire every control's change event to refresh all three tables.
        # Output order must match update_all's return: (meta, comet, overall).
        for control in [
            team_dropdown,
            system_dropdown,
            llm_dropdown,
            uses_gold_checkbox,
            uses_rag_checkbox,
            uses_llm_checkbox,
            is_finetuned_checkbox,
        ]:
            control.change(
                update_callback,
                inputs=[
                    team_dropdown,
                    system_dropdown,
                    llm_dropdown,
                    uses_gold_checkbox,
                    uses_rag_checkbox,
                    uses_llm_checkbox,
                    is_finetuned_checkbox,
                ],
                outputs=[meta_table, comet_table, overall_table],
            )

    return demo
456
+
457
+
458
if __name__ == "__main__":
    # Bug fix: the commit ships the scores file at data/scores.jsonl (see the
    # repository file list), but the app previously pointed at
    # "semeval/submissions/scores.jsonl", which does not exist in this repo
    # and would raise FileNotFoundError on startup.
    data_path = "data/scores.jsonl"
    leaderboard_app(data_path).launch()
data/scores.jsonl ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"225638": {"metadata": {"team_name": "YNU-HPCC", "email": "skyfurynowonline@icloud.com", "submission_name": "LLaMA + MT", "submission_description": "try another llm", "uses_gold": true, "uses_rag": false, "uses_llm": false, "llm_name": "Llama-3.3-70B-Instruct", "is_finetuned": false}, "scores": {"ko_KR": {"meta_score": 88.17394726485635, "comet_score": 94.903170802095, "overall_score": 91.41488861007092}, "th_TH": {"meta_score": 86.5100087032202, "comet_score": 93.42989604871515, "overall_score": 89.83689450606285}, "de_DE": {"meta_score": 82.59019741320627, "comet_score": 94.37710679890023, "overall_score": 88.091118486679}, "fr_FR": {"meta_score": 86.58737419945105, "comet_score": 93.77212552771259, "overall_score": 90.03664497660041}, "tr_TR": {"meta_score": 79.27565392354124, "comet_score": 94.08626482706862, "overall_score": 86.04831122247953}, "ar_AE": {"meta_score": 88.78381350340884, "comet_score": 94.3301959197181, "overall_score": 91.47300688418572}, "it_IT": {"meta_score": 89.87838367987446, "comet_score": 94.96415760843087, "overall_score": 92.35130542869926}, "es_ES": {"meta_score": 88.42263019857624, "comet_score": 95.27658856609601, "overall_score": 91.72174616762133}, "zh_TW": {"meta_score": 80.6445387881127, "comet_score": 94.23986344620879, "overall_score": 86.91375818515075}, "ja_JP": {"meta_score": 87.74471417384495, "comet_score": 95.68072684348424, "overall_score": 91.5410423141258}, "overall": {"meta_score": 85.86112618480924, "comet_score": 94.50600963884295, "overall_score": 89.94287167816755}}}}
2
+ {"226803": {"metadata": {"team_name": "Lunar", "email": "suzie_oh@korea.ac.kr", "submission_name": "LLaMA-RAFT-Plus", "submission_description": "(I have already submitted the Google Form for this submission ID, but I was not completely sure if I submitted the correct content, so I am submitting it once more. I appreciate your understanding.) We trained LLaMA-3.1-8B-Instruct using a multi-turn dialogue setup that incorporates function calls to: (1) identify key entities in the source text that require lookup via function calls (e.g., [search(\"The Great Gatsby\")]) and (2) retrieve their corresponding entities in the target language using the Wikipedia API. These retrieved entities are then leveraged to generate the final translation. The system is capable of performing searches at test time, and this version utilizes real-time search to evaluate its performance in real-world conditions. Additionally, it integrates a validation dataset during training to further enhance accuracy.", "uses_gold": false, "uses_rag": true, "uses_llm": false, "llm_name": "Llama-3.1-8B-Instruct", "is_finetuned": true}, "scores": {"ko_KR": {"meta_score": 61.49153876426604, "comet_score": 92.78515101927606, "overall_score": 73.96453370441374}, "th_TH": {"meta_score": 67.21787061212649, "comet_score": 90.16974140353088, "overall_score": 77.02026777289785}, "de_DE": {"meta_score": 59.54731109598366, "comet_score": 91.40155507572416, "overall_score": 72.11338478766199}, "fr_FR": {"meta_score": 67.15462031107045, "comet_score": 91.3326877611881, "overall_score": 77.39940873745869}, "tr_TR": {"meta_score": 71.495640509725, "comet_score": 93.62873369659216, "overall_score": 81.07883912266462}, "ar_AE": {"meta_score": 67.53903672751265, "comet_score": 91.44903267900682, "overall_score": 77.6961390858889}, "it_IT": {"meta_score": 73.57787367595135, "comet_score": 93.30533824454761, "overall_score": 82.27560234063249}, "es_ES": {"meta_score": 66.57924316223304, "comet_score": 93.01946911362057, 
"overall_score": 77.60922083422801}, "zh_TW": {"meta_score": 38.788112697800074, "comet_score": 88.96667605739724, "overall_score": 54.023015354445704}, "ja_JP": {"meta_score": 55.657791699295224, "comet_score": 92.11579698051152, "overall_score": 69.38942048250435}, "overall": {"meta_score": 62.9049039255964, "comet_score": 91.8174182031395, "overall_score": 74.25698322227962}}}}
3
+ {"221913": {"metadata": {"team_name": "sakura", "email": "alberto.poncelas@rakuten.com", "submission_name": "Rakuten7b-PO10", "submission_description": "Rakuten7b with Preference Optmization on paranames", "uses_gold": false, "uses_rag": false, "uses_llm": false, "llm_name": "Rakuten/RakutenAI-7B-chat", "is_finetuned": true}, "scores": {"ja_JP": {"meta_score": 29.502740798747062, "comet_score": 90.73745774552985, "overall_score": 44.52759940541613}}}}
4
+ {"226408": {"metadata": {"team_name": "SHEF", "email": "xyang138@sheffield.ac.uk", "submission_name": "LLaMA-Wiki", "submission_description": "A three-stage pipeline incorporating Chain-of-Thought (CoT) reasoning that extracts entities, generates queries for Wikidata search, and leverages returned candidate entity information to enhance LLM translation.", "uses_gold": false, "uses_rag": false, "uses_llm": false, "llm_name": "meta-llama/Llama-3.3-70B-Instruct", "is_finetuned": true}, "scores": {"de_DE": {"meta_score": 85.56841388699796, "comet_score": 92.82443383879772, "overall_score": 89.04885677651487}, "fr_FR": {"meta_score": 90.04574565416286, "comet_score": 91.92607256434756, "overall_score": 90.97619433769394}, "it_IT": {"meta_score": 93.01686936053353, "comet_score": 94.68152872449326, "overall_score": 93.84181727786786}, "es_ES": {"meta_score": 90.50206069689023, "comet_score": 93.90829785368602, "overall_score": 92.173721032759}}}}
5
+ {"226036": {"metadata": {"team_name": "YNU-HPCC", "email": "skyfurynowonline@icloud.com", "submission_name": "Qwen2.5-32B", "submission_description": "Modified to improve performance", "uses_gold": true, "uses_rag": false, "uses_llm": false, "llm_name": "Qwen2.5-32B", "is_finetuned": true}, "scores": {"ko_KR": {"meta_score": 88.17394726485635, "comet_score": 94.903170802095, "overall_score": 91.41488861007092}, "th_TH": {"meta_score": 86.5100087032202, "comet_score": 93.42989604871515, "overall_score": 89.83689450606285}, "de_DE": {"meta_score": 82.59019741320627, "comet_score": 94.37710679890023, "overall_score": 88.091118486679}, "fr_FR": {"meta_score": 86.58737419945105, "comet_score": 93.77212552771259, "overall_score": 90.03664497660041}, "tr_TR": {"meta_score": 79.27565392354124, "comet_score": 94.08626482706862, "overall_score": 86.04831122247953}, "ar_AE": {"meta_score": 88.78381350340884, "comet_score": 94.3301959197181, "overall_score": 91.47300688418572}, "it_IT": {"meta_score": 89.87838367987446, "comet_score": 94.96415760843087, "overall_score": 92.35130542869926}, "es_ES": {"meta_score": 88.42263019857624, "comet_score": 95.27658856609601, "overall_score": 91.72174616762133}, "zh_TW": {"meta_score": 80.6445387881127, "comet_score": 94.23986344620879, "overall_score": 86.91375818515075}, "ja_JP": {"meta_score": 87.74471417384495, "comet_score": 95.68072684348424, "overall_score": 91.5410423141258}, "overall": {"meta_score": 85.86112618480924, "comet_score": 94.50600963884295, "overall_score": 89.94287167816755}}}}
6
+ {"226713": {"metadata": {"team_name": "UAlberta", "email": "ning.shi@ualberta.ca", "submission_name": "WikiGPT4o", "submission_description": "We prompt a state-of-the-art language model with instructions designed to increase the model's attention in the named entity. We incorporate information from WikiData into the prompt to suggest a translation of the entity, and also leverage in context learning.", "uses_gold": true, "uses_rag": true, "uses_llm": false, "llm_name": "GPT-4o", "is_finetuned": false}, "scores": {"ko_KR": {"meta_score": 90.33844942935852, "comet_score": 95.59958734436329, "overall_score": 92.8945861387825}, "th_TH": {"meta_score": 89.96228604583696, "comet_score": 94.11520353859669, "overall_score": 91.99189842403622}, "de_DE": {"meta_score": 85.05786249149081, "comet_score": 94.27934431802504, "overall_score": 89.43152006720092}, "fr_FR": {"meta_score": 89.67978042086003, "comet_score": 94.25645109010419, "overall_score": 91.91117777691568}, "tr_TR": {"meta_score": 81.64542812430136, "comet_score": 95.82169847469216, "overall_score": 88.16735522226375}, "ar_AE": {"meta_score": 91.6648339564548, "comet_score": 94.86057546627976, "overall_score": 93.23532837741573}, "it_IT": {"meta_score": 91.74185955276579, "comet_score": 95.92362562550335, "overall_score": 93.78615126342946}, "es_ES": {"meta_score": 89.34057699512927, "comet_score": 95.29729041435961, "overall_score": 92.22284714553352}, "zh_TW": {"meta_score": 81.22346584330374, "comet_score": 94.27579471799041, "overall_score": 87.26426273975798}, "ja_JP": {"meta_score": 90.32889584964762, "comet_score": 95.79453747675768, "overall_score": 92.98146551518195}, "overall": {"meta_score": 88.09834387091489, "comet_score": 95.02241084666721, "overall_score": 91.38865926705178}}}}
7
+ {"225867": {"metadata": {"team_name": "SALT \ud83e\uddc2", "email": "voelker@informatik.uni-wuerzburg.de", "submission_name": "Salt-MT-Pipeline", "submission_description": "See description for Submission ID 226303. This removes the LLM post-processing (i.e. only DB retrieval and NLLB translation).", "uses_gold": false, "uses_rag": true, "uses_llm": false, "llm_name": "N/A", "is_finetuned": true}, "scores": {"ko_KR": {"meta_score": 74.24242424242425, "comet_score": 92.96608721351399, "overall_score": 82.5559372183359}, "th_TH": {"meta_score": 65.5932695097186, "comet_score": 90.6400655477608, "overall_score": 76.10895902165163}, "de_DE": {"meta_score": 73.77467665078285, "comet_score": 92.34129760371252, "overall_score": 82.02040054005515}, "fr_FR": {"meta_score": 74.76669716376945, "comet_score": 91.84182100413813, "overall_score": 82.42927424713382}, "tr_TR": {"meta_score": 76.86116700201208, "comet_score": 94.46934204479471, "overall_score": 84.76043077058138}, "ar_AE": {"meta_score": 81.72421376731911, "comet_score": 93.20150630694839, "overall_score": 87.08633380650197}, "it_IT": {"meta_score": 77.61867398979993, "comet_score": 93.3607608500814, "overall_score": 84.7650299774166}, "es_ES": {"meta_score": 74.57849381790933, "comet_score": 93.59931963778203, "overall_score": 83.01328382778564}, "zh_TW": {"meta_score": 45.27209571593979, "comet_score": 89.75164376196369, "overall_score": 60.185638803456435}, "ja_JP": {"meta_score": 72.20046985121378, "comet_score": 93.0171020772416, "overall_score": 81.29738738786637}, "overall": {"meta_score": 71.66321817108891, "comet_score": 92.51889460479373, "overall_score": 80.4222675600785}}}}
8
+ {"226800": {"metadata": {"team_name": "Transcreate", "email": "sharma.harsh7111@gmail.com", "submission_name": "Chatgpt-4o-mini-llm", "submission_description": "The generated output from the model detailed (gpt-4o-mini-2024-07-18) was not edited (ie. was not fine-tuned, knowledge-augmented, etc). Along with the other models a system paper will explore and discuss areas of failure for machine translation by LLMs (transliteration, formality, etc).", "uses_gold": false, "uses_rag": false, "uses_llm": false, "llm_name": "gpt-4o-mini-2024-07-18", "is_finetuned": false}, "scores": {"ko_KR": {"meta_score": 29.142070051160964, "comet_score": 90.46389896339059, "overall_score": 44.083172477312345}}}}
9
+ {"224032": {"metadata": {"team_name": "silp_nlp", "email": "pankajgoyal02003@gmail.com", "submission_name": "NER-M2M100", "submission_description": "I have fine tuned M2M100 model by first finding entities using universal NER and then have done the Instruction based fine tuning.", "uses_gold": true, "uses_rag": false, "uses_llm": false, "llm_name": "M2M100", "is_finetuned": true}, "scores": {"ja_JP": {"meta_score": 7.184808144087705, "comet_score": 86.54698509556667, "overall_score": 13.26814439090011}}}}
10
+ {"226826": {"metadata": {"team_name": "Muhandro_HSE", "email": "muhandro@icloud.com", "submission_name": "NER-LLM", "submission_description": "NER using standard libraries and LLM, and then the LLM ensemble to create a translation . The final translation is selected based on the metric and the additional model", "uses_gold": false, "uses_rag": false, "uses_llm": false, "llm_name": "Llama-3.1-8B, Llama-3.1-70B", "is_finetuned": true}, "scores": {"ko_KR": {"meta_score": 20.97599370326643, "comet_score": 89.53337222103151, "overall_score": 33.9890005925305}, "th_TH": {"meta_score": 9.776617348418915, "comet_score": 77.5849548939314, "overall_score": 17.365035828066322}, "de_DE": {"meta_score": 37.848876786929885, "comet_score": 90.45921488581993, "overall_score": 53.36810225793355}, "fr_FR": {"meta_score": 35.77310155535224, "comet_score": 88.82349541017143, "overall_score": 51.00447362440645}, "tr_TR": {"meta_score": 39.86139056561592, "comet_score": 92.37412976027335, "overall_score": 55.69095588626412}, "ar_AE": {"meta_score": 32.35100065977568, "comet_score": 87.06894011609707, "overall_score": 47.17415401215668}, "it_IT": {"meta_score": 33.03256178893684, "comet_score": 89.452628376859, "overall_score": 48.24827344500537}, "es_ES": {"meta_score": 41.21393780442113, "comet_score": 91.64619073132175, "overall_score": 56.858298218440616}, "zh_TW": {"meta_score": 6.233114627556928, "comet_score": 84.98896329456294, "overall_score": 11.614424103439204}, "ja_JP": {"meta_score": 23.70790916209867, "comet_score": 89.99633431002883, "overall_score": 37.52938067376608}, "overall": {"meta_score": 28.07745040023726, "comet_score": 88.19282240000973, "overall_score": 41.28420986420089}}}}
11
+ {"224693": {"metadata": {"team_name": "CUET_DeepLearners", "email": "u1904111@student.cuet.ac.bd", "submission_name": "Spacy-NLLB", "submission_description": "This submission used an NER model and a Machine Translation model from huggingface. Just detected the NER's and ensured their proper replacement. Used spacy as NER model and NLLB as Machine Translation model.", "uses_gold": false, "uses_rag": false, "uses_llm": false, "llm_name": "N/A", "is_finetuned": false}, "scores": {}}}
12
+ {"226798": {"metadata": {"team_name": "Transcreate", "email": "sharma.harsh7111@gmail.com", "submission_name": "Claude-haiku-llm", "submission_description": "The generated output from the model detailed (claude-3-5-haiku-20241022) was not edited (ie. was not fine-tuned, knowledge-augmented, etc). Along with the other models a system paper will explore and discuss areas of failure for machine translation by LLMs (transliteration, formality, etc).", "uses_gold": false, "uses_rag": false, "uses_llm": false, "llm_name": "claude-3-5-haiku-20241022", "is_finetuned": false}, "scores": {"ko_KR": {"meta_score": 28.492719401810312, "comet_score": 80.5606092801717, "overall_score": 42.09666706740437}}}}
13
+ {"222020": {"metadata": {"team_name": "RAGthoven", "email": "karetka.gregor@gmail.com", "submission_name": "GPT-4o + Wikidata", "submission_description": "Prompted GPT-4o to do the translation. We utilised RAGthoven preprocessor to include wikidata entity name and it's translation in the user prompt.", "uses_gold": true, "uses_rag": false, "uses_llm": false, "llm_name": "GPT-4o", "is_finetuned": false}, "scores": {"ko_KR": {"meta_score": 88.07556080283354, "comet_score": 95.19031932038709, "overall_score": 91.49483527983352}, "th_TH": {"meta_score": 89.20800696257616, "comet_score": 93.67097990475499, "overall_score": 91.38503630924737}, "de_DE": {"meta_score": 85.07488087134105, "comet_score": 94.33062866817046, "overall_score": 89.46399714325253}, "fr_FR": {"meta_score": 91.01555352241537, "comet_score": 94.03287318575197, "overall_score": 92.49961379894908}, "tr_TR": {"meta_score": 79.61099932930918, "comet_score": 95.29488579674693, "overall_score": 86.74975211707424}, "ar_AE": {"meta_score": 91.88475918187817, "comet_score": 94.6260689089842, "overall_score": 93.23526835443606}, "it_IT": {"meta_score": 92.64417418595528, "comet_score": 95.86757703557488, "overall_score": 94.22831678256576}, "es_ES": {"meta_score": 89.82765080554515, "comet_score": 95.14727143088545, "overall_score": 92.41096868546983}, "zh_TW": {"meta_score": 81.45503666538016, "comet_score": 94.07902041867818, "overall_score": 87.313085391478}, "ja_JP": {"meta_score": 87.56851996867658, "comet_score": 95.53658606815637, "overall_score": 91.37918243706056}, "overall": {"meta_score": 87.63651422959107, "comet_score": 94.77762107380904, "overall_score": 91.0160056299367}}}}
14
+ {"226684": {"metadata": {"team_name": "Transcreate", "email": "sharma.harsh7111@gmail.com", "submission_name": "Gemini-pro-llm", "submission_description": "The generated output from the model detailed (gemini-1.5-pro) was not edited (ie. was not fine-tuned, knowledge-augmented, etc). Along with the other models a system paper will explore and discuss areas of failure for machine translation by LLMs (transliteration, formality, etc).", "uses_gold": false, "uses_rag": false, "uses_llm": false, "llm_name": "gemini-1.5-pro", "is_finetuned": false}, "scores": {"ko_KR": {"meta_score": 48.32743014561196, "comet_score": 90.94122744472136, "overall_score": 63.114786811817964}}}}
15
+ {"220769": {"metadata": {"team_name": "YNU-HPCC", "email": "skyfurynowonline@icloud.com", "submission_name": "Qwen2.5 + M2M", "submission_description": "combine llm and MT", "uses_gold": false, "uses_rag": false, "uses_llm": false, "llm_name": "Qwen2.5-32B", "is_finetuned": false}, "scores": {"ko_KR": {"meta_score": 59.917355371900825, "comet_score": 93.0767384377909, "overall_score": 72.90362490426146}, "th_TH": {"meta_score": 56.367856106759504, "comet_score": 89.14121667440297, "overall_score": 69.06372473750946}, "de_DE": {"meta_score": 60.15997277059224, "comet_score": 91.65249365915076, "overall_score": 72.63977264269154}, "fr_FR": {"meta_score": 65.30649588289113, "comet_score": 90.56590397480095, "overall_score": 75.88953323950221}, "tr_TR": {"meta_score": 57.83590431477755, "comet_score": 91.48535905549544, "overall_score": 70.86918973376523}, "ar_AE": {"meta_score": 64.08621068836595, "comet_score": 91.54385640538864, "overall_score": 75.3928720635588}, "it_IT": {"meta_score": 65.39819537073363, "comet_score": 91.9688831180299, "overall_score": 76.44037169579335}, "es_ES": {"meta_score": 67.27238666167104, "comet_score": 92.77403245105329, "overall_score": 77.99150543710576}, "zh_TW": {"meta_score": 59.166345040524895, "comet_score": 92.0160554628352, "overall_score": 72.02232096666809}, "ja_JP": {"meta_score": 64.19342208300705, "comet_score": 93.8242614275091, "overall_score": 76.23071395096945}, "overall": {"meta_score": 61.97041442912238, "comet_score": 91.80488006664572, "overall_score": 73.94436293718252}}}}
16
+ {"225975": {"metadata": {"team_name": "FII-UAIC-SAI", "email": "pricoptudor2001@gmail.com", "submission_name": "Qwen2.5-Wiki-MT", "submission_description": "This MT approach involves a pipeline of multiple steps, where the control center is a LLM (Qwen-2.5), and uses an external Knowledge Base (Wikidata). It uses carefully designed prompts and few-shots examples for the following steps: 1. LLM detects NE and surrounds them with tags; 2. LLM translates the tagged sentence, leaving NE as they are; 3. NE translations are queried through Wikidata API by their detected name, and inserted into the sentence, replacing the tagged parts; 4. LLM checks for fluency and adequacy in the target language. Since instructions are involved, it is important to have a low temperature when sampling the model.", "uses_gold": false, "uses_rag": false, "uses_llm": false, "llm_name": "Qwen2.5-72B-Instruct-AWQ", "is_finetuned": false}, "scores": {"ko_KR": {"meta_score": 66.01731601731602, "comet_score": 92.78244493761572, "overall_score": 77.14429733989562}, "th_TH": {"meta_score": 65.24514070205977, "comet_score": 88.61969130728117, "overall_score": 75.15693031095819}, "de_DE": {"meta_score": 66.98434309053778, "comet_score": 91.29745374282174, "overall_score": 77.27357266786814}, "fr_FR": {"meta_score": 72.46111619396157, "comet_score": 90.58756986617605, "overall_score": 80.5167656902252}, "tr_TR": {"meta_score": 67.56092108204784, "comet_score": 91.62548770580466, "overall_score": 77.77425712575527}, "ar_AE": {"meta_score": 66.41741807785353, "comet_score": 91.34629440354807, "overall_score": 76.9123004249582}, "it_IT": {"meta_score": 75.79442918791683, "comet_score": 92.71085178819344, "overall_score": 83.40351174878518}, "es_ES": {"meta_score": 72.34919445485201, "comet_score": 92.58045707750784, "overall_score": 81.22398161382759}, "zh_TW": {"meta_score": 62.504824392126594, "comet_score": 91.24910325985165, "overall_score": 74.19009403266675}, "ja_JP": {"meta_score": 
67.03210649960846, "comet_score": 93.56287388465044, "overall_score": 78.10600943614624}, "overall": {"meta_score": 68.23668096982803, "comet_score": 91.63622279734508, "overall_score": 78.17017203910862}}}}
17
+ {"221204": {"metadata": {"team_name": "silp_nlp", "email": "pankajgoyal02003@gmail.com", "submission_name": "GPT-4o-mini", "submission_description": "I have basically here used gpt 4o mini to do the machine translated.", "uses_gold": false, "uses_rag": false, "uses_llm": false, "llm_name": "GPT-4o-mini", "is_finetuned": false}, "scores": {"ko_KR": {"meta_score": 26.367571822117274, "comet_score": 91.31651152803502, "overall_score": 40.919631741475385}, "th_TH": {"meta_score": 0.11604293588627793, "comet_score": 60.91679855122025, "overall_score": 0.2316446023628086}, "de_DE": {"meta_score": 0.595643294758339, "comet_score": 63.190060125637245, "overall_score": 1.180162123827173}, "fr_FR": {"meta_score": 1.6651418115279049, "comet_score": 74.01514098607174, "overall_score": 3.2570096565746387}, "tr_TR": {"meta_score": 8.5177733065057, "comet_score": 67.50290888669159, "overall_score": 15.126790732167347}, "ar_AE": {"meta_score": 27.644600835715856, "comet_score": 88.59959516861896, "overall_score": 42.14060618649992}, "it_IT": {"meta_score": 34.48411141624167, "comet_score": 89.9334431314244, "overall_score": 49.85268974728372}, "es_ES": {"meta_score": 2.210565754964406, "comet_score": 79.20994023374067, "overall_score": 4.3010978427909725}, "zh_TW": {"meta_score": 0.11578541103820918, "comet_score": 69.36774708068955, "overall_score": 0.23118493894904907}, "ja_JP": {"meta_score": 30.07047768206735, "comet_score": 92.6099737524146, "overall_score": 45.39967234056154}, "overall": {"meta_score": 13.1787714270823, "comet_score": 77.6662119444544, "overall_score": 20.26404899124925}}}}
18
+ {"226258": {"metadata": {"team_name": "FII the Best", "email": "deliaiustinagrigorita@gmail.com", "submission_name": "mBERT-WikiNEuRal", "submission_description": "The approach involves identifying named entities using the WikiNEuRal multilingual NER model, translating them with the Wikidata API for precision, and temporarily replacing them with placeholders. The remaining text is translated separately, after which the placeholders are reinserted with the translated entities. Finally, refinements are applied to ensure the output maintains grammatical correctness and contextual coherence.", "uses_gold": false, "uses_rag": false, "uses_llm": false, "llm_name": "Gemini 1.0 Pro", "is_finetuned": false}, "scores": {"ko_KR": {"meta_score": 64.1086186540732, "comet_score": 90.71857598190559, "overall_score": 75.12688718720815}, "th_TH": {"meta_score": 55.410501885697705, "comet_score": 85.20282879406935, "overall_score": 67.15055368843326}, "de_DE": {"meta_score": 62.62763784887678, "comet_score": 89.12504726654417, "overall_score": 73.56299731010095}, "fr_FR": {"meta_score": 68.10612991765782, "comet_score": 89.89364660560594, "overall_score": 77.4976839741539}, "tr_TR": {"meta_score": 56.89693717862732, "comet_score": 90.18537334393207, "overall_score": 69.77414895578109}, "ar_AE": {"meta_score": 68.11084231361338, "comet_score": 90.00911183584803, "overall_score": 77.54361498542995}, "it_IT": {"meta_score": 67.67359748921146, "comet_score": 88.50160868448947, "overall_score": 76.6987589131212}, "es_ES": {"meta_score": 69.91382540277257, "comet_score": 91.06375947455066, "overall_score": 79.09940735259063}, "zh_TW": {"meta_score": 26.4569664222308, "comet_score": 88.28716627686974, "overall_score": 40.71337747310125}, "ja_JP": {"meta_score": 66.67971808927173, "comet_score": 91.81922382923804, "overall_score": 77.25578336361549}, "overall": {"meta_score": 60.598477520203275, "comet_score": 89.4806342093053, "overall_score": 71.44232132035359}}}}
19
+ {"226818": {"metadata": {"team_name": "Transcreate", "email": "sharma.harsh7111@gmail.com", "submission_name": "Chatgpt-o1-llm", "submission_description": "The generated output from the model detailed (o1-2024-12-17) was not edited (ie. was not fine-tuned, knowledge-augmented, etc). Along with the other models a system paper will explore and discuss areas of failure for machine translation by LLMs (transliteration, formality, etc).", "uses_gold": false, "uses_rag": false, "uses_llm": false, "llm_name": "o1-2024-12-17", "is_finetuned": false}, "scores": {"ko_KR": {"meta_score": 37.52459661550571, "comet_score": 91.9599728772416, "overall_score": 53.29995536162509}}}}
20
+ {"226809": {"metadata": {"team_name": "HausaNLP", "email": "abubakarabdulhamid@gmail.com", "submission_name": "Gemini-few-shot", "submission_description": "A prompt template was created to use the Gemini model in translating test data from English to the 10 languages. The template instructed the model to pay attention to entities while making the translations. For languages that had training data from the task repository, 10 examples were used from the said training data to guide the model on the expected result.", "uses_gold": true, "uses_rag": false, "uses_llm": false, "llm_name": "gemini-1.5-flash", "is_finetuned": false}, "scores": {"ko_KR": {"meta_score": 33.7465564738292, "comet_score": 91.34008410360718, "overall_score": 49.28445263694621}, "th_TH": {"meta_score": 18.624891209747606, "comet_score": 83.97209875421798, "overall_score": 30.487662542552073}, "de_DE": {"meta_score": 38.13818924438394, "comet_score": 89.86216896677034, "overall_score": 53.549543983492384}, "fr_FR": {"meta_score": 35.315645013723696, "comet_score": 88.94620348637189, "overall_score": 50.557634311077}, "tr_TR": {"meta_score": 41.53811759445562, "comet_score": 92.84815688128442, "overall_score": 57.39779116593474}, "ar_AE": {"meta_score": 34.17638003078953, "comet_score": 89.58766751887643, "overall_score": 49.47773172929659}, "it_IT": {"meta_score": 39.387995292271476, "comet_score": 90.63709888997477, "overall_score": 54.91268661385709}, "es_ES": {"meta_score": 48.295241663544395, "comet_score": 92.49800697287266, "overall_score": 63.45778144074484}, "zh_TW": {"meta_score": 8.085681204168276, "comet_score": 88.66117280314485, "overall_score": 14.819830284501327}, "ja_JP": {"meta_score": 34.92560689115113, "comet_score": 92.31326394512398, "overall_score": 50.67785883663003}, "overall": {"meta_score": 33.22343046180649, "comet_score": 90.06659223222444, "overall_score": 47.462297354503235}}}}
21
+ {"226241": {"metadata": {"team_name": "Zero", "email": "revanth.gundam@research.iiit.ac.in", "submission_name": "FineTuned-MT", "submission_description": "We made use of google translate to translate the training set to create a silver dataset. We then finetuned an encoder decoder model on this silver dataset. We used this finetuned encoder decoder model on the test set.", "uses_gold": false, "uses_rag": false, "uses_llm": false, "llm_name": "N/A", "is_finetuned": true}, "scores": {"ko_KR": {"meta_score": 35.97009051554506, "comet_score": 91.77947467246518, "overall_score": 51.68418391998704}, "th_TH": {"meta_score": 13.751087902523935, "comet_score": 82.60899849791508, "overall_score": 23.577471592618302}, "de_DE": {"meta_score": 40.31654186521443, "comet_score": 90.62117759196671, "overall_score": 55.8056534878985}, "fr_FR": {"meta_score": 33.156450137236966, "comet_score": 89.06351815530824, "overall_score": 48.32320184693579}, "tr_TR": {"meta_score": 46.501229599821144, "comet_score": 93.82678978253392, "overall_score": 62.1837479570438}, "ar_AE": {"meta_score": 37.49725093468221, "comet_score": 90.8245740525141, "overall_score": 53.08016535180292}, "it_IT": {"meta_score": 39.36837975676736, "comet_score": 90.77644099928249, "overall_score": 54.91914901364083}, "es_ES": {"meta_score": 46.45934807043837, "comet_score": 92.38409575014893, "overall_score": 61.82653991462655}, "zh_TW": {"meta_score": 8.413739868776535, "comet_score": 88.97732896367265, "overall_score": 15.373732090502845}, "ja_JP": {"meta_score": 35.27799530148786, "comet_score": 92.57325216742727, "overall_score": 51.08747579173952}, "overall": {"meta_score": 33.67121139524939, "comet_score": 90.34356506332347, "overall_score": 47.786132096679616}}}}
22
+ {"226349": {"metadata": {"team_name": "VerbaNexAI Lab", "email": "dgnecco@utb.edu.co", "submission_name": "TransNER-SpEn", "submission_description": "This submission implements a Spanish-English translation system using a fine-tuned MarianMT model, which was trained on a combination of general translation data and entity-specific translations obtained from Wikidata. The system includes a complete pipeline from data collection through model training to inference, with optimizations for handling large datasets and maintaining translation quality.", "uses_gold": true, "uses_rag": false, "uses_llm": false, "llm_name": "N/A", "is_finetuned": true}, "scores": {"es_ES": {"meta_score": 24.615961034095164, "comet_score": 87.088061075793, "overall_score": 38.382795489095514}}}}
23
+ {"226804": {"metadata": {"team_name": "Transcreate", "email": "sharma.harsh7111@gmail.com", "submission_name": "Chatgpt-4o-llm", "submission_description": "The generated output from the model detailed (gpt-4o-2024-08-06) was not edited (ie. was not fine-tuned, knowledge-augmented, etc). Along with the other models a system paper will explore and discuss areas of failure for machine translation by LLMs (transliteration, formality, etc).", "uses_gold": false, "uses_rag": false, "uses_llm": false, "llm_name": "gpt-4o-2024-08-06", "is_finetuned": false}, "scores": {"ko_KR": {"meta_score": 39.51200314836679, "comet_score": 90.86632450492023, "overall_score": 55.07526541476404}}}}
24
+ {"226815": {"metadata": {"team_name": "The Five Forbidden Entities ", "email": "wcblanco@ucsc.edu", "submission_name": "Embedded Entities", "submission_description": "Added the entity embedding to the cross attention and had our retrieved translated entity in the input. ", "uses_gold": false, "uses_rag": false, "uses_llm": false, "llm_name": "MBart", "is_finetuned": true}, "scores": {"ko_KR": {"meta_score": 39.19716646989374, "comet_score": 79.79558383601005, "overall_score": 52.57061081690562}, "th_TH": {"meta_score": 13.635044966637654, "comet_score": 62.396456523843526, "overall_score": 22.379631436529547}, "de_DE": {"meta_score": 51.78692988427501, "comet_score": 88.76365802527853, "overall_score": 65.4112857554886}, "fr_FR": {"meta_score": 48.142726440988106, "comet_score": 86.69446835851625, "overall_score": 61.90737029700189}, "tr_TR": {"meta_score": 48.64744019673597, "comet_score": 80.10578992121337, "overall_score": 60.533496845626004}, "ar_AE": {"meta_score": 53.24389707499451, "comet_score": 86.65550641399282, "overall_score": 65.95992190704884}, "it_IT": {"meta_score": 54.68811298548451, "comet_score": 89.00609847206422, "overall_score": 67.74908356103718}, "es_ES": {"meta_score": 53.240914200074926, "comet_score": 89.63932561802926, "overall_score": 66.80391424675308}, "zh_TW": {"meta_score": 18.930914704747202, "comet_score": 80.28892595960752, "overall_score": 30.6378804662431}, "ja_JP": {"meta_score": 61.64839467501958, "comet_score": 91.29418558386405, "overall_score": 73.59807811378447}, "overall": {"meta_score": 44.31615415988512, "comet_score": 83.46399987124197, "overall_score": 56.755127344641835}}}}
25
+ {"226805": {"metadata": {"team_name": "Transcreate", "email": "sharma.harsh7111@gmail.com", "submission_name": "Chatgpt-o1-mini-llm", "submission_description": "The generated output from the model detailed (o1-mini-2024-09-12) was not edited (ie. was not fine-tuned, knowledge-augmented, etc). Along with the other models a system paper will explore and discuss areas of failure for machine translation by LLMs (transliteration, formality, etc).", "uses_gold": false, "uses_rag": false, "uses_llm": false, "llm_name": "o1-mini-2024-09-12", "is_finetuned": false}, "scores": {"ko_KR": {"meta_score": 33.057851239669425, "comet_score": 92.0190771285918, "overall_score": 48.64131223261735}}}}
26
+ {"226723": {"metadata": {"team_name": "Lunar", "email": "suzie_oh@korea.ac.kr", "submission_name": "LLaMA-RAFT-Gold", "submission_description": "We trained LLaMA-3.1-8B-Instruct using a multi-turn dialogue setup that incorporates function calls to: (1) identify key entities in the source text that require lookup via function calls (e.g., [search(\"The Great Gatsby\")]) and (2) retrieve their corresponding entities in the target language using the Wikipedia API. These retrieved entities are then leveraged to generate the final translation. While the system is capable of performing searches at test time, this version utilizes \u201cgold\u201d information to achieve its highest possible performance. ", "uses_gold": true, "uses_rag": true, "uses_llm": false, "llm_name": "Llama-3.1-8B-Instruct", "is_finetuned": true}, "scores": {"ko_KR": {"meta_score": 88.50846123573396, "comet_score": 92.8362999315198, "overall_score": 90.62073809984025}, "th_TH": {"meta_score": 87.03220191470844, "comet_score": 89.17227425340435, "overall_score": 88.08924207591163}, "de_DE": {"meta_score": 81.7222600408441, "comet_score": 92.6139032418749, "overall_score": 86.82785420551139}, "fr_FR": {"meta_score": 73.19304666056725, "comet_score": 92.43925557372118, "overall_score": 81.69796175271453}, "tr_TR": {"meta_score": 80.50525374469036, "comet_score": 93.77640349710917, "overall_score": 86.63554476447312}, "ar_AE": {"meta_score": 86.49659115900594, "comet_score": 91.34681123526562, "overall_score": 88.85556257610334}, "it_IT": {"meta_score": 90.03530796390741, "comet_score": 94.42197130141702, "overall_score": 92.1764790041598}, "es_ES": {"meta_score": 87.46721618583739, "comet_score": 93.83159996128654, "overall_score": 90.53769917853991}, "zh_TW": {"meta_score": 57.98919336163644, "comet_score": 91.02745204761376, "overall_score": 70.84589112188529}, "ja_JP": {"meta_score": 88.27329678935004, "comet_score": 94.56669894822333, "overall_score": 91.31168756563464}, "overall": {"meta_score": 
82.12228290562814, "comet_score": 92.60326699914357, "overall_score": 86.75986603447738}}}}
27
+ {"226828": {"metadata": {"team_name": "pingan_team", "email": "ytimespace@gmail.com", "submission_name": "Qwen2.5-72B-LoRA + zhconv", "submission_description": "Fine-tune qwen2.5-72b with lora and integrate zhconv package", "uses_gold": true, "uses_rag": false, "uses_llm": false, "llm_name": "Qwen2.5-72B", "is_finetuned": true}, "scores": {"ko_KR": {"meta_score": 90.2400629673357, "comet_score": 95.43971328907058, "overall_score": 92.76708439046332}, "th_TH": {"meta_score": 91.18073687264288, "comet_score": 93.54516519025543, "overall_score": 92.34781909486988}, "de_DE": {"meta_score": 85.7215793056501, "comet_score": 94.02681703750903, "overall_score": 89.682327269847}, "fr_FR": {"meta_score": 91.43641354071363, "comet_score": 94.41432573276164, "overall_score": 92.90151188653924}, "tr_TR": {"meta_score": 84.08227140621507, "comet_score": 95.51157494426013, "overall_score": 89.43324429085688}, "ar_AE": {"meta_score": 91.46690125357378, "comet_score": 94.18387947067838, "overall_score": 92.80550902738722}, "it_IT": {"meta_score": 93.19340918007062, "comet_score": 95.88923163451882, "overall_score": 94.52210272904938}, "es_ES": {"meta_score": 90.12738853503186, "comet_score": 95.09025658112691, "overall_score": 92.54233305264509}, "zh_TW": {"meta_score": 81.08838286375916, "comet_score": 94.26338453117552, "overall_score": 87.180933827514}, "ja_JP": {"meta_score": 90.91620986687549, "comet_score": 95.70499431624857, "overall_score": 93.24916090484658}, "overall": {"meta_score": 88.94533557918683, "comet_score": 94.80693427276051, "overall_score": 91.74320264740184}}}}
28
+ {"226796": {"metadata": {"team_name": "Transcreate", "email": "sharma.harsh7111@gmail.com", "submission_name": "Llama-llm", "submission_description": "The generated output from the model detailed (llama3:8b) was not edited (ie. was not fine-tuned, knowledge-augmented, etc). Along with the other models a system paper will explore and discuss areas of failure for machine translation by LLMs (transliteration, formality, etc).", "uses_gold": false, "uses_rag": false, "uses_llm": false, "llm_name": "Llama-3.1-8B-Instruct", "is_finetuned": false}, "scores": {"ko_KR": {"meta_score": 5.627705627705628, "comet_score": 55.28800609085045, "overall_score": 10.215578682217762}}}}
29
+ {"226625": {"metadata": {"team_name": "Transcreate", "email": "sharma.harsh7111@gmail.com", "submission_name": "Gemini-flash-llm", "submission_description": "The generated output from the model detailed (gemini-1.5-flash) was not edited (ie. was not fine-tuned, knowledge-augmented, etc). Along with the other models a system paper will explore and discuss areas of failure for machine translation by LLMs (transliteration, formality, etc).", "uses_gold": false, "uses_rag": false, "uses_llm": false, "llm_name": "gemini-1.5-flash", "is_finetuned": false}, "scores": {"ko_KR": {"meta_score": 33.15623770169225, "comet_score": 90.81376347542748, "overall_score": 48.5769573249348}}}}
30
+ {"226436": {"metadata": {"team_name": "SALT \ud83e\uddc2", "email": "voelker@informatik.uni-wuerzburg.de", "submission_name": "Salt-Full-Pipeline + Gold", "submission_description": "See Submission 226303 for details. These are the results achieved when using the wikidataIDs provided with the test-dataset instead of using our retrieval mechanism. Translations are done using the same fine-tuned NLLB model. The LLM post-processing step is also omitted, as it was only introduced to fix wrong retrieval results. ", "uses_gold": true, "uses_rag": true, "uses_llm": false, "llm_name": "N/A", "is_finetuned": true}, "scores": {"ko_KR": {"meta_score": 82.624950806769, "comet_score": 93.68702314386496, "overall_score": 87.80896163821366}, "th_TH": {"meta_score": 72.93298520452568, "comet_score": 91.55131491381499, "overall_score": 81.1884257799704}, "de_DE": {"meta_score": 82.55616065350578, "comet_score": 93.20614124851781, "overall_score": 87.55849334626174}, "fr_FR": {"meta_score": 84.04391582799634, "comet_score": 92.6148269412066, "overall_score": 88.1214549346171}, "tr_TR": {"meta_score": 83.38922423429466, "comet_score": 95.0010380601286, "overall_score": 88.81721191947011}, "ar_AE": {"meta_score": 88.19001539476578, "comet_score": 93.62355730927906, "overall_score": 90.82559500503527}, "it_IT": {"meta_score": 88.68183601412318, "comet_score": 94.58494369085277, "overall_score": 91.538318939202}, "es_ES": {"meta_score": 82.98988385162983, "comet_score": 94.26201956186816, "overall_score": 88.26753230187067}, "zh_TW": {"meta_score": 51.003473562331145, "comet_score": 90.71520773175598, "overall_score": 65.29542410356926}, "ja_JP": {"meta_score": 83.33985904463587, "comet_score": 94.17353203474961, "overall_score": 88.42610507059334}, "overall": {"meta_score": 79.97523045945773, "comet_score": 93.34196046360385, "overall_score": 85.78475230388035}}}}
31
+ {"226830": {"metadata": {"team_name": "pingan_team", "email": "ytimespace@gmail.com", "submission_name": "Qwen2.5-72B-LoRA", "submission_description": "Using qwen2.5-72B as the base model and wiki-id as the input, lora was used for training", "uses_gold": true, "uses_rag": false, "uses_llm": false, "llm_name": "Qwen2.5-72B", "is_finetuned": true}, "scores": {"ko_KR": {"meta_score": 90.2400629673357, "comet_score": 95.43971328907058, "overall_score": 92.76708439046332}, "th_TH": {"meta_score": 91.18073687264288, "comet_score": 93.54516519025543, "overall_score": 92.34781909486988}, "de_DE": {"meta_score": 86.35125936010893, "comet_score": 94.0455995489944, "overall_score": 90.03433881766277}, "fr_FR": {"meta_score": 91.56450137236962, "comet_score": 94.30785186336549, "overall_score": 92.91593161696929}, "tr_TR": {"meta_score": 84.12698412698413, "comet_score": 95.69686444552453, "overall_score": 89.53972078919878}, "ar_AE": {"meta_score": 91.73081152408182, "comet_score": 93.63906043740401, "overall_score": 92.6751139587588}, "it_IT": {"meta_score": 93.01686936053353, "comet_score": 95.80177426127744, "overall_score": 94.3887843916194}, "es_ES": {"meta_score": 90.12738853503186, "comet_score": 95.09025658112691, "overall_score": 92.54233305264509}, "zh_TW": {"meta_score": 81.26206098031648, "comet_score": 94.44410534599305, "overall_score": 87.35860338110784}, "ja_JP": {"meta_score": 91.40563821456539, "comet_score": 95.35987404530275, "overall_score": 93.34089620404119}, "overall": {"meta_score": 89.10063133139704, "comet_score": 94.73702650083148, "overall_score": 91.79106256973364}}}}
32
+ {"226303": {"metadata": {"team_name": "SALT \ud83e\uddc2", "email": "voelker@informatik.uni-wuerzburg.de", "submission_name": "Salt-Full-Pipeline", "submission_description": "Approach using a simple, heuristics driven, non-neural DB retrieval for the entities. Translation is done using a fine-tuned NLLB (Results appended @1), training on a mixture of the development dataset and a heavily filtered down version of the mintaka dataset (relevant languages only). With the usage of gpt-4o-mini provided with the src text, the retrieved candidates @3 and the translation for cleaning up obviously wrongly selected candidates. ", "uses_gold": false, "uses_rag": true, "uses_llm": false, "llm_name": "GPT-4o-mini", "is_finetuned": true}, "scores": {"ko_KR": {"meta_score": 80.48012593467139, "comet_score": 92.26385467766026, "overall_score": 85.97007684268034}, "th_TH": {"meta_score": 76.18218740934145, "comet_score": 90.17432857016256, "overall_score": 82.5898229257222}, "de_DE": {"meta_score": 76.07215793056501, "comet_score": 91.41815750320157, "overall_score": 83.04213288146681}, "fr_FR": {"meta_score": 79.81701738334858, "comet_score": 91.1512607346951, "overall_score": 85.10844049738913}, "tr_TR": {"meta_score": 77.93427230046949, "comet_score": 93.7362617450141, "overall_score": 85.10799349326312}, "ar_AE": {"meta_score": 83.15372773257093, "comet_score": 91.86219372263264, "overall_score": 87.2913022108523}, "it_IT": {"meta_score": 80.73754413495489, "comet_score": 92.32150181081539, "overall_score": 86.14182848773764}, "es_ES": {"meta_score": 82.4653428250281, "comet_score": 93.1592153800521, "overall_score": 87.48670131492354}, "zh_TW": {"meta_score": 54.592821304515624, "comet_score": 89.49496316224256, "overall_score": 67.81674865293931}, "ja_JP": {"meta_score": 79.91386061080658, "comet_score": 92.54373518769033, "overall_score": 85.76632557064718}, "overall": {"meta_score": 77.1349057566272, "comet_score": 91.81254724941667, "overall_score": 83.63213728776215}}}}
33
+ {"226829": {"metadata": {"team_name": "JNLP", "email": "antrieu@jaist.ac.jp", "submission_name": "Multi-task-mT5", "submission_description": "We fine-tuned a Multilingual T5 model using specific prompting instructions. First, these instructions make the model aware of the relevant entities. Once the entities are identified, the model then proceeds with the translation. This approach helps ensure more accurate and contextually appropriate results.", "uses_gold": false, "uses_rag": false, "uses_llm": false, "llm_name": "N/A", "is_finetuned": true}, "scores": {"de_DE": {"meta_score": 13.070115724982982, "comet_score": 78.87477698153262, "overall_score": 22.424355123703375}, "fr_FR": {"meta_score": 11.93046660567246, "comet_score": 79.59524542720408, "overall_score": 20.750637093070793}, "es_ES": {"meta_score": 12.008242787560883, "comet_score": 71.6338134442277, "overall_score": 20.568509728009463}}}}
34
+ {"225191": {"metadata": {"team_name": "silp_nlp", "email": "pankajgoyal02003@gmail.com", "submission_name": "T5-MT-Instruct", "submission_description": "I have used T5 instruction based fine tuning.", "uses_gold": true, "uses_rag": false, "uses_llm": false, "llm_name": "T5-base", "is_finetuned": true}, "scores": {"de_DE": {"meta_score": 0.0, "comet_score": 45.63061470473965, "overall_score": 0.0}, "fr_FR": {"meta_score": 0.07319304666056725, "comet_score": 45.21000578685954, "overall_score": 0.14614948361963565}, "ar_AE": {"meta_score": 0.0, "comet_score": 38.86726113998525, "overall_score": 0.0}, "it_IT": {"meta_score": 5.629658689682229, "comet_score": 69.13161739917444, "overall_score": 10.411470509424833}, "es_ES": {"meta_score": 0.0, "comet_score": 27.92603628282111, "overall_score": 0.0}, "ja_JP": {"meta_score": 0.0, "comet_score": 38.78236922086125, "overall_score": 0.0}}}}
35
+ {"225933": {"metadata": {"team_name": "HausaNLP", "email": "abubakarabdulhamid@gmail.com", "submission_name": "Gemini-0shot", "submission_description": "A prompt template was created to use the Gemini model in translating test data from English to the 10 languages. The template instructed the model to pay attention to entities while making the translations. This was a zero-shot approach so no sample translations were provided to the model.", "uses_gold": false, "uses_rag": false, "uses_llm": false, "llm_name": "gemini-1.5-flash", "is_finetuned": false}, "scores": {"ko_KR": {"meta_score": 34.671389216843764, "comet_score": 90.71432484716283, "overall_score": 50.168261795975155}, "th_TH": {"meta_score": 18.79895561357702, "comet_score": 83.41229118261954, "overall_score": 30.68280661315985}, "de_DE": {"meta_score": 38.15520762423417, "comet_score": 89.2959383041823, "overall_score": 53.46527159372105}, "fr_FR": {"meta_score": 38.7740164684355, "comet_score": 88.34536861086372, "overall_score": 53.89429433269751}, "tr_TR": {"meta_score": 40.82271406215068, "comet_score": 92.41551555911354, "overall_score": 56.63017554796197}, "ar_AE": {"meta_score": 32.658895975368374, "comet_score": 88.56474659797881, "overall_score": 47.720507069867466}, "it_IT": {"meta_score": 40.30992546096508, "comet_score": 89.98675601605856, "overall_score": 55.678462204289}, "es_ES": {"meta_score": 47.920569501686025, "comet_score": 91.70959686609903, "overall_score": 62.94880576189662}, "zh_TW": {"meta_score": 8.529525279814743, "comet_score": 87.84999996423721, "overall_score": 15.549335683678928}, "ja_JP": {"meta_score": 35.1018010963195, "comet_score": 91.05520215565186, "overall_score": 50.67022063720286}, "overall": {"meta_score": 33.57430002993948, "comet_score": 89.33497401039673, "overall_score": 47.74081412404504}}}}
36
+ {"225961": {"metadata": {"team_name": "arancini", "email": "s327473@studenti.polito.it", "submission_name": "WikiGemmaMT", "submission_description": "Entity-aware machine translation task submission where we use an entity linking part to retrieve the correct translation of the critical entity and then insert this translation into the gemma-2-9b-it prompt", "uses_gold": true, "uses_rag": false, "uses_llm": false, "llm_name": "gemma-2-9b-it", "is_finetuned": false}, "scores": {"ko_KR": {"meta_score": 90.73199527744983, "comet_score": 94.26169080651684, "overall_score": 92.46316959400883}, "th_TH": {"meta_score": 90.80359733101247, "comet_score": 92.18212720584897, "overall_score": 91.48766966496227}, "de_DE": {"meta_score": 84.80258679373723, "comet_score": 93.68167441817593, "overall_score": 89.02127584681102}, "fr_FR": {"meta_score": 90.70448307410796, "comet_score": 93.41467595438507, "overall_score": 92.03963279743807}, "tr_TR": {"meta_score": 82.29376257545272, "comet_score": 94.38131328546412, "overall_score": 87.92404190829706}, "ar_AE": {"meta_score": 90.14734990103365, "comet_score": 92.61821818851182, "overall_score": 91.36608180113384}, "it_IT": {"meta_score": 92.42840329540996, "comet_score": 95.34975656952376, "overall_score": 93.86635549806427}, "es_ES": {"meta_score": 89.5841139003372, "comet_score": 94.96621835394807, "overall_score": 92.19668605074357}, "zh_TW": {"meta_score": 50.7912003087611, "comet_score": 91.43674093374456, "overall_score": 65.30618082177756}, "ja_JP": {"meta_score": 90.74001566170713, "comet_score": 93.9324154347158, "overall_score": 92.30862232205908}, "overall": {"meta_score": 85.30275081190094, "comet_score": 93.6224831150835, "overall_score": 88.79797163052956}}}}
37
+ {"224195": {"metadata": {"team_name": "AMM_CUET", "email": "arifulislamnayem11@gmail.com", "submission_name": "EA-MT-GPT4o-FR-IT-NER", "submission_description": "This system is based on GPT-4o and integrates named entity recognition (NER) and entity linking techniques to improve translation quality. The model ensures context-aware entity translations for French and Italian, leveraging fine-tuned prompts and re-ranking strategies to enhance performance.", "uses_gold": false, "uses_rag": false, "uses_llm": false, "llm_name": "GPT-4o", "is_finetuned": true}, "scores": {}}}
38
+ {"226725": {"metadata": {"team_name": "Ualberta", "email": "ning.shi@ualberta.ca", "submission_name": "WikiEnsemble", "submission_description": "We create an ensemble of three distinct translation methods, including in-context learning with a LLM, and two commercial MT systems. We select, for each instance, a system that accurately translates the named entity, using validation set performance to break ties. We provided GPT with translations of the given named entity. We also check that translations contained those NE translations, for ensembling purposes.", "uses_gold": true, "uses_rag": true, "uses_llm": false, "llm_name": "GPT-4o", "is_finetuned": false}, "scores": {"ko_KR": {"meta_score": 90.43683589138135, "comet_score": 95.61710354264727, "overall_score": 92.9548530689595}, "th_TH": {"meta_score": 90.0203075137801, "comet_score": 94.11107900611621, "overall_score": 92.02025176374595}, "de_DE": {"meta_score": 85.2280462899932, "comet_score": 94.27794513449204, "overall_score": 89.52486775827927}, "fr_FR": {"meta_score": 89.62488563586459, "comet_score": 94.26329986992388, "overall_score": 91.88559283744956}, "tr_TR": {"meta_score": 83.21037335121842, "comet_score": 95.93105493390318, "overall_score": 89.11907171267431}, "ar_AE": {"meta_score": 91.68682647899715, "comet_score": 94.8610689250983, "overall_score": 93.24694172836786}, "it_IT": {"meta_score": 91.74185955276579, "comet_score": 95.92379437866937, "overall_score": 93.78623192149469}, "es_ES": {"meta_score": 89.35931060322217, "comet_score": 95.30632744437526, "overall_score": 92.23705943992896}, "zh_TW": {"meta_score": 81.14627556927827, "comet_score": 94.2760098675235, "overall_score": 87.2197857556488}, "ja_JP": {"meta_score": 90.38762725137039, "comet_score": 95.7894285465463, "overall_score": 93.01016309426342}, "overall": {"meta_score": 88.28423481378715, "comet_score": 95.03571116492954, "overall_score": 91.50048190808124}}}}
39
+ {"226834": {"metadata": {"team_name": "UAlberta", "email": "ning.shi@ualberta.ca", "submission_name": "PromptGPT", "submission_description": "We prompt a state-of-the-art language model with instructions designed to increase the model's attention in the named entity. We also leverage in context learning to showcase example input output pairs in the prompt.", "uses_gold": false, "uses_rag": false, "uses_llm": false, "llm_name": "GPT4o", "is_finetuned": false}, "scores": {"ko_KR": {"meta_score": 48.85871704053522, "comet_score": 93.45366500496911, "overall_score": 64.16906398796374}, "th_TH": {"meta_score": 25.26834928923702, "comet_score": 87.31730042902463, "overall_score": 39.194409798320194}, "de_DE": {"meta_score": 49.57454050374404, "comet_score": 91.58179552590384, "overall_score": 64.32761800718511}, "fr_FR": {"meta_score": 50.39341262580055, "comet_score": 90.61494872151223, "overall_score": 64.76774082554711}, "tr_TR": {"meta_score": 48.200312989045386, "comet_score": 93.27380055286409, "overall_score": 63.556876488136126}, "ar_AE": {"meta_score": 43.985045084671206, "comet_score": 91.6797200133047, "overall_score": 59.44854753145341}, "it_IT": {"meta_score": 51.981169085916044, "comet_score": 92.21800744662148, "overall_score": 66.48581431763522}, "es_ES": {"meta_score": 57.02510303484451, "comet_score": 93.21990250123044, "overall_score": 70.76274550443468}, "zh_TW": {"meta_score": 39.03898108838286, "comet_score": 92.09323646259878, "overall_score": 54.83360510141013}, "ja_JP": {"meta_score": 52.36883320281911, "comet_score": 93.51961281188504, "overall_score": 67.14051918880828}, "overall": {"meta_score": 46.669446394499595, "comet_score": 91.89719894699144, "overall_score": 61.46869407508941}}}}
40
+ {"226819": {"metadata": {"team_name": "GinGer", "email": "aylin.naebzadeh@gmail.com", "submission_name": "LoRA-nllb-distilled-200-distilled-600M", "submission_description": "Due to the hardware limitation, I applied several standard common MT models. To handle the models complexity and overfitting, I tried the Low Rank Adaptation settings with an early stopping mechanism.", "uses_gold": false, "uses_rag": false, "uses_llm": false, "llm_name": "N/A", "is_finetuned": true}, "scores": {"ar_AE": {"meta_score": 18.4077413679349, "comet_score": 87.0979045485176, "overall_score": 30.392225680278568}, "it_IT": {"meta_score": 25.51981169085916, "comet_score": 89.28317988489412, "overall_score": 39.69391226743653}, "ja_JP": {"meta_score": 0.0, "comet_score": 0.0, "overall_score": 0.0}}}}
41
+ {"226655": {"metadata": {"team_name": "RAGthoven", "email": "karetka.gregor@gmail.com", "submission_name": "GPT-4o + WikiData + RAG", "submission_description": "Prompted GPT-4o to do the translation. We utilised RAGthoven preprocessor to include wikidata entity name and it's translation in the user prompt. In order to improve translations we used RAG and included most similar examples in system prompt.", "uses_gold": true, "uses_rag": true, "uses_llm": false, "llm_name": "GPT-4o", "is_finetuned": false}, "scores": {"ko_KR": {"meta_score": 90.22038567493112, "comet_score": 95.79226015145176, "overall_score": 92.92287217508657}, "th_TH": {"meta_score": 90.4844792573252, "comet_score": 94.53330028392558, "overall_score": 92.46458875332293}, "de_DE": {"meta_score": 85.07488087134105, "comet_score": 94.33062866817046, "overall_score": 89.46399714325253}, "fr_FR": {"meta_score": 91.01555352241537, "comet_score": 94.03287318575197, "overall_score": 92.49961379894908}, "tr_TR": {"meta_score": 82.76324614352784, "comet_score": 95.82904141713597, "overall_score": 88.81819759221935}, "ar_AE": {"meta_score": 91.88475918187817, "comet_score": 94.6260689089842, "overall_score": 93.23526835443606}, "it_IT": {"meta_score": 92.78148293448412, "comet_score": 95.92631808527781, "overall_score": 94.327696007279}, "es_ES": {"meta_score": 89.88385162982391, "comet_score": 95.10102187282989, "overall_score": 92.41886623491352}, "zh_TW": {"meta_score": 81.47433423388652, "comet_score": 94.49984784098217, "overall_score": 87.50502030771659}, "ja_JP": {"meta_score": 89.56538762725137, "comet_score": 95.75071582452524, "overall_score": 92.5548273320138}, "overall": {"meta_score": 88.51483610768646, "comet_score": 95.0422076239035, "overall_score": 91.62109476991893}}}}
42
+ {"226765": {"metadata": {"team_name": "Lunar", "email": "suzie_oh@korea.ac.kr", "submission_name": "LLaMA-RAFT", "submission_description": "We trained LLaMA-3.1-8B-Instruct using a multi-turn dialogue setup that incorporates function calls to: (1) identify key entities in the source text that require lookup via function calls (e.g., [search(\"The Great Gatsby\")])and (2) retrieve their corresponding entities in the target language using the Wikipedia API. These retrieved entities are then leveraged to generate the final translation. The system is capable of performing searches at test time, and this version utilizes real-time search to evaluate its performance in real-world conditions.", "uses_gold": false, "uses_rag": true, "uses_llm": false, "llm_name": "Llama-3.1-8B-Instruct", "is_finetuned": true}, "scores": {"ko_KR": {"meta_score": 58.166076347894524, "comet_score": 90.64374512126717, "overall_score": 70.8607932881007}, "th_TH": {"meta_score": 55.14940527995358, "comet_score": 85.19545282648721, "overall_score": 66.95619090474514}, "de_DE": {"meta_score": 56.73927842069435, "comet_score": 90.4903641235593, "overall_score": 69.74625320921199}, "fr_FR": {"meta_score": 53.66880146386094, "comet_score": 90.39489821022187, "overall_score": 67.35056584504868}, "tr_TR": {"meta_score": 64.81108875475073, "comet_score": 92.56566027293876, "overall_score": 76.24101095818163}, "ar_AE": {"meta_score": 62.65669672311414, "comet_score": 89.67273260194547, "overall_score": 73.76903118284966}, "it_IT": {"meta_score": 70.92977638289526, "comet_score": 92.59323466930168, "overall_score": 80.32652270041815}, "es_ES": {"meta_score": 63.15099288122893, "comet_score": 92.28642426902947, "overall_score": 74.98811327280042}, "zh_TW": {"meta_score": 26.57275183326901, "comet_score": 88.03405051339965, "overall_score": 40.82317854212491}, "ja_JP": {"meta_score": 52.93657008613939, "comet_score": 92.0883102238925, "overall_score": 67.22762711970357}, "overall": {"meta_score": 
56.47814381738009, "comet_score": 90.39648728320431, "overall_score": 68.82892870231848}}}}
43
+ {"226499": {"metadata": {"team_name": "Deerlu", "email": "xu@diag.uniroma1.it", "submission_name": "Qwen2.5-Max-Wiki", "submission_description": "Use qwen-max-latest as translator, use wiki-id gold label to retrieve entity names used in input with English prompts. Entity names in both source language and target language are retrieved, explicitly pass the entity names to model through prompts. For zh-TW, added an process that convert zh to zh(Traditional).", "uses_gold": true, "uses_rag": true, "uses_llm": false, "llm_name": "Qwen2.5-Max", "is_finetuned": false}, "scores": {"ko_KR": {"meta_score": 90.6926406926407, "comet_score": 95.1814954027985, "overall_score": 92.88286519563866}, "th_TH": {"meta_score": 90.9776617348419, "comet_score": 93.61229854751262, "overall_score": 92.2761781675378}, "de_DE": {"meta_score": 85.9428182437032, "comet_score": 94.28722383295946, "overall_score": 89.92185372883142}, "fr_FR": {"meta_score": 91.0704483074108, "comet_score": 93.82492545102565, "overall_score": 92.42716948016039}, "tr_TR": {"meta_score": 83.8810641627543, "comet_score": 95.23464956593786, "overall_score": 89.19802271349477}, "ar_AE": {"meta_score": 91.5328788212008, "comet_score": 94.29211664886718, "overall_score": 92.89201234938831}, "it_IT": {"meta_score": 92.84032954099646, "comet_score": 95.81113536189986, "overall_score": 94.30234093622707}, "es_ES": {"meta_score": 90.25852379168228, "comet_score": 95.13451545626629, "overall_score": 92.63239829878985}, "zh_TW": {"meta_score": 80.76032419915092, "comet_score": 94.53363860265392, "overall_score": 87.10587836852865}, "ja_JP": {"meta_score": 91.50352388410337, "comet_score": 95.69760112204212, "overall_score": 93.55358018959248}, "overall": {"meta_score": 88.94602133784846, "comet_score": 94.76095999919633, "overall_score": 91.71922994281894}}}}
44
+ {"226797": {"metadata": {"team_name": "Transcreate", "email": "sharma.harsh7111@gmail.com", "submission_name": "Claude-sonnet-llm", "submission_description": "The generated output from the model detailed (claude-3-5-sonnet-20241022) was not edited (ie. was not fine-tuned, knowledge-augmented, etc). Along with the other models a system paper will explore and discuss areas of failure for machine translation by LLMs (transliteration, formality, etc).", "uses_gold": false, "uses_rag": false, "uses_llm": false, "llm_name": "claude-3-5-sonnet-20241022", "is_finetuned": false}, "scores": {"ko_KR": {"meta_score": 39.68909878000787, "comet_score": 83.84262150324315, "overall_score": 53.87503839802359}}}}
45
+ {"221357": {"metadata": {"team_name": "Howard University-AI4PC", "email": "jabez.agyemang-prem@bison.howard.edu", "submission_name": "DoubleGPT", "submission_description": "In this submission, we retrieve entity labels and descriptions from Wikidata and pass them to a GPT model for initial entity translation. This step ensures that each named entity is accurately rendered in the target language, taking into account metadata such as the entity\u2019s description. Next, we feed those refined entity translations into a final GPT translation process to generate the complete target-language sentence. ", "uses_gold": true, "uses_rag": true, "uses_llm": false, "llm_name": "gpt-4o-2024-08-06", "is_finetuned": false}, "scores": {"ko_KR": {"meta_score": 86.08815426997245, "comet_score": 94.61010759597404, "overall_score": 90.14817800807896}, "th_TH": {"meta_score": 84.21816071946621, "comet_score": 92.68187512807097, "overall_score": 88.24754633791723}, "de_DE": {"meta_score": 77.1783526208305, "comet_score": 93.48878275179554, "overall_score": 84.55418467716994}, "fr_FR": {"meta_score": 79.01189387008233, "comet_score": 92.63781028173435, "overall_score": 85.2840250498008}, "tr_TR": {"meta_score": 73.32886206125643, "comet_score": 93.50875438645518, "overall_score": 82.19837585696136}, "ar_AE": {"meta_score": 85.1770398064658, "comet_score": 93.83283800179024, "overall_score": 89.2956687696653}, "it_IT": {"meta_score": 81.81639858768145, "comet_score": 93.4555833335716, "overall_score": 87.24953266859379}, "es_ES": {"meta_score": 85.16298239040839, "comet_score": 94.80671339566094, "overall_score": 89.72646676032379}, "zh_TW": {"meta_score": 42.18448475492088, "comet_score": 91.96214884554084, "overall_score": 57.837841500498605}, "ja_JP": {"meta_score": 85.10180109631949, "comet_score": 95.27430157107867, "overall_score": 89.90120688928899}, "overall": {"meta_score": 77.92681301774039, "comet_score": 93.62589152916723, "overall_score": 84.44430265182987}}}}
46
+ {"226420": {"metadata": {"team_name": "The Five Forbidden Entities", "email": "wcblanco@ucsc.edu", "submission_name": "MBart-KnowledgeAware", "submission_description": "We finetuned an Mbart seq2seq model on translation for all of the given languages. Utilizing Flair NER predictions which were fed through our knowledge base for determining the proper entity translation we built a dataset combining the source and desired translation for the entity like so: \"<source>|<translated_entity>\" which was then trained to predict the target. This was a very simplistic approach to attempt to provide additional information to a well performing translation model in order to boost its effectiveness. ", "uses_gold": false, "uses_rag": false, "uses_llm": false, "llm_name": "MBart", "is_finetuned": true}, "scores": {"ko_KR": {"meta_score": 56.63124754033845, "comet_score": 83.13611756712609, "overall_score": 67.37054890984923}, "th_TH": {"meta_score": 16.304032492022046, "comet_score": 62.48842884240926, "overall_score": 25.860681516160216}, "de_DE": {"meta_score": 54.288631722260035, "comet_score": 88.03327818704835, "overall_score": 67.16051269752329}, "fr_FR": {"meta_score": 49.423604757548034, "comet_score": 86.05827344467306, "overall_score": 62.787882029476975}, "tr_TR": {"meta_score": 50.77129443326627, "comet_score": 84.8602331190362, "overall_score": 63.531893492842414}, "ar_AE": {"meta_score": 58.08225203430833, "comet_score": 86.83554507334836, "overall_score": 69.60641294788606}, "it_IT": {"meta_score": 55.33542565712044, "comet_score": 87.7628699225648, "overall_score": 67.874963071818}, "es_ES": {"meta_score": 52.92244286249531, "comet_score": 88.82301863479721, "overall_score": 66.32637234264041}, "zh_TW": {"meta_score": 26.225395600154382, "comet_score": 82.84353455870091, "overall_score": 39.839108416167015}, "ja_JP": {"meta_score": 62.607674236491775, "comet_score": 92.31244846419982, "overall_score": 74.61222726482849}, "overall": {"meta_score": 
48.25920013360051, "comet_score": 84.31537478139042, "overall_score": 60.49706026891921}}}}
47
+ {"223351": {"metadata": {"team_name": "silp_nlp", "email": "pankajgoyal02003@gmail.com", "submission_name": "GPT-4o", "submission_description": "I have used gpt 4o for machine Translation.", "uses_gold": false, "uses_rag": false, "uses_llm": false, "llm_name": "GPT-4o", "is_finetuned": false}, "scores": {"ko_KR": {"meta_score": 28.846910665092484, "comet_score": 91.48987932368932, "overall_score": 43.86356617716981}, "th_TH": {"meta_score": 0.08703220191470844, "comet_score": 60.812036224922174, "overall_score": 0.17381564455063667}, "de_DE": {"meta_score": 0.9189925119128658, "comet_score": 62.967376009776046, "overall_score": 1.8115459803648342}, "fr_FR": {"meta_score": 1.5004574565416284, "comet_score": 74.14876950245217, "overall_score": 2.9413935492995464}, "tr_TR": {"meta_score": 9.210820478426113, "comet_score": 67.77208756739572, "overall_score": 16.217535758973867}, "ar_AE": {"meta_score": 28.238398944358917, "comet_score": 88.49989097603918, "overall_score": 42.815347554221596}, "it_IT": {"meta_score": 32.97371518242448, "comet_score": 89.63512776879153, "overall_score": 48.21190873751812}, "es_ES": {"meta_score": 2.210565754964406, "comet_score": 78.62158130174632, "overall_score": 4.3002241448530585}, "zh_TW": {"meta_score": 0.11578541103820918, "comet_score": 69.29883727784416, "overall_score": 0.23118455587226788}, "ja_JP": {"meta_score": 31.14722004698512, "comet_score": 92.73146558713428, "overall_score": 46.63154680947599}, "overall": {"meta_score": 13.524989865365892, "comet_score": 77.59770515397909, "overall_score": 20.71980689122997}}}}
48
+ {"226836": {"metadata": {"team_name": "RAGthoven", "email": "karetka.gregor@gmail.com", "submission_name": "GPT-4o + RAG", "submission_description": "Prompted GPT-4o to do the translation. In order to improve translations we used RAG and included most similar examples in system prompt.", "uses_gold": false, "uses_rag": true, "uses_llm": false, "llm_name": "GPT-4o", "is_finetuned": false}, "scores": {"ko_KR": {"meta_score": 46.477764659582846, "comet_score": 93.26159822124517, "overall_score": 62.03821921816277}, "th_TH": {"meta_score": 30.809399477806785, "comet_score": 88.36267748569315, "overall_score": 45.688572339292904}, "de_DE": {"meta_score": 45.932607215793055, "comet_score": 90.93609496322192, "overall_score": 61.0356036871101}, "fr_FR": {"meta_score": 46.29460201280878, "comet_score": 89.97584948873477, "overall_score": 61.13425319939118}, "tr_TR": {"meta_score": 50.10060362173038, "comet_score": 93.72567955197142, "overall_score": 65.2970099316206}, "ar_AE": {"meta_score": 43.567187156366835, "comet_score": 91.02493929081706, "overall_score": 58.92916132112914}, "it_IT": {"meta_score": 48.705374656728125, "comet_score": 91.5986316432762, "overall_score": 63.59541384285135}, "es_ES": {"meta_score": 52.547770700636946, "comet_score": 92.88100163368931, "overall_score": 67.12137492397135}, "zh_TW": {"meta_score": 41.77923581628715, "comet_score": 92.23630406172383, "overall_score": 57.50918590075851}, "ja_JP": {"meta_score": 46.6131558339859, "comet_score": 93.31366245225713, "overall_score": 62.170273613012725}, "overall": {"meta_score": 45.28277011517268, "comet_score": 91.73164387926299, "overall_score": 60.45190679773007}}}}
49
+ {"224200": {"metadata": {"team_name": "ASL_CUET", "email": "arif1904129@gmail.com", "submission_name": "GPT-4o-EntityAware-FR-IT", "submission_description": "This submission uses GPT-4o fine-tuned for Entity-Aware Machine Translation (EA-MT) from English to French and Italian. The system incorporates named entity recognition (NER) and context-based re-ranking to improve the translation of rare and ambiguous named entities. It also leverages external reference databases to enhance entity translation consistency.", "uses_gold": false, "uses_rag": false, "uses_llm": false, "llm_name": "GPT-4o", "is_finetuned": true}, "scores": {}}}
50
+ {"226782": {"metadata": {"team_name": "Lunar", "email": "suzie_oh@korea.ac.kr", "submission_name": "LLaMA-RAFT-Plus-Gold", "submission_description": "(I have already submitted the Google Form for this submission ID, but I was not completely sure if I submitted the correct content, so I am submitting it once more. I appreciate your understanding.) We trained LLaMA-3.1-8B-Instruct using a multi-turn dialogue setup that incorporates function calls to: (1) identify key entities in the source text that require lookup via function calls (e.g., [search(\"The Great Gatsby\")]) and (2) retrieve their corresponding entities in the target language using the Wikipedia API. These retrieved entities are then leveraged to generate the final translation. While the system is capable of performing searches at test time, this version utilizes \u201cgold\u201d information to achieve its highest possible performance. Additionally, it integrates a validation dataset during training to further enhance performance.", "uses_gold": true, "uses_rag": true, "uses_llm": false, "llm_name": "Llama-3.1-8B-Instruct", "is_finetuned": true}, "scores": {"ko_KR": {"meta_score": 90.73199527744983, "comet_score": 95.45637002637751, "overall_score": 93.0342441140467}, "th_TH": {"meta_score": 91.23875834058602, "comet_score": 93.49164055773304, "overall_score": 92.3514619205378}, "de_DE": {"meta_score": 85.95983662355344, "comet_score": 94.32884611955218, "overall_score": 89.95009645590295}, "fr_FR": {"meta_score": 90.11893870082342, "comet_score": 94.23341054111474, "overall_score": 92.13026015718773}, "tr_TR": {"meta_score": 83.90342052313883, "comet_score": 95.21089883597405, "overall_score": 89.20023940022666}, "ar_AE": {"meta_score": 88.80580602595117, "comet_score": 94.106136537369, "overall_score": 91.3791761223704}, "it_IT": {"meta_score": 92.64417418595528, "comet_score": 95.7471467159972, "overall_score": 94.17010609296203}, "es_ES": {"meta_score": 89.82765080554515, "comet_score": 
95.1283519803562, "overall_score": 92.40204432067017}, "zh_TW": {"meta_score": 68.33269008104979, "comet_score": 93.60909068396955, "overall_score": 78.99827891553336}, "ja_JP": {"meta_score": 91.44479248238058, "comet_score": 95.68192500944892, "overall_score": 93.5153877979588}, "overall": {"meta_score": 87.30080630464336, "comet_score": 94.69938170078925, "overall_score": 90.71312952973965}}}}
51
+ {"226756": {"metadata": {"team_name": "pingan_team", "email": "ytimespace@gmail.com", "submission_name": "Phi4-FullFT", "submission_description": "Using phi4 as the base model and wiki-id as the input, it was trained with full fine-tuning", "uses_gold": true, "uses_rag": false, "uses_llm": false, "llm_name": "Phi-4", "is_finetuned": true}, "scores": {"ko_KR": {"meta_score": 90.88941361668634, "comet_score": 95.17916312653078, "overall_score": 92.9848389933758}, "th_TH": {"meta_score": 91.12271540469973, "comet_score": 91.40778800603925, "overall_score": 91.2650290949386}, "de_DE": {"meta_score": 85.500340367597, "comet_score": 94.13203335055857, "overall_score": 89.6088018476516}, "fr_FR": {"meta_score": 91.27172918572735, "comet_score": 94.21127247297753, "overall_score": 92.7182078195878}, "tr_TR": {"meta_score": 83.92577688352335, "comet_score": 95.52361398249059, "overall_score": 89.34991058496418}, "ar_AE": {"meta_score": 91.2249835056081, "comet_score": 92.97754985763919, "overall_score": 92.09292942164629}, "it_IT": {"meta_score": 92.85994507650058, "comet_score": 95.80958084517154, "overall_score": 94.31170584250886}, "es_ES": {"meta_score": 90.08992131884601, "comet_score": 95.04400013079577, "overall_score": 92.50067655419771}, "zh_TW": {"meta_score": 80.87610961018912, "comet_score": 94.17915894793342, "overall_score": 87.02216214120413}, "ja_JP": {"meta_score": 91.26859827721222, "comet_score": 95.90603151798435, "overall_score": 93.52986644134609}, "overall": {"meta_score": 88.90295332465898, "comet_score": 94.4370192238121, "overall_score": 91.53841287414211}}}}
52
+ {"222504": {"metadata": {"team_name": "HausaNLP", "email": "abubakarabdulhamid@gmail.com", "submission_name": "FT-NLLB", "submission_description": "The team fine-tuned the NLLB-200 model (distilled 600M version) using the training data from the task repository. Furthermore, wikidata id from the task repo were extracted and used to further finetune another NLLB-200 model. The test data was then used for translation using the fine-tuned models.", "uses_gold": true, "uses_rag": false, "uses_llm": false, "llm_name": "NLLB-200-600M", "is_finetuned": true}, "scores": {"de_DE": {"meta_score": 20.864533696392105, "comet_score": 88.41545873729244, "overall_score": 33.7618493014752}, "fr_FR": {"meta_score": 22.854528819762123, "comet_score": 85.06995498261534, "overall_score": 36.029521186428305}, "ar_AE": {"meta_score": 20.606993622168464, "comet_score": 87.97940959871694, "overall_score": 33.392599417717854}, "it_IT": {"meta_score": 27.285209886229893, "comet_score": 89.51704785105012, "overall_score": 41.82267511481522}, "es_ES": {"meta_score": 32.74634694642188, "comet_score": 91.2641737055984, "overall_score": 48.198625088077975}, "ja_JP": {"meta_score": 12.744714173844947, "comet_score": 88.85967228255332, "overall_score": 22.29216994108149}}}}
53
+ {"222829": {"metadata": {"team_name": "CHILL", "email": "jaebok123@yonsei.ac.kr", "submission_name": "GPT4o-RAG-Refine", "submission_description": "We used GPT-4o to generate translations for each sentence. Using the gold Wikidata ID, we applied a Retrieval-Augmented Generation (RAG) approach that provides the entity\u2019s English label, foreign label, and description. Since the task emphasizes both entity label accuracy and overall translation quality, we integrated a self-refinement framework. This means GPT provided feedback on the initial translation from both perspectives and then revised it accordingly by itself.", "uses_gold": true, "uses_rag": true, "uses_llm": false, "llm_name": "GPT-4o", "is_finetuned": false}, "scores": {"ko_KR": {"meta_score": 90.85005903187722, "comet_score": 95.20878782954004, "overall_score": 92.97836830204355}, "th_TH": {"meta_score": 91.5288656803017, "comet_score": 94.25538139115702, "overall_score": 92.87211675894737}, "de_DE": {"meta_score": 85.2280462899932, "comet_score": 94.07864393244154, "overall_score": 89.43491188228616}, "fr_FR": {"meta_score": 89.95425434583714, "comet_score": 93.54283510166248, "overall_score": 91.71345448913122}, "tr_TR": {"meta_score": 84.8647440196736, "comet_score": 95.62860799134243, "overall_score": 89.92572022982515}, "ar_AE": {"meta_score": 91.86276665933583, "comet_score": 94.23240716178941, "overall_score": 93.03250001713178}, "it_IT": {"meta_score": 92.42840329540996, "comet_score": 95.65071583834757, "overall_score": 94.0119560291437}, "es_ES": {"meta_score": 89.88385162982391, "comet_score": 94.99522890936082, "overall_score": 92.36888279548127}, "zh_TW": {"meta_score": 77.76920108066385, "comet_score": 93.86420877329036, "overall_score": 85.0620462831672}, "ja_JP": {"meta_score": 90.8574784651527, "comet_score": 95.60639457711505, "overall_score": 93.17146313325851}, "overall": {"meta_score": 88.5227670498069, "comet_score": 94.70632115060467, "overall_score": 91.45714199204158}}}}
requirements.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ gradio
2
+ pandas