# NOTE(review): removed non-Python scraping residue that preceded the imports
# (a "File size" line, a row of git-blame commit hashes, and a column of line
# numbers) — it was not source code and would prevent the module from parsing.
import string
import numpy as np
import torch
from laser_encoders import LaserEncoderPipeline
from scipy.spatial.distance import cosine
from simalign import SentenceAligner
from transformers import AutoModel, AutoTokenizer
# setup global variables on import (bad practice, but whatever)
# --------------------------------------------------------------
# Word aligner: SimAlign over xlm-roberta-base, hidden layer 6, loaded from
# TensorFlow weights. NOTE(review): all three models are loaded eagerly at
# import time, which is slow and makes the module hard to test in isolation;
# consider lazy initialization.
aligner = SentenceAligner(model="xlm-roberta-base", layer=6, from_tf = True)
# LASER sentence-embedding pipelines: German (source side) and English
# (target side), used by __get_bertscore.
de_encoder = LaserEncoderPipeline(lang="deu_Latn")
en_encoder = LaserEncoderPipeline(lang="eng_Latn")
def accuracy(src_sentence: str, trg_sentence: str) -> dict:
    """
    Score a translation by comparing the source and target sentences.

    Parameters:
        src_sentence (str): The source sentence.
        trg_sentence (str): The target sentence.

    Returns:
        dict: {"score": percentage similarity score (0-100),
        "errors": list of suspected mistranslations from word alignment}.
    """
    # Normalize both sides first (strip punctuation, lowercase) so the
    # aligner and the encoders see comparable input.
    cleaned_src = __preprocess_text(src_sentence)
    cleaned_trg = __preprocess_text(trg_sentence)
    alignment_errors = __get_alignment_score(cleaned_src, cleaned_trg)
    similarity = __get_bertscore(cleaned_src, cleaned_trg)
    return {
        "score": __bertscore_to_percentage(similarity),
        "errors": alignment_errors,
    }
def __preprocess_text(text: str) -> str:
    """
    Normalize text for comparison: drop all punctuation, then lowercase.

    Parameters:
        text (str): The text to preprocess.

    Returns:
        str: The preprocessed text.
    """
    # Keep every character that is not ASCII punctuation, then lowercase.
    without_punct = "".join(ch for ch in text if ch not in string.punctuation)
    return without_punct.lower()
def __get_bertscore(src_sentence: str, trg_sentence: str) -> float:
    """
    Cosine similarity between the sentence embeddings of the two sentences.

    NOTE(review): despite the name, this uses LASER sentence embeddings via
    the module-level encoders, not BERTScore. Encoder names suggest a German
    source and an English target — confirm with callers.

    Parameters:
        src_sentence (str): The source sentence.
        trg_sentence (str): The target sentence.

    Returns:
        float: Cosine similarity, nominally in [-1, 1].
    """
    src_vec = de_encoder.encode_sentences([src_sentence])[0]
    trg_vec = en_encoder.encode_sentences([trg_sentence])[0]
    # scipy's `cosine` is a *distance*, so similarity = 1 - distance.
    return 1 - cosine(src_vec, trg_vec)
def __bertscore_to_percentage(similarity: float, debug: bool = False) -> float:
    """
    Map a cosine similarity onto a 0-100 percentage score.

    Two logistic curves centred at 0.60 are evaluated and the larger value
    is kept: the steep one (k=11) sharpens the transition around the
    midpoint, while the shallower one (k=5) keeps the tails from collapsing
    toward 0 or 100 too aggressively.

    Parameters:
        similarity (float): Cosine similarity, nominally in [-1, 1]
            (rarely negative in practice).
        debug (bool): If True, skip the logistic scaling and return the
            raw similarity (rounded) instead.

    Returns:
        float: The score rounded to 2 decimal places — in (0, 100) when
        debug is False, or the rounded raw similarity when debug is True.
    """
    if debug:
        scaled_score = similarity
    else:
        scaled_score = max(
            100 / (1 + np.exp(-11 * (similarity - 0.60))),
            100 / (1 + np.exp(-5 * (similarity - 0.60))),
        )
    return round(scaled_score, 2)
def __get_alignment_score(src_sentence: str, trg_sentence: str) -> list:
    """
    Flag words that the aligner could not match across the two sentences.

    Both sentences are whitespace-tokenized and aligned with SimAlign. Any
    source word absent from the high-precision "inter" (intersection)
    alignment is reported as possibly mistranslated/omitted; any unmatched
    target word as possibly mistranslated/added.

    NOTE(review): despite the name, this returns a list of error dicts,
    not a numeric score.

    Parameters:
        src_sentence (str): The source sentence.
        trg_sentence (str): The target sentence.

    Returns:
        list: Dicts with zero-based "start"/"end" word indexes (into the
        respective sentence's token list) and a human-readable "message".
    """
    src_list = src_sentence.split()
    trg_list = trg_sentence.split()
    # SimAlign returns {method: [(src_idx, trg_idx), ...]}; "inter" is the
    # intersection of both alignment directions (zero-indexed).
    alignments = aligner.get_word_aligns(src_list, trg_list)
    src_aligns = {pair[0] for pair in alignments["inter"]}
    trg_aligns = {pair[1] for pair in alignments["inter"]}

    # enumerate instead of range(len(...)) + repeated indexing.
    mistranslations = [
        {
            "start": i,
            "end": i,
            "message": f"Word {word} possibly mistranslated or omitted",
        }
        for i, word in enumerate(src_list)
        if i not in src_aligns
    ]
    mistranslations.extend(
        {
            "start": i,
            "end": i,
            "message": f"Word {word} possibly mistranslated or added erroneously",
        }
        for i, word in enumerate(trg_list)
        if i not in trg_aligns
    )
    return mistranslations
# NOTE(review): removed a stray trailing "|" left over from extraction.