|  | import string | 
					
						
						|  |  | 
					
						
						|  | import numpy as np | 
					
						
						|  | import torch | 
					
						
						|  | from laser_encoders import LaserEncoderPipeline | 
					
						
						|  | from scipy.spatial.distance import cosine | 
					
						
						|  | from simalign import SentenceAligner | 
					
						
						|  | from transformers import AutoModel, AutoTokenizer | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
# Word aligner shared by the alignment check; loaded once at import time.
# NOTE(review): `from_tf` does not appear to be a documented SentenceAligner
# argument in upstream simalign -- confirm the installed version accepts it.
aligner = SentenceAligner(model="xlm-roberta-base", layer=6, from_tf = True)

# LASER sentence encoders, one per language: German ("deu_Latn") is used for
# the source sentence and English ("eng_Latn") for the target sentence.
de_encoder = LaserEncoderPipeline(lang="deu_Latn")
en_encoder = LaserEncoderPipeline(lang="eng_Latn")
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | def accuracy(src_sentence: str, trg_sentence: str) -> dict: | 
					
						
						|  | """ | 
					
						
						|  | Calculate the accuracy of a translation by comparing the source and target | 
					
						
						|  | sentences. | 
					
						
						|  |  | 
					
						
						|  | Parameters: | 
					
						
						|  | src_sentence (str): The source sentence. | 
					
						
						|  | trg_sentence (str): The target sentence. | 
					
						
						|  |  | 
					
						
						|  | Returns: | 
					
						
						|  | dict: A dictionary containing the accuracy score and errors. | 
					
						
						|  | """ | 
					
						
						|  |  | 
					
						
						|  | src_sentence = __preprocess_text(src_sentence) | 
					
						
						|  | trg_sentence = __preprocess_text(trg_sentence) | 
					
						
						|  |  | 
					
						
						|  | r = __get_alignment_score(src_sentence, trg_sentence) | 
					
						
						|  | score = __get_bertscore(src_sentence, trg_sentence) | 
					
						
						|  |  | 
					
						
						|  | res = {"score": __bertscore_to_percentage(score), "errors": r} | 
					
						
						|  | return res | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | def __preprocess_text(text: str) -> str: | 
					
						
						|  | """ | 
					
						
						|  | Remove punctuation and convert text to lowercase. | 
					
						
						|  |  | 
					
						
						|  | Parameters: | 
					
						
						|  | text (str): The text to preprocess. | 
					
						
						|  |  | 
					
						
						|  | Returns: | 
					
						
						|  | str: The preprocessed text. | 
					
						
						|  | """ | 
					
						
						|  |  | 
					
						
						|  | text = text.translate(str.maketrans("", "", string.punctuation)) | 
					
						
						|  |  | 
					
						
						|  | text = text.lower() | 
					
						
						|  | return text | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | def __get_bertscore(src_sentence: str, trg_sentence: str) -> float: | 
					
						
						|  | """ | 
					
						
						|  | Get the BERTScore between two sentences. | 
					
						
						|  |  | 
					
						
						|  | Parameters: | 
					
						
						|  | src_sentence (str): The source sentence. | 
					
						
						|  | trg_sentence (str): The target sentence. | 
					
						
						|  |  | 
					
						
						|  | Returns: | 
					
						
						|  | float: The BERTScore. | 
					
						
						|  | """ | 
					
						
						|  |  | 
					
						
						|  | emb_src = de_encoder.encode_sentences([src_sentence])[0] | 
					
						
						|  | emb_tgt = en_encoder.encode_sentences([trg_sentence])[0] | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | similarity = 1 - cosine(emb_src, emb_tgt) | 
					
						
						|  |  | 
					
						
						|  | return similarity | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | def __bertscore_to_percentage(similarity: float, debug: bool = False) -> float: | 
					
						
						|  | """ | 
					
						
						|  | Convert the BERTScore cosine similarity to a percentage score (0-100). | 
					
						
						|  |  | 
					
						
						|  | Parameters: | 
					
						
						|  | similarity (float): The cosine similarity from BERTScore. | 
					
						
						|  |  | 
					
						
						|  | Returns: | 
					
						
						|  | int: A score from 0 to 100. | 
					
						
						|  | """ | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | if debug: | 
					
						
						|  | scaled_score = similarity | 
					
						
						|  | else: | 
					
						
						|  | scaled_score = max( | 
					
						
						|  | 100 / (1 + np.exp(-11 * (similarity - 0.60))), | 
					
						
						|  | 100 / (1 + np.exp(-5 * (similarity - 0.60))), | 
					
						
						|  | ) | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | return round(scaled_score, 2) | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | def __get_alignment_score(src_sentence: str, trg_sentence: str) -> list: | 
					
						
						|  | """ | 
					
						
						|  | Get the alignment score between two sentences. | 
					
						
						|  |  | 
					
						
						|  | Parameters: | 
					
						
						|  | src_sentence (str): The source sentence. | 
					
						
						|  | trg_sentence (str): The target sentence. | 
					
						
						|  |  | 
					
						
						|  | Returns: | 
					
						
						|  | list: Mistranslations | 
					
						
						|  | """ | 
					
						
						|  | src_list = src_sentence.split() | 
					
						
						|  | trg_list = trg_sentence.split() | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | alignments = aligner.get_word_aligns(src_list, trg_list) | 
					
						
						|  |  | 
					
						
						|  | src_aligns = {x[0] for x in alignments["inter"]} | 
					
						
						|  | trg_aligns = {x[1] for x in alignments["inter"]} | 
					
						
						|  |  | 
					
						
						|  | mistranslations = [] | 
					
						
						|  | for i in range(len(src_list)): | 
					
						
						|  | if i not in src_aligns: | 
					
						
						|  | mistranslations.append( | 
					
						
						|  | { | 
					
						
						|  | "start": i, | 
					
						
						|  | "end": i, | 
					
						
						|  | "message": f"Word {src_list[i]} possibly mistranslated or omitted", | 
					
						
						|  | } | 
					
						
						|  | ) | 
					
						
						|  |  | 
					
						
						|  | for i in range(len(trg_list)): | 
					
						
						|  | if i not in trg_aligns: | 
					
						
						|  | mistranslations.append( | 
					
						
						|  | { | 
					
						
						|  | "start": i, | 
					
						
						|  | "end": i, | 
					
						
						|  | "message": f"Word {trg_list[i]} possibly mistranslated or added erroneously", | 
					
						
						|  | } | 
					
						
						|  | ) | 
					
						
						|  |  | 
					
						
						|  | return mistranslations | 
					
						
						|  |  |