# -*- coding: utf-8 -*-
r"""
Quality Estimator Model
=======================
    Quality Estimator Model estimates a quality score for the hypothesis
    (e.g., the MT text) by looking only at the source and the MT.
"""
from argparse import Namespace
from typing import Dict, List, Tuple, Union

import pandas as pd
import torch
from polos.models.estimators import PolosEstimator, Estimator
from polos.modules.feedforward import FeedForward
from polos.modules.scalar_mix import ScalarMixWithDropout
from torchnlp.utils import collate_tensors


class QualityEstimator(PolosEstimator):
    """
    Estimator class that uses a pretrained encoder to extract features from
    the sequences and then passes those features to a feed-forward estimator.

    :param hparams: Namespace containing the hyperparameters.
    """

    def __init__(
        self,
        hparams: Namespace,
    ) -> None:
        super().__init__(hparams)

    def _build_model(self) -> Estimator:
        """
        Initializes the estimator architecture.
        """
        super()._build_model()
        if self.hparams.encoder_model != "LASER":
            self.layer = (
                int(self.hparams.layer)
                if self.hparams.layer != "mix"
                else self.hparams.layer
            )
            self.scalar_mix = (
                ScalarMixWithDropout(
                    mixture_size=self.encoder.num_layers,
                    dropout=self.hparams.scalar_mix_dropout,
                    do_layer_norm=True,
                )
                if self.layer == "mix" and self.hparams.pool != "default"
                else None
            )
        self.ff = FeedForward(
            in_dim=self.encoder.output_units * 4,
            hidden_sizes=self.hparams.hidden_sizes,
            activations=self.hparams.activations,
            dropout=self.hparams.dropout,
        )

    def read_csv(self, path: str) -> List[dict]:
        """Reads a comma separated value file.

        :param path: path to a csv file.

        :return: List of records as dictionaries.
        """
        df = pd.read_csv(path)
        df = df[["mt", "src", "score"]]
        df["mt"] = df["mt"].astype(str)
        df["src"] = df["src"].astype(str)
        df["score"] = df["score"].astype(float)
        return df.to_dict("records")

    def prepare_sample(
        self, sample: List[Dict[str, Union[str, float]]], inference: bool = False
    ) -> Union[
        Tuple[Dict[str, torch.Tensor], Dict[str, torch.Tensor]],
        Dict[str, torch.Tensor],
    ]:
        """
        Function that prepares a sample to input the model.

        :param sample: list of dictionaries.
        :param inference: If set to True, prepares only the model inputs.

        :returns: Tuple with two dictionaries (model inputs and targets).
            If `inference=True`, returns only the model inputs.
        """
        sample = collate_tensors(sample)
        mt_inputs = self.encoder.prepare_sample(sample["mt"])
        src_inputs = self.encoder.prepare_sample(sample["src"])

        # Prefix the encoder inputs so MT and source tensors can be merged
        # into a single dictionary without key collisions.
        mt_inputs = {"mt_" + k: v for k, v in mt_inputs.items()}
        src_inputs = {"src_" + k: v for k, v in src_inputs.items()}
        inputs = {**mt_inputs, **src_inputs}

        if inference:
            return inputs

        targets = {"score": torch.tensor(sample["score"], dtype=torch.float)}
        return inputs, targets

    def forward(
        self,
        mt_tokens: torch.Tensor,
        src_tokens: torch.Tensor,
        mt_lengths: torch.Tensor,
        src_lengths: torch.Tensor,
        **kwargs
    ) -> Dict[str, torch.Tensor]:
        """
        Function that encodes both the source and the MT and returns a
        quality score.

        :param mt_tokens: MT sequences [batch_size x mt_seq_len]
        :param src_tokens: SRC sequences [batch_size x src_seq_len]
        :param mt_lengths: MT lengths [batch_size]
        :param src_lengths: SRC lengths [batch_size]

        :return: Dictionary with model outputs to be passed to the loss function.
        """
        mt_sentemb = self.get_sentence_embedding(mt_tokens, mt_lengths)
        src_sentemb = self.get_sentence_embedding(src_tokens, src_lengths)

        # Combine the two sentence embeddings with their element-wise
        # absolute difference and product before the feed-forward estimator.
        diff_src = torch.abs(mt_sentemb - src_sentemb)
        prod_src = mt_sentemb * src_sentemb

        embedded_sequences = torch.cat(
            (mt_sentemb, src_sentemb, prod_src, diff_src), dim=1
        )
        return {"score": self.ff(embedded_sequences)}