from typing import Dict, List

import evaluate
from datasets import Features, Sequence, Value
from sklearn.metrics import accuracy_score

from research_eval.utils.preprocessing import absa_term_preprocess

_CITATION = """
"""

_DESCRIPTION = """
Evaluation metrics for Aspect-Based Sentiment Analysis (ABSA), including
precision, recall, and F1 score for aspect terms and polarities.
"""

_KWARGS_DESCRIPTION = """
Computes precision, recall, and F1 score for aspect terms and polarities in
Aspect-Based Sentiment Analysis (ABSA).

Args:
    predictions: List of ABSA predictions with the following structure:
        - 'aspects': Sequence of aspect annotations, each with the following keys:
            - 'term': Aspect term
            - 'polarity': Polarity of the aspect term
    references: List of ABSA references with the same structure as predictions.

Returns:
    aspect_precision: Precision score for aspect terms
    aspect_recall: Recall score for aspect terms
    aspect_f1: F1 score for aspect terms
    polarity_precision: Precision score for aspect polarities
    polarity_recall: Recall score for aspect polarities
    polarity_f1: F1 score for aspect polarities
"""


class AbsaEvaluatorTest(evaluate.Metric):
    def _info(self):
        return evaluate.MetricInfo(
            description=_DESCRIPTION,
            citation=_CITATION,
            inputs_description=_KWARGS_DESCRIPTION,
            features=Features(
                {
                    "predictions": Features(
                        {
                            "aspects": Features(
                                {
                                    "term": Sequence(Value("string")),
                                    "polarity": Sequence(Value("string")),
                                }
                            ),
                            "category": Features(
                                {
                                    "category": Sequence(Value("string")),
                                    "polarity": Sequence(Value("string")),
                                }
                            ),
                        }
                    ),
                    "references": Features(
                        {
                            "aspects": Features(
                                {
                                    "term": Sequence(Value("string")),
                                    "polarity": Sequence(Value("string")),
                                }
                            ),
                            "category": Features(
                                {
                                    "category": Sequence(Value("string")),
                                    "polarity": Sequence(Value("string")),
                                }
                            ),
                        }
                    ),
                }
            ),
        )

    def _compute(self, predictions, references):
        # Preprocess and align aspect-term annotations.
        (
            truth_aspect_terms,
            pred_aspect_terms,
            truth_term_polarities,
            pred_term_polarities,
        ) = absa_term_preprocess(
            references=references,
            predictions=predictions,
            subtask_key="aspects",
            subtask_value="term",
        )

        # Evaluate aspect-term extraction and term-polarity classification.
        term_results = self.semeval_metric(truth_aspect_terms, pred_aspect_terms)
        term_polarity_acc = accuracy_score(truth_term_polarities, pred_term_polarities)

        # Preprocess and align category annotations.
        (
            truth_categories,
            pred_categories,
            truth_cat_polarities,
            pred_cat_polarities,
        ) = absa_term_preprocess(
            references=references,
            predictions=predictions,
            subtask_key="category",
            subtask_value="category",
        )

        # Evaluate category detection and category-polarity classification.
        category_results = self.semeval_metric(truth_categories, pred_categories)
        cat_polarity_acc = accuracy_score(truth_cat_polarities, pred_cat_polarities)

        return {
            "term_extraction_results": term_results,
            "term_polarity_results_accuracy": term_polarity_acc,
            "category_detection_results": category_results,
            "category_polarity_results_accuracy": cat_polarity_acc,
        }

    def semeval_metric(
        self, truths: List[List[str]], preds: List[List[str]]
    ) -> Dict[str, float]:
        """
        Implements evaluation for extraction tasks using precision, recall,
        and F1 score.

        Parameters:
        - truths: List of lists, where each inner list contains the ground
          truth labels for a sample.
        - preds: List of lists, where each inner list contains the predicted
          labels for a sample.

        Returns:
        - A dictionary containing the precision, recall, F1 score, and counts
          of common, retrieved, and relevant labels.

        Reference implementation:
        https://github.com/davidsbatista/Aspect-Based-Sentiment-Analysis/blob/1d9c8ec1131993d924e96676fa212db6b53cb870/libraries/baselines.py#L387
        """
        b = 1  # F-beta weight; b=1 yields the standard F1 score.
        common, relevant, retrieved = 0.0, 0.0, 0.0
        for truth, pred in zip(truths, preds):
            common += len([a for a in pred if a in truth])
            retrieved += len(pred)
            relevant += len(truth)
        precision = common / retrieved if retrieved > 0 else 0.0
        recall = common / relevant if relevant > 0 else 0.0
        f1 = (
            (1 + (b**2)) * precision * recall / ((precision * b**2) + recall)
            if precision > 0 and recall > 0
            else 0.0
        )
        return {
            "precision": precision,
            "recall": recall,
            "f1_score": f1,
            "common": common,
            "retrieved": retrieved,
            "relevant": relevant,
        }
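

# -----------------------------------------------------------------------------
# Usage sketch (illustrative only, not part of the original module): a minimal,
# self-contained call to `semeval_metric` on toy data, assuming only that this
# module's own imports resolve. End-to-end evaluation would instead go through
# `compute()` with predictions/references shaped like the Features schema above,
# and relies on `absa_term_preprocess` to align the aspect/category annotations.
if __name__ == "__main__":
    metric = AbsaEvaluatorTest()
    truths = [["battery life", "screen"], ["price"]]
    preds = [["battery life"], ["price", "design"]]
    # common=2, retrieved=3, relevant=3 -> precision = recall = f1 ~= 0.667
    print(metric.semeval_metric(truths, preds))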