Spaces:

danieldux
/

isco_hierarchical_accuracy

Running

App Files Files Community

danieldux committed on Mar 30, 2024

Commit

d726519

1 Parent(s): 5af5762

Refactor ISCO_Hierarchical_Accuracy class to improve code readability and add input validation

Browse files

Files changed (1) hide show

isco_hierarchical_accuracy.py +37 -24

isco_hierarchical_accuracy.py CHANGED Viewed

@@ -16,9 +16,7 @@
 from typing import List, Set, Dict, Tuple
 import evaluate
 import datasets
-# import ham
-# import isco
 # TODO: Add BibTeX citation
@@ -264,40 +262,55 @@ class ISCO_Hierarchical_Accuracy(evaluate.Metric):
         print("Weighted ISCO hierarchy dictionary created as isco_hierarchy")
         # print(self.isco_hierarchy)
-    # Extract the leading ISCO code from an ISCO_CODE_TITLE string
-    def _extract_isco_code(isco_code_title: str):  # NOTE(review): declared without `self`, yet _compute invokes it as self._extract_isco_code(...)
-        # ISCO_CODE_TITLE is a string like "7412 Electrical Mechanics and Fitters", so only the first whitespace-separated token is kept for the evaluation.
-        return isco_code_title.split()[0]
     def _compute(self, predictions, references):
-        """Returns the accuracy scores."""
-        # Convert the inputs to strings
-        if len(predictions[0]) > 4:
-            extracted_predictions = []
-            extracted_references = []
-            for p in predictions:
-                extracted_predictions.append(self._extract_isco_code(p))
-            for r in references:
-                extracted_references.append(self._extract_isco_code(r))
-            predictions = extracted_predictions
-            references = extracted_references
         predictions = [str(p) for p in predictions]
         references = [str(r) for r in references]
         # Calculate accuracy
         accuracy = sum(i == j for i, j in zip(predictions, references)) / len(
             predictions
         )
         # Calculate hierarchical precision, recall and f-measure
-        hierarchy = self.isco_hierarchy
         hP, hR = self.calculate_hierarchical_precision_recall(
-            references, predictions, hierarchy
         )
         hF = self.hierarchical_f_measure(hP, hR)
-        print(
-            f"Accuracy: {accuracy}, Hierarchical Precision: {hP}, Hierarchical Recall: {hR}, Hierarchical F-measure: {hF}"
-        )
         return {
             "accuracy": accuracy,

 from typing import List, Set, Dict, Tuple
 import evaluate
 import datasets
+import re
 # TODO: Add BibTeX citation
         print("Weighted ISCO hierarchy dictionary created as isco_hierarchy")
         # print(self.isco_hierarchy)
+    # Return True when `code` is made up of exactly four digits, otherwise False
+    def _is_valid_code(self, code: str):
+        # Pattern anchored at both ends: four digits and nothing else (note that `$` also matches just before a single trailing newline)
+        pattern = r"^\d{4}$"
+        if re.match(pattern, code):
+            return True
+        else:
+            return False
+    def _validate_codes(self, codes: list, code_type):  # Raise ValueError unless every entry of `codes` passes _is_valid_code; returns None otherwise
+        if not all(self._is_valid_code(code) for code in codes):
+            raise ValueError(  # `code_type` is interpolated into the message ("prediction" / "reference" when called from _compute)
+                f"All {code_type} labels must start with a 4-digit ISCO-08 code string."  # NOTE(review): wording says "start with", but the check requires the whole string to be four digits
+            )
     def _compute(self, predictions, references):
+        """
+        Computes the accuracy scores, hierarchical precision, recall, and f-measure.
+        Args:
+            predictions (List[str]): A list of 4-digit ISCO-08 prediction label strings.
+            references (List[str]): A list of 4-digit ISCO-08 reference label strings.
+        Returns:
+            dict: A dictionary containing the accuracy, hierarchical precision, hierarchical recall,
+                  and hierarchical f-measure scores.
+        """
+        # Cast all prediction labels as strings
         predictions = [str(p) for p in predictions]
         references = [str(r) for r in references]
+        # Check if the first prediction label is longer than 4 characters
+        if len(predictions[0]) > 4:
+            # Extract the first 4 characters from each prediction label
+            predictions = [str(p.split()[0]) for p in predictions]
+            # Check if all prediction labels are 4-digit strings
+            self._validate_codes(predictions, "prediction")
+            # Repeat for reference labels
+            references = [str(r.split()[0]) for r in references]
+            self._validate_codes(references, "reference")
         # Calculate accuracy
         accuracy = sum(i == j for i, j in zip(predictions, references)) / len(
             predictions
         )
         # Calculate hierarchical precision, recall and f-measure
         hP, hR = self.calculate_hierarchical_precision_recall(
+            references, predictions, self.isco_hierarchy
         )
         hF = self.hierarchical_f_measure(hP, hR)
         return {
             "accuracy": accuracy,