Alex Cabrera committed on
Commit
95b368a
1 Parent(s): 8ceca60
Files changed (1) hide show
  1. modeling.py +29 -2
modeling.py CHANGED
@@ -2,6 +2,7 @@
2
  from __future__ import annotations
3
 
4
  import os
 
5
  from dataclasses import dataclass
6
 
7
  import config
@@ -58,6 +59,27 @@ def process_data(
58
  return data
59
 
60
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
61
  def process_output(
62
  input_dir: str,
63
  lang_pairs: list[str],
@@ -66,7 +88,8 @@ def process_output(
66
  """Load model outputs."""
67
  # Load the data
68
  data: list[str] = []
69
- model_path = config.model_configs[model_preset].path
 
70
  system_dir = os.path.join(input_dir, "evaluation", "system-outputs", model_path)
71
  for lang_pair in lang_pairs:
72
  src_lang, trg_lang = lang_pair[:2], lang_pair[2:]
@@ -75,5 +98,9 @@ def process_output(
75
  )
76
  with open(sys_file, "r") as sys_in:
77
  for sys_line in sys_in:
78
- data.append(sys_line.strip())
 
 
 
 
79
  return data
 
2
  from __future__ import annotations
3
 
4
  import os
5
+ import re
6
  from dataclasses import dataclass
7
 
8
  import config
 
59
  return data
60
 
61
 
62
def remove_leading_language(line: str) -> str:
    """Remove a language name at the beginning of the string.

    Some zero-shot models output the name of the language at the beginning of the
    string. This is a manual post-processing function that removes the language name
    (partly as an example of how you can do simple fixes to issues that come up during
    analysis using Zeno).

    Only a prefix of the exact form ``"<Language>: "`` (language name, colon, one
    space) at the very start of the string is removed, and at most one such prefix
    is stripped per call. Matching is case-sensitive.

    Args:
        line: The line to process.

    Returns:
        The line with the leading language name removed, or the line unchanged
        if it does not start with one of the known language prefixes.
    """
    return re.sub(
        # "Ukrainian" is the correct spelling; the misspelled "Ukranian" is kept
        # so that any output the previous pattern matched is still cleaned up.
        r"^(English|Japanese|Chinese|Hausa|Icelandic|French|German|Russian"
        r"|Ukrainian|Ukranian): ",
        "",
        line,
    )
81
+
82
+
83
  def process_output(
84
  input_dir: str,
85
  lang_pairs: list[str],
 
88
  """Load model outputs."""
89
  # Load the data
90
  data: list[str] = []
91
+ model_config = config.model_configs[model_preset]
92
+ model_path = model_config.path
93
  system_dir = os.path.join(input_dir, "evaluation", "system-outputs", model_path)
94
  for lang_pair in lang_pairs:
95
  src_lang, trg_lang = lang_pair[:2], lang_pair[2:]
 
98
  )
99
  with open(sys_file, "r") as sys_in:
100
  for sys_line in sys_in:
101
+ sys_line = sys_line.strip()
102
+ if model_config.post_processors is not None:
103
+ for postprocessor in model_config.post_processors:
104
+ sys_line = postprocessor(sys_line)
105
+ data.append(sys_line)
106
  return data