Elron committed on
Commit
77ed1a0
1 Parent(s): 8d7c981

Upload formats.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. formats.py +44 -2
formats.py CHANGED
@@ -1,3 +1,4 @@
 
1
  from typing import (
2
  Any,
3
  Dict,
@@ -14,9 +15,51 @@ class Format(StreamInstanceOperator):
14
  pass
15
 
16
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17
  class SystemFormat(Format):
18
  r"""Generates the whole input to the model, from constant strings that are given as args, and from values found in specified fields of the instance.
19
 
 
 
20
  SystemFormat expects the input instance to contain:
21
  1. A field named "system_prompt" whose value is a string (potentially empty) that delivers a task independent opening text.
22
  2. A field named "source" whose value is a string verbalizing the original values in the instance (as read
@@ -107,7 +150,6 @@ class SystemFormat(Format):
107
  instance=instance, field_name="system_prompt"
108
  )
109
 
110
- # pop "system_prompt", "instruction", and "target_prefix" from instance
111
  if "target_prefix" in instance:
112
  instance.pop("target_prefix")
113
  if "instruction" in instance:
@@ -122,7 +164,6 @@ class SystemFormat(Format):
122
  demos is not None and isoftype(demos, List[Dict[str, Any]])
123
  ), f"A list of dict-s is expected in field '{self.demos_field}'. Received instance: {instance}"
124
  demo_instances = demos
125
- # pop demos from instance
126
  instance.pop(self.demos_field)
127
 
128
  demos_string = ""
@@ -143,5 +184,6 @@ class SystemFormat(Format):
143
  target_prefix=target_prefix,
144
  **self.format_args,
145
  )
 
146
  instance["source"] = output
147
  return instance
 
1
+ import re
2
  from typing import (
3
  Any,
4
  Dict,
 
15
  pass
16
 
17
 
18
def apply_capital_new_line_notation(text: str) -> str:
    r"""Resolve the Capital New Line Notation (\N) in ``text``.

    ``\N`` (a backslash followed by a capital N, i.e. two characters) is a
    "soft" newline marker. A *run* is any mix of real newline characters and
    ``\N`` markers that ends with a ``\N`` marker. Runs are resolved as follows:

    1. A run located at the very beginning of the string is removed entirely,
       because there is no preceding text to separate from.
    2. Any other run is replaced by exactly one newline character.

    Real newlines that are not followed (within the same run) by a ``\N``
    marker are left untouched, and so is every other character, including
    ``N``, ``(``, ``)`` and lone backslashes that merely sit next to a marker.

    Args:
        text: The string to transform; may contain ``\N`` markers mixed with
            real newline characters.

    Returns:
        The string with every run resolved according to the rules above.

    Examples:
        >>> apply_capital_new_line_notation("Hello World\n\n\\N")
        'Hello World\n'

        >>> apply_capital_new_line_notation("\n\n\\NGoodbye World")
        'Goodbye World'

        >>> apply_capital_new_line_notation("\\N")
        ''

        >>> apply_capital_new_line_notation("JSON\\N{}")
        'JSON\n{}'
    """
    # A run: any mix of real newlines and \N markers that ends with a \N marker.
    # A non-capturing group is required here; a character class such as
    # [\n(\\N)] would match the single characters '(', ')', '\' and 'N' and
    # would therefore eat ordinary text adjacent to the marker.
    marker_run = r"(?:\n|\\N)*\\N"

    # No flags are passed, so '^' anchors only at the start of the string:
    # a leading run has nothing before it and is dropped.
    text = re.sub("^" + marker_run, "", text)

    # Every remaining run collapses into exactly one newline.
    return re.sub(marker_run, "\n", text)
56
+
57
+
58
  class SystemFormat(Format):
59
  r"""Generates the whole input to the model, from constant strings that are given as args, and from values found in specified fields of the instance.
60
 
61
+ Important: formats can use the '\N' notation, which stands for a single new-line unless a new-line already precedes it, and for nothing at all at the very start of the text.
62
+
63
  SystemFormat expects the input instance to contain:
64
  1. A field named "system_prompt" whose value is a string (potentially empty) that delivers a task independent opening text.
65
  2. A field named "source" whose value is a string verbalizing the original values in the instance (as read
 
150
  instance=instance, field_name="system_prompt"
151
  )
152
 
 
153
  if "target_prefix" in instance:
154
  instance.pop("target_prefix")
155
  if "instruction" in instance:
 
164
  demos is not None and isoftype(demos, List[Dict[str, Any]])
165
  ), f"A list of dict-s is expected in field '{self.demos_field}'. Received instance: {instance}"
166
  demo_instances = demos
 
167
  instance.pop(self.demos_field)
168
 
169
  demos_string = ""
 
184
  target_prefix=target_prefix,
185
  **self.format_args,
186
  )
187
+ output = apply_capital_new_line_notation(output)
188
  instance["source"] = output
189
  return instance