Spaces:
Running
Running
vineelpratap
committed on
Commit
•
2bc2fff
1
Parent(s):
9981498
Update utils/text_norm.py
Browse files
- utils/text_norm.py +8 -3
utils/text_norm.py
CHANGED
@@ -6,7 +6,12 @@ from utils.norm_config import norm_config
|
|
6 |
|
7 |
|
8 |
def text_normalize(
|
9 |
-
text,
|
|
|
|
|
|
|
|
|
|
|
10 |
):
|
11 |
|
12 |
"""Given a text, normalize it by changing to lower case, removing punctuations, removing words that only contain digits and removing extra spaces
|
@@ -95,7 +100,7 @@ def text_normalize(
|
|
95 |
|
96 |
normalized_text = unidecode(normalized_text)
|
97 |
|
98 |
-
|
99 |
-
|
100 |
|
101 |
return normalized_text
|
|
|
6 |
|
7 |
|
8 |
def text_normalize(
|
9 |
+
text,
|
10 |
+
iso_code="xxx",
|
11 |
+
lower_case=True,
|
12 |
+
remove_numbers=False,
|
13 |
+
remove_brackets=False,
|
14 |
+
rm_extra_spaces=False,
|
15 |
):
|
16 |
|
17 |
"""Given a text, normalize it by changing to lower case, removing punctuations, removing words that only contain digits and removing extra spaces
|
|
|
100 |
|
101 |
normalized_text = unidecode(normalized_text)
|
102 |
|
103 |
+
if rm_extra_spaces:
|
104 |
+
normalized_text = re.sub(r"\s+", " ", normalized_text).strip()
|
105 |
|
106 |
return normalized_text
|