import re
from typing import Dict

# Matches any single non-printing character: code points 0-31 (the C0
# controls, U+0000-U+001F) and 127-159 (DEL plus the C1 controls,
# U+007F-U+009F).
# The character class is built from the raw characters themselves; none of
# these code points is a regex metacharacter, so no escaping is required
# inside the brackets.
non_printing_characters_re: re.Pattern = re.compile(
    f"[{''.join(map(chr, (*range(0, 32), *range(127, 160))))}]"
)

# Matches a single decimal digit. This is a str pattern compiled without
# re.ASCII, so \d matches any Unicode decimal digit, not only 0-9.
digits_re: re.Pattern = re.compile(r"\d")

# Maps a single Unicode punctuation character to its ASCII replacement text.
#
# Keys are written as \uXXXX escapes so the table stays intact even if the
# file is re-encoded or read with the wrong codec.
#
# NOTE(review): the keys were reconstructed from mis-decoded text (UTF-8 bytes
# that had been read as a single-byte Greek code page, which collapsed several
# distinct keys into the same garbled string). Where only the leading bytes
# survived, the code point was chosen to fit the replacement value -- confirm
# the table against a known-good copy of this file.
unicode_punctuation: Dict[str, str] = {
    "\uff0c": ",",    # FULLWIDTH COMMA
    "\u3002": ".",    # IDEOGRAPHIC FULL STOP
    "\u3001": ",",    # IDEOGRAPHIC COMMA
    "\u201e": '"',    # DOUBLE LOW-9 QUOTATION MARK
    "\u201d": '"',    # RIGHT DOUBLE QUOTATION MARK
    "\u201c": '"',    # LEFT DOUBLE QUOTATION MARK
    "\u00ab": '"',    # LEFT-POINTING DOUBLE ANGLE QUOTATION MARK
    "\u00bb": '"',    # RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK
    # NOTE(review): FULLWIDTH DIGIT ONE mapped to a double quote looks
    # unintended (U+FF02 FULLWIDTH QUOTATION MARK seems more plausible);
    # kept as reconstructed -- confirm.
    "\uff11": '"',    # FULLWIDTH DIGIT ONE
    "\u300d": '"',    # RIGHT CORNER BRACKET
    "\u300c": '"',    # LEFT CORNER BRACKET
    "\u300a": '"',    # LEFT DOUBLE ANGLE BRACKET
    "\u300b": '"',    # RIGHT DOUBLE ANGLE BRACKET
    "\u00b4": "'",    # ACUTE ACCENT
    "\u2236": ":",    # RATIO
    "\uff1a": ":",    # FULLWIDTH COLON
    "\uff1f": "?",    # FULLWIDTH QUESTION MARK
    "\uff01": "!",    # FULLWIDTH EXCLAMATION MARK
    "\uff08": "(",    # FULLWIDTH LEFT PARENTHESIS
    "\uff09": ")",    # FULLWIDTH RIGHT PARENTHESIS
    "\uff1b": ";",    # FULLWIDTH SEMICOLON
    "\u2013": "-",    # EN DASH
    "\u2014": " - ",  # EM DASH (padded with spaces)
    "\uff0e": ". ",   # FULLWIDTH FULL STOP (followed by a space)
    "\uff5e": "~",    # FULLWIDTH TILDE
    "\u2019": "'",    # RIGHT SINGLE QUOTATION MARK
    "\u2026": "...",  # HORIZONTAL ELLIPSIS
    "\u2501": "-",    # BOX DRAWINGS HEAVY HORIZONTAL
    "\u3008": "<",    # LEFT ANGLE BRACKET
    "\u3009": ">",    # RIGHT ANGLE BRACKET
    "\u3010": "[",    # LEFT BLACK LENTICULAR BRACKET
    "\u3011": "]",    # RIGHT BLACK LENTICULAR BRACKET
    "\uff05": "%",    # FULLWIDTH PERCENT SIGN
    "\u25ba": "-",    # BLACK RIGHT-POINTING POINTER
}

# Bundles this module's normalization resources in one dictionary, keyed by
# the name of the module-level variable that holds each of them: the two
# compiled patterns and the punctuation replacement table.
normalization = {
    "non_printing_characters_re": non_printing_characters_re,
    "digits_re": digits_re,
    "unicode_punctuation": unicode_punctuation,
}