marc-match-ai / marcai /processing /normalizations.py
RvanB's picture
Add files from other repo
fbf7e95
raw
history blame
No virus
797 Bytes
from unidecode import unidecode
import numpy as np
import pandas as pd
def remove_diacritics(series):
    """Transliterate every string in *series* to plain ASCII with ``unidecode``.

    Args:
        series: pandas Series of strings.

    Returns:
        A new Series with ``unidecode`` applied to each non-missing element.
        The index and name of *series* are kept, so the result stays aligned
        with the input when it is assigned back into a DataFrame or passed on
        to the other normalizers.
    """
    # Series.map is used instead of np.vectorize for three reasons:
    # np.vectorize without ``otypes`` raises on an empty input, rebuilding the
    # result with pd.Series(ndarray) discards the original index, and
    # na_action="ignore" lets missing values pass through untouched (the same
    # way the ``.str`` based normalizers treat them) instead of crashing
    # inside unidecode.
    return series.map(unidecode, na_action="ignore")
def lowercase(series):
    """Return a copy of *series* with every string converted to lower case."""
    lowered = series.str.lower()
    return lowered
def remove_punctuation(series):
    """Delete every character that is neither a word character nor whitespace.

    Args:
        series: pandas Series of strings.

    Returns:
        A new Series in which all characters matching ``[^\\w\\s]`` have been
        removed. Underscores count as word characters and are therefore kept.
    """
    # regex=True must be explicit: since pandas 2.0 ``str.replace`` treats the
    # pattern as a literal string by default, which would make this a no-op.
    return series.str.replace(r"[^\w\s]", "", regex=True)
def normalize_whitespace(series):
    """Collapse whitespace runs to single spaces and trim both ends.

    Args:
        series: pandas Series of strings.

    Returns:
        A new Series in which every run of whitespace characters (spaces,
        tabs, newlines, ...) is replaced by one space and leading/trailing
        whitespace is removed.
    """
    # regex=True must be explicit: since pandas 2.0 ``str.replace`` treats the
    # pattern as a literal string by default, so "\s+" would never match.
    # A single "\s+" pass both converts other whitespace to spaces and
    # collapses repeats, so no separate "\s" -> " " step is needed.
    collapsed = series.str.replace(r"\s+", " ", regex=True)
    # Any leading/trailing run is now a single space; strip removes it.
    return collapsed.str.strip()
def substring(series, start, end):
    """Return the ``[start:end]`` slice of every string in *series*."""
    return series.str.slice(start, end)
def apply_normalizers(series, transforms):
    """Run *series* through each callable in *transforms*, in order.

    The output of one callable is the input of the next; the value produced
    by the last one is returned. With no transforms, *series* is returned
    unchanged.
    """
    result = series
    for normalizer in transforms:
        result = normalizer(result)
    return result