# sloganAI/logic/cleaning.py
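"""Dataset-cleaning utilities for sloganAI.

Normalizes raw tagline/description text, filters out junk (URLs, emails,
phone numbers), buzzword-heavy or malformed taglines, and removes
near-duplicate entries.
"""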
import re
import unicodedata
from html import unescape

import pandas as pd

# Tagline filtering thresholds.
MIN_LEN = 20                # minimum tagline length (characters)
MAX_LEN = 60                # maximum tagline length (characters)
KEEP_ASCII_ONLY = False     # when True, drop taglines containing non-ASCII characters
MIN_ALPHA_RATIO = 0.60      # minimum fraction of alphabetic characters in a tagline
DROP_IF_ALL_CAPS = False    # when True, drop mostly upper-case ("shouty") taglines
# Marketing buzzwords/phrases that disqualify a tagline.
BUZZY = {
    "synergy","cutting edge","cutting-edge","best in class","best-in-class",
    "world class","world-class","state of the art","state-of-the-art",
    "revolutionary","disruptive platform","next generation","next-gen",
    "leading provider","scalable solution"
}
URL_RE = re.compile(r"(https?://|www\.)\S+", re.I)
EMAIL_RE = re.compile(r"[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}", re.I)
PHONE_RE = re.compile(r"(\+?\d[\d\-\s()]{6,}\d)")
WS_RE = re.compile(r"\s+")
PUNCT_RE = re.compile(r"[^\w\s]+")
TM_RE = re.compile(r"[®©™\ufe0f]")  # ®/©/™ marks and the emoji variation selector
def _nfkc(s): return unicodedata.normalize("NFKC", s)
def _clean_text(s: str) -> str:
    s = "" if s is None else str(s)
    s = unescape(s)                                 # decode HTML entities
    s = _nfkc(s)                                    # Unicode NFKC normalization
    s = s.replace("\\n"," ").replace("\\r"," ")     # strip literal "\n"/"\r" sequences left in scraped text
    s = TM_RE.sub("", s)                            # drop trademark/copyright marks
    s = WS_RE.sub(" ", s).strip()                   # collapse all whitespace (including real newlines)
    return s
def _alpha_ratio(s: str) -> float:
    if not s: return 0.0
    letters = sum(ch.isalpha() for ch in s)
    return letters / max(1, len(s))
def _looks_shouty(s: str) -> bool:
    letters = [ch for ch in s if ch.isalpha()]
    if not letters: return False
    uppers = sum(ch.isupper() for ch in letters)
    return uppers / len(letters) >= 0.85
def _contains_buzzy(s: str) -> bool:
    lo = s.lower()
    return any(term in lo for term in BUZZY)
def _has_junk(s: str) -> bool:
    return bool(URL_RE.search(s) or EMAIL_RE.search(s) or PHONE_RE.search(s))
def _ascii_only(s: str) -> bool:
    try:
        s.encode("ascii")
        return True
    except UnicodeEncodeError:
        return False
def _dupe_key(s: str) -> str:
    # Normalization key for near-duplicate detection: lowercase, strip punctuation, collapse whitespace.
    s = s.lower()
    s = PUNCT_RE.sub(" ", s)
    s = WS_RE.sub(" ", s).strip()
    return s
def clean_dataframe(df: pd.DataFrame) -> pd.DataFrame:
    if "tagline" not in df.columns:
        raise ValueError("Input must contain a 'tagline' column.")
    df = df.copy()
    if "description" not in df.columns:
        df["description"] = df["tagline"]
    # Normalize text in both columns and drop rows whose tagline cleaned to empty.
    df["tagline"] = df["tagline"].map(_clean_text)
    df["description"] = df["description"].map(_clean_text)
    df = df[df["tagline"].str.len() > 0]
    # Drop rows containing URLs, emails, or phone numbers.
    mask_junk = df["tagline"].map(_has_junk) | df["description"].map(_has_junk)
    df = df[~mask_junk]
    # Quality filters on the tagline.
    if KEEP_ASCII_ONLY:
        df = df[df["tagline"].map(_ascii_only)]
    df = df[df["tagline"].map(_alpha_ratio) >= MIN_ALPHA_RATIO]
    df = df[df["tagline"].str.len().between(MIN_LEN, MAX_LEN)]
    if DROP_IF_ALL_CAPS:
        df = df[~df["tagline"].map(_looks_shouty)]
    df = df[~df["tagline"].map(_contains_buzzy)]
    # Deduplicate on a normalized key, then backfill empty descriptions from the tagline.
    key = df["tagline"].map(_dupe_key)
    df = df.loc[~key.duplicated()].reset_index(drop=True)
    df.loc[df["description"].str.len() == 0, "description"] = df["tagline"]
    return df
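
# Minimal usage sketch (illustrative only, with made-up toy data; the surrounding
# pipeline may load real data differently). It shows the filters in action:
# buzzword, URL, and near-duplicate rows are dropped, leaving one clean row.
if __name__ == "__main__":
    raw = pd.DataFrame({
        "tagline": [
            "Coffee that ships itself to your door",
            "BEST-IN-CLASS synergy platform!!!",         # buzzwords -> dropped
            "Visit www.example.com for more",             # URL -> dropped
            "Coffee   that ships itself to your door.",   # near-duplicate -> deduped
        ],
        "description": [
            "Subscription coffee service.",
            "",
            "",
            "",
        ],
    })
    print(clean_dataframe(raw))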