File size: 2,871 Bytes
b8397a5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
\
import pandas as pd
import re, unicodedata
from html import unescape

# Tagline length bounds, applied after whitespace normalization.
MIN_LEN = 20
MAX_LEN = 60
# When True, drop any tagline containing non-ASCII characters.
KEEP_ASCII_ONLY = False
# Minimum fraction of alphabetic characters a tagline must contain.
MIN_ALPHA_RATIO = 0.60
# When True, drop taglines whose letters are (almost) all uppercase.
DROP_IF_ALL_CAPS = False

# Marketing buzz-phrases; a tagline containing any of these (case-insensitive
# substring match) is dropped by clean_dataframe.
BUZZY = {
    "synergy","cutting edge","cutting-edge","best in class","best-in-class",
    "world class","world-class","state of the art","state-of-the-art",
    "revolutionary","disruptive platform","next generation","next-gen",
    "leading provider","scalable solution"
}

# Contact info / link patterns that disqualify a row entirely.
URL_RE   = re.compile(r"(https?://|www\.)\S+", re.I)
EMAIL_RE = re.compile(r"[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}", re.I)
PHONE_RE = re.compile(r"(\+?\d[\d\-\s()]{6,}\d)")
WS_RE    = re.compile(r"\s+")        # any whitespace run -> single space
PUNCT_RE = re.compile(r"[^\w\s]+")   # NOTE(review): unused in this file — confirm external callers before removing
TM_RE    = re.compile(r"[®️©️™️]")    # NOTE: the class also includes U+FE0F variation selectors, so stray VS16 chars are stripped too

def _nfkc(s): return unicodedata.normalize("NFKC", s)

def _clean_text(s: str) -> str:
    """Normalize one raw text field to a single-spaced, Unicode-clean string.

    Accepts any value (None becomes ""), unescapes HTML entities, strips
    ®/©/™ symbols, applies NFKC normalization, replaces literal "\\n"/"\\r"
    escape sequences left in the data with spaces, and collapses all
    whitespace runs to single spaces.
    """
    s = "" if s is None else str(s)
    s = unescape(s)
    # Strip ®/©/™ BEFORE NFKC: NFKC rewrites "™" (U+2122) as the letters
    # "TM", so running TM_RE after normalization (as the old order did)
    # could never remove the trademark sign.
    s = TM_RE.sub("", s)
    s = _nfkc(s)
    # These are literal backslash-n / backslash-r sequences embedded in the
    # data; real newline characters are handled by WS_RE below.
    s = s.replace("\\n"," ").replace("\\r"," ")
    s = WS_RE.sub(" ", s).strip()
    return s

def _alpha_ratio(s: str) -> float:
    if not s: return 0.0
    letters = sum(ch.isalpha() for ch in s)
    return letters / max(1, len(s))

def _looks_shouty(s: str) -> bool:
    letters = [ch for ch in s if ch.isalpha()]
    if not letters: return False
    uppers = sum(ch.isupper() for ch in letters)
    return uppers / len(letters) >= 0.85

def _contains_buzzy(s: str) -> bool:
    """Return True if the lowercased text contains any phrase from BUZZY."""
    lowered = s.lower()
    for phrase in BUZZY:
        if phrase in lowered:
            return True
    return False

def _has_junk(s: str) -> bool:
    """Return True when *s* contains a URL, e-mail address, or phone-like number."""
    return any(rx.search(s) for rx in (URL_RE, EMAIL_RE, PHONE_RE))

def _ascii_only(s: str) -> bool:
    try:
        s.encode("ascii"); return True
    except Exception:
        return False

def _dupe_key(s: str) -> str:
    s = s.lower()
    s = re.sub(r"[^\\w\\s]+", " ", s)
    s = re.sub(r"\\s+", " ", s).strip()
    return s

def clean_dataframe(df: pd.DataFrame) -> pd.DataFrame:
    """Filter and deduplicate a DataFrame of taglines.

    Requires a 'tagline' column; a missing 'description' column is seeded
    from 'tagline'. Rows are dropped when the cleaned tagline is empty,
    either column contains URLs/e-mails/phone numbers, the tagline falls
    outside the configured length/alpha-ratio bounds, it contains a
    buzz-phrase, or it is a case/punctuation duplicate of an earlier row.
    Returns a new frame with a fresh index; the input is not modified.

    Raises:
        ValueError: if the 'tagline' column is absent.
    """
    if "tagline" not in df.columns:
        raise ValueError("Input must contain a 'tagline' column.")

    out = df.copy()
    if "description" not in out.columns:
        out["description"] = out["tagline"]

    # Normalize both text columns first so every later filter sees
    # cleaned, single-spaced values.
    for col in ("tagline", "description"):
        out[col] = out[col].map(_clean_text)

    out = out[out["tagline"].str.len() > 0]

    # Rows with contact info / links in either column are disqualified.
    junky = out["tagline"].map(_has_junk) | out["description"].map(_has_junk)
    out = out[~junky]

    if KEEP_ASCII_ONLY:
        out = out[out["tagline"].map(_ascii_only)]

    out = out[out["tagline"].map(_alpha_ratio) >= MIN_ALPHA_RATIO]
    out = out[out["tagline"].str.len().between(MIN_LEN, MAX_LEN)]

    if DROP_IF_ALL_CAPS:
        out = out[~out["tagline"].map(_looks_shouty)]

    out = out[~out["tagline"].map(_contains_buzzy)]

    # Keep only the first occurrence of each canonicalized tagline.
    dedupe_keys = out["tagline"].map(_dupe_key)
    out = out.loc[~dedupe_keys.duplicated()].reset_index(drop=True)

    # Backfill descriptions that cleaned down to nothing.
    out.loc[out["description"].str.len() == 0, "description"] = out["tagline"]
    return out