Spaces:
Running
Running
Ryan Christian D. Deniega
feat: PhilVerify Phase 1-3 β FastAPI backend, NLP pipeline, TF-IDF classifier (23/23 tests)
"""
PhilVerify — Text Preprocessor
Handles cleaning, tokenizing, and normalizing Filipino/English/Taglish text.
"""
| import re | |
| import string | |
| import unicodedata | |
| from dataclasses import dataclass, field | |
# ── Filipino + English stopwords ──────────────────────────────────────────────
# Function words with little topical signal; dropped in step 10 of the
# pipeline so downstream term weighting focuses on content-bearing tokens.
TAGALOG_STOPWORDS = {
    "ang", "ng", "na", "sa", "at", "ay", "mga", "ni", "nang", "si",
    "ko", "mo", "siya", "kami", "kayo", "sila", "ito", "iyon", "iyan",
    "dito", "doon", "diyan", "nito", "noon", "niyan", "rin", "din", "pa",
    "lang", "lamang", "nga", "naman", "kaya", "pero", "dahil", "kung",
    "kapag", "habang", "bilang", "upang", "para", "mula", "hanggang",
    "ayon", "sinabi", "raw", "daw", "ba", "po", "ho", "oh", "oo",
    "hindi", "wala", "may", "mayroon", "talaga", "pala", "sana",
}
ENGLISH_STOPWORDS = {
    "a", "an", "the", "and", "or", "but", "in", "on", "at", "to",
    "for", "of", "with", "by", "from", "is", "are", "was", "were",
    "be", "been", "being", "have", "has", "had", "do", "does", "did",
    "will", "would", "could", "should", "may", "might", "shall", "can",
    "not", "no", "nor", "so", "yet", "both", "either", "neither",
    "this", "that", "these", "those", "it", "its", "i", "me", "my",
    "we", "our", "you", "your", "they", "their", "he", "his", "she", "her",
}
# Combined lookup set used by TextPreprocessor.remove_stopwords().
ALL_STOPWORDS = TAGALOG_STOPWORDS | ENGLISH_STOPWORDS
# ── Patterns ──────────────────────────────────────────────────────────────────
# Compiled once at import time; reused by TextPreprocessor on every call.
_URL_PATTERN = re.compile(
    r"http[s]?://(?:[a-zA-Z]|[0-9]|[$\-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+"
)
_HTML_TAG_PATTERN = re.compile(r"<[^>]+>")      # e.g. "<p>", "</div>"
_MENTION_PATTERN = re.compile(r"@\w+")          # social-media @handles
_HASHTAG_PATTERN = re.compile(r"#\w+")          # "#tag" — the word itself is kept by clean()
# 3+ repeats of any char; substitution keeps TWO, so "graaabe" -> "graabe"
# (not a full collapse to "grabe").
_REPEATED_CHAR_PATTERN = re.compile(r"(.)\1{2,}")
_EXCESSIVE_PUNCT_PATTERN = re.compile(r"([!?.]){2,}")  # "!!!" / "?!?" -> single mark
_WHITESPACE_PATTERN = re.compile(r"\s+")        # collapse runs of whitespace
| # Emoji removal via unicode category | |
| def _remove_emojis(text: str) -> str: | |
| return "".join( | |
| ch for ch in text | |
| if not unicodedata.category(ch).startswith("So") # Symbol, Other | |
| and unicodedata.category(ch) not in ("Mn",) # Modifier letters | |
| ) | |
@dataclass
class PreprocessResult:
    """Structured output of TextPreprocessor.preprocess().

    Bug fix: the ``@dataclass`` decorator was missing, so no ``__init__``
    was generated — the keyword construction in ``preprocess()`` would
    raise ``TypeError`` and the ``field(...)`` objects would leak through
    as plain class attributes.
    """
    original: str                   # raw input text, untouched
    cleaned: str                    # after structural cleaning (steps 1-6)
    normalized: str                 # after character normalization (steps 7-8)
    tokens: list[str] = field(default_factory=list)           # step 9 output
    filtered_tokens: list[str] = field(default_factory=list)  # step 10 output
    char_count: int = 0             # len(normalized)
    word_count: int = 0             # len(tokens), before stopword removal
| class TextPreprocessor: | |
| """ | |
| Multi-step text cleaner for Tagalog / English / Taglish content. | |
| Pipeline: | |
| 1. strip_html β remove HTML tags | |
| 2. strip_urls β remove hyperlinks | |
| 3. strip_mentions β remove @user | |
| 4. strip_hashtags β remove #tag text (keep token) | |
| 5. strip_emojis β remove Unicode emoji | |
| 6. lowercase β normalize case | |
| 7. normalize_chars β collapse repeated chars, excessive !?? | |
| 8. strip_punct β remove punctuation except apostrophe | |
| 9. tokenize β split on whitespace | |
| 10. remove_stopwords β drop EN + TL stopwords | |
| """ | |
| def clean(self, text: str) -> str: | |
| """Steps 1-6: structural cleaning.""" | |
| text = _HTML_TAG_PATTERN.sub(" ", text) | |
| text = _URL_PATTERN.sub(" ", text) | |
| text = _MENTION_PATTERN.sub(" ", text) | |
| text = _HASHTAG_PATTERN.sub(lambda m: m.group(0)[1:], text) # Keep word, drop # | |
| text = _remove_emojis(text) | |
| text = text.lower() | |
| return _WHITESPACE_PATTERN.sub(" ", text).strip() | |
| def normalize(self, text: str) -> str: | |
| """Steps 7-8: character-level normalization.""" | |
| text = _REPEATED_CHAR_PATTERN.sub(r"\1\1", text) # "graaabe" β "graabe" | |
| text = _EXCESSIVE_PUNCT_PATTERN.sub(r"\1", text) # "!!!" β "!" | |
| # Keep apostrophes (di, 'di, hindi), remove other punct | |
| text = "".join( | |
| ch if ch not in string.punctuation or ch == "'" else " " | |
| for ch in text | |
| ) | |
| return _WHITESPACE_PATTERN.sub(" ", text).strip() | |
| def tokenize(self, text: str) -> list[str]: | |
| """Step 9: whitespace tokenization.""" | |
| return [t for t in text.split() if len(t) > 1] | |
| def remove_stopwords(self, tokens: list[str]) -> list[str]: | |
| """Step 10: remove EN + TL stopwords.""" | |
| return [t for t in tokens if t not in ALL_STOPWORDS] | |
| def preprocess(self, text: str) -> PreprocessResult: | |
| """Run the full pipeline and return a structured result.""" | |
| cleaned = self.clean(text) | |
| normalized = self.normalize(cleaned) | |
| tokens = self.tokenize(normalized) | |
| filtered = self.remove_stopwords(tokens) | |
| return PreprocessResult( | |
| original=text, | |
| cleaned=cleaned, | |
| normalized=normalized, | |
| tokens=tokens, | |
| filtered_tokens=filtered, | |
| char_count=len(normalized), | |
| word_count=len(tokens), | |
| ) | |