Delete features.py
Browse files- features.py +0 -157
features.py
DELETED
|
@@ -1,157 +0,0 @@
|
|
| 1 |
-
"""
|
| 2 |
-
AIFinder Feature Extraction
|
| 3 |
-
TF-IDF pipeline + stylometric features.
|
| 4 |
-
Supports CoT-aware and no-CoT text preprocessing.
|
| 5 |
-
"""
|
| 6 |
-
|
| 7 |
-
import re
|
| 8 |
-
import numpy as np
|
| 9 |
-
from scipy.sparse import hstack, csr_matrix
|
| 10 |
-
from sklearn.feature_extraction.text import TfidfVectorizer
|
| 11 |
-
from sklearn.preprocessing import MaxAbsScaler
|
| 12 |
-
from sklearn.base import BaseEstimator, TransformerMixin
|
| 13 |
-
|
| 14 |
-
from config import TFIDF_WORD_PARAMS, TFIDF_CHAR_PARAMS
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
# --- Text Preprocessing ---
|
| 18 |
-
|
| 19 |
-
def strip_cot(text):
    """Drop every <think>...</think> section and trim surrounding whitespace."""
    cleaned = re.sub(r"<think>.*?</think>", "", text, flags=re.DOTALL)
    return cleaned.strip()
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
def has_cot(text):
    """Return True when the text contains at least one <think>...</think> block."""
    match = re.search(r"<think>.*?</think>", text, flags=re.DOTALL)
    return match is not None
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
def cot_ratio(text):
    """Fraction of the full text occupied by <think>...</think> contents (0.0 if none)."""
    if not text:
        return 0.0
    segments = re.findall(r"<think>(.*?)</think>", text, flags=re.DOTALL)
    if not segments:
        return 0.0
    return sum(map(len, segments)) / len(text)
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
# --- Stylometric Features ---
|
| 39 |
-
|
| 40 |
-
class StylometricFeatures(BaseEstimator, TransformerMixin):
    """Extract stylometric features from text.

    Emits 25 numeric features per document: length statistics, punctuation
    densities, Markdown-structure counts, vocabulary richness, paragraph
    layout, AI-assistant phrasing markers, and chain-of-thought
    (<think>...</think>) indicators.
    """

    # Width of the vector returned by _extract; keeps transform() shape-stable
    # even for empty input.
    N_FEATURES = 25

    def fit(self, X, y=None):
        """Stateless extractor: nothing to learn, return self."""
        return self

    def transform(self, X):
        """Transform an iterable of texts into a sparse float32 matrix.

        Returns a (len(X), 25) csr_matrix. An empty input yields a (0, 25)
        matrix so downstream hstack calls keep a consistent column count.
        """
        rows = [self._extract(text) for text in X]
        if not rows:
            # Bug fix: np.array([]) is 1-D with shape (0,), which csr_matrix
            # would turn into a malformed (1, 0) matrix; build the empty
            # shape explicitly instead.
            return csr_matrix((0, self.N_FEATURES), dtype=np.float32)
        return csr_matrix(np.array(rows, dtype=np.float32))

    def _extract(self, text):
        """Compute the 25-element feature list for a single document."""
        sentences = re.split(r'[.!?]+', text)
        sentences = [s.strip() for s in sentences if s.strip()]
        words = text.split()

        # Guard denominators so empty text cannot divide by zero.
        n_chars = max(len(text), 1)
        n_words = max(len(words), 1)
        n_sentences = max(len(sentences), 1)

        # Basic length statistics
        avg_word_len = np.mean([len(w) for w in words]) if words else 0
        avg_sent_len = n_words / n_sentences

        # Punctuation densities, normalized per character
        n_commas = text.count(",") / n_chars
        n_semicolons = text.count(";") / n_chars
        n_colons = text.count(":") / n_chars
        n_exclaim = text.count("!") / n_chars
        n_question = text.count("?") / n_chars
        n_ellipsis = text.count("...") / n_chars
        n_dash = (text.count("—") + text.count("--")) / n_chars

        # Markdown elements, normalized per sentence
        n_headers = len(re.findall(r'^#{1,6}\s', text, re.MULTILINE)) / n_sentences
        n_bold = len(re.findall(r'\*\*.*?\*\*', text)) / n_sentences
        n_italic = len(re.findall(r'(?<!\*)\*(?!\*).*?(?<!\*)\*(?!\*)', text)) / n_sentences
        n_code_blocks = len(re.findall(r'```', text)) / n_sentences
        n_inline_code = len(re.findall(r'`[^`]+`', text)) / n_sentences
        n_bullet = len(re.findall(r'^[\s]*[-*+]\s', text, re.MULTILINE)) / n_sentences
        n_numbered = len(re.findall(r'^\s*\d+[.)]\s', text, re.MULTILINE)) / n_sentences

        # Vocabulary richness
        unique_words = len(set(w.lower() for w in words))
        ttr = unique_words / n_words  # type-token ratio

        # Paragraph structure (blank-line separated)
        paragraphs = text.split("\n\n")
        n_paragraphs = len([p for p in paragraphs if p.strip()])
        avg_para_len = n_words / max(n_paragraphs, 1)

        # Special patterns: canned AI openers and refusal/disclaimer phrasing
        starts_with_certainly = 1.0 if re.match(r'^(Certainly|Of course|Sure|Absolutely|Great question)', text, re.IGNORECASE) else 0.0
        has_disclaimer = 1.0 if re.search(r"(I'm an AI|as an AI|language model|I cannot|I can't help)", text, re.IGNORECASE) else 0.0

        # CoT features (present even in no-CoT mode, just will be 0)
        has_think = 1.0 if has_cot(text) else 0.0
        think_ratio = cot_ratio(text)

        return [
            avg_word_len, avg_sent_len,
            n_commas, n_semicolons, n_colons, n_exclaim, n_question,
            n_ellipsis, n_dash,
            n_headers, n_bold, n_italic, n_code_blocks, n_inline_code,
            n_bullet, n_numbered,
            ttr, n_paragraphs, avg_para_len,
            starts_with_certainly, has_disclaimer,
            has_think, think_ratio,
            n_chars, n_words,
        ]
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
# --- Feature Pipeline ---
|
| 114 |
-
|
| 115 |
-
class FeaturePipeline:
    """Combined TF-IDF + stylometric feature pipeline.

    Concatenates word-level TF-IDF, char-level TF-IDF, and stylometric
    features into one sparse matrix, then MaxAbs-scales it so all columns
    share a comparable range.
    """

    def __init__(self):
        # Vectorizer hyperparameters are centralized in config.py.
        self.word_tfidf = TfidfVectorizer(**TFIDF_WORD_PARAMS)
        self.char_tfidf = TfidfVectorizer(**TFIDF_CHAR_PARAMS)
        self.stylo = StylometricFeatures()
        self.scaler = MaxAbsScaler()

    def fit_transform(self, texts):
        """Fit every sub-extractor on `texts` and return the scaled feature matrix."""
        import time
        print(f" Input: {len(texts)} texts")

        # TF-IDF sees only the visible reply: <think> blocks are stripped so
        # the n-grams model writing style rather than chain-of-thought text.
        texts_no_cot = [strip_cot(t) for t in texts]

        t0 = time.time()
        word_features = self.word_tfidf.fit_transform(texts_no_cot)
        print(f" word tfidf: {word_features.shape[1]} features ({time.time()-t0:.1f}s)")

        t0 = time.time()
        char_features = self.char_tfidf.fit_transform(texts_no_cot)
        print(f" char tfidf: {char_features.shape[1]} features ({time.time()-t0:.1f}s)")

        # Stylometric features read the untouched text so the CoT-derived
        # columns (has_think, think_ratio) stay meaningful.
        t0 = time.time()
        stylo_features = self.stylo.fit_transform(texts)
        print(f" stylometric: {stylo_features.shape[1]} features ({time.time()-t0:.1f}s)")

        combined = self.scaler.fit_transform(
            hstack([word_features, char_features, stylo_features])
        )
        print(f" Combined feature matrix: {combined.shape}")
        return combined

    def transform(self, texts):
        """Project new texts into the already-fitted feature space."""
        texts_no_cot = [strip_cot(t) for t in texts]
        parts = [
            self.word_tfidf.transform(texts_no_cot),
            self.char_tfidf.transform(texts_no_cot),
            self.stylo.transform(texts),
        ]
        return self.scaler.transform(hstack(parts))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|