Delete features.py
Browse files- features.py +0 -157
features.py
DELETED
|
@@ -1,157 +0,0 @@
|
|
| 1 |
-
"""
|
| 2 |
-
AIFinder Feature Extraction
|
| 3 |
-
TF-IDF pipeline + stylometric features.
|
| 4 |
-
Supports CoT-aware and no-CoT text preprocessing.
|
| 5 |
-
"""
|
| 6 |
-
|
| 7 |
-
import re
|
| 8 |
-
import numpy as np
|
| 9 |
-
from scipy.sparse import hstack, csr_matrix
|
| 10 |
-
from sklearn.feature_extraction.text import TfidfVectorizer
|
| 11 |
-
from sklearn.preprocessing import MaxAbsScaler
|
| 12 |
-
from sklearn.base import BaseEstimator, TransformerMixin
|
| 13 |
-
|
| 14 |
-
from config import TFIDF_WORD_PARAMS, TFIDF_CHAR_PARAMS
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
# --- Text Preprocessing ---
|
| 18 |
-
|
| 19 |
-
def strip_cot(text):
    """Drop every <think>...</think> section and trim surrounding whitespace."""
    cleaned = re.sub(r"<think>.*?</think>", "", text, flags=re.DOTALL)
    return cleaned.strip()
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
def has_cot(text):
    """Return True when the text contains at least one <think>...</think> block."""
    match = re.search(r"<think>.*?</think>", text, flags=re.DOTALL)
    return match is not None
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
def cot_ratio(text):
    """Fraction of the full text occupied by <think>...</think> contents (0.0 if none)."""
    if not text:
        return 0.0
    segments = re.findall(r"<think>(.*?)</think>", text, flags=re.DOTALL)
    if not segments:
        return 0.0
    return sum(map(len, segments)) / len(text)
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
# --- Stylometric Features ---
|
| 39 |
-
|
| 40 |
-
class StylometricFeatures(BaseEstimator, TransformerMixin):
    """Extract stylometric features from text.

    Emits 25 numeric features per document: length statistics, punctuation
    densities, Markdown-structure counts, vocabulary richness, paragraph
    layout, AI-assistant phrasing markers, and chain-of-thought
    (<think>...</think>) indicators.
    """

    # Width of the vector returned by _extract; keeps transform() shape-stable
    # even for empty input.
    N_FEATURES = 25

    def fit(self, X, y=None):
        """Stateless extractor: nothing to learn, return self."""
        return self

    def transform(self, X):
        """Transform an iterable of texts into a sparse float32 matrix.

        Returns a (len(X), 25) csr_matrix. An empty input yields a (0, 25)
        matrix so downstream hstack calls keep a consistent column count.
        """
        rows = [self._extract(text) for text in X]
        if not rows:
            # Bug fix: np.array([]) is 1-D with shape (0,), which csr_matrix
            # would turn into a malformed (1, 0) matrix; build the empty
            # shape explicitly instead.
            return csr_matrix((0, self.N_FEATURES), dtype=np.float32)
        return csr_matrix(np.array(rows, dtype=np.float32))

    def _extract(self, text):
        """Compute the 25-element feature list for a single document."""
        sentences = re.split(r'[.!?]+', text)
        sentences = [s.strip() for s in sentences if s.strip()]
        words = text.split()

        # Guard denominators so empty text cannot divide by zero.
        n_chars = max(len(text), 1)
        n_words = max(len(words), 1)
        n_sentences = max(len(sentences), 1)

        # Basic length statistics
        avg_word_len = np.mean([len(w) for w in words]) if words else 0
        avg_sent_len = n_words / n_sentences

        # Punctuation densities, normalized per character
        n_commas = text.count(",") / n_chars
        n_semicolons = text.count(";") / n_chars
        n_colons = text.count(":") / n_chars
        n_exclaim = text.count("!") / n_chars
        n_question = text.count("?") / n_chars
        n_ellipsis = text.count("...") / n_chars
        n_dash = (text.count("—") + text.count("--")) / n_chars

        # Markdown elements, normalized per sentence
        n_headers = len(re.findall(r'^#{1,6}\s', text, re.MULTILINE)) / n_sentences
        n_bold = len(re.findall(r'\*\*.*?\*\*', text)) / n_sentences
        n_italic = len(re.findall(r'(?<!\*)\*(?!\*).*?(?<!\*)\*(?!\*)', text)) / n_sentences
        n_code_blocks = len(re.findall(r'```', text)) / n_sentences
        n_inline_code = len(re.findall(r'`[^`]+`', text)) / n_sentences
        n_bullet = len(re.findall(r'^[\s]*[-*+]\s', text, re.MULTILINE)) / n_sentences
        n_numbered = len(re.findall(r'^\s*\d+[.)]\s', text, re.MULTILINE)) / n_sentences

        # Vocabulary richness
        unique_words = len(set(w.lower() for w in words))
        ttr = unique_words / n_words  # type-token ratio

        # Paragraph structure (blank-line separated)
        paragraphs = text.split("\n\n")
        n_paragraphs = len([p for p in paragraphs if p.strip()])
        avg_para_len = n_words / max(n_paragraphs, 1)

        # Special patterns: canned AI openers and refusal/disclaimer phrasing
        starts_with_certainly = 1.0 if re.match(r'^(Certainly|Of course|Sure|Absolutely|Great question)', text, re.IGNORECASE) else 0.0
        has_disclaimer = 1.0 if re.search(r"(I'm an AI|as an AI|language model|I cannot|I can't help)", text, re.IGNORECASE) else 0.0

        # CoT features (present even in no-CoT mode, just will be 0)
        has_think = 1.0 if has_cot(text) else 0.0
        think_ratio = cot_ratio(text)

        return [
            avg_word_len, avg_sent_len,
            n_commas, n_semicolons, n_colons, n_exclaim, n_question,
            n_ellipsis, n_dash,
            n_headers, n_bold, n_italic, n_code_blocks, n_inline_code,
            n_bullet, n_numbered,
            ttr, n_paragraphs, avg_para_len,
            starts_with_certainly, has_disclaimer,
            has_think, think_ratio,
            n_chars, n_words,
        ]
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
# --- Feature Pipeline ---
|
| 114 |
-
|
| 115 |
-
class FeaturePipeline:
    """Combined TF-IDF + stylometric feature pipeline.

    Concatenates word-level TF-IDF, char-level TF-IDF, and stylometric
    features into one sparse matrix, then MaxAbs-scales it so all columns
    share a comparable range.
    """

    def __init__(self):
        # Vectorizer hyperparameters are centralized in config.py.
        self.word_tfidf = TfidfVectorizer(**TFIDF_WORD_PARAMS)
        self.char_tfidf = TfidfVectorizer(**TFIDF_CHAR_PARAMS)
        self.stylo = StylometricFeatures()
        self.scaler = MaxAbsScaler()

    def fit_transform(self, texts):
        """Fit every sub-extractor on `texts` and return the scaled feature matrix."""
        import time
        print(f" Input: {len(texts)} texts")

        # TF-IDF sees only the visible reply: <think> blocks are stripped so
        # the n-grams model writing style rather than chain-of-thought text.
        texts_no_cot = [strip_cot(t) for t in texts]

        t0 = time.time()
        word_features = self.word_tfidf.fit_transform(texts_no_cot)
        print(f" word tfidf: {word_features.shape[1]} features ({time.time()-t0:.1f}s)")

        t0 = time.time()
        char_features = self.char_tfidf.fit_transform(texts_no_cot)
        print(f" char tfidf: {char_features.shape[1]} features ({time.time()-t0:.1f}s)")

        # Stylometric features read the untouched text so the CoT-derived
        # columns (has_think, think_ratio) stay meaningful.
        t0 = time.time()
        stylo_features = self.stylo.fit_transform(texts)
        print(f" stylometric: {stylo_features.shape[1]} features ({time.time()-t0:.1f}s)")

        combined = self.scaler.fit_transform(
            hstack([word_features, char_features, stylo_features])
        )
        print(f" Combined feature matrix: {combined.shape}")
        return combined

    def transform(self, texts):
        """Project new texts into the already-fitted feature space."""
        texts_no_cot = [strip_cot(t) for t in texts]
        parts = [
            self.word_tfidf.transform(texts_no_cot),
            self.char_tfidf.transform(texts_no_cot),
            self.stylo.transform(texts),
        ]
        return self.scaler.transform(hstack(parts))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|