CompactAI committed on
Commit
f63dfdb
·
verified ·
1 Parent(s): d0d9725

Delete features.py

Browse files
Files changed (1) hide show
  1. features.py +0 -157
features.py DELETED
@@ -1,157 +0,0 @@
1
- """
2
- AIFinder Feature Extraction
3
- TF-IDF pipeline + stylometric features.
4
- Supports CoT-aware and no-CoT text preprocessing.
5
- """
6
-
7
- import re
8
- import numpy as np
9
- from scipy.sparse import hstack, csr_matrix
10
- from sklearn.feature_extraction.text import TfidfVectorizer
11
- from sklearn.preprocessing import MaxAbsScaler
12
- from sklearn.base import BaseEstimator, TransformerMixin
13
-
14
- from config import TFIDF_WORD_PARAMS, TFIDF_CHAR_PARAMS
15
-
16
-
17
- # --- Text Preprocessing ---
18
-
19
def strip_cot(text):
    """Drop every <think>...</think> block, then trim surrounding whitespace."""
    without_cot = re.sub(r"<think>.*?</think>", "", text, flags=re.DOTALL)
    return without_cot.strip()
22
-
23
-
24
def has_cot(text):
    """Return True when at least one complete <think>...</think> block is present."""
    match = re.search(r"<think>.*?</think>", text, flags=re.DOTALL)
    return match is not None
27
-
28
-
29
def cot_ratio(text):
    """Fraction of the text (by characters) that lies inside <think>...</think> blocks.

    Returns 0.0 for empty input or when no complete block exists.
    """
    blocks = re.findall(r"<think>(.*?)</think>", text, flags=re.DOTALL)
    if len(text) == 0 or not blocks:
        return 0.0
    total_think = sum(map(len, blocks))
    return total_think / len(text)
36
-
37
-
38
- # --- Stylometric Features ---
39
-
40
class StylometricFeatures(BaseEstimator, TransformerMixin):
    """Sklearn-compatible transformer producing 25 hand-crafted style features per text.

    Stateless: fit() learns nothing, so transform() may be called on an
    unfitted instance. Output is a sparse float32 matrix so it can be
    hstack-ed with TF-IDF blocks.
    """

    def fit(self, X, y=None):
        # No state to learn — every feature is computed per-document.
        return self

    def transform(self, X):
        # One 25-dimensional row per input document.
        rows = [self._extract(doc) for doc in X]
        return csr_matrix(np.array(rows, dtype=np.float32))

    def _extract(self, text):
        sents = [piece.strip() for piece in re.split(r'[.!?]+', text) if piece.strip()]
        tokens = text.split()

        # Floors of 1 keep every later division well-defined on empty input.
        char_total = max(len(text), 1)
        word_total = max(len(tokens), 1)
        sent_total = max(len(sents), 1)

        # Length statistics
        mean_word_len = np.mean([len(t) for t in tokens]) if tokens else 0
        mean_sent_len = word_total / sent_total

        # Punctuation densities, normalised per character
        def per_char(mark):
            return text.count(mark) / char_total

        comma_d = per_char(",")
        semicolon_d = per_char(";")
        colon_d = per_char(":")
        exclaim_d = per_char("!")
        question_d = per_char("?")
        ellipsis_d = per_char("...")
        dash_d = (text.count("—") + text.count("--")) / char_total

        # Markdown element densities, normalised per sentence
        def per_sent(pattern, flags=0):
            return len(re.findall(pattern, text, flags)) / sent_total

        header_d = per_sent(r'^#{1,6}\s', re.MULTILINE)
        bold_d = per_sent(r'\*\*.*?\*\*')
        italic_d = per_sent(r'(?<!\*)\*(?!\*).*?(?<!\*)\*(?!\*)')
        fence_d = per_sent(r'```')
        inline_code_d = per_sent(r'`[^`]+`')
        bullet_d = per_sent(r'^[\s]*[-*+]\s', re.MULTILINE)
        numbered_d = per_sent(r'^\s*\d+[.)]\s', re.MULTILINE)

        # Lexical diversity: type-token ratio on lowercased tokens
        type_token = len({t.lower() for t in tokens}) / word_total

        # Paragraph layout (blank-line separated)
        para_count = sum(1 for p in text.split("\n\n") if p.strip())
        mean_para_len = word_total / max(para_count, 1)

        # Tell-tale assistant openers and AI self-reference phrases
        opener_flag = 1.0 if re.match(r'^(Certainly|Of course|Sure|Absolutely|Great question)', text, re.IGNORECASE) else 0.0
        disclaimer_flag = 1.0 if re.search(r"(I'm an AI|as an AI|language model|I cannot|I can't help)", text, re.IGNORECASE) else 0.0

        # Chain-of-thought markers (both zero when no <think> blocks exist)
        think_flag = 1.0 if has_cot(text) else 0.0
        think_share = cot_ratio(text)

        return [
            mean_word_len, mean_sent_len,
            comma_d, semicolon_d, colon_d, exclaim_d, question_d,
            ellipsis_d, dash_d,
            header_d, bold_d, italic_d, fence_d, inline_code_d,
            bullet_d, numbered_d,
            type_token, para_count, mean_para_len,
            opener_flag, disclaimer_flag,
            think_flag, think_share,
            char_total, word_total,
        ]
111
-
112
-
113
- # --- Feature Pipeline ---
114
-
115
class FeaturePipeline:
    """Combined feature extractor: word TF-IDF + char TF-IDF + stylometrics, MaxAbs-scaled."""

    def __init__(self):
        # Two TF-IDF views of the text (params come from config) plus
        # hand-crafted style features; MaxAbsScaler preserves sparsity.
        self.word_tfidf = TfidfVectorizer(**TFIDF_WORD_PARAMS)
        self.char_tfidf = TfidfVectorizer(**TFIDF_CHAR_PARAMS)
        self.stylo = StylometricFeatures()
        self.scaler = MaxAbsScaler()

    def fit_transform(self, texts):
        """Fit all sub-extractors on `texts` and return the scaled sparse feature matrix."""
        import time

        print(f" Input: {len(texts)} texts")

        # TF-IDF should model surface style, not chain-of-thought content,
        # so <think> blocks are removed before vectorising.
        stripped = [strip_cot(doc) for doc in texts]

        t0 = time.time()
        word_features = self.word_tfidf.fit_transform(stripped)
        print(f" word tfidf: {word_features.shape[1]} features ({time.time()-t0:.1f}s)")

        t0 = time.time()
        char_features = self.char_tfidf.fit_transform(stripped)
        print(f" char tfidf: {char_features.shape[1]} features ({time.time()-t0:.1f}s)")

        # Stylometrics get the raw text so the CoT indicator features survive.
        t0 = time.time()
        stylo_features = self.stylo.fit_transform(texts)
        print(f" stylometric: {stylo_features.shape[1]} features ({time.time()-t0:.1f}s)")

        combined = self.scaler.fit_transform(
            hstack([word_features, char_features, stylo_features])
        )
        print(f" Combined feature matrix: {combined.shape}")
        return combined

    def transform(self, texts):
        """Project new texts into the already-fitted feature space."""
        stripped = [strip_cot(doc) for doc in texts]
        parts = [
            self.word_tfidf.transform(stripped),
            self.char_tfidf.transform(stripped),
            self.stylo.transform(texts),
        ]
        return self.scaler.transform(hstack(parts))