CompactAI committed on
Commit
9d7e5cd
·
verified ·
1 Parent(s): 32cc55b

Upload features.py

Browse files
Files changed (1) hide show
  1. features.py +647 -0
features.py ADDED
@@ -0,0 +1,647 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ AIFinder Feature Extraction
3
+ TF-IDF and stylometric features for AI model detection.
4
+ """
5
+
6
+ import re
7
+ import numpy as np
8
+ from scipy.sparse import csr_matrix, hstack
9
+ from sklearn.feature_extraction.text import TfidfVectorizer
10
+ from sklearn.base import BaseEstimator, TransformerMixin
11
+ from sklearn.preprocessing import MaxAbsScaler
12
+
13
+ from config import TFIDF_WORD_PARAMS, TFIDF_CHAR_PARAMS
14
+
15
+
16
def strip_cot(text):
    """Drop chain-of-thought spans and trim the result.

    Every ``<think>...</think>`` or ``<thinking>...</thinking>`` span
    (non-greedy, may cover several lines) is deleted; a tag that is never
    closed is left untouched. Leading and trailing whitespace is removed
    from what remains.
    """
    cot_span = re.compile(r"<think(?:ing)?>.*?</think(?:ing)?>", re.DOTALL)
    return cot_span.sub("", text).strip()
19
+
20
+
21
def strip_markdown(text):
    """Remove common Markdown syntax from ``text`` while keeping the prose.

    Fenced code blocks, inline code spans, block quotes and horizontal rules
    are deleted outright. Emphasis markers, heading hashes, list markers and
    link targets are removed but the text they decorate is kept. The result
    is stripped of leading/trailing whitespace.

    Rules are applied in order; each entry is (pattern, replacement, flags).
    """
    rules = (
        # Fenced code blocks, then inline code spans: content is discarded.
        (r"```[\s\S]*?```", "", 0),
        (r"`[^`]+`", "", 0),
        # Asterisk emphasis: keep the inner text.
        (r"\*\*([^*]+)\*\*", r"\1", 0),
        (r"\*([^*]+)\*", r"\1", 0),
        # Underscore emphasis.
        (r"__([^_]+)__", r"\1", 0),
        # Single-underscore emphasis must not start or end inside a word,
        # otherwise identifiers such as my_var_name lose their underscores
        # and get glued together.
        (r"(?<!\w)_([^_]+)_(?!\w)", r"\1", 0),
        # Heading hashes.
        (r"^#{1,6}\s+", "", re.MULTILINE),
        # List markers. Leading indentation is limited to spaces/tabs: with
        # MULTILINE, a leading \s* would also match the preceding blank line
        # and erase the paragraph break in front of the list.
        (r"^[ \t]*[-*+]\s+", "", re.MULTILINE),
        (r"^[ \t]*\d+[.)]\s+", "", re.MULTILINE),
        # Links: keep the label, drop the target.
        (r"\[([^\]]+)\]\([^)]+\)", r"\1", 0),
        # Block-quote lines and horizontal rules are dropped entirely.
        (r"^>.*$", "", re.MULTILINE),
        (r"^---+$", "", re.MULTILINE),
    )
    for pattern, replacement, flags in rules:
        text = re.sub(pattern, replacement, text, flags=flags)
    return text.strip()
35
+
36
+
37
class StylometricFeatures(BaseEstimator, TransformerMixin):
    """Stateless transformer mapping each text to a stylometric feature row.

    ``fit`` learns nothing. ``transform`` calls ``_extract`` on every text
    and stacks the rows into a float32 CSR matrix. The column order is the
    order of the list returned by ``_extract`` (indices 0-102).

    NOTE: the exclamation-sentence, question-start and provider-mention
    features were corrected relative to earlier revisions of this file;
    models fitted on the old columns should be refitted.
    """

    # Word lists are class-level constants so they are built once rather
    # than on every call to ``_extract``.
    _FIRST_PERSON = frozenset(
        {"i", "me", "my", "mine", "myself", "we", "us", "our", "ours", "ourselves"}
    )
    _SECOND_PERSON = frozenset({"you", "your", "yours", "yourself", "yourselves"})
    _THIRD_PERSON = frozenset(
        {"he", "she", "it", "they", "them", "his", "her", "its", "their"}
    )
    _CONJUNCTIONS = frozenset(
        {
            "and",
            "but",
            "or",
            "nor",
            "for",
            "yet",
            "so",
            "because",
            "although",
            "while",
            "if",
            "when",
            "where",
        }
    )
    _DISCOURSE = frozenset(
        {
            "however",
            "therefore",
            "moreover",
            "furthermore",
            "nevertheless",
            "consequently",
            "thus",
            "hence",
        }
    )
    _HEDGING = frozenset(
        {
            "perhaps",
            "maybe",
            "might",
            "could",
            "possibly",
            "seemingly",
            "apparently",
            "arguably",
            "potentially",
        }
    )
    _CERTAINTY = frozenset(
        {
            "definitely",
            "certainly",
            "absolutely",
            "clearly",
            "obviously",
            "undoubtedly",
            "indeed",
            "surely",
        }
    )
    _TRANSITION = frozenset(
        {
            "additionally",
            "meanwhile",
            "subsequently",
            "alternatively",
            "specifically",
            "notably",
            "importantly",
            "essentially",
        }
    )
    _CASUAL = frozenset(
        {
            "okay",
            "ok",
            "hey",
            "hi",
            "cool",
            "awesome",
            "wow",
            "basically",
            "actually",
            "literally",
            "right",
            "yeah",
        }
    )
    _FORMAL = frozenset(
        {
            "regarding",
            "concerning",
            "pertaining",
            "aforementioned",
            "respectively",
            "accordingly",
            "henceforth",
            "whereby",
            "notwithstanding",
            "pursuant",
        }
    )

    # Whole-word interrogative at the start of a sentence. The \b keeps
    # "However" / "Whenever" / "Whatever" from counting as question openers.
    _QUESTION_START = re.compile(r"(?:who|what|when|where|why|how)\b", re.IGNORECASE)

    def fit(self, X, y=None):
        """No-op fit; present for scikit-learn API compatibility."""
        return self

    def transform(self, X):
        """Return a float32 CSR matrix with one feature row per text in ``X``."""
        rows = [self._extract(text) for text in X]
        if not rows:
            # np.array([]) is 1-D and would become a (1, 0) matrix; build an
            # explicit (0, n_features) matrix so hstack with other blocks works.
            width = len(self._extract(""))
            return csr_matrix((0, width), dtype=np.float32)
        return csr_matrix(np.array(rows, dtype=np.float32))

    def _extract(self, text):
        """Compute the stylometric feature list for a single text.

        Counts are normalised by characters, words, sentences or lines; each
        denominator is clamped to at least 1, so an empty string yields a
        valid all-finite row.
        """
        # Function-scope import so the file's import block stays untouched.
        from collections import Counter

        words = text.split()
        n_chars = max(len(text), 1)
        n_words = max(len(words), 1)

        # Sentences are the non-empty pieces between runs of . ! ?
        # (the terminators themselves are discarded by the split).
        sentences = [s.strip() for s in re.split(r"[.!?]+", text) if s.strip()]
        n_sentences = max(len(sentences), 1)

        non_empty_paras = [p for p in text.split("\n\n") if p.strip()]
        n_paragraphs = len(non_empty_paras)

        non_empty_lines = [ln for ln in text.split("\n") if ln.strip()]
        n_lines = max(len(non_empty_lines), 1)

        # === Word-level stats ===
        word_lens = [len(w) for w in words]
        avg_word_len = np.mean(word_lens) if words else 0
        word_len_std = np.std(word_lens) if len(words) > 1 else 0
        median_word_len = np.median(word_lens) if words else 0
        avg_sent_len = n_words / n_sentences

        # === Punctuation density (per character) ===
        n_commas = text.count(",") / n_chars
        n_semicolons = text.count(";") / n_chars
        n_colons = text.count(":") / n_chars
        n_dash = (text.count("—") + text.count("–") + text.count("--")) / n_chars
        n_parens = (text.count("(") + text.count(")")) / n_chars
        n_quotes = (text.count('"') + text.count("'")) / n_chars
        n_exclaim = text.count("!") / n_chars
        n_question = text.count("?") / n_chars
        n_period = text.count(".") / n_chars
        n_ellipsis = (text.count("...") + text.count("…")) / n_chars

        # The small constant keeps the ratios finite when the divisor is 0.
        comma_colon_ratio = n_commas / (n_colons + 0.001)
        comma_period_ratio = n_commas / (n_period + 0.001)
        excl_question_ratio = n_exclaim / (n_question + 0.001)

        # === Markdown/formatting features (per sentence) ===
        n_headers = len(re.findall(r"^#{1,6}\s", text, re.MULTILINE)) / n_sentences
        n_bold = len(re.findall(r"\*\*.*?\*\*", text)) / n_sentences
        n_code_blocks = len(re.findall(r"```", text)) / n_sentences
        n_inline_code = len(re.findall(r"`[^`]+`", text)) / n_sentences
        n_bullet = len(re.findall(r"^[\s]*[-*+]\s", text, re.MULTILINE)) / n_sentences
        n_numbered = (
            len(re.findall(r"^\s*\d+[.)]\s", text, re.MULTILINE)) / n_sentences
        )
        n_tables = len(re.findall(r"\|.*\|", text)) / n_sentences

        # === Whitespace & structure ===
        newline_density = text.count("\n") / n_chars
        double_newline_ratio = text.count("\n\n") / (text.count("\n") + 1)
        uppercase_ratio = sum(1 for c in text if c.isupper()) / n_chars
        digit_ratio = sum(1 for c in text if c.isdigit()) / n_chars
        space_ratio = sum(1 for c in text if c.isspace()) / n_chars

        unique_chars = len(set(text)) / n_chars
        unique_chars_ratio = len(set(text.lower())) / n_chars

        # === Sentence-level stats ===
        sent_lens = [len(s.split()) for s in sentences]
        sent_len_std = np.std(sent_lens) if len(sent_lens) > 1 else 0
        sent_len_max = max(sent_lens) if sent_lens else 0
        sent_len_min = min(sent_lens) if sent_lens else 0
        sent_len_median = np.median(sent_lens) if sent_lens else 0
        sent_len_range = sent_len_max - sent_len_min

        # === Structural markers ===
        has_think = 1.0 if re.search(r"<think>", text) else 0.0
        has_xml = 1.0 if re.search(r"<[^>]+>", text) else 0.0
        has_hr = 1.0 if re.search(r"^---+", text, re.MULTILINE) else 0.0
        has_url = 1.0 if re.search(r"https?://", text) else 0.0

        # === Normalised tokens and their frequencies ===
        words_lower = [w.lower().strip(".,!?;:'\"()[]{}") for w in words]
        word_counts = Counter(words_lower)

        def vocab_share(vocab):
            # Fraction of tokens that belong to ``vocab``.
            return sum(word_counts[w] for w in vocab) / n_words

        # === Pronoun and person features ===
        first_person_ratio = vocab_share(self._FIRST_PERSON)
        second_person_ratio = vocab_share(self._SECOND_PERSON)
        third_person_ratio = vocab_share(self._THIRD_PERSON)

        # === Vocabulary richness ===
        unique_words = len(word_counts)
        ttr = unique_words / n_words
        # Hapax legomena: tokens occurring exactly once (single pass over the
        # frequency table instead of list.count per distinct token).
        hapax = sum(1 for count in word_counts.values() if count == 1)
        hapax_ratio = hapax / n_words

        contraction_count = len(re.findall(r"\b\w+'\w+\b", text))
        contraction_ratio = contraction_count / n_words

        # === Sentence starters ===
        starter_counts = Counter(
            s.split()[0].lower() if s.split() else "" for s in sentences
        )
        starter_vocab = len(starter_counts) / n_sentences

        and_starts = starter_counts["and"] / n_sentences
        but_starts = starter_counts["but"] / n_sentences
        so_starts = starter_counts["so"] / n_sentences
        the_starts = starter_counts["the"] / n_sentences
        it_starts = (starter_counts["it"] + starter_counts["it's"]) / n_sentences
        i_starts = (
            starter_counts["i"] + starter_counts["i'm"] + starter_counts["i've"]
        ) / n_sentences

        # === Word length distributions ===
        short_word_ratio = sum(1 for w in words_lower if len(w) <= 2) / n_words
        medium_word_ratio = sum(1 for w in words_lower if 3 <= len(w) <= 6) / n_words
        long_word_ratio = sum(1 for w in words_lower if len(w) >= 7) / n_words
        very_long_word_ratio = sum(1 for w in words_lower if len(w) >= 10) / n_words

        # === Paragraph stats ===
        para_lens = (
            [len(p.split()) for p in non_empty_paras] if non_empty_paras else [0]
        )
        avg_para_len = np.mean(para_lens)
        para_len_std = np.std(para_lens) if len(para_lens) > 1 else 0

        # === Discourse markers ===
        conjunction_ratio = vocab_share(self._CONJUNCTIONS)
        discourse_ratio = vocab_share(self._DISCOURSE)
        hedging_ratio = vocab_share(self._HEDGING)
        certainty_ratio = vocab_share(self._CERTAINTY)
        transition_ratio = vocab_share(self._TRANSITION)

        # === Question patterns ===
        # Sentences whose first word is an interrogative (whole-word match).
        question_starts = sum(1 for s in sentences if self._QUESTION_START.match(s))

        # === List features ===
        has_list = 1.0 if n_bullet > 0 or n_numbered > 0 else 0.0
        list_items = n_bullet + n_numbered

        # === Emoji and special chars ===
        # Counts every code point outside the Basic Multilingual Plane.
        emoji_count = len(re.findall(r"[\U00010000-\U0010ffff]", text))
        has_emoji = 1.0 if emoji_count > 0 else 0.0

        # === Specific style markers ===
        # ALL CAPS words (emphasis style)
        all_caps_words = sum(
            1 for w in words if len(w) > 1 and w.isupper() and w.isalpha()
        )
        all_caps_ratio = all_caps_words / n_words

        # Parenthetical asides
        paren_count = len(re.findall(r"\([^)]+\)", text))
        paren_ratio = paren_count / n_sentences

        # Lines ending with "?" feed two features: per sentence and per line.
        n_question_lines = sum(
            1 for ln in non_empty_lines if ln.strip().endswith("?")
        )
        rhetorical_ratio = n_question_lines / n_sentences
        question_line_ratio = n_question_lines / n_lines

        # Direct address / casual markers, and formal markers
        casual_ratio = vocab_share(self._CASUAL)
        formal_ratio = vocab_share(self._FORMAL)

        # Chinese character detection (CJK Unified Ideographs block)
        chinese_chars = len(re.findall(r"[\u4e00-\u9fff]", text))
        has_chinese = 1.0 if chinese_chars > 0 else 0.0
        chinese_ratio = chinese_chars / n_chars

        # Self-identification patterns
        has_self_id_ai = (
            1.0
            if re.search(
                r"\b(I'm|I am)\s+(an?\s+)?(AI|language model|assistant|chatbot)\b",
                text,
                re.IGNORECASE,
            )
            else 0.0
        )
        # The Chinese name sits outside the \b...\b group: CJK characters are
        # word characters, so \b never matches between neighbouring ideographs
        # and the name would be missed inside running Chinese text.
        has_provider_mention = (
            1.0
            if re.search(
                r"\b(Claude|Anthropic|GPT|OpenAI|ChatGPT|Gemini|Google|Bard|Grok|xAI"
                r"|DeepSeek|Kimi|Moonshot|Mistral|MiniMax|Zhipu|GLM)\b|深度求索",
                text,
                re.IGNORECASE,
            )
            else 0.0
        )

        # Response ending patterns
        ends_with_question = 1.0 if text.rstrip().endswith("?") else 0.0
        has_closing_offer = (
            1.0
            if re.search(
                r"(let me know|feel free|happy to help|don't hesitate|hope this helps)",
                text,
                re.IGNORECASE,
            )
            else 0.0
        )

        # Sentence complexity (approximation via commas per sentence)
        commas_per_sentence = text.count(",") / n_sentences

        # Line-level features
        avg_line_len = (
            np.mean([len(ln) for ln in non_empty_lines]) if non_empty_lines else 0
        )
        short_lines_ratio = (
            sum(1 for ln in non_empty_lines if len(ln.split()) <= 5) / n_lines
        )

        # Capitalized word ratio (proper nouns, emphasis)
        cap_words = len(re.findall(r"\b[A-Z][a-z]+\b", text))
        cap_word_ratio = cap_words / n_words

        # Non-overlapping runs of four whitespace-separated words, per sentence
        four_word_phrases = len(re.findall(r"\b\w+\s+\w+\s+\w+\s+\w+\b", text))
        phrase_ratio = four_word_phrases / n_sentences

        # Sentence boundary patterns
        sent_boundaries = len(re.findall(r"[.!?]\s+[A-Z]", text))
        sent_boundary_ratio = sent_boundaries / n_sentences

        # Special punctuation
        has_checkmark = (
            1.0 if "✓" in text or "✗" in text or "✔" in text or "✘" in text else 0.0
        )
        has_arrow = 1.0 if "→" in text or "←" in text or "➡" in text else 0.0
        has_star = 1.0 if "⭐" in text or "★" in text or "☆" in text else 0.0
        special_unicode = len(re.findall(r"[^\x00-\x7F]", text)) / n_chars

        # Colon-based definitions ("term: explanation")
        colon_definitions = len(re.findall(r"\b\w+:\s+\w+", text)) / n_sentences

        # Quotation usage
        double_quote_pairs = len(re.findall(r'"[^"]*"', text)) / n_sentences
        single_quote_pairs = len(re.findall(r"'[^']*'", text)) / n_sentences

        # Greeting patterns
        greeting_patterns = len(
            re.findall(
                r"\b(hi|hello|hey|hiya|greetings|howdy|yo)\b", text, re.IGNORECASE
            )
        )
        greeting_ratio = greeting_patterns / n_sentences

        # Response length categories
        is_short = 1.0 if n_words < 100 else 0.0
        is_medium = 1.0 if 100 <= n_words < 500 else 0.0
        is_long = 1.0 if n_words >= 500 else 0.0

        # Exclamation usage. ``sentences`` no longer contain their
        # terminators, so inspect the terminator runs in the raw text and
        # count those that end with "!".
        excl_sentences = sum(
            1 for run in re.findall(r"[.!?]+", text) if run.endswith("!")
        )
        excl_sentence_ratio = excl_sentences / n_sentences

        # Common conversational phrases
        conversational_phrases = len(
            re.findall(
                r"\b(great|perfect|sure|definitely|certainly|absolutely|of course"
                r"|no problem|sounds good|got it|understood|okay|alright)\b",
                text,
                re.IGNORECASE,
            )
        )
        conv_phrase_ratio = conversational_phrases / n_words

        # Helpful/closing phrases
        helpful_phrases = len(
            re.findall(
                r"\b(let me know|feel free|happy to|glad to|happy to help"
                r"|don't hesitate|let me know if|please let me|reach out)\b",
                text,
                re.IGNORECASE,
            )
        )
        helpful_ratio = helpful_phrases / n_sentences

        return [
            # Basic word stats (0-3)
            avg_word_len,
            word_len_std,
            median_word_len,
            avg_sent_len,
            # Sentence stats (4-9)
            sent_len_std,
            sent_len_max,
            sent_len_min,
            sent_len_median,
            sent_len_range,
            commas_per_sentence,
            # Punctuation density (10-22)
            n_commas,
            n_semicolons,
            n_colons,
            n_dash,
            n_parens,
            n_quotes,
            n_exclaim,
            n_question,
            n_period,
            n_ellipsis,
            comma_colon_ratio,
            comma_period_ratio,
            excl_question_ratio,
            # Markdown features (23-30)
            n_headers,
            n_bold,
            n_code_blocks,
            n_inline_code,
            n_bullet,
            n_numbered,
            n_tables,
            has_list,
            # Structure (31-40)
            newline_density,
            double_newline_ratio,
            uppercase_ratio,
            digit_ratio,
            space_ratio,
            unique_chars,
            unique_chars_ratio,
            list_items,
            n_paragraphs,
            n_lines / n_sentences,
            # Structural markers (41-44)
            has_think,
            has_xml,
            has_hr,
            has_url,
            # Pronoun features (45-47)
            first_person_ratio,
            second_person_ratio,
            third_person_ratio,
            # Vocabulary (48-52)
            ttr,
            hapax_ratio,
            contraction_ratio,
            short_word_ratio,
            medium_word_ratio,
            # Word length distributions (53-54)
            long_word_ratio,
            very_long_word_ratio,
            # Sentence starters (55-60)
            starter_vocab,
            and_starts,
            but_starts,
            so_starts,
            the_starts,
            it_starts,
            # Paragraph stats (61-62)
            avg_para_len,
            para_len_std,
            # Discourse markers (63-67)
            conjunction_ratio,
            discourse_ratio,
            hedging_ratio,
            certainty_ratio,
            transition_ratio,
            # Questions (68)
            question_starts / n_sentences,
            # Emoji/special (69-71)
            emoji_count,
            has_emoji,
            special_unicode,
            # Style markers (72-79)
            all_caps_ratio,
            paren_ratio,
            rhetorical_ratio,
            casual_ratio,
            formal_ratio,
            has_chinese,
            chinese_ratio,
            has_self_id_ai,
            # Provider mention & response patterns (80-83)
            has_provider_mention,
            ends_with_question,
            has_closing_offer,
            has_checkmark,
            # More structure (84-89)
            has_arrow,
            has_star,
            avg_line_len,
            short_lines_ratio,
            cap_word_ratio,
            phrase_ratio,
            # Final features (90-94)
            sent_boundary_ratio,
            colon_definitions,
            double_quote_pairs,
            single_quote_pairs,
            i_starts,
            # New features (95-102)
            greeting_ratio,
            is_short,
            is_medium,
            is_long,
            excl_sentence_ratio,
            question_line_ratio,
            conv_phrase_ratio,
            helpful_ratio,
        ]
558
+
559
+
560
class FeaturePipeline:
    """Build the combined feature matrix: word TF-IDF + char TF-IDF + stylometrics.

    Each text is cleaned with ``strip_cot`` then ``strip_markdown``; the
    enabled blocks are stacked column-wise and scaled with ``MaxAbsScaler``.

    A ``max_features`` of 0 in ``TFIDF_WORD_PARAMS`` / ``TFIDF_CHAR_PARAMS``
    disables that TF-IDF block; ``None`` or a missing key means "no limit".
    Passing ``use_tfidf=False`` disables both TF-IDF blocks.
    """

    def __init__(self, use_tfidf=True):
        word_params = dict(TFIDF_WORD_PARAMS)
        char_params = dict(TFIDF_CHAR_PARAMS)

        # Record the on/off decision before the 0 -> None rewrite below,
        # because afterwards "disabled" and "unlimited" look the same.
        word_enabled = self._limit_enabled(word_params)
        char_enabled = self._limit_enabled(char_params)

        # 0 is swapped for None so the vectorizer can still be constructed
        # (presumably because scikit-learn only accepts a positive integer or
        # None for max_features); a disabled vectorizer is never fitted.
        if word_params.get("max_features", 1) == 0:
            word_params["max_features"] = None
        if char_params.get("max_features", 1) == 0:
            char_params["max_features"] = None

        self.word_tfidf = TfidfVectorizer(**word_params)
        self.char_tfidf = TfidfVectorizer(**char_params)
        self.stylo = StylometricFeatures()
        self.scaler = MaxAbsScaler()
        self.use_tfidf = use_tfidf and (word_enabled or char_enabled)
        # Per-block switches: each block needs the global flag AND its own
        # non-zero max_features.
        self._use_word_tfidf = bool(self.use_tfidf and word_enabled)
        self._use_char_tfidf = bool(self.use_tfidf and char_enabled)

    @staticmethod
    def _limit_enabled(params):
        """Return True unless ``params['max_features']`` is 0 (or negative)."""
        limit = params.get("max_features", 1)
        # None means "unlimited"; comparing it with > would raise TypeError.
        return limit is None or limit > 0

    def _word_tfidf_active(self):
        """Whether the word TF-IDF block contributes columns."""
        try:
            return self._use_word_tfidf
        except AttributeError:
            # Instance created before the per-block flags existed (e.g. an
            # older pickle): fall back to the rule that was used back then.
            limit = self.word_tfidf.max_features
            return limit is not None and limit > 0

    def _char_tfidf_active(self):
        """Whether the char TF-IDF block contributes columns."""
        try:
            return self._use_char_tfidf
        except AttributeError:
            # Same fallback for instances that predate the per-block flags.
            return bool(self.use_tfidf)

    def _clean_for_tfidf(self, text):
        """Strip CoT and markdown for TF-IDF (remove formatting artifacts, keep content)."""
        return strip_markdown(strip_cot(text))

    def fit_transform(self, texts):
        """Fit every enabled block and the scaler on ``texts``; return the scaled matrix.

        ``texts`` must support ``len()``. Progress and timings are printed.
        """
        import time

        print(f" Input: {len(texts)} texts", flush=True)

        # The TF-IDF and stylometric inputs use the same cleaning, so the
        # cleaned list is computed once and shared.
        # NOTE(review): because markdown is stripped first, the markdown
        # features of StylometricFeatures see none of the original
        # formatting - confirm this is intended.
        cleaned = [self._clean_for_tfidf(t) for t in texts]

        if self._word_tfidf_active():
            t0 = time.time()
            word_features = self.word_tfidf.fit_transform(cleaned)
            print(
                f" word tfidf: {word_features.shape[1]} features ({time.time() - t0:.1f}s)",
                flush=True,
            )
        else:
            word_features = csr_matrix((len(texts), 0), dtype=np.float32)

        if self._char_tfidf_active():
            t0 = time.time()
            char_features = self.char_tfidf.fit_transform(cleaned)
            print(
                f" char tfidf: {char_features.shape[1]} features ({time.time() - t0:.1f}s)",
                flush=True,
            )
        else:
            char_features = csr_matrix((len(texts), 0), dtype=np.float32)

        t0 = time.time()
        stylo_features = self.stylo.fit_transform(cleaned)
        print(
            f" stylometric: {stylo_features.shape[1]} features ({time.time() - t0:.1f}s)",
            flush=True,
        )

        combined = hstack([word_features, char_features, stylo_features])
        combined = self.scaler.fit_transform(combined)
        print(f" Combined feature matrix: {combined.shape}", flush=True)
        return combined

    def transform(self, texts):
        """Apply the already-fitted blocks and scaler to ``texts``.

        Uses the same block layout as ``fit_transform`` so column positions
        line up with what the scaler was fitted on.
        """
        cleaned = [self._clean_for_tfidf(t) for t in texts]

        if self._word_tfidf_active():
            word_features = self.word_tfidf.transform(cleaned)
        else:
            word_features = csr_matrix((len(texts), 0), dtype=np.float32)

        if self._char_tfidf_active():
            char_features = self.char_tfidf.transform(cleaned)
        else:
            char_features = csr_matrix((len(texts), 0), dtype=np.float32)

        stylo_features = self.stylo.transform(cleaned)
        combined = hstack([word_features, char_features, stylo_features])
        return self.scaler.transform(combined)