MetaCortex-Dynamics committed on
Commit
d10d4e5
Β·
verified Β·
1 Parent(s): cc7c68a

Create pipeline/stages/s3_decompose.py

Browse files
Files changed (1) hide show
  1. pipeline/stages/s3_decompose.py +354 -0
pipeline/stages/s3_decompose.py ADDED
@@ -0,0 +1,354 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ S3: DECOMPOSE β€” Prose to (Ch.A, Ch.B, Ch.C) in 15-operator vocabulary.
3
+
4
+ Per SPEC-PIPELINE-001 Part B.2:
5
+ (a) Identify channel_a content β†’ Ch.A using WHENCE, WHEN
6
+ (b) Identify channel_b content β†’ Ch.B using WHAT, WHERE, WHICH
7
+ (c) Identify channel_c content β†’ Ch.C using FOR-WHAT, HOW
8
+
9
+ T1 bypass: CecCert sources call extract_from_cert() directly.
10
+ This module handles T2 (RTL docs) and T3 (formal properties).
11
+
12
+ THIS IS THE CORE IP. The decomposer patterns are protected.
13
+ """
14
+
15
+ from __future__ import annotations
16
+
17
+ from pipeline.stages.s1_segment import Segment
18
+ from pipeline.stages.s2_classify import ClassifiedSegment, Classification
19
+ from pipeline.types import (
20
+ ModalityGrounding, Op, OperatorExpression, OperatorSequence,
21
+ SourceProvenance, Tier, FrameExample, Witness, WitnessAttestation,
22
+ WitnessBundle,
23
+ )
24
+
25
+
26
def decompose(classified: ClassifiedSegment) -> FrameExample | None:
    """Decompose a TECHNICAL segment into a FrameExample.

    Routes the segment to a tier-specific decomposer. Returns None if the
    segment cannot be decomposed (should not happen for properly classified
    TECHNICAL segments — log as Untranslatable).
    """
    seg = classified.segment
    if classified.classification != Classification.TECHNICAL:
        return None

    tier = seg.source.tier
    if tier == Tier.T1:
        return _decompose_t1(seg)
    if tier == Tier.T2:
        return _decompose_t2(seg)
    if tier == Tier.T3:
        # T3 includes chat archives — use content-aware routing
        if seg.segment_type.startswith("chat_"):
            return _decompose_chat(seg)
        return _decompose_t3(seg)
    return None
46
+
47
+
48
def _decompose_t1(seg: Segment) -> FrameExample:
    """T1 bypass — CecCert extraction handled by kernel.

    At pipeline level, T1 records are already structured. The kernel's
    extract_from_cert() does the actual projection. Here we wrap
    the segment text as a minimal FrameExample for T1 non-cert records
    (e.g., CSV rows, Yosys JSON fragments, Kernel source).
    """
    text = seg.text
    text_lower = text.lower()

    # Kernel source — extract operator structure from code.
    # NOTE: evidence strings below are plain literals (no interpolation);
    # the original used placeholder-less f-strings (lint F541).
    if any(kw in text_lower for kw in ["module ", "pub fn ", "pub mod ", "record ", "trait ", "impl "]):
        return _build_example(
            seg,
            g_ops=[
                (Op.THIS, "this(source_module) — T1 code unit"),
                (Op.NO, "no(floating_point) — deterministic arithmetic"),
            ],
            s_ops=[
                (Op.GOES_WITH, "goes_with(module, dependencies) — channel_b relation"),
                (Op.TOGETHER_ALONE, "together(declarations) — module composition"),
            ],
            f_ops=[
                (Op.IF_THEN, "if_then(precondition, postcondition) — code contract"),
                (Op.MUST_LET, "must(type_safety) — effect annotation"),
            ],
        )

    # Fallback: generic structured T1 record (CSV row, JSON fragment, ...).
    return _build_example(
        seg,
        g_ops=[(Op.THIS, "this(record) — T1 structured data")],
        s_ops=[
            (Op.GOES_WITH, "goes_with(record, format) — channel_b relation"),
            (Op.INSIDE_OUTSIDE, "inside(record, T1_corpus)"),
        ],
        f_ops=[(Op.SAME_NOT_SAME, "same(reference, transformed) — equivalence check")],
    )
86
+
87
+
88
def _decompose_t2(seg: Segment) -> FrameExample:
    """T2: RTL design documents — Verilog modules, design specs.

    Module segments are decomposed around the extracted module name and
    port list; other segments (sections/paragraphs) get a generic frame
    that still carries a home operator in each modality.
    """
    text = seg.text

    # Module decomposition
    if seg.segment_type == "module":
        module_name = _extract_module_name(text)
        ports = _extract_ports(text)
        return _build_example(
            seg,
            g_ops=[
                (Op.THIS, f"this(module={module_name}) — design unit"),
                (Op.BECAUSE, f"because(RTL_specification, {module_name})"),
            ],
            s_ops=[
                (Op.INSIDE_OUTSIDE, f"inside({module_name}, design_hierarchy)"),
                # Only the first four ports are listed to keep evidence short.
                (Op.TOGETHER_ALONE, f"together({', '.join(ports[:4])}) — port interface"),
            ],
            f_ops=[
                (Op.CAN_CANNOT, f"can({module_name}, synthesize)"),
                # Plain literal — the original f-string had no placeholder (F541).
                (Op.MUST_LET, "must(timing_constraints) — channel_c requirement"),
            ],
        )

    # Section/paragraph decomposition — ensure home operators in each modality
    return _build_example(
        seg,
        g_ops=[(Op.THIS, "this(section) — design document content")],
        s_ops=[
            (Op.GOES_WITH, "goes_with(section, T2_document) — channel_b relation"),
            (Op.INSIDE_OUTSIDE, "inside(content, document_scope)"),
        ],
        f_ops=[
            (Op.IF_THEN, "if_then(requirement, behavior)"),
            (Op.NEAR_FAR, "near(section, related_context) — document proximity"),
        ],
    )
125
+
126
+
127
def _decompose_chat(seg: Segment) -> FrameExample:
    """T3 chat messages — extract operator reasoning from conversation.

    Scans the message text for keyword cues per operator, building one
    operator list per channel. Each channel is guaranteed at least one
    operator (THIS for Ch.A; GOES_WITH / IF_THEN defaults for Ch.B/Ch.C).
    """
    # Fix: removed an unused function-level `import re` from the original.
    text = seg.text
    text_lower = text.lower()
    role = seg.segment_type.replace("chat_", "")
    snippet = text[:80]  # hoisted: shared prefix quoted in every evidence string

    # ── Ch.A: Channel A (Tier 1: THIS, SAME/NOT-SAME, NO) ──
    g_ops: list[tuple[Op, str]] = [
        (Op.THIS, f"this({role}_message) — deictic anchor to speaker turn"),
    ]
    # If the message references identity/equivalence
    if any(w in text_lower for w in ["same", "identical", "equal", "different", "not the same", "distinguish"]):
        g_ops.append((Op.SAME_NOT_SAME, f"same/not-same detected in: {snippet}"))
    # If the message uses negation channel_bly
    if any(w in text_lower for w in ["not ", "no ", "never ", "cannot ", "don't", "doesn't", "isn't"]):
        g_ops.append((Op.NO, f"no(negation) in: {snippet}"))

    # ── Ch.B: Channel B (Tier 2: GOES-WITH, TOGETHER/ALONE, MANY/ONE, EVERY/SOME, MORE/LESS, CAN/CANNOT) ──
    s_ops: list[tuple[Op, str]] = []

    # GOES-WITH: association/coupling language
    if any(w in text_lower for w in ["coupled", "associated", "related", "connected", "linked", "maps to", "corresponds"]):
        s_ops.append((Op.GOES_WITH, f"goes_with(relation) in: {snippet}"))
    else:
        # Default: message goes-with its conversation context
        s_ops.append((Op.GOES_WITH, f"goes_with({role}, conversation_context)"))

    # TOGETHER/ALONE: composition language
    if any(w in text_lower for w in ["together", "combined", "joint", "both", "separate", "alone", "independent", "isolated"]):
        s_ops.append((Op.TOGETHER_ALONE, f"together/alone in: {snippet}"))

    # MANY/ONE: cardinality language
    if any(w in text_lower for w in ["multiple", "several", "many", "each", "single", "one ", "unique"]):
        s_ops.append((Op.MANY_ONE, f"many/one in: {snippet}"))

    # EVERY/SOME: quantification
    if any(w in text_lower for w in ["every ", "all ", "any ", "some ", "each "]):
        s_ops.append((Op.EVERY_SOME, f"every/some in: {snippet}"))

    # MORE/LESS: comparative
    if any(w in text_lower for w in ["more ", "less ", "greater", "smaller", "better", "worse", "stronger", "weaker"]):
        s_ops.append((Op.MORE_LESS, f"more/less in: {snippet}"))

    # CAN/CANNOT: capability
    if any(w in text_lower for w in ["can ", "cannot ", "capable", "able to", "unable"]):
        s_ops.append((Op.CAN_CANNOT, f"can/cannot in: {snippet}"))

    # Ensure at least one Tier 2 operator
    if not any(op in _TIER_2_OPS for op, _ in s_ops):
        s_ops.insert(0, (Op.GOES_WITH, f"goes_with({role}, topic) — default channel_b relation"))

    # ── Ch.C: Channel C (Tier 3: INSIDE/OUTSIDE, NEAR/FAR, IF/THEN, BECAUSE, MAYBE, MUST/LET) ──
    f_ops: list[tuple[Op, str]] = []

    # IF/THEN: conditional reasoning
    if any(w in text_lower for w in ["if ", "then ", "when ", "implies", "conditional", "given that"]):
        f_ops.append((Op.IF_THEN, f"if_then in: {snippet}"))

    # BECAUSE: causal reasoning
    if any(w in text_lower for w in ["because", "since ", "therefore", "thus ", "hence", "reason", "caused by"]):
        f_ops.append((Op.BECAUSE, f"because in: {snippet}"))

    # MUST/LET: deontic
    if any(w in text_lower for w in ["must ", "shall ", "required", "permitted", "allowed", "forbidden"]):
        f_ops.append((Op.MUST_LET, f"must/let in: {snippet}"))

    # MAYBE: epistemic uncertainty
    if any(w in text_lower for w in ["maybe", "perhaps", "possibly", "might ", "uncertain", "unclear"]):
        f_ops.append((Op.MAYBE, f"maybe in: {snippet}"))

    # INSIDE/OUTSIDE: containment/boundary
    if any(w in text_lower for w in ["inside", "outside", "within", "boundary", "scope", "contained", "enclosed"]):
        f_ops.append((Op.INSIDE_OUTSIDE, f"inside/outside in: {snippet}"))

    # NEAR/FAR: proximity/distance language
    if any(w in text_lower for w in ["near ", "far ", "close to", "distant", "proxim", "remote", "adjacent", "approach"]):
        f_ops.append((Op.NEAR_FAR, f"near/far in: {snippet}"))

    # Default: at least one Tier 3 operator
    if not any(op in _CAUSAL_OPS for op, _ in f_ops):
        f_ops.append((Op.IF_THEN, f"if_then({role}_states, content_follows) — default channel_c"))

    return _build_example(seg, g_ops=g_ops, s_ops=s_ops, f_ops=f_ops)
211
+
212
+
213
# Operator sets for home-condition checking.
# Used by _decompose_chat to guarantee each example carries at least one
# Ch.B (Tier 2) and one Ch.C (Tier 3) operator before building the frame.
_TIER_2_OPS = {Op.GOES_WITH, Op.TOGETHER_ALONE, Op.MANY_ONE, Op.EVERY_SOME, Op.MORE_LESS, Op.CAN_CANNOT}
_CAUSAL_OPS = {Op.INSIDE_OUTSIDE, Op.NEAR_FAR, Op.IF_THEN, Op.BECAUSE, Op.MAYBE, Op.MUST_LET}
216
+
217
+
218
def _decompose_t3(seg: Segment) -> FrameExample:
    """T3: Formal property specifications — SVA assertions, constraints.

    Assertion segments get a property-centric frame; any other T3 segment
    gets a generic formal-content frame.
    """
    # NOTE: evidence strings below are plain literals (no interpolation);
    # the original used placeholder-less f-strings (lint F541).
    if seg.segment_type == "assertion":
        return _build_example(
            seg,
            g_ops=[
                (Op.THIS, "this(assertion) — formal property"),
                (Op.BECAUSE, "because(design_requirement, property)"),
            ],
            s_ops=[
                (Op.EVERY_SOME, "every(cycle, property_holds)"),
                (Op.MUST_LET, "must(property) — invariant constraint"),
            ],
            f_ops=[
                (Op.IF_THEN, "if_then(antecedent, consequent) — temporal implication"),
                (Op.SAME_NOT_SAME, "same(design_intent, formal_property)"),
            ],
        )

    return _build_example(
        seg,
        g_ops=[(Op.THIS, "this(formal_content) — T3 specification")],
        s_ops=[
            (Op.GOES_WITH, "goes_with(property, verification_scope) — channel_b relation"),
            (Op.INSIDE_OUTSIDE, "inside(property, formal_context)"),
        ],
        f_ops=[(Op.MUST_LET, "must(constraint) — formal requirement")],
    )
248
+
249
+
250
+ # ═══════════════════════════════════════════════════════════════════════════════
251
+ # HELPERS
252
+ # ═══════════════════════════════════════════════════════════════════════════════
253
+
254
def _build_example(
    seg: Segment,
    g_ops: list[tuple[Op, str]],
    s_ops: list[tuple[Op, str]],
    f_ops: list[tuple[Op, str]],
) -> FrameExample:
    """Construct a FrameExample from operator lists and auto-witness.

    Sorts each modality's operators by DAG index (non-decreasing)
    before building the sequence. This ensures G₁ channel_b integrity.
    """

    def _ground(modality: str, ops: list[tuple[Op, str]]) -> ModalityGrounding:
        # Sort by operator index to maintain DAG ordering.
        ordered = sorted(ops, key=lambda pair: pair[0].value)
        return ModalityGrounding(
            modality=modality,
            operators=OperatorSequence(
                expressions=[
                    OperatorExpression(operator=op, evidence=ev)
                    for op, ev in ordered
                ]
            ),
        )

    src = seg.source
    provenance = SourceProvenance(
        source_id=src.source_id,
        tier=src.tier,
        url=src.url,
        commit_or_version=src.commit_or_version,
        license=src.license,
        acquired_at=src.acquired_at,
        artifact_sha256=src.artifact_sha256,
    )

    example = FrameExample(
        provenance=provenance,
        channel_a=_ground("G", g_ops),
        channel_b=_ground("S", s_ops),
        channel_c=_ground("F", f_ops),
        witnesses=_auto_witness(seg),
    )
    example.content_hash = example.compute_hash()
    return example
310
+
311
+
312
def _auto_witness(seg: Segment) -> WitnessBundle:
    """Generate witness attestations from segment metadata."""
    src = seg.source
    # Evidence text per witness; dict insertion order matches the original
    # attestation order (WHAT, WHERE, WHICH, WHEN, FOR_WHAT, HOW, WHENCE).
    evidence_by_witness = {
        Witness.WHAT: f"{seg.segment_type} segment, {len(seg.text)} chars",
        Witness.WHERE: f"source={src.source_id}, bytes={seg.byte_range}",
        Witness.WHICH: f"tier={src.tier.value}, type={seg.segment_type}",
        Witness.WHEN: f"acquired={src.acquired_at}",
        Witness.FOR_WHAT: "governed training data for governed generation pipeline",
        Witness.HOW: f"pipeline S3 decompose, tier={src.tier.value}",
        Witness.WHENCE: f"url={src.url}, commit={src.commit_or_version}",
    }

    bundle = WitnessBundle()
    for witness, evidence in evidence_by_witness.items():
        bundle.attestations[witness] = WitnessAttestation(
            witness=witness, attested=True, evidence=evidence,
        )
    return bundle
344
+
345
+
346
+ def _extract_module_name(text: str) -> str:
347
+ import re
348
+ m = re.search(r"module\s+(\w+)", text)
349
+ return m.group(1) if m else "unknown"
350
+
351
+
352
+ def _extract_ports(text: str) -> list[str]:
353
+ import re
354
+ return re.findall(r"\b(?:input|output|inout)\s+(?:\[\d+:\d+\]\s*)?(\w+)", text)