rtferraz committed on
Commit
c00ac2c
·
verified ·
1 Parent(s): 818a2e9

Add predefined schemas (FINANCE, ECOMMERCE, HEALTHCARE)

Browse files
src/domain_tokenizer/schemas/predefined.py ADDED
@@ -0,0 +1,94 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Predefined domain schemas for common use cases.
3
+
4
+ Each schema follows the validated patterns from the research:
5
+ - FINANCE_SCHEMA: Based on Nubank nuFormer (arXiv:2507.23267) — 97 special tokens
6
+ - ECOMMERCE_SCHEMA: Adapted from ActionPiece (arXiv:2502.13581) + nuFormer patterns
7
+ - HEALTHCARE_SCHEMA: Clinical event sequences
8
+ """
9
+
10
+ from ..schema import DomainSchema, FieldSpec, FieldType
11
+
12
+
13
+ # =============================================================================
14
+ # FINANCE SCHEMA — Based on Nubank nuFormer
15
+ # sign(2) + amount_bucket(21) + month(12) + dow(7) + dom(31) + hour(24) = 97
16
+ # =============================================================================
17
+
18
# Predefined schema for financial transactions. Field order below is:
# amount sign, bucketed amount, calendar parts of the timestamp, free text.
FINANCE_SCHEMA = DomainSchema(
    name="finance",
    description=(
        "Financial transaction schema following Nubank nuFormer (arXiv:2507.23267). "
        "Each transaction = sign + amount bucket + calendar features + text description. "
        "~14 tokens per transaction, 2048 context = ~146 transactions."
    ),
    fields=[
        # Sign of the amount, declared as its own field.
        FieldSpec(
            name="amount_sign",
            field_type=FieldType.SIGN,
            prefix="AMT_SIGN",
        ),
        # Continuous amount, discretised into 21 bins.
        FieldSpec(
            name="amount",
            field_type=FieldType.NUMERICAL_CONTINUOUS,
            prefix="AMT",
            n_bins=21,
        ),
        # Timestamp split into four calendar parts; no prefix is passed here.
        FieldSpec(
            name="timestamp",
            field_type=FieldType.TEMPORAL,
            calendar_fields=["month", "dow", "dom", "hour"],
        ),
        # Free-text transaction description.
        FieldSpec(
            name="description",
            field_type=FieldType.TEXT,
            prefix="DESC",
        ),
    ],
)
33
+
34
+
35
+ # =============================================================================
36
+ # E-COMMERCE SCHEMA — Adapted from ActionPiece + nuFormer patterns
37
+ # =============================================================================
38
+
39
# Predefined schema for e-commerce events. Field order below is:
# event type, price, quantity, product category, timestamp, product title.
ECOMMERCE_SCHEMA = DomainSchema(
    name="ecommerce",
    description=(
        "E-commerce event schema adapted from ActionPiece (arXiv:2502.13581) "
        "and nuFormer patterns. Events: view/cart/purchase/return/wishlist. "
        "~16 tokens per event, 2048 context = ~128 events."
    ),
    fields=[
        # Closed vocabulary of five event kinds.
        FieldSpec(
            name="event_type",
            field_type=FieldType.CATEGORICAL_FIXED,
            prefix="EVT",
            categories=[
                "view",
                "add_to_cart",
                "purchase",
                "return",
                "wishlist",
            ],
        ),
        # Continuous price, discretised into 21 bins.
        FieldSpec(
            name="price",
            field_type=FieldType.NUMERICAL_CONTINUOUS,
            prefix="PRICE",
            n_bins=21,
        ),
        # Discrete quantity with max_value=10.
        # NOTE(review): how values above 10 are handled is decided by
        # FieldSpec / the tokenizer and is not visible here — confirm.
        FieldSpec(
            name="quantity",
            field_type=FieldType.NUMERICAL_DISCRETE,
            prefix="QTY",
            max_value=10,
        ),
        # Closed vocabulary of twenty product categories, ending in "other".
        FieldSpec(
            name="category",
            field_type=FieldType.CATEGORICAL_FIXED,
            prefix="CAT",
            categories=[
                "electronics",
                "clothing",
                "home_garden",
                "books",
                "sports",
                "toys",
                "food_grocery",
                "health_beauty",
                "automotive",
                "office",
                "pet_supplies",
                "jewelry",
                "music",
                "movies",
                "games",
                "baby",
                "tools",
                "arts_crafts",
                "industrial",
                "other",
            ],
        ),
        # Timestamp split into four calendar parts; no prefix is passed here.
        FieldSpec(
            name="timestamp",
            field_type=FieldType.TEMPORAL,
            calendar_fields=["month", "dow", "dom", "hour"],
        ),
        # Free-text product title.
        FieldSpec(
            name="product_title",
            field_type=FieldType.TEXT,
            prefix="TITLE",
        ),
    ],
)
63
+
64
+
65
+ # =============================================================================
66
+ # HEALTHCARE SCHEMA — Clinical event sequences
67
+ # =============================================================================
68
+
69
# Predefined schema for clinical event sequences. Field order below is:
# event type, cost, severity, provider type, timestamp, free text.
HEALTHCARE_SCHEMA = DomainSchema(
    name="healthcare",
    description=(
        "Clinical event schema for healthcare sequences. "
        "Events: diagnosis/procedure/lab/medication/visit."
    ),
    fields=[
        # Closed vocabulary of ten clinical event kinds.
        FieldSpec(
            name="event_type",
            field_type=FieldType.CATEGORICAL_FIXED,
            prefix="CLIN",
            categories=[
                "diagnosis",
                "procedure",
                "lab_result",
                "medication",
                "visit_inpatient",
                "visit_outpatient",
                "visit_er",
                "imaging",
                "referral",
                "discharge",
            ],
        ),
        # Continuous cost, discretised into 21 bins.
        FieldSpec(
            name="cost",
            field_type=FieldType.NUMERICAL_CONTINUOUS,
            prefix="COST",
            n_bins=21,
        ),
        # Four severity levels, listed from "low" to "critical".
        FieldSpec(
            name="severity",
            field_type=FieldType.CATEGORICAL_FIXED,
            prefix="SEV",
            categories=["low", "moderate", "high", "critical"],
        ),
        # Closed vocabulary of eight provider kinds, ending in "other".
        FieldSpec(
            name="provider_type",
            field_type=FieldType.CATEGORICAL_FIXED,
            prefix="PROV",
            categories=[
                "pcp",
                "specialist",
                "surgeon",
                "er_physician",
                "nurse_practitioner",
                "therapist",
                "pharmacist",
                "other",
            ],
        ),
        # Timestamp split into three calendar parts; unlike the other
        # predefined schemas, "hour" is not requested here.
        FieldSpec(
            name="timestamp",
            field_type=FieldType.TEMPORAL,
            calendar_fields=["month", "dow", "dom"],
        ),
        # Free-text event description.
        FieldSpec(
            name="description",
            field_type=FieldType.TEXT,
            prefix="DESC",
        ),
    ],
)