# CHIP / chip / rules / rules.yaml
# luancy1208's picture
# v0.2 initial
# 67d959b verified
# CHIP Compression Rules v0.2 (2025-05-01)
# ==========================================
# v0.2 vs v0.1 主要变化:
# - 标签层从 [角:X] / 【任】 改为 ### 角色 (实测全 tokenizer 1 token,完爆方括号)
# - 新增 L3 成语层(基于 idiom_whitelist.json 实测)
# - 新增 L4 协议层(归一化用户已有的标签)
rules:
# ============================================================
# L1: Lexical layer — prune wordy politeness boilerplate
# ============================================================
# Each rule pairs a regex `pattern` with its `replacement`. `saves` looks
# like an estimated token saving and `risk` a low/mid/high rating —
# TODO confirm both against the consumer of this file.

# Drops the polite opener 请(你)帮(我).
- id: L1-001
  layer: L1
  pattern: "请你?帮我?"
  replacement: ""
  saves: 2
  risk: low
  description: "客套语 '请你帮我' / '请帮我' → 空"
# NOTE(review): unanchored, so it also deletes the adjective use of 麻烦
# (e.g. in "很麻烦") — confirm that is acceptable for a low-risk rule.
- id: L1-002
  layer: L1
  pattern: "麻烦你?"
  replacement: ""
  saves: 2
  risk: low
# The optional trailing comma may be fullwidth (,) or ASCII (,).
- id: L1-003
  layer: L1
  pattern: "如果可以的话[,,]?"
  replacement: ""
  saves: 3
  risk: low
# Drops the modal before a request verb. The lookbehind keeps 能/可以 when
# it is the tail of another word (不能, 不可以, 可能, 只能, 才能, 功能, 性能,
# 智能, 技能), where removing it would change the meaning of the sentence.
- id: L1-004
  layer: L1
  pattern: "(?<![不可只才功性智技])(?:能不能|可不可以|可以|能)(?=帮|告诉|解释|总结|分析)"
  replacement: ""
  saves: 2
  risk: low
# NOTE(review): unanchored, so it also deletes the adjective use of 辛苦
# (e.g. in "很辛苦") — confirm that is acceptable for a low-risk rule.
- id: L1-005
  layer: L1
  pattern: "辛苦你?"
  replacement: ""
  saves: 2
  risk: low
# Drops thanks plus one optional trailing mark; the class covers both the
# fullwidth and ASCII exclamation mark and both styles of full stop.
- id: L1-006
  layer: L1
  pattern: "(?:谢谢|感谢)(?:你|了)?[!!.。]?"
  replacement: ""
  saves: 2
  risk: low
# ---- 进行/做 + verbal noun → the bare verb ----
# The shared infix (?:一?(?:个|下|次)?)? absorbs an optional measure phrase
# such as 一个 / 一下 / 下 / 次 between the light verb and the verbal noun.
- id: L1-010
  layer: L1
  pattern: "进行(?:一?(?:个|下|次)?)?分析"
  replacement: "分析"
  saves: 2
  risk: low
- id: L1-011
  layer: L1
  pattern: "进行(?:一?(?:个|下|次)?)?总结"
  replacement: "总结"
  saves: 2
  risk: low
- id: L1-012
  layer: L1
  pattern: "进行(?:一?(?:个|下|次)?)?处理"
  replacement: "处理"
  saves: 2
  risk: low
- id: L1-013
  layer: L1
  pattern: "进行(?:一?(?:个|下|次)?)?解释"
  replacement: "解释"
  saves: 2
  risk: low
# Keeps the matched verb 判断 itself, like every sibling rule in this group;
# the replacement previously substituted a different word (判定).
- id: L1-014
  layer: L1
  pattern: "做(?:一?(?:个|下|次)?)?判断"
  replacement: "判断"
  saves: 3
  risk: low
- id: L1-015
  layer: L1
  pattern: "做(?:一?(?:个|下|次)?)?解释"
  replacement: "解释"
  saves: 3
  risk: low
- id: L1-016
  layer: L1
  pattern: "给(?:出|我)(?:一些|几个)?建议"
  replacement: "建议"
  saves: 2
  risk: low
# NOTE(review): 相对 in the alternation looks like a typo for 相应 — confirm.
- id: L1-017
  layer: L1
  pattern: "提供(?:一些|相关|相对)?帮助"
  replacement: "助"
  saves: 2
  risk: mid
- id: L1-018
  layer: L1
  pattern: "进行(?:一?(?:个|下|次)?)?检查"
  replacement: "检查"
  saves: 2
  risk: low
- id: L1-019
  layer: L1
  pattern: "进行(?:一?(?:个|下|次)?)?优化"
  replacement: "优化"
  saves: 2
  risk: low
# ---- Connectives ----
# Wherever a rule absorbs an optional trailing comma, the character class
# lists both the fullwidth (,) and the ASCII (,) form.
- id: L1-020
  layer: L1
  pattern: "也就是说[,,]?"
  replacement: "即"
  saves: 3
  risk: low
- id: L1-021
  layer: L1
  pattern: "换句话说[,,]?"
  replacement: "即"
  saves: 3
  risk: low
- id: L1-022
  layer: L1
  pattern: "与此同时[,,]?"
  replacement: "同时,"
  saves: 2
  risk: low
- id: L1-023
  layer: L1
  pattern: "在这种情况下[,,]?"
  replacement: "此时,"
  saves: 3
  risk: low
- id: L1-024
  layer: L1
  pattern: "由此可见[,,]?"
  replacement: "故"
  saves: 3
  risk: low
# Also absorbs a trailing 说 (as in 因此说) instead of a comma.
- id: L1-025
  layer: L1
  pattern: "因此(?:[,,]|说)?"
  replacement: "故"
  saves: 1
  risk: low
- id: L1-026
  layer: L1
  pattern: "如果没有"
  replacement: "若无"
  saves: 2
  risk: low
# The captured span may not cross a comma, full stop, semicolon or newline,
# so an earlier unrelated 通过 cannot pair with a 的方式 in a later clause.
- id: L1-027
  layer: L1
  pattern: "通过([^,,。;;\\n]+?)的方式"
  replacement: "用\\1"
  saves: 2
  risk: mid
- id: L1-028
  layer: L1
  pattern: "(?:如上所述|前面提到的|刚才说的)"
  replacement: "前述"
  saves: 3
  risk: low
# ---- Modifier adverbs ----
# NOTE(review): L1-030 and L1-031 delete the adjective together with its
# hedge, so an instruction such as 简洁 or 详细 disappears from the prompt —
# confirm that this loss is intended for low-risk rules.
- id: L1-030
  layer: L1
  pattern: "比较(?:简洁|清晰|详细)地?"
  replacement: ""
  saves: 3
  risk: low
- id: L1-031
  layer: L1
  pattern: "相对(?:简洁|详细|完整)地?"
  replacement: ""
  saves: 3
  risk: low
- id: L1-032
  layer: L1
  pattern: "尽可能(?:地)?"
  replacement: "尽量"
  saves: 1
  risk: low
# Strips only the intensifier 非常 and the particle 地; the matched adjective
# is written back through the capture group, so 详尽 and 全面 are no longer
# rewritten to 详细.
- id: L1-033
  layer: L1
  pattern: "非常(详细|详尽|全面)地?"
  replacement: "\\1"
  saves: 2
  risk: low
# ============================================================
# L2: Syntactic layer
# ============================================================
# Moves the verb in front of its object and drops 对…进行 along with any
# measure word or degree modifier in between.
# NOTE(review): the final group is greedy ({1,4}), so it can capture
# characters past the intended verb (e.g. in "进行分析并总结"); left
# unchanged here.
- id: L2-001
  layer: L2
  pattern: "对(.+?)进行(?:一?(?:个|下|次)?(?:全面|详细|简要|认真|深入)?的?)?([\\u4e00-\\u9fff]{1,4})"
  replacement: "\\2\\1"
  saves: 2
  risk: mid
  description: "'对 X 进行 Y' → 'Y X'"
# The second capture ends at a comma (fullwidth or ASCII), a full stop,
# whitespace, or the end of the input, so a phrase that closes the text
# without punctuation is rewritten too.
- id: L2-002
  layer: L2
  pattern: "把(.+?)作为(.+?)(?=[,,。.\\s]|$)"
  replacement: "视\\1为\\2"
  saves: 2
  risk: mid
- id: L2-003
  layer: L2
  pattern: "由于(.+?)所以"
  replacement: "\\1故"
  saves: 3
  risk: low
- id: L2-004
  layer: L2
  pattern: "虽然(.+?)但是"
  replacement: "\\1然"
  saves: 3
  risk: mid
- id: L2-005
  layer: L2
  pattern: "不仅(.+?)而且"
  replacement: "\\1且"
  saves: 3
  risk: low
- id: L2-006
  layer: L2
  pattern: "因为(.+?)所以"
  replacement: "\\1故"
  saves: 3
  risk: low
- id: L2-007
  layer: L2
  pattern: "如果(.+?)那么"
  replacement: "若\\1则"
  saves: 2
  risk: low
# ---- List conversion ----
# An ordinal or sequencing word followed by a comma becomes a numbered-list
# marker. The comma is required and may be fullwidth (,) or ASCII (,).
- id: L2-010
  layer: L2
  pattern: "第一[,,]"
  replacement: "1. "
  saves: 1
  risk: low
- id: L2-011
  layer: L2
  pattern: "第二[,,]"
  replacement: "2. "
  saves: 1
  risk: low
- id: L2-012
  layer: L2
  pattern: "第三[,,]"
  replacement: "3. "
  saves: 1
  risk: low
- id: L2-013
  layer: L2
  pattern: "第四[,,]"
  replacement: "4. "
  saves: 1
  risk: low
- id: L2-014
  layer: L2
  pattern: "首先[,,]"
  replacement: "1. "
  saves: 1
  risk: low
- id: L2-015
  layer: L2
  pattern: "其次[,,]"
  replacement: "2. "
  saves: 1
  risk: low
# ============================================================
# L2 protocol-style rewrites (revised in v0.2)
# Measured: ### is a single token on all 9 tokenizers tested
# ============================================================
# The replacements are double-quoted so that \n is a real line break and
# the "### ..." header starts on its own line.
- id: L2-020
  layer: L2
  pattern: "请\\s*(?:用|以)?\\s*(?:JSON|json|Json)\\s*格式\\s*(?:输出|返回|回答)"
  replacement: "\n### 输出\nJSON"
  saves: 4
  risk: low
- id: L2-021
  layer: L2
  pattern: "请\\s*(?:用|以)?\\s*中文\\s*(?:回答|回复|输出)"
  replacement: "\n### 输出\n中文"
  saves: 3
  risk: low
# The role text is captured lazily up to the first comma (fullwidth or
# ASCII), full stop, newline, the literal 的角色, or the end of the input.
- id: L2-022
  layer: L2
  pattern: "请\\s*(?:你)?\\s*扮演\\s*(?:一(?:个|位))?\\s*(.+?)(?=[,,。.\\n]|的角色|$)"
  replacement: "\n### 角色\n\\1\n"
  saves: 4
  risk: high
  description: |
    '请你扮演一位 X' → '### 角色\nX'
    已知问题:含空格的复合 NP 可能被截断,Day 3 用 jieba 修复
# ============================================================
# L3: Idiom layer (default: the 11 "universal" core idioms; must be
#     enabled explicitly with layer=L3)
# Each idiom is 1 token on >= 3 Chinese-developed tokenizers, per the
# measurements in idiom_whitelist.json
# ============================================================
# NOTE(review): only six L3 rules are visible in this section.
# Patterns here contain no backslash escapes, so they are single-quoted.
- id: L3-001
  layer: L3
  pattern: '(?:大家都知道|每个人都知道|众人皆知)'
  replacement: '众所周知'
  saves: 2
  risk: mid
- id: L3-002
  layer: L3
  pattern: '投入(?:全部|所有)?(?:精力|力量)(?:去做|做)?'
  replacement: '全力以赴'
  saves: 2
  risk: mid
- id: L3-003
  layer: L3
  pattern: '(?:根据|结合|按照)(?:当地|实际)情况'
  replacement: '因地制宜'
  saves: 2
  risk: mid
- id: L3-004
  layer: L3
  pattern: '(?:一步一步|一步步)(?:地)?(?:推进|进行)'
  replacement: '循序渐进'
  saves: 3
  risk: mid
- id: L3-005
  layer: L3
  pattern: '(?:不断|持续|一直)(?:坚持|努力做)'
  replacement: '持之以恒'
  saves: 2
  risk: mid
# NOTE(review): the match ends in the transitive verb 对待 while the
# replacement is a standalone idiom, so a following object would be left
# without a verb — confirm this rewrite is intended.
- id: L3-006
  layer: L3
  pattern: '认真(?:仔细)?(?:地)?对待'
  replacement: '脚踏实地'
  saves: 1
  risk: mid
# ============================================================
# L4: Protocol layer — normalise section labels
# ============================================================
# Each rule rewrites an existing label (optionally prefixed with '#'
# characters) and its colon into a canonical "### <label>" header line.
# The colon may be fullwidth (:) or ASCII (:).
# NOTE(review): the patterns are not anchored to the start of a line, so
# a label word followed by a colon in the middle of a sentence is rewritten
# as well — confirm whether the consumer applies them per line.
- id: L4-001
  layer: L4
  pattern: "(?:#+\\s*)?(?:任务|目标|Task|TASK)\\s*[::]\\s*"
  replacement: "### 任务\n"
  saves: 0
  risk: low
- id: L4-002
  layer: L4
  pattern: "(?:#+\\s*)?(?:角色|身份|Role|ROLE)\\s*[::]\\s*"
  replacement: "### 角色\n"
  saves: 0
  risk: low
- id: L4-003
  layer: L4
  pattern: "(?:#+\\s*)?(?:输出|返回|输出格式|Output|OUTPUT)\\s*[::]\\s*"
  replacement: "### 输出\n"
  saves: 0
  risk: low
- id: L4-004
  layer: L4
  pattern: "(?:#+\\s*)?(?:约束|限制|要求|规则|Constraints|CONSTRAINTS)\\s*[::]\\s*"
  replacement: "### 约束\n"
  saves: 0
  risk: low