Spaces:
Runtime error
Runtime error
PPP committed on
Commit ·
1a91c20
1
Parent(s): f0690fd
feat: add reproducible evaluation pipeline and structured interaction logging
Browse files- .gitignore +6 -0
- README.md +530 -3
- app.py +330 -84
- evaluation/datasets/branch_divergence.json +84 -0
- evaluation/datasets/consistency.json +283 -0
- evaluation/datasets/intent_accuracy.json +201 -0
- evaluation/datasets/latency.json +79 -0
- evaluation/run_evaluations.py +567 -0
- nlu_engine.py +25 -22
- story_engine.py +174 -94
- telemetry.py +81 -0
- utils.py +25 -14
.gitignore
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
__pycache__/
|
| 2 |
+
*.py[cod]
|
| 3 |
+
|
| 4 |
+
.env
|
| 5 |
+
logs/
|
| 6 |
+
evaluation/results/
|
README.md
CHANGED
|
@@ -1,6 +1,6 @@
|
|
| 1 |
---
|
| 2 |
title: StoryWeaver
|
| 3 |
-
emoji:
|
| 4 |
colorFrom: red
|
| 5 |
colorTo: purple
|
| 6 |
sdk: gradio
|
|
@@ -8,7 +8,534 @@ sdk_version: 6.7.0
|
|
| 8 |
app_file: app.py
|
| 9 |
pinned: false
|
| 10 |
license: mit
|
| 11 |
-
short_description:
|
| 12 |
---
|
| 13 |
|
| 14 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
---
|
| 2 |
title: StoryWeaver
|
| 3 |
+
emoji: 📖
|
| 4 |
colorFrom: red
|
| 5 |
colorTo: purple
|
| 6 |
sdk: gradio
|
|
|
|
| 8 |
app_file: app.py
|
| 9 |
pinned: false
|
| 10 |
license: mit
|
| 11 |
+
short_description: Interactive NLP story engine with evaluation and logging
|
| 12 |
---
|
| 13 |
|
| 14 |
+
# StoryWeaver
|
| 15 |
+
|
| 16 |
+
StoryWeaver is an interactive text-adventure system built for our NLP course project. The repo is structured as an engineering project first and a demo second: it contains the playable app, the state-management core, evaluation scripts, and logging utilities needed for report writing and team collaboration.
|
| 17 |
+
|
| 18 |
+
This README is written for teammates who need to:
|
| 19 |
+
|
| 20 |
+
- understand how the system is organized
|
| 21 |
+
- run the app locally
|
| 22 |
+
- know where to change prompts, rules, or UI
|
| 23 |
+
- collect evaluation results for the report
|
| 24 |
+
- debug a bad interaction without reading the whole codebase first
|
| 25 |
+
|
| 26 |
+
## What This Repository Contains
|
| 27 |
+
|
| 28 |
+
At a high level, the project has five responsibilities:
|
| 29 |
+
|
| 30 |
+
1. parse player input into structured intent
|
| 31 |
+
2. keep the world state consistent across turns
|
| 32 |
+
3. generate the next story response and options
|
| 33 |
+
4. expose the system through a Gradio UI
|
| 34 |
+
5. export logs and run reproducible evaluation
|
| 35 |
+
|
| 36 |
+
This means the repo is not only a "game demo". It is also the evidence pipeline for the course deliverables.
|
| 37 |
+
|
| 38 |
+
## Quick Start
|
| 39 |
+
|
| 40 |
+
### 1. Install dependencies
|
| 41 |
+
|
| 42 |
+
```bash
|
| 43 |
+
pip install -r requirements.txt
|
| 44 |
+
```
|
| 45 |
+
|
| 46 |
+
### 2. Create `.env`
|
| 47 |
+
|
| 48 |
+
Create a `.env` file in the project root:
|
| 49 |
+
|
| 50 |
+
```env
|
| 51 |
+
QWEN_API_KEY=your_api_key_here
|
| 52 |
+
```
|
| 53 |
+
|
| 54 |
+
Optional:
|
| 55 |
+
|
| 56 |
+
```env
|
| 57 |
+
STORYWEAVER_LOG_DIR=logs/interactions
|
| 58 |
+
```
|
| 59 |
+
|
| 60 |
+
### 3. Run the app
|
| 61 |
+
|
| 62 |
+
```bash
|
| 63 |
+
python app.py
|
| 64 |
+
```
|
| 65 |
+
|
| 66 |
+
Default local URL:
|
| 67 |
+
|
| 68 |
+
- `http://localhost:7860`
|
| 69 |
+
|
| 70 |
+
### 4. Run evaluation
|
| 71 |
+
|
| 72 |
+
```bash
|
| 73 |
+
python evaluation/run_evaluations.py --task all --repeats 3
|
| 74 |
+
```
|
| 75 |
+
|
| 76 |
+
Useful variants:
|
| 77 |
+
|
| 78 |
+
```bash
|
| 79 |
+
python evaluation/run_evaluations.py --task intent
|
| 80 |
+
python evaluation/run_evaluations.py --task consistency
|
| 81 |
+
python evaluation/run_evaluations.py --task latency --repeats 5
|
| 82 |
+
python evaluation/run_evaluations.py --task branch
|
| 83 |
+
```
|
| 84 |
+
|
| 85 |
+
## Recommended Reading Order
|
| 86 |
+
|
| 87 |
+
If you are new to the repo, read files in this order:
|
| 88 |
+
|
| 89 |
+
1. [state_manager.py](./state_manager.py)
|
| 90 |
+
Why: this is the single source of truth for player state, world state, quests, items, consistency checks, and state updates.
|
| 91 |
+
2. [nlu_engine.py](./nlu_engine.py)
|
| 92 |
+
Why: this shows how raw player text becomes structured intent.
|
| 93 |
+
3. [story_engine.py](./story_engine.py)
|
| 94 |
+
Why: this is the main generation pipeline and fallback logic.
|
| 95 |
+
4. [app.py](./app.py)
|
| 96 |
+
Why: this connects the UI with the engines and now also writes interaction logs.
|
| 97 |
+
5. [evaluation/run_evaluations.py](./evaluation/run_evaluations.py)
|
| 98 |
+
Why: this shows how we measure the system for the report.
|
| 99 |
+
|
| 100 |
+
If you only have 10 minutes, start with:
|
| 101 |
+
|
| 102 |
+
- `GameState.pre_validate_action`
|
| 103 |
+
- `GameState.check_consistency`
|
| 104 |
+
- `GameState.apply_changes`
|
| 105 |
+
- `NLUEngine.parse_intent`
|
| 106 |
+
- `StoryEngine.generate_story_stream`
|
| 107 |
+
- `process_user_input` in [app.py](./app.py)
|
| 108 |
+
|
| 109 |
+
## Repository Map
|
| 110 |
+
|
| 111 |
+
```text
|
| 112 |
+
StoryWeaver/
|
| 113 |
+
|-- app.py
|
| 114 |
+
|-- nlu_engine.py
|
| 115 |
+
|-- story_engine.py
|
| 116 |
+
|-- state_manager.py
|
| 117 |
+
|-- telemetry.py
|
| 118 |
+
|-- utils.py
|
| 119 |
+
|-- requirements.txt
|
| 120 |
+
|-- evaluation/
|
| 121 |
+
| |-- run_evaluations.py
|
| 122 |
+
| |-- datasets/
|
| 123 |
+
| `-- results/
|
| 124 |
+
`-- logs/
|
| 125 |
+
`-- interactions/
|
| 126 |
+
```
|
| 127 |
+
|
| 128 |
+
Core responsibilities by file:
|
| 129 |
+
|
| 130 |
+
- [app.py](./app.py)
|
| 131 |
+
Gradio app, session lifecycle, UI callbacks, per-turn logging.
|
| 132 |
+
- [state_manager.py](./state_manager.py)
|
| 133 |
+
Player/world models, item registry, NPC registry, quest registry, state validation, consistency checks, change application.
|
| 134 |
+
- [nlu_engine.py](./nlu_engine.py)
|
| 135 |
+
Intent parsing. Uses LLM parsing when available and keyword fallback when not.
|
| 136 |
+
- [story_engine.py](./story_engine.py)
|
| 137 |
+
Opening generation, main story generation, option generation, stream handling, fallback handling, telemetry tags.
|
| 138 |
+
- [telemetry.py](./telemetry.py)
|
| 139 |
+
Session metadata and JSONL interaction log export.
|
| 140 |
+
- [utils.py](./utils.py)
|
| 141 |
+
API client setup, Qwen calls, JSON extraction, retry helpers.
|
| 142 |
+
- [evaluation/run_evaluations.py](./evaluation/run_evaluations.py)
|
| 143 |
+
Reproducible experiment runner for the report.
|
| 144 |
+
|
| 145 |
+
## System Architecture
|
| 146 |
+
|
| 147 |
+
The main runtime path is:
|
| 148 |
+
|
| 149 |
+
`Player Input -> NLU -> Validation -> Story Generation -> State Update -> UI Output -> Interaction Log`
|
| 150 |
+
|
| 151 |
+
There are two ideas that matter most in this codebase:
|
| 152 |
+
|
| 153 |
+
### 1. `GameState` is the source of truth
|
| 154 |
+
|
| 155 |
+
Almost everything meaningful lives in [state_manager.py](./state_manager.py):
|
| 156 |
+
|
| 157 |
+
- player stats
|
| 158 |
+
- location
|
| 159 |
+
- time and weather
|
| 160 |
+
- inventory and equipment
|
| 161 |
+
- quests
|
| 162 |
+
- NPC states
|
| 163 |
+
- event history
|
| 164 |
+
|
| 165 |
+
When changing gameplay, try to keep state logic here instead of scattering it across prompts and UI code.
|
| 166 |
+
|
| 167 |
+
### 2. The app is a coordinator, not the game logic
|
| 168 |
+
|
| 169 |
+
[app.py](./app.py) should mostly:
|
| 170 |
+
|
| 171 |
+
- receive user input
|
| 172 |
+
- call NLU
|
| 173 |
+
- call the story engine
|
| 174 |
+
- update the chat UI
|
| 175 |
+
- write telemetry logs
|
| 176 |
+
|
| 177 |
+
If a new feature changes game rules, it probably belongs in [state_manager.py](./state_manager.py) or [story_engine.py](./story_engine.py), not in the UI layer.
|
| 178 |
+
|
| 179 |
+
## Runtime Flow
|
| 180 |
+
|
| 181 |
+
### Text input flow
|
| 182 |
+
|
| 183 |
+
For normal text input, the path is:
|
| 184 |
+
|
| 185 |
+
1. `process_user_input` receives raw text from the UI
|
| 186 |
+
2. `NLUEngine.parse_intent` converts it into a structured intent dict
|
| 187 |
+
3. `GameState.pre_validate_action` blocks clearly invalid actions early
|
| 188 |
+
4. `StoryEngine.generate_story_stream` runs the main narrative pipeline
|
| 189 |
+
5. `GameState.check_consistency` and `apply_changes` update state
|
| 190 |
+
6. UI is refreshed with story text, options, and status panel
|
| 191 |
+
7. `_record_interaction_log` writes a JSONL record to disk
|
| 192 |
+
|
| 193 |
+
### Option click flow
|
| 194 |
+
|
| 195 |
+
Button clicks do not go through full free-text parsing. Instead:
|
| 196 |
+
|
| 197 |
+
1. the selected option is converted to an intent-like dict
|
| 198 |
+
2. the story engine processes it the same way as text input
|
| 199 |
+
3. the result is rendered and logged
|
| 200 |
+
|
| 201 |
+
This is useful because option interactions and free-text interactions now share the same evaluation and observability format.
|
| 202 |
+
|
| 203 |
+
## Main Modules in More Detail
|
| 204 |
+
|
| 205 |
+
### `state_manager.py`
|
| 206 |
+
|
| 207 |
+
This file defines:
|
| 208 |
+
|
| 209 |
+
- `PlayerState`
|
| 210 |
+
- `WorldState`
|
| 211 |
+
- `GameEvent`
|
| 212 |
+
- `GameState`
|
| 213 |
+
|
| 214 |
+
Important methods:
|
| 215 |
+
|
| 216 |
+
- `pre_validate_action`
|
| 217 |
+
Rejects obviously invalid actions before calling the model.
|
| 218 |
+
- `check_consistency`
|
| 219 |
+
Detects contradictions in proposed state changes.
|
| 220 |
+
- `apply_changes`
|
| 221 |
+
Applies state changes and returns a readable change log.
|
| 222 |
+
- `validate`
|
| 223 |
+
Makes sure the resulting state is legal.
|
| 224 |
+
- `to_prompt`
|
| 225 |
+
Serializes the current game state into prompt-ready text.
|
| 226 |
+
|
| 227 |
+
When to edit this file:
|
| 228 |
+
|
| 229 |
+
- adding new items, NPCs, quests, or locations
|
| 230 |
+
- adding deterministic rules
|
| 231 |
+
- improving consistency checks
|
| 232 |
+
- changing state serialization for prompts
|
| 233 |
+
|
| 234 |
+
### `nlu_engine.py`
|
| 235 |
+
|
| 236 |
+
This file is responsible for intent recognition.
|
| 237 |
+
|
| 238 |
+
Current behavior:
|
| 239 |
+
|
| 240 |
+
- try LLM parsing first
|
| 241 |
+
- fall back to keyword rules if parsing fails
|
| 242 |
+
- return a normalized intent dict with `parser_source`
|
| 243 |
+
|
| 244 |
+
Current intent labels include:
|
| 245 |
+
|
| 246 |
+
- `ATTACK`
|
| 247 |
+
- `TALK`
|
| 248 |
+
- `MOVE`
|
| 249 |
+
- `EXPLORE`
|
| 250 |
+
- `USE_ITEM`
|
| 251 |
+
- `TRADE`
|
| 252 |
+
- `EQUIP`
|
| 253 |
+
- `REST`
|
| 254 |
+
- `QUEST`
|
| 255 |
+
- `SKILL`
|
| 256 |
+
- `PICKUP`
|
| 257 |
+
- `FLEE`
|
| 258 |
+
- `CUSTOM`
|
| 259 |
+
|
| 260 |
+
When to edit this file:
|
| 261 |
+
|
| 262 |
+
- adding a new intent type
|
| 263 |
+
- improving keyword fallback
|
| 264 |
+
- adding target extraction logic
|
| 265 |
+
- improving low-confidence handling
|
| 266 |
+
|
| 267 |
+
### `story_engine.py`
|
| 268 |
+
|
| 269 |
+
This is the main generation module.
|
| 270 |
+
|
| 271 |
+
It currently handles:
|
| 272 |
+
|
| 273 |
+
- opening generation
|
| 274 |
+
- story generation for each turn
|
| 275 |
+
- streaming and non-streaming paths
|
| 276 |
+
- default/fallback outputs
|
| 277 |
+
- consistency-aware regeneration
|
| 278 |
+
- response telemetry such as fallback reason and engine mode
|
| 279 |
+
|
| 280 |
+
Important methods:
|
| 281 |
+
|
| 282 |
+
- `generate_opening_stream`
|
| 283 |
+
- `generate_story`
|
| 284 |
+
- `generate_story_stream`
|
| 285 |
+
- `process_option_selection_stream`
|
| 286 |
+
- `_fallback_response`
|
| 287 |
+
|
| 288 |
+
When to edit this file:
|
| 289 |
+
|
| 290 |
+
- changing prompts
|
| 291 |
+
- changing multi-stage generation logic
|
| 292 |
+
- changing fallback behavior
|
| 293 |
+
- adding generation-side telemetry
|
| 294 |
+
|
| 295 |
+
### `app.py`
|
| 296 |
+
|
| 297 |
+
This file is the UI entry point and interaction orchestrator.
|
| 298 |
+
|
| 299 |
+
Important responsibilities:
|
| 300 |
+
|
| 301 |
+
- create a new game session
|
| 302 |
+
- start and restart the app session
|
| 303 |
+
- process text input
|
| 304 |
+
- process option clicks
|
| 305 |
+
- update Gradio components
|
| 306 |
+
- write structured interaction logs
|
| 307 |
+
|
| 308 |
+
When to edit this file:
|
| 309 |
+
|
| 310 |
+
- changing UI flow
|
| 311 |
+
- adding debug panels
|
| 312 |
+
- changing how logs are written
|
| 313 |
+
- changing how outputs are displayed
|
| 314 |
+
|
| 315 |
+
### `telemetry.py`
|
| 316 |
+
|
| 317 |
+
This file handles structured log export.
|
| 318 |
+
|
| 319 |
+
It is intentionally simple and file-based:
|
| 320 |
+
|
| 321 |
+
- one session gets one JSONL file
|
| 322 |
+
- one turn becomes one JSON object line
|
| 323 |
+
|
| 324 |
+
This is useful for:
|
| 325 |
+
|
| 326 |
+
- report case studies
|
| 327 |
+
- measuring fallback rate
|
| 328 |
+
- debugging weird turns
|
| 329 |
+
- collecting examples for later evaluation
|
| 330 |
+
|
| 331 |
+
## Logging and Observability
|
| 332 |
+
|
| 333 |
+
Interaction logs are written under:
|
| 334 |
+
|
| 335 |
+
- [logs/interactions](./logs/interactions)
|
| 336 |
+
|
| 337 |
+
Each turn record includes at least:
|
| 338 |
+
|
| 339 |
+
- input source
|
| 340 |
+
- user input
|
| 341 |
+
- NLU result
|
| 342 |
+
- latency
|
| 343 |
+
- fallback metadata
|
| 344 |
+
- state changes
|
| 345 |
+
- consistency issues
|
| 346 |
+
- final output text
|
| 347 |
+
- post-turn state snapshot
|
| 348 |
+
|
| 349 |
+
Example shape:
|
| 350 |
+
|
| 351 |
+
```json
|
| 352 |
+
{
|
| 353 |
+
"timestamp": "2026-03-14T18:55:00",
|
| 354 |
+
"session_id": "sw-20260314-185500-ab12cd34",
|
| 355 |
+
"turn_index": 3,
|
| 356 |
+
"input_source": "text_input",
|
| 357 |
+
"user_input": "和村长老伯谈谈最近森林里的怪事",
|
| 358 |
+
"nlu_result": {
|
| 359 |
+
"intent": "TALK",
|
| 360 |
+
"target": "村长老伯",
|
| 361 |
+
"parser_source": "llm"
|
| 362 |
+
},
|
| 363 |
+
"latency_ms": 842.13,
|
| 364 |
+
"used_fallback": false,
|
| 365 |
+
"state_changes": {},
|
| 366 |
+
"output_text": "...",
|
| 367 |
+
"post_turn_snapshot": {
|
| 368 |
+
"location": "村庄广场"
|
| 369 |
+
}
|
| 370 |
+
}
|
| 371 |
+
```
|
| 372 |
+
|
| 373 |
+
If you need to debug a bad interaction, the fastest path is:
|
| 374 |
+
|
| 375 |
+
1. check the log file
|
| 376 |
+
2. inspect `nlu_result`
|
| 377 |
+
3. inspect `telemetry.used_fallback`
|
| 378 |
+
4. inspect `state_changes`
|
| 379 |
+
5. inspect the post-turn snapshot
|
| 380 |
+
|
| 381 |
+
## Evaluation Pipeline
|
| 382 |
+
|
| 383 |
+
Evaluation entry point:
|
| 384 |
+
|
| 385 |
+
- [evaluation/run_evaluations.py](./evaluation/run_evaluations.py)
|
| 386 |
+
|
| 387 |
+
Datasets:
|
| 388 |
+
|
| 389 |
+
- [evaluation/datasets/intent_accuracy.json](./evaluation/datasets/intent_accuracy.json)
|
| 390 |
+
- [evaluation/datasets/consistency.json](./evaluation/datasets/consistency.json)
|
| 391 |
+
- [evaluation/datasets/latency.json](./evaluation/datasets/latency.json)
|
| 392 |
+
- [evaluation/datasets/branch_divergence.json](./evaluation/datasets/branch_divergence.json)
|
| 393 |
+
|
| 394 |
+
Results:
|
| 395 |
+
|
| 396 |
+
- [evaluation/results](./evaluation/results)
|
| 397 |
+
|
| 398 |
+
### What each task measures
|
| 399 |
+
|
| 400 |
+
#### Intent
|
| 401 |
+
|
| 402 |
+
- labeled input -> predicted intent
|
| 403 |
+
- optional target matching
|
| 404 |
+
- parser source breakdown
|
| 405 |
+
- per-example latency
|
| 406 |
+
|
| 407 |
+
#### Consistency
|
| 408 |
+
|
| 409 |
+
- action guard correctness via `pre_validate_action`
|
| 410 |
+
- contradiction detection via `check_consistency`
|
| 411 |
+
|
| 412 |
+
#### Latency
|
| 413 |
+
|
| 414 |
+
- NLU latency
|
| 415 |
+
- generation latency
|
| 416 |
+
- total latency
|
| 417 |
+
- fallback rate
|
| 418 |
+
|
| 419 |
+
#### Branch divergence
|
| 420 |
+
|
| 421 |
+
- same start state, different choices
|
| 422 |
+
- compare resulting story text
|
| 423 |
+
- compare option differences
|
| 424 |
+
- compare state snapshot differences
|
| 425 |
+
|
| 426 |
+
## Common Development Tasks
|
| 427 |
+
|
| 428 |
+
### Add a new intent
|
| 429 |
+
|
| 430 |
+
You will usually need to touch:
|
| 431 |
+
|
| 432 |
+
- [nlu_engine.py](./nlu_engine.py)
|
| 433 |
+
- [state_manager.py](./state_manager.py)
|
| 434 |
+
- [story_engine.py](./story_engine.py)
|
| 435 |
+
- [evaluation/datasets/intent_accuracy.json](./evaluation/datasets/intent_accuracy.json)
|
| 436 |
+
|
| 437 |
+
Suggested checklist:
|
| 438 |
+
|
| 439 |
+
1. add the label to the NLU logic
|
| 440 |
+
2. decide whether it needs pre-validation
|
| 441 |
+
3. make sure story prompts know how to handle it
|
| 442 |
+
4. add at least a few evaluation examples
|
| 443 |
+
|
| 444 |
+
### Add a new location, NPC, quest, or item
|
| 445 |
+
|
| 446 |
+
Most of the time you only need:
|
| 447 |
+
|
| 448 |
+
- [state_manager.py](./state_manager.py)
|
| 449 |
+
|
| 450 |
+
That file contains the initial world setup and registry-style data.
|
| 451 |
+
|
| 452 |
+
### Add more evaluation cases
|
| 453 |
+
|
| 454 |
+
Edit files under:
|
| 455 |
+
|
| 456 |
+
- [evaluation/datasets](./evaluation/datasets)
|
| 457 |
+
|
| 458 |
+
This is the easiest way to improve the report without changing runtime logic.
|
| 459 |
+
|
| 460 |
+
### Investigate a strange game turn
|
| 461 |
+
|
| 462 |
+
Check in this order:
|
| 463 |
+
|
| 464 |
+
1. interaction log under `logs/interactions`
|
| 465 |
+
2. `parser_source` in the NLU result
|
| 466 |
+
3. `telemetry` in the final story result
|
| 467 |
+
4. whether `pre_validate_action` rejected or allowed the turn
|
| 468 |
+
5. whether `check_consistency` flagged anything
|
| 469 |
+
|
| 470 |
+
### Change UI behavior without touching gameplay
|
| 471 |
+
|
| 472 |
+
Edit:
|
| 473 |
+
|
| 474 |
+
- [app.py](./app.py)
|
| 475 |
+
|
| 476 |
+
Try not to put game rules in the UI layer.
|
| 477 |
+
|
| 478 |
+
## Environment Notes
|
| 479 |
+
|
| 480 |
+
### If `QWEN_API_KEY` is missing
|
| 481 |
+
|
| 482 |
+
- warning logs will appear
|
| 483 |
+
- some paths will still run through fallback logic
|
| 484 |
+
- evaluation can still execute, but model-quality conclusions are not meaningful
|
| 485 |
+
|
| 486 |
+
### If `openai` is not installed
|
| 487 |
+
|
| 488 |
+
- the repo can still import in some cases because the client is lazily initialized
|
| 489 |
+
- full Qwen generation will not work
|
| 490 |
+
- evaluation scripts will mostly reflect fallback behavior
|
| 491 |
+
|
| 492 |
+
### If `gradio` is not installed
|
| 493 |
+
|
| 494 |
+
- the app cannot launch
|
| 495 |
+
- offline evaluation scripts can still be useful
|
| 496 |
+
|
| 497 |
+
## Current Known Limitations
|
| 498 |
+
|
| 499 |
+
These are the main gaps we still know about:
|
| 500 |
+
|
| 501 |
+
- some item and equipment effects are stored as metadata but not fully executed as deterministic rules
|
| 502 |
+
- combat and trade are still more prompt-driven than rule-driven
|
| 503 |
+
- branch divergence is much more meaningful with a real model than in fallback-only mode
|
| 504 |
+
- evaluation quality depends on whether the real model environment is available
|
| 505 |
+
|
| 506 |
+
## Suggested Team Workflow
|
| 507 |
+
|
| 508 |
+
If multiple teammates are working in parallel, this split is usually clean:
|
| 509 |
+
|
| 510 |
+
- gameplay/state teammate
|
| 511 |
+
Focus on [state_manager.py](./state_manager.py)
|
| 512 |
+
- prompt/generation teammate
|
| 513 |
+
Focus on [story_engine.py](./story_engine.py)
|
| 514 |
+
- NLU/evaluation teammate
|
| 515 |
+
Focus on [nlu_engine.py](./nlu_engine.py) and [evaluation](./evaluation)
|
| 516 |
+
- UI/demo teammate
|
| 517 |
+
Focus on [app.py](./app.py)
|
| 518 |
+
- report teammate
|
| 519 |
+
Focus on `evaluation/results`, `logs/interactions`, and case-study collection
|
| 520 |
+
|
| 521 |
+
## What To Use in the Final Report
|
| 522 |
+
|
| 523 |
+
For the course report, the most useful artifacts from this repo are:
|
| 524 |
+
|
| 525 |
+
- evaluation JSON outputs under `evaluation/results`
|
| 526 |
+
- interaction logs under `logs/interactions`
|
| 527 |
+
- dataset files under `evaluation/datasets`
|
| 528 |
+
- readable state transitions from `change_log`
|
| 529 |
+
- fallback metadata from `telemetry`
|
| 530 |
+
|
| 531 |
+
These can directly support:
|
| 532 |
+
|
| 533 |
+
- experiment setup
|
| 534 |
+
- metric definition
|
| 535 |
+
- result tables
|
| 536 |
+
- success cases
|
| 537 |
+
- failure case analysis
|
| 538 |
+
|
| 539 |
+
## License
|
| 540 |
+
|
| 541 |
+
MIT
|
app.py
CHANGED
|
@@ -13,14 +13,17 @@ app.py - StoryWeaver Gradio 交互界面
|
|
| 13 |
Gradio UI ← 状态管理器(校验 + 更新) ← 叙事引擎(文本 + 选项)
|
| 14 |
"""
|
| 15 |
|
| 16 |
-
import
|
| 17 |
-
import
|
| 18 |
-
import
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
from
|
| 23 |
-
from
|
|
|
|
|
|
|
|
|
|
| 24 |
|
| 25 |
# ============================================================
|
| 26 |
# 全局游戏实例(每个会话独立)
|
|
@@ -30,18 +33,133 @@ from utils import logger
|
|
| 30 |
# 这里先定义工厂函数
|
| 31 |
|
| 32 |
|
| 33 |
-
def create_new_game(player_name: str = "旅人") -> dict:
|
| 34 |
-
"""创建新游戏实例,返回包含所有引擎的字典"""
|
| 35 |
-
game_state = GameState(player_name=player_name)
|
| 36 |
-
nlu = NLUEngine(game_state)
|
| 37 |
-
story = StoryEngine(game_state)
|
| 38 |
return {
|
| 39 |
-
"game_state": game_state,
|
| 40 |
-
"nlu": nlu,
|
| 41 |
-
"story": story,
|
| 42 |
-
"current_options": [],
|
| 43 |
-
"started": False,
|
| 44 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 45 |
|
| 46 |
|
| 47 |
def restart_game() -> tuple:
|
|
@@ -96,7 +214,8 @@ def start_game(player_name: str, game_session: dict):
|
|
| 96 |
)
|
| 97 |
|
| 98 |
# 流式生成开场(选项仅在流结束后从 final 事件中提取,流式期间不解析选项)
|
| 99 |
-
|
|
|
|
| 100 |
final_result = None
|
| 101 |
|
| 102 |
for update in game_session["story"].generate_opening_stream():
|
|
@@ -110,7 +229,9 @@ def start_game(player_name: str, game_session: dict):
|
|
| 110 |
gr.update(interactive=False),
|
| 111 |
)
|
| 112 |
elif update["type"] == "final":
|
| 113 |
-
final_result = update
|
|
|
|
|
|
|
| 114 |
|
| 115 |
# ★ 只在数据流完全结束后,从 final_result 中提取选项
|
| 116 |
if final_result:
|
|
@@ -123,13 +244,36 @@ def start_game(player_name: str, game_session: dict):
|
|
| 123 |
options = _ensure_min_options(options, 3)
|
| 124 |
|
| 125 |
# 最终 yield:显示完整文本 + 选项 + 启用按钮
|
| 126 |
-
game_session["current_options"] = options
|
| 127 |
-
options_text = _format_options(options)
|
| 128 |
-
full_message = f"{story_text}\n\n{options_text}"
|
| 129 |
-
|
| 130 |
-
|
| 131 |
-
|
| 132 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 133 |
|
| 134 |
yield (
|
| 135 |
chat_history, status_text,
|
|
@@ -167,9 +311,10 @@ def process_user_input(user_input: str, chat_history: list, game_session: dict):
|
|
| 167 |
)
|
| 168 |
return
|
| 169 |
|
| 170 |
-
gs: GameState = game_session["game_state"]
|
| 171 |
-
nlu: NLUEngine = game_session["nlu"]
|
| 172 |
-
story: StoryEngine = game_session["story"]
|
|
|
|
| 173 |
|
| 174 |
# 检查游戏是否已结束
|
| 175 |
if gs.is_game_over():
|
|
@@ -185,8 +330,10 @@ def process_user_input(user_input: str, chat_history: list, game_session: dict):
|
|
| 185 |
)
|
| 186 |
return
|
| 187 |
|
| 188 |
-
# 1. NLU 解析
|
| 189 |
-
|
|
|
|
|
|
|
| 190 |
|
| 191 |
# 1.5 预校验:立即驳回违反一致性的操作(不调用 LLM,不消耗回合)
|
| 192 |
is_valid, rejection_msg = gs.pre_validate_action(intent)
|
|
@@ -199,8 +346,31 @@ def process_user_input(user_input: str, chat_history: list, game_session: dict):
|
|
| 199 |
f"⚠️ **行动被驳回**:{rejection_msg}\n\n"
|
| 200 |
f"请重新选择行动,或输入其他指令。\n\n{options_text}"
|
| 201 |
)
|
| 202 |
-
chat_history.append({"role": "assistant", "content": rejection_content})
|
| 203 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 204 |
yield (
|
| 205 |
chat_history,
|
| 206 |
_format_status_panel(gs),
|
|
@@ -223,18 +393,21 @@ def process_user_input(user_input: str, chat_history: list, game_session: dict):
|
|
| 223 |
)
|
| 224 |
|
| 225 |
# 3. 流式生成故事
|
| 226 |
-
|
| 227 |
-
|
| 228 |
-
|
| 229 |
-
|
| 230 |
-
|
| 231 |
-
|
|
|
|
| 232 |
_format_status_panel(gs),
|
| 233 |
loading[0], loading[1], loading[2],
|
| 234 |
game_session,
|
| 235 |
)
|
| 236 |
-
elif update["type"] == "final":
|
| 237 |
-
final_result = update
|
|
|
|
|
|
|
| 238 |
|
| 239 |
# 4. 最终更新:完整文本 + 状态变化 + 选项 + 按钮
|
| 240 |
if final_result:
|
|
@@ -256,13 +429,24 @@ def process_user_input(user_input: str, chat_history: list, game_session: dict):
|
|
| 256 |
|
| 257 |
options_text = _format_options(options)
|
| 258 |
full_message = f"{final_result['story_text']}{log_text}{issues_text}\n\n{options_text}"
|
| 259 |
-
chat_history[-1]["content"] = full_message
|
| 260 |
-
|
| 261 |
-
status_text = _format_status_panel(gs)
|
| 262 |
-
btn_updates = _get_button_updates(options)
|
| 263 |
-
|
| 264 |
-
|
| 265 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 266 |
status_text,
|
| 267 |
btn_updates[0], btn_updates[1], btn_updates[2],
|
| 268 |
game_session,
|
|
@@ -272,14 +456,37 @@ def process_user_input(user_input: str, chat_history: list, game_session: dict):
|
|
| 272 |
logger.warning("流式生成未产生 final 事件,使用兜底文本")
|
| 273 |
fallback_text = "你环顾四周,思考着接下来该做什么..."
|
| 274 |
fallback_options = _ensure_min_options([], 3)
|
| 275 |
-
game_session["current_options"] = fallback_options
|
| 276 |
-
|
| 277 |
-
options_text = _format_options(fallback_options)
|
| 278 |
-
full_message = f"{fallback_text}\n\n{options_text}"
|
| 279 |
-
|
| 280 |
-
|
| 281 |
-
|
| 282 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 283 |
|
| 284 |
yield (
|
| 285 |
chat_history,
|
|
@@ -318,9 +525,11 @@ def process_option_click(option_idx: int, chat_history: list, game_session: dict
|
|
| 318 |
)
|
| 319 |
return
|
| 320 |
|
| 321 |
-
selected_option = options[option_idx]
|
| 322 |
-
gs: GameState = game_session["game_state"]
|
| 323 |
-
story: StoryEngine = game_session["story"]
|
|
|
|
|
|
|
| 324 |
|
| 325 |
# 检查特殊选项:重新开始
|
| 326 |
if selected_option.get("action_type") == "RESTART":
|
|
@@ -404,18 +613,21 @@ def process_option_click(option_idx: int, chat_history: list, game_session: dict
|
|
| 404 |
game_session,
|
| 405 |
)
|
| 406 |
|
| 407 |
-
|
| 408 |
-
|
| 409 |
-
|
| 410 |
-
|
| 411 |
-
|
| 412 |
-
|
|
|
|
| 413 |
_format_status_panel(gs),
|
| 414 |
loading[0], loading[1], loading[2],
|
| 415 |
game_session,
|
| 416 |
)
|
| 417 |
-
elif update["type"] == "final":
|
| 418 |
-
final_result = update
|
|
|
|
|
|
|
| 419 |
|
| 420 |
if final_result:
|
| 421 |
# ★ 安全兜底:强制确保恰好 3 个选项
|
|
@@ -430,13 +642,24 @@ def process_option_click(option_idx: int, chat_history: list, game_session: dict
|
|
| 430 |
|
| 431 |
options_text = _format_options(options)
|
| 432 |
full_message = f"{final_result['story_text']}{log_text}\n\n{options_text}"
|
| 433 |
-
chat_history[-1]["content"] = full_message
|
| 434 |
-
|
| 435 |
-
status_text = _format_status_panel(gs)
|
| 436 |
-
btn_updates = _get_button_updates(options)
|
| 437 |
-
|
| 438 |
-
|
| 439 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 440 |
btn_updates[0], btn_updates[1], btn_updates[2],
|
| 441 |
game_session,
|
| 442 |
)
|
|
@@ -445,14 +668,37 @@ def process_option_click(option_idx: int, chat_history: list, game_session: dict
|
|
| 445 |
logger.warning("[选项点击] 流式生成未产生 final 事件,使用兜底文本")
|
| 446 |
fallback_text = "你环顾四周,思考着接下来该做什么..."
|
| 447 |
fallback_options = _ensure_min_options([], 3)
|
| 448 |
-
game_session["current_options"] = fallback_options
|
| 449 |
-
|
| 450 |
-
options_text = _format_options(fallback_options)
|
| 451 |
-
full_message = f"{fallback_text}\n\n{options_text}"
|
| 452 |
-
|
| 453 |
-
|
| 454 |
-
|
| 455 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 456 |
|
| 457 |
yield (
|
| 458 |
chat_history, status_text,
|
|
|
|
| 13 |
Gradio UI ← 状态管理器(校验 + 更新) ← 叙事引擎(文本 + 选项)
|
| 14 |
"""
|
| 15 |
|
| 16 |
+
import copy
|
| 17 |
+
import json
|
| 18 |
+
import logging
|
| 19 |
+
from time import perf_counter
|
| 20 |
+
import gradio as gr
|
| 21 |
+
|
| 22 |
+
from state_manager import GameState
|
| 23 |
+
from nlu_engine import NLUEngine
|
| 24 |
+
from story_engine import StoryEngine
|
| 25 |
+
from telemetry import append_turn_log, create_session_metadata
|
| 26 |
+
from utils import logger
|
| 27 |
|
| 28 |
# ============================================================
|
| 29 |
# 全局游戏实例(每个会话独立)
|
|
|
|
| 33 |
# 这里先定义工厂函数
|
| 34 |
|
| 35 |
|
| 36 |
+
def create_new_game(player_name: str = "旅人") -> dict:
    """Build a fresh game session for one player.

    A new ``GameState`` is created for ``player_name`` and that same state
    object is handed to a new ``NLUEngine`` and a new ``StoryEngine``.

    Args:
        player_name: Name passed to ``GameState``.

    Returns:
        A session dict with the keys ``game_state``, ``nlu``, ``story``,
        ``current_options`` (an empty list) and ``started`` (``False``),
        merged with whatever ``create_session_metadata()`` returns.
    """
    state = GameState(player_name=player_name)
    session = {
        "game_state": state,
        "nlu": NLUEngine(state),
        "story": StoryEngine(state),
        "current_options": [],
        "started": False,
    }
    # Metadata is merged last, so its keys win on any name clash.
    session.update(create_session_metadata())
    return session
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
def _json_safe(value):
|
| 52 |
+
"""Convert nested values into JSON-serializable data for logs."""
|
| 53 |
+
if value is None or isinstance(value, (str, int, float, bool)):
|
| 54 |
+
return value
|
| 55 |
+
if isinstance(value, dict):
|
| 56 |
+
return {str(key): _json_safe(val) for key, val in value.items()}
|
| 57 |
+
if isinstance(value, (list, tuple, set)):
|
| 58 |
+
return [_json_safe(item) for item in value]
|
| 59 |
+
if hasattr(value, "model_dump"):
|
| 60 |
+
return _json_safe(value.model_dump())
|
| 61 |
+
return str(value)
|
| 62 |
+
|
| 63 |
+
|
| 64 |
+
def _build_state_snapshot(gs: GameState) -> dict:
    """Capture a compact view of the game state for evaluation logs.

    Only quests whose ``status`` equals ``"active"`` are listed. The player's
    inventory and skills are copied into new lists and the equipment mapping
    is deep-copied, so the snapshot does not alias those containers on
    ``gs.player``. Status effects are reduced to their ``name``.

    Args:
        gs: Game state to read from; it is only read here.

    Returns:
        A dict with turn/mode/location/scene/day/time/weather fields, a
        nested ``player`` dict, the ``active_quests`` list and the current
        length of ``gs.event_log``.
    """
    world = gs.world
    active_quests = [
        {
            "quest_id": quest.quest_id,
            "title": quest.title,
            "status": quest.status,
            "objectives": _json_safe(quest.objectives),
        }
        for quest in world.quests.values()
        if quest.status == "active"
    ]

    player = gs.player
    snapshot = {
        "turn": gs.turn,
        "game_mode": gs.game_mode,
        "location": player.location,
        "scene": world.current_scene,
        "day": world.day_count,
        "time_of_day": world.time_of_day,
        "weather": world.weather,
    }
    snapshot["player"] = {
        "name": player.name,
        "level": player.level,
        "hp": player.hp,
        "max_hp": player.max_hp,
        "mp": player.mp,
        "max_mp": player.max_mp,
        "gold": player.gold,
        "morale": player.morale,
        "sanity": player.sanity,
        "hunger": player.hunger,
        "karma": player.karma,
        "inventory": list(player.inventory),
        "equipment": copy.deepcopy(player.equipment),
        "skills": list(player.skills),
        "status_effects": [effect.name for effect in player.status_effects],
    }
    snapshot["active_quests"] = active_quests
    snapshot["event_log_size"] = len(gs.event_log)
    return snapshot
|
| 106 |
+
|
| 107 |
+
|
| 108 |
+
def _record_interaction_log(
|
| 109 |
+
game_session: dict,
|
| 110 |
+
*,
|
| 111 |
+
input_source: str,
|
| 112 |
+
user_input: str,
|
| 113 |
+
intent_result: dict | None,
|
| 114 |
+
output_text: str,
|
| 115 |
+
latency_ms: float,
|
| 116 |
+
nlu_latency_ms: float | None = None,
|
| 117 |
+
generation_latency_ms: float | None = None,
|
| 118 |
+
final_result: dict | None = None,
|
| 119 |
+
selected_option: dict | None = None,
|
| 120 |
+
):
|
| 121 |
+
"""Append a structured interaction log without affecting gameplay."""
|
| 122 |
+
if not game_session or "game_state" not in game_session:
|
| 123 |
+
return
|
| 124 |
+
|
| 125 |
+
final_result = final_result or {}
|
| 126 |
+
telemetry = _json_safe(final_result.get("telemetry", {})) or {}
|
| 127 |
+
record = {
|
| 128 |
+
"input_source": input_source,
|
| 129 |
+
"user_input": user_input,
|
| 130 |
+
"selected_option": _json_safe(selected_option),
|
| 131 |
+
"nlu_result": _json_safe(intent_result),
|
| 132 |
+
"latency_ms": round(latency_ms, 2),
|
| 133 |
+
"nlu_latency_ms": None if nlu_latency_ms is None else round(nlu_latency_ms, 2),
|
| 134 |
+
"generation_latency_ms": None if generation_latency_ms is None else round(generation_latency_ms, 2),
|
| 135 |
+
"used_fallback": bool(telemetry.get("used_fallback", False)),
|
| 136 |
+
"fallback_reason": telemetry.get("fallback_reason"),
|
| 137 |
+
"engine_mode": telemetry.get("engine_mode"),
|
| 138 |
+
"state_changes": _json_safe(final_result.get("state_changes", {})),
|
| 139 |
+
"change_log": _json_safe(final_result.get("change_log", [])),
|
| 140 |
+
"consistency_issues": _json_safe(final_result.get("consistency_issues", [])),
|
| 141 |
+
"output_text": output_text,
|
| 142 |
+
"story_text": final_result.get("story_text"),
|
| 143 |
+
"options": _json_safe(final_result.get("options", game_session.get("current_options", []))),
|
| 144 |
+
"post_turn_snapshot": _build_state_snapshot(game_session["game_state"]),
|
| 145 |
+
}
|
| 146 |
+
|
| 147 |
+
try:
|
| 148 |
+
append_turn_log(game_session, record)
|
| 149 |
+
except Exception as exc:
|
| 150 |
+
logger.warning(f"Failed to append interaction log: {exc}")
|
| 151 |
+
|
| 152 |
+
|
| 153 |
+
def _build_option_intent(selected_option: dict) -> dict:
|
| 154 |
+
"""Represent button clicks in the same schema as free-text NLU output."""
|
| 155 |
+
option_text = selected_option.get("text", "")
|
| 156 |
+
return {
|
| 157 |
+
"intent": selected_option.get("action_type", "EXPLORE"),
|
| 158 |
+
"target": None,
|
| 159 |
+
"details": option_text,
|
| 160 |
+
"raw_input": option_text,
|
| 161 |
+
"parser_source": "option_click",
|
| 162 |
+
}
|
| 163 |
|
| 164 |
|
| 165 |
def restart_game() -> tuple:
|
|
|
|
| 214 |
)
|
| 215 |
|
| 216 |
# 流式生成开场(选项仅在流结束后从 final 事件中提取,流式期间不解析选项)
|
| 217 |
+
turn_started = perf_counter()
|
| 218 |
+
story_text = ""
|
| 219 |
final_result = None
|
| 220 |
|
| 221 |
for update in game_session["story"].generate_opening_stream():
|
|
|
|
| 229 |
gr.update(interactive=False),
|
| 230 |
)
|
| 231 |
elif update["type"] == "final":
|
| 232 |
+
final_result = update
|
| 233 |
+
|
| 234 |
+
generation_latency_ms = (perf_counter() - turn_started) * 1000
|
| 235 |
|
| 236 |
# ★ 只在数据流完全结束后,从 final_result 中提取选项
|
| 237 |
if final_result:
|
|
|
|
| 244 |
options = _ensure_min_options(options, 3)
|
| 245 |
|
| 246 |
# 最终 yield:显示完整文本 + 选项 + 启用按钮
|
| 247 |
+
game_session["current_options"] = options
|
| 248 |
+
options_text = _format_options(options)
|
| 249 |
+
full_message = f"{story_text}\n\n{options_text}"
|
| 250 |
+
if not final_result:
|
| 251 |
+
final_result = {
|
| 252 |
+
"story_text": story_text,
|
| 253 |
+
"options": options,
|
| 254 |
+
"state_changes": {},
|
| 255 |
+
"change_log": [],
|
| 256 |
+
"consistency_issues": [],
|
| 257 |
+
"telemetry": {
|
| 258 |
+
"engine_mode": "opening_app",
|
| 259 |
+
"used_fallback": True,
|
| 260 |
+
"fallback_reason": "missing_final_event",
|
| 261 |
+
},
|
| 262 |
+
}
|
| 263 |
+
|
| 264 |
+
chat_history[-1]["content"] = full_message
|
| 265 |
+
status_text = _format_status_panel(game_session["game_state"])
|
| 266 |
+
btn_updates = _get_button_updates(options)
|
| 267 |
+
_record_interaction_log(
|
| 268 |
+
game_session,
|
| 269 |
+
input_source="system_opening",
|
| 270 |
+
user_input="",
|
| 271 |
+
intent_result=None,
|
| 272 |
+
output_text=full_message,
|
| 273 |
+
latency_ms=generation_latency_ms,
|
| 274 |
+
generation_latency_ms=generation_latency_ms,
|
| 275 |
+
final_result=final_result,
|
| 276 |
+
)
|
| 277 |
|
| 278 |
yield (
|
| 279 |
chat_history, status_text,
|
|
|
|
| 311 |
)
|
| 312 |
return
|
| 313 |
|
| 314 |
+
gs: GameState = game_session["game_state"]
|
| 315 |
+
nlu: NLUEngine = game_session["nlu"]
|
| 316 |
+
story: StoryEngine = game_session["story"]
|
| 317 |
+
turn_started = perf_counter()
|
| 318 |
|
| 319 |
# 检查游戏是否已结束
|
| 320 |
if gs.is_game_over():
|
|
|
|
| 330 |
)
|
| 331 |
return
|
| 332 |
|
| 333 |
+
# 1. NLU 解析
|
| 334 |
+
nlu_started = perf_counter()
|
| 335 |
+
intent = nlu.parse_intent(user_input)
|
| 336 |
+
nlu_latency_ms = (perf_counter() - nlu_started) * 1000
|
| 337 |
|
| 338 |
# 1.5 预校验:立即驳回违反一致性的操作(不调用 LLM,不消耗回合)
|
| 339 |
is_valid, rejection_msg = gs.pre_validate_action(intent)
|
|
|
|
| 346 |
f"⚠️ **行动被驳回**:{rejection_msg}\n\n"
|
| 347 |
f"请重新选择行动,或输入其他指令。\n\n{options_text}"
|
| 348 |
)
|
| 349 |
+
chat_history.append({"role": "assistant", "content": rejection_content})
|
| 350 |
+
rejection_result = {
|
| 351 |
+
"story_text": rejection_content,
|
| 352 |
+
"options": options,
|
| 353 |
+
"state_changes": {},
|
| 354 |
+
"change_log": [],
|
| 355 |
+
"consistency_issues": [],
|
| 356 |
+
"telemetry": {
|
| 357 |
+
"engine_mode": "pre_validation",
|
| 358 |
+
"used_fallback": False,
|
| 359 |
+
"fallback_reason": None,
|
| 360 |
+
},
|
| 361 |
+
}
|
| 362 |
+
_record_interaction_log(
|
| 363 |
+
game_session,
|
| 364 |
+
input_source="text_input",
|
| 365 |
+
user_input=user_input,
|
| 366 |
+
intent_result=intent,
|
| 367 |
+
output_text=rejection_content,
|
| 368 |
+
latency_ms=(perf_counter() - turn_started) * 1000,
|
| 369 |
+
nlu_latency_ms=nlu_latency_ms,
|
| 370 |
+
generation_latency_ms=0.0,
|
| 371 |
+
final_result=rejection_result,
|
| 372 |
+
)
|
| 373 |
+
btn_updates = _get_button_updates(options)
|
| 374 |
yield (
|
| 375 |
chat_history,
|
| 376 |
_format_status_panel(gs),
|
|
|
|
| 393 |
)
|
| 394 |
|
| 395 |
# 3. 流式生成故事
|
| 396 |
+
generation_started = perf_counter()
|
| 397 |
+
final_result = None
|
| 398 |
+
for update in story.generate_story_stream(intent):
|
| 399 |
+
if update["type"] == "story_chunk":
|
| 400 |
+
chat_history[-1]["content"] = update["text"]
|
| 401 |
+
yield (
|
| 402 |
+
chat_history,
|
| 403 |
_format_status_panel(gs),
|
| 404 |
loading[0], loading[1], loading[2],
|
| 405 |
game_session,
|
| 406 |
)
|
| 407 |
+
elif update["type"] == "final":
|
| 408 |
+
final_result = update
|
| 409 |
+
|
| 410 |
+
generation_latency_ms = (perf_counter() - generation_started) * 1000
|
| 411 |
|
| 412 |
# 4. 最终更新:完整文本 + 状态变化 + 选项 + 按钮
|
| 413 |
if final_result:
|
|
|
|
| 429 |
|
| 430 |
options_text = _format_options(options)
|
| 431 |
full_message = f"{final_result['story_text']}{log_text}{issues_text}\n\n{options_text}"
|
| 432 |
+
chat_history[-1]["content"] = full_message
|
| 433 |
+
|
| 434 |
+
status_text = _format_status_panel(gs)
|
| 435 |
+
btn_updates = _get_button_updates(options)
|
| 436 |
+
_record_interaction_log(
|
| 437 |
+
game_session,
|
| 438 |
+
input_source="text_input",
|
| 439 |
+
user_input=user_input,
|
| 440 |
+
intent_result=intent,
|
| 441 |
+
output_text=full_message,
|
| 442 |
+
latency_ms=(perf_counter() - turn_started) * 1000,
|
| 443 |
+
nlu_latency_ms=nlu_latency_ms,
|
| 444 |
+
generation_latency_ms=generation_latency_ms,
|
| 445 |
+
final_result=final_result,
|
| 446 |
+
)
|
| 447 |
+
|
| 448 |
+
yield (
|
| 449 |
+
chat_history,
|
| 450 |
status_text,
|
| 451 |
btn_updates[0], btn_updates[1], btn_updates[2],
|
| 452 |
game_session,
|
|
|
|
| 456 |
logger.warning("流式生成未产生 final 事件,使用兜底文本")
|
| 457 |
fallback_text = "你环顾四周,思考着接下来该做什么..."
|
| 458 |
fallback_options = _ensure_min_options([], 3)
|
| 459 |
+
game_session["current_options"] = fallback_options
|
| 460 |
+
|
| 461 |
+
options_text = _format_options(fallback_options)
|
| 462 |
+
full_message = f"{fallback_text}\n\n{options_text}"
|
| 463 |
+
fallback_result = {
|
| 464 |
+
"story_text": fallback_text,
|
| 465 |
+
"options": fallback_options,
|
| 466 |
+
"state_changes": {},
|
| 467 |
+
"change_log": [],
|
| 468 |
+
"consistency_issues": [],
|
| 469 |
+
"telemetry": {
|
| 470 |
+
"engine_mode": "app_fallback",
|
| 471 |
+
"used_fallback": True,
|
| 472 |
+
"fallback_reason": "missing_final_event",
|
| 473 |
+
},
|
| 474 |
+
}
|
| 475 |
+
chat_history[-1]["content"] = full_message
|
| 476 |
+
|
| 477 |
+
status_text = _format_status_panel(gs)
|
| 478 |
+
btn_updates = _get_button_updates(fallback_options)
|
| 479 |
+
_record_interaction_log(
|
| 480 |
+
game_session,
|
| 481 |
+
input_source="text_input",
|
| 482 |
+
user_input=user_input,
|
| 483 |
+
intent_result=intent,
|
| 484 |
+
output_text=full_message,
|
| 485 |
+
latency_ms=(perf_counter() - turn_started) * 1000,
|
| 486 |
+
nlu_latency_ms=nlu_latency_ms,
|
| 487 |
+
generation_latency_ms=generation_latency_ms,
|
| 488 |
+
final_result=fallback_result,
|
| 489 |
+
)
|
| 490 |
|
| 491 |
yield (
|
| 492 |
chat_history,
|
|
|
|
| 525 |
)
|
| 526 |
return
|
| 527 |
|
| 528 |
+
selected_option = options[option_idx]
|
| 529 |
+
gs: GameState = game_session["game_state"]
|
| 530 |
+
story: StoryEngine = game_session["story"]
|
| 531 |
+
option_intent = _build_option_intent(selected_option)
|
| 532 |
+
turn_started = perf_counter()
|
| 533 |
|
| 534 |
# 检查特殊选项:重新开始
|
| 535 |
if selected_option.get("action_type") == "RESTART":
|
|
|
|
| 613 |
game_session,
|
| 614 |
)
|
| 615 |
|
| 616 |
+
generation_started = perf_counter()
|
| 617 |
+
final_result = None
|
| 618 |
+
for update in story.process_option_selection_stream(selected_option):
|
| 619 |
+
if update["type"] == "story_chunk":
|
| 620 |
+
chat_history[-1]["content"] = update["text"]
|
| 621 |
+
yield (
|
| 622 |
+
chat_history,
|
| 623 |
_format_status_panel(gs),
|
| 624 |
loading[0], loading[1], loading[2],
|
| 625 |
game_session,
|
| 626 |
)
|
| 627 |
+
elif update["type"] == "final":
|
| 628 |
+
final_result = update
|
| 629 |
+
|
| 630 |
+
generation_latency_ms = (perf_counter() - generation_started) * 1000
|
| 631 |
|
| 632 |
if final_result:
|
| 633 |
# ★ 安全兜底:强制确保恰好 3 个选项
|
|
|
|
| 642 |
|
| 643 |
options_text = _format_options(options)
|
| 644 |
full_message = f"{final_result['story_text']}{log_text}\n\n{options_text}"
|
| 645 |
+
chat_history[-1]["content"] = full_message
|
| 646 |
+
|
| 647 |
+
status_text = _format_status_panel(gs)
|
| 648 |
+
btn_updates = _get_button_updates(options)
|
| 649 |
+
_record_interaction_log(
|
| 650 |
+
game_session,
|
| 651 |
+
input_source="option_click",
|
| 652 |
+
user_input=selected_option.get("text", ""),
|
| 653 |
+
intent_result=option_intent,
|
| 654 |
+
output_text=full_message,
|
| 655 |
+
latency_ms=(perf_counter() - turn_started) * 1000,
|
| 656 |
+
generation_latency_ms=generation_latency_ms,
|
| 657 |
+
final_result=final_result,
|
| 658 |
+
selected_option=selected_option,
|
| 659 |
+
)
|
| 660 |
+
|
| 661 |
+
yield (
|
| 662 |
+
chat_history, status_text,
|
| 663 |
btn_updates[0], btn_updates[1], btn_updates[2],
|
| 664 |
game_session,
|
| 665 |
)
|
|
|
|
| 668 |
logger.warning("[选项点击] 流式生成未产生 final 事件,使用兜底文本")
|
| 669 |
fallback_text = "你环顾四周,思考着接下来该做什么..."
|
| 670 |
fallback_options = _ensure_min_options([], 3)
|
| 671 |
+
game_session["current_options"] = fallback_options
|
| 672 |
+
|
| 673 |
+
options_text = _format_options(fallback_options)
|
| 674 |
+
full_message = f"{fallback_text}\n\n{options_text}"
|
| 675 |
+
fallback_result = {
|
| 676 |
+
"story_text": fallback_text,
|
| 677 |
+
"options": fallback_options,
|
| 678 |
+
"state_changes": {},
|
| 679 |
+
"change_log": [],
|
| 680 |
+
"consistency_issues": [],
|
| 681 |
+
"telemetry": {
|
| 682 |
+
"engine_mode": "app_fallback",
|
| 683 |
+
"used_fallback": True,
|
| 684 |
+
"fallback_reason": "missing_final_event",
|
| 685 |
+
},
|
| 686 |
+
}
|
| 687 |
+
chat_history[-1]["content"] = full_message
|
| 688 |
+
|
| 689 |
+
status_text = _format_status_panel(gs)
|
| 690 |
+
btn_updates = _get_button_updates(fallback_options)
|
| 691 |
+
_record_interaction_log(
|
| 692 |
+
game_session,
|
| 693 |
+
input_source="option_click",
|
| 694 |
+
user_input=selected_option.get("text", ""),
|
| 695 |
+
intent_result=option_intent,
|
| 696 |
+
output_text=full_message,
|
| 697 |
+
latency_ms=(perf_counter() - turn_started) * 1000,
|
| 698 |
+
generation_latency_ms=generation_latency_ms,
|
| 699 |
+
final_result=fallback_result,
|
| 700 |
+
selected_option=selected_option,
|
| 701 |
+
)
|
| 702 |
|
| 703 |
yield (
|
| 704 |
chat_history, status_text,
|
evaluation/datasets/branch_divergence.json
ADDED
|
@@ -0,0 +1,84 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[
|
| 2 |
+
{
|
| 3 |
+
"id": "branch_001_village_square",
|
| 4 |
+
"setup": {
|
| 5 |
+
"player": {
|
| 6 |
+
"location": "村庄广场",
|
| 7 |
+
"inventory": ["面包", "小型治疗药水"]
|
| 8 |
+
},
|
| 9 |
+
"world": {
|
| 10 |
+
"current_scene": "村庄广场"
|
| 11 |
+
}
|
| 12 |
+
},
|
| 13 |
+
"branches": [
|
| 14 |
+
{
|
| 15 |
+
"label": "talk_elder",
|
| 16 |
+
"input": "和村长老伯谈谈最近森林里的怪事"
|
| 17 |
+
},
|
| 18 |
+
{
|
| 19 |
+
"label": "go_inn",
|
| 20 |
+
"input": "前往村庄旅店"
|
| 21 |
+
},
|
| 22 |
+
{
|
| 23 |
+
"label": "explore_square",
|
| 24 |
+
"input": "探索一下村庄广场"
|
| 25 |
+
}
|
| 26 |
+
]
|
| 27 |
+
},
|
| 28 |
+
{
|
| 29 |
+
"id": "branch_002_resource_management",
|
| 30 |
+
"setup": {
|
| 31 |
+
"player": {
|
| 32 |
+
"location": "村庄旅店",
|
| 33 |
+
"inventory": ["面包", "小型治疗药水"],
|
| 34 |
+
"hp": 58,
|
| 35 |
+
"morale": 65,
|
| 36 |
+
"sanity": 80
|
| 37 |
+
},
|
| 38 |
+
"world": {
|
| 39 |
+
"current_scene": "村庄旅店"
|
| 40 |
+
}
|
| 41 |
+
},
|
| 42 |
+
"branches": [
|
| 43 |
+
{
|
| 44 |
+
"label": "rest",
|
| 45 |
+
"input": "休息一会儿"
|
| 46 |
+
},
|
| 47 |
+
{
|
| 48 |
+
"label": "use_potion",
|
| 49 |
+
"input": "使用小型治疗药水"
|
| 50 |
+
},
|
| 51 |
+
{
|
| 52 |
+
"label": "talk_innkeeper",
|
| 53 |
+
"input": "和旅店老板娘莉娜聊聊"
|
| 54 |
+
}
|
| 55 |
+
]
|
| 56 |
+
},
|
| 57 |
+
{
|
| 58 |
+
"id": "branch_003_roadside_choices",
|
| 59 |
+
"setup": {
|
| 60 |
+
"player": {
|
| 61 |
+
"location": "村口小路",
|
| 62 |
+
"inventory": ["面包", "小型治疗药水"],
|
| 63 |
+
"hp": 85
|
| 64 |
+
},
|
| 65 |
+
"world": {
|
| 66 |
+
"current_scene": "村口小路"
|
| 67 |
+
}
|
| 68 |
+
},
|
| 69 |
+
"branches": [
|
| 70 |
+
{
|
| 71 |
+
"label": "enter_forest",
|
| 72 |
+
"input": "前往黑暗森林入口"
|
| 73 |
+
},
|
| 74 |
+
{
|
| 75 |
+
"label": "return_square",
|
| 76 |
+
"input": "回村庄广场"
|
| 77 |
+
},
|
| 78 |
+
{
|
| 79 |
+
"label": "explore_road",
|
| 80 |
+
"input": "搜索附近有没有线索"
|
| 81 |
+
}
|
| 82 |
+
]
|
| 83 |
+
}
|
| 84 |
+
]
|
evaluation/datasets/consistency.json
ADDED
|
@@ -0,0 +1,283 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"action_guard_cases": [
|
| 3 |
+
{
|
| 4 |
+
"id": "guard_001",
|
| 5 |
+
"setup": {
|
| 6 |
+
"player": {
|
| 7 |
+
"location": "村庄广场",
|
| 8 |
+
"inventory": ["面包", "小型治疗药水"]
|
| 9 |
+
},
|
| 10 |
+
"world": {
|
| 11 |
+
"current_scene": "村庄广场"
|
| 12 |
+
}
|
| 13 |
+
},
|
| 14 |
+
"intent": {
|
| 15 |
+
"intent": "USE_ITEM",
|
| 16 |
+
"target": "小型治疗药水",
|
| 17 |
+
"details": "喝掉药水",
|
| 18 |
+
"raw_input": "使用小型治疗药水"
|
| 19 |
+
},
|
| 20 |
+
"expected_valid": true
|
| 21 |
+
},
|
| 22 |
+
{
|
| 23 |
+
"id": "guard_002",
|
| 24 |
+
"setup": {
|
| 25 |
+
"player": {
|
| 26 |
+
"location": "村庄广场",
|
| 27 |
+
"inventory": ["面包"]
|
| 28 |
+
},
|
| 29 |
+
"world": {
|
| 30 |
+
"current_scene": "村庄广场"
|
| 31 |
+
}
|
| 32 |
+
},
|
| 33 |
+
"intent": {
|
| 34 |
+
"intent": "USE_ITEM",
|
| 35 |
+
"target": "火把",
|
| 36 |
+
"details": "点亮火把",
|
| 37 |
+
"raw_input": "使用火把"
|
| 38 |
+
},
|
| 39 |
+
"expected_valid": false
|
| 40 |
+
},
|
| 41 |
+
{
|
| 42 |
+
"id": "guard_003",
|
| 43 |
+
"setup": {
|
| 44 |
+
"player": {
|
| 45 |
+
"location": "村庄广场"
|
| 46 |
+
},
|
| 47 |
+
"world": {
|
| 48 |
+
"current_scene": "村庄广场"
|
| 49 |
+
}
|
| 50 |
+
},
|
| 51 |
+
"intent": {
|
| 52 |
+
"intent": "MOVE",
|
| 53 |
+
"target": "村庄旅店",
|
| 54 |
+
"details": "去旅店",
|
| 55 |
+
"raw_input": "前往村庄旅店"
|
| 56 |
+
},
|
| 57 |
+
"expected_valid": true
|
| 58 |
+
},
|
| 59 |
+
{
|
| 60 |
+
"id": "guard_004",
|
| 61 |
+
"setup": {
|
| 62 |
+
"player": {
|
| 63 |
+
"location": "村庄广场"
|
| 64 |
+
},
|
| 65 |
+
"world": {
|
| 66 |
+
"current_scene": "村庄广场"
|
| 67 |
+
}
|
| 68 |
+
},
|
| 69 |
+
"intent": {
|
| 70 |
+
"intent": "MOVE",
|
| 71 |
+
"target": "森林深处",
|
| 72 |
+
"details": "直接冲进森林深处",
|
| 73 |
+
"raw_input": "去森林深处"
|
| 74 |
+
},
|
| 75 |
+
"expected_valid": true
|
| 76 |
+
},
|
| 77 |
+
{
|
| 78 |
+
"id": "guard_005",
|
| 79 |
+
"setup": {
|
| 80 |
+
"player": {
|
| 81 |
+
"location": "村庄广场",
|
| 82 |
+
"inventory": ["铁剑", "面包"]
|
| 83 |
+
},
|
| 84 |
+
"world": {
|
| 85 |
+
"current_scene": "村庄广场"
|
| 86 |
+
}
|
| 87 |
+
},
|
| 88 |
+
"intent": {
|
| 89 |
+
"intent": "EQUIP",
|
| 90 |
+
"target": "铁剑",
|
| 91 |
+
"details": "装备武器",
|
| 92 |
+
"raw_input": "装备铁剑"
|
| 93 |
+
},
|
| 94 |
+
"expected_valid": true
|
| 95 |
+
},
|
| 96 |
+
{
|
| 97 |
+
"id": "guard_006",
|
| 98 |
+
"setup": {
|
| 99 |
+
"player": {
|
| 100 |
+
"location": "村庄广场",
|
| 101 |
+
"inventory": ["面包"],
|
| 102 |
+
"equipment": {
|
| 103 |
+
"weapon": "铁剑"
|
| 104 |
+
}
|
| 105 |
+
},
|
| 106 |
+
"world": {
|
| 107 |
+
"current_scene": "村庄广场"
|
| 108 |
+
}
|
| 109 |
+
},
|
| 110 |
+
"intent": {
|
| 111 |
+
"intent": "EQUIP",
|
| 112 |
+
"target": "铁剑",
|
| 113 |
+
"details": "再装备一次铁剑",
|
| 114 |
+
"raw_input": "装备铁剑"
|
| 115 |
+
},
|
| 116 |
+
"expected_valid": false
|
| 117 |
+
},
|
| 118 |
+
{
|
| 119 |
+
"id": "guard_007",
|
| 120 |
+
"setup": {
|
| 121 |
+
"player": {
|
| 122 |
+
"location": "村庄广场",
|
| 123 |
+
"skills": ["火球术"]
|
| 124 |
+
},
|
| 125 |
+
"world": {
|
| 126 |
+
"current_scene": "村庄广场"
|
| 127 |
+
}
|
| 128 |
+
},
|
| 129 |
+
"intent": {
|
| 130 |
+
"intent": "SKILL",
|
| 131 |
+
"target": "火球术",
|
| 132 |
+
"details": "施法",
|
| 133 |
+
"raw_input": "施放火球术"
|
| 134 |
+
},
|
| 135 |
+
"expected_valid": true
|
| 136 |
+
},
|
| 137 |
+
{
|
| 138 |
+
"id": "guard_008",
|
| 139 |
+
"setup": {
|
| 140 |
+
"player": {
|
| 141 |
+
"location": "村庄广场",
|
| 142 |
+
"skills": []
|
| 143 |
+
},
|
| 144 |
+
"world": {
|
| 145 |
+
"current_scene": "村庄广场"
|
| 146 |
+
}
|
| 147 |
+
},
|
| 148 |
+
"intent": {
|
| 149 |
+
"intent": "SKILL",
|
| 150 |
+
"target": "火球术",
|
| 151 |
+
"details": "施法",
|
| 152 |
+
"raw_input": "施放火球术"
|
| 153 |
+
},
|
| 154 |
+
"expected_valid": false
|
| 155 |
+
}
|
| 156 |
+
],
|
| 157 |
+
"state_check_cases": [
|
| 158 |
+
{
|
| 159 |
+
"id": "state_001",
|
| 160 |
+
"setup": {
|
| 161 |
+
"player": {
|
| 162 |
+
"location": "村庄广场",
|
| 163 |
+
"gold": 50
|
| 164 |
+
},
|
| 165 |
+
"world": {
|
| 166 |
+
"current_scene": "村庄广场"
|
| 167 |
+
}
|
| 168 |
+
},
|
| 169 |
+
"proposed_changes": {
|
| 170 |
+
"new_location": "村庄旅店"
|
| 171 |
+
},
|
| 172 |
+
"expected_contradiction": false
|
| 173 |
+
},
|
| 174 |
+
{
|
| 175 |
+
"id": "state_002",
|
| 176 |
+
"setup": {
|
| 177 |
+
"player": {
|
| 178 |
+
"location": "村庄广场"
|
| 179 |
+
},
|
| 180 |
+
"world": {
|
| 181 |
+
"current_scene": "村庄广场"
|
| 182 |
+
}
|
| 183 |
+
},
|
| 184 |
+
"proposed_changes": {
|
| 185 |
+
"new_location": "森林深处"
|
| 186 |
+
},
|
| 187 |
+
"expected_contradiction": true,
|
| 188 |
+
"expected_contains": ["不相邻"]
|
| 189 |
+
},
|
| 190 |
+
{
|
| 191 |
+
"id": "state_003",
|
| 192 |
+
"setup": {
|
| 193 |
+
"player": {
|
| 194 |
+
"location": "村庄广场",
|
| 195 |
+
"gold": 50
|
| 196 |
+
},
|
| 197 |
+
"world": {
|
| 198 |
+
"current_scene": "村庄广场"
|
| 199 |
+
}
|
| 200 |
+
},
|
| 201 |
+
"proposed_changes": {
|
| 202 |
+
"gold_change": -80
|
| 203 |
+
},
|
| 204 |
+
"expected_contradiction": true,
|
| 205 |
+
"expected_contains": ["金币"]
|
| 206 |
+
},
|
| 207 |
+
{
|
| 208 |
+
"id": "state_004",
|
| 209 |
+
"setup": {
|
| 210 |
+
"player": {
|
| 211 |
+
"location": "村庄广场",
|
| 212 |
+
"inventory": ["面包"]
|
| 213 |
+
},
|
| 214 |
+
"world": {
|
| 215 |
+
"current_scene": "村庄广场"
|
| 216 |
+
}
|
| 217 |
+
},
|
| 218 |
+
"proposed_changes": {
|
| 219 |
+
"items_lost": ["火把"]
|
| 220 |
+
},
|
| 221 |
+
"expected_contradiction": true,
|
| 222 |
+
"expected_contains": ["未持有"]
|
| 223 |
+
},
|
| 224 |
+
{
|
| 225 |
+
"id": "state_005",
|
| 226 |
+
"setup": {
|
| 227 |
+
"player": {
|
| 228 |
+
"location": "村庄广场",
|
| 229 |
+
"inventory": ["小型治疗药水"]
|
| 230 |
+
},
|
| 231 |
+
"world": {
|
| 232 |
+
"current_scene": "村庄广场"
|
| 233 |
+
}
|
| 234 |
+
},
|
| 235 |
+
"proposed_changes": {
|
| 236 |
+
"items_lost": ["小型治疗药水"]
|
| 237 |
+
},
|
| 238 |
+
"expected_contradiction": false
|
| 239 |
+
},
|
| 240 |
+
{
|
| 241 |
+
"id": "state_006",
|
| 242 |
+
"setup": {
|
| 243 |
+
"player": {
|
| 244 |
+
"location": "村庄广场",
|
| 245 |
+
"inventory": ["铁剑"]
|
| 246 |
+
},
|
| 247 |
+
"world": {
|
| 248 |
+
"current_scene": "村庄广场"
|
| 249 |
+
}
|
| 250 |
+
},
|
| 251 |
+
"proposed_changes": {
|
| 252 |
+
"items_lost": ["铁剑"]
|
| 253 |
+
},
|
| 254 |
+
"expected_contradiction": true,
|
| 255 |
+
"expected_contains": ["不是消耗品"]
|
| 256 |
+
},
|
| 257 |
+
{
|
| 258 |
+
"id": "state_007",
|
| 259 |
+
"setup": {
|
| 260 |
+
"player": {
|
| 261 |
+
"location": "村庄广场"
|
| 262 |
+
},
|
| 263 |
+
"world": {
|
| 264 |
+
"current_scene": "村庄广场"
|
| 265 |
+
},
|
| 266 |
+
"npc_overrides": {
|
| 267 |
+
"村长老伯": {
|
| 268 |
+
"is_alive": false
|
| 269 |
+
}
|
| 270 |
+
}
|
| 271 |
+
},
|
| 272 |
+
"proposed_changes": {
|
| 273 |
+
"npc_changes": {
|
| 274 |
+
"村长老伯": {
|
| 275 |
+
"attitude": "friendly"
|
| 276 |
+
}
|
| 277 |
+
}
|
| 278 |
+
},
|
| 279 |
+
"expected_contradiction": true,
|
| 280 |
+
"expected_contains": ["已经死亡"]
|
| 281 |
+
}
|
| 282 |
+
]
|
| 283 |
+
}
|
evaluation/datasets/intent_accuracy.json
ADDED
|
@@ -0,0 +1,201 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[
|
| 2 |
+
{
|
| 3 |
+
"id": "intent_001",
|
| 4 |
+
"input": "和村长老伯谈谈最近森林里的怪事",
|
| 5 |
+
"intent": "TALK",
|
| 6 |
+
"target": "村长老伯",
|
| 7 |
+
"setup": {
|
| 8 |
+
"player": {
|
| 9 |
+
"location": "村庄广场"
|
| 10 |
+
},
|
| 11 |
+
"world": {
|
| 12 |
+
"current_scene": "村庄广场"
|
| 13 |
+
}
|
| 14 |
+
}
|
| 15 |
+
},
|
| 16 |
+
{
|
| 17 |
+
"id": "intent_002",
|
| 18 |
+
"input": "前往村庄旅店",
|
| 19 |
+
"intent": "MOVE",
|
| 20 |
+
"target": "村庄旅店",
|
| 21 |
+
"setup": {
|
| 22 |
+
"player": {
|
| 23 |
+
"location": "村庄广场"
|
| 24 |
+
},
|
| 25 |
+
"world": {
|
| 26 |
+
"current_scene": "村庄广场"
|
| 27 |
+
}
|
| 28 |
+
}
|
| 29 |
+
},
|
| 30 |
+
{
|
| 31 |
+
"id": "intent_003",
|
| 32 |
+
"input": "探索一下村庄广场",
|
| 33 |
+
"intent": "EXPLORE"
|
| 34 |
+
},
|
| 35 |
+
{
|
| 36 |
+
"id": "intent_004",
|
| 37 |
+
"input": "使用小型治疗药水",
|
| 38 |
+
"intent": "USE_ITEM",
|
| 39 |
+
"target": "小型治疗药水"
|
| 40 |
+
},
|
| 41 |
+
{
|
| 42 |
+
"id": "intent_005",
|
| 43 |
+
"input": "装备铁剑",
|
| 44 |
+
"intent": "EQUIP",
|
| 45 |
+
"target": "铁剑"
|
| 46 |
+
},
|
| 47 |
+
{
|
| 48 |
+
"id": "intent_006",
|
| 49 |
+
"input": "和铁匠格林交易",
|
| 50 |
+
"intent": "TRADE",
|
| 51 |
+
"target": "铁匠格林",
|
| 52 |
+
"setup": {
|
| 53 |
+
"player": {
|
| 54 |
+
"location": "村庄铁匠铺"
|
| 55 |
+
},
|
| 56 |
+
"world": {
|
| 57 |
+
"current_scene": "村庄铁匠铺"
|
| 58 |
+
}
|
| 59 |
+
}
|
| 60 |
+
},
|
| 61 |
+
{
|
| 62 |
+
"id": "intent_007",
|
| 63 |
+
"input": "休息一会儿",
|
| 64 |
+
"intent": "REST"
|
| 65 |
+
},
|
| 66 |
+
{
|
| 67 |
+
"id": "intent_008",
|
| 68 |
+
"input": "查看当前任务",
|
| 69 |
+
"intent": "QUEST"
|
| 70 |
+
},
|
| 71 |
+
{
|
| 72 |
+
"id": "intent_009",
|
| 73 |
+
"input": "施放火球术",
|
| 74 |
+
"intent": "SKILL",
|
| 75 |
+
"setup": {
|
| 76 |
+
"player": {
|
| 77 |
+
"location": "村庄广场",
|
| 78 |
+
"skills": [
|
| 79 |
+
"火球术"
|
| 80 |
+
]
|
| 81 |
+
},
|
| 82 |
+
"world": {
|
| 83 |
+
"current_scene": "村庄广场"
|
| 84 |
+
}
|
| 85 |
+
}
|
| 86 |
+
},
|
| 87 |
+
{
|
| 88 |
+
"id": "intent_010",
|
| 89 |
+
"input": "拿起火把",
|
| 90 |
+
"intent": "PICKUP",
|
| 91 |
+
"target": "火把"
|
| 92 |
+
},
|
| 93 |
+
{
|
| 94 |
+
"id": "intent_011",
|
| 95 |
+
"input": "赶紧逃跑",
|
| 96 |
+
"intent": "FLEE"
|
| 97 |
+
},
|
| 98 |
+
{
|
| 99 |
+
"id": "intent_012",
|
| 100 |
+
"input": "和旅店老板娘莉娜聊聊",
|
| 101 |
+
"intent": "TALK",
|
| 102 |
+
"target": "旅店老板娘莉娜",
|
| 103 |
+
"setup": {
|
| 104 |
+
"player": {
|
| 105 |
+
"location": "村庄旅店"
|
| 106 |
+
},
|
| 107 |
+
"world": {
|
| 108 |
+
"current_scene": "村庄旅店"
|
| 109 |
+
}
|
| 110 |
+
}
|
| 111 |
+
},
|
| 112 |
+
{
|
| 113 |
+
"id": "intent_013",
|
| 114 |
+
"input": "买一瓶解毒药水",
|
| 115 |
+
"intent": "TRADE",
|
| 116 |
+
"target": "解毒药水"
|
| 117 |
+
},
|
| 118 |
+
{
|
| 119 |
+
"id": "intent_014",
|
| 120 |
+
"input": "去村口小路看看",
|
| 121 |
+
"intent": "MOVE",
|
| 122 |
+
"target": "村口小路",
|
| 123 |
+
"setup": {
|
| 124 |
+
"player": {
|
| 125 |
+
"location": "村庄广场"
|
| 126 |
+
},
|
| 127 |
+
"world": {
|
| 128 |
+
"current_scene": "村庄广场"
|
| 129 |
+
}
|
| 130 |
+
}
|
| 131 |
+
},
|
| 132 |
+
{
|
| 133 |
+
"id": "intent_015",
|
| 134 |
+
"input": "吃一个面包",
|
| 135 |
+
"intent": "USE_ITEM",
|
| 136 |
+
"target": "面包"
|
| 137 |
+
},
|
| 138 |
+
{
|
| 139 |
+
"id": "intent_016",
|
| 140 |
+
"input": "接受这个任务",
|
| 141 |
+
"intent": "QUEST"
|
| 142 |
+
},
|
| 143 |
+
{
|
| 144 |
+
"id": "intent_017",
|
| 145 |
+
"input": "搜索附近有没有线索",
|
| 146 |
+
"intent": "EXPLORE"
|
| 147 |
+
},
|
| 148 |
+
{
|
| 149 |
+
"id": "intent_018",
|
| 150 |
+
"input": "穿上皮甲",
|
| 151 |
+
"intent": "EQUIP",
|
| 152 |
+
"target": "皮甲"
|
| 153 |
+
},
|
| 154 |
+
{
|
| 155 |
+
"id": "intent_019",
|
| 156 |
+
"input": "去村庄杂货铺买点东西",
|
| 157 |
+
"intent": "TRADE",
|
| 158 |
+
"target": "村庄杂货铺",
|
| 159 |
+
"setup": {
|
| 160 |
+
"player": {
|
| 161 |
+
"location": "村庄广场"
|
| 162 |
+
},
|
| 163 |
+
"world": {
|
| 164 |
+
"current_scene": "村庄广场"
|
| 165 |
+
}
|
| 166 |
+
}
|
| 167 |
+
},
|
| 168 |
+
{
|
| 169 |
+
"id": "intent_020",
|
| 170 |
+
"input": "调查黑暗森林入口",
|
| 171 |
+
"intent": "EXPLORE",
|
| 172 |
+
"target": "黑暗森林入口",
|
| 173 |
+
"setup": {
|
| 174 |
+
"player": {
|
| 175 |
+
"location": "村口小路"
|
| 176 |
+
},
|
| 177 |
+
"world": {
|
| 178 |
+
"current_scene": "村口小路"
|
| 179 |
+
}
|
| 180 |
+
}
|
| 181 |
+
},
|
| 182 |
+
{
|
| 183 |
+
"id": "intent_021",
|
| 184 |
+
"input": "和村长老伯谈判",
|
| 185 |
+
"intent": "TALK",
|
| 186 |
+
"target": "村长老伯",
|
| 187 |
+
"setup": {
|
| 188 |
+
"player": {
|
| 189 |
+
"location": "村庄广场"
|
| 190 |
+
},
|
| 191 |
+
"world": {
|
| 192 |
+
"current_scene": "村庄广场"
|
| 193 |
+
}
|
| 194 |
+
}
|
| 195 |
+
},
|
| 196 |
+
{
|
| 197 |
+
"id": "intent_022",
|
| 198 |
+
"input": "我想扔石头试试看",
|
| 199 |
+
"intent": "CUSTOM"
|
| 200 |
+
}
|
| 201 |
+
]
|
evaluation/datasets/latency.json
ADDED
|
@@ -0,0 +1,79 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[
|
| 2 |
+
{
|
| 3 |
+
"id": "latency_001",
|
| 4 |
+
"input": "和村长老伯谈谈最近森林里的怪事",
|
| 5 |
+
"setup": {
|
| 6 |
+
"player": {
|
| 7 |
+
"location": "村庄广场"
|
| 8 |
+
},
|
| 9 |
+
"world": {
|
| 10 |
+
"current_scene": "村庄广场"
|
| 11 |
+
}
|
| 12 |
+
}
|
| 13 |
+
},
|
| 14 |
+
{
|
| 15 |
+
"id": "latency_002",
|
| 16 |
+
"input": "前往村庄旅店",
|
| 17 |
+
"setup": {
|
| 18 |
+
"player": {
|
| 19 |
+
"location": "村庄广场"
|
| 20 |
+
},
|
| 21 |
+
"world": {
|
| 22 |
+
"current_scene": "村庄广场"
|
| 23 |
+
}
|
| 24 |
+
}
|
| 25 |
+
},
|
| 26 |
+
{
|
| 27 |
+
"id": "latency_003",
|
| 28 |
+
"input": "使用小型治疗药水",
|
| 29 |
+
"setup": {
|
| 30 |
+
"player": {
|
| 31 |
+
"location": "村庄旅店",
|
| 32 |
+
"inventory": ["面包", "小型治疗药水"],
|
| 33 |
+
"hp": 65
|
| 34 |
+
},
|
| 35 |
+
"world": {
|
| 36 |
+
"current_scene": "村庄旅店"
|
| 37 |
+
}
|
| 38 |
+
}
|
| 39 |
+
},
|
| 40 |
+
{
|
| 41 |
+
"id": "latency_004",
|
| 42 |
+
"input": "探索一下村口小路",
|
| 43 |
+
"setup": {
|
| 44 |
+
"player": {
|
| 45 |
+
"location": "村口小路"
|
| 46 |
+
},
|
| 47 |
+
"world": {
|
| 48 |
+
"current_scene": "村口小路"
|
| 49 |
+
}
|
| 50 |
+
}
|
| 51 |
+
},
|
| 52 |
+
{
|
| 53 |
+
"id": "latency_005",
|
| 54 |
+
"input": "和铁匠格林交易",
|
| 55 |
+
"setup": {
|
| 56 |
+
"player": {
|
| 57 |
+
"location": "村庄铁匠铺"
|
| 58 |
+
},
|
| 59 |
+
"world": {
|
| 60 |
+
"current_scene": "村庄铁匠铺"
|
| 61 |
+
}
|
| 62 |
+
}
|
| 63 |
+
},
|
| 64 |
+
{
|
| 65 |
+
"id": "latency_006",
|
| 66 |
+
"input": "休息一会儿",
|
| 67 |
+
"setup": {
|
| 68 |
+
"player": {
|
| 69 |
+
"location": "村庄旅店",
|
| 70 |
+
"hp": 72,
|
| 71 |
+
"morale": 60,
|
| 72 |
+
"sanity": 82
|
| 73 |
+
},
|
| 74 |
+
"world": {
|
| 75 |
+
"current_scene": "村庄旅店"
|
| 76 |
+
}
|
| 77 |
+
}
|
| 78 |
+
}
|
| 79 |
+
]
|
evaluation/run_evaluations.py
ADDED
|
@@ -0,0 +1,567 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import argparse
|
| 4 |
+
import json
|
| 5 |
+
import statistics
|
| 6 |
+
import sys
|
| 7 |
+
from collections import Counter, defaultdict
|
| 8 |
+
from copy import deepcopy
|
| 9 |
+
from datetime import datetime
|
| 10 |
+
from difflib import SequenceMatcher
|
| 11 |
+
from itertools import combinations
|
| 12 |
+
from pathlib import Path
|
| 13 |
+
from time import perf_counter
|
| 14 |
+
from typing import Any
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
PROJECT_ROOT = Path(__file__).resolve().parents[1]
|
| 18 |
+
if str(PROJECT_ROOT) not in sys.path:
|
| 19 |
+
sys.path.insert(0, str(PROJECT_ROOT))
|
| 20 |
+
|
| 21 |
+
from nlu_engine import NLUEngine
|
| 22 |
+
from state_manager import GameState
|
| 23 |
+
from story_engine import StoryEngine
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
DATASET_DIR = PROJECT_ROOT / "evaluation" / "datasets"
|
| 27 |
+
RESULTS_DIR = PROJECT_ROOT / "evaluation" / "results"
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
def _json_safe(value: Any) -> Any:
    """Recursively convert *value* into something ``json.dump`` can serialise.

    Args:
        value: Any object produced by the evaluation run.

    Returns:
        ``None``, strings, numbers and booleans unchanged; dicts with their
        keys stringified; lists and tuples as lists; sets and frozensets as
        deterministically ordered lists; objects exposing ``model_dump()`` as
        the converted dump; anything else as ``str(value)``.
    """
    if value is None or isinstance(value, (str, int, float, bool)):
        return value
    if isinstance(value, dict):
        return {str(key): _json_safe(val) for key, val in value.items()}
    if isinstance(value, (set, frozenset)):
        # Set iteration order is not stable between interpreter runs (string
        # hashing is randomised), so sort the converted members to keep the
        # written results reproducible. ``repr`` is used as the key so that
        # members of mixed or unorderable types can still be sorted.
        return sorted((_json_safe(item) for item in value), key=repr)
    if isinstance(value, (list, tuple)):
        return [_json_safe(item) for item in value]
    if hasattr(value, "model_dump"):
        return _json_safe(value.model_dump())
    return str(value)
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
def _normalize_text(value: Any) -> str:
    """Return *value* as a trimmed, lower-cased string.

    Falsy inputs (``None``, ``""``, ``0`` ...) normalise to the empty string.
    """
    text = str(value) if value else ""
    trimmed = text.strip()
    return trimmed.lower()
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
def _load_dataset(name: str) -> Any:
    """Load and parse ``<DATASET_DIR>/<name>.json`` (read as UTF-8)."""
    dataset_path = DATASET_DIR / f"{name}.json"
    raw_text = dataset_path.read_text(encoding="utf-8")
    return json.loads(raw_text)
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
def _apply_setup(game_state: GameState, setup: dict[str, Any] | None) -> GameState:
    """Overlay a dataset ``setup`` mapping onto *game_state* in place.

    Args:
        game_state: State object to mutate.
        setup: Optional mapping with ``player``, ``world``, ``npc_overrides``
            and ``turn`` entries. When empty or ``None`` only the player
            location is synchronised with the current scene.

    Returns:
        The same *game_state* object, after mutation.
    """
    player = game_state.player
    world = game_state.world

    if not setup:
        player.location = world.current_scene
        return game_state

    player_overrides = setup.get("player", {})
    world_overrides = setup.get("world", {})

    # Player fields: list-like fields are copied, equipment is merged on top
    # of the existing slots, everything else is assigned as a deep copy.
    for field, new_value in player_overrides.items():
        if field in ("inventory", "skills"):
            setattr(player, field, list(new_value))
        elif field == "equipment":
            player.equipment = {**dict(player.equipment), **dict(new_value)}
        else:
            setattr(player, field, deepcopy(new_value))

    # World fields: global flags are merged into the existing mapping.
    for field, new_value in world_overrides.items():
        if field == "discovered_locations":
            world.discovered_locations = list(new_value)
        elif field == "global_flags":
            world.global_flags.update(dict(new_value))
        else:
            setattr(world, field, deepcopy(new_value))

    # NPC overrides naming an NPC that is not in the world are ignored.
    for npc_name, npc_fields in setup.get("npc_overrides", {}).items():
        npc = world.npcs.get(npc_name)
        if npc is not None:
            for field, new_value in npc_fields.items():
                setattr(npc, field, deepcopy(new_value))

    if "turn" in setup:
        game_state.turn = int(setup["turn"])

    # Keep player location and current scene in step when only one of the
    # two was supplied (or when neither section had any entries).
    location_given = "location" in player_overrides
    scene_given = "current_scene" in world_overrides
    if scene_given and not location_given:
        player.location = world.current_scene
    elif location_given and not scene_given:
        world.current_scene = player.location
    elif not player_overrides and not world_overrides:
        player.location = world.current_scene

    return game_state
|
| 97 |
+
|
| 98 |
+
|
| 99 |
+
def _build_game_state(setup: dict[str, Any] | None = None) -> GameState:
    """Create a ``GameState`` for player "Evaluator" and apply *setup* to it."""
    return _apply_setup(GameState(player_name="Evaluator"), setup)
|
| 102 |
+
|
| 103 |
+
|
| 104 |
+
def _state_snapshot(game_state: GameState) -> dict[str, Any]:
    """Capture the parts of *game_state* that are compared between runs.

    The snapshot holds the turn and mode, location/scene/time/weather,
    the player's numeric stats, copies of inventory, equipment and skills,
    and the status/objectives of every quest whose status is ``"active"``.
    """
    player = game_state.player
    world = game_state.world

    snapshot: dict[str, Any] = {
        "turn": game_state.turn,
        "game_mode": game_state.game_mode,
        "location": player.location,
        "scene": world.current_scene,
        "day": world.day_count,
    }
    for world_field in ("time_of_day", "weather"):
        snapshot[world_field] = getattr(world, world_field)
    for stat in ("hp", "mp", "gold", "morale", "sanity", "hunger"):
        snapshot[stat] = getattr(player, stat)

    # Containers are copied so later state mutations do not leak into the
    # snapshot.
    snapshot["inventory"] = list(player.inventory)
    snapshot["equipment"] = dict(player.equipment)
    snapshot["skills"] = list(player.skills)

    active_quests: dict[Any, dict[str, Any]] = {}
    for quest_id, quest in world.quests.items():
        if quest.status != "active":
            continue
        active_quests[quest_id] = {
            "status": quest.status,
            "objectives": dict(quest.objectives),
        }
    snapshot["active_quests"] = active_quests
    return snapshot
|
| 131 |
+
|
| 132 |
+
|
| 133 |
+
def _flatten(value: Any, prefix: str = "") -> set[str]:
    """Flatten nested dicts/lists into a set of ``"path=value"`` strings.

    Dict keys are joined with ``.``, list items are addressed as
    ``path[index]``, an empty list contributes ``"path=[]"`` and an empty
    dict contributes nothing. A list with no prefix uses the path ``list``.
    """
    if isinstance(value, dict):
        paths: set[str] = set()
        for key, child in value.items():
            paths |= _flatten(child, f"{prefix}.{key}" if prefix else str(key))
        return paths

    if isinstance(value, list):
        base = prefix or "list"
        if not value:
            return {f"{base}=[]"}
        paths = set()
        for position, child in enumerate(value):
            paths |= _flatten(child, f"{base}[{position}]")
        return paths

    return {f"{prefix}={value}"}
|
| 148 |
+
|
| 149 |
+
|
| 150 |
+
def _jaccard_distance(left: set[str], right: set[str]) -> float:
    """Return ``1 - |left & right| / |left | right|``; two empty sets give 0.0."""
    combined = left.union(right)
    if len(combined) == 0:
        return 0.0
    shared = left.intersection(right)
    return 1.0 - (len(shared) / len(combined))
|
| 156 |
+
|
| 157 |
+
|
| 158 |
+
def _option_texts(options: list[dict[str, Any]]) -> set[str]:
    """Collect the distinct display texts of *options*.

    Dict options contribute their ``"text"`` value (``""`` when absent);
    any other option is stringified. ``None`` is treated as no options.
    """
    return {
        str(entry.get("text", "")) if isinstance(entry, dict) else str(entry)
        for entry in (options or [])
    }
|
| 166 |
+
|
| 167 |
+
|
| 168 |
+
def _consume_story_stream(story_engine: StoryEngine, intent: dict[str, Any]) -> tuple[dict[str, Any], float]:
    """Drain ``story_engine.generate_story_stream(intent)`` and time it.

    Returns:
        ``(final_event, latency_ms)``. ``final_event`` is the last update
        whose ``"type"`` is ``"final"``. If the stream never emits one, a
        placeholder result is built from the last ``"story_chunk"`` text and
        flagged as a fallback in its ``"telemetry"``.
    """
    chunk_texts: list[str] = []
    final_event: dict[str, Any] | None = None

    clock_start = perf_counter()
    for event in story_engine.generate_story_stream(intent):
        if event["type"] == "story_chunk":
            chunk_texts.append(event["text"])
        elif event["type"] == "final":
            final_event = event
    elapsed_ms = (perf_counter() - clock_start) * 1000

    if final_event is not None:
        return final_event, elapsed_ms

    # The stream ended without a "final" event: synthesise a result so the
    # caller can still record the run.
    last_text = chunk_texts[-1] if chunk_texts else ""
    placeholder: dict[str, Any] = {
        "story_text": last_text,
        "options": [],
        "state_changes": {},
        "change_log": [],
        "consistency_issues": [],
        "telemetry": {
            "engine_mode": "evaluation_fallback",
            "used_fallback": True,
            "fallback_reason": "missing_final_event",
        },
    }
    return placeholder, elapsed_ms
|
| 195 |
+
|
| 196 |
+
|
| 197 |
+
def _run_text_turn(user_input: str, setup: dict[str, Any] | None = None) -> dict[str, Any]:
    """Run one free-text turn on a fresh game state and report the outcome.

    The input is parsed by a new ``NLUEngine`` and the resulting intent is
    narrated by a new ``StoryEngine``; both share one freshly built state.

    Returns:
        A dict with the input, the parsed intent, NLU/story/total latency in
        milliseconds, the final story result and a post-turn state snapshot.
    """
    state = _build_game_state(setup)
    intent_parser = NLUEngine(state)
    narrator = StoryEngine(state)

    parse_started = perf_counter()
    parsed_intent = intent_parser.parse_intent(user_input)
    parse_ms = (perf_counter() - parse_started) * 1000

    outcome, narrate_ms = _consume_story_stream(narrator, parsed_intent)

    turn_report: dict[str, Any] = {
        "user_input": user_input,
        "intent": parsed_intent,
        "nlu_latency_ms": parse_ms,
        "story_latency_ms": narrate_ms,
        "total_latency_ms": parse_ms + narrate_ms,
        "final_result": outcome,
    }
    # Snapshot last, so it reflects the state after the story stream ran.
    turn_report["state_snapshot"] = _state_snapshot(state)
    return turn_report
|
| 216 |
+
|
| 217 |
+
|
| 218 |
+
def _percentile(values: list[float], percentile: float) -> float:
    """Return the nearest-rank *percentile* of *values*.

    Args:
        values: Samples in any order; an empty list yields ``0.0``.
        percentile: Requested percentile on a 0-100 scale. Values outside
            that range are clamped to the smallest / largest sample.

    Returns:
        The sample at the rank nearest to ``percentile`` (no interpolation).
    """
    if not values:
        return 0.0
    ordered = sorted(values)
    last = len(ordered) - 1
    position = (percentile / 100) * last
    if position <= 0:
        return ordered[0]
    # Round half up explicitly. Built-in round() rounds halves to the nearest
    # even integer, which made tied ranks resolve inconsistently (p50 of two
    # samples chose the lower one, p50 of four samples chose the upper one).
    index = int(position)
    if position - index >= 0.5:
        index += 1
    return ordered[min(last, index)]
|
| 224 |
+
|
| 225 |
+
|
| 226 |
+
def evaluate_intent_accuracy() -> dict[str, Any]:
    """Score ``NLUEngine.parse_intent`` against the ``intent_accuracy`` dataset.

    Each example is parsed on a fresh game state. Intent is compared
    exactly; target is compared after text normalisation and only for
    examples that declare a ``"target"``.

    Returns:
        A report with overall intent/target accuracy, mean parse latency,
        a per-parser-source count, a confusion table and per-example rows.
    """
    examples = _load_dataset("intent_accuracy")

    rows = []
    source_counts = Counter()
    confusion_table = defaultdict(Counter)
    timings = []
    intent_hits = 0
    target_hits = 0
    target_cases = 0

    for example in examples:
        engine = NLUEngine(_build_game_state(example.get("setup")))

        tick = perf_counter()
        parsed = engine.parse_intent(example["input"])
        took_ms = (perf_counter() - tick) * 1000

        want_intent = example["intent"]
        got_intent = parsed.get("intent")
        intent_ok = got_intent == want_intent
        intent_hits += int(intent_ok)
        timings.append(took_ms)
        source_counts[parsed.get("parser_source", "unknown")] += 1
        confusion_table[want_intent][str(got_intent)] += 1

        want_target = example.get("target")
        got_target = parsed.get("target")
        # Stays None for examples without an expected target, so they do not
        # count towards target accuracy.
        target_ok = None
        if want_target is not None:
            target_cases += 1
            target_ok = _normalize_text(got_target) == _normalize_text(want_target)
            target_hits += int(bool(target_ok))

        row = {
            "id": example["id"],
            "input": example["input"],
            "expected_intent": want_intent,
            "predicted_intent": got_intent,
            "intent_correct": intent_ok,
            "expected_target": want_target,
            "predicted_target": got_target,
            "target_correct": target_ok,
            "parser_source": parsed.get("parser_source"),
            "latency_ms": round(took_ms, 2),
        }
        rows.append(row)

    intent_accuracy = round(intent_hits / len(examples), 4) if examples else 0.0
    target_accuracy = round(target_hits / target_cases, 4) if target_cases else None
    mean_latency = round(statistics.mean(timings), 2) if timings else 0.0

    return {
        "task": "intent_accuracy",
        "dataset_size": len(examples),
        "intent_accuracy": intent_accuracy,
        "target_accuracy": target_accuracy,
        "avg_latency_ms": mean_latency,
        "parser_source_breakdown": dict(source_counts),
        "confusion": {expected: dict(counts) for expected, counts in confusion_table.items()},
        "details": rows,
    }
|
| 285 |
+
|
| 286 |
+
|
| 287 |
+
def evaluate_consistency() -> dict[str, Any]:
    """Score the state guards against the ``consistency`` dataset.

    Two case families are evaluated, each on a fresh game state:

    * ``action_guard_cases``: ``pre_validate_action`` must return the
      expected validity flag.
    * ``state_check_cases``: ``check_consistency`` must (not) report
      contradictions as expected and, when ``expected_contains`` is given,
      every listed fragment must occur in at least one reported issue.

    Returns:
        A report with per-family and overall accuracy plus per-case rows.
    """
    cases = _load_dataset("consistency")
    guard_cases = cases["action_guard_cases"]
    state_cases = cases["state_check_cases"]

    guard_rows = []
    guard_hits = 0
    for guard_case in guard_cases:
        state = _build_game_state(guard_case.get("setup"))
        allowed, reason = state.pre_validate_action(guard_case["intent"])
        matched = allowed == guard_case["expected_valid"]
        guard_hits += int(matched)
        guard_rows.append(
            {
                "id": guard_case["id"],
                "expected_valid": guard_case["expected_valid"],
                "predicted_valid": allowed,
                "correct": matched,
                "rejection_reason": reason,
                "intent": guard_case["intent"],
            }
        )

    state_rows = []
    state_hits = 0
    for state_case in state_cases:
        state = _build_game_state(state_case.get("setup"))
        issues = state.check_consistency(state_case["proposed_changes"])
        flagged = bool(issues)
        matched = flagged == state_case["expected_contradiction"]
        required_fragments = state_case.get("expected_contains", [])
        # Only a case that already matched can still fail on a missing
        # fragment; a mismatch stays a mismatch.
        if matched and required_fragments:
            matched = all(
                any(fragment in issue for issue in issues)
                for fragment in required_fragments
            )
        state_hits += int(matched)
        state_rows.append(
            {
                "id": state_case["id"],
                "expected_contradiction": state_case["expected_contradiction"],
                "predicted_contradiction": flagged,
                "correct": matched,
                "contradictions": issues,
                "proposed_changes": state_case["proposed_changes"],
            }
        )

    case_count = len(guard_cases) + len(state_cases)
    hit_count = guard_hits + state_hits

    return {
        "task": "consistency",
        "guard_accuracy": round(guard_hits / len(guard_cases), 4) if guard_cases else 0.0,
        "state_check_accuracy": round(state_hits / len(state_cases), 4) if state_cases else 0.0,
        "overall_accuracy": round(hit_count / case_count, 4) if case_count else 0.0,
        "action_guard_details": guard_rows,
        "state_check_details": state_rows,
    }
|
| 346 |
+
|
| 347 |
+
|
| 348 |
+
def evaluate_latency(repeats: int) -> dict[str, Any]:
    """Measure end-to-end turn latency over the ``latency`` dataset.

    Every scenario is run *repeats* times, each time on a fresh game state.

    Args:
        repeats: Number of runs per scenario. With a value below 1 no runs
            are made and all metrics are reported as ``0.0``.

    Returns:
        A report with mean NLU/story/total latency, the p95 total latency
        and the fallback rate across all runs, plus a per-scenario summary
        that includes the individual runs.
    """
    dataset = _load_dataset("latency")
    scenario_summaries = []
    all_nlu = []
    all_story = []
    all_total = []
    fallback_total = 0
    total_runs = 0

    for scenario in dataset:
        runs = []
        for _ in range(repeats):
            run_result = _run_text_turn(scenario["input"], scenario.get("setup"))
            final_result = run_result["final_result"]
            telemetry = final_result.get("telemetry", {})
            used_fallback = bool(telemetry.get("used_fallback", False))

            total_runs += 1
            fallback_total += int(used_fallback)
            all_nlu.append(run_result["nlu_latency_ms"])
            all_story.append(run_result["story_latency_ms"])
            all_total.append(run_result["total_latency_ms"])

            runs.append(
                {
                    "nlu_latency_ms": round(run_result["nlu_latency_ms"], 2),
                    "story_latency_ms": round(run_result["story_latency_ms"], 2),
                    "total_latency_ms": round(run_result["total_latency_ms"], 2),
                    "used_fallback": used_fallback,
                    "fallback_reason": telemetry.get("fallback_reason"),
                    "engine_mode": telemetry.get("engine_mode"),
                }
            )

        total_values = [item["total_latency_ms"] for item in runs]
        fallback_runs = sum(1 for item in runs if item["used_fallback"])
        scenario_summaries.append(
            {
                "id": scenario["id"],
                "input": scenario["input"],
                "repeats": repeats,
                # Guarded like the aggregate metrics below: with no runs,
                # statistics.mean() raises StatisticsError and the rate
                # would divide by zero.
                "avg_total_latency_ms": round(statistics.mean(total_values), 2) if total_values else 0.0,
                "p95_total_latency_ms": round(_percentile(total_values, 95), 2),
                "fallback_rate": round(fallback_runs / len(runs), 4) if runs else 0.0,
                "runs": runs,
            }
        )

    return {
        "task": "latency",
        "scenario_count": len(dataset),
        "repeats": repeats,
        "avg_nlu_latency_ms": round(statistics.mean(all_nlu), 2) if all_nlu else 0.0,
        "avg_story_latency_ms": round(statistics.mean(all_story), 2) if all_story else 0.0,
        "avg_total_latency_ms": round(statistics.mean(all_total), 2) if all_total else 0.0,
        "p95_total_latency_ms": round(_percentile(all_total, 95), 2) if all_total else 0.0,
        "fallback_rate": round(fallback_total / total_runs, 4) if total_runs else 0.0,
        "scenarios": scenario_summaries,
    }
|
| 409 |
+
|
| 410 |
+
|
| 411 |
+
def evaluate_branch_divergence(divergence_threshold: float = 0.2) -> dict[str, Any]:
    """Measure how differently alternative inputs play out from one setup.

    For every group in the ``branch_divergence`` dataset each branch input is
    run on a fresh state built from the group's setup. Every pair of
    branches is then scored as the mean of three distances: story text
    (``1 - SequenceMatcher.ratio()``), flattened state snapshot (Jaccard)
    and option texts (Jaccard).

    Args:
        divergence_threshold: Minimum pair score for a pair to count as
            meaningfully divergent. Defaults to ``0.2``.

    Returns:
        A report with the mean pair score, the share of meaningfully
        divergent pairs and a per-group breakdown.
    """
    dataset = _load_dataset("branch_divergence")
    group_summaries = []
    pair_scores = []

    for group in dataset:
        branch_results = []
        for branch in group["branches"]:
            run_result = _run_text_turn(branch["input"], group.get("setup"))
            final_result = run_result["final_result"]
            branch_results.append(
                {
                    "label": branch["label"],
                    "input": branch["input"],
                    "story_text": final_result.get("story_text", ""),
                    "options": final_result.get("options", []),
                    "state_snapshot": run_result["state_snapshot"],
                    "state_changes": final_result.get("state_changes", {}),
                    "telemetry": final_result.get("telemetry", {}),
                }
            )

        group_pairs = []
        for left, right in combinations(branch_results, 2):
            text_divergence = 1.0 - SequenceMatcher(
                None,
                left["story_text"],
                right["story_text"],
            ).ratio()
            state_divergence = _jaccard_distance(
                _flatten(left["state_snapshot"]),
                _flatten(right["state_snapshot"]),
            )
            option_divergence = _jaccard_distance(
                _option_texts(left["options"]),
                _option_texts(right["options"]),
            )
            pair_score = round((text_divergence + state_divergence + option_divergence) / 3, 4)
            pair_detail = {
                "left": left["label"],
                "right": right["label"],
                "text_divergence": round(text_divergence, 4),
                "state_divergence": round(state_divergence, 4),
                "option_divergence": round(option_divergence, 4),
                "pair_divergence_score": pair_score,
                "meaningfully_divergent": pair_score >= divergence_threshold,
            }
            pair_scores.append(pair_score)
            group_pairs.append(pair_detail)

        group_summaries.append(
            {
                "id": group["id"],
                "avg_pair_divergence": round(
                    statistics.mean([pair["pair_divergence_score"] for pair in group_pairs]),
                    4,
                ) if group_pairs else 0.0,
                "branches": [
                    {
                        "label": branch["label"],
                        "input": branch["input"],
                        "telemetry": _json_safe(branch["telemetry"]),
                        "state_changes": _json_safe(branch["state_changes"]),
                    }
                    for branch in branch_results
                ],
                "pair_details": group_pairs,
            }
        )

    # Same threshold as the per-pair flag above, so the rate and the flags
    # cannot drift apart.
    meaningful_pairs = sum(1 for score in pair_scores if score >= divergence_threshold)
    return {
        "task": "branch_divergence",
        "group_count": len(dataset),
        "avg_pair_divergence": round(statistics.mean(pair_scores), 4) if pair_scores else 0.0,
        "meaningfully_divergent_pair_rate": round(
            meaningful_pairs / len(pair_scores),
            4,
        ) if pair_scores else 0.0,
        "groups": group_summaries,
    }
|
| 491 |
+
|
| 492 |
+
|
| 493 |
+
# Maps each --task choice to a runner. Every runner takes the repeat count so
# they can be called uniformly; only the latency task actually uses it.
TASK_RUNNERS = dict(
    intent=lambda repeats: evaluate_intent_accuracy(),
    consistency=lambda repeats: evaluate_consistency(),
    latency=lambda repeats: evaluate_latency(repeats),
    branch=lambda repeats: evaluate_branch_divergence(),
)
|
| 499 |
+
|
| 500 |
+
|
| 501 |
+
def _build_summary(results: dict[str, Any]) -> dict[str, Any]:
    """Pick the headline metric(s) of each task present in *results*.

    Tasks missing from *results* are simply left out of the summary.
    """
    # (task key, summary key, metric key inside that task's report)
    headline_metrics = (
        ("intent", "intent_accuracy", "intent_accuracy"),
        ("consistency", "consistency_overall_accuracy", "overall_accuracy"),
        ("latency", "avg_total_latency_ms", "avg_total_latency_ms"),
        ("latency", "latency_fallback_rate", "fallback_rate"),
        ("branch", "avg_pair_divergence", "avg_pair_divergence"),
    )
    return {
        summary_key: results[task][metric]
        for task, summary_key, metric in headline_metrics
        if task in results
    }
|
| 513 |
+
|
| 514 |
+
|
| 515 |
+
def main() -> int:
    """Run the selected evaluation task(s) and write the results as JSON.

    Command-line options:
        --task: One of the ``TASK_RUNNERS`` keys, or ``all`` (default).
        --repeats: Runs per latency scenario (default 3); must be at least 1
            whenever the latency task is selected.
        --output: Optional output file; a relative path is resolved against
            the project root. Defaults to a timestamped file in
            ``RESULTS_DIR``.

    Returns:
        ``0`` on success. Invalid arguments exit through ``parser.error``.
    """
    parser = argparse.ArgumentParser(description="Run reproducible StoryWeaver evaluation tasks.")
    parser.add_argument(
        "--task",
        choices=["all", *TASK_RUNNERS.keys()],
        default="all",
        help="Evaluation task to run.",
    )
    parser.add_argument(
        "--repeats",
        type=int,
        default=3,
        help="Repeat count for latency measurements.",
    )
    parser.add_argument(
        "--output",
        type=str,
        default="",
        help="Optional path for the output JSON file.",
    )
    args = parser.parse_args()

    selected_tasks = list(TASK_RUNNERS.keys()) if args.task == "all" else [args.task]
    # Reject a useless repeat count before any (slow) task has run, instead
    # of failing inside the latency statistics afterwards.
    if "latency" in selected_tasks and args.repeats < 1:
        parser.error("--repeats must be at least 1 when the latency task is run")

    task_results = {task: TASK_RUNNERS[task](args.repeats) for task in selected_tasks}

    payload = {
        "generated_at": datetime.now().isoformat(timespec="seconds"),
        "task": args.task,
        "summary": _build_summary(task_results),
        "results": task_results,
    }

    if args.output:
        output_path = Path(args.output)
        if not output_path.is_absolute():
            output_path = PROJECT_ROOT / output_path
    else:
        timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
        suffix = args.task
        output_path = RESULTS_DIR / f"{timestamp}-{suffix}.json"

    # Creates RESULTS_DIR for the default path too, so no separate mkdir is
    # needed (and none is created when --output points elsewhere).
    output_path.parent.mkdir(parents=True, exist_ok=True)
    with output_path.open("w", encoding="utf-8") as fh:
        json.dump(payload, fh, ensure_ascii=False, indent=2)

    print(json.dumps(payload["summary"], ensure_ascii=False, indent=2))
    print(f"Saved evaluation results to: {output_path}")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
|
nlu_engine.py
CHANGED
|
@@ -117,13 +117,14 @@ class NLUEngine:
|
|
| 117 |
"raw_input": "我想用剑攻击那个哥布林"
|
| 118 |
}
|
| 119 |
"""
|
| 120 |
-
if not user_input or not user_input.strip():
|
| 121 |
-
return {
|
| 122 |
-
"intent": "EXPLORE",
|
| 123 |
-
"target": None,
|
| 124 |
-
"details": "玩家沉默不语",
|
| 125 |
-
"raw_input": "",
|
| 126 |
-
|
|
|
|
| 127 |
|
| 128 |
user_input = user_input.strip()
|
| 129 |
logger.info(f"NLU 解析输入: '{user_input}'")
|
|
@@ -168,16 +169,17 @@ class NLUEngine:
|
|
| 168 |
max_retries=2,
|
| 169 |
)
|
| 170 |
|
| 171 |
-
if result and isinstance(result, dict) and "intent" in result:
|
| 172 |
-
# 验证意图类型合法
|
| 173 |
-
valid_intents = {
|
| 174 |
-
"ATTACK", "TALK", "MOVE", "EXPLORE", "USE_ITEM",
|
| 175 |
-
"TRADE", "EQUIP", "REST", "QUEST", "SKILL",
|
| 176 |
-
"PICKUP", "FLEE", "CUSTOM",
|
| 177 |
-
}
|
| 178 |
-
if result["intent"] not in valid_intents:
|
| 179 |
-
result["intent"] = "CUSTOM"
|
| 180 |
-
|
|
|
|
| 181 |
|
| 182 |
return None
|
| 183 |
|
|
@@ -230,11 +232,12 @@ class NLUEngine:
|
|
| 230 |
# 尝试提取目标
|
| 231 |
target = self._extract_target_from_text(user_input)
|
| 232 |
|
| 233 |
-
return {
|
| 234 |
-
"intent": detected_intent,
|
| 235 |
-
"target": target,
|
| 236 |
-
"details": None,
|
| 237 |
-
|
|
|
|
| 238 |
|
| 239 |
def _extract_target_from_text(self, text: str) -> Optional[str]:
|
| 240 |
"""
|
|
|
|
| 117 |
"raw_input": "我想用剑攻击那个哥布林"
|
| 118 |
}
|
| 119 |
"""
|
| 120 |
+
if not user_input or not user_input.strip():
|
| 121 |
+
return {
|
| 122 |
+
"intent": "EXPLORE",
|
| 123 |
+
"target": None,
|
| 124 |
+
"details": "玩家沉默不语",
|
| 125 |
+
"raw_input": "",
|
| 126 |
+
"parser_source": "empty_input",
|
| 127 |
+
}
|
| 128 |
|
| 129 |
user_input = user_input.strip()
|
| 130 |
logger.info(f"NLU 解析输入: '{user_input}'")
|
|
|
|
| 169 |
max_retries=2,
|
| 170 |
)
|
| 171 |
|
| 172 |
+
if result and isinstance(result, dict) and "intent" in result:
|
| 173 |
+
# 验证意图类型合法
|
| 174 |
+
valid_intents = {
|
| 175 |
+
"ATTACK", "TALK", "MOVE", "EXPLORE", "USE_ITEM",
|
| 176 |
+
"TRADE", "EQUIP", "REST", "QUEST", "SKILL",
|
| 177 |
+
"PICKUP", "FLEE", "CUSTOM",
|
| 178 |
+
}
|
| 179 |
+
if result["intent"] not in valid_intents:
|
| 180 |
+
result["intent"] = "CUSTOM"
|
| 181 |
+
result.setdefault("parser_source", "llm")
|
| 182 |
+
return result
|
| 183 |
|
| 184 |
return None
|
| 185 |
|
|
|
|
| 232 |
# 尝试提取目标
|
| 233 |
target = self._extract_target_from_text(user_input)
|
| 234 |
|
| 235 |
+
return {
|
| 236 |
+
"intent": detected_intent,
|
| 237 |
+
"target": target,
|
| 238 |
+
"details": None,
|
| 239 |
+
"parser_source": "keyword_fallback",
|
| 240 |
+
}
|
| 241 |
|
| 242 |
def _extract_target_from_text(self, text: str) -> Optional[str]:
|
| 243 |
"""
|
story_engine.py
CHANGED
|
@@ -84,7 +84,7 @@ def _merge_change_logs(tick_log: list[str], action_log: list[str]) -> list[str]:
|
|
| 84 |
return remaining_tick + merged_results
|
| 85 |
|
| 86 |
|
| 87 |
-
def _normalize_markers(text: str) -> str:
|
| 88 |
"""
|
| 89 |
标准化 LLM 输出中的分隔标记,处理常见变体格式。
|
| 90 |
|
|
@@ -97,9 +97,29 @@ def _normalize_markers(text: str) -> str:
|
|
| 97 |
"""
|
| 98 |
text = re.sub(r'-{2,}\s*STORY[_ ]?TEXT\s*-{2,}', '---STORY_TEXT---', text, flags=re.IGNORECASE)
|
| 99 |
text = re.sub(r'-{2,}\s*OPTIONS[_ ]?JSON\s*-{2,}', '---OPTIONS_JSON---', text, flags=re.IGNORECASE)
|
| 100 |
-
text = re.sub(r'-{2,}\s*STATE[_ ]?JSON\s*-{2,}', '---STATE_JSON---', text, flags=re.IGNORECASE)
|
| 101 |
-
text = re.sub(r'-{2,}\s*THINKING\s*-{2,}', '---THINKING---', text, flags=re.IGNORECASE)
|
| 102 |
-
return text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 103 |
|
| 104 |
|
| 105 |
# ============================================================
|
|
@@ -458,12 +478,13 @@ class StoryEngine:
|
|
| 458 |
story_text, options = self._parse_story_response(raw_text)
|
| 459 |
|
| 460 |
# 开场没有状态变更
|
| 461 |
-
return {
|
| 462 |
-
"story_text": story_text,
|
| 463 |
-
"options": options,
|
| 464 |
-
"state_changes": {},
|
| 465 |
-
"change_log": [],
|
| 466 |
-
|
|
|
|
| 467 |
|
| 468 |
def generate_story(self, player_intent: dict) -> dict:
|
| 469 |
"""
|
|
@@ -494,7 +515,8 @@ class StoryEngine:
|
|
| 494 |
"consistency_issues": ["一致性问题"],
|
| 495 |
}
|
| 496 |
"""
|
| 497 |
-
logger.info(f"生成故事响应,玩家意图: {player_intent}")
|
|
|
|
| 498 |
|
| 499 |
# ============================================
|
| 500 |
# 推进时间(行动前,时间自然流逝)
|
|
@@ -510,10 +532,15 @@ class StoryEngine:
|
|
| 510 |
# ============================================
|
| 511 |
outline = self._generate_outline(player_intent)
|
| 512 |
|
| 513 |
-
if outline is None:
|
| 514 |
-
# 大纲生成失败 —— 降级处理
|
| 515 |
-
logger.error("大纲生成失败,使用降级叙事")
|
| 516 |
-
return self._fallback_response(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 517 |
|
| 518 |
# ============================================
|
| 519 |
# 处理时间冲突:如果大纲指定了 time_change(时间跳跃),
|
|
@@ -529,15 +556,21 @@ class StoryEngine:
|
|
| 529 |
# ============================================
|
| 530 |
consistency_issues = self.game_state.check_consistency(state_changes)
|
| 531 |
|
| 532 |
-
if consistency_issues:
|
| 533 |
-
logger.warning(f"发现一致性问题: {consistency_issues}")
|
| 534 |
-
# 尝试修复:重新生成大纲,附带一致性约束
|
| 535 |
-
outline = self._regenerate_outline_with_fixes(player_intent, consistency_issues)
|
| 536 |
-
if outline is None:
|
| 537 |
-
return self._fallback_response(
|
| 538 |
-
|
| 539 |
-
|
| 540 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 541 |
|
| 542 |
# 移除与非法物品相关的状态变更(安全网)
|
| 543 |
if consistency_issues:
|
|
@@ -588,14 +621,21 @@ class StoryEngine:
|
|
| 588 |
# 合并 tick_log 和 change_log 中的重复属性条目
|
| 589 |
merged_log = _merge_change_logs(tick_log, change_log + validation_issues)
|
| 590 |
|
| 591 |
-
return {
|
| 592 |
-
"story_text": story_text,
|
| 593 |
-
"options": options,
|
| 594 |
-
"state_changes": state_changes,
|
| 595 |
-
"change_log": merged_log,
|
| 596 |
-
"outline": outline,
|
| 597 |
-
"consistency_issues": consistency_issues,
|
| 598 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 599 |
|
| 600 |
def _generate_outline(self, player_intent: dict) -> Optional[dict]:
|
| 601 |
"""
|
|
@@ -715,14 +755,15 @@ class StoryEngine:
|
|
| 715 |
raw_text = call_qwen(messages, model=self.model, temperature=0.9, max_tokens=1500)
|
| 716 |
story_text, options = self._parse_story_response(raw_text)
|
| 717 |
|
| 718 |
-
return {
|
| 719 |
-
"story_text": story_text,
|
| 720 |
-
"options": options,
|
| 721 |
-
"state_changes": {},
|
| 722 |
-
"change_log": ["游戏结束"],
|
| 723 |
-
"outline": None,
|
| 724 |
-
"consistency_issues": [],
|
| 725 |
-
|
|
|
|
| 726 |
|
| 727 |
@staticmethod
|
| 728 |
def _clean_story_text(story_text: str) -> str:
|
|
@@ -925,7 +966,14 @@ class StoryEngine:
|
|
| 925 |
opt["id"] = i
|
| 926 |
return options[:3]
|
| 927 |
|
| 928 |
-
def _fallback_response(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 929 |
"""
|
| 930 |
降级响应:当大纲生成完全失败时,提供基本响应。
|
| 931 |
|
|
@@ -965,14 +1013,19 @@ class StoryEngine:
|
|
| 965 |
options = self._generate_default_options()
|
| 966 |
|
| 967 |
fallback_change_log = (tick_log or []) + ["(系统提示:本回合使用了降级响应)"]
|
| 968 |
-
return {
|
| 969 |
-
"story_text": story_text,
|
| 970 |
-
"options": options,
|
| 971 |
-
"state_changes": {},
|
| 972 |
-
"change_log": fallback_change_log,
|
| 973 |
-
"outline": None,
|
| 974 |
-
"consistency_issues": [],
|
| 975 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 976 |
|
| 977 |
def _sanitize_state_changes(self, changes: dict, event_type: str = "") -> tuple[dict, list[str]]:
|
| 978 |
"""
|
|
@@ -1211,13 +1264,18 @@ class StoryEngine:
|
|
| 1211 |
result["options"] = self._ensure_three_options(result.get("options", []))
|
| 1212 |
yield {"type": "final", **result}
|
| 1213 |
except Exception:
|
| 1214 |
-
yield {
|
| 1215 |
-
"type": "final",
|
| 1216 |
-
"story_text": "你踏上了一段新的旅程...",
|
| 1217 |
-
"options": self._generate_default_options(),
|
| 1218 |
-
"state_changes": {},
|
| 1219 |
-
"change_log": [],
|
| 1220 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1221 |
return
|
| 1222 |
|
| 1223 |
# ★ 如果流式阶段未检测到标记但有累积文本,先 yield 给 UI 显示
|
|
@@ -1239,11 +1297,12 @@ class StoryEngine:
|
|
| 1239 |
|
| 1240 |
yield {
|
| 1241 |
"type": "final",
|
| 1242 |
-
"story_text": story_text,
|
| 1243 |
-
"options": options,
|
| 1244 |
-
"state_changes": {},
|
| 1245 |
-
"change_log": [],
|
| 1246 |
-
|
|
|
|
| 1247 |
|
| 1248 |
def generate_story_stream(self, player_intent: dict):
|
| 1249 |
"""
|
|
@@ -1315,18 +1374,23 @@ class StoryEngine:
|
|
| 1315 |
if display_text.strip():
|
| 1316 |
yield {"type": "story_chunk", "text": display_text.strip()}
|
| 1317 |
|
| 1318 |
-
except Exception as e:
|
| 1319 |
-
logger.error(f"流式合并生成失败: {e},降级为非流式两阶段")
|
| 1320 |
-
try:
|
| 1321 |
-
result = self.generate_story(player_intent)
|
| 1322 |
# 降级结果也强制保证 3 个选项
|
| 1323 |
result["options"] = self._ensure_three_options(result.get("options", []))
|
| 1324 |
yield {"type": "final", **result}
|
| 1325 |
-
except Exception:
|
| 1326 |
-
fallback = self._fallback_response(
|
| 1327 |
-
|
| 1328 |
-
|
| 1329 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1330 |
|
| 1331 |
# ★ 如果流式阶段未检测到标记但有累积文本,先 yield 给 UI 显示
|
| 1332 |
if not story_started and full_text.strip():
|
|
@@ -1345,21 +1409,31 @@ class StoryEngine:
|
|
| 1345 |
if story_text and story_text.strip():
|
| 1346 |
logger.warning("大纲(STATE_JSON)解析失败,但故事文本已提取,跳过状态更新继续")
|
| 1347 |
options = self._ensure_three_options(options)
|
| 1348 |
-
yield {
|
| 1349 |
-
"type": "final",
|
| 1350 |
-
"story_text": story_text,
|
| 1351 |
-
"options": options,
|
| 1352 |
-
"state_changes": {},
|
| 1353 |
-
"change_log": tick_log + ["(系统提示:本回合状态解析失败,未更新状态)"],
|
| 1354 |
-
"outline": None,
|
| 1355 |
-
"consistency_issues": [],
|
| 1356 |
-
|
| 1357 |
-
|
| 1358 |
-
|
| 1359 |
-
|
| 1360 |
-
|
| 1361 |
-
|
| 1362 |
-
return
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1363 |
|
| 1364 |
# 处理时间冲突
|
| 1365 |
state_changes = outline.get("state_changes", {})
|
|
@@ -1410,15 +1484,21 @@ class StoryEngine:
|
|
| 1410 |
# 合并日志
|
| 1411 |
merged_log = _merge_change_logs(tick_log, change_log + validation_issues)
|
| 1412 |
|
| 1413 |
-
yield {
|
| 1414 |
-
"type": "final",
|
| 1415 |
-
"story_text": story_text,
|
| 1416 |
-
"options": options,
|
| 1417 |
-
"state_changes": state_changes,
|
| 1418 |
-
"change_log": merged_log,
|
| 1419 |
-
"outline": outline,
|
| 1420 |
-
"consistency_issues": consistency_issues,
|
| 1421 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1422 |
|
| 1423 |
def process_option_selection_stream(self, option: dict):
|
| 1424 |
"""
|
|
|
|
| 84 |
return remaining_tick + merged_results
|
| 85 |
|
| 86 |
|
| 87 |
+
def _normalize_markers(text: str) -> str:
|
| 88 |
"""
|
| 89 |
标准化 LLM 输出中的分隔标记,处理常见变体格式。
|
| 90 |
|
|
|
|
| 97 |
"""
|
| 98 |
text = re.sub(r'-{2,}\s*STORY[_ ]?TEXT\s*-{2,}', '---STORY_TEXT---', text, flags=re.IGNORECASE)
|
| 99 |
text = re.sub(r'-{2,}\s*OPTIONS[_ ]?JSON\s*-{2,}', '---OPTIONS_JSON---', text, flags=re.IGNORECASE)
|
| 100 |
+
text = re.sub(r'-{2,}\s*STATE[_ ]?JSON\s*-{2,}', '---STATE_JSON---', text, flags=re.IGNORECASE)
|
| 101 |
+
text = re.sub(r'-{2,}\s*THINKING\s*-{2,}', '---THINKING---', text, flags=re.IGNORECASE)
|
| 102 |
+
return text
|
| 103 |
+
|
| 104 |
+
|
| 105 |
+
def _build_telemetry(
    engine_mode: str,
    *,
    used_fallback: bool = False,
    fallback_reason: str | None = None,
    consistency_issues_count: int = 0,
    validation_issues_count: int = 0,
    outline_regenerated: bool = False,
) -> dict:
    """Assemble the lightweight run metadata used by logging and evaluation scripts.

    Args:
        engine_mode: Label of the generation path that produced the result.
        used_fallback: Whether a degraded/fallback response was returned.
        fallback_reason: Short reason code for the fallback, or ``None``.
        consistency_issues_count: Number of consistency issues reported.
        validation_issues_count: Number of validation issues reported.
        outline_regenerated: Whether the outline was generated a second time.

    Returns:
        A flat dict holding exactly the six values above, keyed by the
        parameter names and in the same order.
    """
    telemetry = dict(
        engine_mode=engine_mode,
        used_fallback=used_fallback,
        fallback_reason=fallback_reason,
        consistency_issues_count=consistency_issues_count,
        validation_issues_count=validation_issues_count,
        outline_regenerated=outline_regenerated,
    )
    return telemetry
|
| 123 |
|
| 124 |
|
| 125 |
# ============================================================
|
|
|
|
| 478 |
story_text, options = self._parse_story_response(raw_text)
|
| 479 |
|
| 480 |
# 开场没有状态变更
|
| 481 |
+
return {
|
| 482 |
+
"story_text": story_text,
|
| 483 |
+
"options": options,
|
| 484 |
+
"state_changes": {},
|
| 485 |
+
"change_log": [],
|
| 486 |
+
"telemetry": _build_telemetry(engine_mode="opening", used_fallback=False),
|
| 487 |
+
}
|
| 488 |
|
| 489 |
def generate_story(self, player_intent: dict) -> dict:
|
| 490 |
"""
|
|
|
|
| 515 |
"consistency_issues": ["一致性问题"],
|
| 516 |
}
|
| 517 |
"""
|
| 518 |
+
logger.info(f"生成故事响应,玩家意图: {player_intent}")
|
| 519 |
+
outline_regenerated = False
|
| 520 |
|
| 521 |
# ============================================
|
| 522 |
# 推进时间(行动前,时间自然流逝)
|
|
|
|
| 532 |
# ============================================
|
| 533 |
outline = self._generate_outline(player_intent)
|
| 534 |
|
| 535 |
+
if outline is None:
|
| 536 |
+
# 大纲生成失败 —— 降级处理
|
| 537 |
+
logger.error("大纲生成失败,使用降级叙事")
|
| 538 |
+
return self._fallback_response(
|
| 539 |
+
player_intent,
|
| 540 |
+
tick_log,
|
| 541 |
+
fallback_reason="outline_generation_failed",
|
| 542 |
+
engine_mode="two_stage",
|
| 543 |
+
)
|
| 544 |
|
| 545 |
# ============================================
|
| 546 |
# 处理时间冲突:如果大纲指定了 time_change(时间跳跃),
|
|
|
|
| 556 |
# ============================================
|
| 557 |
consistency_issues = self.game_state.check_consistency(state_changes)
|
| 558 |
|
| 559 |
+
if consistency_issues:
|
| 560 |
+
logger.warning(f"发现一致性问题: {consistency_issues}")
|
| 561 |
+
# 尝试修复:重新生成大纲,附带一致性约束
|
| 562 |
+
outline = self._regenerate_outline_with_fixes(player_intent, consistency_issues)
|
| 563 |
+
if outline is None:
|
| 564 |
+
return self._fallback_response(
|
| 565 |
+
player_intent,
|
| 566 |
+
tick_log,
|
| 567 |
+
fallback_reason="outline_regeneration_failed",
|
| 568 |
+
engine_mode="two_stage",
|
| 569 |
+
)
|
| 570 |
+
outline_regenerated = True
|
| 571 |
+
state_changes = outline.get("state_changes", {})
|
| 572 |
+
# 再次检查
|
| 573 |
+
consistency_issues = self.game_state.check_consistency(state_changes)
|
| 574 |
|
| 575 |
# 移除与非法物品相关的状态变更(安全网)
|
| 576 |
if consistency_issues:
|
|
|
|
| 621 |
# 合并 tick_log 和 change_log 中的重复属性条目
|
| 622 |
merged_log = _merge_change_logs(tick_log, change_log + validation_issues)
|
| 623 |
|
| 624 |
+
return {
|
| 625 |
+
"story_text": story_text,
|
| 626 |
+
"options": options,
|
| 627 |
+
"state_changes": state_changes,
|
| 628 |
+
"change_log": merged_log,
|
| 629 |
+
"outline": outline,
|
| 630 |
+
"consistency_issues": consistency_issues,
|
| 631 |
+
"telemetry": _build_telemetry(
|
| 632 |
+
engine_mode="two_stage",
|
| 633 |
+
used_fallback=False,
|
| 634 |
+
consistency_issues_count=len(consistency_issues),
|
| 635 |
+
validation_issues_count=len(validation_issues),
|
| 636 |
+
outline_regenerated=outline_regenerated,
|
| 637 |
+
),
|
| 638 |
+
}
|
| 639 |
|
| 640 |
def _generate_outline(self, player_intent: dict) -> Optional[dict]:
|
| 641 |
"""
|
|
|
|
| 755 |
raw_text = call_qwen(messages, model=self.model, temperature=0.9, max_tokens=1500)
|
| 756 |
story_text, options = self._parse_story_response(raw_text)
|
| 757 |
|
| 758 |
+
return {
|
| 759 |
+
"story_text": story_text,
|
| 760 |
+
"options": options,
|
| 761 |
+
"state_changes": {},
|
| 762 |
+
"change_log": ["游戏结束"],
|
| 763 |
+
"outline": None,
|
| 764 |
+
"consistency_issues": [],
|
| 765 |
+
"telemetry": _build_telemetry(engine_mode="death_narrative", used_fallback=False),
|
| 766 |
+
}
|
| 767 |
|
| 768 |
@staticmethod
|
| 769 |
def _clean_story_text(story_text: str) -> str:
|
|
|
|
| 966 |
opt["id"] = i
|
| 967 |
return options[:3]
|
| 968 |
|
| 969 |
+
def _fallback_response(
|
| 970 |
+
self,
|
| 971 |
+
player_intent: dict,
|
| 972 |
+
tick_log: list[str] | None = None,
|
| 973 |
+
*,
|
| 974 |
+
fallback_reason: str = "unknown",
|
| 975 |
+
engine_mode: str = "fallback",
|
| 976 |
+
) -> dict:
|
| 977 |
"""
|
| 978 |
降级响应:当大纲生成完全失败时,提供基本响应。
|
| 979 |
|
|
|
|
| 1013 |
options = self._generate_default_options()
|
| 1014 |
|
| 1015 |
fallback_change_log = (tick_log or []) + ["(系统提示:本回合使用了降级响应)"]
|
| 1016 |
+
return {
|
| 1017 |
+
"story_text": story_text,
|
| 1018 |
+
"options": options,
|
| 1019 |
+
"state_changes": {},
|
| 1020 |
+
"change_log": fallback_change_log,
|
| 1021 |
+
"outline": None,
|
| 1022 |
+
"consistency_issues": [],
|
| 1023 |
+
"telemetry": _build_telemetry(
|
| 1024 |
+
engine_mode=engine_mode,
|
| 1025 |
+
used_fallback=True,
|
| 1026 |
+
fallback_reason=fallback_reason,
|
| 1027 |
+
),
|
| 1028 |
+
}
|
| 1029 |
|
| 1030 |
def _sanitize_state_changes(self, changes: dict, event_type: str = "") -> tuple[dict, list[str]]:
|
| 1031 |
"""
|
|
|
|
| 1264 |
result["options"] = self._ensure_three_options(result.get("options", []))
|
| 1265 |
yield {"type": "final", **result}
|
| 1266 |
except Exception:
|
| 1267 |
+
yield {
|
| 1268 |
+
"type": "final",
|
| 1269 |
+
"story_text": "你踏上了一段新的旅程...",
|
| 1270 |
+
"options": self._generate_default_options(),
|
| 1271 |
+
"state_changes": {},
|
| 1272 |
+
"change_log": [],
|
| 1273 |
+
"telemetry": _build_telemetry(
|
| 1274 |
+
engine_mode="opening",
|
| 1275 |
+
used_fallback=True,
|
| 1276 |
+
fallback_reason="opening_stream_exception",
|
| 1277 |
+
),
|
| 1278 |
+
}
|
| 1279 |
return
|
| 1280 |
|
| 1281 |
# ★ 如果流式阶段未检测到标记但有累积文本,先 yield 给 UI 显示
|
|
|
|
| 1297 |
|
| 1298 |
yield {
|
| 1299 |
"type": "final",
|
| 1300 |
+
"story_text": story_text,
|
| 1301 |
+
"options": options,
|
| 1302 |
+
"state_changes": {},
|
| 1303 |
+
"change_log": [],
|
| 1304 |
+
"telemetry": _build_telemetry(engine_mode="opening", used_fallback=False),
|
| 1305 |
+
}
|
| 1306 |
|
| 1307 |
def generate_story_stream(self, player_intent: dict):
|
| 1308 |
"""
|
|
|
|
| 1374 |
if display_text.strip():
|
| 1375 |
yield {"type": "story_chunk", "text": display_text.strip()}
|
| 1376 |
|
| 1377 |
+
except Exception as e:
|
| 1378 |
+
logger.error(f"流式合并生成失败: {e},降级为非流式两阶段")
|
| 1379 |
+
try:
|
| 1380 |
+
result = self.generate_story(player_intent)
|
| 1381 |
# 降级结果也强制保证 3 个选项
|
| 1382 |
result["options"] = self._ensure_three_options(result.get("options", []))
|
| 1383 |
yield {"type": "final", **result}
|
| 1384 |
+
except Exception:
|
| 1385 |
+
fallback = self._fallback_response(
|
| 1386 |
+
player_intent,
|
| 1387 |
+
tick_log,
|
| 1388 |
+
fallback_reason="stream_exception",
|
| 1389 |
+
engine_mode="stream_merged",
|
| 1390 |
+
)
|
| 1391 |
+
fallback["options"] = self._ensure_three_options(fallback.get("options", []))
|
| 1392 |
+
yield {"type": "final", **fallback}
|
| 1393 |
+
return
|
| 1394 |
|
| 1395 |
# ★ 如果流式阶段未检测到标记但有累积文本,先 yield 给 UI 显示
|
| 1396 |
if not story_started and full_text.strip():
|
|
|
|
| 1409 |
if story_text and story_text.strip():
|
| 1410 |
logger.warning("大纲(STATE_JSON)解析失败,但故事文本已提取,跳过状态更新继续")
|
| 1411 |
options = self._ensure_three_options(options)
|
| 1412 |
+
yield {
|
| 1413 |
+
"type": "final",
|
| 1414 |
+
"story_text": story_text,
|
| 1415 |
+
"options": options,
|
| 1416 |
+
"state_changes": {},
|
| 1417 |
+
"change_log": tick_log + ["(系统提示:本回合状态解析失败,未更新状态)"],
|
| 1418 |
+
"outline": None,
|
| 1419 |
+
"consistency_issues": [],
|
| 1420 |
+
"telemetry": _build_telemetry(
|
| 1421 |
+
engine_mode="stream_merged",
|
| 1422 |
+
used_fallback=True,
|
| 1423 |
+
fallback_reason="state_parse_failed",
|
| 1424 |
+
),
|
| 1425 |
+
}
|
| 1426 |
+
return
|
| 1427 |
+
else:
|
| 1428 |
+
logger.error("合并响应解析完全失败,使用降级")
|
| 1429 |
+
fallback = self._fallback_response(
|
| 1430 |
+
player_intent,
|
| 1431 |
+
tick_log,
|
| 1432 |
+
fallback_reason="merged_response_parse_failed",
|
| 1433 |
+
engine_mode="stream_merged",
|
| 1434 |
+
)
|
| 1435 |
+
yield {"type": "final", **fallback}
|
| 1436 |
+
return
|
| 1437 |
|
| 1438 |
# 处理时间冲突
|
| 1439 |
state_changes = outline.get("state_changes", {})
|
|
|
|
| 1484 |
# 合并日志
|
| 1485 |
merged_log = _merge_change_logs(tick_log, change_log + validation_issues)
|
| 1486 |
|
| 1487 |
+
yield {
|
| 1488 |
+
"type": "final",
|
| 1489 |
+
"story_text": story_text,
|
| 1490 |
+
"options": options,
|
| 1491 |
+
"state_changes": state_changes,
|
| 1492 |
+
"change_log": merged_log,
|
| 1493 |
+
"outline": outline,
|
| 1494 |
+
"consistency_issues": consistency_issues,
|
| 1495 |
+
"telemetry": _build_telemetry(
|
| 1496 |
+
engine_mode="stream_merged",
|
| 1497 |
+
used_fallback=False,
|
| 1498 |
+
consistency_issues_count=len(consistency_issues),
|
| 1499 |
+
validation_issues_count=len(validation_issues),
|
| 1500 |
+
),
|
| 1501 |
+
}
|
| 1502 |
|
| 1503 |
def process_option_selection_stream(self, option: dict):
|
| 1504 |
"""
|
telemetry.py
ADDED
|
@@ -0,0 +1,81 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
telemetry.py - StoryWeaver 结构化交互日志工具
|
| 3 |
+
|
| 4 |
+
职责:
|
| 5 |
+
1. 为每个游戏会话分配稳定的 session_id
|
| 6 |
+
2. 以 JSONL 形式落盘每回合交互记录
|
| 7 |
+
3. 为评估脚本和案例分析提供统一的日志格式
|
| 8 |
+
"""
|
| 9 |
+
|
| 10 |
+
from __future__ import annotations
|
| 11 |
+
|
| 12 |
+
import json
|
| 13 |
+
import os
|
| 14 |
+
import uuid
|
| 15 |
+
from datetime import datetime
|
| 16 |
+
from pathlib import Path
|
| 17 |
+
from typing import Any
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
PROJECT_ROOT = Path(__file__).resolve().parent
|
| 21 |
+
DEFAULT_LOG_DIR = PROJECT_ROOT / "logs" / "interactions"
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
def _resolve_log_dir() -> Path:
    """Return the directory in which interaction logs are stored.

    A non-blank ``STORYWEAVER_LOG_DIR`` environment variable takes
    precedence (surrounding whitespace is stripped and ``~`` is expanded);
    otherwise ``DEFAULT_LOG_DIR`` is returned.
    """
    override = os.getenv("STORYWEAVER_LOG_DIR", "").strip()
    return Path(override).expanduser() if override else DEFAULT_LOG_DIR
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
def create_session_metadata(session_id: str | None = None) -> dict[str, Any]:
    """Create the metadata for a new session.

    Each session is mapped to its own JSONL file so it can be replayed and
    analysed separately.

    Args:
        session_id: Identifier to use. When omitted (or empty) an id of the
            form ``sw-<YYYYmmdd-HHMMSS>-<8 hex chars>`` is generated.

    Returns:
        A dict with ``session_id``, ``turn_index`` (always 0) and
        ``interaction_log_path`` (string path of ``<session_id>.jsonl``
        inside the resolved log directory).
    """
    if session_id:
        resolved_id = session_id
    else:
        stamp = datetime.now().strftime("%Y%m%d-%H%M%S")
        resolved_id = f"sw-{stamp}-{uuid.uuid4().hex[:8]}"

    log_file = _resolve_log_dir() / f"{resolved_id}.jsonl"
    metadata: dict[str, Any] = {
        "session_id": resolved_id,
        "turn_index": 0,
        "interaction_log_path": str(log_file),
    }
    return metadata
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
def ensure_session_metadata(game_session: dict[str, Any]) -> dict[str, Any]:
    """Make sure the game session carries the metadata needed for logging.

    If ``session_id`` or ``interaction_log_path`` is missing, fresh metadata
    is created and merged in. An id that is already present is reused so the
    session keeps a stable identity, and an existing ``turn_index`` is kept
    rather than reset. ``turn_index`` defaults to 0 when absent.

    Args:
        game_session: Mutable session dict; it is updated in place.

    Returns:
        The same ``game_session`` object, for call chaining.
    """
    identity_incomplete = (
        "session_id" not in game_session
        or "interaction_log_path" not in game_session
    )
    if identity_incomplete:
        previous_turn = game_session.get("turn_index")
        # Pass the current id (if any) through so a partially populated
        # session is completed instead of being silently re-identified.
        game_session.update(
            create_session_metadata(game_session.get("session_id"))
        )
        if previous_turn is not None:
            # The fresh metadata carries turn_index=0; keep the real counter.
            game_session["turn_index"] = previous_turn

    game_session.setdefault("turn_index", 0)
    return game_session
|
| 55 |
+
|
| 56 |
+
|
| 57 |
+
def append_turn_log(game_session: dict[str, Any], record: dict[str, Any]) -> str:
    """Append one structured interaction record to the session's JSONL log.

    The written line contains ``timestamp``, ``session_id`` and
    ``turn_index`` followed by the entries of ``record``; keys in ``record``
    with the same names take precedence. The session's ``turn_index`` is
    advanced by one only after the line has been written.

    Args:
        game_session: Mutable session dict; logging metadata is added to it
            if missing and its ``turn_index`` is incremented.
        record: JSON-serialisable payload for this turn.

    Returns:
        The log file path as a string, for debugging and script reuse.

    Raises:
        TypeError: If ``record`` holds a value ``json`` cannot serialise.
            Nothing is written and ``turn_index`` is left unchanged.
        OSError: If the log directory or file cannot be created or written.
    """
    ensure_session_metadata(game_session)

    next_turn = game_session["turn_index"] + 1
    payload = {
        "timestamp": datetime.now().isoformat(timespec="seconds"),
        "session_id": game_session["session_id"],
        "turn_index": next_turn,
        **record,
    }
    # Serialise before touching the file: json.dump streams chunks to the
    # handle, so an unserialisable value would leave a truncated line behind
    # and corrupt the JSONL file for downstream readers.
    line = json.dumps(payload, ensure_ascii=False)

    log_path = Path(game_session["interaction_log_path"])
    log_path.parent.mkdir(parents=True, exist_ok=True)
    with log_path.open("a", encoding="utf-8") as fh:
        fh.write(line + "\n")

    # Commit the counter only once the record is actually on disk.
    game_session["turn_index"] = next_turn
    return str(log_path)
|
utils.py
CHANGED
|
@@ -10,11 +10,17 @@ utils.py - StoryWeaver 工具函数模块
|
|
| 10 |
import os
|
| 11 |
import re
|
| 12 |
import json
|
| 13 |
-
import time
|
| 14 |
-
import logging
|
| 15 |
-
from typing import Any, Optional
|
| 16 |
-
from dotenv import load_dotenv
|
| 17 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 18 |
|
| 19 |
# ============================================================
|
| 20 |
# 日志配置
|
|
@@ -43,17 +49,22 @@ if not QWEN_API_KEY or QWEN_API_KEY == "sk-xxxxxx":
|
|
| 43 |
|
| 44 |
# 使用 OpenAI 兼容格式连接 Qwen API
|
| 45 |
# base_url 指向通义千问的 OpenAI 兼容端点
|
| 46 |
-
_client: Optional[
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
def get_client() ->
|
| 50 |
"""
|
| 51 |
获取全局 OpenAI 客户端(懒加载单例)。
|
| 52 |
-
使用兼容格式调用 Qwen API。
|
| 53 |
-
"""
|
| 54 |
-
global _client
|
| 55 |
-
if
|
| 56 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 57 |
api_key=QWEN_API_KEY,
|
| 58 |
base_url="https://dashscope.aliyuncs.com/compatible-mode/v1",
|
| 59 |
)
|
|
|
|
| 10 |
import os
|
| 11 |
import re
|
| 12 |
import json
|
| 13 |
+
import time
|
| 14 |
+
import logging
|
| 15 |
+
from typing import Any, Optional
|
| 16 |
+
from dotenv import load_dotenv
|
| 17 |
+
|
| 18 |
+
try:
|
| 19 |
+
from openai import OpenAI
|
| 20 |
+
_OPENAI_IMPORT_ERROR: Optional[Exception] = None
|
| 21 |
+
except ImportError as exc: # pragma: no cover - depends on local env
|
| 22 |
+
OpenAI = None # type: ignore[assignment]
|
| 23 |
+
_OPENAI_IMPORT_ERROR = exc
|
| 24 |
|
| 25 |
# ============================================================
|
| 26 |
# 日志配置
|
|
|
|
| 49 |
|
| 50 |
# 使用 OpenAI 兼容格式连接 Qwen API
|
| 51 |
# base_url 指向通义千问的 OpenAI 兼容端点
|
| 52 |
+
_client: Optional[Any] = None
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
def get_client() -> Any:
|
| 56 |
"""
|
| 57 |
获取全局 OpenAI 客户端(懒加载单例)。
|
| 58 |
+
使用兼容格式调用 Qwen API。
|
| 59 |
+
"""
|
| 60 |
+
global _client
|
| 61 |
+
if OpenAI is None:
|
| 62 |
+
raise RuntimeError(
|
| 63 |
+
"未安装 openai 依赖,无法初始化 Qwen 客户端。"
|
| 64 |
+
"请先执行 `pip install -r requirements.txt`。"
|
| 65 |
+
) from _OPENAI_IMPORT_ERROR
|
| 66 |
+
if _client is None:
|
| 67 |
+
_client = OpenAI(
|
| 68 |
api_key=QWEN_API_KEY,
|
| 69 |
base_url="https://dashscope.aliyuncs.com/compatible-mode/v1",
|
| 70 |
)
|