Sibam committed on
Commit
b9664a2
·
1 Parent(s): c0872c1

feat: PreferenceLab complete - RLHF preference simulation OpenEnv environment

Browse files
Files changed (3) hide show
  1. debug_response.py +23 -0
  2. test_hf_space.py +211 -0
  3. tests/test_environment.py +2 -2
debug_response.py ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # debug_response.py
2
+ import urllib.request, json
3
+
4
+ BASE = 'https://dev-crafterx-preference-lab.hf.space'
5
+
6
def post(url, data, timeout=60):
    """POST ``data`` as a JSON body to ``url`` and return the decoded JSON reply.

    Args:
        url: Endpoint to call.
        data: JSON-serialisable payload.
        timeout: Seconds to wait for the server. Defaults to 60, the value
            that used to be hard-coded.

    Returns:
        The response body parsed with ``json.loads``.

    Nothing is caught here: errors from ``urlopen`` or from JSON decoding
    propagate to the caller.
    """
    payload = json.dumps(data).encode()
    request = urllib.request.Request(
        url,
        data=payload,
        headers={'Content-Type': 'application/json'},
    )
    with urllib.request.urlopen(request, timeout=timeout) as response:
        return json.loads(response.read())
11
+
12
# Dump the raw JSON returned by /reset, /step and /state so the response
# shapes can be inspected by eye.
print("RESET response:")
r = post(f'{BASE}/reset', {'task_type': 'pairwise', 'seed': 42})
print(json.dumps(r, indent=2))

print("\nSTEP response:")
s = post(f'{BASE}/step', {'action': {'choice': 'A'}})
print(json.dumps(s, indent=2))

print("\nSTATE response:")
# /state is read with a plain GET. urllib.request is already imported at the
# top of this script, so no second aliased import is needed here.
with urllib.request.urlopen(f'{BASE}/state', timeout=60) as resp:
    print(json.dumps(json.loads(resp.read()), indent=2))
test_hf_space.py ADDED
@@ -0,0 +1,211 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import urllib.request
2
+ import urllib.error
3
+ import json
4
+ import time
5
+
6
+ BASE = 'https://dev-crafterx-preference-lab.hf.space'
7
+
8
+ PASS = 0
9
+ FAIL = 0
10
+
11
def post(url, data=None):
    """POST ``data`` as JSON to ``url`` and return the decoded JSON response.

    Args:
        url: Endpoint to call.
        data: JSON-serialisable payload. ``None`` (the default) sends an
            empty JSON object, matching the previous ``{}`` default.

    Returns:
        The parsed response body, or ``{'error': <message>}`` when the
        request or the JSON decoding fails.
    """
    # A None sentinel replaces the former mutable ``{}`` default, which was a
    # single dict object shared by every call.
    payload = json.dumps({} if data is None else data).encode()
    request = urllib.request.Request(
        url,
        data=payload,
        headers={'Content-Type': 'application/json'},
    )
    try:
        with urllib.request.urlopen(request, timeout=60) as response:
            return json.loads(response.read())
    except Exception as exc:
        # Deliberately broad: callers treat any failure as a failed check
        # instead of letting one bad request abort the whole run.
        return {'error': str(exc)}
20
+
21
def get(url):
    """Fetch ``url`` with a GET request and return its body decoded as JSON.

    Any failure (request or decoding) is reported as
    ``{'error': <message>}`` rather than raised.
    """
    try:
        with urllib.request.urlopen(url, timeout=60) as response:
            raw_body = response.read()
        return json.loads(raw_body)
    except Exception as failure:
        return {'error': str(failure)}
27
+
28
def get_obs(r):
    """Return the observation of response ``r`` merged with its reward/done.

    reward/done are at TOP LEVEL of the response, not inside ``observation``,
    so they are copied into the returned dict under the keys ``'reward'`` and
    ``'done'`` (``None`` when absent).

    Args:
        r: Decoded response dict. When it has no ``'observation'`` key the
            response itself is used as the observation.

    Returns:
        A new dict; ``r`` and its nested observation are never modified.
    """
    observation = r.get('observation', r)
    # Copy so the caller's response stays exactly as the server sent it. A
    # null or non-dict observation is treated as empty instead of crashing.
    obs = dict(observation) if isinstance(observation, dict) else {}
    obs['reward'] = r.get('reward')
    obs['done'] = r.get('done')
    return obs
34
+
35
def check(name, condition, got=None):
    """Print a PASS/FAIL line for one check and update the global tallies.

    Args:
        name: Description of the check, echoed in the output line.
        condition: Truthy counts as a pass, falsy as a failure.
        got: Value shown after ``got:`` on failure, to help diagnosis.

    Side effects:
        Increments the module-level ``PASS`` or ``FAIL`` counter.
    """
    global PASS, FAIL
    if condition:
        # Status glyphs restored from their mis-encoded form.
        print(f' ✅ PASS — {name}')
        PASS += 1
    else:
        print(f' ❌ FAIL — {name} | got: {got}')
        FAIL += 1
43
+
44
# Banner identifying the run and the Space under test.
print()
print('=' * 60)
print(' PreferenceLab — Full Judge Simulation Test')
print(f' Target: {BASE}')
print('=' * 60)
49
+
50
+ # ── TEST 1: Health ─────────────────────────────────────────────
51
+ print('\n[1] HEALTH CHECK')
52
+ r = get(f'{BASE}/health')
53
+ check('Returns status field', 'status' in r, r)
54
+ check('Status is healthy', r.get('status') == 'healthy', r.get('status'))
55
+
56
+ # ── TEST 2: Reset β€” all 3 task types ──────────────────────────
57
+ print('\n[2] RESET β€” ALL 3 TASK TYPES')
58
+ for task in ['pairwise', 'likert', 'consistency']:
59
+ r = post(f'{BASE}/reset', {'task_type': task})
60
+ obs = get_obs(r)
61
+ check(f'reset({task}) returns observation', 'observation' in r, list(r.keys()))
62
+ check(f'reset({task}) has prompt', bool(obs.get('prompt')), obs.get('prompt','')[:30])
63
+ check(f'reset({task}) reward=0.0', obs.get('reward') == 0.0, obs.get('reward'))
64
+ check(f'reset({task}) done=false', obs.get('done') == False, obs.get('done'))
65
+ check(f'reset({task}) task_type correct', obs.get('task_type') == task, obs.get('task_type'))
66
+
67
+ # ── TEST 3: Pairwise β€” all choices ────────────────────────────
68
+ print('\n[3] PAIRWISE β€” ALL CHOICES')
69
+ rewards_pairwise = {}
70
+ for choice in ['A', 'B', 'tie', 'skip']:
71
+ post(f'{BASE}/reset', {'task_type': 'pairwise', 'seed': 42})
72
+ r = post(f'{BASE}/step', {'action': {'choice': choice}})
73
+ obs = get_obs(r)
74
+ reward = obs.get('reward')
75
+ rewards_pairwise[choice] = reward
76
+ check(f'choice={choice} reward in [0,1]',
77
+ reward is not None and 0.0 <= reward <= 1.0, reward)
78
+
79
+ check('A and B give different rewards (anti-constant)',
80
+ rewards_pairwise.get('A') != rewards_pairwise.get('B'),
81
+ f"A={rewards_pairwise.get('A')} B={rewards_pairwise.get('B')}")
82
+
83
+ check('skip gives partial credit (0.3)',
84
+ rewards_pairwise.get('skip') == 0.3,
85
+ rewards_pairwise.get('skip'))
86
+
87
+ # ── TEST 4: Likert scoring ─────────────────────────────────────
88
+ print('\n[4] LIKERT SCORING')
89
+ post(f'{BASE}/reset', {'task_type': 'likert', 'seed': 42})
90
+ r = post(f'{BASE}/step', {'action': {
91
+ 'helpfulness': 5, 'honesty': 5,
92
+ 'harmlessness': 5, 'instruction_following': 5
93
+ }})
94
+ obs = get_obs(r)
95
+ check('Likert perfect scores accepted', obs.get('reward') is not None, list(obs.keys()))
96
+ check('Likert reward in [0,1]',
97
+ obs.get('reward') is not None and 0.0 <= obs.get('reward', -1) <= 1.0,
98
+ obs.get('reward'))
99
+
100
+ post(f'{BASE}/reset', {'task_type': 'likert', 'seed': 42})
101
+ r2 = post(f'{BASE}/step', {'action': {
102
+ 'helpfulness': 1, 'honesty': 1,
103
+ 'harmlessness': 1, 'instruction_following': 1
104
+ }})
105
+ obs2 = get_obs(r2)
106
+ check('Likert worst != perfect (varies)',
107
+ obs.get('reward') != obs2.get('reward'),
108
+ f"perfect={obs.get('reward')} worst={obs2.get('reward')}")
109
+
110
+ # ── TEST 5: Consistency ranking ────────────────────────────────
111
+ print('\n[5] CONSISTENCY RANKING')
112
+ post(f'{BASE}/reset', {'task_type': 'consistency', 'seed': 42})
113
+ r = post(f'{BASE}/step', {'action': {'ranking': ['A', 'B', 'C', 'D']}})
114
+ obs = get_obs(r)
115
+ check('Consistency accepts 4-item ranking', obs.get('reward') is not None, list(obs.keys()))
116
+ check('Consistency reward in [0,1]',
117
+ obs.get('reward') is not None and 0.0 <= obs.get('reward', -1) <= 1.0,
118
+ obs.get('reward'))
119
+
120
+ post(f'{BASE}/reset', {'task_type': 'consistency', 'seed': 42})
121
+ r2 = post(f'{BASE}/step', {'action': {'ranking': ['D', 'C', 'B', 'A']}})
122
+ obs2 = get_obs(r2)
123
+ check('Consistency reversed != perfect (grader works)',
124
+ obs.get('reward') != obs2.get('reward'),
125
+ f"correct={obs.get('reward')} reversed={obs2.get('reward')}")
126
+
127
+ # ── TEST 6: Full episode β€” 5 steps ────────────────────────────
128
+ print('\n[6] FULL EPISODE β€” 5 STEPS')
129
+ post(f'{BASE}/reset', {'task_type': 'pairwise', 'seed': 99})
130
+ rewards = []
131
+ done = False
132
+ steps = 0
133
+ for i in range(10):
134
+ r = post(f'{BASE}/step', {'action': {'choice': 'A'}})
135
+ obs = get_obs(r)
136
+ rewards.append(obs.get('reward', 0))
137
+ done = obs.get('done', False)
138
+ steps += 1
139
+ if done:
140
+ break
141
+
142
+ check('Episode terminates (done=true)', done == True, done)
143
+ check('Episode runs exactly 5 steps', steps == 5, steps)
144
+ check('All rewards in [0,1]',
145
+ all(0.0 <= rv <= 1.0 for rv in rewards if rv is not None), rewards)
146
+
147
+ # ── TEST 7: Reproducibility with seed ─────────────────────────
148
+ print('\n[7] REPRODUCIBILITY β€” SAME SEED = SAME EPISODE')
149
+ r1 = post(f'{BASE}/reset', {'task_type': 'pairwise', 'seed': 777})
150
+ r2 = post(f'{BASE}/reset', {'task_type': 'pairwise', 'seed': 777})
151
+ p1 = get_obs(r1).get('prompt', 'X')
152
+ p2 = get_obs(r2).get('prompt', 'Y')
153
+ check('Same seed produces same prompt', p1 == p2, f"p1={p1[:30]} p2={p2[:30]}")
154
+
155
+ r3 = post(f'{BASE}/reset', {'task_type': 'pairwise', 'seed': 888})
156
+ p3 = get_obs(r3).get('prompt', 'Z')
157
+ check('Different seed produces different episode',
158
+ p1 != p3, f"seed777={p1[:30]} seed888={p3[:30]}")
159
+
160
+ # ── TEST 8: State endpoint ─────────────────────────────────────
161
+ print('\n[8] STATE ENDPOINT')
162
+ post(f'{BASE}/reset', {'task_type': 'likert', 'seed': 42})
163
+ post(f'{BASE}/step', {'action': {
164
+ 'helpfulness': 4, 'honesty': 4,
165
+ 'harmlessness': 4, 'instruction_following': 4
166
+ }})
167
+ r = get(f'{BASE}/state')
168
+ check('State has step_count > 0', r.get('step_count', 0) > 0, r.get('step_count'))
169
+ check('State has task_type=likert', r.get('task_type') == 'likert', r.get('task_type'))
170
+ check('State has seed=42', r.get('seed') == 42, r.get('seed'))
171
+ check('State has cumulative_reward', 'cumulative_reward' in r, list(r.keys()))
172
+
173
+ # ── TEST 9: Concurrent resets ──────────────────────────────────
174
+ print('\n[9] CONCURRENT RESETS β€” ISOLATION CHECK')
175
+ ra = post(f'{BASE}/reset', {'task_type': 'pairwise', 'seed': 1})
176
+ rb = post(f'{BASE}/reset', {'task_type': 'consistency', 'seed': 2})
177
+ ta = get_obs(ra).get('task_type')
178
+ tb = get_obs(rb).get('task_type')
179
+ check('Different task types accepted',
180
+ ta in ['pairwise','likert','consistency'] and
181
+ tb in ['pairwise','likert','consistency'], f"ta={ta} tb={tb}")
182
+
183
+ # ── TEST 10: Disqualification guard ───────────────────────────
184
+ print('\n[10] DISQUALIFICATION GUARD β€” REWARDS MUST VARY')
185
+ all_rewards = set()
186
+ for seed in [1, 2, 3, 4, 5]:
187
+ post(f'{BASE}/reset', {'task_type': 'pairwise', 'seed': seed})
188
+ for choice in ['A', 'B']:
189
+ r = post(f'{BASE}/step', {'action': {'choice': choice}})
190
+ obs = get_obs(r)
191
+ rw = obs.get('reward')
192
+ if rw is not None:
193
+ all_rewards.add(rw)
194
+ post(f'{BASE}/reset', {'task_type': 'pairwise', 'seed': seed})
195
+
196
+ check('Rewards NOT constant (DQ check)',
197
+ len(all_rewards) > 1, f"unique rewards: {all_rewards}")
198
+
199
+ # ── FINAL SUMMARY ──────────────────────────────────────────────
200
+ print()
201
+ print('=' * 60)
202
+ total = PASS + FAIL
203
+ print(f' RESULTS: {PASS}/{total} PASSED')
204
+ print()
205
+ if FAIL == 0:
206
+ print(' πŸ† ALL TESTS PASSED β€” SUBMISSION READY!')
207
+ elif FAIL <= 3:
208
+ print(' ⚠️ MINOR ISSUES β€” Check above')
209
+ else:
210
+ print(' ❌ FAILURES β€” Fix before submitting')
211
+ print('=' * 60)
tests/test_environment.py CHANGED
@@ -242,5 +242,5 @@ class TestPreferenceLabEnvironment:
242
  for task in ["pairwise", "likert", "consistency"]:
243
  obs = self.env.reset(task_type=task)
244
  assert obs is not None
245
- state = self.env.state()
246
- assert state["task_type"] == task
 
242
  for task in ["pairwise", "likert", "consistency"]:
243
  obs = self.env.reset(task_type=task)
244
  assert obs is not None
245
+ state = self.env.state
246
+ assert state.task_type == task