# customer-support-env / validate.py
# mathi3046's picture β€” last 0 and 1 error update β€” 6c591d0
# (web file-viewer header lines converted to comments so the module parses)
"""
Validation / smoke-test script for the Customer Support Environment.
Runs through all 3 tasks with deterministic responses and verifies:
βœ“ reset() returns valid SupportObservation
βœ“ step() returns (observation, reward, done, info) with correct types
βœ“ state() returns valid SupportState
βœ“ Rewards are non-constant and in (0.0, 1.0) strict open interval
βœ“ Episodes terminate correctly
βœ“ Grader produces varying scores for different responses
Usage:
python validate.py
"""
import sys
import os
# Ensure project root is on path
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
from models import SupportAction, SupportObservation, SupportState, RewardBreakdown, safe_score
from server.environment import CustomerSupportEnvironment
from tasks import TASK_IDS
def validate_task(env: CustomerSupportEnvironment, task_id: str, responses: list[str]) -> dict:
    """Run a task with given responses and collect results.

    Drives one full episode: checks the reset()/step()/state() contracts,
    asserts every reward lies strictly inside (0, 1), and gathers the
    per-step rewards.

    Args:
        env: Environment under test; reset at the start of the run.
        task_id: Task to load via ``env.reset()``.
        responses: Scripted agent replies. All but the last are sent as
            "respond" actions; the final one is sent as "resolve".

    Returns:
        Summary dict with ``task_id``, the raw reward list, the safe-scored
        average reward (0.5 when no steps ran), and the step count.
    """
    banner = "=" * 50
    print(f"\n{banner}")
    print(f" Validating: {task_id}")
    print(banner)

    # -- reset() contract ---------------------------------------------------
    obs = env.reset(task_id=task_id)
    assert isinstance(obs, SupportObservation), f"reset() must return SupportObservation, got {type(obs)}"
    assert obs.task_id == task_id, f"task_id mismatch: {obs.task_id} != {task_id}"
    assert not obs.done, "Episode should not be done after reset"
    assert obs.current_message, "Initial customer message should not be empty"
    print(" βœ“ reset() returned valid SupportObservation")
    print(f" Customer: {obs.ticket.customer_name}")
    print(f" Subject: {obs.ticket.subject}")
    print(f" Message: {obs.current_message[:60]}...")

    # -- state() immediately after reset ------------------------------------
    state = env.state()
    assert isinstance(state, SupportState), f"state() must return SupportState, got {type(state)}"
    assert state.step_count == 0, "Step count should be 0 after reset"
    assert not state.done, "State should not be done after reset"
    print(" βœ“ state() returned valid SupportState")

    # -- step() contract, one scripted reply at a time ----------------------
    rewards: list[float] = []
    final_index = len(responses) - 1
    for step_index, reply in enumerate(responses):
        action = SupportAction(
            response_text=reply,
            # The closing reply resolves the ticket; earlier ones just respond.
            action_type="resolve" if step_index == final_index else "respond",
        )
        obs, reward, done, info = env.step(action)
        assert isinstance(obs, SupportObservation), "step() obs must be SupportObservation"
        assert isinstance(reward, float), f"step() reward must be float, got {type(reward)}"
        assert isinstance(done, bool), f"step() done must be bool, got {type(done)}"
        assert isinstance(info, dict), f"step() info must be dict, got {type(info)}"
        assert 0.0 < reward < 1.0, f"Reward {reward} out of strict (0.0, 1.0) range"
        rewards.append(reward)
        breakdown = info.get("reward_breakdown", {})
        print(
            f" βœ“ step({step_index + 1}) β†’ reward={reward:.4f} | "
            f"correctness={safe_score(breakdown.get('correctness', 0.5)):.2f} "
            f"tone={safe_score(breakdown.get('tone', 0.5)):.2f} "
            f"completeness={safe_score(breakdown.get('completeness', 0.5)):.2f} "
            f"done={done}"
        )
        if done:
            break

    # -- final state sanity --------------------------------------------------
    state = env.state()
    assert state.step_count > 0, "Step count should be > 0 after steps"
    print(f" βœ“ Final state: steps={state.step_count}, reward={state.cumulative_reward:.4f}")

    average = safe_score(sum(rewards) / len(rewards)) if rewards else 0.5
    return {
        "task_id": task_id,
        "rewards": rewards,
        "avg_reward": average,
        "steps": len(rewards),
    }
def validate_grader_variance() -> None:
    """Verify the grader doesn't return constant values.

    Grades a good, a bad, and an irrelevant response against the same task
    (resetting the environment before each) and asserts that:
    - good scores strictly above both bad and irrelevant, and
    - every reward lies strictly inside the (0, 1) open interval.

    Raises:
        AssertionError: If the grader returns constant or out-of-range scores.
    """
    print(f"\n{'='*50}")
    print(f" Validating: Grader Variance")
    print(f"{'='*50}")
    env = CustomerSupportEnvironment()

    def _grade(response_text: str) -> float:
        """Reset the fixed task and return the reward for one response."""
        env.reset(task_id="easy_faq")
        action = SupportAction(response_text=response_text, action_type="respond")
        # Only the reward is needed here; obs/done/info are deliberately ignored
        # (the original kept unused `*_info` locals β€” removed).
        _, reward, _, _ = env.step(action)
        return reward

    good_reward = _grade(
        "Hi Sarah! Thank you for reaching out about your order ORD-55821. "
        "I completely understand your concern about the shipping update. "
        "Standard shipping typically takes 5-7 business days, and since your "
        "order was placed on March 28th, it should be arriving soon. "
        "You should receive a tracking number via email. Let me look into "
        "the specific status of your order right away and I'll update you. "
        "Is there anything else I can help you with?"
    )
    bad_reward = _grade("I don't know.")
    irr_reward = _grade("The weather is nice today. Have you tried checking the stock market?")

    print(f" Good response reward: {good_reward:.4f}")
    print(f" Bad response reward: {bad_reward:.4f}")
    print(f" Irrelevant response reward: {irr_reward:.4f}")

    assert good_reward != bad_reward, "Grader returns same reward for good and bad responses!"
    assert good_reward > bad_reward, "Good response should score higher than bad response!"
    assert good_reward > irr_reward, "Good response should score higher than irrelevant response!"
    print(f" βœ“ Grader produces varying scores (NOT constant)")
    # Fixed misleading claim: bad vs irrelevant is never compared above, so only
    # report the orderings that were actually asserted.
    print(f" βœ“ Good > Bad and Good > Irrelevant ordering confirmed")

    # Verify ALL rewards are strictly in (0, 1)
    for label, r in [("good", good_reward), ("bad", bad_reward), ("irr", irr_reward)]:
        assert 0.0 < r < 1.0, f"{label} reward {r} violates strict (0, 1)!"
    print(f" βœ“ All rewards strictly in (0, 1) open interval")
def main() -> int:
    """Run every validation check; return exit code 0 (asserts raise on failure)."""
    print("=" * 50)
    print(" Customer Support Environment β€” Validation")
    print("=" * 50)
    env = CustomerSupportEnvironment()
    # Deterministic scripted replies keyed by task_id. Tasks missing from this
    # map fall back to a single generic reply (see the .get() default below).
    test_responses = {
        "easy_faq": [
            "Hi Sarah! Thank you for reaching out about your order ORD-55821. "
            "Standard shipping takes 5-7 business days. You'll receive a tracking "
            "number via email within 24 hours of shipment. Let me check on the "
            "status of your Wireless Bluetooth Headphones order right away.",
        ],
        "medium_refund": [
            "Hi Michael, I'm sorry to hear about the stitching issue with your "
            "Premium Leather Laptop Bag. That sounds like a manufacturing defect, "
            "and I completely understand your frustration. According to our policy, "
            "defective items qualify for a full refund or replacement at any time. "
            "Could you please send photos of the defect so we can process this quickly?",
            "Thank you for the photos, Michael. I can confirm this is a defect. "
            "You have two options: a full refund of $149.99 or a replacement bag. "
            "Either way, we'll provide a prepaid return shipping label. "
            "Which would you prefer?",
            "We'll process your full refund within 5-7 business days after we "
            "receive the returned bag. I'll email you the return label right away. "
            "I sincerely apologize for the inconvenience.",
        ],
        "hard_escalation": [
            "Mr. Martinez, I sincerely apologize for this terrible experience. "
            "What happened β€” receiving the wrong item after a late delivery, "
            "and then being treated rudely by our support staff β€” is completely "
            "unacceptable. You deserve much better. I'm escalating this to our "
            "senior support team immediately as a top priority case.",
            "I understand your frustration completely, Mr. Martinez. Here's exactly "
            "what I'm going to do: First, I'm processing a full refund of $349.99 "
            "for the wrong item. Second, I'm adding a $50 store credit as compensation "
            "for the inconvenience. Third, I'm personally ensuring the correct "
            "Smart Home Security Camera System ships via expedited delivery today. "
            "The staff member's behavior will be addressed by management.",
            "Absolutely, Mr. Martinez. Here are the specifics: Your refund will be "
            "processed within 24 hours. The replacement ships via priority express "
            "and will arrive within 2-3 business days. The $50 credit is already "
            "applied to your account. I will personally follow up with you via "
            "email tomorrow to confirm everything is on track.",
            "I completely understand, Mr. Martinez. I'll send you a confirmation "
            "email within the hour with all the details in writing: the refund, "
            "the replacement tracking, and the store credit. You have my word "
            "this will be resolved. Thank you for your patience."
        ],
    }
    # Exercise every registered task; any contract violation raises inside
    # validate_task, aborting the script with a traceback.
    all_results = []
    for task_id in TASK_IDS:
        responses = test_responses.get(task_id, ["Thank you for reaching out."])
        result = validate_task(env, task_id, responses)
        all_results.append(result)
    # Validate grader variance
    validate_grader_variance()
    # Summary
    print(f"\n{'='*50}")
    print(f" VALIDATION SUMMARY")
    print(f"{'='*50}")
    total_avg = 0.0
    for r in all_results:
        print(f" βœ“ {r['task_id']:20s} β†’ avg_reward={r['avg_reward']:.4f} steps={r['steps']}")
        total_avg += r['avg_reward']
    # NOTE(review): empty-case fallback is 0.01 here but 0.5 in validate_task's
    # avg_reward β€” looks inconsistent; confirm which neutral score is intended.
    overall = safe_score(total_avg / len(all_results)) if all_results else 0.01
    print(f"\n Overall Score: {overall:.4f}")
    print(f"\n βœ… ALL VALIDATIONS PASSED!")
    return 0
# Script entry point: propagate main()'s exit status (0 on success; a failed
# assert raises before this returns).
if __name__ == "__main__":
    sys.exit(main())