Spaces:

DaCrow13
/

Hopcroft-Skill-Classification

Running

Hopcroft-Skill-Classification / tests /behavioral /test_directional.py

DaCrow13

Deploy to HF Spaces (Clean)

225af6a 2 months ago

12.1 kB

	"""
	Directional Tests for Skill Classification Model

	These tests verify that specific changes to the input lead to PREDICTABLE changes
	in the model's predictions. For example:
	- Adding skill-specific keywords should increase confidence in related skills
	- Removing domain-specific terms should decrease confidence in those domains
	- Adding context about a technology should add related skill predictions

	Based on Ribeiro et al. (2020) "Beyond Accuracy: Behavioral Testing of NLP models"
	"""
	import pytest
	import numpy as np


	@pytest.mark.directional
	class TestDirectional:
	    """Directional behavioral tests for the skill classification model.

	    Each test perturbs a base sentence (adds or removes a domain keyword) and
	    asserts that the number of predicted labels does not collapse relative to
	    the comparison text. Predictions are printed so they can be inspected
	    with ``pytest -s``.

	    ``predict_with_labels`` and ``predict_text`` are presumably pytest
	    fixtures provided outside this file (e.g. a ``conftest.py`` -- confirm).
	    Both are called with a single string. The result of
	    ``predict_with_labels`` is passed to ``set()``, so it must be an iterable
	    of hashable labels; the result of ``predict_text`` only needs ``len()``.
	    """

	    @staticmethod
	    def _assert_not_collapsed(perturbed, reference, min_ratio, message):
	        """Assert that ``perturbed`` kept enough predictions.

	        Args:
	            perturbed: Sized collection of predictions being checked.
	            reference: Sized collection of predictions compared against.
	            min_ratio: Minimum allowed ``len(perturbed) / len(reference)``.
	            message: Assertion message reported on failure.

	        Raises:
	            AssertionError: If ``len(perturbed) < len(reference) * min_ratio``.
	        """
	        # An empty reference makes the right-hand side 0, so the check passes.
	        assert len(perturbed) >= len(reference) * min_ratio, message

	    def test_adding_language_keyword(self, predict_with_labels, predict_text):
	        """
	        Check that naming a programming language does not collapse predictions.

	        "Java" and "Python" are inserted into a bug-fix sentence; each variant
	        must keep at least half as many labels as the base sentence.

	        ``predict_text`` is requested but not used here; the parameter is kept
	        so the test's fixture signature is unchanged.
	        """
	        base = "Fixed bug in authentication system"
	        with_java = "Fixed bug in Java authentication system"
	        with_python = "Fixed bug in Python authentication system"

	        pred_base = set(predict_with_labels(base))
	        pred_java = set(predict_with_labels(with_java))
	        pred_python = set(predict_with_labels(with_python))

	        # Printed for manual inspection; which labels count as
	        # "language-related" depends on the dataset's label schema.
	        print(f"\nBase predictions: {pred_base}")
	        print(f"With Java: {pred_java}")
	        print(f"With Python: {pred_python}")

	        # Unchanged predictions are acceptable; only a drastic drop fails.
	        self._assert_not_collapsed(
	            pred_java, pred_base, 0.5,
	            "Adding Java should not drastically reduce predictions",
	        )
	        self._assert_not_collapsed(
	            pred_python, pred_base, 0.5,
	            "Adding Python should not drastically reduce predictions",
	        )

	    def test_adding_data_structure_keyword(self, predict_with_labels):
	        """
	        Check that naming a data structure does not collapse predictions.

	        Each variant must keep at least 80% as many labels as the base.
	        """
	        base = "Implemented search functionality"
	        with_hashmap = "Implemented search functionality using HashMap"
	        with_tree = "Implemented search functionality using binary tree"

	        pred_base = set(predict_with_labels(base))
	        pred_hashmap = set(predict_with_labels(with_hashmap))
	        pred_tree = set(predict_with_labels(with_tree))

	        print(f"\nBase: {pred_base}")
	        print(f"With HashMap: {pred_hashmap}")
	        print(f"With Tree: {pred_tree}")

	        self._assert_not_collapsed(
	            pred_hashmap, pred_base, 0.8,
	            "Adding HashMap should not drastically reduce predictions",
	        )
	        self._assert_not_collapsed(
	            pred_tree, pred_base, 0.8,
	            "Adding tree should not drastically reduce predictions",
	        )

	    def test_adding_error_handling_context(self, predict_with_labels):
	        """
	        Check that error-handling wording does not collapse predictions.

	        Only the "exception handling" variant is asserted (>= 50% of the base
	        label count). Whether any label mentions "Error" or "Exception" is
	        reported via ``print`` and is not asserted.
	        """
	        base = "Updated user login flow"
	        with_exception = "Updated user login flow with exception handling"
	        with_try_catch = "Updated user login flow with try-catch blocks"

	        pred_base = set(predict_with_labels(base))
	        pred_exception = set(predict_with_labels(with_exception))
	        pred_try_catch = set(predict_with_labels(with_try_catch))

	        print(f"\nBase: {pred_base}")
	        print(f"With exception: {pred_exception}")
	        print(f"With try-catch: {pred_try_catch}")

	        # Informational only: scan the union of both perturbed label sets.
	        has_error_handling = any(
	            "Error" in label or "Exception" in label
	            for label in pred_exception | pred_try_catch
	        )

	        self._assert_not_collapsed(
	            pred_exception, pred_base, 0.5,
	            "Adding error handling context should not drastically reduce predictions",
	        )

	        print(f"Has error handling related labels: {has_error_handling}")

	    def test_removing_specific_technology(self, predict_text):
	        """
	        Check the effect of dropping a technology name ("PostgreSQL").

	        The text that names the technology must yield at least 70% as many
	        predictions as the text without it.
	        """
	        with_tech = "Fixed database connection pooling issue in PostgreSQL"
	        without_tech = "Fixed database connection pooling issue"

	        pred_with = predict_text(with_tech)
	        pred_without = predict_text(without_tech)

	        # Here the generic sentence is the reference: the specific sentence
	        # should have roughly the same number of predictions or more.
	        self._assert_not_collapsed(
	            pred_with, pred_without, 0.7,
	            "Removing technology specifics should not drastically increase predictions",
	        )

	    def test_adding_api_context(self, predict_with_labels):
	        """
	        Check that API wording (REST, GraphQL) does not collapse predictions.

	        Each variant must keep at least half as many labels as the base.
	        """
	        base = "Fixed user authentication"
	        with_api = "Fixed user authentication REST API endpoint"
	        with_graphql = "Fixed user authentication GraphQL endpoint"

	        pred_base = set(predict_with_labels(base))
	        pred_api = set(predict_with_labels(with_api))
	        pred_graphql = set(predict_with_labels(with_graphql))

	        print(f"\nBase: {pred_base}")
	        print(f"With REST API: {pred_api}")
	        print(f"With GraphQL: {pred_graphql}")

	        self._assert_not_collapsed(
	            pred_api, pred_base, 0.5,
	            "Adding REST API should not drastically reduce predictions",
	        )
	        self._assert_not_collapsed(
	            pred_graphql, pred_base, 0.5,
	            "Adding GraphQL should not drastically reduce predictions",
	        )

	    def test_adding_testing_keywords(self, predict_with_labels):
	        """
	        Check that testing-related wording does not collapse predictions.

	        Only the "unit tests" variant is asserted (>= 50% of the base label
	        count). Whether any label mentions "Test" or "Automated" is reported
	        via ``print`` and is not asserted.
	        """
	        base = "Implemented new feature for user management"
	        with_tests = "Implemented new feature for user management with unit tests"
	        with_integration = "Implemented new feature for user management with integration tests"

	        pred_base = set(predict_with_labels(base))
	        pred_unit = set(predict_with_labels(with_tests))
	        pred_integration = set(predict_with_labels(with_integration))

	        print(f"\nBase: {pred_base}")
	        print(f"With unit tests: {pred_unit}")
	        print(f"With integration tests: {pred_integration}")

	        # Informational only: scan the union of both perturbed label sets.
	        has_testing = any(
	            "Test" in label or "Automated" in label
	            for label in pred_unit | pred_integration
	        )

	        self._assert_not_collapsed(
	            pred_unit, pred_base, 0.5,
	            "Adding testing keywords should not drastically reduce predictions",
	        )

	        print(f"Has testing related labels: {has_testing}")

	    def test_adding_performance_keywords(self, predict_with_labels):
	        """
	        Check that performance-related wording does not collapse predictions.

	        Only the "performance / memory usage" variant is asserted (>= 70% of
	        the base label count); the caching variant is printed for inspection.
	        """
	        base = "Optimized search algorithm"
	        with_perf = "Optimized search algorithm for better performance and reduced memory usage"
	        with_cache = "Optimized search algorithm with caching"

	        pred_base = set(predict_with_labels(base))
	        pred_perf = set(predict_with_labels(with_perf))
	        pred_cache = set(predict_with_labels(with_cache))

	        print(f"\nBase: {pred_base}")
	        print(f"With performance: {pred_perf}")
	        print(f"With caching: {pred_cache}")

	        self._assert_not_collapsed(
	            pred_perf, pred_base, 0.7,
	            "Adding performance context should not drastically reduce predictions",
	        )

	    def test_adding_security_context(self, predict_with_labels):
	        """
	        Check that security wording (OAuth2, encryption) keeps predictions stable.

	        Each variant must keep at least half as many labels as the base.
	        """
	        base = "Updated authentication system"
	        with_security = "Updated authentication system with OAuth2 security"
	        with_encryption = "Updated authentication system with password encryption"

	        pred_base = set(predict_with_labels(base))
	        pred_oauth = set(predict_with_labels(with_security))
	        pred_encryption = set(predict_with_labels(with_encryption))

	        print(f"\nBase: {pred_base}")
	        print(f"With OAuth: {pred_oauth}")
	        print(f"With encryption: {pred_encryption}")

	        # The base text already concerns authentication, so only a large
	        # drop in the label count is treated as a failure.
	        self._assert_not_collapsed(
	            pred_oauth, pred_base, 0.5,
	            "Adding OAuth2 should not drastically reduce predictions",
	        )
	        self._assert_not_collapsed(
	            pred_encryption, pred_base, 0.5,
	            "Adding encryption should not drastically reduce predictions",
	        )

	    def test_adding_devops_keywords(self, predict_with_labels):
	        """
	        Check that DevOps wording (Docker, CI/CD) does not collapse predictions.

	        Only the Docker variant is asserted (>= 50% of the base label count).
	        Whether any label mentions "DevOps", "Operations" or "Deployment" is
	        reported via ``print`` and is not asserted.
	        """
	        base = "Deployed new version"
	        with_docker = "Deployed new version using Docker containers"
	        with_ci = "Deployed new version through CI/CD pipeline"

	        pred_base = set(predict_with_labels(base))
	        pred_docker = set(predict_with_labels(with_docker))
	        pred_ci = set(predict_with_labels(with_ci))

	        print(f"\nBase: {pred_base}")
	        print(f"With Docker: {pred_docker}")
	        print(f"With CI/CD: {pred_ci}")

	        # Informational only: the base labels are included in the scan too.
	        has_devops = any(
	            "DevOps" in label or "Operations" in label or "Deployment" in label
	            for label in pred_docker | pred_ci | pred_base
	        )

	        self._assert_not_collapsed(
	            pred_docker, pred_base, 0.5,
	            "Adding Docker should not drastically reduce predictions",
	        )

	        print(f"Has DevOps related labels: {has_devops}")

	    def test_increasing_technical_detail(self, predict_text):
	        """
	        Check that adding technical detail does not collapse predictions.

	        Three descriptions of increasing specificity are predicted and
	        printed. Only the middle one is asserted: it must yield at least half
	        as many predictions as the vague one.
	        """
	        vague = "Fixed bug"
	        specific = "Fixed null pointer exception in user service layer"
	        very_specific = (
	            "Fixed null pointer exception in UserService.getUserById() "
	            "method when handling deleted users"
	        )

	        pred_vague = predict_text(vague)
	        pred_specific = predict_text(specific)
	        pred_very_specific = predict_text(very_specific)

	        print(f"\nVague ({len(pred_vague)} labels): {pred_vague}")
	        print(f"Specific ({len(pred_specific)} labels): {pred_specific}")
	        print(f"Very specific ({len(pred_very_specific)} labels): {pred_very_specific}")

	        # Some variance is tolerated: very specific text might lose a few
	        # general predictions, so only a drastic reduction fails.
	        self._assert_not_collapsed(
	            pred_specific, pred_vague, 0.5,
	            "Adding technical detail should not reduce predictions drastically",
	        )