Spaces:

society-ethics
/

model-card-regulatory-check

Running

App Files Files Community

model-card-regulatory-check / tests /test_compliance_checks.py

NimaBoscarino

WIP: Dynamic block-based results, funky reporting

aae10fc over 1 year ago

raw

history blame

No virus

8.92 kB

	import pytest
	from unittest.mock import MagicMock

	import markdown
	from bs4 import BeautifulSoup
	from compliance_checks import (
	ComplianceSuite,
	ModelProviderIdentityCheck, ModelProviderIdentityResult,
	IntendedPurposeCheck, IntendedPurposeResult,
	GeneralLimitationsCheck, GeneralLimitationsResult,
	ComputationalRequirementsCheck, ComputationalRequirementsResult,
	)


	# Expected `requirements` text for the "computational_requirements_model_card"
	# case in TestComplianceCheck.test_run_checks. Every line of the literal ends
	# with a backslash, so no newline characters end up in the value: the pieces
	# are joined back to back into a single line of text.
	expected_infrastructure = """\
	Jean Zay Public Supercomputer, provided by the French government.\
	Hardware\
	384 A100 80GB GPUs (48 nodes)\
	Software\
	Megatron-DeepSpeed (Github link)\
	"""


	class TestComplianceCheck:
	    """Run each individual compliance check against sample model cards.

	    Every check is paired with one card that fills in the relevant section
	    and one "bad" card that leaves it as ``[More Information Needed]``.
	    """

	    @pytest.fixture
	    def provider_identity_model_card(self):
	        """Card whose "Developed by" entry names a provider."""
	        return """
	# Model Card for Sample Model

	Some random info...

	## Model Details

	### Model Description

	<!-- Provide a longer summary of what this model is. -->

	- Developed by: Nima Boscarino
	- Model type: Yada yada yada
	"""

	    @pytest.fixture
	    def bad_provider_identity_model_card(self):
	        """Card whose "Developed by" entry is still the placeholder."""
	        return """
	# Model Card for Sample Model

	Some random info...

	## Model Details

	### Model Description

	- Developed by: [More Information Needed]
	- Model type: Yada yada yada
	"""

	    @pytest.fixture
	    def intended_purpose_model_card(self):
	        """Card with direct and out-of-scope uses filled in; downstream use is a placeholder."""
	        return """
	# Model Card for Sample Model

	Some random info...

	## Uses

	<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->

	### Direct Use

	Here is some info about direct uses...

	### Downstream Use [optional]

	<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->

	[More Information Needed]

	### Out-of-Scope Use

	<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->

	Here is some info about out-of-scope uses...

	## Bias, Risks, and Limitations

	<!-- This section is meant to convey both technical and sociotechnical limitations. -->

	[More Information Needed]
	"""

	    @pytest.fixture
	    def bad_intended_purpose_model_card(self):
	        """Card whose "Uses" subsections are all placeholders."""
	        return """
	# Model Card for Sample Model

	Some random info...

	## Uses

	<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->

	### Direct Use

	<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->

	[More Information Needed]

	### Downstream Use [optional]

	<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->

	[More Information Needed]

	### Out-of-Scope Use

	<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->

	[More Information Needed]

	## Bias, Risks, and Limitations

	<!-- This section is meant to convey both technical and sociotechnical limitations. -->

	[More Information Needed]
	"""

	    @pytest.fixture
	    def general_limitations_model_card(self):
	        """Card with text under "Bias, Risks, and Limitations"."""
	        return """
	# Model Card for Sample Model

	## Some Random Header

	## Bias, Risks, and Limitations

	<!-- This section is meant to convey both technical and sociotechnical limitations. -->

	Hello world! These are some risks...

	## More Things
	"""

	    @pytest.fixture
	    def bad_general_limitations_model_card(self):
	        """Card whose "Bias, Risks, and Limitations" section is a placeholder."""
	        return """
	# Model Card for Sample Model

	## Some Random Header

	## Bias, Risks, and Limitations

	<!-- This section is meant to convey both technical and sociotechnical limitations. -->

	[More Information Needed]

	## More Things
	"""

	    @pytest.fixture
	    def computational_requirements_model_card(self):
	        """Card with a populated "Compute infrastructure" section."""
	        # Adapted from: https://huggingface.co/bigscience/bloom/blob/main/README.md
	        return """
	# Model Card for Sample Model

	## Some Random Header

	## Technical Specifications

	### Compute infrastructure
	Jean Zay Public Supercomputer, provided by the French government.

	#### Hardware

	* 384 A100 80GB GPUs (48 nodes)

	#### Software

	* Megatron-DeepSpeed ([Github link](https://github.com/bigscience-workshop/Megatron-DeepSpeed))
	</details>

	## Intended Use

	Etc..
	"""

	    @pytest.fixture
	    def bad_computational_requirements_model_card(self):
	        """Card whose "Compute infrastructure" section is a placeholder."""
	        # Adapted from: https://huggingface.co/bigscience/bloom/blob/main/README.md
	        return """
	# Model Card for Sample Model

	## Some Random Header

	## Technical Specifications

	### Compute infrastructure
	[More Information Needed]

	## Intended Use

	Etc..
	"""

	    # Each row: (check instance, name of the card fixture, expected result).
	    # A result constructed with no arguments is what the "bad" cards must yield.
	    @pytest.mark.parametrize("check,card,expected", [
	        (
	            ModelProviderIdentityCheck(),
	            "provider_identity_model_card",
	            ModelProviderIdentityResult(status=True, provider="Nima Boscarino"),
	        ),
	        (
	            ModelProviderIdentityCheck(),
	            "bad_provider_identity_model_card",
	            ModelProviderIdentityResult(),
	        ),
	        (
	            IntendedPurposeCheck(),
	            "intended_purpose_model_card",
	            IntendedPurposeResult(
	                status=True,
	                direct_use="Here is some info about direct uses...",
	                downstream_use=None,
	                out_of_scope_use="Here is some info about out-of-scope uses...",
	            ),
	        ),
	        (
	            IntendedPurposeCheck(),
	            "bad_intended_purpose_model_card",
	            IntendedPurposeResult(),
	        ),
	        (
	            GeneralLimitationsCheck(),
	            "general_limitations_model_card",
	            GeneralLimitationsResult(
	                status=True,
	                limitations="Hello world! These are some risks...",
	            ),
	        ),
	        (
	            GeneralLimitationsCheck(),
	            "bad_general_limitations_model_card",
	            GeneralLimitationsResult(),
	        ),
	        (
	            ComputationalRequirementsCheck(),
	            "computational_requirements_model_card",
	            ComputationalRequirementsResult(
	                status=True,
	                requirements=expected_infrastructure,
	            ),
	        ),
	        (
	            ComputationalRequirementsCheck(),
	            "bad_computational_requirements_model_card",
	            ComputationalRequirementsResult(),
	        ),
	    ])
	    def test_run_checks(self, check, card, expected, request):
	        """Render the named card fixture to HTML, run ``check`` on it, compare to ``expected``."""
	        # `card` arrives as a fixture name; resolve it to the markdown text.
	        card_markdown = request.getfixturevalue(card)

	        # Checks receive a BeautifulSoup tree built from the rendered HTML.
	        soup = BeautifulSoup(markdown.markdown(card_markdown), features="html.parser")

	        assert check.run_check(soup) == expected


	class TestComplianceSuite:
	    """Exercise ComplianceSuite construction and execution with a mocked check."""

	    @pytest.fixture
	    def mock_compliance_check(self):
	        """Stand-in check whose ``run_check`` always returns True."""
	        stub_check = MagicMock()
	        stub_check.run_check.return_value = True
	        return stub_check

	    @pytest.fixture
	    def empty_compliance_suite(self):
	        """Suite constructed with no checks."""
	        return ComplianceSuite(checks=[])

	    @pytest.fixture
	    def compliance_suite(self, mock_compliance_check):
	        """Suite constructed with the single mocked check."""
	        return ComplianceSuite(checks=[mock_compliance_check])

	    @pytest.fixture
	    def empty_compliance_results(self):
	        """Results expected from running the empty suite."""
	        return []

	    @pytest.fixture
	    def compliance_results(self):
	        """Results expected from running the one-check suite."""
	        return [True]

	    def test_create_empty_compliance_suite(self, empty_compliance_suite):
	        """A suite built from an empty list exposes zero checks."""
	        assert len(empty_compliance_suite.checks) == 0

	    def test_create_compliance_suite(self, compliance_suite):
	        """A suite built from one check exposes exactly that one check."""
	        assert len(compliance_suite.checks) == 1

	    # Both parameters are fixture names, resolved inside the test.
	    @pytest.mark.parametrize("suite,results", [
	        ("empty_compliance_suite", "empty_compliance_results"),
	        ("compliance_suite", "compliance_results"),
	    ])
	    def test_run_compliance_suite(self, suite, results, request):
	        """Running a suite returns the expected results and calls each check once."""
	        suite_under_test: ComplianceSuite = request.getfixturevalue(suite)
	        expected_results: list = request.getfixturevalue(results)

	        assert suite_under_test.run("") == expected_results

	        for check in suite_under_test.checks:
	            check.run_check.assert_called_once()


	class TestEndToEnd:
	    """Run the full set of real checks against complete model cards."""

	    # Each row is (card, fixture): when `fixture` is False, `card` is the
	    # markdown itself; when True, `card` is the name of a fixture to resolve
	    # (`bloom_card` is not defined in this file -- presumably in a conftest).
	    # Explicit ids keep the multi-line inline card out of the test id.
	    @pytest.mark.parametrize("card,fixture", [
	        ("""
	# Model Card for Sample Model

	Some random info...

	## Model Details

	### Model Description

	- Developed by: Nima Boscarino
	- Model type: Yada yada yada

	## Uses

	### Direct Use

	Here is some info about direct uses...

	### Downstream Use [optional]

	[More Information Needed]

	### Out-of-Scope Use

	Here is some info about out-of-scope uses...

	## Bias, Risks, and Limitations

	Hello world! These are some risks...

	## Technical Specifications

	### Compute infrastructure
	Jean Zay Public Supercomputer, provided by the French government.

	#### Hardware

	* 384 A100 80GB GPUs (48 nodes)

	#### Software

	* Megatron-DeepSpeed ([Github link](https://github.com/bigscience-workshop/Megatron-DeepSpeed))
	</details>

	## More Things
	""", False),
	        ("bloom_card", True),
	    ], ids=["inline_card", "bloom_card"])
	    def test_end_to_end_compliance_suite(self, card, fixture, request):
	        """Every check in a four-check suite must report a truthy ``status``.

	        ``card`` is raw markdown, or a fixture name when ``fixture`` is true.
	        """
	        if fixture:
	            card = request.getfixturevalue(card)

	        suite = ComplianceSuite(checks=[
	            ModelProviderIdentityCheck(),
	            IntendedPurposeCheck(),
	            GeneralLimitationsCheck(),
	            ComputationalRequirementsCheck(),
	        ])

	        results = suite.run(card)

	        # all() over an empty sequence is True, so an empty result list would
	        # pass silently; require one result per configured check first.
	        assert len(results) == len(suite.checks)

	        # Name the failing result types so a failure says which check broke.
	        failed = [type(result).__name__ for result in results if not result.status]
	        assert not failed, f"checks did not pass: {failed}"