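"""Tests for GeneralLimitationsCheck: the check should pass on model cards
that document bias, risks, and limitations, and fail on the unfilled
default template."""
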
import markdown
import pytest
from bs4 import BeautifulSoup

from compliance_checks import (
    GeneralLimitationsCheck,
    GeneralLimitationsResult,
)
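
# Fixtures: `empty_template` is the unfilled "Bias, Risks, and Limitations"
# section from the default model card template; the remaining strings are
# excerpts from real model cards, which document limitations under a
# variety of headings and heading levels.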

empty_template = """\
## Bias, Risks, and Limitations

<!-- This section is meant to convey both technical and sociotechnical limitations. -->

[More Information Needed]

### Recommendations

<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->

Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
"""
model_card_template = """\
# Model Card for Sample Model

## Bias, Risks, and Limitations

<!-- This section is meant to convey both technical and sociotechnical limitations. -->

Hello world! These are some risks...
"""
albert_base_v2 = """\
# ALBERT Base v2

## Intended uses & limitations
You can use the raw model for either masked language modeling or next sentence prediction, but it's mostly intended to
be fine-tuned on a downstream task.
"""
distilbert_base_cased_distilled_squad = """\
# DistilBERT base cased distilled SQuAD

## Risks, Limitations and Biases

**CONTENT WARNING: Readers should be aware that language generated by this model can be disturbing or offensive to some and can propagate historical and current stereotypes.**

Significant research has explored bias and fairness issues with language models.
"""
gpt2 = """\
# GPT-2

### Limitations and bias

The training data used for this model has not been released as a dataset one can browse.
"""
clip = """\
# Model Card: CLIP

## Limitations

CLIP and our analysis of it have a number of limitations. CLIP currently struggles with respect to certain tasks such as fine grained classification and counting objects.

### Bias and Fairness

We find that the performance of CLIP - and the specific biases it exhibits - can depend significantly on class design and the choices one makes for categories to include and exclude.
"""
runway = """\
# Stable Diffusion v1-5 Model Card

## Limitations and Bias

### Limitations

- The model does not achieve perfect photorealism

### Bias

While the capabilities of image generation models are impressive, they can also reinforce or exacerbate social biases.
"""
distilroberta_base = """\
# Model Card for DistilRoBERTa base

# Bias, Risks, and Limitations

Significant research has explored bias and fairness issues with language models.
"""
bloom = """\
# BLOOM

# Risks and Limitations
*This section identifies foreseeable harms and misunderstandings.*
"""

t_zero = """\
# Limitations

- The models of the T0* series are quite large (3B or 11B parameters). Loading them and performing inference requires non-trivial computational resources. When using multiple GPUs, it is possible to use [.parallelize()](https://huggingface.co/transformers/parallelism.html).
- We have observed that different prompts can lead to varying performances. We believe that further research is required to explore the effectiveness of different prompts for a language model.
- Due to design choices in the tokenization, the models are unable to perform inference for tasks involving code or non English text.
"""

# Expected result when a card's limitations section is present and filled in.
success_result = GeneralLimitationsResult(status=True)


@pytest.mark.parametrize("card", [
    model_card_template,
    albert_base_v2,
    distilbert_base_cased_distilled_squad,
    gpt2,
    clip,
    runway,
    distilroberta_base,
    bloom,
    t_zero,
])
def test_run_checks(card):
    """Each card documents limitations in some form, so the check should pass."""
    model_card_html = markdown.markdown(card)
    card_soup = BeautifulSoup(model_card_html, features="html.parser")

    results = GeneralLimitationsCheck().run_check(card_soup)

    assert results == success_result


def test_fail_on_empty_template():
    """The unfilled template still reads "[More Information Needed]", so the check should fail."""
    model_card_html = markdown.markdown(empty_template)
    card_soup = BeautifulSoup(model_card_html, features="html.parser")
    results = GeneralLimitationsCheck().run_check(card_soup)
    assert results == GeneralLimitationsResult()