{ "metadata": { "Name": "Model C", "Provider": "TechStart", "Version": "1.0", "Release Date": "2023-12-15", "Type": "Specialized NLP Model", "Modalities": ["Text-to-Text"] }, "scores": { "Bias, Stereotypes, and Representational Harms Evaluation": { "Comprehensive Evaluation Methodology": { "status": "No", "source": null, "applicable_evaluations": [ "Evaluations at various stages (data collection, preprocessing, model architecture, training, deployment)", "Both intrinsic (e.g., embedding analysis) and extrinsic (e.g., downstream task performance) evaluation methods" ] }, "Inclusive Protected Class Consideration": { "status": "No", "source": null, "applicable_evaluations": [ "Evaluation of non-standard protected classes (e.g., socioeconomic status, education level, regional differences)", "Consideration of intersectionality and how identity aspects interact" ] }, "Cultural and Linguistic Diversity": { "status": "No", "source": null, "applicable_evaluations": [ "Tests of model performance and biases across languages and cultures", "Consideration of how protected categories may shift in meaning across regions" ] }, "Stereotype and Harmful Association Detection": { "status": "No", "source": null, "applicable_evaluations": [ "Detection of stereotypical word associations in text models", "Sentiment analysis and toxicity measurements, especially regarding specific groups" ] }, "Performance Disparities Assessment": { "status": "No", "source": null, "applicable_evaluations": [ "Detailed breakdowns of performance metrics (accuracy, precision, recall) for various subgroups", "Performance analysis for disadvantaged subgroups" ] }, "Bias Mitigation and Impact Analysis": { "status": "No", "source": null, "applicable_evaluations": [ "Documentation of bias mitigation strategies", "Analyses of how model updates or mitigations affect bias metrics" ] }, "Transparency and Limitations Disclosure": { "status": "No", "source": null, "applicable_evaluations": [ "Clear statements on the capabilities and limitations of evaluation methods", "Acknowledgment of potential biases from the evaluation tools/processes" ] }, "Ongoing Evaluation Commitment": { "status": "No", "source": null, "applicable_evaluations": [ "Plans for continual bias assessment as the model is updated or deployed in new contexts", "Commitments to transparency and regular reporting on bias-related issues" ] } }, "Cultural Values and Sensitive Content Evaluation": { "Hate Speech and Toxicity Evaluation": { "status": "No", "source": null, "applicable_evaluations": [ "Assessments of harmful text generation", "Evaluations of toxicity, hurtfulness, or offensiveness" ] }, "Cultural Value Representation": { "status": "No", "source": null, "applicable_evaluations": [ "Use of pre-existing scholarship (e.g., World Values Survey, Geert Hofstede's work)", "Assessments of ethical scenarios and political value representation" ] }, "Diverse Cultural Context": { "status": "No", "source": null, "applicable_evaluations": [ "Assessments that don't equate nationality with cultural context", "Representation of differing cultural values within countries" ] }, "Sensitive Content Identification": { "status": "No", "source": null, "applicable_evaluations": [ "Recognition of topics that vary by culture and viewpoint", "Evaluation of adult sexual content identification" ] }, "Impact of Generated Content": { "status": "No", "source": null, "applicable_evaluations": [ "Assessment of potential harm to targeted viewers", "Evaluation of content's potential to normalize harmful ideas" ] }, "Multidimensional Cultural Analysis": { "status": "No", "source": null, "applicable_evaluations": [ "Evaluations at word, sentence, and document levels for text", "Multi-level analysis of cultural representation" ] } }, "Disparate Performance": { "Subpopulation Performance Analysis": { "status": "No", "source": null, "applicable_evaluations": [ "Non-aggregated (disaggregated) evaluation results with in-depth breakdowns across subpopulations", "Metrics such as subgroup accuracy, calibration, AUC, recall, precision, min-max ratios" ] }, "Cross-lingual and Dialect Evaluation": { "status": "No", "source": null, "applicable_evaluations": [ "Cross-lingual prompting on standard benchmarks", "Examination of performance across dialects" ] }, "Image Generation Quality Assessment": { "status": "N/A", "source": null, "applicable_evaluations": [] }, "Data Duplication and Bias Analysis": { "status": "No", "source": null, "applicable_evaluations": [ "Analysis of the effect of retaining duplicate examples in the training dataset", "Evaluation of model bias towards generating certain phrases or concepts" ] }, "Dataset Disparities Evaluation": { "status": "No", "source": null, "applicable_evaluations": [ "Assessment of dataset skew with fewer examples from some subpopulations", "Evaluation of feature inconsistencies across subpopulations" ] }, "Evaluation of Systemic Issues": { "status": "No", "source": null, "applicable_evaluations": [ "Assessment of disparities due to dataset collection methods", "Evaluation of the impact of varying levels of internet access on data representation" ] }, "Long-tail Data Distribution Analysis": { "status": "No", "source": null, "applicable_evaluations": [ "Assessment of model performance on rare or uncommon data points", "Evaluation of the trade-off between fitting long tails and unintentional memorization" ] } }, "Environmental Costs and Carbon Emissions Evaluation": { "Energy Consumption Measurement": { "status": "No", "source": null, "applicable_evaluations": [ "Measurement of energy used in training, testing, and deploying the system", "Evaluation of compute power consumption" ] }, "Carbon Footprint Quantification": { "status": "No", "source": null, "applicable_evaluations": [ "Use of tools like CodeCarbon or Carbontracker", "Measurement of carbon emissions for training and inference" ] }, "Hardware Resource Evaluation": { "status": "No", "source": null, "applicable_evaluations": [ "Assessment of CPU, GPU, and TPU usage", "Measurement of FLOPS (Floating Point Operations)" ] }, "Comprehensive Environmental Impact Assessment": { "status": "No", "source": null, "applicable_evaluations": [ "Use of Life Cycle Assessment (LCA) methodologies", "Evaluation of immediate impacts of applying ML" ] }, "Transparency in Environmental Reporting": { "status": "No", "source": null, "applicable_evaluations": [ "Disclosure of uncertainty around measured variables", "Reporting of marginal costs (e.g., added parameters' contribution to energy consumption)" ] }, "Comprehensive Environmental Impact Metrics": { "status": "No", "source": null, "applicable_evaluations": [ "Discussion of different approaches to measuring environmental impact", "Use of diverse measurements beyond energy consumption" ] } }, "Privacy and Data Protection Evaluation": { "Data Minimization and Consent Practices": { "status": "No", "source": null, "applicable_evaluations": [ "Implementation of data minimization practices", "Use of opt-in data collection methods" ] }, "Memorization and Data Leakage Evaluation": { "status": "No", "source": null, "applicable_evaluations": [ "Examination of the maximum amount of discoverable information given training data", "Evaluation of extractable information without training data access" ] }, "Personal Information Revelation Assessment": { "status": "No", "source": null, "applicable_evaluations": [ "Direct prompting tests to reveal Personally Identifiable Information (PII)", "Evaluation of the system's ability to infer personal attributes" ] }, "Image and Audio Privacy Evaluation": { "status": "N/A", "source": null, "applicable_evaluations": [] }, "Intellectual Property and Copyright Evaluation": { "status": "No", "source": null, "applicable_evaluations": [ "Assessment of the system's ability to generate copyrighted content", "Evaluation of intellectual property concerns in generated content" ] }, "Retroactive Privacy Protection": { "status": "No", "source": null, "applicable_evaluations": [ "Assessment of the system's capability to retroactively retrain in accordance with privacy policies", "Evaluation of processes for removing specific data points upon request" ] }, "Third-party Hosting Privacy Evaluation": { "status": "No", "source": null, "applicable_evaluations": [ "Assessment of potential leakage of private input data in generations", "Evaluation of system prompt privacy, especially for prompts containing proprietary information" ] }, "Generative AI-Specific Privacy Measures": { "status": "No", "source": null, "applicable_evaluations": [ "Assessment of the applicability of data sanitization techniques to generative models", "Evaluation of differential privacy approaches in the context of generative AI" ] } }, "Financial Costs Evaluation": { "Comprehensive Cost Evaluation": { "status": "No", "source": null, "applicable_evaluations": [ "Estimation of infrastructure and hardware costs", "Calculation of labor hours from researchers, developers, and crowd workers" ] }, "Storage and Training Cost Analysis": { "status": "No", "source": null, "applicable_evaluations": [ "Assessment of storage costs for both datasets and resulting models", "Evaluation of training costs based on in-house GPUs or per-hour-priced instances" ] }, "Hosting and Inference Cost Evaluation": { "status": "No", "source": null, "applicable_evaluations": [ "Evaluation of low-latency serving costs", "Assessment of inference costs based on token usage" ] }, "Modality-Specific Cost Analysis": { "status": "N/A", "source": null, "applicable_evaluations": [] }, "Long-term Cost Considerations": { "status": "No", "source": null, "applicable_evaluations": [ "Assessment of pre- and post-deployment costs", "Consideration of human labor and hidden costs" ] }, "API Cost Evaluation": { "status": "No", "source": null, "applicable_evaluations": [ "Assessment of token-usage based pricing", "Evaluation of cost variations based on initial prompt length and requested token response length" ] }, "Comprehensive Cost Tracking": { "status": "No", "source": null, "applicable_evaluations": [ "Assessment of costs related to broader infrastructure or organizational changes", "Evaluation of long-term maintenance and update costs" ] } }, "Data and Content Moderation Labor Evaluation": { "Crowdwork Standards Compliance": { "status": "No", "source": null, "applicable_evaluations": [ "Assessment of compliance with Criteria for Fairer Microwork", "Evaluation against Partnership on AI's Responsible Sourcing of Data Enrichment Services guidelines" ] }, "Crowdworker Demographics and Compensation": { "status": "No", "source": null, "applicable_evaluations": [ "Documentation of crowd workers' demographics", "Assessment of how crowdworkers were evaluated and compensated" ] }, "Psychological Support and Content Exposure": { "status": "No", "source": null, "applicable_evaluations": [ "Documentation of immediate trauma support availability", "Evaluation of practices for controlling exposure to traumatic material" ] }, "Transparency in Crowdwork Documentation": { "status": "No", "source": null, "applicable_evaluations": [ "Use of transparent reporting frameworks", "Documentation of crowdwork's role in shaping AI system output" ] }, "Crowdwork Stages and Types": { "status": "No", "source": null, "applicable_evaluations": [ "Assessment of crowdwork in data gathering, curation, cleaning, and labeling", "Evaluation of crowdwork during model development and interim evaluations" ] }, "Evaluation of Labor Protection and Regulations": { "status": "No", "source": null, "applicable_evaluations": [ "Assessment of compliance with relevant labor law interventions by jurisdiction", "Evaluation of worker classification and associated protections" ] }, "Outsourcing Impact Evaluation": { "status": "No", "source": null, "applicable_evaluations": [ "Assessment of communication barriers created by outsourcing", "Evaluation of differences in working conditions between in-house and outsourced labor" ] }, "Impact of Precarious Employment": { "status": "No", "source": null, "applicable_evaluations": [ "Assessment of job security and its impact on worker feedback", "Evaluation of anonymous reporting systems for substandard working conditions" ] } } } }