every_eval_ever_space / eval.schema.json
deepmage121's picture
interim update in parser
4b56c7c
{
"$schema": "http://json-schema.org/draft-07/schema#",
"version": "0.1.0",
"type": "object",
"description": "Schema for storing and validating LLM evaluation data, including model configuration, prompts, instances, outputs, and evaluation metrics",
"required": [
"schema_version",
"evaluation_id",
"retrieved_timestamp",
"source_data",
"source_metadata",
"model_info",
"evaluation_results"
],
"additionalProperties": false,
"properties": {
"schema_version": {
"type": "string",
"description": "Version of the schema used for this evaluation data"
},
"evaluation_id": {
"type": "string",
"description": "Unique identifier for this specific evaluation run. Use eval_name/model_id/retrieved_timestamp format"
},
"retrieved_timestamp": {
"type": "string",
"description": "Timestamp for when this record was created - using Unix Epoch time format"
},
"source_data": {
"description": "Source of the dataset used for evaluation. Two options are supported: a HuggingFace dataset or a list of URLs for another data source.",
"oneOf": [
{
"type": "array",
"description": "URLs for the source of the evaluation data",
"items": {
"type": "string"
}
},
{
"type": "object",
"description": "Details about HuggingFace dataset used for evaluation",
"required": [
"dataset_name"
],
"properties": {
"dataset_name": {
"type": "string",
"description": "Name of the source dataset"
},
"hf_repo": {
"type": "string",
"description": "HuggingFace repository identifier"
},
"hf_split": {
"type": "string",
"description": "One of train, val or test."
},
"samples_number": {
"type": "integer",
"description": "Number of samples in the dataset"
},
"sample_ids": {
"type": "array",
"description": "Array of sample ids used for evaluation",
"items": {
"type": ["integer", "string"]
}
},
"additional_details": {
"type": "object",
"description": "Additional dataset info parameters",
"additionalProperties": true
}
}
}
]
},
"source_metadata": {
"type": "object",
"description": "Metadata about the source of the leaderboard data",
"required": [
"source_type",
"source_organization_name",
"evaluator_relationship"
],
"properties": {
"source_name": {
"type": "string",
"description": "Name of the source (e.g. title of the source leaderboard or name of the platform used for the evaluation)."
},
"source_type": {
"type": "string",
"enum": [
"documentation",
"evaluation_run"
],
"description": "Whether the data comes from a direct evaluation run or from documentation"
},
"source_organization_name": {
"type": "string",
"description": "Name of the organization that provides the data"
},
"source_organization_url": {
"type": "string",
"description": "URL for the organization that provides the data"
},
"source_organization_logo_url": {
"type": "string",
"description": "URL for the logo of the organization that provides the data"
},
"evaluator_relationship": {
"type": "string",
"description": "Relationship between the evaluator and the model",
"enum": [
"first_party",
"third_party",
"collaborative",
"other"
]
}
}
},
"model_info": {
"type": "object",
"description": "Complete model specification including basic information, technical configuration and inference settings",
"required": [
"name",
"id"
],
"properties": {
"name": {
"type": "string",
"description": "Model name provided by evaluation source"
},
"id": {
"type": "string",
"description": "Model name in HuggingFace format (e.g. meta-llama/Llama-3.1-8B-Instruct for models available on HuggingFace or openai/azure/gpt-4o-mini-2024-07-18 for closed API models)"
},
"developer": {
"type": "string",
"description": "Name of organization that provides the model (e.g. 'OpenAI')"
},
"inference_platform": {
"type": "string",
"description": "Name of the inference platform that provides API access to models for running the evaluations, or provides model weights for running them locally (e.g. HuggingFace, Bedrock, Together AI)"
},
"inference_engine": {
"type": "string",
"description": "Name of the inference engine that provides access to optimized models for local evaluations (e.g. vLLM, Ollama)."
},
"additional_details": {
"type": "object",
"description": "Additional model configuration parameters",
"additionalProperties": true
}
}
},
"evaluation_results": {
"type": "array",
"description": "Array of evaluation results",
"items": {
"type": "object",
"required": [
"evaluation_name",
"metric_config",
"score_details"
],
"properties": {
"evaluation_name": {
"type": "string",
"description": "Name of the evaluation"
},
"evaluation_timestamp": {
"type": "string",
"description": "Timestamp for when the evaluations were run"
},
"metric_config": {
"type": "object",
"description": "Details about the metric",
"required": [
"lower_is_better"
],
"properties": {
"evaluation_description": {
"type": "string",
"description": "Description of the evaluation"
},
"lower_is_better": {
"type": "boolean",
"description": "Whether a lower score is better"
},
"score_type": {
"type": "string",
"description": "Type of score",
"enum": [
"binary",
"continuous",
"levels"
]
},
"level_names": {
"type": "array",
"description": "Names of the score levels",
"items": {
"type": "string"
}
},
"level_metadata": {
"type": "array",
"description": "Additional description for each score level",
"items": {
"type": "string"
}
},
"has_unknown_level": {
"type": "boolean",
"description": "Indicates whether there is an unknown level; if true, a score of -1 is treated as unknown"
},
"min_score": {
"type": "number",
"description": "Minimum possible score for continuous metric"
},
"max_score": {
"type": "number",
"description": "Maximum possible score for continuous metric"
}
},
"if": {
"properties": {
"score_type": {
"const": "levels"
}
}
},
"then": {
"required": [
"level_names",
"has_unknown_level"
]
},
"else": {
"if": {
"properties": {
"score_type": {
"const": "continuous"
}
}
},
"then": {
"required": [
"min_score",
"max_score"
]
}
}
},
"score_details": {
"description": "The score for the evaluation and related details",
"required": [
"score"
],
"properties": {
"score": {
"type": "number",
"description": "The score for the evaluation"
},
"details": {
"type": "object",
"description": "Any additional details about the score",
"additionalProperties": true
}
}
},
"detailed_evaluation_results_url": {
"type": "string",
"description": "Link to detailed evaluation data"
},
"generation_config": {
"type": "object",
"generation_args": {
"type": "object",
"description": "Parameters used to generate results - properties may vary by model type",
"properties": {
"temperature": {
"type": [
"null",
"number"
],
"description": "Sampling temperature"
},
"top_p": {
"type": [
"null",
"number"
],
"description": "Nucleus sampling parameter"
},
"top_k": {
"type": [
"null",
"number"
],
"description": "Top-k sampling parameter"
},
"max_tokens": {
"type": "integer",
"minimum": 1,
"description": "Maximum number of tokens to generate"
},
"execution_command": {
"type": "string",
"description": "Command used to run the model to generate results"
},
"reasoning": {
"type": "boolean",
"description": "Whether reasoning or chain-of-thought was used to generate results"
}
},
"additionalProperties": true
},
"additional_details": {
"type": "string",
"description": "Additional details about how the results for this metric were generated."
}
}
}
}
},
"detailed_evaluation_results_per_samples": {
"description": "Detailed evaluation results for all individual samples in the evaluation. This can be provided as a source link or as a list of DetailedEvaluationResultsPerSample objects.",
"anyOf": [
{
"type": "string",
"description": "Link to detailed evaluation data for all samples"
},
{
"type": "array",
"description": "Array of per-sample evaluation results",
"items": {
"type": "object",
"required": [
"sample_id",
"input",
"ground_truth",
"response"
],
"properties": {
"sample_id": {
"type": "string",
"description": "Simple sample ID"
},
"input": {
"type": "string",
"description": "Raw input for the model"
},
"prompt": {
"type": "string",
"description": "Full prompt for the model"
},
"ground_truth": {
"description": "Target response that may include one or multiple correct answers.",
"oneOf": [
{
"type": "string"
},
{
"type": "array",
"items": {
"type": "string"
}
}
]
},
"response": {
"type": "string",
"description": "Response from the model"
},
"choices": {
"description": "Either an array of possible responses (list of strings) or an array of string pairs [choice, response].",
"oneOf": [
{
"type": "array",
"items": {
"type": "string"
}
},
{
"type": "array",
"items": {
"type": "array",
"items": [
{
"type": "string"
},
{
"type": "string"
}
],
"minItems": 2,
"maxItems": 2
}
}
]
},
"full_logprobs": {
"type": "array",
"description": "Full log probabilities generated for this sample",
"items": {
"type": "array",
"items": {
"type": "object",
"required": [
"token_id",
"logprob",
"decoded_token"
],
"properties": {
"token_id": {
"type": "number",
"description": "Id of token for which we keep its logprob"
},
"logprob": {
"type": "number",
"description": "Log probability of the token"
},
"decoded_token": {
"type": "string",
"description": "The decoded string representation of the token"
}
}
}
}
}
}
}
}
]
}
}
}