Spaces:

evaleval
/

every_eval_ever_space

Running

App Files Files Community

every_eval_ever_space / eval.schema.json

deepmage121

interim update in parser

4b56c7c 25 days ago

raw

history blame contribute delete

20.4 kB

	{
	"$schema": "http://json-schema.org/draft-07/schema#",
	"version": "0.1.0",
	"type": "object",
	"description": "Schema for storing and validating LLMs evaluation data, including model configuration, prompts, instances, Output, and evaluation metrics",
	"required": [
	"schema_version",
	"evaluation_id",
	"retrieved_timestamp",
	"source_data",
	"source_metadata",
	"model_info",
	"evaluation_results"
	],
	"additionalProperties": false,
	"properties": {
	"schema_version": {
	"type": "string",
	"description": "Version of the schema used for this evaluation data"
	},
	"evaluation_id": {
	"type": "string",
	"description": "Unique identifier for this specific evaluation run. Use eval_name/model_id/retrieved_timestamp format"
	},
	"retrieved_timestamp": {
	"type": "string",
	"description": "Timestamp for when this record was created - using Unix Epoch time format"
	},
	"source_data": {
	"description": "Source of dataset used for evaluation. There are two options supported: HuggingFace dataset or url for other data source.",
	"oneOf": [
	{
	"type": "array",
	"description": "URLs for the source of the evaluation data",
	"items": {
	"type": "string"
	}
	},
	{
	"type": "object",
	"description": "Details about HuggingFace dataset used for evaluation",
	"required": [
	"dataset_name"
	],
	"properties": {
	"dataset_name": {
	"type": "string",
	"description": "Name of the source dataset"
	},
	"hf_repo": {
	"type": "string",
	"description": "HuggingFace repository identifier"
	},
	"hf_split": {
	"type": "string",
	"description": "One of train, val or test."
	},
	"samples_number": {
	"type": "integer",
	"description": "Number of samples in the dataset"
	},
	"sample_ids": {
	"type": "array",
	"description": "Array of sample ids used for evaluation",
	"items": {
	"type": ["integer", "string"]
	}
	},
	"additional_details": {
	"type": "object",
	"description": "Additional dataset info parameters",
	"additionalProperties": true
	}
	}
	}
	]
	},
	"source_metadata": {
	"type": "object",
	"description": "Metadata about the source of the leaderboard data",
	"required": [
	"source_type",
	"source_organization_name",
	"evaluator_relationship"
	],
	"properties": {
	"source_name": {
	"type": "string",
	"description": "Name of the source (e.g. title of the source leaderboard or name of the platform used for the evaluation)."
	},
	"source_type": {
	"type": "string",
	"enum": [
	"documentation",
	"evaluation_run"
	],
	"description": "Whether the data comes from a direct evaluation run or from documentation"
	},
	"source_organization_name": {
	"type": "string",
	"description": "Name of the organization that provides the data"
	},
	"source_organization_url": {
	"type": "string",
	"description": "URL for the organization that provides the data"
	},
	"source_organization_logo_url": {
	"type": "string",
	"description": "URL for the Logo for the organization that provides the data"
	},
	"evaluator_relationship": {
	"type": "string",
	"description": "Relationship between the evaluator and the model",
	"enum": [
	"first_party",
	"third_party",
	"collaborative",
	"other"
	]
	}
	}
	},
	"model_info": {
	"type": "object",
	"description": "Complete model specification including basic information, technical configuration and inference settings",
	"required": [
	"name",
	"id"
	],
	"properties": {
	"name": {
	"type": "string",
	"description": "Model name provided by evaluation source"
	},
	"id": {
	"type": "string",
	"description": "Model name in HuggingFace format (e.g. meta-llama/Llama-3.1-8B-Instruct for models available on HuggingFace or openai/azure/gpt-4o-mini-2024-07-18 for closed API models)"
	},
	"developer": {
	"type": "string",
	"description": "Name of organization that provides the model (e.g. 'OpenAI')"
	},
	"inference_platform": {
	"type": "string",
	"description": "Name of inference platform which provides an access to models by API to run the evaluations or provides models weights to run them locally (e.g. HuggingFace, Bedrock, Together AI)"
	},
	"inference_engine": {
	"type": "string",
	"description": "Name of inference engine which provides an access to optimized models to use them for local evaluations (e.g. vLLM, Ollama)."
	},
	"additional_details": {
	"type": "object",
	"description": "Additional model configuration parameters",
	"additionalProperties": true
	}
	}
	},
	"evaluation_results": {
	"type": "array",
	"description": "Array of evaluation results",
	"items": {
	"type": "object",
	"required": [
	"evaluation_name",
	"metric_config",
	"score_details"
	],
	"properties": {
	"evaluation_name": {
	"type": "string",
	"description": "Name of the evaluation"
	},
	"evaluation_timestamp": {
	"type": "string",
	"description": "Timestamp for when the evaluations were run"
	},
	"metric_config": {
	"type": "object",
	"description": "Details about the metric",
	"required": [
	"lower_is_better"
	],
	"properties": {
	"evaluation_description": {
	"type": "string",
	"description": "Description of the evaluation"
	},
	"lower_is_better": {
	"type": "boolean",
	"description": "Whether a lower score is better"
	},
	"score_type": {
	"type": "string",
	"description": "Type of score",
	"enum": [
	"binary",
	"continuous",
	"levels"
	]
	},
	"level_names": {
	"type": "array",
	"description": "Names of the score levels",
	"items": {
	"type": "string"
	}
	},
	"level_metadata": {
	"type": "array",
	"description": "Additional Description for each Score Level",
	"items": {
	"type": "string"
	}
	},
	"has_unknown_level": {
	"type": "boolean",
	"description": "Indicates whether there is an Unknown Level - if True, then a score of -1 will be treated as Unknown"
	},
	"min_score": {
	"type": "number",
	"description": "Minimum possible score for continuous metric"
	},
	"max_score": {
	"type": "number",
	"description": "Maximum possible score for continuous metric"
	}
	},
	"if": {
	"properties": {
	"score_type": {
	"const": "levels"
	}
	}
	},
	"then": {
	"required": [
	"level_names",
	"has_unknown_level"
	]
	},
	"else": {
	"if": {
	"properties": {
	"score_type": {
	"const": "continuous"
	}
	}
	},
	"then": {
	"required": [
	"min_score",
	"max_score"
	]
	}
	}
	},
	"score_details": {
	"description": "The score for the evaluation and related details",
	"required": [
	"score"
	],
	"properties": {
	"score": {
	"type": "number",
	"description": "The score for the evaluation"
	},
	"details": {
	"type": "object",
	"description": "Any additional details about the score",
	"additionalProperties": true
	}
	}
	},
	"detailed_evaluation_results_url": {
	"type": "string",
	"description": "Link to detailed evaluation data"
	},
	"generation_config": {
	"type": "object",
	"generation_args": {
	"type": "object",
	"description": "Parameters used to generate results - properties may vary by model type",
	"properties": {
	"temperature": {
	"type": [
	"null",
	"number"
	],
	"description": "Sampling temperature"
	},
	"top_p": {
	"type": [
	"null",
	"number"
	],
	"description": "Nucleus sampling parameter"
	},
	"top_k": {
	"type": [
	"null",
	"number"
	],
	"description": "Top-k sampling parameter"
	},
	"max_tokens": {
	"type": "integer",
	"minimum": 1,
	"description": "Maximum number of tokens to generate"
	},
	"execution_command": {
	"type": "string",
	"description": "Command used to run the model to generate results"
	},
	"reasoning": {
	"type": "boolean",
	"description": "Whether reasoning orchain-of-thought was used to generate results"
	}
	},
	"additionalProperties": true
	},
	"additional_details": {
	"type": "string",
	"description": "Additional details about how the results for this metric were generated."
	}
	}
	}
	}
	},
	"detailed_evaluation_results_per_samples": {
	"description": "Detailed eval results for all individual samples in the evaluation. This can be provided as source link or list of DetailedEvaluationResultsPerSample objects.",
	"anyOf": [
	{
	"type": "string",
	"description": "Link to detailed evaluation data for all samples"
	},
	{
	"type": "array",
	"description": "Array of evaluation results",
	"items": {
	"type": "object",
	"required": [
	"sample_id",
	"input",
	"ground_truth",
	"response"
	],
	"properties": {
	"sample_id": {
	"type": "string",
	"description": "Simple sample ID"
	},
	"input": {
	"type": "string",
	"description": "Raw input for the model"
	},
	"prompt": {
	"type": "string",
	"description": "Full prompt for the model"
	},
	"ground_truth": {
	"description": "Target response that may include one or multiple correct answers.",
	"oneOf": [
	{
	"type": "string"
	},
	{
	"type": "array",
	"items": {
	"type": "string"
	}
	}
	]
	},
	"response": {
	"type": "string",
	"description": "Response from the model"
	},
	"choices": {
	"description": "Either an array of possible responses (list of strings) or an array of string pairs [choice, response].",
	"oneOf": [
	{
	"type": "array",
	"items": {
	"type": "string"
	}
	},
	{
	"type": "array",
	"items": {
	"type": "array",
	"items": [
	{
	"type": "string"
	},
	{
	"type": "string"
	}
	],
	"minItems": 2,
	"maxItems": 2
	}
	}
	]
	},
	"full_logprobs": {
	"type": "array",
	"description": "Full log probabilities generated for this sample",
	"items": {
	"type": "array",
	"items": {
	"type": "object",
	"required": [
	"token_id",
	"logprob",
	"decoded_token"
	],
	"properties": {
	"token_id": {
	"type": "number",
	"description": "Id of token for which we keep its logprob"
	},
	"logprob": {
	"type": "number",
	"description": "Log probability of the token"
	},
	"decoded_token": {
	"type": "string",
	"description": "The decoded string representation of the token"
	}
	}
	}
	}
	}
	}
	}
	}
	]
	}
	}
	}