ghrua commited on
Commit
18e1fdf
·
verified ·
1 Parent(s): 9c86ec6

Upload OLMo2-1B-stage2-seed42-SEXMH-L5/ruler_qa__4096::suite/metrics-all.jsonl with huggingface_hub

Browse files
OLMo2-1B-stage2-seed42-SEXMH-L5/ruler_qa__4096::suite/metrics-all.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ {"task_name": "ruler_qa__4096::suite", "task_hash": "4867b6c160c1a78b3e53ad7bc50bea94", "model_hash": "5389d94da3a4a023fb5b881b08d10db1", "model_config": {"model": "../OLMo/hf_ckpts/OLMo2-1B-stage2-seed42-SEXMH-L5/step23852-unsharded", "revision": null, "trust_remote_code": null, "max_length": 4096, "model_path": "../OLMo/hf_ckpts/OLMo2-1B-stage2-seed42-SEXMH-L5/step23852-unsharded", "model_type": "vllm"}, "task_config": {"task_name": "ruler_qa__4096::suite", "task_core": "ruler_qa_1__4096", "limit": 100, "split": "validation", "num_shots": 0, "fewshot_seed": 1234, "primary_metric": "macro", "random_subsample_seed": 42, "context_kwargs": {"truncate_custom": false, "truncation_size_when_saved": null}, "generation_kwargs": {"temperature": 0.0, "do_sample": false, "top_p": 1, "repetition_penalty": 1, "max_gen_toks": 50, "stop_sequences": []}, "metric_kwargs": {"metric_names": ["substring_exact_match", "exact_match", "f1", "rougeL_f1", "rougeL_recall", "rougeLsum_f1", "rougeLsum_recall"], "model_judge": null}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "data/ruler/qa_1/validation_4096.jsonl", "dataset_name": "qa_1__4096", "use_chat_format": false, "version": 0, "revision": null, "compute_gold_bpb": false, "external_eval": null, "custom_kwargs": {"shuffle_choices": false, "choices_first": false}, "skip_model_judges": null, "model_max_length": 4096, "metadata": {"num_tasks": 2, "description": "Aggregate metric", "alias": "ruler_qa__4096::suite"}}, "compute_config": {"batch_size": "10000", "max_batch_size": 32, "output_dir": "results_hist/251213_results_ruler/OLMo2-1B-stage2-seed42-SEXMH-L5/ruler_qa__4096::suite", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false}, "processing_time": 2.014157772064209, "current_date": "2025-12-13 02:25:37 UTC", "num_instances": 200, "beaker_info": {}, "metrics": {"f1_micro": 0.0, "f1_macro": 0.0, "exact_match_micro": 0.0, "exact_match_macro": 0.0, "rougeL_recall_micro": 0.0, "rougeL_recall_macro": 0.0, "substring_exact_match_micro": 0.0, "substring_exact_match_macro": 0.0, "primary_score_micro": 0.0, "primary_score_macro": 0.0, "rougeLsum_recall_micro": 0.0, "rougeLsum_recall_macro": 0.0, "rougeLsum_f1_micro": 0.0, "rougeLsum_f1_macro": 0.0, "rougeL_f1_micro": 0.0, "rougeL_f1_macro": 0.0, "primary_score": 0.0}, "task_idx": null}
2
+ {"task_name": "ruler_qa_1__4096", "task_hash": "bc959be9bf8d09b2a828b70e82379209", "model_hash": "5389d94da3a4a023fb5b881b08d10db1", "model_config": {"model": "../OLMo/hf_ckpts/OLMo2-1B-stage2-seed42-SEXMH-L5/step23852-unsharded", "revision": null, "trust_remote_code": null, "max_length": 4096, "model_path": "../OLMo/hf_ckpts/OLMo2-1B-stage2-seed42-SEXMH-L5/step23852-unsharded", "model_type": "vllm"}, "task_config": {"task_name": "ruler_qa_1__4096", "task_core": "ruler_qa_1__4096", "limit": 100, "split": "validation", "num_shots": 0, "fewshot_seed": 1234, "primary_metric": "substring_exact_match", "random_subsample_seed": 42, "context_kwargs": {"truncate_custom": false, "truncation_size_when_saved": null}, "generation_kwargs": {"temperature": 0.0, "do_sample": false, "top_p": 1, "repetition_penalty": 1, "max_gen_toks": 50, "stop_sequences": []}, "metric_kwargs": {"metric_names": ["substring_exact_match", "exact_match", "f1", "rougeL_f1", "rougeL_recall", "rougeLsum_f1", "rougeLsum_recall"], "model_judge": null}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "data/ruler/qa_1/validation_4096.jsonl", "dataset_name": "qa_1__4096", "use_chat_format": false, "version": 0, "revision": null, "compute_gold_bpb": false, "external_eval": null, "custom_kwargs": {"shuffle_choices": false, "choices_first": false}, "skip_model_judges": null, "model_max_length": 4096, "metadata": {"regimes": ["Ruler"], "alias": "ruler_qa_1__4096::std"}}, "compute_config": {"batch_size": "10000", "max_batch_size": 32, "output_dir": "results_hist/251213_results_ruler/OLMo2-1B-stage2-seed42-SEXMH-L5/ruler_qa__4096::suite", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false}, "processing_time": 1.0359244346618652, "current_date": "2025-12-13 02:25:37 UTC", "num_instances": 100, "beaker_info": {}, "metrics": {"substring_exact_match": 0.0, "exact_match": 0.0, "f1": 0.0, "rougeL_f1": 0.0, "rougeL_recall": 0.0, "rougeLsum_f1": 0.0, "rougeLsum_recall": 0.0, "primary_score": 0.0}, "task_idx": 0}
3
+ {"task_name": "ruler_qa_2__4096", "task_hash": "cdb11e1d5829543d1c1368cddbe73b13", "model_hash": "5389d94da3a4a023fb5b881b08d10db1", "model_config": {"model": "../OLMo/hf_ckpts/OLMo2-1B-stage2-seed42-SEXMH-L5/step23852-unsharded", "revision": null, "trust_remote_code": null, "max_length": 4096, "model_path": "../OLMo/hf_ckpts/OLMo2-1B-stage2-seed42-SEXMH-L5/step23852-unsharded", "model_type": "vllm"}, "task_config": {"task_name": "ruler_qa_2__4096", "task_core": "ruler_qa_2__4096", "limit": 100, "split": "validation", "num_shots": 0, "fewshot_seed": 1234, "primary_metric": "substring_exact_match", "random_subsample_seed": 42, "context_kwargs": {"truncate_custom": false, "truncation_size_when_saved": null}, "generation_kwargs": {"temperature": 0.0, "do_sample": false, "top_p": 1, "repetition_penalty": 1, "max_gen_toks": 50, "stop_sequences": []}, "metric_kwargs": {"metric_names": ["substring_exact_match", "exact_match", "f1", "rougeL_f1", "rougeL_recall", "rougeLsum_f1", "rougeLsum_recall"], "model_judge": null}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "data/ruler/qa_2/validation_4096.jsonl", "dataset_name": "qa_2__4096", "use_chat_format": false, "version": 0, "revision": null, "compute_gold_bpb": false, "external_eval": null, "custom_kwargs": {"shuffle_choices": false, "choices_first": false}, "skip_model_judges": null, "model_max_length": 4096, "metadata": {"regimes": ["Ruler"], "alias": "ruler_qa_2__4096::std"}}, "compute_config": {"batch_size": "10000", "max_batch_size": 32, "output_dir": "results_hist/251213_results_ruler/OLMo2-1B-stage2-seed42-SEXMH-L5/ruler_qa__4096::suite", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false}, "processing_time": 0.9782333374023438, "current_date": "2025-12-13 02:25:52 UTC", "num_instances": 100, "beaker_info": {}, "metrics": {"substring_exact_match": 0.0, "exact_match": 0.0, "f1": 0.0, "rougeL_f1": 0.0, "rougeL_recall": 0.0, "rougeLsum_f1": 0.0, "rougeLsum_recall": 0.0, "primary_score": 0.0}, "task_idx": 1}