from ruamel.yaml import YAML
from loguru import logger

from yourbench_space import PATH
from yourbench_space.utils import to_commentable_yaml


def generate_base_config(hf_org: str, hf_dataset_name: str, session_uid: str):
    """Create the base config dictionary for a single session."""
    return {
        # Hugging Face Hub settings for the generated dataset
        "hf_configuration": {
            "token": "$HF_TOKEN",
            "hf_organization": hf_org,
            "private": True,
            "hf_dataset_name": hf_dataset_name,
            "concat_if_exist": False,
        },
        # Models available to the pipeline and their inference providers
        "model_list": [
            {
                "model_name": "Qwen/Qwen2.5-VL-72B-Instruct",
                "provider": "novita",
                "max_concurrent_requests": 32,
            },
            {
                "model_name": "Qwen/Qwen2.5-72B-Instruct",
                "provider": "novita",
                "max_concurrent_requests": 32,
            },
        ],
        # Which model handles each pipeline stage
        "model_roles": {
            "ingestion": ["Qwen/Qwen2.5-VL-72B-Instruct"],
            "summarization": ["Qwen/Qwen2.5-72B-Instruct"],
            "chunking": ["intfloat/multilingual-e5-large-instruct"],
            "single_shot_question_generation": ["Qwen/Qwen2.5-72B-Instruct"],
            "multi_hop_question_generation": ["Qwen/Qwen2.5-72B-Instruct"],
        },
        # Per-stage pipeline configuration; paths are scoped to the session
        "pipeline": {
            "ingestion": {
                "source_documents_dir": f"{PATH}/{session_uid}/uploaded_files/",
                "output_dir": f"{PATH}/{session_uid}/ingested",
                "run": True,
            },
            "upload_ingest_to_hub": {
                "source_documents_dir": f"{PATH}/{session_uid}/ingested",
                "run": True,
            },
            "summarization": {
                "run": True,
            },
            "chunking": {
                "run": True,
                "chunking_configuration": {
                    "l_min_tokens": 64,
                    "l_max_tokens": 128,
                    "tau_threshold": 0.8,
                    "h_min": 2,
                    "h_max": 5,
                    "num_multihops_factor": 2,
                },
            },
            "single_shot_question_generation": {
                "run": True,
                "additional_instructions": "Generate questions to test a curious adult",
                "chunk_sampling": {
                    "mode": "count",
                    "value": 5,
                    "random_seed": 123,
                },
            },
            "multi_hop_question_generation": {
                "run": True,
                "additional_instructions": "Generate questions to test a curious adult",
                "chunk_sampling": {
                    "mode": "percentage",
                    "value": 0.3,
                    "random_seed": 42,
                },
            },
            "lighteval": {
                "run": True,
            },
        },
    }


def save_yaml_file(config: dict, path: str):
    """Save the given config dictionary to a YAML file with helpful comments."""
    yaml = YAML()
    yaml.indent(mapping=2, sequence=4, offset=2)

    # Convert the plain dict into a CommentedMap so inline comments can be attached
    config_cm = to_commentable_yaml(config)

    # Flag the paths users are most likely to need to adjust
    ingestion = config_cm["pipeline"]["ingestion"]
    ingestion.yaml_set_comment_before_after_key("source_documents_dir", before="⚠️ Change this path to match your local directory")
    ingestion.yaml_set_comment_before_after_key("output_dir", before="⚠️ This is where ingested data will be saved")

    upload = config_cm["pipeline"]["upload_ingest_to_hub"]
    upload.yaml_set_comment_before_after_key("source_documents_dir", before="⚠️ Same as output_dir from ingestion — adjust as needed")

    with open(path, "w") as file:
        yaml.dump(config_cm, file)

    return path


def generate_and_save_config(hf_org: str, hf_name: str, session_uid: str, config_path: str):
    """Generates and saves the YAML configuration file."""
    logger.debug(f"Generating config with org: {hf_org}, dataset name: {hf_name}")
    config = generate_base_config(hf_org, hf_name, session_uid)
    file_path = save_yaml_file(config, config_path)
    logger.success(f"Config saved at: {file_path}")
    return file_path
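

# Minimal usage sketch (illustrative only): the organization, dataset name, and
# output path below are hypothetical example values, not ones used by the app.
if __name__ == "__main__":
    import uuid

    example_session = str(uuid.uuid4())
    generate_and_save_config(
        hf_org="my-org",  # hypothetical organization
        hf_name="my-yourbench-dataset",  # hypothetical dataset name
        session_uid=example_session,
        config_path=f"/tmp/{example_session}_config.yaml",
    )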