import os | |
import sys | |
import json | |
from llamafactory.train.tuner import run_exp | |
from llamafactory.extras.misc import get_current_device | |
from weclone.utils.config import load_config | |
from weclone.utils.log import logger | |
from weclone.data.clean.strategies import LLMCleaningStrategy | |
def main():
    """Run the supervised fine-tuning pipeline end to end.

    Steps, in order:
      1. Load the ``train_sft`` and ``make_dataset`` configurations.
      2. Warn when the current device is the CPU.
      3. Run ``LLMCleaningStrategy.clean()`` and check that the path it
         returns exists on disk.
      4. Log the training configuration as pretty-printed JSON and hand it
         to ``run_exp``.

    Exits the process with status 1 if the cleaned data file is missing.
    """
    train_config = load_config(arg_type="train_sft")
    dataset_config = load_config(arg_type="make_dataset")

    device = get_current_device()
    # NOTE(review): LLaMA-Factory's get_current_device() appears to return a
    # torch.device, and a torch.device never compares equal to a plain str,
    # so `device == "cpu"` would silently skip the warning. Compare on the
    # device type instead; fall back to the string form for non-torch values.
    device_type = getattr(device, "type", None) or str(device)
    if device_type == "cpu":
        logger.warning("请注意你正在使用CPU训练,非Mac设备可能会出现问题")

    cleaner = LLMCleaningStrategy(make_dataset_config=dataset_config)
    cleaned_data_path = cleaner.clean()

    # Fail fast before training starts if the cleaning step produced no file.
    if not os.path.exists(cleaned_data_path):
        logger.error(f"错误:文件 '{cleaned_data_path}' 不存在,请确保数据处理步骤已正确生成该文件。")
        sys.exit(1)

    # ensure_ascii=False keeps non-ASCII config values readable in the log.
    formatted_config = json.dumps(train_config, indent=4, ensure_ascii=False)
    logger.info(f"微调配置:\n{formatted_config}")

    run_exp(train_config)
# Run the fine-tuning pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()