Ashton2000 committed on
Commit 981b783 · verified · 1 Parent(s): ebb1804

Upload folder using huggingface_hub

calc_avg_comet.py ADDED
@@ -0,0 +1,12 @@
+ import sys
+
+
+ with open(sys.argv[1], 'r') as f:
+     # skip blank lines and any previously appended average line, so re-running is safe
+     lines = [line.strip() for line in f
+              if line.strip() and not line.startswith('Average score')]
+
+ scores = [float(line.split()[-1]) for line in lines]
+
+ with open(sys.argv[1], 'a') as f:
+     f.write(f'Average score: {sum(scores) / len(scores) * 100:.2f}\n')
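For reference, run_eval_comet_api.sh below appends one `<target_file>\tscore: <value>` line per document to comet_api.txt, and this script averages the trailing numbers. A minimal end-to-end sketch (file names illustrative):

    # hypothetical two-document score file
    printf 'doc.0.zh\tscore: 0.8000\ndoc.1.zh\tscore: 0.9000\n' > comet_api.txt
    python calc_avg_comet.py comet_api.txt
    tail -n 1 comet_api.txt   # -> Average score: 85.00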
comet_api.py ADDED
@@ -0,0 +1,64 @@
+ import argparse
+ import requests
+ import time
+ import os
+
+
+ def get_comet_score(instances: list[dict], timeout=100, max_retries=10, comet_api: str = None):
+     if comet_api is not None:
+         url = f"http://{comet_api}/evaluate"
+     else:
+         url = f"http://{os.getenv('COMET_API')}/evaluate"
+     payload = {'instances': instances}
+
+     retries = 0
+     while retries < max_retries:
+         try:
+             response = requests.post(url, json=payload, timeout=timeout)
+
+             if response.status_code == 200:
+                 # print(response.json())  # {'scores': [...]}
+                 return response.json()['scores']
+             else:
+                 # count non-200 replies against the retry budget so a broken
+                 # server cannot keep this loop spinning forever
+                 retries += 1
+                 print(f"Request failed with status code: {response.status_code}")
+                 time.sleep(5)
+         except requests.Timeout:
+             retries += 1
+             print(f"Request timed out. Retrying... ({retries}/{max_retries})")
+             time.sleep(5)
+         except requests.RequestException as e:
+             raise RuntimeError(f"Request failed due to: {e}")
+
+     raise RuntimeError("Max retries exceeded. Request failed.")
+
+
+ def main():
+     parser = argparse.ArgumentParser()
+     parser.add_argument('--source_file', '-s', type=str, required=True)
+     parser.add_argument('--target_file', '-t', type=str, required=True)
+     parser.add_argument('--reference_file', '-r', type=str, required=True)
+     parser.add_argument('--url', '-u', type=str, required=True)
+     args = parser.parse_args()
+
+     source_file = args.source_file
+     target_file = args.target_file
+     reference_file = args.reference_file
+     comet_api = args.url
+
+     with open(source_file, 'r') as f:
+         source_lines = f.readlines()
+     with open(target_file, 'r') as f:
+         target_lines = f.readlines()
+     with open(reference_file, 'r') as f:
+         reference_lines = f.readlines()
+
+     line_comet_scores = get_comet_score([{'src': i, 'mt': j, 'ref': k} for i, j, k in zip(source_lines, target_lines, reference_lines)], comet_api=comet_api)
+     avg_score = sum(line_comet_scores) / len(line_comet_scores) if line_comet_scores else -1.0
+     print(f'{target_file}\tscore: {avg_score:.4f}')
+
+
+ if __name__ == '__main__':
+     main()
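The serving side of `/evaluate` is not part of this commit; from what get_comet_score sends and reads, it is assumed to accept a JSON body with an `instances` list of `src`/`mt`/`ref` triples and to return `{"scores": [...]}` with one score per instance. A hedged curl sketch (host and port illustrative):

    curl -s http://127.0.0.1:8088/evaluate \
        -H 'Content-Type: application/json' \
        -d '{"instances": [{"src": "Hello.", "mt": "你好。", "ref": "你好。"}]}'
    # assumed reply shape: {"scores": [0.87]}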
ds_z3_config.json ADDED
@@ -0,0 +1,30 @@
+ {
+     "train_batch_size": "auto",
+     "train_micro_batch_size_per_gpu": "auto",
+     "gradient_accumulation_steps": "auto",
+     "gradient_clipping": "auto",
+     "zero_allow_untested_optimizer": true,
+     "fp16": {
+         "enabled": "auto",
+         "loss_scale": 0,
+         "loss_scale_window": 1000,
+         "initial_scale_power": 16,
+         "hysteresis": 2,
+         "min_loss_scale": 1
+     },
+     "bf16": {
+         "enabled": "auto"
+     },
+     "zero_optimization": {
+         "stage": 3,
+         "overlap_comm": false,
+         "contiguous_gradients": true,
+         "sub_group_size": 1e9,
+         "reduce_bucket_size": "auto",
+         "stage3_prefetch_bucket_size": "auto",
+         "stage3_param_persistence_threshold": "auto",
+         "stage3_max_live_parameters": 1e9,
+         "stage3_max_reuse_distance": 1e9,
+         "stage3_gather_16bit_weights_on_model_save": true
+     }
+ }
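The `"auto"` values are placeholders that the training launcher fills in from the YAML configs below; that resolution is standard LLaMA-Factory/Hugging Face Trainer behavior, not something defined in this commit. A quick check that the file parses (note `1e9` is a valid JSON number):

    python -c "import json; print(json.load(open('ds_z3_config.json'))['zero_optimization']['stage'])"   # -> 3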
infer.sh ADDED
@@ -0,0 +1,79 @@
+ #!/bin/bash
+ # trap 'ssh wyt@${infer_address%%:*} "killall pt_main_thread"; exit' SIGINT
+
+ device=$1
+ deploy_flag=$2
+ step=$3
+
+ if [ -z "$1" ] || [ -z "$2" ] || [ -z "$3" ]; then
+     echo "Usage: $0 <device> <deploy_flag> <step> <model_path> <lang_pair>"
+     exit 1
+ fi
+
+ # model_path=/data/wyt/codes/DocDPO/sft/checkpoints_llama_factory/lora/ted_react_trans_base_sample_sft_dpolora_balanced_474/merged_fix/checkpoint-${step}
+ model_path=$4
+
+ language=$5
+ src_lang=${language%-*}
+ tgt_lang=${language#*-}
+
+ # infer_address=10.249.42.177:8010
+ # schedule_address=10.249.42.177:8011
+ # infer_address=127.0.0.1:801$infer_device
+ # schedule_address=127.0.0.1:801$schedule_device
+ # address=10.249.42.182:801${device}
+ address=127.0.0.1:801${device}
+
+ # setting=window_20_1ep
+ # setting=window_20_2ep_new
+ work_dir=/data/wyt/codes/DocDPO/inference_monolang/ted_en_zh_balanced_paritial
+
+ if [ "$deploy_flag" = "true" ]; then
+     if [ "${address%%:*}" = "127.0.0.1" ]; then
+         source ~/.zshrc
+         conda activate vllm
+         CUDA_VISIBLE_DEVICES=${device} nohup vllm serve ${model_path} --host 0.0.0.0 --port ${address##*:} --served-model-name "qwen" --enable-prefix-caching --gpu_memory_utilization 0.9 > vllm_${step}.log 2>&1 &
+         conda activate optima-vllm
+     else
+         ssh -n wyt@${address%%:*} "source ~/.zshrc && conda activate optima-vllm && CUDA_VISIBLE_DEVICES=${device} nohup vllm serve ${model_path} --host 0.0.0.0 --port ${address##*:} --served-model-name \"qwen\" --enable-prefix-caching > /dev/null 2>&1 &"
+     fi
+ fi
+
+ echo "Waiting for LLM deployment in 20 seconds..."
+ # sleep 20
+
+ echo "Testing API of ${address}..."
+ while true; do
+     python test_api.py $address
+     if [ $? -eq 0 ]; then
+         echo "API connected successfully!"
+         break
+     else
+         echo "API connection failed. Retrying in 5 seconds..."
+         sleep 5
+     fi
+ done
+
+ cur_path=`pwd`
+ cd $work_dir
+
+ # for i in 4 9 11; do
+ for i in {0..11}; do
+     if [ ! -f "$cur_path/results/${src_lang}-${tgt_lang}_${step}/IWSLT17.TED.tst2017.${src_lang}-${tgt_lang}.${src_lang}.$i.${tgt_lang}" ]; then
+         echo IWSLT17.TED.tst2017.${src_lang}-${tgt_lang}.${src_lang}.$i.${tgt_lang}
+         python -u infer.py \
+             --src_file /data/wyt/codes/DocDPO/data/2017-01-ted-test/${src_lang}-${tgt_lang}/IWSLT17.TED.tst2017.${src_lang}-${tgt_lang}.${src_lang}.$i \
+             --output_path $cur_path/results/${src_lang}-${tgt_lang}_${step} \
+             --window_size 10 \
+             --infer_address $address \
+             --schedule_address $address \
+             --language ${src_lang}-${tgt_lang} \
+             --infer_temperature 0.7 \
+             --schedule_temperature 0.7 \
+             --translate_style base
+     fi
+ done
+
+ cd $cur_path
+
+ # ssh wyt@${infer_address%%:*} "killall pt_main_thread"
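run_train.sh below shows how this script is driven; a single illustrative call (the checkpoint path is the one used there, the language pair is interchangeable):

    # deploy a vLLM server on GPU 0 / port 8010, then translate the 12 test documents
    CUDA_VISIBLE_DEVICES=0 zsh infer.sh 0 true 600 \
        /data/wyt/codes/DocDPO/sft/checkpoints_multilang/ted_base_balanced_en_zhdefr_320/dpo/merged/checkpoint-600 en-zh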
infer_2.sh ADDED
@@ -0,0 +1,79 @@
+ #!/bin/bash
+ # trap 'ssh wyt@${infer_address%%:*} "killall pt_main_thread"; exit' SIGINT
+
+ device=$1
+ deploy_flag=$2
+ step=$3
+
+ if [ -z "$1" ] || [ -z "$2" ] || [ -z "$3" ]; then
+     echo "Usage: $0 <device> <deploy_flag> <step> <model_path> <lang_pair>"
+     exit 1
+ fi
+
+ # model_path=/data/wyt/codes/DocDPO/sft/checkpoints_llama_factory/lora/ted_react_trans_base_sample_sft_dpolora_balanced_474/merged_fix/checkpoint-${step}
+ model_path=$4
+
+ language=$5
+ src_lang=${language%-*}
+ tgt_lang=${language#*-}
+
+ # infer_address=10.249.42.177:8010
+ # schedule_address=10.249.42.177:8011
+ # infer_address=127.0.0.1:801$infer_device
+ # schedule_address=127.0.0.1:801$schedule_device
+ # address=10.249.42.182:801${device}
+ address=127.0.0.1:800${device}
+
+ # setting=window_20_1ep
+ # setting=window_20_2ep_new
+ work_dir=/data/wyt/codes/DocDPO/inference_monolang/ted_en_zh_balanced_paritial
+
+ if [ "$deploy_flag" = "true" ]; then
+     if [ "${address%%:*}" = "127.0.0.1" ]; then
+         source ~/.zshrc
+         conda activate vllm
+         CUDA_VISIBLE_DEVICES=${device} nohup vllm serve ${model_path} --host 0.0.0.0 --port ${address##*:} --served-model-name "qwen" --enable-prefix-caching --gpu_memory_utilization 0.48 > vllm_${step}_2.log 2>&1 &
+         conda activate optima-vllm
+     else
+         ssh -n wyt@${address%%:*} "source ~/.zshrc && conda activate optima-vllm && CUDA_VISIBLE_DEVICES=${device} nohup vllm serve ${model_path} --host 0.0.0.0 --port ${address##*:} --served-model-name \"qwen\" --enable-prefix-caching > /dev/null 2>&1 &"
+     fi
+ fi
+
+ echo "Waiting for LLM deployment in 20 seconds..."
+ # sleep 20
+
+ echo "Testing API of ${address}..."
+ while true; do
+     python test_api.py $address
+     if [ $? -eq 0 ]; then
+         echo "API connected successfully!"
+         break
+     else
+         echo "API connection failed. Retrying in 5 seconds..."
+         sleep 5
+     fi
+ done
+
+ cur_path=`pwd`
+ cd $work_dir
+
+ # for i in 4 9 11; do
+ for i in {0..11}; do
+     if [ ! -f "$cur_path/results/${src_lang}-${tgt_lang}_${step}/IWSLT17.TED.tst2017.${src_lang}-${tgt_lang}.${src_lang}.$i.${tgt_lang}" ]; then
+         echo IWSLT17.TED.tst2017.${src_lang}-${tgt_lang}.${src_lang}.$i.${tgt_lang}
+         python -u infer.py \
+             --src_file /data/wyt/codes/DocDPO/data/2017-01-ted-test/${src_lang}-${tgt_lang}/IWSLT17.TED.tst2017.${src_lang}-${tgt_lang}.${src_lang}.$i \
+             --output_path $cur_path/results/${src_lang}-${tgt_lang}_${step} \
+             --window_size 10 \
+             --infer_address $address \
+             --schedule_address $address \
+             --language ${src_lang}-${tgt_lang} \
+             --infer_temperature 0.7 \
+             --schedule_temperature 0.7 \
+             --translate_style base
+     fi
+ done
+
+ cd $cur_path
+
+ # ssh wyt@${infer_address%%:*} "killall pt_main_thread"
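This variant differs from infer.sh only in the port scheme (800<device> instead of 801<device>), `--gpu_memory_utilization 0.48`, and the `_2` log suffix, so its server can coexist with another process on the same card. Illustrative call (`$model_path` is a placeholder):

    # same interface as infer.sh; the server claims roughly half the GPU memory
    CUDA_VISIBLE_DEVICES=1 zsh infer_2.sh 1 true 600 $model_path en-zh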
infer_robust.sh ADDED
@@ -0,0 +1,90 @@
+ #!/bin/bash
+ # trap 'ssh wyt@${infer_address%%:*} "killall pt_main_thread"; exit' SIGINT
+
+ device=$1
+ deploy_flag=$2
+ step=$3
+
+ if [ -z "$1" ] || [ -z "$2" ] || [ -z "$3" ]; then
+     echo "Usage: $0 <address> <deploy_flag> <step> <model_path> <lang_pair> <level> [doc_id ...]"
+     exit 1
+ fi
+
+ # model_path=/data/wyt/codes/DocDPO/sft/checkpoints_llama_factory/lora/ted_react_trans_base_sample_sft_dpolora_balanced_474/merged_fix/checkpoint-${step}
+ model_path=$4
+
+ language=$5
+ src_lang=${language%-*}
+ tgt_lang=${language#*-}
+
+ # infer_address=10.249.42.177:8010
+ # schedule_address=10.249.42.177:8011
+ # infer_address=127.0.0.1:801$infer_device
+ # schedule_address=127.0.0.1:801$schedule_device
+ # address=10.249.42.182:801${device}
+ # address=127.0.0.1:801${device}
+ # address=10.249.45.139:801${device}
+ address=${device}
+
+ level=$6
+
+ # setting=window_20_1ep
+ # setting=window_20_2ep_new
+ work_dir=/data/wyt/codes/DocDPO/inference_monolang/ted_en_zh_balanced_paritial
+ # data_dir=/data/wyt/codes/DocDPO/data/2017-01-ted-test
+ data_dir=/data/wyt/codes/DocDPO/data/ted_robust/level_${level}
+ output_dir=/data/wyt/codes/DocDPO/sft/checkpoints_multilang/ted_base_balanced_en_zhdefr_320/results_robust/level_${level}
+
+ if [ "$deploy_flag" = "true" ]; then
+     if [ "${address%%:*}" = "127.0.0.1" ]; then
+         source ~/.zshrc
+         conda activate vllm
+         CUDA_VISIBLE_DEVICES=${device} nohup vllm serve ${model_path} --host 0.0.0.0 --port ${address##*:} --served-model-name "qwen" --enable-prefix-caching --gpu_memory_utilization 0.9 > vllm_${step}.log 2>&1 &
+         conda activate optima-vllm
+     else
+         ssh -n wyt@${address%%:*} "source ~/.zshrc && conda activate optima-vllm && CUDA_VISIBLE_DEVICES=${device} nohup vllm serve ${model_path} --host 0.0.0.0 --port ${address##*:} --served-model-name \"qwen\" --enable-prefix-caching > /dev/null 2>&1 &"
+     fi
+ fi
+
+ echo "Waiting for LLM deployment in 20 seconds..."
+ # sleep 20
+
+ echo "Testing API of ${address}..."
+ while true; do
+     python test_api.py $address
+     if [ $? -eq 0 ]; then
+         echo "API connected successfully!"
+         break
+     else
+         echo "API connection failed. Retrying in 5 seconds..."
+         sleep 5
+     fi
+ done
+
+ cur_path=`pwd`
+ cd $work_dir
+
+ doc_ids=("${@:7}")
+ echo "Document IDs to process: ${doc_ids[@]}"
+
+ # for i in {0..5}; do
+ # for i in {0..11}; do
+ for i in "${doc_ids[@]}"; do
+     if [ ! -f "$output_dir/${src_lang}-${tgt_lang}_${step}/IWSLT17.TED.tst2017.${src_lang}-${tgt_lang}.${src_lang}.$i.${tgt_lang}" ]; then
+         echo IWSLT17.TED.tst2017.${src_lang}-${tgt_lang}.${src_lang}.$i.${tgt_lang}
+         python -u infer.py \
+             --src_file $data_dir/${src_lang}-${tgt_lang}/IWSLT17.TED.tst2017.${src_lang}-${tgt_lang}.${src_lang}.$i \
+             --output_path $output_dir/${src_lang}-${tgt_lang}_${step} \
+             --window_size 10 \
+             --infer_address $address \
+             --schedule_address $address \
+             --language ${src_lang}-${tgt_lang} \
+             --infer_temperature 0.7 \
+             --schedule_temperature 0.7 \
+             --translate_style base
+     fi
+ done
+
+ cd $cur_path
+
+ # ssh wyt@${infer_address%%:*} "killall pt_main_thread"
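Unlike infer.sh, here $1 is taken verbatim as the API address (host:port), $6 selects the perturbation level, and $7 onwards are document IDs. A hedged sketch that reuses an already-running server, so deploy_flag is false (address, step, and level illustrative; `$model_path` a placeholder):

    zsh infer_robust.sh 127.0.0.1:8010 false 1200 $model_path en-zh 2 0 1 2 3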
infer_robust_2.sh ADDED
@@ -0,0 +1,83 @@
+ #!/bin/bash
+ # trap 'ssh wyt@${infer_address%%:*} "killall pt_main_thread"; exit' SIGINT
+
+ device=$1
+ deploy_flag=$2
+ step=$3
+
+ if [ -z "$1" ] || [ -z "$2" ] || [ -z "$3" ]; then
+     echo "Usage: $0 <device> <deploy_flag> <step> <model_path> <lang_pair>"
+     exit 1
+ fi
+
+ # model_path=/data/wyt/codes/DocDPO/sft/checkpoints_llama_factory/lora/ted_react_trans_base_sample_sft_dpolora_balanced_474/merged_fix/checkpoint-${step}
+ model_path=$4
+
+ language=$5
+ src_lang=${language%-*}
+ tgt_lang=${language#*-}
+
+ # infer_address=10.249.42.177:8010
+ # schedule_address=10.249.42.177:8011
+ # infer_address=127.0.0.1:801$infer_device
+ # schedule_address=127.0.0.1:801$schedule_device
+ # address=10.249.42.182:801${device}
+ # address=127.0.0.1:801${device}
+ address=10.249.45.139:801${device}
+
+ # setting=window_20_1ep
+ # setting=window_20_2ep_new
+ work_dir=/data/wyt/codes/DocDPO/inference_monolang/ted_en_zh_balanced_paritial
+ # data_dir=/data/wyt/codes/DocDPO/data/2017-01-ted-test
+ data_dir=/data/wyt/codes/DocDPO/data/ted_robust/level_3
+ output_dir=/data/wyt/codes/DocDPO/sft/checkpoints_multilang/ted_base_balanced_en_zhdefr_320/results_robust/level_3
+
+ if [ "$deploy_flag" = "true" ]; then
+     if [ "${address%%:*}" = "127.0.0.1" ]; then
+         source ~/.zshrc
+         conda activate vllm
+         CUDA_VISIBLE_DEVICES=${device} nohup vllm serve ${model_path} --host 0.0.0.0 --port ${address##*:} --served-model-name "qwen" --enable-prefix-caching --gpu_memory_utilization 0.9 > vllm_${step}.log 2>&1 &
+         conda activate optima-vllm
+     else
+         ssh -n wyt@${address%%:*} "source ~/.zshrc && conda activate optima-vllm && CUDA_VISIBLE_DEVICES=${device} nohup vllm serve ${model_path} --host 0.0.0.0 --port ${address##*:} --served-model-name \"qwen\" --enable-prefix-caching > /dev/null 2>&1 &"
+     fi
+ fi
+
+ echo "Waiting for LLM deployment in 20 seconds..."
+ # sleep 20
+
+ echo "Testing API of ${address}..."
+ while true; do
+     python test_api.py $address
+     if [ $? -eq 0 ]; then
+         echo "API connected successfully!"
+         break
+     else
+         echo "API connection failed. Retrying in 5 seconds..."
+         sleep 5
+     fi
+ done
+
+ cur_path=`pwd`
+ cd $work_dir
+
+ for i in {6..11}; do
+ # for i in {0..11}; do
+     if [ ! -f "$output_dir/${src_lang}-${tgt_lang}_${step}/IWSLT17.TED.tst2017.${src_lang}-${tgt_lang}.${src_lang}.$i.${tgt_lang}" ]; then
+         echo IWSLT17.TED.tst2017.${src_lang}-${tgt_lang}.${src_lang}.$i.${tgt_lang}
+         python -u infer.py \
+             --src_file $data_dir/${src_lang}-${tgt_lang}/IWSLT17.TED.tst2017.${src_lang}-${tgt_lang}.${src_lang}.$i \
+             --output_path $output_dir/${src_lang}-${tgt_lang}_${step} \
+             --window_size 10 \
+             --infer_address $address \
+             --schedule_address $address \
+             --language ${src_lang}-${tgt_lang} \
+             --infer_temperature 0.7 \
+             --schedule_temperature 0.7 \
+             --translate_style base
+     fi
+ done
+
+ cd $cur_path
+
+ # ssh wyt@${infer_address%%:*} "killall pt_main_thread"
merge_template.yaml ADDED
@@ -0,0 +1,13 @@
+ ### Note: DO NOT use quantized model or quantization_bit when merging lora adapters
+
+ ### model
+ model_name_or_path:
+ adapter_name_or_path:
+ template: qwen
+ trust_remote_code: true
+
+ ### export
+ export_dir:
+ export_size: 5
+ export_device: cpu # choices: [cpu, auto]
+ export_legacy_format: false
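run_merge_fix.sh below fills the three empty keys with sed before exporting; one iteration of its loop is equivalent to this sketch (checkpoint and base-model paths illustrative):

    cp merge_template.yaml merge.yaml
    sed -i 's|model_name_or_path:.*|model_name_or_path: /path/to/sft|' merge.yaml
    sed -i 's|adapter_name_or_path:.*|adapter_name_or_path: dpo/adapter/checkpoint-600|' merge.yaml
    sed -i 's|export_dir:.*|export_dir: dpo/merged/checkpoint-600|' merge.yaml
    llamafactory-cli export merge.yaml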
nohup.out ADDED
@@ -0,0 +1,32 @@
+ [Wed Sep 17 19:57:29 CST 2025] SFT Training Start
+ [Thu Sep 18 00:05:57 CST 2025] SFT Training End
+ [Thu Sep 18 00:05:57 CST 2025] DPO Training Start
+ [Thu Sep 18 09:27:43 CST 2025] DPO Training End
+ [Thu Sep 18 09:27:43 CST 2025] Merging Checkpoints
+ [Thu Sep 18 09:33:12 CST 2025] Merging Checkpoints End
+ [Thu Sep 18 09:33:12 CST 2025] Inference Start
+ [Thu Sep 18 09:33:12 CST 2025] Inference End
+ [Thu Sep 18 10:48:45 CST 2025] Inference Start
+ [Thu Sep 18 10:48:45 CST 2025] Inference End
+ [Thu Sep 18 12:38:00 CST 2025] Inference Start
+ [Thu Sep 18 12:38:00 CST 2025] Inference End
+ [Thu Sep 18 14:29:12 CST 2025] Merging Checkpoints
+ [Thu Sep 18 14:31:09 CST 2025] Merging Checkpoints End
+ [Thu Sep 18 14:31:09 CST 2025] Inference End
+ [Thu Sep 18 14:34:00 CST 2025] Merging Checkpoints
+ [Thu Sep 18 14:34:25 CST 2025] Merging Checkpoints
+ [Thu Sep 18 14:36:15 CST 2025] Merging Checkpoints
+ [Thu Sep 18 14:37:01 CST 2025] Inference Start
+ [Thu Sep 18 14:37:01 CST 2025] Inference End
+ [Thu Sep 18 14:37:29 CST 2025] Inference Start
+ [Thu Sep 18 14:37:29 CST 2025] Inference End
+ [Thu Sep 18 14:38:31 CST 2025] Inference Start
+ [Thu Sep 18 14:38:31 CST 2025] Inference End
+ [Thu Sep 18 14:41:21 CST 2025] Inference Start
+ [Thu Sep 18 14:41:21 CST 2025] Inference End
+ [Thu Sep 18 14:43:00 CST 2025] Inference Start
+ [Thu Sep 18 14:43:00 CST 2025] Inference End
+ [Thu Sep 18 17:23:43 CST 2025] Inference Start
+ [Thu Sep 18 17:23:43 CST 2025] Inference End
+ [Thu Sep 18 17:24:30 CST 2025] Inference Start
+ [Thu Sep 18 17:24:30 CST 2025] Inference End
preprocess_robust.py ADDED
@@ -0,0 +1,39 @@
+ import argparse
+ import os
+
+
+ def main(args):
+     src_lang, tgt_lang = args.lang_pair.split("-")
+     tgt_file_list = [file for file in os.listdir(args.tgt_path) if file.endswith(f".{tgt_lang}")]
+     for tgt_file in tgt_file_list:
+         src_file = os.path.splitext(tgt_file)[0]
+         doc_id = src_file.split('.')[-1]
+         label_file = src_file.replace(f".{src_lang}.", ".id.")
+         with open(os.path.join(args.disturb_src_path, label_file), "r", encoding="utf-8") as f:
+             labels = [line.strip() for line in f]
+         with open(os.path.join(args.tgt_path, tgt_file), "r", encoding="utf-8") as f:
+             tgt_lines = [line.strip() for line in f]
+
+         assert len(labels) == len(tgt_lines), f"Length mismatch in {src_file} and {label_file}"
+         filtered_tgt_lines = [tgt for tgt, label in zip(tgt_lines, labels) if label.split('-')[0] == doc_id]
+
+         with open(os.path.join(args.original_src_path, src_file), "r", encoding="utf-8") as f:
+             original_src_lines = [line.strip() for line in f]
+         assert len(original_src_lines) == len(filtered_tgt_lines), f"Length mismatch in {src_file} and filtered {tgt_file}"
+
+         with open(os.path.join(args.output_path, tgt_file), "w", encoding="utf-8") as f:
+             f.write("\n".join(filtered_tgt_lines) + "\n")
+
+
+ if __name__ == "__main__":
+     parser = argparse.ArgumentParser()
+     parser.add_argument("--original_src_path", type=str)
+     parser.add_argument("--disturb_src_path", type=str)
+     parser.add_argument("--tgt_path", type=str)
+     parser.add_argument("--output_path", type=str)
+     parser.add_argument("--lang_pair", type=str)
+     args = parser.parse_args()
+
+     os.makedirs(args.output_path, exist_ok=True)
+
+     main(args)
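run_eval_robust.sh below invokes this as follows (level and step illustrative). The `.id.` label files in the disturbed source directory map each translated line back to a source document, and only lines whose label matches the document ID are kept:

    python preprocess_robust.py \
        --original_src_path /data/wyt/codes/DocDPO/data/2017-01-ted-test/en-zh \
        --disturb_src_path /data/wyt/codes/DocDPO/data/ted_robust/level_2/en-zh \
        --tgt_path results_robust/level_2/en-zh_1200 \
        --output_path results_robust/level_2/en-zh_1200/tmp_robust \
        --lang_pair en-zh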
qwen2.5_full_sft.yaml ADDED
@@ -0,0 +1,50 @@
+ ### model
+ model_name_or_path: /data/wyt/codes/checkpoints/Qwen2.5-7B-Instruct
+ trust_remote_code: true
+
+ ### method
+ stage: sft
+ do_train: true
+ finetuning_type: full
+ deepspeed: /data/wyt/codes/DocDPO/sft/checkpoints_multilang/ds_z3_config.json # choices: [ds_z0_config.json, ds_z2_config.json, ds_z3_config.json]
+
+ ### dataset
+ dataset_dir: /data/wyt/codes/DocDPO/sft/data_multilang/red_multilang_base_balanced_en_zhdefr_320
+ dataset: sft_en-zh_tool,sft_en-zh_trans_base_sample,sft_en-de_tool,sft_en-de_trans_base_sample,sft_en-fr_tool,sft_en-fr_trans_base_sample
+ template: qwen
+ cutoff_len: 2560
+ # max_samples: 1000
+ overwrite_cache: true
+ preprocessing_num_workers: 16
+ dataloader_num_workers: 4
+ mask_history: true
+
+ ### output
+ output_dir: /data/wyt/codes/DocDPO/sft/checkpoints_multilang/ted_base_balanced_en_zhdefr_320/sft
+ logging_steps: 5
+ # save_steps: 300
+ save_strategy: epoch
+ plot_loss: true
+ overwrite_output_dir: true
+ save_only_model: true
+ report_to: tensorboard # choices: [none, wandb, tensorboard, swanlab, mlflow]
+
+ ### train
+ per_device_train_batch_size: 4
+ gradient_accumulation_steps: 2
+ # learning_rate: 2.0e-5
+ # learning_rate: 8.0e-6
+ learning_rate: 1.0e-5
+ num_train_epochs: 1.0
+ lr_scheduler_type: cosine
+ warmup_ratio: 0.1
+ bf16: true
+ ddp_timeout: 180000000
+ resume_from_checkpoint: null
+
+ ### eval
+ # eval_dataset: alpaca_en_demo
+ # val_size: 0.1
+ # per_device_eval_batch_size: 1
+ # eval_strategy: steps
+ # eval_steps: 500
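The launch command for this config appears (commented out) in run_train.sh below:

    CUDA_VISIBLE_DEVICES=0,1,2,3 FORCE_TORCHRUN=1 llamafactory-cli train qwen2.5_full_sft.yaml > logs/train_sft.log 2>&1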
qwen2.5_lora_dpo.yaml ADDED
@@ -0,0 +1,50 @@
+ ### model
+ model_name_or_path: /data/wyt/codes/DocDPO/sft/checkpoints_multilang/ted_base_balanced_en_zhdefr_320/sft
+ trust_remote_code: true
+
+ ### method
+ stage: dpo
+ do_train: true
+ # finetuning_type: full
+ finetuning_type: lora
+ lora_rank: 8
+ lora_target: all
+ # deepspeed: /data/wyt/codes/DocDPO/sft/checkpoints_llama_factory/ds_z3_config.json # choices: [ds_z0_config.json, ds_z2_config.json, ds_z3_config.json]
+
+ ### dataset
+ dataset_dir: /data/wyt/codes/DocDPO/sft/data_multilang/red_multilang_base_balanced_en_zhdefr_320
+ dataset: dpo_en-zh_tool,dpo_en-zh_trans_base_sample,dpo_en-de_tool,dpo_en-de_trans_base_sample,dpo_en-fr_tool,dpo_en-fr_trans_base_sample
+ template: qwen
+ cutoff_len: 2560
+ # max_samples: 1000
+ overwrite_cache: true
+ preprocessing_num_workers: 16
+ dataloader_num_workers: 4
+
+ ### output
+ output_dir: /data/wyt/codes/DocDPO/sft/checkpoints_multilang/ted_base_balanced_en_zhdefr_320/dpo/adapter
+ logging_steps: 5
+ save_steps: 200
+ plot_loss: true
+ overwrite_output_dir: true
+ save_only_model: false
+ report_to: tensorboard # choices: [none, wandb, tensorboard, swanlab, mlflow]
+
+ ### train
+ per_device_train_batch_size: 2
+ gradient_accumulation_steps: 4
+ # learning_rate: 1.0e-4
+ learning_rate: 5.0e-6
+ num_train_epochs: 2.0
+ lr_scheduler_type: cosine
+ warmup_ratio: 0.1
+ bf16: true
+ ddp_timeout: 180000000
+ resume_from_checkpoint: null
+
+ ### eval
+ # eval_dataset: alpaca_en_demo
+ # val_size: 0.1
+ # per_device_eval_batch_size: 1
+ # eval_strategy: steps
+ # eval_steps: 500
run_eval_cohesion.sh ADDED
@@ -0,0 +1,41 @@
+ #!/bin/bash
+
+
+ dir_path=$1
+
+ lang=$2
+ src_lang=${lang%%-*}
+ tgt_lang=${lang##*-}
+
+ data_path=/data/wyt/codes/DocDPO/data/2017-01-ted-test/$lang
+
+ for i in {0..11}; do
+     # source=$data_path/test.en.$i
+     target=$dir_path/IWSLT17.TED.tst2017.${lang}.${src_lang}.$i.${tgt_lang}
+     reference=$data_path/IWSLT17.TED.tst2017.${lang}.${tgt_lang}.$i
+     result=$dir_path/cohesion.txt
+
+     # echo $target
+
+     python -u /data/wyt/codes/DocDPO/evaluator/fine_grained_multi_demensional/eval_cohesion.py \
+         --model gpt-4.1 \
+         --input_file $target \
+         --reference_file $reference \
+         --target_language $tgt_lang \
+         --output_file $result
+ done
+
+ python /data/wyt/codes/DocDPO/evaluator/fine_grained_multi_demensional/calc_avg_cohesion.py $dir_path/cohesion.txt
+
+ # cd $dir_path/aligned
+ # file_nums=$(ls test.*-s | sort -n -t . -k 2 | xargs wc -l | head -n -1 | awk '{ print $1 }')
+ # echo $file_nums
+
+ # cat $(ls test.*-t | sort -n -t . -k 2) > $dir_path/whole.hyp
+ # cat $(ls test.*-s | sort -n -t . -k 2) > $dir_path/whole.src
+
+ # sh /data/wyt/codes/DocMTAgent/consistency_evaluation/run_eval_sep.sh \
+ #     $dir_path/whole.src $dir_path/whole.hyp $dir_path/consistency $lang "$file_nums"
+
+ # cd $work_dir
+ # python indiversity_sep.py -r $dir_path/whole.src.record_sep.json | tee -a $dir_path/whole.src.consistency_sep
run_eval_comet_api.sh ADDED
@@ -0,0 +1,40 @@
+ #!/bin/bash
+
+
+ # lang=en-de
+ # lang=en-zh
+ lang=$2
+ src_lang=${lang%%-*}
+ tgt_lang=${lang##*-}
+
+ dir_path=$1
+
+ data_path=/data/wyt/codes/DocDPO/data/2017-01-ted-test/$lang
+ # align_script_path=/data/wyt/codes/DocMTAgent/Bleualign
+ # dir_path=/data/wyt/codes/DocDPO/inference_mcts_shorten_rag_entity_tldr_vq_lowerbound_comet/results/window10_epc2/en-zh
+ # dir_path=/data/wyt/codes/DocDPO/inference_mcts_shorten_rag_entity_tldr_vq_lowerbound_comet/results/window10_epc1/en-zh
+ # dir_path=/data/wyt/codes/DocDPO/inference_mcts_shorten_rag_entity_tldr_vq_lowerbound_comet/results/window10_epc1_trans14b/en-zh
+ echo $dir_path
+
+ for i in {0..11}; do
+     source=$data_path/IWSLT17.TED.tst2017.$lang.$src_lang.$i
+     target=$dir_path/IWSLT17.TED.tst2017.$lang.$src_lang.$i.$tgt_lang
+     reference=$data_path/IWSLT17.TED.tst2017.$lang.$tgt_lang.$i
+     # python comet_api.py -s $source -t $target -r $reference -u 10.249.45.139:8088 >> $dir_path/comet_api.txt
+     python comet_api.py -s $source -t $target -r $reference -u 10.249.42.177:8088 >> $dir_path/comet_api.txt
+ done
+
+ python calc_avg_comet.py $dir_path/comet_api.txt
+
+ # cd $dir_path/aligned
+ # file_nums=$(ls test.*-s | sort -n -t . -k 2 | xargs wc -l | head -n -1 | awk '{ print $1 }')
+ # echo $file_nums
+
+ # cat $(ls test.*-t | sort -n -t . -k 2) > $dir_path/whole.hyp
+ # cat $(ls test.*-s | sort -n -t . -k 2) > $dir_path/whole.src
+
+ # sh /data/wyt/codes/DocMTAgent/consistency_evaluation/run_eval_sep.sh \
+ #     $dir_path/whole.src $dir_path/whole.hyp $dir_path/consistency $lang "$file_nums"
+
+ # cd $work_dir
+ # python indiversity_sep.py -r $dir_path/whole.src.record_sep.json | tee -a $dir_path/whole.src.consistency_sep
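Illustrative invocation ($1 is the directory holding the translated documents, $2 the language pair; a COMET service must be reachable at the host:port hard-coded above):

    bash run_eval_comet_api.sh results/en-zh_600 en-zh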
run_eval_robust.sh ADDED
@@ -0,0 +1,47 @@
+ #!/bin/bash
+
+
+ # lang=en-de
+ # lang=en-zh
+ lang=$2
+ src_lang=${lang%%-*}
+ tgt_lang=${lang##*-}
+
+ dir_path=$1
+ level=$3
+
+ data_path=/data/wyt/codes/DocDPO/data/2017-01-ted-test/$lang
+ # align_script_path=/data/wyt/codes/DocMTAgent/Bleualign
+ # dir_path=/data/wyt/codes/DocDPO/inference_mcts_shorten_rag_entity_tldr_vq_lowerbound_comet/results/window10_epc2/en-zh
+ # dir_path=/data/wyt/codes/DocDPO/inference_mcts_shorten_rag_entity_tldr_vq_lowerbound_comet/results/window10_epc1/en-zh
+ # dir_path=/data/wyt/codes/DocDPO/inference_mcts_shorten_rag_entity_tldr_vq_lowerbound_comet/results/window10_epc1_trans14b/en-zh
+ echo $dir_path
+
+ python preprocess_robust.py \
+     --original_src_path /data/wyt/codes/DocDPO/data/2017-01-ted-test/${src_lang}-${tgt_lang} \
+     --disturb_src_path /data/wyt/codes/DocDPO/data/ted_robust/level_${level}/${src_lang}-${tgt_lang} \
+     --tgt_path results_robust/level_${level}/${src_lang}-${tgt_lang}_1200 \
+     --output_path results_robust/level_${level}/${src_lang}-${tgt_lang}_1200/tmp_robust \
+     --lang_pair $lang
+
+ for i in {0..11}; do
+     source=$data_path/IWSLT17.TED.tst2017.$lang.$src_lang.$i
+     target=results_robust/level_${level}/${src_lang}-${tgt_lang}_1200/tmp_robust/IWSLT17.TED.tst2017.$lang.$src_lang.$i.$tgt_lang
+     reference=$data_path/IWSLT17.TED.tst2017.$lang.$tgt_lang.$i
+     python comet_api.py -s $source -t $target -r $reference -u 127.0.0.1:8088 >> $dir_path/comet_api.txt
+ done
+
+ python calc_avg_comet.py $dir_path/comet_api.txt
+
+ # cd $dir_path/aligned
+ # file_nums=$(ls test.*-s | sort -n -t . -k 2 | xargs wc -l | head -n -1 | awk '{ print $1 }')
+ # echo $file_nums
+
+ # cat $(ls test.*-t | sort -n -t . -k 2) > $dir_path/whole.hyp
+ # cat $(ls test.*-s | sort -n -t . -k 2) > $dir_path/whole.src
+
+ # sh /data/wyt/codes/DocMTAgent/consistency_evaluation/run_eval_sep.sh \
+ #     $dir_path/whole.src $dir_path/whole.hyp $dir_path/consistency $lang "$file_nums"
+
+ # cd $work_dir
+ # python indiversity_sep.py -r $dir_path/whole.src.record_sep.json | tee -a $dir_path/whole.src.consistency_sep
run_merge_fix.sh ADDED
@@ -0,0 +1,24 @@
+ #!/bin/bash
+
+ template_config="merge_template.yaml"
+ output_config="merge.yaml"
+ adapter_dir="dpo/adapter"
+ merged_dir="dpo/merged"
+ # model_path=/data/wyt/codes/DocDPO/sft/checkpoints_llama_factory/ted_react_sft_balanced_428/checkpoint-600
+ # model_path=/data/wyt/codes/DocDPO/sft/checkpoints_llama_factory/ted_react_sft_trans_base_sample_balanced_428/checkpoint-600
+ model_path=$1
+
+ for dir in $adapter_dir/checkpoint-*; do
+     ckpt=$(basename $dir)
+     echo $ckpt
+     num=${ckpt#checkpoint-}
+     echo $num
+     if { [ -z "$2" ] && [ -z "$3" ]; } || { [ "$num" -ge "$2" ] && [ "$num" -le "$3" ]; }; then
+         mkdir -p $merged_dir/$ckpt
+         cp $template_config $merged_dir/$ckpt/$output_config
+         sed -i "s|adapter_name_or_path:.*|adapter_name_or_path: $adapter_dir/$ckpt|" "$merged_dir/$ckpt/$output_config"
+         sed -i "s|model_name_or_path:.*|model_name_or_path: $model_path|" "$merged_dir/$ckpt/$output_config"
+         sed -i "s|export_dir:.*|export_dir: $merged_dir/$ckpt|" "$merged_dir/$ckpt/$output_config"
+         llamafactory-cli export $merged_dir/$ckpt/$output_config
+     fi
+ done
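run_train.sh below calls this (commented out) to merge only the adapters in a checkpoint range:

    CUDA_VISIBLE_DEVICES=0 zsh run_merge_fix.sh /data/wyt/codes/DocDPO/sft/checkpoints_multilang/ted_base_balanced_en_zhdefr_320/sft 600 800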
run_train.sh ADDED
@@ -0,0 +1,26 @@
+
+
+ source ~/.zshrc
+ conda activate llama-factory
+
+ # echo "[$(date)] SFT Training Start"
+ # CUDA_VISIBLE_DEVICES=0,1,2,3 FORCE_TORCHRUN=1 llamafactory-cli train qwen2.5_full_sft.yaml > logs/train_sft.log 2>&1 && \
+ # echo "[$(date)] SFT Training End"
+
+ # echo "[$(date)] DPO Training Start"
+ # CUDA_VISIBLE_DEVICES=0,1,2,3 FORCE_TORCHRUN=1 llamafactory-cli train qwen2.5_lora_dpo.yaml > logs/train_dpo.log 2>&1 && \
+ # echo "[$(date)] DPO Training End"
+
+ # echo "[$(date)] Merging Checkpoints"
+ # CUDA_VISIBLE_DEVICES=0 zsh run_merge_fix.sh /data/wyt/codes/DocDPO/sft/checkpoints_multilang/ted_base_balanced_en_zhdefr_320/sft 600 800 > logs/merge.log 2>&1 && \
+ # echo "[$(date)] Merging Checkpoints End"
+
+ # conda activate optima-vllm
+
+ echo "[$(date)] Inference Start"
+ CUDA_VISIBLE_DEVICES=0 zsh infer.sh 0 true 600 /data/wyt/codes/DocDPO/sft/checkpoints_multilang/ted_base_balanced_en_zhdefr_320/dpo/merged/checkpoint-600 en-de > logs/infer_600_en-de.log 2>&1 &
+ CUDA_VISIBLE_DEVICES=1 zsh infer.sh 1 true 600 /data/wyt/codes/DocDPO/sft/checkpoints_multilang/ted_base_balanced_en_zhdefr_320/dpo/merged/checkpoint-600 en-fr > logs/infer_600_en-fr.log 2>&1 &
+ CUDA_VISIBLE_DEVICES=2 zsh infer.sh 2 true 800 /data/wyt/codes/DocDPO/sft/checkpoints_multilang/ted_base_balanced_en_zhdefr_320/dpo/merged/checkpoint-800 en-fr > logs/infer_800_en-fr.log 2>&1 &
+ CUDA_VISIBLE_DEVICES=3 zsh infer.sh 3 true 1000 /data/wyt/codes/DocDPO/sft/checkpoints_multilang/ted_base_balanced_en_zhdefr_320/dpo/merged/checkpoint-1000 en-fr > logs/infer_1000_en-fr.log 2>&1 &
+ wait  # the four jobs above run in the background; block until they all finish
+ echo "[$(date)] Inference End"
test_api.py ADDED
@@ -0,0 +1,28 @@
+ from openai import OpenAI
+ import sys
+
+
+ def main():
+
+     openai_api_key = "EMPTY"
+     api_base = f"http://{sys.argv[1]}/v1"
+
+     client = OpenAI(
+         api_key=openai_api_key,
+         base_url=api_base,
+     )
+
+     try:
+         completion = client.chat.completions.create(
+             model='qwen',
+             messages=[{'role': 'user', 'content': 'Hello!'}],
+             timeout=15
+         )
+
+         print(completion.choices[0].message.content)
+     except Exception:
+         sys.exit(1)
+
+
+ if __name__ == '__main__':
+     main()
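Illustrative probe of a local vLLM server (address as in infer.sh); exit status 0 means the endpoint answered:

    python test_api.py 127.0.0.1:8010 && echo "API up"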
vllm_1000.log ADDED
The diff for this file is too large to render. See raw diff
 
vllm_1000_2.log ADDED
The diff for this file is too large to render. See raw diff
 
vllm_1200.log ADDED
The diff for this file is too large to render. See raw diff
 
vllm_1400.log ADDED
The diff for this file is too large to render. See raw diff
 
vllm_1600.log ADDED
The diff for this file is too large to render. See raw diff
 
vllm_1800.log ADDED
@@ -0,0 +1,158 @@
+ INFO 09-18 14:31:14 [__init__.py:241] Automatically detected platform cuda.
+ (APIServer pid=3508930) INFO 09-18 14:31:16 [api_server.py:1805] vLLM API server version 0.10.1.1
+ (APIServer pid=3508930) INFO 09-18 14:31:16 [utils.py:326] non-default args: {'model_tag': '/data/wyt/codes/DocDPO/sft/checkpoints_multilang/ted_base_balanced_en_zhdefr_320/dpo/merged/checkpoint-1800', 'host': '0.0.0.0', 'port': 8011, 'model': '/data/wyt/codes/DocDPO/sft/checkpoints_multilang/ted_base_balanced_en_zhdefr_320/dpo/merged/checkpoint-1800', 'served_model_name': ['qwen'], 'enable_prefix_caching': True}
+ (APIServer pid=3508930) INFO 09-18 14:31:22 [__init__.py:711] Resolved architecture: Qwen2ForCausalLM
+ (APIServer pid=3508930) `torch_dtype` is deprecated! Use `dtype` instead!
+ (APIServer pid=3508930) INFO 09-18 14:31:22 [__init__.py:1750] Using max model len 32768
+ (APIServer pid=3508930) INFO 09-18 14:31:23 [scheduler.py:222] Chunked prefill is enabled with max_num_batched_tokens=8192.
+ INFO 09-18 14:31:27 [__init__.py:241] Automatically detected platform cuda.
+ (EngineCore_0 pid=3509752) INFO 09-18 14:31:29 [core.py:636] Waiting for init message from front-end.
+ (EngineCore_0 pid=3509752) INFO 09-18 14:31:29 [core.py:74] Initializing a V1 LLM engine (v0.10.1.1) with config: model='/data/wyt/codes/DocDPO/sft/checkpoints_multilang/ted_base_balanced_en_zhdefr_320/dpo/merged/checkpoint-1800', speculative_config=None, tokenizer='/data/wyt/codes/DocDPO/sft/checkpoints_multilang/ted_base_balanced_en_zhdefr_320/dpo/merged/checkpoint-1800', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config={}, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=32768, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, device_config=cuda, decoding_config=DecodingConfig(backend='auto', disable_fallback=False, disable_any_whitespace=False, disable_additional_properties=False, reasoning_backend=''), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None), seed=0, served_model_name=qwen, enable_prefix_caching=True, chunked_prefill_enabled=True, use_async_output_proc=True, pooler_config=None, compilation_config={"level":3,"debug_dump_path":"","cache_dir":"","backend":"","custom_ops":[],"splitting_ops":["vllm.unified_attention","vllm.unified_attention_with_output","vllm.mamba_mixer2"],"use_inductor":true,"compile_sizes":[],"inductor_compile_config":{"enable_auto_functionalized_v2":false},"inductor_passes":{},"cudagraph_mode":1,"use_cudagraph":true,"cudagraph_num_of_warmups":1,"cudagraph_capture_sizes":[512,504,496,488,480,472,464,456,448,440,432,424,416,408,400,392,384,376,368,360,352,344,336,328,320,312,304,296,288,280,272,264,256,248,240,232,224,216,208,200,192,184,176,168,160,152,144,136,128,120,112,104,96,88,80,72,64,56,48,40,32,24,16,8,4,2,1],"cudagraph_copy_inputs":false,"full_cuda_graph":false,"pass_config":{},"max_capture_size":512,"local_cache_dir":null}
+ (EngineCore_0 pid=3509752) INFO 09-18 14:31:30 [parallel_state.py:1134] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, TP rank 0, EP rank 0
+ (EngineCore_0 pid=3509752) WARNING 09-18 14:31:30 [topk_topp_sampler.py:61] FlashInfer is not available. Falling back to the PyTorch-native implementation of top-p & top-k sampling. For the best performance, please install FlashInfer.
+ (EngineCore_0 pid=3509752) INFO 09-18 14:31:30 [gpu_model_runner.py:1953] Starting to load model /data/wyt/codes/DocDPO/sft/checkpoints_multilang/ted_base_balanced_en_zhdefr_320/dpo/merged/checkpoint-1800...
+ (EngineCore_0 pid=3509752) INFO 09-18 14:31:30 [gpu_model_runner.py:1985] Loading model from scratch...
+ (EngineCore_0 pid=3509752) INFO 09-18 14:31:30 [cuda.py:328] Using Flash Attention backend on V1 engine.
+ (EngineCore_0 pid=3509752)
+ (EngineCore_0 pid=3509752)
+ (EngineCore_0 pid=3509748)
+ (EngineCore_0 pid=3509744)
+ (EngineCore_0 pid=3509748)
+ (EngineCore_0 pid=3509748)
+ (EngineCore_0 pid=3509748)
+ (EngineCore_0 pid=3509744) INFO 09-18 14:31:37 [default_loader.py:262] Loading weights took 6.39 seconds
+ (EngineCore_0 pid=3509744) INFO 09-18 14:31:37 [gpu_model_runner.py:2007] Model loading took 14.2488 GiB and 6.587665 seconds
+ (EngineCore_0 pid=3509744) INFO 09-18 14:31:44 [backends.py:548] Using cache directory: /data/wyt/.cache/vllm/torch_compile_cache/1fe949e292/rank_0_0/backbone for vLLM's torch.compile
+ (EngineCore_0 pid=3509744) INFO 09-18 14:31:44 [backends.py:559] Dynamo bytecode transform time: 6.39 s
+ (EngineCore_0 pid=3509744) INFO 09-18 14:31:49 [backends.py:161] Directly load the compiled graph(s) for dynamic shape from the cache, took 4.697 s
+ (EngineCore_0 pid=3509744) INFO 09-18 14:31:52 [monitor.py:34] torch.compile takes 6.39 s in total
+ (EngineCore_0 pid=3509744) INFO 09-18 14:31:53 [gpu_worker.py:276] Available KV cache memory: 51.38 GiB
+ (EngineCore_0 pid=3509744) INFO 09-18 14:31:53 [kv_cache_utils.py:849] GPU KV cache size: 962,112 tokens
+ (EngineCore_0 pid=3509744) INFO 09-18 14:31:53 [kv_cache_utils.py:853] Maximum concurrency for 32,768 tokens per request: 29.36x
+ (EngineCore_0 pid=3509744)
+ (EngineCore_0 pid=3509744) INFO 09-18 14:31:56 [gpu_model_runner.py:2708] Graph capturing finished in 3 secs, took 1.56 GiB
+ (EngineCore_0 pid=3509744) INFO 09-18 14:31:56 [core.py:214] init engine (profile, create kv cache, warmup model) took 19.28 seconds
+ (APIServer pid=3508927) INFO 09-18 14:31:57 [loggers.py:142] Engine 000: vllm cache_config_info with initialization after num_gpu_blocks is: 60132
+ (APIServer pid=3508927) INFO 09-18 14:31:57 [api_server.py:1611] Supported_tasks: ['generate']
+ (APIServer pid=3508927) WARNING 09-18 14:31:57 [__init__.py:1625] Default sampling parameters have been overridden by the model's Hugging Face generation config recommended from the model creator. If this is not intended, please relaunch vLLM instance with `--generation-config vllm`.
+ (APIServer pid=3508927) INFO 09-18 14:31:57 [serving_responses.py:120] Using default chat sampling params from model: {'repetition_penalty': 1.05, 'temperature': 0.7, 'top_k': 20, 'top_p': 0.8}
+ (APIServer pid=3508927) INFO 09-18 14:31:57 [serving_chat.py:134] Using default chat sampling params from model: {'repetition_penalty': 1.05, 'temperature': 0.7, 'top_k': 20, 'top_p': 0.8}
+ (APIServer pid=3508927) INFO 09-18 14:31:57 [serving_completion.py:77] Using default completion sampling params from model: {'repetition_penalty': 1.05, 'temperature': 0.7, 'top_k': 20, 'top_p': 0.8}
+ (APIServer pid=3508927) INFO 09-18 14:31:57 [api_server.py:1880] Starting vLLM API server 0 on http://0.0.0.0:8012
+ (APIServer pid=3508927) INFO 09-18 14:31:57 [launcher.py:36] Available routes are:
+ (APIServer pid=3508927) INFO 09-18 14:31:57 [launcher.py:44] Route: /openapi.json, Methods: HEAD, GET
+ (APIServer pid=3508927) INFO 09-18 14:31:57 [launcher.py:44] Route: /docs, Methods: HEAD, GET
+ (APIServer pid=3508927) INFO 09-18 14:31:57 [launcher.py:44] Route: /docs/oauth2-redirect, Methods: HEAD, GET
+ (APIServer pid=3508927) INFO 09-18 14:31:57 [launcher.py:44] Route: /redoc, Methods: HEAD, GET
+ (APIServer pid=3508927) INFO 09-18 14:31:57 [launcher.py:44] Route: /health, Methods: GET
+ (APIServer pid=3508927) INFO 09-18 14:31:57 [launcher.py:44] Route: /load, Methods: GET
+ (APIServer pid=3508927) INFO 09-18 14:31:57 [launcher.py:44] Route: /ping, Methods: POST
+ (APIServer pid=3508927) INFO 09-18 14:31:57 [launcher.py:44] Route: /ping, Methods: GET
+ (APIServer pid=3508927) INFO 09-18 14:31:57 [launcher.py:44] Route: /tokenize, Methods: POST
+ (APIServer pid=3508927) INFO 09-18 14:31:57 [launcher.py:44] Route: /detokenize, Methods: POST
+ (APIServer pid=3508927) INFO 09-18 14:31:57 [launcher.py:44] Route: /v1/models, Methods: GET
+ (APIServer pid=3508927) INFO 09-18 14:31:57 [launcher.py:44] Route: /version, Methods: GET
+ (APIServer pid=3508927) INFO 09-18 14:31:57 [launcher.py:44] Route: /v1/responses, Methods: POST
+ (APIServer pid=3508927) INFO 09-18 14:31:57 [launcher.py:44] Route: /v1/responses/{response_id}, Methods: GET
+ (APIServer pid=3508927) INFO 09-18 14:31:57 [launcher.py:44] Route: /v1/responses/{response_id}/cancel, Methods: POST
+ (APIServer pid=3508927) INFO 09-18 14:31:57 [launcher.py:44] Route: /v1/chat/completions, Methods: POST
+ (APIServer pid=3508927) INFO 09-18 14:31:57 [launcher.py:44] Route: /v1/completions, Methods: POST
+ (APIServer pid=3508927) INFO 09-18 14:31:57 [launcher.py:44] Route: /v1/embeddings, Methods: POST
+ (APIServer pid=3508927) INFO 09-18 14:31:57 [launcher.py:44] Route: /pooling, Methods: POST
+ (APIServer pid=3508927) INFO 09-18 14:31:57 [launcher.py:44] Route: /classify, Methods: POST
+ (APIServer pid=3508927) INFO 09-18 14:31:57 [launcher.py:44] Route: /score, Methods: POST
+ (APIServer pid=3508927) INFO 09-18 14:31:57 [launcher.py:44] Route: /v1/score, Methods: POST
+ (APIServer pid=3508927) INFO 09-18 14:31:57 [launcher.py:44] Route: /v1/audio/transcriptions, Methods: POST
+ (APIServer pid=3508927) INFO 09-18 14:31:57 [launcher.py:44] Route: /v1/audio/translations, Methods: POST
+ (APIServer pid=3508927) INFO 09-18 14:31:57 [launcher.py:44] Route: /rerank, Methods: POST
+ (APIServer pid=3508927) INFO 09-18 14:31:57 [launcher.py:44] Route: /v1/rerank, Methods: POST
+ (APIServer pid=3508927) INFO 09-18 14:31:57 [launcher.py:44] Route: /v2/rerank, Methods: POST
+ (APIServer pid=3508927) INFO 09-18 14:31:57 [launcher.py:44] Route: /scale_elastic_ep, Methods: POST
+ (APIServer pid=3508927) INFO 09-18 14:31:57 [launcher.py:44] Route: /is_scaling_elastic_ep, Methods: POST
+ (APIServer pid=3508927) INFO 09-18 14:31:57 [launcher.py:44] Route: /invocations, Met(APIServer pid=3508933) INFO 09-18 14:32:00 [chat_utils.py:470] Detected the chat template content format to (APIServer pid=3508930) INFO 09-18 14:32:00 [chat_utils.py:470] Detected the chat template content format to be 'string'. You can set `--chat-template-content-format` to ov(APIServer pid=3508933) INFO 09-18 14:32:07 [loggers.py:123] Engine 000: Avg prompt throughput: 47.2 tokens/s(APIServer pid=3508930) INFO 09-18 14:32:07 [loggers.py:123] Engine 000: Avg prompt throughput: 47.2 tokens/s, Avg gene(APIServer pid=3508933) INFO: 127.0.0.1:56172 - "POST /v1/chat/completions HTTP/1.1" 200 OK
+ (APIServer pid=3508933) INFO: 127.0.0.1:56172 - "POST /v1/chat/completions HTTP/1.1" 200 OK
+ (APIServer pid=3508933) INFO: 127.0.0.1:56172 - "POST /v1/chat/completions HTTP/1.1" 200 OK
+ (APIServer pid=3508933) INFO: 127.0.0.1:56172 - "POST /v1/chat/completions HTTP/1.1" 200 OK
+ (APIServer pid=3508933) INFO: 127.0.0.1:56172 - "POST /v1/chat/completions HTTP/1.1" 200 OK
+ (APIServer pid=3508933) INFO: 127.0.0.1:56172 - "POST /v1/chat/completions HTTP/1.1" 200 OK
+ (APIServer pid=3508933) INFO: 127.0.0.1:56172 - "POST /v1/chat/completions HTTP/1.1" 200 OK
+ (APIServer pid=3508933) INFO: 127.0.0.1:56172 - "POST /v1/chat/completions HTTP/1.1" 200 OK
+ (APIServer pid=3508933) INFO: 127.0.0.1:56172 - "POST /v1/chat/completions HTTP/1.1" 200 OK
+ (APIServer pid=3508933) INFO: 127.0.0.1:56172 - "POST /v1/chat/completions HTTP/1.1" 200 OK
+ (APIServer pid=3508933) INFO: 127.0.0.1:56172 - "POST /v1/chat/completions HTTP/1.1" 200 OK
+ (APIServer pid=3508933) INFO: 127.0.0.1:56172 - "POST /v1/chat/completions HTTP/1.1" 200 OK
+ (APIServer pid=3508933) INFO: 127.0.0.1:56172 - "POST /v1/chat/completions HTTP/1.1" 200 OK
+ (APIServer pid=3508933) INFO 09-18 14:32:17 [loggers.py:123] Engine 000: Avg prompt throughput: 608.0 tokens/(APIServer pid=3508930) INFO 09-18 14:32:17 [loggers.py:123] Engine 000: Avg prompt throughput: 608.0 tokens/s, Avg gener(APIServer pid=3508933) INFO: 127.0.0.1:56172 - "POST /v1/chat/completions HTTP/1.1" 200 OK
+ (APIServer pid=3508933) INFO: 127.0.0.1:56172 - "POST /v1/chat/completions HTTP/1.1" 200 OK
+ (APIServer pid=3508933) INFO: 127.0.0.1:56172 - "POST /v1/chat/completions HTTP/1.1" 200 OK
+ (APIServer pid=3508933) INFO: 127.0.0.1:56172 - "POST /v1/chat/completions HTTP/1.1" 200 OK
+ (APIServer pid=3508933) INFO: 127.0.0.1:56172 - "POST /v1/chat/completions HTTP/1.1" 200 OK
+ (APIServer pid=3508933) INFO 09-18 14:32:27 [loggers.py:123] Engine 000: Avg prompt throughput: 195.2 tokens/(APIServer pid=3508930) INFO 09-18 14:32:27 [loggers.py:123] Engine 000: Avg prompt throughput: 195.2 tokens/s, Avg gener(APIServer pid=3508933) INFO: 127.0.0.1:56172 - "POST /v1/chat/completions HTTP/1.1" 200 OK
+ (APIServer pid=3508933) INFO: 127.0.0.1:39414 - "POST /v1/chat/completions HTTP/1.1" 200 OK
+ (APIServer pid=3508933) INFO: 127.0.0.1:56172 - "POST /v1/chat/completions HTTP/1.1" 200 OK
+ (APIServer pid=3508933) INFO: 127.0.0.1:39414 - "POST /v1/chat/completions HTTP/1.1" 200 OK
+ (APIServer pid=3508933) INFO 09-18 14:32:37 [loggers.py:123] Engine 000: Avg prompt throughput: 325.8 tokens/(APIServer pid=3508930) INFO 09-18 14:32:37 [loggers.py:123] Engine 000: Avg prompt throughput: 325.8 tokens/s, Avg gener(APIServer pid=3508933) INFO: 127.0.0.1:56172 - "POST /v1/chat/completions HTTP/1.1" 200 OK
+ (APIServer pid=3508933) INFO: 127.0.0.1:56172 - "POST /v1/chat/completions HTTP/1.1" 200 OK
+ (APIServer pid=3508933) INFO: 127.0.0.1:39414 - "POST /v1/chat/completions HTTP/1.1" 200 OK
+ (APIServer pid=3508933) INFO: 127.0.0.1:56172 - "POST /v1/chat/completions HTTP/1.1" 200 OK
+ (APIServer pid=3508933) INFO: 127.0.0.1:56172 - "POST /v1/chat/completions HTTP/1.1" 200 OK
+ (APIServer pid=3508933) INFO: 127.0.0.1:56172 - "POST /v1/chat/completions HTTP/1.1" 200 OK
+ (APIServer pid=3508933) INFO: 127.0.0.1:56172 - "POST /v1/chat/completions HTTP/1.1" 200 OK
+ (APIServer pid=3508933) INFO: 127.0.0.1:56172 - "POST /v1/chat/completions HTTP/1.1" 200 OK
+ (APIServer pid=3508933) INFO: 127.0.0.1:56172 - "POST /v1/chat/completions HTTP/1.1" 200 OK
+ (APIServer pid=3508933) INFO: 127.0.0.1:56172 - "POST /v1/chat/completions HTTP/1.1" 200 OK
+ (APIServer pid=3508933) INFO: 127.0.0.1:56172 - "POST /v1/chat/completions HTTP/1.1" 200 OK
+ (APIServer pid=3508933) INFO 09-18 14:32:47 [loggers.py:123] Engine 000: Avg prompt throughput: 694.0 tokens/(APIServer pid=3508930) INFO 09-18 14:32:47 [loggers.py:123] Engine 000: Avg prompt throughput: 694.1 tokens/s, Avg gener(APIServer pid=3508933) INFO: 127.0.0.1:56172 - "POST /v1/chat/completions HTTP/1.1" 200 OK
+ (APIServer pid=3508933) INFO: 127.0.0.1:56172 - "POST /v1/chat/completions HTTP/1.1" 200 OK
+ (APIServer pid=3508933) INFO: 127.0.0.1:56172 - "POST /v1/chat/completions HTTP/1.1" 200 OK
+ (APIServer pid=3508933) INFO: 127.0.0.1:56172 - "POST /v1/chat/completions HTTP/1.1" 200 OK
+ (APIServer pid=3508933) INFO: 127.0.0.1:56172 - "POST /v1/chat/completions HTTP/1.1" 200 OK
+ (APIServer pid=3508933) INFO: 127.0.0.1:56172 - "POST /v1/chat/completions HTTP/1.1" 200 OK
+ (APIServer pid=3508933) INFO: 127.0.0.1:56172 - "POST /v1/chat/completions HTTP/1.1" 200 OK
+ (APIServer pid=3508933) INFO: 127.0.0.1:56172 - "POST /v1/chat/completions HTTP/1.1" 200 OK
+ (APIServer pid=3508933) INFO: 127.0.0.1:48072 - "POST /v1/chat/completions HTTP/1.1" 200 OK
+ (APISe(APIServer pid=3508930) INFO: 127.0.0.1:41594 - "POST /v1/chat/completions HTTP/1.1" 200 OK
+ (APIServer pid=3508930) INFO 09-18 14:32:57 [loggers.py:123] Engine 000: Avg prompt throughput: 373.7 tokens/s, Avg gener(APIServer pid=3508933) INFO: 127.0.0.1:56172 - "POST /v1/chat/completions HTTP/1.1" 200 OK
+ (APIServer pid=3508933) INFO: 127.0.0.1:56172 - "POST /v1/chat/completions HTTP/1.1" 200 OK
+ (APIServer pid=3508933) INFO 09-18 14:33:07 [loggers.py:123] Engine 000: Avg prompt throughput: 249.6 tokens/(APIServer pid=3508930) INFO 09-18 14:33:07 [loggers.py:123] Engine 000: Avg prompt throughput: 209.7 tokens/s, Avg gener(APIServer pid=3508933) INFO: 127.0.0.1:45372 - "POST /v1/chat/completions HTTP/1.1" 200 OK
+ (APIServer pid=3508933) INFO: 127.0.0.1:56172 - "POST /v1/chat/completions HTTP/1.1" 200 OK
+ (APIServer pid=3508933) INFO: 127.0.0.1:56172 - "POST /v1/chat/completions HTTP/1.1" 200 OK
+ (APIServer pid=3508933) INFO: 127.0.0.1:45372 - "POST /v1/chat/completions HTTP/1.1" 200 OK
+ (APIServer pid=3508933) INFO 09-18 14:33:17 [loggers.py:123] Engine 000: Avg prompt throughput: 356.6 tokens/(APIServer pid=3508930) INFO 09-18 14:33:17 [loggers.py:123] Engine 000: Avg prompt throughput: 356.6 tokens/s, Avg generation throughput: 84.5 tokens/s, Running: 1 reqs, Waiting: 0 reqs, GPU KV cache usage: 0.1%, Prefix cache hit(APIServer pid=3508933) INFO: 127.0.0.1:56172 - "POST /v1/chat/completions HTTP/1.1" 200 OK
+ (APIServer pid=3508933) INFO: 127.0.0.1:56172 - "POST /v1/chat/completions HTTP/1.1" 200 OK
+ (APIServer pid=3508933) INFO: 127.0.0.1:56172 - "POST /v1/chat/completions HTTP/1.1" 200 OK
+ (APIServer pid=3508933) INFO: 127.0.0.1:56172 - "POST /v1/chat/completions HTTP/1.1" 200 OK
+ (APIServer pid=3508933) INFO: 127.0.0.1:56172 - "POST /v1/chat/completions HTTP/1.1" 200 OK
+ (APISe(APIServer pid=3508930) INFO: 127.0.0.1:32922 - "POST /v1/chat/completions HTTP(APIServer pid=3508933) INFO: 127.0.0.1:56172 - "POST /v1/chat/completions HTTP/1.1" 200 OK
+ (APISe(APIServer pid=3508930) INFO: 127.0.0.1:32922 - "POST /v1/chat/completions HTTP(APISe(APIServer pid=3508930) INFO: 127.0.0.1:32922 - "POST /v1/chat/completions HTTP(APISe(APIServer pid=3508930) INFO: 127.0.0.1:32922 - "POST /v1/chat/completions HTTP(APIServer pid=3508933) INFO: 127.0.0.1:56172 - "POST /v1/chat/completions HTTP/1.1" 200 OK
+ (APISe(APIServer pid=3508930) INFO: 127.0.0.1:32922 - "POST /v1/chat/completions HTTP(APIServer pid=3508933) INFO: 127.0.0.1:56172 - "POST /v1/chat/completions HTTP/1.1" 200 OK
+ (APISe(APIServer pid=3508930) INFO: 127.0.0.1:32922 - "POST /v1/chat/completions HTTP(APIServer pid=3508933) INFO: 127.0.0.1:56172 - "POST /v1/chat/completions HTTP/1.1" 200 OK
+ (APISe(APIServer pid=3508930) INFO: 127.0.0.1:32922 - "POST /v1/chat/completions HTTP(APIServer pid=3508933) INFO: 127.0.0.1:56172 - "POST /v1/chat/completions HTTP/1.1" 200 OK
+ (APISe(APIServer pid=3508930) INFO: 127.0.0.1:32922 - "POST /v1/chat/completions HTTP(APIServer pid=3508933) INFO: 127.0.0.1:56172 - "POST /v1/chat/completions HTTP/1.1" 200 OK
+ (APISe(APIServer pid=3508930) INFO: 127.0.0.1:32922 - "POST /v1/chat/completions HTTP(APIServer pid=3508933) INFO 09-18 14:33:27 [loggers.py:123] Engine 000: Avg prompt throughput: 786.5 tokens/(APIServer pid=3508930) INFO 09-18 14:33:27 [loggers.py:123] Engine 000: Avg prompt throughput: 786.5 tokens/s, Avg gener(APIServer pid=3508933) INFO: 127.0.0.1:56172 - "POST /v1/chat/completions HTTP/1.1" 200 OK
+ (APISe(APIServer pid=3508930) INFO: 127.0.0.1:32922 - "POST /v1/chat/completions HTTP(APIServer pid=3508933) INFO: 127.0.0.1:56172 - "POST /v1/chat/completions HTTP/1.1" 200 OK
+ (APIServer pid=3508933) INFO: 127.0.0.1:56172 - "POST /v1/chat/completions HTTP/1.1" 200 OK
+ (APIServer pid=3508933) INFO: 127.0.0.1:54502 - "POST /v1/chat/completions HTTP/1.1" 200 OK
+ (APIServer pid=3508933) INFO 09-18 14:33:37 [loggers.py:123] Engine 000: Avg prompt throughput: 253.4 tokens/(APIServer pid=3508930) INFO 09-18 14:33:37 [loggers.py:123] Engine 000: Avg prompt throughput: 253.4 tokens/s, Avg gener(APIServer pid=3508933) INFO: 127.0.0.1:56172 - "POST /v1/chat/completions HTTP/1.1" 200 OK
+ (APIServer pid=3508933) INFO: 127.0.0.1:54502 - "POST /v1/chat/completions HTTP/1.1" 200 OK
+ (APIServer pid=3508933) INFO: 127.0.0.1:56172 - "POST /v1/chat/completions HTTP/1.1" 200 OK
+ (APIServer pid=3508933) INFO: 127.0.0.1:56172 - "POST /v1/chat/completions HTTP/1.1" 200 OK
+ (APIServer pid=3508933) INFO: 127.0.0.1:56172 - "POST /v1/chat/completions HTTP/1.1" 200 OK
+ (APIServer pid=3508933) INFO: 127.0.0.1:56172 - "POST /v1/chat/completions HTTP/1.1" 200 OK
+ (APISe(APIServer pid=3508930) INFO 09-18 14:33:47 [loggers.py:123] Engine 000: Avg prompt throughput: 376.9 tokens/s, Avg generation throughput: 81.1 tokens/s, Running: 1 reqs, Waiting: 0 reqs, GPU KV cache usage: 0.1%, Prefix cache hit(APIServer pid=3508933) INFO: 127.0.0.1:36442 - "POST /v1/chat/completions HTTP/1.1" 200 OK
+ (APIServer pid=3508933) INFO: 127.0.0.1:56172 - "POST /v1/chat/completions HTTP/1.1" 200 OK
+ (APIServer pid=3508933) INFO: 127.0.0.1:56172 - "POST /v1/chat/completions HTTP/1.1" 200 OK
+ (APISe(APIServer pid=3508930) INFO: 127.0.0.1:32922 - "POST /v1/chat/completions HTTP(APISe(APIServer pid=3508930) INFO: 127.0.0.1:32922 - "POST /v1/chat/completions HTTP/1.1" 200 OK
+ (APIServer pid=3508930) INFO: 127.0.0.1:32922 - "POST /v1/chat/completions HTTP/1.1" 200 OK
+ (APIServer pid=3508930) INFO: 127.0.0.1:32922 - "POST /v1/chat/completions HTTP/1.1" 200 OK
+ (APIServer pid=3508930) INFO: 127.0.0.1:32922 - "POST /v1/chat/completions HTTP/1.1" 200 OK
+ (APIServer pid=3508930) INFO: 127.0.0.1:32922 - "POST /v1/chat/completions HTTP(APIServer pid=3508933) INFO: 127.0.0.1:56172 - "POST /v1/chat/completions HTTP/1.1" 200 OK
+ (APISe(APIServer pid=3508930) INFO: 127.0.0.1:32922 - "POST /v1/chat/completions HTTP(APIServer pid=3508933) INFO: 127.0.0.1:56172 - "POST /v1/chat/completions HTTP/1.1" 200 OK
+ (APISe(APIServer pid=3508930) INFO: 127.0.0.1:32922 - "POST /v1/chat/completions HTTP(APIServer pid=3508933) INFO 09-18 14:33:57 [loggers.py:123] Engine 000: Avg prompt throughput: 848.9 tokens/(APIServer pid=3508930) INFO 09-18 14:33:57 [loggers.py:123] Engine 000: Avg prompt throughput: 849.0 tokens/s, Avg gener(APIServer pid=3508933) INFO: 127.0.0.1:56172 - "POST /v1/chat/completions HTTP/1.1" 200 OK
+ (APISe(APIServer pid=3508930) INFO: 127.0.0.1:32922 - "POST /v1/chat/completions HTTP(APIServer pid=3508933) INFO: 127.0.0.1:56172 - "POST /v1/chat/completions HTTP/1.1" 200 OK
+ (APIServer pid=3508933) INFO: 127.0.0.1:56172 - "POST /v1/chat/completions HTTP/1.1" 200 OK
+ (APIServer pid=3508933) INFO: 127.0.0.1:35254 - "POST /v1/chat/completions HTTP/1.1" 200 OK
+ (APIServer pid=3508933) INFO 09-18 14:34:07 [loggers.py:123] Engine 000: Avg prompt throughput: 269.5 tokens/(APIServer pid=3508930) INFO 09-18 14:34:07 [loggers.py:123] Engine 000: Avg prompt throughput: 269.5 tokens/s, Avg gener(APIServer pid=3508933) INFO: 127.0.0.1:56172 - "POST /v1/chat/completions HTTP/1.1" 200 OK
+ rver pid=3508(APIServer pid=3508930) INFO: 127.0.0.1:32922 - "POST /v1/chat/completions HTTP/1.1" 200 OK
+ tion throughput: 84.2 tokens/s, Running: 1 reqs, Waiting: 0 reqs, GPU KV cache usage: 0.1%, Prefix cache hit rate: 57.6%
+ (APIServer pid=3508927) INFO: 127.0.0.1:58182 - "POST /v1/chat/completions HTTP/1.1" 200 OK
vllm_600.log ADDED
The diff for this file is too large to render. See raw diff
 
vllm_600_2.log ADDED
The diff for this file is too large to render. See raw diff
 
vllm_800.log ADDED
The diff for this file is too large to render. See raw diff
 
vllm_800_2.log ADDED
The diff for this file is too large to render. See raw diff