File size: 10,821 Bytes
5c83af4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
# Evaluation inputs. Both default to empty and must be filled in here or
# exported in the environment before the commands below can do useful work.
#   model_path - checkpoint folder handed to --model-folder
#   data_home  - directory that holds one <dataset>/test.json per dataset
model_path="${model_path:-}"
data_home="${data_home:-}"

# Runs on the four "*_gpt4_same" datasets; each reads
# ${data_home}/<dataset>/test.json. Expansions are quoted so that a path
# containing spaces or glob characters reaches the evaluator as one argument.
python evaluate_cqa_vllm_chatqa2.py \
  --model-folder "${model_path}" \
  --eval-dataset longbook_choice_eng_gpt4_same \
  --start-idx 0 --end-idx 1000 \
  --sample-input-file "${data_home}/longbook_choice_eng_gpt4_same/test.json"
python evaluate_cqa_vllm_chatqa2.py \
  --model-folder "${model_path}" \
  --eval-dataset longbook_qa_eng_gpt4_same \
  --start-idx 0 --end-idx 1000 \
  --sample-input-file "${data_home}/longbook_qa_eng_gpt4_same/test.json"
# The dialogue and summarisation runs additionally pass --max-tokens 1024.
python evaluate_cqa_vllm_chatqa2.py \
  --model-folder "${model_path}" \
  --eval-dataset longdialogue_qa_eng_gpt4_same \
  --start-idx 0 --end-idx 2000 \
  --max-tokens 1024 \
  --sample-input-file "${data_home}/longdialogue_qa_eng_gpt4_same/test.json"
python evaluate_cqa_vllm_chatqa2.py \
  --model-folder "${model_path}" \
  --eval-dataset longbook_sum_eng_gpt4_same \
  --start-idx 0 --end-idx 120 \
  --max-tokens 1024 \
  --sample-input-file "${data_home}/longbook_sum_eng_gpt4_same/test.json"

# Runs with --use-retrieved-neighbours on the
# "<name>.e5_mistral_retriever_chunkbysents1200" datasets.
# Each entry is "<name>:<end-idx>"; runs happen in the order listed.
# Dataset names must stay inside this list: a bare name on its own line is
# executed by the shell as a command and fails with "command not found".
for eval_spec in \
  "longbook_choice_eng:1000" \
  "longbook_qa_eng:1000" \
  "qasper:2000" \
  "qmsum:200" \
  "quality:2000" \
  "musique:200" \
  "hotpotqa:200" \
  "multifieldqa_en:200"
do
  # Split "<name>:<end-idx>" and append the shared dataset suffix.
  eval_dataset="${eval_spec%%:*}.e5_mistral_retriever_chunkbysents1200"
  eval_end_idx="${eval_spec##*:}"
  python evaluate_cqa_vllm_chatqa2.py \
    --model-folder "${model_path}" \
    --eval-dataset "${eval_dataset}" \
    --start-idx 0 --end-idx "${eval_end_idx}" \
    --use-retrieved-neighbours \
    --sample-input-file "${data_home}/${eval_dataset}/test.json"
done

# Runs on the "<name>.e5_mistral_retriever_chunkbysents1200" datasets
# WITHOUT --use-retrieved-neighbours (presumably the non-retrieval baseline
# on the same inputs; confirm against evaluate_cqa_vllm_chatqa2.py).
# Each entry is "<name>:<end-idx>"; runs happen in the order listed.
# Dataset names must stay inside this list: a bare name on its own line is
# executed by the shell as a command and fails with "command not found".
for eval_spec in \
  "qasper:2000" \
  "qmsum:200" \
  "quality:2000" \
  "musique:200" \
  "hotpotqa:200" \
  "multifieldqa_en:200"
do
  # Split "<name>:<end-idx>" and append the shared dataset suffix.
  eval_dataset="${eval_spec%%:*}.e5_mistral_retriever_chunkbysents1200"
  eval_end_idx="${eval_spec##*:}"
  python evaluate_cqa_vllm_chatqa2.py \
    --model-folder "${model_path}" \
    --eval-dataset "${eval_dataset}" \
    --start-idx 0 --end-idx "${eval_end_idx}" \
    --sample-input-file "${data_home}/${eval_dataset}/test.json"
done

# Multi-turn QA runs. All of them pass --use-retrieved-neighbours and evaluate
# the checkpoint in ${model_path}, the same variable every other run in this
# script uses, so the whole batch targets one model.
#
# multiturn_data_home is the directory holding the per-dataset input files.
# It can be overridden from the environment; the default is the cluster path
# the commands were originally written against.
multiturn_data_home="${multiturn_data_home:-/lustre/fsw/portfolios/llmservice/users/pengx/projects/swa_long_pretrain_llama2/data/test_benchmarks/multi-turn-qa}"

#######################################
# Run one multi-turn QA evaluation.
# Globals:   model_path (read), multiturn_data_home (read)
# Arguments: $1 - value for --eval-dataset
#            $2 - value for --end-idx (--start-idx is always 0)
#            $3 - input file path relative to multiturn_data_home
#            $4.. - optional extra evaluator flags (e.g. --num-ctx 20)
# Returns:   exit status of the python invocation
#######################################
run_chatrag_eval() {
  local dataset=$1 end_idx=$2 input_rel_path=$3
  shift 3
  python evaluate_cqa_vllm_chatqa2.py \
    --eval-dataset "${dataset}" \
    --start-idx 0 --end-idx "${end_idx}" \
    "$@" \
    --use-retrieved-neighbours \
    --model-folder "${model_path}" \
    --sample-input-file "${multiturn_data_home}/${input_rel_path}"
}

run_chatrag_eval doc2dial 4000 doc2dial/doc2dial_ftdragon_chatgptgen7k_chunk150_QA_test.json
run_chatrag_eval convfinqa_general_long_answer 1500 convfinqa_general/convfinqa_general_QA_dev.json
run_chatrag_eval sqa_general_long_answer 3100 sqa_general/sqa_general_QA_test.json
run_chatrag_eval coqa 8000 coqa/coqa_QA_dev.json
run_chatrag_eval doqa_cooking 2000 doqa/doqa_cooking_QA_test.json
run_chatrag_eval doqa_travel 2000 doqa/doqa_travel_QA_test.json
run_chatrag_eval doqa_movies 2000 doqa/doqa_movies_QA_test.json
# topiocqa and inscit additionally pass --num-ctx 20.
run_chatrag_eval topiocqa 2600 topiocqa/topiocqa_dev_retrieval_dragon_ft_chatgptgen7k.json --num-ctx 20
run_chatrag_eval inscit 600 inscit/inscit_dev_retrieval_dragon_ft_chatgptgen7k_with_topic.json --num-ctx 20
run_chatrag_eval qrecc 4000 qrecc/qrecc_ftdragon_chatgptgen7k_chunk150_QA_test.json
run_chatrag_eval quac 8000 quac/quac_ftdragon_chatgptgen7k_chunk150_QA_test.json