GPT-3.5 Turbo HumanEval Contamination based on "Generalization or Memorization: Data Contamination and Trustworthy Evaluation for Large Language Models"

#16
Files changed (1) hide show
  1. contamination_report.csv +2 -0
contamination_report.csv CHANGED
@@ -3,6 +3,8 @@ Evaluation Dataset;Subset;Contaminated Source;Model or corpus;Train Split;Develo
3
  gsm8k;;GPT-4;model;79.00;;;model-based;https://arxiv.org/abs/2311.06233;8
4
  ucinlp/drop;;GPT-4;model;;44.00;;model-based;https://arxiv.org/abs/2311.06233;8
5
  openai_humaneval;;GPT-4;model;;;56.71;model-based;https://arxiv.org/abs/2311.06233;8
 
 
6
  imdb;;GPT-4;model;;;82.00;model-based;https://arxiv.org/abs/2311.06233;8
7
  imdb;;GPT-3.5;model;;;55.00;model-based;https://arxiv.org/abs/2311.06233;8
8
  ag_news;;GPT-4;model;;;91.00;model-based;https://arxiv.org/abs/2311.06233;8
 
3
  gsm8k;;GPT-4;model;79.00;;;model-based;https://arxiv.org/abs/2311.06233;8
4
  ucinlp/drop;;GPT-4;model;;44.00;;model-based;https://arxiv.org/abs/2311.06233;8
5
  openai_humaneval;;GPT-4;model;;;56.71;model-based;https://arxiv.org/abs/2311.06233;8
6
+ openai_humaneval;;GPT-3.5-turbo/0613;model;;;23.79;model-based;https://arxiv.org/abs/2402.15938;16
7
+ openai_humaneval;;GPT-3.5-turbo/1106;model;;;41.47;model-based;https://arxiv.org/abs/2402.15938;16
8
  imdb;;GPT-4;model;;;82.00;model-based;https://arxiv.org/abs/2311.06233;8
9
  imdb;;GPT-3.5;model;;;55.00;model-based;https://arxiv.org/abs/2311.06233;8
10
  ag_news;;GPT-4;model;;;91.00;model-based;https://arxiv.org/abs/2311.06233;8