"""Scan a results directory for API-failure artifacts.

Usage: ``python <this_script>.py <root_dir>``

``<root_dir>`` is a per-model results directory whose basename is the model
name. For every supported dataset this script:

1. Loads ``<model>_<dataset>.xlsx`` (if present) and reports how many
   predictions contain the API-failure sentinel.
2. Locates the single matching openai/gpt evaluation spreadsheet and reports
   dataset-specific evaluation failures (retry exhaustion, failure sentinel,
   or a ``gpt4_score`` of -1, depending on the dataset).

Findings are printed to stdout; datasets with no artifacts are silent.
"""
import sys

# Star import supplies the toolkit helpers used below: osp, load, ls, listinstr.
from vlmeval import *
from vlmeval.dataset import SUPPORTED_DATASETS

# Sentinel string written into a prediction cell when the answer API call failed.
FAIL_MSG = 'Failed to obtain answer via API.'

root = sys.argv[1]
# Drop a single trailing path separator so the basename split yields the model name.
if root[-1] in '/\\':
    root = root[:-1]
model_name = root.split('/')[-1]

for d in SUPPORTED_DATASETS:
    fname = f'{model_name}_{d}.xlsx'
    pth = osp.join(root, fname)
    # Guard clause instead of nesting the whole body under `if osp.exists(...)`.
    if not osp.exists(pth):
        continue

    data = load(pth)
    # --- Inference-stage check: count predictions carrying the failure sentinel.
    assert 'prediction' in data
    data['prediction'] = [str(x) for x in data['prediction']]
    fail = [FAIL_MSG in x for x in data['prediction']]
    nfail, ntot = sum(fail), len(fail)  # sum once; original summed twice
    if nfail:
        print(f'Model {model_name} x Dataset {d}: {nfail} out of {ntot} failed. {nfail / ntot * 100: .2f}%. ')

    # --- Evaluation-stage check: find the single openai/gpt judge spreadsheet.
    eval_files = ls(root, match=f'{model_name}_{d}_')
    eval_files = [x for x in eval_files if listinstr([f'{d}_openai', f'{d}_gpt'], x) and x.endswith('.xlsx')]

    if len(eval_files) == 0:
        print(f'Model {model_name} x Dataset {d} openai missing')
        continue
    assert len(eval_files) == 1
    eval_file = eval_files[0]
    data = load(eval_file)

    # Dataset families mark judge failures differently; branch on the name.
    if 'MMVet' in d:
        # MMVet logs an explicit retry-exhaustion message per failed sample.
        bad = [x for x in data['log'] if 'All 5 retries failed.' in str(x)]
        if len(bad):
            print(f'Model {model_name} x Dataset {d} Evaluation: {len(bad)} out of {len(data)} failed.')
    elif 'MathVista' in d:
        # MathVista carries the failure sentinel in its 'res' column.
        bad = [x for x in data['res'] if FAIL_MSG in str(x)]
        if len(bad):
            print(f'Model {model_name} x Dataset {d} Evaluation: {len(bad)} out of {len(data)} failed.')
    elif d == 'LLaVABench':
        # A gpt4_score of -1 marks a failed judgment. The original applied this
        # identical filter twice in a row; the second pass was a no-op and is removed.
        sub = data[data['gpt4_score'] == -1]
        if len(sub):
            print(f'Model {model_name} x Dataset {d} Evaluation: {len(sub)} out of {len(data)} failed.')
    else:
        # Default: failure sentinel recorded in the 'log' column.
        bad = [x for x in data['log'] if FAIL_MSG in str(x)]
        if len(bad):
            print(f'Model {model_name} x Dataset {d} Evaluation: {len(bad)} out of {len(data)} failed.')