import argparse
import json
import re
from pprint import pprint

import pandas as pd
from tqdm import tqdm

import nltk
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
# nltk.download('punkt')
# nltk.download('wordnet')
# nltk.download('stopwords')
p_stemmer = PorterStemmer()

import language_evaluation
evaluator = language_evaluation.CocoEvaluator()


def nltk_process(text):
    """Tokenize and stem a phrase, then strip non-alphanumeric characters."""
    # Tokenization
    tokens = word_tokenize(text)
    # Stemming
    stemmed = [p_stemmer.stem(word) for word in tokens]
    # Removing punctuation
    tokens = [re.sub(r'[^a-zA-Z0-9]', '', tok) for tok in stemmed]
    return " ".join(tokens)


def phrase_recall(pred_id2sent, id2phrases, name):
    """Print the average fraction of GT phrase words found in the predicted caption."""
    n_total = 0
    total_score = 0
    for id, gt_phrases in id2phrases.items():
        pred_sent = pred_id2sent[id]
        score = 0
        for gt_phrase in gt_phrases:
            gt_words = gt_phrase.split()
            if len(gt_words) > 0:
                word_score = sum(1 for gt_word in gt_words if gt_word in pred_sent)
                score += word_score / len(gt_words)
        if len(gt_phrases) > 0:
            score /= len(gt_phrases)
        total_score += score
        n_total += 1
    print(name)
    print(f'Acc: {total_score / n_total * 100:.2f}')


def calculate_finegrained_scores(pred_id2sent, id2caption,
                                 id2background, id2object, id2relation,
                                 use_coco_eval=False):
    if use_coco_eval:
        # Standard COCO caption metrics over all reference captions
        refs = []
        hyps = []
        for id, gt_captions in id2caption.items():
            refs.append(gt_captions)
            hyps.append(pred_id2sent[id])
        print('caption')
        results = evaluator.run_evaluation(hyps, refs)
        pprint(results)

    # Word-level recall of the fine-grained annotation phrases
    phrase_recall(pred_id2sent, id2background, 'background')
    phrase_recall(pred_id2sent, id2object, 'object')
    phrase_recall(pred_id2sent, id2relation, 'relation')


def split_phrases(text):
    """Split an annotation field on its backslash-semicolon delimiter and normalize each phrase."""
    phrases = []
    for phrase in text.lower().split(r'\;'):
        if len(phrase) > 1:
            phrases.append(nltk_process(phrase))
    return phrases


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--finecapeval_path', type=str,
                        default="data/FineCapEval.csv")
    parser.add_argument('--generated_id2caption', type=str,
                        default="FineCapEval_results/mle.json")
    args = parser.parse_args()

    df = pd.read_csv(args.finecapeval_path)
    assert df.shape == (5000, 5)

    with open(args.generated_id2caption, 'r') as f:
        generated_id2caption = json.load(f)

    print("Preprocessing GT FineCapEval data...")
    id2caption = {}
    id2background = {}
    id2object = {}
    id2relation = {}

    for row in tqdm(df.itertuples(), total=len(df)):
        id = row.image.split('.')[0]
        caption = row.caption
        background = row.background
        object = row.object
        relation = row.relation

        # Skip rows with missing (non-string, e.g. NaN) annotations
        if not isinstance(caption, str):
            continue
        if not isinstance(background, str):
            continue
        if not isinstance(object, str):
            continue
        if not isinstance(relation, str):
            continue

        if id not in id2caption:
            id2caption[id] = []
            id2background[id] = []
            id2object[id] = []
            id2relation[id] = []

        id2caption[id].append(caption)
        id2background[id].extend(split_phrases(background))
        id2object[id].extend(split_phrases(object))
        id2relation[id].extend(split_phrases(relation))

    print("Calculating scores...")
    calculate_finegrained_scores(
        generated_id2caption,
        id2caption,
        id2background,
        id2object,
        id2relation,
        use_coco_eval=True)
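
# Expected format of the --generated_id2caption JSON (a sketch inferred from how the
# script indexes pred_id2sent[id]; the key below is purely illustrative):
#   {"some_image_id": "a man riding a wave on a surfboard", ...}
# where each key is a FineCapEval image filename without its extension and each value
# is the single predicted caption for that image.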