import argparse
import json
import logging
import os

import numpy as np

from coref_utils.metrics import CorefEvaluator
from coref_utils.utils import get_mention_to_cluster

os.environ["TOKENIZERS_PARALLELISM"] = "false"

logging.basicConfig(format="%(message)s", level=logging.INFO)
logger = logging.getLogger()


def process_args():
    """Parse command line arguments.

    Returns:
        argparse.Namespace with a single attribute, ``log_file``: the path
        given as the positional ``log_file`` argument.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("log_file", help="Log file", type=str)
    return parser.parse_args()


def _largest_unmatched_cluster(data, source_key, reference_key):
    """Find the largest cluster under ``source_key`` with no mention in ``reference_key``.

    For every instance, both cluster lists are passed through
    ``get_mention_to_cluster``. Its result is unpacked as a
    ``(clusters, mention_lookup)`` pair; the lookup is only used for ``in``
    membership tests here (presumably a dict keyed by mention -- confirm
    against ``coref_utils.utils``).

    Args:
        data: Iterable of instances, each indexable by ``"clusters"``,
            ``"predicted_clusters"`` and ``"doc_key"``.
        source_key: Instance key whose clusters are scanned.
        reference_key: Instance key whose mentions a scanned cluster must
            not share.

    Returns:
        Tuple ``(doc_key, length, cluster)`` for the largest fully unmatched
        cluster, or ``("", 0, [])`` when there is none. Ties keep the first
        cluster encountered.
    """
    max_length = 0
    max_doc_id = ""
    max_cluster = []

    for instance in data:
        # Gold is always processed before predicted, matching the order in
        # which errors (e.g. a missing key) have always surfaced.
        gold_clusters, gold_mentions = get_mention_to_cluster(instance["clusters"])
        pred_clusters, pred_mentions = get_mention_to_cluster(
            instance["predicted_clusters"]
        )
        parsed = {
            "clusters": (gold_clusters, gold_mentions),
            "predicted_clusters": (pred_clusters, pred_mentions),
        }
        source_clusters = parsed[source_key][0]
        reference_mentions = parsed[reference_key][1]

        for cluster in source_clusters:
            if any(mention in reference_mentions for mention in cluster):
                continue
            # Strict comparison: the first cluster of a given size wins.
            if len(cluster) > max_length:
                max_length = len(cluster)
                max_doc_id = instance["doc_key"]
                max_cluster = cluster

    return max_doc_id, max_length, max_cluster


def singleton_analysis(data):
    """Print the largest gold cluster none of whose mentions were predicted.

    Prints the ``doc_key`` on one line, then the cluster length and the
    cluster itself on the next. Prints an empty doc key, ``0`` and ``[]``
    when every gold cluster has at least one predicted mention.

    Args:
        data: Iterable of instances with ``"clusters"``,
            ``"predicted_clusters"`` and ``"doc_key"`` entries.
    """
    max_doc_id, max_length, max_cluster = _largest_unmatched_cluster(
        data, source_key="clusters", reference_key="predicted_clusters"
    )
    print(max_doc_id)
    print(max_length, max_cluster)


def reverse_analysis(data):
    """Print the largest predicted cluster none of whose mentions are gold.

    Mirror image of ``singleton_analysis``: predicted clusters are scanned
    and checked against the gold mentions. The output format is the same.

    Args:
        data: Iterable of instances with ``"clusters"``,
            ``"predicted_clusters"`` and ``"doc_key"`` entries.
    """
    max_doc_id, max_length, max_cluster = _largest_unmatched_cluster(
        data, source_key="predicted_clusters", reference_key="clusters"
    )
    print(max_doc_id)
    print(max_length, max_cluster)


def _load_jsonl(path):
    """Read a JSON-lines file into a list, one decoded object per line.

    Blank lines (including a trailing empty line) are skipped instead of
    being handed to ``json.loads``, which would raise on them. The file is
    decoded as UTF-8 so the result does not depend on the platform locale.
    """
    with open(path, encoding="utf-8") as f:
        return [json.loads(line) for line in f if line.strip()]


def main():
    """Load the log file named on the command line and run both analyses."""
    args = process_args()
    data = _load_jsonl(args.log_file)

    singleton_analysis(data)
    reverse_analysis(data)


if __name__ == "__main__":
    main()