dmahata committed
Commit fd40c9d
1 Parent(s): 42f951a

Upload run_eval.py

Files changed (1):
run_eval.py +282 -0
run_eval.py ADDED
@@ -0,0 +1,282 @@
#!/usr/bin/env python
# Copyright 2020 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse
import datetime
import json
import time
import warnings
from logging import getLogger
from pathlib import Path
from typing import Dict, List

import torch
from tqdm import tqdm

from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from utils import (
    calculate_bleu,
    calculate_rouge,
    chunks,
    parse_numeric_n_bool_cl_kwargs,
    use_task_specific_params,
)

from evaluate_gpt import gpt_eval


logger = getLogger(__name__)


DEFAULT_DEVICE = "cuda" if torch.cuda.is_available() else "cpu"


def generate_summaries_or_translations(
    examples: List[str],
    out_file: str,
    model_name: str,
    batch_size: int = 8,
    device: str = DEFAULT_DEVICE,
    fp16=False,
    task="summarization",
    prefix=None,
    **generate_kwargs,
) -> Dict:
    """Save model.generate results to <out_file>, and return how long it took."""
    fout = Path(out_file).open("w", encoding="utf-8")
    model_name = str(model_name)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)
    if fp16:
        model = model.half()

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    logger.info(
        f"Inferred tokenizer type: {tokenizer.__class__}"
    )  # if this is wrong, check config.model_type.

    start_time = time.time()
    # update config with task specific params
    use_task_specific_params(model, task)
    if prefix is None:
        prefix = getattr(model.config, "prefix", "") or ""
    for examples_chunk in tqdm(list(chunks(examples, batch_size))):
        examples_chunk = [prefix + text for text in examples_chunk]
        batch = tokenizer(
            examples_chunk, return_tensors="pt", truncation=True, padding="longest"
        ).to(device)
        summaries = model.generate(
            input_ids=batch.input_ids,
            attention_mask=batch.attention_mask,
            **generate_kwargs,
        )
        dec = tokenizer.batch_decode(
            summaries, skip_special_tokens=True, clean_up_tokenization_spaces=False
        )
        for hypothesis in dec:
            fout.write(hypothesis + "\n")
            fout.flush()
    fout.close()
    runtime = int(time.time() - start_time)  # seconds
    n_obs = len(examples)
    return dict(
        n_obs=n_obs, runtime=runtime, seconds_per_sample=round(runtime / n_obs, 4)
    )


def datetime_now():
    return datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")


def run_generate(
    verbose=True,
    model_name_path=None,
    src_txt=None,
    tar_txt=None,
    gen_path=None,
    scor_path=None,
    batch_size=None,
):
    """
    Takes input text, generates output, and then scores the generations against the reference file
    (BLEU for translation tasks, ROUGE otherwise).

    The results are saved to a file and returned to the caller, and printed out unless ``verbose=False`` is passed.

    Args:
        verbose (:obj:`bool`, `optional`, defaults to :obj:`True`): print results to stdout

    Returns:
        ``scores``: a dict of metrics, e.g. ``{'bleu': 39.6501, 'n_obs': 2000, 'runtime': 186, 'seconds_per_sample': 0.093}``,
        extended with the custom generate kwargs (e.g. ``{'num_beams': 5, 'length_penalty': 0.8}``) when ``--dump-args`` is passed.
    """

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--model_name",
        type=str,
        required=False,
        help="like facebook/bart-large-cnn, t5-base, etc.",
    )
    parser.add_argument(
        "--input_path", type=str, required=False, help="like cnn_dm/test.source"
    )
    parser.add_argument(
        "--save_path", type=str, required=False, help="where to save summaries"
    )
    parser.add_argument(
        "--reference_path", type=str, required=False, help="like cnn_dm/test.target"
    )
    parser.add_argument(
        "--score_path",
        type=str,
        required=False,
        default="metrics.json",
        help="where to save metrics",
    )
    parser.add_argument(
        "--device",
        type=str,
        required=False,
        default=DEFAULT_DEVICE,
        help="cuda, cuda:1, cpu etc.",
    )
    parser.add_argument(
        "--prefix",
        type=str,
        required=False,
        default=None,
        help="will be added to the beginning of src examples",
    )
    parser.add_argument(
        "--task",
        type=str,
        default="summarization",
        help="used for task_specific_params + metrics",
    )
    parser.add_argument("--bs", type=int, default=8, required=False, help="batch size")
    parser.add_argument(
        "--n_obs",
        type=int,
        default=-1,
        required=False,
        help="How many observations. Defaults to all.",
    )
    parser.add_argument("--fp16", action="store_true")
    parser.add_argument(
        "--dump-args",
        action="store_true",
        help="print the custom hparams with the results",
    )
    parser.add_argument(
        "--info",
        nargs="?",
        type=str,
        const=datetime_now(),
        help="use in conjunction w/ --dump-args to print with the results whatever other info you'd like, e.g. lang=en-ru. If no value is passed, the current datetime string will be used.",
    )
    # Unspecified args like --num_beams=2 --decoder_start_token_id=4 are passed to model.generate
    args, rest = parser.parse_known_args()
    parsed_args = parse_numeric_n_bool_cl_kwargs(rest)

    # Explicit function arguments override the corresponding command-line flags.
    if model_name_path:
        args.model_name = model_name_path

    if src_txt:
        args.input_path = src_txt

    if tar_txt:
        args.reference_path = tar_txt

    if batch_size:
        args.bs = batch_size

    if gen_path:
        args.save_path = gen_path

    if scor_path:
        args.score_path = scor_path

    # GPT-style models are evaluated by a separate code path.
    if args.model_name.endswith("gpt"):
        gpt_eval(
            model_name_path=args.model_name,
            src_txt=args.input_path,
            tar_txt=args.reference_path,
            gen_path=args.save_path,
            scor_path=args.score_path,
            batch_size=args.bs,
        )
        return None

    if parsed_args and verbose:
        print(f"parsed the following generate kwargs: {parsed_args}")
    # For T5-style models, prepend a space to each source example.
    examples = [
        " " + x.rstrip() if "t5" in args.model_name else x.rstrip()
        for x in open(args.input_path).readlines()
    ]
    if args.n_obs > 0:
        examples = examples[: args.n_obs]
    Path(args.save_path).parent.mkdir(exist_ok=True)

    if args.reference_path is None and Path(args.score_path).exists():
        warnings.warn(
            f"score_path {args.score_path} will be overwritten unless you type ctrl-c."
        )

    if args.device == "cpu" and args.fp16:
        # this mix leads to RuntimeError: "threshold_cpu" not implemented for 'Half'
        raise ValueError("Can't mix --fp16 and --device cpu")

    runtime_metrics = generate_summaries_or_translations(
        examples,
        args.save_path,
        args.model_name,
        batch_size=args.bs,
        device=args.device,
        fp16=args.fp16,
        task=args.task,
        prefix=args.prefix,
        **parsed_args,
    )

    if args.reference_path is None:
        return {}

    # Compute scores
    score_fn = calculate_bleu if "translation" in args.task else calculate_rouge
    output_lns = [x.rstrip() for x in open(args.save_path).readlines()]
    reference_lns = [x.rstrip() for x in open(args.reference_path).readlines()][
        : len(output_lns)
    ]
    scores: dict = score_fn(output_lns, reference_lns)
    scores.update(runtime_metrics)

    if args.dump_args:
        scores.update(parsed_args)
    if args.info:
        scores["info"] = args.info

    if verbose:
        print(scores)

    if args.score_path is not None:
        json.dump(scores, open(args.score_path, "w"))

    return scores


if __name__ == "__main__":
    # Example usage for MT (note this script takes named flags, not positional arguments):
    # python run_eval.py --model_name MODEL_NAME --input_path $DATA_DIR/test.source \
    #     --save_path $save_dir/test_translations.txt --reference_path $DATA_DIR/test.target \
    #     --score_path $save_dir/test_bleu.json --task translation $@
    run_generate(verbose=True)
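
For reference, a minimal sketch of driving run_generate() programmatically instead of through the CLI. The checkpoint name and file paths below are placeholders, not files shipped with this repo, and run_generate() still parses sys.argv internally, so extra command-line flags such as --num_beams=4 are forwarded to model.generate().

# example_run_eval.py -- hypothetical caller, not part of the uploaded file above
from run_eval import run_generate

# Keyword arguments override the corresponding CLI flags inside run_generate().
scores = run_generate(
    verbose=True,
    model_name_path="facebook/bart-large-cnn",  # any seq2seq checkpoint name or path
    src_txt="cnn_dm/test.source",               # one source document per line
    tar_txt="cnn_dm/test.target",               # references, same line order
    gen_path="output/test_generations.txt",     # where hypotheses are written
    scor_path="output/metrics.json",            # where metric + runtime results are saved
    batch_size=16,
)
print(scores)  # ROUGE (or BLEU for translation tasks) plus n_obs, runtime, seconds_per_sample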