Iker committed on
Commit 62b1ca5
1 Parent(s): d0a815c

Implement evaluation

README.md CHANGED
@@ -68,7 +68,7 @@ Run `python translate.py -h` for more info.
```bash
accelerate launch translate.py \
--sentences_path sample_text/en.txt \
- --output_path sample_text/en2es.translation.txt \
+ --output_path sample_text/en2es.translation.m2m100_1.2B.txt \
--source_lang en \
--target_lang es \
--model_name facebook/m2m100_1.2B
@@ -83,7 +83,7 @@ You can use the Accelerate CLI to configure the Accelerate environment (Run
```bash
accelerate launch --multi_gpu --num_processes 2 --num_machines 1 translate.py \
--sentences_path sample_text/en.txt \
- --output_path sample_text/en2es.translation.txt \
+ --output_path sample_text/en2es.translation.m2m100_1.2B.txt \
--source_lang en \
--target_lang es \
--model_name facebook/m2m100_1.2B
@@ -102,7 +102,7 @@ Use the `--precision` flag to choose the precision of the model. You can choose
```bash
accelerate launch translate.py \
--sentences_path sample_text/en.txt \
- --output_path sample_text/en2es.translation.txt \
+ --output_path sample_text/en2es.translation.m2m100_1.2B.txt \
--source_lang en \
--target_lang es \
--model_name facebook/m2m100_1.2B \
@@ -111,6 +111,24 @@ accelerate launch translate.py \

## Evaluate translations

- Work in progress...
+ To run the evaluation script you need to install [bert_score](https://github.com/Tiiiger/bert_score): `pip install bert_score`
+
+ The evaluation script will calculate the following metrics:
+ * [SacreBLEU](https://github.com/huggingface/datasets/tree/master/metrics/sacrebleu)
+ * [BLEU](https://github.com/huggingface/datasets/tree/master/metrics/bleu)
+ * [ROUGE](https://github.com/huggingface/datasets/tree/master/metrics/rouge)
+ * [METEOR](https://github.com/huggingface/datasets/tree/master/metrics/meteor)
+ * [TER](https://github.com/huggingface/datasets/tree/master/metrics/ter)
+ * [BertScore](https://github.com/huggingface/datasets/tree/master/metrics/bertscore)
+
+ Run the following command to evaluate the translations:
+
+ ```bash
+ accelerate launch eval.py \
+ --pred_path sample_text/en2es.translation.m2m100_1.2B.txt \
+ --gold_path sample_text/es.txt
+ ```
+
+ If you want to save the results to a file, use the `--output_path` flag.


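If you prefer calling the evaluation from Python instead of the CLI, the sketch below uses the `eval_files` function added in `eval.py` in this commit. It is only an illustration: the paths are the repository's sample files, `results.json` is a hypothetical output name, and the keyword arguments mirror the `eval_files` signature shown further down.

```python
# Minimal sketch: programmatic use of eval.py's eval_files.
from eval import eval_files

results = eval_files(
    pred_path="sample_text/en2es.translation.m2m100_1.2B.txt",  # model translations
    gold_path="sample_text/es.txt",                             # reference translations
    bert_score_model="microsoft/deberta-xlarge-mnli",           # default BertScore model
    starting_batch_size=64,
    output_path="results.json",                                 # optional JSON dump (hypothetical name)
)
print(results["sacrebleu"]["score"])  # corpus-level SacreBLEU score
```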
 
dataset.py CHANGED
@@ -38,3 +38,36 @@ class DatasetReader(IterableDataset):
        file_itr = open(self.filename, "r")
        mapped_itr = map(self.preprocess, file_itr)
        return mapped_itr
+
+
+ class ParallelTextReader(IterableDataset):
+     def __init__(self, pred_path: str, gold_path: str):
+         self.pred_path = pred_path
+         self.gold_path = gold_path
+         pred_filename_lines = count_lines(pred_path)
+         gold_path_lines = count_lines(gold_path)
+         assert pred_filename_lines == gold_path_lines, (
+             f"Lines in {pred_path} and {gold_path} do not match: "
+             f"{pred_filename_lines} vs {gold_path_lines}"
+         )
+         self.num_sentences = gold_path_lines
+         self.current_line = 0
+
+     def preprocess(self, pred: str, gold: str):
+         self.current_line += 1
+         pred = pred.strip()
+         gold = gold.strip()
+         if len(pred) == 0:
+             print(f"Warning: Pred empty sentence at line {self.current_line}")
+         if len(gold) == 0:
+             print(f"Warning: Gold empty sentence at line {self.current_line}")
+         return pred, [gold]
+
+     def __iter__(self):
+         pred_itr = open(self.pred_path, "r")
+         gold_itr = open(self.gold_path, "r")
+         mapped_itr = map(self.preprocess, pred_itr, gold_itr)
+         return mapped_itr
+
+     def __len__(self):
+         return self.num_sentences
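A small usage sketch of the new reader, mirroring how `eval.py` consumes it: each item is a `(pred, [gold])` pair, and the collate function transposes a batch of pairs into a list of predictions and a list of references. The file paths are the repository's sample files and are only examples.

```python
from torch.utils.data import DataLoader

from dataset import ParallelTextReader


def collate_fn(batch):
    # [(p1, [g1]), (p2, [g2])] -> [[p1, p2], [[g1], [g2]]]
    return list(map(list, zip(*batch)))


reader = ParallelTextReader(
    pred_path="sample_text/en2es.translation.m2m100_1.2B.txt",
    gold_path="sample_text/es.txt",
)
loader = DataLoader(reader, batch_size=64, collate_fn=collate_fn)
for predictions, references in loader:
    pass  # pass predictions/references to the metrics, as eval.py does
```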
eval.py ADDED
@@ -0,0 +1,159 @@
+ from dataset import ParallelTextReader
+ from torch.utils.data import DataLoader
+ from accelerate.memory_utils import find_executable_batch_size
+ from datasets import load_metric
+ from tqdm import tqdm
+ import torch
+ import json
+ import argparse
+ import numpy as np
+
+
+ def get_dataloader(pred_path: str, gold_path: str, batch_size: int):
+     """
+     Returns a dataloader for the given files.
+     """
+
+     def collate_fn(batch):
+         return list(map(list, zip(*batch)))
+
+     reader = ParallelTextReader(pred_path=pred_path, gold_path=gold_path)
+     dataloader = DataLoader(reader, batch_size=batch_size, collate_fn=collate_fn)
+     return dataloader
+
+
+ def eval_files(
+     pred_path: str,
+     gold_path: str,
+     bert_score_model: str,
+     starting_batch_size: int = 128,
+     output_path: str = None,
+ ):
+     """
+     Evaluates the given files.
+     """
+     if torch.cuda.is_available():
+         device = "cuda:0"
+         print("We will use a GPU to calculate BertScore.")
+     else:
+         device = "cpu"
+         print(
+             "We will use the CPU to calculate BertScore; this can be slow for large datasets."
+         )
+
+     dataloader = get_dataloader(pred_path, gold_path, starting_batch_size)
+     print("Loading sacrebleu...")
+     sacrebleu = load_metric("sacrebleu")
+     print("Loading rouge...")
+     rouge = load_metric("rouge")
+     print("Loading bleu...")
+     bleu = load_metric("bleu")
+     print("Loading meteor...")
+     meteor = load_metric("meteor")
+     print("Loading ter...")
+     ter = load_metric("ter")
+     print("Loading BertScore...")
+     bert_score = load_metric("bertscore")
+
+     with tqdm(total=len(dataloader.dataset), desc="Loading data...") as pbar:
+         for predictions, references in dataloader:
+             sacrebleu.add_batch(predictions=predictions, references=references)
+             rouge.add_batch(predictions=predictions, references=references)
+             bleu.add_batch(
+                 predictions=[p.split() for p in predictions],
+                 references=[[r[0].split()] for r in references],
+             )
+             meteor.add_batch(predictions=predictions, references=references)
+             ter.add_batch(predictions=predictions, references=references)
+             bert_score.add_batch(predictions=predictions, references=references)
+             pbar.update(len(predictions))
+
+     result_dictionary = {}
+     print("Computing sacrebleu")
+     result_dictionary["sacrebleu"] = sacrebleu.compute()
+     print("Computing rouge score")
+     result_dictionary["rouge"] = rouge.compute()
+     print("Computing bleu score")
+     result_dictionary["bleu"] = bleu.compute()
+     print("Computing meteor score")
+     result_dictionary["meteor"] = meteor.compute()
+     print("Computing ter score")
+     result_dictionary["ter"] = ter.compute()
+
+     @find_executable_batch_size(starting_batch_size=starting_batch_size)
+     def inference(batch_size):
+         nonlocal bert_score, bert_score_model
+         print(f"Computing bert score with batch size {batch_size} on {device}")
+         results = bert_score.compute(
+             model_type=bert_score_model,
+             batch_size=batch_size,
+             device=device,
+             use_fast_tokenizer=True,
+         )
+
+         results["precision"] = np.average(results["precision"])
+         results["recall"] = np.average(results["recall"])
+         results["f1"] = np.average(results["f1"])
+
+         return results
+
+     result_dictionary["bert_score"] = inference()
+
+     if output_path is not None:
+         with open(output_path, "w") as f:
+             json.dump(result_dictionary, f, indent=4)
+
+     print(f"Results: {json.dumps(result_dictionary, indent=4)}")
+
+     return result_dictionary
+
+
+ if __name__ == "__main__":
+     parser = argparse.ArgumentParser(
+         description="Run the translation evaluation experiments"
+     )
+     parser.add_argument(
+         "--pred_path",
+         type=str,
+         required=True,
+         help="Path to a txt file containing the predicted sentences.",
+     )
+
+     parser.add_argument(
+         "--gold_path",
+         type=str,
+         required=True,
+         help="Path to a txt file containing the gold sentences.",
+     )
+
+     parser.add_argument(
+         "--starting_batch_size",
+         type=int,
+         default=64,
+         help="Starting batch size for BertScore; we will automatically reduce it if we find an OOM error.",
+     )
+
+     parser.add_argument(
+         "--output_path",
+         type=str,
+         default=None,
+         help="Path to a json file to save the results. If not given, the results will be printed to the console.",
+     )
+
+     parser.add_argument(
+         "--bert_score_model",
+         type=str,
+         default="microsoft/deberta-xlarge-mnli",
+         help="Model to use for BertScore. See: https://github.com/huggingface/datasets/tree/master/metrics/bertscore "
+         "and https://github.com/Tiiiger/bert_score for more details.",
+     )
+
+     args = parser.parse_args()
+
+     eval_files(
+         pred_path=args.pred_path,
+         gold_path=args.gold_path,
+         starting_batch_size=args.starting_batch_size,
+         output_path=args.output_path,
+         bert_score_model=args.bert_score_model,
+     )
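The two sample result files added below show the structure that `eval_files` writes when `--output_path` is given. A short sketch of consuming one of them, with the keys taken from the sample files themselves:

```python
import json

# Load the metrics JSON produced by eval.py --output_path.
with open("sample_text/en2es.m2m100_1.2B.json") as f:
    results = json.load(f)

print(results["sacrebleu"]["score"])  # corpus-level SacreBLEU score
print(results["ter"]["score"])        # translation edit rate
print(results["bert_score"]["f1"])    # averaged BertScore F1
```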
sample_text/en2es.m2m100_1.2B.json ADDED
@@ -0,0 +1 @@
+ {"sacrebleu": {"score": 32.101150640281695, "counts": [19160, 11392, 7558, 5186], "totals": [31477, 30479, 29481, 28485], "precisions": [60.86984147155066, 37.37655434889596, 25.636850853091822, 18.20607337195015], "bp": 1.0, "sys_len": 31477, "ref_len": 30102}, "rouge": {"rouge1": [[0.5852396804366098, 0.6089057437338691, 0.5919486437026797], [0.5964621218261164, 0.6200342221830797, 0.6029705008756368], [0.6068321807422377, 0.6311106822798185, 0.61324805661008]], "rouge2": [[0.3710985389559613, 0.38708055355385995, 0.3761201217327784], [0.3844850790869714, 0.40017782122170353, 0.38920434271970195], [0.3968990790506025, 0.41382310483690327, 0.4022299418726329]], "rougeL": [[0.5351505034410595, 0.5564838960633809, 0.5410602618870524], [0.5457898501195475, 0.5677049056091881, 0.5519189480892548], [0.5575497491149766, 0.5787856637940312, 0.5630101422167583]], "rougeLsum": [[0.5352116089085267, 0.5570236521823667, 0.5415939934790461], [0.5463246235983789, 0.5676427704754348, 0.5522237812823654], [0.5581141358005033, 0.5796683147249665, 0.5630221371759908]]}, "bleu": {"bleu": 0.2842153038526809, "precisions": [0.5535070989616444, 0.33646946844340314, 0.22383069265549602, 0.15653135365661033], "brevity_penalty": 1.0, "length_ratio": 1.0469217970049918, "translation_length": 28314, "reference_length": 27045}, "meteor": {"meteor": 0.4880039569987408}, "ter": {"score": 59.500831946755405, "num_edits": 16092, "ref_length": 27045.0}, "bert_score": {"precision": 0.8192511852383614, "recall": 0.8262866012752056, "f1": 0.8223477345705033, "hashcode": "microsoft/deberta-xlarge-mnli_L40_no-idf_version=0.3.11(hug_trans=4.18.0)_fast-tokenizer"}}
sample_text/en2es.m2m100_418M.json ADDED
@@ -0,0 +1 @@
+ {"sacrebleu": {"score": 29.035496917461597, "counts": [18582, 10514, 6681, 4387], "totals": [31477, 30479, 29481, 28485], "precisions": [59.033580074339994, 34.49588241084025, 22.662053525999795, 15.401088292083553], "bp": 1.0, "sys_len": 31477, "ref_len": 30388}, "rouge": {"rouge1": [[0.5661701202298134, 0.5806961045770566, 0.5693885562082325], [0.5768745925790656, 0.5926959547911554, 0.5803693779677083], [0.5871085218904836, 0.6035331460243276, 0.5900979805085623]], "rouge2": [[0.34243414046469267, 0.35226400857606666, 0.34469210847048837], [0.3545484183384055, 0.36470783370743065, 0.3569058648048812], [0.36612813327517263, 0.37717476449671, 0.3689653665404565]], "rougeL": [[0.5129704896656746, 0.526995889564155, 0.5162056185006965], [0.523632841460358, 0.5375452284094455, 0.5267080806612512], [0.5350158816319085, 0.5480980981777757, 0.5372302857012781]], "rougeLsum": [[0.5126805856827783, 0.5265189554049317, 0.5155154093959223], [0.5239559133309495, 0.5380410013947112, 0.5271022617246641], [0.5351934954578494, 0.5491115103854219, 0.5381174565735956]]}, "bleu": {"bleu": 0.2546886610724999, "precisions": [0.5339761248852158, 0.30784155806120955, 0.19560013678331242, 0.1308640025272469], "brevity_penalty": 1.0, "length_ratio": 1.0353982300884956, "translation_length": 28314, "reference_length": 27346}, "meteor": {"meteor": 0.4630996837124251}, "ter": {"score": 61.848167922182405, "num_edits": 16913, "ref_length": 27346.0}, "bert_score": {"precision": 0.8128398380875588, "recall": 0.8185442119538784, "f1": 0.8153291321396827, "hashcode": "microsoft/deberta-xlarge-mnli_L40_no-idf_version=0.3.11(hug_trans=4.18.0)_fast-tokenizer"}}
sample_text/{en2es.translation.txt → en2es.translation.m2m100_1.2B.txt} RENAMED
@@ -997,4 +997,4 @@ Quiero felicitarle, lamentablemente en su ausencia, por la forma exhaustiva y ri
Él mencionó anteriormente que el informe se llevó a cabo con una mayoría significativa, pero no con mi apoyo.
Por lo tanto, aunque no comparto sus conclusiones, creo que él ha ilustrado en su informe muchas de las cuestiones que la Comisión debe abordar.
La primera es la posibilidad de renacentización de la política de competencia.
- Sé que la Comisión se opone a esto, pero el potencial existe.
+ Sé que la Comisión se opone a esto, pero el potencial existe.
sample_text/en2es.translation.m2m100_418M.txt ADDED
The diff for this file is too large to render. See raw diff
 
translate.py CHANGED
@@ -122,6 +122,7 @@ def main(
        total=total_lines, desc="Dataset translation", leave=True, ascii=True
    ) as pbar, open(output_path, "w", encoding="utf-8") as output_file:
        with torch.no_grad():
+             first_batch = True
            for batch in data_loader:
                batch["input_ids"] = batch["input_ids"]
                batch["attention_mask"] = batch["attention_mask"]
@@ -141,8 +142,11 @@
                tgt_text = tokenizer.batch_decode(
                    generated_tokens, skip_special_tokens=True
                )
-
-                 print("\n".join(tgt_text), file=output_file)
+                 if not first_batch:
+                     print(file=output_file)
+                 else:
+                     first_batch = False
+                 print("\n".join(tgt_text), file=output_file, end="")

                pbar.update(len(tgt_text))

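The `translate.py` change above writes each batch without a trailing newline and inserts the separating newline only before subsequent batches, so the output file ends exactly after the last translated sentence. A standalone sketch of the same pattern, with made-up batch contents and a hypothetical output filename:

```python
# Write batches of lines so the file ends without a trailing newline.
batches = [["sentence 1", "sentence 2"], ["sentence 3"]]  # illustrative data

with open("demo_output.txt", "w", encoding="utf-8") as output_file:
    first_batch = True
    for tgt_text in batches:
        if not first_batch:
            print(file=output_file)  # newline separating this batch from the previous one
        else:
            first_batch = False
        print("\n".join(tgt_text), file=output_file, end="")
# demo_output.txt now holds three lines and no final newline
```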